Example #1
    def test_cache_not_populated_when_disabled(self):
        wiki = WikiApi({'cache': False})

        assert self._get_cache_size(wiki) == 0
        wiki.find('Bob Marley')
        assert self._get_cache_size(wiki) == 0
        shutil.rmtree(wiki.cache_dir, ignore_errors=True)
Example #3
 def runSearchInput(self):
     searchFor = self.getPluginParamValue("SearchFor")
     locale = self.getPluginParamValue("Locale")
     limitResultsTo = self.getPluginParamValueAsInt("LimitResultsTo")
     includeContent = self.getPluginParamValueAsTrueOrFalse(
         "IncludeContent")
     includeHeading = self.getPluginParamValueAsTrueOrFalse(
         "IncludeHeading")
     includeSummary = self.getPluginParamValueAsTrueOrFalse(
         "IncludeSummary")
     includeURL = self.getPluginParamValueAsTrueOrFalse("IncludeURL")
     wiki = WikiApi({"locale": locale})
     content = ""
     cnt = 0
     for result in wiki.find(searchFor):
         article = wiki.get_article(result)
         if includeHeading:
             content = "{0}\n{1}".format(content, article.heading)
         if includeURL:
             content = "{0}\n{1}".format(content, article.url)
         if includeSummary:
             content = "{0}\n{1}".format(content, article.summary)
         if includeContent:
             content = "{0}\n{1}".format(content, article.content)
         content = "{0}\n\n".format(content)
         cnt += 1
         if cnt >= limitResultsTo:
             break
     content = content.strip()
     self.setInputContent(content)
     return content
Example #4
 def set_up(self):
     # using an Italian-Emilian locale that is full of unicode symbols
     wiki = WikiApi({'locale': 'eml'})
     result = wiki.find('Bulaggna')[0]
     return {
         'wiki': wiki,
         'result': result,
     }
Example #6
 def set_up(self):
     wiki = WikiApi()
     results = wiki.find('Bill Clinton')
     article = wiki.get_article(results[0])
     return {
         'wiki': wiki,
         'results': results,
         'article': article,
     }
Example #8
def get_url(query, log_file):
    wiki = WikiApi()
    results = wiki.find(query)
    if len(results) == 0:
        sys.stderr.write("No wikipedia article found for '" + query + "'\n")
    else:
        article = wiki.get_article(results[0])
        print article.url
        with open(log_file, 'a') as f:
            f.write(article.url + "\n")
Example #9
def get_url(query, log_file):
  wiki = WikiApi()
  results = wiki.find(query)
  if len(results) == 0:
    sys.stderr.write("No wikipedia article found for '" + query + "'\n")
  else:
    article = wiki.get_article(results[0])
    print article.url
    with open(log_file, 'a') as f:
      f.write(article.url + "\n")
Example #10
    def wikiqueryresults(searchQuery):

        wiki = WikiApi({})

        wiki = WikiApi({ 'locale' : 'en' }) # To specify your locale, 'en' is default

        wikiSearch = wiki.find(searchQuery)

        wikiArticle = wiki.get_article(wikiSearch[0])

        return wikiArticle.summary
Example #11
def wiki_api(options):
	wiki = WikiApi()
	wiki = WikiApi({ 'locale' : 'en'}) # to specify your locale, 'en' is default
	results = wiki.find(options['q'])
	for result in results:
		article = wiki.get_article(result)
		title = article.heading
		url = article.url

		print(url)
		link = Link(topic = options['topic'], title = title, url = url)
		link.save()
Example #12
    def __init__(self, add_gloss_list, del_gloss_list, category, label):
        """
		Initialize the class.
		"""
        self.add_phrases = get_phrases(add_gloss_list)
        self.del_phrases = get_phrases(del_gloss_list)
        self.category = category
        self.corpus_dir = CORPUS_DIR + '/' + label + '/wikipedia/' + category
        self.raw_dir = RAW_DATA_DIR + '/' + label + '/wikipedia/' + category
        self.wiki = WikiApi({})
        self.visited_results = self.get_results(self.del_phrases)
        self.count = 0
Example #13
    def test_cache_populated(self):
        wiki = WikiApi({'cache': True, 'cache_dir': '/tmp/wikiapi-test'})

        assert self._get_cache_size(wiki) == 0
        # Make multiple calls to ensure no duplicate cache items created
        assert wiki.find('Bob Marley') == wiki.find('Bob Marley')
        assert self._get_cache_size(wiki) == 1

        # Check cache keys are unique
        assert wiki.find('Tom Hanks') != wiki.find('Bob Marley')

        assert self._get_cache_size(wiki) == 2
        shutil.rmtree(wiki.cache_dir, ignore_errors=True)
Example #14
def get_wikipedia_details(keyword):
    wiki = WikiApi()
    results = wiki.find(keyword)
    if len(results) > 0:
        article = wiki.get_article(results[0])
        if 'Disambig' not in article.image:
            return {
                'heading': article.heading,
                'image': article.image,
                'summary': article.summary,
                'url': article.url
            }
    return None
Example #15
class TestUnicode(unittest.TestCase):
    def setUp(self):
        # using an Italian-Emilian locale that is full of unicode symbols
        self.wiki = WikiApi({'locale': 'eml'})
        self.res = self.wiki.find('Bulaggna')[0]
        self.article = None

    def test_search(self):
        # this is urlencoded.
        self.assertEqual(self.res, u'Bul%C3%A5ggna')

    def test_article(self):
        #unicode errors will likely blow in your face here
        self.assertIsNotNone(self.wiki.get_article(self.res))
Example #16
class TestUnicode:
    @pytest.fixture(autouse=True)
    def set_up(self):
        # using an Italian-Emilian locale that is full of unicode symbols
        self.wiki = WikiApi({'locale': 'eml'})
        self.result = self.wiki.find('Bulaggna')[0]

    def test_search(self):
        # this is urlencoded.
        assert self.result == 'Bul%C3%A5ggna'

    def test_article(self):
        # unicode errors will likely blow in your face here
        assert self.wiki.get_article(self.result) is not None
Example #17
class TestUnicode(unittest.TestCase):
    def setUp(self):
        # using an Italian-Emilian locale that is full of unicode symbols
        self.wiki = WikiApi({'locale': 'eml'})
        self.res = self.wiki.find('Bulaggna')[0]
        self.article = None

    def test_search(self):
        # this is urlencoded.
        self.assertEqual(self.res, u'Bul%C3%A5ggna')

    def test_article(self):
        #unicode errors will likely blow in your face here
        self.assertIsNotNone(self.wiki.get_article(self.res))
Example #18
def wiki(tokens, message):

        print("\033[1;34;1m")        
        print("\nHazel : Please wait while I surf the web for a result")
        try:

            wiki = WikiApi()
            wiki = WikiApi({'locale': 'en'})
            if "search" in tokens:
                tokens.remove("search") # remove search keyword to retrieve the main content to be searched
            if "what" in tokens:
                tokens.remove("what")
            if "who" in tokens:
                tokens.remove("who")
            if "look" in tokens:
                tokens.remove("look")
            if "tell" in tokens:
                tokens.remove("tell")
            if "more" in tokens:
                tokens.remove("more")
            if "about" in tokens:
                tokens.remove("about")
            stop_words = set(stopwords.words('english'))
            # Filter the input by removing stop words such as 'I', 'for', 'is', etc.
            filtered_sentence = [w for w in tokens if w not in stop_words]

            message = ' '.join(filtered_sentence)  # rebuild a sentence from the remaining tokens
            tokens = word_tokenize(message)  # tokenize the new message
     
            s = "" # appends the remaining tokens to be searched for
            for i in tokens:
                s = s + i + " " # appending the tokens to form a search keyword
            results = wiki.find(s) # package function to do online searched
            #print("websearch\n"
            if results == "":
                results = "null"
            print("\nFound result for : ", results[0]) # print the first search result
            print("\033[1;37;1m") # set console color
            print(wi.summary(s))
            #main()
        except Exception as e:
            print("I didnt get that. You may want to try that again")
Example #19
def setup():
    """
    Sets up global wiki object for Wikipedia lookups.
    """
    global wiki, imdb
    wiki = WikiApi()
    imdb = Imdb(anonymize=True)
Example #20
def wiki_search(query):
	wiki = WikiApi()
	wikiurls=[]
	lst=query.split(",")
	num = 10/len(lst)
#	print num
	for i in lst:
		results = wiki.find(i)
		cnt=0
		for j in results:
			cnt=cnt+1
			article = wiki.get_article(j)
			wikiurls.append(article.url)
			if cnt==num:
				break
	return wikiurls
Example #21
def wiki_search(query):
    wiki = WikiApi()
    wikiurls = []
    lst = query.split(",")
    num = 10 // len(lst)  # integer division keeps the per-term limit an int
    #	print num
    for i in lst:
        results = wiki.find(i)
        cnt = 0
        for j in results:
            cnt = cnt + 1
            article = wiki.get_article(j)
            wikiurls.append(article.url)
            if cnt == num:
                break
    return wikiurls
Example #22
	def __init__(self, add_gloss_list, del_gloss_list, category, label):
		"""
		Initialize the class.
		"""
		self.add_phrases = get_phrases(add_gloss_list)
		self.del_phrases = get_phrases(del_gloss_list)
		self.category = category
		self.corpus_dir = CORPUS_DIR + '/' + label + '/wikipedia/' + category
		self.raw_dir = RAW_DATA_DIR + '/' + label + '/wikipedia/' + category
		self.wiki = WikiApi({})
		self.visited_results = self.get_results(self.del_phrases)
		self.count = 0
Example #23
def get_full_name_from_wiki(name):
    wiki = WikiApi()
    results = wiki.find(name)
    if len(results) > 0:
        article = wiki.get_article(results[0])
        new_name = article.summary
        new_name = new_name[:new_name.find('(')-1]
        if new_name.find(' refer ') != -1:
            if len(results) > 1:
                article = wiki.get_article(results[1])
                new_name = article.summary
                new_name = new_name[:new_name.find('(') - 1]
            else:
                return None
        table = str.maketrans({key: None for key in string.punctuation + '\r\n'})
        new_name = new_name.translate(table)
        if len(new_name) > 4 and len(new_name) < 50:
            return new_name
        else:
            return None
    else:
        return None
Example #24
def get_security_results(filenames):
    """
    Pre-fill visited with security term results.
    """
    global visited_results

    wiki = WikiApi({})

    phrases = []
    for filename in filenames:
        lines = readLines(filename)
        for line in lines:
            line = line.strip()
            if ((len(line) > 0) and (line[0] != '#')):
                if (line[0] == '/'):
                    phrases.append(line.split(' ', 1)[1])
                else:
                    phrases.append(line)

    for phrase in phrases:
        results = wiki.find(phrase)
        for result in results:
            if (result not in visited_results):
                visited_results.append(result)
Example #26
    def setUp(self):
        """Set up all of the requirements for testing
        """

        self.pos_lex = naivebayes.generate('sentiment/pos.txt',
                                           naivebayes.lexicon())
        self.neg_lex = naivebayes.generate('sentiment/neg.txt',
                                           naivebayes.lexicon())
        self.wiki = WikiApi()
        self.r = praw.Reddit(client_id='l-Gz5blkt7GCUg',
                             client_secret='_xLEgNing89k6__sWItU1_j9aR8',
                             user_agent='testscript by /u/pbexe')
        self.test_sentence = 'The cat sat on the mat. The dog however, did not!'
        self.test_sentence_tokenized = [[('The', 'DT'), ('cat', 'NN'),
                                         ('sat', 'VBD'), ('on', 'IN'),
                                         ('the', 'DT'), ('mat', 'NN'),
                                         ('.', '.')],
                                        [('The', 'DT'), ('dog', 'NN'),
                                         ('however', 'RB'), (',', ','),
                                         ('did', 'VBD'), ('not', 'RB'),
                                         ('!', '.')]]
        self.test_sentence_with_entities = 'Dr Foster went to Glouster'
        self.test_sentence_with_entities_nodes = ['Dr Foster', 'Glouster']
        self.story = Story(source='http://example.com/',
                           content='This is a title')
        self.story.save()
        self.node1 = Node(name='Key word 1',
                          date=timezone.now(),
                          collectedFrom=self.story)
        self.node1.save()
        self.node2 = Node(name='Key word 2',
                          date=timezone.now(),
                          collectedFrom=self.story)
        self.node2.save()
        self.node3 = Node(name='Key word 3',
                          date=timezone.now(),
                          collectedFrom=self.story)
        self.node3.save()
Example #27
def get_wiki_phrases(word):
    wiki = WikiApi()
    wiki = WikiApi({'locale': 'en'})
    results = wiki.find(word)
    print results
    phrase = ""
    for i in range(min(4, len(results))):
        article = wiki.get_article(results[i])
        #print article.content
        phrase = phrase + " " + article.content
        #print phrase
    rake_object = rake.Rake("SmartStoplist.txt", 4, 3, 10)

    #Now, we have a RAKE object that extracts keywords where:
    #   Each word has at least 4 characters
    #   Each phrase has at most 3 words
    #   Each keyword appears in the text at least 10 times
    keywords = rake_object.run(phrase)
    return keywords[0:20]
Example #28
class WikiGrabber(object):
    """
    Class to grab the wiki articles.
    """

    def __init__(self, filenames):
        """
        Initialize the WikiGrabber class.
        """
        self.glossary = Glossary(filenames)
        self.wiki = WikiApi({})


    def get_articles(self, dir_name):
        """
        Get wiki articles for all the phrases and convert to xml.
        """
        global visited_results
        step = 1000 + len(visited_results)
        try:
            for phrase, flag in self.glossary.phrases:
                print phrase
                results = self.wiki.find(phrase)
                for result in results:
                    if (result not in visited_results):
                        article = self.wiki.get_article(result)
                        self.article_to_xml(article, flag, dir_name)
                        visited_results.append(result)
                        if (len(visited_results) > step):
                            print phrase, len(visited_results)
                            step = step + 1000
        except:
            print phrase, len(visited_results)


    def article_to_xml(self, article, flag, dir_name):
        """
        Create an XML document from the article.
        """
        try:
            docId = 'Wiki_' + datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
            docType = 'Wiki'
            docSource = 'wikipedia'
            docDate = ''
            docTitle = article.heading
            docDesc = clean(article.summary)

            if (len(docDesc.split()) < WORD_LEN_THRESHOLD):
                return 

            if (flag and ('security' not in docDesc.lower())):
                return

            document = lb.E.Document(
                lb.E.Title(docTitle),
                lb.E.Date(docDate),
                lb.E.Description(docDesc),
                id=docId, type=docType, src=docSource)		
            doc = etree.tostring(document, pretty_print=True)

            xml_filename = dir_name + docId + '.xml'
            writeString(xml_filename, XML_HEAD + doc)
        except Exception as e:
            print e
Example #29
class Wikipedia_Scanner(object):
	"""
	Class to scan Wikipedia articles.
	"""

	def __init__(self, add_gloss_list, del_gloss_list, category, label):
		"""
		Initialize the class.
		"""
		self.add_phrases = get_phrases(add_gloss_list)
		self.del_phrases = get_phrases(del_gloss_list)
		self.category = category
		self.corpus_dir = CORPUS_DIR + '/' + label + '/wikipedia/' + category
		self.raw_dir = RAW_DATA_DIR + '/' + label + '/wikipedia/' + category
		self.wiki = WikiApi({})
		self.visited_results = self.get_results(self.del_phrases)
		self.count = 0


	def get_results(self, phrases):
		"""
		Return dictionary of wiki results corresponding to phrases.
		"""
		visited_results = {}
		for phrase in phrases:
			results = self.wiki.find(phrase)
			for result in results:
				if (not visited_results.has_key(result)):
					visited_results[result] = True
		return visited_results


	def get_articles(self):
		"""
		Fetch articles and put them in the data directory.
		"""
		for phrase in self.add_phrases:
			try:
				results = self.wiki.find(phrase)
				for result in results:
					if (not self.visited_results.has_key(result)):
						self.visited_results[result] = True

						article = self.wiki.get_article(result)
						entry_src = 'wikipedia_' + self.category
						entry_type = 'article'
						entry_id = 'wikipedia_' + result.replace(' ', '_').replace('/', '_')
						entry_title = article.heading
						entry_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
						entry_desc = clean(article.summary)

						if (''.join(entry_desc.split()) != ''):
							xml_string = bundle_xml(entry_src, entry_type, entry_id, entry_title, entry_date, entry_desc)
			
							write_string(self.corpus_dir + '/' + entry_id.lower() + '.xml', xml_string, False)
							write_string(self.raw_dir + '/' + entry_id.lower() + '.txt', entry_desc, False)
						
							self.count = self.count + 1
							if (self.count % 100 == 0):
								print 'Scanned ' + str(self.count) + ' wiki articles.'

			except Exception as e:
				print 'Wiki Api Error! [' + str(e) + ']'
Example #30
from wikiapi import WikiApi
wiki = WikiApi()
wiki = WikiApi({ 'locale' : 'en'})

keywords=[]

with open("Important_Names.txt","r") as f:
	for line in f:
		keywords.append(line)

count=0
for word in keywords:
	count=count+1
	results = wiki.find(word.strip('\n'))
	if len(results)!=0:
		article = wiki.get_article(results[0])
		text=article.content.encode('utf-8')
		with open("Web"+str(count)+".txt","w") as f:
			f.write(text)
		print article.url
Example #31
 def __init__(self):
     self.classifier = classifier.Classifier()
     self.wiki = WikiApi()
     self.bad_urls = set(
         [p['url'] for p in self.classifier.non_accepted_pages])
Example #32
 def setUp(self):
     # using an Italian-Emilian locale that is full of unicode symbols
     self.wiki = WikiApi({'locale': 'eml'})
     self.res = self.wiki.find('Bulaggna')[0]
     self.article = None
Example #33
from wikiapi import WikiApi
wiki = WikiApi()
wiki = WikiApi({'locale': 'es'})  # to specify your locale, 'en' is default

wiki.options

results = wiki.find('hereditary myopathies')
print()
Example #34
from bs4 import BeautifulSoup
import os
import time
import matplotlib
matplotlib.use('Agg')
from wikiapi import WikiApi
wiki = WikiApi()

import tools
from answer_finder import AnswerFinder
from config import config as c

with open('./questions.txt') as f:
    questions = [line[:-1] for line in f]

model = AnswerFinder(config=c, restore=True, mode="inference")
print('\n\n\n\n\n\n\n')
print(
    '''Hello! This is an alpha version of a program that reads Wikipedia to answer questions.
The program was written based on the paper https://arxiv.org/pdf/1704.00051.pdf
For more detail [email protected]\n''')

c.inf_threshold = 0.7
while True:
    while True:
        print('What or who do you want to ask about? Example: Barack Obama')
        thing = input()
        results = wiki.find(thing)
        if len(results) > 0:
            print('Ok. I found few wiki pages about {}.'.format(thing))
            break
Example #35
 def wiki_search(self, text):
     wiki = WikiApi()
     results = wiki.find(text)
     article = wiki.get_article(results[0])
     return article
Example #36
 def __init__(self, filenames):
     """
     Initialize the WikiGrabber class.
     """
     self.glossary = Glossary(filenames)
     self.wiki = WikiApi({})
Example #37
from wikiapi import WikiApi

sub = "sachin tendulkar"
wiki = WikiApi()
wiki = WikiApi({'locale': 'en'})
page = wiki.get_article(sub)
print(page.content)
Example #38
#!/usr/bin/env python
#_*_coding:utf8_*_
import os, json, re, codecs, sys, argparse, collections
from pprint import pprint
from wikiapi import WikiApi
# from nltk.corpus import stopwords
from math import sqrt

json_data = {}
mxspath = os.environ.get('MXS_PATH')
n = 0
list_path = []

wiki = WikiApi()
wiki = WikiApi({'locale': 'fr'})


def cut_word(content):
    text = re.sub("[^a-zA-Z]", " ", content)
    words = text.lower().split()
    # stops = set(stopwords.words('french'))
    tags = [w for w in words]
    return (tags)


def merge_tag(tag1=None, tag2=None):
    v1 = []
    v2 = []
    tag_dict1 = collections.Counter(tag1)
    tag_dict2 = collections.Counter(tag2)
    merged_tag = set()
Example #39
class Wikipedia_Scanner(object):
    """
	Class to Scann wikipedia articles.
	"""
    def __init__(self, add_gloss_list, del_gloss_list, category, label):
        """
		Initialize the class.
		"""
        self.add_phrases = get_phrases(add_gloss_list)
        self.del_phrases = get_phrases(del_gloss_list)
        self.category = category
        self.corpus_dir = CORPUS_DIR + '/' + label + '/wikipedia/' + category
        self.raw_dir = RAW_DATA_DIR + '/' + label + '/wikipedia/' + category
        self.wiki = WikiApi({})
        self.visited_results = self.get_results(self.del_phrases)
        self.count = 0

    def get_results(self, phrases):
        """
		Return dictionary of wiki results corresponding to phrases.
		"""
        visited_results = {}
        for phrase in phrases:
            results = self.wiki.find(phrase)
            for result in results:
                if (not visited_results.has_key(result)):
                    visited_results[result] = True
        return visited_results

    def get_articles(self):
        """
		Fetches articles and puts in data directory.
		"""
        for phrase in self.add_phrases:
            try:
                results = self.wiki.find(phrase)
                for result in results:
                    if (not self.visited_results.has_key(result)):
                        self.visited_results[result] = True

                        article = self.wiki.get_article(result)
                        entry_src = 'wikipedia_' + self.category
                        entry_type = 'article'
                        entry_id = 'wikipedia_' + result.replace(
                            ' ', '_').replace('/', '_')
                        entry_title = article.heading
                        entry_date = datetime.now().strftime(
                            '%Y-%m-%d_%H-%M-%S-%f')
                        entry_desc = clean(article.summary)

                        if (''.join(entry_desc.split()) != ''):
                            xml_string = bundle_xml(entry_src, entry_type,
                                                    entry_id, entry_title,
                                                    entry_date, entry_desc)

                            write_string(
                                self.corpus_dir + '/' + entry_id.lower() +
                                '.xml', xml_string, False)
                            write_string(
                                self.raw_dir + '/' + entry_id.lower() + '.txt',
                                entry_desc, False)

                            self.count = self.count + 1
                            if (self.count % 100 == 0):
                                print 'Scanned ' + str(
                                    self.count) + ' wiki articles.'

            except Exception as e:
                print 'Wiki Api Error! [' + str(e) + ']'
Example #40
class Scraper:

    prohibited_headers = set(['Contents', 'See also', 'References'])

    # The scraper uses the classifier to only send out articles that are more likely to
    # be music related
    def __init__(self):
        self.classifier = classifier.Classifier()
        self.wiki = WikiApi()
        self.bad_urls = set(
            [p['url'] for p in self.classifier.non_accepted_pages])

    # The stream method is used for scraping a large number of maximum links.
    # This method does not implement the classifier filtering because its main
    # purpose is for building the database of pages for manual classification
    def stream(self, start_term, maxLinks):
        finished, queue, search_results = self.scrape_common(start_term)

        for i in range(maxLinks):
            if queue.empty():
                break
            current_url = queue.get()
            while current_url in finished:
                current_url = queue.get()
            (page, urls) = self.process_page(current_url)
            finished.add(current_url)
            for u in urls:
                queue.put(u)
            yield page

    # The scrape method is used for a smaller number of maximum links. It performs
    # a breadth first search given an initial term. It uses a queue to keep track
    # of the pages to be scraped and a set of the already scraped to prevent
    # duplicates
    def scrape(self, start_term, maxLinks):
        finished, queue, search_results = self.scrape_common(start_term)
        pages = []

        while len(pages) < maxLinks:
            if queue.empty():
                break
            current_url = queue.get()
            while current_url in finished:
                current_url = queue.get()
            (page, urls) = self.process_page(current_url)
            finished.add(current_url)

            # Only if the classifier predicts it as a good page, a page will
            # be added to the pages list which is returned at the end
            if self.classifier.classify(
                    page) == 1 and page.url not in self.bad_urls:
                pages.append(page)
                print page.name
            for u in urls:
                queue.put(u)
        return pages

    # Common code for both methods that crawl wikipedia
    def scrape_common(self, start_term):
        finished = set()
        queue = Queue()
        search_results = self.wiki.find(start_term)
        if not search_results:
            print 'No pages found. Try a different term'
        else:
            queue.put('https://en.wikipedia.org/wiki/' + search_results[0])
        return finished, queue, search_results

    # Process a page's HTML using BeautifulSoup to extract useful information
    def process_page(self, url):
        html = self.wiki.get(url)

        soup = BeautifulSoup(html)
        body_html = soup.find(id='mw-content-text')
        title_tag = soup.find(id='firstHeading')
        if title_tag.string is None:
            contents = title_tag.contents
            string_contents = []
            for c in contents:
                if type(c) != str:
                    string_contents.append(c.string)
                else:
                    string_contents.append(c)
            title = ''.join(string_contents)
        else:
            title = title_tag.string

        urls, links_text, media_link_count = self.find_urls(body_html)
        (clean_text, headers) = self.clean_html(body_html)
        page = Page(url, title, clean_text, headers, links_text,
                    media_link_count)
        return (page, urls)

    # Find all URLs in a given HTML that redirect to another article in Wikipedia
    # Page links and media links (pictures, audio) are stored in different lists
    # but are both used.
    def find_urls(self, html):
        link_urls = []
        good_link = re.compile('/wiki/')
        bad_link = re.compile('.*:.*|.*\..*|.*\(disambiguation\)')
        media_link = re.compile('.*\.jpg|.*\.ogg')
        media_link_count = 0
        media_found = set()
        links_text = dd(int)

        all_links = html.find_all('a')
        for l in all_links:
            link = l.get('href')
            content = self.extract_content([l])[0]
            if good_link.match(link) and not bad_link.match(link):
                link_urls.append('https://en.wikipedia.org' + link)

                if str(content) != '':
                    links_text[content] = links_text[content] + 1

            elif media_link.match(link):
                if link not in media_found:
                    media_link_count += 1
                    media_found.add(link)

                if str(content) != '':
                    links_text[content] = links_text[content] + 1

        return (link_urls, links_text, media_link_count)

    # Function to extract the body and the headers of an article
    def clean_html(self, html):
        paragraphs = html.find_all('p')
        headers = html.find_all(re.compile('h\d'))
        clean_text = ''.join(self.extract_content(paragraphs))
        headers_list = self.clean_headers(headers)
        return (clean_text, headers_list)

    # Clean the list of headers of the prohibited, common headers
    def clean_headers(self, array):
        raw_headers = self.extract_content(array)
        final_headers = []
        for h in raw_headers:
            if h not in Scraper.prohibited_headers:
                final_headers.append(h)
        return final_headers

    # Function to clean the HTML body of a page. It removes common links that
    # would cause noise in our system such as [edit] buttons and reference numbers
    # e.g. [2].
    def extract_content(self, array):
        for i in range(len(array)):
            array[i] = re.sub(r'<[^>]*>', '', str(array[i]))
            array[i] = re.sub(r'\[edit\]', '', str(array[i]))
            array[i] = re.sub(r'\[\d*\]', '', str(array[i]))
            array[i] = re.sub(r'\^', '', str(array[i]))
        return array
Example #41
# -*- coding: utf-8 -*-
from wikiapi import WikiApi
import unittest

wiki = WikiApi({})
results = wiki.find('Bill Clinton')
article = wiki.get_article(results[0])  # taking first search result


class TestWiki(unittest.TestCase):
    def test_heading(self):
        self.assertIsNotNone(article.heading)

    def test_image(self):
        self.assertTrue(isinstance(article.image, str))

    def test_summary(self):
        self.assertGreater(len(article.summary), 100)

    def test_content(self):
        self.assertGreater(len(article.content), 200)

    def test_references(self):
        self.assertTrue(isinstance(article.references, list))

    def test_url(self):
        self.assertEqual(article.url,
                         u"http://en.wikipedia.org/wiki/Bill_Clinton")

    def test_get_relevant_article(self):
        keywords = ['president', 'hilary']
Example #42
 def setUp(self):
     # using an Italian-Emilian locale that is full of unicode symbols
     self.wiki = WikiApi({'locale': 'eml'})
     self.res = self.wiki.find('Bulaggna')[0]
     self.article = None
Example #43
 def __init__(self):
     self.classifier = classifier.Classifier()
     self.wiki = WikiApi()
     self.bad_urls = set([p['url'] for p in self.classifier.non_accepted_pages])
Example #44
class Scraper:

    prohibited_headers = set(['Contents', 'See also', 'References'])

    # The scraper uses the classifier to only send out articles that are more likely to
    # be music related
    def __init__(self):
        self.classifier = classifier.Classifier()
        self.wiki = WikiApi()
        self.bad_urls = set([p['url'] for p in self.classifier.non_accepted_pages])

    # The stream method is used for scraping a large number of maximum links.
    # This method does not implement the classifier filtering because its main
    # purpose is for building the database of pages for manual classification
    def stream(self, start_term, maxLinks):
        finished, queue, search_results = self.scrape_common(start_term)

        for i in range(maxLinks):
            if queue.empty():
                break
            current_url = queue.get()
            while current_url in finished:
                current_url = queue.get()
            (page, urls) = self.process_page(current_url)
            finished.add(current_url)
            for u in urls:
                queue.put(u)
            yield page

    # The scrape method is used for a smaller number of maximum links. It performs
    # a breadth first search given an initial term. It uses a queue to keep track
    # of the pages to be scraped and a set of the already scraped to prevent 
    # duplicates
    def scrape(self, start_term, maxLinks):
        finished, queue, search_results = self.scrape_common(start_term)
        pages = []

        while len(pages) < maxLinks:
            if queue.empty():
                break
            current_url = queue.get()
            while current_url in finished:
                current_url = queue.get()
            (page, urls) = self.process_page(current_url)
            finished.add(current_url)

            # Only if the classifier predicts it as a good page, a page will
            # be added to the pages list which is returned at the end
            if self.classifier.classify(page) == 1 and page.url not in self.bad_urls:
                pages.append(page)
                print page.name
            for u in urls:
                queue.put(u)
        return pages

    # Common code for both methods that crawl wikipedia
    def scrape_common(self, start_term):
        finished = set()
        queue = Queue()
        search_results = self.wiki.find(start_term)
        if not search_results:
            print 'No pages found. Try a different term'
        else:
            queue.put('https://en.wikipedia.org/wiki/' + search_results[0])
        return finished, queue, search_results

    # Process a page's HTML using BeautifulSoup to extract useful information
    def process_page(self, url):
        html = self.wiki.get(url)

        soup = BeautifulSoup(html)
        body_html = soup.find(id='mw-content-text')
        title_tag = soup.find(id='firstHeading')
        if title_tag.string is None:
            contents = title_tag.contents
            string_contents = []
            for c in contents:
                if type(c) != str:
                    string_contents.append(c.string)
                else:
                    string_contents.append(c)
            title = ''.join(string_contents)
        else:
            title = title_tag.string

        urls, links_text, media_link_count = self.find_urls(body_html)
        (clean_text, headers) = self.clean_html(body_html)
        page = Page(url, title, clean_text, headers, links_text, media_link_count)
        return (page, urls)

    # Find all URLs in a given HTML that redirect to another article in Wikipedia
    # Page links and media links (pictures, audio) are stored in different lists
    # but are both used.
    def find_urls(self, html):
        link_urls = []
        good_link = re.compile('/wiki/')
        bad_link = re.compile('.*:.*|.*\..*|.*\(disambiguation\)')
        media_link = re.compile('.*\.jpg|.*\.ogg')
        media_link_count = 0
        media_found = set()
        links_text = dd(int)

        all_links = html.find_all('a')
        for l in all_links:
            link = l.get('href')
            content = self.extract_content([l])[0]
            if good_link.match(link) and not bad_link.match(link):
                link_urls.append('https://en.wikipedia.org' + link)

                if str(content) != '':
                    links_text[content] = links_text[content] + 1

            elif media_link.match(link):
                if link not in media_found:
                    media_link_count += 1
                    media_found.add(link)

                if str(content) != '':
                    links_text[content] = links_text[content] + 1

        return (link_urls, links_text, media_link_count)

    # Function to extract the body and the headers of an article
    def clean_html(self, html):
        paragraphs = html.find_all('p')
        headers = html.find_all(re.compile('h\d'))
        clean_text = ''.join(self.extract_content(paragraphs))
        headers_list = self.clean_headers(headers)
        return (clean_text, headers_list)

    # Clean the list of headers of the prohibited, common headers
    def clean_headers(self, array):
        raw_headers = self.extract_content(array)
        final_headers = []
        for h in raw_headers:
            if h not in Scraper.prohibited_headers:
                final_headers.append(h)
        return final_headers

    # Function to clean the HTML body of a page. It removes common links that
    # would cause noise in our system such as [edit] buttons and reference numbers
    # e.g. [2]. 
    def extract_content(self, array):
        for i in range(len(array)):
            array[i] = re.sub(r'<[^>]*>', '', str(array[i]))
            array[i] = re.sub(r'\[edit\]', '', str(array[i]))
            array[i] = re.sub(r'\[\d*\]', '', str(array[i]))
            array[i] = re.sub(r'\^', '', str(array[i]))
        return array
Example #45
from bs4 import BeautifulSoup
import urllib2

from wikiapi import WikiApi
wiki = WikiApi()
wiki = WikiApi({'locale': 'en'})

# b = wiki.get_article('High Crusade')

# print(b.url)

#results = wiki.find('Barack Obama').content

#print(results)


def get_title_from_search(string):
    return wiki.find(string)[0]


def get_url_from_search(string):
    try:
        article_title = wiki.find(string)[0]
    except IndexError:
        return False
    article_contents = wiki.get_article(article_title)
    return article_contents.url


# print(get_url_from_search('Stranger in a Strange Land'))
Example #46
def main():
    status = True
    pygame.mixer.music.play(-1)
    music_status = 1
#   Create a wikiapi instance
    wiki_status = 1
    wiki_instance = WikiApi()
    wiki_instance = WikiApi({'locale': 'en'})
    namespace = None

    index1 = 0
    data_list = []

#   Load weather data into lists and dictionaries
    weather_location = 0
    connector = yweather.Client()
    weather_id_ny = connector.fetch_woeid('New York')
    weather_data_ny = connector.fetch_weather(str(weather_id_ny), metric=True)
    data_dict_ny = {}
    data_dict_ny.update({'Current Temperature': weather_data_ny["condition"]["temp"], \
                    'Sunrise': weather_data_ny['astronomy']['sunrise'],\
                         'Sunset': weather_data_ny['astronomy']['sunset'],
                    'Max Temperature': (str(weather_data_ny['forecast'][0]['high']) + " Degrees C"), \
                    'Min Temperature': (str(weather_data_ny['forecast'][0]['low'] + " Degrees C")),
                    'Wind': (str(weather_data_ny['wind']['speed'] + " km/h")), \
                    'Condition': weather_data_ny['condition']['text']})
    keys_list_ny = data_dict_ny.keys()

    weather_id_buffalo = connector.fetch_woeid('Buffalo')
    weather_data_buffalo = connector.fetch_weather(str(weather_id_buffalo), metric=True)
    data_dict_buffalo = {}
    data_dict_buffalo.update({'Current Temperature': weather_data_buffalo["condition"]["temp"], \
                    'Sunrise': weather_data_buffalo['astronomy']['sunrise'],\
                              'Sunset': weather_data_buffalo['astronomy']['sunset'],
                    'Max Temperature': (str(weather_data_buffalo['forecast'][0]['high']) + " Degrees C"), \
                    'Min Temperature': (str(weather_data_buffalo['forecast'][0]['low'] + " Degrees C")),
                    'Wind': (str(weather_data_buffalo['wind']['speed'] + " km/h")), \
                    'Condition': weather_data_buffalo['condition']['text']})
    keys_list_buffalo = data_dict_buffalo.keys()

    weather_id_hyd = connector.fetch_woeid('Hyderabad')
    weather_data_hyd = connector.fetch_weather(str(weather_id_hyd), metric=True)
    data_dict_hyd = {}
    data_dict_hyd.update({'Current Temperature': weather_data_hyd["condition"]["temp"], \
                    'Sunrise': weather_data_hyd['astronomy']['sunrise'], \
                          'Sunset': weather_data_hyd['astronomy']['sunset'],
                    'Max Temperature': (str(weather_data_hyd['forecast'][0]['high']) + " Degrees C"), \
                    'Min Temperature': (str(weather_data_hyd['forecast'][0]['low'] + " Degrees C")),
                    'Wind': (str(weather_data_hyd['wind']['speed'] + " km/h")), \
                    'Condition': weather_data_hyd['condition']['text']})
    keys_list_hyd = data_dict_hyd.keys()

    while status:
            main_display.fill(black)
            pointer_location = pygame.mouse.get_pos()
            pointer_click = pygame.mouse.get_pressed()
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
                    
#   Next Article and Music Button checks
                if 325 < pointer_location[0] < 405 and 20 < pointer_location[1] < 50:
                    if pointer_click[0] == 1:
                        wiki_status = 1
                if 700 < pointer_location[0] < 780 and 20 < pointer_location[1] < 50:
                    if pointer_click[0] == 1:
                        music_status = not music_status
                        if music_status == 0:
                            pygame.mixer.music.pause()
                        else:
                            pygame.mixer.music.unpause()
#   New York  Button Check
                if 20 < pointer_location[0] < 80 and 20 < pointer_location[1] < 50:
                    if pointer_click[0] == 1:
                        weather_location = 2
                    
#   Buffalo  Button Check
                if 100 < pointer_location[0] < 160 and 20 < pointer_location[1] < 50:
                    if pointer_click[0] == 1:
                        weather_location = 1
                    
#   Hyderabad  Button Check
                if 180 < pointer_location[0] < 240 and 20 < pointer_location[1] < 50:
                    if pointer_click[0] == 1:
                        weather_location = 0
            try:
                main_display.blit(weather_image, (0,0))
            except:
                pass
            
#   Data Display
            if weather_location == 0:
                data_display(110, data_dict_hyd['Current Temperature'], white, 80, 160)  # Temperature number
                data_display(20, "Deg C", white, 180, 130)                           # Degree
                data_display(15, keys_list_hyd[5] + " : " + data_dict_hyd['Condition'], white, 95, 260)  # Condition
                data_display(15, keys_list_hyd[1] + " : " + data_dict_hyd['Min Temperature'], white, 130, 320)
                data_display(15, keys_list_hyd[6] + " : " + data_dict_hyd['Max Temperature'], white, 130, 360)
                data_display(15, keys_list_hyd[4] + " : " + data_dict_hyd['Sunrise'], white, 95, 400)   # Sunrise
                data_display(15, keys_list_hyd[0] + " : " + data_dict_hyd['Sunset'], white, 95, 440)    # Sunset
                data_display(15, keys_list_hyd[3] + " : " + data_dict_hyd['Wind'], white, 95, 480)  # Wind Speed

            elif weather_location == 1:
                data_display(110, data_dict_buffalo['Current Temperature'], white, 80, 160)  # Temperature number
                data_display(20, "Deg C", white, 180, 130)                           # Degree
                data_display(15, keys_list_buffalo[5] + " : " + data_dict_buffalo['Condition'], white, 95, 260)
                data_display(15, keys_list_buffalo[1] + " : " + data_dict_buffalo['Min Temperature'], white, 130, 320)
                data_display(15, keys_list_buffalo[6] + " : " + data_dict_buffalo['Max Temperature'], white, 130, 360)
                data_display(15, keys_list_buffalo[4] + " : " + data_dict_buffalo['Sunrise'], white, 95, 400)
                data_display(15, keys_list_buffalo[0] + " : " + data_dict_buffalo['Sunset'], white, 95, 440)
                data_display(15, keys_list_buffalo[3] + " : " + data_dict_buffalo['Wind'], white, 95, 480)


            elif weather_location == 2:
                data_display(110, data_dict_ny['Current Temperature'], white, 80, 160)  # Temperature number
                data_display(20, "Deg C", white, 180, 130)                           # Degree
                data_display(15, keys_list_ny[5] + " : " + data_dict_ny['Condition'], white, 95, 260)  # Condition
                data_display(15, keys_list_ny[1] + " : " + data_dict_ny['Min Temperature'], white, 130, 320)
                data_display(15, keys_list_ny[6] + " : " + data_dict_ny['Max Temperature'], white, 130, 360)
                data_display(15, keys_list_ny[4] + " : " + data_dict_ny['Sunrise'], white, 95, 400)   # Sunrise
                data_display(15, keys_list_ny[0] + " : " + data_dict_ny['Sunset'], white, 95, 440)    # Sunset
                data_display(15, keys_list_ny[3] + " : " + data_dict_ny['Wind'], white, 95, 480)  # Wind Speed
                
#   Display Wiki Article
            if wiki_status == 1:
                del data_list[:]
                wiki_status = 0
                blahblah = True
                try:
                    url = 'http://en.wikipedia.org/wiki/Special:Random'
                    if namespace is not None:
                        url += '/' + namespace
                    req = urllib2.Request(url, None, { 'User-Agent' : 'x'})
                    page = urllib2.urlopen(req).readlines()
                    wiki_draft1 = remove_tags(page[4])
                    wiki_title = wiki_draft1[:wiki_draft1.index('Wikipedia') - 2]
                    wiki_data_list = wiki_instance.find(wiki_title)
                    wiki_data = wiki_instance.get_article(wiki_data_list[0])
                    temp = endlinefunction(wiki_data.summary, data_list, 90)
                except (urllib2.HTTPError, urllib2.URLError):
                    print "Failed to get article"
                    raise
                    
#   Buttons and Division Display
            pygame.draw.rect(main_display, white, (300, 0, 5, 600))
            pygame.draw.rect(main_display, white, (300, 70, 500, 5))
            drawbutton(wood, 700, 20, 80, 30, 10, "Toggle Music", black)
            drawbutton(white, 20, 20, 60, 30, 10, "New York", black)
            drawbutton(white, 100, 20, 60, 30, 10, "Buffalo", black)
            drawbutton(white, 180, 20, 60, 30, 10, "Hyderabad", black)
            drawbutton(wood, 325, 20, 80, 30, 10, "Next Article", black)
            
#   Cursor Display
            data_display(15, wiki_data.heading, wood, 540, 130)
            y_cood = 150
            j = 25
            for i in range(0, len(data_list)):
                y_cood = y_cood + j
                data_display(10, data_list[i], black, 540, y_cood)
            clock.tick(100)
            pygame.display.flip()
Example #47
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from articleData import my_articles
import requests
from wikiapi import WikiApi

wiki = WikiApi({'locale': 'es'})


def getURL(searchQuery):
    results = wiki.find(searchQuery)

    try:
        article = wiki.get_article(results[0])
    except:
        article = "no article exists for: " + searchQuery

    try:
        url = article.url
    except:
        url = "no url exists for: " + searchQuery

    # try:
    #     summary = article.summary
    # except:
    #     summary

    print url
    # print summary

Example #48
#!/usr/bin/python3
# -*- coding: utf-8 -*-

from wikiapi import WikiApi
import requests, pprint

# This is suitable for extracting content that is organized by pages under a title
# This code requires the wiki-api python library created by Richard O'Dwyer of UK
# https://github.com/richardasaurus/wiki-api

wiki = WikiApi()
wiki = WikiApi({'locale': 'ta'})  # to specify your locale, 'en' is default


# Get the page text of the article with the given title
def getArticleParagraphs(title):
    print(title)
    articleFull = wiki.get_article(title)
    fullText = articleFull.content

    article = ""
    paragraphs = fullText.split('\n\n')
    # print(paragraphs)
    # We want only whole paragraphs that end in a ".", "!", "?" or '"' not fragments
    for paragraph in paragraphs:
        if len(paragraph) > 30:
            end = paragraph[-1]
            if end == '.' or end == '!' or end == '?' or end == '"':
                article = article + "\n\n" + paragraph
    return article
Example #49
except Exception as e:
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('words')
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
from nltk.corpus import wordnet
import sys
import os
from wikiapi import WikiApi
import urllib2
import html2text

wiki_ = WikiApi()
common_ = set(nltk.corpus.words.words())

isNoun = lambda x: x[:2] == 'NN'


def url_exists(url):
    ret = urllib2.urlopen(url)
    if ret.code == 200:
        return True
    return False


def wiki_link(query):
    wikiLink = 'https://en.wikipedia.org/wiki/%s' % query
    if url_exists(wikiLink):
Example #50
'''
Translate the names in the VGGFace2 dataset into English.
You will need to manually install the following libraries:
unidecode
googletrans
wikiapi
author: Feng Wang (UESTC)
'''
import os
import csv
import string
import unidecode
from googletrans import Translator
translator = Translator()
from wikiapi import WikiApi
wiki = WikiApi()

def is_number(uchar):
    return uchar >= u'0' and uchar<=u'9'

def is_alphabet(uchar):
    return (uchar >= u'a' and uchar<=u'z') or (uchar >= u'A' and uchar<=u'Z')

def check_english(name):
    flag = True
    for uchar in name:
        if (not is_alphabet(uchar)) and (not is_number(uchar)) and (uchar != u'\u0020') and (uchar != u'-') and (uchar != u'.'):
            flag = False
    return flag

def non_english_character_count(name):
Example #51
    restagcont = []
    for i in range(len(ttcontent)):
        if ttcontent[i][1]:
            # print ttcontent[i][0], ":", ttcontent[i][1]
            restagcont.append(ttcontent[i][0].lower())
    keyw = []
    stpw = ["navigation", "search", "about", "http", "edit", "Intro",
            "read", "help", "removed", "above"]
    for i in restagcont:
        if i not in stpw:
            keyw.append(i)

    return keyw
#************************************
#************************************

wiki = WikiApi({})
dic_cont = {}  # dictionary
mlist = []  # word base
#************************************

for wtopic in file1.readlines():
    w = wtopic.split()
    mlist.append(w[0])
    results = wiki.find(w[0])
    if results:

        article = wiki.get_article(results[0])
        r = article.content
        rtoken = wordpunct_tokenize(r)

        # implementation of stopwords
Example #52
# -*- coding: utf-8 -*-
from wikiapi import WikiApi
import unittest

wiki = WikiApi({})
results = wiki.find('Bill Clinton')
article = wiki.get_article(results[0])  # taking first search result


class TestWiki(unittest.TestCase):
    def test_heading(self):
        self.assertIsNotNone(article.heading)

    def test_image(self):
        self.assertTrue(isinstance(article.image, str))

    def test_summary(self):
        self.assertGreater(len(article.summary), 100)

    def test_content(self):
        self.assertGreater(len(article.content), 200)

    def test_references(self):
        self.assertTrue(isinstance(article.references, list))

    def test_get_relevant_article(self):
        keywords = ['president', 'hilary']
        _article = wiki.get_relevant_article(results, keywords)
        self.assertTrue('Bill Clinton' in _article.heading)

    def test_get_relevant_article_no_result(self):