def test_cache_not_populated_when_disabled(self):
    wiki = WikiApi({'cache': False})

    assert self._get_cache_size(wiki) == 0
    wiki.find('Bob Marley')
    assert self._get_cache_size(wiki) == 0

    shutil.rmtree(wiki.cache_dir, ignore_errors=True)
def test_cache_populated(self):
    wiki = WikiApi({'cache': True, 'cache_dir': '/tmp/wikiapi-test'})

    assert self._get_cache_size(wiki) == 0
    # Make multiple calls to ensure no duplicate cache items created
    assert wiki.find('Bob Marley') == wiki.find('Bob Marley')
    assert self._get_cache_size(wiki) == 1

    # Check cache keys are unique
    assert wiki.find('Tom Hanks') != wiki.find('Bob Marley')
    assert self._get_cache_size(wiki) == 2

    shutil.rmtree(wiki.cache_dir, ignore_errors=True)
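# The two cache tests above call a _get_cache_size helper that is not shown
# here. A minimal sketch of such a helper on the test class, assuming the cache
# writes one file per request into wiki.cache_dir (name and logic illustrative):
import os

def _get_cache_size(self, wiki):
    # Count cached files; a missing cache directory counts as an empty cache.
    if not os.path.isdir(wiki.cache_dir):
        return 0
    return len(os.listdir(wiki.cache_dir))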
def set_up(self):
    # using an Italian-Emilian locale that is full of unicode symbols
    wiki = WikiApi({'locale': 'eml'})
    result = wiki.find('Bulaggna')[0]
    return {
        'wiki': wiki,
        'result': result,
    }
def set_up(self):
    wiki = WikiApi()
    results = wiki.find('Bill Clinton')
    article = wiki.get_article(results[0])
    return {
        'wiki': wiki,
        'results': results,
        'article': article,
    }
def get_url(query, log_file):
    wiki = WikiApi()
    results = wiki.find(query)
    if len(results) == 0:
        sys.stderr.write("No wikipedia article found for '" + query + "'\n")
    else:
        article = wiki.get_article(results[0])
        print article.url
        with open(log_file, 'a') as f:
            f.write(article.url + "\n")
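# Illustrative call to get_url above; the query and log-file name are
# example values, not taken from the original script:
get_url('Bill Clinton', 'wiki_urls.log')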
def wikiqueryresults(searchQuery):
    wiki = WikiApi({})
    wiki = WikiApi({'locale': 'en'})  # to specify your locale; 'en' is default
    wikiSearch = wiki.find(searchQuery)
    wikiArticle = wiki.get_article(wikiSearch[0])
    return wikiArticle.summary
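# Example use of wikiqueryresults; the query is illustrative only:
print(wikiqueryresults('Bob Marley'))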
def wiki_api(options):
    wiki = WikiApi()
    wiki = WikiApi({'locale': 'en'})  # to specify your locale; 'en' is default
    results = wiki.find(options['q'])
    for result in results:
        article = wiki.get_article(result)
        title = article.heading
        url = article.url
        print(url)
        link = Link(topic=options['topic'], title=title, url=url)
        link.save()
class TestUnicode(unittest.TestCase):
    def setUp(self):
        # using an Italian-Emilian locale that is full of unicode symbols
        self.wiki = WikiApi({'locale': 'eml'})
        self.res = self.wiki.find('Bulagna')[0]
        self.article = None

    def test_search(self):
        # this is urlencoded.
        self.assertEqual(self.res, u'Bul%C3%A5ggna')

    def test_article(self):
        # unicode errors will likely blow in your face here
        self.assertIsNotNone(self.wiki.get_article(self.res))
class TestUnicode:
    @pytest.fixture(autouse=True)
    def set_up(self):
        # using an Italian-Emilian locale that is full of unicode symbols
        self.wiki = WikiApi({'locale': 'eml'})
        self.result = self.wiki.find('Bulaggna')[0]

    def test_search(self):
        # this is urlencoded.
        assert self.result == 'Bul%C3%A5ggna'

    def test_article(self):
        # unicode errors will likely blow in your face here
        assert self.wiki.get_article(self.result) is not None
def wiki(tokens, message):
    print("\033[1;34;1m")
    print("\nHazel : Please wait while I surf the web for a result")
    try:
        wiki = WikiApi()
        WikiApi({'locale': 'en'})
        if "search" in tokens:
            tokens.remove("search")  # remove search keyword to retrieve the main content to be searched
        if "what" in tokens:
            tokens.remove("what")
        if "who" in tokens:
            tokens.remove("who")
        if "look" in tokens:
            tokens.remove("look")
        if "tell" in tokens:
            tokens.remove("tell")
        if "more" in tokens:
            tokens.remove("more")
        if "about" in tokens:
            tokens.remove("about")
        stop_words = set(stopwords.words('english'))  # Remove stop words
        filtered_sentence = [w for w in tokens if not w in stop_words]
        filtered_sentence = []
        for w in tokens:
            # Filtering input by removing stopwords such as 'I', 'for', 'is', etc.
            if w not in stop_words:
                filtered_sentence.append(w)  # Get and store message without stopwords
        filtered_list = filtered_sentence
        filtered_sentence = ' '.join(filtered_list)  # Making a sentence out of the tokens
        message = filtered_sentence  # storing input in message
        tokens = word_tokenize(message)  # tokenize new message
        s = ""  # appends the remaining tokens to be searched for
        for i in tokens:
            s = s + i + " "  # appending the tokens to form a search keyword
        results = wiki.find(s)  # package function to do online searches
        # print("websearch\n")
        if results == "":
            results = "null"
        print("\nFound result for : ", results[0])  # print the first search result
        print("\033[1;37;1m")  # set console color
        print(wi.summary(s))
        # main()
    except Exception as e:
        print("I didnt get that. You may want to try that again")
def wiki_search(query):
    wiki = WikiApi()
    wikiurls = []
    lst = query.split(",")
    num = 10 / len(lst)
    # print num
    for i in lst:
        results = wiki.find(i)
        cnt = 0
        for j in results:
            cnt = cnt + 1
            article = wiki.get_article(j)
            wikiurls.append(article.url)
            if cnt == num:
                break
    return wikiurls
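# Usage sketch for wiki_search above: the comma-separated query is split into
# terms and roughly ten article URLs are collected across them (example terms):
urls = wiki_search('Bob Marley,Bill Clinton')
for url in urls:
    print(url)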
def get_wiki_phrases(word):
    wiki = WikiApi()
    wiki = WikiApi({'locale': 'en'})
    results = wiki.find(word)
    print results
    phrase = ""
    for i in range(min(4, len(results))):
        article = wiki.get_article(results[i])
        # print article.content
        phrase = phrase + " " + article.content
    # print phrase
    rake_object = rake.Rake("SmartStoplist.txt", 4, 3, 10)
    # Now, we have a RAKE object that extracts keywords where:
    # Each word has at least 4 characters
    # Each phrase has at most 3 words
    # Each keyword appears in the text at least 4 times
    keywords = rake_object.run(phrase)
    return keywords[0:20]
def get_full_name_from_wiki(name):
    wiki = WikiApi()
    results = wiki.find(name)
    if len(results) > 0:
        article = wiki.get_article(results[0])
        new_name = article.summary
        new_name = new_name[:new_name.find('(') - 1]
        if new_name.find(' refer ') != -1:
            if len(results) > 1:
                article = wiki.get_article(results[1])
                new_name = article.summary
                new_name = new_name[:new_name.find('(') - 1]
            else:
                return None
        table = str.maketrans({key: None for key in string.punctuation + '\r\n'})
        new_name = new_name.translate(table)
        if len(new_name) > 4 and len(new_name) < 50:
            return new_name
        else:
            return None
    else:
        return None
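# Illustrative call to get_full_name_from_wiki; it returns a cleaned-up name
# taken from the article summary, or None when no usable article is found:
full_name = get_full_name_from_wiki('Bill Clinton')
print(full_name)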
def get_security_results(filenames):
    """ Pre-fill visited with security term results. """
    global visited_results
    wiki = WikiApi({})
    phrases = []
    for filename in filenames:
        lines = readLines(filename)
        for line in lines:
            line = line.strip()
            if ((len(line) > 0) and (line[0] != '#')):
                if (line[0] == '/'):
                    phrases.append(line.split(' ', 1)[1])
                else:
                    phrases.append(line)
    for phrase in phrases:
        results = wiki.find(phrase)
        for result in results:
            if (result not in visited_results):
                visited_results.append(result)
class Wikipedia_Scanner(object):
    """ Class to scan wikipedia articles. """

    def __init__(self, add_gloss_list, del_gloss_list, category, label):
        """ Initialize the class. """
        self.add_phrases = get_phrases(add_gloss_list)
        self.del_phrases = get_phrases(del_gloss_list)
        self.category = category
        self.corpus_dir = CORPUS_DIR + '/' + label + '/wikipedia/' + category
        self.raw_dir = RAW_DATA_DIR + '/' + label + '/wikipedia/' + category
        self.wiki = WikiApi({})
        self.visited_results = self.get_results(self.del_phrases)
        self.count = 0

    def get_results(self, phrases):
        """ Return dictionary of wiki results corresponding to phrases. """
        visited_results = {}
        for phrase in phrases:
            results = self.wiki.find(phrase)
            for result in results:
                if (not visited_results.has_key(result)):
                    visited_results[result] = True
        return visited_results

    def get_articles(self):
        """ Fetches articles and puts in data directory. """
        for phrase in self.add_phrases:
            try:
                results = self.wiki.find(phrase)
                for result in results:
                    if (not self.visited_results.has_key(result)):
                        self.visited_results[result] = True
                        article = self.wiki.get_article(result)
                        entry_src = 'wikipedia_' + self.category
                        entry_type = 'article'
                        entry_id = 'wikipedia_' + result.replace(' ', '_').replace('/', '_')
                        entry_title = article.heading
                        entry_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
                        entry_desc = clean(article.summary)
                        if (''.join(entry_desc.split()) != ''):
                            xml_string = bundle_xml(entry_src, entry_type, entry_id,
                                                    entry_title, entry_date, entry_desc)
                            write_string(self.corpus_dir + '/' + entry_id.lower() + '.xml',
                                         xml_string, False)
                            write_string(self.raw_dir + '/' + entry_id.lower() + '.txt',
                                         entry_desc, False)
                            self.count = self.count + 1
                            if (self.count % 100 == 0):
                                print 'Scanned ' + str(self.count) + ' wiki articles.'
            except Exception as e:
                print 'Wiki Api Error! [' + str(e) + ']'
print(grouped)

# "calculate" genders from Wikipedia articles
gender = []
seen = {}  # memoization: author -> gender
wiki = WikiApi()
for author in grouped.column("AUTHOR"):
    if author.lower() in seen:
        print(author, "already found previously")
        gender.append(seen[author.lower()])
        continue
    try:
        try:
            print("trying to find " + author + " in wikipedia")
            results = wiki.find(author)
            wikipedia_page = wiki.get_article(results[0]).url
        except Exception:
            # errors when article is not found; use google search instead.
            # we try to limit the number of google search queries because
            # google limits them for free accounts or something
            print("trying to find " + author + " in google")
            wikipedia_page = google_search(author + ' site: en.wikipedia.org', num=1)[0]['link']
        g = find_gender(wikipedia_page)
    except Exception:
        # TODO: Possibly search on google for the book title and author if still
        # not found, and find some other site that has pronouns on it, if there
from wikiapi import WikiApi

wiki = WikiApi()
wiki = WikiApi({'locale': 'es'})  # to specify your locale; 'en' is default
wiki.options
results = wiki.find('hereditary myopathies')
print()
# -*- coding: utf-8 -*-
from wikiapi import WikiApi
import unittest

wiki = WikiApi({})
results = wiki.find('Bill Clinton')
article = wiki.get_article(results[0])  # taking first search result


class TestWiki(unittest.TestCase):
    def test_heading(self):
        self.assertIsNotNone(article.heading)

    def test_image(self):
        self.assertTrue(isinstance(article.image, str))

    def test_summary(self):
        self.assertGreater(len(article.summary), 100)

    def test_content(self):
        self.assertGreater(len(article.content), 200)

    def test_references(self):
        self.assertTrue(isinstance(article.references, list))

    def test_get_relevant_article(self):
        keywords = ['president', 'hilary']
        _article = wiki.get_relevant_article(results, keywords)
        self.assertTrue('Bill Clinton' in _article.heading)

    def test_get_relevant_article_no_result(self):
def wiki_search(self, text):
    wiki = WikiApi()
    results = wiki.find(text)
    article = wiki.get_article(results[0])
    return article
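# wiki_search above is an instance method; a caller would look roughly like
# this (the enclosing class is not shown, so the instance name is hypothetical):
article = bot.wiki_search('Bob Marley')
print(article.heading)
print(article.summary)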
def jarvis(data):
    first = data.split(" ")
    if first[0] == "locate" or first[0] == "location":
        import location
        return location.loco(first[1])
    if (first[0] == "play" or first[0] == "search") and first[1] == "youtube":
        del (first[0])
        del (first[0])
        a = "+".join(first)
        b = " ".join(first)
        import urllib.request
        import urllib.parse
        import re
        query_string = urllib.parse.urlencode({"search_query": a})
        html_content = urllib.request.urlopen(
            "http://www.youtube.com/results?" + query_string)
        search_results = re.findall(r'href=\"\/watch\?v=(.{11})',
                                    html_content.read().decode())
        print("playing:" + a)
        return webbrowser.open("http://www.youtube.com/watch?v=" + search_results[0])
    if first[0] == "google" or first[0] == "search":
        del (first[0])
        a = "+".join(first)
        return webbrowser.open('https://www.google.co.in/search?q=' + a)
    if first[0] == "connect":
        del (first[0])
        a = "".join(first)
        return webbrowser.open(a + ".com")
    if first[0] == "who":
        del (first[0])
        a = "".join(first)
        from wikiapi import WikiApi
        wiki = WikiApi()
        wiki = WikiApi({'locale': 'en'})
        results = wiki.find(a)
        article = wiki.get_article(results[0])
        print(article.summary)
        return webbrowser.open(article.image)
    while (1):
        if data in wikipedia:
            wiki()
            break
        if data in status:
            cpustatus()
            break
        if data in welcome:
            speak("hi there")
            break
        if data in play:
            speak("ok sir")
            playsong()
            break
        if data in newfile:
            writefile()
            break
        if data in readfile:
            readfile()
            break
        if data in searchweb:
            speak("ok sir")
            search()
            break
        if data in time:
            speak(ctime())
            break
        if "close notepad" in data:
            clsnotepad()
            break
        if "close video" in data:
            clsvlc()
            break
        if "close browser" in data:
            clsbrowser()
            break
        if data in display:
            log.display()
            break
        if data in end:
            com = "close"
            return com
            break
        if data in shutdownpc:
            shutdown()
            break
        if data in folders:
            directory()
            break
        if data in closeprogram:
            close()
            break
        else:
            print("I don't understand the command!! Try again")
            break
class TestWiki:
    @pytest.fixture(autouse=True)
    def set_up(self):
        self.wiki = WikiApi()
        self.results = self.wiki.find('Bill Clinton')
        self.article = self.wiki.get_article(self.results[0])

    def test_heading(self):
        assert self.article.heading == 'Bill Clinton'

    def test_image(self):
        assert_url_valid(url=self.article.image)

    def test_summary(self):
        results = self.wiki.find('Tom Hanks')
        article = self.wiki.get_article(results[0])
        assert 'Thomas' in article.summary
        assert 'Jeffrey' in article.summary
        assert 'Hanks' in article.summary

    def test_content(self):
        assert len(self.article.content) > 200

    def test_references(self):
        assert isinstance(self.article.references, list) is True

    def test_url(self):
        assert_url_valid(url=self.article.url)
        assert self.article.url == 'https://en.wikipedia.org/wiki/Bill_Clinton'

    def test_get_relevant_article(self):
        keywords = ['president', 'hilary']
        _article = self.wiki.get_relevant_article(self.results, keywords)
        assert 'Bill Clinton' in _article.heading
        assert len(_article.content) > 5000
        assert 'President Bill Clinton' in _article.content

    def test_get_relevant_article_no_result(self):
        keywords = ['hockey player']
        _article = self.wiki.get_relevant_article(self.results, keywords)
        assert _article is None

    def test__remove_ads_from_content(self):
        content = (
            'From Wikipedia, the free encyclopedia. \n\nLee Strasberg '
            '(November 17, 1901 2013 February 17, 1982) was an American '
            'actor, director and acting teacher.\n'
            'Today, Ellen Burstyn, Al Pacino, and Harvey Keitel lead this '
            'nonprofit studio dedicated to the development of actors, '
            'playwrights, and directors.\n\nDescription above from the '
            'Wikipedia article\xa0Lee Strasberg,\xa0licensed under CC-BY-SA, '
            'full list of contributors on Wikipedia.')

        result_content = self.wiki._remove_ads_from_content(content)

        expected_content = (
            ' \n\nLee Strasberg '
            '(November 17, 1901 2013 February 17, 1982) was an American '
            'actor, director and acting teacher.\n'
            'Today, Ellen Burstyn, Al Pacino, and Harvey Keitel lead this '
            'nonprofit studio dedicated to the development of actors, '
            'playwrights, and directors.')
        assert expected_content == result_content

    @pytest.mark.parametrize(
        'url, expected_tables',
        [
            (
                'https://en.wikipedia.org/wiki/World_population',
                [
                    'Population by continent',
                    '10 most populous countries',
                    '10 most densely populated countries',
                    'Countries ranking highly in both total population and '
                    'population density',
                ],
            ),
            (
                'https://en.wikipedia.org/wiki/List_of_countries_and_'
                'dependencies_by_population',
                ['Sovereign states and dependencies by population'],
            ),
            (
                'https://en.wikipedia.org/wiki/Influenza',
                [],
            ),
            ('https://en.wikipedia.org/wiki/Germany', ['Constituent states']),
            (
                'https://en.wikipedia.org/wiki/Chess_Classic',
                [
                    'Chess Classic Championship',
                    # 'Rapid Chess Open',
                    # 'Chess960 Rapid chess World Championship',
                    'FiNet Open Chess960',
                    # 'Chess960 Computer World Championship',
                ],
            ),
            (
                'https://en.wikipedia.org/wiki/List_of_missions_to_the_Moon',
                ['Missions by date'],
            ),
            (
                'https://en.wikipedia.org/wiki/'
                'List_of_people_who_have_walked_on_the_Moon',
                ['Moonwalkers'],
            )
        ],
    )
    def test_get_tables_returns_expected_keys(self, url, expected_tables):
        tables = self.wiki.get_tables(url=url)
        assert list(tables.keys()) == expected_tables

    def test_get_tables(self, mocker):
        url = ('https://en.wikipedia.org/wiki/'
               'COVID-19_pandemic_by_country_and_territory')

        tables = self.wiki.get_tables(url=url)

        assert tables
        assert isinstance(tables, dict)
        assert tuple(tables.keys()) == (
            'COVID-19 pandemic by location 20 September 2020',
            'COVID-19 cases and deaths by region, '
            'in absolute figures and per million '
            'inhabitants as of 5 September 2020',
            'First COVID-19 cases by country or territory',
            'States with no confirmed COVID-19 cases',
            'Partially recognized states with no confirmed cases',
            'Dependencies with no confirmed cases',
        )
        assert tables['Dependencies with no confirmed cases'].T.to_dict() == {
            0: {'Rank': 1, 'Territory': 'American Samoa', 'Population': 56700,
                'Status': 'Unincorporated territory', 'Country': 'United States',
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            1: {'Rank': 2, 'Territory': 'Cook Islands', 'Population': 15200,
                'Status': 'Associated state', 'Country': 'New Zealand',
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            2: {'Rank': 3, 'Territory': 'Wallis and Futuna', 'Population': 11700,
                'Status': 'Overseas collectivity', 'Country': 'France',
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            3: {'Rank': 4,
                'Territory': 'Saint Helena, Ascension and Tristan da Cunha',
                'Population': 5633, 'Status': 'Overseas territory',
                'Country': 'United Kingdom', 'Continent': 'Africa',
                'Ref.': mocker.ANY},
            4: {'Rank': 5, 'Territory': 'Svalbard', 'Population': 2667,
                'Status': 'Unincorporated area', 'Country': 'Norway',
                'Continent': 'Europe', 'Ref.': mocker.ANY},
            5: {'Rank': 6, 'Territory': 'Christmas Island', 'Population': 1955,
                'Status': 'External territory', 'Country': 'Australia',
                'Continent': 'Asia', 'Ref.': mocker.ANY},
            6: {'Rank': 7, 'Territory': 'Norfolk Island', 'Population': 1735,
                'Status': 'External territory', 'Country': 'Australia',
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            7: {'Rank': 8, 'Territory': 'Niue', 'Population': 1520,
                'Status': 'Associated state', 'Country': 'New Zealand',
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            8: {'Rank': 9, 'Territory': 'Tokelau', 'Population': 1400,
                'Status': 'Dependent territory', 'Country': 'New Zealand',
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            9: {'Rank': 10, 'Territory': 'Cocos (Keeling) Islands',
                'Population': 555, 'Status': 'External territory',
                'Country': 'Australia', 'Continent': 'Asia', 'Ref.': mocker.ANY},
            10: {'Rank': 11, 'Territory': 'Pitcairn Islands', 'Population': 50,
                 'Status': 'Overseas territory', 'Country': 'United Kingdom',
                 'Continent': 'Oceania', 'Ref.': mocker.ANY},
        }
        assert tables['States with no confirmed COVID-19 cases'].T.to_dict() == {
            0: {'Rank': 1, 'Country': 'North Korea[a]', 'Population': 25778816,
                'Continent': 'Asia', 'Ref.': mocker.ANY},
            1: {'Rank': 2, 'Country': 'Turkmenistan[a]', 'Population': 6031200,
                'Continent': 'Asia', 'Ref.': mocker.ANY},
            2: {'Rank': 3, 'Country': 'Solomon Islands', 'Population': 686884,
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            3: {'Rank': 4, 'Country': 'Vanuatu', 'Population': 307145,
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            4: {'Rank': 5, 'Country': 'Samoa', 'Population': 198413,
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            5: {'Rank': 6, 'Country': 'Kiribati', 'Population': 119451,
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            6: {'Rank': 7, 'Country': 'Micronesia', 'Population': 115030,
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            7: {'Rank': 8, 'Country': 'Tonga', 'Population': 105695,
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            8: {'Rank': 9, 'Country': 'Marshall Islands', 'Population': 59190,
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            9: {'Rank': 10, 'Country': 'Palau', 'Population': 18094,
                'Continent': 'Oceania', 'Ref.': mocker.ANY},
            10: {'Rank': 11, 'Country': 'Tuvalu', 'Population': 11793,
                 'Continent': 'Oceania', 'Ref.': mocker.ANY},
            11: {'Rank': 12, 'Country': 'Nauru', 'Population': 10823,
                 'Continent': 'Oceania', 'Ref.': mocker.ANY},
        }
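# The table tests above exercise WikiApi.get_tables; a minimal standalone
# sketch of the same call, reusing one of the URLs from the parametrized test.
# The tests call .T.to_dict() on the values, which suggests DataFrame-like tables:
from wikiapi import WikiApi

wiki = WikiApi()
tables = wiki.get_tables(url='https://en.wikipedia.org/wiki/World_population')
print(list(tables.keys()))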
from wikiapi import WikiApi

wiki = WikiApi()
wiki = WikiApi({'locale': 'en'})
keywords = []
with open("Important_Names.txt", "r") as f:
    for line in f:
        keywords.append(line)
f.close()
count = 0
for word in keywords:
    count = count + 1
    results = wiki.find(word.strip('\n'))
    if len(results) != 0:
        article = wiki.get_article(results[0])
        text = article.content.encode('utf-8')
        with open("Web" + str(count) + ".txt", "w") as f:
            f.write(text)
        f.close()
        print article.url
class WikiGrabber(object):
    """ Class to grab the wiki articles. """

    def __init__(self, filenames):
        """ Initialize the WikiGrabber class. """
        self.glossary = Glossary(filenames)
        self.wiki = WikiApi({})

    def get_articles(self, dir_name):
        """ Get wiki articles for all the phrases and convert to xml. """
        global visited_results
        step = 1000 + len(visited_results)
        try:
            for phrase, flag in self.glossary.phrases:
                print phrase
                results = self.wiki.find(phrase)
                for result in results:
                    if (result not in visited_results):
                        article = self.wiki.get_article(result)
                        self.article_to_xml(article, flag, dir_name)
                        visited_results.append(result)
                        if (len(visited_results) > step):
                            print phrase, len(visited_results)
                            step = step + 1000
        except:
            print phrase, len(visited_results)

    def article_to_xml(self, article, flag, dir_name):
        """ Create a xml from the article. """
        try:
            docId = 'Wiki_' + datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
            docType = 'Wiki'
            docSource = 'wikipedia'
            docDate = ''
            docTitle = article.heading
            docDesc = clean(article.summary)
            if (len(docDesc.split()) < WORD_LEN_THRESHOLD):
                return
            if (flag and ('security' not in docDesc.lower())):
                return
            document = lb.E.Document(
                lb.E.Title(docTitle),
                lb.E.Date(docDate),
                lb.E.Description(docDesc),
                id=docId, type=docType, src=docSource)
            doc = etree.tostring(document, pretty_print=True)
            xml_filename = dir_name + docId + '.xml'
            writeString(xml_filename, XML_HEAD + doc)
        except Exception as e:
            print e
def getWikiArticle(word, locale):
    wiki = WikiApi({'locale': locale})
    results = wiki.find(word)
    result = next(iter(results or []), None)
    return wiki.get_article(result) if result else None
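# Brief usage sketch for getWikiArticle; the word and locale are example values:
article = getWikiArticle('Bob Marley', 'en')
if article is not None:
    print(article.heading)
    print(article.url)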
def main():
    status = True
    pygame.mixer.music.play(-1)
    music_status = 1

    # Create a wikiapi instance
    wiki_status = 1
    wiki_instance = WikiApi()
    wiki_instance = WikiApi({'locale': 'en'})
    namespace = None
    index1 = 0
    data_list = []

    # Load weather data into lists and dictionaries
    weather_location = 0
    connector = yweather.Client()

    weather_id_ny = connector.fetch_woeid('New York')
    weather_data_ny = connector.fetch_weather(str(weather_id_ny), metric=True)
    data_dict_ny = {}
    data_dict_ny.update({
        'Current Temperature': weather_data_ny["condition"]["temp"],
        'Sunrise': weather_data_ny['astronomy']['sunrise'],
        'Sunset': weather_data_ny['astronomy']['sunset'],
        'Max Temperature': (str(weather_data_ny['forecast'][0]['high']) + " Degrees C"),
        'Min Temperature': (str(weather_data_ny['forecast'][0]['low'] + " Degrees C")),
        'Wind': (str(weather_data_ny['wind']['speed'] + " km/h")),
        'Condition': weather_data_ny['condition']['text']})
    keys_list_ny = data_dict_ny.keys()

    weather_id_buffalo = connector.fetch_woeid('Buffalo')
    weather_data_buffalo = connector.fetch_weather(str(weather_id_buffalo), metric=True)
    data_dict_buffalo = {}
    data_dict_buffalo.update({
        'Current Temperature': weather_data_buffalo["condition"]["temp"],
        'Sunrise': weather_data_buffalo['astronomy']['sunrise'],
        'Sunset': weather_data_buffalo['astronomy']['sunset'],
        'Max Temperature': (str(weather_data_buffalo['forecast'][0]['high']) + " Degrees C"),
        'Min Temperature': (str(weather_data_buffalo['forecast'][0]['low'] + " Degrees C")),
        'Wind': (str(weather_data_buffalo['wind']['speed'] + " km/h")),
        'Condition': weather_data_buffalo['condition']['text']})
    keys_list_buffalo = data_dict_buffalo.keys()

    weather_id_hyd = connector.fetch_woeid('Hyderabad')
    weather_data_hyd = connector.fetch_weather(str(weather_id_hyd), metric=True)
    data_dict_hyd = {}
    data_dict_hyd.update({
        'Current Temperature': weather_data_hyd["condition"]["temp"],
        'Sunrise': weather_data_hyd['astronomy']['sunrise'],
        'Sunset': weather_data_hyd['astronomy']['sunset'],
        'Max Temperature': (str(weather_data_hyd['forecast'][0]['high']) + " Degrees C"),
        'Min Temperature': (str(weather_data_hyd['forecast'][0]['low'] + " Degrees C")),
        'Wind': (str(weather_data_hyd['wind']['speed'] + " km/h")),
        'Condition': weather_data_hyd['condition']['text']})
    keys_list_hyd = data_dict_hyd.keys()

    while status:
        main_display.fill(black)
        pointer_location = pygame.mouse.get_pos()
        pointer_click = pygame.mouse.get_pressed()
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()

        # Music Button
        if 325 < pointer_location[0] < 405 and 20 < pointer_location[1] < 50:
            if pointer_click[0] == 1:
                wiki_status = 1
        if 700 < pointer_location[0] < 780 and 20 < pointer_location[1] < 50:
            if pointer_click[0] == 1:
                music_status = not music_status
                if music_status == 0:
                    pygame.mixer.music.pause()
                else:
                    pygame.mixer.music.unpause()

        # New York Button Check
        if 20 < pointer_location[0] < 80 and 20 < pointer_location[1] < 50:
            if pointer_click[0] == 1:
                weather_location = 2
        # Buffalo Button Check
        if 100 < pointer_location[0] < 160 and 20 < pointer_location[1] < 50:
            if pointer_click[0] == 1:
                weather_location = 1
        # Hyderabad Button Check
        if 180 < pointer_location[0] < 240 and 20 < pointer_location[1] < 50:
            if pointer_click[0] == 1:
                weather_location = 0

        try:
            main_display.blit(weather_image, (0, 0))
        except:
            pass

        # Data Display
        if weather_location == 0:
            data_display(110, data_dict_hyd['Current Temperature'], white, 80, 160)  # Temperature number
            data_display(20, "Deg C", white, 180, 130)  # Degree
            data_display(15, keys_list_hyd[5] + " : " + data_dict_hyd['Condition'], white, 95, 260)  # Condition
            data_display(15, keys_list_hyd[1] + " : " + data_dict_hyd['Min Temperature'], white, 130, 320)
            data_display(15, keys_list_hyd[6] + " : " + data_dict_hyd['Max Temperature'], white, 130, 360)
            data_display(15, keys_list_hyd[4] + " : " + data_dict_hyd['Sunrise'], white, 95, 400)  # Sunrise
            data_display(15, keys_list_hyd[0] + " : " + data_dict_hyd['Sunset'], white, 95, 440)  # Sunset
            data_display(15, keys_list_hyd[3] + " : " + data_dict_hyd['Wind'], white, 95, 480)  # Wind Speed
        elif weather_location == 1:
            data_display(110, data_dict_buffalo['Current Temperature'], white, 80, 160)  # Temperature number
            data_display(20, "Deg C", white, 180, 130)  # Degree
            data_display(15, keys_list_buffalo[5] + " : " + data_dict_buffalo['Condition'], white, 95, 260)
            data_display(15, keys_list_buffalo[1] + " : " + data_dict_buffalo['Min Temperature'], white, 130, 320)
            data_display(15, keys_list_buffalo[6] + " : " + data_dict_buffalo['Max Temperature'], white, 130, 360)
            data_display(15, keys_list_buffalo[4] + " : " + data_dict_buffalo['Sunrise'], white, 95, 400)
            data_display(15, keys_list_buffalo[0] + " : " + data_dict_buffalo['Sunset'], white, 95, 440)
            data_display(15, keys_list_buffalo[3] + " : " + data_dict_buffalo['Wind'], white, 95, 480)
        elif weather_location == 2:
            data_display(110, data_dict_ny['Current Temperature'], white, 80, 160)  # Temperature number
            data_display(20, "Deg C", white, 180, 130)  # Degree
            data_display(15, keys_list_ny[5] + " : " + data_dict_ny['Condition'], white, 95, 260)  # Condition
            data_display(15, keys_list_ny[1] + " : " + data_dict_ny['Min Temperature'], white, 130, 320)
            data_display(15, keys_list_ny[6] + " : " + data_dict_ny['Max Temperature'], white, 130, 360)
            data_display(15, keys_list_ny[4] + " : " + data_dict_ny['Sunrise'], white, 95, 400)  # Sunrise
            data_display(15, keys_list_ny[0] + " : " + data_dict_ny['Sunset'], white, 95, 440)  # Sunset
            data_display(15, keys_list_ny[3] + " : " + data_dict_ny['Wind'], white, 95, 480)  # Wind Speed

        # Display Wiki Article
        if wiki_status == 1:
            del data_list[:]
            wiki_status = 0
            blahblah = True
            try:
                url = 'http://en.wikipedia.org/wiki/Special:Random'
                if namespace != None:
                    url += '/' + namespace
                req = urllib2.Request(url, None, {'User-Agent': 'x'})
                page = urllib2.urlopen(req).readlines()
                wiki_draft1 = remove_tags(page[4])
                wiki_title = wiki_draft1[:wiki_draft1.index('Wikipedia') - 2]
                wiki_data_list = wiki_instance.find(wiki_title)
                wiki_data = wiki_instance.get_article(wiki_data_list[0])
                temp = endlinefunction(wiki_data.summary, data_list, 90)
            except (urllib2.HTTPError, urllib2.URLError):
                print "Failed to get article"
                raise

        # Buttons and Division Display
        pygame.draw.rect(main_display, white, (300, 0, 5, 600))
        pygame.draw.rect(main_display, white, (300, 70, 500, 5))
        drawbutton(wood, 700, 20, 80, 30, 10, "Toggle Music", black)
        drawbutton(white, 20, 20, 60, 30, 10, "New York", black)
        drawbutton(white, 100, 20, 60, 30, 10, "Buffalo", black)
        drawbutton(white, 180, 20, 60, 30, 10, "Hyderabad", black)
        drawbutton(wood, 325, 20, 80, 30, 10, "Next Article", black)

        # Cursor Display
        data_display(15, wiki_data.heading, wood, 540, 130)
        y_cood = 150
        j = 25
        for i in range(0, len(data_list)):
            y_cood = y_cood + j
            data_display(10, data_list[i], black, 540, y_cood)

        clock.tick(100)
        pygame.display.flip()
class Scraper:
    prohibited_headers = set(['Contents', 'See also', 'References'])

    # The scraper uses the classifier to only send out articles that are more
    # likely to be music related
    def __init__(self):
        self.classifier = classifier.Classifier()
        self.wiki = WikiApi()
        self.bad_urls = set([p['url'] for p in self.classifier.non_accepted_pages])

    # The stream method is used for scraping a large number of maximum links.
    # This method does not implement the classifier filtering because its main
    # purpose is for building the database of pages for manual classification
    def stream(self, start_term, maxLinks):
        finished, queue, search_results = self.scrape_common(start_term)
        for i in range(maxLinks):
            if queue.empty():
                break
            current_url = queue.get()
            while current_url in finished:
                current_url = queue.get()
            (page, urls) = self.process_page(current_url)
            finished.add(current_url)
            for u in urls:
                queue.put(u)
            yield page

    # The scrape method is used for a smaller number of maximum links. It performs
    # a breadth first search given an initial term. It uses a queue to keep track
    # of the pages to be scraped and a set of the already scraped to prevent
    # duplicates
    def scrape(self, start_term, maxLinks):
        finished, queue, search_results = self.scrape_common(start_term)
        pages = []
        while len(pages) < maxLinks:
            if queue.empty():
                break
            current_url = queue.get()
            while current_url in finished:
                current_url = queue.get()
            (page, urls) = self.process_page(current_url)
            finished.add(current_url)
            # Only if the classifier predicts it as a good page, a page will
            # be added to the pages list which is returned at the end
            if self.classifier.classify(page) == 1 and page.url not in self.bad_urls:
                pages.append(page)
                print page.name
            for u in urls:
                queue.put(u)
        return pages

    # Common code for both methods that crawl wikipedia
    def scrape_common(self, start_term):
        finished = set()
        queue = Queue()
        search_results = self.wiki.find(start_term)
        if not search_results:
            print 'No pages found. Try a different term'
        else:
            queue.put('https://en.wikipedia.org/wiki/' + search_results[0])
        return finished, queue, search_results

    # Process a page's HTML using BeautifulSoup to extract useful information
    def process_page(self, url):
        html = self.wiki.get(url)
        soup = BeautifulSoup(html)
        body_html = soup.find(id='mw-content-text')
        title_tag = soup.find(id='firstHeading')
        if title_tag.string == None:
            contents = title_tag.contents
            string_contents = []
            for c in contents:
                if type(c) != str:
                    string_contents.append(c.string)
                else:
                    string_contents.append(c)
            title = ''.join(string_contents)
        else:
            title = title_tag.string
        urls, links_text, media_link_count = self.find_urls(body_html)
        (clean_text, headers) = self.clean_html(body_html)
        page = Page(url, title, clean_text, headers, links_text, media_link_count)
        return (page, urls)

    # Find all URLs in a given HTML that redirect to another article in Wikipedia.
    # Page links and media links (pictures, audio) are stored in different lists
    # but are both used.
    def find_urls(self, html):
        link_urls = []
        good_link = re.compile('/wiki/')
        bad_link = re.compile('.*:.*|.*\..*|.*\(disambiguation\)')
        media_link = re.compile('.*\.jpg|.*\.ogg')
        media_link_count = 0
        media_found = set()
        links_text = dd(int)
        all_links = html.find_all('a')
        for l in all_links:
            link = l.get('href')
            content = self.extract_content([l])[0]
            if good_link.match(link) and not bad_link.match(link):
                link_urls.append('https://en.wikipedia.org' + link)
                if str(content) != '':
                    links_text[content] = links_text[content] + 1
            elif media_link.match(link):
                if link not in media_found:
                    media_link_count += 1
                    media_found.add(link)
                if str(content) != '':
                    links_text[content] = links_text[content] + 1
        return (link_urls, links_text, media_link_count)

    # Function to extract the body and the headers of an article
    def clean_html(self, html):
        paragraphs = html.find_all('p')
        headers = html.find_all(re.compile('h\d'))
        clean_text = ''.join(self.extract_content(paragraphs))
        headers_list = self.clean_headers(headers)
        return (clean_text, headers_list)

    # Clean the list of headers of the prohibited, common headers
    def clean_headers(self, array):
        raw_headers = self.extract_content(array)
        final_headers = []
        for h in raw_headers:
            if h not in Scraper.prohibited_headers:
                final_headers.append(h)
        return final_headers

    # Function to clean the HTML body of a page. It removes common links that
    # would cause noise in our system such as [edit] buttons and reference
    # numbers, e.g. [2].
    def extract_content(self, array):
        for i in range(len(array)):
            array[i] = re.sub(r'<[^>]*>', '', str(array[i]))
            array[i] = re.sub(r'\[edit\]', '', str(array[i]))
            array[i] = re.sub(r'\[\d*\]', '', str(array[i]))
            array[i] = re.sub(r'\^', '', str(array[i]))
        return array
with open('./questions.txt') as f:
    questions = [line[:-1] for line in f]

model = AnswerFinder(config=c, restore=True, mode="inference")

print('\n\n\n\n\n\n\n')
print(
    '''Hello! This is Alpha version of program for reading wikipedia
to answer the question. Program was writing basing on paper
https://arxiv.org/pdf/1704.00051.pdf
For more detail [email protected]\n''')

c.inf_threshold = 0.7

while True:
    while True:
        print('What or who do you want to ask about? Example: Barak Obama')
        thing = input()
        results = wiki.find(thing)
        if len(results) > 0:
            print('Ok. I found few wiki pages about {}.'.format(thing))
            break
        else:
            print(
                'Can\'t find any wiki pages about {}. Try another one.'.format(
                    thing))

    article = wiki.get_article(results[0])
    context = article.content

    for question in questions:
        os.system('clear')
        print('Q: {}'.format(question))
        print('Search answers ...')
        answers, probs = tools.get_answer(context, question, model, c)
# -*- coding: utf-8 -*-
from wikiapi import WikiApi
import unittest

wiki = WikiApi({})
results = wiki.find('Bill Clinton')
article = wiki.get_article(results[0])  # taking first search result


class TestWiki(unittest.TestCase):
    def test_heading(self):
        self.assertIsNotNone(article.heading)

    def test_image(self):
        self.assertTrue(isinstance(article.image, str))

    def test_summary(self):
        self.assertGreater(len(article.summary), 100)

    def test_content(self):
        self.assertGreater(len(article.content), 200)

    def test_references(self):
        self.assertTrue(isinstance(article.references, list))

    def test_url(self):
        self.assertTrue(article.url, u"http://en.wikipedia.org/wiki/Bill_Clinton")

    def test_get_relevant_article(self):
        keywords = ['president', 'hilary']
        if i not in stpw:
            keyw.append(i)
    return keyw

# ************************************
# ************************************
wiki = WikiApi({})
dic_cont = {}  # dictionary
mlist = []  # word base
# ************************************
for wtopic in file1.readlines():
    w = wtopic.split()
    mlist.append(w[0])
    results = wiki.find(w[0])
    if results:
        article = wiki.get_article(results[0])
        r = article.content
        rtoken = wordpunct_tokenize(r)
        # remove stopwords
        stopwords = nltk.corpus.stopwords.words('english')
        content = [wip for wip in rtoken if wip.lower() not in stopwords]
        # keep only tokens that contain at least one alphabetic character
        fcontent = [wip for wip in content if re.sub(r'[^A-Za-z]', "", wip)]
        gcontent = []
        for i in fcontent:
            gcontent.append(i.encode('utf-8').lower().strip())