def test_cache_not_populated_when_disabled(self):
    wiki = WikiApi({'cache': False})

    assert self._get_cache_size(wiki) == 0
    wiki.find('Bob Marley')
    assert self._get_cache_size(wiki) == 0

    shutil.rmtree(wiki.cache_dir, ignore_errors=True)
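The cache tests in this listing call a _get_cache_size helper that is not shown. A minimal sketch, assuming WikiApi writes one file per cached request into wiki.cache_dir (the helper name and on-disk layout are assumptions, not a documented part of the library):

import os

def _get_cache_size(self, wiki_instance):
    """Return how many files currently sit in the instance's cache directory (assumed layout)."""
    if not os.path.isdir(wiki_instance.cache_dir):
        return 0
    return len(os.listdir(wiki_instance.cache_dir))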
def runSearchInput(self):
    searchFor = self.getPluginParamValue("SearchFor")
    locale = self.getPluginParamValue("Locale")
    limitResultsTo = self.getPluginParamValueAsInt("LimitResultsTo")
    includeContent = self.getPluginParamValueAsTrueOrFalse("IncludeContent")
    includeHeading = self.getPluginParamValueAsTrueOrFalse("IncludeHeading")
    includeSummary = self.getPluginParamValueAsTrueOrFalse("IncludeSummary")
    includeURL = self.getPluginParamValueAsTrueOrFalse("IncludeURL")
    wiki = WikiApi({"locale": locale})
    content = ""
    cnt = 0
    for result in wiki.find(searchFor):
        article = wiki.get_article(result)
        if includeHeading:
            content = "{0}\n{1}".format(content, article.heading)
        if includeURL:
            content = "{0}\n{1}".format(content, article.url)
        if includeSummary:
            content = "{0}\n{1}".format(content, article.summary)
        if includeContent:
            content = "{0}\n{1}".format(content, article.content)
        content = "{0}\n\n".format(content)
        cnt += 1
        if cnt >= limitResultsTo:
            break
    content = content.strip()
    self.setInputContent(content)
    return content
def set_up(self):
    # using an Italian-Emilian locale that is full of unicode symbols
    wiki = WikiApi({'locale': 'eml'})
    result = wiki.find('Bulaggna')[0]
    return {
        'wiki': wiki,
        'result': result,
    }
def set_up(self):
    wiki = WikiApi()
    results = wiki.find('Bill Clinton')
    article = wiki.get_article(results[0])
    return {
        'wiki': wiki,
        'results': results,
        'article': article,
    }
def get_url(query, log_file):
    wiki = WikiApi()
    results = wiki.find(query)
    if len(results) == 0:
        sys.stderr.write("No wikipedia article found for '" + query + "'\n")
    else:
        article = wiki.get_article(results[0])
        print article.url
        with open(log_file, 'a') as f:
            f.write(article.url + "\n")
def wikiqueryresults(searchQuery):
    wiki = WikiApi({})
    wiki = WikiApi({'locale': 'en'})  # to specify your locale, 'en' is default
    wikiSearch = wiki.find(searchQuery)
    wikiArticle = wiki.get_article(wikiSearch[0])
    return wikiArticle.summary
def wiki_api(options):
    wiki = WikiApi()
    wiki = WikiApi({'locale': 'en'})  # to specify your locale, 'en' is default
    results = wiki.find(options['q'])
    for result in results:
        article = wiki.get_article(result)  # fetch each individual search result
        title = article.heading
        url = article.url
        print(url)
        link = Link(topic=options['topic'], title=title, url=url)
        link.save()
def __init__(self, add_gloss_list, del_gloss_list, category, label):
    """ Initialize the class. """
    self.add_phrases = get_phrases(add_gloss_list)
    self.del_phrases = get_phrases(del_gloss_list)
    self.category = category
    self.corpus_dir = CORPUS_DIR + '/' + label + '/wikipedia/' + category
    self.raw_dir = RAW_DATA_DIR + '/' + label + '/wikipedia/' + category
    self.wiki = WikiApi({})
    self.visited_results = self.get_results(self.del_phrases)
    self.count = 0
def test_cache_populated(self):
    wiki = WikiApi({'cache': True, 'cache_dir': '/tmp/wikiapi-test'})

    assert self._get_cache_size(wiki) == 0
    # Make multiple calls to ensure no duplicate cache items created
    assert wiki.find('Bob Marley') == wiki.find('Bob Marley')
    assert self._get_cache_size(wiki) == 1

    # Check cache keys are unique
    assert wiki.find('Tom Hanks') != wiki.find('Bob Marley')
    assert self._get_cache_size(wiki) == 2

    shutil.rmtree(wiki.cache_dir, ignore_errors=True)
def get_wikipedia_details(keyword):
    wiki = WikiApi()
    results = wiki.find(keyword)
    if len(results) > 0:
        article = wiki.get_article(results[0])
        if not 'Disambig' in article.image:
            return {
                'heading': article.heading,
                'image': article.image,
                'summary': article.summary,
                'url': article.url
            }
    return None
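A hypothetical call to the helper above; the keyword is only an example, and the function returns None when no suitable (non-disambiguation) article is found:

details = get_wikipedia_details('Bob Marley')
if details is not None:
    print(details['heading'])
    print(details['url'])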
class TestUnicode(unittest.TestCase):

    def setUp(self):
        # using an Italian-Emilian locale that is full of unicode symbols
        self.wiki = WikiApi({'locale': 'eml'})
        self.res = self.wiki.find('Bulaggna')[0]
        self.article = None

    def test_search(self):
        # this is urlencoded.
        self.assertEqual(self.res, u'Bul%C3%A5ggna')

    def test_article(self):
        # unicode errors will likely blow in your face here
        self.assertIsNotNone(self.wiki.get_article(self.res))
class TestUnicode:

    @pytest.fixture(autouse=True)
    def set_up(self):
        # using an Italian-Emilian locale that is full of unicode symbols
        self.wiki = WikiApi({'locale': 'eml'})
        self.result = self.wiki.find('Bulaggna')[0]

    def test_search(self):
        # this is urlencoded.
        assert self.result == 'Bul%C3%A5ggna'

    def test_article(self):
        # unicode errors will likely blow in your face here
        assert self.wiki.get_article(self.result) is not None
class TestUnicode(unittest.TestCase):

    def setUp(self):
        # using an Italian-Emilian locale that is full of unicode symbols
        self.wiki = WikiApi({'locale': 'eml'})
        self.res = self.wiki.find('Bulagna')[0]
        self.article = None

    def test_search(self):
        # this is urlencoded.
        self.assertEqual(self.res, u'Bul%C3%A5ggna')

    def test_article(self):
        # unicode errors will likely blow in your face here
        self.assertIsNotNone(self.wiki.get_article(self.res))
def wiki(tokens, message):
    print("\033[1;34;1m")
    print("\nHazel : Please wait while I surf the web for a result")
    try:
        wiki = WikiApi()
        WikiApi({'locale': 'en'})
        if "search" in tokens:
            tokens.remove("search")  # remove search keyword to retrieve the main content to be searched
        if "what" in tokens:
            tokens.remove("what")
        if "who" in tokens:
            tokens.remove("who")
        if "look" in tokens:
            tokens.remove("look")
        if "tell" in tokens:
            tokens.remove("tell")
        if "more" in tokens:
            tokens.remove("more")
        if "about" in tokens:
            tokens.remove("about")
        stop_words = set(stopwords.words('english'))  # Remove stop words
        filtered_sentence = [w for w in tokens if not w in stop_words]
        filtered_sentence = []
        for w in tokens:
            # Filtering input by removing stopwords such as 'I', 'for', 'is', etc.
            if w not in stop_words:
                filtered_sentence.append(w)  # Get and store message without stopwords
        filtered_list = filtered_sentence
        filtered_sentence = ' '.join(filtered_list)  # Making a sentence out of the tokens
        message = filtered_sentence  # storing input in message
        tokens = word_tokenize(message)  # tokenize new message
        s = ""  # holds the remaining tokens to be searched for
        for i in tokens:
            s = s + i + " "  # appending the tokens to form a search keyword
        results = wiki.find(s)  # package function to do the online search
        # print("websearch\n")
        if results == "":
            results = "null"
        print("\nFound result for : ", results[0])  # print the first search result
        print("\033[1;37;1m")  # set console color
        print(wi.summary(s))
        # main()
    except Exception as e:
        print("I didnt get that. You may want to try that again")
def setup():
    """ Sets up global wiki object for Wikipedia lookups. """
    global wiki, imdb
    wiki = WikiApi()
    imdb = Imdb(anonymize=True)
def wiki_search(query):
    wiki = WikiApi()
    wikiurls = []
    lst = query.split(",")
    num = 10 / len(lst)
    # print num
    for i in lst:
        results = wiki.find(i)
        cnt = 0
        for j in results:
            cnt = cnt + 1
            article = wiki.get_article(j)
            wikiurls.append(article.url)
            if cnt == num:
                break
    return wikiurls
def get_full_name_from_wiki(name):
    wiki = WikiApi()
    results = wiki.find(name)
    if len(results) > 0:
        article = wiki.get_article(results[0])
        new_name = article.summary
        new_name = new_name[:new_name.find('(') - 1]
        if new_name.find(' refer ') != -1:
            if len(results) > 1:
                article = wiki.get_article(results[1])
                new_name = article.summary
                new_name = new_name[:new_name.find('(') - 1]
            else:
                return None
        table = str.maketrans({key: None for key in string.punctuation + '\r\n'})
        new_name = new_name.translate(table)
        if len(new_name) > 4 and len(new_name) < 50:
            return new_name
        else:
            return None
    else:
        return None
def get_security_results(filenames):
    """ Pre-fill visited with security term results. """
    global visited_results
    wiki = WikiApi({})
    phrases = []
    for filename in filenames:
        lines = readLines(filename)
        for line in lines:
            line = line.strip()
            if ((len(line) > 0) and (line[0] != '#')):
                if (line[0] == '/'):
                    phrases.append(line.split(' ', 1)[1])
                else:
                    phrases.append(line)
    for phrase in phrases:
        results = wiki.find(phrase)
        for result in results:
            if (result not in visited_results):
                visited_results.append(result)
def setUp(self): """Set up all of the requirements for testing """ self.pos_lex = naivebayes.generate('sentiment/pos.txt', naivebayes.lexicon()) self.neg_lex = naivebayes.generate('sentiment/neg.txt', naivebayes.lexicon()) self.wiki = WikiApi() self.r = praw.Reddit(client_id='l-Gz5blkt7GCUg', client_secret='_xLEgNing89k6__sWItU1_j9aR8', user_agent='testscript by /u/pbexe') self.test_sentence = 'The cat sat on the mat. The dog however, did not!' self.test_sentence_tokenized = [[('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN'), ('.', '.')], [('The', 'DT'), ('dog', 'NN'), ('however', 'RB'), (',', ','), ('did', 'VBD'), ('not', 'RB'), ('!', '.')]] self.test_sentence_with_entities = 'Dr Foster went to Glouster' self.test_sentence_with_entities_nodes = ['Dr Foster', 'Glouster'] self.story = Story(source='http://example.com/', content='This is a title') self.story.save() self.node1 = Node(name='Key word 1', date=timezone.now(), collectedFrom=self.story) self.node1.save() self.node2 = Node(name='Key word 2', date=timezone.now(), collectedFrom=self.story) self.node2.save() self.node3 = Node(name='Key word 3', date=timezone.now(), collectedFrom=self.story) self.node3.save()
def get_wiki_phrases(word):
    wiki = WikiApi()
    wiki = WikiApi({'locale': 'en'})
    results = wiki.find(word)
    print results
    phrase = ""
    for i in range(min(4, len(results))):
        article = wiki.get_article(results[i])
        # print article.content
        phrase = phrase + " " + article.content
    # print phrase

    rake_object = rake.Rake("SmartStoplist.txt", 4, 3, 10)
    # Now, we have a RAKE object that extracts keywords where:
    #   Each word has at least 4 characters
    #   Each phrase has at most 3 words
    #   Each keyword appears in the text at least 4 times
    keywords = rake_object.run(phrase)
    return keywords[0:20]
class WikiGrabber(object):
    """ Class to grab the wiki articles. """

    def __init__(self, filenames):
        """ Initialize the WikiGrabber class. """
        self.glossary = Glossary(filenames)
        self.wiki = WikiApi({})

    def get_articles(self, dir_name):
        """ Get wiki articles for all the phrases and convert to xml. """
        global visited_results
        step = 1000 + len(visited_results)
        try:
            for phrase, flag in self.glossary.phrases:
                print phrase
                results = self.wiki.find(phrase)
                for result in results:
                    if (result not in visited_results):
                        article = self.wiki.get_article(result)
                        self.article_to_xml(article, flag, dir_name)
                        visited_results.append(result)
                        if (len(visited_results) > step):
                            print phrase, len(visited_results)
                            step = step + 1000
        except:
            print phrase, len(visited_results)

    def article_to_xml(self, article, flag, dir_name):
        """ Create a xml from the article. """
        try:
            docId = 'Wiki_' + datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
            docType = 'Wiki'
            docSource = 'wikipedia'
            docDate = ''
            docTitle = article.heading
            docDesc = clean(article.summary)
            if (len(docDesc.split()) < WORD_LEN_THRESHOLD):
                return
            if (flag and ('security' not in docDesc.lower())):
                return
            document = lb.E.Document(
                lb.E.Title(docTitle),
                lb.E.Date(docDate),
                lb.E.Description(docDesc),
                id=docId, type=docType, src=docSource)
            doc = etree.tostring(document, pretty_print=True)
            xml_filename = dir_name + docId + '.xml'
            writeString(xml_filename, XML_HEAD + doc)
        except Exception as e:
            print e
class Wikipedia_Scanner(object):
    """ Class to scan wikipedia articles. """

    def __init__(self, add_gloss_list, del_gloss_list, category, label):
        """ Initialize the class. """
        self.add_phrases = get_phrases(add_gloss_list)
        self.del_phrases = get_phrases(del_gloss_list)
        self.category = category
        self.corpus_dir = CORPUS_DIR + '/' + label + '/wikipedia/' + category
        self.raw_dir = RAW_DATA_DIR + '/' + label + '/wikipedia/' + category
        self.wiki = WikiApi({})
        self.visited_results = self.get_results(self.del_phrases)
        self.count = 0

    def get_results(self, phrases):
        """ Return dictionary of wiki results corresponding to phrases. """
        visited_results = {}
        for phrase in phrases:
            results = self.wiki.find(phrase)
            for result in results:
                if (not visited_results.has_key(result)):
                    visited_results[result] = True
        return visited_results

    def get_articles(self):
        """ Fetches articles and puts them in the data directory. """
        for phrase in self.add_phrases:
            try:
                results = self.wiki.find(phrase)
                for result in results:
                    if (not self.visited_results.has_key(result)):
                        self.visited_results[result] = True
                        article = self.wiki.get_article(result)
                        entry_src = 'wikipedia_' + self.category
                        entry_type = 'article'
                        entry_id = 'wikipedia_' + result.replace(' ', '_').replace('/', '_')
                        entry_title = article.heading
                        entry_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
                        entry_desc = clean(article.summary)
                        if (''.join(entry_desc.split()) != ''):
                            xml_string = bundle_xml(entry_src, entry_type, entry_id,
                                                    entry_title, entry_date, entry_desc)
                            write_string(self.corpus_dir + '/' + entry_id.lower() + '.xml',
                                         xml_string, False)
                            write_string(self.raw_dir + '/' + entry_id.lower() + '.txt',
                                         entry_desc, False)
                            self.count = self.count + 1
                            if (self.count % 100 == 0):
                                print 'Scanned ' + str(self.count) + ' wiki articles.'
            except Exception as e:
                print 'Wiki Api Error! [' + str(e) + ']'
from wikiapi import WikiApi

wiki = WikiApi()
wiki = WikiApi({'locale': 'en'})
keywords = []
with open("Important_Names.txt", "r") as f:
    for line in f:
        keywords.append(line)
f.close()
count = 0
for word in keywords:
    count = count + 1
    results = wiki.find(word.strip('\n'))
    if len(results) != 0:
        article = wiki.get_article(results[0])
        text = article.content.encode('utf-8')
        with open("Web" + str(count) + ".txt", "w") as f:
            f.write(text)
        f.close()
        print article.url
def __init__(self):
    self.classifier = classifier.Classifier()
    self.wiki = WikiApi()
    self.bad_urls = set(
        [p['url'] for p in self.classifier.non_accepted_pages])
def setUp(self):
    # using an Italian-Emilian locale that is full of unicode symbols
    self.wiki = WikiApi({'locale': 'eml'})
    self.res = self.wiki.find('Bulagna')[0]
    self.article = None
from wikiapi import WikiApi

wiki = WikiApi()
wiki = WikiApi({'locale': 'es'})  # to specify your locale, 'en' is default
wiki.options
results = wiki.find('hereditary myopathies')
print()
from bs4 import BeautifulSoup
import os
import time
import matplotlib
matplotlib.use('Agg')
from wikiapi import WikiApi
wiki = WikiApi()
import tools
from answer_finder import AnswerFinder
from config import config as c

with open('./questions.txt') as f:
    questions = [line[:-1] for line in f]

model = AnswerFinder(config=c, restore=True, mode="inference")

print('\n\n\n\n\n\n\n')
print('''Hello! This is Alpha version of program for reading wikipedia to answer the question.
Program was writing basing on paper https://arxiv.org/pdf/1704.00051.pdf
For more detail [email protected]\n''')

c.inf_threshold = 0.7

while True:
    while True:
        print('What or who do you want to ask about? Example: Barak Obama')
        thing = input()
        results = wiki.find(thing)
        if len(results) > 0:
            print('Ok. I found few wiki pages about {}.'.format(thing))
            break
def wiki_search(self, text):
    wiki = WikiApi()
    results = wiki.find(text)
    article = wiki.get_article(results[0])
    return article
def __init__(self, filenames):
    """ Initialize the WikiGrabber class. """
    self.glossary = Glossary(filenames)
    self.wiki = WikiApi({})
from wikiapi import WikiApi

sub = "sachin tendulkar"
wiki = WikiApi()
wiki = WikiApi({'locale': 'en'})
page = wiki.get_article(sub)
print(page.content)
#!/usr/bin/env python
#_*_coding:utf8_*_
import os, json, re, codecs, sys, argparse, collections
from pprint import pprint
from wikiapi import WikiApi
# from nltk.corpus import stopwords
from math import sqrt

json_data = {}
mxspath = os.environ.get('MXS_PATH')
n = 0
list_path = []
wiki = WikiApi()
wiki = WikiApi({'locale': 'fr'})


def cut_word(content):
    text = re.sub("[^a-zA-Z]", " ", content)
    words = text.lower().split()
    # stops = set(stopwords.words('french'))
    tags = [w for w in words]
    return (tags)


def merge_tag(tag1=None, tag2=None):
    v1 = []
    v2 = []
    tag_dict1 = collections.Counter(tag1)
    tag_dict2 = collections.Counter(tag2)
    merged_tag = set()
class Scraper:
    prohibited_headers = set(['Contents', 'See also', 'References'])

    # The scraper uses the classifier to only send out articles that are more
    # likely to be music related
    def __init__(self):
        self.classifier = classifier.Classifier()
        self.wiki = WikiApi()
        self.bad_urls = set(
            [p['url'] for p in self.classifier.non_accepted_pages])

    # The stream method is used for scraping a large number of maximum links.
    # This method does not implement the classifier filtering because its main
    # purpose is for building the database of pages for manual classification
    def stream(self, start_term, maxLinks):
        finished, queue, search_results = self.scrape_common(start_term)
        for i in range(maxLinks):
            if queue.empty():
                break
            current_url = queue.get()
            while current_url in finished:
                current_url = queue.get()
            (page, urls) = self.process_page(current_url)
            finished.add(current_url)
            for u in urls:
                queue.put(u)
            yield page

    # The scrape method is used for a smaller number of maximum links. It performs
    # a breadth first search given an initial term. It uses a queue to keep track
    # of the pages to be scraped and a set of the already scraped to prevent
    # duplicates
    def scrape(self, start_term, maxLinks):
        finished, queue, search_results = self.scrape_common(start_term)
        pages = []
        while len(pages) < maxLinks:
            if queue.empty():
                break
            current_url = queue.get()
            while current_url in finished:
                current_url = queue.get()
            (page, urls) = self.process_page(current_url)
            finished.add(current_url)
            # Only if the classifier predicts it as a good page, a page will
            # be added to the pages list which is returned at the end
            if self.classifier.classify(page) == 1 and page.url not in self.bad_urls:
                pages.append(page)
                print page.name
            for u in urls:
                queue.put(u)
        return pages

    # Common code for both methods that crawl wikipedia
    def scrape_common(self, start_term):
        finished = set()
        queue = Queue()
        search_results = self.wiki.find(start_term)
        if not search_results:
            print 'No pages found. Try a different term'
        else:
            queue.put('https://en.wikipedia.org/wiki/' + search_results[0])
        return finished, queue, search_results

    # Process a page's HTML using BeautifulSoup to extract useful information
    def process_page(self, url):
        html = self.wiki.get(url)
        soup = BeautifulSoup(html)
        body_html = soup.find(id='mw-content-text')
        title_tag = soup.find(id='firstHeading')
        if title_tag.string == None:
            contents = title_tag.contents
            string_contents = []
            for c in contents:
                if type(c) != str:
                    string_contents.append(c.string)
                else:
                    string_contents.append(c)
            title = ''.join(string_contents)
        else:
            title = title_tag.string
        urls, links_text, media_link_count = self.find_urls(body_html)
        (clean_text, headers) = self.clean_html(body_html)
        page = Page(url, title, clean_text, headers, links_text, media_link_count)
        return (page, urls)

    # Find all URLs in a given HTML that redirect to another article in Wikipedia
    # Page links and media links (pictures, audio) are stored in different lists
    # but are both used.
    def find_urls(self, html):
        link_urls = []
        good_link = re.compile('/wiki/')
        bad_link = re.compile('.*:.*|.*\..*|.*\(disambiguation\)')
        media_link = re.compile('.*\.jpg|.*\.ogg')
        media_link_count = 0
        media_found = set()
        links_text = dd(int)
        all_links = html.find_all('a')
        for l in all_links:
            link = l.get('href')
            content = self.extract_content([l])[0]
            if good_link.match(link) and not bad_link.match(link):
                link_urls.append('https://en.wikipedia.org' + link)
                if str(content) != '':
                    links_text[content] = links_text[content] + 1
            elif media_link.match(link):
                if link not in media_found:
                    media_link_count += 1
                    media_found.add(link)
                if str(content) != '':
                    links_text[content] = links_text[content] + 1
        return (link_urls, links_text, media_link_count)

    # Function to extract the body and the headers of an article
    def clean_html(self, html):
        paragraphs = html.find_all('p')
        headers = html.find_all(re.compile('h\d'))
        clean_text = ''.join(self.extract_content(paragraphs))
        headers_list = self.clean_headers(headers)
        return (clean_text, headers_list)

    # Clean the list of headers of the prohibited, common headers
    def clean_headers(self, array):
        raw_headers = self.extract_content(array)
        final_headers = []
        for h in raw_headers:
            if h not in Scraper.prohibited_headers:
                final_headers.append(h)
        return final_headers

    # Function to clean the HTML body of a page. It removes common links that
    # would cause noise in our system such as [edit] buttons and reference numbers
    # e.g. [2].
    def extract_content(self, array):
        for i in range(len(array)):
            array[i] = re.sub(r'<[^>]*>', '', str(array[i]))
            array[i] = re.sub(r'\[edit\]', '', str(array[i]))
            array[i] = re.sub(r'\[\d*\]', '', str(array[i]))
            array[i] = re.sub(r'\^', '', str(array[i]))
        return array
# -*- coding: utf-8 -*-
from wikiapi import WikiApi
import unittest

wiki = WikiApi({})
results = wiki.find('Bill Clinton')
article = wiki.get_article(results[0])  # taking first search result


class TestWiki(unittest.TestCase):

    def test_heading(self):
        self.assertIsNotNone(article.heading)

    def test_image(self):
        self.assertTrue(isinstance(article.image, str))

    def test_summary(self):
        self.assertGreater(len(article.summary), 100)

    def test_content(self):
        self.assertGreater(len(article.content), 200)

    def test_references(self):
        self.assertTrue(isinstance(article.references, list))

    def test_url(self):
        self.assertTrue(article.url,
                        u"http://en.wikipedia.org/wiki/Bill_Clinton")

    def test_get_relevant_article(self):
        keywords = ['president', 'hilary']
def setUp(self):
    # using an Italian-Emilian locale that is full of unicode symbols
    self.wiki = WikiApi({'locale': 'eml'})
    self.res = self.wiki.find('Bulaggna')[0]
    self.article = None
from bs4 import BeautifulSoup
import urllib2
from wikiapi import WikiApi

wiki = WikiApi()
wiki = WikiApi({'locale': 'en'})

# b = wiki.get_article('High Crusade')
# print(b.url)
# results = wiki.find('Barack Obama').content
# print(results)


def get_title_from_search(string):
    return wiki.find(string)[0]


def get_url_from_search(string):
    try:
        article_title = wiki.find(string)[0]
    except IndexError:
        return False
    article_contents = wiki.get_article(article_title)
    return article_contents.url

# print(get_url_from_search('Stranger in a Strange Land'))
def main():
    status = True
    pygame.mixer.music.play(-1)
    music_status = 1

    # Create a wikiapi instance
    wiki_status = 1
    wiki_instance = WikiApi()
    wiki_instance = WikiApi({'locale': 'en'})
    namespace = None
    index1 = 0
    data_list = []

    # Load weather data into lists and dictionaries
    weather_location = 0
    connector = yweather.Client()

    weather_id_ny = connector.fetch_woeid('New York')
    weather_data_ny = connector.fetch_weather(str(weather_id_ny), metric=True)
    data_dict_ny = {}
    data_dict_ny.update({
        'Current Temperature': weather_data_ny["condition"]["temp"],
        'Sunrise': weather_data_ny['astronomy']['sunrise'],
        'Sunset': weather_data_ny['astronomy']['sunset'],
        'Max Temperature': (str(weather_data_ny['forecast'][0]['high']) + " Degrees C"),
        'Min Temperature': (str(weather_data_ny['forecast'][0]['low'] + " Degrees C")),
        'Wind': (str(weather_data_ny['wind']['speed'] + " km/h")),
        'Condition': weather_data_ny['condition']['text']})
    keys_list_ny = data_dict_ny.keys()

    weather_id_buffalo = connector.fetch_woeid('Buffalo')
    weather_data_buffalo = connector.fetch_weather(str(weather_id_buffalo), metric=True)
    data_dict_buffalo = {}
    data_dict_buffalo.update({
        'Current Temperature': weather_data_buffalo["condition"]["temp"],
        'Sunrise': weather_data_buffalo['astronomy']['sunrise'],
        'Sunset': weather_data_buffalo['astronomy']['sunset'],
        'Max Temperature': (str(weather_data_buffalo['forecast'][0]['high']) + " Degrees C"),
        'Min Temperature': (str(weather_data_buffalo['forecast'][0]['low'] + " Degrees C")),
        'Wind': (str(weather_data_buffalo['wind']['speed'] + " km/h")),
        'Condition': weather_data_buffalo['condition']['text']})
    keys_list_buffalo = data_dict_buffalo.keys()

    weather_id_hyd = connector.fetch_woeid('Hyderabad')
    weather_data_hyd = connector.fetch_weather(str(weather_id_hyd), metric=True)
    data_dict_hyd = {}
    data_dict_hyd.update({
        'Current Temperature': weather_data_hyd["condition"]["temp"],
        'Sunrise': weather_data_hyd['astronomy']['sunrise'],
        'Sunset': weather_data_hyd['astronomy']['sunset'],
        'Max Temperature': (str(weather_data_hyd['forecast'][0]['high']) + " Degrees C"),
        'Min Temperature': (str(weather_data_hyd['forecast'][0]['low'] + " Degrees C")),
        'Wind': (str(weather_data_hyd['wind']['speed'] + " km/h")),
        'Condition': weather_data_hyd['condition']['text']})
    keys_list_hyd = data_dict_hyd.keys()

    while status:
        main_display.fill(black)
        pointer_location = pygame.mouse.get_pos()
        pointer_click = pygame.mouse.get_pressed()
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()

        # Music Button
        if 325 < pointer_location[0] < 405 and 20 < pointer_location[1] < 50:
            if pointer_click[0] == 1:
                wiki_status = 1
        if 700 < pointer_location[0] < 780 and 20 < pointer_location[1] < 50:
            if pointer_click[0] == 1:
                music_status = not music_status
                if music_status == 0:
                    pygame.mixer.music.pause()
                else:
                    pygame.mixer.music.unpause()

        # New York Button Check
        if 20 < pointer_location[0] < 80 and 20 < pointer_location[1] < 50:
            if pointer_click[0] == 1:
                weather_location = 2

        # Buffalo Button Check
        if 100 < pointer_location[0] < 160 and 20 < pointer_location[1] < 50:
            if pointer_click[0] == 1:
                weather_location = 1

        # Hyderabad Button Check
        if 180 < pointer_location[0] < 240 and 20 < pointer_location[1] < 50:
            if pointer_click[0] == 1:
                weather_location = 0

        try:
            main_display.blit(weather_image, (0, 0))
        except:
            pass

        # Data Display
        if weather_location == 0:
            data_display(110, data_dict_hyd['Current Temperature'], white, 80, 160)  # Temperature number
            data_display(20, "Deg C", white, 180, 130)  # Degree
            data_display(15, keys_list_hyd[5] + " : " + data_dict_hyd['Condition'], white, 95, 260)  # Condition
            data_display(15, keys_list_hyd[1] + " : " + data_dict_hyd['Min Temperature'], white, 130, 320)
            data_display(15, keys_list_hyd[6] + " : " + data_dict_hyd['Max Temperature'], white, 130, 360)
            data_display(15, keys_list_hyd[4] + " : " + data_dict_hyd['Sunrise'], white, 95, 400)  # Sunrise
            data_display(15, keys_list_hyd[0] + " : " + data_dict_hyd['Sunset'], white, 95, 440)  # Sunset
            data_display(15, keys_list_hyd[3] + " : " + data_dict_hyd['Wind'], white, 95, 480)  # Wind Speed
        elif weather_location == 1:
            data_display(110, data_dict_buffalo['Current Temperature'], white, 80, 160)  # Temperature number
            data_display(20, "Deg C", white, 180, 130)  # Degree
            data_display(15, keys_list_buffalo[5] + " : " + data_dict_buffalo['Condition'], white, 95, 260)
            data_display(15, keys_list_buffalo[1] + " : " + data_dict_buffalo['Min Temperature'], white, 130, 320)
            data_display(15, keys_list_buffalo[6] + " : " + data_dict_buffalo['Max Temperature'], white, 130, 360)
            data_display(15, keys_list_buffalo[4] + " : " + data_dict_buffalo['Sunrise'], white, 95, 400)
            data_display(15, keys_list_buffalo[0] + " : " + data_dict_buffalo['Sunset'], white, 95, 440)
            data_display(15, keys_list_buffalo[3] + " : " + data_dict_buffalo['Wind'], white, 95, 480)
        elif weather_location == 2:
            data_display(110, data_dict_ny['Current Temperature'], white, 80, 160)  # Temperature number
            data_display(20, "Deg C", white, 180, 130)  # Degree
            data_display(15, keys_list_ny[5] + " : " + data_dict_ny['Condition'], white, 95, 260)  # Condition
            data_display(15, keys_list_ny[1] + " : " + data_dict_ny['Min Temperature'], white, 130, 320)
            data_display(15, keys_list_ny[6] + " : " + data_dict_ny['Max Temperature'], white, 130, 360)
            data_display(15, keys_list_ny[4] + " : " + data_dict_ny['Sunrise'], white, 95, 400)  # Sunrise
            data_display(15, keys_list_ny[0] + " : " + data_dict_ny['Sunset'], white, 95, 440)  # Sunset
            data_display(15, keys_list_ny[3] + " : " + data_dict_ny['Wind'], white, 95, 480)  # Wind Speed

        # Display Wiki Article
        if wiki_status == 1:
            del data_list[:]
            wiki_status = 0
            blahblah = True
            try:
                url = 'http://en.wikipedia.org/wiki/Special:Random'
                if namespace != None:
                    url += '/' + namespace
                req = urllib2.Request(url, None, {'User-Agent': 'x'})
                page = urllib2.urlopen(req).readlines()
                wiki_draft1 = remove_tags(page[4])
                wiki_title = wiki_draft1[:wiki_draft1.index('Wikipedia') - 2]
                wiki_data_list = wiki_instance.find(wiki_title)
                wiki_data = wiki_instance.get_article(wiki_data_list[0])
                temp = endlinefunction(wiki_data.summary, data_list, 90)
            except (urllib2.HTTPError, urllib2.URLError):
                print "Failed to get article"
                raise

        # Buttons and Division Display
        pygame.draw.rect(main_display, white, (300, 0, 5, 600))
        pygame.draw.rect(main_display, white, (300, 70, 500, 5))
        drawbutton(wood, 700, 20, 80, 30, 10, "Toggle Music", black)
        drawbutton(white, 20, 20, 60, 30, 10, "New York", black)
        drawbutton(white, 100, 20, 60, 30, 10, "Buffalo", black)
        drawbutton(white, 180, 20, 60, 30, 10, "Hyderabad", black)
        drawbutton(wood, 325, 20, 80, 30, 10, "Next Article", black)

        # Cursor Display
        data_display(15, wiki_data.heading, wood, 540, 130)
        y_cood = 150
        j = 25
        for i in range(0, len(data_list)):
            y_cood = y_cood + j
            data_display(10, data_list[i], black, 540, y_cood)

        clock.tick(100)
        pygame.display.flip()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from articleData import my_articles
import requests
from wikiapi import WikiApi

wiki = WikiApi({'locale': 'es'})


def getURL(searchQuery):
    results = wiki.find(searchQuery)
    try:
        article = wiki.get_article(results[0])
    except:
        article = "no article exists for: " + searchQuery
    try:
        url = article.url
    except:
        url = "no url exists for: " + searchQuery
    # try:
    #     summary = article.summary
    # except:
    #     summary
    print url
    # print summary
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from wikiapi import WikiApi
import requests, pprint

# This is suitable for extracting content that is organized by pages under a title
# This code requires the wiki-api python library created by Richard O'Dwyer of UK
# https://github.com/richardasaurus/wiki-api

wiki = WikiApi()
wiki = WikiApi({'locale': 'ta'})  # to specify your locale, 'en' is default


# Get the page text of the article with the given title
def getArticleParagraphs(title):
    print(title)
    articleFull = wiki.get_article(title)
    fullText = articleFull.content
    article = ""
    paragraphs = fullText.split('\n\n')
    # print(paragraphs)
    # We want only whole paragraphs that end in a ".", "!", "?" or '"' not fragments
    for paragraph in paragraphs:
        if len(paragraph) > 30:
            end = paragraph[-1]
            if end == '.' or end == '!' or end == '?' or end == '"':
                article = article + "\n\n" + paragraph
    return article
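A hypothetical call to getArticleParagraphs above; the title is only an example and has to exist on the Tamil ('ta') Wikipedia configured for this script:

text = getArticleParagraphs('தமிழ்நாடு')
print(text[:500])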
except Exception as e:
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('words')
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')
    from nltk.corpus import stopwords
    from nltk.corpus import wordnet

import sys
import os
from wikiapi import WikiApi
import urllib2
import html2text

wiki_ = WikiApi()
common_ = set(nltk.corpus.words.words())
isNoun = lambda x: x[:2] == 'NN'


def url_exists(url):
    ret = urllib2.urlopen(url)
    if ret.code == 200:
        return True
    return False


def wiki_link(query):
    wikiLink = 'https://en.wikipedia.org/wiki/%s' % query
    if url_exists(wikiLink):
'''
Translate the names in VGGFace2 dataset into English.
You will need to manually install the following libraries:
    unidecode
    googletrans
    wikiapi
author: Feng Wang (UESTC)
'''
import os
import csv
import string

import unidecode
from googletrans import Translator
translator = Translator()
from wikiapi import WikiApi
wiki = WikiApi()


def is_number(uchar):
    return uchar >= u'0' and uchar <= u'9'


def is_alphabet(uchar):
    return (uchar >= u'a' and uchar <= u'z') or (uchar >= u'A' and uchar <= u'Z')


def check_english(name):
    flag = True
    for uchar in name:
        if (not is_alphabet(uchar)) and (not is_number(uchar)) and (uchar != u'\u0020') and (uchar != u'-') and (uchar != u'.'):
            flag = False
    return flag


def non_english_character_count(name):
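A quick, illustrative sanity check of the character helpers defined above (the names are made-up examples):

print(check_english(u'Bob Marley'))    # True: only ASCII letters, spaces, '-' and '.' are allowed
print(check_english(u'Jürgen Klopp'))  # False: 'ü' falls outside the allowed character set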
    restagcont = []
    for i in range(len(ttcontent)):
        if ttcontent[i][1]:
            # print ttcontent[i][0], ":", ttcontent[i][1]
            restagcont.append(ttcontent[i][0].lower())
    keyw = []
    stpw = ["navigation", "search", "about", "http", "edit", "Intro", "read", "help", "removed", "above"]
    for i in restagcont:
        if i not in stpw:
            keyw.append(i)
    return keyw

#************************************
#************************************
wiki = WikiApi({})
dic_cont = {}  # dictionary
mlist = []  # word base
#************************************
for wtopic in file1.readlines():
    w = wtopic.split()
    mlist.append(w[0])
    results = wiki.find(w[0])
    if results:
        article = wiki.get_article(results[0])
        r = article.content
        rtoken = wordpunct_tokenize(r)
        # implementation of stopwords
# -*- coding: utf-8 -*-
from wikiapi import WikiApi
import unittest

wiki = WikiApi({})
results = wiki.find('Bill Clinton')
article = wiki.get_article(results[0])  # taking first search result


class TestWiki(unittest.TestCase):

    def test_heading(self):
        self.assertIsNotNone(article.heading)

    def test_image(self):
        self.assertTrue(isinstance(article.image, str))

    def test_summary(self):
        self.assertGreater(len(article.summary), 100)

    def test_content(self):
        self.assertGreater(len(article.content), 200)

    def test_references(self):
        self.assertTrue(isinstance(article.references, list))

    def test_url(self):
        self.assertTrue(article.url,
                        u"http://en.wikipedia.org/wiki/Bill_Clinton")

    def test_get_relevant_article(self):
        keywords = ['president', 'hilary']
        _article = wiki.get_relevant_article(results, keywords)
        self.assertTrue('Bill Clinton' in _article.heading)

    def test_get_relevant_article_no_result(self):