def search_wiki(self, k):
    # Scrape the query's Wikipedia article.
    article = Wikipedia().search(k)
    if article is None:
        try:
            ny = wikipedia.page(k)
        except requests.exceptions.ConnectionError:
            return 'ReStArT'
        except wikipedia.exceptions.DisambiguationError as e:
            # Fall back to the first disambiguation option.
            try:
                ny = wikipedia.page(e.options[0])
            except requests.exceptions.ConnectionError:
                return 'ReStArT'
            except wikipedia.exceptions.DisambiguationError:
                return -5
        except wikipedia.exceptions.PageError:
            return -5
        article = Wikipedia().search(ny.title)
    contents = [section.content for section in article.sections]
    kk = 0
    for content in contents:
        if len(content) == 0:
            kk += 1
    if contents == [] or kk == len(contents):
        # All sections are empty: fetch the raw page content instead.
        try:
            ny = wikipedia.page(k)
        except requests.exceptions.ConnectionError:
            return 'ReStArT'
        except wikipedia.exceptions.DisambiguationError as e:
            try:
                ny = wikipedia.page(e.options[0])
            except requests.exceptions.ConnectionError:
                return 'ReStArT'
            except wikipedia.exceptions.DisambiguationError:
                return -5
        content = ny.content
        if len(content) == 0:
            return -5
        contents = [content]
    d = []
    for content in contents:
        d.append(content.split())
    # Take only meaningful content: alphabetic tokens longer than one character.
    content = [j for i in d for j in i
               if re.match("^[a-zA-Z_-]*$", j) and len(j) > 1]
    self.content = ' '.join(content)
    return self.content
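The 'ReStArT' sentinel only helps if the caller retries. A minimal retry sketch, assuming a hypothetical WikiScraper class that exposes the search_wiki method above:

def fetch_with_retry(scraper, query, max_retries=3):
    # Retry on the transient-connection sentinel; give up on hard failures.
    for _ in range(max_retries):
        result = scraper.search_wiki(query)
        if result == 'ReStArT':   # connection dropped: try again
            continue
        if result == -5:          # no usable page: give up
            return None
        return result             # cleaned article text
    return None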
def get_articles(self, start, depth=1, max_count=1):
    article = Wikipedia().article(start)
    links = article.links
    list_of_strings = []
    for link in links:
        text = Wikipedia().article(link)
        text = self.text_cleaning(plaintext(text.source))
        list_of_strings.append(text)
    return list_of_strings
def get_articles(self, start):
    article = Wikipedia().article(start)
    links = article.links
    list_of_strings = []
    for l in links:
        l_art = Wikipedia().article(l)
        l_text = re.sub(' +', ' ', plaintext(l_art.source), flags=re.DOTALL)
        # Strip punctuation and digits (hyphen escaped so it is not read as a range).
        l_text = re.sub(r'[.;:!?\'"\-()0-9@,]+', '', l_text, flags=re.DOTALL).lower()
        list_of_strings.append(l_text.split())
    return list_of_strings
def testWikipedia(palabra):
    """ Fetches the Wikipedia article for the given word. """
    engine = Wikipedia(license=None, throttle=5.0)
    resultados = engine.search(palabra)
    print(resultados.plaintext())
    return resultados
def get_articles(self, start):
    arr = []
    wiki = Wikipedia(language="en")
    article = wiki.article(start)
    arr.append(self.text_cleaning(article))
    for title in article.links:
        article = wiki.article(title)
        arr.append(self.text_cleaning(article))
    return arr
def getFeaturedList():
    wiki = Wikipedia()
    article = wiki.search("Wikipedia:Featured articles")
    file = open("articalsTitle.txt", 'w')
    for section in article.sections:
        if section.string != "":
            # One featured-article title per line of the section text.
            for title in section.string.split('\n'):
                file.write(str(title).strip() + "\n")
    file.close()
def get_articles(self, start):
    list_of_strings = []
    article = Wikipedia().article(start).plaintext()
    list_of_strings.append(self.plain_text(article))
    # for link in Wikipedia().article(start).links:
    #     if Wikipedia().article(link).language is 'en':
    #         list_of_strings.append(self.plain_text(link))
    return list_of_strings
def get_articles(self, start):
    from pattern.web import Wikipedia, plaintext
    # Truncate the output file before appending below.
    open(start + '.txt', 'w').close()
    list_of_strings = []
    start_article = Wikipedia().article(start)
    for link in start_article.links:
        cur_article = Wikipedia().article(link)
        if cur_article is not None:
            newtext = plaintext(cur_article.source)
            newtext = self.normalize(newtext)
            list_of_strings.append(newtext)
            with open(start + '.txt', 'a', encoding='utf-8') as file:
                file.write(newtext)
                file.write('\n\n-----\n\n')
    return list_of_strings
def show_results(self):
    parser = WikiParser()
    articles = parser.get_articles('Natural language processing')
    art_statistics = TextStatistics(articles)
    print('top-20 3-grams over the article corpus\r\n' + art_statistics.get_top_3grams(20))
    print('top-20 words over the article corpus\r\n' + art_statistics.get_top_words(20))
    article = Wikipedia().article('Natural language processing')
    a_statistics = TextStatistics(article)
    print('top-5 3-grams for the article itself\r\n' + a_statistics.get_top_3grams(5))
    print('top-5 words for the article itself\r\n' + a_statistics.get_top_words(5))
def wiki(titulo):
    article = Wikipedia(language="en").search(query=titulo)
    result = []
    if article:
        for section in article.sections:
            result.append(section.title + "\n" + section.string + "\n")
            for link in section.links:
                result.append(link + "\n")
    return result
def getFeaturedContent():
    wiki = Wikipedia()
    titles = open("articalsTitle.txt", 'r')
    file = open("wikiData.txt", 'w')
    for i in range(2000):
        title = titles.readline().replace("\n", "")
        article = wiki.search(title)
        if article is not None:
            for section in article.sections:
                if section.string != "":
                    file.write(section.string + "\n")
        time.sleep(0.2)
        print(title + " Get! " + str(i) + "/2000")
    file.close()
    titles.close()
def get_articles(self, search_term):
    text_list = []
    article = Wikipedia().search(search_term)
    links = article.links
    s = self.__get_text(search_term)
    text_list.append(s)
    for link in links:
        text_list.append(self.__get_text(link))
        print(link)
    return text_list
def gender(name):
    """ Returns the gender of the given person (m/f). """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        s = plaintext(p.string)
        s = s.lower()
        s = s.replace("\n", "\n ")
        # Count male vs. female pronouns in the article text.
        m = sum(s.count(" %s " % x) for x in ("he", "his"))
        f = sum(s.count(" %s " % y) for y in ("she", "her"))
        g = "m" if m > f else "f"  # More "he" or more "she"?
        return g
    except:
        return None
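A quick usage sketch for gender(); the names are illustrative only, and the pronoun-count heuristic can misfire on articles that discuss several people:

for name in ("Marie Curie", "Alan Turing"):
    print(name, gender(name))  # expected "f" then "m", if the heuristic holds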
def search_wiki(self, k):
    # Scrape the query's Wikipedia article.
    article = Wikipedia().search(k)
    contents = [section.content for section in article.sections]
    d = []
    for content in contents:
        d.append(content.split())
    # Take only meaningful content: alphabetic tokens longer than one character.
    content = [j for i in d for j in i
               if re.match("^[a-zA-Z_-]*$", j) and len(j) > 1]
    self.content = ' '.join(content)
    return self.content
def get_info(search_query):
    if not isinstance(search_query, str):
        return {"Error": "Pass a string, from mine.py [7]", "Result": [None]}
    result = []
    engineGoogle = Google(license=None, throttle=0.5, language=None)
    engineBing = Bing(license=None, throttle=0.5, language=None)
    engineTwitter = Twitter(license=None, throttle=0.5, language=None)
    engineFacebook = Facebook(license=None, throttle=1.0, language='en')
    engineWikipedia = Wikipedia(license=None, throttle=5.0, language=None)
    engineFlickr = Flickr(license=None, throttle=5.0, language=None)
    # Only Google and Twitter are actually queried below.
    for i in range(1, 5):
        result += [para.text for para in engineGoogle.search(search_query, type=SEARCH, start=i, count=10)]
    for i in range(1, 5):
        result += [para.text for para in engineTwitter.search(search_query, type=SEARCH, start=i, count=10)]
    return {"Error": None, "Result": result}
def crawl_wiki():
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords()
    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print(str(e))
                article = None
            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print('\nretrieving', keyword, '...')
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print('\n[', keyword, '] leads to disambiguation page!')
                    stop = True
                    if '-' in keyword:
                        keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()
                        stop = False
            else:
                print('\n[', keyword, '] doesn\'t exist on wikipedia!')
                stop = True
                if '-' in keyword:
                    keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()
                    stop = False
    enpickle(wikis, 'data/txt/wiki.pkl')
    return wikis
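The two retry branches above repeat the same keyword rewrites. Factored into one helper (a sketch, not part of the original code), the normalization logic reads:

def normalize_keyword(keyword):
    # Returns (new_keyword, changed): hyphens become spaces, and short
    # all-lowercase terms are retried as acronyms.
    changed = False
    if '-' in keyword:
        keyword = keyword.replace('-', ' ')
        changed = True
    if keyword.islower() and len(keyword) <= 5:
        keyword = keyword.upper()
        changed = True
    return keyword, changed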
def isnews(topic):
    engine = Wikipedia()
    result = engine.search(topic)
    if result:
        if topic.lower() not in result.title.lower():
            return False
        newsthings = ['places', 'cities', 'capitals', 'countries', 'people', 'wars']
        categories = result.categories
        for category in categories:
            for thing in newsthings:
                if thing in category.lower():
                    return True
        return False
    else:
        return False
def celebrities():
    """ Returns a list of celebrities from Wikipedia. """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    # Extract the links from this page, excluding links in the footnotes section,
    # or links to band names (we just want names of individuals).
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals",):
            for name in section.links:
                # Yes = * [Rob Zombie], musician and filmmaker
                # No  = * Mark White, bass guitarist for [Spin Doctors]
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
def age(name):
    """ Returns the age of the given person. """
    # Use regular expressions to try and parse
    # a number of date formats from Wikipedia.
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        s = plaintext(p.string)
        s = re.sub(r"\[[0-9]+\]", "", s)  # strip footnote markers
        r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)"  # (born 1 December 2000)
        x = t(".bday")
        y = t(".dday")
        x = x[0].content if x else re.search(r, s).group(2)
        y = y[0].content if y else str(date().year)
        x = plaintext(x)
        y = plaintext(y)
        x = x.split("-")[0]  # YYYY-MM-DD
        y = y.split("-")[0]
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = r"[A-Za-z]+ [0-9]+, ([0-9]{4})"
        r = r"\(%s – %s\)" % (r, r)  # (May 15, 1912 – October 7, 2003)
        x = re.search(r, s).group(1)
        y = re.search(r, s).group(2)
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = r"\(aged ([0-9]+)\)"  # (aged 78)
        a = t(".infobox td:contains('aged')")
        a = a[0].content
        a = plaintext(a)
        a = re.search(r, a).group(1)
        a = int(a)
        return a
    except:
        pass
    return None
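To see what the first date pattern in age() captures, a small standalone check (the sample strings are made up):

import re

r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)"
for sample in ("(born 1 December 2000)", "(born 1985)"):
    print(re.search(r, sample).group(2))  # prints "2000", then "1985"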
def __get_text(self, search_term):
    article = Wikipedia().search(search_term)
    if article:
        s = article.string
        if s:
            while '\n' in s:
                s = s.replace('\n', ' ')
            while '  ' in s:
                s = s.replace('  ', ' ')  # collapse double spaces
            s = re.sub(r'[^\w\s]', '', s)
            s = s.lower()
            return s
    return None
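The two replace loops can also be collapsed into a single regex pass; a close (not identical: \s+ also swallows tabs) alternative sketch:

def clean_text(s):
    s = re.sub(r'\s+', ' ', s)     # newlines and repeated spaces -> one space
    s = re.sub(r'[^\w\s]', '', s)  # strip punctuation
    return s.lower()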
def __init__(self):
    from pattern.web import Wikipedia, plaintext
    import string
    import re
    self.parser = WikiParser()
    self.articles = self.parser.get_articles('Natural_language_processing')
    self.stats = TextStatistics(self.articles)
    self.grams = self.stats.get_top_3grams()
    self.words = self.stats.get_top_words()
    self.top_grams_out = [' - '.join([self.grams[0][i], str(self.grams[1][i])])
                          for i in range(20)]
    self.top_grams = self.grams[0][:20]
    self.top_words_out = [' - '.join([self.words[0][i], str(self.words[1][i])])
                          for i in range(20)]
    self.top_words = self.words[0][:20]
    nlp = Wikipedia().article('Natural_language_processing')
    nlptext = plaintext(nlp.source)
    parse = WikiParser()
    nlptext = parse.normalize(nlptext)
    self.nlp_stats = TextStatistics([nlptext])
    nlp_grams = self.nlp_stats.get_top_3grams()
    nlp_words = self.nlp_stats.get_top_words()
    self.nlp_top_grams = [' - '.join([nlp_grams[0][i], str(nlp_grams[1][i])])
                          for i in range(5)]
    self.nlp_top_words = [' - '.join([nlp_words[0][i], str(nlp_words[1][i])])
                          for i in range(5)]
def get_wiki_article(self, search, separate_in_section=False, type_of_text=u'Plain text'):
    segments = list()
    article = Wikipedia(language=self.dico_lang[self.language]).search(
        search, cached=False)
    if article:
        if separate_in_section:
            for section in article.sections:
                if type_of_text == u'Plain text':
                    wiki_article = Input(section.string)
                else:
                    wiki_article = Input(section.html)
                annotations = {
                    'source': 'Wikipedia',
                    'section title': section.title,
                    'section level': section.level,
                    'search': search,
                }
                segment = wiki_article[0]
                segment.annotations.update(annotations)
                wiki_article[0] = segment
                segments.append(wiki_article)
        else:
            if type_of_text == u'Plain text':
                wiki_article = Input(article.string)
            else:
                wiki_article = Input(article.html)
            annotations = {
                'source': 'Wikipedia',
                'search': search,
            }
            segment = wiki_article[0]
            segment.annotations.update(annotations)
            wiki_article[0] = segment
            segments.append(wiki_article)
    return segments
def Avg(s):
    '''
    The Averaging Function

    Takes in a string, searches for the Wikipedia article and then
    returns the average length of its words.
    No, there is no error handling.
    '''
    length = 0
    numb = 0
    # if s in db: return float(db[s])
    article = Wikipedia().search(s)
    for i in range(len(article.sections)):
        if article.sections[i].title == "See also":
            break
        text = article.sections[i].content.replace('\n', ' ')
        for word in text.split(' '):
            numb += 1
            length += len(word)  # accumulate character count for the average
    db[s] = str(float(length) / float(numb))
    return float(length) / float(numb)
""" Retrieves an article from Wikipedia (http://en.wikipedia.org). """ from pattern.web import Wikipedia ENGINE = Wikipedia(language="en") def main(search_query): """ Returns Twitter Search Results :param search_query: (str) """ final = "Wikipedia Search Results:" article = ENGINE.search(search_query, cached=True, timeout=30) print article.title # Article title (may differ from the search query). print "" print article.languages[ "tr"] # Article in Turkish, can be retrieved with Wikipedia(language="tr"). print article.links[:10] # List of linked Wikipedia articles. print article.external[:5] # List of external URL's. print "" for s in article.sections: final = final + "\n\n" + s.title.upper() + "\n" + s.content return final if __name__ == '__main__': main("")
from __future__ import print_function

import os
import sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wikipedia

# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
# Wikipedia queries request the article HTML source from the server. This can be slow.
# It is a good idea to cache results from Wikipedia locally,
# and to set a high timeout when calling Wikipedia.search().
engine = Wikipedia(language="en")

# Contrary to the other search engines in the pattern.web module,
# Wikipedia simply returns one WikipediaArticle object (or None),
# instead of a list of results.
article = engine.search("alice in wonderland", cached=True, timeout=30)

# Article title (may differ from the search query).
print(article.title)
print()

# Article in French, can be retrieved with Wikipedia(language="fr").
print(article.languages["fr"])

print(article.links[:10], "...")  # List of linked Wikipedia articles.
print(article.external[:5], "...")  # List of external URL's.
print()

# print(article.source)  # The full article content as HTML.
# print(article.string)  # The full article content, plain text with HTML tags stripped.
# Word frequency.
from pattern.web import Wikipedia
from pattern.vector import words

frequency = {}

# Spreading activation.
# Parse links from seed article & visit those articles.
links, seen = set(["Italia"]), {}
while len(links) > 0:
    try:
        article = Wikipedia(language="it").search(links.pop(), throttle=10)
        seen[article.title] = True
        # Parse links from article.
        for link in article.links:
            if link not in seen:
                links.add(link)
        # Parse words from article. Count words.
        for word in words(article.string):
            if word not in frequency:
                frequency[word] = 0
            frequency[word] += 1
    except:
        pass  # skip articles that fail to download or parse
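The snippet breaks off mid-crawl; once frequency is populated, the natural follow-up is to rank the counts, e.g.:

# A sketch, not in the original: print the ten most frequent words seen so far.
for word, count in sorted(frequency.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(word, count)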
def main():
    article = Wikipedia(language="en").search('small', throttle=10)
    print(article.string)
def descarga(titulo):
    engine = Wikipedia(language="en")
    result = engine.search(titulo, type=SEARCH)
    return repr(plaintext(result.string))
from pattern.web import Wikipedia
import re
from string import punctuation
from pymorphy2 import MorphAnalyzer
import pandas as pd

safe = lambda x: 0 if x < 0 else x

wiki = Wikipedia(language="ru")
punc = re.compile('[%s]' % re.escape(punctuation))
start_points = ["Проектирование", "Номинативная_конструкция"]
lemma = "конструкция"
translator = str.maketrans('', '', punctuation)
regex = re.compile("[%s0-9]" % (punctuation + "—"))
analyzer = MorphAnalyzer()
stoplinks = ["Википедия", "Английский"]

with open("stop-words-russian.txt") as io:
    stopwords = [x.strip() for x in io.readlines()]


def get_articles(max_count):
    sents = {0: set(), 1: set()}
    for i in range(len(start_points)):
        start = start_points[i]
        links = [start]
        list_of_strings = []
        while len(sents[i]) <= max_count:
            links_temp = []
            for link in links:
                if len(list_of_strings) <= max_count: