def search_wiki(self, k):
    # scrape query's wikipedia article
    article = Wikipedia().search(k)
    if article is None:
        try:
            ny = wikipedia.page(k)
        except requests.exceptions.ConnectionError as e:
            return 'ReStArT'
        # except wikipedia.exceptions.HTTPTimeoutError as e:
        #     print('here1')
        #     print(str(e))
        # except requests.exceptions.ConnectTimeout as e:
        #     print('here2')
        #     print(str(e))
        except wikipedia.exceptions.DisambiguationError as e:
            try:
                ny = wikipedia.page(e.options[0])
            except requests.exceptions.ConnectionError as e:
                return 'ReStArT'
            except wikipedia.exceptions.DisambiguationError as e:
                return -5
            except wikipedia.exceptions.PageError as e:
                return -5
        article = Wikipedia().search(ny.title)
    contents = [section.content for section in article.sections]
    kk = 0
    for content in contents:
        if len(content) == 0:
            kk = kk + 1
    if contents == [] or kk == len(contents):
        try:
            ny = wikipedia.page(k)
        except requests.exceptions.ConnectionError as e:
            return 'ReStArT'
        except wikipedia.exceptions.DisambiguationError as e:
            try:
                ny = wikipedia.page(e.options[0])
            except requests.exceptions.ConnectionError as e:
                return 'ReStArT'
            except wikipedia.exceptions.DisambiguationError as e:
                return -5
        content = ny.content
        if len(content) == 0:
            return -5
        else:
            contents = []
            contents.append(content)
    d = []
    for content in contents:
        a = content.split()
        d.append(a)
    content = [j for i in d for j in i
               if re.match("^[a-zA-Z_-]*$", j) and len(j) > 1]  # take only meaningful content
    self.content = ' '.join(content)
    return self.content
def extract_unique_nes(self, input_dir='../workspace/frog_periodicals', fresh=False,
                       max_documents=None, max_words_per_doc=None,
                       filename='../workspace/nes2wikilinks.p'):
    """
    Extracts all unique entities in the frogged files under input_dir as a dict.
    Registers in this dict: which relevant wiki-pages the NE could refer to
    according to the Wikipedia search interface.
    Only considers NEs that are:
        * capitalized
        * have len > 3 (cf. 'Van')
        * don't end in a dot (e.g. 'A.F.Th.')
        * tagged as B-PER by Frog
    """
    if fresh:
        print('Extracting NEs from ', max_documents, 'documents!')
        wikipedia = Wikipedia(language='nl', throttle=3)
        self.nes2wikilinks = {}
        for filepath in glob.glob(input_dir + '/*.txt.out'):
            max_words = max_words_per_doc
            for line in codecs.open(filepath, 'r', 'utf8'):
                try:
                    comps = [c for c in line.strip().split('\t') if c]
                    idx, token, lemma, pos, conf, ne = comps
                    token = token.replace('_', ' ')
                    if ne.startswith('B-PER') and token[0].isupper() and len(token) > 3 and not token.endswith('.'):
                        if token not in self.nes2wikilinks:
                            try:  # to look up the page in wikipedia:
                                article = wikipedia.search(token)
                                if article:  # if we find something...
                                    if article.categories[0] == 'Wikipedia:Doorverwijspagina':  # we are dealing with a referral page
                                        for link in article.links:
                                            if link in self.page_ids:
                                                if token not in self.nes2wikilinks:
                                                    self.nes2wikilinks[token] = set()
                                                self.nes2wikilinks[token].add(link)
                                    else:
                                        if article.title in self.page_ids:
                                            self.nes2wikilinks[token] = set([article.title])
                            except:  # probably a download issue...
                                continue
                    max_words -= 1
                    if max_words < 0:
                        break
                except ValueError:  # probably parsing error in the frog file
                    continue
            # update stats:
            max_documents -= 1
            if max_documents % 10 == 0:
                print('\t+ ', max_documents, 'documents to go')
                print('\t+ ', len(self.nes2wikilinks), 'NEs collected')
            if max_documents < 0:
                break
        pickle.dump(self.nes2wikilinks, open(filename, 'wb'))
    else:
        self.nes2wikilinks = pickle.load(open(filename, 'rb'))
def get_articles(self, start, depth=1, max_count=1):
    article = Wikipedia().article(start)
    links = article.links
    list_of_strings = []
    for link in links:
        text = Wikipedia().article(link)
        text = self.text_cleaning(plaintext(text.source))
        list_of_strings.append(text)
    return list_of_strings
def get_articles(self, start):
    arr = []
    wiki = Wikipedia(language="en")
    article = wiki.article(start)
    arr.append(self.text_cleaning(article))
    for title in article.links:
        article = wiki.article(title)
        arr.append(self.text_cleaning(article))
    return arr
def testWikipedia(palabra):
    """ Retrieves the Wikipedia article for the given word. """
    engine = Wikipedia(license=None, throttle=5.0)
    resultados = engine.search(palabra)
    print resultados.plaintext()
    return resultados
def getFeaturedList():
    wiki = Wikipedia()
    article = wiki.search("Wikipedia:Featured articles")
    file = open("articalsTitle.txt", 'w')
    for section in article.sections:
        if section.string != "":
            for title in section.string.split('\n'):
                file.write(str(title).strip() + "\n")
    file.close()
def mentions_from_backlinks(self, backlinks={}, fresh=False, filename='../workspace/mentions.p',
                            context_window_size=150):
    """
    Mines backlinking pages for mentions of the page_ids in backlinks.
    Returns 5 tuples, with for each mention:
        * target_id (correct page title)
        * the name variant (inside the a-tag)
        * left context of the mention (contiguous character string, with len = context_window_size)
        * right context of the mention (contiguous character string, with len = context_window_size)
        * a counter of other page_ids mentioned on the page
    """
    if not backlinks:
        backlinks = self.backlinks
    # initialize data containers:
    target_ids, name_variants, left_contexts, right_contexts, page_counts = [], [], [], [], []
    if fresh:
        print('>>> Mining mentions from', sum([len(v) for k, v in backlinks.items()]),
              'backlinking pages to', len(backlinks), 'target pages')
        print(backlinks)
        wikipedia = Wikipedia(language='nl', throttle=2)
        for idx, (page_id, links) in enumerate(backlinks.items()):
            print('\t + mining mentions of', page_id, '(' + str(len(links)), 'backlinks) | page', idx + 1, '/', len(backlinks))
            for backlink in links:
                try:
                    article = wikipedia.search(backlink)  # fetch the linking page via pattern
                    if not article.categories[0] == 'Wikipedia:Doorverwijspagina':  # skip referral pages
                        print('\t\t* backlink:', backlink)
                        section_sources = []
                        # fetch the html-sections of individual sections:
                        if not article.sections:  # article doesn't have sections
                            section_sources = [article.source]
                        else:
                            section_sources = [section.source for section in article.sections]
                        # loop over the section sources and extract all relevant mentions:
                        for section_source in section_sources:
                            ts, nvs, lcs, rcs, cnts = self.mentions_from_section(
                                source=section_source,
                                target_id=page_id,
                                context_window_size=context_window_size)
                            if nvs:
                                target_ids.extend(ts)
                                name_variants.extend(nvs)
                                left_contexts.extend(lcs)
                                right_contexts.extend(rcs)
                                page_counts.extend(cnts)
                except:
                    continue
        pickle.dump((target_ids, name_variants, left_contexts, right_contexts, page_counts),
                    open(filename, 'wb'))
    else:
        target_ids, name_variants, left_contexts, right_contexts, page_counts = \
            pickle.load(open(filename, 'rb'))
    self.mentions = (target_ids, name_variants, left_contexts, right_contexts, page_counts)
def get_articles(self, start):
    article = Wikipedia().article(start)
    links = article.links
    list_of_strings = []
    for l in links:
        l_art = Wikipedia().article(l)
        l_text = re.sub(' +', ' ', plaintext(l_art.source), flags=re.DOTALL)
        l_text = re.sub('[.;:!?\'\"-\(\)1234567890@,]+', '', l_text, flags=re.DOTALL).lower()
        list_of_strings.append(l_text.split())
    return list_of_strings
def getFeaturedContent():
    wiki = Wikipedia()
    list = open("articalsTitle.txt", 'r')
    file = open("wikiData.txt", 'w')
    for i in range(2000):
        title = list.readline().replace("\n", "")
        article = wiki.search(title)
        if article is not None:
            for section in article.sections:
                if section.string != "":
                    file.write(section.string + "\n")
        time.sleep(0.2)
        print(title + " Get! " + str(i) + "/2000")
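A minimal driver sketch (not from the original source) showing how the two featured-article helpers above are meant to be chained: getFeaturedList() writes the title list that getFeaturedContent() later reads.

# Sketch only: assumes both helpers and their imports (pattern.web.Wikipedia, time)
# live in the same module.
if __name__ == '__main__':
    getFeaturedList()       # writes articalsTitle.txt
    getFeaturedContent()    # reads articalsTitle.txt, writes wikiData.txt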
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals",):
            for name in section.links:
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
def gender(name):
    """ Returns the gender of the given person (m/f).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        s = plaintext(p.string)
        s = s.lower()
        s = s.replace("\n", "\n ")
        m = sum(s.count(" %s " % x) for x in ("he", "his"))
        f = sum(s.count(" %s " % y) for y in ("she", "her"))
        g = m > f and "m" or "f"  # More "he" or more "she"?
        return g
    except:
        return None
def isnews(topic):
    engine = Wikipedia()
    result = engine.search(topic)
    if result:
        if topic.lower() not in result.title.lower():
            return False
        newsthings = ['places', 'cities', 'capitals', 'countries', 'people', 'wars']
        categories = result.categories
        for category in categories:
            for thing in newsthings:
                if thing in category.lower():
                    return True
        return False
    else:
        return False
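Hypothetical usage of isnews(); the outcome depends on the category strings Wikipedia returns at query time.

# Hypothetical queries; results depend on the live category data.
print(isnews("Berlin"))          # expected True: categories usually mention capitals/cities
print(isnews("Photosynthesis"))  # expected False: no news-like category keywords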
class WikiParser:
    def __init__(self):
        self.wiki = Wikipedia(language="en")
        self.punc = re.compile('[%s]' % re.escape(punctuation))

    def get_articles(self, start, depth, max_count):
        iterations = 0
        links = [start]
        list_of_strings = []
        while iterations <= depth and len(list_of_strings) <= max_count:
            links_temp = []
            for link in links:
                if iterations <= depth and len(list_of_strings) <= max_count:
                    try:
                        article = self.wiki.article(link)
                        text = self.process(article.plaintext())
                        new_links = article.links
                        list_of_strings.append(text)
                        links_temp.extend(new_links)
                        print(f"Processed link {link}")
                    except AttributeError:
                        print(f"Skipped link {link}")
                        continue
                else:
                    break
            links = links_temp
            iterations += 1
        return list_of_strings

    def process(self, text):
        tokens = text.split(" ")
        return " ".join([self.punc.sub("", x.lower().strip()) for x in tokens])
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    # Extract the links from this page, excluding links in the footnotes section,
    # or links to band names (we just want names of individuals).
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals",):
            for name in section.links:
                # Yes = * [Rob Zombie], musician and filmmaker
                # No  = * Mark White, bass guitarist for [Spin Doctors]
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
def crawl_wiki():
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords()
    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None
            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to disambiguation page!',
                    stop = True
                    if '-' in keyword:
                        keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on wikipedia!',
                stop = True
                if '-' in keyword:
                    keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()
                    stop = False
    enpickle(wikis, 'data/txt/wiki.pkl')
    return wikis
def age(name):
    """ Returns the age of the given person.
    """
    # Use regular expressions to try and parse
    # a number of date formats from Wikipedia.
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        s = plaintext(p.string)
        s = re.sub(r"\[[0-9]+\]", "", s)
        r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)"  # (born 1 December 2000)
        x = t(".bday")
        y = t(".dday")
        x = x[0].content if x else re.search(r, s).group(2)
        y = y[0].content if y else str(date().year)
        x = plaintext(x)
        y = plaintext(y)
        x = x.split("-")[0]  # YYYY-MM-DD
        y = y.split("-")[0]
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = ur"[A-Za-z]+ [0-9]+, ([0-9]{4})"
        r = ur"\(%s – %s\)" % (r, r)  # (May 15, 1912 – October 7, 2003)
        x = re.search(r, s).group(1)
        y = re.search(r, s).group(2)
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = r"\(aged ([0-9]+)\)"  # (aged 78)
        a = t(".infobox td:contains('aged')")
        a = a[0].content
        a = plaintext(a)
        a = re.search(r, a).group(1)
        a = int(a)
        return a
    except:
        pass
    return None
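A quick, self-contained check of the birth-date pattern used above. The input strings are made up for illustration; the regex is the one from age().

# Illustrative strings only; same regex as in age() above.
import re
r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)"
print re.search(r, "John Doe (born 1 December 2000) is ...").group(2)  # -> 2000
print re.search(r, "Jane Doe (born 1974) was ...").group(2)            # -> 1974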
def crawl_wiki(model_path):
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords(model_path=model_path, threshold=0.001)
    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None
            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to disambiguation page!',
                    stop = True
                    if '-' in keyword:
                        keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on wikipedia!',
                stop = True
                if '-' in keyword:
                    keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()
                    stop = False
    enpickle(wikis, 'data/others/wikis.pkl')
    print '\n'
    return wikis
def education(name, discrete=False, raw=False):
    """ Returns the education level of the given person (0.0-1.0).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        # e = the percentage of links to articles about academic titles / achievements.
        e = [t("a[href*='%s']" % x) for x in (
            "academi", "academy_of", "bachelor_of", "college", "degree",
            "doctor", "emeritus", "engineer", "faculty", "fellow",
            "genius", "grandmaster", "institut", "invent", "master_of",
            "mathemati", "phd", "ph.d", "physics", "professor",
            "school_of", "scien", "student", "universi", "valedictor",
        )]
        e = sum(map(len, e))
        e = e / float(len(t("a")))
        if raw:
            return e
        # Convert e to a human-interpretable range (0.0-1.0),
        # based on observations in the list of p people below,
        # i.e., Pattie Maes should be > 0.9, Miley Cirus < 0.5.
        e = max(e, 0.0)
        e = min(e, 1.0)
        m = {0.000: 0.40, 0.003: 0.50, 0.010: 0.60, 0.020: 0.70, 0.030: 0.80, 0.060: 0.90, 1.000: 1.00}
        for x, y in zip(sorted(m), sorted(m)[1:]):
            if y > e:
                e = m[x] + (m[y] - m[x]) * (e - x) / (y - x)
                break
        # With discrete=True, returns "+" or "-".
        e = e if not discrete else ("-", "+")[e > 0.01]
        return e
    except:
        return None
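A worked example (not part of the original) of the piecewise-linear rescaling used in education(): a hypothetical raw link ratio of 0.015 falls between the anchors 0.010 -> 0.60 and 0.020 -> 0.70 and therefore maps to 0.65.

# Worked example of the interpolation step in education(), with the same table.
m = {0.000: 0.40, 0.003: 0.50, 0.010: 0.60, 0.020: 0.70, 0.030: 0.80, 0.060: 0.90, 1.000: 1.00}
e = 0.015  # hypothetical raw ratio of "academic" links
for x, y in zip(sorted(m), sorted(m)[1:]):
    if y > e:
        e = m[x] + (m[y] - m[x]) * (e - x) / (y - x)
        break
print(e)  # 0.65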
def get_articles(self, start):
    from pattern.web import Wikipedia, plaintext
    file = open(start + '.txt', 'w')
    file.write('')
    file.close()
    list_of_strings = []
    start_article = Wikipedia().article(start)
    links = set(start_article.links)
    for link in start_article.links:
        cur_article = Wikipedia().article(link)
        if cur_article != None:
            newtext = plaintext(cur_article.source)
            newtext = self.normalize(newtext)
            list_of_strings.append(newtext)
            file = open(start + '.txt', 'a')
            file.write(newtext.encode('utf-8'))
            file.write('\n\n-----\n\n')
            file.close()
    return list_of_strings
def get_articles(self, start):
    list_of_strings = []
    article = Wikipedia().article(start).plaintext()
    list_of_strings.append(self.plain_text(article))
    #for link in Wikipedia().article(start).links:
    #    if Wikipedia().article(link).language is 'en':
    #        list_of_strings.append(self.plain_text(link))
    return list_of_strings
def show_results(self):
    parser = WikiParser()
    articles = parser.get_articles('Natural language processing')
    art_statistics = TextStatistics(articles)
    print('top-20 3-grams over the article corpus\r\n' + art_statistics.get_top_3grams(20))
    print('top-20 words over the article corpus\r\n' + art_statistics.get_top_words(20))
    article = Wikipedia().article('Natural language processing')
    a_statistics = TextStatistics(article)
    print('top-5 3-grams for the article itself\r\n' + a_statistics.get_top_3grams(5))
    print('top-5 words for the article itself\r\n' + a_statistics.get_top_words(5))
def wiki(titulo):
    article = Wikipedia(language="en").search(query=titulo)
    result = []
    if article:
        for section in article.sections:
            result.append(section.title + "\n" + section.string + "\n")
            for link in section.links:
                result.append(link + "\n")
    return result
def run(o):
    # http://www.clips.ua.ac.be/pages/pattern-web#mail
    # should be able to do some cool stuff with the pattern libs
    import os, sys  # sys.path.insert(0, os.path.join("..", ".."))
    from pattern.web import Wikipedia

    # This example retrieves an article from Wikipedia (http://en.wikipedia.org).
    # Wikipedia queries request the article HTML source from the server. This can be slow.
    # It is a good idea to cache results from Wikipedia locally,
    # and to set a high timeout when calling Wikipedia.search().
    engine = Wikipedia(language="en")

    # Contrary to the other search engines in the pattern.web module,
    # Wikipedia simply returns one WikipediaArticle object (or None),
    # instead of a list of results.
    article = engine.search("alice in wonderland", cached=True, timeout=30)

    print article.title  # Article title (may differ from the search query).
    print
    print article.languages["fr"]  # Article in French, can be retrieved with Wikipedia(language="fr").
    print article.links[:10], "..."  # List of linked Wikipedia articles.
    print article.external[:5], "..."  # List of external URL's.
    print
    #print article.source  # The full article content as HTML.
    #print article.string  # The full article content, plain text with HTML tags stripped.

    # An article is made up of different sections with a title.
    # WikipediaArticle.sections is a list of WikipediaSection objects.
    # Each section has a title + content and can have a linked parent section or child sections.
    for s in article.sections:
        print s.title.upper()
        print
        print s.content  # = ArticleSection.string, minus the title.
        print
def get_articles(self, search_term):
    text_list = []
    article = Wikipedia().search(search_term)
    links = article.links
    s = self.__get_text(search_term)
    text_list.append(s)
    for link in links:
        text_list.append(self.__get_text(link))
        print(link)
    return text_list
def search(self, query, language='es'):
    '''
    query: string
    language: 'en' or 'es'
    '''
    wikipedia = Wikipedia(language=language)
    google_result_list = self.google.simple_search(query + ' ' + 'wikipedia')
    wikipedia_results = []
    for result in google_result_list:
        try:
            if self.url_pattern in result['link']:
                article = {}
                title = result['title'].split(' - ')[0]
                print title
                art = wikipedia.search(title)
                print art
                article['title'] = art.title
                article['text'] = art.string
                article['related'] = art.links
                wikipedia_results.append(article)
        except:
            pass
    return wikipedia_results
def search_wiki(self, k):
    # scrape query's wikipedia article
    article = Wikipedia().search(k)
    contents = [section.content.encode("utf8") for section in article.sections]
    d = []
    for content in contents:
        a = content.split()
        d.append(a)
    content = [j for i in d for j in i
               if re.match("^[a-zA-Z_-]*$", j) and len(j) > 1]  # take only meaningful content
    self.content = ' '.join(content)
    return self.content
def get_info(search_query):
    if isinstance(search_query, str):
        search_query = str(search_query)
    else:
        return {"Error": "Pass a string, from mine.py [7]", "Result": [None]}

    result = []

    engineGoogle = Google(license=None, throttle=0.5, language=None)
    engineBing = Bing(license=None, throttle=0.5, language=None)
    engineTwitter = Twitter(license=None, throttle=0.5, language=None)
    engineFacebook = Facebook(license=None, throttle=1.0, language='en')
    engineWikipedia = Wikipedia(license=None, throttle=5.0, language=None)
    engineFlickr = Flickr(license=None, throttle=5.0, language=None)

    engineArray = [engineGoogle, engineBing, engineTwitter, engineFacebook, engineWikipedia, engineFlickr]
    engineArray = [engineGoogle, engineTwitter]

    '''
    for i in range(1, 2):
        # result = result + ([repr(plaintext(para.text)) for para in engine[0].search(search_query, type=SEARCH, start=i, count=5)])
        [result.append([result.append(repr(plaintext(para.text))) for para in engine.search(search_query, type=SEARCH, start=i, count=5)]) for engine in engineArray]
        # print repr(plaintext(para.text))
        # print repr(plaintext(para.url)) + '\n\n'
        # result.append(repr(plaintext(para.text)))
    '''

    # Google
    for i in range(1, 5):
        result = result + ([para.text for para in engineGoogle.search(search_query, type=SEARCH, start=i, count=10)])
    # Twitter
    for i in range(1, 5):
        result = result + ([para.text for para in engineTwitter.search(search_query, type=SEARCH, start=i, count=10)])

    '''
    # for i in range(1, 2):
    #     result = result + ([repr(plaintext(para.text)) for para in engineBing.search(search_query, type=SEARCH, start=i, count=5)])
    for i in range(1, 2):
        result = result + ([repr(plaintext(para.text)) for para in engineTwitter.search(search_query, type=SEARCH, start=i, count=10)])
    # for i in range(1, 2):
    #     result = result + ([repr(plaintext(para.text)) for para in engineFacebook.search(search_query, type=SEARCH, start=i, count=5)])
    # for i in range(1, 2):
    #     result = result + ([repr(plaintext(para.text)) for para in engineWikipedia.search(search_query, type=SEARCH, start=i, count=5)])
    # for i in range(1, 2):
    #     result = result + ([repr(plaintext(para.text)) for para in engineFlickr.search(search_query, type=SEARCH, start=i, count=5)])
    '''

    return {"Error": None, "Result": result}
    # return { "Error": None, "Result": ['Hello World', 'Bye Bye Tommy'] }
def __get_text(self, search_term):
    article = Wikipedia().search(search_term)
    if article:
        s = article.string
        if s:
            while '\n' in s:
                s = s.replace('\n', ' ')
            while '  ' in s:          # collapse repeated spaces
                s = s.replace('  ', ' ')
            s = re.sub(r'[^\w\s]', '', s)
            s = s.lower()
            return s
        else:
            return None
    else:
        return None
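A standalone sketch of the normalization steps applied in __get_text, run on a made-up string rather than a downloaded article.

# Made-up input; mirrors the cleanup steps of __get_text above.
import re
s = "Alice's Adventures in Wonderland\n(1865 novel), by  Lewis Carroll!"
while '\n' in s:
    s = s.replace('\n', ' ')
while '  ' in s:
    s = s.replace('  ', ' ')
s = re.sub(r'[^\w\s]', '', s)
print(s.lower())  # alices adventures in wonderland 1865 novel by lewis carroll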
def __init__(self):
    from pattern.web import Wikipedia, plaintext
    import string
    import re
    self.parser = WikiParser()
    self.articles = self.parser.get_articles('Natural_language_processing')
    self.stats = TextStatistics(self.articles)
    self.grams = self.stats.get_top_3grams()
    self.words = self.stats.get_top_words()
    self.top_grams_out = [' - '.join([self.grams[0][i], str(self.grams[1][i])]) for i in range(20)]
    self.top_grams = self.grams[0][:20]
    self.top_words_out = [' - '.join([self.words[0][i], str(self.words[1][i])]) for i in range(20)]
    self.top_words = self.words[0][:20]
    nlp = Wikipedia().article('Natural_language_processing')
    nlptext = plaintext(nlp.source)
    parse = WikiParser()
    nlptext = parse.normalize(nlptext)
    self.nlp_stats = TextStatistics([nlptext])
    self.nlp_top_grams = [' - '.join([self.nlp_stats.get_top_3grams()[0][i],
                                      str(self.nlp_stats.get_top_3grams()[1][i])]) for i in range(20)][:5]
    self.nlp_top_words = [' - '.join([self.nlp_stats.get_top_words()[0][i],
                                      str(self.nlp_stats.get_top_words()[1][i])]) for i in range(20)][:5]
def get_wiki_article(self, search, separate_in_section=False, type_of_text=u'Plain text'):
    segments = list()
    article = Wikipedia(language=self.dico_lang[self.language]).search(search, cached=False)
    if article:
        if separate_in_section:
            for section in article.sections:
                if type_of_text == u'Plain text':
                    wiki_article = Input(section.string)
                else:
                    wiki_article = Input(section.html)
                annotations = {
                    'source': 'Wikipedia',
                    'section title': section.title,
                    'section level': section.level,
                    'search': search,
                }
                segment = wiki_article[0]
                segment.annotations.update(annotations)
                wiki_article[0] = segment
                segments.append(wiki_article)
        else:
            if type_of_text == u'Plain text':
                wiki_article = Input(article.string)
            else:
                wiki_article = Input(article.html)
            annotations = {
                'source': 'Wikipedia',
                'search': search,
            }
            segment = wiki_article[0]
            segment.annotations.update(annotations)
            wiki_article[0] = segment
            segments.append(wiki_article)
    return segments
def Avg(s):
    '''
    The Averaging Function

    Takes in a string, searches for the Wiki article and then returns
    the average length of words. No there is no error handling.
    '''
    length = 0
    numb = 0
    # if s in db: return float(db[s])
    article = Wikipedia().search(s)
    for i in xrange(len(article.sections)):
        if article.sections[i].title == "See also":
            break
        text = article.sections[i].content.replace('\n', ' ')
        for word in text.split(' '):
            length += len(word)
            numb += 1
    db[s] = str(float(length) / float(numb))
    return float(length) / float(numb)
        self.writerow(row)

output = open("Lua_scraping_output.csv", "wb")
writer = UnicodeWriter(output)
writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])

url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'Lua_(programming_language)&offset=&limit=500' + '&action=history'
url = URL(url_string)
dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)
article = engine.search('Lua Programming Language')

a = 0
while (len(dom.by_class("mw-nextlink")) > 0):
    page_history_links = dom.by_tag("ul")[0].by_tag("li")
    for link in page_history_links:
        date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii', 'ignore')
        ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii', 'ignore')
        bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii', 'ignore')
        ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
        req = urllib2.urlopen(ip_url)
        req_request = urllib2.Request(ip_url)
        #handler = urllib2.urlopen(req)
import os, sys; sys.path.insert(0, os.path.join("..", ".."))
from pattern.web import Wikipedia

# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
# A query requests the article's HTML source from the server, which can be quite slow.
# It is a good idea to cache results from Wikipedia locally,
# and to set a high timeout when calling Wikipedia.search().
engine = Wikipedia(language="en")

# Contrary to other search engines in the module,
# Wikipedia simply returns one WikipediaArticle object (or None) instead of a list of results.
article = engine.search("alice in wonderland", cached=True, timeout=30)

print article.title  # Article title (may differ from the search query).
print
print article.languages["fr"]  # Article in French, can be retrieved with Wikipedia(language="fr").
print article.links[:10], "..."  # List of linked Wikipedia articles.
print article.external[:5], "..."  # List of external URL's.
print
#print article.source  # The full article content as HTML.
#print article.string  # The full article content, plain text with HTML tags stripped.

# An article is made up of different sections with a title.
# WikipediaArticle.sections is a list of WikipediaSection objects.
# Each section has a title + content and can have a linked parent section or child sections.
for s in article.sections:
    print s.title.upper()
    print
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pattern.web import Wikipedia
import webapp2
import logging
import jinja2
from convert2html import plaintext2html
from urllib2 import quote
from backend import plot, title_from_imdb, more_movie_info

jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader("template"), autoescape=True)

import os, sys; sys.path.insert(0, os.path.join("..", ".."))

engine = Wikipedia(language="en")


class Handler(webapp2.RequestHandler):
    def write(self, *a, **kw):
        self.response.out.write(*a, **kw)

    def render_str(self, template, **params):
        t = jinja_env.get_template(template)
        return t.render(params)

    def render(self, template, **kw):
        self.write(self.render_str(template, **kw))


class MainHandler(Handler):
    def get(self):
        self.render("index.html")

    def post(self):
        movie_name = self.request.get('movie_name')
        self.writerow(row)

output = open("better_ruby_scraping_output.csv", "wb")
writer = UnicodeWriter(output)
writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])

url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'Ruby_(programming_language)&offset=&limit=500' + '&action=history'
url = URL(url_string)
dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)
article = engine.search('ruby programming language')

a = 0
while (len(dom.by_class("mw-nextlink")) > 0):
    page_history_links = dom.by_tag("ul")[0].by_tag("li")
    for link in page_history_links:
        date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii', 'ignore')
        ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii', 'ignore')
        bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii', 'ignore')
        ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
        req = urllib2.urlopen(ip_url)
        req_request = urllib2.Request(ip_url)
        #handler = urllib2.urlopen(req)
""" Retrieves an article from Wikipedia (http://en.wikipedia.org). """ from pattern.web import Wikipedia ENGINE = Wikipedia(language="en") def main(search_query): """ Returns Twitter Search Results :param search_query: (str) """ final = "Wikipedia Search Results:" article = ENGINE.search(search_query, cached=True, timeout=30) print article.title # Article title (may differ from the search query). print "" print article.languages[ "tr"] # Article in Turkish, can be retrieved with Wikipedia(language="tr"). print article.links[:10] # List of linked Wikipedia articles. print article.external[:5] # List of external URL's. print "" for s in article.sections: final = final + "\n\n" + s.title.upper() + "\n" + s.content return final if __name__ == '__main__': main("")
def descarga(titulo):
    engine = Wikipedia(language="en")
    result = engine.search(titulo, type=SEARCH)
    return repr(plaintext(result.string))
        self.writerow(row)

output = open("new_scraping_output.csv", "wb")
writer = UnicodeWriter(output)
writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])

url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'Python_(programming_language)&offset=&limit=500' + '&action=history'
url = URL(url_string)
dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)
article = engine.search('python programming language')

a = 0
while (len(dom.by_class("mw-nextlink")) > 0):
    page_history_links = dom.by_tag("ul")[0].by_tag("li")
    for link in page_history_links:
        date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii', 'ignore')
        ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii', 'ignore')
        bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii', 'ignore')
        ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
        req = urllib2.urlopen(ip_url)
        req_request = urllib2.Request(ip_url)
        #handler = urllib2.urlopen(req)
        self.writerow(row)

output = open("ocaml_scraping_output.csv", "wb")
writer = UnicodeWriter(output)
writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])

url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'OCaml&offset=&limit=500' + '&action=history'
url = URL(url_string)
dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)
article = engine.search('Ocaml')

a = 0
while (len(dom.by_class("mw-nextlink")) > 0):
    page_history_links = dom.by_tag("ul")[0].by_tag("li")
    for link in page_history_links:
        date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii', 'ignore')
        ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii', 'ignore')
        bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii', 'ignore')
        ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
        req = urllib2.urlopen(ip_url)
        req_request = urllib2.Request(ip_url)
        #handler = urllib2.urlopen(req)
# Word frequency.
from pattern.web import Wikipedia
from pattern.vector import words

frequency = {}

# Spreading activation.
# Parse links from seed article & visit those articles.
links, seen = set(["Italia"]), {}
while len(links) > 0:
    try:
        article = Wikipedia(language="it").search(links.pop(), throttle=10)
        seen[article.title] = True
        # Parse links from article.
        for link in article.links:
            if link not in seen:
                links.add(link)
        # Parse words from article. Count words.
        for word in words(article.string):
            if word not in frequency:
                frequency[word] = 0
            frequency[word] += 1
    except:
        pass
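A small follow-up sketch (not in the original script) for inspecting the counter once the crawl is stopped: sort the frequency dict and list the most common words.

# Print the 25 most frequent words collected so far.
top = sorted(frequency.items(), key=lambda kv: kv[1], reverse=True)[:25]
for word, count in top:
    print("%s %d" % (word, count))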
from __future__ import print_function

import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wikipedia

# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
# Wikipedia queries request the article HTML source from the server. This can be slow.
# It is a good idea to cache results from Wikipedia locally,
# and to set a high timeout when calling Wikipedia.search().
engine = Wikipedia(language="en")

# Contrary to the other search engines in the pattern.web module,
# Wikipedia simply returns one WikipediaArticle object (or None),
# instead of a list of results.
article = engine.search("alice in wonderland", cached=True, timeout=30)

# Article title (may differ from the search query).
print(article.title)
print()

# Article in French, can be retrieved with Wikipedia(language="fr").
print(article.languages["fr"])

print(article.links[:10], "...")  # List of linked Wikipedia articles.
print(article.external[:5], "...")  # List of external URL's.
print()

# print article.source  # The full article content as HTML.
# print article.string  # The full article content, plain text with HTML tags stripped.
def main():
    article = Wikipedia(language="en").search('small', throttle=10)
    print article.string
def isa(entity, type=PERSON):
    """ Returns True if the given entity is of the given type.
    """
    # - Wikipedia.search() returns a WikipediaArticle:
    #   http://www.clips.ua.ac.be/pages/pattern-web#wikipedia
    # - The article comes with the HTML source code.
    # - The article comes with a plaintext version (no HTML tags).
    # - We can count how many times a word occurs in the plain text
    #   (e.g., articles about cities don't often use "he" or "she").
    # - We can search the HTML parse tree with CSS selectors
    #   (e.g., the HTML often has a <div class="infobox"> with interesting metadata).
    try:
        w = Wikipedia(language="en")
        p = w.search(entity, cached=True)
        t = DOM(p.src)  # HTML parse tree
        s = p.plaintext()
        s = s.lower()
        s = s.replace(".", " ")
        s = s.replace(",", " ")
        s = s.replace("'", " ")
        s = s.replace('"', " ")
        n = s.count(" ") * 1.0 or 0.0  # approximate word count
    except:
        pass
    # A person is an entity with a biography, a birthdate, and
    # a life's description containing gender-specific pronouns
    # (e.g., Noam Chomsky, Arnold Schwarzenegger).
    if type == PERSON:
        if t(".infobox.biography"):
            return True
        if t(".infobox th:contains('born')"):
            return True
        if any("early life" in x.title.lower() for x in p.sections):
            return True
        if sum(s.count(" %s " % w) for w in ("he", "his")) / n > 0.01:  # 1% he
            return True
        if sum(s.count(" %s " % w) for w in ("she", "her")) / n > 0.01:  # 1% she
            return True
    # A place is an entity with a geography and/or a population
    # (e.g., New York, Jupiter, Middle Earth).
    if type == PLACE:
        if t(".infobox.geography"):
            return True
        if t(".infobox th:contains('coordinates')"):
            return True
        if t(".infobox th:contains('location')"):
            return True
        if t(".infobox th:contains('population')"):
            return True
        if t(".infobox th:contains('orbital period')"):
            return True
        if t("h2:contains('climate')"):
            return True
        if t("h2:contains('architecture')"):
            return True
        if any("geography" in x.title.lower() for x in p.sections):
            return True
        if any("flora" in x.title.lower() for x in p.sections):
            return True
        if any("fauna" in x.title.lower() for x in p.sections):
            return True
        if sum(s.count(" %s " % w) for w in (
                "city", "country", "house", "land", "location", "place",
                "room", "rooms", "space", "setting", "town")) / n > 0.01:
            return True
    return False
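Hypothetical calls to isa(), reusing the entities already named in the comments above; actual results depend on the live Wikipedia pages.

# Expected (not guaranteed) outcomes for the examples mentioned in the comments.
print(isa("Arnold Schwarzenegger", type=PERSON))  # expected True
print(isa("New York", type=PLACE))                # expected True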
def recherche_wikipedia(recherche, language):
    """
    Main function of this Python 2.7 script: it queries Wikipedia to retrieve
    the information on a Wikipedia page. The function only formats the
    information for "fr" and "en" searches, more precisely the "fr" sections
    Fiche technique, Distribution, Voix françaises / Voix originales, and the
    "en" Cast section. It can of course be extended to retrieve more information.
    """
    datas = yaml.load(open("/home/krolev/Documents/Projet_media/BIBLIOTHEQUES/formats_acceptes.yaml"))
    engine = Wikipedia(language=language)
    searching = engine.search(recherche)
    Sections = searching.sections
    metadata = {}

    def fonction(part=True, sepa=":"):
        """
        Helper local to recherche_wikipedia that reshapes the retrieved text
        so it can be formatted before being passed to yaml.load().
        """
        temp = [x.strip() for x in section.content.replace('* ', '').split('\n') if x != u""]
        liste = []
        for element in temp:
            element = element.encode('utf-8')
            if part:
                (cle, sep, attr) = element.partition(sepa)
            else:
                (cle, sep, attr) = element.rpartition(sepa)
            attr = attr.strip()
            cle = cle.strip()
            if "'" in cle:
                attr = attr.replace("'", "''")
            if "'" in attr:
                attr = attr.replace("'", "''")
            if ":" in cle:
                cle = cle.replace(':', '--')
            if ":" in attr:
                attr = attr.replace(":", "--")
            element = " ".join([x for x in [cle + sep, attr] if x != '""'])
            if element_inString(element, datas["countries"]):
                element = " " + element
            elif (not ":" in element):
                element = " " + element
            liste.append(element)
        return liste

    if language == "fr":
        for section in Sections:
            if section.title == u"Fiche technique":
                metadata.update({"Fiche_technique": yaml.load("\n".join(fonction()[1:-1]))})
            elif section.title == u"Distribution":
                temp = fonction()
                if len(temp) != 1:
                    metadata.update({"Distribution": yaml.load("\n".join(fonction(part=False)[1:-1]))})
            elif section.title == u"Voix françaises":
                metadata.update({u"Voix françaises": yaml.load('\n'.join(fonction()[1:-1]))})
            elif section.title == u"Voix originales":
                metadata.update({"Voix originales": yaml.load('\n'.join(fonction()[1:-1]))})
    if language == "en":
        for section in Sections:
            if section.title == 'Cast':
                liste = []
                for element in fonction(sepa="as")[1:-1]:
                    (cle, sep, val) = element.partition('as')
                    element = cle + ":" + val
                    liste.append(element)
                metadata.update({"Casting": yaml.load('\n'.join(liste))})
    # return metadata
    return yaml.dump(metadata, default_flow_style=False, allow_unicode=True)