def main(path_topN_pkl):
    """Dump every page titled in a top-N pickle twice: once as HTML, once as wikitext.

    ``path_topN_pkl`` points at a pickle of rows whose element [1] is a page
    title; titles are stripped of surrounding whitespace before lookup.
    """
    wiki_html = Wikipedia('fr', extract_format=ExtractFormat.HTML)
    wiki_page = Wikipedia('fr', extract_format=ExtractFormat.WIKI)
    # BUG FIX: the original opened the pickle via Path(...).open('rb') inline
    # and never closed the handle; use a context manager.
    with Path(path_topN_pkl).open('rb') as fh:
        sources = pkl.load(fh)
    sources = [s[1].strip() for s in sources]
    for s in sources:
        dump_page(s, target_folder='data/10khtml', wiki_obj=wiki_html)
        dump_page(s, target_folder='data/10kpages', wiki_obj=wiki_page)
def scrape_article_texts(self, titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                         see_also=True, max_articles=10000, max_depth=1,
                         heading_text=True, title_text=True):
    """ Download text for an article and parse into sections and sentences

    Generator: walks `titles` (and, at deeper depths, their "See also" links)
    and yields one text string per article, up to `max_articles` articles and
    `max_depth` link-following hops. Sections whose (lower-cased) heading is
    in `exclude_headings` are dropped.

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> texts = scrape_article_texts(['ELIZA'], see_also=False)
    >>> texts = list(texts)
    >>> len(texts)
    1
    >>> texts = list(scrape_article_texts(['Chatbot', 'ELIZA'], max_articles=10, max_depth=3))
    >>> len(texts)
    10
    """
    # Accept a bare string for backward compatibility, but log it as deprecated.
    if isinstance(titles, str):
        log.error(f'DEPRECATED `titles` should be a list of strs, not titles="{titles}"')
        titles = find_titles(titles)
    exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
    # depth starts at zero here, but as additional titles are appended the depth will increase
    title_depths = list(zip(titles, [0] * len(titles)))
    text_lens = []
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])  # seeded with '' so the skip-loop below terminates
    d, num_articles = 0, 0
    wiki = Wikipedia()
    # TODO: should be able to use depth rather than d:
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(title_depths) > 0:
            title = ''
            # skip titles already scraped
            while len(title_depths) and len(titles_scraped) and (not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.info(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            log.info(f'len(title_depths): {len(title_depths)}')
            # Serve from the instance cache when possible.
            # NOTE(review): on a cache hit this yields the cached text but still
            # falls through and re-fetches the article below — confirm whether a
            # `continue` is missing here.
            text = self.cache.get(title, None)
            if text:
                yield text
            page = wiki.article(title)
            if not (len(getattr(page, 'text', '')) + len(getattr(page, 'summary', ''))):
                log.warning(f"Unable to retrieve _{title}_ "
                            "because article text and summary len are 0.")
            # NOTE(review): the function body appears truncated at this point in
            # this chunk — the sleep/retry and yield logic that should follow
            # (cf. the sibling implementations in this file) is not visible.
def get_article(
        self,
        title: str,
        exclude_headings=EXCLUDE_HEADINGS,
        see_also=True,
        prepend_section_headings=True,
        prepend_title_text=True,
):
    """ same as scrape_article_texts but for single article, and checks cache first

    Returns a dict(title=..., text=..., summary=..., see_also_links=[...]);
    an empty dict {} is returned (and cached) when the page does not exist.
    A cached hit is returned as a shallow copy so callers can't mutate the cache.
    """
    page_dict = self.cache.get(title)
    # Only trust cache entries that actually carry text and a summary.
    if page_dict and page_dict.get('text') and page_dict.get('summary'):
        return copy.copy(page_dict)
    self.wiki = Wikipedia()
    page = self.wiki.article(title)
    text, summary, see_also_links = '', '', []
    if page.exists():
        text = getattr(page, 'text', '')
        summary = getattr(page, 'summary', '')
    else:
        # Back off briefly on missing pages, then negative-cache them.
        time.sleep(self.sleep_nonexistent_page)
        self.cache[title] = {}
        return {}
    # FIXME: this postprocessing of Article objects to compose a text string should be in a separate function
    # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
    if see_also:
        # .full_text() includes the section heading ("See also"). .text does not
        section = page.section_by_title('See also')
        if section:
            # Only collect entries that are real links on the page.
            for t in section.text.split('\n'):
                log.info(f"    Checking _SEE ALSO_ link: {t}")
                if t in page.links:
                    see_also_links.append(t)
    # Rebuild the article text from its sections (optionally headed by the title).
    text = f'{page.title}\n\n' if prepend_title_text else ''
    # page.text
    for section in page.sections:
        if section.title.lower().strip() in exclude_headings:
            continue
        # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
        text += f'\n{section.title}\n' if prepend_section_headings else '\n'
        # spacy doesn't handle "latin" (extended ascii) apostrophes well.
        text += section.text.replace('’', "'") + '\n'
        # Remember a normalized form of every section title seen.
        self.section_titles[str(section.title).strip()] = str(
            section.title).lower().strip().replace('’', "'")
    page_dict = dict(title=page.title, text=text, summary=summary, see_also_links=see_also_links)
    self.cache[title] = page_dict
    return page_dict
def main(self):
    """Handle the Wikipedia voice skill: isolate the article name from the
    speech input, fetch its summary, and speak it (or a not-found message)."""
    from aion_core.utils import remove_brackets, remove_space, remove_string_sequence
    from wikipediaapi import Wikipedia

    # The activate phrase brackets the article name; strip its head and tail
    # from the raw speech input to isolate the query.
    phrase_parts = self.activate_phrase.split("__and__")
    query = remove_string_sequence(self.speech_input, phrase_parts[0], phrase_parts[-1])

    language_code = aconf.Aion().get_language().split("_")[0]
    article = Wikipedia(language_code).page(query)

    if not article.exists():
        self.speech_output(alang.start("skills", "wikipedia_article_not_found",
                                       {"article_name": query}))
        return

    summary_text = remove_space(remove_brackets(article.summary))
    self.speech_output(alang.start("skills", "wikipedia", {"article_text": summary_text}))
class Wikipedia:
    """Thin wrapper around a Russian-language wikipedia-api client that
    flattens an article into a nested {section title: text} dict."""

    def __init__(self, title):
        # Backing wikipedia-api client and the article title to wrap.
        self.wiki = Wiki('ru')
        self.title = title

    def page(self):
        """Return the backing page object; for a missing page, fall back to
        `self` with an empty `sections` attribute so callers see no sections."""
        page = self.wiki.page(self.title)
        if not page.exists():
            page = self
            setattr(page, 'sections', [])
        return page

    def summary(self):
        """Return {'Общая информация': summary} when the page has sections,
        otherwise None (implicitly)."""
        page = self.page()
        if page.sections != []:
            return {'Общая информация': page.summary}

    def parse_sections(self, sections, summary=None):
        """Recursively flatten `sections` into {title: text-or-subdict},
        optionally seeded with the `summary` mapping."""
        info = {}
        if summary is not None:
            info.update(summary)
        for section in sections:
            # BUG FIX: was `section.text is ''` — identity comparison with a
            # string literal (a SyntaxWarning and implementation-dependent).
            if section.text == '':
                # Empty body: recurse into the subsections instead.
                value = self.parse_sections(section.sections)
            else:
                value = section.text
            info[section.title] = value
        return info

    def sections(self):
        """Flatten the whole article (summary + all sections) into a dict."""
        return self.parse_sections(self.page().sections, self.summary())
def __init__(self, team_names='', team_colors='', team1_players='', team2_players='',
             options='', header='', question='', twTags=''):
    """Capture poll-post configuration.

    Silently aborts (leaving the instance unconfigured) when any required
    argument was left at its '' default.
    """
    required = (team_names, team_colors, team1_players, team2_players,
                options, header, question)
    if any(value == '' for value in required):
        return
    self.listitems = list(options)
    self.header = str(header)
    self.question = str(question)
    self.wiki = Wikipedia('en')
    self.team_names = list(team_names)
    self.team_colors = list(team_colors)
    self.team1_players = list(team1_players)
    self.team2_players = list(team2_players)
    self.twTags = list(twTags)
def main(args):
    """For each corpus file matching ``args.filepattern``: read page titles
    (JSON lines), look each up on Indonesian Wikipedia, follow its English
    interlanguage link, and write the extracted English text to a file in
    ``args.output_dir`` (or stdout when no valid output dir is given).
    """
    files = glob.glob(args.filepattern)
    # Indonesian wiki resolves titles; English wiki supplies the actual text.
    id_wiki = Wikipedia(language='id')
    en_wiki = Wikipedia(language='en', extract_format=args.format)
    for corpus in files:
        print(corpus)
        if os.path.isfile(corpus):
            _, fname = os.path.split(corpus)
            if args.output_dir and os.path.isdir(args.output_dir):
                output_file = os.path.join(args.output_dir, fname)
                mode = 'w+'
                print(output_file)
                # Append instead of truncating when requested and the file exists.
                if os.path.exists(output_file) and args.duplicate_append:
                    print('file exists')
                    mode = 'a'
                fileout = codecs.open(output_file, mode=mode, encoding=args.encoding)
            else:
                fileout = sys.stdout
            # NOTE(review): neither `fileout` nor `data` is ever closed;
            # consider context managers (guarding the stdout case).
            data = codecs.open(corpus, mode='r', encoding=args.encoding)
            for title in get_jsonlpage(data):
                page = id_wiki.page(title)
                print(title)
                try:
                    # print(page.langlinks)
                    if 'en' in page.langlinks:
                        en_title = page.langlinks['en'].title
                        en_page = en_wiki.page(en_title)
                        print(en_title)
                        # print(en_page.text)
                        en_text = print_page(en_page, args.format)
                        print(en_text, file=fileout)
                except Exception:
                    # Best-effort: skip any title whose lookup/extraction fails.
                    continue
def birthday_of_rich(id=1):
    '''Parsing Billionaire Birthdays

    Walks all ``Rich`` rows with primary key >= ``id``, scrapes each person's
    Wikipedia page for the hCard ``bday`` span, and saves the birthday plus the
    derived zodiac sign back onto the row. Names known to lack a Wikipedia
    birthday get the sentinel date 0001-01-01.
    '''
    wiki = Wikipedia()
    # NOTE(review): the loop variable shadows both the `id` parameter and the
    # builtin `id()` — each iteration rebinds it to a Rich row.
    for id in Rich.select().where(
            Rich.id >= id):
        # No data on the wikipedia site
        # print(id, id.name)
        no_bday = [
            'Qin Yinglin', 'Colin Zheng Huang', 'Zhong Huijuan', 'Walter P.J. Droege',
            'Li Xiting', 'Yang Huiyan', 'Joseph Safra', 'Lukas Walton',
            'Theo Albrecht, Jr.', 'Zhang Yiming', 'Lee Man Tat', 'Wang Wei',
            'Radhakishan Damani', 'Liu Yonghao', 'Wu Yajun', 'Sun Piaoyang',
            'Pang Kang', 'Udo Tschira', 'Xu Hang', 'Pallonji Mistry', 'Zhang Yong',
            'Robert Ng', 'Iris Fontbona', 'Donald Newhouse', 'Graeme Hart',
            'Goh Cheng Liang', 'Hank Meijer', 'Robin Zeng', 'Andreas Struengmann',
            'Thomas Struengmann', 'Hui Wing Mau', 'Quek Leng Chan', 'Sun Hongbin',
            'Zhang Bangxin', 'Lu Zhongfang', 'Cyrus Poonawalla', 'Scott Farquhar',
            'Gong Hongjia', 'Eric Wittouck', 'Xu Shihui', 'Wang Wenyin', 'Zhang Fan',
            'Chen Bang', 'Jiang Rensheng', 'Blair Parry-Okeden', 'David Duffield',
            'Eyal Ofer', 'John Grayken'
        ]
        if id.name in no_bday:
            # Sentinel 0001-01-01 marks "birthday unknown".
            id.bday = datetime.datetime(1, 1, 1)
            id.save()
            continue
        page_py = wiki.page(id.name)
        link = page_py.fullurl
        response = get_response(link)
        html_doc = BeautifulSoup(response.text, features='html.parser')
        # The hCard microformat marks the birthday with class="bday".
        date = html_doc.find('span', {'class': 'bday'})
        if date is None:
            # Fall back to a manual lookup when the page carries no hCard bday.
            bday = fix_for_data(id.name)
        else:
            bday = datetime.datetime.strptime(date.text, '%Y-%m-%d')
        zodiac = find_zodiac(bday)
        id.bday = bday.date()
        id.zodiac = zodiac
        id.save()
def dump_page(source: str, target_folder: Union[Path, str] = "pages",
              wiki_obj: Wikipedia = None, lang: str = 'fr'):
    """Fetch the Wikipedia page titled `source` and pickle it into `target_folder`.

    A client is built for `lang` when `wiki_obj` is not supplied. Redirects are
    followed to the canonical title; '/' in titles is escaped as '__SLASH__'.
    Missing pages are reported on stdout and skipped.
    """
    if not wiki_obj:
        wiki_obj = Wikipedia(lang)
    target_folder = Path(target_folder)
    # FIX: use Path.mkdir with exist_ok instead of an exists()/makedirs pair,
    # which raced when two dumps created the folder concurrently.
    target_folder.mkdir(parents=True, exist_ok=True)
    wikipage = wiki_obj.page(source)
    if not wikipage.exists():
        print(f"page {source} does not exist")
        return
    page_info = wiki_obj.info(wikipage)
    if page_info.title != wikipage.title:
        # Redirect: re-fetch under the canonical title.
        wikipage = wiki_obj.page(page_info.title)
    wiki_title = wikipage.title.replace(' ', '_')
    target_file = target_folder / (wiki_title.replace("/", "__SLASH__") + ".pkl")
    # FIX: close the output handle (the original leaked it).
    with target_file.open('wb') as fh:
        pkl.dump(wikipage, fh)
def get_company_details(company):
    """
    Params:
        - company (str)
    Returns:
        - company_description (str)
    """
    wiki_client = Wikipedia('en')
    try:
        # Prefer the disambiguated "(company)" article; fall back to the bare name.
        page = wiki_client.page(company + " (company)")
        if not page.exists():
            page = wiki_client.page(company)
    except Exception as err:
        printColoured(err, colour="red")
        raise InvalidUserInput(
            description="Connection timed out. Please try again later")
    # The first paragraph of the article text serves as the description.
    first_paragraph = page.text.split("\n")[0]
    return first_paragraph
def process_page(
    nlp: spacy.Language,
    category: Text,
    wiki: wikipediaapi.Wikipedia,
    page: Text,
) -> bool:
    """Fetches a single page and creates index files.

    Writes pages/<page>.sentences.json (sentence-split article), builds the
    corresponding .index.json via a node script, and uploads both to the
    firebase bucket. Returns True when new files were produced, False when the
    sentences file already existed or the page is a disambiguation stub.
    """
    filename = os.path.join("pages", f"{page}.sentences.json")
    output_filename = filename.replace(".sentences.", ".index.")
    if not os.path.exists(filename):
        article = wiki.page(page)
        # Wrap the summary as a pseudo-section so it flows through the same
        # extraction path as the real sections.
        summary = wikipediaapi.WikipediaPageSection(wiki=wiki, title='Summary',
                                                    text=article.summary)
        sections = [summary] + article.sections
        # NOTE(review): `id` shadows the builtin here.
        sentences = [
            dict(id=id, **sentence)
            for id, sentence in enumerate(extract_sections(nlp, sections))
        ]
        # Disambiguation/redirect-style stubs (first line matches kREFER) are skipped.
        if any(refer in sentences[0]["line"].lower() for refer in kREFER):
            return False
        with open(filename, 'w') as outfile:
            json.dump(
                {
                    "category": category,
                    "title": page,
                    "sentences": sentences
                }, outfile, indent=2)
        if not os.path.exists(output_filename):
            # Delegate index construction to the node helper script.
            command = f'node util/single_index.js "(unknown)" "{output_filename}"'
            subprocess.call(command, shell=True)
        bucket = fibs_firebase_config.get_bucket()
        blob = bucket.blob(f"pages/{page}.json")
        if not blob.exists():
            # Upload both the sentences file and the derived index.
            blob.upload_from_filename(filename, content_type='application/json')
            bucket.blob(f"indices/{page}.json").upload_from_filename(
                output_filename, content_type='application/json')
        return True
    return False
class Wiki(commands.Cog):
    """Wikipedia lookup command for the bot."""

    def __init__(self, bot):
        self.bot = bot
        self.wiki = Wikipedia('en')

    @commands.command(name='wiki', aliases=['page'])
    async def page(self, ctx, *, search):
        """Reply with an embed summarizing the Wikipedia page for `search`."""
        result = self.wiki.page(search)
        if not result.exists():
            missing = Embed(
                title="Page not found ⛔",
                description=f"No page was found under the name `{search}`",
                color=Color.blurple())
            return await ctx.send(embed=missing)
        reply = Embed(color=Color.dark_gold())
        reply.title = result.title
        reply.url = result.fullurl
        # Truncate the article body so the embed stays within Discord limits.
        reply.description = f'{result.text[0:500]}...'
        reply.set_footer(
            text="Powered by Wikipedia",
            icon_url="https://i.ibb.co/jyX08CD/wikipedia-PNG39.png")
        reply.timestamp = ctx.message.created_at
        return await ctx.send(embed=reply)
class Post:
    """Builds and prints an ``http`` CLI command that publishes a two-team
    tactics poll, with teams and players linked to their Wikipedia pages."""

    def __init__(self, team_names='', team_colors='', team1_players='', team2_players='',
                 options='', header='', question='', twTags=''):
        """Capture post configuration; silently abort (leaving the instance
        unconfigured) when any required argument is missing."""
        if team_names == '' or team_colors == '' or team1_players == '' or team2_players == '':
            return
        if options == '' or header == '' or question == '':
            return
        self.listitems = list(options)
        self.header = str(header)
        self.question = str(question)
        self.wiki = Wikipedia('en')
        self.team_names = list(team_names)
        self.team_colors = list(team_colors)
        self.team1_players = list(team1_players)
        self.team2_players = list(team2_players)
        self.twTags = list(twTags)

    def getWikiUrl(self, player_name=''):
        """Return the canonical Wikipedia URL for `player_name`, or None when
        the name is empty or no page exists."""
        if player_name == '':
            return None
        page_py = self.wiki.page(player_name)
        if page_py.exists() is False:
            return None
        return page_py.fullurl

    @staticmethod
    def tag(name, *content, style=None, href=None, **attrs):
        """Render an HTML element.

        With `content`, returns one element per item (newline-joined); with no
        content, a self-closing tag. `style`/`href` are folded into `attrs`,
        which are emitted sorted by attribute name.
        """
        if style is not None:
            attrs['style'] = style
        if href is not None:
            attrs['href'] = href
        if attrs:
            attr_str = ''.join(' %s="%s"' % (attr, value)
                               for attr, value in sorted(attrs.items()))
        else:
            attr_str = ""
        if content:
            return '\n'.join('<%s%s>%s</%s>' % (name, attr_str, c, name)
                             for c in content)
        else:
            return '<%s%s />' % (name, attr_str)

    def formatApi(self):
        """Assemble and print the full ``http --form POST`` command for the poll.

        Aborts (printing the offending name) whenever a team or player has no
        Wikipedia page.
        """
        http_part = "http --auth : --form POST http://www.tactification.com/api_rt/v1.0/new_post "
        question_tag = self.tag('div', self.question, style='color:black')
        br1 = self.tag('br')
        li_items = str()
        for item in self.listitems:
            li_items += ''.join(self.tag('li', item))
        ul = self.tag('ul', li_items)
        div1 = self.tag('div', ul, style='color:black')
        starring_tag = self.tag('div', "Starring:", style='color:black')
        team1_url = self.getWikiUrl(self.team_names[0])
        if team1_url is None:
            print(self.team_names[0])
            return
        a_team1 = self.tag('a', self.team_names[0], href=team1_url) + ': '
        a_items = str()
        for item in self.team1_players:
            print(item)
            player_url = self.getWikiUrl(item[0])
            if player_url is None:
                print(item)
                return
            a_items += ''.join(self.tag('a', item[0], href=player_url) + '(' +
                               str(item[1]) + '),')
        # BUG FIX: str.rstrip returns a new string; the original discarded it,
        # leaving a trailing comma in the output.
        a_items = a_items.rstrip(',')
        i_team1 = self.tag('i', a_team1 + a_items,
                           style="color:" + str(self.team_colors[0]))
        team2_url = self.getWikiUrl(self.team_names[1])
        if team2_url is None:
            print(self.team_names[1])
            return
        a_team2 = self.tag('a', self.team_names[1], href=team2_url) + ': '
        a_items = str()
        for item in self.team2_players:
            player_url = self.getWikiUrl(item[0])
            if player_url is None:
                print(item)
                return
            a_items += ''.join(self.tag('a', item[0], href=player_url) + '(' +
                               str(item[1]) + '),')
        a_items = a_items.rstrip(',')  # same trailing-comma fix as above
        i_team2 = self.tag('i', a_team2 + a_items,
                           style="color:" + str(self.team_colors[1]))
        header = " header={!r} ".format(self.header)
        twTag = (" twTags='#{}, #{}, #{}' ".format(*self.twTags))
        end_part = "tactical_gif@home_img.jpg tactical_pic_1750@with_help_msg.jpg tactical_pic_1575@with_help_msg_75.jpg tactical_pic_875@with_help_msg_50.jpg"
        final_command = (http_part + "body='" + question_tag + br1 + div1 + starring_tag +
                         i_team1 + br1 + i_team2 + "'" + header + twTag + end_part)
        print(final_command)
def __init__(self, bot):
    # Discord bot instance this cog/handler is attached to.
    self.bot = bot
    # English-language Wikipedia API client used by the commands.
    self.wiki = Wikipedia('en')
def metadata(table, min_majority=.8):
    '''
    Returns a datamart schema, assigning types to each variable, if at least
    min_majority of the values are of that type.

    `table` must expose .url, .xpath, .context (dict) and .record (list of row
    dicts). Wikipedia page metadata (categories, summary, langlinks, sections)
    is fetched best-effort: any failure falls back to an empty value.
    '''
    # Language code is the first label of the wiki host (e.g. "en" in en.wikipedia.org).
    lang = table.url.split('.', 1)[0].split('/')[-1]
    pg = Wikipedia(lang).page(table.url.rsplit('/', 1)[-1])
    try:
        date_updated = pg.touched
    except Exception:  # FIX: was a bare `except:` (caught KeyboardInterrupt too)
        # BUG FIX: format was '%Y-%m-%mT...' (month twice); ISO-8601 needs the day.
        date_updated = dt.now().strftime('%Y-%m-%dT%H:%M:%SZ')
    try:
        categories = [kw.lower().split(':')[-1] for kw in pg.categories]
        kws = categories
        # kws = [kw for kw in kws if not any(c in kw for c in WIKIPEDIA_IGNORE_CATEGORIES)]
        # kws = set(word for kw in kws for word in findall(r'\w+', kw) if not len(FIND_STOPWORDS(kw)))
    except Exception:
        categories = []
        kws = []
    try:
        description = pg.summary.split('\n', 1)[0]
    except Exception:
        description = ''
    try:
        langlinks = list({v.title for v in pg.langlinks.values()})
    except Exception:
        langlinks = []
    res = {
        "title": table.context['r0'] if 'r0' in table.context else 'Table in %s' % pg.title,
        "description": description,
        "url": table.url,
        "keywords": list(kws),
        "date_updated": date_updated,
        "provenance": {
            "source": "wikipedia.org"
        },
        "materialization": {
            "python_path": "wikitables_materializer",
            "arguments": {
                "url": table.url,
                "xpath": table.xpath
            }
        },
        "additional_info": {
            "categories": categories,
            "sections": [s.title for s in pg.sections],
            "translations": langlinks
        }
    }
    res['variables'] = []
    for name in table.record[0].keys():
        var = {'name': name, 'semantic_type': []}
        values = [r[name] for r in table.record]
        # A type is assigned when at least this many values match it.
        min_sample = min_majority * len(values)
        dates = [d for d in map(find_dates, values) if d is not None]  # FIX: was `!= None`
        if len(dates) >= min_sample:
            var['semantic_type'].append('https://metadata.datadrivendiscovery.org/types/Time')
            var['temporal_coverage'] = {'start': min(dates), 'end': max(dates)}
        # FIX: the original reused `v` for both comprehension loops, shadowing
        # the value being analyzed; renamed the inner pair for clarity.
        entities = {ent: typ for v in values for ent, typ in find_entities(v).items()}
        locations = [v for v, t in entities.items() if t == 'GPE']
        if len(locations) >= min_sample:
            var['semantic_type'].append('https://metadata.datadrivendiscovery.org/types/Location')
        people = [v for v, t in entities.items() if t == 'PERSON']
        if len(people) >= min_sample:
            var['semantic_type'].append('https://schema.org/Person')
        if len(entities) >= min_sample:
            var['named_entity'] = list(entities.keys())
        numbers = [float(n) for n in values if n.strip().replace('.', '', 1).isdigit()]
        ranges = [n for n in values if BOOLEAN_SYNTAX_PROPERTIES['match-range'](n) is not None]
        if len(numbers) >= min_sample:
            var['semantic_type'].append('http://schema.org/Float')
        elif len(ranges) >= min_sample:
            var['semantic_type'].append('https://metadata.datadrivendiscovery.org/types/Interval')
        if not len(var['semantic_type']):
            # Non-empty values default to Text; all-empty columns are MissingData.
            if any(len(c) for c in values):
                var['semantic_type'].append('http://schema.org/Text')
            else:
                var['semantic_type'].append('https://metadata.datadrivendiscovery.org/types/MissingData')
        res['variables'].append(var)
    return res
"Л": 0, "М": 0, "Н": 0, "О": 0, "П": 0, "Р": 0, "С": 0, "Т": 0, "У": 0, "Ф": 0, "Х": 0, "Ц": 0, "Ч": 0, "Ш": 0, "Щ": 0, "Э": 0, "Ю": 0, "Я": 0 } for name in names_list: alphabet_dict[name[:1].upper()] += 1 return alphabet_dict wiki_wiki = Wikipedia('ru') members = wiki_wiki.page('Категория:Животные по алфавиту').categorymembers animal_names = get_russian_category_members(members) counted_names = count_names(animal_names) for key, value in counted_names.items(): print(f'{key}: {value}')
from wikipediaapi import Wikipedia, ExtractFormat

wiki = Wikipedia('en', extract_format=ExtractFormat.HTML)

# Canned UTF-8 response served for missing pages and disambiguation pages alike.
not_found = '<h1> Page not found </h1>'.encode('utf-8')


def search(term):
    """Return the UTF-8 encoded summary of `term`'s Wikipedia page.

    Missing pages and disambiguation pages (summary contains 'may refer to')
    both yield the `not_found` HTML snippet.
    """
    page = wiki.page(term)
    if not page.exists():
        return not_found
    summary = page.summary
    # FIX: test the str directly instead of the original's pointless
    # encode-then-decode round-trip.
    if 'may refer to' in summary:
        return not_found
    return summary.encode('utf-8')
import pandas as pd
from wikipediaapi import Wikipedia

W_API = Wikipedia('en')
TOP_25_URL = "https://en.wikipedia.org/wiki/Wikipedia:Top_25_Report"


def wiki_page_list(url, n_pages=None, article_column='Article'):
    """Return up to `n_pages` entries of `article_column` from the first HTML
    table at `url` that contains that column (all rows when `n_pages` is None).

    Raises KeyError when no table at `url` has the requested column.
    """
    for table in pd.read_html(url):
        if article_column in table:
            break
    else:
        raise KeyError(f'No column "{article_column}" in any tables at {url}.')
    limit = table.shape[0] if n_pages is None else n_pages
    return list(table[article_column][:limit])


def top25():
    """Titles of the 25 pages in Wikipedia's current Top 25 Report."""
    return wiki_page_list(TOP_25_URL, 25)
def scrape_articles(titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                    see_also=True, max_articles=10000, max_depth=1):
    """ Download text for an article and parse into sections and sentences

    Returns a DataFrame with one row per sentence and columns
    ['depth', 'title', 'section', 'sentence']. At depths below `max_depth`,
    "See also" links are queued for scraping as well.

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> df = scrape_articles(['ELIZA'], see_also=False)
    >>> df.shape[0] > 80
    True
    >>> df.columns
    Index(['depth', 'title', 'section', 'sentence'], dtype='object')
    """
    titles = list([titles] if isinstance(titles, str) else titles)
    exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
    depths = list([0] * len(titles))
    title_depths = list(zip(titles, depths))
    sentences = []
    num_articles = 0
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])  # seeded with '' so the skip-loop terminates
    title, d = '', 0
    wiki = Wikipedia()
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(title_depths):
            title = None
            # skip None titles and titles already scraped
            while len(title_depths) and len(titles_scraped) and (not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.warning(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            page = wiki.article(title)
            # Empty text AND summary means the retrieval failed; back off briefly.
            if not (len(page.text) + len(page.summary)):
                log.error(f"Unable to retrieve {title}")
                time.sleep(2.17)
                continue
            num_articles += 1
            # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                if not section:
                    continue
                # Queue linked titles one level deeper for later scraping.
                for t in section.text.split('\n')[1:]:
                    log.info(f'  Checking see also link: {t}')
                    if t in page.links:
                        log.info(f'    yep, found it in page.links')
                        title_depths.append((t, d + 1))
                log.info(f'  extended title_depths at depth {d}: {title_depths}')
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                # spacy doesn't handle "latin" (extended ascii) apostrophes well.
                text = section.text.replace('’', "'")
                # FIXME: need to rejoin short names before colons, like 'ELIZA:' 'Tell me...', and 'Human:' 'What...'
                # FIXME: need to split on question marks without white space but where next word is capitalized: ...to be unhappy?Though designed strictly...
                sentences.extend([
                    (d, title, section.title, s.text) for s in nlp(text).sents if (
                        len(s.text.strip().strip('"').strip("'").strip()) > 1)
                ])
            log.debug(f'Parsed {len(sentences)} sentences.')
            # retval = parse_sentences(
            #     title=title, sentences=sentences, title_depths=title_depths, see_also=see_also,
            #     exclude_headings=exclude_headings, d=d, depth=depth, max_depth=max_depth)
            # if retval is None:
            #     continue
            # else:
            #     sentences, title_depths = retval
            log.info(str([depth, d, num_articles, title]))
            if d > depth:
                log.warning(f"{d} > {depth}")
                break
    return pd.DataFrame(sentences, columns='depth title section sentence'.split())
from django.forms.models import model_to_dict
from django.http import Http404, JsonResponse
from django.shortcuts import render
from django.utils.functional import lazy

from .models import WikiArticle
from wikipediaapi import Wikipedia

# Shared English-Wikipedia client for the views below.
WIKI_WIKI = Wikipedia('en')


class Message:
    """Chat message whose direction alternates: successive instances are
    tagged 'send' then 'receive', tracked via the class-level `send` flag."""

    send = True

    def __init__(self, text):
        self.action = 'send' if Message.send else 'receive'
        # Flip the class-level flag so the next message goes the other way.
        Message.send = not Message.send
        self.text = text


def home(request):
    """Render the chat UI with the default bot."""
    return render(request, 'home.html', {'name_json': 'Grieves'})


def chat_page(request, bot_name):
    """Render the chat UI for `bot_name`."""
    return render(request, 'home.html', {'name_json': bot_name})
from typing import List

from wikipediaapi import Wikipedia, WikipediaPage

from src.exceptions.downloader_exceptions import PageNotFoundError
from src.downloader.models.DownloadedData import DownloadedData

# Japanese-language Wikipedia client shared by the functions below.
wiki_extractor = Wikipedia(language='ja')


def get_wikipedia_data_for_output(
        search_page_names: List[str]) -> List[DownloadedData]:
    """Download the named pages and wrap each one in a DownloadedData record."""
    downloaded = download_wikipedia_pages(search_page_names=search_page_names)
    return [DownloadedData.from_wikipedia_page(page) for page in downloaded]


def download_wikipedia_pages(
        search_page_names: List[str]) -> List[WikipediaPage]:
    """Fetch every named page, raising PageNotFoundError on the first one
    that does not exist."""
    fetched: List[WikipediaPage] = []
    for name in search_page_names:
        candidate = wiki_extractor.page(name)
        if not candidate.exists():
            raise PageNotFoundError(page_name=name,
                                    page_source="Wikipedia")
        fetched.append(candidate)
    return fetched
def scrape_article_texts(titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                         see_also=True, max_articles=10000, max_depth=1,
                         heading_text=True, title_text=True):
    """ Download text for an article and parse into sections and sentences

    Returns a list with one text string per scraped article (title and section
    headings optionally prepended). "See also" links are queued for scraping at
    depths below `max_depth`.

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> texts = scrape_article_texts(['ELIZA'], see_also=False)
    >>> len(texts)
    1
    >>> texts = scrape_article_texts(['Chatbot', 'ELIZA'], max_articles=10, max_depth=3)
    >>> len(texts) == 10
    True
    """
    titles = [titles] if isinstance(titles, str) else titles
    exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
    depths = list([0] * len(titles))  # depth is always zero here, but this would be useful further down
    title_depths = list(zip(titles, depths))
    texts = []
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])  # seeded with '' so the skip-loop terminates
    title, d, num_articles = '', 0, 0
    wiki = Wikipedia()
    # TODO: should be able to use depth rather than d:
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(title_depths):
            title = None
            # skip titles already scraped
            while len(title_depths) and len(titles_scraped) and (not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.info(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            page = wiki.article(title)
            # Empty text AND summary means the retrieval failed; back off briefly.
            if not (len(page.text) + len(page.summary)):
                log.warn(f"Unable to retrieve {title}")
                time.sleep(2.17)
                continue
            # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                if not section:
                    continue
                # Queue linked titles one level deeper for later scraping.
                for t in section.text.split('\n')[1:]:
                    log.info(f'  Checking see also link: {t}')
                    if t in page.links:
                        log.info(f'    yep, found it in page.links')
                        title_depths.append((t, d + 1))
                log.info(f'  extended title_depths at depth {d}: {title_depths}')
            # Rebuild the article text from its sections.
            text = f'{page.title}\n\n' if title_text else ''
            # page.text
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                text += f'\n{section.title}\n' if heading_text else '\n'
                # spacy doesn't handle "latin" (extended ascii) apostrophes well.
                text += section.text.replace('’', "'") + '\n'
            texts.append(text)
            log.warn(f'Added article "{page.title}" with {len(text)} characters. Total chars = {sum((len(t) for t in texts))}')
            log.warn(str([depth, d, num_articles, title]))
            if len(texts) >= max_articles:
                log.warn(f"num_articles={num_articles} ==> len(texts)={len(texts)} > max_depth={max_depth}")
                break
            if d > depth:
                log.warn(f"{d} > {depth}")
                break
    return texts
def get_filtered_complete_dic(pkl_with_stats_fn, min_paragraphs=5, min_len_paragraphs=500,
                              max_len_paragraphs=1000, draft=False, homonym=False,
                              years=False, wiki_path=None, clean_duplicates=False):
    """Load a per-article stats pickle and return it filtered to usable articles.

    Filtering steps: drop 'SectionError' entries, apply paragraph-length/draft/
    homonym filters, enforce a minimum paragraph count, optionally rename keys
    to canonical (redirect-resolved) titles, and optionally drop year articles.
    NOTE(review): `years=False` triggers the year filtering below — presumably
    the flag means "keep year articles"; confirm with callers.
    """
    with open(pkl_with_stats_fn, 'rb') as f:
        stats_uncleaned = pkl.load(f)
    # We filter out the sections errors
    stats = {
        key: stats_uncleaned[key]
        for key in stats_uncleaned if stats_uncleaned[key] != 'SectionError'
    }
    filtered_stats = filter_dic(stats,
                                min_len_paragraphs=min_len_paragraphs,
                                draft=draft,
                                homonym=homonym,
                                max_len_paragraphs=max_len_paragraphs)
    filtered_stats = filter_min_paras(filtered_stats, min_paragraphs)
    # We filter the years
    if clean_duplicates:
        if wiki_path is None:
            print("Error : give a wikipath for duplicates cleaning")
            return
        # Re-key every entry by the canonical page title (resolves duplicates
        # caused by redirects); entries whose pickle is missing are dropped.
        new_ft_stats = {}
        wiki_obj = Wikipedia('fr')
        for filename, stats in filtered_stats.items():
            try:
                with open(wiki_path + '/' + filename, 'rb') as f:
                    page = pkl.load(f)
            except FileNotFoundError:
                print("Not found :" + filename)
                continue
            page_info = wiki_obj.info(page)
            new_title = title = page_info.title
            new_title = new_title.replace(' ', '_')
            new_title += '.pkl'
            new_ft_stats[new_title] = stats
        filtered_stats = new_ft_stats
    if not years:
        print("Length before year fitering :", len(filtered_stats))
        if wiki_path is None:
            filtered_stats = {
                filename: filtered_stats[filename]
                for filename in filtered_stats if filter_years_articles(filename)
            }
        else:
            filtered_stats = {
                filename: filtered_stats[filename]
                for filename in filtered_stats if filter_years_articles(wiki_path + filename)
            }
    print("Final length : ", len(filtered_stats))
    return filtered_stats
def scrape_article_texts(titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                         see_also=True, max_articles=10000, max_depth=1,
                         heading_text=True, title_text=True):
    """ Download text for an article and parse into sections and sentences

    Generator: yields one text string per scraped article (title and section
    headings optionally prepended). "See also" links are queued for scraping
    at depths below `max_depth`.

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> texts = scrape_article_texts(['ELIZA'], see_also=False)
    >>> texts = list(texts)
    >>> len(texts)
    1
    >>> texts = list(scrape_article_texts(['Chatbot', 'ELIZA'], max_articles=10, max_depth=3))
    >>> len(texts)
    10
    """
    # Accept a bare string for backward compatibility, but log it as deprecated.
    if isinstance(titles, str):
        log.error(
            f'DEPRECATED `titles` should be a list of strs, not titles="{titles}"'
        )
        titles = find_titles(titles)
    exclude_headings = set(
        [eh.lower().strip() for eh in (exclude_headings or [])])
    # depth starts at zero here, but as additional titles are appended the depth will increase
    title_depths = list(zip(titles, [0] * len(titles)))
    text_lens = []
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])  # seeded with '' so the skip-loop terminates
    d, num_articles = 0, 0
    wiki = Wikipedia()
    # TODO: should be able to use depth rather than d:
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(
                title_depths) > 0:
            title = ''
            # skip titles already scraped
            while len(title_depths) and len(titles_scraped) and (
                    not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.info(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            log.info(f'len(title_depths): {len(title_depths)}Looking')
            page = wiki.article(title)
            # Empty text AND summary means the retrieval failed; back off briefly.
            if not (len(getattr(page, 'text', '')) + len(getattr(page, 'summary', ''))):
                log.warning(
                    f"Unable to retrieve _{title}_ because article text and summary len are 0."
                )
                time.sleep(2.17)
                continue
            # FIXME: this postprocessing of Article objects to compose a text string should be in a separate function
            # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                if not section:
                    continue
                # Queue linked titles one level deeper for later scraping.
                for t in section.text.split('\n')[1:]:
                    log.info(f"    Checking _SEE ALSO_ link: {t}")
                    if t in page.links:
                        log.info(
                            f'    Found title "{t}" in page.links at depth {d}, so adding it to titles to scrape...'
                        )
                        title_depths.append((t, d + 1))
                log.info(
                    f'  extended title_depths at depth {d}: {title_depths}')
            # Rebuild the article text from its sections.
            text = f'{page.title}\n\n' if title_text else ''
            # page.text
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                text += f'\n{section.title}\n' if heading_text else '\n'
                # spacy doesn't handle "latin" (extended ascii) apostrophes well.
                text += section.text.replace(
                    '’', "'"
                ) + '\n'
            yield text
            text_lens.append(len(text))
            log.warning(
                f'Added article "{page.title}" with {len(text)} chars.')
            log.info(f'  Total scraped {sum(text_lens)} chars')
            log.warning(str([depth, d, num_articles, title]))
            if len(text_lens) >= max_articles:
                log.warning(
                    f"num_articles={num_articles} ==> len(text_lens)={len(text_lens)} > max_depth={max_depth}"
                )
                break
            if d > depth:
                log.warning(f"{d} > {depth}")
                break
def _wiki_api(lang):
    """Build a wikipedia-api client for `lang` whose page extracts are HTML."""
    return Wikipedia(language=lang, extract_format=ExtractFormat.HTML)
def __init__(self, title):
    # Russian-language wikipedia-api client backing all lookups.
    self.wiki = Wiki('ru')
    # Title of the article this wrapper represents.
    self.title = title
from wikipediaapi import Wikipedia

player_name = input("Enter player name: ")
wiki = Wikipedia(language='en')
page = wiki.page(player_name)

# Print the 'Club career' section heading plus its nested subsection titles.
for section in page.sections:
    if 'Club career' not in section.title:
        continue
    print(section.title)
    for sub in section.sections:
        print("\t" + sub.title)
        # FIX: wikipedia-api exposes `.sections` as a list (empty when there
        # are none), so the original `!= None` guard was always true; iterate
        # directly — an empty list simply yields nothing.
        for subsub in sub.sections:
            print("\t\t" + subsub.title)