def scrape_article_texts(self, titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                         see_also=True, max_articles=10000, max_depth=1,
                         heading_text=True, title_text=True):
    """ Download text for an article and parse into sections and sentences

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> texts = scrape_article_texts(['ELIZA'], see_also=False)
    >>> texts = list(texts)
    >>> len(texts)
    1
    >>> texts = list(scrape_article_texts(['Chatbot', 'ELIZA'], max_articles=10, max_depth=3))
    >>> len(texts)
    10
    """
    if isinstance(titles, str):
        log.error(f'DEPRECATED `titles` should be a list of strs, not titles="{titles}"')
        titles = find_titles(titles)
    exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
    # depth starts at zero here, but as additional titles are appended the depth will increase
    title_depths = list(zip(titles, [0] * len(titles)))
    text_lens = []
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])
    d, num_articles = 0, 0
    wiki = Wikipedia()
    # TODO: should be able to use depth rather than d:
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(title_depths) > 0:
            title = ''
            # skip titles already scraped
            while len(title_depths) and len(titles_scraped) and (not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.info(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            log.info(f'len(title_depths): {len(title_depths)}')
            text = self.cache.get(title, None)
            if text:
                yield text
            page = wiki.article(title)
            if not (len(getattr(page, 'text', '')) + len(getattr(page, 'summary', ''))):
                log.warning(f"Unable to retrieve _{title}_ because article text and summary len are 0.")
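                # NOTE: everything from here to the end of this method is a minimal sketch (an
                # assumption), mirroring the module-level scrape_article_texts() generator defined
                # later in this module; only the See-also handling is simplified.
                time.sleep(2.17)
                continue
            # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                if section:
                    for t in section.text.split('\n')[1:]:
                        log.info(f"    Checking _SEE ALSO_ link: {t}")
                        if t in page.links:
                            title_depths.append((t, d + 1))
                    log.info(f'  extended title_depths at depth {d}: {title_depths}')
            text = f'{page.title}\n\n' if title_text else ''
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                text += f'\n{section.title}\n' if heading_text else '\n'
                # spacy doesn't handle "latin" (extended ascii) apostrophes well.
                text += section.text.replace('’', "'") + '\n'
            yield text
            text_lens.append(len(text))
            num_articles += 1
            log.info(f'Added article "{page.title}" with {len(text)} chars (total {sum(text_lens)} chars).')
            if len(text_lens) >= max_articles:
                break
        if d > depth:
            break
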
def scrape_article_texts(titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                         see_also=True, max_articles=10000, max_depth=1,
                         heading_text=True, title_text=True):
    """ Download text for an article and parse into sections and sentences

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> texts = scrape_article_texts(['ELIZA'], see_also=False)
    >>> len(texts)
    1
    >>> texts = scrape_article_texts(['Chatbot', 'ELIZA'], max_articles=10, max_depth=3)
    >>> len(texts) == 10
    True
    """
    titles = [titles] if isinstance(titles, str) else titles
    exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
    depths = list([0] * len(titles))  # depth is always zero here, but this would be useful further down
    title_depths = list(zip(titles, depths))
    texts = []
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])
    title, d, num_articles = '', 0, 0
    wiki = Wikipedia()
    # TODO: should be able to use depth rather than d:
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(title_depths):
            title = None
            # skip titles already scraped
            while len(title_depths) and len(titles_scraped) and (not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.info(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            page = wiki.article(title)
            if not (len(page.text) + len(page.summary)):
                log.warning(f"Unable to retrieve {title}")
                time.sleep(2.17)
                continue
            # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                if not section:
                    continue
                for t in section.text.split('\n')[1:]:
                    log.info(f'  Checking see also link: {t}')
                    if t in page.links:
                        log.info('    yep, found it in page.links')
                        title_depths.append((t, d + 1))
                log.info(f'  extended title_depths at depth {d}: {title_depths}')
            text = f'{page.title}\n\n' if title_text else ''
            # page.text
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                text += f'\n{section.title}\n' if heading_text else '\n'
                text += section.text.replace('’', "'") + '\n'  # spacy doesn't handle "latin" (extended ascii) apostrophes well.
            texts.append(text)
            log.warning(f'Added article "{page.title}" with {len(text)} characters. Total chars = {sum((len(t) for t in texts))}')
            log.warning(str([depth, d, num_articles, title]))
            if len(texts) >= max_articles:
                log.warning(f"num_articles={num_articles} ==> len(texts)={len(texts)} > max_depth={max_depth}")
                break
        if d > depth:
            log.warning(f"{d} > {depth}")
            break
    return texts
def scrape_articles(titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                    see_also=True, max_articles=10000, max_depth=1):
    """ Download text for an article and parse into sections and sentences

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> df = scrape_articles(['ELIZA'], see_also=False)
    >>> df.shape[0] > 80
    True
    >>> df.columns
    Index(['depth', 'title', 'section', 'sentence'], dtype='object')
    """
    titles = list([titles] if isinstance(titles, str) else titles)
    exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
    depths = list([0] * len(titles))
    title_depths = list(zip(titles, depths))
    sentences = []
    num_articles = 0
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])
    title, d = '', 0
    wiki = Wikipedia()
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(title_depths):
            title = None
            # skip None titles and titles already scraped
            while len(title_depths) and len(titles_scraped) and (not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.warning(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            page = wiki.article(title)
            if not (len(page.text) + len(page.summary)):
                log.error(f"Unable to retrieve {title}")
                time.sleep(2.17)
                continue
            num_articles += 1
            # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                if not section:
                    continue
                for t in section.text.split('\n')[1:]:
                    log.info(f'  Checking see also link: {t}')
                    if t in page.links:
                        log.info('    yep, found it in page.links')
                        title_depths.append((t, d + 1))
                log.info(f'  extended title_depths at depth {d}: {title_depths}')
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                text = section.text.replace('’', "'")  # spacy doesn't handle "latin" (extended ascii) apostrophes well.
                # FIXME: need to rejoin short names before colons, like 'ELIZA:' 'Tell me...', and 'Human:' 'What...'
                # FIXME: need to split on question marks without white space but where next word is capitalized: ...to be unhappy?Though designed strictly...
                sentences.extend([
                    (d, title, section.title, s.text) for s in nlp(text).sents
                    if (len(s.text.strip().strip('"').strip("'").strip()) > 1)
                ])
            log.debug(f'Parsed {len(sentences)} sentences.')
            # retval = parse_sentences(
            #     title=title, sentences=sentences, title_depths=title_depths, see_also=see_also,
            #     exclude_headings=exclude_headings, d=d, depth=depth, max_depth=max_depth)
            # if retval is None:
            #     continue
            # else:
            #     sentences, title_depths = retval
            log.info(str([depth, d, num_articles, title]))
        if d > depth:
            log.warning(f"{d} > {depth}")
            break
    return pd.DataFrame(sentences, columns='depth title section sentence'.split())
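# Usage sketch for the sentence DataFrame returned by scrape_articles() (output values are
# illustrative assumptions; requires network access and the module-level spacy `nlp` model):
#
#   >>> df = scrape_articles(['ELIZA'], see_also=False)
#   >>> df.columns.tolist()
#   ['depth', 'title', 'section', 'sentence']
#   >>> # one row per sentence, so a section's text can be rebuilt by grouping and joining
#   >>> section_texts = df.groupby(['title', 'section'])['sentence'].apply(' '.join)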
class WikiScraper:
    """ RAM caching of scraped wikipedia pages

    TODO: preserve cache between runs in a sqlite database or flatfile or h5 (hdf) file
    """

    def __init__(self,
                 sleep_empty_page=2.17,
                 sleep_downloaded_page=0.01,
                 sleep_nonexistent_page=0.02):
        self.sleep_empty_page = sleep_empty_page
        self.sleep_nonexistent_page = sleep_nonexistent_page
        self.sleep_downloaded_page = sleep_downloaded_page
        self.cache = {}
        self.section_titles = {}

    def get_article(self,
                    title: str,
                    exclude_headings=EXCLUDE_HEADINGS,
                    see_also=True,
                    prepend_section_headings=True,
                    prepend_title_text=True):
        """ Same as scrape_article_texts but for a single article, and checks the cache first """
        page_dict = self.cache.get(title)
        if page_dict and page_dict.get('text') and page_dict.get('summary'):
            return copy.copy(page_dict)
        self.wiki = Wikipedia()
        page = self.wiki.article(title)
        text, summary, see_also_links = '', '', []
        if page.exists():
            text = getattr(page, 'text', '')
            summary = getattr(page, 'summary', '')
        else:
            time.sleep(self.sleep_nonexistent_page)
            self.cache[title] = {}
            return {}
        # FIXME: this postprocessing of Article objects to compose a text string should be a separate function
        # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
        if see_also:
            # .full_text() includes the section heading ("See also"). .text does not
            section = page.section_by_title('See also')
            if section:
                for t in section.text.split('\n'):
                    log.info(f"  Checking _SEE ALSO_ link: {t}")
                    if t in page.links:
                        see_also_links.append(t)
        text = f'{page.title}\n\n' if prepend_title_text else ''
        # page.text
        for section in page.sections:
            if section.title.lower().strip() in exclude_headings:
                continue
            # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
            text += f'\n{section.title}\n' if prepend_section_headings else '\n'
            # spacy doesn't handle "latin" (extended ascii) apostrophes well.
            text += section.text.replace('’', "'") + '\n'
            self.section_titles[str(section.title).strip()] = str(section.title).lower().strip().replace('’', "'")
        page_dict = dict(title=page.title, text=text, summary=summary, see_also_links=see_also_links)
        self.cache[title] = page_dict
        return page_dict

    def scrape_article_pages(self, titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                             see_also=True, prepend_section_headings=True, prepend_title_text=True,
                             max_articles=10_000, max_depth=1):
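        """ Scrape multiple articles and yield one cached page dict per article.

        NOTE: this body is a minimal sketch (an assumption): it reuses the title/depth walk from
        scrape_article_texts() and yields the page_dict returned by self.get_article() with a
        'depth' key added.
        """
        titles = [titles] if isinstance(titles, str) else titles
        exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
        title_depths = list(zip(titles, [0] * len(titles)))
        titles_scraped = set([''])
        num_articles = 0
        while title_depths and num_articles < max_articles:
            title, d = title_depths.pop()
            title = title.strip()
            if not title or title in titles_scraped or d > max_depth:
                continue
            titles_scraped.add(title)
            page_dict = self.get_article(
                title,
                exclude_headings=exclude_headings,
                see_also=see_also,
                prepend_section_headings=prepend_section_headings,
                prepend_title_text=prepend_title_text)
            if not page_dict:
                continue
            num_articles += 1
            # walk the "See also" links one level deeper, as in scrape_article_texts()
            if see_also and d + 1 < max_depth:
                for t in page_dict.get('see_also_links', []):
                    title_depths.append((t, d + 1))
            page_dict = copy.copy(page_dict)  # avoid mutating the cached dict
            page_dict['depth'] = d  # record how many "See also" hops from the seed titles
            yield page_dict
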
def scrape_article_texts(titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                         see_also=True, max_articles=10000, max_depth=1,
                         heading_text=True, title_text=True):
    """ Download text for an article and parse into sections and sentences

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> texts = scrape_article_texts(['ELIZA'], see_also=False)
    >>> texts = list(texts)
    >>> len(texts)
    1
    >>> texts = list(scrape_article_texts(['Chatbot', 'ELIZA'], max_articles=10, max_depth=3))
    >>> len(texts)
    10
    """
    if isinstance(titles, str):
        log.error(f'DEPRECATED `titles` should be a list of strs, not titles="{titles}"')
        titles = find_titles(titles)
    exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
    # depth starts at zero here, but as additional titles are appended the depth will increase
    title_depths = list(zip(titles, [0] * len(titles)))
    text_lens = []
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])
    d, num_articles = 0, 0
    wiki = Wikipedia()
    # TODO: should be able to use depth rather than d:
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(title_depths) > 0:
            title = ''
            # skip titles already scraped
            while len(title_depths) and len(titles_scraped) and (not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.info(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            log.info(f'len(title_depths): {len(title_depths)}')
            page = wiki.article(title)
            if not (len(getattr(page, 'text', '')) + len(getattr(page, 'summary', ''))):
                log.warning(f"Unable to retrieve _{title}_ because article text and summary len are 0.")
                time.sleep(2.17)
                continue
            # FIXME: this postprocessing of Article objects to compose a text string should be a separate function
            # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                if not section:
                    continue
                for t in section.text.split('\n')[1:]:
                    log.info(f"    Checking _SEE ALSO_ link: {t}")
                    if t in page.links:
                        log.info(f'     Found title "{t}" in page.links at depth {d}, so adding it to titles to scrape...')
                        title_depths.append((t, d + 1))
                log.info(f'  extended title_depths at depth {d}: {title_depths}')
            text = f'{page.title}\n\n' if title_text else ''
            # page.text
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                text += f'\n{section.title}\n' if heading_text else '\n'
                # spacy doesn't handle "latin" (extended ascii) apostrophes well.
                text += section.text.replace('’', "'") + '\n'
            yield text
            text_lens.append(len(text))
            log.warning(f'Added article "{page.title}" with {len(text)} chars.')
            log.info(f'  Total scraped {sum(text_lens)} chars')
            log.warning(str([depth, d, num_articles, title]))
            if len(text_lens) >= max_articles:
                log.warning(f"num_articles={num_articles} ==> len(text_lens)={len(text_lens)} > max_depth={max_depth}")
                break
        if d > depth:
            log.warning(f"{d} > {depth}")
            break
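# Usage sketch: this scrape_article_texts() is a generator, so articles are downloaded lazily,
# one at a time, as you iterate (titles and counts below are illustrative assumptions):
#
#   >>> texts = scrape_article_texts(['Chatbot'], max_articles=3, max_depth=2)
#   >>> for text in texts:
#   ...     print(text.splitlines()[0], len(text))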