Example #1
    def scrape_article_texts(self,
                             titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                             see_also=True, max_articles=10000, max_depth=1,
                             heading_text=True, title_text=True):
        """ Download text for an article and parse into sections and sentences

        >>> nlp('hello')  # to eager-load spacy model
        hello
        >>> texts = scrape_article_texts(['ELIZA'], see_also=False)
        >>> texts = list(texts)
        >>> len(texts)
        1
        >>> texts = list(scrape_article_texts(['Chatbot', 'ELIZA'], max_articles=10, max_depth=3))
        >>> len(texts)
        10
        """
        if isinstance(titles, str):
            log.error(f'DEPRECATED `titles` should be a list of strs, not titles="{titles}"')
            titles = find_titles(titles)
        exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
        # depth starts at zero here, but as additional titles are appended the depth will increase
        title_depths = list(zip(titles, [0] * len(titles)))
        text_lens = []
        # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
        # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
        titles_scraped = set([''])
        d, num_articles = 0, 0
        wiki = Wikipedia()
        # TODO: should be able to use depth rather than d:
        for depth in range(max_depth):
            while num_articles < max_articles and d <= depth and len(title_depths) > 0:
                title = ''

                # skip titles already scraped
                while len(title_depths) and len(titles_scraped) and (not title or title in titles_scraped):
                    # log.warning(f"Skipping {title} (already scraped)")
                    try:
                        title, d = title_depths.pop()
                    except IndexError:
                        log.info(f'Out of titles: {title_depths}')
                        break
                    title = title.strip()
                if d > max_depth or not title:
                    log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                    continue
                titles_scraped.add(title)
                log.info(f'len(title_depths): {len(title_depths)}')
                text = self.cache.get(title, None)
                if text:
                    yield text
                    continue  # article text already cached; skip re-downloading it
                page = wiki.article(title)
                if not (len(getattr(page, 'text', '')) + len(getattr(page, 'summary', ''))):
                    log.warning(f"Unable to retrieve _{title}_ because article text and summary len are 0.")
Example #2
def scrape_article_texts(titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                         see_also=True, max_articles=10000, max_depth=1,
                         heading_text=True, title_text=True):
    """ Download text for an article and parse into sections and sentences

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> texts = scrape_article_texts(['ELIZA'], see_also=False)
    >>> len(texts)
    1
    >>> texts = scrape_article_texts(['Chatbot', 'ELIZA'], max_articles=10, max_depth=3)
    >>> len(texts) == 10
    True
    """
    titles = [titles] if isinstance(titles, str) else titles
    exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
    depths = list([0] * len(titles))
    # depth is always zero here, but this would be useful further down
    title_depths = list(zip(titles, depths))
    texts = []
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])
    title, d, num_articles = '', 0, 0
    wiki = Wikipedia()
    # TODO: should be able to use depth rather than d:
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(title_depths):
            title = None

            # skip titles already scraped
            while len(title_depths) and len(titles_scraped) and (not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.info(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            page = wiki.article(title)
            if not (len(page.text) + len(page.summary)):
                log.warning(f"Unable to retrieve {title}")
                time.sleep(2.17)
                continue
            # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                # only extend the crawl when a "See also" section exists;
                # don't skip the article text below if it is missing
                if section:
                    for t in section.text.split('\n')[1:]:
                        log.info(f'  Checking see also link: {t}')
                        if t in page.links:
                            log.info('    yep, found it in page.links')
                            title_depths.append((t, d + 1))
                    log.info(f'  extended title_depths at depth {d}: {title_depths}')
            text = f'{page.title}\n\n' if title_text else ''
            # page.text
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                text += f'\n{section.title}\n' if heading_text else '\n'
                text += section.text.replace('’', "'") + '\n'  # spacy doesn't handle "latin" (extended ascii) apostrophes well.
            texts.append(text)
            log.warning(f'Added article "{page.title}" with {len(text)} characters. Total chars = {sum(len(t) for t in texts)}')
            log.warning(str([depth, d, num_articles, title]))
            if len(texts) >= max_articles:
                log.warning(f"num_articles={num_articles} ==> len(texts)={len(texts)} >= max_articles={max_articles}")
                break
            if d > depth:
                log.warning(f"{d} > {depth}")
                break
    return texts
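
A minimal usage sketch for this list-returning variant, assuming the module-level TITLES, EXCLUDE_HEADINGS, and spaCy nlp pipeline referenced in the doctest are available (the spaCy model name below is an assumption):

import spacy

nlp = spacy.load('en_core_web_sm')  # assumed model; the doctest only requires some loaded `nlp`

texts = scrape_article_texts(['ELIZA'], see_also=False)  # list with one article string
print(len(texts))                 # -> 1, as in the doctest
doc = nlp(texts[0])               # parse the scraped article text
print(sum(1 for _ in doc.sents))  # number of sentences spaCy segmented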
Example #3
def scrape_articles(titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                    see_also=True, max_articles=10000, max_depth=1):
    """ Download text for an article and parse into sections and sentences

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> df = scrape_articles(['ELIZA'], see_also=False)
    >>> df.shape[0] > 80
    True
    >>> df.columns
    Index(['depth', 'title', 'section', 'sentence'], dtype='object')
    """
    titles = list([titles] if isinstance(titles, str) else titles)
    exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
    depths = list([0] * len(titles))
    title_depths = list(zip(titles, depths))
    sentences = []
    num_articles = 0
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])
    title, d = '', 0
    wiki = Wikipedia()
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(title_depths):
            title = None
            # skip None titles and titles already scraped
            while len(title_depths) and len(titles_scraped) and (not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.warning(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            page = wiki.article(title)
            if not (len(page.text) + len(page.summary)):
                log.error(f"Unable to retrieve {title}")
                time.sleep(2.17)
                continue
            num_articles += 1
            # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                # only extend the crawl when a "See also" section exists;
                # don't skip the article's sentences below if it is missing
                if section:
                    for t in section.text.split('\n')[1:]:
                        log.info(f'  Checking see also link: {t}')
                        if t in page.links:
                            log.info('    yep, found it in page.links')
                            title_depths.append((t, d + 1))
                    log.info(f'  extended title_depths at depth {d}: {title_depths}')
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                text = section.text.replace('’', "'")  # spacy doesn't handle "latin" (extended ascii) apostrophes well.
                # FIXME: need to rejoin short names before colons, like 'ELIZA:' 'Tell me...', and 'Human:' 'What...'
                # FIXME: need to split on question marks without white space but where next word is capitalized: ...to be unhappy?Though designed strictly...
                sentences.extend([
                    (d, title, section.title, s.text) for s in nlp(text).sents if (
                        len(s.text.strip().strip('"').strip("'").strip()) > 1)
                ])
            log.debug(f'Parsed {len(sentences)} sentences.')

            # retval = parse_sentences(
            #     title=title, sentences=sentences, title_depths=title_depths, see_also=see_also,
            #     exclude_headings=exclude_headings, d=d, depth=depth, max_depth=max_depth)
            # if retval is None:
            #     continue
            # else:
            #     sentences, title_depths = retval
            log.info(str([depth, d, num_articles, title]))
            if d > depth:
                log.warning(f"{d} > {depth}")
                break

    return pd.DataFrame(sentences, columns='depth title section sentence'.split())
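
A short usage sketch for the DataFrame variant; the column names come straight from the pd.DataFrame(...) call above, and the groupby is only meant to illustrate the returned structure:

df = scrape_articles(['ELIZA'], see_also=False)
print(df.columns.tolist())   # ['depth', 'title', 'section', 'sentence']
print(df.shape[0] > 80)      # True, per the doctest
# sentences extracted per article section
print(df.groupby(['title', 'section'])['sentence'].count().head())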
Example #4
class WikiScraper:
    """ RAM caching of scraped wikipedia pages

    TODO: preserve cache between runs in a sqlite database or flatfile or h5 (hdf) file
    """
    def __init__(self,
                 sleep_empty_page=2.17,
                 sleep_downloaded_page=0.01,
                 sleep_nonexistent_page=0.02):
        self.sleep_empty_page = sleep_empty_page
        self.sleep_nonexistent_page = sleep_nonexistent_page
        self.sleep_downloaded_page = sleep_downloaded_page
        self.cache = {}
        self.section_titles = {}

    def get_article(
        self,
        title: str,
        exclude_headings=EXCLUDE_HEADINGS,
        see_also=True,
        prepend_section_headings=True,
        prepend_title_text=True,
    ):
        """ same as scrape_article_texts but for single article, and checks cache first """
        page_dict = self.cache.get(title)
        if page_dict and page_dict.get('text') and page_dict.get('summary'):
            return copy.copy(page_dict)
        self.wiki = Wikipedia()
        page = self.wiki.article(title)

        text, summary, see_also_links = '', '', []
        if page.exists():
            text = getattr(page, 'text', '')
            summary = getattr(page, 'summary', '')
        else:
            time.sleep(self.sleep_nonexistent_page)
            self.cache[title] = {}
            return {}

        # FIXME: this postprocessing of Article objects to compose a text string should be in a separate function
        # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
        if see_also:
            # .full_text() includes the section heading ("See also"). .text does not
            section = page.section_by_title('See also')
            if section:
                for t in section.text.split('\n'):
                    log.info(f"  Checking _SEE ALSO_ link: {t}")
                    if t in page.links:
                        see_also_links.append(t)

        text = f'{page.title}\n\n' if prepend_title_text else ''
        # page.text
        for section in page.sections:
            if section.title.lower().strip() in exclude_headings:
                continue
            # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
            text += f'\n{section.title}\n' if prepend_section_headings else '\n'
            # spacy doesn't handle "latin" (extended ascii) apostrophes well.
            text += section.text.replace('’', "'") + '\n'
            self.section_titles[str(section.title).strip()] = str(
                section.title).lower().strip().replace('’', "'")
        page_dict = dict(title=page.title,
                         text=text,
                         summary=summary,
                         see_also_links=see_also_links)
        self.cache[title] = page_dict
        return page_dict

    def scrape_article_pages(self,
                             titles=TITLES,
                             exclude_headings=EXCLUDE_HEADINGS,
                             see_also=True,
                             prepend_section_headings=True,
                             prepend_title_text=True,
                             max_articles=10_000,
                             max_depth=1):
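
A usage sketch for the RAM-cached scraper class; the dict keys shown match the page_dict built in get_article, and a second call with the same title should return a copy from self.cache instead of re-downloading:

scraper = WikiScraper()
page_dict = scraper.get_article('ELIZA')
print(sorted(page_dict.keys()))  # ['see_also_links', 'summary', 'text', 'title']
print(len(page_dict['text']))    # characters of scraped text

cached = scraper.get_article('ELIZA')        # served from scraper.cache this time
print(cached['title'] == page_dict['title'])  # True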
Example #5
def scrape_article_texts(titles=TITLES,
                         exclude_headings=EXCLUDE_HEADINGS,
                         see_also=True,
                         max_articles=10000,
                         max_depth=1,
                         heading_text=True,
                         title_text=True):
    """ Download text for an article and parse into sections and sentences

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> texts = scrape_article_texts(['ELIZA'], see_also=False)
    >>> texts = list(texts)
    >>> len(texts)
    1
    >>> texts = list(scrape_article_texts(['Chatbot', 'ELIZA'], max_articles=10, max_depth=3))
    >>> len(texts)
    10
    """
    if isinstance(titles, str):
        log.error(
            f'DEPRECATED `titles` should be a list of strs, not titles="{titles}"'
        )
        titles = find_titles(titles)
    exclude_headings = set(
        [eh.lower().strip() for eh in (exclude_headings or [])])
    # depth starts at zero here, but as additional titles are appended the depth will increase
    title_depths = list(zip(titles, [0] * len(titles)))
    text_lens = []
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])
    d, num_articles = 0, 0
    wiki = Wikipedia()
    # TODO: should be able to use depth rather than d:
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(
                title_depths) > 0:
            title = ''

            # skip titles already scraped
            while len(title_depths) and len(titles_scraped) and (
                    not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.info(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            log.info(f'len(title_depths): {len(title_depths)}')
            page = wiki.article(title)
            if not (len(getattr(page, 'text', '')) +
                    len(getattr(page, 'summary', ''))):
                log.warning(
                    f"Unable to retrieve _{title}_ because article text and summary len are 0."
                )
                time.sleep(2.17)
                continue
            # FIXME: this postprocessing of Article objects to compose a text string should be in a separate function
            # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                # only extend the crawl when a "See also" section exists;
                # don't skip the article text below if it is missing
                if section:
                    for t in section.text.split('\n')[1:]:
                        log.info(f"  Checking _SEE ALSO_ link: {t}")
                        if t in page.links:
                            log.info(
                                f'     Found title "{t}" in page.links at depth {d}, so adding it to titles to scrape...'
                            )
                            title_depths.append((t, d + 1))
                    log.info(
                        f'  extended title_depths at depth {d}: {title_depths}')
            text = f'{page.title}\n\n' if title_text else ''
            # page.text
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                text += f'\n{section.title}\n' if heading_text else '\n'
                text += section.text.replace(
                    '’', "'"
                ) + '\n'  # spacy doesn't handle "latin" (extended ascii) apostrophes well.
            yield text
            text_lens.append(len(text))
            log.warning(
                f'Added article "{page.title}" with {len(text)} chars.')
            log.info(f'  Total scraped {sum(text_lens)} chars')
            log.warning(str([depth, d, num_articles, title]))
            if len(text_lens) >= max_articles:
                log.warning(
                    f"num_articles={num_articles} ==> len(text_lens)={len(text_lens)} >= max_articles={max_articles}"
                )
                break
            if d > depth:
                log.warning(f"{d} > {depth}")
                break