def append_next_page(
        get_article_func,
        parsed_urls,
        page_index,
        page_url,
        doc,
        options
        ):
    """Fetch follow-up pages and append their article content to ``doc``.

    Starting at ``page_url``, each page is fetched through
    ``options['urlfetch'].urlread``, parsed, reduced to its article with
    ``get_article_func`` and appended to ``doc``.  The chain of pages is
    followed until one of the following happens:

    * ``page_index`` reaches ``MAX_PAGES``;
    * fetching a page raises (logged as a warning, not propagated);
    * the extracted page is a suspected duplicate of what ``doc`` holds;
    * ``find_next_page_url`` returns ``None``.

    Errors raised while parsing or extracting a page are not caught here
    and propagate to the caller.

    Args:
        get_article_func: Callable invoked as
            ``get_article_func(page_doc, options)``; its result must
            expose an ``html`` attribute.
        parsed_urls: Passed through to ``find_next_page_url``
            (presumably the URLs already visited; confirm with caller).
        page_index: Index of the page at ``page_url``; incremented for
            every following page.
        page_url: URL of the first page to fetch.
        doc: Element that receives each page fragment via ``append``.
        options: Mapping providing at least the ``'urlfetch'`` fetcher;
            also forwarded to ``get_article_func``.

    Returns:
        None.  ``doc`` is modified in place.
    """
    # Iterate instead of recursing once per page: the stack stays flat and
    # the intermediates of earlier pages can be released as soon as the
    # page has been appended.
    while True:
        logging.debug('appending next page: %s', page_url)

        if page_index >= MAX_PAGES:
            return

        fetcher = options['urlfetch']
        try:
            html = fetcher.urlread(page_url)
        except Exception:
            # Best effort: a page that cannot be fetched ends the chain
            # but must not discard the pages collected so far.
            logging.warning('exception fetching %s', page_url, exc_info=True)
            return

        orig_page_doc = parse(html, page_url)
        # Resolve the successor before extraction, from the full page.
        next_page_url = find_next_page_url(
            parsed_urls, page_url, orig_page_doc)
        page_article = get_article_func(orig_page_doc, options)
        page_doc = fragment_fromstring(page_article.html)
        make_page_elem(page_index, page_doc)

        # A suspected duplicate is neither appended nor followed.
        if is_suspected_duplicate(doc, page_doc):
            return
        doc.append(page_doc)

        if next_page_url is None:
            return
        page_index += 1
        page_url = next_page_url
Beispiel #2
0
def append_next_page(get_article_func, parsed_urls, page_index, page_url, doc,
                     options):
    """Append the article content of ``page_url`` and its successors to ``doc``.

    Each page is downloaded with ``options['urlfetch'].urlread``, parsed,
    passed to ``get_article_func(page_doc, options)`` and the resulting
    ``html`` is appended to ``doc`` as a fragment.  Processing stops when
    ``page_index`` reaches ``MAX_PAGES``, when a download raises (a warning
    is logged and the error is swallowed), when the new fragment is a
    suspected duplicate of ``doc``, or when ``find_next_page_url`` yields
    ``None``.  Exceptions from parsing or extraction are not handled here.

    Args:
        get_article_func: Extraction callable; its return value must have
            an ``html`` attribute.
        parsed_urls: Forwarded unchanged to ``find_next_page_url``
            (presumably tracks visited URLs; verify against caller).
        page_index: Index assigned to the page at ``page_url``.
        page_url: URL of the page to start from.
        doc: Element extended in place with one fragment per page.
        options: Mapping with a ``'urlfetch'`` entry; also handed to
            ``get_article_func``.

    Returns:
        None.
    """
    # A loop rather than one recursive call per page keeps stack depth
    # constant and lets each page's parsed document be freed early.
    while True:
        logging.debug('appending next page: %s', page_url)

        if page_index >= MAX_PAGES:
            return

        fetcher = options['urlfetch']
        try:
            html = fetcher.urlread(page_url)
        except Exception:
            # Deliberately broad: any fetch failure just ends pagination.
            logging.warning('exception fetching %s', page_url, exc_info=True)
            return

        orig_page_doc = parse(html, page_url)
        next_page_url = find_next_page_url(parsed_urls, page_url,
                                           orig_page_doc)
        page_article = get_article_func(orig_page_doc, options)
        page_doc = fragment_fromstring(page_article.html)
        make_page_elem(page_index, page_doc)

        # Stop without appending when the page looks like a repeat.
        if is_suspected_duplicate(doc, page_doc):
            return
        doc.append(page_doc)

        if next_page_url is None:
            return
        page_index, page_url = page_index + 1, next_page_url
Beispiel #3
0
 def _html(self, force=False):
     """Return ``self.html``, parsing ``self.input`` when needed.

     The cached value is reused unless it is ``None`` or ``force`` is
     true, in which case ``parse(self.input, self.options['url'])`` is
     called and its result stored in ``self.html``.
     """
     # Fast path: a cached document exists and no re-parse was requested.
     if not force and self.html is not None:
         return self.html
     self.html = parse(self.input, self.options['url'])
     return self.html
Beispiel #4
0
 def _html(self, force=False):
     """Lazily build and return the parsed form of ``self.input``.

     ``self.html`` acts as the cache: it is (re)computed with
     ``parse(self.input, self.options['url'])`` when it is still
     ``None`` or when ``force`` is true, and returned otherwise.
     """
     needs_parse = force or self.html is None
     if needs_parse:
         self.html = parse(self.input, self.options['url'])
     return self.html