import bs4
import readability  # readability-lxml
import urlparse


def extract(html, url):
    """Walk the id-annotated input DOM and copy into a fresh tree only the
    nodes allowed through by the whitelist/blacklist/readability states."""
    in_soup = create_soup_with_ids(html)
    html = unicode(in_soup)
    
    # output tree: kept nodes are cloned under this root <div>
    out_soup = bs4.BeautifulSoup('<div></div>', 'lxml')
    stack = [out_soup.find('div')]
    
    blacklist, whitelist = id_blacklist_and_whitelist_for_soup(in_soup, url)
    readability_ids = ids_preserved_by_readability(html)
    
    # per-node filtering states, propagated down the tree:
    NONE = 1
    WHITELISTED = 2
    READABILITY = 3
    BLACKLISTED = 4
    
    state_stack = [NONE]
    for kind, data in iterate_tree(in_soup):
        if kind == 'enter':
            id = data.get('data-subscribed-id')
            parent_state = state_stack[-1]
            
            if parent_state in (WHITELISTED, BLACKLISTED):
                state = parent_state
            else:
                if id in readability_ids:
                    state = READABILITY
                else:
                    state = NONE
            
            if id in whitelist or should_auto_whitelist(data):
                state = WHITELISTED
            elif id in blacklist or should_auto_blacklist(data):
                state = BLACKLISTED
            
            if id: del data['data-subscribed-id']  # strip the temporary id before cloning
            
            state_stack.append(state)
            if data.name != '[document]':  # skip the virtual document root
                clone = clone_node(data, out_soup)
                stack[-1].append(clone)
                stack.append(clone)
        elif kind == 'text':
            if state_stack[-1] in (WHITELISTED, READABILITY):
                stack[-1].append(data)
        elif kind == 'exit':
            node = stack[-1]
            node_contains_content = len(list(node)) > 0
            # media and rule tags count as content even with no children:
            node_is_content = node.name in ('img', 'video', 'object', 'hr', 'br') and state_stack[-1] in (WHITELISTED, READABILITY)
            if not node_contains_content and not node_is_content:
                node.extract()
            stack.pop()
            state_stack.pop()
    # return out_soup.prettify()
    return unicode(out_soup)
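

# `clone_node` is used by `extract` above but isn't shown in this snippet. A
# minimal sketch of a plausible implementation, assuming it makes a childless
# copy of a tag (name plus attributes) owned by the destination soup so that
# children can be re-attached as the walk descends. Hypothetical, not the
# original:
def clone_node(tag, out_soup):
    clone = out_soup.new_tag(tag.name)
    clone.attrs = dict(tag.attrs)
    return clone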


def create_soup_with_ids(html):
    """Parse `html` and stamp every tag with a unique 'data-subscribed-id'
    attribute so nodes can be matched up across separate parses."""
    i = 1
    soup = bs4.BeautifulSoup(html, 'lxml')
    for kind, data in iterate_tree(soup):
        if kind == 'enter':
            data['data-subscribed-id'] = str(i)
            i += 1
            if data.name == 'amp-img':
                # normalize AMP images to plain <img> tags
                data.name = 'img'
    return soup


def ids_preserved_by_readability(html):
    """Run readability over the annotated HTML and collect the
    'data-subscribed-id' values of the nodes it preserves."""
    ids = set()
    extracted_html = readability.Document(html).summary(html_partial=True)
    
    for kind, data in iterate_tree(bs4.BeautifulSoup(extracted_html, 'lxml')):
        if kind == 'enter':
            id = data.get('data-subscribed-id')
            if id:
                ids.add(id)
    return ids
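

# `iterate_tree` is the traversal helper everything above depends on. A
# minimal sketch, assuming it walks the parse tree depth-first and yields
# SAX-style events: ('enter', tag) before a tag's children, ('text', string)
# for each text node, and ('exit', tag) after the children. Hypothetical
# reconstruction, not the original:
def iterate_tree(soup):
    def walk(node):
        if isinstance(node, bs4.Tag):  # BeautifulSoup itself subclasses Tag
            yield ('enter', node)
            for child in list(node.children):
                for event in walk(child):
                    yield event
            yield ('exit', node)
        elif isinstance(node, bs4.NavigableString):
            if not isinstance(node, (bs4.Comment, bs4.Doctype)):
                yield ('text', unicode(node))
    return walk(soup)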
Example #4
def populate_article_json(article, content):
    """Flatten content.html into a list of text/image segments and store
    the result on `content` as article_json."""
    if not content.html: return

    root_url = article.url

    def process_url(url):
        if url:
            return urlparse.urljoin(root_url, url)
        else:
            return None

    futures = []
    fetch_timeout = 3 # TODO: raise this?

    soup = bs4.BeautifulSoup(content.html, 'lxml')
    segments = []

    cur_segment = None
    tag_stack = []
    block_elements = set(['p', 'div', 'table', 'header', 'section', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'caption', 'pre', 'blockquote', 'li', 'figcaption'])
    text_tag_attributes = {'strong': {'bold': True}, 'b': {'bold': True}, 'em': {'italic': True}, 'i': {'italic': True}, 'a': {}, 'code': {'monospace': True}, 'span': {}}

    def create_text_segment(kind):
        s = TextSegment(kind)
        if 'blockquote' in tag_stack: s.left_padding += 1
        return s

    def ensure_text(segment):
        # if the segment passed in is text, returns it
        # otherwise, creates a new one:
        if segment is None or not segment.is_text_segment():
            # create a new text segment:
            segment = create_text_segment('p')
            segments.append(segment)
        return segment

    for event, data in iterate_tree(soup):
        # treat <br> as a literal newline in the text stream:
        if event == 'enter' and data.name == 'br':
            event = 'text'
            data = u"\n"

        if event == 'enter':
            tag_stack.append(data.name)
            if data.name in block_elements:
                # open a new block segment:
                kind = {'h1': 'h1', 'h2': 'h2', 'h3': 'h3', 'h4': 'h4', 'h5': 'h5', 'h6': 'h6', 'blockquote': 'blockquote', 'caption': 'caption', 'li': 'li', 'figcaption': 'caption'}.get(data.name, 'p')
                cur_segment = create_text_segment(kind)

                attrs = cur_segment.content[0]
                if data.name == 'pre': attrs['monospace'] = True

                segments.append(cur_segment)
            elif data.name == 'img':
                cur_segment = ImageSegment(process_url(data.get('src')))
                futures.append(cur_segment.fetch_image_data_async(fetch_timeout))
                segments.append(cur_segment)
            else:
                # this is an inline (text) tag:
                cur_segment = ensure_text(cur_segment)
                attrs = dict(text_tag_attributes.get(data.name, {}))
                if data.name == 'a':
                    attrs['link'] = process_url(data.get('href'))
                cur_segment.open_text_section(attrs)
        elif event == 'text':
            cur_segment = ensure_text(cur_segment)
            cur_segment.add_text(data)
        elif event == 'exit':
            if 'twitter-tweet' in data.get('class', []) and cur_segment and cur_segment.is_text_segment():
                # cur_segment.content[0]['color'] = 'twitter' # mark only the LAST child as twitter
                cur_segment.kind = 'caption'
            if data.name in block_elements:
                cur_segment = None
            elif data.name in text_tag_attributes and cur_segment is not None and cur_segment.is_text_segment():
                cur_segment.close_text_section()
            if data.name != 'br':  # <br> enters were turned into text above, so nothing was pushed
                tag_stack.pop()

    segments = [s for s in segments if not s.is_empty()]
    
    # resolve the queued image fetches:
    for future in futures: future()
    
    # discard small images:
    segments = [s for s in segments if not (isinstance(s, ImageSegment) and s.size and s.size[0] * s.size[1] < (100 * 100))]
    
    # discard things on the text blacklist:
    segments = [s for s in segments if not (isinstance(s, TextSegment) and text_blacklist.should_remove(s.text_content()))]

    content.article_text = u"\n"

    title_segment = None
    if article.title:
        for seg in segments[:3]:
            if normalized_compare(seg.text_content(), article.title):
                title_segment = seg

    early_h1s = [seg for seg in segments[:3] if seg.is_text_segment() and seg.kind == 'h1']
    early_h1 = early_h1s[0] if early_h1s else None

    early_images = [seg for seg in segments[:3] if isinstance(seg, ImageSegment)]
    early_image = early_images[0] if early_images else None

    if article.title and not (title_segment or early_h1):
        title_segment = TextSegment('title')
        title_segment.add_text(article.title)
        segments = [title_segment] + segments

    top_image = None
    if article.top_image and not early_image:
        top_image = ImageSegment(article.top_image)
        top_image.fetch_image_data_async(2)()  # start the fetch and resolve it immediately
        segments = [top_image] + segments

    # identify parts of the title:
    title_seg = first_non_null([title_segment, early_h1])
    if title_seg: title_seg.is_part_of_title = True

    title_image = first_non_null([early_image, top_image])
    if title_image: title_image.is_part_of_title = True

    # place the byline/meta line just after the last title segment (index 0 if none):
    index_to_insert_meta_line = ([0] + [i + 1 for i, seg in enumerate(segments) if seg.is_part_of_title])[-1]
    meta_line = create_meta_line(article)
    meta_line.is_part_of_title = True
    segments.insert(index_to_insert_meta_line, meta_line)

    content.text = u"\n".join([seg.text_content() for seg in segments if seg.is_text_segment() and seg.kind not in ('title', 'meta')])
    content.is_low_quality_parse = len(content.text.split(" ")) < 50
    content.article_json = {"segments": [s.json() for s in segments], "is_low_quality_parse": content.is_low_quality_parse}

    if title_image:
        article.top_image = title_image.src
        article.top_image_tiny_json = title_image.tiny
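

# `first_non_null` and `normalized_compare` are small helpers referenced above
# but not shown here. Minimal sketches of plausible implementations; treat
# both as assumptions rather than the originals:
def first_non_null(items):
    # return the first element that is not None, or None if all are
    for item in items:
        if item is not None:
            return item
    return None

def normalized_compare(a, b):
    # compare strings ignoring case and whitespace differences
    def norm(s):
        return u' '.join((s or u'').lower().split())
    return norm(a) == norm(b)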