def extract(html, url):
    """Extract the content-bearing subtree of *html* into a fresh soup.

    Walks the input document (after tagging every element with a unique
    ``data-subscribed-id``) and mirrors nodes into an output ``<div>``,
    keeping text only where the enclosing element is whitelisted or was
    preserved by readability, and pruning empty non-content elements on
    the way back up.  Returns the resulting markup as a unicode string.

    :param html: raw HTML of the page.
    :param url: page URL, used to look up per-site id white/blacklists.
    """
    in_soup = create_soup_with_ids(html)
    # re-serialize so the readability pass sees the injected ids
    html = unicode(in_soup)
    out_soup = bs4.BeautifulSoup('<div></div>', 'lxml')
    # stack of output nodes mirroring the current path in the input tree;
    # stack[-1] is the parent that newly-cloned nodes get appended to
    stack = [out_soup.find('div')]
    blacklist, whitelist = id_blacklist_and_whitelist_for_soup(in_soup, url)
    readability_ids = ids_preserved_by_readability(html)
    # per-node keep/drop states (state_stack[-1] applies to the node
    # currently being visited):
    NONE = 1          # undecided — keep structure but not text
    WHITELISTED = 2   # keep, and force-keep for all descendants
    READABILITY = 3   # keep because readability preserved this node
    BLACKLISTED = 4   # drop, and force-drop for all descendants
    # print 'IN SOUP', in_soup.prettify().encode('utf-8')
    # print readability_ids
    state_stack = [NONE]
    for kind, data in iterate_tree(in_soup):
        if kind == 'enter':
            id = data.get('data-subscribed-id')  # NOTE: shadows builtin `id`
            parent_state = state_stack[-1]
            if parent_state in (WHITELISTED, BLACKLISTED):
                # white/blacklisting is sticky: descendants inherit it
                state = parent_state
            else:
                if id in readability_ids:
                    state = READABILITY
                else:
                    state = NONE
                # explicit white/blacklist decisions override readability
                if id in whitelist or should_auto_whitelist(data):
                    state = WHITELISTED
                elif id in blacklist or should_auto_blacklist(data):
                    state = BLACKLISTED
            if id:
                # strip the bookkeeping attribute so it never reaches output
                del data['data-subscribed-id']
            state_stack.append(state)
            if data.name != '[document]':
                clone = clone_node(data, out_soup)
                stack[-1].append(clone)
                stack.append(clone)
        elif kind == 'text':
            # text survives only inside whitelisted/readability subtrees
            if state_stack[-1] in (WHITELISTED, READABILITY):
                stack[-1].append(data)
        elif kind == 'exit':
            node = stack[-1]
            node_contains_content = (len(list(node)) > 0)
            # void elements count as content in their own right, but only
            # when we actually decided to keep this subtree
            node_is_content = node.name in ('img', 'video', 'object', 'hr', 'br') and state_stack[-1] in (WHITELISTED, READABILITY)
            if not node_contains_content and not node_is_content:
                # prune empty wrappers bottom-up
                node.extract()
            # NOTE(review): this pops unconditionally, including on the
            # '[document]' exit where 'enter' pushed nothing — presumably
            # iterate_tree emits the document exit last, so the only node
            # popped "extra" is the root div; confirm against iterate_tree.
            stack.pop()
            state_stack.pop()
    # return out_soup.prettify()
    return unicode(out_soup)
def create_soup_with_ids(html):
    """Parse *html* and stamp each element with a unique ``data-subscribed-id``.

    Ids are sequential decimal strings starting at "1", assigned in tree
    (enter) order.  AMP image tags are normalized to plain ``<img>`` so the
    rest of the pipeline treats them uniformly.  Returns the tagged soup.
    """
    soup = bs4.BeautifulSoup(html, 'lxml')
    next_id = 1
    for event, node in iterate_tree(soup):
        if event != 'enter':
            continue
        node['data-subscribed-id'] = str(next_id)
        next_id += 1
        # normalize AMP images to regular <img> elements
        if node.name == 'amp-img':
            node.name = 'img'
    return soup
def ids_preserved_by_readability(html):
    """Return the set of ``data-subscribed-id`` values readability keeps.

    Runs readability's content extraction over *html* (which must already
    carry the injected ids), re-parses the extracted fragment, and collects
    every id attribute that survived the extraction.
    """
    extracted_html = readability.Document(html).summary(html_partial=True)
    # print 'READABILITY SOUP', bs4.BeautifulSoup(extracted_html, 'lxml').prettify().encode('utf-8')
    surviving = set()
    for event, node in iterate_tree(bs4.BeautifulSoup(extracted_html, 'lxml')):
        if event != 'enter':
            continue
        node_id = node.get('data-subscribed-id')
        if node_id:
            surviving.add(node_id)
    return surviving
def populate_article_json(article, content):
    """Convert ``content.html`` into a flat list of text/image segments.

    Walks the extracted HTML, building TextSegment/ImageSegment objects,
    filters out empty/small/blacklisted segments, synthesizes title, top
    image and meta-line segments, and writes ``content.text``,
    ``content.is_low_quality_parse`` and ``content.article_json``.
    May also update ``article.top_image`` / ``article.top_image_tiny_json``.

    :param article: article record providing url, title and top_image.
    :param content: content record; read for ``html``, mutated with results.
    """
    if not content.html:
        return
    root_url = article.url

    def process_url(url):
        # resolve relative URLs against the article URL; None stays None
        if url:
            return urlparse.urljoin(root_url, url)
        else:
            return None

    futures = []
    fetch_timeout = 3  # TODO: raise this?
    soup = bs4.BeautifulSoup(content.html, 'lxml')
    segments = []
    cur_segment = None
    tag_stack = []
    # tags that open a fresh block-level text segment:
    block_elements = set(['p', 'div', 'table', 'header', 'section', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'caption', 'pre', 'blockquote', 'li', 'figcaption'])
    # inline tags and the text attributes they switch on:
    text_tag_attributes = {'strong': {'bold': True}, 'b': {'bold': True}, 'em': {'italic': True}, 'i': {'italic': True}, 'a': {}, 'code': {'monospace': True}, 'span': {}}

    def create_text_segment(kind):
        s = TextSegment(kind)
        # indent text that sits inside a blockquote
        if 'blockquote' in tag_stack:
            s.left_padding += 1
        return s

    def ensure_text(segment):
        # if the segment passed in is text, returns it;
        # otherwise, creates (and records) a new paragraph segment:
        if segment is None or not segment.is_text_segment():
            segment = create_text_segment('p')
            segments.append(segment)
        return segment

    for (event, data) in iterate_tree(soup):
        # <br> is treated as a literal newline, not as a tag
        if event == 'enter' and data.name == 'br':
            event = 'text'
            data = u"\n"
        if event == 'enter':
            tag_stack.append(data.name)
            if data.name in block_elements:
                # open a new block segment:
                kind = {'h1': 'h1', 'h2': 'h2', 'h3': 'h3', 'h4': 'h4', 'h5': 'h5', 'h6': 'h6', 'blockquote': 'blockquote', 'caption': 'caption', 'li': 'li', 'figcaption': 'caption'}.get(data.name, 'p')
                cur_segment = create_text_segment(kind)
                attrs = cur_segment.content[0]
                if data.name == 'pre':
                    attrs['monospace'] = True
                segments.append(cur_segment)
            elif data.name == 'img':
                cur_segment = ImageSegment(process_url(data.get('src')))
                # image bytes are fetched concurrently; resolved after the walk
                futures.append(cur_segment.fetch_image_data_async(fetch_timeout))
                segments.append(cur_segment)
            else:
                # this is an inline (text) tag:
                cur_segment = ensure_text(cur_segment)
                attrs = dict(text_tag_attributes.get(data.name, {}))
                if data.name == 'a':
                    attrs['link'] = process_url(data.get('href'))
                cur_segment.open_text_section(attrs)
        elif event == 'text':
            cur_segment = ensure_text(cur_segment)
            cur_segment.add_text(data)
        elif event == 'exit':
            # BUGFIX: is_text_segment was referenced without calling it, so the
            # guard was always truthy and could set .kind on an ImageSegment.
            if 'twitter-tweet' in data.get('class', []) and cur_segment and cur_segment.is_text_segment():
                # cur_segment.content[0]['color'] = 'twitter'
                # mark only the LAST child as twitter
                cur_segment.kind = 'caption'
            if data.name in block_elements:
                cur_segment = None
            elif data.name in text_tag_attributes and cur_segment is not None and cur_segment.is_text_segment():
                cur_segment.close_text_section()
            # <br> enters were rewritten to 'text' above, so nothing was pushed
            if data.name != 'br':
                tag_stack.pop()

    segments = [s for s in segments if not s.is_empty()]
    # wait for all image fetches to resolve:
    for future in futures:
        future()
    # discard small images:
    segments = [s for s in segments if not (isinstance(s, ImageSegment) and s.size and s.size[0] * s.size[1] < (100 * 100))]
    # discard things on the text blacklist:
    segments = [s for s in segments if not (isinstance(s, TextSegment) and text_blacklist.should_remove(s.text_content()))]
    content.article_text = u"\n"
    # look for an existing segment that already renders the title:
    title_segment = None
    if article.title:
        for seg in segments[:min(3, len(segments))]:
            if normalized_compare(seg.text_content(), article.title):
                title_segment = seg
    early_h1s = [seg for seg in segments[:min(3, len(segments))] if seg.is_text_segment() and seg.kind == 'h1']
    early_h1 = early_h1s[0] if early_h1s else None
    early_images = [seg for seg in segments[:min(3, len(segments))] if isinstance(seg, ImageSegment)]
    early_image = early_images[0] if early_images else None
    # synthesize a title segment only when the content has none:
    if article.title and not (title_segment or early_h1):
        title_segment = TextSegment('title')
        title_segment.add_text(article.title)
        segments = [title_segment] + segments
    top_image = None
    if article.top_image and not early_image:
        top_image = ImageSegment(article.top_image)
        top_image.fetch_image_data_async(2)()  # fetch synchronously, 2s timeout
        segments = [top_image] + segments
    # identify parts of the title:
    title_seg = first_non_null([title_segment, early_h1])
    if title_seg:
        title_seg.is_part_of_title = True
    title_image = first_non_null([early_image, top_image])
    if title_image:
        title_image.is_part_of_title = True
    # the meta line goes right after the last title-marked segment (or at 0):
    index_to_insert_meta_line = ([0] + [i + 1 for i, seg in enumerate(segments) if seg.is_part_of_title])[-1]
    meta_line = create_meta_line(article)
    meta_line.is_part_of_title = True
    segments.insert(index_to_insert_meta_line, meta_line)
    content.text = u"\n".join([seg.text_content() for seg in segments if seg.is_text_segment() and seg.kind not in ('title', 'meta')])
    content.is_low_quality_parse = len(content.text.split(" ")) < 50
    content.article_json = {"segments": [s.json() for s in segments], "is_low_quality_parse": content.is_low_quality_parse}
    if title_image:
        article.top_image = title_image.src
        article.top_image_tiny_json = title_image.tiny