import re

import lxml.html


def getRuleTitle(tree, rule_dict=rule_dc):  # rule_dc: module-level default rule dictionary
    """Return the title candidate that best matches the page's meta titles."""
    texts = sorted_title_candidates(tree, rule_dict)
    metas = get_meta_titles(tree)
    maxs = 0
    ele = ''
    for x in texts:
        xs = x.lower().split()
        for y in metas:
            ys = y.lower().split()
            newm = fscore(xs, ys)
            if newm > maxs:
                maxs = newm
                ele = x
    return ele
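
# NOTE: sorted_title_candidates, get_meta_titles and fscore are helpers defined
# elsewhere in this package. For orientation, fscore is assumed to be a
# token-overlap F-score between two token collections; the sketch below (the
# hypothetical name _fscore_sketch marks it as an illustration, not the
# package's actual implementation) shows the behaviour getRuleTitle relies on:
# 1.0 for identical token sets, 0.0 for disjoint ones.
def _fscore_sketch(tokens_a, tokens_b):
    """Harmonic mean of token-overlap precision and recall (assumption)."""
    a, b = set(tokens_a), set(tokens_b)
    overlap = len(a & b)
    if not overlap:
        return 0.0
    precision = overlap / len(a)
    recall = overlap / len(b)
    return 2 * precision * recall / (precision + recall)
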
def process(self, url, tree, remove_visuals, exclude_data):
    self.remove_bad_xpaths_from_tree(tree)
    if self.detected_language is None:
        self.detected_language = get_language(
            tree, self.url_to_headers_mapping[url], self.domain)
    # print('language: {}'.format(self.detected_language))
    # pre_text_content = normalize('\n'.join([get_text_and_tail(x) for x in tree.iter()]))
    # Author extraction has to be attempted before duplicate removal, since an
    # author is likely to occur more often.
    self.domain_nodes_dict.remove_template(tree)
    (hardest_authors, not_hardest_authors, text_hard_authors,
     text_soft_authors, meta_authors) = get_author(tree, self.detected_language)
    self.domain_nodes_dict.remove_author(tree)
    title = getRuleTitle(tree)
    # Filter duplicate images by src.
    ok_imgs = get_images(tree)
    titleind = ()
    imginds = []
    contentinds = []
    # Internal links with visible text, such as title, date and, later, author.
    link_eles = [link[0] for link in tree.iterlinks()
                 if link[0].tag == 'a' and link[2] and
                 link[2].startswith(self.domain) and
                 get_text_and_tail(link[0]).strip()]
    linkinds = []
    for num, node in enumerate(tree.iter()):
        if node in ok_imgs:
            imginds.append((node, num))
        elif normalize(get_text_and_tail(node)) == title:
            titleind = (node, num)
        elif get_text_and_tail(node).strip():
            if node in link_eles:
                linkinds.append((node, num))
            contentinds.append((node, num))
        # Clean up visual trash.
        if remove_visuals:
            if node.tag == 'input':
                node.set('type', 'hidden')
            elif node.tag == 'a' and not get_text_and_tail(node).strip():
                for att in node.attrib:
                    node.set(att, '')
            if node.tag == 'img':
                node.set('alt', '')
            if node.attrib and 'background-image' in node.attrib:
                node.set('background-image', '')
    if not titleind:
        # Fuzzy token text / title matching.
        title_set = set(title.split())
        for num, node in enumerate(tree.iter()):
            text_content = get_text_and_tail(node)
            if text_content and len(text_content) < 500:
                text_set = set(text_content.split())
                if fscore(title_set, text_set) > 0.5:
                    titleind = (node, num)
                    break
    if titleind:
        sortedimgs = sorted(imginds, key=lambda x: abs(x[1] - titleind[1]))
    else:
        sortedimgs = []
    images = []
    for x in sortedimgs:
        val = None
        if 'src' in x[0].attrib:
            val = x[0].attrib['src']
        elif 'content' in x[0].attrib:
            val = x[0].attrib['content']
        elif 'style' in x[0].attrib:
            tmp = re.findall(r'background-image:[ ]*url\((http[^)]+)',
                             x[0].attrib['style'])
            if tmp:
                val = tmp[0]
        if val is not None and val not in images:
            images.append(val)
    author = ''
    author_node_index = None
    date = "1970-01-01"
    if titleind:
        date = get_dates(tree, titleind, self.detected_language)
        # Excluding soft dates (meta); they won't work anyway.
        for at in [hardest_authors, not_hardest_authors,
                   text_hard_authors, text_soft_authors]:
            if at:
                author, author_node_index = sorted(
                    at, key=lambda x: abs(x[1] - titleind[1]))[0]
                break
    if not author and meta_authors:
        # Fall back to the first meta author.
        author = meta_authors[0]
    if author_node_index is not None:
        # Walk to the author node, then clear it (the loop variable `node`
        # keeps its value after the break).
        for num, node in enumerate(tree.iter()):
            if num == author_node_index:
                break
        # This goes wrong when some year is mentioned in the title: it then
        # removes the title.
        # print('removing author content', node.text)
        node.text = ''
        node.tail = ''
    cleaned_html = lxml.html.tostring(tree).decode('utf8')
    body_content = self.get_content(cleaned_html)
    if not body_content:
        body_content = []
    title_len = len(title)
    title_tokens = set(title.split())
    len_title_tokens = len(title_tokens)
    last_text_node_num = get_last_text_non_a_node(tree)
    for num, x in enumerate(tree.iter()):
        txt = normalize(get_text_and_tail(x))
        if txt:
            # Guard: titleind may still be empty if no title node was found.
            if titleind and num < titleind[1]:
                # print('removed pre-title', txt)
                x.text = ''
                x.tail = ''
                continue
            if last_text_node_num > 0 and num > last_text_node_num:
                # print('removed post-content', txt)
                x.text = ''
                continue
            n = len(txt)
            # Remove near-repeats of the title (guard against an empty title).
            txt_tokens = set(txt.split())
            n_matching = len(txt_tokens & title_tokens)
            if (len_title_tokens and n < title_len * 3 and
                    n_matching / len(txt_tokens) > 0.3 and
                    n_matching / len_title_tokens > 0.3):
                # print('removed!', txt)
                continue
            body_content.append(txt)
    links = [x.attrib['href'] for x in tree.xpath('//a')
             if 'href' in x.attrib and
             x.attrib['href'].startswith(self.domain) and
             self.should_save(x.attrib['href'])]
    money_amounts = money.find('\n'.join(body_content), 1000) + money.find(title, 1000)
    data = {'title': title,
            'body': body_content,
            'images': images,
            'publish_date': str(date),
            'author': author,
            'cleaned': cleaned_html,
            'language': self.detected_language,
            'url': url,
            'domain': self.domain,
            'money': money_amounts,
            'summary': '',
            'related': get_sorted_links(links, url)[:5]}
    if 'overwrite_values_by_xpath' in self.config:
        for k, v in self.config['overwrite_values_by_xpath'].items():
            new = tree.xpath(v)
            # Guard against empty XPath results before taking the first item.
            data[k] = new[0] if isinstance(new, list) and new else new
    filtered_data = {k: v for k, v in data.items() if k not in exclude_data}
    return filtered_data
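
if __name__ == '__main__':
    # Minimal smoke test of getRuleTitle on a toy document. This is only a
    # sketch: it assumes sorted_title_candidates falls back to heading text
    # and that get_meta_titles reads the <title>/<meta> tags, which may
    # differ from the real helpers' behaviour.
    html = """
    <html><head><title>Example article title</title></head>
    <body><h1>Example article title</h1><p>Body text here.</p></body></html>
    """
    tree = lxml.html.fromstring(html)
    print(getRuleTitle(tree))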