def possible_author(self, node):
    goods = ['author', 'by', 'publi', 'write', 'written', 'info']
    attr_values = node.attrib.values()
    if any(g in a for a in attr_values for g in goods):
        return True
    txt = get_text_and_tail(node)
    if re.search(r'\b(author|by)[: ]', txt):
        return True
    return False
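# A minimal usage sketch for possible_author (hypothetical markup; assumes an
# extractor instance carrying this method and that lxml.html is importable):
#
#   import lxml.html
#   node = lxml.html.fromstring('<span class="author-name">By Jane Doe</span>')
#   extractor.possible_author(node)  # -> True: 'author' occurs in the class value
#   node = lxml.html.fromstring('<span class="price">EUR 5,00</span>')
#   extractor.possible_author(node)  # -> False: no author-like attribute or text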
def get_fingerprints(self, node):
    res = []
    text = normalize(get_text_and_tail(node)).strip()
    if node.tag == 'a' and 'href' in node.attrib:
        res = [(node.tag, node.attrib['href'], '', '')]
    if text:
        res += [(node.tag, a, node.attrib[a], text) for a in node.attrib]
        if node.tag == 'a':
            res += [(node.tag, '', '', text)]
    if not res:
        res = [(node.tag, '', '', text)]
    else:
        if 'style' in node.attrib:
            tmp = re.findall(r'background-image:[ ]*url\((http[^)]+)',
                             node.attrib['style'])
            if tmp:
                # note: a 2-tuple, unlike the 4-tuple fingerprints above
                res += [('style', tmp[0])]
    return res
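# Sketch of the fingerprints produced for an anchor (hypothetical input; the
# exact tuples depend on what normalize and get_text_and_tail return):
#
#   node = lxml.html.fromstring('<a href="/about" class="nav">About us</a>')
#   extractor.get_fingerprints(node)
#   # -> [('a', '/about', '', ''),
#   #     ('a', 'href', '/about', 'About us'),
#   #     ('a', 'class', 'nav', 'About us'),
#   #     ('a', '', '', 'About us')]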
def get_author(tree, lang='en'):
    # A couple of ways of matching:
    # - Both in meta and in text
    # - Node has one of the goods in it, the text has it too, and the casing
    #   is author-like
    # - Node has one of the goods in it
    # - Text has one of the goods in it
    goods = ['author', 'by', 'publi', 'write', 'written', 'info']
    hard_authors = []
    meta_authors = []
    text_hard_authors = []
    text_soft_authors = []
    meta_nodes = tree.xpath('//head/meta')
    for option in goods:
        for meta in meta_nodes:
            if not any(option in a for a in meta.values()):
                continue
            # This can still go quite wrong (the attribute names themselves
            # still need to be checked against `goods`)
            for attr in meta.attrib:
                author = get_text_author(
                    author_translation(meta.attrib[attr], lang))
                if author:
                    meta_authors.append(author)
    for num, node in enumerate(tree.iter()):
        # hard author: the node (or one of its ancestors) carries an
        # author-like attribute value
        if not any(g in a for a in node.attrib.values() for g in goods):
            for parent in node.iterancestors():
                attr_values = parent.attrib.values()
                if any(g in a for a in attr_values for g in goods):
                    break
            else:
                # no author-like ancestor either; skip this node
                continue
        tailtext = get_text_and_tail(node).strip()
        if tailtext and len(tailtext) < 200:
            if lang != 'en':
                tailtext = author_translation(tailtext, lang)
            hard_author = get_text_author(tailtext)
            if hard_author:
                hard_authors.append((num, hard_author))
    for num, node in enumerate(tree.iter()):
        tailtext = get_text_and_tail(node).strip()
        if tailtext and len(tailtext) < 200:
            res = re.findall(r"(author|Author|AUTHOR)[:;]* ([A-Z][a-zA-Z' ]+)",
                             tailtext)
            if res:
                # keep only the captured name, not the (prefix, name) tuple
                res = res[0][1]
                if res in meta_authors:
                    text_hard_authors.append((res, num))
                else:
                    text_soft_authors.append((res, num))
            else:
                res = re.findall(r"\b(by|By|BY)[:;]* ([A-Z][a-zA-Z' ]+)",
                                 tailtext)
                if res:
                    res = res[0][1]  # same: extract just the name
                    if res in meta_authors:
                        text_hard_authors.append((res, num))
                    else:
                        text_soft_authors.append((res, num))
    hardest_authors = []
    not_hardest_authors = []
    for num, ha in hard_authors:
        if ha in meta_authors:
            hardest_authors.append((ha, num))
        else:
            not_hardest_authors.append((ha, num))
    meta_authors = set(meta_authors)
    return (hardest_authors, not_hardest_authors, text_hard_authors,
            text_soft_authors, meta_authors)
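# A rough sketch of how the tiers are meant to be consumed (process() below
# does the real version of this; title_index is hypothetical): walk the tiers
# hardest-first and keep the hit whose node sits closest to the title.
#
#   tiers = get_author(tree, lang='en')
#   author = ''
#   for tier in tiers[:4]:  # the four positional lists; the meta set is the last resort
#       if tier:
#           author = min(tier, key=lambda t: abs(t[1] - title_index))[0]
#           break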
def get_dates(tree, titleind=(None, 1), lang='en'):
    # TODO: make this faster; the fuzzy matching makes it slow
    hard_dates = []
    soft_dates = []
    fuzzy_hard_dates = []
    fuzzy_soft_dates = []
    meta_nodes = tree.xpath('//head/meta')
    goods = ['ublish', 'ublicat', 'date', 'time']
    for option in goods:
        for meta in meta_nodes:
            if not any(option in a for a in meta.values()):
                continue
            for attr in meta.attrib:
                soft_dates.append(
                    get_text_date(date_translation(meta.attrib[attr], lang)))
    for num, node in enumerate(tree.iter()):
        candi_dates = [
            v for k, v in node.items() if v and any(x in k for x in goods)
        ]
        for v in candi_dates:
            if within_years(v):
                if lang != 'en':
                    v = date_translation(v, lang)
                d = get_text_date(v)
                if d:
                    soft_dates.append(d)
                else:
                    fuzzy_soft_dates.append(get_text_date(v, fuzzy=True))
        # hard date
        tailtext = get_text_and_tail(node).strip()
        if tailtext and within_years(tailtext):
            if lang != 'en':
                tailtext = date_translation(tailtext, lang)
            hard_date = get_text_date(tailtext)
            if hard_date:
                hard_dates.append((num, hard_date))
            else:
                fuzzy_hard_dates.append(
                    (num, get_text_date(tailtext, fuzzy=True)))
    soft_dates = set(x for x in soft_dates if x)
    fuzzy_soft_dates = set(x for x in fuzzy_soft_dates if x)
    fuzzy_hard_dates = [x for x in fuzzy_hard_dates if x]
    # Note that num and hd get switched here
    hardest_dates = []
    not_hardest_dates = []
    for num, hd in hard_dates:
        if hd in soft_dates:
            hardest_dates.append((hd, num))
        else:
            not_hardest_dates.append((hd, num))
    fuzzy_hardest_dates = []
    for num, hd in fuzzy_hard_dates:
        if hd in fuzzy_soft_dates:
            fuzzy_hardest_dates.append((hd, num))
        else:
            not_hardest_dates.append((hd, num))
    # if nothing matched, try plain parsing on each node, falling back to fuzzy
    non_fuzzy_any = []
    fuzzy_any = []
    if not any(
            [hardest_dates, fuzzy_hardest_dates, not_hardest_dates, soft_dates]):
        # no leads; try to parse every node's text, non-fuzzy first
        for num, node in enumerate(tree.iter()):
            node_text = get_text_and_tail(node).strip()
            non_fuzzy_text = get_text_date(node_text, fuzzy=False)
            if non_fuzzy_text:
                non_fuzzy_any.append((non_fuzzy_text, num))
            else:
                fuzzy_text = get_text_date(node_text, fuzzy=True)
                if fuzzy_text:
                    fuzzy_any.append((fuzzy_text, num))
    date = ''
    date_node_index = None
    for dt in [
            hardest_dates, fuzzy_hardest_dates, not_hardest_dates,
            non_fuzzy_any, fuzzy_any
    ]:
        if dt:
            date, date_node_index = sorted(
                dt, key=lambda x: abs(x[1] - titleind[1]))[0]
            break
    if not date and soft_dates:
        date = next(iter(soft_dates))
    all_dates = [
        hardest_dates, fuzzy_hardest_dates, not_hardest_dates, non_fuzzy_any,
        fuzzy_any
    ]
    if date_node_index is not None:
        # This goes wrong when a year is mentioned in the title: it then
        # removes the title as well.
        # print('removing date content', node.text) ...... is this still relevant?
        date_node_indices = [[y[1] for y in x if y[0] == date]
                             for x in all_dates]
        date_node_indices = [item for sub in date_node_indices for item in sub]
        for num, node in enumerate(tree.iter()):
            if num in date_node_indices:
                # maybe this now removes too little
                tt = get_text_and_tail(node)
                if (tt and len(tt) < 25) or ('|' in tt and len(tt) < 50):
                    node.text = ''
                    node.tail = ''
    # fd.nl publishes relative dates: "Vandaag"/"Gisteren" are Dutch for
    # "today"/"yesterday"
    if not date:
        now = datetime.datetime.now()
        if tree.xpath('//time[contains(text(), "Vandaag")]'):
            date = now.strftime('%Y-%m-%d')
        elif tree.xpath('//time[contains(text(), "Gisteren")]'):
            yesterday = now - datetime.timedelta(1)
            date = yesterday.strftime('%Y-%m-%d')
    return date
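# Illustrative behaviour of the helpers used above (the signatures come from
# the calls above; the outputs are hypothetical, assuming get_text_date wraps
# a dateutil-style parser):
#
#   within_years('Published 2016-03-01')             # True if a plausible year occurs
#   get_text_date('Published: 2016-03-01')           # exact parse -> '2016-03-01'
#   get_text_date('about 500 reviews', fuzzy=True)   # fuzzy scan; may misfire,
#                                                    # hence the slowness noted above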
def process(self, url, tree, remove_visuals, exclude_data):
    self.remove_bad_xpaths_from_tree(tree)
    if self.detected_language is None:
        self.detected_language = get_language(
            tree, self.url_to_headers_mapping[url], self.domain)
    # print('language: {}'.format(self.detected_language))
    # pre_text_content = normalize('\n'.join(
    #     [get_text_and_tail(x) for x in tree.iter()]))
    # The author has to be extracted before duplicate removal, since an
    # author is likely to occur more often
    self.domain_nodes_dict.remove_template(tree)
    (hardest_authors, not_hardest_authors, text_hard_authors,
     text_soft_authors, meta_authors) = get_author(tree, self.detected_language)
    self.domain_nodes_dict.remove_author(tree)
    title = getRuleTitle(tree)
    # filter duplicate images by src
    ok_imgs = get_images(tree)
    titleind = ()
    imginds = []
    contentinds = []  # such as title, date and later author
    link_eles = [link[0] for link in tree.iterlinks()
                 if link[0].tag == 'a' and link[2] and
                 link[2].startswith(self.domain) and
                 get_text_and_tail(link[0]).strip()]
    linkinds = []
    for num, node in enumerate(tree.iter()):
        if node in ok_imgs:
            imginds.append((node, num))
        elif normalize(get_text_and_tail(node)) == title:
            titleind = (node, num)
        elif get_text_and_tail(node).strip():
            if node in link_eles:
                linkinds.append((node, num))
            contentinds.append((node, num))
        # Clean up visual clutter
        if remove_visuals:
            if node.tag == 'input':
                node.set('type', 'hidden')
            elif node.tag == 'a' and not get_text_and_tail(node).strip():
                for att in node.attrib:
                    node.set(att, '')
            if node.tag == 'img':
                node.set('alt', '')
            if node.attrib and 'background-image' in node.attrib:
                node.set('background-image', '')
    if not titleind:
        # fuzzy token text / title matching
        title_set = set(title.split())
        for num, node in enumerate(tree.iter()):
            text_content = get_text_and_tail(node)
            if text_content and len(text_content) < 500:
                text_set = set(text_content.split())
                if fscore(title_set, text_set) > 0.5:
                    titleind = (node, num)
                    break
    if titleind:
        sortedimgs = sorted(imginds, key=lambda x: abs(x[1] - titleind[1]))
    else:
        sortedimgs = []
    images = []
    for x in sortedimgs:
        val = None
        if 'src' in x[0].attrib:
            val = x[0].attrib['src']
        elif 'content' in x[0].attrib:
            val = x[0].attrib['content']
        elif 'style' in x[0].attrib:
            tmp = re.findall(r'background-image:[ ]*url\((http[^)]+)',
                             x[0].attrib['style'])
            if tmp:
                val = tmp[0]
        if val is not None and val not in images:
            images.append(val)
    author = ''
    author_node_index = None
    date = '1970-01-01'
    if titleind:
        date = get_dates(tree, titleind, self.detected_language)
        # excluding soft dates (meta); they won't work here anyway
        for at in [hardest_authors, not_hardest_authors, text_hard_authors,
                   text_soft_authors]:
            if at:
                author, author_node_index = sorted(
                    at, key=lambda x: abs(x[1] - titleind[1]))[0]
                break
    if not author and meta_authors:
        author = next(iter(meta_authors))
    if author_node_index is not None:
        for num, node in enumerate(tree.iter()):
            if num == author_node_index:
                break
        # This goes wrong when a year is mentioned in the title: it then
        # removes the title as well.
        # print('removing author content', node.text)
        node.text = ''
        node.tail = ''
    cleaned_html = lxml.html.tostring(tree).decode('utf8')
    body_content = self.get_content(cleaned_html)
    if not body_content:
        body_content = []
    title_len = len(title)
    title_tokens = set(title.split())
    len_title_tokens = len(title_tokens)
    last_text_node_num = get_last_text_non_a_node(tree)
    for num, x in enumerate(tree.iter()):
        txt = normalize(get_text_and_tail(x))
        if txt:
            if titleind and num < titleind[1]:
                # print('removed pre-title', txt)
                x.text = ''
                x.tail = ''
                continue
            if last_text_node_num > 0 and num > last_text_node_num:
                # print('removed post-content', txt)
                x.text = ''
                continue
            n = len(txt)
            # remove the title from the body text
            txt_tokens = set(txt.split())
            n_matching = len(txt_tokens & title_tokens)
            if (n < title_len * 3 and
                    n_matching / len(txt_tokens) > 0.3 and
                    n_matching / len_title_tokens > 0.3):
                # print('removed!', txt)
                continue
            body_content.append(txt)
    links = [x.attrib['href'] for x in tree.xpath('//a')
             if 'href' in x.attrib and
             x.attrib['href'].startswith(self.domain) and
             self.should_save(x.attrib['href'])]
    money_amounts = (money.find('\n'.join(body_content), 1000) +
                     money.find(title, 1000))
    data = {'title': title,
            'body': body_content,
            'images': images,
            'publish_date': str(date),
            'author': author,
            'cleaned': cleaned_html,
            'language': self.detected_language,
            'url': url,
            'domain': self.domain,
            'money': money_amounts,
            'summary': '',
            'related': get_sorted_links(links, url)[:5]}
    if 'overwrite_values_by_xpath' in self.config:
        for k, v in self.config['overwrite_values_by_xpath'].items():
            new = tree.xpath(v)
            data[k] = new[0] if isinstance(new, list) else new
    filtered_data = {k: v for k, v in data.items() if k not in exclude_data}
    return filtered_data
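# Hypothetical end-to-end usage (the key names come from the data dict above;
# the crawler instance, url and tree are assumed to exist):
#
#   result = crawler.process(url, tree, remove_visuals=True,
#                            exclude_data=['cleaned'])
#   result['title'], result['publish_date'], result['author'], result['related']
#   # 'cleaned' is absent because it is listed in exclude_data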