def main_text_extraction(root):
    """Extract the main text content area of a detail page.

    :param HTMLNode root: root node of the parsed page
    :return: list of HTMLNode: the nodes forming the main content area
    """
    main_content = []
    # Start from the node carrying the most text on the page.
    ret_dict = find_most_text_node(root)
    main_node = root.find_id(ret_dict['id'])
    main_content.append(main_node)
    # Examine the siblings of the main node for additional content.
    parent = main_node.parent
    neighbors = parent.get_children()
    ind = neighbors.index(main_node)
    # Left siblings are walked from the main node outwards.
    left_neighbors = list(reversed(neighbors[:ind]))
    # BUG FIX: the original slice [ind + 1:-1] silently dropped the last
    # sibling; [ind + 1:] considers every right neighbor.  (The "or []"
    # guards were also dead code: a slice of a list is never falsy-None.)
    right_neighbors = neighbors[ind + 1:]
    # The threshold of 10 words is hardcoded and can be adjusted.
    for l in left_neighbors:
        if tp.number_of_words(l.get_full_text()) >= 10:
            main_content.insert(0, l)
        elif l.type in HEADER:
            # A header marks the start of the content area: include it
            # and stop extending to the left.
            main_content.insert(0, l)
            break
    for r in right_neighbors:
        if tp.number_of_words(r.get_full_text()) >= 10:
            main_content.append(r)
    return main_content
def score_check(self, node):
    """Score how likely *node* is an address block.

    :param HTMLNode node: node to score
    :return: float: score normalised to the range [0.0, 1.0]
    """
    # Preparations
    score = 0
    weights = [2, 2, 2, 2]
    max_value = sum(weights)
    text = DataValidator.flatten_text(node.text)
    # Condition 1: does NOT contain any header tags.
    # (The original comment said "has header tags", but the code scores
    # the absence of headers -- addresses are plain text.)
    header_tags = ('header', 'h', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    if not any(node.has_tag(tag) for tag in header_tags):
        score += weights[0]
    # Condition 2: contains a street name and house number.
    # Allowed are e.g.: "Somestreet 12", "Somestreet 12a"
    # but not: "Somestreet 12sometextwithoutanywhitespace"
    if re.search(r'([A-Z]{1}[a-z]+\s+[0-9]{1,3}\w?\s+'
                 r'|[A-Z]{1}[a-z]+\s+[0-9]{1,3}\w?$)', text) is not None:
        score += weights[1]
    # Condition 3: contains a postal code (German format, 5 digits).
    if re.search(r'(\s+[0-9]{5}\s+|\s+[0-9]{5}$)', text) is not None:
        score += weights[2]
    # Condition 4: contains at most 10 words.
    if tp.number_of_words(text) <= 10:
        score += weights[3]
    return float(score) / float(max_value)
def score_check(self, node):
    """Score *node* on a set of heading/label heuristics.

    :param HTMLNode node: node to score
    :return: float: score normalised to the range [0.0, 1.0]
    """
    # Preparations
    score = 0
    weights = [2, 3, 2, 2, 2, 1]
    max_value = sum(weights)
    # Hoisted: the flattened text is needed by three conditions below.
    text = DataValidator.flatten_text(node.text)
    # Condition 1: contains more uppercase than lowercase words.
    if not DataValidator.contains_more_lower_than(node, 0.5):
        score += weights[0]
    # Condition 2: contains a header tag.
    for tag in ('h', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        if node.has_tag(tag):
            score += weights[1]
            break
    # Condition 3: contains at most 10 words.
    if tp.number_of_words(text) <= 10:
        score += weights[2]
    # Condition 4: contains the symbols - or :.
    if re.search(r'[:\-]+', text):
        score += weights[3]
    # Condition 5: contains a link node.
    if node.has_tag('a'):
        score += weights[4]
    # Condition 6: contains no digits or slashes.
    if re.search(r'[0-9\/]+', text) is None:
        score += weights[5]
    return float(score) / float(max_value)
def base_check(self, node):
    """Structural pre-check: short text and at most one child node."""
    flattened = DataValidator.flatten_text(node.text)
    # Too wordy to qualify.
    if tp.number_of_words(flattened) > 15:
        return False
    # More than one child means this is a container, not a leaf value.
    return len(node.get_children()) <= 1
def hit_check(self, node):
    """Direct hit: a known class/id marker or a sufficiently long text."""
    has_marker = DataValidator.in_class_ids(
        node, ShortDescValidator.ids_and_classes)
    if has_marker:
        return True
    word_count = tp.number_of_words(DataValidator.flatten_text(node.text))
    return word_count >= 20
def base_check(self, node):
    """Structural pre-check: enough text and no block-level children."""
    flattened = DataValidator.flatten_text(node.text)
    word_count = tp.number_of_words(flattened)
    if word_count < 3:
        return False
    # A child that is a div/span/td makes this node a layout container.
    for child in node.get_children():
        if any(child.has_tag(t) for t in ('div', 'span', 'td')):
            return False
    # Accept container-typed nodes themselves, or any long enough text.
    return node.type in ('span', 'div', 'td') or word_count >= 15
def score_check(self, node):
    """Score how likely *node* contains a date.

    :param HTMLNode node: node to score
    :return: float: score normalised to the range [0.0, 1.0]
    """
    # Preparations
    score = 0
    weights = [2, 3, 1, 3]
    # Conditions 2 and 3 are mutually exclusive, so the smaller of their
    # two weights (1) is subtracted from the maximum achievable score.
    max_value = sum(weights) - 1
    words = ['am', 'vom', 'bis', 'zum', 'datum']
    text_lower = DataValidator.flatten_text(node.text.lower())
    has_keyword = any(word in text_lower for word in words)
    # Condition 1: the text has at most 15 words.
    if tp.number_of_words(DataValidator.flatten_text(node.text)) <= 15:
        score += weights[0]
        # Condition 2: additionally contains a date keyword.
        # BUG FIX: the original used a for/else whose else branch
        # re-tested the exact same keyword condition that had just
        # failed, so condition 3 was unreachable and long texts with a
        # keyword wrongly received the full weights[1] bonus.  The
        # keyword bonus now depends on the word count, as the comments
        # and the max_value derivation intend.
        if has_keyword:
            score += weights[1]
    elif has_keyword:
        # Condition 3: contains a date keyword but more than 15 words.
        score += weights[2]
    # Condition 4: carries a known date class/id label.
    if DataValidator.in_class_ids(node, DateValidator.hit_labels):
        score += weights[3]
    return float(score) / float(max_value)
def hit_check(self, node):
    """Direct hit: a known location class/id, or a short text opening
    with a location keyword followed by a value."""
    if DataValidator.in_class_ids(node, LocationValidator.ids_classes):
        return True
    flattened = DataValidator.flatten_text(node.text)
    if tp.number_of_words(flattened) <= 10:
        lowered = flattened.lower()
        for word in LocationValidator.key_words:
            # Keyword, then whitespace or a colon, then some value.
            pattern = r'(\s+|^)%s(\s+|:)\s*\S+' % word
            if re.match(pattern, lowered):
                return True
    return False
def run_node_detection(node, n, label_dict):
    """Run every validator on *node* and record hits and scores.

    Results are stored both in the node's own data container and in the
    shared *label_dict*.  The function then recurses towards the root;
    *n* counts the levels of valid nodes passed so far, where a valid
    node is one that is not a pure formatting tag such as <p> or
    <strong>, but may contain those as children.

    :param HTMLNode node: the node to validate
    :param int n: the current level in the tree of valid nodes
    :param dict label_dict: per-label storage of hits and scores
    :return: None
    """
    # Stop once the walk has climbed past the allowed height.
    if n > MOD_MAX_HEIGHT:
        return
    class_list = [
        NoiseValidator, DateValidator, TimeValidator, TitleValidator,
        ShortDescValidator, LocationValidator, LinkValidator
    ]
    node.data_container['label'] = {'hits': [], 'scores': [], 'not': []}
    # BUG FIX (cleanup): the original kept a local "hit" flag that was
    # assigned but never read; it has been removed.
    for cl in class_list:
        obj = cl()
        if obj.get_label() not in label_dict:
            label_dict[obj.get_label()] = {'hits': [], 'scores': []}
        ret = obj.run_checks(node)
        if isinstance(ret, tuple):
            # (score, label) result: record once per (score, node) pair.
            node.data_container['label']['scores'].append(ret)
            if (ret[0], node) not in label_dict[ret[1]]['scores']:
                label_dict[ret[1]]['scores'].append((ret[0], node))
        elif ret is None:
            node.data_container['label']['not'].append(obj.get_label())
        elif ret is not DataLabel.UNKNOWN:
            # Direct hit: register it; noise ends validation early.
            if node not in label_dict[ret]['hits']:
                label_dict[ret]['hits'].append(node)
            node.data_container['label']['hits'].append(ret)
            if ret is DataLabel.NOISE:
                break
    if node.parent is not None:
        # Because those types are formatting areas of webpages,
        # climbing through one of them consumes a level.
        if node.type in ['div', 'td', 'tr', 'tbody', 'table']:
            run_node_detection(node.parent, n + 1, label_dict)
        elif node.type in FORMAT_TAGS or tp.number_of_words(
                node.get_pure_text()) > 0:
            run_node_detection(node.parent, n, label_dict)
        else:
            run_node_detection(node.parent, n + 1, label_dict)
def base_check(self, node):
    """Structural pre-check for title-like candidates."""
    flattened = DataValidator.flatten_text(node.text)
    # Too long to be a title.
    if tp.number_of_words(flattened) > 15:
        return False
    # Slashes and sentence punctuation rule a title out.
    if re.search('[\/?!]', flattened):
        return False
    # Checking the cases: titles are mostly uppercase words.
    return not DataValidator.contains_more_lower_than(node, (1.0 / 3.0))
def hit_check(self, node):
    """Direct hit: empty text or an explicit copyright marker."""
    flattened = DataValidator.flatten_text(node.text)
    if tp.number_of_words(flattened) == 0:
        return True
    return any(kw in flattened for kw in ('©', 'copyright'))
def find_most_text_node(node):
    """Recursively find the node with the highest word count.

    :param HTMLNode node: the root node, where to start
    :return: dict: 'id' is the id of a node, 'words' is its word count
    """
    own = {
        'id': node.identification,
        'words': tp.number_of_words(get_unformatted_text(node)),
    }
    candidates = [own]
    candidates.extend(find_most_text_node(c) for c in node.get_children())
    # max() keeps the first of equal maxima, so on a tie the node
    # closest to the root wins -- same as the sequential comparison.
    return max(candidates, key=lambda d: d['words'])
def score_check(self, node):
    """Score how likely *node* is a descriptive text paragraph.

    :param HTMLNode node: node to score
    :return: float: score normalised to the range [0.0, 1.0]
    """
    # preparations
    score = 0
    weights = [2, 1, 3, 2, 2]
    # BUG FIX: the original reduced the weights with multiplication
    # (x * y), giving max_value = 24 instead of the intended 10, which
    # deflated every score; all sibling validators normalise by the sum.
    max_value = sum(weights)
    text = DataValidator.flatten_text(node.text)
    # Condition 1: at least 15 words.
    if tp.number_of_words(text) >= 15:
        score += weights[0]
    # Condition 2: contains quotation marks.
    # BUG FIX: re.match only tests the start of the string; re.search
    # finds quotes anywhere, as the comment intends.
    if re.search(r'".+"', text) is not None:
        score += weights[1]
    # Condition 3: does not contain any header-like tags.
    # NOTE(review): 'stronger' is kept as-is but looks like a typo for
    # 'strong' -- confirm against the tag vocabulary.
    header_like = (
        'stronger', 'header', 'h', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
    )
    if not any(node.has_tag(tag) for tag in header_like):
        score += weights[2]
    # Condition 4: contains more lowercase words than uppercase.
    if DataValidator.contains_more_lower_than(node):
        score += weights[3]
    # Condition 5: contains full stops, exclamation signs, question
    # signs, semicolons or colons.
    # BUG FIX: re.search instead of re.match, as in condition 2.
    if re.search(r'[.:;?!]\s+|[.:;?!]$', text):
        score += weights[4]
    # Returning score
    return float(score) / float(max_value)
def score_check(self, node):
    """Score how likely *node* contains a time specification.

    :param HTMLNode node: node to score
    :return: float: score normalised to the range [0.0, 1.0]
    """
    # Preparations
    score = 0
    weights = [2, 3, 1]
    # Conditions 2 and 3 are mutually exclusive, so the smaller of their
    # two weights (1) is subtracted from the maximum achievable score.
    max_value = sum(weights) - 1
    words = ['von', 'bis', 'ab', 'uhrzeit', 'um']
    text_lower = DataValidator.flatten_text(node.text.lower())
    has_keyword = any(word in text_lower for word in words)
    # BUG FIX: the original (a) scrambled the weight indices (condition
    # 1 added weights[1], the combined check weights[2], the fallback
    # weights[0]) and (b) used a for/else whose else branch re-tested
    # the same keyword condition that had just failed, making it
    # unreachable.  With those bugs, the maximum achievable score was 4
    # although max_value is 5.  The structure now mirrors DateValidator:
    # weights[0] for short text, weights[1] for short text with a time
    # keyword, weights[2] for a keyword in a longer text (2 + 3 = 5).
    # Condition 1: contains at most 15 words.
    if tp.number_of_words(DataValidator.flatten_text(node.text)) <= 15:
        score += weights[0]
        # Condition 2: additionally contains a time keyword.
        if has_keyword:
            score += weights[1]
    elif has_keyword:
        # Condition 3: contains a time keyword but more than 15 words.
        score += weights[2]
    return float(score) / float(max_value)