def extract(self, element, record=None):
    """
    Extract the data records found under a data record candidate.

    Parameters
    ----------
    element: lxml HTML element
        the HTML element of the candidate; its children are grouped into
        records.

    record: optional
        The seed record learned before. When given (and truthy), each child
        of ``element`` is labelled with the index of the best matching item
        of ``record``, so neither the similarity matrix nor the clustering
        has to be computed.

    See Also
    --------
    ``RecordAligner``

    Returns
    -------
    seed_record: ``Record``
        the seed record to match against other record trees, or ``None``
        when no records were found.

    mappings: dict
        a dict mapping from aligned record to a nested dict mapping from
        seed element to element. Empty when no records were found.
    """
    if not record:
        # No usable seed: cluster the children from their pairwise
        # similarities and let the finder pick the best division.
        sim_matrix = self.calculate_similarity_matrix(element)
        clusters = self.hcluster(sim_matrix)
        assert len(clusters) == len(sim_matrix)
        finder = RecordFinder(self.tree_sim_cache)
        records = finder.find_best_division(element.getchildren(), clusters)
    else:
        # The index of the most similar seed item serves as the cluster id.
        clusters = []
        for child_idx in range(len(element)):
            best_seed_idx = max(
                range(len(record)),
                key=lambda seed_idx: clustered_tree_match(
                    element[child_idx], record[seed_idx]),
            )
            clusters.append(best_seed_idx)
        finder = RecordFinder()
        records = finder.find_division(element.getchildren(), clusters, 0)

    if not records:
        return None, {}

    seed_record, mappings = self.ra.align(records, record)
    if record:
        # Drop the entry keyed by the seed passed in by the caller.
        mappings.pop(record)
    return seed_record, mappings
def calculate_similarity_matrix(self, element):
    """
    Calculate the pairwise similarity matrix of the children of ``element``.

    Every unordered pair of children (a child paired with itself included)
    is scored once with ``clustered_tree_match``. The score is written to
    both symmetric cells of the matrix and stored in ``self.tree_sim_cache``
    under both orderings of the pair.

    Parameters
    ----------
    element: lxml HTML element
        the element whose children are compared with each other.

    Returns
    -------
    m: numpy.ndarray
        a symmetric ``(n, n)`` float matrix where ``n`` is ``len(element)``.
    """
    n = len(element)
    # ``np.float`` was just an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24. ``np.float64`` yields the
    # same dtype on every NumPy version.
    m = np.zeros((n, n), np.float64)
    for i in range(n):
        # Only the upper triangle (diagonal included) is computed; the lower
        # triangle is filled by mirroring.
        for j in range(i, n):
            m[i, j] = clustered_tree_match(element[i], element[j])
            self.tree_sim_cache[(element[i], element[j])] = m[i, j]
            self.tree_sim_cache[(element[j], element[i])] = m[i, j]
            m[j, i] = m[i, j]
    return m