コード例 #1
0
ファイル: mdr.py プロジェクト: sumeshshivan/AutoScrapper
    def extract(self, element, record=None):
        """
        Extract the data records from a data record candidate.

        Parameters
        ----------
        element: lxml HTML element
            The HTML element of the candidate; its children are the items
            that get clustered and divided into records.

        record: optional
            A seed record learned before. When given (and truthy), each child
            of ``element`` is assigned to the index of the best-matching item
            of ``record`` instead of running similarity clustering, which
            speeds up the extraction by skipping the search for seed elements.

        See Also
        --------
        ``RecordAligner``

        Returns
        -------
        seed_record: ``Record``
             The seed record to match against other record trees, or ``None``
             when no records were found.

        mappings: dict
             A dict mapping from aligned record to a nested dict mapping from
             seed element to element; ``{}`` when no records were found. When
             ``record`` was supplied, its own entry is removed before returning.

        """
        if not record:
            # No seed available: cluster the children by pairwise similarity.
            sim_matrix = self.calculate_similarity_matrix(element)
            clusters = self.hcluster(sim_matrix)
            assert len(clusters) == len(sim_matrix)

            finder = RecordFinder(self.tree_sim_cache)
            records = finder.find_best_division(element.getchildren(), clusters)
        else:
            # The index of the best-matching seed item serves as the cluster id.
            clusters = []
            for child_idx in range(len(element)):
                best_seed_idx = max(
                    range(len(record)),
                    key=lambda seed_idx: clustered_tree_match(
                        element[child_idx], record[seed_idx]),
                )
                clusters.append(best_seed_idx)
            finder = RecordFinder()
            records = finder.find_division(element.getchildren(), clusters, 0)

        if not records:
            return None, {}

        seed_record, mappings = self.ra.align(records, record)
        if record:
            # Drop the entry keyed by the seed that the caller passed in.
            mappings.pop(record)
        return seed_record, mappings
コード例 #2
0
ファイル: mdr.py プロジェクト: sumeshshivan/AutoScrapper
    def calculate_similarity_matrix(self, element):
        """Calculate the pairwise similarity matrix for the children of ``element``.

        Parameters
        ----------
        element: lxml HTML element
            The element whose children are compared with each other.

        Returns
        -------
        m: numpy.ndarray
            An ``(n, n)`` float array, where ``n = len(element)`` and
            ``m[i, j]`` holds ``clustered_tree_match(element[i], element[j])``.
            Only pairs with ``j >= i`` are computed; each value is mirrored
            into ``m[j, i]``.

        Notes
        -----
        Every computed score is also stored in ``self.tree_sim_cache`` under
        both ``(element[i], element[j])`` and ``(element[j], element[i])``.
        """
        n = len(element)

        # The ``np.float`` alias was removed in NumPy 1.24; the builtin
        # ``float`` selects the same float64 dtype on every NumPy version.
        m = np.zeros((n, n), float)
        for i in range(n):
            # Start at the diagonal: the lower triangle is filled by mirroring.
            for j in range(i, n):
                m[i, j] = clustered_tree_match(element[i], element[j])
                self.tree_sim_cache[(element[i], element[j])] = m[i, j]
                self.tree_sim_cache[(element[j], element[i])] = m[i, j]
                m[j, i] = m[i, j]
        return m