Example #1
0
 def __init__(self):
     """Set up the instance with its own ``PartialTreeAligner``, stored as ``self.pta``."""
     tree_aligner = PartialTreeAligner()
     self.pta = tree_aligner
Example #2
0
class RecordAligner(object):
    """Align a group of data records against one common seed record.

    The tree matching itself is delegated to a ``PartialTreeAligner``
    (``self.pta``); this class picks the seed, drives the retry loop around
    ``align_records`` and collects the per-record mappings.
    """

    def __init__(self):
        self.pta = PartialTreeAligner()

    def align(self, records, record=None):
        """Partial align multiple data records with partial tree match [1]_

        The caller's ``records`` is not modified: the work is done on a
        private queue built from it.

        Parameters
        ----------
        records: iterable
            The data records to align (typically a list).

        record: optional
            The seed record learned before.
            Used to speed up the extraction without finding the seed elements.
            When it is not given, the record with the biggest ``Record.size``
            is taken out of ``records`` and used as the seed.

        Returns
        -------
        seed_record: ``Record``
            A deep copy of the seed record, which is the tree every other
            record is matched against.

        mappings: ``collections.OrderedDict``
            Keyed by record. The first entry is the seed itself, mapping each
            element of the seed copy to the matching element of the original
            seed. Every other entry holds the alignment returned by
            ``PartialTreeAligner.align_records`` for that record (the latest
            one when the record was tried more than once).

        Raises
        ------
        ValueError
            If ``records`` is empty and no ``record`` is given.

        References
        ----------
        .. [1] Web Data Extraction Based on Partial Tree Alignment
        <http://doi.acm.org/10.1145/1060745.1060761>

        """
        # Private work queue: leaves the caller's container untouched and
        # gives O(1) pops from the left.
        pending = collections.deque(records)

        # NOTE(review): this is a truthiness test, so a falsy ``record`` is
        # treated the same as "not given" -- confirm that is intended.
        if record:
            seed = record
        else:
            if not pending:
                raise ValueError(
                    "cannot choose a seed record: `records` is empty "
                    "and no `record` was given")
            # find biggest record
            seed = max(pending, key=Record.size)
            pending.remove(seed)

        # The aligner is handed the copy, never the original seed.
        seed_copy = copy.deepcopy(seed)

        mappings = collections.OrderedDict()
        mappings[seed] = self._create_mapping(seed_copy, seed)

        # Partially matched records waiting for the seed to change.
        retry_later = []
        while pending:
            current = pending.popleft()
            modified, partial_match, aligned = self.pta.align_records(seed_copy, current)

            # A retried record overwrites its earlier entry but keeps its
            # original position in the ordered mapping.
            mappings[current] = aligned

            if modified:
                # The aligner reported a change, so re-queue every record
                # that only matched partially so far.
                pending.extend(retry_later)
                retry_later = []
            elif partial_match:
                # add it back to try it later since seed might change
                retry_later.append(current)

        return seed_copy, mappings

    def _create_mapping(self, seed, tree):
        """create a mapping from seed tree to another tree.

        Both trees are walked in lockstep (``zip``), pairing children
        position by position and recursing into each pair; the walk stops at
        the shorter of the two child sequences.

        for example:

        >>> from lxml.html import fragment_fromstring
        >>> t1 = fragment_fromstring("<p> <a></a> <b></b> </p>")
        >>> t2 = fragment_fromstring("<p> <a></a> <b></b> </p>")
        >>> ra = RecordAligner()
        >>> d = ra._create_mapping(Record(t1), Record(t2))
        >>> d[t1] == t2
        True
        """
        mapping = {}
        for seed_child, tree_child in zip(seed, tree):
            mapping[seed_child] = tree_child
            # Merge the mapping of the descendants of this pair.
            mapping.update(self._create_mapping(seed_child, tree_child))
        return mapping