def __init__(self):
    """Attach a fresh ``PartialTreeAligner`` to ``self`` as ``self.pta``.

    NOTE(review): this definition sits outside any class in the visible
    source and has the same body as ``RecordAligner.__init__``; it looks
    like a stray duplicate -- confirm before relying on it.
    """
    aligner = PartialTreeAligner()
    self.pta = aligner
class RecordAligner(object):
    """Align several data records against one shared seed record.

    The tree-to-tree matching itself is delegated to the
    ``PartialTreeAligner`` instance stored in ``self.pta``.
    """

    def __init__(self):
        self.pta = PartialTreeAligner()

    def align(self, records, record=None):
        """Partial align multiple data records with partial tree match [1]_

        Parameters
        ----------
        records: iterable
            The data records to align. The caller's container is left
            untouched; alignment works on a private copy.
        record: optional
            The seed record learned before. Used to speed up the extraction
            without finding the seed elements. When omitted (or falsy) the
            record with the largest ``Record.size`` is taken out of
            ``records`` and used as the seed.

        Returns
        -------
        seed_record: ``Record``
            A deep copy of the seed, which ``self.pta.align_records`` is
            given on every call together with the record being aligned.
        mappings: ``collections.OrderedDict``
            A dict with record as key and a nested dict map from seed
            element to aligned element. The seed is the first key; the
            other records follow in the order they were first processed.

        Raises
        ------
        ValueError
            If ``records`` is empty and no seed ``record`` was supplied.

        References
        ----------
        .. [1] Web Data Extraction Based on Partial Tree Alignment
           <http://doi.acm.org/10.1145/1060745.1060761>
        """
        # Work on a private queue: the caller's list must not be emptied as a
        # side effect, and deque gives O(1) pops from the left.
        pending = collections.deque(records)

        # Truthiness (not ``is not None``) is kept on purpose so that a falsy
        # seed still falls back to picking one from ``records``.
        if record:
            seed = record
        else:
            if not pending:
                raise ValueError(
                    "cannot align: no records given and no seed record supplied")
            # find biggest record
            seed = max(pending, key=Record.size)
            pending.remove(seed)

        # The aligner receives the copy, so the original seed stays pristine
        # and can be mapped from the copy like every other record.
        seed_copy = copy.deepcopy(seed)

        mappings = collections.OrderedDict()
        mappings[seed] = self._create_mapping(seed_copy, seed)

        # Records that only partially matched; retried once the seed changes.
        deferred = []
        while pending:
            current = pending.popleft()
            modified, partial_match, aligned = self.pta.align_records(
                seed_copy, current)
            mappings[current] = aligned

            if modified:
                # The seed changed, so give the deferred records another try.
                pending.extend(deferred)
                deferred = []
            elif partial_match:
                # add it back to try it later since seed might change
                deferred.append(current)
        return seed_copy, mappings

    def _create_mapping(self, seed, tree):
        """Create a mapping from seed tree to another tree.

        Both arguments are walked in lockstep: every child of ``seed`` is
        mapped to the child of ``tree`` at the same position, recursively.
        Pairing stops at the shorter of the two child sequences.

        for example:

        >>> from lxml.html import fragment_fromstring
        >>> t1 = fragment_fromstring("<p> <a></a> <b></b> </p>")
        >>> t2 = fragment_fromstring("<p> <a></a> <b></b> </p>")
        >>> ra = RecordAligner()
        >>> d = ra._create_mapping(Record(t1), Record(t2))
        >>> d[t1] == t2
        True
        """
        d = {}
        for s, e in zip(seed, tree):
            d[s] = e
            d.update(self._create_mapping(s, e))
        return d