def find_clusters(para_blocks):
    """Group the blocks of several documents into clusters by common path.

    para_blocks is an iterable with one entry per document; each entry is a
    sequence of block objects exposing a ``path`` attribute.  It is
    materialised once up front because it is traversed twice, so a one-shot
    iterator is accepted as well as a list.

    Returns a list of LayoutSectionCluster objects, numbered consecutively
    from 0.  A cluster is kept only when the first document contributes a
    non-empty group of blocks to it.  An empty para_blocks yields [].
    """
    # Traversed twice below (common paths, then block retrieval).
    para_blocks = list(para_blocks)

    def uniq_path(blocks):
        # Paths of the blocks in order, with runs of consecutive
        # identical paths collapsed into a single entry.
        paths = []
        prev = None
        for block in blocks:
            if prev is None or block.path != prev:
                paths.append(block.path)
                prev = block.path
        return paths

    def find_common(seqs):
        # Fold the sequences pairwise, keeping only the elements of the
        # running result that find_lcs pairs with the next sequence.
        # Returns None when seqs is empty.
        # NOTE(review): find_lcs is assumed to yield (i0, i1) index pairs of
        # a longest common subsequence -- confirm against its definition.
        common = None
        for seq in seqs:
            if common is None:
                common = seq
            else:
                common = [common[i0] for (i0, _) in find_lcs(common, seq)]
        return common

    # obtain the common paths.
    common_paths = find_common([uniq_path(blocks) for blocks in para_blocks])
    # clusters = [ ( doc1_blocks1, doc2_blocks1, ..., docm_blocks1 ),
    #              ...
    #              ( doc1_blocksn, doc2_blocksn, ..., docm_blocksn ) ]
    clusters = zip(*[retrieve_blocks(common_paths, blocks)
                     for blocks in para_blocks])
    # compare each cluster of text blocks.
    layout = []
    for blockgroups in clusters:
        # Only the first document's group decides whether a cluster is kept.
        if blockgroups[0]:
            layout.append(LayoutSectionCluster(len(layout), blockgroups))
    return layout
def match_blocks(self, blocks0, strict=True):
    """Match blocks0 against self.pattern and build the layout sections.

    self.pattern is read as a sequence of (diffscore, mainscore, path)
    triples.  The paths are handed to retrieve_blocks together with blocks0,
    and each group it returns is paired, in order, with the scores of the
    corresponding pattern entry.

    Returns a list of LayoutSection objects numbered from 0.  When strict is
    true and any retrieved group is empty, returns None instead.
    """
    # Split the pattern triples into three parallel columns.
    diffscores = []
    mainscores = []
    paths = []
    for (diffscore, mainscore, path) in self.pattern:
        diffscores.append(diffscore)
        mainscores.append(mainscore)
        paths.append(path)

    # NOTE(review): presumably one group of blocks per path -- confirm
    # against retrieve_blocks.
    retrieved = retrieve_blocks(paths, blocks0)

    sections = []
    scored_groups = zip(diffscores, mainscores, retrieved)
    for (index, (diffscore, mainscore, blocks1)) in enumerate(scored_groups):
        if strict and not blocks1:
            # A pattern entry with no matching blocks fails a strict match.
            return None
        sections.append(LayoutSection(index, diffscore, mainscore, blocks1))
    return sections