예제 #1
0
def find_clusters(para_blocks):
    """Cluster the blocks of several documents by the paths they have in common.

    Args:
        para_blocks: An iterable with one entry per document; each entry is
            a sequence of block objects exposing a ``path`` attribute.  The
            outer iterable is materialized once, so a one-shot iterator is
            accepted.

    Returns:
        A list of ``LayoutSectionCluster`` objects.  Each is constructed with
        its index in the returned list and a tuple holding, for every
        document, the group of blocks that ``retrieve_blocks`` produced for
        the same position.  Positions whose group for the first document is
        empty (falsy) are skipped.  An empty ``para_blocks`` yields ``[]``.
    """
    def uniq_path(blocks):
        # List the block paths in order, collapsing runs of consecutive
        # blocks that share the same path into a single entry.
        # A private sentinel marks "no previous block", so a path that is
        # itself None is deduplicated like any other value.
        unset = object()
        paths = []
        prev = unset
        for b in blocks:
            if prev is unset or b.path != prev:
                paths.append(b.path)
            prev = b.path
        return paths

    def find_common(seqs):
        # Fold the sequences pairwise through find_lcs, keeping only the
        # elements of the running result that find_lcs pairs up with the
        # next sequence.  Returns None when seqs is empty.
        common = None
        for seq in seqs:
            if common is None:
                common = seq
            else:
                # find_lcs yields index pairs; the first index refers to
                # the running result.
                common = [common[i0] for (i0, _) in find_lcs(common, seq)]
        return common

    # para_blocks is traversed twice below; materialize it so that a
    # generator argument is not exhausted after the first pass.
    para_blocks = list(para_blocks)

    # obtain the common paths.
    common_paths = find_common([uniq_path(blocks) for blocks in para_blocks])
    # clusters = [ ( doc1_blocks1, doc2_blocks1, ..., docm_blocks1 ),
    #                ...
    #              ( doc1_blocksn, doc2_blocksn, ..., docm_blocksn ) ]
    clusters = zip(*[retrieve_blocks(common_paths, blocks) for blocks in para_blocks])
    # compare each cluster of text blocks.
    layout = []
    for blockgroups in clusters:
        # Skip positions where the first document contributed no blocks.
        if blockgroups[0]:
            layout.append(LayoutSectionCluster(len(layout), blockgroups))
    return layout
예제 #2
0
 def match_blocks(self, blocks0, strict=True):
     """Build a list of LayoutSection objects by applying self.pattern to blocks0.

     self.pattern is read as an iterable of (diffscore, mainscore, path)
     triples.  The paths are handed to retrieve_blocks together with
     blocks0, and each group it returns is paired, in order, with the
     scores of the corresponding pattern entry.

     Returns the list of LayoutSection objects, each built from its index
     in the list, the two scores and the retrieved blocks.  When strict is
     true and any retrieved group is empty (falsy), returns None instead.
     """
     wanted_paths = [path for (_, _, path) in self.pattern]
     retrieved = retrieve_blocks(wanted_paths, blocks0)
     sections = []
     for index, ((diffscore, mainscore, _), found) in enumerate(zip(self.pattern, retrieved)):
         # In strict mode a single unmatched pattern entry rejects the whole match.
         if strict and not found:
             return None
         sections.append(LayoutSection(index, diffscore, mainscore, found))
     return sections