def find_clusters(para_blocks):
    """Group the blocks of several documents into layout clusters.

    Args:
        para_blocks: an iterable-of-sequences, one sequence of block objects
            per document.  Each block must expose a ``path`` attribute.
            The outer collection is iterated twice, so it must be
            re-iterable (e.g. a list).

    Returns:
        A list of ``LayoutSectionCluster`` objects, one for every group of
        blocks whose first-document entry is truthy.  Each cluster is
        numbered by its position in the returned list.  An empty
        ``para_blocks`` yields an empty list.
    """
    # Private marker for "no previous path yet".  Using None for this would
    # make a block whose path is None look like the start of a new run every
    # time, so consecutive None paths would never be collapsed.
    unset = object()

    def uniq_path(blocks):
        """Return the paths of *blocks* with consecutive repeats collapsed."""
        r = []
        prev = unset
        for b in blocks:
            if prev is unset or b.path != prev:
                r.append(b.path)
            prev = b.path
        return r

    def find_common(seqs):
        """Fold *seqs* into the elements shared by all of them.

        The first sequence is taken as-is; every following one narrows the
        running result to the elements selected by ``find_lcs``.  Returns
        None when *seqs* is empty.
        """
        s0 = None
        for s1 in seqs:
            if s0 is None:
                s0 = s1
            else:
                # find_lcs yields index pairs; the first index refers to s0.
                # NOTE(review): presumably a longest-common-subsequence
                # alignment -- confirm against find_lcs.
                s0 = [s0[i0] for (i0, _) in find_lcs(s0, s1)]
        return s0

    # obtain the common paths.
    common_paths = find_common([uniq_path(blocks) for blocks in para_blocks])

    # clusters = [ ( doc1_blocks1, doc2_blocks1, ..., docm_blocks1 ),
    #                ...
    #              ( doc1_blocksn, doc2_blocksn, ..., docm_blocksn ) ]
    clusters = zip(
        *[retrieve_blocks(common_paths, blocks) for blocks in para_blocks])

    # compare each cluster of text blocks.
    layout = []
    for blockgroups in clusters:
        # Only the first document's group decides whether a cluster is kept.
        if blockgroups[0]:
            layout.append(LayoutSectionCluster(len(layout), blockgroups))

    return layout
# Example no. 2
# 0
def find_clusters(para_blocks):
  """Group the blocks of several documents into layout clusters.

  Args:
    para_blocks: an iterable-of-sequences, one sequence of block objects
      per document.  Each block must expose a ``path`` attribute.  The
      outer collection is iterated twice, so it must be re-iterable
      (e.g. a list).

  Returns:
    A list of ``LayoutSectionCluster`` objects, one for every group of
    blocks whose first-document entry is truthy.  Each cluster is numbered
    by its position in the returned list.  An empty ``para_blocks`` yields
    an empty list.
  """
  # Private marker for "no previous path yet".  Using None for this would
  # make a block whose path is None look like the start of a new run every
  # time, so consecutive None paths would never be collapsed.
  unset = object()

  def uniq_path(blocks):
    """Return the paths of *blocks* with consecutive repeats collapsed."""
    r = []
    prev = unset
    for b in blocks:
      if prev is unset or b.path != prev:
        r.append(b.path)
      prev = b.path
    return r

  def find_common(seqs):
    """Fold *seqs* into the elements shared by all of them.

    The first sequence is taken as-is; every following one narrows the
    running result to the elements selected by ``find_lcs``.  Returns None
    when *seqs* is empty.
    """
    s0 = None
    for s1 in seqs:
      if s0 is None:
        s0 = s1
      else:
        # find_lcs yields index pairs; the first index refers to s0.
        # NOTE(review): presumably a longest-common-subsequence
        # alignment -- confirm against find_lcs.
        s0 = [s0[i0] for (i0, _) in find_lcs(s0, s1)]
    return s0

  # obtain the common paths.
  common_paths = find_common([uniq_path(blocks) for blocks in para_blocks])

  # clusters = [ ( doc1_blocks1, doc2_blocks1, ..., docm_blocks1 ),
  #                ...
  #              ( doc1_blocksn, doc2_blocksn, ..., docm_blocksn ) ]
  clusters = zip(*[retrieve_blocks(common_paths, blocks) for blocks in para_blocks])

  # compare each cluster of text blocks.
  layout = []
  for blockgroups in clusters:
    # Only the first document's group decides whether a cluster is kept.
    if blockgroups[0]:
      layout.append(LayoutSectionCluster(len(layout), blockgroups))

  return layout
def match_blocks(self, blocks0, strict=True):
    """Match *blocks0* against ``self.pattern`` and build layout sections.

    Every entry of ``self.pattern`` unpacks into a
    ``(diffscore, mainscore, path)`` triple.  The paths are handed to
    ``retrieve_blocks`` together with *blocks0*, and its results are paired
    positionally with the pattern entries.
    NOTE(review): presumably retrieve_blocks yields one group of blocks per
    path -- confirm against its definition.

    Args:
        blocks0: the blocks to look up; passed through to retrieve_blocks.
        strict: when true, an empty (falsy) group for any pattern entry
            aborts the match.

    Returns:
        A list of ``LayoutSection`` objects numbered from 0, or None when
        *strict* is set and some pattern entry got no blocks.
    """
    wanted_paths = [path for (_, _, path) in self.pattern]
    candidates = retrieve_blocks(wanted_paths, blocks0)

    sections = []
    for (diffscore, mainscore, path), blocks1 in zip(self.pattern, candidates):
        if strict and not blocks1:
            return None
        section = LayoutSection(
            len(sections), diffscore, mainscore, blocks1, path)
        sections.append(section)
    return sections
 def match_blocks(self, blocks0, strict=True):
     """Match *blocks0* against ``self.pattern`` and build layout sections.

     Every entry of ``self.pattern`` unpacks into a
     ``(diffscore, mainscore, path)`` triple.  The paths are handed to
     ``retrieve_blocks`` together with *blocks0*, and its results are paired
     positionally with the pattern entries.
     NOTE(review): presumably retrieve_blocks yields one group of blocks per
     path -- confirm against its definition.

     Args:
         blocks0: the blocks to look up; passed through to retrieve_blocks.
         strict: when true, an empty (falsy) group for any pattern entry
             aborts the match.

     Returns:
         A list of ``LayoutSection`` objects numbered from 0, or None when
         *strict* is set and some pattern entry got no blocks.
     """
     wanted_paths = [path for (_, _, path) in self.pattern]
     candidates = retrieve_blocks(wanted_paths, blocks0)

     sections = []
     # The path is only needed for the lookup above, not for the section.
     for (diffscore, mainscore, _), blocks1 in zip(self.pattern, candidates):
         if strict and not blocks1:
             return None
         section = LayoutSection(len(sections), diffscore, mainscore, blocks1)
         sections.append(section)
     return sections