def _prunePairs(self, possible_pairs):
    """Keep only the unambiguous best pair per left and per right element.

    ``possible_pairs`` is a sequence of 3-tuples indexed as
    ``(left, right, score)``; ``digestSCP`` builds them as
    ``(r_block, a_block, relation_value)``.

    Two passes are made:

    1. among pairs sharing the same ``pair[0]``, keep the highest score;
    2. among the survivors sharing the same ``pair[1]``, keep the highest
       score.

    In either pass, when the highest score for a key is shared by two
    distinct pairs, none of the pairs for that key is kept (ties are
    dropped, not resolved).

    Returns the result of ``_uniq`` applied to the surviving pairs
    (presumably a de-duplicated list; ``_uniq`` is defined elsewhere).
    """

    def best_per_key(pairs, key_index):
        # For every pair, find the best-scoring pair that shares its key.
        # The same winner is emitted once per member of its group; the
        # duplicates are harmless here and are removed by _uniq at the end.
        survivors = []
        for candidate in pairs:
            best = candidate
            tied = False
            for other in pairs:
                if best != other and best[key_index] == other[key_index]:
                    if best[2] < other[2]:
                        # Strictly better pair: any earlier tie is moot.
                        best = other
                        tied = False
                    elif best[2] == other[2]:
                        tied = True
            if not tied:
                survivors.append(best)
        return survivors

    # find pairs which have duplicates, select only best
    more_possible = best_per_key(possible_pairs, 0)
    most_possible = best_per_key(more_possible, 1)
    return _uniq(most_possible)
def digestSCP(self, removed_set, added_set):
    """Pair each removed block with its most similar added block.

    Capabilities, as noted by the original author --
    renames: yes, merges: no, splits: not handled, clones: yes.

    For every ``r_block`` in ``removed_set`` each ``a_block`` in
    ``added_set`` is scored with ``SequenceMatcher(...).ratio()``.  The
    comparison uses the ``sub_blocks`` sequences when both blocks have more
    than two sub-blocks, otherwise the ``text`` of both blocks.  Pairs that
    score ``0.0`` are ignored.

    Equal scores are settled with ``self._tiebreaker``, first on the block
    names and then on ``str()`` of the blocks.  A result of ``0`` records
    the pair as an unresolved tie, ``1`` makes the new pair the best one,
    and any other value leaves the current best pair in place.

    The best pair for an ``r_block`` is kept unless it is among the
    unresolved ties; unresolved ties are printed to stdout.  An ``a_block``
    is not removed from ``added_set`` once paired, so it may be matched by
    several removed blocks; ``self._prunePairs`` sorts that out.

    Returns ``self._prunePairs(_uniq(possible_pairs))``.
    """
    possible_pairs = []
    for r_block in removed_set:
        # Start fresh for every removed block.
        max_pair = None
        tiebreak_pairs = []
        for a_block in added_set:
            # for pairing of blocks with a small number of sub_blocks (1-3),
            # this will be fairly inaccurate
            if (r_block.has_sub_blocks and a_block.has_sub_blocks
                    and len(r_block.sub_blocks) > 2
                    and len(a_block.sub_blocks) > 2):
                r_block_seq = r_block.sub_blocks
                a_block_seq = a_block.sub_blocks
            else:
                r_block_seq = r_block.text
                a_block_seq = a_block.text

            matcher = SequenceMatcher(None, r_block_seq, a_block_seq)
            relation_value = matcher.ratio()
            if relation_value == 0.0:
                continue

            candidate = (r_block, a_block, relation_value)
            if max_pair is None or relation_value > max_pair[2]:
                max_pair = candidate
                tiebreak_pairs = []
            elif relation_value == max_pair[2]:
                # tie breaker needed, compare the names
                tb = self._tiebreaker(r_block.name, a_block.name,
                                      max_pair[1].name)
                if tb == 0:
                    tb = self._tiebreaker(str(r_block), str(a_block),
                                          str(max_pair[1]))
                if tb == 0:
                    tiebreak_pairs.append(candidate)
                    tiebreak_pairs.append(max_pair)
                elif tb == 1:
                    # NOTE(review): earlier unresolved ties are left in
                    # tiebreak_pairs here, so they are still reported below
                    # even though the best pair changed -- confirm intent.
                    max_pair = candidate

        # since r_block->a_block pair has been found, should we remove
        # a_block from the list of possiblities?
        if max_pair is not None:
            if max_pair not in tiebreak_pairs:
                possible_pairs.append(max_pair)
            if tiebreak_pairs:
                print('------------')
                for each in tiebreak_pairs:
                    print('tiebreaker needed: %s, %s, %s' % each)
                print('------------')
    return self._prunePairs(_uniq(possible_pairs))