def _diff(cls, base, cmp, check_variant=True, label=None):
    """Compare ``cmp`` against ``base`` and return the diff as a list of segments.

    Every opcode reported by ``CSequenceMatcher`` becomes one or more segment
    dicts. A base slice that contains line breaks is cut at each ``'\\n'``;
    every break is emitted as its own segment and advances the line counter.

    Args:
        base: Reference text; its ``'\\n'`` characters define the lines.
        cmp: Text that is compared against ``base``.
        check_variant: When true, a difference of exactly one character on
            each side is passed to ``is_variant`` and the segment is flagged
            with ``'is_variant': True`` if that call returns a truthy value.
        label: Optional mapping overriding the dict keys under which the base
            and cmp texts are stored (defaults: ``'base'`` and ``'cmp'``).

    Returns:
        A list of dicts holding ``'line_no'``, ``'is_same'``, the base text,
        the cmp text (absent on line-break segments) and ``'range'``, the
        ``(start, end)`` offsets of the base text within its line.
    """
    keys = {'base': 'base', 'cmp': 'cmp'}
    if label:
        keys.update(label)
    base_key, cmp_key = keys['base'], keys['cmp']

    segments = []
    current_line = 1
    matcher = CSequenceMatcher(None, base, cmp, autojunk=False)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        base_text, cmp_text = base[i1:i2], cmp[j1:j2]

        if '\n' not in base_text:
            # Plain slice without a line break: one segment per opcode.
            segment = {
                'line_no': current_line,
                'is_same': tag == 'equal',
                base_key: base_text,
                cmp_key: cmp_text,
            }
            if (check_variant and len(base_text) == 1 and len(cmp_text) == 1
                    and base_text != cmp_text
                    and is_variant(base_text, cmp_text)):
                segment['is_variant'] = True
            segments.append(segment)
            continue

        # The base slice spans at least one line break: cut it into pieces.
        pieces = base_text.split('\n')
        last = len(pieces) - 1
        for pos, piece in enumerate(pieces):
            # The cmp text travels with the first non-empty piece; if every
            # piece is empty it is attached to the (empty) final piece.
            # Pieces emitted here are always recorded with is_same False,
            # whatever the opcode tag is.
            if piece or (pos == last and cmp_text):
                segments.append({
                    'line_no': current_line,
                    'is_same': False,
                    base_key: piece,
                    cmp_key: cmp_text,
                })
                if piece:
                    cmp_text = ''
            if pos < last:
                # Line break between two pieces (segment has no cmp entry).
                segments.append({
                    'line_no': current_line,
                    'is_same': True,
                    base_key: '\n',
                })
                current_line += 1

    # Second pass: compute each segment's start/end offsets within its line.
    current_line, offset = 1, 0
    for segment in segments:
        if segment['line_no'] != current_line:
            # A new line starts here, so offsets restart from zero.
            current_line += 1
            offset = 0
        stop = offset + len(segment[base_key])
        segment['range'] = (offset, stop)
        offset = stop
    return segments
def fix_string(self, verbosity=0):
    """Obtain the changes to a path as a string.

    We use the file_mask to do a safe merge, avoiding any templated
    sections. First we need to detect where there have been changes
    between the fixed and templated versions. The file mask is of
    the format: (raw_file, templated_file, fixed_file).

    We use difflib.SequenceMatcher.get_opcodes
    See: https://docs.python.org/3.7/library/difflib.html#difflib.SequenceMatcher.get_opcodes
    It returns a list of tuples ('equal|replace|delete|insert', ia1, ia2, ib1, ib2).

    Two opcode lists are computed (raw -> templated, templated -> fixed)
    and consumed in lockstep while three cursors, one per file_mask
    element, are advanced through the strings.

    Args:
        verbosity: Passed through to every ``verbosity_logger`` call.

    Returns:
        ``(None, False)`` when any element of ``self.file_mask`` is None.
        Otherwise ``(merged, changed)`` where ``merged`` is the string built
        by the merge loop and ``changed`` is whether it differs from the raw
        string ``self.file_mask[0]``.

    Raises:
        NotImplementedError: For every opcode combination or index situation
            that the merge loop does not handle.
    """
    bencher = BenchIt()
    bencher("fix_string: start")

    # Do we have enough information to actually fix the file?
    if any(elem is None for elem in self.file_mask):
        verbosity_logger(
            "Insufficient information to fix file: {0}".format(
                self.file_mask),
            verbosity=verbosity)
        return None, False

    verbosity_logger("Persisting file masks: {0}".format(self.file_mask),
                     verbosity=verbosity)
    # Compare Templated with Raw
    diff_templ = SequenceMatcher(autojunk=None,
                                 a=self.file_mask[0],
                                 b=self.file_mask[1])
    bencher("fix_string: Match 0&1")
    diff_templ_codes = diff_templ.get_opcodes()
    verbosity_logger("Templater diff codes: {0}".format(diff_templ_codes),
                     verbosity=verbosity)

    bencher("fix_string: Got Opcodes 0&1")
    # Compare Fixed with Templated
    diff_fix = SequenceMatcher(autojunk=None,
                               a=self.file_mask[1],
                               b=self.file_mask[2])
    bencher("fix_string: Matched 1&2")
    # diff_fix = SequenceMatcher(autojunk=None, a=self.file_mask[1][0], b=self.file_mask[2][0])
    diff_fix_codes = diff_fix.get_opcodes()
    verbosity_logger("Fixing diff codes: {0}".format(diff_fix_codes),
                     verbosity=verbosity)
    bencher("fix_string: Got Opcodes 1&2")

    # If diff_templ isn't the same then we should just keep the template. If there *was*
    # a fix in that space, then we should raise an issue
    # If it is the same, then we can apply fixes as expected.
    write_buff = ''
    # The opcode currently being worked on from each list; None means
    # "fetch the next one at the top of the loop".
    fixed_block = None
    templ_block = None
    # index in raw, templ and fix
    idx = (0, 0, 0)
    loop_idx = 0
    bencher("fix_string: Loop Setup")
    while True:
        loop_idx += 1
        verbosity_logger("{0:04d}: Write Loop: idx:{1}, buff:{2!r}".format(
            loop_idx, idx, write_buff),
            verbosity=verbosity)

        if templ_block is None:
            if diff_templ_codes:
                templ_block = diff_templ_codes.pop(0)
            # We've exhausted the template. Have we exhausted the fixes?
            elif fixed_block is None and not diff_fix_codes:
                # Yes - excellent. DONE
                break
            # Deal with the case that we only have inserts left.
            # NOTE(review): all() of an empty list is True, so a still-pending
            # fixed_block with no queued codes also lands here and is not
            # written - confirm whether that state is reachable.
            elif all(elem[0] == 'insert' for elem in diff_fix_codes):
                for fixed_block in diff_fix_codes:
                    write_buff += self.file_mask[2][
                        fixed_block[3]:fixed_block[4]]
                break
            else:
                raise NotImplementedError(
                    "Fix Block(s) left over! Don't know how to handle this! aeflf8wh"
                )
        if fixed_block is None:
            if diff_fix_codes:
                fixed_block = diff_fix_codes.pop(0)
            # One case is that we just consumed the last block of both, so check indexes
            # to see if we're at the end of the raw file.
            elif idx[0] >= len(self.file_mask[0]):
                # Yep we're at the end
                break
            else:
                raise NotImplementedError(
                    "Unexpectedly depleted the fixes. Panic!")
        verbosity_logger("{0:04d}: Blocks: template:{1}, fix:{2}".format(
            loop_idx, templ_block, fixed_block),
            verbosity=verbosity)

        if templ_block[0] == 'equal':
            if fixed_block[0] == 'equal':
                # No templating, no fixes, go with middle and advance indexes
                # Find out how far we can advance (we use the middle version because it's common)
                # The three comparisons below are exhaustive, so buff is
                # always bound before it is used.
                if templ_block[4] == fixed_block[2]:
                    buff = self.file_mask[1][idx[1]:fixed_block[2]]
                    # consume both blocks
                    fixed_block = None
                    templ_block = None
                elif templ_block[4] > fixed_block[2]:
                    buff = self.file_mask[1][idx[1]:fixed_block[2]]
                    # consume fixed block
                    fixed_block = None
                elif templ_block[4] < fixed_block[2]:
                    buff = self.file_mask[1][idx[1]:templ_block[4]]
                    # consume templ block
                    templ_block = None
                # All three cursors move by the same amount because the text
                # is identical in raw, templated and fixed here.
                idx = (idx[0] + len(buff), idx[1] + len(buff),
                       idx[2] + len(buff))
                write_buff += buff
                continue
            elif fixed_block[0] == 'replace':
                # Consider how to apply fixes.
                # Can we implement the fix while staying in the equal segment?
                if fixed_block[2] <= templ_block[4]:
                    # Yes! Write from the fixed version.
                    write_buff += self.file_mask[2][idx[2]:fixed_block[4]]
                    idx = (idx[0] + (fixed_block[2] - fixed_block[1]),
                           fixed_block[2], fixed_block[4])
                    # Consume the fixed block because we've written the whole thing.
                    fixed_block = None
                    continue
                else:
                    raise NotImplementedError("DEF")
            elif fixed_block[0] == 'delete':
                # We're deleting items, nothing to write but we can consume some
                # blocks and advance some indexes.
                idx = (idx[0] + (fixed_block[2] - fixed_block[1]),
                       fixed_block[2], fixed_block[4])
                fixed_block = None
            elif fixed_block[0] == 'insert':
                # We're inserting items, Write from the fix block, but only that index moves.
                write_buff += self.file_mask[2][idx[2]:fixed_block[4]]
                idx = (idx[0], idx[1], fixed_block[4])
                fixed_block = None
            else:
                raise NotImplementedError((
                    "Unexpected opcode {0} for fix block! Please report this "
                    "issue on github with the query and rules you're trying to "
                    "fix.").format(fixed_block[0]))
        elif templ_block[0] == 'replace':
            # We're in a templated section - we should write the templated version.
            # we should consume the whole replace block and then deal with where
            # we end up.
            # Note that the text written is sliced from the raw string
            # (file_mask[0]), up to the raw end of the replace block.
            buff = self.file_mask[0][idx[0]:templ_block[2]]
            new_templ_idx = templ_block[4]

            # Fast forward through fix blocks until we catch up. We're not implementing
            # any changes in a templated section.
            # NOTE(review): the upper bound of the containment test is strict,
            # so when new_templ_idx equals the end of the templated string no
            # fix block can contain it and the loop appears to end in the
            # "depleted" error - confirm.
            while True:
                if fixed_block[2] > new_templ_idx >= fixed_block[1]:
                    # this block contains the end point
                    break
                else:
                    # We're not at the end point yet, continue to fast forward through.
                    if fixed_block[0] != 'equal':
                        print("WARNING: Skipping edit block: {0}".format(
                            fixed_block))
                    if diff_fix_codes:
                        fixed_block = diff_fix_codes.pop(0)
                    else:
                        raise NotImplementedError(
                            "Unexpectedly depleted the fixes. Panic!")
            # Are we exactly on a join?
            if new_templ_idx == fixed_block[1]:
                # GREAT - this makes things easy because we have an equality point already
                idx = (templ_block[2], new_templ_idx, fixed_block[3])
            else:
                if fixed_block[0] == 'equal':
                    # If it's in an equal block, we can use the same offset from the end.
                    idx = (templ_block[2], new_templ_idx,
                           fixed_block[3] + (new_templ_idx - fixed_block[1]))
                else:
                    # TODO: We're trying to move through an templated section, but end up
                    # in a fixed section. We've lost track of indexes.
                    # We might need to panic if this happens...
                    print("UMMMMMM!")
                    print(new_templ_idx)
                    print(fixed_block)
                    raise NotImplementedError("ABC")
            write_buff += buff
            # consume template block
            templ_block = None
        elif templ_block[0] == 'delete':
            # The comparison, things that the templater has deleted
            # some characters. This is just a quirk of the differ.
            # In reality this means we just write these characters
            # and don't worry about advancing the other indexes.
            buff = self.file_mask[0][idx[0]:templ_block[2]]
            # consume templ block
            templ_block = None
            # Only the raw cursor moves.
            idx = (idx[0] + len(buff), idx[1], idx[2])
            write_buff += buff
        elif templ_block[0] == 'insert':
            # The templater has inserted something here. We don't need
            # to write anything here (because whatever we're looking at
            # was inserted by the templater), but we do need to keep
            # track of what happened to the rest of the section we're in.
            # If nothing was fixed then it's easy because the indices
            # will be the same. Otherwise... great question...

            # For now let's just deal with the happy case where the fixed
            # block is equal
            if fixed_block[0] == 'equal':
                # Let's make sure we can consume enough to get through the
                # templ block and not get to the end of the fix block.
                if templ_block[4] <= fixed_block[2]:
                    insert_len = templ_block[4] - templ_block[3]
                    # Skip the inserted text in the templated and fixed
                    # strings; the raw cursor stays where it is.
                    idx = (idx[0], idx[1] + insert_len,
                           idx[2] + insert_len)
                    # if things matched up perfectly, consume the fixed block
                    if templ_block[4] == fixed_block[2]:
                        fixed_block = None
                    # always consume templ block in this case
                    templ_block = None
                else:
                    raise NotImplementedError((
                        "Unexpected scenario during insert opcode! Please report "
                        "this issue on github with the query and rules you're trying "
                        "to fix."))
            else:
                raise NotImplementedError((
                    "Unexpected opcode {0} for fix block! Please report this "
                    "issue on github with the query and rules you're trying to "
                    "fix.").format(fixed_block[0]))
        else:
            raise NotImplementedError((
                "Unexpected opcode {0} for template block! Please report this "
                "issue on github with the query and rules you're trying to "
                "fix.").format(templ_block[0]))

    bencher("fix_string: Fixing loop done")
    # The success metric here is whether anything ACTUALLY changed.
    return write_buff, write_buff != self.file_mask[0]
def _diff_two_v2(cls, base, cmp, label=None):
    """Diff ``cmp`` against ``base`` and lay the result out on the lines of ``base``.

    ``base`` is split into lines on ``'|'`` / ``'\\n'`` (trailing breaks are
    stripped), both texts are run through ``cls.pre_base`` / ``cls.pre_cmp``,
    and the opcodes of ``CSequenceMatcher`` are turned into segments. The
    segments are then redistributed over the base lines, cutting a segment in
    two wherever it crosses a line end, and a line-break segment is appended
    after every line.

    Args:
        base: Reference text; ``'|'`` and ``'\\n'`` both mark line ends.
        cmp: Text compared against ``base``.
        label: Optional dict overriding the keys under which the base and cmp
            texts are stored (defaults: ``'base'`` and ``'cmp'``). Anything
            that is not a dict is ignored.

    Returns:
        A list of dicts with ``'line_no'``, ``'is_same'``, the base text, the
        cmp text and ``'range'`` (``(start, end)`` offsets of the base text
        within its line). Line breaks appear as segments whose base and cmp
        texts are both ``'\\n'``.
    """
    lbl = {'base': 'base', 'cmp': 'cmp'}
    if label and isinstance(label, dict):
        lbl.update(label)
    base_key, cmp_key = lbl['base'], lbl['cmp']

    def line_break(line_no):
        # Segment marking the end of line ``line_no``.
        return {
            'line_no': line_no,
            'is_same': True,
            base_key: '\n',
            cmp_key: '\n',
        }

    # Unlike v1, v2 removes line breaks before comparing so that they do not
    # disturb the diff algorithm (presumably done by pre_base - the helper is
    # not visible here).
    base = base.replace('|', '\n').rstrip('\n')
    base_lines = base.split('\n')
    base = cls.pre_base(base, False)
    cmp = cls.pre_cmp(cmp)

    matcher = CSequenceMatcher(None, base, cmp, autojunk=False)
    segments = [
        {
            'line_no': None,
            'is_same': tag == 'equal',
            base_key: base[i1:i2],
            cmp_key: cmp[j1:j2],
        }
        for tag, i1, i2, j1, j2 in matcher.get_opcodes()
    ]

    # Merge runs of identical text that the diff may have split apart (the
    # original comment attributes this to variant characters).
    for i, seg in enumerate(segments):
        if not seg.get('is_same'):
            continue
        # Walk backwards to the nearest identical segment that has not been
        # merged away yet; stop at the first differing segment.
        j = i - 1
        while j >= 0:
            pre = segments[j]
            if not pre['is_same']:
                break
            if not pre.get('deleted'):
                pre[base_key] += seg[base_key]
                pre[cmp_key] += seg[cmp_key]
                seg['deleted'] = True
                break
            j -= 1
    segments = [seg for seg in segments if not seg.get('deleted')]

    # Distribute the segments over the lines of base.
    line_segments, idx = [], 0
    for i, line in enumerate(base_lines):
        if not line:
            # Empty base line: just emit the line break.
            line_segments.append(line_break(i + 1))
            continue
        # Take len(line) base characters from the segments for line i + 1.
        left_len = len(line)
        while idx < len(segments) and left_len > 0:
            seg = segments[idx]
            seg_len = len(seg[base_key])
            if seg_len <= left_len:
                # The segment fits into what is left of the line: take it whole.
                seg['line_no'] = i + 1
                line_segments.append(seg)
                left_len -= seg_len
                idx += 1
            else:
                # The segment runs past the line end: cut it. The front part
                # closes this line, the rest stays queued for the next line.
                front_base = seg[base_key][:left_len]
                front_cmp = seg[cmp_key][:left_len]
                rest_base = seg[base_key][left_len:]
                # Slicing past the end already yields '', so no length check
                # is needed.
                rest_cmp = seg[cmp_key][left_len:]
                line_segments.append({
                    'line_no': i + 1,
                    'is_same': seg['is_same'],
                    base_key: front_base,
                    cmp_key: front_cmp,
                })
                seg.update({cmp_key: rest_cmp, base_key: rest_base})
                left_len = 0
            if left_len == 0:
                # Line is full: close it with a line break.
                line_segments.append(line_break(i + 1))

    # Segments still unconsumed once every base line is full (typically text
    # that only exists at the end of cmp) used to be dropped silently. Attach
    # them to the last line, in front of its closing line break, so that no
    # part of cmp is lost.
    leftovers = segments[idx:]
    if leftovers:
        for seg in leftovers:
            seg['line_no'] = len(base_lines)
        closes_with_break = (
            bool(line_segments)
            and line_segments[-1]['is_same']
            and line_segments[-1][base_key] == '\n'
        )
        insert_at = len(line_segments) - (1 if closes_with_break else 0)
        line_segments[insert_at:insert_at] = leftovers

    # A differing segment with empty base text that directly follows a line
    # break is moved in front of that break.
    # NOTE(review): the guard is ``i > 1``, so the pair (index 0, index 1) is
    # never examined; ``i > 0`` may have been intended - left unchanged.
    for i, seg in enumerate(line_segments):
        pre = line_segments[i - 1] if i > 1 else {}
        if (seg[base_key] == '' and pre.get('is_same')
                and pre.get(base_key) == '\n'):
            # Swap the contents of the two dicts in place.
            temp = seg.copy()
            seg.update(pre)
            pre.update(temp)

    # Compute each segment's offsets within its line; they restart after
    # every line break.
    start = 0
    for seg in line_segments:
        end = start + len(seg[base_key])
        seg['range'] = (start, end)
        start = 0 if seg['is_same'] and seg[base_key] == '\n' else end

    return line_segments