def find_prefix_suffix(list_seg):
    """ Find common prefix and suffix in list of files :param list_seg: list of filenames to analyse :return: longest prefix and suffix """
    # The first filename is the reference; every other name is matched
    # against it and the running prefix/suffix are narrowed as we go.
    comp_s = SequenceMatcher()
    initial = list_seg[0]
    prefix_fin = None
    suffix_fin = None
    for i in range(1, len(list_seg)):
        comp_s.set_seqs(initial, list_seg[i])
        all_poss = comp_s.get_matching_blocks()
        # A shared prefix exists only when the first matching block is
        # anchored at index 0 of the reference name.
        if all_poss[0].a == 0:
            prefix = initial[0:all_poss[0].size]
        else:
            prefix = ''
        comp_pre = SequenceMatcher()
        if prefix_fin is None:
            prefix_fin = prefix
        # Narrow the accumulated prefix to what it still shares with the
        # prefix found for this pair.
        comp_pre.set_seqs(prefix, prefix_fin)
        pre_poss = comp_pre.get_matching_blocks()
        prefix_fin = prefix[0:pre_poss[0].size]
        # NOTE(review): get_matching_blocks() always ends with a dummy block
        # of size 0, so this condition is always true and the `else` branch
        # below looks unreachable -- confirm intent.
        if all_poss[-1].size == 0:
            # Take the last real matching block as the suffix candidate.
            suffix = initial[all_poss[-2].a:all_poss[-2].a + all_poss[-2].size]
        else:
            suffix = initial[all_poss[-1].a:]
        comp_suf = SequenceMatcher()
        if suffix_fin is None:
            suffix_fin = suffix
        # Narrow the accumulated suffix; suf_poss[-2] is the last real
        # matching block (index -1 is the dummy).
        comp_suf.set_seqs(suffix, suffix_fin)
        suf_poss = comp_suf.get_matching_blocks()
        suffix_fin = suffix[suf_poss[-2].a:]
    return prefix_fin, suffix_fin
def testCDifflibWithBug5Data(self):
    """cdifflib must agree with difflib on the bug #5 data set (autojunk handling)."""
    from . import testdata

    # note: convert both to lists for Python 3.3
    expected = list(
        SequenceMatcher(None, testdata.a5, testdata.b5).get_matching_blocks())
    actual = list(
        CSequenceMatcher(None, testdata.a5, testdata.b5).get_matching_blocks())
    self.assertEqual(expected, actual)
def testCDifflibWithBug5Data(self):
    """Check cdifflib returns the same result for bug #5 (autojunk handling issues)"""
    from . import testdata
    # note: convert both to lists for Python 3.3
    # Reference result from the stdlib implementation.
    sm = SequenceMatcher(None, testdata.a5, testdata.b5)
    difflib_matches = list(sm.get_matching_blocks())
    # Result from the C implementation under test; both must be identical.
    sm = CSequenceMatcher(None, testdata.a5, testdata.b5)
    cdifflib_matches = list(sm.get_matching_blocks())
    self.assertEqual(difflib_matches, cdifflib_matches)
def process(self, response):
    """ Process data :return: str """
    # Compare the current response body against previously seen ones to
    # decide whether it is "the same page" (returns self.RESPONSE_INDEX)
    # or a new page (returns None after recording it).
    if response.status in self.DEFAULT_STATUSES:
        super().process(response)
        length = self.__get_content_length()
        if self.MIN_CONTENT_LENGTH < length:
            # the page is allowed for comparison
            if not self.previous_item:
                # 1st match. Push items for next compare step
                self.previous_item.update({
                    'length': length,
                    'text': self._body
                })
                return None
            else:
                if length == self.previous_item.get(
                        'length') and self.MIN_CONTENT_LENGTH < length:
                    # identical, seems to drop failed for success
                    return self.RESPONSE_INDEX
                else:
                    # Fuzzy comparison against the previously stored body.
                    matcher = SequenceMatcher(a=self.previous_item['text'],
                                              b=self._body)
                    # NOTE(review): result of get_matching_blocks() is
                    # discarded; presumably called only to warm the matcher
                    # before ratio() -- confirm.
                    matcher.get_matching_blocks()
                    if 'length' in self.current_item:
                        next_matcher = SequenceMatcher(
                            a=self.current_item['text'], b=self._body)
                        # Identical similarity to the current item means a
                        # repeat of an already-seen page.
                        if next_matcher.ratio() == matcher.ratio():
                            return self.RESPONSE_INDEX
                    if self.MIN_RATIO_INDEX < matcher.ratio():
                        return self.RESPONSE_INDEX
                    else:
                        # New enough page: remember it for future comparisons.
                        self.current_item.update({
                            'length': length,
                            'text': self._body
                        })
                        if self.MIN_CONTENT_LENGTH < length:
                            self.previous_item.update({
                                'length': length,
                                'text': self._body
                            })
    return None
def align_strings(str1, str2, max_lenght=0):
    """Insert spaces so the matching blocks of both strings line up.

    :param str1: first string
    :param str2: second string
    :param max_lenght: if non-zero, split the aligned strings into chunks of
        at most this many characters and return two lists of chunks
    :return: the two aligned strings (or two lists of chunks)
    """
    from difflib import SequenceMatcher
    sm = SequenceMatcher(lambda x: x in " ")
    sm.set_seqs(str1, str2)
    # While there are matches
    # Rem: the last block is a dummy one, see doc of SequenceMatcher
    while len(sm.get_matching_blocks()) > 1:
        for m in sm.get_matching_blocks():
            # If str1 and str2 are not aligned, pad the one whose block
            # starts earlier, then re-match from scratch.
            if m[0] != m[1]:
                if m[0] < m[1]:
                    str1 = str1[:m[0]] + " " * (m[1] - m[0]) + str1[m[0]:]
                if m[1] < m[0]:
                    str2 = str2[:m[1]] + " " * (m[0] - m[1]) + str2[m[1]:]
                sm.set_seqs(str1, str2)
                break
        else:
            # If all the blocks are for aligned texts
            break
    # Padding at the end of str so that both are the same size.
    # (Bug fix: the previous version padded one character short, so the
    # strings ended up with different lengths when no block matched.)
    if len(str1) < len(str2):
        str1 += " " * (len(str2) - len(str1))
    if len(str2) < len(str1):
        str2 += " " * (len(str1) - len(str2))
    # If we want to split in multiple lines
    if max_lenght != 0:
        ret_str1 = []
        ret_str2 = []
        while len(str1) > max_lenght:
            ret_str1 += [str1[:max_lenght]]
            str1 = str1[max_lenght:]
            ret_str2 += [str2[:max_lenght]]
            str2 = str2[max_lenght:]
        ret_str1 += [str1]
        ret_str2 += [str2]
        return ret_str1, ret_str2
    return str1, str2
def match_question_blocks(question_obj_list):
    """Find shared text spans ("clues") between every pair of questions.

    Returns a tuple (clues, clue_sentences); clues longer than 10 matched
    characters are kept, completed via get_complete_clue, and deduplicated.
    """
    clues = []
    sas = []
    clue_sents = set()
    # get clue and clue sentences in a pairewise manner
    for i in range(len(question_obj_list)):
        for j in range(i + 1, len(question_obj_list)):
            q1, q2 = clean_question(question_obj_list[i]["text"]), \
                clean_question(question_obj_list[j]["text"])
            question_matcher = SequenceMatcher(None, q1, q2)
            matching_blocks = question_matcher.get_matching_blocks()
            # Largest matching block between the two cleaned questions.
            longest_match = sorted(matching_blocks, key=lambda k: k.size,
                                   reverse=True)[0]
            if longest_match.size > 10:
                sa, sb, span = longest_match.a, longest_match.b, longest_match.size
                # print(i, j, longest_match, "q1: ", q1[sa:sa + span],
                #       "q2: ", q2[sb:sb + span])
                clue = q1[sa:sa + span]
                # Extend the raw match to a complete phrase within q1.
                complete_clue = get_complete_clue(sa, sa + span, q1)
                if clue != '':
                    clues.append(complete_clue.strip())
                    sas.append(sa)
                    clue_sent = clean_question(
                        get_clue_sent(question_obj_list[i], sa))
                    if len(clue_sent) > 1:
                        clue_sents.add(clue_sent.strip())
    # deduplicate clues
    if len(clues) > 1:
        clues = deduplicate_clues(clues)
    return clues, list(clue_sents)
def align(sent):
    """Align the raw word of a tagged sentence token with its morpheme/POS
    analysis, producing per-syllable BIO labels.

    `sent` is expected to be "<raw_word> <morph/TAG+morph/TAG...>" -- the
    second field may carry a __NN disambiguation suffix that is stripped.
    """
    tagged = re.split(r'\s+', sent)
    raw_word = tagged[0]
    # Strip the "__<number>" sense-id suffix from the analysis field.
    tagged[1] = re.compile(r'__[0-9]+').sub('', tagged[1])
    # Split on '+' only when preceded by a 2- or 3-letter POS tag.
    tag_morph = re.split("(?<=/[A-Z]{2})\+|(?<=/[A-Z]{3})\+", tagged[1])
    # Concatenation of all morphemes without their POS tags.
    tagged = ''.join([morph_pos[:morph_pos.rfind('/')] for morph_pos in tag_morph])
    fraction = list()
    for morph_tag in tag_morph:
        morph, tag = nltk.str2tuple(morph_tag)
        # First syllable of a morpheme gets B-, the rest I-.
        for i, syl in enumerate(morph):
            if i == 0:
                fraction.append([syl, "B-"+tag])
            else:
                fraction.append([syl, "I-" + tag])
        fraction[-1][1] = fraction[-1][1] + "+"  # append '+' after the tag
    # Drop the trailing '+' of the final tag.
    fraction[-1][1] = fraction[-1][1][:-1]
    print(raw_word,tagged)
    # If surface form equals the morpheme concatenation, alignment is trivial.
    if raw_word == tagged:
        return fraction
    SM = SequenceMatcher(None, raw_word, tagged)
    blocks = list()
    if include_delete(SM):
        blocks = make_del_block(fraction, raw_word, tagged)
    else:
        mat_blocks = SM.get_matching_blocks()
        blocks = generate_block(fraction, mat_blocks)
        if len(mat_blocks) == 1:  # e.g. "온 오/vx+ㄴ/etm": possibly a fully mismatched form
            blocks = make_del_block(fraction, raw_word, tagged)
    print(blocks)
    for cur, nxt in pairwise(blocks):
        raw = raw_word[cur[0]:cur[1]]
        mor = tagged[cur[2]:cur[3]]
        print(raw,mor)
def longest_substring(str1, str2=None, min_match_len=2):
    """Return the in-order shared words of two strings, space-joined.

    If str2 is omitted, or either string is empty, the other string is
    returned unchanged. (min_match_len is currently unused.)
    """
    if str2 is None:
        return str1
    if len(str1) == 0 or len(str2) == 0:
        # Exactly one may be non-empty; return whichever has content.
        return str1 or str2
    words_a = str1.split(" ")
    words_b = str2.split(" ")
    # print (list1, list2)
    # Match word lists rather than raw characters.
    matcher = SequenceMatcher(None, words_a, words_b)
    blocks = matcher.get_matching_blocks()
    if len(blocks) == 0:
        # Defensive: get_matching_blocks always yields a terminating dummy.
        raise ValueError(
            "No matched substrings found, in str1: \"%s\", str2: \"%s\"" %
            (str1, str2))
    shared = []
    for block in blocks:
        shared.extend(words_a[block.a:block.a + block.size])
    return " ".join(shared)
def partial_ratio(s1, s2):
    """
    Return the ratio of the most similar substring
    as a number between 0 and 100.
    """
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
    matcher = SequenceMatcher(None, shorter, longer, autojunk=False)
    # Each matching block (i, j, n) pairs shorter[i:i+n] with longer[j:j+n];
    # a candidate window of `longer` starts where one of those blocks starts.
    # e.g. shorter = "abcd", longer = XXXbcdeEEE, block = (1,3,3)
    scores = []
    for _, window_start, _ in matcher.get_matching_blocks():
        window = longer[window_start:window_start + len(shorter)]
        r = SequenceMatcher(None, shorter, window, autojunk=False).ratio()
        if r > .995:
            return 100
        scores.append(r)
    return max(scores) * 100.0
def _find_matched_module_config(xml_dir_name, file_name):
    """Find the matched module config. e.g. wp7603.xml -> wp76xx.xml"""
    all_modules = os.listdir(xml_dir_name)
    matched_prefix_array = []
    for module_name in all_modules:
        blocks = SequenceMatcher(None, file_name,
                                 module_name).get_matching_blocks()
        # Only a block anchored at the start of BOTH names is a prefix match.
        prefix_len = 0
        for blk in blocks:
            if blk.a == 0 and blk.b == 0:
                prefix_len = blk.size
        matched_prefix_array.append(prefix_len)
    max_matched = max(matched_prefix_array)
    resolved_xml_file = None
    if max_matched >= 4:
        # Match at least four digits prefix. WPXX, ARXX, etc.
        # e.g. wp7604 will match wp76xx.xml
        best_index = matched_prefix_array.index(max_matched)
        resolved_xml_file = os.path.join(xml_dir_name,
                                         all_modules[best_index])
        swilog.warning("{} will be used!".format(resolved_xml_file))
    return resolved_xml_file
def on_pre_save(self, view):
    """Sublime Text hook: trim whitespace before saving, either on the whole
    file (when it matches a configured pattern) or only on lines added since
    the snapshot was taken."""
    settings = sublime.load_settings('Preferences.sublime-settings')
    patterns = settings.get("trim_if_present", [])
    # Trim all whitespace on my files.
    if any(view.find(pattern, 0, sublime.IGNORECASE) for pattern in patterns):
        view.run_command("erase_whitespace", {})
        return
    if view.id() not in snapshots:
        print("No snapshot present to compare")
        return
    # Trim whitespace on any new files.
    old = snapshots[view.id()].split('\n')
    new = view.substr(sublime.Region(0, view.size())).split('\n')
    # Remove the line numbers that were present before.
    new_lines = set(range(len(new)))
    sm = SequenceMatcher(None, old, new)
    # Every line covered by a matching block already existed in the snapshot,
    # so only the leftover indexes are "new" lines.
    for i, j, n in sm.get_matching_blocks():
        for k in range(j, j + n):
            new_lines.remove(k)
    # Trim the whitespace on the new lines:
    if new_lines:
        new_lines = ','.join(str(n) for n in new_lines)
        view.run_command("process_new_lines", dict(new_lines=new_lines))
def main(iob_file: Path):
    """Evaluate the huspacy sentencizer against gold sentences from an IOB
    file: print sentence-level accuracy and a context diff of mismatches."""
    sentences = []
    with iob_file.open() as f:
        for line in f:
            match = SENT_PATTERN.match(line)
            if match is not None:
                # Text after the sentence marker is the gold sentence.
                sentence: str = line[match.end():].strip()
                sentences.append(sentence)
    nlp = Hungarian()
    # noinspection PyUnresolvedReferences
    from huspacy.components import HunSentencizer
    nlp.add_pipe("hun_sentencizer")
    # Re-split the concatenated gold text with the predicted sentencizer.
    doc: Doc = nlp(" ".join(sentences))
    predicted_sents = [str(s) + "\n" for s in doc.sents]
    sentences = [s + "\n" for s in sentences]
    # Accuracy = fraction of gold sentences reproduced exactly (matching
    # whole-sentence blocks between gold and predicted).
    seqmatcher = SequenceMatcher(None, sentences, predicted_sents)
    accuracy = sum(mb.size for mb in seqmatcher.get_matching_blocks()) / len(sentences)
    print(f"Accuracy: {accuracy:.2%}\n\n")
    diffs = list(
        context_diff(sentences, predicted_sents, fromfile="gold",
                     tofile="predicted", n=0))
    sys.stdout.writelines(diffs)
def getScore(self, str1, str2, limit=0.8):
    """Score the word-level similarity between two phrases.

    Returns (seq_score, nb_match, score, score_diff, similarity, is_similar).
    NOTE(review): the `limit` parameter is unused -- the method reads
    self.limit instead; confirm which one is intended.
    """
    score_diff = 1
    if (str1 != str2):
        # Self-scores of each phrase; recursion terminates because
        # getScore(x, x, ...) takes the str1 == str2 branch.
        str1_score = self.getScore(str1, str1, self.limit)[2]
        str2_score = self.getScore(str2, str2, self.limit)[2]
        score_diff = abs(str1_score - str2_score) if abs(str1_score - str2_score) != 0 else 1
    interests = str1.split(" ")
    keywords = str2.split(" ")
    s = SequenceMatcher(None)
    seq_score = 0
    nb_match = 0
    score = 0
    for interest in interests:
        s.set_seq2(interest)
        for keyword in keywords:
            s.set_seq1(keyword)
            # A "match": high enough ratio and a single real matching block
            # (get_matching_blocks returns that block plus the dummy).
            b = s.ratio() >= self.limit and len(
                s.get_matching_blocks()) == 2
            seq_score += s.ratio()
            if b:
                nb_match += 1
    # Matches are weighted heavily (5th power) relative to raw ratios.
    score = math.pow(nb_match, 5) * seq_score
    similarity = round(score * nb_match / score_diff)
    is_similar = similarity >= 1
    return (seq_score, nb_match, score, score_diff, similarity, is_similar)
def get_match(haystack, needle):
    """If enough of `needle` appears in `haystack` (more than 90% of its
    characters across all matching blocks), return the prefix of `needle`
    covered by the block starting at its first character; otherwise None."""
    if len(haystack.strip()) < len(needle):
        return None
    from difflib import SequenceMatcher
    matcher = SequenceMatcher(a=needle, b=haystack)
    blocks = matcher.get_matching_blocks()
    covered = sum(size for _, _, size in blocks)
    if covered > 0.9 * len(needle):
        for needle_idx, _, size in blocks:
            # Only a non-empty block anchored at the start of needle counts.
            if needle_idx == 0 and size > 0:
                return needle[:size]
    return None
def similar(string: str, sub: str) -> float:
    """Fraction of `sub`'s characters covered by matching blocks against
    `string` (1.0 means every character of sub was matched in order)."""
    matcher = SequenceMatcher(None, string, sub)
    matched = sum(block.size for block in matcher.get_matching_blocks())
    return matched / len(sub)
def partial_ratio(s1, s2):
    """Return, as an int in 0..100, how well the shorter string matches its
    best-aligned substring window of the longer string.

    :param s1: first string
    :param s2: second string
    :raises TypeError: if either argument is None
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    if len(s1) <= len(s2):
        shorter = s1
        longer = s2
    else:
        shorter = s2
        longer = s1
    m = SequenceMatcher(None, shorter, longer)
    blocks = m.get_matching_blocks()
    # each block represents a sequence of matching characters in a string
    # of the form (idx_1, idx_2, len)
    # the best partial match will block align with at least one of those blocks
    # e.g. shorter = "abcd", longer = XXXbcdeEEE
    # block = (1,3,3)
    # best score === ratio("abcd", "Xbcd")
    scores = []
    for block in blocks:
        # Bug fix: clamp the window start at 0. Without the clamp,
        # idx_2 - idx_1 can be negative and the negative slice start wraps
        # around the string, scoring against the wrong (often empty) window.
        long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0
        long_end = long_start + len(shorter)
        long_substr = longer[long_start:long_end]
        m2 = SequenceMatcher(None, shorter, long_substr)
        r = m2.ratio()
        if r > .995:
            return 100
        else:
            scores.append(r)
    return int(100 * max(scores))
def partial_ratio(s1, s2):
    """Return the ratio of the most similar substring as a number between 0 and 100."""
    s1, s2 = utils.make_type_consistent(s1, s2)
    if len(s1) <= len(s2):
        shorter, longer = s1, s2
    else:
        shorter, longer = s2, s1
    blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()
    # A block (i, j, n) aligns shorter[i:i+n] with longer[j:j+n]; the best
    # partial match aligns with one of those blocks, so the candidate window
    # in `longer` starts at j - i (clamped to 0).
    # e.g. shorter = "abcd", longer = XXXbcdeEEE, block = (1,3,3)
    # best score === ratio("abcd", "Xbcd")
    scores = []
    for i, j, _ in blocks:
        start = max(j - i, 0)
        window = longer[start:start + len(shorter)]
        r = SequenceMatcher(None, shorter, window).ratio()
        if r > .995:
            return 100
        scores.append(r)
    return utils.intr(100 * max(scores))
def get_align_indexes(seqmatch: SequenceMatcher):
    """Get indexes for matching and nonmatching parts of two token tuples (from SequenceMatcher)."""

    class MatchIndexes(object):
        """Start/end indexes for a matching block of sequences a and b, with a match indicator."""

        def __init__(self, a_i: int, a_j: int, b_i: int, b_j: int, match: bool):
            """[ab]i: Start index, [ab]j: End index, match: Is this a matching tuple or not?"""
            self.ai, self.aj = a_i, a_j
            self.bi, self.bj = b_i, b_j
            self.match = match

        def __repr__(self):
            attr_reprs = [
                f'{k}: {v}' for k, v in self.__dict__.items()
                if not k.startswith('__')
            ]
            return f'MatchIndexes({", ".join(attr_reprs)})'

    matchblocks = seqmatch.get_matching_blocks()
    align_indexes = []
    # For each consecutive pair of blocks, emit the matching span followed by
    # the mismatching gap up to the next block (the trailing dummy block makes
    # the final gap empty or cover the tail).
    for current, following in zip(matchblocks, matchblocks[1:]):
        ai = current.a  # Indexes from the a side
        aj = ai + current.size
        ak = following.a
        bi = current.b  # Indexes from the b side
        bj = bi + current.size
        bk = following.b
        align_indexes.append(MatchIndexes(ai, aj, bi, bj, match=True))
        align_indexes.append(MatchIndexes(aj, ak, bj, bk, match=False))
    if not align_indexes:
        # Bug fix: with no real matching block, matchblocks is just the dummy
        # (len_a, len_b, 0) and the indexing below raised IndexError. Treat
        # the whole of both sequences as one mismatching span (or nothing,
        # when both sequences are empty).
        dummy = matchblocks[-1]
        if dummy.a or dummy.b:
            return [MatchIndexes(0, dummy.a, 0, dummy.b, match=False)]
        return []
    # Fill in any missing mismatches at the beginning
    if align_indexes[0].ai > 0 or align_indexes[0].bi > 0:
        new_aj, new_bj = align_indexes[0].ai, align_indexes[0].bi
        align_indexes = [MatchIndexes(0, new_aj, 0, new_bj, match=False)
                         ] + align_indexes
    return align_indexes
def match(self):
    """Count matching n-gram blocks longer than the threshold and write the
    count to the report."""
    matcher = SequenceMatcher(None, self.textAgrams, self.textBgrams)
    big_blocks = [blk for blk in matcher.get_matching_blocks()
                  if blk.size > self.threshold]
    report.write('Number of sentences quoted = %s ' % len(big_blocks))
    report.write('\n\n\n')
def s(s1, s2):
    # Partial-match test (Python 2): returns 1 when some aligned window of s2
    # matches s1 with ratio > 0.8, 0 otherwise; returns None (bare return)
    # for empty inputs or when s1 is longer than s2.
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    if len(s1) == 0 or len(s2) == 0 or len(s1) > len(s2):
        return
    shorter = s1
    longer = s2
    m = SequenceMatcher(None, shorter, longer)
    blocks = m.get_matching_blocks()
    #print blocks
    # NOTE(review): `scores` is never filled (appends are commented out);
    # leftover from an earlier max-score version -- confirm it can be removed.
    scores = []
    for block in blocks:
        # Candidate window of `longer` aligned with this matching block
        # (start clamped at 0).
        long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0
        long_end = long_start + len(shorter)
        long_substr = longer[long_start:long_end]
        # print long_substr
        m2 = SequenceMatcher(None, shorter, long_substr)
        if m2.ratio()>0.8:
            print shorter + " : " + long_substr
            print m2.ratio()
            print
            return 1
        #r = m2.ratio()
        # print r
        #scores.append(r)
    return 0;
def substrings_en_comun(str1, str2, longitud_min=10):
    """Return the substrings shared by both input texts that reach a minimum
    length.

    :param str1: first input text
    :param str2: second input text
    :param longitud_min: minimum number of characters a shared substring must
        have to be included
    :return: list of qualifying shared substrings (empty when none qualify)
    """
    matcher = SequenceMatcher(None, str1, str2)
    # Each matching block looks like Match(a=0, b=0, size=5); keep only those
    # long enough and materialize them from str1.
    return [
        str1[blk.a:blk.a + blk.size]
        for blk in matcher.get_matching_blocks()
        if blk.size >= longitud_min
    ]
def calc_similarity(s_standard, s_candidate):
    """Similarity of `s_standard` to `s_candidate` in 0..1.

    Plain ratio when the standard is at least as long as the candidate;
    otherwise a partial-ratio style best-window score. Returns 0 for None.
    """
    if s_standard is None or s_candidate is None:
        return 0
    matcher = SequenceMatcher(None, s_standard, s_candidate)
    if len(s_standard) >= len(s_candidate):
        return matcher.ratio()
    # each block represents a sequence of matching characters in a string
    # of the form (idx_1, idx_2, len)
    # the best partial match will block align with at least one of those blocks
    # e.g. shorter = "abcd", longer = XXXbcdeEEE
    # block = (1,3,3)
    # best score === ratio("abcd", "Xbcd")
    scores = []
    for i, j, _ in matcher.get_matching_blocks():
        start = j - i if (j - i) > 0 else 0
        window = s_candidate[start:start + len(s_standard)]
        scores.append(SequenceMatcher(None, s_standard, window).ratio())
    return max(scores)
def _compare_lines(self, la, lb):
    """Diff two lists of lines and wrap the differing character runs in
    <em class="str-diff"> markers; returns the two marked-up line lists.

    NOTE(review): `Counter` here is a project helper with progress()/
    slice_diff()/slice_match()/next() methods, not collections.Counter --
    confirm the import at file top.
    """
    sa = '\n'.join(la)
    sb = '\n'.join(lb)
    ta_result = ''
    tb_result = ''
    str_diff_start = '<em class="str-diff">'
    str_diff_end = '</em>'
    s = SequenceMatcher(None, sa, sb)
    cnt_a = Counter()
    cnt_b = Counter()
    for block in s.get_matching_blocks():
        (a_idx, b_idx, nmatch) = block
        # NOTE(review): debug print left in production path -- confirm removal.
        print("a[%d] and b[%d] match for %d elements" % block)
        # Advance both cursors to this matching block.
        cnt_a.progress(a_idx, nmatch)
        cnt_b.progress(b_idx, nmatch)
        # Differing run before the block, and the matching run itself.
        diff_a = cnt_a.slice_diff(sa)
        same_a = cnt_a.slice_match(sa)
        diff_b = cnt_b.slice_diff(sb)
        same_b = cnt_b.slice_match(sb)
        if diff_a or diff_b:
            ta_result += self._enclose(str_diff_start, diff_a, str_diff_end, consider_newline = True)
        ta_result += same_a
        if diff_a or diff_b:
            tb_result += self._enclose(str_diff_start, diff_b, str_diff_end, consider_newline = True)
        tb_result += same_b
        cnt_a.next()
        cnt_b.next()
    return (ta_result.split('\n'), tb_result.split('\n'))
def partial_ratio(s1, s2):
    """Best partial-match score (int, 0..100) of the shorter string against
    an aligned window of the longer one.

    Raises TypeError when either argument is None.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
    matcher = SequenceMatcher(None, shorter, longer)
    # A matching block (i, j, n) pairs shorter[i:i+n] with longer[j:j+n];
    # the best-aligned window of `longer` begins at j - i (never negative).
    # e.g. shorter = "abcd", longer = XXXbcdeEEE, block = (1,3,3)
    # best score === ratio("abcd", "Xbcd")
    best = 0.0
    for i, j, _ in matcher.get_matching_blocks():
        start = j - i if j > i else 0
        window = longer[start:start + len(shorter)]
        r = SequenceMatcher(None, shorter, window).ratio()
        if r > .995:
            return 100
        best = max(best, r)
    return int(100 * best)
def s(s1, s2):
    # Partial-match test (Python 2): 1 if some aligned window of s2 matches
    # s1 with ratio > 0.8, else 0; bare return (None) for empty inputs or
    # when s1 is longer than s2.
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    if len(s1) == 0 or len(s2) == 0 or len(s1) > len(s2):
        return
    shorter = s1
    longer = s2
    m = SequenceMatcher(None, shorter, longer)
    blocks = m.get_matching_blocks()
    #print blocks
    # NOTE(review): `scores` is never populated (appends commented out) --
    # confirm this leftover can be removed.
    scores = []
    for block in blocks:
        # Window of `longer` aligned with this block (start clamped at 0).
        long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0
        long_end = long_start + len(shorter)
        long_substr = longer[long_start:long_end]
        # print long_substr
        m2 = SequenceMatcher(None, shorter, long_substr)
        if m2.ratio() > 0.8:
            print shorter + " : " + long_substr
            print m2.ratio()
            print
            return 1
        #r = m2.ratio()
        # print r
        #scores.append(r)
    return 0
def extract_dynamic_content_marking(seq1: str,
                                    seq2: str,
                                    autojunk: bool = True,
                                    isjunk=None,
                                    border_length=20
                                    ) -> typing.List[typing.Tuple[str, str]]:
    """For each 'large' matching block (size >= border_length), pair it with
    the next large block and record a (prefix, suffix) marking taken from
    seq1: the last border_length chars of the earlier block and the first
    border_length chars of the later one."""
    remaining = list(
        SequenceMatcher(isjunk, seq1, seq2, autojunk).get_matching_blocks())
    markings = []
    while remaining:
        block = remaining.pop(0)
        if block.size < border_length:
            continue
        if not remaining:
            break
        # Next sufficiently-large block after the current one, if any.
        follower = next(
            (b for b in remaining if b.size >= border_length), None)
        if follower is not None:
            prefix = seq1[block.a:block.a + block.size][-border_length:]
            suffix = seq1[follower.a:follower.a + follower.size][:border_length]
            markings.append((prefix, suffix))
    return markings
def plagerised_ratio(filename1, filename2): tokens1 = tokenize( filename1 ) #(elements of cleaned up code, their position in original code, position in cleaned up code) file1 = toText( tokens1 ) #cleaned up code - greatly increases effectiveness of plagiarism checker tokens2 = tokenize(filename2) file2 = toText(tokens2) SM = SequenceMatcher(None, file1, file2) similarity_ratio = SM.ratio() print(similarity_ratio) # ratio of plagiarised content blocks = list(SM.get_matching_blocks( )) #elements of blocks[] - (start-file1, start-file2, length) blocks = blocks[:-1] f1 = open(filename1, "r") for i in blocks: flag = 0 for j in range(len(tokens1)): if tokens1[j][2] == i[ 0]: #linking start of matching block to position in cleaned up code start = tokens1[j][ 1] #linking position in cleaned up code to position in original code file flag = 1 if tokens1[j][2] == (i[0] + i[2] - 1): #linking end to cleaned up code end = tokens1[j][1] #linking to original code file break if not flag == 0 and ( end - start ) > 100: #printing significant blocks of plagiarized content #the start and end of matching blocks is linked to the original code to properly mark the plagiarized content f1.seek(start, 0) print(f1.read(end - start))
def html_diff(str1, str2, max_lenght=80, html_same_class="blue", html_diff_class="red"):
    """Align two strings and wrap matching/differing runs of each aligned
    chunk in colored <span> tags; returns two lists of HTML fragments."""
    from difflib import SequenceMatcher
    str1, str2 = align_strings(str1, str2, max_lenght)
    sm = SequenceMatcher(lambda x: x in " ")
    same_span = "<span style='color: %s'>" % html_same_class
    diff_span = "<span style='color: %s'>" % html_diff_class
    clos_span = "</span>"
    ret_str1 = []
    ret_str2 = []
    for line1, line2 in zip(str1, str2):
        out1 = ""
        out2 = ""
        done = 0
        sm.set_seqs(line1, line2)
        for start1, start2, size in sm.get_matching_blocks():
            # Differing run up to this block, then the matching run itself.
            out1 += diff_span + line1[done:start1] + clos_span
            out1 += same_span + line1[start1:start1 + size] + clos_span
            out2 += diff_span + line2[done:start2] + clos_span
            out2 += same_span + line2[start2:start2 + size] + clos_span
            done = start1 + size
        ret_str1.append(out1)
        ret_str2.append(out2)
    return ret_str1, ret_str2
def diff_a_soup(s1, s2):
    """Flatten two parsed documents into line lists via rec_soup, diff them
    with SequenceMatcher and return (list1, list2, matching_blocks)."""
    # print(s1.prettify())
    sx_ind_limit = None
    ind = 0
    tabs = ''
    s1_list = []
    s1_ind_max = 0
    # Recursively flatten the first soup into a list of lines.
    s1_list, s1_ind_max = rec_soup(s1, ind, s1_ind_max, sx_ind_limit, tabs, s1_list)
    s1_list_len = len(s1_list)
    ind = 0
    tabs = ''
    s2_list = []
    s2_ind_max = 0
    # Same flattening for the second soup.
    s2_list, s2_ind_max = rec_soup(s2, ind, s2_ind_max, sx_ind_limit, tabs, s2_list)
    s2_list_len = len(s2_list)
    seq = SequenceMatcher(None, s1_list, s2_list)
    # if s1_list_len <= s2_list_len:
    #     seq = SequenceMatcher(None, s1_list, s2_list)
    # else:
    #     seq = SequenceMatcher(None, s2_list, s1_list)
    match_block = seq.get_matching_blocks()
    print('Length of s1: ' + str(s1_list_len))
    print('s1 ind max: ' + str(s1_ind_max))
    print('Length of s2: ' + str(s2_list_len))
    print('s2 ind max: ' + str(s2_ind_max))
    print('Number of matched blocks: ' + str(len(match_block)))
    # pprint(match_block)
    return s1_list, s2_list, match_block
def print_diffs(expected, actual):
    # Print where `actual` first diverges from `expected` (Python 2).
    # Only the first matching block is reported (ctr == 0 breaks immediately).
    a = expected
    b = actual
    s = SequenceMatcher(None, a, b)
    print '\n'
    ctr = 0
    for block in s.get_matching_blocks():
        apos = block[0]
        # NOTE(review): bpos uses block[0] rather than block[1]; looks like
        # a copy/paste slip -- confirm whether block[1] was intended.
        bpos = block[0]
        aendpos = apos + block[2]
        bendpos = bpos + block[2]
        achunk = expected[apos:aendpos]
        bchunk = actual[bpos:bendpos]
        # print "a[%d] and b[%d] match for %d elements" % block
        print '\nACTUAL has matching Error at ' + str(aendpos)
        print 'Expected =' + expected[
            bendpos:bendpos + 100] + '\nFound =' + actual[aendpos:aendpos + 100]
        print 'Matched values from 0 to ' + str(aendpos - 1) + ' are'
        print ' EXPECTED=' + bchunk
        print ' ACTUAL =' + achunk
        print ''
        if ctr == 0:
            break
        else:
            ctr += 1
def fuzzy_partial(s1, s2):
    """
    Helper method to compare similarity of two strings.
    Adapted and improved from:
    http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
    :param s1: string 1
    :param s2: string 2
    :return: similarity ratio
    """
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
    best = None
    for short_idx, long_idx, _ in SequenceMatcher(
            None, shorter, longer).get_matching_blocks():
        # Window of `longer` aligned with this block (start clamped to 0).
        start = max(long_idx - short_idx, 0)
        window = longer[start:start + len(shorter)]
        rat = SequenceMatcher(None, shorter, window).ratio()
        if rat > .995:
            return 1
        if best is None or rat > best:
            best = rat
    return best
def create_dict_from_raw_dataset(string):
    """Parse one raw dataset record (Python 2): resolve subject/object names
    via the Freebase topic API, pick the evidence sentence most similar to
    the organisation name, and majority-vote the judgments."""
    # SECURITY: eval() on the raw record trusts the dataset completely --
    # ast.literal_eval would be safer for untrusted input.
    dictionary=eval(string)
    snippet=dictionary["evidences"][0]["snippet"]
    # api_key= "XXXXX"
    service_url = 'https://www.googleapis.com/freebase/v1/topic'
    params = {
        'key': api_key,
        "filter":"suggest"
    }
    # Resolve the subject topic id to a human-readable name.
    topic_id=dictionary["sub"]
    url = service_url + topic_id + '?' + urllib.urlencode(params)
    topic = json.loads(urllib.urlopen(url).read())
    # try:
    name=topic['property']["/type/object/name"]["values"][0]["text"]
    # except:
    #     keys=[key in topic]
    #     print keys
    # Resolve the object topic id (the organisation).
    topic_id=dictionary["obj"]
    url = service_url + topic_id + '?' + urllib.urlencode(params)
    topic = json.loads(urllib.urlopen(url).read())
    org=topic['property']["/type/object/name"]["values"][0]["text"]
    pseudo_list=tokenize.sent_tokenize(snippet.decode('utf-8'))
    sentence_list=[]
    # Replace pronouns with the resolved subject name in every sentence.
    for sentence in pseudo_list:
        words=sentence.split()
        pronouns=["He","She","he","she"]
        sent=""
        for word in words:
            if(word in pronouns):
                sent=sent+" "+name
            else:
                sent=sent+" "+word
        sent=sent[1:]
        sentence_list.append(sent)
    # Pick the sentence whose matching blocks against the organisation name
    # cover the most characters.
    max_score=0
    for sentence in sentence_list:
        s=SM(None, sentence, org)
        score=sum(n for i,j,n in s.get_matching_blocks())
        if(score>max_score):
            max_score=score
            final_sentence=sentence
    # Majority vote over human judgments -> binary label.
    yes_count=0
    no_count=0
    for judgment in dictionary["judgments"]:
        if(judgment["judgment"]=="yes"):
            yes_count=yes_count+1
        else:
            no_count=no_count+1
    if(yes_count>no_count):
        value=1
    else:
        value=0
    final_dictionary={}
    final_dictionary["value"]=value
    final_dictionary["sentence"]=final_sentence
    final_dictionary["name"]=name
    final_dictionary["organisation"]=org
    print final_dictionary
    return final_dictionary
class StoryComparator:
    """ Compares two stories by massaging the data and doing a glorified diff, using Python's difflib.SequenceMatcher(). The class is optimized around comparing one piece of control content to multiple pieces of variable content. """

    def __init__( self, original ):
        """ The constructor takes the control content as a file (or StringIO) object, and reads and tokenizes it. """
        self._orig = original.read()
        # Token tuples for the control content; x[0] is assumed to be the
        # token text and x[1] its character offset -- see _winnow.
        self.orig = list( tokenize( self._orig ) )
        self.punk = [ x[0] for x in self.orig ]
        # Junk = whitespace tokens; seq2 (the control side) is set once and
        # reused across compare() calls for efficiency.
        self.sm = SequenceMatcher( lambda x: x in " \t\n\r" )
        self.sm.set_seq2( self.punk )

    def compare( self, variable ):
        """ compare() takes a piece of variable content. """
        # Returns [(lineno, winnowed_matches), ...] for lines with matches.
        results = []
        for lineno, line in enumerate( variable ):
            line_tk = tokenize( line )
            self.sm.set_seq1( [ x[ 0 ] for x in line_tk ] )
            winnowed = self._winnow( self.sm.get_matching_blocks() )
            if winnowed:
                results.append( ( lineno + 1, winnowed ) )
        return results

    def _winnow( self, matches ):
        # Keep only significant matches (6+ tokens) and annotate each with
        # its location and text in the original control content.
        winnowed = []
        prev = None
        for m in matches:
            # this is the dummy match at the end
            if m[ 2 ] == 0:
                continue
            if m[ 2 ] < 6:
                continue
            orig_beg_pos = self.orig[ m[1] ][1]
            orig_end_pos = self.orig[ m[1]+m[2] ][1]
            # XXX: here, the string length is getting conflated with the
            # word/token length. BUT I HAVE THE POSITION OF THE LAST TOKEN
            g = { 'var': m[0], 'orig': m[1], 'len': m[2],
                  'orig_beg_pos': orig_beg_pos,
                  'orig_end_pos': orig_end_pos,
                  'orig_string': self._orig[ orig_beg_pos:orig_end_pos ],
                  'words': self.punk[ m[1]:m[1]+m[2] ] }
            prev = m
            winnowed.append( g )
        return winnowed
def matchingString(s1, s2):
    '''Compare 2 sequence of strings and return the matching sequences concatenated'''
    from difflib import SequenceMatcher
    blocks = SequenceMatcher(None, s1, s2).get_matching_blocks()
    # Materialize every matching block from s1 and glue them together.
    return "".join(s1[start:start + size] for start, _, size in blocks)
def showDiff(self, before, after):
    """ Compute the diff and highlight changed parts. Parameters ---------- self : QWidget before : str Original text. after : str Edited text. """
    beforeCursor = self.beforePTE.textCursor()
    afterCursor = self.afterPTE.textCursor()
    textFormat = QTextCharFormat()
    # delete any previous background
    textFormat.setBackground(QBrush(QColor('transparent')))
    beforeCursor.mergeCharFormat(textFormat)
    afterCursor.mergeCharFormat(textFormat)
    self.beforePTE.setPlainText(before)
    self.afterPTE.setPlainText(after)
    # get matching sequences
    sm = SequenceMatcher(a=before, b=after)
    i, j, k = 0, 0, 0
    # highlight mismatching sequences
    # NOTE: [ii:ii+kk] and [jj:jj+kk] are the matching sequences for the
    # first and second string, while [i+k:ii] and [j+k:jj] are the
    # mismatching ones
    for ii, jj, kk in sm.get_matching_blocks():
        # highlight with red the removed parts in the before text
        beforeCursor.setPosition(i + k)
        beforeCursor.movePosition(
            QTextCursor.Right, QTextCursor.KeepAnchor, ii - i - k)
        textFormat.setBackground(QBrush(QColor('#F99')))
        beforeCursor.mergeCharFormat(textFormat)
        # highlight with green the added parts in the after text
        afterCursor.setPosition(j + k)
        afterCursor.movePosition(
            QTextCursor.Right, QTextCursor.KeepAnchor, jj - j - k)
        textFormat.setBackground(QBrush(QColor('#CFC')))
        afterCursor.mergeCharFormat(textFormat)
        # remember this block so the next iteration can locate the gap
        i, j, k = ii, jj, kk
def compare(snippet1, snippet2):
    """Yield (offset, size) for every matching run of 5+ characters between
    the two snippets' lowercased texts (whitespace treated as junk)."""
    # TODO: convert punct and stuff to spaces with translate so it doesn't screw
    # up offsets
    matcher = SequenceMatcher(lambda x: x in string.whitespace)
    matcher.set_seq1(snippet1.unfscked_text().lower())
    matcher.set_seq2(snippet2.unfscked_text().lower())
    # Note that the last block will always be of size 0
    for start_a, _, size in matcher.get_matching_blocks():
        if size >= 5:
            yield start_a, size
def compare(textA, textB):
    """Summarize the difference between two texts as a dict with keys
    'changed', 'growth', 'lenA', 'lenB' and 'similarity'."""
    len_a = len(textA)
    len_b = len(textB)
    matcher = SequenceMatcher(None, textA, textB)
    blocks = matcher.get_matching_blocks()
    matched = sum(block[2] for block in blocks)
    # The terminating dummy block carries (len(textA), len(textB), 0), so
    # this denominator is the length of the longer text.
    denominator = max(blocks[-1][0], blocks[-1][1])
    return {
        'changed': (1 - float(matched) / denominator),
        'growth': float(len_b - len_a) / float(len_a),
        'lenA': len_a,
        'lenB': len_b,
        'similarity': matcher.ratio(),
    }
def annotate_filter(name, completions):
    """Yield completions that fuzzily match `name`, annotated with a 'score'
    (substring similarity damped by depth) and a 'markup' string where
    matching runs are bolded."""
    maxdepth = max(map(lambda c: c["depth"], completions)) + 1
    lenn = len(name)
    for c in completions:
        cname = c["name"]
        if lenn > len(cname):
            continue  # ignore shorter completions
        # if there is no identifier, give all equal chance
        if lenn == 0:
            score = 1
            c["markup"] = cname
        else:
            m = SequenceMatcher(None, name.lower(), cname.lower())
            blocks = list(m.get_matching_blocks())
            if len(blocks) < 2:
                continue  # no matches
            # run again, up to the last matching char, to have substring-scores
            last = blocks[-2]
            lastchar = last.b + last.size
            m = SequenceMatcher(None, name.lower(), cname[:lastchar].lower())
            # T/M follow the ratio() definition: total length vs matched count.
            T = lenn + lastchar
            M = 0
            for tag, i1, i2, j1, j2 in m.get_opcodes():
                # TODO: score upper/lower casing differently from normal "replace"
                if tag == "equal":
                    M += i2 - i1
            if M < lenn:
                continue  # not all chars are included
            score = 2.0*M / T
            # XXX: gtksourceview does not apply highlights that include the first char,
            # hence the leading zero-width space.
            markup = "\u200B"
            laststart = 0
            # Bold each matching run of the completion name.
            for _, start, mlen in blocks:
                markup += cname[laststart:start]
                if mlen > 0:
                    markup += "<b>" + cname[start:start+mlen] + "</b>"
                laststart = start + mlen
            c["markup"] = markup
        # deeper completions should rank lower
        depth = 1 - c["depth"] / maxdepth
        c["score"] = score * depth
        yield c
def find_overlaps(seq1, seq2):
    """Return difflib's matching blocks between two sequences.

    Each element is a Match(a, b, size) triple; the list always ends with the
    zero-size sentinel (len(seq1), len(seq2), 0).

    Related tools:
    https://docs.python.org/3/library/difflib.html#sequencematcher-objects
    https://pypi.python.org/pypi/pydna/0.9.9
    https://pypi.python.org/pypi/biopython
    http://emboss.sourceforge.net/apps/release/6.6/emboss/apps/
    http://emboss.sourceforge.net/apps/release/6.6/emboss/apps/diffseq.html
    http://emboss.sourceforge.net/apps/release/6.6/emboss/apps/seqmatchall.html
    http://emboss.sourceforge.net/apps/release/6.6/emboss/apps/wordcount.html
    """
    return SequenceMatcher(a=seq1, b=seq2).get_matching_blocks()
def generateRedirectRegExp(self, firstLocation, secondLocation):
    """Build a regex matching both redirect locations.

    The parts common to both locations are kept literally (escaped) and the
    differing stretches are replaced by '.*' wildcards. Returns None when
    either location is missing.
    """
    if firstLocation is None or secondLocation is None:
        return None
    matcher = SequenceMatcher(None, firstLocation, secondLocation)
    fragments = [
        firstLocation[start:start + size]
        for start, _, size in matcher.get_matching_blocks()
        if size != 0  # skip empty blocks, including the trailing sentinel
    ]
    return "^.*{0}.*$".format(".*".join(re.escape(part) for part in fragments))
def process(self, response):
    """
    Process a response and decide whether it repeats a previously seen page.

    Compares the current body against the previously stored one — first by
    exact content length, then by difflib similarity ratio — and returns the
    response index when the pages look identical.

    :param response: response object with a .status attribute (project type)
    :return: self.RESPONSE_INDEX when the page is considered a repeat,
        otherwise None
    """
    if response.status in self.DEFAULT_STATUSES:
        super().process(response)
        length = self.__get_content_length()
        if self.MIN_CONTENT_LENGTH < length:  # the page is allowed for comparison
            if not self.previous_item:
                # 1st match. Push items for next compare step
                self.previous_item.update({'length': length, 'text': self._body})
                return None
            else:
                # FIX: dropped the redundant `and self.MIN_CONTENT_LENGTH < length`
                # re-check — we are already inside that guard.
                if length == self.previous_item.get('length'):
                    # identical, seems to drop failed for success
                    return self.RESPONSE_INDEX
                else:
                    # FIX: removed a dead `matcher.get_matching_blocks()` call
                    # whose result was discarded (ratio() computes the blocks
                    # itself), and ratio() is now computed once, not twice.
                    matcher = SequenceMatcher(a=self.previous_item['text'], b=self._body)
                    ratio = matcher.ratio()
                    if 'length' in self.current_item:
                        next_matcher = SequenceMatcher(a=self.current_item['text'], b=self._body)
                        if next_matcher.ratio() == ratio:
                            return self.RESPONSE_INDEX
                    if self.MIN_RATIO_INDEX < ratio:
                        return self.RESPONSE_INDEX
                    else:
                        self.current_item.update({'length': length, 'text': self._body})
        if self.MIN_CONTENT_LENGTH < length:
            self.previous_item.update({'length': length, 'text': self._body})
    return None
def _align(self, dat_sent0, dat_nsp0, t_sent0, t_recv0, t_nsp0, i_nsp0): dat_sent = []; dat_nsp = []; t_sent = []; t_recv = [] t_nsp = []; i_nsp = [] diff = SequenceMatcher(None, dat_sent0, dat_nsp0) for i, j, n in diff.get_matching_blocks(): dat_sent.extend(dat_sent0[i: i+n]) t_sent.extend(t_sent0[i: i+n]) t_recv.extend(t_recv0[i: i+n]) dat_nsp.extend(dat_nsp0[j: j+n]) t_nsp.extend(t_nsp0[j: j+n]) i_nsp.extend(i_nsp0[j: j+n]) return dat_sent, dat_nsp, t_sent, t_recv, t_nsp, i_nsp
def calculate_scores(annotated_filepath, original_filepath):
    """
    Compute precision/recall/F1 of the article extractor against an
    annotated reference document.

    Reads the reference terms from the annotated file, runs
    MSSArticleExtractor over the cleaned original HTML (writing
    "cleaned_text.html" and "text.html" as debug artifacts), then scores
    the overlap of word tokens via difflib matching blocks.

    :param annotated_filepath: path to the annotated reference document
    :param original_filepath: path to the original HTML document
    :return: (precision, recall, f1) tuple of floats
    """
    text = extract_annotated_text(annotated_filepath)
    expected_terms = re.findall(r"\w+", text.lower(), flags=re.UNICODE)
    article_extractor = MSSArticleExtractor()
    with open(original_filepath, "r") as f:
        contents = f.read()
    contents = html.document_fromstring(contents)
    contents = clean_html(contents)
    # debug artifact: the cleaned input handed to the extractor
    with codecs.open("cleaned_text.html", "w", encoding="utf-8") as f:
        f.write(tostring(contents))
    article = article_extractor.extract_article(tostring(contents))
    # debug artifact: the extracted article text
    with codecs.open("text.html", "w", encoding="utf-8") as f:
        f.write(article)
    terms = re.findall(r"\w+", article.lower(), flags=re.UNICODE)
    matcher = SequenceMatcher(None, expected_terms, terms)
    matches = matcher.get_matching_blocks()
    # number of retrieved-and-relevant tokens (matched term count)
    sretsrel = sum(match.size for match in matches)
    srel = len(expected_terms)
    precision = float(sretsrel) / float(len(terms)) if terms else 0.0
    recall = float(sretsrel) / float(srel) if srel > 0 else 0.0
    # BUG FIX: was a bare `except:` which also swallowed KeyboardInterrupt
    # and SystemExit; only ZeroDivisionError (precision + recall == 0) can
    # actually occur here.
    try:
        f1 = 2 * ((precision * recall) / (precision + recall))
    except ZeroDivisionError:
        f1 = 0.0
    return (precision, recall, f1)
def partial_with_place(s1, s2):
    """Find the most similar substring of the longer string.

    Returns a (score, start, end) triple where score is an int in 0..100 and
    start/end locate the best-matching window of the longer string. A
    near-perfect window short-circuits with score 100. Returns 0 when either
    string is empty; raises TypeError when either argument is None.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    s1, s2 = utils.make_type_consistent(s1, s2)
    if len(s1) == 0 or len(s2) == 0:
        return 0

    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)

    blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()

    # each block represents a sequence of matching characters in a string
    # of the form (idx_1, idx_2, len)
    # the best partial match will block align with at least one of those blocks
    # e.g. shorter = "abcd", longer = XXXbcdeEEE
    #      block = (1,3,3)
    #      best score === ratio("abcd", "Xbcd")
    scores = []
    triples = []
    for a_idx, b_idx, _size in blocks:
        start = max(b_idx - a_idx, 0)
        end = start + len(shorter)
        window = longer[start:end]
        ratio = SequenceMatcher(None, shorter, window).ratio()
        if ratio > .995:
            return (100, start, end)
        scores.append(ratio)
        triples.append((int(ratio * 100), start, end))

    best = max(scores)
    return triples[scores.index(best)]
def compare(self, ta_lines, tb_lines, linebreak = ''):
    """Build side-by-side HTML diff markup for two lists of lines.

    Returns a 3-tuple: (HTML for side A joined by `linebreak`, HTML for
    side B, and a list of (a-anchor-key, b-anchor-key) pairs for the diff
    blocks).

    NOTE(review): `Counter` here appears to be a project-local cursor class
    (it has progress/slice_diff/slice_match/next/current), NOT
    collections.Counter — confirm against the file's imports.
    """
    ta_result = []
    tb_result = []
    diff_lines = []
    # markup fragments wrapped around each emitted line
    line_diff_start = '<span class="line-num line-diff">'
    line_diff_end = '</span>'
    line_same_start = '<span class="line-num line-same">'
    line_same_end = '</span>'
    def _do_lines(diff, same, block_prefix, idx):
        # Render one block: differing lines inside a keyed <div>, followed
        # by the matching lines.
        result = []
        if diff:
            key = block_prefix + 'dl' + str(idx)
            result.append('<div class="diff-block" key="' + key + '" ref="' + key + '">')
            result.extend([self._enclose(line_diff_start, s, line_diff_end) for s in diff])
            result.append('</div>')
        result.extend([self._enclose(line_same_start, s, line_same_end) for s in same])
        return result
    s = SequenceMatcher(None, ta_lines, tb_lines)
    # per-side cursors tracking how far each input has been consumed
    cnt_a = Counter()
    cnt_b = Counter()
    for block in s.get_matching_blocks():
        (a_idx, b_idx, nmatch) = block
        # NOTE(review): debug print left in place — consider removing/logging
        print("a[%d] and b[%d] match for %d elements" % block)
        cnt_a.progress(a_idx, nmatch)
        cnt_b.progress(b_idx, nmatch)
        # lines skipped since the previous match (the differing part) and
        # the lines of the current matching run
        diff_a = cnt_a.slice_diff(ta_lines)
        same_a = cnt_a.slice_match(ta_lines)
        diff_b = cnt_b.slice_diff(tb_lines)
        same_b = cnt_b.slice_match(tb_lines)
        if diff_a or diff_b:
            diff_a, diff_b = self._compare_lines(diff_a, diff_b)
        ta_result.extend(_do_lines(diff_a, same_a, 'a', cnt_a.current))
        tb_result.extend(_do_lines(diff_b, same_b, 'b', cnt_b.current))
        # record the anchor pair for this block so the two sides can be linked
        diff_lines.append(('adl' + str(cnt_a.current), 'bdl' + str(cnt_b.current)))
        cnt_a.next()
        cnt_b.next()
    return (linebreak.join(ta_result), linebreak.join(tb_result), diff_lines)
def get_initial_matches(self):
    """
    Find the matching n-gram sequences shared by the two texts.

    Runs difflib over self.textAgrams / self.textBgrams and keeps only the
    matches whose size exceeds the user-configured threshold.
    """
    matcher = SequenceMatcher(None, self.textAgrams, self.textBgrams)
    # Only keep the matching sequences that exceed the threshold given
    # by the user.
    above_threshold = [
        block for block in matcher.get_matching_blocks()
        if block.size > self.threshold
    ]
    if len(above_threshold) > 0:
        print('%s total matches found.' % len(above_threshold), flush=True)
    return above_threshold
def diff(self, a, b):
    """Return matching blocks between the cached files `a` and `b`, memoised.

    Results are cached under the concatenation of the lexicographically
    larger name followed by the smaller one; a query in the opposite order
    reuses the cache entry with the two offsets of each block swapped.
    """
    assert a != b
    swapped = a < b
    if swapped:
        a, b = b, a
    cache_key = a + b
    if cache_key in self.diffcache:
        blocks = self.diffcache[cache_key]
    else:
        matcher = SequenceMatcher()
        matcher.set_seqs(self.filecache[a], self.filecache[b])
        blocks = matcher.get_matching_blocks()
        self.diffcache[cache_key] = blocks
    if swapped:
        return [(j, i, n) for (i, j, n) in blocks]
    return blocks
def supa_changed(a, b):
    """Report whether `a` and `b` differ beyond a known-benign edit.

    Only the FIRST gap between two matching blocks is ever inspected: if
    that gap is one of the two whitelisted insertions ('' -> '-DO' or
    '???' -> 'OBJ-DO') the strings count as unchanged; any other gap counts
    as a real change. Strings whose blocks leave no gap to inspect return
    False.
    """
    matcher = SequenceMatcher(a=a, b=b)
    prev_a_end = -1
    prev_b_end = -1
    for a_start, b_start, size in matcher.get_matching_blocks():
        if size == 0:
            continue  # skip empty blocks (incl. the trailing sentinel)
        if prev_a_end != -1:
            gap_a = a[prev_a_end:a_start]
            gap_b = b[prev_b_end:b_start]
            benign = (gap_a == '' and gap_b == '-DO') or \
                     (gap_a == '???' and gap_b == 'OBJ-DO')
            if not benign:
                return True
            # deliberately stop after the first gap — later gaps are ignored
            break
        prev_a_end = a_start + size
        prev_b_end = b_start + size
    return False
def findPatterns(self, leftSide, rightSide, numberOfIterations, riskFactor):
    '''
    Old outdated method, possibly useful in the future.

    Collects the non-empty, non-space substrings of leftSide that also
    occur in rightSide (via difflib matching blocks) and returns them as a
    list. NOTE(review): Python 2 syntax (print statements); the
    numberOfIterations and riskFactor parameters are unused, and the
    if(True): guard is a leftover.
    '''
    patterns = []
    sequenceMatcher = SequenceMatcher()
    sequenceMatcher.set_seqs(leftSide, rightSide)
    # overall similarity, printed for debugging only
    ratio = sequenceMatcher.ratio()
    print ratio
    if(True):
        matchingBlocks = sequenceMatcher.get_matching_blocks()
        print matchingBlocks
        for block in matchingBlocks:
            # block is (left_index, right_index, length); keep the left-side
            # substring unless it is empty or a single space
            if(leftSide[block[0]:block[0] + block[2]] != '' and leftSide[block[0]:block[0] + block[2]] != ' ' ):
                print "Found a pattern!"
                print "Added:",leftSide[block[0]:block[0] + block[2]], "To the pattern list!"
                patterns.append(leftSide[block[0]:block[0] + block[2]])
    return patterns
def test(self, result):
    """Compare `result` against self.expected and build a diff report.

    Returns BitDiffResult(True, "success") on an exact match; otherwise a
    BitDiffResult(False, msg) where msg is a two-column ASCII table of
    expected vs. actual lines, with '!' marking the mismatched rows.
    """
    # normalise each bit to its repr for stable, printable comparison
    result = [repr(unicode_compat(bit)) for bit in result]
    if self.expected == result:
        return BitDiffResult(True, "success")
    else:  # pragma: no cover
        # column width: longest entry on either side, at least len('Expected')
        longest = max(
            [len(x) for x in self.expected] +
            [len(x) for x in result] +
            [len('Expected')])
        sm = SequenceMatcher()
        sm.set_seqs(self.expected, result)
        matches = sm.get_matching_blocks()
        lasta = 0
        lastb = 0
        # data rows: (matched?, expected_entry, result_entry)
        data = []
        for match in [_backwards_compat_match(match) for match in matches]:
            # entries between the previous match and this one are mismatches;
            # pad the shorter side with '' so rows stay aligned
            unmatcheda = self.expected[lasta:match.a]
            unmatchedb = result[lastb:match.b]
            unmatchedlen = max([len(unmatcheda), len(unmatchedb)])
            unmatcheda += ['' for x in range(unmatchedlen)]
            unmatchedb += ['' for x in range(unmatchedlen)]
            for i in range(unmatchedlen):
                data.append((False, unmatcheda[i], unmatchedb[i]))
            # then the matching run itself
            for i in range(match.size):
                data.append((
                    True,
                    self.expected[match.a + i],
                    result[match.b + i]
                ))
            lasta = match.a + match.size
            lastb = match.b + match.size
        # render the table header and separator, padded to the column width
        padlen = (longest - len('Expected'))
        padding = ' ' * padlen
        line1 = '-' * padlen
        line2 = '-' * (longest - len('Result'))
        msg = '\nExpected%s | | Result' % padding
        msg += '\n--------%s-|---|-------%s' % (line1, line2)
        for success, a, b in data:
            pad = ' ' * (longest - len(a))
            if success:
                msg += '\n%s%s | | %s' % (a, pad, b)
            else:
                # '!' flags a mismatched row
                msg += '\n%s%s | ! | %s' % (a, pad, b)
        return BitDiffResult(False, msg)
def partial_match(s1, s2):
    """Best partial-substring match between s1 and s2.

    Returns {"ratio": <0..100>, "match": <substring of the longer string>}
    for the best-scoring alignment; a near-perfect window short-circuits
    with ratio 100. Returns 0 when either string is empty; raises TypeError
    when either argument is None.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")
    s1, s2 = utils.make_type_consistent(s1, s2)
    if len(s1) == 0 or len(s2) == 0:
        return 0

    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)

    blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()

    # each block represents a sequence of matching characters in a string
    # of the form (idx_1, idx_2, len)
    # the best partial match will block align with at least one of those blocks
    # e.g. shorter = "abcd", longer = XXXbcdeEEE
    #      block = (1,3,3)
    #      best score === ratio("abcd", "Xbcd")
    scores = []
    candidates = {}
    for a_idx, b_idx, _size in blocks:
        start = max(b_idx - a_idx, 0)
        window = longer[start:start + len(shorter)]
        ratio = SequenceMatcher(None, shorter, window).ratio()
        if ratio > .995:
            return {"ratio": 100, "match": window}
        key = int(ratio * 1000)
        # dedupe by truncated ratio; the first window at a score level wins
        if key not in candidates:
            scores.append(ratio)
            candidates[key] = {"ratio": ratio * 100, "match": window}
    return candidates[int(max(scores) * 1000)]
def ordered_files(syncdir, toread):
    """Find the largest set of files in the correct order.

    :param syncdir: dict mapping calibre id -> filename in the sync directory
    :param toread: dict whose keys (in order) are the desired reading order
    :return: the longest run of ids common to both orderings
    """
    # Create a dict of files in syncdir with a valid index
    # In case of index clashes, the first candidate wins
    valid_files = {}
    for calibreid, filename in syncdir.items():
        index = file_index(filename)
        if index and index not in valid_files:
            valid_files[index] = calibreid

    # Create lists to compare to find the largest common sequence.
    # BUG FIX: `toread.keys()[:n]` breaks on Python 3, where dict views
    # cannot be sliced; list(toread) works identically on Python 2 and 3.
    synclist = [valid_files[index] for index in sorted(valid_files.keys())]
    toreadlist = list(toread)[:ARGS.count]
    logging.debug('Comparing %r and %r', synclist, toreadlist)

    # Use difflib.SequenceMatcher to do the heavy lifting
    matcher = SequenceMatcher(None, toreadlist, synclist)
    ordered_ids = []
    for i, j, count in matcher.get_matching_blocks():
        ordered_ids.extend(toreadlist[i:i + count])

    logging.debug('Longest sorted subset: %r', ([
        syncdir[title] for title in ordered_ids],))
    return ordered_ids