def decompose_multiple_alterations(reference_path, alternative_path, kmer_length):
    """Split the difference between two k-mer paths into atomic alterations.

    Both paths are converted back to sequences with ``ALT.kmerpathToSeq`` and
    compared with ``Levenshtein.editops``.  The resulting edit operations are
    cut into groups; each group is applied on its own to the reference
    sequence and the result is k-merized again with ``ALT.kmerize``.

    Grouping rule: a ``replace`` always forms a group by itself.  Any other
    operation is extended with the following operations as long as they have
    the same type and share either the source position or the destination
    position of the group's first operation.

    Args:
        reference_path: k-mer path of the reference, in the format expected
            by ``ALT.kmerpathToSeq``.
        alternative_path: k-mer path of the alternative, same format.
        kmer_length: k-mer size handed to the ``ALT`` helpers.

    Yields:
        ``(atomic_sequence, atomic_path)`` for every group of edit
        operations; nothing when the two sequences are identical.
    """
    reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
    multi_alternative_sequence = ALT.kmerpathToSeq(alternative_path, kmer_length)
    edit_ops = Levenshtein.editops(reference_sequence, multi_alternative_sequence)
    if len(edit_ops) > 2:
        logger.info("Multiple alt when considering ref %s vs alt %s",
                    reference_sequence, multi_alternative_sequence)
        logger.info("Globally apply %s", edit_ops)

    start = 0
    while start < len(edit_ops):
        first_op = edit_ops[start]
        end = start + 1
        # editops yields (name, source_pos, destination_pos) tuples, so the
        # operation name is element 0 of the tuple, not the tuple itself.
        if first_op[0] != 'replace':
            while (end < len(edit_ops)
                   and edit_ops[end][0] == first_op[0]
                   and (first_op[1] == edit_ops[end][1]
                        or first_op[2] == edit_ops[end][2])):
                end += 1
        edit_op_to_apply = edit_ops[start:end]
        start = end

        logger.info("Will apply %s", edit_op_to_apply)
        # apply_edit accepts a subset of the full edit script, which is what
        # lets each group be replayed in isolation on the reference.
        atomic_sequence = Levenshtein.apply_edit(
            edit_op_to_apply, reference_sequence, multi_alternative_sequence)
        atomic_path = ALT.kmerize(atomic_sequence, kmer_length)

        # record each atomic alteration
        logger.info("Adding atomic alteration for ref %s vs alt %s",
                    reference_sequence, atomic_sequence)
        yield atomic_sequence, atomic_path
def decompose_multiple_alterations(reference_path, alternative_path, kmer_length):
    """Split the difference between two k-mer paths into atomic alterations.

    Both paths are converted back to sequences with ``ALT.kmerpathToSeq`` and
    compared with ``Levenshtein.editops``.  The resulting edit operations are
    cut into groups; each group is applied on its own to the reference
    sequence and the result is k-merized again with ``ALT.kmerize``.

    Grouping rule: a ``replace`` always forms a group by itself.  Any other
    operation is extended with the following operations as long as they have
    the same type and share either the source position or the destination
    position of the group's first operation.

    Args:
        reference_path: k-mer path of the reference, in the format expected
            by ``ALT.kmerpathToSeq``.
        alternative_path: k-mer path of the alternative, same format.
        kmer_length: k-mer size handed to the ``ALT`` helpers.

    Yields:
        ``(atomic_sequence, atomic_path)`` for every group of edit
        operations; nothing when the two sequences are identical.
    """
    reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
    multi_alternative_sequence = ALT.kmerpathToSeq(alternative_path, kmer_length)
    edit_ops = Levenshtein.editops(reference_sequence, multi_alternative_sequence)
    if len(edit_ops) > 2:
        logger.info("Multiple alt when considering ref %s vs alt %s",
                    reference_sequence, multi_alternative_sequence)
        logger.info("Globally apply %s", edit_ops)

    start = 0
    while start < len(edit_ops):
        first_op = edit_ops[start]
        end = start + 1
        # editops yields (name, source_pos, destination_pos) tuples, so the
        # operation name is element 0 of the tuple, not the tuple itself.
        if first_op[0] != 'replace':
            while (end < len(edit_ops)
                   and edit_ops[end][0] == first_op[0]
                   and (first_op[1] == edit_ops[end][1]
                        or first_op[2] == edit_ops[end][2])):
                end += 1
        edit_op_to_apply = edit_ops[start:end]
        start = end

        logger.info("Will apply %s", edit_op_to_apply)
        # apply_edit accepts a subset of the full edit script, which is what
        # lets each group be replayed in isolation on the reference.
        atomic_sequence = Levenshtein.apply_edit(
            edit_op_to_apply, reference_sequence, multi_alternative_sequence)
        atomic_path = ALT.kmerize(atomic_sequence, kmer_length)

        # record each atomic alteration
        logger.info("Adding atomic alteration for ref %s vs alt %s",
                    reference_sequence, atomic_sequence)
        yield atomic_sequence, atomic_path
def make_improved_old(old, new):
    """Step 3: move the old hunk toward the new one using typo edits only.

    The full edit script from ``old`` to ``new`` is computed with
    ``lev.editops``, reduced by ``only_typo_editops`` to the operations that
    count as typo fixes, and that reduced script is applied to ``old``.

    Args:
        old: old version of the hunk.
        new: new version of the hunk.

    Returns:
        The result of ``lev.apply_edit`` for the filtered operations.
    """
    typo_ops = only_typo_editops(lev.editops(old, new))
    return lev.apply_edit(typo_ops, old, new)
def correct(match):
    """Return the text that should replace a matched word.

    The matched word is lower-cased and looked up in ``correction_list``.
    Words without an entry are returned unchanged.  Words with an entry are
    replaced by the correction, keeping the original casing where possible.

    Args:
        match: object whose ``group(0)`` is the matched word (e.g. a regex
            match handed to a substitution callback).

    Returns:
        The corrected word, or the original word when no correction exists.
    """
    word = match.group(0)
    normed_word = word.lower()

    if normed_word not in correction_list:
        # Nothing to fix: known-good and unknown words are both left as-is.
        return word

    new_word = correction_list[normed_word]
    if word.isupper():
        return new_word.upper()

    # Compute the edits on the lower-cased form but replay them on the
    # original word, so characters the correction does not touch keep the
    # case the author typed.
    edits = Levenshtein.editops(normed_word, new_word)
    return Levenshtein.apply_edit(edits, word, new_word)
def apply_edits(e, s):
    """Replay a recorded list of edits on ``s``.

    Each item of ``e`` is indexed as ``(opcode, a, b)``, where ``opcode`` is
    itself indexed as a 5-element opcode (tag at 0, destination bounds at 3
    and 4).  A placeholder destination string is assembled from the items and
    handed, together with the opcodes, to ``Levenshtein.apply_edit``.

    Args:
        e: sequence of ``(opcode, a, b)`` items.
        s: source string; converted with ``unicode`` before the edit.

    Returns:
        The edited string produced by ``Levenshtein.apply_edit``.
    """
    import Levenshtein

    opcodes = [item[0] for item in e]

    pieces = []
    for item in e:
        opcode = item[0]
        if opcode[0] == b'equal':
            # Unchanged span: its content is irrelevant, only its length in
            # the destination matters, so pad with dots.
            pieces.append((opcode[4] - opcode[3]) * '.')
        else:
            pieces.append(item[1] * item[2])
    destination = ''.join(pieces)

    return Levenshtein.apply_edit(opcodes, unicode(s), unicode(destination))
def test_op_edits_for_N_193_1(self):
    """Apply every edit operation between ref and alt on its own and print the result.

    Exploratory test: it computes the edit operations with
    ``Levenshtein.editops``, applies each one individually to ``ref`` and
    prints the alignment returned by ``align``.  It makes no assertions; a
    failing operation is reported by printing "Fail".
    """
    ref = "ATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCC"
    alt = "ATGCCAGAGGCTGCTCCCGCGTGGCCCTGCACCAGCAGCTCC"

    editops = Levenshtein.editops(ref, alt)
    # Single-argument print(...) calls produce the same text on Python 2
    # and Python 3.
    print(editops)
    for e in editops:
        print("applying %s" % (e,))
        try:
            transformed = Levenshtein.apply_edit([e], ref, alt)
            print(align(ref, transformed))
        except Exception:
            # Deliberately best-effort: keep going with the remaining ops.
            print("Fail")
# NOTE(review): the next two statements presumably belong to a loop whose
# header lies outside this excerpt -- confirm their indentation against the
# full file.
f_predict.write("\n")
predict_list.append(candidates)

f_misspell.close()
f_correct.close()
f_dictionary.close()
f_predict.close()

# A prediction is correct when the reference word appears among the
# candidates predicted at the same position.  c_list is stripped in place.
correct_count = 0
for i, raw_word in enumerate(c_list):
    c_word = raw_word.strip()
    c_list[i] = c_word
    if c_word in predict_list[i]:
        correct_count += 1

predict_count = sum(len(p) for p in predict_list)

# Report 0.0 instead of raising ZeroDivisionError when nothing was predicted
# or there are no reference words.
precision = round(correct_count / predict_count, 4) if predict_count else 0.0
recall = round(correct_count / len(c_list), 4) if c_list else 0.0

print("\n====== Levenshtein Result ======")
print("Precision\t" + str(precision))
print("Recall\t" + str(recall))
def micado_multi(sample_key, n_perm=25):
    """Compare each alteration of a sample against random-read permutations.

    Builds the reference graph and the patient graph for ``sample_key``,
    collects the patient's alterations, then for ``n_perm`` permutations
    builds a ``RandomReadsGraph`` and calls ``check_path`` on it for

    * every single edit operation between the reference and the alternative
      sequence of an alteration, applied on its own, and
    * the complete alternative sequence when it needs more than one edit
      operation.

    Finally prints, per alteration and edit-operation set, the patient ratio,
    the collected random ratios, how many random ratios exceed the patient
    ratio, and the z-score of the patient ratio.

    Args:
        sample_key: sample identifier used to locate
            ``data/tp53_analysis/reads/<sample_key>.fastq``.
        n_perm: number of random-read graphs to build.

    Returns:
        None; results are printed.
    """
    kmer_length = 18
    max_len = 10

    # build reference graph
    g_reference = reference_graph.ReferenceGraph(
        kmer_length,
        fasta_file='data/reference/NM_000546.5.fasta',
        snp_file='data/reference/snp_TP53.tab')

    # build patient graph
    g_patient = patient_graph.PatientGraph(
        ['data/tp53_analysis/reads/%s.fastq' % sample_key], kmer_length)
    g_patient.graph_cleaned_init(3.0)
    # copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
    g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)
    # search for alternative paths in dbg_refrm (.alteration_list creation)
    g_patient.alteration_list_init(g_reference.dbg, kmer_length, 3.0, max_len)

    # TODO build real set of possible k-mers
    # (collected here but not yet passed on: restrict_to is None below)
    all_possible_kmers = set()
    for an_alt in g_patient.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)

    # build a random read graph
    import seq_lib_TP53 as seq_lib

    random_ratio_dict = collections.defaultdict(list)
    lonely_ratio_dict = {}
    ref_seq_dict = {}
    alt_seq_dict = {}

    # Separate loop name so the n_perm argument is not overwritten.
    for perm_i in range(n_perm):
        print(perm_i)
        rg = randomreadsgraph.RandomReadsGraph(
            {"N": 0, "C": 0},
            k=kmer_length,
            seq_lib_module=seq_lib,
            restrict_to=None)

        for alt_i, putative_alt in enumerate(g_patient.alteration_list):
            # determine number of edit ops
            # There's at least one (since it's an alternative path)
            ref_seq = putative_alt.reference_sequence
            ref_seq_dict[alt_i] = ref_seq
            patient_seq = putative_alt.alternative_sequence
            edit_ops = Levenshtein.editops(ref_seq, patient_seq)
            lonely_ratio_dict[alt_i] = putative_alt.ratio_read_count

            reference_kmers = kmerize(ref_seq, kmer_length)

            for e in edit_ops:
                # Apply one edit operation on its own to the reference.
                transformed = Levenshtein.apply_edit([e], ref_seq, patient_seq)
                ratio_random = rg.check_path(
                    reference_kmers,
                    kmerize(transformed, kmer_length),
                    min_cov=putative_alt.min_coverage)
                random_ratio_dict[(alt_i, (e, ))].append(ratio_random[0])
                alt_seq_dict[(alt_i, (e, ))] = transformed

            # perform for all edit_ops
            # With a single edit op the key (alt_i, (e,)) is the same as
            # (alt_i, tuple(edit_ops)), so the full pass would append a
            # second ratio for this permutation under that key.
            if len(edit_ops) != 1:
                ratio_random = rg.check_path(
                    reference_kmers,
                    kmerize(patient_seq, kmer_length),
                    min_cov=putative_alt.min_coverage)
                random_ratio_dict[(alt_i, tuple(edit_ops))].append(ratio_random[0])
                alt_seq_dict[(alt_i, tuple(edit_ops))] = patient_seq

    for (alt_i, edit_ops_i), ratios in sorted(random_ratio_dict.items(),
                                              key=lambda x: x[0][0]):
        this_patient_ratio = lonely_ratio_dict[alt_i]
        alt_seq = alt_seq_dict[(alt_i, edit_ops_i)]
        # Single-argument print(...) and an explicit list keep the output
        # identical on Python 2 while also working on Python 3.
        print("Alt %d with real ratio %f, Edit ops %s, random_ratios :%s" % (
            alt_i, this_patient_ratio, edit_ops_i, [str(r) for r in ratios]))
        print("Ref seq %s" % (ref_seq_dict[alt_i]))
        print("Alt seq %s" % (alt_seq))
        print("N higher: %d" % (len(
            [x for x in ratios if x > this_patient_ratio])))
        standard_deviation = np.std(ratios)
        zscore = float((this_patient_ratio - np.mean(ratios)) / standard_deviation)
        print("Z-Score %s" % zscore)
# print(day_two(split_s))
import Levenshtein
from collections import defaultdict
import pandas as pd

test_case2 = ["abcde", "fghij", "klmno", "pqrst", "fguij", "axcye", "wvxyz"]


def chars_apart(str1, str2):
    """Return how many edit operations Levenshtein.editops reports for the pair."""
    return len(Levenshtein.editops(str1, str2))


def _pairs_one_apart(str_list):
    """Return every ordered pair (s, t) from str_list that is exactly one edit apart."""
    return [(s, t) for s in str_list for t in str_list
            if chars_apart(s, t) == 1]


def part_two(str_list: list):
    """Return a pandas Index of '(s,t)' labels for all pairs one edit apart.

    Every ordered pair of strings from ``str_list`` is compared, so a
    matching pair shows up in both orders.  Labels keep the order in which
    the pairs are first encountered and are not repeated.
    """
    labels = [f'({s},{t})' for s, t in _pairs_one_apart(str_list)]
    # dict.fromkeys drops repeated labels while preserving first-seen order.
    return pd.Index(list(dict.fromkeys(labels)))


test = part_two(test_case2)
soln = part_two(split_s)
print(soln)

# The answer is what the first matching pair has in common: drop the single
# character of the first string that the one edit operation touches (an
# insertion touches none, so the first string is already the common part).
first_id, second_id = _pairs_one_apart(split_s)[0]
op_name, src_pos, _ = Levenshtein.editops(first_id, second_id)[0]
if op_name == 'insert':
    target = first_id
else:
    target = first_id[:src_pos] + first_id[src_pos + 1:]
print(target)
# for test_case2 the matching pair is 'fghij'/'fguij', giving 'fgij'