Example #1
0
def decompose_multiple_alterations(reference_path, alternative_path, kmer_length):
	"""Yield one (sequence, k-mer path) pair per atomic alteration between two paths.

	Both paths are converted back to sequences with ``ALT.kmerpathToSeq`` and
	compared with ``Levenshtein.editops``. The edit operations are then applied
	to the reference sequence group by group:

	* a ``replace`` operation is applied on its own;
	* consecutive operations of the same kind that share either their source
	  position or their destination position are applied together.

	Args:
		reference_path: k-mer path of the reference, as accepted by
			``ALT.kmerpathToSeq``.
		alternative_path: k-mer path of the alternative carrying the alteration(s).
		kmer_length: k-mer size, used both to decode the paths and to
			re-kmerize each atomic sequence.

	Yields:
		``(atomic_sequence, atomic_path)`` where ``atomic_sequence`` is the
		reference sequence with one group of edit operations applied and
		``atomic_path`` is ``ALT.kmerize(atomic_sequence, kmer_length)``.
	"""
	reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
	multi_alternative_sequence = ALT.kmerpathToSeq(alternative_path, kmer_length)

	edit_ops = Levenshtein.editops(reference_sequence, multi_alternative_sequence)
	if len(edit_ops) > 2:
		logger.info("Multiple alt when considering ref %s vs alt %s", reference_sequence, multi_alternative_sequence)
		logger.info("Globally apply %s", edit_ops)
	start = 0
	while start < len(edit_ops):
		start_e = edit_ops[start]
		# Each edit op is indexed as (operation, source_pos, dest_pos) below, so
		# the operation name must be read from index 0. Comparing the whole
		# entry with 'replace' could never be true and left this branch dead.
		if start_e[0] == 'replace':
			edit_op_to_apply = [start_e]
			start += 1
		else:
			# Extend the group while the following ops are of the same kind and
			# stay anchored on the same source or destination position.
			end = start + 1
			while (end < len(edit_ops)
					and edit_ops[end][0] == start_e[0]
					and (start_e[1] == edit_ops[end][1] or start_e[2] == edit_ops[end][2])):
				end += 1
			edit_op_to_apply = edit_ops[start:end]
			start = end
			logger.info("Will apply %s", edit_op_to_apply)
		atomic_sequence = Levenshtein.apply_edit(edit_op_to_apply, reference_sequence, multi_alternative_sequence)
		atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
		# record each atomic alteration
		logger.info("Adding atomic alteration for ref %s vs alt %s", reference_sequence, atomic_sequence)
		yield atomic_sequence, atomic_path
Example #2
0
def decompose_multiple_alterations(reference_path, alternative_path,
                                   kmer_length):
    """Split the difference between two k-mer paths into atomic alterations.

    The reference and alternative paths are decoded to sequences with
    ``ALT.kmerpathToSeq`` and their edit operations are obtained from
    ``Levenshtein.editops``. Operations are applied to the reference in
    groups: a ``replace`` is applied alone, while consecutive operations of
    the same kind sharing a source or destination position are applied as
    one group.

    Args:
        reference_path: k-mer path of the reference, as accepted by
            ``ALT.kmerpathToSeq``.
        alternative_path: k-mer path of the alternative.
        kmer_length: k-mer size used for decoding and for ``ALT.kmerize``.

    Yields:
        ``(atomic_sequence, atomic_path)`` tuples: the reference sequence
        with a single group of edit operations applied, and that sequence
        kmerized with ``kmer_length``.
    """
    reference_sequence = ALT.kmerpathToSeq(reference_path, kmer_length)
    multi_alternative_sequence = ALT.kmerpathToSeq(alternative_path,
                                                   kmer_length)

    edit_ops = Levenshtein.editops(reference_sequence,
                                   multi_alternative_sequence)
    if len(edit_ops) > 2:
        logger.info("Multiple alt when considering ref %s vs alt %s",
                    reference_sequence, multi_alternative_sequence)
        logger.info("Globally apply %s", edit_ops)
    start = 0
    while start < len(edit_ops):
        start_e = edit_ops[start]
        # Entries are indexed as (operation, source_pos, dest_pos) further
        # down, so the operation tag lives at index 0. The previous test
        # compared the whole entry with 'replace' and was always false.
        if start_e[0] == 'replace':
            edit_op_to_apply = [start_e]
            start += 1
        else:
            # Grow the group while the next op has the same kind and shares
            # the source or the destination position with the first one.
            end = start + 1
            while (end < len(edit_ops) and edit_ops[end][0] == start_e[0]
                   and (start_e[1] == edit_ops[end][1]
                        or start_e[2] == edit_ops[end][2])):
                end += 1
            edit_op_to_apply = edit_ops[start:end]
            start = end
            logger.info("Will apply %s", edit_op_to_apply)
        atomic_sequence = Levenshtein.apply_edit(
            edit_op_to_apply, reference_sequence,
            multi_alternative_sequence)
        atomic_path = ALT.kmerize(atomic_sequence, kmer_length)
        # record each atomic alteration
        logger.info("Adding atomic alteration for ref %s vs alt %s",
                    reference_sequence, atomic_sequence)
        yield atomic_sequence, atomic_path
def make_improved_old(old, new):
    """Return ``old`` with only the "typo" edits towards ``new`` applied.

    The edit operations turning ``old`` into ``new`` are computed with
    ``lev.editops``, narrowed down by ``only_typo_editops``, and the
    remaining operations are applied to ``old``.
    """
    typo_ops = only_typo_editops(lev.editops(old, new))
    return lev.apply_edit(typo_ops, old, new)
Example #4
0
def correct(match):
    """Return the corrected text for one match, keeping the original casing.

    The matched word is lower-cased and looked up in ``correction_list``.
    Words without a correction are returned unchanged. An all-uppercase
    word gets the upper-cased correction. Otherwise the edit operations
    from the lower-cased word to its correction are applied to the word as
    originally written, so unchanged characters keep their case.

    Args:
        match: object whose ``group(0)`` is the matched word (presumably an
            ``re`` match passed by a substitution callback; confirm against
            the caller).

    Returns:
        The replacement string for the matched word.
    """
    word = match.group(0)
    normed_word = word.lower()
    # Known-good and unknown words were handled by two branches that both
    # returned the word untouched; a single guard covers both cases.
    if normed_word not in correction_list:
        return word

    new_word = correction_list[normed_word]
    if word.isupper():
        return new_word.upper()
    edits = Levenshtein.editops(normed_word, new_word)
    return Levenshtein.apply_edit(edits, word, new_word)
Example #5
0
def apply_edits(e, s):
    """Apply the edits described by ``e`` to ``s`` with ``Levenshtein.apply_edit``.

    Each item of ``e`` is read as ``(opcode, text, count)``: ``opcode[0]`` is
    the operation tag and ``opcode[3]`` / ``opcode[4]`` delimit a span
    (presumably the destination range of a Levenshtein opcode; confirm
    against the producer of ``e``). A destination string is rebuilt from
    the items: ``'.'`` placeholders fill ``equal`` spans, and every other
    item contributes ``text * count``.

    Returns the result of applying all opcodes to ``unicode(s)`` with the
    rebuilt string as destination.
    """
    import Levenshtein

    opcodes = [item[0] for item in e]

    pieces = []
    for item in e:
        opcode = item[0]
        if opcode[0] == b'equal':
            # Unchanged span: only its length matters, so pad with dots.
            pieces.append((opcode[4] - opcode[3]) * '.')
        else:
            pieces.append(item[1] * item[2])
    destination = ''.join(pieces)

    return Levenshtein.apply_edit(opcodes, unicode(s), unicode(destination))
Example #6
0
    def test_op_edits_for_N_193_1(self):
        ref = "ATGCCAGAGGCTGCTCCCCCCGTGGCCCCTGCACCAGCAGCTCC"
        alt = "ATGCCAGAGGCTGCTCCCGCGTGGCCCTGCACCAGCAGCTCC"
        # matcher = difflib.SequenceMatcher(a=ref, b=alt)
        # print matcher.get_opcodes()
        # op = [x[0] for x in matcher.get_opcodes() if x[0] != 'equal']
        # print op
        # alignments = pairwise2.align.globalms(ref, alt, 2, -3, -5, -2)
        # print alignments
        #
        # matcher2= difflib.SequenceMatcher(a="CCC",b="GC")
        # print matcher2.get_opcodes()

        editops = Levenshtein.editops(ref, alt)
        print editops
        # print opcodes
        # print Levenshtein.apply_edit(opcodes,ref,alt)
        for e in editops:
            print "applying", e
            try:
                transformed = Levenshtein.apply_edit([e], ref, alt)
                print align(ref, transformed)
            except Exception:
                print "Fail"
Example #7
0
    f_predict.write("\n")

    predict_list.append(candidates)

# All candidates have been written; release the files.
f_misspell.close()
f_correct.close()
f_dictionary.close()
f_predict.close()

# Strip each expected word in place and count how many of them appear among
# the candidates predicted at the same position.
correct_count = 0
for i, raw_word in enumerate(c_list):
    c_word = raw_word.strip()
    c_list[i] = c_word

    if c_word in predict_list[i]:
        correct_count += 1

# Total number of candidates proposed over all words.
predict_count = sum(len(p) for p in predict_list)

# Report 0.0 instead of raising ZeroDivisionError when there is nothing to
# divide by (no candidates at all, or no expected words).
precision = round(correct_count / predict_count, 4) if predict_count else 0.0
recall = round(correct_count / len(c_list), 4) if len(c_list) else 0.0

print("\n====== Levenshtein Result ======")
print("Precision\t" + str(precision))
print("Recall\t" + str(recall))

# The stray trailing call to Levenshtein.apply_edit with no arguments was
# removed: it needs three arguments and raised TypeError after the report.
Example #8
0
def micado_multi(sample_key, n_perm=25):
    kmer_length = 18
    max_len = 10
    # build reference graph
    g_reference = reference_graph.ReferenceGraph(
        kmer_length,
        fasta_file='data/reference/NM_000546.5.fasta',
        snp_file='data/reference/snp_TP53.tab')
    # build patient graph
    g_patient = patient_graph.PatientGraph(
        ['data/tp53_analysis/reads/%s.fastq' % sample_key], kmer_length)
    g_patient.graph_cleaned_init(3.0)
    # copy g_patient cleaned and remove reference edges on it (.dbg_refrm creation)
    g_patient.graph_rmRefEdges_init(g_patient.dbgclean, g_reference.dbg)
    # search for alternative paths in dbg_refrm (.alteration_list creation)
    g_patient.alteration_list_init(g_reference.dbg, kmer_length, 3.0, max_len)

    # TODO build real set of possible k-mers
    all_possible_kmers = set()
    for an_alt in g_patient.alteration_list:
        all_possible_kmers.update(an_alt.reference_path)
        all_possible_kmers.update(an_alt.alternative_path)

    # build a random read graph
    import seq_lib_TP53 as seq_lib
    random_ratio_dict = collections.defaultdict(list)
    lonely_ratio_dict = {}
    ref_seq_dict = {}
    alt_seq_dict = {}
    for n_perm in range(n_perm):
        print n_perm
        rg = randomreadsgraph.RandomReadsGraph({
            "N": 0,
            "C": 0
        },
                                               k=kmer_length,
                                               seq_lib_module=seq_lib,
                                               restrict_to=None)
        for alt_i, putative_alt in enumerate(g_patient.alteration_list):
            # determine number of edit ops
            # There's at least one (since it's an alternative path)
            ref_seq = putative_alt.reference_sequence
            ref_seq_dict[alt_i] = ref_seq
            patient_seq = putative_alt.alternative_sequence
            edit_ops = Levenshtein.editops(ref_seq, patient_seq)
            lonely_ratio = putative_alt.ratio_read_count
            lonely_ratio_dict[alt_i] = lonely_ratio
            # print n_perm, alt_i, lonely_ratio, edit_ops
            for e in edit_ops:
                # print "Considering atomic edit op", e
                transformed = Levenshtein.apply_edit([e], ref_seq, patient_seq)
                ratio_random = rg.check_path(kmerize(ref_seq, kmer_length),
                                             kmerize(transformed, kmer_length),
                                             min_cov=putative_alt.min_coverage)
                random_ratio_dict[(alt_i, (e, ))].append(ratio_random[0])
                alt_seq_dict[(alt_i, (e, ))] = transformed
            # perform for all edit_ops
            ratio_random = rg.check_path(kmerize(ref_seq, kmer_length),
                                         kmerize(patient_seq, kmer_length),
                                         min_cov=putative_alt.min_coverage)
            random_ratio_dict[(alt_i, tuple(edit_ops))].append(ratio_random[0])
            alt_seq_dict[(alt_i, tuple(edit_ops))] = patient_seq
    for (alt_i, edit_ops_i), ratios in sorted(random_ratio_dict.items(),
                                              key=lambda x: x[0][0]):
        this_patient_ratio = lonely_ratio_dict[alt_i]
        random_ratios = alt_seq_dict[(alt_i, edit_ops_i)]
        print "Alt %d with real ratio %f, Edit ops %s, random_ratios :%s" % (
            alt_i, this_patient_ratio, edit_ops_i, map(str, ratios))
        print "Ref seq %s" % (ref_seq_dict[alt_i])
        print "Alt seq %s" % (random_ratios)
        print "N higher: %d" % (len(
            [x for x in ratios if x > this_patient_ratio]))
        standard_deviation = np.std(ratios)
        zscore = float((this_patient_ratio - np.mean(ratios)) / np.std(ratios))
        print "Z-Score", zscore
Example #9
0
# print(day_two(split_s))
import Levenshtein
from collections import defaultdict
import pandas as pd

# Small sample input; passed to part_two() further down in this script.
test_case2 = ["abcde", "fghij", "klmno", "pqrst", "fguij", "axcye", "wvxyz"]


def chars_apart(str1, str2):
    """Return how many edit operations Levenshtein reports from str1 to str2."""
    return len(Levenshtein.editops(str1, str2))


def part_two(str_list: list):
    """Return the labels of the string pairs that are exactly one edit apart.

    Every ordered pair ``(s, t)`` drawn from ``str_list`` is labelled
    ``'(s,t)'`` and scored with ``chars_apart``. This includes each string
    paired with itself and both orderings of every pair, so a matching pair
    appears twice in the result.

    Args:
        str_list: strings to compare pairwise.

    Returns:
        The pandas index holding the ``'(s,t)'`` labels whose score is 1.
    """
    # A plain dict is enough because every key is assigned explicitly. The
    # former ``defaultdict(lambda: int)`` factory would have produced the
    # ``int`` type itself rather than 0, and was never triggered anyway.
    dd = {f'({s},{t})': chars_apart(s, t) for s in str_list for t in str_list}

    # One row per pair label, with the score in the single column 0.
    df = pd.DataFrame(dd, index=[0]).T
    df = df[df.values == 1]
    return df.index


# Run part_two on the sample input first; the result is kept in `test` only.
test = part_two(test_case2)
# NOTE(review): split_s is not defined in the visible part of this file;
# presumably the puzzle input split into lines. Confirm.
soln = part_two(split_s)
print(soln)
# soln holds '(s,t)' pair labels, so the edit operations below are computed
# between the first two labels, not between the two underlying strings.
# NOTE(review): this raises IndexError when fewer than two labels matched.
edits = Levenshtein.editops(soln[0], soln[1])
# NOTE(review): applying the full edit list presumably just reproduces
# soln[1]; confirm whether the letters common to both strings were intended.
target = Levenshtein.apply_edit(edits, soln[0], soln[1])
print(target)
# NOTE(review): the expectation below names a string, but part_two returns an
# index of pair labels; verify what was meant.
# part_two(test_case2)=='fgij'