Ejemplo n.º 1
0
def get_pairwise_hom(jxn1, jxn2, chim_dir, side):
    """Return the maximum pairwise homology score between the transcripts
    of two fusion junctions.

    Each transcript is trimmed at its breakpoint (encoded as the last
    '|'-separated field of the FASTA header) and the retained side is
    Smith-Waterman aligned against every transcript of the partner
    junction.

    :param jxn1: first junction identifier (sanitized via safe_jxn)
    :param jxn2: second junction identifier (sanitized via safe_jxn)
    :param chim_dir: directory holding transcripts-fusion-<jxn>.fa files
    :param side: "left" keeps sequence before the breakpoint, anything
        else keeps the sequence after it
    :return: best normalized score in the 0-1 range (0 if no pairs scored)
    """
    clean_jxn1 = su.common.safe_jxn(jxn1)
    jxn1_fa = os.path.join(chim_dir,
                           'transcripts-fusion-' + clean_jxn1 + '.fa')
    clean_jxn2 = su.common.safe_jxn(jxn2)
    jxn2_fa = os.path.join(chim_dir,
                           'transcripts-fusion-' + clean_jxn2 + '.fa')
    aligner = ssw.Aligner(gap_open=12, gap_extend=4)  # match=2, mismatch=2

    def _trim(seq, brk):
        # keep the requested side of the breakpoint
        return seq[:brk] if side == "left" else seq[brk:]

    # Materialize the partner transcripts once instead of re-reading the
    # FASTA file from disk for every jxn1 transcript (was O(n*m) file reads).
    jxn2_seqs = [_trim(seq, int(head.split('|')[-1]))
                 for head, seq in su.common.fasta_iter(jxn2_fa)]
    max_res = 0
    for fa1_head, fa1_seq in su.common.fasta_iter(jxn1_fa):
        fa1_seq = _trim(fa1_seq, int(fa1_head.split('|')[-1]))
        for fa2_seq in jxn2_seqs:
            if not fa1_seq or not fa2_seq:
                continue  # empty trim would divide by zero below
            cmp_align = aligner.align(reference=fa1_seq, query=fa2_seq)
            # perfect SW score is 2 * length of the shorter sequence, so
            # this normalizes homology into the 0-1 range
            norm_res = cmp_align.score / (min(len(fa1_seq), len(fa2_seq)) * 2)
            max_res = max(max_res, norm_res)
    return max_res
Ejemplo n.º 2
0
def run_crosshom_ssw(reads_fq, trxleft_fa, trxright_fa):
    '''Get the fraction of reads with strong SW hits to BOTH fusion genes.

    For each read (top 500 only, for speed) the best SW score against the
    left-gene transcripts and against the right-gene transcripts is taken;
    a read "matches" when the weaker of the two, normalized by the maximum
    possible score (2 * read length), exceeds 0.5.

    :param reads_fq: FASTQ of supporting reads
    :param trxleft_fa: FASTA of left-gene transcripts
    :param trxright_fa: FASTA of right-gene transcripts
    :return: fraction of matching reads rounded to 3 decimals, or 0
    '''
    aligner = ssw.Aligner(gap_open=12, gap_extend=4)  # match=2, mismatch=2
    rfq_gen = su.common.FastqParser(reads_fq)
    # Read both transcript FASTAs once instead of once per read.
    trxl_seqs = [seq for _, seq in su.common.fasta_iter(trxleft_fa)]
    trxr_seqs = [seq for _, seq in su.common.fasta_iter(trxright_fa)]
    matches = []
    # iterate through reads and do SW against fusion partner transcript
    for rfq in (x for _, x in zip(range(500), rfq_gen)):  # just do top 500 reads to maintain speed
        if rfq.seq_len <= 20:
            # small sequences will always give positive hits, so skip them
            matches.append(0)
            continue  # BUG FIX: previously fell through and appended twice
        l_max = 0
        r_max = 0
        for trxl_seq in trxl_seqs:
            l_max = max(aligner.align(reference=rfq.sequence, query=trxl_seq).score, l_max)
        for trxr_seq in trxr_seqs:
            r_max = max(aligner.align(reference=rfq.sequence, query=trxr_seq).score, r_max)
        # pct identity matching with flexibility for snvs; take the weaker side
        read_norm = min(l_max, r_max) / (rfq.seq_len * 2)
        matches.append(read_norm)
    if matches:
        # builtin sum: np.sum over a generator is deprecated/unreliable
        frac = sum(1 for m in matches if m > .5) / len(matches)
        return float("{0:.3f}".format(frac))
    return 0
Ejemplo n.º 3
0
def realign(read, chrom, ref):
    """Re-align a read against a reference window around its original
    mapping and return the (new_pos, new_cigar) pair."""
    pad = len(read.seq)
    window_start = max(read.reference_start - pad, 0)
    window_end = read.reference_end + pad
    window_seq = ref.fetch(chrom, window_start, window_end)
    result = ssw.Aligner().align(reference=window_seq, query=read.seq)
    return window_start + result.reference_begin, result.cigar
Ejemplo n.º 4
0
 def test_mismatch(self):
     """A single substituted base gives 19 matches, 1 mismatch, cigar 20M."""
     ref_seq = "GTGCGATGTGCGATGAGATC"
     mut_query = ref_seq[:9] + 'A' + ref_seq[10:]
     aln = ssw.Aligner().align(mut_query, ref_seq)
     self.assertEqual(aln.match_count, 19)
     self.assertEqual(aln.mismatch_count, 1)
     self.assertEqual(aln.insertion_count, 0)
     self.assertEqual(aln.deletion_count, 0)
     self.assertEqual(aln.cigar, "20M")
Ejemplo n.º 5
0
 def test_rc_alignment(self):
     """A reverse-complemented query still aligns perfectly end to end
     (presumably via the aligner's reverse-complement search)."""
     ref_seq = "GTGCGATGTGCGATGAGATC"
     rc_query = "GATCTCATCGCACATCGCAC"
     aln = ssw.Aligner().align(rc_query, ref_seq)
     self.assertEqual(aln.match_count, 20)
     self.assertEqual(aln.mismatch_count, 0)
     self.assertEqual(aln.insertion_count, 0)
     self.assertEqual(aln.deletion_count, 0)
     self.assertEqual(aln.cigar, "20M")
Ejemplo n.º 6
0
 def test_perfect_alignment(self):
     """Aligning a sequence against itself matches every base, no edits."""
     seq = "GTGCGATGTGCGATGAGATC"
     aln = ssw.Aligner().align(seq, seq)
     self.assertEqual(aln.match_count, len(seq))
     self.assertEqual(aln.mismatch_count, 0)
     self.assertEqual(aln.insertion_count, 0)
     self.assertEqual(aln.deletion_count, 0)
     self.assertEqual(aln.cigar, '20M')
Ejemplo n.º 7
0
 def test_issue_1(self):
     """Regression test for https://github.com/vishnubob/ssw/issues/1."""
     ref_seq = "CCC" + "AGCT" * 10
     query_seq = "AGGT" * 10
     result = ssw.Aligner().align(query_seq, ref_seq)
     r_line, m_line, q_line = result.alignment
     self.assertEqual(r_line, "AGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAG")
     self.assertEqual(m_line, "||*|||*|||*|||*|||*|||*|||*|||*|||*|||")
     self.assertEqual(q_line, "AGGTAGGTAGGTAGGTAGGTAGGTAGGTAGGTAGGTAG")
Ejemplo n.º 8
0
def getBestScoringAllelesForExon(genotypes, commonExon, secondary, consensus):
    """Score each allele's exon sequence against the consensus with SW and
    return the set of best-scoring alleles."""
    aligner = ssw.Aligner()
    scores = {}
    for allele in genotypes:
        exon_seq = secondary[allele][commonExon]
        scores[allele] = aligner.align(str(consensus.seq), exon_seq).score
    # rank alleles by score, highest first, and keep the best ones
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return set(getBestScoringAlleles(ranked))
Ejemplo n.º 9
0
 def test_deletion(self):
     """Removing one base from the query yields exactly one deletion."""
     ref_seq = "GTGCGATGTGCGATGAGATC"
     del_query = ref_seq[:10] + ref_seq[11:]
     aln = ssw.Aligner().align(del_query, ref_seq)
     self.assertEqual(aln.match_count, 19)
     self.assertEqual(aln.mismatch_count, 0)
     self.assertEqual(aln.insertion_count, 0)
     self.assertEqual(aln.deletion_count, 1)
     self.assertEqual(aln.cigar, "10M1D9M")
Ejemplo n.º 10
0
 def test_alignment_pickle(self):
     """An alignment survives a pickle round-trip with its state intact."""
     seq = "GTGCGATGTGCGATGAGATC"
     aln = ssw.Aligner().align(seq, seq)
     restored = pickle.loads(pickle.dumps(aln))
     for attr, value in aln.__dict__.items():
         self.assertIn(attr, restored.__dict__)
         self.assertEqual(value, restored.__dict__[attr])
Ejemplo n.º 11
0
 def execute_alignment(self, query):
     """Align *query* against every sequence in self.references and return
     the (reference, alignment) pair with the highest score.

     Side effect: sets self.dna_alphabet (used to build the score matrix).
     """
     row = []
     self.dna_alphabet = "AGTCNRYSWKMBDHV"
     matrix = ssw.DNA_ScoreMatrix(alphabet=self.dna_alphabet)
     aligner = ssw.Aligner(matrix=matrix)
     for reference in self.references:
         alignment = aligner.align(query, reference)
         row.append(alignment)
     # sort by score descending; key= is portable (the cmp= parameter and
     # the cmp builtin were removed in Python 3), and both forms are stable
     row.sort(key=lambda aln: aln.score, reverse=True)
     winner = row[0]
     return (winner.reference, winner)
Ejemplo n.º 12
0
 def test_degen_alignment(self):
     """IUPAC degenerate codes match their corresponding plain bases.

     XXX: this fails if the two sequences are swapped, see
     https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library/issues/63
     """
     plain = "AGCGATCACGT"
     degen = "MRYSWKBDHVN"
     aln = ssw.Aligner().align(plain, degen)
     self.assertEqual(aln.match_count, len(plain))
     self.assertEqual(aln.mismatch_count, 0)
     self.assertEqual(aln.insertion_count, 0)
     self.assertEqual(aln.deletion_count, 0)
     self.assertEqual(aln.cigar, '%dM' % len(plain))
Ejemplo n.º 13
0
def realign(initref, lastref, lastref_profile):
    """
    Realign lastref to initref

    The purpose is to generate a better alignment for restoring
    initref position numbers in multi-alignment results.

    :param initref: the initial reference sequence
    :param lastref: the last-iteration reference; may contain '-' deletions
    :param lastref_profile: profile string whose '-' positions also mark
        deletions (version-compatibility, see comment below)
    :return: alnprofile, a list of (ref_pos0, count) tuples — presumably
        mapping consensus columns to 0-based initref positions with a
        deletion count; TODO confirm semantics against callers
    """
    # compatible consideration; different versions of
    # iterative alignment represent deletion slightly varied
    cons_delpos = set(
        str_index_all(lastref, '-') + str_index_all(lastref_profile, '-'))
    # strip the recorded deletion columns before re-aligning
    lastref = ''.join('' if p in cons_delpos else n
                      for p, n in enumerate(lastref))
    aligner = ssw.Aligner()
    aln = aligner.align(reference=initref, query=lastref)
    if aln.query_begin > 0 or aln.reference_begin > 0:
        # Why?
        raise RuntimeError('Consensus misaligned')
    initref, _, lastref = aln.alignment
    ref_pos0 = -1  # 0-based position within the original initref
    alnprofile = []
    for refna, consna in zip(initref, lastref):
        if refna != '-':
            ref_pos0 += 1

        # The re-alignment could place deletions in slightly different
        # place comparing to the original alignment. Four cases need
        # to be considered:
        #   - +n+o: deletion presents in both alignments:
        #           no extra handling needed
        #   - +n-o: deletion only presents in new alignment:
        #           new deletion should be added to the multi-alignment
        #   - -n+o: deletion only presents in old alignment:
        #           multi-alignment NAs mapped to the old deletion
        #           should be removed to previous refpos (as insertion)
        #   - -n-o: no deletion:
        #           no extra handling needed
        newdel = consna == '-'
        olddel = len(alnprofile) in cons_delpos
        # NOTE(review): this is a `while ... else`; since the loop body has
        # no `break`, the `else` block ALWAYS runs after the loop finishes
        # draining mismatched old deletions.  Confirm `while/else` (rather
        # than a plain `while` followed by the code below) is intended.
        while not newdel and olddel:  # -n+o
            # mismatched old deletions
            prev_refpos0, _ = alnprofile[-1]
            alnprofile.append((prev_refpos0, -1))
            olddel = len(alnprofile) in cons_delpos
        else:
            if newdel:  # +n+o/+n-o
                # new deletions
                # NOTE(review): assumes alnprofile is non-empty here, i.e.
                # the first aligned column is never a deletion — otherwise
                # alnprofile[-1] raises IndexError.  TODO confirm.
                prev_refpos0, count = alnprofile[-1]
                alnprofile[-1] = (prev_refpos0, count + 1)
            else:  # -n-o
                # agree, non deletion
                alnprofile.append((ref_pos0, 0))
    return alnprofile
Ejemplo n.º 14
0
 def test_coverage(self):
     """A 5 bp sub-sequence covers 100% of the query and 5/20 of the
     reference."""
     bplen = 5
     ref_seq = "GTGCGATGTGCGATGAGATC"
     sub_query = ref_seq[bplen:bplen * 2]
     aln = ssw.Aligner().align(sub_query, ref_seq)
     self.assertEqual(aln.match_count, bplen)
     self.assertEqual(aln.mismatch_count, 0)
     self.assertEqual(aln.insertion_count, 0)
     self.assertEqual(aln.deletion_count, 0)
     self.assertEqual(aln.cigar, '%dM' % bplen)
     self.assertEqual(aln.query_coverage, 1.0)
     self.assertEqual(aln.reference_coverage, bplen / len(ref_seq))
Ejemplo n.º 15
0
def selectGenotypesConsideringIntronsAndUTRs(genotypes, genomicRefs,
                                             consensus):
    """Like getBestScoringAllelesForExon(), but scores the whole genomic
    sequence of each allele against the consensus and returns the set of
    best-scoring alleles."""
    aligner = ssw.Aligner()
    scores = {}
    for allele in genotypes:
        scores[allele] = aligner.align(str(consensus.seq),
                                       genomicRefs[allele]).score
    # rank alleles by score, highest first, and keep the best ones
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return set(getBestScoringAlleles(ranked))
Ejemplo n.º 16
0
def get_ssw_alignments(best_edit_distances, querys, targets):
    """Run SSW on every query/target pair named in *best_edit_distances*.

    :param best_edit_distances: dict q_acc -> iterable of candidate t_accs
    :param querys: dict q_acc -> query sequence
    :param targets: dict t_acc -> target sequence
    :return: (best_edit_distances_ssw, best_cigars_ssw) where the first
        maps q_acc -> {t_acc: mismatches + indels} and the second maps
        q_acc -> {t_acc: (cigar, mismatches, indels, query_begin,
        query_end_offset, reference_begin, reference_end_offset)}
    """
    score_matrix = ssw.DNA_ScoreMatrix(match=1, mismatch=-2)
    aligner = ssw.Aligner(gap_open=2, gap_extend=1, matrix=score_matrix)
    best_edit_distances_ssw = {}
    best_cigars_ssw = {}
    for acc1 in best_edit_distances:
        seq1 = querys[acc1]
        best_edit_distances_ssw[acc1] = {}
        best_cigars_ssw[acc1] = {}
        for acc2 in best_edit_distances[acc1]:
            seq2 = targets[acc2]
            result = aligner.align(seq1, seq2, revcomp=False)
            # in the match line '*' marks mismatch columns and ' ' marks
            # indel columns
            _, match_line, _ = result.alignment
            mismatches = match_line.count("*")
            indels = match_line.count(" ")
            # SW-based edit distance (substitutions + gap columns)
            best_edit_distances_ssw[acc1][acc2] = mismatches + indels
            # cigar plus the unaligned prefix/suffix lengths on both sides
            best_cigars_ssw[acc1][acc2] = (result.cigar, mismatches, indels,
                                           result.query_begin,
                                           len(seq1) - result.query_end - 1,
                                           result.reference_begin,
                                           len(seq2) - result.reference_end - 1)
    return best_edit_distances_ssw, best_cigars_ssw
Ejemplo n.º 17
0
def preSelectTypes(primary, consensus, locus):
    """
    Pre-select candidate alleles by SW-scoring their primary exons
    against the consensus sequence.

    For each allele, exon 2 (exons[0]) is aligned against the consensus
    and its score recorded.  For Class-I loci (HLA-A/B/C) exon 3
    (exons[1]) is scored as well and the intersection of the two
    best-scoring allele sets is returned; for other loci the
    best-scoring set for exon 2 alone is returned.

    :param primary: dict allele -> list of exon sequences (exon 2 [, exon 3])
    :param consensus: consensus record; consensus.seq is the sequence
    :param locus: locus name, e.g. "HLA-A"
    :return: list of alleles (Class-I) or the exon-2 best set otherwise
    """
    print "Selecting best alleles using primary exons"
    # we are going to use https://github.com/vishnubob/ssw that is using
    # https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library for SW and SAM output
    # unfortunately it can not be made multiprocess yet, but it is fast enough
    isClassI = locus in ["HLA-A", "HLA-B", "HLA-C"]
    alignmentEx2 = {}
    alignmentEx3 = {}
    sw = ssw.Aligner()
    for allele, exons in primary.items():
        alignment = sw.align(str(consensus.seq), exons[0])
        alignmentEx2[allele] = alignment.score
        # Class-I loci are additionally scored on exon 3 (the original
        # comment said "if it is not a Class-I", but the code scores
        # exon 3 only FOR Class-I loci)
        if isClassI:
            alignment = sw.align(str(consensus.seq), exons[1])
            alignmentEx3[allele] = alignment.score
        # print allele + " scores: exon 2: " + str(alignmentEx2[allele]) + " exon 3 " + str(alignmentEx3[allele])
    # sort the dict by values and reverse the result
    bestEx2 = getBestScoringAlleles(
        sorted(alignmentEx2.items(), key=operator.itemgetter(1), reverse=True))
    if isClassI:
        bestEx3 = getBestScoringAlleles(
            sorted(alignmentEx3.items(),
                   key=operator.itemgetter(1),
                   reverse=True))
    # return with the intersect of the two sets, leaving only entries that are in the best matching set for both exon 2 and exon 3
    # or exon 2 only for other than Class-I alleles
    print "done"
    return list(set(bestEx2) & set(bestEx3)) if isClassI else bestEx2
Ejemplo n.º 18
0
def realign2(initref, lastref, lastref_profile):
    """
    Realign lastref to initref

    The purpose is to generate a better alignment for restoring
    initref position numbers in multi-alignment results.

    Returns (resultseq, alnprofile) strings of equal length where the
    profile marks each column as '+' (insertion relative to initref),
    '-' (deletion) or '.' (aligned).
    """
    # compatible consideration; different versions of
    # iterative alignment represent deletion slightly varied
    cons_delpos = set(
        str_index_all(lastref, '-') + str_index_all(lastref_profile, '-'))
    # drop the recorded deletion columns before re-aligning
    lastref = ''.join(na for pos, na in enumerate(lastref)
                      if pos not in cons_delpos)
    aln = ssw.Aligner().align(reference=initref, query=lastref)
    covers_query = aln.query_begin == 0 and aln.query_end + 1 >= len(lastref)
    covers_ref = aln.reference_begin == 0 and aln.reference_end + 1 >= len(initref)
    if not (covers_query and covers_ref):
        # this should never happen since lastref is constructed
        # from initref, but just in case
        raise NotImplementedError('Partial alignment not supported yet')
    aligned_ref, _, aligned_cons = aln.alignment
    resultseq = []
    alnprofile = []
    for refna, consna in zip(aligned_ref, aligned_cons):
        if refna == '-':
            # insertion relative to initref
            resultseq.append(consna)
            alnprofile.append('+')
        elif consna == '-':
            # deletion relative to initref
            resultseq.append(refna)
            alnprofile.append('-')
        else:
            # aligned column: keep the consensus base
            resultseq.append(consna)
            alnprofile.append('.')
    return ''.join(resultseq), ''.join(alnprofile)
Ejemplo n.º 19
0
def get_best_match(consensus_transcripts, reference_transcripts, outfolder,
                   transcript_abundances, transcript_copies, sampled_dict,
                   params):
    """Match each consensus transcript to its best reference and write
    a TSV report (results.tsv) into *outfolder*.

    The header row records the reference count, the consensus count and
    the abundance values; one row per consensus transcript (above
    params.sim_cutoff identity) follows.  In exact-match mode
    (params.only_exact) only identical sequences are matched; otherwise
    candidates come from get_minimizers_2set_simple/get_ssw_alignments.

    :param sampled_dict: optional dict of sampled reference accessions;
        when given, false negatives are computed against its keys instead
        of all references
    """
    # NOTE(review): out_file is never explicitly closed (and stays open on
    # the early return below) — consider a `with` block.
    out_file = open(os.path.join(outfolder, "results.tsv"), "w")
    # NOTE(review): this aligner is created but never used in this
    # function; the alignment work happens inside get_ssw_alignments.
    aligner = ssw.Aligner(gap_open=2, gap_extend=1)
    # do SW
    nr_unique_refs = len(reference_transcripts)
    errors_container = {}
    identity_container = {}
    error_types_container = {}
    best_match_container = {}
    not_FN = set()  # references hit by some consensus (not false negatives)
    # print(consensus_transcripts)
    if len(consensus_transcripts) == 0:
        # nothing inferred: write the header row only and stop
        out_file.write("{0}\t{1}\t{2}\n".format(
            nr_unique_refs, len(consensus_transcripts),
            ",".join([str(a) for a in transcript_abundances.values()])))
        return

    # NOTE(review): this value is immediately overwritten below and is only
    # useful together with the commented-out debug prints.
    sorted_lengths = sorted([
        (len(q_seq), q_acc) for q_acc, q_seq in consensus_transcripts.items()
    ])
    # for l in sorted_lengths:
    #     print(l)

    print("REF LENGHTS")
    sorted_lengths = sorted(
        [len(r_seq) for r_acc, r_seq in reference_transcripts.items()])
    # for l in sorted_lengths:
    #     print(l)

    # pre check exact matches:
    if params.only_exact:
        ref_seq_to_acc = {
            seq: acc
            for acc, seq in reference_transcripts.items()
        }
        ref_seqs = set(reference_transcripts.values())
        exact_matches = set()
        for q_acc, q_seq in consensus_transcripts.items():
            if q_seq in ref_seqs:
                exact_matches.add(q_acc)
                ref_acc = ref_seq_to_acc[q_seq]  #.split("copy")[0]
                print("Exact", q_acc, "to transcript with copy number:",
                      transcript_copies[ref_acc])
                errors_container[q_acc] = 0
                best_match_container[q_acc] = ref_acc
                identity_container[q_acc] = 1.0
                error_types_container[q_acc] = (0, 0, 0)
                not_FN.add(ref_acc)

        print(len(ref_seqs))
        print(len(consensus_transcripts))
        print("EXACT MATCHES:", len(exact_matches))

    else:
        print("Start1")
        best_edit_distances = get_minimizers_2set_simple(
            consensus_transcripts, reference_transcripts)
        # NOTE(review): elsewhere in this codebase get_ssw_alignments
        # returns a (edit_distances, cigars) TUPLE of two dicts, while here
        # the result is consumed as a single dict — confirm which version
        # of get_ssw_alignments this caller expects.
        minimizer_graph_c_to_t = get_ssw_alignments(best_edit_distances,
                                                    consensus_transcripts,
                                                    reference_transcripts)
        # NOTE(review): q_seq here is the inner candidate dict, not a
        # sequence, so len(q_seq) below counts candidates — verify intended.
        for i, (q_acc, q_seq) in enumerate(minimizer_graph_c_to_t.items()):
            best_ed = 200000
            r_acc_max_id = "NONE"
            fewest_errors = len(q_seq)
            best_mismatches, best_insertions, best_deletions = len(q_seq), len(
                q_seq), len(q_seq)

            for j, (r_acc,
                    r_seq) in enumerate(minimizer_graph_c_to_t[q_acc].items()):
                # NOTE(review): assumes each entry is a (deletions,
                # insertions, mismatches) triple — TODO confirm.
                deletions, insertions, mismatches = minimizer_graph_c_to_t[
                    q_acc][r_acc]
                edit_distance = deletions + insertions + mismatches

                if edit_distance < best_ed:
                    best_ed = edit_distance
                    r_acc_max_id = r_acc
                    fewest_errors = edit_distance
                    best_mismatches, best_insertions, best_deletions = mismatches, insertions, deletions

            errors_container[q_acc] = fewest_errors
            best_match_container[q_acc] = r_acc_max_id
            # NOTE(review): if a consensus had no candidates, r_acc_max_id
            # stays "NONE" and this lookup raises KeyError — TODO confirm
            # candidates are always non-empty.
            identity_container[q_acc] = 1.0 - (best_ed / float(
                max(len(q_seq), len(reference_transcripts[r_acc_max_id]))))
            error_types_container[q_acc] = (best_mismatches, best_insertions,
                                            best_deletions)
            not_FN.add(r_acc_max_id)

        print("Stop1!")

    # false negatives: references never chosen as a best match
    if sampled_dict:
        FN = set(sampled_dict.keys()).difference(not_FN)
    else:
        FN = set(reference_transcripts.keys()).difference(not_FN)

    for ref in FN:
        print("FN:", ref, len(reference_transcripts[ref]))
    # current logging:
    # first row display info of number of uniqur reference transcripts, and number of inferred transcripts
    if sampled_dict:
        out_file.write("{0}\t{1}\t{2}\n".format(
            len(sampled_dict), len(consensus_transcripts),
            ",".join([str(a) for a in transcript_abundances.values()])))
    else:
        out_file.write("{0}\t{1}\t{2}\n".format(
            nr_unique_refs, len(consensus_transcripts),
            ",".join([str(a) for a in transcript_abundances.values()])))

    # total discoveries, total perfect matches (1.0 identity), errors for each consensus
    # print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n".format(nr_unique_refs, q_acc, best_match_container[q_acc], errors_container[q_acc], identity_container[q_acc], *error_types_container[q_acc]))
    for q_acc in errors_container:
        # each ro displays values for a consensus transcript
        if identity_container[q_acc] > params.sim_cutoff:
            ssw_stats = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
                q_acc, best_match_container[q_acc], errors_container[q_acc],
                identity_container[q_acc], *error_types_container[q_acc])
            # print(ssw_stats, minimizer_graph_c_to_t[q_acc])
            # print()
            out_file.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
                q_acc, best_match_container[q_acc], errors_container[q_acc],
                identity_container[q_acc], *error_types_container[q_acc]))
Ejemplo n.º 20
0
# Filter a BED-like file of repeat loci: keep only records whose reference
# sequence (7th column-index 5) exactly equals the motif (index 4) repeated
# across the locus span, and report how many were dropped.
# NOTE(review): Python 2 script — the bare `print "..."` statement below
# does not parse under Python 3.
import ssw
import sys

if len(sys.argv) < 3:
    print "Usage: python remove_messy.py in.bed out.bed"
    sys.exit()
in_bed = sys.argv[1]   # input BED path
out_bed = sys.argv[2]  # output BED path (clean loci only)

# NOTE(review): `aligner` and `allowed_percent` are only referenced by the
# commented-out alignment-based filter below; the active filter is an
# exact string comparison.
aligner = ssw.Aligner()
allowed_percent = 0.0
with open(in_bed, 'r') as refin:
    with open(out_bed, 'w') as refout:
        messy = 0   # loci dropped (reference != reconstructed repeat)
        clean = 0   # loci kept and written out
        for line in refin:
            rec = line.strip().split("\t")
            ref = rec[5]
            # reconstruct the expected sequence: motif rec[4] repeated to
            # cover the span rec[1]..rec[2] given motif length rec[3]
            query = rec[4] * int((int(rec[2]) - int(rec[1]) + 1) / int(rec[3]))
            #alignment = aligner.align(ref, query)

            #if alignment.mismatch_count + alignment.deletion_count + alignment.insertion_count > allowed_percent / 100.0 * len(ref):
            if (ref != query):
                #print (alignment.alignment_report())
                messy = messy + 1
            else:
                clean = clean + 1
                refout.write('\t'.join(rec) + '\n')

print('Deleted ' + str(messy) + ' loci')
print(str(clean) + ' loci survived')
Ejemplo n.º 21
0
def ssw_alignment( x_acc, y_acc, x, y, i,j, max_discrepancy = 50):
    """
        Aligns two sequences with SSW
        x: query
        y: reference

        The local SSW alignment may leave unaligned ends; when an end is
        at most max_discrepancy long it is re-aligned globally with
        Biopython's pairwise2 (blosum62 matrix, gap open -1, extend -0.5)
        and stitched onto the SSW alignment strings.

        Returns (y_alignment, x_alignment, stats) where stats is
        (matches, mismatches, indels, deletions, insertions), or
        (y_alignment, x_alignment, None) when either end discrepancy
        exceeds max_discrepancy.  i and j are only used for progress
        logging.
    """
    if i % 10 == 0 and j % 10000 == 0:
        print("processing alignments on all y's where read x_i is participating. i={0}".format(i+1))

    score_matrix = ssw.DNA_ScoreMatrix(match=1, mismatch=-1)
    aligner = ssw.Aligner(gap_open=2, gap_extend=1, matrix=score_matrix)

    # for the ends that SSW leaves behind
    bio_matrix = matlist.blosum62
    g_open = -1
    g_extend = -0.5
    ######################################

    result = aligner.align(x, y, revcomp=True)
    y_alignment, match_line, x_alignment = result.alignment

    # '|' marks matches, '*' mismatches, ' ' indel columns
    matches, mismatches, indels = match_line.count("|"), match_line.count("*"), match_line.count(" ")
    deletions = x_alignment.count("-")
    insertions = y_alignment.count("-")
    # sanity check: every gap column must be a gap in exactly one string
    assert deletions + insertions == indels
    # alignment_length = len(match_line)

    start_discrepancy = max(result.query_begin, result.reference_begin)  # 0-indexed # max(result.query_begin, result.reference_begin) - min(result.query_begin, result.reference_begin)
    query_end_discrepancy = len(x) - result.query_end - 1
    ref_end_discrepancy = len(y) - result.reference_end - 1
    end_discrepancy = max(query_end_discrepancy, ref_end_discrepancy)  # max(result.query_end, result.reference_end) - min(result.query_end, result.reference_end)
    # print(query_end_discrepancy, ref_end_discrepancy)
    tot_discrepancy = start_discrepancy + end_discrepancy

    # patch the unaligned START, if small enough
    if 0 < start_discrepancy <= max_discrepancy:
        # print("HERE")
        matches_snippet = 0
        mismatches_snippet = 0
        if result.query_begin and result.reference_begin:
            # both sequences have an unaligned prefix: global-align them
            query_start_snippet = x[:result.query_begin]
            ref_start_snippet = y[:result.reference_begin]
            alns = pairwise2.align.globalds(query_start_snippet, ref_start_snippet, bio_matrix, g_open, g_extend)
            top_aln = alns[0]
            # print(alns)
            mismatches_snippet = len(list(filter(lambda x: x[0] != x[1] and x[0] != '-' and x[1] != "-", zip(top_aln[0],top_aln[1]))))
            indels_snippet = top_aln[0].count("-") + top_aln[1].count("-")
            matches_snippet = len(top_aln[0]) - mismatches_snippet - indels_snippet
            # print(matches_snippet, mismatches_snippet, indels_snippet)
            query_start_alignment_snippet = top_aln[0]
            ref_start_alignment_snippet = top_aln[1]
        elif result.query_begin:
            # only the query overhangs: pad the reference with gaps
            query_start_alignment_snippet = x[:result.query_begin]
            ref_start_alignment_snippet = "-"*len(query_start_alignment_snippet)
            indels_snippet = len(ref_start_alignment_snippet)
        elif result.reference_begin:
            # only the reference overhangs: pad the query with gaps
            ref_start_alignment_snippet = y[:result.reference_begin]
            query_start_alignment_snippet = "-"*len(ref_start_alignment_snippet)
            indels_snippet = len(query_start_alignment_snippet)
        else:
            print("BUG")
            sys.exit()
        matches, mismatches, indels = matches + matches_snippet, mismatches + mismatches_snippet, indels + indels_snippet

        # print(ref_start_alignment_snippet)
        # print(query_start_alignment_snippet)
        y_alignment = ref_start_alignment_snippet + y_alignment
        x_alignment = query_start_alignment_snippet + x_alignment

    # patch the unaligned END, if small enough (mirror of the block above)
    if 0 < end_discrepancy <= max_discrepancy:
        # print("HERE2", query_end_discrepancy, ref_end_discrepancy)
        # print(y_alignment)
        # print(y)
        # print(match_line)
        # print(x_alignment)
        # print(x)
        # print(matches, len(x_alignment))
        matches_snippet = 0
        mismatches_snippet = 0
        if query_end_discrepancy and ref_end_discrepancy:
            query_end_snippet = x[result.query_end+1:]
            ref_end_snippet = y[result.reference_end+1:]
            alns = pairwise2.align.globalds(query_end_snippet, ref_end_snippet, bio_matrix, g_open, g_extend)
            top_aln = alns[0]
            mismatches_snippet = len(list(filter(lambda x: x[0] != x[1] and x[0] != '-' and x[1] != "-", zip(top_aln[0],top_aln[1]))))
            indels_snippet = top_aln[0].count("-") + top_aln[1].count("-")
            matches_snippet = len(top_aln[0]) - mismatches_snippet - indels_snippet
            query_end_alignment_snippet = top_aln[0]
            ref_end_alignment_snippet = top_aln[1]
        elif query_end_discrepancy:
            query_end_alignment_snippet = x[result.query_end+1:]
            ref_end_alignment_snippet = "-"*len(query_end_alignment_snippet)
            indels_snippet = len(ref_end_alignment_snippet)

        elif ref_end_discrepancy:
            ref_end_alignment_snippet = y[result.reference_end+1:]
            query_end_alignment_snippet = "-"*len(ref_end_alignment_snippet)
            indels_snippet = len(query_end_alignment_snippet)

        else:
            print("BUG")
            sys.exit()
        matches, mismatches, indels = matches + matches_snippet, mismatches + mismatches_snippet, indels + indels_snippet

        y_alignment = y_alignment + ref_end_alignment_snippet
        x_alignment = x_alignment + query_end_alignment_snippet

    # matches, mismatches, indels = match_line.count("|"), match_line.count("*"), match_line.count(" ")
    # recount gaps after the end-patching above
    deletions = x_alignment.count("-")
    insertions = y_alignment.count("-")
    assert deletions + insertions == indels

    if start_discrepancy > max_discrepancy or end_discrepancy > max_discrepancy:
        # print("REMOVING", start_discrepancy, end_discrepancy)
        return (y_alignment, x_alignment, None)

    else:
        return (y_alignment, x_alignment, (matches, mismatches, indels, deletions, insertions)) 
Ejemplo n.º 22
0
def reference_similarity(reference_transcripts, outfolder, params):
    """
        Stats about reference transcripts.

        Deduplicates *reference_transcripts* in place (removes entries
        whose sequence was already seen), derives per-transcript abundance
        and copy-number counts from the "copy"-suffixed accessions, and
        computes an all-vs-all edit-distance matrix with edlib.

        :param reference_transcripts: dict acc -> sequence; MUTATED
            (duplicate-sequence entries are deleted)
        :param outfolder: unused here, kept for interface compatibility
        :param params: unused here, kept for interface compatibility
        :return: (transcript_abundances, transcript_copies,
                  reference_similarities)
    """
    seqs_seen = set()
    transcript_abundances = {}
    transcript_copies = Counter()
    transcript_sequences = {}
    # sorted() materializes a list, so deleting from the dict below is safe
    for acc, seq in sorted(reference_transcripts.items(), key=lambda x: len(x[1])):
        try:
            tr_acc, copy_number_str = acc.split("copy")
        except ValueError:
            tr_acc, copy_number_str = acc, "1"  # viral data not simulated

        transcript_copies[tr_acc] += 1
        try:
            copy_number = int(copy_number_str)
        except ValueError:
            copy_number = 1

        # keep the highest copy number seen per transcript, and the
        # sequence that carried it
        if tr_acc not in transcript_abundances or copy_number > transcript_abundances[tr_acc]:
            transcript_abundances[tr_acc] = copy_number
            transcript_sequences[tr_acc] = seq

        if seq in seqs_seen:
            # duplicate sequence under another accession: drop it
            del reference_transcripts[acc]
        else:
            seqs_seen.add(seq)

    print("Number of unique references:", len(reference_transcripts))
    for t_acc, copy_nr in transcript_copies.items():
        print(t_acc, copy_nr)
    print("abundances:", transcript_abundances)

    print("calculating reference similarities")
    # all-vs-all global edit distance (edlib); an SSW-based variant was
    # removed here as dead code
    reference_similarities = {}
    for q_acc, q_seq in transcript_sequences.items():
        reference_similarities[q_acc] = {}
        for r_acc, r_seq in transcript_sequences.items():
            ed = edlib_ed(q_seq, r_seq, mode="NW", task="distance", k=10000)
            reference_similarities[q_acc][r_acc] = ed

    return transcript_abundances, transcript_copies, reference_similarities
Ejemplo n.º 23
0
def reference_similarity(reference_transcripts, outfolder, params):
    """
    Compute statistics about reference transcripts.

    Deduplicates identical sequences (mutating ``reference_transcripts`` in
    place, keeping the first-seen copy), tracks per-transcript copy counts and
    maximum copy numbers, and -- unless ``params.no_ref_sim`` is set -- plots a
    relative-abundance heatmap and a pairwise edit-distance heatmap.

    Args:
        reference_transcripts: dict mapping accession -> sequence. Accessions
            are expected to look like ``"<name>copy<k>"``; anything else gets
            copy number 1. Entries with duplicate sequences are deleted.
        outfolder: output folder (unused here; presumably plot_heatmap
            resolves its own output path -- TODO confirm).
        params: object exposing a boolean ``no_ref_sim`` attribute.

    Returns:
        Tuple ``(transcript_abundances, transcript_copies,
        reference_similarities)`` where ``reference_similarities`` is a square
        matrix (list of lists) of pairwise edit distances, or ``[]`` when the
        similarity computation is skipped.
    """
    seqs_seen = set()
    transcript_abundances = {}
    transcript_copies = Counter()
    # Iterate shortest-first so ties/dedup order are deterministic.
    for acc, seq in sorted(reference_transcripts.items(),
                           key=lambda x: len(x[1])):
        try:
            tr_acc, copy_number_str = acc.split("copy")
        except ValueError:
            tr_acc, copy_number_str = acc, "1"  # viral data not simulated

        transcript_copies[tr_acc] += 1
        try:
            copy_number = int(copy_number_str)
        except ValueError:
            copy_number = 1

        # Keep the maximum copy number observed for each transcript accession.
        prev = transcript_abundances.get(tr_acc)
        if prev is None or copy_number > prev:
            transcript_abundances[tr_acc] = copy_number

        if seq in seqs_seen:
            # Duplicate sequence: drop this accession from the reference set.
            del reference_transcripts[acc]
        else:
            seqs_seen.add(seq)

    print("Number of unique references:", len(reference_transcripts))
    for t_acc, copy_nr in transcript_copies.items():
        print(t_acc, copy_nr)

    # BUGFIX: previously this name was only bound inside the branch below,
    # so the return statement raised NameError when params.no_ref_sim was set.
    reference_similarities = []

    if not params.no_ref_sim:
        sorted_reference_tuples = sorted(reference_transcripts.items(),
                                         key=lambda x: len(x[1]))
        n_refs = len(sorted_reference_tuples)

        # Pairwise ratio of copy numbers between all retained references.
        reference_abundances = [[0] * n_refs for _ in range(n_refs)]
        for i, (acc1, _seq1) in enumerate(sorted_reference_tuples):
            for j, (acc2, _seq2) in enumerate(sorted_reference_tuples):
                copy_nr_1 = transcript_abundances[acc1.split("copy")[0]]
                copy_nr_2 = transcript_abundances[acc2.split("copy")[0]]
                reference_abundances[i][j] = float(copy_nr_1) / copy_nr_2

        relative_abundance_matrix_data_frame = pd.DataFrame(
            reference_abundances)
        plot_heatmap("relative_abundance",
                     relative_abundance_matrix_data_frame)

        print("calculating reference similarities")
        reference_similarities = [[0] * n_refs for _ in range(n_refs)]
        for i, (q_acc, q_seq) in enumerate(sorted_reference_tuples):
            for j, (r_acc, r_seq) in enumerate(sorted_reference_tuples):
                # Global (NW) edit distance; k bounds the banded search but is
                # chosen large enough that no distance is ever truncated.
                ed = edlib_ed(q_seq,
                              r_seq,
                              mode="NW",
                              task="distance",
                              k=2 * max(len(q_seq), len(r_seq)))
                reference_similarities[i][j] = ed

        ref_sim_data_frame = pd.DataFrame(reference_similarities)
        # Mask distances > 99 so the heatmap highlights near-identical pairs.
        msk = ref_sim_data_frame > 99
        ref_sim_data_frame_masked = ref_sim_data_frame.mask(msk)
        plot_heatmap("similarities", ref_sim_data_frame_masked)

    return transcript_abundances, transcript_copies, reference_similarities
def create_isoform_graph(transcripts, min_exon):
    """
    Build a graph over transcripts in which edges connect pairs that differ
    only by "structural" changes: zero substitutions and every internal indel
    of length >= min_exon. Maximal cliques of this graph are candidate
    isoform families.

    Args:
        transcripts: dict mapping accession -> nucleotide sequence.
        min_exon: minimum indel length for a difference to count as an
            exon-level (structural) change rather than small-indel noise.

    Returns:
        networkx.Graph with one node per accession; each edge stores the
        pairwise alignment in its ``alignment`` attribute.
    """
    G = nx.Graph()
    for acc in transcripts.keys():
        G.add_node(acc, accession=acc)

    # NOTE: the actual alignment is delegated to ssw_alignment(); the unused
    # local ssw.Aligner that used to be built here has been removed.
    cntr = 0
    processed = set()

    # Longest-first outer loop; `processed` ensures each unordered pair is
    # aligned exactly once.
    for acc1, seq1 in sorted(transcripts.items(), key=lambda x: len(x[1]), reverse=True):
        cntr += 1
        processed.add(acc1)
        if cntr % 5 == 0:
            print(cntr, "sequences processed")

        for acc2, seq2 in sorted(transcripts.items(), key=lambda x: len(x[1]), reverse=True):
            if acc2 in processed:
                continue
            seq1_aln, seq2_aln, matches, mismatches, indels, match_line = ssw_alignment(seq1, seq2, ends_discrepancy_threshold = 2000 )
            print(acc1, acc2, mismatches, indels)
            print(seq1_aln)
            print(seq2_aln)
            print(match_line)

            # Strip leading/trailing gaps so 5'/3' length differences are not
            # counted as internal deletions.
            tmp_seq1_aln = seq1_aln.lstrip("-").rstrip("-")
            tmp_seq2_aln = seq2_aln.lstrip("-").rstrip("-")

            # Runs of gaps inside the trimmed alignments = internal indels.
            del_seq1 = re.findall(r"[-]+", tmp_seq1_aln)
            del_seq2 = re.findall(r"[-]+", tmp_seq2_aln)
            # Recount substitutions over the full (untrimmed) alignment.
            mismatches = len([1 for n1, n2 in zip(seq1_aln, seq2_aln)
                              if n1 != n2 and n1 != "-" and n2 != "-"])

            # By default (all transcripts here are distinct) each transcript is
            # its own gene member; connect two only if the alignment contains
            # purely structural changes (every internal indel >= min_exon) and
            # no substitutions at all.
            if mismatches == 0:
                del_lengths1 = [len(del_) for del_ in del_seq1]
                del_lengths2 = [len(del_) for del_ in del_seq2]
                no_small_del_in_seq1 = not del_lengths1 or min(del_lengths1) >= min_exon
                no_small_del_in_seq2 = not del_lengths2 or min(del_lengths2) >= min_exon
                if no_small_del_in_seq1 and no_small_del_in_seq2:
                    G.add_edge(acc1, acc2, alignment={acc1: seq1_aln, acc2: seq2_aln})

    list_of_maximal_cliques = list(nx.find_cliques(G))
    print("Number of possible members:", len(list_of_maximal_cliques))
    print("clique sizes", [len(cl) for cl in sorted(list_of_maximal_cliques, key=len, reverse=True)])
    return G
def ssw_alignment(x, y, ends_discrepancy_threshold = 250 ):
    """
        Align two sequences with SSW (striped Smith-Waterman, local), then
        patch up moderate unaligned overhangs at either end with a Biopython
        global alignment so the returned alignment spans both sequences.

        x: query sequence
        y: reference sequence
        ends_discrepancy_threshold: maximum overhang length (in characters)
            at an end that will be re-aligned globally; larger overhangs are
            left unpatched.

        Returns (x_alignment, y_alignment, matches, mismatches, indels,
        match_line).
        NOTE(review): match_line is NOT extended when end snippets are
        prepended/appended, so it describes only the SSW core of the
        alignment -- confirm callers do not rely on its length matching
        x_alignment/y_alignment.
    """

    # Heavy mismatch/gap-open penalties make SSW report only the near-exact
    # local core; end discrepancies are handled separately below.
    score_matrix = ssw.DNA_ScoreMatrix(match=1, mismatch=-20)
    aligner = ssw.Aligner(gap_open=50, gap_extend=0, matrix=score_matrix)

    # for the ends that SSW leaves behind
    # NOTE(review): blosum62 is a protein scoring matrix applied here to
    # nucleotide snippets -- confirm this is intentional.
    bio_matrix = matlist.blosum62
    g_open = -1
    g_extend = -0.5
    ######################################

    # result = aligner.align("GA", "G", revcomp=False)
    # y_alignment, match_line, x_alignment = result.alignment
    # c = Counter(match_line)
    # matches, mismatches, indels = c["|"], c["*"], c[" "]
    # alignment_length = len(match_line)
    # print("matches:{0}, mismatches:{1}, indels:{2} ".format(matches, mismatches, indels))
    # print(match_line)

    result = aligner.align(x, y, revcomp=False)
    y_alignment, match_line, x_alignment = result.alignment
    # print()
    # print(y_alignment)
    # print(match_line)
    # print(x_alignment)
    # In the SSW match line: "|" = match, "*" = mismatch, " " = gap column.
    matches, mismatches, indels = match_line.count("|"), match_line.count("*"), match_line.count(" ")

    # alignment_length = len(match_line)

    # Unaligned prefix/suffix lengths on each sequence (0-indexed offsets).
    start_discrepancy = max(result.query_begin, result.reference_begin)  # 0-indexed # max(result.query_begin, result.reference_begin) - min(result.query_begin, result.reference_begin)
    query_end_discrepancy = len(x) - result.query_end - 1
    ref_end_discrepancy = len(y) - result.reference_end - 1
    end_discrepancy = max(query_end_discrepancy, ref_end_discrepancy)  # max(result.query_end, result.reference_end) - min(result.query_end, result.reference_end)
    # print("disc:", start_discrepancy, end_discrepancy)
    tot_discrepancy = start_discrepancy + end_discrepancy

    # Patch the 5' end if there is a small unaligned overhang.
    if 0 < start_discrepancy <= ends_discrepancy_threshold:
        print("HERE",start_discrepancy)  # debug trace left in by the author
        matches_snippet = 0
        mismatches_snippet = 0
        if result.query_begin and result.reference_begin:
            # Both sequences have unaligned prefixes: align them globally.
            query_start_snippet = x[:result.query_begin]
            ref_start_snippet = y[:result.reference_begin]
            alns = pairwise2.align.globalds(query_start_snippet, ref_start_snippet, bio_matrix, g_open, g_extend)
            top_aln = alns[0]
            # print(alns)
            mismatches_snippet = len(list(filter(lambda x: x[0] != x[1] and x[0] != '-' and x[1] != "-", zip(top_aln[0],top_aln[1]))))
            indels_snippet = top_aln[0].count("-") + top_aln[1].count("-")
            matches_snippet = len(top_aln[0]) - mismatches_snippet - indels_snippet
            # print(matches_snippet, mismatches_snippet, indels_snippet)
            query_start_alignment_snippet = top_aln[0]
            ref_start_alignment_snippet = top_aln[1]
        elif result.query_begin:
            # Only the query overhangs: the prefix aligns against pure gaps.
            query_start_alignment_snippet = x[:result.query_begin]
            ref_start_alignment_snippet = "-"*len(query_start_alignment_snippet)
            indels_snippet = len(ref_start_alignment_snippet)
        elif result.reference_begin:
            # Only the reference overhangs.
            ref_start_alignment_snippet = y[:result.reference_begin]
            query_start_alignment_snippet = "-"*len(ref_start_alignment_snippet)
            indels_snippet = len(query_start_alignment_snippet)
        else:
            # Unreachable: start_discrepancy > 0 implies one begin is nonzero.
            print("BUG")
            sys.exit()
        matches, mismatches, indels = matches + matches_snippet, mismatches + mismatches_snippet, indels + indels_snippet

        # print(ref_start_alignment_snippet)
        # print(query_start_alignment_snippet)
        y_alignment = ref_start_alignment_snippet + y_alignment
        x_alignment = query_start_alignment_snippet + x_alignment

    # Patch the 3' end if there is a small unaligned overhang (mirror of above).
    if 0 < end_discrepancy <= ends_discrepancy_threshold:
        print("HERE2", end_discrepancy)  # debug trace left in by the author
        matches_snippet = 0
        mismatches_snippet = 0
        if query_end_discrepancy and ref_end_discrepancy:
            query_end_snippet = x[result.query_end+1:]
            ref_end_snippet = y[result.reference_end+1:]
            alns = pairwise2.align.globalds(query_end_snippet, ref_end_snippet, bio_matrix, g_open, g_extend)
            top_aln = alns[0]
            mismatches_snippet = len(list(filter(lambda x: x[0] != x[1] and x[0] != '-' and x[1] != "-", zip(top_aln[0],top_aln[1]))))
            indels_snippet = top_aln[0].count("-") + top_aln[1].count("-")
            matches_snippet = len(top_aln[0]) - mismatches_snippet - indels_snippet
            query_end_alignment_snippet = top_aln[0]
            ref_end_alignment_snippet = top_aln[1]
        elif query_end_discrepancy:
            query_end_alignment_snippet = x[result.query_end+1:]
            ref_end_alignment_snippet = "-"*len(query_end_alignment_snippet)
            indels_snippet = len(ref_end_alignment_snippet)

        elif ref_end_discrepancy:
            ref_end_alignment_snippet = y[result.reference_end+1:]
            query_end_alignment_snippet = "-"*len(ref_end_alignment_snippet)
            indels_snippet = len(query_end_alignment_snippet)

        else:
            # Unreachable: end_discrepancy > 0 implies one overhang is nonzero.
            print("BUG")
            sys.exit()
        matches, mismatches, indels = matches + matches_snippet, mismatches + mismatches_snippet, indels + indels_snippet

        y_alignment = y_alignment + ref_end_alignment_snippet
        x_alignment = x_alignment + query_end_alignment_snippet

    return x_alignment, y_alignment, matches, mismatches, indels, match_line