def create_synteny_matrix_mul(self, gene_seq, g1, g2, n):
    """Build normalised global and local alignment matrices for two gene lists.

    For every pair ``(g1[i], g2[j])`` with ``i, j < n`` where neither entry is
    the placeholder ``"NULL_GENE"``, four values are computed, each divided by
    the length of the longer of the two sequences:

    * ``sm[i][j][0]``  - ``ed.align`` edit distance (mode ``"NW"``) between
      the two sequences.
    * ``sm[i][j][1]``  - the same distance against the second sequence
      reversed (plain string reversal, not a reverse complement).
    * ``sml[i][j][0]`` - ``local_pairwise_align_ssw`` score of the pair.
    * ``sml[i][j][1]`` - the same score against the reversed second sequence.

    Args:
        self: Unused by this function.
        gene_seq: Mapping from gene identifier to its sequence string.
        g1: Gene identifiers for the first axis; ``"NULL_GENE"`` entries are
            skipped and leave zeros in their row.
        g2: Gene identifiers for the second axis; ``"NULL_GENE"`` entries are
            skipped and leave zeros in their column.
        n: Side length of the square output matrices.

    Returns:
        A tuple ``(sm, sml)`` of ``numpy`` arrays of shape ``(n, n, 2)``.
        Two all-zero arrays are returned when a gene cannot be looked up in
        ``gene_seq``, when a sequence that takes part in a comparison is the
        empty string, or when any alignment call fails.
    """

    def _zero_matrices():
        # A fresh pair every time so callers never share a buffer.
        return np.zeros((n, n, 2)), np.zeros((n, n, 2))

    # Every real gene must be resolvable before any alignment is attempted.
    for genes in (g1, g2):
        for gene in genes:
            if gene == "NULL_GENE":
                continue
            try:
                gene_seq[gene]
            except Exception:
                # Exception rather than BaseException, so KeyboardInterrupt
                # and SystemExit still reach the caller.
                return _zero_matrices()

    sm = np.zeros((n, n, 2))
    sml = np.zeros((n, n, 2))
    for i in range(n):
        if g1[i] == "NULL_GENE":
            continue
        seq_a = gene_seq[g1[i]]
        if seq_a == "":
            return _zero_matrices()
        for j in range(n):
            if g2[j] == "NULL_GENE":
                continue
            seq_b = gene_seq[g2[j]]
            if seq_b == "":
                return _zero_matrices()
            # Both sequences are non-empty here, so norm_len is at least 1.
            norm_len = max(len(seq_a), len(seq_b))
            try:
                seq_b_reversed = seq_b[::-1]
                result = ed.align(seq_a, seq_b, mode="NW", task="distance")
                sm[i][j][0] = result["editDistance"] / norm_len
                result = ed.align(seq_a,
                                  seq_b_reversed,
                                  mode="NW",
                                  task="distance")
                sm[i][j][1] = result["editDistance"] / norm_len
                dna_a = DNA(seq_a)
                _, score, _ = local_pairwise_align_ssw(dna_a, DNA(seq_b))
                sml[i][j][0] = score / norm_len
                _, score, _ = local_pairwise_align_ssw(dna_a,
                                                       DNA(seq_b_reversed))
                sml[i][j][1] = score / norm_len
            except Exception:
                # Any aligner failure invalidates the whole matrix pair.
                return _zero_matrices()
    return sm, sml
Example #2
0
def _align(a: Protein, b: Protein):
    """Locally align two protein sequences with the BLOSUM50 matrix.

    Thin wrapper around ``local_pairwise_align_ssw`` that fixes the
    substitution matrix to ``blosum50``.

    Args:
        a: First sequence (annotated as ``Protein``).
        b: Second sequence (annotated as ``Protein``).

    Returns:
        Whatever ``local_pairwise_align_ssw`` returns, unchanged.
        NOTE(review): presumably an ``(alignment, score, positions)`` tuple
        on current scikit-bio releases - confirm against the installed
        version.
    """
    alignment_result = local_pairwise_align_ssw(
        a,
        b,
        substitution_matrix=blosum50,
    )
    return alignment_result
Example #3
0
def dnaLocalAlignSsw(seq1, seq2):
    """Locally align two DNA strings and summarise the result as a dict.

    Both inputs are upper-cased before being wrapped in ``DNA`` and passed to
    ``local_pairwise_align_ssw``.

    Args:
        seq1: First nucleotide sequence (any case).
        seq2: Second nucleotide sequence (any case).

    Returns:
        A dict with the keys:

        * ``'seq1'`` - the upper-cased first input.
        * ``'aln1'`` / ``'aln2'`` - the two aligned rows as strings.
        * ``'score'`` - the alignment score reported by the aligner.
        * ``'similarity'`` - relative match frequency between the two aligned
          rows, times 100, rounded to two decimals.
    """
    query = seq1.upper()
    target = seq2.upper()

    alignment, aln_score, _ = local_pairwise_align_ssw(DNA(query), DNA(target))

    row_a = alignment[0]
    row_b = alignment[1]
    percent_match = row_a.match_frequency(row_b, relative=True) * 100

    return {
        'seq1': str(query),
        'aln1': str(row_a),
        'aln2': str(row_b),
        'score': aln_score,
        # Round through string formatting, as the output format expects.
        'similarity': float('{:.2f}'.format(percent_match)),
    }
Example #4
0
def get_primer_positions(primer_seqs, reference_seq):
    """Locate each primer on a reference by local alignment.

    Every primer is aligned to ``reference_seq`` with
    ``local_pairwise_align_ssw`` (striped Smith-Waterman). Primers whose name
    contains ``'RIGHT'`` are reverse-complemented first so that they are
    aligned in the reference's orientation.

    Args:
        primer_seqs: Mapping of primer name to primer sequence string.
        reference_seq: Reference nucleotide sequence string.

    Returns:
        A dict mapping both the start and the end position of each primer's
        alignment on the reference to that primer's name. The end position is
        the last aligned base (inclusive). If two primers share a boundary
        position, the one processed later wins. An empty ``primer_seqs``
        yields an empty dict.
    """
    positions = {}

    for qname, qseq in primer_seqs.items():
        if 'RIGHT' in qname:  # mind the reverse complement
            qseq = str(DNA(qseq).reverse_complement())

        # Only the coordinates on the second sequence (the reference) are
        # needed; the alignment itself and its score are discarded.
        _, _, (_, (pstart, pend)) = local_pairwise_align_ssw(
            DNA(qseq), DNA(reference_seq))

        positions[pstart] = qname
        positions[pend] = qname
    return positions
Example #5
0
def _assembleTwo(seq1, seq2):
    """This only works if two sequences share a significant identical overlap"""
    if len(seq2) <= len(seq1) and re.search(seq2, seq1):
        return seq1
    elif len(seq1) <= len(seq2) and re.search(seq1, seq2):
        return seq2
    else:
        msa = local_pairwise_align_ssw(Protein(seq1),
                                       Protein(seq2),
                                       substitution_matrix=ident)
        if msa[1] >= 8:
            try:
                (s1, e1), (s2, e2) = msa[-1]
            except:
                print(msa)

            if s1 >= s2:
                return seq1 + seq2[e2 + 1:]
            else:
                return seq2 + seq1[e1 + 1:]
            return out
        else:
            print('No significant overlap')
            raise
Example #6
0
def _assembleTwo(seq1, seq2):
    """This only works if two sequences share a significant identical overlap"""
    if len(seq2) <= len(seq1) and re.search(seq2, seq1):
        return seq1
    elif len(seq1) <= len(seq2) and re.search(seq1, seq2):
        return seq2
    else:
        msa = local_pairwise_align_ssw(Protein(seq1),
                                       Protein(seq2),
                                       substitution_matrix=ident)
        if msa[1] >= 8:
            try:
                (s1, e1), (s2, e2) = msa[-1]
            except:
                print(msa)

            if s1 >= s2:
                return seq1 + seq2[e2+1:]
            else:
                return seq2 + seq1[e1+1:]
            return out
        else:
            print('No significant overlap')
            raise
def ssw_similarity(seq_a,seq_b):
    """Return a similarity score for two sequences from their local alignment.

    The sequences are aligned with ``local_pairwise_align_ssw``; the result is
    one minus the distance between the first and second aligned rows.

    NOTE(review): this calls ``.distances()`` directly on the aligner's return
    value, which presumably requires a scikit-bio release where that value is
    an alignment object rather than a tuple - confirm against the installed
    version.
    """
    alignment = local_pairwise_align_ssw(seq_a, seq_b)
    pairwise_distances = alignment.distances()
    return 1 - pairwise_distances[0][1]
def convertCoords(lower_exon_dict, upper_exon_dict, srcRef, altRef, buff=0):
    """Translate gene coordinates from a source reference to an alternate one.

    WORK IN PROGRESS: the body still contains ``pdb.set_trace()`` breakpoints
    and reads several names that are never assigned in this function
    (``Lstart``, ``Lstop``, ``Ustart``, ``Ustop``, ``pstart``, ``pstop``,
    ``lstart``, ``lstop``, ``ustart``, ``ustop``, ``p``, ``geneID``, ``c``).
    Unless those exist as globals it raises ``NameError`` at the first such
    use.

    For each chromosome in ``srcRef`` the function picks the record in
    ``altRef`` whose id matches after stripping, then for every gene on that
    chromosome extracts a "lower" and an "upper" query from the source
    sequence and aligns both against a window of the alternate sequence.

    Args:
        lower_exon_dict: ``{chrom_id: {gene: info}}``; ``info[1]``/``info[2]``
            are used as source slice bounds and ``info[5]``/``info[6]`` as
            alternate slice bounds. The commented-out unpacking below suggests
            an 8-field record - TODO confirm.
        upper_exon_dict: Same layout as ``lower_exon_dict``, for the upper
            exon of each gene.
        srcRef: FASTA path/handle of the source reference.
        altRef: FASTA path/handle of the alternate reference.
        buff: Number of bases added on each side of the alternate window.

    Returns:
        ``newCoords``: dict keyed by gene with a list of new and old
        coordinates.
    """
    newCoords = {}
    for i, chrom in enumerate(SeqIO.parse(
            srcRef, "fasta")):  # Use BIOpython to parse reference fasta.
        # altRef is re-parsed from the start for every source chromosome.
        for j, pchrom in enumerate(SeqIO.parse(altRef, "fasta")):
            # NOTE(review): str.strip removes any of the given characters from
            # both ends, not a literal prefix; e.g. "chr10" becomes "1", so
            # different chromosome ids can collide here.
            Chrom = chrom.id.strip("chr0").strip("chr")
            Pchrom = pchrom.id.strip("chr0").strip("chr")
            if Chrom == Pchrom:
                Alt = pchrom
        # NOTE(review): if no alternate record matched, Alt is unbound (or
        # still holds the match from a previous chromosome).
        for gene, Linfo in lower_exon_dict[chrom.id].items(
        ):  # Loop over all gene coordinates of interest for the current chromosome
            Uinfo = upper_exon_dict[chrom.id][gene]
            # NOTE(review): leftover debugger breakpoint.
            pdb.set_trace()
            # c, lstart, lstop, geneID, p, pstart, pstop, laltID = lower_exon_dict[chrom.id][gene]
            # c, ustart, ustop, geneID, p, pstart, pstop, ualtID =
            lower_query = DNA(
                str(chrom.seq)[int(Linfo[1]):int(Linfo[2])]
            )  # Pull sequence from reference using current gene coordinates
            upper_query = DNA(
                str(chrom.seq)[int(Uinfo[1]):int(Uinfo[2])]
            )  # Pull sequence from reference using current gene coordinates
            # Lower and upper records must agree on their last field.
            assert Linfo[-1] == Uinfo[-1]
            # Window of the alternate sequence, widened by buff on both sides.
            # Unlike the queries, subject stays a plain str.
            subject = str(
                Alt.seq
            )[int(Linfo[5]) - buff:int(Linfo[6]) +
              buff]  # Pull sequence from reference using current gene coordinates
            # Number of non-"N" characters in subject, as a string. Not used
            # again in this function.
            sN = str(
                len(subject) - subject.count("N")
            )  # Count number of N's in the sequence in order to adjust query length that is included in multi_fasta
            lower_alignment = alignment.local_pairwise_align_ssw(
                lower_query, subject)
            upper_alignment = alignment.local_pairwise_align_ssw(
                upper_query, subject)
            # Element [1] of each alignment result is stored as "aln_len";
            # element [2] holds the per-sequence coordinates (query first,
            # subject second).
            # NOTE(review): other callers treat element [1] as the alignment
            # score, not a length - confirm.
            Laln_len = lower_alignment[1]
            Laln_qcoords = lower_alignment[2][0]
            Laln_scoords = lower_alignment[2][1]
            Ualn_len = upper_alignment[1]
            Ualn_qcoords = upper_alignment[2][0]
            Ualn_scoords = upper_alignment[2][1]

            # NOTE(review): Lstart/Lstop/Ustart/Ustop are never assigned;
            # presumably meant to come from the *_scoords values above.
            propL_aln = abs(Lstop - Lstart) / len(lower_query)
            propU_aln = abs(Ustop - Ustart) / len(upper_query)

            ## STOPPED HERE...DEAL with parsing alignment results
            ## HOW TO CONVERT ALIGNMENT RESULTS TO NEW COORDINATES??
            ## DRAW OUT ALL POSSIBLE WAYS IN WHICH THEY COULD ALIGN!!!
            # NOTE(review): leftover debugger breakpoint.
            pdb.set_trace()
            if Ustart > Lstart:  # Same orientation between src and alt
                assert Lstart < Lstop and Ustart < Ustop
                newStart = int(pstart) - buff + Lstart
                newStop = int(pstop) + buff - Ustop
            else:
                assert Lstart > Lstop and Ustart > Ustop
                newStop = int(pstart) - buff + Ustart
                newStart = int(pstop) + buff - Lstart
            # NOTE(review): lstart/ustart/lstop/ustop, p, geneID and c are
            # only defined in the commented-out unpacking above.
            oldStart = min(lstart, ustart)
            oldStop = max(lstop, ustop)
            newCoords[gene] = [
                p, newStart, newStop, geneID, (newStop - newStart), c,
                oldStart, oldStop, oldStop - oldStart
            ]
    return (newCoords)
Example #9
0
def get_meth_profile(args, seg_chrom, seg_start, seg_end, seg_name,
                     seg_strand):
    """Compute a smoothed methylation profile for one segment.

    Steps, in order:

    1. Dump the methylation records overlapping the segment from the
       tabix-indexed file ``args.meth`` to a temporary TSV and load it with
       pandas.
    2. Keep only reads that are also returned by ``get_reads`` (or
       ``exclude_ambiguous_reads`` when ``args.excl_ambig`` is set).
    3. Expand each record into per-CG positions and collect them per read.
    4. Compute windowed methylation fractions (``slide_window``) and smooth
       them (``smooth``).
    5. Align the reference element (``args.teref``) to the segment sequence
       and translate window positions into reference-element coordinates.

    Args:
        args: Options object; reads ``teref``, ``ref``, ``meth``, ``bam``,
            ``excl_ambig``, ``slidingwindowsize``, ``slidingwindowstep``,
            ``smoothwindowsize``, ``globalign`` and ``lenfrac``.
        seg_chrom: Chromosome/contig name of the segment.
        seg_start: Segment start coordinate.
        seg_end: Segment end coordinate.
        seg_name: Segment name; only used in the log message.
        seg_strand: ``'+'`` or ``'-'``.

    Returns:
        ``(revised_coord_meth_pos, meth_profile)``: two parallel lists of
        reference-element positions and smoothed methylation values.
        ``([], [])`` is returned when there are too few windows, when the
        alignment raises ``IndexError``, or when the aligned span on either
        sequence is shorter than ``args.lenfrac`` of that sequence.
    """
    logger.info('profiling %s %s:%d-%d:%s' %
                (seg_name, seg_chrom, seg_start, seg_end, seg_strand))

    te_ref_seq = single_seq_fa(args.teref)
    ref = pysam.Fastafile(args.ref)

    meth_tbx = pysam.Tabixfile(args.meth)

    # Temporary file with a random name, created in the current directory.
    tmp_methdata = str(uuid4()) + '.tmp.methdata.tsv'

    with open(tmp_methdata, 'w') as meth_out:
        # header
        # Only the first line of the gzipped file is copied; it must be the
        # column header.
        with gzip.open(args.meth, 'rt') as _:
            for line in _:
                assert line.startswith('chromosome')
                meth_out.write(line)
                break

        assert seg_chrom in meth_tbx.contigs

        for rec in meth_tbx.fetch(seg_chrom, seg_start, seg_end):
            meth_out.write(str(rec) + '\n')

    # index by read_name
    # presumably column 4 is the read name - verify against the file format
    methdata = pd.read_csv(tmp_methdata, sep='\t', header=0, index_col=4)

    # NOTE(review): not removed if anything above raises.
    os.remove(tmp_methdata)

    reads = []
    if args.excl_ambig:
        reads = exclude_ambiguous_reads(args.bam, seg_chrom, seg_start,
                                        seg_end)
    else:
        reads = get_reads(args.bam, seg_chrom, seg_start, seg_end)

    # Keep only reads that have methylation records; going through a set
    # means the resulting order is not defined.
    reads = list(set(reads).intersection(set(methdata.index)))

    methdata = methdata.loc[reads]

    # read name -> Read object accumulating that read's CpG positions
    seg_reads = {}

    for index, row in methdata.iterrows():
        r_start = row['start']
        r_end = row['end']
        llr = row['log_lik_ratio']
        seq = row['sequence']

        # get per-CG position (nanopolish/calculate_methylation_frequency.py)
        cg_pos = seq.find("CG")
        first_cg_pos = cg_pos

        # One iteration per "CG" occurrence in the record's sequence; every
        # occurrence gets the record's single llr value.
        while cg_pos != -1:
            cg_start = r_start + cg_pos - first_cg_pos
            cg_pos = seq.find("CG", cg_pos + 1)

            # position relative to the segment start
            cg_seg_start = cg_start - seg_start

            if cg_start >= seg_start and cg_start <= seg_end:
                if index not in seg_reads:
                    seg_reads[index] = Read(index, cg_seg_start, llr)
                else:
                    seg_reads[index].add_cpg(cg_seg_start, llr)

    # Flatten to one row per (read, position), keyed by a random id.
    # presumably dd is collections.defaultdict - verify the import
    meth_table = dd(dict)
    # sample name: the bam path with its last dot-separated suffix removed
    sample = '.'.join(args.bam.split('.')[:-1])

    for name, read in seg_reads.items():
        for loc in read.llrs.keys():
            uuid = str(uuid4())
            meth_table[uuid]['loc'] = loc
            meth_table[uuid]['llr'] = read.llrs[loc]
            meth_table[uuid]['read'] = name
            meth_table[uuid]['sample'] = sample
            meth_table[uuid]['call'] = read.meth_calls[loc]

    meth_table = pd.DataFrame.from_dict(meth_table).T
    meth_table['loc'] = pd.to_numeric(meth_table['loc'])
    meth_table['llr'] = pd.to_numeric(meth_table['llr'])

    # Replace 'loc' with its dense rank, keeping the original coordinate in
    # 'orig_loc' so the two can be mapped onto each other.
    meth_table['orig_loc'] = meth_table['loc']
    meth_table['loc'] = ss.rankdata(meth_table['loc'], method='dense')

    # coord_to_cpg is filled here but not read again in this function.
    coord_to_cpg = {}
    cpg_to_coord = {}
    for orig_loc, new_loc in zip(meth_table['orig_loc'], meth_table['loc']):
        coord_to_cpg[orig_loc] = new_loc
        cpg_to_coord[new_loc] = orig_loc

    windowed_methfrac, meth_n = slide_window(meth_table,
                                             sample,
                                             width=int(args.slidingwindowsize),
                                             slide=int(args.slidingwindowstep))

    # Need strictly more windows than the smoothing window length.
    if len(windowed_methfrac) <= int(args.smoothwindowsize):
        logger.warning('too few sites after windowing: %s:%d-%d' %
                       (seg_chrom, seg_start, seg_end))
        return [], []

    smoothed_methfrac = smooth(np.asarray(list(windowed_methfrac.values())),
                               window_len=int(args.smoothwindowsize))

    # Window keys translated back to segment-relative coordinates.
    coord_meth_pos = []

    cpg_meth_pos = list(windowed_methfrac.keys())

    # NOTE(review): for a strand value other than '+' or '-' nothing is
    # appended, leaving coord_meth_pos empty.
    for cpg in cpg_meth_pos:
        if seg_strand == '+':
            coord_meth_pos.append(cpg_to_coord[cpg])
        if seg_strand == '-':
            # mirror the position within the segment for the minus strand
            coord_meth_pos.append((seg_end - seg_start) - cpg_to_coord[cpg])

    # alignment to ref elt

    elt_seq = ref.fetch(seg_chrom, seg_start, seg_end)
    if seg_strand == '-':
        elt_seq = rc(elt_seq)

    te_ref_seq = te_ref_seq.upper()
    elt_seq = elt_seq.upper()

    s_ref = skseq.DNA(te_ref_seq)
    s_elt = skseq.DNA(elt_seq)

    aln_res = []

    try:
        if args.globalign:
            aln_res = skalign.global_pairwise_align_nucleotide(s_ref, s_elt)
        else:
            aln_res = skalign.local_pairwise_align_ssw(s_ref, s_elt)
    except IndexError:  # scikit-bio throws this if no bases align  >:|
        logger.warning('no align on seg: %s:%d-%d' %
                       (seg_chrom, seg_start, seg_end))
        return [], []

    # aln_res[2]: (start, end) pairs for the reference element and the segment
    coord_ref, coord_elt = aln_res[2]

    # Aligned span computed as end - start.
    # NOTE(review): if the end coordinates are inclusive this is one short -
    # confirm.
    len_ref = coord_ref[1] - coord_ref[0]
    len_elt = coord_elt[1] - coord_elt[0]

    if len_ref / len(te_ref_seq) < float(args.lenfrac):
        logger.warning(
            'ref align too short on seg: %s:%d-%d (%f)' %
            (seg_chrom, seg_start, seg_end, len_ref / len(te_ref_seq)))
        return [], []

    if len_elt / len(elt_seq) < float(args.lenfrac):
        logger.warning('elt align too short on seg: %s:%d-%d (%f)' %
                       (seg_chrom, seg_start, seg_end, len_elt / len(elt_seq)))
        return [], []

    tab_msa = aln_res[0]

    # segment position -> reference-element position, or 'na' where the
    # reference row has a gap
    elt_to_ref_coords = {}

    pos_ref = coord_ref[0]
    pos_elt = coord_elt[0]

    # Walk the alignment column by column, advancing each sequence's cursor
    # only when that sequence has a base (not a gap) in the column.
    # NOTE(review): the gap tests rely on the column elements comparing equal
    # to the string '-' - confirm for the installed scikit-bio version.
    for pos in tab_msa.iter_positions():
        pos = list(pos)
        b_ref = pos[0]
        b_elt = pos[1]

        if '-' not in pos:
            elt_to_ref_coords[pos_elt] = pos_ref
            pos_ref += 1
            pos_elt += 1

        if b_elt == '-':
            pos_ref += 1

        if b_ref == '-':
            elt_to_ref_coords[pos_elt] = 'na'
            pos_elt += 1

    revised_coord_meth_pos = []
    meth_profile = []

    # Keep only windows whose position falls inside the aligned region and
    # maps to a real reference base.
    for pos, meth in zip(coord_meth_pos, smoothed_methfrac):
        if pos not in elt_to_ref_coords:
            continue

        revised_pos = elt_to_ref_coords[pos]

        if revised_pos != 'na':
            revised_coord_meth_pos.append(revised_pos)
            meth_profile.append(meth)

    return revised_coord_meth_pos, meth_profile