def create_synteny_matrix_mul(self, gene_seq, g1, g2, n):
    """Build normalised global and local alignment matrices for two gene lists.

    For every pair ``(g1[i], g2[j])`` of non-placeholder genes two values are
    stored per matrix: index 0 compares the sequences as given, index 1
    compares the first sequence against the *reversed* (``[::-1]``) second
    sequence.  Every value is divided by the length of the longer sequence.

    Args:
        gene_seq: mapping from gene name to its sequence string.
        g1: first gene list; entries equal to ``"NULL_GENE"`` are skipped.
        g2: second gene list; entries equal to ``"NULL_GENE"`` are skipped.
        n: number of positions read from ``g1`` and ``g2``; also the matrix
            side length.

    Returns:
        ``(sm, sml)``: two ``(n, n, 2)`` float arrays.  ``sm`` holds the
        ``editDistance`` reported by ``ed.align(..., mode="NW")`` and ``sml``
        the score returned by ``local_pairwise_align_ssw``.  Two all-zero
        arrays are returned instead when a gene cannot be looked up, when a
        visited sequence is the empty string, or when an alignment call
        fails.
    """

    def _zeros():
        # Fresh arrays on every call so callers never share a buffer.
        return np.zeros((n, n, 2)), np.zeros((n, n, 2))

    # Every real gene must be retrievable before any alignment is attempted.
    for genes in (g1, g2):
        for gene in genes:
            if gene == "NULL_GENE":
                continue
            try:
                gene_seq[gene]
            except Exception:
                # Exception rather than BaseException, so Ctrl-C and
                # SystemExit still propagate to the caller.
                return _zeros()

    sm = np.zeros((n, n, 2))
    sml = np.zeros((n, n, 2))
    for i in range(n):
        if g1[i] == "NULL_GENE":
            continue
        seq_i = gene_seq[g1[i]]
        if seq_i == "":
            return _zeros()
        for j in range(n):
            if g2[j] == "NULL_GENE":
                continue
            seq_j = gene_seq[g2[j]]
            if seq_j == "":
                return _zeros()
            norm_len = max(len(seq_i), len(seq_j))
            try:
                seq_j_rev = seq_j[::-1]

                # Global (Needleman-Wunsch mode) edit distance.
                result = ed.align(seq_i, seq_j, mode="NW", task="distance")
                sm[i][j][0] = result["editDistance"] / norm_len
                result = ed.align(seq_i, seq_j_rev, mode="NW",
                                  task="distance")
                sm[i][j][1] = result["editDistance"] / norm_len

                # Local (striped Smith-Waterman) alignment score.
                _, score, _ = local_pairwise_align_ssw(DNA(seq_i), DNA(seq_j))
                sml[i][j][0] = score / norm_len
                _, score, _ = local_pairwise_align_ssw(DNA(seq_i),
                                                       DNA(seq_j_rev))
                sml[i][j][1] = score / norm_len
            except Exception:
                # Any aligner failure invalidates the whole pair of matrices.
                return _zeros()
    return sm, sml
def _align(a: Protein, b: Protein):
    """Locally align two protein sequences with skbio's SSW aligner.

    The call is forwarded to ``local_pairwise_align_ssw`` with ``blosum50``
    as the substitution matrix.

    Args:
        a (Protein): first sequence
        b (Protein): second sequence

    Returns:
        The value returned by ``local_pairwise_align_ssw``, unchanged.
        NOTE(review): recent scikit-bio releases document that value as a
        ``(TabularMSA, score, start_end_positions)`` tuple rather than a bare
        alignment table; confirm against the installed version.
    """
    alignment_result = local_pairwise_align_ssw(
        a, b, substitution_matrix=blosum50)
    return alignment_result
def dnaLocalAlignSsw(seq1, seq2):
    """Locally align two DNA strings and summarise the result as a dict.

    Both inputs are upper-cased before being wrapped in ``DNA`` and passed to
    ``local_pairwise_align_ssw``.

    Args:
        seq1: first DNA sequence (string).
        seq2: second DNA sequence (string).

    Returns:
        dict with keys:
            ``seq1``: the upper-cased first input,
            ``aln1`` / ``aln2``: the two aligned rows as strings,
            ``score``: the alignment score returned by the aligner,
            ``similarity``: relative match frequency between the two aligned
            rows, as a percentage rounded to two decimals.
    """
    query = seq1.upper()
    target = seq2.upper()

    msa, score, _ = local_pairwise_align_ssw(DNA(query), DNA(target))
    aligned_query = msa[0]
    aligned_target = msa[1]

    percent_match = aligned_query.match_frequency(
        aligned_target, relative=True) * 100

    return {
        'seq1': str(query),
        'aln1': str(aligned_query),
        'aln2': str(aligned_target),
        'score': score,
        'similarity': float('{:.2f}'.format(percent_match)),
    }
def get_primer_positions(primer_seqs, reference_seq):
    """Locate each primer on the reference and index its end points by name.

    Every primer is aligned to the reference with striped Smith-Waterman
    (``local_pairwise_align_ssw``).  Primers whose name contains ``'RIGHT'``
    are reverse-complemented first so that they align to the same strand as
    the reference.

    Args:
        primer_seqs: mapping of primer name -> primer sequence (string).
        reference_seq: reference sequence (string).

    Returns:
        dict mapping a reference position to a primer name.  Each primer
        contributes two keys: the start and the end of its aligned span on
        the reference (the end is inclusive, as reported by the aligner).
        If two primers share an end point the later one in iteration order
        overwrites the earlier one.
    """
    # The reference is the same for every primer, so wrap it only once.
    reference = DNA(reference_seq)

    positions = {}
    for qname, qseq in primer_seqs.items():
        if 'RIGHT' in qname:
            # mind the reverse complement
            qseq = str(DNA(qseq).reverse_complement())

        # Third element of the result is [(query span), (reference span)];
        # only the reference span is needed here.
        _, _, (_, ref_span) = local_pairwise_align_ssw(DNA(qseq), reference)
        pstart, pend = ref_span

        positions[pstart] = qname
        positions[pend] = qname
    return positions
def _assembleTwo(seq1, seq2): """This only works if two sequences share a significant identical overlap""" if len(seq2) <= len(seq1) and re.search(seq2, seq1): return seq1 elif len(seq1) <= len(seq2) and re.search(seq1, seq2): return seq2 else: msa = local_pairwise_align_ssw(Protein(seq1), Protein(seq2), substitution_matrix=ident) if msa[1] >= 8: try: (s1, e1), (s2, e2) = msa[-1] except: print(msa) if s1 >= s2: return seq1 + seq2[e2 + 1:] else: return seq2 + seq1[e1 + 1:] return out else: print('No significant overlap') raise
def _assembleTwo(seq1, seq2): """This only works if two sequences share a significant identical overlap""" if len(seq2) <= len(seq1) and re.search(seq2, seq1): return seq1 elif len(seq1) <= len(seq2) and re.search(seq1, seq2): return seq2 else: msa = local_pairwise_align_ssw(Protein(seq1), Protein(seq2), substitution_matrix=ident) if msa[1] >= 8: try: (s1, e1), (s2, e2) = msa[-1] except: print(msa) if s1 >= s2: return seq1 + seq2[e2+1:] else: return seq2 + seq1[e1+1:] return out else: print('No significant overlap') raise
def ssw_similarity(seq_a, seq_b):
    """Return one minus the pairwise distance of two SSW-aligned sequences.

    The sequences are locally aligned with ``local_pairwise_align_ssw``; the
    distance between row 0 and row 1 of the alignment's ``distances()``
    result is then subtracted from 1.

    Args:
        seq_a: first sequence, passed to the aligner as is.
        seq_b: second sequence, passed to the aligner as is.

    Returns:
        ``1 - distance`` for the two aligned rows.
    """
    aligned = local_pairwise_align_ssw(seq_a, seq_b)
    # NOTE(review): this relies on the aligner returning an object with a
    # ``distances()`` method. Newer scikit-bio releases document a
    # ``(msa, score, positions)`` tuple instead -- confirm the pinned version.
    pairwise = aligned.distances()
    return 1 - pairwise[0][1]
def convertCoords(lower_exon_dict, upper_exon_dict, srcRef, altRef, buff=0):
    """Unfinished: map gene coordinates from a source to an alternate reference.

    For each chromosome in ``srcRef`` the same-named chromosome in ``altRef``
    is looked up, the lower and upper exon sequences of every gene are cut
    from the source chromosome, and both are locally aligned against a window
    of the alternate chromosome.

    As written the function cannot run to completion: it enters ``pdb`` twice
    and then reads names that are never assigned in this function
    (``Lstart``, ``Lstop``, ``Ustart``, ``Ustop``, ``pstart``, ``pstop``,
    ``lstart``, ``lstop``, ``ustart``, ``ustop``, ``p``, ``geneID``, ``c``).

    Args:
        lower_exon_dict: ``{chrom_id: {gene: record}}``; items 1 and 2 of a
            record are used as slice bounds on the source chromosome, items 5
            and 6 as slice bounds on the alternate chromosome.
        upper_exon_dict: same layout, looked up with the same chromosome id
            and gene; its last item must equal the lower record's last item.
        srcRef: FASTA input handed to ``SeqIO.parse`` for the source reference.
        altRef: FASTA input handed to ``SeqIO.parse`` for the alternate
            reference.
        buff: number of extra positions taken on each side of the alternate
            window.

    Returns:
        ``newCoords``: dict keyed by gene.
    """
    # NOTE(review): block nesting below was reconstructed; in particular,
    # confirm that the per-gene loop belongs inside ``if Chrom == Pchrom``.
    newCoords = {}
    # Use Biopython to parse the source reference FASTA.
    for i, chrom in enumerate(SeqIO.parse(srcRef, "fasta")):
        for j, pchrom in enumerate(SeqIO.parse(altRef, "fasta")):
            # NOTE(review): str.strip removes a *set of characters* from both
            # ends, not a prefix, so e.g. "chr10" becomes "1" here and would
            # compare equal to "chr1".
            Chrom = chrom.id.strip("chr0").strip("chr")
            Pchrom = pchrom.id.strip("chr0").strip("chr")
            if Chrom == Pchrom:
                Alt = pchrom
                # Loop over all gene coordinates of interest for the current
                # chromosome.
                for gene, Linfo in lower_exon_dict[chrom.id].items():
                    Uinfo = upper_exon_dict[chrom.id][gene]
                    # Debugger breakpoint left in by the author.
                    pdb.set_trace()
                    # Commented-out unpacking kept because it is the only hint
                    # at the intended record layout:
                    # c, lstart, lstop, geneID, p, pstart, pstop, laltID = lower_exon_dict[chrom.id][gene]
                    # c, ustart, ustop, geneID, p, pstart, pstop, ualtID =
                    # Pull the lower-exon sequence from the source chromosome
                    # using the current gene coordinates.
                    lower_query = DNA(
                        str(chrom.seq)[int(Linfo[1]):int(Linfo[2])])
                    # Pull the upper-exon sequence from the source chromosome
                    # using the current gene coordinates.
                    upper_query = DNA(
                        str(chrom.seq)[int(Uinfo[1]):int(Uinfo[2])])
                    assert Linfo[-1] == Uinfo[-1]
                    # Window of the alternate chromosome, widened by ``buff``
                    # on each side. Note it stays a plain ``str`` while the
                    # queries are wrapped in ``DNA``.
                    subject = str(
                        Alt.seq)[int(Linfo[5]) - buff:int(Linfo[6]) + buff]
                    # Number of non-"N" characters in the window, as a string;
                    # not used again in this function.
                    sN = str(len(subject) - subject.count("N"))
                    lower_alignment = alignment.local_pairwise_align_ssw(
                        lower_query, subject)
                    upper_alignment = alignment.local_pairwise_align_ssw(
                        upper_query, subject)
                    # NOTE(review): scikit-bio documents item 1 of the result
                    # as the alignment score, not a length -- confirm.
                    Laln_len = lower_alignment[1]
                    Laln_qcoords = lower_alignment[2][0]
                    Laln_scoords = lower_alignment[2][1]
                    Ualn_len = upper_alignment[1]
                    Ualn_qcoords = upper_alignment[2][0]
                    Ualn_scoords = upper_alignment[2][1]
                    # Lstart/Lstop/Ustart/Ustop are not assigned anywhere in
                    # this function; presumably meant to come from the
                    # *_scoords values above -- TODO confirm.
                    propL_aln = abs(Lstop - Lstart) / len(lower_query)
                    propU_aln = abs(Ustop - Ustart) / len(upper_query)
                    ## STOPPED HERE...DEAL with parsing alignment results
                    ## HOW TO CONVERT ALIGNMENT RESULTS TO NEW COORDINATES??
                    ## DRAW OUT ALL POSSIBLE WAYS IN WHICH THEY COULD ALIGN!!!
                    # Second debugger breakpoint left in by the author.
                    pdb.set_trace()
                    if Ustart > Lstart:  # Same orientation between src and alt
                        assert Lstart < Lstop and Ustart < Ustop
                        newStart = int(pstart) - buff + Lstart
                        newStop = int(pstop) + buff - Ustop
                    else:
                        assert Lstart > Lstop and Ustart > Ustop
                        newStop = int(pstart) - buff + Ustart
                        newStart = int(pstop) + buff - Lstart
                    oldStart = min(lstart, ustart)
                    oldStop = max(lstop, ustop)
                    # Record layout: p, new start, new stop, geneID, new
                    # span, c, old start, old stop, old span.
                    newCoords[gene] = [
                        p, newStart, newStop, geneID, (newStop - newStart), c,
                        oldStart, oldStop, oldStop - oldStart
                    ]
    return (newCoords)
def get_meth_profile(args, seg_chrom, seg_start, seg_end, seg_name, seg_strand):
    """Project a segment's smoothed methylation profile onto the reference element.

    Steps:
      1. Pull the methylation calls overlapping the segment from the tabix
         file ``args.meth`` into a temporary TSV and load it with pandas,
         indexed by read name.
      2. Keep only reads also returned by ``get_reads`` (or
         ``exclude_ambiguous_reads`` when ``args.excl_ambig`` is set) and
         collect per-CpG log-likelihood ratios inside the segment.
      3. Dense-rank the CpG positions, compute a sliding-window methylation
         fraction (``slide_window``) and smooth it (``smooth``).
      4. Align the segment sequence to the reference element ``args.teref``
         (global alignment if ``args.globalign``, otherwise striped
         Smith-Waterman) and translate segment coordinates into reference
         element coordinates.

    Args:
        args: options object. Attributes read: ``teref``, ``ref``, ``meth``,
            ``bam``, ``excl_ambig``, ``slidingwindowsize``,
            ``slidingwindowstep``, ``smoothwindowsize``, ``globalign``,
            ``lenfrac``.
        seg_chrom: chromosome / contig name of the segment.
        seg_start: segment start coordinate (int).
        seg_end: segment end coordinate (int).
        seg_name: segment label, used for logging only.
        seg_strand: ``'+'`` or ``'-'``; for ``'-'`` the segment sequence is
            passed through ``rc`` and positions are flipped within the
            segment.

    Returns:
        ``(positions, profile)``: two parallel lists holding reference
        element coordinates and the smoothed methylation values at those
        coordinates.  ``([], [])`` is returned when there are no usable CpG
        calls, too few windows, no alignment, or when the aligned span covers
        less than ``args.lenfrac`` of either sequence.
    """
    logger.info('profiling %s %s:%d-%d:%s' % (seg_name, seg_chrom, seg_start, seg_end, seg_strand))

    te_ref_seq = single_seq_fa(args.teref)
    ref = pysam.Fastafile(args.ref)
    meth_tbx = pysam.Tabixfile(args.meth)

    tmp_methdata = str(uuid4()) + '.tmp.methdata.tsv'

    try:
        with open(tmp_methdata, 'w') as meth_out:
            # header: copy only the first line of the gzipped table
            with gzip.open(args.meth, 'rt') as meth_in:
                for line in meth_in:
                    assert line.startswith('chromosome')
                    meth_out.write(line)
                    break

            assert seg_chrom in meth_tbx.contigs

            for rec in meth_tbx.fetch(seg_chrom, seg_start, seg_end):
                meth_out.write(str(rec) + '\n')

        # index by read_name
        methdata = pd.read_csv(tmp_methdata, sep='\t', header=0, index_col=4)
    finally:
        # Remove the scratch file even when the header check, the tabix fetch
        # or the CSV parse fails, so failed runs do not litter the cwd.
        if os.path.exists(tmp_methdata):
            os.remove(tmp_methdata)

    if args.excl_ambig:
        reads = exclude_ambiguous_reads(args.bam, seg_chrom, seg_start, seg_end)
    else:
        reads = get_reads(args.bam, seg_chrom, seg_start, seg_end)

    # Only reads that have methylation calls can contribute.
    reads = list(set(reads).intersection(set(methdata.index)))
    methdata = methdata.loc[reads]

    seg_reads = {}

    for index, row in methdata.iterrows():
        r_start = row['start']
        llr = row['log_lik_ratio']
        seq = row['sequence']

        # get per-CG position (nanopolish/calculate_methylation_frequency.py)
        cg_pos = seq.find("CG")
        first_cg_pos = cg_pos
        while cg_pos != -1:
            cg_start = r_start + cg_pos - first_cg_pos
            cg_pos = seq.find("CG", cg_pos + 1)

            # Position of this CpG relative to the segment start.
            cg_seg_start = cg_start - seg_start

            if seg_start <= cg_start <= seg_end:
                if index not in seg_reads:
                    seg_reads[index] = Read(index, cg_seg_start, llr)
                else:
                    seg_reads[index].add_cpg(cg_seg_start, llr)

    meth_table = dd(dict)
    sample = '.'.join(args.bam.split('.')[:-1])

    for name, read in seg_reads.items():
        for loc in read.llrs.keys():
            row_id = str(uuid4())
            meth_table[row_id]['loc'] = loc
            meth_table[row_id]['llr'] = read.llrs[loc]
            meth_table[row_id]['read'] = name
            meth_table[row_id]['sample'] = sample
            meth_table[row_id]['call'] = read.meth_calls[loc]

    if not meth_table:
        # An empty table has no 'loc' column and would fail with a KeyError
        # below; report it like the other "nothing to profile" cases.
        logger.warning('no CpG calls on seg: %s:%d-%d' % (seg_chrom, seg_start, seg_end))
        return [], []

    meth_table = pd.DataFrame.from_dict(meth_table).T
    meth_table['loc'] = pd.to_numeric(meth_table['loc'])
    meth_table['llr'] = pd.to_numeric(meth_table['llr'])

    # Replace coordinates by their dense rank so that windows are defined in
    # units of CpG sites; keep the original coordinate for mapping back.
    meth_table['orig_loc'] = meth_table['loc']
    meth_table['loc'] = ss.rankdata(meth_table['loc'], method='dense')

    cpg_to_coord = {}
    for orig_loc, new_loc in zip(meth_table['orig_loc'], meth_table['loc']):
        cpg_to_coord[new_loc] = orig_loc

    windowed_methfrac, _ = slide_window(meth_table, sample,
                                        width=int(args.slidingwindowsize),
                                        slide=int(args.slidingwindowstep))

    if len(windowed_methfrac) <= int(args.smoothwindowsize):
        logger.warning('too few sites after windowing: %s:%d-%d' % (seg_chrom, seg_start, seg_end))
        return [], []

    smoothed_methfrac = smooth(np.asarray(list(windowed_methfrac.values())),
                               window_len=int(args.smoothwindowsize))

    coord_meth_pos = []
    cpg_meth_pos = list(windowed_methfrac.keys())

    for cpg in cpg_meth_pos:
        if seg_strand == '+':
            coord_meth_pos.append(cpg_to_coord[cpg])
        if seg_strand == '-':
            # Flip the position within the segment for minus-strand elements.
            coord_meth_pos.append((seg_end - seg_start) - cpg_to_coord[cpg])

    # alignment to ref elt
    elt_seq = ref.fetch(seg_chrom, seg_start, seg_end)

    if seg_strand == '-':
        elt_seq = rc(elt_seq)

    te_ref_seq = te_ref_seq.upper()
    elt_seq = elt_seq.upper()

    s_ref = skseq.DNA(te_ref_seq)
    s_elt = skseq.DNA(elt_seq)

    try:
        if args.globalign:
            aln_res = skalign.global_pairwise_align_nucleotide(s_ref, s_elt)
        else:
            aln_res = skalign.local_pairwise_align_ssw(s_ref, s_elt)
    except IndexError:  # scikit-bio throws this if no bases align >:|
        logger.warning('no align on seg: %s:%d-%d' % (seg_chrom, seg_start, seg_end))
        return [], []

    # aln_res[2] holds the (start, end) span of the alignment on each input.
    coord_ref, coord_elt = aln_res[2]

    len_ref = coord_ref[1] - coord_ref[0]
    len_elt = coord_elt[1] - coord_elt[0]

    if len_ref / len(te_ref_seq) < float(args.lenfrac):
        logger.warning(
            'ref align too short on seg: %s:%d-%d (%f)' %
            (seg_chrom, seg_start, seg_end, len_ref / len(te_ref_seq)))
        return [], []

    if len_elt / len(elt_seq) < float(args.lenfrac):
        logger.warning('elt align too short on seg: %s:%d-%d (%f)' %
                       (seg_chrom, seg_start, seg_end, len_elt / len(elt_seq)))
        return [], []

    tab_msa = aln_res[0]

    # Walk the alignment column by column, advancing a cursor on each
    # sequence, to build a segment-coordinate -> reference-coordinate map.
    elt_to_ref_coords = {}

    pos_ref = coord_ref[0]
    pos_elt = coord_elt[0]

    for pos in tab_msa.iter_positions():
        pos = list(pos)
        b_ref = pos[0]
        b_elt = pos[1]

        if '-' not in pos:
            # Aligned column: both cursors advance.
            elt_to_ref_coords[pos_elt] = pos_ref
            pos_ref += 1
            pos_elt += 1

        if b_elt == '-':
            # Gap in the segment: only the reference cursor advances.
            pos_ref += 1

        if b_ref == '-':
            # Gap in the reference: this segment base has no counterpart.
            elt_to_ref_coords[pos_elt] = 'na'
            pos_elt += 1

    revised_coord_meth_pos = []
    meth_profile = []

    for pos, meth in zip(coord_meth_pos, smoothed_methfrac):
        if pos not in elt_to_ref_coords:
            # Position lies outside the aligned region.
            continue

        revised_pos = elt_to_ref_coords[pos]

        if revised_pos != 'na':
            revised_coord_meth_pos.append(revised_pos)
            meth_profile.append(meth)

    return revised_coord_meth_pos, meth_profile