def setUp(self):
    """
    Create the fixtures shared by the tests: a Sequence built from "ACTG",
    a label string, and a PrimerDimer built from that sequence, the
    sequence of "TGCA" and the value 2.
    """
    self.seq = seqinf.Sequence("ACTG")
    self.name = "seq name not seq"
    own_bases = self.seq.sequence
    partner_bases = seqinf.Sequence("TGCA").sequence
    self.primer_dimer = seqinf.PrimerDimer(own_bases, partner_bases, 2)
def test_format_comp_align(self):
    """
    Check that every alignment produced by pd_local() for "AAAA" against
    the complement of "TTTT" is rendered with "TTTT" in the formatted text.
    """
    forward = seqinf.Sequence("AAAA").sequence
    partner = seqinf.Sequence("TTTT").complement()
    dimer = seqinf.PrimerDimer(forward, partner, 4)
    hits = dimer.pd_local()
    for hit in hits:
        # The formatter takes the first five fields of each hit, in order.
        rendered = seqinf.PrimerDimer.format_alignment_compl(
            hit[0], hit[1], hit[2], hit[3], hit[4])
        self.assertTrue("TTTT" in rendered)
def primer_dimer_local(aln_len_list, names, seq1, seq2):
    """
    Yield formatted local alignments between forward and reverse primers.

    names, seq1 and aln_len_list are walked in lockstep; each forward
    primer is then tested against every entry of seq2.

    Args:
        aln_len_list (iterable): one value per forward primer, passed as the
            third argument to seqinf.PrimerDimer (presumably the minimum
            alignment length/score -- confirm against seqinf).
        names (iterable): sequence IDs, one per forward primer.
        seq1 (iterable): forward primer sequences.
        seq2 (iterable): reverse primer sequences; each one is passed
            through seqinf.Sequence(...).complement() before alignment.

    Yields:
        (str, formatted alignment): the name followed by a newline, and the
        output of seqinf.PrimerDimer.format_alignment_compl for each item
        returned by pd_local().
    """
    for label, forward, aln_len in zip(names, seq1, aln_len_list):
        for reverse in seq2:
            reverse_comp = seqinf.Sequence(reverse).complement()
            dimer = seqinf.PrimerDimer(forward, reverse_comp, aln_len)
            for hit in dimer.pd_local():
                heading = label + "\n"
                yield (heading,
                       seqinf.PrimerDimer.format_alignment_compl(
                           hit[0], hit[1], hit[2], hit[3], hit[4]))
def test_handles_lowercase(self):
    """
    A Sequence built from lowercase 'actg' must expose the same
    .sequence value as the fixture sequence.
    """
    expected = self.seq.sequence
    from_lowercase = seqinf.Sequence('actg').sequence
    self.assertEqual(expected, from_lowercase)
def gc_percent_seqs(seqs_list):
    """
    Compute GC content with seqinf for sequences output from isPCR.

    Every element of seqs_list is itself iterated; each item it yields is
    upper-cased and scored with seqinf.Sequence(item).gc_percent().

    NOTE(review): if seqs_list is a flat list of strings, the items scored
    are single characters, not whole sequences -- confirm the caller
    passes a list of sequence groups.

    Args:
        seqs_list (list): iterable of iterables of strings.

    Returns:
        gc_list (list): one gc_percent() result per inner item, in order.
    """
    # All upper-casing happens before any GC computation.
    groups = [[item.upper() for item in group] for group in seqs_list]
    return [seqinf.Sequence(item).gc_percent()
            for group in groups
            for item in group]
def calc_gc(sliced_seq):
    """
    Swap each product sequence for its GC percentage.

    Args:
        sliced_seq (list): (header, product_len, product_seq) triples.

    Returns:
        sample_gc (list): (header, product_len, product GC%) triples, where
        GC% is seqinf.Sequence(product_seq).gc_percent().
    """
    return [
        (header, product_len, seqinf.Sequence(product_seq).gc_percent())
        for header, product_len, product_seq in sliced_seq
    ]
def flanking_region_fasta_insertion(genome, dataframe, flanking_region_size):
    """
    Build breakpoint-spanning sequences for insertion events.

    Each dataframe row can produce two records:

        >Sample_Gene_ChrNorm:PosNorm1-PosNorm2__BP1
        flank ending at PosIns1 (on ChrIns) + flank starting at PosNorm1
        >Sample_Gene_ChrNorm:PosNorm1-PosNorm2__BP2
        flank ending at PosNorm2 + flank starting at PosIns2 (on ChrIns)

    For '-' strands the flank is read right-to-left and passed through
    seqinf.Sequence(...).complement().

    The normal and insertion flanks are paired by dataframe row, so rows
    whose ChrNorm and ChrIns differ still get their own two halves. A row
    is skipped when either half is unavailable (chromosome absent from
    genome, or strand is neither '+' nor '-'). Flanks that would start
    before index 0 are truncated at the start of the chromosome.

    Args:
        genome (list): genome list of tuples (header, seq)
        dataframe (pandas object): needs columns Gene, Sample, ChrNorm,
            PosNorm1, PosNorm2, StrandN, ChrIns, PosIns1, PosIns2, StrandI.
        flanking_region_size (int): length of each flank.

    Returns:
        output (list): (header, upper-cased sequence) tuples; all BP1
        records first, then all BP2 records.
    """
    size = int(flanking_region_size)
    # Normal-side flanks keep discovery order (genome order, then row order);
    # insertion-side flanks are looked up by row index when joining.
    norm_bp1, norm_bp2 = [], []
    ins_bp1, ins_bp2 = {}, {}

    for chrom_name, chrom_seq in genome:
        chrm = str(chrom_name)
        seq = str(chrom_seq)
        rows = zip(dataframe.Gene, dataframe.Sample, dataframe.ChrNorm,
                   dataframe.PosNorm1, dataframe.PosNorm2, dataframe.StrandN,
                   dataframe.ChrIns, dataframe.PosIns1, dataframe.PosIns2,
                   dataframe.StrandI)
        for row, (gene, sample, chrn, startn, stopn, strandn,
                  chri, starti, stopi, strandi) in enumerate(rows):
            if str(chrn) == chrm and strandn in ('+', '-'):
                label = "%s_%s_%s:%s-%s" % (sample, gene, chrn, startn, stopn)
                norm_bp1.append((row, label + "__BP1",
                                 _flank_after(seq, int(startn), size, strandn)))
                norm_bp2.append((row, label + "__BP2",
                                 _flank_before(seq, int(stopn), size, strandn)))
            if str(chri) == chrm and strandi in ('+', '-'):
                ins_bp1[row] = _flank_before(seq, int(starti), size, strandi)
                ins_bp2[row] = _flank_after(seq, int(stopi), size, strandi)

    output = []
    for row, header, norm_seq in norm_bp1:
        if row in ins_bp1:
            output.append((header, (ins_bp1[row] + norm_seq).upper()))
    for row, header, norm_seq in norm_bp2:
        if row in ins_bp2:
            output.append((header, (norm_seq + ins_bp2[row]).upper()))
    return output


def _flank_after(seq, pos, size, strand):
    """Return the flank that begins at pos ('+': seq[pos:pos+size])."""
    if strand == '+':
        return seq[pos:pos + size]
    stop = pos - size
    # A negative stop would be read as an offset from the end of the string;
    # None walks back to index 0 instead.
    reverse_walk = seq[pos:(stop if stop >= 0 else None):-1]
    return seqinf.Sequence(reverse_walk).complement()


def _flank_before(seq, pos, size, strand):
    """Return the flank that ends at pos ('+': seq[pos-size:pos])."""
    if strand == '+':
        # Clamp so a position near the chromosome start does not wrap around.
        return seq[max(pos - size, 0):pos]
    return seqinf.Sequence(seq[pos + size:pos:-1]).complement()
def flanking_region_fasta_translocation(genome, dataframe, flanking_region_size):
    """
    Build breakpoint-spanning sequences for translocation events.

    Each dataframe row can produce one record:

        >Sample_Gene_ChrNorm:PosNorm-PosTrans
        flank ending at PosTrans (on ChrTrans) + flank starting at PosNorm

    For '-' strands the flank is read right-to-left and passed through
    seqinf.Sequence(...).complement().

    The two halves are paired by dataframe row, so rows whose ChrNorm and
    ChrTrans differ still get their own flanks. A row is skipped when
    either half is unavailable (chromosome absent from genome, or strand
    is neither '+' nor '-'). Flanks that would start before index 0 are
    truncated at the start of the chromosome.

    Args:
        genome (list): genome list of tuples (header, seq)
        dataframe (pd.DataFrame): needs columns Gene, Sample, ChrNorm,
            PosNorm, StrandN, ChrTrans, PosTrans, StrandT.
        flanking_region_size (int): length of sequence to pad position with.

    Returns:
        output (list): (header, upper-cased sequence) tuples.
    """
    size = int(flanking_region_size)
    # Normal-side flanks keep discovery order (genome order, then row order);
    # translocated-side flanks are looked up by row index when joining.
    norm_flanks = []
    trans_flanks = {}

    for chrom_name, chrom_seq in genome:
        chrm = str(chrom_name)
        seq = str(chrom_seq)
        rows = zip(dataframe.Gene, dataframe.Sample, dataframe.ChrNorm,
                   dataframe.PosNorm, dataframe.StrandN, dataframe.ChrTrans,
                   dataframe.PosTrans, dataframe.StrandT)
        for row, (gene, sample, chrn, posn, strandn,
                  chrt, post, strandt) in enumerate(rows):
            if str(chrn) == chrm and strandn in ('+', '-'):
                header = "%s_%s_%s:%s-%s" % (sample, gene, chrn, posn, post)
                norm_flanks.append(
                    (row, header, _flank_after(seq, int(posn), size, strandn)))
            if str(chrt) == chrm and strandt in ('+', '-'):
                trans_flanks[row] = _flank_before(seq, int(post), size, strandt)

    output = []
    for row, header, norm_seq in norm_flanks:
        if row in trans_flanks:
            output.append((header, (trans_flanks[row] + norm_seq).upper()))
    return output


def _flank_after(seq, pos, size, strand):
    """Return the flank that begins at pos ('+': seq[pos:pos+size])."""
    if strand == '+':
        return seq[pos:pos + size]
    stop = pos - size
    # A negative stop would be read as an offset from the end of the string;
    # None walks back to index 0 instead.
    reverse_walk = seq[pos:(stop if stop >= 0 else None):-1]
    return seqinf.Sequence(reverse_walk).complement()


def _flank_before(seq, pos, size, strand):
    """Return the flank that ends at pos ('+': seq[pos-size:pos])."""
    if strand == '+':
        # Clamp so a position near the chromosome start does not wrap around.
        return seq[max(pos - size, 0):pos]
    return seqinf.Sequence(seq[pos + size:pos:-1]).complement()
def flanking_region_fasta_insertion(genome, dataframe, flanking_region_size):
    """
    Build breakpoint-spanning sequences for insertion events, keyed by header.

    Each dataframe row can produce two entries:

        Sample_Gene_ChrNorm:PosNorm1-PosIns2__BP1
            flank ending at PosIns1 (on ChrIns) + flank starting at PosNorm1
        Sample_Gene_ChrNorm:PosNorm1-PosIns2__BP2
            flank ending at PosNorm2 + flank starting at PosIns2 (on ChrIns)

    For '-' strands the flank is read right-to-left and passed through
    seqinf.Sequence(...).complement().

    The two halves of an entry are matched on their header. An entry is
    only emitted when both halves were found (chromosome present in genome
    and strand is '+' or '-'). Rows that produce the same header overwrite
    each other. Flanks that would start before index 0 are truncated at
    the start of the chromosome.

    Args:
        genome (list): genome list of tuples (header, seq)
        dataframe (pandas object): needs columns Gene, Sample, ChrNorm,
            PosNorm1, PosNorm2, StrandN, ChrIns, PosIns1, PosIns2, StrandI.
        flanking_region_size (int): length of each flank.

    Returns:
        result (dict): {header: seq}; sequences keep the case of genome.
    """
    size = int(flanking_region_size)
    norm_bp1, ins_bp1 = {}, {}
    norm_bp2, ins_bp2 = {}, {}

    for chrom_name, chrom_seq in genome:
        chrm = str(chrom_name)
        seq = str(chrom_seq)
        rows = zip(dataframe.Gene, dataframe.Sample, dataframe.ChrNorm,
                   dataframe.PosNorm1, dataframe.PosNorm2, dataframe.StrandN,
                   dataframe.ChrIns, dataframe.PosIns1, dataframe.PosIns2,
                   dataframe.StrandI)
        for (gene, sample, chrn, startn, stopn, strandn,
             chri, starti, stopi, strandi) in rows:
            # Both halves share a header built from the normal-side
            # chromosome/start and the insertion-side stop.
            label = "%s_%s_%s:%s-%s" % (sample, gene, chrn, startn, stopi)
            if str(chrn) == chrm and strandn in ('+', '-'):
                norm_bp1[label + "__BP1"] = _flank_after(
                    seq, int(startn), size, strandn)
                norm_bp2[label + "__BP2"] = _flank_before(
                    seq, int(stopn), size, strandn)
            if str(chri) == chrm and strandi in ('+', '-'):
                ins_bp1[label + "__BP1"] = _flank_before(
                    seq, int(starti), size, strandi)
                ins_bp2[label + "__BP2"] = _flank_after(
                    seq, int(stopi), size, strandi)

    result = {}
    # Join halves by direct key lookup instead of scanning every pair.
    for header, norm_seq in norm_bp1.items():
        if header in ins_bp1:
            result[header] = ins_bp1[header] + norm_seq
    for header, norm_seq in norm_bp2.items():
        if header in ins_bp2:
            result[header] = norm_seq + ins_bp2[header]
    return result


def _flank_after(seq, pos, size, strand):
    """Return the flank that begins at pos ('+': seq[pos:pos+size])."""
    if strand == '+':
        return seq[pos:pos + size]
    stop = pos - size
    # A negative stop would be read as an offset from the end of the string;
    # None walks back to index 0 instead.
    reverse_walk = seq[pos:(stop if stop >= 0 else None):-1]
    return seqinf.Sequence(reverse_walk).complement()


def _flank_before(seq, pos, size, strand):
    """Return the flank that ends at pos ('+': seq[pos-size:pos])."""
    if strand == '+':
        # Clamp so a position near the chromosome start does not wrap around.
        return seq[max(pos - size, 0):pos]
    return seqinf.Sequence(seq[pos + size:pos:-1]).complement()
def flanking_region_fasta_translocation(genome, dataframe, flanking_region_size):
    """
    Build breakpoint-spanning sequences for translocation events, keyed by
    header.

    Each dataframe row can produce one entry:

        Sample_Gene_ChrNorm:PosNorm-PosTrans
            flank ending at PosTrans (on ChrTrans) + flank starting at PosNorm

    For '-' strands the flank is read right-to-left and passed through
    seqinf.Sequence(...).complement().

    The two halves of an entry are matched on their header. An entry is
    only emitted when both halves were found (chromosome present in genome
    and strand is '+' or '-'). Rows that produce the same header overwrite
    each other. Flanks that would start before index 0 are truncated at
    the start of the chromosome.

    Args:
        genome (list): genome list of tuples (header, seq)
        dataframe (pd.DataFrame): needs columns Gene, Sample, ChrNorm,
            PosNorm, StrandN, ChrTrans, PosTrans, StrandT.
        flanking_region_size (int): length of sequence to pad position with.

    Returns:
        result (dict): {header: seq}; sequences keep the case of genome.
    """
    size = int(flanking_region_size)
    samp_norm = {}
    samp_tran = {}

    for chrom_name, chrom_seq in genome:
        chrm = str(chrom_name)
        seq = str(chrom_seq)
        rows = zip(dataframe.Gene, dataframe.Sample, dataframe.ChrNorm,
                   dataframe.PosNorm, dataframe.StrandN, dataframe.ChrTrans,
                   dataframe.PosTrans, dataframe.StrandT)
        for gene, sample, chrn, posn, strandn, chrt, post, strandt in rows:
            header = "%s_%s_%s:%s-%s" % (sample, gene, chrn, posn, post)
            if str(chrn) == chrm and strandn in ('+', '-'):
                samp_norm[header] = _flank_after(seq, int(posn), size, strandn)
            if str(chrt) == chrm and strandt in ('+', '-'):
                samp_tran[header] = _flank_before(seq, int(post), size, strandt)

    result = {}
    # Join halves by direct key lookup instead of scanning every pair.
    for header, norm_seq in samp_norm.items():
        if header in samp_tran:
            result[header] = samp_tran[header] + norm_seq
    return result


def _flank_after(seq, pos, size, strand):
    """Return the flank that begins at pos ('+': seq[pos:pos+size])."""
    if strand == '+':
        return seq[pos:pos + size]
    stop = pos - size
    # A negative stop would be read as an offset from the end of the string;
    # None walks back to index 0 instead.
    reverse_walk = seq[pos:(stop if stop >= 0 else None):-1]
    return seqinf.Sequence(reverse_walk).complement()


def _flank_before(seq, pos, size, strand):
    """Return the flank that ends at pos ('+': seq[pos-size:pos])."""
    if strand == '+':
        # Clamp so a position near the chromosome start does not wrap around.
        return seq[max(pos - size, 0):pos]
    return seqinf.Sequence(seq[pos + size:pos:-1]).complement()
def test_0_gc_percent(self):
    """
    A sequence with no G or C must report 0.0 GC percent and must not
    raise (guards against a division-by-zero style failure).
    """
    all_a = seqinf.Sequence("AAAA")
    self.assertEqual(all_a.gc_percent(), 0.0)