Beispiel #1
0
 def setUp(self):
     """
     Build the fixtures shared by the tests.

     Sets three attributes:
         self.seq: a seqinf.Sequence created from "ACTG".
         self.name: a plain label string.
         self.primer_dimer: a seqinf.PrimerDimer created from the
             ``sequence`` attribute of self.seq, the ``sequence`` attribute
             of a second Sequence ("TGCA"), and the integer 2.
     """
     forward = seqinf.Sequence("ACTG")
     self.seq = forward
     self.name = "seq name not seq"
     forward_bases = self.seq.sequence
     reverse_bases = seqinf.Sequence("TGCA").sequence
     # NOTE(review): the third argument is presumably the alignment length
     # -- confirm against seqinf.PrimerDimer.
     self.primer_dimer = seqinf.PrimerDimer(forward_bases, reverse_bases, 2)
Beispiel #2
0
 def test_format_comp_align(self):
     """
     Check that the formatted complementary alignment contains "TTTT".

     A PrimerDimer is built from the ``sequence`` of Sequence("AAAA") and
     the ``complement()`` of Sequence("TTTT"); every item produced by
     ``pd_local()`` is rendered with ``format_alignment_compl`` and the
     rendered value must contain "TTTT".

     Note: if ``pd_local()`` produces no items the loop body never runs and
     the test passes without asserting anything.
     """
     forward = seqinf.Sequence("AAAA").sequence
     reverse = seqinf.Sequence("TTTT").complement()
     dimer = seqinf.PrimerDimer(forward, reverse, 4)
     for hit in dimer.pd_local():
         # The first five fields of each hit are forwarded unchanged.
         rendered = seqinf.PrimerDimer.format_alignment_compl(
             hit[0], hit[1], hit[2], hit[3], hit[4])
         self.assertTrue("TTTT" in rendered)
Beispiel #3
0
def primer_dimer_local(aln_len_list, names, seq1, seq2):
    """
    Yield formatted local alignments between forward and reverse primers.

    ``names``, ``seq1`` and ``aln_len_list`` are walked in lockstep (zip, so
    the shortest one decides how many forward primers are processed). Each
    forward primer is tested against every entry of ``seq2``: the reverse
    primer is wrapped in seqinf.Sequence and its ``complement()`` is handed
    to seqinf.PrimerDimer together with the forward primer and that
    primer's alignment length. Every item returned by ``pd_local()`` is
    rendered with ``PrimerDimer.format_alignment_compl``.

    Args:
        aln_len_list (iterable): one alignment length per forward primer,
            passed as the third argument of seqinf.PrimerDimer.
        names (iterable of str): one identifier per forward primer.
        seq1 (iterable): forward primer sequences.
        seq2 (iterable): reverse primer sequences; iterated once per
            forward primer, so it should be re-iterable (e.g. a list).
    Yields:
        tuple: (name + "\\n", formatted alignment) for every item produced
        by ``pd_local()``.
    """
    for label, forward, window in zip(names, seq1, aln_len_list):
        for reverse in seq2:
            reverse_compl = seqinf.Sequence(reverse).complement()
            dimer = seqinf.PrimerDimer(forward, reverse_compl, window)
            for hit in dimer.pd_local():
                titled = label + "\n"
                rendered = seqinf.PrimerDimer.format_alignment_compl(
                    hit[0], hit[1], hit[2], hit[3], hit[4])
                yield (titled, rendered)
Beispiel #4
0
 def test_handles_lowercase(self):
     """
     Check that a lowercase input gives the same ``sequence`` as the fixture.

     Compares ``self.seq.sequence`` with the ``sequence`` attribute of a
     Sequence built from the lowercase string 'actg'.
     """
     self.assertEqual(self.seq.sequence,
                      seqinf.Sequence('actg').sequence)
Beispiel #5
0
def gc_percent_seqs(seqs_list):
    """
    Compute the GC percentage of every item nested inside ``seqs_list``.

    Each element of ``seqs_list`` is itself iterated; every inner item is
    upper-cased and then scored with ``seqinf.Sequence(...).gc_percent()``.
    The results are returned as one flat list, in iteration order.

    NOTE(review): because the inner elements are iterated, passing a flat
    list of strings scores every single character separately. The function
    presumably expects a list of lists of sequences -- confirm with the
    caller.

    Args:
        seqs_list (list): iterable of iterables of sequence strings.
    Returns:
        gc_list (list): one gc_percent() result per inner item.
    """
    # Upper-case everything first, then score, in the same two phases.
    flattened = [item.upper() for group in seqs_list for item in group]
    return [seqinf.Sequence(item).gc_percent() for item in flattened]
def calc_gc(sliced_seq):
    """
    Replace the product sequence of each record by its GC percentage.

    Args:
        sliced_seq (iterable): records of (header, product_len, product_seq).
    Returns:
        sample_gc (list): tuples of (header, product_len, gc) where gc is
        the value of ``seqinf.Sequence(product_seq).gc_percent()``.
    """
    return [
        (label, length, seqinf.Sequence(product).gc_percent())
        for label, length, product in sliced_seq
    ]
def flanking_region_fasta_insertion(genome, dataframe, flanking_region_size):
    """
    Build the BP1 and BP2 breakpoint records for every insertion row.

    For each dataframe row two records are produced:

        >Sample_Gene_ChrNorm:PosNorm1-PosNorm2__BP1
        flank ending at PosIns1 (on ChrIns) + flank starting at PosNorm1
        >Sample_Gene_ChrNorm:PosNorm1-PosNorm2__BP2
        flank ending at PosNorm2 + flank starting at PosIns2 (on ChrIns)

    On the '+' strand the flanks are plain forward slices. On the '-' strand
    the slice is taken backwards and passed through
    ``seqinf.Sequence(...).complement()``. Rows whose strand is neither '+'
    nor '-' are ignored, and so are rows whose normal-side or insertion-side
    chromosome is not present in ``genome``.

    Flanks are stored per dataframe row, so the normal-side and
    insertion-side flanks that get joined always belong to the same row,
    even when ChrNorm and ChrIns are different chromosomes. Windows that
    would start before the first base are clipped at the start of the
    sequence instead of wrapping around through a negative index.

    Args:
        genome (list): genome list of tuples (header, seq); headers are
            compared with the chromosome columns after ``str()`` and are
            expected to be unique.
        dataframe (pandas object): needs the columns Gene, Sample, ChrNorm,
            PosNorm1, PosNorm2, StrandN, ChrIns, PosIns1, PosIns2, StrandI.
        flanking_region_size (int): number of bases taken for each flank.
    Returns:
        output (list): (header, upper-cased sequence) tuples; all BP1
        records first, then all BP2 records, each group ordered by genome
        entry and then by dataframe row of the normal side.
    """
    size = int(flanking_region_size)
    strands = ('+', '-')

    def flank_from(seq, pos, strand):
        """Return the flank that starts at pos (seq[pos:pos+size] on '+')."""
        pos = int(pos)
        if strand == '+':
            return seq[pos:pos + size]
        # A negative stop would wrap around; None walks down to index 0.
        stop = pos - size if pos >= size else None
        return seqinf.Sequence(seq[pos:stop:-1]).complement()

    def flank_up_to(seq, pos, strand):
        """Return the flank that ends at pos (seq[pos-size:pos] on '+')."""
        pos = int(pos)
        if strand == '+':
            # Clip at 0 so a short prefix is returned instead of wrapping.
            return seq[max(pos - size, 0):pos]
        return seqinf.Sequence(seq[pos + size:pos:-1]).complement()

    rows = list(zip(
        dataframe.Gene, dataframe.Sample,
        dataframe.ChrNorm, dataframe.PosNorm1, dataframe.PosNorm2,
        dataframe.StrandN,
        dataframe.ChrIns, dataframe.PosIns1, dataframe.PosIns2,
        dataframe.StrandI))

    row_order = []    # row indices in the order their normal side was found
    labels = {}       # row index -> header without the __BPn suffix
    norm_flanks = {}  # row index -> (BP1 flank, BP2 flank) on ChrNorm
    ins_flanks = {}   # row index -> (BP1 flank, BP2 flank) on ChrIns
    for headers, seqs in genome:
        chrm = str(headers)
        seq = str(seqs)
        for idx, row in enumerate(rows):
            (gene, sample, chrn, startn, stopn, strandn,
             chri, starti, stopi, strandi) = row
            if str(chrn) == chrm and strandn in strands:
                if idx not in norm_flanks:
                    row_order.append(idx)
                labels[idx] = "%s_%s_%s:%s-%s" % (
                    sample, gene, chrn, startn, stopn)
                norm_flanks[idx] = (flank_from(seq, startn, strandn),
                                    flank_up_to(seq, stopn, strandn))
            if str(chri) == chrm and strandi in strands:
                ins_flanks[idx] = (flank_up_to(seq, starti, strandi),
                                   flank_from(seq, stopi, strandi))

    output = []
    # Only rows with both sides resolved can produce a record.
    paired = [idx for idx in row_order if idx in ins_flanks]
    for idx in paired:
        combined_seq = ins_flanks[idx][0] + norm_flanks[idx][0]
        output.append((labels[idx] + "__BP1", combined_seq.upper()))
    for idx in paired:
        combined_seq = norm_flanks[idx][1] + ins_flanks[idx][1]
        output.append((labels[idx] + "__BP2", combined_seq.upper()))
    return output
def flanking_region_fasta_translocation(genome, dataframe,
                                        flanking_region_size):
    """
    Build one breakpoint record for every translocation row.

    Each record looks like:

        >Sample_Gene_ChrNorm:PosNorm-PosTrans
        flank ending at PosTrans (on ChrTrans) + flank starting at PosNorm

    On the '+' strand the flanks are plain forward slices. On the '-' strand
    the slice is taken backwards and passed through
    ``seqinf.Sequence(...).complement()``. Rows whose strand is neither '+'
    nor '-' are ignored, and so are rows whose ChrNorm or ChrTrans is not
    present in ``genome``.

    Flanks are stored per dataframe row, so the two halves that get joined
    always belong to the same row even though they normally sit on different
    chromosomes. Windows that would start before the first base are clipped
    at the start of the sequence instead of wrapping around through a
    negative index.

    Args:
        genome (list): genome list of tuples (header, seq); headers are
            compared with the chromosome columns after ``str()`` and are
            expected to be unique.
        dataframe (pd.DataFrame): needs the columns Gene, Sample, ChrNorm,
            PosNorm, StrandN, ChrTrans, PosTrans, StrandT.
        flanking_region_size  (int): number of bases taken for each flank.
    Returns:
        output (list): (header, upper-cased sequence) tuples, ordered by
        genome entry and then by dataframe row of the normal side.
    """
    size = int(flanking_region_size)
    strands = ('+', '-')

    def flank_from(seq, pos, strand):
        """Return the flank that starts at pos (seq[pos:pos+size] on '+')."""
        pos = int(pos)
        if strand == '+':
            return seq[pos:pos + size]
        # A negative stop would wrap around; None walks down to index 0.
        stop = pos - size if pos >= size else None
        return seqinf.Sequence(seq[pos:stop:-1]).complement()

    def flank_up_to(seq, pos, strand):
        """Return the flank that ends at pos (seq[pos-size:pos] on '+')."""
        pos = int(pos)
        if strand == '+':
            # Clip at 0 so a short prefix is returned instead of wrapping.
            return seq[max(pos - size, 0):pos]
        return seqinf.Sequence(seq[pos + size:pos:-1]).complement()

    rows = list(zip(
        dataframe.Gene, dataframe.Sample, dataframe.ChrNorm,
        dataframe.PosNorm, dataframe.StrandN, dataframe.ChrTrans,
        dataframe.PosTrans, dataframe.StrandT))

    row_order = []   # row indices in the order their normal side was found
    labels = {}      # row index -> record header
    norm_flank = {}  # row index -> flank cut on ChrNorm
    trans_flank = {} # row index -> flank cut on ChrTrans
    for headers, seqs in genome:
        chrm = str(headers)
        seq = str(seqs)
        for idx, row in enumerate(rows):
            gene, sample, chrn, posn, strandn, chrt, post, strandt = row
            if str(chrn) == chrm and strandn in strands:
                if idx not in norm_flank:
                    row_order.append(idx)
                labels[idx] = "%s_%s_%s:%s-%s" % (
                    sample, gene, chrn, posn, post)
                norm_flank[idx] = flank_from(seq, posn, strandn)
            if str(chrt) == chrm and strandt in strands:
                trans_flank[idx] = flank_up_to(seq, post, strandt)

    output = []
    for idx in row_order:
        # Only rows with both sides resolved can produce a record.
        if idx in trans_flank:
            combined_seq = trans_flank[idx] + norm_flank[idx]
            output.append((labels[idx], combined_seq.upper()))

    return output
Beispiel #9
0
def flanking_region_fasta_insertion(genome, dataframe, flanking_region_size):
    """
    Build the BP1 and BP2 breakpoint sequences for every insertion row.

    For each dataframe row two entries are produced, keyed by header:

        Sample_Gene_ChrNorm:PosNorm1-PosIns2__BP1
            flank ending at PosIns1 (on ChrIns) + flank starting at PosNorm1
        Sample_Gene_ChrNorm:PosNorm1-PosIns2__BP2
            flank ending at PosNorm2 + flank starting at PosIns2 (on ChrIns)

    On the '+' strand the flanks are plain forward slices. On the '-' strand
    the slice is taken backwards and passed through
    ``seqinf.Sequence(...).complement()``. Rows whose strand is neither '+'
    nor '-' are ignored. An entry is only produced when both the normal-side
    and the insertion-side flank were found in ``genome``. Rows that produce
    the same header share one entry (the last one written wins).

    Windows that would start before the first base are clipped at the start
    of the sequence instead of wrapping around through a negative index.

    Args:
        genome (list): genome list of tuples (header, seq); headers are
            compared with the chromosome columns after ``str()``.
        dataframe (pandas object): needs the columns Gene, Sample, ChrNorm,
            PosNorm1, PosNorm2, StrandN, ChrIns, PosIns1, PosIns2, StrandI.
        flanking_region_size (int): number of bases taken for each flank.
    Returns:
        result (dict): {header: seq}; BP1 entries first, then BP2 entries,
        each in the order the normal-side flank was first stored.
    """
    size = int(flanking_region_size)
    strands = ('+', '-')

    def flank_from(seq, pos, strand):
        """Return the flank that starts at pos (seq[pos:pos+size] on '+')."""
        pos = int(pos)
        if strand == '+':
            return seq[pos:pos + size]
        # A negative stop would wrap around; None walks down to index 0.
        stop = pos - size if pos >= size else None
        return seqinf.Sequence(seq[pos:stop:-1]).complement()

    def flank_up_to(seq, pos, strand):
        """Return the flank that ends at pos (seq[pos-size:pos] on '+')."""
        pos = int(pos)
        if strand == '+':
            # Clip at 0 so a short prefix is returned instead of wrapping.
            return seq[max(pos - size, 0):pos]
        return seqinf.Sequence(seq[pos + size:pos:-1]).complement()

    rows = list(zip(
        dataframe.Gene, dataframe.Sample,
        dataframe.ChrNorm, dataframe.PosNorm1, dataframe.PosNorm2,
        dataframe.StrandN,
        dataframe.ChrIns, dataframe.PosIns1, dataframe.PosIns2,
        dataframe.StrandI))

    seqsnormbp1 = {}
    seqsinsbp1 = {}
    seqsnormbp2 = {}
    seqsinsbp2 = {}
    for headers, seqs in genome:
        chrm = str(headers)
        seq = str(seqs)
        for (gene, sample, chrn, startn, stopn, strandn,
             chri, starti, stopi, strandi) in rows:
            on_norm = str(chrn) == chrm and strandn in strands
            on_ins = str(chri) == chrm and strandi in strands
            if not (on_norm or on_ins):
                continue
            # The header always names the normal-side chromosome, also for
            # the insertion-side flank, so both halves share one key.
            label = "%s_%s_%s:%s-%s" % (sample, gene, chrn, startn, stopi)
            if on_norm:
                seqsnormbp1[label + "__BP1"] = flank_from(
                    seq, startn, strandn)
                seqsnormbp2[label + "__BP2"] = flank_up_to(
                    seq, stopn, strandn)
            if on_ins:
                seqsinsbp1[label + "__BP1"] = flank_up_to(
                    seq, starti, strandi)
                seqsinsbp2[label + "__BP2"] = flank_from(
                    seq, stopi, strandi)

    result = {}
    # Direct key lookup instead of comparing every pair of headers.
    for name, norm_seq in seqsnormbp1.items():
        if name in seqsinsbp1:
            result[name] = seqsinsbp1[name] + norm_seq

    for name, norm_seq in seqsnormbp2.items():
        if name in seqsinsbp2:
            result[name] = norm_seq + seqsinsbp2[name]
    return result
Beispiel #10
0
def flanking_region_fasta_translocation(genome, dataframe,
                                        flanking_region_size):
    """
    Build one breakpoint sequence for every translocation row.

    Each entry is keyed by the header Sample_Gene_ChrNorm:PosNorm-PosTrans
    and holds:

        flank ending at PosTrans (on ChrTrans) + flank starting at PosNorm

    On the '+' strand the flanks are plain forward slices. On the '-' strand
    the slice is taken backwards and passed through
    ``seqinf.Sequence(...).complement()``. Rows whose strand is neither '+'
    nor '-' are ignored. An entry is only produced when both flanks were
    found in ``genome``. Rows that produce the same header share one entry
    (the last one written wins).

    Windows that would start before the first base are clipped at the start
    of the sequence instead of wrapping around through a negative index.

    Args:
        genome (list): genome list of tuples (header, seq); headers are
            compared with the chromosome columns after ``str()``.
        dataframe (pd.DataFrame): needs the columns Gene, Sample, ChrNorm,
            PosNorm, StrandN, ChrTrans, PosTrans, StrandT.
        flanking_region_size  (int): number of bases taken for each flank.
    Returns:
        result (dict): {header: seq}, in the order the normal-side flank
        was first stored.
    """
    size = int(flanking_region_size)
    strands = ('+', '-')

    def flank_from(seq, pos, strand):
        """Return the flank that starts at pos (seq[pos:pos+size] on '+')."""
        pos = int(pos)
        if strand == '+':
            return seq[pos:pos + size]
        # A negative stop would wrap around; None walks down to index 0.
        stop = pos - size if pos >= size else None
        return seqinf.Sequence(seq[pos:stop:-1]).complement()

    def flank_up_to(seq, pos, strand):
        """Return the flank that ends at pos (seq[pos-size:pos] on '+')."""
        pos = int(pos)
        if strand == '+':
            # Clip at 0 so a short prefix is returned instead of wrapping.
            return seq[max(pos - size, 0):pos]
        return seqinf.Sequence(seq[pos + size:pos:-1]).complement()

    rows = list(zip(
        dataframe.Gene, dataframe.Sample, dataframe.ChrNorm,
        dataframe.PosNorm, dataframe.StrandN, dataframe.ChrTrans,
        dataframe.PosTrans, dataframe.StrandT))

    samp_norm = {}
    samp_tran = {}
    for headers, seqs in genome:
        chrm = str(headers)
        seq = str(seqs)
        for gene, sample, chrn, posn, strandn, chrt, post, strandt in rows:
            on_norm = str(chrn) == chrm and strandn in strands
            on_tran = str(chrt) == chrm and strandt in strands
            if not (on_norm or on_tran):
                continue
            # Both halves are stored under the same normal-side header.
            header = "%s_%s_%s:%s-%s" % (sample, gene, chrn, posn, post)
            if on_norm:
                samp_norm[header] = flank_from(seq, posn, strandn)
            if on_tran:
                samp_tran[header] = flank_up_to(seq, post, strandt)

    result = {}
    # Direct key lookup instead of comparing every pair of headers.
    for name, norm_seq in samp_norm.items():
        if name in samp_tran:
            result[name] = samp_tran[name] + norm_seq

    return result
Beispiel #11
0
 def test_0_gc_percent(self):
     """
     Check that a sequence without G or C reports a GC percentage of 0.0
     rather than raising (e.g. a division by zero).
     """
     poly_a = seqinf.Sequence("AAAA")
     self.assertEqual(poly_a.gc_percent(), 0.0)