def Tm_feature(data, pam_audit=True, learn_options=None):
    """Compute melting-temperature features for every 30-mer in ``data``.

    Each sequence in ``data["30mer"]`` yields five values, all obtained from
    ``Tm.Tm_staluc(..., rna=True)``:

        0 - the whole 30-mer
        1 - the slice given by segment 0 (default (19, 24): the 5 nt
            immediately proximal of the NGG PAM)
        2 - the slice given by segment 1 (default (11, 19): 8-mer)
        3 - the slice given by segment 2 (default (6, 11): 5-mer)
        4 - the slice given by segment 3 (default (4, 24): 20 nt spacer)

    Args:
        data: DataFrame with a "30mer" column of sequence strings.
        pam_audit: when true, sequences whose characters 25:27 are not "GG"
            are skipped; their row keeps the placeholder value 1.0 in every
            column.
        learn_options: optional mapping; if it contains "Tm segments", that
            value replaces the default list of ``(start, stop)`` slices.
            At least four segments are required.

    Returns:
        DataFrame indexed like ``data`` with one column per feature.
    """
    if learn_options is None or "Tm segments" not in learn_options:
        segments = [(19, 24), (11, 19), (6, 11), (4, 24)]
    else:
        segments = learn_options["Tm segments"]

    sequence = data["30mer"].values
    # Pre-filled with ones: rows skipped by the PAM audit keep this value.
    featarray = np.ones((sequence.shape[0], 5))
    rna = True

    for i, seq in enumerate(sequence):
        if pam_audit and seq[25:27] != "GG":
            # Sequences without the expected GG PAM are skipped, not rejected.
            continue

        featarray[i, 0] = Tm.Tm_staluc(seq, rna=rna)  # 30mer
        # Columns 1..4 come from segments 0..3, in that order.
        for k in range(4):
            start, stop = segments[k][0], segments[k][1]
            featarray[i, k + 1] = Tm.Tm_staluc(seq[start:stop], rna=rna)

    columns = [
        "Tm global_30mer%s" % rna,
        "5mer_end_%s" % rna,
        "8mer_middle_%s" % rna,
        "5mer_start_%s" % rna,
        "Tm global_spacer_%s" % rna,
    ]
    return pd.DataFrame(featarray, index=data.index, columns=columns)
Beispiel #2
0
def get_primer(seq, direction, name):
    """Pick a primer from one end of ``seq`` and return its description.

    Starting at 15 nt, the candidate is extended one base at a time while
    its ``mt.Tm_staluc`` value is at or below the module-level ``PRIMER_TM``
    and the length counter has not passed 65.

    Args:
        seq: template sequence (anything sliceable that ``mt.Tm_staluc`` and
            ``revcomplement`` accept).
        direction: "fwd" takes the primer from the start of ``seq``; "rev"
            takes it from the end, reverse-complemented and lower-cased.
        name: label embedded in the primer name.

    Returns:
        ``[primer_name, primer_seq, primer_tm, primer_length]`` where
        ``primer_name`` is "<PRIMER_NUM>_<name>_<direction>".

    Raises:
        ValueError: if ``direction`` is neither "fwd" nor "rev".

    Side effects:
        Increments the module-level ``PRIMER_NUM`` counter.
    """
    # Tm_NN: Calculation based on nearest neighbor thermodynamics. Several
    # tables for DNA/DNA, DNA/RNA and RNA/RNA hybridizations are included.
    # Corrections for mismatches, dangling ends, salt concentration and other
    # additives are available.
    # Tm_staluc is the 'old' NN calculation and is kept for compatibility.
    # It is, however, recommended to use Tm_NN instead, since Tm_staluc may be
    # deprecated in the future. Also, Tm_NN has many more options. Using
    # Tm_staluc and Tm_NN with default parameters gives (essentially) the
    # same results.
    global PRIMER_NUM

    # Validate first: previously an unknown direction fell through both
    # branches and failed later with an UnboundLocalError on primer_seq.
    if direction not in ("fwd", "rev"):
        raise ValueError(
            "direction must be 'fwd' or 'rev', got {!r}".format(direction))

    primer_length = 15  # min primer length

    if direction == "fwd":
        while (mt.Tm_staluc(seq[0:primer_length]) <= PRIMER_TM
               and primer_length <= 65):
            primer_length += 1
        primer_seq = seq[0:primer_length]
    else:
        while (mt.Tm_staluc(seq[-primer_length:]) <= PRIMER_TM
               and primer_length <= 65):
            primer_length += 1
        primer_seq = revcomplement(seq[-primer_length:]).lower()

    primer_tm = mt.Tm_staluc(primer_seq)
    primer_seq = str(primer_seq)
    primer_name = "{}_{}_{}".format(PRIMER_NUM, name, direction)
    PRIMER_NUM += 1
    return [primer_name, primer_seq, primer_tm, primer_length]
Beispiel #3
0
def Temper(sequence):
    """Return ``Tm.Tm_staluc`` values for ``sequence`` and three of its slices.

    The result maps:
        'T20' -> the full ``sequence``
        'T7'  -> ``sequence[:7]``
        'T8'  -> ``sequence[7:15]``
        'T5'  -> ``sequence[15:20]``
    """
    # (label, fragment) pairs, listed in the order they are evaluated.
    fragments = (
        ('T20', sequence),
        ('T7', sequence[:7]),
        ('T8', sequence[7:15]),
        ('T5', sequence[15:20]),
    )
    return {label: Tm.Tm_staluc(piece) for label, piece in fragments}
Beispiel #4
0
def gene_feature(Y):
    """
    Per-row features of the target gene: sequence length, GC content,
    DNA melting temperature and molecular weight.

    Each distinct name in ``Y["Target gene"]`` is looked up once with
    ``util.get_gene_sequence`` and its values are written to every row that
    names that gene. The result is a DataFrame indexed like ``Y``.
    """
    target_genes = Y["Target gene"]
    names = target_genes.values

    # One row per input row; columns follow the order of the labels below.
    table = np.zeros((names.shape[0], 4))

    for gene in target_genes.unique():
        dna = util.get_gene_sequence(gene)
        rows = names == gene
        table[rows] = (
            len(dna),
            SeqUtil.GC(dna),
            Tm.Tm_staluc(dna, rna=False),
            SeqUtil.molecular_weight(dna, "DNA"),
        )

    labels = [
        "gene length",
        "gene GC content",
        "gene temperature",
        "gene molecular weight",
    ]
    return pd.DataFrame(data=table, index=target_genes.index, columns=labels)
Beispiel #5
0
def Tm_feature(data, pam_audit=True, learn_options=None):
    """
    Compute melting-temperature features for every sequence in data["30mer"].

    Every value comes from ``Tm.Tm_staluc(..., rna=False)``. With the default
    segments the columns are:
        0-the 30-mer ("global Tm")
        1-the Tm of positions 16 - 20 of the sgRNA, i.e. the 5nts
          immediately proximal of the NGG PAM (slice 19:24)
        2-the Tm of positions 8 - 15, i.e. 8 nt (slice 11:19)
        3-the Tm of positions 3 - 7, i.e. 5 nt (slice 6:11)

    Args:
        data: DataFrame with a "30mer" column of sequence strings.
        pam_audit: when true, every sequence must have "GG" at 25:27.
        learn_options: optional mapping; if it contains "Tm segments", that
            value replaces the default list of (start, stop) slices. At
            least three segments are required.

    Returns:
        DataFrame indexed like ``data`` with four feature columns.

    Raises:
        Exception: if ``pam_audit`` is true and a sequence lacks the GG PAM.
    """

    if learn_options is None or "Tm segments" not in learn_options:
        segments = [(19, 24), (11, 19), (6, 11)]
    else:
        segments = learn_options["Tm segments"]

    sequence = data["30mer"].values
    featarray = np.ones((sequence.shape[0], 4))

    # Defined before the loop: the column names below need it even when
    # ``data`` is empty (it used to be assigned only inside the loop).
    rna = False

    for i, seq in enumerate(sequence):
        if pam_audit and seq[25:27] != "GG":
            raise Exception(f"expected GG but found {seq[25:27]}")
        featarray[i, 0] = Tm.Tm_staluc(seq, rna=rna)  # 30mer Tm
        # Columns 1..3 come from segments 0..2, in that order.
        for k in range(3):
            start, stop = segments[k][0], segments[k][1]
            featarray[i, k + 1] = Tm.Tm_staluc(seq[start:stop], rna=rna)

    feat = pd.DataFrame(
        featarray,
        index=data.index,
        columns=[
            f"Tm global_{rna}",
            f"5mer_end_{rna}",
            f"8mer_middle_{rna}",
            f"5mer_start_{rna}",
        ],
    )

    return feat
Beispiel #6
0
 def tm_nn(self, dnaconc=500, saltconc=50):
     """
     Nearest-neighbour DNA/DNA melting temperature of ``self.sequence``.

     dnaconc - float, [DNA] nM
     saltconc - float, [salt] mM
     @return float rounded to one decimal, or 0 when the sequence is empty
     """
     if self.sequence:
         melting = TM.Tm_staluc(self.sequence, dnac=dnaconc, saltc=saltconc)
         return round(melting, 1)
     # No sequence: nothing to compute.
     return 0
Beispiel #7
0
from Bio.SeqUtils import MeltingTemp as MT
import xlwt

PRIMER_FILE = '../../samples/primers.txt'
# w is the newly created workbook.
w = xlwt.Workbook()
# ws is the new sheet in this workbook.
ws = w.add_sheet('Result')
# These two lines write the column titles.
ws.write(0, 0, 'Primer Sequence')
ws.write(0, 1, 'Tm')
# The context manager closes the input file once every line has been read.
with open(PRIMER_FILE) as primer_file:
    for index, line in enumerate(primer_file):
        # For each line in the input file, write the primer sequence and
        # its Tm. The slice drops the first 3 and the last 4 characters of
        # the raw line, then spaces are removed.
        # NOTE(review): presumably this strips 5'/3' end markers plus the
        # newline - confirm against the format of primers.txt.
        prm = line[3:len(line) - 4].replace(' ', '')
        ws.write(index + 1, 0, prm)
        ws.write(index + 1, 1, '{0:.2f}'.format(MT.Tm_staluc(prm)))
# Save the spreadsheet to a file.
w.save('primerout.xls')
def melting_temp(s):
    """Return the scaled melting temperatures of every item of ``s`` but the first.

    Each element of ``s[1:]`` is passed to ``Tm.Tm_staluc(..., rna=False)``;
    the values are returned as a NumPy array divided by 1e3.
    """
    temperatures = []
    for fragment in s[1:]:
        temperatures.append(Tm.Tm_staluc(fragment, rna=False))
    return np.array(temperatures) / 1e3
Beispiel #9
0
# Tour of Biopython's IUPAC data tables and the SeqUtils helpers.
# Every print is a single-argument call so the script parses on Python 3
# and prints the same output on Python 2.
# NOTE(review): IUPACData is used here but its import is not visible in this
# excerpt - presumably `from Bio.Data import IUPACData` appears earlier; confirm.
print(IUPACData.extended_protein_letters)
print(IUPACData.ambiguous_dna_letters)
print(IUPACData.unambiguous_dna_letters)
print(IUPACData.ambiguous_rna_letters)
print(IUPACData.unambiguous_rna_letters)
print(IUPACData.ambiguous_dna_complement)  # dictionary of complements
# and a lot more
from Bio.Data import CodonTable
print(CodonTable.generic_by_id[2])

# SeqUtils. Several functions to deal with DNA and protein sequences.
# DNA utils
import Bio.SeqUtils as SeqUtils
print(SeqUtils.GC('gacgatcggtattcgtag'))  # GC content
from Bio.SeqUtils import MeltingTemp
print(MeltingTemp.Tm_staluc('tgcagtacgtatcgt'))  # DNA/RNA melting temperature
# Checksum functions: a short alphanumeric signature of a file or sequence,
# usually written in the description of the sequence.
# gcg is an easy, weak, widely used checksum (crc32 and crc64 are better).
from Bio.SeqUtils import CheckSum
myseq = 'acaagatgccattgtcccccggcctcctgctgctgct'
print(CheckSum.gcg(myseq))
print(CheckSum.crc32(myseq))
print(CheckSum.crc64(myseq))
print(CheckSum.seguid(myseq))
# Protein utils
from Bio.SeqUtils import ProtParam
myprot = ProtParam.ProteinAnalysis('MLTNK')
print(myprot.count_amino_acids())
print(myprot.get_amino_acids_percent())
print(myprot.molecular_weight())