Beispiel #1
0
def gc_genes(genomes_dict_by_type_and_gen_gc3s):
    """Compute GC123 rows for every sequence, grouped by genus, type and gene.

    Args:
        genomes_dict_by_type_and_gen_gc3s: Nested mapping shaped
            ``{virus_genus: {virus_type: {gene: iterable of sequences}}}``.

    Returns:
        A ``defaultdict(dict)`` shaped ``{virus_genus: {virus_type: {gene: rows}}}``.
        Each row is the list of values returned by ``GC123`` for one sequence
        with one extra trailing value: the mean of items 1 and 2 of that
        result. Genes with no sequences map to an empty list.
    """
    gc123_by_type_and_gene_list_todas = defaultdict(dict)

    for virus_genus, types in genomes_dict_by_type_and_gen_gc3s.items():
        for virus_type, genes in types.items():
            gc123_by_gen_dict = defaultdict(list)

            for gene, sequences in genes.items():
                # Touch the key first so a gene without sequences still gets
                # an (empty) entry in the output.
                rows = gc123_by_gen_dict[gene]
                for sequence in sequences:
                    gc123 = list(GC123(sequence))
                    gc123.append(np.mean([gc123[1], gc123[2]]))
                    rows.append(gc123)

            gc123_by_type_and_gene_list_todas[virus_genus][
                virus_type] = gc123_by_gen_dict

    # The original built this structure and then discarded it.
    return gc123_by_type_and_gene_list_todas
Beispiel #2
0
def mainloop():
    """Repeatedly prompt for two sequences, align them and print GC figures.

    Each round reads two sequences from standard input, prints the first
    global alignment returned by ``pairwise2.align.globalxx`` and then the
    ``GC`` and ``GC123`` results for both sequences. After a confirmation
    prompt the terminal is cleared and the next round starts.

    The function never returns on its own; it runs until interrupted.
    """
    # A plain loop replaces the original self-recursion, which would hit
    # Python's recursion limit after enough rounds.
    while True:
        seq = input("Sequence 1:")
        seq2 = input("Sequence 2:")
        alignments = pairwise2.align.globalxx(seq, seq2)
        print(format_alignment(*alignments[0]))
        print("No1.Total GC content:")
        print(GC(seq))
        print("No1.GC by parts:")
        print(GC123(seq))
        print("No2.Total GC content:")
        print(GC(seq2))
        print("No2.GC by parts:")
        print(GC123(seq2))
        input('next prot')
        # 'cls' only exists on Windows; use 'clear' everywhere else.
        os.system('cls' if os.name == 'nt' else 'clear')
def codon_counter(nt, codons, nt_type='dna'):
    """Count relative codon usage per amino acid in a nucleotide sequence.

    The sequence is scanned in steps of three from position 0. The first
    ``ATG`` found marks the start and the first ``TAG`` found marks the stop;
    the region between them (stop codon included) is then split into codons.

    Args:
        nt: Nucleotide sequence (``str`` or ``Seq``).
        codons: Mapping of amino acid -> collection of codons coding for it.
        nt_type: ``'dna'`` (default) or ``'rna'``. RNA input is converted to a
            ``Seq`` if needed and back-transcribed before processing.

    Returns:
        dict mapping each amino acid key to a list of
        ``(codon, fraction)`` tuples, where ``fraction`` is the share of that
        codon among all codons seen for the amino acid, rounded to 3 decimals.
        Codons not present in ``codons`` are grouped under the key ``None``.
    """
    # Stores codons used for each amino acid and frequency used for said amino acid
    codon_table = dict()

    # Grabs the key (aa) for the given value (codon)
    def get_key(val):
        for key, value in codons.items():
            if val in value:
                return key
        return None

    # Handles a RNA string passed to the codon counter
    if nt_type == 'rna':
        if not isinstance(nt, Seq):
            nt = Seq(nt)
        nt = nt.back_transcribe()

    start = None
    stop = None

    # Start and stop codons identified for the sequence
    for frame in range(0, len(nt), 3):
        # Compare against None explicitly: a start codon at position 0 is a
        # valid (falsy) index and must not be overwritten by a later ATG.
        if nt[frame:frame + 3] == 'ATG' and start is None:
            print(
                f'Start codon {nt[frame: frame + 3]} identified at position {frame}'
            )
            start = frame
        # mRNA-1273 contains all three stop codons at the end of the sequence
        # TAG was the last one before the 3' UTR so all stop codons included in the codon table
        # NOTE(review): an in-frame TAG located before the first ATG is still
        # accepted as the stop here, which yields an empty region - confirm.
        if nt[frame:frame + 3] == 'TAG' and stop is None:
            print(
                f'Stop codon {nt[frame: frame + 3]} identified at position {frame}'
            )
            stop = frame + 3

    # Trimmed nt sequence starting at ATG and ending at TAG
    nt_cds = nt[start:stop]

    # Counting codons used per amino acid
    for offset in range(0, len(nt_cds), 3):
        codon = nt_cds[offset:offset + 3]
        codon_table.setdefault(get_key(codon), []).append(str(codon))

    # Replaces each codon list with tuples
    # (codon, num times used to translate aa in nt seq provided / total codons for aa)
    for aa, used in codon_table.items():
        total = len(used)
        codon_table[aa] = [(codon, round(used.count(codon) / total, 3))
                           for codon in set(used)]
    print(GC123(nt_cds))

    return codon_table
Beispiel #4
0
def calculate_GC(entries):
    """Build a table of GC fractions for a collection of named sequences.

    Args:
        entries: Non-empty iterable of ``(name, sequence)`` pairs. The names
            are not used in the result.

    Returns:
        pandas.DataFrame with one row per sequence and the columns
        ``gc_content``, ``gc1_content``, ``gc2_content`` and ``gc3_content``,
        each holding the matching ``GC123`` value divided by 100.
    """
    _names, sequences = list(zip(*entries))

    per_sequence = []
    for seq in sequences:
        total, first, second, third = GC123(seq)
        per_sequence.append((total, first, second, third))

    # Regroup the per-sequence tuples into one list per statistic and scale
    # the values down by 100.
    scaled_columns = [[value / 100.0 for value in column]
                      for column in zip(*per_sequence)]

    df = pd.DataFrame(scaled_columns).T
    df.columns = "gc_content gc1_content gc2_content gc3_content".split()
    return df
def calculate123(seq_path):
    """Compute GC fractions for every record of a FASTA file.

    Args:
        seq_path: Path (or handle) of the FASTA file passed to ``SeqIO.parse``.

    Returns:
        pandas.DataFrame with the columns ``seqid``, ``GC``, ``GC12``, ``GC1``,
        ``GC2`` and ``GC3``. ``seqid`` is the record id; the numeric columns
        are the ``GC123`` values divided by 100, and ``GC12`` is the mean of
        ``GC1`` and ``GC2``. An empty input yields an empty frame with the
        same columns.
    """
    columns = ["seqid", "GC", "GC12", "GC1", "GC2", "GC3"]

    # Collect plain dict rows and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for record in SeqIO.parse(seq_path, "fasta"):
        gc, gc1, gc2, gc3 = GC123(record.seq)
        gc12 = (gc1 + gc2) / 2
        rows.append({
            "seqid": record.id,
            "GC": gc / 100,
            "GC12": gc12 / 100,
            "GC1": gc1 / 100,
            "GC2": gc2 / 100,
            "GC3": gc3 / 100,
        })

    return pd.DataFrame(rows, columns=columns)
Beispiel #6
0
def gc_length(input_file, output, field=None, header=None):
    """
    Read sequences from one column of a tab-delimited file and write a
    tab-delimited report of their composition.

    For every sequence the report holds the ``AoverAT`` value, the first
    ``GC123`` value, a length, the A/T/C/G frequencies, the 16 dinucleotide
    frequencies and the sequence itself.

    Args:
        input_file: Path of the tab-delimited input file.
        output: Path of the report to write.
        field: Column index holding the sequence (default: last column).
        header: Whether the first input line is a header to skip
            (default: True).
    """
    gap_alphabet = Gapped(ExtendedIUPACDNA(), '-')  # alphabet extended with gaps

    column = -1 if field is None else field
    skip_first = True if header is None else header

    sequences = []
    with open(input_file, "r") as in_file:
        for number, raw_line in enumerate(in_file):
            if skip_first and number == 0:
                continue
            sequences.append(Seq(raw_line.split("\t")[column], gap_alphabet))

    with open(output, "w") as o_file:
        dinuc_list = ["".join(pair) for pair in product("ATCG", repeat=2)]

        title_parts = ["AoverAT\tGCm\tlength"]
        title_parts.extend("\t{}.freq".format(nuc) for nuc in "ATCG")
        title_parts.extend("\t{}.freq".format(dinuc) for dinuc in dinuc_list)
        title_parts.append("\tSeq")
        print("".join(title_parts), file=o_file)

        for seq in sequences:
            gc = GC123(seq)
            at = AoverAT(seq)
            nuc_freq = nucleotide(seq)
            dinuc_freq = dinucleotide(seq)

            # NOTE(review): lines are not stripped on input, so with the
            # default (last) column the sequence still carries its newline;
            # "len(seq)-1" presumably compensates for that - confirm for
            # other columns.
            row_parts = ["{}\t{}\t{}".format(at, gc[0], len(seq)-1)]
            row_parts.extend(
                "\t{}".format(float(nuc_freq[nuc])) for nuc in "ATCG")
            row_parts.extend(
                "\t{}".format(float(dinuc_freq[dinuc])) for dinuc in dinuc_list)
            row_parts.append("\t{}".format(seq.tostring()))

            print("".join(row_parts).rstrip("\n"), file=o_file)
Beispiel #7
0
def GC3(sequ):
    """Return the GC content at the 3rd codon position.

    Args:
        sequ (str): DNA sequence

    Returns:
        float: item 3 of the ``GC123`` result for ``sequ``, rounded to
        3 decimal places
    """
    from Bio.SeqUtils import GC123
    third_position = GC123(sequ)[3]
    return round(third_position, 3)
Beispiel #8
0
def gc123(fasta, output=()):
    """Tabulate GC123 values for every record of a FASTA file.

    Args:
        fasta: Path of the FASTA file to read.
        output: Optional path of a tab-separated file to write. When it is
            empty/falsy (the default) the table is returned instead.

    Returns:
        list of rows (header row first, then ``[id, GC, GC1, GC2, GC3]`` with
        the numeric values as strings) when ``output`` is falsy; ``None``
        after writing the table to ``output`` otherwise.
    """
    # header row
    rows = [['Transcript', 'GC', 'GC1', 'GC2', 'GC3']]

    # The context manager closes the input handle, which the original left
    # open.
    with open(fasta, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # use Biopython GC123 to calculate GC content and codon GC content
            stats = GC123(record.seq)
            rows.append([
                record.id,
                str(stats[0]),
                str(stats[1]),
                str(stats[2]),
                str(stats[3]),
            ])

    # output results
    if output:
        with open(output, 'w') as out_handle:
            out_handle.writelines('\t'.join(row) + '\n' for row in rows)
        return None
    return rows
Beispiel #9
0
def biodb2cds_gc(biodb):
    """Fill ``custom_tables.gc_content_<biodb>`` with GC figures of CDS features.

    For every accession listed in ``orthology_detail_<biodb>`` the record is
    looked up, and each CDS feature that has a ``translation`` qualifier and
    neither a ``pseudo`` nor a ``pseudogene`` qualifier gets one inserted row:
    taxon id, seqfeature id, sequence length and the four ``GC123`` values.
    The table is created first if it does not exist, and a commit is issued
    after each accession.

    Args:
        biodb: Database name; used both to load the database and as the
            suffix of the table names.
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    from Bio.SeqUtils import GC123

    server, db = manipulate_biosqldb.load_db(biodb)

    accession_query = 'select distinct accession from orthology_detail_%s' % biodb
    taxon_query = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb
    seqfeature_query = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb

    accession_list = [
        row[0] for row in server.adaptor.execute_and_fetchall(accession_query,)
    ]
    locus2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(taxon_query,))
    locus2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(seqfeature_query,))

    create_table_sql = (
        'create table IF NOT EXISTS custom_tables.gc_content_%s (taxon_id INT, '
        ' seqfeature_id INT,'
        ' seq_length INT, gc_percent FLOAT, gc_1 FLOAT, gc_2 FLOAT, gc_3 FLOAT, '
        ' INDEX seqfeature_id(seqfeature_id), index taxon_id(taxon_id))'
    ) % biodb
    server.adaptor.execute(create_table_sql,)

    insert_template = 'insert into  custom_tables.gc_content_%s values (%s, %s, %s, %s, %s, %s, %s);'

    for accession in accession_list:
        print (accession)
        record = db.lookup(accession=accession)
        seq = record.seq
        for feature in record.features:
            if feature.type != 'CDS':
                continue
            qualifiers = feature.qualifiers
            if ('pseudo' in qualifiers or 'pseudogene' in qualifiers
                    or 'translation' not in qualifiers):
                continue

            dna_sequence = feature.extract(seq)
            locus = qualifiers['locus_tag'][0]

            gc, gc1, gc2, gc3 = GC123(str(dna_sequence))
            # NOTE(review): the overall GC keeps 2 decimals while the three
            # per-position values are rounded to whole numbers - confirm this
            # asymmetry is intended.
            row_values = (biodb,
                          locus2taxon_id[locus],
                          locus2seqfeature_id[locus],
                          len(dna_sequence),
                          round(gc,2),
                          round(gc1),
                          round(gc2),
                          round(gc3))
            server.adaptor.execute(insert_template % row_values,)
        server.commit()
Beispiel #10
0
def get_gene_gc_len(genes_d):
    """Return gene GC values, GC3 values, gene sizes and detailed gene rows.

    Args:
        genes_d: Mapping of scaffold id -> iterable of 6-item gene entries
            ``(geneid, gene_start, gene_end, strand, description, gene_seq)``.

    Returns:
        Tuple ``(gcs, gc3s, lens, info)`` of four parallel lists with one item
        per gene: the ``GC`` value, item 3 of the ``GC123`` result, the
        sequence length, and a row ``[scfid, geneid, strand, length, gc, gc3,
        gene_start, gene_end, strand, description]`` (the strand appears
        twice, as in the original layout).
    """
    gcs = []
    gc3s = []
    lens = []
    info = []
    # dict.items() works on Python 2 and 3; iteritems() is Python 2 only.
    for scfid, genes in genes_d.items():
        for geneid, gene_start, gene_end, strand, description, gene_seq in genes:
            gc = GC(gene_seq)
            gc3 = GC123(gene_seq)[3]
            length = len(gene_seq)
            gcs.append(gc)
            gc3s.append(gc3)
            lens.append(length)
            info.append([
                scfid, geneid, strand, length, gc, gc3, gene_start, gene_end,
                strand, description
            ])
    return gcs, gc3s, lens, info
def calculate123(seq_path):
    """Compute length-weighted mean GC fractions over all records of a FASTA file.

    Args:
        seq_path: Path of the FASTA file passed to ``SeqIO.parse``.

    Returns:
        dict with the keys ``seqid`` (item 1 of ``splt(seq_path)``), ``GC``,
        ``GC1``, ``GC2`` and ``GC3``. Each numeric value is the mean of the
        matching ``GC123`` value over all records, weighted by record length
        and divided by 100.

    Raises:
        ValueError: If the file contains no records.
    """
    stat_names = ("GC", "GC1", "GC2", "GC3")

    dict_list = []
    for record in SeqIO.parse(seq_path, "fasta"):
        # GC123 returns a tuple, so it cannot be divided as a whole (the
        # original "GC123(...) / 100" raised TypeError). The single scaling
        # by 100 is applied once, on the weighted means below.
        tmp_dict = dict(zip(stat_names, GC123(record.seq)))
        tmp_dict['slen'] = len(record.seq)
        dict_list.append(tmp_dict)

    if not dict_list:
        raise ValueError("no sequence records found in %r" % (seq_path,))

    result_df = pd.DataFrame(dict_list)
    total_length = result_df.slen.sum()

    result_dict = {'seqid': splt(seq_path)[1]}
    for stat in stat_names:
        weighted_sum = (result_df[stat] * result_df.slen).sum()
        result_dict[stat] = weighted_sum / total_length / 100
    return result_dict
Beispiel #12
0
def gc123_genomes(genomes_dict):
    """Compute GC123 rows for every genome record, grouped by genus and type.

    Args:
        genomes_dict: Nested mapping shaped
            ``{virus_genus: {virus_type: iterable of records}}``; each record
            must expose a ``seq`` attribute.

    Returns:
        A ``defaultdict(dict)`` shaped ``{virus_genus: {virus_type: rows}}``.
        Each row is the list of ``GC123`` values of one record followed by
        the mean of items 1 and 2 of that result.
    """
    grouped = defaultdict(dict)

    for genus, records_by_type in genomes_dict.items():
        for vtype, records in records_by_type.items():
            rows = []
            for rec in records:
                row = [*GC123(rec.seq)]
                row.append(np.mean(row[1:3]))
                rows.append(row)
            grouped[genus][vtype] = rows

    return grouped
Beispiel #13
0
from Bio.SeqUtils import GC
from Bio.SeqUtils import GC123
# Ask for a DNA sequence, then print each report title followed by the value
# of the matching Bio.SeqUtils function for that sequence.
seq = input("DNA manual:")
for label, compute in (("Total GC content:", GC), ("GC by parts:", GC123)):
    print(label)
    print(compute(seq))
# Wait for a key press before the script ends.
input("enter")
Beispiel #14
0
# --------------------------------------------
'''
# Announce the current step (step3 is defined outside this excerpt).
print(step3)

# Collects the feature type of every level-1 child seen across all genes.
levelTwoType = []

# Write one tab-separated row per gene feature: id, child count, GC figures
# and length.
geneDist = os.path.join(outdir, "gene.dist.tsv")
with open(geneDist, 'w') as f:
    f.write("id\ttranscript_num\tgc\tgc1\tgc2\tgc3\tlength\n")
    for gene in db.features_of_type("gene"):
        # NOTE(review): children() is called without a level here but with
        # level=1 on the next line - confirm the count is meant to cover
        # every child rather than only direct ones.
        transcriptCounts = str(len(list(db.children(gene))))
        transcriptType = [t.featuretype for t in db.children(gene, level=1)]
        levelTwoType += transcriptType
        geneFa = gene.sequence(fasta)
        # The overall value comes from GC(); only items 1-3 of the GC123
        # result are written below.
        gc = GC(geneFa)
        gc123 = GC123(geneFa)
        # Length is derived from the feature coordinates (inclusive range),
        # not from the extracted sequence.
        geneLen = gene.end - gene.start + 1
        items = [
            gene.id, transcriptCounts,
            str(gc),
            str(gc123[1]),
            str(gc123[2]),
            str(gc123[3]),
            str(geneLen)
        ]
        linestr = '\t'.join(items)
        f.write(linestr + '\n')

# Show the distinct level-1 child feature types that were encountered.
print(set(levelTwoType))

# Length distribution of intergenic regions
Beispiel #15
0
def parsing_genom_CDS():
    """Summarise genome and CDS statistics as one tab-separated text line.

    Reads the genome sequences from ``tmp.txt.genome``, the TaxID from the
    first line of ``tmp.txt`` and the CDS sequences from ``tmp.txt.cds``.

    Returns:
        str: A newline-terminated line with 12 tab-separated fields: TaxID,
        genome length, rounded genome GC percentage, rounded N share, number
        of CDS, cumulative CDS length, rounded GC figure, number of CDS
        accepted by ``CDS_Conformity``, their cumulative length, and the
        rounded GC1, GC2 and GC3 figures. An empty string is returned when
        either sequence list is empty.
    """
    chain = ""
    liste_gen = parsing_fasta("tmp.txt.genome")
    if not liste_gen:
        return chain

    # Fetch the TaxID: first line of tmp.txt, parsed as float then as int.
    with open('tmp.txt') as f:
        line = int(float(f.readline().rstrip()))

    len_gen = 0  # cumulative length of the chromosomes
    N = 0  # number of 'N' characters in the genomic sequence
    gc = 0  # number of 'G' or 'C' characters
    for chromosome in liste_gen:
        text = str(chromosome)
        len_gen += len(text)
        N += text.count('N')
        gc += text.count('G') + text.count('C')

    len_gen_valid = len_gen - N

    gc = gc * 100 / float(len_gen_valid)  # genome GC percentage
    N = N / float(len_gen)  # share of N

    # CDS part
    li = parsing_fasta("tmp.txt.cds")
    if not li:
        return chain

    gc_sums = [0, 0, 0, 0]  # running sums of GC, GC1, GC2, GC3 (valid CDS only)
    NbrCDS = 0
    LenCumCDS = 0
    NbrCDS_valid = 0
    LenCumCDS_valid = 0
    for cds in li:
        cds = str(cds)
        NbrCDS += 1
        LenCumCDS += len(cds)
        if not CDS_Conformity(cds):
            continue
        NbrCDS_valid += 1
        LenCumCDS_valid += len(cds)
        stats = GC123(cds)
        for position in range(4):
            gc_sums[position] += stats[position]

    # NOTE(review): the sums cover valid CDS only but are divided by the
    # total CDS count - confirm whether NbrCDS_valid was intended.
    fields = [
        line, len_gen,
        round(gc),
        round(N), NbrCDS, LenCumCDS,
        round(gc_sums[0] / len(li)), NbrCDS_valid, LenCumCDS_valid,
        round(gc_sums[1] / len(li)),
        round(gc_sums[2] / len(li)),
        round(gc_sums[3] / len(li))
    ]
    chain = "\t".join(map(str, fields)) + '\n'
    return chain
Beispiel #16
0
def parsing_genom_CDS():
    """Summarise genome and CDS statistics as one tab-separated text line.

    Reads the TaxID from the first line of ``tmp.txt``, the genome sequences
    from ``tmp.txt.genome`` and the CDS sequences from ``tmp.txt.cds``.

    Returns:
        str: A newline-terminated line with 12 tab-separated fields: TaxID,
        genome length, rounded genome GC percentage, rounded N share, number
        of CDS, cumulative CDS length, rounded GC figure, number of CDS
        accepted by ``CDS_Conformity``, their cumulative length, and the
        rounded GC1, GC2 and GC3 figures. An empty string is returned when
        the genome or the CDS list is empty.
    """
    # Fetch the TaxID: first line of tmp.txt, parsed as float then as int.
    with open('tmp.txt') as f:
        line = f.readline()
        line = line.rstrip()
        line = int(float(line))

    liste_gen = parsing_fasta("tmp.txt.genome")
    # Without genome sequences the ratios below would divide by zero; return
    # an empty result instead of crashing.
    if not liste_gen:
        return ""

    len_gen = 0
    N = 0
    gc = 0
    for chromosome in liste_gen:
        text = str(chromosome)
        len_gen += len(text)  # length of the chromosomes
        N += text.count('N')  # number of N in the genomic sequence
        gc += text.count('G') + text.count('C')

    len_gen_valid = len_gen - N

    gc = gc * 100 / float(len_gen_valid)  # genome GC percentage
    N = N / float(len_gen)  # share of N

    # CDS part
    li = parsing_fasta("tmp.txt.cds")
    # Same reasoning as above: the averages divide by len(li).
    if not li:
        return ""

    a = 0  # sum of GC over valid CDS
    b = 0  # sum of GC1 over valid CDS
    c = 0  # sum of GC2 over valid CDS
    d = 0  # sum of GC3 over valid CDS
    NbrCDS = 0
    LenCumCDS = 0
    NbrCDS_valid = 0
    LenCumCDS_valid = 0
    for cds in li:
        cds = str(cds)
        NbrCDS += 1
        LenCumCDS += len(cds)
        if CDS_Conformity(cds):
            NbrCDS_valid += 1
            LenCumCDS_valid += len(cds)
            lis = GC123(cds)
            a += lis[0]
            b += lis[1]
            c += lis[2]
            d += lis[3]

    # NOTE(review): the sums cover valid CDS only but are divided by the
    # total CDS count - confirm whether NbrCDS_valid was intended.
    cds_count = len(li)
    lst = [
        line, len_gen, round(gc), round(N), NbrCDS, LenCumCDS,
        round(a / cds_count), NbrCDS_valid, LenCumCDS_valid,
        round(b / cds_count), round(c / cds_count), round(d / cds_count)
    ]
    chain = "\t".join(map(str, lst))
    chain = chain + '\n'
    return chain
Beispiel #17
0
# Disabled checks: slices of brin1 starting at offsets 1 and 2, and their
# translations.
# print( brin1[1:] )
# print( brin1[1:].translate() )
# print( brin1[2:] )
# print( brin1[2:].translate() )

## six_frame_translations is reported to have a known bug, so the calls
## below are left disabled.
from Bio.SeqUtils import six_frame_translations
print( "" )
# print(six_frame_translations( fasta_seq_toTest ) )
# print(six_frame_translations( genba_seq_toTest ) )

## NOTE(review): the original note says GC() "does not work" here (reason
## not visible in this excerpt); only the GC123() calls are active.
from Bio.SeqUtils import GC, GC123
print( "" )
# print(GC( fasta_seq_toTest ) )
print(GC123( fasta_seq_toTest ) )
# print(GC( genba_seq_toTest ) )
print(GC123( genba_seq_toTest ) )


## NCBI connection
def FetchSeq(mydb, myrettype, myretmode, myid):
    """Fetch one record through NCBI Entrez, report its features and return it.

    Args:
        mydb: Entrez database name, passed as ``db`` to ``Entrez.efetch``.
        myrettype: Entrez return type (for example ``"fasta"`` or ``"gb"``);
            also selects the format used to parse the response.
        myretmode: Entrez return mode, passed as ``retmode``.
        myid: Identifier of the record to fetch.

    Returns:
        The record read by ``SeqIO.read`` (the original returned nothing).
    """
    from Bio import Entrez
    from Bio import SeqIO
    Entrez.email = "*****@*****.**"
    # Parse the response in the format that was requested instead of always
    # assuming FASTA. Entrez return types that name GenBank flat files map to
    # SeqIO's "genbank" parser; any other value is used as the format name.
    seqio_format = {"gb": "genbank", "gbwithparts": "genbank"}.get(
        myrettype, myrettype)
    with Entrez.efetch(db=mydb, rettype=myrettype, retmode=myretmode, id=myid) as handle:
        seq_record = SeqIO.read(handle, seqio_format)
    print("%s with %i features" % (seq_record.id, len(seq_record.features)))
    return seq_record

# Demo call: fetch nucleotide record 6273291 as FASTA text.
print("***** FetchSeq nucleotide fasta text 6273291")
FetchSeq(mydb="nucleotide", myrettype="fasta", myretmode="text", myid="6273291")
from Bio import SeqIO

fasta_file = "capybara_genes.fasta"  # Input fasta file

# One list per output column; the first item of each list is the column title.
gene_code = ['Capybara_gene_code']
gc_content_tot = ['GC_content_total']
gc_first = ['GC_first_codon_pos']
gc_second = ['GC_second_codon_pos']
gc_third = ['GC_third_codon_pos']
gene_length = ['Gene_length']
protein_length = ['Prot_length']

# The context manager closes the input file, which the original left open.
with open(fasta_file) as fasta_handle:
    fasta_sequences = SeqIO.parse(fasta_handle, 'fasta')
    for seq in fasta_sequences:
        # Compute the four GC figures once per record instead of calling
        # GC123 four times on the same sequence.
        gc_total, gc_pos1, gc_pos2, gc_pos3 = GC123(seq)
        gene_code.append(seq.id)  #ID exists only for 'Record' objects
        gc_content_tot.append(gc_total)
        gc_first.append(gc_pos1)
        gc_second.append(gc_pos2)
        gc_third.append(gc_pos3)
        gene_length.append(len(seq))
        protein_length.append(len(seq) / 3)

# Convert the columns to arrays and stack them side by side into one table
# (titles in the first row).
gene_code = np.array(gene_code)
gc_content_tot = np.array(gc_content_tot)
gc_first = np.array(gc_first)
gc_second = np.array(gc_second)
gc_third = np.array(gc_third)
gene_length = np.array(gene_length)
protein_length = np.array(protein_length)
tbl = np.column_stack((gene_code, gc_content_tot, gc_first, gc_second,
                       gc_third, gene_length, protein_length))
fastaArray = []

id_Fasta = ''

# Walk the FASTA lines; every header line is expected to be followed by its
# sequence on the next line. For each header found in resultArray, write the
# matching result line plus the GC figures, then the sequence itself.
for i in range(len(fastaLines) - 1):
    lines = fastaLines[i]
    if not lines.startswith('>'):
        continue

    # Drop the '>' marker, the newline and surrounding whitespace. The
    # original called strip('\s'), which removes backslash and "s" characters
    # from both ends of the id rather than whitespace.
    id_Fasta = lines.strip('>').strip('\n').strip()

    count = resultArray.index(id_Fasta)
    fastaLines[i + 1] = fastaLines[i + 1].strip('\n')
    sequence = fastaLines[i + 1]

    # Compute the per-position values once instead of three times.
    gc_parts = GC123(sequence)
    gc_1 = round(gc_parts[1], 2)
    gc_2 = round(gc_parts[2], 2)
    gc_3 = round(gc_parts[3], 2)
    gc_overall = round(GC(sequence), 2)
    outputFile.write('>' + resultLineArray[count] + '\t' + str(gc_1) +
                     '\t' + str(gc_2) + '\t' + str(gc_3) + '\t' +
                     str(gc_overall) + '\n')
    outputFile.write(sequence + '\n')

    # Remove the consumed entry from both parallel lists so the same index
    # keeps addressing matching items.
    resultArray.pop(count)
    resultLineArray.pop(count)

outputFile.close()
Beispiel #20
0
#!/usr/bin/python

# testing python script
import Bio as bio
import matplotlib.pyplot as plot

from Bio.Seq import Seq
from Bio.Alphabet import generic_rna,generic_dna
from Bio.SeqUtils import GC123,GC,GC_skew

seq = "ataccaggctgaggcccattaatgatgcaatttgctgggcttctctattttctccgtgcttccatcctcttctccgtcggcggggagaagtgaaatgccgtggagatgggcggcggcggcggcgacggcggcgacgagaaagctcaccgggatctctcagtcgcgagtttcagtagcctttaccggccgtcttctctaccgctcgttcggaagcgactccagtgaaagccgcaagaggtcactgccacggggggtcgtatcgatcggggccatcagccttgctggaggtctcgtgctcagcgccgtcaacgacctcgccatcttcaatggatgcacaacgaaggcaattgagcatgctgctgacaaccctgctgttgtggaagcaattggagtgcctatagtcagaggaccgtggtatgatgcttctcttgaggtgggccatcgacggcggtctgtgtcatgcacattccctgtatctgggccacatgggtcaggatttctccagattaaggcaacccgagatggagaggatggtctgctttcgtttctgcggcatcacgactggaagatcctattgctggaggctcatcttgaagcaccatcagatgatgaggaccagagaaagctggttaaggtgaatcttgcaagcagtggccgtggggaagatggggatccagagagtggttaatcttttgtactgaattccatggtgagtggaagatcgtgtcatctgaatggactccaaatattaaatgacatggagatctagggaagcaaaaaaaaaaaaaaaa"

# Print the GC123 and GC results for the sequence. The function-call form of
# print is required on Python 3.
print(GC123(seq))
print(GC(seq))

# `plot` is the matplotlib.pyplot module itself (see the import at the top of
# this script), so the drawing functions have to be reached through it.
plot.plot(GC_skew(seq, window=100), c="r")
plot.xlabel("Window")
plot.ylabel("(G-C)/(G+C)")
plot.title("GC-skew")