def gc_genes(genomes_dict_by_type_and_gen_gc3s):
    """Compute GC123 profiles for every sequence, grouped by genus, type and gene.

    Args:
        genomes_dict_by_type_and_gen_gc3s: Nested mapping of the form
            ``{virus_genus: {virus_type: {gene: [sequence, ...]}}}``.

    Returns:
        defaultdict: ``{virus_genus: {virus_type: {gene: [profile, ...]}}}``
        where each profile is the ``GC123`` result of one sequence as a list,
        with the mean of its elements 1 and 2 (GC1 and GC2) appended.
    """
    gc123_by_type_and_gene_list_todas = defaultdict(dict)
    for virus_genus, types in genomes_dict_by_type_and_gen_gc3s.items():
        for virus_type, genes in types.items():
            gc123_by_gen_dict = defaultdict(list)
            for gene, sequences in genes.items():
                # Touch the key first so a gene without sequences still
                # shows up with an empty list.
                profiles = gc123_by_gen_dict[gene]
                for sequence in sequences:
                    gc123 = list(GC123(sequence))
                    gc123.append(np.mean([gc123[1], gc123[2]]))
                    profiles.append(gc123)
            gc123_by_type_and_gene_list_todas[virus_genus][
                virus_type] = gc123_by_gen_dict
    # The original built this mapping but never handed it back to the caller.
    return gc123_by_type_and_gene_list_todas
def mainloop():
    """Interactively align pairs of sequences and print their GC content.

    Each round prompts for two sequences, prints the first global alignment
    and the total / per-codon-position GC content of both sequences, waits
    for the user to press enter, clears the terminal and starts over.

    The function loops until it is interrupted (for example by Ctrl-C or
    EOF on stdin). A ``while`` loop replaces the original self-recursion,
    which would hit Python's recursion limit in a long session.
    """
    # 'cls' only exists on Windows; fall back to 'clear' elsewhere.
    clear_command = 'cls' if os.name == 'nt' else 'clear'
    while True:
        seq = input("Sequence 1:")
        seq2 = input("Sequence 2:")
        alignments = pairwise2.align.globalxx(seq, seq2)
        if alignments:
            print(format_alignment(*alignments[0]))
        else:
            # Guard the [0] lookup in case no alignment is returned.
            print("No alignment found.")
        print("No1.Total GC content:")
        print(GC(seq))
        print("No1.GC by parts:")
        print(GC123(seq))
        print("No2.Total GC content:")
        print(GC(seq2))
        print("No2.GC by parts:")
        print(GC123(seq2))
        input('next prot')
        os.system(clear_command)
def codon_counter(nt, codons, nt_type='dna'):
    """Report the relative usage of each codon per amino acid.

    The sequence is scanned in frame 0 for the first ``ATG`` and the first
    ``TAG``; the region between them (stop codon included) is split into
    codons, and each codon is looked up in ``codons``.

    Args:
        nt: Nucleotide sequence, either a ``str`` or a ``Seq``.
        codons: Mapping ``{amino_acid: collection_of_codons}`` used to look
            up which amino acid a codon belongs to.
        nt_type: ``'dna'`` (default) or ``'rna'``. RNA input is
            back-transcribed to DNA before counting.

    Returns:
        dict: ``{amino_acid: [(codon, fraction), ...]}`` where ``fraction``
        is the share of that amino acid's codons taken by ``codon``, rounded
        to 3 decimals. Codons not found in ``codons`` are collected under
        the key ``None``.
    """
    def get_key(val):
        # Reverse lookup: the amino acid (key) whose codon list holds val.
        for key, value in codons.items():
            if val in value:
                return key
        return None

    # Handle an RNA sequence, given either as plain text or as a Seq.
    if nt_type == 'rna':
        if not isinstance(nt, Seq):
            nt = Seq(nt)
        nt = nt.back_transcribe()

    start = None
    stop = None

    # Identify the first in-frame start and stop codons.
    for frame in range(0, len(nt), 3):
        codon = nt[frame:frame + 3]
        # Compare against None: a start codon at position 0 is falsy, and
        # "not start" would let every later ATG overwrite it.
        if codon == 'ATG' and start is None:
            print(
                f'Start codon {codon} identified at position {frame}'
            )
            start = frame
        # mRNA-1273 contains all three stop codons at the end of the sequence
        # TAG was the last one before the 3' UTR so all stop codons included in the codon table
        if codon == 'TAG' and stop is None:
            print(
                f'Stop codon {codon} identified at position {frame}'
            )
            stop = frame + 3

    # Trimmed sequence starting at ATG and ending at TAG.
    nt_cds = nt[start:stop]

    # Collect the codons used for each amino acid.
    codon_table = {}
    for frame in range(3, len(nt_cds) + 3, 3):
        codon = nt_cds[frame - 3:frame]
        codon_table.setdefault(get_key(codon), []).append(str(codon))

    # Replace each raw codon list by (codon, relative frequency) tuples.
    # Only values of existing keys are reassigned, so iterating is safe.
    for aa, used in codon_table.items():
        codon_table[aa] = [
            (codon, round(used.count(codon) / len(used), 3))
            for codon in set(used)
        ]

    print(GC123(nt_cds))
    return codon_table
def calculate_GC(entries):
    """Build a table of GC fractions for a collection of named sequences.

    Args:
        entries: Iterable of ``(name, sequence)`` pairs. Only the sequence
            is used.

    Returns:
        pandas.DataFrame: One row per entry with the columns
        ``gc_content``, ``gc1_content``, ``gc2_content`` and
        ``gc3_content``, each being the matching ``GC123`` value divided
        by 100. Empty input yields an empty frame with the same columns
        (the original crashed while unpacking ``zip(*entries)``).
    """
    columns = "gc_content gc1_content gc2_content gc3_content".split()
    rows = [[value / 100.0 for value in GC123(seq)] for _, seq in entries]
    return pd.DataFrame(rows, columns=columns)
def calculate123(seq_path):
    """Compute GC statistics for every record of a FASTA file.

    Args:
        seq_path: Path (or handle) of the FASTA file, passed to
            ``SeqIO.parse``.

    Returns:
        pandas.DataFrame: One row per record with the columns ``seqid``,
        ``GC``, ``GC12``, ``GC1``, ``GC2`` and ``GC3``. The numeric columns
        are the ``GC123`` values divided by 100; ``GC12`` is the mean of
        GC1 and GC2.
    """
    columns = ["seqid", "GC", "GC12", "GC1", "GC2", "GC3"]
    rows = []
    for record in SeqIO.parse(seq_path, "fasta"):
        gc, gc1, gc2, gc3 = GC123(record.seq)
        gc12 = (gc1 + gc2) / 2
        rows.append({
            "seqid": record.id,
            "GC": gc / 100,
            "GC12": gc12 / 100,
            "GC1": gc1 / 100,
            "GC2": gc2 / 100,
            "GC3": gc3 / 100,
        })
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(rows, columns=columns)
def gc_length(input_file, output, field=None, header=None):
    """
    Given an input tab-delimited file, retrieve the sequence at the given
    field of every line, compute its A/(A+T) ratio, GC content, length,
    nucleotide and dinucleotide frequencies, and write them to ``output``
    as a tab-delimited table.

    Args:
        input_file: Path of the tab-delimited input file.
        output: Path of the table to write.
        field: Index of the column holding the sequence
            (default: -1, the last column).
        header: Whether the first input line is a header to skip
            (default: True).
    """
    gapped = Gapped(ExtendedIUPACDNA(), '-')  # Extend alphabet to allow gaps
    if field is None:
        field = -1
    if header is None:
        header = True

    sequences = []
    with open(input_file, "r") as in_file:
        for number, line in enumerate(in_file):
            if header and number == 0:
                continue
            # Strip the line terminator before splitting. The original kept
            # it inside the last field and compensated with len(seq) - 1,
            # which was off by one for any other field and for a final
            # line without a newline.
            fields = line.rstrip("\r\n").split("\t")
            sequences.append(Seq(fields[field], gapped))

    dinuc_list = ["".join(x) for x in product("ATCG", repeat=2)]
    column_names = ["AoverAT", "GCm", "length"]
    column_names += ["{}.freq".format(nuc) for nuc in "ATCG"]
    column_names += ["{}.freq".format(dinuc) for dinuc in dinuc_list]
    column_names.append("Seq")

    with open(output, "w") as o_file:
        print("\t".join(column_names), file=o_file)
        for seq in sequences:
            gc = GC123(seq)
            at = AoverAT(seq)
            nuc_freq = nucleotide(seq)
            dinuc_freq = dinucleotide(seq)
            row = [at, gc[0], len(seq)]
            row += [float(nuc_freq[nuc]) for nuc in "ATCG"]
            row += [float(dinuc_freq[dinuc]) for dinuc in dinuc_list]
            # str(seq) replaces Seq.tostring(), which newer Biopython lacks.
            row.append(str(seq))
            print("\t".join(str(value) for value in row), file=o_file)
def GC3(sequ):
    """Return the GC content at the third codon position.

    Args:
        sequ (str): DNA sequence

    Returns:
        float: Element 3 of ``GC123(sequ)`` (the third-position GC value),
        rounded to 3 decimals.
    """
    from Bio.SeqUtils import GC123

    third_position_gc = GC123(sequ)[3]
    return round(third_position_gc, 3)
def gc123(fasta, output=()):
    """Tabulate total and per-codon-position GC content of a FASTA file.

    Args:
        fasta: Path of the FASTA file to read.
        output: Optional path of a tab-separated file to write. When it is
            empty (the default) the table is returned instead.

    Returns:
        list | None: When ``output`` is empty, a list of rows starting with
        the header ``['Transcript', 'GC', 'GC1', 'GC2', 'GC3']`` followed by
        one row of strings per record. When ``output`` is given, the rows
        are written to that file and ``None`` is returned.
    """
    # Header row
    rows = [['Transcript', 'GC', 'GC1', 'GC2', 'GC3']]
    # The context manager closes the input handle; the original leaked it.
    with open(fasta, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # Biopython GC123 gives GC content and per-position GC content
            values = GC123(record.seq)
            rows.append([record.id] + [str(value) for value in values[:4]])
    # Output results
    if output:
        with open(output, 'w') as out_handle:
            out_handle.writelines('\t'.join(row) + '\n' for row in rows)
    else:
        return rows
def biodb2cds_gc(biodb):
    """Fill ``custom_tables.gc_content_<biodb>`` with per-CDS GC statistics.

    For every accession listed in ``orthology_detail_<biodb>``, each CDS
    feature that is not flagged ``pseudo``/``pseudogene`` and carries a
    ``translation`` qualifier is extracted, and its length, total GC and
    per-codon-position GC values are inserted into the table (created on
    first use). All inserts are committed once at the end.

    Args:
        biodb: Name of the biodatabase; it is also used as the suffix of the
            SQL table names.
    """
    from chlamdb.biosqldb import manipulate_biosqldb
    from Bio.SeqUtils import GC123

    server, db = manipulate_biosqldb.load_db(biodb)

    sql1 = 'select distinct accession from orthology_detail_%s' % biodb
    sql2 = 'select locus_tag, taxon_id from orthology_detail_%s' % biodb
    sql3 = 'select locus_tag, seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb

    accession_list = [i[0] for i in server.adaptor.execute_and_fetchall(sql1,)]
    locus2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql2,))
    locus2seqfeature_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql3,))

    sql_head = 'create table IF NOT EXISTS custom_tables.gc_content_%s (taxon_id INT, ' \
               ' seqfeature_id INT,' \
               ' seq_length INT, gc_percent FLOAT, gc_1 FLOAT, gc_2 FLOAT, gc_3 FLOAT, ' \
               ' INDEX seqfeature_id(seqfeature_id), index taxon_id(taxon_id))' % biodb

    server.adaptor.execute(sql_head,)

    for accession in accession_list:
        print (accession)
        record = db.lookup(accession=accession)
        seq = record.seq
        for feature in record.features:
            qualifiers = feature.qualifiers
            if feature.type != 'CDS':
                continue
            if 'pseudo' in qualifiers or 'pseudogene' in qualifiers:
                continue
            if 'translation' not in qualifiers:
                continue

            dna_sequence = feature.extract(seq)
            locus = qualifiers['locus_tag'][0]
            gc, gc1, gc2, gc3 = GC123(str(dna_sequence))
            # All four values go into FLOAT columns, so keep two decimals
            # for each; the original rounded gc1..gc3 to whole numbers while
            # keeping two decimals for gc.
            # NOTE(review): the table name must be interpolated, and the
            # values are numbers from trusted internal lookups.
            sql = 'insert into custom_tables.gc_content_%s values (%s, %s, %s, %s, %s, %s, %s);' % (
                biodb,
                locus2taxon_id[locus],
                locus2seqfeature_id[locus],
                len(dna_sequence),
                round(gc, 2),
                round(gc1, 2),
                round(gc2, 2),
                round(gc3, 2))
            server.adaptor.execute(sql,)
    server.commit()
def get_gene_gc_len(genes_d):
    '''Return lists of gene GC, gene GC3, gene sizes, and detailed gene information.

    Args:
        genes_d: Mapping ``{scaffold_id: [gene, ...]}`` where each gene is a
            6-item sequence ``(geneid, gene_start, gene_end, strand,
            description, gene_seq)``.

    Returns:
        tuple: ``(gcs, gc3s, lens, info)``. The first three are parallel
        lists with one entry per gene; ``info`` holds one row per gene:
        ``[scfid, geneid, strand, length, gc, gc3, gene_start, gene_end,
        strand, description]``.
    '''
    gcs = []
    gc3s = []
    lens = []
    info = []
    # .items() works on Python 2 and 3; dict.iteritems() is Python 2 only.
    for scfid, genes in genes_d.items():
        for geneid, gene_start, gene_end, strand, description, gene_seq in genes:
            gc = GC(gene_seq)
            gc3 = GC123(gene_seq)[3]
            length = len(gene_seq)
            gcs.append(gc)
            gc3s.append(gc3)
            lens.append(length)
            info.append([
                scfid, geneid, strand, length, gc, gc3, gene_start, gene_end,
                strand, description
            ])
    return gcs, gc3s, lens, info
def calculate123(seq_path):
    """Compute length-weighted GC statistics over all records of a FASTA file.

    Args:
        seq_path: Path of the FASTA file.

    Returns:
        dict: ``{'seqid', 'GC', 'GC1', 'GC2', 'GC3'}`` where ``seqid`` is
        ``splt(seq_path)[1]`` and each GC entry is the average of the
        per-record ``GC123`` value weighted by sequence length, divided by
        100.

    Raises:
        ValueError: If the file holds no sequence data, so that no weighted
            average can be formed.
    """
    keys = ("GC", "GC1", "GC2", "GC3")
    weighted_sums = dict.fromkeys(keys, 0.0)
    total_length = 0
    for record in SeqIO.parse(seq_path, "fasta"):
        slen = len(record.seq)
        # GC123 returns a tuple, so the original "GC123(...) / 100" raised
        # TypeError. The percent-to-fraction scaling is applied once, below.
        for key, value in zip(keys, GC123(record.seq)):
            weighted_sums[key] += value * slen
        total_length += slen

    if total_length == 0:
        raise ValueError("no sequence data found in %r" % (seq_path,))

    result_dict = {'seqid': splt(seq_path)[1]}
    for key in keys:
        result_dict[key] = weighted_sums[key] / total_length / 100
    return result_dict
def gc123_genomes(genomes_dict):
    """Compute GC123 profiles for every genome record, grouped by genus and type.

    Args:
        genomes_dict: Nested mapping
            ``{virus_genus: {virus_type: [record, ...]}}`` whose records
            expose a ``seq`` attribute.

    Returns:
        defaultdict: ``{virus_genus: {virus_type: [profile, ...]}}`` where
        each profile is the ``GC123`` result of one record as a list, with
        the mean of its elements 1 and 2 appended.
    """
    gc123_output_list = defaultdict(dict)
    for virus_genus, types in genomes_dict.items():
        for virus_type, records in types.items():
            profiles = []
            for record in records:
                profile = list(GC123(record.seq))
                profile.append(np.mean([profile[1], profile[2]]))
                profiles.append(profile)
            gc123_output_list[virus_genus][virus_type] = profiles
    return gc123_output_list
from Bio.SeqUtils import GC
from Bio.SeqUtils import GC123

# Ask for a DNA sequence, then report its total GC content followed by the
# GC123 breakdown.
seq = input("DNA manual:")

print("Total GC content:")
print(GC(seq))

print("GC by parts:")
print(GC123(seq))

# Wait for enter so the output can be read before the script ends.
input("enter")
# -------------------------------------------- ''' print(step3) levelTwoType = [] geneDist = os.path.join(outdir, "gene.dist.tsv") with open(geneDist, 'w') as f: f.write("id\ttranscript_num\tgc\tgc1\tgc2\tgc3\tlength\n") for gene in db.features_of_type("gene"): transcriptCounts = str(len(list(db.children(gene)))) transcriptType = [t.featuretype for t in db.children(gene, level=1)] levelTwoType += transcriptType geneFa = gene.sequence(fasta) gc = GC(geneFa) gc123 = GC123(geneFa) geneLen = gene.end - gene.start + 1 items = [ gene.id, transcriptCounts, str(gc), str(gc123[1]), str(gc123[2]), str(gc123[3]), str(geneLen) ] linestr = '\t'.join(items) f.write(linestr + '\n') print(set(levelTwoType)) # 基因间区长度分布
def parsing_genom_CDS():
    """Summarise genome and CDS statistics as one tab-separated line.

    Reads the genome sequences from ``tmp.txt.genome``, the TaxID from the
    first line of ``tmp.txt`` and the CDS sequences from ``tmp.txt.cds``.

    Returns:
        str: A newline-terminated, tab-separated record with, in order:
        TaxID, genome length, genome GC percent, fraction of ``N``, number
        of CDS, cumulative CDS length, mean GC of valid CDS, number of valid
        CDS, cumulative length of valid CDS, and mean GC1, GC2 and GC3 of
        valid CDS. Percentages and means are rounded to whole numbers. An
        empty string is returned when either the genome or the CDS file
        yields no sequences.
    """
    chain = ""
    liste_gen = parsing_fasta("tmp.txt.genome")
    if not liste_gen:
        return chain

    # Retrieve the TaxID
    with open('tmp.txt') as f:
        line = int(float(f.readline().rstrip()))

    len_gen = 0
    N = 0
    gc = 0
    for i in liste_gen:
        chromosome = str(i)
        len_gen += len(chromosome)  # chromosome length
        N += chromosome.count('N')  # number of N in the genomic sequence
        gc += chromosome.count('G') + chromosome.count('C')
    len_gen_valid = len_gen - N
    # Guard the divisions: a genome made only of N (or of empty records)
    # would otherwise raise ZeroDivisionError.
    gc = gc * 100 / float(len_gen_valid) if len_gen_valid else 0.0  # genome GC rate
    # NOTE(review): N is a fraction in [0, 1], so round(N) below can only
    # give 0 or 1 - confirm whether a percentage was intended.
    N = N / float(len_gen) if len_gen else 0.0

    # CDS
    li = parsing_fasta("tmp.txt.cds")
    if not li:
        return chain

    a = 0  # summed GC of valid CDS
    b = 0  # summed GC1 of valid CDS
    c = 0  # summed GC2 of valid CDS
    d = 0  # summed GC3 of valid CDS
    NbrCDS = 0
    LenCumCDS = 0
    NbrCDS_valid = 0
    LenCumCDS_valid = 0
    for i in li:
        cds = str(i)
        NbrCDS += 1
        LenCumCDS += len(cds)
        if CDS_Conformity(cds):
            NbrCDS_valid += 1
            LenCumCDS_valid += len(cds)
            lis = GC123(cds)
            a += lis[0]
            b += lis[1]
            c += lis[2]
            d += lis[3]

    def mean_over_valid(total):
        # The sums only cover valid CDS, so average over the valid count;
        # the original divided by the count of all CDS, which biased the
        # means low whenever a CDS was rejected.
        return total / NbrCDS_valid if NbrCDS_valid else 0

    lst = [
        line, len_gen,
        round(gc),
        round(N), NbrCDS, LenCumCDS,
        round(mean_over_valid(a)), NbrCDS_valid, LenCumCDS_valid,
        round(mean_over_valid(b)),
        round(mean_over_valid(c)),
        round(mean_over_valid(d))
    ]
    chain = "\t".join(map(str, lst)) + '\n'
    return chain
def parsing_genom_CDS():
    """Summarise genome and CDS statistics as one tab-separated line.

    Reads the TaxID from the first line of ``tmp.txt``, the genome sequences
    from ``tmp.txt.genome`` and the CDS sequences from ``tmp.txt.cds``.

    Returns:
        str: A newline-terminated, tab-separated record whose fields
        correspond to the columns ``TaxId``, ``LenGenome``, ``GCgenome``,
        ``%NA``, ``NbrCDS``, ``LenCumCDS``, ``GC_CDS``, ``NbrCDS_valid``,
        ``LenCumCDS_valid``, ``GC1``, ``GC2``, ``GC3``. Percentages and
        means are rounded to whole numbers. An empty string is returned
        when either the genome or the CDS file yields no sequences (the
        original raised ZeroDivisionError in that case).
    """
    # Retrieve the TaxID
    with open('tmp.txt') as f:
        line = int(float(f.readline().rstrip()))

    liste_gen = parsing_fasta("tmp.txt.genome")
    if not liste_gen:
        return ""

    len_gen = 0
    N = 0
    gc = 0
    for i in liste_gen:
        chromosome = str(i)
        len_gen += len(chromosome)  # chromosome length
        N += chromosome.count('N')  # number of N in the genomic sequence
        gc += chromosome.count('G') + chromosome.count('C')
    len_gen_valid = len_gen - N
    # Guard the divisions: a genome made only of N (or of empty records)
    # would otherwise raise ZeroDivisionError.
    gc = gc * 100 / float(len_gen_valid) if len_gen_valid else 0.0  # genome GC rate
    # NOTE(review): N is a fraction in [0, 1], so round(N) below can only
    # give 0 or 1 - confirm whether a percentage was intended.
    N = N / float(len_gen) if len_gen else 0.0

    # CDS
    li = parsing_fasta("tmp.txt.cds")
    if not li:
        return ""

    a = 0  # summed GC of valid CDS
    b = 0  # summed GC1 of valid CDS
    c = 0  # summed GC2 of valid CDS
    d = 0  # summed GC3 of valid CDS
    NbrCDS = 0
    LenCumCDS = 0
    NbrCDS_valid = 0
    LenCumCDS_valid = 0
    for i in li:
        cds = str(i)
        NbrCDS += 1
        LenCumCDS += len(cds)
        if CDS_Conformity(cds):
            NbrCDS_valid += 1
            LenCumCDS_valid += len(cds)
            lis = GC123(cds)
            a += lis[0]
            b += lis[1]
            c += lis[2]
            d += lis[3]

    def mean_over_valid(total):
        # The sums only cover valid CDS, so average over the valid count;
        # the original divided by the count of all CDS, which biased the
        # means low whenever a CDS was rejected.
        return total / NbrCDS_valid if NbrCDS_valid else 0

    lst = [line, len_gen, round(gc), round(N), NbrCDS, LenCumCDS,
           round(mean_over_valid(a)), NbrCDS_valid, LenCumCDS_valid,
           round(mean_over_valid(b)), round(mean_over_valid(c)),
           round(mean_over_valid(d))]
    chain = "\t".join(map(str, lst)) + '\n'
    return chain
# Earlier reading-frame experiments, kept for reference:
# print( brin1[1:] )
# print( brin1[1:].translate() )
# print( brin1[2:] )
# print( brin1[2:].translate() )

## known bug for six_frame_translations !!
from Bio.SeqUtils import six_frame_translations
print("")
# print(six_frame_translations( fasta_seq_toTest ) )
# print(six_frame_translations( genba_seq_toTest ) )

## does not work above for GC()
from Bio.SeqUtils import GC, GC123
print("")
# print(GC( fasta_seq_toTest ) )
print(GC123(fasta_seq_toTest))
# print(GC( genba_seq_toTest ) )
print(GC123(genba_seq_toTest))


## NCBI connection
def FetchSeq(mydb, myrettype, myretmode, myid):
    """Fetch one record through Entrez and print its id and feature count.

    Args:
        mydb: Entrez database name, passed as ``db``.
        myrettype: Entrez retrieval type, passed as ``rettype``.
        myretmode: Entrez retrieval mode, passed as ``retmode``.
        myid: Identifier of the record to fetch, passed as ``id``.

    The response is always parsed as FASTA; nothing is returned.
    """
    from Bio import Entrez
    from Bio import SeqIO

    Entrez.email = "*****@*****.**"
    request = dict(db=mydb, rettype=myrettype, retmode=myretmode, id=myid)
    with Entrez.efetch(**request) as handle:
        seq_record = SeqIO.read(handle, "fasta")
    summary = (seq_record.id, len(seq_record.features))
    print("%s with %i features" % summary)


print( "***** FetchSeq nucleotide fasta text 6273291" )
FetchSeq("nucleotide", "fasta", "text", "6273291")
from Bio import SeqIO

fasta_file = "capybara_genes.fasta"  # Input fasta file

# Every column starts with its header label; one value per record is
# appended below, so the header ends up as the first row of the table.
gene_code = ['Capybara_gene_code']
gc_content_tot = ['GC_content_total']
gc_first = ['GC_first_codon_pos']
gc_second = ['GC_second_codon_pos']
gc_third = ['GC_third_codon_pos']
gene_length = ['Gene_length']
protein_length = ['Prot_length']

# The context manager closes the input file; the original never closed it.
with open(fasta_file) as fasta_handle:
    fasta_sequences = SeqIO.parse(fasta_handle, 'fasta')
    for seq in fasta_sequences:
        # Compute GC123 once per record instead of once per column.
        gc_total, gc_pos1, gc_pos2, gc_pos3 = GC123(seq.seq)
        gene_code.append(seq.id)  # ID exists only for 'Record' objects
        gc_content_tot.append(gc_total)
        gc_first.append(gc_pos1)
        gc_second.append(gc_pos2)
        gc_third.append(gc_pos3)
        gene_length.append(len(seq))
        protein_length.append(len(seq) / 3)

gene_code = np.array(gene_code)
gc_content_tot = np.array(gc_content_tot)
gc_first = np.array(gc_first)
gc_second = np.array(gc_second)
gc_third = np.array(gc_third)
gene_length = np.array(gene_length)
protein_length = np.array(protein_length)

# One row per record (plus the header row), one column per statistic.
tbl = np.column_stack((gene_code, gc_content_tot, gc_first, gc_second,
                       gc_third, gene_length, protein_length))
fastaArray = []
id_Fasta = ''
# Stop one line early: every header line is paired with the sequence on the
# line that follows it.
for i in range(0, len(fastaLines) - 1):
    lines = fastaLines[i]
    if lines.startswith('>'):
        # Drop the '>' marker and surrounding whitespace. The original
        # called strip('\s'), which is not a whitespace class: it strips
        # backslashes and the letter 's' from both ends of the id.
        id_Fasta = lines.strip('>').strip()
        count = resultArray.index(id_Fasta)
        fastaLines[i + 1] = fastaLines[i + 1].strip('\n')
        # Compute the per-position GC values once per sequence.
        gc123_values = GC123(fastaLines[i + 1])
        gc_1 = round(gc123_values[1], 2)
        gc_2 = round(gc123_values[2], 2)
        gc_3 = round(gc123_values[3], 2)
        gc_overall = round(GC(fastaLines[i + 1]), 2)
        outputFile.write('>' + resultLineArray[count] + '\t' + str(gc_1) +
                         '\t' + str(gc_2) + '\t' + str(gc_3) + '\t' +
                         str(gc_overall) + '\n')
        outputFile.write(fastaLines[i + 1] + '\n')
        # Remove the matched entry from both parallel lists.
        resultArray.pop(count)
        resultLineArray.pop(count)
outputFile.close()
#!/usr/bin/python
# testing python script
import Bio as bio
import matplotlib.pyplot as plot
from Bio.Seq import Seq
from Bio.Alphabet import generic_rna,generic_dna
from Bio.SeqUtils import GC123,GC,GC_skew

seq = "ataccaggctgaggcccattaatgatgcaatttgctgggcttctctattttctccgtgcttccatcctcttctccgtcggcggggagaagtgaaatgccgtggagatgggcggcggcggcggcgacggcggcgacgagaaagctcaccgggatctctcagtcgcgagtttcagtagcctttaccggccgtcttctctaccgctcgttcggaagcgactccagtgaaagccgcaagaggtcactgccacggggggtcgtatcgatcggggccatcagccttgctggaggtctcgtgctcagcgccgtcaacgacctcgccatcttcaatggatgcacaacgaaggcaattgagcatgctgctgacaaccctgctgttgtggaagcaattggagtgcctatagtcagaggaccgtggtatgatgcttctcttgaggtgggccatcgacggcggtctgtgtcatgcacattccctgtatctgggccacatgggtcaggatttctccagattaaggcaacccgagatggagaggatggtctgctttcgtttctgcggcatcacgactggaagatcctattgctggaggctcatcttgaagcaccatcagatgatgaggaccagagaaagctggttaaggtgaatcttgcaagcagtggccgtggggaagatggggatccagagagtggttaatcttttgtactgaattccatggtgagtggaagatcgtgtcatctgaatggactccaaatattaaatgacatggagatctagggaagcaaaaaaaaaaaaaaaa"

# print() with a single argument behaves the same on Python 2 and 3.
print(GC123(seq))
print(GC(seq))

# `plot` is the matplotlib.pyplot module, so the plotting helpers have to be
# called through it; the original called the module itself and used bare
# xlabel/ylabel/title names that were never imported.
plot.plot(GC_skew(seq, window=100), c="r")
plot.xlabel("Window")
plot.ylabel("(G-C)/(G+C)")
plot.title("GC-skew")
# Display the figure; without this the script ends with nothing shown.
plot.show()