def parse_CDS(file):
    '''Write one SQL insert statement per CDS feature of a GenBank file.

    Every record of ``file`` is read with ``SeqIO.parse(file, "genbank")``.
    For each feature whose type is 'CDS' a statement of the form
    ``insert into CDS values ('<accession>','<region>','<sequence>');``
    is written piecewise to ``cg.cds_file`` through ``fm.write_file``.
    The accession is ``rec.id`` passed through ``cl.remove_version`` and the
    region is ``feature.location`` passed through ``cl.clean_cds_region``.

    If the CDS sequence cannot be extracted, the statement is still closed,
    with an error marker followed by the record's full DNA sequence in place
    of the CDS sequence.

    input: file, GenBank source handed to SeqIO.parse
    output: None; statements are written to cg.cds_file
    '''
    write_file = cg.cds_file
    fm.wipe_file(write_file)  # wipe write file before writing new statements

    for rec in SeqIO.parse(file, "genbank"):
        if not rec.features:
            continue
        for feature in rec.features:
            if feature.type != 'CDS':
                continue

            # accession number
            acc = cl.remove_version(rec.id)
            fm.write_file("insert into CDS values ('" + acc + "',", write_file)

            # CDS region
            cds_region = cl.clean_cds_region(feature.location)
            fm.write_file("'" + cds_region + "',", write_file)

            # CDS sequence; only the extraction is guarded so that a failing
            # write is not mistaken for an extraction problem.
            try:
                cds_seq = str(feature.location.extract(rec).seq)
            except Exception:
                # Best effort: close the statement with an error marker so
                # the generated SQL stays well-formed. ``Exception`` (rather
                # than a bare except) lets KeyboardInterrupt/SystemExit
                # propagate.
                fm.write_file(
                    "'CDS ERROR; FULL DNA SEQUENCE:" + str(rec.seq) + "');\n",
                    write_file,
                )
            else:
                fm.write_file("'" + cds_seq + "'); \n", write_file)
def parse_dna_seq():
    '''Write the whole DNA sequence of every GenBank record to cg.dna_file.

    The records are read from ``cg.r_file``. ``cg.dna_file`` is wiped first,
    then each sequence is written followed by two newline characters.

    input: none (reads cg.r_file)
    output: None; sequences are written to cg.dna_file
    '''
    fm.wipe_file(cg.dna_file)

    records = SeqIO.parse(cg.r_file, "genbank")
    for record in records:
        sequence_text = str(record.seq)
        # two newlines follow every sequence
        fm.write_file(sequence_text + '\n\n', cg.dna_file)
def parse_acc():
    '''Write the accession number of every GenBank record to cg.acc_file.

    Records are read from ``cg.r_file`` with Biopython. Each record id is
    passed through ``cl.remove_version`` and written on its own line.
    ``cg.acc_file`` is wiped before anything is written.

    input: none (reads cg.r_file)
    output: None; accession numbers are written to cg.acc_file
    '''
    fm.wipe_file(cg.acc_file)

    records = SeqIO.parse(cg.r_file, "genbank")
    for record in records:
        accession = cl.remove_version(record.id)
        line = accession + '\n'
        fm.write_file(line, cg.acc_file)
def parse_acc_dna(r_file):
    '''Write one SQL insert per GenBank record: accession and whole DNA sequence.

    For every record in ``r_file`` a statement of the form
    ``insert into GENE values ('<accession>', '<sequence>');`` is written to
    ``cg.gene_file``, which is wiped first. The accession is the record id
    passed through ``cl.remove_version``.

    input: r_file, GenBank source handed to SeqIO.parse
    output: None; statements are written to cg.gene_file
    '''
    write_file = cg.gene_file
    fm.wipe_file(write_file)

    # Parse the file that was passed in. Previously the argument was ignored
    # and cg.r_file was always read, so callers could not choose the input.
    for record in SeqIO.parse(r_file, "genbank"):
        acc = cl.remove_version(record.id)
        fm.write_file("insert into GENE values ('" + acc + "', '", write_file)
        fm.write_file(str(record.seq) + "'); \n", write_file)
def parse_CDS(file):
    '''Write SQL insert statements for the features of a GenBank file.

    Every record of ``file`` is read with ``SeqIO.parse(file, "genbank")``
    and one ``insert into CDS values (...)`` statement is written to
    ``cg.cds_file`` (wiped first) for every feature of every record:

    * CDS features:   ('<accession>' ,'<translation>');
    * other features: ('<accession>' ,'<region>','<sequence>');

    NOTE(review): statements are emitted for every feature type and the
    region/sequence columns are written for the non-CDS features only;
    this branch layout is kept as found - confirm it is intended.

    input: file, GenBank source handed to SeqIO.parse
    output: None; statements are written to cg.cds_file
    '''
    fm.wipe_file(cg.cds_file)

    for rec in SeqIO.parse(file, "genbank"):
        if not rec.features:
            continue

        # gathering and cleaning accession (same for every feature of rec)
        acc = cl.remove_version(rec.id)

        for feature in rec.features:
            fm.write_file("insert into CDS values (" + "'" + acc + "' ,", cg.cds_file)

            if feature.type == "CDS":
                # The qualifier value is a list; its first item is the
                # amino-acid string. A CDS without a translation qualifier
                # gets an empty value instead of raising KeyError.
                aa_seq = feature.qualifiers.get('translation', [''])
                # ';' terminates the statement so the next insert is valid.
                fm.write_file("'" + aa_seq[0] + "');", cg.cds_file)
            else:
                # Region of the feature; the trailing comma keeps region and
                # sequence as two separate SQL values (adjacent quotes would
                # otherwise be read as one string with an escaped quote).
                cds_region = cl.clean_cds_region(feature.location)
                fm.write_file("'" + cds_region + "',", cg.cds_file)

                # Sequence of the feature, converted to a plain string
                # before it is concatenated and written.
                cds_seq = str(feature.location.extract(rec).seq)
                fm.write_file("'" + cds_seq + "');", cg.cds_file)
'''Parsing data

Driver script: for each data category the target SQL file is wiped and then
filled with the output of the matching parse function run on cg.r_file.
'''
import re
import sys
from config import config as cg
from file_management import file_management as fm
from parse_genfile import parse_data as pd
from cleaning_data import clean_data as cl
from SQL_format import sql_format as sf

# NOTE(review): this path is inserted after the project imports above have
# already run, so it cannot influence how they were resolved - confirm the
# intended order.
sys.path.insert(0, '../cgi-biocomp2/')

#Run entire script to fully parse genbank file(r_file)

#Parse ACCESSION
fm.wipe_file(cg.sql_acc_file)
fm.write_file(pd.parse_acc_no(cg.r_file), cg.sql_acc_file)
#WRITES TO: sql_acc_file = 'sql_acc_no.sql'

#Parse GENE_ID
fm.wipe_file(cg.gene_file)
fm.write_file(sf.parse_gene_id(cg.r_file), cg.gene_file)
#WRITES TO: gene_file = 'sql_gene_id.sql'

#Parse CHROM_LOC
fm.wipe_file(cg.chrom_file)
fm.write_file(sf.parse_chrom_loc(cg.r_file), cg.chrom_file)
#WRITES TO: chrom_file = 'sql_chrom_loc.sql'

#Parse PRODUCT_NAME
fm.wipe_file(cg.prod_file)