def gff3_to_gtf(gff3_file):
    """Convert the exon features of a GFF3 file into a GTF file.

    The output path is the input path with its extension replaced by
    ``.gtf``.  If that file already exists it is returned untouched.
    Otherwise every ``exon`` feature is rewritten so that its attributes
    are only ``transcript_id`` (the exon's first ``Parent``) and
    ``gene_id`` (the first ``Parent`` of that transcript), and one line
    per exon is written.

    Args:
        gff3_file (str): path to the GFF3 file to convert.

    Returns:
        str: path to the GTF file.
    """
    dialect = {
        'field separator': '; ',
        'fmt': 'gtf',
        'keyval separator': ' ',
        'leading semicolon': False,
        'multival separator': ',',
        'quoted GFF2 values': True,
        'order': ['gene_id', 'transcript_id'],
        'repeated keys': False,
        'trailing semicolon': True
    }

    out_file = os.path.splitext(gff3_file)[0] + ".gtf"
    if file_exists(out_file):
        return out_file

    # A single parenthesised argument prints identically on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print("Converting %s to %s." % (gff3_file, out_file))

    db = gffutils.create_db(gff3_file, ":memory:")
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in DataIterator(db.features_of_type("exon"),
                                        dialect=dialect):
                transcript_id = feature["Parent"][0]
                gene_id = db[transcript_id]["Parent"][0]
                attr = {"transcript_id": transcript_id, "gene_id": gene_id}
                feature.attributes = gffutils.attributes.Attributes(attr)
                # Same bytes as ``print >> out_handle, feature``.
                out_handle.write(str(feature) + "\n")
    return out_file
def _output_ncbi_gff3(gff3_file, out_file, dialect):
    """Write the exon features of ``gff3_file`` to ``out_file``.

    The database is built with features of type ``gene`` keyed on their
    ``gene`` attribute.  Each exon has its attributes replaced by
    ``transcript_id``, ``gene_id`` and ``gene_biotype`` before being
    written; exons carrying neither a ``transcript_id`` nor a ``gene``
    attribute are skipped.

    Args:
        gff3_file (str): path to the input GFF3 file.
        out_file (str): final path of the output file.
        dialect (dict): gffutils dialect used to render each feature.
    """
    gene_key = "gene"
    absent = object()

    def lookup(exon, key, fallback):
        """Return ``exon[key]``, or ``fallback`` when the key is missing."""
        try:
            return exon[key]
        except KeyError:
            return fallback

    db = gffutils.create_db(gff3_file, ":memory:", id_spec={"gene": gene_key})
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            exons = DataIterator(db.features_of_type("exon"), dialect=dialect)
            for exon in exons:
                # Gnomon features are often missing a transcript id, so the
                # gene key stands in for it; some malformed features are
                # missing the gene key as well and are dropped.
                transcript_id = lookup(exon, "transcript_id", absent)
                if transcript_id is absent:
                    transcript_id = lookup(exon, gene_key, absent)
                    if transcript_id is absent:
                        continue
                gene_id = exon[gene_key]
                exon.attributes = gffutils.attributes.Attributes({
                    "transcript_id": transcript_id,
                    "gene_id": gene_id,
                    "gene_biotype": lookup(exon, "gene_biotype", "unknown"),
                })
                # No separator is appended after the rendered feature.
                out_handle.write(str(exon))
def _output_gff3(gff3_file, out_file, dialect):
    """Write the exon features of ``gff3_file`` to ``out_file``.

    Each exon keeps only two attributes: ``transcript_id`` (its first
    ``Parent``) and ``gene_id`` (the first ``Parent`` of that transcript,
    looked up in the in-memory database).

    Args:
        gff3_file (str): path to the input GFF3 file.
        out_file (str): final path of the output file.
        dialect (dict): gffutils dialect used to render each feature.
    """
    database = gffutils.create_db(gff3_file, ":memory:")
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            exons = DataIterator(database.features_of_type("exon"),
                                 dialect=dialect)
            for exon in exons:
                parent_transcript = exon["Parent"][0]
                parent_gene = database[parent_transcript]["Parent"][0]
                exon.attributes = gffutils.attributes.Attributes({
                    "transcript_id": parent_transcript,
                    "gene_id": parent_gene,
                })
                # No separator is appended after the rendered feature.
                out_handle.write(str(exon))
def get_trans_records(dbname, trans_seq_dict):
    """Collect exon rows for the transcripts listed in ``trans_seq_dict``.

    Only records that carry a ``transcript_type`` attribute and whose
    ``transcript_id`` is a key of ``trans_seq_dict`` are considered.  For
    each of them the strand (column index 6) and the
    ``(protein_id, gene_id)`` pair are remembered; records whose column
    index 2 is ``exon`` additionally contribute one row to the table.

    Args:
        dbname: annotation source handed to ``DataIterator``.
        trans_seq_dict: mapping whose keys are the transcript ids to keep.

    Returns:
        tuple: ``(trans_records, strand_dict, protein_id_dict)`` where
        ``trans_records`` is a DataFrame sorted by ``trans_id`` then
        ``exon_number``.
    """
    column_order = ['chr', 'strand', 'start', 'end', 'trans_id',
                    'exon_number', 'exon_id', 'protein_id', 'gene_id']
    exon_columns = {name: [] for name in column_order}
    strand_dict = {}
    protein_id_dict = {}

    for feature in DataIterator(dbname):
        if 'transcript_type' not in feature.attributes:
            continue
        transcript_id = feature.attributes['transcript_id'][0]
        if transcript_id not in trans_seq_dict.keys():
            continue

        strand_dict[transcript_id] = feature[6]
        protein_id_dict[transcript_id] = (
            feature.attributes['protein_id'][0],
            feature.attributes['gene_id'][0],
        )
        if feature[2] != 'exon':
            continue

        exon_columns['chr'].append(feature[0])
        exon_columns['strand'].append(feature[6])
        exon_columns['start'].append(int(feature[3]))
        exon_columns['end'].append(int(feature[4]))

        # The ID is read as "<prefix>:<transcript>[_suffix]:<exon number>":
        # drop everything up to the first colon, then split once more.
        after_prefix = feature.attributes['ID'][0].split(':', 1)[-1]
        id_parts = after_prefix.split(':', 1)
        exon_columns['trans_id'].append(id_parts[0].split('_', 1)[0])
        exon_columns['exon_number'].append(int(id_parts[1]))

        exon_columns['gene_id'].append(feature.attributes['gene_id'][0])
        exon_columns['exon_id'].append(feature.attributes['exon_id'][0])
        exon_columns['protein_id'].append(
            feature.attributes['protein_id'][0])

    trans_records = pd.DataFrame(exon_columns, columns=column_order)
    trans_records = trans_records.sort_values(
        by=['trans_id', 'exon_number'], axis=0)
    return trans_records, strand_dict, protein_id_dict
def get_new_sequence(dfname, dbname, rna_db, het, exclude, output_name):
    """Build variant protein sequences and write them to ``<output_name>.fasta``.

    Steps, in order:

    1. Read every record of ``rna_db`` (FASTA).  The record id must look
       like ``<mrna_id>|...CDS:<start>-<end>|...``; if any id lacks
       ``CDS:`` or the ``-`` separator a message is printed and ``0`` is
       returned.
    2. Read ``dbname`` and keep strand and ``(protein_id, gene_id)`` for
       ``transcript`` records that have a ``transcript_type`` attribute
       and whose ``transcript_id`` was seen in step 1.
    3. Read the tab-separated ``dfname`` and pass it through
       ``extract_transcript_change``; group row positions by ``mrna``.
    4. For every changed transcript apply all ``hom`` changes, emit the
       resulting protein (or the unchanged one unless ``exclude``), and,
       when ``het == 1``, emit one protein per heterozygous change per
       1800-long window (windows start every 900 positions) together with
       a random control sequence.
    5. Unless ``exclude``, also emit the unchanged protein of every
       transcript that has no change at all.

    Args:
        dfname (str): path of the tab-separated change table (no header).
        dbname: annotation source handed to ``DataIterator``.
        rna_db (str): path of the transcript FASTA file.
        het: heterozygous windows are generated only when this equals 1.
        exclude: unchanged proteins are emitted only when this equals False.
        output_name (str): output path without the ``.fasta`` extension.

    Returns:
        ``0`` when the FASTA headers are malformed, otherwise ``None``.
    """

    def split_header(record_id):
        """Return ``(mrna_id, cds_range)`` from a FASTA id, or None if malformed."""
        bar = record_id.find("|")
        mrna_id = record_id[:bar]
        rest = record_id[bar + 1:]
        cds = rest.find('CDS:')
        if cds == -1:
            return None
        rest = rest[cds:]
        cds_range = rest[rest.find(':') + 1:rest.find('|')]
        if cds_range.find('-') == -1:
            return None
        return mrna_id, cds_range

    def to_protein(nucleotides, strand):
        """Translate up to the first stop; non-'+' input is complemented first."""
        dna = Seq(str(nucleotides), IUPAC.ambiguous_dna)
        if strand != '+':
            dna = dna.complement()
        return str(dna.transcribe().translate(to_stop=True))

    def strip_none(text):
        """Remove every literal 'None' from ``text``, repeating until none is left."""
        while text.find('None') != -1:
            text = text.replace('None', '')
        return text

    def apply_change(sequence, row, strand, offset):
        """Apply one change row to ``sequence`` with coordinates moved by ``offset``."""
        content = row['c_content']
        if strand == '-':
            # '-' strand transcripts are held complemented, so the inserted
            # content has to be complemented as well.
            content = str(Seq(str(content)).complement())
        return change_seq(sequence, int(row['c_start']) + offset,
                          int(row['c_end']) + offset, content,
                          row['mutation_type'])

    def describe(row):
        """Return the ``<type>:<start>-<end><content>`` label of a change row."""
        return (row['mutation_type'] + ":" + str(row['c_start']) + '-' +
                str(row['c_end']) + str(row['c_content']))

    version = get_version(rna_db)
    db = 'sp' if version == 'swissprot' else version

    # Single pass over the FASTA file: every header is validated before any
    # coordinate is converted, which keeps the "validate everything first"
    # order while reading the file only once.
    parsed_headers = []
    for fasta_record in SeqIO.parse(rna_db, 'fasta'):
        header = split_header(fasta_record.id)
        if header is None:
            print("The format of file of parameter -r(--rna) is incorrect!")
            return 0
        parsed_headers.append((header[0], header[1], fasta_record.seq))

    sequence_dict = {}
    for mrna_id, cds_range, rna in parsed_headers:
        dash = cds_range.find('-')
        sequence_dict[mrna_id] = (int(cds_range[:dash]), rna,
                                  int(cds_range[dash + 1:]))
    print("sequence_dict ready")

    strand_dict = {}
    protein_id_dict = {}
    for record in DataIterator(dbname):
        if record[2] != 'transcript':
            continue
        if 'transcript_type' not in record.attributes:
            continue
        transcript_id = record.attributes['transcript_id'][0]
        if transcript_id in sequence_dict:
            strand_dict[transcript_id] = record[6]
            protein_id_dict[transcript_id] = (
                record.attributes['protein_id'][0],
                record.attributes['gene_id'][0])
    print("protein_id_dict ready")

    df = pd.read_csv(dfname, sep='\t', header=None)
    change_df = extract_transcript_change(df)

    # Row positions of change_df grouped by transcript id.
    trans_index_dict = {}
    for i, mrna in enumerate(change_df['mrna']):
        trans_index_dict.setdefault(mrna, []).append(i)

    my_seqs = []
    k_cnt = 0
    hom_only_cnt = 0
    hom_het_cnt = 0
    het_only_cnt = 0
    original_cnt = 0
    random_cnt = 0

    for k, row_indices in trans_index_dict.items():
        if k not in protein_id_dict:
            continue
        k_cnt += 1
        if k_cnt % 1000 == 0:
            print(k_cnt)

        pid = protein_id_dict[k][0]
        gid = protein_id_dict[k][1]
        strand = strand_dict[k]
        cds_start = int(sequence_dict[k][0])
        cds_end = int(sequence_dict[k][2])

        if strand == '+':
            transcript = str(sequence_dict[k][1])
        else:
            transcript = str(
                Seq(str(sequence_dict[k][1]),
                    IUPAC.ambiguous_dna).complement())
        # Header coordinates are 1-based and inclusive.
        transcript = transcript[cds_start - 1:cds_end]

        shift = 0  # net length change caused by the hom indels applied so far
        des = ""
        het_list = []
        hom_position_list = []
        for i in row_indices:
            row = change_df.iloc[i]
            if row['snp_type'] != 'hom':
                het_list.append(i)
                continue
            mutation_type = row['mutation_type']
            hom_position_list.append((int(row['c_start']), mutation_type))
            transcript = apply_change(transcript, row, strand, shift)
            des += describe(row) + '_'
            span = int(row['c_end']) - int(row['c_start']) + 1
            if mutation_type.find('del') != -1:
                shift -= span
            elif mutation_type.find('ins') != -1:
                shift += span

        if len(hom_position_list) != 0:
            new_des = ""
            for position, mutation_type in hom_position_list:
                tail = des[des.find(str(position)):]
                new_des += str(mutation_type) + ":" + tail[:tail.find('_') + 1]
            new_sequence = transcript
            if len(new_sequence) != 0:
                # NOTE(review): slicing by ``shift`` keeps only the last
                # |shift| characters when shift is negative; kept as in the
                # original, confirm this is intended.
                new_sequence = new_sequence[shift:]
                new_seq = strip_none(to_protein(new_sequence, strand))
                my_seqs.append(SeqRecord(
                    Seq(str(new_seq), IUPAC.protein),
                    id=db + '|' + pid + '|' + k + '|' + gid + '_0:' +
                    str(cds_start + shift) + '-' + str(cds_end + shift) +
                    '_' + new_des,
                    description=new_des))
                hom_only_cnt += 1
        else:
            if exclude == False:
                new_seq = to_protein(transcript, strand)
                my_seqs.append(SeqRecord(
                    Seq(str(new_seq), IUPAC.protein),
                    id=db + '|' + pid + '|' + k + '|' + gid + '_0:' +
                    str(cds_start) + '-' + str(cds_end) + '_no_variant',
                    description="no variant"))
                original_cnt += 1

        if het == 1:
            count = int(len(transcript) / 900)
            if len(transcript) <= 900:
                count = 1
            cnt = 0
            for window in range(0, count):
                start = window * 900
                if start + 1799 < len(transcript):
                    stop = start + 1799
                else:
                    stop = len(transcript) - 1
                for het_index in het_list:
                    row = change_df.iloc[het_index]
                    new_sequence = ""
                    new_des = ""
                    if start <= int(row['c_start']) <= stop:
                        new_sequence = apply_change(transcript, row, strand,
                                                    shift)
                        new_des = des + describe(row)
                    if len(new_sequence) == 0:
                        # Change outside this window (or nothing produced).
                        continue

                    new_sequence = new_sequence[start:stop + 1]
                    new_seq = to_protein(new_sequence, strand)
                    cnt += 1
                    new_seq = strip_none(new_seq)
                    window_id = (db + '|' + pid + '|' + k + '|' + gid + '_' +
                                 str(cnt) + ':' + str(start + 1) + '-' +
                                 str(stop + 1) + '_')
                    my_seqs.append(SeqRecord(
                        Seq(str(new_seq), IUPAC.protein),
                        id=window_id + new_des,
                        description=new_des))
                    if len(hom_position_list) != 0:
                        hom_het_cnt += 1
                    else:
                        het_only_cnt += 1

                    # Random control sequence for the same window.
                    if row['mutation_type'] == 'snv':
                        random_seq, random_des = generate_random_SNV_site(
                            strand, k, transcript[start:stop + 1],
                            int(row['c_start']) - start)
                    else:
                        random_seq, random_des = generate_random_fs(
                            strand, transcript[start:stop + 1],
                            row['mutation_type'], len(row['c_content']),
                            int(row['c_start']) - start)
                    my_seqs.append(SeqRecord(
                        Seq(str(random_seq), IUPAC.protein),
                        id=window_id + des + random_des,
                        description=des + random_des))
                    random_cnt += 1

    if exclude == False:
        for key in sequence_dict:
            if key in trans_index_dict:
                continue
            # NOTE(review): raises KeyError for a FASTA transcript that was
            # not found in the annotation; kept as in the original.
            pid = protein_id_dict[key][0]
            gid = protein_id_dict[key][1]
            coding_start = int(sequence_dict[key][0]) - 1
            coding_end = int(sequence_dict[key][2]) - 1
            new_sequence = sequence_dict[key][1][coding_start:coding_end + 1]
            new_seq = to_protein(new_sequence, '+')
            my_seqs.append(SeqRecord(
                Seq(str(new_seq), IUPAC.protein),
                id=db + '|' + pid + '|' + key + '|' + gid + '_0:' +
                str(coding_start + 1) + '-' + str(coding_end + 1) +
                '_no_variant',
                description="no variant"))
            original_cnt += 1

    print("The number of proteins related is " + str(k_cnt))
    print("The number of sequences generated is " + str(len(my_seqs)))
    # The context manager guarantees the FASTA file is flushed and closed.
    with open(output_name + ".fasta", "w") as handle:
        for sequence in my_seqs:
            SeqIO.write(sequence, handle, "fasta")
    print("The number of sequences containing hom only is " +
          str(hom_only_cnt))
    print("The number of sequences containing het only is " +
          str(het_only_cnt))
    print("The number of mixed sequences is " + str(hom_het_cnt))
    print("The number of original sequences is " + str(original_cnt))
    print("The number of random sequences is " + str(random_cnt))
type=str, help= 'A gff file containing intervals within which gc content can be determined.' ) conf = ap.parse_args() #----------------------------------------------------- # Step 2 # Identify the gc content of features in the gff file #----------------------------------------------------- genome_file = conf.genome gff_file = conf.gff for feature in DataIterator(gff_file): contig_id = str(feature.seqid) feat_start = str(feature.start) feat_stop = str(feature.stop) sequence = feature.sequence(genome_file) g_count = sequence.count('G') c_count = sequence.count('C') n_count = sequence.count('N') gc_count = float(g_count + c_count) seq_len = int(len(sequence) - n_count) gc_frac = np.divide(gc_count, seq_len) gc_perc = int(np.round_(np.multiply(gc_frac, 100), decimals=0, out=None)) outline = [contig_id, feat_start, feat_stop, str(gc_perc)] print("\t".join(outline))
def get_new_sequence(dfname, dbname, rna_db, protein_db, dataset_name):
    """Build variant protein sequences and write them to a FASTA file.

    The output file is ``<dataset_name>_all_mutation_<version>.fasta``
    where ``version`` comes from ``get_version(protein_db)``.

    Steps, in order:

    1. Read ``dbname`` and keep strand and ``(protein_id, gene_id)`` for
       ``transcript`` records whose ``transcript_type`` is
       ``protein_coding`` and whose ``transcript_id`` is in the list
       returned by ``get_protein_coding_list_from_db(protein_db)``.
    2. Read every record of ``rna_db`` (FASTA).  The record id must look
       like ``<mrna_id>|...CDS:<start>-<end>|...``; if an id lacks
       ``CDS:`` or the ``-`` separator a message is printed and ``0`` is
       returned.
    3. Read the tab-separated ``dfname`` and pass it through
       ``extract_transcript_change``; group row positions by ``mrna``.
    4. For every changed transcript apply all ``hom`` changes, then for
       each 1800-long window (windows start every 900 positions) emit the
       protein with the hom changes falling in the window and one protein
       per heterozygous change falling in the window.

    Args:
        dfname (str): path of the tab-separated change table (no header).
        dbname: annotation source handed to ``DataIterator``.
        rna_db (str): path of the transcript FASTA file.
        protein_db: protein database used for the version and the list of
            protein-coding transcripts.
        dataset_name (str): prefix of the output file name.

    Returns:
        ``0`` when the FASTA headers are malformed, otherwise ``None``.
    """

    def to_protein(nucleotides, strand):
        """Translate up to the first stop; non-'+' input is complemented first."""
        dna = Seq(str(nucleotides), IUPAC.ambiguous_dna)
        if strand != '+':
            dna = dna.complement()
        return str(dna.transcribe().translate(to_stop=True))

    def strip_none(text):
        """Remove every literal 'None' from ``text``, repeating until none is left."""
        while text.find('None') != -1:
            text = text.replace('None', '')
        return text

    def apply_change(sequence, row, strand, offset):
        """Apply one change row to ``sequence`` with coordinates moved by ``offset``."""
        content = row['c_content']
        if strand == '-':
            # '-' strand transcripts are held complemented, so the inserted
            # content has to be complemented as well.
            content = str(Seq(str(content)).complement())
        return change_seq(sequence, int(row['c_start']) + offset,
                          int(row['c_end']) + offset, content,
                          row['mutation_type'])

    def describe(row):
        """Return the ``<type>:<start>-<end><content>`` label of a change row."""
        return (row['mutation_type'] + ":" + str(row['c_start']) + '-' +
                str(row['c_end']) + str(row['c_content']))

    version = get_version(protein_db)
    db = 'sp' if version == 'swissprot' else version
    protein_coding_list = get_protein_coding_list_from_db(protein_db)

    strand_dict = {}
    protein_id_dict = {}
    for record in DataIterator(dbname):
        if record[2] != 'transcript':
            continue
        if 'transcript_type' not in record.attributes:
            continue
        if record.attributes['transcript_type'][0] != 'protein_coding':
            continue
        transcript_id = record.attributes['transcript_id'][0]
        if transcript_id in protein_coding_list:
            strand_dict[transcript_id] = record[6]
            protein_id_dict[transcript_id] = (
                record.attributes['protein_id'][0],
                record.attributes['gene_id'][0])
    print("protein_id_dict ready")

    # Validate and collect in ONE pass.  The previous code validated by
    # iterating the SeqIO.parse iterator and then iterated the same,
    # already exhausted, iterator again, so sequence_dict stayed empty.
    sequence_dict = {}
    for fasta_record in SeqIO.parse(rna_db, 'fasta'):
        tmp = fasta_record.id
        flag = tmp.find("|")
        mrna_id = tmp[:flag]
        tmp = tmp[flag + 1:]
        cds = tmp.find('CDS:')
        if cds == -1:
            print("The format of file of parameter -r(--rna) is incorrect!")
            return 0
        tmp = tmp[cds:]
        cds_end = tmp.find('|')
        tmp = tmp[tmp.find(':') + 1:cds_end]
        split_flag = tmp.find('-')
        if split_flag == -1:
            print("The format of file of parameter -r(--rna) is incorrect!")
            return 0
        coding_start = int(tmp[:split_flag])
        coding_end = int(tmp[split_flag + 1:])
        sequence_dict[mrna_id] = (coding_start, fasta_record.seq, coding_end)
    print("sequence_dict ready")

    df = pd.read_csv(dfname, sep='\t', header=None)
    change_df = extract_transcript_change(df)

    # Row positions of change_df grouped by transcript id.
    trans_index_dict = {}
    for i, mrna in enumerate(change_df['mrna']):
        trans_index_dict.setdefault(mrna, []).append(i)

    my_seqs = []
    k_cnt = 0
    hom_cnt = 0
    het_cnt = 0

    for k, row_indices in trans_index_dict.items():
        if k not in protein_id_dict:
            continue
        k_cnt += 1
        pid = protein_id_dict[k][0]
        gid = protein_id_dict[k][1]
        strand = strand_dict[k]

        if strand == '+':
            transcript = str(sequence_dict[k][1])
        else:
            transcript = str(
                Seq(str(sequence_dict[k][1]),
                    IUPAC.ambiguous_dna).complement())
        # Header coordinates are 1-based and inclusive.
        transcript = transcript[int(sequence_dict[k][0]) - 1:
                                int(sequence_dict[k][2])]

        shift = 0  # net length change caused by the hom indels applied so far
        des = ""
        het_list = []
        hom_position_list = []
        for i in row_indices:
            row = change_df.iloc[i]
            if row['snp_type'] != 'hom':
                het_list.append(i)
                het_cnt += 1
                continue
            hom_cnt += 1
            mutation_type = row['mutation_type']
            hom_position_list.append((int(row['c_start']), mutation_type))
            transcript = apply_change(transcript, row, strand, shift)
            des += describe(row) + '_'
            span = int(row['c_end']) - int(row['c_start']) + 1
            if mutation_type.find('del') != -1:
                shift -= span
            elif mutation_type.find('ins') != -1:
                shift += span

        count = int(len(transcript) / 900)
        if len(transcript) <= 900:
            count = 1
        cnt = 0
        for window in range(0, count):
            start = window * 900
            if start + 1799 < len(transcript):
                stop = start + 1799
            else:
                stop = len(transcript) - 1
            id_prefix = db + '|' + pid + '|' + k + '|' + gid + '_'
            id_suffix = ':' + str(start + 1) + '-' + str(stop + 1)

            if len(hom_position_list) != 0:
                new_sequence = ""
                new_des = ""
                for position, mutation_type in hom_position_list:
                    if start <= position - 1 <= stop:
                        tail = des[des.find(str(position)):]
                        new_des += (str(mutation_type) + ":" +
                                    tail[:tail.find('_') + 1])
                        new_sequence = transcript
                if len(new_sequence) != 0:
                    new_sequence = new_sequence[start:stop + 1]
                    new_seq = to_protein(new_sequence, strand)
                    cnt += 1
                    new_seq = strip_none(new_seq)
                    my_seqs.append(SeqRecord(
                        Seq(str(new_seq), IUPAC.protein),
                        id=id_prefix + str(cnt) + id_suffix,
                        description=new_des))

            for het_index in het_list:
                row = change_df.iloc[het_index]
                new_sequence = ""
                new_des = ""
                if start <= int(row['c_start']) <= stop:
                    new_sequence = apply_change(transcript, row, strand,
                                                shift)
                    new_des = des + describe(row)
                if len(new_sequence) != 0:
                    new_sequence = new_sequence[start:stop + 1]
                    new_seq = to_protein(new_sequence, strand)
                    cnt += 1
                    new_seq = strip_none(new_seq)
                    my_seqs.append(SeqRecord(
                        Seq(str(new_seq), IUPAC.protein),
                        id=id_prefix + str(cnt) + id_suffix,
                        description=new_des))

    print("The number of proteins related is " + str(k_cnt))
    print("The number of sequences generated is " + str(len(my_seqs)))
    print("The number of homozygous is " + str(hom_cnt))
    print("The number of heterozygous is " + str(het_cnt))

    hom_seq = 0
    het_seq = 0
    hom_het_seq = 0
    # The context manager guarantees the FASTA file is flushed and closed.
    with open(dataset_name + "_all_mutation_" + version + ".fasta",
              "w") as handle:
        for sequence in my_seqs:
            description = str(sequence.description)
            # hom-only descriptions end with '_', het-only ones contain no
            # '_' at all, anything else mixes both kinds of change.
            if description[-1] == '_':
                hom_seq += 1
            elif description.find('_') == -1:
                het_seq += 1
            else:
                hom_het_seq += 1
            SeqIO.write(sequence, handle, "fasta")
    print("The number of homozygous sequences is " + str(hom_seq))
    print("The number of heterozygous sequences is " + str(het_seq))
    print("The number of mixed sequences is " + str(hom_het_seq))
def read_taxon_id(run_folder):
    """Search for the taxon ID of every species folder in ``run_folder``.

    For a GenBank file (name containing ``.gbk``) look for the ``taxon:``
    key in the ``db_xref`` qualifier of the source features of the first
    record.  For a GFF file (name containing ``.gff``) look for
    ``taxon:`` in the ``Dbxref`` attribute of the first ``region``
    feature.  PathoLogic files (name containing ``.pf``) carry no taxon
    ID.

    The taxon ID is reset for every folder, so a folder in which no taxon
    ID can be found is reported as ``"missing"`` instead of inheriting
    the ID of the previously processed folder.

    Args:
        run_folder (str): path to the input folder

    Returns:
        dict: folder name -> taxon ID (``"missing"`` when none was found)

    Raises:
        IndexError: a GFF file has no ``region`` feature.
        KeyError: the first ``region`` feature of a GFF file has no
            ``Dbxref`` attribute.
    """
    taxon_ids = {}

    for input_folder in os.listdir(run_folder):
        input_folder_path = os.path.join(run_folder, input_folder)
        # Reset per folder: never leak the previous folder's taxon ID.
        taxon_id = "missing"
        for input_file in os.listdir(input_folder_path):
            if '.gbk' in input_file:
                gbk_pathname = os.path.join(input_folder_path, input_file)
                # Take the species name and the taxon id from the genbank file.
                with open(gbk_pathname, "r") as gbk:
                    # Only the first record (first contig/chromosome) is read.
                    try:
                        first_seq_record = next(SeqIO.parse(gbk, "genbank"))
                    except StopIteration:
                        logger.critical(
                            'Issue with the genbank {0}, it can be empty or malformatted.'
                            .format(gbk_pathname))
                        continue
                    # The source feature contains the taxon ID in the
                    # db_xref qualifier.
                    src_features = [
                        feature for feature in first_seq_record.features
                        if feature.type == "source"
                    ]
                    for src_feature in src_features:
                        try:
                            src_dbxref_qualifiers = src_feature.qualifiers[
                                'db_xref']
                        except KeyError:
                            logger.info(
                                'No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'
                                .format(gbk_pathname))
                            continue
                        for src_dbxref_qualifier in src_dbxref_qualifiers:
                            if 'taxon:' in src_dbxref_qualifier:
                                taxon_id = src_dbxref_qualifier.replace(
                                    'taxon:', '')

            elif '.gff' in input_file:
                gff_pathname = os.path.join(input_folder_path, input_file)
                # Instead of creating a database from the GFF, stream the
                # file and stop at the first region feature.
                region_feature = next(
                    (feature for feature in DataIterator(gff_pathname)
                     if feature.featuretype == 'region'), None)
                if region_feature is None:
                    raise IndexError(
                        'No region feature in the GFF file of {0}, GFF file must have region features.'
                        .format(input_folder))

                try:
                    dbxrefs = region_feature.attributes['Dbxref']
                except KeyError:
                    raise KeyError(
                        'No Dbxref in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'
                        .format(input_folder))

                for dbxref in dbxrefs:
                    # Test for the exact prefix that is split on, so a value
                    # merely containing "taxon" cannot raise IndexError.
                    if 'taxon:' in dbxref:
                        taxon_id = dbxref.split('taxon:')[1]

            elif '.pf' in input_file:
                logger.info(
                    'No taxon ID associated to a PathoLogic Format. {0} will have a missing taxon_id'
                    .format(input_folder))
                taxon_id = "missing"

        taxon_ids[input_folder] = taxon_id

    return taxon_ids
def _write_element_options(genetic_writer, options):
    """Write the optional CIRCULAR?, TYPE and CODON-TABLE rows present in ``options``."""
    for option_key, row_label in (('circular', 'CIRCULAR?'),
                                  ('element_type', 'TYPE'),
                                  ('codon_table', 'CODON-TABLE')):
        if option_key in options:
            genetic_writer.writerow([row_label, options[option_key]])


def create_flats_and_lisp(run_folder, taxon_file):
    """Read Genbank/GFF/PF files and create the files needed by Pathway Tools.

    Also create a lisp file used to produce flat files from the Pathway
    Tools results.  The PGDB ID is the base name of ``run_folder``.

    Create organism-params.dat:
        ID              pgdb_id
        STORAGE         FILE
        NCBI-TAXON-ID   taxon_id
        NAME            species_name
        REF-ORGID       one row per reference PGDB, when provided

    Create genetic-elements.dat:
        NAME
        ANNOT-FILE      annotation file name
        SEQ-FILE        fasta file name (GFF and PF inputs)
        CIRCULAR? / TYPE / CODON-TABLE when provided
        //

    Create flat_files_creation.lisp through ``create_flat_creation_script``.

    Args:
        run_folder (str): path of the species folder
        taxon_file (bool): Boolean indicating if a taxon_file must be used

    Returns:
        bool or None: True if all three files have been created, None
        when the input is unusable (empty/malformed GenBank, missing
        organism, missing fasta or region feature for a GFF, or a taxon
        ID error).
    """
    # Look for Genbank/GFF files in the run folder.
    # PGDB ID corresponds to the name of the species folder.
    pgdb_id = os.path.basename(run_folder)
    gbk_name = pgdb_id + ".gbk"
    gbk_pathname = os.path.join(run_folder, gbk_name)
    gbff_name = pgdb_id + ".gbff"
    gbff_pathname = os.path.join(run_folder, gbff_name)
    gff_name = pgdb_id + ".gff"
    gff_pathname = os.path.join(run_folder, gff_name)

    organism_dat = os.path.join(run_folder, 'organism-params.dat')
    genetic_dat = os.path.join(run_folder, 'genetic-elements.dat')
    lisp_pathname = os.path.join(run_folder, 'flat_files_creation.lisp')

    fasta_extensions = ['.fasta', '.fsa']

    taxon_id = ""
    taxon_error = False
    species_name = ""
    taxon_datas = {}
    # Only set by the GFF branch; initialised so that a folder holding both
    # a GenBank and a GFF file does not hit an unbound name further down.
    gff_fasta = None

    if os.path.isfile(gbk_pathname) or os.path.isfile(gbff_pathname):
        if os.path.isfile(gbk_pathname):
            input_name = gbk_name
            input_path = gbk_pathname
        else:
            input_name = gbff_name
            input_path = gbff_pathname
        # Take the species name and the taxon id from the genbank file.
        with open(input_path, "r") as gbk:
            # Only the first record (first contig/chromosome) is read.
            try:
                first_seq_record = next(SeqIO.parse(gbk, "genbank"))
            except StopIteration:
                logger.critical(
                    'Issue with the genbank {0}, it can be empty or malformatted.'
                    .format(input_path))
                return None

            try:
                species_name = first_seq_record.annotations['organism']
            except KeyError:
                logger.critical(
                    'No organism in the Genbank {0} In the SOURCE you must have: ORGANISM Species name'
                    .format(pgdb_id))
                return None

            # The source feature contains the taxon ID in the db_xref
            # qualifier.
            src_features = [
                feature for feature in first_seq_record.features
                if feature.type == "source"
            ]
            for src_feature in src_features:
                if 'db_xref' in src_feature.qualifiers:
                    src_dbxref_qualifiers = src_feature.qualifiers['db_xref']
                    for src_dbxref_qualifier in src_dbxref_qualifiers:
                        if 'taxon:' in src_dbxref_qualifier:
                            taxon_id = src_dbxref_qualifier.replace(
                                'taxon:', '')
            if not taxon_id:
                # Report the file that was actually read (.gbk or .gbff).
                logger.info(
                    'No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'
                    .format(input_path))
                logger.info('Try to look in the taxon_id.tsv file')
                taxon_error, taxon_id, taxon_datas = extract_taxon_id(
                    run_folder, pgdb_id, taxon_id, taxon_file)

        # NOTE(review): when the taxon ID was missing AND taxon_file is set,
        # extract_taxon_id runs a second time here; kept as in the original.
        if taxon_file:
            taxon_error, taxon_id, taxon_datas = extract_taxon_id(
                run_folder, pgdb_id, taxon_id, taxon_file)

    elif os.path.isfile(gff_pathname):
        input_name = gff_name
        # Check if there is a fasta file.
        for fasta_extension in fasta_extensions:
            fasta_input_name = input_name.replace('.gff', fasta_extension)
            fasta_path = os.path.join(run_folder, fasta_input_name)
            if os.path.exists(fasta_path):
                gff_fasta = fasta_input_name
        if not gff_fasta:
            logger.critical(
                'No fasta file (.fasta or .fsa) with the GFF of {0}'.format(
                    pgdb_id))
            return None

        # Instead of parsing and creating a database from the GFF, parse the
        # file and extract the first region feature.
        try:
            region_feature = [
                feature for feature in DataIterator(gff_pathname)
                if feature.featuretype == 'region'
            ][0]
        except IndexError:
            logger.critical(
                'No region feature in the GFF file of {0}, GFF file must have region features.'
                .format(pgdb_id))
            return None

        try:
            dbxrefs = region_feature.attributes['Dbxref']
        except KeyError:
            logger.critical(
                'No Dbxref in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'
                .format(pgdb_id))
            # Previously the attribute was indexed again right after this
            # message and raised KeyError; fall through to the taxon_id.tsv
            # lookup below instead.
            dbxrefs = []

        for dbxref in dbxrefs:
            # Test for the exact prefix that is split on, so a value merely
            # containing "taxon" cannot raise IndexError.
            if 'taxon:' in dbxref:
                taxon_id = dbxref.split('taxon:')[1]
        if not taxon_id or taxon_file:
            if not taxon_id:
                logger.info(
                    'Missing "taxon:" in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'
                    .format(pgdb_id))
                logger.info('Try to look in the taxon_id.tsv file')
            taxon_error, taxon_id, taxon_datas = extract_taxon_id(
                run_folder, pgdb_id, taxon_id, taxon_file)

    # Look for PF files.
    # NOTE(review): all() over a list holding only True values (or nothing)
    # is always True, so this branch runs whenever no GenBank/GFF file was
    # found; any() may have been intended. Behaviour kept as is.
    elif all([
            True for species_file in os.listdir(run_folder)
            if '.pf' in species_file or '.fasta' in species_file
            or '.fsa' in species_file
    ]):
        for species_file in os.listdir(run_folder):
            if '.pf' in species_file:
                # Check if there is a fasta file.
                pf_fasta = None
                for fasta_extension in fasta_extensions:
                    fasta_species_name = species_file.replace(
                        '.pf', fasta_extension)
                    fasta_path = os.path.join(run_folder, fasta_species_name)
                    if os.path.exists(fasta_path):
                        pf_fasta = fasta_species_name
                if not pf_fasta:
                    logger.critical(
                        'No fasta file (.fasta or .fsa) with the Pathologic file of {0}, this could lead to warnings in Pathway Tools.'
                        .format(pgdb_id))

        taxon_error, taxon_id, taxon_datas = extract_taxon_id(
            run_folder, pgdb_id, taxon_id, taxon_file)

    if taxon_error == True:
        logger.critical('Issue with taxon ID of {0}.'.format(run_folder))
        return None

    # Create the organism-params dat file.
    with open(organism_dat, 'w', encoding='utf-8') as organism_file:
        organism_writer = csv.writer(organism_file,
                                     delimiter='\t',
                                     lineterminator='\n')
        organism_writer.writerow(['ID', pgdb_id])
        organism_writer.writerow(['STORAGE', "FILE"])
        organism_writer.writerow(['NCBI-TAXON-ID', taxon_id])
        organism_writer.writerow(['NAME', species_name])
        if 'reference_pgdbs' in taxon_datas:
            for reference_pgdb in taxon_datas['reference_pgdbs']:
                organism_writer.writerow(['REF-ORGID', reference_pgdb])

    # Create the genetic-elements dat file.
    with open(genetic_dat, 'w', encoding='utf-8') as genetic_file:
        if os.path.isfile(gff_pathname) or os.path.isfile(
                gbk_pathname) or os.path.isfile(gbff_pathname):
            genetic_writer = csv.writer(genetic_file,
                                        delimiter='\t',
                                        lineterminator='\n')
            genetic_writer.writerow(['NAME', ''])
            genetic_writer.writerow(['ANNOT-FILE', input_name])
            # gff_fasta is set only when the GFF file was the input used.
            if gff_fasta is not None:
                genetic_writer.writerow(['SEQ-FILE', gff_fasta])
            _write_element_options(genetic_writer, taxon_datas)
            genetic_writer.writerow(['//'])
        # NOTE(review): always True, see the note on the PF branch above.
        elif all([
                True for species_file in os.listdir(run_folder)
                if '.pf' in species_file or '.fasta' in species_file
                or '.fsa' in species_file
        ]):
            genetic_writer = csv.writer(genetic_file,
                                        delimiter='\t',
                                        lineterminator='\n')
            for species_file in os.listdir(run_folder):
                if '.pf' not in species_file:
                    continue
                species_file_name = os.path.splitext(species_file)[0]
                element_name = species_file.replace('.pf', '')
                genetic_writer.writerow(['NAME', element_name])
                genetic_writer.writerow(['ID', element_name])
                genetic_writer.writerow(['ANNOT-FILE', species_file])
                fasta_path = os.path.join(
                    run_folder, species_file.replace('.pf', '.fasta'))
                fsa_path = os.path.join(
                    run_folder, species_file.replace('.pf', '.fsa'))
                if os.path.exists(fasta_path):
                    genetic_writer.writerow(
                        ['SEQ-FILE',
                         species_file.replace('.pf', '.fasta')])
                elif os.path.exists(fsa_path):
                    genetic_writer.writerow(
                        ['SEQ-FILE',
                         species_file.replace('.pf', '.fsa')])

                # Options specific to this element take precedence over the
                # folder-wide ones.
                if species_file_name in taxon_datas:
                    _write_element_options(genetic_writer,
                                           taxon_datas[species_file_name])
                else:
                    _write_element_options(genetic_writer, taxon_datas)
                genetic_writer.writerow(['//'])

    # Create the lisp script.
    check_lisp_file = create_flat_creation_script(pgdb_id, lisp_pathname)

    return all([
        os.path.isfile(organism_dat),
        os.path.isfile(genetic_dat), check_lisp_file
    ])