def ParseGFF(fname,
             chr_ids=('Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC'),
             source='TAIR10'):
    """Print transcript and UTR/CDS rows from a GFF file to stdout.

    The file is re-read once per chromosome id, with the GFF parser limited
    to that id and to ``source``. For every sub-feature of each ``gene``
    feature one space-separated row is printed:

        <id> NULL <record id> <strand> <type> <start> <end> <gene Note>

    followed by one row per ``five_prime_UTR`` / ``three_prime_UTR`` /
    ``CDS`` sub-feature of it:

        <parent id>_<n> <parent id> <record id> <strand> <type> <start> <end> NULL

    UTR types are relabelled ``UTR5`` / ``UTR3`` (the feature's ``type``
    attribute is overwritten, as before).

    Args:
        fname: path of the GFF file.
        chr_ids: sequence ids to process, in output order. Defaults to the
            previously hard-coded chromosome list.
        source: GFF source column value to keep. Defaults to 'TAIR10'.

    Raises:
        KeyError: if a printed feature's strand is neither 1 nor -1.
    """
    smap = {1: '+', -1: '-'}
    # Sub-feature types that get their own row, mapped to the printed label.
    part_label = {
        'five_prime_UTR': 'UTR5',
        'three_prime_UTR': 'UTR3',
        'CDS': 'CDS',
    }

    def emit(*fields):
        # Single-argument print behaves the same under Python 2 and 3.
        print(' '.join(str(field) for field in fields))

    for cid in chr_ids:
        limit_info = dict(gff_id=[cid], gff_source=[source])
        # NOTE: 'rU' is the Python 2 universal-newline mode; Python 3.11+
        # rejects the 'U' flag.
        with open(fname, 'rU') as fh:
            for rec in GFF.parse(fh, limit_info=limit_info):
                for each_rec in rec.features:
                    if each_rec.type != 'gene':
                        continue
                    for child in each_rec.sub_features:
                        emit(child.id, 'NULL', rec.id, smap[child.strand],
                             child.type,
                             child.location._start.position,
                             child.location._end.position,
                             each_rec.qualifiers['Note'][0])
                        # NOTE(review): counter assumed to advance only for
                        # printed parts -- confirm against the unflattened file.
                        cnt = 1
                        for cod in child.sub_features:
                            if cod.type not in part_label:
                                continue
                            cod.type = part_label[cod.type]
                            emit(child.id + '_' + str(cnt), child.id, rec.id,
                                 smap[cod.strand], cod.type,
                                 cod.location._start.position,
                                 cod.location._end.position,
                                 'NULL')
                            cnt += 1
                # NOTE(review): the loop level of this `break` was
                # reconstructed from flattened source; here it stops after
                # the first record of each chromosome -- confirm.
                break
def __main__():
    """Print the features of one chromosome/source of a GFF file sorted by position.

    Command line arguments: <gff_file> <chr_id> <source>.

    The file is parsed once to collect the distinct (start, end) pairs of
    the top-level features, then ``WriteSortedGFF`` re-reads it and prints
    the features in that sorted order. Exits with status -1 when an
    argument is missing.
    """
    try:
        gff_file = sys.argv[1]
        chrid = sys.argv[2]
        source = sys.argv[3]
    except IndexError:
        # Only a missing argument lands here; other errors now propagate.
        sys.stderr.write('Access denied for a GFF file !\n')
        sys.stderr.write('Usage: %s <gff_file> <chr_id> <source>\n' % sys.argv[0])
        sys.exit(-1)
    # Example sources: 'rheMac2_ensGene', 'rheMac2_refGene',
    # 'rheMac2_refSeqAnno', 'rheMac2_transMapAlnUcscGenes'
    limit_parse = dict(gff_id=[chrid], gff_source=[source])

    # A set keeps one entry per distinct coordinate pair.
    chrpos = set()
    with open(gff_file) as fh:
        for rec in GFF.parse(fh, limit_info=limit_parse):
            for element in rec.features:
                chrpos.add((element.location._start.position,
                            element.location._end.position))

    WriteSortedGFF(gff_file, limit_parse, sorted(chrpos))
def ParserCommonType(fname, source):
    """Group transcript features of a GFF file into per-gene packets.

    Only ``transcript`` and ``exon`` lines of the given ``source`` on
    sequence ids I, II, III, IV, V and X are parsed. Each top-level feature
    is attached to the gene named by its ``gene`` qualifier; a gene seen
    more than once is flagged ``is_alt_spliced``.

    Args:
        fname: path of the GFF file.
        source: value of the GFF source column to keep.

    Returns:
        list with one ``OrganizePacket(gene)`` result per gene, in
        ``gid.ordered_items()`` order.
    """
    gff_type = ['transcript', 'exon']
    # list() keeps this a real list under Python 3, where zip() is lazy.
    source_type = list(zip([source] * len(gff_type), gff_type))
    filter_type = dict(gff_source_type=source_type,
                       gff_id=['I', 'II', 'III', 'IV', 'V', 'X'])
    gene_cnt = 1
    gid = ordered_dict()

    with open(fname) as gfh:
        for rec in GFF.parse(gfh, limit_info=filter_type):
            for feature in rec.features:
                gene_name = feature.qualifiers['gene'][0]
                # Strand is taken from this feature. Previously the
                # known-gene branch reused whatever `orient` the last newly
                # created gene had left behind.
                orient = '+' if feature.strand == 1 else '-'

                if gene_name in gid:
                    gene = gid[gene_name]
                    gene['is_alt_spliced'] = 1
                else:
                    gene = init_gene()
                    gene['id'] = gene_cnt
                    gene['name'] = gene_name
                    gene['chr'] = rec.id
                    gene['source'] = feature.qualifiers['source'][0]
                    gene['strand'] = orient
                    gene['is_alt_spliced'] = 0
                    gene_cnt += 1

                gene['transcripts'].append(feature.id)
                # Start is shifted by one; presumably converting a 0-based
                # parser start to 1-based output -- TODO confirm.
                exon_pos = [[xlevel.location._start.position + 1,
                             xlevel.location._end.position]
                            for xlevel in feature.sub_features]
                # Minus-strand exons listed in descending order are flipped
                # to ascending.
                if (orient == '-' and len(exon_pos) > 1
                        and exon_pos[0][0] > exon_pos[-1][0]):
                    exon_pos.reverse()
                gene['exons'].append(exon_pos)
                gid[gene_name] = gene

    return [OrganizePacket(ent[1]) for ent in gid.ordered_items()]
def get_annotation(fname, rfname):
    """Parse genome annotation from GFF3 file.

    The GFF file is re-read once per chromosome id (Chr1-5, ChrM, ChrC),
    limited to source 'TAIR10'. Feature spans are stored per chromosome as
    ``{chromosome: {key: strand}}`` with strand '+' or '-':

    * ``gene`` features having at least one mRNA / miRNA / ncRNA / snoRNA /
      snRNA / tRNA sub-feature, and ``pseudogene`` features -> oth_featdb,
      keyed by (start, end)
    * ``transposable_element_gene`` -> teg_featdb, keyed by (start, end)
    * ``transposable_element`` -> te_featdb, keyed by (id, start, end)

    rRNA locations come from ``rfname`` only: one record per line with
    single-space separated fields ``<chromosome> <start> <end> <strand>``.

    Args:
        fname: path of the GFF3 annotation file.
        rfname: path of the additional rRNA location file.

    Returns:
        tuple (te_featdb, teg_featdb, ribo_featdb, oth_featdb).

    Raises:
        KeyError: if a stored feature's strand is neither 1 nor -1.
    """
    smap = {1: '+', -1: '-'}
    from Core import GFF
    te_featdb, teg_featdb, ribo_featdb, oth_featdb = {}, {}, {}, {}
    rna_types = ('mRNA', 'miRNA', 'ncRNA', 'snoRNA', 'snRNA', 'tRNA')

    def span(feat):
        return (int(feat.location._start.position),
                int(feat.location._end.position))

    for cid in ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC']:
        sys.stderr.write(cid + ".....\n")
        limit_info = dict(gff_id=[cid], gff_source=['TAIR10'])
        # NOTE: 'rU' is the Python 2 universal-newline mode; Python 3.11+
        # rejects the 'U' flag.
        with open(fname, 'rU') as fh:
            for rec in GFF.parse(fh, limit_info=limit_info):
                for each_rec in rec.features:
                    if each_rec.type == 'gene':
                        # One matching child is enough: every match wrote
                        # the same key and value.
                        if any(child.type in rna_types
                               for child in each_rec.sub_features):
                            oth_featdb.setdefault(cid, {})[span(each_rec)] = smap[each_rec.strand]
                    elif each_rec.type == 'transposable_element_gene':
                        teg_featdb.setdefault(cid, {})[span(each_rec)] = smap[each_rec.strand]
                    elif each_rec.type == 'transposable_element':
                        key = (each_rec.id,) + span(each_rec)
                        te_featdb.setdefault(cid, {})[key] = smap[each_rec.strand]
                    elif each_rec.type == 'pseudogene':
                        oth_featdb.setdefault(cid, {})[span(each_rec)] = smap[each_rec.strand]

    ## add additional rRNA related location from A. thaliana
    with open(rfname, 'rU') as fh:
        for line in fh:
            if not line.strip():
                # Blank lines used to raise IndexError.
                continue
            fields = line.strip('\n\r').split(' ')
            ribo_featdb.setdefault(fields[0], {})[(int(fields[1]), int(fields[2]))] = fields[3]
    return te_featdb, teg_featdb, ribo_featdb, oth_featdb
def get_Feature(fname,
                chr_ids=('Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC'),
                source='TAIR10'):
    """Extract genome annotation information from a provided GFF file.

    The file is re-read once per chromosome id, limited to ``source``; each
    id is printed to stdout before it is processed. Spans are stored as
    ``{chromosome: {(start, end): strand}}`` using the parser's raw strand
    value:

    * cg_featdb   -- ``gene`` features with an mRNA sub-feature
    * oth_featdb  -- ``gene`` features whose first sub-feature is miRNA,
                     ncRNA, snoRNA, snRNA or tRNA
    * ribo_featdb -- ``gene`` features whose first sub-feature is rRNA
    * psd_featdb  -- ``pseudogene`` features
    * te_featdb   -- ``transposable_element`` and
                     ``transposable_element_gene`` features

    Args:
        fname: path of the GFF file.
        chr_ids: sequence ids to process. Defaults to the previously
            hard-coded chromosome list.
        source: GFF source column value to keep. Defaults to 'TAIR10'.

    Returns:
        tuple (te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb).
    """
    from Core import GFF
    te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb = {}, {}, {}, {}, {}
    small_rna_types = ['miRNA', 'ncRNA', 'snoRNA', 'snRNA', 'tRNA']

    def record(featdb, chrom, feat):
        # Store the feature's span under its chromosome, creating the
        # per-chromosome dict on first use.
        key = (int(feat.location._start.position),
               int(feat.location._end.position))
        featdb.setdefault(chrom, {})[key] = feat.strand

    for cid in chr_ids:
        print(cid)
        limit_info = dict(gff_id=[cid], gff_source=[source])
        # NOTE: 'rU' is the Python 2 universal-newline mode; Python 3.11+
        # rejects the 'U' flag.
        with open(fname, 'rU') as fh:
            for rec in GFF.parse(fh, limit_info=limit_info):
                for each_rec in rec.features:
                    if each_rec.type == 'gene':
                        for child in each_rec.sub_features:
                            # NOTE(review): the non-mRNA branches test the
                            # gene's FIRST sub-feature, not `child`; kept
                            # as in the original -- confirm this is intended.
                            if child.type == 'mRNA':
                                record(cg_featdb, cid, each_rec)
                            elif each_rec.sub_features[0].type in small_rna_types:
                                record(oth_featdb, cid, each_rec)
                            elif each_rec.sub_features[0].type == 'rRNA':
                                record(ribo_featdb, cid, each_rec)
                    elif each_rec.type == 'pseudogene':
                        record(psd_featdb, cid, each_rec)
                    elif each_rec.type in ['transposable_element', 'transposable_element_gene']:
                        record(te_featdb, cid, each_rec)
    return te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb
def __main__():
    """Convert a GFF file and its FASTA sequences into a GenBank file.

    Command line arguments: <gff_file> <fasta_file> <genbank_output_file>.

    The FASTA records are loaded into a dictionary and passed to
    ``GFF.parse`` as its second argument; the parsed records are written
    with ``SeqIO.write`` in "genbank" format. When an argument is missing
    the module docstring is printed and the process exits with status -1.
    """
    try:
        gff_fname = sys.argv[1]
        fasta_fname = sys.argv[2]
        gb_fname = sys.argv[3]
    except IndexError:
        # Only a missing argument lands here; other errors now propagate.
        # `__doc__` resolves to the module docstring, not this function's.
        print(__doc__)
        sys.exit(-1)
    fasta_rec = SeqIO.to_dict(SeqIO.parse(fasta_fname, "fasta", generic_dna))
    gff_rec = GFF.parse(gff_fname, fasta_rec)
    SeqIO.write(gff_rec, gb_fname, "genbank")
def ParserCommonType(fname, source):
    """Group transcript features of a GFF file into per-gene packets.

    Only ``transcript`` and ``exon`` lines of the given ``source`` on
    sequence ids I, II, III, IV, V and X are parsed. Each top-level feature
    is attached to the gene named by its ``gene`` qualifier; a gene seen
    more than once is flagged ``is_alt_spliced``.

    Args:
        fname: path of the GFF file.
        source: value of the GFF source column to keep.

    Returns:
        list with one ``OrganizePacket(gene)`` result per gene, in
        ``gid.ordered_items()`` order.
    """
    gff_type = ['transcript', 'exon']
    # list() keeps this a real list under Python 3, where zip() is lazy.
    source_type = list(zip([source] * len(gff_type), gff_type))
    filter_type = dict(gff_source_type=source_type,
                       gff_id=['I', 'II', 'III', 'IV', 'V', 'X'])
    gene_cnt = 1
    gid = ordered_dict()

    with open(fname) as gfh:
        for rec in GFF.parse(gfh, limit_info=filter_type):
            for feature in rec.features:
                gene_name = feature.qualifiers['gene'][0]
                # Strand is taken from this feature. Previously the
                # known-gene branch reused whatever `orient` the last newly
                # created gene had left behind.
                orient = '+' if feature.strand == 1 else '-'

                if gene_name in gid:
                    gene = gid[gene_name]
                    gene['is_alt_spliced'] = 1
                else:
                    gene = init_gene()
                    gene['id'] = gene_cnt
                    gene['name'] = gene_name
                    gene['chr'] = rec.id
                    gene['source'] = feature.qualifiers['source'][0]
                    gene['strand'] = orient
                    gene['is_alt_spliced'] = 0
                    gene_cnt += 1

                gene['transcripts'].append(feature.id)
                # Start is shifted by one; presumably converting a 0-based
                # parser start to 1-based output -- TODO confirm.
                exon_pos = [[xlevel.location._start.position + 1,
                             xlevel.location._end.position]
                            for xlevel in feature.sub_features]
                # Minus-strand exons listed in descending order are flipped
                # to ascending.
                if (orient == '-' and len(exon_pos) > 1
                        and exon_pos[0][0] > exon_pos[-1][0]):
                    exon_pos.reverse()
                gene['exons'].append(exon_pos)
                gid[gene_name] = gene

    return [OrganizePacket(ent[1]) for ent in gid.ordered_items()]
def ParseGFF(fname,
             chr_ids=('Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC'),
             source='TAIR10'):
    """Print transcript and UTR/CDS rows from a GFF file to stdout.

    The file is re-read once per chromosome id, with the GFF parser limited
    to that id and to ``source``. For every sub-feature of each ``gene``
    feature one space-separated row is printed:

        <id> NULL <record id> <strand> <type> <start> <end> <gene Note>

    followed by one row per ``five_prime_UTR`` / ``three_prime_UTR`` /
    ``CDS`` sub-feature of it:

        <parent id>_<n> <parent id> <record id> <strand> <type> <start> <end> NULL

    UTR types are relabelled ``UTR5`` / ``UTR3`` (the feature's ``type``
    attribute is overwritten, as before).

    Args:
        fname: path of the GFF file.
        chr_ids: sequence ids to process, in output order. Defaults to the
            previously hard-coded chromosome list.
        source: GFF source column value to keep. Defaults to 'TAIR10'.

    Raises:
        KeyError: if a printed feature's strand is neither 1 nor -1.
    """
    smap = {1: '+', -1: '-'}
    # Sub-feature types that get their own row, mapped to the printed label.
    part_label = {
        'five_prime_UTR': 'UTR5',
        'three_prime_UTR': 'UTR3',
        'CDS': 'CDS',
    }

    def emit(*fields):
        # Single-argument print behaves the same under Python 2 and 3.
        print(' '.join(str(field) for field in fields))

    for cid in chr_ids:
        limit_info = dict(gff_id=[cid], gff_source=[source])
        # NOTE: 'rU' is the Python 2 universal-newline mode; Python 3.11+
        # rejects the 'U' flag.
        with open(fname, 'rU') as fh:
            for rec in GFF.parse(fh, limit_info=limit_info):
                for each_rec in rec.features:
                    if each_rec.type != 'gene':
                        continue
                    for child in each_rec.sub_features:
                        emit(child.id, 'NULL', rec.id, smap[child.strand],
                             child.type,
                             child.location._start.position,
                             child.location._end.position,
                             each_rec.qualifiers['Note'][0])
                        # NOTE(review): counter assumed to advance only for
                        # printed parts -- confirm against the unflattened file.
                        cnt = 1
                        for cod in child.sub_features:
                            if cod.type not in part_label:
                                continue
                            cod.type = part_label[cod.type]
                            emit(child.id + '_' + str(cnt), child.id, rec.id,
                                 smap[cod.strand], cod.type,
                                 cod.location._start.position,
                                 cod.location._end.position,
                                 'NULL')
                            cnt += 1
                # NOTE(review): the loop level of this `break` was
                # reconstructed from flattened source; here it stops after
                # the first record of each chromosome -- confirm.
                break
def __main__():
    """Print the features of one chromosome/source of a GFF file sorted by position.

    Command line arguments: <gff_file> <chr_id> <source>.

    The file is parsed once to collect the distinct (start, end) pairs of
    the top-level features, then ``WriteSortedGFF`` re-reads it and prints
    the features in that sorted order. Exits with status -1 when an
    argument is missing.
    """
    try:
        gff_file = sys.argv[1]
        chrid = sys.argv[2]
        source = sys.argv[3]
    except IndexError:
        # Only a missing argument lands here; other errors now propagate.
        sys.stderr.write('Access denied for a GFF file !\n')
        sys.stderr.write('Usage: %s <gff_file> <chr_id> <source>\n' % sys.argv[0])
        sys.exit(-1)
    # Example sources: 'rheMac2_ensGene', 'rheMac2_refGene',
    # 'rheMac2_refSeqAnno', 'rheMac2_transMapAlnUcscGenes'
    limit_parse = dict(gff_id=[chrid], gff_source=[source])

    # A set keeps one entry per distinct coordinate pair.
    chrpos = set()
    with open(gff_file) as fh:
        for rec in GFF.parse(fh, limit_info=limit_parse):
            for element in rec.features:
                chrpos.add((element.location._start.position,
                            element.location._end.position))

    WriteSortedGFF(gff_file, limit_parse, sorted(chrpos))
def get_Feature(fname):
    """Extract genome annotation information from a provided GFF file.

    Fills the module-level dictionaries ``te_featdb``, ``psd_featdb``,
    ``cg_featdb``, ``oth_featdb`` and ``ribo_featdb`` (they must already
    exist) with ``{chromosome: {(start, end): strand}}`` entries, using the
    parser's raw strand value. The file is re-read once per chromosome id
    (Chr1-5, ChrM, ChrC), limited to source 'TAIR10'; each id is printed to
    stdout before it is processed.

    * cg_featdb   -- ``gene`` features with an mRNA sub-feature
    * oth_featdb  -- ``gene`` features whose first sub-feature is miRNA,
                     ncRNA, snoRNA, snRNA or tRNA
    * ribo_featdb -- ``gene`` features whose first sub-feature is rRNA,
                     plus a fixed list of extra rRNA locations
    * psd_featdb  -- ``pseudogene`` features
    * te_featdb   -- ``transposable_element`` and
                     ``transposable_element_gene`` features

    Args:
        fname: path of the GFF file.

    Returns:
        tuple (te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb).
    """
    from Core import GFF
    global te_featdb
    global psd_featdb
    global cg_featdb
    global oth_featdb
    global ribo_featdb
    small_rna_types = ['miRNA', 'ncRNA', 'snoRNA', 'snRNA', 'tRNA']

    def record(featdb, chrom, feat):
        # Store the feature's span under its chromosome, creating the
        # per-chromosome dict on first use.
        key = (int(feat.location._start.position),
               int(feat.location._end.position))
        featdb.setdefault(chrom, {})[key] = feat.strand

    for cid in ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC']:  ## change according to the GFF file TODO According to the file type add chromosome number automatically.
        print(cid)
        limit_info = dict(gff_id=[cid], gff_source=['TAIR10'])  ## change the source TODO According to the file type add source flag automatically
        # NOTE: 'rU' is the Python 2 universal-newline mode; Python 3.11+
        # rejects the 'U' flag.
        with open(fname, 'rU') as fh:
            for rec in GFF.parse(fh, limit_info=limit_info):
                for each_rec in rec.features:
                    if each_rec.type == 'gene':
                        for child in each_rec.sub_features:
                            # NOTE(review): the non-mRNA branches test the
                            # gene's FIRST sub-feature, not `child`; kept
                            # as in the original -- confirm this is intended.
                            if child.type == 'mRNA':
                                record(cg_featdb, cid, each_rec)
                            elif each_rec.sub_features[0].type in small_rna_types:
                                record(oth_featdb, cid, each_rec)
                            elif each_rec.sub_features[0].type == 'rRNA':
                                record(ribo_featdb, cid, each_rec)
                    elif each_rec.type == 'pseudogene':
                        record(psd_featdb, cid, each_rec)
                    elif each_rec.type in ['transposable_element', 'transposable_element_gene']:
                        record(te_featdb, cid, each_rec)

    # Hard-coded rRNA locations. The first two were commented as
    # "Unannotated rRNA from A thaliana genome"; the rest carried no comment.
    extra_rrna = (
        ('Chr3', (14199917, 14203578), 1),
        ('Chr2', (5784, 9683), 1),
        ('Chr2', (2821, 3704), 1),
        ('Chr3', (14196614, 14197675), 1),
        ('Chr3', (14194052, 14194611), 1),
        ('Chr3', (14199498, 14199751), 1),
        ('Chr3', (14195564, 14195739), 1),
        ('ChrM', (11426, 11883), -1),
        ('ChrM', (364594, 365124), 1),
    )
    for chrom, span, strand in extra_rrna:
        # setdefault avoids the KeyError raised when the GFF had no
        # annotated rRNA gene on that chromosome.
        ribo_featdb.setdefault(chrom, {})[span] = strand
    return te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb
def get_annotation(fname, rfname):
    """Parse genome annotation from GFF3 file.

    The GFF file is re-read once per chromosome id (Chr1-5, ChrM, ChrC),
    limited to source 'TAIR10'. Feature spans are stored per chromosome as
    ``{chromosome: {key: strand}}`` with strand '+' or '-':

    * ``gene`` features having at least one mRNA / miRNA / ncRNA / snoRNA /
      snRNA / tRNA sub-feature, and ``pseudogene`` features -> oth_featdb,
      keyed by (start, end)
    * ``transposable_element_gene`` -> teg_featdb, keyed by (start, end)
    * ``transposable_element`` -> te_featdb, keyed by (id, start, end)

    rRNA locations come from ``rfname`` only: one record per line with
    single-space separated fields ``<chromosome> <start> <end> <strand>``.

    Args:
        fname: path of the GFF3 annotation file.
        rfname: path of the additional rRNA location file.

    Returns:
        tuple (te_featdb, teg_featdb, ribo_featdb, oth_featdb).

    Raises:
        KeyError: if a stored feature's strand is neither 1 nor -1.
    """
    smap = {1: '+', -1: '-'}
    from Core import GFF
    te_featdb, teg_featdb, ribo_featdb, oth_featdb = {}, {}, {}, {}
    rna_types = ('mRNA', 'miRNA', 'ncRNA', 'snoRNA', 'snRNA', 'tRNA')

    def span(feat):
        return (int(feat.location._start.position),
                int(feat.location._end.position))

    for cid in ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC']:
        sys.stderr.write(cid + ".....\n")
        limit_info = dict(gff_id=[cid], gff_source=['TAIR10'])
        # NOTE: 'rU' is the Python 2 universal-newline mode; Python 3.11+
        # rejects the 'U' flag.
        with open(fname, 'rU') as fh:
            for rec in GFF.parse(fh, limit_info=limit_info):
                for each_rec in rec.features:
                    if each_rec.type == 'gene':
                        # One matching child is enough: every match wrote
                        # the same key and value.
                        if any(child.type in rna_types
                               for child in each_rec.sub_features):
                            oth_featdb.setdefault(cid, {})[span(each_rec)] = smap[each_rec.strand]
                    elif each_rec.type == 'transposable_element_gene':
                        teg_featdb.setdefault(cid, {})[span(each_rec)] = smap[each_rec.strand]
                    elif each_rec.type == 'transposable_element':
                        key = (each_rec.id,) + span(each_rec)
                        te_featdb.setdefault(cid, {})[key] = smap[each_rec.strand]
                    elif each_rec.type == 'pseudogene':
                        oth_featdb.setdefault(cid, {})[span(each_rec)] = smap[each_rec.strand]

    ## add additional rRNA related location from A. thaliana
    with open(rfname, 'rU') as fh:
        for line in fh:
            if not line.strip():
                # Blank lines used to raise IndexError.
                continue
            fields = line.strip('\n\r').split(' ')
            ribo_featdb.setdefault(fields[0], {})[(int(fields[1]), int(fields[2]))] = fields[3]
    return te_featdb, teg_featdb, ribo_featdb, oth_featdb
def get_Feature(fname,
                chr_ids=('Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC'),
                source='TAIR10'):
    """Extract genome annotation information from a provided GFF file.

    The file is re-read once per chromosome id, limited to ``source``; each
    id is printed to stdout before it is processed. Spans are stored as
    ``{chromosome: {(start, end): strand}}`` using the parser's raw strand
    value:

    * cg_featdb   -- ``gene`` features with an mRNA sub-feature
    * oth_featdb  -- ``gene`` features whose first sub-feature is miRNA,
                     ncRNA, snoRNA, snRNA or tRNA
    * ribo_featdb -- ``gene`` features whose first sub-feature is rRNA
    * psd_featdb  -- ``pseudogene`` features
    * te_featdb   -- ``transposable_element`` and
                     ``transposable_element_gene`` features

    Args:
        fname: path of the GFF file.
        chr_ids: sequence ids to process. Defaults to the previously
            hard-coded chromosome list.
        source: GFF source column value to keep. Defaults to 'TAIR10'.

    Returns:
        tuple (te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb).
    """
    from Core import GFF
    te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb = {}, {}, {}, {}, {}
    small_rna_types = ['miRNA', 'ncRNA', 'snoRNA', 'snRNA', 'tRNA']

    def record(featdb, chrom, feat):
        # Store the feature's span under its chromosome, creating the
        # per-chromosome dict on first use.
        key = (int(feat.location._start.position),
               int(feat.location._end.position))
        featdb.setdefault(chrom, {})[key] = feat.strand

    for cid in chr_ids:
        print(cid)
        limit_info = dict(gff_id=[cid], gff_source=[source])
        # NOTE: 'rU' is the Python 2 universal-newline mode; Python 3.11+
        # rejects the 'U' flag.
        with open(fname, 'rU') as fh:
            for rec in GFF.parse(fh, limit_info=limit_info):
                for each_rec in rec.features:
                    if each_rec.type == 'gene':
                        for child in each_rec.sub_features:
                            # NOTE(review): the non-mRNA branches test the
                            # gene's FIRST sub-feature, not `child`; kept
                            # as in the original -- confirm this is intended.
                            if child.type == 'mRNA':
                                record(cg_featdb, cid, each_rec)
                            elif each_rec.sub_features[0].type in small_rna_types:
                                record(oth_featdb, cid, each_rec)
                            elif each_rec.sub_features[0].type == 'rRNA':
                                record(ribo_featdb, cid, each_rec)
                    elif each_rec.type == 'pseudogene':
                        record(psd_featdb, cid, each_rec)
                    elif each_rec.type in ['transposable_element', 'transposable_element_gene']:
                        record(te_featdb, cid, each_rec)
    return te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb
def WriteSortedGFF(fname, limit_parse, sorted_pos):
    """Print the features of a GFF file to stdout in the order given by ``sorted_pos``.

    For each parsed record, every position in ``sorted_pos`` is visited in
    turn and each top-level feature with exactly that (start, end) is
    printed as a tab-separated GFF line, followed by its second-level and
    third-level sub-features. Printed start coordinates are the parser's
    start plus one. Strand is '+' when the feature strand equals 1 and '-'
    otherwise.

    Args:
        fname: path of the GFF file.
        limit_parse: ``limit_info`` dictionary handed to ``GFF.parse``.
        sorted_pos: iterable of hashable (start, end) tuples in output order.
    """
    def strand_symbol(feat):
        return '+' if feat.strand == 1 else '-'

    def gff_row(seq_id, feat, score, phase, attributes):
        # Assemble one 9-column GFF line for `feat`.
        return '\t'.join([
            seq_id,
            feat.qualifiers['source'][0],
            feat.type,
            str(feat.location._start.position + 1),
            str(feat.location._end.position),
            score,
            strand_symbol(feat),
            phase,
            attributes,
        ])

    with open(fname) as fgh:
        for rec in GFF.parse(fgh, limit_info=limit_parse):
            # Index features by position once instead of rescanning
            # rec.features for every sorted position. Lists keep the
            # original feature order for shared coordinates.
            by_pos = {}
            for feature in rec.features:
                key = (feature.location._start.position,
                       feature.location._end.position)
                by_pos.setdefault(key, []).append(feature)

            for sort_pos in sorted_pos:
                for feature in by_pos.get(sort_pos, ()):
                    print(gff_row(rec.id, feature, '.', '.',
                                  'ID=' + feature.qualifiers['ID'][0]
                                  + ';Name=' + feature.qualifiers['Name'][0]))
                    for slevel in feature.sub_features:  ## second level feature
                        print(gff_row(rec.id, slevel,
                                      str(float(slevel.qualifiers['score'][0])), '.',
                                      'ID=' + slevel.qualifiers['ID'][0]
                                      + ';Parent=' + slevel.qualifiers['Parent'][0]))
                        for tlevel in slevel.sub_features:
                            # Only CDS lines carry a phase column.
                            if tlevel.type == 'CDS':
                                phase = tlevel.qualifiers['phase'][0]
                            else:
                                phase = '.'
                            print(gff_row(rec.id, tlevel, '.', phase,
                                          'Parent=' + tlevel.qualifiers['Parent'][0]))
def WriteSortedGFF(fname, limit_parse, sorted_pos):
    """Print the features of a GFF file to stdout in the order given by ``sorted_pos``.

    For each parsed record, every position in ``sorted_pos`` is visited in
    turn and each top-level feature with exactly that (start, end) is
    printed as a tab-separated GFF line, followed by its second-level and
    third-level sub-features. Printed start coordinates are the parser's
    start plus one. Strand is '+' when the feature strand equals 1 and '-'
    otherwise.

    Args:
        fname: path of the GFF file.
        limit_parse: ``limit_info`` dictionary handed to ``GFF.parse``.
        sorted_pos: iterable of hashable (start, end) tuples in output order.
    """
    def strand_symbol(feat):
        return '+' if feat.strand == 1 else '-'

    def gff_row(seq_id, feat, score, phase, attributes):
        # Assemble one 9-column GFF line for `feat`.
        return '\t'.join([
            seq_id,
            feat.qualifiers['source'][0],
            feat.type,
            str(feat.location._start.position + 1),
            str(feat.location._end.position),
            score,
            strand_symbol(feat),
            phase,
            attributes,
        ])

    with open(fname) as fgh:
        for rec in GFF.parse(fgh, limit_info=limit_parse):
            # Index features by position once instead of rescanning
            # rec.features for every sorted position. Lists keep the
            # original feature order for shared coordinates.
            by_pos = {}
            for feature in rec.features:
                key = (feature.location._start.position,
                       feature.location._end.position)
                by_pos.setdefault(key, []).append(feature)

            for sort_pos in sorted_pos:
                for feature in by_pos.get(sort_pos, ()):
                    print(gff_row(rec.id, feature, '.', '.',
                                  'ID=' + feature.qualifiers['ID'][0]
                                  + ';Name=' + feature.qualifiers['Name'][0]))
                    for slevel in feature.sub_features:  ## second level feature
                        print(gff_row(rec.id, slevel,
                                      str(float(slevel.qualifiers['score'][0])), '.',
                                      'ID=' + slevel.qualifiers['ID'][0]
                                      + ';Parent=' + slevel.qualifiers['Parent'][0]))
                        for tlevel in slevel.sub_features:
                            # Only CDS lines carry a phase column.
                            if tlevel.type == 'CDS':
                                phase = tlevel.qualifiers['phase'][0]
                            else:
                                phase = '.'
                            print(gff_row(rec.id, tlevel, '.', phase,
                                          'Parent=' + tlevel.qualifiers['Parent'][0]))
def get_Feature(fname):
    """Extract genome annotation information from a provided GFF file.

    Fills the module-level dictionaries ``te_featdb``, ``psd_featdb``,
    ``cg_featdb``, ``oth_featdb`` and ``ribo_featdb`` (they must already
    exist) with ``{chromosome: {(start, end): strand}}`` entries, using the
    parser's raw strand value. The file is re-read once per chromosome id
    (Chr1-5, ChrM, ChrC), limited to source 'TAIR10'; each id is printed to
    stdout before it is processed.

    * cg_featdb   -- ``gene`` features with an mRNA sub-feature
    * oth_featdb  -- ``gene`` features whose first sub-feature is miRNA,
                     ncRNA, snoRNA, snRNA or tRNA
    * ribo_featdb -- ``gene`` features whose first sub-feature is rRNA,
                     plus a fixed list of extra rRNA locations
    * psd_featdb  -- ``pseudogene`` features
    * te_featdb   -- ``transposable_element`` and
                     ``transposable_element_gene`` features

    Args:
        fname: path of the GFF file.

    Returns:
        tuple (te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb).
    """
    from Core import GFF
    global te_featdb
    global psd_featdb
    global cg_featdb
    global oth_featdb
    global ribo_featdb
    small_rna_types = ['miRNA', 'ncRNA', 'snoRNA', 'snRNA', 'tRNA']

    def record(featdb, chrom, feat):
        # Store the feature's span under its chromosome, creating the
        # per-chromosome dict on first use.
        key = (int(feat.location._start.position),
               int(feat.location._end.position))
        featdb.setdefault(chrom, {})[key] = feat.strand

    for cid in ['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC']:  ## change according to the GFF file TODO According to the file type add chromosome number automatically.
        print(cid)
        limit_info = dict(gff_id=[cid], gff_source=['TAIR10'])  ## change the source TODO According to the file type add source flag automatically
        # NOTE: 'rU' is the Python 2 universal-newline mode; Python 3.11+
        # rejects the 'U' flag.
        with open(fname, 'rU') as fh:
            for rec in GFF.parse(fh, limit_info=limit_info):
                for each_rec in rec.features:
                    if each_rec.type == 'gene':
                        for child in each_rec.sub_features:
                            # NOTE(review): the non-mRNA branches test the
                            # gene's FIRST sub-feature, not `child`; kept
                            # as in the original -- confirm this is intended.
                            if child.type == 'mRNA':
                                record(cg_featdb, cid, each_rec)
                            elif each_rec.sub_features[0].type in small_rna_types:
                                record(oth_featdb, cid, each_rec)
                            elif each_rec.sub_features[0].type == 'rRNA':
                                record(ribo_featdb, cid, each_rec)
                    elif each_rec.type == 'pseudogene':
                        record(psd_featdb, cid, each_rec)
                    elif each_rec.type in ['transposable_element', 'transposable_element_gene']:
                        record(te_featdb, cid, each_rec)

    # Hard-coded rRNA locations. The first two were commented as
    # "Unannotated rRNA from A thaliana genome"; the rest carried no comment.
    extra_rrna = (
        ('Chr3', (14199917, 14203578), 1),
        ('Chr2', (5784, 9683), 1),
        ('Chr2', (2821, 3704), 1),
        ('Chr3', (14196614, 14197675), 1),
        ('Chr3', (14194052, 14194611), 1),
        ('Chr3', (14199498, 14199751), 1),
        ('Chr3', (14195564, 14195739), 1),
        ('ChrM', (11426, 11883), -1),
        ('ChrM', (364594, 365124), 1),
    )
    for chrom, span, strand in extra_rrna:
        # setdefault avoids the KeyError raised when the GFF had no
        # annotated rRNA gene on that chromosome.
        ribo_featdb.setdefault(chrom, {})[span] = strand
    return te_featdb, psd_featdb, cg_featdb, oth_featdb, ribo_featdb