def main(gff_file, fasta_file, prefix, min_len=50): gff = gt.parser(gff_file=gff_file, fasta_file=fasta_file) scaffolds = sorted(gff.seq.items(), key=lambda x: len(x[1]), reverse=True) gene_counter = 0 with open('namemap.tsv', 'w') as namemap: for seqid, seq in scaffolds: genes = gff.getitems(featuretype='gene', seqid=seqid) sorted_genes = sorted(genes, key=lambda sub: sub.get_start()) for gene in sorted_genes: transcript = list(gff.get_children(gene, featuretype='mRNA'))[0] if len(transcript.pep) < min_len: continue gene_counter += 1 gene_id = format_id(prefix, gene_counter) namemap.write('{0}\t{1}\n'.format(gene.ID, gene_id)) gene.ID = gene_id gene.source = 'BRAKER' print '\t'.join(gene.gff_fields) transcript_id = '{0}.1'.format(gene_id) transcript.ID = transcript_id transcript.parents = [gene_id] transcript.source = 'BRAKER' print '\t'.join(transcript.gff_fields) cds_counter = 0 for cds in gff.get_children(transcript, featuretype='CDS'): cds_counter += 1 cds_id = '{0}.CDS{1}'.format(gene_id, cds_counter) cds.ID = cds_id cds.source = 'BRAKER' cds.parents = [transcript_id] print '\t'.join(cds.gff_fields)
def main(gff_file, fasta_file, prefix, min_len = 50): gff = gt.parser(gff_file = gff_file, fasta_file = fasta_file, limit = dict(featuretype = ['gene','mRNA','CDS'])) scaffolds = sorted(gff.seq.items(), key = lambda x: len(x[1]), reverse = True) gene_counter = 0 for seqid,seq in scaffolds: genes = gff.getitems(featuretype = 'gene', seqid = seqid) sorted_genes = sorted(genes, key = lambda sub: sub.get_start()) for gene in sorted_genes: transcript1 = list(gff.get_children(gene, featuretype = 'mRNA'))[0] if len(transcript1.pep) < min_len: continue gene_counter += 1 gene_id = format_id(prefix, gene_counter) gene.ID = gene_id gene.source = 'BRAKER' print '\t'.join(gene.gff_fields) mRNA_counter = 0 for transcript in gff.get_children(gene, featuretype = 'mRNA'): mRNA_counter += 1 transcript_id = '{0}.{1}'.format(gene_id, mRNA_counter) transcript.ID = transcript_id transcript.parents = [gene_id] transcript.source = 'BRAKER' print '\t'.join(transcript.gff_fields) cds_counter = 0 for cds in gff.get_children(transcript, featuretype = 'CDS'): cds_counter += 1 cds_id = '{0}.CDS{1}'.format(gene_id, cds_counter) cds.ID = cds_id cds.source = 'BRAKER' cds.parents = [transcript_id] print '\t'.join(cds.gff_fields)
def init(gff_file,fasta_file,gene_collection,track_collection): print 'parsing gff' assembly = fasta_file.split('/')[-1] track = gff_file.split('/')[-1] track_name = '.'.join(track.split('.')[:-1]) gff = gt.parser(gff_file,fasta_file=fasta_file) gff_md5 = get_md5(gff_file) bulk = gene_collection.initialize_unordered_bulk_op() print 'initializing upload' meta = {'track':track,'assembly':assembly,'md5':gff_md5} for gene in gff.getitems(featuretype='gene'): gene_dic = model_gene_feature(gene) gene_dic['track'] = track gene_dic['assembly'] = assembly bulk.insert(gene_dic) meta.setdefault('gene',0) meta['gene'] += 1 print 'uploading to mongodb' print '...' bulk.execute() print 'uploaded {0} genes'.format(meta['gene']) print 'indexing' gene_collection.create_index('ID') gene_collection.create_index('type') gene_collection.create_index([('seqid',pymongo.TEXT),('start',pymongo.ASCENDING),('end',pymongo.ASCENDING)]) gene_collection.create_index('subfeatures.ID') print 'setting metadata' track_collection.insert_one(meta)
def main(gff_file, namemap): gff = gt.parser(gff_file=gff_file) with open('namemap.tsv', 'r') as namemap: genes = gff.getitems(featuretype='gene') sorted_genes = sorted(genes, key=lambda sub: sub.get_start()) genefind = namemap.read() for gene in sorted_genes: number = genefind.find(gene.ID) print "%s : %s " % (gene.ID, number)
def test1(): for gff_file in (gff_success,gff_fail): gff = gt.parser(gff_file,fasta_file=fasta_file) for transcript in gff.getitems(featuretype='mRNA'): if transcript.pep[0] != 'M': print gff_file print transcript.pep print transcript raise Exception('Wrong start') if transcript.pep[-1] != '*': print gff_file print transcript.pep print transcript raise Exception('Wrong stop')
def main(gff_file, fasta_file):
    """Write CDS and peptide multi-FASTA files for every mRNA.

    Output files are named '<gff_file>.CDS.fasta' and
    '<gff_file>.PEP.fasta'; sequences are wrapped at 60 columns and
    transcripts with an empty CDS sequence are skipped.
    """
    gff = gt.parser(gff_file, fasta_file=fasta_file)
    cds_path = '{0}.CDS.fasta'.format(gff_file)
    pep_path = '{0}.PEP.fasta'.format(gff_file)
    # Deterministic output order: sort transcripts by their ID.
    ordered = sorted(gff.getitems(featuretype='mRNA'), key=lambda t: t.ID)
    with open(cds_path, 'w') as cds_handle, open(pep_path, 'w') as pep_handle:
        for transcript in ordered:
            if len(transcript.seq) == 0:
                continue
            name = transcript.attributes.get('Name', [''])[0]
            header = '>{0} {1}\n'.format(transcript.ID, name)
            cds_handle.write(header)
            for chunk in splitter(transcript.seq, 60):
                cds_handle.write(chunk + '\n')
            pep_handle.write(header)
            for chunk in splitter(transcript.pep, 60):
                pep_handle.write(chunk + '\n')
def main(infile=None):
    """Upload the InterPro hierarchy to MongoDB, then exit.

    NOTE(review): quit() below terminates the process, so everything
    after it is dead code (a protein-naming pass over polypeptide
    features) — confirm whether it should be removed or revived.
    """
    client_ip = get_client()
    client = MongoClient(client_ip)
    db = client.meteor
    collection = db.interpro
    ipr_hierarchy = get_ipr_hierarchy()
    upload(collection, ipr_hierarchy)
    #for ipr in ipr_hierarchy:
    #    print ipr
    quit()
    # ---- dead code below: never reached because of quit() above ----
    ipr_combinations = get_ipr_combinations()
    gff = gt.parser(infile)
    names = {}
    for polypeptide in gff.getitems(featuretype='polypeptide'):
        # Collect informative protein_match domains for this polypeptide,
        # skipping unknowns/DUFs and matches without a known InterPro entry.
        domains = set()
        for protein_match in gff.get_children(polypeptide, featuretype='protein_match'):
            interpro = get_interpro(protein_match)
            if not interpro or interpro not in ipr_hierarchy:
                continue
            name = ipr_hierarchy[interpro].name
            if 'DUF' in name or 'unknown' in name:
                continue
            domains.add(protein_match)
        if not domains:
            name = ['None']
        else:
            reduced_domains = reduce_domains(domains, ipr_hierarchy)
            if len(reduced_domains) == 1:
                name = [reduced_domains[0].name]
            else:
                name = find_similar_names(reduced_domains)
        #print name
        if len(name) != 1:
            # NOTE(review): 'name' is a list here, and lists are not
            # hashable — ipr_combinations.get(name, ...) would raise
            # TypeError if this dead code ever ran; verify intent.
            name = [ipr_combinations.get(name, name[0])]
        name = name[0]
        if name.endswith('domain') or name.endswith('fold'):
            name += ' containing protein'
        print '\t'.join([polypeptide.ID, name])
    # NOTE(review): unterminated triple-quote follows — presumably it
    # comments out further dead code beyond this view; confirm against
    # the full file.
    '''
def upload_ips(gff_file, gene_collection, interpro_collection):
    """Attach InterProScan protein_match results to gene documents in
    MongoDB, then fetch and upsert metadata for every InterPro domain
    seen (type, short name, description, display color).

    gff_file            -- InterProScan GFF with polypeptide/protein_match
                           features
    gene_collection     -- MongoDB collection holding gene documents with
                           a 'subfeatures' array
    interpro_collection -- MongoDB collection of InterPro domain records
    """
    print 'parsing gff'
    gff = gt.parser(gff_file)
    counter = 0  # counts polypeptides seen; never reported — review
    all_interpro = set()
    print 'uploading to mongodb'
    print 'adding to genes'
    for polypeptide in gff.getitems(featuretype='polypeptide'):
        counter += 1
        for protein_match in gff.get_children(polypeptide, featuretype='protein_match'):
            # NOTE(review): this re.sub replaces '.' with '.' — a no-op.
            # Presumably it was meant to sanitize dots out of the ID
            # (dots in Mongo field names create nested documents);
            # confirm intended replacement character.
            protein_match_ID = re.sub('\.', '.', protein_match.ID)
            # Positional '$' targets the matched subfeatures element.
            gene_key = {'subfeatures.ID': polypeptide.ID}
            gene_update = {'$set': {
                'subfeatures.$.interproscan.' + protein_match_ID: {
                    'start': protein_match.start,
                    'end': protein_match.end,
                    'score': protein_match.score,
                    'source': protein_match.source,
                    'signature_desc': ','.join(protein_match.attributes.get('signature_desc', [''])),
                    'dbxref': ','.join(protein_match.attributes.get('Dbxref', [''])),
                    'name': ','.join(protein_match.attributes.get('Name', ['']))
                }
            }
            }
            dbxref = protein_match.attributes.get('Dbxref', None)
            if dbxref:
                # Each Dbxref looks like 'DB:accession'; NOTE(review):
                # plain split(':') assumes exactly one colon — verify no
                # multi-colon accessions occur in this data.
                dbxref_dict = {'domains.' + kv[0]: kv[1] for kv in [d.split(':') for d in dbxref]}
                gene_update['$addToSet'] = dbxref_dict
                interpro = dbxref_dict.get('domains.InterPro', None)
                if interpro:
                    all_interpro.add(interpro)
            # NOTE(review): Collection.update is deprecated PyMongo API
            # (update_one is the modern equivalent).
            gene_collection.update(gene_key, gene_update)
    print 'fetching additional interpro data'
    all_interpro = list(all_interpro)
    # Query EBI dbfetch in batches of 100 accessions per request.
    for domains in (all_interpro[i:i + 100] for i in xrange(0, len(all_interpro), 100)):
        url = 'http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/interpro/{0}/tab'.format(','.join(domains))
        response = urlopen(url)
        for line in response.readlines():
            line = line.strip()
            if not line or line[0] == '#':
                continue
            # Tab layout: accession, type, short_name, description, ...
            parts = line.split('\t')
            # Seeded color so each domain gets a stable display color.
            rand_color = randomcolor.RandomColor(seed=parts[0])
            interpro_key = {'ID': parts[0]}
            interpro_update = {'$set': {
                'type': parts[1],
                'short_name': parts[2],
                'description': parts[3],
                'color': rand_color.generate(format_='rgb')[0]
            }}
            # Remove answered accessions from this batch; anything left
            # over was not found by dbfetch.
            domains.remove(parts[0])
            interpro_collection.update(interpro_key, interpro_update, upsert=True)
        if domains:
            # Leftovers: record an explicit ERROR entry so missing
            # domains are visible downstream.
            for domain in domains:
                interpro_key = {'ID': domain}
                interpro_update = {'$set': {
                    'type': 'ERROR',
                    'short_name': 'ERROR',
                    'description': 'This domain was found with interproscan, but could not be found on the interpro site',
                    'color': 'black'
                }}
                interpro_collection.update(interpro_key, interpro_update, upsert=True)