Example #1
def main(gff_file, fasta_file, prefix, min_len=50):
    gff = gt.parser(gff_file=gff_file, fasta_file=fasta_file)
    scaffolds = sorted(gff.seq.items(), key=lambda x: len(x[1]), reverse=True)
    gene_counter = 0
    with open('namemap.tsv', 'w') as namemap:  # record old ID -> new ID mapping
        for seqid, seq in scaffolds:
            genes = gff.getitems(featuretype='gene', seqid=seqid)
            sorted_genes = sorted(genes, key=lambda sub: sub.get_start())
            for gene in sorted_genes:
                # only the first mRNA is used here; the variant below handles
                # multiple mRNAs per gene
                transcript = list(gff.get_children(gene,
                                                   featuretype='mRNA'))[0]
                if len(transcript.pep) < min_len:
                    continue
                gene_counter += 1
                gene_id = format_id(prefix, gene_counter)
                namemap.write('{0}\t{1}\n'.format(gene.ID, gene_id))
                gene.ID = gene_id
                gene.source = 'BRAKER'
                print '\t'.join(gene.gff_fields)
                transcript_id = '{0}.1'.format(gene_id)
                transcript.ID = transcript_id
                transcript.parents = [gene_id]
                transcript.source = 'BRAKER'
                print '\t'.join(transcript.gff_fields)
                cds_counter = 0
                for cds in gff.get_children(transcript, featuretype='CDS'):
                    cds_counter += 1
                    cds_id = '{0}.CDS{1}'.format(gene_id, cds_counter)
                    cds.ID = cds_id
                    cds.source = 'BRAKER'
                    cds.parents = [transcript_id]
                    print '\t'.join(cds.gff_fields)
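The format_id helper is not part of this excerpt; a plausible sketch, assuming zero-padded numeric IDs behind the prefix (the padding width and separator are guesses, not taken from the original):
# Hypothetical sketch of the format_id helper used above; the zero-padding
# width and the '_' separator are assumptions.
def format_id(prefix, counter, width=6):
    # e.g. format_id('XYZ', 42) -> 'XYZ_000042'
    return '{0}_{1:0{2}d}'.format(prefix, counter, width)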
def main(gff_file, fasta_file, prefix, min_len = 50):
	gff = gt.parser(gff_file = gff_file, fasta_file = fasta_file, 
		limit = dict(featuretype = ['gene','mRNA','CDS']))
	scaffolds = sorted(gff.seq.items(), key = lambda x: len(x[1]), reverse = True)
	gene_counter = 0
	for seqid,seq in scaffolds:
		genes = gff.getitems(featuretype = 'gene', seqid = seqid)
		sorted_genes = sorted(genes, key = lambda sub: sub.get_start())
		for gene in sorted_genes:
			transcript1 = list(gff.get_children(gene, featuretype = 'mRNA'))[0]
			if len(transcript1.pep) < min_len:
				continue
			gene_counter += 1
			gene_id = format_id(prefix, gene_counter)
			gene.ID = gene_id
			gene.source = 'BRAKER'
			print '\t'.join(gene.gff_fields)
			mRNA_counter = 0
			for transcript in gff.get_children(gene, featuretype = 'mRNA'): 
				mRNA_counter += 1
				transcript_id = '{0}.{1}'.format(gene_id, mRNA_counter)
				transcript.ID = transcript_id
				transcript.parents = [gene_id]
				transcript.source = 'BRAKER'
				print '\t'.join(transcript.gff_fields)
				cds_counter = 0
				for cds in gff.get_children(transcript, featuretype = 'CDS'):
					cds_counter += 1
					cds_id = '{0}.CDS{1}'.format(gene_id, cds_counter)
					cds.ID = cds_id
					cds.source = 'BRAKER'
					cds.parents = [transcript_id]
					print '\t'.join(cds.gff_fields)
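Neither variant shows its entry point; a minimal command-line wrapper, assuming positional GFF/FASTA/prefix arguments (the script name and argument order are guesses):
# Hypothetical command-line wrapper for main(); the original entry point is
# not shown in the excerpt.
if __name__ == '__main__':
	import sys
	if len(sys.argv) < 4:
		sys.exit('usage: rename_gff.py <gff_file> <fasta_file> <prefix> [min_len]')
	min_len = int(sys.argv[4]) if len(sys.argv) > 4 else 50
	main(sys.argv[1], sys.argv[2], sys.argv[3], min_len=min_len)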
def init(gff_file,fasta_file,gene_collection,track_collection):
	print 'parsing gff'
	assembly = fasta_file.split('/')[-1]
	track = gff_file.split('/')[-1]
	track_name = '.'.join(track.split('.')[:-1])
	
	gff = gt.parser(gff_file,fasta_file=fasta_file)
	gff_md5 = get_md5(gff_file)

	bulk = gene_collection.initialize_unordered_bulk_op()

	print 'initializing upload'

	meta = {'track':track,'assembly':assembly,'md5':gff_md5,'gene':0}  # 'gene' counts uploaded genes

	for gene in gff.getitems(featuretype='gene'):
		gene_dic = model_gene_feature(gene)
		gene_dic['track'] = track
		gene_dic['assembly'] = assembly
		bulk.insert(gene_dic)
		meta.setdefault('gene',0)
		meta['gene'] += 1

	print 'uploading to mongodb'
	print '...'
	bulk.execute()
	print 'uploaded {0} genes'.format(meta['gene'])
	print 'indexing'
	gene_collection.create_index('ID')
	gene_collection.create_index('type')
	gene_collection.create_index([('seqid',pymongo.TEXT),('start',pymongo.ASCENDING),('end',pymongo.ASCENDING)])
	gene_collection.create_index('subfeatures.ID')
	print 'setting metadata'
	track_collection.insert_one(meta)
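The get_md5 helper used by init is not shown; a straightforward stand-in that hashes the GFF file in chunks with the standard library:
# Plausible implementation of the get_md5 helper; the original is not part of
# this excerpt, this is only a sketch based on hashlib.
import hashlib

def get_md5(path, chunk_size=65536):
	md5 = hashlib.md5()
	with open(path, 'rb') as handle:
		for chunk in iter(lambda: handle.read(chunk_size), b''):
			md5.update(chunk)
	return md5.hexdigest()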
Example #4
def main(gff_file, namemap):
    gff = gt.parser(gff_file=gff_file)
    with open(namemap, 'r') as namemap_handle:
        genes = gff.getitems(featuretype='gene')
        sorted_genes = sorted(genes, key=lambda sub: sub.get_start())
        genefind = namemap_handle.read()
        for gene in sorted_genes:
            # str.find gives the character offset of the gene ID in the name
            # map file, or -1 if it is missing
            number = genefind.find(gene.ID)
            print "%s : %s " % (gene.ID, number)
def test1():
	for gff_file in (gff_success,gff_fail):
		gff = gt.parser(gff_file,fasta_file=fasta_file)
		for transcript in gff.getitems(featuretype='mRNA'):
			if transcript.pep[0] != 'M':
				print gff_file
				print transcript.pep
				print transcript
				raise Exception('Wrong start')
			if transcript.pep[-1] != '*':
				print gff_file
				print transcript.pep
				print transcript
				raise Exception('Wrong stop')
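test1 depends on module-level fixtures that are not in the excerpt; presumably they are just paths to test data, e.g.:
# Hypothetical module-level fixtures assumed by test1(); the actual file
# names and locations are not shown in the excerpt.
fasta_file = 'testdata/assembly.fasta'
gff_success = 'testdata/annotation_ok.gff3'
gff_fail = 'testdata/annotation_broken.gff3'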
def main(gff_file, fasta_file):
    gff = gt.parser(gff_file, fasta_file=fasta_file)
    cds_file = '{0}.CDS.fasta'.format(gff_file)
    pep_file = '{0}.PEP.fasta'.format(gff_file)
    transcripts = gff.getitems(featuretype='mRNA')
    sorted_transcripts = sorted(transcripts, key=lambda t: t.ID)
    with open(cds_file, 'w') as cds_handle, open(pep_file, 'w') as pep_handle:
        for transcript in sorted_transcripts:
            if len(transcript.seq) == 0:
                continue
            name = transcript.attributes.get('Name', [''])[0]
            header = '>{0} {1}\n'.format(transcript.ID, name)

            seq = splitter(transcript.seq, 60)
            pep = splitter(transcript.pep, 60)
            cds_handle.write(header)
            for s in seq:
                cds_handle.write(s + '\n')
            pep_handle.write(header)
            for p in pep:
                pep_handle.write(p + '\n')
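splitter wraps sequences into fixed-width FASTA lines but is not defined here; a minimal sketch:
# Likely shape of the splitter helper used above: yield fixed-width slices of
# a sequence so FASTA lines are wrapped at `width` characters. Sketch only.
def splitter(sequence, width):
    for i in xrange(0, len(sequence), width):
        yield sequence[i:i + width]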
def main(infile=None):
	client_ip = get_client()
	client = MongoClient(client_ip)
	db = client.meteor
	collection = db.interpro
	ipr_hierarchy = get_ipr_hierarchy()
	upload(collection,ipr_hierarchy)
	#for ipr in ipr_hierarchy:
	#	print ipr
	quit()  # NOTE: everything below this call is currently unreachable
	ipr_combinations = get_ipr_combinations()
	gff = gt.parser(infile)
	names = {}
	for polypeptide in gff.getitems(featuretype='polypeptide'):
		domains = set()
		for protein_match in gff.get_children(polypeptide,featuretype='protein_match'):
			interpro = get_interpro(protein_match)
			if not interpro or interpro not in ipr_hierarchy:
				continue
			name = ipr_hierarchy[interpro].name
			if 'DUF' in name or 'unknown' in name:
				continue
			domains.add(protein_match)
		if not domains:
			name = ['None']
		else:
			reduced_domains = reduce_domains(domains,ipr_hierarchy)
			if len(reduced_domains) == 1:
				name = [reduced_domains[0].name]
			else:
				name = find_similar_names(reduced_domains)
		#print name
		if len(name) != 1:
			# lists are unhashable; assume ipr_combinations is keyed on tuples of
			# domain names, and fall back to the first name when there is no match
			name = [ipr_combinations.get(tuple(name),name[0])]
		name = name[0]
		if name.endswith('domain') or name.endswith('fold'):
			name += ' containing protein'
		print '\t'.join([polypeptide.ID,name])
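get_interpro is also not shown; judging from how Dbxref is handled in upload_ips below, it presumably extracts the InterPro accession from a protein_match:
# Sketch of the get_interpro helper, inferred from how Dbxref is handled in
# upload_ips(); the real implementation is not part of this excerpt.
def get_interpro(protein_match):
	for dbxref in protein_match.attributes.get('Dbxref', []):
		if dbxref.startswith('InterPro:'):
			return dbxref.split(':', 1)[1]
	return None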
def upload_ips(gff_file,gene_collection,interpro_collection):
	print 'parsing gff'
	gff = gt.parser(gff_file)
	counter = 0
	all_interpro = set()
	print 'uploading to mongodb'
	print 'adding to genes'
	for polypeptide in gff.getitems(featuretype='polypeptide'):
		counter += 1
		for protein_match in gff.get_children(polypeptide,featuretype='protein_match'):
			protein_match_ID = re.sub(r'\.','&#46;',protein_match.ID)  # MongoDB field names may not contain '.'
			gene_key = {'subfeatures.ID':polypeptide.ID}
			gene_update = {'$set': {
				'subfeatures.$.interproscan.'+protein_match_ID: {
						'start' : protein_match.start,
						'end' : protein_match.end,
						'score' : protein_match.score,
						'source' : protein_match.source,
						'signature_desc' : ','.join(protein_match.attributes.get('signature_desc',[''])),
						'dbxref' : ','.join(protein_match.attributes.get('Dbxref',[''])),
						'name' : ','.join(protein_match.attributes.get('Name',['']))
					}
				}
			}
			dbxref = protein_match.attributes.get('Dbxref',None)
			if dbxref:
				# split on the first ':' only, so accessions that contain further colons stay intact
				dbxref_dict = {'domains.'+kv[0]:kv[1] for kv in [d.split(':',1) for d in dbxref]}
				gene_update['$addToSet'] = dbxref_dict

				interpro = dbxref_dict.get('domains.InterPro',None)
				if interpro:
					all_interpro.add(interpro)

			gene_collection.update(gene_key,gene_update)

	print 'fetching additional interpro data'
	all_interpro = list(all_interpro)
	for domains in (all_interpro[i:i+100] for i in xrange(0,len(all_interpro),100)):
		url =  'http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/interpro/{0}/tab'.format(','.join(domains))
		response = urlopen(url)
		for line in response.readlines():
			line = line.strip()
			if not line or line[0] == '#':
				continue
			parts = line.split('\t')
			rand_color = randomcolor.RandomColor(seed=parts[0])
			interpro_key = {'ID':parts[0]}
			interpro_update = {'$set': {
				'type':parts[1],
				'short_name':parts[2],
				'description':parts[3],
				'color':rand_color.generate(format_='rgb')[0]
			}}
			domains.remove(parts[0])
			interpro_collection.update(interpro_key,interpro_update,upsert=True)
		if domains:
			# anything left in this chunk was not returned by dbfetch; store an
			# ERROR placeholder so the domain is still present in the collection
			for domain in domains:
				interpro_key = {'ID':domain}
				interpro_update = {'$set': {
					'type':'ERROR',
					'short_name':'ERROR',
					'description':'This domain was found with interproscan, but could not be found on the interpro site',
					'color':'black'
				}}
				interpro_collection.update(interpro_key,interpro_update,upsert=True)
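A minimal driver for upload_ips, assuming the same meteor database used in init above (the gene collection name is a guess):
# Hypothetical wiring for upload_ips(); the database name follows the init()
# example above, the 'genes' collection name is an assumption.
if __name__ == '__main__':
	import sys
	from pymongo import MongoClient
	client = MongoClient('localhost', 27017)
	db = client.meteor
	upload_ips(sys.argv[1], db.genes, db.interpro)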