Example #1
	def __init__(self,gff_file,filetype='standard',fasta_file=None,remove_noncoding=False,limit=None,author=None):
		self.filename = gff_file
		if filetype not in self._filetypes:
			e = '{0} is not a valid filetype'.format(filetype)
			raise TypeError(e)
		self.filetype = filetype
		self.fasta_file = fasta_file
		self.remove_noncoding = remove_noncoding
		if limit:
			if not isinstance(limit,dict):
				e = '{0} is not a valid type for limit'.format(type(limit))
				raise TypeError(e)
			#reformat limit values into lists if they are not already lists
			self.limit = {}
			for key,value in limit.iteritems():
				if isinstance(value,(list,tuple)):
					self.limit[key] = value
				elif isinstance(value,basestring):
					self.limit[key] = [value]
		else:
			self.limit = limit
		self.gff = Gff(filename=gff_file)
		self.author = author
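
The snippet above normalizes the limit argument: each value may be given as a single string or as a list/tuple, and single strings are wrapped into one-element lists. A minimal usage sketch, assuming the Parser class shown in full in Example #3 and a hypothetical file name:

# Hedged sketch: 'annotation.gff3' and the limit keys are illustrative placeholders.
parser = Parser('annotation.gff3', filetype='standard', limit={'featuretype': 'mRNA', 'seqid': ['chr1', 'chr2']})
# After __init__ the single string has been wrapped in a list:
# parser.limit == {'featuretype': ['mRNA'], 'seqid': ['chr1', 'chr2']}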
Example #2
if keepinfo:
	writer.cnames.append('ORIGINAL')

def getNameFromAttrs(attrs):
	if attr2name:
		return attr2name(**attrs)
	for key in sorted(attrs.keys()):
		if key in writer.cnames:
			continue
		if 'id' in key.lower():
			return attrs[key]
		if 'name' in key.lower():
			return attrs[key]
	#fall back to the first attribute that is not already a column name
	for key in sorted(attrs.keys()):
		if key not in writer.cnames:
			return attrs[key]

gff = Gff(infile)
for record in gff:
	r        = TsvRecord()
	r.CHR    = record['seqid']
	r.START  = record['start']
	r.END    = record['end']
	r.SCORE  = record['score']
	r.STRAND = record['strand']
	attrs    = record['attributes']
	attrs.update(dict(
		CHR    = r.CHR,
		START  = r.START,
		END    = r.END,
		SCORE  = r.SCORE,
		STRAND = r.STRAND
	))
Example #3
class Parser(object):
	"""
	Parser object for gff3 formatted annotation files.
	The parse() method returns the processed Gff object.
	Has several methods for known incorrectly formatted files (_ratt(), _manual() and _interproscan()), which are selected via the filetype argument.
	"""
	_filetypes = ('standard','ratt','manual','interproscan')
	def __init__(self,gff_file,filetype='standard',fasta_file=None,remove_noncoding=False,limit=None,author=None):
		self.filename = gff_file
		if filetype not in self._filetypes:
			e = '{0} is not a valid filetype'.format(filetype)
			raise TypeError(e)
		self.filetype = filetype
		self.fasta_file = fasta_file
		self.remove_noncoding = remove_noncoding
		if limit:
			if not isinstance(limit,dict):
				e = '{0} is not a valid type for limit'.format(type(limit))
				raise TypeError(e)
			#reformat limit values into lists if they are not already lists
			self.limit = {}
			for key,value in limit.iteritems():
				if isinstance(value,(list,tuple)):
					self.limit[key] = value
				elif isinstance(value,basestring):
					self.limit[key] = [value]
		else:
			self.limit = limit
		self.gff = Gff(filename=gff_file)
		self.author = author
		self.parsed = False
	def _readlines(self):
		"""
		Generator that yields GffSubPart objects formatted according to filetype.
		self.limit is a dictionary that restricts the lines parsed based on attributes such as seqid, source, featuretype and strand.
		"""
		with open(self.filename,'rU') as fh:
			for line in fh:
				if not line.strip():
					continue
				if line[0] == '#':
					if 'FASTA' in line:
						return
					continue
				if self.filetype == 'interproscan':
					line = re.sub('; ',': ',line)
				parts = line.strip().split('\t')
				if self.limit is not None:
					#Check whether the relevant columns pass the limit dictionary
					#Columns that are not listed in limit default to their own value and always pass
					if parts[0] not in self.limit.get('seqid',[parts[0]]):
						continue
					if parts[1] not in self.limit.get('source',[parts[1]]):
						continue
					if parts[2] not in self.limit.get('featuretype',[parts[2]]):
						continue
					if parts[6] not in self.limit.get('strand',[parts[6]]):
						continue
				sub = GffSubPart(*parts,filetype=self.filetype)
				yield sub

	def _ratt(self):
		name_index_map = {}
		name_index_remove = set()
		child_map = {}
		
		for transcript in self.gff.getitems(featuretype='mRNA'):
			new_transcript_ID = transcript.attributes['locus_tag'][0]
			if transcript.ID == new_transcript_ID:
				continue

			child_map.setdefault(new_transcript_ID,[])

			name_index_map[new_transcript_ID] = transcript._key
			name_index_remove.add(transcript.ID)
			
			sub_counter = {}
			for sub in self.gff.get_children(transcript):
				if sub == transcript:
					continue
				sub_counter.setdefault(sub.featuretype,0)
				sub_counter[sub.featuretype] += 1
				
				sub.parents = [new_transcript_ID]
				new_sub_ID = '{0}.{1}{2}'.format(new_transcript_ID,sub.featuretype,sub_counter[sub.featuretype])
				
				child_map.setdefault(new_transcript_ID,[]).append(new_sub_ID)
				name_index_map[new_sub_ID] = sub._key
				name_index_remove.add(sub.ID)

				sub.ID = new_sub_ID
				sub.source = 'ratt'

			transcript.ID = new_transcript_ID
			transcript.source = 'ratt'
			transcript.attributes = {key:value for key,value in transcript.attributes.iteritems() if key in ('ID','Parent')}

			gene_ID = transcript.ID.split('.')[0]
			
			if not gene_ID in self.gff.name_index:
				gene_parts = transcript.gff_fields
				gene = GffSubPart(*gene_parts)
				gene.featuretype = 'gene'
				gene.ID = gene_ID
				gene.children = [transcript.ID]
				self.gff.update(gene)
			else:
				for gene in self.gff[gene_ID]:
					gene.children.append(transcript.ID)

			transcript.parents = [gene_ID]

		for new_name,_key in name_index_map.iteritems():
			self.gff.name_index[new_name] = [_key]
		for old_name in name_index_remove:
			self.gff.name_index.pop(old_name)
		for parent_id in child_map:
			for parent in self.gff[parent_id]:
				parent.children = child_map[parent_id]

	def _manual(self):
		ann = {}
		for sub in self.gff.getitems(featuretype='CDS'):
			if sub.attributes.get('Name',False):
				ann.setdefault(sub.ID,[]).append(sub)
		for gene,cds_list in ann.iteritems():
			self._assert_equal(cds_list)
			template = cds_list[0]
			exon_counter = 0
			#GffSubPart elements
			seqid = template.seqid
			source = template.source
			start = min(s.start for s in cds_list)
			end = max(s.end for s in cds_list)
			score = template.score
			strand = template.strand
			phase = template.phase
			#GffSubPart attributes
			gene_ID = uuid.uuid4()
			mrna_ID = '{0}.1'.format(gene_ID)
			name = template.attributes.get('Name',False)[0]
			author = self.author#template.attributes.get('Created by',False)[0]
			gene_attributes = 'ID={0};Name={1};Created by={2}'.format(gene_ID,name,author)
			mrna_attributes = 'ID={0};Parent={1};Name={2};Created by={3}'.format(mrna_ID,gene_ID,name,author)
			#make the gene GffSubPart
			gene = GffSubPart(seqid,source,'gene',start,end,score,strand,phase,gene_attributes)
			#make the mRNA GffSubPart
			mrna = GffSubPart(seqid,source,'mRNA',start,end,score,strand,phase,mrna_attributes)
			self.gff.update(gene)
			self.gff.update(mrna)
			reverse = (gene.strand == '+')
			for cds in sorted(cds_list,key=lambda x:x.get_start(),reverse=reverse):
				exon_counter += 1
				old_ID = cds.ID
				cds.ID = '{0}.CDS{1}'.format(mrna_ID,exon_counter)
				cds.attributes['ID'] = ['{0}.CDS{1}'.format(mrna_ID,exon_counter)]
				#print self.gff.name_index[old_ID]
				self.gff.name_index.setdefault(cds.ID,set()).add(cds._key)
				self.gff.name_index[old_ID] = {x for x in self.gff.name_index[old_ID] if x != cds._key}
				if len(self.gff.name_index[old_ID]) == 0:
					self.gff.name_index.pop(old_ID)
				cds.parents = [mrna_ID]
				cds.attributes['Parent'] = [mrna_ID]
				cds.attributes.pop('modified by',[])
				cds.attributes.pop('created by',[])
				cds.attributes['Created by'] = [self.author]
	def _interproscan(self):
		type_dict = {}
		for feature in self.gff:
			for attribute,values in feature.attributes.iteritems():
				feature.attributes[attribute] = [value.strip('"') for value in values]
			if feature.featuretype == 'protein_match':
				#set parent attribute
				parent = feature.attributes['Target'][0]
				parent = parent.split()[0]
				feature.parents = [parent]
				#determine new ID based on parent name and source, this should result in unique IDs, opposed to what comes out of IPS
				source = feature.source
				type_dict.setdefault(parent,{}).setdefault(source,0)
				type_dict[parent][source] += 1
				type_count = type_dict[parent][source]
				ID = '{0}.{1}.{2}'.format(parent,source,type_count)
				self.gff.name_index[ID] = self.gff.name_index.pop(feature.ID)
				feature.ID = ID

	@staticmethod
	def _assert_equal(subslist):
		first = subslist[0]
		assert all(x.seqid == first.seqid for x in subslist)
		assert all(x.source == first.source for x in subslist)
		assert all(x.strand == first.strand for x in subslist)
		try:
			assert all(x.attributes['Name'] == first.attributes['Name'] for x in subslist),[x.attributes['Name'] for x in subslist]
		except KeyError as e:
			print first
			raise e
		#assert all(x.attributes['Created by'] == first.attributes['Created by'] for x in subslist)
		
	def _remove_noncoding(self):
		if not self.fasta_file:
			e = 'Cannot remove non-coding annotations without sequence data'
			raise NotImplementedError(e)
		remove = []
		for feature in self.gff.getitems(featuretype='mRNA'):
			try:
				pep = feature.pep
			except TranslateError:
				remove.append(feature.ID)
				continue
			if pep[0] != 'M':
				if feature.seq[0:3] != 'CTG':
					remove.append(feature.ID)
			elif pep[-1] != '*':
				remove.append(feature.ID)
			elif '*' in pep[1:-1]:
				remove.append(feature.ID)
		print 'Removing {0} genes because they do not encode a protein'.format(len(remove))
		self.gff.remove(remove)

	def _get_nested_ratt_ids(self,ID):
		IDs = set()
		for lower_ID in self._get_lower_ratt_ids(ID):
			IDs.add(lower_ID)
			yield lower_ID
		for higher_ID in self._get_higher_ratt_ids(ID):
			if higher_ID not in IDs:
				yield higher_ID

	def _get_lower_ratt_ids(self,ID):
		yield ID
		if ID[-4] != '.':
			return
		for sub_ID in self._get_lower_ratt_ids(ID[:-2]):
			yield sub_ID

	def _get_higher_ratt_ids(self,ID):
		yield ID
		counter = int(ID[-1])
		new_ID = '{0}.{1}'.format(ID,counter + 1)
		if new_ID not in self.gff:
			return
		for super_ID in self._get_higher_ratt_ids(new_ID):
			yield super_ID

	def _remove_fragmented_ratt(self):
		remove = set()
		ID_map = {}
		for transcript in self.gff.getitems(featuretype='mRNA'):
			if transcript.ID in remove:
				continue
			if transcript.ID[-4] == '.':
				#get all fragments
				fragments = (ff for f in self._get_nested_ratt_ids(transcript.ID) for ff in self.gff[f])
				#sort fragments by CDS length
				sorted_fragments = sorted(fragments,key = lambda x:len(x.seq))
				#pop the longest fragment from the list so it doesn't get removed
				longest = sorted_fragments.pop()
				#remove all the other fragments
				remove |= {f.ID for f in sorted_fragments}
				#make sure longest fragment keeps the original name
				transcript_ID = [ID for ID in self._get_nested_ratt_ids(longest.ID) if ID[-4] != '.'][0]
				ID_map[longest.ID] = transcript_ID
				#set ID and Parent attribute of nested CDS features
				sub_counter = {}
				for sub in self.gff.get_children(longest):
					if sub == longest:
						continue
					sub.attributes['Parent'] = [transcript_ID]
					sub_counter.setdefault(sub.featuretype,0)
					sub_counter[sub.featuretype] += 1
					sub_ID = '{0}.{1}{2}'.format(transcript_ID,sub.featuretype,sub_counter[sub.featuretype])
					ID_map[sub.ID] = sub_ID
		self.gff.remove(remove)
		for old_ID,new_ID in ID_map.iteritems():
			assert old_ID not in remove,(old_ID,new_ID)
			for feature in self.gff[old_ID]:
				feature.ID = new_ID

	def parse(self):
		self.remove_fragmented_ratt = False
		for subpart in self._readlines():
			#if self.filetype == 'interproscan':
			#	if subpart.featuretype == 'protein_match':
			#		parent = subpart.attributes['Target'][0]
			#		parent = parent.split()[0]
			#		subpart.parents = [parent]
			self.gff.update(subpart)
		if self.filetype == 'ratt':
			self.remove_fragmented_ratt = True
			self.gff.set_children()
			self._ratt()
		elif self.filetype == 'manual':
			self._manual()
			self.gff.set_children()
		elif self.filetype == 'interproscan':
			self._interproscan()
			self.gff.set_children()
		else:
			pass
		if self.filetype != 'ratt':
			self.gff.set_children()
		if self.fasta_file:
			self.gff.add_fasta(self.fasta_file)
		if self.remove_fragmented_ratt:
			self._remove_fragmented_ratt()
		if self.remove_noncoding:
			self._remove_noncoding()
		self.parsed = True
		return self.gff
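
Taken together, parse() reads the file line by line through _readlines(), applies the filetype-specific fixups (_ratt(), _manual() or _interproscan()), optionally attaches sequence data and removes non-coding models, and returns the populated Gff object. A minimal usage sketch, with file names and the downstream loop assumed rather than taken from the source:

# Hedged sketch: the file names and author string are placeholders.
parser = Parser('transfer.gff3', filetype='ratt', fasta_file='genome.fasta', remove_noncoding=True, author='curator')
gff = parser.parse()
for feature in gff:
	print feature.ID, feature.featuretype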
Example #4
    :param t: 
    :return: 
    """
    print('C {}\t{}\t{}'.format(t['transcript_id'], t['begin'], t['end']))
    if t['transcript_id'] not in merge['names']:
        if merge['strand'] != t['strand']:
            print('warning different strands {}'.format(t['transcript_id']))
        merge['begin'] = min(merge['begin'], t['begin'])
        merge['end'] = max(merge['end'], t['end'])
        merge['member'].append(t)
        merge['names'].append(t['transcript_id'])


if __name__ == '__main__':

    sgff = Gff(file="stranded.merged.stringtie.gff")
    transcript_n = sgff.read_feature('transcript')
    print('{} stranded features read'.format(transcript_n))
    sgff.replace_columns_re(['sequence'], r'lcl\|', r'')
    # sgff.attr_sep = '='
    # transcript_n = sgff.read_feature('mRNA')
    query = r'(maker|augustus|masked|processed|gene|trnascan)(-|_)'
    sgff.replace_columns_re(['gene_id', 'transcript_id'], query, r'')
    sgff.replace_columns_re(['gene_id', 'transcript_id'], r'-tRNA-', r'.tRNA')
    sgff.position_to_int()
    # sgff.rename_key('ID', 'transcript_id')

    sbundle = make_bundle(sgff)

    ugff = Gff(file="unstranded.merged.stringtie.gff")
    transcript_n = ugff.read_feature('transcript')
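
This fragment extends an existing merge bundle but never shows how the bundle is created. From the keys it reads and writes, the accumulator can be reconstructed. A minimal sketch of seeding such a bundle from the first transcript, assumed rather than taken from the source:

# Hedged sketch: the keys are inferred from the fragment above; the function name is hypothetical.
def new_merge(t):
	return {
		'strand': t['strand'],
		'begin': t['begin'],
		'end': t['end'],
		'member': [t],
		'names': [t['transcript_id']],
	}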
Example #5
outfile = {{o.outfile | quote}}
notfound = {{args.notfound | quote}}
genecol = {{args.genecol or 0 | repr}}
inopts = {{args.inopts | repr}}
refgene = {{args.refgene | quote}}

if not path.isfile(refgene):
    raise OSError('Refgene file does not exist: {}'.format(refgene))

# get genes
genes = TsvReader(infile, **inopts).dump(genecol)
genes = dict(zip(genes, [False] * len(genes)))
writer = TsvWriter(outfile)
writer.cnames = ['CHR', 'START', 'END', 'NAME', 'SCORE', 'STRAND']

gff = Gff(refgene)
for g in gff:
    attrs = g['attributes']
    if attrs['gene_id'] not in genes:
        continue
    r = TsvRecord()
    r.CHR = g['seqid']
    r.START = g['start']
    r.END = g['end']
    r.SCORE = g['score']
    r.STRAND = g['strand']
    r.NAME = attrs['gene_id']
    writer.write(r)
writer.close()

for g, v in genes.items():