def get(record):
    """Return the joined descriptions indexed under ``record``'s join field.

    The value of the attribute named by ``join_field`` is read from
    ``record`` and used as a key into ``index``; the entry found there is
    handed to ``annotation_tools.join_descriptions`` together with '/'.
    """
    key = getattr(record, join_field)
    descriptions = index[key]
    return annotation_tools.join_descriptions(descriptions, '/')
def get(record):
    """Return the joined descriptions indexed under ``record``'s join field.

    The value of the attribute named by ``join_field`` is read from
    ``record`` and used as a key into ``index``; the entry found there is
    handed to ``annotation_tools.join_descriptions`` together with '/'.
    """
    key = getattr(record, join_field)
    descriptions = index[key]
    return annotation_tools.join_descriptions(descriptions, '/')
def run(self):
    """Build a reference directory from a UCSC gene table and genome.

    Every row of the table ``self.table`` becomes an mRNA feature with
    exon and (where the exon overlaps the coding region) CDS children.
    mRNAs placed in the same group by ``_grouped_features`` share one
    parent gene feature.  The features are written as GFF3 beside the
    files unpacked from ``chromFa.tar.gz`` and everything is passed to
    ``Make_tt_reference``.

    When ``self.download`` is true the sequence archive is first fetched
    with rsync from the UCSC download server.

    Raises AssertionError (via ``assert``) if ``self.ucsc_name`` is empty.
    """
    assert self.ucsc_name, 'Need a UCSC genome name'

    workdir = _ucsc_scratch(self)

    # ---- Load annotations ----

    source = 'tt-ucsc-%s-%s' % (self.ucsc_name, self.table)

    rows = workdir.get_table(self.table)
    name_of = workdir.getter(self.name)
    product_of = workdir.getter(self.product)

    strand_codes = {'+': 1, '-': -1}

    def make_feature(kind, seqid, strand, start, end, attr):
        # Every feature emitted by this run carries the same source tag.
        return annotation.Annotation(
            source=source,
            type=kind,
            seqid=seqid,
            strand=strand,
            start=start,
            end=end,
            attr=attr,
        )

    # One mRNA feature per table row; the raw row is kept on the feature
    # so the exon/CDS columns can be read later.
    transcripts = []
    for row in rows:
        transcript = make_feature(
            'mRNA',
            row.chrom,
            strand_codes[row.strand],
            int(row.txStart),
            int(row.txEnd),
            {
                'ID': row.name,
                'Name': name_of(row),
                'Product': product_of(row),
                #'UCSC_name2' : row.name2,
            },
        )
        transcript.record = row
        transcripts.append(transcript)

    _uniquify_ids(transcripts)

    features = []
    for members in _grouped_features(transcripts):
        # The gene ID is assembled from the mRNA IDs *before* those IDs
        # receive their '-mRNA' suffix.
        gene_id = '/'.join(member.attr['ID'] for member in members)
        for member in members:
            member.attr['Parent'] = gene_id
            member.attr['ID'] = member.attr['ID'] + '-mRNA'

        first = members[0]
        features.append(make_feature(
            'gene',
            first.seqid,
            first.strand,
            min(member.start for member in members),
            max(member.end for member in members),
            {
                'ID': gene_id,
                'Name': annotation_tools.join_descriptions(
                    [member.attr['Name'] for member in members], '/'),
                'Product': annotation_tools.join_descriptions(
                    [member.attr['Product'] for member in members], '/'),
                #'UCSC_name2' : annotation_tools.join_descriptions([ member.attr['UCSC_name2'] for member in members ], '/'),
            },
        ))

        # Output order: gene, then each mRNA followed by its exons, with a
        # CDS directly after any exon that overlaps the coding region.
        for member in members:
            features.append(member)

            record = member.record
            exon_starts = _parse_ints(record.exonStarts)
            exon_ends = _parse_ints(record.exonEnds)
            cds_start = int(record.cdsStart)
            cds_end = int(record.cdsEnd)

            for exon_start, exon_end in zip(exon_starts, exon_ends):
                features.append(make_feature(
                    'exon',
                    member.seqid,
                    member.strand,
                    exon_start,
                    exon_end,
                    {'Parent': member.attr['ID']},
                ))

                # Clip the coding region to this exon; skip when empty.
                coding_start = max(cds_start, exon_start)
                coding_end = min(cds_end, exon_end)
                if coding_start < coding_end:
                    features.append(make_feature(
                        'CDS',
                        member.seqid,
                        member.strand,
                        coding_start,
                        coding_end,
                        #TODO: phase
                        {'Parent': member.attr['ID']},
                    ))

    # ---- Load sequence ----

    archive = workdir.ucsc / 'chromFa.tar.gz'

    if self.download:
        remote = ('rsync://hgdownload.cse.ucsc.edu/goldenPath/'
                  + self.ucsc_name
                  + '/bigZips/chromFa.tar.gz')
        io.execute(['rsync', '-P', remote, archive])

    with workspace.tempspace() as temp:
        io.execute(['tar', '-C', temp.working_dir, '-zxf', archive])

        # Listed before the GFF is written, so only unpacked files appear.
        unpacked = natural_sorted(os.listdir(temp.working_dir))
        sequences = [temp / filename for filename in unpacked]

        gff_path = temp / 'reference.gff'
        with open(gff_path, 'wb') as gff_file:
            annotation.write_gff3_header(gff_file)
            for entry in features:
                print >> gff_file, entry.as_gff()

        builder = Make_tt_reference(
            self.output_dir,
            filenames=sequences + [gff_path],
            index=self.index,
        )
        builder.run()
def run(self):
    """Build a reference directory from a UCSC gene table and genome.

    Every row of the table ``self.table`` becomes an mRNA feature with
    exon and (where the exon overlaps the coding region) CDS children.
    mRNAs placed in the same group by ``_grouped_features`` share one
    parent gene feature.  The features are written as GFF3 beside the
    files unpacked from ``chromFa.tar.gz`` and everything is passed to
    ``Make_tt_reference``.

    When ``self.download`` is true the sequence archive is first fetched
    with rsync from the UCSC download server.

    Raises AssertionError (via ``assert``) if ``self.ucsc_name`` is empty.
    """
    assert self.ucsc_name, 'Need a UCSC genome name'

    workdir = _ucsc_scratch(self)

    # ---- Load annotations ----

    source = 'tt-ucsc-%s-%s' % (self.ucsc_name, self.table)

    rows = workdir.get_table(self.table)
    name_of = workdir.getter(self.name)
    product_of = workdir.getter(self.product)

    strand_codes = {'+': 1, '-': -1}

    def make_feature(kind, seqid, strand, start, end, attr):
        # Every feature emitted by this run carries the same source tag.
        return annotation.Annotation(
            source=source,
            type=kind,
            seqid=seqid,
            strand=strand,
            start=start,
            end=end,
            attr=attr,
        )

    # One mRNA feature per table row; the raw row is kept on the feature
    # so the exon/CDS columns can be read later.
    transcripts = []
    for row in rows:
        transcript = make_feature(
            'mRNA',
            row.chrom,
            strand_codes[row.strand],
            int(row.txStart),
            int(row.txEnd),
            {
                'ID': row.name,
                'Name': name_of(row),
                'Product': product_of(row),
                #'UCSC_name2' : row.name2,
            },
        )
        transcript.record = row
        transcripts.append(transcript)

    _uniquify_ids(transcripts)

    features = []
    for members in _grouped_features(transcripts):
        # The gene ID is assembled from the mRNA IDs *before* those IDs
        # receive their '-mRNA' suffix.
        gene_id = '/'.join(member.attr['ID'] for member in members)
        for member in members:
            member.attr['Parent'] = gene_id
            member.attr['ID'] = member.attr['ID'] + '-mRNA'

        first = members[0]
        features.append(make_feature(
            'gene',
            first.seqid,
            first.strand,
            min(member.start for member in members),
            max(member.end for member in members),
            {
                'ID': gene_id,
                'Name': annotation_tools.join_descriptions(
                    [member.attr['Name'] for member in members], '/'),
                'Product': annotation_tools.join_descriptions(
                    [member.attr['Product'] for member in members], '/'),
                #'UCSC_name2' : annotation_tools.join_descriptions([ member.attr['UCSC_name2'] for member in members ], '/'),
            },
        ))

        # Output order: gene, then each mRNA followed by its exons, with a
        # CDS directly after any exon that overlaps the coding region.
        for member in members:
            features.append(member)

            record = member.record
            exon_starts = _parse_ints(record.exonStarts)
            exon_ends = _parse_ints(record.exonEnds)
            cds_start = int(record.cdsStart)
            cds_end = int(record.cdsEnd)

            for exon_start, exon_end in zip(exon_starts, exon_ends):
                features.append(make_feature(
                    'exon',
                    member.seqid,
                    member.strand,
                    exon_start,
                    exon_end,
                    {'Parent': member.attr['ID']},
                ))

                # Clip the coding region to this exon; skip when empty.
                coding_start = max(cds_start, exon_start)
                coding_end = min(cds_end, exon_end)
                if coding_start < coding_end:
                    features.append(make_feature(
                        'CDS',
                        member.seqid,
                        member.strand,
                        coding_start,
                        coding_end,
                        #TODO: phase
                        {'Parent': member.attr['ID']},
                    ))

    # ---- Load sequence ----

    archive = workdir.ucsc / 'chromFa.tar.gz'

    if self.download:
        remote = ('rsync://hgdownload.cse.ucsc.edu/goldenPath/'
                  + self.ucsc_name
                  + '/bigZips/chromFa.tar.gz')
        io.execute(['rsync', '-P', remote, archive])

    with workspace.tempspace() as temp:
        io.execute(['tar', '-C', temp.working_dir, '-zxf', archive])

        # Listed before the GFF is written, so only unpacked files appear.
        unpacked = natural_sorted(os.listdir(temp.working_dir))
        sequences = [temp / filename for filename in unpacked]

        gff_path = temp / 'reference.gff'
        with open(gff_path, 'wb') as gff_file:
            annotation.write_gff3_header(gff_file)
            for entry in features:
                print >> gff_file, entry.as_gff()

        builder = Make_tt_reference(
            self.output_dir,
            filenames=sequences + [gff_path],
            index=self.index,
        )
        builder.run()