def dominate(genes, log, radius=20):
    """
    Reduce overlaps between exons in different genes.

    Priority is given to ensembl+havana then havana then ensembl (as ranked
    by source_levels), and then to the furthest upstrand-ending transcript
    (differences less than radius are ignored), then to protein_coding genes.

    genes  -- re-iterable collection of gene annotations whose children are
              transcripts, which in turn have exon children.
    log    -- accepted for interface compatibility; not used here.
    radius -- tolerance when comparing transcript ends, also passed on to
              ablate().

    Returns a list of gene copies. Each copy holds copies of those
    transcripts that still have at least one exon after ablation; genes
    left without transcripts are dropped.

    Side effect: every transcript of the input genes gets a .parent
    attribute pointing at its gene.
    """
    def exons_of(feature):
        return [child for child in feature.children if child.type == "exon"]

    # Give each transcript a back-reference to its gene and index them all.
    all_transcripts = []
    for gene in genes:
        for transcript in gene.children:
            transcript.parent = gene
        all_transcripts.extend(gene.children)
    overlap_index = span_index.index_annotations(all_transcripts)

    dominated = []
    for gene in genes:
        trimmed_gene = gene.copy()
        trimmed_gene.children = []

        for transcript in gene.children:
            kept_exons = exons_of(transcript)

            for rival in overlap_index.get(transcript, same_strand=True):
                # Transcripts of the same gene never dominate each other.
                if rival.parent == transcript.parent:
                    continue

                # How much further downstrand the rival ends than we do.
                if rival.strand > 0:
                    offset = rival.end - transcript.end
                else:
                    offset = transcript.start - rival.start

                # Shrink the offset towards zero by radius, so that small
                # differences in end position count as a tie.
                if offset < 0:
                    end_penalty = min(offset + radius, 0)
                else:
                    end_penalty = max(0, offset - radius)

                # Scores compare lexicographically; the lower score wins.
                rival_score = (
                    source_levels.get(rival.source, 0),
                    end_penalty,
                    rival.attr["Biotype"] != "protein_coding",
                )
                own_score = (
                    source_levels.get(transcript.source, 0),
                    0,
                    transcript.attr["Biotype"] != "protein_coding",
                )

                if rival_score < own_score:
                    kept_exons, _ = ablate(kept_exons, exons_of(rival), radius)

            if kept_exons:
                trimmed = transcript.copy()
                non_exons = [
                    child for child in transcript.children
                    if child.type != "exon"
                ]
                trimmed.children = non_exons + kept_exons
                trimmed_gene.children.append(trimmed)

        if trimmed_gene.children:
            dominated.append(trimmed_gene)

    return dominated
def load_analysis(dirname):
    """
    Build an Analysis object from the directory dirname.

    Loads <dirname>/peaks/relation-child.gff via index(), replacing every
    item by its three_prime() form, and stores the result as .peaks. A span
    index over the values of .peaks is stored as .peak_index.
    """
    def to_three_prime(item):
        return item.three_prime()

    peaks_filename = join(dirname, 'peaks', 'relation-child.gff')

    analysis = Analysis()
    analysis.peaks = index(peaks_filename, modify=to_three_prime)
    analysis.peak_index = span_index.index_annotations(
        analysis.peaks.itervalues())
    return analysis
def run(self):
    """
    Extract UTR spans (features of type self.what) from the GFF file
    self.features and write them, together with the supporting transcripts,
    exons and CDS features, into the workspace.

    Reads self.features, self.support, self.gene_level and self.what.

    Files written: utr.gff, utr_extended.gff, utr_part.gff, transcript.gff,
    exon.gff and cds.gff, each with the header copied from the input file.
    """
    workspace = self.get_workspace()

    # Copy the leading '#' lines of the input to use as the output header,
    # and pick up sequence lengths from ##sequence-region lines.
    header_lines = ["##gff-version 3\n"]
    lengths = {}
    with io.open_possibly_compressed_file(self.features) as f:
        # The first line is skipped unconditionally
        # (presumably the ##gff-version line -- confirm).
        next(f)
        for line in f:
            if not line.startswith("#"):
                break
            if line.startswith("##gff-version"):
                continue
            header_lines.append(line)
            fields = line.strip().split()
            if fields[0] == "##sequence-region":
                lengths[fields[1]] = int(fields[3])
    header = "".join(header_lines)

    items = list(annotation.read_gff(self.features, "/"))
    annotation.link_up_annotations(items)
    for item in items:
        assert len(item.parents) < 2
        # Keep only the second colon-separated field of ID and Parent
        # (presumably "type:identifier" style values -- confirm).
        for key in ("ID", "Parent"):
            if key in item.attr:
                item.attr[key] = item.attr[key].split(":")[1]
        if item.parents:
            item.parent = item.parents[0]

    def well_supported(feature):
        # No threshold configured: everything is accepted.
        if self.support is None:
            return True
        level = feature.attr.get("transcript_support_level", "NA").split()[0]
        return level.isdigit() and int(level) <= self.support

    supported_exons = [
        item for item in items
        if item.type == "exon" and well_supported(item.parent)
    ]
    exon_index = span_index.index_annotations(supported_exons)

    utrs = []
    extended_utrs = []
    utr_parts = []
    exons_kept = []
    cds_kept = []
    transcripts_kept = []

    for item in items:
        own_exons = [child for child in item.children if child.type == "exon"]
        if own_exons and well_supported(item):
            transcripts_kept.append(item)
            exons_kept.extend(own_exons)
            cds_kept.extend(
                child for child in item.children if child.type == "CDS")

        if self.gene_level:
            # Pool the pieces from all well supported children.
            utr_bits = [
                bit
                for transcript in item.children if well_supported(transcript)
                for bit in transcript.children if bit.type == self.what
            ]
        elif well_supported(item):
            utr_bits = [
                child for child in item.children if child.type == self.what
            ]
        else:
            continue

        if not utr_bits:
            continue

        # One span covering all the pieces.
        utr = utr_bits[0].copy()
        for bit in utr_bits[1:]:
            utr = utr.span_with(bit)

        gene = item if self.gene_level else item.parent
        utr.attr = dict(
            ID=item.get_id(),
            Name=item.attr["Name"],
            gene_id=gene.get_id(),
            gene=gene.attr["Name"],
            description=gene.attr.get("description", ""),
            biotype=item.attr["biotype"],
        )

        # Never extend past the end of the sequence, nor more than 10000.
        if item.strand < 0:
            room = utr.start
        else:
            room = lengths[utr.seqid] - utr.end
        max_extension = min(10000, room)
        assert max_extension >= 0, utr

        # Stop the extension at the first same-strand exon downstrand of
        # the 3' end.
        end = utr.three_prime()
        window = end.shifted(0, max_extension)
        for hit in exon_index.get(window, same_strand=True):
            rel = hit.relative_to(end).start
            if rel >= 0:
                max_extension = min(max_extension, rel)

        extended_utr = utr.shifted(0, max_extension)
        extended_utr.start = max(extended_utr.start, 0)
        utr.attr["max_extension"] = str(max_extension)
        utrs.append(utr)
        extended_utrs.append(extended_utr)

        for bit in utr_bits:
            part = bit.copy()
            part.attr = dict(Parent=item.get_id())
            part.type = "part"
            utr_parts.append(part)

    outputs = [
        ("utr.gff", utrs),
        ("utr_extended.gff", extended_utrs),
        ("utr_part.gff", utr_parts),
        ("transcript.gff", transcripts_kept),
        ("exon.gff", exons_kept),
        ("cds.gff", cds_kept),
    ]
    for filename, features in outputs:
        write_gff3(workspace / filename, features, header)
def merge(genes, log):
    """
    Merge genes that share overlapping same-strand exons.

    genes -- iterable of gene annotations. Everything returned by
             all_within(gene) must share the gene's seqid and strand,
             otherwise the gene is skipped and a message is logged.
    log   -- object with a log(text) method, used for progress messages.

    Returns a list with one new gene annotation per group of connected
    genes. The merged gene spans all its members, takes seqid, strand,
    source and type from one member, and combines attribute values with
    join_up(). Its children are copies of the member genes' transcripts,
    re-parented to the merged gene but sharing the original child lists.

    A "Merged gene:" line is printed to standard output for each group of
    two or more genes.
    """
    good_genes = []
    exons = []
    exon_gene = {}
    for gene in genes:
        members = all_within(gene)
        consistent = all(
            item.seqid == gene.seqid and item.strand == gene.strand
            for item in members
        )
        if not consistent:
            log.log("Skipping gene "+str(gene)+" due to inconsistent chromosome or strand.\n")
            continue

        gene_exons = [item for item in members if item.type == "exon"]
        good_genes.append(gene)
        for exon in gene_exons:
            exon_gene[exon] = gene
        exons.extend(gene_exons)

    # Join any two genes that have overlapping exons on the same strand.
    index = span_index.index_annotations(exons)
    union = Union_find(good_genes)
    for exon in exons:
        gene1 = exon_gene[exon]
        for hit in index.get(exon, same_strand=True):
            gene2 = exon_gene[hit]
            if gene1 != gene2:
                union.join(gene1, gene2)

    sets = union.get_sets()

    # Report how many groups of each size were produced.
    counts = collections.defaultdict(int)
    for gene_set in sets:
        counts[len(gene_set)] += 1
    for n in sorted(counts):
        log.log("Merging produced %d sets of %d genes\n" % (counts[n], n))

    result = []
    for gene_set in sets:
        gene_list = list(gene_set)
        first = gene_list[0]
        seqid = first.seqid
        strand = first.strand
        source = first.source
        feature_type = first.type
        for gene in gene_set:
            assert gene.seqid == seqid
            assert gene.strand == strand
        start = min(gene.start for gene in gene_list)
        end = max(gene.end for gene in gene_list)

        # Collect the distinct values of every attribute across the group.
        collected = collections.defaultdict(set)
        for gene in gene_set:
            for name in gene.attr:
                collected[name].add(gene.attr[name])
        attr = dict(
            (name, join_up(values, brief=(name == "Name")))
            for name, values in collected.items()
        )

        if len(gene_set) >= 2:
            # Single parenthesised argument: prints the same text under
            # Python 2's print statement and Python 3's print function.
            print("Merged gene: %s %s" % (
                attr.get("Name", ""), attr.get("Biotype", "")))

        merged_gene = annotation.Annotation(
            seqid=seqid,
            source=source,
            type=feature_type,
            start=start,
            end=end,
            strand=strand,
            attr=attr,
        )
        merged_gene.children = []
        for gene in gene_list:
            for transcript in gene.children:
                new_transcript = transcript.copy()
                new_transcript.children = transcript.children
                new_transcript.attr["Parent"] = merged_gene.get_id()
                merged_gene.children.append(new_transcript)
        result.append(merged_gene)

    return result
def utr_index(self):
    """Return a span index built over the values of self.utrs."""
    utr_annotations = self.utrs.values()
    return span_index.index_annotations(utr_annotations)
def run(self):
    """
    Assign alignments to annotated features and pickle the result.

    Features whose type is listed in self.types are the "primary"
    annotations. Their descendants (or the features themselves) whose type
    is listed in self.parts (defaulting to self.types) are the "parts" that
    alignments are matched against. Each part is lengthened in the
    downstrand direction by self.extension, capped by the part's own
    "max_extension" attribute when it has one.

    Every mapped, primary alignment in alignments_filtered_sorted.bam is
    given to the overlapping same-strand part whose original 3' position is
    nearest to the alignment's 3' end. Its (tail_length, adaptor_bases)
    pair, read from the AN:i: and AD:i: tags, is appended to the .hits list
    of that part's primary annotation.

    Output: self.prefix + '.pickle.gz', containing the tuple
    (workspace name, workspace tags, primary annotations).

    Raises AssertionError if self.extension is None, or if a part is
    reached a second time during the traversal ("Feature with multiple
    parents").
    """
    assert self.extension is not None, '--extension must be specified'

    #workspace = self.get_workspace()
    workspace = working_directory.Working(self.working_dir, must_exist=True)
    if self.annotations is None:
        reference = workspace.get_reference()
        annotations_filename = reference.annotations_filename()
    else:
        annotations_filename = self.annotations

    types = [item.lower() for item in self.types.split(',')]
    parts = self.parts or self.types
    parts = [item.lower() for item in parts.split(',')]

    all_annotations = list(annotation.read_annotations(annotations_filename))
    annotation.link_up_annotations(all_annotations)
    for item in all_annotations:
        item.primary = None

    annotations = [
        item for item in all_annotations if item.type.lower() in types
    ]

    # Walk down from each primary annotation collecting its parts.
    part_annotations = []
    seen = set()
    queue = [(item, item) for item in annotations]
    while queue:
        primary, item = queue.pop()
        if item.type.lower() in parts:
            assert item.primary is None, "Feature with multiple parents"
            item.primary = primary
            key = (id(primary), item.start, item.end, item.seqid, item.strand)
            # Ignore duplicate exons (many isoforms will have the same exons)
            if key not in seen:
                seen.add(key)
                part_annotations.append(item)
        queue.extend((primary, child) for child in item.children)

    del seen
    del all_annotations

    self.log.log('%d annotations\n' % len(annotations))
    self.log.log('%d part annotations\n' % len(part_annotations))

    #assert annotations, 'No annotations of specified types in file'

    # Remember each part's 3' position, then lengthen it downstrand.
    for item in part_annotations:
        this_extension = self.extension
        if "max_extension" in item.attr:
            this_extension = min(
                this_extension, int(item.attr["max_extension"]))

        if item.strand >= 0:
            item.tail_pos = item.end
            item.end += this_extension
        else:
            item.tail_pos = item.start
            item.start -= this_extension

    for item in annotations:
        item.hits = []  # [ (tail_length, adaptor_bases) ]

    index = span_index.index_annotations(part_annotations)

    for alignment in sam.Bam_reader(workspace/'alignments_filtered_sorted.bam'):
        if (alignment.is_unmapped or alignment.is_secondary
                or alignment.is_supplementary):
            continue

        start = alignment.reference_start
        end = alignment.reference_end
        strand = -1 if alignment.flag & sam.FLAG_REVERSE else 1
        fragment_feature = annotation.Annotation(
            seqid=alignment.reference_name,
            start=start,
            end=end,
            strand=strand,
        )

        # 3' end of the alignment on its own strand.
        tail_pos = end if strand >= 0 else start

        tail_length = 0
        adaptor_bases = 0
        for tag in alignment.extra:
            if tag.startswith('AN:i:'):
                tail_length = int(tag[5:])
            elif tag.startswith('AD:i:'):
                adaptor_bases = int(tag[5:])

        hits = index.get(fragment_feature, same_strand=True)
        if hits:
            # Nearest by tail_pos
            # failing that, by id to ensure a deterministic choice
            nearest = min(
                hits,
                key=lambda part: (abs(tail_pos - part.tail_pos),
                                  part.primary.get_id()))
            nearest.primary.hits.append((tail_length, adaptor_bases))

    # Drop the links between annotations before pickling.
    for item in annotations:
        del item.parents
        del item.children
        del item.primary

    f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
    try:
        pickle.dump(
            (workspace.name, workspace.get_tags(), annotations),
            f, pickle.HIGHEST_PROTOCOL)
    finally:
        # Close the writer even if pickling fails.
        f.close()
def gene_index(self):
    """Return a span index built over the values of self.genes."""
    gene_annotations = self.genes.values()
    return span_index.index_annotations(gene_annotations)
def peak_index(self):
    """Return a span index built over the values of self.peaks."""
    peak_annotations = self.peaks.itervalues()
    return span_index.index_annotations(peak_annotations)
def run(self):
    """
    Build the reference directory and add 3' UTR annotations to it.

    Runs nesoni.Make_reference on self.filenames, then re-reads the
    resulting reference.gff and:

    - sets each gene's span to cover its mRNAs, and colours it;
    - stores a "max_extension" attribute on genes, mRNAs and exons (every
      exon except the last listed one of an mRNA gets "0");
    - creates a three_prime_utr feature per mRNA that has exons and CDS,
      and one per gene.

    reference.gff is rewritten with the mRNA-level UTRs appended, and the
    gene-level UTRs go to utr.gff. The workspace parameter
    tail_tools_reference_version is removed at the start and set again
    only once everything has been written.
    """
    work = self.get_workspace()
    work.update_param(remove=['tail_tools_reference_version'])

    nesoni.Make_reference(
        self.output_dir,
        filenames=self.filenames,
        snpeff=False,
        cs='ifavailable' if self.index else False,
        ls=False,
        bowtie='ifavailable' if self.index else False,
    ).run()

    annotations = list(annotation.read_annotations(work / 'reference.gff'))
    annotation.link_up_annotations(annotations)

    exon_index = span_index.index_annotations(
        [item for item in annotations if item.type == "exon"])
    mrna_end_index = span_index.index_annotations(
        [item.three_prime() for item in annotations if item.type == "mRNA"])

    def extension_of(feature):
        return _max_extension(feature, exon_index, mrna_end_index)

    def three_prime_utr(parent, start, end):
        # New UTR feature that is a child of parent, with ID <parent ID>-3UTR.
        attr = parent.attr.copy()
        attr['Parent'] = attr['ID']
        attr['ID'] = attr['ID'] + '-3UTR'
        attr['color'] = '#008888'
        return annotation.Annotation(
            source='tt',
            type='three_prime_utr',
            seqid=parent.seqid,
            strand=parent.strand,
            start=start,
            end=end,
            attr=attr,
        )

    mrna_utrs = []
    gene_utrs = []

    for gene in annotations:
        if gene.type != 'gene':
            continue

        mrnas = [child for child in gene.children if child.type == 'mRNA']
        assert mrnas, "Gene without any mRNAs: " + gene.get_id()

        gene.attr['color'] = '#880088'
        gene.start = min(mrna.start for mrna in mrnas)
        gene.end = max(mrna.end for mrna in mrnas)
        gene.attr["max_extension"] = str(extension_of(gene))

        gene_utr_5primes = []

        for mrna in mrnas:
            assert mrna.strand == gene.strand, mrna
            assert mrna.seqid == gene.seqid, mrna

            mrna.attr["max_extension"] = str(extension_of(mrna))

            cdss = [child for child in mrna.children if child.type == 'CDS']
            exons = [child for child in mrna.children if child.type == 'exon']

            if not exons:
                continue

            # link up annotations sorts children, so final is really final
            # NOTE(review): this treats the last listed exon as the 3' one
            # on both strands -- confirm the sort order for reverse strand.
            for exon in exons[:-1]:
                exon.attr["max_extension"] = "0"
            exons[-1].attr["max_extension"] = mrna.attr["max_extension"]

            if not cdss:
                continue

            # Candidate 5' ends of the UTR: for every exon reaching the 3'
            # end of the CDS or lying beyond it, the later of the exon's
            # own 5' edge and the CDS 3' end.
            if gene.strand >= 0:
                cds_3prime = max(cds.end for cds in cdss)
                mrna_utr_5primes = [
                    max(exon.start, cds_3prime)
                    for exon in exons if exon.end >= cds_3prime
                ]
            else:
                cds_3prime = min(cds.start for cds in cdss)
                mrna_utr_5primes = [
                    min(exon.end, cds_3prime)
                    for exon in exons if exon.start <= cds_3prime
                ]

            # The UTR is always at least one base long.
            if mrna.strand >= 0:
                if mrna_utr_5primes:
                    utr_start = min(mrna_utr_5primes)
                else:
                    utr_start = mrna.end
                utr_end = max(utr_start + 1, mrna.end)
                gene_utr_5primes.append(utr_start)
            else:
                if mrna_utr_5primes:
                    utr_end = max(mrna_utr_5primes)
                else:
                    utr_end = mrna.start
                utr_start = min(mrna.start, utr_end - 1)
                gene_utr_5primes.append(utr_end)

            utr = three_prime_utr(mrna, utr_start, utr_end)
            max_ext = extension_of(utr)
            utr.attr["max_extension"] = str(max_ext)

            # Only include if there is an annotated 3' UTR or end is not
            # in the middle of some other isoform's exon
            if utr_end - utr_start + max_ext > 1:
                mrna_utrs.append(utr)

        # The gene-level UTR starts at the most downstrand 5' end recorded
        # for the gene's mRNAs.
        if gene.strand >= 0:
            if gene_utr_5primes:
                utr_start = max(gene_utr_5primes)
            else:
                utr_start = gene.end
            utr_end = max(utr_start + 1, gene.end)
        else:
            if gene_utr_5primes:
                utr_end = min(gene_utr_5primes)
            else:
                utr_end = gene.start
            utr_start = min(gene.start, utr_end - 1)

        utr = three_prime_utr(gene, utr_start, utr_end)
        utr.attr["max_extension"] = str(extension_of(utr))
        gene_utrs.append(utr)

    annotation.write_gff3(work / 'reference.gff', annotations + mrna_utrs)
    annotation.write_gff3(work / 'utr.gff', gene_utrs)

    work.update_param(tail_tools_reference_version=work.VERSION)
def run(self):
    """
    Relate peaks (read from self.child) to genes (read from self.parent).

    Peaks whose "mean_tail" attribute (default "0.0") is below
    self.min_tail are discarded. For each remaining peak, the base just
    before its 3' end is looked up, in order, against: extended 3' UTRs,
    exons, the extended region downstrand of genes, gene bodies, and
    finally gene bodies on the opposite strand. The first lookup that
    finds anything sets the peak's Parent, Relation, Name, Product and
    Biotype attributes. Opposite-strand gene hits are also recorded in the
    Antisense_* attributes whenever there are any.

    Writes self.prefix + "-parent.gff" (the genes) and
    self.prefix + "-child.gff" (the peaks).
    """
    items = list(annotation.read_annotations(self.parent))
    annotation.link_up_annotations(items)
    for item in items:
        assert len(item.parents) <= 1

    genes = [item for item in items if item.type == "gene"]
    downstrand_genes = [
        _extend(_three_prime(gene), self.extension) for gene in genes
    ]
    exons = [item for item in items if item.type == "exon"]
    utrs = [
        _extend(item, self.extension)
        for item in items if item.type == "three_prime_utr"
    ]

    gene_index = span_index.index_annotations(genes)
    downstrand_gene_index = span_index.index_annotations(downstrand_genes)
    exon_index = span_index.index_annotations(exons)
    utr_index = span_index.index_annotations(utrs)

    def owning_genes(found):
        # Two levels up from each feature: its parent's parent.
        return [feature.parents[0].parents[0] for feature in found]

    def ids_of(found):
        return join_descriptions([gene.get_id() for gene in found], ",")

    def described(found, field):
        return join_descriptions([gene.attr.get(field, "") for gene in found])

    peaks = [
        peak for peak in annotation.read_annotations(self.child)
        if not (float(peak.attr.get("mean_tail", "0.0")) < self.min_tail)
    ]

    for peak in peaks:
        # Query is final base in genome before poly(A) starts
        query = peak.three_prime().shifted(-1, 0)

        relation = "3'UTR"
        hits = owning_genes(utr_index.get(query, True))

        # For non-coding RNAs, which don't have a 3' UTR
        if not hits:
            relation = "Exon"
            hits = owning_genes(exon_index.get(query, True))

        if not hits:
            relation = "Downstrand"
            hits = downstrand_gene_index.get(query, True)

        if not hits:
            relation = "Intron"
            hits = gene_index.get(query, True)

        antisense_hits = gene_index.get(query.reversed(), True)
        if not hits:
            relation = "Antisense"
            hits = antisense_hits

        if hits:
            peak.attr["Parent"] = ids_of(hits)
            peak.attr["Relation"] = relation
            peak.attr["Name"] = described(hits, "Name")
            peak.attr["Product"] = relation + " " + described(hits, "Product")
            peak.attr["Biotype"] = described(hits, "Biotype")

        if antisense_hits:
            peak.attr["Antisense_parent"] = ids_of(antisense_hits)
            peak.attr["Antisense_name"] = described(antisense_hits, "Name")
            peak.attr["Antisense_product"] = (
                "Antisense " + described(antisense_hits, "Product"))
            peak.attr["Antisense_biotype"] = described(
                antisense_hits, "Biotype")

    # NOTE(review): the original author marked this write with "Hmm";
    # only gene features go into the parent file -- confirm this is intended.
    annotation.write_gff3(self.prefix+"-parent.gff", genes)
    annotation.write_gff3(self.prefix+"-child.gff", peaks)
def run(self):
    """
    Assign alignments to annotated features and pickle the result.

    Features whose type is listed in self.types are the "primary"
    annotations. Their descendants (or the features themselves) whose type
    is listed in self.parts (defaulting to self.types) are the "parts" that
    alignments are matched against. Each part is lengthened in the
    downstrand direction by self.extension, capped by the part's own
    "max_extension" attribute when it has one.

    Every mapped, primary alignment in alignments_filtered_sorted.bam is
    given to the overlapping same-strand part whose original 3' position is
    nearest to the alignment's 3' end. Its (tail_length, adaptor_bases)
    pair, read from the AN:i: and AD:i: tags, is appended to the .hits list
    of that part's primary annotation.

    Output: self.prefix + '.pickle.gz', containing the tuple
    (workspace name, workspace tags, primary annotations).

    Raises AssertionError if self.extension is None, or if a part is
    reached a second time during the traversal ("Feature with multiple
    parents").
    """
    assert self.extension is not None, '--extension must be specified'

    #workspace = self.get_workspace()
    workspace = working_directory.Working(self.working_dir, must_exist=True)
    if self.annotations is None:
        reference = workspace.get_reference()
        annotations_filename = reference.annotations_filename()
    else:
        annotations_filename = self.annotations

    types = [item.lower() for item in self.types.split(',')]
    parts = self.parts or self.types
    parts = [item.lower() for item in parts.split(',')]

    all_annotations = list(annotation.read_annotations(annotations_filename))
    annotation.link_up_annotations(all_annotations)
    for item in all_annotations:
        item.primary = None

    annotations = [
        item for item in all_annotations if item.type.lower() in types
    ]

    # Walk down from each primary annotation collecting its parts.
    part_annotations = []
    seen = set()
    queue = [(item, item) for item in annotations]
    while queue:
        primary, item = queue.pop()
        if item.type.lower() in parts:
            assert item.primary is None, "Feature with multiple parents"
            item.primary = primary
            key = (id(primary), item.start, item.end, item.seqid, item.strand)
            # Ignore duplicate exons (many isoforms will have the same exons)
            if key not in seen:
                seen.add(key)
                part_annotations.append(item)
        queue.extend((primary, child) for child in item.children)

    del seen
    del all_annotations

    self.log.log('%d annotations\n' % len(annotations))
    self.log.log('%d part annotations\n' % len(part_annotations))

    #assert annotations, 'No annotations of specified types in file'

    # Remember each part's 3' position, then lengthen it downstrand.
    for item in part_annotations:
        this_extension = self.extension
        if "max_extension" in item.attr:
            this_extension = min(
                this_extension, int(item.attr["max_extension"]))

        if item.strand >= 0:
            item.tail_pos = item.end
            item.end += this_extension
        else:
            item.tail_pos = item.start
            item.start -= this_extension

    for item in annotations:
        item.hits = []  # [ (tail_length, adaptor_bases) ]

    index = span_index.index_annotations(part_annotations)

    for alignment in sam.Bam_reader(workspace / 'alignments_filtered_sorted.bam'):
        if (alignment.is_unmapped or alignment.is_secondary
                or alignment.is_supplementary):
            continue

        start = alignment.reference_start
        end = alignment.reference_end
        strand = -1 if alignment.flag & sam.FLAG_REVERSE else 1
        fragment_feature = annotation.Annotation(
            seqid=alignment.reference_name,
            start=start,
            end=end,
            strand=strand,
        )

        # 3' end of the alignment on its own strand.
        tail_pos = end if strand >= 0 else start

        tail_length = 0
        adaptor_bases = 0
        for tag in alignment.extra:
            if tag.startswith('AN:i:'):
                tail_length = int(tag[5:])
            elif tag.startswith('AD:i:'):
                adaptor_bases = int(tag[5:])

        hits = index.get(fragment_feature, same_strand=True)
        if hits:
            # Nearest by tail_pos
            # failing that, by id to ensure a deterministic choice
            nearest = min(
                hits,
                key=lambda part: (abs(tail_pos - part.tail_pos),
                                  part.primary.get_id()))
            nearest.primary.hits.append((tail_length, adaptor_bases))

    # Drop the links between annotations before pickling.
    for item in annotations:
        del item.parents
        del item.children
        del item.primary

    f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
    try:
        pickle.dump(
            (workspace.name, workspace.get_tags(), annotations),
            f, pickle.HIGHEST_PROTOCOL)
    finally:
        # Close the writer even if pickling fails.
        f.close()
def run(self):
    """
    Build the reference directory and add 3' UTR annotations to it.

    Runs nesoni.Make_reference on self.filenames, then re-reads the
    resulting reference.gff and:

    - sets each gene's span to cover its mRNAs, and colours it;
    - stores a "max_extension" attribute on genes, mRNAs and exons (every
      exon except the last listed one of an mRNA gets "0");
    - creates a three_prime_utr feature per mRNA that has exons and CDS,
      and one per gene.

    reference.gff is rewritten with the mRNA-level UTRs appended, and the
    gene-level UTRs go to utr.gff. The workspace parameter
    tail_tools_reference_version is removed at the start and set again
    only once everything has been written.
    """
    work = self.get_workspace()
    work.update_param(remove=['tail_tools_reference_version'])

    nesoni.Make_reference(
        self.output_dir,
        filenames=self.filenames,
        snpeff=False,
        cs='ifavailable' if self.index else False,
        ls=False,
        bowtie='ifavailable' if self.index else False,
    ).run()

    annotations = list(annotation.read_annotations(work/'reference.gff'))
    annotation.link_up_annotations(annotations)

    exon_index = span_index.index_annotations(
        [item for item in annotations if item.type == "exon"])
    mrna_end_index = span_index.index_annotations(
        [item.three_prime() for item in annotations if item.type == "mRNA"])

    def extension_of(feature):
        return _max_extension(feature, exon_index, mrna_end_index)

    def three_prime_utr(parent, start, end):
        # New UTR feature that is a child of parent, with ID <parent ID>-3UTR.
        attr = parent.attr.copy()
        attr['Parent'] = attr['ID']
        attr['ID'] = attr['ID'] + '-3UTR'
        attr['color'] = '#008888'
        return annotation.Annotation(
            source='tt',
            type='three_prime_utr',
            seqid=parent.seqid,
            strand=parent.strand,
            start=start,
            end=end,
            attr=attr,
        )

    mrna_utrs = []
    gene_utrs = []

    for gene in annotations:
        if gene.type != 'gene':
            continue

        mrnas = [child for child in gene.children if child.type == 'mRNA']
        assert mrnas, "Gene without any mRNAs: "+gene.get_id()

        gene.attr['color'] = '#880088'
        gene.start = min(mrna.start for mrna in mrnas)
        gene.end = max(mrna.end for mrna in mrnas)
        gene.attr["max_extension"] = str(extension_of(gene))

        gene_utr_5primes = []

        for mrna in mrnas:
            assert mrna.strand == gene.strand, mrna
            assert mrna.seqid == gene.seqid, mrna

            mrna.attr["max_extension"] = str(extension_of(mrna))

            cdss = [child for child in mrna.children if child.type == 'CDS']
            exons = [child for child in mrna.children if child.type == 'exon']

            if not exons:
                continue

            # link up annotations sorts children, so final is really final
            # NOTE(review): this treats the last listed exon as the 3' one
            # on both strands -- confirm the sort order for reverse strand.
            for exon in exons[:-1]:
                exon.attr["max_extension"] = "0"
            exons[-1].attr["max_extension"] = mrna.attr["max_extension"]

            if not cdss:
                continue

            # Candidate 5' ends of the UTR: for every exon reaching the 3'
            # end of the CDS or lying beyond it, the later of the exon's
            # own 5' edge and the CDS 3' end.
            if gene.strand >= 0:
                cds_3prime = max(cds.end for cds in cdss)
                mrna_utr_5primes = [
                    max(exon.start, cds_3prime)
                    for exon in exons if exon.end >= cds_3prime
                ]
            else:
                cds_3prime = min(cds.start for cds in cdss)
                mrna_utr_5primes = [
                    min(exon.end, cds_3prime)
                    for exon in exons if exon.start <= cds_3prime
                ]

            # The UTR is always at least one base long.
            if mrna.strand >= 0:
                if mrna_utr_5primes:
                    utr_start = min(mrna_utr_5primes)
                else:
                    utr_start = mrna.end
                utr_end = max(utr_start+1, mrna.end)
                gene_utr_5primes.append(utr_start)
            else:
                if mrna_utr_5primes:
                    utr_end = max(mrna_utr_5primes)
                else:
                    utr_end = mrna.start
                utr_start = min(mrna.start, utr_end-1)
                gene_utr_5primes.append(utr_end)

            utr = three_prime_utr(mrna, utr_start, utr_end)
            max_ext = extension_of(utr)
            utr.attr["max_extension"] = str(max_ext)

            # Only include if there is an annotated 3' UTR or end is not
            # in the middle of some other isoform's exon
            if utr_end-utr_start+max_ext > 1:
                mrna_utrs.append(utr)

        # The gene-level UTR starts at the most downstrand 5' end recorded
        # for the gene's mRNAs.
        if gene.strand >= 0:
            if gene_utr_5primes:
                utr_start = max(gene_utr_5primes)
            else:
                utr_start = gene.end
            utr_end = max(utr_start+1, gene.end)
        else:
            if gene_utr_5primes:
                utr_end = min(gene_utr_5primes)
            else:
                utr_end = gene.start
            utr_start = min(gene.start, utr_end-1)

        utr = three_prime_utr(gene, utr_start, utr_end)
        utr.attr["max_extension"] = str(extension_of(utr))
        gene_utrs.append(utr)

    annotation.write_gff3(work/'reference.gff', annotations + mrna_utrs)
    annotation.write_gff3(work/'utr.gff', gene_utrs)

    work.update_param(tail_tools_reference_version=work.VERSION)
def run(self):
    """
    Relate peaks (read from self.child) to genes (read from self.parent).

    For each peak, the base just before its 3' end is looked up, in order,
    against: extended 3' UTRs, exons, the extended region downstrand of
    genes, gene bodies, and finally gene bodies on the opposite strand.
    The first lookup that finds anything sets the peak's Parent, Relation,
    Name, Product and Biotype attributes. Opposite-strand gene hits are
    also recorded in the Antisense_* attributes whenever there are any.

    Writes self.prefix + "-parent.gff" (the genes) and
    self.prefix + "-child.gff" (the peaks).
    """
    items = list(annotation.read_annotations(self.parent))
    annotation.link_up_annotations(items)
    for item in items:
        assert len(item.parents) <= 1

    genes = [item for item in items if item.type == "gene"]
    downstrand_genes = [
        _extend(_three_prime(gene), self.extension) for gene in genes
    ]
    exons = [item for item in items if item.type == "exon"]
    utrs = [
        _extend(item, self.extension)
        for item in items if item.type == "three_prime_utr"
    ]

    gene_index = span_index.index_annotations(genes)
    downstrand_gene_index = span_index.index_annotations(downstrand_genes)
    exon_index = span_index.index_annotations(exons)
    utr_index = span_index.index_annotations(utrs)

    def owning_genes(found):
        # Two levels up from each feature: its parent's parent.
        return [feature.parents[0].parents[0] for feature in found]

    def ids_of(found):
        return join_descriptions([gene.get_id() for gene in found], ",")

    def described(found, field):
        return join_descriptions([gene.attr.get(field, "") for gene in found])

    peaks = list(annotation.read_annotations(self.child))

    for peak in peaks:
        # Query is final base in genome before poly(A) starts
        query = peak.three_prime().shifted(-1, 0)

        relation = "3'UTR"
        hits = owning_genes(utr_index.get(query, True))

        # For non-coding RNAs, which don't have a 3' UTR
        if not hits:
            relation = "Exon"
            hits = owning_genes(exon_index.get(query, True))

        if not hits:
            relation = "Downstrand"
            hits = downstrand_gene_index.get(query, True)

        if not hits:
            relation = "Intron"
            hits = gene_index.get(query, True)

        antisense_hits = gene_index.get(query.reversed(), True)
        if not hits:
            relation = "Antisense"
            hits = antisense_hits

        if hits:
            peak.attr["Parent"] = ids_of(hits)
            peak.attr["Relation"] = relation
            peak.attr["Name"] = described(hits, "Name")
            peak.attr["Product"] = relation + " " + described(hits, "Product")
            peak.attr["Biotype"] = described(hits, "Biotype")

        if antisense_hits:
            peak.attr["Antisense_parent"] = ids_of(antisense_hits)
            peak.attr["Antisense_name"] = described(antisense_hits, "Name")
            peak.attr["Antisense_product"] = (
                "Antisense " + described(antisense_hits, "Product"))
            peak.attr["Antisense_biotype"] = described(
                antisense_hits, "Biotype")

    # NOTE(review): the original author marked this write with "Hmm";
    # only gene features go into the parent file -- confirm this is intended.
    annotation.write_gff3(self.prefix+"-parent.gff", genes)
    annotation.write_gff3(self.prefix+"-child.gff", peaks)
def run(self):
    """
    Extract UTR spans (features of type self.what) from the GFF file
    self.features and write them, together with the supporting transcripts,
    exons and CDS features, into the workspace.

    Reads self.features, self.support, self.gene_level and self.what.

    Files written: utr.gff, utr_extended.gff, utr_part.gff, transcript.gff,
    exon.gff and cds.gff, each with the header copied from the input file.
    """
    workspace = self.get_workspace()

    # Copy the leading '#' lines of the input to use as the output header,
    # and pick up sequence lengths from ##sequence-region lines.
    header_lines = ["##gff-version 3\n"]
    lengths = {}
    with io.open_possibly_compressed_file(self.features) as f:
        # The first line is skipped unconditionally
        # (presumably the ##gff-version line -- confirm).
        next(f)
        for line in f:
            if not line.startswith("#"):
                break
            if line.startswith("##gff-version"):
                continue
            header_lines.append(line)
            fields = line.strip().split()
            if fields[0] == "##sequence-region":
                lengths[fields[1]] = int(fields[3])
    header = "".join(header_lines)

    items = list(annotation.read_gff(self.features, "/"))
    annotation.link_up_annotations(items)
    for item in items:
        assert len(item.parents) < 2
        # Keep only the second colon-separated field of ID and Parent
        # (presumably "type:identifier" style values -- confirm).
        for key in ("ID", "Parent"):
            if key in item.attr:
                item.attr[key] = item.attr[key].split(":")[1]
        if item.parents:
            item.parent = item.parents[0]

    def well_supported(feature):
        # No threshold configured: everything is accepted.
        if self.support is None:
            return True
        level = feature.attr.get("transcript_support_level", "NA").split()[0]
        return level.isdigit() and int(level) <= self.support

    supported_exons = [
        item for item in items
        if item.type == "exon" and well_supported(item.parent)
    ]
    exon_index = span_index.index_annotations(supported_exons)

    utrs = []
    extended_utrs = []
    utr_parts = []
    exons_kept = []
    cds_kept = []
    transcripts_kept = []

    for item in items:
        own_exons = [child for child in item.children if child.type == "exon"]
        if own_exons and well_supported(item):
            transcripts_kept.append(item)
            exons_kept.extend(own_exons)
            cds_kept.extend(
                child for child in item.children if child.type == "CDS")

        if self.gene_level:
            # Pool the pieces from all well supported children.
            utr_bits = [
                bit
                for transcript in item.children if well_supported(transcript)
                for bit in transcript.children if bit.type == self.what
            ]
        elif well_supported(item):
            utr_bits = [
                child for child in item.children if child.type == self.what
            ]
        else:
            continue

        if not utr_bits:
            continue

        # One span covering all the pieces.
        utr = utr_bits[0].copy()
        for bit in utr_bits[1:]:
            utr = utr.span_with(bit)

        gene = item if self.gene_level else item.parent
        utr.attr = dict(
            ID=item.get_id(),
            Name=item.attr["Name"],
            gene_id=gene.get_id(),
            gene=gene.attr["Name"],
            description=gene.attr.get("description", ""),
            biotype=item.attr["biotype"],
        )

        # Never extend past the end of the sequence, nor more than 10000.
        if item.strand < 0:
            room = utr.start
        else:
            room = lengths[utr.seqid] - utr.end
        max_extension = min(10000, room)
        assert max_extension >= 0, utr

        # Stop the extension at the first same-strand exon downstrand of
        # the 3' end.
        end = utr.three_prime()
        window = end.shifted(0, max_extension)
        for hit in exon_index.get(window, same_strand=True):
            rel = hit.relative_to(end).start
            if rel >= 0:
                max_extension = min(max_extension, rel)

        extended_utr = utr.shifted(0, max_extension)
        extended_utr.start = max(extended_utr.start, 0)
        utr.attr["max_extension"] = str(max_extension)
        utrs.append(utr)
        extended_utrs.append(extended_utr)

        for bit in utr_bits:
            part = bit.copy()
            part.attr = dict(Parent=item.get_id())
            part.type = "part"
            utr_parts.append(part)

    outputs = [
        ("utr.gff", utrs),
        ("utr_extended.gff", extended_utrs),
        ("utr_part.gff", utr_parts),
        ("transcript.gff", transcripts_kept),
        ("exon.gff", exons_kept),
        ("cds.gff", cds_kept),
    ]
    for filename, features in outputs:
        write_gff3(workspace/filename, features, header)