def coding_regions(self):
    """Return a dict mapping gene ID to one "coding region" annotation.

    Annotations are read from ``reference.gff`` inside ``self.dirname`` and
    linked up.  A gene that owns CDS features gets a new ``coding_region``
    annotation running from the lowest CDS start to the highest CDS end.
    A gene without any CDS gets ``gene.three_prime()`` instead, with its
    attributes replaced by just the gene ID.
    """
    features = list(
        annotation.read_annotations(join(self.dirname, 'reference.gff')))
    annotation.link_up_annotations(features)

    # First pass: collect CDS features under their gene (CDS -> mRNA -> gene).
    # The one-element unpacking fails unless there is exactly one parent.
    coding_regions = {}
    for feature in features:
        if feature.type != "CDS":
            continue
        [mrna] = feature.parents
        [gene] = mrna.parents
        coding_regions.setdefault(gene.get_id(), []).append(feature)

    # Second pass: replace each gene's CDS list with a single annotation.
    for gene in features:
        if gene.type != "gene":
            continue
        gene_id = gene.get_id()
        if gene_id in coding_regions:
            cds_list = coding_regions[gene_id]
            first = cds_list[0]
            region = annotation.Annotation(
                type="coding_region",
                seqid=first.seqid,
                strand=first.strand,
                start=min(cds.start for cds in cds_list),
                end=max(cds.end for cds in cds_list),
                attr={"ID": gene_id},
            )
        else:
            region = gene.three_prime()
            region.attr = {"ID": gene_id}
        coding_regions[gene_id] = region

    return coding_regions
def coding_regions(self):
    """Map each gene ID to a single annotation describing its coding extent.

    The annotations come from ``reference.gff`` in ``self.dirname``.  Genes
    with CDS features are summarised by a ``coding_region`` annotation that
    covers all of their CDS features; genes with none fall back to
    ``gene.three_prime()`` carrying only an ``ID`` attribute.
    """
    gff_path = join(self.dirname, 'reference.gff')
    records = list(annotation.read_annotations(gff_path))
    annotation.link_up_annotations(records)

    coding_regions = {}

    # Gather the CDS records belonging to each gene.  Each CDS must have
    # exactly one parent, and so must that parent, or the unpacking raises.
    for record in records:
        if record.type == "CDS":
            [mrna] = record.parents
            [gene] = mrna.parents
            coding_regions.setdefault(gene.get_id(), []).append(record)

    # Collapse each gene's entry down to one annotation.
    for gene in records:
        if gene.type != "gene":
            continue

        gene_id = gene.get_id()
        members = coding_regions.get(gene_id)
        if members is None:
            summary = gene.three_prime()
            summary.attr = {"ID": gene_id}
        else:
            summary = annotation.Annotation(
                type="coding_region",
                seqid=members[0].seqid,
                strand=members[0].strand,
                start=min(member.start for member in members),
                end=max(member.end for member in members),
                attr={"ID": gene_id},
            )
        coding_regions[gene_id] = summary

    return coding_regions
def get_genes(items, extractions, log):
    """Extract, merge and flatten genes from linked GFF items.

    items       -- annotations; they are linked up in place.
    extractions -- iterable of 4-tuples
                   (gene_type, gene_biotype, transcript_type,
                   transcript_biotype), each passed to
                   extract_and_translate.
    log         -- object with a ``log(text)`` method; receives a tree dump
                   and a count per extraction.

    Returns the features produced by all_within() for every merged gene,
    sorted by (seqid, start).
    """
    annotation.link_up_annotations(items)

    log.log("Content of downloaded GFF file (type:biotype ... paths):\n")
    show_tree(items, log)
    log.log("\n")

    collected = []
    for spec in extractions:
        gene_type, gene_biotype, transcript_type, transcript_biotype = spec
        found = extract_and_translate(
            items, gene_type, gene_biotype,
            transcript_type, transcript_biotype)
        log.log("%d genes from %s/%s/%s/%s\n" % (
            len(found),
            gene_type, gene_biotype, transcript_type, transcript_biotype))
        collected.extend(found)

    log.log("\n")
    merged = merge(collected, log)

    features = [feature for gene in merged for feature in all_within(gene)]

    def position(feature):
        return (feature.seqid, feature.start)

    features.sort(key=position)
    return features
def get_genes(items, extractions, log):
    """Extract genes, apply dominate() then merge(), and flatten the result.

    items       -- annotations; they are linked up in place.
    extractions -- iterable of 4-tuples
                   (gene_type, gene_biotype, transcript_type,
                   transcript_biotype), each passed to
                   extract_and_translate.
    log         -- object with a ``log(text)`` method; receives a tree dump,
                   a count per extraction, and whatever dominate() and
                   merge() choose to report.

    Returns the features produced by all_within() for every surviving gene,
    sorted by (seqid, start).
    """
    annotation.link_up_annotations(items)

    log.log("Content of downloaded GFF file (type:biotype ... paths):\n")
    show_tree(items, log)
    log.log("\n")

    collected = []
    for spec in extractions:
        gene_type, gene_biotype, transcript_type, transcript_biotype = spec
        found = extract_and_translate(
            items, gene_type, gene_biotype,
            transcript_type, transcript_biotype)
        log.log("%d genes from %s/%s/%s/%s\n" % (
            len(found),
            gene_type, gene_biotype, transcript_type, transcript_biotype))
        collected.extend(found)

    # A blank log line separates the output of each processing stage.
    log.log("\n")
    survivors = dominate(collected, log)
    log.log("\n")
    survivors = merge(survivors, log)

    features = [
        feature for gene in survivors for feature in all_within(gene)
    ]

    def position(feature):
        return (feature.seqid, feature.start)

    features.sort(key=position)
    return features
def run(self):
    """Derive UTR annotations from ``self.features`` and write GFF3 files.

    Reads ``self.features`` twice: once for its leading ``#`` header lines
    (kept for the outputs, with ``##sequence-region`` lines also giving
    sequence lengths) and once for the annotations themselves.

    For each item (a gene when ``self.gene_level`` is set, otherwise a
    transcript) that has children of type ``self.what``, one spanning UTR
    annotation is built, together with an extended copy.  The extension is
    capped at 10000 bases, at the sequence boundary, and at the nearest
    same-strand well-supported exon found past the UTR's 3' end.

    ``self.support``, when not None, is the maximum accepted
    ``transcript_support_level``.

    Writes utr.gff, utr_extended.gff, utr_part.gff, transcript.gff,
    exon.gff and cds.gff into the workspace.
    """
    extension_limit = 10000

    workspace = self.get_workspace()

    header = ["##gff-version 3\n"]
    lengths = {}
    with io.open_possibly_compressed_file(self.features) as f:
        # Skip the first line.  next(f, None) replaces the Python-2-only
        # f.next() and does not raise StopIteration on an empty file.
        next(f, None)
        for line in f:
            if not line.startswith("#"):
                break
            if line.startswith("##gff-version"):
                continue
            header.append(line)
            parts = line.strip().split()
            if parts[0] == "##sequence-region":
                # Second field is the sequence name, fourth is stored as
                # its length.
                lengths[parts[1]] = int(parts[3])

    header = "".join(header)

    items = list(annotation.read_gff(self.features, "/"))
    annotation.link_up_annotations(items)
    for item in items:
        assert len(item.parents) < 2
        # Keep only the second colon-separated field of ID and Parent
        # (e.g. "gene:X" becomes "X").  Done after linking, so the links
        # were made on the original identifiers.
        if "ID" in item.attr:
            item.attr["ID"] = item.attr["ID"].split(":")[1]
        if "Parent" in item.attr:
            item.attr["Parent"] = item.attr["Parent"].split(":")[1]
        if item.parents:
            item.parent = item.parents[0]

    def well_supported(item):
        """True if item passes the transcript support level cut-off."""
        if self.support is None:
            return True
        level = item.attr.get("transcript_support_level", "NA").split()[0]
        if not level.isdigit():
            return False
        return int(level) <= self.support

    exons = [
        item for item in items
        if item.type == "exon" and well_supported(item.parent)
    ]
    exon_index = span_index.index_annotations(exons)

    utrs = []
    extended_utrs = []
    utr_parts = []
    exons_kept = []
    cds_kept = []
    transcripts_kept = []
    for item in items:
        this_exons = [
            item2 for item2 in item.children if item2.type == "exon"
        ]
        if this_exons and well_supported(item):
            transcripts_kept.append(item)
            exons_kept.extend(this_exons)
            cds_kept.extend(
                [item2 for item2 in item.children if item2.type == "CDS"])

        if self.gene_level:
            # Pool the UTR pieces of every well-supported child.
            utr_bits = [
                item3
                for item2 in item.children if well_supported(item2)
                for item3 in item2.children if item3.type == self.what
            ]
        else:
            if not well_supported(item):
                continue
            utr_bits = [
                item2 for item2 in item.children if item2.type == self.what
            ]

        if not utr_bits:
            continue

        # One annotation spanning all of the pieces.
        utr = utr_bits[0].copy()
        for item2 in utr_bits[1:]:
            utr = utr.span_with(item2)

        gene = item if self.gene_level else item.parent

        utr.attr = dict(
            ID=item.get_id(),
            Name=item.attr["Name"],
            gene_id=gene.get_id(),
            gene=gene.attr["Name"],
            description=gene.attr.get("description", ""),
            biotype=item.attr["biotype"],
        )

        # Never extend past either end of the sequence.
        max_extension = extension_limit
        if item.strand < 0:
            max_extension = min(max_extension, utr.start)
        else:
            max_extension = min(max_extension, lengths[utr.seqid] - utr.end)
        assert max_extension >= 0, utr

        # Stop at the nearest same-strand exon found within the window.
        # NOTE(review): relative_to() presumably yields strand-aware
        # offsets from the 3' end -- confirm against the annotation module.
        end = utr.three_prime()
        window = end.shifted(0, max_extension)
        for hit in exon_index.get(window, same_strand=True):
            rel = hit.relative_to(end).start
            if rel >= 0:
                max_extension = min(max_extension, rel)

        extended_utr = utr.shifted(0, max_extension)
        extended_utr.start = max(extended_utr.start, 0)
        utr.attr["max_extension"] = str(max_extension)

        utrs.append(utr)
        extended_utrs.append(extended_utr)

        for item2 in utr_bits:
            part = item2.copy()
            part.attr = dict(Parent=item.get_id())
            part.type = "part"
            utr_parts.append(part)

    write_gff3(workspace/"utr.gff", utrs, header)
    write_gff3(workspace/"utr_extended.gff", extended_utrs, header)
    write_gff3(workspace/"utr_part.gff", utr_parts, header)
    write_gff3(workspace/"transcript.gff", transcripts_kept, header)
    write_gff3(workspace/"exon.gff", exons_kept, header)
    write_gff3(workspace/"cds.gff", cds_kept, header)
def run(self):
    """Assign alignments to annotated features and pickle the tail data.

    Features whose type is listed in ``self.types`` are the primary
    annotations.  They and their descendants are searched for features
    whose type is listed in ``self.parts`` (defaulting to ``self.types``).
    Each part is extended past its 3' end by ``self.extension`` bases, or
    by its ``max_extension`` attribute if that is smaller.

    Every mapped, primary alignment in ``alignments_filtered_sorted.bam``
    is assigned to the overlapping same-strand part whose original 3' end
    is nearest the alignment's 3' end.  Its ``(tail_length,
    adaptor_bases)`` pair, read from the ``AN:i:`` and ``AD:i:`` tags, is
    appended to the ``hits`` list of that part's primary annotation.

    Writes ``(workspace.name, workspace.get_tags(), annotations)`` to
    ``self.prefix + '.pickle.gz'``.
    """
    assert self.extension is not None, '--extension must be specified'

    workspace = working_directory.Working(self.working_dir, must_exist=True)
    if self.annotations is None:
        reference = workspace.get_reference()
        annotations_filename = reference.annotations_filename()
    else:
        annotations_filename = self.annotations

    types = [item.lower() for item in self.types.split(',')]

    parts = self.parts or self.types
    parts = [item.lower() for item in parts.split(',')]

    all_annotations = list(annotation.read_annotations(annotations_filename))
    annotation.link_up_annotations(all_annotations)
    for item in all_annotations:
        item.primary = None

    annotations = [
        item for item in all_annotations if item.type.lower() in types
    ]

    # Walk down from each primary annotation, collecting part features.
    part_annotations = []
    seen = set()
    queue = [(item, item) for item in annotations]
    while queue:
        primary, item = queue.pop()
        if item.type.lower() in parts:
            assert item.primary is None, "Feature with multiple parents"
            item.primary = primary
            key = (id(primary), item.start, item.end, item.seqid, item.strand)
            # Ignore duplicate exons (many isoforms will have the same exons)
            if key not in seen:
                seen.add(key)
                part_annotations.append(item)
        queue.extend((primary, item2) for item2 in item.children)

    del seen
    del all_annotations

    self.log.log('%d annotations\n' % len(annotations))
    self.log.log('%d part annotations\n' % len(part_annotations))

    # Remember the original 3' end as tail_pos, then extend past it.
    for item in part_annotations:
        this_extension = self.extension
        if "max_extension" in item.attr:
            this_extension = min(
                this_extension, int(item.attr["max_extension"]))

        if item.strand >= 0:
            item.tail_pos = item.end
            item.end += this_extension
        else:
            item.tail_pos = item.start
            item.start -= this_extension

    for item in annotations:
        item.hits = []  # [ (tail_length, adaptor_bases) ]

    index = span_index.index_annotations(part_annotations)

    bam_path = workspace/'alignments_filtered_sorted.bam'
    for alignment in sam.Bam_reader(bam_path):
        if (alignment.is_unmapped or alignment.is_secondary
                or alignment.is_supplementary):
            continue

        start = alignment.reference_start
        end = alignment.reference_end
        strand = -1 if alignment.flag & sam.FLAG_REVERSE else 1
        fragment_feature = annotation.Annotation(
            seqid=alignment.reference_name,
            start=start,
            end=end,
            strand=strand,
        )

        if strand >= 0:
            tail_pos = end
        else:
            tail_pos = start

        tail_length = 0
        adaptor_bases = 0
        for item in alignment.extra:
            if item.startswith('AN:i:'):
                tail_length = int(item[5:])
            elif item.startswith('AD:i:'):
                adaptor_bases = int(item[5:])

        hits = index.get(fragment_feature, same_strand=True)
        if hits:
            # Nearest by tail_pos
            # failing that, by id to ensure a deterministic choice
            gene = min(
                hits,
                key=lambda gene: (abs(tail_pos - gene.tail_pos),
                                  gene.primary.get_id()))
            gene.primary.hits.append((tail_length, adaptor_bases))

    # Remove the cross-links before pickling.
    for item in annotations:
        del item.parents
        del item.children
        del item.primary

    f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
    try:
        pickle.dump(
            (workspace.name, workspace.get_tags(), annotations),
            f, pickle.HIGHEST_PROTOCOL)
    finally:
        # Close the writer even if pickling fails.
        f.close()
def run(self):
    """Relate each peak in ``self.child`` to genes from ``self.parent``.

    The query for a peak is its 3' end shifted by ``(-1, 0)``.  Same-strand
    hits are looked for in this order, stopping at the first that finds
    something: extended 3' UTRs ("3'UTR"), exons ("Exon"), the extended 3'
    ends of genes ("Downstrand"), genes ("Intron"), and finally genes on
    the opposite strand ("Antisense").  Opposite-strand gene hits are also
    always recorded in separate ``Antisense_*`` attributes.

    Writes ``<prefix>-parent.gff`` (the genes) and ``<prefix>-child.gff``
    (the annotated peaks).
    """
    items = list(annotation.read_annotations(self.parent))
    annotation.link_up_annotations(items)
    for item in items:
        assert len(item.parents) <= 1

    genes = [item for item in items if item.type == "gene"]
    downstrand_genes = [
        _extend(_three_prime(gene), self.extension) for gene in genes
    ]
    exons = [item for item in items if item.type == "exon"]
    utrs = [
        _extend(item, self.extension)
        for item in items if item.type == "three_prime_utr"
    ]

    gene_index = span_index.index_annotations(genes)
    downstrand_gene_index = span_index.index_annotations(downstrand_genes)
    exon_index = span_index.index_annotations(exons)
    utr_index = span_index.index_annotations(utrs)

    def grandparents(found):
        # Two levels up from each hit (e.g. exon -> transcript -> gene).
        return [item.parents[0].parents[0] for item in found]

    def ids_of(found):
        return join_descriptions([item.get_id() for item in found], ",")

    def describe(key, found):
        return join_descriptions([item.attr.get(key, "") for item in found])

    peaks = list(annotation.read_annotations(self.child))

    for peak in peaks:
        # Query is final base in genome before poly(A) starts
        query = peak.three_prime().shifted(-1, 0)

        hit_to = "3'UTR"
        hits = grandparents(utr_index.get(query, True))

        if not hits:
            hit_to = "Exon"
            hits = grandparents(exon_index.get(query, True))

        # For non-coding RNAs, which don't have a 3' UTR
        if not hits:
            hit_to = "Downstrand"
            hits = downstrand_gene_index.get(query, True)

        if not hits:
            hit_to = "Intron"
            hits = gene_index.get(query, True)

        antisense_hits = gene_index.get(query.reversed(), True)
        if not hits:
            hit_to = "Antisense"
            hits = antisense_hits

        if hits:
            peak.attr["Parent"] = ids_of(hits)
            peak.attr["Relation"] = hit_to
            peak.attr["Name"] = describe("Name", hits)
            peak.attr["Product"] = hit_to + " " + describe("Product", hits)
            peak.attr["Biotype"] = describe("Biotype", hits)

        if antisense_hits:
            peak.attr["Antisense_parent"] = ids_of(antisense_hits)
            peak.attr["Antisense_name"] = describe("Name", antisense_hits)
            peak.attr["Antisense_product"] = (
                "Antisense " + describe("Product", antisense_hits))
            peak.attr["Antisense_biotype"] = describe(
                "Biotype", antisense_hits)

    # NOTE(review): the original author tagged this write with "Hmm".
    annotation.write_gff3(self.prefix+"-parent.gff", genes)
    annotation.write_gff3(self.prefix+"-child.gff", peaks)
def run(self):
    """Build the reference directory and add 3' UTR annotations to it.

    Runs nesoni.Make_reference, then reads ``reference.gff`` back and, for
    each gene:

    * sets the gene's span to the union of its mRNAs,
    * stores a ``max_extension`` attribute on the gene, its mRNAs and their
      exons (every exon except the last listed gets "0"),
    * builds a ``three_prime_utr`` annotation per mRNA that has both exons
      and CDS, and one per gene.

    ``reference.gff`` is rewritten with the per-mRNA UTRs appended, and the
    per-gene UTRs go to ``utr.gff``.  The ``tail_tools_reference_version``
    parameter is removed at the start and set again only on completion.
    """
    work = self.get_workspace()
    work.update_param(remove=['tail_tools_reference_version'])

    index_option = 'ifavailable' if self.index else False
    nesoni.Make_reference(
        self.output_dir,
        filenames=self.filenames,
        snpeff=False,
        cs=index_option,
        ls=False,
        bowtie=index_option,
    ).run()

    annotations = list(annotation.read_annotations(work/'reference.gff'))
    annotation.link_up_annotations(annotations)

    exon_index = span_index.index_annotations(
        [item for item in annotations if item.type == "exon"])
    mrna_end_index = span_index.index_annotations(
        [item.three_prime() for item in annotations if item.type == "mRNA"])

    def extension_of(feature):
        return _max_extension(feature, exon_index, mrna_end_index)

    def make_utr(owner, start, end):
        # The UTR inherits the owner's attributes, is parented on it, and
        # gets an ID derived from the owner's ID.
        attr = owner.attr.copy()
        attr['Parent'] = attr['ID']
        attr['ID'] = attr['ID']+'-3UTR'
        attr['color'] = '#008888'
        return annotation.Annotation(
            source='tt',
            type='three_prime_utr',
            seqid=owner.seqid,
            strand=owner.strand,
            start=start,
            end=end,
            attr=attr,
        )

    mrna_utrs = []
    gene_utrs = []

    for gene in annotations:
        if gene.type != 'gene':
            continue

        mrnas = [child for child in gene.children if child.type == 'mRNA']
        assert mrnas, "Gene without any mRNAs: "+gene.get_id()

        gene.attr['color'] = '#880088'
        gene.start = min(mrna.start for mrna in mrnas)
        gene.end = max(mrna.end for mrna in mrnas)
        gene.attr["max_extension"] = str(extension_of(gene))

        gene_utr_5primes = []

        for mrna in mrnas:
            assert mrna.strand == gene.strand, mrna
            assert mrna.seqid == gene.seqid, mrna

            mrna.attr["max_extension"] = str(extension_of(mrna))

            cdss = [child for child in mrna.children if child.type == 'CDS']
            exons = [child for child in mrna.children if child.type == 'exon']
            if not exons:
                continue

            #link up annotations sorts children, so final is really final
            for exon in exons[:-1]:
                exon.attr["max_extension"] = "0"
            exons[-1].attr["max_extension"] = mrna.attr["max_extension"]

            if not cdss:
                continue

            # Candidate UTR 5' positions: one for each exon that reaches
            # the 3' end of the coding sequence or lies beyond it.
            if gene.strand >= 0:
                cds_3prime = max(cds.end for cds in cdss)
                mrna_utr_5primes = [
                    max(exon.start, cds_3prime)
                    for exon in exons if exon.end >= cds_3prime
                ]
            else:
                cds_3prime = min(cds.start for cds in cdss)
                mrna_utr_5primes = [
                    min(exon.end, cds_3prime)
                    for exon in exons if exon.start <= cds_3prime
                ]

            # The UTR is always at least one base long.
            if mrna.strand >= 0:
                if mrna_utr_5primes:
                    utr_start = min(mrna_utr_5primes)
                else:
                    utr_start = mrna.end
                utr_end = max(utr_start+1, mrna.end)
                gene_utr_5primes.append(utr_start)
            else:
                if mrna_utr_5primes:
                    utr_end = max(mrna_utr_5primes)
                else:
                    utr_end = mrna.start
                utr_start = min(mrna.start, utr_end-1)
                gene_utr_5primes.append(utr_end)

            utr = make_utr(mrna, utr_start, utr_end)
            max_ext = extension_of(utr)
            utr.attr["max_extension"] = str(max_ext)
            #Only include if there is an annotated 3' UTR or end is not in the middle of some other isoform's exon
            if utr_end-utr_start+max_ext > 1:
                mrna_utrs.append(utr)

        # Gene-level UTR, taken from the per-mRNA 5' positions.
        if gene.strand >= 0:
            if gene_utr_5primes:
                utr_start = max(gene_utr_5primes)
            else:
                utr_start = gene.end
            utr_end = max(utr_start+1, gene.end)
        else:
            if gene_utr_5primes:
                utr_end = min(gene_utr_5primes)
            else:
                utr_end = gene.start
            utr_start = min(gene.start, utr_end-1)

        utr = make_utr(gene, utr_start, utr_end)
        utr.attr["max_extension"] = str(extension_of(utr))
        gene_utrs.append(utr)

    annotation.write_gff3(work/'reference.gff', annotations + mrna_utrs)
    annotation.write_gff3(work/'utr.gff', gene_utrs)

    work.update_param(tail_tools_reference_version=work.VERSION)
def run(self):
    """Build the reference directory and write one 3' UTR per gene.

    Runs nesoni.Make_reference, reads ``reference.gff`` back, and writes a
    ``three_prime_utr`` annotation for every gene to ``utr.gff``.  The UTR
    start is taken from the exons that extend past the 3' end of the CDS of
    the gene's mRNAs.  With no such exon the UTR is a single base at the
    gene's 3' end.

    The ``tail_tools_reference_version`` parameter is removed at the start
    and set again only on completion.
    """
    work = self.get_workspace()
    work.update_param(remove=['tail_tools_reference_version'])

    index_option = 'ifavailable' if self.index else False
    nesoni.Make_reference(
        self.output_dir,
        filenames=self.filenames,
        snpeff=False,
        cs=index_option,
        ls=False,
        bowtie=index_option,
    ).run()

    annotations = list(annotation.read_annotations(work/'reference.gff'))
    annotation.link_up_annotations(annotations)

    with open(work/'utr.gff', 'wb') as f:
        annotation.write_gff3_header(f)

        for gene in annotations:
            if gene.type != 'gene':
                continue

            forward = gene.strand >= 0

            # Candidate UTR 5' positions pooled over all mRNAs of the gene.
            utr_5primes = []
            for mrna in gene.children:
                if mrna.type != 'mRNA':
                    continue

                cdss = [kid for kid in mrna.children if kid.type == 'CDS']
                exons = [kid for kid in mrna.children if kid.type == 'exon']
                if not cdss or not exons:
                    continue

                if forward:
                    cds_3prime = max(cds.end for cds in cdss)
                    utr_5primes.extend(
                        max(exon.start, cds_3prime)
                        for exon in exons if exon.end > cds_3prime)
                else:
                    cds_3prime = min(cds.start for cds in cdss)
                    utr_5primes.extend(
                        min(exon.end, cds_3prime)
                        for exon in exons if exon.start < cds_3prime)

            # The UTR is always at least one base long.
            if forward:
                utr_start = max(utr_5primes) if utr_5primes else gene.end
                utr_end = max(utr_start+1, gene.end)
            else:
                utr_end = min(utr_5primes) if utr_5primes else gene.start
                utr_start = min(gene.start, utr_end-1)

            attr = gene.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID']+'-3UTR'
            utr = annotation.Annotation(
                source='tt',
                type='three_prime_utr',
                seqid=gene.seqid,
                strand=gene.strand,
                start=utr_start,
                end=utr_end,
                attr=attr,
            )
            # Python 2 print-to-file statement, kept as is.
            print >> f, utr.as_gff()

    work.update_param(tail_tools_reference_version=work.VERSION)
def run(self):
    """Derive UTR annotations from ``self.features`` and write GFF3 files.

    Reads ``self.features`` twice: once for its leading ``#`` header lines
    (kept for the outputs, with ``##sequence-region`` lines also giving
    sequence lengths) and once for the annotations themselves.

    For each item (a gene when ``self.gene_level`` is set, otherwise a
    transcript) that has children of type ``self.what``, one spanning UTR
    annotation is built, together with an extended copy.  The extension is
    capped at 10000 bases, at the sequence boundary, and at the nearest
    same-strand well-supported exon found past the UTR's 3' end.

    ``self.support``, when not None, is the maximum accepted
    ``transcript_support_level``.

    Writes utr.gff, utr_extended.gff, utr_part.gff, transcript.gff,
    exon.gff and cds.gff into the workspace.
    """
    extension_limit = 10000

    workspace = self.get_workspace()

    header = ["##gff-version 3\n"]
    lengths = {}
    with io.open_possibly_compressed_file(self.features) as f:
        # Skip the first line.  next(f, None) replaces the Python-2-only
        # f.next() and does not raise StopIteration on an empty file.
        next(f, None)
        for line in f:
            if not line.startswith("#"):
                break
            if line.startswith("##gff-version"):
                continue
            header.append(line)
            parts = line.strip().split()
            if parts[0] == "##sequence-region":
                # Second field is the sequence name, fourth is stored as
                # its length.
                lengths[parts[1]] = int(parts[3])

    header = "".join(header)

    items = list(annotation.read_gff(self.features, "/"))
    annotation.link_up_annotations(items)
    for item in items:
        assert len(item.parents) < 2
        # Keep only the second colon-separated field of ID and Parent
        # (e.g. "gene:X" becomes "X").  Done after linking, so the links
        # were made on the original identifiers.
        if "ID" in item.attr:
            item.attr["ID"] = item.attr["ID"].split(":")[1]
        if "Parent" in item.attr:
            item.attr["Parent"] = item.attr["Parent"].split(":")[1]
        if item.parents:
            item.parent = item.parents[0]

    def well_supported(item):
        """True if item passes the transcript support level cut-off."""
        if self.support is None:
            return True
        level = item.attr.get("transcript_support_level", "NA").split()[0]
        if not level.isdigit():
            return False
        return int(level) <= self.support

    exons = [
        item for item in items
        if item.type == "exon" and well_supported(item.parent)
    ]
    exon_index = span_index.index_annotations(exons)

    utrs = []
    extended_utrs = []
    utr_parts = []
    exons_kept = []
    cds_kept = []
    transcripts_kept = []
    for item in items:
        this_exons = [
            item2 for item2 in item.children if item2.type == "exon"
        ]
        if this_exons and well_supported(item):
            transcripts_kept.append(item)
            exons_kept.extend(this_exons)
            cds_kept.extend(
                [item2 for item2 in item.children if item2.type == "CDS"])

        if self.gene_level:
            # Pool the UTR pieces of every well-supported child.
            utr_bits = [
                item3
                for item2 in item.children if well_supported(item2)
                for item3 in item2.children if item3.type == self.what
            ]
        else:
            if not well_supported(item):
                continue
            utr_bits = [
                item2 for item2 in item.children if item2.type == self.what
            ]

        if not utr_bits:
            continue

        # One annotation spanning all of the pieces.
        utr = utr_bits[0].copy()
        for item2 in utr_bits[1:]:
            utr = utr.span_with(item2)

        gene = item if self.gene_level else item.parent

        utr.attr = dict(
            ID=item.get_id(),
            Name=item.attr["Name"],
            gene_id=gene.get_id(),
            gene=gene.attr["Name"],
            description=gene.attr.get("description", ""),
            biotype=item.attr["biotype"],
        )

        # Never extend past either end of the sequence.
        max_extension = extension_limit
        if item.strand < 0:
            max_extension = min(max_extension, utr.start)
        else:
            max_extension = min(max_extension, lengths[utr.seqid] - utr.end)
        assert max_extension >= 0, utr

        # Stop at the nearest same-strand exon found within the window.
        # NOTE(review): relative_to() presumably yields strand-aware
        # offsets from the 3' end -- confirm against the annotation module.
        end = utr.three_prime()
        window = end.shifted(0, max_extension)
        for hit in exon_index.get(window, same_strand=True):
            rel = hit.relative_to(end).start
            if rel >= 0:
                max_extension = min(max_extension, rel)

        extended_utr = utr.shifted(0, max_extension)
        extended_utr.start = max(extended_utr.start, 0)
        utr.attr["max_extension"] = str(max_extension)

        utrs.append(utr)
        extended_utrs.append(extended_utr)

        for item2 in utr_bits:
            part = item2.copy()
            part.attr = dict(Parent=item.get_id())
            part.type = "part"
            utr_parts.append(part)

    write_gff3(workspace / "utr.gff", utrs, header)
    write_gff3(workspace / "utr_extended.gff", extended_utrs, header)
    write_gff3(workspace / "utr_part.gff", utr_parts, header)
    write_gff3(workspace / "transcript.gff", transcripts_kept, header)
    write_gff3(workspace / "exon.gff", exons_kept, header)
    write_gff3(workspace / "cds.gff", cds_kept, header)
def run(self):
    """Create the reference and annotate genes and mRNAs with 3' UTRs.

    After nesoni.Make_reference has run, ``reference.gff`` is read back.
    Each gene is resized to cover its mRNAs and coloured.  Genes, mRNAs
    and exons receive a ``max_extension`` attribute, with every exon of an
    mRNA except the last listed getting "0".  A ``three_prime_utr``
    annotation is made for each mRNA with both exons and CDS, and for each
    gene.

    Output: ``reference.gff`` is rewritten with the per-mRNA UTRs added,
    and ``utr.gff`` receives the per-gene UTRs.  The
    ``tail_tools_reference_version`` parameter is cleared first and
    restored last.
    """
    work = self.get_workspace()
    work.update_param(remove=['tail_tools_reference_version'])

    index_mode = 'ifavailable' if self.index else False
    maker = nesoni.Make_reference(
        self.output_dir,
        filenames=self.filenames,
        snpeff=False,
        cs=index_mode,
        ls=False,
        bowtie=index_mode,
    )
    maker.run()

    annotations = list(annotation.read_annotations(work / 'reference.gff'))
    annotation.link_up_annotations(annotations)

    all_exons = [feat for feat in annotations if feat.type == "exon"]
    exon_index = span_index.index_annotations(all_exons)

    mrna_ends = [
        feat.three_prime() for feat in annotations if feat.type == "mRNA"
    ]
    mrna_end_index = span_index.index_annotations(mrna_ends)

    utr_colour = '#008888'
    mrna_utrs = []
    gene_utrs = []

    for gene in annotations:
        if gene.type != 'gene':
            continue

        mrnas = [kid for kid in gene.children if kid.type == 'mRNA']
        assert mrnas, "Gene without any mRNAs: " + gene.get_id()

        gene.attr['color'] = '#880088'
        gene.start = min(kid.start for kid in mrnas)
        gene.end = max(kid.end for kid in mrnas)
        gene.attr["max_extension"] = str(
            _max_extension(gene, exon_index, mrna_end_index))

        gene_utr_5primes = []

        for mrna in mrnas:
            assert mrna.strand == gene.strand, mrna
            assert mrna.seqid == gene.seqid, mrna

            mrna.attr["max_extension"] = str(
                _max_extension(mrna, exon_index, mrna_end_index))

            cdss = [kid for kid in mrna.children if kid.type == 'CDS']
            exons = [kid for kid in mrna.children if kid.type == 'exon']
            if not exons:
                continue

            #link up annotations sorts children, so final is really final
            last_exon = exons[-1]
            for exon in exons[:-1]:
                exon.attr["max_extension"] = "0"
            last_exon.attr["max_extension"] = mrna.attr["max_extension"]

            if not cdss:
                continue

            # Each exon that reaches the 3' end of the CDS or lies beyond
            # it gives one candidate 5' position for the UTR.
            mrna_utr_5primes = []
            if gene.strand >= 0:
                cds_3prime = max(cds.end for cds in cdss)
                for exon in exons:
                    if exon.end >= cds_3prime:
                        mrna_utr_5primes.append(
                            max(exon.start, cds_3prime))
            else:
                cds_3prime = min(cds.start for cds in cdss)
                for exon in exons:
                    if exon.start <= cds_3prime:
                        mrna_utr_5primes.append(
                            min(exon.end, cds_3prime))

            # The UTR is always at least one base long.
            if mrna.strand >= 0:
                utr_start = (min(mrna_utr_5primes)
                             if mrna_utr_5primes else mrna.end)
                utr_end = max(utr_start + 1, mrna.end)
                gene_utr_5primes.append(utr_start)
            else:
                utr_end = (max(mrna_utr_5primes)
                           if mrna_utr_5primes else mrna.start)
                utr_start = min(mrna.start, utr_end - 1)
                gene_utr_5primes.append(utr_end)

            mrna_attr = mrna.attr.copy()
            mrna_attr['Parent'] = mrna_attr['ID']
            mrna_attr['ID'] = mrna_attr['ID'] + '-3UTR'
            mrna_attr['color'] = utr_colour
            mrna_utr = annotation.Annotation(
                source='tt',
                type='three_prime_utr',
                seqid=mrna.seqid,
                strand=mrna.strand,
                start=utr_start,
                end=utr_end,
                attr=mrna_attr,
            )
            max_ext = _max_extension(mrna_utr, exon_index, mrna_end_index)
            mrna_utr.attr["max_extension"] = str(max_ext)
            #Only include if there is an annotated 3' UTR or end is not in the middle of some other isoform's exon
            if utr_end - utr_start + max_ext > 1:
                mrna_utrs.append(mrna_utr)

        # Gene-level UTR, taken from the per-mRNA 5' positions.
        if gene.strand >= 0:
            utr_start = (max(gene_utr_5primes)
                         if gene_utr_5primes else gene.end)
            utr_end = max(utr_start + 1, gene.end)
        else:
            utr_end = (min(gene_utr_5primes)
                       if gene_utr_5primes else gene.start)
            utr_start = min(gene.start, utr_end - 1)

        gene_attr = gene.attr.copy()
        gene_attr['Parent'] = gene_attr['ID']
        gene_attr['ID'] = gene_attr['ID'] + '-3UTR'
        gene_attr['color'] = utr_colour
        gene_utr = annotation.Annotation(
            source='tt',
            type='three_prime_utr',
            seqid=gene.seqid,
            strand=gene.strand,
            start=utr_start,
            end=utr_end,
            attr=gene_attr,
        )
        gene_utr.attr["max_extension"] = str(
            _max_extension(gene_utr, exon_index, mrna_end_index))
        gene_utrs.append(gene_utr)

    annotation.write_gff3(work / 'reference.gff', annotations + mrna_utrs)
    annotation.write_gff3(work / 'utr.gff', gene_utrs)

    work.update_param(tail_tools_reference_version=work.VERSION)
def run(self):
    """Relate peaks with a sufficient mean tail length to genes.

    Peaks are read from ``self.child``; any whose ``mean_tail`` attribute
    (default "0.0") is below ``self.min_tail`` is dropped.  The query for a
    remaining peak is its 3' end shifted by ``(-1, 0)``.  Same-strand hits
    are looked for in this order, stopping at the first that finds
    something: extended 3' UTRs ("3'UTR"), exons ("Exon"), the extended 3'
    ends of genes ("Downstrand"), genes ("Intron"), and finally genes on
    the opposite strand ("Antisense").  Opposite-strand gene hits are also
    always recorded in separate ``Antisense_*`` attributes.

    Writes ``<prefix>-parent.gff`` (the genes) and ``<prefix>-child.gff``
    (the annotated peaks).
    """
    items = list(annotation.read_annotations(self.parent))
    annotation.link_up_annotations(items)
    for item in items:
        assert len(item.parents) <= 1

    genes = [item for item in items if item.type == "gene"]
    downstrand_genes = [
        _extend(_three_prime(gene), self.extension) for gene in genes
    ]
    exons = [item for item in items if item.type == "exon"]
    utrs = [
        _extend(item, self.extension)
        for item in items if item.type == "three_prime_utr"
    ]

    gene_index = span_index.index_annotations(genes)
    downstrand_gene_index = span_index.index_annotations(downstrand_genes)
    exon_index = span_index.index_annotations(exons)
    utr_index = span_index.index_annotations(utrs)

    def grandparents(found):
        # Two levels up from each hit (e.g. exon -> transcript -> gene).
        return [item.parents[0].parents[0] for item in found]

    def ids_of(found):
        return join_descriptions([item.get_id() for item in found], ",")

    def describe(key, found):
        return join_descriptions([item.attr.get(key, "") for item in found])

    # Written as "not (x < min)" so the comparison matches the original
    # for every value, including NaN.
    peaks = [
        peak for peak in annotation.read_annotations(self.child)
        if not (float(peak.attr.get("mean_tail", "0.0")) < self.min_tail)
    ]

    for peak in peaks:
        # Query is final base in genome before poly(A) starts
        query = peak.three_prime().shifted(-1, 0)

        hit_to = "3'UTR"
        hits = grandparents(utr_index.get(query, True))

        if not hits:
            hit_to = "Exon"
            hits = grandparents(exon_index.get(query, True))

        # For non-coding RNAs, which don't have a 3' UTR
        if not hits:
            hit_to = "Downstrand"
            hits = downstrand_gene_index.get(query, True)

        if not hits:
            hit_to = "Intron"
            hits = gene_index.get(query, True)

        antisense_hits = gene_index.get(query.reversed(), True)
        if not hits:
            hit_to = "Antisense"
            hits = antisense_hits

        if hits:
            peak.attr["Parent"] = ids_of(hits)
            peak.attr["Relation"] = hit_to
            peak.attr["Name"] = describe("Name", hits)
            peak.attr["Product"] = hit_to + " " + describe("Product", hits)
            peak.attr["Biotype"] = describe("Biotype", hits)

        if antisense_hits:
            peak.attr["Antisense_parent"] = ids_of(antisense_hits)
            peak.attr["Antisense_name"] = describe("Name", antisense_hits)
            peak.attr["Antisense_product"] = (
                "Antisense " + describe("Product", antisense_hits))
            peak.attr["Antisense_biotype"] = describe(
                "Biotype", antisense_hits)

    # NOTE(review): the original author tagged this write with "Hmm".
    annotation.write_gff3(self.prefix+"-parent.gff", genes)
    annotation.write_gff3(self.prefix+"-child.gff", peaks)
def run(self):
    """Assign alignments to annotated features and pickle the tail data.

    Features whose type is listed in ``self.types`` are the primary
    annotations.  They and their descendants are searched for features
    whose type is listed in ``self.parts`` (defaulting to ``self.types``).
    Each part is extended past its 3' end by ``self.extension`` bases, or
    by its ``max_extension`` attribute if that is smaller.

    Every mapped, primary alignment in ``alignments_filtered_sorted.bam``
    is assigned to the overlapping same-strand part whose original 3' end
    is nearest the alignment's 3' end.  Its ``(tail_length,
    adaptor_bases)`` pair, read from the ``AN:i:`` and ``AD:i:`` tags, is
    appended to the ``hits`` list of that part's primary annotation.

    Writes ``(workspace.name, workspace.get_tags(), annotations)`` to
    ``self.prefix + '.pickle.gz'``.
    """
    assert self.extension is not None, '--extension must be specified'

    workspace = working_directory.Working(self.working_dir, must_exist=True)
    if self.annotations is None:
        reference = workspace.get_reference()
        annotations_filename = reference.annotations_filename()
    else:
        annotations_filename = self.annotations

    types = [item.lower() for item in self.types.split(',')]

    parts = self.parts or self.types
    parts = [item.lower() for item in parts.split(',')]

    all_annotations = list(
        annotation.read_annotations(annotations_filename))
    annotation.link_up_annotations(all_annotations)
    for item in all_annotations:
        item.primary = None

    annotations = [
        item for item in all_annotations if item.type.lower() in types
    ]

    # Walk down from each primary annotation, collecting part features.
    part_annotations = []
    seen = set()
    queue = [(item, item) for item in annotations]
    while queue:
        primary, item = queue.pop()
        if item.type.lower() in parts:
            assert item.primary is None, "Feature with multiple parents"
            item.primary = primary
            key = (id(primary), item.start, item.end, item.seqid,
                   item.strand)
            # Ignore duplicate exons (many isoforms will have the same exons)
            if key not in seen:
                seen.add(key)
                part_annotations.append(item)
        queue.extend((primary, item2) for item2 in item.children)

    del seen
    del all_annotations

    self.log.log('%d annotations\n' % len(annotations))
    self.log.log('%d part annotations\n' % len(part_annotations))

    # Remember the original 3' end as tail_pos, then extend past it.
    for item in part_annotations:
        this_extension = self.extension
        if "max_extension" in item.attr:
            this_extension = min(this_extension,
                                 int(item.attr["max_extension"]))

        if item.strand >= 0:
            item.tail_pos = item.end
            item.end += this_extension
        else:
            item.tail_pos = item.start
            item.start -= this_extension

    for item in annotations:
        item.hits = []  # [ (tail_length, adaptor_bases) ]

    index = span_index.index_annotations(part_annotations)

    bam_path = workspace / 'alignments_filtered_sorted.bam'
    for alignment in sam.Bam_reader(bam_path):
        if (alignment.is_unmapped or alignment.is_secondary
                or alignment.is_supplementary):
            continue

        start = alignment.reference_start
        end = alignment.reference_end
        strand = -1 if alignment.flag & sam.FLAG_REVERSE else 1
        fragment_feature = annotation.Annotation(
            seqid=alignment.reference_name,
            start=start,
            end=end,
            strand=strand,
        )

        if strand >= 0:
            tail_pos = end
        else:
            tail_pos = start

        tail_length = 0
        adaptor_bases = 0
        for item in alignment.extra:
            if item.startswith('AN:i:'):
                tail_length = int(item[5:])
            elif item.startswith('AD:i:'):
                adaptor_bases = int(item[5:])

        hits = index.get(fragment_feature, same_strand=True)
        if hits:
            # Nearest by tail_pos
            # failing that, by id to ensure a deterministic choice
            gene = min(
                hits,
                key=lambda gene: (abs(tail_pos - gene.tail_pos),
                                  gene.primary.get_id()))
            gene.primary.hits.append((tail_length, adaptor_bases))

    # Remove the cross-links before pickling.
    for item in annotations:
        del item.parents
        del item.children
        del item.primary

    f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
    try:
        pickle.dump(
            (workspace.name, workspace.get_tags(), annotations),
            f, pickle.HIGHEST_PROTOCOL)
    finally:
        # Close the writer even if pickling fails.
        f.close()