Ejemplo n.º 1
0
    def coding_regions(self):
        coding_regions = {}
        annotations = list(
            annotation.read_annotations(join(self.dirname, 'reference.gff')))
        annotation.link_up_annotations(annotations)
        for item in annotations:
            if item.type == "CDS":
                [mrna] = item.parents
                [gene] = mrna.parents
                name = gene.get_id()
                if name not in coding_regions:
                    coding_regions[name] = []
                coding_regions[name].append(item)

        for gene in annotations:
            if gene.type != "gene": continue
            name = gene.get_id()
            if name not in coding_regions:
                coding_region = gene.three_prime()
                coding_region.attr = {"ID": name}
            else:
                items = coding_regions[name]
                coding_region = annotation.Annotation(
                    type="coding_region",
                    seqid=items[0].seqid,
                    strand=items[0].strand,
                    start=min(item2.start for item2 in items),
                    end=max(item2.end for item2 in items),
                    attr={"ID": name},
                )
            coding_regions[name] = coding_region

        return coding_regions
 def coding_regions(self):
     coding_regions = { }
     annotations = list(annotation.read_annotations(join(self.dirname,'reference.gff')))
     annotation.link_up_annotations(annotations)
     for item in annotations:
         if item.type == "CDS":
             [ mrna ] = item.parents
             [ gene ] = mrna.parents
             name = gene.get_id()
             if name not in coding_regions:
                 coding_regions[name] = [ ]
             coding_regions[name].append(item)
     
     for gene in annotations:
         if gene.type != "gene": continue
         name = gene.get_id()
         if name not in coding_regions:
             coding_region = gene.three_prime()
             coding_region.attr = { "ID" : name }
         else:
             items = coding_regions[name]
             coding_region = annotation.Annotation(
                 type = "coding_region",
                 seqid = items[0].seqid,
                 strand= items[0].strand,
                 start = min(item2.start for item2 in items),
                 end = max(item2.end for item2 in items),
                 attr = { "ID" : name },
                 )
         coding_regions[name] = coding_region
     
     return coding_regions
Ejemplo n.º 3
0
def get_genes(items, extractions, log):
    annotation.link_up_annotations(items)

    log.log("Content of downloaded GFF file (type:biotype ... paths):\n")
    show_tree(items, log)
    log.log("\n")        
    
    genes = list()
    for gene_type,gene_biotype,transcript_type,transcript_biotype in extractions:
        this_genes = extract_and_translate(items, gene_type, gene_biotype, transcript_type, transcript_biotype)
        log.log("%d genes from %s/%s/%s/%s\n" % (len(this_genes), gene_type,gene_biotype,transcript_type,transcript_biotype))
        genes.extend(this_genes)
    
    log.log("\n")
    genes = merge(genes,log)
    features = [ ]
    for gene in genes:
        features.extend(all_within(gene))
    features.sort(key=lambda i: (i.seqid,i.start))
    return features
Ejemplo n.º 4
0
def get_genes(items, extractions, log):
    annotation.link_up_annotations(items)

    log.log("Content of downloaded GFF file (type:biotype ... paths):\n")
    show_tree(items, log)
    log.log("\n")        
    
    genes = list()
    for gene_type,gene_biotype,transcript_type,transcript_biotype in extractions:
        this_genes = extract_and_translate(items, gene_type, gene_biotype, transcript_type, transcript_biotype)
        log.log("%d genes from %s/%s/%s/%s\n" % (len(this_genes), gene_type,gene_biotype,transcript_type,transcript_biotype))
        genes.extend(this_genes)
    
    log.log("\n")
    genes = dominate(genes,log)
    log.log("\n")
    genes = merge(genes,log)
    features = [ ]
    for gene in genes:
        features.extend(all_within(gene))
    features.sort(key=lambda i: (i.seqid,i.start))
    return features
    def run(self):
        workspace = self.get_workspace()
                
        header = [ "##gff-version 3\n" ]
        lengths = { }
        with io.open_possibly_compressed_file(self.features) as f:
            f.next()
            for line in f:
                if not line.startswith("#"): break
                if line.startswith("##gff-version"): continue
                header.append(line)
                parts = line.strip().split()
                if parts[0] == "##sequence-region":
                    lengths[parts[1]] = int(parts[3])
                    
        header = "".join(header)
                
        items = list(annotation.read_gff(self.features, "/"))
        annotation.link_up_annotations(items)
        for item in items:
            assert len(item.parents) < 2
            if "ID" in item.attr:
                item.attr["ID"] = item.attr["ID"].split(":")[1]
            if "Parent" in item.attr:
                item.attr["Parent"] = item.attr["Parent"].split(":")[1]
            if item.parents:
                item.parent = item.parents[0]
            
        
        def well_supported(item):
            if self.support is None: return True
            level = item.attr.get("transcript_support_level","NA").split()[0]
            if not level.isdigit(): return False
            return int(level) <= self.support
        
        exons = [ item for item in items if item.type == "exon" and well_supported(item.parent) ]
        exon_index = span_index.index_annotations(exons)
        
        utrs = [ ]
        extended_utrs = [ ]
        utr_parts = [ ]
        exons_kept = [ ]
        cds_kept = [ ]
        transcripts_kept = [ ]
        for item in items:
            this_exons = [ item2 for item2 in item.children if item2.type == "exon" ]
            if this_exons and well_supported(item):    
                 transcripts_kept.append(item)
                 exons_kept.extend(this_exons)
                 cds_kept.extend([ item2 for item2 in item.children if item2.type == "CDS" ])
        
            if self.gene_level:
                utr_bits = [ item3 for item2 in item.children  if well_supported(item2)
                                   for item3 in item2.children if item3.type == self.what ] 
            else:
                if not well_supported(item): continue
                utr_bits = [ item2 for item2 in item.children if item2.type == self.what ] 
            
            if not utr_bits:
                continue
            
            utr = utr_bits[0].copy()
            for item2 in utr_bits[1:]:
                utr = utr.span_with(item2)
            
            gene = item if self.gene_level else item.parent
            
            utr.attr = dict(
                ID=item.get_id(),
                Name=item.attr["Name"],
                gene_id=gene.get_id(),
                gene=gene.attr["Name"],
                description=gene.attr.get("description",""),
                biotype=item.attr["biotype"]
                )
        
            max_extension = 10000
            if item.strand < 0:
                max_extension = min(max_extension, utr.start)
            else:
                max_extension = min(max_extension, lengths[utr.seqid] - utr.end)
            assert max_extension >= 0, utr
            
            end = utr.three_prime()
            for hit in exon_index.get(end.shifted(0,max_extension), same_strand=True):
                #if hit.parent.get_id() == item.get_id():
                #    continue
                rel = hit.relative_to(end).start
                if rel >= 0:
                    max_extension = min(max_extension, rel)
        
            extended_utr = utr.shifted(0,max_extension)
            extended_utr.start = max(extended_utr.start, 0)
            utr.attr["max_extension"] = str(max_extension)
        
            utrs.append(utr)
            extended_utrs.append(extended_utr)
            
            for item2 in utr_bits:
                part = item2.copy()
                part.attr = dict(Parent=item.get_id())
                part.type = "part"
                utr_parts.append(part)
                            
        write_gff3(workspace/"utr.gff",utrs,header)
        write_gff3(workspace/"utr_extended.gff",extended_utrs,header)
        write_gff3(workspace/"utr_part.gff",utr_parts,header)
        write_gff3(workspace/"transcript.gff",transcripts_kept,header)
        write_gff3(workspace/"exon.gff",exons_kept,header)
        write_gff3(workspace/"cds.gff",cds_kept,header)

     




        
     def run(self):
         assert self.extension is not None, '--extension must be specified'
     
         #workspace = self.get_workspace()
         workspace = working_directory.Working(self.working_dir, must_exist=True)
         if self.annotations == None:
             reference = workspace.get_reference()
             annotations_filename = reference.annotations_filename()
         else:
             annotations_filename = self.annotations
         
         types = [ item.lower() for item in self.types.split(',') ]
         
         parts = self.parts or self.types 
         parts = [ item.lower() for item in parts.split(',') ]
         
         
         all_annotations = list(annotation.read_annotations(annotations_filename))
         annotation.link_up_annotations(all_annotations)
         for item in all_annotations: 
             item.primary = None
     
         annotations = [ 
             item 
             for item in all_annotations
             if item.type.lower() in types
         ]
         
         part_annotations = [ ]
         seen = set()
         queue = [ (item,item) for item in annotations ]
         while queue:
             primary, item = queue.pop()
             if item.type.lower() in parts:
                 assert item.primary is None, "Feature with multiple parents"
                 item.primary = primary
                 key = (id(primary),item.start,item.end,item.seqid,item.strand)
                 # Ignore duplicate exons (many isoforms will have the same exons)
                 if key not in seen:
                     seen.add(key)
                     part_annotations.append(item)
             queue.extend( (primary, item2) for item2 in item.children )
         
         del seen
         del all_annotations
         
         self.log.log('%d annotations\n' % len(annotations))
         self.log.log('%d part annotations\n' % len(part_annotations))
         
         #assert annotations, 'No annotations of specified types in file'
         
         for item in part_annotations:
             this_extension = self.extension
             if "max_extension" in item.attr:
                 this_extension = min(this_extension,int(item.attr["max_extension"]))
                 
             if item.strand >= 0:
                 item.tail_pos = item.end
                 item.end += this_extension
             else:
                 item.tail_pos = item.start
                 item.start -= this_extension
         
         for item in annotations:    
             item.hits = [] # [ (tail_length, adaptor_bases) ]
         
         index = span_index.index_annotations(part_annotations)
         
         for alignment in sam.Bam_reader(workspace/'alignments_filtered_sorted.bam'):
             if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
                 continue
        
             start = alignment.reference_start
             end = alignment.reference_end
             alignment_length = end-start
             strand = -1 if alignment.flag&sam.FLAG_REVERSE else 1
             fragment_feature = annotation.Annotation(
                 seqid=alignment.reference_name,
                 start=start,
                 end=end,
                 strand=strand
                 )
             
             if strand >= 0:
                 tail_pos = end
             else:
                 tail_pos = start
             
             tail_length = 0
             adaptor_bases = 0
             for item in alignment.extra:
                 if item.startswith('AN:i:'):
                     tail_length = int(item[5:])
                 elif item.startswith('AD:i:'):
                     adaptor_bases = int(item[5:])
             
             hits = index.get(fragment_feature, same_strand=True)
             if hits:
                 gene = min(hits, key=lambda gene: 
                     (abs(tail_pos - gene.tail_pos), gene.primary.get_id()))
                     # Nearest by tail_pos
                     # failing that, by id to ensure a deterministic choice
                 
                 gene.primary.hits.append( (tail_length,adaptor_bases) )

         for item in annotations:
             del item.parents
             del item.children
             del item.primary

         f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
         pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
         f.close()
Ejemplo n.º 7
0
    def run(self):
        items = list(annotation.read_annotations(self.parent))
                
        annotation.link_up_annotations(items)
        for item in items:
            assert len(item.parents) <= 1
        
        genes = [ item for item in items if item.type == "gene" ]
        downstrand_genes = [ _extend(_three_prime(item), self.extension) for item in genes ]
        exons = [ item for item in items if item.type == "exon" ]
        utrs = [ _extend(item, self.extension) for item in items if item.type == "three_prime_utr" ]
        
        gene_index = span_index.index_annotations(genes)
        downstrand_gene_index = span_index.index_annotations(downstrand_genes)
        exon_index = span_index.index_annotations(exons)
        utr_index = span_index.index_annotations(utrs)
        
        peaks = list(annotation.read_annotations(self.child))
        
        for peak in peaks:
            # Query is final base in genome before poly(A) starts
            query = peak.three_prime().shifted(-1,0)
            
            hit_to = "3'UTR"
            hits = [ item.parents[0].parents[0] for item in
                     utr_index.get(query, True) ]
            
            if not hits:
                hit_to = "Exon"
                hits = [ item.parents[0].parents[0] for item in
                         exon_index.get(query, True) ]
            
            # For non-coding RNAs, which don't have a 3' UTR
            if not hits:
                hit_to = "Downstrand"
                hits = downstrand_gene_index.get(query, True)
            
            if not hits:
                hit_to = "Intron"
                hits = gene_index.get(query, True)

            antisense_hits = gene_index.get(query.reversed(), True)
            if not hits:
                hit_to = "Antisense"
                hits = antisense_hits
            
            if hits:
                peak.attr["Parent"] = join_descriptions([ item.get_id() for item in hits ], ",")
                peak.attr["Relation"] = hit_to
                peak.attr["Name"] = join_descriptions([ item.attr.get("Name","") for item in hits ])
                peak.attr["Product"] = hit_to + " " + join_descriptions([ item.attr.get("Product","") for item in hits ])
                peak.attr["Biotype"] = join_descriptions([ item.attr.get("Biotype","") for item in hits ])
            
            if antisense_hits:
                peak.attr["Antisense_parent"] = join_descriptions([ item.get_id() for item in antisense_hits ], ",")
                peak.attr["Antisense_name"] = join_descriptions([ item.attr.get("Name","") for item in antisense_hits ])
                peak.attr["Antisense_product"] = "Antisense " + join_descriptions([ item.attr.get("Product","") for item in antisense_hits ])
                peak.attr["Antisense_biotype"] = join_descriptions([ item.attr.get("Biotype","") for item in antisense_hits])
                
        
        annotation.write_gff3(self.prefix+"-parent.gff", genes) #Hmm
        annotation.write_gff3(self.prefix+"-child.gff", peaks)
Ejemplo n.º 8
0
    def run(self):    
        work = self.get_workspace()
        work.update_param(remove=['tail_tools_reference_version'])
        
        nesoni.Make_reference(
            self.output_dir,
            filenames = self.filenames,
            snpeff = False,
            cs = 'ifavailable' if self.index else False,
            ls = False,
            bowtie = 'ifavailable' if self.index else False,
            ).run()
            
        annotations = list(annotation.read_annotations(work/'reference.gff'))
        annotation.link_up_annotations(annotations)
        
        exon_index = span_index.index_annotations([
            item for item in annotations if item.type == "exon"
            ])
        mrna_end_index = span_index.index_annotations([
            item.three_prime() for item in annotations if item.type == "mRNA"
            ])
        
        mrna_utrs = [ ]
        gene_utrs = [ ]
        
        for gene in annotations:
            if gene.type != 'gene': continue

            mrnas = [ item for item in gene.children if item.type == 'mRNA' ]
            assert mrnas, "Gene without any mRNAs: "+gene.get_id()

            gene.attr['color'] = '#880088'
            gene.start = min(item.start for item in mrnas)
            gene.end = max(item.end for item in mrnas)
            gene.attr["max_extension"] = str(_max_extension(gene, exon_index, mrna_end_index))
        
            gene_utr_5primes = [ ]
            
            for mrna in mrnas:
                assert mrna.strand == gene.strand, mrna
                assert mrna.seqid == gene.seqid, mrna
                
                mrna.attr["max_extension"] = str(_max_extension(mrna, exon_index, mrna_end_index))
            
                cdss = [ item for item in mrna.children if item.type == 'CDS' ]
                exons = [ item for item in mrna.children if item.type == 'exon' ]
                
                if not exons: continue
                
                #link up annotations sorts children, so final is really final
                for item in exons[:-1]:
                    item.attr["max_extension"] = "0"
                exons[-1].attr["max_extension"] = mrna.attr["max_extension"]
                
                if not cdss: continue
                
                mrna_utr_5primes = [ ]
                if gene.strand >= 0:
                   cds_3prime = max(item.end for item in cdss)
                   for item in exons:
                       if item.end >= cds_3prime:
                           mrna_utr_5primes.append(max(item.start,cds_3prime))
                else:
                   cds_3prime = min(item.start for item in cdss)
                   for item in exons:
                       if item.start <= cds_3prime:
                           mrna_utr_5primes.append(min(item.end,cds_3prime))
                
                if mrna.strand >= 0:
                    utr_start = min(mrna_utr_5primes) if mrna_utr_5primes else mrna.end
                    utr_end = max(utr_start+1,mrna.end)
                    gene_utr_5primes.append(utr_start)
                else:
                    utr_end = max(mrna_utr_5primes) if mrna_utr_5primes else mrna.start
                    utr_start = min(mrna.start,utr_end-1)
                    gene_utr_5primes.append(utr_end)
                
                attr = mrna.attr.copy()
                attr['Parent'] = attr['ID']
                attr['ID'] = attr['ID']+'-3UTR'
                attr['color'] = '#008888'
                utr = annotation.Annotation(
                    source = 'tt',
                    type = 'three_prime_utr',
                    seqid = mrna.seqid,
                    strand = mrna.strand,
                    start = utr_start,
                    end = utr_end,
                    attr = attr,
                    )
                max_ext = _max_extension(utr, exon_index, mrna_end_index)
                utr.attr["max_extension"] = str(max_ext)
                #Only include if there is an annotated 3' UTR or end is not in the middle of some other isoform's exon
                if utr_end-utr_start+max_ext > 1:
                    mrna_utrs.append(utr)
            
            if gene.strand >= 0:
                utr_start = max(gene_utr_5primes) if gene_utr_5primes else gene.end
                utr_end = max(utr_start+1,gene.end)
            else:
                utr_end = min(gene_utr_5primes) if gene_utr_5primes else gene.start
                utr_start = min(gene.start,utr_end-1)
            
            attr = gene.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID']+'-3UTR'
            attr['color'] = '#008888'
            utr = annotation.Annotation(
                source = 'tt',
                type = 'three_prime_utr',
                seqid = gene.seqid,
                strand = gene.strand,
                start = utr_start,
                end = utr_end,
                attr = attr,
                )
            utr.attr["max_extension"] = str(_max_extension(utr, exon_index, mrna_end_index))
            gene_utrs.append(utr)
        
        annotation.write_gff3(work/'reference.gff', annotations + mrna_utrs)
        annotation.write_gff3(work/'utr.gff', gene_utrs)
            
        work.update_param(tail_tools_reference_version=work.VERSION)
Ejemplo n.º 9
0
 def run(self):    
     work = self.get_workspace()
     work.update_param(remove=['tail_tools_reference_version'])
     
     nesoni.Make_reference(
         self.output_dir,
         filenames = self.filenames,
         snpeff = False,
         cs = 'ifavailable' if self.index else False,
         ls = False,
         bowtie = 'ifavailable' if self.index else False,
         ).run()
         
     annotations = list(annotation.read_annotations(work/'reference.gff'))
     annotation.link_up_annotations(annotations)
     
     with open(work/'utr.gff','wb') as f:
         annotation.write_gff3_header(f)
         for gene in annotations:
             if gene.type != 'gene': continue
             mrnas = [ item for item in gene.children if item.type == 'mRNA' ]
         
             utr_5primes = [ ]
             
             for mrna in mrnas:
                 cdss = [ item for item in mrna.children if item.type == 'CDS' ]
                 exons = [ item for item in mrna.children if item.type == 'exon' ]
                 if not cdss or not exons: continue
                 if gene.strand >= 0:
                    cds_3prime = max(item.end for item in cdss)
                    for item in exons:
                        if item.end > cds_3prime:
                            utr_5primes.append(max(item.start,cds_3prime))
                 else:
                    cds_3prime = min(item.start for item in cdss)
                    for item in exons:
                        if item.start < cds_3prime:
                            utr_5primes.append(min(item.end,cds_3prime))
             
             if gene.strand >= 0:
                 utr_start = max(utr_5primes) if utr_5primes else gene.end
                 utr_end = max(utr_start+1,gene.end)
             else:
                 utr_end = min(utr_5primes) if utr_5primes else gene.start
                 utr_start = min(gene.start,utr_end-1)
             
             attr = gene.attr.copy()
             attr['Parent'] = attr['ID']
             attr['ID'] = attr['ID']+'-3UTR'
             thing = annotation.Annotation(
                 source = 'tt',
                 type = 'three_prime_utr',
                 seqid = gene.seqid,
                 strand = gene.strand,
                 start = utr_start,
                 end = utr_end,
                 attr = attr,
                 )
             print >> f, thing.as_gff()
         
         
     work.update_param(tail_tools_reference_version=work.VERSION)
Ejemplo n.º 10
0
    def run(self):
        workspace = self.get_workspace()

        header = ["##gff-version 3\n"]
        lengths = {}
        with io.open_possibly_compressed_file(self.features) as f:
            f.next()
            for line in f:
                if not line.startswith("#"): break
                if line.startswith("##gff-version"): continue
                header.append(line)
                parts = line.strip().split()
                if parts[0] == "##sequence-region":
                    lengths[parts[1]] = int(parts[3])

        header = "".join(header)

        items = list(annotation.read_gff(self.features, "/"))
        annotation.link_up_annotations(items)
        for item in items:
            assert len(item.parents) < 2
            if "ID" in item.attr:
                item.attr["ID"] = item.attr["ID"].split(":")[1]
            if "Parent" in item.attr:
                item.attr["Parent"] = item.attr["Parent"].split(":")[1]
            if item.parents:
                item.parent = item.parents[0]

        def well_supported(item):
            if self.support is None: return True
            level = item.attr.get("transcript_support_level", "NA").split()[0]
            if not level.isdigit(): return False
            return int(level) <= self.support

        exons = [
            item for item in items
            if item.type == "exon" and well_supported(item.parent)
        ]
        exon_index = span_index.index_annotations(exons)

        utrs = []
        extended_utrs = []
        utr_parts = []
        exons_kept = []
        cds_kept = []
        transcripts_kept = []
        for item in items:
            this_exons = [
                item2 for item2 in item.children if item2.type == "exon"
            ]
            if this_exons and well_supported(item):
                transcripts_kept.append(item)
                exons_kept.extend(this_exons)
                cds_kept.extend(
                    [item2 for item2 in item.children if item2.type == "CDS"])

            if self.gene_level:
                utr_bits = [
                    item3 for item2 in item.children if well_supported(item2)
                    for item3 in item2.children if item3.type == self.what
                ]
            else:
                if not well_supported(item): continue
                utr_bits = [
                    item2 for item2 in item.children if item2.type == self.what
                ]

            if not utr_bits:
                continue

            utr = utr_bits[0].copy()
            for item2 in utr_bits[1:]:
                utr = utr.span_with(item2)

            gene = item if self.gene_level else item.parent

            utr.attr = dict(ID=item.get_id(),
                            Name=item.attr["Name"],
                            gene_id=gene.get_id(),
                            gene=gene.attr["Name"],
                            description=gene.attr.get("description", ""),
                            biotype=item.attr["biotype"])

            max_extension = 10000
            if item.strand < 0:
                max_extension = min(max_extension, utr.start)
            else:
                max_extension = min(max_extension,
                                    lengths[utr.seqid] - utr.end)
            assert max_extension >= 0, utr

            end = utr.three_prime()
            for hit in exon_index.get(end.shifted(0, max_extension),
                                      same_strand=True):
                #if hit.parent.get_id() == item.get_id():
                #    continue
                rel = hit.relative_to(end).start
                if rel >= 0:
                    max_extension = min(max_extension, rel)

            extended_utr = utr.shifted(0, max_extension)
            extended_utr.start = max(extended_utr.start, 0)
            utr.attr["max_extension"] = str(max_extension)

            utrs.append(utr)
            extended_utrs.append(extended_utr)

            for item2 in utr_bits:
                part = item2.copy()
                part.attr = dict(Parent=item.get_id())
                part.type = "part"
                utr_parts.append(part)

        write_gff3(workspace / "utr.gff", utrs, header)
        write_gff3(workspace / "utr_extended.gff", extended_utrs, header)
        write_gff3(workspace / "utr_part.gff", utr_parts, header)
        write_gff3(workspace / "transcript.gff", transcripts_kept, header)
        write_gff3(workspace / "exon.gff", exons_kept, header)
        write_gff3(workspace / "cds.gff", cds_kept, header)
Ejemplo n.º 11
0
    def run(self):
        work = self.get_workspace()
        work.update_param(remove=['tail_tools_reference_version'])

        nesoni.Make_reference(
            self.output_dir,
            filenames=self.filenames,
            snpeff=False,
            cs='ifavailable' if self.index else False,
            ls=False,
            bowtie='ifavailable' if self.index else False,
        ).run()

        annotations = list(annotation.read_annotations(work / 'reference.gff'))
        annotation.link_up_annotations(annotations)

        exon_index = span_index.index_annotations(
            [item for item in annotations if item.type == "exon"])
        mrna_end_index = span_index.index_annotations([
            item.three_prime() for item in annotations if item.type == "mRNA"
        ])

        mrna_utrs = []
        gene_utrs = []

        for gene in annotations:
            if gene.type != 'gene': continue

            mrnas = [item for item in gene.children if item.type == 'mRNA']
            assert mrnas, "Gene without any mRNAs: " + gene.get_id()

            gene.attr['color'] = '#880088'
            gene.start = min(item.start for item in mrnas)
            gene.end = max(item.end for item in mrnas)
            gene.attr["max_extension"] = str(
                _max_extension(gene, exon_index, mrna_end_index))

            gene_utr_5primes = []

            for mrna in mrnas:
                assert mrna.strand == gene.strand, mrna
                assert mrna.seqid == gene.seqid, mrna

                mrna.attr["max_extension"] = str(
                    _max_extension(mrna, exon_index, mrna_end_index))

                cdss = [item for item in mrna.children if item.type == 'CDS']
                exons = [item for item in mrna.children if item.type == 'exon']

                if not exons: continue

                #link up annotations sorts children, so final is really final
                for item in exons[:-1]:
                    item.attr["max_extension"] = "0"
                exons[-1].attr["max_extension"] = mrna.attr["max_extension"]

                if not cdss: continue

                mrna_utr_5primes = []
                if gene.strand >= 0:
                    cds_3prime = max(item.end for item in cdss)
                    for item in exons:
                        if item.end >= cds_3prime:
                            mrna_utr_5primes.append(max(
                                item.start, cds_3prime))
                else:
                    cds_3prime = min(item.start for item in cdss)
                    for item in exons:
                        if item.start <= cds_3prime:
                            mrna_utr_5primes.append(min(item.end, cds_3prime))

                if mrna.strand >= 0:
                    utr_start = min(
                        mrna_utr_5primes) if mrna_utr_5primes else mrna.end
                    utr_end = max(utr_start + 1, mrna.end)
                    gene_utr_5primes.append(utr_start)
                else:
                    utr_end = max(
                        mrna_utr_5primes) if mrna_utr_5primes else mrna.start
                    utr_start = min(mrna.start, utr_end - 1)
                    gene_utr_5primes.append(utr_end)

                attr = mrna.attr.copy()
                attr['Parent'] = attr['ID']
                attr['ID'] = attr['ID'] + '-3UTR'
                attr['color'] = '#008888'
                utr = annotation.Annotation(
                    source='tt',
                    type='three_prime_utr',
                    seqid=mrna.seqid,
                    strand=mrna.strand,
                    start=utr_start,
                    end=utr_end,
                    attr=attr,
                )
                max_ext = _max_extension(utr, exon_index, mrna_end_index)
                utr.attr["max_extension"] = str(max_ext)
                #Only include if there is an annotated 3' UTR or end is not in the middle of some other isoform's exon
                if utr_end - utr_start + max_ext > 1:
                    mrna_utrs.append(utr)

            if gene.strand >= 0:
                utr_start = max(
                    gene_utr_5primes) if gene_utr_5primes else gene.end
                utr_end = max(utr_start + 1, gene.end)
            else:
                utr_end = min(
                    gene_utr_5primes) if gene_utr_5primes else gene.start
                utr_start = min(gene.start, utr_end - 1)

            attr = gene.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID'] + '-3UTR'
            attr['color'] = '#008888'
            utr = annotation.Annotation(
                source='tt',
                type='three_prime_utr',
                seqid=gene.seqid,
                strand=gene.strand,
                start=utr_start,
                end=utr_end,
                attr=attr,
            )
            utr.attr["max_extension"] = str(
                _max_extension(utr, exon_index, mrna_end_index))
            gene_utrs.append(utr)

        annotation.write_gff3(work / 'reference.gff', annotations + mrna_utrs)
        annotation.write_gff3(work / 'utr.gff', gene_utrs)

        work.update_param(tail_tools_reference_version=work.VERSION)
Ejemplo n.º 12
0
    def run(self):
        items = list(annotation.read_annotations(self.parent))
                
        annotation.link_up_annotations(items)
        for item in items:
            assert len(item.parents) <= 1
        
        genes = [ item for item in items if item.type == "gene" ]
        downstrand_genes = [ _extend(_three_prime(item), self.extension) for item in genes ]
        exons = [ item for item in items if item.type == "exon" ]
        utrs = [ _extend(item, self.extension) for item in items if item.type == "three_prime_utr" ]
        
        gene_index = span_index.index_annotations(genes)
        downstrand_gene_index = span_index.index_annotations(downstrand_genes)
        exon_index = span_index.index_annotations(exons)
        utr_index = span_index.index_annotations(utrs)
        
        peaks = [ ]
        for peak in annotation.read_annotations(self.child):
            if float(peak.attr.get("mean_tail","0.0")) < self.min_tail:
                continue
            peaks.append(peak)
        
        for peak in peaks:
            # Query is final base in genome before poly(A) starts
            query = peak.three_prime().shifted(-1,0)
            
            hit_to = "3'UTR"
            hits = [ item.parents[0].parents[0] for item in
                     utr_index.get(query, True) ]

            if not hits:
                hit_to = "Exon"
                hits = [ item.parents[0].parents[0] for item in
                         exon_index.get(query, True) ]
            
            # For non-coding RNAs, which don't have a 3' UTR
            if not hits:
                hit_to = "Downstrand"
                hits = downstrand_gene_index.get(query, True)
            
            if not hits:
                hit_to = "Intron"
                hits = gene_index.get(query, True)

            antisense_hits = gene_index.get(query.reversed(), True)
            if not hits:
                hit_to = "Antisense"
                hits = antisense_hits
            
            if hits:
                peak.attr["Parent"] = join_descriptions([ item.get_id() for item in hits ], ",")
                peak.attr["Relation"] = hit_to
                peak.attr["Name"] = join_descriptions([ item.attr.get("Name","") for item in hits ])
                peak.attr["Product"] = hit_to + " " + join_descriptions([ item.attr.get("Product","") for item in hits ])
                peak.attr["Biotype"] = join_descriptions([ item.attr.get("Biotype","") for item in hits ])
            
            if antisense_hits:
                peak.attr["Antisense_parent"] = join_descriptions([ item.get_id() for item in antisense_hits ], ",")
                peak.attr["Antisense_name"] = join_descriptions([ item.attr.get("Name","") for item in antisense_hits ])
                peak.attr["Antisense_product"] = "Antisense " + join_descriptions([ item.attr.get("Product","") for item in antisense_hits ])
                peak.attr["Antisense_biotype"] = join_descriptions([ item.attr.get("Biotype","") for item in antisense_hits])
                
        
        annotation.write_gff3(self.prefix+"-parent.gff", genes) #Hmm
        annotation.write_gff3(self.prefix+"-child.gff", peaks)
Ejemplo n.º 13
0
    def run(self):
        assert self.extension is not None, '--extension must be specified'

        #workspace = self.get_workspace()
        workspace = working_directory.Working(self.working_dir,
                                              must_exist=True)
        if self.annotations == None:
            reference = workspace.get_reference()
            annotations_filename = reference.annotations_filename()
        else:
            annotations_filename = self.annotations

        types = [item.lower() for item in self.types.split(',')]

        parts = self.parts or self.types
        parts = [item.lower() for item in parts.split(',')]

        all_annotations = list(
            annotation.read_annotations(annotations_filename))
        annotation.link_up_annotations(all_annotations)
        for item in all_annotations:
            item.primary = None

        annotations = [
            item for item in all_annotations if item.type.lower() in types
        ]

        part_annotations = []
        seen = set()
        queue = [(item, item) for item in annotations]
        while queue:
            primary, item = queue.pop()
            if item.type.lower() in parts:
                assert item.primary is None, "Feature with multiple parents"
                item.primary = primary
                key = (id(primary), item.start, item.end, item.seqid,
                       item.strand)
                # Ignore duplicate exons (many isoforms will have the same exons)
                if key not in seen:
                    seen.add(key)
                    part_annotations.append(item)
            queue.extend((primary, item2) for item2 in item.children)

        del seen
        del all_annotations

        self.log.log('%d annotations\n' % len(annotations))
        self.log.log('%d part annotations\n' % len(part_annotations))

        #assert annotations, 'No annotations of specified types in file'

        for item in part_annotations:
            this_extension = self.extension
            if "max_extension" in item.attr:
                this_extension = min(this_extension,
                                     int(item.attr["max_extension"]))

            if item.strand >= 0:
                item.tail_pos = item.end
                item.end += this_extension
            else:
                item.tail_pos = item.start
                item.start -= this_extension

        for item in annotations:
            item.hits = []  # [ (tail_length, adaptor_bases) ]

        index = span_index.index_annotations(part_annotations)

        for alignment in sam.Bam_reader(workspace /
                                        'alignments_filtered_sorted.bam'):
            if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
                continue

            start = alignment.reference_start
            end = alignment.reference_end
            alignment_length = end - start
            strand = -1 if alignment.flag & sam.FLAG_REVERSE else 1
            fragment_feature = annotation.Annotation(
                seqid=alignment.reference_name,
                start=start,
                end=end,
                strand=strand)

            if strand >= 0:
                tail_pos = end
            else:
                tail_pos = start

            tail_length = 0
            adaptor_bases = 0
            for item in alignment.extra:
                if item.startswith('AN:i:'):
                    tail_length = int(item[5:])
                elif item.startswith('AD:i:'):
                    adaptor_bases = int(item[5:])

            hits = index.get(fragment_feature, same_strand=True)
            if hits:
                gene = min(
                    hits,
                    key=lambda gene:
                    (abs(tail_pos - gene.tail_pos), gene.primary.get_id()))
                # Nearest by tail_pos
                # failing that, by id to ensure a deterministic choice

                gene.primary.hits.append((tail_length, adaptor_bases))

        for item in annotations:
            del item.parents
            del item.children
            del item.primary

        f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
        pickle.dump((workspace.name, workspace.get_tags(), annotations), f,
                    pickle.HIGHEST_PROTOCOL)
        f.close()