def dominate(genes, log, radius=20):
    """ Trim exons that overlap exons of a more dominant transcript in another gene.

        Every transcript is compared with each same-strand transcript the span
        index returns for it, skipping transcripts that belong to the same gene.
        Both sides get a score tuple and the lower tuple dominates:
          1. the source_levels value (defined elsewhere) for the source,
             using 0 for sources it does not list,
          2. ending further upstrand, where differences of up to radius
             count as no difference,
          3. having Biotype "protein_coding".
        When the other transcript dominates, this transcript's remaining exons
        are replaced by what ablate (defined elsewhere) returns for them.

        Transcripts left without exons, and genes left without transcripts, are
        dropped. As a side effect every input transcript gets .parent set to
        its gene. The log argument is not used.

        Returns a list of gene copies containing transcript copies. """
    def exons_of(feature):
        return [ child for child in feature.children if child.type == "exon" ]

    every_transcript = [ ]
    for gene in genes:
        for transcript in gene.children:
            transcript.parent = gene
        every_transcript.extend(gene.children)

    transcript_index = span_index.index_annotations(every_transcript)

    dominated_genes = [ ]
    for gene in genes:
        trimmed_gene = gene.copy()
        trimmed_gene.children = [ ]
        for transcript in gene.children:
            remaining = exons_of(transcript)
            for rival in transcript_index.get(transcript, same_strand=True):
                if rival.parent == transcript.parent:
                    continue

                # Positive offset: the rival ends further downstrand than this transcript.
                if rival.strand > 0:
                    offset = rival.end - transcript.end
                else:
                    offset = transcript.start - rival.start

                rival_level = source_levels.get(rival.source, 0)
                # Shrink the offset toward zero by radius so that small differences tie.
                if offset < 0:
                    rival_position = min(offset+radius, 0)
                else:
                    rival_position = max(0, offset-radius)
                rival_score = (
                    rival_level,
                    rival_position,
                    rival.attr["Biotype"] != "protein_coding"
                )

                own_score = (
                    source_levels.get(transcript.source, 0),
                    0,
                    transcript.attr["Biotype"] != "protein_coding"
                )

                if not rival_score < own_score:
                    continue

                # The second return value (whether anything was removed) is not needed.
                remaining, _ = ablate(remaining, exons_of(rival), radius)

            if not remaining:
                continue

            trimmed = transcript.copy()
            trimmed.children = [ child for child in transcript.children if child.type != "exon" ] + remaining
            trimmed_gene.children.append(trimmed)

        if trimmed_gene.children:
            dominated_genes.append(trimmed_gene)

    return dominated_genes
Ejemplo n.º 2
0
def dominate(genes, log, radius=20):
    """ Trim exons that overlap exons of a more dominant transcript in another gene.

        Every transcript is compared with each same-strand transcript the span
        index returns for it, skipping transcripts that belong to the same gene.
        Both sides get a score tuple and the lower tuple dominates:
          1. the source_levels value (defined elsewhere) for the source,
             using 0 for sources it does not list,
          2. ending further upstrand, where differences of up to radius
             count as no difference,
          3. having Biotype "protein_coding".
        When the other transcript dominates, this transcript's remaining exons
        are replaced by what ablate (defined elsewhere) returns for them.

        Transcripts left without exons, and genes left without transcripts, are
        dropped. As a side effect every input transcript gets .parent set to
        its gene. The log argument is not used.

        Returns a list of gene copies containing transcript copies. """
    def exons_of(feature):
        return [ child for child in feature.children if child.type == "exon" ]

    every_transcript = [ ]
    for gene in genes:
        for transcript in gene.children:
            transcript.parent = gene
        every_transcript.extend(gene.children)

    transcript_index = span_index.index_annotations(every_transcript)

    dominated_genes = [ ]
    for gene in genes:
        trimmed_gene = gene.copy()
        trimmed_gene.children = [ ]
        for transcript in gene.children:
            remaining = exons_of(transcript)
            for rival in transcript_index.get(transcript, same_strand=True):
                if rival.parent == transcript.parent:
                    continue

                # Positive offset: the rival ends further downstrand than this transcript.
                if rival.strand > 0:
                    offset = rival.end - transcript.end
                else:
                    offset = transcript.start - rival.start

                rival_level = source_levels.get(rival.source, 0)
                # Shrink the offset toward zero by radius so that small differences tie.
                if offset < 0:
                    rival_position = min(offset+radius, 0)
                else:
                    rival_position = max(0, offset-radius)
                rival_score = (
                    rival_level,
                    rival_position,
                    rival.attr["Biotype"] != "protein_coding"
                )

                own_score = (
                    source_levels.get(transcript.source, 0),
                    0,
                    transcript.attr["Biotype"] != "protein_coding"
                )

                if not rival_score < own_score:
                    continue

                # The second return value (whether anything was removed) is not needed.
                remaining, _ = ablate(remaining, exons_of(rival), radius)

            if not remaining:
                continue

            trimmed = transcript.copy()
            trimmed.children = [ child for child in transcript.children if child.type != "exon" ] + remaining
            trimmed_gene.children.append(trimmed)

        if trimmed_gene.children:
            dominated_genes.append(trimmed_gene)

    return dominated_genes
Ejemplo n.º 3
0
def load_analysis(dirname):
    """ Load the peaks stored under dirname into a new Analysis.

        The peaks are read from <dirname>/peaks/relation-child.gff through
        index (defined elsewhere), with each item replaced by the result of
        its three_prime() method. A span index over those peaks is stored
        alongside them as peak_index.

        Returns the populated Analysis. """
    analysis = Analysis()
    peaks_filename = join(dirname,'peaks','relation-child.gff')
    analysis.peaks = index(peaks_filename, modify=lambda peak: peak.three_prime())
    peak_annotations = analysis.peaks.itervalues()
    analysis.peak_index = span_index.index_annotations(peak_annotations)
    return analysis
Ejemplo n.º 4
0
    def run(self):
        """Extract UTR features from self.features and write them as GFF3 files.

        Reads the GFF file named by self.features, selects transcripts whose
        transcript_support_level is within self.support (all of them when
        self.support is None), and for each transcript (or each gene when
        self.gene_level is set) spans its children of type self.what into a
        single feature. For every such feature a max_extension is computed,
        limited to 10000 bases, to the end of the sequence, and to the nearest
        same-strand exon found past its 3' end.

        Writes utr.gff, utr_extended.gff, utr_part.gff, transcript.gff,
        exon.gff and cds.gff into the workspace.
        """
        workspace = self.get_workspace()

        # Rebuild the GFF header and collect sequence lengths from the
        # leading "#" lines of the input file.
        header = ["##gff-version 3\n"]
        lengths = {}
        with io.open_possibly_compressed_file(self.features) as f:
            # The first line is discarded without being inspected
            # (Python 2 iterator protocol).
            f.next()
            for line in f:
                if not line.startswith("#"): break
                if line.startswith("##gff-version"): continue
                header.append(line)
                parts = line.strip().split()
                if parts[0] == "##sequence-region":
                    # The fourth whitespace-separated field is stored as the
                    # length of the sequence named by the second field.
                    lengths[parts[1]] = int(parts[3])

        header = "".join(header)

        items = list(annotation.read_gff(self.features, "/"))
        annotation.link_up_annotations(items)
        for item in items:
            assert len(item.parents) < 2
            # Keep only the piece after the first ":" of ID and Parent.
            # NOTE(review): raises IndexError if a value has no ":" - confirm
            # the input always uses "prefix:identifier" values.
            if "ID" in item.attr:
                item.attr["ID"] = item.attr["ID"].split(":")[1]
            if "Parent" in item.attr:
                item.attr["Parent"] = item.attr["Parent"].split(":")[1]
            # item.parent is only defined for items that have a parent.
            if item.parents:
                item.parent = item.parents[0]

        def well_supported(item):
            # With no threshold configured everything counts as supported.
            if self.support is None: return True
            # Only the first whitespace-separated token of the attribute is
            # used; a non-numeric token (such as the default "NA") fails.
            level = item.attr.get("transcript_support_level", "NA").split()[0]
            if not level.isdigit(): return False
            return int(level) <= self.support

        # Exons of well supported parents; these limit how far a UTR may be
        # extended.
        # NOTE(review): an exon without a parent has no .parent attribute and
        # would raise AttributeError here - confirm this cannot occur.
        exons = [
            item for item in items
            if item.type == "exon" and well_supported(item.parent)
        ]
        exon_index = span_index.index_annotations(exons)

        utrs = []
        extended_utrs = []
        utr_parts = []
        exons_kept = []
        cds_kept = []
        transcripts_kept = []
        for item in items:
            # Any well supported item with exon children is kept as a
            # transcript, together with its exons and CDS features.
            this_exons = [
                item2 for item2 in item.children if item2.type == "exon"
            ]
            if this_exons and well_supported(item):
                transcripts_kept.append(item)
                exons_kept.extend(this_exons)
                cds_kept.extend(
                    [item2 for item2 in item.children if item2.type == "CDS"])

            if self.gene_level:
                # Gene level: gather the pieces from all well supported
                # children of this item.
                utr_bits = [
                    item3 for item2 in item.children if well_supported(item2)
                    for item3 in item2.children if item3.type == self.what
                ]
            else:
                # Transcript level: the item itself must be well supported.
                if not well_supported(item): continue
                utr_bits = [
                    item2 for item2 in item.children if item2.type == self.what
                ]

            if not utr_bits:
                continue

            # Combine all pieces into one feature via span_with.
            utr = utr_bits[0].copy()
            for item2 in utr_bits[1:]:
                utr = utr.span_with(item2)

            gene = item if self.gene_level else item.parent

            utr.attr = dict(ID=item.get_id(),
                            Name=item.attr["Name"],
                            gene_id=gene.get_id(),
                            gene=gene.attr["Name"],
                            description=gene.attr.get("description", ""),
                            biotype=item.attr["biotype"])

            # Start from a fixed cap and do not run off either end of the
            # sequence. The forward-strand lookup raises KeyError when the
            # header had no ##sequence-region line for this seqid.
            max_extension = 10000
            if item.strand < 0:
                max_extension = min(max_extension, utr.start)
            else:
                max_extension = min(max_extension,
                                    lengths[utr.seqid] - utr.end)
            assert max_extension >= 0, utr

            # Shrink the extension so it stops at the nearest same-strand
            # exon whose relative start is not before the UTR's 3' end.
            # NOTE(review): presumably shifted(0, n) moves the 3' end
            # downstrand by n and relative_to gives strand-aware coordinates
            # - confirm against the Annotation class.
            end = utr.three_prime()
            for hit in exon_index.get(end.shifted(0, max_extension),
                                      same_strand=True):
                #if hit.parent.get_id() == item.get_id():
                #    continue
                rel = hit.relative_to(end).start
                if rel >= 0:
                    max_extension = min(max_extension, rel)

            extended_utr = utr.shifted(0, max_extension)
            extended_utr.start = max(extended_utr.start, 0)
            utr.attr["max_extension"] = str(max_extension)

            utrs.append(utr)
            extended_utrs.append(extended_utr)

            # Record each original piece as a "part" whose only attribute is
            # a Parent pointing at this item.
            for item2 in utr_bits:
                part = item2.copy()
                part.attr = dict(Parent=item.get_id())
                part.type = "part"
                utr_parts.append(part)

        write_gff3(workspace / "utr.gff", utrs, header)
        write_gff3(workspace / "utr_extended.gff", extended_utrs, header)
        write_gff3(workspace / "utr_part.gff", utr_parts, header)
        write_gff3(workspace / "transcript.gff", transcripts_kept, header)
        write_gff3(workspace / "exon.gff", exons_kept, header)
        write_gff3(workspace / "cds.gff", cds_kept, header)
def merge(genes, log):
    """ Merge genes that have overlapping same-strand exons into single genes.

        Genes containing any feature on a different sequence or strand than
        the gene itself are skipped, with a message written to log. The
        remaining genes are grouped so that any two genes sharing an exon
        overlap (as reported by the span index) end up in the same group, and
        each group becomes one new gene annotation spanning all its members.

        Returns a list of merged gene annotations. Each has copies of the
        member genes' transcripts as children, with Parent set to the merged
        gene's id. """
    good_genes = [ ]
    exons = [ ]
    gene_exons = { }
    exon_gene = { }
    for gene in genes:
        # NOTE(review): all_within is defined elsewhere; presumably it returns
        # the gene together with all of its descendants - confirm.
        this_all = all_within(gene)
        seqid = gene.seqid
        strand = gene.strand
        sane = True
        for item in this_all:
            sane = sane and item.seqid == seqid
            sane = sane and item.strand == strand
        
        if not sane:
            log.log("Skipping gene "+str(gene)+" due to inconsistent chromosome or strand.\n")
            continue
        
        this_exons = [ item for item in this_all if item.type == "exon" ]

        good_genes.append(gene)
        # gene_exons is filled in here but not read again in this function.
        gene_exons[gene] = this_exons 
        for exon in this_exons:
            exon_gene[exon] = gene
        exons.extend(this_exons)
        
        
    # Join every pair of distinct genes that own overlapping same-strand exons.
    index = span_index.index_annotations(exons)
    union = Union_find(good_genes)
    for exon in exons:
        gene1 = exon_gene[exon]
        for hit in index.get(exon, same_strand=True):
            gene2 = exon_gene[hit]
            if gene1 != gene2: union.join(gene1,gene2)
    
    
    # Report how many groups of each size were produced.
    sets = union.get_sets()    
    counts = collections.defaultdict(int)
    for item in sets:
        counts[len(item)] += 1
    for n in sorted(counts):
        log.log("Merging produced %d sets of %d genes\n" % (counts[n], n))
    
    result = [ ]
    for gene_set in sets:
        # seqid, strand, source and type are taken from whichever member
        # comes first when the group is listed.
        gene_list = list(gene_set)
        seqid = gene_list[0].seqid
        strand = gene_list[0].strand        
        source = gene_list[0].source
        type = gene_list[0].type
        
        for gene in gene_set:
            assert gene.seqid == seqid
            assert gene.strand == strand
        
        start = min(gene.start for gene in gene_list)
        end = max(gene.end for gene in gene_list)        
        # Collect the distinct values of every attribute across the group,
        # then collapse each collection with join_up (defined elsewhere),
        # asking for the brief form only for "Name".
        attr = collections.defaultdict(set)
        for gene in gene_set:
            for name in gene.attr:
                attr[name].add(gene.attr[name])
        for name in attr:
            attr[name] = join_up(attr[name], brief = name == "Name")
        attr = dict(attr)

        # Python 2 print statement: reports real merges on standard output
        # rather than through log.
        if len(gene_set) >= 2:
            print "Merged gene:", attr.get("Name",""), attr.get("Biotype","")
        
        merged_gene = annotation.Annotation(
            seqid = seqid,
            source = source,
            type = type,
            start = start,
            end = end,
            strand = strand,
            attr = attr
            )
        merged_gene.children = [ ]
        for gene in gene_list:
            for transcript in gene.children:
                # The transcript is copied, but its children list is the same
                # list object as the original transcript's.
                new_transcript = transcript.copy()
                new_transcript.children = transcript.children
                new_transcript.attr["Parent"] = merged_gene.get_id()
                merged_gene.children.append(new_transcript)
        result.append(merged_gene) 
    
    return result
 def utr_index(self):
     """ Return a span index built over the values of self.utrs. """
     utr_annotations = self.utrs.values()
     return span_index.index_annotations(utr_annotations)
Ejemplo n.º 7
0
def merge(genes, log):
    """ Merge genes that have overlapping same-strand exons into single genes.

        Genes containing any feature on a different sequence or strand than
        the gene itself are skipped, with a message written to log. The
        remaining genes are grouped so that any two genes sharing an exon
        overlap (as reported by the span index) end up in the same group, and
        each group becomes one new gene annotation spanning all its members.

        Returns a list of merged gene annotations. Each has copies of the
        member genes' transcripts as children, with Parent set to the merged
        gene's id. """
    good_genes = [ ]
    exons = [ ]
    gene_exons = { }
    exon_gene = { }
    for gene in genes:
        # NOTE(review): all_within is defined elsewhere; presumably it returns
        # the gene together with all of its descendants - confirm.
        this_all = all_within(gene)
        seqid = gene.seqid
        strand = gene.strand
        sane = True
        for item in this_all:
            sane = sane and item.seqid == seqid
            sane = sane and item.strand == strand
        
        if not sane:
            log.log("Skipping gene "+str(gene)+" due to inconsistent chromosome or strand.\n")
            continue
        
        this_exons = [ item for item in this_all if item.type == "exon" ]

        good_genes.append(gene)
        # gene_exons is filled in here but not read again in this function.
        gene_exons[gene] = this_exons 
        for exon in this_exons:
            exon_gene[exon] = gene
        exons.extend(this_exons)
        
        
    # Join every pair of distinct genes that own overlapping same-strand exons.
    index = span_index.index_annotations(exons)
    union = Union_find(good_genes)
    for exon in exons:
        gene1 = exon_gene[exon]
        for hit in index.get(exon, same_strand=True):
            gene2 = exon_gene[hit]
            if gene1 != gene2: union.join(gene1,gene2)
    
    
    # Report how many groups of each size were produced.
    sets = union.get_sets()    
    counts = collections.defaultdict(int)
    for item in sets:
        counts[len(item)] += 1
    for n in sorted(counts):
        log.log("Merging produced %d sets of %d genes\n" % (counts[n], n))
    
    result = [ ]
    for gene_set in sets:
        # seqid, strand, source and type are taken from whichever member
        # comes first when the group is listed.
        gene_list = list(gene_set)
        seqid = gene_list[0].seqid
        strand = gene_list[0].strand        
        source = gene_list[0].source
        type = gene_list[0].type
        
        for gene in gene_set:
            assert gene.seqid == seqid
            assert gene.strand == strand
        
        start = min(gene.start for gene in gene_list)
        end = max(gene.end for gene in gene_list)        
        # Collect the distinct values of every attribute across the group,
        # then collapse each collection with join_up (defined elsewhere),
        # asking for the brief form only for "Name".
        attr = collections.defaultdict(set)
        for gene in gene_set:
            for name in gene.attr:
                attr[name].add(gene.attr[name])
        for name in attr:
            attr[name] = join_up(attr[name], brief = name == "Name")
        attr = dict(attr)

        # Python 2 print statement: reports real merges on standard output
        # rather than through log.
        if len(gene_set) >= 2:
            print "Merged gene:", attr.get("Name",""), attr.get("Biotype","")
        
        merged_gene = annotation.Annotation(
            seqid = seqid,
            source = source,
            type = type,
            start = start,
            end = end,
            strand = strand,
            attr = attr
            )
        merged_gene.children = [ ]
        for gene in gene_list:
            for transcript in gene.children:
                # The transcript is copied, but its children list is the same
                # list object as the original transcript's.
                new_transcript = transcript.copy()
                new_transcript.children = transcript.children
                new_transcript.attr["Parent"] = merged_gene.get_id()
                merged_gene.children.append(new_transcript)
        result.append(merged_gene) 
    
    return result
     def run(self):
         """Assign alignments to annotated features and pickle the tail observations.

         Features whose type is listed in self.types are the primary
         annotations. They and their descendants whose type is listed in
         self.parts (self.types when self.parts is empty) are the parts that
         alignments are matched against. Each part is extended at its
         downstrand end by self.extension, or by its max_extension attribute
         if that is smaller; the coordinates are modified in place.

         Every alignment in alignments_filtered_sorted.bam that is mapped,
         not secondary and not supplementary is given to the overlapping
         same-strand part whose original downstrand end is nearest to the
         alignment's downstrand end, with ties broken by the primary
         feature's id. A (tail_length, adaptor_bases) pair, read from the
         AN:i: and AD:i: extra fields (0 when absent), is appended to the
         primary feature's hits list.

         The tuple (workspace.name, workspace.get_tags(), annotations) is
         pickled to self.prefix + '.pickle.gz'.

         Raises AssertionError if self.extension is None, or if a part
         feature is reached from more than one primary feature.
         """
         assert self.extension is not None, '--extension must be specified'

         workspace = working_directory.Working(self.working_dir, must_exist=True)
         if self.annotations is None:
             reference = workspace.get_reference()
             annotations_filename = reference.annotations_filename()
         else:
             annotations_filename = self.annotations

         types = [ item.lower() for item in self.types.split(',') ]

         # Parts default to the primary types themselves.
         parts = self.parts or self.types
         parts = [ item.lower() for item in parts.split(',') ]

         all_annotations = list(annotation.read_annotations(annotations_filename))
         annotation.link_up_annotations(all_annotations)
         for item in all_annotations:
             item.primary = None

         annotations = [
             item
             for item in all_annotations
             if item.type.lower() in types
         ]

         # Walk down from every primary feature, claiming the feature itself
         # and any descendant whose type is one of the part types.
         part_annotations = [ ]
         seen = set()
         queue = [ (item,item) for item in annotations ]
         while queue:
             primary, item = queue.pop()
             if item.type.lower() in parts:
                 assert item.primary is None, "Feature with multiple parents"
                 item.primary = primary
                 key = (id(primary),item.start,item.end,item.seqid,item.strand)
                 # Ignore duplicate exons (many isoforms will have the same exons)
                 if key not in seen:
                     seen.add(key)
                     part_annotations.append(item)
             queue.extend( (primary, item2) for item2 in item.children )

         del seen
         del all_annotations

         self.log.log('%d annotations\n' % len(annotations))
         self.log.log('%d part annotations\n' % len(part_annotations))

         # An empty selection is allowed; the pickle then holds an empty list.

         for item in part_annotations:
             this_extension = self.extension
             if "max_extension" in item.attr:
                 this_extension = min(this_extension,int(item.attr["max_extension"]))

             # Remember the unextended downstrand end before stretching the part.
             if item.strand >= 0:
                 item.tail_pos = item.end
                 item.end += this_extension
             else:
                 item.tail_pos = item.start
                 item.start -= this_extension

         for item in annotations:
             item.hits = [] # [ (tail_length, adaptor_bases) ]

         index = span_index.index_annotations(part_annotations)

         for alignment in sam.Bam_reader(workspace/'alignments_filtered_sorted.bam'):
             if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
                 continue

             start = alignment.reference_start
             end = alignment.reference_end
             strand = -1 if alignment.flag&sam.FLAG_REVERSE else 1
             fragment_feature = annotation.Annotation(
                 seqid=alignment.reference_name,
                 start=start,
                 end=end,
                 strand=strand
                 )

             if strand >= 0:
                 tail_pos = end
             else:
                 tail_pos = start

             tail_length = 0
             adaptor_bases = 0
             for item in alignment.extra:
                 if item.startswith('AN:i:'):
                     tail_length = int(item[5:])
                 elif item.startswith('AD:i:'):
                     adaptor_bases = int(item[5:])

             hits = index.get(fragment_feature, same_strand=True)
             if hits:
                 # Nearest by tail_pos
                 # failing that, by id to ensure a deterministic choice
                 nearest = min(hits, key=lambda part:
                     (abs(tail_pos - part.tail_pos), part.primary.get_id()))

                 nearest.primary.hits.append( (tail_length,adaptor_bases) )

         # Drop the links between features before pickling, so only the
         # primary features themselves are serialized.
         for item in annotations:
             del item.parents
             del item.children
             del item.primary

         f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
         try:
             pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
         finally:
             # Close the output even when pickling fails part way through.
             f.close()
Ejemplo n.º 9
0
 def utr_index(self):
     """ Return a span index built over the values of self.utrs. """
     utr_annotations = self.utrs.values()
     return span_index.index_annotations(utr_annotations)
Ejemplo n.º 10
0
 def gene_index(self):
     """ Return a span index built over the values of self.genes. """
     gene_annotations = self.genes.values()
     return span_index.index_annotations(gene_annotations)
Ejemplo n.º 11
0
 def peak_index(self):
     """ Return a span index built over the values of self.peaks. """
     peak_annotations = self.peaks.itervalues()
     return span_index.index_annotations(peak_annotations)
Ejemplo n.º 12
0
    def run(self):
        """Build the reference and add 3' UTR features and max_extension values.

        Runs nesoni.Make_reference, then reads back reference.gff from the
        workspace. For every gene it resets the gene span to cover its mRNAs,
        records a max_extension attribute on the gene, its mRNAs and their
        exons, and creates a three_prime_utr feature per mRNA and one per
        gene. reference.gff is then rewritten with the mRNA-level UTRs
        appended, and the gene-level UTRs are written to utr.gff.

        Raises AssertionError if a gene has no mRNA children, or if an mRNA's
        strand or seqid differs from its gene's.
        """
        work = self.get_workspace()
        # The version parameter is removed up front and only set again on the
        # last line, once everything has been written.
        work.update_param(remove=['tail_tools_reference_version'])

        nesoni.Make_reference(
            self.output_dir,
            filenames=self.filenames,
            snpeff=False,
            cs='ifavailable' if self.index else False,
            ls=False,
            bowtie='ifavailable' if self.index else False,
        ).run()

        annotations = list(annotation.read_annotations(work / 'reference.gff'))
        annotation.link_up_annotations(annotations)

        # Indexes over all exons and over the 3' ends of all mRNAs; both are
        # handed to _max_extension (defined elsewhere).
        exon_index = span_index.index_annotations(
            [item for item in annotations if item.type == "exon"])
        mrna_end_index = span_index.index_annotations([
            item.three_prime() for item in annotations if item.type == "mRNA"
        ])

        mrna_utrs = []
        gene_utrs = []

        for gene in annotations:
            if gene.type != 'gene': continue

            mrnas = [item for item in gene.children if item.type == 'mRNA']
            assert mrnas, "Gene without any mRNAs: " + gene.get_id()

            # The gene span is replaced by the span covering all its mRNAs.
            gene.attr['color'] = '#880088'
            gene.start = min(item.start for item in mrnas)
            gene.end = max(item.end for item in mrnas)
            gene.attr["max_extension"] = str(
                _max_extension(gene, exon_index, mrna_end_index))

            # One entry per mRNA that has both exons and CDS: the 5' edge of
            # that mRNA's 3' UTR.
            gene_utr_5primes = []

            for mrna in mrnas:
                assert mrna.strand == gene.strand, mrna
                assert mrna.seqid == gene.seqid, mrna

                mrna.attr["max_extension"] = str(
                    _max_extension(mrna, exon_index, mrna_end_index))

                cdss = [item for item in mrna.children if item.type == 'CDS']
                exons = [item for item in mrna.children if item.type == 'exon']

                if not exons: continue

                # Only the last exon in the children list may be extended;
                # every other exon gets a max_extension of "0".
                #link up annotations sorts children, so final is really final
                for item in exons[:-1]:
                    item.attr["max_extension"] = "0"
                exons[-1].attr["max_extension"] = mrna.attr["max_extension"]

                if not cdss: continue

                # Collect, for each exon reaching the 3' end of the CDS or
                # beyond, where its UTR portion begins (clipped to the CDS end).
                mrna_utr_5primes = []
                if gene.strand >= 0:
                    cds_3prime = max(item.end for item in cdss)
                    for item in exons:
                        if item.end >= cds_3prime:
                            mrna_utr_5primes.append(max(
                                item.start, cds_3prime))
                else:
                    cds_3prime = min(item.start for item in cdss)
                    for item in exons:
                        if item.start <= cds_3prime:
                            mrna_utr_5primes.append(min(item.end, cds_3prime))

                # The UTR runs from the earliest such position to the mRNA's
                # 3' end, and is forced to be at least one base long.
                if mrna.strand >= 0:
                    utr_start = min(
                        mrna_utr_5primes) if mrna_utr_5primes else mrna.end
                    utr_end = max(utr_start + 1, mrna.end)
                    gene_utr_5primes.append(utr_start)
                else:
                    utr_end = max(
                        mrna_utr_5primes) if mrna_utr_5primes else mrna.start
                    utr_start = min(mrna.start, utr_end - 1)
                    gene_utr_5primes.append(utr_end)

                # The UTR inherits the mRNA's attributes, becomes its child,
                # and gets an ID derived from the mRNA's ID.
                attr = mrna.attr.copy()
                attr['Parent'] = attr['ID']
                attr['ID'] = attr['ID'] + '-3UTR'
                attr['color'] = '#008888'
                utr = annotation.Annotation(
                    source='tt',
                    type='three_prime_utr',
                    seqid=mrna.seqid,
                    strand=mrna.strand,
                    start=utr_start,
                    end=utr_end,
                    attr=attr,
                )
                max_ext = _max_extension(utr, exon_index, mrna_end_index)
                utr.attr["max_extension"] = str(max_ext)
                #Only include if there is an annotated 3' UTR or end is not in the middle of some other isoform's exon
                if utr_end - utr_start + max_ext > 1:
                    mrna_utrs.append(utr)

            # Gene-level UTR: starts at the most downstrand of the per-mRNA
            # UTR starts (or at the gene's 3' end when there are none) and is
            # again at least one base long.
            if gene.strand >= 0:
                utr_start = max(
                    gene_utr_5primes) if gene_utr_5primes else gene.end
                utr_end = max(utr_start + 1, gene.end)
            else:
                utr_end = min(
                    gene_utr_5primes) if gene_utr_5primes else gene.start
                utr_start = min(gene.start, utr_end - 1)

            attr = gene.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID'] + '-3UTR'
            attr['color'] = '#008888'
            utr = annotation.Annotation(
                source='tt',
                type='three_prime_utr',
                seqid=gene.seqid,
                strand=gene.strand,
                start=utr_start,
                end=utr_end,
                attr=attr,
            )
            utr.attr["max_extension"] = str(
                _max_extension(utr, exon_index, mrna_end_index))
            # Unlike the mRNA-level UTRs, the gene-level UTR is always kept.
            gene_utrs.append(utr)

        # reference.gff is overwritten with the modified annotations plus the
        # mRNA-level UTRs; gene-level UTRs go to their own file.
        annotation.write_gff3(work / 'reference.gff', annotations + mrna_utrs)
        annotation.write_gff3(work / 'utr.gff', gene_utrs)

        work.update_param(tail_tools_reference_version=work.VERSION)
Ejemplo n.º 13
0
    def run(self):
        """ Relate each peak in self.child to genes annotated in self.parent.

            Peaks whose mean_tail attribute (default "0.0") is below
            self.min_tail are discarded. For each remaining peak, same-strand
            hits are looked for in this order: extended 3' UTRs, exons, the
            extended region at the 3' end of genes, whole genes. If none of
            those hit, genes on the opposite strand are used. The first
            non-empty result sets the peak's Parent, Relation, Name, Product
            and Biotype attributes. Opposite-strand gene hits are additionally
            recorded in the Antisense_* attributes whenever there are any.

            Writes <prefix>-parent.gff (the genes) and <prefix>-child.gff
            (the peaks). """
        items = list(annotation.read_annotations(self.parent))

        annotation.link_up_annotations(items)
        for item in items:
            assert len(item.parents) <= 1

        def of_type(type_name):
            return [ item for item in items if item.type == type_name ]

        genes = of_type("gene")
        downstrand_genes = [ _extend(_three_prime(gene), self.extension) for gene in genes ]
        exons = of_type("exon")
        utrs = [ _extend(utr, self.extension) for utr in of_type("three_prime_utr") ]

        gene_index = span_index.index_annotations(genes)
        downstrand_gene_index = span_index.index_annotations(downstrand_genes)
        exon_index = span_index.index_annotations(exons)
        utr_index = span_index.index_annotations(utrs)

        def gene_of(feature):
            # UTRs and exons hang off a transcript, which hangs off a gene.
            return feature.parents[0].parents[0]

        def sense_hits(query):
            """ Return (relation, hits) for the first same-strand lookup that hits. """
            found = [ gene_of(item) for item in utr_index.get(query, True) ]
            if found:
                return "3'UTR", found

            found = [ gene_of(item) for item in exon_index.get(query, True) ]
            if found:
                return "Exon", found

            # For non-coding RNAs, which don't have a 3' UTR
            found = downstrand_gene_index.get(query, True)
            if found:
                return "Downstrand", found

            return "Intron", gene_index.get(query, True)

        def ids_of(features):
            return join_descriptions([ item.get_id() for item in features ], ",")

        def described(features, name):
            return join_descriptions([ item.attr.get(name,"") for item in features ])

        peaks = [
            peak for peak in annotation.read_annotations(self.child)
            if not float(peak.attr.get("mean_tail","0.0")) < self.min_tail
            ]

        for peak in peaks:
            # Query is final base in genome before poly(A) starts
            query = peak.three_prime().shifted(-1,0)

            hit_to, hits = sense_hits(query)

            antisense_hits = gene_index.get(query.reversed(), True)
            if not hits:
                hit_to, hits = "Antisense", antisense_hits

            if hits:
                peak.attr["Parent"] = ids_of(hits)
                peak.attr["Relation"] = hit_to
                peak.attr["Name"] = described(hits, "Name")
                peak.attr["Product"] = hit_to + " " + described(hits, "Product")
                peak.attr["Biotype"] = described(hits, "Biotype")

            if antisense_hits:
                peak.attr["Antisense_parent"] = ids_of(antisense_hits)
                peak.attr["Antisense_name"] = described(antisense_hits, "Name")
                peak.attr["Antisense_product"] = "Antisense " + described(antisense_hits, "Product")
                peak.attr["Antisense_biotype"] = described(antisense_hits, "Biotype")

        annotation.write_gff3(self.prefix+"-parent.gff", genes) #Hmm
        annotation.write_gff3(self.prefix+"-child.gff", peaks)
Ejemplo n.º 14
0
    def run(self):
        """Assign alignments to annotated features and pickle the tail observations.

        Features whose type is listed in self.types are the primary
        annotations. They and their descendants whose type is listed in
        self.parts (self.types when self.parts is empty) are the parts that
        alignments are matched against. Each part is extended at its
        downstrand end by self.extension, or by its max_extension attribute
        if that is smaller; the coordinates are modified in place.

        Every alignment in alignments_filtered_sorted.bam that is mapped,
        not secondary and not supplementary is given to the overlapping
        same-strand part whose original downstrand end is nearest to the
        alignment's downstrand end, with ties broken by the primary
        feature's id. A (tail_length, adaptor_bases) pair, read from the
        AN:i: and AD:i: extra fields (0 when absent), is appended to the
        primary feature's hits list.

        The tuple (workspace.name, workspace.get_tags(), annotations) is
        pickled to self.prefix + '.pickle.gz'.

        Raises AssertionError if self.extension is None, or if a part
        feature is reached from more than one primary feature.
        """
        assert self.extension is not None, '--extension must be specified'

        workspace = working_directory.Working(self.working_dir,
                                              must_exist=True)
        if self.annotations is None:
            reference = workspace.get_reference()
            annotations_filename = reference.annotations_filename()
        else:
            annotations_filename = self.annotations

        types = [item.lower() for item in self.types.split(',')]

        # Parts default to the primary types themselves.
        parts = self.parts or self.types
        parts = [item.lower() for item in parts.split(',')]

        all_annotations = list(
            annotation.read_annotations(annotations_filename))
        annotation.link_up_annotations(all_annotations)
        for item in all_annotations:
            item.primary = None

        annotations = [
            item for item in all_annotations if item.type.lower() in types
        ]

        # Walk down from every primary feature, claiming the feature itself
        # and any descendant whose type is one of the part types.
        part_annotations = []
        seen = set()
        queue = [(item, item) for item in annotations]
        while queue:
            primary, item = queue.pop()
            if item.type.lower() in parts:
                assert item.primary is None, "Feature with multiple parents"
                item.primary = primary
                key = (id(primary), item.start, item.end, item.seqid,
                       item.strand)
                # Ignore duplicate exons (many isoforms will have the same exons)
                if key not in seen:
                    seen.add(key)
                    part_annotations.append(item)
            queue.extend((primary, item2) for item2 in item.children)

        del seen
        del all_annotations

        self.log.log('%d annotations\n' % len(annotations))
        self.log.log('%d part annotations\n' % len(part_annotations))

        # An empty selection is allowed; the pickle then holds an empty list.

        for item in part_annotations:
            this_extension = self.extension
            if "max_extension" in item.attr:
                this_extension = min(this_extension,
                                     int(item.attr["max_extension"]))

            # Remember the unextended downstrand end before stretching the part.
            if item.strand >= 0:
                item.tail_pos = item.end
                item.end += this_extension
            else:
                item.tail_pos = item.start
                item.start -= this_extension

        for item in annotations:
            item.hits = []  # [ (tail_length, adaptor_bases) ]

        index = span_index.index_annotations(part_annotations)

        for alignment in sam.Bam_reader(workspace /
                                        'alignments_filtered_sorted.bam'):
            if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
                continue

            start = alignment.reference_start
            end = alignment.reference_end
            strand = -1 if alignment.flag & sam.FLAG_REVERSE else 1
            fragment_feature = annotation.Annotation(
                seqid=alignment.reference_name,
                start=start,
                end=end,
                strand=strand)

            if strand >= 0:
                tail_pos = end
            else:
                tail_pos = start

            tail_length = 0
            adaptor_bases = 0
            for item in alignment.extra:
                if item.startswith('AN:i:'):
                    tail_length = int(item[5:])
                elif item.startswith('AD:i:'):
                    adaptor_bases = int(item[5:])

            hits = index.get(fragment_feature, same_strand=True)
            if hits:
                # Nearest by tail_pos
                # failing that, by id to ensure a deterministic choice
                nearest = min(
                    hits,
                    key=lambda part:
                    (abs(tail_pos - part.tail_pos), part.primary.get_id()))

                nearest.primary.hits.append((tail_length, adaptor_bases))

        # Drop the links between features before pickling, so only the
        # primary features themselves are serialized.
        for item in annotations:
            del item.parents
            del item.children
            del item.primary

        f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
        try:
            pickle.dump((workspace.name, workspace.get_tags(), annotations), f,
                        pickle.HIGHEST_PROTOCOL)
        finally:
            # Close the output even when pickling fails part way through.
            f.close()
Ejemplo n.º 15
0
    def run(self):
        """ Build the reference directory, then derive 3' UTR annotations for it.

            Steps, in order:
            - remove the workspace's 'tail_tools_reference_version' parameter,
            - run nesoni.Make_reference on self.filenames into self.output_dir,
            - read reference.gff back and link annotations to parents/children,
            - for every gene: set its span to the union of its mRNAs, colour
              it, and store a "max_extension" attribute (as a string) on the
              gene, each of its mRNAs and each mRNA's exons,
            - build one 'three_prime_utr' annotation per mRNA that has both
              exons and CDS, and one per gene,
            - rewrite reference.gff with the per-mRNA UTRs appended, write the
              per-gene UTRs to utr.gff,
            - set 'tail_tools_reference_version' to work.VERSION.

            Raises AssertionError if a gene has no mRNA children, or if an
            mRNA's strand or seqid differs from its gene's.
        """
        work = self.get_workspace()
        # The version parameter is removed up front and only restored on the
        # last line -- presumably so an interrupted run is not mistaken for a
        # complete reference (TODO confirm against the code reading it).
        work.update_param(remove=['tail_tools_reference_version'])
        
        # cs and bowtie are requested as 'ifavailable' only when self.index is
        # truthy; snpeff and ls are always disabled.
        nesoni.Make_reference(
            self.output_dir,
            filenames = self.filenames,
            snpeff = False,
            cs = 'ifavailable' if self.index else False,
            ls = False,
            bowtie = 'ifavailable' if self.index else False,
            ).run()
            
        annotations = list(annotation.read_annotations(work/'reference.gff'))
        annotation.link_up_annotations(annotations)
        
        # Two span indexes handed to _max_extension below: all exons, and the
        # three_prime() of every mRNA (presumably its 3' end position).
        exon_index = span_index.index_annotations([
            item for item in annotations if item.type == "exon"
            ])
        mrna_end_index = span_index.index_annotations([
            item.three_prime() for item in annotations if item.type == "mRNA"
            ])
        
        mrna_utrs = [ ]   # per-mRNA UTRs, appended to reference.gff
        gene_utrs = [ ]   # per-gene UTRs, written to utr.gff
        
        for gene in annotations:
            if gene.type != 'gene': continue

            mrnas = [ item for item in gene.children if item.type == 'mRNA' ]
            assert mrnas, "Gene without any mRNAs: "+gene.get_id()

            gene.attr['color'] = '#880088'
            # The gene span is overwritten with the union of its mRNAs *before*
            # _max_extension is called on the gene.
            gene.start = min(item.start for item in mrnas)
            gene.end = max(item.end for item in mrnas)
            gene.attr["max_extension"] = str(_max_extension(gene, exon_index, mrna_end_index))
        
            # One entry per mRNA that has exons and CDS: the 5' boundary of
            # that mRNA's 3' UTR.
            gene_utr_5primes = [ ]
            
            for mrna in mrnas:
                assert mrna.strand == gene.strand, mrna
                assert mrna.seqid == gene.seqid, mrna
                
                mrna.attr["max_extension"] = str(_max_extension(mrna, exon_index, mrna_end_index))
            
                cdss = [ item for item in mrna.children if item.type == 'CDS' ]
                exons = [ item for item in mrna.children if item.type == 'exon' ]
                
                # mRNAs without exons get a max_extension but no UTR
                if not exons: continue
                
                #link up annotations sorts children, so final is really final
                # Only the last exon may be extended; it inherits the mRNA's value.
                for item in exons[:-1]:
                    item.attr["max_extension"] = "0"
                exons[-1].attr["max_extension"] = mrna.attr["max_extension"]
                
                # mRNAs without CDS get no UTR and contribute nothing to
                # gene_utr_5primes
                if not cdss: continue
                
                # For each exon reaching to or past the 3'-most CDS boundary,
                # record where the UTR portion of that exon begins (clipped to
                # the CDS boundary).
                mrna_utr_5primes = [ ]
                if gene.strand >= 0:
                   cds_3prime = max(item.end for item in cdss)
                   for item in exons:
                       if item.end >= cds_3prime:
                           mrna_utr_5primes.append(max(item.start,cds_3prime))
                else:
                   cds_3prime = min(item.start for item in cdss)
                   for item in exons:
                       if item.start <= cds_3prime:
                           mrna_utr_5primes.append(min(item.end,cds_3prime))
                
                # The UTR runs from the 5'-most recorded position to the mRNA's
                # 3' end, falling back to the mRNA end itself when nothing was
                # recorded. The +1 / -1 keeps the span at least one base wide.
                if mrna.strand >= 0:
                    utr_start = min(mrna_utr_5primes) if mrna_utr_5primes else mrna.end
                    utr_end = max(utr_start+1,mrna.end)
                    gene_utr_5primes.append(utr_start)
                else:
                    utr_end = max(mrna_utr_5primes) if mrna_utr_5primes else mrna.start
                    utr_start = min(mrna.start,utr_end-1)
                    gene_utr_5primes.append(utr_end)
                
                # The UTR carries a copy of the mRNA's attributes, re-parented
                # to the mRNA and given the ID "<mRNA ID>-3UTR".
                attr = mrna.attr.copy()
                attr['Parent'] = attr['ID']
                attr['ID'] = attr['ID']+'-3UTR'
                attr['color'] = '#008888'
                utr = annotation.Annotation(
                    source = 'tt',
                    type = 'three_prime_utr',
                    seqid = mrna.seqid,
                    strand = mrna.strand,
                    start = utr_start,
                    end = utr_end,
                    attr = attr,
                    )
                max_ext = _max_extension(utr, exon_index, mrna_end_index)
                utr.attr["max_extension"] = str(max_ext)
                #Only include if there is an annotated 3' UTR or end is not in the middle of some other isoform's exon
                if utr_end-utr_start+max_ext > 1:
                    mrna_utrs.append(utr)
            
            # Gene-level UTR: starts at the 3'-most of the per-mRNA UTR 5'
            # boundaries (max on the forward strand, min on the reverse) and
            # runs to the gene's 3' end; falls back to the gene end when no
            # mRNA contributed a boundary.
            if gene.strand >= 0:
                utr_start = max(gene_utr_5primes) if gene_utr_5primes else gene.end
                utr_end = max(utr_start+1,gene.end)
            else:
                utr_end = min(gene_utr_5primes) if gene_utr_5primes else gene.start
                utr_start = min(gene.start,utr_end-1)
            
            attr = gene.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID']+'-3UTR'
            attr['color'] = '#008888'
            utr = annotation.Annotation(
                source = 'tt',
                type = 'three_prime_utr',
                seqid = gene.seqid,
                strand = gene.strand,
                start = utr_start,
                end = utr_end,
                attr = attr,
                )
            utr.attr["max_extension"] = str(_max_extension(utr, exon_index, mrna_end_index))
            # Unlike the per-mRNA UTRs, the gene-level UTR is always kept.
            gene_utrs.append(utr)
        
        # reference.gff is overwritten in place with the modified annotations
        # plus the per-mRNA UTRs.
        annotation.write_gff3(work/'reference.gff', annotations + mrna_utrs)
        annotation.write_gff3(work/'utr.gff', gene_utrs)
            
        work.update_param(tail_tools_reference_version=work.VERSION)
Ejemplo n.º 16
0
    def run(self):
        """ Relate each peak in self.child to gene(s) read from self.parent.

            Same-strand relations are tried in the order 3'UTR, Exon,
            Downstrand, Intron; the first one yielding any hit is used.
            Antisense is used as the relation only when none of those hits.
            Opposite-strand genes are also recorded in separate Antisense_*
            attributes whenever any exist.

            Writes self.prefix+"-parent.gff" (the genes) and
            self.prefix+"-child.gff" (the annotated peaks).
        """
        features = list(annotation.read_annotations(self.parent))

        annotation.link_up_annotations(features)
        # At most one parent each, so parents[0] below is unambiguous
        assert all(len(feature.parents) <= 1 for feature in features)

        def of_type(wanted):
            return [ feature for feature in features if feature.type == wanted ]

        genes = of_type("gene")
        downstrand_genes = [ _extend(_three_prime(gene), self.extension) for gene in genes ]
        exons = of_type("exon")
        utrs = [ _extend(utr, self.extension) for utr in of_type("three_prime_utr") ]

        gene_index = span_index.index_annotations(genes)
        downstrand_gene_index = span_index.index_annotations(downstrand_genes)
        exon_index = span_index.index_annotations(exons)
        utr_index = span_index.index_annotations(utrs)

        def grandparents(found):
            # Two levels up from the indexed feature
            # (presumably UTR/exon -> mRNA -> gene)
            return [ feature.parents[0].parents[0] for feature in found ]

        def id_list(related):
            return join_descriptions([ gene.get_id() for gene in related ], ",")

        def describe(related, key):
            return join_descriptions([ gene.attr.get(key,"") for gene in related ])

        # Same-strand lookups in priority order; each is only evaluated if
        # every earlier one came back empty.
        sense_lookups = [
            ("3'UTR", lambda where: grandparents(utr_index.get(where, True))),
            ("Exon", lambda where: grandparents(exon_index.get(where, True))),
            # For non-coding RNAs, which don't have a 3' UTR
            ("Downstrand", lambda where: downstrand_gene_index.get(where, True)),
            ("Intron", lambda where: gene_index.get(where, True)),
            ]

        peaks = list(annotation.read_annotations(self.child))

        for peak in peaks:
            # Query is final base in genome before poly(A) starts
            query = peak.three_prime().shifted(-1,0)

            for relation, lookup in sense_lookups:
                related = lookup(query)
                if related:
                    break

            # Always looked up: reported separately even when a sense hit exists
            antisense = gene_index.get(query.reversed(), True)
            if not related:
                relation = "Antisense"
                related = antisense

            if related:
                peak.attr["Parent"] = id_list(related)
                peak.attr["Relation"] = relation
                peak.attr["Name"] = describe(related, "Name")
                peak.attr["Product"] = relation + " " + describe(related, "Product")
                peak.attr["Biotype"] = describe(related, "Biotype")

            if antisense:
                peak.attr["Antisense_parent"] = id_list(antisense)
                peak.attr["Antisense_name"] = describe(antisense, "Name")
                peak.attr["Antisense_product"] = "Antisense " + describe(antisense, "Product")
                peak.attr["Antisense_biotype"] = describe(antisense, "Biotype")

        annotation.write_gff3(self.prefix+"-parent.gff", genes) #Hmm
        annotation.write_gff3(self.prefix+"-child.gff", peaks)
 def peak_index(self):
     """Return a span index built over every value in self.peaks.

     Uses values() rather than itervalues(): itervalues() only exists on
     Python 2 dicts, whereas values() is available on both Python 2 and 3.
     """
     return span_index.index_annotations(self.peaks.values())
    def run(self):
        """ Extract features of type self.what from a GFF3 file as UTR records.

            Reads self.features, links annotations to their parents, and for
            each gene (when self.gene_level is set) or each transcript-like
            item (otherwise) spans all of its self.what features into a single
            "utr" record. Each record also gets a "max_extension": how far its
            3' end may be extended, capped at 10000 bases, at the sequence
            boundary, and at the nearest same-strand supported exon.

            When self.support is not None, only items whose
            transcript_support_level is a number <= self.support are used.

            Files written to the workspace, all with the input's '#' header:
            utr.gff, utr_extended.gff, utr_part.gff, transcript.gff, exon.gff
            and cds.gff.

            Note: "ID" and "Parent" attributes are rewritten to the part after
            the first ':' -- this assumes values of the form "type:identifier"
            (presumably Ensembl-style GFF3; an ID without ':' raises IndexError).
        """
        workspace = self.get_workspace()

        # Collect the leading '#' lines so they can be replayed in every
        # output file, and pick up sequence lengths from ##sequence-region.
        header = [ "##gff-version 3\n" ]
        lengths = { }
        with io.open_possibly_compressed_file(self.features) as f:
            # Skip the first line unconditionally (presumably the file's own
            # ##gff-version line, which the header above already supplies).
            # The builtin next() works on Python 2.6+ and 3, unlike f.next().
            next(f)
            for line in f:
                if not line.startswith("#"): break
                if line.startswith("##gff-version"): continue
                header.append(line)
                parts = line.strip().split()
                if parts[0] == "##sequence-region":
                    # ##sequence-region <seqid> <start> <end>
                    lengths[parts[1]] = int(parts[3])

        header = "".join(header)

        items = list(annotation.read_gff(self.features, "/"))
        annotation.link_up_annotations(items)
        for item in items:
            assert len(item.parents) < 2
            if "ID" in item.attr:
                item.attr["ID"] = item.attr["ID"].split(":")[1]
            if "Parent" in item.attr:
                item.attr["Parent"] = item.attr["Parent"].split(":")[1]
            # Convenience link; items without a parent get no .parent attribute
            if item.parents:
                item.parent = item.parents[0]

        def well_supported(item):
            """ True if item passes the self.support threshold (or none is set). """
            if self.support is None: return True
            # Only the first whitespace-separated token of the level is used;
            # anything non-numeric (including the "NA" default) fails.
            level = item.attr.get("transcript_support_level","NA").split()[0]
            if not level.isdigit(): return False
            return int(level) <= self.support

        # Exons of supported transcripts limit how far a UTR may be extended
        exons = [ item for item in items if item.type == "exon" and well_supported(item.parent) ]
        exon_index = span_index.index_annotations(exons)

        utrs = [ ]
        extended_utrs = [ ]
        utr_parts = [ ]
        exons_kept = [ ]
        cds_kept = [ ]
        transcripts_kept = [ ]
        for item in items:
            # Any supported item that directly owns exons counts as a kept
            # transcript, independently of whether it yields a UTR below.
            this_exons = [ item2 for item2 in item.children if item2.type == "exon" ]
            if this_exons and well_supported(item):
                transcripts_kept.append(item)
                exons_kept.extend(this_exons)
                cds_kept.extend([ item2 for item2 in item.children if item2.type == "CDS" ])

            if self.gene_level:
                # Pool the features of all supported children (two levels down)
                utr_bits = [ item3 for item2 in item.children  if well_supported(item2)
                                   for item3 in item2.children if item3.type == self.what ]
            else:
                if not well_supported(item): continue
                utr_bits = [ item2 for item2 in item.children if item2.type == self.what ]

            if not utr_bits:
                continue

            # Single record spanning all the pieces
            utr = utr_bits[0].copy()
            for item2 in utr_bits[1:]:
                utr = utr.span_with(item2)

            gene = item if self.gene_level else item.parent

            utr.attr = dict(
                ID=item.get_id(),
                Name=item.attr["Name"],
                gene_id=gene.get_id(),
                gene=gene.attr["Name"],
                description=gene.attr.get("description",""),
                biotype=item.attr["biotype"]
                )

            # Cap the extension at 10000 bases and at the distance to the
            # sequence boundary on the downstrand side.
            max_extension = 10000
            if item.strand < 0:
                max_extension = min(max_extension, utr.start)
            else:
                max_extension = min(max_extension, lengths[utr.seqid] - utr.end)
            assert max_extension >= 0, utr

            # Shrink further so the extension stops at the nearest same-strand
            # supported exon found within the extension window whose position
            # relative to the UTR's 3' end is non-negative. Exons of the
            # item's own transcript are not excluded.
            end = utr.three_prime()
            for hit in exon_index.get(end.shifted(0,max_extension), same_strand=True):
                rel = hit.relative_to(end).start
                if rel >= 0:
                    max_extension = min(max_extension, rel)

            extended_utr = utr.shifted(0,max_extension)
            extended_utr.start = max(extended_utr.start, 0)
            # Set after extended_utr was derived from utr
            utr.attr["max_extension"] = str(max_extension)

            utrs.append(utr)
            extended_utrs.append(extended_utr)

            # The individual pieces, re-typed as "part" and parented to item
            for item2 in utr_bits:
                part = item2.copy()
                part.attr = dict(Parent=item.get_id())
                part.type = "part"
                utr_parts.append(part)

        write_gff3(workspace/"utr.gff",utrs,header)
        write_gff3(workspace/"utr_extended.gff",extended_utrs,header)
        write_gff3(workspace/"utr_part.gff",utr_parts,header)
        write_gff3(workspace/"transcript.gff",transcripts_kept,header)
        write_gff3(workspace/"exon.gff",exons_kept,header)
        write_gff3(workspace/"cds.gff",cds_kept,header)

     




        
 def gene_index(self):
     """Return a span index built over every value in self.genes."""
     all_genes = self.genes.values()
     return span_index.index_annotations(all_genes)