Example 1
def make_file_for_primer_3(gff_file, ref_file, names_file, output_file, start,
                           end):
    # Check for a tmp directory
    if len(glob.glob("./tmp")) == 0:
        call(["mkdir", "tmp"])

    gff_file = list(annotation.read_annotations(gff_file))
    print "\nReading in the reference file\n"
    seq_dict = dict(io.read_sequences(ref_file))

    names_file = open(names_file).readlines()
    config = open("Software/primer_config.txt").readlines()

    with open("tmp/regions_" + output_file, 'w') as out_f:
        for name in names_file:
            sname = name.strip("\n ")
            found = False
            for line in gff_file:
                gff_name = line.attr.get("Name", "No_name")
                peak = line.attr.get("id", "No_id")
                if sname in gff_name.split("/"):
                    out_f.write("SEQUENCE_ID=" + gff_name.replace("/", "_") +
                                "_" + peak + "\n")
                    # Move the peaks 30 bases proximal
                    out_f.write("SEQUENCE_TEMPLATE=" +
                                line.shifted(start, end).get_seq(seq_dict) +
                                "\n")
                    found = True
                    for cline in config:
                        out_f.write(cline.strip("\n") + "\n")
                    out_f.write("=" + "\n")
            if not found:
                print "Could not find the gene " + sname + " in the reference gff file\n"
Example 2
    def run(self):
        if self.output is not None:
            out_file = open(self.output,'wb')
        else:
            out_file = sys.stdout
    
        annotation.write_gff3_header(out_file)
        
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue
                
                if 'ID' not in item.attr and 'locus_tag' in item.attr:
                    item.attr['ID'] = item.attr['locus_tag']
                    
                if 'color' not in item.attr:
                    if item.type == 'CDS':
                        item.attr['color'] = '#008800'
                    if item.type == 'rRNA':
                        item.attr['color'] = '#bb0000'
                    if item.type == 'tRNA':
                        item.attr['color'] = '#bb00bb'
                    if item.type == 'misc_feature':
                        item.attr['color'] = '#8888ff'

                print >> out_file, item.as_gff()
        
        if self.output is not None:
            out_file.close()
Example 3
 def set_annotations(self, filenames):
     f = self.open('reference.gff','wb')
     annotation.write_gff3_header(f)
     for filename in filenames:
         for feature in annotation.read_annotations(filename):
             print >> f, feature.as_gff()
     f.close()
Example 4
 def set_annotations(self, filenames):
     f = self.open('reference.gff','wb')
     print >> f, '##gff-version 3'
     for filename in filenames:
         for feature in annotation.read_annotations(filename):
             print >> f, feature.as_gff()
     f.close()
Example 5
 def build_snpeff(self):
     jar = io.find_jar('snpEff.jar')
     
     with open(self/'snpeff.config','wb') as f:
         print >> f, 'data_dir = snpeff'
         print >> f, 'genomes : ' + self.name
         print >> f, self.name + '.genome : ' + self.name 
     
     snpwork = io.Workspace(self/'snpeff',must_exist=False)
     snpwork_genome = io.Workspace(snpwork/self.name,must_exist=False)
     snpwork_genomes = io.Workspace(snpwork/'genomes',must_exist=False)
     
     annotations = self.annotations_filename()
     assert annotations
     with open(snpwork_genome/'genes.gff','wb') as f:
         for record in annotation.read_annotations(annotations):
             if record.end <= record.start: continue
             if not record.attr:
                 record.attr['attributes'] = 'none'
             print >> f, record.as_gff()
     
     with open(snpwork_genomes/(self.name+'.fa'),'wb') as f:
         for name, seq in io.read_sequences(self.reference_fasta_filename()):
             io.write_fasta(f, name, seq)
             
     io.execute('java -jar JAR build NAME -gff3 -c CONFIG',
         JAR=jar, NAME=self.name, CONFIG=self/'snpeff.config')
Example 6
    def coding_regions(self):
        coding_regions = {}
        annotations = list(
            annotation.read_annotations(join(self.dirname, 'reference.gff')))
        annotation.link_up_annotations(annotations)
        for item in annotations:
            if item.type == "CDS":
                [mrna] = item.parents
                [gene] = mrna.parents
                name = gene.get_id()
                if name not in coding_regions:
                    coding_regions[name] = []
                coding_regions[name].append(item)

        for gene in annotations:
            if gene.type != "gene": continue
            name = gene.get_id()
            if name not in coding_regions:
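                # Gene has no recorded CDS: fall back to a feature anchored
                # at the gene's 3' end.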
                coding_region = gene.three_prime()
                coding_region.attr = {"ID": name}
            else:
                items = coding_regions[name]
                coding_region = annotation.Annotation(
                    type="coding_region",
                    seqid=items[0].seqid,
                    strand=items[0].strand,
                    start=min(item2.start for item2 in items),
                    end=max(item2.end for item2 in items),
                    attr={"ID": name},
                )
            coding_regions[name] = coding_region

        return coding_regions
Example 7
 def set_annotations(self, filenames):
     f = self.open('reference.gff', 'wb')
     print >> f, '##gff-version 3'
     for filename in filenames:
         for feature in annotation.read_annotations(filename):
             print >> f, feature.as_gff()
     f.close()
Example 8
    def run(self):
        only = set( item.lower() for item in self.only )
        exclude = set( item.lower() for item in self.exclude )
        
        if self.output is not None:
            out_file = open(self.output,'wb')
        else:
            out_file = sys.stdout
    
        print >> out_file, '##gff-version 3'
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if only and item.type.lower() not in only: continue
                if item.type.lower() in exclude: continue
                
                if 'ID' not in item.attr and 'locus_tag' in item.attr:
                    item.attr['ID'] = item.attr['locus_tag']
                    
                if 'color' not in item.attr:
                    if item.type == 'CDS':
                        item.attr['color'] = '#008800'
                    if item.type == 'rRNA':
                        item.attr['color'] = '#bb0000'
                    if item.type == 'tRNA':
                        item.attr['color'] = '#bb00bb'
                    if item.type == 'misc_feature':
                        item.attr['color'] = '#8888ff'

                print >> out_file, item.as_gff()

        if self.output is not None:
            out_file.close()
Example 9
    def run(self):
        extractions = [ ]
        for item in self.genes.split(','):
            extraction = item.split('/')
            assert len(extraction) == 4
            extractions.append(extraction)
            
        rename = { }
        if self.rename:
            for item in self.rename.split(','):
                old,new = item.split('=')
                rename[old] = new

        work = self.get_workspace()        
        
        with workspace.tempspace() as temp:
            items = list(annotation.read_annotations(self.annotation))
            for item in items:
                item.seqid = rename.get(item.seqid, item.seqid)
            annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
            del items
            
            with open(temp/'temp.fa','wb') as f:
                for name,seq in io.read_sequences(self.genome):
                    name = name.split()[0]
                    name = rename.get(name,name)
                    io.write_fasta(f, name, seq)
            
            reference_directory.Make_tt_reference(
                self.output_dir,
                filenames = [ temp/'temp.fa', temp/'temp.gff' ],
                index = self.index,
                ).run()
Example 10
 def _describe_peaks(self, r):        
     workspace = io.Workspace(self.output_dir, must_exist=False)
     
     counts = io.read_grouped_table(workspace/("expression","peakwise","counts.csv"))["Count"]
     
     peak_counts = collections.defaultdict(int)
     read_counts = collections.defaultdict(int)
     total = 0
     for item in annotation.read_annotations(workspace/("peaks","relation-child.gff")):
         peak_counts[item.attr.get("Relation","None")] += 1
         read_counts[item.attr.get("Relation","None")] += sum(int(c) for c in counts[item.get_id()].values())            
         total += 1
     
     total_reads = sum(read_counts.values())
     
     r.write("<p>\n")
     r.write("%d peaks\n" % total)
     
     for name, desc in [
             ("3'UTR", "in a 3' UTR"),
             ("Exon", "otherwise in an exon"),
             ("Downstrand", "otherwise downstrand of a non-coding RNA"),
             ("Intron", "otherwise in an intron"),
             ("Antisense", "otherwise antisense to a gene"),
             ("None", "couldn't be related to annotated genes"),
             ]:
         r.write("<br/>%d peaks and %.1f%% of reads %s\n" % (peak_counts[name], read_counts[name]*100.0/total_reads, desc))
     r.write("</p>\n")
Example 11
def make_file_for_primer_3(gff_file, ref_file, names_file, output_file):
    # Check for a tmp directory
    if len(glob.glob("./tmp")) == 0:
        call(["mkdir", "tmp"])

    gff_file = list(annotation.read_annotations(gff_file))
    print "\n Reading in the reference file"
    seq_dict = dict(io.read_sequences(ref_file))
    names_file = open(names_file).readlines()
    config  = open("primer_config.txt").readlines()

    with open("tmp/regions_" + output_file, 'w') as out_f:
        for name in names_file:
            sname = name.strip("\n")
            found = False
            for line in gff_file:
                gff_name = line.attr.get("Name", "No_name")
                peak = line.attr.get("id", "No_id")
                if sname in gff_name.split("/"):
                    out_f.write("SEQUENCE_ID=" + gff_name.replace("/", "_") + "_" + peak + "\n")
                    out_f.write("SEQUENCE_TEMPLATE=" + line.shifted(-100, 0).get_seq(seq_dict) + "\n")
                    found = True
                    for cline in config:
                        out_f.write(cline.strip("\n") + "\n")
                    out_f.write("=" + "\n")
            if not found:
                print "Could not find the gene " + sname + " in the gff file"
Example 12
    def run(self):
        if self.output is not None:
            out_file = open(self.output, 'wb')
        else:
            out_file = sys.stdout

        annotation.write_gff3_header(out_file)

        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue

                if 'ID' not in item.attr and 'locus_tag' in item.attr:
                    item.attr['ID'] = item.attr['locus_tag']

                if 'color' not in item.attr:
                    if item.type == 'CDS':
                        item.attr['color'] = '#008800'
                    if item.type == 'rRNA':
                        item.attr['color'] = '#bb0000'
                    if item.type == 'tRNA':
                        item.attr['color'] = '#bb00bb'
                    if item.type == 'misc_feature':
                        item.attr['color'] = '#8888ff'

                print >> out_file, item.as_gff()

        if self.output is not None:
            out_file.close()
Example 13
    def run(self):
        extractions = [ ]
        for item in self.genes.split(','):
            extraction = item.split('/')
            assert len(extraction) == 4
            extractions.append(extraction)
            
        rename = { }
        if self.rename:
            for item in self.rename.split(','):
                old,new = item.split('=')
                rename[old] = new

        work = self.get_workspace()        
        
        with workspace.tempspace() as temp:
            items = list(annotation.read_annotations(self.annotation))
            for item in items:
                item.seqid = rename.get(item.seqid, item.seqid)
            annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
            del items
            
            with open(temp/'temp.fa','wb') as f:
                for name,seq in io.read_sequences(self.genome):
                    name = name.split()[0]
                    name = rename.get(name,name)
                    io.write_fasta(f, name, seq)
            
            reference_directory.Make_tt_reference(
                self.output_dir,
                filenames = [ temp/'temp.fa', temp/'temp.gff' ] + self.extra,
                index = self.index, shrimp = self.shrimp, 
                bowtie = self.bowtie, star = self.star
                ).run()
Example 14
 def _describe_peaks(self, r):        
     workspace = io.Workspace(self.output_dir, must_exist=False)
     
     counts = io.read_grouped_table(workspace/("expression","peakwise","counts.csv"))["Count"]
     
     peak_counts = collections.defaultdict(int)
     read_counts = collections.defaultdict(int)
     total = 0
     for item in annotation.read_annotations(workspace/("peaks","relation-child.gff")):
         peak_counts[item.attr.get("Relation","None")] += 1
         read_counts[item.attr.get("Relation","None")] += sum(int(c) for c in counts[item.get_id()].values())            
         total += 1
     
     total_reads = sum(read_counts.values())
     
     r.write("<p>\n")
     r.write("%d peaks\n" % total)
     
     for name, desc in [
             ("3'UTR", "in a 3' UTR"),
             ("Exon", "otherwise in an exon"),
             ("Downstrand", "otherwise downstrand of a non-coding RNA"),
             ("Intron", "otherwise in an intron"),
             ("Antisense", "otherwise antisense to a gene"),
             ("None", "couldn't be related to annotated genes"),
             ]:
         r.write("<br/>%d peaks and %.1f%% of reads %s\n" % (peak_counts[name], read_counts[name]*100.0/total_reads, desc))
     r.write("</p>\n")
Example 15
    def run(self):
        only = set(item.lower() for item in self.only)
        exclude = set(item.lower() for item in self.exclude)

        if self.output is not None:
            out_file = open(self.output, 'wb')
        else:
            out_file = sys.stdout

        print >> out_file, '##gff-version 3'
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if only and item.type.lower() not in only: continue
                if item.type.lower() in exclude: continue

                if 'ID' not in item.attr and 'locus_tag' in item.attr:
                    item.attr['ID'] = item.attr['locus_tag']

                if 'color' not in item.attr:
                    if item.type == 'CDS':
                        item.attr['color'] = '#008800'
                    if item.type == 'rRNA':
                        item.attr['color'] = '#bb0000'
                    if item.type == 'tRNA':
                        item.attr['color'] = '#bb00bb'
                    if item.type == 'misc_feature':
                        item.attr['color'] = '#8888ff'

                print >> out_file, item.as_gff()

        if self.output is not None:
            out_file.close()
Example 16
 def coding_regions(self):
     coding_regions = { }
     annotations = list(annotation.read_annotations(join(self.dirname,'reference.gff')))
     annotation.link_up_annotations(annotations)
     for item in annotations:
         if item.type == "CDS":
             [ mrna ] = item.parents
             [ gene ] = mrna.parents
             name = gene.get_id()
             if name not in coding_regions:
                 coding_regions[name] = [ ]
             coding_regions[name].append(item)
     
     for gene in annotations:
         if gene.type != "gene": continue
         name = gene.get_id()
         if name not in coding_regions:
             coding_region = gene.three_prime()
             coding_region.attr = { "ID" : name }
         else:
             items = coding_regions[name]
             coding_region = annotation.Annotation(
                 type = "coding_region",
                 seqid = items[0].seqid,
                 strand= items[0].strand,
                 start = min(item2.start for item2 in items),
                 end = max(item2.end for item2 in items),
                 attr = { "ID" : name },
                 )
         coding_regions[name] = coding_region
     
     return coding_regions
Example 17
    def run(self):
        features_parent = [
            _Related_feature(item, item.start, item.end, [])
            for item in annotation.read_annotations(self.parent)
            if selection.matches(self.select_parent, [item.type])
        ]
        features_child = [
            _Related_feature(item, item.start, item.end, [])
            for item in annotation.read_annotations(self.child)
            if selection.matches(self.select_child, [item.type])
        ]

        index = {}
        for item in features_child:
            if item.feature.seqid not in index:
                index[item.feature.seqid] = span_index.Span_index()
            index[item.feature.seqid].insert(item)
        for value in index.values():
            value.prepare()

        for item_1 in features_parent:
            if item_1.feature.strand == 1:
                start = item_1.start - self.upstrand
                end = item_1.end + self.downstrand
            elif item_1.feature.strand == -1:
                start = item_1.start - self.downstrand
                end = item_1.end + self.upstrand
            else:
                start = item_1.start - max(self.upstrand, self.downstrand)
                end = item_1.end + max(self.upstrand, self.downstrand)
            if item_1.feature.seqid in index:
                for item_2 in index[item_1.feature.seqid].get(start, end):
                    item_1.relations.append(item_2)
                    item_2.relations.append(item_1)

        for item in features_parent:
            item.modify_with_relations(self.use, self.to_child, self.to_parent)

        with open(self.prefix + '-parent.gff', 'wb') as f:
            annotation.write_gff3_header(f)
            for item in features_parent:
                print >> f, item.feature.as_gff()

        with open(self.prefix + '-child.gff', 'wb') as f:
            annotation.write_gff3_header(f)
            for item in features_child:
                print >> f, item.feature.as_gff()
Example 18
    def run(self):
        features_parent = [ 
            _Related_feature(item,item.start,item.end,[]) 
            for item in annotation.read_annotations(self.parent) 
            if selection.matches(self.select_parent, [item.type]) 
            ]
        features_child = [ 
            _Related_feature(item,item.start,item.end,[]) 
            for item in annotation.read_annotations(self.child) 
            if selection.matches(self.select_child, [item.type])
            ]
        
        index = { }
        for item in features_child:
            if item.feature.seqid not in index:
                index[item.feature.seqid] = span_index.Span_index()
            index[item.feature.seqid].insert(item)
        for value in index.values():
            value.prepare()
        
        for item_1 in features_parent:
            if item_1.feature.strand == 1:
                start = item_1.start - self.upstrand
                end = item_1.end + self.downstrand
            elif item_1.feature.strand == -1:
                start = item_1.start - self.downstrand
                end = item_1.end + self.upstrand
            else:
                start = item_1.start - max(self.upstrand,self.downstrand)
                end = item_1.end + max(self.upstrand,self.downstrand)
            if item_1.feature.seqid in index:
                for item_2 in index[item_1.feature.seqid].get(start,end):
                    item_1.relations.append(item_2)
                    item_2.relations.append(item_1)

        for item in features_parent:
            item.modify_with_relations(self.use, self.to_child, self.to_parent)
        
        with open(self.prefix + '-parent.gff','wb') as f:
            annotation.write_gff3_header(f)
            for item in features_parent:
                print >> f, item.feature.as_gff()
        
        with open(self.prefix + '-child.gff','wb') as f:
            annotation.write_gff3_header(f)
            for item in features_child:
                print >> f, item.feature.as_gff()
Example 19
def index(filename, type=None, modify = lambda item: item, name = lambda item: item.get_id()):
    result = { }
    for item in annotation.read_annotations(filename):
        if type is not None and item.type != type: continue
        item = modify(item)
        assert name(item) not in result
        result[name(item)] = item
    return result    
Example 20
def index(filename,
          type=None,
          modify=lambda item: item,
          name=lambda item: item.get_id()):
    result = collections.OrderedDict()
    for item in annotation.read_annotations(filename):
        if type is not None and item.type != type: continue
        item = modify(item)
        assert name(item) not in result
        result[name(item)] = item
    return result
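
A minimal usage sketch for this index() helper; the GFF file name and the
locus_tag attribute are hypothetical, and the annotation module is assumed to
be importable as in the examples above:

# Hypothetical: index gene features by locus_tag, falling back to their ID.
genes = index('reference.gff',
              type='gene',
              name=lambda item: item.attr.get('locus_tag', item.get_id()))
print(len(genes))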
Example 21
    def run(self):
        assert self.change_strand in STRAND_CHANGE, 'Unknown way to change strand.'
        strand_changer = STRAND_CHANGE[self.change_strand]

        shift_start_absolute, shift_start_proportion = decode_shift(
            self.shift_start)
        shift_end_absolute, shift_end_proportion = decode_shift(self.shift_end)

        renames = []
        if self.rename:
            for item in self.rename.split(','):
                new, old = item.split('=')
                if new != old:
                    renames.append((new, old))

        out_file = open(self.prefix + '.gff', 'wb')
        annotation.write_gff3_header(out_file)

        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue

                if self.type:
                    item.type = self.type

                length = item.end - item.start
                shift_start = int(
                    math.floor(0.5 + shift_start_absolute +
                               shift_start_proportion * length))
                shift_end = int(
                    math.floor(0.5 + shift_end_absolute +
                               shift_end_proportion * length))

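                # Shifts are applied in transcript orientation: on the
                # reverse strand the feature's 5' end is item.end, so the
                # start/end arithmetic is mirrored.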
                if item.strand == 1:
                    item.start += shift_start
                    item.end += shift_end
                elif item.strand == -1:
                    item.end -= shift_start
                    item.start -= shift_end
                item.start = max(0, item.start)  #IGV complains

                item.strand = strand_changer[item.strand]

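                # Two passes over a copy of the attributes: remove every old
                # key first, then re-add values under the new names, so the
                # renames behave like a simultaneous assignment.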
                old_attr = item.attr.copy()
                for new, old in renames:
                    if old in item.attr:
                        del item.attr[old]
                for new, old in renames:
                    if old in old_attr:
                        item.attr[new] = old_attr[old]

                print >> out_file, item.as_gff()

        out_file.close()
Example 22
    def run(self):
        annotations = [ ]
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue
                if self.type:
                    item.type = self.type
                annotations.append(item)
        
        annotations.sort(key=lambda item: (item.seqid, item.strand, item.start))
        
        group = [ ]
        groups = [ ]
        def emit():
            if not group: return
            groups.append(group[:])
            del group[:]        
        seqid = None
        strand = None
        end = 0
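        # Sweep the sorted features: a feature joins the current group only
        # while it overlaps the group's running end by more than self.overlap
        # bases; otherwise the group is emitted and a new one starts.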
        for item in annotations:
            if item.seqid != seqid or item.strand != strand or item.start >= end:
                emit()
                seqid = item.seqid
                strand = item.strand
                end = item.end-self.overlap
            group.append(item)
            end = max(item.end-self.overlap, end)
        emit()

        out_file = open(self.prefix+'.gff','wb')
        annotation.write_gff3_header(out_file)

        for group in groups:
            item = annotation.Annotation()
            item.source = group[0].source
            item.type = join_descriptions( item2.type for item2 in group )
            item.seqid = group[0].seqid
            item.strand = group[0].strand
            item.start = min( item2.start for item2 in group )
            item.end = max( item2.end for item2 in group )
            item.score = None
            item.phase = None
            item.attr = { }
            
            for item2 in group:
                for key in item2.attr:
                    if key in item.attr: continue
                    item.attr[key] = join_descriptions( item3.attr[key] for item3 in group if key in item3.attr )
            
            print >> out_file, item.as_gff()
            
        out_file.close()
Example 23
    def run(self):
        assert self.release
        assert self.species
        assert self.assembly
        assert self.dna
        
        extractions = [ ]
        for item in self.genes.split(','):
            extraction = item.split('/')
            assert len(extraction) == 4
            extractions.append(extraction)
            
        rename = { }
        if self.rename:
            for item in self.rename.split(','):
                old,new = item.split('=')
                rename[old] = new

        work = self.get_workspace()        
        ensembl = workspace.Workspace(work/'ensembl')
        
        genome_filename = self.species+"."+self.assembly+"."+self.dna+".fa.gz"
        genome_url = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release+"/fasta/"+self.species.lower()+"/dna/"+genome_filename
        
        gff_filename = self.species+"."+self.assembly+"."+self.release+".gff3.gz"
        gff_url = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release+"/gff3/"+self.species.lower()+"/"+gff_filename
        
        
        if self.download:
            self.log.log("Fetching "+genome_url+"\n")
            io.execute(['rsync','-aP',genome_url, ensembl/genome_filename])
            self.log.log("Fetching "+gff_url+"\n")
            io.execute(['rsync','-aP',gff_url, ensembl/gff_filename])
        
        with workspace.tempspace() as temp:
            items = list(annotation.read_annotations(ensembl/gff_filename))
            for item in items:
                item.seqid = rename.get(item.seqid, item.seqid)
            annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
            del items
            
            with open(temp/'temp.fa','wb') as f:
                for name,seq in io.read_sequences(ensembl/genome_filename):
                    name = name.split()[0]
                    name = rename.get(name,name)
                    io.write_fasta(f, name, seq)
            
            reference_directory.Make_tt_reference(
                self.output_dir,
                filenames = [ temp/'temp.fa', temp/'temp.gff' ],
                index = self.index,
                ).run()
Example 24
    def run(self):
        f = self.begin_output()

        for filename in self.filenames:
            info = io.get_file_info(filename)

            any = False

            name = os.path.splitext(os.path.split(filename)[1])[0]

            if info.matches('sequences'):
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                print >> f, grace.datum(name, 'total bases', total_length)
                if total:
                    print >> f, grace.datum(name, 'average length',
                                            float(total_length) / total)
                print >> f
                any = True

            if info.matches('annotations'):
                total = 0
                counts = {}
                for item in annotation.read_annotations(filename, "/"):
                    total += 1
                    counts[item.type] = counts.get(item.type, 0) + 1

                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features',
                                            counts[key])
                print >> f
                any = True

            if info.matches('type-vcf'):
                reader_f = io.open_possibly_compressed_file(filename)
                reader = vcf.Reader(reader_f)
                n = 0
                for item in reader:
                    n += 1
                print >> f, grace.datum(name, 'variants', n)
                any = True

            if not any:
                raise grace.Error('Don\'t know what to do with ' + filename)

        self.end_output(f)
Example 25
    def run(self):
        f = self.begin_output()

        for filename in self.filenames:
            any = False

            name = os.path.splitext(os.path.split(filename)[1])[0]

            try:
                iterator = io.read_sequences(filename, qualities=True)
            except grace.Error:
                iterator = None

            if iterator is not None:
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                if total:
                    print >> f, grace.datum(name, 'average length',
                                            float(total_length) / total)
                print >> f
                any = True

            try:
                iterator = annotation.read_annotations(filename)
            except grace.Error:
                iterator = None

            if iterator:
                total = 0
                counts = {}
                for item in iterator:
                    total += 1
                    counts[item.type] = counts.get(item.type, 0) + 1

                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features',
                                            counts[key])
                print >> f
                any = True

            if not any:
                raise grace.Error(
                    filename +
                    ' is neither a sequence file nor an annotation file that nesoni can read.'
                )

        self.end_output(f)
Example 26
    def run(self):
        assert self.change_strand in STRAND_CHANGE, 'Unknown way to change strand.'
        strand_changer = STRAND_CHANGE[self.change_strand]
        
        shift_start_absolute, shift_start_proportion = decode_shift(self.shift_start)
        shift_end_absolute, shift_end_proportion = decode_shift(self.shift_end)
        
        renames = [ ]
        if self.rename:
            for item in self.rename.split(','):
                new, old = item.split('=')
                if new != old:
                    renames.append((new,old))
    
        out_file = open(self.prefix+'.gff','wb')    
        annotation.write_gff3_header(out_file)
        
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue
                
                if self.type:
                    item.type = self.type
                
                length = item.end-item.start
                shift_start = int(math.floor(0.5+shift_start_absolute+shift_start_proportion*length))
                shift_end = int(math.floor(0.5+shift_end_absolute+shift_end_proportion*length))
                
                if item.strand == 1:
                    item.start += shift_start
                    item.end += shift_end
                elif item.strand == -1:
                    item.end -= shift_start
                    item.start -= shift_end
                item.start = max(0, item.start) #IGV complains
                
                item.strand = strand_changer[item.strand]
                
                old_attr = item.attr.copy()
                for new,old in renames:
                    if old in item.attr:
                        del item.attr[old]
                for new,old in renames:
                    if old in old_attr:
                        item.attr[new] = old_attr[old]
            
                print >> out_file, item.as_gff()

        out_file.close()
Example 27
    def run(self):
        f = self.begin_output()
    
        for filename in self.filenames:
            info = io.get_file_info(filename)
            
            any = False
            
            name = os.path.splitext(os.path.split(filename)[1])[0]
            
            if info.matches('sequences'):
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                print >> f, grace.datum(name, 'total bases', total_length)
                if total:
                    print >> f, grace.datum(name, 'average length', float(total_length)/total)
                print >> f
                any = True
            
            if info.matches('annotations'):
                total = 0
                counts = { }
                for item in annotation.read_annotations(filename):
                    total += 1
                    counts[item.type] = counts.get(item.type,0)+1
                                
                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features', counts[key])
                print >> f
                any = True
            
            if info.matches('type-vcf'):
                reader_f = io.open_possibly_compressed_file(filename)
                reader = vcf.Reader(reader_f)
                n = 0
                for item in reader:
                    n += 1
                print >> f, grace.datum(name, 'variants', n)
                any = True
            
            if not any:
                raise grace.Error('Don\'t know what to do with ' + filename)

        self.end_output(f)
Example 28
    def run(self):
        f = self.begin_output()
    
        for filename in self.filenames:
            any = False
            
            name = os.path.splitext(os.path.split(filename)[1])[0]
            
            try:
                iterator = io.read_sequences(filename, qualities=True)
            except grace.Error:
                iterator = None
               
            if iterator is not None:
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                if total:
                    print >> f, grace.datum(name, 'average length', float(total_length)/total)
                print >> f
                any = True
            
            try:
                iterator = annotation.read_annotations(filename)
            except grace.Error:
                iterator = None
                
            if iterator:
                total = 0
                counts = { }
                for item in iterator:
                    total += 1
                    counts[item.type] = counts.get(item.type,0)+1
                                
                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features', counts[key])
                print >> f
                any = True
            
            if not any:
                raise grace.Error(filename + ' is neither a sequence file nor an annotation file that nesoni can read.')

        self.end_output(f)
Example 29
    def run(self):
        assert self.change_strand in STRAND_CHANGE, 'Unknown way to change strand.'
        strand_changer = STRAND_CHANGE[self.change_strand]
    
        out_file = open(self.prefix+'.gff','wb')    
        annotation.write_gff3_header(out_file)
        
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue
                
                if item.strand == 1:
                    item.start += self.shift_start
                    item.end += self.shift_end
                elif item.strand == -1:
                    item.end -= self.shift_start
                    item.start -= self.shift_end
                
                item.strand = strand_changer[item.strand]
            
                print >> out_file, item.as_gff()

        out_file.close()
Example 30
    def run(self):    
        work = self.get_workspace()
        work.update_param(remove=['tail_tools_reference_version'])
        
        nesoni.Make_reference(
            self.output_dir,
            filenames = self.filenames,
            snpeff = False,
            cs = 'ifavailable' if self.index else False,
            ls = False,
            bowtie = 'ifavailable' if self.index else False,
            ).run()
            
        annotations = list(annotation.read_annotations(work/'reference.gff'))
        annotation.link_up_annotations(annotations)
        
        exon_index = span_index.index_annotations([
            item for item in annotations if item.type == "exon"
            ])
        mrna_end_index = span_index.index_annotations([
            item.three_prime() for item in annotations if item.type == "mRNA"
            ])
        
        mrna_utrs = [ ]
        gene_utrs = [ ]
        
        for gene in annotations:
            if gene.type != 'gene': continue

            mrnas = [ item for item in gene.children if item.type == 'mRNA' ]
            assert mrnas, "Gene without any mRNAs: "+gene.get_id()

            gene.attr['color'] = '#880088'
            gene.start = min(item.start for item in mrnas)
            gene.end = max(item.end for item in mrnas)
            gene.attr["max_extension"] = str(_max_extension(gene, exon_index, mrna_end_index))
        
            gene_utr_5primes = [ ]
            
            for mrna in mrnas:
                assert mrna.strand == gene.strand, mrna
                assert mrna.seqid == gene.seqid, mrna
                
                mrna.attr["max_extension"] = str(_max_extension(mrna, exon_index, mrna_end_index))
            
                cdss = [ item for item in mrna.children if item.type == 'CDS' ]
                exons = [ item for item in mrna.children if item.type == 'exon' ]
                
                if not exons: continue
                
                # link_up_annotations sorts children, so the final exon really
                # is the last one.
                for item in exons[:-1]:
                    item.attr["max_extension"] = "0"
                exons[-1].attr["max_extension"] = mrna.attr["max_extension"]
                
                if not cdss: continue
                
                mrna_utr_5primes = [ ]
                if gene.strand >= 0:
                    cds_3prime = max(item.end for item in cdss)
                    for item in exons:
                        if item.end >= cds_3prime:
                            mrna_utr_5primes.append(max(item.start,cds_3prime))
                else:
                    cds_3prime = min(item.start for item in cdss)
                    for item in exons:
                        if item.start <= cds_3prime:
                            mrna_utr_5primes.append(min(item.end,cds_3prime))
                
                if mrna.strand >= 0:
                    utr_start = min(mrna_utr_5primes) if mrna_utr_5primes else mrna.end
                    utr_end = max(utr_start+1,mrna.end)
                    gene_utr_5primes.append(utr_start)
                else:
                    utr_end = max(mrna_utr_5primes) if mrna_utr_5primes else mrna.start
                    utr_start = min(mrna.start,utr_end-1)
                    gene_utr_5primes.append(utr_end)
                
                attr = mrna.attr.copy()
                attr['Parent'] = attr['ID']
                attr['ID'] = attr['ID']+'-3UTR'
                attr['color'] = '#008888'
                utr = annotation.Annotation(
                    source = 'tt',
                    type = 'three_prime_utr',
                    seqid = mrna.seqid,
                    strand = mrna.strand,
                    start = utr_start,
                    end = utr_end,
                    attr = attr,
                    )
                max_ext = _max_extension(utr, exon_index, mrna_end_index)
                utr.attr["max_extension"] = str(max_ext)
                #Only include if there is an annotated 3' UTR or end is not in the middle of some other isoform's exon
                if utr_end-utr_start+max_ext > 1:
                    mrna_utrs.append(utr)
            
            if gene.strand >= 0:
                utr_start = max(gene_utr_5primes) if gene_utr_5primes else gene.end
                utr_end = max(utr_start+1,gene.end)
            else:
                utr_end = min(gene_utr_5primes) if gene_utr_5primes else gene.start
                utr_start = min(gene.start,utr_end-1)
            
            attr = gene.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID']+'-3UTR'
            attr['color'] = '#008888'
            utr = annotation.Annotation(
                source = 'tt',
                type = 'three_prime_utr',
                seqid = gene.seqid,
                strand = gene.strand,
                start = utr_start,
                end = utr_end,
                attr = attr,
                )
            utr.attr["max_extension"] = str(_max_extension(utr, exon_index, mrna_end_index))
            gene_utrs.append(utr)
        
        annotation.write_gff3(work/'reference.gff', annotations + mrna_utrs)
        annotation.write_gff3(work/'utr.gff', gene_utrs)
            
        work.update_param(tail_tools_reference_version=work.VERSION)
Example 31
    def run(self):
        assert self.extension is not None, '--extension must be specified'

        #workspace = self.get_workspace()
        workspace = working_directory.Working(self.working_dir,
                                              must_exist=True)
        if self.annotations is None:
            reference = workspace.get_reference()
            annotations_filename = reference.annotations_filename()
        else:
            annotations_filename = self.annotations

        types = [item.lower() for item in self.types.split(',')]

        parts = self.parts or self.types
        parts = [item.lower() for item in parts.split(',')]

        all_annotations = list(
            annotation.read_annotations(annotations_filename))
        annotation.link_up_annotations(all_annotations)
        for item in all_annotations:
            item.primary = None

        annotations = [
            item for item in all_annotations if item.type.lower() in types
        ]

        part_annotations = []
        seen = set()
        queue = [(item, item) for item in annotations]
        while queue:
            primary, item = queue.pop()
            if item.type.lower() in parts:
                assert item.primary is None, "Feature with multiple parents"
                item.primary = primary
                key = (id(primary), item.start, item.end, item.seqid,
                       item.strand)
                # Ignore duplicate exons (many isoforms will have the same exons)
                if key not in seen:
                    seen.add(key)
                    part_annotations.append(item)
            queue.extend((primary, item2) for item2 in item.children)

        del seen
        del all_annotations

        self.log.log('%d annotations\n' % len(annotations))
        self.log.log('%d part annotations\n' % len(part_annotations))

        #assert annotations, 'No annotations of specified types in file'

        for item in part_annotations:
            this_extension = self.extension
            if "max_extension" in item.attr:
                this_extension = min(this_extension,
                                     int(item.attr["max_extension"]))

            if item.strand >= 0:
                item.tail_pos = item.end
                item.end += this_extension
            else:
                item.tail_pos = item.start
                item.start -= this_extension

        for item in annotations:
            item.hits = []  # [ (tail_length, adaptor_bases) ]

        index = span_index.index_annotations(part_annotations)

        for alignment in sam.Bam_reader(workspace /
                                        'alignments_filtered_sorted.bam'):
            if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
                continue

            start = alignment.reference_start
            end = alignment.reference_end
            alignment_length = end - start
            strand = -1 if alignment.flag & sam.FLAG_REVERSE else 1
            fragment_feature = annotation.Annotation(
                seqid=alignment.reference_name,
                start=start,
                end=end,
                strand=strand)

            if strand >= 0:
                tail_pos = end
            else:
                tail_pos = start

            tail_length = 0
            adaptor_bases = 0
            for item in alignment.extra:
                if item.startswith('AN:i:'):
                    tail_length = int(item[5:])
                elif item.startswith('AD:i:'):
                    adaptor_bases = int(item[5:])

            hits = index.get(fragment_feature, same_strand=True)
            if hits:
                gene = min(
                    hits,
                    key=lambda gene:
                    (abs(tail_pos - gene.tail_pos), gene.primary.get_id()))
                # Nearest by tail_pos
                # failing that, by id to ensure a deterministic choice

                gene.primary.hits.append((tail_length, adaptor_bases))

        for item in annotations:
            del item.parents
            del item.children
            del item.primary

        f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
        pickle.dump((workspace.name, workspace.get_tags(), annotations), f,
                    pickle.HIGHEST_PROTOCOL)
        f.close()
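
A short sketch of reading the pickle this step writes; the file name here is
hypothetical, and io.open_possibly_compressed_file is the same helper used in
Example 24 above:

# Hypothetical: load the (name, tags, annotations) tuple written above.
import pickle

f = io.open_possibly_compressed_file('sample.pickle.gz')
sample_name, tags, annotations = pickle.load(f)
f.close()
for item in annotations:
    print(item.get_id())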
Example 32
    def run(self):
        #assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given'

        # Reference genome

        #chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths()
        chromosomes = collections.OrderedDict(io.read_sequences(
            self.reference))

        def get_interpeak_seq(peaks):
            start = min(item.transcription_stop for item in peaks)
            end = max(item.transcription_stop for item in peaks)
            if end - start > self.max_seq: return ''
            if peaks[0].strand >= 0:
                return chromosomes[peaks[0].seqid][start:end]
            else:
                return bio.reverse_complement(
                    chromosomes[peaks[0].seqid][start:end])

        def get_prepeak_seq(gene, peaks):
            if gene.strand >= 0:
                start = gene.utr_pos
                end = min(item.transcription_stop for item in peaks)
                if end - start > self.max_seq: return ''
                return chromosomes[gene.seqid][start:end]
            else:
                start = max(item.transcription_stop for item in peaks)
                end = gene.utr_pos
                if end - start > self.max_seq: return ''
                return bio.reverse_complement(
                    chromosomes[gene.seqid][start:end])

        # Normalization files

        if self.norm_file:
            norm_file = self.norm_file
        else:
            nesoni.Norm_from_counts(self.prefix + '-norm', self.counts).run()
            norm_file = self.prefix + '-norm.csv'

        norms = io.read_grouped_table(norm_file, [('All', str)])['All']
        pair_norm_names = []
        pair_norms = []
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i] + '-peak1')
            pair_norms.append(norms.values()[i])
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i] + '-peak2')
            pair_norms.append(norms.values()[i])
        io.write_grouped_csv(
            self.prefix + '-pairs-norm.csv',
            [('All', io.named_list_type(pair_norm_names)(pair_norms))],
            comments=['#Normalization'],
        )

        # Read data

        annotations = list(annotation.read_annotations(self.parents))
        if self.utrs:
            utrs = list(annotation.read_annotations(self.utrs))
        else:
            utrs = []
        children = list(annotation.read_annotations(self.children))

        count_table = io.read_grouped_table(self.counts,
                                            [('Count', int),
                                             ('Tail_count', int),
                                             ('Tail', _float_or_none),
                                             ('Proportion', _float_or_none),
                                             ('Annotation', str)])
        counts = count_table['Count']
        tail_counts = count_table['Tail_count']
        proportions = count_table['Proportion']
        tails = count_table['Tail']

        samples = counts.value_type().keys()
        sample_tags = {}
        for line in count_table.comments:
            if line.startswith('#sampleTags='):
                parts = line[len('#sampleTags='):].split(',')
                assert parts[0] not in sample_tags
                sample_tags[parts[0]] = parts

        for item in children:
            item.weight = sum(counts[item.get_id()][name] *
                              float(norms[name]['Normalizing.multiplier'])
                              for name in samples)

        parents = []
        id_to_parent = {}
        for item in annotations:
            if item.type != self.parent_type: continue
            assert item.get_id() not in id_to_parent, \
                'Duplicate id in parent file: ' + item.get_id()
            parents.append(item)
            id_to_parent[item.get_id()] = item
            item.children = []
            #item.cds = [ ]

            # Default utr
            if item.strand >= 0:
                item.utr_pos = item.end
            else:
                item.utr_pos = item.start

            if 'three_prime_UTR_start' in item.attr:
                if item.strand >= 0:
                    item.utr_pos = int(item.attr['three_prime_UTR_start']) - 1
                else:
                    item.utr_pos = int(item.attr['three_prime_UTR_start'])

        for item in utrs:
            assert item.attr['Parent'] in id_to_parent, \
                'Unknown gene ' + item.attr['Parent']
            id_to_parent[item.attr['Parent']].utr_pos = (
                item.start if item.strand >= 0 else item.end)

        for item in children:
            # End of transcription, 0-based (between-positions coordinates).
            item.transcription_stop = item.end if item.strand >= 0 else item.start

            if 'Parent' in item.attr and item.attr.get(
                    "Relation") != "Antisense":
                for item_parent in item.attr['Parent'].split(','):
                    parent = id_to_parent[item_parent]
                    parent.children.append(item)

        for item in parents:
            item.children.sort(key=_annotation_sorter)

            relevant = list(item.children)
            if self.utr_only:
                #if item.strand <= 0:
                #    relative_utr_start = item.end - int(item.attr['three_prime_UTR_start'])
                #else:
                #    relative_utr_start = int(item.attr['three_prime_UTR_start'])-1 - item.start
                #
                #def relative_start(peak):
                #    return item.end-peak.end if item.strand < 0 else peak.start-item.start
                #relevant = [ peak for peak in relevant if relative_start(peak) >= relative_utr_start ]

                #relevant = [
                #    peak for peak in relevant
                #    if (peak.end >= item.utr_pos if item.strand >= 0 else peak.start <= item.utr_pos)
                #    ]

                relevant = [
                    peak for peak in relevant
                    if peak.attr.get("Relation") == "3'UTR"
                ]

            if self.top:
                relevant.sort(key=lambda peak: peak.weight, reverse=True)
                relevant = relevant[:self.top]
            relevant.sort(key=_annotation_sorter)
            item.relevant_children = relevant

        # JSON output

        j_data = {}
        j_genes = j_data['genes'] = {}

        j_genes['__comment__'] = 'start is 0-based'
        j_genes['name'] = []
        j_genes['chromosome'] = []
        j_genes['strand'] = []
        j_genes['start'] = []
        j_genes['utr'] = []
        j_genes['end'] = []
        j_genes['gene'] = []
        j_genes['product'] = []
        j_genes['peaks'] = []
        j_genes['relevant_peaks'] = []
        #j_genes['cds'] = [ ]
        #j_genes['cds_start'] = [ ]
        #j_genes['cds_end'] = [ ]
        for item in parents:
            j_genes['name'].append(item.get_id())
            j_genes['chromosome'].append(item.seqid)
            j_genes['strand'].append(item.strand)
            j_genes['start'].append(item.start)
            j_genes['utr'].append(item.utr_pos)
            j_genes['end'].append(item.end)
            j_genes['gene'].append(
                item.attr.get('Name', item.attr.get('gene', '')))
            j_genes['product'].append(
                item.attr.get('Product', item.attr.get('product', '')))
            j_genes['peaks'].append(
                [item2.get_id() for item2 in item.children])
            j_genes['relevant_peaks'].append(
                [item2.get_id() for item2 in item.relevant_children])
            #j_genes['cds'].append( item.cds )
            #j_genes['cds_start'].append( item.cds_start )
            #j_genes['cds_end'].append( item.cds_end )

        j_peaks = j_data['peaks'] = {}
        j_peaks['__comment__'] = 'start is 0-based'
        j_peaks['name'] = []
        j_peaks['chromosome'] = []
        j_peaks['strand'] = []
        j_peaks['start'] = []
        j_peaks['end'] = []
        j_peaks['parents'] = []
        j_peaks['counts'] = []
        j_peaks['tail_lengths'] = []
        j_peaks['proportion_tailed'] = []
        for item in children:
            j_peaks['name'].append(item.get_id())
            j_peaks['chromosome'].append(item.seqid)
            j_peaks['strand'].append(item.strand)
            j_peaks['start'].append(item.start)
            j_peaks['end'].append(item.end)
            j_peaks['parents'].append(item.attr['Parent'].split(',')
                                      if 'Parent' in item.attr else [])
            j_peaks['counts'].append(counts[item.get_id()].values())
            j_peaks['tail_lengths'].append(
                count_table['Tail'][item.get_id()].values())
            j_peaks['proportion_tailed'].append(
                count_table['Proportion'][item.get_id()].values())

        j_samples = j_data['samples'] = {}
        j_samples['name'] = []
        j_samples['tags'] = []
        j_samples['normalizing_multiplier'] = []
        for name in samples:
            j_samples['name'].append(name)
            j_samples['tags'].append(sample_tags[name])
            j_samples['normalizing_multiplier'].append(
                float(norms[name]['Normalizing.multiplier']))

        j_chromosomes = j_data['chromosomes'] = {}
        j_chromosomes['name'] = []
        j_chromosomes['length'] = []
        for name, seq in chromosomes.iteritems():
            j_chromosomes['name'].append(name)
            j_chromosomes['length'].append(len(seq))

        with open(self.prefix + '.json', 'wb') as f:
            json.dump(j_data, f)

        # Output paired peak file

        output_comments = ['#Counts']
        output_samples = []
        for item in samples:
            output_samples.append(item + '-peak1')
            output_comments.append('#sampleTags=' +
                                   ','.join([item + '-peak1', 'peak1'] +
                                            sample_tags.get(item, [])))
        for item in samples:
            output_samples.append(item + '-peak2')
            output_comments.append('#sampleTags=' +
                                   ','.join([item + '-peak2', 'peak2'] +
                                            sample_tags.get(item, [])))

        output_names = []
        output_counts = []
        output_tail_counts = []
        output_proportions = []
        output_tails = []
        output_annotation_fields = [
            'gene', 'product', 'biotype', 'mean_tail_1', 'mean_tail_2',
            'chromosome', 'strand', 'transcription_stops'
        ]  #, 'interpeak_seq', ]
        output_annotations = []

        for item in parents:
            peaks = item.relevant_children
            for i in xrange(len(peaks) - 1):
                for j in xrange(i + 1, len(peaks)):
                    id_i = peaks[i].get_id()
                    id_j = peaks[j].get_id()
                    id_pair = item.get_id() + '-' + id_i + '-' + id_j
                    output_names.append(id_pair)

                    row = []
                    row.extend(counts[id_i].values())
                    row.extend(counts[id_j].values())
                    output_counts.append(filter(_text, row))

                    row = []
                    row.extend(tail_counts[id_i].values())
                    row.extend(tail_counts[id_j].values())
                    output_tail_counts.append(filter(_text, row))

                    row = []
                    row.extend(proportions[id_i].values())
                    row.extend(proportions[id_j].values())
                    output_proportions.append(filter(_text, row))

                    row = []
                    row.extend(tails[id_i].values())
                    row.extend(tails[id_j].values())
                    output_tails.append(filter(_text, row))

                    output_annotations.append([
                        item.attr.get('Name', item.attr.get('gene', '')),
                        item.attr.get('Product', item.attr.get('product', '')),
                        item.attr.get('Biotype', ''),
                        count_table['Annotation'][id_i]['mean-tail'],
                        count_table['Annotation'][id_j]['mean-tail'],
                        item.seqid,
                        str(item.strand),
                        '%d, %d' % (peaks[i].transcription_stop,
                                    peaks[j].transcription_stop),
                        #get_interpeak_seq([peaks[i],peaks[j]]),
                    ])

        #output_count_table = io.named_matrix_type(output_names,output_samples)(output_counts)
        io.write_grouped_csv(
            self.prefix + '-pairs.csv',
            [
                ('Count', io.named_matrix_type(output_names,
                                               output_samples)(output_counts)),
                ('Tail_count',
                 io.named_matrix_type(output_names,
                                      output_samples)(output_tail_counts)),
                ('Proportion',
                 io.named_matrix_type(output_names,
                                      output_samples)(output_proportions)),
                ('Tail', io.named_matrix_type(output_names,
                                              output_samples)(output_tails)),
                ('Annotation',
                 io.named_matrix_type(
                     output_names,
                     output_annotation_fields)(output_annotations)),
            ],
            comments=output_comments,
        )
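
The nested xrange loops above emit one row per unordered pair of a gene's relevant peaks, with each pair ordered by genomic position. A minimal sketch of the same enumeration using itertools (hypothetical peak IDs):

import itertools

# Hypothetical, position-sorted peak IDs for one gene.
peaks = ['peak1', 'peak2', 'peak3']

# Same pairs as the xrange(i)/xrange(i+1, ...) loops above:
# each unordered pair once, in position-sorted order.
for id_i, id_j in itertools.combinations(peaks, 2):
    print('gene1-%s-%s' % (id_i, id_j))
# gene1-peak1-peak2
# gene1-peak1-peak3
# gene1-peak2-peak3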
Example #33
    def run(self):
        annotations = [ ]
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue
                if self.type:
                    item.type = self.type
                annotations.append(item)
        
        annotations.sort(key=lambda item: (item.type, item.seqid, item.strand, item.start))
        
        group = [ ]
        groups = [ ]
        def emit():
            if not group: return
            groups.append(group[:])
            del group[:]        
        type = None
        seqid = None
        strand = None
        end = 0
        for item in annotations:
            if item.type != type or item.seqid != seqid or item.strand != strand or item.start >= end:
                emit()
                type = item.type
                seqid = item.seqid
                strand = item.strand
                end = item.end-self.overlap
            group.append(item)
            end = max(item.end-self.overlap, end)
        emit()


        items = [ ]
        
        id_map = { }

        for group in groups:
            item = annotation.Annotation()
            item.source = group[0].source
            item.type = group[0].type
            item.seqid = group[0].seqid
            item.strand = group[0].strand
            item.start = min( item2.start for item2 in group )
            item.end = max( item2.end for item2 in group )
            item.score = None
            item.phase = None
            item.attr = { }
            
            for item2 in group:
                for key in item2.attr:
                    if key in item.attr: continue
                    item.attr[key] = join_descriptions([ item3.attr[key] for item3 in group if key in item3.attr ], self.joiner )

            item.parents = [ ]
            for item2 in group:
                if 'ID' in item2.attr:
                    assert item2.attr['ID'] not in id_map, 'Duplicate ID: '+item2.attr['ID']
                    id_map[item2.attr['ID']] = item.attr['ID']
                if 'Parent' in item2.attr:
                    item.parents.append(item2.attr['Parent'])
            
            items.append(item)
        
        for item in items:
            if item.parents:
                item.attr['Parent'] = join_descriptions([ id_map.get(parent,parent) for parent in item.parents ], ',')
        
        with open(self.prefix+'.gff','wb') as out_file:
            annotation.write_gff3_header(out_file)
            for item in items:
                print >> out_file, item.as_gff()
Example #34
    def run(self):
        annotations = []
        for filename in self.filenames:
            for item in annotation.read_annotations(filename):
                if not selection.matches(self.select, [item.type]): continue
                if self.type:
                    item.type = self.type
                annotations.append(item)

        annotations.sort(
            key=lambda item: (item.type, item.seqid, item.strand, item.start))

        group = []
        groups = []

        def emit():
            if not group: return
            groups.append(group[:])
            del group[:]

        type = None
        seqid = None
        strand = None
        end = 0
        for item in annotations:
            if item.type != type or item.seqid != seqid or item.strand != strand or item.start >= end:
                emit()
                type = item.type
                seqid = item.seqid
                strand = item.strand
                end = item.end - self.overlap
            group.append(item)
            end = max(item.end - self.overlap, end)
        emit()

        items = []

        id_map = {}

        for group in groups:
            item = annotation.Annotation()
            item.source = group[0].source
            item.type = group[0].type
            item.seqid = group[0].seqid
            item.strand = group[0].strand
            item.start = min(item2.start for item2 in group)
            item.end = max(item2.end for item2 in group)
            item.score = None
            item.phase = None
            item.attr = {}

            for item2 in group:
                for key in item2.attr:
                    if key in item.attr: continue
                    item.attr[key] = join_descriptions([
                        item3.attr[key] for item3 in group if key in item3.attr
                    ], self.joiner)

            item.parents = []
            for item2 in group:
                if 'ID' in item2.attr:
                    assert item2.attr[
                        'ID'] not in id_map, 'Duplicate ID: ' + item2.attr['ID']
                    id_map[item2.attr['ID']] = item.attr['ID']
                if 'Parent' in item2.attr:
                    item.parents.append(item2.attr['Parent'])

            items.append(item)

        for item in items:
            if item.parents:
                item.attr['Parent'] = join_descriptions(
                    [id_map.get(parent, parent) for parent in item.parents],
                    ',')

        with open(self.prefix + '.gff', 'wb') as out_file:
            annotation.write_gff3_header(out_file)
            for item in items:
                print >> out_file, item.as_gff()
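
Examples #33 and #34 implement the same sweep: features sorted by (type, seqid, strand, start) stay in one group while each next feature starts before the running group end minus the allowed overlap. A toy sketch of that grouping rule on bare (start, end) tuples, with a hypothetical overlap value:

def group_spans(spans, overlap):
    # spans: (start, end) tuples sorted by start; same rule as emit() above.
    # Features must overlap the group by more than `overlap` bases to merge.
    groups, group, end = [], [], 0
    for start, stop in spans:
        if start >= end and group:
            groups.append(group)
            group = []
        group.append((start, stop))
        end = max(stop - overlap, end)
    if group:
        groups.append(group)
    return groups

print(group_spans([(0, 100), (90, 200), (250, 300)], overlap=5))
# [[(0, 100), (90, 200)], [(250, 300)]]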
Example #35
def count_run(
    min_score, min_size, max_size, filter_mode, equalize, types, locii,
    qualifiers, use_strand, merge_filename, limit, output_prefix, filenames, log):
    
    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono': 
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')

    qualifiers = qualifiers.split(',')

    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool','forward','reverse','both'), "Can't understand --strand specification."
    
    from Bio import Seq, SeqIO

    annotation_filenames = [ ]
    bam_filenames = [ ]
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)    

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]    
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            titles[i] = os.path.basename(bam_filenames[i])
            if not annotation_filenames:
                working = working_directory.Working(bam_filenames[i])
                reference_filename = working.get_reference().annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)
            
            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)
    
    assert bam_filenames, 'No reference alignments given' 

    merge = { }
    merge_qualifiers = { }
    if merge_filename is not None:
        #First line gives qualifiers
        #remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>
    
        f = open(merge_filename,'rU')
        qualifiers = f.readline().rstrip('\n').split('\t')
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if not parts: continue
            for name in parts[len(qualifiers)+1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[len(qualifiers)]
                merge_qualifiers[name] = parts[:len(qualifiers)]
        f.close()

    
    genes = { }  # reference name -> gene index
    
    feature_names = { }   # feature_name -> number of occurrences
    
    features = [ ]
    
    n_features = 0

    chromosome_length = { }
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line: continue
            parts = line.split('\t')
            if parts[0] != '@SQ': continue
            
            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'): name = part[3:]
                if part.startswith('LN:'): length = int(part[3:])
            assert name is not None and length is not None
            
            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length
    
    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    
    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'
    
        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types: continue
                
                if (locii is not None and
                    ('locus_tag' not in feature.attr or
                     feature.attr['locus_tag'].lower() not in locii)):
                    continue
                
                f = Feature(n_samples)
                f.name = feature.get_id()                
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name

                feature_names[f.name] = feature_names.get(f.name,0)+1
                if feature_names[f.name] > 1:
                   f.name += '/%d' % feature_names[f.name]
                   
                f.qualifiers = [ feature.attr.get(item,'') for item in qualifiers ]
                
                f.length = feature.end - feature.start

                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(Span_entry(feature.start, feature.end, feature.strand or 1, f))
                features.append(f)

    else:
        # Sequences as features        
        log.log('No annotation files given or found, using sequences as features\n')
        
        name_feature = { } # (merged)name -> feature
        
        for name in chromosome_length:
            merged_name = merge.get(name, name)
            
            if merged_name not in name_feature: 
                f = Feature(n_samples)
                f.name = merged_name
                f.length = chromosome_length[name]
                f.qualifiers = merge_qualifiers.get(name, ('',)*len(qualifiers))
                n_features += 1
                name_feature[merged_name] = f
                features.append(f)
            else:
                f = name_feature[merged_name]
                f.length = max(f.length, chromosome_length[name]) #...
            
            genes[name].insert(Span_entry(0, chromosome_length[name], 1, f))
                
                

    
    log.log('%d features\n\n' % len(features))

    for name in genes: 
        genes[name].prepare()
        
    n_fragments = [ 0 ] * n_samples
    n_fragments_aligned = [ 0 ] * n_samples
    n_low_score = [ 0 ] * n_samples
    n_something = [ 0 ] * n_samples
    n_multiple = [ 0 ] * n_samples
    n_span = [ 0 ] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(bam_filenames[i], 'Counting sample %d of %d' % (i+1,n_samples)):
            n_fragments[i] += 1
            
            if not fragment_alignments: 
                continue
            
            n_fragments_aligned[i] += 1
            
            feature_hits = [ ] # [ [ (feature, strand) ] ]
            
            # Use only top scoring alignments
            fragment_scores = [ sum( al.get_AS() for al in item ) for item in fragment_alignments ]
            
            best_score = max(fragment_scores)
            
            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue
            
            if use_only_top:
                cutoff = max(best_score, min_score)
            else:
                cutoff = min_score            
            fragment_alignments = [ item 
                                    for item, score in zip(fragment_alignments, fragment_scores)
                                    if score >= cutoff ]            
            
            for alignments in fragment_alignments:
            
                start = min(item.pos-1 for item in alignments)
                end = max(item.pos+item.length-1 for item in alignments)
                length = end-start
                if min_size is not None and length < min_size: continue
                if max_size is not None and length > max_size: continue
                
                rname = alignments[0].rname
                strand = -1 if alignments[0].flag&sam.FLAG_REVERSE else 1
                assert alignments[0].rname in genes, 'Alignment refers to sequence not present in GENBANK file'
            
                this_feature_hits = [ ]    
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits: continue
                    this_feature_hits.append( key )
                    if not use_only_monogamous or len(fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1
                
                if this_feature_hits: 
                    feature_hits.append( this_feature_hits )
                
                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]: continue
                            a[0].common[(a[1],b[1])][b[0]] += 1
                                
                
            if len(feature_hits) > 0: 
                n_something[i] += 1
            #else:
            #    print fragment_alignments
            #    print genes[fragment_alignments[0][0].rname].indexes
            #    print
            
            
            if len(feature_hits) > 1: 
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k: continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]: continue
                                a[0].ambiguous[(a[1],b[1])][b[0]] += 1
            
            if any(len(item) > 1 for item in feature_hits): n_span[i] += 1
            
            if limit is not None and n_fragments[i] >= limit: break

        grace.status('')

        #log.log('%s\n' % titles[i])
        #log.log('%20s fragments\n' % grace.pretty_number(n_fragments[i]))
        #log.log('%20s fragments aligned to the reference\n' % grace.pretty_number(n_fragments_aligned[i]))
        #if n_low_score[i]:
        #    log.log('%20s had too low an alignment score, discarded\n' % grace.pretty_number(n_low_score[i]))
        #log.log('%20s aligned to an annotated gene\n' % grace.pretty_number(n_something[i]))
        #if expect_multiple_alignments or n_multiple[i]:
        #    log.log('%20s aligned to multiple genes\n' % grace.pretty_number(n_multiple[i]))
        #log.log('%20s had an alignment that spanned multiple genes\n' % grace.pretty_number(n_span[i]))
        #log.log('\n')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference', n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded', n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i],'had an alignment that spanned multiple genes', n_span[i])
        log.log('\n')

    strandedness = [ ]
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward+n_reverse < 5: continue
        strandedness.append( (n_forward-n_reverse)*100.0 / (n_forward+n_reverse) )
    strandedness = sum(strandedness) / len(strandedness)
    log.log('Strand specificity: %.0f%%\n'
            '  (~ -100%% reverse strand, ~ 0%% non-specific, ~ 100%% forward strand\n'
            '   Average over all features with at least 5 hits.)\n'
            % strandedness)


    if use_strand == 'pool':
        # Note: the lambdas use their parameter f, not the enclosing loop variable.
        getters = [ lambda f: (f.name,
                               add_lists(f.count[1],f.count[-1]),
                               add_defdicts(f.common[(1,1)],
                                            f.common[(1,-1)],
                                            f.common[(-1,1)],
                                            f.common[(-1,-1)]),
                               add_defdicts(f.ambiguous[(1,1)],
                                            f.ambiguous[(1,-1)],
                                            f.ambiguous[(-1,1)],
                                            f.ambiguous[(-1,-1)])) ]
    elif use_strand == 'forward':
        getters = [ lambda f: (f.name, f.count[1], f.common[(1,1)], f.ambiguous[(1,1)]) ]
    elif use_strand == 'reverse':
        getters = [ lambda f: (f.name, f.count[-1], f.common[(-1,-1)], f.ambiguous[(-1,-1)]) ]
    elif use_strand == 'both':
        getters = [ lambda f: (f.name, f.count[1], f.common[(1,1)], f.ambiguous[(1,1)]),
                    lambda f: (f.name + 'r', f.count[-1], f.common[(-1,-1)], f.ambiguous[(-1,-1)]) ]

    total_hits = [0] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    if equalize:
        min_hits = min(total_hits)
        p = [ float(min_hits)/item for item in total_hits ]
        total_hits = [ min_hits ] * n_samples

    f = open(output_prefix + '.txt', 'wb')
    #log.attach(open(output_prefix + '_log.txt', 'wb'))

    print >> f, tab_encode(
        [ 'Feature' ] +
        titles +
        [ 'RPKM ' + item for item in titles ] +
        [ 'Length' ] +
        qualifiers +
        [ 'On same fragment' ] +
        ([ 'Ambiguous alignment' ] if expect_multiple_alignments else [ ])
    )
    
    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)
            
            if equalize:
                count = [
                    subsample(count[i], p[i])
                    for i in xrange(n_samples)
                ]
            
            rpkm = [ count[i] * 1.0e9 / feature.length / total_hits[i] for i in xrange(n_samples) ]
            
            common_str = ' '.join(
                '%dx%s' % (item[1],item[0])
                for item in sorted(common.items(), key=lambda item:item[1], reverse=True)
            ) 
            ambiguous_str = ' '.join(
                '%dx%s' % (item[1],item[0])
                for item in sorted(ambiguous.items(), key=lambda item:item[1], reverse=True)
            ) 
            
            print >> f, tab_encode(
                [ feature_name ] +
                [ str(item) for item in count ] +
                [ '%.2f' % item for item in rpkm ] +
                [ str(feature.length) ] +
                list(feature.qualifiers) +
                [ common_str ] +
                ([ ambiguous_str ] if expect_multiple_alignments else [ ]) 
            )
            

    f.close()
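
The RPKM columns above use `count * 1.0e9 / feature.length / total_hits[i]`, i.e. reads per kilobase of feature per million counted fragments. A worked example with made-up numbers:

count = 250           # fragments hitting the feature in one sample
length = 2000         # feature length in bases
total_hits = 5000000  # total counted fragments in that sample

rpkm = count * 1.0e9 / length / total_hits
print('%.2f' % rpkm)  # 25.00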
Example #36
    def run(self):
        work = self.get_workspace()
        work.update_param(remove=['tail_tools_reference_version'])

        nesoni.Make_reference(
            self.output_dir,
            filenames = self.filenames,
            snpeff = False,
            cs = 'ifavailable' if self.index else False,
            ls = False,
            bowtie = 'ifavailable' if self.index else False,
            ).run()

        annotations = list(annotation.read_annotations(work/'reference.gff'))
        annotation.link_up_annotations(annotations)

        with open(work/'utr.gff','wb') as f:
            annotation.write_gff3_header(f)
            for gene in annotations:
                if gene.type != 'gene': continue
                mrnas = [ item for item in gene.children if item.type == 'mRNA' ]

                utr_5primes = [ ]

                for mrna in mrnas:
                    cdss = [ item for item in mrna.children if item.type == 'CDS' ]
                    exons = [ item for item in mrna.children if item.type == 'exon' ]
                    if not cdss or not exons: continue
                    if gene.strand >= 0:
                        cds_3prime = max(item.end for item in cdss)
                        for item in exons:
                            if item.end > cds_3prime:
                                utr_5primes.append(max(item.start,cds_3prime))
                    else:
                        cds_3prime = min(item.start for item in cdss)
                        for item in exons:
                            if item.start < cds_3prime:
                                utr_5primes.append(min(item.end,cds_3prime))

                if gene.strand >= 0:
                    utr_start = max(utr_5primes) if utr_5primes else gene.end
                    utr_end = max(utr_start+1,gene.end)
                else:
                    utr_end = min(utr_5primes) if utr_5primes else gene.start
                    utr_start = min(gene.start,utr_end-1)

                attr = gene.attr.copy()
                attr['Parent'] = attr['ID']
                attr['ID'] = attr['ID']+'-3UTR'
                thing = annotation.Annotation(
                    source = 'tt',
                    type = 'three_prime_utr',
                    seqid = gene.seqid,
                    strand = gene.strand,
                    start = utr_start,
                    end = utr_end,
                    attr = attr,
                    )
                print >> f, thing.as_gff()

        work.update_param(tail_tools_reference_version=work.VERSION)
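
On the forward strand, the loop above takes the most 3' CDS end across a gene's mRNAs and, for every exon extending past it, records where untranslated sequence begins; the furthest such point becomes the UTR start, with a 1-base stub at gene.end as the fallback. A toy forward-strand sketch with hypothetical 0-based half-open coordinates:

# Hypothetical forward-strand mRNA: two exons, CDS ending inside exon 2.
exons = [(0, 400), (500, 900)]   # (start, end)
cds   = [(100, 400), (500, 700)]

cds_3prime = max(end for start, end in cds)                   # 700
utr_5primes = [max(start, cds_3prime)
               for start, end in exons if end > cds_3prime]   # [700]

utr_start = max(utr_5primes)            # 700
utr_end = max(utr_start + 1, 900)       # 900
print((utr_start, utr_end))             # (700, 900): the 3' UTR span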
Example #37
    def run(self):
        items = list(annotation.read_annotations(self.parent))
                
        annotation.link_up_annotations(items)
        for item in items:
            assert len(item.parents) <= 1
        
        genes = [ item for item in items if item.type == "gene" ]
        downstrand_genes = [ _extend(_three_prime(item), self.extension) for item in genes ]
        exons = [ item for item in items if item.type == "exon" ]
        utrs = [ _extend(item, self.extension) for item in items if item.type == "three_prime_utr" ]
        
        gene_index = span_index.index_annotations(genes)
        downstrand_gene_index = span_index.index_annotations(downstrand_genes)
        exon_index = span_index.index_annotations(exons)
        utr_index = span_index.index_annotations(utrs)
        
        peaks = [ ]
        for peak in annotation.read_annotations(self.child):
            if float(peak.attr.get("mean_tail","0.0")) < self.min_tail:
                continue
            peaks.append(peak)
        
        for peak in peaks:
            # Query is final base in genome before poly(A) starts
            query = peak.three_prime().shifted(-1,0)
            
            hit_to = "3'UTR"
            hits = [ item.parents[0].parents[0] for item in
                     utr_index.get(query, True) ]

            if not hits:
                hit_to = "Exon"
                hits = [ item.parents[0].parents[0] for item in
                         exon_index.get(query, True) ]
            
            # For non-coding RNAs, which don't have a 3' UTR
            if not hits:
                hit_to = "Downstrand"
                hits = downstrand_gene_index.get(query, True)
            
            if not hits:
                hit_to = "Intron"
                hits = gene_index.get(query, True)

            antisense_hits = gene_index.get(query.reversed(), True)
            if not hits:
                hit_to = "Antisense"
                hits = antisense_hits
            
            if hits:
                peak.attr["Parent"] = join_descriptions([ item.get_id() for item in hits ], ",")
                peak.attr["Relation"] = hit_to
                peak.attr["Name"] = join_descriptions([ item.attr.get("Name","") for item in hits ])
                peak.attr["Product"] = hit_to + " " + join_descriptions([ item.attr.get("Product","") for item in hits ])
                peak.attr["Biotype"] = join_descriptions([ item.attr.get("Biotype","") for item in hits ])
            
            if antisense_hits:
                peak.attr["Antisense_parent"] = join_descriptions([ item.get_id() for item in antisense_hits ], ",")
                peak.attr["Antisense_name"] = join_descriptions([ item.attr.get("Name","") for item in antisense_hits ])
                peak.attr["Antisense_product"] = "Antisense " + join_descriptions([ item.attr.get("Product","") for item in antisense_hits ])
                peak.attr["Antisense_biotype"] = join_descriptions([ item.attr.get("Biotype","") for item in antisense_hits])
                
        
        annotation.write_gff3(self.prefix+"-parent.gff", genes) #Hmm
        annotation.write_gff3(self.prefix+"-child.gff", peaks)
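
The assignment above is a strict priority cascade: the peak's final templated base is tested against 3' UTRs first, then exons, then a downstrand extension of each gene, then gene bodies (introns), with the antisense strand as a last resort. A condensed sketch of that control flow, assuming the span indexes from the code above and omitting the parent lookups that map UTR and exon hits back to their genes:

def classify(query, utr_index, exon_index, downstrand_index, gene_index):
    # First index with a non-empty hit list wins; list order encodes priority.
    for hit_to, index, q in [
            ("3'UTR",      utr_index,        query),
            ("Exon",       exon_index,       query),
            ("Downstrand", downstrand_index, query),
            ("Intron",     gene_index,       query),
            ("Antisense",  gene_index,       query.reversed()),
            ]:
        hits = index.get(q, True)
        if hits:
            return hit_to, hits
    return None, []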
Example #38

    def run(self):
        assert self.extension is not None, '--extension must be specified'

        #workspace = self.get_workspace()
        workspace = working_directory.Working(self.working_dir, must_exist=True)
        if self.annotations is None:
            reference = workspace.get_reference()
            annotations_filename = reference.annotations_filename()
        else:
            annotations_filename = self.annotations

        types = [ item.lower() for item in self.types.split(',') ]

        parts = self.parts or self.types
        parts = [ item.lower() for item in parts.split(',') ]

        all_annotations = list(annotation.read_annotations(annotations_filename))
        annotation.link_up_annotations(all_annotations)
        for item in all_annotations:
            item.primary = None

        annotations = [
            item
            for item in all_annotations
            if item.type.lower() in types
        ]

        part_annotations = [ ]
        seen = set()
        queue = [ (item,item) for item in annotations ]
        while queue:
            primary, item = queue.pop()
            if item.type.lower() in parts:
                assert item.primary is None, "Feature with multiple parents"
                item.primary = primary
                key = (id(primary),item.start,item.end,item.seqid,item.strand)
                # Ignore duplicate exons (many isoforms will have the same exons)
                if key not in seen:
                    seen.add(key)
                    part_annotations.append(item)
            queue.extend( (primary, item2) for item2 in item.children )

        del seen
        del all_annotations

        self.log.log('%d annotations\n' % len(annotations))
        self.log.log('%d part annotations\n' % len(part_annotations))

        #assert annotations, 'No annotations of specified types in file'

        for item in part_annotations:
            this_extension = self.extension
            if "max_extension" in item.attr:
                this_extension = min(this_extension,int(item.attr["max_extension"]))

            if item.strand >= 0:
                item.tail_pos = item.end
                item.end += this_extension
            else:
                item.tail_pos = item.start
                item.start -= this_extension

        for item in annotations:
            item.hits = [] # [ (tail_length, adaptor_bases) ]

        index = span_index.index_annotations(part_annotations)

        for alignment in sam.Bam_reader(workspace/'alignments_filtered_sorted.bam'):
            if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
                continue

            start = alignment.reference_start
            end = alignment.reference_end
            alignment_length = end-start
            strand = -1 if alignment.flag&sam.FLAG_REVERSE else 1
            fragment_feature = annotation.Annotation(
                seqid=alignment.reference_name,
                start=start,
                end=end,
                strand=strand
                )

            if strand >= 0:
                tail_pos = end
            else:
                tail_pos = start

            tail_length = 0
            adaptor_bases = 0
            for item in alignment.extra:
                if item.startswith('AN:i:'):
                    tail_length = int(item[5:])
                elif item.startswith('AD:i:'):
                    adaptor_bases = int(item[5:])

            hits = index.get(fragment_feature, same_strand=True)
            if hits:
                # Nearest by tail_pos; failing that, by id to ensure a
                # deterministic choice.
                gene = min(hits, key=lambda gene:
                    (abs(tail_pos - gene.tail_pos), gene.primary.get_id()))

                gene.primary.hits.append( (tail_length,adaptor_bases) )

        for item in annotations:
            del item.parents
            del item.children
            del item.primary

        f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
        pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
        f.close()
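
Tail information rides on each alignment as SAM optional fields: `AN:i:<n>` gives the non-templated poly(A) length and `AD:i:<n>` the number of adaptor bases (tail-tools' own tag convention, per the code above and the report text in Example #40). A minimal sketch of pulling them out of a raw tag list:

def tail_info(tags):
    # tags: raw optional fields, e.g. ['NM:i:1', 'AN:i:14', 'AD:i:3']
    tail_length = adaptor_bases = 0
    for tag in tags:
        if tag.startswith('AN:i:'):
            tail_length = int(tag[5:])
        elif tag.startswith('AD:i:'):
            adaptor_bases = int(tag[5:])
    return tail_length, adaptor_bases

print(tail_info(['NM:i:1', 'AN:i:14', 'AD:i:3']))   # (14, 3)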
Example #39
    def run(self):
        items = list(annotation.read_annotations(self.parent))
                
        annotation.link_up_annotations(items)
        for item in items:
            assert len(item.parents) <= 1
        
        genes = [ item for item in items if item.type == "gene" ]
        downstrand_genes = [ _extend(_three_prime(item), self.extension) for item in genes ]
        exons = [ item for item in items if item.type == "exon" ]
        utrs = [ _extend(item, self.extension) for item in items if item.type == "three_prime_utr" ]
        
        gene_index = span_index.index_annotations(genes)
        downstrand_gene_index = span_index.index_annotations(downstrand_genes)
        exon_index = span_index.index_annotations(exons)
        utr_index = span_index.index_annotations(utrs)
        
        peaks = list(annotation.read_annotations(self.child))
        
        for peak in peaks:
            # Query is final base in genome before poly(A) starts
            query = peak.three_prime().shifted(-1,0)
            
            hit_to = "3'UTR"
            hits = [ item.parents[0].parents[0] for item in
                     utr_index.get(query, True) ]
            
            if not hits:
                hit_to = "Exon"
                hits = [ item.parents[0].parents[0] for item in
                         exon_index.get(query, True) ]
            
            # For non-coding RNAs, which don't have a 3' UTR
            if not hits:
                hit_to = "Downstrand"
                hits = downstrand_gene_index.get(query, True)
            
            if not hits:
                hit_to = "Intron"
                hits = gene_index.get(query, True)

            antisense_hits = gene_index.get(query.reversed(), True)
            if not hits:
                hit_to = "Antisense"
                hits = antisense_hits
            
            if hits:
                peak.attr["Parent"] = join_descriptions([ item.get_id() for item in hits ], ",")
                peak.attr["Relation"] = hit_to
                peak.attr["Name"] = join_descriptions([ item.attr.get("Name","") for item in hits ])
                peak.attr["Product"] = hit_to + " " + join_descriptions([ item.attr.get("Product","") for item in hits ])
                peak.attr["Biotype"] = join_descriptions([ item.attr.get("Biotype","") for item in hits ])
            
            if antisense_hits:
                peak.attr["Antisense_parent"] = join_descriptions([ item.get_id() for item in antisense_hits ], ",")
                peak.attr["Antisense_name"] = join_descriptions([ item.attr.get("Name","") for item in antisense_hits ])
                peak.attr["Antisense_product"] = "Antisense " + join_descriptions([ item.attr.get("Product","") for item in antisense_hits ])
                peak.attr["Antisense_biotype"] = join_descriptions([ item.attr.get("Biotype","") for item in antisense_hits])
                
        
        annotation.write_gff3(self.prefix+"-parent.gff", genes) #Hmm
        annotation.write_gff3(self.prefix+"-child.gff", peaks)
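
As in Example #37, the query is the final templated base before the poly(A) tail: `peak.three_prime()` collapses the peak to a zero-length feature at its 3' end, and `.shifted(-1,0)` grows it back by one base in the strand's own direction. A toy restatement, assuming nesoni-style 0-based half-open coordinates:

def final_base(start, end, strand):
    # The one-base span immediately before the poly(A) site, strand-aware.
    if strand >= 0:
        return (end - 1, end)
    else:
        return (start, start + 1)

print(final_base(100, 150, 1))    # (149, 150)
print(final_base(100, 150, -1))   # (100, 101)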
Example #40
    def run(self):
        names = [ sample.output_dir for sample in self.samples ]
            #os.path.splitext(os.path.split(item)[1])[0]
            #for item in self.reads
            #]
        
        reference = reference_directory.Reference(self.reference, must_exist=True)
        
        workspace = io.Workspace(self.output_dir, must_exist=False)
        samplespace = io.Workspace(workspace/'samples', must_exist=False)
        plotspace = io.Workspace(workspace/'plots', must_exist=False)
        expressionspace = io.Workspace(workspace/'expression', must_exist=False)
        testspace = io.Workspace(workspace/'test', must_exist=False)
        testspace_dedup = io.Workspace(workspace/'test-dedup', must_exist=False)
                
        file_prefix = self.file_prefix
        if file_prefix and not file_prefix.endswith('-'):
            file_prefix += '-'


        #dirs = [
        #    workspace/item
        #    for item in names
        #]

        samples = [ ]
        for sample in self.samples:
            samples.append(sample(
                samplespace / sample.output_dir,
                reference = self.reference,
                ))
        
        dirs = [ item.output_dir for item in samples ]
        polya_dirs = [ item + '-polyA' for item in dirs ]        
        interleaved = [ item2 for item in zip(dirs,polya_dirs) for item2 in item ]
        
        clipper_logs = [ join(item.output_dir, 'clipped_reads_log.txt') for item in samples ]
        filter_logs = [ join(item.output_dir, 'filter_log.txt') for item in samples ]
        filter_polya_logs = [ join(item.output_dir + '-polyA', 'filter_log.txt') for item in samples ]                
        #filter_logs = [ item.get_filter_action().log_filename() for item in samples ]
        #filter_polya_logs = [ item.get_polya_filter_action().log_filename() for item in samples ]

        analyse_template = tail_lengths.Analyse_tail_counts(
            working_dirs = dirs,
            saturation = 0,
            extension = self.extension,
            annotations = reference/'reference.gff',
            types = 'gene',
            )
        

        with nesoni.Stage() as stage:        
            for item in samples:
                item.process_make(stage)

        
        
        nesoni.Norm_from_samples(
            workspace/'norm',
            working_dirs = dirs
            ).make()

        def writer():
            for row in io.read_table(workspace/'norm.csv'):
                row['Name'] = row['Name']+'-polyA'
                yield row
        io.write_csv(workspace/'norm-polyA.csv', writer(), comments=['Normalization'])


        with nesoni.Stage() as stage:
            if self.include_plots:        
                for plot_name, directories, norm_filename in [
                      ('all',   dirs,       workspace/'norm.csv'),
                      ('polyA', polya_dirs, workspace/'norm-polyA.csv'),
                      ]:
                    nesoni.IGV_plots(
                        plotspace/plot_name,
                        working_dirs = directories,
                        label_prefix = plot_name+' ',
                        raw = True,
                        norm = True,
                        genome = reference.get_genome_filename(),
                        norm_file = norm_filename,
                        #delete_igv = False,
                        ).process_make(stage)

            analyse_gene_counts_0 = analyse_template(
                output_dir = expressionspace/'genewise',
                saturation = 0,
                extension = self.extension,
                title = 'Genewise expression - ' + self.title,
                file_prefix = file_prefix+'genewise-',
                )
            analyse_gene_counts_0.process_make(stage)
            
            analyse_gene_counts_1 = analyse_template(
                output_dir = expressionspace/'genewise-dedup',
                saturation = 1,
                title = 'Genewise expression with read deduplication - ' + self.title,
                file_prefix = file_prefix+'genewise-dedup-',
                )
            analyse_gene_counts_1.process_make(stage)
            
            stage.process(self._run_peaks, 
                workspace=workspace, expressionspace=expressionspace, reference=reference, 
                polya_dirs=polya_dirs, analyse_template=analyse_template, file_prefix=file_prefix,
                )
            
        with nesoni.Stage() as stage:
            for test in self.tests:
                test(
                    output_dir = testspace/test.output_dir,
                    analysis = self.output_dir
                    ).process_make(stage)

                test(
                    output_dir = testspace_dedup/test.output_dir,
                    analysis = self.output_dir,
                    dedup = True,
                    ).process_make(stage)
        
        #===============================================
        #                   Report        
        #===============================================

        r = reporting.Reporter(os.path.join(self.output_dir, 'report'), self.title, self.file_prefix)
                    
        r.heading('Alignment to reference')
        
        r.report_logs('alignment-statistics',
            #[ workspace/'stats.txt' ] +
            clipper_logs + filter_logs + #filter_polya_logs +
            [ expressionspace/('genewise','aggregate-tail-counts_log.txt') ],
            filter=lambda sample, field: (
                field not in [
                    
                    'fragments','fragments aligned to the reference','reads kept',
                    'average depth of coverage, ambiguous',
                    'average depth of coverage, unambiguous',
                    ]
            ),
        )


        if self.include_plots:        
            r.heading('IGV plots')
            
            r.p('These files show the depth of coverage. They can be viewed with the IGV genome browser.')
            
            genome_files = [ ]
            if self.include_genome:
                genome_files.append(reference.get_genome_filename())
                genome_dir = reference.get_genome_dir()
                base = os.path.split(genome_dir)[1]
                for filename in os.listdir(genome_dir):
                    genome_files.append((
                        os.path.join(genome_dir, filename),
                        os.path.join(base, filename)
                        ))
            
            r.p(r.tar('igv-plots',
                genome_files +
                glob.glob(plotspace/'*.tdf')
                ))
        

        if self.include_bams:
            r.heading('BAM files')
            
            r.p('These BAM files contain the alignments of reads to the reference sequences.')
            
            r.p('Reads with a poly(A) tail have an \'AN\' attribute giving the length of non-templated poly(A) sequence. '
                'Tail-tools only treats a read as having a tail if this length is at least 4.')
            
            bam_files = [ ]
            for name in names:
                bam_files.append( (samplespace/(name,'alignments_filtered_sorted.bam'),name+'.bam') )
                bam_files.append( (samplespace/(name,'alignments_filtered_sorted.bam.bai'),name+'.bam.bai') )
            r.p(r.tar('bam-files', bam_files))


        r.heading('Genewise expression')
        
        io.symbolic_link(source=expressionspace/('genewise','report'),link_name=r.workspace/'genewise')
        r.p('<a href="genewise/index.html">&rarr; Genewise expression</a>')

        io.symbolic_link(source=expressionspace/('genewise-dedup','report'),link_name=r.workspace/'genewise-dedup')
        r.p('<a href="genewise-dedup/index.html">&rarr; Genewise expression with read deduplication</a>')


        r.heading('Peakwise expression')

        web.Geneview_webapp(r.workspace/'view').run()        
        
        peak_filename = expressionspace/('peakwise','features-with-data.gff')
        n_peaks = len(list(annotation.read_annotations(peak_filename)))
        r.p('%d peaks called (%d poly(A) reads were required to call a peak).' % (n_peaks, self.peak_min_depth))
        
        r.p(r.get(peak_filename, name='peaks.gff') + ' - peaks called')        

        #if self.groups:
            #r.subheading('Peak shift between groups')
            #r.p(r.get(workspace/('peak-shift','grouped.csv')) + ' - genes with a potential peak shift')        
            #r.get(workspace/('peak-shift','grouped.json'))

        #r.subheading('Peak shift between samples')
        #r.p(r.get(workspace/('peak-shift','individual.csv')) + ' - genes with a potential peak shift')        
        #r.get(workspace/('peak-shift','individual.json'))

        
        io.symbolic_link(source=expressionspace/('peakwise','report'),link_name=r.workspace/'peakwise')
        r.p('<a href="peakwise/index.html">&rarr; Peakwise expression</a>')

        io.symbolic_link(source=expressionspace/('peakwise-dedup','report'),link_name=r.workspace/'peakwise-dedup')
        r.p('<a href="peakwise-dedup/index.html">&rarr; Peakwise expression with read deduplication</a>')
                
        if self.tests:
            r.heading('Differential tests')
            for test in self.tests:
                io.symbolic_link(source=testspace/test.output_dir,link_name=r.workspace/('test-'+test.output_dir))
                io.symbolic_link(source=testspace_dedup/test.output_dir,link_name=r.workspace/('test-dedup-'+test.output_dir))
                r.p('<a href="test-%s">&rarr; %s</a> '
                    ' &nbsp; <a href="test-dedup-%s" style="font-size: 66%%">[&rarr; Deduplicated version]</a>' % (test.output_dir, test.get_title(), test.output_dir))

        r.heading('Gene viewers')
        r.p('Having identified interesting genes from heatmaps and differential tests above, '
            'these viewers allow specific genes to be examined in detail.')
        
        if self.groups:
            r.p('<a href="view.html?json=%sgrouped.json">&rarr; Gene viewer, grouped samples</a>' % r.file_prefix)
        r.p('<a href="view.html?json=%sindividual.json">&rarr; Gene viewer, individual samples</a>' % r.file_prefix)
       

        r.write('<p/><hr>\n')
        
        r.p('Note: Use deduplicated versions with care. '
            'They may possibly provide more significant results, however they are less quantitative. '
            'Read deduplication involves throwing away a large amount of data, much of which will not be a technical artifact. '
            'Deduplicated versions might best be viewed as a check on data quality.')
        
        r.p('This set of genes was used in the analysis:')
        
        r.p(r.get(reference/'reference.gff') + ' - Reference annotations in GFF3 format')
        r.p(r.get(reference/'utr.gff') + ' - 3\' UTR regions')

        r.p('tail-tools version '+tail_tools.VERSION)
        r.p('nesoni version '+nesoni.VERSION)
        #r.p('SHRiMP version '+grace.get_shrimp_2_version())
        
        r.close()
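
The `writer()` generator in the pipeline above re-emits the normalization table with each sample renamed to its `-polyA` twin, so the poly(A)-only coverage tracks reuse the same normalizing multipliers. A toy version over plain dicts (hypothetical field names matching the code above):

def polya_rows(rows):
    # Yield shallow copies with '-polyA' appended to each sample's Name.
    for row in rows:
        row = dict(row)
        row['Name'] = row['Name'] + '-polyA'
        yield row

for row in polya_rows([{'Name': 'sampleA'}]):
    print(row['Name'])   # sampleA-polyA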
Example #41
    def run(self):
        work = self.get_workspace()
        work.update_param(remove=['tail_tools_reference_version'])

        nesoni.Make_reference(
            self.output_dir,
            filenames=self.filenames,
            snpeff=False,
            cs='ifavailable' if self.index else False,
            ls=False,
            bowtie='ifavailable' if self.index else False,
        ).run()

        annotations = list(annotation.read_annotations(work / 'reference.gff'))
        annotation.link_up_annotations(annotations)

        exon_index = span_index.index_annotations(
            [item for item in annotations if item.type == "exon"])
        mrna_end_index = span_index.index_annotations([
            item.three_prime() for item in annotations if item.type == "mRNA"
        ])

        mrna_utrs = []
        gene_utrs = []

        for gene in annotations:
            if gene.type != 'gene': continue

            mrnas = [item for item in gene.children if item.type == 'mRNA']
            assert mrnas, "Gene without any mRNAs: " + gene.get_id()

            gene.attr['color'] = '#880088'
            gene.start = min(item.start for item in mrnas)
            gene.end = max(item.end for item in mrnas)
            gene.attr["max_extension"] = str(
                _max_extension(gene, exon_index, mrna_end_index))

            gene_utr_5primes = []

            for mrna in mrnas:
                assert mrna.strand == gene.strand, mrna
                assert mrna.seqid == gene.seqid, mrna

                mrna.attr["max_extension"] = str(
                    _max_extension(mrna, exon_index, mrna_end_index))

                cdss = [item for item in mrna.children if item.type == 'CDS']
                exons = [item for item in mrna.children if item.type == 'exon']

                if not exons: continue

                # link_up_annotations sorts children, so the final exon really is final
                for item in exons[:-1]:
                    item.attr["max_extension"] = "0"
                exons[-1].attr["max_extension"] = mrna.attr["max_extension"]

                if not cdss: continue

                mrna_utr_5primes = []
                if gene.strand >= 0:
                    cds_3prime = max(item.end for item in cdss)
                    for item in exons:
                        if item.end >= cds_3prime:
                            mrna_utr_5primes.append(max(
                                item.start, cds_3prime))
                else:
                    cds_3prime = min(item.start for item in cdss)
                    for item in exons:
                        if item.start <= cds_3prime:
                            mrna_utr_5primes.append(min(item.end, cds_3prime))

                if mrna.strand >= 0:
                    utr_start = min(
                        mrna_utr_5primes) if mrna_utr_5primes else mrna.end
                    utr_end = max(utr_start + 1, mrna.end)
                    gene_utr_5primes.append(utr_start)
                else:
                    utr_end = max(
                        mrna_utr_5primes) if mrna_utr_5primes else mrna.start
                    utr_start = min(mrna.start, utr_end - 1)
                    gene_utr_5primes.append(utr_end)

                attr = mrna.attr.copy()
                attr['Parent'] = attr['ID']
                attr['ID'] = attr['ID'] + '-3UTR'
                attr['color'] = '#008888'
                utr = annotation.Annotation(
                    source='tt',
                    type='three_prime_utr',
                    seqid=mrna.seqid,
                    strand=mrna.strand,
                    start=utr_start,
                    end=utr_end,
                    attr=attr,
                )
                max_ext = _max_extension(utr, exon_index, mrna_end_index)
                utr.attr["max_extension"] = str(max_ext)
                #Only include if there is an annotated 3' UTR or end is not in the middle of some other isoform's exon
                if utr_end - utr_start + max_ext > 1:
                    mrna_utrs.append(utr)

            if gene.strand >= 0:
                utr_start = max(
                    gene_utr_5primes) if gene_utr_5primes else gene.end
                utr_end = max(utr_start + 1, gene.end)
            else:
                utr_end = min(
                    gene_utr_5primes) if gene_utr_5primes else gene.start
                utr_start = min(gene.start, utr_end - 1)

            attr = gene.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID'] + '-3UTR'
            attr['color'] = '#008888'
            utr = annotation.Annotation(
                source='tt',
                type='three_prime_utr',
                seqid=gene.seqid,
                strand=gene.strand,
                start=utr_start,
                end=utr_end,
                attr=attr,
            )
            utr.attr["max_extension"] = str(
                _max_extension(utr, exon_index, mrna_end_index))
            gene_utrs.append(utr)

        annotation.write_gff3(work / 'reference.gff', annotations + mrna_utrs)
        annotation.write_gff3(work / 'utr.gff', gene_utrs)

        work.update_param(tail_tools_reference_version=work.VERSION)
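
The `max_extension` attribute written here is what the counting code consumes later (see Example #38 above), clamping each feature's downstream extension so it never runs into a neighbouring feature. A restatement of that clamp with hypothetical numbers:

extension = 200                   # hypothetical default from the --extension option
attr = {"max_extension": "50"}    # written by the reference builder above

this_extension = extension
if "max_extension" in attr:
    this_extension = min(this_extension, int(attr["max_extension"]))
print(this_extension)             # 50: never extend into the next feature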
Example #42
def count_run(min_score, min_size, max_size, filter_mode, equalize, types,
              locii, qualifiers, use_strand, merge_filename, limit,
              output_prefix, filenames, log):

    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')

    qualifiers = qualifiers.split(',')

    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool', 'forward', 'reverse',
                          'both'), "Can't understand --strand specification."

    from Bio import Seq, SeqIO

    annotation_filenames = []
    bam_filenames = []
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            titles[i] = os.path.basename(bam_filenames[i])
            if not annotation_filenames:
                working = working_directory.Working(bam_filenames[i])
                reference_filename = working.get_reference(
                ).annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)

            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)

    assert bam_filenames, 'No reference alignments given'

    merge = {}
    merge_qualifiers = {}
    if merge_filename is not None:
        #First line gives qualifiers
        #remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>

        f = open(merge_filename, 'rU')
        qualifiers = f.readline().rstrip('\n').split('\t')
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if not parts: continue
            for name in parts[len(qualifiers) + 1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[len(qualifiers)]
                merge_qualifiers[name] = parts[:len(qualifiers)]
        f.close()
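        # A hypothetical merge file with a single qualifier column ("product"):
        #
        #     product
        #     hypothetical protein<TAB>geneA<TAB>tx1<TAB>tx2
        #
        # parses to merge == {'tx1': 'geneA', 'tx2': 'geneA'} and
        # merge_qualifiers == {'tx1': ['hypothetical protein'],
        #                      'tx2': ['hypothetical protein']}.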

    genes = {}  # reference name -> Span_index of features on that sequence

    feature_names = {}  # feature_name -> number of occurrences

    features = []

    n_features = 0

    chromosome_length = {}
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line: continue
            parts = line.split('\t')
            if parts[0] != '@SQ': continue

            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'): name = part[3:]
                if part.startswith('LN:'): length = int(part[3:])
            assert name is not None and length is not None

            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length
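            # e.g. the header line '@SQ\tSN:chr1\tLN:230218' parses to
            # name == 'chr1' and length == 230218.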

    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'

        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types: continue

                if (locii is not None and
                    ('locus_tag' not in feature.attr
                     or feature.attr['locus_tag'].lower() not in locii)):
                    continue

                f = Feature(n_samples)
                f.name = feature.get_id()
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name

                feature_names[f.name] = feature_names.get(f.name, 0) + 1
                if feature_names[f.name] > 1:
                    f.name += '/%d' % feature_names[f.name]

                f.qualifiers = [
                    feature.attr.get(item, '') for item in qualifiers
                ]

                f.length = feature.end - feature.start

                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(
                    Span_entry(feature.start, feature.end, feature.strand or 1,
                               f))
                features.append(f)

    else:
        # Sequences as features
        log.log(
            'No annotation files given or found, using sequences as features\n'
        )

        name_feature = {}  # (merged)name -> feature

        for name in chromosome_length:
            merged_name = merge.get(name, name)

            if merged_name not in name_feature:
                f = Feature(n_samples)
                f.name = merged_name
                f.length = chromosome_length[name]
                f.qualifiers = merge_qualifiers.get(name,
                                                    ('', ) * len(qualifiers))
                n_features += 1
                name_feature[merged_name] = f
                features.append(f)
            else:
                f = name_feature[merged_name]
                f.length = max(f.length, chromosome_length[name])  # merged: keep the longest

            genes[name].insert(Span_entry(0, chromosome_length[name], 1, f))

    log.log('%d features\n\n' % len(features))

    for name in genes:
        genes[name].prepare()

    n_fragments = [0] * n_samples
    n_fragments_aligned = [0] * n_samples
    n_low_score = [0] * n_samples
    n_something = [0] * n_samples
    n_multiple = [0] * n_samples
    n_span = [0] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(
                bam_filenames[i],
                'Counting sample %d of %d' % (i + 1, n_samples)):
            n_fragments[i] += 1

            if not fragment_alignments:
                continue

            n_fragments_aligned[i] += 1

            feature_hits = []  # [ [ (feature, strand) ] ]

            # Use only top scoring alignments
            fragment_scores = [
                sum(al.get_AS() for al in item) for item in fragment_alignments
            ]

            best_score = max(fragment_scores)

            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue

            if use_only_top:
                cutoff = max(best_score, min_score)
            else:
                cutoff = min_score
            fragment_alignments = [
                item
                for item, score in zip(fragment_alignments, fragment_scores)
                if score >= cutoff
            ]

            for alignments in fragment_alignments:
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1

                start = min(item.pos - 1 for item in alignments)
                end = max(item.pos + item.length - 1 for item in alignments)
                length = end - start
                if min_size is not None and length < min_size: continue
                if max_size is not None and length > max_size: continue

                rname = alignments[0].rname
                assert rname in genes, 'Alignment refers to sequence not present in GENBANK file'

                this_feature_hits = []
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits: continue
                    this_feature_hits.append(key)
                    if not use_only_monogamous or len(fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1

                if this_feature_hits:
                    feature_hits.append(this_feature_hits)

                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]: continue
                            a[0].common[(a[1], b[1])][b[0]] += 1

            if len(feature_hits) > 0:
                n_something[i] += 1
            #else:
            #    print fragment_alignments
            #    print genes[fragment_alignments[0][0].rname].indexes
            #    print

            if len(feature_hits) > 1:
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k: continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]: continue
                                a[0].ambiguous[(a[1], b[1])][b[0]] += 1

            if any(len(item) > 1 for item in feature_hits): n_span[i] += 1

            if limit is not None and n_fragments[i] >= limit: break

        grace.status('')


        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference',
                  n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded',
                      n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i], 'had an alignment that spanned multiple genes',
                  n_span[i])
        log.log('\n')

    strandedness = []
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward + n_reverse < 5: continue
        strandedness.append(
            (n_forward - n_reverse) * 100.0 / (n_forward + n_reverse))
    # Guard against division by zero when no feature has at least 5 hits.
    strandedness = sum(strandedness) / len(strandedness) if strandedness else 0.0
    log.log(
        'Strand specificity: %.0f%%\n'
        '  (~ -100%% reverse strand, ~ 0%% non-specific, ~ 100%% forward strand\n'
        '   Average over all features with at least 5 hits.)\n' % strandedness)
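    # Worked example: a feature with 9 forward and 1 reverse hits scores
    # (9 - 1) * 100.0 / (9 + 1) = +80, i.e. strongly forward-stranded.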

    # Each getter maps a feature to (name, counts, common, ambiguous).
    if use_strand == 'pool':
        getters = [
            lambda f: (
                f.name,
                add_lists(f.count[1], f.count[-1]),
                add_defdicts(f.common[(1, 1)], f.common[(1, -1)],
                             f.common[(-1, 1)], f.common[(-1, -1)]),
                add_defdicts(f.ambiguous[(1, 1)], f.ambiguous[(1, -1)],
                             f.ambiguous[(-1, 1)], f.ambiguous[(-1, -1)]))
        ]
    elif use_strand == 'forward':
        getters = [
            lambda f: (f.name, f.count[1], f.common[(1, 1)],
                       f.ambiguous[(1, 1)])
        ]
    elif use_strand == 'reverse':
        getters = [
            lambda f: (f.name, f.count[-1], f.common[(-1, -1)],
                       f.ambiguous[(-1, -1)])
        ]
    elif use_strand == 'both':
        getters = [
            lambda f: (f.name, f.count[1], f.common[(1, 1)],
                       f.ambiguous[(1, 1)]),
            lambda f: (f.name + 'r', f.count[-1], f.common[(-1, -1)],
                       f.ambiguous[(-1, -1)])
        ]

    total_hits = [0] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    if equalize:
        min_hits = min(total_hits)
        p = [float(min_hits) / item for item in total_hits]
        total_hits = [min_hits] * n_samples

    f = open(output_prefix + '.txt', 'wb')
    #log.attach(open(output_prefix + '_log.txt', 'wb'))

    print >> f, tab_encode(
        ['Feature'] + titles + ['RPKM ' + item for item in titles] +
        ['Length'] + qualifiers + ['On same fragment'] +
        (['Ambiguous alignment'] if expect_multiple_alignments else []))

    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)

            if equalize:
                count = [subsample(count[i], p[i]) for i in xrange(n_samples)]

            rpkm = [
                count[i] * 1.0e9 / feature.length / total_hits[i]
                for i in xrange(n_samples)
            ]

            common_str = ' '.join(
                '%dx%s' % (item[1], item[0]) for item in sorted(
                    common.items(), key=lambda item: item[1], reverse=True))
            ambiguous_str = ' '.join(
                '%dx%s' % (item[1], item[0]) for item in sorted(
                    ambiguous.items(), key=lambda item: item[1], reverse=True))

            print >> f, tab_encode(
                [feature_name] + [str(item) for item in count] +
                ['%.2f' % item for item in rpkm] + [str(feature.length)] +
                list(feature.qualifiers) + [common_str] +
                ([ambiguous_str] if expect_multiple_alignments else []))

    f.close()
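
The score-based filtering in the middle of this example is easy to check with toy numbers. A minimal sketch of the 'poly'/'mono' (use_only_top) behaviour, with made-up AS score sums:

fragment_scores = [37, 42, 42, 19]   # summed AS tags, one per candidate alignment
min_score = 20
use_only_top = True

best_score = max(fragment_scores)
cutoff = max(best_score, min_score) if use_only_top else min_score
kept = [i for i, score in enumerate(fragment_scores) if score >= cutoff]

print kept  # [1, 2]: only the top-scoring alignments survive when use_only_top is set
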
Ejemplo n.º 43
0
    def run(self):
        #assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given'
        
        # Reference genome
        
        #chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths()
        chromosomes = collections.OrderedDict(io.read_sequences(self.reference))

        def get_interpeak_seq(peaks):
            start = min(item.transcription_stop for item in peaks)
            end = max(item.transcription_stop for item in peaks)
            if end-start > self.max_seq: return ''
            if peaks[0].strand >= 0:
                return chromosomes[peaks[0].seqid][start:end]
            else:
                return bio.reverse_complement(chromosomes[peaks[0].seqid][start:end])

        def get_prepeak_seq(gene,peaks):
            if gene.strand >= 0:
                start = gene.utr_pos
                end = min(item.transcription_stop for item in peaks)
                if end-start > self.max_seq: return ''
                return chromosomes[gene.seqid][start:end]
            else:
                start = max(item.transcription_stop for item in peaks)
                end = gene.utr_pos
                if end-start > self.max_seq: return ''
                return bio.reverse_complement(chromosomes[gene.seqid][start:end])
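        # For example (hypothetical coordinates): with transcription stops at
        # 100 and 160 on a reverse-strand gene, get_interpeak_seq returns
        # bio.reverse_complement(chromosomes[seqid][100:160]).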
        
        # Normalization files
        
        if self.norm_file:
            norm_file = self.norm_file
        else:
            nesoni.Norm_from_counts(self.prefix+'-norm', self.counts).run()
            norm_file = self.prefix+'-norm.csv'

        norms = io.read_grouped_table(norm_file, [('All',str)])['All']
        pair_norm_names = [ ]
        pair_norms = [ ]
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i]+'-peak1')
            pair_norms.append(norms.values()[i])
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i]+'-peak2')
            pair_norms.append(norms.values()[i])
        io.write_grouped_csv(
            self.prefix+'-pairs-norm.csv',
            [('All',io.named_list_type(pair_norm_names)(pair_norms))],
            comments=['#Normalization'],
            )


        # Read data
        
        annotations = list(annotation.read_annotations(self.parents))
        if self.utrs:
            utrs = list(annotation.read_annotations(self.utrs))
        else:
            utrs = [ ]
        children = list(annotation.read_annotations(self.children))
        
        count_table = io.read_grouped_table(self.counts, [
            ('Count',int),
            ('Tail_count',int),
            ('Tail',_float_or_none),
            ('Proportion',_float_or_none),
            ('Annotation',str)
            ])
        counts = count_table['Count']
        tail_counts = count_table['Tail_count']
        proportions = count_table['Proportion']
        tails = count_table['Tail']
        
        samples = counts.value_type().keys()
        sample_tags = { }
        for line in count_table.comments:
            if line.startswith('#sampleTags='):
                parts = line[len('#sampleTags='):].split(',')
                assert parts[0] not in sample_tags
                sample_tags[parts[0]] = parts
        
        for item in children:
            item.weight = sum( counts[item.get_id()][name] * float(norms[name]['Normalizing.multiplier']) for name in samples )
        
        parents = [ ]
        id_to_parent = { }
        for item in annotations:
            if item.type != self.parent_type: continue
            assert item.get_id() not in id_to_parent, 'Duplicate id in parent file: '+item.get_id()
            parents.append(item)
            id_to_parent[item.get_id()] = item
            item.children = [ ]
            #item.cds = [ ]
            
            # Default utr
            if item.strand >= 0:
                item.utr_pos = item.end
            else:
                item.utr_pos = item.start

            if 'three_prime_UTR_start' in item.attr:
                if item.strand >= 0:
                    item.utr_pos = int(item.attr['three_prime_UTR_start'])-1
                else:
                    item.utr_pos = int(item.attr['three_prime_UTR_start'])

        for item in utrs:
            assert item.attr['Parent'] in id_to_parent, 'Unknown gene '+item.attr['Parent']    
            id_to_parent[item.attr['Parent']].utr_pos = (item.start if item.strand >= 0 else item.end)


        for item in children:
            item.transcription_stop = item.end if item.strand >= 0 else item.start # End of transcription, 0-based, i.e. between-positions based
            
            if 'Parent' in item.attr:
                for item_parent in item.attr['Parent'].split(','):
                    parent = id_to_parent[item_parent]
                    parent.children.append(item)
                    

        for item in parents:
            item.children.sort(key=_annotation_sorter)
            
            relevant = list(item.children)
            if self.utr_only:
                #if item.strand <= 0:
                #    relative_utr_start = item.end - int(item.attr['three_prime_UTR_start'])
                #else:
                #    relative_utr_start = int(item.attr['three_prime_UTR_start'])-1 - item.start
                #
                #def relative_start(peak):
                #    return item.end-peak.end if item.strand < 0 else peak.start-item.start
                #relevant = [ peak for peak in relevant if relative_start(peak) >= relative_utr_start ]
                relevant = [ 
                    peak for peak in relevant 
                    if (peak.end >= item.utr_pos if item.strand >= 0 else peak.start <= item.utr_pos)
                    ]
                    
            if self.top:
                relevant.sort(key=lambda peak:peak.weight, reverse=True)
                relevant = relevant[:self.top]
            relevant.sort(key=_annotation_sorter)
            item.relevant_children = relevant
        
        
        
        # JSON output
        
        j_data = { }
        j_genes = j_data['genes'] = { }
        
        j_genes['__comment__'] = 'start is 0-based'
        j_genes['name'] = [ ]
        j_genes['chromosome'] = [ ]
        j_genes['strand'] = [ ]
        j_genes['start'] = [ ]
        j_genes['utr'] = [ ]
        j_genes['end'] = [ ]
        j_genes['gene'] = [ ]
        j_genes['product'] = [ ]
        j_genes['peaks'] = [ ]
        j_genes['relevant_peaks'] = [ ]
        #j_genes['cds'] = [ ]
        #j_genes['cds_start'] = [ ]
        #j_genes['cds_end'] = [ ]
        for item in parents:
            j_genes['name'].append( item.get_id() )
            j_genes['chromosome'].append( item.seqid )
            j_genes['strand'].append( item.strand )
            j_genes['start'].append( item.start )
            j_genes['utr'].append( item.utr_pos )
            j_genes['end'].append( item.end )
            j_genes['gene'].append( item.attr.get('Name',item.attr.get('gene','')) )
            j_genes['product'].append( item.attr.get('Product',item.attr.get('product','')) )
            j_genes['peaks'].append( [ item2.get_id() for item2 in item.children ] )
            j_genes['relevant_peaks'].append( [ item2.get_id() for item2 in item.relevant_children ] )
            #j_genes['cds'].append( item.cds )
            #j_genes['cds_start'].append( item.cds_start )
            #j_genes['cds_end'].append( item.cds_end )
        
        j_peaks = j_data['peaks'] = { }
        j_peaks['__comment__'] = 'start is 0-based'
        j_peaks['name'] = [ ]
        j_peaks['chromosome'] = [ ]
        j_peaks['strand'] = [ ]
        j_peaks['start'] = [ ]
        j_peaks['end'] = [ ]
        j_peaks['parents'] = [ ]
        j_peaks['counts'] = [ ]
        j_peaks['tail_lengths'] = [ ]
        j_peaks['proportion_tailed'] = [ ]
        for item in children:
            j_peaks['name'].append( item.get_id() )
            j_peaks['chromosome'].append( item.seqid )
            j_peaks['strand'].append( item.strand )
            j_peaks['start'].append( item.start )
            j_peaks['end'].append( item.end )
            j_peaks['parents'].append( item.attr['Parent'].split(',') if 'Parent' in item.attr else [ ])
            j_peaks['counts'].append( counts[item.get_id()].values() )
            j_peaks['tail_lengths'].append( count_table['Tail'][item.get_id()].values() )
            j_peaks['proportion_tailed'].append( count_table['Proportion'][item.get_id()].values() )
        
        j_samples = j_data['samples'] = { }
        j_samples['name'] = [ ]
        j_samples['tags'] = [ ]
        j_samples['normalizing_multiplier'] = [ ]
        for name in samples:
            j_samples['name'].append(name)
            j_samples['tags'].append(sample_tags[name])
            j_samples['normalizing_multiplier'].append(float(norms[name]['Normalizing.multiplier']))
        
        j_chromosomes = j_data['chromosomes'] = { }
        j_chromosomes['name'] = [ ]
        j_chromosomes['length'] = [ ]
        for name, seq in chromosomes.iteritems():
            j_chromosomes['name'].append(name)
            j_chromosomes['length'].append(len(seq))        
        
        with open(self.prefix + '.json','wb') as f:
            json.dump(j_data, f)
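        # Quick inspection of the dump (hypothetical session):
        #   >>> import json
        #   >>> data = json.load(open('myprefix.json'))
        #   >>> data['genes']['name'][:3], data['peaks']['counts'][0]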
        
        
        # Output paired peak file
        
        output_comments = [ '#Counts' ]
        output_samples = [ ]
        for item in samples:
            output_samples.append(item+'-peak1')
            output_comments.append('#sampleTags=' + ','.join([item+'-peak1','peak1']+sample_tags.get(item,[])))
        for item in samples:
            output_samples.append(item+'-peak2')
            output_comments.append('#sampleTags=' + ','.join([item+'-peak2','peak2']+sample_tags.get(item,[])))
        
        output_names = [ ]
        output_counts = [ ]
        output_tail_counts = [ ]
        output_proportions = [ ]
        output_tails = [ ]
        output_annotation_fields = [ 'gene', 'product', 'mean_tail_1', 'mean_tail_2', 'chromosome', 'strand', 
                                     'transcription_stops' ] #, 'interpeak_seq', ]
        output_annotations = [ ]
            
        for item in parents:
            peaks = item.relevant_children
            for i in xrange(len(peaks)-1):
                for j in xrange(i+1, len(peaks)):
                    id_i = peaks[i].get_id()
                    id_j = peaks[j].get_id()
                    id_pair = item.get_id() + '-'+id_i+'-'+id_j
                    output_names.append(id_pair)
                    
                    row = [ ]
                    row.extend(counts[id_i].values())
                    row.extend(counts[id_j].values())
                    output_counts.append(map(_text,row))
                    
                    row = [ ]
                    row.extend(tail_counts[id_i].values())
                    row.extend(tail_counts[id_j].values())
                    output_tail_counts.append(map(_text,row))

                    row = [ ]
                    row.extend(proportions[id_i].values())
                    row.extend(proportions[id_j].values())
                    output_proportions.append(map(_text,row))

                    row = [ ]
                    row.extend(tails[id_i].values())
                    row.extend(tails[id_j].values())
                    output_tails.append(map(_text,row))
                    
                    output_annotations.append([
                        item.attr.get('Name',item.attr.get('gene','')),
                        item.attr.get('Product',item.attr.get('product','')),
                        count_table['Annotation'][id_i]['mean-tail'],
                        count_table['Annotation'][id_j]['mean-tail'],
                        
                        item.seqid,
                        str(item.strand),
                        '%d, %d' % (peaks[i].transcription_stop,peaks[j].transcription_stop),
                        #get_interpeak_seq([peaks[i],peaks[j]]),
                        ])
        
        #output_count_table = io.named_matrix_type(output_names,output_samples)(output_counts)
        io.write_grouped_csv(
            self.prefix + '-pairs.csv',
            [ 
                ('Count',io.named_matrix_type(output_names,output_samples)(output_counts)),
                ('Tail_count',io.named_matrix_type(output_names,output_samples)(output_tail_counts)),
                ('Proportion',io.named_matrix_type(output_names,output_samples)(output_proportions)),
                ('Tail',io.named_matrix_type(output_names,output_samples)(output_tails)),
                ('Annotation',io.named_matrix_type(output_names,output_annotation_fields)(output_annotations)),
                ],
            comments=output_comments,
            )
                        
#        # Chi Sq tests
#        
#        #for id in relation:
#        #    peaks = relation[id]
#        #    if len(peaks) < 2: continue     
#        
#        mats = [ ]   
#        genes = [ ]
#        products = [ ]
#        mean_tails = [ ]
#        prop_tails = [ ]
#        
#        peak_names = [ ]
#        chromosome_names = [ ]
#        strands = [ ]
#        transcription_stops = [ ]
#        interpeak_seqs = [ ]
#        prepeak_seqs = [ ]
#        
#        for parent in parents:
#            id = parent.get_id()
#            peaks = parent.relevant_children
#            if len(peaks) < 2: continue
#            
#            matrix = [ ]
#            for item in peaks:
#                matrix.append(counts[item.get_id()].values())
#            
#            mats.append(
#                runr.R_literal(id) + ' = ' + 
#                runr.R_literal(matrix)
#                )
#            
#            genes.append(parent.attr.get('Name',parent.attr.get('gene','')))
#            products.append(parent.attr.get('Product',parent.attr.get('product','')))
#            
#            def format_mean(s):
#                if s == 'NA': return 'NA'
#                return '%.1f' % float(s)
#            mean_tails.append(', '.join( format_mean(count_table['Annotation'][item.get_id()]['mean-tail']) for item in peaks ))
#            
#            def format_prop(s):
#                if s == 'NA': return 'NA'
#                return '%.2f' % float(s)
#            prop_tails.append(', '.join( format_prop(count_table['Annotation'][item.get_id()]['proportion-with-tail']) for item in peaks ))
#            
#            peak_names.append(', '.join(item.get_id() for item in peaks))
#            chromosome_names.append(parent.seqid)
#            strands.append(parent.strand)
#            transcription_stops.append(', '.join(str(item.transcription_stop) for item in peaks))
#            interpeak_seqs.append(get_interpeak_seq(peaks))
#            prepeak_seqs.append(get_prepeak_seq(parent,peaks))
#            
#            #if len(mats) >= 10: break
#        
#        text = 'cat("Loading data into R+\n")\n'
#        text += 'data <- list(\n' + ',\n'.join(mats) + ')\n'        
#        text += CHISQ
#        
#        runr.run_script(text,
#            OUTPUT_FILENAME=self.prefix+'.csv',
#            GENES = genes,
#            PRODUCTS = products,
#            MEAN_TAILS = mean_tails,
#            PROP_TAILS = prop_tails,
#            PEAK_NAMES = peak_names,
#            CHROMOSOME_NAMES = chromosome_names,
#            STRANDS = strands,
#            TRANSCRIPTION_STOPS = transcription_stops,
#            INTERPEAK_SEQS = interpeak_seqs,
#            PREPEAK_SEQS = prepeak_seqs,
#            )
#
Ejemplo n.º 44
0
    def run(self):
        assert self.extension is not None, '--extension must be specified'

        #workspace = self.get_workspace()
        workspace = working_directory.Working(self.working_dir, must_exist=True)
        if self.annotations is None:
            reference = workspace.get_reference()
            annotations_filename = reference.annotations_filename()
        else:
            annotations_filename = self.annotations

        types = [ item.lower() for item in self.types.split(',') ]

        annotations = [
            item
            for item in annotation.read_annotations(annotations_filename)
            if item.type.lower() in types
        ]

        self.log.log('%d annotations\n' % len(annotations))

        assert annotations, 'No annotations of specified types in file'

        index = { }

        for item in annotations:
            if item.strand >= 0:
                item.tail_pos = item.end
                item.end += self.extension
            else:
                item.tail_pos = item.start
                item.start -= self.extension

            if item.seqid not in index:
                index[item.seqid] = span_index.Span_index()
            index[item.seqid].insert(item)

            item.hits = [] # [ (rel_start, rel_end, tail_length, adaptor_bases) ]

        for item in index.itervalues():
            item.prepare()

        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(workspace/'alignments_filtered.bam'):
            for fragment in fragment_alignments:
                start = min(item.pos-1 for item in fragment)
                end = max(item.pos+item.length-1 for item in fragment)
                strand = -1 if fragment[0].flag&sam.FLAG_REVERSE else 1

                if strand >= 0:
                    tail_pos = end
                else:
                    tail_pos = start

                tail_length = 0
                adaptor_bases = 0
                for item in fragment[0].extra:
                    if item.startswith('AN:i:'):
                        tail_length = int(item[5:])
                    elif item.startswith('AD:i:'):
                        adaptor_bases = int(item[5:])

                if fragment[0].rname in index:
                    hits = [
                        gene
                        for gene in index[fragment[0].rname].get(start,end)
                        if gene.strand == strand
                        ]
                    if hits:
                        # Nearest gene by tail_pos; failing that, by id to
                        # ensure a deterministic choice.
                        gene = min(hits, key=lambda gene: (abs(tail_pos - gene.tail_pos), gene.get_id()))

                        if strand > 0:
                            rel_start = start - gene.start
                            rel_end = end - gene.start
                        else:
                            rel_start = gene.end - end
                            rel_end = gene.end - start

                        gene.hits.append( (rel_start,rel_end,tail_length,adaptor_bases) )

        f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
        pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
        f.close()
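
The result is just a gzipped pickle of (name, tags, annotations), so it can be loaded back with the standard library alone; nesoni must still be importable so the Annotation objects can be unpickled. A sketch with a hypothetical filename:

import gzip
import cPickle as pickle

f = gzip.open('output.pickle.gz', 'rb')
name, tags, annotations = pickle.load(f)
f.close()

print name, tags, len(annotations)
for gene in annotations[:3]:
    # Each hit is (rel_start, rel_end, tail_length, adaptor_bases).
    print gene.get_id(), len(gene.hits)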