def build_bowtie_index(self):
     io.execute([
             'bowtie2-build',
             self.reference_fasta_filename(),
             self/'bowtie',
             ],
         )
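Throughout these examples, expressions like self/'bowtie' are nesoni Workspace-style path joining, not division. A minimal sketch of the convention, assuming only that a workspace wraps a working directory (a hypothetical reimplementation for illustration, not nesoni's actual class):

import os

class Workspace(object):
    def __init__(self, working_dir):
        self.working_dir = working_dir
    def __div__(self, name):
        # workspace/'name' gives a path inside the working directory (Python 2 operator)
        return os.path.join(self.working_dir, name)

print Workspace('ref')/'bowtie'    #-> ref/bowtie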
 def build_snpeff(self):
     jar = io.find_jar('snpEff.jar')
     
     with open(self/'snpeff.config','wb') as f:
         print >> f, 'data_dir = snpeff'
         print >> f, 'genomes : ' + self.name
         print >> f, self.name + '.genome : ' + self.name 
     
     snpwork = io.Workspace(self/'snpeff',must_exist=False)
     snpwork_genome = io.Workspace(snpwork/self.name,must_exist=False)
     snpwork_genomes = io.Workspace(snpwork/'genomes',must_exist=False)
     
     annotations = self.annotations_filename()
     assert annotations
     with open(snpwork_genome/'genes.gff','wb') as f:
         for record in annotation.read_annotations(annotations):
             if record.end <= record.start: continue
             if not record.attr:
                 record.attr['attributes'] = 'none'
             print >> f, record.as_gff()
     
     with open(snpwork_genomes/(self.name+'.fa'),'wb') as f:
         for name, seq in io.read_sequences(self.reference_fasta_filename()):
             io.write_fasta(f, name, seq)
             
     io.execute('java -jar JAR build NAME -gff3 -c CONFIG',
         JAR=jar, NAME=self.name, CONFIG=self/'snpeff.config')
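For a reference hypothetically named mygenome, the snpeff.config written above would read:

data_dir = snpeff
genomes : mygenome
mygenome.genome : mygenome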
    def get_table(self, table_name):
        if table_name not in self.tables:
            if self.action.download:
                for filename in [table_name + '.txt.gz', table_name + '.sql']:
                    io.execute([
                        'rsync', '-P',
                        'rsync://hgdownload.cse.ucsc.edu/goldenPath/' +
                        self.action.ucsc_name + '/database/' + filename,
                        self.ucsc / filename
                    ])

            fields = []
            with open(self.ucsc / table_name + '.sql', 'rU') as f:
                for line in f:
                    if line.startswith('  `'):
                        parts = line.strip().split()
                        assert parts[0][0] == '`' and parts[0][-1] == '`'
                        fields.append(parts[0][1:-1])

            tup_class = collections.namedtuple(table_name, fields)
            data = []
            with gzip.open(self.ucsc / table_name + '.txt.gz', 'rb') as f:
                for line in f:
                    data.append(tup_class(*line.rstrip('\n').split('\t')))
            self.tables[table_name] = data

        return self.tables[table_name]
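Rows come back as namedtuples whose fields are the column names parsed out of the .sql schema. Assuming the standard UCSC refGene table has been fetched (scratch being the _ucsc_scratch helper seen in Example #35 below), usage might look like:

for row in scratch.get_table('refGene'):
    print row.chrom, row.txStart, row.txEnd, row.name2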
Example #4
 def run(self):
     genome = self.genome
     if os.path.isdir(genome):
         genome = os.path.join(genome, os.path.split(genome)[1]+'.genome')
         print genome
     
     #pref_filename = os.path.join(os.path.expanduser('~'),'igv','prefs.properties')
     #if os.path.exists(pref_filename):
     #    with open(pref_filename,'rb') as f:
     #        lines = f.readlines()
     #    with open(pref_filename,'wb') as f:
     #        for line in lines:
     #            if line.startswith('DEFAULT_GENOME_KEY='):
     #                #line = 'DEFAULT_GENOME_KEY=\n'
     #                continue
     #            f.write(line)
     
     with workspace.tempspace() as temp:
         with open(temp/'batch.txt','wb') as f:
             print >> f, 'new'
             print >> f, 'preference LAST_TRACK_DIRECTORY', os.getcwd()
             print >> f, 'preference LAST_GENOME_IMPORT_DIRECTORY', os.getcwd()
             print >> f, 'genome '+os.path.abspath(genome)
             for filename in self.files:
                 print >> f, 'load '+os.path.abspath(filename)
         
         io.execute(['java','-Xmx32000m',
                     #Flags from igb.sh script:
                     '-Dproduction=true','-Dapple.laf.useScreenMenuBar=true','-Djava.net.preferIPv4Stack=true',
                     '-jar',io.find_jar('igv.jar'),'-b',temp/'batch.txt'])
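The generated batch.txt drives IGV's batch mode. With hypothetical paths, it would contain:

new
preference LAST_TRACK_DIRECTORY /home/user/project
preference LAST_GENOME_IMPORT_DIRECTORY /home/user/project
genome /data/ref.genome
load /data/sample.bam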
Example #5
def sort_bam(in_filename, out_prefix, by_name=False, cores=8):
    cores = min(cores, legion.coordinator().get_cores())
    megs = max(10, 800 // cores)
    
    io.execute(
        [ 'samtools', 'sort', '-@', '%d' % cores, '-m', '%dM' % megs ] +
        ([ '-n' ] if by_name else [ ]) +
        [ in_filename, out_prefix ], cores=cores)
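samtools releases before 1.0 took an output prefix and appended '.bam' themselves, which is what this code assumes; samtools 1.x instead expects an explicit -o output file. A hedged equivalent for modern samtools:

io.execute(
    [ 'samtools', 'sort', '-@', '%d' % cores, '-m', '%dM' % megs ] +
    ([ '-n' ] if by_name else [ ]) +
    [ '-o', out_prefix + '.bam', in_filename ], cores=cores)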
Example #6
def index_vcf(filename):
    """ IGV index a VCF file. 
        Don't fail if igvtools fails (eg not installed).
    """
    try:
        io.execute('igvtools index FILENAME', FILENAME=filename)
    except (OSError, AssertionError):
        print >> sys.stderr, 'Failed to index VCF file with igvtools. Continuing.'
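The except clause catches AssertionError as well as OSError because io.execute apparently asserts a zero exit status (see the subprocess sketch under Example #15 below). Typical usage:

index_vcf('snps.vcf')    # writes snps.vcf.idx if igvtools is installed; warns otherwise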
Example #7
def sort_and_index(in_filename, out_prefix):
    io.execute([
        'samtools', 'sort', in_filename, out_prefix
    ])
    
    io.execute([
        'samtools', 'index', out_prefix + '.bam'
    ])
 def build_bowtie_index(self, log_to=sys.stdout):
     io.execute([
             'bowtie2-build',
             self.reference_fasta_filename(),
             self/'bowtie',
             ],
         stdout = log_to,
         )
Example #12
    def run(self):
        base = os.path.split(self.prefix)[1]
        
        annotations = [ ]
        sequences = [ ]
        
        for filename in self.filenames:
            recognized = False
            if io.is_sequence_file(filename):
                sequences.append(filename)
                recognized = True
            if annotation.is_annotation_file(filename):
                annotations.append(filename)
                recognized = True
            assert recognized, 'File is neither a recognized sequence file nor an annotation file: ' + filename

        cytoband_filename = os.path.join(self.prefix,base+'_cytoband.txt')
        property_filename = os.path.join(self.prefix,'property.txt')
        gff_filename = os.path.join(self.prefix,base+'.gff')
        output_filenames = [ cytoband_filename, property_filename, gff_filename ] 

        if not os.path.exists(self.prefix):
            os.mkdir(self.prefix)
            
        f = open(property_filename,'wb')
        print >> f, 'ordered=true'
        print >> f, 'id=%s' % base
        print >> f, 'name=%s' % (self.name or base)
        print >> f, 'cytobandFile=%s_cytoband.txt' % base
        print >> f, 'geneFile=%s.gff' % base
        print >> f, 'sequenceLocation=%s' % base
        f.close()
        
        trivia.As_gff(output=gff_filename,
               filenames=annotations,
               exclude=[ 'gene', 'source' ]
        ).run()
        
        f_cyt = open(cytoband_filename,'wb')
        for filename in sequences:
            for name, seq in io.read_sequences(filename):
                assert '/' not in name
                f = open(os.path.join(self.prefix, name + '.txt'), 'wb')
                f.write(seq)
                f.close()
                print >> f_cyt, '%s\t0\t%d' % (name, len(seq))
        f_cyt.close()
        
        genome_filename = self.prefix + '.genome'
        if os.path.exists(genome_filename):
            os.unlink(genome_filename)
        io.execute(
            ['zip', '-j', io.abspath(genome_filename)] +
            [ io.abspath(item) for item in output_filenames ]
        )
        for filename in output_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
Example #13
 def run(self):
     with workspace.tempspace() as temp:
         with open(temp/'batch.txt','wb') as f:
             print >> f, 'new'
             print >> f, 'genome '+os.path.abspath(self.genome)
             for filename in self.files:
                 print >> f, 'load '+os.path.abspath(filename)
         
         io.execute(['java','-jar',io.find_jar('igv.jar'),'-b',temp/'batch.txt'])
 def build_shrimp_mmap(self, cs=False):
     suffix = '-cs' if cs else '-ls'
     
     grace.status('Building SHRiMP mmap')
     io.execute([
         'gmapper' + suffix,
         '--save', self.object_filename('reference' + suffix),
         self.reference_fasta_filename(),
     ])
     grace.status('')
Example #15
 def run(self):
     from nesoni import io
     
     f_in = self.begin_input()
     f_out = self.begin_output()
     try:
         io.execute(self.command, stdin=f_in, stdout=f_out)
     finally:
         self.end_output(f_out)
         self.end_input(f_in)
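A rough sketch of what io.execute appears to do with stdin/stdout here, built on subprocess (an assumption based on how these examples use it, not nesoni's actual implementation):

import subprocess

def execute_sketch(command, stdin=None, stdout=None):
    # Run the command with the given file objects attached,
    # failing loudly on a nonzero exit status.
    status = subprocess.call(command, stdin=stdin, stdout=stdout)
    assert status == 0, 'Command failed: %r' % command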
Example #16
 def run(self):
     with workspace.tempspace() as temp:
         with open(temp/'batch.txt','wb') as f:
             print >> f, 'new'
             print >> f, 'preference LAST_TRACK_DIRECTORY', os.getcwd()
             print >> f, 'preference LAST_GENOME_IMPORT_DIRECTORY', os.getcwd()
             print >> f, 'genome '+os.path.abspath(self.genome)
             for filename in self.files:
                 print >> f, 'load '+os.path.abspath(filename)
         
         io.execute(['java','-jar',io.find_jar('igv.jar'),'-b',temp/'batch.txt'])
Example #17
    def run(self):
        reference = reference_directory.Reference(self.reference, must_exist=True)
        
        jar = io.find_jar('snpEff.jar')
        
        with open(self.prefix + '.vcf','wb') as f:
            io.execute('java -jar JAR eff GENOME VCF -c CONFIG',
                JAR=jar, GENOME=reference.name, VCF=self.vcf, CONFIG=reference/'snpeff.config',
                stdout=f)

        index_vcf(self.prefix+'.vcf')
    def build_shrimp_mmap(self, cs=False):
        suffix = '-cs' if cs else '-ls'

        grace.status('Building SHRiMP mmap')
        io.execute([
            'gmapper' + suffix,
            '--save',
            self.object_filename('reference' + suffix),
            self.reference_fasta_filename(),
        ])
        grace.status('')
Example #19
    def run(self):
        assert self.release
        assert self.species
        assert self.assembly
        assert self.dna
        
        extractions = [ ]
        for item in self.genes.split(','):
            extraction = item.split('/')
            assert len(extraction) == 4
            extractions.append(extraction)
            
        rename = { }
        if self.rename:
            for item in self.rename.split(','):
                old,new = item.split('=')
                rename[old] = new

        work = self.get_workspace()        
        ensembl = workspace.Workspace(work/'ensembl')
        
        genome_filename = self.species+"."+self.assembly+"."+self.dna+".fa.gz"
        genome_url = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release+"/fasta/"+self.species.lower()+"/dna/"+genome_filename
        
        gff_filename = self.species+"."+self.assembly+"."+self.release+".gff3.gz"
        gff_url = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release+"/gff3/"+self.species.lower()+"/"+gff_filename
        
        
        if self.download:
            self.log.log("Fetching "+genome_url+"\n")
            io.execute(['rsync','-aP',genome_url, ensembl/genome_filename])
            self.log.log("Fetching "+gff_url+"\n")
            io.execute(['rsync','-aP',gff_url, ensembl/gff_filename])
        
        with workspace.tempspace() as temp:
            items = list(annotation.read_annotations(ensembl/gff_filename))
            for item in items:
                item.seqid = rename.get(item.seqid, item.seqid)
            annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
            del items
            
            with open(temp/'temp.fa','wb') as f:
                for name,seq in io.read_sequences(ensembl/genome_filename):
                    name = name.split()[0]
                    name = rename.get(name,name)
                    io.write_fasta(f, name, seq)
            
            reference_directory.Make_tt_reference(
                self.output_dir,
                filenames = [ temp/'temp.fa', temp/'temp.gff' ],
                index = self.index,
                ).run()
Example #20
    def run(self):
        reference = reference_directory.Reference(self.reference,
                                                  must_exist=True)

        jar = io.find_jar('snpEff.jar')

        with open(self.prefix + '.vcf', 'wb') as f:
            io.execute('java -jar JAR eff GENOME VCF -c CONFIG',
                       JAR=jar,
                       GENOME=reference.name,
                       VCF=self.vcf,
                       CONFIG=reference / 'snpeff.config',
                       stdout=f)

        index_vcf(self.prefix + '.vcf')
 def run(self):
     from nesoni import io
     
     assert self.command, 'Nothing to execute!'
     
     print self.ident()
     
     f_in = self.begin_input()
     f_out = self.begin_output()
     try:
         io.execute(self.command[:1] + self.execution_options + self.command[1:], 
                    stdin=f_in, stdout=f_out)
     finally:
         self.end_output(f_out)
         self.end_input(f_in)
 def run(self):
     work = self.get_workspace()
     acc = self.run_accession
     io.execute(
         'wget -c URL',
         URL='http://ftp-private.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/%s/%s/%s/%s.sra'
             % (acc[:3],acc[:6],acc,acc),
         cwd=work.working_dir,
         )
     
     io.execute(
         'fastq-dump --split-files --bzip2 FILENAME',
         FILENAME=acc+'.sra',
         cwd=work.working_dir,
         )
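Both SRA URLs in these examples are dated (Example #24 below already swaps ftp-private for ftp-trace). With current sra-tools the equivalent fetch would use prefetch and fasterq-dump, sketched here assuming both are on PATH (fasterq-dump does not compress, so the bzip2 step would need to happen separately):

io.execute('prefetch ACC', ACC=acc, cwd=work.working_dir)
io.execute('fasterq-dump --split-files ACC', ACC=acc, cwd=work.working_dir)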
Example #23
    def run(self):
        from nesoni import io

        assert self.command, 'Nothing to execute!'

        print self.ident()

        f_in = self.begin_input()
        f_out = self.begin_output()
        try:
            io.execute(self.command[:1] + self.execution_options +
                       self.command[1:],
                       stdin=f_in,
                       stdout=f_out)
        finally:
            self.end_output(f_out)
            self.end_input(f_in)
Example #24
    def run(self):
        work = self.get_workspace()
        acc = self.run_accession
        io.execute(
            'wget -c URL',
            #URL='http://ftp-private.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/%s/%s/%s/%s.sra'
            URL=
            'http://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/%s/%s/%s/%s.sra'
            % (acc[:3], acc[:6], acc, acc),
            cwd=work.working_dir,
        )

        io.execute(
            'fastq-dump --split-files --bzip2 FILENAME',
            FILENAME='./' + acc + '.sra',
            cwd=work.working_dir,
        )
Example #25
 def run(self):
     assert self.sort in ('queryname', 'coordinate')
     
     jar = io.find_jar('MergeSamFiles.jar', 'MergeSamFiles is part of the Picard package.')
     io.execute([
         'java','-jar',jar,
         'USE_THREADING=true',
         'TMP_DIR='+tempfile.gettempdir(), #Force picard to use same temp dir as Python
         'SORT_ORDER='+self.sort,
         'OUTPUT='+self.prefix+'.bam'
         ] + [ 'INPUT='+item for item in self.bams ])
     
     if self.sort == 'coordinate' and self.index:
         jar = io.find_jar('BuildBamIndex.jar', 'BuildBamIndex is part of the Picard package.')
         io.execute([
             'java','-jar',jar,
             'INPUT='+self.prefix+'.bam'
             ])
Example #27
 def href(self, filename, title=None, image=False):
     relative = self.workspace.path_as_relative_path(filename)
     if title is None:
        title = os.path.split(filename)[1]
        
        size = os.stat(filename).st_size
        if size >= 1<<30:
            title += ' (%.1fGb)' % (float(size)/(1<<30))
        elif size >= 1<<20: 
            title += ' (%.1fMb)' % (float(size)/(1<<20))
        elif size >= 1<<10: 
            title += ' (%.1fkb)' % (float(size)/(1<<10))
     
     if image:
        thumb_name = 'thumb-'+os.path.splitext(relative)[0]+'.png'
        thumb_filename = self.workspace/thumb_name
        io.execute(['convert', '-thumbnail', '50x50', filename, thumb_filename])
        title = ('<span style="display: inline-block; width: 50px;"><img src="%s"/></span> ' % thumb_name) + title 
        
     return '<a href="%s">%s</a>' % (relative, title)
    def set_sequences(self, filenames):        
        reference_genbank_filename = self / 'reference.gbk'
        reference_filename = self / 'reference.fa'

        reference_genbank_file = open(reference_genbank_filename,'wb')
        any_genbank = [ False ]

        def genbank_callback(name, record):
            """ Make a copy of any genbank files passed in. """
            from Bio import SeqIO
            
            SeqIO.write([record], reference_genbank_file, 'genbank')
            
            f = open(self / (grace.filesystem_friendly_name(name) + '.gbk'), 'wb')
            SeqIO.write([record], f, 'genbank')
            f.close()
            
            any_genbank[0] = True
        
        lengths = [ ]
        seen = set()
        f = open(reference_filename, 'wb')
        for filename in filenames:
            for name, seq in io.read_sequences(filename, genbank_callback=genbank_callback):
                name = name.split()[0]
                assert name not in seen, 'Duplicate chromosome name: ' + name
                seen.add(name)
                lengths.append( (name, len(seq)) )
                io.write_fasta(f, name, seq)
        f.close()        
        self.set_object(lengths, 'reference-lengths.pickle.gz')
        
        reference_genbank_file.close()
        if not any_genbank[0]:
            os.unlink(reference_genbank_filename)
            
        # Create an index of the reference sequences for samtools
        io.execute([
            'samtools', 'faidx', reference_filename
        ])
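samtools faidx writes reference.fa.fai, a tab-separated index whose first two columns are sequence name and length, so the lengths pickled above could equally be recovered from it. A minimal sketch:

def lengths_from_fai(fai_filename):
    lengths = [ ]
    with open(fai_filename, 'rU') as f:
        for line in f:
            parts = line.rstrip('\n').split('\t')
            lengths.append((parts[0], int(parts[1])))
    return lengths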
Example #29
    def get_table(self, table_name):
        if table_name not in self.tables:
            if self.action.download: 
                for filename in [ table_name+'.txt.gz', table_name+'.sql' ]:
                    io.execute(['rsync','-P','rsync://hgdownload.cse.ucsc.edu/goldenPath/'+self.action.ucsc_name+'/database/'+filename, self.ucsc/filename])
            
            fields = [ ]
            with open(self.ucsc/table_name+'.sql','rU') as f:
                for line in f:
                    if line.startswith('  `'):
                        parts = line.strip().split()
                        assert parts[0][0] == '`' and parts[0][-1] == '`'
                        fields.append(parts[0][1:-1])

            tup_class = collections.namedtuple(table_name, fields)
            data = [ ]
            with gzip.open(self.ucsc/table_name+'.txt.gz','rb') as f:
                for line in f:
                    data.append(tup_class(* line.rstrip('\n').split('\t') ))
            self.tables[table_name] = data
         
        return self.tables[table_name]
Example #30
    def run(self):
        workspace = working_directory.Working(self.output_dir)        
        workspace.setup_reference(self.reference)
        workspace.update_param(snp_cost = self.snp_cost)
        
        #assert os.path.exists(self.reference), 'Reference file does not exist'
        #reference_filename = workspace._object_filename('reference.fa')
        #if os.path.exists(reference_filename):
        #   os.unlink(reference_filename)
        #os.symlink(os.path.relpath(self.reference, self.output_dir), reference_filename)
        
        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')

        if sam.is_bam(self.input):
            sort_input_filename = self.input
            temp_filename = None
        else:
            temp_filename = io.abspath(self.output_dir, 'temp.bam')
            sort_input_filename = temp_filename
            writer = io.Pipe_writer(temp_filename, ['samtools', 'view', '-S', '-b', '-'])
            f = open(self.input, 'rb')
            while True:
                data = f.read(1<<20)
                if not data: break
                writer.write(data)
            writer.close()
            f.close()
        
        grace.status('Sort')
        
        io.execute([
            'samtools', 'sort', '-n', sort_input_filename, bam_prefix
        ])
        
        if temp_filename is not None:
            os.unlink(temp_filename)
        
        grace.status('')
Example #31
    def href(self, filename, title=None, image=False):
        relative = self.workspace.path_as_relative_path(filename)
        if title is None:
            title = os.path.split(filename)[1]

            size = os.stat(filename).st_size
            if size >= 1 << 30:
                title += ' (%.1fGb)' % (float(size) / (1 << 30))
            elif size >= 1 << 20:
                title += ' (%.1fMb)' % (float(size) / (1 << 20))
            elif size >= 1 << 10:
                title += ' (%.1fkb)' % (float(size) / (1 << 10))

        if image:
            thumb_name = 'thumb-' + relative
            thumb_filename = self.workspace / thumb_name
            io.execute(
                ['convert', '-thumbnail', '50x50', filename, thumb_filename])
            title = (
                '<span style="display: inline-block; width: 50px;"><img src="%s"/></span> '
                % thumb_name) + title

        return '<a href="%s">%s</a>' % (relative, title)
Example #32
    def run(self):
        workspace = working_directory.Working(self.output_dir)
        workspace.setup_reference(self.reference)

        # assert os.path.exists(self.reference), 'Reference file does not exist'
        # reference_filename = workspace._object_filename('reference.fa')
        # if os.path.exists(reference_filename):
        #   os.unlink(reference_filename)
        # os.symlink(os.path.relpath(self.reference, self.output_dir), reference_filename)

        bam_filename = io.abspath(self.output_dir, "alignments.bam")
        bam_prefix = io.abspath(self.output_dir, "alignments")

        if sam.is_bam(self.input):
            sort_input_filename = self.input
            temp_filename = None
        else:
            temp_filename = io.abspath(self.output_dir, "temp.bam")
            sort_input_filename = temp_filename
            writer = io.Pipe_writer(temp_filename, ["samtools", "view", "-S", "-b", "-"])
            f = open(self.input, "rb")
            while True:
                data = f.read(1 << 20)
                if not data:
                    break
                writer.write(data)
            writer.close()
            f.close()

        grace.status("Sort")

        io.execute(["samtools", "sort", "-n", sort_input_filename, bam_prefix])

        if temp_filename is not None:
            os.unlink(temp_filename)

        grace.status("")
Example #33
    def run(self):
        workspace = working_directory.Working(self.output_dir)
        workspace.setup_reference(self.reference)

        #assert os.path.exists(self.reference), 'Reference file does not exist'
        #reference_filename = workspace._object_filename('reference.fa')
        #if os.path.exists(reference_filename):
        #   os.unlink(reference_filename)
        #os.symlink(os.path.relpath(self.reference, self.output_dir), reference_filename)

        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')

        if sam.is_bam(self.input):
            sort_input_filename = self.input
            temp_filename = None
        else:
            temp_filename = io.abspath(self.output_dir, 'temp.bam')
            sort_input_filename = temp_filename
            writer = io.Pipe_writer(temp_filename,
                                    ['samtools', 'view', '-S', '-b', '-'])
            f = open(self.input, 'rb')
            while True:
                data = f.read(1 << 20)
                if not data: break
                writer.write(data)
            writer.close()
            f.close()

        grace.status('Sort')

        io.execute(['samtools', 'sort', '-n', sort_input_filename, bam_prefix])

        if temp_filename is not None:
            os.unlink(temp_filename)

        grace.status('')
   def run(self):
       assert self.method in ("limma", "fitnoise1", "fitnoise2"), "Unknown method."
       assert self.method != "limma" or not self.empirical_controls
       
       title = self.get_title()
   
       n_alt = len(self.alt)
       n_null = len(self.null)
       
       suffix = '-dedup' if self.dedup else ''
   
       genewise_filename = join(self.analysis,'expression','genewise'+suffix,'counts.csv')
       genewise_norm_filename = join(self.analysis,'expression','genewise'+suffix,'norm.csv')

       primarypeakwise_filename = join(self.analysis,'expression','primarypeakwise'+suffix,'counts.csv')
       primarypeakwise_norm_filename = join(self.analysis,'expression','primarypeakwise'+suffix,'norm.csv')

       peakwise_filename = join(self.analysis,'expression','peakwise'+suffix,'counts.csv')
       peakwise_norm_filename = join(self.analysis,'expression','peakwise'+suffix,'norm.csv')

       pairwise_filename = join(self.analysis,'peak-shift'+suffix,'individual-pairs.csv')
       pairwise_norm_filename = join(self.analysis,'peak-shift'+suffix,'individual-pairs-norm.csv')

   
       reader = io.Table_reader(genewise_filename, 'Count')
       reader.close()
       samples = [ item for i, item in enumerate(reader.headings) if reader.groups[i] == 'Count' ]
       tags = { }
       for item in samples:
           tags[item] = [ item ]
       for line in reader.comments:
           if line.startswith('#sampleTags='):
               parts = line[len('#sampleTags='):].split(',')
               tags[parts[0]] = parts
              
       model = [ ]
       for term in self.alt + self.null:        
           spec = selection.term_specification(term)
           model.append([ selection.weight(spec, tags[item]) for item in samples ])
       model = zip(*model) #Transpose
       
       select = [ any(row) for row in model ]
       model = [ row for row,selected in zip(model,select) if selected ]
       model_columns = [ selection.term_name(item) for item in self.alt + self.null ]
       model_rows = [ item for keep, item in zip(select, samples) if keep ]
       
       #degust complains if name starts with '-', delimits with commas
       model_columns = [ ('.' if item[:1] == '-' else '') + item.replace(',',';') for item in model_columns ]
       
       pairs_n_alt = n_alt       
       pairs_select = select + select
       pairs_model = (
           [ (0,) * n_alt + row + (0,) for row in model ] +
           [ row[:n_alt]  + row + (1,) for row in model ] 
           )
       pairs_model_columns = (
           [ item+'-interaction' for item in model_columns[:n_alt] ] +
           model_columns +
           [ 'pair2' ]
           )
       pairs_model_rows = [ item+'-peak1' for item in model_rows ] + [ item+'-peak2' for item in model_rows ]
       
       
       design_str = '['+('-'*(8*n_alt-2))+'] test coefficients\n'
       for row, name in zip(model, model_rows):
           design_str += "%s %s\n" % (''.join('%7g ' % item for item in row), name)
       
       print
       print "Design matrix"
       print design_str
       print
       print 'Pair design matrix'
       print '['+('-'*(8*n_alt-2))+'] test coefficients'
       for row, name in zip(pairs_model, pairs_model_rows):
           print ''.join('%7g ' % item for item in row), name
       print
       
       
       workspace = self.get_workspace()
       
       runr.run_script(TEST_R, self.tell,
           DIR = workspace.working_dir,
           METHOD = self.method,
           WEIGHT = self.weight,
           EMPIRICAL_CONTROLS = self.empirical_controls,
           MIN_READS = self.min_reads,
           BIOTYPE = self.biotype,
           RELATION = self.relation,
           QUANTILE_TAIL = self.quantile_tail,
           DO_EXPRESSION = self.do_expression,
           DO_TAIL_LENGTH = self.do_tail_length,
           VERBOSE = self.verbose,
           
           GENEWISE_FILENAME = genewise_filename,
           GENEWISE_NORM_FILENAME = genewise_norm_filename,
           PRIMARYPEAKWISE_FILENAME = primarypeakwise_filename,
           PRIMARYPEAKWISE_NORM_FILENAME = primarypeakwise_norm_filename,
           PEAKWISE_FILENAME = peakwise_filename,
           PEAKWISE_NORM_FILENAME = peakwise_norm_filename,
           PAIRWISE_FILENAME = pairwise_filename,
           PAIRWISE_NORM_FILENAME = pairwise_norm_filename,
           
           N_ALT = n_alt,
           SELECT = select,
           MODEL = model,
           MODEL_COLUMNS = model_columns,
           PAIRS_N_ALT = pairs_n_alt,
           PAIRS_SELECT = pairs_select,
           PAIRS_MODEL = pairs_model,
           PAIRS_MODEL_COLUMNS = pairs_model_columns,
           )
       if self.tell: return
       
       reporter = reporting.Reporter(workspace.working_dir, title, style=web.style())
       
       if self.dedup:
           reporter.p('Read deduplication was used.')
       
       reporter.write('<table>\n')
       for is_expression, entities, result, aveexpr, subtitle, terms in [
           (True, 'genes', 'genewise-voom', 'avg.expression', 'Genewise expression level', model_columns[:n_alt]),
           (False, 'genes', 'genewise-tail', 'avg.tail', 'Genewise tail length', model_columns[:n_alt]),
           (True, 'primary peaks', 'primarypeakwise-voom', 'avg.expression', 'Primary-peakwise expression level', model_columns[:n_alt]),
           (False, 'primary peaks', 'primarypeakwise-tail', 'avg.tail', 'Primary-peakwise tail length', model_columns[:n_alt]),
           (True, 'peaks', 'peakwise-voom', 'avg.expression', 'Peakwise expression level', model_columns[:n_alt]),
           (False, 'peaks', 'peakwise-tail', 'avg.tail', 'Peakwise tail length', model_columns[:n_alt]),
           (True, 'peak pairs', 'pairwise-voom', 'avg.expression', 'Peak-pair expression shift', pairs_model_columns[:n_alt]),
           (False, 'peak pairs', 'pairwise-tail', 'avg.tail', 'Peak-pair tail length shift', pairs_model_columns[:n_alt]),
           ]:
           #data = io.read_grouped_table(workspace/(result+'-toptable.csv'))['All']
           #n = 0
           #n_01 = 0
           #n_05 = 0
           #for row in data.values():
           #    fdr = float(row['adj.P.Val'])
           #    if fdr <= 0.01: n_01 += 1
           #    if fdr <= 0.05: n_05 += 1
           #    n += 1
           
           if is_expression and not self.do_expression: continue
           if not is_expression and not self.do_tail_length: continue
           
           io.execute([
               'degust.py',
               '--name', title + ' : ' + subtitle,
               '--avg', aveexpr,
               '--primary', 'baseline',
               '--logFC', ','.join(terms),
               '--fdr', 'adj.P.Val',
               '--info', 'gene,locus_tag,product,reads,polya.reads,tail.lengths,'+aveexpr,
               '--notour', '1',
               '--out', workspace/(result+'.html'),
               workspace/(result+'-toptable.csv'),
               ])

           with open(workspace/(result+'.txt'),'rU') as f:
               lines = f.readlines()
           
           reporter.write('<tr><td valign="top" width="33%">')
           reporter.subheading( reporter.href(workspace/(result+'.html'), subtitle) )
           #reporter.p( '%d %s, %d with fdr&lt;=0.01, %d with fdr&lt;=0.05' % (n,entities,n_01,n_05) )
           line = reporter.href(workspace/(result+'-toptable.csv'), 'Spreadsheet')
           if result.endswith('voom'):
               line += ', ' + reporter.href(workspace/(result+'.png'), 'voom plot')
           reporter.p(line)
           for line in lines[-2:]:
               reporter.p(line.strip())
           reporter.write('</td><td valign="top"><br/><br/>')
           for line in lines[:-2]:
               reporter.write(line.strip() + '<br/>\n')
           reporter.write('</td></tr>')

       reporter.write('</table>\n')
       
       reporter.subheading("Design matrix")
       
       reporter.write('<pre>' + design_str + '</pre>')
       
       reporter.close()
Example #35
    def run(self):
        assert self.ucsc_name, 'Need a UCSC genome name'
        
        scratch = _ucsc_scratch(self)
        
        # Load annotations
        
        source = 'tt-ucsc-%s-%s' % (self.ucsc_name, self.table)
        
        table = scratch.get_table(self.table)
        get_name = scratch.getter(self.name)
        get_product = scratch.getter(self.product)

        mrnas = [ ]
        
        for item in table:
            ann = annotation.Annotation(
                seqid = item.chrom,
                source = source,
                type = 'mRNA',
                strand = {'+':1, '-':-1}[item.strand],
                start = int(item.txStart),
                end = int(item.txEnd),
                attr = {
                    'ID' : item.name,
                    'Name' : get_name(item),
                    'Product' : get_product(item),
                    #'UCSC_name2' : item.name2,
                    }
                )
            
            ann.record = item
            mrnas.append(ann)

        _uniquify_ids(mrnas)
        
        annotations = [ ]
        
        for group in _grouped_features(mrnas):
            ID = '/'.join(item.attr['ID'] for item in group)
            for item in group:
                item.attr['Parent'] = ID
                item.attr['ID'] = item.attr['ID'] + '-mRNA'
            
            annotations.append(annotation.Annotation(
                source = source,
                type = 'gene',
                seqid = group[0].seqid,
                strand = group[0].strand,
                start = min(item.start for item in group),
                end = max(item.end for item in group),
                attr = {
                    'ID' : ID,
                    'Name' : annotation_tools.join_descriptions([ item.attr['Name'] for item in group ], '/'),
                    'Product' : annotation_tools.join_descriptions([ item.attr['Product'] for item in group ], '/'),
                    #'UCSC_name2' : annotation_tools.join_descriptions([ item.attr['UCSC_name2'] for item in group ], '/'),
                    }
                ))
            for item in group:
                annotations.append(item)
                
                exonStarts = _parse_ints(item.record.exonStarts)
                exonEnds = _parse_ints(item.record.exonEnds)
                cdsStart = int(item.record.cdsStart)
                cdsEnd = int(item.record.cdsEnd)
                for start,end in zip(exonStarts,exonEnds):
                    annotations.append(annotation.Annotation(
                        source = source,
                        type = 'exon',
                        seqid = item.seqid,
                        strand = item.strand,
                        start = start,
                        end = end,
                        attr = {
                            'Parent' : item.attr['ID'],
                            }
                        ))
                    if max(cdsStart,start) < min(cdsEnd,end):
                        annotations.append(annotation.Annotation(
                            source = source,
                            type = 'CDS',
                            seqid = item.seqid,
                            strand = item.strand,
                            start = max(cdsStart,start),
                            end = min(cdsEnd,end),
                            #TODO: phase
                            attr = {
                                'Parent' : item.attr['ID'],
                                }
                            ))

        # Load sequence
        
        if self.download:
            io.execute(['rsync','-P','rsync://hgdownload.cse.ucsc.edu/goldenPath/'+self.ucsc_name+'/bigZips/chromFa.tar.gz',scratch.ucsc/'chromFa.tar.gz'])
        
        with workspace.tempspace() as temp:
            io.execute(['tar','-C',temp.working_dir,'-zxf',scratch.ucsc/'chromFa.tar.gz'])
            sequences = [ temp/item for item in natural_sorted(os.listdir(temp.working_dir)) ]
            
            with open(temp/'reference.gff','wb') as f:
                annotation.write_gff3_header(f)
                for item in annotations:
                    print >> f, item.as_gff()
            
            Make_tt_reference(
                self.output_dir,
                filenames = sequences + [ temp/'reference.gff' ],
                index = self.index,
                ).run()
Example #36
    def run(self):
        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)

        tags = {}
        for item in reader.metadata.get('sampleTags', []):
            parts = item.split(',')
            tags[parts[0]] = parts

        assert 'reference' not in reader.samples, 'Can\'t have a sample called reference, sorry.'

        samples = ['reference'] + reader.samples

        for sample in samples:
            if sample not in tags:
                tags[sample] = [sample, 'all']

        samples = selection.select_and_sort(self.select, self.sort, samples,
                                            lambda sample: tags[sample])

        required = [
            i for i, sample in enumerate(samples)
            if selection.matches(self.require, tags[sample])
        ]

        sample_number = dict((b, a) for a, b in enumerate(reader.samples))

        items = []
        for record in reader:
            variants = get_variants(record)
            genotypes = []
            counts = []
            qualities = []
            for sample in samples:
                if sample == 'reference':
                    genotypes.append([0])
                    counts.append([1])
                    qualities.append(float('inf'))
                else:
                    genotypes.append(
                        get_genotype(record.samples[sample_number[sample]]))
                    counts.append(
                        get_variant_counts(
                            record.samples[sample_number[sample]]))
                    qualities.append(
                        record.samples[sample_number[sample]].data.GQ)

            # Only output when there are at least two genotypes
            any_interesting = False
            for i in xrange(len(genotypes)):
                for j in xrange(i):
                    if (genotypes[i] is not None and genotypes[j] is not None
                            and
                            not genotypes_equal(genotypes[i], genotypes[j])):
                        any_interesting = True
                        break
                if any_interesting: break
            if not any_interesting:
                continue

            if any(genotypes[i] is None for i in required):
                continue

            if self.only_snps and any(genotype is not None and any(
                    len(variants[i]) != 1 for i in genotype)
                                      for genotype in genotypes):
                continue

            snpeff = snpeff_describe(record.INFO.get('EFF', ''))
            if not any(
                    selection.matches(self.snpeff_filter, item[1])
                    for item in (snpeff or [('', [])])):
                continue

            items.append(
                _Nway_record(variants=variants,
                             genotypes=genotypes,
                             counts=counts,
                             qualities=qualities,
                             snpeff=snpeff,
                             record=record))

        self.log.log('%d variants\n\n' % len(items))

        if self.as_ == 'table':
            self._write_table(samples, items)
        elif self.as_ == 'nexus':
            self._write_nexus(samples, items)
        elif self.as_ == 'splitstree':
            self._write_nexus(samples, items)

            io.execute(
                'SplitsTree +g -i INPUT -x COMMAND',
                no_display=True,
                INPUT=self.prefix + '.nex',
                COMMAND='UPDATE; '
                'SAVE FILE=\'%s.nex\' REPLACE=yes; '
                'EXPORTGRAPHICS format=svg file=\'%s.svg\' REPLACE=yes TITLE=\'NeighborNet from %d variants\'; '
                'QUIT' % (self.prefix, self.prefix, len(items)),
            )
        elif self.as_ == 'vcf':
            self._write_vcf(samples, items, reader)

        else:
            raise grace.Error('Unknown output format: ' + self.as_)
Example #37
    def run(self):
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
    
        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
        
        working = self.get_workspace()
        working.setup_reference(self.references, bowtie=True)
        working.update_param(snp_cost=2.0)        
        reference = working.get_reference()
        
        log_file = open(self.log_filename(),'wb')
              
        with workspace.tempspace(dir=working.working_dir) as temp:
            n = [ 0 ]
            def tempname():
                n[0] += 1
                return temp/('%d.fq'%n[0])
            def convert(filename):
                info = io.get_file_info(filename)
                ok = selection.matches('type-fastq:[compression-none/compression-gzip/compression-bzip2]', info)
                if ok:
                    return filename            
                result_name = tempname()
                with open(result_name,'wb') as f:
                    for name, seq, qual in io.read_sequences(filename, qualities='required'):
                        io.write_fastq(f, name, seq, qual)
                return result_name
            
            ones = [ ]
            twos = [ ]
            singles = [ ]
            
            for pair in self.pairs:
                assert len(pair) == 2, 'Need two files in each "pair:" section.'
                ones.append(convert(pair[0]))
                twos.append(convert(pair[1]))
            
            for item in self.interleaved:
                left_name = tempname()
                right_name = tempname()
                ones.append(left_name)
                twos.append(right_name)
                with open(left_name,'wb') as left, \
                     open(right_name,'wb') as right:
                    reader = io.read_sequences(item, qualities='required')
                    while True:
                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            break
                        io.write_fastq(left, name,seq,qual)
                        
                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            raise grace.Error('Interleaved file contains odd number of sequences')
                        io.write_fastq(right, name,seq,qual)
            
            for item in self.reads:
                singles.append(convert(item))

            cores = min(self.cores, legion.coordinator().get_cores())

            command = (
                [ 'bowtie2', 
                    '--threads', str(cores),
                    '--rg-id', '1',
                    '--rg', 'SM:'+working.name,                    
                    ] + 
                self.bowtie_options + 
                [ '-x', reference.get_bowtie_index_prefix() ]
                )
            commands = [ ]
            if ones:
                commands.append(command + [ '-1', ','.join(ones), '-2', ','.join(twos) ])
            if singles:
                commands.append(command + [ '-U', ','.join(singles) ])
            
            temp_bam_name = temp/'temp.bam'

            with io.pipe_to(
                     ['samtools', 'view', '-S', '-b', '-'],
                     stdout=open(temp_bam_name,'wb'),
                     stderr=log_file
                     ) as f:
                header_sent = False
                for command in commands:
                    self.log.log('Running:\n' + ' '.join(command) + '\n')            
                    with io.pipe_from(
                        command,
                        stderr=log_file,
                        cores=cores
                        ) as f_out:
                        for line in f_out:
                            if not header_sent or not line.startswith('@'):
                                f.write(line)
                    header_sent = True

            io.execute([
                'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
                ])
            
        log_file.close()
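Note how only the first bowtie2 invocation contributes a SAM header: the header_sent flag drops '@' lines from later runs, so the concatenated stream piped into samtools view remains a single valid SAM file.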
Example #38
def sort_and_index_bam(in_filename, out_prefix, cores=8):
    sort_bam(in_filename, out_prefix, cores=cores)
    
    io.execute([
        'samtools', 'index', out_prefix + '.bam'
        ])
    def run(self):    
        work = self.get_workspace()
        work.update_param(remove=['tail_tools_reference_version'])
        
        nesoni.Make_reference(
            self.output_dir,
            filenames = self.filenames,
            snpeff = False,
            cs = 'ifavailable' if self.index and self.shrimp else False,
            ls = False,
            bowtie = 'ifavailable' if self.index and self.bowtie else False,
            ).run()
            
        annotations = list(annotation.read_annotations(work/'reference.gff'))
        annotation.link_up_annotations(annotations)
        
        exon_index = span_index.index_annotations([
            item for item in annotations if item.type == "exon"
            ])
        mrna_end_index = span_index.index_annotations([
            item.three_prime() for item in annotations if item.type == "mRNA"
            ])
        
        mrna_utrs = [ ]
        gene_utrs = [ ]
        
        for gene in annotations:
            if gene.type != 'gene': continue

            mrnas = [ item for item in gene.children if item.type == 'mRNA' ]
            assert mrnas, "Gene without any mRNAs: "+gene.get_id()

            gene.attr['color'] = '#880088'
            gene.start = min(item.start for item in mrnas)
            gene.end = max(item.end for item in mrnas)
            gene.attr["max_extension"] = str(_max_extension(gene, exon_index, mrna_end_index))
        
            gene_utr_5primes = [ ]
            
            for mrna in mrnas:
                assert mrna.strand == gene.strand, mrna
                assert mrna.seqid == gene.seqid, mrna
                
                mrna.attr["max_extension"] = str(_max_extension(mrna, exon_index, mrna_end_index))
            
                cdss = [ item for item in mrna.children if item.type == 'CDS' ]
                exons = [ item for item in mrna.children if item.type == 'exon' ]
                
                if not exons: continue
                
                #link up annotations sorts children, so final is really final
                for item in exons[:-1]:
                    item.attr["max_extension"] = "0"
                exons[-1].attr["max_extension"] = mrna.attr["max_extension"]
                
                if not cdss: continue
                
                mrna_utr_5primes = [ ]
                if gene.strand >= 0:
                   cds_3prime = max(item.end for item in cdss)
                   for item in exons:
                       if item.end >= cds_3prime:
                           mrna_utr_5primes.append(max(item.start,cds_3prime))
                else:
                   cds_3prime = min(item.start for item in cdss)
                   for item in exons:
                       if item.start <= cds_3prime:
                           mrna_utr_5primes.append(min(item.end,cds_3prime))
                
                if mrna.strand >= 0:
                    utr_start = min(mrna_utr_5primes) if mrna_utr_5primes else mrna.end
                    utr_end = max(utr_start+1,mrna.end)
                    gene_utr_5primes.append(utr_start)
                else:
                    utr_end = max(mrna_utr_5primes) if mrna_utr_5primes else mrna.start
                    utr_start = min(mrna.start,utr_end-1)
                    gene_utr_5primes.append(utr_end)
                
                attr = mrna.attr.copy()
                attr['Parent'] = attr['ID']
                attr['ID'] = attr['ID']+'-3UTR'
                attr['color'] = '#008888'
                utr = annotation.Annotation(
                    source = 'tt',
                    type = 'three_prime_utr',
                    seqid = mrna.seqid,
                    strand = mrna.strand,
                    start = utr_start,
                    end = utr_end,
                    attr = attr,
                    )
                max_ext = _max_extension(utr, exon_index, mrna_end_index)
                utr.attr["max_extension"] = str(max_ext)
                #Only include if there is an annotated 3' UTR or end is not in the middle of some other isoform's exon
                if utr_end-utr_start+max_ext > 1:
                    mrna_utrs.append(utr)
            
            if gene.strand >= 0:
                utr_start = max(gene_utr_5primes) if gene_utr_5primes else gene.end
                utr_end = max(utr_start+1,gene.end)
            else:
                utr_end = min(gene_utr_5primes) if gene_utr_5primes else gene.start
                utr_start = min(gene.start,utr_end-1)
            
            attr = gene.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID']+'-3UTR'
            attr['color'] = '#008888'
            utr = annotation.Annotation(
                source = 'tt',
                type = 'three_prime_utr',
                seqid = gene.seqid,
                strand = gene.strand,
                start = utr_start,
                end = utr_end,
                attr = attr,
                )
            utr.attr["max_extension"] = str(_max_extension(utr, exon_index, mrna_end_index))
            gene_utrs.append(utr)
        
        annotation.write_gff3(work/'reference.gff', annotations + mrna_utrs)
        annotation.write_gff3(work/'utr.gff', gene_utrs)
        
        if self.index and self.star and grace.can_execute("STAR"):
            star_work = workspace.Workspace(work/'star')
            io.execute([
                'STAR','--runMode','genomeGenerate',
                '--outFileNamePrefix',star_work.working_dir+'/',
                '--genomeDir',star_work.working_dir,
                '--genomeFastaFiles',work/'reference.fa',
                '--sjdbGTFfile',work/'reference.gff',
                '--sjdbGTFtagExonParentTranscript','Parent',
                '--sjdbOverhang','100',
                ])
            
        work.update_param(tail_tools_reference_version=work.VERSION)
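The hard-coded --sjdbOverhang 100 suits reads of about 101bp; STAR's manual recommends max(read length) - 1. A hedged parameterization, with read_length as a hypothetical input:

read_length = 76
sjdb_overhang = max(1, read_length - 1)
# then pass '--sjdbOverhang', str(sjdb_overhang) in the STAR command list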
Example #40
    def run(self):
        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)

        tags = { }
        for item in reader.metadata.get('sampleTags',[]):
            parts = item.split(',')
            tags[parts[0]] = parts
        
        assert 'reference' not in reader.samples, 'Can\'t have a sample called reference, sorry.'

        samples = [ 'reference'] + reader.samples
        
        for sample in samples:
            if sample not in tags:
                tags[sample] = [ sample, 'all' ]

        samples = selection.select_and_sort(
            self.select, self.sort, samples, lambda sample: tags[sample])
        
        required = [ i for i, sample in enumerate(samples)
                     if selection.matches(self.require, tags[sample]) ]
        
        sample_number = dict((b,a) for a,b in enumerate(reader.samples))
        
        items = [ ]
        for record in reader:
            variants = get_variants(record)
            genotypes = [ ]
            counts = [ ]
            qualities = [ ]
            for sample in samples:
                if sample == 'reference':
                    genotypes.append([0])
                    counts.append([1])
                    qualities.append(float('inf'))
                else:
                    genotypes.append(get_genotype(record.samples[sample_number[sample]]))
                    counts.append(get_variant_counts(record.samples[sample_number[sample]]))
                    qualities.append(record.samples[sample_number[sample]].data.GQ)

            # Only output when there are at least two genotypes            
            any_interesting = False
            for i in xrange(len(genotypes)):
                for j in xrange(i):
                    if (genotypes[i] is not None and genotypes[j] is not None and
                        not genotypes_equal(genotypes[i], genotypes[j])):
                        any_interesting = True
                        break
                if any_interesting: break
            if not any_interesting:
                continue

            if any(genotypes[i] is None for i in required):
                continue
                
            if self.only_snps and any(
                genotype is not None and any(len(variants[i]) != 1 for i in genotype)
                for genotype in genotypes):
                continue
                
            snpeff = snpeff_describe(record.INFO.get('EFF',''))
            if not any( selection.matches(self.snpeff_filter, item[1]) for item in (snpeff or [('',[])]) ):
                continue

            items.append(_Nway_record(variants=variants, genotypes=genotypes, counts=counts, qualities=qualities, snpeff=snpeff, record=record))
        
        self.log.log('%d variants\n\n' % len(items))
        
        if self.as_ == 'table':
            self._write_table(samples, items)
        elif self.as_ == 'nexus':
            self._write_nexus(samples, items)
        elif self.as_ == 'splitstree':
            self._write_nexus(samples, items)
            
            io.execute(
                'SplitsTree +g -i INPUT -x COMMAND',
                no_display=True,
                INPUT=self.prefix + '.nex',
                COMMAND='UPDATE; '
                        'SAVE FILE=\'%s.nex\' REPLACE=yes; '
                        'EXPORTGRAPHICS format=svg file=\'%s.svg\' REPLACE=yes TITLE=\'NeighborNet from %d variants\'; ' 
                        'QUIT' 
                        % (self.prefix, self.prefix, len(items)),
                )
        elif self.as_ == 'vcf':
            self._write_vcf(samples, items, reader)
        
        else:
            raise grace.Error('Unknown output format: '+self.as_)
Example #41
    def run(self):
        assert self.method in ("limma", "fitnoise1",
                               "fitnoise2"), "Unknown method."
        assert self.method != "limma" or not self.empirical_controls

        title = self.get_title()

        n_alt = len(self.alt)
        n_null = len(self.null)

        suffix = '-dedup' if self.dedup else ''

        genewise_filename = join(self.analysis, 'expression',
                                 'genewise' + suffix, 'counts.csv')
        genewise_norm_filename = join(self.analysis, 'expression',
                                      'genewise' + suffix, 'norm.csv')

        primarypeakwise_filename = join(self.analysis, 'expression',
                                        'primarypeakwise' + suffix,
                                        'counts.csv')
        primarypeakwise_norm_filename = join(self.analysis, 'expression',
                                             'primarypeakwise' + suffix,
                                             'norm.csv')

        peakwise_filename = join(self.analysis, 'expression',
                                 'peakwise' + suffix, 'counts.csv')
        peakwise_norm_filename = join(self.analysis, 'expression',
                                      'peakwise' + suffix, 'norm.csv')

        pairwise_filename = join(self.analysis, 'peak-shift' + suffix,
                                 'individual-pairs.csv')
        pairwise_norm_filename = join(self.analysis, 'peak-shift' + suffix,
                                      'individual-pairs-norm.csv')

        reader = io.Table_reader(genewise_filename, 'Count')
        reader.close()
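        # (Only the header is needed here: Table_reader exposes headings,
        # groups and comments once constructed, so it is closed immediately.)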
        samples = [
            item for i, item in enumerate(reader.headings)
            if reader.groups[i] == 'Count'
        ]
        tags = {}
        for item in samples:
            tags[item] = [item]
        for line in reader.comments:
            if line.startswith('#sampleTags='):
                parts = line[len('#sampleTags='):].split(',')
                tags[parts[0]] = parts
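        # e.g. a comment line '#sampleTags=sampleA,mutant,rep1' gives sample
        # 'sampleA' the tags ['sampleA', 'mutant', 'rep1']; a sample's own
        # name is always its first tag.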

        model = []
        for term in self.alt + self.null:
            spec = selection.term_specification(term)
            model.append(
                [selection.weight(spec, tags[item]) for item in samples])
        model = zip(*model)  #Transpose
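        # After the transpose, model has one row per sample and one column per
        # term: a standard samples-by-coefficients design matrix.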

        select = [any(row) for row in model]
        model = [row for row, selected in zip(model, select) if selected]
        model_columns = [
            selection.term_name(item) for item in self.alt + self.null
        ]
        model_rows = [item for keep, item in zip(select, samples) if keep]

        # degust rejects names starting with '-' and uses commas as delimiters
        model_columns = [
            ('.' if item[:1] == '-' else '') + item.replace(',', ';')
            for item in model_columns
        ]

        pairs_n_alt = n_alt
        pairs_select = select + select
        pairs_model = ([(0, ) * n_alt + row + (0, ) for row in model] +
                       [row[:n_alt] + row + (1, ) for row in model])
        pairs_model_columns = (
            [item + '-interaction'
             for item in model_columns[:n_alt]] + model_columns + ['pair2'])
        pairs_model_rows = [item + '-peak1' for item in model_rows
                            ] + [item + '-peak2' for item in model_rows]
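        # Illustration (toy case, not executed): with one alt term A and one
        # null term B, a sample whose model row is (a, b) contributes
        #   peak1 row: (0, a, b, 0)
        #   peak2 row: (a, a, b, 1)
        # so the leading 'A-interaction' column measures the peak2-vs-peak1
        # shift in the A effect, and 'pair2' absorbs the overall peak2 offset.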

        design_str = '[' + ('-' * (8 * n_alt - 2)) + '] test coefficients\n'
        for row, name in zip(model, model_rows):
            design_str += "%s %s\n" % (''.join('%7g ' % item
                                               for item in row), name)

        print
        print "Design matrix"
        print design_str
        print
        print 'Pair design matrix'
        print '[' + ('-' * (8 * n_alt - 2)) + '] test coefficients'
        for row, name in zip(pairs_model, pairs_model_rows):
            print ''.join('%7g ' % item for item in row), name
        print

        workspace = self.get_workspace()

        runr.run_script(
            TEST_R,
            self.tell,
            DIR=workspace.working_dir,
            METHOD=self.method,
            WEIGHT=self.weight,
            EMPIRICAL_CONTROLS=self.empirical_controls,
            MIN_READS=self.min_reads,
            BIOTYPE=self.biotype,
            RELATION=self.relation,
            QUANTILE_TAIL=self.quantile_tail,
            DO_EXPRESSION=self.do_expression,
            DO_TAIL_LENGTH=self.do_tail_length,
            VERBOSE=self.verbose,
            GENEWISE_FILENAME=genewise_filename,
            GENEWISE_NORM_FILENAME=genewise_norm_filename,
            PRIMARYPEAKWISE_FILENAME=primarypeakwise_filename,
            PRIMARYPEAKWISE_NORM_FILENAME=primarypeakwise_norm_filename,
            PEAKWISE_FILENAME=peakwise_filename,
            PEAKWISE_NORM_FILENAME=peakwise_norm_filename,
            PAIRWISE_FILENAME=pairwise_filename,
            PAIRWISE_NORM_FILENAME=pairwise_norm_filename,
            N_ALT=n_alt,
            SELECT=select,
            MODEL=model,
            MODEL_COLUMNS=model_columns,
            PAIRS_N_ALT=pairs_n_alt,
            PAIRS_SELECT=pairs_select,
            PAIRS_MODEL=pairs_model,
            PAIRS_MODEL_COLUMNS=pairs_model_columns,
        )
        if self.tell: return

        reporter = reporting.Reporter(workspace.working_dir,
                                      title,
                                      style=web.style())

        if self.dedup:
            reporter.p('Read deduplication was used.')

        reporter.write('<table>\n')
        for is_expression, entities, result, aveexpr, subtitle, terms in [
            (True, 'genes', 'genewise-voom', 'avg.expression',
             'Genewise expression level', model_columns[:n_alt]),
            (False, 'genes', 'genewise-tail', 'avg.tail',
             'Genewise tail length', model_columns[:n_alt]),
            (True, 'primary peaks', 'primarypeakwise-voom', 'avg.expression',
             'Primary-peakwise expression level', model_columns[:n_alt]),
            (False, 'primary peaks', 'primarypeakwise-tail', 'avg.tail',
             'Primary-peakwise tail length', model_columns[:n_alt]),
            (True, 'peaks', 'peakwise-voom', 'avg.expression',
             'Peakwise expression level', model_columns[:n_alt]),
            (False, 'peaks', 'peakwise-tail', 'avg.tail',
             'Peakwise tail length', model_columns[:n_alt]),
            (True, 'peak pairs', 'pairwise-voom', 'avg.expression',
             'Peak-pair expression shift', pairs_model_columns[:n_alt]),
            (False, 'peak pairs', 'pairwise-tail', 'avg.tail',
             'Peak-pair tail length shift', pairs_model_columns[:n_alt]),
        ]:
            #data = io.read_grouped_table(workspace/(result+'-toptable.csv'))['All']
            #n = 0
            #n_01 = 0
            #n_05 = 0
            #for row in data.values():
            #    fdr = float(row['adj.P.Val'])
            #    if fdr <= 0.01: n_01 += 1
            #    if fdr <= 0.05: n_05 += 1
            #    n += 1

            if is_expression and not self.do_expression: continue
            if not is_expression and not self.do_tail_length: continue

            io.execute([
                'degust.py',
                '--name', title + ' : ' + subtitle,
                '--avg', aveexpr,
                '--primary', 'baseline',
                '--logFC', ','.join(terms),
                '--fdr', 'adj.P.Val',
                '--info',
                'gene,locus_tag,product,reads,polya.reads,tail.lengths,' +
                aveexpr,
                '--notour', '1',
                '--out', workspace / (result + '.html'),
                workspace / (result + '-toptable.csv'),
            ])

            with open(workspace / (result + '.txt'), 'rU') as f:
                lines = f.readlines()

            reporter.write('<tr><td valign="top" width="33%">')
            reporter.subheading(
                reporter.href(workspace / (result + '.html'), subtitle))
            #reporter.p( '%d %s, %d with fdr&lt;=0.01, %d with fdr&lt;=0.05' % (n,entities,n_01,n_05) )
            line = reporter.href(workspace / (result + '-toptable.csv'),
                                 'Spreadsheet')
            if result.endswith('voom'):
                line += ', ' + reporter.href(workspace /
                                             (result + '.png'), 'voom plot')
            reporter.p(line)
            for line in lines[-2:]:
                reporter.p(line.strip())
            reporter.write('</td><td valign="top"><br/><br/>')
            for line in lines[:-2]:
                reporter.write(line.strip() + '<br/>\n')
            reporter.write('</td></tr>')

        reporter.write('</table>\n')

        reporter.subheading("Design matrix")

        reporter.write('<pre>' + design_str + '</pre>')

        reporter.close()
Example #42
    def run(self):
        assert self.ucsc_name, 'Need a UCSC genome name'

        scratch = _ucsc_scratch(self)

        # Load annotations

        source = 'tt-ucsc-%s-%s' % (self.ucsc_name, self.table)

        table = scratch.get_table(self.table)
        get_name = scratch.getter(self.name)
        get_product = scratch.getter(self.product)

        mrnas = []

        for item in table:
            ann = annotation.Annotation(
                seqid=item.chrom,
                source=source,
                type='mRNA',
                strand={
                    '+': 1,
                    '-': -1
                }[item.strand],
                start=int(item.txStart),
                end=int(item.txEnd),
                attr={
                    'ID': item.name,
                    'Name': get_name(item),
                    'Product': get_product(item),
                    #'UCSC_name2' : item.name2,
                })

            ann.record = item
            mrnas.append(ann)

        _uniquify_ids(mrnas)

        annotations = []
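        # Build a GFF3 gene -> mRNA -> exon/CDS hierarchy: each group of mRNAs
        # (as produced by _grouped_features, presumably overlapping same-strand
        # transcripts) becomes one gene record with the member mRNAs as its
        # children.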

        for group in _grouped_features(mrnas):
            ID = '/'.join(item.attr['ID'] for item in group)
            for item in group:
                item.attr['Parent'] = ID
                item.attr['ID'] = item.attr['ID'] + '-mRNA'

            annotations.append(
                annotation.Annotation(
                    source=source,
                    type='gene',
                    seqid=group[0].seqid,
                    strand=group[0].strand,
                    start=min(item.start for item in group),
                    end=max(item.end for item in group),
                    attr={
                        'ID': ID,
                        'Name': annotation_tools.join_descriptions(
                            [item.attr['Name'] for item in group], '/'),
                        'Product': annotation_tools.join_descriptions(
                            [item.attr['Product'] for item in group], '/'),
                        #'UCSC_name2' : annotation_tools.join_descriptions([ item.attr['UCSC_name2'] for item in group ], '/'),
                    }))
            for item in group:
                annotations.append(item)

                exonStarts = _parse_ints(item.record.exonStarts)
                exonEnds = _parse_ints(item.record.exonEnds)
                cdsStart = int(item.record.cdsStart)
                cdsEnd = int(item.record.cdsEnd)
                for start, end in zip(exonStarts, exonEnds):
                    annotations.append(
                        annotation.Annotation(source=source,
                                              type='exon',
                                              seqid=item.seqid,
                                              strand=item.strand,
                                              start=start,
                                              end=end,
                                              attr={
                                                  'Parent': item.attr['ID'],
                                              }))
                    if max(cdsStart, start) < min(cdsEnd, end):
                        annotations.append(
                            annotation.Annotation(
                                source=source,
                                type='CDS',
                                seqid=item.seqid,
                                strand=item.strand,
                                start=max(cdsStart, start),
                                end=min(cdsEnd, end),
                                #TODO: phase
                                attr={
                                    'Parent': item.attr['ID'],
                                }))
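                    # Coordinate note: UCSC tables are 0-based half-open, so an
                    # exon [start, end) overlaps the CDS [cdsStart, cdsEnd)
                    # exactly when max(cdsStart, start) < min(cdsEnd, end);
                    # e.g. exon [100, 200) and CDS [150, 400) share [150, 200).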

        # Load sequence

        if self.download:
            io.execute([
                'rsync', '-P', 'rsync://hgdownload.cse.ucsc.edu/goldenPath/' +
                self.ucsc_name + '/bigZips/chromFa.tar.gz',
                scratch.ucsc / 'chromFa.tar.gz'
            ])

        with workspace.tempspace() as temp:
            io.execute([
                'tar', '-C', temp.working_dir, '-zxf',
                scratch.ucsc / 'chromFa.tar.gz'
            ])
            sequences = [
                temp / item
                for item in natural_sorted(os.listdir(temp.working_dir))
            ]

            with open(temp / 'reference.gff', 'wb') as f:
                annotation.write_gff3_header(f)
                for item in annotations:
                    print >> f, item.as_gff()

            Make_tt_reference(
                self.output_dir,
                filenames=sequences + [temp / 'reference.gff'],
                index=self.index,
            ).run()
Example #43
def sort_and_index_bam(in_filename, out_prefix, cores=8):
    sort_bam(in_filename, out_prefix, cores=cores)
    
    io.execute([
        'samtools', 'index', out_prefix + '.bam'
        ])
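# Usage sketch (hypothetical filenames; assumes sort_bam() writes
# out_prefix + '.bam' and that samtools is on PATH):
#
#   sort_and_index_bam('alignments_raw.bam', 'alignments', cores=4)
#   # -> alignments.bam plus its index alignments.bam.bai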
Example #44
   def run(self):
       title = self.get_title()
   
       n_alt = len(self.alt)
       n_null = len(self.null)
       
       suffix = '-dedup' if self.dedup else ''
   
       genewise_filename = join(self.analysis,'expression','genewise'+suffix,'counts.csv')
       genewise_norm_filename = join(self.analysis,'expression','genewise'+suffix,'norm.csv')

       peakwise_filename = join(self.analysis,'expression','peakwise'+suffix,'counts.csv')
       peakwise_norm_filename = join(self.analysis,'expression','peakwise'+suffix,'norm.csv')

       pairwise_filename = join(self.analysis,'peak-shift'+suffix,'individual-pairs.csv')
       pairwise_norm_filename = join(self.analysis,'peak-shift'+suffix,'individual-pairs-norm.csv')

   
       reader = io.Table_reader(genewise_filename, 'Count')
       reader.close()
       samples = [ item for i, item in enumerate(reader.headings) if reader.groups[i] == 'Count' ]
       tags = { }
       for item in samples:
           tags[item] = [ item ]
       for line in reader.comments:
           if line.startswith('#sampleTags='):
               parts = line[len('#sampleTags='):].split(',')
               tags[parts[0]] = parts
              
       model = [ ]
       for term in self.alt + self.null:        
           spec = term_specification(term)
           model.append([ 1 if selection.matches(spec, tags[item]) else 0 for item in samples ])
       model = zip(*model) #Transpose
       
       select = [ any(row) for row in model ]
       model = [ row for row,selected in zip(model,select) if selected ]
       model_columns = [ term_name(item) for item in self.alt + self.null ]
       
       pairs_n_alt = n_alt       
       pairs_select = select + select
       pairs_model = (
           [ (0,) * n_alt + row + (0,) for row in model ] +
           [ row[:n_alt]  + row + (1,) for row in model ] 
           )
       pairs_model_columns = (
           [ item+'-interaction' for item in model_columns[:n_alt] ] +
           model_columns +
           [ 'pair2' ]
           )
       
       workspace = self.get_workspace()
       
       runr.run_script(TEST_R, self.tell,
           SOURCE = os.path.join(os.path.dirname(__file__),'tail_tools.R'),
           DIR = workspace.working_dir,
           MIN_READS = self.min_reads,
           GENEWISE_FILENAME = genewise_filename,
           GENEWISE_NORM_FILENAME = genewise_norm_filename,
           PEAKWISE_FILENAME = peakwise_filename,
           PEAKWISE_NORM_FILENAME = peakwise_norm_filename,
           PAIRWISE_FILENAME = pairwise_filename,
           PAIRWISE_NORM_FILENAME = pairwise_norm_filename,
           
           N_ALT = n_alt,
           SELECT = select,
           MODEL = model,
           MODEL_COLUMNS = model_columns,
           PAIRS_N_ALT = pairs_n_alt,
           PAIRS_SELECT = pairs_select,
           PAIRS_MODEL = pairs_model,
           PAIRS_MODEL_COLUMNS = pairs_model_columns,
           )
       if self.tell: return
       
       reporter = reporting.Reporter(workspace.working_dir, title)
       
       if self.dedup:
           reporter.p('Read deduplication was used.')
       
       for entities, result, aveexpr, subtitle, terms in [
           ('genes', 'genewise-voom', 'avg.expression', 'Genewise expression level', model_columns[:n_alt]),
           ('genes', 'genewise-tail', 'avg.tail', 'Genewise tail length', model_columns[:n_alt]),
           ('peaks', 'peakwise-voom', 'avg.expression', 'Peakwise expression level', model_columns[:n_alt]),
           ('peaks', 'peakwise-tail', 'avg.tail', 'Peakwise tail length', model_columns[:n_alt]),
           ('peak pairs', 'pairwise-voom', 'avg.expression', 'Peak-pair expression shift', pairs_model_columns[:n_alt]),
           ('peak pairs', 'pairwise-tail', 'avg.tail', 'Peak-pair tail length shift', pairs_model_columns[:n_alt]),
           ]:
           #data = io.read_grouped_table(workspace/(result+'-toptable.csv'))['All']
           #n = 0
           #n_01 = 0
           #n_05 = 0
           #for row in data.values():
           #    fdr = float(row['adj.P.Val'])
           #    if fdr <= 0.01: n_01 += 1
           #    if fdr <= 0.05: n_05 += 1
           #    n += 1
           
           io.execute([
               'degust.py',
               '--name', title + ' : ' + subtitle,
               '--avg', aveexpr,
               '--primary', 'baseline',
               '--logFC', ','.join(terms),
               '--fdr', 'adj.P.Val',
               '--info', 'gene,locus_tag,product,reads,polya.reads,tail.lengths,'+aveexpr,
               '--notour', '1',
               '--out', workspace/(result+'.html'),
               workspace/(result+'-toptable.csv'),
               ])
            
           reporter.subheading( reporter.href(workspace/(result+'.html'), subtitle) )
           #reporter.p( '%d %s, %d with fdr&lt;=0.01, %d with fdr&lt;=0.05' % (n,entities,n_01,n_05) )
           with open(workspace/(result+'.txt'),'rU') as f:
               for line in f:
                   reporter.write(line.strip() + '<br/>\n')
        
       reporter.close()
Example #45
    def run(self):
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, 'No reference sequences given'
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
        for pair in self.pairs:
            assert len(pair) == 2, 'Two files required in each "pair:" section'

        read_sets = []
        for item in self.reads:
            read_sets.append(([item], False))
        for item in self.pairs:
            read_sets.append((item, True))
        for item in self.interleaved:
            read_sets.append(([item], True))
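        # read_sets is a list of ([filenames], is_paired) tuples: plain reads
        # are unpaired, explicit pairs span two files, and an interleaved file
        # holds both mates of each pair.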

        default_options = {
            '-E': None,
            '-T': None,
            '-N': str(grace.how_many_cpus()),
            '-n': '2',
            '-w': '200%',
            '-p': 'opp-in',
            '-I': '0,500',
            '-X': None
        }

        if self.sam_unaligned:
            default_options['--sam-unaligned'] = None

        if self.half_paired:
            default_options['--half-paired'] = None
        else:
            default_options['--no-half-paired'] = None

        cutoff = '55%'  #Default changed in SHRiMP 2.0.2
        if '-h' in self.shrimp_options:
            cutoff = self.shrimp_options[self.shrimp_options.index('-h') + 1]

        #Create working directory

        workspace = self.get_workspace()  # working_directory.Working(self.output_dir, must_exist=False)
        workspace.setup_reference(self.references)
        reference = workspace.get_reference()
        reference_filename = reference.reference_fasta_filename()

        #workspace = io.Workspace(self.output_dir)
        #
        #workspace.update_param(
        #    shrimp_cutoff = cutoff
        #)
        #
        ##Make copy of reference sequences
        #
        #reference_filename = io.abspath(self.output_dir,'reference.fa')
        #reference_file = open(reference_filename,'wb')
        #
        #reference_genbank_filename = io.abspath(self.output_dir,'reference.gbk')
        #reference_genbank_file = open(reference_genbank_filename,'wb')
        #any_genbank = [ False ]
        #
        #def genbank_callback(name, record):
        #    """ Make a copy of any genbank files passed in. """
        #    from Bio import SeqIO
        #
        #    SeqIO.write([record], reference_genbank_file, 'genbank')
        #
        #    f = open(os.path.join(
        #        self.output_dir,
        #        grace.filesystem_friendly_name(name) + '.gbk'
        #    ), 'wb')
        #    SeqIO.write([record], f, 'genbank')
        #    f.close()
        #
        #    any_genbank[0] = True
        #
        #for filename in self.references:
        #    for name, sequence in io.read_sequences(filename, genbank_callback=genbank_callback):
        #        #Don't retain any comment
        #        name = name.split()[0]
        #        io.write_fasta(reference_file, name, sequence.upper())
        #
        #        f = open(os.path.join(
        #            self.output_dir,
        #            grace.filesystem_friendly_name(name) + '.fa'
        #        ), 'wb')
        #        io.write_fasta(f, name, sequence.upper())
        #        f.close()
        #
        #
        #reference_file.close()
        #reference_genbank_file.close()
        #if not any_genbank[0]:
        #    os.unlink(reference_genbank_filename)
        #
        ## Create an index of the reference sequences
        #io.execute([
        #    'samtools', 'faidx', reference_filename
        #])

        #Run shrimp

        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')
        bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')

        temp_filename = io.abspath(self.output_dir, 'temp.bam')

        log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
        log_file = open(log_filename, 'wb')

        sam_eater = sam.Bam_writer(temp_filename)

        #if self.cs:
        #    program = 'gmapper-cs'
        #else:
        #    program = 'gmapper-ls'

        sam_header_sent = [False]
        n_seen = [0]
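        # (Single-element lists serve as mutable cells so the nested eat()
        # closure can update them; Python 2 has no 'nonlocal'.)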

        def eat(process):
            for line in process.stdout:
                if line.startswith('@'):
                    if sam_header_sent[0]: continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status('%s alignments produced' %
                                     grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)

            assert process.wait() == 0, 'shrimp failed'
            sam_header_sent[0] = True

        def remove_pair_options(options):
            for flag in ['-p', '-I']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 2:]
            for flag in ['--half-paired']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 1:]
            return options
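        # (-p and -I take a value, so two tokens are removed per occurrence;
        # --half-paired is a bare flag, so only one.)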

        if '--qv-offset' not in self.shrimp_options:
            guesses = []
            for filenames, is_paired in read_sets:
                for filename in filenames:
                    guesses.append(io.guess_quality_offset(filename))
            assert len(set(guesses)) == 1, (
                'Conflicting quality offset guesses, '
                'please specify --qv-offset manually.')
            default_options['--qv-offset'] = str(guesses[0])
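            # (FASTQ quality offsets are 33 for Sanger/Illumina 1.8+ and 64
            # for older Illumina; the guess presumably comes from inspecting
            # the quality characters in each file.)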

        for filenames, is_paired in read_sets:
            options = self.shrimp_options[:]

            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) == 3  # A little ugly
                for filename in filenames)
            if has_qualities:
                options.append('--fastq')
            #    temp_read_filename = io.abspath(working_dir, 'temp.fa')
            #else:
            #    temp_read_filename = io.abspath(working_dir, 'temp.fq')

            #try:

            #if len(filenames) == 1: # gmapper can cope with gzipped     and filenames[0].endswith('.fa') or filenames[0].endswith('.fq'):
            #    actual_read_filename = filenames[0]
            #else:
            #    actual_read_filename = temp_read_filename
            #    grace.status('Copying reads')
            #    f = open(temp_read_filename, 'wb')
            #    if has_qualities:
            #        for reads in itertools.izip(*[ io.read_sequences(filename, qualities=True) for filename in filenames ]):
            #            for name, seq, qual in reads:
            #                io.write_fastq(f, name, seq, qual)
            #    else:
            #        for reads in itertools.izip(*[ io.read_sequences(filename) for filename in filenames ]):
            #            for name, seq in reads:
            #                io.write_fasta(f, name, seq)
            #    f.close()
            #    grace.status('')

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(self.cs,
                                                  options + reads_parameters)

            print >> sys.stderr, 'Running', ' '.join(full_param)

            p = io.run(full_param, stdout=subprocess.PIPE, stderr=log_file)
            eat(p)

            #finally:
            #    if os.path.exists(temp_read_filename):
            #        os.unlink(temp_read_filename)

        log_file.close()

        sam_eater.close()

        grace.status('Sort')

        io.execute(['samtools', 'sort', '-n', temp_filename, bam_prefix])
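        # Note: this is the pre-1.0 samtools sort CLI, which takes an output
        # prefix; samtools >= 1.0 would instead need '-o' and a full filename.
        # '-n' sorts by read name.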

        os.unlink(temp_filename)

        grace.status('')
Example #46
    def run(self):
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, "No reference sequences given"
        assert self.reads or self.pairs or self.interleaved, "No reads given"
        for pair in self.pairs:
            assert len(pair) == 2, "Two files required in each 'pair:' section"

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        read_sets = []
        for item in self.reads:
            read_sets.append(([item], False))
        for item in self.pairs:
            read_sets.append((item, True))
        for item in self.interleaved:
            read_sets.append(([item], True))

        # Create working directory

        workspace = self.get_workspace()
        workspace.setup_reference(self.references)
        workspace.update_param(snp_cost=25)
        reference = workspace.get_reference()
        reference_filename = reference.reference_fasta_filename()

        cores = min(self.cores, legion.coordinator().get_cores())

        default_options = {
            "-E": None,
            "-T": None,
            "-N": str(cores),
            "-n": "2",
            "-w": "200%",
            "-p": "opp-in",
            "-I": "0,500",
            "-X": None,
        }

        if self.sam_unaligned:
            default_options["--sam-unaligned"] = None

        if self.half_paired:
            default_options["--half-paired"] = None
        else:
            default_options["--no-half-paired"] = None

        cutoff = "55%"  # Default changed in SHRiMP 2.0.2
        if "-h" in self.shrimp_options:
            cutoff = self.shrimp_options[self.shrimp_options.index("-h") + 1]

        # Run shrimp

        bam_filename = io.abspath(self.output_dir, "alignments.bam")
        bam_prefix = io.abspath(self.output_dir, "alignments")
        bam_sorted_prefix = io.abspath(self.output_dir, "alignments_sorted")

        temp_filename = io.abspath(self.output_dir, "temp.bam")

        log_filename = io.abspath(self.output_dir, "shrimp_log.txt")
        log_file = open(log_filename, "wb")

        sam_eater = sam.Bam_writer(temp_filename)

        sam_header_sent = [False]
        n_seen = [0]

        def eat(f):
            for line in f:
                if line.startswith("@"):
                    if sam_header_sent[0]:
                        continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status("%s alignments produced" % grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)
            sam_header_sent[0] = True

        def remove_pair_options(options):
            for flag in ["-p", "-I"]:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 2 :]
            for flag in ["--half-paired"]:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 1 :]
            return options

        for i, (filenames, is_paired) in enumerate(read_sets):
            options = self.shrimp_options[:]

            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) == 3  # A little ugly
                for filename in filenames)
            if has_qualities:
                options.append("--fastq")

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ["-1", filenames[0], "-2", filenames[1]]

            if "--qv-offset" not in self.shrimp_options:
                guesses = []
                for filename in filenames:
                    guesses.append(io.guess_quality_offset(filename))
                assert len(set(guesses)) == 1, (
                    "Conflicting quality offset guesses, please specify --qv-offset manually."
                )
                default_options["--qv-offset"] = str(guesses[0])

            default_options["--read-group"] = "%s,%s" % (
                workspace.name.replace(",", "_"),
                workspace.name.replace(",", "_"),
            )
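            # (--read-group appears to take 'ID,SM'; the workspace name, with
            # commas replaced by underscores, serves as both the read-group ID
            # and the sample name.)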
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status("")

            full_param = reference.shrimp_command(self.cs, options + reads_parameters)

            print >>sys.stderr, "Running", " ".join(full_param)

            with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
                eat(f)

        log_file.close()

        sam_eater.close()

        grace.status("Sort")

        io.execute(["samtools", "sort", "-n", temp_filename, bam_prefix])

        os.unlink(temp_filename)

        grace.status("")