Example #1
0
def open_possibly_compressed_file(filename):
    """ Read a file. It might be compressed. """
    #if isinstance(filename, file):
    #    return filename #Whatever

    #peek, f = peek_and_pipe(open_possibly_remote_file(filename), 4)

    from nesoni import sam
    if sam.is_bam(filename):
        return sam.open_bam(filename)

    f = open(filename, 'rb')
    peek = f.read(4)
    f.close()
    #f = open(filename,'rb')

    if peek.startswith('\x1f\x8b'):
        #command = 'gunzip'
        return gzip.open(filename, 'rb')
    elif peek.startswith('BZh'):
        #command = 'bunzip2'
        return bz2.BZFile(filename, 'rb')
    else:
        #command = None
        return open(filename, 'rb')
Example #2
0
def open_possibly_compressed_file(filename):
    """ Read a file. It might be compressed. """
    #if isinstance(filename, file):
    #    return filename #Whatever
        
    #peek, f = peek_and_pipe(open_possibly_remote_file(filename), 4)

    from nesoni import sam    
    if sam.is_bam(filename):
        return sam.open_bam(filename)
    
    f = open(filename,'rb')
    peek = f.read(4)
    f.close()
    #f = open(filename,'rb')

    if peek.startswith('\x1f\x8b'):
        #command = 'gunzip'
        return gzip.open(filename, 'rb')
    elif peek.startswith('BZh'):
        #command = 'bunzip2'
        return bz2.BZFile(filename, 'rb')
    else:
        #command = None
        return open(filename, 'rb')
Example #3
0
def get_compression_type(filename):
    if hasattr(filename, 'read'):
        return 'none' #It's already file-like
        
    from nesoni import sam    
    
    f = open(filename,'rb')
    peek = f.read(4)
    f.close()

    if peek.startswith('\x1f\x8b'): #gzip format
        if sam.is_bam(filename): #it might be a BAM
            return 'bam'
            
        return 'gzip'
    elif peek.startswith('BZh'): #bzip2 format
        return 'bzip2'
    else:
        return 'none'
Example #4
0
def get_compression_type(filename):
    if hasattr(filename, 'read'):
        return 'none'  #It's already file-like

    from nesoni import sam

    f = open(filename, 'rb')
    peek = f.read(4)
    f.close()

    if peek.startswith('\x1f\x8b'):  #gzip format
        if sam.is_bam(filename):  #it might be a BAM
            return 'bam'

        return 'gzip'
    elif peek.startswith('BZh'):  #bzip2 format
        return 'bzip2'
    else:
        return 'none'
Example #5
0
    def run(self):
        workspace = working_directory.Working(self.output_dir)
        workspace.setup_reference(self.reference)
        workspace.update_param(snp_cost=self.snp_cost)

        # assert os.path.exists(self.reference), 'Reference file does not exist'
        # reference_filename = workspace._object_filename('reference.fa')
        # if os.path.exists(reference_filename):
        #   os.unlink(reference_filename)
        # os.symlink(os.path.relpath(self.reference, self.output_dir), reference_filename)

        bam_filename = io.abspath(self.output_dir, "alignments.bam")
        bam_prefix = io.abspath(self.output_dir, "alignments")

        if sam.is_bam(self.input):
            sort_input_filename = self.input
            temp_filename = None
        else:
            temp_filename = io.abspath(self.output_dir, "temp.bam")
            sort_input_filename = temp_filename
            writer = io.Pipe_writer(temp_filename, ["samtools", "view", "-S", "-b", "-"])
            f = open(self.input, "rb")
            while True:
                data = f.read(1 << 20)
                if not data:
                    break
                writer.write(data)
            writer.close()
            f.close()

        grace.status("Sort")

        # io.execute([
        #    'samtools', 'sort', '-n', sort_input_filename, bam_prefix
        # ])
        sam.sort_bam(sort_input_filename, bam_prefix, by_name=True)

        if temp_filename is not None:
            os.unlink(temp_filename)

        grace.status("")
Example #6
0
    def run(self):
        workspace = working_directory.Working(self.output_dir)        
        workspace.setup_reference(self.reference)
        workspace.update_param(snp_cost = self.snp_cost)
        
        #assert os.path.exists(self.reference), 'Reference file does not exist'
        #reference_filename = workspace._object_filename('reference.fa')
        #if os.path.exists(reference_filename):
        #   os.unlink(reference_filename)
        #os.symlink(os.path.relpath(self.reference, self.output_dir), reference_filename)
        
        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')

        if sam.is_bam(self.input):
            sort_input_filename = self.input
            temp_filename = None
        else:
            temp_filename = io.abspath(self.output_dir, 'temp.bam')
            sort_input_filename = temp_filename
            writer = io.Pipe_writer(temp_filename, ['samtools', 'view', '-S', '-b', '-'])
            f = open(self.input, 'rb')
            while True:
                data = f.read(1<<20)
                if not data: break
                writer.write(data)
            writer.close()
            f.close()
        
        grace.status('Sort')
        
        #io.execute([
        #    'samtools', 'sort', '-n', sort_input_filename, bam_prefix
        #])
        sam.sort_bam(sort_input_filename, bam_prefix, by_name=True)
        
        if temp_filename is not None:
            os.unlink(temp_filename)
        
        grace.status('')
Example #7
0
    def run(self):
        workspace = working_directory.Working(self.output_dir)        
        workspace.setup_reference(self.reference)
        workspace.update_param(snp_cost = self.snp_cost)
        
        #assert os.path.exists(self.reference), 'Reference file does not exist'
        #reference_filename = workspace._object_filename('reference.fa')
        #if os.path.exists(reference_filename):
        #   os.unlink(reference_filename)
        #os.symlink(os.path.relpath(self.reference, self.output_dir), reference_filename)
        
        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')

        if sam.is_bam(self.input):
            sort_input_filename = self.input
            temp_filename = None
        else:
            temp_filename = io.abspath(self.output_dir, 'temp.bam')
            sort_input_filename = temp_filename
            writer = io.Pipe_writer(temp_filename, ['samtools', 'view', '-S', '-b', '-'])
            f = open(self.input, 'rb')
            while True:
                data = f.read(1<<20)
                if not data: break
                writer.write(data)
            writer.close()
            f.close()
        
        grace.status('Sort')
        
        io.execute([
            'samtools', 'sort', '-n', sort_input_filename, bam_prefix
        ])
        
        if temp_filename is not None:
            os.unlink(temp_filename)
        
        grace.status('')
Example #8
0
    def run(self):
        bams = [ ]
        reference = None
        reference2 = None
        
        extra = [ ]
        
        for sample in self.samples:
            if sam.is_bam(sample):
                bams.append(sample)
            elif os.path.isdir(sample):
                working = working_directory.Working(sample,True)
                bams.append( working.get_filtered_sorted_bam() )
                extra.append( '##sampleTags=' + ','.join(working.get_tags()) )
                if reference2 is None:
                    reference2 = working.get_reference().reference_fasta_filename()
            elif io.is_sequence_file(sample):
                assert reference is None, 'Only one reference FASTA file allowed.'
                reference = sample
        
        if reference is None:
            reference = reference2
        if reference is None:
            raise grace.Error('No reference FASTA file given.')
        
        with nesoni.Stage() as stage:
            tempspace = stage.enter( workspace.tempspace() )
            if self.depth_limit:
                with nesoni.Stage() as stage2:
                    for i in xrange(len(bams)):
                        sam.Bam_depth_limit(
                            tempspace/('%d'%i), 
                            bams[i], 
                            depth=self.depth_limit
                            ).process_make(stage2)
                        bams[i] = tempspace/('%d.bam'%i)
            
            # FreeBayes claims to handle multiple bams, but it doesn't actually work
            if len(bams) > 1:
                sam.Bam_merge(tempspace/'merged', bams=bams, index=False).run()
                bams = [ tempspace/'merged.bam' ]
        
            command = [ 
                'freebayes',
                '-f', reference,
                '--ploidy',str(self.ploidy),
                '--pvar',str(self.pvar),
                ] + self.freebayes_options + bams
            
            self.log.log('Running: '+' '.join(command)+'\n')
        
            f_out = stage.enter( open(self.prefix+'.vcf','wb') )
            f_in  = stage.enter( io.pipe_from(command) )
            done_extra = False
            for line in f_in:
                if not done_extra and not line.startswith('##'):
                    for extra_line in extra:
                        f_out.write(extra_line+'\n')
                    done_extra = True
                f_out.write(line)

        index_vcf(self.prefix+'.vcf')
Example #9
0
    def run(self):
        bams = []
        reference = None
        reference2 = None

        extra = []

        for sample in self.samples:
            if sam.is_bam(sample):
                bams.append(sample)
            elif os.path.isdir(sample):
                working = working_directory.Working(sample, True)
                bams.append(working.get_filtered_sorted_bam())
                extra.append('##sampleTags=' + ','.join(working.get_tags()))
                if reference2 is None:
                    reference2 = working.get_reference(
                    ).reference_fasta_filename()
            elif io.is_sequence_file(sample):
                assert reference is None, 'Only one reference FASTA file allowed.'
                reference = sample

        if reference is None:
            reference = reference2
        if reference is None:
            raise grace.Error('No reference FASTA file given.')

        with nesoni.Stage() as stage:
            tempspace = stage.enter(workspace.tempspace())
            if self.depth_limit:
                with nesoni.Stage() as stage2:
                    for i in xrange(len(bams)):
                        sam.Bam_depth_limit(
                            tempspace / ('%d' % i),
                            bams[i],
                            depth=self.depth_limit).process_make(stage2)
                        bams[i] = tempspace / ('%d.bam' % i)

            # FreeBayes claims to handle multiple bams, but it doesn't actually work
            if len(bams) > 1:
                sam.Bam_merge(tempspace / 'merged', bams=bams,
                              index=False).run()
                bams = [tempspace / 'merged.bam']

            command = [
                'freebayes',
                '-f',
                reference,
                '--ploidy',
                str(self.ploidy),
                '--pvar',
                str(self.pvar),
            ] + self.freebayes_options + bams

            self.log.log('Running: ' + ' '.join(command) + '\n')

            f_out = stage.enter(open(self.prefix + '.vcf', 'wb'))
            f_in = stage.enter(io.pipe_from(command))
            done_extra = False
            for line in f_in:
                if not done_extra and not line.startswith('##'):
                    for extra_line in extra:
                        f_out.write(extra_line + '\n')
                    done_extra = True
                f_out.write(line)

        index_vcf(self.prefix + '.vcf')