Example #1
0
    def run(self):
        """Write genome description files under ``self.prefix`` and zip them.

        Every entry of ``self.filenames`` is classified as a sequence file
        and/or an annotation file.  A property file, a GFF file (produced by
        ``trivia.As_gff``) and a cytoband file are written into the
        ``self.prefix`` directory, handed to the external ``zip -j`` command
        to create ``self.prefix + '.genome'`` and then deleted again.  Each
        sequence is additionally written to ``<prefix>/<name>.txt``; those
        files are not part of the removed set and stay in place.

        Raises:
            AssertionError: if a file is neither a recognized sequence nor
                annotation file, or if a sequence name contains ``'/'``.
        """
        base = os.path.split(self.prefix)[1]

        annotations = [ ]
        sequences = [ ]

        for filename in self.filenames:
            # A file may count as both a sequence file and an annotation file.
            is_sequence = io.is_sequence_file(filename)
            if is_sequence:
                sequences.append(filename)
            is_annotation = annotation.is_annotation_file(filename)
            if is_annotation:
                annotations.append(filename)
            assert is_sequence or is_annotation, 'File is neither a recognized sequence or annotation file'

        cytoband_filename = os.path.join(self.prefix,base+'_cytoband.txt')
        property_filename = os.path.join(self.prefix,'property.txt')
        gff_filename = os.path.join(self.prefix,base+'.gff')
        output_filenames = [ cytoband_filename, property_filename, gff_filename ]

        if not os.path.exists(self.prefix):
            os.mkdir(self.prefix)

        # "with" guarantees the handle is closed even if a write fails.
        with open(property_filename,'wb') as f:
            print >> f, 'ordered=true'
            print >> f, 'id=%s' % base
            print >> f, 'name=%s' % (self.name or base)
            print >> f, 'cytobandFile=%s_cytoband.txt' % base
            print >> f, 'geneFile=%s.gff' % base
            print >> f, 'sequenceLocation=%s' % base

        trivia.As_gff(output=gff_filename,
               filenames=annotations,
               exclude=[ 'gene', 'source' ]
        ).run()

        with open(cytoband_filename,'wb') as f_cyt:
            for filename in sequences:
                for name, seq in io.read_sequences(filename):
                    # The name becomes a file name inside the prefix directory.
                    assert '/' not in name
                    with open(os.path.join(self.prefix, name + '.txt'), 'wb') as f_seq:
                        f_seq.write(seq)
                    print >> f_cyt, '%s\t0\t%d' % (name, len(seq))

        genome_filename = self.prefix + '.genome'
        # Remove any existing archive before running zip.
        if os.path.exists(genome_filename):
            os.unlink(genome_filename)
        io.execute(
            ['zip', '-j', io.abspath(genome_filename)] +
            [ io.abspath(item) for item in output_filenames ]
        )
        # The intermediate files handed to zip are deleted afterwards.
        for filename in output_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
Example #2
0
File: sam.py Project: mscook/nesoni
def open_bam(filename):
    """Start ``samtools view -h`` on *filename* and return the child's stdout.

    The command is launched through ``io.run``; this function does not wait
    for the process to finish.
    """
    command = ['samtools', 'view', '-h', io.abspath(filename)]
    return io.run(command).stdout
Example #3
0
    def run(self):
        """Set up the workspace and sort the input alignments by name.

        The workspace in ``self.output_dir`` gets its reference and the
        ``snp_cost`` parameter set.  If ``self.input`` is not recognised as
        BAM by ``sam.is_bam``, it is first piped through
        ``samtools view -S -b -`` into ``temp.bam`` in the output directory.
        The result is sorted with ``sam.sort_bam(..., by_name=True)`` using
        the ``alignments`` prefix, and the temporary file is deleted.
        """
        workspace = working_directory.Working(self.output_dir)
        workspace.setup_reference(self.reference)
        workspace.update_param(snp_cost=self.snp_cost)

        bam_prefix = io.abspath(self.output_dir, "alignments")

        if sam.is_bam(self.input):
            sort_input_filename = self.input
            temp_filename = None
        else:
            temp_filename = io.abspath(self.output_dir, "temp.bam")
            sort_input_filename = temp_filename
            writer = io.Pipe_writer(temp_filename, ["samtools", "view", "-S", "-b", "-"])
            # "with" closes the input handle even if a read or write fails.
            with open(self.input, "rb") as f:
                # Copy in 1 MiB chunks so the whole input is never held in memory.
                while True:
                    data = f.read(1 << 20)
                    if not data:
                        break
                    writer.write(data)
                writer.close()

        grace.status("Sort")

        # by_name=True takes the place of an earlier direct "samtools sort -n" call.
        sam.sort_bam(sort_input_filename, bam_prefix, by_name=True)

        if temp_filename is not None:
            os.unlink(temp_filename)

        grace.status("")
Example #4
0
def bam_headers(filename):
    """Return everything ``samtools view -H`` prints to stdout for *filename*.

    Raises:
        AssertionError: if samtools exits with a non-zero status.
    """
    process = io.run([
        'samtools',
        'view',
        '-H',
        io.abspath(filename),
    ])

    headers = process.stdout.read()

    # Wait outside of an assert statement: under "python -O" asserts are
    # stripped, which would skip the wait() call and the status check.
    if process.wait() != 0:
        raise AssertionError('"samtools view -H ..." failed')

    return headers
Example #5
0
def bam_headers(filename):
    """Run ``samtools view -H`` on *filename* and return its captured stdout.

    Raises:
        AssertionError: if the samtools process returns a non-zero exit code.
    """
    process = io.run([
        'samtools',
        'view',
        '-H',
        io.abspath(filename),
    ])

    headers = process.stdout.read()

    # The exit status is checked explicitly rather than inside an assert,
    # because "python -O" removes asserts and would then never call wait().
    exit_status = process.wait()
    if exit_status != 0:
        raise AssertionError('"samtools view -H ..." failed')

    return headers
Example #6
0
    def run(self):
        """Prepare the workspace and produce name-sorted alignments.

        Sets the reference and ``snp_cost`` on the workspace in
        ``self.output_dir``.  Input that ``sam.is_bam`` does not accept is
        first converted by piping it through ``samtools view -S -b -`` into
        ``temp.bam``.  The alignments are then sorted via
        ``sam.sort_bam(..., by_name=True)`` under the ``alignments`` prefix
        and the temporary file, if any, is removed.
        """
        workspace = working_directory.Working(self.output_dir)
        workspace.setup_reference(self.reference)
        workspace.update_param(snp_cost = self.snp_cost)

        bam_prefix = io.abspath(self.output_dir, 'alignments')

        if sam.is_bam(self.input):
            sort_input_filename = self.input
            temp_filename = None
        else:
            temp_filename = io.abspath(self.output_dir, 'temp.bam')
            sort_input_filename = temp_filename
            writer = io.Pipe_writer(temp_filename, ['samtools', 'view', '-S', '-b', '-'])
            # The context manager closes the input even when copying fails.
            with open(self.input, 'rb') as f:
                # Stream the input in 1 MiB chunks.
                while True:
                    data = f.read(1<<20)
                    if not data: break
                    writer.write(data)
                writer.close()

        grace.status('Sort')

        # sam.sort_bam with by_name=True stands in for "samtools sort -n".
        sam.sort_bam(sort_input_filename, bam_prefix, by_name=True)

        if temp_filename is not None:
            os.unlink(temp_filename)

        grace.status('')
Example #7
0
    def run(self):
        """Prepare the workspace and sort the input with ``samtools sort -n``.

        Sets the reference and ``snp_cost`` on the workspace in
        ``self.output_dir``.  Input that ``sam.is_bam`` does not accept is
        first converted by piping it through ``samtools view -S -b -`` into
        ``temp.bam``.  ``samtools sort -n`` is then run with the
        ``alignments`` output prefix and the temporary file, if any, is
        removed.
        """
        workspace = working_directory.Working(self.output_dir)
        workspace.setup_reference(self.reference)
        workspace.update_param(snp_cost = self.snp_cost)

        bam_prefix = io.abspath(self.output_dir, 'alignments')

        if sam.is_bam(self.input):
            sort_input_filename = self.input
            temp_filename = None
        else:
            temp_filename = io.abspath(self.output_dir, 'temp.bam')
            sort_input_filename = temp_filename
            writer = io.Pipe_writer(temp_filename, ['samtools', 'view', '-S', '-b', '-'])
            # The context manager closes the input even when copying fails.
            with open(self.input, 'rb') as f:
                # Stream the input in 1 MiB chunks.
                while True:
                    data = f.read(1<<20)
                    if not data: break
                    writer.write(data)
                writer.close()

        grace.status('Sort')

        io.execute([
            'samtools', 'sort', '-n', sort_input_filename, bam_prefix
        ])

        if temp_filename is not None:
            os.unlink(temp_filename)

        grace.status('')
Example #8
0
 def __init__(self, filename):
     """Open *filename* and expose a readable object as ``self.file``.

     For files that ``is_bam`` accepts, a ``samtools view`` child process is
     started and its stdout becomes ``self.file``; the process is kept in
     ``self.process``.  Any other file is opened with
     ``io.open_possibly_compressed_file`` and ``self.process`` is ``None``.

     Raises:
         AssertionError: if *filename* does not exist.
     """
     assert os.path.exists(filename), filename + ' does not exist'

     if not is_bam(filename):
         # No child process is involved for non-BAM input.
         self.process = None
         self.file = io.open_possibly_compressed_file(filename)
         return

     command = ['samtools', 'view', io.abspath(filename)]
     self.process = io.run(command)
     self.file = self.process.stdout
Example #9
0
 def __init__(self, filename):
     """Set up ``self.file`` (and ``self.process``) for reading *filename*.

     When ``is_bam(filename)`` is true, ``samtools view`` is started through
     ``io.run`` and its stdout is read from; otherwise the file is opened via
     ``io.open_possibly_compressed_file`` and no process is recorded.

     Raises:
         AssertionError: if *filename* does not exist.
     """
     assert os.path.exists(filename), filename + ' does not exist'

     if not is_bam(filename):
         # Plain path: nothing to wait on later, so process stays None.
         self.process = None
         self.file = io.open_possibly_compressed_file(filename)
         return

     view_command = ['samtools', 'view', io.abspath(filename)]
     self.process = io.run(view_command)
     self.file = self.process.stdout
Example #10
0
def open_bam(filename):
    """Return the stdout of a ``samtools view -h`` process for *filename*.

    The process is started with ``io.run`` and is not waited on here.
    """
    view_command = ['samtools', 'view', '-h']
    view_command.append(io.abspath(filename))
    child = io.run(view_command)
    return child.stdout
Example #11
0
    def run(self):
        """Run SHRiMP on every read set and leave sorted alignments behind.

        Steps visible in this method:

        1. Check tool availability and that references and reads were given.
        2. Set up the workspace and its reference.
        3. For each read set, build the option list, run the command returned
           by ``reference.shrimp_command`` and feed its output lines into a
           temporary BAM file through ``sam.Bam_writer``.
        4. Sort the temporary BAM with ``sam.sort_bam(..., by_name=True)``
           under the ``alignments`` prefix and delete the temporary file.

        Raises:
            AssertionError: if no references or no reads are given, or a
                ``pairs`` entry does not contain exactly two files.
        """
        # Presumably these verify that the external tools are installed.
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, 'No reference sequences given'
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
        for pair in self.pairs:
            assert len(pair) == 2, 'Two files required in each pair: section'

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        # Each entry is (list of filenames, is_paired).  Interleaved input is
        # a single file that is still flagged as paired.
        read_sets = []
        for item in self.reads:
            read_sets.append(([item], False))
        for item in self.pairs:
            read_sets.append((item, True))
        for item in self.interleaved:
            read_sets.append(([item], True))

        #Create working directory

        workspace = self.get_workspace()
        workspace.setup_reference(self.references)
        workspace.update_param(snp_cost=25)
        reference = workspace.get_reference()
        # NOTE(review): reference_filename is not used again in this method.
        reference_filename = reference.reference_fasta_filename()

        # Never request more cores than the coordinator reports.
        cores = min(self.cores, legion.coordinator().get_cores())

        # Flags appended to the command line unless the user already supplied
        # them; a value of None means the flag is added without an argument.
        default_options = {
            '-E': None,
            '-T': None,
            '-N': str(cores),
            '-n': '2',
            '-w': '200%',
            '-p': 'opp-in',
            '-I': '0,500',
            '-X': None,
        }

        if self.sam_unaligned:
            default_options['--sam-unaligned'] = None

        if self.half_paired:
            default_options['--half-paired'] = None
        else:
            default_options['--no-half-paired'] = None

        # NOTE(review): cutoff is computed but not used later in this method.
        cutoff = '55%'  #Default changed in SHRiMP 2.0.2
        if '-h' in self.shrimp_options:
            cutoff = self.shrimp_options[self.shrimp_options.index('-h') + 1]

        #Run shrimp

        # NOTE(review): bam_filename and bam_sorted_prefix are not used below.
        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')
        bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')

        temp_filename = io.abspath(self.output_dir, 'temp.bam')

        log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
        log_file = open(log_filename, 'wb')

        sam_eater = sam.Bam_writer(temp_filename)

        # One-element lists act as mutable cells that eat() can update.
        sam_header_sent = [False]
        n_seen = [0]

        def eat(f):
            # Forward every line of f to the BAM writer.  Lines starting with
            # '@' are dropped once an earlier stream has been fully consumed,
            # so they are only written for the first read set.  Other lines
            # are counted to report progress every 100000 alignments.
            for line in f:
                if line.startswith('@'):
                    if sam_header_sent[0]: continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status('%s alignments produced' %
                                     grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)
            sam_header_sent[0] = True

        def remove_pair_options(options):
            # Return a copy of options without '-p'/'-I' (each removed along
            # with the argument that follows it) and without '--half-paired'.
            for flag in ['-p', '-I']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 2:]
            for flag in ['--half-paired']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 1:]
            return options

        # NOTE(review): the index i is not used in the loop body.
        for i, (filenames, is_paired) in enumerate(read_sets):
            # Work on a copy so the user-supplied option list is not modified.
            options = self.shrimp_options[:]

            # The first record of each file is inspected; a 3-tuple is taken
            # to mean that qualities are present.
            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) ==
                3  #A little ugly
                for filename in filenames)
            if has_qualities:
                options.append('--fastq')

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

            # The guess is stored in default_options, which is shared between
            # iterations, and is overwritten for every read set.
            if '--qv-offset' not in self.shrimp_options:
                #guesses = [ ]
                #for filename in filenames:
                #    guesses.append(io.guess_quality_offset(filename))
                #assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
                #default_options['--qv-offset'] = str(guesses[0])
                default_options['--qv-offset'] = str(
                    io.guess_quality_offset(*filenames))

            # The workspace name is used for both fields of the read group,
            # with commas replaced because the comma separates the fields.
            default_options['--read-group'] = '%s,%s' % (
                workspace.name.replace(',',
                                       '_'), workspace.name.replace(',', '_'))
            # Add each default only when the user has not given that flag.
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(self.cs,
                                                  options + reads_parameters)

            print >> sys.stderr, 'Running', ' '.join(full_param)

            # stderr of the command goes to the log file opened above.
            with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
                eat(f)

        log_file.close()

        sam_eater.close()

        grace.status('Sort')

        #io.execute([
        #    'samtools', 'sort', '-n', temp_filename, bam_prefix
        #])
        sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)

        os.unlink(temp_filename)

        grace.status('')
Example #12
0
 def absolutize(filename):
     """Return *filename* with ``options.prefix`` prepended when a prefix is set, else its ``io.abspath`` form."""
     if options.prefix is None:
         return io.abspath(filename)
     return options.prefix + filename
Example #13
0
    def run(self):
        """Run SHRiMP on every read set and sort the result with samtools.

        Steps visible in this method:

        1. Check tool availability and that references and reads were given.
        2. Set up the workspace and its reference.
        3. For each read set, build the option list, run the command returned
           by ``reference.shrimp_command`` and feed its output lines into a
           temporary BAM file through ``sam.Bam_writer``.
        4. Run ``samtools sort -n`` on the temporary BAM with the
           ``alignments`` prefix and delete the temporary file.

        Raises:
            AssertionError: if no references or no reads are given, a
                ``pairs`` entry does not contain exactly two files, or the
                quality offset guesses within a read set disagree.
        """
        # Presumably these verify that the external tools are installed.
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, "No reference sequences given"
        assert self.reads or self.pairs or self.interleaved, "No reads given"
        for pair in self.pairs:
            assert len(pair) == 2, "Two files required in each pair: section"

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        # Each entry is (list of filenames, is_paired).  Interleaved input is
        # a single file that is still flagged as paired.
        read_sets = []
        for item in self.reads:
            read_sets.append(([item], False))
        for item in self.pairs:
            read_sets.append((item, True))
        for item in self.interleaved:
            read_sets.append(([item], True))

        # Create working directory

        workspace = self.get_workspace()
        workspace.setup_reference(self.references)
        workspace.update_param(snp_cost=25)
        reference = workspace.get_reference()
        # NOTE(review): reference_filename is not used again in this method.
        reference_filename = reference.reference_fasta_filename()

        # Never request more cores than the coordinator reports.
        cores = min(self.cores, legion.coordinator().get_cores())

        # Flags appended to the command line unless the user already supplied
        # them; a value of None means the flag is added without an argument.
        default_options = {
            "-E": None,
            "-T": None,
            "-N": str(cores),
            "-n": "2",
            "-w": "200%",
            "-p": "opp-in",
            "-I": "0,500",
            "-X": None,
        }

        if self.sam_unaligned:
            default_options["--sam-unaligned"] = None

        if self.half_paired:
            default_options["--half-paired"] = None
        else:
            default_options["--no-half-paired"] = None

        # NOTE(review): cutoff is computed but not used later in this method.
        cutoff = "55%"  # Default changed in SHRiMP 2.0.2
        if "-h" in self.shrimp_options:
            cutoff = self.shrimp_options[self.shrimp_options.index("-h") + 1]

        # Run shrimp

        # NOTE(review): bam_filename and bam_sorted_prefix are not used below.
        bam_filename = io.abspath(self.output_dir, "alignments.bam")
        bam_prefix = io.abspath(self.output_dir, "alignments")
        bam_sorted_prefix = io.abspath(self.output_dir, "alignments_sorted")

        temp_filename = io.abspath(self.output_dir, "temp.bam")

        log_filename = io.abspath(self.output_dir, "shrimp_log.txt")
        log_file = open(log_filename, "wb")

        sam_eater = sam.Bam_writer(temp_filename)

        # One-element lists act as mutable cells that eat() can update.
        sam_header_sent = [False]
        n_seen = [0]

        def eat(f):
            # Forward every line of f to the BAM writer.  Lines starting with
            # "@" are dropped once an earlier stream has been fully consumed,
            # so they are only written for the first read set.  Other lines
            # are counted to report progress every 100000 alignments.
            for line in f:
                if line.startswith("@"):
                    if sam_header_sent[0]:
                        continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status("%s alignments produced" % grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)
            sam_header_sent[0] = True

        def remove_pair_options(options):
            # Return a copy of options without "-p"/"-I" (each removed along
            # with the argument that follows it) and without "--half-paired".
            for flag in ["-p", "-I"]:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 2 :]
            for flag in ["--half-paired"]:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 1 :]
            return options

        # NOTE(review): the index i is not used in the loop body.
        for i, (filenames, is_paired) in enumerate(read_sets):
            # Work on a copy so the user-supplied option list is not modified.
            options = self.shrimp_options[:]

            # The first record of each file is inspected; a 3-tuple is taken
            # to mean that qualities are present.
            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) == 3 for filename in filenames  # A little ugly
            )
            if has_qualities:
                options.append("--fastq")

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ["-1", filenames[0], "-2", filenames[1]]

            # One guess per file of this read set; they must all agree.  The
            # result is stored in default_options, which is shared between
            # iterations, and is overwritten for every read set.
            if "--qv-offset" not in self.shrimp_options:
                guesses = []
                for filename in filenames:
                    guesses.append(io.guess_quality_offset(filename))
                assert (
                    len(set(guesses)) == 1
                ), "Conflicting quality offset guesses, please specify --qv-offset manually."
                default_options["--qv-offset"] = str(guesses[0])

            # The workspace name is used for both fields of the read group,
            # with commas replaced because the comma separates the fields.
            default_options["--read-group"] = "%s,%s" % (
                workspace.name.replace(",", "_"),
                workspace.name.replace(",", "_"),
            )
            # Add each default only when the user has not given that flag.
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status("")

            full_param = reference.shrimp_command(self.cs, options + reads_parameters)

            print >>sys.stderr, "Running", " ".join(full_param)

            # stderr of the command goes to the log file opened above.
            with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
                eat(f)

        log_file.close()

        sam_eater.close()

        grace.status("Sort")

        io.execute(["samtools", "sort", "-n", temp_filename, bam_prefix])

        os.unlink(temp_filename)

        grace.status("")
Example #14
0
    def run(self):
        """Run SHRiMP on every read set and leave sorted alignments behind.

        Steps visible in this method:

        1. Check tool availability and that references and reads were given.
        2. Set up the workspace and its reference.
        3. For each read set, build the option list, run the command returned
           by ``reference.shrimp_command`` and feed its output lines into a
           temporary BAM file through ``sam.Bam_writer``.
        4. Sort the temporary BAM with ``sam.sort_bam(..., by_name=True)``
           under the ``alignments`` prefix and delete the temporary file.

        Raises:
            AssertionError: if no references or no reads are given, or a
                ``pairs`` entry does not contain exactly two files.
        """
        # Presumably these verify that the external tools are installed.
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, 'No reference sequences given'
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
        for pair in self.pairs:
            assert len(pair) == 2, 'Two files required in each pair: section'

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        # Each entry is (list of filenames, is_paired).  Interleaved input is
        # a single file that is still flagged as paired.
        read_sets = [ ]
        for item in self.reads:
            read_sets.append( ([item], False) )
        for item in self.pairs:
            read_sets.append( (item, True) )
        for item in self.interleaved:
            read_sets.append( ([item], True) )

        #Create working directory

        workspace = self.get_workspace()
        workspace.setup_reference(self.references)
        workspace.update_param(snp_cost=25)
        reference = workspace.get_reference()
        # NOTE(review): reference_filename is not used again in this method.
        reference_filename = reference.reference_fasta_filename()

        # Never request more cores than the coordinator reports.
        cores = min(self.cores, legion.coordinator().get_cores())

        # Flags appended to the command line unless the user already supplied
        # them; a value of None means the flag is added without an argument.
        default_options = { 
            '-E' : None, 
            '-T' : None, 
            '-N' : str(cores), 
            '-n':'2', 
            '-w':'200%',
            '-p': 'opp-in', 
            '-I': '0,500', 
            '-X':None,
        }

        if self.sam_unaligned:
            default_options['--sam-unaligned'] = None

        if self.half_paired:
            default_options['--half-paired'] = None
        else:
            default_options['--no-half-paired'] = None

        # NOTE(review): cutoff is computed but not used later in this method.
        cutoff = '55%' #Default changed in SHRiMP 2.0.2
        if '-h' in self.shrimp_options:
            cutoff = self.shrimp_options[ self.shrimp_options.index('-h')+1 ]

        #Run shrimp

        # NOTE(review): bam_filename and bam_sorted_prefix are not used below.
        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')
        bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')

        temp_filename = io.abspath(self.output_dir, 'temp.bam')

        log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
        log_file = open(log_filename, 'wb')

        sam_eater = sam.Bam_writer(temp_filename)

        # One-element lists act as mutable cells that eat() can update.
        sam_header_sent = [False]
        n_seen = [0]

        def eat(f):
            # Forward every line of f to the BAM writer.  Lines starting with
            # '@' are dropped once an earlier stream has been fully consumed,
            # so they are only written for the first read set.  Other lines
            # are counted to report progress every 100000 alignments.
            for line in f:
                if line.startswith('@'):
                    if sam_header_sent[0]: continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)
            sam_header_sent[0] = True

        def remove_pair_options(options):
            # Return a copy of options without '-p'/'-I' (each removed along
            # with the argument that follows it) and without '--half-paired'.
            for flag in ['-p','-I']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos+2:]
            for flag in ['--half-paired']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos+1:]
            return options

        # NOTE(review): the index i is not used in the loop body.
        for i, (filenames, is_paired) in enumerate(read_sets):
            # Work on a copy so the user-supplied option list is not modified.
            options = self.shrimp_options[:]

            # The first record of each file is inspected; a 3-tuple is taken
            # to mean that qualities are present.
            has_qualities = all(
                len( io.read_sequences(filename, qualities=True).next() ) == 3  #A little ugly
                for filename in filenames
            )
            if has_qualities:
                options.append( '--fastq' )

            if len(filenames) == 1:
                reads_parameters = [ filenames[0] ]
            else:
                reads_parameters = [ '-1', filenames[0], '-2', filenames[1] ]

            # The guess is stored in default_options, which is shared between
            # iterations, and is overwritten for every read set.
            if '--qv-offset' not in self.shrimp_options:
                #guesses = [ ]
                #for filename in filenames:
                #    guesses.append(io.guess_quality_offset(filename))
                #assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
                #default_options['--qv-offset'] = str(guesses[0])
                default_options['--qv-offset'] = str( io.guess_quality_offset(*filenames) )

            # The workspace name is used for both fields of the read group,
            # with commas replaced because the comma separates the fields.
            default_options['--read-group'] = '%s,%s' % (
                workspace.name.replace(',','_'),
                workspace.name.replace(',','_')
            )
            # Add each default only when the user has not given that flag.
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
               options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(self.cs, options + reads_parameters)

            print >> sys.stderr, 'Running', ' '.join(full_param)

            # stderr of the command goes to the log file opened above.
            with io.pipe_from(full_param,
                    stderr=log_file,
                    cores=cores) as f:
                eat(f)

        log_file.close()

        sam_eater.close()

        grace.status('Sort')

        #io.execute([
        #    'samtools', 'sort', '-n', temp_filename, bam_prefix
        #])
        sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)

        os.unlink(temp_filename)

        grace.status('')
Example #15
0
    def run(self):
        """Run SHRiMP on every read set and sort the result with samtools.

        Steps visible in this method:

        1. Check tool availability and that references and reads were given.
        2. Set up the workspace and its reference.
        3. Guess one quality offset across all read files unless the user
           passed ``--qv-offset``.
        4. For each read set, build the option list, run the command returned
           by ``reference.shrimp_command`` and feed its stdout into a
           temporary BAM file through ``sam.Bam_writer``.
        5. Run ``samtools sort -n`` on the temporary BAM with the
           ``alignments`` prefix and delete the temporary file.

        Raises:
            AssertionError: if no references or no reads are given, a
                ``pairs`` entry does not contain exactly two files, the
                quality offset guesses disagree, or a shrimp process exits
                with a non-zero status.
        """
        # Presumably these verify that the external tools are installed.
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, 'No reference sequences given'
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
        for pair in self.pairs:
            assert len(pair) == 2, 'Two files required in each pair: section'

        # Each entry is (list of filenames, is_paired).  Interleaved input is
        # a single file that is still flagged as paired.
        read_sets = [ ]
        for item in self.reads:
            read_sets.append( ([item], False) )
        for item in self.pairs:
            read_sets.append( (item, True) )
        for item in self.interleaved:
            read_sets.append( ([item], True) )

        # Flags appended to the command line unless the user already supplied
        # them; a value of None means the flag is added without an argument.
        default_options = { '-E' : None, '-T' : None, '-N' : str(grace.how_many_cpus()), '-n':'2', '-w':'200%',
                            '-p': 'opp-in', '-I': '0,500', '-X':None }

        if self.sam_unaligned:
            default_options['--sam-unaligned'] = None

        if self.half_paired:
            default_options['--half-paired'] = None
        else:
            default_options['--no-half-paired'] = None

        # NOTE(review): cutoff is computed but not used later in this method.
        cutoff = '55%' #Default changed in SHRiMP 2.0.2
        if '-h' in self.shrimp_options:
            cutoff = self.shrimp_options[ self.shrimp_options.index('-h')+1 ]

        #Create working directory

        workspace = self.get_workspace()
        workspace.setup_reference(self.references)
        reference = workspace.get_reference()
        # NOTE(review): reference_filename is not used again in this method.
        reference_filename = reference.reference_fasta_filename()

        #Run shrimp

        # NOTE(review): bam_filename and bam_sorted_prefix are not used below.
        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')
        bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')

        temp_filename = io.abspath(self.output_dir, 'temp.bam')

        log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
        log_file = open(log_filename, 'wb')

        sam_eater = sam.Bam_writer(temp_filename)

        # One-element lists act as mutable cells that eat() can update.
        sam_header_sent = [False]
        n_seen = [0]

        def eat(process):
            # Forward every stdout line of process to the BAM writer.  Lines
            # starting with '@' are dropped once an earlier process has been
            # fully consumed, so they are only written for the first read
            # set.  Other lines are counted to report progress.
            for line in process.stdout:
                if line.startswith('@'):
                    if sam_header_sent[0]: continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)

            # wait() is called outside of an assert statement: "python -O"
            # strips asserts, which would leave the child unreaped and its
            # exit status unchecked.
            if process.wait() != 0:
                raise AssertionError('shrimp failed')
            sam_header_sent[0] = True

        def remove_pair_options(options):
            # Return a copy of options without '-p'/'-I' (each removed along
            # with the argument that follows it) and without '--half-paired'.
            for flag in ['-p','-I']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos+2:]
            for flag in ['--half-paired']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos+1:]
            return options

        # One offset is guessed per read file; all files of all read sets
        # must agree because a single value is used for every run.
        if '--qv-offset' not in self.shrimp_options:
            guesses = [ ]
            for filenames, is_paired in read_sets:
                for filename in filenames:
                    guesses.append(io.guess_quality_offset(filename))
            assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            default_options['--qv-offset'] = str(guesses[0])

        for filenames, is_paired in read_sets:
            # Work on a copy so the user-supplied option list is not modified.
            options = self.shrimp_options[:]

            # The first record of each file is inspected; a 3-tuple is taken
            # to mean that qualities are present.
            has_qualities = all(
                len( io.read_sequences(filename, qualities=True).next() ) == 3  #A little ugly
                for filename in filenames
            )
            if has_qualities:
                options.append( '--fastq' )

            if len(filenames) == 1:
                reads_parameters = [ filenames[0] ]
            else:
                reads_parameters = [ '-1', filenames[0], '-2', filenames[1] ]

            # Add each default only when the user has not given that flag.
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(self.cs, options + reads_parameters)

            print >> sys.stderr, 'Running', ' '.join(full_param)

            # stdout is consumed by eat(); stderr goes to the log file.
            p = io.run(full_param,
                    stdout=subprocess.PIPE,
                    stderr=log_file)
            eat(p)

        log_file.close()

        sam_eater.close()

        grace.status('Sort')

        io.execute([
            'samtools', 'sort', '-n', temp_filename, bam_prefix
        ])

        os.unlink(temp_filename)

        grace.status('')
Example #16
0
    def run(self):
        """Align all configured read sets with SHRiMP and produce a BAM file.

        Steps, in order:

        1. Check that SHRiMP 2 and samtools are available and that references
           and at least one kind of reads were given.
        2. Collect read sets from ``self.reads`` (unpaired), ``self.pairs``
           (two files, paired) and ``self.interleaved`` (one file, paired).
        3. Set up the reference in the workspace.
        4. Run one shrimp command per read set.  Its standard output is fed
           into a ``sam.Bam_writer`` on ``temp.bam`` and its standard error
           goes to ``shrimp_log.txt``, both inside ``self.output_dir``.
        5. Run ``samtools sort -n`` from ``temp.bam`` to the ``alignments``
           prefix and delete ``temp.bam``.

        Options in ``self.shrimp_options`` take precedence: a default flag is
        only appended when that flag is not already present.

        Raises:
            AssertionError: if inputs are missing, a ``pairs`` entry does not
                hold exactly two files, the guessed quality offsets disagree,
                or a shrimp process exits with a non-zero status.
        """
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, 'No reference sequences given'
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
        for pair in self.pairs:
            assert len(pair) == 2, 'Two files required in each pair: section'

        # Each entry is (list of filenames, reads are paired?).
        read_sets = []
        for item in self.reads:
            read_sets.append(([item], False))
        for item in self.pairs:
            read_sets.append((item, True))
        for item in self.interleaved:
            read_sets.append(([item], True))

        # Maps flag -> value; None marks a flag that takes no value.
        default_options = {
            '-E': None,
            '-T': None,
            '-N': str(grace.how_many_cpus()),
            '-n': '2',
            '-w': '200%',
            '-p': 'opp-in',
            '-I': '0,500',
            '-X': None
        }

        if self.sam_unaligned:
            default_options['--sam-unaligned'] = None

        if self.half_paired:
            default_options['--half-paired'] = None
        else:
            default_options['--no-half-paired'] = None

        # Create working directory and reference
        workspace = self.get_workspace()
        workspace.setup_reference(self.references)
        reference = workspace.get_reference()

        # Guess the quality offset before any output file or pipe is opened,
        # so that a failed guess leaves nothing half-created behind.
        if '--qv-offset' not in self.shrimp_options:
            guesses = []
            for filenames, is_paired in read_sets:
                for filename in filenames:
                    guesses.append(io.guess_quality_offset(filename))
            assert len(
                set(guesses)
            ) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            default_options['--qv-offset'] = str(guesses[0])

        bam_prefix = io.abspath(self.output_dir, 'alignments')
        temp_filename = io.abspath(self.output_dir, 'temp.bam')
        log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')

        # One-element lists so the nested functions can update the values.
        sam_header_sent = [False]
        n_seen = [0]

        def eat(process, writer):
            """Copy one shrimp run's output lines into writer.

            Lines starting with '@' (header lines) are only passed on until
            the first run has completed; other lines are counted and
            progress is reported every 100000 of them.
            """
            for line in process.stdout:
                if line.startswith('@'):
                    if sam_header_sent[0]:
                        continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status('%s alignments produced' %
                                     grace.pretty_number(n_seen[0]))
                writer.write_raw(line)

            assert process.wait() == 0, 'shrimp failed'
            sam_header_sent[0] = True

        def remove_pair_options(options):
            """Return options without the pairing flags.

            '-p' and '-I' are removed together with the item that follows
            them; '--half-paired' is removed on its own.
            """
            for flag in ['-p', '-I']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 2:]
            for flag in ['--half-paired']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 1:]
            return options

        def shrimp_arguments(filenames, is_paired):
            """Build the option and read-file arguments for one read set."""
            options = self.shrimp_options[:]

            # The first record of each file is inspected: a record of length
            # 3 (presumably name, sequence, quality) selects '--fastq'.
            # NOTE(review): for a file with no records, .next() raises
            # StopIteration inside the generator expression, which Python 2
            # treats as the end of the generator -- confirm whether empty
            # read files need explicit handling.
            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) == 3
                for filename in filenames)
            if has_qualities:
                options.append('--fastq')

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

            # User-supplied flags win; defaults only fill the gaps.
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            return options + reads_parameters

        # Run shrimp
        log_file = open(log_filename, 'wb')
        try:
            sam_eater = sam.Bam_writer(temp_filename)
            completed = False
            try:
                for filenames, is_paired in read_sets:
                    arguments = shrimp_arguments(filenames, is_paired)

                    grace.status('')

                    full_param = reference.shrimp_command(self.cs, arguments)

                    print >> sys.stderr, 'Running', ' '.join(full_param)

                    p = io.run(full_param,
                               stdout=subprocess.PIPE,
                               stderr=log_file)
                    eat(p, sam_eater)
                completed = True
            finally:
                try:
                    sam_eater.close()
                except Exception:
                    # If a shrimp run already failed, an error while closing
                    # the writer must not replace that original error; after
                    # a clean run a close failure is a real error.
                    if completed:
                        raise
        finally:
            log_file.close()

        grace.status('Sort')

        io.execute(['samtools', 'sort', '-n', temp_filename, bam_prefix])

        os.unlink(temp_filename)

        grace.status('')