Ejemplo n.º 1
0
    def removeDuplicates(self,
                         path_to_jar=False,
                         force=False,
                         mem_num_gigs=2,
                         max_cpus=-1):
        """Run Picard MarkDuplicates on every BAM in ``self.paths_to_BAMs``.

        For each input path the last four characters are stripped and
        ``'_dd.bam'`` appended to form the output path; metrics go to the
        matching ``'_dd.log'``.  An input whose output file already exists is
        skipped unless ``force`` is true.  All output paths (newly requested
        or found on disk) are stored, in input order, as
        ``self.paths_to_BAMs_dd``.

        Each job is a separate ``java`` process.  After a launch the method
        blocks while the number of unfinished jobs is at or above the value
        returned by ``_decide_max_processes(max_cpus)``.  Jobs are reaped
        through their own ``Popen`` handles so that every exit status is
        known and child processes started elsewhere in the program are left
        alone.  Jobs that exit with a non-zero status are reported with a
        printed warning once all jobs have finished; no exception is raised
        for them.

        Parameters:
            path_to_jar: path to the Picard jar; when falsy it is obtained
                from ``_get_jar_path('picard')``.
            force: run MarkDuplicates even if the output file exists.
            mem_num_gigs: number substituted into the JVM ``-Xmx<N>g`` option.
            max_cpus: forwarded to ``_decide_max_processes``.
        """
        # Local import: only this method's throttling loop needs it.
        from time import sleep

        if not path_to_jar:
            path_to_jar = _get_jar_path('picard')

        max_processes = _decide_max_processes(max_cpus)

        print('Print: will use %s cpus for picard' % max_processes)

        running = {}   # Popen handle -> command list, for jobs not yet reaped
        failures = []  # (command list, exit status) of jobs that failed

        def collect_finished():
            # Drop every exited job from `running`, recording non-zero exits.
            for proc in [p for p in running if p.poll() is not None]:
                finished_cmd = running.pop(proc)
                if proc.returncode != 0:
                    failures.append((finished_cmd, proc.returncode))

        paths_to_BAMs_dd = []
        for BAM in self.paths_to_BAMs:
            BAM_out = BAM[:-4] + '_dd.bam'
            paths_to_BAMs_dd.append(BAM_out)

            if _os.path.exists(BAM_out) and not force:
                print('Found:')
                print(BAM_out)
                print('use "force = True" to overwrite')
                continue

            # 'VALIDATION_STRINGENCY=', 'LENIENT' could be appended here.
            picard_command = [
                'MarkDuplicates', 'I=', BAM, 'O=', BAM_out, 'M=',
                BAM[:-4] + '_dd.log'
            ]
            cmd = ['java',
                   '-Xmx%sg' % mem_num_gigs, '-jar', path_to_jar
                   ] + picard_command
            running[_subprocess.Popen(cmd, shell=False)] = cmd
            print('Called: %s' % (' '.join(map(str, cmd))))
            print('processes: %s, max_processes: %s' %
                  (len(running), max_processes))

            # Throttle: hold here until a slot is free.  The `not running`
            # test guarantees termination even if max_processes is <= 0.
            while True:
                collect_finished()
                if not running or len(running) < max_processes:
                    break
                sleep(1)

        # Wait for whatever is still running after the last launch.
        for proc in list(running):
            proc.wait()
        collect_finished()

        for failed_cmd, status in failures:
            print('WARNING: command exited with status %s: %s' %
                  (status, ' '.join(map(str, failed_cmd))))

        self.paths_to_BAMs_dd = paths_to_BAMs_dd
Ejemplo n.º 2
0
    def removeDuplicates(self, path_to_jar = False, force = False, mem_num_gigs = 2, max_cpus = -1):
        """Run Picard MarkDuplicates on every BAM in ``self.paths_to_BAMs``.

        The output path for an input is the input minus its last four
        characters plus ``'_dd.bam'``; the metrics file uses ``'_dd.log'``.
        Inputs whose output already exists are skipped unless ``force`` is
        true.  ``self.paths_to_BAMs_dd`` is set to the list of output paths in
        input order, including those that were skipped because they were
        found on disk.

        Jobs run as separate ``java`` processes.  After each launch the method
        blocks while the count of unfinished jobs is at or above
        ``_decide_max_processes(max_cpus)``.  Each job is reaped via its own
        ``Popen`` handle, which keeps its exit status available and avoids
        reaping children this method did not start.  A warning is printed for
        every job that exits non-zero; no exception is raised for them.

        Parameters:
            path_to_jar: Picard jar path; falsy means use
                ``_get_jar_path('picard')``.
            force: overwrite existing outputs.
            mem_num_gigs: number placed in the JVM ``-Xmx<N>g`` option.
            max_cpus: forwarded to ``_decide_max_processes``.
        """
        # Local import: only this method's throttling loop needs it.
        from time import sleep

        if not path_to_jar:
            path_to_jar = _get_jar_path('picard')

        max_processes = _decide_max_processes(max_cpus)

        print('Print: will use %s cpus for picard' % max_processes)

        # Popen handle -> command list for jobs that have not been reaped yet.
        active = {}
        # (command list, exit status) for every job that exited non-zero.
        failed = []

        def reap_finished():
            # Remove exited jobs from `active`, remembering the failures.
            done = [proc for proc in active if proc.poll() is not None]
            for proc in done:
                done_cmd = active.pop(proc)
                if proc.returncode != 0:
                    failed.append((done_cmd, proc.returncode))

        paths_to_BAMs_dd = []
        for BAM in self.paths_to_BAMs:
            BAM_out = BAM[:-4] + '_dd.bam'
            paths_to_BAMs_dd.append(BAM_out)

            if _os.path.exists(BAM_out) and not force:
                print('Found:')
                print(BAM_out)
                print('use "force = True" to overwrite')
                continue

            # 'VALIDATION_STRINGENCY=', 'LENIENT' could be appended here.
            picard_command = ['MarkDuplicates', 'I=', BAM, 'O=', BAM_out, 'M=', BAM[:-4] + '_dd.log']
            cmd = ['java', '-Xmx%sg' % mem_num_gigs, '-jar', path_to_jar] + picard_command
            active[_subprocess.Popen(cmd, shell=False)] = cmd
            print('Called: %s' % (' '.join(map(str, cmd))))
            print('processes: %s, max_processes: %s' % (len(active), max_processes))

            # Hold here until a slot is free; `not active` ensures the loop
            # ends even when max_processes is zero or negative.
            while True:
                reap_finished()
                if not active or len(active) < max_processes:
                    break
                sleep(1)

        # Let the remaining jobs finish before publishing the output list.
        for proc in list(active):
            proc.wait()
        reap_finished()

        for failed_cmd, status in failed:
            print('WARNING: command exited with status %s: %s' % (status, ' '.join(map(str, failed_cmd))))

        self.paths_to_BAMs_dd = paths_to_BAMs_dd
Ejemplo n.º 3
0
    def IndelRealignGATK(self,
            jar = ['external_programs', 'GenomeAnalysisTK', 'GenomeAnalysisTK.jar'],
            picard_jar = False,
            samtools_exe = False,
            use_java = 'java',
            force = False,
            mem_num_gigs = 2,
            max_cpus = -1):
        """Run GATK RealignerTargetCreator then IndelRealigner on each BAM.

        Steps, in order:

        1. Check that ``self.paths_to_BAMs_dd_si`` exists and that every path
           in it is present on disk (``AssertionError`` otherwise).
        2. If ``genome_sequences/<genome_id>.dict`` is missing, create it with
           Picard CreateSequenceDictionary; if the ``.fai`` index is missing,
           create it with ``samtools faidx``.
        3. For each BAM, write ``<BAM minus 4 chars>.intervals`` with
           RealignerTargetCreator (skipped if present and not ``force``).
        4. For each BAM, write ``<BAM minus 4 chars>_realn.bam`` with
           IndelRealigner (skipped if present and not ``force``).

        ``self.ready_BAMs`` is then set to a one-element list holding the list
        of ``_realn.bam`` paths in input order.

        GATK jobs in steps 3 and 4 run as separate processes; after each
        launch the method blocks while the count of unfinished jobs is at or
        above ``_decide_max_processes(max_cpus)``.  Jobs are reaped via their
        own ``Popen`` handles so every exit status is known and unrelated
        child processes are not reaped.  Any command (including those in
        step 2) that exits non-zero is reported with a printed warning at the
        end; no exception is raised for it.

        Parameters:
            jar: path components of the GATK jar, joined with the OS path
                separator.
            picard_jar: Picard jar path; falsy means use
                ``_get_jar_path('picard')``.
            samtools_exe: samtools executable; falsy means use
                ``_get_exe_path('samtools')``.
            use_java: java executable used for Picard and GATK.
            force: regenerate ``.intervals`` and ``_realn.bam`` files that
                already exist.
            mem_num_gigs: number placed in ``-Xmx<N>g`` for
                RealignerTargetCreator.
            max_cpus: forwarded to ``_decide_max_processes``.

        Raises:
            AssertionError: if ``paths_to_BAMs_dd_si`` is missing or one of
                its files does not exist.
        """
        # Local import: only the throttling loop below needs it.
        from time import sleep

        # GATK is manually downloaded by user and placed in folder of their choice
        jar = _os.path.sep.join(jar)

        if not picard_jar:
            picard_jar = _get_jar_path('picard')

        if not samtools_exe:
            samtools_exe = _get_exe_path('samtools')

        genome_fna = 'genome_sequences/{}.fna'.format(self.genome_id)

        e1 = 'Could not find "paths_to_BAMs_dd_si" attribute. Before starting GATK analysis, read alignments must have duplicates removed. Please run: .toBAMS(), .removeDuplicates(), .sortIndexBAMs() methods on this SAMs instance, or --deduplicate if using baga_cli.py.'

        # Raised explicitly (not via `assert`) so the checks survive `python -O`.
        if not hasattr(self, 'paths_to_BAMs_dd_si'):
            raise AssertionError(e1)

        e2 = 'Could not find %s. Please ensure file exists'

        for BAM in self.paths_to_BAMs_dd_si:
            if not _os.path.exists(BAM):
                raise AssertionError(e2 % BAM)

        running = {}   # Popen handle -> command list, for jobs not yet reaped
        failures = []  # (command list, exit status) of commands that failed

        def run_blocking(cmd):
            # Run one command to completion, recording a non-zero exit.
            status = _subprocess.call(cmd)
            if status != 0:
                failures.append((cmd, status))

        def collect_finished():
            # Drop every exited job from `running`, recording non-zero exits.
            for proc in [p for p in running if p.poll() is not None]:
                finished_cmd = running.pop(proc)
                if proc.returncode != 0:
                    failures.append((finished_cmd, proc.returncode))

        def launch(cmd):
            # Start one job, then hold until a slot is free for the next one.
            print(' '.join(map(str, cmd)))
            running[_subprocess.Popen(cmd, shell=False)] = cmd
            while True:
                collect_finished()
                # `not running` ensures termination if max_processes <= 0.
                if not running or len(running) < max_processes:
                    break
                sleep(1)

        def drain():
            # Block until every job launched so far has exited.
            for proc in list(running):
                proc.wait()
            collect_finished()

        if not _os.path.exists(genome_fna[:-4] + '.dict'):
            print('Creating sequence dictionary for %s' % genome_fna)
            run_blocking([use_java, '-jar', picard_jar, 'CreateSequenceDictionary', 'R=', genome_fna, 'O=', genome_fna[:-4] + '.dict'])

        # Only the samtools '.fai' index is checked here.
        if not _os.path.exists(genome_fna + '.fai'):
            print('Writing index files for %s' % genome_fna)
            run_blocking([samtools_exe, 'faidx', genome_fna])

        max_processes = _decide_max_processes(max_cpus)

        # Phase 1: realignment target intervals.
        for BAM in self.paths_to_BAMs_dd_si:
            intervals = BAM[:-4] + '.intervals'
            if _os.path.exists(intervals) and not force:
                print('Found:')
                print(intervals)
                print('use "force = True" to overwrite')
                continue
            # '--validation_strictness', 'LENIENT' could be appended here.
            launch([use_java, '-Xmx%sg' % mem_num_gigs, '-jar', jar,
                    '-T', 'RealignerTargetCreator',
                    '-R', genome_fna,
                    '-I', BAM,
                    '-o', intervals])

        # Phase 2 reads the .intervals files, so phase 1 must be complete.
        drain()

        # Phase 2: indel realignment.
        paths_to_BAMs_dd_si_ra = []
        for BAM in self.paths_to_BAMs_dd_si:
            intervals = BAM[:-4] + '.intervals'
            bam_out = BAM[:-4] + '_realn.bam'
            paths_to_BAMs_dd_si_ra.append(bam_out)
            if _os.path.exists(bam_out) and not force:
                print('Found:')
                print(bam_out)
                print('use "force = True" to overwrite')
                continue
            # NOTE(review): heap size is fixed at 4g here and does not follow
            # mem_num_gigs -- confirm whether that is intended.
            launch([use_java, '-Xmx4g', '-jar', jar,
                    '-T', 'IndelRealigner',
                    '-R', genome_fna,
                    '-I', BAM,
                    '-targetIntervals', intervals,
                    '-o', bam_out,
                    '--filter_bases_not_stored'])

        drain()

        for failed_cmd, status in failures:
            print('WARNING: command exited with status %s: %s' % (status, ' '.join(map(str, failed_cmd))))

        # the last list of BAMs in ready_BAMs is input for CallgVCFsGATK
        # both IndelRealignGATK and recalibBaseScoresGATK put here
        self.ready_BAMs = [paths_to_BAMs_dd_si_ra]
Ejemplo n.º 4
0
    def IndelRealignGATK(self,
                         jar=[
                             'external_programs', 'GenomeAnalysisTK',
                             'GenomeAnalysisTK.jar'
                         ],
                         picard_jar=False,
                         samtools_exe=False,
                         use_java='java',
                         force=False,
                         mem_num_gigs=2,
                         max_cpus=-1):
        """Run GATK RealignerTargetCreator then IndelRealigner on each BAM.

        Steps, in order:

        1. Check that ``self.paths_to_BAMs_dd_si`` exists and that every path
           in it is present on disk (``AssertionError`` otherwise).
        2. Always (re)create ``genome_sequences/<genome_id>.dict`` with Picard
           CreateSequenceDictionary and (re)index the genome with
           ``samtools faidx``.
        3. For each BAM, write ``<BAM minus 4 chars>.intervals`` with
           RealignerTargetCreator (skipped if present and not ``force``).
        4. For each BAM, write ``<BAM minus 4 chars>_realn.bam`` with
           IndelRealigner (skipped if present and not ``force``).

        ``self.ready_BAMs`` is then set to a one-element list holding the list
        of ``_realn.bam`` paths in input order.

        GATK jobs in steps 3 and 4 run as separate processes; after each
        launch the method blocks while the count of unfinished jobs is at or
        above ``_decide_max_processes(max_cpus)``.  Jobs are reaped via their
        own ``Popen`` handles so every exit status is known and unrelated
        child processes are not reaped.  Any command (including those in
        step 2) that exits non-zero is reported with a printed warning at the
        end; no exception is raised for it.

        Parameters:
            jar: path components of the GATK jar, joined with the OS path
                separator.
            picard_jar: Picard jar path; falsy means use
                ``_get_jar_path('picard')``.
            samtools_exe: samtools executable; falsy means use
                ``_get_exe_path('samtools')``.
            use_java: java executable used for Picard and GATK.
            force: regenerate ``.intervals`` and ``_realn.bam`` files that
                already exist.
            mem_num_gigs: number placed in ``-Xmx<N>g`` for
                RealignerTargetCreator.
            max_cpus: forwarded to ``_decide_max_processes``.

        Raises:
            AssertionError: if ``paths_to_BAMs_dd_si`` is missing or one of
                its files does not exist.
        """
        # Local import: only the throttling loop below needs it.
        from time import sleep

        # GATK is manually downloaded by user and placed in folder of their choice
        jar = _os.path.sep.join(jar)

        if not picard_jar:
            picard_jar = _get_jar_path('picard')

        if not samtools_exe:
            samtools_exe = _get_exe_path('samtools')

        genome_fna = 'genome_sequences/{}.fna'.format(self.genome_id)

        e1 = 'Could not find "paths_to_BAMs_dd_si" attribute. Before starting GATK analysis, read alignments must have duplicates removed. Please run: .toBAMS(), .removeDuplicates(), .sortIndexBAMs() methods on this SAMs instance, or --deduplicate if using baga_cli.py.'

        # Raised explicitly (not via `assert`) so the checks survive `python -O`.
        if not hasattr(self, 'paths_to_BAMs_dd_si'):
            raise AssertionError(e1)

        e2 = 'Could not find %s. Please ensure file exists'

        for BAM in self.paths_to_BAMs_dd_si:
            if not _os.path.exists(BAM):
                raise AssertionError(e2 % BAM)

        active = {}  # Popen handle -> command list, for jobs not yet reaped
        failed = []  # (command list, exit status) of commands that failed

        def run_blocking(cmd):
            # Run one command to completion, recording a non-zero exit.
            status = _subprocess.call(cmd)
            if status != 0:
                failed.append((cmd, status))

        def reap_finished():
            # Remove exited jobs from `active`, remembering the failures.
            for proc in [p for p in active if p.poll() is not None]:
                done_cmd = active.pop(proc)
                if proc.returncode != 0:
                    failed.append((done_cmd, proc.returncode))

        def launch(cmd):
            # Start one job, then hold until a slot is free for the next one.
            print(' '.join(map(str, cmd)))
            active[_subprocess.Popen(cmd, shell=False)] = cmd
            while True:
                reap_finished()
                # `not active` ensures termination if max_processes <= 0.
                if not active or len(active) < max_processes:
                    break
                sleep(1)

        def drain():
            # Block until every job launched so far has exited.
            for proc in list(active):
                proc.wait()
            reap_finished()

        # always (re)generate dict in case of upstream changes in data
        print('Creating sequence dictionary for %s' % genome_fna)
        run_blocking([
            use_java, '-jar', picard_jar, 'CreateSequenceDictionary', 'R=',
            genome_fna, 'O=', genome_fna[:-4] + '.dict'
        ])

        # always (re)index in case of upstream changes in data
        print('Writing index files for %s' % genome_fna)
        run_blocking([samtools_exe, 'faidx', genome_fna])

        max_processes = _decide_max_processes(max_cpus)

        # Phase 1: realignment target intervals.
        for BAM in self.paths_to_BAMs_dd_si:
            intervals = BAM[:-4] + '.intervals'
            if _os.path.exists(intervals) and not force:
                print('Found:')
                print(intervals)
                print('use "force = True" to overwrite')
                continue
            # '--validation_strictness', 'LENIENT' could be appended here.
            launch([
                use_java,
                '-Xmx%sg' % mem_num_gigs, '-jar', jar, '-T',
                'RealignerTargetCreator', '-R', genome_fna, '-I', BAM,
                '-o', intervals
            ])

        # Phase 2 reads the .intervals files, so phase 1 must be complete.
        drain()

        # Phase 2: indel realignment.
        paths_to_BAMs_dd_si_ra = []
        for BAM in self.paths_to_BAMs_dd_si:
            intervals = BAM[:-4] + '.intervals'
            bam_out = BAM[:-4] + '_realn.bam'
            paths_to_BAMs_dd_si_ra.append(bam_out)
            if _os.path.exists(bam_out) and not force:
                print('Found:')
                print(bam_out)
                print('use "force = True" to overwrite')
                continue
            # NOTE(review): heap size is fixed at 4g here and does not follow
            # mem_num_gigs -- confirm whether that is intended.
            launch([
                use_java, '-Xmx4g', '-jar', jar, '-T', 'IndelRealigner',
                '-R', genome_fna, '-I', BAM, '-targetIntervals', intervals,
                '-o', bam_out, '--filter_bases_not_stored'
            ])

        drain()

        for failed_cmd, status in failed:
            print('WARNING: command exited with status %s: %s' %
                  (status, ' '.join(map(str, failed_cmd))))

        # the last list of BAMs in ready_BAMs is input for CallgVCFsGATK
        # both IndelRealignGATK and recalibBaseScoresGATK put here
        self.ready_BAMs = [paths_to_BAMs_dd_si_ra]