def removeDuplicates(self, path_to_jar=False, force=False, mem_num_gigs=2, max_cpus=-1):
    """
    Run Picard MarkDuplicates on every BAM listed in self.paths_to_BAMs.

    For each input path the last four characters (assumed to be ".bam") are
    replaced by "_dd.bam" for the output and by "_dd.log" for the file given
    to Picard's "M=" option. Jobs are started as child processes and at most
    the number returned by _decide_max_processes(max_cpus) run at once.

    Parameters
    ----------
    path_to_jar : str or False
        Picard jar to run. If false, _get_jar_path('picard') is used.
    force : bool
        If true, run Picard even when the output BAM already exists.
    mem_num_gigs : int
        Passed to java as "-Xmx<mem_num_gigs>g" for each job.
    max_cpus : int
        Passed to _decide_max_processes() to choose the concurrent job limit.

    Side effects
    ------------
    Sets self.paths_to_BAMs_dd to the output paths, in input order, including
    outputs that were found on disk and skipped. A job ending with a non-zero
    exit status is reported with a printed warning; its path is still
    recorded, so callers should check the warnings.
    """
    # local import: only needed for the polling interval while throttling
    import time as _time

    if not path_to_jar:
        path_to_jar = _get_jar_path('picard')

    max_processes = _decide_max_processes(max_cpus)
    print('Print: will use %s cpus for picard' % max_processes)

    # running jobs: Popen object -> command list (kept for failure reports)
    running = {}

    def _report(proc, cmd):
        # surface failures that would otherwise go unnoticed
        if proc.returncode != 0:
            print('WARNING: exit status %s from: %s' % (
                proc.returncode, ' '.join(map(str, cmd))))

    def _prune():
        # Popen.poll() reaps the child itself, so the exit status is kept;
        # os.wait() would reap an arbitrary child and lose the status
        for proc in [p for p in running if p.poll() is not None]:
            _report(proc, running.pop(proc))

    paths_to_BAMs_dd = []
    for BAM in self.paths_to_BAMs:
        BAM_out = BAM[:-4] + '_dd.bam'
        if not _os.path.exists(BAM_out) or force:
            # optional extra arguments, currently disabled:
            # 'VALIDATION_STRINGENCY=', 'LENIENT'
            picard_command = [
                'MarkDuplicates',
                'I=', BAM,
                'O=', BAM_out,
                'M=', BAM[:-4] + '_dd.log',
            ]
            cmd = [
                'java', '-Xmx%sg' % mem_num_gigs,
                '-jar', path_to_jar,
            ] + picard_command
            running[_subprocess.Popen(cmd, shell=False)] = cmd
            print('Called: %s' % (' '.join(map(str, cmd))))
            print('processes: %s, max_processes: %s' % (len(running), max_processes))
            # block until a slot is free; "running and" guards against a
            # non-positive limit spinning forever on an empty job table
            _prune()
            while running and len(running) >= max_processes:
                _time.sleep(1)
                _prune()
        else:
            print('Found:')
            print(BAM_out)
            print('use "force = True" to overwrite')

        paths_to_BAMs_dd += [BAM_out]

    # wait for every remaining child and report its exit status
    while running:
        proc, cmd = running.popitem()
        proc.wait()
        _report(proc, cmd)

    self.paths_to_BAMs_dd = paths_to_BAMs_dd
def removeDuplicates(self, path_to_jar=False, force=False, mem_num_gigs=2, max_cpus=-1):
    """
    Run Picard MarkDuplicates on every BAM listed in self.paths_to_BAMs.

    For each input path the last four characters (assumed to be ".bam") are
    replaced by "_dd.bam" for the output and by "_dd.log" for the file given
    to Picard's "M=" option. Jobs are started as child processes and at most
    the number returned by _decide_max_processes(max_cpus) run at once.

    Parameters
    ----------
    path_to_jar : str or False
        Picard jar to run. If false, _get_jar_path('picard') is used.
    force : bool
        If true, run Picard even when the output BAM already exists.
    mem_num_gigs : int
        Passed to java as "-Xmx<mem_num_gigs>g" for each job.
    max_cpus : int
        Passed to _decide_max_processes() to choose the concurrent job limit.

    Side effects
    ------------
    Sets self.paths_to_BAMs_dd to the output paths, in input order, including
    outputs that were found on disk and skipped. A job ending with a non-zero
    exit status is reported with a printed warning; its path is still
    recorded, so callers should check the warnings.
    """
    # local import: only needed for the polling interval while throttling
    import time as _time

    if not path_to_jar:
        path_to_jar = _get_jar_path('picard')

    max_processes = _decide_max_processes(max_cpus)
    print('Print: will use %s cpus for picard' % max_processes)

    # running jobs: Popen object -> command list (kept for failure reports)
    running = {}

    def _report(proc, cmd):
        # surface failures that would otherwise go unnoticed
        if proc.returncode != 0:
            print('WARNING: exit status %s from: %s' % (
                proc.returncode, ' '.join(map(str, cmd))))

    def _prune():
        # Popen.poll() reaps the child itself, so the exit status is kept;
        # os.wait() would reap an arbitrary child and lose the status
        for proc in [p for p in running if p.poll() is not None]:
            _report(proc, running.pop(proc))

    paths_to_BAMs_dd = []
    for BAM in self.paths_to_BAMs:
        BAM_out = BAM[:-4] + '_dd.bam'
        if not _os.path.exists(BAM_out) or force:
            # optional extra arguments, currently disabled:
            # 'VALIDATION_STRINGENCY=', 'LENIENT'
            picard_command = [
                'MarkDuplicates',
                'I=', BAM,
                'O=', BAM_out,
                'M=', BAM[:-4] + '_dd.log',
            ]
            cmd = [
                'java', '-Xmx%sg' % mem_num_gigs,
                '-jar', path_to_jar,
            ] + picard_command
            running[_subprocess.Popen(cmd, shell=False)] = cmd
            print('Called: %s' % (' '.join(map(str, cmd))))
            print('processes: %s, max_processes: %s' % (len(running), max_processes))
            # block until a slot is free; "running and" guards against a
            # non-positive limit spinning forever on an empty job table
            _prune()
            while running and len(running) >= max_processes:
                _time.sleep(1)
                _prune()
        else:
            print('Found:')
            print(BAM_out)
            print('use "force = True" to overwrite')

        paths_to_BAMs_dd += [BAM_out]

    # wait for every remaining child and report its exit status
    while running:
        proc, cmd = running.popitem()
        proc.wait()
        _report(proc, cmd)

    self.paths_to_BAMs_dd = paths_to_BAMs_dd
def IndelRealignGATK(self,
        jar=['external_programs', 'GenomeAnalysisTK', 'GenomeAnalysisTK.jar'],
        picard_jar=False,
        samtools_exe=False,
        use_java='java',
        force=False,
        mem_num_gigs=2,
        max_cpus=-1):
    """
    Run GATK RealignerTargetCreator then IndelRealigner on each BAM in
    self.paths_to_BAMs_dd_si.

    Steps performed:
      1. create "<genome>.dict" with Picard CreateSequenceDictionary if it is
         missing, and "<genome>.fna.fai" with "samtools faidx" if missing;
      2. for each BAM, write "<BAM minus last 4 chars>.intervals" with
         RealignerTargetCreator (skipped if present unless force);
      3. for each BAM, write "<BAM minus last 4 chars>_realn.bam" with
         IndelRealigner (skipped if present unless force).
    Steps 2 and 3 each run their jobs as child processes with at most
    _decide_max_processes(max_cpus) at once; step 3 starts only after every
    step 2 job has ended.

    Parameters
    ----------
    jar : sequence of str
        Path components of the GATK jar, joined with os.path.sep. The default
        list is only read, never mutated.
    picard_jar : str or False
        Picard jar; if false, _get_jar_path('picard') is used.
    samtools_exe : str or False
        samtools executable; if false, _get_exe_path('samtools') is used.
    use_java : str
        Java executable used for the Picard and GATK commands.
    force : bool
        If true, regenerate interval files and realigned BAMs that exist.
    mem_num_gigs : int
        Passed to java as "-Xmx<mem_num_gigs>g" for RealignerTargetCreator.
    max_cpus : int
        Passed to _decide_max_processes() to choose the concurrent job limit.

    Raises
    ------
    AssertionError
        If self.paths_to_BAMs_dd_si is missing or one of its files does not
        exist.

    Side effects
    ------------
    Sets self.ready_BAMs to a one-element list containing the list of
    realigned BAM paths. Commands ending with a non-zero exit status are
    reported with a printed warning.
    """
    # local import: only needed for the polling interval while throttling
    import time as _time

    # GATK is manually downloaded by user and placed in folder of their choice
    jar = _os.path.sep.join(jar)
    if not picard_jar:
        picard_jar = _get_jar_path('picard')
    if not samtools_exe:
        samtools_exe = _get_exe_path('samtools')

    genome_fna = 'genome_sequences/{}.fna'.format(self.genome_id)
    genome_dict = genome_fna[:-4] + '.dict'

    # raised explicitly (same exception type as before) so that the checks
    # are not stripped when Python runs with -O
    e1 = ('Could not find "paths_to_BAMs_dd_si" attribute. Before starting GATK '
          'analysis, read alignments must have duplicates removed. Please run: '
          '.toBAMS(), .removeDuplicates(), .sortIndexBAMs() methods on this SAMs '
          'instance, or --deduplicate if using baga_cli.py.')
    if not hasattr(self, 'paths_to_BAMs_dd_si'):
        raise AssertionError(e1)

    e2 = 'Could not find %s. Please ensure file exists'
    for BAM in self.paths_to_BAMs_dd_si:
        if not _os.path.exists(BAM):
            raise AssertionError(e2 % BAM)

    def _warn_if_failed(status, cmd):
        # surface failures that would otherwise go unnoticed
        if status != 0:
            print('WARNING: exit status %s from: %s' % (
                status, ' '.join(map(str, cmd))))

    def _call_and_check(cmd):
        # blocking call for the one-off reference preparation steps
        _warn_if_failed(_subprocess.call(cmd), cmd)

    if not _os.path.exists(genome_dict):
        print('Creating sequence dictionary for %s' % genome_fna)
        _call_and_check([use_java, '-jar', picard_jar,
                         'CreateSequenceDictionary',
                         'R=', genome_fna,
                         'O=', genome_dict])

    # only the faidx index is checked here (not the ann/pac/amb/bwt/sa files)
    if not _os.path.exists(genome_fna + '.fai'):
        print('Writing index files for %s' % genome_fna)
        _call_and_check([samtools_exe, 'faidx', genome_fna])

    max_processes = _decide_max_processes(max_cpus)

    # running jobs: Popen object -> command list (kept for failure reports)
    running = {}

    def _prune():
        # Popen.poll() reaps the child itself, so the exit status is kept;
        # os.wait() would reap an arbitrary child and lose the status
        for proc in [p for p in running if p.poll() is not None]:
            _warn_if_failed(proc.returncode, running.pop(proc))

    def _launch(cmd):
        # start one job, then block until a slot is free again
        print(' '.join(map(str, cmd)))
        running[_subprocess.Popen(cmd, shell=False)] = cmd
        _prune()
        # "running and" guards against a non-positive limit spinning forever
        while running and len(running) >= max_processes:
            _time.sleep(1)
            _prune()

    def _drain():
        # wait for all jobs and empty the table so that finished jobs from
        # one stage never count against the limit of the next stage
        while running:
            proc, cmd = running.popitem()
            _warn_if_failed(proc.wait(), cmd)

    # stage 1: find intervals needing realignment
    for BAM in self.paths_to_BAMs_dd_si:
        intervals = BAM[:-4] + '.intervals'
        if not _os.path.exists(intervals) or force:
            # optional extra arguments, currently disabled:
            # '--validation_strictness', 'LENIENT'
            _launch([use_java, '-Xmx%sg' % mem_num_gigs,
                     '-jar', jar,
                     '-T', 'RealignerTargetCreator',
                     '-R', genome_fna,
                     '-I', BAM,
                     '-o', intervals])
        else:
            print('Found:')
            print(intervals)
            print('use "force = True" to overwrite')

    # every interval file must be complete before realignment starts
    _drain()

    # stage 2: realign around the intervals
    paths_to_BAMs_dd_si_ra = []
    for BAM in self.paths_to_BAMs_dd_si:
        intervals = BAM[:-4] + '.intervals'
        bam_out = BAM[:-4] + '_realn.bam'
        if not _os.path.exists(bam_out) or force:
            # NOTE(review): heap size is fixed at 4g here and ignores
            # mem_num_gigs, unlike stage 1 -- confirm whether intentional
            _launch([use_java, '-Xmx4g',
                     '-jar', jar,
                     '-T', 'IndelRealigner',
                     '-R', genome_fna,
                     '-I', BAM,
                     '-targetIntervals', intervals,
                     '-o', bam_out,
                     '--filter_bases_not_stored'])
        else:
            print('Found:')
            print(bam_out)
            print('use "force = True" to overwrite')

        paths_to_BAMs_dd_si_ra += [bam_out]

    _drain()

    # the last list of BAMs in ready_BAMs is input for CallgVCFsGATK
    # both IndelRealignGATK and recalibBaseScoresGATK put here
    self.ready_BAMs = [paths_to_BAMs_dd_si_ra]
def IndelRealignGATK(self,
        jar=['external_programs', 'GenomeAnalysisTK', 'GenomeAnalysisTK.jar'],
        picard_jar=False,
        samtools_exe=False,
        use_java='java',
        force=False,
        mem_num_gigs=2,
        max_cpus=-1):
    """
    Run GATK RealignerTargetCreator then IndelRealigner on each BAM in
    self.paths_to_BAMs_dd_si.

    Steps performed:
      1. always run Picard CreateSequenceDictionary to write "<genome>.dict"
         and "samtools faidx" on the genome FASTA;
      2. for each BAM, write "<BAM minus last 4 chars>.intervals" with
         RealignerTargetCreator (skipped if present unless force);
      3. for each BAM, write "<BAM minus last 4 chars>_realn.bam" with
         IndelRealigner (skipped if present unless force).
    Steps 2 and 3 each run their jobs as child processes with at most
    _decide_max_processes(max_cpus) at once; step 3 starts only after every
    step 2 job has ended.

    Parameters
    ----------
    jar : sequence of str
        Path components of the GATK jar, joined with os.path.sep. The default
        list is only read, never mutated.
    picard_jar : str or False
        Picard jar; if false, _get_jar_path('picard') is used.
    samtools_exe : str or False
        samtools executable; if false, _get_exe_path('samtools') is used.
    use_java : str
        Java executable used for the Picard and GATK commands.
    force : bool
        If true, regenerate interval files and realigned BAMs that exist.
    mem_num_gigs : int
        Passed to java as "-Xmx<mem_num_gigs>g" for RealignerTargetCreator.
    max_cpus : int
        Passed to _decide_max_processes() to choose the concurrent job limit.

    Raises
    ------
    AssertionError
        If self.paths_to_BAMs_dd_si is missing or one of its files does not
        exist.

    Side effects
    ------------
    Sets self.ready_BAMs to a one-element list containing the list of
    realigned BAM paths. Commands ending with a non-zero exit status are
    reported with a printed warning.
    """
    # local import: only needed for the polling interval while throttling
    import time as _time

    # GATK is manually downloaded by user and placed in folder of their choice
    jar = _os.path.sep.join(jar)
    if not picard_jar:
        picard_jar = _get_jar_path('picard')
    if not samtools_exe:
        samtools_exe = _get_exe_path('samtools')

    genome_fna = 'genome_sequences/{}.fna'.format(self.genome_id)

    # raised explicitly (same exception type as before) so that the checks
    # are not stripped when Python runs with -O
    e1 = ('Could not find "paths_to_BAMs_dd_si" attribute. Before starting GATK '
          'analysis, read alignments must have duplicates removed. Please run: '
          '.toBAMS(), .removeDuplicates(), .sortIndexBAMs() methods on this SAMs '
          'instance, or --deduplicate if using baga_cli.py.')
    if not hasattr(self, 'paths_to_BAMs_dd_si'):
        raise AssertionError(e1)

    e2 = 'Could not find %s. Please ensure file exists'
    for BAM in self.paths_to_BAMs_dd_si:
        if not _os.path.exists(BAM):
            raise AssertionError(e2 % BAM)

    def _warn_if_failed(status, cmd):
        # surface failures that would otherwise go unnoticed
        if status != 0:
            print('WARNING: exit status %s from: %s' % (
                status, ' '.join(map(str, cmd))))

    def _call_and_check(cmd):
        # blocking call for the one-off reference preparation steps
        _warn_if_failed(_subprocess.call(cmd), cmd)

    # always (re)generate dict in case of upstream changes in data
    # NOTE(review): some Picard versions may refuse to overwrite an existing
    # .dict file -- if so a warning is printed below; confirm and consider
    # removing the old file first
    print('Creating sequence dictionary for %s' % genome_fna)
    _call_and_check([use_java, '-jar', picard_jar,
                     'CreateSequenceDictionary',
                     'R=', genome_fna,
                     'O=', genome_fna[:-4] + '.dict'])

    # always (re)index in case of upstream changes in data
    print('Writing index files for %s' % genome_fna)
    _call_and_check([samtools_exe, 'faidx', genome_fna])

    max_processes = _decide_max_processes(max_cpus)

    # running jobs: Popen object -> command list (kept for failure reports)
    running = {}

    def _prune():
        # Popen.poll() reaps the child itself, so the exit status is kept;
        # os.wait() would reap an arbitrary child and lose the status
        for proc in [p for p in running if p.poll() is not None]:
            _warn_if_failed(proc.returncode, running.pop(proc))

    def _launch(cmd):
        # start one job, then block until a slot is free again
        print(' '.join(map(str, cmd)))
        running[_subprocess.Popen(cmd, shell=False)] = cmd
        _prune()
        # "running and" guards against a non-positive limit spinning forever
        while running and len(running) >= max_processes:
            _time.sleep(1)
            _prune()

    def _drain():
        # wait for all jobs and empty the table so that finished jobs from
        # one stage never count against the limit of the next stage
        while running:
            proc, cmd = running.popitem()
            _warn_if_failed(proc.wait(), cmd)

    # stage 1: find intervals needing realignment
    for BAM in self.paths_to_BAMs_dd_si:
        intervals = BAM[:-4] + '.intervals'
        if not _os.path.exists(intervals) or force:
            # optional extra arguments, currently disabled:
            # '--validation_strictness', 'LENIENT'
            _launch([use_java, '-Xmx%sg' % mem_num_gigs,
                     '-jar', jar,
                     '-T', 'RealignerTargetCreator',
                     '-R', genome_fna,
                     '-I', BAM,
                     '-o', intervals])
        else:
            print('Found:')
            print(intervals)
            print('use "force = True" to overwrite')

    # every interval file must be complete before realignment starts
    _drain()

    # stage 2: realign around the intervals
    paths_to_BAMs_dd_si_ra = []
    for BAM in self.paths_to_BAMs_dd_si:
        intervals = BAM[:-4] + '.intervals'
        bam_out = BAM[:-4] + '_realn.bam'
        if not _os.path.exists(bam_out) or force:
            # NOTE(review): heap size is fixed at 4g here and ignores
            # mem_num_gigs, unlike stage 1 -- confirm whether intentional
            _launch([use_java, '-Xmx4g',
                     '-jar', jar,
                     '-T', 'IndelRealigner',
                     '-R', genome_fna,
                     '-I', BAM,
                     '-targetIntervals', intervals,
                     '-o', bam_out,
                     '--filter_bases_not_stored'])
        else:
            print('Found:')
            print(bam_out)
            print('use "force = True" to overwrite')

        paths_to_BAMs_dd_si_ra += [bam_out]

    _drain()

    # the last list of BAMs in ready_BAMs is input for CallgVCFsGATK
    # both IndelRealignGATK and recalibBaseScoresGATK put here
    self.ready_BAMs = [paths_to_BAMs_dd_si_ra]