def make_index_and_hash(data_folder, adaID, fragment, VERBOSE=0, summary=True):
    '''Make index and hash files for consensus'''
    frag_gen = fragment[:2]

    # NOTE: we can use --overwrite here, because there is no concurrency (every
    # job has its own hash)
    # 1. Make genome index file
    sp.call([stampy_bin,
             '--species="HIV fragment '+frag_gen+'"',
             '--overwrite',
             '-G', get_index_file(data_folder, adaID, frag_gen, ext=False),
             get_consensus_filename(data_folder, adaID, frag_gen,
                                    trim_primers=True),
            ])
    if VERBOSE:
        print 'Built index: '+adaID+' '+frag_gen

    # 2. Build a hash file
    sp.call([stampy_bin,
             '--overwrite',
             '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
             '-H', get_hash_file(data_folder, adaID, frag_gen, ext=False),
            ])
    if VERBOSE:
        print 'Built hash: '+adaID+' '+frag_gen

    if summary:
        with open(get_map_summary_filename(data_folder, adaID, frag_gen), 'a') as f:
            f.write('\n')
            f.write('Stampy index and hash written.')
            f.write('\n')
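# For reference, the two sp.call invocations above correspond to running
# stampy by hand in roughly this form (a sketch: the actual file names come
# from the filename helpers and depend on the data folder layout):
#
#   <stampy_bin> --overwrite -G <index_prefix> <consensus.fasta>   # 1. genome index
#   <stampy_bin> --overwrite -g <index_prefix> -H <hash_prefix>    # 2. hash table
#
# map_stampy below then maps reads against this pair with
# `-g <index_prefix> -h <hash_prefix> -M <reads.bam>`.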
def map_stampy(data_folder, adaID, fragment, VERBOSE=0, threads=1,
               cluster_time='23:59:59', maxreads=-1, summary=True,
               rescue=False, dry=False):
    '''Map using stampy'''
    frag_gen = fragment[:2]

    if summary:
        summary_filename = get_map_summary_filename(data_folder, adaID,
                                                    frag_gen, rescue=rescue)

    # Set mapping penalty scores: softer for rescues and F3 and F5
    # (subsrate has a module-level default; only rescue runs override it)
    global subsrate
    if rescue:
        subsrate = '0.2'
        stampy_gapopen = 5          # Default: 40
        stampy_gapextend = 1        # Default: 3
    elif frag_gen not in ('F3', 'F5'):
        stampy_gapopen = 60         # Default: 40
        stampy_gapextend = 5        # Default: 3
    else:
        stampy_gapopen = 30         # Default: 40
        stampy_gapextend = 2        # Default: 3

    if VERBOSE:
        print 'Map via stampy: '+adaID+' '+frag_gen

    if not rescue:
        input_filename = get_divided_filename(data_folder, adaID, fragment,
                                              type='bam')
        # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
        if not os.path.isfile(input_filename):
            if frag_gen == 'F3':
                input_filename = input_filename.replace('F3a', 'F3')
    else:
        input_filename = get_divided_filename(data_folder, adaID, 'unmapped',
                                              type='bam')

    # Check existence of the input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')
        raise ValueError(adaID+', fragment '+fragment+': input file not found.')

    # Parallelize if requested
    if threads == 1:
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', rescue=rescue)

        # Map
        call_list = [stampy_bin,
                     '-g', get_index_file(data_folder, adaID, frag_gen,
                                          ext=False),
                     '-h', get_hash_file(data_folder, adaID, frag_gen,
                                         ext=False),
                     '-o', output_filename,
                     '--overwrite',
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')

        # Take only a (random) subsample: stampy uses the fraction of reads
        # instead of the number
        if maxreads > 0:
            # FIXME: figure out the -s option and the --numrecords option
            call_list.extend(['--numrecords', maxreads])
            #n_pairs_tot = get_number_reads(input_filename, 'bam') / 2
            #frac_pairs = 1.0 * maxreads / n_pairs_tot
            #random_seed = np.random.randint(1e5)
            #call_list.extend(['-s', frac_pairs + random_seed])

        call_list = call_list + ['-M', input_filename]
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)

        if not dry:
            sp.call(call_list)

            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Stampy mapped (single thread).\n')

            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='bam', rescue=rescue)
            convert_sam_to_bam(output_filename)
        else:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (single thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (single thread)'
        return

    else:
        # Submit one map job per part to the cluster
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')
        for j in xrange(threads):

            # Get output filename
            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='sam', part=(j+1),
                                                  rescue=rescue)
            # Map
            call_list = ['qsub', '-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         '-N', 'm'+adaID.replace('-', '')+frag_gen+str(j+1),
                         '-l', 'h_rt='+cluster_time,
                         '-l', 'h_vmem='+vmem,
                         stampy_bin,
                         '-g', get_index_file(data_folder, adaID, frag_gen,
                                              ext=False),
                         '-h', get_hash_file(data_folder, adaID, frag_gen,
                                             ext=False),
                         '-o', output_filename,
                         '--overwrite',
                         '--processpart='+str(j+1)+'/'+str(threads),
                         '--substitutionrate='+subsrate,
                         '--gapopen', stampy_gapopen,
                         '--gapextend', stampy_gapextend]
            if stampy_sensitive:
                call_list.append('--sensitive')
            call_list = call_list + ['-M', input_filename]
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            if not dry:
                # qsub prints 'Your job <ID> ("<name>") has been submitted';
                # the third whitespace-separated token is the job ID
                job_ID = sp.check_output(call_list)
                job_ID = job_ID.split()[2]
                job_IDs[j] = job_ID

        if dry:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (multi thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (multi thread)'
            return

        # Monitor output
        output_file_parts = [get_mapped_filename(data_folder, adaID, frag_gen,
                                                 type='bam', part=(j+1),
                                                 rescue=rescue)
                             for j in xrange(threads)]
        time_wait = 10 # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1] # The last is an empty line
            if len(qstat_output) < 3:
                # Only the two header lines (or nothing) left: all jobs finished
                jobs_done[:] = True
                break
            else:
                # Skip the two header lines; keep the job ID column
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10 # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert mapped reads to BAM for merging: adaID '+\
                               adaID+', fragment '+frag_gen+', part '+str(j+1)+\
                               ' of '+str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy mapped ('+str(threads)+' threads).\n')

        # Concatenate output files
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='bam', unsorted=True,
                                              rescue=rescue)
        if VERBOSE >= 1:
            print 'Concatenate mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.cat('-o', output_filename, *output_file_parts)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        output_filename_sorted = get_mapped_filename(data_folder, adaID,
                                                     frag_gen, type='bam',
                                                     unsorted=False,
                                                     rescue=rescue)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file in place (avoids a BAM -> SAM -> BAM round trip)
        if VERBOSE >= 1:
            print 'Reheader mapped reads: adaID '+adaID+', fragment '+frag_gen
        header_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', part=1, rescue=rescue)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheadered.\n')

    # FIXME: check whether temp files are all deleted
    if VERBOSE >= 1:
        print 'Remove temporary files: adaID '+adaID+', fragment '+frag_gen
    remove_mapped_tempfiles(data_folder, adaID, frag_gen,
                            VERBOSE=VERBOSE, rescue=rescue)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
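# A minimal usage sketch for the two functions above (the adaID and fragment
# values are hypothetical; data_folder must follow the layout expected by the
# filename helpers in this module):
#
#   make_index_and_hash(data_folder, 'TS2', 'F1', VERBOSE=1)
#   map_stampy(data_folder, 'TS2', 'F1', VERBOSE=1, threads=4)
#
# With threads=1 stampy runs in the current process; with threads>1 each part
# is submitted via qsub and polled with qstat until all parts finish.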
for fragment in fragments_sample:
    frag_gen = fragment[:2]

    # Submit to the cluster self if requested
    if submit:
        fork_self(seq_run, adaID, frag_gen,
                  VERBOSE=VERBOSE, threads=threads,
                  maxreads=maxreads, filter_reads=filter_reads,
                  summary=summary, rescue=use_rescue)
        continue

    if summary:
        sfn = get_map_summary_filename(data_folder, adaID, frag_gen,
                                       rescue=use_rescue)
        with open(sfn, 'w') as f:
            f.write('Call: python map_to_consensus.py'+\
                    ' --run '+seq_run+\
                    ' --adaIDs '+adaID+\
                    ' --fragments '+frag_gen+\
                    ' --threads '+str(threads)+\
                    ' --verbose '+str(VERBOSE))
            if maxreads != -1:
                f.write(' --maxreads '+str(maxreads))
            if filter_reads:
                f.write(' --filter')
            f.write('\n')

    if not check_consensus_length(data_folder, adaID, fragment,
                                  VERBOSE=VERBOSE):