def run_postprocess(postprocess_cmd_arr, file_basecount_dict, workflow_prov, err_log, stat_log): ''' Prepare postprocess command with input/output paths according to sample name, pass commands to yap_tee or subprocess for execution. ''' if wd.regroup_output == 'yes': workflow_output_path = wd.workflow_output_path + "/regroup_output" else: workflow_output_path = wd.workflow_output_path for zz in range(0, len(postprocess_cmd_arr)): postprocess_tee_arr = [] postprocess_nontee_arr = [] initial_pipe_commands = [] postprocess_temp_arr = [] cmd_type = postprocess_cmd_arr[zz][0] cmd_meta_data = postprocess_cmd_arr[zz][1] postprocess_temp_arr = postprocess_cmd_arr[zz][2] input_file_extension = '' pipe1 = '' pipe2 = '' #set default input directory for postprocess stage as aligner_output #user can specify "postprocess_output" through configuration file input_directory = "aligner_output" for kk in range(0, len(cmd_meta_data)): input_meta_data = cmd_meta_data[kk].split(" ") if input_meta_data: if re.search('input_file_type', input_meta_data[0]) is not None: input_file_extension = input_meta_data[1] if re.search('input_directory', input_meta_data[0]) is not None: input_directory = input_meta_data[1] '''iterate over filename and barcode, fetch files from the source directory, file extensions and python glob module''' for filename_key in file_basecount_dict.iterkeys(): #fetch original input file pair for this sample for tmp_arr in wd.inp_files_list: if tmp_arr[2] == filename_key: #store it in variables pipe1 and pipe2 pipe1 = tmp_arr[0] pipe2 = tmp_arr[1] break postprocess_input_file_arr = [] path, file_name = os.path.split(filename_key) if wd.run_preprocess_analysis == "no" and wd.run_reference_alignment == "no" and wd.run_postprocess_analysis == "yes": file_name, extension = os.path.splitext(file_name) file_name, extension = os.path.splitext(file_name) file_basecount_dict[filename_key] = { 'no_barcode_specified': [] } for barcode in file_basecount_dict[filename_key]: barcode_value = yap_tools.rename_barcode(barcode) aligner_dir_path = '' postprocess_dir_path = '' aligner_dir_path = workflow_output_path + "/" + file_name + "/" + barcode + "/" + input_directory postprocess_input = aligner_dir_path + "/" + "*" + input_file_extension + "*" postprocess_input_file_arr = glob.glob(postprocess_input) if wd.run_preprocess_analysis == "no" and wd.run_reference_alignment == "no" and wd.run_postprocess_analysis == "yes": if input_directory == "aligner_output": aligner_dir_path = path postprocess_input = filename_key temp_arr = glob.glob(aligner_dir_path + "/" + "*" + input_file_extension + "*") if temp_arr > 0: for k in temp_arr: if k == postprocess_input: postprocess_input_file_arr = [ postprocess_input ] if input_file_extension == '': postprocess_input_file_arr = [] postprocess_dir_path = workflow_output_path + "/" + file_name + "/" + barcode + "/" + "postprocess_output" postprocess_input_file_arr.sort() if (len(postprocess_input_file_arr) > 0): if wd.run_preprocess_analysis == "no" and wd.run_reference_alignment == "no" and wd.run_postprocess_analysis == "yes": if input_directory == "aligner_output": if postprocess_input_file_arr[0] == filename_key: pass else: break for k in range(0, len(postprocess_temp_arr)): postprocess_cmd = postprocess_temp_arr[k][1] postprocess_cmd_name = postprocess_temp_arr[k][0] if file_name == '': if barcode_value != '': postprocess_outfile = postprocess_dir_path + "/" + \ barcode_value + "_" + postprocess_cmd_name else: postprocess_outfile = postprocess_dir_path + \ "/" + postprocess_cmd_name else: if barcode_value != '': postprocess_outfile = postprocess_dir_path + "/" + \ file_name + "_" + barcode_value + \ "_" + postprocess_cmd_name else: postprocess_outfile = postprocess_dir_path + \ "/" + file_name + "_" + \ postprocess_cmd_name #replace generic keywords with appropriate file path variables postprocess_cmd = postprocess_cmd.replace( 'input_directory', '') postprocess_cmd = postprocess_cmd.replace( 'input_file_type' + ' ' + input_file_extension, '') postprocess_cmd = postprocess_cmd.replace( "aligner_output", '') postprocess_cmd = postprocess_cmd.replace( "postprocess_output", '') postprocess_cmd = postprocess_cmd.replace( 'output_file', postprocess_outfile) postprocess_cmd = postprocess_cmd.replace( 'output_directory', postprocess_dir_path) postprocess_cmd = postprocess_cmd.replace(' =', '=') postprocess_cmd = postprocess_cmd.replace( 'sample_name', file_name) if re.search("file_based_input", postprocess_cmd) is not None: postprocess_cmd = postprocess_cmd.replace( 'file_based_input', postprocess_input_file_arr[0]) postprocess_nontee_arr = [ postprocess_cmd_name, postprocess_cmd ] elif re.search("directory_based_input", postprocess_cmd) is not None: postprocess_cmd = postprocess_cmd.replace( 'directory_based_input', aligner_dir_path) postprocess_nontee_arr = [ postprocess_cmd_name, postprocess_cmd ] else: postprocess_tee_arr.append(postprocess_cmd) workflow_prov.append("INPUT: " + postprocess_input_file_arr[0]) for kk in postprocess_tee_arr: if kk != '': workflow_prov.append(kk) if len(postprocess_tee_arr) != 0: #pass commands to yap_tee function yap_tools.yap_tee(initial_pipe_commands, postprocess_tee_arr, postprocess_input_file_arr[0], err_log, stat_log) if len(postprocess_nontee_arr) != 0: #pass commands to non_tee function which uses subproces run_postprocess_nontee(postprocess_nontee_arr, workflow_prov, err_log, stat_log) else: if file_name == '': print "Warning: No aligner output for barcode = ", barcode, " ...skipping the postprocess step for command : \n", postprocess_temp_arr, "\n" else: print "Warning: No aligner output for filename = ", file_name, " barcode = ", barcode, " ...skipping the postprocess step for command: \n", postprocess_temp_arr, "\n" return workflow_prov
def run_postprocess( postprocess_cmd_arr, file_basecount_dict, workflow_prov, err_log, stat_log): ''' Prepare postprocess command with input/output paths according to sample name, pass commands to yap_tee or subprocess for execution. ''' if wd.regroup_output =='yes': workflow_output_path = wd.workflow_output_path + "/regroup_output" else: workflow_output_path= wd.workflow_output_path for zz in range(0, len(postprocess_cmd_arr)): postprocess_tee_arr = [] postprocess_nontee_arr = [] initial_pipe_commands = [] postprocess_temp_arr = [] cmd_type = postprocess_cmd_arr[zz][0] cmd_meta_data = postprocess_cmd_arr[zz][1] postprocess_temp_arr = postprocess_cmd_arr[zz][2] input_file_extension = '' pipe1='' pipe2='' #set default input directory for postprocess stage as aligner_output #user can specify "postprocess_output" through configuration file input_directory = "aligner_output" for kk in range(0, len(cmd_meta_data)): input_meta_data = cmd_meta_data[kk].split(" ") if input_meta_data: if re.search('input_file_type', input_meta_data[0]) is not None: input_file_extension = input_meta_data[1] if re.search('input_directory', input_meta_data[0]) is not None: input_directory = input_meta_data[1] '''iterate over filename and barcode, fetch files from the source directory, file extensions and python glob module''' for filename_key in file_basecount_dict.iterkeys(): #fetch original input file pair for this sample for tmp_arr in wd.inp_files_list: if tmp_arr[2] == filename_key: #store it in variables pipe1 and pipe2 pipe1 = tmp_arr[0] pipe2 = tmp_arr[1] break postprocess_input_file_arr = [] path, file_name = os.path.split(filename_key) if wd.run_preprocess_analysis == "no" and wd.run_reference_alignment == "no" and wd.run_postprocess_analysis == "yes": file_name, extension = os.path.splitext(file_name) file_name, extension = os.path.splitext(file_name) file_basecount_dict[filename_key] = { 'no_barcode_specified': []} for barcode in file_basecount_dict[filename_key]: barcode_value = yap_tools.rename_barcode(barcode) aligner_dir_path = '' postprocess_dir_path = '' aligner_dir_path = workflow_output_path + "/" + file_name + "/" + barcode + "/" + input_directory postprocess_input = aligner_dir_path + "/" + "*" + input_file_extension + "*" postprocess_input_file_arr = glob.glob(postprocess_input) if wd.run_preprocess_analysis == "no" and wd.run_reference_alignment == "no" and wd.run_postprocess_analysis == "yes": if input_directory == "aligner_output": aligner_dir_path = path postprocess_input = filename_key temp_arr = glob.glob(aligner_dir_path + "/" + "*" + input_file_extension + "*") if temp_arr > 0: for k in temp_arr: if k == postprocess_input: postprocess_input_file_arr = [postprocess_input] if input_file_extension == '': postprocess_input_file_arr = [] postprocess_dir_path = workflow_output_path + "/" + file_name + "/" + barcode + "/" + "postprocess_output" postprocess_input_file_arr.sort() if (len(postprocess_input_file_arr) > 0): if wd.run_preprocess_analysis == "no" and wd.run_reference_alignment == "no" and wd.run_postprocess_analysis == "yes": if input_directory == "aligner_output": if postprocess_input_file_arr[0] == filename_key: pass else: break for k in range(0, len(postprocess_temp_arr)): postprocess_cmd = postprocess_temp_arr[k][1] postprocess_cmd_name = postprocess_temp_arr[k][0] if file_name == '': if barcode_value != '': postprocess_outfile = postprocess_dir_path + "/" + \ barcode_value + "_" + postprocess_cmd_name else: postprocess_outfile = postprocess_dir_path + \ "/" + postprocess_cmd_name else: if barcode_value != '': postprocess_outfile = postprocess_dir_path + "/" + \ file_name + "_" + barcode_value + \ "_" + postprocess_cmd_name else: postprocess_outfile = postprocess_dir_path + \ "/" + file_name + "_" + \ postprocess_cmd_name #replace generic keywords with appropriate file path variables postprocess_cmd = postprocess_cmd.replace( 'input_directory', '') postprocess_cmd = postprocess_cmd.replace( 'input_file_type' + ' ' + input_file_extension, '') postprocess_cmd = postprocess_cmd.replace( "aligner_output", '') postprocess_cmd = postprocess_cmd.replace( "postprocess_output", '') postprocess_cmd = postprocess_cmd.replace( 'output_file', postprocess_outfile) postprocess_cmd = postprocess_cmd.replace( 'output_directory', postprocess_dir_path) postprocess_cmd = postprocess_cmd.replace(' =', '=') postprocess_cmd = postprocess_cmd.replace( 'sample_name', file_name) if re.search("file_based_input", postprocess_cmd) is not None: postprocess_cmd = postprocess_cmd.replace( 'file_based_input', postprocess_input_file_arr[0]) postprocess_nontee_arr = [ postprocess_cmd_name, postprocess_cmd] elif re.search("directory_based_input", postprocess_cmd) is not None: postprocess_cmd = postprocess_cmd.replace( 'directory_based_input', aligner_dir_path) postprocess_nontee_arr = [ postprocess_cmd_name, postprocess_cmd] else: postprocess_tee_arr.append(postprocess_cmd) workflow_prov.append( "INPUT: " + postprocess_input_file_arr[0]) for kk in postprocess_tee_arr: if kk != '': workflow_prov.append(kk) if len(postprocess_tee_arr) != 0: #pass commands to yap_tee function yap_tools.yap_tee( initial_pipe_commands, postprocess_tee_arr, postprocess_input_file_arr[0], err_log, stat_log) if len(postprocess_nontee_arr) != 0: #pass commands to non_tee function which uses subproces run_postprocess_nontee( postprocess_nontee_arr, workflow_prov, err_log, stat_log) else: if file_name == '': print "Warning: No aligner output for barcode = ", barcode, " ...skipping the postprocess step for command : \n", postprocess_temp_arr, "\n" else: print "Warning: No aligner output for filename = ", file_name, " barcode = ", barcode, " ...skipping the postprocess step for command: \n", postprocess_temp_arr, "\n" return workflow_prov
def sort_alignment_output( chunk_number, aligner_cmd_name, aligner_cmd, aligner_output_filename, workflow_prov, err_log, stat_log): ''' sorts alignment output based on coordinate or queryname or both ''' initial_pipe_commands = [] sort_flag = 'False' format_ext = '' alignment_file_ext = '' sort_file_ext = '' after_sort_cmd = '' name_sort_cmd = '' coordinate_sort_cmd = '' if wd.alignment_sort_order != 'unsorted': #prepare sort commands, input/output filenames based on file extension aligner_dir_path = os.path.split(aligner_output_filename)[0] aligner_dir_path_tmp, filename_base = os.path.split(aligner_output_filename) if filename_base.find('queryname') == -1 and filename_base.find('coordinate') == -1: if os.path.exists(aligner_output_filename): aligner_output_filename_base, alignment_file_ext = os.path.splitext( aligner_output_filename) if alignment_file_ext == '.gz' or alignment_file_ext == 'bz2': aligner_output_filename_base, format_ext = os.path.splitext( aligner_output_filename_base) sort_aligner_output_filename = aligner_output_filename_base + \ "_" + wd.alignment_sort_order if format_ext == '.sam' or alignment_file_ext == '.sam': initial_pipe_commands = ['samtools view -bhS -'] sort_file_ext = ".sam" name_sort_cmd = " samtools sort -on -m 100000000 - " + \ aligner_output_filename_base + "_" + "queryname" coordinate_sort_cmd = " samtools sort -o -m 100000000 - " + \ aligner_output_filename_base + "_" + "coordinate" name_sort_cmd = " samtools sort -on -m 100000000 - " + \ aligner_output_filename_base + "_" + "queryname" coordinate_sort_cmd = " samtools sort -o -m 100000000 - " + \ aligner_output_filename_base + "_" + "coordinate" after_sort_cmd = ' | samtools view -h - -o ' else: name_sort_cmd = " samtools sort -n -m 100000000 - " coordinate_sort_cmd = " samtools sort -m 100000000 - " sort_aligner_output_filename += sort_file_ext if format_ext == '.sam' or format_ext == '.bam' or alignment_file_ext == '.sam' or alignment_file_ext == '.bam': sort_flag = 'True' if sort_flag == 'True': if wd.alignment_sort_order == 'queryname': sort_commands = [ name_sort_cmd + after_sort_cmd + sort_aligner_output_filename] if wd.alignment_sort_order == 'coordinate': sort_commands = [ coordinate_sort_cmd + after_sort_cmd + sort_aligner_output_filename] if wd.alignment_sort_order == 'queryname' or wd.alignment_sort_order == 'coordinate': #if sort order is both; then call yap tee functionality yap_tools.yap_tee(initial_pipe_commands, sort_commands,aligner_output_filename, err_log, stat_log) if len(initial_pipe_commands) > 0: workflow_prov.append(initial_pipe_commands[0] + sort_commands[0]) else: workflow_prov.append(sort_commands[0]) rm_cmd = "rm " + aligner_output_filename prm = Popen(rm_cmd, shell='True').wait() if prm != 0: print "Error: chunks clean up after merge sort failed for Filename=", aligner_output_filename, " chunk number=", chunk_number, "\n" if wd.alignment_sort_order == 'both': sort_queryname_output = aligner_output_filename_base + "_queryname" + sort_file_ext sort_coordinate_output = aligner_output_filename_base + "_coordinate" + sort_file_ext sort_commands = [name_sort_cmd + after_sort_cmd + sort_queryname_output,coordinate_sort_cmd + after_sort_cmd + sort_coordinate_output] #if sort order is both; then call yap tee functionality yap_tools.yap_tee(initial_pipe_commands, sort_commands,aligner_output_filename, err_log, stat_log) if len(initial_pipe_commands) > 0: workflow_prov.append( initial_pipe_commands[0] + sort_commands[0]) workflow_prov.append( initial_pipe_commands[0] + sort_commands[1]) else: workflow_prov.append(sort_commands[0]) workflow_prov.append(sort_commands[1]) try: os.remove(aligner_output_filename) except Exception as e: print "Error: chunks clean up after merge sort failed for Filename=", aligner_output_filename, " chunk number=", chunk_number, "cmd", "\n" print e return workflow_prov
def sort_alignment_output(chunk_number, aligner_cmd_name, aligner_cmd, aligner_output_filename, workflow_prov, err_log, stat_log): ''' sorts alignment output based on coordinate or queryname or both ''' initial_pipe_commands = [] sort_flag = 'False' format_ext = '' alignment_file_ext = '' sort_file_ext = '' after_sort_cmd = '' name_sort_cmd = '' coordinate_sort_cmd = '' if wd.alignment_sort_order != 'unsorted': #prepare sort commands, input/output filenames based on file extension aligner_dir_path = os.path.split(aligner_output_filename)[0] aligner_dir_path_tmp, filename_base = os.path.split( aligner_output_filename) if filename_base.find('queryname') == -1 and filename_base.find( 'coordinate') == -1: if os.path.exists(aligner_output_filename): aligner_output_filename_base, alignment_file_ext = os.path.splitext( aligner_output_filename) if alignment_file_ext == '.gz' or alignment_file_ext == 'bz2': aligner_output_filename_base, format_ext = os.path.splitext( aligner_output_filename_base) sort_aligner_output_filename = aligner_output_filename_base + \ "_" + wd.alignment_sort_order if format_ext == '.sam' or alignment_file_ext == '.sam': initial_pipe_commands = ['samtools view -bhS -'] sort_file_ext = ".sam" name_sort_cmd = " samtools sort -on -m 100000000 - " + \ aligner_output_filename_base + "_" + "queryname" coordinate_sort_cmd = " samtools sort -o -m 100000000 - " + \ aligner_output_filename_base + "_" + "coordinate" name_sort_cmd = " samtools sort -on -m 100000000 - " + \ aligner_output_filename_base + "_" + "queryname" coordinate_sort_cmd = " samtools sort -o -m 100000000 - " + \ aligner_output_filename_base + "_" + "coordinate" after_sort_cmd = ' | samtools view -h - -o ' else: name_sort_cmd = " samtools sort -n -m 100000000 - " coordinate_sort_cmd = " samtools sort -m 100000000 - " sort_aligner_output_filename += sort_file_ext if format_ext == '.sam' or format_ext == '.bam' or alignment_file_ext == '.sam' or alignment_file_ext == '.bam': sort_flag = 'True' if sort_flag == 'True': if wd.alignment_sort_order == 'queryname': sort_commands = [ name_sort_cmd + after_sort_cmd + sort_aligner_output_filename ] if wd.alignment_sort_order == 'coordinate': sort_commands = [ coordinate_sort_cmd + after_sort_cmd + sort_aligner_output_filename ] if wd.alignment_sort_order == 'queryname' or wd.alignment_sort_order == 'coordinate': #if sort order is both; then call yap tee functionality yap_tools.yap_tee(initial_pipe_commands, sort_commands, aligner_output_filename, err_log, stat_log) if len(initial_pipe_commands) > 0: workflow_prov.append(initial_pipe_commands[0] + sort_commands[0]) else: workflow_prov.append(sort_commands[0]) rm_cmd = "rm " + aligner_output_filename prm = Popen(rm_cmd, shell='True').wait() if prm != 0: print "Error: chunks clean up after merge sort failed for Filename=", aligner_output_filename, " chunk number=", chunk_number, "\n" if wd.alignment_sort_order == 'both': sort_queryname_output = aligner_output_filename_base + "_queryname" + sort_file_ext sort_coordinate_output = aligner_output_filename_base + "_coordinate" + sort_file_ext sort_commands = [ name_sort_cmd + after_sort_cmd + sort_queryname_output, coordinate_sort_cmd + after_sort_cmd + sort_coordinate_output ] #if sort order is both; then call yap tee functionality yap_tools.yap_tee(initial_pipe_commands, sort_commands, aligner_output_filename, err_log, stat_log) if len(initial_pipe_commands) > 0: workflow_prov.append(initial_pipe_commands[0] + sort_commands[0]) workflow_prov.append(initial_pipe_commands[0] + sort_commands[1]) else: workflow_prov.append(sort_commands[0]) workflow_prov.append(sort_commands[1]) try: os.remove(aligner_output_filename) except Exception as e: print "Error: chunks clean up after merge sort failed for Filename=", aligner_output_filename, " chunk number=", chunk_number, "cmd", "\n" print e return workflow_prov