Esempio n. 1
0
def merge_preprocessed_output(file_basecount_dict):
    '''For every sample, merge preprocessed chunk data into a single file.

    file_basecount_dict maps input filename -> {barcode: basecount data}.
    For each (filename, barcode) pair, the per-chunk files matching
    *preprocessed_data*_1.txt (and *_2.txt for paired-end runs) under the
    sample's preprocess_output directory are concatenated into one
    preprocess_data_<sample>[_<barcode>] file and the chunks are removed.
    '''
    for filename_key in file_basecount_dict.iterkeys():
        # iterate over filename and barcode, get list of files to be merged
        file_name = filename_key
        barcode_basecount_dict = file_basecount_dict[filename_key]
        for barcode in barcode_basecount_dict.iterkeys():
            barcode_value = yap_tools.rename_barcode(barcode)
            barcode_dir_path = wd.workflow_output_path + "/" + file_name + "/" + barcode + "/" + "preprocess_output"
            # barcode_value is '' when the barcode has no printable rename;
            # in that case the barcode is omitted from the output basename
            if barcode_value != '':
                preprocessed_output_basename = barcode_dir_path + "/" + "preprocess_data" + "_" + file_name + "_" + barcode_value
            else:
                preprocessed_output_basename = barcode_dir_path + "/" + "preprocess_data" + "_" + file_name
            preprocess_content1 = glob.glob(barcode_dir_path + "/" +
                                            "*preprocessed_data*_1.txt")
            # cat chunk files together, then delete the chunks
            if len(preprocess_content1) > 0:
                os.system("cat " + barcode_dir_path + "/" +
                          "*preprocessed_data*_1.txt" + ">" +
                          preprocessed_output_basename + "_1.txt")
                # NOTE(review): the rm glob carries a barcode_value prefix while
                # the cat glob does not; chunk files lacking that prefix would be
                # merged but never cleaned up -- confirm chunk naming convention.
                os.system("rm " + barcode_dir_path + "/" + barcode_value +
                          "*preprocessed_data*_1.txt")
            if wd.paired_end_data == "yes":
                preprocess_content2 = glob.glob(barcode_dir_path + "/" +
                                                "*preprocessed_data*_2.txt")
                if len(preprocess_content2) > 0:
                    os.system("cat " + barcode_dir_path + "/" +
                              "*preprocessed_data*_2.txt" + ">" +
                              preprocessed_output_basename + "_2.txt")
                    os.system("rm " + barcode_dir_path + "/" + barcode_value +
                              "*preprocessed_data*_2.txt")
Esempio n. 2
0
def merge_preprocessed_output(file_basecount_dict):
    '''For every sample, merge preprocessed chunk data into a single file.

    file_basecount_dict maps input filename -> {barcode: basecount data}.
    For each (filename, barcode) pair, the per-chunk files matching
    *preprocessed_data*_1.txt (and *_2.txt for paired-end runs) under the
    sample's preprocess_output directory are concatenated into one
    preprocess_data_<sample>[_<barcode>] file and the chunks are removed.
    '''
    for filename_key in file_basecount_dict.iterkeys():
        # iterate over filename and barcode, get list of files to be merged
        file_name = filename_key
        barcode_basecount_dict = file_basecount_dict[filename_key]
        for barcode in barcode_basecount_dict.iterkeys():
            barcode_value = yap_tools.rename_barcode(barcode)
            barcode_dir_path = wd.workflow_output_path + "/" + file_name + "/" + barcode + "/" + "preprocess_output"
            # omit the barcode component when it has no printable rename
            if barcode_value != '':
                preprocessed_output_basename = barcode_dir_path + "/" + "preprocess_data" + "_" + file_name + "_" + barcode_value
            else:
                preprocessed_output_basename = barcode_dir_path + "/" + "preprocess_data" + "_" + file_name
            preprocess_content1 = glob.glob(
                barcode_dir_path + "/" + "*preprocessed_data*_1.txt")
            # cat chunk files together, then delete the chunks
            if len(preprocess_content1) > 0:
                os.system("cat " + barcode_dir_path + "/" + "*preprocessed_data*_1.txt" + ">" + preprocessed_output_basename + "_1.txt")
                os.system("rm " + barcode_dir_path + "/" + barcode_value + "*preprocessed_data*_1.txt")
            if wd.paired_end_data == "yes":
                preprocess_content2 = glob.glob(barcode_dir_path + "/" + "*preprocessed_data*_2.txt")
                if len(preprocess_content2) > 0:
                    os.system("cat " + barcode_dir_path + "/" + "*preprocessed_data*_2.txt" + ">" + preprocessed_output_basename + "_2.txt")
                    os.system("rm " + barcode_dir_path + "/" + barcode_value + "*preprocessed_data*_2.txt")
Esempio n. 3
0
def run_postprocess(postprocess_cmd_arr, file_basecount_dict, workflow_prov,
                    err_log, stat_log):
    ''' 
    Prepare postprocess command with input/output paths according to sample name, 
    pass commands to yap_tee or subprocess for execution.
    '''
    if wd.regroup_output == 'yes':
        workflow_output_path = wd.workflow_output_path + "/regroup_output"
    else:
        workflow_output_path = wd.workflow_output_path
    for zz in range(0, len(postprocess_cmd_arr)):
        postprocess_tee_arr = []
        postprocess_nontee_arr = []
        initial_pipe_commands = []
        postprocess_temp_arr = []
        cmd_type = postprocess_cmd_arr[zz][0]
        cmd_meta_data = postprocess_cmd_arr[zz][1]
        postprocess_temp_arr = postprocess_cmd_arr[zz][2]
        input_file_extension = ''
        pipe1 = ''
        pipe2 = ''
        #set default input directory for postprocess stage as aligner_output
        #user can specify "postprocess_output" through  configuration file
        input_directory = "aligner_output"
        for kk in range(0, len(cmd_meta_data)):
            input_meta_data = cmd_meta_data[kk].split(" ")
            if input_meta_data:
                if re.search('input_file_type',
                             input_meta_data[0]) is not None:
                    input_file_extension = input_meta_data[1]
                if re.search('input_directory',
                             input_meta_data[0]) is not None:
                    input_directory = input_meta_data[1]
        '''iterate over filename and barcode, fetch files from the source directory,
        file extensions and python glob module'''
        for filename_key in file_basecount_dict.iterkeys():
            #fetch original input file pair for this sample
            for tmp_arr in wd.inp_files_list:
                if tmp_arr[2] == filename_key:
                    #store it in variables pipe1 and pipe2
                    pipe1 = tmp_arr[0]
                    pipe2 = tmp_arr[1]
                    break
            postprocess_input_file_arr = []
            path, file_name = os.path.split(filename_key)
            if wd.run_preprocess_analysis == "no" and wd.run_reference_alignment == "no" and wd.run_postprocess_analysis == "yes":
                file_name, extension = os.path.splitext(file_name)
                file_name, extension = os.path.splitext(file_name)
                file_basecount_dict[filename_key] = {
                    'no_barcode_specified': []
                }
            for barcode in file_basecount_dict[filename_key]:
                barcode_value = yap_tools.rename_barcode(barcode)
                aligner_dir_path = ''
                postprocess_dir_path = ''
                aligner_dir_path = workflow_output_path + "/" + file_name + "/" + barcode + "/" + input_directory
                postprocess_input = aligner_dir_path + "/" + "*" + input_file_extension + "*"
                postprocess_input_file_arr = glob.glob(postprocess_input)
                if wd.run_preprocess_analysis == "no" and wd.run_reference_alignment == "no" and wd.run_postprocess_analysis == "yes":
                    if input_directory == "aligner_output":
                        aligner_dir_path = path
                        postprocess_input = filename_key
                        temp_arr = glob.glob(aligner_dir_path + "/" + "*" +
                                             input_file_extension + "*")
                        if temp_arr > 0:
                            for k in temp_arr:
                                if k == postprocess_input:
                                    postprocess_input_file_arr = [
                                        postprocess_input
                                    ]
                if input_file_extension == '':
                    postprocess_input_file_arr = []
                postprocess_dir_path = workflow_output_path + "/" + file_name + "/" + barcode + "/" + "postprocess_output"
                postprocess_input_file_arr.sort()
                if (len(postprocess_input_file_arr) > 0):
                    if wd.run_preprocess_analysis == "no" and wd.run_reference_alignment == "no" and wd.run_postprocess_analysis == "yes":
                        if input_directory == "aligner_output":
                            if postprocess_input_file_arr[0] == filename_key:
                                pass
                            else:
                                break
                    for k in range(0, len(postprocess_temp_arr)):
                        postprocess_cmd = postprocess_temp_arr[k][1]
                        postprocess_cmd_name = postprocess_temp_arr[k][0]
                        if file_name == '':
                            if barcode_value != '':
                                postprocess_outfile = postprocess_dir_path + "/" + \
                                    barcode_value + "_" + postprocess_cmd_name
                            else:
                                postprocess_outfile = postprocess_dir_path + \
                                    "/" + postprocess_cmd_name
                        else:
                            if barcode_value != '':
                                postprocess_outfile = postprocess_dir_path + "/" + \
                                    file_name + "_" + barcode_value + \
                                    "_" + postprocess_cmd_name
                            else:
                                postprocess_outfile = postprocess_dir_path + \
                                    "/" + file_name + "_" + \
                                    postprocess_cmd_name
                        #replace generic keywords with appropriate file path variables
                        postprocess_cmd = postprocess_cmd.replace(
                            'input_directory', '')
                        postprocess_cmd = postprocess_cmd.replace(
                            'input_file_type' + ' ' + input_file_extension, '')
                        postprocess_cmd = postprocess_cmd.replace(
                            "aligner_output", '')
                        postprocess_cmd = postprocess_cmd.replace(
                            "postprocess_output", '')
                        postprocess_cmd = postprocess_cmd.replace(
                            'output_file', postprocess_outfile)
                        postprocess_cmd = postprocess_cmd.replace(
                            'output_directory', postprocess_dir_path)
                        postprocess_cmd = postprocess_cmd.replace(' =', '=')
                        postprocess_cmd = postprocess_cmd.replace(
                            'sample_name', file_name)
                        if re.search("file_based_input",
                                     postprocess_cmd) is not None:
                            postprocess_cmd = postprocess_cmd.replace(
                                'file_based_input',
                                postprocess_input_file_arr[0])
                            postprocess_nontee_arr = [
                                postprocess_cmd_name, postprocess_cmd
                            ]
                        elif re.search("directory_based_input",
                                       postprocess_cmd) is not None:
                            postprocess_cmd = postprocess_cmd.replace(
                                'directory_based_input', aligner_dir_path)
                            postprocess_nontee_arr = [
                                postprocess_cmd_name, postprocess_cmd
                            ]
                        else:
                            postprocess_tee_arr.append(postprocess_cmd)
                    workflow_prov.append("INPUT: " +
                                         postprocess_input_file_arr[0])
                    for kk in postprocess_tee_arr:
                        if kk != '':
                            workflow_prov.append(kk)
                    if len(postprocess_tee_arr) != 0:
                        #pass commands to yap_tee function
                        yap_tools.yap_tee(initial_pipe_commands,
                                          postprocess_tee_arr,
                                          postprocess_input_file_arr[0],
                                          err_log, stat_log)
                    if len(postprocess_nontee_arr) != 0:
                        #pass commands to non_tee function which uses subproces
                        run_postprocess_nontee(postprocess_nontee_arr,
                                               workflow_prov, err_log,
                                               stat_log)
                else:
                    if file_name == '':
                        print "Warning: No aligner output for barcode = ", barcode, " ...skipping the postprocess step for command : \n", postprocess_temp_arr, "\n"
                    else:
                        print "Warning: No aligner output for filename = ", file_name, "  barcode = ", barcode, " ...skipping the postprocess step for command: \n", postprocess_temp_arr, "\n"
    return workflow_prov
Esempio n. 4
0
def write_basecount_matrix(file_basecount_dict):
    '''
    Prepare output filename information and write base count matrix 
    results to a file
    '''
    def write_final_matrix(
        file_basecount_matrix,
        basecount_metrics_filename,
        basecount_file_basename,
    ):
        '''Writes basecount matrix to file'''
        file_w_handler = open(basecount_metrics_filename, 'a+')
        len_matrix = len(file_basecount_matrix)
        len_row = len(file_basecount_matrix[0])
        file_w_handler.write("# A C T G N \n")
        for i in range(len_matrix):
            row_sum = numpy.sum(file_basecount_matrix[i])
            if row_sum != 0:
                file_w_handler.write(str(i) + " ")
                for j in range(len_row):
                    file_w_handler.write(
                        str(file_basecount_matrix[i][j]) + " ")
                file_w_handler.write("\n")
        file_w_handler.close()
        #create plot
        os.system("gplot.sh " + basecount_metrics_filename + " " +
                  basecount_file_basename + '.eps')

    #For each sample, prepare basecount output filename information
    for filename_key in file_basecount_dict.iterkeys():
        path_name, file_name = os.path.split(filename_key)
        file_name, extension = os.path.splitext(file_name)
        barcode_basecount_dict = file_basecount_dict[filename_key]
        for barcode in barcode_basecount_dict.iterkeys():
            barcode_value = yap_tools.rename_barcode(barcode)
            barcode_dir_path = wd.workflow_output_path + "/" + file_name + "/" + barcode
            preprocess_output_dir = barcode_dir_path + "/" + "preprocess_output"
            file_basecount_matrix1 = barcode_basecount_dict[barcode][0]
            file_basecount_matrix2 = barcode_basecount_dict[barcode][1]
            if barcode_value != '':
                basecount_file_basename1 = preprocess_output_dir + '/' + \
                    file_name + "_" + barcode_value + "_basecountmetrics_1"
                basecount_metrics_filename1 = preprocess_output_dir + '/' + \
                    file_name + "_" + barcode_value + \
                    "_basecountmetrics_1" + '.txt'
                basecount_file_basename2 = preprocess_output_dir + '/' + \
                    file_name + "_" + barcode_value + "_basecountmetrics_2"
                basecount_metrics_filename2 = preprocess_output_dir + '/' + \
                    file_name + "_" + barcode_value + \
                    "_basecountmetrics_2" + '.txt'
            else:
                basecount_file_basename1 = preprocess_output_dir + \
                    '/' + file_name + "_basecountmetrics_1"
                basecount_metrics_filename1 = preprocess_output_dir + \
                    '/' + file_name + "_basecountmetrics_1" + '.txt'
                basecount_file_basename2 = preprocess_output_dir + \
                    '/' + file_name + "_basecountmetrics_2"
                basecount_metrics_filename2 = preprocess_output_dir + \
                    '/' + file_name + "_basecountmetrics_2" + '.txt'

            if (numpy.sum(file_basecount_matrix1) > 0):
                print "For filename = ", file_name, "barcode = ", barcode, " ...producing the basecount metrics results for first paired file", "\n"
                #pass output filename informaton to write function
                write_final_matrix(
                    file_basecount_matrix1,
                    basecount_metrics_filename1,
                    basecount_file_basename1,
                )
            else:
                print "No data for finename = ", file_name, " barcode = ", barcode, " ...skipping the basecount metrics output for first paired file", "\n"
            if (numpy.sum(file_basecount_matrix2) > 0):

                print "For filename = ", file_name, "barcode = ", barcode, " ...producing the basecount metrics results second paired file", "\n"
                #pass output filename informaton to write function
                write_final_matrix(
                    file_basecount_matrix2,
                    basecount_metrics_filename2,
                    basecount_file_basename2,
                )
            else:
                print "No data for finename = ", file_name, " barcode = ", barcode, " ...skipping the basecount metrics output for second paired file", "\n"
Esempio n. 5
0
def write_basecount_matrix(file_basecount_dict):
    '''
    Prepare output filename information and write base count matrix 
    results to a file
    '''
    def write_final_matrix(
            file_basecount_matrix,
            basecount_metrics_filename,
            basecount_file_basename,
            ):
        '''Writes basecount matrix to file'''
        file_w_handler = open(basecount_metrics_filename, 'a+')
        len_matrix = len(file_basecount_matrix)
        len_row = len(file_basecount_matrix[0])
        file_w_handler.write("# A C T G N \n")
        for i in range(len_matrix):
            row_sum = numpy.sum(file_basecount_matrix[i])
            if row_sum != 0:
                file_w_handler.write(str(i) + " ")
                for j in range(len_row):
                    file_w_handler.write(
                        str(file_basecount_matrix[i][j]) + " ")
                file_w_handler.write("\n")
        file_w_handler.close()
        #create plot 
        os.system("gplot.sh " + basecount_metrics_filename +
                  " " + basecount_file_basename + '.eps')
    #For each sample, prepare basecount output filename information 
    for filename_key in file_basecount_dict.iterkeys():
        path_name, file_name = os.path.split(filename_key)
        file_name, extension = os.path.splitext(file_name)
        barcode_basecount_dict = file_basecount_dict[filename_key]
        for barcode in barcode_basecount_dict.iterkeys():
            barcode_value = yap_tools.rename_barcode(barcode)
            barcode_dir_path = wd.workflow_output_path + "/" + file_name + "/" + barcode
            preprocess_output_dir = barcode_dir_path + "/" + "preprocess_output"
            file_basecount_matrix1 = barcode_basecount_dict[barcode][0]
            file_basecount_matrix2 = barcode_basecount_dict[barcode][1]
            if barcode_value != '':
                basecount_file_basename1 = preprocess_output_dir + '/' + \
                    file_name + "_" + barcode_value + "_basecountmetrics_1"
                basecount_metrics_filename1 = preprocess_output_dir + '/' + \
                    file_name + "_" + barcode_value + \
                    "_basecountmetrics_1" + '.txt'
                basecount_file_basename2 = preprocess_output_dir + '/' + \
                    file_name + "_" + barcode_value + "_basecountmetrics_2"
                basecount_metrics_filename2 = preprocess_output_dir + '/' + \
                    file_name + "_" + barcode_value + \
                    "_basecountmetrics_2" + '.txt'
            else:
                basecount_file_basename1 = preprocess_output_dir + \
                    '/' + file_name + "_basecountmetrics_1"
                basecount_metrics_filename1 = preprocess_output_dir + \
                    '/' + file_name + "_basecountmetrics_1" + '.txt'
                basecount_file_basename2 = preprocess_output_dir + \
                    '/' + file_name + "_basecountmetrics_2"
                basecount_metrics_filename2 = preprocess_output_dir + \
                    '/' + file_name + "_basecountmetrics_2" + '.txt'

            if (numpy.sum(file_basecount_matrix1) > 0):
                print  "For filename = ", file_name, "barcode = ", barcode, " ...producing the basecount metrics results for first paired file", "\n"
                #pass output filename informaton to write function 
                write_final_matrix(
                    file_basecount_matrix1,
                    basecount_metrics_filename1,
                    basecount_file_basename1,
                    )
            else:
                print "No data for finename = ", file_name, " barcode = ", barcode, " ...skipping the basecount metrics output for first paired file", "\n"
            if (numpy.sum(file_basecount_matrix2) > 0):

                print "For filename = ", file_name, "barcode = ", barcode, " ...producing the basecount metrics results second paired file", "\n"
                #pass output filename informaton to write function 
                write_final_matrix(
                    file_basecount_matrix2,
                    basecount_metrics_filename2,
                    basecount_file_basename2,
                    )
            else:
                print "No data for finename = ", file_name, " barcode = ", barcode, " ...skipping the basecount metrics output for second paired file", "\n"
Esempio n. 6
0
def merge_alignment_output(
        file_basecount_dict,
        workflow_prov,
        err_log,
        stat_log):
    '''
    Prepares for merging of chunk alignment output into single file,
    generates chunk list and commands based on filename and format.
    '''
    sort_order = wd.alignment_sort_order
    for filename_key in file_basecount_dict.iterkeys():
        path_name, file_name = os.path.split(filename_key)
        barcode_basecount_dict = file_basecount_dict[filename_key]
        file_type = ''
        sort_files_cmd = ''
        suffix_ext = ''
        for barcode in barcode_basecount_dict.iterkeys():
            barcode_value = yap_tools.rename_barcode(barcode)
            aligner_dir_path = wd.workflow_output_path + "/" + file_name + "/" + barcode + "/" + "aligner_output"
            aligner_output_base = aligner_dir_path + "/" + barcode_value + "_" + file_name
            if barcode_value != '':
                aligner_output_base = aligner_dir_path + "/" + "aligner_" + file_name + "_" + barcode_value
                aligner_final_output_base = aligner_dir_path + "/" + file_name + "_" + barcode_value
            else:
                aligner_output_base = aligner_dir_path + "/" + "aligner_" + file_name
                aligner_final_output_base = aligner_dir_path + "/" + file_name
            for aligner_output_key in wd.aligner_output_key_arr:
                sort_input_files_arr = []
                aligner_output_suffix = ''
                if len(aligner_output_key.split("output_file")) > 1:
	                aligner_output_suffix = aligner_output_key.split("output_file")[1]
                else:
                	if len(aligner_output_key.split("accepted_hits")) > 1:
                        	aligner_output_suffix = aligner_output_key.split("accepted_hits")[1]
                                aligner_output_base = aligner_dir_path + "/" + "accepted_hits"
                aligner_output_suffix = aligner_output_key.split("output_file")[1]
                aligner_output_suffix, suffix_ext = os.path.splitext(aligner_output_suffix)
                if aligner_output_suffix == '.sam' or aligner_output_suffix == '.bam':
                    aligner_output_suffix = ''
                else:
                    aligner_output_suffix, suffix_ext = os.path.splitext(aligner_output_suffix)
                output_with_suffix = aligner_output_base + "*" + aligner_output_suffix
                final_output_with_suffix = aligner_final_output_base + aligner_output_suffix
                if len(glob.glob(output_with_suffix + "*.sam")) > 0:
                    file_type = "sam"
                if len(glob.glob(output_with_suffix + "*.bam")) > 0:
                    file_type = "bam"
                if wd.alignment_sort_order == 'both':
                    sort_input_files_arr = glob.glob(
                        output_with_suffix + "*queryname*")
                    final_output_name = final_output_with_suffix + "_" + "queryname" + "." + file_type
                    sort_files_cmd = 'samtools merge -n '
                    execute_merge_alignment(
                        final_output_name,
                        sort_input_files_arr,
                        file_type,
                        file_name,
                        barcode,
                        sort_files_cmd,
                        workflow_prov,
                        err_log,
                        stat_log)
                    sort_input_files_arr = glob.glob(output_with_suffix + "*coordinate*")
                    final_output_name = final_output_with_suffix + "_" + "coordinate" + "." + file_type
                    sort_files_cmd = 'samtools merge '
                    execute_merge_alignment(
                        final_output_name,
                        sort_input_files_arr,
                        file_type,
                        file_name,
                        barcode,
                        sort_files_cmd,
                        workflow_prov,
                        err_log,
                        stat_log)
                else:
                    sort_input_files_arr = glob.glob(
                        output_with_suffix + "*" + file_type)
                    if wd.alignment_sort_order == 'unsorted':
                        final_output_name = final_output_with_suffix
                    else:
                        final_output_name = final_output_with_suffix + "_" + wd.alignment_sort_order
                    if file_type == "sam":
                        final_output_name += ".sam"
                    if file_type == "bam":
                        final_output_name += ".bam"
                    if file_type == 'sam' or file_type == "bam":
                        sort_cmd_input = ''
                        if wd.alignment_sort_order == 'queryname':
                            sort_files_cmd = 'samtools merge -n '
                        if wd.alignment_sort_order == 'coordinate':
                            sort_files_cmd = 'samtools merge '
                        execute_merge_alignment(
                            final_output_name,
                            sort_input_files_arr,
                            file_type,
                            file_name,
                            barcode,
                            sort_files_cmd,
                            workflow_prov,
                            err_log,
                            stat_log)
                if file_type == "sam" or file_type == "bam":
                    rm_cmd = "rm " + aligner_dir_path + "/" + barcode + \
                        "*" + aligner_output_suffix + "*." + file_type
                    if len(glob.glob(aligner_dir_path + "/" + barcode + "*" + aligner_output_suffix + "*." + file_type)) > 0:
                        prm = Popen(rm_cmd, shell='False')
                        std_out, std_err = prm.communicate()
                        exit_code = prm.returncode
                        if exit_code != 0:
                            if file_name == '':
                                print "Error: chunk merge clean up after sort failed for barcode=", barcode, "\n"
                            else:
                                print "Error: chunks clean up after merge sort failed for filename =", file_name, "barcode=", barcode, "\n"
    return workflow_prov
Esempio n. 7
0
def regroup_files(
	regroup_arr,
        workflow_prov):
    '''
    Merge per-sample alignment output into one regrouped sample.

    regroup_arr is [regroup_title, [sample_name, ...]]: the title of the
    regrouped sample followed by the member samples to merge.  The member
    samples' error/stat logs are concatenated into regrouped logs, the
    grouping is recorded in regroup_files.log, and for every configured
    aligner output key the members' sam/bam files are merged with
    "samtools merge" (name-sorted or coordinate-sorted according to
    wd.alignment_sort_order) into the regroup output directory.
    Returns the updated workflow_prov provenance list.
    '''
    regroup_title = regroup_arr[0]
    regroup_files_arr = regroup_arr[1]
    err_string = ""
    stat_string = ""
    # Gather existing member log files into space-separated argument
    # strings (each entry prefixed by one space) and remove the members'
    # temporary per-chunk log directories.
    for i in range(len(regroup_files_arr)):
        if os.path.exists(wd.err_log_path + "/" + regroup_files_arr[i] + "_err.log"):
            err_string += " " + wd.err_log_path + "/" + regroup_files_arr[i] + "_err.log"
        if os.path.exists(wd.stat_log_path + "/" + regroup_files_arr[i] + "_stat.log"):
            stat_string += " " + wd.stat_log_path + "/" + regroup_files_arr[i] + "_stat.log"
        if os.path.exists(wd.err_log_path + "/" + regroup_files_arr[i] + "_log_temp"):
            os.system("rm -rf " + wd.err_log_path + "/" + regroup_files_arr[i] + "_log_temp")
        if os.path.exists(wd.stat_log_path + "/" + regroup_files_arr[i] + "_log_temp"):
            os.system("rm -rf " + wd.stat_log_path + "/" + regroup_files_arr[i] + "_log_temp")
    # Concatenate member logs into the regrouped log unless the only log
    # collected is already the regrouped one.  NOTE(review): when no
    # member log exists the string is empty and this still executes
    # "cat> target" (truncating it) plus a bare "rm" -- confirm intended.
    if err_string != " " + wd.err_log_path + "/" + regroup_title + "_err.log":
        os.system("cat" +err_string +"> " + wd.err_log_path + "/" + regroup_title + "_err.log")
        os.system("rm" + err_string)
    os.system("mkdir " + wd.err_log_path + "/" + regroup_title + "_log_temp")
    if stat_string != " " + wd.stat_log_path + "/" + regroup_title + "_stat.log":
        os.system("cat" + stat_string + "> " + wd.stat_log_path + "/" + regroup_title + "_stat.log")
        os.system("rm" + stat_string)
    os.system("mkdir " + wd.stat_log_path + "/" + regroup_title + "_log_temp")
    err_log = wd.err_log_path + "/" + regroup_title + "_err.log"
    stat_log = wd.stat_log_path + "/" + regroup_title + "_stat.log"
    #write regrouped file information to log file
    # Format: "<regroup_title> member1,member2,...,memberN"
    regroup_merge_log = wd.regroup_output_path + "/" + "regroup_files.log"
    fw = open(regroup_merge_log, 'a')
    fw.write(regroup_title + " ")
    for i in range(0, len(regroup_files_arr)):
        if i == len(regroup_files_arr) - 1:
            fw.write(regroup_files_arr[i])
        else:
            fw.write(regroup_files_arr[i] + ",")
    fw.write("\n")
    fw.close()
    for aligner_output_key in wd.aligner_output_key_arr:
        sort_input_files_arr = []
        sort_input_files_arr_queryname = []
        sort_input_files_arr_coordinate = []
        aligner_output_suffix = ''
        # Derive the suffix that distinguishes this aligner output key:
        # keys contain either "output_file" or "accepted_hits".
        if len(aligner_output_key.split("output_file")) > 1:
		aligner_output_suffix = aligner_output_key.split("output_file")[1]
        else:
        	if len(aligner_output_key.split("accepted_hits")) > 1:
                	aligner_output_suffix = aligner_output_key.split("accepted_hits")[1]
        aligner_output_suffix, suffix_ext = os.path.splitext(aligner_output_suffix)
        if aligner_output_suffix == '.sam' or aligner_output_suffix == '.bam':
            aligner_output_suffix = ''
        else:
            # strip one more extension level (e.g. "x.sorted.bam" -> "x")
            aligner_output_suffix, suffix_ext = os.path.splitext(
                aligner_output_suffix)
        # NOTE(review): these three lists were already initialised above;
        # the re-initialisation is redundant but harmless.
        sort_input_files_arr = []
        sort_input_files_arr_queryname = []
        sort_input_files_arr_coordinate = []
        for filename_key in regroup_files_arr:
            file_name = filename_key
            file_type = ''
            sort_files_cmd = ''
            suffix_ext = ''
            for barcode in wd.barcode_dict.iterkeys():
                barcode_value = yap_tools.rename_barcode(barcode)
                aligner_dir_path = wd.workflow_output_path + "/" + file_name + "/" + barcode + "/" + "aligner_output"
                regroup_aligner_dir_path = wd.regroup_output_path + "/" + regroup_title + "/" + barcode + "/" + "aligner_output"
                aligner_output_base = aligner_dir_path + \
                    "/" + barcode_value + "_" + file_name
                if barcode_value != '':
                    aligner_output_base = aligner_dir_path + "/" + \
                        "aligner_" + file_name + "_" + barcode_value
                    aligner_final_output_base = regroup_aligner_dir_path + \
                        "/" + regroup_title + "_" + barcode_value
                else:
                    aligner_output_base = aligner_dir_path + "/" + "aligner_" + file_name
                    aligner_final_output_base = regroup_aligner_dir_path + "/" + regroup_title
                if re.search('accepted_hits',aligner_output_key) != None:
			aligner_output_base = aligner_dir_path + "/" + "accepted_hits"
                output_with_suffix = aligner_output_base + "*" + aligner_output_suffix
                final_output_with_suffix = aligner_final_output_base + aligner_output_suffix
                # Detect whether this sample/barcode produced sam or bam.
                if len(glob.glob(output_with_suffix + "*.sam")) > 0:
                    file_type = "sam"
                if len(glob.glob(output_with_suffix + "*.bam")) > 0:
                    file_type = "bam"
                if wd.alignment_sort_order == 'both':
                    sort_input_files_arr_queryname.extend(
                        glob.glob(output_with_suffix + "*queryname*"))
                    sort_input_files_arr_coordinate.extend(
                        glob.glob(output_with_suffix + "*coordinate*"))
                else:
                    sort_input_files_arr.extend(
                        glob.glob(output_with_suffix + "*" + file_type))
        # NOTE(review): file_type, file_name, barcode, aligner_dir_path and
        # final_output_with_suffix below carry the values left over from
        # the *last* iteration of the loops above -- confirm intended.
        if wd.alignment_sort_order == 'both':
            final_output_name_queryname = final_output_with_suffix + "_" + "queryname" + "." + file_type
            final_output_name_coordinate = final_output_with_suffix + "_" + "coordinate" + "." + file_type
            # "samtools merge -n" expects inputs sorted by read name.
            sort_files_cmd = 'samtools merge -n '
            execute_merge_alignment(
                final_output_name_queryname,
                sort_input_files_arr_queryname,
                file_type,
                file_name,
                barcode,
                sort_files_cmd,
                workflow_prov,
                err_log,
                stat_log)
            sort_files_cmd = 'samtools merge '
            execute_merge_alignment(
                final_output_name_coordinate,
                sort_input_files_arr_coordinate,
                file_type,
                file_name,
                barcode,
                sort_files_cmd,
                workflow_prov,
                err_log,
                stat_log)
        else:
            if wd.alignment_sort_order == 'unsorted':
                final_output_name = final_output_with_suffix
            else:
                final_output_name = final_output_with_suffix + "_" + wd.alignment_sort_order
            if file_type == "sam":
                final_output_name += ".sam"
            if file_type == "bam":
                final_output_name += ".bam"
            if file_type == 'sam' or file_type == "bam":
                sort_cmd_input = ''
                if wd.alignment_sort_order == 'queryname':
                    sort_files_cmd = 'samtools merge -n '
                if wd.alignment_sort_order == 'coordinate':
                    sort_files_cmd = 'samtools merge '
                execute_merge_alignment(
                    final_output_name,
                    sort_input_files_arr,
                    file_type,
                    file_name,
                    barcode,
                    sort_files_cmd,
                    workflow_prov,
                    err_log,
                    stat_log)
        # Remove the per-sample chunk files that were merged above.
        if file_type == "sam" or file_type == "bam":
            rm_cmd = "rm " + aligner_dir_path + "/" + barcode + \
                "*" + aligner_output_suffix + "*." + file_type
            if len(glob.glob(aligner_dir_path + "/" + barcode + "*" + aligner_output_suffix + "*." + file_type)) > 0:
                # NOTE(review): shell='False' is a non-empty string, hence
                # truthy -- the command runs through the shell (which the
                # "*" wildcard requires).  shell=True would say so plainly.
                prm = Popen(rm_cmd, shell='False').wait()
                if prm != 0:
                    if file_name == '':
                        print "Error: chunk merge clean up after sort failed for barcode=", barcode, "\n"
                    else:
                        print "Error: chunks clean up after merge sort failed for filename =", file_name, "barcode=", barcode, "\n"
    return workflow_prov
Esempio n. 8
0
def run_postprocess(
        postprocess_cmd_arr,
        file_basecount_dict,
        workflow_prov,
        err_log,
        stat_log):
    '''
    Prepare each postprocess command with concrete input/output paths for
    every sample and barcode, then dispatch it for execution.

    Commands referencing "file_based_input" or "directory_based_input" are
    run individually through run_postprocess_nontee (plain subprocess);
    all remaining commands are accumulated and sent together through
    yap_tools.yap_tee.  Returns the updated workflow_prov list.
    '''
    # Regrouped workflows read their input from the regroup output tree.
    if wd.regroup_output =='yes':
    	workflow_output_path = wd.workflow_output_path + "/regroup_output"
    else:
	workflow_output_path= wd.workflow_output_path
    for zz in range(0, len(postprocess_cmd_arr)):
        postprocess_tee_arr = []
        postprocess_nontee_arr = []
        initial_pipe_commands = []
        postprocess_temp_arr = []
        cmd_type = postprocess_cmd_arr[zz][0]
        cmd_meta_data = postprocess_cmd_arr[zz][1]
        postprocess_temp_arr = postprocess_cmd_arr[zz][2]
        input_file_extension = ''
	pipe1=''
	pipe2=''
        #set default input directory for postprocess stage as aligner_output
        #user can specify "postprocess_output" through  configuration file
        input_directory = "aligner_output" 
        # Pull "input_file_type"/"input_directory" overrides out of the
        # command metadata strings.
        for kk in range(0, len(cmd_meta_data)):
            input_meta_data = cmd_meta_data[kk].split(" ")
            if input_meta_data:
                if re.search('input_file_type', input_meta_data[0]) is not None:
                    input_file_extension = input_meta_data[1]
                if re.search('input_directory', input_meta_data[0]) is not None:
                    input_directory = input_meta_data[1]
        '''iterate over filename and barcode, fetch files from the source directory,
        file extensions and python glob module'''
        for filename_key in file_basecount_dict.iterkeys():
	    #fetch original input file pair for this sample
            for tmp_arr in wd.inp_files_list:
            	if tmp_arr[2] == filename_key:
			#store it in variables pipe1 and pipe2
                	pipe1 = tmp_arr[0]
                        pipe2 = tmp_arr[1]
                	break
            postprocess_input_file_arr = []
            path, file_name = os.path.split(filename_key)
            # Postprocess-only runs operate directly on the raw input file:
            # strip two extension levels (e.g. ".fastq.gz") and register a
            # single pseudo-barcode for the sample.
            if wd.run_preprocess_analysis == "no" and wd.run_reference_alignment == "no" and wd.run_postprocess_analysis == "yes":
            	file_name, extension = os.path.splitext(file_name)
            	file_name, extension = os.path.splitext(file_name)
                file_basecount_dict[filename_key] = {
                    'no_barcode_specified': []}
            for barcode in file_basecount_dict[filename_key]:
                barcode_value = yap_tools.rename_barcode(barcode)
                aligner_dir_path = ''
                postprocess_dir_path = ''
                aligner_dir_path = workflow_output_path + "/" + file_name + "/" + barcode + "/" + input_directory
                postprocess_input = aligner_dir_path + "/" + "*" + input_file_extension + "*"
                postprocess_input_file_arr = glob.glob(postprocess_input)
            	if wd.run_preprocess_analysis == "no" and wd.run_reference_alignment == "no" and wd.run_postprocess_analysis == "yes":
            		if input_directory == "aligner_output":
                        	aligner_dir_path = path
                                postprocess_input = filename_key
                                temp_arr = glob.glob(aligner_dir_path + "/" + "*" + input_file_extension + "*")
                                # NOTE(review): `temp_arr > 0` compares a
                                # list to an int (always True on Python 2);
                                # len(temp_arr) > 0 was probably intended.
                                # Harmless here: the loop below is a no-op
                                # for an empty list.
                                if temp_arr > 0:
                                	for k in temp_arr:
                                        	if k == postprocess_input:
                                                	postprocess_input_file_arr = [postprocess_input]
                if input_file_extension == '':
                    postprocess_input_file_arr = []
                postprocess_dir_path = workflow_output_path + "/" + file_name + "/" + barcode + "/" + "postprocess_output"
                postprocess_input_file_arr.sort()
                if (len(postprocess_input_file_arr) > 0):
                    if wd.run_preprocess_analysis == "no" and wd.run_reference_alignment == "no" and wd.run_postprocess_analysis == "yes":
                    	if input_directory == "aligner_output":
                        	if postprocess_input_file_arr[0] == filename_key:
                            		pass
                        	else:
                            		break
                    for k in range(0, len(postprocess_temp_arr)):
                        postprocess_cmd = postprocess_temp_arr[k][1]
                        postprocess_cmd_name = postprocess_temp_arr[k][0]
                        # Build the output file name from whichever of
                        # sample name / barcode are non-empty.
                        if file_name == '':
                            if barcode_value != '':
                                postprocess_outfile = postprocess_dir_path + "/" + \
                                    barcode_value + "_" + postprocess_cmd_name
                            else:
                                postprocess_outfile = postprocess_dir_path + \
                                    "/" + postprocess_cmd_name
                        else:
                            if barcode_value != '':
                                postprocess_outfile = postprocess_dir_path + "/" + \
                                    file_name + "_" + barcode_value + \
                                    "_" + postprocess_cmd_name
                            else:
                                postprocess_outfile = postprocess_dir_path + \
                                    "/" + file_name + "_" + \
                                    postprocess_cmd_name
                        #replace generic keywords with appropriate file path variables
                        postprocess_cmd = postprocess_cmd.replace(
                            'input_directory', '')
                        postprocess_cmd = postprocess_cmd.replace(
                            'input_file_type' + ' ' + input_file_extension, '')
                        postprocess_cmd = postprocess_cmd.replace(
                            "aligner_output", '')
                        postprocess_cmd = postprocess_cmd.replace(
                            "postprocess_output", '')
                        postprocess_cmd = postprocess_cmd.replace(
                            'output_file', postprocess_outfile)
                        postprocess_cmd = postprocess_cmd.replace(
                            'output_directory', postprocess_dir_path)
                        postprocess_cmd = postprocess_cmd.replace(' =', '=')
                        postprocess_cmd = postprocess_cmd.replace(
                            'sample_name', file_name)
                        if re.search("file_based_input", postprocess_cmd) is not None:
                            postprocess_cmd = postprocess_cmd.replace(
                                'file_based_input',
                                postprocess_input_file_arr[0])
                            postprocess_nontee_arr = [
                                postprocess_cmd_name, postprocess_cmd]
                        elif re.search("directory_based_input", postprocess_cmd) is not None:
                            postprocess_cmd = postprocess_cmd.replace(
                                'directory_based_input', aligner_dir_path)
                            postprocess_nontee_arr = [
                                postprocess_cmd_name, postprocess_cmd]
                        else:
                            # NOTE(review): postprocess_tee_arr is reset only
                            # once per command set (zz), not per barcode, so
                            # entries accumulate across samples -- confirm
                            # this is intended.
                            postprocess_tee_arr.append(postprocess_cmd)
                    workflow_prov.append(
                        "INPUT: " + postprocess_input_file_arr[0])
                    for kk in postprocess_tee_arr:
                        if kk != '':
                            workflow_prov.append(kk)
                    if len(postprocess_tee_arr) != 0:
		        #pass commands to yap_tee function
                        yap_tools.yap_tee(
                            initial_pipe_commands,
                            postprocess_tee_arr,
                            postprocess_input_file_arr[0],
                            err_log,
                            stat_log)
                    if len(postprocess_nontee_arr) != 0:
		        #pass commands to non_tee function which uses subprocess
                        run_postprocess_nontee(
                            postprocess_nontee_arr,
                            workflow_prov,
                            err_log,
                            stat_log)
                else:
                    if file_name == '':
                        print "Warning: No aligner output for barcode = ", barcode, " ...skipping the postprocess step for command : \n", postprocess_temp_arr, "\n"
                    else:
                        print "Warning: No aligner output for filename = ", file_name, "  barcode = ", barcode, " ...skipping the postprocess step for command: \n", postprocess_temp_arr, "\n"
    return workflow_prov
Esempio n. 9
0
def execute_file(input_filename_local,input_filename_local_2,file_name,chunk_number,myrank,ii,file_basecount_dict):
        workflow_prov = []
        err_chunk_file = wd.err_log_path + "/" + file_name + \
                     "_log_temp/" + file_name + "_" + str(ii).zfill(6)
        stat_chunk_file = wd.stat_log_path + "/" + file_name + \
                      "_log_temp/" + file_name + "_" + str(ii).zfill(6)
        str_out="*" * 50 + "ALIGNMENT STARTED" + "\t" + str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + "*" * 50 + "\n"
        yap_file_io.write_data(str_out,err_chunk_file)
        yap_file_io.write_data(str_out,stat_chunk_file)
        for filename_key in file_basecount_dict.iterkeys():
            if filename_key == file_name:
                for barcode in wd.barcode_dict.iterkeys():
                    barcode_value = yap_tools.rename_barcode(barcode)
                    barcode_dir_path = wd.workflow_output_path + "/" + file_name + "/" + barcode
                    aligner_dir_path = barcode_dir_path + "/" + "aligner_output"
                    if wd.alignment_sort_order != 'unsorted':
                        if barcode_value != '':
                            aligner_output_filename = aligner_dir_path + "/" + \
                                "aligner_" + file_name + \
                                "_" + barcode_value
                        else:
                            aligner_output_filename = aligner_dir_path + \
                                "/" + "aligner_" + file_name
                    else:
                        if barcode_value != '':
                            aligner_output_filename = aligner_dir_path + \
                                "/" + file_name + \
                                "_" + barcode_value
                        else:
                            aligner_output_filename = aligner_dir_path + \
                                "/" + file_name

                    if wd.run_preprocess_analysis == 'yes':
                        preprocessed_file_inp1 = ['pipe1']
                        preprocessed_file_inp2 = ['pipe2']
                        preprocess_dir_path = barcode_dir_path + \
                            "/" + "preprocess_output"
                        preprocessed_inp1 = preprocess_dir_path + \
                            "/" + "*preprocess_data*_1.txt"
                        preprocessed_inp2 = preprocess_dir_path + \
                            "/" + "*preprocess_data_*2.txt"
                        preprocessed_file_inp1 = glob.glob(
                            preprocessed_inp1)
                        if wd.paired_end_data == "yes":
                            preprocessed_file_inp2 = glob.glob(
                                preprocessed_inp2)
                        if (wd.paired_end_data== "yes" and preprocessed_file_inp1 and preprocessed_file_inp2) or (wd.paired_end_data != "yes" and preprocessed_file_inp1):
                            print "Entering Alignment section: Filename=", input_filename_local, "barcode=", barcode, "\n"
                            if wd.paired_end_data == 'yes':
                                workflow_prov.append(
                                    "INPUT: " +
                                    preprocessed_file_inp1[0] +
                                    " and " +
                                    preprocessed_file_inp2[0])
                                aligner_out_str, workflow_prov = yap_aligner.run_aligner(preprocessed_file_inp1[0], preprocessed_file_inp2[
                                                                                    0],aligner_output_filename, chunk_number,myrank,workflow_prov, err_chunk_file, stat_chunk_file)
                            else:
                                workflow_prov.append(
                                    "INPUT: " +
                                    preprocessed_file_inp1[0])
                                aligner_out_str, workflow_prov = yap_aligner.run_aligner(preprocessed_file_inp1[
                                                                                    0], '', aligner_output_filename, chunk_number,myrank, workflow_prov, err_chunk_file, stat_chunk_file)

                            if wd.write_preprocessed_data != 'yes':
                                prm1 = Popen(
                                    "rm " +
                                    preprocess_dir_path +
                                    "/" +
                                    "*preprocess_data*_1.txt",
                                    shell='False').wait()
                                if paired_end_data == "yes":
                                    if preprocessed_file_inp2:
                                        prm2 = Popen(
                                            "rm " +
                                            preprocess_dir_path +
                                            "/" +
                                            "*preprocess_data*_2.txt",
                                            shell='False').wait()
                        else:
                            print "Skipping Alignment for : Filename=", input_filename_local, "barcode=", barcode, "........", "No preprocessed data found"
                    else:
                        if wd.paired_end_data == 'yes':
                            workflow_prov.append(
                                "INPUT: " +
                                input_filename_local +
                                " and " +
                                input_filename_local_2)
                            aligner_out_str, workflow_prov = yap_aligner.run_aligner(
                                input_filename_local, input_filename_local_2, aligner_output_filename, 0, workflow_prov, err_chunk_file, stat_chunk_file)
                        else:
                            workflow_prov.append("INPUT: " + input_filename_local)
                            aligner_out_str, workflow_prov = yap_aligner.run_aligner(
                                input_filename_local, '', aligner_cmd_arr, aligner_output_filename, 0, workflow_prov, err_chunk_file, stat_chunk_file)
		    #remove temporary files created by aligners
                    rm_cmd = "rm " + \
                        aligner_output_filename + "*.sai"
                    if len(glob.glob(aligner_output_filename + "*.sai")) > 0:
                        prm = Popen(
                            rm_cmd, shell='False').wait()
                    if barcode in file_basecount_dict[filename_key]:
                        pass
                    else:
                        file_basecount_dict[
                            filename_key][barcode] = []
	#write to log
        str_out="*" * 50 + "ALIGNMENT FINISHED" + "\t" + str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + "*" * 50 + "\n"
        yap_file_io.write_data(str_out,err_chunk_file)
        yap_file_io.write_data(str_out,stat_chunk_file)
        return workflow_prov, file_basecount_dict
Esempio n. 10
0
def execute_chunk(
        input_file_list_local,
        inp1,
        inp2,
        chunk_number,
	myrank,
        workflow_prov,
	eqp_dict):
    '''
    Executes preprocess commands for chunked data and passes to the alignment stage
    Takes chunked input data, filename list, chunk number, rank of the processor     
    and provenance list to append log data.
    ''' 
    # variable declaration
    input_filename_local = input_file_list_local[0]
    input_filename_local_2 = input_file_list_local[1]
    file_name = input_file_list_local[2]
    err_chunk_file = wd.err_log_path + "/" + file_name + \
        "_log_temp/" + file_name + "_" + str(chunk_number).zfill(6)
    stat_chunk_file = wd.stat_log_path + "/" + file_name + \
        "_log_temp/" + file_name + "_" + str(chunk_number).zfill(6)
    myhost = os.getenv('HOSTNAME')
    yap_file_io.write_data("HOSTNAME: " + str(myhost) + "\n", err_chunk_file)
    yap_file_io.write_data("HOSTNAME: " + str(myhost) + "\n", stat_chunk_file)
    yap_file_io.write_data("CHUNK NUMBER: " + str(chunk_number) + "\n", err_chunk_file)
    yap_file_io.write_data("CHUNK NUMBER: " + str(chunk_number) + "\n", stat_chunk_file)
    seqs_arr1 = []
    seqs_arr2 = []
    read_length = wd.max_read_length
    barcode_seqstruct_dict1 = {}
    barcode_seqstruct_dict2 = {}
    barcode_output_dict = {}
    aligner_out_str = ''
    sort_order = ''
    barcode_flag = 'False'
    sort_order = wd.alignment_sort_order
    # convert the input data based on format given in workflow configuration
    if wd.input_file_format == "qseq" or wd.input_file_format != wd.preprocess_output_file_format:
        inp1 = yap_tools.convert_format(inp1)
        if wd.paired_end_data == 'yes':
            inp2 = yap_tools.convert_format(inp2)
    if wd.run_preprocess_analysis == 'yes':
	str_out = "-"*20 + "PREPROCESS STARTED" +"\t" + str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + "-"*20 + "\n"
	yap_file_io.write_data(str_out,err_chunk_file)
	yap_file_io.write_data(str_out,stat_chunk_file)
        # Run barcode splitter as first preprocess step
        for jj in range(0, len(wd.preprocess_cmd_arr)):
            preprocess_cmd_name = wd.preprocess_cmd_arr[jj][2][0][0]
            preprocess_cmd = wd.preprocess_cmd_arr[jj][2][0][1]
            if re.search('fastx_barcode_splitter', preprocess_cmd_name) is not None:
                barcode_flag = 'True'
                print "Entering " + preprocess_cmd_name + " : Filename=", input_filename_local, " chunk number=", chunk_number, "\n"
		str_out= "YAP_COMMAND: " + preprocess_cmd + "\n" + "INPUT FILE: " + input_filename_local
		yap_file_io.write_data(str_out,err_chunk_file)
		yap_file_io.write_data(str_out,stat_chunk_file)
                barcode_seqstruct_dict1, workflow_prov = yap_preprocess.fastx_barcode_splitter(
                    inp1, wd.preprocess_output_file_format, preprocess_cmd, workflow_prov, err_chunk_file, stat_chunk_file)
                yap_file_io.write_data("_" * 30 + "\n", err_chunk_file)
                yap_file_io.write_data("_" * 30 + "\n", stat_chunk_file)
                barcode_seqstruct_dict1["no_barcode_specified"] = ''
                print "Exiting " + preprocess_cmd_name + " : Filename=", input_filename_local, " chunk number=", chunk_number, "\n"
                if wd.paired_end_data == 'yes':
                    print "Entering " + preprocess_cmd_name + " : Filename=", input_filename_local_2, " chunk number=", chunk_number, "\n"
		    str_out= "YAP_COMMAND: " + preprocess_cmd + "\n" + "INPUT FILE: " + input_filename_local_2
		    yap_file_io.write_data(str_out,err_chunk_file)
		    yap_file_io.write_data(str_out,stat_chunk_file)
                    barcode_seqstruct_dict2, workflow_prov = yap_preprocess.fastx_barcode_splitter(
                        inp2,wd.preprocess_output_file_format , preprocess_cmd, workflow_prov, err_chunk_file, stat_chunk_file)
                    yap_file_io.write_data("_" * 30 + "\n", err_chunk_file)
                    yap_file_io.write_data("_" * 30 + "\n", stat_chunk_file)
                    barcode_seqstruct_dict2["no_barcode_specified"] = ''
                    print "Exiting " + preprocess_cmd_name + " : Filename=", input_filename_local, " chunk number=", chunk_number, "\n"
                break
        if barcode_flag == 'False':
            #if no barcode command; then create dictionary with one barcode tag
            barcode_seqstruct_dict1["no_barcode_specified"] = inp1
            barcode_seqstruct_dict2["no_barcode_specified"] = inp2
    else:
        #if no preprocess stage specified; then create dictionary with one barcode tag
        barcode_seqstruct_dict1["no_barcode_specified"] = inp1
        barcode_seqstruct_dict2["no_barcode_specified"] = inp2
    #iterate over the barcode dictionary 
    for barcode, inp1 in barcode_seqstruct_dict1.iteritems():
        run_unique_reads = 'False'
        barcode_value = yap_tools.rename_barcode(barcode)
        if wd.paired_end_data == "yes":
            inp2 = barcode_seqstruct_dict2[barcode]
        preprocessed_data_dict = {}
	#intialize matrix for basecount analysis
        aligner_output_str_local = ''
        basecount_matrix_local1 = numpy.zeros(
            (int(read_length), 5), dtype=numpy.int)
        basecount_matrix_local2 = numpy.zeros(
            (int(read_length), 5), dtype=numpy.int)
        barcode_output_dict.setdefault(barcode, [basecount_matrix_local1, basecount_matrix_local2])
        #set output file paths
        barcode_dir_path = wd.workflow_output_path + "/" + file_name + "/" + barcode
        preprocess_dir_path = barcode_dir_path + "/" + "preprocess_output"
        if wd.data_distribution_method != "file_based":
            if barcode_value != '':
                preprocess_out_filename1 = preprocess_dir_path + "/" + barcode_value + "_" + file_name + \
                    "_" + str(chunk_number).zfill(6) + "_" + \
                    str(myrank) + "_preprocessed_data_1.txt"
                preprocess_out_filename2 = preprocess_dir_path + "/" + barcode_value + "_" + file_name + \
                    "_" + str(chunk_number).zfill(6) + "_" + \
                    str(myrank) + "_preprocessed_data_2.txt"
            else:
                preprocess_out_filename1 = preprocess_dir_path + "/" + file_name + "_" + \
                    str(chunk_number).zfill(6) + "_" + \
                    str(myrank) + "_preprocessed_data_1.txt"
                preprocess_out_filename2 = preprocess_dir_path + "/" + file_name + "_" + \
                    str(chunk_number).zfill(6) + "_" + \
                    str(myrank) + "_preprocessed_data_2.txt"
        else:
            if barcode_value != '':
                preprocess_out_filename1 = preprocess_dir_path + "/" + \
                    "preprocess_data" + "_" + file_name + \
                    "_" + barcode_value + "_1.txt"
                preprocess_out_filename2 = preprocess_dir_path + "/" + \
                    "preprocess_data" + "_" + file_name + \
                    "_" + barcode_value + "_2.txt"
            else:
                preprocess_out_filename1 = preprocess_dir_path + "/" + \
                    "preprocess_data" + "_" + file_name + "_1.txt"
                preprocess_out_filename2 = preprocess_dir_path + "/" + \
                    "preprocess_data" + "_" + file_name + "_2.txt"
        aligner_dir_path = barcode_dir_path + "/" + "aligner_output"
        if barcode_value != '':
            aligner_output_filename = aligner_dir_path + "/" + "aligner_" + \
                file_name + "_" + barcode_value + \
                "_" + str(chunk_number).zfill(6)
        else:
            aligner_output_filename = aligner_dir_path + "/" + \
                "aligner_" + file_name + "_" + str(chunk_number).zfill(6)

        for jj in range(0, len(wd.preprocess_cmd_arr)):
            preprocess_cmd_name = wd.preprocess_cmd_arr[jj][2][0][1]
            preprocess_cmd = wd.preprocess_cmd_arr[jj][2][0][1]
            # skip fastqc and fastq screen and barcode splitter as they are
            # already executed
            if (re.search('fastqc', preprocess_cmd_name) is not None) or (re.search('fastq_screen', preprocess_cmd_name) is not None)or(re.search('fastx_barcode_splitter',
                                                                                                                                                  preprocess_cmd_name) is not None):
                pass
            else:
                if re.search('calculate_basecount_metrics', preprocess_cmd_name) is not None:
		    #excecute basecount calculation
                    basecount_matrix_local1, workflow_prov = yap_tools.qc_basecount(
                        inp1, workflow_prov)
                    basecount_matrix_local2, workflow_prov = yap_tools.qc_basecount(
                        inp2, workflow_prov)
                elif re.search('fastx_clipper', preprocess_cmd_name) is not None:
		    """
		    Check for fastx clipper as special case and execute.
		    This is because fastx clipper execution has been optimized by providing contaminants for every file,
		    instead of just applying contaminants universally. 
		    """ 
                    run_unique_reads = 'True'
                    if input_filename_local in wd.contaminant_dict.keys():
                        contaminants_arr1 = wd.contaminant_dict[
                            input_filename_local]
                        print "Entering " + preprocess_cmd_name + " : Filename=", input_filename_local, " chunk number=", chunk_number, "\n"
                        index = 0
                        for index in range(0, len(contaminants_arr1)):
			    #iterate over all the contaminants for this file 
                            fastx_clipper_cmd = preprocess_cmd
                            contaminant1 = contaminants_arr1[index].strip("\n")
                            if inp1 != '':
                                cont_replace = " -a " + contaminant1
                                fastx_clipper_cmd = fastx_clipper_cmd.replace(
                                    'pipe1', " - ") + " -a " + contaminant1
                                inp1 = yap_tools.multiproc_function(
                                    fastx_clipper_cmd, inp1, int(
                                        wd.format_specific_lines), '', err_chunk_file, stat_chunk_file)
                                yap_log.merge_multiproc_files(
                                    fastx_clipper_cmd,
                                    input_filename_local,
                                    barcode,
                                    err_chunk_file,
                                    stat_chunk_file)
                            if inp1 == '':
                                break
                        print "Exiting " + preprocess_cmd_name + " : Filename=", input_filename_local, " chunk number=", chunk_number, "\n"
                    if wd.paired_end_data == 'yes':
                        if input_filename_local_2 in wd.contaminant_dict.keys():
			    #repeat fastx clipper for the paired end
                            contaminants_arr2 = wd.contaminant_dict[
                                input_filename_local_2]
                            print "Entering " + preprocess_cmd_name + " : Filename=", input_filename_local_2, " chunk number=", chunk_number, "\n"
                            index = 0
                            for index in range(0, len(contaminants_arr2)):
                                fastx_clipper_cmd = preprocess_cmd
                                contaminant2 = contaminants_arr2[
                                    index].strip("\n")
                                if inp2 != '':
                                    cont_replace = " -a " + contaminant2
                                    fastx_clipper_cmd = fastx_clipper_cmd.replace(
                                        'pipe1',
                                        " - ") + " -a " + contaminant2
                                    inp2 = yap_tools.multiproc_function(
                                        fastx_clipper_cmd, inp2, int(
                                            wd.format_specific_lines), '', err_chunk_file, stat_chunk_file)
                                    yap_log.merge_multiproc_files(
                                        fastx_clipper_cmd,
                                        input_filename_local_2,
                                        barcode,
                                        err_chunk_file,
                                        stat_chunk_file)
                                if inp2 == '':
                                    break
                            print "Exiting " + preprocess_cmd_name + " : Filename=", input_filename_local_2, " chunk number=", chunk_number, "\n"
                elif re.search('eqp_rename_reads',preprocess_cmd_name) != None:
                        # this section renames reads according to specific format, applies to in-house use, neglect otherwise
                        inp1_arr = inp1.splitlines(1)
                        inp1=''
                        inp2_arr = inp2.splitlines(1)
                        inp2=''
                        read_count=1
                        if wd.data_distribution_method == "file_based":
                                if eqp_dict.has_key("eqp_read_counter"):
                                        if len(eqp_dict["eqp_read_counter"]) > 0:
                                                file_name, read_count = eqp_dict["eqp_read_counter"]
                                                if file_name !=  input_filename_local:
                                                        read_count = 1
                        format_lines = int(wd.format_specific_lines)
                        for i in range(0,len(inp1_arr),format_lines):
                                if wd.paired_end_data == 'yes':
                                        if (len(inp1_arr[i+1].strip("\n").replace('A','')) >= 5) and (len(inp2_arr[i+1].strip("\n").replace('A','')) >= 5) and (len(inp1_arr[i+1].strip("\n").replace('T','')) >= 5) and (len(inp2_arr[i+1].strip("\n").replace('T','')) >= 5) :
                                                inp1 += '@F'+str(read_count).zfill(9)+'/1'+'\n'
                                                inp2 += '@F'+str(read_count).zfill(9)+'/2'+'\n'
                                                for jj in range (1,format_lines):
                                                        inp1 += inp1_arr[i+jj]
                                                        inp2 += inp2_arr[i+jj]
                                                read_count += 1
                                else:
                                        if (len(inp1_arr[i+1].strip("\n").replace('A','')) >= 5) and (len(inp1_arr[i+1].strip("\n").replace('T','')) >= 5):
                                                inp1_arr[i] = '@F'+str(read_count).zfill(9)+'/1'+'\n'
                                                for jj in range (1,format_lines):
                                                        inp1 += inp1_arr[i+jj]
                                                read_count += 1
                        eqp_dict["eqp_read_counter"] = [ input_filename_local, read_count]
                        inp1_arr = []
                        inp2_arr = []
                else:
		    #set the flag to remove umatched pair after preprocesing 
                    run_unique_reads = 'True'
                    print "Entering " + preprocess_cmd_name + " : Filename=", input_filename_local, " chunk number=", chunk_number, "\n"
		    #for all other preprocess commands execute this section
                    if inp1 != '':
                        preprocess_cmd = preprocess_cmd.replace('pipe1', ' - ')
                        inp1 = yap_tools.multiproc_function(
                            preprocess_cmd, inp1, int(
                                wd.format_specific_lines), '', err_chunk_file, stat_chunk_file)
                        yap_log.merge_multiproc_files(
                            preprocess_cmd,
                            input_filename_local,
                            barcode,
                            err_chunk_file,
                            stat_chunk_file)
                    print "Exiting " + preprocess_cmd_name + " : Filename=", input_filename_local, " chunk number=", chunk_number, "\n"
                    if wd.paired_end_data == 'yes':
                        preprocess_cmd = preprocess_cmd.replace('pipe1', ' - ')
                        print "Entering " + preprocess_cmd_name + " : Filename=", input_filename_local_2, " chunk number=", chunk_number, "\n"
                        if inp2 != '':
                            inp2 = yap_tools.multiproc_function(
                                preprocess_cmd, inp2, int(
                                    wd.format_specific_lines), '', err_chunk_file, stat_chunk_file)
                            yap_log.merge_multiproc_files(
                                preprocess_cmd,
                                input_filename_local_2,
                                barcode,
                                err_chunk_file,
                                stat_chunk_file)
                        print "Exiting " + preprocess_cmd_name + " : Filename=", input_filename_local_2, " chunk number=", chunk_number, "\n"
        if wd.paired_end_data == 'yes':
            if run_unique_reads == 'True':
		#remove all the umatched pairs from two chunks belonging to the same sample
		#this is because each chunk goes through command separately, not as a pair.
                if inp1 != '' and inp2 != '':
                    inp1, inp2 = yap_tools.find_unique_set(
                        inp1.splitlines(1), inp2.splitlines(1))
	if wd.run_preprocess_analysis  == 'yes':
		#write log data
		str_out="-"*20 + "PREPROCESS FINISHED" + "\t" + str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + "-"*20 + "\n"
		yap_file_io.write_data(str_out, err_chunk_file)
		yap_file_io.write_data(str_out, stat_chunk_file)
        if wd.data_distribution_method != "file_based":
	    #if the workflow is not filebased; then pass the chunks for alignment.
            if wd.run_reference_alignment == 'yes':
		str_out="-"*20 + "ALIGNMENT STARTED" + "\t" + str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + "-"*20 + "\n"
		yap_file_io.write_data(str_out, err_chunk_file)
		yap_file_io.write_data(str_out, stat_chunk_file)
                if (wd.paired_end_data == 'yes' and inp1 != '' and inp2 != '') or (wd.paired_end_data != 'yes' and inp1 != ''):
                    print "Entering Alignment: Filename=", input_filename_local, "barcode=", barcode, " chunk number=", chunk_number, "\n"
                    if wd.paired_end_data == 'yes':
                        workflow_prov.append(
                            "INPUT: " +
                            input_filename_local +
                            " and " +
                            input_filename_local_2 +
                            " chunk number= " +
                            str(chunk_number))
                        aligner_out_str, workflow_prov = yap_aligner.run_aligner(
                            inp1, inp2,aligner_output_filename, chunk_number, myrank,workflow_prov, err_chunk_file, stat_chunk_file)
                    else:
                        workflow_prov.append(
                            "INPUT: " +
                            input_filename_local +
                            " chunk number= " +
                            str(chunk_number))
                        aligner_out_str, workflow_prov = yap_aligner.run_aligner(
                            inp1, '', aligner_output_filename, chunk_number,myrank,workflow_prov, err_chunk_file, stat_chunk_file)
                    rm_cmd = "rm " + aligner_output_filename + "*.sai"
                    if len(glob.glob(aligner_output_filename + "*.sai")) > 0:
                        prm = Popen(rm_cmd, shell='False').wait()
                    if len(glob.glob(aligner_output_filename + "*.head")) > 0:
                        prm = Popen(rm_cmd, shell='False').wait()

                else:
                    	print "Exiting Alignment: Filename=", input_filename_local, "barcode=", barcode, " chunk number=", chunk_number, "\n"
		str_out="-"*20 + "ALIGNMENT FINISHED" + "\t" + str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + "-"*20+ "\n"
		yap_file_io.write_data(str_out, err_chunk_file)
		yap_file_io.write_data(str_out, stat_chunk_file)
            if wd.run_preprocess_analysis == 'yes':
                if wd.write_preprocessed_data == 'yes':
		    #write preprocessed data to file
                    yap_file_io.write_data(inp1, preprocess_out_filename1)
                    if wd.paired_end_data == "yes":
                        yap_file_io.write_data(inp2, preprocess_out_filename2)
                else:
		    #else empty input data chunk
                    inp1 = ''
                    inp2 = ''
            else:
		#else empty input data chunk
                inp1 = ''
                inp2 = ''

        else:
	    #if workflow is filebased; then write preprocessed data to file
            if wd.run_preprocess_analysis == "yes":
                if wd.write_preprocessed_data == 'yes' or wd.run_reference_alignment == "yes":
                    yap_file_io.write_data(inp1, preprocess_out_filename1)
                    if wd.paired_end_data == "yes":
                        yap_file_io.write_data(inp2, preprocess_out_filename2)
        barcode_output_dict[barcode][0] = basecount_matrix_local1
        barcode_output_dict[barcode][1] = basecount_matrix_local2
    return barcode_output_dict, workflow_prov
Esempio n. 11
0
def regroup_files(regroup_arr, workflow_prov):
    '''
    Merge the alignment data based on regroup sample information given in-
    workflow configuration. Takes care of sorting and merging different file outputs
    into a single sample output.

    Parameters:
        regroup_arr -- two-element sequence: [regroup_title, regroup_files_arr],
            where regroup_title is the merged sample name and regroup_files_arr
            lists the per-sample names whose outputs are combined.
        workflow_prov -- provenance list; passed through to
            execute_merge_alignment, which may append to it.
    Returns:
        workflow_prov.
    Side effects:
        Concatenates and removes per-sample err/stat logs, recreates the
        *_log_temp directories, appends a line to regroup_files.log, merges
        per-chunk sam/bam files via samtools, and deletes the merged chunks.
    '''
    regroup_title = regroup_arr[0]
    regroup_files_arr = regroup_arr[1]
    # Collect the existing per-sample log paths into space-prefixed strings so
    # they can be passed directly to "cat"/"rm" shell commands below.
    err_string = ""
    stat_string = ""
    for i in range(len(regroup_files_arr)):
        if os.path.exists(wd.err_log_path + "/" + regroup_files_arr[i] +
                          "_err.log"):
            err_string += " " + wd.err_log_path + "/" + regroup_files_arr[
                i] + "_err.log"
        if os.path.exists(wd.stat_log_path + "/" + regroup_files_arr[i] +
                          "_stat.log"):
            stat_string += " " + wd.stat_log_path + "/" + regroup_files_arr[
                i] + "_stat.log"
        # Drop the per-sample temp log directories; their contents were
        # already folded into the *_err.log/*_stat.log files.
        if os.path.exists(wd.err_log_path + "/" + regroup_files_arr[i] +
                          "_log_temp"):
            os.system("rm -rf " + wd.err_log_path + "/" +
                      regroup_files_arr[i] + "_log_temp")
        if os.path.exists(wd.stat_log_path + "/" + regroup_files_arr[i] +
                          "_log_temp"):
            os.system("rm -rf " + wd.stat_log_path + "/" +
                      regroup_files_arr[i] + "_log_temp")
    # Guard: skip the cat/rm when the collected list is exactly the target
    # file itself (avoids cat-ing a file onto itself and then deleting it).
    # NOTE(review): when err_string is empty this still runs "cat> target",
    # truncating/creating an empty log -- confirm that is intended.
    if err_string != " " + wd.err_log_path + "/" + regroup_title + "_err.log":
        os.system("cat" + err_string + "> " + wd.err_log_path + "/" +
                  regroup_title + "_err.log")
        os.system("rm" + err_string)
    os.system("mkdir " + wd.err_log_path + "/" + regroup_title + "_log_temp")
    if stat_string != " " + wd.stat_log_path + "/" + regroup_title + "_stat.log":
        os.system("cat" + stat_string + "> " + wd.stat_log_path + "/" +
                  regroup_title + "_stat.log")
        os.system("rm" + stat_string)
    os.system("mkdir " + wd.stat_log_path + "/" + regroup_title + "_log_temp")
    err_log = wd.err_log_path + "/" + regroup_title + "_err.log"
    stat_log = wd.stat_log_path + "/" + regroup_title + "_stat.log"
    #write regrouped file information to log file
    regroup_merge_log = wd.regroup_output_path + "/" + "regroup_files.log"
    fw = open(regroup_merge_log, 'a')
    fw.write(regroup_title + " ")
    # Record "title file1,file2,...,fileN" (no trailing comma on the last).
    for i in range(0, len(regroup_files_arr)):
        if i == len(regroup_files_arr) - 1:
            fw.write(regroup_files_arr[i])
        else:
            fw.write(regroup_files_arr[i] + ",")
    fw.write("\n")
    fw.close()
    # One merge pass per configured aligner output key (each key describes a
    # distinct aligner output stream, e.g. output_file*/accepted_hits*).
    for aligner_output_key in wd.aligner_output_key_arr:
        sort_input_files_arr = []
        sort_input_files_arr_queryname = []
        sort_input_files_arr_coordinate = []
        aligner_output_suffix = ''
        # Derive the chunk-file suffix from whatever follows "output_file"
        # (or "accepted_hits") in the key.
        if len(aligner_output_key.split("output_file")) > 1:
            aligner_output_suffix = aligner_output_key.split("output_file")[1]
        else:
            if len(aligner_output_key.split("accepted_hits")) > 1:
                aligner_output_suffix = aligner_output_key.split(
                    "accepted_hits")[1]
        aligner_output_suffix, suffix_ext = os.path.splitext(
            aligner_output_suffix)
        if aligner_output_suffix == '.sam' or aligner_output_suffix == '.bam':
            # Plain .sam/.bam output: no extra suffix component.
            aligner_output_suffix = ''
        else:
            # Double extension (e.g. ".xyz.sam"): strip one more level.
            aligner_output_suffix, suffix_ext = os.path.splitext(
                aligner_output_suffix)
        sort_input_files_arr = []
        sort_input_files_arr_queryname = []
        sort_input_files_arr_coordinate = []
        # Gather the chunk files of every sample/barcode being regrouped.
        # NOTE(review): file_name, barcode, file_type and
        # final_output_with_suffix deliberately leak out of these loops and
        # are used below; if regroup_files_arr or barcode_dict were empty
        # the later references would raise NameError -- confirm inputs are
        # always non-empty.
        for filename_key in regroup_files_arr:
            file_name = filename_key
            file_type = ''
            sort_files_cmd = ''
            suffix_ext = ''
            for barcode in wd.barcode_dict.iterkeys():
                barcode_value = yap_tools.rename_barcode(barcode)
                aligner_dir_path = wd.workflow_output_path + "/" + file_name + "/" + barcode + "/" + "aligner_output"
                regroup_aligner_dir_path = wd.regroup_output_path + "/" + regroup_title + "/" + barcode + "/" + "aligner_output"
                aligner_output_base = aligner_dir_path + \
                    "/" + barcode_value + "_" + file_name
                if barcode_value != '':
                    aligner_output_base = aligner_dir_path + "/" + \
                        "aligner_" + file_name + "_" + barcode_value
                    aligner_final_output_base = regroup_aligner_dir_path + \
                        "/" + regroup_title + "_" + barcode_value
                else:
                    aligner_output_base = aligner_dir_path + "/" + "aligner_" + file_name
                    aligner_final_output_base = regroup_aligner_dir_path + "/" + regroup_title
                if re.search('accepted_hits', aligner_output_key) != None:
                    aligner_output_base = aligner_dir_path + "/" + "accepted_hits"
                output_with_suffix = aligner_output_base + "*" + aligner_output_suffix
                final_output_with_suffix = aligner_final_output_base + aligner_output_suffix
                # Detect the container format from whichever chunks exist.
                if len(glob.glob(output_with_suffix + "*.sam")) > 0:
                    file_type = "sam"
                if len(glob.glob(output_with_suffix + "*.bam")) > 0:
                    file_type = "bam"
                if wd.alignment_sort_order == 'both':
                    # Keep queryname- and coordinate-sorted chunks in
                    # separate merge lists.
                    sort_input_files_arr_queryname.extend(
                        glob.glob(output_with_suffix + "*queryname*"))
                    sort_input_files_arr_coordinate.extend(
                        glob.glob(output_with_suffix + "*coordinate*"))
                else:
                    sort_input_files_arr.extend(
                        glob.glob(output_with_suffix + "*" + file_type))
        if wd.alignment_sort_order == 'both':
            # Two merged outputs: "samtools merge -n" preserves queryname
            # order, plain "samtools merge" preserves coordinate order.
            final_output_name_queryname = final_output_with_suffix + "_" + "queryname" + "." + file_type
            final_output_name_coordinate = final_output_with_suffix + "_" + "coordinate" + "." + file_type
            sort_files_cmd = 'samtools merge -n '
            execute_merge_alignment(final_output_name_queryname,
                                    sort_input_files_arr_queryname, file_type,
                                    file_name, barcode, sort_files_cmd,
                                    workflow_prov, err_log, stat_log)
            sort_files_cmd = 'samtools merge '
            execute_merge_alignment(final_output_name_coordinate,
                                    sort_input_files_arr_coordinate, file_type,
                                    file_name, barcode, sort_files_cmd,
                                    workflow_prov, err_log, stat_log)
        else:
            if wd.alignment_sort_order == 'unsorted':
                final_output_name = final_output_with_suffix
            else:
                final_output_name = final_output_with_suffix + "_" + wd.alignment_sort_order
            if file_type == "sam":
                final_output_name += ".sam"
            if file_type == "bam":
                final_output_name += ".bam"
            if file_type == 'sam' or file_type == "bam":
                sort_cmd_input = ''
                if wd.alignment_sort_order == 'queryname':
                    sort_files_cmd = 'samtools merge -n '
                if wd.alignment_sort_order == 'coordinate':
                    sort_files_cmd = 'samtools merge '
                execute_merge_alignment(final_output_name,
                                        sort_input_files_arr, file_type,
                                        file_name, barcode, sort_files_cmd,
                                        workflow_prov, err_log, stat_log)
        if file_type == "sam" or file_type == "bam":
            # Clean up the per-chunk files that were just merged.
            # NOTE(review): shell='False' is a truthy *string*, so the shell
            # is actually enabled -- which the glob in rm_cmd requires.
            rm_cmd = "rm " + aligner_dir_path + "/" + barcode + \
                "*" + aligner_output_suffix + "*." + file_type
            if len(
                    glob.glob(aligner_dir_path + "/" + barcode + "*" +
                              aligner_output_suffix + "*." + file_type)) > 0:
                prm = Popen(rm_cmd, shell='False').wait()
                if prm != 0:
                    if file_name == '':
                        print "Error: chunk merge clean up after sort failed for barcode=", barcode, "\n"
                    else:
                        print "Error: chunks clean up after merge sort failed for filename =", file_name, "barcode=", barcode, "\n"
    return workflow_prov
Esempio n. 12
0
def merge_alignment_output(file_basecount_dict, workflow_prov, err_log,
                           stat_log):
    '''
    Prepares for merging of chunk alignment output into single file,
    generates chunk list and commands based on filename and format.

    Parameters:
        file_basecount_dict -- maps input file path -> {barcode: basecount
            data}; only the keys are used here to locate aligner output dirs.
        workflow_prov -- provenance list; passed to execute_merge_alignment,
            which may append to it.
        err_log, stat_log -- log file paths forwarded to the merge helper.
    Returns:
        workflow_prov.
    Side effects:
        Runs "samtools merge" (via execute_merge_alignment) to combine the
        per-chunk sam/bam files, then deletes the merged chunks.
    '''
    sort_order = wd.alignment_sort_order
    for filename_key in file_basecount_dict.iterkeys():
        path_name, file_name = os.path.split(filename_key)
        barcode_basecount_dict = file_basecount_dict[filename_key]
        file_type = ''
        sort_files_cmd = ''
        suffix_ext = ''
        for barcode in barcode_basecount_dict.iterkeys():
            barcode_value = yap_tools.rename_barcode(barcode)
            aligner_dir_path = wd.workflow_output_path + "/" + file_name + "/" + barcode + "/" + "aligner_output"
            aligner_output_base = aligner_dir_path + "/" + barcode_value + "_" + file_name
            if barcode_value != '':
                aligner_output_base = aligner_dir_path + "/" + "aligner_" + file_name + "_" + barcode_value
                aligner_final_output_base = aligner_dir_path + "/" + file_name + "_" + barcode_value
            else:
                aligner_output_base = aligner_dir_path + "/" + "aligner_" + file_name
                aligner_final_output_base = aligner_dir_path + "/" + file_name
            for aligner_output_key in wd.aligner_output_key_arr:
                sort_input_files_arr = []
                aligner_output_suffix = ''
                # Derive the chunk-file suffix from whatever follows
                # "output_file" (or "accepted_hits") in the key.
                # BUGFIX: the original code re-split the key on "output_file"
                # unconditionally after this if/else, which clobbered the
                # suffix computed in the accepted_hits branch and raised
                # IndexError for keys that do not contain "output_file".
                if len(aligner_output_key.split("output_file")) > 1:
                    aligner_output_suffix = aligner_output_key.split(
                        "output_file")[1]
                else:
                    if len(aligner_output_key.split("accepted_hits")) > 1:
                        aligner_output_suffix = aligner_output_key.split(
                            "accepted_hits")[1]
                        aligner_output_base = aligner_dir_path + "/" + "accepted_hits"
                aligner_output_suffix, suffix_ext = os.path.splitext(
                    aligner_output_suffix)
                if aligner_output_suffix == '.sam' or aligner_output_suffix == '.bam':
                    # Plain .sam/.bam output: no extra suffix component.
                    aligner_output_suffix = ''
                else:
                    # Double extension (e.g. ".xyz.sam"): strip one more level.
                    aligner_output_suffix, suffix_ext = os.path.splitext(
                        aligner_output_suffix)
                output_with_suffix = aligner_output_base + "*" + aligner_output_suffix
                final_output_with_suffix = aligner_final_output_base + aligner_output_suffix
                # Detect the container format from whichever chunks exist.
                if len(glob.glob(output_with_suffix + "*.sam")) > 0:
                    file_type = "sam"
                if len(glob.glob(output_with_suffix + "*.bam")) > 0:
                    file_type = "bam"
                if wd.alignment_sort_order == 'both':
                    # Two merged outputs: "samtools merge -n" preserves
                    # queryname order, plain "samtools merge" preserves
                    # coordinate order.
                    sort_input_files_arr = glob.glob(output_with_suffix +
                                                     "*queryname*")
                    final_output_name = final_output_with_suffix + "_" + "queryname" + "." + file_type
                    sort_files_cmd = 'samtools merge -n '
                    execute_merge_alignment(final_output_name,
                                            sort_input_files_arr, file_type,
                                            file_name, barcode, sort_files_cmd,
                                            workflow_prov, err_log, stat_log)
                    sort_input_files_arr = glob.glob(output_with_suffix +
                                                     "*coordinate*")
                    final_output_name = final_output_with_suffix + "_" + "coordinate" + "." + file_type
                    sort_files_cmd = 'samtools merge '
                    execute_merge_alignment(final_output_name,
                                            sort_input_files_arr, file_type,
                                            file_name, barcode, sort_files_cmd,
                                            workflow_prov, err_log, stat_log)
                else:
                    sort_input_files_arr = glob.glob(output_with_suffix + "*" +
                                                     file_type)
                    if wd.alignment_sort_order == 'unsorted':
                        final_output_name = final_output_with_suffix
                    else:
                        final_output_name = final_output_with_suffix + "_" + wd.alignment_sort_order
                    if file_type == "sam":
                        final_output_name += ".sam"
                    if file_type == "bam":
                        final_output_name += ".bam"
                    if file_type == 'sam' or file_type == "bam":
                        sort_cmd_input = ''
                        if wd.alignment_sort_order == 'queryname':
                            sort_files_cmd = 'samtools merge -n '
                        if wd.alignment_sort_order == 'coordinate':
                            sort_files_cmd = 'samtools merge '
                        execute_merge_alignment(final_output_name,
                                                sort_input_files_arr,
                                                file_type, file_name, barcode,
                                                sort_files_cmd, workflow_prov,
                                                err_log, stat_log)
                if file_type == "sam" or file_type == "bam":
                    # Clean up the per-chunk files that were just merged.
                    # rm_cmd contains shell globs, so the shell is required.
                    # (The original passed shell='False', a truthy string,
                    # which already meant shell=True; made explicit here.)
                    rm_cmd = "rm " + aligner_dir_path + "/" + barcode + \
                        "*" + aligner_output_suffix + "*." + file_type
                    if len(
                            glob.glob(aligner_dir_path + "/" + barcode + "*" +
                                      aligner_output_suffix + "*." +
                                      file_type)) > 0:
                        prm = Popen(rm_cmd, shell=True)
                        std_out, std_err = prm.communicate()
                        exit_code = prm.returncode
                        if exit_code != 0:
                            if file_name == '':
                                print "Error: chunk merge clean up after sort failed for barcode=", barcode, "\n"
                            else:
                                print "Error: chunks clean up after merge sort failed for filename =", file_name, "barcode=", barcode, "\n"
    return workflow_prov