Example #1
def print_matrix(status_dict, out_file):
    """ Called within the pass_fail_matrix() function.
        Takes a dict keyed by sample (or command) name, each mapping to a
        stagewise pass/fail dict, plus an output file path.
        Prints the stagewise matrix to the file. """
    normal_stage_arr = ['PREPROCESS', 'ALIGNMENT', 'POSTPROCESS']  # per-sample stages of the matrix
    compare_stage_arr = ['CUFFDIFF', 'CUFFCOMPARE', 'CUFFMERGE', 'MACS2']  # multi-sample commands, printed after the matrix
    head_str = '\t' + '\t'.join(normal_stage_arr) + '\n'
    normal_str = ""
    compare_str = ""
    for name in sorted(status_dict.keys()):
        stages = status_dict[name]
        if sorted(stages.keys()) == sorted(normal_stage_arr):
            # one tab-separated row per sample, in fixed stage order
            normal_str += name + "\t" + \
                "\t".join(stages[stage] for stage in normal_stage_arr) + "\n"
        elif set(stages.keys()).issubset(set(compare_stage_arr)):
            for stage in stages.keys():
                compare_str += name + ": " + stages[stage] + "\n"
    yap_file_io.write_data(head_str + normal_str, out_file)
    if compare_str != '':
        yap_file_io.write_data("\n\n" + compare_str, out_file)
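For orientation, a minimal usage sketch with a stubbed-in yap_file_io (in the real codebase that module writes to files; here we assume write_data appends text to a path). The sample names and statuses are hypothetical:

class yap_file_io(object):  # stub standing in for the real YAP module
    @staticmethod
    def write_data(text, path):
        with open(path, 'a') as fh:
            fh.write(text)

status = {
    'sample_A': {'PREPROCESS': 'PASS', 'ALIGNMENT': 'PASS', 'POSTPROCESS': 'FAIL'},
    'cuffdiff_run1': {'CUFFDIFF': 'PASS'},
}
print_matrix(status, 'pass_fail_matrix.txt')
# The file now holds a tab-separated per-sample matrix:
#     <TAB>PREPROCESS<TAB>ALIGNMENT<TAB>POSTPROCESS
#     sample_A<TAB>PASS<TAB>PASS<TAB>FAIL
# followed, after a blank line, by the multi-sample section:
#     cuffdiff_run1: PASS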
Example #2
def fastx_barcode_splitter(
        seqs_str,
        output_file_format,
        fastx_barcode_splitter_cmd,
        preprocess_prov,
        err_log,
        stat_log):
    '''
    Runs the barcode splitter and returns its output as a dictionary
    mapping each barcode to its sequence string.
    '''
    preprocess_prov.append(fastx_barcode_splitter_cmd)
    globaldict = {}
    P1 = Popen(fastx_barcode_splitter_cmd, stdin=PIPE,
               stdout=PIPE, stderr=PIPE, shell=True)
    try:
        std_out, std_err = P1.communicate(seqs_str)
        exit_code = P1.returncode
        yap_log.write_log(fastx_barcode_splitter_cmd, "",
                          exit_code, std_err, err_log, stat_log)
        # stdout lists barcode=>sequence blocks separated by '|'
        bar_seq_split = std_out.replace(" ", "").split("|")
        for entry in bar_seq_split:
            if entry != '':
                split_entry = entry.split("=>")
                globaldict[split_entry[0]] = split_entry[1]
    except Exception as e:
        yap_file_io.write_data(str(e), err_log)
    yap_file_io.write_data("\n", err_log)
    yap_file_io.write_data("\n", stat_log)
    return globaldict, preprocess_prov
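The parsing step above assumes the splitter's standard output lists barcode=>sequence blocks separated by '|' (a format inferred from this code rather than from fastx documentation). A standalone sketch of just that parsing, on synthetic output:

# Hypothetical splitter output: barcode=>sequence blocks separated by '|'.
std_out = "BC1 => @r1\nACGT\n | BC2 => @r2\nTTGG\n | unmatched => @r3\nAAAA\n"

globaldict = {}
for entry in std_out.replace(" ", "").split("|"):
    if entry != '':
        split_entry = entry.split("=>")
        globaldict[split_entry[0]] = split_entry[1]

# globaldict == {'BC1': '@r1\nACGT\n', 'BC2': '@r2\nTTGG\n',
#                'unmatched': '@r3\nAAAA\n'}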
Example #3
def write_log(command, input_files, exit_code, std_err, err_log, stat_log):
    """ Checks whether the command succeeded or failed and logs it
        to the status or error log respectively. """
    cmd_sep = "_" * 30 + "\n"
    if str(exit_code) == '0':
        log_str = "YAP_COMMAND: %s\nINPUT_FILES: %s\nEXIT_CODE: %s\nYAP_STATUS_MSG: %s\n" % (
            command, input_files, exit_code, std_err)
        yap_file_io.write_data(log_str + cmd_sep, stat_log)
    else:
        log_str = "YAP_COMMAND: %s\nINPUT_FILES: %s\nEXIT_CODE: %s\nYAP_ERROR_MSG: %s\n" % (
            command, input_files, exit_code, std_err)
        yap_file_io.write_data(log_str + cmd_sep, err_log)
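A hypothetical caller, to show how the pieces fit; the command and log paths are made up, and yap_file_io.write_data is assumed to append text to the given path:

from subprocess import Popen, PIPE

cmd = "fastqc sample_1.fastq"  # made-up command
p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
std_out, std_err = p.communicate()
write_log(cmd, "sample_1.fastq", p.returncode, std_err,
          "/logs/sample_1_err.log", "/logs/sample_1_stat.log")
# exit code 0 -> a YAP_STATUS_MSG record in the status log;
# anything else -> a YAP_ERROR_MSG record in the error log.
# Either record ends with a 30-underscore separator line.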
Example #4
def merge_tee_files(command, filename, err_log, stat_log):
    """ Merges the temporary log files produced as a result of
        the multiprocessing module. """
    exit_str = Popen("cat " + err_log + "_yap_tee_* | grep EXIT_CODE",
                     stdout=PIPE,
                     shell=True).communicate()[0]
    exit_code = 0
    for i in exit_str.split('\n'):
        # lines from split('\n') carry no trailing newline
        m = re.match("EXIT_CODE: (.*)", i)
        if m:
            exit_code = exit_code + int(m.group(1))
    if exit_code == 0:
        yap_file_io.write_data(
            "YAP_COMMAND: " + command + "\nINPUT_FILES: " + filename +
            "\nYAP_STATUS_MSG: ", stat_log)
        os.system("cat " + err_log + "_yap_tee_* >>" + stat_log)
        yap_file_io.write_data("_" * 30 + "\n", stat_log)
        os.system("rm " + err_log + "_yap_tee_*")
    else:
        yap_file_io.write_data(
            "YAP_COMMAND: " + command + "\nINPUT_FILES: " + filename +
            "\nYAP_ERROR_MSG: ", err_log)
        os.system("cat " + err_log + "_yap_tee_* >>" + err_log)
        yap_file_io.write_data("_" * 30 + "\n", err_log)
        os.system("rm " + err_log + "_yap_tee_*")
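The EXIT_CODE aggregation is the core of this function: every temporary log contributes one EXIT_CODE line, and the merged run counts as successful only if the sum is zero. A self-contained sketch on synthetic grep output (note that lines produced by split('\n') carry no trailing newline, so the pattern must not require one):

import re

# Synthetic grep output: one EXIT_CODE line per temporary log file.
exit_str = "EXIT_CODE: 0\nEXIT_CODE: 1\nEXIT_CODE: 0\n"

exit_code = 0
for line in exit_str.split('\n'):
    m = re.match("EXIT_CODE: (.*)", line)
    if m:
        exit_code += int(m.group(1))

assert exit_code == 1  # any non-zero contribution marks the merged run failed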
Example #5
def run_fastq_screen(inp_files_list, fastq_screen_cmd):
    '''
    Runs the fastq_screen command, writes log information to the log
    files, and returns the provenance list.
    '''
    prov = []
    file_base_name = inp_files_list[2]
    err_log = wd.err_log_path + "/" + file_base_name + "_fastqscreen_err.log"
    stat_log = wd.stat_log_path + "/" + file_base_name + "_fastqscreen_stat.log"
    fastq_screen_cmd += inp_files_list[0] + " " + inp_files_list[1] + " "
    fastq_screen_cmd = fastq_screen_cmd.replace(
        'output_directory', wd.workflow_output_path + "/" + file_base_name +
        "/" + "no_barcode_specified" + "/" + "preprocess_output")
    fastq_screen_cmd = fastq_screen_cmd.replace('pipe1', '')
    fastq_screen_cmd += " "
    str_out = "*" * 50 + "FASTQSCREEN STARTED" + "\t" + str(
        time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + "*" * 50 + "\n"
    yap_file_io.write_data(str_out, err_log)
    yap_file_io.write_data(str_out, stat_log)
    # shell=True: the command is passed as a single string
    prm = Popen(fastq_screen_cmd, stderr=PIPE, shell=True)
    std_out, std_err = prm.communicate()
    exit_code = prm.returncode
    prov.append(fastq_screen_cmd)
    yap_log.write_log(fastq_screen_cmd, file_base_name, exit_code, std_err,
                      err_log, stat_log)
    str_out = "*" * 50 + "FASTQSCREEN FINISHED" + "\t" + str(
        time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + "*" * 50 + "\n"
    yap_file_io.write_data(str_out, err_log)
    yap_file_io.write_data(str_out, stat_log)
    return prov
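The incoming command string carries YAP placeholders that this function rewrites before execution. A toy sketch of those substitutions; the fastq_screen arguments and paths here are illustrative, not the real configured command:

fastq_screen_cmd = "fastq_screen --outdir output_directory pipe1 "  # hypothetical config entry
fastq_screen_cmd += "reads_1.fastq" + " " + "reads_2.fastq" + " "
fastq_screen_cmd = fastq_screen_cmd.replace(
    'output_directory',
    "/results/reads/no_barcode_specified/preprocess_output")
fastq_screen_cmd = fastq_screen_cmd.replace('pipe1', '')
# -> "fastq_screen --outdir /results/reads/no_barcode_specified/preprocess_output  reads_1.fastq reads_2.fastq "
# (blanking 'pipe1' leaves a harmless double space)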
Example #6
def merge_multiproc_files(command, filename, barcode, err_log, stat_log):
    """ Merges the temporary log files created by the multiproc workers. """
    exit_str = Popen("cat " + err_log + "_multiproc_* | grep EXIT_CODE",
                     stdout=PIPE, shell=True).communicate()[0]
    exit_code = 0
    for i in exit_str.split('\n'):
        # lines from split('\n') carry no trailing newline
        m = re.match("EXIT_CODE: (.*)", i)
        if m:
            exit_code = exit_code + int(m.group(1))
    if exit_code == 0:
        yap_file_io.write_data(
            "YAP_COMMAND: " + command + "\nINPUT FILE: " + filename + "\n",
            stat_log)
        os.system("cat " + err_log + "_multiproc_* >>" + stat_log)
        yap_file_io.write_data("_" * 30 + "\n", stat_log)
        os.system("rm " + err_log + "_multiproc_*")
    else:
        yap_file_io.write_data(
            "YAP_COMMAND: " + command + "\nINPUT FILE: " + filename + "\n",
            err_log)
        os.system("cat " + err_log + "_multiproc_* >>" + err_log)
        yap_file_io.write_data("_" * 30 + "\n", err_log)
        os.system("rm " + err_log + "_multiproc_*")
Example #7
def execute_file(input_filename_local, input_filename_local_2, file_name,
                 chunk_number, myrank, ii, file_basecount_dict):
        workflow_prov = []
        err_chunk_file = wd.err_log_path + "/" + file_name + \
                     "_log_temp/" + file_name + "_" + str(ii).zfill(6)
        stat_chunk_file = wd.stat_log_path + "/" + file_name + \
                      "_log_temp/" + file_name + "_" + str(ii).zfill(6)
        str_out = "*" * 50 + "ALIGNMENT STARTED" + "\t" + \
            str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + \
            "*" * 50 + "\n"
        yap_file_io.write_data(str_out, err_chunk_file)
        yap_file_io.write_data(str_out, stat_chunk_file)
        for filename_key in file_basecount_dict.iterkeys():
            if filename_key == file_name:
                for barcode in wd.barcode_dict.iterkeys():
                    barcode_value = yap_tools.rename_barcode(barcode)
                    barcode_dir_path = wd.workflow_output_path + "/" + file_name + "/" + barcode
                    aligner_dir_path = barcode_dir_path + "/" + "aligner_output"
                    if wd.alignment_sort_order != 'unsorted':
                        if barcode_value != '':
                            aligner_output_filename = aligner_dir_path + "/" + \
                                "aligner_" + file_name + \
                                "_" + barcode_value
                        else:
                            aligner_output_filename = aligner_dir_path + \
                                "/" + "aligner_" + file_name
                    else:
                        if barcode_value != '':
                            aligner_output_filename = aligner_dir_path + \
                                "/" + file_name + \
                                "_" + barcode_value
                        else:
                            aligner_output_filename = aligner_dir_path + \
                                "/" + file_name

                    if wd.run_preprocess_analysis == 'yes':
                        preprocessed_file_inp1 = ['pipe1']
                        preprocessed_file_inp2 = ['pipe2']
                        preprocess_dir_path = barcode_dir_path + \
                            "/" + "preprocess_output"
                        preprocessed_inp1 = preprocess_dir_path + \
                            "/" + "*preprocess_data*_1.txt"
                        preprocessed_inp2 = preprocess_dir_path + \
                            "/" + "*preprocess_data*_2.txt"
                        preprocessed_file_inp1 = glob.glob(
                            preprocessed_inp1)
                        if wd.paired_end_data == "yes":
                            preprocessed_file_inp2 = glob.glob(
                                preprocessed_inp2)
                        if (wd.paired_end_data == "yes" and preprocessed_file_inp1
                                and preprocessed_file_inp2) or \
                                (wd.paired_end_data != "yes" and preprocessed_file_inp1):
                            print "Entering Alignment section: Filename=", input_filename_local, "barcode=", barcode, "\n"
                            if wd.paired_end_data == 'yes':
                                workflow_prov.append(
                                    "INPUT: " + preprocessed_file_inp1[0] +
                                    " and " + preprocessed_file_inp2[0])
                                aligner_out_str, workflow_prov = yap_aligner.run_aligner(
                                    preprocessed_file_inp1[0], preprocessed_file_inp2[0],
                                    aligner_output_filename, chunk_number, myrank,
                                    workflow_prov, err_chunk_file, stat_chunk_file)
                            else:
                                workflow_prov.append(
                                    "INPUT: " + preprocessed_file_inp1[0])
                                aligner_out_str, workflow_prov = yap_aligner.run_aligner(
                                    preprocessed_file_inp1[0], '',
                                    aligner_output_filename, chunk_number, myrank,
                                    workflow_prov, err_chunk_file, stat_chunk_file)

                            if wd.write_preprocessed_data != 'yes':
                                # remove intermediate preprocessed chunks
                                prm1 = Popen(
                                    "rm " + preprocess_dir_path + "/" +
                                    "*preprocess_data*_1.txt",
                                    shell=True).wait()
                                if wd.paired_end_data == "yes":
                                    if preprocessed_file_inp2:
                                        prm2 = Popen(
                                            "rm " + preprocess_dir_path + "/" +
                                            "*preprocess_data*_2.txt",
                                            shell=True).wait()
                        else:
                            print "Skipping Alignment for Filename=", input_filename_local, "barcode=", barcode, "........", "No preprocessed data found"
                    else:
                        if wd.paired_end_data == 'yes':
                            workflow_prov.append(
                                "INPUT: " + input_filename_local + " and " +
                                input_filename_local_2)
                            aligner_out_str, workflow_prov = yap_aligner.run_aligner(
                                input_filename_local, input_filename_local_2,
                                aligner_output_filename, 0, myrank,
                                workflow_prov, err_chunk_file, stat_chunk_file)
                        else:
                            workflow_prov.append("INPUT: " + input_filename_local)
                            aligner_out_str, workflow_prov = yap_aligner.run_aligner(
                                input_filename_local, '',
                                aligner_output_filename, 0, myrank,
                                workflow_prov, err_chunk_file, stat_chunk_file)
                    # remove temporary files created by aligners
                    rm_cmd = "rm " + aligner_output_filename + "*.sai"
                    if len(glob.glob(aligner_output_filename + "*.sai")) > 0:
                        prm = Popen(rm_cmd, shell=True).wait()
                    if barcode not in file_basecount_dict[filename_key]:
                        file_basecount_dict[filename_key][barcode] = []
        # write to log
        str_out = "*" * 50 + "ALIGNMENT FINISHED" + "\t" + \
            str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + \
            "*" * 50 + "\n"
        yap_file_io.write_data(str_out, err_chunk_file)
        yap_file_io.write_data(str_out, stat_chunk_file)
        return workflow_prov, file_basecount_dict
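The branching above only decides an output filename; a distilled sketch of the naming rule it implements (paths are illustrative):

def aligner_output_path(aligner_dir_path, file_name, barcode_value, sort_order):
    # Distilled from the branching above: sorted outputs get an "aligner_"
    # prefix; a non-empty barcode is appended as a suffix.
    base = "aligner_" + file_name if sort_order != 'unsorted' else file_name
    if barcode_value != '':
        base += "_" + barcode_value
    return aligner_dir_path + "/" + base

# aligner_output_path("/out/s1/BC1/aligner_output", "s1", "BC1", "coordinate")
#   -> "/out/s1/BC1/aligner_output/aligner_s1_BC1"
# aligner_output_path("/out/s1/BC1/aligner_output", "s1", "", "unsorted")
#   -> "/out/s1/BC1/aligner_output/s1"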
Example #8
    # make workflow configuration variables global
    workflow_obj = wd.workflow_dictionary()
    workflow_obj.make_global(workflow_config_dict)
    # summary file to store all the workflow provenance details
    f_summary_file = wd.workflow_output_path + "/" + wd.workflow_name + \
        "_workflow_summary.txt"
    basecount_metrics_flag = ''
    file_basecount_dict = wd.file_basecount_dict
    # create temp directory; this is local for every node
    yap_tools.create_dir(wd.yap_temp_user_dir)
    if myrank == 0:
        # create the output directory structure
        yap_init.initialize_dir_struct()
        # print the analysis summary
        yap_print_info.print_info()
        str_out = "-" * 20 + " PROVENANCE " + "-" * 20 + "\n\n"
        yap_file_io.write_data(str_out, f_summary_file)
    comm.barrier()
    if wd.run_preprocess_analysis == "yes":
        # if preprocess is set to 'yes', perform the initial qc commands
        for i in range(0, len(wd.preprocess_cmd_arr)):
            preprocess_cmd_name = wd.preprocess_cmd_arr[i][2][0][0]
            preprocess_cmd = wd.preprocess_cmd_arr[i][2][0][1]
            if re.search('calculate_basecount_metrics', preprocess_cmd_name) is not None:
                basecount_metrics_flag = 'True'
            if re.search('fastqc', preprocess_cmd_name) is not None:
                fastqc_split_index_arr = wd.paired_files_split_arr[myrank]
                fastqc_inp_files = []
                for k in range(0, len(fastqc_split_index_arr)):
                    if fastqc_split_index_arr[k] == 'no file':
                        fastqc_job_status = "Done"
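The myrank == 0 guard followed by comm.barrier() is the usual MPI pattern for one-time setup; a minimal sketch assuming mpi4py (in the snippet above, comm and myrank come from the surrounding program):

from mpi4py import MPI

comm = MPI.COMM_WORLD
myrank = comm.Get_rank()

if myrank == 0:
    # one-time setup: only the root rank creates shared directories
    # and writes the summary header
    pass
comm.barrier()  # everyone waits until root finishes before proceeding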
Example #9
def execute_chunk(
        input_file_list_local,
        inp1,
        inp2,
        chunk_number,
        myrank,
        workflow_prov,
        eqp_dict):
    '''
    Executes preprocess commands for chunked data and passes the result to
    the alignment stage. Takes the chunked input data, filename list, chunk
    number, rank of the processor, and a provenance list to append log data.
    '''
    # variable declaration
    input_filename_local = input_file_list_local[0]
    input_filename_local_2 = input_file_list_local[1]
    file_name = input_file_list_local[2]
    err_chunk_file = wd.err_log_path + "/" + file_name + \
        "_log_temp/" + file_name + "_" + str(chunk_number).zfill(6)
    stat_chunk_file = wd.stat_log_path + "/" + file_name + \
        "_log_temp/" + file_name + "_" + str(chunk_number).zfill(6)
    myhost = os.getenv('HOSTNAME')
    yap_file_io.write_data("HOSTNAME: " + str(myhost) + "\n", err_chunk_file)
    yap_file_io.write_data("HOSTNAME: " + str(myhost) + "\n", stat_chunk_file)
    yap_file_io.write_data("CHUNK NUMBER: " + str(chunk_number) + "\n", err_chunk_file)
    yap_file_io.write_data("CHUNK NUMBER: " + str(chunk_number) + "\n", stat_chunk_file)
    seqs_arr1 = []
    seqs_arr2 = []
    read_length = wd.max_read_length
    barcode_seqstruct_dict1 = {}
    barcode_seqstruct_dict2 = {}
    barcode_output_dict = {}
    aligner_out_str = ''
    sort_order = ''
    barcode_flag = 'False'
    sort_order = wd.alignment_sort_order
    # convert the input data based on format given in workflow configuration
    if wd.input_file_format == "qseq" or wd.input_file_format != wd.preprocess_output_file_format:
        inp1 = yap_tools.convert_format(inp1)
        if wd.paired_end_data == 'yes':
            inp2 = yap_tools.convert_format(inp2)
    if wd.run_preprocess_analysis == 'yes':
        str_out = "-" * 20 + "PREPROCESS STARTED" + "\t" + \
            str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + \
            "-" * 20 + "\n"
        yap_file_io.write_data(str_out, err_chunk_file)
        yap_file_io.write_data(str_out, stat_chunk_file)
        # Run barcode splitter as first preprocess step
        for jj in range(0, len(wd.preprocess_cmd_arr)):
            preprocess_cmd_name = wd.preprocess_cmd_arr[jj][2][0][0]
            preprocess_cmd = wd.preprocess_cmd_arr[jj][2][0][1]
            if re.search('fastx_barcode_splitter', preprocess_cmd_name) is not None:
                barcode_flag = 'True'
                print "Entering " + preprocess_cmd_name + " : Filename=", input_filename_local, " chunk number=", chunk_number, "\n"
                str_out = "YAP_COMMAND: " + preprocess_cmd + "\n" + \
                    "INPUT FILE: " + input_filename_local
                yap_file_io.write_data(str_out, err_chunk_file)
                yap_file_io.write_data(str_out, stat_chunk_file)
                barcode_seqstruct_dict1, workflow_prov = yap_preprocess.fastx_barcode_splitter(
                    inp1, wd.preprocess_output_file_format, preprocess_cmd,
                    workflow_prov, err_chunk_file, stat_chunk_file)
                yap_file_io.write_data("_" * 30 + "\n", err_chunk_file)
                yap_file_io.write_data("_" * 30 + "\n", stat_chunk_file)
                barcode_seqstruct_dict1["no_barcode_specified"] = ''
                print "Exiting " + preprocess_cmd_name + " : Filename=", input_filename_local, " chunk number=", chunk_number, "\n"
                if wd.paired_end_data == 'yes':
                    print "Entering " + preprocess_cmd_name + " : Filename=", input_filename_local_2, " chunk number=", chunk_number, "\n"
                    str_out = "YAP_COMMAND: " + preprocess_cmd + "\n" + \
                        "INPUT FILE: " + input_filename_local_2
                    yap_file_io.write_data(str_out, err_chunk_file)
                    yap_file_io.write_data(str_out, stat_chunk_file)
                    barcode_seqstruct_dict2, workflow_prov = yap_preprocess.fastx_barcode_splitter(
                        inp2, wd.preprocess_output_file_format, preprocess_cmd,
                        workflow_prov, err_chunk_file, stat_chunk_file)
                    yap_file_io.write_data("_" * 30 + "\n", err_chunk_file)
                    yap_file_io.write_data("_" * 30 + "\n", stat_chunk_file)
                    barcode_seqstruct_dict2["no_barcode_specified"] = ''
                    print "Exiting " + preprocess_cmd_name + " : Filename=", input_filename_local_2, " chunk number=", chunk_number, "\n"
                break
        if barcode_flag == 'False':
            # no barcode command: create a dictionary with a single barcode tag
            barcode_seqstruct_dict1["no_barcode_specified"] = inp1
            barcode_seqstruct_dict2["no_barcode_specified"] = inp2
    else:
        # no preprocess stage specified: create a dictionary with a single barcode tag
        barcode_seqstruct_dict1["no_barcode_specified"] = inp1
        barcode_seqstruct_dict2["no_barcode_specified"] = inp2
    # iterate over the barcode dictionary
    for barcode, inp1 in barcode_seqstruct_dict1.iteritems():
        run_unique_reads = 'False'
        barcode_value = yap_tools.rename_barcode(barcode)
        if wd.paired_end_data == "yes":
            inp2 = barcode_seqstruct_dict2[barcode]
        preprocessed_data_dict = {}
        # initialize matrices for basecount analysis
        aligner_output_str_local = ''
        basecount_matrix_local1 = numpy.zeros(
            (int(read_length), 5), dtype=numpy.int)
        basecount_matrix_local2 = numpy.zeros(
            (int(read_length), 5), dtype=numpy.int)
        barcode_output_dict.setdefault(
            barcode, [basecount_matrix_local1, basecount_matrix_local2])
        # set output file paths
        barcode_dir_path = wd.workflow_output_path + "/" + file_name + "/" + barcode
        preprocess_dir_path = barcode_dir_path + "/" + "preprocess_output"
        if wd.data_distribution_method != "file_based":
            if barcode_value != '':
                preprocess_out_filename1 = preprocess_dir_path + "/" + barcode_value + "_" + file_name + \
                    "_" + str(chunk_number).zfill(6) + "_" + \
                    str(myrank) + "_preprocessed_data_1.txt"
                preprocess_out_filename2 = preprocess_dir_path + "/" + barcode_value + "_" + file_name + \
                    "_" + str(chunk_number).zfill(6) + "_" + \
                    str(myrank) + "_preprocessed_data_2.txt"
            else:
                preprocess_out_filename1 = preprocess_dir_path + "/" + file_name + "_" + \
                    str(chunk_number).zfill(6) + "_" + \
                    str(myrank) + "_preprocessed_data_1.txt"
                preprocess_out_filename2 = preprocess_dir_path + "/" + file_name + "_" + \
                    str(chunk_number).zfill(6) + "_" + \
                    str(myrank) + "_preprocessed_data_2.txt"
        else:
            if barcode_value != '':
                preprocess_out_filename1 = preprocess_dir_path + "/" + \
                    "preprocess_data" + "_" + file_name + \
                    "_" + barcode_value + "_1.txt"
                preprocess_out_filename2 = preprocess_dir_path + "/" + \
                    "preprocess_data" + "_" + file_name + \
                    "_" + barcode_value + "_2.txt"
            else:
                preprocess_out_filename1 = preprocess_dir_path + "/" + \
                    "preprocess_data" + "_" + file_name + "_1.txt"
                preprocess_out_filename2 = preprocess_dir_path + "/" + \
                    "preprocess_data" + "_" + file_name + "_2.txt"
        aligner_dir_path = barcode_dir_path + "/" + "aligner_output"
        if barcode_value != '':
            aligner_output_filename = aligner_dir_path + "/" + "aligner_" + \
                file_name + "_" + barcode_value + \
                "_" + str(chunk_number).zfill(6)
        else:
            aligner_output_filename = aligner_dir_path + "/" + \
                "aligner_" + file_name + "_" + str(chunk_number).zfill(6)

        for jj in range(0, len(wd.preprocess_cmd_arr)):
            preprocess_cmd_name = wd.preprocess_cmd_arr[jj][2][0][0]
            preprocess_cmd = wd.preprocess_cmd_arr[jj][2][0][1]
            # skip fastqc, fastq_screen, and the barcode splitter, as they
            # have already been executed
            if (re.search('fastqc', preprocess_cmd_name) is not None) or \
                    (re.search('fastq_screen', preprocess_cmd_name) is not None) or \
                    (re.search('fastx_barcode_splitter', preprocess_cmd_name) is not None):
                pass
            else:
                if re.search('calculate_basecount_metrics', preprocess_cmd_name) is not None:
                    # execute basecount calculation
                    basecount_matrix_local1, workflow_prov = yap_tools.qc_basecount(
                        inp1, workflow_prov)
                    basecount_matrix_local2, workflow_prov = yap_tools.qc_basecount(
                        inp2, workflow_prov)
                elif re.search('fastx_clipper', preprocess_cmd_name) is not None:
                    # Check for fastx_clipper as a special case and execute it.
                    # Its execution has been optimized by providing contaminants
                    # per file instead of applying contaminants universally.
                    run_unique_reads = 'True'
                    if input_filename_local in wd.contaminant_dict.keys():
                        contaminants_arr1 = wd.contaminant_dict[
                            input_filename_local]
                        print "Entering " + preprocess_cmd_name + " : Filename=", input_filename_local, " chunk number=", chunk_number, "\n"
                        index = 0
                        for index in range(0, len(contaminants_arr1)):
                            # iterate over all the contaminants for this file
                            fastx_clipper_cmd = preprocess_cmd
                            contaminant1 = contaminants_arr1[index].strip("\n")
                            if inp1 != '':
                                fastx_clipper_cmd = fastx_clipper_cmd.replace(
                                    'pipe1', " - ") + " -a " + contaminant1
                                inp1 = yap_tools.multiproc_function(
                                    fastx_clipper_cmd, inp1,
                                    int(wd.format_specific_lines), '',
                                    err_chunk_file, stat_chunk_file)
                                yap_log.merge_multiproc_files(
                                    fastx_clipper_cmd,
                                    input_filename_local,
                                    barcode,
                                    err_chunk_file,
                                    stat_chunk_file)
                            if inp1 == '':
                                break
                        print "Exiting " + preprocess_cmd_name + " : Filename=", input_filename_local, " chunk number=", chunk_number, "\n"
                    if wd.paired_end_data == 'yes':
                        if input_filename_local_2 in wd.contaminant_dict.keys():
                            # repeat fastx_clipper for the paired end
                            contaminants_arr2 = wd.contaminant_dict[
                                input_filename_local_2]
                            print "Entering " + preprocess_cmd_name + " : Filename=", input_filename_local_2, " chunk number=", chunk_number, "\n"
                            index = 0
                            for index in range(0, len(contaminants_arr2)):
                                fastx_clipper_cmd = preprocess_cmd
                                contaminant2 = contaminants_arr2[index].strip("\n")
                                if inp2 != '':
                                    fastx_clipper_cmd = fastx_clipper_cmd.replace(
                                        'pipe1', " - ") + " -a " + contaminant2
                                    inp2 = yap_tools.multiproc_function(
                                        fastx_clipper_cmd, inp2,
                                        int(wd.format_specific_lines), '',
                                        err_chunk_file, stat_chunk_file)
                                    yap_log.merge_multiproc_files(
                                        fastx_clipper_cmd,
                                        input_filename_local_2,
                                        barcode,
                                        err_chunk_file,
                                        stat_chunk_file)
                                if inp2 == '':
                                    break
                            print "Exiting " + preprocess_cmd_name + " : Filename=", input_filename_local_2, " chunk number=", chunk_number, "\n"
                elif re.search('eqp_rename_reads', preprocess_cmd_name) is not None:
                    # this section renames reads according to a specific
                    # format; applies to in-house use, neglect otherwise
                    inp1_arr = inp1.splitlines(1)
                    inp1 = ''
                    inp2_arr = inp2.splitlines(1)
                    inp2 = ''
                    read_count = 1
                    if wd.data_distribution_method == "file_based":
                        if "eqp_read_counter" in eqp_dict:
                            if len(eqp_dict["eqp_read_counter"]) > 0:
                                file_name, read_count = eqp_dict["eqp_read_counter"]
                                if file_name != input_filename_local:
                                    read_count = 1
                    format_lines = int(wd.format_specific_lines)
                    for i in range(0, len(inp1_arr), format_lines):
                        if wd.paired_end_data == 'yes':
                            if (len(inp1_arr[i + 1].strip("\n").replace('A', '')) >= 5) and \
                                    (len(inp2_arr[i + 1].strip("\n").replace('A', '')) >= 5) and \
                                    (len(inp1_arr[i + 1].strip("\n").replace('T', '')) >= 5) and \
                                    (len(inp2_arr[i + 1].strip("\n").replace('T', '')) >= 5):
                                inp1 += '@F' + str(read_count).zfill(9) + '/1' + '\n'
                                inp2 += '@F' + str(read_count).zfill(9) + '/2' + '\n'
                                for jj in range(1, format_lines):
                                    inp1 += inp1_arr[i + jj]
                                    inp2 += inp2_arr[i + jj]
                                read_count += 1
                        else:
                            if (len(inp1_arr[i + 1].strip("\n").replace('A', '')) >= 5) and \
                                    (len(inp1_arr[i + 1].strip("\n").replace('T', '')) >= 5):
                                # emit the renamed header line
                                inp1 += '@F' + str(read_count).zfill(9) + '/1' + '\n'
                                for jj in range(1, format_lines):
                                    inp1 += inp1_arr[i + jj]
                                read_count += 1
                    eqp_dict["eqp_read_counter"] = [input_filename_local, read_count]
                    inp1_arr = []
                    inp2_arr = []
                else:
                    # set the flag to remove unmatched pairs after preprocessing
                    run_unique_reads = 'True'
                    print "Entering " + preprocess_cmd_name + " : Filename=", input_filename_local, " chunk number=", chunk_number, "\n"
                    # all other preprocess commands are executed by this section
                    if inp1 != '':
                        preprocess_cmd = preprocess_cmd.replace('pipe1', ' - ')
                        inp1 = yap_tools.multiproc_function(
                            preprocess_cmd, inp1, int(
                                wd.format_specific_lines), '', err_chunk_file, stat_chunk_file)
                        yap_log.merge_multiproc_files(
                            preprocess_cmd,
                            input_filename_local,
                            barcode,
                            err_chunk_file,
                            stat_chunk_file)
                    print "Exiting " + preprocess_cmd_name + " : Filename=", input_filename_local, " chunk number=", chunk_number, "\n"
                    if wd.paired_end_data == 'yes':
                        preprocess_cmd = preprocess_cmd.replace('pipe1', ' - ')
                        print "Entering " + preprocess_cmd_name + " : Filename=", input_filename_local_2, " chunk number=", chunk_number, "\n"
                        if inp2 != '':
                            inp2 = yap_tools.multiproc_function(
                                preprocess_cmd, inp2, int(
                                    wd.format_specific_lines), '', err_chunk_file, stat_chunk_file)
                            yap_log.merge_multiproc_files(
                                preprocess_cmd,
                                input_filename_local_2,
                                barcode,
                                err_chunk_file,
                                stat_chunk_file)
                        print "Exiting " + preprocess_cmd_name + " : Filename=", input_filename_local_2, " chunk number=", chunk_number, "\n"
        if wd.paired_end_data == 'yes':
            if run_unique_reads == 'True':
                # remove all unmatched pairs from the two chunks belonging to
                # the same sample; each chunk goes through the command
                # separately, not as a pair
                if inp1 != '' and inp2 != '':
                    inp1, inp2 = yap_tools.find_unique_set(
                        inp1.splitlines(1), inp2.splitlines(1))
        if wd.run_preprocess_analysis == 'yes':
            # write log data
            str_out = "-" * 20 + "PREPROCESS FINISHED" + "\t" + \
                str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + \
                "-" * 20 + "\n"
            yap_file_io.write_data(str_out, err_chunk_file)
            yap_file_io.write_data(str_out, stat_chunk_file)
        if wd.data_distribution_method != "file_based":
            # if the workflow is not file-based, pass the chunks to alignment
            if wd.run_reference_alignment == 'yes':
                str_out = "-" * 20 + "ALIGNMENT STARTED" + "\t" + \
                    str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + \
                    "-" * 20 + "\n"
                yap_file_io.write_data(str_out, err_chunk_file)
                yap_file_io.write_data(str_out, stat_chunk_file)
                if (wd.paired_end_data == 'yes' and inp1 != '' and inp2 != '') or \
                        (wd.paired_end_data != 'yes' and inp1 != ''):
                    print "Entering Alignment: Filename=", input_filename_local, "barcode=", barcode, " chunk number=", chunk_number, "\n"
                    if wd.paired_end_data == 'yes':
                        workflow_prov.append(
                            "INPUT: " + input_filename_local + " and " +
                            input_filename_local_2 + " chunk number= " +
                            str(chunk_number))
                        aligner_out_str, workflow_prov = yap_aligner.run_aligner(
                            inp1, inp2, aligner_output_filename, chunk_number,
                            myrank, workflow_prov, err_chunk_file, stat_chunk_file)
                    else:
                        workflow_prov.append(
                            "INPUT: " + input_filename_local +
                            " chunk number= " + str(chunk_number))
                        aligner_out_str, workflow_prov = yap_aligner.run_aligner(
                            inp1, '', aligner_output_filename, chunk_number,
                            myrank, workflow_prov, err_chunk_file, stat_chunk_file)
                    # clean up temporary aligner output
                    if len(glob.glob(aligner_output_filename + "*.sai")) > 0:
                        prm = Popen("rm " + aligner_output_filename + "*.sai",
                                    shell=True).wait()
                    if len(glob.glob(aligner_output_filename + "*.head")) > 0:
                        prm = Popen("rm " + aligner_output_filename + "*.head",
                                    shell=True).wait()
                else:
                    print "Skipping Alignment: Filename=", input_filename_local, "barcode=", barcode, " chunk number=", chunk_number, "\n"
                str_out = "-" * 20 + "ALIGNMENT FINISHED" + "\t" + \
                    str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + \
                    "-" * 20 + "\n"
                yap_file_io.write_data(str_out, err_chunk_file)
                yap_file_io.write_data(str_out, stat_chunk_file)
            if wd.run_preprocess_analysis == 'yes':
                if wd.write_preprocessed_data == 'yes':
                    # write preprocessed data to file
                    yap_file_io.write_data(inp1, preprocess_out_filename1)
                    if wd.paired_end_data == "yes":
                        yap_file_io.write_data(inp2, preprocess_out_filename2)
                else:
                    # otherwise empty the input data chunk
                    inp1 = ''
                    inp2 = ''
            else:
                # otherwise empty the input data chunk
                inp1 = ''
                inp2 = ''
        else:
            # if the workflow is file-based, write preprocessed data to file
            if wd.run_preprocess_analysis == "yes":
                if wd.write_preprocessed_data == 'yes' or wd.run_reference_alignment == "yes":
                    yap_file_io.write_data(inp1, preprocess_out_filename1)
                    if wd.paired_end_data == "yes":
                        yap_file_io.write_data(inp2, preprocess_out_filename2)
        barcode_output_dict[barcode][0] = basecount_matrix_local1
        barcode_output_dict[barcode][1] = basecount_matrix_local2
    return barcode_output_dict, workflow_prov
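Most preprocess steps above share one pattern: replace the 'pipe1' placeholder with '-' so the tool reads stdin, then stream the in-memory chunk through the command (yap_tools.multiproc_function wraps this pattern with multiprocessing). A minimal single-process sketch, using a trivial stand-in command:

from subprocess import Popen, PIPE

def stream_through(cmd_template, chunk):
    # 'pipe1' marks where an input file would go; '-' is the conventional
    # stdin marker, so the chunk never has to touch disk
    cmd = cmd_template.replace('pipe1', ' - ')
    p = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=True)
    out, err = p.communicate(chunk)
    return out

numbered = stream_through("cat -n pipe1", "line one\nline two\n")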
Example #10
def get_postprocess_file_compare_cmd_arr(
        postprocess_compare_file_cmd_arr,
        inp_file_list):
    '''Polish the postprocess commands that require multiple samples,
    filling in input/output paths.'''
    postprocess_cmd_arr = postprocess_compare_file_cmd_arr
    postprocess_compare_arr = []
    temp_sample_file_list = ''
    list_samples_to_compare_dict = wd.list_of_samples_to_compare
    list_samples_dict = wd.list_of_samples
    if wd.regroup_output == 'yes':
        workflow_output_path = wd.workflow_output_path + "/regroup_output"
    else:
        workflow_output_path = wd.workflow_output_path
    for j in range(0, len(postprocess_cmd_arr)):
        input_file_extension = ''
        file_list_comp_matchobj = ''
        sample_file_list_matchobj = ''
        input_directory_matchobj = ''
        compare_file_name = ''
        file_compare_list = []
        postprocess_cmd = postprocess_cmd_arr[j][1]
        postprocess_cmd_name = postprocess_cmd_arr[j][0]
        cmd_type = postprocess_cmd_arr[j][0]
        cmd_meta_data = postprocess_cmd_arr[j][1]
        postprocess_temp_arr = postprocess_cmd_arr[j][2]
        postprocess_cmd_name = postprocess_temp_arr[0][0]
        postprocess_cmd = postprocess_temp_arr[0][1]
        input_file_extension = ''
        list_delimiter = ''
        sample_name = ''
        # set the default input directory for the postprocess stage to
        # aligner_output; the user can specify "postprocess_output" through
        # the configuration
        input_directory = "aligner_output"
        list_delimiter_obj = re.match(
            r'(.*) list_delimiter[\s\t]*(\S*)[\s\t]*',
            postprocess_cmd,
            re.M | re.I)
        if list_delimiter_obj:
            list_delimiter = list_delimiter_obj.group(2).strip("\n")
            #postprocess_cmd = postprocess_cmd.replace(list_delimiter, '')
        # check for command input/output keywords from configuration variables
        for kk in range(0, len(cmd_meta_data)):
            input_meta_data = cmd_meta_data[kk].split(" ")
            if input_meta_data:
                if re.search('input_file_type', input_meta_data[0]) is not None:
                    input_file_extension = input_meta_data[1] #fetch user provided input file type
                if re.search('input_directory', input_meta_data[0]) is not None:
                    input_directory = input_meta_data[1] # fetch user provided input directory
        if postprocess_cmd_name in list_samples_to_compare_dict:
            cmd_list_samples_to_compare = list_samples_to_compare_dict[
                postprocess_cmd_name]
            compare_file_name = cmd_list_samples_to_compare[0]
            file_compare_list = cmd_list_samples_to_compare[1]
            file_list_comp_matchobj = 'True'
        if postprocess_cmd_name in list_samples_dict:
            cmd_list_samples = list_samples_dict[postprocess_cmd_name]
            compare_file_name = cmd_list_samples[0]
            file_compare_list = cmd_list_samples[1]
            sample_file_list_matchobj = 'True'
        if wd.run_preprocess_analysis == "no" and wd.run_postprocess_analysis == "yes" \
                and wd.run_reference_alignment == "no":
            inp_file_list1 = []
            for i in inp_file_list:
                compare_file_temp = i[0]
                path, compare_file = os.path.split(compare_file_temp)
                compare_file, extension = os.path.splitext(compare_file)
                compare_postprocess_dir_path = compare_file_temp
                for jj in glob.glob(compare_postprocess_dir_path):
                    inp_file_list1.append([jj, '', jj, ''])
            inp_file_list = inp_file_list1
        if compare_file_name == "all":  # analyze the whole sample space one-to-all
            files_temp_list = []
            if file_list_comp_matchobj == 'True':
                for j in range(0, len(inp_file_list)):
                    files_temp_list.append(inp_file_list[j][2])
                # generate the list of file sets one-to-all
                file_compare_list = (
                    generate_file_comparisons(files_temp_list))
            if sample_file_list_matchobj == 'True':
                for j in range(0, len(inp_file_list)):
                    files_temp_list.append(inp_file_list[j][2])
                file_compare_list.append([files_temp_list])
        # Iterate over the file sets to be analyzed together; check that the
        # files exist according to the source directory and input type.
        for i in range(0, len(file_compare_list)):
            compare_file_set = file_compare_list[i]
            input_string = ''
            iter_var = 0
            cmd_compare_dir_path = workflow_output_path + "/" + \
                postprocess_cmd_name + "_output"
            # create directory structure by command name and group number
            if not os.path.exists(cmd_compare_dir_path):
                os.system("mkdir" + " " + cmd_compare_dir_path)
            output_compare_dir_path = cmd_compare_dir_path + \
                "/" + postprocess_cmd_name + "_group_" + str(i + 1)
            err_log = wd.err_log_path + '/' + postprocess_cmd_name + \
                "_group_" + str(i + 1) + "_err.log"
            stat_log = wd.stat_log_path + '/' + postprocess_cmd_name + \
                "_group_" + str(i + 1) + "_stat.log"
            if not os.path.exists(output_compare_dir_path):
                os.system("mkdir" + " " + output_compare_dir_path)
            temp_file_compare_list = cmd_compare_dir_path + \
                "/" + postprocess_cmd_name + "_input_file_list"
            temp_sample_file_list = output_compare_dir_path + "/" + \
                postprocess_cmd_name + "group" + \
                str(i + 1) + "_input_file_list"
            postprocess_outfile = output_compare_dir_path + \
                "/" + postprocess_cmd_name
            for jk in range(0, len(compare_file_set)):
                input_set = []
                iter_var = iter_var + 1
                compare_file = ''
                command_out_dir = ''
                postprocess_cmd_new = ''
                input_set = get_input_sets(
                    compare_file_set[jk],
                    compare_file_name,
                    input_directory,
                    input_file_extension)
                if file_list_comp_matchobj == 'True':
                    if len(input_set) > 0:
                        if iter_var == 1:
                            sample_name = postprocess_cmd_name + \
                                "_Group_" + str(i + 1)
                            yap_file_io.write_data(
                                postprocess_cmd_name + "_Group_" +
                                str(i + 1) + ":" + "\n",
                                temp_file_compare_list)
                        yap_file_io.write_data(
                            "Set" + str(jk + 1) + "=", temp_file_compare_list)
                        for zz in range(0, len(input_set)):
                            yap_file_io.write_data(
                                "\t" + input_set[zz] + "\n",
                                temp_file_compare_list)
                            input_string += input_set[zz] + ","
                        yap_file_io.write_data("\n", temp_file_compare_list)
                        if list_delimiter != '':
                            input_string = input_string.strip(
                                ",") + " " + list_delimiter + " "
                        else:
                            input_string = input_string.strip(",") + " "
                if sample_file_list_matchobj == 'True':
                    if len(input_set) > 0:
                        sample_name = postprocess_cmd_name + "_Group_" + str(1)
                        for kk in input_set:
                            yap_file_io.write_data(kk + "\n", temp_sample_file_list)
            # polish commands according to the type of command: file comparison sets
            if file_list_comp_matchobj == 'True':
                if input_string != '':
                    input_string = input_string.strip(list_delimiter + " ")
                    postprocess_cmd = postprocess_cmd.replace(
                        compare_file_name, '')
                    postprocess_cmd_new = postprocess_cmd.replace(
                        'input_directory', '')
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        input_directory, '')
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'list_of_samples_to_compare', '')
                    postprocess_cmd_new = postprocess_cmd_new + input_string
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'output_file', postprocess_outfile)
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'output_directory', output_compare_dir_path)
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'list_delimiter' + ' ' + list_delimiter, '')
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'sample_name', sample_name)
            # polish commands where all the files are passed together as one list
            if sample_file_list_matchobj == 'True':
                if glob.glob(temp_sample_file_list):
                    postprocess_cmd = postprocess_cmd.replace(
                        compare_file_name, '')
                    postprocess_cmd_new = postprocess_cmd.replace(
                        'input_directory', '')
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        input_directory, '')
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'output_file', postprocess_outfile)
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'output_directory', output_compare_dir_path)
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'list_of_samples', temp_sample_file_list)
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'list_delimiter' + ' ' + list_delimiter, '')
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'sample_name', sample_name)
            postprocess_compare_arr.append(
                [postprocess_cmd_name, postprocess_cmd_new, err_log, stat_log])
    return postprocess_compare_arr
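A toy walk-through of the "polishing" performed above, on a hypothetical configured command; the placeholders (output_directory, sample_name, list_of_samples_to_compare) are the ones this function handles:

# Hypothetical configured postprocess command with YAP placeholders.
cmd = "cuffdiff -o output_directory -L sample_name list_of_samples_to_compare"
input_string = "/out/s1.bam,/out/s2.bam "  # built from one input set

polished = cmd.replace('list_of_samples_to_compare', '')
polished = polished + input_string
polished = polished.replace('output_directory',
                            "/out/cuffdiff_output/cuffdiff_group_1")
polished = polished.replace('sample_name', "cuffdiff_Group_1")
# -> "cuffdiff -o /out/cuffdiff_output/cuffdiff_group_1 -L cuffdiff_Group_1 /out/s1.bam,/out/s2.bam "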
Example #11
def execute_merge_alignment(
        final_output_name,
        sort_input_files_arr,
        file_type,
        file_name,
        barcode,
        sort_files_cmd,
        workflow_prov,
        err_log,
        stat_log):
    '''
    Executes merge data commands for alignment output data.
    '''
    sort_cmd_input = ''
    sort_input_files_new_arr = []
    if file_type != "sam":
        if len(sort_input_files_arr) > 0:
            if len(sort_input_files_arr) == 1:
                os.rename(sort_input_files_arr[0], final_output_name)
                workflow_prov.append(
                    'RENAMED FILE ' +
                    sort_input_files_arr[0] +
                    ' TO ' +
                    final_output_name)
            else:
                for z in range(0, len(sort_input_files_arr)):
                    sort_cmd_input += sort_input_files_arr[z].strip("\n") + " "
                if wd.alignment_sort_order == "unsorted":
                    sort_files_cmd = "samtools cat -o " + \
                        final_output_name + ' ' + sort_cmd_input
                else:
                    sort_files_cmd = sort_files_cmd + ' ' + \
                        final_output_name + ' ' + sort_cmd_input
                str_out = "*" * 50 + "MERGE ALIGNMENT STARTED" + "\t" + \
                    str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + \
                    "*" * 50 + "\n"
                yap_file_io.write_data(str_out, err_log)
                yap_file_io.write_data(str_out, stat_log)
                # shell=True: the merge command is passed as a single string
                pmerge = Popen(sort_files_cmd, stdout=PIPE, stderr=PIPE, shell=True)
                std_out, std_err = pmerge.communicate()
                exit_code = pmerge.returncode
                yap_log.write_log(sort_files_cmd, str(sort_input_files_arr).lstrip(
                    '[').rstrip(']'), exit_code, std_err, err_log, stat_log)
                str_out = "*" * 50 + "MERGE ALIGNMENT FINISHED" + "\t" + \
                    str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + \
                    "*" * 50 + "\n"
                yap_file_io.write_data(str_out, err_log)
                yap_file_io.write_data(str_out, stat_log)
                if sort_files_cmd != '':
                    workflow_prov.append(sort_files_cmd)
                if exit_code != 0:
                    if file_name == '':
                        print "Error: chunk merge sort failed for barcode=", barcode, "\n"
                    else:
                        print "Error: chunks  merge sort failed for Filename=", file_name, "barcode=", barcode, "\n"
                for z in range(0, len(sort_input_files_arr)):
                    os.remove(sort_input_files_arr[z])
    else:
        if len(sort_input_files_arr) > 0:
            if len(sort_input_files_arr) == 1:
                os.rename(sort_input_files_arr[0], final_output_name)
                workflow_prov.append(
                    'RENAMED FILE ' +
                    sort_input_files_arr[0] +
                    ' TO ' +
                    final_output_name)
            else:
                str_out = "*" * 50 + "MERGE ALIGNMENT STARTED" + "\t" + str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + "*" * 50 + "\n"
                yap_file_io.write_data(str_out, err_log)
                yap_file_io.write_data(str_out, stat_log)
                for z in range(0, len(sort_input_files_arr)):
                    sam_file_name = sort_input_files_arr[z]
                    sam_file_name_base, ext = os.path.splitext(sam_file_name)
                    sam_to_bam_cmd = "samtools view -bhS " + \
                        sam_file_name + " -o " + sam_file_name_base + ".bam"
                    pconv = Popen(
                        sam_to_bam_cmd, stdout=PIPE, stderr=PIPE, shell=True)
                    std_out, std_err = pconv.communicate()
                    exit_code = pconv.returncode
                    yap_log.write_log(
                        sam_to_bam_cmd,
                        final_output_name,
                        exit_code,
                        std_err,
                        err_log,
                        stat_log)
                    std_out = ""
                    std_err = ""
                    exit_code = 0
                    if exit_code != 0:
                        print " Sam to bam conversion failed"
                    sort_input_files_new_arr.append(
                        sam_file_name_base + '.bam')
                    os.remove(sam_file_name)
                for z in range(0, len(sort_input_files_new_arr)):
                    sort_cmd_input += sort_input_files_new_arr[
                        z].strip("\n") + " "
                if wd.alignment_sort_order == "unsorted":
                    sort_files_cmd = "samtools cat -o - " + sort_cmd_input + \
                        " | samtools view -h - -o " + final_output_name
                else:
                    sort_files_cmd = sort_files_cmd + ' - ' + sort_cmd_input + \
                        " | samtools view -h - -o " + final_output_name
                # the command contains a pipe, so it must run through a shell
                pmerge = Popen(
                    sort_files_cmd, stdout=PIPE, stderr=PIPE, shell=True)
                std_out, std_err = pmerge.communicate()
                exit_code = pmerge.returncode
                if sort_files_cmd != '':
                    workflow_prov.append(sort_files_cmd)
                if exit_code != 0:
                    if file_name == '':
                        print "Error: chunk merge sort failed for barcode=", barcode, "\n"
                    else:
                        print "Error: chunks  merge sort failed for Filename=", file_name, "barcode=", barcode, "\n"
                yap_log.write_log(sort_files_cmd, str(sort_input_files_arr).lstrip(
                    '[').rstrip(']'), exit_code, std_err, err_log, stat_log)
                str_out = "*" * 50 + "MERGE ALIGNMENT FINISHED" + "\t" + str(time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())) + "*" * 50 + "\n"
                yap_file_io.write_data(str_out, err_log)
                yap_file_io.write_data(str_out, stat_log)
                for z in range(0, len(sort_input_files_new_arr)):
                    os.remove(sort_input_files_new_arr[z])
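
The two merge paths above reduce to two shell commands. A minimal sketch of the strings being built, assuming standard samtools semantics (the chunk and output file names are hypothetical):

chunks = "chunk_1.bam chunk_2.bam chunk_3.bam"
# non-SAM output, unsorted order: concatenate the BAM chunks directly
unsorted_cmd = "samtools cat -o merged.bam " + chunks
# SAM output: concatenate the converted BAM chunks and stream the result
# back through samtools view to recover SAM
sam_cmd = "samtools cat -o - " + chunks + " | samtools view -h - -o merged.sam"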
Ejemplo n.º 18
0
def get_postprocess_file_compare_cmd_arr(postprocess_compare_file_cmd_arr,
                                         inp_file_list):
    '''Polish the postprocess commands which require multiple samples,
    for input/output paths'''
    postprocess_cmd_arr = postprocess_compare_file_cmd_arr
    postprocess_compare_arr = []
    temp_sample_file_list = ''
    list_samples_to_compare_dict = wd.list_of_samples_to_compare
    list_samples_dict = wd.list_of_samples
    if wd.regroup_output == 'yes':
        workflow_output_path = wd.workflow_output_path + "/regroup_output"
    else:
        workflow_output_path = wd.workflow_output_path
    for j in range(0, len(postprocess_cmd_arr)):
        input_file_extension = ''
        file_list_comp_matchobj = ''
        sample_file_list_matchobj = ''
        input_directory_matchobj = ''
        compare_file_name = ''
        file_compare_list = []
        postprocess_cmd = postprocess_cmd_arr[j][1]
        postprocess_cmd_name = postprocess_cmd_arr[j][0]
        cmd_type = postprocess_cmd_arr[j][0]
        cmd_meta_data = postprocess_cmd_arr[j][1]
        postprocess_temp_arr = postprocess_cmd_arr[j][2]
        postprocess_cmd_name = postprocess_temp_arr[0][0]
        postprocess_cmd = postprocess_temp_arr[0][1]
        input_file_extension = ''
        list_delimiter = ''
        sample_name = ''
        #set the default input directory for the postprocess stage to aligner_output;
        #the user can specify "postprocess_output" through the configuration
        input_directory = "aligner_output"
        list_delimiter_obj = re.match(
            r'(.*) list_delimiter\s*(\S*)\s*', postprocess_cmd,
            re.M | re.I)
        if list_delimiter_obj:
            list_delimiter = list_delimiter_obj.group(2).strip("\n")
            #postprocess_cmd = postprocess_cmd.replace(list_delimiter, '')
        #check for command input/output keywords from configuration variables
        for kk in range(0, len(cmd_meta_data)):
            input_meta_data = cmd_meta_data[kk].split(" ")
            if input_meta_data:
                if re.search('input_file_type',
                             input_meta_data[0]) is not None:
                    input_file_extension = input_meta_data[
                        1]  #fetch user provided input file type
                if re.search('input_directory',
                             input_meta_data[0]) is not None:
                    input_directory = input_meta_data[
                        1]  # fetch user provided input directory
        if postprocess_cmd_name in list_samples_to_compare_dict:
            cmd_list_samples_to_compare = list_samples_to_compare_dict[
                postprocess_cmd_name]
            compare_file_name = cmd_list_samples_to_compare[0]
            file_compare_list = cmd_list_samples_to_compare[1]
            file_list_comp_matchobj = 'True'
        if postprocess_cmd_name in list_samples_dict:
            cmd_list_samples = list_samples_dict[postprocess_cmd_name]
            compare_file_name = cmd_list_samples[0]
            file_compare_list = cmd_list_samples[1]
            sample_file_list_matchobj = 'True'
        if wd.run_preprocess_analysis == "no" and wd.run_postprocess_analysis == "yes" and wd.run_reference_alignment == "no":
            inp_file_list1 = []
            for i in inp_file_list:
                compare_file_temp = i[0]
                path, compare_file = os.path.split(compare_file_temp)
                compare_file, extension = os.path.splitext(compare_file)
                compare_postprocess_dir_path = compare_file_temp
                for jj in glob.glob(compare_postprocess_dir_path):
                    inp_file_list1.append([jj, '', jj, ''])
            inp_file_list = inp_file_list1
        if compare_file_name == "all":  #if all the sample space to be analyzed one-to-all
            files_temp_list = []
            if file_list_comp_matchobj == 'True':
                for j in range(0, len(inp_file_list)):
                    files_temp_list.append(inp_file_list[j][2])
                #generate the list of file sets one-to-all
                file_compare_list = (
                    generate_file_comparisons(files_temp_list))
            if sample_file_list_matchobj == 'True':
                for j in range(0, len(inp_file_list)):
                    files_temp_list.append(inp_file_list[j][2])
                file_compare_list.append([files_temp_list])
        #iterate over file sets to be analyzed together; check that the files
        #exist for the given source directory and input type
        for i in range(0, len(file_compare_list)):
            compare_file_set = file_compare_list[i]
            input_string = ''
            iter_var = 0
            cmd_compare_dir_path = workflow_output_path + "/" + postprocess_cmd_name + "_output"
            #create directory structure by command name and group number
            if not os.path.exists(cmd_compare_dir_path):
                os.system("mkdir" + " " + cmd_compare_dir_path)
            output_compare_dir_path = cmd_compare_dir_path + \
                "/" + postprocess_cmd_name + "_group_" + str(i + 1)
            err_log = wd.err_log_path + '/' + postprocess_cmd_name + "_group_" + str(
                i + 1) + "_err.log"
            stat_log = wd.stat_log_path + '/' + postprocess_cmd_name + "_group_" + str(
                i + 1) + "_stat.log"
            if not os.path.exists(output_compare_dir_path):
                os.system("mkdir" + " " + output_compare_dir_path)
            temp_file_compare_list = cmd_compare_dir_path + \
                "/" + postprocess_cmd_name + "_input_file_list"
            temp_sample_file_list = output_compare_dir_path + "/" + \
                postprocess_cmd_name + "group" + \
                str(i + 1) + "_input_file_list"
            postprocess_outfile = output_compare_dir_path + \
                "/" + postprocess_cmd_name
            for jk in range(0, len(compare_file_set)):
                input_set = []
                iter_var = iter_var + 1
                compare_file = ''
                command_out_dir = ''
                postprocess_cmd_new = ''
                input_set = get_input_sets(compare_file_set[jk],
                                           compare_file_name, input_directory,
                                           input_file_extension)
                if file_list_comp_matchobj == 'True':
                    if len(input_set) > 0:
                        if iter_var == 1:
                            sample_name = postprocess_cmd_name + \
                                "_Group_" + str(i + 1)
                            yap_file_io.write_data(
                                postprocess_cmd_name + "_Group_" + str(i + 1) +
                                ":" + "\n", temp_file_compare_list)
                        yap_file_io.write_data("Set" + str(jk + 1) + "=",
                                               temp_file_compare_list)
                        for zz in range(0, len(input_set)):
                            yap_file_io.write_data("\t" + input_set[zz] + "\n",
                                                   temp_file_compare_list)
                            input_string += input_set[zz] + ","
                        yap_file_io.write_data("\n", temp_file_compare_list)
                        if list_delimiter != '':
                            input_string = input_string.strip(
                                ",") + " " + list_delimiter + " "
                        else:
                            input_string = input_string.strip(",") + " "
                if sample_file_list_matchobj == 'True':
                    if len(input_set) > 0:
                        sample_name = postprocess_cmd_name + "_Group_" + str(1)
                        for kk in (input_set):
                            yap_file_io.write_data(kk + "\n",
                                                   temp_sample_file_list)
            #polish commands according to the command's file-comparison sets
            if file_list_comp_matchobj == 'True':
                if input_string != '':
                    input_string = input_string.strip(list_delimiter + " ")
                    postprocess_cmd = postprocess_cmd.replace(
                        compare_file_name, '')
                    postprocess_cmd_new = postprocess_cmd.replace(
                        'input_directory', '')
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        input_directory, '')
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'list_of_samples_to_compare', '')
                    postprocess_cmd_new = postprocess_cmd_new + input_string
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'output_file', postprocess_outfile)
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'output_directory', output_compare_dir_path)
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'list_delimiter' + ' ' + list_delimiter, '')
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'sample_name', sample_name)
            #polish commands for the case where all files are passed together as a list
            if sample_file_list_matchobj == 'True':
                if glob.glob(temp_sample_file_list):
                    postprocess_cmd = postprocess_cmd.replace(
                        compare_file_name, '')
                    postprocess_cmd_new = postprocess_cmd.replace(
                        'input_directory', '')
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        input_directory, '')
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'output_file', postprocess_outfile)
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'output_directory', output_compare_dir_path)
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'list_of_samples', temp_sample_file_list)
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'list_delimiter' + ' ' + list_delimiter, '')
                    postprocess_cmd_new = postprocess_cmd_new.replace(
                        'sample_name', sample_name)
            postprocess_compare_arr.append(
                [postprocess_cmd_name, postprocess_cmd_new, err_log, stat_log])
    return postprocess_compare_arr
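
generate_file_comparisons() is not shown in this excerpt. A plausible sketch of the one-to-all grouping the "all" branch relies on, assuming each group pairs one sample's files against the rest (the real implementation may differ):

def generate_file_comparisons(files):
    # hypothetical one-to-all pairing: each group holds two sets,
    # a single sample and the remaining samples
    comparisons = []
    for f in files:
        rest = [x for x in files if x != f]
        if rest:
            comparisons.append([[f], rest])
    return comparisons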
Ejemplo n.º 19
0
def execute_merge_alignment(final_output_name, sort_input_files_arr, file_type,
                            file_name, barcode, sort_files_cmd, workflow_prov,
                            err_log, stat_log):
    '''
    Executes merge data commands for alignment output data.
    '''
    sort_cmd_input = ''
    sort_input_files_new_arr = []
    if file_type != "sam":
        if len(sort_input_files_arr) > 0:
            if len(sort_input_files_arr) == 1:
                os.rename(sort_input_files_arr[0], final_output_name)
                workflow_prov.append('RENAMED FILE ' +
                                     sort_input_files_arr[0] + ' TO ' +
                                     final_output_name)
            else:
                for z in range(0, len(sort_input_files_arr)):
                    sort_cmd_input += sort_input_files_arr[z].strip("\n") + " "
                if wd.alignment_sort_order == "unsorted":
                    sort_files_cmd = "samtools cat -o " + \
                        final_output_name + ' ' + sort_cmd_input
                else:
                    sort_files_cmd = sort_files_cmd + ' ' + \
                        final_output_name + ' ' + sort_cmd_input
                str_out = "*" * 50 + "MERGE ALIGNMENT STARTED" + "\t" + str(
                    time.strftime("%Y/%m/%d %H:%M:%S",
                                  time.localtime())) + "*" * 50 + "\n"
                yap_file_io.write_data(str_out, err_log)
                yap_file_io.write_data(str_out, stat_log)
                pmerge = Popen(sort_files_cmd,
                               stdout=PIPE,
                               stderr=PIPE,
                               shell=True)
                std_out, std_err = pmerge.communicate()
                exit_code = pmerge.returncode
                yap_log.write_log(
                    sort_files_cmd,
                    str(sort_input_files_arr).lstrip('[').rstrip(']'),
                    exit_code, std_err, err_log, stat_log)
                str_out = "*" * 50 + "MERGE ALIGNMENT FINISHED" + "\t" + str(
                    time.strftime("%Y/%m/%d %H:%M:%S",
                                  time.localtime())) + "*" * 50 + "\n"
                yap_file_io.write_data(str_out, err_log)
                yap_file_io.write_data(str_out, stat_log)
                if sort_files_cmd != '':
                    workflow_prov.append(sort_files_cmd)
                if exit_code != 0:
                    if file_name == '':
                        print "Error: chunk merge sort failed for barcode=", barcode, "\n"
                    else:
                        print "Error: chunks  merge sort failed for Filename=", file_name, "barcode=", barcode, "\n"
                for z in range(0, len(sort_input_files_arr)):
                    os.remove(sort_input_files_arr[z])
    else:
        if len(sort_input_files_arr) > 0:
            if len(sort_input_files_arr) == 1:
                os.rename(sort_input_files_arr[0], final_output_name)
                workflow_prov.append('RENAMED FILE ' +
                                     sort_input_files_arr[0] + ' TO ' +
                                     final_output_name)
            else:
                str_out = "*" * 50 + "MERGE ALIGNMENT STARTED" + "\t" + str(
                    time.strftime("%Y/%m/%d %H:%M:%S",
                                  time.localtime())) + "*" * 50 + "\n"
                yap_file_io.write_data(str_out, err_log)
                yap_file_io.write_data(str_out, stat_log)
                for z in range(0, len(sort_input_files_arr)):
                    sam_file_name = sort_input_files_arr[z]
                    sam_file_name_base, ext = os.path.splitext(sam_file_name)
                    sam_to_bam_cmd = "samtools view -bhS " + \
                        sam_file_name + " -o " + sam_file_name_base + ".bam"
                    pconv = Popen(sam_to_bam_cmd,
                                  stdout=PIPE,
                                  stderr=PIPE,
                                  shell=True)
                    std_out, std_err = pconv.communicate()
                    exit_code = pconv.returncode
                    yap_log.write_log(sam_to_bam_cmd, final_output_name,
                                      exit_code, std_err, err_log, stat_log)
                    std_out = ""
                    std_err = ""
                    exit_code = 0
                    if exit_code != 0:
                        print " Sam to bam conversion failed"
                    sort_input_files_new_arr.append(sam_file_name_base +
                                                    '.bam')
                    os.remove(sam_file_name)
                for z in range(0, len(sort_input_files_new_arr)):
                    sort_cmd_input += sort_input_files_new_arr[z].strip(
                        "\n") + " "
                if wd.alignment_sort_order == "unsorted":
                    sort_files_cmd = "samtools cat -o - " + sort_cmd_input + \
                        " | samtools view -h - -o " + final_output_name
                else:
                    sort_files_cmd = sort_files_cmd + ' - ' + sort_cmd_input + \
                        " | samtools view -h - -o " + final_output_name
                # the command contains a pipe, so it must run through a shell
                pmerge = Popen(sort_files_cmd,
                               stdout=PIPE,
                               stderr=PIPE,
                               shell=True)
                std_out, std_err = pmerge.communicate()
                exit_code = pmerge.returncode
                if sort_files_cmd != '':
                    workflow_prov.append(sort_files_cmd)
                if exit_code != 0:
                    if file_name == '':
                        print "Error: chunk merge sort failed for barcode=", barcode, "\n"
                    else:
                        print "Error: chunks  merge sort failed for Filename=", file_name, "barcode=", barcode, "\n"
                yap_log.write_log(
                    sort_files_cmd,
                    str(sort_input_files_arr).lstrip('[').rstrip(']'),
                    exit_code, std_err, err_log, stat_log)
                str_out = "*" * 50 + "MERGE ALIGNMENT FINISHED" + "\t" + str(
                    time.strftime("%Y/%m/%d %H:%M:%S",
                                  time.localtime())) + "*" * 50 + "\n"
                yap_file_io.write_data(str_out, err_log)
                yap_file_io.write_data(str_out, stat_log)
                for z in range(0, len(sort_input_files_new_arr)):
                    os.remove(sort_input_files_new_arr[z])
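
The per-chunk SAM-to-BAM conversion above can be isolated into a standalone helper. A sketch with the exit code checked before the source file is removed (sam_chunk_to_bam is a hypothetical helper, not part of YAP):

import os
from subprocess import Popen, PIPE

def sam_chunk_to_bam(sam_path):
    # convert one SAM chunk to BAM, mirroring the command built above
    base, ext = os.path.splitext(sam_path)
    bam_path = base + ".bam"
    cmd = "samtools view -bhS " + sam_path + " -o " + bam_path
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
    std_out, std_err = proc.communicate()
    if proc.returncode != 0:
        # keep the source SAM around for inspection when conversion fails
        raise RuntimeError("SAM to BAM conversion failed for " + sam_path)
    os.remove(sam_path)
    return bam_path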