Ejemplo n.º 1
0
def _read_postprocess_meta(cmd_meta_data):
    '''
    Return (input_file_extension, input_directory) read from metadata strings.
    '''
    input_file_extension = ''
    # aligner_output is the default input directory for the postprocess
    # stage; the configuration may name "postprocess_output" instead.
    input_directory = "aligner_output"
    for meta_line in cmd_meta_data:
        fields = meta_line.split(" ")
        if 'input_file_type' in fields[0]:
            input_file_extension = fields[1]
        if 'input_directory' in fields[0]:
            input_directory = fields[1]
    return input_file_extension, input_directory


def _fill_postprocess_cmd(cmd_template, input_file_extension, outfile,
                          output_dir, file_name):
    '''
    Replace the generic keywords of a command template with concrete values.
    '''
    # Order matters: the literal "postprocess_output" keyword has to be
    # removed before the output paths (which contain that directory name)
    # are substituted in.
    substitutions = (
        ('input_directory', ''),
        ('input_file_type' + ' ' + input_file_extension, ''),
        ("aligner_output", ''),
        ("postprocess_output", ''),
        ('output_file', outfile),
        ('output_directory', output_dir),
        (' =', '='),
        ('sample_name', file_name),
    )
    cmd = cmd_template
    for keyword, value in substitutions:
        cmd = cmd.replace(keyword, value)
    return cmd


def _warn_no_postprocess_input(file_name, barcode, commands):
    '''
    Print the warning emitted when no input file is found for a barcode.
    '''
    if file_name == '':
        head = " ".join([
            "Warning: No aligner output for barcode = ", str(barcode),
            " ...skipping the postprocess step for command : \n"
        ])
    else:
        head = " ".join([
            "Warning: No aligner output for filename = ", str(file_name),
            "  barcode = ", str(barcode),
            " ...skipping the postprocess step for command: \n"
        ])
    # Single-argument print(...) behaves the same as a print statement and
    # as the print function, so the text is identical on either interpreter.
    print(head + str(commands) + " " + "\n")


def run_postprocess(postprocess_cmd_arr, file_basecount_dict, workflow_prov,
                    err_log, stat_log):
    '''
    Prepare postprocess commands with input/output paths according to sample
    name and barcode, then pass them to yap_tee or run_postprocess_nontee
    for execution.

    Arguments:
        postprocess_cmd_arr: sequence of command groups. For each group,
            item [1] is a list of metadata strings ("input_file_type <ext>",
            "input_directory <dir>") and item [2] is a list of entries whose
            item [0] is the command name and item [1] the command template.
        file_basecount_dict: mapping keyed by sample file path; iterating a
            value yields that sample's barcodes. In postprocess-only mode
            every value is replaced by {'no_barcode_specified': []}.
        workflow_prov: list receiving the provenance entries (input file and
            tee commands); it is also the return value.
        err_log, stat_log: passed through unchanged to yap_tools.yap_tee and
            run_postprocess_nontee.

    Returns:
        workflow_prov with the new provenance entries appended.

    Commands containing "file_based_input" or "directory_based_input" are
    run through run_postprocess_nontee (only the last such command of a
    group is kept); every other command of the group is run through
    yap_tools.yap_tee on the first matching input file.
    '''
    if wd.regroup_output == 'yes':
        workflow_output_path = wd.workflow_output_path + "/regroup_output"
    else:
        workflow_output_path = wd.workflow_output_path
    # Only the postprocess stage is enabled: the dictionary keys themselves
    # are then candidate input files.
    postprocess_only = (wd.run_preprocess_analysis == "no" and
                        wd.run_reference_alignment == "no" and
                        wd.run_postprocess_analysis == "yes")
    for cmd_group in postprocess_cmd_arr:
        cmd_meta_data = cmd_group[1]
        postprocess_temp_arr = cmd_group[2]
        input_file_extension, input_directory = _read_postprocess_meta(
            cmd_meta_data)
        reads_source_files = (postprocess_only and
                              input_directory == "aligner_output")
        # Iterate over file name and barcode; input files are located with
        # glob from the source directory and the configured file extension.
        for filename_key in file_basecount_dict:
            path, file_name = os.path.split(filename_key)
            if postprocess_only:
                # Strip up to two extensions to get the bare sample name.
                file_name = os.path.splitext(
                    os.path.splitext(file_name)[0])[0]
                file_basecount_dict[filename_key] = {
                    'no_barcode_specified': []
                }
            for barcode in file_basecount_dict[filename_key]:
                barcode_value = yap_tools.rename_barcode(barcode)
                sample_dir = workflow_output_path + "/" + file_name + "/" + barcode
                aligner_dir_path = sample_dir + "/" + input_directory
                postprocess_dir_path = sample_dir + "/" + "postprocess_output"
                postprocess_input_file_arr = glob.glob(
                    aligner_dir_path + "/" + "*" + input_file_extension + "*")
                if reads_source_files:
                    # Use the original file directly when it matches the
                    # requested extension in its own directory.
                    aligner_dir_path = path
                    source_matches = glob.glob(
                        path + "/" + "*" + input_file_extension + "*")
                    if filename_key in source_matches:
                        postprocess_input_file_arr = [filename_key]
                if input_file_extension == '':
                    # Without an extension the glob would match everything.
                    postprocess_input_file_arr = []
                postprocess_input_file_arr.sort()
                if not postprocess_input_file_arr:
                    _warn_no_postprocess_input(file_name, barcode,
                                               postprocess_temp_arr)
                    continue
                input_file = postprocess_input_file_arr[0]
                if reads_source_files and input_file != filename_key:
                    # The matched file is not this sample's own input;
                    # skip the sample without a warning.
                    break
                # Start from empty command lists for every input file so
                # that commands built for one sample/barcode are never run
                # against another one.
                tee_cmds = []
                nontee_cmd = []
                for entry in postprocess_temp_arr:
                    cmd_name = entry[0]
                    name_parts = [
                        part for part in (file_name, barcode_value)
                        if part != ''
                    ]
                    name_parts.append(cmd_name)
                    postprocess_outfile = (postprocess_dir_path + "/" +
                                           "_".join(name_parts))
                    cmd = _fill_postprocess_cmd(
                        entry[1], input_file_extension, postprocess_outfile,
                        postprocess_dir_path, file_name)
                    if "file_based_input" in cmd:
                        nontee_cmd = [
                            cmd_name,
                            cmd.replace('file_based_input', input_file)
                        ]
                    elif "directory_based_input" in cmd:
                        nontee_cmd = [
                            cmd_name,
                            cmd.replace('directory_based_input',
                                        aligner_dir_path)
                        ]
                    else:
                        tee_cmds.append(cmd)
                workflow_prov.append("INPUT: " + input_file)
                workflow_prov.extend(cmd for cmd in tee_cmds if cmd != '')
                if tee_cmds:
                    # pass commands to yap_tee function
                    yap_tools.yap_tee([], tee_cmds, input_file, err_log,
                                      stat_log)
                if nontee_cmd:
                    # pass the command to the non-tee runner
                    run_postprocess_nontee(nontee_cmd, workflow_prov, err_log,
                                           stat_log)
    return workflow_prov
Ejemplo n.º 2
0
def _read_postprocess_meta(cmd_meta_data):
    '''
    Return (input_file_extension, input_directory) read from metadata strings.
    '''
    input_file_extension = ''
    # aligner_output is the default input directory for the postprocess
    # stage; the configuration may name "postprocess_output" instead.
    input_directory = "aligner_output"
    for meta_line in cmd_meta_data:
        fields = meta_line.split(" ")
        if 'input_file_type' in fields[0]:
            input_file_extension = fields[1]
        if 'input_directory' in fields[0]:
            input_directory = fields[1]
    return input_file_extension, input_directory


def _fill_postprocess_cmd(
        cmd_template,
        input_file_extension,
        outfile,
        output_dir,
        file_name):
    '''
    Replace the generic keywords of a command template with concrete values.
    '''
    # Order matters: the literal "postprocess_output" keyword has to be
    # removed before the output paths (which contain that directory name)
    # are substituted in.
    substitutions = (
        ('input_directory', ''),
        ('input_file_type' + ' ' + input_file_extension, ''),
        ("aligner_output", ''),
        ("postprocess_output", ''),
        ('output_file', outfile),
        ('output_directory', output_dir),
        (' =', '='),
        ('sample_name', file_name),
    )
    cmd = cmd_template
    for keyword, value in substitutions:
        cmd = cmd.replace(keyword, value)
    return cmd


def _warn_no_postprocess_input(file_name, barcode, commands):
    '''
    Print the warning emitted when no input file is found for a barcode.
    '''
    if file_name == '':
        head = " ".join([
            "Warning: No aligner output for barcode = ", str(barcode),
            " ...skipping the postprocess step for command : \n"])
    else:
        head = " ".join([
            "Warning: No aligner output for filename = ", str(file_name),
            "  barcode = ", str(barcode),
            " ...skipping the postprocess step for command: \n"])
    # Single-argument print(...) behaves the same as a print statement and
    # as the print function, so the text is identical on either interpreter.
    print(head + str(commands) + " " + "\n")


def run_postprocess(
        postprocess_cmd_arr,
        file_basecount_dict,
        workflow_prov,
        err_log,
        stat_log):
    '''
    Prepare postprocess commands with input/output paths according to sample
    name and barcode, then pass them to yap_tee or run_postprocess_nontee
    for execution.

    Arguments:
        postprocess_cmd_arr: sequence of command groups. For each group,
            item [1] is a list of metadata strings ("input_file_type <ext>",
            "input_directory <dir>") and item [2] is a list of entries whose
            item [0] is the command name and item [1] the command template.
        file_basecount_dict: mapping keyed by sample file path; iterating a
            value yields that sample's barcodes. In postprocess-only mode
            every value is replaced by {'no_barcode_specified': []}.
        workflow_prov: list receiving the provenance entries (input file and
            tee commands); it is also the return value.
        err_log, stat_log: passed through unchanged to yap_tools.yap_tee and
            run_postprocess_nontee.

    Returns:
        workflow_prov with the new provenance entries appended.

    Commands containing "file_based_input" or "directory_based_input" are
    run through run_postprocess_nontee (only the last such command of a
    group is kept); every other command of the group is run through
    yap_tools.yap_tee on the first matching input file.
    '''
    if wd.regroup_output == 'yes':
        workflow_output_path = wd.workflow_output_path + "/regroup_output"
    else:
        workflow_output_path = wd.workflow_output_path
    # Only the postprocess stage is enabled: the dictionary keys themselves
    # are then candidate input files.
    postprocess_only = (
        wd.run_preprocess_analysis == "no" and
        wd.run_reference_alignment == "no" and
        wd.run_postprocess_analysis == "yes")
    for cmd_group in postprocess_cmd_arr:
        cmd_meta_data = cmd_group[1]
        postprocess_temp_arr = cmd_group[2]
        input_file_extension, input_directory = _read_postprocess_meta(
            cmd_meta_data)
        reads_source_files = (
            postprocess_only and input_directory == "aligner_output")
        # Iterate over file name and barcode; input files are located with
        # glob from the source directory and the configured file extension.
        for filename_key in file_basecount_dict:
            path, file_name = os.path.split(filename_key)
            if postprocess_only:
                # Strip up to two extensions to get the bare sample name.
                file_name = os.path.splitext(
                    os.path.splitext(file_name)[0])[0]
                file_basecount_dict[filename_key] = {
                    'no_barcode_specified': []}
            for barcode in file_basecount_dict[filename_key]:
                barcode_value = yap_tools.rename_barcode(barcode)
                sample_dir = workflow_output_path + "/" + file_name + "/" + barcode
                aligner_dir_path = sample_dir + "/" + input_directory
                postprocess_dir_path = sample_dir + "/" + "postprocess_output"
                postprocess_input_file_arr = glob.glob(
                    aligner_dir_path + "/" + "*" + input_file_extension + "*")
                if reads_source_files:
                    # Use the original file directly when it matches the
                    # requested extension in its own directory.
                    aligner_dir_path = path
                    source_matches = glob.glob(
                        path + "/" + "*" + input_file_extension + "*")
                    if filename_key in source_matches:
                        postprocess_input_file_arr = [filename_key]
                if input_file_extension == '':
                    # Without an extension the glob would match everything.
                    postprocess_input_file_arr = []
                postprocess_input_file_arr.sort()
                if not postprocess_input_file_arr:
                    _warn_no_postprocess_input(
                        file_name, barcode, postprocess_temp_arr)
                    continue
                input_file = postprocess_input_file_arr[0]
                if reads_source_files and input_file != filename_key:
                    # The matched file is not this sample's own input;
                    # skip the sample without a warning.
                    break
                # Start from empty command lists for every input file so
                # that commands built for one sample/barcode are never run
                # against another one.
                tee_cmds = []
                nontee_cmd = []
                for entry in postprocess_temp_arr:
                    cmd_name = entry[0]
                    name_parts = [
                        part for part in (file_name, barcode_value)
                        if part != '']
                    name_parts.append(cmd_name)
                    postprocess_outfile = (
                        postprocess_dir_path + "/" + "_".join(name_parts))
                    cmd = _fill_postprocess_cmd(
                        entry[1],
                        input_file_extension,
                        postprocess_outfile,
                        postprocess_dir_path,
                        file_name)
                    if "file_based_input" in cmd:
                        nontee_cmd = [
                            cmd_name,
                            cmd.replace('file_based_input', input_file)]
                    elif "directory_based_input" in cmd:
                        nontee_cmd = [
                            cmd_name,
                            cmd.replace(
                                'directory_based_input', aligner_dir_path)]
                    else:
                        tee_cmds.append(cmd)
                workflow_prov.append("INPUT: " + input_file)
                workflow_prov.extend(cmd for cmd in tee_cmds if cmd != '')
                if tee_cmds:
                    # pass commands to yap_tee function
                    yap_tools.yap_tee(
                        [],
                        tee_cmds,
                        input_file,
                        err_log,
                        stat_log)
                if nontee_cmd:
                    # pass the command to the non-tee runner
                    run_postprocess_nontee(
                        nontee_cmd,
                        workflow_prov,
                        err_log,
                        stat_log)
    return workflow_prov
Ejemplo n.º 3
0
def sort_alignment_output(
        chunk_number,
        aligner_cmd_name,
        aligner_cmd,
        aligner_output_filename,
        workflow_prov,
        err_log,
        stat_log):
    '''
    Sort an alignment output file by queryname, coordinate or both.

    The sort order is read from wd.alignment_sort_order. Nothing is done
    unless that order is 'queryname', 'coordinate' or 'both', the file
    exists, its base name does not already contain 'queryname' or
    'coordinate', and its extension (ignoring a trailing .gz/.bz2) is
    .sam or .bam.

    Arguments:
        chunk_number: only used in the clean-up error message.
        aligner_cmd_name, aligner_cmd: not used by this function.
        aligner_output_filename: path of the alignment file to sort; the
            file is deleted after the sort commands have been run.
        workflow_prov: list receiving the executed sort commands; it is
            also the return value.
        err_log, stat_log: passed through unchanged to yap_tools.yap_tee.

    Returns:
        workflow_prov with the sort commands appended.
    '''
    sort_order = wd.alignment_sort_order
    if sort_order not in ('queryname', 'coordinate', 'both'):
        return workflow_prov
    filename_base = os.path.basename(aligner_output_filename)
    if 'queryname' in filename_base or 'coordinate' in filename_base:
        # The name already carries a sort-order tag; leave the file alone.
        return workflow_prov
    if not os.path.exists(aligner_output_filename):
        return workflow_prov

    # Split off the extension and, for compressed files, the format
    # extension underneath it. splitext() keeps the leading dot.
    output_base, outer_ext = os.path.splitext(aligner_output_filename)
    inner_ext = ''
    if outer_ext in ('.gz', '.bz2'):
        output_base, inner_ext = os.path.splitext(output_base)
    extensions = (inner_ext, outer_ext)
    is_sam = '.sam' in extensions
    if not is_sam and '.bam' not in extensions:
        return workflow_prov

    # NOTE(review): the option strings below look like the legacy
    # "samtools sort <in> <out.prefix>" syntax - confirm against the
    # samtools version the pipeline runs with.
    if is_sam:
        # SAM input is converted by an initial pipe command, sorted, and
        # piped through "samtools view" again to be written as .sam.
        initial_pipe_commands = ['samtools view -bhS -']
        sort_file_ext = ".sam"
        name_sort_cmd = " samtools sort -on -m 100000000 - " + \
            output_base + "_" + "queryname"
        coordinate_sort_cmd = " samtools sort -o -m 100000000 - " + \
            output_base + "_" + "coordinate"
        after_sort_cmd = ' | samtools view -h - -o '
    else:
        initial_pipe_commands = []
        sort_file_ext = ''
        name_sort_cmd = " samtools sort -n -m 100000000 - "
        coordinate_sort_cmd = " samtools sort -m 100000000 - "
        after_sort_cmd = ''

    queryname_cmd = name_sort_cmd + after_sort_cmd + \
        output_base + "_queryname" + sort_file_ext
    coordinate_cmd = coordinate_sort_cmd + after_sort_cmd + \
        output_base + "_coordinate" + sort_file_ext
    if sort_order == 'queryname':
        sort_commands = [queryname_cmd]
    elif sort_order == 'coordinate':
        sort_commands = [coordinate_cmd]
    else:
        # both orders are produced from one read of the input via yap_tee
        sort_commands = [queryname_cmd, coordinate_cmd]

    yap_tools.yap_tee(
        initial_pipe_commands,
        sort_commands,
        aligner_output_filename,
        err_log,
        stat_log)
    pipe_prefix = initial_pipe_commands[0] if initial_pipe_commands else ''
    for sort_cmd in sort_commands:
        workflow_prov.append(pipe_prefix + sort_cmd)

    # Remove the unsorted chunk. A failure is reported but must not abort
    # the workflow.
    try:
        os.remove(aligner_output_filename)
    except OSError as e:
        message_parts = [
            "Error: chunks clean up after merge sort failed for Filename=",
            str(aligner_output_filename),
            " chunk number=",
            str(chunk_number)]
        if sort_order == 'both':
            message_parts.append("cmd")
        message_parts.append("\n")
        print(" ".join(message_parts))
        print(e)
    return workflow_prov
Ejemplo n.º 4
0
def sort_alignment_output(chunk_number, aligner_cmd_name, aligner_cmd,
                          aligner_output_filename, workflow_prov, err_log,
                          stat_log):
    '''
    Sort an alignment output file by queryname, coordinate or both.

    The sort order is read from wd.alignment_sort_order. Nothing is done
    unless that order is 'queryname', 'coordinate' or 'both', the file
    exists, its base name does not already contain 'queryname' or
    'coordinate', and its extension (ignoring a trailing .gz/.bz2) is
    .sam or .bam.

    Arguments:
        chunk_number: only used in the clean-up error message.
        aligner_cmd_name, aligner_cmd: not used by this function.
        aligner_output_filename: path of the alignment file to sort; the
            file is deleted after the sort commands have been run.
        workflow_prov: list receiving the executed sort commands; it is
            also the return value.
        err_log, stat_log: passed through unchanged to yap_tools.yap_tee.

    Returns:
        workflow_prov with the sort commands appended.
    '''
    sort_order = wd.alignment_sort_order
    if sort_order not in ('queryname', 'coordinate', 'both'):
        return workflow_prov
    filename_base = os.path.basename(aligner_output_filename)
    if 'queryname' in filename_base or 'coordinate' in filename_base:
        # The name already carries a sort-order tag; leave the file alone.
        return workflow_prov
    if not os.path.exists(aligner_output_filename):
        return workflow_prov

    # Split off the extension and, for compressed files, the format
    # extension underneath it. splitext() keeps the leading dot.
    output_base, outer_ext = os.path.splitext(aligner_output_filename)
    inner_ext = ''
    if outer_ext in ('.gz', '.bz2'):
        output_base, inner_ext = os.path.splitext(output_base)
    extensions = (inner_ext, outer_ext)
    is_sam = '.sam' in extensions
    if not is_sam and '.bam' not in extensions:
        return workflow_prov

    # NOTE(review): the option strings below look like the legacy
    # "samtools sort <in> <out.prefix>" syntax - confirm against the
    # samtools version the pipeline runs with.
    if is_sam:
        # SAM input is converted by an initial pipe command, sorted, and
        # piped through "samtools view" again to be written as .sam.
        initial_pipe_commands = ['samtools view -bhS -']
        sort_file_ext = ".sam"
        name_sort_cmd = " samtools sort -on -m 100000000 - " + \
            output_base + "_" + "queryname"
        coordinate_sort_cmd = " samtools sort -o -m 100000000 - " + \
            output_base + "_" + "coordinate"
        after_sort_cmd = ' | samtools view -h - -o '
    else:
        initial_pipe_commands = []
        sort_file_ext = ''
        name_sort_cmd = " samtools sort -n -m 100000000 - "
        coordinate_sort_cmd = " samtools sort -m 100000000 - "
        after_sort_cmd = ''

    queryname_cmd = name_sort_cmd + after_sort_cmd + \
        output_base + "_queryname" + sort_file_ext
    coordinate_cmd = coordinate_sort_cmd + after_sort_cmd + \
        output_base + "_coordinate" + sort_file_ext
    if sort_order == 'queryname':
        sort_commands = [queryname_cmd]
    elif sort_order == 'coordinate':
        sort_commands = [coordinate_cmd]
    else:
        # both orders are produced from one read of the input via yap_tee
        sort_commands = [queryname_cmd, coordinate_cmd]

    yap_tools.yap_tee(initial_pipe_commands, sort_commands,
                      aligner_output_filename, err_log, stat_log)
    pipe_prefix = initial_pipe_commands[0] if initial_pipe_commands else ''
    for sort_cmd in sort_commands:
        workflow_prov.append(pipe_prefix + sort_cmd)

    # Remove the unsorted chunk. A failure is reported but must not abort
    # the workflow.
    try:
        os.remove(aligner_output_filename)
    except OSError as e:
        message_parts = [
            "Error: chunks clean up after merge sort failed for Filename=",
            str(aligner_output_filename), " chunk number=",
            str(chunk_number)
        ]
        if sort_order == 'both':
            message_parts.append("cmd")
        message_parts.append("\n")
        print(" ".join(message_parts))
        print(e)
    return workflow_prov