def _write_vcf_without_positions(raw_vcf_file, out_file_name, excluded_positions):
    """Copy a VCF file, dropping records whose POS is excluded.

    Lines starting with '#' are copied unchanged. Any other line is copied
    only when the integer value of its second tab-separated field is not in
    ``excluded_positions``.
    """
    with open(out_file_name, 'w+') as filtered_vcf:
        with open(raw_vcf_file, 'rU') as vcf_in:
            for line in vcf_in:
                if line.startswith('#') or int(line.split('\t')[1]) not in excluded_positions:
                    filtered_vcf.write(line)


def remove_5_bp_snp_indel(raw_vcf_file, out_path, analysis, reference, logger, Config):
    """Write a copy of ``raw_vcf_file`` without records within 5 bp of an indel.

    The indel positions are collected according to the configured
    ``variant_caller`` (section "pipeline"):

    * "samtools": records of ``raw_vcf_file`` whose INFO column (8th field)
      starts with 'INDEL;'.
    * "gatkhaplotypecaller": every record of ``<raw_vcf_file>_indel.vcf``,
      which is produced here by running GATK SelectVariants with
      ``-selectType INDEL``.

    Every position from ``pos - 5`` to ``pos + 5`` (inclusive) around each
    indel is excluded from the output; header lines are always kept.

    Args:
        raw_vcf_file: Path of the VCF file to filter.
        out_path: Unused by this function.
        analysis: Unused by this function.
        reference: Reference FASTA passed to GATK (``-R``); only used for the
            "gatkhaplotypecaller" caller.
        logger: Logger handed to ``call`` and ``keep_logging``.
        Config: Parsed configuration handed to ``ConfigSectionMap``.

    Returns:
        The output file name ``<raw_vcf_file>_5bp_indel_removed.vcf``, or
        None when the configured variant caller is neither of the two above.
    """
    variant_caller = ConfigSectionMap("pipeline", Config)['variant_caller']
    remove_snps_5_bp_snp_indel_file_name = raw_vcf_file + "_5bp_indel_removed.vcf"

    if variant_caller == "samtools":
        print("Samtools: Removing SNPs proximate to Indel by 5bp")
        indel_source = raw_vcf_file
        require_indel_flag = True
    elif variant_caller == "gatkhaplotypecaller":
        print("GATK Haplotype caller: Removing SNPs proximate to Indel by 5bp")
        indel_file_name = raw_vcf_file + "_indel.vcf"
        base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap(
            "gatk", Config)['gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd']
        cmd = "java -jar %s -T SelectVariants -R %s -V %s -selectType INDEL -o %s" % (
            base_cmd, reference, raw_vcf_file, indel_file_name)
        # Log the command before running it, like the other GATK steps in this file.
        keep_logging('Running Command: [%s]' % cmd, 'Running Command: [%s]' % cmd, logger, 'info')
        call(cmd, logger)
        indel_source = indel_file_name
        require_indel_flag = False
    else:
        return None

    # NOTE(review): indel_positions and indel_range_positions are not defined
    # in this function; presumably they are module-level lists. They are
    # appended to (never reset) here, so they accumulate across calls -
    # confirm that this is intended.
    with open(indel_source, 'rU') as indel_vcf:
        for line in indel_vcf:
            if line.startswith('#'):
                continue
            line_array = line.split('\t')
            if not require_indel_flag or line_array[7].startswith('INDEL;'):
                indel_positions.append(line_array[1])

    for position in indel_positions:
        indel_range_positions.extend(range(int(position) - 5, int(position) + 6))

    # A set gives constant-time membership tests for the per-record check.
    excluded_positions = set(indel_range_positions)
    _write_vcf_without_positions(raw_vcf_file, remove_snps_5_bp_snp_indel_file_name,
                                 excluded_positions)
    return remove_snps_5_bp_snp_indel_file_name
def _matrix_position(label):
    """Return the integer position encoded in a SNP-matrix row label.

    The label is split on single spaces; the position is the 4th field, or
    the 3rd field when the first field is the literal string "None".
    """
    fields = label.split(' ')
    if fields[0] != "None":
        return int(fields[3])
    return int(fields[2])


def extract_only_ref_variant_fasta_unique_positions_with_unmapped():
    """Build per-sample allele FASTA/VCF files and consensus sequences.

    Reads the module-level ``args`` and ``Config`` objects. For the column of
    ``<filter2_only_snp_vcf_dir>/SNP_matrix_allele_new.csv`` (tab-delimited)
    whose truncated header matches the sample name derived from
    ``args.filter2_only_snp_vcf_filename``, this function:

    1. writes ``<sample>_allele_variants.fa`` with the first character of
       every allele cell;
    2. writes ``<sample>_ref_allele_variants.vcf`` (reference base vs. sample
       allele at every matrix position), then runs bgzip, tabix,
       vcf-consensus and sed on it, appending the commands to
       ``consensus_ref_allele_variant.sh``;
    3. writes ``<sample>_unmapped.vcf`` from the positions listed in
       ``<sample>_unmapped.bed_positions`` (ALT is '-'), merges it with the
       VCF from step 2 using bcftools, and runs bgzip, tabix, vcf-consensus
       and sed on the merged file, recording the commands in
       ``<sample>_consensus_ref_allele_unmapped_variant.sh``.

    For every non-matching column a "does not match" message is printed.

    Raises:
        ValueError: when a column matches but the reference FASTA does not
            contain exactly one sequence (the original code failed with a
            NameError on ``ref_id`` in that situation).
    """
    out_dir = args.filter2_only_snp_vcf_dir

    # Get reference genome ID from reference fasta file.
    # NOTE(review): Fasta is imported elsewhere in the file (presumably pyfasta) - confirm.
    get_reference = Fasta(args.reference)
    ref_id = get_reference.keys() if len(get_reference.keys()) == 1 else None

    # Read the SNP matrix once and transpose it into columns.
    with open('%s/SNP_matrix_allele_new.csv' % out_dir, 'r') as matrix_file:
        rows = list(csv.reader(matrix_file, delimiter='\t'))
    columns = list(zip(*rows))
    ncol = len(rows[0])

    # All variant positions named in the first column (header cell skipped).
    unique_position_array = [_matrix_position(label) for label in columns[0][1:]]

    sample_basename = os.path.basename(args.filter2_only_snp_vcf_filename).replace(
        '_filter2_final.vcf_no_proximate_snp.vcf', '')

    for col in range(1, ncol):
        # Column headers are truncated to the sample-name length before comparing.
        sample_name_re = columns[col][0][:len(sample_basename)]

        # Equality implies containment, so the original
        # "== basename or in basename" test reduces to a substring test.
        if sample_name_re not in sample_basename:
            print("Sample name %s does not match with column name %s" % (
                sample_basename, sample_name_re))
            continue

        if ref_id is None:
            raise ValueError(
                "Reference fasta %s must contain exactly one sequence" % args.reference)
        ref_seq_key = str(ref_id[0])
        ref_chrom = ref_id[0].split(' ')[0]

        vcf_header = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % sample_name_re
        alleles = columns[col][1:]

        # Keep only the first character of each cell (e.g. multi-character or
        # "A/T"-style cells contribute their first character).
        variant_allele = "".join(ntd[:1] for ntd in alleles)
        allele_fasta_path = "%s/%s_allele_variants.fa" % (out_dir, sample_name_re)
        with open(allele_fasta_path, 'w+') as allele_variant_fasta:
            allele_variant_fasta.write(">%s\n%s\n" % (sample_name_re, variant_allele))

        # The original opened this file with 'w+' and never wrote to it;
        # keep creating/truncating it, but close the handle.
        open("%s/%s_ref_allele_variants.fa" % (out_dir, sample_name_re), 'w+').close()

        allele_by_position = dict(zip(unique_position_array, alleles))

        # NOTE(review): the result is unused; the call is kept because
        # constructing Fasta may create index files next to the FASTA - confirm
        # and remove if that side effect is not needed.
        get_sample_reference = Fasta(allele_fasta_path)

        vcf_filename = "%s/%s_ref_allele_variants.vcf" % (out_dir, sample_name_re)
        with open(vcf_filename, 'w+') as allele_ref_variant_vcf:
            allele_ref_variant_vcf.write(vcf_header)
            for position in unique_position_array:
                allele_var = str(allele_by_position[position])[:1]
                ref_allele = str(get_reference.sequence(
                    {'chr': ref_seq_key, 'start': int(position), 'stop': int(position)}))
                allele_ref_variant_vcf.write("%s\t%s\t.\t%s\t%s\t221.999\t.\t.\t.\t.\n" % (
                    ref_chrom, position, ref_allele, allele_var))

        binbase = ConfigSectionMap("bin_path", Config)['binbase']
        tabix_dir = "%s/%s" % (binbase, ConfigSectionMap("vcftools", Config)['tabix_bin'])

        # Compress, index and build the consensus; each command is also
        # appended to a shared shell script for the record.
        filename = "%s/consensus_ref_allele_variant.sh" % out_dir
        with open(filename, 'a+') as f1:
            bgzip_cmd = "%s/bgzip -f %s\n" % (tabix_dir, vcf_filename)
            f1.write(bgzip_cmd)
            subprocess.call(bgzip_cmd, shell=True)
            tabix_cmd = "%s/tabix -f -p vcf %s.gz\n" % (tabix_dir, vcf_filename)
            f1.write(tabix_cmd)
            subprocess.call(tabix_cmd, shell=True)
            base_vcftools_bin = binbase + "/" + ConfigSectionMap("vcftools", Config)['vcftools_bin']
            fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_variants.fa\n" % (
                args.reference, base_vcftools_bin, vcf_filename, sample_name_re)
            f1.write(fasta_cmd)
            subprocess.call(fasta_cmd, shell=True)
            sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_variants.fa\n" % (
                sample_name_re, sample_name_re)
            subprocess.call(sed_command, shell=True)
            f1.write(sed_command)

        # VCF of unmapped positions: reference base with '-' as ALT.
        unmapped_positions_file = "%s/%s_unmapped.bed_positions" % (out_dir, sample_basename)
        unmapped_vcf_file = "%s/%s_unmapped.vcf" % (out_dir, sample_name_re)
        with open(unmapped_vcf_file, 'w+') as unmapped_vcf:
            unmapped_vcf.write(vcf_header)
            with open(unmapped_positions_file, 'r') as fpp:
                for lines in fpp:
                    lines = lines.strip()
                    if not lines:
                        # Skip blank lines instead of failing in int('').
                        continue
                    ref_allele = str(get_reference.sequence(
                        {'chr': ref_seq_key, 'start': int(lines), 'stop': int(lines)}))
                    unmapped_vcf.write("%s\t%s\t.\t%s\t-\t221.999\t.\t.\t.\t.\n" % (
                        ref_chrom, lines, ref_allele))

        bgzip_cmd = "%s/bgzip -f %s\n" % (tabix_dir, unmapped_vcf_file)
        print(bgzip_cmd)
        tabix_cmd = "%s/tabix -f -p vcf %s.gz\n" % (tabix_dir, unmapped_vcf_file)
        print(tabix_cmd)
        subprocess.call(bgzip_cmd, shell=True)
        subprocess.call(tabix_cmd, shell=True)

        # Merge the unmapped VCF with the variant VCF, then build the consensus.
        vcf_filename_unmapped = "%s/%s_ref_allele_unmapped.vcf" % (out_dir, sample_name_re)
        bcftools_merge_cmd = "%s/%s/bcftools merge --merge snps --force-samples %s.gz %s.gz -O v -o %s" % (
            binbase, ConfigSectionMap("bcftools", Config)['bcftools_bin'],
            unmapped_vcf_file, vcf_filename, vcf_filename_unmapped)
        bgzip_cmd = "%s/bgzip -f %s\n" % (tabix_dir, vcf_filename_unmapped)
        subprocess.call(bcftools_merge_cmd, shell=True)
        tabix_cmd = "%s/tabix -f -p vcf %s.gz\n" % (tabix_dir, vcf_filename_unmapped)
        fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_unmapped_variants.fa\n" % (
            args.reference, base_vcftools_bin, vcf_filename_unmapped, sample_name_re)

        filename = "%s/%s_consensus_ref_allele_unmapped_variant.sh" % (out_dir, sample_name_re)
        with open(filename, 'w+') as f1:
            f1.write(bgzip_cmd)
            f1.write(tabix_cmd)
            f1.write(fasta_cmd)
            print("print here: %s" % filename)
            subprocess.call('pwd', shell=True)
            subprocess.call(bgzip_cmd, shell=True)
            subprocess.call(tabix_cmd, shell=True)
            subprocess.call(fasta_cmd, shell=True)
            sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_unmapped_variants.fa\n" % (
                sample_name_re, sample_name_re)
            subprocess.call(sed_command, shell=True)
            f1.write(sed_command)
def gatk_filter_contamination(final_raw_vcf, out_path, analysis, reference, logger, Config, Avg_dp):
    """Run the GATK VariantFiltration contamination filter (samtools caller only).

    When the configured ``variant_caller`` is "samtools", builds a filter
    expression from the "contamination_filters" config section
    (FQ, MQ, QUAL, DP and AF1 thresholds), runs GATK VariantFiltration to
    produce ``<out_path>/<analysis>_filter2_contamination.vcf``, keeps the
    header and PASS_filter2 lines in
    ``<out_path>/<analysis>_filter2_final_contamination.vcf`` and extracts
    DP, POS, FQ, MQ and AF1 values from that file into per-value text files
    plus a combined ``<analysis>_INFO.txt`` ("pos,af").

    When the caller is "gatkhaplotypecaller" only the word "filter" is
    printed. For any other caller nothing happens.

    Args:
        final_raw_vcf: Input VCF; ``<final_raw_vcf>.gz`` is used when the
            plain path does not exist.
        out_path: Output directory.
        analysis: Prefix of the output file names.
        reference: Reference FASTA passed to GATK (``-R``).
        logger: Logger handed to ``call`` and ``keep_logging``.
        Config: Parsed configuration handed to ``ConfigSectionMap``.
        Avg_dp: Average depth; used as DP window (Avg_dp/2, Avg_dp*5) when
            the ``avg_depth`` option is "yes".

    Returns:
        Path of the final contamination VCF for the samtools caller,
        otherwise None. Exits with status 1 if a command raises
        ``sp.CalledProcessError``.
    """
    variant_caller = ConfigSectionMap("pipeline", Config)['variant_caller']
    if variant_caller == "gatkhaplotypecaller":
        print("filter")
        return None
    if variant_caller != "samtools":
        return None

    base_cmd = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap(
        "gatk", Config)['gatk_bin'] + "/" + ConfigSectionMap("gatk", Config)['base_cmd']
    filter_criteria = "contamination_filters"

    if ConfigSectionMap(filter_criteria, Config)['avg_depth'] == "yes":
        keep_logging('The average depth filter is turned on.',
                     'The average depth filter is turned on.', logger, 'info')
        low_Dp = float(Avg_dp) / 2
        high_Dp = float(Avg_dp) * 5
        DP_filter = "DP > %s && DP < %s" % (int(low_Dp), int(high_Dp))
    else:
        DP_filter = "DP > %s" % float(ConfigSectionMap(filter_criteria, Config)['dp'])
    MQ_filter = "MQ > %s" % float(ConfigSectionMap(filter_criteria, Config)['mq'])
    FQ_filter = "FQ > %s" % float(ConfigSectionMap(filter_criteria, Config)['fq'])
    QUAL_filter = "QUAL > %s" % float(ConfigSectionMap(filter_criteria, Config)['qual'])
    AF_filter = "AF1 < %s" % float(ConfigSectionMap(filter_criteria, Config)['af'])
    gatk_filter2_parameter_expression = "%s && %s && %s && %s && %s" % (
        FQ_filter, MQ_filter, QUAL_filter, DP_filter, AF_filter)

    # Fall back to the gzipped VCF when the plain file is absent.
    if os.path.exists(final_raw_vcf):
        variant_input = final_raw_vcf
    else:
        variant_input = "%s.gz" % final_raw_vcf
    gatk_filter2_command = "java -jar %s -T VariantFiltration -R %s -o %s/%s_filter2_contamination.vcf --variant %s --filterExpression \"%s\" --filterName PASS_filter2" % (
        base_cmd, reference, out_path, analysis, variant_input,
        gatk_filter2_parameter_expression)
    filter_flag_command = "grep '#\\|PASS_filter2' %s/%s_filter2_contamination.vcf > %s/%s_filter2_final_contamination.vcf" % (
        out_path, analysis, out_path, analysis)
    keep_logging(gatk_filter2_command, gatk_filter2_command, logger, 'debug')
    keep_logging(filter_flag_command, filter_flag_command, logger, 'debug')
    try:
        call(gatk_filter2_command, logger)
        call(filter_flag_command, logger)
    except sp.CalledProcessError:
        keep_logging('Error in GATK filter step. Exiting.',
                     'Error in GATK filter step. Exiting.', logger, 'exception')
        sys.exit(1)

    gatk_filter2_final_contamination_vcf = "%s/%s_filter2_final_contamination.vcf" % (
        out_path, analysis)

    extract_dp = "egrep -v \"^#\" %s | cut -f 8 | grep -Po 'DP=[0-9]*;?' | sed 's/DP=//g' | sed 's/;//g' > %s/%s_depth_values.txt" % (
        gatk_filter2_final_contamination_vcf, out_path, analysis)
    # '\t' is intentionally a literal tab inside the awk -F argument.
    extract_pos = "grep -v '^#' %s | awk -F'\t' '{print $2}' > %s/%s_POS_values.txt" % (
        gatk_filter2_final_contamination_vcf, out_path, analysis)
    extract_fq = "awk -F'\t' '{print $8}' %s | grep -o 'FQ=.*' | sed 's/FQ=//g' | awk -F';' '{print $1}' > %s/%s_FQ_values.txt" % (
        gatk_filter2_final_contamination_vcf, out_path, analysis)
    # The backslashes are doubled so that sed receives the back-reference
    # "\1"; in a plain string literal "\1" is an octal escape (byte 0x01).
    extract_mq = "egrep -v \"^#\" %s | cut -f 8 | sed 's/^.*MQ=\\([0-9]*\\);.*$/\\1/' > %s/%s_MQ_values.txt" % (
        gatk_filter2_final_contamination_vcf, out_path, analysis)
    extract_af = "awk -F'\t' '{print $8}' %s | grep -o 'AF1=.*' | sed 's/AF1=//g' | awk -F';' '{print $1}' > %s/%s_AF1_values.txt" % (
        gatk_filter2_final_contamination_vcf, out_path, analysis)

    try:
        for extract_cmd in (extract_dp, extract_pos, extract_fq, extract_mq, extract_af):
            # Log the command that is actually run (the second message used
            # to be the unrelated grep filter command).
            keep_logging(extract_cmd, extract_cmd, logger, 'debug')
            call(extract_cmd, logger)
    except sp.CalledProcessError:
        keep_logging('Error in GATK contamination filter step. Exiting.',
                     'Error in GATK contamination filter step. Exiting.', logger, 'exception')
        sys.exit(1)

    # Combine POS and AF1 values into a headed CSV-like file.
    header = "pos,af"
    header_cmd = "echo \"%s\" > %s/header.txt" % (header, out_path)
    call(header_cmd, logger)
    paste_command = "paste -d, %s/%s_POS_values.txt %s/%s_AF1_values.txt > %s/%s_temp_paste_file.txt" % (
        out_path, analysis, out_path, analysis, out_path, analysis)
    call(paste_command, logger)
    combine_file_cmd = "cat %s/header.txt %s/%s_temp_paste_file.txt > %s/%s_INFO.txt" % (
        out_path, out_path, analysis, out_path, analysis)
    call(combine_file_cmd, logger)
    return gatk_filter2_final_contamination_vcf
def gatk_filter_indel(final_raw_vcf, out_path, analysis, reference, logger, Config, Avg_dp):
    """Filter indel calls with GATK VariantFiltration.

    The filter section is named by the ``filter_criteria`` option of the
    "SNP_filters" config section. The expression combines MQ, QD, DP and AF
    thresholds from that section; when its ``avg_depth`` option is "yes" the
    DP threshold becomes the window (Avg_dp/2, Avg_dp*5) instead of ``dp``.

    GATK writes ``<out_path>/<analysis>_filter2_indel_gatk.vcf``; header and
    PASS_filter2 lines of it are then copied by grep to
    ``<out_path>/<analysis>_filter2_indel_final.vcf``.

    Returns:
        Path of the final filtered indel VCF. Exits with status 1 if a
        command raises ``sp.CalledProcessError``.
    """
    gatk_jar = "/".join([
        ConfigSectionMap("bin_path", Config)['binbase'],
        ConfigSectionMap("gatk", Config)['gatk_bin'],
        ConfigSectionMap("gatk", Config)['base_cmd'],
    ])

    filter_criteria = ConfigSectionMap("SNP_filters", Config)['filter_criteria']
    criteria_message = "Using variant filter parameters from: %s" % filter_criteria
    keep_logging(criteria_message, criteria_message, logger, 'info')

    def threshold(option):
        # Read one numeric threshold from the selected filter section.
        return float(ConfigSectionMap(filter_criteria, Config)[option])

    # Depth filter: window around the average depth, or a fixed minimum.
    if ConfigSectionMap(filter_criteria, Config)['avg_depth'] == "yes":
        depth_message = 'The average depth filter is turned on.'
        keep_logging(depth_message, depth_message, logger, 'info')
        lower_depth = float(Avg_dp) / 2
        upper_depth = float(Avg_dp) * 5
        depth_clause = "DP > %s && DP < %s" % (int(lower_depth), int(upper_depth))
    else:
        depth_clause = "DP > %s" % threshold('dp')
    mapping_quality_clause = "MQ > %s" % threshold('mq')
    quality_by_depth_clause = "QD > %s" % threshold('qd')
    allele_frequency_clause = "AF > %s" % threshold('af')
    expression = " && ".join([
        mapping_quality_clause,
        quality_by_depth_clause,
        depth_clause,
        allele_frequency_clause,
    ])

    # Use the gzipped VCF when the plain file is not present.
    if os.path.exists(final_raw_vcf):
        variant_input = final_raw_vcf
    else:
        variant_input = "%s.gz" % final_raw_vcf
    gatk_command = "java -jar %s -T VariantFiltration -R %s -o %s/%s_filter2_indel_gatk.vcf --variant %s --filterExpression \"%s\" --filterName PASS_filter2" % (
        gatk_jar, reference, out_path, analysis, variant_input, expression)
    grep_command = "grep '#\\|PASS_filter2' %s/%s_filter2_indel_gatk.vcf > %s/%s_filter2_indel_final.vcf" % (
        out_path, analysis, out_path, analysis)

    for command in (gatk_command, grep_command):
        keep_logging(command, command, logger, 'debug')
    try:
        call(gatk_command, logger)
        call(grep_command, logger)
    except sp.CalledProcessError:
        failure_message = 'Error in GATK filter step. Exiting.'
        keep_logging(failure_message, failure_message, logger, 'exception')
        sys.exit(1)

    return "%s/%s_filter2_indel_final.vcf" % (out_path, analysis)
def _exit_if_missing(path, primary_template, secondary_template):
    """Log an error and exit with status 1 when ``path`` is not a file.

    Both templates are formatted with the base name of ``path`` and passed
    as the first and second message arguments of ``keep_logging``.
    """
    import sys

    if os.path.isfile(path):
        return
    file_basename = os.path.basename(path)
    keep_logging(primary_template.format(file_basename),
                 secondary_template.format(file_basename), logger, 'exception')
    sys.exit(1)


def file_exists(path1, path2, reference):
    """Check the input files and create missing reference index files.

    Uses the module-level ``logger``, ``Config`` and ``args`` objects.

    1. Exits (status 1) when ``path1``, ``path2`` (if not None) or
       ``reference`` is not an existing file.
    2. Derives the aligner index file names from the configured aligner
       ("bwa" or "bowtie"); when the first of them is missing,
       ``create_index`` is called. Any other aligner is reported and the
       program exits (previously this raised a NameError).
    3. Calls ``create_fai_index`` when ``<reference>.fai`` is missing.
    4. Calls ``picard_seqdict`` when the ``.dict`` file is missing from the
       configured ``ref_path`` of section ``args.index``.

    Args:
        path1: First input file.
        path2: Second input file, or None to skip the check.
        reference: Reference FASTA file.
    """
    import sys

    input_primary = 'The input file {} does not exists. Please provide another file with full path or check the files path.\n'
    input_secondary = 'The input file {} does not exists. Please provide another file or check the files path.\n'
    _exit_if_missing(path1, input_primary, input_secondary)
    if path2 is not None:
        _exit_if_missing(path2, input_primary, input_secondary)
    _exit_if_missing(
        reference,
        'The reference fasta file {} does not exists. Please provide another with full path file with full path or check the files path.\n',
        'The reference fasta file {} does not exists. Please provide another file or check the files path.\n')

    aligner = ConfigSectionMap("pipeline", Config)['aligner']
    if aligner == "bwa":
        index_suffixes = (".bwt", ".amb", ".ann", ".sa", ".pac")
    elif aligner == "bowtie":
        index_suffixes = (".1.bt2", ".2.bt2", ".3.bt2", ".4.ebwt", ".rev.1.bt2")
    else:
        unsupported = 'The aligner {} is not supported. Supported aligners are: bwa, bowtie.\n'.format(aligner)
        keep_logging(unsupported, unsupported, logger, 'exception')
        sys.exit(1)
    index_files = [reference + suffix for suffix in index_suffixes]

    # Only the first index file is probed; the others are listed in the
    # warning and handed to create_index.
    if not os.path.isfile(index_files[0]):
        missing_index = 'The reference index files given below does not exists:\n {}\n {}\n {}\n {}\n {}'.format(*index_files)
        keep_logging(missing_index, missing_index, logger, 'warning')
        create_index(reference, *index_files)
    else:
        keep_logging('Index file already exists.', 'Index file already exists.', logger, 'info')

    ref_fai_index = reference + ".fai"
    if not os.path.isfile(ref_fai_index):
        missing_fai = 'The reference fai index file {} required for samtools does not exists.'.format(ref_fai_index)
        keep_logging(missing_fai, missing_fai, logger, 'warning')
        create_fai_index(reference, ref_fai_index)
    else:
        keep_logging('Samtools fai Index file already exists.',
                     'Samtools fai Index file already exists.', logger, 'info')

    dict_name = os.path.splitext(os.path.basename(reference))[0] + ".dict"
    if not os.path.isfile(ConfigSectionMap(args.index, Config)['ref_path'] + "/" + dict_name):
        missing_dict = 'The reference seq dict file {} required for GATK and PICARD does not exists.'.format(dict_name)
        keep_logging(missing_dict, missing_dict, logger, 'warning')
        picard_seqdict(dict_name, reference)
    else:
        keep_logging('The reference seq dict file required for GATK and PICARD exists.',
                     'The reference seq dict file required for GATK and PICARD exists.',
                     logger, 'info')
def velvetoptimiser(forward_paired, reverse_paired, forward_unpaired, reverse_unpaired, out_path):
    """Assemble reads with VelvetOptimiser and copy contigs.fa to ``out_path``.

    ``check_cleanreads`` decides which read files are used:

    * no clean paired and no clean unpaired reads: the paired arguments are
      treated as raw FASTQ files;
    * clean paired reads only: paired files are used;
    * clean unpaired reads only: not implemented, a placeholder message is
      printed and None is returned;
    * otherwise: the working directory is changed to ``out_path`` and all
      four files are used.

    Returns:
        ``(contigs, scaffolds)`` where contigs is ``out_path + "/contigs.fa"``
        and scaffolds is an empty string, or None for the unpaired-only case.
    """
    print("\n################## Running VELVET on input files ##################\n")
    velvet_dir = out_path + "velvet_results"
    flag_fwd_paired = "-shortPaired -fastq.gz " + forward_paired
    flag_fwd_unpaired = " -short -fastq.gz " + forward_unpaired
    flag_rev_paired = " -shortPaired2 -fastq.gz " + reverse_paired
    flag_rev_unpaired = " -short2 -fastq.gz " + reverse_unpaired
    paired, unpaired = check_cleanreads(forward_paired, reverse_paired,
                                        forward_unpaired, reverse_unpaired)
    contigs = out_path + "/contigs.fa"
    scaffolds = ""

    def optimiser_command(read_flags):
        # Full VelvetOptimiser command line for the given read-file flags.
        return (ConfigSectionMap("bin_path")['binbase']
                + "VelvetOptimiser/VelvetOptimiser.pl -s 71 -e 121 -x 20"
                + " --d " + velvet_dir + " -f '" + read_flags + "'")

    def assemble(command):
        # Run the assembly, then copy the final contigs file to out_path.
        print("Running: %s \n" % command)
        os.system(command)
        os.system("cp " + velvet_dir + "/contigs.fa " + out_path)
        print("\n################## END: VELVET ASSEMBLY ##################\n")
        return contigs, scaffolds

    if paired == 0 and unpaired == 0:
        # Neither clean paired nor clean unpaired reads exist: use raw PE input.
        print("No clean Paired and unpaired reads. Considering forward_paired and reverse_paired as raw Fastq files for assembly.\n")
        return assemble(optimiser_command(flag_fwd_paired + " " + flag_rev_paired))

    if paired == 1 and unpaired == 0:
        # Only clean paired files exist.
        print("Taking only paired reads for assembly.\n")
        return assemble(optimiser_command(flag_fwd_paired + flag_rev_paired))

    if paired == 0 and unpaired == 1:
        # Only clean unpaired files exist: pending, nothing is run.
        print("Running: %s \n" % "This can be single reads......")
        return None

    # Clean paired and unpaired files exist: use all of them.
    os.chdir(out_path)
    full_command = optimiser_command(
        flag_fwd_paired + " " + flag_rev_paired + " " + flag_fwd_unpaired + " " + flag_rev_unpaired)
    print("Running with all input file parameters.\n")
    return assemble(full_command)
def nucmer_repeat(reference, outdir, logger, Config):
    """Find repeat regions of a reference genome with MUMmer tools.

    Runs nucmer (reference against itself), show-coords, exact-tandems and
    repeat-match, then:

    * writes every position covered by the ranges in ``<prefix>.coords``
      (first 6 lines skipped) to
      ``<outdir>/inexact_repeat_region_positions.txt`` - these are reported
      but not included in the returned filter file;
    * writes the sorted, de-duplicated positions covered by the tandem
      repeats in ``<prefix>_tandem_repeats_file`` (first 5 lines skipped;
      each line gives start and length) to
      ``<outdir>/repeat_region_positions.txt``.

    Args:
        reference: Path of the reference FASTA file.
        outdir: Directory receiving the two position files.
        logger: Logger handed to ``call`` and ``keep_logging``.
        Config: Parsed configuration handed to ``ConfigSectionMap``.

    Returns:
        ``<outdir>/repeat_region_positions.txt``.
    """
    keep_logging('\nFinding repeat region in reference genome: %s\n' % reference,
                 '\nFinding repeat region in reference genome: %s\n' % reference,
                 logger, 'info')

    # NOTE(review): the prefix is everything before the FIRST '.' of the whole
    # path, so a dot in a directory name (or a leading "./") shortens it -
    # confirm this is acceptable for the paths used.
    prefix = str(reference.split('.')[0]) + "_repeat"
    binbase = ConfigSectionMap("bin_path", Config)['binbase']
    mummer_bin = ConfigSectionMap("mummer", Config)['mummer_bin']

    nucmer_repeat_cmd = "%s/%s/%s --maxmatch --nosimplify --prefix=%s %s %s" % (
        binbase, mummer_bin, ConfigSectionMap("mummer", Config)['nucmer_base_cmd'],
        prefix, reference, reference)
    keep_logging('Running: %s' % nucmer_repeat_cmd, 'Running: %s' % nucmer_repeat_cmd,
                 logger, 'debug')
    call(nucmer_repeat_cmd, logger)

    showcoords_cmd = "%s/%s/show-coords -I %s -r %s.delta > %s.coords" % (
        binbase, mummer_bin, ConfigSectionMap("mummer", Config)['percent_id'],
        prefix, prefix)
    keep_logging('Running: %s' % showcoords_cmd, 'Running: %s' % showcoords_cmd,
                 logger, 'debug')
    call(showcoords_cmd, logger)

    repeat_match_cmd = "%s/%s/repeat-match %s > %s.repeat_match" % (
        binbase, mummer_bin, reference, prefix)
    tandem_repeats_cmd = "%s/%s/exact-tandems %s %s > %s_tandem_repeats_file" % (
        binbase, mummer_bin, reference,
        ConfigSectionMap("mummer", Config)['min_tandem_repeat_length'], prefix)
    keep_logging('Running: %s' % tandem_repeats_cmd, 'Running: %s' % tandem_repeats_cmd,
                 logger, 'debug')
    keep_logging('Running: %s' % repeat_match_cmd, 'Running: %s' % repeat_match_cmd,
                 logger, 'debug')
    call(tandem_repeats_cmd, logger)
    call(repeat_match_cmd, logger)

    # Collect every position covered by the first two '|'-separated
    # start/end pairs of each coords line.
    inexact_repeat_positions = []
    with open("%s.coords" % prefix) as fp:
        for _ in range(6):
            next(fp)
        for line in fp:
            line_split = line.strip().split('|')
            for coordinates in line_split[:2]:
                i_range = coordinates.strip().split()
                inexact_repeat_positions.extend(
                    range(int(i_range[0]), int(i_range[1]) + 1))

    # Write inexact repeat positions to inexact_repeat_region_positions.txt.
    with open("%s/inexact_repeat_region_positions.txt" % outdir, 'w+') as f_inexact:
        f_inexact.writelines(str(pos) + '\n' for pos in inexact_repeat_positions)
    inexact_count = len(set(inexact_repeat_positions))
    keep_logging('No. of inexact repeat matches positions: %s' % inexact_count,
                 'No. of inexact repeat matches: %s' % inexact_count, logger, 'info')
    keep_logging(
        'Note: The pipeline will not remove these inexact repeat positions. Writing these postions to %s/inexact_repeat_region_positions.txt' % outdir,
        'Note: The pipeline will not remove these inexact repeat positions. Writing these postions to %s/inexact_repeat_region_positions.txt' % outdir,
        logger, 'info')

    # Tandem repeats: each line holds a start position and a length.
    tandem_repeats = []
    with open("%s_tandem_repeats_file" % prefix) as fp:
        for _ in range(5):
            next(fp)
        for line in fp:
            line_split = line.strip().split()
            start = int(line_split[0])
            tandem_repeats.extend(range(start, start + int(line_split[1])))
    All_repeats = sorted(set(tandem_repeats))
    keep_logging('No. of Tandem repeat matches positions: %s' % len(All_repeats),
                 'No. of Tandem repeat matches positions: %s' % len(All_repeats),
                 logger, 'info')

    # Only tandem repeats are filtered; inexact repeats are not included.
    keep_logging(
        'Repeat positions in this file %s/repeat_region_positions.txt will be filtered out' % outdir,
        'Repeat positions in this file %s/repeat_region_positions.txt will be filtered out' % outdir,
        logger, 'info')
    with open("%s/repeat_region_positions.txt" % outdir, 'w+') as f_open:
        f_open.writelines(str(pos) + '\n' for pos in All_repeats)
    return "%s/repeat_region_positions.txt" % outdir
def pipeline(args, logger):
    """Run the variant calling pipeline from the step named in ``args.steps``.

    ``args.steps`` is a comma separated list; only its first entry selects
    the entry point.

    * One entry: "coverage_depth_stats", "bedtools" and "varcall" run just
      that stage, "stats" runs the statistics stage, "filter" runs filtering
      followed by statistics, and "All" runs every stage.
    * Several entries: the pipeline starts at the first entry ("clean",
      "align", "post-align", "varcall", "filter" or "stats") and runs every
      later stage. Any other first entry is logged as an ordering error.

    Args:
        args: parsed command line options. The attributes read here are
            index, type, forward_raw, reverse_raw, steps, downsample,
            output_folder, analysis_name and croplength. forward_raw and
            reverse_raw are overwritten when ``args.downsample == "yes"``.
        logger: logger passed to keep_logging and to every stage helper.

    Returns:
        None. Results are written by the individual stage helpers.
    """

    def log_info(message):
        # Same text for both message slots of keep_logging, 'info' level.
        keep_logging(message, message, logger, 'info')

    def log_debug(message):
        # Same text for both message slots of keep_logging, 'debug' level.
        keep_logging(message, message, logger, 'debug')

    log_info('START: Pipeline')

    # ---- Sanitation checks ----
    # Check Subroutines: Arguments, Input FASTQ files, Reference Index
    keep_logging('START: Checking Dependencies...', 'Checking Dependencies',
                 logger, 'info')

    # Reference Genome file name
    reference = ConfigSectionMap(args.index, Config)['ref_path'] + "/" + \
        ConfigSectionMap(args.index, Config)['ref_name']
    log_info('Getting Reference Genome name from config file: {}'.format(
        reference))

    # Check if FASTQ files exists
    if args.type != "PE" and args.type != "BAM":
        # Neither PE nor BAM: the forward read fills both read slots.
        file_exists(args.forward_raw, args.forward_raw, reference)
    elif args.type != "PE" and args.type != "SE":
        print("BAM type... Not Integrated... continue")
    else:
        file_exists(args.forward_raw, args.reverse_raw, reference)

    # Check Java Version
    java_check()
    keep_logging('END: Checking Dependencies...', 'END: Checking Dependencies',
                 logger, 'info')

    # ---- Start the pipeline ----
    # split values provided with -steps argument and decide the starting
    # point of pipeline
    steps_list = args.steps.split(',')
    start_step = steps_list[0]

    # Locations of the intermediate files the stages hand to each other.
    sorted_bam_path = "%s/%s_aln_sort.bam" % (args.output_folder,
                                               args.analysis_name)
    raw_vcf_path = "%s/%s_aln_mpileup_raw.vcf" % (args.output_folder,
                                                   args.analysis_name)
    indel_removed_vcf_path = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (
        args.output_folder, args.analysis_name)
    depth_summary_path = "%s/%s_depth_of_coverage.sample_summary" % (
        args.output_folder, args.analysis_name)

    # ---- Individual sub-process for each pipeline step ----
    # NOTE: the nested stages below read ``out_sorted_bam`` and
    # ``final_raw_vcf`` from this function's scope, so both must be assigned
    # here before a stage that uses them is called.

    ## 1. Pre-Processing Raw reads using Trimmomatic
    def clean():
        log_info('START: Pre-Processing Raw reads using Trimmomatic')
        if args.type == "PE":
            trimmomatic(args.forward_raw, args.reverse_raw, args.output_folder,
                        args.croplength, logger, Config)
        else:
            reverse_raw = "None"
            trimmomatic(args.forward_raw, reverse_raw, args.output_folder,
                        args.croplength, logger, Config)
        log_info('END: Pre-Processing Raw reads using Trimmomatic')

    ## 2. Stages: Alignment using BWA
    def align_reads():
        log_info('START: Mapping Reads using BWA')
        split_field = prepare_readgroup(
            args.forward_raw,
            ConfigSectionMap("pipeline", Config)['aligner'], logger)
        out_sam = align(args.output_folder, args.index, split_field,
                        args.analysis_name, files_to_delete, logger, Config,
                        args.type)
        log_info('END: Mapping Reads using BWA')
        return out_sam

    # Run Depth of Coverage Module after read mapping.
    def coverage_depth_stats():
        gatk_DepthOfCoverage_file = gatk_DepthOfCoverage(
            out_sorted_bam, args.output_folder, args.analysis_name, reference,
            logger, Config)
        alignment_stats(out_sorted_bam, args.output_folder, args.analysis_name,
                        logger, Config)
        return gatk_DepthOfCoverage_file

    def depth_of_coverage_file():
        # Reuse an existing depth-of-coverage summary, otherwise generate it.
        if os.path.exists(depth_summary_path):
            return depth_summary_path
        return coverage_depth_stats()

    ## 3. Stages: Post-Alignment using SAMTOOLS, PICARD etc
    def post_align(out_sam):
        log_info('START: Post-Alignment using SAMTOOLS, PICARD etc...')
        out_sorted_bam = prepare_bam(out_sam, args.output_folder,
                                     args.analysis_name, files_to_delete,
                                     logger, Config)
        log_info('END: Post-Alignment using SAMTOOLS, PICARD etc...')
        log_info('START: Creating BedGraph Coverage')
        bedgraph_coverage(out_sorted_bam, args.output_folder,
                          args.analysis_name, reference, logger, Config)
        bedtools(out_sorted_bam, args.output_folder, args.analysis_name,
                 logger, Config)
        log_info('END: Creating BedGraph Coverage')
        return out_sorted_bam

    ## 4. Stages: Variant Calling
    def varcall():
        # Returns (SNP VCF with indel-proximate SNPs removed, raw indel VCF).
        log_info('START: Variant Calling')
        caller = ConfigSectionMap("pipeline", Config)['variant_caller']
        if caller == "gatkhaplotypecaller":
            log_info('START: Variant Calling using GATK haplotyper.')
            final_raw_vcf_mpileup = variant_calling(
                out_sorted_bam, args.output_folder, args.index,
                args.analysis_name, logger, Config)
            final_raw_vcf = remove_5_bp_snp_indel(
                final_raw_vcf_mpileup, args.output_folder, args.analysis_name,
                reference, logger, Config)
            final_raw_indel_vcf = prepare_indel(
                final_raw_vcf_mpileup, args.output_folder, args.analysis_name,
                reference, logger, Config)
            log_debug('The final raw VCF file: {}'.format(final_raw_vcf))
            log_debug('The final raw Indel VCF file: {}'.format(
                final_raw_indel_vcf))
            # The original logged the Samtools END message here (copy-paste).
            log_info('END: Variant Calling using GATK haplotyper.')
        elif caller == "samtools":
            log_info(
                'START: Variant Calling using Samtools without post-align bam input files.'
            )
            # GATK indel calling integration
            final_raw_indel_vcf = prepare_indel_gatk(
                out_sorted_bam, args.output_folder, args.analysis_name,
                args.index, logger, Config)
            final_raw_vcf_mpileup = variant_calling(
                out_sorted_bam, args.output_folder, args.index,
                args.analysis_name, logger, Config)
            final_raw_vcf = remove_5_bp_snp_indel(
                final_raw_vcf_mpileup, args.output_folder, args.analysis_name,
                reference, logger, Config)
            log_debug('The final raw VCF file: {}'.format(final_raw_vcf))
            log_info(
                'END: Variant Calling using Samtools without post-align bam input files.'
            )
        else:
            log_info(
                'Please provide Variant Caller name in config file under the section [pipeline]. Options for Variant caller: 1. samtools 2. gatkhaplotypecaller'
            )
            exit()
        # Emitted after the branch so it is reachable; it used to sit behind
        # the per-branch return statements.
        log_info('END: Variant Calling')
        return final_raw_vcf, final_raw_indel_vcf

    ## 5. Stages: Variant Filteration
    def filter(gatk_depth_of_coverage_file):
        log_info('START: Variant Filteration')
        if not os.path.isfile(gatk_depth_of_coverage_file):
            file_basename = os.path.basename(gatk_depth_of_coverage_file)
            keep_logging(
                'The input file {} does not exists. Please provide another file with full path or check the files path.\n'
                .format(file_basename),
                'The input file {} does not exists. Please provide another file or check the files path.\n'
                .format(file_basename), logger, 'exception')
            exit()
        # Third tab separated field of the line starting with "Total" in the
        # depth-of-coverage summary.
        Avg_dp_cmd = "grep \'^Total\' %s | awk -F\'\t\' \'{print $3}\'" % gatk_depth_of_coverage_file
        proc = sp.Popen([Avg_dp_cmd], stdout=sp.PIPE, shell=True)
        out, _ = proc.communicate()
        Avg_dp = float(out)
        print("The Average Depth per reference genome base is: %s" % Avg_dp)
        filter_variants(final_raw_vcf, args.output_folder, args.analysis_name,
                        args.index, logger, Config, Avg_dp)
        final_raw_indel_vcf = raw_vcf_path + "_indel.vcf"
        filter_indels(final_raw_indel_vcf, args.output_folder,
                      args.analysis_name, args.index, logger, Config, Avg_dp)
        log_info('END: Variant Filteration')

    ## 6. Stages: Statistics
    def stats():
        log_info('START: Generating Statistics Reports')
        alignment_stats(out_sorted_bam, args.output_folder, args.analysis_name,
                        logger, Config)
        vcf_stats(final_raw_vcf, args.output_folder, args.analysis_name,
                  logger, Config)
        picardstats(out_sorted_bam, args.output_folder, args.analysis_name,
                    args.index, logger, Config)
        log_info('END: Generating Statistics Reports')

    def required_files_exist(depth_file):
        # Inputs the single-step "filter" and "stats" entry points rely on.
        return all(
            os.path.exists(path)
            for path in (sorted_bam_path, indel_removed_vcf_path, depth_file,
                         raw_vcf_path))

    def abort_missing_intermediates():
        keep_logging(
            'The required intermediate files does not exists. Please rerun the variant calling pipeline to generate the files\n',
            'The required intermediate files does not exists. Please rerun the variant calling pipeline to generate the files',
            logger, 'exception')
        exit()

    if args.downsample == "yes":
        read1, read2 = downsample(args, logger)
        args.forward_raw = read1
        args.reverse_raw = read2
        print("Using downsampled forward reads %s" % args.forward_raw)
        print("Using downsampled reverse reads %s" % args.reverse_raw)

    if len(steps_list) == 1:
        if start_step == "coverage_depth_stats":
            out_sorted_bam = sorted_bam_path
            coverage_depth_stats()

        elif start_step == "filter":
            # Sanity Check Post-varcall vcf and other files here
            out_sorted_bam = sorted_bam_path
            final_raw_vcf = indel_removed_vcf_path
            gatk_depth_of_coverage_file = depth_of_coverage_file()
            if required_files_exist(gatk_depth_of_coverage_file):
                filter(gatk_depth_of_coverage_file)
                stats()
            else:
                abort_missing_intermediates()

        elif start_step == "stats":
            # Sanity Check Post-varcall vcf and other files here
            out_sorted_bam = sorted_bam_path
            final_raw_vcf = indel_removed_vcf_path
            if not os.path.exists(depth_summary_path):
                print(depth_summary_path)
            gatk_depth_of_coverage_file = depth_of_coverage_file()
            if required_files_exist(gatk_depth_of_coverage_file):
                stats()
            else:
                abort_missing_intermediates()

        elif start_step == "All":
            clean()
            out_sam = align_reads()
            post_align(out_sam)
            out_sorted_bam = sorted_bam_path
            gatk_depth_of_coverage_file = depth_of_coverage_file()
            varcall()
            final_raw_vcf = indel_removed_vcf_path
            filter(gatk_depth_of_coverage_file)
            stats()

        elif start_step == "bedtools":
            out_sorted_bam = sorted_bam_path
            bedtools(out_sorted_bam, args.output_folder, args.analysis_name,
                     logger, Config)

        elif start_step == "varcall":
            out_sorted_bam = sorted_bam_path
            depth_of_coverage_file()
            varcall()

    # Run individual variant calling steps: clean, align, post-align, varcall,
    # filter, stats etc
    else:
        if start_step == "clean":
            clean()
            out_sam = align_reads()
            # Fixed: post_align() was called without the SAM file it needs.
            out_sorted_bam = post_align(out_sam)
            gatk_depth_of_coverage_file = depth_of_coverage_file()
            final_raw_vcf, _ = varcall()
            filter(gatk_depth_of_coverage_file)
            stats()

        elif start_step == "align":
            # Sanity Check clean reads here
            out_sam = align_reads()
            # Fixed: a second, argument-less post_align() call used to follow.
            out_sorted_bam = post_align(out_sam)
            gatk_depth_of_coverage_file = depth_of_coverage_file()
            final_raw_vcf, _ = varcall()
            filter(gatk_depth_of_coverage_file)
            stats()

        elif start_step == "post-align":
            # Sanity Check BAM file here
            out_sam = "%s/%s_aln.sam" % (args.output_folder,
                                         args.analysis_name)
            out_sorted_bam = post_align(out_sam)
            gatk_depth_of_coverage_file = depth_of_coverage_file()
            final_raw_vcf, _ = varcall()
            filter(gatk_depth_of_coverage_file)
            stats()

        elif start_step == "varcall":
            # Sanity Check Post-aligned-BAM and Bed files here
            out_sorted_bam = sorted_bam_path
            if not os.path.exists("%s.bai" % out_sorted_bam):
                index_bam(out_sorted_bam, args.output_folder, logger, Config)
            gatk_depth_of_coverage_file = depth_of_coverage_file()
            final_raw_vcf, _ = varcall()
            filter(gatk_depth_of_coverage_file)
            stats()

        elif start_step == "filter":
            # Sanity Check Post-varcall vcf and other files here
            out_sorted_bam = sorted_bam_path
            gatk_depth_of_coverage_file = depth_of_coverage_file()
            final_raw_vcf = indel_removed_vcf_path
            filter(gatk_depth_of_coverage_file)
            stats()

        elif start_step == "stats":
            # Sanity check BAM and vcf files
            # Fixed: the BAM path is assigned before the depth-of-coverage
            # stage, which reads it, can run.
            out_sorted_bam = sorted_bam_path
            final_raw_vcf = indel_removed_vcf_path
            depth_of_coverage_file()
            stats()

        else:
            keep_logging(
                'Seems like the Analysis Steps are not in sequential order. Please recheck the -steps argument and run the pipeline again',
                'Seems like the Analysis Steps are not in sequential order. Please recheck the -steps argument and run the pipeline again',
                logger, 'exception')
def get_scheduler_directive(scheduler, Config):
    """Build the cluster directive header for the requested scheduler.

    "PBS" reads the [scheduler] config section; "SLURM" and every other
    value (including an empty one) read the [slurm] section.

    Args:
        scheduler: scheduler name, normally "SLURM" or "PBS".
        Config: config object handed to ConfigSectionMap.

    Returns:
        Tuple of (scheduler_directives, script_Directive, job_name_flag).
    """
    if scheduler and scheduler == "PBS":
        script_Directive = "#PBS"
        job_name_flag = "-N"
        section = "scheduler"
        option_names = ('email', 'notification', 'resources', 'queue',
                        'flux_account')
        template = "#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n"
    else:
        # Explicit SLURM request and the fallback share one layout.
        script_Directive = "#SBATCH"
        job_name_flag = "--job-name="
        section = "slurm"
        option_names = ('email', 'notification', 'partition', 'flux_account',
                        'resources')
        template = "#SBATCH --mail-user=%s\n#SBATCH --mail-type=%s\n#SBATCH --export=ALL\n#SBATCH --partition=%s\n#SBATCH --account=%s\n#SBATCH %s\n"

    # One config lookup per option, in the order the template consumes them.
    option_values = tuple(
        ConfigSectionMap(section, Config)[name] for name in option_names)
    scheduler_directives = template % option_values
    return scheduler_directives, script_Directive, job_name_flag
def picard_seqdict(reference_filename, reference):
    """Run Picard CreateSequenceDictionary for a reference FASTA.

    The dictionary is named after the reference file with its extension
    replaced by ".dict" and is written into the ``ref_path`` directory of
    the ``reference`` config section. Relies on the module-level names
    ``base_cmd`` and ``Config``.

    Args:
        reference_filename: path of the reference FASTA file.
        reference: config section name that holds ``ref_path``.
    """
    file_stem = os.path.splitext(os.path.basename(reference_filename))[0]
    dict_name = file_stem + ".dict"
    cmd = "java -jar %s CreateSequenceDictionary REFERENCE=%s OUTPUT=%s/%s" % (
        base_cmd,
        reference_filename,
        ConfigSectionMap(reference, Config)['ref_path'],
        dict_name,
    )
    print("\nRunning:\n [%s] \n" % cmd)
    os.system(cmd)
# Writes README.md under args.out_dir from the metadata config file named by
# args.readme_meta.
# NOTE(review): the header of the enclosing definition is not visible in this
# chunk; the `global` statements suggest this is a function body - confirm.
global Config_readme
global logger
# Timestamp string; it is not used by the statements visible here.
log_unique_time = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
Config_readme = ConfigParser.ConfigParser()
Config_readme.read(args.readme_meta)
readme_file = args.out_dir + "/README.md"
print readme_file
if not os.path.isfile(readme_file):
    # No README yet: write only the submitter line (no trailing newline).
    f = open(readme_file, 'w+')
    f.write("Request submitted by: %s" % ConfigSectionMap("Main", Config_readme)['submitter'])
    f.close()
else:
    # README exists: 'w+' truncates it, then the full metadata block is
    # written from the [Main] and [Description] sections.
    print "README file already exists: Overwriting this file"
    f = open(readme_file, 'w+')
    f.write("Request submitted by: %s\n" % ConfigSectionMap("Main", Config_readme)['submitter'])
    f.write("Project Name: %s\n" % ConfigSectionMap("Main", Config_readme)['project_name'])
    f.write("Date when pipeline was run: %s\n" % ConfigSectionMap("Main", Config_readme)['date'])
    f.write("Piepline Version: %s\n" % ConfigSectionMap("Main", Config_readme)['version'])
    f.write("Comments: %s\n" % ConfigSectionMap("Description", Config_readme)['comments'])
    # NOTE(review): no f.close() for this handle is visible in this fragment -
    # confirm it is closed further on.
def abacas(reference_genome_path, final_l500_contig, out_path, first_part,
           logger, Config):
    """Order contigs against a reference with ABACAS and join them.

    After ABACAS has run, its ordered multi-FASTA and its "contigsInbin"
    file are concatenated, every FASTA header line is replaced by a fixed
    linker sequence, all whitespace is stripped, and the result is written
    behind a single ``>first_part`` header.

    Args:
        reference_genome_path: reference passed to ABACAS with ``-r``.
        final_l500_contig: query contig file passed to ABACAS with ``-q``.
        out_path: directory that receives all intermediate and final files.
        first_part: prefix used for output file names and the FASTA header.
        logger: logger handed to keep_logging and call.
        Config: config object read for the [bin_path] and [abacas] sections.

    Returns:
        Path of the joined file, ``<out_path>/<first_part>_contigs_ordered.fasta``.

    Exits the process with status 1 when a command raises
    ``sp.CalledProcessError``.
    """
    keep_logging('Contig Reordering using ABACAS',
                 'Contig Reordering using ABACAS', logger, 'info')
    abacas_cmd = "perl %s/%s/%s -r %s -q %s %s -o %s/%s_contigs_ordered" % (
        ConfigSectionMap("bin_path", Config)['binbase'],
        ConfigSectionMap("abacas", Config)['abacas_bin'],
        ConfigSectionMap("abacas", Config)['base_cmd'],
        reference_genome_path, final_l500_contig,
        ConfigSectionMap("abacas", Config)['abacas_parameters'],
        out_path, first_part)

    # Shell steps that turn the ABACAS output into one joined sequence.
    fasta_header = ">%s" % first_part
    header_cmd = "echo \"%s\" > %s/fasta_header" % (fasta_header, out_path)
    abacas_ordered_multifasta = "%s/%s_contigs_ordered.MULTIFASTA.fa" % (
        out_path, first_part)
    abacas_ordered_contigsInbin = "%s/%s_contigs_ordered.contigsInbin.fas" % (
        out_path, first_part)
    join_all_contigs = "cat %s %s > %s/all_contigs.fasta" % (
        abacas_ordered_multifasta, abacas_ordered_contigsInbin, out_path)
    # Every header line becomes the linker sequence placed between contigs.
    add_linker = "sed -i 's/>.*/NNNNNCATTCCATTCATTAATTAATTAATGAATGAATGNNNNN/g' %s/all_contigs.fasta" % out_path
    remove_spaces = "tr -d '[:space:]' < %s/all_contigs.fasta > %s/all_contigs.fasta_changed.fasta" % (
        out_path, out_path)
    join_files = "cat %s/fasta_header %s/all_contigs.fasta_changed.fasta > %s/%s_contigs_ordered.fasta" % (
        out_path, out_path, out_path, first_part)

    try:
        keep_logging(abacas_cmd, abacas_cmd, logger, 'debug')
        call(abacas_cmd, logger)
        print(header_cmd)
        # Each step is logged under its own command text; the header step
        # used to log the ABACAS command a second time.
        for step_cmd in (header_cmd, join_all_contigs, add_linker,
                         remove_spaces, join_files):
            keep_logging(step_cmd, step_cmd, logger, 'debug')
            call(step_cmd, logger)
    except sp.CalledProcessError:
        keep_logging('Error in reordering Contigs using Abacas. Exiting.',
                     'Error in reordering Contigs using Abacas. Exiting.',
                     logger, 'exception')
        sys.exit(1)

    final_ordered_contigs = "%s/%s_contigs_ordered.fasta" % (out_path,
                                                              first_part)
    return final_ordered_contigs
def spades_assembly(forward_paired, reverse_paired, forward_unpaired,
                    reverse_unpaired, out_path, logger, Config, do_assembly):
    """Assemble reads with SPAdes and/or plasmid SPAdes.

    ``check_cleanreads`` decides which inputs are available:

    * neither clean paired nor unpaired reads: the paired arguments are used
      as raw FASTQ files and both assemblies are run through ``os.system``;
    * only clean paired reads: both assemblies are run through
      ``os.system``, single-end (``--s1``) when ``reverse_paired`` and
      ``reverse_unpaired`` are both the string "None";
    * only clean unpaired reads: not implemented, the process exits;
    * paired and unpaired reads: the assemblies selected by ``do_assembly``
      are run through ``call``; unpaired files are left out of the command
      when either of them has an uncompressed size of 0.

    Args:
        forward_paired, reverse_paired: paired read files.
        forward_unpaired, reverse_unpaired: unpaired read files.
        out_path: output prefix; result folder names are appended to it
            directly, without a path separator being inserted.
        logger: logger handed to keep_logging and call.
        Config: config object read for the [spades] section.
        do_assembly: "both", "wga" or "plasmid". Only consulted when clean
            paired and unpaired reads both exist.

    Returns:
        Tuple ``(contigs, scaffolds, plasmid_contigs, plasmid_scaffolds)`` of
        result paths. The raw-FASTQ branch now returns this 4-tuple as well;
        it could never return before because it always raised TypeError.

    Exits the process with status 1 when an assembly command raises
    ``sp.CalledProcessError``, for the unpaired-only case, and for an
    unsupported ``do_assembly`` value.
    """
    # check if the clean reads from Trimmomatic exists in the output folder.
    (paired, unpaired) = check_cleanreads(forward_paired, reverse_paired,
                                          forward_unpaired, reverse_unpaired)

    # Result locations are identical for every input combination.
    contigs = out_path + "spades_results" + "/contigs.fasta"
    scaffolds = out_path + "spades_results" + "/scaffolds.fasta"
    plasmid_contigs = out_path + "spades_plasmid_results" + "/contigs.fasta"
    # NOTE(review): this is contigs.fasta, not scaffolds.fasta. Kept exactly
    # as before because callers receive this value - confirm the intent.
    plasmid_scaffolds = out_path + "spades_plasmid_results" + "/contigs.fasta"
    results = (contigs, scaffolds, plasmid_contigs, plasmid_scaffolds)

    def copy_assembly_to_output():
        # Copy final contigs/scaffolds file to output directory
        os.system("cp %s %s %s" % (contigs, scaffolds, out_path))

    def run_with_os_system(read_options, dir_suffix):
        # Runs both assemblies without checking their exit status.
        base_cmd = ConfigSectionMap("spades", Config)['base_cmd']
        cmdstring = (base_cmd + read_options + " -o " + out_path +
                     "spades_results" + dir_suffix +
                     ConfigSectionMap("spades", Config)['spades_parameters'])
        plasmid_cmdstring = (
            base_cmd + read_options + " -o " + out_path +
            "spades_plasmid_results" + dir_suffix +
            ConfigSectionMap("spades", Config)['plasmid_spades_parameters'])
        print("Running: %s \n" % cmdstring)
        print("Running: %s \n" % plasmid_cmdstring)
        os.system(cmdstring)
        # Fixed: this used to be os.system(plasmid_cmdstring=...), a keyword
        # argument os.system does not accept.
        os.system(plasmid_cmdstring)
        print("Spades assembly results can be found in " + out_path +
              "spades_results")
        print("plasmid Spades assembly results can be found in " + out_path +
              "spades_plasmid_results")
        copy_assembly_to_output()
        print("\n################## End: SPADES ASSEMBLY ##################\n")

    paired_options = " --pe1-1 " + forward_paired + " --pe1-2 " + reverse_paired

    if paired == "0" and unpaired == "0":
        # Clean Paired and unpaired reads doesn't exist. Take raw Input PE
        # files for assembly
        message = "No clean Paired and unpaired reads. Considering forward_paired and reverse_paired as raw Fastq files for assembly.\n"
        print(message)
        run_with_os_system(paired_options, " ")
        return results

    if paired == "1" and unpaired == "0":
        # Only clean Paired PE files exists. Take these files for assembly
        # input.
        message = "Taking only paired reads for assembly.\n"
        print(message)
        if reverse_paired == "None" and reverse_unpaired == "None":
            run_with_os_system(" --s1 " + forward_paired, "/ ")
        else:
            run_with_os_system(paired_options, "/ ")
        return results

    if paired == "0" and unpaired == "1":
        # Only clean unpaired files exist. The old code ran a placeholder
        # sentence as a shell command and then failed on undefined names;
        # fail explicitly instead.
        keep_logging(
            'Spades assembly from only unpaired clean reads is not implemented. Exiting.',
            'Spades assembly from only unpaired clean reads is not implemented. Exiting.',
            logger, 'exception')
        sys.exit(1)

    # Clean paired and unpaired files exists. Take all these files as input.
    banners = {
        "both": 'Running Spades and plasmid Spades assembly',
        "wga": 'Running Spades assembly',
        "plasmid": 'Running plasmid Spades assembly',
    }
    if do_assembly not in banners:
        # Used to surface as UnboundLocalError at the return statement.
        keep_logging(
            'Unsupported do_assembly value: {}. Use both, wga or plasmid. Exiting.'
            .format(do_assembly),
            'Unsupported do_assembly value: {}. Use both, wga or plasmid. Exiting.'
            .format(do_assembly), logger, 'exception')
        sys.exit(1)

    # Check if unpaired files are empty; empty files are left out.
    fwd_unpaired_size = get_uncompressed_size(forward_unpaired)
    rev_unpaired_size = get_uncompressed_size(reverse_unpaired)
    read_options = paired_options
    if fwd_unpaired_size != 0 and rev_unpaired_size != 0:
        read_options += (" --pe1-s " + forward_unpaired + " --pe1-s " +
                         reverse_unpaired)

    spades_base = ConfigSectionMap("spades", Config)['base_cmd']
    cmdstring = (spades_base + " " +
                 ConfigSectionMap("spades", Config)['spades_parameters'] +
                 read_options + " -o " + out_path + "spades_results")
    plasmid_cmdstring = (
        spades_base + " " +
        ConfigSectionMap("spades", Config)['plasmid_spades_parameters'] +
        read_options + " -o " + out_path + "spades_plasmid_results")

    def run_wga():
        try:
            keep_logging(cmdstring, cmdstring, logger, 'debug')
            call(cmdstring, logger)
            copy_assembly_to_output()
            print("")
            keep_logging(
                'Spades assembly results can be found in {}spades_results'.
                format(out_path),
                'Spades assembly results can be found in {}spades_results'.
                format(out_path), logger, 'info')
        except sp.CalledProcessError:
            keep_logging(
                'Error in Spades Assembly step. Exiting. Please check spades.log file in spades_results folder',
                'Error in Spades Assembly step. Exiting. Please check spades.log file in spades_results folder',
                logger, 'exception')
            sys.exit(1)

    def run_plasmid():
        try:
            keep_logging(plasmid_cmdstring, plasmid_cmdstring, logger, 'debug')
            call(plasmid_cmdstring, logger)
            print("")
            keep_logging(
                'Spades plasmid assembly results can be found in {}spades_plasmid_results'
                .format(out_path),
                'Spades plasmid assembly results can be found in {}spades_plasmid_results'
                .format(out_path), logger, 'info')
        except sp.CalledProcessError:
            keep_logging(
                'Error in Spades Plasmid Assembly step. Exiting. Please check spades.log file in spades_results folder',
                'Error in Spades Plasmid Assembly step. Exiting. Please check spades.log file in spades_results folder',
                logger, 'exception')
            sys.exit(1)

    keep_logging(banners[do_assembly], banners[do_assembly], logger, 'debug')
    if do_assembly != "plasmid":
        run_wga()
    if do_assembly != "wga":
        run_plasmid()
    return results
def extract_only_ref_variant_fasta_unique_positions():
    """Write per-sample allele FASTA/VCF files and build a consensus FASTA.

    Reads the tab-delimited ``SNP_matrix_allele_new.csv`` found in
    ``args.filter2_only_snp_vcf_dir``.  The first column carries one
    space-separated label per variant position (the integer position is the
    fourth field, or the third when the first field is ``"None"``); every
    other column carries a sample name in its header cell followed by that
    sample's allele at each position.

    For every column whose (truncated) name is contained in the sample name
    derived from ``args.filter2_only_snp_vcf_filename`` this function:

    * writes ``<sample>_allele_variants.fa`` with the concatenated alleles,
    * writes ``<sample>_ref_allele_variants.vcf`` with one record per position,
    * appends the bgzip / tabix / vcf-consensus / sed commands to
      ``consensus_ref_allele_variant.sh`` and runs each of them in a shell.

    Columns that do not match only produce a console message.

    Relies on the module-level names ``args``, ``Config``, ``Fasta`` and
    ``ConfigSectionMap``.  Returns ``None``.

    Raises:
        ValueError: if the SNP matrix file contains no rows.
    """
    # Get reference genome ID
    get_reference = Fasta(args.reference)
    reference_ids = list(get_reference.keys())
    if len(reference_ids) == 1:
        ref_id = reference_ids
    # NOTE(review): as in the previous revision, ref_id is only bound for a
    # single-sequence reference; with several sequences the first VCF record
    # fails with NameError.  Confirm whether that case has to be supported.

    snp_dir = args.filter2_only_snp_vcf_dir
    matrix_path = '%s/SNP_matrix_allele_new.csv' % snp_dir
    # Read the matrix once and close the handle (it used to be opened twice
    # and never closed).
    with open(matrix_path, 'r') as matrix_handle:
        rows = list(csv.reader(matrix_handle, delimiter='\t'))
    if not rows:
        raise ValueError("SNP matrix %s is empty" % matrix_path)
    columns = list(zip(*rows))
    ncol = len(rows[0])

    unique_position_array = []
    for label in columns[0][1:]:
        fields = label.split(' ')
        if fields[0] != "None":
            unique_position_array.append(int(fields[3]))
        else:
            unique_position_array.append(int(fields[2]))

    # Loop-invariant: sample name implied by the VCF file being processed.
    expected_sample = os.path.basename(args.filter2_only_snp_vcf_filename).replace(
        '_filter2_final.vcf_no_proximate_snp.vcf', '')

    for i in range(1, ncol):
        # Column headers may carry extra suffixes; compare only the leading
        # len(expected_sample) characters.
        sample_name_re = columns[i][0][:len(expected_sample)]

        # Containment already covers the equality case the original also tested.
        if sample_name_re not in expected_sample:
            print("Sample name %s does not match with column name %s" % (expected_sample, sample_name_re))
            continue

        vcf_header = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % sample_name_re

        # Entries that contain "/" contribute only their first character.
        alleles = [ntd[0] if "/" in ntd else ntd for ntd in columns[i][1:]]

        sample_fasta = "%s/%s_allele_variants.fa" % (snp_dir, sample_name_re)
        with open(sample_fasta, 'w+') as allele_variant_fasta:
            allele_variant_fasta.write(">%s\n%s\n" % (sample_name_re, "".join(alleles)))

        # The previous revision opened this file in 'w+' mode, wrote nothing
        # and leaked the handle; keep the create/truncate effect only.
        open("%s/%s_ref_allele_variants.fa" % (snp_dir, sample_name_re), 'w+').close()

        vcf_filename = "%s/%s_ref_allele_variants.vcf" % (snp_dir, sample_name_re)
        with open(vcf_filename, 'w+') as allele_ref_variant_vcf:
            allele_ref_variant_vcf.write(vcf_header)
            # NOTE(review): the returned object was never used; the call is
            # kept in case constructing Fasta has side effects (e.g. index
            # files) that later steps rely on -- confirm and drop if not.
            Fasta(sample_fasta)
            # Pair each position with the allele at the same index.  The old
            # list.index() lookup was quadratic and always returned the first
            # occurrence when a position value was repeated.
            for positions, allele_var in zip(unique_position_array, alleles):
                ref_allele = str(get_reference.sequence(
                    {'chr': str(reference_ids[0]), 'start': int(positions), 'stop': int(positions)}))
                allele_ref_variant_vcf.write(
                    "%s\t%s\t.\t%s\t%s\t221.999\t.\t.\t.\n" % (
                        ref_id[0].split(' ')[0], positions, ref_allele, allele_var))

        binbase = ConfigSectionMap("bin_path", Config)['binbase']
        tabix_bin = ConfigSectionMap("vcftools", Config)['tabix_bin']
        base_vcftools_bin = binbase + "/" + ConfigSectionMap("vcftools", Config)['vcftools_bin']

        bgzip_cmd = "%s/%s/bgzip -f %s\n" % (binbase, tabix_bin, vcf_filename)
        tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % (binbase, tabix_bin, vcf_filename)
        # NOTE(review): the consensus FASTA is written relative to the current
        # working directory, not to filter2_only_snp_vcf_dir -- confirm intent.
        fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_variants.fa\n" % (
            args.reference, base_vcftools_bin, vcf_filename, sample_name_re)
        sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_variants.fa\n" % (sample_name_re, sample_name_re)

        # Record every command in the shell script and run it immediately.
        with open("%s/consensus_ref_allele_variant.sh" % snp_dir, 'a+') as f1:
            for command in (bgzip_cmd, tabix_cmd, fasta_cmd, sed_command):
                f1.write(command)
                subprocess.call([command], shell=True)
# Script body: load the configuration, prepare the output folder, resolve the
# reference genome when coverage depth is requested, then run the pipeline.
# Relies on names bound before this point: Config, config_file, args,
# analysis_string, log_unique_time and start_time_2.
Config.read(config_file)
logger = generate_logger(args.output_folder, analysis_string, log_unique_time)

# Set output directory paths
if args.output_folder != '':
    args.output_folder += '/'
make_sure_path_exists(args.output_folder)
# NOTE(review): this appends a second '/' to a non-empty output folder, so the
# value handed to pipeline() ends in '//' -- confirm whether that is intended.
if args.output_folder != '':
    args.output_folder += '/'

# Set reference genome path to map the samples and calculate coverage depth
if "coverage_depth" in args.analysis_names:
    try:
        reference = ConfigSectionMap(args.reference, Config)['ref_path'] + "/" + ConfigSectionMap(args.reference, Config)['ref_name']
    except OSError as exception:
        # Only OSError is handled here; an OSError carrying errno.EEXIST is
        # silently ignored, which leaves `reference` unbound.
        if exception.errno != errno.EEXIST:
            print "Please provide reference genome name or Check the reference genome path in config file.\n"
            exit()
else:
    # No reference is needed when coverage depth is not requested.
    # NOTE(review): the flattened source does not show which statement this
    # `else` belongs to; it is read here as pairing with the
    # "coverage_depth" check -- confirm.
    reference = "NONE"

# Main Workflow
pipeline(args, logger, Config, args.output_folder, args.prefix, reference)
keep_logging('End: Pipeline\n', 'End: Pipeline', logger, 'info')
# Elapsed time since start_time_2.
time_taken = datetime.now() - start_time_2
keep_logging('Total Time taken: {}'.format(time_taken), 'Total Time taken: {}'.format(time_taken), logger, 'info')
def _mate_file_and_prefix(fastq):
    """Return ``(reverse_file, first_part)`` for a forward-read FASTQ path.

    The known forward-read suffixes are tried in order against the file's
    basename.  On the first match the suffix is swapped for its reverse-read
    counterpart in the full path, and the text before the suffix (with
    ``_L001`` removed) becomes the sample prefix.  When nothing matches,
    ``_R1_`` is replaced by ``_R2_`` and a notice is printed.
    """
    suffix_pairs = (
        ("R1_001_final.fastq.gz", "R2_001_final.fastq.gz"),
        ("R1.fastq.gz", "R2.fastq.gz"),
        ("1_combine.fastq.gz", "2_combine.fastq.gz"),
        ("1_sequence.fastq.gz", "2_sequence.fastq.gz"),
        ("_forward.fastq.gz", "_reverse.fastq.gz"),
        ("R1_001.fastq.gz", "R2_001.fastq.gz"),
        ("_1.fastq.gz", "_2.fastq.gz"),
    )
    filename_base = os.path.basename(fastq)
    for forward_suffix, reverse_suffix in suffix_pairs:
        if forward_suffix in filename_base:
            reverse_file = fastq.replace(forward_suffix, reverse_suffix)
            first_part = filename_base.split(forward_suffix)[0].replace('_L001', '')
            return reverse_file, first_part

    print("Using Standard second file naming convention")
    reverse_file = fastq.replace("_R1_", "_R2_")
    first_part = filename_base.split('_R1.fastq.gz')[0].replace('_L001', '')
    return reverse_file, first_part


def coverage_depth_analysis(filenames_array, Config, logger, output_folder, type, samples, coverage_depth_directory, cluster, reference, scheduler):
    """Generate the coverage-depth command script for every input FASTQ file.

    For each file the command list is built by chaining ``align_bwa``,
    ``samtobam``, ``sort_bam``, ``index_bam``, ``gatk_DepthOfCoverage`` and
    ``flagstat``.  The commands are joined one per line and either handed to
    ``generate_cluster_jobs`` (``cluster == "cluster"``) or written to
    ``<coverage_depth_directory>/<prefix>_commands.sh``.

    Every path now follows the calling convention of the paired-end ``.gz``
    branch, the only branch of the previous revision that bound every name it
    used.  The other branches failed on unbound names (``command_list`` and
    ``analysis`` in SE, ``out_sorted_bam`` everywhere, ``out_sam`` /
    ``out_bam`` / ``out_sort_bam`` in the non-``.gz`` PE branch) and disagreed
    with that branch on helper arity and on the ``generate_cluster_jobs``
    arguments.

    NOTE(review): "SE" was marked "Pending Changes" and could not run before.
    It now goes through the same steps as "PE", with ``type`` forwarded to
    ``align_bwa`` and the bwa ``base_cmd`` config value in place of the
    ``bwa_bin`` values the old SE branch passed.  Confirm that ``align_bwa``
    handles single-end input before relying on it.

    Args:
        filenames_array: forward-read FASTQ paths to process.
        Config: parsed configuration passed on to the helpers.
        logger: logger passed to ``keep_logging`` and the helpers.
        output_folder: not used here; kept for call compatibility.
        type: "PE" or "SE"; any other value results in no work.
        samples: not used here; kept for call compatibility.
        coverage_depth_directory: directory for intermediate and result files.
        cluster: "cluster" submits jobs; any other value writes a script.
        reference: reference genome path handed to the helpers.
        scheduler: scheduler identifier forwarded to ``generate_cluster_jobs``.

    Returns:
        None.
    """
    if type not in ("PE", "SE"):
        return

    files_to_delete = []
    for fastq in filenames_array:
        command_list = []
        reverse_file, first_part = _mate_file_and_prefix(fastq)
        file_prefix = coverage_depth_directory + "/" + first_part
        analysis = first_part

        keep_logging("Generating command list to create cluster jobs",
                     "Generating command list to create cluster jobs", logger, 'info')
        split_field = prepare_readgroup(fastq, logger)
        command_list, files_to_delete, out_sam = align_bwa(
            ConfigSectionMap("bwa", Config)['base_cmd'], fastq, reverse_file,
            coverage_depth_directory, reference, split_field, first_part,
            files_to_delete, logger, Config, type, command_list)
        command_list, files_to_delete, out_bam = samtobam(
            out_sam, coverage_depth_directory, analysis, files_to_delete,
            logger, Config, command_list)
        command_list, files_to_delete, out_sort_bam = sort_bam(
            out_bam, coverage_depth_directory, analysis, logger, Config,
            command_list, files_to_delete)
        command_list = index_bam(out_sort_bam, coverage_depth_directory, logger,
                                 Config, command_list, files_to_delete)
        command_list, gatk_depth_of_coverage_file = gatk_DepthOfCoverage(
            out_sort_bam, coverage_depth_directory, analysis, reference, logger,
            Config, command_list)
        command_list = flagstat(out_sort_bam, coverage_depth_directory, analysis,
                                logger, Config, command_list)

        coverage_depth_cmd = "".join(command + "\n" for command in command_list)
        keep_logging('', coverage_depth_cmd, logger, 'debug')

        if cluster == "cluster":
            generate_cluster_jobs(coverage_depth_cmd, file_prefix, scheduler, Config, logger)
        else:
            # Close the script explicitly so it is flushed before anything runs it.
            with open(file_prefix + '_commands.sh', 'w+') as f3:
                f3.write(coverage_depth_cmd)
def pipeline(args, logger, Config, output_folder, prefix, reference):
    """Validate the inputs and run every analysis named in ``args.analysis_names``.

    Each line of ``args.samples`` is turned into a path under
    ``args.directory``, appended to ``filenames_array`` (a name bound outside
    this function) and checked with ``file_exists``.  The sample list is then
    copied to ``output_folder`` and the comma-separated analysis names are
    dispatched one by one.  Result directories are created under
    ``args.output_folder`` and named after ``args.prefix``.
    """
    keep_logging('\nSTART: Pipeline', 'START: Pipeline', logger, 'info')

    # Check Subroutines and create logger object: Arguments, Input files, Reference Index
    keep_logging('Checking Dependencies...', 'Checking Dependencies', logger, 'info')

    # Check java availability
    java_check()

    # Check if the input file exists
    with open(args.samples) as sample_sheet:
        for entry in sample_sheet:
            fastq_path = args.directory + "/" + entry.strip()
            filenames_array.append(fastq_path)
            # Paired-end runs pass the forward path for both arguments.
            mate_path = fastq_path if args.type == "PE" else "None"
            file_exists(fastq_path, mate_path)

    sample_count_msg = 'Total no. of Samples %s' % len(filenames_array)
    keep_logging(sample_count_msg, sample_count_msg, logger, 'info')

    # Start the pipeline
    analysis_list = args.analysis_names.split(',')
    running_msg = 'Running analysis - %s' % args.analysis_names
    keep_logging(running_msg, running_msg, logger, 'info')

    # Copy filenames to output folder
    os.system("cp %s %s" % (args.samples, output_folder))

    # Set default cluster mode
    cluster = args.cluster or "local"

    def _prepared(directory):
        """Create *directory* if needed and hand the path back."""
        make_sure_path_exists(directory)
        return directory

    # Start specific analysis based on analysis list
    for analysis in analysis_list:
        if analysis == "coverage":
            keep_logging("Step: Calculating Coverage...\n", "Calculating Coverage", logger, 'info')
            coverage(filenames_array, Config, logger, output_folder, args.type,
                     args.samples, args.size, prefix)

        elif analysis == "quality":
            keep_logging("Step: Analysing Fastqc Quality...\n", "Analysing Fastqc Quality...", logger, 'info')
            fastqc_main_directory = _prepared(args.output_folder + "/%s_Fastqc" % args.prefix)
            fastqc_forward_directory = _prepared(fastqc_main_directory + "/%s_Forward" % args.prefix)
            fastqc_reverse_directory = _prepared(fastqc_main_directory + "/%s_Reverse" % args.prefix)
            Multiqc_reports_directory = _prepared(args.output_folder + "/%s_Multiqc_reports" % args.prefix)
            quality(filenames_array, Config, logger, output_folder, args.type,
                    args.samples, fastqc_forward_directory, fastqc_reverse_directory)
            for read_directory, report_label in (
                    (fastqc_forward_directory, "%s_Forward_fastqc" % args.prefix),
                    (fastqc_reverse_directory, "%s_Reverse_fastqc" % args.prefix)):
                multiqc(read_directory, report_label, Config, logger, Multiqc_reports_directory)

        elif analysis == "screen_contamination":
            keep_logging("Step: Screening Fastq reads against Reference Database...\n",
                         "Screening Fastq reads against Reference Database...", logger, 'info')
            fastq_screen_directory = _prepared(args.output_folder + "/%s_Fastqc_screen" % args.prefix)
            screen_contamination(filenames_array, Config, logger, output_folder, args.type,
                                 args.samples, fastq_screen_directory, cluster)
            Multiqc_reports_directory = _prepared(args.output_folder + "/%s_Multiqc_reports" % args.prefix)
            multiqc(fastq_screen_directory, "%s_Fastq_screen" % args.prefix, Config, logger,
                    Multiqc_reports_directory)
            report_msg = 'MultiQC Report of FastQC results can be found in - %s\n' % Multiqc_reports_directory
            keep_logging(report_msg, report_msg, logger, 'info')

        elif analysis == "kraken_contamination":
            keep_logging("Step: Running Kraken on Input reads...\n", "Running Kraken on Input reads...", logger, 'info')
            kraken_directory = _prepared(args.output_folder + "/%s_Kraken_results" % args.prefix)
            kraken_contamination(filenames_array, Config, logger, output_folder, args.type,
                                 args.samples, kraken_directory, cluster, args.downsample,
                                 args.scheduler, args.size, args.dryrun)

        elif analysis == "kraken_report":
            keep_logging("Step: Generating Kraken report on Kraken Results...\n",
                         "Generating Kraken report on Kraken Results...", logger, 'info')
            kraken_directory = _prepared(args.output_folder + "/%s_Kraken_results" % args.prefix)
            kraken_report(filenames_array, Config, logger, output_folder, args.type,
                          args.samples, kraken_directory, cluster, args.scheduler)

        elif analysis == "coverage_depth":
            keep_logging("Step: Running Coverage Depth analysis on Input reads...\n",
                         "Running Coverage Depth analysis on Input reads...", logger, 'info')
            coverage_depth_directory = _prepared(args.output_folder + "/%s_Coverage_depth" % args.prefix)
            coverage_depth_analysis(filenames_array, Config, logger, output_folder, args.type,
                                    args.samples, coverage_depth_directory, cluster, reference,
                                    args.scheduler)

        elif analysis == "mlst":
            keep_logging("Step: Running Ariba MLST sequence typing on Input reads...\n",
                         "Running MLST sequence typing on Input reads...", logger, 'info')
            # A database given on the command line wins over the configured one.
            mlstdb = args.mlst_db if args.mlst_db else ConfigSectionMap("ariba", Config)['mlst_db_path']
            keep_logging('', "Using Ariba MLST Database from this path - %s" % mlstdb, logger, 'debug')
            mlst_directory = _prepared(args.output_folder + "/%s_MLST_results" % args.prefix)
            mlst(filenames_array, Config, logger, mlst_directory, args.type, args.samples,
                 mlst_directory, cluster, args.scheduler, mlstdb)

        elif analysis == "summary":
            keep_logging('', "Generating Summary report for QC'd analysis - %s" % args.prefix, logger, 'debug')
            summary(filenames_array, Config, logger, args.prefix, output_folder)
            summary_msg = "Summary report - %s/%s_summary.tsv" % (output_folder, prefix)
            keep_logging(summary_msg, summary_msg, logger, 'info')
def varcall():
    """Call variants with the caller configured under ``[pipeline]``.

    Reads ``variant_caller`` from the configuration and runs the matching
    sequence of steps:

    * ``gatkhaplotypecaller``: ``variant_calling``, ``remove_5_bp_snp_indel``,
      then ``prepare_indel`` on the raw call set.
    * ``samtools``: ``prepare_indel_gatk`` on the sorted BAM, then
      ``variant_calling`` and ``remove_5_bp_snp_indel``.

    Uses ``args``, ``logger``, ``Config``, ``out_sorted_bam`` and
    ``reference`` from the enclosing/module scope.

    Returns:
        tuple: ``(final_raw_vcf, final_raw_indel_vcf)``.

    Exits the process with status 1 when the configured caller is neither of
    the two supported names.
    """
    keep_logging('START: Variant Calling', 'START: Variant Calling', logger, 'info')
    caller = ConfigSectionMap("pipeline", Config)['variant_caller']

    if caller == "gatkhaplotypecaller":
        keep_logging('START: Variant Calling using GATK haplotyper.',
                     'START: Variant Calling using GATK haplotyper.', logger, 'info')
        final_raw_vcf_mpileup = variant_calling(out_sorted_bam, args.output_folder, args.index,
                                                args.analysis_name, logger, Config)
        final_raw_vcf = remove_5_bp_snp_indel(final_raw_vcf_mpileup, args.output_folder,
                                              args.analysis_name, reference, logger, Config)
        final_raw_indel_vcf = prepare_indel(final_raw_vcf_mpileup, args.output_folder,
                                            args.analysis_name, reference, logger, Config)
        keep_logging('The final raw VCF file: {}'.format(final_raw_vcf),
                     'The final raw VCF file: {}'.format(final_raw_vcf), logger, 'debug')
        keep_logging('The final raw Indel VCF file: {}'.format(final_raw_indel_vcf),
                     'The final raw Indel VCF file: {}'.format(final_raw_indel_vcf), logger, 'debug')
        # The END message used to be the Samtools text copied from the other
        # branch; it now mirrors this branch's START message.
        keep_logging('END: Variant Calling using GATK haplotyper.',
                     'END: Variant Calling using GATK haplotyper.', logger, 'info')

    elif caller == "samtools":
        keep_logging('START: Variant Calling using Samtools without post-align bam input files.',
                     'START: Variant Calling using Samtools without post-align bam input files.',
                     logger, 'info')
        # GATK indel calling integration
        final_raw_indel_vcf = prepare_indel_gatk(out_sorted_bam, args.output_folder,
                                                 args.analysis_name, args.index, logger, Config)
        final_raw_vcf_mpileup = variant_calling(out_sorted_bam, args.output_folder, args.index,
                                                args.analysis_name, logger, Config)
        final_raw_vcf = remove_5_bp_snp_indel(final_raw_vcf_mpileup, args.output_folder,
                                              args.analysis_name, reference, logger, Config)
        keep_logging('The final raw VCF file: {}'.format(final_raw_vcf),
                     'The final raw VCF file: {}'.format(final_raw_vcf), logger, 'debug')
        keep_logging('END: Variant Calling using Samtools without post-align bam input files.',
                     'END: Variant Calling using Samtools without post-align bam input files.',
                     logger, 'info')

    else:
        keep_logging(
            'Please provide Variant Caller name in config file under the section [pipeline]. Options for Variant caller: 1. samtools 2. gatkhaplotypecaller',
            'Please provide Variant Caller name in config file under the section [pipeline]. Options for Variant caller: 1. samtools 2. gatkhaplotypecaller',
            logger, 'info')
        # A configuration error must not look like success to the calling
        # shell or scheduler, so exit with a non-zero status.
        exit(1)

    # Both supported branches used to return before reaching this line, so the
    # closing message was never logged; the return now follows it.
    keep_logging('END: Variant Calling', 'END: Variant Calling', logger, 'info')
    return final_raw_vcf, final_raw_indel_vcf
def pipeline(args, logger): keep_logging('START: Pipeline', 'START: Pipeline', logger, 'info') # Check Subroutines and create logger object: Arguments, Input files, Reference Index keep_logging('START: Checking Dependencies...', 'Checking Dependencies', logger, 'info') # Reference Genome file name reference = ConfigSectionMap(args.index, Config)['ref_path'] + "/" + ConfigSectionMap( args.index, Config)['ref_name'] keep_logging( 'Getting Reference Genome name from config file: {}'.format(reference), 'Getting Reference Genome name from config file: {}'.format(reference), logger, 'info') # Check FASTQ files if args.type != "PE": reverse_raw = "None" file_exists(args.forward_raw, args.forward_raw, reference) else: file_exists(args.forward_raw, args.reverse_raw, reference) # Check Java Version java_check() keep_logging('END: Checking Dependencies...', 'END: Checking Dependencies', logger, 'info') """ Start the pipeline: """ steps_list = args.steps.split(',') if args.cluster: cluster = args.cluster else: cluster = "local" ## 1. Pre-Processing Raw reads using Trimmomatic def clean(): keep_logging('START: Pre-Processing Raw reads using Trimmomatic', 'START: Pre-Processing Raw reads using Trimmomatic', logger, 'info') if args.type == "PE": trimmomatic(args.forward_raw, args.reverse_raw, args.output_folder, args.croplength, logger, Config) else: reverse_raw = "None" trimmomatic(args.forward_raw, reverse_raw, args.output_folder, args.croplength, logger, Config) keep_logging('END: Pre-Processing Raw reads using Trimmomatic', 'END: Pre-Processing Raw reads using Trimmomatic', logger, 'info') ## 2. 
Stages: Alignment using BWA def align_reads(): keep_logging('START: Mapping Reads using BWA', 'START: Mapping Reads using BWA', logger, 'info') split_field = prepare_readgroup(args.forward_raw, logger) out_sam = align(args.output_folder, args.index, split_field, args.analysis_name, files_to_delete, logger, Config, args.type) keep_logging('END: Mapping Reads using BWA', 'END: Mapping Reads using BWA', logger, 'info') return out_sam # Run Depth of Coverage Module after read mapping and stop. Dont proceed to variant calling step. def coverage_depth_stats(): gatk_DepthOfCoverage_file = gatk_DepthOfCoverage( out_sorted_bam, args.output_folder, args.analysis_name, reference, logger, Config) alignment_stats_file = alignment_stats(out_sorted_bam, args.output_folder, args.analysis_name, logger, Config) return gatk_DepthOfCoverage_file ## Continue: 3. Stages: Post-Alignment using SAMTOOLS, PICARD etc ## 3. Stages: Post-Alignment using SAMTOOLS, PICARD etc def post_align(): keep_logging('START: Post-Alignment using SAMTOOLS, PICARD etc...', 'START: Post-Alignment using SAMTOOLS, PICARD etc...', logger, 'info') out_sorted_bam = prepare_bam(out_sam, args.output_folder, args.analysis_name, files_to_delete, logger, Config) keep_logging('END: Post-Alignment using SAMTOOLS, PICARD etc...', 'END: Post-Alignment using SAMTOOLS, PICARD etc...', logger, 'info') #out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder, args.analysis_name) keep_logging('START: Creating BedGraph Coverage', 'START: Creating BedGraph Coverage', logger, 'info') bedgraph_coverage(out_sorted_bam, args.output_folder, args.analysis_name, reference, logger, Config) only_unmapped_positions_file = bedtools(out_sorted_bam, args.output_folder, args.analysis_name, logger, Config) keep_logging('END: Creating BedGraph Coverage', 'END: Creating BedGraph Coverage', logger, 'info') return out_sorted_bam ## 4. 
Stages: Variant Calling def varcall(): keep_logging('START: Variant Calling', 'START: Variant Calling', logger, 'info') caller = ConfigSectionMap("pipeline", Config)['variant_caller'] if caller == "samtoolswithpostalignbam": keep_logging( 'START: Variant Calling using Samtools and post-align bam input files', 'START: Variant Calling using Samtools and post-align bam input files', logger, 'info') out_finalbam = post_align_bam(out_sorted_bam, args.output_folder, args.index, args.analysis_name) final_raw_vcf = variant_calling(out_finalbam, args.output_folder, args.index, args.analysis_name) keep_logging('The final raw VCF file: {}'.format(final_raw_vcf), 'The final raw VCF file: {}'.format(final_raw_vcf), logger, 'debug') keep_logging( 'END: Variant Calling using Samtools and post-align bam input files', 'END: Variant Calling using Samtools and post-align bam input files', logger, 'info') elif caller == "gatkhaplotypecaller": keep_logging( 'START: Variant Calling using GATK haplotyper and post-align bam input files', 'START: Variant Calling using GATK haplotyper and post-align bam input files', logger, 'info') out_finalbam = post_align_bam(out_sorted_bam, args.output_folder, args.index, args.analysis_name) final_raw_vcf = variant_calling(out_finalbam, args.output_folder, args.index, args.analysis_name) keep_logging('The final raw VCF file: {}'.format(final_raw_vcf), 'The final raw VCF file: {}'.format(final_raw_vcf), logger, 'debug') keep_logging( 'END: Variant Calling using GATK haplotyper and post-align bam input files', 'END: Variant Calling using GATK haplotyper and post-align bam input files', logger, 'info') elif caller == "samtools": keep_logging( 'START: Variant Calling using Samtools without post-align bam input files.', 'START: Variant Calling using Samtools without post-align bam input files.', logger, 'info') final_raw_vcf_mpileup = variant_calling(out_sorted_bam, args.output_folder, args.index, args.analysis_name, logger, Config) #final_raw_vcf_mpileup = 
"%s/%s_aln_mpileup_raw.vcf" % (args.output_folder, args.analysis_name) final_raw_vcf = remove_5_bp_snp_indel(final_raw_vcf_mpileup, args.output_folder, args.analysis_name, reference, logger, Config) #final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (args.output_folder, args.analysis_name) keep_logging('The final raw VCF file: {}'.format(final_raw_vcf), 'The final raw VCF file: {}'.format(final_raw_vcf), logger, 'debug') keep_logging( 'END: Variant Calling using Samtools without post-align bam input files.', 'END: Variant Calling using Samtools without post-align bam input files.', logger, 'info') return final_raw_vcf else: keep_logging( 'Please provide Variant Caller name in config file under the section [pipeline]. Options for Variant caller: 1. samtools 2. samtoolswithpostalignbam 3. gatkhaplotypecaller', 'Please provide Variant Caller name in config file under the section [pipeline]. Options for Variant caller: 1. samtools 2. samtoolswithpostalignbam 3. gatkhaplotypecaller', logger, 'info') exit() keep_logging('END: Variant Calling', 'END: Variant Calling', logger, 'info') ## 5. Stages: Variant Filteration def filter(gatk_depth_of_coverage_file): keep_logging('START: Variant Filteration', 'START: Variant Filteration', logger, 'info') Avg_dp_cmd = "grep \'^Total\' %s | awk -F\'\t\' \'{print $3}\'" % gatk_depth_of_coverage_file proc = sp.Popen([Avg_dp_cmd], stdout=sp.PIPE, shell=True) (out, err) = proc.communicate() Avg_dp = float(out) print "The Average Depth per reference genome base is: %s" % Avg_dp filter2_variants(final_raw_vcf, args.output_folder, args.analysis_name, args.index, logger, Config, Avg_dp) keep_logging('END: Variant Filteration', 'END: Variant Filteration', logger, 'info') ## 6. 
Stages: Statistics def stats(): keep_logging('START: Generating Statistics Reports', 'START: Generating Statistics Reports', logger, 'info') alignment_stats_file = alignment_stats(out_sorted_bam, args.output_folder, args.analysis_name, logger, Config) #gatk_DepthOfCoverage(out_sorted_bam, args.output_folder, args.analysis_name, reference, logger, Config) vcf_stats_file = vcf_stats(final_raw_vcf, args.output_folder, args.analysis_name, logger, Config) #qualimap_report = qualimap(out_sorted_bam, args.output_folder, args.analysis_name, logger, Config) keep_logging('END: Generating Statistics Reports', 'END: Generating Statistics Reports', logger, 'info') if len(steps_list) == 1: if steps_list[0] == "coverage_depth_stats": clean() out_sam = align_reads() out_sorted_bam = post_align() gatk_DepthOfCoverage_file = coverage_depth_stats() elif steps_list[0] == "All": clean() out_sam = align_reads() out_sorted_bam = post_align() out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder, args.analysis_name) gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % ( args.output_folder, args.analysis_name) if not os.path.exists(gatk_depth_of_coverage_file): gatk_depth_of_coverage_file = coverage_depth_stats() final_raw_vcf = varcall() final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % ( args.output_folder, args.analysis_name) filter(gatk_depth_of_coverage_file) stats() #####Individual steps else: if steps_list[0] == "clean": clean() out_sam = align_reads() out_sorted_bam = post_align() #out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder, args.analysis_name) gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % ( args.output_folder, args.analysis_name) if not os.path.exists(gatk_depth_of_coverage_file): gatk_depth_of_coverage_file = coverage_depth_stats() final_raw_vcf = varcall() #final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (args.output_folder, args.analysis_name) 
filter(gatk_depth_of_coverage_file) stats() elif steps_list[0] == "align": #Check clean reads here out_sam = align_reads() out_sorted_bam = post_align() #out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder, args.analysis_name) gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % ( args.output_folder, args.analysis_name) if not os.path.exists(gatk_depth_of_coverage_file): gatk_depth_of_coverage_file = coverage_depth_stats() final_raw_vcf = varcall() #final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (args.output_folder, args.analysis_name) filter(gatk_depth_of_coverage_file) stats() elif steps_list[0] == "post-align": #Check BAM file here out_sorted_bam = post_align() #out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder, args.analysis_name) gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % ( args.output_folder, args.analysis_name) if not os.path.exists(gatk_depth_of_coverage_file): gatk_depth_of_coverage_file = coverage_depth_stats() final_raw_vcf = varcall() #final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (args.output_folder, args.analysis_name) filter(gatk_depth_of_coverage_file) stats() elif steps_list[0] == "varcall": #Check Post-aligned-BAM and Bed files here out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder, args.analysis_name) gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % ( args.output_folder, args.analysis_name) if not os.path.exists(gatk_depth_of_coverage_file): gatk_depth_of_coverage_file = coverage_depth_stats() final_raw_vcf = varcall() #final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (args.output_folder, args.analysis_name) filter(gatk_depth_of_coverage_file) stats() elif steps_list[0] == "filter": #Check Post-varcall vcf and other files here out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder, args.analysis_name) gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % ( 
args.output_folder, args.analysis_name) if not os.path.exists(gatk_depth_of_coverage_file): gatk_depth_of_coverage_file = coverage_depth_stats() final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % ( args.output_folder, args.analysis_name) filter(gatk_depth_of_coverage_file) stats() elif steps_list[0] == "stats": #check BAM and vcf files gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % ( args.output_folder, args.analysis_name) if not os.path.exists(gatk_depth_of_coverage_file): gatk_depth_of_coverage_file = coverage_depth_stats() out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder, args.analysis_name) final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % ( args.output_folder, args.analysis_name) stats() else: keep_logging( 'Seems like the Analysis Steps are not in sequential order. Please recheck the -steps argument and run the pipeline again', 'Seems like the Analysis Steps are not in sequential order. Please recheck the -steps argument and run the pipeline again', logger, 'exception')
def trim(input1, input2, out_path, crop, logger, Config):
    """Build and run the Trimmomatic command for paired-end or single-end reads.

    Paired-end mode is selected when ``input2`` is anything other than the
    string ``"None"``; otherwise the reads are processed as single-end.

    Args:
        input1: Forward (or only) FASTQ file.
        input2: Reverse FASTQ file, or the literal string ``"None"`` for SE.
        out_path: Output location; the configured output file names are
            appended to it directly, without inserting a path separator.
        crop: Crop length. When falsy, HEADCROP is applied instead of CROP.
        logger: Logger handed to ``keep_logging`` and ``call``.
        Config: Parsed configuration read through ``ConfigSectionMap``
            (sections ``bin_path`` and ``Trimmomatic``).

    Returns:
        None. Exits the process with status 1 if the command raises
        ``sp.CalledProcessError``.
    """
    settings = ConfigSectionMap("Trimmomatic", Config)
    binbase = ConfigSectionMap("bin_path", Config)['binbase']
    colon = settings['colon']
    paired = input2 != "None"

    if paired:
        keep_logging('Pre-processing PE reads using Trimmomatic.',
                     'Pre-processing PE reads using Trimmomatic.',
                     logger, 'info')
    else:
        keep_logging('Pre-processing SE reads using Trimmomatic.',
                     'Pre-processing SE reads using Trimmomatic.',
                     logger, 'info')

    adapter_file = (binbase + "/" + settings['trimmomatic_bin'] + "/"
                    + settings['adaptor_filepath'])
    clip_fields = [
        adapter_file,
        settings['seed_mismatches'],
        settings['palindrome_clipthreshold'],
        settings['simple_clipthreshold'],
    ]
    if paired:
        clean_filenames = " ".join(
            out_path + settings[key] for key in ('f_p', 'f_up', 'r_p', 'r_up'))
        # changing this parameter for KPC variant analysis for keeping both
        # reads. date: 31 August
        clip_fields += [settings['minadapterlength'],
                        settings['keep_both_reads']]
    else:
        clean_filenames = out_path + settings['f_p']

    illumina_string = 'ILLUMINACLIP:' + colon.join(clip_fields)
    sliding_string = ('SLIDINGWINDOW:' + settings['window_size'] + colon
                      + settings['window_size_quality'])
    minlen_string = 'MINLEN:' + settings['minlength']
    headcrop_string = 'HEADCROP:' + settings['headcrop_length']

    # NOTE: unlike adapter_file, the jar path is assembled without "/"
    # separators, so the configured values have to supply their own.
    jar_path = binbase + settings['trimmomatic_bin'] + "trimmomatic-0.36.jar"

    command = ["java", "-jar", jar_path]
    if paired:
        command.append("PE")
        if not crop:
            # Only the un-cropped paired-end run passes -phred33.
            command.append("-phred33")
        command += [input1, input2]
    else:
        command += ["SE", input1]
    command.append(clean_filenames)

    if crop:
        # str() lets a numeric crop length through; the previous SE variant of
        # this branch also contained a stray unary "+" that raised TypeError.
        command += ['CROP:' + str(crop), illumina_string, sliding_string,
                    minlen_string]
    else:
        command += [illumina_string, sliding_string, minlen_string,
                    headcrop_string]

    command.append("2> %s/%s_trim_out.log" % (
        out_path, os.path.basename(os.path.dirname(out_path))))
    cmdstring = " ".join(command)

    keep_logging(cmdstring, cmdstring, logger, 'debug')
    try:
        call(cmdstring, logger)
    except sp.CalledProcessError:
        keep_logging('Error in Trimming step. Exiting.',
                     'Error in Trimming step. Exiting.', logger, 'exception')
        sys.exit(1)
    keep_logging('End: Data Pre-processing', 'End: Data Pre-processing',
                 logger, 'info')
def pipeline(args, logger):
    """Run the pipeline end to end: checks, trimming, mapping, variant calling.

    Stages, in order: dependency checks, Trimmomatic pre-processing, BWA
    mapping, post-alignment BAM preparation, then either depth-of-coverage
    statistics only (when ``args.coverage_depth_stats`` is set) or BedGraph
    coverage, variant calling, variant filtering and statistics reports.

    Args:
        args: Parsed command-line arguments. Attributes read here: ``index``,
            ``type``, ``forward_raw``, ``reverse_raw``, ``output_folder``,
            ``croplength``, ``bam_input``, ``analysis_name`` and
            ``coverage_depth_stats``.
        logger: Logger passed to ``keep_logging`` and to the stage helpers.

    Returns:
        None. Calls ``sys.exit(1)`` when the configured variant caller is not
        one of the supported names.

    ``Config`` is not a parameter; it is taken from the surrounding scope.
    """
    def log(message, level='info'):
        # keep_logging takes two message arguments; nearly every call in this
        # function passes the same text for both.
        keep_logging(message, message, logger, level)

    log('START: Pipeline')

    # Check Subroutines and create logger object: Arguments, Input files,
    # Reference Index
    keep_logging('START: Checking Dependencies...', 'Checking Dependencies',
                 logger, 'info')

    # Reference Genome file name
    index_settings = ConfigSectionMap(args.index, Config)
    reference = index_settings['ref_path'] + "/" + index_settings['ref_name']
    log('Getting Reference Genome name from config file: {}'.format(reference))

    # Check FASTQ files; single-end runs pass the forward file in both slots.
    paired = args.type == "PE"
    if paired:
        file_exists(args.forward_raw, args.reverse_raw, reference)
    else:
        file_exists(args.forward_raw, args.forward_raw, reference)

    # Check Java Version
    java_check()
    keep_logging('END: Checking Dependencies...', 'END: Checking Dependencies',
                 logger, 'info')

    ## 1. Pre-Processing Raw reads using Trimmomatic
    log('START: Pre-Processing Raw reads using Trimmomatic')
    # Single-end runs signal the missing mate with the string "None".
    reverse_raw = args.reverse_raw if paired else "None"
    trimmomatic(args.forward_raw, reverse_raw, args.output_folder,
                args.croplength, logger, Config)
    log('END: Pre-Processing Raw reads using Trimmomatic')

    ## 2. Stages: Alignment using BWA
    log('START: Mapping Reads using BWA')
    split_field = prepare_readgroup(args.forward_raw, logger)
    files_to_delete = []
    out_sam = align(args.bam_input, args.output_folder, args.index,
                    split_field, args.analysis_name, files_to_delete, logger,
                    Config, args.type)
    log('END: Mapping Reads using BWA')

    ## 3. Stages: Post-Alignment using SAMTOOLS, PICARD etc
    log('START: Post-Alignment using SAMTOOLS, PICARD etc...')
    prepare_bam(out_sam, args.output_folder, args.analysis_name,
                files_to_delete, logger, Config)
    log('END: Post-Alignment using SAMTOOLS, PICARD etc...')
    # The sorted BAM path is rebuilt from the naming scheme; the value
    # returned by prepare_bam was always overwritten by this assignment.
    out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                             args.analysis_name)

    # Run Depth of Coverage Module after read mapping and stop. Dont proceed
    # to variant calling step.
    if args.coverage_depth_stats:
        gatk_DepthOfCoverage(out_sorted_bam, args.output_folder,
                             args.analysis_name, reference, logger, Config)
        alignment_stats(out_sorted_bam, args.output_folder,
                        args.analysis_name, logger, Config)
        return

    ## Continue: 3. Stages: Post-Alignment using SAMTOOLS, PICARD etc
    log('START: Creating BedGraph Coverage')
    bedgraph_coverage(out_sorted_bam, args.output_folder, args.analysis_name,
                      reference, logger, Config)
    bedtools(out_sorted_bam, args.output_folder, args.analysis_name, logger,
             Config)
    log('END: Creating BedGraph Coverage')

    ## 4. Stages: Variant Calling
    log('START: Variant Calling')
    caller = ConfigSectionMap("pipeline", Config)['variant_caller']
    caller_labels = {
        "samtoolswithpostalignbam":
            'Samtools and post-align bam input files',
        "gatkhaplotypecaller":
            'GATK haplotyper and post-align bam input files',
        "samtools":
            'Samtools without post-align bam input files.',
    }
    if caller not in caller_labels:
        log('Please provide Variant Caller name in config file under the '
            'section [pipeline]. Options for Variant caller: 1. samtools '
            '2. samtoolswithpostalignbam 3. gatkhaplotypecaller')
        # A configuration error must not report success to the calling shell.
        sys.exit(1)

    label = caller_labels[caller]
    log('START: Variant Calling using ' + label)
    if caller == "samtools":
        final_raw_vcf_mpileup = variant_calling(
            out_sorted_bam, args.output_folder, args.index,
            args.analysis_name, logger, Config)
        final_raw_vcf = remove_5_bp_snp_indel(
            final_raw_vcf_mpileup, args.output_folder, args.analysis_name,
            reference, logger, Config)
    else:
        # NOTE(review): these two callers invoke variant_calling with four
        # arguments while the samtools branch also passes logger and Config;
        # confirm against the helper's signature.
        out_finalbam = post_align_bam(out_sorted_bam, args.output_folder,
                                      args.index, args.analysis_name)
        final_raw_vcf = variant_calling(out_finalbam, args.output_folder,
                                        args.index, args.analysis_name)
    log('The final raw VCF file: {}'.format(final_raw_vcf), 'debug')
    log('END: Variant Calling using ' + label)
    log('END: Variant Calling')

    ## 5. Stages: Variant Filteration
    log('START: Variant Filteration')
    filter2_variants(final_raw_vcf, args.output_folder, args.analysis_name,
                     args.index, logger, Config)
    log('END: Variant Filteration')

    ## 6. Stages: Statistics
    log('START: Generating Statistics Reports')
    alignment_stats(out_sorted_bam, args.output_folder, args.analysis_name,
                    logger, Config)
    gatk_DepthOfCoverage(out_sorted_bam, args.output_folder,
                         args.analysis_name, reference, logger, Config)
    vcf_stats(final_raw_vcf, args.output_folder, args.analysis_name, logger,
              Config)
    log('END: Generating Statistics Reports')