# NOTE: these imports may already be present at the top of the module;
# they are listed here because the functions below depend on them.
import os
import re
import time


def statistics_time(out_dir, sample, process, logger_statistics_process,
                    logger_statistics_errors):
    if not os.path.isfile(process):
        store_statistics_logs('null', logger_statistics_errors,
                              process + " does not exist!\n")
    time_statistics = out_dir + '/' + sample + '_' + 'time_Cost'
    scriptdir = os.path.dirname(os.path.abspath(__file__))
    command = 'Rscript ' + scriptdir + '/statistics_time.R' + \
              ' -p ' + process + ' -o ' + time_statistics
    os.system(command)
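
# --- illustrative sketch, not called by the pipeline ---
# The pipeline shells out via os.system(), which silently ignores failures.
# A minimal alternative using subprocess.run (standard library) is shown
# below for reference; the '-p'/'-o' flags mirror the command built in
# statistics_time() above, and the function name is hypothetical.
import subprocess


def run_rscript_sketch(script_path, process_log, out_prefix):
    """Run an Rscript and raise if it exits non-zero (reference sketch)."""
    result = subprocess.run(
        ['Rscript', script_path, '-p', process_log, '-o', out_prefix],
        capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError('Rscript failed: ' + result.stderr)
    return result.stdout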
def statistics_sam_bam(samtools_dir, sam_bam, out_dir, sample, module,
                       logger_statistics_process, logger_statistics_errors,
                       renew):
    # statistics of the SAM/BAM file
    if not os.path.isfile(sam_bam):
        store_statistics_logs('null', logger_statistics_errors,
                              sam_bam + " does not exist!\n")
    align_statistics = out_dir + '/' + sample + '_' + module + '_statistics.txt'
    # 'is' compares identity, not value; use '==' for the string flag
    if not os.path.isfile(align_statistics) or renew == 'T':
        command = samtools_dir + ' stats ' + sam_bam + \
                  ' | grep ^SN | cut -f 2-3 > ' + align_statistics
        os.system(command)
    return align_statistics
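
# --- illustrative sketch, not called by the pipeline ---
# statistics_sam_bam() writes the 'SN' section of `samtools stats` with the
# leading 'SN' column cut away, i.e. lines shaped like
#   raw total sequences:<TAB>20000
# Assuming that two-column layout, a minimal parser for the file would be:
def parse_sn_statistics_sketch(path):
    """Parse 'key:<TAB>value' lines into a dict (reference sketch)."""
    stats = {}
    with open(path) as handle:
        for line in handle:
            key, _, value = line.rstrip('\n').partition('\t')
            stats[key.rstrip(':')] = value
    return stats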
def merge_statistics_sam_bam(logger_statistics_process,
                             logger_statistics_errors, out_dir, sample,
                             names, *args):
    for arg in args:
        if not os.path.isfile(arg):
            store_statistics_logs('null', logger_statistics_errors,
                                  arg + " does not exist!\n")
    statisticsfiles = ','.join(args)
    mergestatistics = out_dir + '/' + sample + '_merge_sam_bam_statisticsfile.txt'
    scriptdir = os.path.dirname(os.path.abspath(__file__))
    command = 'Rscript ' + scriptdir + '/statistics_merge_sam_bam_statisticsfile.R' + \
              ' -p ' + statisticsfiles + ' -g ' + names + ' -o ' + mergestatistics
    os.system(command)
def qc_raw_reads(fastQC_dir, out_dir, sample, module, read1, read2,
                 logger_statistics_process, logger_statistics_errors):
    qc_read1 = out_dir + '/' + os.path.basename(read1).split(
        ".fastq")[0] + '_fastqc.zip'
    qc_read2 = out_dir + '/' + os.path.basename(read2).split(
        ".fastq")[0] + '_fastqc.zip'
    if not (os.path.isfile(qc_read1) and os.path.isfile(qc_read2)):
        command1 = '{0} {1} {2} -o {3}'.format(fastQC_dir, read1, read2,
                                               out_dir)
        stdout, stderr = stdout_err(command1)
        store_statistics_logs(logger_statistics_process, 'null', stdout)
        store_statistics_logs('null', logger_statistics_errors, stderr)
        store_statistics_logs(logger_statistics_process, 'null',
                              'QC-{0} has been completed.\n'.format(module))
    else:
        store_statistics_logs(logger_statistics_process, 'null',
                              'QC-{0} exists.\n'.format(module))
    qc_statistics = out_dir + '/' + sample + '.' + module + '.statistics.txt'
    qc_result1 = getinfo(out_dir + '/' + os.path.basename(read1),
                         logger_statistics_process, logger_statistics_errors)
    qc_result2 = getinfo(out_dir + '/' + os.path.basename(read2),
                         logger_statistics_process, logger_statistics_errors)

    def read_label(path):
        # derive the read-direction label (e.g. 'R1_001') from
        # '<sample>_R1_001.fastq.gz'; str.lstrip/str.rstrip strip character
        # sets rather than prefixes/suffixes, so explicit slicing is used
        label = os.path.basename(path)
        if label.startswith(sample + '_'):
            label = label[len(sample) + 1:]
        if label.endswith('.fastq.gz'):
            label = label[:-len('.fastq.gz')]
        return label

    fout = open(qc_statistics, 'w')
    fout.write('\t'.join([
        'SampleID', 'Sequence direction', 'raw reads', 'min length',
        'max length', 'GC content', 'mean of per base quality',
        'low quality bases position', 'Q20', 'Q30', 'N bases position'
    ]) + '\n')
    fout.write('\t'.join([sample, read_label(read1),
                          '\t'.join(qc_result1[0:9])]) + '\n')
    fout.write('\t'.join([sample, read_label(read2),
                          '\t'.join(qc_result2[0:9])]) + '\n')
    fout.close()
    return qc_result1, qc_result2
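
# --- illustrative note, not called by the pipeline ---
# Several places in the original code used str.lstrip/str.rstrip to remove a
# prefix/suffix. Those methods strip *characters from a set*, which corrupts
# names whose edges share characters with the argument, e.g.:
#
#   >>> 'sample_gz.fastq.gz'.rstrip('.fastq.gz')
#   'sample_'            # removed far more than the '.fastq.gz' suffix
#
# A safe suffix removal (Python 3.9+ also offers str.removesuffix) is:
def strip_suffix_sketch(text, suffix):
    """Remove an exact trailing suffix, if present (reference sketch)."""
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text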
def main_run_germline_variant_calling(path_sampleID_sub):
    time_start1 = time.time()
    source = path_sampleID_sub[0].split('\t')[0]
    sample = path_sampleID_sub[0].split('\t')[1]
    if len(path_sampleID_sub[0].split('\t')) > 2:
        tailname = path_sampleID_sub[0].split('\t')[2]
        sample = sample + '_' + tailname
    # parameters
    (output, fastqc_dir, primers_file, exome_target_bed, min_read_len,
     common_seq1, common_seq2, num_threads, edit_dist, min_mapq,
     max_soft_clip, max_dist, memory_size, snp_filter, indel_filter, ref_ens,
     bwa_dir, samtools_dir, umitools_dir, gatk_dir, ref_index_name,
     ref_fa_file, total_ref_fa_file, total_ref_fa_dict, known_sites, erc,
     db_cosmic, db_clinvar, db_g1000, test_level, exome_target, calling,
     tabix, bgzip, bcftools_dir, varsan2_dir, strelka2_dir,
     total_ref_chrom_fa_file, datasets_dir, smcounter, mtdepth, rpb, ncpu,
     minbq, minmq, hplen, mismatchthr, mtdrop, maxmt, primerdist,
     bedtandemrepeats, bedrepeatmaskersubset, bedtools_dir,
     renew) = path_sampleID_sub[1:55]
    # check the output directory
    out_dir = output + '/' + sample
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # pipeline log directory
    log_dir = out_dir + '/' + 'log'
    if not os.path.isdir(log_dir):
        try:
            os.makedirs(log_dir)
        except OSError as e:
            if e.errno != 17:  # 17 == EEXIST: another process created it
                raise
    logger_pipeline_process = log_dir
    logger_pipeline_errors = log_dir

    ##########################################################################
    # QC
    ##########################################################################
    # time cost
    time_start = time.time()
    module = "QC"
    read1 = source + '/' + sample + '_R1_001.fastq.gz'
    read2 = source + '/' + sample + '_R2_001.fastq.gz'
    # 'tools' is expected to be defined at module level by the caller
    # if tools in ['all', 'qc']:
    if 'qc' in tools or 'all' in tools:
        print("Test QC module!\n")
        # qc_dir
        qc_dir = out_dir + '/' + 'QC'
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)
        logger_statistics_process = log_dir
        logger_statistics_errors = log_dir
        qc_result1, qc_result2 = qc_raw_reads(fastqc_dir, qc_dir, sample,
                                              module, read1, read2,
                                              logger_statistics_process,
                                              logger_statistics_errors)
        # check the quality of the raw reads
        if float(qc_result1[7].strip('%')) > 70 and float(
                qc_result2[7].strip('%')) > 70:
            print("The Q30 ratios of read1 and read2 are both higher "
                  "than 70%.")
        else:
            exit("The Q30 ratio of read1 or read2 is 70% or lower!")
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--QC of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        store_statistics_logs(
            logger_statistics_process, 'null',
            "--{0}--QC of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################
    # trim
    ##########################################################################
    # time cost
    time_start = time.time()
    # undetermined_dir
    undetermined_dir = out_dir + '/' + 'undetermined'
    trimmed1 = undetermined_dir + '/' + sample + '_R1_undetermined.fastq'
    trimmed2 = undetermined_dir + '/' + sample + '_R2_undetermined.fastq'
    stats_file = undetermined_dir + '/' + sample + '_basic_stats.txt'
    # if tools in ['all', 'trim']:
    if 'trim' in tools or 'all' in tools:
        print("Please check the QC subprocess result: the min read length!")
        print("The cutoff of the min read length is the default: {0}".format(
            min_read_len))
        print("Test trim module!\n\n\n")
        # mkdir undetermined_dir
        if not os.path.exists(undetermined_dir):
            os.makedirs(undetermined_dir)
        logger_trim_process = log_dir
        logger_trim_errors = log_dir
        trim_read_pairs(read1, read2, trimmed1, trimmed2, min_read_len,
                        common_seq1, common_seq2, stats_file,
                        logger_trim_process, logger_trim_errors)
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--Trimming of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        store_trim_logs(
            logger_trim_process, 'null',
            "--{0}--Trimming of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################
    # align
    ##########################################################################
    # time cost
    time_start = time.time()
    # aligned_dir
    aligned_dir = out_dir + '/' + 'aligned'
    trim_read1 = undetermined_dir + '/' + sample + '_R1_undetermined.fastq'
    trim_read2 = undetermined_dir + '/' + sample + '_R2_undetermined.fastq'
    out_file = aligned_dir + '/' + sample + '_aligned.sam'
    # if tools in ['all', 'align']:
    if 'align' in tools or 'all' in tools:
        print("Please check the Trim subprocess result: undetermined.fastq!")
        print("Test align module!\n")
        if not os.path.exists(aligned_dir):
            os.makedirs(aligned_dir)
        logger_bwa_process = log_dir
        logger_bwa_errors = log_dir
        align_reads_bwa(bwa_dir, samtools_dir, ref_fa_file, ref_index_name,
                        exome_target_bed, total_ref_fa_file, trim_read1,
                        trim_read2, out_file, num_threads, logger_bwa_process,
                        logger_bwa_errors, renew)
        store_align_logs(
            logger_bwa_process, 'null',
            "--{0}--Alignment of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--Alignment of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################
    # post_align
    ##########################################################################
    # time cost
    time_start = time.time()
    # post_aligned_dir
    filtered_dir = out_dir + '/' + 'filtered'
    # out_file comes from the align step
    alignment_sam = out_file
    out_file1 = filtered_dir + '/' + sample + '_tmp.sam'
    stats_file = filtered_dir + '/' + sample + '_align_stats.txt'
    primer_stats_file = filtered_dir + '/' + sample + '_primer_stats.csv'
    out_file2 = filtered_dir + '/' + sample + '_filtered.sam'
    # if tools in ['all', 'post_align']:
    if 'post_align' in tools or 'all' in tools:
        print("Please check the Align subprocess result: aligned.sam!")
        print("Test post align module!\n")
        if not os.path.exists(filtered_dir):
            os.makedirs(filtered_dir)
        logger_filter_process = log_dir
        logger_filter_errors = log_dir
        filter_alignment_samtools(samtools_dir, alignment_sam, min_mapq,
                                  max_soft_clip, out_file1, stats_file,
                                  logger_filter_process, logger_filter_errors)
        identify_gs_primers(samtools_dir, out_file1, primers_file, max_dist,
                            out_file2, stats_file, primer_stats_file,
                            logger_filter_process, logger_filter_errors)
        store_filter_logs(
            logger_filter_process, 'null',
            "--{0}--Post alignment of reads is completed after {1} min.\n".
            format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--Post alignment of reads is completed after {1} min.\n".
            format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################
    # barcode clustering
    ##########################################################################
    # time cost
    time_start = time.time()
    # clustered_dir
    clustered_dir = out_dir + '/' + 'clustered'
    filtered_sam = filtered_dir + '/' + sample + '_filtered.sam'
    filtered_bam = clustered_dir + '/' + sample + '_filtered.bam'
    sorted_bam = clustered_dir + '/' + sample + '_filtered_sorted.bam'
    umitool_stats = clustered_dir + '/' + sample + '_deduplicated'
    # umitool_stats = clustered_dir + '/' + sample + '_group.tsv'
    umis_sam = clustered_dir + '/' + sample + '_umis.sam'
    # if tools in ['all', 'cluster']:
    if 'cluster' in tools or 'all' in tools:
        print("Please check the post_align subprocess result: filtered.sam!")
        print("Test cluster module!\n")
        if not os.path.exists(clustered_dir):
            os.makedirs(clustered_dir)
        logger_umi_process = log_dir
        logger_umi_errors = log_dir
        umitool(samtools_dir, umitools_dir, filtered_sam, filtered_bam,
                sorted_bam, umitool_stats, umis_sam, edit_dist,
                logger_umi_process, logger_umi_errors)
        store_cluster_logs(
            logger_umi_process, 'null',
            "--{0}--UMI-tools clustering of reads is completed after {1} min.\n"
            .format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--Clustering of reads is completed after {1} min.\n".format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################
    # reformat
    ##########################################################################
    # time cost
    time_start = time.time()
    # reformated_dir
    reformated_dir = out_dir + '/' + 'reformated'
    alignment_sam = clustered_dir + '/' + sample + '_umis.sam'
    output_sam = reformated_dir + '/' + sample + '_vcready.sam'
    # if tools in ['all', 'reformat']:
    if 'reformat' in tools or 'all' in tools:
        print("Please check the cluster subprocess result: umis.sam!")
        print("Test reformat module!\n")
        if not os.path.exists(reformated_dir):
            os.makedirs(reformated_dir)
        logger_reformat_process = log_dir
        logger_reformat_errors = log_dir
        reformat_sam(alignment_sam, output_sam, logger_reformat_process,
                     logger_reformat_errors)
        store_reformat_logs(
            logger_reformat_process, 'null',
            '--{0}--Reformatting of the alignment SAM file is completed '
            'after {1} min.\n'.format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--Reformatting of the alignment SAM file is completed '
            'after {1} min.\n'.format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################
    # germline variant calling
    ##########################################################################
    # time cost
    time_start = time.time()
    # germline_vc_dir
    germline_vc_dir = out_dir + '/' + 'germline_vc'
    # expand the comma-separated known-sites list into GATK arguments
    known_sites = known_sites.replace(',',
                                      ' --known-sites ' + datasets_dir + '/')
    known_sites = datasets_dir + '/' + known_sites
    vready_sam = reformated_dir + '/' + sample + '_vcready.sam'
    # marked_bqsr_bam = germline_vc_dir + '/' + sample + '_sorted.MarkDuplicates.BQSR.bam'
    if os.path.basename(exome_target_bed) != 'all':
        exon_interval = germline_vc_dir + '/' + 'target_interval.list'
    else:
        exon_interval = 'all'
    # if tools in ['all', 'variant_call']:
    if 'variant_call' in tools or 'all' in tools:
        print("Please check the reformat subprocess result: vcready.sam!")
        print("Test variant_call module!\n")
        if not os.path.exists(germline_vc_dir):
            os.makedirs(germline_vc_dir)
        logger_germline_vc_process = log_dir
        logger_germline_vc_errors = log_dir
        bqsr = 'n'
        bam_to_variant, bqsr_bam_to_variant = sam_to_bam(
            gatk_dir, samtools_dir, vready_sam, sample, germline_vc_dir,
            memory_size, exome_target_bed, total_ref_fa_file,
            total_ref_fa_dict, known_sites, logger_germline_vc_process,
            logger_germline_vc_errors, bqsr, renew)
        callings = calling.split(',')
        # if calling == 'GATK':
        if 'GATK' in callings:
            germline_variant_calling(gatk_dir, bam_to_variant, sample,
                                     germline_vc_dir, memory_size,
                                     total_ref_fa_file, exon_interval, erc,
                                     snp_filter, indel_filter,
                                     logger_germline_vc_process,
                                     logger_germline_vc_errors)
        if 'strelka2' in callings:
            strelka2_call(strelka2_dir, bgzip, tabix,
                          total_ref_chrom_fa_file, germline_vc_dir, sample,
                          bam_to_variant, exome_target_bed,
                          logger_germline_vc_process,
                          logger_germline_vc_errors, renew)
        if 'samtools' in callings:
            samtools_call(samtools_dir, bcftools_dir, bam_to_variant, sample,
                          germline_vc_dir, total_ref_fa_file,
                          logger_germline_vc_process,
                          logger_germline_vc_errors)
        if 'varscan2' in callings:
            varsan2_call(samtools_dir, varsan2_dir, total_ref_fa_file,
                         germline_vc_dir, sample, bam_to_variant,
                         logger_germline_vc_process,
                         logger_germline_vc_errors)
        if 'smcounter' in callings:
            # smCounter works on the BAM before duplicate marking; strip the
            # exact suffix (str.rstrip would strip a character set instead)
            if bam_to_variant.endswith('.MarkDuplicates.RG.bam'):
                bam_to_variant = bam_to_variant[:-len(
                    '.MarkDuplicates.RG.bam')] + '.bam'
            threshold = 0
            logfile = germline_vc_dir + '/smcountlog'
            smcounter_call(smcounter, germline_vc_dir + '/' + sample,
                           bam_to_variant, exome_target_bed, mtdepth, rpb,
                           ncpu, minbq, minmq, hplen, mismatchthr, mtdrop,
                           maxmt, primerdist, threshold, total_ref_fa_file,
                           bedtandemrepeats, bedrepeatmaskersubset,
                           bedtools_dir, logfile,
                           logger_germline_vc_process,
                           logger_germline_vc_errors, renew)
        store_germline_vc_logs(
            logger_germline_vc_process, 'null',
            '--{0}--Germline variant calling is completed after {1} min.\n'.
            format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--Variant calling is completed after {1} min.\n'.format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################
    # annotation of the called variants
    ##########################################################################
    # time cost
    time_start = time.time()
    # annotation_dir
    annotation_dir = out_dir + '/' + 'annotation'
    raw_vcf = germline_vc_dir + '/' + sample + '.raw_variants.vcf'
    snp_vcf = germline_vc_dir + '/' + sample + '.raw_variants_SNP.vcf'
    indel_vcf = germline_vc_dir + '/' + sample + '.raw_variants_indel.vcf'
    # if tools in ['all', 'annotation']:
    if 'annotation' in tools or 'all' in tools:
        print("Please check the variant_call subprocess result: VCF!")
        print("Test annotation module!\n")
        if not os.path.exists(annotation_dir):
            os.makedirs(annotation_dir)
        logger_annotation_process = log_dir
        logger_annotation_errors = log_dir
        snp_limit, indel_limit = read_vcf_filter(snp_filter, indel_filter)
        callings = calling.split(',')
        for callingsub in callings:
            annotationmain(db_cosmic, db_clinvar, db_g1000, ref_ens, raw_vcf,
                           sample, snp_limit, indel_limit, annotation_dir,
                           logger_annotation_process,
                           logger_annotation_errors, callingsub)
        if 'GATK' in callings:
            callingsub = 'GATK'
            annotationmain(db_cosmic, db_clinvar, db_g1000, ref_ens, snp_vcf,
                           sample, snp_limit, indel_limit, annotation_dir,
                           logger_annotation_process,
                           logger_annotation_errors, callingsub)
            annotationmain(db_cosmic, db_clinvar, db_g1000, ref_ens,
                           indel_vcf, sample, snp_limit, indel_limit,
                           annotation_dir, logger_annotation_process,
                           logger_annotation_errors, callingsub)
        store_annotation_logs(
            logger_annotation_process, 'null',
            '--{0}--Annotation of the variants is completed after {1} min.\n'.
            format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--Annotation of the variants is completed after {1} min.\n'.
            format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################
    # statistics of the variant calling pipeline
    ##########################################################################
    # time cost
    time_start = time.time()
    # statistics dir
    statistics_dir = out_dir + '/' + 'statistics'
    if not os.path.exists(statistics_dir):
        os.makedirs(statistics_dir)
    # statistics of the clean reads: trim_QC dir
    statistics_trim_dir = statistics_dir + '/' + 'trim_QC'
    if not os.path.exists(statistics_trim_dir):
        os.makedirs(statistics_trim_dir)
    # if tools in ['all', 'statis']:
    if 'statis' in tools or 'all' in tools:
        print("Please check the other subprocess results!")
        print("Test statistics module!\n")
        logger_statistics_process = log_dir
        logger_statistics_errors = log_dir
        module = "Trim"
        trim_result1, trim_result2 = qc_raw_reads(
            fastqc_dir, statistics_trim_dir, sample, module, trimmed1,
            trimmed2, logger_statistics_process, logger_statistics_errors)
        # statistics of the align step
        module1 = "Align"
        align_sorted_bam = statistics_depth_coverage(
            samtools_dir, out_file, statistics_dir, sample, module1,
            exome_target, exome_target_bed, logger_statistics_process,
            logger_statistics_errors, renew)
        align_statistics = statistics_sam_bam(
            samtools_dir, align_sorted_bam, statistics_dir, sample, module1,
            logger_statistics_process, logger_statistics_errors, renew)
        # statistics of the filter step; the cluster module used to build the
        # filtered sorted BAM, but that moved to the UMI-tools step
        module2 = "Filter"
        filtered_sorted_bam = statistics_depth_coverage(
            samtools_dir, filtered_sam, statistics_dir, sample, module2,
            exome_target, exome_target_bed, logger_statistics_process,
            logger_statistics_errors, renew)
        filter_statistics = statistics_sam_bam(
            samtools_dir, filtered_sorted_bam, statistics_dir, sample,
            module2, logger_statistics_process, logger_statistics_errors,
            renew)
        # statistics of the UMI-tools step
        module3 = "Cluster_reformat"
        cr_sorted_bam = statistics_depth_coverage(
            samtools_dir, vready_sam, statistics_dir, sample, module3,
            exome_target, exome_target_bed, logger_statistics_process,
            logger_statistics_errors, renew)
        cr_statistics = statistics_sam_bam(
            samtools_dir, cr_sorted_bam, statistics_dir, sample, module3,
            logger_statistics_process, logger_statistics_errors, renew)
        # statistics of the per-base MT depth
        statistics_mtdepth_coverage(germline_vc_dir, statistics_dir, sample,
                                    exome_target, logger_statistics_process,
                                    logger_statistics_errors)
        # merge the statistics of the sorted BAMs
        merge_statistics_sam_bam(logger_statistics_process,
                                 logger_statistics_errors, statistics_dir,
                                 sample,
                                 ','.join([module1, module2, module3]),
                                 align_statistics, filter_statistics,
                                 cr_statistics)
        store_statistics_logs(
            logger_statistics_process, 'null',
            '--{0}--Statistics is completed after {1} min.\n'.format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--Statistics is completed after {1} min.\n'.format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--Pipeline is completed after {1} min.\n'.format(
                sample, ('%.2f' % ((time.time() - time_start1) / 60))))
        # statistics of the time cost
        process_log = log_dir + '/' + 'process.log'
        statistics_time(statistics_dir, sample, process_log,
                        logger_statistics_process, logger_statistics_errors)
        print("--" * 20 + '\n\n')
    return 'Pipeline : {0} is completed after {1} min.'.format(
        sample, ('%.2f' % ((time.time() - time_start1) / 60)))
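
# --- illustrative sketch, not called by the pipeline ---
# main_run_germline_variant_calling() handles one sample per call, and the
# samples do not depend on each other, so a driver could fan them out over a
# worker pool. 'sample_parameter_tuples' is an assumed name for a list of
# per-sample argument lists shaped like path_sampleID_sub above.
from multiprocessing import Pool


def run_all_samples_sketch(sample_parameter_tuples, processes=4):
    """Run each sample through the pipeline in a worker pool (sketch)."""
    with Pool(processes=processes) as pool:
        for message in pool.imap_unordered(main_run_germline_variant_calling,
                                           sample_parameter_tuples):
            print(message)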
def getinfo(fastqc, logger_statistics_process, logger_statistics_errors):
    qc_read = fastqc.split(".fastq")[0] + '_fastqc.zip'
    # unzip produces a '<name>_fastqc' directory, so test with isdir
    if not os.path.isdir(fastqc.split(".fastq")[0] + '_fastqc'):
        command1 = 'unzip' + ' -o ' + qc_read + ' -d ' + os.path.dirname(
            qc_read)
        stdout, stderr = stdout_err(command1)
        store_statistics_logs(logger_statistics_process, 'null', stdout)
        store_statistics_logs('null', logger_statistics_errors, stderr)
        store_statistics_logs(logger_statistics_process, 'null',
                              '{0} has been completed.\n'.format(qc_read))
    # str.rstrip strips a character set, so remove the exact '.zip' suffix
    qc_data = qc_read[:-len('.zip')] + '/' + 'fastqc_data.txt'
    qcdata = open(qc_data, "r")
    modules = 0
    value = re.compile(r'\d+')
    perbasesequencequalit_posi = []
    perbasesequencequalit = []
    persequencequalityscores = []
    persequencequalityreads = []
    per_basen1 = []
    perbase_ncontent = []
    f = qcdata.readlines()
    qcdata.close()
    for raw_line in f:
        line = raw_line.strip()
        if '>>END_MODULE' in line:
            modules = modules + 1
        if 'Total Sequences' in line:
            raw_reads = re.findall(r'\d+', line)
        if 'Sequence length' in line:
            seq_length = line.split('\t')[1]
            if '-' in seq_length:
                seq_length_min, seq_length_max = seq_length.split('-')
            else:
                seq_length_min = str(seq_length)
                seq_length_max = str(seq_length)
        if '%GC' in line:
            gc = re.findall(r'\d+', line)
        # data rows start with a digit; value.match avoids indexing an
        # empty line
        if modules == 1 and value.match(line):
            perbasesequencequalit_posi.append(line.split('\t')[0])
            perbasesequencequalit.append(float(line.split('\t')[1]))
        if modules == 3 and value.match(line):
            persequencequalityscores.append(line.split('\t')[0])
            persequencequalityreads.append(float(line.split('\t')[1]))
        if modules == 6 and value.match(line):
            per_basen1.append(line.split('\t')[0])
            perbase_ncontent.append(line.split('\t')[1])
    # mean of the per base sequence quality
    all_perbasesequencequalit = 0
    for i in range(0, len(perbasesequencequalit)):
        if '-' not in perbasesequencequalit_posi[i]:
            all_perbasesequencequalit += float(perbasesequencequalit[i])
        else:
            num1, num2 = perbasesequencequalit_posi[i].split('-')
            all_perbasesequencequalit += float(perbasesequencequalit[i]) * (
                int(num2) - int(num1) + 1)
    perbasesequencequalit_mean = all_perbasesequencequalit / int(
        seq_length_max)
    # low quality bases: mean per-base quality < 25
    lowqualit_bases = []
    for i in range(0, len(perbasesequencequalit)):
        if float(perbasesequencequalit[i]) < 25:
            lowqualit_bases.append(perbasesequencequalit_posi[i])
    if len(lowqualit_bases) == 0:
        lowqualit_bases.append('NULL')
    lowqualit_bases_region = split_n_bases(','.join(lowqualit_bases))[0]
    # percentages of Q20 and Q30: the fraction of reads whose mean quality
    # is at least 20 (or 30). Summing the bins directly also covers
    # histograms that start above or end below the threshold, where the
    # original lookup of the exact '20'/'30' bin could miss or stay
    # undefined.
    total_reads = float(raw_reads[0])
    q20 = sum(persequencequalityreads[i]
              for i in range(len(persequencequalityscores))
              if int(persequencequalityscores[i]) >= 20) / total_reads
    q30 = sum(persequencequalityreads[i]
              for i in range(len(persequencequalityscores))
              if int(persequencequalityscores[i]) >= 30) / total_reads
    # N content
    nbases = []
    for i in range(0, len(perbase_ncontent)):
        if float(perbase_ncontent[i]) > 0:
            nbases.append(per_basen1[i])
    if len(nbases) == 0:
        nbases.append('NULL')
    nbase_region = split_n_bases(','.join(nbases))
    return [
        raw_reads[0], seq_length_min, seq_length_max, gc[0],
        str(round(perbasesequencequalit_mean, 3)), lowqualit_bases_region,
        str('%.3f%%' % (q20 * 100)),
        str('%.3f%%' % (q30 * 100)), nbase_region[0]
    ]
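
# --- illustrative sketch, not called by the pipeline ---
# getinfo() walks fastqc_data.txt and keys each data row off how many
# '>>END_MODULE' markers have been seen so far (module 1 is 'Per base
# sequence quality', module 3 'Per sequence quality scores', module 6
# 'Per base N content' in the FastQC layout this parser assumes). The
# counting on its own looks like this:
def count_fastqc_modules_sketch(lines):
    """Group data rows by the number of '>>END_MODULE' markers seen."""
    grouped = {}
    modules = 0
    for line in lines:
        line = line.strip()
        if '>>END_MODULE' in line:
            modules += 1
        elif line and line[0].isdigit():
            grouped.setdefault(modules, []).append(line.split('\t'))
    return grouped


# example with a fabricated two-module fragment:
# count_fastqc_modules_sketch([
#     '>>Per base sequence quality\tpass', '1\t33.5', '2\t34.0',
#     '>>END_MODULE',
#     '>>Per tile sequence quality\tpass', '1\t0.1', '>>END_MODULE',
# ]) == {0: [['1', '33.5'], ['2', '34.0']], 1: [['1', '0.1']]}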
def statistics_depth_coverage(samtools_dir, sam_bam, out_dir, sample, module,
                              exome_target, exome_target_bed,
                              logger_statistics_process,
                              logger_statistics_errors, renew):
    # get the path of the R scripts
    scriptdir = os.path.dirname(os.path.abspath(__file__))
    if not os.path.isfile(sam_bam):
        store_statistics_logs('null', logger_statistics_errors,
                              sam_bam + " does not exist!\n")
    # remove the exact '.sam' suffix (str.rstrip strips a character set,
    # not a suffix)
    prefix = sam_bam[:-len('.sam')] if sam_bam.endswith('.sam') else sam_bam
    sorted_bam = prefix + '_sorted.bam'
    bam = prefix + '.bam'
    if not os.path.isfile(sorted_bam) or renew == 'T':
        if not os.path.isfile(bam):
            command1 = samtools_dir + ' view -bS ' + sam_bam + ' -o ' + bam
            stdout, stderr = stdout_err(command1)
            store_statistics_logs(logger_statistics_process, 'null', stdout)
            store_statistics_logs('null', logger_statistics_errors, stderr)
            store_statistics_logs(
                logger_statistics_process, 'null',
                '{0} has been transformed to bam.\n'.format(sam_bam))
            command2 = samtools_dir + ' sort ' + bam + ' -o ' + sorted_bam
            stdout, stderr = stdout_err(command2)
            store_statistics_logs(logger_statistics_process, 'null', stdout)
            store_statistics_logs('null', logger_statistics_errors, stderr)
            os.system('rm -rf {0}'.format(bam))
        else:
            command2 = samtools_dir + ' sort ' + bam + ' -o ' + sorted_bam
            stdout, stderr = stdout_err(command2)
            store_statistics_logs(logger_statistics_process, 'null', stdout)
            store_statistics_logs('null', logger_statistics_errors, stderr)
    sorted_bam_index = sorted_bam + '.bai'
    if not os.path.isfile(sorted_bam_index) or renew == 'T':
        command3 = samtools_dir + ' index ' + sorted_bam
        stdout, stderr = stdout_err(command3)
        store_statistics_logs(logger_statistics_process, 'null', stdout)
        store_statistics_logs('null', logger_statistics_errors, stderr)
    # numbers of reads in the target region
    num_reads_in_target_region = out_dir + '/' + sample + '_' + module + \
        '_numbersReadsInTargetRegion.txt'
    if not os.path.isfile(num_reads_in_target_region) or renew == 'T':
        command4 = samtools_dir + ' idxstats ' + sorted_bam + ' -o ' + \
            num_reads_in_target_region
        stdout, stderr = stdout_err(command4)
        store_statistics_logs(logger_statistics_process, 'null', stdout)
        store_statistics_logs('null', logger_statistics_errors, stderr)
    # coverage of reads in the target region
    coverage_in_target_region = out_dir + '/' + sample + '_' + module + \
        '_coverageInTargetRegion.txt'
    if not os.path.isfile(coverage_in_target_region) or renew == 'T':
        command5 = '{0} mpileup {1} | perl -alne \'{2}\' > {3}'.format(
            samtools_dir, sorted_bam,
            '{$pos{$F[0]}++;$depth{$F[0]}+=$F[3]} END{print "$_\t$pos{$_}\t$depth{$_}" foreach sort keys %pos}',
            coverage_in_target_region)
        os.system(command5)
    # statistics and plot of the depth and coverage in the target region
    statistics_plot = out_dir + '/' + sample + '_' + module + \
        '_depth_coverageInTargetRegion'
    if not os.path.isfile(statistics_plot + '.pdf') or renew == 'T':
        command6 = 'Rscript ' + scriptdir + '/statistics_depth_coverage.R' + \
            ' -p ' + num_reads_in_target_region + \
            ' -s ' + coverage_in_target_region + \
            ' -r ' + exome_target_bed + ' -o ' + statistics_plot
        stdout, stderr = stdout_err(command6)
        store_statistics_logs(logger_statistics_process, 'null', stdout)
        store_statistics_logs('null', logger_statistics_errors, stderr)
    # statistics of the depth of the bases in the region
    if module == 'Filter':
        bases_depth_in_region = out_dir + '/' + sample + '_' + module + \
            '_basesDepthInRegion.txt'
        if not os.path.isfile(bases_depth_in_region) or renew == 'T':
            command7 = '{0} depth {1} > {2}'.format(samtools_dir, sorted_bam,
                                                    bases_depth_in_region)
            os.system(command7)
        # statistics of the depth of the exon region
        statistics_plot1 = out_dir + '/' + sample + '_' + module + \
            '_basesDepthInTargetExon'
        if os.path.isfile(statistics_plot1 + '.exon_statis.txt'):
            os.system('rm {0}'.format(statistics_plot1 + '.exon_statis.txt'))
        command7 = ''.join([
            'Rscript ', scriptdir,
            '/statistics_bases_coverage_on_target_exon.R', ' -p ',
            bases_depth_in_region, ' -s ', exome_target, ' -t True ', ' -o ',
            statistics_plot1
        ])
        os.system(command7)
    # depth of the bases in the target region
    bases_depth_in_target_region = out_dir + '/' + sample + '_' + module + \
        '_basesDepthInTargetRegion.txt'
    if not os.path.isfile(bases_depth_in_target_region) or renew == 'T':
        command7 = '{0} mpileup {1} | perl -alne \'{2}\' > {3}'.format(
            samtools_dir, sorted_bam,
            '{$depth{$F[3]}++}END{print "$_\t$depth{$_}" foreach sort{$a <=> $b}keys %depth}',
            bases_depth_in_target_region)
        os.system(command7)
    # statistics and plot of the per-base depth in the target region
    statistics_plot2 = out_dir + '/' + sample + '_' + module + \
        '_basesDepthInTargetRegion'
    command8 = 'Rscript ' + scriptdir + '/statistics_bases_depth.R' + \
        ' -p ' + bases_depth_in_target_region + ' -o ' + statistics_plot2
    stdout, stderr = stdout_err(command8)
    store_statistics_logs(logger_statistics_process, 'null', stdout)
    store_statistics_logs('null', logger_statistics_errors, stderr)
    return sorted_bam
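
# --- illustrative sketch, not called by the pipeline ---
# The perl one-liners above aggregate `samtools mpileup` output, whose
# columns are: chrom, pos, ref base, depth, read bases, base qualities.
# The first one counts covered positions and sums depth per chromosome;
# equivalent logic in plain Python, reading the pileup from an iterable
# of lines, would be:
def summarize_pileup_sketch(pileup_lines):
    """Per-chromosome covered-position count and summed depth (sketch)."""
    positions = {}
    depths = {}
    for line in pileup_lines:
        fields = line.split('\t')
        chrom, depth = fields[0], int(fields[3])
        positions[chrom] = positions.get(chrom, 0) + 1
        depths[chrom] = depths.get(chrom, 0) + depth
    return {chrom: (positions[chrom], depths[chrom]) for chrom in positions}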