def workflow(self):
    """Convert the CNV VCF into a TSV table plus a fold-change stat file.

    Skips all work when both outputs already match their recorded md5
    checksums; otherwise rebuilds both files from scratch.

    Writes:
        self._cnv_tsv     -- header + one row per record with a non-empty ALT
        self._cnv_fc_stat -- gene/fold-change pair for every record
    """
    tsv_md5 = '{0}.md5'.format(self._cnv_tsv)
    fc_stat_md5 = '{0}.md5'.format(self._cnv_fc_stat)
    if md5sum_check(self._cnv_tsv, tsv_md5) and md5sum_check(
            self._cnv_fc_stat, fc_stat_md5):
        log_progress(__modname__,
                     'CNV TSV file generation already finished!!!',
                     f=self._log_file)
        return
    log_progress(__modname__, 'CNV TSV file generation start',
                 f=self._log_file)
    if os.path.exists(self._cnv_tsv):
        os.remove(self._cnv_tsv)
    if os.path.exists(self._cnv_fc_stat):
        os.remove(self._cnv_fc_stat)
    # Open every file once via context managers so handles are always
    # closed.  The original reopened the fc-stat file in append mode for
    # every VCF record and never closed the VCF input handle.
    with open(self._cnv_vcf, 'r') as vcf_fh, \
            open(self._cnv_tsv, 'w') as cnv_tsv, \
            open(self._cnv_fc_stat, 'a') as fc_stat:
        vcf_reader = vcf.Reader(vcf_fh)
        cnv_tsv.write(
            'chromosome\tstart\tend\treference\talternate\tSV_type\tgene\tfold_change\n'
        )
        for record in vcf_reader:
            if record.ALT[0] is not None:
                chrom = record.CHROM
                start_pos = record.POS
                end_pos = record.INFO['END']
                allele_reference = record.REF
                if str(record.ALT[0]) == '<DUP>':
                    allele_alternate = 'DUP'
                elif str(record.ALT[0]) == '<DEL>':
                    allele_alternate = 'DEL'
                else:
                    # BUG FIX: previously left unbound (NameError on the
                    # first record) or stale from an earlier record when
                    # ALT is neither <DUP> nor <DEL>.
                    allele_alternate = str(record.ALT[0])
                gene_name = record.INFO['ANT']
                sv_type = record.INFO['SVTYPE']
                cnv_tsv.write(
                    '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format(
                        chrom, int(start_pos), int(end_pos),
                        allele_reference, allele_alternate, sv_type,
                        gene_name, float(record.samples[0]['FC'])))
            # Fold-change stat row is written for every record, even when
            # ALT is missing (matches the original behaviour).
            fc_stat.write('{0}\t{1}\n'.format(record.INFO['ANT'],
                                              record.samples[0]['FC']))
    run_command_md5sum(__modname__, self._log_file, self._cnv_tsv,
                       tsv_md5)
    run_command_md5sum(__modname__, self._log_file, self._cnv_fc_stat,
                       fc_stat_md5)
    log_progress(__modname__, 'CNV TSV file generation finished',
                 f=self._log_file)
def low_confidence_annotation(self, input_file, lowconf_vcf):
    """Annotate low-confidence variants via the project's homopolymer
    and repeat-count helper scripts, producing *lowconf_vcf*.

    No-op when the checksummed output is already up to date.
    """
    low_conf_homopolyx = join(
        self._variant_dir,
        '{0}_lowconf.homopolyx'.format(self._sample_name))
    lowconf_vcf_md5 = '{0}.md5'.format(lowconf_vcf)
    if md5sum_check(lowconf_vcf, lowconf_vcf_md5):
        log_progress(__modname__,
                     'Low confidence annotation already finished!!!',
                     f=self._log_file)
        return
    log_progress(__modname__, 'Low confidence annotation start',
                 f=self._log_file)
    # Start from a clean slate so stale partial outputs never survive.
    for stale in (low_conf_homopolyx, lowconf_vcf):
        if os.path.exists(stale):
            os.remove(stale)
    homopolyx_cmd = [
        'python', self._sw['ngb_lowconf_homopolyx'], '-p', '5', '-r',
        self._sw['hg19'], '-o', low_conf_homopolyx, input_file
    ]
    run_command(__modname__, homopolyx_cmd, self._log_file)
    repeatcnt_cmd = [
        'python', self._sw['ngb_lowconf_repeatcnt'], '-r',
        self._sw['hg19'], '-o', lowconf_vcf, low_conf_homopolyx
    ]
    run_command(__modname__, repeatcnt_cmd, self._log_file)
    run_command_md5sum(__modname__, self._log_file, lowconf_vcf,
                       lowconf_vcf_md5)
    log_progress(__modname__, 'Low confidence annotation finished',
                 f=self._log_file)
def remove_reference_info(self, input_file, remove_ref_vcf):
    """Strip reference-only (0/0) and uncalled (./.) genotype records
    from *input_file* and write the vt-sorted result to *remove_ref_vcf*.

    No-op when the checksummed output is already up to date.
    """
    remove_ref_vcf_md5 = '{0}.md5'.format(remove_ref_vcf)
    # remove only reference...
    if md5sum_check(remove_ref_vcf, remove_ref_vcf_md5):
        log_progress(__modname__,
                     'Remove only reference in VCF already finished!!!',
                     f=self._log_file)
        log_version(__modname__, self._sw['vcftools_ver'],
                    f=self._log_file)
        log_version(__modname__, self._sw['vt_ver'], f=self._log_file)
    else:
        log_progress(__modname__, 'Remove only reference in VCF start',
                     f=self._log_file)
        log_version(__modname__, self._sw['vcftools_ver'],
                    f=self._log_file)
        log_version(__modname__, self._sw['vt_ver'], f=self._log_file)
        if os.path.exists(remove_ref_vcf):
            os.remove(remove_ref_vcf)
        # Pipeline: vcftools recode -> drop 0/0 -> drop ./. -> vt sort.
        exec_cmd = [
            '{0} --vcf {1} --recode --stdout'.format(
                self._sw['vcftools'], input_file),
            'grep -v "0[/|]0"',
            # FIX: raw string literal; '\.' is an invalid escape sequence
            # and raises a SyntaxWarning on modern Python (the runtime
            # bytes are identical either way).
            r'grep -v "\.[/|]\."',
            '{0} sort -'.format(self._sw['vt'])
        ]
        run_command_pipe_file_handle(__modname__, exec_cmd,
                                     self._log_file, 'w', remove_ref_vcf)
        run_command_md5sum(__modname__, self._log_file, remove_ref_vcf,
                           remove_ref_vcf_md5)
        log_progress(__modname__,
                     'Remove only reference in VCF finished',
                     f=self._log_file)
def _load_tab_stats(self, stats_path, err_prefix, stat_data):
    """Merge two-column tab-separated key/value lines from *stats_path*
    into *stat_data* (spaces in keys become underscores).

    Logs *err_prefix* plus the exception and exits the process on any
    read/parse failure.
    """
    try:
        with open(stats_path, "r") as f:
            for line in f:
                sp = line.replace("\n", "").split("\t")
                if len(sp) == 2:
                    stat_data[sp[0].replace(" ", "_")] = sp[1]
    except Exception as ex:
        log_error(__modname__, "{0}: {1}".format(err_prefix, ex),
                  f=self._log_file)
        sys.exit(1)

def run(self, summary_file, mapq_file, stat_json_file, flag):
    """Build the QC statistics JSON from the stat summary and mapping
    quality files, then checksum it.

    flag -- sample type ("solid" or "blood"); selects the uniformity
            cutoff used by self.workflow().
    """
    stat_json_file_md5 = "{0}.md5".format(stat_json_file)
    if md5sum_check(stat_json_file, stat_json_file_md5):
        log_progress(__modname__, "Analysis Statistics already finished",
                     f=self._log_file)
        return
    log_progress(__modname__, "Analysis Statistics start",
                 f=self._log_file)
    # Sample-type-dependent uniformity cutoff.
    if flag == "solid":
        self._cutoff_uniformity05 = 5
    elif flag == "blood":
        self._cutoff_uniformity05 = 10
    stat_data = {}
    # The two inputs share an identical format; parsing was previously
    # duplicated inline -> extracted to _load_tab_stats.
    self._load_tab_stats(summary_file, "Parsing stat summary file error",
                         stat_data)
    self._load_tab_stats(mapq_file, "Parsing mapping quality file error",
                         stat_data)
    json_data = {"qc_data": self.workflow(stat_data)}
    try:
        with open(stat_json_file, "w") as make_json_file:
            json.dump(json_data, make_json_file, ensure_ascii=False,
                      sort_keys=True, indent=2)
        run_command_md5sum(__modname__, self._log_file, stat_json_file,
                           stat_json_file_md5)
    except Exception as ex:
        log_error(__modname__, "{0}".format(ex), f=self._log_file)
        sys.exit(1)
    log_progress(__modname__, "Analysis Statistics finished",
                 f=self._log_file)
def pileup_depth(self, pileup_depth):
    """Produce a per-base pileup depth file over the target BED using
    `samtools depth`, unless the checksummed output already exists."""
    md5_file = '%s.md5' % (pileup_depth)
    if md5sum_check(pileup_depth, md5_file):
        log_progress(__modname__, 'Get Pileup Depth already finished!!!',
                     f=self._log_file)
        log_version(__modname__, self._sw['samtools_ver'],
                    f=self._log_file)
        return
    log_progress(__modname__, 'Get Pileup Depth start',
                 f=self._log_file)
    log_version(__modname__, self._sw['samtools_ver'], f=self._log_file)
    depth_cmd = [
        self._sw['samtools'], 'depth', '-a',
        '-q', '0',        # minimum base quality
        '-Q', '1',        # minimum mapping quality
        '-d', '1000000',  # per-position depth cap
        '-b', self._target_bed,
        '--reference', self._sw['hg19'],
        self._final_bam
    ]
    run_command_file_handle(__modname__, depth_cmd, self._log_file, 'w',
                            pileup_depth)
    run_command_md5sum(__modname__, self._log_file, pileup_depth,
                       md5_file)
    log_progress(__modname__, 'Get Pileup Depth finished',
                 f=self._log_file)
def copy_bam_files(self):
    """Copy the pipeline's BAM/BAI (plus its Picard target BED) into the
    alignment output directory and generate a TDF coverage track.

    Exits the process on an unrecognized pipeline code.
    """
    self._final_bam = join(self._align_dir,
                           '{0}_final.bam'.format(self._sample_name))
    bam_dst_md5 = '{0}.md5'.format(self._final_bam)
    if md5sum_check(self._final_bam, bam_dst_md5):
        log_progress(
            __modname__,
            'Copy the BAM file to output directory already finished',
            f=self._log_file)
        return
    log_progress(__modname__, 'Copy the BAM file to output directory',
                 f=self._log_file)
    ### 901: DNA, 902: RNA
    per_pipeline = {
        '901': ('DNA_IntermediateFiles', '{0}_realigned.bam',
                'DNA_PicardTarget.bed'),
        '902': ('RNA_IntermediateFiles', '{0}.bam',
                'RNA_PicardTarget.bed'),
    }
    if self._pipeline not in per_pipeline:
        log_error(__modname__,
                  'Unknown pipeline code {0} for TST170 pipeline'.format(
                      self._pipeline),
                  f=self._log_file)
        sys.exit(1)
    subdir, bam_pattern, bed_name = per_pipeline[self._pipeline]
    bam_dir = join(self._tst170_dir, subdir, 'Alignment')
    bam_src = join(bam_dir, bam_pattern.format(self._sample_name))
    self.copy_files(bam_src, self._final_bam)
    self.copy_files('{0}.bai'.format(bam_src),
                    '{0}.bai'.format(self._final_bam))
    self.copy_files(join(bam_dir, bed_name),
                    join(self._align_dir, bed_name))
    self.generate_tdf_file(self._final_bam)
    run_command_md5sum(__modname__, self._log_file, self._final_bam,
                       bam_dst_md5)
    log_progress(__modname__, 'Copy the BAM file finished',
                 f=self._log_file)
def vcf_post_processing(self, input_file, refined_vcf):
    """Normalize and decompose the VCF with `vt`, writing *refined_vcf*.

    No-op when the checksummed output is already up to date.
    """
    refined_vcf_md5 = '{0}.md5'.format(refined_vcf)
    if md5sum_check(refined_vcf, refined_vcf_md5):
        log_progress(__modname__,
                     'VCF post processing already finished!!!',
                     f=self._log_file)
        log_version(__modname__, self._sw['vt_ver'], f=self._log_file)
        return
    log_progress(__modname__, 'VCF post processing start',
                 f=self._log_file)
    log_version(__modname__, self._sw['vt_ver'], f=self._log_file)
    if os.path.exists(refined_vcf):
        os.remove(refined_vcf)
    # vt normalize (against the hg19 reference) piped into vt decompose
    # (splits multi-allelic records, -s keeps genotype info in sync).
    vt_pipeline = [
        '{0} normalize -r {1} {2}'.format(self._sw['vt'],
                                          self._sw['hg19'], input_file),
        '{0} decompose -s -'.format(self._sw['vt']),
    ]
    run_command_pipe_file_handle(__modname__, vt_pipeline,
                                 self._log_file, 'w', refined_vcf)
    run_command_md5sum(__modname__, self._log_file, refined_vcf,
                       refined_vcf_md5)
    log_progress(__modname__, 'VCF post processing finished',
                 f=self._log_file)
def run(self):
    """Entry point: run the VCF-to-JSON workflow unless the checksummed
    JSON output is already up to date."""
    self._md5_file = '{0}.md5'.format(self._json_file)
    if not md5sum_check(self._json_file, self._md5_file):
        self.workflow()
        return
    log_progress(__modname__, 'VCF to JSON already finished!!!',
                 f=self._log_file)
def run_hered_qc_report(self):
    """Generate the hereditary-pipeline QC PDF report.

    Always records 95% progress in the status log first, then builds the
    report only when its checksummed PDF is missing or stale.
    """
    # Update the pipeline status file so external monitors see progress.
    with open(self._status_log_file, 'w') as status_fh:
        status_fh.write('[STATUS] QC Report Generation\n[PROGRESS] 95')
    qc_report_file = join(self._output_dir, "data", "stat",
                          "{0}.pdf".format(self._sample_name))
    md5_file = '{0}.md5'.format(qc_report_file)
    if md5sum_check(qc_report_file, md5_file):
        log_progress(__modname__,
                     'QC Report Generation already finished',
                     f=self._log_file)
        return
    log_progress(__modname__, 'Run QC Report Generation',
                 f=self._log_file)
    report_builder = Her_QC_Report(self._sample_name, self._output_dir,
                                   self._fastq_r1, self._fastq_r2,
                                   self._pipeline, self._pipeline_name,
                                   self._platform, self._sample_source,
                                   self._run_name, self._log_file)
    report_builder.run()
    run_command_md5sum(__modname__, self._log_file, qc_report_file,
                       md5_file)
    log_progress(__modname__, 'QC Report Generation finished',
                 f=self._log_file)
def run_snpEff(self, input_file, output_file):
    """Annotate genes with snpEff, then keep only records that received
    an ANN INFO field (bcftools view filter).

    No-op when the checksummed output is already up to date.
    """
    snpeff_tmp_out = join(self._variant_dir,
                          '{0}_snpeff_tmp.vcf'.format(self._sample_name))
    output_md5 = '{0}.md5'.format(output_file)
    if md5sum_check(output_file, output_md5):
        log_progress(__modname__,
                     'snpEff gene annotation already finished!!!',
                     f=self._log_file)
        log_version(__modname__, self._sw['snpeff_ver'],
                    f=self._log_file)
        return
    log_progress(__modname__, 'snpEff gene annotation start',
                 f=self._log_file)
    log_version(__modname__, self._sw['snpeff_ver'], f=self._log_file)
    for stale in (snpeff_tmp_out, output_file):
        if os.path.exists(stale):
            os.remove(stale)
    # Effect classes suppressed from the annotation output.
    suppressed_effects = [
        'INTERGENIC', 'INTERGENIC_CONSERVED', 'INTRAGENIC',
        'RARE_AMINO_ACID', 'TRANSCRIPT', 'TRANSCRIPT_DELETED',
        'REGULATION', 'NEXT_PROT',
        'PROTEIN_STRUCTURAL_INTERACTION_LOCUS',
        'PROTEIN_PROTEIN_INTERACTION_LOCUS'
    ]
    snpeff_cmd = [
        self._sw['java'], '-Xmx4g',
        '-XX:ParallelGCThreads={0}'.format(self._pe_core),
        '-Djava.io.tmpdir={0}'.format(self._sample_tmp_dir),
        '-jar', self._sw['snpeff'], 'ann', 'hg19ngb',
        '-no-downstream', '-no-upstream', '-noStats'
    ]
    for effect in suppressed_effects:
        snpeff_cmd.extend(['-no', effect])
    snpeff_cmd.append(input_file)
    run_command_file_handle(__modname__, snpeff_cmd, self._log_file,
                            'w', snpeff_tmp_out)
    filter_cmd = [
        self._sw['bcftools'], 'view', '-i', 'INFO/ANN!="."',
        snpeff_tmp_out
    ]
    run_command_file_handle(__modname__, filter_cmd, self._log_file,
                            'w', output_file)
    run_command_md5sum(__modname__, self._log_file, output_file,
                       output_md5)
    log_progress(__modname__, 'snpEff gene annotation finished',
                 f=self._log_file)
def run_dbnsfp_annotation(self, input_file, output_file):
    """Annotate with dbNSFP functional-prediction scores via SnpSift,
    then post-process transcripts with the project helper script.

    No-op when the checksummed output is already up to date.
    """
    dbnsfp_tmp_vcf = join(self._variant_dir,
                          '{0}_dbnsfp_tmp.vcf'.format(self._sample_name))
    output_md5 = '{0}.md5'.format(output_file)
    if md5sum_check(output_file, output_md5):
        log_progress(__modname__,
                     'dbNSFP annotation already finished!!!',
                     f=self._log_file)
        log_version(__modname__, self._sw['dbnsfp_db_ver'],
                    f=self._log_file)
        return
    log_progress(__modname__, 'dbNSFP annotation start',
                 f=self._log_file)
    log_version(__modname__, self._sw['dbnsfp_db_ver'],
                f=self._log_file)
    # dbNSFP columns to copy into the VCF (passed to SnpSift -f).
    dbnsfp_fields = (
        'aapos,aapos_SIFT,aapos_FATHMM,Uniprot_acc,Interpro_domain,'
        'SIFT_pred,SIFT_score,LRT_pred,MutationTaster_pred,'
        'MutationTaster_score,GERP++_NR,GERP++_RS,'
        'phastCons100way_vertebrate,MutationAssessor_pred,FATHMM_pred,'
        'PROVEAN_pred,MetaSVM_pred,Polyphen2_HDIV_pred,'
        'Polyphen2_HDIV_score,Polyphen2_HVAR_pred,Polyphen2_HVAR_score,'
        'CADD_phred')
    snpsift_cmd = [
        self._sw['java'], '-Xmx4g',
        '-XX:ParallelGCThreads={0}'.format(self._pe_core),
        '-Djava.io.tmpdir={0}'.format(self._sample_tmp_dir),
        '-jar', self._sw['snpsift'], 'dbnsfp',
        '-f', dbnsfp_fields,
        '-db', self._sw['dbnsfp_db'], input_file
    ]
    run_command_file_handle(__modname__, snpsift_cmd, self._log_file,
                            'w', dbnsfp_tmp_vcf)
    transcript_cmd = [
        'python', self._sw['ngb_transcript_dbNSFP'], '-o', output_file,
        dbnsfp_tmp_vcf
    ]
    run_command(__modname__, transcript_cmd, self._log_file)
    run_command_md5sum(__modname__, self._log_file, output_file,
                       output_md5)
    # Clear the shared Java tmp dir so later steps start clean.
    run_command(__modname__,
                ['rm', '-rf', '{0}/*'.format(self._sample_tmp_dir)],
                self._log_file)
    log_progress(__modname__, 'dbNSFP annotation finished',
                 f=self._log_file)
def add_type_to_vcf(self, input_file, output_file):
    """Add the TYPE INFO field to each VCF record via the project's
    ngb_add_vcfinfo helper script."""
    output_md5 = '{0}.md5'.format(output_file)
    if md5sum_check(output_file, output_md5):
        log_progress(__modname__, 'Add TYPE info already finished!!!',
                     f=self._log_file)
        return
    log_progress(__modname__, 'Add TYPE info start', f=self._log_file)
    if os.path.exists(output_file):
        os.remove(output_file)
    add_info_cmd = [
        'python', self._sw['ngb_add_vcfinfo'], '-o', output_file,
        input_file
    ]
    run_command(__modname__, add_info_cmd, self._log_file)
    run_command_md5sum(__modname__, self._log_file, output_file,
                       output_md5)
    log_progress(__modname__, 'Add TYPE info finished',
                 f=self._log_file)
def statistics(self, pileup_depth):
    """Run the external statistics script over the FASTQ set, final BAM,
    target BED and pileup depth to produce the summary file."""
    fastq_files = self.get_fastq_names()
    md5_file = '%s.md5' % (self._summary_file)
    if md5sum_check(self._summary_file, md5_file):
        log_progress(__modname__,
                     'Analysis Statistics already finished!!!',
                     f=self._log_file)
        return
    log_progress(__modname__, 'Analysis Statistics start',
                 f=self._log_file)
    # The script expects exactly eight FASTQ paths followed by the
    # BAM / BED / depth / summary arguments.
    stats_cmd = [self._script]
    stats_cmd += [fastq_files[i] for i in range(8)]
    stats_cmd += [
        self._final_bam, self._target_bed, pileup_depth,
        self._summary_file
    ]
    run_command(__modname__, stats_cmd, self._log_file)
    log_progress(__modname__, 'Analysis Statistics finished',
                 f=self._log_file)
def generate_tdf_file(self, final_bam):
    """Create an IGV TDF coverage track for *final_bam* with igvtools.

    No-op when the checksummed TDF is already up to date.
    """
    tdf_file = '{0}.tdf'.format(final_bam)
    tdf_file_md5 = '{0}.md5'.format(tdf_file)
    if md5sum_check(tdf_file, tdf_file_md5):
        log_progress(__modname__,
                     'TDF file generation already finished!!!',
                     f=self._log_file)
        log_version(__modname__, self._sw['igvtools_ver'],
                    f=self._log_file)
        return
    log_progress(__modname__, 'TDF file generation start',
                 f=self._log_file)
    log_version(__modname__, self._sw['igvtools_ver'], f=self._log_file)
    if os.path.exists(tdf_file):
        os.remove(tdf_file)
    igvtools_cmd = [
        self._sw['igvtools'], 'count', final_bam, tdf_file, 'hg19'
    ]
    run_command(__modname__, igvtools_cmd, self._log_file)
    run_command_md5sum(__modname__, self._log_file, tdf_file,
                       tdf_file_md5)
    log_progress(__modname__, 'TDF file generation finished',
                 f=self._log_file)
def plot_generation(self):
    """Draw the CNV fold-change plot via the project Rscript.

    No-op when the checksummed plot file is already up to date.
    """
    png_md5 = '{0}.md5'.format(self._cnv_plot)
    if md5sum_check(self._cnv_plot, png_md5):
        log_progress(__modname__,
                     'CNV plot generation already finished!!!',
                     f=self._log_file)
        return
    log_progress(__modname__, 'CNV plot generation start',
                 f=self._log_file)
    if os.path.exists(self._cnv_plot):
        os.remove(self._cnv_plot)
    plot_cmd = [
        'Rscript', self._cnv_plot_script, self._sample_name,
        self._cnv_fc_stat, self._cnv_plot
    ]
    run_command(__modname__, plot_cmd, self._log_file)
    run_command_md5sum(__modname__, self._log_file, self._cnv_plot,
                       png_md5)
    log_progress(__modname__, 'CNV plot generation finished',
                 f=self._log_file)
def add_hgvs(self, input_file, output_file):
    """Attach HGVS nomenclature and variant type to the VCF via the
    project's ngb_add_HGVS helper script."""
    output_md5 = '{0}.md5'.format(output_file)
    if md5sum_check(output_file, output_md5):
        log_progress(__modname__,
                     'Add HGVS info and variant type already finished!!!',
                     f=self._log_file)
        return
    log_progress(__modname__, 'Add HGVS info and variant type start',
                 f=self._log_file)
    if os.path.exists(output_file):
        os.remove(output_file)
    hgvs_cmd = [
        'python', self._sw['ngb_add_HGVS'], '-d',
        self._sw['mutect2_bed'], '-o', output_file, input_file
    ]
    run_command(__modname__, hgvs_cmd, self._log_file)
    run_command_md5sum(__modname__, self._log_file, output_file,
                       output_md5)
    log_progress(__modname__,
                 'Add HGVS info and variant type finished',
                 f=self._log_file)
def sort_target_bed(self):
    """Sort the pipeline's Picard target BED (chromosome, then start
    position) into self._target_bed."""
    md5_file = '%s.md5' % (self._target_bed)
    if md5sum_check(self._target_bed, md5_file):
        log_progress(__modname__, 'Target BED sort already finished!!!',
                     f=self._log_file)
        return
    log_progress(__modname__, 'Target BED sort start',
                 f=self._log_file)
    # 901 is the DNA pipeline; everything else uses the RNA BED.
    bed_basename = ('DNA_PicardTarget.bed' if self._pipeline == '901'
                    else 'RNA_PicardTarget.bed')
    bed_src = join(self._align_dir, bed_basename)
    sort_pipeline = ['cat %s' % (bed_src), 'sort -k1V,1 -k2n,2']
    run_command_pipe_file_handle(__modname__, sort_pipeline,
                                 self._log_file, 'w', self._target_bed)
    run_command_md5sum(__modname__, self._log_file, self._target_bed,
                       md5_file)
    log_progress(__modname__, 'Target BED sort finished',
                 f=self._log_file)
def run_clinvar_annotation(self, input_file, output_file):
    """Annotate with ClinVar: compact-DB annotation followed by the
    variation-ID post-processing step.

    No-op when the checksummed output is already up to date.
    """
    clinvar_tmp_vcf = join(
        self._variant_dir,
        '{0}_clinvar_tmp.vcf'.format(self._sample_name))
    output_md5 = '{0}.md5'.format(output_file)
    if md5sum_check(output_file, output_md5):
        log_progress(__modname__,
                     'ClinVar annotation already finished!!!',
                     f=self._log_file)
        log_version(__modname__, self._sw['ngb_clinvar_ver'],
                    f=self._log_file)
        return
    log_progress(__modname__, 'ClinVar annotation start',
                 f=self._log_file)
    log_version(__modname__, self._sw['ngb_clinvar_ver'],
                f=self._log_file)
    annotate_cmd = [
        'python', self._sw['ngb_anno_clinvar'],
        '--dbfile', self._sw['ngb_clinvar_db'],
        '--infoVCF', self._sw['clinvar_compact_header'],
        '--inVCF', input_file,
        '--outVCF', clinvar_tmp_vcf
    ]
    run_command(__modname__, annotate_cmd, self._log_file)
    variation_cmd = [
        'python', self._sw['ngb_clinvar_variation'],
        '-d', self._sw['ngb_clinvar_ref'],
        '-o', output_file, clinvar_tmp_vcf
    ]
    run_command(__modname__, variation_cmd, self._log_file)
    run_command_md5sum(__modname__, self._log_file, output_file,
                       output_md5)
    log_progress(__modname__, 'ClinVar annotation finished',
                 f=self._log_file)
def run_annotation(self, db_name, target_vcf, target_vcf_ver, _info,
                   _name, input_file, output_file):
    """Generic SnpSift `annotate` wrapper: copy annotations from
    *target_vcf* into *input_file*, writing *output_file*.

    _info -- value for SnpSift's -info option; '' omits the option
    _name -- value for SnpSift's -name option; '' omits the option
    """
    output_md5 = '{0}.md5'.format(output_file)
    if md5sum_check(output_file, output_md5):
        log_progress(__modname__,
                     '{0} annotation already finished!!!'.format(db_name),
                     f=self._log_file)
        log_version(__modname__, target_vcf_ver, f=self._log_file)
        return
    log_progress(__modname__, '{0} annotation start'.format(db_name),
                 f=self._log_file)
    log_version(__modname__, target_vcf_ver, f=self._log_file)
    annotate_cmd = [
        self._sw['java'], '-Xmx4g',
        '-XX:ParallelGCThreads={0}'.format(self._pe_core),
        '-Djava.io.tmpdir={0}'.format(self._sample_tmp_dir),
        '-jar', self._sw['snpsift'], 'annotate'
    ]
    if _info != '':
        annotate_cmd.extend(['-info', _info])
    if _name != '':
        annotate_cmd.extend(['-name', _name])
    annotate_cmd.extend([target_vcf, input_file])
    run_command_file_handle(__modname__, annotate_cmd, self._log_file,
                            'w', output_file)
    run_command_md5sum(__modname__, self._log_file, output_file,
                       output_md5)
    # Clear the shared Java tmp dir so later steps start clean.
    run_command(__modname__,
                ['rm', '-rf', '{0}/*'.format(self._sample_tmp_dir)],
                self._log_file)
    log_progress(__modname__,
                 '{0} annotation finished'.format(db_name),
                 f=self._log_file)