def copy_bam_files(self):
    """Copy the TST170 BAM, its index and the Picard target BED for this sample.

    Sets ``self._final_bam`` to ``<align_dir>/<sample>_final.bam``. When
    ``md5sum_check`` accepts the existing file against its ``.md5`` companion
    the copy is skipped. Otherwise the BAM, the ``.bai`` index and the target
    BED are copied with ``self.copy_files``, a TDF track is generated, and a
    fresh md5 file is written.

    Exits the process with status 1 when ``self._pipeline`` is neither
    ``'901'`` nor ``'902'``.
    """
    self._final_bam = join(self._align_dir,
                           '{0}_final.bam'.format(self._sample_name))
    checksum_path = '{0}.md5'.format(self._final_bam)

    if md5sum_check(self._final_bam, checksum_path):
        log_progress(__modname__,
                     'Copy the BAM file to output directory already finished',
                     f=self._log_file)
        return

    log_progress(__modname__,
                 'Copy the BAM file to output directory',
                 f=self._log_file)

    # 901: DNA, 902: RNA
    # pipeline code -> (intermediate directory, BAM name template, BED name)
    layouts = {
        '901': ('DNA_IntermediateFiles', '{0}_realigned.bam',
                'DNA_PicardTarget.bed'),
        '902': ('RNA_IntermediateFiles', '{0}.bam', 'RNA_PicardTarget.bed'),
    }
    if self._pipeline not in layouts:
        log_error(__modname__,
                  'Unknown pipeline code {0} for TST170 pipeline'.format(
                      self._pipeline),
                  f=self._log_file)
        sys.exit(1)

    intermediate_dir, bam_template, bed_name = layouts[self._pipeline]
    bam_dir = join(self._tst170_dir, intermediate_dir, 'Alignment')
    bam_src = join(bam_dir, bam_template.format(self._sample_name))

    # Copy order is BAM, index, BED.
    self.copy_files(bam_src, self._final_bam)
    self.copy_files('{0}.bai'.format(bam_src),
                    '{0}.bai'.format(self._final_bam))
    self.copy_files(join(bam_dir, bed_name), join(self._align_dir, bed_name))

    self.generate_tdf_file(self._final_bam)

    # The checksum is written last so an interrupted run is redone.
    run_command_md5sum(__modname__, self._log_file, self._final_bam,
                       checksum_path)
    log_progress(__modname__, 'Copy the BAM file finished', f=self._log_file)
def vcf_post_processing(self, input_file, refined_vcf):
    """Pipe ``vt normalize`` into ``vt decompose -s`` and write *refined_vcf*.

    The step is skipped when ``md5sum_check`` accepts *refined_vcf* against
    ``<refined_vcf>.md5``. The vt version is logged in both cases.

    Args:
        input_file: VCF passed to ``vt normalize``.
        refined_vcf: Output path; an existing file is removed before the run.
    """
    checksum_path = '{0}.md5'.format(refined_vcf)
    already_done = md5sum_check(refined_vcf, checksum_path)

    if already_done:
        log_progress(__modname__, 'VCF post processing already finished!!!',
                     f=self._log_file)
    else:
        log_progress(__modname__, 'VCF post processing start',
                     f=self._log_file)
    log_version(__modname__, self._sw['vt_ver'], f=self._log_file)
    if already_done:
        return

    if os.path.exists(refined_vcf):
        os.remove(refined_vcf)

    # Two shell stages joined by run_command_pipe_file_handle.
    normalize_stage = '{0} normalize -r {1} {2}'.format(
        self._sw['vt'], self._sw['hg19'], input_file)
    decompose_stage = '{0} decompose -s -'.format(self._sw['vt'])
    run_command_pipe_file_handle(__modname__,
                                 [normalize_stage, decompose_stage],
                                 self._log_file, 'w', refined_vcf)

    run_command_md5sum(__modname__, self._log_file, refined_vcf,
                       checksum_path)
    log_progress(__modname__, 'VCF post processing finished',
                 f=self._log_file)
def pileup_depth(self, pileup_depth):
    """Write ``samtools depth`` output for the final BAM to *pileup_depth*.

    The step is skipped when ``md5sum_check`` accepts the existing output
    against ``<pileup_depth>.md5``. The samtools version is logged in both
    cases.

    Args:
        pileup_depth: Path of the depth file to produce.
    """
    checksum_path = '%s.md5' % (pileup_depth)
    already_done = md5sum_check(pileup_depth, checksum_path)

    if already_done:
        log_progress(__modname__, 'Get Pileup Depth already finished!!!',
                     f=self._log_file)
    else:
        log_progress(__modname__, 'Get Pileup Depth start', f=self._log_file)
    log_version(__modname__, self._sw['samtools_ver'], f=self._log_file)
    if already_done:
        return

    depth_cmd = [self._sw['samtools'], 'depth']
    depth_cmd += ['-a', '-q', '0', '-Q', '1', '-d', '1000000']
    depth_cmd += ['-b', self._target_bed]
    depth_cmd += ['--reference', self._sw['hg19']]
    depth_cmd.append(self._final_bam)

    run_command_file_handle(__modname__, depth_cmd, self._log_file, 'w',
                            pileup_depth)
    run_command_md5sum(__modname__, self._log_file, pileup_depth,
                       checksum_path)
    log_progress(__modname__, 'Get Pileup Depth finished', f=self._log_file)
def low_confidence_annotation(self, input_file, lowconf_vcf):
    """Run the two low-confidence annotation scripts in sequence.

    The first script (``ngb_lowconf_homopolyx``) reads *input_file* and writes
    ``<variant_dir>/<sample>_lowconf.homopolyx``; the second
    (``ngb_lowconf_repeatcnt``) reads that file and writes *lowconf_vcf*.
    The step is skipped when ``md5sum_check`` accepts *lowconf_vcf*.

    Args:
        input_file: VCF given to the first script.
        lowconf_vcf: Final output path of the second script.
    """
    homopolyx_out = join(self._variant_dir,
                         '{0}_lowconf.homopolyx'.format(self._sample_name))
    checksum_path = '{0}.md5'.format(lowconf_vcf)

    if md5sum_check(lowconf_vcf, checksum_path):
        log_progress(__modname__,
                     'Low confidence annotation already finished!!!',
                     f=self._log_file)
        return

    log_progress(__modname__, 'Low confidence annotation start',
                 f=self._log_file)

    # Clear outputs of a previous, incomplete run.
    for stale_path in (homopolyx_out, lowconf_vcf):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    reference = self._sw['hg19']
    run_command(
        __modname__,
        ['python', self._sw['ngb_lowconf_homopolyx'], '-p', '5', '-r',
         reference, '-o', homopolyx_out, input_file],
        self._log_file)
    run_command(
        __modname__,
        ['python', self._sw['ngb_lowconf_repeatcnt'], '-r', reference, '-o',
         lowconf_vcf, homopolyx_out],
        self._log_file)

    run_command_md5sum(__modname__, self._log_file, lowconf_vcf,
                       checksum_path)
    log_progress(__modname__, 'Low confidence annotation finished',
                 f=self._log_file)
def remove_reference_info(self, input_file, remove_ref_vcf):
    """Filter reference-only and no-call lines out of a VCF, then sort it.

    Builds a four-stage shell pipeline: ``vcftools --recode --stdout`` on
    *input_file*, two ``grep -v`` filters (homozygous-reference and
    missing-genotype patterns), and ``vt sort``. The result is written to
    *remove_ref_vcf*. The step is skipped when ``md5sum_check`` accepts the
    existing output; the vcftools and vt versions are logged in both cases.

    Args:
        input_file: VCF read by vcftools.
        remove_ref_vcf: Output path; an existing file is removed first.
    """
    remove_ref_vcf_md5 = '{0}.md5'.format(remove_ref_vcf)

    # remove only reference...
    already_done = md5sum_check(remove_ref_vcf, remove_ref_vcf_md5)
    if already_done:
        log_progress(__modname__,
                     'Remove only reference in VCF already finished!!!',
                     f=self._log_file)
    else:
        log_progress(__modname__, 'Remove only reference in VCF start',
                     f=self._log_file)
    log_version(__modname__, self._sw['vcftools_ver'], f=self._log_file)
    log_version(__modname__, self._sw['vt_ver'], f=self._log_file)
    if already_done:
        return

    if os.path.exists(remove_ref_vcf):
        os.remove(remove_ref_vcf)

    exec_cmd = [
        '{0} --vcf {1} --recode --stdout'.format(self._sw['vcftools'],
                                                 input_file),
        'grep -v "0[/|]0"',
        # Raw string: the backslashes belong to the grep pattern. The
        # previous non-raw literal relied on Python passing the invalid
        # escape through unchanged, which newer interpreters warn about.
        r'grep -v "\.[/|]\."',
        '{0} sort -'.format(self._sw['vt']),
    ]
    # NOTE(review): both grep filters match anywhere on a line, so a line
    # containing e.g. "10/0" or a header mentioning "0/0" is dropped too.
    # Confirm this is acceptable for the VCFs this step receives.
    run_command_pipe_file_handle(__modname__, exec_cmd, self._log_file, 'w',
                                 remove_ref_vcf)
    run_command_md5sum(__modname__, self._log_file, remove_ref_vcf,
                       remove_ref_vcf_md5)
    log_progress(__modname__, 'Remove only reference in VCF finished',
                 f=self._log_file)
def run(self, summary_file, mapq_file, stat_json_file, flag):
    """Merge two key/value statistics files into a QC JSON document.

    Both *summary_file* and *mapq_file* are read as tab-separated text; only
    lines with exactly two fields are used, with spaces in the key replaced
    by underscores. The merged dict is handed to ``self.workflow`` and the
    result is stored under ``"qc_data"`` in *stat_json_file*, followed by an
    md5 file. The step is skipped when ``md5sum_check`` accepts the existing
    JSON file.

    Args:
        summary_file: Tab-separated statistics summary.
        mapq_file: Tab-separated mapping-quality statistics; its keys
            override equal keys from *summary_file*.
        stat_json_file: Output JSON path.
        flag: ``"solid"`` sets ``self._cutoff_uniformity05`` to 5 and
            ``"blood"`` sets it to 10; any other value leaves it untouched.

    Any parsing or writing failure is logged and ends the process with
    status 1.
    """
    stat_json_file_md5 = "{0}.md5".format(stat_json_file)
    if md5sum_check(stat_json_file, stat_json_file_md5):
        log_progress(__modname__, "Analysis Statistics already finished",
                     f=self._log_file)
        return

    log_progress(__modname__, "Analysis Statistics start", f=self._log_file)

    if flag == "solid":
        self._cutoff_uniformity05 = 5
    elif flag == "blood":
        self._cutoff_uniformity05 = 10

    stat_data = {}

    def load_key_values(path, label):
        """Add every two-column line of *path* to ``stat_data``."""
        try:
            with open(path, "r") as handle:
                # Stream the file instead of materialising all lines.
                for line in handle:
                    fields = line.replace("\n", "").split("\t")
                    if len(fields) == 2:
                        stat_data[fields[0].replace(" ", "_")] = fields[1]
        except Exception as ex:
            log_error(__modname__,
                      "Parsing {0} error: {1}".format(label, ex),
                      f=self._log_file)
            sys.exit(1)

    load_key_values(summary_file, "stat summary file")
    load_key_values(mapq_file, "mapping quality file")

    json_data = {"qc_data": self.workflow(stat_data)}

    try:
        with open(stat_json_file, "w") as make_json_file:
            json.dump(json_data, make_json_file, ensure_ascii=False,
                      sort_keys=True, indent=2)
        # Checksum only after the file is closed, so buffered data is on
        # disk when the md5 is computed.
        run_command_md5sum(__modname__, self._log_file, stat_json_file,
                           stat_json_file_md5)
    except Exception as ex:
        log_error(__modname__, "{0}".format(ex), f=self._log_file)
        sys.exit(1)

    log_progress(__modname__, "Analysis Statistics finished",
                 f=self._log_file)
def workflow(self):
    """Convert the CNV VCF into a TSV table and a per-gene fold-change file.

    For every record read by ``vcf.Reader`` from ``self._cnv_vcf``:

    * if the first ALT allele is not ``None``, one row (chromosome, start,
      end, reference, alternate, SV type, gene, fold change) is written to
      ``self._cnv_tsv``;
    * in all cases ``INFO['ANT']`` and the first sample's ``FC`` value are
      written to ``self._cnv_fc_stat``.

    Both outputs get an md5 file afterwards. The step is skipped when
    ``md5sum_check`` accepts both existing outputs.

    Differences from the previous implementation:

    * all three files are managed with ``with`` so they are closed even when
      a record fails to parse;
    * the fold-change file is opened once instead of once per record, and is
      therefore created (empty) even when the VCF holds no records;
    * an ALT other than ``<DUP>``/``<DEL>`` is written as its own text
      instead of raising ``NameError`` or reusing the previous row's value.
    """
    tsv_md5 = '{0}.md5'.format(self._cnv_tsv)
    fc_stat_md5 = '{0}.md5'.format(self._cnv_fc_stat)

    if md5sum_check(self._cnv_tsv, tsv_md5) and md5sum_check(
            self._cnv_fc_stat, fc_stat_md5):
        log_progress(__modname__,
                     'CNV TSV file generation already finished!!!',
                     f=self._log_file)
        return

    log_progress(__modname__, 'CNV TSV file generation start',
                 f=self._log_file)

    # Clear outputs of a previous, incomplete run.
    for stale_path in (self._cnv_tsv, self._cnv_fc_stat):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    # Symbolic ALT -> label used in the TSV; anything else is kept verbatim.
    alt_labels = {'<DUP>': 'DUP', '<DEL>': 'DEL'}

    with open(self._cnv_vcf, 'r') as vcf_handle:
        with open(self._cnv_tsv, 'w') as cnv_tsv:
            with open(self._cnv_fc_stat, 'w') as fc_stat:
                cnv_tsv.write(
                    'chromosome\tstart\tend\treference\talternate\tSV_type\tgene\tfold_change\n'
                )
                for record in vcf.Reader(vcf_handle):
                    if record.ALT[0] is not None:
                        alt_text = str(record.ALT[0])
                        cnv_tsv.write(
                            '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n'.format(
                                record.CHROM,
                                int(record.POS),
                                int(record.INFO['END']),
                                record.REF,
                                alt_labels.get(alt_text, alt_text),
                                record.INFO['SVTYPE'],
                                record.INFO['ANT'],
                                float(record.samples[0]['FC'])))
                    # Written for every record, including ALT-less ones.
                    fc_stat.write('{0}\t{1}\n'.format(
                        record.INFO['ANT'], record.samples[0]['FC']))

    run_command_md5sum(__modname__, self._log_file, self._cnv_tsv, tsv_md5)
    run_command_md5sum(__modname__, self._log_file, self._cnv_fc_stat,
                       fc_stat_md5)
    log_progress(__modname__, 'CNV TSV file generation finished',
                 f=self._log_file)
def run_hered_qc_report(self):
    """Generate the QC report PDF for this sample via ``Her_QC_Report``.

    The status log file is overwritten first with the "QC Report Generation"
    status and progress 95. The report is expected at
    ``<output_dir>/data/stat/<sample>.pdf``; generation is skipped when
    ``md5sum_check`` accepts that file, otherwise the report is produced and
    an md5 file is written.
    """
    with open(self._status_log_file, 'w') as status_log:
        status_log.write('[STATUS] QC Report Generation\n[PROGRESS] 95')

    report_pdf = join(self._output_dir, "data", "stat",
                      "{0}.pdf".format(self._sample_name))
    checksum_path = '{0}.md5'.format(report_pdf)

    if md5sum_check(report_pdf, checksum_path):
        log_progress(__modname__, 'QC Report Generation already finished',
                     f=self._log_file)
        return

    log_progress(__modname__, 'Run QC Report Generation', f=self._log_file)

    Her_QC_Report(
        self._sample_name,
        self._output_dir,
        self._fastq_r1,
        self._fastq_r2,
        self._pipeline,
        self._pipeline_name,
        self._platform,
        self._sample_source,
        self._run_name,
        self._log_file,
    ).run()

    run_command_md5sum(__modname__, self._log_file, report_pdf,
                       checksum_path)
    log_progress(__modname__, 'QC Report Generation finished',
                 f=self._log_file)
def run_snpEff(self, input_file, output_file):
    """Annotate a VCF with snpEff and keep only records that carry ANN.

    snpEff writes to ``<variant_dir>/<sample>_snpeff_tmp.vcf``; ``bcftools
    view -i 'INFO/ANN!="."'`` then filters that file into *output_file*.
    The step is skipped when ``md5sum_check`` accepts *output_file*; the
    snpEff version is logged in both cases.

    Args:
        input_file: VCF to annotate.
        output_file: Filtered, annotated VCF path.
    """
    intermediate_vcf = join(self._variant_dir,
                            '{0}_snpeff_tmp.vcf'.format(self._sample_name))
    checksum_path = '{0}.md5'.format(output_file)
    already_done = md5sum_check(output_file, checksum_path)

    if already_done:
        log_progress(__modname__,
                     'snpEff gene annotation already finished!!!',
                     f=self._log_file)
    else:
        log_progress(__modname__, 'snpEff gene annotation start',
                     f=self._log_file)
    log_version(__modname__, self._sw['snpeff_ver'], f=self._log_file)
    if already_done:
        return

    # Clear outputs of a previous, incomplete run.
    for stale_path in (intermediate_vcf, output_file):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    annotate_cmd = [
        self._sw['java'],
        '-Xmx4g',
        '-XX:ParallelGCThreads={0}'.format(self._pe_core),
        '-Djava.io.tmpdir={0}'.format(self._sample_tmp_dir),
        '-jar',
        self._sw['snpeff'],
        'ann',
        'hg19ngb',
        '-no-downstream',
        '-no-upstream',
        '-noStats',
    ]
    # Each entry is passed to snpEff as its own "-no <name>" pair.
    excluded = (
        'INTERGENIC',
        'INTERGENIC_CONSERVED',
        'INTRAGENIC',
        'RARE_AMINO_ACID',
        'TRANSCRIPT',
        'TRANSCRIPT_DELETED',
        'REGULATION',
        'NEXT_PROT',
        'PROTEIN_STRUCTURAL_INTERACTION_LOCUS',
        'PROTEIN_PROTEIN_INTERACTION_LOCUS',
    )
    for effect_name in excluded:
        annotate_cmd.extend(['-no', effect_name])
    annotate_cmd.append(input_file)
    run_command_file_handle(__modname__, annotate_cmd, self._log_file, 'w',
                            intermediate_vcf)

    filter_cmd = [
        self._sw['bcftools'], 'view', '-i', 'INFO/ANN!="."',
        intermediate_vcf
    ]
    run_command_file_handle(__modname__, filter_cmd, self._log_file, 'w',
                            output_file)

    run_command_md5sum(__modname__, self._log_file, output_file,
                       checksum_path)
    log_progress(__modname__, 'snpEff gene annotation finished',
                 f=self._log_file)
def run_dbnsfp_annotation(self, input_file, output_file):
    """Annotate a VCF with dbNSFP fields and post-process it.

    ``SnpSift dbnsfp`` writes to ``<variant_dir>/<sample>_dbnsfp_tmp.vcf``;
    the ``ngb_transcript_dbNSFP`` script then turns that file into
    *output_file*. The step is skipped when ``md5sum_check`` accepts
    *output_file*; the dbNSFP database version is logged in both cases.

    Args:
        input_file: VCF to annotate.
        output_file: Final annotated VCF path.
    """
    intermediate_vcf = join(self._variant_dir,
                            '{0}_dbnsfp_tmp.vcf'.format(self._sample_name))
    checksum_path = '{0}.md5'.format(output_file)
    already_done = md5sum_check(output_file, checksum_path)

    if already_done:
        log_progress(__modname__, 'dbNSFP annotation already finished!!!',
                     f=self._log_file)
    else:
        log_progress(__modname__, 'dbNSFP annotation start',
                     f=self._log_file)
    log_version(__modname__, self._sw['dbnsfp_db_ver'], f=self._log_file)
    if already_done:
        return

    # dbNSFP columns requested from SnpSift, joined into one -f argument.
    requested_fields = ','.join((
        'aapos',
        'aapos_SIFT',
        'aapos_FATHMM',
        'Uniprot_acc',
        'Interpro_domain',
        'SIFT_pred',
        'SIFT_score',
        'LRT_pred',
        'MutationTaster_pred',
        'MutationTaster_score',
        'GERP++_NR',
        'GERP++_RS',
        'phastCons100way_vertebrate',
        'MutationAssessor_pred',
        'FATHMM_pred',
        'PROVEAN_pred',
        'MetaSVM_pred',
        'Polyphen2_HDIV_pred',
        'Polyphen2_HDIV_score',
        'Polyphen2_HVAR_pred',
        'Polyphen2_HVAR_score',
        'CADD_phred',
    ))
    snpsift_cmd = [
        self._sw['java'],
        '-Xmx4g',
        '-XX:ParallelGCThreads={0}'.format(self._pe_core),
        '-Djava.io.tmpdir={0}'.format(self._sample_tmp_dir),
        '-jar',
        self._sw['snpsift'],
        'dbnsfp',
        '-f',
        requested_fields,
        '-db',
        self._sw['dbnsfp_db'],
        input_file,
    ]
    run_command_file_handle(__modname__, snpsift_cmd, self._log_file, 'w',
                            intermediate_vcf)

    transcript_cmd = [
        'python', self._sw['ngb_transcript_dbNSFP'], '-o', output_file,
        intermediate_vcf
    ]
    run_command(__modname__, transcript_cmd, self._log_file)

    run_command_md5sum(__modname__, self._log_file, output_file,
                       checksum_path)

    # NOTE(review): the "*" is only expanded if run_command goes through a
    # shell; with a plain argv exec this removes nothing. Confirm against
    # run_command's implementation.
    cleanup_cmd = ['rm', '-rf', '{0}/*'.format(self._sample_tmp_dir)]
    run_command(__modname__, cleanup_cmd, self._log_file)

    log_progress(__modname__, 'dbNSFP annotation finished',
                 f=self._log_file)
def add_type_to_vcf(self, input_file, output_file):
    """Run the ``ngb_add_vcfinfo`` script to produce *output_file*.

    The step is skipped when ``md5sum_check`` accepts *output_file* against
    ``<output_file>.md5``; otherwise any existing output is removed, the
    script is run, and a fresh md5 file is written.

    Args:
        input_file: VCF given to the script.
        output_file: Path passed to the script's ``-o`` option.
    """
    checksum_path = '{0}.md5'.format(output_file)

    if md5sum_check(output_file, checksum_path):
        log_progress(__modname__, 'Add TYPE info already finished!!!',
                     f=self._log_file)
        return

    log_progress(__modname__, 'Add TYPE info start', f=self._log_file)

    if os.path.exists(output_file):
        os.remove(output_file)

    script_cmd = ['python', self._sw['ngb_add_vcfinfo']]
    script_cmd += ['-o', output_file, input_file]
    run_command(__modname__, script_cmd, self._log_file)

    run_command_md5sum(__modname__, self._log_file, output_file,
                       checksum_path)
    log_progress(__modname__, 'Add TYPE info finished', f=self._log_file)
def generate_tdf_file(self, final_bam):
    """Create ``<final_bam>.tdf`` with ``igvtools count`` for genome hg19.

    The step is skipped when ``md5sum_check`` accepts the existing TDF file;
    the igvtools version is logged in both cases.

    Args:
        final_bam: BAM file handed to igvtools; also the TDF name stem.
    """
    tdf_path = '{0}.tdf'.format(final_bam)
    checksum_path = '{0}.md5'.format(tdf_path)
    already_done = md5sum_check(tdf_path, checksum_path)

    if already_done:
        log_progress(__modname__, 'TDF file generation already finished!!!',
                     f=self._log_file)
    else:
        log_progress(__modname__, 'TDF file generation start',
                     f=self._log_file)
    log_version(__modname__, self._sw['igvtools_ver'], f=self._log_file)
    if already_done:
        return

    if os.path.exists(tdf_path):
        os.remove(tdf_path)

    count_cmd = [self._sw['igvtools'], 'count', final_bam, tdf_path, 'hg19']
    run_command(__modname__, count_cmd, self._log_file)

    run_command_md5sum(__modname__, self._log_file, tdf_path, checksum_path)
    log_progress(__modname__, 'TDF file generation finished',
                 f=self._log_file)
def add_hgvs(self, input_file, output_file):
    """Run the ``ngb_add_HGVS`` script to produce *output_file*.

    The script receives ``self._sw['mutect2_bed']`` through ``-d``. The step
    is skipped when ``md5sum_check`` accepts *output_file*; otherwise any
    existing output is removed, the script is run, and a fresh md5 file is
    written.

    Args:
        input_file: VCF given to the script.
        output_file: Path passed to the script's ``-o`` option.
    """
    checksum_path = '{0}.md5'.format(output_file)

    if md5sum_check(output_file, checksum_path):
        log_progress(__modname__,
                     'Add HGVS info and variant type already finished!!!',
                     f=self._log_file)
        return

    log_progress(__modname__, 'Add HGVS info and variant type start',
                 f=self._log_file)

    if os.path.exists(output_file):
        os.remove(output_file)

    script_cmd = ['python', self._sw['ngb_add_HGVS']]
    script_cmd += ['-d', self._sw['mutect2_bed']]
    script_cmd += ['-o', output_file, input_file]
    run_command(__modname__, script_cmd, self._log_file)

    run_command_md5sum(__modname__, self._log_file, output_file,
                       checksum_path)
    log_progress(__modname__, 'Add HGVS info and variant type finished',
                 f=self._log_file)
def plot_generation(self):
    """Render the CNV plot by running the R plotting script.

    ``Rscript`` is called with the plot script, the sample name, the
    fold-change statistics file and the plot output path. The step is
    skipped when ``md5sum_check`` accepts the existing plot; otherwise the
    old plot is removed, the script is run, and a fresh md5 file is written.
    """
    checksum_path = '{0}.md5'.format(self._cnv_plot)

    if md5sum_check(self._cnv_plot, checksum_path):
        log_progress(__modname__, 'CNV plot generation already finished!!!',
                     f=self._log_file)
        return

    log_progress(__modname__, 'CNV plot generation start', f=self._log_file)

    if os.path.exists(self._cnv_plot):
        os.remove(self._cnv_plot)

    rscript_cmd = ['Rscript', self._cnv_plot_script]
    rscript_cmd += [self._sample_name, self._cnv_fc_stat, self._cnv_plot]
    run_command(__modname__, rscript_cmd, self._log_file)

    run_command_md5sum(__modname__, self._log_file, self._cnv_plot,
                       checksum_path)
    log_progress(__modname__, 'CNV plot generation finished',
                 f=self._log_file)
def sort_target_bed(self):
    """Write a sorted copy of the Picard target BED to ``self._target_bed``.

    The source is ``DNA_PicardTarget.bed`` when the pipeline code is
    ``'901'`` and ``RNA_PicardTarget.bed`` for any other code, both taken
    from the alignment directory. The file is piped through
    ``sort -k1V,1 -k2n,2``. The step is skipped when ``md5sum_check``
    accepts the existing sorted BED.
    """
    checksum_path = '%s.md5' % (self._target_bed)

    if md5sum_check(self._target_bed, checksum_path):
        log_progress(__modname__, 'Target BED sort already finished!!!',
                     f=self._log_file)
        return

    log_progress(__modname__, 'Target BED sort start', f=self._log_file)

    bed_name = ('DNA_PicardTarget.bed'
                if self._pipeline == '901' else 'RNA_PicardTarget.bed')
    source_bed = join(self._align_dir, bed_name)

    pipeline_stages = ['cat %s' % (source_bed), 'sort -k1V,1 -k2n,2']
    run_command_pipe_file_handle(__modname__, pipeline_stages,
                                 self._log_file, 'w', self._target_bed)

    run_command_md5sum(__modname__, self._log_file, self._target_bed,
                       checksum_path)
    log_progress(__modname__, 'Target BED sort finished', f=self._log_file)
def run_annotation(self, db_name, target_vcf, target_vcf_ver, _info, _name,
                   input_file, output_file):
    """Annotate *input_file* against *target_vcf* with ``SnpSift annotate``.

    The step is skipped when ``md5sum_check`` accepts *output_file*;
    *target_vcf_ver* is logged in both cases.

    Args:
        db_name: Label used in the progress messages.
        target_vcf: Database VCF passed to SnpSift.
        target_vcf_ver: Version text passed to ``log_version``.
        _info: Value for ``-info``; the option is omitted when this is ``''``.
        _name: Value for ``-name``; the option is omitted when this is ``''``.
        input_file: VCF to annotate.
        output_file: Path receiving SnpSift's output.
    """
    checksum_path = '{0}.md5'.format(output_file)
    already_done = md5sum_check(output_file, checksum_path)

    if already_done:
        log_progress(__modname__,
                     '{0} annotation already finished!!!'.format(db_name),
                     f=self._log_file)
    else:
        log_progress(__modname__, '{0} annotation start'.format(db_name),
                     f=self._log_file)
    log_version(__modname__, target_vcf_ver, f=self._log_file)
    if already_done:
        return

    snpsift_cmd = [
        self._sw['java'],
        '-Xmx4g',
        '-XX:ParallelGCThreads={0}'.format(self._pe_core),
        '-Djava.io.tmpdir={0}'.format(self._sample_tmp_dir),
        '-jar',
        self._sw['snpsift'],
        'annotate',
    ]
    # Optional switches are only added for non-empty values.
    for switch, value in (('-info', _info), ('-name', _name)):
        if value != '':
            snpsift_cmd.extend([switch, value])
    snpsift_cmd.extend([target_vcf, input_file])

    run_command_file_handle(__modname__, snpsift_cmd, self._log_file, 'w',
                            output_file)
    run_command_md5sum(__modname__, self._log_file, output_file,
                       checksum_path)

    # NOTE(review): the "*" is only expanded if run_command goes through a
    # shell; with a plain argv exec this removes nothing. Confirm against
    # run_command's implementation.
    cleanup_cmd = ['rm', '-rf', '{0}/*'.format(self._sample_tmp_dir)]
    run_command(__modname__, cleanup_cmd, self._log_file)

    log_progress(__modname__, '{0} annotation finished'.format(db_name),
                 f=self._log_file)
def run_clinvar_annotation(self, input_file, output_file):
    """Annotate a VCF with ClinVar using the two in-house scripts.

    ``ngb_anno_clinvar`` writes ``<variant_dir>/<sample>_clinvar_tmp.vcf``
    from *input_file*; ``ngb_clinvar_variation`` then turns that file into
    *output_file*. The step is skipped when ``md5sum_check`` accepts
    *output_file*; the ClinVar version is logged in both cases.

    Args:
        input_file: VCF to annotate.
        output_file: Final annotated VCF path.
    """
    intermediate_vcf = join(self._variant_dir,
                            '{0}_clinvar_tmp.vcf'.format(self._sample_name))
    checksum_path = '{0}.md5'.format(output_file)
    already_done = md5sum_check(output_file, checksum_path)

    if already_done:
        log_progress(__modname__, 'ClinVar annotation already finished!!!',
                     f=self._log_file)
    else:
        log_progress(__modname__, 'ClinVar annotation start',
                     f=self._log_file)
    log_version(__modname__, self._sw['ngb_clinvar_ver'], f=self._log_file)
    if already_done:
        return

    annotate_cmd = ['python', self._sw['ngb_anno_clinvar']]
    annotate_cmd += ['--dbfile', self._sw['ngb_clinvar_db']]
    annotate_cmd += ['--infoVCF', self._sw['clinvar_compact_header']]
    annotate_cmd += ['--inVCF', input_file]
    annotate_cmd += ['--outVCF', intermediate_vcf]
    run_command(__modname__, annotate_cmd, self._log_file)

    variation_cmd = ['python', self._sw['ngb_clinvar_variation']]
    variation_cmd += ['-d', self._sw['ngb_clinvar_ref']]
    variation_cmd += ['-o', output_file, intermediate_vcf]
    run_command(__modname__, variation_cmd, self._log_file)

    run_command_md5sum(__modname__, self._log_file, output_file,
                       checksum_path)
    log_progress(__modname__, 'ClinVar annotation finished',
                 f=self._log_file)
def run(self):
    """Run the full variant annotation chain and publish the final VCF.

    Each stage reads the previous stage's output from the variant directory
    (``<sample>_<stage>.vcf``): reference removal, TYPE tagging, vt
    post-processing, low-confidence tagging, snpEff, HGVS, nine
    ``run_annotation`` databases, ClinVar and dbNSFP. The dbNSFP output is
    then copied to ``self._final_vcf`` (replacing any existing file) and an
    md5 file is written for it.
    """

    def stage_path(stage):
        """Return the per-sample VCF path for one pipeline stage."""
        return join(self._variant_dir,
                    '{0}_{1}.vcf'.format(self._sample_name, stage))

    # All stage paths are resolved before any stage is executed.
    remove_ref_vcf = stage_path('remove_ref')
    add_type_vcf = stage_path('add_type')
    refined_vcf = stage_path('refined')
    lowconf_vcf = stage_path('lowconf')
    snpeff_vcf = stage_path('snpeff')
    hgvs_vcf = stage_path('HGVS')
    dbsnp_vcf = stage_path('dbsnp')
    dbsnp_common_vcf = stage_path('dbsnp_common')
    kg_vcf = stage_path('1kg')
    esp6500_vcf = stage_path('esp6500')
    exac_vcf = stage_path('exac')
    koexid_vcf = stage_path('koexid')
    krgdb_vcf = stage_path('krgdb')
    gnomad_vcf = stage_path('gnomad')
    cosmic_vcf = stage_path('cosmic')
    clinvar_vcf = stage_path('clinvar')
    dbnsfp_vcf = stage_path('dbnsfp')

    self.remove_reference_info(self._raw_vcf, remove_ref_vcf)
    self.add_type_to_vcf(remove_ref_vcf, add_type_vcf)
    self.vcf_post_processing(add_type_vcf, refined_vcf)
    self.low_confidence_annotation(refined_vcf, lowconf_vcf)
    self.run_snpEff(lowconf_vcf, snpeff_vcf)
    self.add_hgvs(snpeff_vcf, hgvs_vcf)

    kg_info = 'AF,EAS_AF,EUR_AF,AFR_AF,AMR_AF,SAS_AF'
    gnomad_info = 'AF,AF_AFR,AF_AMR,AF_ASJ,AF_EAS,AF_FIN,AF_NFE,AF_OTH,AF_SAS,AF_Male,AF_Female,AF_raw,AF_POPMAX,AF_AMR_Male,AF_FIN_Female,AF_FIN_Male,AF_AFR_Male,AF_SAS_Male,AF_OTH_Male,AF_NFE_Female,AF_EAS_Female,AF_EAS_Male,AF_SAS_Female,AF_AFR_Female,AF_AMR_Female,AF_ASJ_Male,AF_ASJ_Female,AF_OTH_Female,AF_NFE_Male'

    # (label, database key, version key, -info value, -name value, output).
    # The self._sw lookups are deferred until each stage actually runs.
    # Both dbSNP stages log the 'dbsnp_vcf_ver' version.
    snpsift_stages = (
        ('dbSNP All', 'dbsnp_vcf', 'dbsnp_vcf_ver', '', 'dbsnp_all_',
         dbsnp_vcf),
        ('dbSNP common', 'dbsnp_common_vcf', 'dbsnp_vcf_ver', '',
         'dbsnp_common_', dbsnp_common_vcf),
        ('1KG', 'KG_vcf', 'KG_vcf_ver', kg_info, 'G1000_', kg_vcf),
        ('esp6500', 'esp6500_vcf', 'esp6500_vcf_ver', 'MAF', 'esp6500_',
         esp6500_vcf),
        ('ExAC', 'exac_vcf', 'exac_vcf_ver', 'AF', 'ExAC_', exac_vcf),
        ('KOEXID', 'koexid_vcf', 'koexid_vcf_ver', '', '', koexid_vcf),
        ('KRGDB', 'krgdb_vcf', 'krgdb_vcf_ver', '', '', krgdb_vcf),
        ('gnomAD', 'gnomad_vcf', 'gnomad_vcf_ver', gnomad_info, 'gnomAD_',
         gnomad_vcf),
        ('COSMIC', 'cosmic_vcf', 'cosmic_vcf_ver', '', '', cosmic_vcf),
    )
    previous_vcf = hgvs_vcf
    for label, db_key, ver_key, info_arg, name_arg, stage_vcf in snpsift_stages:
        self.run_annotation(label, self._sw[db_key], self._sw[ver_key],
                            info_arg, name_arg, previous_vcf, stage_vcf)
        previous_vcf = stage_vcf

    # ClinVar
    self.run_clinvar_annotation(cosmic_vcf, clinvar_vcf)

    # dbNSFP
    self.run_dbnsfp_annotation(clinvar_vcf, dbnsfp_vcf)

    # copy to final vcf
    if os.path.exists(self._final_vcf):
        os.remove(self._final_vcf)
    shutil.copyfile(dbnsfp_vcf, self._final_vcf)
    run_command_md5sum(__modname__, self._log_file, self._final_vcf,
                       '{0}.md5'.format(self._final_vcf))
def workflow(self):
    """Write one JSON line per annotated variant and checksum the file.

    Reads ``self._vcf_file`` with ``vcf.Reader`` and skips records whose
    INFO has no ``ANN`` key. For every other record a nested dict is built
    (allele, build, coordinates, gene, in-silico predictions, filter flags,
    OncoKB, ClinVar, population frequencies, variant information with
    reference flanks, classifier grade, COSMIC) and stored under the key
    ``GRCh37-<chrom>-<start>-<ref>-<alt>``. Entries are appended to
    ``self._json_file`` in case-insensitive key order, one JSON object per
    line, and ``run_command_md5sum`` is run on the file.

    Differences from the previous implementation:

    * prediction codes are expanded per ``None``-free chunk, so a ``None``
      value becomes ``""`` instead of e.g. ``"Neutralone"`` (the ``N`` code
      used to be replaced before ``None`` was removed);
    * the reference FASTA is opened once (on first use) and closed, rather
      than reopened for every record and never closed;
    * the VCF handle is closed via ``with``;
    * a ``FILTER`` of ``None`` is treated as "no filters" instead of
      raising ``TypeError``;
    * the left flank start is clamped at 0 for variants near a contig start;
    * the JSON file is opened once, so it is created (empty) even when no
      record qualifies.

    NOTE(review): the source this was rebuilt from had lost its
    indentation. The ClinVar section is assumed to be flat, i.e. only the
    accession fields depend on ``ngb_cv_rcv_acc``. Please confirm against
    version control.
    """

    def cleaned(value):
        """Return ``str(value)`` without quote and square-bracket characters."""
        return str(value).replace("'", "").replace("[", "").replace("]", "")

    def decode_codes(info, key, code_labels):
        """Expand one-letter prediction codes of ``info[key]`` ('' if absent)."""
        if key not in info:
            return ''
        text = cleaned(info[key]).replace(", ", "|")
        decoded = []
        # Splitting on "None" first keeps its letters away from the code
        # replacements; joining with "" then drops it.
        for chunk in text.split("None"):
            for code, label in code_labels:
                chunk = chunk.replace(code, label)
            decoded.append(chunk)
        return "".join(decoded)

    def first_value(info, key):
        """Return ``str(info[key][0])`` or '' when *key* is absent."""
        return str(info[key][0]) if key in info else ""

    def frequencies(lookup, groups, record):
        """Call ``lookup(code, record)`` per group, in order, keyed by label."""
        return dict((label, lookup(code, record)) for label, code in groups)

    deleterious_tolerated = (("D", "Deleterious"), ("T", "Tolerated"))
    lrt_labels = (("D", "Deleterious"), ("N", "Neutral"), ("U", "Unknown"))
    # NOTE(review): "netural" is kept as emitted before; it looks like a
    # typo for "neutral" but downstream consumers may match on it.
    assessor_labels = (("H", "high"), ("M", "medium"), ("L", "low"),
                       ("N", "netural"))
    taster_labels = (
        ("A", "disease causing automatic"),
        ("D", "disease causing"),
        ("N", "polymorphism (probably harmless)"),
        ("P", "polymorphism automatic (known to be harmless)"),
    )

    # FILTER value -> (flag key, description)
    filter_flags = {
        'q20': ('low_quality_score', 'Quality score less than 20'),
        'LowDP': ('low_DP',
                  'Low coverage, therefore no genotype called. Note different depth thresholds for variant and reference calls'),
        'SB': ('high_strand_bias', 'Variant strand bias too high'),
        'LowVariantFreq': ('low_variant_frequency',
                           'Variant frequency less than 0.0260'),
        'R8': ('indel_repeat', 'Indel repeat greater than or equal to 8'),
        'LowAQ': ('low_AQ',
                  'Variant artifact adjusted quality score less than 10'),
        'LowVarSupport': ('low_variant_support',
                          'Variant is supported by too few reads'),
    }

    # OncoKB fields whose underscores are turned into spaces.
    oncokb_fields = (
        ("action_cancer", "oncokb_action_cancer"),
        ("action_drugs", "oncokb_action_drugs"),
        ("action_level", "oncokb_action_level"),
        ("action_pmid", "oncokb_action_pmid"),
        ("oncogenicity", "oncokb_oncogenicity"),
        ("oncokb_hgvsp", "oncokb_hgvsp"),
    )

    g1000_groups = (("African", "AFR"), ("American", "AMR"),
                    ("East_Asian", "EAS"), ("European", "EUR"),
                    ("South_Asian", "SAS"), ("ALL", "ALL"))
    esp6500_groups = (("ALL", "ALL"), ("AA", "AA"), ("EA", "EA"))
    exac_groups = (("ALL", "ALL"),)
    gnomad_groups = (
        ("Finnish", "FIN"),
        ("South Asian", "SAS"),
        ("Admixed American", "AMR"),
        ("Non-Finnish European", "NFE"),
        ("East Asian", "EAS"),
        ("Others", "OTH"),
        ("African/African American", "AFR"),
        ("Ashkenazi Jewish", "ASJ"),
        ("ALL", "ALL"),
    )

    # classification text -> (radar, grade)
    grades = {
        "pathogenic": ("5", "A"),
        "likely pathogenic": ("4", "B"),
        "uncertain significance": ("3", "C"),
        "likely benign": ("2", "D"),
        "benign": ("1", "E"),
    }

    cosmic_fields = (
        ('cosmic_cnt', 'COSMIC_CNT'),
        ('cosmic_id', 'COSMIC_ID'),
        ('cosmic_occurrence', 'COSMIC_OCCURENCE'),
    )

    vcf_record = {}
    fasta = None
    try:
        with open(self._vcf_file, 'r') as vcf_handle:
            for i, record in enumerate(vcf.Reader(vcf_handle)):
                info = record.INFO
                if 'ANN' not in info:
                    continue

                # The id counts every VCF record, including skipped ones.
                vcf_contents = {'id': i + 1}

                ### allele
                allele = self.get_allele(record)
                vcf_contents['allele'] = allele

                ### build
                vcf_contents['build'] = {'ref_genome': 'GRCh37/hg19'}

                ### genomic coordinate
                vcf_contents['genomic_coordinate'] = {
                    'chromosome': record.CHROM,
                    'g.pos': record.POS,
                }

                ### gene
                vcf_contents['gene'] = self.get_gene_info(record, allele)

                ### in silico prediction
                in_silico = {}
                in_silico['FATHMM_prediction'] = {
                    'prediction': decode_codes(info, "dbNSFP_FATHMM_pred",
                                               deleterious_tolerated),
                }
                in_silico["GERP++"] = {
                    "RS_score": first_value(info, "dbNSFP_GERP___RS"),
                    "NR_score": first_value(info, "dbNSFP_GERP___NR"),
                }
                in_silico["LRT_prediction"] = {
                    "prediction": decode_codes(info, "dbNSFP_LRT_pred",
                                               lrt_labels),
                }
                in_silico["MutationAssessor_prediction"] = {
                    "prediction": decode_codes(
                        info, "dbNSFP_MutationAssessor_pred",
                        assessor_labels),
                }

                # The helpers return (score, text, radar).
                pp2_score, pp2_text, pp2_radar = self.polyphen2(info)
                in_silico['PolyPhen2'] = {
                    'score': str(pp2_score),
                    'radar': str(pp2_radar),
                    'text': str(pp2_text),
                    'prediction': '',
                }

                sift_score, sift_text, sift_radar = self.sift(info)
                in_silico["SIFT"] = {
                    'score': str(sift_score),
                    'radar': str(sift_radar),
                    'text': str(sift_text),
                    'prediction': decode_codes(info, "dbNSFP_SIFT_pred",
                                               deleterious_tolerated),
                }

                # The MutationTaster score is not part of the output.
                _, mt_text, mt_radar = self.mutationtaster(info)
                in_silico["mt"] = {
                    'radar': str(mt_radar),
                    'text': str(mt_text),
                    'prediction': decode_codes(
                        info, "dbNSFP_MutationTaster_pred", taster_labels),
                }
                vcf_contents['in_silico_prediction'] = in_silico

                ### Variant Flag
                flag = {}
                # check sequencing error
                if "LOW_CONFIDENCE" in info:
                    flag["low_confidence"] = str(info["LOW_CONFIDENCE"][0])
                # FILTER may be None when the VCF column is "."; unknown
                # filter names are ignored.
                for filter_name in record.FILTER or []:
                    if filter_name in filter_flags:
                        flag_key, description = filter_flags[filter_name]
                        flag[flag_key] = description
                vcf_contents["flag"] = flag

                # oncokb
                oncokb = {}
                for out_key, info_key in oncokb_fields:
                    oncokb[out_key] = first_value(info, info_key).replace(
                        "_", " ")
                oncokb["oncokb_tx"] = first_value(info, "oncokb_tx")
                vcf_contents["Precision Oncology Knowledge Base"] = oncokb

                # clinical (clinvar)
                clinical = {}
                if 'ngb_cv_rcv_acc' in info:
                    accessions = cleaned(
                        info["ngb_cv_rcv_acc"]).strip().split(',')
                    clinical['clinvar_accession'] = ",".join(
                        accessions).strip()
                    clinical['total_rcv'] = len(accessions)
                clinical['radar'] = str(
                    self.clinvar_clinical_significance(info))
                if 'ngb_cv_rcv_sig_reviewstatus' in info:
                    clinical['review_status'] = cleaned(
                        info["ngb_cv_rcv_sig_reviewstatus"]).replace(
                            "0x2C", "").replace("_", " ").strip()
                else:
                    clinical['review_status'] = "None"
                if 'ngb_cv_rcv_sig_description' in info:
                    clinical['classification'] = cleaned(
                        info["ngb_cv_rcv_sig_description"]).strip()
                if "ngb_cv_trait_preferred_name" in info:
                    clinical["trait_preferred_name"] = str(
                        info["ngb_cv_trait_preferred_name"][0]).replace(
                            "0x2C", "").replace("_", " ")
                if "ngb_cv_trait_link_omim" in info:
                    clinical["omim"] = str(
                        info["ngb_cv_trait_link_omim"][0]).replace("-", "")
                vcf_contents['clinical'] = clinical

                ### population frequency
                population = {}
                population['1000_genomes'] = frequencies(
                    self.g1000_pop_freq, g1000_groups, record)
                population['ESP6500'] = frequencies(
                    self.esp6500_pop_freq, esp6500_groups, record)
                population["ExAC"] = frequencies(
                    self.exac_pop_freq, exac_groups, record)

                # KoEXID: the whole INFO value is rendered, not element 0.
                koexid_af = ""
                if "KoEXID_AF" in info:
                    koexid_af = str(info["KoEXID_AF"])
                population["Korean Exome Information Database"] = {
                    "ALL": {
                        "code": "ALL",
                        "population": "All KoEXID",
                        "allele_frequency": koexid_af,
                    },
                }

                # KRGDB
                population["Korean Reference Genome DB"] = {
                    "ALL": {
                        "code": "ALL",
                        "population": "All KRGDB",
                        "allele_frequency": first_value(
                            info, "KRGDB1100_AF"),
                    },
                }

                population["gnomAD"] = frequencies(
                    self.gnomad_pop_freq, gnomad_groups, record)
                vcf_contents['population_frequency'] = population

                ### variant information
                if "TYPE" in info:
                    allele_type = str(
                        cleaned(info["TYPE"]).replace(
                            " ", "").strip().split(",")[0])
                else:
                    allele_type = ""

                ref_len = len(allele['reference'])
                alt_len = len(allele['alternate'])
                if allele_type == "ins":
                    variant_size = ref_len - 1
                else:
                    variant_size = ref_len - alt_len

                start = int(record.POS)
                stop = start + variant_size
                chrom = str(record.CHROM)

                variant = {}
                if "dbsnp_all_RS" in info:
                    variant["rs_id"] = "rs%s" % (str(info['dbsnp_all_RS']))
                else:
                    variant["rs_id"] = ""
                variant['start'] = start
                variant['stop'] = stop
                variant['exac_format'] = "{0}-{1}-{2}-{3}".format(
                    chrom.replace('chr', ''), record.POS,
                    allele['reference'], allele['alternate'])
                variant['type'] = allele_type

                # Open the reference lazily so a VCF without any usable
                # record never touches it.
                if fasta is None:
                    fasta = pysam.FastaFile(self._reference)
                variant['left_22_bp'] = fasta.fetch(
                    chrom, start=max(start - 23, 0), end=start - 1)
                variant['right_22_bp'] = fasta.fetch(
                    chrom, start=stop, end=stop + 22)

                variant["variant_type"] = first_value(info, "Variant_type")
                vcf_contents["variant_information"] = variant

                ### variant classifier
                if 'ngb_cv_rcv_sig_description' in info:
                    prediction = clinical['classification']
                else:
                    prediction = "uncertain significance"
                variant_classifier = {'result': prediction}
                # Other classification texts get neither radar nor grade.
                if prediction in grades:
                    radar, grade = grades[prediction]
                    variant_classifier['radar'] = radar
                    variant_classifier['grade'] = grade
                vcf_contents['variant_classifier'] = variant_classifier

                ### COSMIC
                cosmic_anno = {}
                for out_key, info_key in cosmic_fields:
                    if info_key in info:
                        cosmic_anno[out_key] = cleaned(
                            info[info_key]).strip()
                    else:
                        cosmic_anno[out_key] = ""
                vcf_contents[
                    'Catalogue Of Somatic Mutations In Cancer'] = cosmic_anno

                variant_name = "GRCh37-%s-%s-%s-%s" % (
                    record.CHROM, start, allele['reference'],
                    allele['alternate'])
                vcf_record[variant_name] = vcf_contents
    finally:
        if fasta is not None:
            fasta.close()

    # Append mode is kept: content already in the file is preserved.
    with open(self._json_file, 'a') as json_out:
        for key in sorted(vcf_record, key=str.lower):
            json_out.write(
                json.dumps({key: vcf_record[key]}, sort_keys=True) + '\n')

    run_command_md5sum(__modname__, self._log_file, self._json_file,
                       self._md5_file)