def run(self):
    """Run GATK 3.6 MuTect2 on a single BAM and write the VCF from output().

    The ``Somatic`` entry of ``self.infodict`` selects the extra option:
    ``N`` adds nothing, ``T`` adds `` --tumor_lod 4``.

    Raises:
        Exception: if ``Somatic`` is neither ``N`` nor ``T``.
    """
    vcf_path = self.output().path
    valid_path(vcf_path, check_ofile=1)
    bam_path = self.input().path

    somatic_type = self.infodict["Somatic"]
    # Extra MuTect2 option per sample kind: "N" = normal only, "T" = tumor only.
    extra_by_kind = {"N": '', "T": ' --tumor_lod 4'}
    if somatic_type not in extra_by_kind:
        raise Exception("Unknown values of Somatic columns (like '%s' )" % somatic_type)
    extra_str = extra_by_kind[somatic_type]

    # Both normal and tumor samples are passed through --input_file:tumor.
    template = '''java {java_option} -jar {gatk} -T MuTect2 --allSitePLs --artifact_detection_mode -R {REF} --cosmic {cosmic} --dbsnp {db_snp} --input_file:tumor {input_f} --out {output_f} --bamOutput {prefix}.bam --log_to_file {prefix}.log {extra_str}'''
    cmdline = template.format(
        gatk=config.gatkv36_path,
        java_option=config.java_option,
        REF=config.REF_file_path,
        cosmic=config.cos_snp,
        db_snp=config.db_snp,
        input_f=bam_path,
        output_f=vcf_path,
        prefix=vcf_path.replace('.vcf', ''),
        extra_str=extra_str,
    )
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())

    if self.dry_run:
        # In a dry run the output file is still touched for real.
        run_cmd("touch %s" % vcf_path, dry_run=False)
def run(self):
    """Run GATK4 Mutect2 on a normal/tumor BAM pair.

    Reads the ``normal`` and ``tumor`` entries of ``self.input()`` and the
    ``SampleID`` of ``self.infodict_N`` / ``self.infodict_T``. When
    ``config.bed_file_path`` is truthy the call is restricted with
    ``--intervals``. Writes ``<prefix>.vcf`` and ``<prefix>.bam`` where
    ``<prefix>`` is the output path with ``.vcf`` removed.
    """
    vcf_path = self.output().path
    valid_path(vcf_path, check_ofile=1)
    prefix = vcf_path.replace('.vcf', "")

    inputs = self.input()
    input_normal = inputs["normal"].path
    input_tumor = inputs["tumor"].path

    extra_str = (" --intervals %s" % config.bed_file_path) if config.bed_file_path else ''

    normal_name = self.infodict_N["SampleID"]
    tumor_name = self.infodict_T["SampleID"]

    # NOTE(review): the template names --all-site-pls twice and never uses
    # the ``cosmic`` field; both are kept exactly as they were.
    template = "{gatk4} Mutect2 --java-options '-Xmx20g' --native-pair-hmm-threads 20 --reference {REF} -I {input_normal} -normal {N_name} -I {input_tumor} -tumor {T_name} --dbsnp {db_snp} --seconds-between-progress-updates 60 --all-site-pls -stand-call-conf 10 -A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality -A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest -A ReadPosRankSumTest -A ChromosomeCounts --all-site-pls true --output {prefix}.vcf -bamout {prefix}.bam {extra_str} "
    cmdline = template.format(
        REF=config.REF_file_path,
        cosmic=config.cos_snp,
        db_snp=config.db_snp,
        input_tumor=input_tumor,
        input_normal=input_normal,
        gatk4=config.gatk_pro,
        N_name=normal_name,
        T_name=tumor_name,
        prefix=prefix,
        extra_str=extra_str,
    )
    # The main command logs to the normal sample's ``log_path`` entry (if any).
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.infodict_N.get("log_path", None))

    if self.dry_run:
        # In a dry run the output file is still touched for real.
        run_cmd("touch %s" % vcf_path, dry_run=False, log_file=self.get_log_path())
def output(self):
    """Return the two CSV targets written under the filtered-results directory.

    The directory is ``config.filtered_dir`` formatted with the task's
    ``odir`` and ``source_name``; the targets are
    ``<source_name>_except_AF_depth.csv`` and
    ``<source_name>_except_AF_depth_PASS.csv``, in that order.
    """
    odir = self.infodict["odir"]
    source_name = self.infodict["source_name"]

    filter_path = config.filtered_dir.format(path=odir, PairN=source_name)
    valid_path(filter_path, check_odir=1)

    suffixes = ('_except_AF_depth.csv', '_except_AF_depth_PASS.csv')
    return [
        luigi.LocalTarget(join(filter_path, source_name + suffix))
        for suffix in suffixes
    ]
def output(self):
    """Return the target for ``<source_name>_final.vcf``.

    The file lives in ``config.filtered_dir`` formatted with the task's
    ``odir`` and ``source_name``.
    """
    info = self.infodict
    odir = info["odir"]
    source_name = info["source_name"]

    filter_path = config.filtered_dir.format(path=odir, PairN=source_name)
    valid_path(filter_path, check_odir=1)

    return luigi.LocalTarget(join(filter_path, source_name + '_final.vcf'))
def output(self):
    """Return the target for the PCGR HTML report.

    The report is ``<source_name>.pcgr_acmg.grch37.html`` inside a
    ``pcgr_output`` directory placed next to this task's input file.
    """
    odir = join(dirname(self.input().path), "pcgr_output")
    valid_path(odir, check_odir=1)

    report_name = "{}.pcgr_acmg.grch37.html".format(self.infodict["source_name"])
    # todo: convert ref_path to grch37??
    return luigi.LocalTarget(join(odir, report_name))
def run(self):
    """Merge the ``indel`` and ``snp`` input VCFs with GATK4 MergeVcfs."""
    merged_vcf = self.output().path
    valid_path(merged_vcf, check_ofile=1)

    inputs = self.input()
    fields = {
        "gatk4": config.gatk_pro,
        "REF": config.REF_file_path,
        "input_indel": inputs["indel"].path,
        "input_snp": inputs["snp"].path,
        "output_f": merged_vcf,
    }
    template = """{gatk4} MergeVcfs --java-options "-Xmx4g" -R {REF} --INPUT {input_indel} --INPUT {input_snp} --OUTPUT {output_f}"""
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())
def run(self):
    """Combine the ``indel`` and ``snp`` input VCFs with GATK 3.6 CombineVariants."""
    combined_vcf = self.output().path
    valid_path(combined_vcf, check_ofile=1)

    inputs = self.input()
    fields = {
        "gatk": config.gatkv36_path,
        "REF": config.REF_file_path,
        "input_indel": inputs["indel"].path,
        "input_snp": inputs["snp"].path,
        "output_f": combined_vcf,
    }
    template = "java -Xmx4g -jar {gatk} -T CombineVariants -R {REF} --variant:indel {input_indel} --variant:snp {input_snp} --interval_padding 25 --out {output_f} --setKey set --genotypemergeoption UNSORTED"
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())

    if self.dry_run:
        # In a dry run the output file is still touched for real.
        run_cmd("touch %s" % combined_vcf, dry_run=False)
def run(self):
    """Run PCGR (no-docker mode, grch37) inside the ``pcgr`` conda environment.

    The VCF handed to PCGR is this task's input path with ``.vcf`` replaced
    by ``.vt.vcf``; results go to the directory holding the output target.
    """
    fields = {
        "input_vcf": self.input().path.replace('.vcf', '.vt.vcf'),
        "output_dir": dirname(self.output().path),
        "source_name": self.infodict["source_name"],
    }
    valid_path(fields["output_dir"], check_odir=1)

    # todo: convert ref_path to grch37??
    fields["pcgr_dir"] = config.pcgr_dir
    fields["toml_config"] = config.pcgr_toml_file
    template = "source activate pcgr; python3 {pcgr_dir}/pcgr.py --input_vcf {input_vcf} {pcgr_dir} {output_dir} grch37 {toml_config} {source_name} --no-docker --force_overwrite"
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())
def run(self):
    """Call germline variants on the input BAM with GATK4 HaplotypeCaller.

    ``--intervals`` is appended unless ``config.bed_file_path`` equals the
    empty string.
    """
    vcf_path = self.output().path
    valid_path(vcf_path, check_ofile=1)

    extra_str = "" if config.bed_file_path == '' else " --intervals {}".format(config.bed_file_path)

    fields = {
        "ref": config.REF_file_path,
        "input": self.input().path,
        "dbsnp": config.db_snp,
        "output": vcf_path,
        "extra_str": extra_str,
        "gatk4": config.gatk_pro,
    }
    template = "{gatk4} HaplotypeCaller --java-options '-Xmx30g' --native-pair-hmm-threads 30 --reference {ref} --input {input} --genotyping-mode DISCOVERY --dbsnp {dbsnp} -stand-call-conf 10 -A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality -A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest -A ReadPosRankSumTest -A ChromosomeCounts --all-site-pls true --output {output} {extra_str}"
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())
def run(self):
    """Extract SNPs or indels from the input VCF with GATK4 SelectVariants.

    ``self.object_type`` must be ``snp`` or ``indel``.

    Raises:
        Exception: for any other ``object_type``.
    """
    selected_vcf = self.output().path
    valid_path(selected_vcf, check_ofile=1)

    # Map the task's object_type onto the -select-type value.
    select_types = {"snp": "SNP", "indel": "INDEL"}
    if self.object_type not in select_types:
        raise Exception
    selecttype = select_types[self.object_type]

    template = "{gatk4} SelectVariants --java-options '-Xmx4g' -R {REF} -V {input_f} -select-type {selecttype} -O {output_f}"
    cmdline = template.format(
        gatk4=config.gatk_pro,
        REF=config.REF_file_path,
        input_f=self.input().path,
        output_f=selected_vcf,
        selecttype=selecttype,
    )
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
def run(self):
    """Hard-filter the input VCF with GATK4 VariantFiltration.

    The filter expression depends on ``self.object_type`` (``snp`` or
    ``indel``); the filter is named ``my_<object_type>_filter``.

    Raises:
        Exception: for any other ``object_type``.
    """
    filtered_vcf = self.output().path
    valid_path(filtered_vcf, check_ofile=1)

    # Hard-filter thresholds per variant class.
    expressions = {
        "snp": "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0",
        "indel": "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0",
    }
    if self.object_type not in expressions:
        raise Exception
    filterExpression = expressions[self.object_type]

    template = """{gatk4} VariantFiltration --java-options '-Xmx4g' -R {REF} -V {input_f} --filter-expression "{filterExpression}" --filter-name \"my_{object_type}_filter\" -O {output_f}"""
    cmdline = template.format(
        gatk4=config.gatk_pro,
        REF=config.REF_file_path,
        input_f=self.input().path,
        output_f=filtered_vcf,
        filterExpression=filterExpression,
        object_type=self.object_type,
    )
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
def run(self):
    """Extract SNPs or indels from the input VCF with GATK 3.6 SelectVariants.

    ``self.object_type`` must be ``snp`` or ``indel``.

    Raises:
        Exception: for any other ``object_type``.
    """
    selected_vcf = self.output().path
    valid_path(selected_vcf, check_ofile=1)

    # Map the task's object_type onto the -selectType value.
    select_types = {"snp": "SNP", "indel": "INDEL"}
    if self.object_type not in select_types:
        raise Exception
    selecttype = select_types[self.object_type]

    template = "java -Xmx4g -jar {gatk} -T SelectVariants -R {REF} -V {input_f} -selectType {selecttype} -o {output_f}"
    cmdline = template.format(
        gatk=config.gatkv36_path,
        REF=config.REF_file_path,
        input_f=self.input().path,
        output_f=selected_vcf,
        selecttype=selecttype,
    )
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())

    if self.dry_run:
        # In a dry run the output file is still touched for real.
        run_cmd("touch %s" % selected_vcf, dry_run=False)
def run(self):
    """Hard-filter the input VCF with GATK 3.6 VariantFiltration.

    The filter expression depends on ``self.object_type`` (``snp`` or
    ``indel``); the filter is named ``my_<object_type>_filter``.

    Raises:
        Exception: for any other ``object_type``.
    """
    filtered_vcf = self.output().path
    valid_path(filtered_vcf, check_ofile=1)

    # Hard-filter thresholds per variant class.
    expressions = {
        "snp": "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0",
        "indel": "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0",
    }
    if self.object_type not in expressions:
        raise Exception
    filterExpression = expressions[self.object_type]

    template = """java -Xmx4g -jar {gatk} -T VariantFiltration -R {REF} -V {input_f} --filterExpression "{filterExpression}" --filterName \"my_{object_type}_filter\" -o {output_f}"""
    cmdline = template.format(
        gatk=config.gatkv36_path,
        REF=config.REF_file_path,
        input_f=self.input().path,
        output_f=filtered_vcf,
        filterExpression=filterExpression,
        object_type=self.object_type,
    )
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())

    if self.dry_run:
        # In a dry run the output file is still touched for real.
        run_cmd("touch %s" % filtered_vcf, dry_run=False)
def run(self):
    """Call germline variants on the input BAM with GATK 3.6 HaplotypeCaller.

    ``-L <bed>`` is added unless ``config.bed_file_path`` equals the empty
    string.
    """
    vcf_path = self.output().path
    valid_path(vcf_path, check_ofile=1)

    extra_str = '' if config.bed_file_path == '' else "-L %s" % config.bed_file_path

    fields = {
        "gatk": config.gatkv36_path,
        "gatk_thread": config.gatk_thread,
        "REF": config.REF_file_path,
        "input": self.input().path,
        "extra_str": extra_str,
        "db_snp": config.db_snp,
        "output_f": vcf_path,
    }
    template = "java -Xmx4g -jar {gatk} -T HaplotypeCaller -nct {gatk_thread} -R {REF} -I {input} {extra_str} --genotyping_mode DISCOVERY --dbsnp {db_snp} -stand_call_conf 10 -stand_emit_conf 5 -A AlleleBalance -A Coverage -A FisherStrand -o {output_f}"
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())

    if self.dry_run:
        # In a dry run the output file is still touched for real.
        run_cmd("touch %s" % vcf_path, dry_run=False)
def run(self):
    """Run GATK 3.6 MuTect2 on a normal/tumor BAM pair.

    Besides the output VCF, MuTect2 is asked to write ``<prefix>.bam`` and
    ``<prefix>.log`` where ``<prefix>`` is the output path with ``.vcf``
    removed.
    """
    vcf_path = self.output().path
    valid_path(vcf_path, check_ofile=1)
    prefix = vcf_path.replace('.vcf', "")

    inputs = self.input()
    fields = {
        "gatk": config.gatkv36_path,
        "java_option": config.java_option,
        "REF": config.REF_file_path,
        "cosmic": config.cos_snp,
        "db_snp": config.db_snp,
        "input_tumor": inputs["tumor"].path,
        "input_normal": inputs["normal"].path,
        "prefix": prefix,
        "output_f": vcf_path,
    }
    template = '''java {java_option} -jar {gatk} -T MuTect2 --allSitePLs -R {REF} --cosmic {cosmic} --dbsnp {db_snp} --input_file:normal {input_normal} --input_file:tumor {input_tumor} --out {output_f} --bamOutput {prefix}.bam --log_to_file {prefix}.log'''
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())

    if self.dry_run:
        # In a dry run the output file is still touched for real.
        run_cmd("touch %s" % vcf_path, dry_run=False)
def run(self):
    """Run GATK4 Mutect2 on a single BAM (normal-only or tumor-only sample).

    The input BAM is passed with ``-I`` and the sample's ``SampleID`` with
    ``-tumor``. When ``config.bed_file_path`` is truthy the call is
    restricted with ``--intervals``, as in the paired Mutect2 task.
    Previously the interval option was built but never reached the command
    line, because it was overwritten and the template had no placeholder
    for it.

    Raises:
        Exception: if the ``Somatic`` entry of ``self.infodict`` is neither
            ``N`` nor ``T``.
    """
    vcf_path = self.output().path
    valid_path(vcf_path, check_ofile=1)
    input_f = self.input().path
    sample_name = self.infodict["SampleID"]

    if config.bed_file_path:
        extra_str = " --intervals %s" % config.bed_file_path
    else:
        extra_str = ""

    somatic_type = self.infodict["Somatic"]
    if somatic_type not in ("N", "T"):
        raise Exception("Unknown values of Somatic columns (like '%s' )" % somatic_type)
    # NOTE(review): tumor-only samples used to assign ' --tumor_lod 4' here,
    # but that value never reached the command. It is not added now because
    # the spelling is the GATK3 MuTect2 one -- confirm which GATK4 option
    # (if any) should replace it.

    template = "{gatk4} Mutect2 --java-options '-Xmx20g' --native-pair-hmm-threads 20 --reference {REF} -I {input_tumor} -tumor {T_name} --dbsnp {db_snp} --seconds-between-progress-updates 60 --all-site-pls -stand-call-conf 10 -A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality -A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest -A ReadPosRankSumTest -A ChromosomeCounts --all-site-pls true --output {prefix}.vcf -bamout {prefix}.bam"
    # Appending (not templating) keeps the command byte-identical to the
    # previous one when no bed file is configured.
    cmdline = template.format(
        gatk4=config.gatk_pro,
        REF=config.REF_file_path,
        db_snp=config.db_snp,
        input_tumor=input_f,
        prefix=vcf_path.replace('.vcf', ''),
        T_name=sample_name,
    ) + extra_str
    # The main command logs to the sample's ``log_path`` entry (if any).
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.infodict.get("log_path", None))

    if self.dry_run:
        # In a dry run the output file is still touched for real.
        run_cmd("touch %s" % vcf_path, dry_run=False, log_file=self.get_log_path())
def add_per_info(result_csvs, output_csvs, tumor_bam, normal_bam, bed_file):
    """Annotate variant tables with per-sample figures computed from BAM files.

    For every (input, output) pair the columns listed in the module-level
    ``added_col`` are added to the table. Rows whose ``Start``..``End`` span
    shares no position with any interval of ``bed_file`` get ``'Off target'``
    in all of those columns. For the other rows the last three ``added_col``
    columns are filled from the tumor BAM and the first three from the
    normal BAM, using the three values returned by ``parse_bam``.

    :param result_csvs: input CSV paths; each table needs the columns
        ``Ref``, ``Alt``, ``Chr``, ``Start`` and ``End``.
    :param output_csvs: output CSV paths, paired by position with
        ``result_csvs``.
    :param tumor_bam: tumor BAM path; a falsy value skips the tumor columns.
    :param normal_bam: normal BAM path; a falsy value skips the normal columns.
    :param bed_file: headerless tab-separated file; its 2nd and 3rd columns
        are read as interval start and (exclusive) end.
    :return: None. Each annotated table is written to its output path.
    """
    fasta = config.REF_file_path
    print('{:#^40}'.format('Start Whole project...'))
    t1 = time.time()

    bed_df = pd.read_csv(bed_file, sep='\t', header=None)
    # Every position covered by the bed file, as a set so the per-variant
    # overlap test below does not rescan the whole collection.
    # NOTE(review): only positions are stored -- the bed chromosome column is
    # never read, so a variant matches an interval on any chromosome.
    target_positions = set()
    for _idx in tqdm(range(bed_df.shape[0])):
        target_positions.update(range(bed_df.iloc[_idx, 1], bed_df.iloc[_idx, 2]))

    # Open a BAM only when it was supplied, so the ``if tumor_bam`` /
    # ``if normal_bam`` guards below can actually be reached without it.
    tb = pysam.AlignmentFile(tumor_bam) if tumor_bam else None
    nb = pysam.AlignmentFile(normal_bam) if normal_bam else None
    ref_ = pysam.FastaFile(fasta)
    try:
        for result_csv, output_csv in zip(result_csvs, output_csvs):
            # realpath() alone leaves a leading "~" untouched; expand it first.
            result_csv = os.path.realpath(os.path.expanduser(result_csv))
            output_csv = os.path.realpath(os.path.expanduser(output_csv))
            ori_csv = pd.read_csv(result_csv, index_col=None)
            t2 = time.time()
            print('{:#^40}'.format('Loaded/Inited all required file...... Using %d ' % (t2 - t1)))
            print('{:#^40}'.format('Star Iteration.......'))

            for col in added_col:
                # Initialise (or reset) each annotation column.
                ori_csv.loc[:, col] = 0

            for _index, row in tqdm(ori_csv.iterrows(), total=ori_csv.shape[0]):
                Ref = row['Ref']
                Alt = row['Alt']
                Chr = row['Chr']
                Pos = int(row['Start'])
                End = int(row['End'])

                # A variant counts as on target when any position of its
                # span (End inclusive) lies in a bed interval, so partial
                # overlaps are accepted.
                if target_positions.isdisjoint(range(Pos, End + 1)):
                    ori_csv.loc[_index, added_col] = 'Off target'
                    continue

                if tumor_bam:
                    mut_cov, mut_per, ref_cov = parse_bam(tb, Chr, Pos, End, Ref, Alt, ref_)
                    ori_csv.loc[_index, added_col[-3:]] = mut_cov, mut_per, ref_cov
                if normal_bam:
                    mut_cov, mut_per, ref_cov = parse_bam(nb, Chr, Pos, End, Ref, Alt, ref_)
                    ori_csv.loc[_index, added_col[:3]] = mut_cov, mut_per, ref_cov

            print('{:#^40}'.format('Almost Completing. Iteration used %d.' % (time.time() - t2)))
            print('{:#^40}'.format('filtering all unconvinced snp/indel.'))
            # No rows are filtered out here: the full annotated table is written.
            valid_path(output_csv, check_ofile=1)
            with open(output_csv, 'w') as f1:
                ori_csv.to_csv(f1, index=False)
    finally:
        # Release the pysam handles even if a table fails part-way through.
        for handle in (tb, nb, ref_):
            if handle is not None:
                handle.close()