def run(self):
    """Call somatic variants on a tumor/normal BAM pair with GATK4 Mutect2.

    Reads the "normal" and "tumor" inputs and writes ``<prefix>.vcf`` and
    ``<prefix>.bam``, where ``<prefix>`` is the output path with every
    ``.vcf`` removed. Calling is limited to ``config.bed_file_path`` when
    that setting is truthy. In dry-run mode the output file is touched
    after the (dry) command.
    """
    valid_path(self.output().path, check_ofile=1)
    prefix = self.output().path.replace('.vcf', "")
    input_normal = self.input()["normal"].path
    input_tumor = self.input()["tumor"].path
    # Only add an interval restriction when a BED file is configured.
    if config.bed_file_path:
        extra_str = " --intervals %s" % config.bed_file_path
    else:
        extra_str = ''
    normal_name = self.infodict_N["SampleID"]
    tumor_name = self.infodict_T["SampleID"]
    # '--all-site-pls' is given exactly once (as '--all-site-pls true'); the
    # command used to carry an additional bare '--all-site-pls', i.e. the
    # same option twice on one command line.
    cmdline = (
        "{gatk4} Mutect2 --java-options '-Xmx20g' --native-pair-hmm-threads 20 --reference {REF} "
        "-I {input_normal} -normal {N_name} -I {input_tumor} -tumor {T_name} "
        "--dbsnp {db_snp} --seconds-between-progress-updates 60 -stand-call-conf 10 "
        "-A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality -A QualByDepth "
        "-A RMSMappingQuality -A MappingQualityRankSumTest -A ReadPosRankSumTest -A ChromosomeCounts "
        "--all-site-pls true --output {prefix}.vcf -bamout {prefix}.bam {extra_str} "
    ).format(
        REF=config.REF_file_path,
        db_snp=config.db_snp,
        input_tumor=input_tumor,
        input_normal=input_normal,
        gatk4=config.gatk_pro,
        N_name=normal_name,
        T_name=tumor_name,
        prefix=prefix,
        extra_str=extra_str)
    # NOTE(review): the main command logs to the normal sample's "log_path"
    # entry while the dry-run touch uses get_log_path(); kept as found.
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.infodict_N.get("log_path", None))
    if self.dry_run:
        run_cmd("touch %s" % self.output().path, dry_run=False, log_file=self.get_log_path())
def run(self):
    """Pipe the input VCF through ``vt decompose -s`` and ``vt normalize``.

    The normalized result is redirected into this task's output path.
    """
    template = "{vt} decompose -s {input_vcf} | {vt} normalize -r {REF} - > {vt_vcf}"
    fields = {
        "vt": config.vt_pro,
        "input_vcf": self.input().path,
        "REF": config.REF_file_path,
        "vt_vcf": self.output().path,
    }
    run_cmd(template.format(**fields), dry_run=self.dry_run)
def run(self):
    """Run GATK3 MuTect2 in artifact-detection mode on a single sample.

    The sample's "Somatic" value selects the extra arguments: "N" adds
    nothing, "T" adds ``--tumor_lod 4``. Any other value raises.

    Raises:
        Exception: if the "Somatic" value is neither "N" nor "T".
    """
    valid_path(self.output().path, check_ofile=1)
    bam_path = self.input().path
    sample_kind = self.infodict["Somatic"]

    # "N" = normal only, "T" = tumor only.
    extra_by_kind = {"N": '', "T": ' --tumor_lod 4'}
    if sample_kind not in extra_by_kind:
        raise Exception("Unknown values of Somatic columns (like '%s' )" % sample_kind)
    extra_args = extra_by_kind[sample_kind]

    # Normal and tumor samples are both passed as --input_file:tumor.
    template = (
        'java {java_option} -jar {gatk} -T MuTect2 --allSitePLs --artifact_detection_mode '
        '-R {REF} --cosmic {cosmic} --dbsnp {db_snp} --input_file:tumor {input_f} '
        '--out {output_f} --bamOutput {prefix}.bam --log_to_file {prefix}.log {extra_str}'
    )
    fields = {
        "gatk": config.gatkv36_path,
        "java_option": config.java_option,
        "REF": config.REF_file_path,
        "cosmic": config.cos_snp,
        "db_snp": config.db_snp,
        "input_f": bam_path,
        "output_f": self.output().path,
        "prefix": self.output().path.replace('.vcf', ''),
        "extra_str": extra_args,
    }
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())
    if self.dry_run:
        run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Merge the "indel" and "snp" input VCFs with GATK4 MergeVcfs."""
    valid_path(self.output().path, check_ofile=1)
    template = (
        '{gatk4} MergeVcfs --java-options "-Xmx4g" -R {REF} '
        '--INPUT {input_indel} --INPUT {input_snp} --OUTPUT {output_f}'
    )
    fields = {
        "gatk4": config.gatk_pro,
        "REF": config.REF_file_path,
        "input_indel": self.input()["indel"].path,
        "input_snp": self.input()["snp"].path,
        "output_f": self.output().path,
    }
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())
def run(self):
    """Combine the "indel" and "snp" input VCFs with GATK3 CombineVariants.

    In dry-run mode the output file is touched after the (dry) command.
    """
    valid_path(self.output().path, check_ofile=1)
    template = (
        "java -Xmx4g -jar {gatk} -T CombineVariants -R {REF} "
        "--variant:indel {input_indel} --variant:snp {input_snp} "
        "--interval_padding 25 --out {output_f} --setKey set --genotypemergeoption UNSORTED"
    )
    fields = {
        "gatk": config.gatkv36_path,
        "REF": config.REF_file_path,
        "input_indel": self.input()["indel"].path,
        "input_snp": self.input()["snp"].path,
        "output_f": self.output().path,
    }
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())
    if self.dry_run:
        run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Run ``api/add_per_info_into_csv.py`` on every "annovar" CSV input.

    For each input CSV the output name is the input path with every
    ``.csv`` replaced by ``_added_cov.csv``; the tumor and normal BAM
    paths are handed to the script via ``-tb`` / ``-nb``.
    """
    bam_normal = self.input()["normal"].path
    bam_tumor = self.input()["tumor"].path
    for csv_target in self.input()["annovar"]:
        src_csv = csv_target.path
        dst_csv = src_csv.replace('.csv', '_added_cov.csv')
        command = (
            "python3 {root}/api/add_per_info_into_csv.py "
            "-i {src} -o {dst} -tb {tb} -nb {nb}"
        ).format(root=project_root_path, src=src_csv, dst=dst_csv, tb=bam_tumor, nb=bam_normal)
        run_cmd(command, log_file=self.get_log_path(), dry_run=self.dry_run)
def run(self):
    """Load each VEP-annotated VCF into a GEMINI database and annotate it.

    Outputs and inputs are paired positionally. For every pair, ``gemini
    load`` builds the database and ``gemini annotate`` extracts the listed
    INFO fields; the annotate step's stdout/stderr is appended to a log
    file named after the database (``.db`` replaced by ``.log``).
    """
    # Backslash-newline inside the literal joins the lines into one shell command.
    gemini_template = """{gemini} load --cores {threads} -t VEP -v {vep_output_vcf_gz} {Output_db}; \
        {gemini} annotate -f {vep_output_vcf_gz} -a extract \
        -c SAD,SAF,AF,AD,BaseQRankSum,FS,MQRankSum,ReadPosRankSum,SOR \
        -t text,float,float,text,float,float,float,float,float \
        -o list,list,list,list,mean,mean,mean,mean,mean {Output_db} >> {gemini_log} 2>&1"""
    for db_target, vcf_target in zip(self.output(), self.input()):
        fields = {
            "gemini": config.gemini_pro,
            "threads": config.gemini_thread,
            "vep_output_vcf_gz": vcf_target.path,
            "Output_db": db_target.path,
            "gemini_log": db_target.path.replace('.db', ".log"),
        }
        run_cmd(gemini_template.format(**fields), dry_run=self.dry_run)
def run(self):
    """Run PCGR (no-docker mode, grch37) on the vt-normalized VCF.

    The VCF handed to PCGR is the input path with every ``.vcf`` replaced
    by ``.vt.vcf``; results go to the directory of this task's output.
    """
    vt_vcf = self.input().path.replace('.vcf', '.vt.vcf')
    result_dir = dirname(self.output().path)
    sample_source = self.infodict["source_name"]
    valid_path(result_dir, check_odir=1)
    # TODO: convert ref_path to grch37??
    template = (
        "source activate pcgr; python3 {pcgr_dir}/pcgr.py --input_vcf {input_vcf} "
        "{pcgr_dir} {output_dir} grch37 {toml_config} {source_name} "
        "--no-docker --force_overwrite"
    )
    fields = {
        "pcgr_dir": config.pcgr_dir,
        "input_vcf": vt_vcf,
        "output_dir": result_dir,
        "toml_config": config.pcgr_toml_file,
        "source_name": sample_source,
    }
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())
def run(self):
    """Call germline variants on the input BAM with GATK4 HaplotypeCaller.

    ``--intervals`` is appended only when ``config.bed_file_path`` is not
    the empty string.
    """
    valid_path(self.output().path, check_ofile=1)
    bed = config.bed_file_path
    interval_args = " --intervals {}".format(config.bed_file_path) if bed != '' else ""
    template = (
        "{gatk4} HaplotypeCaller --java-options '-Xmx30g' --native-pair-hmm-threads 30 "
        "--reference {ref} --input {input} --genotyping-mode DISCOVERY --dbsnp {dbsnp} "
        "-stand-call-conf 10 "
        "-A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality -A QualByDepth "
        "-A RMSMappingQuality -A MappingQualityRankSumTest -A ReadPosRankSumTest -A ChromosomeCounts "
        "--all-site-pls true --output {output} {extra_str}"
    )
    fields = {
        "ref": config.REF_file_path,
        "input": self.input().path,
        "dbsnp": config.db_snp,
        "output": self.output().path,
        "extra_str": interval_args,
        "gatk4": config.gatk_pro,
    }
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())
def run(self):
    """Extract SNPs or indels from the input VCF with GATK4 SelectVariants.

    ``self.object_type`` picks the variant class: "snp" -> ``SNP``,
    "indel" -> ``INDEL``.

    Raises:
        Exception: if ``self.object_type`` is neither "snp" nor "indel".
    """
    valid_path(self.output().path, check_ofile=1)
    if self.object_type == "snp":
        selecttype = "SNP"
    elif self.object_type == "indel":
        selecttype = "INDEL"
    else:
        # Same exception type as before, now with a message naming the bad value.
        raise Exception(
            "Unknown object_type %r (expected 'snp' or 'indel')" % (self.object_type,))
    cmdline = "{gatk4} SelectVariants --java-options '-Xmx4g' -R {REF} -V {input_f} -select-type {selecttype} -O {output_f}".format(
        gatk4=config.gatk_pro,
        REF=config.REF_file_path,
        input_f=self.input().path,
        output_f=self.output().path,
        selecttype=selecttype)
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
def run(self):
    """Apply hard filters to the input VCF with GATK4 VariantFiltration.

    ``self.object_type`` ("snp" or "indel") selects the filter expression;
    the filter is named ``my_<object_type>_filter``.

    Raises:
        Exception: if ``self.object_type`` is neither "snp" nor "indel".
    """
    valid_path(self.output().path, check_ofile=1)
    if self.object_type == "snp":
        filterExpression = "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"
    elif self.object_type == "indel":
        filterExpression = "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
    else:
        # Same exception type as before, now with a message naming the bad value.
        raise Exception(
            "Unknown object_type %r (expected 'snp' or 'indel')" % (self.object_type,))
    cmdline = """{gatk4} VariantFiltration --java-options '-Xmx4g' -R {REF} -V {input_f} --filter-expression "{filterExpression}" --filter-name "my_{object_type}_filter" -O {output_f}""".format(
        gatk4=config.gatk_pro,
        REF=config.REF_file_path,
        input_f=self.input().path,
        output_f=self.output().path,
        filterExpression=filterExpression,
        object_type=self.object_type)
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
def run(self):
    """Apply hard filters to the input VCF with GATK3 VariantFiltration.

    ``self.object_type`` ("snp" or "indel") selects the filter expression;
    the filter is named ``my_<object_type>_filter``. In dry-run mode the
    output file is touched after the (dry) command.

    Raises:
        Exception: if ``self.object_type`` is neither "snp" nor "indel".
    """
    valid_path(self.output().path, check_ofile=1)
    if self.object_type == "snp":
        filterExpression = "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"
    elif self.object_type == "indel":
        filterExpression = "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
    else:
        # Same exception type as before, now with a message naming the bad value.
        raise Exception(
            "Unknown object_type %r (expected 'snp' or 'indel')" % (self.object_type,))
    cmdline = """java -Xmx4g -jar {gatk} -T VariantFiltration -R {REF} -V {input_f} --filterExpression "{filterExpression}" --filterName "my_{object_type}_filter" -o {output_f}""".format(
        gatk=config.gatkv36_path,
        REF=config.REF_file_path,
        input_f=self.input().path,
        output_f=self.output().path,
        filterExpression=filterExpression,
        object_type=self.object_type)
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
    if self.dry_run:
        run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Extract SNPs or indels from the input VCF with GATK3 SelectVariants.

    ``self.object_type`` picks the variant class: "snp" -> ``SNP``,
    "indel" -> ``INDEL``. In dry-run mode the output file is touched after
    the (dry) command.

    Raises:
        Exception: if ``self.object_type`` is neither "snp" nor "indel".
    """
    valid_path(self.output().path, check_ofile=1)
    if self.object_type == "snp":
        selecttype = "SNP"
    elif self.object_type == "indel":
        selecttype = "INDEL"
    else:
        # Same exception type as before, now with a message naming the bad value.
        raise Exception(
            "Unknown object_type %r (expected 'snp' or 'indel')" % (self.object_type,))
    cmdline = "java -Xmx4g -jar {gatk} -T SelectVariants -R {REF} -V {input_f} -selectType {selecttype} -o {output_f}".format(
        gatk=config.gatkv36_path,
        REF=config.REF_file_path,
        input_f=self.input().path,
        output_f=self.output().path,
        selecttype=selecttype)
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
    if self.dry_run:
        run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Call germline variants on the input BAM with GATK3 HaplotypeCaller.

    ``-L <bed>`` is inserted only when ``config.bed_file_path`` is not the
    empty string. In dry-run mode the output file is touched after the
    (dry) command.
    """
    valid_path(self.output().path, check_ofile=1)
    bed = config.bed_file_path
    interval_args = "-L %s" % config.bed_file_path if bed != '' else ''
    template = (
        "java -Xmx4g -jar {gatk} -T HaplotypeCaller -nct {gatk_thread} -R {REF} "
        "-I {input} {extra_str} --genotyping_mode DISCOVERY --dbsnp {db_snp} "
        "-stand_call_conf 10 -stand_emit_conf 5 "
        "-A AlleleBalance -A Coverage -A FisherStrand -o {output_f}"
    )
    fields = {
        "gatk": config.gatkv36_path,
        "gatk_thread": config.gatk_thread,
        "REF": config.REF_file_path,
        "input": self.input().path,
        "extra_str": interval_args,
        "db_snp": config.db_snp,
        "output_f": self.output().path,
    }
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())
    if self.dry_run:
        run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Call somatic variants on a tumor/normal BAM pair with GATK3 MuTect2.

    Besides the output VCF, the command writes ``<prefix>.bam`` and
    ``<prefix>.log``, where ``<prefix>`` is the output path with every
    ``.vcf`` removed. In dry-run mode the output file is touched after
    the (dry) command.
    """
    valid_path(self.output().path, check_ofile=1)
    out_prefix = self.output().path.replace('.vcf', "")
    bam_normal = self.input()["normal"].path
    bam_tumor = self.input()["tumor"].path
    template = (
        'java {java_option} -jar {gatk} -T MuTect2 --allSitePLs -R {REF} '
        '--cosmic {cosmic} --dbsnp {db_snp} '
        '--input_file:normal {input_normal} --input_file:tumor {input_tumor} '
        '--out {output_f} --bamOutput {prefix}.bam --log_to_file {prefix}.log'
    )
    fields = {
        "gatk": config.gatkv36_path,
        "java_option": config.java_option,
        "REF": config.REF_file_path,
        "cosmic": config.cos_snp,
        "db_snp": config.db_snp,
        "input_tumor": bam_tumor,
        "input_normal": bam_normal,
        "prefix": out_prefix,
        "output_f": self.output().path,
    }
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())
    if self.dry_run:
        run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Merge the paired and tumor-only VCFs, then vt-normalize the result.

    Unless this is a dry run, the second "filtered" input (a CSV) is
    converted to a BED file and both VCFs are merged over it into the
    output path. The merged VCF is then decomposed and normalized with vt
    into ``<output>`` with every ``.vcf`` replaced by ``.vt.vcf``.
    """
    from post_pipelines_analysis.extracted_pos_from_vcf import merge_two_vcf
    from special_fun.csv2bed import csv2bed

    # The upstream targets are '.av' files; the VCFs sit next to them.
    paired_vcf = self.input()["pair_vcf"].path.replace('.av', '.vcf')
    tumor_only_vcf = self.input()["tumor_vcf"].path.replace('.av', '.vcf')
    csv_filtered = self.input()["filtered"][1].path

    if not self.dry_run:
        bed_filtered = csv2bed(csv_filtered, csv_filtered.replace('.csv', '.bed'))
        merge_two_vcf(paired_vcf,
                      tumor_only_vcf,
                      bed_filtered,
                      self.output().path,
                      log_file=self.get_log_path())

    template = "{vt} decompose -s {input_vcf} | {vt} normalize -r {REF_path} - > {output_vcf}"
    fields = {
        "vt": config.vt_pro,
        "input_vcf": self.output().path,
        "REF_path": config.REF_file_path,
        "output_vcf": self.output().path.replace('.vcf', '.vt.vcf'),
    }
    run_cmd(template.format(**fields), dry_run=self.dry_run, log_file=self.get_log_path())
def run(self):
    """Run GATK4 Mutect2 on a single (normal-only or tumor-only) sample.

    Writes ``<prefix>.vcf`` and ``<prefix>.bam``, where ``<prefix>`` is the
    output path with every ``.vcf`` removed. Calling is limited to
    ``config.bed_file_path`` when that setting is truthy. In dry-run mode
    the output file is touched after the (dry) command.

    Raises:
        Exception: if the sample's "Somatic" value is neither "N" nor "T".
    """
    valid_path(self.output().path, check_ofile=1)
    input_f = self.input().path
    sample_name = self.infodict["SampleID"]
    if config.bed_file_path:
        extra_str = " --intervals %s" % config.bed_file_path
    else:
        extra_str = ""
    somatic_type = self.infodict["Somatic"]
    # "N" = normal only, "T" = tumor only; anything else is rejected.
    if somatic_type not in ("N", "T"):
        raise Exception("Unknown values of Somatic columns (like '%s' )" % somatic_type)
    # NOTE(review): this method used to assign ' --tumor_lod 4' for "T"
    # samples but never put it into the command (and the assignment also
    # wiped the intervals). The flag is still not passed: it is the option
    # name the GATK3 MuTect2 task in this file uses, so confirm the GATK4
    # equivalent before enabling it.
    #
    # The interval restriction is now actually part of the command, and
    # '--all-site-pls' is given exactly once (as '--all-site-pls true').
    cmdline = (
        "{gatk4} Mutect2 --java-options '-Xmx20g' --native-pair-hmm-threads 20 --reference {REF} "
        "-I {input_tumor} -tumor {T_name} --dbsnp {db_snp} "
        "--seconds-between-progress-updates 60 -stand-call-conf 10 "
        "-A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality -A QualByDepth "
        "-A RMSMappingQuality -A MappingQualityRankSumTest -A ReadPosRankSumTest -A ChromosomeCounts "
        "--all-site-pls true --output {prefix}.vcf -bamout {prefix}.bam{extra_str}"
    ).format(
        gatk4=config.gatk_pro,
        REF=config.REF_file_path,
        db_snp=config.db_snp,
        input_tumor=input_f,
        prefix=self.output().path.replace('.vcf', ''),
        T_name=sample_name,
        extra_str=extra_str)
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.infodict.get("log_path", None))
    if self.dry_run:
        run_cmd("touch %s" % self.output().path, dry_run=False, log_file=self.get_log_path())
def prepare_vcf(vcf_path, log_file=sys.stdout):
    """Make sure a bgzipped, bcftools-indexed copy of *vcf_path* exists.

    A path not ending in ``.gz`` is compressed to ``<vcf_path>.gz`` and
    indexed. A ``.gz`` path is indexed only when no ``.csi`` file sits
    next to it.

    Returns:
        The path of the compressed VCF (always ending in ``.gz``).
    """
    if vcf_path.endswith('.gz'):
        # Already compressed: index only if the .csi is missing.
        if not os.path.isfile(vcf_path + '.csi'):
            run_cmd('%s index %s' % (bcftools_path, vcf_path), log_file=log_file)
        return vcf_path

    compress_cmd = '{bgzip} -c {vcf_p} > {vcf_p}.gz'.format(bgzip=config.bgzip_pro,
                                                           vcf_p=vcf_path)
    run_cmd(compress_cmd, log_file=log_file)
    compressed_path = vcf_path + '.gz'
    run_cmd('%s index %s' % (bcftools_path, compressed_path), log_file=log_file)
    return compressed_path
def _vcf_sample_names(vcf_path):
    """Return the sample columns (10+) of the VCF '#C...' header line, comma-joined."""
    # The context manager closes the pipe instead of leaving it to the GC.
    with os.popen("zgrep '^#C' %s | cut -f 10-" % vcf_path) as pipe:
        header = pipe.read()
    return header.replace('\n', '').replace('\t', ',')


def merge_two_vcf(pair_vcf, single_vcf, bed, output_vcf, log_file=sys.stdout):
    """Merge the records of two VCFs that fall inside *bed* into *output_vcf*.

    Both inputs are compressed/indexed via ``prepare_vcf``, restricted to
    the BED regions with all of their samples excluded (``-s ^<samples>``),
    sorted, and concatenated with ``bcftools concat -a -d all``.
    ``##SAMPLE=<ID=`` header lines are dropped from the concatenated file,
    the result is compressed/indexed, and the ``<output_vcf>1*``, ``2*``
    and ``3*`` intermediates are removed.

    Args:
        pair_vcf: path of the first VCF (plain or ``.gz``).
        single_vcf: path of the second VCF (plain or ``.gz``).
        bed: regions file passed to ``bcftools view -R``.
        output_vcf: path of the merged, uncompressed VCF to write.
        log_file: forwarded to ``run_cmd`` for all but the cleanup command.
    """
    # Use the paths prepare_vcf reports instead of rebuilding them with
    # str.replace('.gz', ''), which would also strip a '.gz' that appears
    # elsewhere in the path.
    pair_gz = prepare_vcf(pair_vcf, log_file=log_file)
    single_gz = prepare_vcf(single_vcf, log_file=log_file)
    pair_sample_name = _vcf_sample_names(pair_vcf)
    single_sample_name = _vcf_sample_names(single_vcf)

    formatted_line = '''{bcftools_path} view {vcf} -R {bed} -s ^{SM} --force-samples | {bcftools_path} sort > {output}; '''
    cmdline1 = formatted_line.format(vcf=pair_gz,
                                     bcftools_path=bcftools_path,
                                     bed=bed,
                                     output=output_vcf + '1',
                                     SM=pair_sample_name)
    cmdline2 = formatted_line.format(vcf=single_gz,
                                     bcftools_path=bcftools_path,
                                     bed=bed,
                                     output=output_vcf + '2',
                                     SM=single_sample_name)
    run_cmd(cmdline1, log_file=log_file)
    run_cmd(cmdline2, log_file=log_file)
    subset1_gz = prepare_vcf(output_vcf + '1', log_file=log_file)
    subset2_gz = prepare_vcf(output_vcf + '2', log_file=log_file)

    formatted_line2 = """{bcftools} concat {o1} {o2} -a -d all > {output}""".format(
        bcftools=bcftools_path,
        o1=subset1_gz,
        o2=subset2_gz,
        output=output_vcf + '3')
    run_cmd(formatted_line2, log_file=log_file)

    # Copy the concatenated VCF, dropping the per-sample header lines.
    with open(output_vcf + '3', "r") as fr, open(output_vcf, 'w') as f1:
        for row in fr:
            if row.startswith("##SAMPLE=<ID="):
                continue
            f1.write(row)
    prepare_vcf(output_vcf, log_file=log_file)
    # Remove the intermediates (and their .gz / index companions).
    run_cmd('rm {o}1* ;rm {o}2* ; rm {o}3* '.format(o=output_vcf))
def run(self):
    """Annotate the vt-normalized VCF with VEP, then bgzip and tabix it.

    VEP writes ``<input>`` with ``.vt.vcf`` replaced by ``.vep.vcf`` and
    appends its stdout/stderr to the matching ``.vep.log``. The annotated
    VCF is then compressed into this task's output path and indexed.
    """
    vep_vcf = self.input().path.replace('.vt.vcf', '.vep.vcf')

    # Backslash-newline inside the literal joins the lines into one shell command.
    vep_template = """{vep} -i {vt_vcf} -o {vep_output_vcf} --vcf --cache --merged --fasta {REF} --sift b --polyphen b --symbol --numbers --biotype \
        --total_length --canonical --ccds --gene_phenotype --uniprot --assembly GRCh37 \
        --force_overwrite --offline --domains --regulatory --protein --tsl --variant_class --fork {threads} --force \
        --no_stats >> {vep_log} 2>&1"""
    vep_fields = {
        "vep": config.vep_pro,
        "vt_vcf": self.input().path,
        "REF": config.REF_file_path,
        "vep_output_vcf": vep_vcf,
        "threads": config.vep_thread,
        "vep_log": self.input().path.replace('.vt.vcf', '.vep.log'),
    }
    run_cmd(vep_template.format(**vep_fields), dry_run=self.dry_run)

    compress_cmd = '{bgzip} -c {vep_output_vcf} > {vep_output_vcf_gz}'.format(
        bgzip=config.bgzip_pro,
        vep_output_vcf=vep_vcf,
        vep_output_vcf_gz=self.output().path)
    run_cmd(compress_cmd, dry_run=self.dry_run)

    index_cmd = '{tabix} -p vcf {vep_output_vcf_gz}'.format(
        tabix=config.tabix_pro,
        vep_output_vcf_gz=self.output().path)
    run_cmd(index_cmd, dry_run=self.dry_run)
def germline_filter(indir, odir, tab):
    """Run ``api/var_filters.py`` with *indir*, *tab* and *odir* (never a dry run)."""
    command = "python3 {root}/api/var_filters.py -i {src} --tab {tab} -o {dst}".format(
        root=project_root_path, src=indir, tab=tab, dst=odir)
    run_cmd(command, dry_run=False)
def test_somatic_gatk4(odir):
    """Launch the ``somatic_gatk4`` workflow on the bundled somatic test set.

    Results and the command log (``cmd_log.txt``) are written under *odir*;
    the command is always executed (``dry_run=False``).
    """
    command = (
        "python3 {root}/luigi_pipelines/main.py workflow "
        "--tab {root}/test_set/somatic/data_input.tsv "
        "--odir {odir} --analysis-type somatic_gatk4 --workers 5 "
        "--log-path {odir}/cmd_log.txt"
    ).format(root=project_root_path, odir=odir)
    run_cmd(command, dry_run=False)