def run(self, outfile, params):
    bam = resolve_argument(params.bam, sep=" ")
    reference_fasta = get_reference(params)

    statements, retvals = [], []
    variant_types = [x.strip() for x in params.variant_types.split(",")]
    for variant_type in variant_types:
        statements.append(
            "{params.path} "
            "--type {variant_type} "
            "--genome {reference_fasta} "
            "--outfile {outfile}.{variant_type}.vcf "
            "{params.options} "
            "{bam} "
            ">& {outfile}.{variant_type}.log; "
            "bgzip -f {outfile}.{variant_type}.vcf; "
            "tabix -f -p vcf {outfile}.{variant_type}.vcf.gz".format(
                **locals()))

    retvals.extend(P.run(statements))

    vcf_files = " ".join(
        [outfile + "." + x + ".vcf.gz" for x in variant_types])
    retvals.append(
        P.run("{params.path_vcf_concat} "
              "{vcf_files} "
              "| {params.path_vcf_sort} "
              "| bgzip "
              "> {outfile}; "
              "tabix -fp vcf {outfile}".format(**locals())))
    return retvals
def check_unique(tool_functions, input_combos=None, input_regex=None,
                 input_alias=None, is_test=False):
    # compute a list of task names
    names = []
    if input_combos:
        for toolf, input_files in itertools.product(tool_functions,
                                                    input_combos):
            taskf = copy.copy(toolf)
            taskf.register_input(input_files,
                                 regex=input_regex,
                                 alias=input_alias,
                                 is_test=is_test)
            names.append(taskf.__name__)
    else:
        for toolf in tool_functions:
            taskf = copy.copy(toolf)
            taskf.register_input(regex=input_regex,
                                 alias=input_alias,
                                 is_test=is_test)
            names.append(taskf.__name__)

    counts = collections.Counter(names)
    for name, count in list(counts.items()):
        if count > 1:
            make_unique = True
            P.get_logger().debug(
                "adding hash identifier because of duplicate name: "
                "{}={}".format(name, count))
            break
    else:
        make_unique = False
    return make_unique
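# Hedged illustration of the duplicate-name check above, using only the
# stdlib (the pipeline's tool functions are not needed to see the logic):
#
#     >>> import collections
#     >>> counts = collections.Counter(["map_bwa", "map_bwa", "map_bowtie"])
#     >>> any(count > 1 for count in counts.values())
#     True
#
# check_unique() returns True in exactly this situation, signalling that a
# hash identifier should be appended to disambiguate duplicate task names.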
def run(self, infiles, outfile, params):
    files = " ".join(infiles)
    job_threads = params.job_threads

    # todo:
    # 1. add header.
    # 2. do batch+merge sort in order to avoid hitting temporary
    #    space limits.
    # 3. remove unnecessary info fields while sorting, add them later.
    tmpdir = P.get_temp_filename()
    retval = P.run(
        "mkdir {tmpdir}; "
        "bcftools view -h {infiles[0]} "
        "| cut -f 1-10 "
        "| bgzip > {outfile}; "
        "zcat {files} "
        "| awk -v OFS='\\t' "
        "'!/^#/ && $5 != \"<NON_REF>\" "
        "{{$8=\".\";$9=\".\";$6=\".\";$7=\"GT\";$10=\".\"; print}}' "
        "2> {outfile}.filter.log "
        "| sort -k1,1V -k2,2n "
        "--parallel {job_threads} "
        "-T {tmpdir} "
        "2> {outfile}.sort.log "
        "| uniq "
        "| bgzip "
        ">> {outfile}; "
        "tabix -p vcf {outfile}; "
        "rm -rf {tmpdir} ".format(**locals()))
    return retval
def run_statements(self, stmnts, **kwargs):
    stmnts = [x for x in stmnts if x]
    filename, main_statement, post_statement = P.join_statements(
        stmnts, infile=None)
    stmnt = " ; ".join([x for x in [main_statement, post_statement] if x])

    job_threads1, job_threads2 = 1, 1
    if "--num_threads" in stmnt:
        try:
            job_threads1 = max([
                int(x) for x in re.search(
                    r"--num_threads\s*(\d+)", stmnt).groups()
            ])
        except AttributeError:
            pass

    if "--num_cpu_threads_per_data" in stmnt:
        try:
            job_threads2 = max([
                int(x) for x in re.search(
                    r"--num_cpu_threads_per_data_thread\s*(\d+)",
                    stmnt).groups()
            ])
        except AttributeError:
            pass

    job_threads = max(job_threads1, job_threads2)
    return P.run(stmnt, **kwargs)
def run(self, outfile, params):
    path = os.environ["PATH"]
    gp = P.get_parameters_as_namedtuple()
    cluster_queue = gp.cluster["queue"]
    cluster_memory_resource = gp.cluster["memory_resource"]
    cluster_parallel_environment = gp.cluster["parallel_environment"]
    outdir = os.path.dirname(outfile)
    outname = os.path.basename(outdir)

    # -sync y forces qsub to wait until the job completes before
    # continuing.
    statement = (
        "{self.path} "
        "-p canu "
        "-d {outdir} "
        "-genomeSize={params.genome_size} "
        "gridOptionsJobName={outname} "
        "java={params.path_java} "
        "gridOptions=\"-q {cluster_queue} -v PATH={path} -sync y \" "
        "gridEngineMemoryOption=\"-l {cluster_memory_resource}=MEMORY\" "
        "gridEngineThreadsOption=\"-pe {cluster_parallel_environment} THREADS\" "
        "{params.options} "
        "{params.assembly_mode} "
        "{params.fasta} "
        ">& {outfile}.log; "
        "mv {outdir}/canu.contigs.fasta {outfile}".format(**locals()))
    return P.run(statement, without_cluster=True)
def run(self, infile, outfile, params):
    if params.reference_bed is None:
        raise ValueError("{} requires reference_bed to be set".format(
            self.name))

    # bedtools requires a consistent sort order, so sort both files.
    # It also requires the chromosome content to be identical,
    # so restrict output to common sets.
    tmpf = P.get_temp_filename(clear=True)
    tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
    stmnt = standardise_bed_files(tmpf_test, tmpf_truth,
                                  infile, params.reference_bed)

    statements = [stmnt]
    statements.append("{params.path} intersect "
                      "-a {tmpf_test} "
                      "-b {tmpf_truth} "
                      "-wa "
                      "| bgzip "
                      "> {outfile}.shared.bed.gz")
    statements.append("{params.path} intersect "
                      "-a {tmpf_test} "
                      "-b {tmpf_truth} "
                      "-wa -v "
                      "| bgzip "
                      "> {outfile}.unique_test.bed.gz")
    statements.append("{params.path} intersect "
                      "-b {tmpf_test} "
                      "-a {tmpf_truth} "
                      "-wa -v "
                      "| bgzip "
                      "> {outfile}.unique_truth.bed.gz")
    statements.append("rm -f {tmpf_test} {tmpf_truth}")

    for section in self.sections:
        statements.append(
            "tabix -p bed {outfile}.{section}.bed.gz".format(**locals()))

    statement = "; ".join(statements)
    retval = P.run(statement.format(**locals()))

    # these are small files, so count them here.
    # todo: implement a tabix.count() method.
    counts = dict()
    for section in self.sections:
        # todo: use a context manager
        # (with pysam.Tabixfile(...) as inf) once supported
        inf = pysam.Tabixfile(outfile + "." + section + ".bed.gz")
        counts[section] = len(list(inf.fetch()))
        inf.close()

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("section\tcounts\n")
        outf.write("\n".join(
            ["\t".join(map(str, x)) for x in list(counts.items())]) + "\n")

    return retval
def run(self, outfile, params): if "--threads" in params.options or "-t " in params.options: job_threads = int(re.search("(-t|--threads)\s*(\d+)", params.options).groups()[1]) fastq = resolve_argument(params.fastq, ",").split(",") if len(fastq) == 1: fastq = '-U "{}"'.format(fastq) else: fastq = '-1 "{}" -2 "{}"'.format(*fastq) tmpdir = P.get_temp_filename(clear=True) if "index" in params._fields: index = params.index else: index = params.reference_fasta if params.set_readgroup or params.readgroup_id_regex is not None: readgroup_string, readgroup_id, readgroup_sample = build_readgroup_string( outfile, params) # pipes.quote needs to shlex.quote in py3 readgroup_option = "--rg-id {}".format(readgroup_id) # add additional level of quoting and remove "ID:{}" readgroup_string = re.sub("@RG\tID:\S+\t", "", readgroup_string) readgroup_string = " ".join(["--rg {}".format(x) for x in readgroup_string.split("\t")]) else: readgroup_option = "" readgroup_string = "" return P.run( "mkdir {tmpdir}; " "{self.path} " "{readgroup_option} " "{readgroup_string} " "{params.options} " "-x {index} " "{fastq} " "2> {outfile}.log " "| samtools view -b /dev/stdin " "2> {outfile}.view.log " "| samtools sort -T {tmpdir} -O bam /dev/stdin " "2> {outfile}.sort.log " "> {outfile}; " "samtools index {outfile}; " "rm -rf {tmpdir}".format(**locals()), **params._asdict())
def run(self, infile, outfile, params):
    if params.annotations_bed is None:
        raise ValueError("{} requires annotations_bed to be set".format(
            self.name))
    if params.workspace_bed is None:
        raise ValueError("{} requires workspace_bed to be set".format(
            self.name))

    retval = run_metric_bedtools_intersection.run(
        self, infile, outfile, params)
    retvals = [retval]

    statements = [
        "mv {outfile} {outfile}.bedtools_intersect_and_annotate_counts.tsv"
        .format(**locals())
    ]

    bed_files = []
    for section in self.sections:
        tmpf = P.get_temp_filename(clear=True) + "-" + section + ".gz"
        statements.append(
            "zcat {outfile}.{section}.bed.gz "
            "| awk -v OFS='\\t' '{{ $4 = \"{section}\"; print }}' "
            "| bgzip > {tmpf}".format(**locals()))
        bed_files.append(tmpf)

    segment_files = " ".join(
        ["--segment-bed-file={}".format(x) for x in bed_files])

    statements.append(
        "{params.gat_path} "
        "{segment_files} "
        "--with-segment-tracks "
        "--annotation-bed-file={params.annotations_bed} "
        "--workspace-bed-file={params.workspace_bed} "
        "--log={outfile} "
        "{params.options} "
        "> {outfile}.bedtools_intersect_and_annotate_enrichment.tsv".format(
            **locals()))

    for f in bed_files:
        statements.append("rm -f {}".format(f))

    statement = "; ".join(statements)
    retvals.append(P.run(statement))
    return retvals
def run(self, infiles, outfile, params):
    files = " ".join(infiles)

    job_threads = 1
    if "--threads" in params.options:
        job_threads = int(re.search(
            r"--threads[= ]\s*(\d+)", params.options).groups()[0])
    if "--threads" in params.view_options:
        job_threads += int(re.search(
            r"--threads[= ]\s*(\d+)", params.view_options).groups()[0])

    if params.set_readgroup or params.readgroup_id_regex is not None:
        readgroup_string, readgroup_id, readgroup_sample = \
            build_readgroup_string(outfile, params)

        with open(outfile + ".header.sam", "w") as outf:
            outf.write(readgroup_string + "\n")

        retval = P.run(
            "{params.path} merge "
            "{params.options} "
            "-f "
            "-h {outfile}.header.sam "
            "-r "
            "- "
            "{files} "
            "2> {outfile}.log "
            "| samtools view -h - "
            "| perl -p -e 's/^.*\\n// if (/^\\@RG/ && !/{readgroup_id}/); "
            " s/RG:Z:\\S+/RG:Z:{readgroup_id}/' "
            "| samtools view -bS "
            "{params.view_options} "
            "- "
            "> {outfile}; "
            "samtools index {outfile} 2> {outfile}.index.log".format(
                **locals()),
            job_threads=job_threads)
    else:
        retval = P.run(
            "{params.path} merge "
            "-f "
            "{params.options} "
            "{outfile} "
            "{files} "
            "2> {outfile}.log; "
            "samtools index {outfile} 2> {outfile}.index.log".format(
                **locals()),
            job_threads=job_threads)
    return retval
def run(self, infiles, outfiles, params):
    vcfs = infiles
    if len(vcfs) != 2:
        raise ValueError("expected 2 VCF files, received {}".format(vcfs))
    vcf = " ".join(infiles)

    if isinstance(outfiles, str):
        # files not known to ruffus, so expect a glob expression such as
        # \2.dir/*.dir/*.vcf.gz
        outdir = os.path.dirname(os.path.dirname(outfiles))
    else:
        outdir = os.path.commonprefix(outfiles)

    outfile = os.path.join(outdir, "result.log")
    retval = P.run("{params.path} isec "
                   "{params.options} "
                   "--output-type z "
                   "--prefix {outdir} "
                   "{vcf} "
                   "&> {outfile} ".format(**locals()))

    f = ["000{}.vcf.gz".format(x) for x in range(4)]
    self.distribute_results(outdir, list(zip(f, self.output)))
    f = ["000{}.vcf.gz.tbi".format(x) for x in range(4)]
    ff = [x + ".tbi" for x in self.output]
    self.distribute_results(outdir, list(zip(f, ff)))
    return retval
def distribute_results(self, workdir, pairs, statement=None):
    """distribute results from a task into separate output directories.

    Arguments
    ---------
    workdir : string
        Working directory.
    pairs : list
        Tuples of input/output filenames.
    statement : string
        Optional statement to be executed to transform input to
        output. If not given, the files are simply moved.
    """
    statements = []
    for infile, outfile in pairs:
        infile = os.path.join(workdir, infile)
        outfile = os.path.join(workdir, outfile)
        if not os.path.exists(infile):
            raise ValueError(
                "expected file {} does not exist".format(infile))
        if not os.path.exists(os.path.dirname(outfile)):
            os.makedirs(os.path.dirname(outfile))
        if statement is None:
            shutil.move(infile, outfile)
        else:
            statements.append(statement.format(**locals()))

    if statements:
        return P.run(statements)
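# A minimal usage sketch for distribute_results (hypothetical file names);
# by default the files are moved, as in the bcftools isec callers:
#
#     self.distribute_results(
#         "isec.dir",
#         [("0000.vcf.gz", "unique_to_a.vcf.gz"),
#          ("0001.vcf.gz", "unique_to_b.vcf.gz")])
#
# To transform rather than move, pass a statement template; it is expanded
# with the loop's local variables, so {infile} and {outfile} refer to the
# workdir-joined paths:
#
#     self.distribute_results(
#         "isec.dir",
#         [("0000.vcf", "unique_to_a.vcf.gz")],
#         statement="bgzip -c {infile} > {outfile}")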
def run(self, infiles, outfile, params):
    if isinstance(infiles, (list, tuple)):
        if len(infiles) > 1:
            raise NotImplementedError(
                "collated somatic variant detection of multiple "
                "VCF files not implemented")
        infile = infiles[0]
    else:
        infile = infiles

    with pysam.VariantFile(infile) as inf:
        samples = list(inf.header.samples)

    if len(samples) != 2:
        raise ValueError(
            "expected only two samples in VCF, got {}: {}".format(
                len(samples), ",".join(samples)))

    normal_sample_id, tumour_sample_id = samples

    statement = ("{params.path} "
                 "{params.options} "
                 "-i {infile} "
                 "-o {outfile} "
                 "-n {normal_sample_id} "
                 "-t {tumour_sample_id} "
                 "2> {outfile}.log ".format(**locals()))
    return P.run(statement)
def run(self, outfile, params):
    bam = resolve_argument(params.bam, sep=" ")
    reference_fasta = get_reference(params)

    # warning: bcftools call requires -m or -c in the options
    if "--multiallelic-caller" not in params.options and \
       "-m" not in params.options and \
       "-c" not in params.options and \
       "--consensus-caller" not in params.options:
        E.warn("bcftools call requires -m or -c, got {}".format(
            params.options))

    # limit the number of jobs per node to limit I/O
    job_threads = 4

    return P.run("{params.path_samtools} mpileup "
                 "-ug "
                 "-f {reference_fasta} "
                 "{params.samtools_options} "
                 "{bam} "
                 "2> {outfile}.pileup.log "
                 "| {params.path} call "
                 "--variants-only "
                 "--output-type z "
                 "{params.options} "
                 "2> {outfile}.call.log "
                 "> {outfile}; "
                 "tabix -p vcf {outfile} ".format(**locals()))
def run(self, infile, outfile, params):
    return P.run("{params.path} "
                 "{params.options} "
                 "-I {infile} "
                 "--log {outfile}.log "
                 "2> {outfile}.err "
                 "> {outfile} ".format(**locals()))
def run(self, outfile, params):
    outfile = os.path.abspath(outfile)

    if params.primary_vcf is None:
        raise ValueError("expected primary_vcf, received {}".format(
            params.primary_vcf))
    if params.filter_vcf is None:
        raise ValueError("expected filter_vcf, received {}".format(
            params.filter_vcf))

    primary_vcf = os.path.abspath(params.primary_vcf)
    filter_vcf = os.path.abspath(params.filter_vcf)
    outdir = os.path.dirname(outfile)

    retval = P.run(
        "( "
        "cd {outdir} && "
        "{params.path} query -l {filter_vcf} > subset_samples "
        "&& {params.path} view {params.options} --force-samples "
        "-S subset_samples {primary_vcf} -Ob -o test.subset_samples.bcf "
        "&& {params.path} index test.subset_samples.bcf "
        "&& {params.path} isec {params.options} -n=2 "
        "--prefix isec "
        "test.subset_samples.bcf {filter_vcf} "
        "--output-type z "
        "&& mv -f isec/0000.vcf.gz {outfile} "
        "&& tabix {outfile} "
        ") &> {outfile}.log ".format(**locals()))
    return retval
def run(self, infile, outfiles, params):
    tbxfile = pysam.VariantFile(infile)

    statements = []
    for chrom in list(tbxfile.header.contigs):
        output_file = outfiles.format(chrom)
        output_dir = os.path.dirname(output_file)
        statements.append(
            "mkdir {output_dir}; "
            "tabix -h {infile} {chrom} | bgzip > {output_file}; "
            "tabix -p vcf {output_file} ".format(**locals()))

    retvals = P.run(statements)

    # clean up empty VCFs; opening an empty VCF in pysam throws a
    # ValueError
    for chrom in list(tbxfile.header.contigs):
        output_file = outfiles.format(chrom)
        output_dir = os.path.dirname(output_file)
        try:
            f = pysam.VariantFile(output_file)
            f.close()
        except ValueError:
            E.warn("removing empty VCF {}".format(output_file))
            shutil.rmtree(output_dir)

    tbxfile.close()
    return retvals
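# Illustrative call pattern (hypothetical paths): outfiles is a template
# with one positional placeholder that is filled with each contig name,
# e.g.
#
#     self.run("all.vcf.gz", "split.dir/{}.dir/result.vcf.gz", params)
#
# writes split.dir/chr1.dir/result.vcf.gz, split.dir/chr2.dir/... with a
# tabix index next to each, and removes the directories of contigs that
# received no records.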
def run(self, outfiles, params):
    vcf = resolve_argument(params.vcf, sep=" ")
    vcfs = vcf.split(" ")
    if len(vcfs) != 2:
        raise ValueError("expected 2 VCF files, received {}".format(vcfs))

    outdir = os.path.commonprefix(outfiles)
    outfile = os.path.join(outdir, "result.log")

    retval = P.run("{params.path} isec "
                   "{params.options} "
                   "--output-type z "
                   "--prefix {outdir} "
                   "{vcf} "
                   "&> {outfile} ".format(**locals()))

    f = ["000{}.vcf.gz".format(x) for x in range(4)]
    self.distribute_results(outdir, list(zip(f, self.output)))
    f = ["000{}.vcf.gz.tbi".format(x) for x in range(4)]
    ff = [x + ".tbi" for x in self.output]
    self.distribute_results(outdir, list(zip(f, ff)))
    return retval
def run(self, outfile, params):
    retvals = []
    prefix = IOTools.snip(outfile, ".bed.gz")
    vcffile = prefix + ".vcf.gz"
    if not os.path.exists(vcffile):
        retvals.extend(run_tool_delly.run(self, vcffile, params))

    statements = []
    statements.append(
        "{self.path_bcftools} query "
        "{params.bcftools_options} "
        "-f \"%%CHROM\\t%%POS\\t%%END\\t%%SVTYPE\\n\" "
        "{vcffile} "
        "| awk -v OFS='\\t' '$3 != \".\" {{ switch ($4) {{"
        "case \"DEL\": $5=0; break; "
        "case \"DUP\": $5=3; break; "
        "case \"INS\": next; break; "
        "}}; print }}' "
        "| bgzip "
        "> {outfile}".format(**locals()))
    statements.append("tabix -f -p bed {outfile}".format(**locals()))

    statement = "; ".join(statements)
    retvals.append(P.run(statement))
    return retvals
def run(self, infile, outfile, params):
    try:
        retval = P.run("{params.path} view -H "
                       "{infile} "
                       "2> {outfile}.log "
                       "> {outfile}.tmp; ".format(**locals()))
    except OSError as e:
        E.warn("input file {} gave the following errors: {}".format(
            infile, str(e)))
        retval = None

    with open(outfile, "w") as outf, open(outfile + ".tmp") as inf:
        outf.write("header_tag\ttag\tlineno\tvalue\n")
        for lineno, line in enumerate(inf):
            # strip the leading "@" and the trailing newline
            fields = line[1:-1].split("\t")
            header_tag = fields[0]
            if header_tag == "CO":
                # do not split comment lines
                outf.write("\t".join((header_tag, "", str(lineno),
                                      "\t".join(fields[1:]))) + "\n")
            else:
                for field in fields[1:]:
                    sub_tag, content = field.split(":", 1)
                    outf.write("\t".join((header_tag, sub_tag,
                                          str(lineno), content)) + "\n")

    os.unlink(outfile + ".tmp")
    return retval
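# Illustrative input/output (hypothetical values): a header line
# "@RG\tID:1\tSM:sampleA" at zero-based line 2 of the header yields two
# rows in the tab-separated output:
#
#     header_tag  tag  lineno  value
#     RG          ID   2       1
#     RG          SM   2       sampleA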
def run(self, infile, outfile, params):
    return P.run("{params.path} "
                 "--printf='filename\\tsize\\tepoch_modified"
                 "\\tmodified"
                 "\\n"
                 "%%n\\t%%s\\t%%Y\\t%%y\\n' "
                 "{infile} > {outfile}".format(**locals()))
def run(self, infiles, outfile, params):
    files = " ".join(infiles)
    return P.run("zcat {files} "
                 "| sort -k 1,1 -k2,2n "
                 "| bgzip > {outfile}; "
                 "tabix -p bed {outfile} ".format(**locals()))
def run(self, outfile, params):
    # the tool defaults to threads=auto, which is not acceptable here,
    # so require an explicit thread count.
    if "threads" in params.options:
        if "threads=auto" in params.options:
            raise ValueError(
                "please specify the number of threads "
                "to use explicitly")
        else:
            job_threads = int(re.search(
                r"threads=(\d+)", params.options).groups()[0])
    else:
        raise ValueError("please specify the number of threads to use")

    job_memory = "32G"
    fastq = resolve_argument(params.fastq, " ")
    tmpdir = P.get_temp_filename(clear=True)

    return P.run(
        "mkdir {tmpdir}; "
        "zcat {fastq} "
        "| cut -c -5999 "
        "| gzip > {tmpdir}/in.fastq.gz; "
        "{params.path} "
        "{params.options} "
        "in={tmpdir}/in.fastq.gz "
        "ref={params.reference_fasta} "
        "out={tmpdir}/result.bam "
        ">& {outfile}.log; "
        "samtools sort -o {tmpdir}/sorted.bam {tmpdir}/result.bam; "
        "java -Xmx8000m -jar {params.path_picard} "
        "AddOrReplaceReadGroups "
        "INPUT={tmpdir}/sorted.bam "
        "OUTPUT={outfile} "
        "VALIDATION_STRINGENCY=LENIENT "
        "RGID=1 "
        "RGLB={params.library} "
        "RGPL={params.platform} "
        "RGPU=unknown "
        "RGSM={params.sample} "
        ">& {outfile}.picard.log; "
        "samtools index {outfile} "
        ">& {outfile}.index.log; "
        "rm -rf {tmpdir}".format(**locals()))
def run(self, infile, outfile, params):
    statement = ("{params.path} depth "
                 "-a "
                 "{params.options} "
                 "{infile} "
                 "> {outfile}".format(**locals()))
    return P.run(statement)
def run(self, infiles, outfile, params):
    statements = []
    outdir = os.path.dirname(outfile)

    temp_files = []
    for start in range(0, len(infiles), self.block_size):
        fn_vcf = os.path.join(outdir, "block_{}.vcf.gz".format(start))
        temp_files.append(fn_vcf)
        if os.path.exists(fn_vcf):
            continue
        end = start + self.block_size
        files = " ".join(
            ["--variant {}".format(x) for x in infiles[start:end]])
        statements.append("java "
                          "-Djava.io.tmpdir=%(tmpdir)s "
                          "-jar {params.path} "
                          "-T CombineGVCFs "
                          "-R {params.reference_fasta} "
                          "{params.options} "
                          "{files} "
                          "--out {fn_vcf} "
                          "--log_to_file {fn_vcf}.log "
                          ">& {fn_vcf}.err; ".format(**locals()))

    retvals = P.run(statements, job_memory="28G")

    files = " ".join(["--variant {}".format(x) for x in temp_files])
    statement = ("java "
                 "-Djava.io.tmpdir=%(tmpdir)s "
                 "-jar {params.path} "
                 "-T GenotypeGVCFs "
                 "-R {params.reference_fasta} "
                 "{params.options} "
                 "{files} "
                 "--out {outfile} "
                 "--log_to_file {outfile}.log "
                 ">& {outfile}.err; ".format(**locals()))
    retvals.append(P.run(statement, job_memory="28G"))
    return retvals
def run(self, infile, outfile, params): if "reference_fasta" in params._fields: reference_fasta = "REFERENCE_SEQUENCE={}".format( params.reference_fasta) else: reference_fasta = "" # command can fail when no output is produced, but still produce output # 12G is required for java overhead retval = P.run("java -Xmx8000m -jar {params.path} " "CollectMultipleMetrics " "{reference_fasta} " "INPUT={infile} " "TMP_DIR=%(tmpdir)s " "{params.options} " "OUTPUT={outfile} " ">& {outfile} ".format(**locals()), job_memory="12G", ignore_errors=True) def get_section(section, data): pattern = "## {}".format(section) keep = False result = [] for line in data: if line.startswith("##"): if line.startswith(pattern): keep = True else: keep = False if keep: result.append(line) return result for tablename in self.tablenames: filename = re.sub("histogram", "metrics", tablename) raw = filename[len("picard_"):] src = outfile + "." + raw dest = outfile + "." + tablename + ".tsv" if not os.path.exists(src): E.warn("no file {}, ignored".format(src)) continue with IOTools.open_file(src) as inf: data = inf.readlines() if tablename.endswith("metrics"): data = get_section("METRICS", data) elif tablename.endswith("histogram"): data = get_section("HISTOGRAM", data) with IOTools.open_file(dest, "w") as outf: outf.write("".join(data)) return retval
def run(self, outfile, params): if "-t" in params.options: job_threads = int(re.search("-t\s*(\d+)", params.options).groups()[0]) else: job_threads = 1 # BWA requires at least 6Gb of memory, but is also correlated # with the number of threads, so use 5Gb + 1Gb per thread job_memory = "{}G".format(5.0 + 1.0 * job_threads) fastq = resolve_argument(params.fastq, ",") fastq = '"{}"'.format('" "'.join(fastq.split(","))) tmpdir = P.get_temp_filename(clear=True) if params.set_readgroup or params.readgroup_id_regex is not None: readgroup_string, readgroup_id, readgroup_sample = build_readgroup_string( outfile, params) # pipes.quote needs to shlex.quote in py3 readgroup_option = "-R {}".format(pipes.quote(readgroup_string)) # add additional level of quoting: readgroup_option = re.sub("\\t", "\\\\t", readgroup_option) else: readgroup_option = "" return P.run( "mkdir {tmpdir}; " "{self.path} mem " "{readgroup_option} " "{params.options} " "{params.reference_fasta} " "{fastq} " "2> {outfile}.log " "| samtools view -bu /dev/stdin " "2> {outfile}.view.log " "| samtools sort --threads {job_threads} -T {tmpdir} -O bam /dev/stdin " "2> {outfile}.sort.log " "> {outfile}; " "samtools index {outfile} >& {outfile}.index.log; " "rm -rf {tmpdir}".format(**locals()), **params._asdict())
def run(self, infiles, outfile, params):
    infiles = " ".join([x + params.add_glob for x in infiles])
    statement = ("daisy plot-variant-stats "
                 "{params.options} "
                 "--output-filename-pattern={outfile}.%%s.png "
                 "{infiles} "
                 "> {outfile}".format(**locals()))
    return P.run(statement)
def run(self, infile, outfile, params):
    statement = (
        "{params.path} fasta2stats "
        "--output-filename-sequences={outfile}.daisy_fasta2stats_sequences.tsv "
        "--log {outfile} "
        "{infile} "
        "> {outfile}.daisy_fasta2stats_summary.tsv ".format(**locals()))
    return P.run(statement)
def run(self, infile, outfile, params):
    tmpf = P.get_temp_filename(clear=True)
    tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
    stmnt = standardise_bed_files(tmpf_test, tmpf_truth,
                                  infile, params.annotations_bed)

    statements = [stmnt]
    statements.append("{params.path} "
                      "--segment-bed-file={tmpf_test} "
                      "--ignore-segment-tracks "
                      "--annotation-bed-file={tmpf_truth} "
                      "--workspace-bed-file={params.workspace_bed} "
                      "--log={outfile}.log "
                      "{params.options} "
                      "> {outfile}")
    statement = "; ".join(statements)
    return P.run(statement.format(**locals()))
def run(self, infile, outfile, params):
    if params.reference_fasta is None:
        raise ValueError(
            "ont_variant_depth_ratio requires reference_fasta to be set")
    if params.reference_vcf is None:
        raise ValueError(
            "ont_variant_depth_ratio requires reference_vcf to be set")

    statement = []
    if params.ref_sample_size is not None:
        reference_vcf = outfile + ".ref_sample.vcf.gz"
        statement.append(
            "daisy fasta2vcf "
            "--log={outfile}.fasta2vcf.log "
            "--sample-size={params.ref_sample_size} {params.reference_fasta} "
            "| bgzip "
            "> {outfile}.fasta2vcf.vcf.gz; "
            "tabix -p vcf {outfile}.fasta2vcf.vcf.gz; "
            "bcftools concat --allow-overlap "
            "{params.reference_vcf} "
            "{outfile}.fasta2vcf.vcf.gz "
            "| bgzip "
            "> {reference_vcf}; "
            "tabix -p vcf {reference_vcf} ".format(**locals()))
    else:
        reference_vcf = params.reference_vcf

    statement.append("{params.path_freebayes} "
                     "-f {params.reference_fasta} "
                     "--variant-input {reference_vcf} "
                     "--only-use-input-alleles "
                     "{params.options_freebayes} "
                     "{infile} "
                     "| bgzip "
                     "> {outfile}.genotyped.vcf.gz; ".format(**locals()))
    # "tabix -p vcf {outfile}.genotyped.vcf.gz; "
    # "{params.path_bcftools} view {params.options_bcftools} "
    # "{reference_vcf} "
    # "| bgzip > {outfile}.ref.vcf.gz; "
    # "tabix -p vcf {outfile}.ref.vcf.gz; "
    # "{params.path_bcftools} query -f \"%%CHROM\\t%%POS\\t[%%GT]\\t[%%DPR]\\n\" "
    # "{outfile}.genotyped.vcf.gz > {outfile}.genotyped.tsv; "
    # "{params.path_bcftools} query -f \"%%CHROM\\t%%POS\\t[%%GT]\\n\" "
    # "{outfile}.ref.tsv; "
    # "join -1 2 -2 2 {outfile}.ref.tsv {outfile}.genotyped.tsv "
    # "| perl -p -e \"s/[, ]/\\t/g\" "
    # "| cut -f 1,3,5,6,7 "
    # "| grep -v '\.' "
    # "> {outfile}"

    statement = ";".join(statement)
    return P.run(statement)