def run(self, infile, outfile, params):
    if "reference_fasta" in params._fields:
        reference_fasta = "REFERENCE_SEQUENCE={}".format(
            params.reference_fasta)
    else:
        reference_fasta = ""

    # the command can fail and yet still produce usable output,
    # so ignore errors. %(tmpdir)s is interpolated by P.run.
    # 12G is required for java overhead on top of the 8G heap.
    retval = P.run("java -Xmx8000m -jar {params.path} "
                   "CollectMultipleMetrics "
                   "{reference_fasta} "
                   "INPUT={infile} "
                   "TMP_DIR=%(tmpdir)s "
                   "{params.options} "
                   "OUTPUT={outfile} "
                   ">& {outfile} ".format(**locals()),
                   job_memory="12G",
                   ignore_errors=True)

    def get_section(section, data):
        # extract a "## SECTION" block from picard output
        pattern = "## {}".format(section)
        keep = False
        result = []
        for line in data:
            if line.startswith("##"):
                keep = line.startswith(pattern)
            if keep:
                result.append(line)
        return result

    for tablename in self.tablenames:
        filename = re.sub("histogram", "metrics", tablename)
        raw = filename[len("picard_"):]
        src = outfile + "." + raw
        dest = outfile + "." + tablename + ".tsv"

        if not os.path.exists(src):
            E.warn("no file {}, ignored".format(src))
            continue

        with IOTools.open_file(src) as inf:
            data = inf.readlines()

        if tablename.endswith("metrics"):
            data = get_section("METRICS", data)
        elif tablename.endswith("histogram"):
            data = get_section("HISTOGRAM", data)

        with IOTools.open_file(dest, "w") as outf:
            outf.write("".join(data))

    return retval
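# Hedged sketch (not part of the original module, using made-up
# picard-style lines): demonstrates the "## SECTION" extraction that
# get_section() performs inside run() above.
def _example_get_section():
    lines = ["## METRICS CLASS\tAlignmentSummaryMetrics\n",
             "CATEGORY\tTOTAL_READS\n",
             "PAIR\t1000\n",
             "## HISTOGRAM\tjava.lang.Integer\n",
             "coverage\tcount\n"]

    def get_section(section, data):
        pattern = "## {}".format(section)
        keep, result = False, []
        for line in data:
            if line.startswith("##"):
                keep = line.startswith(pattern)
            if keep:
                result.append(line)
        return result

    assert len(get_section("METRICS", lines)) == 3
    assert len(get_section("HISTOGRAM", lines)) == 2
    return get_section("METRICS", lines)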
def run(self, infile, outfile, params):
    if params.reference_fasta_map is None:
        raise ValueError(
            "bam2reference requires a reference sequence map")

    reference_fasta_map = build_reference_fasta_map(
        params.reference_fasta_map)

    fasta = resolve_argument(list(reference_fasta_map.values()),
                             ",").split(",")
    retval, diff = get_reference_for_bam(infile, fasta)
    if retval is None:
        if diff is None:
            retval = "corrupted"
        else:
            retval = "unknown"
            E.debug("differences: {}".format(str(diff)))
        path = ""
    else:
        map_path2name = dict([(x[1], x[0])
                              for x in list(reference_fasta_map.items())])
        path = map_path2name.get(retval, os.path.basename(retval))

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("filename\treference\tpath\n")
        outf.write("\t".join((infile, retval, path)) + "\n")

    return None
def expand_globs(config, is_test=False):
    """detect and expand glob expressions in the input section.

    A glob expression is any filename that contains a '*'. Multiple
    glob expressions can be combined on the same line by a ','.

    A "find" expression is detected by a value starting with 'find'.
    Such expressions will be evaluated in a shell and the results
    inserted into the dictionary.

    If a filename starts with "file=", the contents of the file
    following the "=" are read and inserted. Multiple files can be
    separated by a ','.

    If a glob or find expression evaluates to nothing, an exception
    is raised unless ``is_test`` is set. In that case, two files
    called "test1" and "test2" will be returned.
    """
    for d, key, value in IOTools.nested_iter(config):
        if isinstance(value, str):
            if value.startswith("find"):
                try:
                    data = E.run(value, return_stdout=True)
                except Exception as e:
                    data = e.output
                d[key] = [x for x in data.split("\n") if x]
            elif "*" in value:
                if "," in value:
                    v = [glob.glob(x.strip()) for x in value.split(",")]
                    v = [item for sublist in v for item in sublist]
                else:
                    v = glob.glob(value)
                d[key] = v
            elif value.startswith("file="):
                filenames = [x.strip()
                             for x in value.split("=")[1].split(",")]
                paths = []
                for fn in filenames:
                    with IOTools.open_file(fn) as inf:
                        paths.extend([x.strip() for x in inf if x.strip()])
                d[key] = paths

            if len(d[key]) == 0:
                if not is_test:
                    raise ValueError(
                        "expression '{}' expanded to nothing".format(value))
                else:
                    # insert some random files for testing purposes:
                    if "*" in value:
                        # replace glob expressions
                        value = re.sub(",.*", "", value)
                        d[key] = [re.sub("[*]", "test1", value),
                                  re.sub("[*]", "test2", value)]
                    else:
                        if "bam" in value:
                            d[key] = ["test1.bam", "test2.bam"]
                        elif "vcf" in value:
                            d[key] = ["test1.vcf.gz", "test2.vcf.gz"]
                        else:
                            d[key] = ["test1.txt", "test2.txt"]
    return config
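# Hedged usage sketch (not part of the original module): with
# ``is_test=True`` an unmatched glob is replaced by two placeholder
# files, which is convenient for dry runs without data.
def _example_expand_globs():
    config = {"input": {"bam": "/no/such/dir/*.bam"}}
    result = expand_globs(config, is_test=True)
    assert result["input"]["bam"] == ["/no/such/dir/test1.bam",
                                      "/no/such/dir/test2.bam"]
    return result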
def run(self, infiles, outfile, params):

    def _link(infile, outfile):
        if os.path.exists(os.path.abspath(outfile)):
            return
        dirname = os.path.dirname(outfile)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        os.symlink(infile, os.path.abspath(outfile))

    rx = re.compile(params.regex)

    outfiles = []
    for infile in infiles:
        outpath = os.path.join(
            os.path.dirname(outfile),
            rx.search(infile).expand(params.pattern_out))
        for suffix in self.suffixes:
            for fn in glob.glob(infile + suffix):
                _link(fn, outpath + suffix)
        _link(os.path.abspath(infile), outpath)
        outfiles.append(outpath)

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("\n".join(outfiles) + "\n")
def run(self, infile, outfile, params):
    if params.reference_bed is None:
        raise ValueError("{} requires reference_bed to be set".format(
            self.name))

    # bedtools requires a consistent sort order, so sort both files.
    # It also requires the chromosome content to be identical,
    # so restrict output to common sets.
    tmpf = P.get_temp_filename(clear=True)
    tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
    stmnt = standardise_bed_files(tmpf_test, tmpf_truth,
                                  infile, params.reference_bed)

    # shared: intervals in test overlapping truth (A AND B);
    # unique_test: intervals only in test (A NOT B);
    # unique_truth: intervals only in truth (B NOT A).
    statements = [stmnt]
    statements.append("{params.path} intersect "
                      "-a {tmpf_test} "
                      "-b {tmpf_truth} "
                      "-wa "
                      "| bgzip "
                      "> {outfile}.shared.bed.gz")
    statements.append("{params.path} intersect "
                      "-a {tmpf_test} "
                      "-b {tmpf_truth} "
                      "-wa -v "
                      "| bgzip "
                      "> {outfile}.unique_test.bed.gz")
    statements.append("{params.path} intersect "
                      "-b {tmpf_test} "
                      "-a {tmpf_truth} "
                      "-wa -v "
                      "| bgzip "
                      "> {outfile}.unique_truth.bed.gz")
    statements.append("rm -f {tmpf_test} {tmpf_truth}")

    for section in self.sections:
        statements.append(
            "tabix -p bed {outfile}.{section}.bed.gz".format(**locals()))

    statement = "; ".join(statements)
    retval = P.run(statement.format(**locals()))

    # these are small files, so counting is done here.
    # TODO: implement a tabix.count() method.
    counts = dict()
    for section in self.sections:
        # pysam.Tabixfile does not support the context manager
        # protocol in all versions, so open/close explicitly.
        inf = pysam.Tabixfile(outfile + "." + section + ".bed.gz")
        counts[section] = len(list(inf.fetch()))
        inf.close()

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("section\tcounts\n")
        outf.write("\n".join(
            ["\t".join(map(str, x))
             for x in list(counts.items())]) + "\n")

    return retval
def get_default_params():
    """return default parameters for tools/metrics.

    Could be refactored to read defaults from a user specified
    file. The current implementation takes the one located within
    the repository.
    """
    with IOTools.open_file(
            os.path.join(os.path.dirname(__file__),
                         "defaults.yml")) as inf:
        result = yaml.load(inf, Loader=RoundTripLoader)
    return result
def line_grouper(filename):
    # group continuation lines (lines without a leading date stamp)
    # with the time-stamped line that precedes them.
    rx = re.compile(r"\d{4}-\d{2}-\d{2} ")
    with IOTools.open_file(filename) as infile:
        last_line = None
        for line in infile:
            line = line.strip()
            if not rx.match(line):
                last_line = " ".join((last_line, line))
            else:
                if last_line:
                    yield last_line
                last_line = line
        if last_line is not None:
            yield last_line
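# Hedged sketch (not part of the original module): writes a small log
# file in which the second line lacks a date stamp and is therefore
# folded into the first record by line_grouper().
def _example_line_grouper(tmpfile="example_log.txt"):
    with open(tmpfile, "w") as outf:
        outf.write("2019-01-01 12:00:00 job started\n"
                   "traceback line 1\n"
                   "2019-01-01 12:05:00 job finished\n")
    records = list(line_grouper(tmpfile))
    assert records[0] == ("2019-01-01 12:00:00 job started "
                          "traceback line 1")
    assert records[1] == "2019-01-01 12:05:00 job finished"
    return records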
def run(self, infile, outfile, params):
    with IOTools.open_file(outfile, "w") as outf:
        outf.write("contig\tcount\tsum\tmin\tmax\tmean\t"
                   "median\tstddev\tcollapse\n")

    # emit one interval size per contig plus a "total" row, then
    # aggregate; '%%' escapes '%' in P.run statements.
    retval = P.run("zcat {infile} "
                   "| awk '{{printf(\"%%s\\t%%i\\n\", $1, $3-$2); "
                   "  printf(\"total\\t%%i\\n\", $3-$2)}}' "
                   "| sort -k1,1 "
                   "| {params.path} groupby "
                   "-g 1 "
                   "-c 2 "
                   "-o count,sum,min,max,mean,median,stddev,collapse "
                   "{params.options} "
                   "2> {outfile}.log "
                   ">> {outfile}; ".format(**locals()))
    return retval
def resolve_argument(argument, sep=","):
    """if argument is a container type (dict, list, tuple), resolve
    its contents to a ``sep``-separated list.
    """
    if isinstance(argument, dict):
        if len(argument) != 1:
            raise ValueError(
                "expected a single entry dictionary, got '{}'".format(
                    argument))
        return sep.join(x[2] for x in IOTools.nested_iter(argument))
    elif isinstance(argument, (list, tuple)):
        return sep.join(argument)
    # special treatment for output from run_collate_link_output
    elif "filelist" in argument:
        f = [x.strip()
             for x in IOTools.open_file(argument).readlines()
             if not x.startswith("#")]
        return sep.join([x for x in f if x])

    return argument
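# Hedged usage sketch (not part of the original module): container
# arguments are flattened to a delimiter-separated string, while
# plain strings pass through unchanged.
def _example_resolve_argument():
    assert resolve_argument(["a.bam", "b.bam"]) == "a.bam,b.bam"
    assert resolve_argument(("a.bam", "b.bam"), sep=" ") == "a.bam b.bam"
    assert resolve_argument("a.bam") == "a.bam"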
def run(self, infile, outfile, params):
    if params.reference_vcf is None:
        raise ValueError("missing input parameter 'reference_vcf'")
    if params.reference_sdf is None:
        raise ValueError("missing input parameter 'reference_sdf'")
    if params.callable_bed is None:
        raise ValueError("missing input parameter 'callable_bed'")

    outfile_regions = outfile + ".bed.gz"
    restrict_bed(outfile_regions,
                 params.callable_bed,
                 infile,
                 remove_chr=params.remove_chr,
                 add_chr=params.add_chr)

    outputdir = os.path.join(os.path.dirname(outfile), "vcfeval.dir")
    if os.path.exists(outputdir):
        shutil.rmtree(outputdir)

    if LooseVersion(self.get_version()) < LooseVersion("3.7"):
        bed_options = "--bed-regions={}".format(params.callable_bed)
        output_columns = [
            "threshold", "true_positive_count",
            "false_positive_count", "false_negative_count",
            "false_positive_rate", "false_negative_rate",
            "f_measure"]
    else:
        bed_options = "--evaluation-regions={}".format(
            params.callable_bed)
        output_columns = [
            "threshold", "true_positive_baseline",
            "true_positive_count",
            "false_positive_count", "false_negative_count",
            "false_positive_rate", "false_negative_rate",
            "f_measure"]

    retval = self.run_with_preprocessing(
        infile, outfile, params,
        "{params.path} vcfeval "
        "--calls={infile} "
        "--baseline={params.reference_vcf} "
        "--template={params.reference_sdf} "
        "{bed_options} "
        "--output={outputdir} "
        "{params.options} "
        ">& {outfile}.log ".format(**locals()),
        job_memory="unlimited")

    with IOTools.open_file(os.path.join(outputdir, "summary.txt")) as inf:
        with IOTools.open_file(outfile, "w") as outf:
            table = []
            for line in inf:
                if line.startswith("-"):
                    continue
                # convert the space-padded summary table to
                # tab-separated fields
                line = re.sub("^ +", "", line)
                line = re.sub(" +", "\t", line)
                fields = line[:-1].split("\t")
                table.append(fields)

            df = pandas.DataFrame(table[1:], columns=table[0])
            df.columns = output_columns
            # the summary reports precision and sensitivity; convert
            # them to false positive/negative rates.
            df["false_positive_rate"] = 1.0 - df[
                "false_positive_rate"].astype(float)
            df["false_negative_rate"] = 1.0 - df[
                "false_negative_rate"].astype(float)
            df.to_csv(outf, sep="\t", index=False)

    return retval
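# Hedged sketch (made-up example lines, not part of the original
# module): mirrors how the space-padded vcfeval summary.txt is
# converted to tab-separated fields above.
def _example_parse_vcfeval_summary():
    lines = ["Threshold  True-pos  False-pos\n",
             "----------------------------------\n",
             "3.000      100       5\n"]
    table = []
    for line in lines:
        if line.startswith("-"):
            continue
        line = re.sub("^ +", "", line)
        line = re.sub(" +", "\t", line)
        table.append(line[:-1].split("\t"))
    assert table[1] == ["3.000", "100", "5"]
    return table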
def run(self, infile, outfile, params):
    if params.reference_vcf is None:
        raise ValueError("missing input parameter 'reference_vcf'")
    if params.reference_sdf is None:
        raise ValueError("missing input parameter 'reference_sdf'")
    if params.callable_bed is None and params.reference_fasta is None:
        raise ValueError(
            "missing input parameter: either 'callable_bed' or "
            "'reference_fasta' is needed")

    outfile_regions = outfile + ".bed.gz"
    if params.callable_bed is not None:
        restrict_bed(outfile_regions,
                     params.callable_bed,
                     infile,
                     remove_chr=params.remove_chr,
                     add_chr=params.add_chr)
    else:
        create_genome_bed(outfile_regions,
                          infile,
                          params.reference_fasta,
                          remove_chr=params.remove_chr,
                          add_chr=params.add_chr)

    with pysam.VariantFile(params.reference_vcf.strip()) as inf:
        try:
            # in some pathological VCFs (multiple headers), sample
            # names are not properly read by pysam.
            sample_name = list(inf.header.samples)[0]
        except IndexError:
            sample_name = "TOCOMPARE"
            params = update_namedtuple(params,
                                       rename_samples=sample_name)

    outfile_reference = outfile + ".ref.vcf.gz"

    preprocess_reference = self.build_statement_with_preprocessing(
        params.reference_vcf,
        outfile_reference,
        params,
        "mv {params.reference_vcf} {outfile_reference}; "
        "tabix -f -p vcf {outfile_reference}".format(**locals()))

    outputdir = os.path.join(os.path.dirname(outfile), "vcfeval.dir")
    if os.path.exists(outputdir):
        shutil.rmtree(outputdir)

    # The java VM does not work with the ulimit -v and ulimit -h
    # options, so run with unlimited memory.
    retval = self.run_with_preprocessing(
        infile, outfile, params,
        "{preprocess_reference}; "
        "{params.path} vcfeval "
        "--calls={infile} "
        "--baseline={outfile_reference} "
        "--template={params.reference_sdf} "
        "--bed-regions={outfile_regions} "
        "--output={outputdir} "
        "--sample={sample_name} "
        ">& {outfile}.log; "
        "rm -f {outfile_reference} {outfile_reference}.tbi ".format(
            **locals()),
        job_memory="unlimited",
    )

    with IOTools.open_file(os.path.join(outputdir, "summary.txt")) as inf:
        with IOTools.open_file(outfile, "w") as outf:
            table = []
            for line in inf:
                if line.startswith("-"):
                    continue
                line = re.sub("^ +", "", line)
                line = re.sub(" +", "\t", line)
                fields = line[:-1].split("\t")
                table.append(fields)

            df = pandas.DataFrame(table[1:], columns=table[0])
            df.columns = [
                "threshold", "true_positive_count",
                "false_positive_count", "false_negative_count",
                "false_positive_rate", "false_negative_rate",
                "f_measure"]
            # the summary reports precision and sensitivity; convert
            # them to false positive/negative rates.
            df["false_positive_rate"] = 1.0 - df[
                "false_positive_rate"].astype(float)
            df["false_negative_rate"] = 1.0 - df[
                "false_negative_rate"].astype(float)
            df.to_csv(outf, sep="\t", index=False)

    return retval
def build_combinations(config):
    """build combinations of configuration parameters

    Return all possible combinations between configuration values.
    There are two types of combinatorics that are applied::

       option1:
         - value1
         - value2
       option2:
         - valueA
         - valueB

    will combine into::

       - option1/value1 x option2/valueA
       - option1/value1 x option2/valueB
       ...

    Values can be grouped on lower levels for those tools expecting
    multiple input files of the same type, for a collection of
    samples to process::

       option1:
         - group1:
           - value1
           - value2
         - group2:
           - value3
           - value4
       option2:
         - valueA
         - valueB

    will result in::

       - option1/[value1, value2] x option2/valueA
       - option1/[value1, value2] x option2/valueB
       ...

    For tools requiring multiple input files, such as a group of
    samples and a reference sequence, use the following syntax::

       test1:
         bam:
           - value1
           - value2
         reference: value10
       test2:
         bam:
           - value3
           - value4
         reference: value11
       groupby: label

    This translates into::

       - bam/[value1, value2] x reference/value10
       - bam/[value3, value4] x reference/value11

    Note the ``groupby`` variable indicating that options should be
    grouped by the top level (label).

    Configuration values taking multiple values are identified by
    lists, for example::

        >>> build_combinations({'option1': ["value1", "value2"]})
        [{'option1': 'value1'}, {'option1': 'value2'}]
        >>> build_combinations({'option1': ["value1", "value2"], 'option2': 'valueA'})
        [{'option2': 'valueA', 'option1': 'value1'}, {'option2': 'valueA', 'option1': 'value2'}]
        >>> build_combinations({'option1': ["value1", "value2"], 'option2': ["valueA", "valueB"]})
        [{'option2': 'valueA', 'option1': 'value1'}, {'option2': 'valueA', 'option1': 'value2'}, {'option2': 'valueB', 'option1': 'value1'}, {'option2': 'valueB', 'option1': 'value2'}]  # nopep8
        >>> benchmark.Workflow.build_combinations({'option1': [{"value1": [1,2,3]}, {"value2": [4,5,6]}]})
        [{'option1': {'value1': [1, 2, 3]}}, {'option1': {'value2': [4, 5, 6]}}]

    Args:
        config (dict): Configuration dictionary

    Returns:
        list : List of dictionaries
    """
    if not config:
        return [{}]

    groupby = "option"
    if "groupby" in config:
        groupby = config["groupby"].strip()
        if groupby not in ("label", "option", "file"):
            raise ValueError(
                "unknown groupby option '{}', "
                "expected one of {}".format(
                    groupby, str(("label", "option", "file"))))
        del config["groupby"]

    combinations = []
    if groupby == "option":
        # add multiplicity of input files
        try:
            variable = [(k, v) for k, v in list(config.items())
                        if isinstance(v, list)]
        except AttributeError:
            raise ValueError(
                "issue with configuration for option '{}', "
                "possibly due to supplying options for a tool directly "
                "and not using 'options'".format(config))

        variable = [x for x in variable if x[0] not in RESERVED_WORDS]
        if variable:
            constant = [(k, v) for k, v in list(config.items())
                        if not isinstance(v, list)]
            levels = [x[0] for x in variable]
            values = [merge_shared_values(x[1]) for x in variable]
            for combination in itertools.product(*values):
                d = dict(constant + list(zip(levels, combination)))
                combinations.append(d)
        else:
            combinations.append(config)
    elif groupby == "label":
        for k, v in list(config.items()):
            assert isinstance(v, dict)
            combinations.append(v)
    elif groupby == "file":
        # use a design file to define groups
        if "label" not in config:
            raise ValueError(
                "using file requires a 'label' column to be set")
        label_columns = config["label"]
        if not isinstance(label_columns, (list, tuple)):
            label_columns = [label_columns]

        filelist = config["input"]
        if not isinstance(filelist, list):
            filelist = [filelist]
        if len(filelist) > 1:
            raise NotImplementedError(
                "using multiple files is not implemented")

        for fn in filelist:
            with IOTools.open_file(fn) as inf:
                df = pd.read_table(inf, dtype=str)

            for label_column in label_columns:
                if label_column not in df.columns:
                    raise ValueError(
                        "label column {} specified, but does not "
                        "exist in {}".format(label_column, fn))

            map_column2slot = {}
            shared_values = set()
            columns = set(df.columns)
            for key, value in list(config.items()):
                if key == "label":
                    continue
                shared_value = True
                if not isinstance(value, list):
                    value = [value]
                for v in value:
                    if v in columns and v not in label_columns:
                        map_column2slot[v] = key
                        shared_value = False
                if shared_value and key != "input":
                    shared_values.add(key)

            if len(map_column2slot) == 0:
                raise ValueError(
                    "no mapping found between column headers ({}) "
                    "and slots in config file ({})".format(
                        ",".join(df.columns),
                        ",".join(list(config.keys()))))

            for row in df.iterrows():
                combination = {}
                for shared_value in shared_values:
                    combination[shared_value] = config[shared_value]
                dd = dict(row[1])
                for column, slot in list(map_column2slot.items()):
                    val = dd[column]
                    if ',' in val:
                        val = val.split(',')
                    if slot in combination:
                        raise ValueError(
                            "duplicate slots: {}".format(slot))
                    combination[slot] = val
                combination["name"] = "-".join(
                    [re.sub(" ", "_", dd[x]) for x in label_columns])
                combinations.append(combination)

    return combinations
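# Hedged sketch (not part of the original module): with
# ``groupby: label`` each top-level entry becomes one combination, as
# described in the docstring above.
def _example_build_combinations_label():
    config = {
        "groupby": "label",
        "test1": {"bam": ["value1", "value2"], "reference": "value10"},
        "test2": {"bam": ["value3", "value4"], "reference": "value11"},
    }
    result = build_combinations(config)
    assert len(result) == 2
    assert {"bam": ["value1", "value2"],
            "reference": "value10"} in result
    return result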
def run(self, infile, outfile, params):
    options = []
    reference_fasta = params.reference_fasta
    reference_fasta_map = build_reference_fasta_map(
        params.reference_fasta_map)
    reference_label = None
    use_target_regions = True
    if params.reference_fasta:
        map_path2name = dict([(x[1], x[0])
                              for x in list(reference_fasta_map.items())])
        if params.reference_fasta == "auto":
            fasta = resolve_argument(list(reference_fasta_map.values()),
                                     ",").split(",")
            reference_fasta, diffs = get_reference_for_bam(
                infile, fastafiles=fasta)
            if reference_fasta:
                options.append("--ref-seq {}".format(reference_fasta))
                reference_label = map_path2name[reference_fasta]
            elif diffs:
                E.warn(
                    "attempted to detect reference fasta, but unable "
                    "to do so. diffs: {}".format(diffs))
            else:
                E.warn("sequence dict is empty, BAM likely to be empty. "
                       "target_regions will be ignored")
                use_target_regions = False
        else:
            options.append("--ref-seq {}".format(params.reference_fasta))
            reference_label = map_path2name.get(params.reference_fasta,
                                                None)

    if params.target_regions and use_target_regions:
        target_regions = get_associated_file(params,
                                             reference_label,
                                             "target_regions")
        # convert to 1-based coordinates and decompress
        if target_regions.endswith(".bed.gz"):
            target_regions = (
                "<(zcat {} "
                "| awk '{{printf(\"%%s\\t%%i\\t%%i\\n\", $1, $2+1, $3)}}')"
                .format(target_regions))
        options.append("--target-regions {}".format(target_regions))

    options = " ".join(options)

    if not os.path.exists(outfile + ".tmp"):
        try:
            retval = P.run("{params.path} stats "
                           "{self.options} "
                           "{options} "
                           "{infile} "
                           "2> {outfile}.log "
                           "> {outfile}.tmp; ".format(**locals()),
                           job_memory="16G")
        except OSError as e:
            E.warn("input file {} gave the following errors: {}".format(
                infile, str(e)))
            return None
    else:
        retval = None

    def split_output(lines):
        is_comment = True
        section, body = None, []
        for line in lines:
            if line.startswith("#"):
                if body:
                    yield section, body
                    body = []
                is_comment = True
            else:
                # strip trailing comments; the new-line is preserved
                line = re.sub("\t#.*", "", line)
                fields = line[:-1].split("\t")
                section = fields[0]
                body.append(fields[1:])
                is_comment = False
        if body:
            yield section, body

    # split into separate files for upload
    with IOTools.open_file(outfile + ".tmp") as inf:
        for section, body in split_output(inf):
            try:
                tablename, columns = self._map_section_to_table[section]
            except KeyError:
                continue

            output_file = self.map_table_to_file(tablename, outfile)
            with IOTools.open_file(output_file, "w") as outf:
                if len(columns) > 1 and columns[1].startswith("VAR_"):
                    outf.write("{}\t{}\n".format(
                        columns[0], columns[1][4:]))
                    for data in body:
                        outf.write("{}\t{}\n".format(
                            data[0], ",".join(data)))
                else:
                    outf.write("\t".join(columns) + "\n")
                    # remove first column, which contains the identifier
                    outf.write("\n".join(
                        ["\t".join(x) for x in body]) + "\n")

    os.rename(outfile + ".tmp", outfile)

    return retval
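# Hedged sketch (made-up samtools-stats-style lines, not part of the
# original module): mirrors how split_output() above groups rows by
# the leading section identifier.
def _example_split_samtools_stats():
    lines = ["# This file was produced by samtools stats\n",
             "SN\traw total sequences:\t100\n",
             "SN\treads mapped:\t90\n",
             "# comment\n",
             "IS\t100\t10\n"]
    sections = {}
    section, body = None, []
    for line in lines:
        if line.startswith("#"):
            if body:
                sections[section] = body
                body = []
        else:
            fields = line[:-1].split("\t")
            section = fields[0]
            body.append(fields[1:])
    if body:
        sections[section] = body
    assert sections["SN"] == [["raw total sequences:", "100"],
                              ["reads mapped:", "90"]]
    assert sections["IS"] == [["100", "10"]]
    return sections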
def run(self, infile, outfile, params):
    # TODO: bam_fastqc_sequence_length_distribution.tsv may
    # contain ranges such as '30-31'. Convert to the beginning of
    # the range as in this perl command:
    #
    # perl -p -i -e "s/\-\d+//"
    # *.dir/bam_fastqc.dir/bam_fastqc.tsv.bam_fastqc_sequence_length_distribution.tsv

    if infile.endswith(".gz"):
        prefix = IOTools.snip(os.path.basename(infile[:-3]))
    else:
        prefix = IOTools.snip(os.path.basename(infile))

    outdir = os.path.dirname(outfile)
    datafile = os.path.join(outdir,
                            "{}_fastqc".format(prefix),
                            "fastqc_data.txt")

    if not os.path.exists(datafile):
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        retval = P.run("{params.path} "
                       "{params.options} "
                       "--extract "
                       "--outdir {outdir} "
                       "{infile} "
                       ">& {outfile} ".format(**locals()),
                       **params._asdict())
    else:
        IOTools.touch_file(outfile)
        retval = None

    def _split_output(lines):
        body, header, section, status = [], None, None, None
        for line in lines:
            if line.startswith("##FastQC"):
                continue
            elif line.startswith("#"):
                header, body = line[1:-1].split("\t"), []
            elif line.startswith(">>END_MODULE"):
                yield section, header, body, status
                body, header, section, status = [], None, None, None
            elif line.startswith(">>"):
                section, status = line[2:-1].split("\t")
            else:
                fields = line[:-1].split("\t")
                body.append(fields)

    # split into separate files for upload
    summary_data = []
    with IOTools.open_file(datafile) as inf:
        for section, header, body, status in _split_output(inf):
            if len(body) == 0:
                continue
            summary_data.append((section, status))
            tablename = "{}_".format(self.name) + re.sub(
                " ", "_", section).lower()
            if tablename not in self.tablenames:
                raise ValueError(
                    "unknown tablename {}, expected one of {}".format(
                        tablename, self.tablenames))
            output_file = ".".join((outfile, tablename, "tsv"))
            with open(output_file, "w") as outf:
                outf.write("\t".join(
                    [x.lower() for x in header]) + "\n")
                # remove first column, which contains the identifier
                outf.write("\n".join(
                    ["\t".join(x) for x in body]) + "\n")

    output_file = ".".join(
        (outfile, "{}_summary".format(self.name), "tsv"))
    with IOTools.open_file(output_file, "w") as outf:
        outf.write("section\tstatus\n")
        for section, status in summary_data:
            outf.write("{}\t{}\n".format(section, status))

    return retval
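# Hedged sketch (abridged fastqc_data.txt lines, not part of the
# original module): mirrors how _split_output() above walks FastQC's
# ">>module ... >>END_MODULE" blocks.
def _example_split_fastqc_data():
    lines = ["##FastQC\t0.11.9\n",
             ">>Basic Statistics\tpass\n",
             "#Measure\tValue\n",
             "Filename\ttest.bam\n",
             ">>END_MODULE\n"]
    modules = []
    body, header, section, status = [], None, None, None
    for line in lines:
        if line.startswith("##FastQC"):
            continue
        elif line.startswith("#"):
            header, body = line[1:-1].split("\t"), []
        elif line.startswith(">>END_MODULE"):
            modules.append((section, header, body, status))
            body, header, section, status = [], None, None, None
        elif line.startswith(">>"):
            section, status = line[2:-1].split("\t")
        else:
            body.append(line[:-1].split("\t"))
    assert modules[0][0] == "Basic Statistics"
    assert modules[0][2] == [["Filename", "test.bam"]]
    assert modules[0][3] == "pass"
    return modules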
def run(self, outfile, params):
    bam = resolve_argument(params.bam, sep=" ")
    reference_fasta = get_reference(params)

    if params.parallel:
        files_to_merge = []
        jobsfile = outfile + ".jobs"

        if re.search("--region", params.options):
            region = re.search("--region[= ]*(\S+)",
                               params.options).groups()[0]
            filter_contig, filter_start, filter_end = \
                parse_region_string(region)
        else:
            filter_contig, filter_start, filter_end = None, None, None
        plain_options = re.sub("--region[= ]\S+", "", params.options)

        # build one statement per chunk, skipping chunks that have
        # already been computed.
        statements = []
        with pysam.FastaFile(reference_fasta) as fastaf:
            for contig, length in zip(fastaf.references,
                                      fastaf.lengths):
                if filter_contig and contig != filter_contig:
                    continue
                begin_range = filter_start if filter_start else 0
                end_range = filter_end if filter_end else length

                for start in range(begin_range, end_range,
                                   params.chunk_size):
                    fn = os.path.join(
                        outfile + ".chunk_{}_{:08}.vcf.gz".format(
                            contig, start))
                    files_to_merge.append(fn)
                    if os.path.exists(fn):
                        continue
                    end = min(start + params.chunk_size, length)
                    statements.append(
                        "{params.path} "
                        "--fasta-reference {reference_fasta} "
                        "--region {contig}:{start}-{end} "
                        "{plain_options} "
                        "{bam} "
                        "2> {fn}.log "
                        "| bgzip "
                        "> {fn}\n".format(**locals()))

        retvals = P.run(statements, job_array=True)

        fn = " ".join(files_to_merge)
        statement = ("zcat {fn} "
                     "| vcffirstheader "
                     "2> {outfile}.vcffirstheader.log "
                     "| vcfstreamsort -w 1000 "
                     "2> {outfile}.vcfstreamsort.log "
                     "| vcfuniq "
                     "2> {outfile}.vcfuniq.log "
                     "| bgzip "
                     "2> {outfile}.bgzip.log "
                     "> {outfile}; "
                     "tabix -p vcf {outfile} "
                     "2> {outfile}.tabix.log; "
                     "rm -f {fn} "
                     "".format(**locals()))
        retvals.extend(P.run(statement))
    else:
        # limit the number of jobs per node to limit I/O
        job_threads = 2
        retvals = P.run("{params.path} "
                        "--fasta-reference {reference_fasta} "
                        "{params.options} "
                        "{bam} "
                        "2> {outfile}.log "
                        "| bgzip "
                        "> {outfile}; "
                        "tabix -p vcf {outfile}".format(**locals()),
                        **params._asdict())

    if "set_filter_exclude" in params._fields:
        with IOTools.open_file(outfile + ".header.vcf", "w") as outf:
            outf.write(
                "##FILTER=<ID=HARD,Description=\"Variant fails hard "
                "filters: {}\"> ".format(params.set_filter_exclude))
        job_threads = 1
        # records matching the exclusion expression from the first
        # step are set to the FILTER value "HARD".
        retvals.extend(
            P.run("bcftools query "
                  "--include \"{params.set_filter_exclude}\" "
                  "-f \"%%CHROM\\t%%POS\\tHARD\\n\" "
                  "{outfile}.save.vcf.gz "
                  "| bgzip > {outfile}.tab.gz; "
                  "tabix -s 1 -b 2 -e 2 {outfile}.tab.gz; "
                  "bcftools annotate "
                  "-a {outfile}.tab.gz "
                  "-c CHROM,POS,FILTER "
                  "--header-lines {outfile}.header.vcf "
                  "{outfile}.save.vcf.gz "
                  "| bgzip > {outfile}.new.vcf.gz; "
                  "mv {outfile}.new.vcf.gz {outfile}; "
                  "tabix -f -p vcf {outfile} ".format(**locals())))

    return retvals
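# Hedged sketch (not part of the original module): shows the chunk
# windows the parallel branch above generates per contig, for a
# hypothetical contig of length 25 and chunk_size 10.
def _example_chunk_windows(length=25, chunk_size=10):
    windows = []
    for start in range(0, length, chunk_size):
        end = min(start + chunk_size, length)
        windows.append((start, end))
    assert windows == [(0, 10), (10, 20), (20, 25)]
    return windows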
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r", "--restrict-regex", dest="restrict_regex",
        action="append",
        help="pattern to restrict tests to certain tools/metrics. "
        "Can be specified multiple times [%default]")

    parser.add_option(
        "--data-directory", dest="data_directory",
        help="directory with sample data sets. This will override the "
        "default datadir in the configuration file and the environment "
        "variable DAISY_TEST_DATADIR [%default]")

    parser.add_option(
        "--library-directory", dest="library_directories",
        action="append",
        help="directory with TaskLibrary functions. Will be added to "
        "the built-in and the one specified in the DAISY_TASKLIBRARY "
        "environment variable [%default]")

    parser.add_option(
        "--always-mount", dest="always_mount",
        action="store_true",
        help="force mounting of arvados keep [%default]")

    parser.add_option(
        "--keep-failed-temp", dest="keep_failed_temp",
        action="store_true",
        help="keep temporary files of failed tests [%default]")

    parser.set_defaults(
        restrict_regex=[],
        always_mount=False,
        data_directory=None,
        keep_failed_temp=False,
        library_directories=[],
    )

    (options, args) = E.start(parser,
                              argv=argv,
                              add_output_options=True)

    P.get_parameters()

    # load the built-in tests
    filenames = [
        os.path.join(os.path.dirname(os.path.dirname(__file__)),
                     "TaskLibrary", "test_task_library.yml")]
    if "DAISY_TASKLIBRARY" in os.environ:
        filenames.append(
            os.path.join(os.environ["DAISY_TASKLIBRARY"],
                         "test_task_library.yml"))
    filenames.extend(options.library_directories)

    master_config = None
    for fn in filenames:
        if not os.path.exists(fn):
            E.warn("file {} does not exist".format(fn))
            continue
        with IOTools.open_file(fn) as inf:
            raw_txt = inf.read()
            test_config = yaml.load(raw_txt)
            if test_config is None:
                E.warn("file {} is empty".format(fn))
                continue

            data_directory = os.environ.get(
                "DAISY_TEST_DATADIR",
                test_config.get("data_directory"))

            if options.data_directory:
                data_directory = options.data_directory

            # reload config with placeholders replaced
            test_config = yaml.load(
                re.sub("DATADIR", data_directory, raw_txt))

            if master_config is None:
                master_config = test_config
            else:
                # add additional tool/test metrics
                master_config["tool"].update(test_config.get("tool", {}))
                master_config["metric"].update(
                    test_config.get("metric", {}))

    for test_section, testclass, map_name_to_runner in [
            ("tool", TestTool, map_tool_to_runner),
            ("metric", TestMetric, map_metric_to_runner)]:

        ignore = master_config[test_section].get("ignore", [])
        # propagate config variables
        testclass.test_config = master_config

        for task, taskf in sorted(map_name_to_runner.items()):
            found = False
            for to_ignore in ignore:
                if re.match(to_ignore, task):
                    found = True
            if found:
                continue
            if options.restrict_regex:
                take = False
                for x in options.restrict_regex:
                    if re.search(x, task):
                        take = True
                if not take:
                    continue
            add_tests(task, taskf, testclass)

    failed = False
    with arvados_enabled(always_mount=options.always_mount):
        for testclass in [TestTool, TestMetric]:
            suite = unittest.TestLoader().loadTestsFromTestCase(
                testclass)
            result = unittest.TextTestRunner(verbosity=2).run(suite)
            failed |= not result.wasSuccessful()

            # remove all tests in test class - necessary if function
            # is called repeatedly
            clear_tests(testclass)

    E.stop()

    return failed
def upload_result(infiles, outfile, *extras):
    """upload results into database.

    Connection details for the database are taken from the
    configuration dictionary given as first argument to extras. The
    configuration dictionary should have an element 'database' with
    the required field ``url`` and the optional field ``schema``. For
    example, to upload to an sqlite database in the current directory
    called csvdb, use::

        config = {"database": {"url": "sqlite:///./csvdb"}}

    Arguments
    ---------
    infiles: list
       List of files to upload. These should be the output
       of metric tasks in a benchmarking workflow.
    outfile: output file
       On success, an empty output file is created.
    extras: list
       List of one element containing a configuration dictionary
       (see above).
    """

    logger = P.get_logger()

    if len(extras) != 1:
        raise ValueError("expecting only one extra argument "
                         "(configuration dictionary)")

    config = extras[0]

    url = config["database"]["url"]
    is_sqlite3 = url.startswith("sqlite")

    if is_sqlite3:
        connect_args = {'check_same_thread': False}
    else:
        connect_args = {}

    schema = config["database"].get("schema", None)
    # TODO: check if schema exists to avoid incomplete
    # transaction.

    engine = sqlalchemy.create_engine(url,
                                      connect_args=connect_args)

    # catch errors in case the database is not available
    try:
        create_database(engine)
    except OperationalError as msg:
        logger.warn("could not connect to database at {}. "
                    "The data will not be uploaded. Msg={}".format(
                        url, str(msg)))
        return

    # create the schema if it does not exist
    if schema is not None:
        engine.execute(
            text("CREATE SCHEMA IF NOT EXISTS {}".format(schema)))

    pipeline_name = os.path.basename(sys.argv[0])

    logger.debug("uploading data to {}, schema={}".format(url, schema))
    # TODO: add dependencies
    # dependencies = infiles[1:]
    # meta_data = dict([("dependency{}".format(x), y)
    #                   for x, y in enumerate(dependencies)])

    # the 'created' time-stamp is taken from benchmark.yml; this is
    # important when re-loading, as otherwise all times would be the
    # same.
    if os.path.exists("benchmark.yml"):
        s = os.stat("benchmark.yml")
        created = datetime.datetime.fromtimestamp(s.st_mtime)
    else:
        created = datetime.datetime.now()

    benchmark_run = BenchmarkRun(
        author=os.environ.get("USER", "unknown"),
        # needs refactoring, should be: uploaded_at, created_at, run_at
        # uploaded_at=datetime.datetime.now(),
        created=created,
        pipeline_name=pipeline_name,
        pipeline_version=P.get_version(),
        pipeline_dir=os.getcwd(),
        title=config["title"],
        description=config["description"],
        config=json.dumps(config),
        config_hash=hash(json.dumps(config)),
        status="incomplete")

    Session = sessionmaker(bind=engine)
    session = Session()
    session.add(benchmark_run)
    session.commit()

    for tag in config["tags"]:
        benchmark_tag = BenchmarkTag(run_id=benchmark_run.id, tag=tag)
        session.add(benchmark_tag)
    session.commit()

    tool_dirs = set()

    table_cache = TableCache(engine, schema, is_sqlite3)

    for infile in infiles:

        path, name = os.path.split(infile)

        # walk up the path to find "benchmark.info" as it might be
        # located on a higher level if the tool output multiple files.
        parts = path.split(os.sep)

        info_paths = []
        rootdir = os.getcwd()
        while len(parts):
            p = os.path.join(*parts)
            if p == rootdir:
                break
            if os.path.exists(os.path.join(p, "benchmark.info")):
                info_paths.append(p)
            parts.pop()
        info_paths = info_paths[::-1]

        # the level of nesting determines the layout:
        # 1 level: aggregation: tool == metric
        # 2 levels: tool + metric
        # 3 levels: tool + split + metric
        if len(info_paths) not in (1, 2, 3):
            raise ValueError(
                "for {}, expected one, two or three paths with info, "
                "got {}".format(infile, len(info_paths)))

        meta_data = {}

        if len(info_paths) == 1:
            tool_dir = metric_dir = info_paths[0]
            split_dir = None
        elif len(info_paths) == 2:
            tool_dir, metric_dir = info_paths
            split_dir = None
            # if there are multiple output files in aggregation, use
            # intermediate paths as split_subset factors.
            td = len(tool_dir.split(os.sep))
            tm = len(metric_dir.split(os.sep))
            d = tm - td
            if d > 1:
                meta_data["split_subset"] = re.sub(
                    ".dir", "",
                    os.sep.join(metric_dir.split(os.sep)[td:-1]))
        elif len(info_paths) == 3:
            tool_dir, split_dir, metric_dir = info_paths

        if tool_dir:
            d = read_data(os.path.join(tool_dir, "benchmark.info"),
                          prefix="tool_")
            if "tool_action" in d:
                assert d["tool_action"] == "tool"
            meta_data.update(d)

        if metric_dir:
            d = read_data(os.path.join(metric_dir, "benchmark.info"),
                          prefix="metric_")
            if "metric_action" in d:
                # ignore splits, they will be added through metrics
                if d["metric_action"] == "split":
                    continue
                assert d["metric_action"] == "metric", \
                    "action for metric info {} is not 'metric', " \
                    "but '{}'".format(
                        os.path.join(metric_dir, "benchmark.info"),
                        d["metric_action"])
            meta_data.update(d)

        if split_dir:
            d = read_data(os.path.join(split_dir, "benchmark.info"),
                          prefix="split_")
            if "split_action" in d:
                assert d["split_action"] == "split"
            meta_data.update(d)
            subset = os.path.basename(os.path.dirname(info_paths[-1]))
            if subset.endswith(".dir"):
                subset = subset[:-len(".dir")]
            meta_data["split_subset"] = subset

        # tool_input_files can either be a dictionary if a tool
        # or a simple list if aggregation.
        try:
            tool_input_files = [x["path"]
                                for x in meta_data["tool_input_files"]]
        except TypeError:
            tool_input_files = meta_data["tool_input_files"]

        try:
            instance = BenchmarkInstance(
                run_id=benchmark_run.id,
                completed=datetime.datetime.fromtimestamp(
                    os.path.getmtime(infile)),
                input=",".join(tool_input_files),
                input_alias=meta_data["tool_input_alias"],
                tool_name=meta_data["tool_name"],
                tool_version=meta_data["tool_version"],
                tool_options=meta_data["tool_options"],
                tool_hash=meta_data["tool_option_hash"],
                tool_alias=meta_data.get("tool_alias", ""),
                metric_name=meta_data["metric_name"],
                metric_version=meta_data["metric_version"],
                metric_options=meta_data["metric_options"],
                metric_hash=meta_data["metric_option_hash"],
                metric_alias=meta_data.get("metric_alias", ""),
                split_name=meta_data.get("split_name", ""),
                split_version=meta_data.get("split_version", ""),
                split_options=meta_data.get("split_options", ""),
                split_hash=meta_data.get("split_option_hash", ""),
                split_alias=meta_data.get("split_alias", ""),
                split_subset=meta_data.get("split_subset", "all"),
                meta_data=json.dumps(meta_data))
        except KeyError as e:
            raise KeyError("missing required attribute {} in {}".format(
                str(e), str(meta_data)))

        session.add(instance)
        session.commit()

        # avoid multiple upload of tool data
        if tool_dir and tool_dir not in tool_dirs:
            tool_dirs.add(tool_dir)
            save_benchmark_timings(tool_dir,
                                   "tool_timings",
                                   engine, instance, schema,
                                   is_sqlite3)

        save_benchmark_timings(metric_dir,
                               "metric_timings",
                               engine, instance, schema, is_sqlite3)

        metric_table_filter = None
        if "metric_no_upload" in meta_data:
            if meta_data["metric_no_upload"] == "*":
                logger.warn("upload turned off for metric {}".format(
                    meta_data["metric_name"]))
                continue
            else:
                metric_table_filter = re.compile(
                    meta_data["metric_no_upload"])

        # multiple tablenames for multiple metric output
        #
        # Tables are added into schemas to avoid cluttering
        # the public namespace.
        # (if only blobs, no metric output file)
        if "metric_output_files" in meta_data:
            assert len(meta_data["metric_output_files"]) == \
                len(meta_data["metric_tablenames"])

            for output_file, tablename in zip(
                    meta_data["metric_output_files"],
                    meta_data["metric_tablenames"]):

                if metric_table_filter and \
                   metric_table_filter.search(tablename):
                    logger.warn(
                        "upload for table {} turned off".format(
                            tablename))
                    continue

                if not os.path.exists(output_file):
                    logger.warn(
                        "output file {} does not exist - "
                        "ignored".format(output_file))
                    continue

                if IOTools.is_empty(output_file):
                    logger.warn(
                        "output file {} is empty - ignored".format(
                            output_file))
                    continue

                try:
                    table = pandas.read_csv(output_file,
                                            sep="\t",
                                            comment="#",
                                            skip_blank_lines=True)
                except ValueError as e:
                    logger.warn("table {} can not be read: {}".format(
                        output_file, str(e)))
                    continue
                except pandas.parser.CParserError as e:
                    logger.warn(
                        "malformatted table {} can not be read: "
                        "{}".format(output_file, str(e)))
                    continue

                if len(table) == 0:
                    logger.warn(
                        "table {} is empty - ignored".format(
                            output_file))
                    continue

                tablename, table, dtypes = \
                    transform_table_before_upload(tablename,
                                                  table,
                                                  instance,
                                                  meta_data,
                                                  table_cache)

                if schema is None:
                    tn = tablename
                else:
                    tn = "{}.{}".format(schema, tablename)

                logger.debug("saving data from {} to table {}".format(
                    output_file, tn))
                # add foreign key
                table["instance_id"] = instance.id
                table_cache.add_table(table, tablename, dtypes)

        if "metric_blob_globs" in meta_data:
            metric_dir = meta_data["metric_outdir"]
            files = [glob.glob(os.path.join(metric_dir, x))
                     for x in meta_data["metric_blob_globs"]]
            files = IOTools.flatten(files)
            logger.debug(
                "uploading binary data in {} files from {} to "
                "table binary_data".format(len(files), metric_dir))
            for fn in files:
                with IOTools.open_file(fn, "rb") as inf:
                    data_row = BenchmarkBinaryData(
                        instance_id=instance.id,
                        filename=os.path.basename(fn),
                        path=fn,
                        data=inf.read())
                    session.add(data_row)
                session.commit()

    table_cache.close()
    touch(outfile)

    # upload table sizes
    df_sizes = pandas.DataFrame.from_records(
        list(table_cache.uploaded_sizes.items()),
        columns=["tablename", "bytes_uploaded"])
    df_sizes["bytes_resident"] = df_sizes.bytes_uploaded
    df_sizes["run_id"] = benchmark_run.id
    df_sizes["schema"] = schema
    save_table(df_sizes,
               engine,
               "metric_storage",
               schema=None,
               is_sqlite3=is_sqlite3)

    # check if arvados job
    if Arvados.have_arvados():
        try:
            arv_job_info = arvados.current_job()
        except KeyError:
            arv_job_info = None
        if arv_job_info is not None:
            arv_job = BenchmarkArvadosJob(
                run_id=benchmark_run.id,
                job_uuid=arv_job_info["uuid"],
                owner_uuid=arv_job_info["owner_uuid"])
            session.add(arv_job)
            session.commit()

    benchmark_run.status = "complete"
    session.commit()

    engine.dispose()
    del engine

    logger.info("uploaded results under run_id {}".format(
        benchmark_run.id))
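# Hedged sketch (hypothetical directory names, not part of the
# original module): mirrors the walk up the directory tree in
# upload_result() that collects directories containing
# "benchmark.info"; two hits correspond to the tool + metric layout.
def _example_collect_info_paths(path="run.dir/tool.dir/metric.dir",
                                info_dirs=("run.dir/tool.dir",
                                           "run.dir/tool.dir/metric.dir")):
    parts = path.split(os.sep)
    info_paths = []
    while len(parts):
        p = os.path.join(*parts)
        # stands in for os.path.exists(os.path.join(p, "benchmark.info"))
        if p in info_dirs:
            info_paths.append(p)
        parts.pop()
    info_paths = info_paths[::-1]
    assert info_paths == ["run.dir/tool.dir",
                          "run.dir/tool.dir/metric.dir"]
    return info_paths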
def run(self, infiles, outfile, params):

    tmpdir = P.get_temp_dir(clear=True)

    statements = ["mkdir {}".format(tmpdir)]

    if params.remove_fields:
        cleanup_statement = ("| {params.path} annotate "
                             "-x {params.remove_fields} "
                             "2> {outfile}_annotate.log ".format(
                                 **locals()))
    else:
        cleanup_statement = ""

    # the current pattern is probably overly specific and
    # substitutes ./. with 0/0
    if params.set_missing_genotype_to_reference:
        set_genotype = "| perl -p -e 's/\.\/\./0\/0/g'"
    else:
        set_genotype = ""

    # merge files in blocks of block_size, then merge the
    # resulting block VCFs in a second pass.
    with IOTools.open_file(outfile + ".filelist_blocks", "w") as blockf:
        for start in range(0, len(infiles), self.block_size):
            fn = outfile + ".filelist_{}".format(start)
            fn_vcf = os.path.join(tmpdir,
                                  "block_{}.vcf.gz".format(start))
            with IOTools.open_file(fn, "w") as outf:
                end = start + self.block_size
                outf.write("\n".join(infiles[start:end]) + "\n")

            statements.append("{params.path} merge "
                              "{params.options} "
                              "-O v "
                              "--file-list {outfile}.filelist_{start} "
                              "2> {outfile}_merge_{start}.log "
                              "{cleanup_statement} "
                              "{set_genotype} "
                              "| bgzip "
                              "> {fn_vcf}; "
                              "tabix -p vcf {fn_vcf}".format(**locals()))

            blockf.write(fn_vcf + "\n")

    if params.restrict_to_all:
        filter_statement = ("| {params.path} filter "
                            "--include \"FORMAT/GT != '.'\" "
                            "-O v "
                            "2> {outfile}_filter.log ".format(**locals()))
    else:
        filter_statement = ""

    statements.append("{params.path} merge "
                      "{params.options} "
                      "-O v "
                      "--file-list {outfile}.filelist_blocks "
                      "2> {outfile}_merge.log "
                      "{filter_statement} "
                      "| bgzip "
                      "> {outfile}; "
                      "tabix -p vcf {outfile} ".format(**locals()))

    statements.append("rm -rf {}".format(tmpdir))

    statement = "; ".join(statements)
    retvals = P.run(statement, **params._asdict())

    return retvals
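# Hedged sketch (not part of the original module): shows how the
# input file list is partitioned into blocks of ``block_size`` before
# the two-stage merge above.
def _example_blocks(n_files=7, block_size=3):
    infiles = ["sample_{}.vcf.gz".format(i) for i in range(n_files)]
    blocks = [infiles[start:start + block_size]
              for start in range(0, len(infiles), block_size)]
    assert [len(b) for b in blocks] == [3, 3, 1]
    return blocks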
def run(self, outfile, params):

    reference_fasta = resolve_argument(params.reference_fasta,
                                       ",").split(",")
    if len(reference_fasta) == 2:
        reference1, reference2 = reference_fasta
    else:
        raise NotImplementedError()

    outfile = os.path.abspath(outfile)
    outfile_fastq = os.path.join(os.path.dirname(outfile),
                                 "result.fastq.gz")

    # build BAM header
    with pysam.FastaFile(reference1) as inf:
        with IOTools.open_file(outfile + ".header.sam", "w") as outf:
            outf.write("@HD\tVN:1.3\tSO:unsorted\n")
            for contig, length in zip(inf.references, inf.lengths):
                outf.write("@SQ\tSN:{}\tLN:{}\n".format(contig, length))

    # not enough space on tmp
    # tmpdir = P.get_temp_filename(clear=True)
    tmpdir = os.path.join(os.path.dirname(outfile), "tmp")

    statements = []
    statements.append("mkdir -p {tmpdir}")

    if params.use_sample_method:
        fastq_filename = os.path.join(tmpdir, "tmp.fastq")
        if not os.path.exists(fastq_filename):
            if params.set_quality_score and \
               params.set_quality_score.strip():
                statements.append(
                    "daisy fastq2fastq "
                    "--quality-offset={params.set_quality_score} "
                    "--log={outfile}.fastq.log "
                    "{params.fastq} "
                    "> {fastq_filename}".format(**locals()))
            else:
                statements.append(
                    "zcat {params.fastq} > {tmpdir}/tmp.fastq")

        statements.append("cd {tmpdir}")
        statements.append("{params.path} "
                          "{params.options} "
                          "--sample-fastq={tmpdir}/tmp.fastq "
                          "{reference1} "
                          "--prefix=H1 "
                          ">& {outfile}.pbsim1.log")
        statements.append("{params.path} "
                          "{params.options} "
                          "--sample-fastq={tmpdir}/tmp.fastq "
                          "{reference2} "
                          "--prefix=H2 "
                          ">& {outfile}.pbsim2.log")
    else:
        statements.append("cd {tmpdir}")
        statements.append("{params.path} "
                          "{params.options} "
                          "--prefix=H1 "
                          "{reference1} "
                          ">& {outfile}.pbsim1.log")
        statements.append("{params.path} "
                          "{params.options} "
                          "--prefix=H2 "
                          "{reference2} "
                          ">& {outfile}.pbsim2.log")

    statements.append("daisy fastq2fastq "
                      "--input-fastq-file={tmpdir}/H1_0001.fastq "
                      "--output-removed-tsv={outfile}.removed1 "
                      "--set-prefix=H1 "
                      "--method=filter-N "
                      "--log={outfile}.log "
                      "| gzip "
                      "> {outfile_fastq}")
    statements.append("cat {tmpdir}/H1_0001.maf "
                      "| daisy maf2maf "
                      "--input-filter-tsv={outfile}.removed1 "
                      "--log={outfile}.maf.log "
                      "--set-prefix=H1 "
                      "> {tmpdir}/tmp.maf")
    statements.append("daisy fastq2fastq "
                      "--input-fastq-file={tmpdir}/H2_0001.fastq "
                      "--output-removed-tsv={outfile}.removed2 "
                      "--method=filter-N "
                      "--set-prefix=H2 "
                      "--log={outfile}.log "
                      "| gzip "
                      ">> {outfile_fastq}")
    statements.append("cat {tmpdir}/H2_0001.maf "
                      "| daisy maf2maf "
                      "--input-filter-tsv={outfile}.removed2 "
                      "--log={outfile}.maf.log "
                      "--set-prefix=H2 "
                      ">> {tmpdir}/tmp.maf")

    # TODO: generalize for chromosomes other than 22
    statements.append("maf-convert sam {tmpdir}/tmp.maf "
                      "| grep -v '^@' "
                      "| perl -p -e \"s/ref/22/\" "
                      ">> {tmpdir}/tmp.sam")
    statements.append("cat {outfile}.header.sam {tmpdir}/tmp.sam "
                      "| samtools view -bS "
                      "| samtools sort -T {tmpdir}/ -O bam - "
                      "> {outfile}")
    statements.append("samtools index {outfile}")
    statements.append("rm -rf {tmpdir}")

    statement = "; ".join(statements).format(**locals())
    return P.run(statement)