def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    tools = glob.glob(
        os.path.join(os.path.dirname(__file__), "..",
                     "src", "daisy", "tools", "*.py"))

    counter = E.Counter()

    for tool in tools:
        counter.found += 1

        # map module name (underscores) to tool name (dashes)
        tool_module = re.sub(r"\.py$", "", os.path.basename(tool))
        tool_name = re.sub("_", "-", tool_module)

        if tool_module in ("__init__", "cli"):
            counter.ignored += 1
            continue

        dest = os.path.join("tools", "{}.rst".format(tool_name))

        if os.path.exists(dest) and not options.output_force:
            counter.skipped += 1
            continue

        with IOTools.openFile(dest, "w") as outf:
            outf.write(TEMPLATE_TOOL.format(**locals()))
        counter.new += 1

    E.info(counter)
    E.stop()
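# TEMPLATE_TOOL is defined elsewhere in this script and is not shown here.
# As an illustration only, a minimal RST stub compatible with the
# .format(**locals()) call above (it may only reference local names such as
# tool_name and tool_module) could look like the sketch below; the template
# actually shipped with the repository may differ.
TEMPLATE_TOOL_EXAMPLE = """\
.. _{tool_name}:

{tool_name}
===========================================================

.. automodule:: daisy.tools.{tool_module}
   :members:
"""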
def run(self, outfile, params):

    prefix = IOTools.snip(outfile, ".vcf.gz")
    bams = resolve_argument(params.bam, ",")
    reference_fasta = get_reference(params)

    statements, gvcfs = [], []
    # TODO: sort out multi-threading
    for idx, bam in enumerate(bams.split(",")):
        output = prefix + "." + str(idx) + ".g.vcf"
        gvcfs.append(output)
        if os.path.exists(output):
            E.info("{} already exists - skipped".format(output))
            continue

        statements.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type HaplotypeCaller "
            "--input_file {bam} "
            "--reference_sequence {reference_fasta} "
            "--emitRefConfidence GVCF "
            "--logging_level INFO "
            "--log_to_file {prefix}.HaplotypeCaller.{idx}.log "
            "{params.haplotypecaller} "
            "--out {output} "
            ">& {prefix}.HaplotypeCaller.{idx}.err".format(**locals()))

    if statements:
        self.run_statements(statements, job_memory="4G")

    stmnts = []
    gvcfs = " ".join(["--variant {}".format(x) for x in gvcfs])
    vcf_output = prefix + ".raw.vcf.gz"
    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type GenotypeGVCFs "
        "--reference_sequence {reference_fasta} "
        "{gvcfs} "
        "--logging_level INFO "
        "--log_to_file {prefix}.GenotypeGVCFs.log "
        "{params.genotypegvcfs} "
        "--out {vcf_output} "
        ">& {prefix}.GenotypeGVCFs".format(**locals()))

    stmnts.extend(
        self.build_calibration_workflow(outfile, prefix, vcf_output, params))

    return self.run_statements(stmnts, job_memory="4G")
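# resolve_argument() is imported from the task library and not shown here.
# A hedged sketch of the behaviour assumed above -- flattening a parameter
# that may arrive as a list or as a single string into one separator-joined
# string -- is given below; the real helper may do more (for example glob
# expansion), so treat this purely as an illustration.
def resolve_argument_sketch(argument, sep=","):
    if isinstance(argument, (list, tuple)):
        return sep.join(map(str, argument))
    return str(argument)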
def ignore_task(self, infiles, outfiles, params):
    """return True if task should be ignored.

    This method will also create the output file(s).
    """
    if self._ignore:
        m = str(outfiles)
        for ignore in IOTools.val2list(self._ignore):
            if ignore in m:
                E.warn("task {} will be ignored".format(self.__name__))
                for f in IOTools.val2list(outfiles):
                    E.info("creating empty file {}".format(f))
                    IOTools.touch_file(f)
                return True
    return False
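# Hypothetical usage, names illustrative only: with self._ignore set to
# ["chrUn"], a task whose output file names contain "chrUn" is skipped and
# its outputs are created as empty placeholder files, e.g.
#
#   runner._ignore = ["chrUn"]
#   if runner.ignore_task(["in.bam"], ["out.chrUn.vcf.gz"], params):
#       return  # outputs have already been touched, nothing left to do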
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    total_counter = E.Counter()
    table = []

    for section, map_task2runner in [
            ("tool", map_tool_to_runner),
            ("metric", map_metric_to_runner),
            ("split", map_split_to_runner),
            ("collate", map_collate_to_runner)]:
        E.debug("processing section: {}".format(section))
        counter = E.Counter()

        for task, taskf in sorted(map_task2runner.items()):
            counter.ntasks += 1
            comments = []
            try:
                version = taskf().get_version()
                counter.version_ok += 1
            except Exception:
                version = ""
                comments.append("unavailable")
                counter.version_fail += 1

            comments = "; ".join(comments)
            table.append((section, task, version, comments))

        E.info("{}: {}".format(section, counter))
        total_counter += counter

    options.stdout.write("section\ttask\tversion\tcomments\n")
    for row in table:
        options.stdout.write("\t".join(map(str, row)) + "\n")

    E.info("{}: {}".format("total", total_counter))
    E.stop()
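# For reference, the report written to stdout is a four-column, tab-separated
# table; the tool names and versions below are illustrative only:
#
#   section  task                version  comments
#   tool     bwa-mem             0.7.17
#   metric   samtools-flagstat   1.9
#   split    split-bam                    unavailable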
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-d", "--database", dest="databases", action="append",
        help="source database to merge; use multiple times to merge "
        "several databases [%default]")

    parser.add_option(
        "-f", "--filter", dest="filter_method", type="choice",
        choices=("first", "last"),
        help="only transfer the first or last run from each source "
        "database [%default]")

    parser.add_option(
        "-t", "--target", dest="target_database", type="string",
        help="the target database [%default]")

    parser.set_defaults(filter_method=None,
                        databases=[])

    (options, args) = E.start(parser, argv)

    run_id_offset = 0
    instance_id_offset = 0

    for database in options.databases:
        source_db = sqlite3.connect(database)

        is_instance = False
        is_run = False

        cc = source_db.cursor()
        min_run_id = cc.execute("SELECT MIN (id) FROM run").fetchall()[0][0]
        max_run_id = cc.execute("SELECT MAX (id) FROM run").fetchall()[0][0]
        max_instance_id = cc.execute(
            "SELECT MAX (id) FROM instance").fetchall()[0][0]

        E.info("{}: min_run_id={}, max_run_id={}, max_instance_id={}".format(
            database, min_run_id, max_run_id, max_instance_id))

        for line in source_db.iterdump():

            if line.startswith("CREATE TABLE"):
                try:
                    tablename = re.search(
                        r'CREATE TABLE "(\S+)"', line).groups()[0]
                except AttributeError:
                    tablename = re.search(
                        r"CREATE TABLE (\S+)", line).groups()[0]

                is_instance = False
                is_run = False

                if tablename == "run":
                    offset = run_id_offset
                    pos = "first"
                    is_run = True
                elif tablename == "tags":
                    offset = run_id_offset
                    pos = "first"
                elif tablename == "instance":
                    is_instance = True
                elif tablename == "tool_timings":
                    offset = instance_id_offset
                    pos = "last"
                elif tablename == "metric_timings":
                    offset = instance_id_offset
                    pos = "last"
                else:
                    # metric table
                    offset = instance_id_offset
                    pos = "last"

            elif line.startswith("INSERT INTO"):
                if is_instance:
                    i, n = re.search(
                        r"VALUES\((\d+),(\d+),", line).groups()
                    if apply_run_filter(n, options.filter_method,
                                        min_run_id, max_run_id):
                        line = None
                    else:
                        line = re.sub(
                            r"VALUES\({},{},".format(i, n),
                            "VALUES({},{},".format(
                                int(i) + instance_id_offset,
                                int(n) + run_id_offset),
                            line)
                else:
                    if pos == "last":
                        n = re.search(r",(\d+)\)", line).groups()[0]
                        line = re.sub(r",{}\)".format(n),
                                      ",{})".format(int(n) + offset),
                                      line)
                    elif pos == "first":
                        n = re.search(r"VALUES\((\d+),", line).groups()[0]
                        line = re.sub(r"VALUES\({},".format(n),
                                      "VALUES({},".format(int(n) + offset),
                                      line)
                    if is_run:
                        if apply_run_filter(n, options.filter_method,
                                            min_run_id, max_run_id):
                            line = None

            if line is not None:
                print(line)

        cc = source_db.cursor()
        run_id_offset += max_run_id
        instance_id_offset += max_instance_id

        E.info("{}: updated offsets to run_id={}, instance_id={}".format(
            database, run_id_offset, instance_id_offset))

    E.stop()
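# apply_run_filter() is defined elsewhere in this script.  A minimal sketch
# consistent with its use above -- returning True when a row should be
# dropped because it does not belong to the requested first or last run of
# the source database -- might look like this (illustrative only):
def apply_run_filter_sketch(run_id, method, min_run_id, max_run_id):
    """return True if the row belonging to run_id should be skipped."""
    if method is None:
        return False
    if method == "first":
        return int(run_id) != min_run_id
    if method == "last":
        return int(run_id) != max_run_id
    raise ValueError("unknown filter method: {}".format(method))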
def purge_run_id(run_id, url, dry_run=False, schemas=None):
    """remove a run from a database.
    """

    engine = sqlalchemy.create_engine(url)
    connection = engine.connect()

    # automap
    metadata = sqlalchemy.MetaData()
    metadata.reflect(engine)
    base = automap_base(metadata=metadata)
    base.prepare()

    if schemas is None:
        insp = reflection.Inspector.from_engine(engine)
        schemas = insp.get_schema_names()
        # note: default sqlite schema is "main"
        if 'public' in schemas:
            schemas.remove('public')
        if 'information_schema' in schemas:
            schemas.remove('information_schema')

    E.debug("getting instance_id list of run_id={}".format(run_id))
    instance_ids = set(get_instance_ids_for_run_id(run_id, engine))
    E.debug("found {} instances for run_id={}".format(len(instance_ids),
                                                      run_id))

    non_metric_tables = ['run',
                         'arvados_job',
                         'instance',
                         'binary_data',
                         'metric_timings',
                         'tool_timings',
                         'metric_storage',
                         'tags']

    # delete from tables with field "instance_id"
    if instance_ids:

        for schema in schemas:
            # automap the schema
            metadata_schema = sqlalchemy.MetaData()
            metadata_schema.reflect(engine, schema=schema)
            base_schema = automap_base(metadata=metadata_schema)
            base_schema.prepare()

            for table_name in list(base_schema.metadata.tables.keys()):
                table = sqlalchemy.Table(table_name,
                                         metadata_schema,
                                         autoload=True)
                if "instance_id" not in table.c:
                    continue
                E.info("deleting data in {}".format(table_name))
                delete = table.delete().where(
                    table.c.instance_id.in_(instance_ids))
                # E.debug(delete)
                if not dry_run:
                    connection.execute(delete)

    # delete from tables with field "run_id"
    for table_name in base.metadata.tables.keys():
        table = sqlalchemy.Table(table_name, metadata, autoload=True)
        if "run_id" not in table.c:
            continue
        E.info("deleting data in {} for run_id {}".format(table_name, run_id))
        delete = table.delete().where(table.c.run_id == run_id)
        # E.debug(delete)
        if not dry_run:
            connection.execute(delete)

    table = sqlalchemy.Table('run', metadata, autoload=True)
    delete = table.delete().where(table.c.id == run_id)
    E.info("deleting data in 'run' for id {}".format(run_id))
    # E.debug(delete)
    if not dry_run:
        connection.execute(delete)
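# Hedged usage sketch: remove run 42 from a local SQLite benchmark database.
# The database URL is an assumption; with dry_run=True the DELETE statements
# are built but not executed.
#
#   purge_run_id(42, "sqlite:///csvdb", dry_run=True)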
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-l", "--logfile", dest="logfile", type="string",
                      help="name of logfile [default=%default]")

    parser.add_option("-t", "--time", dest="time", type="choice",
                      choices=("seconds", "milliseconds"),
                      help="time to show [default=%default]")

    parser.add_option(
        "--no-reset", dest="reset", action="store_false",
        help="do not reset counters when a new pipeline run is started. "
        "The default is to reset so that only the counts from the latest "
        "pipeline execution are shown "
        "[default=%default]")

    parser.add_option(
        "-f", "--filter-method", dest="filter", type="choice",
        choices=("unfinished", "running", "completed", "all"),
        help="apply filter to output [default=%default]")

    parser.add_option(
        "-s", "--sort-order", dest="sort_order", type="choice",
        choices=("object", "ncalls", "duration", "percall", "running"),
        help="sort output by this column [default=%default]")

    parser.add_option(
        "-i", "--ignore-errors", dest="ignore_errors", action="store_true",
        help="ignore errors [default=%default]")

    parser.set_defaults(sections=[],
                        logfile="pipeline.log",
                        filter="all",
                        reset=True,
                        sort_order="duration",
                        time="seconds")

    (options, args) = E.start(parser, argv)

    if options.sections:
        profile_sections = options.sections
    else:
        profile_sections = ("task", "job")

    counts = {}
    for section in profile_sections:
        counts[section] = collections.defaultdict(Counter)

    def line_grouper(filename):
        """yield logical log records, joining continuation lines that do
        not start with a time stamp."""
        rx = re.compile(r"\d{4}-\d{2}-\d{2} ")
        with IOTools.open_file(filename) as infile:
            last_line = None
            for line in infile:
                line = line.strip()
                if not rx.match(line):
                    last_line = " ".join((last_line, line))
                else:
                    if last_line:
                        yield last_line
                    last_line = line
            yield last_line

    for line in line_grouper(options.logfile):

        data = line.split()
        if len(data) < 5:
            continue

        date, time, level, pipeline, source = data[:5]

        if re.search("output generated by", line):
            if options.reset:
                E.info("resetting counts at line=%s" % line[:-1])
                for section in profile_sections:
                    counts[section] = collections.defaultdict(Counter)
            continue

        # filter for log messages from the task module
        if source != "task":
            continue

        dt = datetime.datetime.strptime(
            " ".join((date, time)), "%Y-%m-%d %H:%M:%S,%f")

        msg = "".join(data[5:])

        started_task, completed_task, started_job, completed_job = \
            (None, None, None, None)

        if re.search(r"task.log_at_level.\d+Task=(\S+)", msg):
            # task check message; noted but not counted below
            checked_task = re.search(
                r"task.log_at_level.\d+Task=(\S+)", msg).groups()[0]
        elif re.search(r"Job=\[(\S+)->(\S+)\].*Missingfile[s]*\[(\S+)\]", msg):
            started_infiles, started_job, missing = re.search(
                r"Job=\[(\S+)->(\S+)\].*Missingfile[s]*\[(\S+)\]",
                msg).groups()
        elif re.search(r"Job=\[(\S+)->(\S+)\].*Missingoutputfile", msg):
            started_infiles, started_job = re.search(
                r"Job=\[(\S+)->(\S+)\].*Missingoutputfile", msg).groups()
        elif re.search(r"Job=\[(\S+)->(\S+)\].*Missingfile[s]*", msg):
            started_infiles, started_job = re.search(
                r"Job=\[(\S+)->(\S+)\].*Missingfile[s]*", msg).groups()
        elif re.search(r"Taskentersqueue=(\S+)", msg):
            started_task = re.search(
                r"Taskentersqueue=(\S+)", msg).groups()[0]
        elif re.search(r"Job=\[(\S+)->(\S+)\]completed", msg):
            completed_infiles, completed_job = re.search(
                r"Job=\[(\S+)->(\S+)\]completed", msg).groups()
        elif re.search(r"CompletedTask=(\S+)", msg):
            completed_task = re.search(
                r"CompletedTask=(\S+)", msg).groups()[0]
        elif re.search(r"UptodateTask=(\S+)", msg):
            completed_task = re.search(
                r"UptodateTask=(\S+)", msg).groups()[0]
        else:
            continue

        try:
            if started_task:
counts["task"][(pipeline, started_task)].add(True, dt, started_task) elif completed_task: counts["task"][(pipeline, completed_task)].add(False, dt, completed_task) elif started_job: counts["job"][(pipeline, started_job)].add(True, dt, started_job) elif completed_job: counts["job"][(pipeline, completed_job)].add(False, dt, completed_job) else: raise ValueError("unknown action") except ValueError as msg: if not options.ignore_errors: raise ValueError(str(msg) + "\nat line %s" % line) def to_milliseconds(d): return d.seconds + d.microseconds / 1000 def to_seconds(d): return d.seconds + d.microseconds / 1000000 if options.time == "milliseconds": f = to_milliseconds elif options.time == "seconds": f = to_seconds for section in profile_sections: running = [] rows = [] for objct, c in list(counts[section].items()): # apply filters if options.filter in ("unfinished", "running") and c.running == 0: continue d = f(c.duration) if c.calls > 0: percall = "%6.3f" % (d / float(c.calls)) else: percall = "na" rows.append((section, objct[0], objct[1], c.calls, d, percall, c.running)) running.extend([x for x, y in c._started.items() if y != 0]) header = ("section", "pipeline", "object", "ncalls", "duration", "percall", "running") options.stdout.write("\t".join((header)) + "\n") idx = header.index(options.sort_order) rows = sorted(rows, key=lambda x: x[idx]) options.stdout.write("\n".join( ["\t".join(map(str, x)) for x in rows]) + "\n") options.stdout.write("#//\n\n") if running: options.stdout.write("# running %ss\n" % section) options.stdout.write("\n".join(map(str, running)) + "\n") options.stdout.write("#//\n\n") E.stop()