# a context manager that opens a cluster session for the duration of
# the managed block (assumes "from contextlib import contextmanager")
@contextmanager
def run_on_cluster(to_cluster):
    if to_cluster:
        P.start_session()
        try:
            yield
        finally:
            P.close_session()
    else:
        yield
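

# A minimal usage sketch of the context manager above; "statements" is
# a hypothetical list of shell commands, not taken from the snippet:
#
#     with run_on_cluster(to_cluster=True):
#         P.run(statements)
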
def setUp(self):
    TestExecutionRunLocal.setUp(self)
    P.start_session()
Example #3
def main(argv=None):

    parser = get_option_parser()

    (options, args) = E.start(parser, add_cluster_options=True)

    if len(args) == 0:
        raise ValueError(
            "command line argument missing - see usage information")

    options.renumber_column = [x.split(":") for x in options.renumber_column]

    cmd = args[0]
    if len(args) > 1:
        cmd += " '" + "' '".join(args[1:]) + "'"

    if options.dry_run:
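        # dry-run: execute the assembled command once, locally, and
        # exit without splitting the input or submitting jobs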

        cmd = re.sub("%DIR%", "", cmd)
        retcode = subprocess.call(cmd,
                                  shell=True,
                                  stdin=sys.stdin,
                                  stdout=sys.stdout,
                                  cwd=os.getcwd(),
                                  close_fds=True)
        E.stop()
        sys.exit(0)

    failed_requests = []
    started_requests = []
    niterations = 0

    # load pipeline parameters and open the cluster session
    P.get_parameters()
    P.start_session()

    if not options.collect:
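        # split phase: carve the input on stdin into chunks, build one
        # command per chunk and execute them via the pipeline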
        tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir))

        E.info("working in directory %s" % tmpdir)

        if options.split_at_lines:
            chunk_iterator = chunk_iterator_lines
            args = (options.split_at_lines, )
        elif options.split_at_column:
            chunk_iterator = chunk_iterator_column
            args = (options.split_at_column - 1, options.max_files)
        elif options.split_at_regex:
            chunk_iterator = chunk_iterator_regex_split
            args = (re.compile(options.split_at_regex), 0, options.chunksize,
                    options.max_lines)
        elif options.group_by_regex:
            chunk_iterator = chunk_iterator_regex_group
            args = (re.compile(options.group_by_regex), 0, options.chunksize)
        else:
            raise ValueError("please specify a way to chunk input data")

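        # all chunk iterators are assumed to share one protocol: write
        # successive chunks of stdin to files under ``tmpdir`` and
        # yield the chunk filenames, which become the per-job inputs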
        data = [(x, cmd, options, None, options.subdirs)
                for x in chunk_iterator(options.stdin,
                                        args,
                                        prefix=tmpdir,
                                        use_header=options.input_header)]

        statements = [build_command(x) for x in data]
        started_requests = [(x[0], x[0] + ".out") for x in data]

        if len(data) == 0:
            E.warn("no data received")
            E.stop()
            sys.exit(0)

        # run one job per chunk, on the cluster if one is configured
        P.run(statements)
    else:
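        # collect phase: skip splitting and running, just gather the
        # *.out files left behind by a previous invocation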
        tmpdir = options.collect
        started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")]

        E.info("collecting %i files from %s" % (len(started_requests), tmpdir))

    if failed_requests:
        for fn, cmd in failed_requests:
            E.error("failed request: filename= %s, cmd= %s" % (fn, cmd))
    else:
        E.info("building result from %i parts" % len(started_requests))

        if options.renumber:
            mapper = MapperLocal(pattern=options.renumber)
        else:
            mapper = MapperEmpty()

        # deal with stdout
        name = None
        index = None

        for pattern, column in options.renumber_column:

            if re.search(pattern, "stdout"):
                try:
                    index = int(column) - 1
                except ValueError:
                    name = column
                break

        if options.binary:
            ResultBuilderBinary()(started_requests, options.stdout, options)
        else:
            regex = None
            if options.output_regex_header:
                regex = re.compile(options.output_regex_header)
            ResultBuilder(mapper=mapper,
                          field_index=index,
                          field_name=name,
                          header_regex=regex)(started_requests, options.stdout,
                                              options)

        # deal with logfiles : combine them into a single file
        rr = re.search(r"'--log=(\S+)'", cmd) or re.search(r"'-L\s+(\S+)'", cmd)
        if rr:
            E.info("logging output goes to %s" % rr.groups()[0])
            logfile = iotools.open_file(rr.groups()[0], "a")
            ResultBuilderLog()([(x[0], "%s.log" % x[0])
                                for x in started_requests], logfile, options)
            logfile.close()

        # deal with other files
        if options.subdirs:

            files = glob.glob("%s/*.dir/*" % tmpdir)
            # remove directory
            filenames = set([os.path.basename(x) for x in files])
            xx = len(".out")

            for filename in filenames:

                _, filetype = os.path.splitext(filename)

                name = None
                index = None

                for pattern, column in options.renumber_column:
                    if re.search(pattern, filename):
                        try:
                            index = int(column) - 1
                        except ValueError:
                            name = column
                        break

                if options.binary:
                    builder = ResultBuilderBinary(mapper=mapper)
                elif filetype in (".fa", ".fasta"):
                    builder = ResultBuilderFasta(mapper=mapper)
                elif filetype in (".mali", ):
                    builder = ResultBuilderFasta(mapper=MapperEmpty())
                elif filetype in (".png", ):
                    builder = ResultBuilderCopies(mapper=mapper)
                else:
                    builder = ResultBuilder(mapper=mapper,
                                            field_index=index,
                                            field_name=name)

                E.debug("chose the following builder for %s: %s: %s" %
                        (filename, filetype, str(builder)))

                E.info("collecting results for %s" % filename)

                input_filenames = []
                for fi, fn in started_requests:
                    fn = fn[:-xx] + ".dir/" + filename
                    if os.path.exists(fn):
                        input_filenames.append((fi, fn))

                E.info("output of %i files goes to %s" %
                       (len(input_filenames), filename))

                outfile = iotools.open_file(options.output_pattern % filename,
                                            "w")
                builder(input_filenames, outfile, options)
                outfile.close()

    if not options.debug and (not options.resume or not options.collect):
        if len(failed_requests) == 0:
            E.info("removing directory %s" % tmpdir)
            shutil.rmtree(tmpdir)
        else:
            E.info("directory %s not removed due to %i failed jobs" %
                   (tmpdir, len(failed_requests)))

    E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" %
           (len(started_requests), len(started_requests) -
            len(failed_requests), len(failed_requests), niterations))

    E.stop()
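
# A hypothetical invocation of the script above (script name and exact
# option strings are assumptions, not taken from the snippet): split
# stdin every 1000 lines and run the quoted command on each chunk.
#
#     cat input.tsv | python farm.py --split-at-lines=1000 "grep -v '^#'"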
Example #4
def main(argv=sys.argv):

    TASKS = {}
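    # build a flat registry of runnable tasks: each runner map
    # contributes entries keyed "<label>_<key>"; duplicates are
    # rejected below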
    for label, collection in [("tool", map_tool_to_runner),
                              ("metric", map_metric_to_runner),
                              ("collate", map_collate_to_runner),
                              ("split", map_split_to_runner)]:
        for key, f in list(collection.items()):
            k = "{}_{}".format(label, key)
            if k in TASKS:
                raise ValueError("duplicate keys in TASK: {} {} {}".format(
                    k, TASKS[k], f))
            TASKS[k] = f

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-file",
                      dest="input_files",
                      type="string",
                      action="append",
                      help="input file. Can be used more than once [%default]")

    parser.add_option(
        "-s",
        "--input-slot",
        dest="input_slots",
        type="string",
        action="append",
        help="input slot. Must be used as often as input_files "
        "for tools [%default]")

    parser.add_option(
        "-o",
        "--output-file",
        dest="output_files",
        type="string",
        action="append",
        help="output file. Can be used more than once [%default]")

    parser.add_option(
        "-n",
        "--dry-run",
        dest="dry_run",
        action="store_true",
        help="show statement to be executed, do not execute [%default]")

    parser.add_option("--engine",
                      dest="engine",
                      type="choice",
                      choices=("local", "arvados"),
                      help="engine to use [%default]")

    parser.add_option("-t",
                      "--task",
                      dest="task",
                      type="choice",
                      choices=sorted(TASKS.keys()),
                      help="task to run [%default]")

    parser.add_option("-l",
                      "--list-tasks",
                      dest="list_tasks",
                      action="store_true",
                      help="list all available tasks and exit [%default]")

    parser.add_option("--always-mount",
                      dest="always_mount",
                      action="store_true",
                      help="force mounting of arvados keep [%default]")

    parser.set_defaults(
        input_files=[],
        input_slots=[],
        output_files=[],
        engine="local",
        dry_run=False,
        task=None,
        always_mount=False,
    )

    (options, args) = E.start(parser, argv, add_cluster_options=True)

    if options.list_tasks:
        options.stdout.write("available_tasks\n{}\n".format("\n".join(
            sorted(TASKS.keys()))))
        E.stop()
        return

    if len(options.input_files) == 0:
        raise ValueError("no input files specified, use --input-file")

    if len(options.output_files) == 0:
        raise ValueError("no output files specified, use --output-file")

    if options.task is None:
        raise ValueError("please specify a task to run (--task)")

    P.get_parameters()

    if options.engine == "arvados":

        raise ValueError("arvados support disabled")

        # unreachable while arvados support is disabled; kept for
        # reference:
        # crunch_json = Arvados.build_crunch_script(argv)
        # retval = E.run('arv-crunch-job --job="$(cat {})"'.format(crunch_json))
        # if retval != 0:
        #     raise ValueError("error while executing")
        # os.unlink(crunch_json)
        # E.stop()
        # return retval

    # Start SGE session
    if not options.without_cluster:
        P.start_session()

    # remaining positional arguments are assumed to be key=value pairs
    # that parameterise the task runner
    params = dict(parse_args(args))

    signal.signal(signal.SIGINT, cleanup)

    # redirect all mount points in parameters and input files.
    mountpoint = redirect2mounts([params, options.input_files],
                                 always_mount=options.always_mount)
    mountpoint = redirect_defaults2mountpoint(mountpoint)
    # register mountpoint with pipeline
    P.PARAMS["mount_point"] = mountpoint
    P.PARAMS["dryrun"] = options.dry_run

    try:
        # instantiate task runner
        runner = TASKS[options.task](**params)

        if len(options.output_files) == 0:
            # defensive only: an empty --output-file list already
            # raised a ValueError above, so this fallback should not
            # normally trigger
            tmpfile = tempfile.NamedTemporaryFile(delete=False)
            os.unlink(tmpfile.name)
            options.output_files.append(tmpfile.name)

        if options.task.startswith("tool"):
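            # tools consume named input slots, one slot per input file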
            if len(options.input_slots) != len(options.input_files):
                raise ValueError(
                    "for tools, provide the same number of input slots as "
                    "there are input files (--input-slots)")

            input_files = dict(zip(options.input_slots, options.input_files))

            runner.register_input(input_files)
            # check if expected is in params
            runner(list(input_files.values()), options.output_files[0])
        elif options.task.startswith("metric"):
            runner(options.input_files[0], options.output_files[0])
        elif options.task.startswith("collate"):
            runner(options.input_files, options.output_files[0])
        elif options.task.startswith("split"):
            runner(options.input_files[0], options.output_files)

        # stop SGE session
        P.close_session()

    finally:
        cleanup()

    E.stop()
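

# A minimal entry point, assuming this module is run as a script (the
# snippet itself ends without a __main__ guard):
if __name__ == "__main__":
    sys.exit(main(sys.argv))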