Example #1
def predict(ctx, classifier_paths, test_data_paths, output_path, class_prior,
            overwrite, n_jobs):
    """batch testing of classifiers"""
    args = locals()
    args.pop("ctx")
    args.pop("n_jobs")
    args.pop("classifier_paths")
    args.pop("test_data_paths")

    class_pattern = FILENAME_PATTERNS["train"]
    classifier_fns = exec_command(
        f"find {classifier_paths} -name {class_pattern}")
    classifier_fns = classifier_fns.splitlines()

    if "*" in test_data_paths and len(classifier_fns) == 1:
        classifier_fns = classifier_fns[0]
        func = _get_predict_query_argsets
    else:
        func = _get_predict_test_argsets

    arg_sets = func(args, classifier_fns, test_data_paths, output_path,
                    overwrite)

    if n_jobs > 1:
        parallel.use_multiprocessing(n_jobs)

    total = len(arg_sets)
    gen = parallel.imap(lambda args: ctx.invoke(mutori_predict, **args),
                        arg_sets)
    for r in tqdm(gen, ncols=80, total=total):
        pass
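
Examples #1, #5, #6 and #7 all share the same fan-out shape: prune `locals()` down to the keyword arguments the worker needs, build one argument set per job, map a worker over the sets, and drain the resulting generator through tqdm so a progress bar renders. The stand-in below reproduces that shape using only the standard library and tqdm; `run_one` and its arguments are hypothetical, and a module-level worker is used because plain multiprocessing must pickle the callable (the lambda above relies on whatever `parallel.imap` does internally).

import multiprocessing

from tqdm import tqdm


def run_one(kwargs):
    # hypothetical stand-in for ctx.invoke(mutori_predict, **kwargs)
    return sum(kwargs["values"])


def fan_out(arg_sets, n_jobs=1):
    total = len(arg_sets)
    if n_jobs > 1:
        with multiprocessing.Pool(n_jobs) as pool:
            gen = pool.imap_unordered(run_one, arg_sets)
            for _ in tqdm(gen, ncols=80, total=total):
                pass  # draining the generator drives the progress bar
    else:
        for _ in tqdm(map(run_one, arg_sets), ncols=80, total=total):
            pass


if __name__ == "__main__":
    fan_out([{"values": [1, 2, 3]}, {"values": [4, 5]}], n_jobs=2)
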
Example #2
    def imap(self, f, s, mininterval=1.0, parallel=False, par_kw=None, **kw):
        self.mininterval = mininterval
        if parallel:
            # TODO: document the supported PAR.imap arguments
            par_kw = par_kw or {}
            results = PAR.imap(f, s, **par_kw)
        else:
            results = map(f, s)  # serial fallback
        for result in self.series(results, count=len(s), **kw):
            yield result
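
A self-contained miniature of the same serial/parallel switch, with a stub `series` standing in for the real progress display, might look like the following; all names here are illustrative, not the actual class:

class MiniUI:
    def imap(self, f, s, parallel=False, par_kw=None, **kw):
        # this sketch only implements the serial path; the real method
        # swaps in PAR.imap(f, s, **par_kw) when parallel=True
        results = map(f, s)
        yield from self.series(results, count=len(s), **kw)

    def series(self, results, count=None):
        # stub: the real series presumably renders a progress bar
        for i, result in enumerate(results, 1):
            print(f"{i}/{count}")
            yield result


for value in MiniUI().imap(str.upper, list("abc")):
    print(value)
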
Example #3
    def test_is_master_process(self):
        """is_master_process() should return False for all child processes"""
        index = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        master_processes = 0
        for result in parallel.imap(
            check_is_master_process, index, max_workers=None, use_mpi=False
        ):
            if result:
                master_processes += 1
        self.assertEqual(master_processes, 0)
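
The test assumes a module-level helper, `check_is_master_process`, that each worker runs. Given the docstring, a plausible sketch is a thin wrapper over `is_master_process()`, assuming the `parallel` module under test exposes that function (its docstring indicates it does); the argument exists only so `imap` has something to fan out:

from cogent3.util import parallel  # assumed source of the module under test


def check_is_master_process(_):
    # each worker reports whether it believes it is the master process
    return parallel.is_master_process()
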
Example #4
    def run(self, aln):
        result = bootstrap_result(aln.info.source)
        try:
            obs = self._hyp(aln)
        except ValueError as err:
            result = NotCompleted("ERROR", str(self._hyp), err.args[0])
            return result
        result.observed = obs
        self._null = obs.null
        self._inpath = aln.info.source

        sym_results = [
            r for r in parallel.imap(self._fit_sim, range(self._num_reps)) if r
        ]
        # the comprehension above already drops failed (falsy) fits
        for sym_result in tqdm(sym_results):
            result.add_to_null(sym_result)

        return result
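
`add_to_null` accumulates the simulated fits into an empirical null distribution; the bootstrap p-value is then the fraction of simulated statistics at least as extreme as the observed one. A standalone illustration of that final step, independent of the classes above, with the +1 correction that avoids a zero p-value:

def bootstrap_pvalue(observed, null_stats):
    # fraction of simulated statistics >= the observed statistic
    hits = sum(1 for stat in null_stats if stat >= observed)
    return (hits + 1) / (len(null_stats) + 1)


print(bootstrap_pvalue(4.2, [0.3, 1.1, 5.0, 2.2]))  # 0.4
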
Example #5
def ocs_train(ctx, training_path, output_path, label_col, seed, max_flank,
              flank_sizes, model_range, proximal, usegc, n_jobs, overwrite):
    """batch one class SVM training"""
    args = locals()
    args.pop("ctx")
    args.pop("n_jobs")
    args.pop("max_flank")
    args.pop("flank_sizes")
    args.pop("model_range")

    arg_sets = get_train_kwarg_sets(training_path, output_path, max_flank,
                                    flank_sizes, model_range, usegc, proximal,
                                    args)
    if n_jobs > 1:
        parallel.use_multiprocessing(n_jobs)

    total = len(arg_sets)
    gen = parallel.imap(lambda args: ctx.invoke(mutori_ocs_train, **args),
                        arg_sets)
    for r in tqdm(gen, ncols=80, total=total):
        pass
Example #6
def sample_data(ctx, enu_path, germline_path, output_path, seed, enu_ratio,
                numreps, overwrite, size_range, n_jobs):
    """batch creation training/testing sample data"""
    args = locals()
    args.pop("ctx")
    args.pop("n_jobs")
    args.pop("size_range")
    sizes = [int(x) for x in size_range.split(",")]

    arg_sets = []
    for size in sizes:
        arg_group = args.copy()
        arg_group['train_size'] = size * 1000
        arg_group['output_path'] = os.path.join(output_path, f"{size}k")
        arg_sets.append(arg_group)

    if n_jobs > 1:
        parallel.use_multiprocessing(n_jobs)

    total = len(arg_sets)
    gen = parallel.imap(lambda args: ctx.invoke(mutori_sample, **args),
                        arg_sets)
    for r in tqdm(gen, ncols=80, total=total):
        pass
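
As a worked illustration of the expansion above, `size_range="1,2,5"` produces three argument groups whose `train_size` values are 1000, 2000 and 5000, written under `1k/`, `2k/` and `5k/` subdirectories:

sizes = [int(x) for x in "1,2,5".split(",")]
print([(s * 1000, f"{s}k") for s in sizes])
# [(1000, '1k'), (2000, '2k'), (5000, '5k')]
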
Example #7
def performance(ctx, test_data_paths, predictions_path, output_path, label_col,
                overwrite, n_jobs, verbose):
    """batch classifier performance assessment"""
    args = locals()
    args.pop("ctx")
    args.pop("n_jobs")
    args.pop("test_data_paths")
    args.pop("predictions_path")
    args.pop("output_path")

    predict_pattern = FILENAME_PATTERNS["predict"]
    if "*" not in test_data_paths:
        test_pattern = FILENAME_PATTERNS["sample_data"]["test"]
        test_fns = exec_command(f"find {test_data_paths} -name {test_pattern}")
        data_fns = test_fns.splitlines()

        data_mapped = {}
        for path in data_fns:
            size = sample_size_from_path(path)
            size = f"{size // 1000}k"
            rep = data_rep_from_path("sample_data", path)
            data_mapped[(size, rep)] = path

        predict_fns = exec_command(
            f"find {predictions_path} -name {predict_pattern}")
        predict_fns = predict_fns.splitlines()
        paired = []
        for path in predict_fns:
            paths = dict(predictions_path=path)
            size = sample_size_from_path(path)
            size = f"{size // 1000}k"
            rep = data_rep_from_path("train", path)
            featdir = feature_set_from_path(path)
            paths.update(
                dict(data_path=data_mapped[(size, rep)],
                     size=size,
                     featdir=featdir))
            paired.append(paths)
    else:
        data_fns = glob.glob(test_data_paths)
        data_mapped = {}
        for fn in data_fns:
            bn = os.path.basename(fn).replace(".tsv.gz", "")
            data_mapped[bn] = fn

        predict_fns = exec_command(
            f"find {predictions_path} -name {predict_pattern}")
        predict_fns = predict_fns.splitlines()
        paired = []
        for path in predict_fns:
            components = path.split('-')
            for key in data_mapped:
                if key in components:
                    paired.append(
                        dict(predictions_path=path,
                             data_path=data_mapped[key]))
                    break

    arg_sets = []
    for pair in paired:
        arg_group = args.copy()
        try:
            size = pair.pop('size')
            featdir = pair.pop('featdir')
            arg_group['output_path'] = os.path.join(output_path, size, featdir)
        except KeyError:
            arg_group['output_path'] = output_path
        arg_group.update(pair)
        arg_sets.append(arg_group)

    if n_jobs > 1:
        parallel.use_multiprocessing(n_jobs)

    total = len(arg_sets)
    gen = parallel.imap(lambda args: ctx.invoke(mutori_performance, **args),
                        arg_sets)
    for r in tqdm(gen, ncols=80, total=total):
        pass
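
Example #7 discovers files by shelling out to `find` through `exec_command`, which ties it to Unix-like systems. A portable stand-in, as a sketch rather than a drop-in replacement, could use `pathlib` for the same recursive pattern match:

from pathlib import Path


def find_files(root, pattern):
    # pure-Python stand-in for exec_command(f"find {root} -name {pattern}")
    return [str(p) for p in Path(root).rglob(pattern)]


print(find_files(".", "*.tsv.gz"))
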