def predict(ctx, classifier_paths, test_data_paths, output_path, class_prior, overwrite, n_jobs):
    """batch testing of classifiers"""
    # Snapshot the call's keyword arguments, then discard the ones that are
    # consumed at the batch level rather than forwarded to each invocation.
    args = locals()
    for consumed in ("ctx", "n_jobs", "classifier_paths", "test_data_paths"):
        args.pop(consumed)

    # Locate trained classifier files under classifier_paths via `find`.
    class_pattern = FILENAME_PATTERNS["train"]
    found = exec_command(f"find {classifier_paths} -name {class_pattern}")
    classifier_fns = found.splitlines()

    # A glob in test_data_paths with exactly one classifier means "apply one
    # classifier to many query files"; otherwise pair classifiers with their
    # matching test sets.
    if "*" in test_data_paths and len(classifier_fns) == 1:
        classifier_fns = classifier_fns[0]
        builder = _get_predict_query_argsets
    else:
        builder = _get_predict_test_argsets

    arg_sets = builder(args, classifier_fns, test_data_paths, output_path, overwrite)

    if n_jobs > 1:
        parallel.use_multiprocessing(n_jobs)

    # Dispatch each argument set to the single-run predict command; tqdm just
    # drains the generator to display progress.
    gen = parallel.imap(lambda kwargs: ctx.invoke(mutori_predict, **kwargs), arg_sets)
    for _ in tqdm(gen, ncols=80, total=len(arg_sets)):
        pass
def ocs_train(ctx, training_path, output_path, label_col, seed, max_flank, flank_sizes, model_range, proximal, usegc, n_jobs, overwrite):
    """batch one class SVM training"""
    # Capture keyword arguments for forwarding; strip those handled here
    # (batch control and the sweep parameters expanded by the kwarg builder).
    args = locals()
    for consumed in ("ctx", "n_jobs", "max_flank", "flank_sizes", "model_range"):
        args.pop(consumed)

    # Expand the flank/model sweep into one kwargs dict per training run.
    arg_sets = get_train_kwarg_sets(
        training_path,
        output_path,
        max_flank,
        flank_sizes,
        model_range,
        usegc,
        proximal,
        args,
    )

    if n_jobs > 1:
        parallel.use_multiprocessing(n_jobs)

    # Run each training job via the single-run command; tqdm drains the
    # generator purely for progress display.
    gen = parallel.imap(lambda kwargs: ctx.invoke(mutori_ocs_train, **kwargs), arg_sets)
    for _ in tqdm(gen, ncols=80, total=len(arg_sets)):
        pass
def sample_data(ctx, enu_path, germline_path, output_path, seed, enu_ratio, numreps, overwrite, size_range, n_jobs):
    """batch creation training/testing sample data"""
    # Snapshot kwargs to forward to each sampling run, dropping the ones
    # consumed at the batch level.
    args = locals()
    args.pop("ctx")
    args.pop("n_jobs")
    args.pop("size_range")

    # size_range is a comma-separated list of sizes in units of 1000
    # (e.g. "1,5,10" -> train sizes 1000, 5000, 10000).
    sizes = [int(size) for size in size_range.split(",")]

    # One argument set per requested size; each run writes under a
    # size-labelled subdirectory, e.g. <output_path>/5k.
    arg_sets = []
    for size in sizes:
        arg_group = args.copy()
        arg_group["train_size"] = size * 1000
        arg_group["output_path"] = os.path.join(output_path, f"{size}k")
        arg_sets.append(arg_group)

    if n_jobs > 1:
        parallel.use_multiprocessing(n_jobs)

    # Dispatch each sampling job to the single-run command; tqdm drains the
    # generator purely for progress display.
    gen = parallel.imap(lambda kwargs: ctx.invoke(mutori_sample, **kwargs), arg_sets)
    for _ in tqdm(gen, ncols=80, total=len(arg_sets)):
        pass
def _pair_by_size_rep(test_data_paths, predictions_path, predict_pattern):
    """Pair prediction files with test data keyed by (sample size, replicate)."""
    test_pattern = FILENAME_PATTERNS["sample_data"]["test"]
    test_fns = exec_command(f"find {test_data_paths} -name {test_pattern}")
    data_mapped = {}
    for path in test_fns.splitlines():
        size = f"{sample_size_from_path(path) // 1000}k"
        rep = data_rep_from_path("sample_data", path)
        data_mapped[(size, rep)] = path

    predict_fns = exec_command(
        f"find {predictions_path} -name {predict_pattern}"
    ).splitlines()
    paired = []
    for path in predict_fns:
        size = f"{sample_size_from_path(path) // 1000}k"
        rep = data_rep_from_path("train", path)
        # NOTE(review): raises KeyError if a prediction has no matching test
        # file — presumably intentional, as that indicates an incomplete run.
        paired.append(
            dict(
                predictions_path=path,
                data_path=data_mapped[(size, rep)],
                size=size,
                featdir=feature_set_from_path(path),
            )
        )
    return paired


def _pair_by_basename(test_data_paths, predictions_path, predict_pattern):
    """Pair prediction files with test data by matching test-file basenames
    against '-'-separated components of the prediction path."""
    data_mapped = {}
    for fn in glob.glob(test_data_paths):
        bn = os.path.basename(fn).replace(".tsv.gz", "")
        data_mapped[bn] = fn

    predict_fns = exec_command(
        f"find {predictions_path} -name {predict_pattern}"
    ).splitlines()
    paired = []
    for path in predict_fns:
        components = path.split("-")
        for key in data_mapped:
            if key in components:
                paired.append(
                    dict(predictions_path=path, data_path=data_mapped[key])
                )
                break
    return paired


def performance(ctx, test_data_paths, predictions_path, output_path, label_col, overwrite, n_jobs, verbose):
    """batch classifier performance assessment"""
    # Snapshot kwargs to forward to each assessment run, dropping the ones
    # consumed at the batch level.
    args = locals()
    for consumed in (
        "ctx",
        "n_jobs",
        "test_data_paths",
        "predictions_path",
        "output_path",
    ):
        args.pop(consumed)

    predict_pattern = FILENAME_PATTERNS["predict"]
    # No glob means structured sample-data layout: pair by size/replicate
    # (carries size/featdir for the output path). A glob means ad-hoc test
    # files: pair by basename only.
    if "*" not in test_data_paths:
        paired = _pair_by_size_rep(test_data_paths, predictions_path, predict_pattern)
    else:
        paired = _pair_by_basename(test_data_paths, predictions_path, predict_pattern)

    arg_sets = []
    for pair in paired:
        arg_group = args.copy()
        # Non-destructive check replaces the original try/pop/except, which
        # could drop 'size' from the pair before the KeyError on 'featdir'.
        size = pair.pop("size", None)
        featdir = pair.pop("featdir", None)
        if size is not None and featdir is not None:
            arg_group["output_path"] = os.path.join(output_path, size, featdir)
        else:
            arg_group["output_path"] = output_path
        arg_group.update(pair)
        arg_sets.append(arg_group)

    if n_jobs > 1:
        parallel.use_multiprocessing(n_jobs)

    # Dispatch each assessment job to the single-run command; tqdm drains
    # the generator purely for progress display.
    gen = parallel.imap(lambda kwargs: ctx.invoke(mutori_performance, **kwargs), arg_sets)
    for _ in tqdm(gen, ncols=80, total=len(arg_sets)):
        pass