Example #1
def predict(ctx, classifier_paths, test_data_paths, output_path, class_prior,
            overwrite, n_jobs):
    """batch testing of classifiers"""
    args = locals()
    args.pop('ctx')
    args.pop("n_jobs")
    args.pop("classifier_paths")
    args.pop("test_data_paths")

    class_pattern = FILENAME_PATTERNS["train"]
    classifier_fns = exec_command(
        f"find {classifier_paths} -name {class_pattern}")
    classifier_fns = classifier_fns.splitlines()

    if "*" in test_data_paths and len(classifier_fns) == 1:
        classifier_fns = classifier_fns[0]
        func = _get_predict_query_argsets
    else:
        func = _get_predict_test_argsets

    arg_sets = func(args, classifier_fns, test_data_paths, output_path,
                    overwrite)

    if n_jobs > 1:
        parallel.use_multiprocessing(n_jobs)

    total = len(arg_sets)
    gen = parallel.imap(lambda args: ctx.invoke(mutori_predict, **args),
                        arg_sets)
    for r in tqdm(gen, ncols=80, total=total):
        pass
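
The dispatch pattern above (collect keyword-argument sets, then map a sub-command over them in worker processes) can be shown with a minimal, self-contained sketch. It substitutes the standard library's multiprocessing.Pool and a plain picklable worker for the project's parallel module and ctx.invoke(mutori_predict, ...); the run_predict worker and its demo arguments are hypothetical.

from multiprocessing import Pool

from tqdm import tqdm


def run_predict(kwargs):
    # hypothetical stand-in for ctx.invoke(mutori_predict, **kwargs)
    return f"predicted {kwargs['data_path']} with {kwargs['classifier_path']}"


def dispatch(arg_sets, n_jobs=1):
    # fan the arg sets out across n_jobs workers, mirroring the loop above
    if n_jobs > 1:
        with Pool(n_jobs) as pool:
            return list(tqdm(pool.imap(run_predict, arg_sets),
                             ncols=80, total=len(arg_sets)))
    return [run_predict(a) for a in tqdm(arg_sets, ncols=80)]


if __name__ == "__main__":
    demo = [dict(classifier_path="clf.pkl", data_path=f"test-{i}.tsv.gz")
            for i in range(4)]
    dispatch(demo, n_jobs=2)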
Example #2
def get_train_kwarg_sets(training_path, output_path, max_flank, flank_sizes,
                         model_range, usegc, proximal, args):
    """standadrised generation of kwargs for train algorithms"""
    get_dims = {
        'upto1': MakeDims(1, 1),
        'upto2': MakeDims(1, 2),
        'upto3': MakeDims(1, 3),
        'FS': MakeDims(None, None)
    }[model_range]
    start_flank = {'FS': 2}.get(model_range, 0)
    parameterisations = flank_dim_combinations(max_flank=max_flank,
                                               start_flank=start_flank,
                                               flank_sizes=flank_sizes,
                                               get_dims=get_dims)

    # find all the training data
    train_pattern = FILENAME_PATTERNS["sample_data"]["train"]
    cmnd = f'find {training_path} -name "{train_pattern}"'
    train_paths = exec_command(cmnd)
    train_paths = train_paths.splitlines()

    # we want to process smallest to largest samples
    train_paths.sort(key=sample_size_from_path)

    other_features = dict(usegc=usegc, proximal=proximal)
    arg_sets = []
    for train_path in train_paths:
        data_size = sample_size_from_path(train_path) // 1000
        data_size = f"{data_size}k"
        for params in parameterisations:
            params = params.copy()
            params.update(other_features)
            params.update(args)
            dim = params.get("feature_dim")
            flank_size = params["flank_size"]
            if (dim is None or dim < 2 or flank_size < 2
                    or dim == flank_size * 2):
                # proximal only sensible when dim >= 2, flank_size > 1
                # and dim < 2 * flank_size
                params["proximal"] = False

            params['training_path'] = train_path
            params['output_path'] = os.path.join(output_path, data_size,
                                                 dirname_from_features(params))
            arg_sets.append(params)

    return arg_sets
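
flank_dim_combinations and MakeDims are project helpers not shown here; the sketch below is a plausible, self-contained reading of the grid they expand to. Only the output keys (flank_size, feature_dim) are taken from the code above; the MakeDims field names and the expansion rule are assumptions.

from collections import namedtuple

# assumed shape: lower/upper bounds on the feature dimension, None = "full"
MakeDims = namedtuple("MakeDims", ["first", "last"])


def flank_dim_combinations(max_flank, start_flank, flank_sizes, get_dims):
    flanks = flank_sizes or list(range(start_flank + 1, max_flank + 1))
    combos = []
    for flank_size in flanks:
        low = get_dims.first or 1
        high = get_dims.last or flank_size * 2
        for dim in range(low, high + 1):
            combos.append(dict(flank_size=flank_size, feature_dim=dim))
    return combos


print(flank_dim_combinations(max_flank=3, start_flank=0, flank_sizes=None,
                             get_dims=MakeDims(1, 2)))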
Example #3
def _get_predict_query_argsets(args, classifier_fn, test_data_paths,
                               output_path, overwrite):
    """returns argsets for case where single classifier and multiple queries"""
    dirname = os.path.dirname(test_data_paths)
    data_pattern = os.path.basename(test_data_paths)
    cmnd = f"find {dirname} -name {data_pattern}"
    data_fns = exec_command(cmnd)
    # one data file path per line
    data_fns = data_fns.splitlines()

    # using a single classifier on multiple data files
    arg_sets = []
    for path in data_fns:
        arg_group = args.copy()
        arg_group['classifier_path'] = classifier_fn
        arg_group['output_path'] = output_path
        arg_group['data_path'] = path
        arg_sets.append(arg_group)
    return arg_sets
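
For reference, the shelled-out find call here can also be done with the standard library alone; a small sketch, assuming test_data_paths carries its wildcard in the basename (e.g. "/data/queries/*.tsv.gz"):

import glob
import os


def find_query_files(test_data_paths):
    # pure-Python equivalent of `find dirname -name pattern`, except that
    # glob does not recurse into subdirectories
    dirname = os.path.dirname(test_data_paths)
    data_pattern = os.path.basename(test_data_paths)
    return sorted(glob.glob(os.path.join(dirname, data_pattern)))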
Example #4
def _get_predict_test_argsets(args, classifier_fns, test_data_paths,
                              output_path, overwrite):
    """returns argsets for case where number of classifier fns match number of
    data fns"""
    test_pattern = FILENAME_PATTERNS["sample_data"]["test"]
    data_fns = exec_command(f"find {test_data_paths} -name {test_pattern}")
    # map (sample size, replicate) -> data file path
    data_fns = data_fns.splitlines()
    data_mapped = {}
    for path in data_fns:
        size = sample_size_from_path(path)
        size = f"{size // 1000}k"
        rep = data_rep_from_path("sample_data", path)
        data_mapped[(size, rep)] = path

    if isinstance(classifier_fns, str):
        classifier_fns = classifier_fns.splitlines()

    paired = []
    for path in classifier_fns:
        size = sample_size_from_path(path)
        size = f"{size // 1000}k"
        rep = data_rep_from_path("train", path)
        featdir = feature_set_from_path(path)
        paired.append(
            dict(classifier_path=path,
                 data_path=data_mapped[(size, rep)],
                 size=size,
                 featdir=featdir))
    arg_sets = []
    for pair in paired:
        arg_group = args.copy()
        size = pair.pop('size')
        featdir = pair.pop('featdir')
        arg_group.update(pair)
        arg_group['output_path'] = os.path.join(output_path, size, featdir)
        arg_sets.append(arg_group)
    return arg_sets
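
At its core the pairing step is an inner join of the classifier and test-data file lists on a shared (size, replicate) key; a generic sketch of that join, where key_of is a hypothetical stand-in for the sample_size_from_path / data_rep_from_path pair used above:

def pair_by_key(classifier_fns, data_fns, key_of):
    # index the data files by key, then look each classifier up by its key
    data_mapped = {key_of(path): path for path in data_fns}
    return [dict(classifier_path=path, data_path=data_mapped[key_of(path)])
            for path in classifier_fns
            if key_of(path) in data_mapped]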
Example #5
def collate(base_path, output_path, exclude_paths, overwrite):
    """collates all classifier performance stats and writes
    to a single tsv file"""
    LOGGER.log_args()
    outpath = os.path.join(output_path, "collated.tsv.gz")
    logfile_path = os.path.join(output_path, "collated.log")
    if os.path.exists(outpath) and not overwrite:
        click.secho(f"Skipping. {outpath} exists. "
                    "Use overwrite to force.",
                    fg='green')
        exit(0)

    stat_fns = exec_command(f'find {base_path} -name "*performance.json*"')
    stat_fns = stat_fns.splitlines()
    if not stat_fns:
        msg = f'No files matching "*performance.json*" in {base_path}'
        click.secho(msg, fg='red')
        return

    LOGGER.log_file_path = logfile_path

    records = []
    keys = set()
    exclude_paths = [] if exclude_paths is None else exclude_paths.split(',')
    num_skipped = 0
    for fn in tqdm(stat_fns, ncols=80):
        if skip_path(exclude_paths, fn):
            num_skipped += 1
            LOGGER.log_message(fn, label="SKIPPED FILE")
            continue

        LOGGER.input_file(fn)
        data = load_json(fn)
        labels = data['classification_report']['labels']
        fscores = data['classification_report']['f-score']
        row = {
            "stat_path": fn,
            "classifier_path": data["classifier_path"],
            "auc": data["auc"],
            "algorithm": data["classifier_label"],
            "mean_precision": data["mean_precision"],
            f"fscore({labels[0]})": fscores[0],
            f"fscore({labels[1]})": fscores[1],
            'balanced_accuracy': data['balanced_accuracy']
        }
        row.update(data["feature_params"])
        keys.update(row.keys())
        records.append(row)

    columns = sorted(keys)
    rows = list(map(lambda r: [r.get(c, None) for c in columns], records))
    table = LoadTable(header=columns, rows=rows)
    table = table.sorted(reverse="auc")
    table = table.with_new_column(
        "name",
        lambda x: model_name_from_features(*x),
        columns=["flank_size", "feature_dim", "usegc", "proximal"])
    table = table.with_new_column("size",
                                  sample_size_from_path,
                                  columns="classifier_path")
    table.write(outpath)
    LOGGER.output_file(outpath)

    # make summary statistics via grouping by factors
    factors = [
        "algorithm", "name", "flank_size", "feature_dim", "proximal", "usegc",
        "size"
    ]
    summary = summary_stat_table(table, factors=factors)
    outpath = os.path.join(output_path, "summary_statistics.tsv.gz")
    summary.write(outpath)
    LOGGER.output_file(outpath)
    if num_skipped:
        click.secho("Skipped %d files that matched exclude_paths" %
                    num_skipped,
                    fg='red')
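
The collation itself can be sketched with the standard library only. The JSON field names ("auc", "feature_params") are taken from the code above; the gzip TSV writing with a unified header mirrors the intent of the LoadTable step rather than its exact output.

import csv
import gzip
import json


def collate_performance(stat_fns, outpath):
    # gather rows with a ragged set of keys, then write one TSV whose
    # header is the union of all keys seen
    records, keys = [], set()
    for fn in stat_fns:
        with open(fn) as infile:
            data = json.load(infile)
        row = {"stat_path": fn, "auc": data["auc"]}
        row.update(data.get("feature_params", {}))
        keys.update(row)
        records.append(row)

    columns = sorted(keys)
    with gzip.open(outpath, "wt", newline="") as out:
        writer = csv.DictWriter(out, fieldnames=columns, delimiter="\t")
        writer.writeheader()
        writer.writerows(records)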
Example #6
def performance(ctx, test_data_paths, predictions_path, output_path, label_col,
                overwrite, n_jobs, verbose):
    """batch classifier performance assessment"""
    args = locals()
    args.pop('ctx')
    args.pop("n_jobs")
    args.pop("test_data_paths")
    args.pop("predictions_path")
    args.pop("output_path")

    predict_pattern = FILENAME_PATTERNS["predict"]
    if '*' not in test_data_paths:
        test_pattern = FILENAME_PATTERNS["sample_data"]["test"]
        test_fns = exec_command(f"find {test_data_paths} -name {test_pattern}")
        data_fns = test_fns.splitlines()

        data_mapped = {}
        for path in data_fns:
            size = sample_size_from_path(path)
            size = f"{size // 1000}k"
            rep = data_rep_from_path("sample_data", path)
            data_mapped[(size, rep)] = path

        predict_fns = exec_command(f'find {predictions_path} -name'
                                   f' {predict_pattern}')
        predict_fns = predict_fns.splitlines()
        paired = []
        for path in predict_fns:
            paths = dict(predictions_path=path)
            size = sample_size_from_path(path)
            size = f"{size // 1000}k"
            rep = data_rep_from_path("train", path)
            featdir = feature_set_from_path(path)
            paths.update(
                dict(data_path=data_mapped[(size, rep)],
                     size=size,
                     featdir=featdir))
            paired.append(paths)
    else:
        data_fns = glob.glob(test_data_paths)
        data_mapped = {}
        for fn in data_fns:
            bn = os.path.basename(fn).replace(".tsv.gz", "")
            data_mapped[bn] = fn

        predict_fns = exec_command(f'find {predictions_path} -name'
                                   f' {predict_pattern}')
        predict_fns = predict_fns.splitlines()
        paired = []
        for path in predict_fns:
            components = path.split('-')
            for key in data_mapped:
                if key in components:
                    paired.append(
                        dict(predictions_path=path,
                             data_path=data_mapped[key]))
                    break

    arg_sets = []
    for pair in paired:
        arg_group = args.copy()
        try:
            size = pair.pop('size')
            featdir = pair.pop('featdir')
            arg_group['output_path'] = os.path.join(output_path, size, featdir)
        except KeyError:
            arg_group['output_path'] = output_path
        arg_group.update(pair)
        arg_sets.append(arg_group)

    if n_jobs > 1:
        parallel.use_multiprocessing(n_jobs)

    total = len(arg_sets)
    gen = parallel.imap(lambda args: ctx.invoke(mutori_performance, **args),
                        arg_sets)
    for r in tqdm(gen, ncols=80, total=total):
        pass
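
The glob branch above pairs each predictions file with a test-data file by checking whether the test file's basename occurs among the hyphen-separated components of the predictions path; a standalone demonstration of that matching rule, with illustrative paths only:

def match_predictions(predict_fns, data_mapped):
    paired = []
    for path in predict_fns:
        components = path.split("-")
        for key in data_mapped:
            if key in components:
                paired.append(dict(predictions_path=path,
                                   data_path=data_mapped[key]))
                break
    return paired


data_mapped = {"queryA": "/data/queryA.tsv.gz"}
print(match_predictions(["/out/lr-queryA-predicted.tsv.gz"], data_mapped))
# -> [{'predictions_path': '/out/lr-queryA-predicted.tsv.gz',
#      'data_path': '/data/queryA.tsv.gz'}]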