Example #1
def lr_train(training_path, output_path, label_col, seed, scoring, flank_size,
             feature_dim, proximal, usegc, c_values, penalty_options, n_jobs,
             overwrite, verbose):
    """logistic regression training, validation, dumps optimal model"""
    if not seed:
        seed = int(time.time())

    np_seed(seed)
    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])

    os.makedirs(output_path, exist_ok=True)

    basename = get_basename(training_path)
    outpath = os.path.join(output_path, f"{basename}-classifier-lr.pkl.gz")
    if os.path.exists(outpath) and not overwrite:
        if verbose > 1:
            click.secho(
                f"Skipping. {outpath} exists. "
                "use overwrite to force.",
                fg='green')
        return

    logfile_path = os.path.join(output_path,
                                f"logs/{basename}-training-lr.log")
    LOGGER.log_file_path = logfile_path
    LOGGER.input_file(training_path)

    start_time = time.time()
    _, resp, feat, n_dims, names = data_to_numeric(training_path, label_col,
                                                   flank_size, feature_dim,
                                                   proximal, usegc)

    if usegc:
        # we need to scale the data
        scaler = get_scaler(feat)
        feat = scaler.transform(feat)
    classifier = logistic_regression(feat, resp, seed, scoring, c_values,
                                     penalty_options.split(","), n_jobs)
    betas = dict(zip(names, classifier.best_estimator_.coef_.tolist()[0]))
    result = dict(classifier=classifier.best_estimator_,
                  betas=betas,
                  scoring=scoring)
    result['feature_params'] = dict(feature_dim=feature_dim,
                                    flank_size=flank_size,
                                    proximal=proximal,
                                    usegc=usegc)
    if usegc:
        result['scaler'] = scaler

    with open(outpath, 'wb') as clf_file:
        pickle.dump(result, clf_file)

    LOGGER.output_file(outpath)
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
    LOGGER.shutdown()
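For orientation, a minimal, hypothetical sketch of reading the dumped result back. The key names match the dict pickled above; the helper name and the plain-pickle assumption are illustrative, not part of the original module.

import pickle

def load_lr_result(path):
    """Hypothetical helper: read back the dict dumped by lr_train above."""
    with open(path, 'rb') as clf_file:  # assumes plain pickle, matching the dump above
        result = pickle.load(clf_file)
    # keys written by lr_train: classifier, betas, scoring, feature_params,
    # plus a fitted scaler when usegc was set
    return result['classifier'], result['feature_params'], result.get('scaler')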
Example #2
def xgboost_train(training_path, output_path, label_col, seed, flank_size,
                  feature_dim, proximal, usegc, strategy, n_jobs, overwrite,
                  verbose):
    """Naive Bayes training, validation, dumps optimal model"""
    if not seed:
        seed = int(time.time())

    np_seed(seed)
    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])
    os.makedirs(output_path, exist_ok=True)

    basename = get_basename(training_path)
    outpath = os.path.join(output_path, f"{basename}-classifier-xgb.pkl.gz")
    logfile_path = os.path.join(output_path,
                                f"logs/{basename}-training-xgb.log")
    if os.path.exists(outpath) and not overwrite:
        if verbose > 1:
            click.secho(
                f"Skipping. {outpath} exists. "
                "use overwrite to force.",
                fg='green')
        return

    LOGGER.log_file_path = logfile_path
    LOGGER.input_file(training_path)
    start_time = time.time()
    _, resp, feat, n_dims, names = data_to_numeric(training_path, label_col,
                                                   flank_size, feature_dim,
                                                   proximal, usegc)

    # recode the response: map -1 labels to 0 so all class labels are non-negative
    resp = [v if v > 0 else 0 for v in resp]

    if usegc:
        # we need to scale the data
        scaler = get_scaler(feat)
        feat = scaler.transform(feat)

    classifier = xgboost(feat, resp, seed, strategy, n_jobs, verbose)
    result = dict(classifier=classifier)
    result['feature_params'] = dict(feature_dim=feature_dim,
                                    flank_size=flank_size,
                                    proximal=proximal,
                                    usegc=usegc)
    if usegc:
        result['scaler'] = scaler

    with open(outpath, 'wb') as clf_file:
        pickle.dump(result, clf_file)

    LOGGER.output_file(outpath)
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
    LOGGER.shutdown()
Example #3
def ocs_train(training_path, output_path, label_col, seed, flank_size,
              feature_dim, proximal, usegc, overwrite, verbose):
    """one-class svm training for outlier detection"""
    if seed is None:
        seed = int(time.time())
    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])
    start_time = time.time()
    os.makedirs(output_path, exist_ok=True)

    basename = get_basename(training_path)
    outpath = os.path.join(output_path, f"{basename}-classifier-ocs.pkl.gz")
    logfile_path = os.path.join(output_path,
                                f"logs/{basename}-training-ocs.log")
    if os.path.exists(outpath) and not overwrite:
        if verbose > 1:
            click.secho(
                f"Skipping. {outpath} exists. "
                "use overwrite to force.",
                fg='green')
        return

    LOGGER.log_file_path = logfile_path
    LOGGER.input_file(training_path)

    start_time = time.time()
    _, _, feat, n_dims, names = data_to_numeric(training_path,
                                                label_col,
                                                flank_size,
                                                feature_dim,
                                                proximal,
                                                usegc=usegc,
                                                one_class='g')

    classifier = one_class_svm(feat, seed)
    result = dict(classifier=classifier)
    result['feature_params'] = dict(feature_dim=feature_dim,
                                    flank_size=flank_size,
                                    proximal=proximal,
                                    usegc=usegc)

    with open(outpath, 'wb') as clf_file:
        pickle.dump(result, clf_file)

    LOGGER.output_file(outpath)
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
    LOGGER.shutdown()
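A short usage sketch, under the assumption that one_class_svm returns a fitted scikit-learn OneClassSVM, whose predict method labels inliers as 1 and outliers as -1. The helper name is hypothetical.

def flag_outliers(classifier, feat):
    """Hypothetical helper: True for rows the one-class SVM flags as outliers."""
    labels = classifier.predict(feat)  # OneClassSVM convention: 1 = inlier, -1 = outlier
    return [label == -1 for label in labels]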
Example #4
def performance(data_path, predictions_path, output_path, label_col, overwrite,
                verbose):
    """produce measures of classifier performance"""
    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])
    if not (data_path or predictions_path):
        click.secho("Need data sets!", fg="red")
        exit()

    basename_train = get_basename(data_path)
    basename_pred = get_basename(predictions_path)
    basename = f"{basename_train}-{basename_pred}"
    outpath = os.path.join(output_path, f"{basename}-performance.json.gz")
    logfile_path = os.path.join(output_path,
                                f"logs/{basename}-performance.log")
    if os.path.exists(outpath) and not overwrite:
        if verbose > 1:
            click.secho(
                f"Skipping. {outpath} exists. "
                "Use overwrite to force.",
                fg='green')
        return

    LOGGER.log_file_path = logfile_path

    LOGGER.input_file(data_path)
    LOGGER.input_file(predictions_path)
    orig = pandas.read_csv(data_path, sep="\t")
    predicted, feature_params, classifier_path, label =\
        load_predictions(predictions_path)
    result = measure_performance(orig, predicted, label_col)
    result["feature_params"] = feature_params
    result["classifier_path"] = classifier_path
    result["classifier_label"] = label
    dump_json(outpath, result)
    LOGGER.shutdown()
Example #5
def predict(classifier_path, data_path, output_path, label_col, class_prior,
            overwrite, verbose):
    """predict labels for data"""
    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])
    classifier, feature_params, scaler = load_classifier(classifier_path)
    class_label = get_classifier_label(classifier)
    if class_prior is not None and class_label == 'lr':
        # https://stats.stackexchange.com/questions/117592/logistic-regression-prior-correction-at-test-time
        # based on above and King and Zeng, we adjust the intercept term such
        # that it is incremented by ln(p(1) / p(-1)) where p(1) is the prior
        # of a 1 label, p(-1)=1-p(1)
        class_labels = list(class_prior)
        encoded = transform_response(class_labels)
        ordered = sorted(zip(encoded, class_labels))
        if 'e' in ordered[0]:
            adj = log(class_prior['g'] / class_prior['e'])
        else:
            adj = log(class_prior['e'] / class_prior['g'])

        classifier.intercept_ += adj

    basename_class = get_basename(classifier_path)
    basename_data = get_basename(data_path)
    basename = f"{basename_class}-{basename_data}"
    outpath = os.path.join(output_path,
                           f"{basename}-predicted-{class_label}.json.gz")
    os.makedirs(output_path, exist_ok=True)
    logfile_path = os.path.join(output_path,
                                f"logs/{basename}-predict-{class_label}.log")
    if os.path.exists(outpath) and not overwrite:
        if verbose > 1:
            click.secho(
                f"Skipping. {outpath} exists. "
                "use overwrite to force.",
                fg='green')
        return

    LOGGER.log_file_path = logfile_path
    LOGGER.input_file(classifier_path)
    LOGGER.input_file(data_path)

    start_time = time.time()
    # for NB and XGBoost, use predict_proba in place of decision_function
    if class_label in ("nb", "xgb"):
        classifier.decision_function = classifier.predict_proba

    fulldata = pandas.read_csv(data_path, sep='\t')

    result = {}
    result['feature_params'] = feature_params
    result['classifier_label'] = class_label
    result['classifier_path'] = classifier_path
    result['predictions'] = defaultdict(list)
    total = fulldata.shape[0] // 2000
    pbar = tqdm(iter_indices(fulldata.shape[0], block_size=2000),
                ncols=80,
                total=total)
    for indices in pbar:
        data = fulldata.iloc[indices]
        ids, resp, feat, n_dims, names = data_to_numeric(data,
                                                         label_col=label_col,
                                                         **feature_params)
        if scaler:
            feat = scaler.transform(feat)

        predictions, scores = predict_origin(classifier, feat)
        if class_label in ("nb", "xgb"):
            # each `score` row holds the probability of belonging to either class;
            # keep only the probability column at index 1
            scores = scores[:, 1].tolist()
        elif class_label == 'ocs':
            scores = scores[:, 0].tolist()

        predictions = inverse_transform_response(predictions)
        result['predictions']['varid'].extend(list(ids))
        result['predictions']['predicted'].extend(list(predictions))
        result['predictions']['scores'].extend(list(scores))

    dump_json(outpath, result)
    LOGGER.output_file(outpath)
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
    LOGGER.shutdown()
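To make the intercept correction in the example above concrete, a small worked sketch with made-up priors; the real values come from the class_prior option, and the branch chosen mirrors the one in predict.

from math import log

# Made-up priors purely for illustration.
class_prior = {'e': 0.25, 'g': 0.75}
# When 'e' is the first encoded label, predict uses ln(p(g) / p(e));
# otherwise it uses ln(p(e) / p(g)).
adj = log(class_prior['g'] / class_prior['e'])  # ln(0.75 / 0.25) ~ 1.0986
# classifier.intercept_ += adj  # shifts the logistic regression intercept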
Example #6
def nb_train(training_path, output_path, label_col, seed, scoring, flank_size,
             feature_dim, proximal, usegc, alpha_options, class_prior, n_jobs,
             overwrite, verbose):
    """Naive Bayes training, validation, dumps optimal model"""
    if not seed:
        seed = int(time.time())

    np_seed(seed)
    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])
    os.makedirs(output_path, exist_ok=True)

    basename = get_basename(training_path)
    outpath = os.path.join(output_path, f"{basename}-classifier-nb.pkl.gz")
    logfile_path = os.path.join(output_path,
                                f"logs/{basename}-training-nb.log")
    if os.path.exists(outpath) and not overwrite:
        if verbose > 1:
            click.secho(
                f"Skipping. {outpath} exists. "
                "use overwrite to force.",
                fg='green')
        return

    LOGGER.log_file_path = logfile_path
    LOGGER.input_file(training_path)

    start_time = time.time()
    if class_prior is not None:
        class_labels = list(class_prior)
        encoded = transform_response(class_labels)
        ordered = sorted(zip(encoded, class_labels))
        class_prior = [class_prior[l] for _, l in ordered]

    _, resp, feat, n_dims, names = data_to_numeric(training_path, label_col,
                                                   flank_size, feature_dim,
                                                   proximal, usegc)

    if usegc:
        # we need to scale the data
        scaler = get_scaler(feat)
        feat = scaler.transform(feat)
    classifier = naive_bayes(feat,
                             resp,
                             seed,
                             alpha_options,
                             scoring,
                             class_prior=class_prior,
                             n_jobs=n_jobs)
    betas = dict(zip(names, classifier.best_estimator_.coef_.tolist()[0]))
    result = dict(classifier=classifier.best_estimator_,
                  betas=betas,
                  scoring=scoring)
    result['feature_params'] = dict(feature_dim=feature_dim,
                                    flank_size=flank_size,
                                    proximal=proximal,
                                    usegc=usegc)
    if usegc:
        result['scaler'] = scaler

    with open_(outpath, 'wb') as clf_file:
        pickle.dump(result, clf_file)

    LOGGER.output_file(outpath)
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
    LOGGER.shutdown()
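For clarity, a minimal illustration of the class_prior reordering above, with a hypothetical encoding standing in for transform_response; the real codes come from that function.

# Hypothetical values purely for illustration.
class_prior = {'e': 0.2, 'g': 0.8}
class_labels = list(class_prior)                      # ['e', 'g']
encoded = [-1, 1]                                     # stand-in for transform_response(class_labels)
ordered = sorted(zip(encoded, class_labels))          # [(-1, 'e'), (1, 'g')]
prior_vector = [class_prior[l] for _, l in ordered]   # [0.2, 0.8], passed on as class_prior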