Example #1
def do_predict(test_file,
               costs_file,
               model_file,
               output_trec_run=None,
               output_eval=None,
               override_cutoffs=None):
    """Run prediction with a saved cascade"""
    test_data = load_data_file(test_file)
    costs, _ = load_costs_data(costs_file,
                               None,
                               n_features=test_data[0].shape[1])

    cascade = load_model(model_file)
    # the scaler is fit with copy=False elsewhere, so transform() updates
    # test_data[0] in place and the return value can be ignored
    if 'scaler' in cascade:
        cascade['scaler'].transform(test_data[0])

    if override_cutoffs:
        cutoffs = ast.literal_eval(override_cutoffs)
        logging.info('Override cutoffs with %s', cutoffs)

        new_stages = []
        for i, (prune, model) in enumerate(cascade['stages']):
            new_stages.append((Prune(rank=cutoffs[i]), model))
        cascade['stages'] = new_stages

    predict(cascade,
            test_data,
            costs,
            output_trec_run=output_trec_run,
            output_eval=output_eval)
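
A minimal invocation sketch (file paths are hypothetical): note that override_cutoffs is passed as a Python-literal string, since do_predict parses it with ast.literal_eval, and the list length must match the number of cascade stages.

do_predict('data/test.txt',
           'data/costs.txt',
           'models/cascade.pkl',
           output_trec_run='runs/cascade.run',
           override_cutoffs='[1000, 100, 10]')  # one cutoff per stage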
Example #2
def build_wlm11_cascade(train_file, validation_file, test_file, costs_file=None,
                        importance_file=None, model_prefix=None, **kwargs):
    """Train a cascade over a partition of disjoint feature sets."""
    train_data, valid_data, test_data = load_data(
        train_file, validation_file, test_file, scaler=MaxAbsScaler(copy=False))
    costs, importance = load_costs_data(
        costs_file, importance_file, n_features=train_data[0].shape[1])

    # NOTE: costs must stay untainted, so pass a copy to any function that may modify it
    cascade = train(train_data, valid_data, costs.copy(), importance.copy(), **kwargs)

    if model_prefix:
        save_model(cascade, model_prefix)
    predict(cascade, test_data, costs.copy())
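
The NOTE above is worth spelling out: downstream helpers may scale or reweight the cost vector in place, so only copies are handed off and the original stays valid for the final predict call. A self-contained illustration of the hazard, using a hypothetical mutating helper:

import numpy as np

def normalize_in_place(c):  # hypothetical helper that mutates its argument
    c /= c.sum()

costs = np.array([1.0, 2.0, 3.0])
normalize_in_place(costs.copy())  # original untouched
print(costs)                      # [1. 2. 3.]
normalize_in_place(costs)         # without the copy ...
print(costs)                      # [0.167 0.333 0.5] -- costs is clobbered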
Example #3
def info(model_file, costs_file=None):
    """Print parameters, features, and (optionally) total feature cost of a saved model."""
    bst = joblib.load(model_file)
    # XGBoost names features 'f0', 'f1', ...; strip the leading 'f' to recover the ids
    fids = sorted(int(k[1:]) for k in bst.get_fscore())

    print('params', vars(bst))
    if hasattr(bst, 'attributes'):
        print('attributes', bst.attributes())
    print('n_features', len(fids))
    print('feature list', fids)

    if costs_file:
        from core.cascade import load_costs_data
        costs, _ = load_costs_data(costs_file, None, max(fids) + 1)
        mask = np.zeros(costs.size, dtype=int)
        np.put(mask, fids, 1)
        print('cost %d' % np.dot(costs, mask))
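
The cost accounting at the end is just a masked dot product; a tiny standalone example with made-up numbers:

import numpy as np

costs = np.array([2.0, 5.0, 1.0, 4.0])  # per-feature evaluation cost
fids = [0, 2]                           # features the model actually uses
mask = np.zeros(costs.size, dtype=int)
np.put(mask, fids, 1)                   # mask = [1, 0, 1, 0]
print('cost %d' % np.dot(costs, mask))  # 2.0 + 1.0 -> prints "cost 3"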
Example #4
def do_predict(test_file, costs_file, model_file, output_trec_run=None, output_eval=None, train_file=None):
    """Run prediction with a saved cascade"""
    test_data = load_data_file(test_file)
    costs, _ = load_costs_data(costs_file, None, n_features=test_data[0].shape[1])

    cascade = load_model(model_file)

    # FIXME: scaler needs to be saved along the cascade
    if train_file:
        train_data = load_data_file(train_file)
        scaler = MaxAbsScaler(copy=False)
        scaler.fit(train_data[0])
        scaler.transform(test_data[0])
        logging.info('Data scaled')

    if 'scaler' in cascade:
        cascade['scaler'].transform(test_data[0])
    predict(cascade, test_data, costs,
            output_trec_run=output_trec_run, output_eval=output_eval)
Example #5
def do_retrain(model_type,
               train_file,
               validation_file,
               model_file,
               new_model_file,
               test_file=None,
               costs_file=None,
               random=0,
               up_to=0,
               learning_rate="0.1",
               subsample="0.5",
               trees="[5,10,50,100,500,1000]",
               nodes="[32]",
               output_trec_run=None,
               output_eval=None):
    """Retrain a tree-based cascade using features learned in the linear models"""
    train_data = load_data_file(train_file)
    valid_data = (None,) * 4  # (x, y, qid, docno) placeholder
    if validation_file:
        valid_data = load_data_file(validation_file)

    test_data = (None,) * 4  # (x, y, qid, docno) placeholder
    costs = None
    if test_file is not None and costs_file is not None:
        test_data = load_data_file(test_file)
        costs, _ = load_costs_data(costs_file,
                                   None,
                                   n_features=test_data[0].shape[1])

    cascade = load_model(model_file)
    if 'scaler' in cascade:
        cascade['scaler'].transform(train_data[0])
        if valid_data[0] is not None:
            cascade['scaler'].transform(valid_data[0])
        if test_data[0] is not None:
            cascade['scaler'].transform(test_data[0])

    if random > 0:
        # sample `random` (tree, node) configurations and evaluate each retrained cascade
        for _ in range(random):
            tree = 1 + np.random.randint(1000)
            node = np.random.choice([2, 4, 8, 16, 32, 64])
            print('tree %i, node %i' % (tree, node))
            new_cascade = cascade.copy()
            new_cascade['stages'] = retrain(
                model_type,
                cascade['stages'],
                train_data,
                valid_data,
                learning_rate=ast.literal_eval(learning_rate),
                subsample=ast.literal_eval(subsample),
                trees=[tree],
                nodes=[node],
                up_to=up_to)
            if test_data[0] is not None:
                predict(new_cascade,
                        test_data,
                        costs,
                        output_trec_run=output_trec_run,
                        output_eval=output_eval)
        return

    cascade['stages'] = retrain(model_type,
                                cascade['stages'],
                                train_data,
                                valid_data,
                                learning_rate=ast.literal_eval(learning_rate),
                                subsample=ast.literal_eval(subsample),
                                trees=ast.literal_eval(trees),
                                nodes=ast.literal_eval(nodes),
                                up_to=up_to)
    save_model(cascade, new_model_file)

    if test_data[0] is not None:
        predict(cascade,
                test_data,
                costs,
                output_trec_run=output_trec_run,
                output_eval=output_eval)
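
A hypothetical call, for illustration: every grid argument (learning_rate, subsample, trees, nodes) is a string that do_retrain feeds through ast.literal_eval, which is why the defaults above are quoted.

do_retrain('GBDT',
           'data/train.txt',
           'data/valid.txt',
           'models/linear_cascade.pkl',
           'models/tree_cascade.pkl',
           test_file='data/test.txt',
           costs_file='data/costs.txt',
           trees='[100, 500]',
           nodes='[32, 64]')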
Example #6
def do_train_budgeted_GBDT(train_file,
                           validation_file,
                           test_file,
                           costs_file=None,
                           importance_file=None,
                           model_prefix=None,
                           budget=None,
                           trees='[5, 10, 50, 100, 500, 1000]',
                           nodes='[32]'):
    """Train a 1-stage budgeted GBDT cascade"""

    train_data, valid_data, test_data = load_data(train_file, validation_file,
                                                  test_file)
    costs, importance = load_costs_data(costs_file,
                                        importance_file,
                                        n_features=train_data[0].shape[1])

    x_train, _, _ = train_data
    x_train = x_train.toarray()

    # not all features will be used in a full model: keep only those that
    # take a nonzero value somewhere in the training data
    all_fids = [i for i in range(x_train.shape[1]) if any(x_train[:, i])]

    budget = float(budget) if budget is not None else 0.0
    if budget:
        # keep the cheapest features whose cumulative cost stays within the budget
        c = costs[all_fids]
        c[c.argsort()] = c[c.argsort()].cumsum()
        fids = [fid for fid, b in zip(all_fids, c) if b <= budget]
    else:
        fids = all_fids

    used_features = np.array(fids)
    # used_features = np.flatnonzero(model.get_feature_mask())

    print('Train a budgeted GBDT with %i features' % used_features.size)

    _, y_train, _ = train_data
    class_weights = core.get_class_weights(y_train)

    params = {
        'max_depth': 7,
        'eta': 0.1,
        'silent': True,
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'subsample': 0.5
    }

    import GBDT
    new_model = TreeModel(model=GBDT.train(train_data,
                                           valid_data,
                                           core.get_score_multiclass,
                                           class_weights,
                                           params,
                                           trees=ast.literal_eval(trees),
                                           nodes=ast.literal_eval(nodes),
                                           set_classes=True,
                                           features=used_features),
                          score_function=core.get_score_multiclass,
                          class_weights=class_weights,
                          n_features=train_data[0].shape[1])

    cascade = {
        'stages': [(None, new_model)],
        'score_update': core.cascade.UpshiftUpdate(gap=0.1)
    }

    if model_prefix:
        save_model(cascade, model_prefix)
    predict(cascade, test_data, costs)
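
The budget filter above greedily keeps the cheapest features while their cumulative cost fits the budget. A standalone sketch of that selection with toy numbers:

import numpy as np

costs = np.array([3.0, 1.0, 4.0, 2.0])
all_fids = [0, 1, 2, 3]
budget = 6.0

c = costs[all_fids]                       # fancy indexing copies, so costs is safe
c[c.argsort()] = c[c.argsort()].cumsum()  # running total, cheapest first
fids = [fid for fid, b in zip(all_fids, c) if b <= budget]
print(fids)  # [0, 1, 3]: costs 1.0 + 2.0 + 3.0 = 6.0 fit the budget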
Example #7
def train_disjoint_cascade(partition_criteria,
                           train_file,
                           validation_file,
                           test_file,
                           costs_file=None,
                           importance_file=None,
                           model_prefix=None,
                           n_stages=3,
                           cutoffs=[None, 10, 5],
                           alpha=0.1,
                           epochs=10,
                           pairwise_transform=False,
                           GBDT_retraining=False):
    """Train a cascade over a partition of disjoint feature sets."""

    np.random.seed(0)  # freeze the randomness bit
    alphas = alpha if isinstance(alpha, list) else [alpha] * n_stages
    params = {'epochs': epochs, 'l1_ratio': 1.0, 'penalty': 'none'}

    scaler = MaxAbsScaler(copy=False)
    train_data, valid_data, test_data = load_data(train_file,
                                                  validation_file,
                                                  test_file,
                                                  scaler=scaler)
    costs, importance = load_costs_data(costs_file,
                                        importance_file,
                                        n_features=train_data[0].shape[1])

    # these options don't go well together (or at least I haven't figured out how to combine them)
    assert not (pairwise_transform and GBDT_retraining)

    # keep the original as GBDT won't work with polarized labels
    original_train_data = train_data

    # massage the data a bit ...
    x_train, y_train, qid_train, docno_train = train_data
    y_train = core.polarize(y_train)

    if pairwise_transform:
        from utils import per_query_transform_pairwise
        x_train, y_train = per_query_transform_pairwise(
            x_train.toarray(), y_train, qid_train)

    train_data = (x_train, y_train, qid_train, docno_train)

    is_qf = np.ones_like(costs)  # flag features that are constant within every query
    x = x_train.toarray()
    for j, _ in enumerate(costs):
        for a, b in group_offsets(qid_train):
            if (x[a:b, j] != x[a, j]).any():
                is_qf[j] = 0
                break

    # NOTE: costs must stay untainted, so pass a copy to any function that may modify it
    partitions = partition_criteria(n_stages, is_qf, costs.copy(), importance)

    stages = train(train_data,
                   valid_data,
                   costs.copy(),
                   importance,
                   n_stages,
                   cutoffs=cutoffs,
                   feature_partitions=partitions,
                   alphas=alphas,
                   **params)
    if GBDT_retraining:
        stages = retrain('GBDT',
                         stages,
                         original_train_data,
                         valid_data,
                         trees=[5, 10, 50, 100, 500, 1000],
                         nodes=[32])

    cascade = {
        'stages': stages,
        'scaler': scaler,
        'score_update': core.cascade.UpshiftUpdate(gap=0.1)
    }

    if model_prefix:
        save_model(cascade, model_prefix)
    predict(cascade, test_data, costs)
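
The is_qf scan above marks query-level features, i.e. features whose value is constant within every query. A self-contained version, with a stand-in for the library's group_offsets helper (assumed to yield the [start, end) row range of each query block):

import numpy as np

def group_offsets(qid):
    # stand-in: (start, end) ranges of contiguous qid blocks
    cuts = np.flatnonzero(np.diff(qid)) + 1
    return zip(np.r_[0, cuts], np.r_[cuts, len(qid)])

x = np.array([[1.0, 0.5],
              [1.0, 0.7],
              [2.0, 0.7]])  # 3 docs, 2 features
qid = np.array([101, 101, 102])

is_qf = np.ones(x.shape[1])
for j in range(x.shape[1]):
    for a, b in group_offsets(qid):
        if (x[a:b, j] != x[a, j]).any():
            is_qf[j] = 0  # varies within a query -> document-level feature
            break
print(is_qf)  # [1. 0.]: feature 0 is query-level, feature 1 is not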