コード例 #1
0
ファイル: WLM11Cascade.py プロジェクト: kiminh/LTR_Cascade
def build_wlm11_cascade(train_file, validation_file, test_file, costs_file=None,
                        importance_file=None, model_prefix=None, **kwargs):
    """Train a WLM11-style cascade and evaluate it on the test split.

    Loads the three data splits (max-abs scaled in place), the per-feature
    costs and importance weights, trains the cascade, optionally saves it
    under ``model_prefix``, and runs prediction on the test data.
    """
    datasets = load_data(train_file, validation_file, test_file,
                         scaler=MaxAbsScaler(copy=False))
    train_data, valid_data, test_data = datasets
    costs, importance = load_costs_data(
        costs_file, importance_file, n_features=train_data[0].shape[1])

    # Hand out copies only: the cost vector must stay untainted across calls.
    cascade = train(train_data, valid_data, costs.copy(), importance.copy(),
                    **kwargs)

    if model_prefix:
        save_model(cascade, model_prefix)
    predict(cascade, test_data, costs.copy())
コード例 #2
0
ファイル: GBDT.py プロジェクト: kiminh/LTR_Cascade
def train_tree_ranker(train_file,
                      validation_file,
                      test_file,
                      model_prefix,
                      score_function,
                      params,
                      trees,
                      nodes,
                      set_classes=False,
                      add_original_order=False):
    """Train a GBDT ranker, optionally save it, and predict on the test split.

    When ``add_original_order`` is set, the original document order is
    injected as an extra feature into every split that has data.
    """
    train_data, valid_data, test_data = load_data(train_file, validation_file,
                                                  test_file)

    if add_original_order:
        # FIXME: quick hack
        logging.info('The original-order hack is applied to all data')

        def _with_order(data):
            # Rebuild the (x, y, qid) triple with the order feature appended.
            return (add_original_order_as_feature(data), data[1], data[2])

        train_data = _with_order(train_data)
        if valid_data[0] is not None:
            valid_data = _with_order(valid_data)
        if test_data[0] is not None:
            test_data = _with_order(test_data)

    class_weights = core.get_class_weights(train_data[1])
    model = train(train_data,
                  valid_data,
                  score_function,
                  class_weights,
                  params,
                  trees,
                  nodes,
                  set_classes=set_classes)
    if model_prefix:
        save_model(model, model_prefix)
    predict(model, test_data, score_function, class_weights)
コード例 #3
0
def do_retrain(model_type,
               train_file,
               validation_file,
               model_file,
               new_model_file,
               test_file=None,
               costs_file=None,
               random=0,
               up_to=0,
               learning_rate="0.1",
               subsample="0.5",
               trees="[5,10,50,100,500,1000]",
               nodes="[32]",
               output_trec_run=None,
               output_eval=None):
    """Retrain a tree-based cascade using features learned in the linear models"""
    train_data = load_data_file(train_file)
    valid_data = load_data_file(validation_file) if validation_file else (None, ) * 4

    # Test data and costs are only loaded when both files are supplied;
    # otherwise prediction is skipped entirely below.
    test_data = (None, ) * 4
    costs = None
    if test_file is not None and costs_file is not None:
        test_data = load_data_file(test_file)
        costs, _ = load_costs_data(costs_file,
                                   None,
                                   n_features=test_data[0].shape[1])

    cascade = load_model(model_file)
    if 'scaler' in cascade:
        # Re-apply the scaler the cascade was trained with (in-place transform).
        scaler = cascade['scaler']
        scaler.transform(train_data[0])
        for split in (valid_data, test_data):
            if split[0] is not None:
                scaler.transform(split[0])

    # The hyperparameters arrive as strings (CLI-friendly); parse them once.
    lr = ast.literal_eval(learning_rate)
    ss = ast.literal_eval(subsample)

    if random > 0:
        # Random search: sample `random` (tree, node) configurations.
        for _ in range(random):
            tree = 1 + np.random.randint(1000)
            node = np.random.choice([2, 4, 8, 16, 32, 64])
            print('tree %i, node %i' % (tree, node))
            new_cascade = cascade.copy()
            new_cascade['stages'] = retrain(model_type,
                                            cascade['stages'],
                                            train_data,
                                            valid_data,
                                            learning_rate=lr,
                                            subsample=ss,
                                            trees=[tree],
                                            nodes=[node],
                                            up_to=up_to)
            if test_data[0] is not None:
                predict(new_cascade,
                        test_data,
                        costs,
                        output_trec_run=output_trec_run,
                        output_eval=output_eval)
        return

    cascade['stages'] = retrain(model_type,
                                cascade['stages'],
                                train_data,
                                valid_data,
                                learning_rate=lr,
                                subsample=ss,
                                trees=ast.literal_eval(trees),
                                nodes=ast.literal_eval(nodes),
                                up_to=up_to)
    save_model(cascade, new_model_file)

    if test_data[0] is not None:
        predict(cascade,
                test_data,
                costs,
                output_trec_run=output_trec_run,
                output_eval=output_eval)
コード例 #4
0
def do_train_budgeted_GBDT(train_file,
                           validation_file,
                           test_file,
                           costs_file=None,
                           importance_file=None,
                           model_prefix=None,
                           budget=None,
                           trees='[5, 10, 50, 100, 500, 1000]',
                           nodes='[32]'):
    """Train a 1-stage budgeted GBDT cascade.

    Features are selected greedily in increasing cost order until the
    cumulative cost exceeds ``budget``; with no budget, all non-empty
    features are used.  ``trees``/``nodes`` are string-encoded lists
    (CLI-friendly) parsed with ``ast.literal_eval``.
    """
    train_data, valid_data, test_data = load_data(train_file, validation_file,
                                                  test_file)
    costs, importance = load_costs_data(costs_file,
                                        importance_file,
                                        n_features=train_data[0].shape[1])

    x_train, _, _ = train_data
    x_train = x_train.toarray()

    # not all features will be used in a full model
    all_fids = [i for i in range(x_train.shape[1]) if any(x_train[:, i])]

    # FIX: float(None) raises TypeError when budget is left at its default;
    # treat a missing budget as "no budget" (use every non-empty feature).
    budget = float(budget) if budget is not None else 0.0
    if budget:
        # Fancy indexing copies, so `costs` itself is left untouched.
        c = costs[all_fids]
        order = c.argsort()
        c[order] = c[order].cumsum()  # c now holds cumulative cost per feature
        fids = [fid for fid, cum_cost in zip(all_fids, c) if cum_cost <= budget]
    else:
        fids = all_fids

    used_features = np.array(fids)
    # used_features = np.flatnonzero(model.get_feature_mask())

    print('Train a budgeted GBDT with %i features' % used_features.size)

    _, y_train, _ = train_data
    class_weights = core.get_class_weights(y_train)

    params = {
        'max_depth': 7,
        'eta': 0.1,
        'silent': True,
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'subsample': 0.5
    }

    import GBDT
    new_model = TreeModel(model=GBDT.train(train_data,
                                           valid_data,
                                           core.get_score_multiclass,
                                           class_weights,
                                           params,
                                           trees=ast.literal_eval(trees),
                                           nodes=ast.literal_eval(nodes),
                                           set_classes=True,
                                           features=used_features),
                          score_function=core.get_score_multiclass,
                          class_weights=class_weights,
                          n_features=train_data[0].shape[1])

    # Single-stage cascade: no pruning cutoff (None) before the tree model.
    cascade = {
        'stages': [(None, new_model)],
        'score_update': core.cascade.UpshiftUpdate(gap=0.1)
    }

    if model_prefix:
        save_model(cascade, model_prefix)
    predict(cascade, test_data, costs)
コード例 #5
0
def train_disjoint_cascade(partition_criteria,
                           train_file,
                           validation_file,
                           test_file,
                           costs_file=None,
                           importance_file=None,
                           model_prefix=None,
                           n_stages=3,
                           cutoffs=None,
                           alpha=0.1,
                           epochs=10,
                           pairwise_transform=False,
                           GBDT_retraining=False):
    """Train a cascade over a partition of disjoint feature sets.

    ``partition_criteria`` assigns features to stages; per-stage linear
    models are trained and optionally retrained with GBDT, then the
    cascade is saved (if ``model_prefix``) and evaluated on the test split.
    """
    # FIX: avoid a mutable default argument; keep the original default value.
    if cutoffs is None:
        cutoffs = [None, 10, 5]

    np.random.seed(0)  # freeze the randomness bit
    alphas = alpha if isinstance(alpha, list) else [alpha] * n_stages
    params = {'epochs': epochs, 'l1_ratio': 1.0, 'penalty': 'none'}

    scaler = MaxAbsScaler(copy=False)
    train_data, valid_data, test_data = load_data(train_file,
                                                  validation_file,
                                                  test_file,
                                                  scaler=scaler)
    costs, importance = load_costs_data(costs_file,
                                        importance_file,
                                        n_features=train_data[0].shape[1])

    # these options don't go well together (or I haven't figured out how to make them)
    assert not (pairwise_transform and GBDT_retraining)

    # keep the original as GBDT won't work with polarized labels
    original_train_data = train_data

    # massage the data a bit ...
    x_train, y_train, qid_train, docno_train = train_data
    y_train = core.polarize(y_train)

    if pairwise_transform:
        from utils import per_query_transform_pairwise
        x_train, y_train = per_query_transform_pairwise(
            x_train.toarray(), y_train, qid_train)

    train_data = (x_train, y_train, qid_train, docno_train)

    # Mark query-level features: is_qf[j] == 1 iff feature j is constant
    # within every query group.
    is_qf = np.ones_like(costs)
    x = x_train.toarray()
    for j, _ in enumerate(costs):
        for a, b in group_offsets(qid_train):
            if (x[a:b, j] != x[a, j]).any():
                is_qf[j] = 0
                break

    # NOTE: costs has to be untainted (make copy before passing it to functions)
    partitions = partition_criteria(n_stages, is_qf, costs.copy(), importance)

    stages = train(train_data,
                   valid_data,
                   costs.copy(),
                   importance,
                   n_stages,
                   cutoffs=cutoffs,
                   feature_partitions=partitions,
                   alphas=alphas,
                   **params)
    if GBDT_retraining:
        stages = retrain('GBDT',
                         stages,
                         original_train_data,
                         valid_data,
                         trees=[5, 10, 50, 100, 500, 1000],
                         nodes=[32])

    cascade = {
        'stages': stages,
        'scaler': scaler,
        'score_update': core.cascade.UpshiftUpdate(gap=0.1)
    }

    if model_prefix:
        save_model(cascade, model_prefix)
    predict(cascade, test_data, costs)