def build_wlm11_cascade(train_file, validation_file, test_file,
                        costs_file=None, importance_file=None,
                        model_prefix=None, **kwargs):
    """Train a cascade over a partition of disjoint feature sets."""
    train_data, valid_data, test_data = load_data(
        train_file, validation_file, test_file,
        scaler=MaxAbsScaler(copy=False))

    n_features = train_data[0].shape[1]
    costs, importance = load_costs_data(
        costs_file, importance_file, n_features=n_features)

    # Hand copies to train() so the pristine cost/importance vectors
    # stay untainted for later use.
    cascade = train(train_data, valid_data,
                    costs.copy(), importance.copy(), **kwargs)

    if model_prefix:
        save_model(cascade, model_prefix)
    predict(cascade, test_data, costs.copy())
def train_tree_ranker(train_file, validation_file, test_file, model_prefix,
                      score_function, params, trees, nodes,
                      set_classes=False, add_original_order=False):
    """Train a tree-based ranker and run prediction on the test split.

    Loads train/validation/test data, optionally injects the original
    document order as an extra feature, trains via train(), optionally
    saves the model, then evaluates with predict().
    """
    train_data, valid_data, test_data = load_data(train_file, validation_file, test_file)
    if add_original_order:
        # FIXME: quick hack
        logging.info('The original-order hack is applied to all data')
        # NOTE(review): the rebuilt tuples keep only 3 elements, while
        # load_data elsewhere in this file yields 4-tuples (incl. docnos) —
        # confirm the 4th element is not needed downstream of this path.
        train_data = (add_original_order_as_feature(train_data), train_data[1], train_data[2])
        if valid_data[0] is not None:
            valid_data = (add_original_order_as_feature(valid_data), valid_data[1], valid_data[2])
        if test_data[0] is not None:
            test_data = (add_original_order_as_feature(test_data), test_data[1], test_data[2])
    # Class weights are derived from the training labels only.
    class_weights = core.get_class_weights(train_data[1])
    model = train(train_data, valid_data, score_function, class_weights,
                  params, trees, nodes, set_classes=set_classes)
    if model_prefix:
        save_model(model, model_prefix)
    predict(model, test_data, score_function, class_weights)
def do_retrain(model_type, train_file, validation_file, model_file,
               new_model_file, test_file=None, costs_file=None, random=0,
               up_to=0, learning_rate="0.1", subsample="0.5",
               trees="[5,10,50,100,500,1000]", nodes="[32]",
               output_trec_run=None, output_eval=None):
    """Retrain a tree-based cascade using features learned in the linear models"""
    train_data = load_data_file(train_file)
    valid_data = (None, ) * 4
    if validation_file:
        valid_data = load_data_file(validation_file)
    test_data = (None, ) * 4
    costs = None
    # Costs are only needed for evaluation, so both files must be supplied.
    if test_file is not None and costs_file is not None:
        test_data = load_data_file(test_file)
        costs, _ = load_costs_data(costs_file, None, n_features=test_data[0].shape[1])
    cascade = load_model(model_file)
    if 'scaler' in cascade:
        # transform() mutates the feature matrices in place.
        cascade['scaler'].transform(train_data[0])
        if valid_data[0] is not None:
            cascade['scaler'].transform(valid_data[0])
        if test_data[0] is not None:
            cascade['scaler'].transform(test_data[0])
    if random > 0:
        # Random hyperparameter search: draw `random` (tree, node) pairs,
        # retrain and evaluate each, then return WITHOUT saving a model.
        for _ in range(random):
            tree = 1 + np.random.randint(1000)
            node = np.random.choice([2, 4, 8, 16, 32, 64])
            print('tree %i, node %i' % (tree, node))
            new_cascade = cascade.copy()
            new_cascade['stages'] = retrain(
                model_type, cascade['stages'], train_data, valid_data,
                learning_rate=ast.literal_eval(learning_rate),
                subsample=ast.literal_eval(subsample),
                trees=[tree], nodes=[node], up_to=up_to)
            if test_data[0] is not None:
                predict(new_cascade, test_data, costs,
                        output_trec_run=output_trec_run,
                        output_eval=output_eval)
        return
    # Standard path: retrain with the supplied grids (string-encoded lists),
    # save the new cascade, and evaluate if test data is available.
    cascade['stages'] = retrain(model_type, cascade['stages'], train_data,
                                valid_data,
                                learning_rate=ast.literal_eval(learning_rate),
                                subsample=ast.literal_eval(subsample),
                                trees=ast.literal_eval(trees),
                                nodes=ast.literal_eval(nodes), up_to=up_to)
    save_model(cascade, new_model_file)
    if test_data[0] is not None:
        predict(cascade, test_data, costs,
                output_trec_run=output_trec_run, output_eval=output_eval)
def do_train_budgeted_GBDT(train_file, validation_file, test_file,
                           costs_file=None, importance_file=None,
                           model_prefix=None, budget=None,
                           trees='[5, 10, 50, 100, 500, 1000]', nodes='[32]'):
    """Train a 1-stage budgeted GBDT cascade.

    Selects the subset of active features whose cumulative (ascending)
    cost fits within `budget`, trains a single GBDT stage on them, then
    optionally saves the cascade and evaluates on the test split.
    """
    train_data, valid_data, test_data = load_data(train_file, validation_file, test_file)
    costs, importance = load_costs_data(costs_file, importance_file,
                                        n_features=train_data[0].shape[1])
    x_train, _, _ = train_data
    x_train = x_train.toarray()

    # Not all features appear in a full model; keep only active columns.
    all_fids = [i for i in range(x_train.shape[1]) if any(x_train[:, i])]

    # BUG FIX: the original did `budget = float(budget)` unconditionally,
    # which raised TypeError for the default budget=None.
    budget = float(budget) if budget is not None else None
    if budget:
        c = costs[all_fids]
        # Turn per-feature costs into cumulative cost in ascending-cost
        # order: a feature is kept iff it plus all cheaper active features
        # fit within the budget.
        c[c.argsort()] = c[c.argsort()].cumsum()
        fids = [fid for fid, b in zip(all_fids, c) if b <= budget]
    else:
        fids = all_fids
    used_features = np.array(fids)
    print('Train a budgeted GBDT with %i features' % used_features.size)

    _, y_train, _ = train_data
    class_weights = core.get_class_weights(y_train)
    params = {'max_depth': 7,
              'eta': 0.1,
              'silent': True,
              'objective': 'multi:softprob',
              'eval_metric': 'mlogloss',
              'subsample': 0.5}

    import GBDT
    new_model = TreeModel(
        model=GBDT.train(train_data, valid_data, core.get_score_multiclass,
                         class_weights, params,
                         trees=ast.literal_eval(trees),
                         nodes=ast.literal_eval(nodes),
                         set_classes=True, features=used_features),
        score_function=core.get_score_multiclass,
        class_weights=class_weights,
        n_features=train_data[0].shape[1])
    cascade = {'stages': [(None, new_model)],
               'score_update': core.cascade.UpshiftUpdate(gap=0.1)}
    if model_prefix:
        save_model(cascade, model_prefix)
    predict(cascade, test_data, costs)
def train_disjoint_cascade(partition_criteria, train_file, validation_file,
                           test_file, costs_file=None, importance_file=None,
                           model_prefix=None, n_stages=3, cutoffs=None,
                           alpha=0.1, epochs=10, pairwise_transform=False,
                           GBDT_retraining=False):
    """Train a cascade over a partition of disjoint feature sets."""
    # BUG FIX: the old signature used a mutable default `cutoffs=[None, 10, 5]`,
    # shared across calls; recreate the default per call instead.
    if cutoffs is None:
        cutoffs = [None, 10, 5]

    np.random.seed(0)  # freeze the randomness bit
    alphas = alpha if isinstance(alpha, list) else [alpha] * n_stages
    params = {'epochs': epochs, 'l1_ratio': 1.0, 'penalty': 'none'}

    scaler = MaxAbsScaler(copy=False)
    train_data, valid_data, test_data = load_data(
        train_file, validation_file, test_file, scaler=scaler)
    costs, importance = load_costs_data(
        costs_file, importance_file, n_features=train_data[0].shape[1])

    # these options don't go well together (or I haven't figured out how to make them)
    assert not (pairwise_transform and GBDT_retraining)

    # keep the original as GBDT won't work with polarized labels
    original_train_data = train_data

    # massage the data a bit ...
    x_train, y_train, qid_train, docno_train = train_data
    y_train = core.polarize(y_train)
    if pairwise_transform:
        from utils import per_query_transform_pairwise
        x_train, y_train = per_query_transform_pairwise(
            x_train.toarray(), y_train, qid_train)
    train_data = (x_train, y_train, qid_train, docno_train)

    # Flag query-level features: a feature is query-level (is_qf[j] == 1)
    # iff its value is constant within every query's group of documents.
    is_qf = np.ones_like(costs)
    x = x_train.toarray()
    for j, _ in enumerate(costs):
        for a, b in group_offsets(qid_train):
            if (x[a:b, j] != x[a, j]).any():
                is_qf[j] = 0
                break

    # NOTE: costs has to be untainted (make copy before passing it to functions)
    partitions = partition_criteria(n_stages, is_qf, costs.copy(), importance)
    stages = train(train_data, valid_data, costs.copy(), importance,
                   n_stages, cutoffs=cutoffs, feature_partitions=partitions,
                   alphas=alphas, **params)
    if GBDT_retraining:
        stages = retrain('GBDT', stages, original_train_data, valid_data,
                         trees=[5, 10, 50, 100, 500, 1000], nodes=[32])

    cascade = {'stages': stages,
               'scaler': scaler,
               'score_update': core.cascade.UpshiftUpdate(gap=0.1)}
    if model_prefix:
        save_model(cascade, model_prefix)
    predict(cascade, test_data, costs)