Ejemplo n.º 1
0
def do_predict(test_file,
               costs_file,
               model_file,
               output_trec_run=None,
               output_eval=None,
               override_cutoffs=None):
    """Run prediction with a saved cascade.

    Args:
        test_file: Path handed to ``load_data_file``.
        costs_file: Path handed to ``load_costs_data``.
        model_file: Path handed to ``load_model``; the loaded object is used
            as a mapping with a ``'stages'`` entry and an optional
            ``'scaler'`` entry.
        output_trec_run: Forwarded unchanged to ``predict``.
        output_eval: Forwarded unchanged to ``predict``.
        override_cutoffs: Optional string containing a Python literal
            (parsed with ``ast.literal_eval``). The parsed value is indexed
            by stage position and each entry replaces that stage's pruning
            rank.
    """
    test_data = load_data_file(test_file)
    costs, _ = load_costs_data(costs_file,
                               None,
                               n_features=test_data[0].shape[1])

    cascade = load_model(model_file)
    if 'scaler' in cascade:
        # The return value is discarded on purpose.
        # NOTE(review): this presumably relies on the saved scaler working
        # in place (e.g. copy=False) -- confirm against the training code.
        cascade['scaler'].transform(test_data[0])

    if override_cutoffs:
        cutoffs = ast.literal_eval(override_cutoffs)
        # Pass the value as a logging argument rather than using the `%`
        # operator: `'...%s' % cutoffs` breaks when `cutoffs` is a tuple.
        logging.info('Override cutoffs with %s', cutoffs)

        # Keep every stage's model, swap only its pruning rule.
        cascade['stages'] = [
            (Prune(rank=cutoffs[i]), model)
            for i, (_, model) in enumerate(cascade['stages'])
        ]

    predict(cascade,
            test_data,
            costs,
            output_trec_run=output_trec_run,
            output_eval=output_eval)
Ejemplo n.º 2
0
def do_info(model_file):
    """Print the selected feature ids of every stage in a saved cascade.

    For each stage (numbered from 1) the non-zero positions of
    ``stage.get_feature_mask()`` are shifted by one and printed together
    with their count. The size of the union over all stages is printed last.
    """
    cascade = load_model(model_file)
    seen_fids = set()
    for stage_no, (_, stage) in enumerate(cascade['stages'], 1):
        mask = stage.get_feature_mask()
        fids = np.flatnonzero(mask) + 1
        print('stage', stage_no)
        print('n_features', len(fids))
        print('fids', fids)
        seen_fids.update(fids)
    print('total n_features', len(seen_fids))
Ejemplo n.º 3
0
def do_predict_LambdaMART(test_file,
                          model_file,
                          output_trec_run=None,
                          add_original_order=False):
    """Score a test set with a saved model using ``core.get_score``.

    When ``add_original_order`` is truthy, the first element of the loaded
    data is replaced by the result of ``add_original_order_as_feature`` and
    only the first three elements of the data tuple are kept.
    """
    test_data = load_data_file(test_file)
    if add_original_order:
        augmented = add_original_order_as_feature(test_data)
        test_data = (augmented, test_data[1], test_data[2])

    model = load_model(model_file)
    # No class weights are passed for this scorer.
    predict(model, test_data, core.get_score, None,
            output_trec_run=output_trec_run)
Ejemplo n.º 4
0
def do_predict_GBDT(test_file,
                    model_file,
                    output_trec_run=None,
                    add_original_order=False):
    """Score a test set with a saved model using ``core.get_score_multiclass``.

    When ``add_original_order`` is truthy, the first element of the loaded
    data is replaced by the result of ``add_original_order_as_feature`` and
    only the first three elements of the data tuple are kept. Class weights
    are derived from the second element of the test data.
    """
    test_data = load_data_file(test_file)
    if add_original_order:
        augmented = add_original_order_as_feature(test_data)
        test_data = (augmented, test_data[1], test_data[2])

    model = load_model(model_file)
    # FIXME: shouldn't peek into this
    class_weights = core.get_class_weights(test_data[1])
    predict(model, test_data, core.get_score_multiclass, class_weights,
            output_trec_run=output_trec_run)
Ejemplo n.º 5
0
def do_predict(test_file, costs_file, model_file, output_trec_run=None, output_eval=None, train_file=None):
    """Run prediction with a saved cascade.

    Args:
        test_file: Path handed to ``load_data_file``.
        costs_file: Path handed to ``load_costs_data``.
        model_file: Path handed to ``load_model``; the loaded object may
            carry a ``'scaler'`` entry.
        output_trec_run: Forwarded unchanged to ``predict``.
        output_eval: Forwarded unchanged to ``predict``.
        train_file: Optional training data path. Only used to fit a
            ``MaxAbsScaler`` when the cascade has no saved scaler.

    The test features are scaled exactly once: by the scaler stored in the
    cascade when there is one, otherwise by a scaler fitted on
    ``train_file`` (if given). Applying both would scale the data twice.
    """
    test_data = load_data_file(test_file)
    costs, _ = load_costs_data(costs_file, None, n_features=test_data[0].shape[1])

    cascade = load_model(model_file)

    if 'scaler' in cascade:
        if train_file:
            logging.warning('Cascade already has a scaler; ignoring train_file')
        # Return value discarded.
        # NOTE(review): presumably the saved scaler works in place -- confirm.
        cascade['scaler'].transform(test_data[0])
    elif train_file:
        # Fallback for cascades saved without their scaler: refit one on
        # the training data. copy=False is requested so that transform
        # updates the test features without returning a new array.
        train_data = load_data_file(train_file)
        scaler = MaxAbsScaler(copy=False)
        scaler.fit(train_data[0])
        scaler.transform(test_data[0])
        logging.info('Data scaled')

    predict(cascade, test_data, costs,
            output_trec_run=output_trec_run, output_eval=output_eval)
Ejemplo n.º 6
0
def do_retrain(model_type,
               train_file,
               validation_file,
               model_file,
               new_model_file,
               test_file=None,
               costs_file=None,
               random=0,
               up_to=0,
               learning_rate="0.1",
               subsample="0.5",
               trees="[5,10,50,100,500,1000]",
               nodes="[32]",
               output_trec_run=None,
               output_eval=None):
    """Retrain a tree-based cascade using features learned in the linear models.

    ``learning_rate``, ``subsample``, ``trees`` and ``nodes`` are strings
    holding Python literals and are parsed with ``ast.literal_eval``.

    Two modes:
      * ``random > 0``: run ``random`` trials, each with a randomly drawn
        tree count and node count. Each trial retrains on a shallow copy of
        the cascade and is evaluated on the test data when available.
        Nothing is saved in this mode.
      * otherwise: retrain with the given ``trees``/``nodes``, save the
        result to ``new_model_file`` and evaluate on the test data when
        available.

    Test data is only loaded when both ``test_file`` and ``costs_file`` are
    given.
    """
    train_data = load_data_file(train_file)

    valid_data = load_data_file(validation_file) if validation_file else (None, ) * 4

    test_data = (None, ) * 4
    costs = None
    have_test_inputs = test_file is not None and costs_file is not None
    if have_test_inputs:
        test_data = load_data_file(test_file)
        costs, _ = load_costs_data(costs_file,
                                   None,
                                   n_features=test_data[0].shape[1])

    cascade = load_model(model_file)
    if 'scaler' in cascade:
        scaler = cascade['scaler']
        # Return values are discarded, as in the other entry points.
        scaler.transform(train_data[0])
        for optional_features in (valid_data[0], test_data[0]):
            if optional_features is not None:
                scaler.transform(optional_features)

    if random > 0:
        node_choices = [2, 4, 8, 16, 32, 64]
        for _ in range(random):
            # Draw order (tree count, then node count) is kept so that a
            # seeded RNG produces the same trials.
            n_trees = 1 + np.random.randint(1000)
            n_nodes = np.random.choice(node_choices)
            print('tree %i, node %i' % (n_trees, n_nodes))

            trial = cascade.copy()
            trial['stages'] = retrain(
                model_type,
                cascade['stages'],
                train_data,
                valid_data,
                learning_rate=ast.literal_eval(learning_rate),
                subsample=ast.literal_eval(subsample),
                trees=[n_trees],
                nodes=[n_nodes],
                up_to=up_to)

            if test_data[0] is None:
                continue
            predict(trial,
                    test_data,
                    costs,
                    output_trec_run=output_trec_run,
                    output_eval=output_eval)
        return

    retrained_stages = retrain(model_type,
                               cascade['stages'],
                               train_data,
                               valid_data,
                               learning_rate=ast.literal_eval(learning_rate),
                               subsample=ast.literal_eval(subsample),
                               trees=ast.literal_eval(trees),
                               nodes=ast.literal_eval(nodes),
                               up_to=up_to)
    cascade['stages'] = retrained_stages
    save_model(cascade, new_model_file)

    if test_data[0] is None:
        return
    predict(cascade,
            test_data,
            costs,
            output_trec_run=output_trec_run,
            output_eval=output_eval)
0
def do_info(model_file):
    """Print, per stage, the pruning rule and the non-zero entries of a saved cascade.

    The loaded model is iterated directly as a sequence of
    ``(prune, stage)`` pairs; stages are numbered from 1.
    """
    cascade = load_model(model_file)
    for stage_no, (prune, stage) in enumerate(cascade, 1):
        nonzero_idx = np.flatnonzero(stage)
        # Indices are 0-based, so shift by one to report feature ids.
        fids = nonzero_idx + 1
        weights = stage[nonzero_idx]
        print('stage', stage_no, 'prune', prune, 'fid', fids, 'weight', weights)