Example #1
    def fit(self, X, y, ids, weight=None, feature_names=None):
        self.feature_names = feature_names
        # Unfortunately rankpy only works with integer labels...
        # This is far from perfect, but works as a first try
        y = (np.asanyarray(y) * 5).astype(np.intc)
        if weight is None:
            # Default to uniform per-document weights; the split below
            # indexes into `weight`, so it cannot stay None.
            weight = np.ones(len(y))
        # Split out a 10% validation set, keeping whole queries together
        splitter = GroupKFold(10)
        train, valid = next(splitter.split(X, None, ids))

        # Slice each of the four arrays into its train/valid parts in one pass.
        X_train, X_valid, y_train, y_valid, ids_train, ids_valid, w_train, w_valid = chain.from_iterable(
            ((a[train], a[valid]) for a in [X, y, ids, weight]))

        q_train, w_train = self._build_queries(X_train, y_train, ids_train,
                                               w_train)
        q_valid, w_valid = self._build_queries(X_valid, y_valid, ids_valid,
                                               w_valid)

        self.model = LambdaMARTModel(**self.params)
        self.model.fit(q_train, w_train, q_valid, w_valid)
        return self
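
The label conversion in this snippet quantizes fractional relevances onto a small integer scale because rankpy expects integer labels. A quick illustration with made-up values (np.intc is int32 on most platforms):

import numpy as np
y = np.array([0.0, 0.2, 1.0])
print((np.asanyarray(y) * 5).astype(np.intc))  # -> [0 1 5]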
Example #2
from rankpy.models import LambdaMART
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

with open("test.svmlight") as f:
    head = [next(f) for x in range(4)]
features = map(lambda x: x.split(':')[1], head[-1][2:-1].split(' '))
columns = pd.read_csv("../test_set_VU_DM_2014.csv", header=0,
                      nrows=1).columns.values.tolist()
lm = LambdaMART.load("LambdaMartModel0.5.model")
feats = dict(zip(features, lm.feature_importances()))
feats = sorted(feats.items(), key=lambda kv: -kv[1])

fig, ax = plt.subplots(figsize=(1200 / 120, 500 / 120))
# Use lists (not lazy map objects) so this also runs under Python 3.
bp = sns.barplot([kv[0] for kv in feats], [kv[1] for kv in feats])

for item in bp.get_xticklabels():
    item.set_rotation(90)
plt.subplots_adjust(bottom=0.5)

plt.savefig("feature_importances", dpi=400)
Example #3
    # parameters
    metric = 'nDCG@10'
    max_leaf_nodes = 7
    min_samples_split = 2
    estopping = 10
    # TODO: change these to the optimal ones found from the grid search
    n_estimators = 1000
    max_features = None
    min_samples_leaf = 50
    learn_rate = 0.2

    model = LambdaMART(metric=metric,
                       max_leaf_nodes=max_leaf_nodes,
                       min_samples_split=min_samples_split,
                       estopping=estopping,
                       n_estimators=n_estimators,
                       max_features=max_features,
                       min_samples_leaf=min_samples_leaf,
                       shrinkage=learn_rate,
                       n_jobs=-1,
                       random_state=42)

    model.fit(training_queries, validation_queries=validation_queries)

    logging.info('=' * 80)

    test_ranks = model.predict_rankings(test_queries)
    dcg_score = NDCG(test_queries, test_ranks).mean_ndcg()
    map_score = MAP(test_queries, test_ranks).mean_average_precision()
    dcg_folds_scores.append(dcg_score)
Example #4
logging.info('Train queries: %s' % training_queries)
logging.info('Valid queries: %s' % validation_queries)
logging.info('Test queries: %s' % test_queries)

logging.info('=' * 80)

model = LambdaMART(
    metric='NDCG@10',
    max_leaf_nodes=7,
    shrinkage=0.1,
    estopping=30,
    n_jobs=-1,
    random_state=42,
    use_pines=True,
    pines_kwargs=dict(
        switch_criterion=ObliviousCartSwitchCriterionType.OBLIVIOUS_WHILE_CAN,
        tree_type=TreeType.OBLIVIOUS_CART,
        max_n_splits=10,
        min_samples_leaf=50,
        max_depth=10,
    ))

model.fit(training_queries, validation_queries=validation_queries)

logging.info('=' * 80)

logging.info('%s on the test queries: %.8f' %
             (model.metric, model.evaluate(test_queries, n_jobs=-1)))
Example #5
cfs = find_constant_features(
    [training_queries, validation_queries, test_queries])

# Get rid of constant features and (possibly) remove useless queries.
training_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
validation_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
test_queries.adjust(remove_features=cfs)

# Print basic info about query datasets.
logging.info('Train queries: %s' % training_queries)
logging.info('Valid queries: %s' % validation_queries)
logging.info('Test queries: %s' % test_queries)

logging.info('=' * 80)

model = LambdaMART(metric='NDCG@10',
                   max_leaf_nodes=7,
                   shrinkage=0.1,
                   estopping=50,
                   n_jobs=-1,
                   min_samples_leaf=50,
                   random_state=42)

model.fit(training_queries, validation_queries=validation_queries)

logging.info('=' * 80)

logging.info('%s on the test queries: %.8f' %
             (model.metric, model.evaluate(test_queries, n_jobs=-1)))

model.save('LambdaMART_L7_S0.1_E50_' + model.metric)
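
The saved model can later be restored and scored without retraining. A minimal sketch reusing only calls that appear in the neighbouring examples (the filename simply mirrors the save above, where model.metric is 'NDCG@10'):

model = LambdaMART.load(filepath='LambdaMART_L7_S0.1_E50_NDCG@10')
test_ranks = model.predict_rankings(test_queries)
logging.info('NDCG@10 on the test queries: %.8f'
             % NDCG(test_queries, test_ranks).mean_ndcg())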
Example #6
    validation_queries.adjust(remove_features=cfs,
                              purge=remove_useless_queries)
    test_queries.adjust(remove_features=cfs)

    # Print basic info about query datasets.
    logging.info('Train queries: %s' % training_queries)
    logging.info('Valid queries: %s' % validation_queries)
    logging.info('Test queries: %s' % test_queries)

    logging.info('=' * 80)

    # load the model
    filename = 'models/LambdaMART_Fold_' + str(i)
    model = LambdaMART.load(filepath=filename)

    logging.info('=' * 80)

    test_ranks = model.predict_rankings(test_queries)
    dcg_score = NDCG(test_queries, test_ranks).mean_ndcg()
    map_score = MAP(test_queries, test_ranks).mean_average_precision()
    dcg_folds_scores.append(dcg_score)
    map_folds_scores.append(map_score)

    # evaluate NDCG@10 on the test queries for the best model
    logging.info('%s on the test queries: %.8f' % ('NDCG@10', dcg_score))

    # evaluate MAP on the test queries
Example #7
#train_queries.save('../data/train_bin')
#valid_queries.save('../data/validation_bin')
#test_queries.save('../data/test_bin')

# ... because loading them will then be faster.
#train_queries = Queries.load('../data/train_bin')
#valid_queries = Queries.load('../data/validation_bin')
#test_queries = Queries.load('../data/test_bin')

logging.info('=' * 80)

# Print basic info about query datasets.
logging.info('Train queries: %s' % train_queries)
logging.info('Valid queries: %s' % valid_queries)
logging.info('Test queries: %s' % test_queries)

logging.info('=' * 80)

# Prepare metric for this set of queries.
metric = NormalizedDiscountedCumulativeGain(38, queries=[train_queries, valid_queries, test_queries])

# Initialize LambdaMART model and train it.
model = LambdaMART(n_estimators=10000, max_depth=4, shrinkage=0.08, estopping=100, n_jobs=-1)
model.fit(metric, train_queries, validation=valid_queries)

logging.info('=' * 80)

# Print out the performance on the test set.
logging.info('%s on the test queries: %.8f' % (metric, metric.evaluate_queries(test_queries, model.predict(test_queries, n_jobs=-1))))

Example #8
cfs = find_constant_features([training_queries, validation_queries])

# Get rid of constant features and (possibly) remove useless queries.
training_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
validation_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
test_queries.adjust(remove_features=cfs)

# Print basic info about query datasets.
logging.info('Train queries: %s' % training_queries)
logging.info('Valid queries: %s' % validation_queries)
logging.info('Test queries: %s' % test_queries)

logging.info('=' * 80)

model = LambdaMART(metric='nDCG@38', max_leaf_nodes=7, shrinkage=0.1,
                   estopping=10, n_jobs=-1, min_samples_leaf=50,
                   random_state=42)

# TODO: do some cross-validation here?
model.fit(training_queries, validation_queries=test_queries)

logging.info('=' * 80)
logging.info('%s on the test queries: %.8f'
             % (model.metric, model.evaluate(test_queries, n_jobs=-1)))

model.save('LambdaMART_L7_S0.1_E50_' + model.metric)
predicted_rankings = model.predict_rankings(test_queries)

test_df = pd.read_csv("../test_set_VU_DM_2014.csv", header=0,
                      nrows=test_queries.document_count())
test_df['pred_position'] = np.concatenate(predicted_rankings)
sorted_df = test_df[['srch_id', 'prop_id', 'pred_position']].sort_values(['srch_id', 'pred_position'])
Example #9
class LambdaMART(object):
    def __init__(self,
                 metric='NDCG',
                 n_estimators=100,
                 max_depth=None,
                 max_leaf_nodes=7,
                 max_features=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 shrinkage=0.1,
                 use_newton_method=True,
                 use_random_forest=0,
                 random_thresholds=False,
                 subsample=1.0,
                 use_logit_boost=False,
                 use_ada_boost=False,
                 estopping=50,
                 min_n_estimators=1,
                 base_model=None,
                 n_jobs=1,
                 random_state=None):
        self.feature_names = None

        self.params = {
            'metric': metric,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'max_leaf_nodes': max_leaf_nodes,
            'max_features': max_features,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'shrinkage': shrinkage,
            'use_newton_method': use_newton_method,
            'use_random_forest': use_random_forest,
            'random_thresholds': random_thresholds,
            'subsample': subsample,
            'use_logit_boost': use_logit_boost,
            'use_ada_boost': use_ada_boost,
            'estopping': estopping,
            'min_n_estimators': min_n_estimators,
            'base_model': base_model,
            'n_jobs': n_jobs,
            'random_state': random_state,
        }

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return (
            "%s(metric='%s', n_estimators=%d, max_depth=%s, max_leaf_nodes=%d,\n"
            "max_features=%s, min_samples_split=%d, min_samples_leaf=%d,\n"
            "shrinkage=%f, use_newton_method=%s, use_random_forest=%d,\n"
            "random_thresholds=%s, subsample=%f, use_logit_boost=%s, use_ada_boost=%s,\n"
            "estopping=%d, min_n_estimators=%d, n_jobs=%d, random_state=%s,\n"
            "base_model=%s)" % (
                self.__class__.__name__,
                self.param["metric"],
                self.params["n_estimators"],
                self.params["max_depth"],
                self.params["max_leaf_nodes"],
                self.params["max_features"],
                self.params["min_samples_split"],
                self.params["min_samples_leaf"],
                self.params["shrinkage"],
                self.params["use_newton_method"],
                self.params["use_random_forest"],
                self.params["random_thresholds"],
                self.params["subsample"],
                self.params["use_logit_boost"],
                self.params["use_ada_boost"],
                self.params["estopping"],
                self.params["min_n_estimators"],
                self.params["n_jobs"],
                self.params["random_state"],
                str(self.params["base_model"]),
            ))

    def _build_query_indptr(self, ids):
        """
        The query index pointer into the feature_vectors and relevance_scores
        array, i.e. the document feature vectors,
        ``feature_vectors[query_indptr[i]:query_indptr[i + 1]]``, and the
        corresponding relevance scores,
        ``relevance_scores[query_indptr[i]:query_indptr[i + 1]]``,
        are the feature vectors and relevance scores for the i-th
        query documents.
        """
        query_indptr = [0]
        query_ids = []
        prev_qid = None
        for qid in ids:
            if qid == prev_qid:
                query_indptr[-1] += 1
            else:
                query_ids.append(qid)
                query_indptr.append(query_indptr[-1] + 1)
                prev_qid = qid
        return query_indptr, query_ids
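
    # Illustrative only: for ids = [7, 7, 7, 3, 3, 1] this yields
    # query_indptr = [0, 3, 5, 6] and query_ids = [7, 3, 1], so the rows
    # of query 3 are feature_vectors[3:5]. Ids that are not grouped
    # would silently produce duplicate query entries.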

    def _build_queries(self, X, y, ids, w):
        query_indptr, query_ids = self._build_query_indptr(ids)
        q = Queries(X, y, query_indptr, query_ids=query_ids)
        # The model expects one weight per query rather than per document;
        # take the weight of each query's first row (all rows of a query
        # are assumed to share the same weight).
        wn = [w[i] for i in query_indptr[:-1]]
        return q, np.ascontiguousarray(wn, dtype='float64')

    def fit(self, X, y, ids, weight=None, feature_names=None):
        self.feature_names = feature_names
        # Unfortunately rankpy only works with integer labels...
        # This is far from perfect, but works as a first try
        y = (np.asanyarray(y) * 5).astype(np.intc)
        if weight is None:
            # Default to uniform per-document weights; the split below
            # indexes into `weight`, so it cannot stay None.
            weight = np.ones(len(y))
        # Split out a 10% validation set, keeping whole queries together
        splitter = GroupKFold(10)
        train, valid = next(splitter.split(X, None, ids))

        # Slice each of the four arrays into its train/valid parts in one pass.
        X_train, X_valid, y_train, y_valid, ids_train, ids_valid, w_train, w_valid = chain.from_iterable(
            ((a[train], a[valid]) for a in [X, y, ids, weight]))

        q_train, w_train = self._build_queries(X_train, y_train, ids_train,
                                               w_train)
        q_valid, w_valid = self._build_queries(X_valid, y_valid, ids_valid,
                                               w_valid)

        self.model = LambdaMARTModel(**self.params)
        self.model.fit(q_train, w_train, q_valid, w_valid)
        return self

    def predict(self, X, ids, weight, feature_names=None):
        self.feature_names = feature_names
        query_indptr, query_ids = self._build_query_indptr(ids)
        # We won't be using the labels, but Queries won't instantiate without them
        y = np.zeros(X.shape[0])
        q = Queries(X, y, query_indptr, query_ids=query_ids)
        y_pred = self.model.predict(q, n_jobs=self.params['n_jobs'])
        return y_pred

    def plot_importance(self):
        if self.feature_names is None:
            raise Exception('No feature names available')

        importance = self.model.feature_importances(self.params['n_jobs'])

        # stolen from xgboost
        tuples = sorted(zip(self.feature_names, importance),
                        key=lambda x: x[1])
        labels, values = zip(*tuples)

        # save_topn_features expects the most important features first.
        self.save_topn_features(labels[::-1], values[::-1])

        _, ax = plt.subplots(1, 1)
        ylocs = np.arange(len(values))
        ax.barh(ylocs, values, align='center', height=0.2)
        for x, y in zip(values, ylocs):
            ax.text(x + 1, y, x, va='center')
        ax.set_yticks(ylocs)
        ax.set_yticklabels(labels)

        xlim = (0, max(values) * 1.1)
        ax.set_xlim(xlim)

        ylim = (-1, len(importance))
        ax.set_ylim(ylim)

        ax.grid()
        return ax

    def save_topn_features(self,
                           labels,
                           values,
                           fname="LambdaMART_topn_features.txt",
                           topn=-1):
        if topn == -1:
            topn = len(labels)
        else:
            topn = min(topn, len(labels))
        with open(fname, "w") as f:
            for i in range(topn):
                f.write("%s = %f" % (labels[i], values[i]))
Example #10
validation_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
test_queries.adjust(remove_features=cfs)

# Print basic info about query datasets.
logging.info('Train queries: %s' % training_queries)
logging.info('Valid queries: %s' % validation_queries)
logging.info('Test queries: %s' % test_queries)

logging.info('=' * 80)

model = LambdaMART(metric='nDCG@38',
                   max_leaf_nodes=7,
                   shrinkage=0.1,
                   estopping=10,
                   n_jobs=-1,
                   min_samples_leaf=50,
                   random_state=42)

# TODO: do some cross-validation here?
model.fit(training_queries, validation_queries=test_queries)

logging.info('=' * 80)
logging.info('%s on the test queries: %.8f' %
             (model.metric, model.evaluate(test_queries, n_jobs=-1)))

model.save('LambdaMART_L7_S0.1_E50_' + model.metric)
predicted_rankings = model.predict_rankings(test_queries)
Example #11
valid_queries = Queries.load_from_text('../data/svmlight_val.txt')
test_queries = Queries.load_from_text('../data/svmlight_test.txt')

logging.info('=' * 80)
logging.info('train LambdaMart')

# Prepare metric for this set of queries.
metric = NormalizedDiscountedCumulativeGain(
    38, queries=[train_queries, valid_queries, test_queries])

# Initialize LambdaMART model and train it.
model = LambdaMART(n_estimators=10000,
                   max_depth=5,
                   shrinkage=0.08,
                   estopping=100,
                   n_jobs=-1,
                   n_iterations=100)
model.fit(metric, train_queries, validation=valid_queries)

logging.info('=' * 80)
logging.info('test')

# Print out the performance on the test set.
logging.info('%s on the test queries: %.8f' %
             (metric,
              metric.evaluate_queries(test_queries,
                                      model.predict(test_queries, n_jobs=-1))))
Example #12
remove_useless_queries = False

# Find constant query-document features.
cfs = find_constant_features([training_queries,
                              validation_queries,
                              test_queries])

# Get rid of constant features and (possibly) remove useless queries.
training_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
validation_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
test_queries.adjust(remove_features=cfs)

# Print basic info about query datasets.
logging.info('Train queries: %s' % training_queries)
logging.info('Valid queries: %s' % validation_queries)
logging.info('Test queries: %s' % test_queries)

logging.info('=' * 80)

model = LambdaMART(metric='NDCG@10', max_leaf_nodes=7, shrinkage=0.1,
                   estopping=50, n_jobs=-1, min_samples_leaf=50,
                   random_state=42)

model.fit(training_queries, validation_queries=validation_queries)

logging.info('=' * 80)

logging.info('%s on the test queries: %.8f'
             % (model.metric, model.evaluate(test_queries, n_jobs=-1)))

model.save('LambdaMART_L7_S0.1_E50_' + model.metric)
Example #13
        u'hour_of_day_dum22', u'hour_of_day_dum2',
        u'prop_starrating_mean_by_prop_id_x',
        u'prop_starrating_mean_by_prop_id_y',
        u'prop_starrating_median_by_prop_id_y',
        u'prop_starrating_median_by_prop_id_x',
        u'prop_starrating_std_by_prop_id_y',
        u'prop_starrating_std_by_prop_id_x']
d.drop_cols = list(set(d.drop_cols))

X, y, q = d.get_Xyq('train')
# Recover the query index pointer from the per-row query ids in q.
q_indptr = q[0:-1] - q[1:]
q_indptr = np.array([0] + [int(i + 1) for i in np.where(q_indptr != 0)[0]]
                    + [X.shape[0]])
train_queries = queries.Queries(X, y, q_indptr)
model = LambdaMART(metric='nDCG@38', max_leaf_nodes=7, shrinkage=0.1,
                   estopping=50, n_jobs=-1, min_samples_leaf=50,
                   random_state=42)
#model = LambdaRandomForest(metric='nDCG@38', n_estimators=1000)
model.fit(train_queries)
X, y, q = d.get_Xyq('test')
q_indptr = q[0:-1] - q[1:]
q_indptr = np.array([0] + [int(i + 1) for i in np.where(q_indptr != 0)[0]]
                    + [X.shape[0]])
test_queries = queries.Queries(X, y, q_indptr, has_sorted_relevances=True)
preds = model.predict(test_queries)
d.test_data['pred_rel'] = preds
result = ndcg_of_df(d.test_data, plus_random=False)
print(result)
imps = np.argsort(-model.feature_importances())
imps = d.pp_data.drop(d.drop_cols, axis=1).columns[imps]
print(imps)
# listwise_sampling()

logging.info('=' * 80)
logging.info('dump test file')

# listwise_sampling_test()

logging.info('=' * 80)
logging.info('load query database')

train_queries = Queries.load_from_text('../data/svmlight_train.txt')
valid_queries = Queries.load_from_text('../data/svmlight_val.txt')
test_queries = Queries.load_from_text('../data/svmlight_test.txt')

logging.info('=' * 80)
logging.info('train LambdaMart')

# Prepare metric for this set of queries.
metric = NormalizedDiscountedCumulativeGain(38, queries=[train_queries, valid_queries, test_queries])

# Initialize LambdaMART model and train it.
model = LambdaMART(n_estimators=10000, max_depth=5, shrinkage=0.08, estopping=100, n_jobs=-1, n_iterations=100)
model.fit(metric, train_queries, validation=valid_queries)

logging.info('=' * 80)
logging.info('test')

# Print out the performance on the test set.
logging.info('%s on the test queries: %.8f' % (metric, metric.evaluate_queries(test_queries, model.predict(test_queries, n_jobs=-1))))
Example #16
metric = NormalizedDiscountedCumulativeGain(
    38, queries=[train_queries, valid_queries, test_queries])

# Initialize LambdaMART model and train it.
# model = LambdaMART(n_estimators=10000, max_depth=2, shrinkage=0.07,
#                    estopping=100, n_jobs=-1)
# model.fit(metric, train_queries)  # , validation=valid_queries)

model = LambdaMART(
    metric='NDCG@38',
    max_leaf_nodes=7,
    shrinkage=0.07,
    estopping=100,
    n_jobs=-1,
    random_state=42,
    use_pines=True,
    pines_kwargs=dict(
        switch_criterion=ObliviousCartSwitchCriterionType.OBLIVIOUS_WHILE_CAN,
        tree_type=TreeType.OBLIVIOUS_CART,
        max_n_splits=10,
        min_samples_leaf=50,
        max_depth=2,
    ))

model.fit(train_queries, validation_queries=valid_queries)
logging.info('=' * 80)
# Save the model to files
# os.mkdir(MODELLER_DIR)
# logging.info('New folder is created: %s' % MODELLER_DIR)
# joblib.dump(model, MODELLER_DIR + model_name + '.pkl')
# logging.info('Model is saved as: %s' % (MODELLER_DIR + model_name + '.pkl'))
Example #17
logging.info('=' * 80)
metrics = {}
# Prepare metric for this set of queries.
metrics[0] = NormalizedDiscountedCumulativeGain(
    10, queries=[train_queries, valid_queries, test_queries])
# metrics[1] = SeznamRank(10, queries=[train_queries, valid_queries, test_queries])
metrics[1] = DiscountedCumulativeGain(
    10, queries=[train_queries, valid_queries, test_queries])
metrics[2] = WinnerTakesAll(
    10, queries=[train_queries, valid_queries, test_queries])
# metrics[4] = ExpectedReciprocalRank(10, queries=[train_queries, valid_queries, test_queries])
# Initialize LambdaMART model and train it.
model = LambdaMART(n_estimators=50000,
                   max_depth=4,
                   shrinkage=0.1,
                   estopping=args.iter,
                   n_jobs=4)
metric = metrics[args.metric]
model.fit(metric, train_queries, validation=valid_queries)

logging.info('=' * 80)

# Print out the performance on the test set.
logging.info('%s on the test queries: %.8f' %
             (metric,
              metric.evaluate_queries(test_queries,
                                      model.predict(test_queries, n_jobs=-1))))
#EOF