# listwise_sampling()

logging.info('================================================================================')
logging.info('dump test file')

# listwise_sampling_test()

logging.info('================================================================================')
logging.info('load query database')

# Parse the three SVMlight-formatted splits into rankpy Queries collections.
# NOTE(review): Queries.load_from_text is a project/rankpy helper -- presumably
# parses "<relevance> qid:<id> <feat>:<val> ..." lines; confirm against the lib.
train_queries = Queries.load_from_text('../data/svmlight_train.txt')
valid_queries = Queries.load_from_text('../data/svmlight_val.txt')
test_queries = Queries.load_from_text('../data/svmlight_test.txt')

logging.info('================================================================================')
logging.info('train LambdaMart')

# Prepare metric for this set of queries.
# 38 is presumably the NDCG rank cutoff (NDCG@38) -- TODO confirm.
metric = NormalizedDiscountedCumulativeGain(38, queries=[train_queries, valid_queries, test_queries])

# Initialize LambdaMART model and train it.
# NOTE(review): n_iterations is passed alongside n_estimators; whether both
# are honoured depends on the LambdaMART implementation in scope -- verify.
model = LambdaMART(n_estimators=10000, max_depth=5, shrinkage=0.08, estopping=100, n_jobs=-1, n_iterations=100)
model.fit(metric, train_queries, validation=valid_queries)

logging.info('================================================================================')
logging.info('test')

# Print out the performance on the test set.
logging.info('%s on the test queries: %.8f' % (metric, metric.evaluate_queries(test_queries, model.predict(test_queries, n_jobs=-1))))
# Second model configuration: oblivious-tree ("pines") variant.
# NOTE(review): this constructor takes metric= directly and fit() takes
# (train, validation_queries=...), unlike the fragment above which passes the
# metric object to fit() -- the two fragments target different library APIs.
model = LambdaMART(
    metric='NDCG@38',
    max_leaf_nodes=7,
    shrinkage=0.07,
    estopping=100,  # early-stopping patience, presumably rounds -- TODO confirm
    n_jobs=-1,
    random_state=42,
    use_pines=True,  # delegate tree growing to the "pines" tree learner
    pines_kwargs=dict(
        switch_criterion=ObliviousCartSwitchCriterionType.OBLIVIOUS_WHILE_CAN,
        tree_type=TreeType.OBLIVIOUS_CART,
        max_n_splits=10,
        min_samples_leaf=50,
        max_depth=2,
    ))

model.fit(train_queries, validation_queries=valid_queries)
logging.info('===============================================================')
# Save the model to files
# os.mkdir(MODELLER_DIR)
# logging.info('New folder is created: %s' % MODELLER_DIR)
# joblib.dump(model, MODELLER_DIR + model_name + '.pkl')
# logging.info('Model is saves as: %s' % MODELLER_DIR + model_name + '.pkl')

logging.info('===============================================================')

# Print out the performance on the test set.
# logging.info('%s on the test queries: %.8f' % (metric, metric.evaluate_queries(
#     test_queries, model.predict(test_queries, n_jobs=-1))))
#train_queries.save('../data/train_bin')
#valid_queries.save('../data/validation_bin')
#test_queries.save('../data/test_bin')

# ... because loading them will be then faster.
#train_queries = Queries.load('../data/train_bin')
#valid_queries = Queries.load('../data/validation_bin')
#test_queries = Queries.load('../data/test_bin')

logging.info('================================================================================')

# Print basic info about query datasets.
logging.info('Train queries: %s' % train_queries)
logging.info('Valid queries: %s' % valid_queries)
logging.info('Test queries: %s' %test_queries)

logging.info('================================================================================')

# Prepare metric for this set of queries.
# 38 is presumably the NDCG rank cutoff (NDCG@38) -- TODO confirm.
metric = NormalizedDiscountedCumulativeGain(38, queries=[train_queries, valid_queries, test_queries])

# Initialize LambdaMART model and train it.
# NOTE(review): another training run; it rebinds the earlier `model` and
# `metric` names.
model = LambdaMART(n_estimators=10000, max_depth=4, shrinkage=0.08, estopping=100, n_jobs=-1)
model.fit(metric, train_queries, validation=valid_queries)

logging.info('================================================================================')

# Print out the performance on the test set.
logging.info('%s on the test queries: %.8f' % (metric, metric.evaluate_queries(test_queries, model.predict(test_queries, n_jobs=-1))))

# Example #4 (score: 0) -- snippet-scrape separator, commented out so the file parses
class LambdaMART(object):
    """Scikit-learn-style adapter around rankpy's ``LambdaMARTModel``.

    Accepts flat (X, y, query-id) arrays, groups contiguous rows into rankpy
    ``Queries`` objects, and delegates training/prediction to
    ``LambdaMARTModel``.

    NOTE(review): ``Queries``, ``LambdaMARTModel``, ``GroupKFold`` and ``plt``
    are imported elsewhere in this file; their semantics are assumed from the
    usage visible here.
    """

    def __init__(self,
                 metric='NDCG',
                 n_estimators=100,
                 max_depth=None,
                 max_leaf_nodes=7,
                 max_features=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 shrinkage=0.1,
                 use_newton_method=True,
                 use_random_forest=0,
                 random_thresholds=False,
                 subsample=1.0,
                 use_logit_boost=False,
                 use_ada_boost=False,
                 estopping=50,
                 min_n_estimators=1,
                 base_model=None,
                 n_jobs=1,
                 random_state=None):
        # Set by fit()/predict(); consumed by plot_importance().
        self.feature_names = None

        # Keyword arguments forwarded verbatim to LambdaMARTModel in fit().
        self.params = {
            'metric': metric,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'max_leaf_nodes': max_leaf_nodes,
            'max_features': max_features,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'shrinkage': shrinkage,
            'use_newton_method': use_newton_method,
            'use_random_forest': use_random_forest,
            'random_thresholds': random_thresholds,
            'subsample': subsample,
            'use_logit_boost': use_logit_boost,
            'use_ada_boost': use_ada_boost,
            'estopping': estopping,
            'min_n_estimators': min_n_estimators,
            'base_model': base_model,
            'n_jobs': n_jobs,
            'random_state': random_state,
        }

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        # BUGFIX: this method was misspelled ``__repr`` (so __str__ fell back
        # to the default object repr) and it read ``self.param`` instead of
        # ``self.params``.  Rendering with %r also survives None-valued
        # parameters, which the old '%d' placeholders (max_depth,
        # max_features) could not.
        args = ', '.join('%s=%r' % (key, self.params[key])
                         for key in sorted(self.params))
        return '%s(%s)' % (self.__class__.__name__, args)

    def _build_query_indptr(self, ids):
        """Build ``(query_indptr, query_ids)`` from per-row query ids.

        ``ids`` must list the rows of one query contiguously.  The returned
        pointer array is such that
        ``feature_vectors[query_indptr[i]:query_indptr[i + 1]]`` (and the
        matching ``relevance_scores`` slice) are the rows of the i-th query;
        ``query_ids`` holds one id per query in order of first appearance.
        """
        query_indptr = [0]
        query_ids = []
        prev_qid = None
        for qid in ids:
            if qid == prev_qid:
                # Same query: extend the current span by one row.
                query_indptr[-1] += 1
            else:
                # New query: record its id and open a new span.
                query_ids.append(qid)
                query_indptr.append(query_indptr[-1] + 1)
                prev_qid = qid
        return query_indptr, query_ids

    def _build_queries(self, X, y, ids, w):
        """Wrap flat arrays into a Queries object plus one weight per query.

        The per-query weight is the weight of the query's first row.
        BUGFIX: removed a dead list comprehension that computed per-query
        *mean* weights and was immediately overwritten; the effective
        behavior (first-row weight) is preserved.
        """
        query_indptr, query_ids = self._build_query_indptr(ids)
        q = Queries(X, y, query_indptr, query_ids=query_ids)
        # One weight per query: the first row of each query's span.
        wn = [w[i] for i in query_indptr[:-1]]
        return q, np.ascontiguousarray(wn, dtype='float64')

    def fit(self, X, y, ids, weight=None, feature_names=None):
        """Fit the underlying LambdaMARTModel and return ``self``.

        Parameters
        ----------
        X : feature matrix, one row per document (indexable by int arrays).
        y : relevance labels; rescaled to integer grades (x5, truncated)
            because rankpy only accepts integer labels.
        ids : per-row query ids; rows of one query must be contiguous.
        weight : optional per-row sample weights.  BUGFIX: defaults to
            all-ones -- the old code crashed with ``None[train]`` when the
            documented default was used.
        feature_names : optional names, kept for plot_importance().
        """
        self.feature_names = feature_names
        # Unfortunately rankpy only works with integer labels...
        # This is far from perfect, but works as a first try.
        y = (np.asanyarray(y) * 5).astype(np.intc)
        if weight is None:
            weight = np.ones(len(y), dtype='float64')
        # Split out a ~10% validation set, keeping each query intact.
        splitter = GroupKFold(10)
        train, valid = next(splitter.split(X, None, ids))

        # Plain pairwise indexing -- clearer than the original
        # chain.from_iterable unpacking trick, same values.
        X_train, X_valid = X[train], X[valid]
        y_train, y_valid = y[train], y[valid]
        ids_train, ids_valid = ids[train], ids[valid]
        w_train, w_valid = weight[train], weight[valid]

        q_train, w_train = self._build_queries(X_train, y_train, ids_train,
                                               w_train)
        q_valid, w_valid = self._build_queries(X_valid, y_valid, ids_valid,
                                               w_valid)

        self.model = LambdaMARTModel(**self.params)
        self.model.fit(q_train, w_train, q_valid, w_valid)
        return self

    def predict(self, X, ids, weight=None, feature_names=None):
        """Predict ranking scores for the rows of ``X``.

        ``weight`` is accepted for interface symmetry with fit() but unused
        (made optional -- backward compatible with callers that pass it).
        """
        self.feature_names = feature_names
        query_indptr, query_ids = self._build_query_indptr(ids)
        # Queries requires relevance labels even though prediction ignores
        # them, so pass zeros.
        y = np.zeros(X.shape[0])
        q = Queries(X, y, query_indptr, query_ids=query_ids)
        return self.model.predict(q, n_jobs=self.params['n_jobs'])

    def plot_importance(self):
        """Plot a horizontal bar chart of feature importances.

        Raises if fit()/predict() was never given ``feature_names``.
        Returns the matplotlib Axes.
        """
        if self.feature_names is None:
            raise Exception('No feature names available')

        importance = self.model.feature_importances(self.params['n_jobs'])

        # stolen from xgboost
        tuples = sorted(zip(self.feature_names, importance),
                        key=lambda t: t[1])
        # BUGFIX: unzip into parallel sequences -- the old code tried to
        # unpack the whole list of pairs into two names (ValueError).
        labels, values = zip(*tuples)

        self.save_topn_features(labels, values)

        _, ax = plt.subplots(1, 1)
        ylocs = np.arange(len(values))
        ax.barh(ylocs, values, align='center', height=0.2)
        # BUGFIX: the loop referenced an undefined name ``yloc``.
        for val, ypos in zip(values, ylocs):
            ax.text(val + 1, ypos, val, va='center')
        ax.set_yticks(ylocs)
        ax.set_yticklabels(labels)

        ax.set_xlim((0, max(values) * 1.1))
        ax.set_ylim((-1, len(importance)))

        ax.grid()
        return ax

    def save_topn_features(self,
                           labels,
                           values,
                           fname="LambdaMART_topn_features.txt",
                           topn=-1):
        """Write the top-n ``label = importance`` pairs to ``fname``.

        ``topn == -1`` (the default) writes every feature.
        """
        if topn == -1:
            topn = len(labels)
        else:
            topn = min(topn, len(labels))
        with open(fname, "w") as f:
            for i in range(topn):
                # BUGFIX: newline was missing, so every entry landed on a
                # single line.
                f.write("%s = %f\n" % (labels[i], values[i]))
# Example #5 (score: 0) -- snippet-scrape separator, commented out so the file parses
logging.info('Test queries: %s' % test_queries)

logging.info(
    '================================================================================'
)

# NOTE(review): metric='nDCG@38' (lower-case n) -- other fragments in this
# file use 'NDCG@38'; confirm which spelling the library accepts.
model = LambdaMART(metric='nDCG@38',
                   max_leaf_nodes=7,
                   shrinkage=0.1,
                   estopping=10,
                   n_jobs=-1,
                   min_samples_leaf=50,
                   random_state=42)

#TODO: do some crossval here?
# NOTE(review): the *test* set is used as the early-stopping validation set
# here, leaking test data into model selection.
model.fit(training_queries, validation_queries=test_queries)

logging.info(
    '================================================================================'
)
logging.info('%s on the test queries: %.8f' %
             (model.metric, model.evaluate(test_queries, n_jobs=-1)))

model.save('LambdaMART_L7_S0.1_E50_' + model.metric)
predicted_rankings = model.predict_rankings(test_queries)

# Re-read the raw test rows and attach the predicted positions for output.
test_df = pd.read_csv("../test_set_VU_DM_2014.csv",
                      header=0,
                      nrows=test_queries.document_count())
test_df['pred_position'] = np.concatenate(predicted_rankings)
# NOTE(review): the next statement is truncated in this snippet (scrape
# artifact) and is not valid Python as it stands.
sorted_df = test_df[['srch_id', 'prop_id', 'pred_position'
# Example #6 (score: 0) -- snippet-scrape separator, commented out so the file parses
# Feature pruning followed by one LambdaMART training run (NDCG@10).

# Keep every query by default; flip to True to also purge queries that
# become useless once constant features are dropped.
remove_useless_queries = False

# Constant query-document features carry no ranking signal -- locate them
# across all three splits at once.
all_splits = [training_queries, validation_queries, test_queries]
cfs = find_constant_features(all_splits)

# Drop the constant features everywhere; only train/validation may purge
# queries -- the test set must keep every query for evaluation.
training_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
validation_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
test_queries.adjust(remove_features=cfs)

# Log a short summary of each split (lazy %-style logging args).
logging.info('Train queries: %s', training_queries)
logging.info('Valid queries: %s', validation_queries)
logging.info('Test queries: %s', test_queries)

logging.info('=' * 80)

model = LambdaMART(metric='NDCG@10',
                   max_leaf_nodes=7,
                   shrinkage=0.1,
                   estopping=50,
                   n_jobs=-1,
                   min_samples_leaf=50,
                   random_state=42)

model.fit(training_queries, validation_queries=validation_queries)

logging.info('=' * 80)

logging.info('%s on the test queries: %.8f',
             model.metric, model.evaluate(test_queries, n_jobs=-1))

model.save('LambdaMART_L7_S0.1_E50_' + model.metric)
# Example #7 (score: 0) -- snippet-scrape separator, commented out so the file parses
       u'hour_of_day_dum22', u'hour_of_day_dum2',
       u'prop_starrating_mean_by_prop_id_x',
        u'prop_starrating_mean_by_prop_id_y',
        u'prop_starrating_median_by_prop_id_y',
        u'prop_starrating_median_by_prop_id_x',
        u'prop_starrating_std_by_prop_id_y',
        u'prop_starrating_std_by_prop_id_x']
# Train on the 'train' split and score the 'test' split, reporting NDCG and
# feature importances.
# NOTE(review): ``d``, ``queries``, ``LambdaMART`` and ``ndcg_of_df`` are
# defined elsewhere in the project; their semantics are inferred from usage.
d.drop_cols = list(set(d.drop_cols))  # de-duplicate columns scheduled for dropping


def _query_indptr_from_ids(q, n_rows):
    """Turn a per-row query-id vector into boundary indices [0, ..., n_rows].

    A boundary sits wherever consecutive query ids differ; duplicated inline
    twice in the original, extracted here.
    """
    diffs = q[0:-1] - q[1:]
    return np.array([0] + [int(i + 1) for i in np.where(diffs != 0)[0]] +
                    [n_rows])


X, y, q = d.get_Xyq('train')
q_indptr = _query_indptr_from_ids(q, X.shape[0])
train_queries = queries.Queries(X, y, q_indptr)

model = LambdaMART(metric='nDCG@38', max_leaf_nodes=7, shrinkage=0.1,
                   estopping=50, n_jobs=-1, min_samples_leaf=50,
                   random_state=42)
#model = LambdaRandomForest(metric='nDCG@38', n_estimators=1000)
model.fit(train_queries)

X, y, q = d.get_Xyq('test')
q_indptr = _query_indptr_from_ids(q, X.shape[0])
test_queries = queries.Queries(X, y, q_indptr, has_sorted_relevances=True)

preds = model.predict(test_queries)
d.test_data['pred_rel'] = preds
result = ndcg_of_df(d.test_data, plus_random=False)
# BUGFIX: bare `print result` is Python-2-only syntax; the call form below
# behaves identically on Python 2 and 3 for a single argument.
print(result)

# Report feature names ordered by decreasing importance.
imps = np.argsort(-model.feature_importances())
imps = d.pp_data.drop(d.drop_cols, axis=1).columns[imps]
print(imps)