# Imports assume rankpy's module layout.
import logging

from rankpy.queries import Queries
from rankpy.metrics import NormalizedDiscountedCumulativeGain
from rankpy.models import LambdaMART

# listwise_sampling()
logging.info('================================================================================')
logging.info('dump test file')
# listwise_sampling_test()
logging.info('================================================================================')
logging.info('load query database')

train_queries = Queries.load_from_text('../data/svmlight_train.txt')
valid_queries = Queries.load_from_text('../data/svmlight_val.txt')
test_queries = Queries.load_from_text('../data/svmlight_test.txt')

logging.info('================================================================================')
logging.info('train LambdaMART')

# Prepare the metric for this set of queries.
metric = NormalizedDiscountedCumulativeGain(
    38, queries=[train_queries, valid_queries, test_queries])

# Initialize the LambdaMART model and train it.
model = LambdaMART(n_estimators=10000, max_depth=5, shrinkage=0.08,
                   estopping=100, n_jobs=-1, n_iterations=100)
model.fit(metric, train_queries, validation=valid_queries)

logging.info('================================================================================')
logging.info('test')

# Print out the performance on the test set.
logging.info('%s on the test queries: %.8f'
             % (metric, metric.evaluate_queries(
                 test_queries, model.predict(test_queries, n_jobs=-1))))
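# For reference (assumed input format): Queries.load_from_text reads
# SVMlight-style learning-to-rank rows, one document per line, with documents
# grouped by qid:
#
#   <relevance> qid:<query_id> <feature_index>:<value> <feature_index>:<value> ...
#
# e.g. "2 qid:1 1:0.53 2:0.12 3:0.98"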
model = LambdaMART(
    metric='NDCG@38',
    max_leaf_nodes=7,
    shrinkage=0.07,
    estopping=100,
    n_jobs=-1,
    random_state=42,
    use_pines=True,
    pines_kwargs=dict(
        switch_criterion=ObliviousCartSwitchCriterionType.OBLIVIOUS_WHILE_CAN,
        tree_type=TreeType.OBLIVIOUS_CART,
        max_n_splits=10,
        min_samples_leaf=50,
        max_depth=2,
    ))
model.fit(train_queries, validation_queries=valid_queries)

logging.info('===============================================================')

# Save the model to files.
# os.mkdir(MODELLER_DIR)
# logging.info('New folder is created: %s' % MODELLER_DIR)
# joblib.dump(model, MODELLER_DIR + model_name + '.pkl')
# logging.info('Model is saved as: %s' % (MODELLER_DIR + model_name + '.pkl'))

logging.info('===============================================================')

# Print out the performance on the test set.
# logging.info('%s on the test queries: %.8f' % (metric, metric.evaluate_queries(
#     test_queries, model.predict(test_queries, n_jobs=-1))))
# Save the queries in binary format once...
#train_queries.save('../data/train_bin')
#valid_queries.save('../data/validation_bin')
#test_queries.save('../data/test_bin')
# ... because loading them will then be faster.
#train_queries = Queries.load('../data/train_bin')
#valid_queries = Queries.load('../data/validation_bin')
#test_queries = Queries.load('../data/test_bin')

logging.info('================================================================================')

# Print basic info about the query datasets.
logging.info('Train queries: %s' % train_queries)
logging.info('Valid queries: %s' % valid_queries)
logging.info('Test queries: %s' % test_queries)

logging.info('================================================================================')

# Prepare the metric for this set of queries.
metric = NormalizedDiscountedCumulativeGain(
    38, queries=[train_queries, valid_queries, test_queries])

# Initialize the LambdaMART model and train it.
model = LambdaMART(n_estimators=10000, max_depth=4, shrinkage=0.08,
                   estopping=100, n_jobs=-1)
model.fit(metric, train_queries, validation=valid_queries)

logging.info('================================================================================')

# Print out the performance on the test set.
logging.info('%s on the test queries: %.8f'
             % (metric, metric.evaluate_queries(
                 test_queries, model.predict(test_queries, n_jobs=-1))))
# Imports assume numpy, scikit-learn, matplotlib, and rankpy are available;
# rankpy's own LambdaMART is aliased to LambdaMARTModel so it does not clash
# with the wrapper class defined below.
from itertools import chain

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold

from rankpy.queries import Queries
from rankpy.models import LambdaMART as LambdaMARTModel


class LambdaMART(object):

    def __init__(self, metric='NDCG', n_estimators=100, max_depth=None,
                 max_leaf_nodes=7, max_features=None, min_samples_split=2,
                 min_samples_leaf=1, shrinkage=0.1, use_newton_method=True,
                 use_random_forest=0, random_thresholds=False, subsample=1.0,
                 use_logit_boost=False, use_ada_boost=False, estopping=50,
                 min_n_estimators=1, base_model=None, n_jobs=1,
                 random_state=None):
        self.feature_names = None
        self.params = {
            'metric': metric,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'max_leaf_nodes': max_leaf_nodes,
            'max_features': max_features,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'shrinkage': shrinkage,
            'use_newton_method': use_newton_method,
            'use_random_forest': use_random_forest,
            'random_thresholds': random_thresholds,
            'subsample': subsample,
            'use_logit_boost': use_logit_boost,
            'use_ada_boost': use_ada_boost,
            'estopping': estopping,
            'min_n_estimators': min_n_estimators,
            'base_model': base_model,
            'n_jobs': n_jobs,
            'random_state': random_state,
        }

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        # %s is used for the parameters that may be None (max_depth,
        # max_features).
        return (
            "%s(metric='%s', n_estimators=%d, max_depth=%s, max_leaf_nodes=%d,\n"
            "max_features=%s, min_samples_split=%d, min_samples_leaf=%d,\n"
            "shrinkage=%f, use_newton_method=%s, use_random_forest=%d,\n"
            "random_thresholds=%s, subsample=%f, use_logit_boost=%s, use_ada_boost=%s,\n"
            "estopping=%d, min_n_estimators=%d, n_jobs=%d, random_state=%s,\n"
            "base_model=%s)" % (
                self.__class__.__name__,
                self.params["metric"], self.params["n_estimators"],
                self.params["max_depth"], self.params["max_leaf_nodes"],
                self.params["max_features"], self.params["min_samples_split"],
                self.params["min_samples_leaf"], self.params["shrinkage"],
                self.params["use_newton_method"], self.params["use_random_forest"],
                self.params["random_thresholds"], self.params["subsample"],
                self.params["use_logit_boost"], self.params["use_ada_boost"],
                self.params["estopping"], self.params["min_n_estimators"],
                self.params["n_jobs"], self.params["random_state"],
                str(self.params["base_model"]),
            ))

    def _build_query_indptr(self, ids):
        """
        Build the query index pointer into the feature_vectors and
        relevance_scores arrays, i.e. the document feature vectors,
        ``feature_vectors[query_indptr[i]:query_indptr[i + 1]]``, and the
        corresponding relevance scores,
        ``relevance_scores[query_indptr[i]:query_indptr[i + 1]]``, are the
        feature vectors and relevance scores for the i-th query's documents.
        Assumes ``ids`` arrives grouped by query.
        """
        query_indptr = [0]
        query_ids = []
        prev_qid = None
        for qid in ids:
            if qid == prev_qid:
                query_indptr[-1] += 1
            else:
                query_ids.append(qid)
                query_indptr.append(query_indptr[-1] + 1)
                prev_qid = qid
        return query_indptr, query_ids

    def _build_queries(self, X, y, ids, w):
        query_indptr, query_ids = self._build_query_indptr(ids)
        q = Queries(X, y, query_indptr, query_ids=query_ids)
        # rankpy wants one weight per query instead of per row; taking the
        # weight of each query's first document is just a guess at the right
        # aggregation (averaging each query's row weights is an alternative).
        wn = [w[i] for i in query_indptr[:-1]]
        return q, np.ascontiguousarray(wn, dtype='float64')

    def fit(self, X, y, ids, weight=None, feature_names=None):
        self.feature_names = feature_names
        # Unfortunately rankpy only works with integer labels...
        # ...so scale the graded labels up and truncate. This is far from
        # perfect, but works as a first try.
        y = (np.asanyarray(y) * 5).astype(np.intc)
        ids = np.asanyarray(ids)
        if weight is None:
            # Fall back to uniform sample weights when none are given.
            weight = np.ones(X.shape[0], dtype='float64')
        # Split out a 10% validation set, keeping each query in one fold.
        splitter = GroupKFold(10)
        train, valid = next(splitter.split(X, None, ids))
        X_train, X_valid, y_train, y_valid, ids_train, ids_valid, w_train, w_valid = \
            chain.from_iterable((a[train], a[valid]) for a in [X, y, ids, weight])
        q_train, w_train = self._build_queries(X_train, y_train, ids_train, w_train)
        q_valid, w_valid = self._build_queries(X_valid, y_valid, ids_valid, w_valid)
        self.model = LambdaMARTModel(**self.params)
        self.model.fit(q_train, w_train, q_valid, w_valid)
        return self

    def predict(self, X, ids, weight, feature_names=None):
        self.feature_names = feature_names
        query_indptr, query_ids = self._build_query_indptr(ids)
        # We won't be using the relevance scores, but Queries won't
        # instantiate without them.
        y = np.zeros(X.shape[0])
        q = Queries(X, y, query_indptr, query_ids=query_ids)
        y_pred = self.model.predict(q, n_jobs=self.params['n_jobs'])
        return y_pred

    def plot_importance(self):
        if self.feature_names is None:
            raise Exception('No feature names available')
        importance = self.model.feature_importances(self.params['n_jobs'])
        # Adapted from xgboost's importance plot.
        tuples = sorted(zip(self.feature_names, importance), key=lambda x: x[1])
        labels, values = zip(*tuples)
        self.save_topn_features(labels, values)
        _, ax = plt.subplots(1, 1)
        ylocs = np.arange(len(values))
        ax.barh(ylocs, values, align='center', height=0.2)
        for x, y in zip(values, ylocs):
            ax.text(x + 1, y, x, va='center')
        ax.set_yticks(ylocs)
        ax.set_yticklabels(labels)
        ax.set_xlim((0, max(values) * 1.1))
        ax.set_ylim((-1, len(importance)))
        ax.grid()
        return ax

    def save_topn_features(self, labels, values,
                           fname="LambdaMART_topn_features.txt", topn=-1):
        if topn == -1:
            topn = len(labels)
        else:
            topn = min(topn, len(labels))
        with open(fname, "w") as f:
            for i in range(topn):
                f.write("%s = %f\n" % (labels[i], values[i]))
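# A minimal sketch of the CSR-style offsets _build_query_indptr produces:
# with qids [7, 7, 7, 3, 3], rows 0:3 belong to query 7 and rows 3:5 to
# query 3 (the qid column must already arrive grouped by query).
if __name__ == '__main__':
    wrapper = LambdaMART()
    query_indptr, query_ids = wrapper._build_query_indptr([7, 7, 7, 3, 3])
    assert query_indptr == [0, 3, 5]
    assert query_ids == [7, 3]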
logging.info('Test queries: %s' % test_queries)

logging.info('================================================================================')

model = LambdaMART(metric='nDCG@38', max_leaf_nodes=7, shrinkage=0.1,
                   estopping=10, n_jobs=-1, min_samples_leaf=50,
                   random_state=42)

# TODO: do some cross-validation here?
model.fit(training_queries, validation_queries=test_queries)

logging.info('================================================================================')

logging.info('%s on the test queries: %.8f'
             % (model.metric, model.evaluate(test_queries, n_jobs=-1)))

model.save('LambdaMART_L7_S0.1_E50_' + model.metric)

predicted_rankings = model.predict_rankings(test_queries)

test_df = pd.read_csv("../test_set_VU_DM_2014.csv", header=0,
                      nrows=test_queries.document_count())
test_df['pred_position'] = np.concatenate(predicted_rankings)
sorted_df = test_df[['srch_id', 'prop_id', 'pred_position']]
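# A plausible continuation (assumed, not recovered from the original source):
# order each search's properties by their predicted position and write out a
# submission-style file; the output filename is hypothetical.
sorted_df = sorted_df.sort_values(['srch_id', 'pred_position'])
sorted_df[['srch_id', 'prop_id']].to_csv('predicted_ranking.csv', index=False)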
remove_useless_queries = False

# Find constant query-document features.
cfs = find_constant_features([training_queries, validation_queries, test_queries])

# Get rid of the constant features and (possibly) remove useless queries.
training_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
validation_queries.adjust(remove_features=cfs, purge=remove_useless_queries)
test_queries.adjust(remove_features=cfs)

# Print basic info about the query datasets.
logging.info('Train queries: %s' % training_queries)
logging.info('Valid queries: %s' % validation_queries)
logging.info('Test queries: %s' % test_queries)

logging.info('=' * 80)

model = LambdaMART(metric='NDCG@10', max_leaf_nodes=7, shrinkage=0.1,
                   estopping=50, n_jobs=-1, min_samples_leaf=50,
                   random_state=42)
model.fit(training_queries, validation_queries=validation_queries)

logging.info('=' * 80)

logging.info('%s on the test queries: %.8f'
             % (model.metric, model.evaluate(test_queries, n_jobs=-1)))

model.save('LambdaMART_L7_S0.1_E50_' + model.metric)
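# A standalone sketch of what find_constant_features (used above) is assumed
# to compute: flag features whose value never varies across any document in
# any of the given datasets, since such columns carry no ranking signal.
import numpy as np

def constant_feature_mask(feature_matrices):
    """Boolean mask of columns holding a single value across all rows."""
    X = np.vstack(feature_matrices)
    return np.all(X == X[0], axis=0)

mask = constant_feature_mask([np.array([[1.0, 5.0], [1.0, 6.0]]),
                              np.array([[1.0, 7.0]])])
assert list(mask) == [True, False]  # only the first column is constant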
    u'hour_of_day_dum22', u'hour_of_day_dum2',
    u'prop_starrating_mean_by_prop_id_x', u'prop_starrating_mean_by_prop_id_y',
    u'prop_starrating_median_by_prop_id_y', u'prop_starrating_median_by_prop_id_x',
    u'prop_starrating_std_by_prop_id_y', u'prop_starrating_std_by_prop_id_x']
d.drop_cols = list(set(d.drop_cols))

X, y, q = d.get_Xyq('train')
# Differencing the grouped qid column marks the row indices where the query
# id changes; those boundaries become CSR-style offsets into X.
q_diff = q[0:-1] - q[1:]
q_indptr = np.array([0] + [int(i + 1) for i in np.where(q_diff != 0)[0]] +
                    [X.shape[0]])
train_queries = queries.Queries(X, y, q_indptr)

model = LambdaMART(metric='nDCG@38', max_leaf_nodes=7, shrinkage=0.1,
                   estopping=50, n_jobs=-1, min_samples_leaf=50,
                   random_state=42)
#model = LambdaRandomForest(metric='nDCG@38', n_estimators=1000)
model.fit(train_queries)

X, y, q = d.get_Xyq('test')
q_diff = q[0:-1] - q[1:]
q_indptr = np.array([0] + [int(i + 1) for i in np.where(q_diff != 0)[0]] +
                    [X.shape[0]])
test_queries = queries.Queries(X, y, q_indptr, has_sorted_relevances=True)

preds = model.predict(test_queries)
d.test_data['pred_rel'] = preds
result = ndcg_of_df(d.test_data, plus_random=False)
print(result)

imps = np.argsort(-model.feature_importances())
imps = d.pp_data.drop(d.drop_cols, axis=1).columns[imps]
print(imps)
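# A small self-contained check of the boundary trick above (only valid when
# rows are already grouped by query and neighbouring queries have different
# ids):
import numpy as np

q = np.array([7, 7, 7, 3, 3])
q_diff = q[0:-1] - q[1:]
q_indptr = np.array([0] + [int(i + 1) for i in np.where(q_diff != 0)[0]] +
                    [len(q)])
assert q_indptr.tolist() == [0, 3, 5]  # rows 0:3 are query 7, rows 3:5 query 3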