def train():
    """Run the full training pipeline and persist its artifacts.

    Steps, in order:
      1. Call ``utils._makedirs`` for ``../logs`` and ``../output`` and
         obtain a logger writing to a timestamped file under ``../logs``.
      2. Read ``config.TRAIN_FILE`` as a header-less, tab-separated table
         with columns ``id``, ``left``, ``right``, ``label``; drop rows
         containing missing values and shuffle the remaining rows.
      3. Fit a ``DataProcessor`` on the data and transform it.
      4. Use the first 60% of the rows for training and the rest for
         validation.
      5. Fit a ``SemanticMatchingModel`` (initial threshold 0.2), save its
         session, and pickle ``(processor, model.threshold)`` to ``dp.pkl``.

    Relies on the module-level names ``params`` and ``model_name``, which
    are defined outside this function.
    """
    utils._makedirs("../logs")
    utils._makedirs("../output")
    log_name = "tf-%s.log" % utils._timestamp()
    logger = utils._get_logger("../logs", log_name)

    # Load the raw pair data; the file carries no header row.
    frame = pd.read_csv(config.TRAIN_FILE, header=None, sep="\t")
    frame.columns = ["id", "left", "right", "label"]
    frame = frame.dropna()

    # Shuffle the rows once here; fit() below is called with shuffle=False.
    frame = frame.sample(frac=1.0)

    processor = DataProcessor(
        max_num_words=params["max_num_words"],
        max_num_chars=params["max_num_chars"],
    )
    frame = processor.fit_transform(frame)

    # Split point for the 60/40 train/validation partition.
    train_ratio = 0.6
    split_at = int(frame.shape[0] * train_ratio)
    train_inputs = get_model_data(frame[:split_at], params)
    valid_inputs = get_model_data(frame[split_at:], params)

    model = SemanticMatchingModel(
        model_name, params, logger=logger, threshold=0.2
    )
    model.fit(train_inputs, validation_data=valid_inputs, shuffle=False)

    # Persist the trained session, then the fitted processor together with
    # the model's threshold attribute as read after fitting.
    model.save_session()
    with open("dp.pkl", "wb") as handle:
        pkl.dump((processor, model.threshold), handle, protocol=2)
from model import LogisticRegression, DNN, RankNet, LambdaRank from prepare_data import label_file_pat, group_file_pat, feature_file_pat def load_data(type): labels = np.load(label_file_pat % type) qids = np.load(group_file_pat % type) features = np.load(feature_file_pat % type) X = {"feature": features, "label": labels, "qid": qids} return X utils._makedirs("logs") logger = utils._get_logger("logs", "tf-%s.log" % utils._timestamp()) params_common = { # you might have to tune the batch size to get ranknet and lambdarank working # keep in mind the followings: # 1. batch size should be large enough to ensure there are samples of different # relevance labels from the same group, especially when you use "sample" as "batch_sampling_method" # this ensure the gradients are nonzeros and stable across batches, # which is important for pairwise method, e.g., ranknet and lambdarank # 2. batch size should not be very large since the lambda_ij matrix in ranknet and lambdarank # (which are of size batch_size x batch_size) will consume large memory space "batch_size": 128, "epoch": 30, "feature_dim": 60, "batch_sampling_method": "sample", "shuffle": True,