def train():
    utils._makedirs("../logs")
    utils._makedirs("../output")
    logger = utils._get_logger("../logs", "tf-%s.log" % utils._timestamp())

    # load and clean the training data
    dfTrain = pd.read_csv(config.TRAIN_FILE, header=None, sep="\t")
    dfTrain.columns = ["id", "left", "right", "label"]
    dfTrain.dropna(inplace=True)

    # shuffle training data
    dfTrain = dfTrain.sample(frac=1.0)

    dp = DataProcessor(max_num_words=params["max_num_words"],
                       max_num_chars=params["max_num_chars"])
    dfTrain = dp.fit_transform(dfTrain)

    # 60/40 train/validation split
    N = dfTrain.shape[0]
    train_ratio = 0.6
    train_num = int(N * train_ratio)
    X_train = get_model_data(dfTrain[:train_num], params)
    X_valid = get_model_data(dfTrain[train_num:], params)

    model = SemanticMatchingModel(model_name, params, logger=logger, threshold=0.2)
    model.fit(X_train, validation_data=X_valid, shuffle=False)

    # save the model weights and the fitted data processor (with the decision threshold)
    model.save_session()
    with open("dp.pkl", "wb") as f:
        pkl.dump((dp, model.threshold), f, protocol=2)
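# For context, a minimal inference-side sketch that consumes the artifacts
# written by train() above. This is an assumption, not the repo's actual code:
# `dp.transform` and `model.predict_proba` are hypothetical names for the
# transform-only and scoring counterparts of the calls used in train().
import pickle as pkl

def predict(dfTest, model):
    # restore the fitted DataProcessor and decision threshold saved by train()
    with open("dp.pkl", "rb") as f:
        dp, threshold = pkl.load(f)
    dfTest = dp.transform(dfTest)            # hypothetical transform-only method
    X_test = get_model_data(dfTest, params)
    proba = model.predict_proba(X_test)      # hypothetical scoring method
    return (proba >= threshold).astype(int)  # binarize with the saved threshold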
def __init__(self, model_name, params, logger, training=True):
    self.model_name = model_name
    self.params = params
    self.logger = logger
    utils._makedirs(self.params["offline_model_dir"], force=training)
    self._init_tf_vars()
    self.loss, self.num_pairs, self.score, self.train_op = self._build_model()
    self.sess, self.saver = self._init_session()
def __init__(self, model_name, params, logger, threshold, training=True):
    self.model_name = model_name
    self.params = params
    self.logger = logger
    self.threshold = threshold
    utils._makedirs(self.params["offline_model_dir"], force=training)
    self._init_tf_vars()
    self.loss, self.proba = self._build_model()
    # self.loss = self._get_loss()
    self.train_op = self._get_train_op()
    self.sess, self.saver = self._init_session()
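# Both constructors above end with self.sess, self.saver = self._init_session().
# A minimal sketch of what that helper might look like under TF 1.x follows;
# the repo's actual implementation may differ (e.g., GPU options, summaries).
import tensorflow as tf

def _init_session(self):
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True        # allocate GPU memory on demand
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())   # initialize all graph variables
    saver = tf.train.Saver()                      # used later to save/restore weights
    return sess, saver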
def __init__(self, params, target_scaler, logger):
    self.params = params
    self.target_scaler = target_scaler
    self.logger = logger
    _makedirs(self.params["model_dir"], force=True)
    self._init_graph()
    self.gvars_state_list = []

    # hard-coded bias plus 14 weights (reshaped to a column vector below)
    self.bias = 0.01228477
    self.weights = [
        0.00599607, 0.02999416, 0.05985384, 0.20137787, 0.03178938,
        0.04612812, 0.05384821, 0.10121514, 0.05915169, 0.05521121,
        0.06448063, 0.0944233, 0.08306157, 0.11769992,
    ]
    self.weights = np.array(self.weights).reshape(-1, 1)
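# A sketch of how the bias and the 14 weights above would typically be applied:
# a linear blend of base-model predictions. The shape convention (one column per
# base model) and the name `base_preds` are assumptions for illustration.
import numpy as np

def blend(base_preds, weights, bias):
    """base_preds: (n_samples, 14) array, one column per base model."""
    # weights is (14, 1), so the product is (n_samples, 1)
    return base_preds @ weights + bias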
import numpy as np

import utils
from model import LogisticRegression, DNN, RankNet, LambdaRank
from prepare_data import label_file_pat, group_file_pat, feature_file_pat


def load_data(type):
    labels = np.load(label_file_pat % type)
    qids = np.load(group_file_pat % type)
    features = np.load(feature_file_pat % type)
    X = {"feature": features, "label": labels, "qid": qids}
    return X


utils._makedirs("logs")
logger = utils._get_logger("logs", "tf-%s.log" % utils._timestamp())

params_common = {
    # you might have to tune the batch size to get RankNet and LambdaRank working;
    # keep the following in mind:
    # 1. the batch size should be large enough that a batch contains samples with
    #    different relevance labels from the same group, especially when you use
    #    "sample" as "batch_sampling_method"; this ensures the gradients are nonzero
    #    and stable across batches, which matters for pairwise methods such as
    #    RankNet and LambdaRank
    # 2. the batch size should not be too large, since the lambda_ij matrix in
    #    RankNet and LambdaRank (of size batch_size x batch_size) consumes a lot
    #    of memory
    "batch_size": 128,
    "epoch": 30,
    "feature_dim": 60,
    "batch_sampling_method": "sample",
import numpy as np

import utils
from model import LogisticRegression, DNN, RankNet, LambdaRank
from prepare_data import label_file_pat, group_file_pat, feature_file_pat


def load_data(type):
    labels = np.load(label_file_pat % type)
    qids = np.load(group_file_pat % type)
    features = np.load(feature_file_pat % type)
    X = {"feature": features, "label": labels, "qid": qids}
    return X


utils._makedirs("../logs")
logger = utils._get_logger("../logs", "tf-%s.log" % utils._timestamp())

params_common = {
    # the batch-size caveats from the previous snippet apply here as well:
    # large enough for mixed relevance labels per group, small enough to keep
    # the batch_size x batch_size lambda_ij matrix affordable
    "batch_size": 128,
    "epoch": 50,
    "feature_dim": 33,  # 46
    "batch_sampling_method": "sample",
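# To make the lambda_ij memory caveat concrete: the pairwise matrix is
# batch_size x batch_size, so its footprint grows quadratically with the
# batch size. A quick back-of-the-envelope check (float32, 4 bytes/entry):
def lambda_ij_bytes(batch_size, dtype_bytes=4):
    return batch_size * batch_size * dtype_bytes

print(lambda_ij_bytes(128))    # 65536 bytes, ~64 KB  -- negligible
print(lambda_ij_bytes(8192))   # 268435456 bytes, ~256 MB -- substantial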
def _save_session(self, dir):
    """Save the session, i.e., the model weights, to a checkpoint."""
    _makedirs(self.params["model_dir"])
    self.saver.save(self.sess, dir)
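# Hypothetical counterpart to _save_session(); the method name and signature
# are assumptions, but tf.train.Saver.restore is the standard TF 1.x API.
def _restore_session(self, dir):
    """Reload the weights previously saved by _save_session()."""
    self.saver.restore(self.sess, dir)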