Example #1
import pickle as pkl

import pandas as pd

import config
import utils

# params, model_name, DataProcessor, get_model_data and SemanticMatchingModel
# are assumed to come from the surrounding module
def train():

    utils._makedirs("../logs")
    utils._makedirs("../output")
    logger = utils._get_logger("../logs", "tf-%s.log" % utils._timestamp())

    dfTrain = pd.read_csv(config.TRAIN_FILE, header=None, sep="\t")
    dfTrain.columns = ["id", "left", "right", "label"]

    dfTrain.dropna(inplace=True)

    # shuffle training data
    dfTrain = dfTrain.sample(frac=1.0)

    dp = DataProcessor(max_num_words=params["max_num_words"],
                       max_num_chars=params["max_num_chars"])
    dfTrain = dp.fit_transform(dfTrain)

    # split into 60% train / 40% validation
    N = dfTrain.shape[0]
    train_ratio = 0.6
    train_num = int(N * train_ratio)
    X_train = get_model_data(dfTrain[:train_num], params)
    X_valid = get_model_data(dfTrain[train_num:], params)

    model = SemanticMatchingModel(model_name,
                                  params,
                                  logger=logger,
                                  threshold=0.2)
    model.fit(X_train, validation_data=X_valid, shuffle=False)

    # save model
    model.save_session()
    with open("dp.pkl", "wb") as f:
        pkl.dump((dp, model.threshold), f, protocol=2)
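For inference, the pickled pair can be read back the same way it was written above; a minimal sketch (the file name and tuple layout mirror the pkl.dump call in train()):

import pickle as pkl

with open("dp.pkl", "rb") as f:
    dp, threshold = pkl.load(f)  # same (dp, threshold) tuple dumped above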
Example #2
    def __init__(self, model_name, params, logger, training=True):
        self.model_name = model_name
        self.params = params
        self.logger = logger
        utils._makedirs(self.params["offline_model_dir"], force=training)

        self._init_tf_vars()
        self.loss, self.num_pairs, self.score, self.train_op = self._build_model()

        self.sess, self.saver = self._init_session()
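The _init_session helper is not shown in this example; a minimal sketch of what it might look like under TensorFlow 1.x (an assumption, not the original code):

import tensorflow as tf

def _init_session(self):
    # assumed TF 1.x-style setup; not the original helper
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # allocate GPU memory on demand
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()  # used later for saving/restoring weights
    return sess, saver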
Example #3
    def __init__(self, model_name, params, logger, threshold, training=True):
        self.model_name = model_name
        self.params = params
        self.logger = logger
        self.threshold = threshold
        utils._makedirs(self.params["offline_model_dir"], force=training)

        self._init_tf_vars()
        self.loss, self.proba = self._build_model()
        # self.loss = self._get_loss()
        self.train_op = self._get_train_op()

        self.sess, self.saver = self._init_session()
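Likewise, _get_train_op is not shown; a minimal sketch assuming a plain Adam optimizer and a hypothetical "learning_rate" entry in params:

import tensorflow as tf

def _get_train_op(self):
    # assumed implementation; the optimizer choice and param key are hypothetical
    optimizer = tf.train.AdamOptimizer(learning_rate=self.params["learning_rate"])
    return optimizer.minimize(self.loss)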
Example #4
    def __init__(self, params, target_scaler, logger):
        self.params = params
        self.target_scaler = target_scaler
        self.logger = logger
        _makedirs(self.params["model_dir"], force=True)
        self._init_graph()
        self.gvars_state_list = []

        # blending bias plus the 14 per-model weights below (one per base model)
        self.bias = 0.01228477
        self.weights = [
            0.00599607, 0.02999416, 0.05985384, 0.20137787, 0.03178938, 0.04612812,
            0.05384821, 0.10121514, 0.05915169, 0.05521121, 0.06448063, 0.0944233,
            0.08306157, 0.11769992
        ]
        self.weights = np.array(self.weights).reshape(-1, 1)
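A minimal sketch of how such hard-coded blending weights would be applied, assuming a (n_samples, 14) matrix of base-model predictions (the names below are illustrative):

import numpy as np

def blend(preds_base, weights, bias):
    # preds_base: (n_samples, 14), one column per base model
    # weights: (14, 1) column vector; returns (n_samples, 1) blended scores
    return preds_base.dot(weights) + bias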
Example #5
import numpy as np

import utils
from model import LogisticRegression, DNN, RankNet, LambdaRank
from prepare_data import label_file_pat, group_file_pat, feature_file_pat


def load_data(type):

    labels = np.load(label_file_pat % type)
    qids = np.load(group_file_pat % type)
    features = np.load(feature_file_pat % type)

    X = {"feature": features, "label": labels, "qid": qids}
    return X
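A quick usage sketch; the split names are assumptions, since the valid values of type depend on how prepare_data names its files:

X_train = load_data("train")  # assumed split name
X_valid = load_data("valid")  # assumed split name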


utils._makedirs("logs")
logger = utils._get_logger("logs", "tf-%s.log" % utils._timestamp())

params_common = {
    # you might have to tune the batch size to get RankNet and LambdaRank working
    # keep in mind the following:
    # 1. the batch size should be large enough to ensure there are samples with different
    #    relevance labels from the same group, especially when "batch_sampling_method" is "sample";
    #    this ensures the gradients are nonzero and stable across batches,
    #    which is important for pairwise methods such as RankNet and LambdaRank
    # 2. the batch size should not be very large, since the lambda_ij matrix in RankNet and
    #    LambdaRank (which is of size batch_size x batch_size) consumes a lot of memory
    "batch_size": 128,
    "epoch": 30,
    "feature_dim": 60,
    "batch_sampling_method": "sample",
Example #6
import numpy as np

import utils
from model import LogisticRegression, DNN, RankNet, LambdaRank
from prepare_data import label_file_pat, group_file_pat, feature_file_pat


def load_data(type):

    labels = np.load(label_file_pat % type)
    qids = np.load(group_file_pat % type)
    features = np.load(feature_file_pat % type)

    X = {"feature": features, "label": labels, "qid": qids}
    return X


utils._makedirs("../logs")
logger = utils._get_logger("../logs", "tf-%s.log" % utils._timestamp())

params_common = {
    # you might have to tune the batch size to get RankNet and LambdaRank working
    # keep in mind the following:
    # 1. the batch size should be large enough to ensure there are samples with different
    #    relevance labels from the same group, especially when "batch_sampling_method" is "sample";
    #    this ensures the gradients are nonzero and stable across batches,
    #    which is important for pairwise methods such as RankNet and LambdaRank
    # 2. the batch size should not be very large, since the lambda_ij matrix in RankNet and
    #    LambdaRank (which is of size batch_size x batch_size) consumes a lot of memory
    "batch_size": 128,
    "epoch": 50,
    "feature_dim": 33,  #46,
    "batch_sampling_method": "sample",
Example #7
    def _save_session(self, dir):
        """Saves the session, i.e., the model weights."""
        _makedirs(self.params["model_dir"])
        self.saver.save(self.sess, dir)
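The matching restore step is not shown in these examples; a minimal sketch, assuming the same self.sess/self.saver attributes:

    def _restore_session(self, dir):
        # assumed counterpart to _save_session above (not in the original)
        self.saver.restore(self.sess, dir)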