Esempio n. 1
0
def train():
    """Run one end-to-end training pass and persist the resulting artifacts.

    Steps, in order:

    1. Call ``utils._makedirs`` for ``../logs`` and ``../output`` and obtain
       a logger via ``utils._get_logger`` for ``../logs``.
    2. Read ``config.TRAIN_FILE`` as a header-less, tab-separated table with
       the columns ``id``, ``left``, ``right`` and ``label``; drop rows that
       contain missing values and shuffle the remaining rows.
    3. Fit a ``DataProcessor`` on the table and transform it.
    4. Use the first 60% of the rows for training and the rest for
       validation.
    5. Fit a ``SemanticMatchingModel`` constructed with ``threshold=0.2``,
       save its session, and pickle ``(processor, model.threshold)`` to
       ``dp.pkl`` (relative to the current working directory).

    ``params`` and ``model_name`` are not defined in this function; they are
    read from the enclosing scope.
    """
    for directory in ("../logs", "../output"):
        utils._makedirs(directory)
    log_name = "tf-%s.log" % utils._timestamp()
    logger = utils._get_logger("../logs", log_name)

    frame = pd.read_csv(config.TRAIN_FILE, header=None, sep="\t")
    frame.columns = ["id", "left", "right", "label"]
    # Discard incomplete rows, then shuffle; frac=1.0 keeps every row.
    frame = frame.dropna().sample(frac=1.0)

    processor = DataProcessor(
        max_num_words=params["max_num_words"],
        max_num_chars=params["max_num_chars"],
    )
    frame = processor.fit_transform(frame)

    # Positional split: leading 60% of rows for training, remainder for validation.
    split_at = int(frame.shape[0] * 0.6)
    train_inputs = get_model_data(frame[:split_at], params)
    valid_inputs = get_model_data(frame[split_at:], params)

    matcher = SemanticMatchingModel(
        model_name,
        params,
        logger=logger,
        threshold=0.2,
    )
    # NOTE(review): shuffle=False presumably because the rows were already
    # shuffled above -- confirm against SemanticMatchingModel.fit.
    matcher.fit(train_inputs, validation_data=valid_inputs, shuffle=False)

    # Persist the model session, then the fitted processor together with the
    # model's threshold attribute as it stands after fitting.
    matcher.save_session()
    with open("dp.pkl", "wb") as handle:
        pkl.dump((processor, matcher.threshold), handle, protocol=2)
Esempio n. 2
0
from model import LogisticRegression, DNN, RankNet, LambdaRank
from prepare_data import label_file_pat, group_file_pat, feature_file_pat


def load_data(type):
    """Load the label, group-id and feature arrays for the given ``type``.

    Each file path is produced by %-formatting the matching pattern
    (``label_file_pat``, ``group_file_pat``, ``feature_file_pat``) with
    ``type``; each file is then read with ``np.load``.

    Args:
        type: Value substituted into the three file-name patterns. The name
            shadows the builtin but is kept so existing keyword callers
            continue to work.

    Returns:
        A dict with the keys ``"feature"``, ``"label"`` and ``"qid"`` mapped
        to the corresponding loaded arrays.
    """
    # Files are read in label -> group -> feature order.
    patterns = (label_file_pat, group_file_pat, feature_file_pat)
    label_arr, qid_arr, feature_arr = (np.load(pat % type) for pat in patterns)

    return {"feature": feature_arr, "label": label_arr, "qid": qid_arr}


# Module-level logging setup, executed at import time.
# utils._makedirs is called for "logs" first (presumably so the directory
# exists before the logger opens a file in it -- helper not shown here).
utils._makedirs("logs")
# The log file name embeds whatever utils._timestamp() returns, giving
# "tf-<timestamp>.log" inside "logs".
logger = utils._get_logger("logs", "tf-%s.log" % utils._timestamp())

params_common = {
    # You might have to tune the batch size to get ranknet and lambdarank working.
    # Keep the following in mind:
    # 1. The batch size should be large enough to ensure there are samples with different
    # relevance labels from the same group, especially when you use "sample" as "batch_sampling_method".
    # This ensures the gradients are nonzero and stable across batches,
    # which is important for pairwise methods, e.g., ranknet and lambdarank.
    # 2. The batch size should not be very large, since the lambda_ij matrix in ranknet and lambdarank
    # (which is of size batch_size x batch_size) will consume a large amount of memory.
    "batch_size": 128,
    "epoch": 30,
    "feature_dim": 60,
    "batch_sampling_method": "sample",
    "shuffle": True,