Example #1
import sys
import pickle as pkl

import numpy as np

# os_utils, log_utils, time_utils, config, params, init_embedding_matrix and the
# load_*/get_model* helpers are project-local and assumed importable from the repo.
def main():
    model_type = None
    if len(sys.argv) > 1:
        model_type = sys.argv[1]

    os_utils._makedirs("../logs")
    os_utils._makedirs("../output")
    logger = log_utils._get_logger("../logs", "tf-%s.log" % time_utils._timestamp())

    # load data
    Q = load_question(params)
    dfTrain = load_train()
    dfTest = load_test()
    train_features = np.load(config.TRAIN_FEATURES_FILE)
    test_features = np.load(config.TEST_FEATURES_FILE)
    params["num_features"] = train_features.shape[1]


    # load split
    with open(config.SPLIT_FILE, "rb") as f:
        train_idx, valid_idx = pkl.load(f)


    # validation
    X_train = get_model_data(dfTrain.loc[train_idx], train_features[train_idx], params)
    X_valid = get_model_data(dfTrain.loc[valid_idx], train_features[valid_idx], params)

    model = get_model(model_type)(params, logger, init_embedding_matrix=init_embedding_matrix)
    model.fit(X_train, Q, validation_data=X_valid, shuffle=True)

    # submit
    X_train = get_model_data(dfTrain, train_features, params)
    X_test = get_model_data(dfTest, test_features, params)
    y_proba = np.zeros((dfTest.shape[0], params["n_runs"]), dtype=np.float32)
    for run in range(params["n_runs"]):
        params["random_seed"] = run
        params["model_name"] = "semantic_model_%s" % str(run + 1)
        model = get_model(model_type)(params, logger, init_embedding_matrix=init_embedding_matrix)
        model.fit(X_train, Q, validation_data=None, shuffle=True)
        y_proba[:, run] = model.predict_proba(X_test, Q).flatten()
        # running mean of all runs completed so far, written out after every run
        dfTest["y_pre"] = np.mean(y_proba[:, :(run + 1)], axis=1)
        dfTest[["y_pre"]].to_csv(config.SINGLE_SUB_FILE_PATTERN % (model_type, str(run + 1)), header=True, index=False)
Example #2
import sys

import numpy as np

# os_utils, log_utils, time_utils, config, params, init_embedding_matrix and the
# load_*/get_model* helpers are project-local and assumed importable from the repo.
def main():
    model_type = None
    if len(sys.argv) > 1:
        model_type = sys.argv[1]

    os_utils._makedirs("../logs")
    os_utils._makedirs("../output")
    logger = log_utils._get_logger("../logs",
                                   "tf-%s.log" % time_utils._timestamp())

    Q = load_question(params)
    dfTrain = load_train()
    dfTest = load_test()
    X_test = get_model_data(dfTest, params)

    # shuffle training data
    dfTrain = dfTrain.sample(frac=1.0)

    # validation
    train_ratio = 0.7
    N = dfTrain.shape[0]
    train_num = int(N * train_ratio)
    X_train = get_model_data(dfTrain[:train_num], params)
    X_valid = get_model_data(dfTrain[train_num:], params)

    model = get_model(model_type)(params,
                                  logger,
                                  init_embedding_matrix=init_embedding_matrix)
    model.fit(X_train, Q, validation_data=X_valid, shuffle=True)

    # submit
    X_train = get_model_data(dfTrain, params)
    y_proba = np.zeros((dfTest.shape[0], params["n_runs"]), dtype=np.float32)
    for run in range(params["n_runs"]):
        params["random_seed"] = run
        params["model_name"] = "semantic_model_%s" % str(run + 1)
        model = get_model(model_type)(
            params, logger, init_embedding_matrix=init_embedding_matrix)
        model.fit(X_train, Q, validation_data=None, shuffle=True)
        y_proba[:, run] = model.predict_proba(X_test, Q).flatten()
        dfTest["y_pre"] = np.mean(y_proba[:, :(run + 1)], axis=1)
        dfTest[["y_pre"]].to_csv(config.SUB_FILE_PATTERN % str(run + 1),
                                 header=True,
                                 index=False)
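
Both examples build their submission as a running ensemble: after run k, y_pre is the mean of the first k columns of y_proba, so the file written on the last pass is the full n_runs average. A self-contained sketch of that running-mean logic, with dummy predictions standing in for model.predict_proba (shapes and values here are illustrative only):

import numpy as np

n_test, n_runs = 5, 3
y_proba = np.zeros((n_test, n_runs), dtype=np.float32)

rng = np.random.default_rng(0)
for run in range(n_runs):
    # stand-in for model.predict_proba(X_test, Q).flatten()
    y_proba[:, run] = rng.random(n_test)
    # mean over the runs completed so far; after the final run this is
    # the full n_runs ensemble average that ends up in the submission
    y_mean = np.mean(y_proba[:, :(run + 1)], axis=1)
    print(run + 1, y_mean.round(3))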
Example #3
import pandas as pd

# config, params and the load_question/load_train/get_model_data/downsample
# helpers are project-local and assumed importable from the repo.
def get_train_valid_test_data(augmentation=False):
    # load data; the feature files and the pickled train/valid split used by the
    # other variants are left disabled here
    Q = load_question(params)
    dfTrain = load_train()

    # validation: either an augmented dev set, or a contiguous slice of dfTrain
    if augmentation:
        dfDev = pd.read_csv(config.DATA_DIR + "/" + "dev_aug.csv")
        dfDev = downsample(dfDev)
        params["use_features"] = False
        params["augmentation_decay_steps"] = 50000
        params["decay_steps"] = 50000
        X_dev = get_model_data(dfDev, None, params)
    else:
        # .iloc keeps the three slices disjoint; with a default RangeIndex,
        # .loc label slicing is end-inclusive and would put rows 210000 and
        # 220000 into two splits each
        X_dev = get_model_data(dfTrain.iloc[:210000], None, params)
    X_valid = get_model_data(dfTrain.iloc[210000:220000], None, params)
    X_itest = get_model_data(dfTrain.iloc[220000:], None, params)

    # submit: building X_train (optionally from the downsampled train_aug.csv)
    # and X_test for the submission stage is disabled in this variant

    return X_dev, X_valid, Q, X_itest
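
The else-branch carves dfTrain into contiguous dev / valid / internal-test chunks at rows 210000 and 220000. A scaled-down check that positional (.iloc) slicing keeps the three chunks disjoint, which is why it is used above instead of the end-inclusive .loc label slicing (the toy frame and boundaries are illustrative only):

import pandas as pd

df = pd.DataFrame({"q": range(10)})  # stand-in for dfTrain

dev, valid, itest = df.iloc[:6], df.iloc[6:8], df.iloc[8:]
assert len(dev) + len(valid) + len(itest) == len(df)
assert not (set(dev.index) & set(valid.index))    # .iloc ends are exclusive,
assert not (set(valid.index) & set(itest.index))  # so no row lands in two chunks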