import os
import json

import joblib
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor

# Project-local helpers and constants (load_tfidf, load_tfidf_long,
# coda_load_tfidf, tfidf_metric, tfidf_metric_detail, print_tfidf_metric,
# h5_save, save_prediction, build_model, data_dir, predict_dir, model_dir,
# result_dir, RANDOM_SEED) are assumed to be defined elsewhere in this module.


def compute_score(block=20):
    print(f"computing result for block size = {block}")

    # load result file
    with open(os.path.join(predict_dir,
                           f"generation_result_block{block}_parsed.json"),
              'r',
              encoding='utf-8') as infile:
        data = json.load(infile)

    # load tfidf model
    tfidf_model_path = os.path.join(data_dir, "bookcorpus", "tfidf_model",
                                    f"block_{block}.joblib")
    model = joblib.load(tfidf_model_path)

    # turn to vector
    frame_list = []
    for d in data:
        story_frames = " ".join([frame["Frame"] for frame in d["frames"]])
        frame_list.append(story_frames)

    # use the fitted model to vectorise; fit_transform would refit the
    # vocabulary and IDF weights on the predictions instead of reusing them
    predicted_vectors = model.transform(frame_list)
    predicted_vectors = predicted_vectors.toarray()
    print("predicted_vectors.shape = ", predicted_vectors.shape)

    # get ground truth
    y = []
    for d in data:
        y.append(d["y"][0])
    y = np.array(y)
    print(f"y.shape = {y.shape}")

    cosine = tfidf_metric(y[:1000, :], predicted_vectors[:1000, :])
    print(f"cosine = {cosine}\n")
def tfidf_replay_baseline_skip_block(block,
                                     skip,
                                     data_name="bookcorpus",
                                     device="cpu"):
    print("block = {}, skip = {}".format(block, skip))
    if data_name == "bookcorpus":
        x_test, y_test = load_tfidf("test",
                                    block,
                                    skip,
                                    verbose=True,
                                    redo=False)
    elif data_name == "coda19":
        x_test, y_test = coda_load_tfidf("test", block, verbose=True)
    else:
        print("Not supported yet!")
        quit()

    x_test, y_test = x_test.todense(), y_test.todense()
    y_pred = x_test

    res = tfidf_metric(y_test, y_pred, device=device)
    print(res)
    print_tfidf_metric(
        {
            "cosine": float(res),
            "block": block,
            "skip": skip,
            "note": data_name
        },
        filename=os.path.join(result_dir,
                              f"{data_name}_tfidf_skip_block.json"))
def tfidf_prior_baseline(block,
                         data_name="bookcorpus",
                         downsample=-1,
                         device="cpu"):
    if data_name == "bookcorpus":
        x_train, y_train = load_tfidf("train", block)
        x_test, y_test = load_tfidf("test", block)
    elif data_name == "coda19":
        x_train, y_train = coda_load_tfidf("train", block, verbose=True)
        x_test, y_test = coda_load_tfidf("test", block, verbose=True)
    else:
        print("Not supported yet!")
        quit()

    # downsample
    if downsample != -1:
        random_index = np.random.RandomState(5516).permutation(
            x_train.shape[0])[:downsample]
        x_train, y_train = x_train[random_index], y_train[random_index]

    x_train, y_train = x_train.todense(), y_train.todense()
    x_test, y_test = x_test.todense(), y_test.todense()

    prior = np.mean(y_train, axis=0)
    print(prior)
    prior = np.repeat(prior.reshape([1, -1]), y_test.shape[0], axis=0)
    print(prior.shape)
    res = tfidf_metric(y_test, prior)
    print(res)

    cosine_detail = tfidf_metric_detail(y_test, prior, device=device)
    print(cosine_detail)
    h5_save(
        os.path.join(predict_dir, f"{data_name}_tfidf_prior_{block}.h5"),
        name="cosine",
        data=cosine_detail,
    )

    print_tfidf_metric(
        {
            "cosine": float(res),
            "block": block,
            "skip": 0,
            "note": "downsample",
        },
        filename=os.path.join(result_dir,
                              f"{data_name}_tfidf_prior_baseline.json"))
def tf_ml_baseline(block=200,
                   model_name="RandomForest",
                   data_name="bookcorpus",
                   downsample=-1,
                   history=None,
                   n_jobs=10,
                   device="cpu"):
    print("loading data")

    # tfidf as feature
    if data_name == "bookcorpus":
        if history is None:
            x_train, y_train = load_tfidf("train", block, verbose=True)
            x_test, y_test = load_tfidf("test", block, verbose=True)
        else:
            x_train, y_train = load_tfidf_long("train",
                                               block,
                                               verbose=True,
                                               history=history)
            x_test, y_test = load_tfidf_long("test",
                                             block,
                                             verbose=True,
                                             history=history)
    elif data_name == "coda19":
        x_train, y_train = coda_load_tfidf("train", block, verbose=True)
        x_test, y_test = coda_load_tfidf("test", block, verbose=True)
    else:
        print("Not supported yet!")
        quit()

    if downsample != -1:
        random_index = np.random.RandomState(5516).permutation(
            x_train.shape[0])[:downsample]
        x_train, y_train = x_train[random_index], y_train[random_index]

    # do sampling if the training data is too big
    if x_train.shape[0] > 1000000:
        index_list = np.random.RandomState(seed=RANDOM_SEED).permutation(
            x_train.shape[0])[:1000000]
        index_list = np.sort(index_list)
        x_train, y_train = x_train[index_list], y_train[index_list]

    x_train, y_train = x_train.astype(np.float32), y_train.astype(np.float32)
    x_test, y_test = x_test.astype(np.float32), y_test.astype(np.float32)

    x_train, y_train = x_train.todense(), y_train.todense()
    x_test, y_test = x_test.todense(), y_test.todense()

    print("train: x = {}, y = {}".format(str(x_train.shape),
                                         str(y_train.shape)))
    print("test: x = {}, y = {}".format(str(x_test.shape), str(y_test.shape)))
    print("building model using", model_name)

    # parameter setting
    rf_param = {
        "max_depth": 10,
        "random_state": RANDOM_SEED,
        "n_jobs": n_jobs,
        "n_estimators": 30,
        "verbose": 10,
    }
    lgbm_param = {
        "max_depth": 3,
        "num_leaves": 5,
        "random_state": RANDOM_SEED,
        "n_estimators": 100,
        "n_jobs": 1,
        "verbose": -1,
        "force_row_wise": True,
        "device": "gpu",
    }
    if model_name == "RandomForest":
        model = RandomForestRegressor(**rf_param)
    elif model_name == "LGBM":
        model = MultiOutputRegressor(LGBMRegressor(**lgbm_param),
                                     n_jobs=n_jobs)
    else:
        print("Please use an available model (RandomForest or LGBM)")
        quit()

    print("training")
    model.fit(x_train, y_train)

    if history is None:
        model_output = os.path.join(
            model_dir, data_name,
            "block{}_{}.joblib".format(block, model_name))
        filename = os.path.join(result_dir, f"{data_name}_ml_baseline.json")
    else:
        model_output = os.path.join(
            model_dir, data_name,
            "history_block{}_{}.joblib".format(block, model_name))
        filename = os.path.join(result_dir,
                                f"history_exp_{data_name}_ml_baseline.json")

    # save model
    joblib.dump(model, model_output)

    # make prediction
    print("prediting")
    print("block number = {}".format(block))
    y_pred = model.predict(x_test)
    res = tfidf_metric(y_test, y_pred, device=device)
    print("cosine", res)
    print_tfidf_metric(
        {
            "cosine": float(res),
            "block": block,
            "model": model_name,
            "note": "clean - tfidf - downsample"
            if downsample != -1 else "clean - tfidf",
            "history": history,
        },
        filename=filename)

    # output y_pred
    if downsample == -1:
        if history:
            outpath = os.path.join(
                predict_dir, "bookcorpus",
                f"history_block{block}_{model_name}_h{history}.h5")
        else:
            outpath = os.path.join(predict_dir, "bookcorpus",
                                   f"block{block}_{model_name}.h5")
    else:
        outpath = os.path.join(predict_dir, "bookcorpus",
                               f"downsample_block{block}_{model_name}.h5")
    save_prediction(outpath, y_pred)
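
# Compact sketch of the two model setups used in tf_ml_baseline, on synthetic
# float32 data. RandomForestRegressor handles multi-output targets natively;
# LGBMRegressor is single-output, hence the MultiOutputRegressor wrapper above.
def _ml_baseline_demo():
    rng = np.random.RandomState(0)
    x = rng.rand(200, 10).astype(np.float32)
    y = rng.rand(200, 3).astype(np.float32)
    model = RandomForestRegressor(max_depth=5, n_estimators=10, random_state=0)
    model.fit(x, y)
    print(model.predict(x[:2]).shape)  # (2, 3)

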
def tf_ml_baseline_tuning(block=200,
                          model_name="RandomForest",
                          data_name="bookcorpus",
                          sampling=False):
    print("loading data")

    # tfidf as feature
    if data_name == "bookcorpus":
        x_train, y_train = load_tfidf("train", block, verbose=True)
        x_valid, y_valid = load_tfidf("valid", block, verbose=True)
        x_test, y_test = load_tfidf("test", block, verbose=True)
    elif data_name == "coda19":
        x_train, y_train = coda_load_tfidf("train", block, verbose=True)
        x_valid, y_valid = coda_load_tfidf("valid", block, verbose=True)
        x_test, y_test = coda_load_tfidf("test", block, verbose=True)
    else:
        print("Not supported yet!")
        quit()

    # do sampling if the training data is too big
    if x_train.shape[0] > 1000000:
        index_list = np.random.RandomState(seed=RANDOM_SEED).permutation(
            x_train.shape[0])[:1000000]
        index_list = np.sort(index_list)
        x_train, y_train = x_train[index_list], y_train[index_list]

    x_train, y_train = x_train.todense(), y_train.todense()
    x_valid, y_valid = x_valid.todense(), y_valid.todense()
    x_test, y_test = x_test.todense(), y_test.todense()

    print("train: x = {}, y = {}".format(str(x_train.shape),
                                         str(y_train.shape)))
    print("test: x = {}, y = {}".format(str(x_test.shape), str(y_test.shape)))

    print("building model using", model_name)

    # parameter setting
    rf_param_list = [{
        "max_depth": max_depth,
        "n_estimators": n_estimators,
        "n_jobs": 4,
        "random_state": RANDOM_SEED,
        "verbose": 1
    } for max_depth in [10, 20, 30] for n_estimators in [100, 150, 200, 250]]

    lgbm_param_list = [{
        "max_depth": max_depth,
        "n_estimators": n_estimators,
        "random_state": RANDOM_SEED,
        "n_jobs": 10,
        "verbose": 0,
        "force_row_wise": True,
    } for max_depth in [10, 20, 30] for n_estimators in [100, 150, 200, 250]]

    param_dict = {
        "RandomForest": rf_param_list,
        "LGBM": lgbm_param_list,
    }
    param_list = param_dict[model_name]

    print("finding the best parameters")
    best_param = None
    best_score = -1.0  # tf-idf cosine is non-negative, so any model beats this
    best_model = None
    for i, param in enumerate(param_list):
        print(f"Running parameter tuning {i} / {len(param_list)}")
        print(param)
        model = build_model(model_name, param)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_valid)
        score = tfidf_metric(y_valid, y_pred)

        if score > best_score:
            best_param = param
            best_score = score
            best_model = model

    print()

    joblib.dump(
        best_model,
        os.path.join(model_dir, data_name,
                     "block{}_{}.joblib".format(block, model_name)))
    # with open(os.path.join(model_dir, data_name,
    #                        f"block{block}_{model_name}_param.json"),
    #           'w', encoding='utf-8') as outfile:
    #     json.dump(best_param, outfile)

    print("prediting")
    print("block number = {}".format(block))
    y_pred = best_model.predict(x_test)
    res = tfidf_metric(y_test, y_pred)
    print("cosine", res)
    print("best parameter", best_param)
    print_tfidf_metric(
        {
            "cosine": float(res),
            "block": block,
            "model": model_name,
            "note": "clean - tfidf - new",
            "best_parameter": best_param
        },
        filename=os.path.join(result_dir, f"{data_name}_ml_baseline.json"))
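
# The tuning loop above is a plain grid search: build the cross-product of
# hyper-parameter lists as keyword dicts, fit each candidate, and keep the
# best validation score. A compact illustration of the selection logic with
# a toy scorer in place of a fitted model:
def _grid_search_demo():
    grid = [{"max_depth": d, "n_estimators": n}
            for d in (10, 20, 30) for n in (100, 150, 200, 250)]
    best_param, best_score = None, -1.0
    for param in grid:
        score = 1.0 / (param["max_depth"] + param["n_estimators"])  # toy
        if score > best_score:
            best_param, best_score = param, score
    print(len(grid), best_param)  # 12 candidates; the smallest sums win here

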
def tfidf_ir_pytorch(block,
                     data_name="bookcorpus",
                     downsample=-1,
                     device="cpu"):
    # using frame-vector as a query
    if data_name == "bookcorpus":
        x_train, y_train = load_tfidf("train", block)
        x_test, y_test = load_tfidf("test", block)
    elif data_name == "coda19":
        x_train, y_train = coda_load_tfidf("train", block)
        x_test, y_test = coda_load_tfidf("test", block)
    else:
        print(f"{data_name} not supported yet!")
        quit()

    x_train, y_train = x_train.todense(), y_train.todense()
    x_test, y_test = x_test.todense(), y_test.todense()

    if downsample != -1:
        random_index = np.random.RandomState(5516).permutation(
            x_train.shape[0])[:downsample]
        x_train, y_train = x_train[random_index], y_train[random_index]
        print(f"downsampling x_train.shape = {x_train.shape}, "
              f"y_train.shape = {y_train.shape}")

    partition_size = 100000
    answer_list = np.ones([x_test.shape[0]], dtype=np.int32) * (-1)
    best_distance_list = np.zeros([x_test.shape[0]], dtype=np.float64)
    total_count = x_test.shape[0]
    # ceil division so a remainder partition is counted in the progress bar
    partition_num = -(-x_train.shape[0] // partition_size)

    for partition_id, train_start_index in enumerate(
            range(0, x_train.shape[0], partition_size)):
        train = x_train[train_start_index:train_start_index + partition_size]
        train = torch.DoubleTensor(train).to(device)
        train_length = train.shape[0]
        #print(partition_id, "train.shape =", train.shape)

        # go through all the testing instances
        with torch.no_grad():
            for index in range(0, x_test.shape[0]):
                if index % 100 == 0:
                    print("\x1b[2K\rpartition {} / {} , predicting {} / {} [{:.3f}%]".format(
                            partition_id, partition_num, index, total_count, 100.0*index/total_count
                        ), end="")
                x_batch = x_test[index:index+1]
                x_batch = torch.DoubleTensor(x_batch).to(device)
                x_batch = x_batch.repeat([train_length, 1])
                distances = F.cosine_similarity(x_batch, train)
                best_distance = torch.max(distances).cpu().numpy()
                answer_index = torch.argmax(distances).cpu().numpy()
                if best_distance > best_distance_list[index]:
                    best_distance_list[index] = best_distance
                    answer_list[index] = answer_index

    print()
    y_pred = y_train[answer_list]
    print(y_pred.shape, y_test.shape)
    res = tfidf_metric(y_test, y_pred)
    print(res)

    print_tfidf_metric({
        "cosine": float(res),
        "block": block,
        "skip": 0,
        "note": "frame - ir - downsample" if downsample != -1 else "frame - ir"
    }, filename=os.path.join(result_dir, f"{data_name}_tfidf_ir_baseline.json"))
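
# Minimal sketch of one partition step in tfidf_ir_pytorch: for a single test
# row, find the most similar training row by cosine similarity. Synthetic
# dense data; the function above streams partitions to bound device memory.
def _cosine_nn_demo(device="cpu"):
    rng = np.random.RandomState(0)
    train = torch.DoubleTensor(rng.rand(50, 16)).to(device)
    query = torch.DoubleTensor(rng.rand(1, 16)).to(device)
    with torch.no_grad():
        sims = F.cosine_similarity(query.repeat([train.shape[0], 1]), train)
        print(int(torch.argmax(sims)), float(torch.max(sims)))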