Esempio n. 1
0
def main(model_config, validation, storage_path):
    """Run one model job: fetch inputs, run the model, upload artifacts, power off.

    Args:
        model_config: JSON text describing the model. Once decoded it must
            provide the keys ``dataset_path_matrix`` and ``dataset_path_meta``.
        validation: Validation setting; stored in the config under
            ``"validation"`` and forwarded to ``run_model`` as ``val``.
        storage_path: Prefix to which each artifact file name is appended
            verbatim (no separator is inserted) to form the upload target.
    """
    # Every local artifact lives in the "run" directory next to this file.
    run_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "run")

    def in_run_dir(file_name):
        # Build the path of a file inside the run directory.
        return os.path.join(run_dir, file_name)

    log_file = in_run_dir("recsys.log")
    matrix_file = in_run_dir("Xcsr.h5")
    meta_file = in_run_dir("meta.h5")
    predictions_file = in_run_dir("predictions.csv")
    model_file = in_run_dir("model.joblib")
    config_file = in_run_dir("config.json")

    # The logger is created before the config is parsed, as in the
    # original flow.
    run_logger = get_logger(log_file)

    config = json.loads(model_config)
    config["validation"] = validation

    # Persist the effective configuration so it can be uploaded later.
    with open(config_file, "wt") as handle:
        serialized = json.dumps(config)
        handle.write(serialized)

    # Fetch the two datasets named in the configuration, matrix first.
    inputs = (
        ("dataset_path_matrix", matrix_file),
        ("dataset_path_meta", meta_file),
    )
    for config_key, destination in inputs:
        download_data(config[config_key], destination)

    instance = parse_model_instance(config)
    run_model(
        mat_path=matrix_file,
        meta_path=meta_file,
        model_instance=instance,
        predictions_path=predictions_file,
        model_path=model_file,
        val=validation,
        logger=run_logger,
    )

    # Upload order: predictions, model, config, then the log.
    artifacts = (
        (predictions_file, "predictions.csv"),
        (model_file, "model.joblib"),
        (config_file, "config.json"),
        (log_file, "recsys.log"),
    )
    for local_file, remote_name in artifacts:
        upload_data(local_file, storage_path + remote_name)

    # Shut the host down once everything has been uploaded.
    os.system("sudo shutdown now")
import h5sparse
import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMRanker

import sys
sys.path.append('/home/janice/el/recsys2019/src')

from recsys.config import BEST_PARAMS
from recsys.log_utils import get_logger
from recsys.metric import mrr_fast
from recsys.utils import group_lengths, timer, get_git_hash
from sklearn.metrics import roc_auc_score

# Module-level logger obtained from the project's logging helper.
logger = get_logger()

print("Starting validation")

# Load the inputs. `timer` is a context manager from recsys.utils that is
# given a label for the stage it wraps. Both paths are relative, so they
# resolve against the current working directory.
with timer("reading data"):
    # Metadata table stored under the "data" key of the HDF5 file.
    meta = pd.read_hdf("../../data/proc/vectorizer_1/meta.h5", key="data")
    # The "matrix" entry of the h5sparse file, opened read-only.
    mat = h5sparse.File("../../data/proc/vectorizer_1/Xcsr.h5",
                        mode="r")["matrix"]

# Build the positional index arrays used for the train/validation split.
with timer("splitting data"):
    # Maximum number of training rows; also the first validation position.
    split_idx = 4000000
    # Positions of the rows flagged as neither validation nor test, truncated
    # to the first `split_idx` matches.
    train_ind = np.where((meta.is_val == 0)
                         & (meta.is_test == 0))[0][:split_idx]
    # Flag-based validation selection, left disabled in favour of the fixed
    # positional range below.
    # val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
    # NOTE(review): 4868466 is hard-coded -- presumably the row count of
    # `meta`; confirm. This range is positional and ignores is_val/is_test,
    # so it only complements train_ind if the first `split_idx` rows all
    # pass the filter above -- TODO verify.
    val_ind = np.arange(split_idx, 4868466)
    # Prints the index arrays themselves, not their lengths.
    print("train_ind: {} / val_ind: {}".format(train_ind, val_ind))