def main(model_config, validation, storage_path):
    """Run one complete model job, upload its artifacts, then shut down.

    Steps, in order: create the logger for ``run/recsys.log`` next to this
    file, decode ``model_config`` and record ``validation`` in it, write the
    resulting config to ``run/config.json``, download the matrix and meta
    datasets named in the config, build the model instance, run it, upload
    the produced files and finally issue ``sudo shutdown now``.

    Args:
        model_config: JSON-encoded string. The decoded object must support
            item assignment and contain the keys ``dataset_path_matrix``
            and ``dataset_path_meta``.
        validation: Stored under the ``"validation"`` key of the config and
            forwarded to ``run_model`` as ``val``.
        storage_path: Prefix to which each artifact file name is appended
            verbatim (no separator is inserted).
    """
    here = os.path.dirname(os.path.realpath(__file__))

    def in_run_dir(file_name):
        # Every working file of the job lives in the "run" directory that
        # sits beside this source file.
        return os.path.join(here, "run", file_name)

    # The logger is created first, before anything else can fail.
    log_path = in_run_dir("recsys.log")
    logger = get_logger(log_path)

    mat_path = in_run_dir("Xcsr.h5")
    meta_path = in_run_dir("meta.h5")
    predictions_path = in_run_dir("predictions.csv")
    model_path = in_run_dir("model.joblib")
    config_path = in_run_dir("config.json")

    # Decode the config and persist it together with the validation setting.
    config = json.loads(model_config)
    config["validation"] = validation
    with open(config_path, "wt") as config_file:
        config_file.write(json.dumps(config))

    # Fetch the inputs named in the config to their local paths.
    download_data(config["dataset_path_matrix"], mat_path)
    download_data(config["dataset_path_meta"], meta_path)

    run_model(
        mat_path=mat_path,
        meta_path=meta_path,
        model_instance=parse_model_instance(config),
        predictions_path=predictions_path,
        model_path=model_path,
        val=validation,
        logger=logger,
    )

    # Upload order: predictions, model, config, log.
    artifacts = (
        (predictions_path, "predictions.csv"),
        (model_path, "model.joblib"),
        (config_path, "config.json"),
        (log_path, "recsys.log"),
    )
    for local_path, remote_name in artifacts:
        upload_data(local_path, storage_path + remote_name)

    # Power the machine off once everything has been uploaded.
    os.system("sudo shutdown now")
# Validation script: loads the preprocessed metadata table and the sparse
# feature matrix, then derives row indices for a train/validation split.
import sys

import h5sparse
import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMRanker
from sklearn.metrics import roc_auc_score

# Extend the import path before the `recsys` imports below, which presumably
# rely on it.
# NOTE(review): machine-specific absolute path.
sys.path.append('/home/janice/el/recsys2019/src')

from recsys.config import BEST_PARAMS
from recsys.log_utils import get_logger
from recsys.metric import mrr_fast
from recsys.utils import group_lengths, timer, get_git_hash

logger = get_logger()

# NOTE(review): "Staring" looks like a typo for "Starting"; kept as-is so the
# script's output is unchanged.
print("Staring validation")

with timer("reading data"):
    meta = pd.read_hdf("../../data/proc/vectorizer_1/meta.h5", key="data")
    # The file is opened read-only and only its "matrix" entry is kept.
    mat = h5sparse.File("../../data/proc/vectorizer_1/Xcsr.h5", mode="r")["matrix"]

with timer("splitting data"):
    split_idx = 4000000

    # Training rows: positions where both is_val and is_test equal 0,
    # truncated to the first `split_idx` matches.
    not_val = meta.is_val == 0
    not_test = meta.is_test == 0
    train_ind = np.where(not_val & not_test)[0][:split_idx]

    # Flag-based validation split, currently disabled:
    # val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
    # Validation rows are taken as a fixed positional range instead.
    # NOTE(review): 4868466 is presumably the total row count -- confirm.
    val_ind = np.arange(split_idx, 4868466)

    print("train_ind: {} / val_ind: {}".format(train_ind, val_ind))