Beispiel #1
0
    def sample_triple(self, dump=True, load_save=False):
        """Sample triples or load triples samples from files.

        This method is only applicable for basket based Recommender.

        The sample file lives in ``config["system"]["process_dir"]`` and is
        named ``triple_<dataset>[_<percent * 100>]_<n_sample>.csv``; the percent
        part is only present when ``percent`` is set in the dataset config.

        Args:
            dump (bool): Forwarded to ``Sampler`` as ``dump``.
            load_save (bool): Forwarded to ``Sampler`` as ``load_save``.

        Returns:
            Whatever ``Sampler.sample()`` returns.

        """
        dataset_config = self.config["dataset"]
        n_sample = self.config["model"]["n_sample"]

        # Assemble the name from explicit parts. The previous single expression
        # ended with ``... if "percent" in self.config else "" + ".csv"``; since
        # a conditional expression binds looser than ``+``, the result was
        # either missing the ".csv" suffix or was just ".csv".
        name_parts = ["triple", dataset_config["dataset"]]
        # "percent" is read from the dataset section, so look for it there.
        if "percent" in dataset_config:
            name_parts.append(str(dataset_config["percent"] * 100))
        name_parts.append(str(n_sample))
        sample_file_name = "_".join(name_parts) + ".csv"

        self.process_path = self.config["system"]["process_dir"]
        ensureDir(self.process_path)
        sample_file = os.path.join(self.process_path, sample_file_name)
        my_sampler = Sampler(
            self.train,
            sample_file,
            n_sample,
            dump=dump,
            load_save=load_save,
        )
        return my_sampler.sample()
Beispiel #2
0
    def get_adj_mat(self):
        """Return the adjacency matrices, loading them from disk when possible.

        This method is for NGCF model. The three matrices are looked up in a
        folder named after the dataset, the data split and (if configured) the
        percent. When loading fails for any reason they are created with
        ``create_adj_mat`` and saved into that folder.

        Return:
            Tuple ``(adj_mat, norm_adj_mat, mean_adj_mat)``.
        """
        self.init_train_items()

        folder_name = "ngcf_" + self.config["dataset"] + "_" + self.config["data_split"]
        if "percent" in self.config:
            folder_name += "_" + str(self.config["percent"] * 100)

        self.process_path = os.path.join(
            self.config["root_dir"], self.config["process_dir"]
        )
        cache_dir = os.path.join(self.process_path, folder_name)
        ensureDir(cache_dir)
        print(cache_dir)

        # Same order everywhere: plain, normalized, mean.
        cache_files = [
            os.path.join(cache_dir, file_name)
            for file_name in (
                "s_adj_mat.npz",
                "s_norm_adj_mat.npz",
                "s_mean_adj_mat.npz",
            )
        ]

        try:
            started = time()
            adj_mat, norm_adj_mat, mean_adj_mat = [
                sp.load_npz(cache_file) for cache_file in cache_files
            ]
            print("already load adj matrix", adj_mat.shape, time() - started)
        except Exception:
            # Any failure while loading is treated as "not stored yet".
            adj_mat, norm_adj_mat, mean_adj_mat = self.create_adj_mat()
            matrices = (adj_mat, norm_adj_mat, mean_adj_mat)
            for cache_file, matrix in zip(cache_files, matrices):
                sp.save_npz(cache_file, matrix)
        return adj_mat, norm_adj_mat, mean_adj_mat
Beispiel #3
0
    def sample_triple_time(self, dump=True, load_save=False):
        """Sample triples by time or load triples samples from files.

        Only applicable for basket based Recommender.

        The sample file lives in ``<root_dir>/<process_dir>`` and is named
        ``triple_<dataset>[_<percent * 100>]_<time_step>_<n_sample>.csv``.

        Args:
            dump (bool): Forwarded to ``Sampler`` as ``dump``.
            load_save (bool): Forwarded to ``Sampler`` as ``load_save``.

        Returns:
            Whatever ``Sampler.sample_by_time()`` returns.

        """
        # The file name has always fallen back to "_10" when "time_step" is not
        # configured, so the same value is used for the sampling itself instead
        # of raising a KeyError after the sampler has been built.
        # NOTE(review): 10 is inferred from that file-name fallback -- confirm.
        time_step = self.config["time_step"] if "time_step" in self.config else 10
        n_sample = self.config["n_sample"]

        # Assemble the name from explicit parts. The previous single expression
        # ended with ``... if "percent" in self.config else "" + ".csv"``; since
        # a conditional expression binds looser than ``+``, the result was
        # either missing the ".csv" suffix or was just ".csv".
        name_parts = ["triple", self.config["dataset"]]
        if "percent" in self.config:
            name_parts.append(str(self.config["percent"] * 100))
        name_parts.append(str(time_step))
        name_parts.append(str(n_sample))
        sample_file_name = "_".join(name_parts) + ".csv"

        self.process_path = os.path.join(self.config["root_dir"],
                                         self.config["process_dir"])
        ensureDir(self.process_path)
        sample_file = os.path.join(self.process_path, sample_file_name)
        my_sampler = Sampler(
            self.train,
            sample_file,
            n_sample,
            dump=dump,
            load_save=load_save,
        )
        return my_sampler.sample_by_time(time_step)
Beispiel #4
0
    def train_gmf(self):
        """Train GMF.

        Runs at most ``self.config["max_epoch"]`` epochs. At the start of an
        epoch a checkpoint is saved when the evaluation engine reports no
        pending "no update" count, and training stops early once that count
        reaches ``MAX_N_UPDATE``. Afterwards the user, item and ``v`` embedding
        weights are written with ``np.savez`` and the monitor run time is stored
        in ``self.config["run_time"]``.

        Returns:
            Tuple ``(user_embed, item_embed)`` as numpy arrays.
        """
        self.monitor = Monitor(log_dir=self.config["run_dir"],
                               delay=1,
                               gpu_id=self.gpu_id)
        self.model_dir = os.path.join(self.config["model_save_dir"],
                                      self.config["save_name"])
        # Read the epoch limit from self.config like every other setting in
        # this method; the bare name ``config`` is not defined in this scope.
        for epoch in range(self.config["max_epoch"]):
            print(f"Epoch {epoch} starts !")
            print("-" * 80)
            if epoch > 0 and self.eval_engine.n_no_update == 0:
                # previous epoch have already obtained better result
                self.gmfengine.save_checkpoint(model_dir=self.model_dir)

            if self.eval_engine.n_no_update >= MAX_N_UPDATE:
                print(
                    "Early stop criterion triggered, no performance update for "
                    f"{MAX_N_UPDATE} times"
                )
                break

            self.gmfengine.train_an_epoch(epoch_id=epoch,
                                          train_loader=self.data)

        print("Saving embeddings to: %s" % self.config["model_save_dir"])
        model = self.gmfengine.model
        user_embed = model.user_memory.weight.detach().cpu()
        item_embed = model.item_memory.weight.detach().cpu()
        v = model.v.weight.detach().cpu()

        # np.savez appends ".npz" when the file name does not end with it.
        embed_dir = os.path.join(self.config["model_save_dir"],
                                 "pretain/embeddings")
        ensureDir(embed_dir)
        np.savez(embed_dir, user=user_embed, item=item_embed, v=v)
        self.config["run_time"] = self.monitor.stop()

        return np.array(user_embed), np.array(item_embed)
Beispiel #5
0
    def get_adj_mat(self, config):
        """Get the adjacent matrix, if not previously stored then call the function to create.

        This method is for NGCF model. The matrices are looked up in
        ``<process_dir>/<dataset>/ngcf_<dataset>_<data_split>[_<percent * 100>]``;
        when loading fails they are created with ``create_adj_mat`` and saved
        into that folder.

        Args:
            config (dict): Configs with a ``dataset`` section (``dataset``,
                ``data_split`` and optionally ``percent``) and a ``system``
                section (``process_dir``).

        Returns:
            Tuple ``(adj_mat, norm_adj_mat, mean_adj_mat)``.
        """
        dataset_config = config["dataset"]
        process_file_name = ("ngcf_" + dataset_config["dataset"] + "_" +
                             dataset_config["data_split"])
        # "percent" is read from the dataset section, so it has to be looked up
        # there as well. Checking the top-level config never matched, and the
        # matrices of different data subsets ended up in the same folder.
        if "percent" in dataset_config:
            process_file_name += "_" + str(dataset_config["percent"] * 100)

        process_path = os.path.join(
            config["system"]["process_dir"],
            dataset_config["dataset"] + "/",
        )
        process_file_name = os.path.join(process_path, process_file_name)
        ensureDir(process_file_name)
        print(process_file_name)

        adj_file = os.path.join(process_file_name, "s_adj_mat.npz")
        norm_adj_file = os.path.join(process_file_name, "s_norm_adj_mat.npz")
        mean_adj_file = os.path.join(process_file_name, "s_mean_adj_mat.npz")
        try:
            adj_mat = sp.load_npz(adj_file)
            norm_adj_mat = sp.load_npz(norm_adj_file)
            mean_adj_mat = sp.load_npz(mean_adj_file)
            print("already load adj matrix", adj_mat.shape)
        except Exception:
            # Deliberately broad: any failure to load (missing or unreadable
            # file) is treated as "not stored yet" and the matrices are rebuilt.
            adj_mat, norm_adj_mat, mean_adj_mat = self.create_adj_mat()
            sp.save_npz(adj_file, adj_mat)
            sp.save_npz(norm_adj_file, norm_adj_mat)
            sp.save_npz(mean_adj_file, mean_adj_mat)
        return adj_mat, norm_adj_mat, mean_adj_mat
Beispiel #6
0
    def prepare_env(self):
        """Prepare running environment.

        * Load parameters from the json file ``self.args.config_file`` and
          update them with the command line args.
        * Build a unique model run id.
        * Initialize random seed.
        * Initialize system folders and resolve the paths to be saved.
        * Initialize logging.
        * Point the ray temp dir to ``<root_dir>/tmp``.

        Returns:
            dict: The config with all resolved entries written into it.
        """
        config_file = self.args.config_file
        with open(config_file) as json_file:
            print(f"loading config file {config_file}")
            config = json.load(json_file)

        # Values received from the command line are merged into the config.
        update_args(config, self.args)

        def under_root(*parts):
            # Join ``parts`` onto the current project root.
            return os.path.join(config["system"]["root_dir"], *parts)

        # Make the project root absolute before anything is resolved against it.
        config["system"]["root_dir"] = os.path.abspath(
            config["system"]["root_dir"]
        )

        # Unique run id: <model>_<config_id>_<timestamp>_<six random letters>.
        now_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        letters = "".join(
            random.choice(string.ascii_lowercase) for _ in range(6)
        )
        run_prefix = config["model"]["model"] + "_" + config["model"]["config_id"]
        config["system"]["model_run_id"] = run_prefix + "_" + now_str + "_" + letters

        # Random seed, 2020 unless configured.
        set_seed(config["system"].get("seed", 2020))

        # Working folders.
        self.initialize_folders(config)

        config["system"]["process_dir"] = under_root(
            config["system"]["process_dir"]
        )

        # Log file.
        config["system"]["log_file"] = under_root(
            config["system"]["log_dir"], config["system"]["model_run_id"]
        )
        logger.init_std_logger(config["system"]["log_file"])

        print("Python version:", sys.version)
        print("pytorch version:", torch.__version__)

        # Folder for the intermediate running statuses; stored in both sections.
        run_dir = under_root(
            config["system"]["run_dir"], config["system"]["model_run_id"]
        )
        config["model"]["run_dir"] = run_dir
        config["system"]["run_dir"] = config["model"]["run_dir"]
        print(
            "The intermediate running statuses will be reported in folder:",
            config["system"]["run_dir"],
        )

        config["system"]["tune_dir"] = under_root(config["system"]["tune_dir"])

        def get_user_temp_dir():
            tempdir = under_root("tmp")
            print(f"ray temp dir {tempdir}")
            return tempdir

        # Replace ray's temp dir lookup with the project-local one.
        ray.utils.get_user_temp_dir = get_user_temp_dir

        # Folder for the model checkpoints.
        config["system"]["model_save_dir"] = under_root(
            config["system"]["checkpoint_dir"], config["system"]["model_run_id"]
        )
        ensureDir(config["system"]["model_save_dir"])
        print(
            "Model checkpoint will save in file:",
            config["system"]["model_save_dir"],
        )

        # File for the performance results.
        config["system"]["result_file"] = under_root(
            config["system"]["result_dir"], config["system"]["result_file"]
        )
        print(
            "Performance result will save in file:",
            config["system"]["result_file"],
        )

        print_dict_as_table(config["system"], "System configs")
        return config
Beispiel #7
0
def prepare_env(config):
    """Prepare running environment.

        - Load parameters from the json file ``config["config_file"]``.
        - Build a unique model run id.
        - Initialize random seed.
        - Initialize system folders and resolve the paths to be saved.
        - Initialize logging.

    Args:
        config (dict): Global configs. ``root_dir`` is filled in when missing.

    Returns:
        dict: The parameters loaded from the json file, overridden by the
        entries of ``config`` and extended with the resolved paths.

    """
    # Project root defaults to the parent of this file's folder.
    # You need to specify it if it is running in the container.
    if "root_dir" not in config:
        here = os.path.dirname(os.path.abspath(__file__))
        config["root_dir"] = os.path.abspath(os.path.join(here, ".."))

    config_file = config["config_file"]
    with open(config_file) as json_file:
        print("loading config file", config_file)
        merged = json.load(json_file)

    # Parameters received from the command line win over the json values.
    merged.update(config)
    config = merged

    def under_root(*parts):
        # Join ``parts`` onto the current project root.
        return os.path.join(config["root_dir"], *parts)

    # Unique run id: <model>_<config_id>_<timestamp>_<six random letters>.
    now_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    letters = "".join(random.choice(string.ascii_lowercase) for _ in range(6))
    run_prefix = config["model"] + "_" + config["config_id"]
    config["model_run_id"] = run_prefix + "_" + now_str + "_" + letters

    # Random seed, 2020 unless configured.
    set_seed(config.get("seed", 2020))
    initialize_folders(config["root_dir"])

    # Log file.
    config["log_file"] = under_root(config["log_dir"], config["model_run_id"])
    logger.init_std_logger(config["log_file"])

    print("python version:", sys.version)
    print("pytorch version:", torch.__version__)

    # Folder for the intermediate running statuses.
    config["run_dir"] = under_root(config["run_dir"], config["model_run_id"])
    print(
        "The intermediate running statuses will be reported in folder:",
        config["run_dir"],
    )

    # Folder for the model checkpoints.
    config["model_save_dir"] = under_root(
        config["checkpoint_dir"], config["model_run_id"]
    )
    ensureDir(config["model_save_dir"])
    print("Model checkpoint will save in file:", config["model_save_dir"])

    # File for the performance results.
    config["result_file"] = under_root(
        config["result_dir"], config["result_file"]
    )
    print("Performance result will save in file:", config["result_file"])

    print_dict_as_table(config, "Model configs")
    return config