def save(self, path):
    """
    Save a model.

    The model can be saved, then reloaded later to provide recommendations.

    Parameters
    ----------
    path : str
        The location where the model will be saved.  ``delete_file_or_dir``
        is called on this location first (presumably removing whatever is
        already there -- confirm against that helper), and then a new
        directory is created at ``path``.  Three items are written, at the
        locations returned by ``self._file_paths(path)``: the underlying
        model parameters, the original ratings, and the column names.
    """
    spark_context = CommonSparkContext.Instance().sc()

    # Start from a freshly created directory at ``path``.
    delete_file_or_dir(path)
    os.makedirs(path)

    model_dest, ratings_dest, metadata_dest = self._file_paths(path)

    # Underlying model parameters.
    self.model.save(spark_context, model_dest)

    # Original ratings.
    self.ratings.save(ratings_dest)

    # Column names, pickled as a three-element list in the order
    # (user, item, rating).
    column_names = [self.user_col, self.item_col, self.rating_col]
    # TODO detect filesystem errors
    # NOTE(review): the file is opened with mode 'w'; pickling into a
    # text-mode handle only works on Python 2 -- confirm the target version
    # and which modes fileio.open_file accepts before changing it.
    with fileio.open_file(metadata_dest, 'w') as meta_file:
        pickle.dump(column_names, meta_file)
def load(cls, path):
    """
    Load a model that was saved previously.

    Parameters
    ----------
    path : str
        The path where the model files are stored.  This is the same path
        that was passed to ``save``.  The model, ratings and metadata are
        read from the three locations returned by ``cls._file_paths(path)``.

    Returns
    -------
    out : MatrixFactorizationModel
        A model that can be used to predict ratings.  Concretely this is
        ``cls(model, ratings, user_col, item_col, rating_col)``.

    Notes
    -----
    The metadata file is unpickled.  Unpickling can execute arbitrary code,
    so only load models from locations you trust.
    """
    # NOTE(review): this takes ``cls`` and calls ``cls(...)``, so it is
    # presumably declared as a classmethod; the decorator is not visible
    # here -- confirm at the definition site.
    sc = CommonSparkContext.Instance().sc()
    model_path, ratings_path, metadata_path = cls._file_paths(path)

    # load model
    model = recommendation.MatrixFactorizationModel.load(sc, model_path)

    # load ratings
    ratings = XFrame.load(ratings_path)

    # load metadata
    # Read through the same fileio layer that ``save`` writes with, instead
    # of the built-in ``open``, so both ends of the round trip agree on how
    # ``metadata_path`` is interpreted.
    # SECURITY: pickle.load must not be used on untrusted files.
    with fileio.open_file(metadata_path, 'r') as f:
        user_col, item_col, rating_col = pickle.load(f)

    return cls(model, ratings, user_col, item_col, rating_col)
def train(self):
    """
    Fit a Word2Vec model on the corpus.

    The corpus is converted with ``self.corpus.to_spark_rdd()`` and a
    ``Word2Vec`` estimator configured with ``self.vector_size`` and
    ``self.seed`` is fitted on the result.

    Returns
    -------
    out : TextModel
        The fitted model, wrapped in a ``TextModel``.
    """
    # The context object is not used below; the lookup is kept so that any
    # start-up work it performs still happens before training
    # (presumably it initializes the shared Spark context -- confirm).
    CommonSparkContext.Instance().sc()

    corpus_rdd = self.corpus.to_spark_rdd()

    # Configure the estimator one setting at a time; each setter's return
    # value is carried forward to the next call.
    estimator = Word2Vec()
    estimator = estimator.setVectorSize(self.vector_size)
    estimator = estimator.setSeed(self.seed)

    fitted = estimator.fit(corpus_rdd)
    return TextModel(fitted)