def __init__(self, data: DataFrame, user_review_dict: Dict[str, DataFrame],
             item_review_dict: Dict[str, DataFrame], config: NarreConfig):
    """
    Init a NarreDataset.

    :param data: original data with columns ["userID", "itemID", "review", "rating"]
    :param user_review_dict: the reviews grouped by userID
    :param item_review_dict: the reviews grouped by itemID
    :param config: the config of the Narre model
    """
    super().__init__()
    self.data = data
    self.user_review_dict = user_review_dict
    self.item_review_dict = item_review_dict
    self.config = config

    logger.info("Loading dataset...")
    # Each loader returns a (reviews, ids, cross-ids-per-review) triple.
    (self.user_review,
     self.user_id,
     self.item_ids_per_review) = self.load_user_review_data()
    (self.item_review,
     self.item_id,
     self.user_ids_per_review) = self.load_item_review_data()
    # Ratings as a column vector so they align with per-sample predictions.
    self.ratings = torch.Tensor(self.data["rating"].to_list()).view(-1, 1)
    logger.info("Dataset loaded.")
def save_model(model: torch.nn.Module, train_time: time.struct_time):
    """
    Save the whole model under model/checkpoints, named after the model
    class and the training start time.

    :param model: the model to persist (saved in full, not just state_dict)
    :param train_time: training start time used to stamp the file name
    """
    timestamp = time.strftime("%Y%m%d%H%M%S", train_time)
    relative = "model/checkpoints/%s_%s.pt" % (model.__class__.__name__, timestamp)
    path = ROOT_DIR.joinpath(relative)
    torch.save(model, path)
    logger.info(f"model saved: {path}")
def save_embedding_weights(word_vec, out_path="data/embedding_weight.pt"):
    """
    Save the weights of pre-trained word embedding model to file.
    Thus we don't need to load it when train our model.
    This helps to save RAM and model init time.

    :param word_vec: a loaded gensim KeyedVectors-like object exposing .vectors
    :param out_path: destination path, relative to ROOT_DIR
    """
    torch.save(torch.Tensor(word_vec.vectors), ROOT_DIR.joinpath(out_path))
    logger.info("Word embedding weight saved.")
def get_word_vec(path='data/GoogleNews-vectors-negative300.bin'):
    """
    Read pre-trained word embedding model, and add "<pad>" to it with zero weight.

    :param path: path of the binary word2vec file, relative to ROOT_DIR
    :return: the loaded KeyedVectors, guaranteed to contain PAD_WORD
    """
    logger.info("loading word2vec model...")
    path = ROOT_DIR.joinpath(path)
    word_vec = KeyedVectors.load_word2vec_format(path, binary=True)
    # Fix: only append PAD_WORD when it is not already in the vocabulary,
    # matching the guarded variant of this function used elsewhere in the
    # project; an unconditional add misbehaves on a model that already has it.
    if PAD_WORD not in word_vec:
        word_vec.add([PAD_WORD], np.zeros([1, 300]))
    # Fix: this is routine startup information, not an emergency -- log at
    # INFO instead of CRITICAL so it does not trip alerting on log level.
    logger.info(f"PAD_WORD_ID is {word_vec.vocab[PAD_WORD].index}.")
    logger.info("word2vec model loaded.")
    return word_vec
def get_max_review_length(data: DataFrame, percentile: float = 0.85) -> int:
    """
    Return the review length (in words) at the given percentile of the data.

    We set the max review length to 85% percentile of all data as default.

    :param data: DataFrame with a "review" column of whitespace-joined tokens
    :param percentile: quantile used as the cutoff
    """
    word_counts = data["review"].apply(lambda review: len(review.split()))
    # "lower" interpolation keeps the cutoff at an actually-observed length.
    max_length = int(word_counts.quantile(percentile, interpolation="lower"))
    logger.info(f"Max review length = {max_length}.")
    return max_length
def train_model(model: BaseModel, train_data: DataFrame, dev_data: DataFrame):
    # Full training loop: Adam + exponential LR decay, MSE loss, per-epoch
    # evaluation on train/dev, and checkpointing of the best-by-train-loss model.
    model_name = model.__class__.__name__
    train_time = time.localtime()
    # Mirror this run's log output into a timestamped file under log/.
    add_log_file(logger, "log/%s_%s.log" % (model_name, time.strftime("%Y%m%d%H%M%S", train_time)))
    logger.info("Training %s..." % model_name)
    config: BaseConfig = model.config
    logger.info(config.__dict__)
    model.to(config.device)
    opt = torch.optim.Adam(model.parameters(), config.learning_rate,
                           weight_decay=config.l2_regularization)
    lr_s = lr_scheduler.ExponentialLR(opt, gamma=config.learning_rate_decay)
    loss = torch.nn.MSELoss()
    last_progress = 0.
    # Lowest train loss seen so far; used as the checkpointing criterion below.
    last_loss = float("inf")
    train_data_iter = get_data_loader(train_data, config)
    dev_data_iter = get_data_loader(dev_data, config)
    batches_num = math.ceil(len(train_data) / float(config.batch_size))
    # model.current_epoch is persisted on the model so training can resume
    # from a loaded checkpoint -- presumably initialized to 0 elsewhere; verify.
    while model.current_epoch < config.num_epochs:
        model.train()
        for batch_id, iter_i in enumerate(train_data_iter):
            user_review, item_review, rating = iter_i
            user_review = user_review.to(config.device)
            item_review = item_review.to(config.device)
            rating = rating.to(config.device)
            opt.zero_grad()
            predict = model(user_review, item_review)
            li = loss(predict, rating)
            li.backward()
            opt.step()
            # log progress
            current_batches = model.current_epoch * batches_num + (batch_id + 1.0)
            total_batches = config.num_epochs * batches_num
            progress = current_batches / total_batches
            # Throttle debug logging to roughly every 0.1% of total progress.
            if progress - last_progress > 0.001:
                logger.debug("epoch %d, batch %d, loss: %f (%.2f%%)"
                             % (model.current_epoch, batch_id, li.item(), 100.0 * progress))
                last_progress = progress
        # complete one epoch
        train_loss = eval_model(model, train_data_iter, loss)
        dev_loss = eval_model(model, dev_data_iter, loss)
        logger.info("Epoch %d complete. Total loss(train/dev)=%f/%f"
                    % (model.current_epoch, train_loss, dev_loss))
        # save best model
        if train_loss < last_loss:
            last_loss = train_loss
            save_model(model, train_time)
        # NOTE(review): passing an epoch to ExponentialLR.step() is a deprecated
        # signature in newer torch versions -- confirm the pinned torch accepts it.
        lr_s.step(model.current_epoch)
        model.current_epoch += 1
    logger.info("%s trained!" % model_name)
    remove_log_file(logger)
def get_max_review_count(data: DataFrame, percentile: float = 0.85):
    """
    Return the per-user / per-item review count at the given percentile,
    taking the larger of the two.

    We set the max review count to 85% percentile of all data as default.

    :param data: DataFrame with "userID", "itemID" and "review" columns
    :param percentile: quantile used as the cutoff
    """
    def _count_at_percentile(group_key):
        # Number of reviews per group, then the percentile over those counts.
        counts = data["review"].groupby([group_key]).count()
        return int(counts.quantile(percentile, interpolation="lower"))

    max_count = max(_count_at_percentile(data["itemID"]),
                    _count_at_percentile(data["userID"]))
    logger.info(f"Max review count = {max_count}.")
    return max_count
def get_data_loader(data: DataFrame, config: BaseConfig):
    """
    Build a shuffled DataLoader yielding (user_reviews, item_reviews, ratings)
    batches for the given data.

    :param data: DataFrame with "userID", "itemID" and "rating" columns
    :param config: model config supplying max_review_length, batch_size, device
    """
    logger.info("Generating data iter...")
    review_by_user, review_by_item = get_review_dict()

    # Iterate the (user, item) pairs once and build both review tensors from it.
    pairs = list(zip(data["userID"], data["itemID"]))
    user_reviews = torch.stack([
        torch.LongTensor(load_reviews(review_by_user, uid, iid, config.max_review_length))
        for uid, iid in pairs])
    item_reviews = torch.stack([
        torch.LongTensor(load_reviews(review_by_item, iid, uid, config.max_review_length))
        for uid, iid in pairs])
    ratings = torch.Tensor(data["rating"].to_list()).view(-1, 1)

    dataset = torch.utils.data.TensorDataset(user_reviews, item_reviews, ratings)
    # Pinned host memory only pays off when batches are copied to a CUDA device.
    pin_memory = config.device not in ["cpu", "CPU"]
    data_iter = torch.utils.data.DataLoader(dataset,
                                            batch_size=config.batch_size,
                                            shuffle=True,
                                            pin_memory=pin_memory)
    logger.info("Data iter loaded.")
    return data_iter
def process_raw_data(in_path="data/Digital_Music_5.json", out_path="data/reviews.json"):
    """
    Read raw data and remove useless columns and clear review text.
    Then save the result to file system.

    :param in_path: raw Amazon-review JSON-lines file, relative to ROOT_DIR
    :param out_path: destination JSON-lines file, relative to ROOT_DIR
    """
    logger.info("reading raw data...")
    df = pandas.read_json(ROOT_DIR.joinpath(in_path), lines=True)
    df = df[["reviewerID", "asin", "reviewText", "overall"]]
    df.columns = ["userID", "itemID", "review", "rating"]

    stop_words = get_stop_words()
    punctuations = get_punctuations()
    lemmatizer = nltk.WordNetLemmatizer()

    def clean_review(review: str):
        # Lower-case, blank out punctuation, drop stop words, then lemmatize.
        review = review.lower()
        # Apostrophes must survive so contractions are not split mid-word.
        assert "'" not in punctuations
        for p in punctuations:
            review = review.replace(p, " ")
        cleaned = [lemmatizer.lemmatize(word)
                   for word in review.split()
                   if word not in stop_words]
        return " ".join(cleaned)

    logger.info("cleaning review text...")
    df["review"] = df["review"].apply(clean_review)
    df.to_json(ROOT_DIR.joinpath(out_path), orient="records", lines=True)
    logger.info("Processed data saved.")
def get_word_vec(path='data/GoogleNews-vectors-negative300.bin'):
    """
    Read pre-trained word embedding model, and add "<pad>" to it with zero weight.

    :param path: path of the binary word2vec file, relative to ROOT_DIR
    :return: the loaded KeyedVectors, guaranteed to contain PAD_WORD
    """
    logger.info("loading word2vec model...")
    model_path = ROOT_DIR.joinpath(path)
    word_vec = KeyedVectors.load_word2vec_format(model_path, binary=True)
    # Append the padding token only once; a reloaded model may already have it.
    if PAD_WORD not in word_vec:
        word_vec.add([PAD_WORD], np.zeros([1, 300]))
        logger.info(f"Add PAD_WORD to word embedding.")
    # The rest of the pipeline hard-codes PAD_WORD_ID; fail fast on mismatch.
    assert PAD_WORD_ID == word_vec.vocab[PAD_WORD].index, \
        f"PAD_WORD_ID should be {word_vec.vocab[PAD_WORD].index} but not {PAD_WORD_ID}."
    logger.info("word2vec model loaded.")
    return word_vec
def train_model(model: BaseModel, train_data: DataFrame, dev_data: DataFrame,
                is_save_model: bool = True):
    # NARRE training loop: Adam + exponential LR decay, MSE loss, per-epoch
    # evaluation on train/dev, optional checkpointing of the best train loss.
    # Returns the minimum train loss reached over all epochs.
    model_name = model.__class__.__name__
    train_time = time.localtime()
    # Mirror this run's log output into a timestamped file under log/.
    add_log_file(
        logger,
        "log/%s_%s.log" % (model_name, time.strftime("%Y%m%d%H%M%S", train_time)))
    logger.info("Training %s..." % model_name)
    config: BaseConfig = model.config
    logger.info(config.__dict__)
    model.to(config.device)
    opt = torch.optim.Adam(model.parameters(), config.learning_rate,
                           weight_decay=config.l2_regularization)
    lr_s = lr_scheduler.ExponentialLR(opt, gamma=config.learning_rate_decay)
    loss = torch.nn.MSELoss()
    last_progress = 0.
    min_loss = float("inf")
    # Pinned host memory only pays off when batches are copied to a CUDA device.
    pin_memory = config.device not in ["cpu", "CPU"]
    # Both splits reuse the "train" review dictionaries so dev never leaks
    # test-time reviews into the model.
    review_by_user, review_by_item = get_review_dict("train")
    dataset = NarreDataset(train_data, review_by_user, review_by_item, config)
    train_data_iter = DataLoader(dataset, batch_size=config.batch_size,
                                 shuffle=True, pin_memory=pin_memory)
    dataset = NarreDataset(dev_data, review_by_user, review_by_item, config)
    dev_data_iter = DataLoader(dataset, batch_size=config.batch_size,
                               shuffle=True, pin_memory=pin_memory)
    batches_num = math.ceil(len(train_data) / config.batch_size)
    # model.current_epoch lives on the model so training can resume from a
    # loaded checkpoint -- presumably initialized to 0 elsewhere; verify.
    while model.current_epoch < config.num_epochs:
        model.train()
        for batch_id, iter_i in enumerate(train_data_iter):
            user_review, user_id, item_id_per_review, item_review, item_id, user_id_per_review, rating = iter_i
            user_review = user_review.to(config.device)
            user_id = user_id.to(config.device)
            item_id_per_review = item_id_per_review.to(config.device)
            item_review = item_review.to(config.device)
            item_id = item_id.to(config.device)
            user_id_per_review = user_id_per_review.to(config.device)
            rating = rating.to(config.device)
            opt.zero_grad()
            predict = model(user_review, user_id, item_id_per_review,
                            item_review, item_id, user_id_per_review)
            li = loss(predict, rating)
            li.backward()
            opt.step()
            # log progress
            current_batches = model.current_epoch * batches_num + batch_id + 1
            total_batches = config.num_epochs * batches_num
            progress = current_batches / total_batches
            # Throttle debug logging to roughly every 0.1% of total progress.
            if progress - last_progress > 0.001:
                logger.debug(
                    "epoch %d, batch %d, loss: %f (%.2f%%)"
                    % (model.current_epoch, batch_id, li.item(), 100 * progress))
                last_progress = progress
        # complete one epoch
        train_loss = eval_model(model, train_data_iter, loss)
        dev_loss = eval_model(model, dev_data_iter, loss)
        logger.info("Epoch %d complete. Total loss(train/dev)=%f/%f"
                    % (model.current_epoch, train_loss, dev_loss))
        # save best model
        if train_loss < min_loss:
            min_loss = train_loss
            logger.info(f"Get min loss: {train_loss}")
            if is_save_model:
                save_model(model, train_time)
        # NOTE(review): passing an epoch to ExponentialLR.step() is a deprecated
        # signature in newer torch versions -- confirm the pinned torch accepts it.
        lr_s.step(model.current_epoch)
        model.current_epoch += 1
    logger.info("%s trained!" % model_name)
    remove_log_file(logger)
    return min_loss
"""Evaluate a saved NARRE checkpoint on the held-out test split."""
import torch
from torch.utils.data import DataLoader

from utils.data_reader import get_train_dev_test_data, get_review_dict
from utils.data_set import NarreDataset
from utils.log_hepler import logger
from utils.train_helper import load_model, eval_model

train_data, dev_data, test_data = get_train_dev_test_data()

# Restore the trained model and move it onto the evaluation device.
model = load_model("model/checkpoints/NarreModel_20200606153827.pt")
model.config.device = "cuda:1"
model.to(model.config.device)

loss = torch.nn.MSELoss()

# Build the test iterator from the test-split review dictionaries.
review_by_user, review_by_item = get_review_dict("test")
test_set = NarreDataset(test_data, review_by_user, review_by_item, model.config)
data_iter = DataLoader(test_set, batch_size=model.config.batch_size, shuffle=True)

logger.info(f"Loss on test dataset: {eval_model(model, data_iter, loss)}")
"wb")) def get_review_dict(data_type: str): user_review = pickle.load( open(ROOT_DIR.joinpath(f"data/user_review_word_idx_{data_type}.p"), "rb")) item_review = pickle.load( open(ROOT_DIR.joinpath(f"data/item_review_word_idx_{data_type}.p"), "rb")) return user_review, item_review if __name__ == "__main__": process_raw_data() logger.info(f"Max user id = {get_max_user_id()}") logger.info(f"Max item id = {get_max_item_id()}") train_data, dev_data, test_data = get_train_dev_test_data() known_data = pandas.concat([train_data, dev_data]) all_data = pandas.concat([train_data, dev_data, test_data]) logger.info(f"Max review length = {get_max_review_length(all_data)}") logger.info(f"Max review count = {get_max_review_count(all_data)}") word_vec = get_word_vec() save_embedding_weights(word_vec) save_review_dict(known_data, word_vec, "train") save_review_dict(all_data, word_vec, "test")
ids = ids[:config.review_count] # pad review count pad_length = config.review_count - len(reviews) pad_review = [config.pad_word_id] * config.review_length reviews += [pad_review] * pad_length ids += [pad_id] * pad_length return reviews, ids if __name__ == '__main__': train_data, _, _ = get_train_dev_test_data() review_by_user, review_by_item = get_review_dict("train") config = NarreConfig(review_length=50, review_count=20, pad_word_id=PAD_WORD_ID, pad_item_id=get_max_item_id() + 1, pad_user_id=get_max_user_id() + 1) dataset = NarreDataset(train_data, review_by_user, review_by_item, config) loader = DataLoader(dataset, batch_size=128, shuffle=True) for user_review, user_id, item_id_per_review, item_review, item_id, user_id_per_review, rating in loader: logger.info(f"{user_review.shape}, " f"{user_id.shape}" f"{item_id_per_review.shape}, " f"{item_review.shape}, " f"{item_id.shape}" f"{user_id_per_review.shape}, " f"{rating.shape}")