import logging
import pickle
import time
from typing import Set

import nltk
import numpy as np
import pandas
import torch
from gensim.models import KeyedVectors
from pandas import DataFrame

# ROOT_DIR, PAD_WORD, PAD_WORD_ID, logger, and FORMATTER are assumed to come
# from the project's config and logging modules.


def process_raw_data(in_path="data/Digital_Music_5.json", out_path="data/reviews.json"):
    """
    Read the raw data, drop unused columns, and clean the review text.
    Then save the result to the file system.
    """
    logger.info("reading raw data...")
    df = pandas.read_json(ROOT_DIR.joinpath(in_path), lines=True)
    df = df[["reviewerID", "asin", "reviewText", "overall"]]
    df.columns = ["userID", "itemID", "review", "rating"]

    stop_words = get_stop_words()
    punctuations = get_punctuations()
    lemmatizer = nltk.WordNetLemmatizer()

    def clean_review(review: str):
        review = review.lower()
        # Apostrophes must not be stripped, otherwise contractions such as
        # "don't" no longer match the stop-word list.
        assert "'" not in punctuations
        for p in punctuations:
            review = review.replace(p, " ")
        tokens = review.split()
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return " ".join(tokens)

    logger.info("cleaning review text...")
    df["review"] = df["review"].apply(clean_review)
    df.to_json(ROOT_DIR.joinpath(out_path), orient="records", lines=True)
    logger.info("Processed data saved.")
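# Usage sketch (not part of the pipeline): what clean_review does to a single
# sentence, assuming nltk's WordNet data is downloaded and that "these" and
# "are" appear in the stop-word list.
if __name__ == "__main__":
    demo_lemmatizer = nltk.WordNetLemmatizer()
    demo_text = "These songs are amazing!".lower().replace("!", " ")
    demo_tokens = [w for w in demo_text.split() if w not in {"these", "are"}]
    print([demo_lemmatizer.lemmatize(w) for w in demo_tokens])  # ['song', 'amazing']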
def get_review_dict(data_type: str):
    with open(ROOT_DIR.joinpath(f"data/user_review_word_idx_{data_type}.p"), "rb") as f:
        user_review = pickle.load(f)
    with open(ROOT_DIR.joinpath(f"data/item_review_word_idx_{data_type}.p"), "rb") as f:
        item_review = pickle.load(f)
    return user_review, item_review
def save_review_dict(data: DataFrame, word_vec, data_type: str):
    user_review, item_review = get_reviews_in_idx(data, word_vec)
    with open(ROOT_DIR.joinpath(f"data/user_review_word_idx_{data_type}.p"), "wb") as f:
        pickle.dump(user_review, f)
    with open(ROOT_DIR.joinpath(f"data/item_review_word_idx_{data_type}.p"), "wb") as f:
        pickle.dump(item_review, f)
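# Round-trip sketch, assuming `train_data` is a cleaned DataFrame and
# `word_vec` the loaded KeyedVectors model (see get_word_vec below):
#
#     save_review_dict(train_data, word_vec, "train")
#     user_review, item_review = get_review_dict("train")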
def save_model(model: torch.nn.Module, train_time: time.struct_time):
    path = "model/checkpoints/%s_%s.pt" % (
        model.__class__.__name__,
        time.strftime("%Y%m%d%H%M%S", train_time),
    )
    path = ROOT_DIR.joinpath(path)
    torch.save(model, path)
    logger.info(f"model saved: {path}")
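# Minimal sketch: checkpoints are stamped with the time training started, so
# pass the struct_time captured before the training loop. The toy module here
# is only for illustration.
if __name__ == "__main__":
    train_start = time.localtime()
    toy_model = torch.nn.Linear(4, 1)
    save_model(toy_model, train_start)  # e.g. model/checkpoints/Linear_20200101000000.pt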
def save_embedding_weights(word_vec, out_path="data/embedding_weight.pt"):
    """
    Save the weights of the pre-trained word embedding model to a file,
    so we don't need to load the full model when training ours.
    This saves RAM and model init time.
    """
    weight = torch.Tensor(word_vec.vectors)
    torch.save(weight, ROOT_DIR.joinpath(out_path))
    logger.info("Word embedding weight saved.")
def get_word_vec(path='data/GoogleNews-vectors-negative300.bin'):
    """
    Read the pre-trained word embedding model, and add "<pad>" to it
    with zero weight.
    """
    logger.info("loading word2vec model...")
    path = ROOT_DIR.joinpath(path)
    word_vec = KeyedVectors.load_word2vec_format(path, binary=True)
    if PAD_WORD not in word_vec:
        word_vec.add([PAD_WORD], np.zeros([1, 300]))
        logger.info("Added PAD_WORD to the word embedding.")
    assert PAD_WORD_ID == word_vec.vocab[PAD_WORD].index, \
        f"PAD_WORD_ID should be {word_vec.vocab[PAD_WORD].index}, not {PAD_WORD_ID}."
    logger.info("word2vec model loaded.")
    return word_vec
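# The .add()/.vocab calls above follow the gensim 3.x API; gensim 4.x renamed
# them to .add_vectors()/.key_to_index. A version-agnostic lookup, assuming
# only these two major versions need support:
def pad_word_id(word_vec) -> int:
    if hasattr(word_vec, "key_to_index"):  # gensim >= 4.0
        return word_vec.key_to_index[PAD_WORD]
    return word_vec.vocab[PAD_WORD].index  # gensim 3.x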
def load_model(path: str):
    path = ROOT_DIR.joinpath(path)
    # Load the model onto the CPU by default; move it to GPU explicitly if needed.
    model = torch.load(path, map_location=torch.device('cpu'))
    return model
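# Usage sketch: reload a checkpoint (the file name here is hypothetical) and
# move it to the GPU when one is available.
if __name__ == "__main__":
    model = load_model("model/checkpoints/DeepCoNN_20200101000000.pt")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device).eval()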
""" Save the weights of pre-trained word embedding model to file. Thus we don't need to load it when train our model. This helps to save RAM and model init time. """ weight = torch.Tensor(word_vec.vectors) torch.save(weight, ROOT_DIR.joinpath(out_path)) logger.info("Word embedding weight saved.") def load_embedding_weights(path="data/embedding_weight.pt"): return torch.load(path) # Find the unknowns words in review text. # This step is not necessary for model train. if __name__ == "__main__": df = pandas.read_json(ROOT_DIR.joinpath("data/reviews.json"), lines=True) word_vec = get_word_vec() unknown_words: Set[str] = set() for review in df["review"]: for word in review.split(): if word not in word_vec: unknown_words.add(word) logger.warning(f"{len(unknown_words)} unknown words!") with open(ROOT_DIR.joinpath("out/UNKs.txt"), "w", encoding="utf-8") as f: for word in unknown_words: f.write(f"{word}\n")
def get_review_dict():
    with open(ROOT_DIR.joinpath("data/user_review_word_idx.p"), "rb") as f:
        user_review = pickle.load(f)
    with open(ROOT_DIR.joinpath("data/item_review_word_idx.p"), "rb") as f:
        item_review = pickle.load(f)
    return user_review, item_review
def get_punctuations(path="data/punctuations.txt") -> Set[str]:
    with open(ROOT_DIR.joinpath(path)) as f:
        return set(f.read().splitlines())
def get_stop_words(path="data/stopwords.txt") -> Set[str]:
    with open(ROOT_DIR.joinpath(path)) as f:
        return set(f.read().splitlines())
def get_all_data(path="data/reviews.json") -> DataFrame:
    return pandas.read_json(ROOT_DIR.joinpath(path), lines=True)
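# get_train_dev_test_data() is called in the __main__ block below but is not
# defined in this file; a minimal sketch, assuming a shuffled 8:1:1 split
# (both the shuffle seed and the ratios are assumptions):
def get_train_dev_test_data(path="data/reviews.json"):
    data = get_all_data(path).sample(frac=1, random_state=42).reset_index(drop=True)
    train_end, dev_end = int(len(data) * 0.8), int(len(data) * 0.9)
    return data.iloc[:train_end], data.iloc[train_end:dev_end], data.iloc[dev_end:]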
def get_reviews_in_idx(data: DataFrame, word_vec):
    # (The word-to-index conversion of the review column precedes this point;
    # it is not shown in this excerpt.)
    review_by_user = dict(
        list(data[["itemID", "review"]].groupby(data["userID"])))
    review_by_item = dict(
        list(data[["userID", "review"]].groupby(data["itemID"])))
    return review_by_user, review_by_item


if __name__ == "__main__":
    process_raw_data()
    train_data, dev_data, test_data = get_train_dev_test_data()
    known_data = pandas.concat([train_data, dev_data])
    word_vec = get_word_vec()
    save_embedding_weights(word_vec)
    user_review, item_review = get_reviews_in_idx(known_data, word_vec)
    with open(ROOT_DIR.joinpath("data/user_review_word_idx.p"), "wb") as f:
        pickle.dump(user_review, f)
    with open(ROOT_DIR.joinpath("data/item_review_word_idx.p"), "wb") as f:
        pickle.dump(item_review, f)
def add_log_file(logger, path: str):
    # Create a file handler at INFO level and attach it to the logger.
    fh = logging.FileHandler(ROOT_DIR.joinpath(path))
    fh.setLevel(logging.INFO)
    fh.setFormatter(FORMATTER)
    logger.addHandler(fh)
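# Usage sketch: attach a file handler to the module logger (the log path is
# an assumption).
if __name__ == "__main__":
    add_log_file(logger, "out/train.log")
    logger.info("logging to file as well as to the console")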