# Shared context assumed by the examples below: ROOT_DIR, PAD_WORD,
# PAD_WORD_ID, logger, FORMATTER and get_train_dev_test_data are defined
# elsewhere in the project.
import logging
import pickle
import time
from typing import Set

import nltk
import numpy as np
import pandas
import torch
from gensim.models import KeyedVectors
from pandas import DataFrame


def process_raw_data(in_path="data/Digital_Music_5.json",
                     out_path="data/reviews.json"):
    """
    Read the raw data, drop unused columns, and clean the review text.
    Then save the result to the file system.
    """

    logger.info("reading raw data...")
    df = pandas.read_json(ROOT_DIR.joinpath(in_path), lines=True)
    df = df[["reviewerID", "asin", "reviewText", "overall"]]
    df.columns = ["userID", "itemID", "review", "rating"]
    stop_words = get_stop_words()
    punctuations = get_punctuations()
    lemmatizer = nltk.WordNetLemmatizer()

    def clean_review(review: str):
        review = review.lower()
        # Apostrophes must stay out of the punctuation list so that
        # contractions (e.g. "don't") still match the stop-word list.
        assert "'" not in punctuations
        for p in punctuations:
            review = review.replace(p, " ")
        tokens = review.split()
        tokens = [word for word in tokens if word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return " ".join(tokens)

    logger.info("cleaning review text...")
    df["review"] = df["review"].apply(clean_review)
    df.to_json(ROOT_DIR.joinpath(out_path), orient="records", lines=True)
    logger.info("Processed data saved.")
Example #2
def get_review_dict(data_type: str):
    with open(ROOT_DIR.joinpath(
            f"data/user_review_word_idx_{data_type}.p"), "rb") as f:
        user_review = pickle.load(f)
    with open(ROOT_DIR.joinpath(
            f"data/item_review_word_idx_{data_type}.p"), "rb") as f:
        item_review = pickle.load(f)
    return user_review, item_review
Example #3
def save_review_dict(data: DataFrame, word_vec, data_type: str):
    user_review, item_review = get_reviews_in_idx(data, word_vec)
    with open(ROOT_DIR.joinpath(
            f"data/user_review_word_idx_{data_type}.p"), "wb") as f:
        pickle.dump(user_review, f)
    with open(ROOT_DIR.joinpath(
            f"data/item_review_word_idx_{data_type}.p"), "wb") as f:
        pickle.dump(item_review, f)
def save_model(model: torch.nn.Module, train_time: time.struct_time):
    path = "model/checkpoints/%s_%s.pt" % (
        model.__class__.__name__, time.strftime("%Y%m%d%H%M%S", train_time)
    )
    path = ROOT_DIR.joinpath(path)
    # torch.save(model, ...) pickles the whole model object, not just its
    # state_dict, so the model class must be importable at load time.
    torch.save(model, path)
    logger.info(f"model saved: {path}")
Example #5
def save_embedding_weights(word_vec, out_path="data/embedding_weight.pt"):
    """
    Save the weights of the pre-trained word embedding model to a file,
    so the full word2vec model doesn't have to be loaded when training
    ours. This saves RAM and model-initialization time.
    """

    weight = torch.Tensor(word_vec.vectors)
    torch.save(weight, ROOT_DIR.joinpath(out_path))
    logger.info("Word embedding weight saved.")
Example #6
def get_word_vec(path='data/GoogleNews-vectors-negative300.bin'):
    """
    Read pre-trained word embedding model, and add "<pad>" to it with zero weight.
    """

    logger.info("loading word2vec model...")
    path = ROOT_DIR.joinpath(path)
    word_vec = KeyedVectors.load_word2vec_format(path, binary=True)
    word_vec.add([PAD_WORD], np.zeros([1, 300]))
    logger.critical(f"PAD_WORD_ID is {word_vec.vocab[PAD_WORD].index}.")
    logger.info("word2vec model loaded.")
    return word_vec
Example #7
def get_word_vec(path='data/GoogleNews-vectors-negative300.bin'):
    """
    Read pre-trained word embedding model, and add "<pad>" to it with zero weight.
    """

    logger.info("loading word2vec model...")
    path = ROOT_DIR.joinpath(path)
    word_vec = KeyedVectors.load_word2vec_format(path, binary=True)

    if PAD_WORD not in word_vec:
        word_vec.add([PAD_WORD], np.zeros([1, 300]))
        logger.info("Added PAD_WORD to the word embedding.")

    assert PAD_WORD_ID == word_vec.vocab[PAD_WORD].index, \
        f"PAD_WORD_ID should be {word_vec.vocab[PAD_WORD].index}, not {PAD_WORD_ID}."

    logger.info("word2vec model loaded.")
    return word_vec
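# Note: KeyedVectors.add and word_vec.vocab are gensim 3.x APIs. Under
# gensim 4.x the same steps would look roughly like this (a sketch,
# assuming gensim >= 4.0):
if PAD_WORD not in word_vec:
    word_vec.add_vectors([PAD_WORD], np.zeros([1, 300]))
pad_word_id = word_vec.key_to_index[PAD_WORD]  # replaces .vocab[PAD_WORD].index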
Example #8
def load_model(path: str):
    path = ROOT_DIR.joinpath(path)
    # load model to cpu as default.
    model = torch.load(path, map_location=torch.device('cpu'))
    return model
Example #9
def save_embedding_weights(word_vec, out_path="data/embedding_weight.pt"):
    """
    Save the weights of the pre-trained word embedding model to a file,
    so the full word2vec model doesn't have to be loaded when training
    ours. This saves RAM and model-initialization time.
    """

    weight = torch.Tensor(word_vec.vectors)
    torch.save(weight, ROOT_DIR.joinpath(out_path))
    logger.info("Word embedding weight saved.")


def load_embedding_weights(path="data/embedding_weight.pt"):
    # Join with ROOT_DIR like the other helpers, so the path works from
    # any working directory.
    return torch.load(ROOT_DIR.joinpath(path))


# Find the unknown words in the review text.
# This step is not necessary for model training.
if __name__ == "__main__":
    df = pandas.read_json(ROOT_DIR.joinpath("data/reviews.json"), lines=True)
    word_vec = get_word_vec()
    unknown_words: Set[str] = set()
    for review in df["review"]:
        for word in review.split():
            if word not in word_vec:
                unknown_words.add(word)

    logger.warning(f"{len(unknown_words)} unknown words!")
    with open(ROOT_DIR.joinpath("out/UNKs.txt"), "w", encoding="utf-8") as f:
        for word in unknown_words:
            f.write(f"{word}\n")
def get_punctuations(path="data/punctuations.txt") -> Set[str]:
    with open(ROOT_DIR.joinpath(path)) as f:
        return set(f.read().splitlines())


def get_stop_words(path="data/stopwords.txt") -> Set[str]:
    with open(ROOT_DIR.joinpath(path)) as f:
        return set(f.read().splitlines())


def get_all_data(path="data/reviews.json") -> DataFrame:
    return pandas.read_json(ROOT_DIR.joinpath(path), lines=True)


def get_reviews_in_idx(data: DataFrame, word_vec):
    # Group the reviews by user and by item; the step that converts the
    # review words to word_vec indices is not part of this excerpt.
    review_by_user = dict(
        list(data[["itemID", "review"]].groupby(data["userID"])))
    review_by_item = dict(
        list(data[["userID", "review"]].groupby(data["itemID"])))

    return review_by_user, review_by_item


def get_review_dict():
    with open(ROOT_DIR.joinpath("data/user_review_word_idx.p"), "rb") as f:
        user_review = pickle.load(f)
    with open(ROOT_DIR.joinpath("data/item_review_word_idx.p"), "rb") as f:
        item_review = pickle.load(f)
    return user_review, item_review


if __name__ == "__main__":
    process_raw_data()

    train_data, dev_data, test_data = get_train_dev_test_data()
    known_data = pandas.concat([train_data, dev_data])

    word_vec = get_word_vec()
    save_embedding_weights(word_vec)

    user_review, item_review = get_reviews_in_idx(known_data, word_vec)
    with open(ROOT_DIR.joinpath("data/user_review_word_idx.p"), "wb") as f:
        pickle.dump(user_review, f)
    with open(ROOT_DIR.joinpath("data/item_review_word_idx.p"), "wb") as f:
        pickle.dump(item_review, f)
def add_log_file(logger, path: str):
    # create file handler with INFO level
    fh = logging.FileHandler(ROOT_DIR.joinpath(path))
    fh.setLevel(logging.INFO)
    fh.setFormatter(FORMATTER)
    logger.addHandler(fh)
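# A usage sketch (the logger name and log path are illustrative; FORMATTER
# is assumed to come from the project's logging setup):
logger = logging.getLogger("review_model")
logger.setLevel(logging.INFO)
add_log_file(logger, "out/train.log")
logger.info("this message also goes to out/train.log")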