def _read_and_decode(split, max_ngram_len, feature='n-gram'):
    """Load one dataset split and encode it as index features plus labels.

    Args:
        split: Which partition to load — 'train', 'valid', or anything else
            for the held-out test portion.
        max_ngram_len: Maximum feature sequence length passed to FeatureLoader.
        feature: 'n-gram' for character n-gram vocabulary, otherwise a word
            vocabulary is used.

    Returns:
        A tuple of (features dict with 'text'/'labels'/'recons_labels',
        number of users, vocabulary size, number of samples).
    """
    voca = Vocabulary(ku.voca_root)
    userhelper = UserHelper()
    reviews = ReviewLoader(ku.Movie, product_num=50).get_data()
    users = userhelper.get_users(reviews)
    user2idx = userhelper.user2idx(users)
    if feature == 'n-gram':
        feature2idx = voca.character_n_gram_table(reviews, min_threshold=6)
    else:
        feature2idx = voca.word_table(reviews, min_threshold=5)
    print('--------------------feature2idx-----------------', len(feature2idx))
    feature_loader = FeatureLoader(user2idx=user2idx,
                                   max_ngram_len=max_ngram_len,
                                   ngram2idx=feature2idx)
    # First 80% of reviews are train+valid; the last 20% of that portion
    # is carved off as the validation set.
    training_split = int(len(reviews) * 0.8)
    valid_split = training_split - int(training_split * 0.2)
    if split == 'train':
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[:valid_split])
    elif split == 'valid':
        # BUG FIX: previously this branch loaded reviews[:valid_split] — the
        # identical slice used for 'train' — so validation fully overlapped
        # the training data. Use the reserved middle slice instead.
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[valid_split:training_split])
    else:
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[training_split:])
    # Keep the raw integer labels alongside the one-hot encoding.
    recons_Y = Y
    Y = keras.utils.to_categorical(Y, num_classes=len(user2idx))
    features = {'text': X, 'labels': Y, 'recons_labels': recons_Y}
    print('X.shape: ', X.shape)
    print('Y.shape: ', Y.shape)
    return features, len(user2idx), len(feature2idx), X.shape[0]
import baselines.Syntax_CNN.model as syntax_cnn
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '3'

# Hyper-parameters for the Syntax-CNN baseline.
ngram_min_threshold = 5
max_pos_num = 10
max_words_num = 500
batch_size = 32
epoch = 100

voca = Vocabulary(ku.voca_root)
userhelper = UserHelper()
reviews = ReviewLoader(ku.Movie, product_num=100).get_data()
users = userhelper.get_users(reviews)
# BUG FIX: this assignment was commented out, but user2idx is referenced in
# data_params below, which raised NameError at module load time.
user2idx = userhelper.user2idx(users)
ngram2idx = voca.character_n_gram_table(reviews,
                                        min_threshold=ngram_min_threshold)
pos2idx = userhelper.pos2idx()

# Bundle every lookup table and size limit the model needs.
data_params = {
    ku.max_ngram_len: 600,
    ku.max_pos_num: max_pos_num,
    ku.max_words_num: max_words_num,
    ku.user2idx: user2idx,
    ku.ngram2idx: ngram2idx,
    ku.pos2idx: pos2idx,
}
from baselines.trying.input import ReviewDataSet
from baselines.trying.net import TextCNN
from utils.data_utils import ReviewLoader
import utils.key_utils as ku
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import os
import torch
from utils.data_utils import ReviewLoader, UserHelper, DataHelper
import utils.function_utils as fu

userhelper = UserHelper()
datahelper = DataHelper()
feature = 'n-gram'


def get_reviews(path=r'/home/leeyang/research/data/Movie.json'):
    """Load review records from a JSON dump.

    Args:
        path: Location of the JSON file. Defaults to the Movie dataset path
            that was previously hard-coded, preserving original behavior.

    Returns:
        Whatever ``fu.load_array`` produces — presumably a list of review
        records; verify against ``ReviewDataSet``'s expectations.
    """
    return fu.load_array(path)


reviews = get_reviews()
train_dataset = ReviewDataSet(reviews, 'train', feature=feature, max_len=500)
user2idx = train_dataset.user2idx
user_num = len(user2idx)
vocab_size = train_dataset.vocab_size
def user2idx(self):
    """Build and return the user -> index mapping for this object's reviews."""
    helper = UserHelper()
    review_users = helper.get_users(self.reviews)
    return helper.user2idx(review_users)
def get_users(self):
    """Return the users extracted from this object's reviews."""
    return UserHelper().get_users(self.reviews)