def _read_and_decode(split, max_ngram_len, feature='n-gram'):
    """Load movie reviews, build a feature table, and return one data split.

    Split layout: the first 80% of reviews is the training region; the last
    20% of that region is held out as validation; the final 20% of all
    reviews is the test split.

    Parameters:
        split: one of 'train', 'valid', or anything else for test.
        max_ngram_len: maximum n-gram sequence length passed to the loader.
        feature: 'n-gram' for character n-grams, otherwise a word table.

    Returns:
        (features_dict, num_users, vocab_size, num_examples) where
        features_dict has keys 'text', 'labels' (one-hot), 'recons_labels'
        (raw integer labels).
    """
    voca = Vocabulary(ku.voca_root)
    userhelper = UserHelper()
    reviews = ReviewLoader(ku.Movie, product_num=50).get_data()
    users = userhelper.get_users(reviews)
    user2idx = userhelper.user2idx(users)
    if feature == 'n-gram':
        feature2idx = voca.character_n_gram_table(reviews, min_threshold=6)
    else:
        feature2idx = voca.word_table(reviews, min_threshold=5)
    print('--------------------feature2idx-----------------', len(feature2idx))
    feature_loader = FeatureLoader(user2idx=user2idx,
                                   max_ngram_len=max_ngram_len,
                                   ngram2idx=feature2idx)
    training_split = int(len(reviews) * 0.8)
    # valid_split marks the end of the actual training data (80% of the
    # training region); [valid_split:training_split] is the validation tail.
    valid_split = training_split - int(training_split * 0.2)
    if split == 'train':
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[:valid_split])
    elif split == 'valid':
        # BUG FIX: this branch previously loaded reviews[:valid_split] — the
        # identical slice as 'train' — so validation metrics were computed on
        # the training data. The validation split is the training-region tail.
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[valid_split:training_split])
    else:
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[training_split:])
    recons_Y = Y
    Y = keras.utils.to_categorical(Y, num_classes=len(user2idx))
    features = {'text': X, 'labels': Y, 'recons_labels': recons_Y}
    print('X.shape: ', X.shape)
    print('Y.shape: ', Y.shape)
    return features, len(user2idx), len(feature2idx), X.shape[0]
def feature_label(self):
    """Featurize the stored reviews as padded n-gram id sequences.

    Returns an (x, y) pair: n-gram index features and user labels.
    """
    loader = FeatureLoader(
        max_ngram_len=self.max_len,
        user2idx=self.user2idx,
        ngram2idx=self.feature2idx,
    )
    return loader.load_n_gram_idx_feature_label(self.reviews)
def feature_label(self):
    """Convert text to binary n-gram features and get each review's label."""
    loader = FeatureLoader(
        user2idx=self.user2idx,
        ngram2idx=self.feature2idx,
    )
    return loader.load_n_gram_binary_feature_label(self.reviews)
def get_feature(reviews):
    """Build the vocabulary table for the module-level `feature_name` and
    featurize `reviews` into (X, Y, vocab_size).

    Reads module globals: feature_name, voca, user2idx, max_len.
    """
    if feature_name == 'n-gram':
        table = voca.character_n_gram_table(reviews, min_threshold=6)
    else:
        table = voca.word_table(reviews, min_threshold=5)
    loader = FeatureLoader(user2idx=user2idx,
                           max_ngram_len=max_len,
                           ngram2idx=table)
    X, Y = loader.load_n_gram_idx_feature_label(reviews)
    return X, Y, len(table)
def load_feature_label(self, split):
    """Featurize all reviews, slice out the requested split, and return
    (x, y, vocab_size) with x and y as long tensors.

    Row layout: [0, valid_start) train, [valid_start, train_end) valid,
    [train_end, end) test, where train_end is 80% of all rows.
    """
    table = self.feature2idx()
    loader = FeatureLoader(max_ngram_len=self.max_len,
                           user2idx=self.user2idx,
                           ngram2idx=table)
    x, y = loader.load_n_gram_idx_feature_label(self.reviews)
    train_end = int(x.shape[0] * 0.8)
    valid_start = train_end - int(train_end * 0.2)
    if split == 'train':
        lo, hi = 0, valid_start
    elif split == 'valid':
        lo, hi = valid_start, train_end
    else:
        lo, hi = train_end, x.shape[0]
    x, y = x[lo:hi, :], y[lo:hi]
    return (torch.tensor(x, dtype=torch.long),
            torch.tensor(y, dtype=torch.long),
            len(table))
def get_feature(reviews, split):
    """Build a feature table, featurize the requested split of `reviews`,
    and one-hot encode the labels.

    Split layout (consistent with the rest of the file): the first 80% of
    reviews is the training region; the last 20% of that region is the
    validation split; the final 20% of all reviews is the test split.

    Reads module globals: feature_name, voca, user2idx, max_len.
    Returns (X, Y_onehot, vocab_size).
    """
    if feature_name == 'n-gram':
        feature2idx = voca.character_n_gram_table(reviews, min_threshold=6)
    else:
        feature2idx = voca.word_table(reviews, min_threshold=5)
    feature_loader = FeatureLoader(user2idx=user2idx,
                                   max_ngram_len=max_len,
                                   ngram2idx=feature2idx)
    training_split = int(len(reviews) * 0.8)
    # valid_split = end of actual training data (80% of the training region).
    valid_split = training_split - int(training_split * 0.2)
    if split == 'train':
        # BUG FIX: previously sliced reviews[:training_split - valid_split],
        # which handed only ~20% of the training region to 'train' and ~80%
        # to 'valid' — inverted proportions. Train on the head of the region.
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[:valid_split])
    elif split == 'valid':
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[valid_split:training_split])
    else:
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[training_split:])
    Y = keras.utils.to_categorical(Y, num_classes=len(user2idx))
    return X, Y, len(feature2idx)
# --- Syntax-CNN feature preparation (module-level script) ---
# Builds the character n-gram and POS vocabularies, extracts the combined
# syntax/ngram features for every review, and splits the arrays 80/20 into
# training and testing sets. `training_x` is presumably consumed further
# down the file (not visible in this chunk).
ngram2idx = voca.character_n_gram_table(reviews, min_threshold=ngram_min_threshold)
pos2idx = userhelper.pos2idx()
# Loader configuration: maximum sequence lengths plus the id tables above.
data_params = {ku.max_ngram_len: 600, ku.max_pos_num: max_pos_num,
               ku.max_words_num: max_words_num, ku.user2idx: user2idx,
               ku.ngram2idx: ngram2idx, ku.pos2idx: pos2idx, }
# Network hyper-parameters; vocab/class sizes are derived from the tables.
net_params = {ku.max_words_num: max_words_num, 'syntax_dim': 60,
              'ngram_dim': 300, 'pos_type_num': len(pos2idx),
              'out_dim': len(user2idx), ku.max_pos_num: max_pos_num,
              'vocab_size': len(ngram2idx), 'batch_size': 32,
              'filters': 300, 'kernel_size': 3,
              'loss': 'categorical_crossentropy'}
feature_loader = FeatureLoader(**data_params)
feature = feature_loader.syntax_cnn_feature_label(reviews)
# Unpack the four parallel arrays (one row per review).
pos_id, position_id, ngram_id, user_id = feature[ku.pos_id], feature[ku.pos_order_id], \
                                         feature[ku.ngram_id], feature[ku.user_id]
print('pos_id: ', pos_id.shape)
print('position_id: ', position_id.shape)
print('ngram_id: ', ngram_id.shape)
print('user_id: ', user_id.shape)
# First 80% of rows are training data; the remainder is held out for testing.
training_split = int(0.8 * ngram_id.shape[0])
training_ngram_id, testing_ngram_id = ngram_id[:training_split, :], ngram_id[training_split:, :]
training_pos_id, testing_pos_id = pos_id[:training_split, :], pos_id[training_split:, :]
training_position_id, testing_position_id = position_id[:training_split, :], position_id[training_split:, :]
training_x = [training_ngram_id, training_pos_id, training_position_id]
from utils.data_utils import FeatureLoader, UserHelper, DataHelper
import utils.key_utils as ku
import numpy as np
from collections import Counter
from scipy import sparse

# Module-level helper singletons shared by the functions below.
userhelper = UserHelper()
datahelper = DataHelper()
feature_loader = FeatureLoader()


def get_users(reviews):
    """Map each review to its reviewer's integer id.

    Builds the user->index table from the reviews themselves, then returns
    a numpy array with one user id per review, aligned with `reviews`.
    """
    users = userhelper.get_users(reviews)
    user2idx = userhelper.user2idx(users)
    users_id = []
    for review in reviews:
        users_id.append(user2idx[review[ku.reviewer_ID]])
    return np.array(users_id)
#
#
# def get_products_id(reviews):
#     products = datahelper.get_products(reviews)
#     product2idx = datahelper.product2idx(products)
#     products_id = datahelper.load_products_id(products, product2idx)
#     return products_id, len(product2idx)


def load_feature_label(reviews, products_id):
    # NOTE(review): this definition appears truncated in this chunk — only
    # the label extraction is visible; the feature construction and return
    # statement lie outside the visible text.
    y = get_users(reviews)
from baselines.gcforest.GcForest import GCForest
from utils.vocabulary_utils import Vocabulary
import utils.key_utils as ku
from utils.data_utils import ReviewLoader, FeatureLoader, DataHelper, UserHelper
import sklearn.utils as sku
from sklearn.metrics import accuracy_score
import os
import pickle

# Module-level setup: load movie reviews (50 products), build the user and
# character n-gram tables, and persist the n-gram vocabulary to disk.
datahelper = DataHelper()
voca = Vocabulary(ku.voca_root)
userhelper = UserHelper()
feature_loader = FeatureLoader()
reviews = ReviewLoader(ku.Movie, product_num=50).get_data()
users = userhelper.get_users(reviews)
user2idx = userhelper.user2idx(users)
ngram2idx = voca.character_n_gram_table(reviews, min_threshold=2)
voca.dump_n_grams(ngram2idx, type=ku.charngram2idx)


def get_toy_config():
    """Build the gcForest cascade configuration dict.

    NOTE(review): this definition is truncated in this chunk — the visible
    text ends mid-append of the first estimator entry; the remaining
    estimators and the return statement lie outside the visible text.
    """
    config = {}
    ca_config = {}
    ca_config["random_state"] = 0
    ca_config["max_layers"] = 100
    ca_config["early_stopping_rounds"] = 3
    # 203 output classes — presumably the number of distinct users; verify
    # against len(user2idx) above.
    ca_config["n_classes"] = 203
    ca_config["estimators"] = []
    ca_config["estimators"].append({
def feature_label(self):
    """Return binary n-gram features and user labels for the stored reviews."""
    loader = FeatureLoader(feature2idx=self.feature2idx(None),
                           user2idx=self.user2idx)
    return loader.load_n_gram_binary_feature_label(self.reviews)
# --- CNN experiment setup (module-level script) ---
# Loads movie reviews, builds the user and character n-gram tables, and
# assembles the model/training hyper-parameters. Assumes `max_ngram_len`
# and `ngram_min_threshold` are defined earlier in the file (not visible
# in this chunk).
voca = Vocabulary(ku.voca_root)
userhelper = UserHelper()
reviews = ReviewLoader(ku.Movie, product_num=50).get_data()
users = userhelper.get_users(reviews)
user2idx = userhelper.user2idx(users)
ngram2idx = voca.character_n_gram_table(reviews, min_threshold=ngram_min_threshold)
print(len(ngram2idx))
data_params = {
    'max_ngram_len': max_ngram_len,
    'user2idx': user2idx,
    'ngram2idx': ngram2idx
}
feature_loader = FeatureLoader(**data_params)
# Model / training hyper-parameters for the n-gram CNN; vocab and class
# sizes are derived from the tables built above.
param = {
    'kernel_size': [3, 5, 7],
    'batch_size': 32,
    'epochs': 100,
    'loss': 'categorical_crossentropy',
    'embedding_dim': 100,
    'user_num': len(user2idx),
    'max_ngram_len': max_ngram_len,
    'feature_num': 300,
    'vocab_size': len(ngram2idx)
}
# # x, y = feature_loader.load_n_gram_idx_feature_label(reviews)