コード例 #1
0
def _read_and_decode(split, max_ngram_len, feature='n-gram'):
    """Load movie reviews and encode one data split as feature/label arrays.

    Args:
        split: 'train', 'valid', or anything else (treated as test) —
            selects which slice of the review corpus is returned.
        max_ngram_len: maximum feature-sequence length passed to FeatureLoader.
        feature: 'n-gram' builds a character n-gram vocabulary; any other
            value builds a word vocabulary.

    Returns:
        Tuple of (features_dict, num_users, vocab_size, num_samples) where
        features_dict has keys 'text' (X), 'labels' (one-hot Y) and
        'recons_labels' (integer Y).
    """
    voca = Vocabulary(ku.voca_root)
    userhelper = UserHelper()
    reviews = ReviewLoader(ku.Movie, product_num=50).get_data()

    users = userhelper.get_users(reviews)
    user2idx = userhelper.user2idx(users)
    if feature == 'n-gram':
        feature2idx = voca.character_n_gram_table(reviews, min_threshold=6)
    else:
        feature2idx = voca.word_table(reviews, min_threshold=5)
    print('--------------------feature2idx-----------------', len(feature2idx))
    feature_loader = FeatureLoader(user2idx=user2idx,
                                   max_ngram_len=max_ngram_len,
                                   ngram2idx=feature2idx)
    # First 80% of the corpus is train+valid, last 20% is test; the final
    # 20% of the training portion is held out as the validation window.
    training_split = int(len(reviews) * 0.8)
    valid_split = training_split - int(training_split * 0.2)
    if split == 'train':
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[:valid_split])
    elif split == 'valid':
        # BUG FIX: this branch previously used reviews[:valid_split] — the
        # exact same slice as 'train' — so validation was evaluated on
        # training data. Use the held-out window instead.
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[valid_split:training_split])
    else:
        X, Y = feature_loader.load_n_gram_idx_feature_label(
            reviews[training_split:])
    # Keep the integer labels around before one-hot encoding; callers use
    # them under the 'recons_labels' key.
    recons_Y = Y
    Y = keras.utils.to_categorical(Y, num_classes=len(user2idx))
    features = {'text': X, 'labels': Y, 'recons_labels': recons_Y}
    print('X.shape: ', X.shape)
    print('Y.shape: ', Y.shape)
    return features, len(user2idx), len(feature2idx), X.shape[0]
コード例 #2
0
ファイル: train_scnn.py プロジェクト: leeyanghaha/authorship
import baselines.Syntax_CNN.model as syntax_cnn
import os

# Pin this run to GPU 3; must be set before any CUDA-using framework initializes.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'


# Feature-extraction and training hyper-parameters.
ngram_min_threshold = 5  # minimum frequency for a character n-gram to enter the vocabulary
max_pos_num = 10  # NOTE(review): presumably max POS tags kept per unit — confirm in model
max_words_num = 500  # NOTE(review): presumably max words kept per review — confirm in model

batch_size = 32
epoch = 100


# Build vocabulary/index tables from the movie-review corpus.
voca = Vocabulary(ku.voca_root)
userhelper = UserHelper()

reviews  = ReviewLoader(ku.Movie, product_num=100).get_data()

users = userhelper.get_users(reviews)
# Map authors, character n-grams and POS tags to integer ids for the model.
user2idx = userhelper.user2idx(users)
ngram2idx = voca.character_n_gram_table(reviews, min_threshold=ngram_min_threshold)
pos2idx = userhelper.pos2idx()


# Bundle everything the Syntax-CNN data pipeline needs, keyed by project constants.
data_params = {ku.max_ngram_len: 600, ku.max_pos_num: max_pos_num,
               ku.max_words_num: max_words_num, ku.user2idx: user2idx,
               ku.ngram2idx: ngram2idx, ku.pos2idx: pos2idx,
               }
コード例 #3
0
ファイル: my_main.py プロジェクト: leeyanghaha/authorship
from baselines.trying.input import ReviewDataSet
from baselines.trying.net import TextCNN
from utils.data_utils import ReviewLoader
import utils.key_utils as ku
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import os
import torch
from utils.data_utils import ReviewLoader, UserHelper, DataHelper
import utils.function_utils as fu

# Project helpers used throughout this script.
userhelper = UserHelper()
datahelper = DataHelper()
feature = 'n-gram'  # feature representation handed to ReviewDataSet below

# review_loader = ReviewLoader(ku.Movie, product_num=100)
# reviews = review_loader.get_data()

def get_reviews():
    """Load the raw Movie review records from the fixed JSON dump on disk."""
    movie_dump = r'/home/leeyang/research/data/Movie.json'
    return fu.load_array(movie_dump)


# Materialize the training split and derive model dimensions from it.
reviews = get_reviews()
train_dataset = ReviewDataSet(reviews, 'train', feature=feature, max_len=500)
user2idx = train_dataset.user2idx
user_num = len(user2idx)  # number of author classes
vocab_size = train_dataset.vocab_size  # size of the feature vocabulary
コード例 #4
0
ファイル: input.py プロジェクト: leeyanghaha/authorship
 def user2idx(self):
     """Build a user -> integer-index mapping from this dataset's reviews."""
     helper = UserHelper()
     return helper.user2idx(helper.get_users(self.reviews))
コード例 #5
0
ファイル: input.py プロジェクト: leeyanghaha/authorship
 def get_users(self):
     """Return the set of users extracted from this dataset's reviews."""
     return UserHelper().get_users(self.reviews)