def main():
    # Load the training corpus and rebuild the vocabulary used at training time.
    with open('./dataset/train.pickle', 'rb') as f:
        x = pickle.load(f)
    vocab = make_vocab(x)

    # Reverse mapping (id -> word) for decoding generated indices.
    tmp_vocab = {}
    for c, i in vocab.items():
        tmp_vocab[i] = c

    # Load the trained language model.
    with open("./rnnlm_50.model", mode='rb') as f:
        model = pickle.load(f)

    # Seed generation with EOS (falling back to UNK if it is missing).
    word = 'EOS'
    in_x = Variable(np.array([vocab.get(word, vocab['UNK'])], dtype='int32'))

    # Print generated words, starting a new line whenever EOS is produced.
    for index in model.predict(in_x, max_length=1000):
        if index == vocab['EOS']:
            print()
        else:
            print(tmp_vocab[index], end='')
    print()
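
# Note: model.predict() above is expected to yield one word id per step until
# max_length is reached. The actual RNNLM class lives elsewhere in the repo;
# the class below is only a minimal sketch of that kind of greedy decoder in
# Chainer. The layer names (embed, lstm, fc) and sizes are assumptions, not
# the repository's implementation.
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import Variable


class RNNLMSketch(chainer.Chain):
    def __init__(self, n_vocab, n_units=100):
        super(RNNLMSketch, self).__init__()
        with self.init_scope():
            self.embed = L.EmbedID(n_vocab, n_units)  # word id -> vector
            self.lstm = L.LSTM(n_units, n_units)      # recurrent state
            self.fc = L.Linear(n_units, n_vocab)      # vector -> vocabulary logits

    def predict(self, x, max_length=1000):
        """Greedily generate up to max_length word ids from a seed id."""
        self.lstm.reset_state()
        for _ in range(max_length):
            h = self.fc(self.lstm(self.embed(x)))
            index = int(F.argmax(h, axis=1).data[0])
            yield index
            x = Variable(np.array([index], dtype='int32'))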
# Example dialogue pair and its tokenized form (input x, response y):
# x = "メリー!ボブスレーしよう!!"
# y = "オッケー蓮子!!"
# input_sentence = ["メリー", "!", "ボブスレー", "しよ", "う", "!", "!"] + ["<eos>"]
# output_sentence = ["オッケー", "蓮子", "!", "!"] + ["<eos>"]
# x = list(x)
# y = list(y)
tmp_vocab = {}
train_x = []
train_y = []

# Load the speaker (input) and response (output) utterances.
with open('./json/speaker.pickle', 'rb') as f:
    x = pickle.load(f)
with open('./json/response.pickle', 'rb') as f:
    y = pickle.load(f)

# Build a single vocabulary over both sides of the dialogue.
# train_set, vocab = make_vocab(x+y)
vocab = make_vocab(x + y)

# Encode each pair as id sequences. mecab_wakati returns a space-separated
# string, so it is split into words; the input sequence is fed in reverse,
# a common trick for encoder-decoder models.
for speaker, utterance in zip(x, y):
    train_x.append(np.array([vocab[word] for word in reversed(mecab_wakati(speaker).split())], dtype='int32'))
    train_y.append(np.array([vocab[word] for word in mecab_wakati(utterance).split()], dtype='int32'))
train_x = np.array(train_x)
train_y = np.array(train_y)
print("train_x: {}, train_y: {}, vocab: {}".format(len(train_x), len(train_y), len(vocab)))

loss = 0
average_loss = []
accuracy_list = []
epochs = 50
batch_size = 128
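
# make_vocab() and mecab_wakati() come from nlp.utils (not shown in this
# excerpt). The sketch below only illustrates what they are assumed to do,
# based on how they are called here: MeCab wakati-gaki segmentation and a
# word -> id mapping with reserved UNK/EOS entries. Names with the _sketch
# suffix are hypothetical, not the repository's helpers.
import MeCab


def mecab_wakati_sketch(sentence):
    """Return the sentence as a space-separated string of surface forms."""
    tagger = MeCab.Tagger('-Owakati')
    return tagger.parse(sentence).strip()


def make_vocab_sketch(sentences):
    """Assign an integer id to every word appearing in the corpus."""
    vocab = {'UNK': 0, 'EOS': 1}
    for sentence in sentences:
        for word in mecab_wakati_sketch(sentence).split():
            if word not in vocab:
                vocab[word] = len(vocab)
    return vocab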
from nlp.rnnlm.rnnlm import RNNLM
from nlp.utils import make_vocab, mecab_wakati, plot_loss

# Example sentences and their tokenized forms:
# x = "メリー!ボブスレーしよう!!"
# y = "オッケー蓮子!!"
# input_sentence = ["メリー", "!", "ボブスレー", "しよ", "う", "!", "!"] + ["<eos>"]
# output_sentence = ["オッケー", "蓮子", "!", "!"] + ["<eos>"]
# x = list(x)
# y = list(y)
tmp_vocab = {}
train_x = []

# Load the corpus and encode each sentence as a sequence of vocabulary ids.
with open('./dataset/train.pickle', 'rb') as f:
    x = pickle.load(f)
vocab = make_vocab(x)
for row in x:
    train_x.append(np.array([vocab[word] for word in mecab_wakati(row).split()], dtype='int32'))
train_x = np.array(train_x)
print("train_x: {}, vocab: {}".format(len(train_x), len(vocab)))

# Training hyperparameters and timers.
loss = 0
average_loss = []
epochs = 20
batch_size = 50
num_data = len(train_x)
start_at = time.time()
cur_at = start_at
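
# The excerpt stops before the training loop itself. Roughly, the variables
# above (epochs, batch_size, average_loss, start_at) feed a loop of the shape
# sketched below; the RNNLM constructor and call signature shown here are
# assumptions for illustration, not the repository's actual API:
#
#     model = RNNLM(len(vocab), n_units=100)
#     optimizer = optimizers.Adam()
#     optimizer.setup(model)
#     for epoch in range(epochs):
#         total_loss = 0.0
#         for i in np.random.permutation(num_data):
#             model.reset_state()
#             model.cleargrads()
#             loss = model(train_x[i])        # per-sentence cross-entropy
#             loss.backward()
#             optimizer.update()
#             total_loss += float(loss.data)
#         average_loss.append(total_loss / num_data)
#         cur_at = time.time()
#         print('epoch {}: loss {:.3f} ({:.1f}s)'.format(
#             epoch + 1, average_loss[-1], cur_at - start_at))
#     plot_loss(average_loss)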
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit', '-u', default=100, type=int,
                        help='number of units')
    parser.add_argument('--window', '-w', default=5, type=int,
                        help='window size')
    parser.add_argument('--batchsize', '-b', type=int, default=1000,
                        help='learning minibatch size')
    parser.add_argument('--epoch', '-e', default=20, type=int,
                        help='number of epochs to learn')
    parser.add_argument('--model', '-m', choices=['skipgram', 'cbow'],
                        default='skipgram',
                        help='model type ("skipgram", "cbow")')
    parser.add_argument('--negative-size', default=5, type=int,
                        help='number of negative samples')
    parser.add_argument('--out-type', '-o', choices=['hsm', 'ns', 'original'],
                        default='ns',
                        help='output model type ("hsm": hierarchical softmax, '
                             '"ns": negative sampling, "original": '
                             'no approximation)')
    parser.add_argument('--out', default='result',
                        help='Directory to output the result')
    parser.add_argument('--test', dest='test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        cuda.check_cuda_available()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('Window: {}'.format(args.window))
    print('Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('Training model: {}'.format(args.model))
    print('Output type: {}'.format(args.out_type))
    print('')

    # Load the training data and flatten it into one long sequence of word ids.
    with open('./corpus/train.pickle', 'rb') as f:
        x = pickle.load(f)
    vocab = make_vocab(x)
    train_x = []
    for row in x:
        # mecab_wakati returns a space-separated string, so split it into words.
        train_x += [vocab[word] for word in mecab_wakati(row).split()]
    train = np.array(train_x)

    counts = collections.Counter(train)
    n_vocab = max(train) + 1
    index2word = {wid: word for word, wid in six.iteritems(vocab)}

    print('n_vocab: %d' % n_vocab)
    print('data length: %d' % len(train))

    # Choose the output layer / loss function
    if args.out_type == 'hsm':
        HSM = L.BinaryHierarchicalSoftmax
        tree = HSM.create_huffman_tree(counts)
        loss_func = HSM(args.unit, tree)
        loss_func.W.data[...] = 0
    elif args.out_type == 'ns':
        cs = [counts[w] for w in range(len(counts))]
        loss_func = L.NegativeSampling(args.unit, cs, args.negative_size)
        loss_func.W.data[...] = 0
    elif args.out_type == 'original':
        loss_func = SoftmaxCrossEntropyLoss(args.unit, n_vocab)
    else:
        raise Exception('Unknown output type: {}'.format(args.out_type))

    # Choose the model
    if args.model == 'skipgram':
        model = SkipGram(n_vocab, args.unit, loss_func)
    elif args.model == 'cbow':
        model = ContinuousBoW(n_vocab, args.unit, loss_func)
    else:
        raise Exception('Unknown model type: {}'.format(args.model))

    if args.gpu >= 0:
        model.to_gpu()

    # Set up an optimizer
    optimizer = O.Adam()
    optimizer.setup(model)

    # Set up an iterator
    train_iter = WindowIterator(train, args.window, args.batchsize)
    # val_iter = WindowIterator(val, args.window, args.batchsize, repeat=False)

    # Set up an updater
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=convert, device=args.gpu)

    # Set up a trainer
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    # trainer.extend(extensions.Evaluator(
    #     val_iter, model, converter=convert, device=args.gpu))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss']))
    trainer.extend(extensions.ProgressBar())
    trainer.run()

    # Save the word2vec model in the plain-text word2vec format.
    with open('word2vec.model', 'w') as f:
        f.write('%d %d\n' % (len(index2word), args.unit))
        w = cuda.to_cpu(model.embed.W.data)
        for i, wi in enumerate(w):
            v = ' '.join(map(str, wi))
            f.write('%s %s\n' % (index2word[i], v))
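
# The file written above follows the plain-text word2vec format (a
# "<vocab size> <dimensions>" header, then one "<word> <vector>" line per
# entry), so it can be loaded with standard tooling. A usage sketch, not
# part of the training script itself:
#
#     from gensim.models import KeyedVectors
#     vectors = KeyedVectors.load_word2vec_format('word2vec.model', binary=False)
#     print(vectors.most_similar('蓮子', topn=5))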