def read_lstm_model(self, params, train):
    assert train == False  # reading a model to continue training is currently not supported
    words_file = params['config_path'] + params['words_file']
    model_file = params['config_path'] + params['model_file']
    unit = int(params['unit'])
    deep = (params['deep'] == 'yes')
    drop_ratio = float(params['drop_ratio'])

    # read and normalize target word embeddings
    w, word2index, index2word = self.read_words(words_file)
    s = numpy.sqrt((w * w).sum(1))
    s[s == 0.] = 1.
    w /= s.reshape((s.shape[0], 1))  # normalize

    context_word_units = unit
    lstm_hidden_units = IN_TO_OUT_UNITS_RATIO * unit
    target_word_units = IN_TO_OUT_UNITS_RATIO * unit

    cs = [1 for _ in range(len(word2index))]  # dummy word counts - not used for eval
    loss_func = L.NegativeSampling(target_word_units, cs, NEGATIVE_SAMPLING_NUM)  # dummy loss func - not used for eval

    model = BiLstmContext(deep, self.gpu, word2index, context_word_units, lstm_hidden_units,
                          target_word_units, loss_func, train, drop_ratio)
    S.load_npz(model_file, model)

    return w, word2index, index2word, model
def __init__(self, n_documents=100, n_document_topics=10, n_units=256, n_vocab=1000,
             dropout_ratio=0.5, train=True, counts=None, n_samples=15,
             word_dropout_ratio=0.0):
    em = EmbedMixture(n_documents, n_document_topics, n_units,
                      dropout_ratio=dropout_ratio)
    kwargs = {}
    kwargs['mixture'] = em
    kwargs['sampler'] = L.NegativeSampling(n_units, counts, n_samples)
    super(LDA2Vec, self).__init__(**kwargs)
    rand = np.random.random(self.sampler.W.data.shape)
    self.sampler.W.data[:, :] = rand[:, :]
    self.n_units = n_units
    self.train = train
    self.dropout_ratio = dropout_ratio
    self.word_dropout_ratio = word_dropout_ratio
    self.n_samples = n_samples
def create_link(self, rng=None):
    if rng is None:
        rng = numpy.random.RandomState()
    link = links.NegativeSampling(self.in_size, [10, 5, 2, 5, 2], self.sample_size)
    link.cleargrads()
    # W is initialized with zero. Inject random values for meaningful test.
    link.W.array[:] = rng.uniform(-1, 1, link.W.shape)
    return link
def __init__(self, counts, n_docs, n_topics, n_dim, n_vocab, n_samples=5):
    factors = np.random.random((n_topics, n_dim)).astype('float32')
    loss_func = L.NegativeSampling(n_dim, counts, n_samples)
    loss_func.W.data[:, :] = np.random.randn(*loss_func.W.data.shape)
    loss_func.W.data[:, :] /= np.sqrt(np.prod(loss_func.W.data.shape))
    super(NSLDA, self).__init__(proportions=L.EmbedID(n_docs, n_topics),
                                factors=L.Parameter(factors),
                                loss_func=loss_func)
    self.n_docs = n_docs
    self.n_topics = n_topics
    self.n_vocab = n_vocab
    self.n_dim = n_dim
def read_model(self, params):
    user_file = os.path.join(params['config_path'], params['user_filename'])
    item_file = os.path.join(params['config_path'], params['item_filename'])
    vocab_file = os.path.join(params['config_path'], params['vocab_filename'])
    aspect_file = os.path.join(params['config_path'], params['aspect_filename'])
    opinion_file = os.path.join(params['config_path'], params['opinion_filename'])
    aspect_opinions_file = os.path.join(params['config_path'], params['aspect_opinions_filename'])
    model_file = os.path.join(params['config_path'], params['model_filename'])

    context_word_units = int(params['unit'])
    lstm_hidden_units = IN_TO_OUT_UNITS_RATIO * context_word_units
    target_word_units = IN_TO_OUT_UNITS_RATIO * context_word_units

    user2index = load_dict(user_file)
    item2index = load_dict(item_file)
    word2index = load_dict(vocab_file)
    aspect2index = load_dict(aspect_file)
    opinion2index = load_dict(opinion_file)
    aspect_opinions = load_json(aspect_opinions_file)

    n_user = max(user2index.values()) + 1
    n_item = max(item2index.values()) + 1
    n_vocab = max(word2index.values()) + 1
    n_aspect = max(aspect2index.values()) + 1
    n_encode = n_aspect

    # dummy word counts - not used for eval
    cs = [1 for _ in range(n_vocab)]
    # dummy loss func - not used for eval
    loss_func = L.NegativeSampling(target_word_units, cs, NEGATIVE_SAMPLING_NUM)

    if params['model_type'] == 'c2v':
        model = Context2Vec(self.gpu, n_vocab, context_word_units, lstm_hidden_units,
                            target_word_units, loss_func, self.resume)
    elif params['model_type'] in ['asc2v', 'asc2v-mter']:
        model = AspectSentiContext2Vec(self.gpu, n_vocab, n_encode, context_word_units,
                                       lstm_hidden_units, target_word_units, loss_func,
                                       self.resume)

    S.load_npz(model_file, model)
    w = model.loss_func.W.data

    return user2index, item2index, w, word2index, aspect2index, opinion2index, aspect_opinions, model
def setUp(self):
    # Create two identical datasets except that 2nd dataset has the
    # negative targets explicitly removed. Both cases should have identical
    # outcomes.
    self.link = links.NegativeSampling(3, [10, 5, 2, 5, 2], 2)
    self.link.zerograds()
    self.x = numpy.random.uniform(-1, 1, (3, 3)).astype(numpy.float32)
    self.t = numpy.array([-1, 1, 2]).astype(numpy.int32)
    self.gy = numpy.random.uniform(-1, 1, ()).astype(numpy.float32)
    self.idx = self.t > -1
    self.x0 = self.x.copy()[self.idx]
    self.t0 = self.t.copy()[self.idx]
    self.gy0 = self.gy.copy()
def setUp(self):
    batch = len(self.t)
    x_shape = (batch, self.in_size)
    self.link = links.NegativeSampling(self.in_size, [10, 5, 2, 5, 2], self.sample_size)
    self.link.cleargrads()
    self.x = numpy.random.uniform(-1, 1, x_shape).astype(numpy.float32)
    self.t = numpy.array(self.t).astype(numpy.int32)
    if self.reduce == 'no':
        g_shape = self.t.shape
    elif self.reduce == 'sum':
        g_shape = ()
    self.gy = numpy.random.uniform(-1, 1, g_shape).astype(numpy.float32)
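# A minimal, self-contained sketch (not part of the snippets above) showing how a
# NegativeSampling link like the one built in these setUp methods computes its loss
# under the two reduce modes being tested. The toy sizes and values are assumptions
# chosen only for illustration.
import numpy
import chainer.links as links

link = links.NegativeSampling(3, [10, 5, 2, 5, 2], 2)          # in_size=3, 5 word counts, 2 negatives
x = numpy.random.uniform(-1, 1, (4, 3)).astype(numpy.float32)  # batch of 4 context vectors
t = numpy.array([0, 1, 2, 3], dtype=numpy.int32)               # target word ids
loss_sum = link(x, t)                # scalar loss summed over the batch (default reduce='sum')
loss_each = link(x, t, reduce='no')  # per-example losses, shape (4,)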
def get_context_model(args, data_loader):
    if args.resume:
        model_reader = ModelReader(args.resume, args.gpu, True, data_loader.word2count)
        model = model_reader.model
    else:
        n_vocab = data_loader.n_vocab
        if args.context in ["sc2v", "sc2v-mter"]:
            n_aspect = 1
        else:
            n_aspect = data_loader.n_aspect
        context_word_units = args.unit
        lstm_hidden_units = IN_TO_OUT_UNITS_RATIO * args.unit
        target_word_units = IN_TO_OUT_UNITS_RATIO * args.unit

        cs = [data_loader.word2count[w] for w in range(n_vocab)]
        loss_func = L.NegativeSampling(
            target_word_units, cs, NEGATIVE_SAMPLING_NUM, args.ns_power
        )
        loss_func.W.data[...] = 0

        if args.context == "c2v":
            model = Context2Vec(
                args.gpu, n_vocab, context_word_units, lstm_hidden_units,
                target_word_units, loss_func, True, args.dropout,
            )
        elif args.context in [
            "ac2v", "sc2v", "asc2v", "sc2v-mter", "asc2v-mter", "aoc2v", "rasc2v",
        ]:
            model = AspectSentiContext2Vec(
                args.gpu, n_vocab, n_aspect, context_word_units, lstm_hidden_units,
                target_word_units, loss_func, True, args.dropout,
            )
    return model
def __init__(self, n_documents=100, n_document_topics=10, n_units=256, n_vocab=1000,
             dropout_ratio=0.5, train=True, counts=None, n_samples=15):
    em = EmbedMixture(n_documents, n_document_topics, n_units,
                      dropout_ratio=dropout_ratio)
    kwargs = {}
    kwargs['mixture'] = em
    kwargs['embed'] = L.EmbedID(n_vocab, n_units)
    kwargs['sampler'] = L.NegativeSampling(n_units, counts, n_samples)
    super(LDA2Vec, self).__init__(**kwargs)
    self.n_units = n_units
    self.train = train
    self.dropout_ratio = dropout_ratio
    self.n_samples = n_samples
def __init__(self, n_lemma_vocab, n_emb_size, hidden_size=100, n_units=100, counts=None,
             k=15, init_embed=None, dropout=0, freeze=0):
    super(Unsp_Model, self).__init__()
    with self.init_scope():
        self.lemma_embed = L.EmbedID(n_lemma_vocab, n_emb_size, initialW=init_embed)
        if freeze == 1:
            self.lemma_embed.disable_update()
        self.l1 = L.Linear(hidden_size)
        self.l2 = L.Linear(n_units)
        self.path_ns = L.NegativeSampling(n_units, counts, k)
    self.n_units = n_units
    self.n_lemma_vocab = n_lemma_vocab
    self.counts = counts
def get_loss_func(args, vocab_context):
    word_counts = vocab_context.lst_frequencies
    if args.out_type == 'hsm':
        HSM = L.BinaryHierarchicalSoftmax
        d_counts = {i: word_counts[i] for i in range(len(word_counts))}
        tree = HSM.create_huffman_tree(d_counts)
        loss_func = HSM(args.dimensions, tree)
        loss_func.W.data[...] = 0
    elif args.out_type == 'ns':
        cs = [word_counts[w] for w in range(len(word_counts))]
        loss_func = L.NegativeSampling(args.dimensions, cs, args.negative_size)
        loss_func.W.data[...] = 0
    elif args.out_type == 'original':
        loss_func = SoftmaxCrossEntropyLoss(args.dimensions, vocab_context.cnt_words)
    return loss_func
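# Hypothetical caller for get_loss_func above: `_Args` and `_Vocab` are stand-ins for the
# real argument objects, with attribute names copied from the function body (they are not
# defined anywhere else in this collection).
class _Args:
    out_type = 'ns'
    dimensions = 100
    negative_size = 5

class _Vocab:
    lst_frequencies = [10, 5, 2, 5, 2]
    cnt_words = 5

loss_func = get_loss_func(_Args(), _Vocab())
print(loss_func.W.shape)  # (5, 100): one output vector per vocabulary word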
def finalize(self):
    loss_func = L.NegativeSampling(self.n_hidden, self.counts, self.n_samples)
    data = np.random.randn(len(self.counts), self.n_hidden)
    data /= np.sqrt(np.prod(data.shape))
    loss_func.W.data[:] = data[:].astype('float32')
    kwargs = dict(vocab=L.EmbedID(self.n_words, self.n_hidden), loss_func=loss_func)
    for name, (em, transform, lf, cp) in self.categorical_features.items():
        kwargs[name + '_mixture'] = em
        if transform is not None:
            kwargs[name + '_linear'] = transform
    super(LDA2Vec, self).__init__(**kwargs)
    self._setup()
    self._finalized = True
    self.logger.info("Finalized the class")
def __init__(self, n_documents=100, n_document_topics=10, n_units=256, n_vocab=1000,
             dropout_ratio=0.5, train=True, counts=None, n_samples=15,
             word_dropout_ratio=0.0, power=0.75, temperature=1.0, vocab=None,
             docu_initialW=None):
    em = EmbedMixture(n_documents, n_document_topics, n_units,
                      dropout_ratio=dropout_ratio, temperature=temperature,
                      docu_initialW=docu_initialW)
    kwargs = {}
    kwargs['mixture'] = em
    # (Pdb) self.sampler.W.data.shape -> (4891, 300)
    # (Pdb) n_units -> 300, embedding dimensions
    # (Pdb) counts -> array([ 0, 0, 0, ..., 30, 30, 29], dtype=int32)
    # (Pdb) counts.shape -> (4891,)
    # (Pdb) len(vocab) -> 4891
    # (Pdb) vocab[0] -> '<SKIP>', vocab[1] -> 'out_of_vocabulary', vocab[2] -> '-PRON-'
    kwargs['sampler'] = L.NegativeSampling(n_units, counts, n_samples, power=power)
    super(LDA2Vec, self).__init__(**kwargs)
    # note that sampler.W.data will be loaded with pre-trained GoogleNews
    # word2vec data later in lda2vec_run.py
    rand = np.random.random(self.sampler.W.data.shape)
    self.sampler.W.data[:, :] = rand[:, :]
    self.n_units = n_units
    self.train = train
    self.dropout_ratio = dropout_ratio
    self.word_dropout_ratio = word_dropout_ratio
    self.n_samples = n_samples
    self.vocab = vocab
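# Background sketch for the `power` argument used above: negative samples are drawn from
# the unigram count distribution raised to `power` (0.75 here), which flattens the
# distribution so rare words are sampled relatively more often. The toy counts below are
# made up for illustration.
import numpy as np

counts = np.array([100.0, 10.0, 1.0])
power = 0.75
probs = counts ** power / (counts ** power).sum()
print(probs)                   # flattened distribution used for drawing negatives
print(counts / counts.sum())   # raw unigram distribution, for comparison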
def skipgram_embedding(data, dim=50, batchsize=32, window=10, negative_sample=5,
                       epochs=10) -> list:
    cs = [data.counts[w] for w in range(len(data.counts))]
    loss_func = L.NegativeSampling(dim, cs, negative_sample)
    model = models.SkipGram(data.n_vocab, dim, loss_func)

    # Set up an optimizer
    optimizer = O.Adam()
    optimizer.setup(model)

    # Set up an iterator
    train_iter = models.WindowIterator(data.x_train, window, batchsize)
    val_iter = models.WindowIterator(data.x_test, window, batchsize, repeat=False)

    # Set up an updater
    updater = training.StandardUpdater(train_iter, optimizer, converter=convert, device=-1)

    # Set up a trainer
    trainer = training.Trainer(updater, (epochs, 'epoch'), out="result")
    trainer.extend(
        extensions.Evaluator(val_iter, model, converter=convert, device=-1))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport(['epoch', 'main/loss', 'validation/main/loss']))
    trainer.extend(extensions.ProgressBar())

    chainer.config.train = True
    trainer.run()

    return model.predictor.embed.W.data
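# Hypothetical caller for skipgram_embedding above. ToyData mirrors the attributes the
# function reads (counts, n_vocab, x_train, x_test); the id sequence is random toy data,
# so the learned vectors are meaningless and serve only to show the expected shapes.
import collections
import numpy as np

class ToyData:
    def __init__(self, ids, split=0.9):
        self.counts = collections.Counter(ids)
        self.n_vocab = max(ids) + 1
        cut = int(len(ids) * split)
        self.x_train = np.asarray(ids[:cut], dtype=np.int32)
        self.x_test = np.asarray(ids[cut:], dtype=np.int32)

ids = list(np.random.randint(0, 50, size=5000))
embeddings = skipgram_embedding(ToyData(ids), dim=50, epochs=1)
print(embeddings.shape)  # (n_vocab, dim)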
val = val[:100]

vocab = chainer.datasets.get_ptb_words_vocabulary()
index2word = {wid: word for word, wid in six.iteritems(vocab)}

print('n_vocab: %d' % n_vocab)
print('data length: %d' % len(train))

if args.out_type == 'hsm':
    HSM = L.BinaryHierarchicalSoftmax
    tree = HSM.create_huffman_tree(counts)
    loss_func = HSM(args.unit, tree)
    loss_func.W.data[...] = 0
elif args.out_type == 'ns':
    cs = [counts[w] for w in range(len(counts))]
    loss_func = L.NegativeSampling(args.unit, cs, args.negative_size)
    loss_func.W.data[...] = 0
elif args.out_type == 'original':
    loss_func = SoftmaxCrossEntropyLoss(args.unit, n_vocab)
else:
    raise Exception('Unknown output type: {}'.format(args.out_type))

if args.model == 'skipgram':
    model = SkipGram(n_vocab, args.unit, loss_func)
elif args.model == 'cbow':
    model = ContinuousBoW(n_vocab, args.unit, loss_func)
else:
    raise Exception('Unknown model type: {}'.format(args.model))

if args.gpu >= 0:
    model.to_gpu()
def execute_c():
    index2word = {}
    word2index = {}
    dataset = []
    counts = collections.Counter()
    with open(args["data"]) as f:
        for line in f:
            for word in line.split():
                if word not in word2index:
                    ind = len(word2index)
                    word2index[word] = ind
                    index2word[ind] = word
                counts[word2index[word]] += 1
                dataset.append(word2index[word])

    n_vocab = len(word2index)
    print("n_vocab: %d" % n_vocab)
    print("data length: %d" % len(dataset))

    if args["out_type"] == "hsm":
        HSM = L.BinaryHierarchicalSoftmax
        tree = HSM.create_huffman_tree(counts)
        loss_func = HSM(args["unit"], tree)
    elif args["out_type"] == "ns":
        cs = [counts[w] for w in range(len(counts))]
        loss_func = L.NegativeSampling(args["unit"], cs, 20)
    elif args["out_type"] == "original":
        loss_func = SoftmaxCrossEntropyLoss(args["unit"], n_vocab)
    else:
        raise Exception("Unknown output type: {}".format(args["out_type"]))

    if args["model"] == "skipgram":
        model = SkipGram(n_vocab, args["unit"], loss_func)
    elif args["model"] == "cbow":
        model = ContinuousBow(n_vocab, args["unit"], loss_func)
    else:
        raise Exception("Unknown model type: {}".format(args["model"]))

    dataset = np.array(dataset, dtype=np.int32)

    optimizer = O.Adam()
    optimizer.setup(model)

    begin_time = time.time()
    cur_at = begin_time
    word_count = 0
    skip = (len(dataset) - args["window"] * 2) // args["batchsize"]
    next_count = 100000
    for epoch in range(args["epoch"]):
        accum_loss = 0
        print('epoch: {0}'.format(epoch))
        indexes = np.random.permutation(skip)
        for i in indexes:
            if word_count >= next_count:
                now = time.time()
                duration = now - cur_at
                throughput = 100000. / duration
                print('{} word, {:.2f} sec, {:.2f} word/sec'.format(
                    word_count, duration, throughput))
                next_count += 100000
                cur_at = now
            position = np.array(range(0, args["batchsize"])) * skip + (
                args["window"] + i)
            loss = calculate_loss(model, dataset, position)
            accum_loss += loss.data
            word_count += args["batchsize"]
            model.zerograds()
            loss.backward()
            optimizer.update()
        print(accum_loss)

    with open('word2vec.model', 'w') as f:
        f.write('%d %d\n' % (len(index2word), args["unit"]))
        w = model.weight_xi.W.data
        for i in range(w.shape[0]):
            v = ' '.join(['%f' % v for v in w[i]])
            f.write('%s %s\n' % (index2word[i], v))
context_word_units = args.unit
lstm_hidden_units = IN_TO_OUT_UNITS_RATIO * args.unit
target_word_units = IN_TO_OUT_UNITS_RATIO * args.unit

# if args.gpu >= 0:
#     cuda.check_cuda_available()
#     cuda.get_device(args.gpu).use()
# xp = cuda.cupy if args.gpu >= 0 else np
xp = np

reader = SentenceReaderDir(args.indir, args.trimfreq, args.batchsize)
print('n_vocab: %d' % (len(reader.word2index) - 3))  # excluding the three special tokens
print('corpus size: %d' % (reader.total_words))

cs = [reader.trimmed_word2count[w] for w in range(len(reader.trimmed_word2count))]
loss_func = L.NegativeSampling(target_word_units, cs, NEGATIVE_SAMPLING_NUM, args.ns_power)

if args.context == 'lstm':
    model = BiLstmContext(args.deep, args.gpu, reader.word2index, context_word_units,
                          lstm_hidden_units, target_word_units, loss_func, True, args.dropout)
else:
    raise Exception('Unknown context type: {}'.format(args.context))

optimizer = O.Adam(alpha=args.alpha)
optimizer.setup(model)
if args.grad_clip:
    optimizer.add_hook(GradientClipping(args.grad_clip))

STATUS_INTERVAL = 1000000

for epoch in range(args.epoch):
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--unit', '-u', default=200, type=int, help='number of units')
    parser.add_argument('--window', '-w', default=10, type=int, help='window size')
    parser.add_argument('--batchsize', '-b', type=int, default=1000, help='learning minibatch size')
    parser.add_argument('--epoch', '-e', default=3, type=int, help='number of epochs to learn')
    parser.add_argument('--model', '-m', choices=['skipgram', 'cbow'], default='skipgram',
                        help='model type ("skipgram", "cbow")')
    parser.add_argument('--negative-size', default=5, type=int, help='number of negative samples')
    parser.add_argument('--out-type', '-o', choices=['hsm', 'ns', 'original'], default='hsm',
                        help='output model type ("hsm": hierarchical softmax, '
                        '"ns": negative sampling, "original": no approximation)')
    parser.add_argument('--out', default='result', help='Directory to output the result')
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu', '-g', dest='device', type=int, nargs='?', const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    if chainer.get_dtype() == np.float16:
        warnings.warn('This example may cause NaN in FP16 mode.', RuntimeWarning)

    device = chainer.get_device(args.device)
    device.use()

    with open('tokenized_data.txt', 'r', encoding='utf-8') as f:
        data = f.read().split()

    import collections
    index2word = {}
    word2index = {}
    idx = 0
    for w in set(data):
        word2index[w] = idx
        index2word[idx] = w
        idx += 1

    data_array = []
    for w in data:
        data_array.append(word2index[w])
    data_array = np.array(data_array, dtype='int32')

    # Set up the dataset
    train = data_array[:]
    counts = collections.Counter(train)
    n_vocab = max(train) + 1
    vocab = word2index

    print('Device: {}'.format(device))
    print('# unit: {}'.format(args.unit))
    print('Window: {}'.format(args.window))
    print('Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('Training model: {}'.format(args.model))
    print('Output type: {}'.format(args.out_type))
    print('')

    print('n_vocab: %d' % n_vocab)
    print('data length: %d' % len(train))

    if args.out_type == 'hsm':
        HSM = L.BinaryHierarchicalSoftmax
        tree = HSM.create_huffman_tree(counts)
        loss_func = HSM(args.unit, tree)
        loss_func.W.array[...] = 0
    elif args.out_type == 'ns':
        cs = [counts[w] for w in range(len(counts))]
        loss_func = L.NegativeSampling(args.unit, cs, args.negative_size)
        loss_func.W.array[...] = 0
    elif args.out_type == 'original':
        loss_func = SoftmaxCrossEntropyLoss(args.unit, n_vocab)
    else:
        raise Exception('Unknown output type: {}'.format(args.out_type))

    # Choose the model
    if args.model == 'skipgram':
        model = SkipGram(n_vocab, args.unit, loss_func)
    elif args.model == 'cbow':
        model = ContinuousBoW(n_vocab, args.unit, loss_func)
    else:
        raise Exception('Unknown model type: {}'.format(args.model))
    model.to_device(device)

    # Set up an optimizer
    optimizer = O.Adam()
    optimizer.setup(model)

    # Set up an iterator
    train_iter = WindowIterator(train, args.window, args.batchsize)

    # Set up an updater
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=convert, device=device)

    # Set up a trainer
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(['epoch', 'main/loss']))
    trainer.extend(extensions.ProgressBar())
    trainer.run()

    # Save the word2vec model
    with open('word2vec.model', 'w', encoding='utf-8') as f:
        f.write('%d %d\n' % (len(index2word), args.unit))
        w = cuda.to_cpu(model.embed.W.array)
        for i, wi in enumerate(w):
            v = ' '.join(map(str, wi))
            f.write('%s %s\n' % (index2word[i], v))
def main():
    if args.gpu >= 0:
        cuda.get_device(args.gpu).use()

    train, val, _ = chainer.datasets.get_ptb_words()
    counts = collections.Counter(train)
    counts.update(collections.Counter(val))
    n_vocab = max(train) + 1

    if args.test:
        train = train[:100]
        val = val[:100]

    vocab = chainer.datasets.get_ptb_words_vocabulary()  # dict which maps word2index
    index2word = {wid: word for word, wid in six.iteritems(vocab)}  # dict which maps index2word

    print("n_vocab: %d" % n_vocab)
    print("data length: %d" % len(train))

    if args.out_type == "hsm":
        HSM = L.BinaryHierarchicalSoftmax
        tree = HSM.create_huffman_tree(counts)
        loss_func = HSM(args.unit, tree)
        loss_func.W.data[...] = 0
    elif args.out_type == "ns":
        cs = [counts[w] for w in range(len(counts))]
        loss_func = L.NegativeSampling(args.unit, cs, args.negative_size)
        loss_func.W.data[...] = 0
    elif args.out_type == "original":
        loss_func = SoftmaxCrossEntropyLoss(args.unit, n_vocab)
    else:
        raise Exception("Unknown output type: {}".format(args.out_type))

    if args.model == "skipgram":
        model = SkipGram(n_vocab, args.unit, loss_func)
    elif args.model == "cbow":
        model = ContinuousBoW(n_vocab, args.unit, loss_func)
    else:
        raise Exception("Unknown model type: {}".format(args.model))

    if args.gpu >= 0:
        model.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(model)

    train_iter = WindowIterator(train, args.window, args.batchsize)
    val_iter = WindowIterator(val, args.window, args.batchsize, repeat=False)
    updater = training.StandardUpdater(train_iter, optimizer, converter=convert, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, "epoch"), out=args.out)
    trainer.extend(
        extensions.Evaluator(val_iter, model, converter=convert, device=args.gpu))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport(['epoch', 'main/loss', 'validation/main/loss']))
    trainer.extend(extensions.ProgressBar())
    trainer.run()

    with open("word2vec.model", "w") as f:
        f.write("%d %d\n" % (len(index2word), args.unit))
        w = cuda.to_cpu(model.embed.W.data)
        for i, wi in enumerate(w):
            v = " ".join(map(str, wi))
            f.write("%s %s\n" % (index2word[i], v))
def setUp(self):
    self.link = links.NegativeSampling(3, [10, 5, 2, 5, 2], 2)
    self.link.zerograds()
    self.x = numpy.random.uniform(-1, 1, (2, 3)).astype(numpy.float32)
    self.t = numpy.array([0, 2]).astype(numpy.int32)
    self.gy = numpy.random.uniform(-1, 1, ()).astype(numpy.float32)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', '-d', type=str, default='-1',
                        help='Device specifier. Either ChainerX device '
                        'specifier or an integer. If non-negative integer, '
                        'CuPy arrays with specified device id are used. If '
                        'negative integer, NumPy arrays are used')
    parser.add_argument('--unit', '-u', default=100, type=int, help='number of units')
    parser.add_argument('--window', '-w', default=5, type=int, help='window size')
    parser.add_argument('--batchsize', '-b', type=int, default=1000, help='learning minibatch size')
    parser.add_argument('--epoch', '-e', default=20, type=int, help='number of epochs to learn')
    parser.add_argument('--model', '-m', choices=['skipgram', 'cbow'], default='skipgram',
                        help='model type ("skipgram", "cbow")')
    parser.add_argument('--negative-size', default=5, type=int, help='number of negative samples')
    parser.add_argument('--out-type', '-o', choices=['hsm', 'ns', 'original'], default='hsm',
                        help='output model type ("hsm": hierarchical softmax, '
                        '"ns": negative sampling, "original": no approximation)')
    parser.add_argument('--out', default='result', help='Directory to output the result')
    parser.add_argument('--resume', '-r', type=str, help='Resume the training from snapshot')
    parser.add_argument('--snapshot-interval', type=int, help='Interval of snapshots')
    parser.add_argument('--test', dest='test', action='store_true')
    parser.set_defaults(test=False)
    group = parser.add_argument_group('deprecated arguments')
    group.add_argument('--gpu', '-g', dest='device', type=int, nargs='?', const=0,
                       help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    device = chainer.get_device(args.device)
    device.use()

    if args.snapshot_interval is None:
        args.snapshot_interval = args.epoch
    args.snapshot_interval = min(args.snapshot_interval, args.epoch)

    print('Device: {}'.format(device))
    print('# unit: {}'.format(args.unit))
    print('Window: {}'.format(args.window))
    print('Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('Training model: {}'.format(args.model))
    print('Output type: {}'.format(args.out_type))
    print('')

    # Load the dataset
    train, val, _ = chainer.datasets.get_ptb_words()
    counts = collections.Counter(train)
    counts.update(collections.Counter(val))
    n_vocab = max(train) + 1

    if args.test:
        train = train[:100]
        val = val[:100]

    vocab = chainer.datasets.get_ptb_words_vocabulary()
    index2word = {wid: word for word, wid in six.iteritems(vocab)}

    print('n_vocab: %d' % n_vocab)
    print('data length: %d' % len(train))

    if args.out_type == 'hsm':
        HSM = L.BinaryHierarchicalSoftmax
        tree = HSM.create_huffman_tree(counts)
        loss_func = HSM(args.unit, tree)
        loss_func.W.array[...] = 0
    elif args.out_type == 'ns':
        cs = [counts[w] for w in range(len(counts))]
        loss_func = L.NegativeSampling(args.unit, cs, args.negative_size)
        loss_func.W.array[...] = 0
    elif args.out_type == 'original':
        loss_func = SoftmaxCrossEntropyLoss(args.unit, n_vocab)
    else:
        raise Exception('Unknown output type: {}'.format(args.out_type))

    # Choose the model
    if args.model == 'skipgram':
        model = SkipGram(n_vocab, args.unit, loss_func)
    elif args.model == 'cbow':
        model = ContinuousBoW(n_vocab, args.unit, loss_func)
    else:
        raise Exception('Unknown model type: {}'.format(args.model))
    model.to_device(device)

    # Set up an optimizer
    optimizer = O.Adam()
    optimizer.setup(model)

    # Set up an iterator
    train_iter = WindowIterator(train, args.window, args.batchsize)
    val_iter = WindowIterator(val, args.window, args.batchsize, repeat=False)

    # Set up an updater
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=convert, device=device)

    # Set up a trainer
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        extensions.Evaluator(val_iter, model, converter=convert, device=device))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport(['epoch', 'main/loss', 'validation/main/loss']))
    trainer.extend(extensions.ProgressBar())
    trainer.extend(
        extensions.snapshot(filename='snapshot_epoch_{.updater.epoch}'),
        trigger=(args.snapshot_interval, 'epoch'))

    if args.resume is not None:
        chainer.serializers.load_npz(args.resume, trainer)

    trainer.run()

    # Save the word2vec model
    with open(os.path.join(args.out, 'word2vec.model'), 'w') as f:
        f.write('%d %d\n' % (len(index2word), args.unit))
        w = cuda.to_cpu(model.embed.W.array)
        for i, wi in enumerate(w):
            v = ' '.join(map(str, wi))
            f.write('%s %s\n' % (index2word[i], v))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit', '-u', default=100, type=int, help='number of units')
    parser.add_argument('--window', '-w', default=5, type=int, help='window size')
    parser.add_argument('--batchsize', '-b', type=int, default=1000, help='learning minibatch size')
    parser.add_argument('--epoch', '-e', default=20, type=int, help='number of epochs to learn')
    parser.add_argument('--model', '-m', choices=['skipgram', 'cbow'], default='skipgram',
                        help='model type ("skipgram", "cbow")')
    parser.add_argument('--negative-size', default=5, type=int, help='number of negative samples')
    parser.add_argument('--out-type', '-o', choices=['hsm', 'ns', 'original'], default='hsm',
                        help='output model type ("hsm": hierarchical softmax, '
                        '"ns": negative sampling, "original": no approximation)')
    parser.add_argument('--out', default='result', help='Directory to output the result')
    parser.add_argument('--test', dest='test', action='store_true')
    parser.add_argument('--wakati_corpus_list')
    parser.add_argument('--num_tokens', type=int, default=None,
                        help='If not set, we count words as the 1st pass.')
    parser.add_argument('--word_count_threshold', default=5, type=int)
    parser.set_defaults(test=False)
    args = parser.parse_args()

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        cuda.check_cuda_available()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('Window: {}'.format(args.window))
    print('Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('Training model: {}'.format(args.model))
    print('Output type: {}'.format(args.out_type))
    print('')

    if args.gpu >= 0:
        cuda.get_device_from_id(args.gpu).use()

    wakati_corpus_list = [line.rstrip()
                          for line in open(args.wakati_corpus_list, 'r').readlines()
                          if not re.match(r'^\s*#', line)]

    # Create vocab.
    vocab = word2vec_module.create_vocab(wakati_corpus_list,
                                         count_threshold=args.word_count_threshold)
    index2word = dict([(wid, word) for (word, wid) in vocab.items()])

    # Load the dataset
    words_generator = word2vec_module.WordsGenerator(wakati_corpus_list, batch_size=1000)

    class WidsGenerator:
        def __init__(self, words_generator, vocab):
            self.words_generator = words_generator
            self.vocab = vocab

        def __call__(self):
            for words in self.words_generator():
                wids = [vocab[word] if word in vocab else 0 for word in words]
                yield wids

    class WidGenerator:
        def __init__(self, wids_generator):
            self.wids_generator = wids_generator

        def __call__(self):
            for wids in self.wids_generator():
                for wid in wids:
                    yield wid

    wids_generator = WidsGenerator(words_generator, vocab)  # Generator call returns iterator object.
    wid_generator = WidGenerator(wids_generator)

    # train, val, _ = chainer.datasets.get_ptb_words()
    num_tokens = len([wid for wid in wid_generator()]) if args.num_tokens is None else args.num_tokens
    train = itertools.islice(wid_generator(), min(int(num_tokens * 0.05), 10000), sys.maxsize)
    val = itertools.islice(wid_generator(), 0, min(int(num_tokens * 0.05), 10000))
    counts = collections.Counter(wid_generator())
    # counts.update(collections.Counter(WidGenerator(val)()))
    # n_vocab = max(train) + 1
    n_vocab = len(vocab)

    # if args.test:
    #     train = train[:100]
    #     val = val[:100]

    print('n_vocab: %d' % n_vocab)
    # print('data length: %d' % len(train))

    if args.out_type == 'hsm':
        HSM = L.BinaryHierarchicalSoftmax
        tree = HSM.create_huffman_tree(counts)
        loss_func = HSM(args.unit, tree)
        loss_func.W.data[...] = 0
    elif args.out_type == 'ns':
        cs = [counts[w] for w in range(len(counts))]
        loss_func = L.NegativeSampling(args.unit, cs, args.negative_size)
        loss_func.W.data[...] = 0
    elif args.out_type == 'original':
        loss_func = SoftmaxCrossEntropyLoss(args.unit, n_vocab)
    else:
        raise Exception('Unknown output type: {}'.format(args.out_type))

    # Choose the model
    if args.model == 'skipgram':
        model = SkipGram(n_vocab, args.unit, loss_func)
    elif args.model == 'cbow':
        model = ContinuousBoW(n_vocab, args.unit, loss_func)
    else:
        raise Exception('Unknown model type: {}'.format(args.model))

    if args.gpu >= 0:
        model.to_gpu()

    # Set up an optimizer
    optimizer = O.Adam()
    optimizer.setup(model)

    # Set up an iterator
    train = itertools.islice(wids_generator(), min(int(num_tokens * 0.05), 10000), sys.maxsize)
    val = itertools.islice(wids_generator(), 0, min(int(num_tokens * 0.05), 10000))
    train_iter = WindowIteratorIterator(train, args.window, args.batchsize)
    val_iter = WindowIteratorIterator(val, args.window, args.batchsize, repeat=False)
    # train_iter = WindowIterator(train, args.window, args.batchsize)
    # val_iter = WindowIterator(val, args.window, args.batchsize, repeat=False)

    # Set up an updater
    updater = training.updater.StandardUpdater(
        train_iter, optimizer, converter=convert, device=args.gpu)

    # Set up a trainer
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(extensions.Evaluator(
        val_iter, model, converter=convert, device=args.gpu))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss']))
    trainer.extend(extensions.ProgressBar())
    trainer.run()

    # Save the word2vec model
    with open('word2vec.model', 'w') as f:
        f.write('%d %d\n' % (len(index2word), args.unit))
        w = cuda.to_cpu(model.embed.W.data)
        for i, wi in enumerate(w):
            v = ' '.join(map(str, wi))
            f.write('%s %s\n' % (index2word[i], v))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', default=-1, type=int,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--unit', '-u', default=100, type=int, help='number of units')
    parser.add_argument('--window', '-w', default=5, type=int, help='window size')
    parser.add_argument('--batchsize', '-b', type=int, default=1000, help='learning minibatch size')
    parser.add_argument('--epoch', '-e', default=20, type=int, help='number of epochs to learn')
    parser.add_argument('--model', '-m', choices=['skipgram', 'cbow'], default='skipgram',
                        help='model type ("skipgram", "cbow")')
    parser.add_argument('--negative-size', default=5, type=int, help='number of negative samples')
    parser.add_argument('--out-type', '-o', choices=['hsm', 'ns', 'original'], default='hsm',
                        help='output model type ("hsm": hierarchical softmax, '
                        '"ns": negative sampling, "original": no approximation)')
    parser.add_argument('--out', default='result', help='Directory to output the result')
    parser.add_argument('--test', dest='test', action='store_true')
    parser.set_defaults(test=False)
    args = parser.parse_args()

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        cuda.check_cuda_available()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('Window: {}'.format(args.window))
    print('Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('Training model: {}'.format(args.model))
    print('Output type: {}'.format(args.out_type))
    print('')

    if args.gpu >= 0:
        cuda.get_device_from_id(args.gpu).use()

    # Load the dataset
    train, val, _ = chainer.datasets.get_ptb_words()
    counts = collections.Counter(train)
    counts.update(collections.Counter(val))
    n_vocab = max(train) + 1

    if args.test:
        train = train[:100]
        val = val[:100]

    vocab = chainer.datasets.get_ptb_words_vocabulary()
    index2word = {wid: word for word, wid in six.iteritems(vocab)}

    print('n_vocab: %d' % n_vocab)
    print('data length: %d' % len(train))

    if args.out_type == 'hsm':
        HSM = L.BinaryHierarchicalSoftmax
        tree = HSM.create_huffman_tree(counts)
        loss_func = HSM(args.unit, tree)
        loss_func.W.data[...] = 0
    elif args.out_type == 'ns':
        cs = [counts[w] for w in range(len(counts))]
        loss_func = L.NegativeSampling(args.unit, cs, args.negative_size)
        loss_func.W.data[...] = 0
    elif args.out_type == 'original':
        loss_func = SoftmaxCrossEntropyLoss(args.unit, n_vocab)
    else:
        raise Exception('Unknown output type: {}'.format(args.out_type))

    # Choose the model
    if args.model == 'skipgram':
        model = SkipGram(n_vocab, args.unit, loss_func)
    elif args.model == 'cbow':
        model = ContinuousBoW(n_vocab, args.unit, loss_func)
    else:
        raise Exception('Unknown model type: {}'.format(args.model))

    if args.gpu >= 0:
        model.to_gpu()

    # Set up an optimizer
    optimizer = O.Adam()
    optimizer.setup(model)

    # Set up an iterator
    train_iter = WindowIterator(train, args.window, args.batchsize)
    val_iter = WindowIterator(val, args.window, args.batchsize, repeat=False)

    # Set up an updater
    updater = training.updater.StandardUpdater(
        train_iter, optimizer, converter=convert, device=args.gpu)

    # Set up a trainer
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        extensions.Evaluator(val_iter, model, converter=convert, device=args.gpu))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport(['epoch', 'main/loss', 'validation/main/loss']))
    trainer.extend(extensions.ProgressBar())
    trainer.run()

    # Save the word2vec model
    with open('word2vec.model', 'w') as f:
        f.write('%d %d\n' % (len(index2word), args.unit))
        w = cuda.to_cpu(model.embed.W.data)
        for i, wi in enumerate(w):
            v = ' '.join(map(str, wi))
            f.write('%s %s\n' % (index2word[i], v))
index2word[ind] = word
counts[word2index[word]] += 1
dataset.append(word2index[word])

n_vocab = len(word2index)
print('n_vocab: %d' % n_vocab)
print('data length: %d' % len(dataset))

if args.out_type == 'hsm':
    HSM = L.BinaryHierarchicalSoftmax
    tree = HSM.create_huffman_tree(counts)
    loss_func = HSM(args.unit, tree)
elif args.out_type == 'ns':
    cs = [counts[w] for w in range(len(counts))]
    loss_func = L.NegativeSampling(args.unit, cs, 20)
elif args.out_type == 'original':
    loss_func = SoftmaxCrossEntropyLoss(args.unit, n_vocab)
else:
    raise Exception('Unknown output type: {}'.format(args.out_type))

if args.model == 'skipgram':
    model = SkipGram(n_vocab, args.unit, loss_func)
elif args.model == 'cbow':
    model = ContinuousBoW(n_vocab, args.unit, loss_func)
else:
    raise Exception('Unknown model type: {}'.format(args.model))

if args.gpu >= 0:
    model.to_gpu()
def main():
    args = get_args()

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        cuda.check_cuda_available()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('Window: {}'.format(args.window))
    print('Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('Training model: {}'.format(args.model))
    print('Output type: {}'.format(args.out_type))
    print('')

    train, val, _ = chainer.datasets.get_ptb_words()
    train: np.ndarray = train
    val: np.ndarray = val
    counts = collections.Counter(train)
    counts.update(collections.Counter(val))
    n_vocab: int = max(train) + 1
    assert len(train.shape) == 1
    assert len(val.shape) == 1

    if args.test:
        train: np.ndarray = train[:100]
        val: np.ndarray = val[:100]

    vocab: Dict[str, int] = chainer.datasets.get_ptb_words_vocabulary()
    index2word: Dict[int, str] = {wid: word for word, wid in vocab.items()}

    print('n_vocab: %d' % n_vocab)
    print('data length: %d' % len(train))

    if args.out_type == 'hsm':
        HSM = L.BinaryHierarchicalSoftmax
        tree = HSM.create_huffman_tree(counts)
        loss_func = HSM(args.unit, tree)
        loss_func.W.data[...] = 0
    elif args.out_type == 'ns':
        cs = [counts[w] for w in range(len(counts))]
        loss_func = L.NegativeSampling(args.unit, cs, args.negative_size)
        loss_func.W.data[...] = 0
    elif args.out_type == 'original':
        loss_func = SoftmaxCrossEntropyLoss(args.unit, n_vocab)
    else:
        raise Exception('Unknown output type: {}'.format(args.out_type))

    if args.model == 'skipgram':
        model = SkipGram(n_vocab, args.unit, loss_func)
    elif args.model == 'cbow':
        model = ContinuousBoW(n_vocab, args.unit, loss_func)
    else:
        raise Exception('Unknown model type: {}'.format(args.model))

    if args.gpu >= 0:
        model.to_gpu()

    optimizer = O.Adam()
    optimizer.setup(model)

    train_iter = WindowIterator(train, args.window, args.batchsize)
    val_iter = WindowIterator(val, args.window, args.batchsize, repeat=False)
    updater = training.StandardUpdater(train_iter, optimizer, converter=convert, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
    trainer.extend(
        extensions.Evaluator(val_iter, model, converter=convert, device=args.gpu))
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport(['epoch', 'main/loss', 'validation/main/loss']))
    trainer.extend(extensions.ProgressBar())
    trainer.run()

    with open('word2vec.model', 'w') as f:
        f.write('%d %d\n' % (len(index2word), args.unit))
        w = cuda.to_cpu(model.embed.W.data)
        for i, wi in enumerate(w):
            v = ' '.join(map(str, wi))
            f.write('%s %s\n' % (index2word[i], v))
def train(**args):
    set_seed(42)
    args = EasyDict(args)
    logger.info(args)

    dataset_file = Path(args.dataset_file)
    data = json.loads(dataset_file.read_text())
    ladder = data['ladder']
    train_data, valid_data = data['train'], data['valid']

    counter = Counter()
    pokes = train_data + valid_data
    for poke in pokes:
        counter.update(poke)

    counts = [0] * (args.topk + 1)
    index2poke = ['<unk>']
    for i, (name, freq) in enumerate(counter.most_common()):
        if i < args.topk:
            counts[i + 1] = freq
            index2poke.append(name)
        else:
            counts[0] += freq
    vocab = {x: i for i, x in enumerate(index2poke)}
    n_vocab = len(vocab)
    logger.info('n_vocab = {}'.format(n_vocab))

    train_data = vectorize(train_data, vocab)
    valid_data = vectorize(valid_data, vocab)
    X_valid, y_valid = convert(valid_data)
    X_train, y_train = convert(train_data)
    train = TupleDataset(X_train, y_train)
    valid = TupleDataset(X_valid, y_valid)
    logger.info('train size = {}'.format(len(train)))
    logger.info('valid size = {}'.format(len(valid)))

    train_iter = chainer.iterators.SerialIterator(train, 32)
    valid_iter = chainer.iterators.SerialIterator(valid, 32, repeat=False, shuffle=False)

    if args.loss_func == 'softmax':
        loss_func = SoftmaxCrossEntropyLoss(args.n_units, n_vocab)
    elif args.loss_func == 'ns':
        loss_func = L.NegativeSampling(args.n_units, counts, args.negative_size)
        loss_func.W.data[...] = 0
    else:
        raise ValueError('invalid loss_func: {}'.format(args.loss_func))

    prefix = '{}_{}_{}'.format(ladder, args.loss_func, args.n_units)
    model = ContinuousBoW(n_vocab, args.n_units, loss_func)

    optimizer = O.Adam()
    optimizer.setup(model)

    updater = training.updater.StandardUpdater(train_iter, optimizer)
    trainer = training.Trainer(updater, (10, 'epoch'), out='results')
    trainer.extend(extensions.Evaluator(valid_iter, model))
    trainer.extend(extensions.LogReport(log_name='{}_log'.format(prefix)))
    trainer.extend(
        extensions.PrintReport(['epoch', 'main/loss', 'validation/main/loss']))
    trainer.extend(extensions.ProgressBar())
    trainer.run()

    # Save the word2vec model
    Path('results').mkdir(exist_ok=True)
    poke2vec_file = 'results/{}_poke2vec.model'.format(prefix)
    with open(poke2vec_file, 'w') as f:
        f.write('%d %d\n' % (n_vocab, args.n_units))
        w = model.embed.W.data
        for i, wi in enumerate(w):
            v = ' '.join(map(str, wi))
            f.write('%s %s\n' % (index2poke[i], v))