def __init__(self, tgt_vocab, args, tgt_w2v):
    super(GRUDecoder, self).__init__()
    with self.init_scope():
        self.word2embedding = chainLinks.EmbedID(tgt_vocab.size, args.edim, ignore_label=-1)
        self.gru = BahadanauGRU(args.edim, args.nhid)
        self.U_o = chainLinks.Linear(args.nhid, args.nhid)
        self.V_o = chainLinks.Linear(args.edim, args.nhid)
        self.C_o = chainLinks.Linear(2 * args.nhid, args.nhid)
        self.W_o = chainLinks.Linear(args.nhid // 2, tgt_vocab.size)
        self.attention = additiveAttention(args.nhid)
    if tgt_w2v is not None:
        for i in range(tgt_vocab.size):
            word = tgt_vocab.id2word[i]
            if word in tgt_w2v:
                self.word2embedding.W.data[i] = tgt_w2v[word]
    self.vocab_size = tgt_vocab.size
    self.embedding_size = args.edim
    self.hidden_size = args.nhid
    self.gen_limit = args.genlimit
    if args.useDropout:
        self.use_dropout = args.useDropout
        self.dropoutr = args.dlr
    else:
        self.use_dropout = None
        self.dropoutr = 0
    util.trace('{}'.format(chainer.global_config.__dict__))
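# A minimal numpy sketch (illustrative, not part of the model) of the deep-output layer that the
# projections above implement, following Bahdanau et al. (2015): t = U_o*s + V_o*E[y] + C_o*c is
# reduced by a maxout over pairs of units, which is why W_o maps nhid // 2 units to the
# vocabulary size. All shapes and names below are assumptions made for illustration.
import numpy as np

def deep_output_sketch(s, emb, ctx, U_o, V_o, C_o, W_o):
    """s: decoder state (nhid,); emb: previous target embedding (edim,); ctx: context vector (2*nhid,)."""
    t = U_o.dot(s) + V_o.dot(emb) + C_o.dot(ctx)   # (nhid,)
    t_max = t.reshape(-1, 2).max(axis=1)           # maxout over unit pairs -> (nhid // 2,)
    return W_o.dot(t_max)                          # unnormalized scores over the target vocabulary

if __name__ == '__main__':
    nhid, edim, vocab = 8, 6, 10
    rng = np.random.RandomState(0)
    scores = deep_output_sketch(rng.randn(nhid), rng.randn(edim), rng.randn(2 * nhid),
                                rng.randn(nhid, nhid), rng.randn(nhid, edim),
                                rng.randn(nhid, 2 * nhid), rng.randn(vocab, nhid // 2))
    print(scores.shape)  # (10,)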
def __init__(self, src_vocab, args, src_w2v):
    super(biGRU_encoder, self).__init__()
    with self.init_scope():
        self.word2embed = chainLinks.EmbedID(src_vocab.size, args.edim, ignore_label=-1)
        self.embed2hiddenf = chainLinks.GRU(args.edim, args.nhid)
        self.embed2hiddenb = chainLinks.GRU(args.edim, args.nhid)
    # the embedding weights are initialized from word2vec.
    if src_w2v is not None:
        for i in range(src_vocab.size):
            word = src_vocab.id2word[i]
            if word in src_w2v:
                self.word2embed.W.data[i] = src_w2v[word]
    self.vocab_size = src_vocab.size
    self.embedding_size = args.edim
    self.hidden_size = args.nhid
    if args.useDropout:
        self.use_dropout = args.useDropout
        self.dropoutr = args.dlr
    else:
        self.use_dropout = None
        self.dropoutr = 0
    util.trace('{}'.format(chainer.global_config.__dict__))
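# A minimal sketch (an assumption about the forward pass, which is not shown here) of how the
# forward and backward GRU links above are typically used: the sentence is read in both
# directions and the two hidden states are concatenated per word, giving the 2 * nhid
# annotation vectors that the decoder's C_o projection consumes. Toy numpy stand-ins replace
# the actual Chainer links.
import numpy as np

def bi_encode_sketch(embs, step_f, step_b):
    """embs: list of embedding vectors; step_f/step_b: toy recurrence steps standing in for the GRUs."""
    h, fwd = np.zeros_like(embs[0]), []
    for e in embs:                      # left-to-right pass
        h = step_f(h, e)
        fwd.append(h)
    h, bwd = np.zeros_like(embs[0]), []
    for e in reversed(embs):            # right-to-left pass
        h = step_b(h, e)
        bwd.append(h)
    bwd.reverse()
    return [np.concatenate([f, b]) for f, b in zip(fwd, bwd)]  # one 2*nhid annotation per word

if __name__ == '__main__':
    toy_step = lambda h, e: np.tanh(h + e)   # stand-in for a GRU update
    annotations = bi_encode_sketch([np.ones(4), np.zeros(4), -np.ones(4)], toy_step, toy_step)
    print(len(annotations), annotations[0].shape)  # 3 (8,)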
def make_matrix(co_occur, word_occur, context_occur, model, th=1):
    """
    Build the matrix that expresses the word graph.
    word_name: word to id
    num_pairs: sum of all co-occurrence counts
    feature_list: a list whose elements are dicts (key: co-occurring word, value: occurrence count or PMI)
    co_word_dict: key is a word, values are the co-occurring words around that word
    word_dict: key is the co-occurring word, value is the occurrence count (Freq) or PPMI value (PPMI)
    """
    vectorizer = sklearn.feature_extraction.DictVectorizer()
    word_name = dict()
    feature_list = list()
    co_word_dict = collections.defaultdict(list)
    num_pairs = 0
    for value in co_occur.values():
        num_pairs += value
    for word in co_occur.keys():
        w, c = word.split('\t')
        # if the co-occurrence appears only once (or no more than the threshold),
        # the corresponding matrix element is set to 0.
        if co_occur[word] <= th:
            c = c + '@@@@@None'
        co_word_dict[w].append(c)
    counter = 0
    # calculation of the matrix elements
    for word, co_words in sorted(co_word_dict.items()):
        if counter % 10000 == 0:
            util.trace('add word num :' + str(counter))
        word_name[word] = counter
        word_dict = dict()
        for c in co_words:
            if c.find('@@@@@None') != -1:
                word_dict[c.rsplit('@@@@@None')[0]] = 0
            else:
                if model == 'Freq':
                    word_dict[c] = co_occur['{}\t{}'.format(word, c)]
                elif model == 'PPMI':
                    pmi_value = returnPmi(word, c, co_occur, word_occur, context_occur, num_pairs)
                    pmi_value += np.log2(co_occur['{}\t{}'.format(word, c)])
                    word_dict[c] = max(0, pmi_value)
        feature_list.append(word_dict)
        counter += 1
    matrix = (word_name, vectorizer.fit_transform(feature_list))
    return matrix, vectorizer
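# A hedged sketch of the PMI that returnPmi is assumed to compute (its definition is not shown
# here): pmi(w, c) = log2( count(w, c) * N / (count(w) * count(c)) ), with N = num_pairs. Note
# that make_matrix above additionally adds log2(count(w, c)) before clipping at 0, i.e. a
# frequency-weighted positive PMI. The function name and exact formula are assumptions.
import numpy as np

def return_pmi_sketch(word, context, co_occur, word_occur, context_occur, num_pairs):
    joint = co_occur['{}\t{}'.format(word, context)]
    return np.log2(joint * num_pairs / (word_occur[word] * context_occur[context]))

if __name__ == '__main__':
    co = {'dog\tcat': 2}; w = {'dog': 4}; c = {'cat': 5}
    print(return_pmi_sketch('dog', 'cat', co, w, c, 100))  # log2(2*100 / (4*5)) = log2(10)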
def __init__(self, src_vocab, tgt_vocab, args):
    super().__init__()
    with self.init_scope():
        self.word2embed = chainLinks.EmbedID(src_vocab.size, args.nhid, ignore_label=-1)
        # W_1, U
        self.embed2hidden = chainLinks.GRU(args.nhid, args.nhid)
        # W_2
        self.W_2 = chainLinks.Linear(args.nhid, tgt_vocab.size)
    self.vocab_size = src_vocab.size
    self.hidden_size = args.nhid
    if args.useDropout:
        self.use_dropout = args.useDropout
        self.dropoutr = args.dlr
    else:
        self.use_dropout = None
        self.dropoutr = 0
    util.trace('{}'.format(chainer.global_config.__dict__))
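# A hedged sketch (an assumption, not the repository's forward pass) of how the three links
# above would typically be chained per time step: embed the source id, update the recurrent
# state, and project the state onto the target vocabulary with W_2. Toy numpy stand-ins are
# used instead of the Chainer links.
import numpy as np

def simple_step_sketch(x_id, h, E, W_2, recur_step):
    """x_id: source word id; h: previous hidden state; E: embedding matrix; recur_step: toy recurrence."""
    e = E[x_id]                 # word2embed
    h = recur_step(h, e)        # embed2hidden (the GRU keeps its own state in the real model)
    return h, W_2.dot(h)        # W_2: unnormalized scores over the target vocabulary

if __name__ == '__main__':
    nhid, src_v, tgt_v = 4, 6, 5
    rng = np.random.RandomState(0)
    E, W_2 = rng.randn(src_v, nhid), rng.randn(tgt_v, nhid)
    h = np.zeros(nhid)
    for x in [1, 3, 2]:
        h, scores = simple_step_sketch(x, h, E, W_2, lambda hp, e: np.tanh(hp + e))
    print(scores.shape)  # (5,)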
def extract_context(input_file, window_size):
    """
    Count the word co-occurrences in the input file.
    co_occur: counts of each (center word, context word) pair, keyed as 'word\tcontext'
    word_occur: how often each word appears as the center of a pair
    context_occur: how often each word appears as the context of a pair
    This is a very naive implementation; it could be refined.
    """
    co_occur = collections.defaultdict(int)
    word_occur = collections.defaultdict(int)
    context_occur = collections.defaultdict(int)
    i_f = open(input_file)
    for num, line in enumerate(i_f):
        if num % 10000 == 0:
            util.trace('look at sent: ' + str(num))
        words = line.strip().split()
        for i in range(len(words)):
            # left words
            for j in range(1, window_size + 1):
                if i - j > -1:
                    co_occur['{}\t{}'.format(words[i], words[i - j])] += 1
                    word_occur[words[i]] += 1
                    context_occur[words[i - j]] += 1
                else:
                    continue
            # right words
            for j in range(1, window_size + 1):
                if i + j < len(words):
                    co_occur['{}\t{}'.format(words[i], words[i + j])] += 1
                    word_occur[words[i]] += 1
                    context_occur[words[i + j]] += 1
                else:
                    continue
    i_f.close()
    return co_occur, word_occur, context_occur
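# A self-contained toy illustration (not the repository's code) of the windowing above: with
# window_size = 2, each word is paired with up to two neighbours on each side, and each pair is
# stored under a tab-separated key exactly as in extract_context.
import collections

def toy_window_counts(words, window_size=2):
    co = collections.defaultdict(int)
    for i in range(len(words)):
        for j in range(1, window_size + 1):
            if i - j >= 0:
                co['{}\t{}'.format(words[i], words[i - j])] += 1
            if i + j < len(words):
                co['{}\t{}'.format(words[i], words[i + j])] += 1
    return co

if __name__ == '__main__':
    print(dict(toy_window_counts('a b c'.split())))
    # {'a\tb': 1, 'a\tc': 1, 'b\ta': 1, 'b\tc': 1, 'c\tb': 1, 'c\ta': 1}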
def test(args):
    src_word2vec = None
    tgt_word2vec = None
    util.trace('start testing ...')
    corpus_file = args.vocabdir + '/test.' + args.sourcelang
    output_file = args.savedir + '/{}.generate.{}-{}'.format(
        args.name, args.sourcelang, args.targetlang)
    args.name = args.datadir + '/{}.{:03d}'.format(args.name, args.epochNum)
    chainer.global_config.train = False
    #chainer.global_config.debug = True
    util.trace('chainer config: {}'.format(chainer.global_config.__dict__))
    util.trace('load vocab...')
    vocab_file = args.vocabdir + '/vocabulary.'
    source_vocab = util.Vocabulary.load(vocab_file + args.sourcelang)
    target_vocab = util.Vocabulary.load(vocab_file + args.targetlang)
    util.trace('Loading Model ...')
    NMTmodel = nmt_model.BahdanauNMT(source_vocab, target_vocab, args, src_word2vec, tgt_word2vec)
    if args.useGPU >= 0:
        import cupy as xp
        chainer.cuda.check_cuda_available()
        chainer.cuda.get_device(args.useGPU).use()
        NMTmodel.to_gpu()
        util.trace('use GPU id: {}'.format(args.useGPU))
    else:
        import numpy as xp
        args.useGPU = -1
        util.trace('without GPU')
    chainer.serializers.load_npz('{}.weights'.format(args.name), NMTmodel)
    util.trace('Generating translation ...')
    finished = 0
    with open(output_file, 'w') as o_f:
        for src_sent in util.monoBatch(corpus_file, source_vocab, args):
            util.trace('Sample {} ...'.format(finished + 1))
            prds = NMTmodel.generate(src_sent)
            for predict in util.convert_b2w(prds, target_vocab):
                o_f.write('{}\n'.format(predict))
            finished += 1
                                     epilog='end')
    parser.add_argument('sourcelang')
    parser.add_argument('targetlang')
    parser.add_argument('-datadir', help='folder where the weight files are saved', default='')
    parser.add_argument('-vocabdir', help='folder where the vocabulary files are saved', default='')
    parser.add_argument('-savedir', help='folder where the output sentences are saved', default='')
    parser.add_argument('-useGPU', type=int, default=-1)
    parser.add_argument('-epochNum', type=int, help='epoch number of the model checkpoint to load')
    parser.add_argument('-genlimit', type=int, help='generation limit')
    parser.add_argument('-name', default='sample', help='model name')
    parser.add_argument('-edim', default=512, type=int, help='embedding size for model')
    parser.add_argument('-nhid', default=1024, type=int, help='hidden size for model')
    parser.add_argument('-useDropout', action='store_true')
    args = parser.parse_args()
    test(args)
    util.trace('finish generation')
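# Example invocation of this generation script (illustrative only; the script name, language
# codes, and paths are assumptions, while the flags are the ones defined above):
#   python test.py en ja -datadir data -vocabdir data -savedir out \
#       -epochNum 10 -genlimit 60 -edim 512 -nhid 1024 -name sample -useGPU 0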
def main():
    parser = argparse.ArgumentParser(usage='sorry, look at readme...',
                                     description='arg description', epilog='end')
    parser.add_argument('inputF', help='write the file name of the input text.')
    parser.add_argument('-model', help='select Freq or PPMI.', default='PPMI', choices=['Freq', 'PPMI'])
    parser.add_argument('-outF', help='write the output file name.', default='sample')
    parser.add_argument('-window', help='define the window size.', type=int, default=2)
    parser.add_argument('-iter', help='the number of HITS iterations.', type=int, default=300)
    parser.add_argument('-vocabSize', help='define the vocabulary size. default is all.', type=int, default=None)
    args = parser.parse_args()
    # counting co-occurrences
    util.trace('count the co-occurrence')
    co_occur, word_occur, context_occur = word_graph.extract_context(args.inputF, args.window)
    util.trace('vocabulary size of the input data is {}.'.format(len(word_occur)))
    if args.vocabSize:
        vocabSize = args.vocabSize
    else:
        vocabSize = len(word_occur)
    # calculate the matrix
    util.trace('make matrix (word-graph)')
    matrix, vec = word_graph.make_matrix(co_occur, word_occur, context_occur, args.model)
    # save data (matrix)
    util.trace('save the matrix')
    util.save_data(matrix, args.outF + '/pmi_matrix_{}.pickle'.format(args.model))
    util.save_data(vec, args.outF + '/pmi_vectorizer_{}.pickle'.format(args.model))
    # get the initial vector
    HITS_obj = hits.HITS(matrix)
    # the matrix is symmetric, so the authority score equals the hub score.
    util.trace('start HITS')
    i = HITS_obj.startHITS(args.iter).toarray()
    util.trace('finish HITS')
    # write the words ranked by HITS
    util.trace('write the vocabulary')
    util.writeVocab(HITS_obj, i, vocabSize, args.outF + '/vocab_file.hits')
    util.trace('finish program')
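# A minimal sketch (an assumption about hits.HITS's internals, which are not shown here) of
# HITS power iteration on a symmetric weight matrix A: the hub update h = A a and the authority
# update a = A^T h coincide, so a single score vector can be iterated and renormalized.
import numpy as np

def hits_sketch(A, n_iter=50):
    a = np.ones(A.shape[0])
    for _ in range(n_iter):
        a = A.T.dot(A.dot(a))          # authority update a <- A^T A a
        a /= np.linalg.norm(a)         # renormalize to avoid overflow
    return a

if __name__ == '__main__':
    A = np.array([[0., 1., 1.], [1., 0., 1.], [1., 1., 0.]])
    print(hits_sketch(A))              # all nodes equally central in this toy graph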
def train(args):
    start_epoch = 0
    corpus_file = args.datadir + '/train.'
    util.trace('start training...')
    chainer.global_config.train = True
    chainer.global_config.use_cudnn = 'always'
    chainer.global_config.type_check = True
    util.trace('chainer config: {}'.format(chainer.global_config.__dict__))
    util.trace('load vocab...')
    vocab_file = args.datadir + '/vocabulary.'
    source_vocab = util.Vocabulary.load(vocab_file + args.sourcelang)
    target_vocab = util.Vocabulary.load(vocab_file + args.targetlang)
    """
    util.trace('make vocab...')
    source_vocab = util.Vocabulary.make(corpus_file + args.sourcelang, 3000)
    target_vocab = util.Vocabulary.make(corpus_file + args.targetlang, 3000)
    """
    if args.gensim_mode == 'make':
        util.trace('making word2vec...')
        src_word2vec = util.make_word2vec(corpus_file + args.sourcelang, args.edim)
        tgt_word2vec = util.make_word2vec(corpus_file + args.targetlang, args.edim)
        util.save(src_word2vec, args.datadir + '/src_word2vec.' + args.sourcelang)
        util.save(tgt_word2vec, args.datadir + '/tgt_word2vec.' + args.targetlang)
    elif args.gensim_mode == 'load':
        util.trace('loading word2vec...')
        src_word2vec = util.load_word2vec(args.datadir + '/src_word2vec.' + args.sourcelang)
        tgt_word2vec = util.load_word2vec(args.datadir + '/tgt_word2vec.' + args.targetlang)
    elif args.gensim_mode == 'not':
        util.trace('do not use word2vec')
        src_word2vec = None
        tgt_word2vec = None
    util.trace('making model...')
    # initialize the model
    NMTmodel = nmt_model.BahdanauNMT(source_vocab, target_vocab, args, src_word2vec, tgt_word2vec)
    if args.gpunum >= 0:
        import cupy as xp
        chainer.cuda.check_cuda_available()
        chainer.cuda.get_device(args.gpunum).use()
        NMTmodel.to_gpu()
        util.trace('use GPU id: {}'.format(args.gpunum))
    else:
        import numpy as xp
        args.gpunum = -1
        util.trace('without GPU')
    util.trace('random seed: {}'.format(args.seed_num))
    np.random.seed(args.seed_num)
    xp.random.seed(args.seed_num)
    random.seed(args.seed_num)
    # TODO: build the optimizer from args.optim; for now AdaGrad is always used.
    optim = chainer.optimizers.AdaGrad(lr=args.lr)
    optim.setup(NMTmodel)
    optim.add_hook(chainer.optimizer.GradientClipping(args.grad_clip))
    for epoch in range(start_epoch, args.epoch):
        util.trace('Epoch {}/{}'.format(epoch + 1, args.epoch))
        accum_loss = 0.0
        num_sent = 0
        for batch_src, batch_tgt in util.miniBatch(corpus_file + args.sourcelang, corpus_file + args.targetlang,
                                                   source_vocab, target_vocab, args.batch, args.pooling):
            NMTmodel.zerograds()
            loss, batch_hyp = NMTmodel(batch_src, batch_tgt)
            accum_loss += loss.data
            loss.backward()
            optim.update()
            for src, tgt, hyp in zip(util.convert_b2w(batch_src, source_vocab),
                                     util.convert_b2w(batch_tgt, target_vocab),
                                     util.convert_b2w(batch_hyp, target_vocab)):
                util.trace('Epoch {}/{}, {} sent'.format(epoch + 1, args.epoch, num_sent + 1))
                util.trace('src: {}'.format(src))
                util.trace('tgt: {}'.format(tgt))
                util.trace('hyp: {}'.format(hyp))
                num_sent += 1
        util.trace('accum_loss: {}'.format(accum_loss))
        util.trace('Save model ...')
        model_name = '{}.{:03d}'.format(args.name, epoch + 1)
        chainer.serializers.save_npz(args.savedir + '/{}.weights'.format(model_name), NMTmodel)
        chainer.serializers.save_npz(args.savedir + '/{}.optimizer'.format(model_name), optim)
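# A hedged sketch of one way to honour args.optim instead of hard-coding AdaGrad (this is not
# the repository's current behaviour; the constructor arguments follow Chainer's optimizers,
# e.g. Adam takes alpha rather than lr).
import chainer

def build_optimizer(name, lr):
    if name == 'SGD':
        return chainer.optimizers.SGD(lr=lr)
    if name == 'Adam':
        return chainer.optimizers.Adam(alpha=lr)
    return chainer.optimizers.AdaGrad(lr=lr)   # default, matching the current training code

# usage sketch:
#   optim = build_optimizer(args.optim, args.lr)
#   optim.setup(NMTmodel)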
    parser.add_argument('sourcelang')
    parser.add_argument('targetlang')
    parser.add_argument('-datadir', help='data directory to use corpus and vocab', default='')
    parser.add_argument('-savedir', help='save directory for weights', default='')
    #parser.add_argument('-model', help='model for neural MT', default='bahdanau')
    parser.add_argument('-edim', help='embedding size for model', type=int, default=512)
    parser.add_argument('-nhid', help='hidden size for model', type=int, default=512)
    parser.add_argument('-gensim_mode', help='use gensim for embedding: make, load, or not?', default='not', choices=['make', 'load', 'not'])
    #parser.add_argument('-gensimfileS', help='gensim file for source', default='')
    #parser.add_argument('-gensimfileT', help='gensim file for target', default='')
    #parser.add_argument('-nlayer', help='hidden layers for model; note: 1 layer using gensim corresponds to 2 layers without gensim', type=int, default=2)
    parser.add_argument('-optim', help='select optimizer', default='AdaGrad')
    parser.add_argument('-lr', help='learning rate for optimizer', type=float, default=0.01)
    parser.add_argument('-gpunum', help='GPU number (a negative value means CPU)', type=int, default=-1)
    parser.add_argument('-epoch', help='max epoch during training', type=int, default=50)
    parser.add_argument('-useDropout', help='use dropout or not', action='store_true')
    parser.add_argument('-dlr', help='dropout rate', type=float, default=0.2)
    parser.add_argument('-batch', help='batch size', type=int, default=100)
    parser.add_argument('-pooling', help='pooling size', type=int, default=100)
    parser.add_argument('-genlimit', help='generation limit', type=int, default=60)
    #parser.add_argument('-useBeam', help='use beam search or not?', action='store_true')
    #parser.add_argument('-beamsize', help='beam size', type=int, default=2)
    parser.add_argument('-grad_clip', help='gradient clipping', type=float, default=5.0)
    parser.add_argument('-useSeed', help='use random seed or not?', action='store_true')
    parser.add_argument('-seed_num', help='random seed number', type=int, default=2434)
    parser.add_argument('-name', help='model name, default is "sample"', default='sample')
    args = parser.parse_args()
    train(args)
    util.trace('finish training!')
    tgt_vocab = util.Vocabulary.make(args.savedir + '/train.' + args.targetlang, args.tgtvocab_size)
    #tgt_vocab = util.Vocabulary.make(tgt_corpus, args.tgtvocab_size)
    src_vocab_file = args.savedir + '/vocabulary.' + args.sourcelang
    tgt_vocab_file = args.savedir + '/vocabulary.' + args.targetlang
    src_vocab.save(src_vocab_file)
    tgt_vocab.save(tgt_vocab_file)
    # word2vec
    if args.word2vec:
        # default: 5 workers.
        src_w2v = util.make_word2vec(args.savedir + '/train.' + args.sourcelang, args.edim)
        tgt_w2v = util.make_word2vec(args.savedir + '/train.' + args.targetlang, args.edim)
        util.save_word2vec(src_w2v, args.savedir + '/src_word2vec.' + args.sourcelang)
        util.save_word2vec(tgt_w2v, args.savedir + '/tgt_word2vec.' + args.targetlang)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(usage='sorry, see the readme.', description='arg description', epilog='end')
    parser.add_argument('sourcelang')
    parser.add_argument('targetlang')
    parser.add_argument('-srcfile', help='source corpus file path')
    parser.add_argument('-tgtfile', help='target corpus file path')
    parser.add_argument('-savedir', help='save directory for the training corpus')
    parser.add_argument('-word2vec', help='use word2vec or not', action='store_true')
    parser.add_argument('-edim', help='embedding size', type=int)
    parser.add_argument('-srcvocab_size', help='vocabulary size for src', type=int)
    parser.add_argument('-tgtvocab_size', help='vocabulary size for tgt', type=int)
    #parser.add_argument('-vocab_thred', help='words below this threshold are not used for the vocabulary', type=int)
    args = parser.parse_args()
    preprocess(args)
    util.trace('finish preprocess')