def __init__(self, vocab_size, embed_size, hidden_size, output_size,
             pre_word_embed=None, dropout=0.5, use_gpu=False):
    super(Intent_Model, self).__init__()
    self.use_gpu = use_gpu
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.output_size = output_size
    self.dropout = nn.Dropout(dropout)

    self.word_embed = nn.Embedding(vocab_size, embed_size)
    if pre_word_embed is not None:
        self.word_embed.weight = nn.Parameter(torch.FloatTensor(pre_word_embed))
        self.pre_word_embed = True
    else:
        self.pre_word_embed = False
        init_embedding(self.word_embed.weight)

    self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True,
                        batch_first=True)
    init_lstm(self.lstm)

    self.output_layer = nn.Linear(hidden_size * 2, output_size)
    init_linear(self.output_layer)
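The constructors in this collection lean on init_embedding / init_lstm / init_linear helpers. A minimal sketch of what such helpers typically look like in BiLSTM-CRF codebases follows; this is an assumption for illustration, and the actual utils implementations may differ in bounds or bias handling:

import numpy as np
import torch.nn as nn

def init_embedding(weight):
    # uniform init in [-b, b] with b = sqrt(3 / dim), giving each dimension
    # roughly unit variance
    b = np.sqrt(3.0 / weight.size(1))
    nn.init.uniform_(weight, -b, b)

def init_linear(layer):
    # fan-based uniform init for the weight, zero bias
    b = np.sqrt(6.0 / (layer.weight.size(0) + layer.weight.size(1)))
    nn.init.uniform_(layer.weight, -b, b)
    if layer.bias is not None:
        layer.bias.data.zero_()

def init_lstm(lstm):
    # fan-based uniform init for each gate matrix; zero the biases and set
    # the forget-gate slice to 1 (a common LSTM trick)
    for names in lstm._all_weights:
        for name in names:
            param = getattr(lstm, name)
            if name.startswith('weight'):
                b = np.sqrt(6.0 / (param.size(0) / 4 + param.size(1)))
                nn.init.uniform_(param, -b, b)
            else:
                param.data.zero_()
                n = param.size(0)
                param.data[n // 4: n // 2] = 1.0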
def rand_init(self, init_char_embedding=True, init_word_embedding=False):
    """
    random initialization

    args:
        init_char_embedding: random initialize char embedding or not
        init_word_embedding: random initialize word embedding or not
    """
    if init_char_embedding:
        utils.init_embedding(self.char_embeds.weight)
    if init_word_embedding:
        utils.init_embedding(self.word_embeds.weight)
    if self.if_highway:
        self.forw2char.rand_init()
        self.back2char.rand_init()
        self.forw2word.rand_init()
        self.back2word.rand_init()
        self.fb2char.rand_init()
    utils.init_lstm(self.forw_char_lstm)
    utils.init_lstm(self.back_char_lstm)
    utils.init_lstm(self.word_lstm)
    utils.init_linear(self.char_pre_train_out)
    utils.init_linear(self.word_pre_train_out)
    self.crf.rand_init()
def rand_init(self, init_embedding=False):
    """
    random initialization

    args:
        init_embedding: random initialize word embedding or not
    """
    if init_embedding:
        utils.init_embedding(self.word_embeds.weight)
    if self.position:
        utils.init_embedding(self.position_embeds.weight)
    utils.init_lstm(self.lstm)
    utils.init_linear(self.att2out)
def get_embed(self, fembed):
    with open(fembed, 'r') as f:
        lines = [line for line in f]
    splits = [line.split() for line in lines]
    # collect the vocabulary and embedding matrix from the pretrained file
    words, embed = zip(*[(split[0], list(map(float, split[1:])))
                         for split in splits])
    # extend the vocabulary
    self.extend(words)
    # initialize the word embeddings
    embed = torch.tensor(embed, dtype=torch.float)
    embed_indices = [self.wdict[w] for w in words]
    extended_embed = torch.Tensor(self.n_words, embed.size(1))
    init_embedding(extended_embed)
    extended_embed[embed_indices] = embed
    return extended_embed
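For reference, a standalone sketch of the same file-loading pattern without the vocab class (the file format is one token followed by its float vector per line, whitespace-separated; load_pretrained is an illustrative name, not from the source):

import torch

def load_pretrained(fembed):
    words, vectors = [], []
    with open(fembed, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) < 2:
                continue  # skip blank or malformed lines
            words.append(parts[0])
            vectors.append([float(x) for x in parts[1:]])
    return words, torch.tensor(vectors, dtype=torch.float)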
def rand_init(self, init_embedding=False):
    """
    random initialization

    args:
        init_embedding: random initialize word embedding or not
    """
    if init_embedding:
        utils.init_embedding(self.word_embeds.weight)
    if self.position:
        utils.init_embedding(self.position_embeds.weight)
    if self.enable_att:
        self.attention.rand_init()
    # initialize tree
    self.treernn.rand_init()
    # initialize linear layer
    utils.init_linear(self.linear)
def _test(train_args, pretrain_args, args):
    """Test saved model on specified speakers."""
    print('Testing', ', '.join(args.speakers), '...')
    # update args with new test args
    test_args = utils.set_new_args(train_args, args)
    # get test data and id_to_word lookup
    _, _, test_data, id_to_word = data_reader.get_data(test_args)
    # set configurations/hyperparameters for model
    _, test_config = utils.set_config(test_args, id_to_word)
    # model requires init embed but this will be overridden by restored model
    init_embed = utils.init_embedding(id_to_word, dim=test_args.embed_size,
                                      init_scale=test_args.init_scale,
                                      embed_path=test_args.embed_path)

    with tf.Graph().as_default():
        with tf.name_scope('Test'):
            with tf.variable_scope('Model', reuse=None):
                m_test = model.Model(test_args, is_training=False,
                                     config=test_config, init_embed=init_embed,
                                     name='Test')
                m_test.build_graph()

        init = tf.global_variables_initializer()

        # if pretrained, must create dict to initialize TF Saver
        if bool(pretrain_args):
            # get trainable variables and convert to dict for Saver
            reuse_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])
            # create saver for TF session (see function for addl details)
            saver = utils.create_tf_saver(args, pretrain_args, reuse_vars_dict)
        else:
            saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(init)
            print('Restoring model...')
            saver.restore(sess, test_args.load_path)

            # test model on specified speakers
            for test_ind, test_speaker in enumerate(test_args.speakers):
                for train_ind, train_speaker in enumerate(train_args.speakers):
                    print('Testing {0} with {1} model'.format(
                        test_speaker, train_speaker))
                    test_perplexity = _run_epoch(sess, m_test, test_args,
                                                 test_data, train_ind, test_ind)
                    print('Test Perplexity: {0:.3f}'.format(test_perplexity))
def __init__(self, vocab_size, embed_size, hidden_size, tag2id,
             pre_word_embed=None, dropout=0.5, use_gpu=False):
    super(BiLSTM_CRF, self).__init__()
    self.use_gpu = use_gpu
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.tag2id = tag2id
    self.tag_size = len(tag2id)

    # cnn after
    # self.cnn = CNN_Encoder(hidden_size, hidden_size)
    # self.bridge = nn.Linear(hidden_size * 2, self.hidden_size)
    self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True)

    # cnn before
    # self.cnn = CNN_Encoder(embed_size, hidden_size)
    # self.bridge = nn.Linear(hidden_size * 2, self.hidden_size)
    # self.lstm = nn.LSTM(hidden_size, hidden_size, bidirectional=True)

    self.dropout = nn.Dropout(dropout)
    self.word_embed = nn.Embedding(vocab_size, embed_size)
    if pre_word_embed is not None:
        self.word_embed.weight = nn.Parameter(torch.FloatTensor(pre_word_embed))
        self.pre_word_embed = True
    else:
        self.pre_word_embed = False
        init_embedding(self.word_embed.weight)
    init_lstm(self.lstm)

    self.hidden2tag = nn.Linear(hidden_size * 2, self.tag_size)
    init_linear(self.hidden2tag)
    self.tanh = nn.Tanh()

    # crf layer: transitions[i, j] is the score of transitioning *to* tag i
    # *from* tag j; never transition to START, never transition from STOP
    self.transitions = nn.Parameter(torch.zeros(self.tag_size, self.tag_size))
    self.transitions.data[tag2id['START'], :] = -10000
    self.transitions.data[:, tag2id['STOP']] = -10000
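For context, a minimal Viterbi decode showing how a transition matrix with this orientation (transitions[i, j] = score of moving to tag i from tag j) and the START/STOP constraints is typically consumed at inference time; this is a generic sketch, not the decoder from this codebase:

import torch

def viterbi_decode(emissions, transitions, tag2id):
    # emissions: (seq_len, tag_size) scores, e.g. from hidden2tag
    seq_len, tag_size = emissions.size()
    score = transitions[:, tag2id['START']] + emissions[0]
    backpointers = []
    for t in range(1, seq_len):
        # total[i, j] = best score ending in tag j at t-1, then moving to i
        total = score.unsqueeze(0) + transitions
        best_prev = total.argmax(dim=1)
        score = total.max(dim=1).values + emissions[t]
        backpointers.append(best_prev)
    score = score + transitions[tag2id['STOP']]
    best_last = int(score.argmax())
    path = [best_last]
    for bp in reversed(backpointers):
        path.append(int(bp[path[-1]]))
    return list(reversed(path))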
def _generate(train_args, pretrain_args, args):
    """Restore trained model and use to generate sample text."""
    # update args with new generate args
    gen_args = utils.set_new_args(train_args, args)
    # get id_to_word lookup
    _, _, _, id_to_word = data_reader.get_data(gen_args)
    # get hyperparameters corresponding to text generation
    gen_config, _ = utils.set_config(gen_args, id_to_word)
    # model requires init embed but this will be overridden by restored model
    init_embed = utils.init_embedding(id_to_word, dim=gen_args.embed_size,
                                      init_scale=gen_args.init_scale,
                                      embed_path=gen_args.embed_path)

    with tf.Graph().as_default():
        # use Train name scope as this contains trained model parameters
        with tf.name_scope('Train'):
            with tf.variable_scope('Model', reuse=None):
                m_gen = model.Model(gen_args, is_training=False,
                                    config=gen_config, init_embed=init_embed,
                                    name='Generate')
                m_gen.build_graph()

        init = tf.global_variables_initializer()

        # if pretrained, must create dict to initialize TF Saver
        if bool(pretrain_args):
            # get trainable variables and convert to dict for Saver
            reuse_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
            reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])
            # create saver for TF session (see function for addl details)
            saver = utils.create_tf_saver(args, pretrain_args, reuse_vars_dict)
        else:
            saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(init)
            print('Restoring model...')
            saver.restore(sess, gen_args.load_path)

            # generate text for all specified speakers
            for gen_ind, gen_speaker in enumerate(gen_args.speakers):
                print('Generating text for {0}'.format(gen_speaker))
                for train_ind, train_speaker in enumerate(train_args.speakers):
                    if gen_speaker == train_speaker:
                        generate_text(sess, m_gen, id_to_word, train_ind,
                                      args.temp)
def __init__(self, vocab_size, embed_size, hidden_size, tag2id,
             pre_word_embed=None, dropout=0.5, use_gpu=False):
    super(BiLSTM, self).__init__()
    self.use_gpu = use_gpu
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.tag2id = tag2id
    self.tag_size = len(tag2id)

    self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True)
    self.dropout = nn.Dropout(dropout)
    self.word_embed = nn.Embedding(vocab_size, embed_size)
    if pre_word_embed is not None:
        self.word_embed.weight = nn.Parameter(torch.FloatTensor(pre_word_embed))
        self.pre_word_embed = True
    else:
        self.pre_word_embed = False
        init_embedding(self.word_embed.weight)
    init_lstm(self.lstm)

    self.hidden2tag = nn.Linear(hidden_size * 2, self.tag_size)
    init_linear(self.hidden2tag)
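Illustrative construction of the BiLSTM tagger above (the tag set and sizes here are made up for the example):

tag2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'START': 3, 'STOP': 4}
model = BiLSTM(vocab_size=10000, embed_size=100, hidden_size=200,
               tag2id=tag2id, dropout=0.5, use_gpu=False)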
def rand_init(self):
    """
    random initialization of the char embedding and downstream layers
    """
    utils.init_embedding(self.char_embeds.weight)
    if self.char_lstm:
        utils.init_lstm(self.forw_char_lstm)
        utils.init_lstm(self.back_char_lstm)
        utils.init_lstm(self.word_lstm_lm)
        utils.init_linear(self.char_pre_train_out)
        utils.init_linear(self.word_pre_train_out)
        if self.if_highway:
            self.forw2char.rand_init()
            self.back2char.rand_init()
            self.forw2word.rand_init()
            self.back2word.rand_init()
            self.fb2char.rand_init()
    else:
        utils.init_lstm(self.word_lstm_cnn)
def run_train():
    data = {
        'trainAnswers': [],
        'devAnswers': [],
        'trainQuestions': [],
        'devQuestions': []
    }
    data['trainAnswers'].extend(
        map(lambda x: x.split(' '),
            open('opennmt-kb-char-' + str(dev_id) + '/train.txt.tgt',
                 encoding="utf-8").read().strip().split('\n')))
    data['trainQuestions'].extend(
        map(lambda x: x.split(' '),
            open('opennmt-kb-char-' + str(dev_id) + '/train.txt.src',
                 encoding="utf-8").read().strip().split('\n')))
    data['devAnswers'].extend(
        map(lambda x: x.split(' '),
            open('opennmt-kb-char-' + str(dev_id) + '/val.txt.tgt',
                 encoding="utf-8").read().strip().split('\n')))
    data['devQuestions'].extend(
        map(lambda x: x.split(' '),
            open('opennmt-kb-char-' + str(dev_id) + '/val.txt.src',
                 encoding="utf-8").read().strip().split('\n')))

    # for debug
    '''
    data['trainAnswers'] = data['trainAnswers'][:5]
    data['trainQuestions'] = data['trainQuestions'][:5]
    data['devAnswers'] = [list(x) for x in data['trainAnswers']]
    data['devQuestions'] = [list(x) for x in data['trainQuestions']]
    '''

    # build the (word) vocabulary
    if os.path.exists(model_dir + '/vocab_word'):
        with open(model_dir + '/vocab_word', 'rb') as t:
            vocab_word = pickle.load(t)
    else:
        vocab_word = prepare_vocabulary(data, cut=cut)
        with open(model_dir + '/vocab_word', 'wb') as t:
            pickle.dump(vocab_word, t)
    print("========================word===========================")
    print('dec_vocab_size: ', vocab_word.n_words_for_decoder)
    print('vocab_size: ', vocab_word.n_words)
    print('max_word_length: ',
          max(map(lambda x: len(x), vocab_word.word2index)))

    # build the dataset (truncate; generate negative examples)
    if os.path.exists(model_dir + '/data'):
        with open(model_dir + '/data', 'rb') as t:
            train_examples, dev_examples = pickle.load(t)
    else:
        train_examples = gen_data(vocab_word, data['trainQuestions'],
                                  data['trainAnswers'], 1, max_length,
                                  max_num_utterance)
        dev_examples = gen_data(vocab_word, data['devQuestions'],
                                data['devAnswers'], 10, max_length,
                                max_num_utterance)
        with open(model_dir + '/data', 'wb') as t:
            pickle.dump((train_examples, dev_examples), t)
    print(train_examples[0][1])
    print(train_examples[0][2])
    print(dev_examples[0][1])
    print(dev_examples[0][2])
    print("========================dataset===========================")
    print('train: ', len(train_examples[0]), len(train_examples[1]),
          len(train_examples[2]))
    print('dev: ', len(dev_examples[0]), len(dev_examples[1]),
          len(dev_examples[2]))

    embed = init_embedding(embed_size, vocab_word.n_words,
                           vocab_word.word2index, True)
    embedding = nn.Embedding(vocab_word.n_words, embed_size,
                             padding_idx=0).from_pretrained(embed, freeze=False)
    dam = DAM(embed_size, max_num_utterance, max_length, max_stacks)
    embedding = torch.nn.DataParallel(embedding).to(device_cuda)
    dam = torch.nn.DataParallel(dam).to(device_cuda)

    if os.path.isfile(model_dir + '/embedding'):
        embedding.load_state_dict(torch.load(model_dir + '/embedding'))
    if os.path.isfile(model_dir + '/dam'):
        dam.load_state_dict(torch.load(model_dir + '/dam'))

    optimizer = optim.Adam([{"params": embedding.parameters()},
                            {"params": dam.parameters()}],
                           lr=lr, amsgrad=True)
    if os.path.isfile(model_dir + '/optimizer'):
        optimizer.load_state_dict(torch.load(model_dir + '/optimizer'))

    trainIters(vocab_word, embedding, dam, optimizer, train_examples,
               dev_examples, n_epochs, lr, batch_size, infer_batch_size,
               print_every=1)
args['epoch'] = 10            # number of training epochs
args['hidden_dim'] = 100      # dimension of the LSTM cell output
args['optimizer'] = 'Adam'    # optimization method for the loss
args['lr'] = 0.001            # learning rate
args['clip'] = 5.0            # gradient clipping threshold
args['dropout'] = 0.5         # keep probability
args['update_embedding'] = True  # whether to update the embedding; after it is
                                 # initialized, setting this to True lets the
                                 # embedding be updated during training
args['embedding_dim'] = 100   # embedding dimension
args['shuffle'] = True        # whether to shuffle the data each time it is fed
                              # to the LSTM for training

# load the dictionary mapping each character to an id; it was built from the
# training data
word2id = read_dict(os.path.join('.', args['train_data'], 'word2id.pkl'))

# randomly initialize the embedding
embeddings = init_embedding(word2id, args['embedding_dim'])

# set the model output paths
model_path = 'BLCM3'
output_path = os.path.join('.', model_path)
if not os.path.exists(output_path):
    os.makedirs(output_path)
summary_path = os.path.join(output_path, "summaries")
if not os.path.exists(summary_path):
    os.makedirs(summary_path)
model_path = os.path.join(output_path, "checkpoints/")
# if not os.path.exists(model_path):
#     os.makedirs(model_path)
ckpt_prefix = os.path.join(model_path, "model")
result_path = os.path.join(output_path, "results")
if not os.path.exists(result_path):
    os.makedirs(result_path)
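A plausible sketch of the init_embedding called above (vocab dict plus dimension, returning a float32 matrix suitable for a TensorFlow variable); the uniform bounds are an assumption borrowed from common practice, not confirmed by this snippet:

import numpy as np

def init_embedding(vocab, embedding_dim):
    # one random row per vocabulary entry
    embedding_mat = np.random.uniform(-0.25, 0.25,
                                      (len(vocab), embedding_dim))
    return np.float32(embedding_mat)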
def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim,
             char_to_ix=None, pre_word_embeds=None, char_out_dimension=25,
             char_embedding_dim=25, use_gpu=True, use_crf=True,
             char_mode='CNN', encoder_mode='LSTM', dropout=0.5):
    '''
    Input parameters:
        vocab_size = Size of vocabulary (int)
        tag_to_ix = Dictionary that maps NER tags to indices
        embedding_dim = Dimension of word embeddings (int)
        hidden_dim = The hidden dimension of the LSTM layer (int)
        char_to_ix = Dictionary that maps characters to indices
        pre_word_embeds = Numpy array which provides mapping from word
            embeddings to word indices
        char_out_dimension = Output dimension from the CNN encoder for character
        char_embedding_dim = Dimension of the character embeddings
        use_gpu = defines availability of GPU; when True, CUDA function calls
            are made, else normal CPU function calls are made
        use_crf = parameter which decides if you want to use the CRF layer for
            output decoding
    '''
    super(BiLSTM_CRF, self).__init__()

    # parameter initialization for the model
    self.use_gpu = use_gpu
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.vocab_size = vocab_size
    self.tag_to_ix = tag_to_ix
    self.use_crf = use_crf
    self.tagset_size = len(tag_to_ix)
    self.out_channels = char_out_dimension
    self.char_mode = char_mode
    self.encoder_mode = encoder_mode
    self.char_lstm_dim = char_out_dimension

    if char_embedding_dim is not None:
        self.char_embedding_dim = char_embedding_dim

        # Initializing the character embedding layer
        self.char_embeds = nn.Embedding(len(char_to_ix), char_embedding_dim)
        init_embedding(self.char_embeds.weight)

        # Performing LSTM encoding on the character embeddings
        if self.char_mode == 'LSTM':
            self.char_lstm = nn.LSTM(char_embedding_dim, self.char_lstm_dim,
                                     num_layers=1, bidirectional=True)
            init_lstm(self.char_lstm)

        # Performing CNN encoding on the character embeddings
        if self.char_mode == 'CNN':
            self.char_cnn3 = nn.Conv2d(in_channels=1,
                                       out_channels=self.out_channels,
                                       kernel_size=(3, char_embedding_dim),
                                       padding=(2, 0))

    # Creating Embedding layer with dimension of (number of words * dimension of each word)
    self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
    if pre_word_embeds is not None:
        # Initializes the word embeddings with pretrained word embeddings
        self.pre_word_embeds = True
        self.word_embeds.weight = nn.Parameter(torch.FloatTensor(pre_word_embeds))
    else:
        self.pre_word_embeds = False

    # Initializing the dropout layer, with dropout specified in parameters
    self.dropout = nn.Dropout(dropout)

    # LSTM layer:
    if self.encoder_mode == 'LSTM':
        if self.char_mode == 'LSTM':
            self.lstm = nn.LSTM(embedding_dim + self.char_lstm_dim * 2,
                                hidden_dim, bidirectional=True)
        if self.char_mode == 'CNN':
            self.lstm = nn.LSTM(embedding_dim + self.out_channels,
                                hidden_dim, bidirectional=True)
        # Initializing the lstm layer using predefined function for initialization
        init_lstm(self.lstm)

        # Linear layer which maps the output of the bidirectional LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim * 2, self.tagset_size)

    # CNN (One-layer):
    if self.encoder_mode == 'CNN':
        # Conv layer
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=hidden_dim * 2,
                               kernel_size=(1, 1), padding=(0, 0))
        # Initializing the conv layer
        nn.init.xavier_uniform_(self.conv1.weight)
        if self.char_mode == 'LSTM':
            print(f'embedding_dim={embedding_dim}, '
                  f'char_lstm_dim={self.char_lstm_dim*2}, '
                  f'in={embedding_dim+self.char_lstm_dim*2}')
            self.maxpool1 = nn.MaxPool2d(
                (1, embedding_dim + self.char_lstm_dim * 2))
        if self.char_mode == 'CNN':
            print(f'embedding_dim={embedding_dim}, '
                  f'self.out_channels={self.out_channels}, '
                  f'in={embedding_dim+self.out_channels}')
            self.maxpool1 = nn.MaxPool2d((1, embedding_dim + self.out_channels))

    # CNN (Two-layer):
    if self.encoder_mode == 'CNN2':
        # Conv layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=hidden_dim * 2,
                               kernel_size=(1, 1), padding=(0, 0))
        self.conv2 = nn.Conv2d(in_channels=hidden_dim * 2,
                               out_channels=hidden_dim * 2,
                               kernel_size=(1, 1), padding=(0, 0))
        # Initializing the conv layers
        nn.init.xavier_uniform_(self.conv1.weight)
        nn.init.xavier_uniform_(self.conv2.weight)
        self.maxpool1 = nn.MaxPool2d((1, 2))
        self.maxpool2 = nn.MaxPool2d(
            (1, (embedding_dim + self.out_channels) // 2))

    # CNN (Three-layer):
    if self.encoder_mode == 'CNN3':
        # Conv layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=hidden_dim * 2,
                               kernel_size=(1, 1), padding=(0, 0))
        self.conv2 = nn.Conv2d(in_channels=hidden_dim * 2,
                               out_channels=hidden_dim * 2,
                               kernel_size=(1, 1), padding=(0, 0))
        self.conv3 = nn.Conv2d(in_channels=hidden_dim * 2,
                               out_channels=hidden_dim * 2,
                               kernel_size=(1, 1), padding=(0, 0))
        # Initializing the conv layers
        nn.init.xavier_uniform_(self.conv1.weight)
        nn.init.xavier_uniform_(self.conv2.weight)
        nn.init.xavier_uniform_(self.conv3.weight)
        self.maxpool1 = nn.MaxPool2d((1, 2))
        self.maxpool2 = nn.MaxPool2d((1, 2))
        self.maxpool3 = nn.MaxPool2d(
            (1, (embedding_dim + self.out_channels) // 4))

    # CNN (Dilated Three-layer):
    if self.encoder_mode == 'CNN_DILATED':
        # Conv layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=hidden_dim * 2,
                               kernel_size=(1, 2), padding=(0, 0), dilation=1)
        self.conv2 = nn.Conv2d(in_channels=hidden_dim * 2,
                               out_channels=hidden_dim * 2,
                               kernel_size=(1, 2), padding=(0, 0), dilation=2)
        self.conv3 = nn.Conv2d(in_channels=hidden_dim * 2,
                               out_channels=hidden_dim * 2,
                               kernel_size=(1, 2), padding=(0, 0), dilation=3)
        # Initializing the conv layers
        nn.init.xavier_uniform_(self.conv1.weight)
        nn.init.xavier_uniform_(self.conv2.weight)
        nn.init.xavier_uniform_(self.conv3.weight)
        self.maxpool1 = nn.MaxPool2d((1, 2))
        self.maxpool2 = nn.MaxPool2d((1, 2))
        self.maxpool3 = nn.MaxPool2d((1, 27))

    # Linear layer which maps the output of the encoder into tag space.
    self.hidden2tag = nn.Linear(hidden_dim * 2, self.tagset_size)
    # Initializing the linear layer using predefined function for initialization
    init_linear(self.hidden2tag)

    if self.use_crf:
        # Matrix of transition parameters. Entry i,j is the score of
        # transitioning *to* i *from* j. Matrix has a dimension of
        # (total number of tags * total number of tags)
        self.transitions = nn.Parameter(
            torch.zeros(self.tagset_size, self.tagset_size))
        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
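A quick standalone shape check for the one-layer CNN path (char_mode='CNN'), using illustrative sizes (embedding_dim=100, out_channels=25, hidden_dim=200, seq_len=7); it shows how the 1x1 conv plus full-width max-pool produce one 2*hidden_dim vector per token:

import torch
import torch.nn as nn

feat = 100 + 25                           # embedding_dim + out_channels
x = torch.randn(1, 1, 7, feat)            # (batch, in_channels=1, seq_len, feat)
conv1 = nn.Conv2d(1, 2 * 200, kernel_size=(1, 1))
maxpool1 = nn.MaxPool2d((1, feat))
out = maxpool1(conv1(x))
print(out.shape)                          # (1, 400, 7, 1): 2*hidden_dim per token,
                                          # ready for hidden2tag after squeezing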
def train_with_earlystop(corpus, device, n_hidden=128, n_emb=128,
                         batch_size=32, use_hie=False, nn_type='gru',
                         pooling_type='attention', w2v_fn=None, save_fn=None,
                         disp_proc=True):
    '''
    Input:
        use_hie: whether to use the hierarchical structure
        nn_type: gru, lstm, conv
        pooling_type: mean, max, attention
        w2v_fn: path of a pre-trained word2vec model (None to skip)
    '''
    print('%d training samples' % corpus.current_split_sizes[0])
    print('%d validation samples' % corpus.current_split_sizes[1])
    # rng = np.random.RandomState(1224)
    # th_rng = RandomStreams(1224)

    if save_fn is None:
        save_fn = 'model-res/%s-%s-%s.ckpt' % (nn_type, pooling_type, use_hie)

    # Load Word2Vec
    if w2v_fn is None:
        pre_embedding = None
    else:
        print('Loading word2vec model...')
        if w2v_fn == 'tencent':
            vectors = np.load(
                r'G:\word2vec\Tencent-AI-Lab\tencent-ailab-vecs-128.npy')
            word_sr = pd.read_hdf(
                r'G:\word2vec\Tencent-AI-Lab\tencent-ailab-voc.h5', 'voc')
            gensim_w2v = (vectors, word_sr)
        else:
            gensim_w2v = Word2Vec.load(w2v_fn)
        pre_embedding = init_embedding(gensim_w2v, corpus.current_dic)

    classifier = construct_classifier(corpus.current_dic.size, n_emb, n_hidden,
                                      corpus.n_target,
                                      pre_embedding=pre_embedding,
                                      use_hie=use_hie, nn_type=nn_type,
                                      pooling_type=pooling_type)
    classifier.to(device)

    # Loss and Optimizer
    loss_func = nn.NLLLoss()
    adadelta_optimizer = optim.Adadelta(classifier.parameters(), lr=1.0,
                                        rho=0.9, weight_decay=1e-8)
    sgd_optimizer = optim.SGD(classifier.parameters(), lr=0.001, momentum=0.9,
                              weight_decay=1e-8)

    # First step: optimize using Adadelta
    disp_freq = 20 if disp_proc is True else None
    train_corpus(classifier, corpus, loss_func, adadelta_optimizer, save_fn,
                 disp_freq=disp_freq, batch_size=batch_size)

    # Retrieve the state optimized by Adadelta
    classifier.load_state_dict(torch.load(save_fn))

    # Second step: optimize using SGD
    train_corpus(classifier, corpus, loss_func, sgd_optimizer, save_fn,
                 disp_freq=disp_freq, batch_size=batch_size)
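A sketch of the word2vec-backed init_embedding assumed above: copy the pretrained vector for every in-vocabulary word and random-init the rest. The dic interface (size, idx2word) and the (vectors, word_series) tuple layout for the Tencent variant are assumptions, not confirmed by the snippet:

import numpy as np

def init_embedding(gensim_w2v, dic):
    if isinstance(gensim_w2v, tuple):
        # Tencent variant: a raw vector matrix plus its word list
        vectors, word_sr = gensim_w2v
        word2vec = {w: vectors[i] for i, w in enumerate(word_sr)}
        dim = vectors.shape[1]
    else:
        # a gensim Word2Vec model
        word2vec = gensim_w2v.wv
        dim = gensim_w2v.wv.vector_size
    embedding = np.random.uniform(-0.1, 0.1, (dic.size, dim)).astype('float32')
    for idx in range(dic.size):
        word = dic.idx2word[idx]
        if word in word2vec:
            embedding[idx] = word2vec[word]
    return embedding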
def run_train():
    data = {
        'trainAnswers': [],
        'devAnswers': [],
        'trainQuestions': [],
        'devQuestions': []
    }
    data['trainAnswers'].extend(
        map(lambda x: x.split(' '),
            open('opennmt-kb-' + str(dev_id) + '/train.txt.tgt',
                 encoding="utf-8").read().strip().split('\n')))
    data['trainQuestions'].extend(
        map(lambda x: x.split(' '),
            open('opennmt-kb-' + str(dev_id) + '/train.txt.src',
                 encoding="utf-8").read().strip().split('\n')))
    data['devAnswers'].extend(
        map(lambda x: x.split(' '),
            open('opennmt-kb-' + str(dev_id) + '/val.txt.tgt',
                 encoding="utf-8").read().strip().split('\n')))
    data['devQuestions'].extend(
        map(lambda x: x.split(' '),
            open('opennmt-kb-' + str(dev_id) + '/val.txt.src',
                 encoding="utf-8").read().strip().split('\n')))

    # for debug
    '''
    data['trainAnswers'] = data['trainAnswers'][:2]
    data['trainQuestions'] = data['trainQuestions'][:2]
    data['devAnswers'] = [list(x) for x in data['trainAnswers']]
    data['devQuestions'] = [list(x) for x in data['trainQuestions']]
    '''

    # build the (word) vocabulary
    if os.path.exists(model_dir + '/vocab_word'):
        with open(model_dir + '/vocab_word', 'rb') as t:
            vocab_word = pickle.load(t)
    else:
        vocab_word = prepare_vocabulary(data, cut=cut)
        with open(model_dir + '/vocab_word', 'wb') as t:
            pickle.dump(vocab_word, t)
    print("========================word===========================")
    print('dec_vocab_size: ', vocab_word.n_words_for_decoder)
    print('vocab_size: ', vocab_word.n_words)
    print('max_word_length: ',
          max(map(lambda x: len(x), vocab_word.word2index)))

    # build the dataset (with truncation)
    if os.path.exists(model_dir + '/data'):
        with open(model_dir + '/data', 'rb') as t:
            train_pairs, dev_pairs = pickle.load(t)
    else:
        train_pairs = []
        dev_pairs = []
        for i in range(len(data['trainQuestions'])):
            data['trainQuestions'][i] = cut_utterances(
                data['trainQuestions'][i], max_num_utterance, max_seq_length)
            data['trainAnswers'][i] = data['trainAnswers'][i][:DEC_MAX_LEN]
            train_pairs.append(
                (data['trainQuestions'][i], data['trainAnswers'][i]))
        for i in range(len(data['devQuestions'])):
            data['devQuestions'][i] = cut_utterances(
                data['devQuestions'][i], max_num_utterance, max_seq_length)
            data['devAnswers'][i] = data['devAnswers'][i][:DEC_MAX_LEN]
            dev_pairs.append((data['devQuestions'][i], data['devAnswers'][i]))
        with open(model_dir + '/data', 'wb') as t:
            pickle.dump((train_pairs, dev_pairs), t)
    print(train_pairs[0])
    print(train_pairs[1])
    print(dev_pairs[0])
    print(dev_pairs[1])
    print("========================dataset===========================")
    print('train: ', len(train_pairs))
    print('dev: ', len(dev_pairs))

    # the encoder and decoder share one embedding
    embed = init_embedding(embed_size, vocab_word.n_words,
                           vocab_word.word2index)
    embedding = nn.Embedding(vocab_word.n_words, embed_size,
                             padding_idx=0).from_pretrained(embed, freeze=False)
    encoder = EncoderRNN(embed_size, vocab_word.n_words, hidden_size,
                         bidirectional=bidirectional, num_layers=num_layers,
                         dropout_p=dropout_p)
    attn_decoder = CopynetDecoderRNN(embed_size, hidden_size,
                                     vocab_word.n_words_for_decoder,
                                     vocab_word.n_words, num_layers=num_layers,
                                     dropout_p=dropout_p)
    embedding = torch.nn.DataParallel(embedding).to(device_cuda)
    encoder = torch.nn.DataParallel(encoder).to(device_cuda)
    attn_decoder = torch.nn.DataParallel(attn_decoder).to(device_cuda)

    if os.path.isfile(model_dir + '/embedding'):
        embedding.load_state_dict(torch.load(model_dir + '/embedding'))
    if os.path.isfile(model_dir + '/encoder') and os.path.isfile(model_dir + '/decoder'):
        encoder.load_state_dict(torch.load(model_dir + '/encoder'))
        attn_decoder.load_state_dict(torch.load(model_dir + '/decoder'))
    optimizer = optim.Adam([{"params": embedding.parameters()},
                            {"params": encoder.parameters()},
                            {"params": attn_decoder.parameters()}],
                           lr=lr, amsgrad=True)
    if os.path.isfile(model_dir + '/optimizer'):
        optimizer.load_state_dict(torch.load(model_dir + '/optimizer'))

    trainIters(vocab_word, embedding, encoder, attn_decoder, optimizer,
               train_pairs, dev_pairs, max_length, n_epochs, lr, batch_size,
               infer_batch_size, print_every=1)
nn_type = 'gru'
# nn_type = 'lstm'
# nn_type = 'conv'
# pooling_type = 'mean'
# pooling_type = 'max'
pooling_type = 'attention'

# Load Word2Vec
if w2v_fn is None:
    pre_embedding = None
else:
    print('Loading word2vec model...')
    if not os.path.exists(w2v_fn):
        raise Exception('Word2Vec model does NOT exist!', w2v_fn)
    gensim_w2v = Word2Vec.load(w2v_fn)
    pre_embedding = init_embedding(gensim_w2v, corpus.current_dic)

classifier = construct_classifier(corpus.current_dic.size, n_emb, n_hidden,
                                  corpus.n_target,
                                  pre_embedding=pre_embedding,
                                  use_hie=use_hie, nn_type=nn_type,
                                  pooling_type=pooling_type)
# classifier.load_state_dict(torch.load('model-res/model-gru-attention-False-Adadelta-1.0000-v1.ckpt'))
classifier.to(device)

# Loss and Optimizer
loss_func = nn.NLLLoss()
# It seems that Adadelta is better than Adagrad and Adam...
def build_model(self):
    with tf.variable_scope('Placeholder'):
        self.nodes_placeholder = tf.placeholder(tf.int32, (None,),
                                                name='nodes_placeholder')
        self.seqlen_placeholder = tf.placeholder(tf.int32, (None,),
                                                 name='seqlen_placeholder')
        self.neighborhood_placeholder = tf.placeholder(
            tf.int32, (None, self.args.sampling_size),
            name='neighborhood_placeholder')
        self.label_placeholder = tf.placeholder(tf.float32, (None,),
                                                name='label_placeholder')
        self.data = network.next_batch(self.graph, self.degree_max,
                                       sampling=True,
                                       sampling_size=self.args.sampling_size)

    with tf.variable_scope('Embeddings'):
        self.embeddings = tf.get_variable(
            'embeddings', [len(self.graph), self.args.embedding_size],
            initializer=tf.constant_initializer(
                utils.init_embedding(self.degree, self.degree_max,
                                     self.args.embedding_size)))

    with tf.variable_scope('LSTM'):
        cell = tf.contrib.rnn.DropoutWrapper(
            # tf.contrib.rnn.BasicLSTMCell(num_units=self.args.embedding_size),
            tf.contrib.rnn.LayerNormBasicLSTMCell(
                num_units=self.args.embedding_size, layer_norm=False),
            input_keep_prob=1.0, output_keep_prob=1.0)
        _, states = tf.nn.dynamic_rnn(
            cell,
            tf.nn.embedding_lookup(self.embeddings,
                                   self.neighborhood_placeholder),
            dtype=tf.float32, sequence_length=self.seqlen_placeholder)
        self.lstm_output = states.h

    with tf.variable_scope('Guilded'):
        self.predict_info = tf.squeeze(
            tf.layers.dense(self.lstm_output, units=1, activation=utils.selu))

    with tf.variable_scope('Loss'):
        self.structure_loss = tf.losses.mean_squared_error(
            tf.nn.embedding_lookup(self.embeddings, self.nodes_placeholder),
            self.lstm_output)
        self.guilded_loss = tf.reduce_mean(
            tf.abs(tf.subtract(self.predict_info, self.label_placeholder)))
        self.orth_loss = tf.losses.mean_squared_error(
            tf.matmul(self.embeddings, self.embeddings, transpose_a=True),
            tf.eye(self.args.embedding_size))
        self.total_loss = (self.structure_loss
                           + self.args.alpha * self.orth_loss
                           + self.args.lamb * self.guilded_loss)

    with tf.variable_scope('Optimizer'):
        # self.optimizer = tf.train.AdamOptimizer(self.args.learning_rate)
        self.optimizer = tf.train.RMSPropOptimizer(self.args.learning_rate)
        tvars = tf.trainable_variables()
        grads, self.global_norm = tf.clip_by_global_norm(
            tf.gradients(self.total_loss, tvars), self.args.grad_clip)
        self.train_op = self.optimizer.apply_gradients(zip(grads, tvars))

    with tf.variable_scope('Summary'):
        tf.summary.scalar("orth_loss", self.orth_loss)
        tf.summary.scalar("guilded_loss", self.guilded_loss)
        tf.summary.scalar("structure_loss", self.structure_loss)
        tf.summary.scalar("total_loss", self.total_loss)
        tf.summary.scalar("global_norm", self.global_norm)
        for (grad, var) in zip(grads, tvars):
            if grad is not None:
                tf.summary.histogram('grad/{}'.format(var.name), grad)
                tf.summary.histogram('weight/{}'.format(var.name), var)

        log_dir = os.path.join(self.save_path, 'logs')
        if os.path.exists(log_dir):
            shutil.rmtree(log_dir)
        self.summary_writer = tf.summary.FileWriter(log_dir, self.sess.graph)

        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = self.embeddings.name
        embedding.metadata_path = os.path.join(self.args.save_path, 'data',
                                               'index.tsv')
        projector.visualize_embeddings(self.summary_writer, config)
        self.merged_summary = tf.summary.merge_all()

    self.saver = tf.train.Saver()
    self.sess.run(tf.global_variables_initializer())
def rand_init_embedding(self):
    """
    randomly initialize the char-level embedding
    """
    utils.init_embedding(self.char_embeds.weight)
)
args = parser.parse_args()
if args.account is None:
    print("Missing account; it must be provided.")
    sys.exit(1)
else:
    # TODO: check whether the account is valid; should we create a collection
    # to store all accounts?
    pass

t0 = time.time()
input_segment_file = INPUT_SEGMENT_FILE.format(name=args.account)
word_voc_file = WORD_VOC_FILE.format(name=args.account)
tag_voc_file = TAG_VOC_FILE.format(name=args.account)
label_voc_file = LABEL_VOC_FILE.format(name=args.account)
init_vocabulary(args.account, input_segment_file, word_voc_file, tag_voc_file,
                label_voc_file, args.regen)

embedding_root = EMBEDDING_PATH
word2vec_file = WORD2VEC_FILE.format(name=args.account)
word_embedding_file = WORD_EMBEDDING_FILE.format(name=args.account)
tag_embedding_file = TAG_EMBEDDING_FILE.format(name=args.account)
init_embedding(embedding_root, word2vec_file, word_embedding_file,
               tag_embedding_file, word_voc_file, tag_voc_file, args.regen)

demo(word_embedding_file)
print('Done in %.1fs!' % (time.time() - t0))
def _train(args, pretrain_args):
    """Train the language model.

    Creates train/valid/test models, runs training epochs, saves model and
    writes results to database if specified.
    """
    start_time = time.time()
    print('Training', ', '.join(args.speakers), '...')

    # randomly sample validation set monte_carlo_cv_num times
    for num in range(args.monte_carlo_cv_num):
        # get seed used to sub-sample validation dataset (use 42 for 1st run)
        seed = utils.get_seed(num)
        # get train/valid/test data and convert to sequences
        train_data, valid_data, test_data, id_to_word = data_reader.get_data(
            args, seed=seed)
        # set configurations/hyperparameters for model
        config, test_config = utils.set_config(args, id_to_word)
        # initialize word embeddings
        init_embed = utils.init_embedding(id_to_word, dim=args.embed_size,
                                          init_scale=args.init_scale,
                                          embed_path=args.embed_path)

        with tf.Graph().as_default():
            # initializer used to initialize TensorFlow variables
            initializer = tf.random_uniform_initializer(-config['init_scale'],
                                                        config['init_scale'])

            # create Train model
            with tf.name_scope('Train'):
                with tf.variable_scope('Model', reuse=None,
                                       initializer=initializer):
                    m_train = model.Model(args, is_training=True, config=config,
                                          init_embed=init_embed, name='Train')
                    m_train.build_graph()

            # create Valid model
            with tf.name_scope('Valid'):
                with tf.variable_scope('Model', reuse=True,
                                       initializer=initializer):
                    m_valid = model.Model(args, is_training=False,
                                          config=config, init_embed=init_embed,
                                          name='Valid')
                    m_valid.build_graph()

            # create Test model
            with tf.name_scope('Test'):
                with tf.variable_scope('Model', reuse=True,
                                       initializer=initializer):
                    m_test = model.Model(args, is_training=False,
                                         config=test_config,
                                         init_embed=init_embed, name='Test')
                    m_test.build_graph()

            # create summaries to be viewed in TensorBoard
            tb_summaries = utils.TensorBoardSummaries()
            tb_summaries.create_ops()

            init = tf.global_variables_initializer()

            # if pretrained, must create dict to initialize TF Saver
            if bool(pretrain_args):
                # get trainable variables and convert to dict for Saver
                reuse_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
                reuse_vars_dict = dict([(var.op.name, var)
                                        for var in reuse_vars])
                # create saver for TF session (see function for addl details)
                saver = utils.create_tf_saver(args, pretrain_args,
                                              reuse_vars_dict)
            else:
                saver = tf.train.Saver()

            # ppls dict has perplexities that are stored in results database
            ppls = {}
            ppls, _ = _update_ppls(ppls, initialize=True)

            with tf.Session() as sess:
                sess.run(init)

                if args.load_path != '':
                    print('Restoring model...')
                    saver.restore(sess, args.load_path)

                for epoch in range(config['max_epoch']):
                    print('Epoch: {0} Learning rate: {1:.3f}\n'.format(
                        epoch + 1, sess.run(m_train.lr)))
                    for i, speaker in enumerate(args.speakers):
                        print('Training {0} ...'.format(speaker))

                        # run epoch on training data
                        train_perplexity = _run_epoch(
                            sess, m_train, args, train_data, i, tb_summaries,
                            id_to_word, train_op=m_train.train_op, verbose=True)
                        print('Epoch: {0} Train Perplexity: {1:.3f}'.format(
                            epoch + 1, train_perplexity))
                        ppls, _ = _update_ppls(ppls, epoch=epoch + 1,
                                               speaker=speaker,
                                               ppl=train_perplexity,
                                               dataset='train')

                        print('Validating...')
                        # run epoch on validation data
                        valid_perplexity = _run_epoch(
                            sess, m_valid, args, valid_data, i, tb_summaries,
                            id_to_word, verbose=True)
                        print('Epoch: {0} Valid Perplexity: {1:.3f}'.format(
                            epoch + 1, valid_perplexity))
                        ppls, improved = _update_ppls(ppls, epoch=epoch + 1,
                                                      speaker=speaker,
                                                      ppl=valid_perplexity,
                                                      dataset='valid')

                        if improved:
                            # save model if valid ppl is lower than
                            # current best valid ppl
                            if args.save_path != '':
                                print('Saving model to {0}.'.format(
                                    args.save_path))
                                saver.save(sess, args.save_path)

                for i, speaker in enumerate(args.speakers):
                    print('Testing {0} ...'.format(speaker))
                    print('Restoring best model for testing...')
                    saver.restore(sess, args.save_path)

                    # run model on test data
                    test_perplexity = _run_epoch(sess, m_test, args,
                                                 test_data, i)
                    ppls['test_ppl_' + speaker] = test_perplexity
                    print('Test Perplexity: {0:.3f}'.format(test_perplexity))

        if args.insert_db == 'True':
            # write params/config/results to sql database
            results_db.insert_results(args, config, start_time, ppls)