def load_seq2seq(filename, map_location=None):
    """Load a model from a filename.

    (This function requires that the model was saved with the save_model
    utils function.)

    :param filename: Filename to be used.
    :param map_location: Optional device remapping passed to torch.load.
    """
    from seq2seq import Encoder, Decoder

    # torch.load(filename, map_location=None) is equivalent to
    # torch.load(filename), so a single call covers both cases.
    model_dict = torch.load(filename, map_location=map_location)

    encoder = Encoder(**model_dict["encoder_init_args"])
    decoder = Decoder(**model_dict["decoder_init_args"])
    encoder.load_state_dict(model_dict["encoder_state"])
    decoder.load_state_dict(model_dict["decoder_state"])
    return encoder, decoder
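# A minimal sketch (not the original save_model utility, which is not shown
# here) of the matching save routine implied by the keys this loader reads.
# `encoder_init_args` / `decoder_init_args` are assumed to be the kwargs the
# two constructors were called with.
import torch

def save_seq2seq(filename, encoder, decoder, encoder_init_args,
                 decoder_init_args):
    torch.save({
        "encoder_init_args": encoder_init_args,
        "decoder_init_args": decoder_init_args,
        "encoder_state": encoder.state_dict(),
        "decoder_state": decoder.state_dict(),
    }, filename)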
def Main(vocab, vocab_inv):
    i = 0
    arret = False  # may be set to True by the program (step 3)
    contexte = None

    # GRU setup
    enc = Encoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad])
    dec = Decoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad], vocab[sos],
                  vocab[eos], vocab[unk])
    enc.to('cuda')
    dec.to('cuda')

    # load the weights
    path_enc = "encoder_9.pkl"
    path_dec = "decoder_9.pkl"
    encoder_state = torch.load(path_enc)
    decoder_state = torch.load(path_dec)
    enc.load_states(encoder_state)
    enc.eval()
    dec.load_states(dict(decoder_state))
    dec.eval()

    # set the prediction length
    taille = int(input("number of words to predict at a time? : "))

    while not arret:
        phrase = takeInput(i)
        exit_c, contexte = callGRU(enc, dec, phrase, vocab, contexte, taille)
        sortie = posttreatment(exit_c, vocab_inv)
        # sortie = "David Albert Huffman est un petit garçon de 10 ans des plus intelligents. Cependant, son monde cours à sa perte lorsque Poupoune décide de s'emparer de l'Europe, alors en pleine crise politique, pour y imposer son monde rose et fleurissant.Avec son ami Lamy, David va devoir lutter contre des adversaires redoutables pour sauver le monde, entrer au MIT et repousser la plus grande menace du Siècle (pour le moment) pour rétablir l'équilibre dans le rapport de Force."  # chaining test
        printResult(sortie)
        # contexte = exit_c
        i += 1
def load_model(path='files/model_De', device=device):
    checkpoint = torch.load(path, map_location=device)
    # prepareData is only needed here for `pairs`; the language classes
    # are immediately restored from the checkpoint below.
    in_lang, out_lang, pairs = prepareData('En', 'De')
    in_lang = checkpoint['in_lang_class']
    out_lang = checkpoint['out_lang_class']
    hidden_size = checkpoint['hidden_size']

    encoder = Encoder(in_lang.n_words, hidden_size).to(device)
    decoder = Decoder(hidden_size, out_lang.n_words, dropout_p=0.1).to(device)
    encoder.load_state_dict(checkpoint['encoder_state_dict'])
    decoder.load_state_dict(checkpoint['decoder_state_dict'])
    return encoder, decoder, in_lang, out_lang
def main():
    # Dataset
    dataset = PadDataset(WORKING_DIR, EMBEDDING_SIZE, diff_vocab=DIFF_VOCAB,
                         embedding_path=EMBEDDING_PATH,
                         limit_encode=LIMIT_ENCODE,
                         limit_decode=LIMIT_DECODE)
    encoder_vocab_size = dataset.length_vocab_encode()
    decoder_vocab_size = dataset.length_vocab_decode()
    print("Steps per epoch %d" % int(math.ceil(
        float(dataset.datasets["train"].number_of_samples) /
        float(BATCH_SIZE))))

    # Initialise the model
    embeddings_encoder = torch.Tensor(dataset.vocab.embeddings_encoder).cuda()
    embeddings_decoder = torch.Tensor(dataset.vocab.embeddings_decoder).cuda()

    content_encoder = Encoder(encoder_vocab_size, embeddings_encoder,
                              EMBEDDING_SIZE, HIDDEN_SIZE).cuda()
    query_encoder = Encoder(encoder_vocab_size, embeddings_encoder,
                            EMBEDDING_SIZE, HIDDEN_SIZE).cuda()
    decoder = Decoder(EMBEDDING_SIZE, embeddings_decoder, HIDDEN_SIZE,
                      decoder_vocab_size).cuda()
    seq2seqwattn = Seq2Seq(content_encoder, query_encoder, decoder).cuda()

    run_this = run_model(dataset, seq2seqwattn)
    run_this.run_training()
def training(data_dir):
    news_train = NEWS(data_path=os.path.join(data_dir, '.train.pkl'),
                      vocab_path='../data/vocab.pkl')
    encoder = Encoder(emb_size=300, hidden_size=512,
                      vocab_size=len(news_train.vocab))
    decoder = Decoder()
    SAE = SentenceAE()
def create_model(gen_config):
    encoder = Encoder(gen_config.vocab_size, gen_config.emb_dim,
                      gen_config.hidden_size, n_layers=2, dropout=0.5)
    decoder = Decoder(gen_config.emb_dim, gen_config.hidden_size,
                      gen_config.vocab_size, n_layers=1, dropout=0.5)
    seq2seq = Seq2Seq(encoder, decoder).cuda()
    optimizer = optim.Adam(seq2seq.parameters(), lr=gen_config.lr)
    return seq2seq, optimizer
class CRNNAttention(chainer.Chain):

    def __init__(self, n_target_vocab, n_decoder_units, n_attention_units,
                 n_encoder_units, n_maxout_units):
        super(CRNNAttention, self).__init__()
        with self.init_scope():
            self.crnn = CRNN()
            self.decoder = Decoder(
                n_target_vocab,
                n_decoder_units,
                n_attention_units,
                n_encoder_units * 2,  # because of bi-directional lstm
                n_maxout_units,
            )

    def __call__(self, xs, ys):
        recurrent_output = self.crnn(xs)
        output = self.decoder(ys, recurrent_output)

        concatenated_os = F.concat(output, axis=0)
        concatenated_ys = F.flatten(ys.T)
        n_words = len(self.xp.where(concatenated_ys.data != PAD)[0])

        loss = F.sum(
            F.softmax_cross_entropy(concatenated_os, concatenated_ys,
                                    reduce='no', ignore_label=PAD))
        loss = loss / n_words
        chainer.report({'loss': loss.data}, self)

        # `loss` is already normalized per word, so the perplexity is
        # simply exp(loss). (The original multiplied by an undefined
        # `batch_size`, which would raise a NameError.)
        perp = self.xp.exp(loss.data)
        chainer.report({'perp': perp}, self)
        return loss

    def translate(self, xs, max_length=100):
        """Generate sentences based on xs.

        Args:
            xs: Source sentences.

        Returns:
            ys: Generated target sentences.
        """
        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            hxs = self.crnn(xs)
            ys = self.decoder.translate(hxs, max_length)
        return ys
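# Hedged usage sketch (the unit sizes are made up; CRNN and Decoder are
# defined elsewhere in this repo). Since the chain reports 'loss' and 'perp',
# it plugs straight into a standard Chainer optimizer setup.
import chainer

model = CRNNAttention(n_target_vocab=10000, n_decoder_units=256,
                      n_attention_units=256, n_encoder_units=256,
                      n_maxout_units=256)
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)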
def train(article, title, word2idx, target2idx, source_lengths,
          target_lengths, args, val_article=None, val_title=None,
          val_source_lengths=None, val_target_lengths=None):
    if not os.path.exists('./temp/x.pkl'):
        size_of_val = int(len(article) * 0.05)
        val_article, val_title, val_source_lengths, val_target_lengths = \
            utils.sampling(article, title, source_lengths, target_lengths,
                           size_of_val)
        utils.save_everything(article, title, source_lengths, target_lengths,
                              val_article, val_title, val_source_lengths,
                              val_target_lengths, word2idx)
    size_of_val = len(val_article)
    batch_size = args.batch
    train_size = len(article)
    val_size = len(val_article)
    max_a = max(source_lengths)
    max_t = max(target_lengths)
    print("source vocab size:", len(word2idx))
    print("target vocab size:", len(target2idx))
    print("max a:{}, max t:{}".format(max_a, max_t))
    print("train_size:", train_size)
    print("val size:", val_size)
    print("batch_size:", batch_size)
    print("-" * 30)

    use_coverage = False
    encoder = Encoder(len(word2idx))
    decoder = Decoder(len(target2idx), 50)
    if os.path.exists('decoder_model'):
        encoder.load_state_dict(torch.load('encoder_model'))
        decoder.load_state_dict(torch.load('decoder_model'))
    optimizer = torch.optim.Adam(
        list(encoder.parameters()) + list(decoder.parameters()), lr=0.001)
    n_epoch = 5

    print("Making word index and extend vocab")
    # article, article_tar, title, ext_vocab_all, ext_count = indexing_word(article, title, word2idx, target2idx)
    # article = to_tensor(article)
    # article_extend = to_tensor(article_extend)
    # title = to_tensor(title)
    print("preprocess done")

    if args.use_cuda:
        encoder.cuda()
        decoder.cuda()

    print("start training")
    for epoch in range(n_epoch):
        total_loss = 0
        batch_n = int(train_size / batch_size)
        if epoch > 0:
            use_coverage = True
        for b in range(batch_n):
            # initialization
            batch_x = article[b * batch_size:(b + 1) * batch_size]
            batch_y = title[b * batch_size:(b + 1) * batch_size]
            # batch_x_ext = article_extend[b*batch_size: (b+1)*batch_size]
            batch_x, batch_x_ext, batch_y, extend_vocab, extend_lengths = \
                utils.batch_index(batch_x, batch_y, word2idx, target2idx)
            if args.use_cuda:
                batch_x = batch_x.cuda()
                batch_y = batch_y.cuda()
                batch_x_ext = batch_x_ext.cuda()
            x_lengths = source_lengths[b * batch_size:(b + 1) * batch_size]
            y_lengths = target_lengths[b * batch_size:(b + 1) * batch_size]

            # work-around to deal with variable lengths
            pack = pack_padded_sequence(batch_x_ext, x_lengths,
                                        batch_first=True)
            batch_x_ext_var, _ = pad_packed_sequence(pack, batch_first=True)

            current_loss = train_on_batch(encoder, decoder, optimizer,
                                          batch_x, batch_y, x_lengths,
                                          y_lengths, word2idx, target2idx,
                                          batch_x_ext_var, extend_lengths,
                                          use_coverage)
            batch_x = batch_x.cpu()
            batch_y = batch_y.cpu()
            batch_x_ext = batch_x_ext.cpu()
            print('epoch:{}/{}, batch:{}/{}, loss:{}'.format(
                epoch + 1, n_epoch, b + 1, batch_n, current_loss))

            if (b + 1) % args.show_decode == 0:
                torch.save(encoder.state_dict(), 'encoder_model')
                torch.save(decoder.state_dict(), 'decoder_model')
                batch_x_val, batch_x_ext_val, batch_y_val, extend_vocab, extend_lengths = \
                    utils.batch_index(val_article, val_title, word2idx,
                                      target2idx)
                for i in range(1):
                    idx = np.random.randint(0, val_size)
                    decode.beam_search(encoder, decoder,
                                       batch_x_val[idx].unsqueeze(0),
                                       batch_y_val[idx].unsqueeze(0),
                                       word2idx, target2idx,
                                       batch_x_ext_val[idx],
                                       extend_lengths[idx],
                                       extend_vocab[idx])
                batch_x_val = batch_x_val.cpu()
                batch_y_val = batch_y_val.cpu()
                batch_x_ext_val = batch_x_ext_val.cpu()
            total_loss += current_loss
        print('-' * 30)
        print()
    print("training finished")
parameters_dict = Load_Parameters(hyper_parameters_file)
en_embedding_dim = parameters_dict['en_embedding_dim']
de_embedding_dim = parameters_dict['de_embedding_dim']
hidden_dim = parameters_dict['hidden_dim']
num_layers = parameters_dict['num_layers']
bidirectional = parameters_dict['bidirectional']
use_lstm = parameters_dict['use_lstm']
use_cuda = False
batch_size = 1
dropout_p = 0.0

encoder = Encoder(en_embedding_dim, hidden_dim, en_vocab.n_items, num_layers,
                  dropout_p, bidirectional, use_lstm, use_cuda)
decoder = Decoder(de_embedding_dim, hidden_dim, de_vocab.n_items, num_layers,
                  dropout_p, bidirectional, use_lstm, use_cuda)
encoder.load_state_dict(torch.load(encoder_model_file, map_location='cpu'))
decoder.load_state_dict(torch.load(decoder_model_file, map_location='cpu'))
encoder.eval()
decoder.eval()

f_en_test = open('input.txt', 'r', encoding='utf-8')
f_de_pred = open('output.txt', 'w', encoding='utf-8')

while True:
    en_sent = f_en_test.readline()
    if not en_sent:
        break
def train():
    # region Process data
    # Both files land in the same Keras cache directory, so only the last
    # returned path is needed below.
    path_to_files = tf.keras.utils.get_file(
        'english.txt',
        origin='https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en',
        extract=False)
    path_to_files = tf.keras.utils.get_file(
        'german.txt',
        origin='https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de',
        extract=False)
    path_to_files = os.path.dirname(path_to_files)

    input_tensor, target_tensor, inp_lang, targ_lang = load_wmt_dataset(
        path_to_files, num_examples, dict_size)
    max_length_targ, max_length_inp = (max_length(target_tensor),
                                       max_length(input_tensor))
    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = \
        train_test_split(input_tensor, target_tensor, test_size=0.2)

    BUFFER_SIZE = len(input_tensor_train)
    steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    # endregion

    # region Model definition
    encoder = Encoder(dict_size, embedding_dim, units, BATCH_SIZE)
    attention_layer = BahdanauAttention(10)  # not used directly below
    decoder = Decoder(dict_size, embedding_dim, units, BATCH_SIZE)

    optimizer = tf.keras.optimizers.Adam()
    checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                     encoder=encoder,
                                     decoder=decoder)
    # endregion

    @tf.function
    def train_step(inp, targ, enc_hidden):
        loss = 0
        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, enc_hidden)
            dec_hidden = enc_hidden
            dec_input = tf.expand_dims(
                [targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
            # Teacher forcing: feed the target as the next input.
            for t in range(1, targ.shape[1]):
                # Pass enc_output to the decoder.
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden,
                                                     enc_output)
                loss += loss_function(targ[:, t], predictions)
                # Teacher forcing.
                dec_input = tf.expand_dims(targ[:, t], 1)
        batch_loss = loss / int(targ.shape[1])
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        return batch_loss

    for epoch in range(EPOCHS):
        start = time.time()
        enc_hidden = encoder.initialize_hidden_state()
        total_loss = 0
        for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ, enc_hidden)
            total_loss += batch_loss
            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch + 1, batch, batch_loss.numpy()))
        # Checkpoint the model every 2 epochs.
        if (epoch + 1) % 2 == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)
        print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                            total_loss / steps_per_epoch))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
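# `loss_function` is not defined in the snippet above. A sketch in the style
# of the TensorFlow NMT tutorial this code follows, assuming padding id 0:
import tensorflow as tf

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    # Zero out the loss at padding positions before averaging.
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    loss_ *= tf.cast(mask, dtype=loss_.dtype)
    return tf.reduce_mean(loss_)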
# Convert the answers to decoder input indices
x_decoder = preprocessor.convert_text_to_index(answer, word_to_index,
                                               DECODER_INPUT)

# Convert the answers to decoder target indices
y_decoder = preprocessor.convert_text_to_index(answer, word_to_index,
                                               DECODER_TARGET)

# One-hot encode the targets
y_decoder = preprocessor.one_hot_encode(y_decoder)

# Define the encoder and decoder for the training model.
# The model is wrapped in objects, so we instantiate them here.
encoder = Encoder(len(preprocessor.words))
decoder = Decoder(encoder.states, encoder.len_of_words)  # nearly identical to the GitHub code

# Define the training model
model = models.Model([encoder.inputs, decoder.inputs], decoder.outputs)  # same as the GitHub code

# Configure training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])  # produces the same result as the GitHub code

# Building the prediction model is a method on the object, so it is
# created as follows.

# Define the prediction model
encoder_model = encoder.get_predict_model()
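# The training call itself is not shown in this snippet. A hedged sketch of
# the obvious next step, assuming an `x_encoder` index array built from the
# question side the same way `x_decoder` is built from `answer` (epoch and
# batch values are made up):
model.fit([x_encoder, x_decoder], y_decoder, epochs=100, batch_size=64)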
# for i, batch in enumerate(valid_iterator):
#     print("Train Src Shape: ", str(batch.src.shape))
#     print("Train Trg Shape: ", str(batch.trg.shape))

# +
INPUT_DIM = len(src.vocab)
OUTPUT_DIM = len(trg.vocab)
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HID_DIM = 256
N_LAYERS = 1
ENC_DROPOUT = 0.0
DEC_DROPOUT = 0.0

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device=DEVICE).to(DEVICE)
model.apply(init_weights)
# -

optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = trg.vocab.stoi[trg.pad_token]
criterion = nn.CrossEntropyLoss().to(DEVICE)
print(TRG_PAD_IDX)

# +
N_EPOCHS = 1000
CLIP = 1
counter = 0
patience = 200
def train():
    encode_input, decode_input, decode_output = gen_datasets()
    encode_input, decode_input, decode_output = shuffle(
        encode_input, decode_input, decode_output)

    # encoder
    encoder_input = Input(shape=(config['max_num'],), name='encode_input')
    embedded_input = Embedding(config['en_voc_size'], 100,
                               weights=[en_w2v_matrix], trainable=False,
                               name="embedded_layer")(encoder_input)
    encoder = Encoder(3, 100, 5, 20, config['max_num'], mask_zero=True,
                      name='encoder')
    encoder_output = encoder([
        embedded_input, embedded_input, embedded_input, encoder_input,
        encoder_input
    ])

    # decoder
    decoder_input = Input(shape=(config['max_num'],), name='decode_input')
    embedded_input2 = Embedding(config['ch_voc_size'], 100,
                                weights=[ch_w2v_matrix], trainable=False,
                                name="embedded_layer2")(decoder_input)
    decoder = Decoder(3, 100, 5, 20, config['max_num'], mask_zero=True,
                      name='decoder')
    decoder_output = decoder([
        embedded_input2, encoder_output, encoder_output, decoder_input,
        encoder_input
    ])

    decoder_dense = Dense(config['ch_voc_size'], activation='softmax',
                          name='dense_layer')
    decoder_output = decoder_dense(decoder_output)
    label_smooth = LabelSmooth(0.1, config['ch_voc_size'])
    decoder_output = label_smooth(decoder_output)

    model = Model([encoder_input, decoder_input], decoder_output)
    opt = Adam(0.001, 0.9, 0.98, epsilon=1e-9)
    # model.compile(optimizer=opt, loss='sparse_categorical_crossentropy',
    #               metrics=['sparse_categorical_accuracy'])
    model.compile(optimizer=opt, loss=mask_loss, metrics=[mask_accuracy])
    model.summary()

    tb = TensorBoard(log_dir='./tb_logs/0125', histogram_freq=0,
                     write_graph=True, write_images=False, embeddings_freq=0,
                     embeddings_layer_names=None, embeddings_metadata=None)
    cp = ModelCheckpoint(
        './models/attention_seq2seq.{epoch:02d}-{val_loss:.2f}.hdf5',
        monitor='val_loss', verbose=0, save_best_only=False,
        save_weights_only=False, mode='auto', period=1)

    try:
        model.fit([encode_input, decode_input], decode_output,
                  validation_split=0.2, batch_size=128, epochs=10,
                  callbacks=[tb, cp])
    except KeyboardInterrupt:
        model.save('attention_seq2seq')
    else:
        model.save('attention_seq2seq')
parameters_dict = {}
parameters_dict['en_embedding_dim'] = en_embedding_dim
parameters_dict['de_embedding_dim'] = de_embedding_dim
parameters_dict['hidden_dim'] = hidden_dim
parameters_dict['num_layers'] = num_layers
parameters_dict['bidirectional'] = bidirectional
parameters_dict['use_lstm'] = use_lstm
with open('parameters_dict.pkl', 'wb') as fd:
    pickle.dump(parameters_dict, fd)

batch_total = sum(1 for _ in pl.gen_pairs(batch_size))
ones_matrix = autograd.Variable(torch.ones(1, de_vocab.n_items))

encoder = Encoder(en_embedding_dim, hidden_dim, en_vocab.n_items, num_layers,
                  dropout_p, bidirectional, use_lstm, use_cuda)
decoder = Decoder(de_embedding_dim, hidden_dim, de_vocab.n_items, num_layers,
                  dropout_p, bidirectional, use_lstm, use_cuda)
encoder_model_file = 'encoder_rev.7.pt'
decoder_model_file = 'decoder_rev.7.pt'
encoder.load_state_dict(torch.load(encoder_model_file))
decoder.load_state_dict(torch.load(decoder_model_file))

'''
# Load pre-trained embedding
model_file = 'bi_gru.100.100.2.pt'
if model_file != '':
    model.load_state_dict(torch.load(model_file))
else:
    model.load_pre_train_emb('cityu_training.char.emb.npy',
                             'cityu_training.char.dict', vocab)
'''

loss_function = nn.NLLLoss(reduction='sum',
                           ignore_index=de_vocab.item2index['_PAD_'])
class Text2song(object):

    def __init__(self):

        def Load_Vocab(file):
            with open(file, 'rb') as fd:
                _vocab = pickle.load(fd)
            return _vocab

        def Load_Parameters(file):
            with open(file, 'rb') as fd:
                parameters_dict = pickle.load(fd)
            return parameters_dict

        torch.manual_seed(1)
        torch.set_num_threads(4)

        en_vocab_dur_file = './en_vocab_dur.pkl'
        de_vocab_dur_file = './de_vocab_dur.pkl'
        encoder_dur_model_file = './encoder_dur.10.pt'
        decoder_dur_model_file = './decoder_dur.10.pt'
        en_vocab_key_file = './en_vocab.pkl'
        de_vocab_key_file = './de_vocab.pkl'
        encoder_key_model_file = './encoder.10.pt'
        decoder_key_model_file = './decoder.10.pt'
        hyper_parameters_file = './parameters_dict.pkl'

        self.en_vocab_key = Load_Vocab(en_vocab_key_file)
        self.de_vocab_key = Load_Vocab(de_vocab_key_file)
        self.en_vocab_dur = Load_Vocab(en_vocab_dur_file)
        self.de_vocab_dur = Load_Vocab(de_vocab_dur_file)
        self.trf_key = Transfrom(self.en_vocab_key)
        self.trf_dur = Transfrom(self.en_vocab_dur)

        self.parameters_dict = Load_Parameters(hyper_parameters_file)
        en_embedding_dim = self.parameters_dict['en_embedding_dim']
        de_embedding_dim = self.parameters_dict['de_embedding_dim']
        hidden_dim = self.parameters_dict['hidden_dim']
        num_layers = self.parameters_dict['num_layers']
        bidirectional = self.parameters_dict['bidirectional']
        use_lstm = self.parameters_dict['use_lstm']
        self.use_cuda_dur = self.use_cuda_key = False
        batch_size = 1
        dropout_p = 0.0

        self.encoder_key = Encoder(en_embedding_dim, hidden_dim,
                                   self.en_vocab_key.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_key)
        self.decoder_key = Decoder(de_embedding_dim, hidden_dim,
                                   self.de_vocab_key.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_key)
        self.encoder_dur = Encoder(en_embedding_dim, hidden_dim,
                                   self.en_vocab_dur.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_dur)
        self.decoder_dur = Decoder(de_embedding_dim, hidden_dim,
                                   self.de_vocab_dur.n_items, num_layers,
                                   dropout_p, bidirectional, use_lstm,
                                   self.use_cuda_dur)

        self.encoder_key.load_state_dict(
            torch.load(encoder_key_model_file, map_location='cpu'))
        self.decoder_key.load_state_dict(
            torch.load(decoder_key_model_file, map_location='cpu'))
        self.encoder_dur.load_state_dict(
            torch.load(encoder_dur_model_file, map_location='cpu'))
        self.decoder_dur.load_state_dict(
            torch.load(decoder_dur_model_file, map_location='cpu'))
        self.encoder_key.eval()
        self.decoder_key.eval()
        self.encoder_dur.eval()
        self.decoder_dur.eval()

        """__init__ sets the following attributes:
        self.trf_dur, self.trf_key;
        self.encoder_dur, self.encoder_key;
        self.decoder_dur, self.decoder_key;
        self.en_vocab_dur, self.en_vocab_key;
        self.de_vocab_dur, self.de_vocab_key;
        self.use_cuda_dur, self.use_cuda_key."""

    def get_song(self, lyric):

        def stop_before_eos(li, length):
            if '_EOS_' in li:
                i = li.index('_EOS_')
                li = li[:i]
            while len(li) < length:
                li.append(li[-1])
            return li

        def important_function_in_while_loop(trf, sent, encoder, decoder,
                                             de_vocab, use_cuda, en_sent):
            en_seq, en_seq_len = trf.trans_input(sent)
            en_seq = torch.LongTensor(en_seq)
            encoder_input = en_seq
            encoder_output, encoder_state = encoder(encoder_input, en_seq_len)
            # initial decoder hidden state
            decoder_state = decoder.init_state(encoder_state)
            # start decoding from the _START_ token
            decoder_inputs = torch.LongTensor([de_vocab.item2index['_START_']])
            pred_char = ''
            if use_cuda:
                decoder_inputs = decoder_inputs.cuda()
            decoder_outputs, decoder_state = decoder(decoder_inputs,
                                                     encoder_output,
                                                     decoder_state)
            max_len = len(en_sent.split())
            return (pred_char, encoder_output, decoder_outputs, decoder_state,
                    max_len)

        f_en_test = io.StringIO(lyric)
        pred_list = []
        while True:
            en_sent = f_en_test.readline()
            if not en_sent:
                break
            sent = en_sent.strip()
            pred_sent_dur = []
            pred_sent_key = []
            pred_char_key, encoder_output_key, decoder_outputs_key, decoder_state_key, max_len_key = \
                important_function_in_while_loop(self.trf_key, sent,
                                                 self.encoder_key,
                                                 self.decoder_key,
                                                 self.de_vocab_key,
                                                 self.use_cuda_key, en_sent)
            pred_char_dur, encoder_output_dur, decoder_outputs_dur, decoder_state_dur, max_len_dur = \
                important_function_in_while_loop(self.trf_dur, sent,
                                                 self.encoder_dur,
                                                 self.decoder_dur,
                                                 self.de_vocab_dur,
                                                 self.use_cuda_dur, en_sent)

            # Greedy search
            while pred_char_key != '_EOS_' and pred_char_dur != '_EOS_':
                log_prob_key, v_idx_key = decoder_outputs_key.data.topk(1)
                pred_char_key = self.de_vocab_key.index2item[v_idx_key.item()]
                pred_sent_key.append(pred_char_key)

                log_prob_dur, v_idx_dur = decoder_outputs_dur.data.topk(1)
                pred_char_dur = self.de_vocab_dur.index2item[v_idx_dur.item()]
                pred_sent_dur.append(pred_char_dur)

                if (len(pred_sent_dur) > max_len_dur
                        or len(pred_sent_dur) > max_len_key):
                    break

                decoder_inputs_dur = torch.LongTensor([v_idx_dur.item()])
                if self.use_cuda_dur:
                    decoder_inputs_dur = decoder_inputs_dur.cuda()
                decoder_outputs_dur, decoder_state_dur = self.decoder_dur(
                    decoder_inputs_dur, encoder_output_dur, decoder_state_dur)

                decoder_inputs_key = torch.LongTensor([v_idx_key.item()])
                if self.use_cuda_key:
                    decoder_inputs_key = decoder_inputs_key.cuda()
                decoder_outputs_key, decoder_state_key = self.decoder_key(
                    decoder_inputs_key, encoder_output_key, decoder_state_key)

            length = len(sent.split())
            pred_list.append({
                'lyrics': sent,
                'key': stop_before_eos(pred_sent_key, length),
                'duration': stop_before_eos(pred_sent_dur, length)
            })
            # pred_list.append({'lyrics': sent, 'key': pred_sent_key,
            #                   'duration': pred_sent_dur})
        return pred_list
def run_smiles_generator(test_file):
    src = Field(sequential=True, tokenize=tokenize_drug, init_token='<sos>',
                eos_token='<eos>')
    trg = Field(sequential=True, tokenize=tokenize_drug, init_token='<sos>',
                eos_token='<eos>')

    # Get the train and test sets in torchtext format
    datafields = [("src", src), ("trg", trg)]
    train, test = TabularDataset.splits(path='../data/SMILES_Autoencoder/',
                                        train='all_smiles_revised_final.csv',
                                        test=test_file, format='csv',
                                        skip_header=True, fields=datafields)

    # Split the dataset into train and validation sets
    train_data, valid_data = train.split(split_ratio=0.99)
    print(f"Number of examples: {len(train_data.examples)}")
    src.build_vocab(train_data, min_freq=2)
    trg.build_vocab(train_data, min_freq=2)

    # Total number of unique tokens in our vocabulary
    print(f"Unique tokens in source vocabulary: {len(src.vocab)}")
    print(f"Unique tokens in target vocabulary: {len(trg.vocab)}")

    TRG_PAD_IDX = trg.vocab.stoi[trg.pad_token]
    print("Padding Id: ", TRG_PAD_IDX)

    # Create the iterators to traverse the test samples for which we need to
    # generate the latent space
    BATCH_SIZE = 128
    (train_iterator, test_iterator) = BucketIterator.splits(
        (train_data, test), batch_size=BATCH_SIZE, device=DEVICE, sort=False,
        shuffle=False)
    print(src.vocab.stoi)
    print(trg.vocab.stoi)

    # Define the model once again
    INPUT_DIM = len(src.vocab)
    OUTPUT_DIM = len(trg.vocab)
    ENC_EMB_DIM = 128
    DEC_EMB_DIM = 128
    HID_DIM = 256
    N_LAYERS = 1
    ENC_DROPOUT = 0.0
    DEC_DROPOUT = 0.0

    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, device=DEVICE).to(DEVICE)
    model.apply(init_weights)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss().to(DEVICE)
    model.load_state_dict(
        torch.load('../models/lstm_out/torchtext_checkpoint.pt',
                   map_location=torch.device('cpu')))

    # Get the latent space for all drugs
    model.eval()
    epoch_loss = 0
    ls_list = []
    encode_list = []
    decode_list = []
    error_list = []
    with torch.no_grad():
        for j, batch in enumerate(test_iterator):
            new_src = batch.src
            new_trg = batch.trg

            # Get the output (teacher forcing on)
            outputs = model(new_src, new_trg, 1)
            output = outputs[0]
            hidden = outputs[1]
            cell_state = outputs[2]

            # Get the latent space
            o1 = torch.argmax(torch.softmax(output, dim=2), dim=2)
            h1 = torch.mean(hidden, dim=0).cpu().detach().tolist()
            c1 = torch.mean(cell_state, dim=0).cpu().detach().tolist()
            for i in range(len(h1)):
                temp_ls = h1[i]
                temp_encode = new_trg[:, i].cpu().detach().tolist()
                temp_decode = o1[:, i].cpu().detach().tolist()
                try:
                    index_1 = temp_decode.index(1)
                except ValueError:
                    index_1 = len(temp_decode)
                temp_error = np.array(temp_encode) - np.array(temp_decode)
                error = sum(
                    np.absolute(temp_error[1:index_1]) > 0) / len(temp_error)
                error_list.append(error)
                ls_list.append(temp_ls)
                encode_list.append(temp_encode)
                decode_list.append(temp_decode)

            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            rev_trg = new_trg[1:].view(-1)
            loss = criterion(output, rev_trg)
            print("Reconstruction Loss for iteration " + str(j) + " is :" +
                  str(round(loss.item(), 3)))
            epoch_loss += loss.item()

    # Print the overall average error
    print("Average reconstruction error: ", epoch_loss / len(test_iterator))
    torch.cuda.empty_cache()

    final_list, only_smiles_list = [], []
    for i in range(len(encode_list)):
        temp_encode = encode_list[i]
        temp_decode = decode_list[i]
        temp_encode_str, temp_decode_str, temp_mol_str, temp_error_str = \
            '', '', '', ''
        # Get the original string
        for j in range(1, len(temp_encode)):
            # Break when it sees padding
            if temp_encode[j] == 1:
                break
            # Don't append the end-of-sentence token
            if temp_encode[j] != 3:
                temp_encode_str += src.vocab.itos[temp_encode[j]]
        # Get the decoded string
        for j in range(1, len(temp_decode)):
            if temp_decode[j] == 1:
                break
            if temp_decode[j] != 3:
                temp_decode_str += src.vocab.itos[temp_decode[j]]
        # m = Chem.MolFromSmiles(temp_decode_str)
        # if (m is not None):
        #     temp_mol_str = '1'
        # else:
        #     temp_mol_str = '0'
        # string_list = [temp_encode_str, temp_decode_str, temp_mol_str,
        #                str(error_list[i])]
        # only_smiles_list.append(string_list)
        # string_list_with_ls = string_list + ls_list[i]
        # final_list.append(string_list_with_ls)

    colids = ['LS_' + str(x) for x in range(len(ls_list[0]))]
    final_out_df = pd.DataFrame(ls_list, columns=colids)
    return final_out_df
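# Hypothetical usage ('test_smiles.csv' is a made-up file name for a CSV
# with src/trg columns): dump the latent-space features to disk.
latent_df = run_smiles_generator('test_smiles.csv')
latent_df.to_csv('latent_space.csv', index=False)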
tst_src_p = pickle.load(open('data/tst_src_p.pkl', 'rb'))
l_tst_tgt = pickle.load(open('data/l_tst_tgt.pkl', 'rb'))
tst_tgt_p = pickle.load(open('data/tst_tgt_p.pkl', 'rb'))
l_trn_src = pickle.load(open('data/l_trn_src.pkl', 'rb'))
trn_src_p = pickle.load(open('data/trn_src_p.pkl', 'rb'))
l_trn_tgt = pickle.load(open('data/l_trn_tgt.pkl', 'rb'))
trn_tgt_p = pickle.load(open('data/trn_tgt_p.pkl', 'rb'))

tst_src_t = torch.LongTensor(tst_src_p)
tst_tgt_t = torch.LongTensor(tst_tgt_p)
trn_src_t = torch.LongTensor(trn_src_p)
trn_tgt_t = torch.LongTensor(trn_tgt_p)

enc = Encoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad])
dec = Decoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad], vocab[sos],
              vocab[eos], vocab[unk])
enc.to('cuda')
dec.to('cuda')
opt_enc = torch.optim.Adam(enc.parameters())
opt_dec = torch.optim.Adam(dec.parameters())

n_batch = len(trn_src_p) // batch_size
for e in range(epochs):
    enc.train()
    dec.train()
    epoch_loss = 0
    for i in range(n_batch):
        opt_enc.zero_grad()
        opt_dec.zero_grad()
        lengths = torch.LongTensor(l_trn_src[batch_size * i:batch_size *