def build_model1(self):
    """Build the BiLSTM-encoder / attention-decoder translation graph.

    Returns a dict with the symbolic training cost, the three input
    variables (source indices, decoder-input indices, gold indices),
    the trainable parameters, and the softmax prediction.
    """
    # Lookup tables -> embeddings.
    src_embedding_layer = EmbeddingLayer(input_dim=self.n_src_vocab,
                                         output_dim=self.src_embed_dim,
                                         name='src_embedding')
    # BUG FIX: was name='src_embedding' (copy-paste), colliding with the
    # source table's parameter names.
    tgt_embedding_layer = EmbeddingLayer(input_dim=self.n_tgt_vocab,
                                         output_dim=self.tgt_embed_dim,
                                         name='tgt_embedding')

    # LSTMs: bidirectional encoder over the source, one decoder over the target.
    src_lstm_forward = LSTM(input_dim=self.src_embed_dim, output_dim=self.src_lstm_op_dim)
    src_lstm_backward = LSTM(input_dim=self.src_embed_dim, output_dim=self.src_lstm_op_dim)
    tgt_lstm = LSTM(input_dim=self.tgt_embed_dim, output_dim=self.tgt_lstm_op_dim)

    # Decoder state + attention context -> distribution over target vocabulary.
    # BUG FIX: dims were referenced without `self.` (NameError at graph build).
    proj_layer = FullyConnectedLayer(
        input_dim=self.tgt_lstm_op_dim + 2 * self.src_lstm_op_dim,
        output_dim=self.n_tgt_vocab,
        activation='softmax')

    # tgt_lstm.params[:-1]: the last decoder parameter is excluded from
    # training — presumably the initial hidden state, which the encoder
    # supplies below. TODO confirm the LSTM param layout.
    params = (src_embedding_layer.params + tgt_embedding_layer.params
              + src_lstm_forward.params + src_lstm_backward.params
              + tgt_lstm.params[:-1] + proj_layer.params)

    # Symbolic inputs: sequences of word indices.
    src_ip = T.ivector()
    tgt_ip = T.ivector()
    tgt_op = T.ivector()

    # Lookup table -> embedding.
    src_embed_ip = src_embedding_layer.fprop(src_ip)
    tgt_embed_ip = tgt_embedding_layer.fprop(tgt_ip)

    # Bidirectional encoder: the backward pass runs on the reversed input and
    # its outputs are flipped back so row t corresponds to source word t.
    src_lstm_forward.fprop(src_embed_ip)
    src_lstm_backward.fprop(src_embed_ip[::-1, :])
    encoderh = T.concatenate((src_lstm_forward.h, src_lstm_backward.h[::-1, :]), axis=1)

    # Final encoder state initializes the decoder.
    tgt_lstm.h_0 = encoderh[-1]
    tgt_lstm.fprop(tgt_embed_ip)

    # Dot-product (Luong-style) attention; see http://arxiv.org/abs/1508.04025
    attention = tgt_lstm.h.dot(encoderh.transpose())
    attention = attention.dot(encoderh)

    decoderh = T.concatenate((attention, tgt_lstm.h), axis=1)
    # BUG FIX: was proj_layer.fprop(decoder) — `decoder` is undefined.
    proj_op = proj_layer.fprop(decoderh)

    # Cross-entropy + hidden-norm stabilizer (arXiv:1511.08400).
    # BUG FIX: `beta` was undefined here; default to 500 as used in the
    # companion training script, overridable via an instance attribute.
    beta = getattr(self, 'beta', 500)
    cost = T.nnet.categorical_crossentropy(proj_op, tgt_op).mean()
    cost += beta * T.mean((tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2)

    return dict({'cost': cost, 'src_ip': src_ip, 'tgt_ip': tgt_ip,
                 'tgt_op': tgt_op, 'params': params, 'proj_op': proj_op})
def __init__(self, train_steps):
    """Parse the chorale MIDI file and size an LSTM network to its frames."""
    self.train_steps = train_steps
    self.parser = Parser('bach/chorales/01ausmei.mid')

    # Frame matrix plus track/timing metadata from the MIDI parse.
    frames, num_tracks, min_time = self.parser.parse()
    self.frames = frames
    self.num_tracks = num_tracks
    self.min_time = min_time

    # Network dimensions are derived from the parsed data.
    self.time_steps = len(frames)
    self.num_features = len(frames[0])
    self.network = LSTM(train_steps - 1, self.num_features)
def main():
    """Train and/or sample from a character-level RNN over ABC-notation text.

    Reads the corpus named by ``args.file``, builds character<->id lookup
    tables, constructs a GRU or LSTM model, then either trains (optionally
    resuming from a checkpoint) or loads weights and generates output.
    """
    with open(args.file, 'r') as f:
        inp = f.read()

    # Tokenizing dictionary: each distinct character maps to an integer id,
    # plus an explicit padding token appended last.
    char2int = dict((a, b) for b, a in enumerate(list(set(inp))))
    char2int['<pad>'] = len(char2int)
    # Reverse lookup for decoding generated ids back into text.
    int2char = {v: k for k, v in char2int.items()}

    train_data, valid_data = utils.grab_data(args.split_pct, inp)

    # Number of total unique characters == size of the output layer.
    num_outputs = len(char2int)

    # Initialize recurrent network.
    if args.network == 'GRU':
        print("Using GRU Network")
        model = GRU.GRU(args.batch_size, args.num_units, args.num_layers, num_outputs, args.dropout)
    else:
        print("Using LSTM Network")
        model = LSTM.LSTM(args.batch_size, args.num_units, args.num_layers, num_outputs, args.dropout)

    # Loss function for the network.
    criterion = torch.nn.CrossEntropyLoss()

    # Optimizer selection; Adam is the default.
    if args.optim.lower() == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr=args.learning_rate)
    elif args.optim.lower() == "rms":
        optimizer = optim.RMSprop(model.parameters(), lr=args.learning_rate)
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    # Hoisted: the checkpoint path was built identically in three places.
    checkpoint_path = './saves/checkpoint-' + str(args.save_append) + '.pth.tar'

    if args.training.lower() == 'true' and args.resume_training.lower() == 'true':
        # Resume an interrupted training run from the saved checkpoint.
        print('Loading model...')
        model, optimizer, epoch_i, losses, seq_len = utils.resume(
            model, optimizer, gpu, filepath=checkpoint_path)
        print('Resuming training with model loaded from ' + checkpoint_path)
        print('Epoch: %d\tCurrent Train Loss: %f\tCurrent Valid Loss: %f'
              % (epoch_i, losses['train'][-1], losses['valid'][-1]))
        _, _ = train(model, train_data, valid_data, seq_len, criterion, optimizer,
                     char2int, losses=losses, epoch=(epoch_i + 1))
    elif args.training.lower() == 'true':
        # Fresh training run.
        _, _ = train(model, train_data, valid_data, args.seq_len, criterion,
                     optimizer, char2int)
    else:
        # Inference only: load trained weights, then visualize or generate.
        # NOTE(review): generation/heat-map block placed inside this branch —
        # confirm the original indentation intent.
        model, _, _, _, _ = utils.resume(model, optimizer, gpu, filepath=checkpoint_path)
        if args.heat_map:
            heat_map(model, char2int, int2char)
        else:
            generate_music(model, char2int, int2char)
        if args.generate_heat_map:
            # BUG FIX: removed leftover debug print ('in if statement').
            heat_map(model, char2int, int2char, song_path=args.generate_file)
def main():
    """Train a BiLSTM-encoder / attention-decoder NMT model in Theano.

    Relies on module-level globals: the ``train_src``/``train_tgt`` parallel
    corpora and the helpers ``create_word_table``, ``get_predictions``,
    ``get_validation_bleu``, plus the ``EmbeddingLayer``/``LSTM``/
    ``HiddenLayer``/``LearningMethod`` classes. Python 2 code
    (print statements, ``xrange``, ``sys.maxint``).
    """
    # Vocabulary lookup tables for both languages.
    source_word2idx, source_idx2word = create_word_table(train_src)
    target_word2idx, target_idx2word = create_word_table(train_tgt)
    sys.stderr.write("Lookup table constructed." + "\n")

    src_emb_dim = 256  # source word embedding dimension
    tgt_emb_dim = 256  # target word embedding dimension
    src_lstm_hid_dim = 512  # source LSTMs hidden dimension
    tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
    dropout = 0.5  # dropout rate (declared but not applied below -- TODO confirm)
    n_src = len(source_word2idx)  # number of words in the source language
    n_tgt = len(target_word2idx)  # number of words in the target language

    # Trainable parameters, accumulated layer by layer.
    params = []

    # Embedding lookup tables for source and target words.
    src_lookup = EmbeddingLayer(n_src, src_emb_dim, name='src_lookup')
    tgt_lookup = EmbeddingLayer(n_tgt, tgt_emb_dim, name='tgt_lookup')
    params += src_lookup.params + tgt_lookup.params

    # LSTMs: bidirectional encoder (for/rev) plus the decoder. The decoder
    # input is 2 * tgt_emb_dim wide: target embeddings concatenated with the
    # projected encoder context (proj_layer2 below).
    src_lstm_for = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_for', with_batch=False)
    src_lstm_rev = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_rev', with_batch=False)
    tgt_lstm = LSTM(2 * tgt_emb_dim, tgt_lstm_hid_dim, name='tgt_lstm', with_batch=False)
    # tgt_lstm.params[:-1]: the last decoder parameter is not trained --
    # presumably its initial hidden state, which the encoder supplies below.
    # TODO confirm the LSTM param layout.
    params += src_lstm_for.params + src_lstm_rev.params + tgt_lstm.params[:-1]

    # Projection layers:
    #   proj_layer1: decoder state + attention context -> target vocab softmax
    #   proj_layer2: encoder context -> target embedding space (tanh)
    proj_layer1 = HiddenLayer(tgt_lstm_hid_dim + 2 * src_lstm_hid_dim, n_tgt, name='proj_layer1', activation='softmax')
    proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name='proj_layer2', activation='tanh')
    params += proj_layer1.params + proj_layer2.params

    beta = 500  # weight of the hidden-norm regularizer (arXiv:1511.08400)

    # Train status flag (declared but not used below -- TODO confirm).
    is_train = T.iscalar('is_train')
    # Input sentence (source word indices).
    src_sentence = T.ivector()
    # Current output translation (decoder input: gold shifted right by one).
    tgt_sentence = T.ivector()
    # Gold translation (cross-entropy targets).
    tgt_gold = T.ivector()

    src_sentence_emb = src_lookup.link(src_sentence)
    tgt_sentence_emb = tgt_lookup.link(tgt_sentence)

    # Bidirectional encoder: the reverse pass runs on the flipped sequence and
    # its outputs are flipped back so row t corresponds to source word t.
    src_lstm_for.link(src_sentence_emb)
    src_lstm_rev.link(src_sentence_emb[::-1, :])
    src_context = T.concatenate([src_lstm_for.h, src_lstm_rev.h[::-1, :]], axis=1)

    # Decoder starts from the final encoder state.
    tgt_lstm.h_0 = src_context[-1]
    # The projected final context is appended to every decoder input step.
    repeated_src_context = T.repeat(src_context[-1].dimshuffle('x', 0), tgt_sentence_emb.shape[0], axis=0)
    repeated_src_context = proj_layer2.link(repeated_src_context)
    tgt_sentence_emb = T.concatenate((tgt_sentence_emb, repeated_src_context), axis=1)
    tgt_lstm.link(tgt_sentence_emb)

    # Attention: dot-product scores against the encoder states, then a
    # weighted sum of those states.
    transition = tgt_lstm.h.dot(src_context.transpose())
    transition = transition.dot(src_context)
    transition_last = T.concatenate([transition, tgt_lstm.h], axis=1)
    prediction = proj_layer1.link(transition_last)

    cost = T.nnet.categorical_crossentropy(prediction, tgt_gold).mean()
    # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf
    cost += beta * T.mean((tgt_lstm.h[:-1]**2 - tgt_lstm.h[1:]**2)**2)

    updates = LearningMethod(clip=5.0).get_updates('adam', cost, params)
    # Compiled Theano functions: one training step / one forward evaluation.
    f_train = theano.function(inputs=[src_sentence, tgt_sentence, tgt_gold], outputs=cost, updates=updates)
    f_eval = theano.function(
        inputs=[src_sentence, tgt_sentence],
        outputs=prediction,
    )

    best_valid_preds = None
    best_valid_score = -sys.maxint
    best_test_preds = None
    log = open('blue_valid_log.txt', 'w')
    all_costs = []
    batch_size = 50  # declared but batching is not applied below -- TODO confirm
    n_epochs = 10
    for i in xrange(n_epochs):
        print 'Starting epoch %i' % i
        # Shuffle the parallel corpora in unison each epoch.
        indices = range(len(train_src))
        np.random.shuffle(indices)
        train_src_batch = [train_src[ind] for ind in indices]
        train_tgt_batch = [train_tgt[ind] for ind in indices]
        assert len(train_src_batch) == len(train_tgt_batch)
        costs = []
        for j in xrange(len(train_src_batch)):
            # Decoder input: gold sentence without its last token;
            # targets: gold sentence without its first token.
            new_cost = f_train(
                np.array([source_word2idx[x] for x in train_src_batch[j]]).astype(np.int32),
                np.array([target_word2idx[x] for x in train_tgt_batch[j]][:-1]).astype(np.int32),
                np.array([target_word2idx[x] for x in train_tgt_batch[j]][1:]).astype(np.int32))
            all_costs.append((j, new_cost))
            costs.append(new_cost)
            if j % 300 == 0:
                # Report and reset the running mean every 300 sentences.
                print j, np.mean(costs)
                costs = []
            if np.isnan(new_cost):
                print 'NaN detected.'
                break
            if j % 10000 == 0 and j != 0:
                # Periodic BLEU evaluation on the validation set.
                valid_preds = get_predictions(source_word2idx, target_word2idx, target_idx2word, f_eval, mode="validation")
                bleu = get_validation_bleu(valid_preds)
                print '==================================================================='
                print 'Epoch %i BLEU on Validation : %s ' % (i, bleu)
                print '==================================================================='
                if float(bleu) >= best_valid_score:
                    # Keep the test predictions from the best validation model.
                    best_valid_score = float(bleu)
                    best_valid_preds = copy.deepcopy(valid_preds)
                    best_test_preds = get_predictions(source_word2idx, target_word2idx, target_idx2word, f_eval, mode="test")
                    print 'Found new best validation score %f ' % (best_valid_score)
                log.write('Epoch %d Minibatch %d BLEU on Validation : %s \n' % (i, j, bleu))
        # Store after epoch
        fout = open('output' + str(i) + '.txt', 'w')
        for line in best_test_preds:
            fout.write(' '.join(line) + '\n')
        fout.close()
    log.close()
BATCH = 20 TRAIN_FILE = STOCK + PERIOD + '_train.csv' TEST_FILE = STOCK + PERIOD + '_test.csv' STANDARDIZE = True # train data_train = generate_df(TRAIN_FILE, AFFECT, STANDARDIZE) data_test = generate_df(TEST_FILE, AFFECT, STANDARDIZE) data_loader_train = DataLoader(data_train['dataset'], batch_size=BATCH, shuffle=False) data_loader_test = DataLoader(data_test['dataset'], batch_size=1, shuffle=False) net = LSTM(AFFECT).cuda() optimizer = optim.Adam(net.parameters()) loss_func = nn.MSELoss().cuda() # loss_func = DA().cuda() da_list = list() mse_list = list() theil_list = list() l1_list = list() mape_list = list() r_list = list() for step in range(EPOCH): loss = None for tx, ty in data_loader_train: output = net(tx.reshape(1, BATCH, AFFECT))