import torch
import torch.nn as nn
from collections import defaultdict

# RNN is the project's character language-model network, defined elsewhere.


class Scorer(object):
    def __init__(self, char_list, model_path, rnn_type, ninp, nhid, nlayers,
                 device):
        char_list = list(char_list) + ['sil_start', 'sil_end']
        self.inv_vocab_map = dict([(i, c) for (i, c) in enumerate(char_list)])
        self.vocab_map = dict([(c, i) for (i, c) in enumerate(char_list)])
        self.criterion = nn.CrossEntropyLoss()
        self.device = device
        self.rnn = RNN(rnn_type, len(char_list), ninp, nhid,
                       nlayers).to(self.device)
        self.rnn.load_state_dict(torch.load(model_path))
        self.rnn.eval()
        self.history = defaultdict(tuple)

    def get_score(self, string):
        if len(string) < 2:
            return 0, self.rnn.init_hidden(1)
        # list() is required: in Python 3, map() returns an iterator, which
        # cannot be sliced below.
        string_idx = list(map(lambda x: self.vocab_map[x], string))
        # `inp` rather than `input`, to avoid shadowing the builtin.
        inp = string_idx[:-1]
        grt = string_idx[1:]
        inp = torch.LongTensor(inp).to(self.device)
        grt = torch.LongTensor(grt).to(self.device)
        inp = inp.view(1, inp.size()[0])
        init_hidden = self.rnn.init_hidden(1)
        pred, hidden = self.rnn(inp, init_hidden)
        pred = pred.view(-1, pred.size(-1))
        loss = self.criterion(pred, grt)
        # CrossEntropyLoss averages over the (len - 1) predictions, so
        # multiply back to get the total log-probability of the string.
        return -(len(string_idx) - 1) * loss.item(), hidden

    def get_score_fast(self, strings):
        strings = [''.join(x) for x in strings]
        history_to_update = defaultdict(tuple)
        scores = []
        for string in strings:
            if len(string) <= 2:
                score, hidden_state = self.get_score(string)
                scores.append(score)
                history_to_update[string] = (score, hidden_state)
            elif string in self.history:
                history_to_update[string] = self.history[string]
                scores.append(self.history[string][0])
            elif string[:-1] in self.history:
                # Extend a cached prefix by one character: feed the previous
                # character as input and score the new one as the target.
                score, hidden = self.history[string[:-1]]
                inp = torch.LongTensor([self.vocab_map[string[-2]]]).view(
                    1, 1).to(self.device)
                grt = torch.LongTensor([self.vocab_map[string[-1]]]).to(
                    self.device)
                pred, hidden = self.rnn(inp, hidden)
                loss = self.criterion(pred.view(-1, pred.size(-1)),
                                      grt).item()
                history_to_update[string] = (score - loss, hidden)
                scores.append(score - loss)
            else:
                raise ValueError("%s not stored" % (string[:-1]))
        self.history = history_to_update
        return scores
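# Hypothetical usage sketch for Scorer (not part of the original source).
# Assumptions: a trained character-LM checkpoint exists at 'char_lm.pt' and
# the RNN constructor matches the call in Scorer.__init__; every name and
# hyperparameter below is illustrative only.
scorer = Scorer(char_list='abcdefghijklmnopqrstuvwxyz ',
                model_path='char_lm.pt', rnn_type='LSTM', ninp=128,
                nhid=256, nlayers=2, device=torch.device('cpu'))
# get_score_fast() caches (score, hidden state) per string, so each call may
# only extend prefixes scored in the previous call by one character, which is
# exactly the access pattern of a beam search over characters.
print(scorer.get_score_fast(['he', 'ha']))
print(scorer.get_score_fast(['hel', 'hat']))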
def load_model(input_size):
    model = RNN(input_size, hidden_size, num_layers)
    # Load on CPU only.
    checkpoint = torch.load('checkpoint.pt', map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    print(model)
    print('model training loss', checkpoint['loss'])
    print('model training epoch', checkpoint['epoch'])
    return model
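# Hypothetical counterpart to load_model above (not from the original
# source): a sketch of how a checkpoint containing the keys read above
# ('model_state_dict', 'loss', 'epoch') could be written. `model`,
# `optimizer`, `epoch`, and `loss` are assumed to exist at save time.
def save_checkpoint(model, optimizer, epoch, loss, path='checkpoint.pt'):
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
        'loss': loss,
    }, path)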
def main():
    train_names = []
    train_labels = []
    # Read the file names and speaker labels for the training set.
    with open('train.txt', 'r') as file:
        for line in file:
            speaker = line.split('-')[0]
            speech = line.split('-')[1]
            file_path = os.path.join('./LibriSpeech/dev-clean/', speaker,
                                     speech, line.split('\n')[0])
            train_names.append(file_path)
            train_labels.append(speaker)

    test_names = []
    test_labels = []
    # Read the file names and speaker labels for the testing set.
    with open('test.txt', 'r') as file:
        for line in file:
            speaker = line.split('-')[0]
            speech = line.split('-')[1]
            file_path = os.path.join('./LibriSpeech/dev-clean/', speaker,
                                     speech, line.split('\n')[0])
            test_names.append(file_path)
            test_labels.append(speaker)

    # Encode the speakers as one-hot vectors. One-hot encoding is a workable
    # representation here because there are only 40 speakers.
    label_encoder = LabelEncoder()
    train_data_labels = label_encoder.fit_transform(train_labels)
    n_classes = len(np.unique(train_data_labels))
    print('Number of Train classes', n_classes)
    binarize = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
    train_data_labels = binarize.fit_transform(train_data_labels)

    label_encoder = LabelEncoder()
    test_data_labels = label_encoder.fit_transform(test_labels)
    n_classes = len(np.unique(test_data_labels))
    print('Number of Test classes', n_classes)
    binarize = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
    test_data_labels = binarize.fit_transform(test_data_labels)

    # Load the data for training and testing.
    train, train_labels = load_data_truncate(train_names, train_data_labels)
    val, val_labels = load_data_truncate(test_names, test_data_labels)

    # Prepare the data for the DataLoader so that it can be used in batches.
    train = np.array(train).astype(np.float32)
    val = np.array(val).astype(np.float32)
    train_labels = np.array(train_labels).astype(np.float32)
    val_labels = np.array(val_labels).astype(np.float32)
    train_load = []
    for i in range(0, len(train)):
        train_load.append((train[i], train_labels[i]))
    val_load = []
    for i in range(0, len(val)):
        val_load.append((val[i], val_labels[i]))

    # DataLoader for the train set: batch size of 4, shuffled, dropping the
    # samples that do not fit the batch size.
    train_dataset = DataLoader(train_load, batch_size=4, shuffle=True,
                               drop_last=True)
    # DataLoader for the test set.
    val_dataset = DataLoader(val_load)

    # Initialize the RNN.
    model = RNN(input_size=100, output_size=n_classes, hidden_dim=256,
                n_layers=1)

    # Hyperparameters for training.
    n_epochs = 100
    lr = 0.00001

    # Define loss and optimizer.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Training part.
    train_accuracy = []
    test_accuracy = []
    train_loss = []
    test_loss = []
    for epoch in range(0, n_epochs):
        model.train()
        epoch_loss = []  # Losses for all batches of this epoch
        correct_predictions = 0
        total_predictions = 0
        # Iterate through the data loader.
        for i, (x, y) in enumerate(train_dataset):
            # Reshape for training.
            x = Variable(x.view(-1, 20, 100))
            y = Variable(y)
            # Clear the gradients accumulated by the previous batch; this
            # must happen once per batch, not once per epoch, or gradients
            # silently accumulate across batches.
            optimizer.zero_grad()
            output, _ = model(x)
            # Obtain predictions.
            target = torch.argmax(y, dim=1)
            loss = criterion(output, target)
            loss.backward()   # Backpropagation: compute the gradients
            optimizer.step()  # Update the weights accordingly
            epoch_loss.append(loss.item())
            # Count correct predictions and the total number of predictions.
            correct, predicted = compute_accuracy(output, target)
            correct_predictions += correct
            total_predictions += predicted
        # Print statistics every 10th epoch.
        if epoch % 10 == 0:
            print('Epoch: {}/{}.............'.format(epoch, n_epochs),
                  end=' ')
            print("Loss: {:.4f}, Accuracy: {}/{}".format(
                np.average(epoch_loss), correct_predictions.item(),
                total_predictions))
        train_accuracy.append(correct_predictions.item() / total_predictions)
        train_loss.append(np.average(epoch_loss))

        # Evaluate the model on the test set.
        model.eval()
        correct_predictions = 0
        total_predictions = 0
        epoch_val_loss = []
        for i, (x, y) in enumerate(val_dataset):
            x = Variable(x.view(-1, 20, 100))
            y = Variable(y)
            output, _ = model(x)
            target = torch.argmax(y, dim=1)
            loss = criterion(output, target)
            epoch_val_loss.append(loss.item())
            correct, predicted = compute_accuracy(output, target)
            correct_predictions += correct
            total_predictions += predicted
        print("Eval Accuracy: {}/{}".format(correct_predictions.item(),
                                            total_predictions))
        test_accuracy.append(correct_predictions.item() / total_predictions)
        test_loss.append(np.average(epoch_val_loss))

    # Final evaluation pass: collect predictions and targets for later
    # analysis.
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    preds = []
    targets = []
    for i, (x, y) in enumerate(val_dataset):
        x = Variable(x.view(-1, 20, 100))
        y = Variable(y)
        output, _ = model(x)
        target = torch.argmax(y, dim=1)
        correct, predicted = compute_accuracy(output, target)
        preds.append(output)
        targets.append(target)
        correct_predictions += correct
        total_predictions += predicted
    print("Final Eval Accuracy: {}/{}".format(correct_predictions.item(),
                                              total_predictions))

    with open('accuracy.pickle', 'wb') as f:
        pickle.dump(train_accuracy, f)
        pickle.dump(test_accuracy, f)
    with open('loss.pickle', 'wb') as f:
        pickle.dump(train_loss, f)
        pickle.dump(test_loss, f)
    with open('preds.pickle', 'wb') as f:
        pickle.dump(preds, f)
        pickle.dump(targets, f)
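# main() above relies on a compute_accuracy helper that is not shown in this
# snippet. A minimal sketch of its assumed behavior, inferred from the call
# sites: the running sum of `correct` has .item() called on it, so a 0-dim
# tensor is returned first, together with the batch size.
def compute_accuracy(output, target):
    # Predicted class is the arg-max over the output logits.
    predicted_classes = torch.argmax(output, dim=1)
    correct = (predicted_classes == target).sum()
    return correct, target.size(0)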
    # (Inside the character-LM training loop: `chars` is the current input
    # window of `input_text` and `next_chars` its one-step-shifted targets.)
    next_chars = input_text[start + 1:end + 1]
    chars_tensor = torch.stack(
        tuple(map(one_hot_encode, map(char_to_index, chars))))
    next_chars_index = tuple(map(char_to_index, next_chars))
    next_chars_tensor = torch.tensor(next_chars_index)

    optimizer.zero_grad()
    pred_chars, hidden_state = net(chars_tensor, hidden_state)
    loss = F.cross_entropy(pred_chars, next_chars_tensor)
    loss.backward()
    loss = loss.item()
    total_loss += loss
    if i - last_print > 1000:
        print(total_loss / i)
        last_print = i
    optimizer.step()
    # Detach the hidden state so gradients do not flow across batches
    # (truncated backpropagation through time).
    hidden_state = hidden_state.detach()
    i += BATCH_SIZE

torch.save(net.state_dict(), MODEL_SAVE_PATH)
net.eval()
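# The loop above assumes char_to_index and one_hot_encode helpers that are
# not defined in this snippet. Minimal sketches, assuming a module-level
# vocabulary built from input_text (the real implementations may differ):
vocab = sorted(set(input_text))

def char_to_index(c):
    # Integer id of a character in the vocabulary.
    return vocab.index(c)

def one_hot_encode(index):
    # One float vector per character; torch.stack() above combines them into
    # the (batch, vocab_size) input tensor.
    v = torch.zeros(len(vocab))
    v[index] = 1.0
    return v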
class SDFA():
    def __init__(self, model_path, gpu=-1):
        self.model_path = model_path
        if gpu < 0:
            self.device = torch.device("cpu")
            model_dict = torch.load(
                self.model_path,
                map_location=lambda storage, loc: storage)
            self.fa = face_alignment.FaceAlignment(
                face_alignment.LandmarksType._2D, device="cpu",
                flip_input=False)
        else:
            self.device = torch.device("cuda:" + str(gpu))
            model_dict = torch.load(
                self.model_path,
                map_location=lambda storage, loc: storage.cuda(gpu))
            self.fa = face_alignment.FaceAlignment(
                face_alignment.LandmarksType._2D,
                device="cuda:" + str(gpu), flip_input=False)

        self.stablePntsIDs = [33, 36, 39, 42, 45]
        self.mean_face = model_dict["mean_face"]
        self.img_size = model_dict["img_size"]
        self.audio_rate = model_dict["audio_rate"]
        self.video_rate = model_dict["video_rate"]
        self.audio_feat_len = model_dict['audio_feat_len']
        self.audio_feat_samples = model_dict['audio_feat_samples']
        self.id_enc_dim = model_dict['id_enc_dim']
        self.rnn_gen_dim = model_dict['rnn_gen_dim']
        self.aud_enc_dim = model_dict['aud_enc_dim']
        self.aux_latent = model_dict['aux_latent']
        self.sequential_noise = model_dict['sequential_noise']
        self.conversion_dict = {'s16': np.int16, 's32': np.int32}

        self.img_transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((self.img_size[0], self.img_size[1])),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

        self.encoder = RNN(self.audio_feat_len, self.aud_enc_dim,
                           self.rnn_gen_dim, self.audio_rate,
                           init_kernel=0.005, init_stride=0.001)
        self.encoder.to(self.device)
        self.encoder.load_state_dict(model_dict['encoder'])

        self.encoder_id = ImageEncoder(code_size=self.id_enc_dim,
                                       img_size=self.img_size)
        self.encoder_id.to(self.device)
        self.encoder_id.load_state_dict(model_dict['encoder_id'])

        skip_channels = list(self.encoder_id.channels)
        skip_channels.reverse()
        self.generator = Generator(
            self.img_size, self.rnn_gen_dim,
            condition_size=self.id_enc_dim,
            num_gen_channels=self.encoder_id.channels[-1],
            skip_channels=skip_channels, aux_size=self.aux_latent,
            sequential_noise=self.sequential_noise)
        self.generator.to(self.device)
        self.generator.load_state_dict(model_dict['generator'])

        self.encoder.eval()
        self.encoder_id.eval()
        self.generator.eval()

    def save_video(self, video, audio, path, overwrite=True,
                   experimental_ffmpeg=False, scale=None):
        if not os.path.isabs(path):
            path = os.getcwd() + "/" + path
        with tempdir() as dirpath:
            writer = sio.FFmpegWriter(
                dirpath + "/tmp.avi",
                inputdict={'-r': str(self.video_rate) + "/1"},
                outputdict={'-r': str(self.video_rate) + "/1"})
            for i in range(video.shape[0]):
                frame = np.rollaxis(video[i, :, :, :], 0, 3)
                if scale is not None:
                    frame = tf.rescale(frame, scale, anti_aliasing=True,
                                       multichannel=True, mode='reflect')
                writer.writeFrame(frame)
            writer.close()
            wav.write(dirpath + "/tmp.wav", self.audio_rate, audio)
            in1 = ffmpeg.input(dirpath + "/tmp.avi")
            in2 = ffmpeg.input(dirpath + "/tmp.wav")
            if experimental_ffmpeg:
                out = ffmpeg.output(in1['v'], in2['a'], path, strict='-2',
                                    loglevel="panic")
            else:
                out = ffmpeg.output(in1['v'], in2['a'], path,
                                    loglevel="panic")
            if overwrite:
                out = out.overwrite_output()
            out.run()

    def preprocess_img(self, img):
        src = self.fa.get_landmarks(img)[0][self.stablePntsIDs, :]
        dst = self.mean_face[self.stablePntsIDs, :]
        tform = tf.estimate_transform('similarity', src, dst)
        warped = tf.warp(img, inverse_map=tform.inverse,
                         output_shape=self.img_size)
        warped = warped * 255
        warped = warped.astype('uint8')
        return warped

    def _cut_sequence_(self, seq, cutting_stride, pad_samples):
        # Pad the sequence so every window has audio_feat_samples samples,
        # then slice it into overlapping windows one video frame apart.
        pad_left = torch.zeros(pad_samples // 2, 1)
        pad_right = torch.zeros(pad_samples - pad_samples // 2, 1)
        seq = torch.cat((pad_left, seq), 0)
        seq = torch.cat((seq, pad_right), 0)
        stacked = seq.narrow(0, 0, self.audio_feat_samples).unsqueeze(0)
        iterations = (seq.size()[0] -
                      self.audio_feat_samples) // cutting_stride + 1
        for i in range(1, iterations):
            stacked = torch.cat(
                (stacked,
                 seq.narrow(0, i * cutting_stride,
                            self.audio_feat_samples).unsqueeze(0)))
        return stacked.to(self.device)

    def _broadcast_elements_(self, batch, repeat_no):
        # Repeat every element of the batch repeat_no times along a new
        # time dimension.
        total_tensors = []
        for i in range(0, batch.size()[0]):
            total_tensors += [torch.stack(repeat_no * [batch[i]])]
        return torch.stack(total_tensors)

    def __call__(self, img, audio, fs=None, aligned=False):
        if isinstance(img, str):
            frm = Image.open(img)
            frm.thumbnail((400, 400))
            frame = np.array(frm)
        else:
            frame = img
        if not aligned:
            frame = self.preprocess_img(frame)

        if isinstance(audio, str):
            info = mediainfo(audio)
            fs = int(info['sample_rate'])
            audio = np.array(
                AudioSegment.from_file(
                    audio,
                    info['format_name']).set_channels(
                        1).get_array_of_samples())
            if info['sample_fmt'] in self.conversion_dict:
                audio = audio.astype(
                    self.conversion_dict[info['sample_fmt']])
            else:
                if max(audio) > np.iinfo(np.int16).max:
                    audio = audio.astype(np.int32)
                else:
                    audio = audio.astype(np.int16)
        if audio.ndim > 1 and audio.shape[1] > 1:
            audio = audio[:, 0]

        max_value = np.iinfo(audio.dtype).max
        if fs != self.audio_rate:
            # Resample to the model's audio rate and normalise to [-1, 1].
            seq_length = audio.shape[0]
            speech = torch.from_numpy(
                signal.resample(
                    audio, int(seq_length * self.audio_rate / float(fs))) /
                float(max_value)).float()
            speech = speech.view(-1, 1)
        else:
            audio = torch.from_numpy(audio / float(max_value)).float()
            speech = audio.view(-1, 1)

        frame = self.img_transform(frame).to(self.device)

        cutting_stride = int(self.audio_rate / float(self.video_rate))
        audio_seq_padding = self.audio_feat_samples - cutting_stride
        audio_feat_seq = self._cut_sequence_(speech, cutting_stride,
                                             audio_seq_padding)
        frame = frame.unsqueeze(0)
        audio_feat_seq = audio_feat_seq.unsqueeze(0)
        audio_feat_seq_length = audio_feat_seq.size()[1]

        z = self.encoder(audio_feat_seq, [audio_feat_seq_length])
        noise = torch.FloatTensor(1, audio_feat_seq_length,
                                  self.aux_latent).normal_(0, 0.33).to(
                                      self.device)

        z_id, skips = self.encoder_id(frame, retain_intermediate=True)
        skip_connections = []
        for skip_variable in skips:
            skip_connections.append(
                self._broadcast_elements_(skip_variable, z.size()[1]))
        skip_connections.reverse()
        z_id = self._broadcast_elements_(z_id, z.size()[1])

        gen_video = self.generator(z, c=z_id, aux=noise,
                                   skip=skip_connections)

        # Convert back to int16 audio and roughly [0, 250] video for saving.
        returned_audio = ((2**15) * speech.detach().cpu().numpy()).astype(
            np.int16)
        gen_video = 125 * gen_video.squeeze().detach().cpu().numpy() + 125
        return gen_video, returned_audio
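# Hypothetical end-to-end usage of SDFA (not from the original source).
# 'model.pt', 'face.jpg', 'speech.wav', and 'generated.mp4' are placeholder
# paths; the checkpoint must contain every key read in SDFA.__init__.
sdfa = SDFA(model_path='model.pt', gpu=-1)      # gpu=-1 selects the CPU path
video, audio = sdfa('face.jpg', 'speech.wav')   # align face, encode, generate
sdfa.save_video(video, audio, 'generated.mp4')  # mux frames and audio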
        # (Inside the batch loop of a training epoch.)
        if (idx + 1) % 5000 == 0:
            create_checkpoint(cnn, rnn, optimizer, epoch + 1, idx + 1,
                              train_loss, params)
        if (idx + 1) % 500 == 0 or (idx + 1) == len(train_data_loader):
            print("Epoch %d (Step %d) - %0.4f train loss, %0.2f time." %
                  (epoch + 1, idx + 1, loss, time.time() - start_time))

    # End-of-epoch summary and checkpoint.
    print("Epoch %d - %0.4f loss, %.2f time. " %
          (epoch + 1, np.mean(train_loss), time.time() - start_time))
    create_checkpoint(cnn, rnn, optimizer, epoch + 1, idx + 1, train_loss,
                      params)

    if (epoch + 1) % 5 == 0:
        # Test the model on a random sub-batch of the test loader.
        cnn.eval()
        rnn.eval()
        print("Steps to be taken - %d\n" % params['sub_batch_test'])
        test_model(cnn, rnn, optimizer, loss_fn, test_data_loader, vocab,
                   params, 'model_' + str(epoch + 1) + '.ckpt',
                   params['device'], params['sub_batch_test'])
        cnn.train()
        rnn.train()

print('Training completed.')

if params['is_testing']:
    cnn.eval()
    rnn.eval()
    print("Steps to be taken - %d\n" % (len(test_data_loader)))
    test_model(cnn, rnn, optimizer, loss_fn, test_data_loader, vocab, params,
               params['load_model_test'], params['device'], -1)
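# Hypothetical sketch of the create_checkpoint helper used above (the real
# implementation is not shown in this snippet); the saved keys, the file
# name, and the 'checkpoint_dir' entry of params are assumptions inferred
# from the call sites only.
def create_checkpoint(cnn, rnn, optimizer, epoch, step, train_loss, params):
    torch.save({
        'cnn_state_dict': cnn.state_dict(),
        'rnn_state_dict': rnn.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
        'step': step,
        'train_loss': train_loss,
    }, os.path.join(params.get('checkpoint_dir', '.'),
                    'model_%d.ckpt' % epoch))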
    # (Inside the training loop over `iter`.)
    model.train()
    prediction = model(X_train_dep_std)
    loss = loss_func(prediction, y_train_dep_std)
    optimizer.zero_grad()  # Clear gradients for this training step
    loss.backward()        # Back propagation: compute gradients
    optimizer.step()
    if iter % 100 == 0:
        print("iteration: %s, loss: %s" % (iter, loss.item()))

# Save the model.
save_filename = 'checkpoints/LSTM_FC.pth'
torch.save(model, save_filename)
print('Saved as %s' % save_filename)

# Start evaluating the model.
model.eval()
y_pred_dep_ = model(X_test_dep_std).detach().numpy()
y_pred_dep = ss_y_dep.inverse_transform(y_pred_dep_[0, 144:])
print('the value of R-squared of Evaporation is',
      r2_score(Outputs[144:], y_pred_dep))
print('the value of Root mean squared error of Evaporation is',
      rmse(Outputs[144:], y_pred_dep))

f, ax1 = plt.subplots(1, 1, sharex=True, figsize=(6, 4))
ax1.plot(Outputs[144:], color="blue", linestyle="-", linewidth=1.5,