Example #1
class Scorer(object):
    def __init__(self, char_list, model_path, rnn_type, ninp, nhid, nlayers,
                 device):
        char_list = list(char_list) + ['sil_start', 'sil_end']
        self.inv_vocab_map = {i: c for i, c in enumerate(char_list)}
        self.vocab_map = {c: i for i, c in enumerate(char_list)}
        self.criterion = nn.CrossEntropyLoss()
        self.device = device
        self.rnn = RNN(rnn_type, len(char_list), ninp, nhid,
                       nlayers).to(self.device)
        self.rnn.load_state_dict(torch.load(model_path))
        self.rnn.eval()
        self.history = defaultdict(tuple)

    def get_score(self, string):
        if len(string) < 2:
            return 0, self.rnn.init_hidden(1)
        # Build the index sequence as a list (map() returns an iterator in Python 3
        # and cannot be sliced).
        string_idx = [self.vocab_map[x] for x in string]
        input = string_idx[:-1]
        grt = string_idx[1:]
        input, grt = torch.LongTensor(input).to(
            self.device), torch.LongTensor(grt).to(self.device)
        input = input.view(1, input.size()[0])
        init_hidden = self.rnn.init_hidden(1)
        pred, hidden = self.rnn(input, init_hidden)
        pred = pred.view(-1, pred.size(-1))
        loss = self.criterion(pred, grt)
        return -(len(string_idx) - 1) * loss.item(), hidden

    def get_score_fast(self, strings):
        strings = [''.join(x) for x in strings]
        history_to_update = defaultdict(tuple)
        scores = []
        for string in strings:
            if len(string) <= 2:
                score, hidden_state = self.get_score(string)
                scores.append(score)
                history_to_update[string] = (score, hidden_state)
            elif string in self.history:
                history_to_update[string] = self.history[string]
                scores.append(self.history[string][0])
            elif string[:-1] in self.history:
                score, hidden = self.history[string[:-1]]
                input, grt = torch.LongTensor([
                    self.vocab_map[string[-2]]
                ]).view(1, 1).to(self.device), torch.LongTensor(
                    [self.vocab_map[string[-1]]]).to(self.device)
                pred, hidden = self.rnn(input, hidden)
                loss = self.criterion(pred.view(-1, pred.size(-1)), grt).item()
                history_to_update[string] = (score - loss, hidden)
                scores.append(score - loss)
            else:
                raise ValueError("%s not stored" % (string[:-1]))
        self.history = history_to_update
        return scores
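A minimal usage sketch for the Scorer above. The character set, checkpoint path, and network sizes are illustrative assumptions, not values from the original example; prefixes are rescored incrementally, with each call to get_score_fast reusing the hidden state cached for the one-character-shorter prefix.

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
scorer = Scorer(char_list='abcdefghijklmnopqrstuvwxyz ',
                model_path='char_rnn.pt',  # assumed checkpoint path
                rnn_type='LSTM', ninp=128, nhid=256, nlayers=2,
                device=device)

# Score two-character prefixes first so their hidden states are cached ...
scores = scorer.get_score_fast([['h', 'e'], ['h', 'a']])
# ... then extend each prefix by one character and rescore incrementally.
scores = scorer.get_score_fast([['h', 'e', 'l'], ['h', 'a', 't']])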
Example #2
def load_model(input_size):

    model = RNN(input_size, hidden_size, num_layers)

    # load on CPU only
    checkpoint = torch.load('checkpoint.pt', map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    print(model)
    print('model training loss', checkpoint['loss'])
    print('model training epoch', checkpoint['epoch'])

    return model
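For reference, a checkpoint with the keys read above ('model_state_dict', 'loss', 'epoch') could be produced during training roughly as follows; the optimizer entry and the surrounding variable names are assumptions, not part of the original example.

# Sketch of writing a checkpoint compatible with load_model() above (assumed keys).
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),  # optional extra entry
    'loss': loss.item(),
}, 'checkpoint.pt')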
Example #3
def main():

    train_names = []
    train_labels = []
    # Read all the file names and the labels of the speakers for the training set
    with open('train.txt', 'r') as file:
        for line in file:
            speaker = line.split('-')[0]
            speech = line.split('-')[1]
            file_path = os.path.join('./LibriSpeech/dev-clean/', speaker,
                                     speech,
                                     line.split('\n')[0])
            train_names.append(file_path)
            train_labels.append(speaker)

    test_names = []
    test_labels = []
    # Read all the file names and the labels of the speakers for the testing set
    with open('test.txt', 'r') as file:
        for line in file:
            speaker = line.split('-')[0]
            speech = line.split('-')[1]
            file_path = os.path.join('./LibriSpeech/dev-clean/', speaker,
                                     speech,
                                     line.split('\n')[0])
            test_names.append(file_path)
            test_labels.append(speaker)

    # The following lines encode the speaker labels as one-hot vectors.
    # One-hot encoding is practical here because the number of speakers is small (40).

    label_encoder = LabelEncoder()
    train_data_labels = label_encoder.fit_transform(train_labels)
    n_classes = len(np.unique(train_data_labels))
    print('Number of Train classes', len(np.unique(train_data_labels)))
    binarize = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
    train_data_labels = binarize.fit_transform(train_data_labels)

    label_encoder = LabelEncoder()
    test_data_labels = label_encoder.fit_transform(test_labels)
    n_classes = len(np.unique(test_data_labels))
    print('Number of Test classes', len(np.unique(test_data_labels)))
    binarize = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
    test_data_labels = binarize.fit_transform(test_data_labels)

    # Loading the data for training and testing

    train, train_labels = load_data_truncate(train_names, train_data_labels)
    val, val_labels = load_data_truncate(test_names, test_data_labels)

    # Preparing the data for the DataLoader so that it can be used in batches

    train = np.array(train).astype(np.float32)
    val = np.array(val).astype(np.float32)
    train_labels = np.array(train_labels).astype(np.float32)
    val_labels = np.array(val_labels).astype(np.float32)
    train_load = list(zip(train, train_labels))
    val_load = list(zip(val, val_labels))

    # Data Loader for the train set. Batch Size of 4, shuffled
    # and dropping the samples which do not fit the batch size.
    train_dataset = DataLoader(train_load,
                               batch_size=4,
                               shuffle=True,
                               drop_last=True)

    # Data Loader for the test set.
    val_dataset = DataLoader(val_load)

    # Initialize the RNN.
    model = RNN(input_size=100,
                output_size=n_classes,
                hidden_dim=256,
                n_layers=1)

    # Specifying the hyperparameters for training
    n_epochs = 100
    lr = 0.00001

    # Define Loss, Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Training part
    train_accuracy = []
    test_accuracy = []
    train_loss = []
    test_loss = []

    for epoch in range(0, n_epochs):

        model.train()

        epoch_loss = []  # Store the losses for all batches of an epoch
        correct_predictions = 0
        total_predictions = 0

        # Iterate through data loader
        for i, (x, y) in enumerate(train_dataset):
            # Reshaping for training
            x = Variable(x.view(-1, 20, 100))
            y = Variable(y)
            output, _ = model(x)  # Obtain predictions
            target = torch.argmax(y, dim=1)
            loss = criterion(output, target)
            optimizer.zero_grad()  # Clear gradients accumulated from the previous batch
            loss.backward()  # Does backpropagation and calculates gradients
            optimizer.step()  # Updates the weights accordingly
            epoch_loss.append(loss.item())
            # Compute number of correct predictions and total number of predictions
            correct, predicted = compute_accuracy(output, target)
            correct_predictions += correct
            total_predictions += predicted

        # Every 10th epoch present statistics
        if epoch % 10 == 0:
            print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
            print("Loss: {:.4f}, Accuracy: {}/{}".format(
                np.average(epoch_loss), correct_predictions.item(),
                total_predictions))
            train_accuracy.append(correct_predictions.item() /
                                  total_predictions)
            train_loss.append(np.average(epoch_loss))

            # Evaluate the model on the test set
            model.eval()
            correct_predictions = 0
            total_predictions = 0
            epoch_val_loss = []

            for i, (x, y) in enumerate(val_dataset):
                x = Variable(x.view(-1, 20, 100))
                y = Variable(y)
                output, _ = model(x)
                target = torch.argmax(y, dim=1)
                loss = criterion(output, target)
                epoch_val_loss.append(loss.item())
                correct, predicted = compute_accuracy(output, target)
                correct_predictions += correct
                total_predictions += predicted
            print("Eval Accuracy: {}/{}".format(correct_predictions.item(),
                                                total_predictions))
            test_accuracy.append(correct_predictions.item() /
                                 total_predictions)
            test_loss.append(np.average(epoch_val_loss))

    model.eval()
    correct_predictions = 0
    total_predictions = 0
    preds = []
    targets = []

    for i, (x, y) in enumerate(val_dataset):
        x = Variable(x.view(-1, 20, 100))
        y = Variable(y)
        output, _ = model(x)
        target = torch.argmax(y, dim=1)
        correct, predicted = compute_accuracy(output, target)
        preds.append(output)
        targets.append(target)
        correct_predictions += correct
        total_predictions += predicted
    print("Final Eval Accuracy: {}/{}".format(correct_predictions.item(),
                                              total_predictions))

    with open('accuracy.pickle', 'wb') as f:
        pickle.dump(train_accuracy, f)
        pickle.dump(test_accuracy, f)

    with open('loss.pickle', 'wb') as f:
        pickle.dump(train_loss, f)
        pickle.dump(test_loss, f)

    with open('preds.pickle', 'wb') as f:
        pickle.dump(preds, f)
        pickle.dump(targets, f)
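The training loop above relies on a compute_accuracy helper that is not shown. A minimal sketch consistent with how its return values are used (a tensor count of correct predictions plus the number of predictions in the batch) might look like this:

def compute_accuracy(output, target):
    # Sketch only: the original helper is not part of the example.
    # Returns (correct predictions as a tensor, number of predictions).
    predicted = torch.argmax(output, dim=1)
    correct = (predicted == target).sum()
    return correct, target.size(0)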
Example #4
    next_chars = input_text[start + 1:end + 1]

    chars_tensor = torch.stack(
        tuple(map(one_hot_encode, map(char_to_index, chars))))
    next_chars_index = tuple(map(char_to_index, next_chars))
    next_chars_tensor = torch.tensor(next_chars_index)

    optimizer.zero_grad()

    pred_chars, hidden_state = net(chars_tensor, hidden_state)

    loss = F.cross_entropy(pred_chars, next_chars_tensor)
    loss.backward()

    loss = loss.item()
    total_loss += loss

    if i - last_print > 1000:
        print(total_loss / i)
        last_print = i

    optimizer.step()

    hidden_state = hidden_state.detach()

    i += BATCH_SIZE

torch.save(net.state_dict(), MODEL_SAVE_PATH)

net.eval()
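The fragment above assumes char_to_index and one_hot_encode helpers built over the training text. A minimal sketch of what they might look like (the vocabulary construction is an assumption):

import torch

vocab = sorted(set(input_text))  # assumed source of the character vocabulary
char_to_index_map = {c: i for i, c in enumerate(vocab)}

def char_to_index(ch):
    return char_to_index_map[ch]

def one_hot_encode(index):
    vec = torch.zeros(len(vocab))
    vec[index] = 1.0
    return vec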
Example #5
class SDFA():
    def __init__(self, model_path, gpu=-1):
        self.model_path = model_path
        if gpu < 0:
            self.device = torch.device("cpu")
            model_dict = torch.load(self.model_path,
                                    map_location=lambda storage, loc: storage)
            self.fa = face_alignment.FaceAlignment(
                face_alignment.LandmarksType._2D,
                device="cpu",
                flip_input=False)
        else:
            self.device = torch.device("cuda:" + str(gpu))
            model_dict = torch.load(
                self.model_path,
                map_location=lambda storage, loc: storage.cuda(gpu))
            self.fa = face_alignment.FaceAlignment(
                face_alignment.LandmarksType._2D,
                device="cuda:" + str(gpu),
                flip_input=False)

        self.stablePntsIDs = [33, 36, 39, 42, 45]
        self.mean_face = model_dict["mean_face"]
        self.img_size = model_dict["img_size"]
        self.audio_rate = model_dict["audio_rate"]
        self.video_rate = model_dict["video_rate"]
        self.audio_feat_len = model_dict['audio_feat_len']
        self.audio_feat_samples = model_dict['audio_feat_samples']
        self.id_enc_dim = model_dict['id_enc_dim']
        self.rnn_gen_dim = model_dict['rnn_gen_dim']
        self.aud_enc_dim = model_dict['aud_enc_dim']
        self.aux_latent = model_dict['aux_latent']
        self.sequential_noise = model_dict['sequential_noise']
        self.conversion_dict = {'s16': np.int16, 's32': np.int32}

        self.img_transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((self.img_size[0], self.img_size[1])),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

        self.encoder = RNN(self.audio_feat_len,
                           self.aud_enc_dim,
                           self.rnn_gen_dim,
                           self.audio_rate,
                           init_kernel=0.005,
                           init_stride=0.001)
        self.encoder.to(self.device)
        self.encoder.load_state_dict(model_dict['encoder'])

        self.encoder_id = ImageEncoder(code_size=self.id_enc_dim,
                                       img_size=self.img_size)
        self.encoder_id.to(self.device)
        self.encoder_id.load_state_dict(model_dict['encoder_id'])

        skip_channels = list(self.encoder_id.channels)
        skip_channels.reverse()

        self.generator = Generator(
            self.img_size,
            self.rnn_gen_dim,
            condition_size=self.id_enc_dim,
            num_gen_channels=self.encoder_id.channels[-1],
            skip_channels=skip_channels,
            aux_size=self.aux_latent,
            sequential_noise=self.sequential_noise)

        self.generator.to(self.device)
        self.generator.load_state_dict(model_dict['generator'])

        self.encoder.eval()
        self.encoder_id.eval()
        self.generator.eval()

    def save_video(self,
                   video,
                   audio,
                   path,
                   overwrite=True,
                   experimental_ffmpeg=False,
                   scale=None):
        if not os.path.isabs(path):
            path = os.getcwd() + "/" + path

        with tempdir() as dirpath:

            writer = sio.FFmpegWriter(dirpath + "/tmp.avi",
                                      inputdict={
                                          '-r': str(self.video_rate) + "/1",
                                      },
                                      outputdict={
                                          '-r': str(self.video_rate) + "/1",
                                      })
            for i in range(video.shape[0]):
                frame = np.rollaxis(video[i, :, :, :], 0, 3)

                if scale is not None:
                    frame = tf.rescale(frame,
                                       scale,
                                       anti_aliasing=True,
                                       multichannel=True,
                                       mode='reflect')

                writer.writeFrame(frame)
            writer.close()

            wav.write(dirpath + "/tmp.wav", self.audio_rate, audio)

            in1 = ffmpeg.input(dirpath + "/tmp.avi")
            in2 = ffmpeg.input(dirpath + "/tmp.wav")
            if experimental_ffmpeg:
                out = ffmpeg.output(in1['v'],
                                    in2['a'],
                                    path,
                                    strict='-2',
                                    loglevel="panic")
            else:
                out = ffmpeg.output(in1['v'], in2['a'], path, loglevel="panic")

            if overwrite:
                out = out.overwrite_output()
            out.run()

    def preprocess_img(self, img):
        src = self.fa.get_landmarks(img)[0][self.stablePntsIDs, :]
        dst = self.mean_face[self.stablePntsIDs, :]
        tform = tf.estimate_transform('similarity', src, dst)
        warped = tf.warp(img,
                         inverse_map=tform.inverse,
                         output_shape=self.img_size)
        warped = warped * 255
        warped = warped.astype('uint8')

        return warped

    def _cut_sequence_(self, seq, cutting_stride, pad_samples):
        pad_left = torch.zeros(pad_samples // 2, 1)
        pad_right = torch.zeros(pad_samples - pad_samples // 2, 1)

        seq = torch.cat((pad_left, seq), 0)
        seq = torch.cat((seq, pad_right), 0)

        stacked = seq.narrow(0, 0, self.audio_feat_samples).unsqueeze(0)
        iterations = (seq.size()[0] -
                      self.audio_feat_samples) // cutting_stride + 1
        for i in range(1, iterations):
            stacked = torch.cat(
                (stacked,
                 seq.narrow(0, i * cutting_stride,
                            self.audio_feat_samples).unsqueeze(0)))
        return stacked.to(self.device)

    def _broadcast_elements_(self, batch, repeat_no):
        total_tensors = []
        for i in range(0, batch.size()[0]):
            total_tensors += [torch.stack(repeat_no * [batch[i]])]

        return torch.stack(total_tensors)

    def __call__(self, img, audio, fs=None, aligned=False):
        if isinstance(img, str):
            frm = Image.open(img)
            frm.thumbnail((400, 400))
            frame = np.array(frm)
        else:
            frame = img

        if not aligned:
            frame = self.preprocess_img(frame)

        if isinstance(audio, str):
            info = mediainfo(audio)
            fs = int(info['sample_rate'])
            audio = np.array(
                AudioSegment.from_file(audio,
                                       info['format_name']).set_channels(
                                           1).get_array_of_samples())

            if info['sample_fmt'] in self.conversion_dict:
                audio = audio.astype(self.conversion_dict[info['sample_fmt']])
            else:
                if max(audio) > np.iinfo(np.int16).max:
                    audio = audio.astype(np.int32)
                else:
                    audio = audio.astype(np.int16)

        if audio.ndim > 1 and audio.shape[1] > 1:
            audio = audio[:, 0]

        max_value = np.iinfo(audio.dtype).max
        if fs != self.audio_rate:
            seq_length = audio.shape[0]
            speech = torch.from_numpy(
                signal.resample(
                    audio, int(seq_length * self.audio_rate / float(fs))) /
                float(max_value)).float()
            speech = speech.view(-1, 1)
        else:
            audio = torch.from_numpy(audio / float(max_value)).float()
            speech = audio.view(-1, 1)

        frame = self.img_transform(frame).to(self.device)

        cutting_stride = int(self.audio_rate / float(self.video_rate))
        audio_seq_padding = self.audio_feat_samples - cutting_stride

        audio_feat_seq = self._cut_sequence_(speech, cutting_stride,
                                             audio_seq_padding)
        frame = frame.unsqueeze(0)
        audio_feat_seq = audio_feat_seq.unsqueeze(0)
        audio_feat_seq_length = audio_feat_seq.size()[1]

        z = self.encoder(audio_feat_seq, [audio_feat_seq_length])
        noise = torch.FloatTensor(1, audio_feat_seq_length,
                                  self.aux_latent).normal_(0, 0.33).to(
                                      self.device)
        z_id, skips = self.encoder_id(frame, retain_intermediate=True)
        skip_connections = []
        for skip_variable in skips:
            skip_connections.append(
                self._broadcast_elements_(skip_variable,
                                          z.size()[1]))
        skip_connections.reverse()

        z_id = self._broadcast_elements_(z_id, z.size()[1])
        gen_video = self.generator(z, c=z_id, aux=noise, skip=skip_connections)

        returned_audio = ((2**15) * speech.detach().cpu().numpy()).astype(
            np.int16)
        gen_video = 125 * gen_video.squeeze().detach().cpu().numpy() + 125
        return gen_video, returned_audio
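One possible way to drive the SDFA class above end to end; the checkpoint path and input file names below are placeholders, not files from the original project.

sdfa = SDFA(model_path='sdfa_checkpoint.pt', gpu=-1)  # CPU inference
video, audio = sdfa('speaker.jpg', 'speech.wav')      # still image + speech -> talking-head frames
sdfa.save_video(video, audio, 'generated.mp4')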
Example #6
            if (idx + 1) % 5000 == 0:
                create_checkpoint(cnn, rnn, optimizer, epoch + 1, idx + 1,
                                  train_loss, params)
            if (idx + 1) % 500 == 0 or (idx + 1) == len(train_data_loader):
                print("Epoch %d (Step %d) - %0.4f train loss, %0.2f time." %
                      (epoch + 1, idx + 1, loss, time.time() - start_time))

        print("Epoch %d - %0.4f loss, %.2f time. " %
              (epoch + 1, np.mean(train_loss), time.time() - start_time))
        create_checkpoint(cnn, rnn, optimizer, epoch + 1, idx + 1, train_loss,
                          params)

        if (epoch + 1) % 5 == 0:
            # Test model on a random sub-batch of test loader
            cnn.eval()
            rnn.eval()
            print("Steps to be taken - %d\n", params['sub_batch_test'])
            test_model(cnn, rnn, optimizer, loss_fn, test_data_loader, vocab,
                       params, 'model_' + str(epoch + 1) + '.ckpt',
                       params['device'], params['sub_batch_test'])
            cnn.train()
            rnn.train()

    print('Training completed.')

if params['is_testing']:
    cnn.eval()
    rnn.eval()
    print("Steps to be taken - %d\n" % (len(test_data_loader)))
    test_model(cnn, rnn, optimizer, loss_fn, test_data_loader, vocab, params,
               params['load_model_test'], params['device'], -1)
Example #7
    model.train()
    prediction = model(X_train_dep_std)
    loss = loss_func(prediction, y_train_dep_std)
    optimizer.zero_grad()  # clear gradients for this training step
    loss.backward()  # back propagation, compute gradients
    optimizer.step()
    if iter % 100 == 0:
        print("iteration: %s, loss: %s" % (iter, loss.item()))

# Save model
save_filename = 'checkpoints/LSTM_FC.pth'
torch.save(model, save_filename)
print('Saved as %s' % save_filename)

# Start evaluating model
model.eval()

y_pred_dep_ = model(X_test_dep_std).detach().numpy()
y_pred_dep = ss_y_dep.inverse_transform(y_pred_dep_[0, 144:])

print('the value of R-squared of Evaporation is ',
      r2_score(Outputs[144:], y_pred_dep))
print('the value of Root mean squared error of Evaporation is ',
      rmse(Outputs[144:], y_pred_dep))

f, ax1 = plt.subplots(1, 1, sharex=True, figsize=(6, 4))

ax1.plot(Outputs[144:],
         color="blue",
         linestyle="-",
         linewidth=1.5,