Example #1
0
def train(opt, train_loader, model, epoch):
    """Run one pass over ``train_loader`` and log per-iteration statistics.

    For every batch the model's ``train_emb`` is called, the progress bar is
    advanced by the batch size it reports, and timing figures plus the model's
    own logger contents are pushed to the module-level ``tb_logger``.

    Args:
        opt: Accepted for interface compatibility; not read in this function.
        train_loader: Iterable of batches; each batch is unpacked into
            ``model.train_emb``. ``train_loader.dataset.length`` sizes the
            progress bar.
        model: Object exposing ``train_start``, ``train_emb``, ``logger`` and
            ``Eiters``.
        epoch: Epoch number, logged as a scalar.
    """
    step_timer = AverageMeter()
    load_timer = AverageMeter()
    epoch_logger = LogCollector()

    # Put the model into training mode before touching any data.
    model.train_start()

    bar = Progbar(train_loader.dataset.length)
    tick = time.time()
    for batch_no, batch in enumerate(train_loader):
        # Time spent waiting for the loader to hand over this batch.
        load_timer.update(time.time() - tick)

        # Route the model's internal logging into this epoch's collector.
        model.logger = epoch_logger

        n_samples, batch_loss = model.train_emb(*batch)
        bar.add(n_samples, values=[("loss", batch_loss)])

        # Full iteration time (loading + update), then restart the clock.
        step_timer.update(time.time() - tick)
        tick = time.time()

        scalars = (
            ('epoch', epoch),
            ('step', batch_no),
            ('batch_time', step_timer.val),
            ('data_time', load_timer.val),
        )
        for tag, value in scalars:
            tb_logger.log_value(tag, value, step=model.Eiters)
        model.logger.tb_log(tb_logger, step=model.Eiters)
    def fit(self, X, y, batch_size=128, nb_epoch=100, verbose=1,
            validation_split=0., validation_data=None, shuffle=True, show_accuracy=False):
        """Train on (X, y) in mini-batches for ``nb_epoch`` epochs.

        Args:
            X: Training inputs; must support ``len`` and index-array lookup.
            y: Training targets; passed through ``standardize_y`` first.
            batch_size: Number of samples per batch.
            nb_epoch: Number of passes over the training data.
            verbose: When truthy, print epoch headers and update a progress
                bar. Validation metrics are only computed when verbose.
            validation_split: Fraction in (0, 1) of the trailing samples to
                hold out for validation. Ignored if ``validation_data`` is
                given.
            validation_data: Optional ``(X_val, y_val)`` pair; takes
                precedence over ``validation_split``.
            shuffle: Reshuffle the sample order at the start of every epoch.
            show_accuracy: Also report accuracy (uses ``_train_with_acc``).

        Raises:
            Exception: If ``validation_data`` cannot be unpacked into exactly
                two items.
        """
        y = standardize_y(y)

        do_validation = False
        if validation_data:
            try:
                X_val, y_val = validation_data
            except (TypeError, ValueError):
                # Only unpacking failures are translated; Ctrl+C and friends
                # must propagate untouched.
                raise Exception("Invalid format for validation data; provide a tuple (X_val, y_val).")
            do_validation = True
            y_val = standardize_y(y_val)
            if verbose:
                print("Train on %d samples, validate on %d samples" % (len(y), len(y_val)))
        else:
            if 0 < validation_split < 1:
                # If a validation split size is given (e.g. validation_split=0.2)
                # then split X into smaller X and X_val,
                # and split y into smaller y and y_val.
                do_validation = True
                split_at = int(len(X) * (1 - validation_split))
                (X, X_val) = (X[0:split_at], X[split_at:])
                (y, y_val) = (y[0:split_at], y[split_at:])
                if verbose:
                    print("Train on %d samples, validate on %d samples" % (len(y), len(y_val)))

        index_array = np.arange(len(X))
        for epoch in range(nb_epoch):
            if verbose:
                # Single pre-formatted string prints the same text on
                # Python 2 and Python 3.
                print('Epoch %d' % epoch)
            if shuffle:
                np.random.shuffle(index_array)

            batches = make_batches(len(X), batch_size)
            progbar = Progbar(target=len(X))
            for batch_index, (batch_start, batch_end) in enumerate(batches):
                batch_ids = index_array[batch_start:batch_end]
                X_batch = X[batch_ids]
                y_batch = y[batch_ids]

                if show_accuracy:
                    loss, acc = self._train_with_acc(X_batch, y_batch)
                else:
                    loss = self._train(X_batch, y_batch)

                # logging
                if verbose:
                    log_values = [('loss', loss)]
                    if show_accuracy:
                        log_values.append(('acc.', acc))

                    # Validation is evaluated once per epoch, alongside the
                    # final training batch.
                    is_last_batch = (batch_index == len(batches) - 1)
                    if is_last_batch and do_validation:
                        if show_accuracy:
                            val_loss, val_acc = self.test(X_val, y_val, accuracy=True)
                            log_values.append(('val. loss', val_loss))
                            log_values.append(('val. acc.', val_acc))
                        else:
                            val_loss = self.test(X_val, y_val, accuracy=False)
                            log_values.append(('val. loss', val_loss))
                    progbar.update(batch_end, log_values)
Example #3
0
def process(options, collection):
    """Pool frame-level features into one vector per video and store them.

    Frame ids are expected to look like ``<video_id>_<frame_index>``. Frames
    are grouped by video id, ordered by their numeric index, stacked into a
    matrix and reduced with the pooling function selected by
    ``options.pooling``. Results go to
    ``<rootpath>/<collection>/FeatureData/<pooling>_<feature>/`` as
    ``feature.bin`` (float32), ``id.txt`` and ``shape.txt``.

    Args:
        options: Object with ``rootpath``, ``feature``, ``pooling`` and
            ``overwrite`` attributes.
        collection: Name of the collection directory under ``rootpath``.

    Returns:
        0 if the result directory already exists and ``overwrite`` is falsy;
        otherwise None.

    Raises:
        ValueError: If a frame id has no ``_`` separator or a non-integer
            suffix.
    """
    rootpath = options.rootpath
    feature = options.feature
    pooling = options.pooling
    overwrite = options.overwrite

    pooling_func = get_pooling_func(pooling)
    feat_dir = os.path.join(rootpath, collection, 'FeatureData', feature)
    res_dir = os.path.join(rootpath, collection, 'FeatureData',
                           '%s_%s' % (pooling, feature))

    if os.path.exists(res_dir):
        if overwrite:
            logger.info("%s exists. overwrite", res_dir)
        else:
            logger.info("%s exists. quit", res_dir)
            return 0

    feat_file = BigFile(feat_dir)
    video2frames = {}
    for frame_id in feat_file.names:
        video_id, frame_index = frame_id.rsplit('_', 1)
        int(frame_index)  # fail fast on a malformed frame suffix
        video2frames.setdefault(video_id, []).append(frame_id)

    if not os.path.exists(res_dir):
        os.makedirs(res_dir)

    videoset = []
    # Stays 0 when there are no videos, so shape.txt can still be written.
    feat_dim = 0

    res_binary_file = os.path.join(res_dir, 'feature.bin')
    with open(res_binary_file, 'wb') as fw:
        pbar = Progbar(len(video2frames))
        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        for video_id, frame_id_list in video2frames.items():
            renamed, vectors = feat_file.read(frame_id_list)
            name2vec = dict(zip(renamed, vectors))
            # Rows of the matrix follow the numeric frame order.
            frame_id_list.sort(key=lambda v: int(v.rsplit('_', 1)[-1]))

            feat_matrix = np.zeros((len(renamed), len(vectors[0])))
            for i, frame_id in enumerate(frame_id_list):
                feat_matrix[i, :] = name2vec[frame_id]

            video_vec = pooling_func(feat_matrix)
            feat_dim = len(video_vec)
            video_vec.astype(np.float32).tofile(fw)
            videoset.append(video_id)
            pbar.add(1)

    with open(os.path.join(res_dir, 'id.txt'), 'w') as fw:
        fw.write(' '.join(videoset))

    with open(os.path.join(res_dir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(videoset), feat_dim))

    logger.info("%s pooling -> %dx%d video feature file", pooling,
                len(videoset), feat_dim)
Example #4
0
 def fit_epoch(self, train_data, batch_size=None, incl_progbar=True):
     '''Run ``fit_batch`` once over every batch in ``train_data``.

     Each item of ``train_data`` is unpacked as
     ``((word_ids, tag_ids, deprel_ids), classes)``. When ``incl_progbar``
     is truthy a progress bar is advanced by the first dimension of the
     word-id batch and shows the loss returned by ``fit_batch``.
     '''
     if incl_progbar:
         # With a batch size the bar counts samples; otherwise it counts
         # the items of train_data.
         if batch_size:
             bar_target = len(train_data) * batch_size
         else:
             bar_target = len(train_data)
         progbar = Progbar(target=bar_target)

     for features, class_batch in train_data:
         word_ids, tag_ids, deprel_ids = features
         batch_loss = self.fit_batch(word_ids, tag_ids, deprel_ids,
                                     class_batch)
         if incl_progbar:
             progbar.add(word_ids.shape[0], [("Cross-entropy", batch_loss)])
def process(options, collection, featnames):
    """Concatenate several per-image feature sets into one combined set.

    ``featnames`` is a ``+``-separated list of feature names. Each names a
    directory under ``<rootpath>/<collection>/FeatureData/``. For every image
    id of the first feature, the vectors of all features are read and
    concatenated in the listed order. The result is written under a directory
    named after the full ``featnames`` string as ``feature.bin`` (float32),
    ``id.txt`` and ``shape.txt``.

    Args:
        options: Object with ``rootpath`` and ``overwrite`` attributes.
        collection: Name of the collection directory under ``rootpath``.
        featnames: ``+``-joined feature names, e.g. ``"a+b"``.

    Raises:
        AssertionError: If a later feature does not cover exactly the same
            set of ids as the first one.
        SystemExit: With status 0 if the target directory exists and
            ``options.overwrite`` is falsy.
    """
    rootpath = options.rootpath
    target_featname = featnames
    featnames = featnames.split('+')
    target_feat_dir = os.path.join(rootpath, collection, 'FeatureData',
                                   target_featname)

    if os.path.exists(target_feat_dir):
        if options.overwrite:
            logger.info('%s exists! overwrite.', target_feat_dir)
        else:
            logger.info('%s exists! quit.', target_feat_dir)
            sys.exit(0)
    else:
        os.makedirs(target_feat_dir)

    target_binary_file = os.path.join(target_feat_dir, 'feature.bin')
    target_id_file = os.path.join(target_feat_dir, 'id.txt')

    feat_dim = 0
    img_ids = []
    featfiles = []

    for i, feat in enumerate(featnames):
        feat_dir = os.path.join(rootpath, collection, 'FeatureData', feat)
        featfile = BigFile(feat_dir)
        feat_dim += featfile.ndims
        if i == 0:
            # The first feature defines the id list and the output order.
            img_ids = featfile.names
        else:
            assert len(img_ids) == len(featfile.names) and set(img_ids) == set(
                featfile.names), '%s not match target feature' % feat
        featfiles.append(featfile)

    # Binary mode: ndarray.tofile writes raw bytes, which text mode may
    # reject or corrupt.
    with open(target_binary_file, 'wb') as fw:
        progbar = Progbar(len(img_ids))
        for im in img_ids:
            target_feat_vec = []
            for feat in featfiles:
                target_feat_vec.extend(feat.read_one(im))
            np.array(target_feat_vec, dtype=np.float32).tofile(fw)
            progbar.add(1)

    with open(target_id_file, 'w') as fw:
        fw.write(' '.join(img_ids))

    with open(os.path.join(target_feat_dir, 'shape.txt'), 'w') as fw:
        fw.write('%d %d' % (len(img_ids), feat_dim))

    logger.info('%s: (%d, %d)', target_featname, len(img_ids), feat_dim)
Example #6
0
 def fit(self, X, y, batch_size=128, nb_epoch=100, verbose=1):
     """Train on (X, y) in consecutive mini-batches for ``nb_epoch`` epochs.

     Args:
         X: Training inputs; must support ``len`` and index-list lookup.
         y: Training targets; passed through ``standardize_y`` first.
         batch_size: Number of samples per batch.
         nb_epoch: Number of passes over the data.
         verbose: When truthy, print the epoch number and update a
             progress bar after every batch.
     """
     y = standardize_y(y)
     for epoch in range(nb_epoch):
         if verbose:
             # One pre-formatted string prints identically on Python 2 and 3.
             print('Epoch %d' % epoch)

         # Floor division keeps the count an int on Python 3 as well.
         nb_batch = len(X) // batch_size + 1
         progbar = Progbar(target=len(X))
         for batch_index in range(0, nb_batch):
             batch = list(range(batch_index * batch_size,
                                min(len(X), (batch_index + 1) * batch_size)))
             if not batch:
                 # Happens when len(X) is an exact multiple of batch_size.
                 break
             loss = self._train(X[batch], y[batch])
             if verbose:
                 progbar.update(batch[-1] + 1, [('loss', loss)])
Example #7
0
    def fit(self, X, y, batch_size=128, nb_epoch=100, verbose=1):
        """Train on (X, y) in consecutive mini-batches for ``nb_epoch`` epochs.

        ``y`` is first passed through ``standardize_y``. Each epoch walks the
        samples in order, feeding index ranges of at most ``batch_size``
        samples to ``self._train``. When ``verbose`` is truthy the epoch
        number is printed and a progress bar tracks the samples seen.
        """
        y = standardize_y(y)
        for epoch in range(nb_epoch):
            if verbose:
                print('Epoch', epoch)

            # One slot more than the number of full batches; the spare slot
            # is empty when len(X) divides evenly and ends the loop early.
            slot_count = len(X) // batch_size + 1
            bar = Progbar(target=len(X))
            for slot in range(slot_count):
                first = slot * batch_size
                stop = min(len(X), first + batch_size)
                rows = range(first, stop)
                if len(rows) == 0:
                    break
                batch_loss = self._train(X[rows], y[rows])
                if verbose:
                    bar.update(stop, [('loss', batch_loss)])
    def predict_proba(self, X, batch_size=128, verbose=1):
        """Run ``self._predict`` over X in batches and return the stacked output.

        The output array is allocated when the first batch comes back, using
        ``len(X)`` for the leading dimension and the remaining dimensions of
        that batch's predictions. A progress bar is shown when ``verbose`` is
        truthy.
        """
        spans = make_batches(len(X), batch_size)
        if verbose:
            bar = Progbar(target=len(X))

        for position, span in enumerate(spans):
            lo, hi = span
            chunk = self._predict(X[lo:hi])

            if position == 0:
                # Per-sample output shape is only known after the first call.
                out_shape = (len(X),) + chunk.shape[1:]
                preds = np.zeros(out_shape)
            preds[lo:hi] = chunk

            if verbose:
                bar.update(hi)
        return preds
Example #9
0
    def fit(self, X, y, batch_size=128, nb_epoch=100, verbose=1,
            validation_split=0., validation_data=None, shuffle=True, show_accuracy=False):
        """Prepare validation data and the per-epoch batching for training.

        Args:
            X: Training inputs; must support ``len`` and slicing.
            y: Training targets; passed through ``standardize_y`` first.
            batch_size: Number of samples per batch.
            nb_epoch: Number of epochs to iterate.
            verbose: When truthy, print the sample counts and epoch headers.
            validation_split: Fraction in (0, 1) of the trailing samples to
                hold out. Ignored if ``validation_data`` is given.
            validation_data: Optional ``(X_val, y_val)`` pair.
            shuffle: Reshuffle the index array at the start of every epoch.
            show_accuracy: Not read in the code shown here.

        Raises:
            Exception: If ``validation_data`` cannot be unpacked into exactly
                two items.

        NOTE(review): as written, each epoch only builds the batch list and a
        progress bar; no training call follows in this body.
        """
        y = standardize_y(y)

        do_validation = False
        if validation_data:
            try:
                X_val, y_val = validation_data
            except (TypeError, ValueError):
                # Only unpacking failures are translated; Ctrl+C and friends
                # must propagate untouched.
                raise Exception("Invalid format for validation data; provide a tuple (X_val, y_val).")
            do_validation = True
            y_val = standardize_y(y_val)
            if verbose:
                print("Train on %d samples, validate on %d samples" % (len(y), len(y_val)))
        else:
            if 0 < validation_split < 1:
                # If a validation split size is given (e.g. validation_split=0.2)
                # then split X into smaller X and X_val,
                # and split y into smaller y and y_val.
                do_validation = True
                split_at = int(len(X) * (1 - validation_split))
                (X, X_val) = (X[0:split_at], X[split_at:])
                (y, y_val) = (y[0:split_at], y[split_at:])
                if verbose:
                    print("Train on %d samples, validate on %d samples" % (len(y), len(y_val)))

        index_array = np.arange(len(X))
        for epoch in range(nb_epoch):
            if verbose:
                # One pre-formatted string prints identically on Python 2 and 3.
                print('Epoch %d' % epoch)
            if shuffle:
                np.random.shuffle(index_array)

            batches = make_batches(len(X), batch_size)
            progbar = Progbar(target=len(X))
Example #10
0
def process(options, collection, feat_name):
    """Convert per-item ``.npy`` features of a collection into a binary set.

    Item ids are read from ``split/train.csv``, ``split/val.csv`` and
    ``split/test.csv`` (one id per line, in that order). For each id the file
    ``feature/<id>/<id>-<feat_name>.npy`` is loaded and written to a temporary
    text file: 1-D arrays as one line keyed by the id, 2-D arrays as one line
    per row keyed by ``<id>_<row>``. The text file is then handed to
    ``text2bin`` and removed.

    Args:
        options: Object with ``overwrite`` and ``rootpath`` attributes.
        collection: Name of the collection directory under ``rootpath``.
        feat_name: Feature name; used in the input file names and as the
            output directory name under ``FeatureData``.

    Raises:
        SystemExit: With status 0 when ``checkToSkip`` reports that
            ``feature.bin`` should not be regenerated.
    """
    overwrite = options.overwrite
    rootpath = options.rootpath

    feature_dir = os.path.join(rootpath, collection, 'feature')
    resdir = os.path.join(rootpath, collection, 'FeatureData', feat_name)

    split_dir = os.path.join(rootpath, collection, 'split')
    train_val_test_set = []
    for split_file in ('train.csv', 'val.csv', 'test.csv'):
        # Context manager so the handle is closed even if reading fails.
        with open(os.path.join(split_dir, split_file)) as fr:
            train_val_test_set.extend(line.strip() for line in fr)

    target_feat_file = os.path.join(resdir, 'id.feature.txt')
    if checkToSkip(os.path.join(resdir, 'feature.bin'), overwrite):
        sys.exit(0)
    makedirsforfile(target_feat_file)

    print('Processing %s - %s' % (collection, feat_name))
    with open(target_feat_file, 'w') as fw_feat:
        progbar = Progbar(len(train_val_test_set))
        for d in train_val_test_set:
            feat_file = os.path.join(feature_dir, d,
                                     '%s-%s.npy' % (d, feat_name))
            feats = np.load(feat_file)
            if len(feats.shape) == 1:  # video level feature
                dim = feats.shape[0]
                fw_feat.write('%s %s\n' %
                              (d, ' '.join(['%.6f' % x for x in feats])))
            elif len(feats.shape) == 2:  # frame level feature
                frames, dim = feats.shape
                for i in range(frames):
                    frame_id = d + '_' + str(i)
                    fw_feat.write(
                        '%s %s\n' %
                        (frame_id, ' '.join(['%.6f' % x for x in feats[i]])))
            progbar.add(1)

    # dim is whatever the last 1-D or 2-D array reported.
    text2bin(dim, [target_feat_file], resdir, 1)

    # os.remove avoids spawning a shell on a path-built command string and
    # also works where `rm` is unavailable. Cleanup stays best-effort.
    try:
        os.remove(target_feat_file)
    except OSError as err:
        print('Failed to remove %s: %s' % (target_feat_file, err))
Example #11
0
    def fit(self, X, y, batch_size=128, nb_epoch=100, verbose=1,
            validation_split=0., shuffle=True):
        """Train on (X, y) in mini-batches, optionally holding out a split.

        Args:
            X: Training inputs; must support ``len``, slicing and index-array
                lookup.
            y: Training targets; passed through ``standardize_y`` first.
            batch_size: Number of samples per batch.
            nb_epoch: Number of passes over the training data.
            verbose: When truthy, print epoch headers and update a progress
                bar. The validation loss is only computed when verbose.
            validation_split: Fraction in (0, 1) of the trailing samples to
                hold out for validation; any other value disables validation.
            shuffle: Reshuffle the sample order at the start of every epoch.
        """
        # If a validation split size is given (e.g. validation_split=0.2)
        # then split X into smaller X and X_val,
        # and split y into smaller y and y_val.
        y = standardize_y(y)

        do_validation = False
        if 0 < validation_split < 1:
            do_validation = True
            split_at = int(len(X) * (1 - validation_split))
            (X, X_val) = (X[0:split_at], X[split_at:])
            (y, y_val) = (y[0:split_at], y[split_at:])
            if verbose:
                print("Train on %d samples, validate on %d samples" % (len(y), len(y_val)))

        index_array = np.arange(len(X))
        for epoch in range(nb_epoch):
            if verbose:
                # One pre-formatted string prints identically on Python 2 and 3.
                print('Epoch %d' % epoch)
            if shuffle:
                np.random.shuffle(index_array)

            # Ceiling division: `len(X) / batch_size + 1` scheduled an extra,
            # empty batch whenever len(X) was a multiple of batch_size.
            nb_batch = (len(X) + batch_size - 1) // batch_size
            progbar = Progbar(target=len(X))
            for batch_index in range(0, nb_batch):
                batch_start = batch_index * batch_size
                batch_end = min(len(X), (batch_index + 1) * batch_size)
                batch_ids = index_array[batch_start:batch_end]

                X_batch = X[batch_ids]
                y_batch = y[batch_ids]
                loss = self._train(X_batch, y_batch)

                if verbose:
                    log_values = [('loss', loss)]
                    # Validation loss is reported once per epoch, with the
                    # final training batch.
                    is_last_batch = (batch_index == nb_batch - 1)
                    if is_last_batch and do_validation:
                        log_values.append(('val. loss', self.test(X_val, y_val)))
                    progbar.update(batch_end, log_values)
    def evaluate(self, X, y, batch_size=128, show_accuracy=False, verbose=1):
        """Score the model on (X, y) batch by batch.

        ``y`` is first passed through ``standardize_y``. Returns the mean of
        the per-batch losses, or a ``(mean_loss, mean_accuracy)`` pair when
        ``show_accuracy`` is truthy. Both means are taken over batches, not
        over samples. A progress bar is updated when ``verbose`` is truthy.
        """
        y = standardize_y(y)

        loss_sum = 0.
        acc_sum = 0.

        spans = make_batches(len(X), batch_size)
        bar = Progbar(target=len(X))
        for lo, hi in spans:
            inputs = X[lo:hi]
            targets = y[lo:hi]

            if show_accuracy:
                batch_loss, batch_acc = self._test_with_acc(inputs, targets)
                acc_sum += batch_acc
                shown = [('loss', batch_loss), ('acc.', batch_acc)]
            else:
                batch_loss = self._test(inputs, targets)
                shown = [('loss', batch_loss)]
            loss_sum += batch_loss

            if verbose:
                bar.update(hi, shown)

        n_batches = len(spans)
        if not show_accuracy:
            return loss_sum / n_batches
        return loss_sum / n_batches, acc_sum / n_batches
Example #13
0
    def fit(self,
            X,
            y,
            batch_size=128,
            nb_epoch=100,
            verbose=1,
            validation_split=0.,
            shuffle=True):
        """Train on (X, y) in mini-batches, optionally holding out a split.

        Args:
            X: Training inputs; must support ``len``, slicing and index-array
                lookup.
            y: Training targets; passed through ``standardize_y`` first.
            batch_size: Number of samples per batch.
            nb_epoch: Number of passes over the training data.
            verbose: When truthy, print epoch headers and update a progress
                bar. The validation loss is only computed when verbose.
            validation_split: Fraction in (0, 1) of the trailing samples to
                hold out for validation; any other value disables validation.
            shuffle: When truthy, batches are drawn through an index array
                reshuffled every epoch; otherwise plain slices are used.
        """
        # If a validation split size is given (e.g. validation_split=0.2)
        # then split X into smaller X and X_val,
        # and split y into smaller y and y_val.
        y = standardize_y(y)

        do_validation = False
        if 0 < validation_split < 1:
            do_validation = True
            split_at = int(len(X) * (1 - validation_split))
            (X, X_val) = (X[0:split_at], X[split_at:])
            (y, y_val) = (y[0:split_at], y[split_at:])
            if verbose:
                print("Train on %d samples, validate on %d samples" % (
                    len(y), len(y_val)))

        index_array = np.arange(len(X))
        for epoch in range(nb_epoch):
            if verbose:
                # One pre-formatted string prints identically on Python 2 and 3.
                print('Epoch %d' % epoch)
            if shuffle:
                np.random.shuffle(index_array)

            nb_batch = int(np.ceil(len(X) / float(batch_size)))
            progbar = Progbar(target=len(X))
            for batch_index in range(0, nb_batch):
                batch_start = batch_index * batch_size
                batch_end = min(len(X), (batch_index + 1) * batch_size)
                if shuffle:
                    batch_ids = index_array[batch_start:batch_end]
                else:
                    batch_ids = slice(batch_start, batch_end)

                X_batch = X[batch_ids]
                y_batch = y[batch_ids]
                loss = self._train(X_batch, y_batch)

                if verbose:
                    log_values = [('loss', loss)]
                    # Validation loss is reported once per epoch, with the
                    # final training batch.
                    is_last_batch = (batch_index == nb_batch - 1)
                    if is_last_batch and do_validation:
                        log_values.append(
                            ('val. loss', self.test(X_val, y_val)))
                    progbar.update(batch_end, log_values)
Example #14
0
def train(model, criterion, criterion_st, data_loader, optimizer, epoch):
    """Train ``model`` for one epoch and log statistics to tensorboard.

    Relies on module-level names: ``c`` (config), ``args``, ``use_cuda``,
    ``tb`` (summary writer), ``n_priority_freq`` and ``OUT_PATH``.

    Args:
        model: Network whose ``forward(text, mel)`` returns
            ``(mel_output, linear_output, alignments, stop_tokens)``.
        criterion: Loss called as ``criterion(output, target, lengths)``.
        criterion_st: Loss called as ``criterion_st(stop_tokens, stop_target)``.
        data_loader: Iterable of batches indexed 0..5 as text input, text
            lengths, linear spectrogram, mel spectrogram, mel lengths and
            stop targets.
        optimizer: Optimizer whose learning rate is overwritten every step
            with ``lr_decay(c.lr, current_step, c.warmup_steps)``.
        epoch: Zero-based epoch index, used for the global step and logging.

    Returns:
        Tuple ``(avg_linear_loss, current_step)`` for this epoch.
    """
    model = model.train()
    epoch_time = 0
    avg_linear_loss = 0
    avg_mel_loss = 0
    avg_stop_loss = 0
    avg_attn_loss = 0

    print(" | > Epoch {}/{}".format(epoch, c.epochs))
    progbar = Progbar(len(data_loader.dataset) / c.batch_size)
    progbar_display = {}
    for num_iter, data in enumerate(data_loader):
        start_time = time.time()

        # setup input data (data[1], the text lengths, is not needed here)
        text_input = data[0]
        linear_spec = data[2]
        mel_spec = data[3]
        mel_lengths = data[4]
        stop_target = data[5]

        current_step = num_iter + args.restore_step + \
            epoch * len(data_loader) + 1

        # setup lr
        current_lr = lr_decay(c.lr, current_step, c.warmup_steps)
        for params_group in optimizer.param_groups:
            params_group['lr'] = current_lr

        optimizer.zero_grad()

        # Collapse stop targets to one flag per group of c.r frames: a group
        # is positive if any of its entries is positive.
        stop_target = stop_target.view(text_input.shape[0], stop_target.size(1) // c.r, -1)
        stop_target = (stop_target.sum(2) > 0.0).float()

        # dispatch data to GPU
        if use_cuda:
            text_input = text_input.cuda()
            mel_spec = mel_spec.cuda()
            mel_lengths = mel_lengths.cuda()
            linear_spec = linear_spec.cuda()
            stop_target = stop_target.cuda()

        # create attention mask
        if c.mk > 0.0:
            N = text_input.shape[1]
            T = mel_spec.shape[1] // c.r
            M = create_attn_mask(N, T, 0.03)
            mk = mk_decay(c.mk, c.epochs, epoch)

        # forward pass
        mel_output, linear_output, alignments, stop_tokens =\
            model.forward(text_input, mel_spec)

        # loss computation
        mel_loss = criterion(mel_output, mel_spec, mel_lengths)
        linear_loss = criterion(linear_output, linear_spec, mel_lengths)
        stop_loss = criterion_st(stop_tokens, stop_target)
        if c.priority_freq:
            # Blend the full-band loss with the loss over the first
            # n_priority_freq bins.
            linear_loss = 0.5 * linear_loss\
                + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                  linear_spec[:, :, :n_priority_freq],
                                  mel_lengths)
        loss = mel_loss + linear_loss + stop_loss
        if c.mk > 0.0:
            attention_loss = criterion(alignments, M, mel_lengths)
            loss += mk * attention_loss
            avg_attn_loss += attention_loss.item()
            progbar_display['attn_loss'] = attention_loss.item()

        # backpass and check the grad norm
        loss.backward()
        grad_norm, skip_flag = check_update(model, 0.5, 100)
        if skip_flag:
            # NOTE(review): skipped iterations add nothing to the avg_*
            # sums below but still count in the (num_iter + 1) divisor.
            optimizer.zero_grad()
            print(" | > Iteration skipped!!")
            continue
        optimizer.step()

        step_time = time.time() - start_time
        epoch_time += step_time

        progbar_display['total_loss'] = loss.item()
        progbar_display['linear_loss'] = linear_loss.item()
        progbar_display['mel_loss'] = mel_loss.item()
        progbar_display['stop_loss'] = stop_loss.item()
        progbar_display['grad_norm'] = grad_norm.item()

        # update
        progbar.update(num_iter+1, values=list(progbar_display.items()))
        avg_linear_loss += linear_loss.item()
        avg_mel_loss += mel_loss.item()
        avg_stop_loss += stop_loss.item()

        # Plot Training Iter Stats
        tb.add_scalar('TrainIterLoss/TotalLoss', loss.item(), current_step)
        tb.add_scalar('TrainIterLoss/LinearLoss', linear_loss.item(),
                      current_step)
        tb.add_scalar('TrainIterLoss/MelLoss', mel_loss.item(), current_step)
        tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                      current_step)
        tb.add_scalar('Params/GradNorm', grad_norm, current_step)
        tb.add_scalar('Time/StepTime', step_time, current_step)

        if current_step % c.save_step == 0:
            if c.checkpoint:
                # save model
                save_checkpoint(model, optimizer, linear_loss.item(),
                                OUT_PATH, current_step, epoch)

            # Diagnostic visualizations
            const_spec = linear_output[0].data.cpu().numpy()
            gt_spec = linear_spec[0].data.cpu().numpy()

            const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap)
            gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap)
            tb.add_image('Visual/Reconstruction', const_spec, current_step)
            tb.add_image('Visual/GroundTruth', gt_spec, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Visual/Alignment', align_img, current_step)

            # Sample audio
            audio_signal = linear_output[0].data.cpu().numpy()
            data_loader.dataset.ap.griffin_lim_iters = 60
            audio_signal = data_loader.dataset.ap.inv_spectrogram(
                audio_signal.T)
            try:
                tb.add_audio('SampleAudio', audio_signal, current_step,
                             sample_rate=c.sample_rate)
            except Exception:
                # Audio logging is best-effort, so ordinary failures are
                # ignored. Catching Exception (not a bare except) lets
                # KeyboardInterrupt and SystemExit still stop the run.
                pass

    avg_linear_loss /= (num_iter + 1)
    avg_mel_loss /= (num_iter + 1)
    avg_stop_loss /= (num_iter + 1)
    avg_total_loss = avg_mel_loss + avg_linear_loss + avg_stop_loss

    # Plot Training Epoch Stats
    tb.add_scalar('TrainEpochLoss/TotalLoss', avg_total_loss, current_step)
    tb.add_scalar('TrainEpochLoss/LinearLoss', avg_linear_loss, current_step)
    tb.add_scalar('TrainEpochLoss/StopLoss', avg_stop_loss, current_step)
    tb.add_scalar('TrainEpochLoss/MelLoss', avg_mel_loss, current_step)
    if c.mk > 0:
        avg_attn_loss /= (num_iter + 1)
        tb.add_scalar('TrainEpochLoss/AttnLoss', avg_attn_loss, current_step)
    tb.add_scalar('Time/EpochTime', epoch_time, epoch)

    return avg_linear_loss, current_step
Example #15
0
def main(args):
    """Set up an experiment folder, build the model and run the training loop.

    Relies on the module-level ``use_cuda`` flag.

    Args:
        args: Parsed command-line arguments providing ``config_path`` and
            ``restore_step``.
    """

    # setup output paths and read configs
    c = load_config(args.config_path)
    base_dir = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(base_dir, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(signum, frame):
        # Parameter is named signum so it does not shadow the signal module.
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'), c.r,
                              c.sample_rate, c.text_cleaner)

    model = Tacotron(c.embedding_size, c.hidden_size, c.num_mels, c.num_freq,
                     c.r)
    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    try:
        checkpoint = torch.load(
            os.path.join(CHECKPOINT_PATH,
                         'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)

    except Exception:
        # Any failure to restore means a fresh start. Catching Exception
        # (not a bare except) keeps the Ctrl+C handler's SystemExit from
        # being swallowed here.
        print("\n > Starting a new training\n")

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    def to_variable(array, cpu_type, cuda_type):
        """Wrap a numpy array as a non-trainable Variable on the active device."""
        tensor_type = cuda_type if use_cuda else cpu_type
        var = Variable(torch.from_numpy(array).type(tensor_type),
                       requires_grad=False)
        return var.cuda() if use_cuda else var

    def to_long(array):
        return to_variable(array, torch.LongTensor,
                           torch.cuda.LongTensor if use_cuda else None)

    def to_float(array):
        return to_variable(array, torch.FloatTensor,
                           torch.cuda.FloatTensor if use_cuda else None)

    for epoch in range(c.epochs):

        dataloader = DataLoader(dataset,
                                batch_size=c.batch_size,
                                shuffle=True,
                                collate_fn=dataset.collate_fn,
                                drop_last=True,
                                num_workers=32)
        progbar = Progbar(len(dataset) / c.batch_size)

        for i, data in enumerate(dataloader):
            text_input = data[0]
            magnitude_input = data[1]
            mel_input = data[2]

            current_step = i + args.restore_step + epoch * len(dataloader) + 1

            optimizer.zero_grad()

            # Replace the first mel frame of every sample with zeros.
            try:
                mel_input = np.concatenate(
                    (np.zeros([c.batch_size, 1, c.num_mels],
                              dtype=np.float32), mel_input[:, 1:, :]),
                    axis=1)
            except Exception:
                raise TypeError("not same dimension")

            text_input_var = to_long(text_input)
            mel_input_var = to_float(mel_input)
            # NOTE(review): the target is built from the same modified
            # mel_input as the decoder input - confirm this is intended.
            mel_spec_var = to_float(mel_input)
            linear_spec_var = to_float(magnitude_input)

            mel_output, linear_output, alignments =\
                model.forward(text_input_var, mel_input_var)

            mel_loss = criterion(mel_output, mel_spec_var)
            linear_loss = torch.abs(linear_output - linear_spec_var)
            linear_loss = 0.5 * \
                torch.mean(linear_loss) + 0.5 * \
                torch.mean(linear_loss[:, :n_priority_freq, :])
            loss = mel_loss + linear_loss
            if use_cuda:
                # Only move the loss when CUDA is in use; an unconditional
                # .cuda() breaks the CPU path handled everywhere else.
                loss = loss.cuda()

            loss.backward()

            nn.utils.clip_grad_norm(model.parameters(), 1.)

            optimizer.step()

            progbar.update(i,
                           values=[('total_loss', loss.data[0]),
                                   ('linear_loss', linear_loss.data[0]),
                                   ('mel_loss', mel_loss.data[0])])

            if current_step % c.save_step == 0:
                # NOTE(review): checkpoints are written to OUT_PATH while
                # restoring reads from CHECKPOINT_PATH - confirm which
                # location is intended.
                checkpoint_path = 'checkpoint_{}.pth.tar'.format(current_step)
                checkpoint_path = os.path.join(OUT_PATH, checkpoint_path)
                save_checkpoint(
                    {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'step': current_step,
                        'total_loss': loss.data[0],
                        'linear_loss': linear_loss.data[0],
                        'mel_loss': mel_loss.data[0],
                        'date': datetime.date.today().strftime("%B %d, %Y")
                    }, checkpoint_path)
                print(" > Checkpoint is saved : {}".format(checkpoint_path))

            if current_step in c.decay_step:
                optimizer = adjust_learning_rate(optimizer, current_step)
Example #16
0
def train(model, criterion, criterion_st, data_loader, optimizer, optimizer_st, epoch):
    """Run one training epoch for the spectrogram model and its stop-token net.

    Two losses are optimised separately: ``mel_loss + linear_loss`` through
    ``optimizer`` and ``stop_loss`` through ``optimizer_st``.  Per-iteration
    and per-epoch statistics are written to the module-level ``tb`` writer.

    Reads the module-level names ``c`` (config), ``args`` (for
    ``restore_step``), ``use_cuda``, ``tb`` and ``OUT_PATH``.

    Args:
        model: network whose ``forward(text, mel)`` returns
            ``(mel_output, linear_output, alignments, stop_tokens)``.
        criterion: loss called as ``criterion(output, target, lengths)``.
        criterion_st: loss called as ``criterion_st(stop_tokens, stop_targets)``.
        data_loader: iterable of batches indexed as
            ``(text, text_lengths, linear, mel, mel_lengths, stop_targets)``;
            must expose ``dataset`` with an ``ap`` audio processor.
        optimizer: optimizer stepped after the spectrogram loss backward pass.
        optimizer_st: optimizer stepped after the stop loss backward pass.
        epoch: zero-based epoch index, used to derive the global step.

    Returns:
        Tuple ``(avg_linear_loss, current_step)``: the epoch-averaged linear
        loss and the global step of the last iteration.
    """
    model = model.train()
    epoch_time = 0
    avg_linear_loss = 0
    avg_mel_loss = 0
    avg_stop_loss = 0
    print(" | > Epoch {}/{}".format(epoch, c.epochs))
    progbar = Progbar(len(data_loader.dataset) / c.batch_size)
    # Number of leading frequency bins (those below 3000 -- presumably Hz --
    # relative to the Nyquist frequency) that get an extra loss term.
    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
    for num_iter, data in enumerate(data_loader):
        start_time = time.time()

        # setup input data
        text_input = data[0]
        text_lengths = data[1]
        linear_input = data[2]
        mel_input = data[3]
        mel_lengths = data[4]
        stop_targets = data[5]

        # set stop targets view, we predict a single stop token per r frames prediction
        stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float()

        # Global step continues from the restored checkpoint, if any.
        current_step = num_iter + args.restore_step + \
                       epoch * len(data_loader) + 1

        # setup lr (both optimizers follow the same schedule)
        current_lr = lr_decay(c.lr, current_step, c.warmup_steps)
        current_lr_st = lr_decay(c.lr, current_step, c.warmup_steps)

        for params_group in optimizer.param_groups:
            params_group['lr'] = current_lr

        for params_group in optimizer_st.param_groups:
            params_group['lr'] = current_lr_st

        optimizer.zero_grad()
        optimizer_st.zero_grad()

        # dispatch data to GPU
        if use_cuda:
            text_input = text_input.cuda()
            mel_input = mel_input.cuda()
            mel_lengths = mel_lengths.cuda()
            linear_input = linear_input.cuda()
            stop_targets = stop_targets.cuda()

        # forward pass
        mel_output, linear_output, alignments, stop_tokens = \
            model.forward(text_input, mel_input)

        # loss computation
        stop_loss = criterion_st(stop_tokens, stop_targets)
        mel_loss = criterion(mel_output, mel_input, mel_lengths)
        # Linear loss is the mean of the full-band loss and the loss restricted
        # to the first n_priority_freq bins.
        linear_loss = 0.5 * criterion(linear_output, linear_input, mel_lengths) \
                      + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                        linear_input[:, :, :n_priority_freq],
                                        mel_lengths)
        loss = mel_loss + linear_loss

        # backpass and check the grad norm for spec losses
        # (the graph is retained because stop_loss is back-propagated below)
        loss.backward(retain_graph=True)
        grad_norm, skip_flag = check_update(model, 0.5, 100)
        if skip_flag:
            optimizer.zero_grad()
            print(" | > Iteration skipped!!")
            continue
        optimizer.step()

        # backpass and check the grad norm for stop loss
        stop_loss.backward()
        grad_norm_st, skip_flag = check_update(model.module.decoder.stopnet, 0.5, 100)
        if skip_flag:
            optimizer_st.zero_grad()
            print(" | > Iteration skipped fro stopnet!!")
            continue
        optimizer_st.step()

        step_time = time.time() - start_time
        epoch_time += step_time

        # update
        progbar.update(num_iter + 1, values=[('total_loss', loss.item()),
                                             ('linear_loss', linear_loss.item()),
                                             ('mel_loss', mel_loss.item()),
                                             ('stop_loss', stop_loss.item()),
                                             ('grad_norm', grad_norm.item()),
                                             ('grad_norm_st', grad_norm_st.item())])
        avg_linear_loss += linear_loss.item()
        avg_mel_loss += mel_loss.item()
        avg_stop_loss += stop_loss.item()

        # Plot Training Iter Stats
        tb.add_scalar('TrainIterLoss/TotalLoss', loss.item(), current_step)
        tb.add_scalar('TrainIterLoss/LinearLoss', linear_loss.item(),
                      current_step)
        tb.add_scalar('TrainIterLoss/MelLoss', mel_loss.item(), current_step)
        tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                      current_step)
        tb.add_scalar('Params/GradNorm', grad_norm, current_step)
        tb.add_scalar('Params/GradNormSt', grad_norm_st, current_step)
        tb.add_scalar('Time/StepTime', step_time, current_step)

        if current_step % c.save_step == 0:
            if c.checkpoint:
                # save model
                save_checkpoint(model, optimizer, linear_loss.item(),
                                OUT_PATH, current_step, epoch)

            # Diagnostic visualizations (first sample of the batch)
            const_spec = linear_output[0].data.cpu().numpy()
            gt_spec = linear_input[0].data.cpu().numpy()

            const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap)
            gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap)
            tb.add_image('Visual/Reconstruction', const_spec, current_step)
            tb.add_image('Visual/GroundTruth', gt_spec, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Visual/Alignment', align_img, current_step)

            # Sample audio
            audio_signal = linear_output[0].data.cpu().numpy()
            data_loader.dataset.ap.griffin_lim_iters = 60
            audio_signal = data_loader.dataset.ap.inv_spectrogram(
                audio_signal.T)
            try:
                tb.add_audio('SampleAudio', audio_signal, current_step,
                             sample_rate=c.sample_rate)
            except Exception:
                # Best effort: a failed audio summary must not stop training,
                # but Ctrl+C / SystemExit are no longer swallowed here.
                pass

    # NOTE: the divisor counts every iteration, including skipped ones that
    # contributed nothing to the running sums.
    avg_linear_loss /= (num_iter + 1)
    avg_mel_loss /= (num_iter + 1)
    avg_stop_loss /= (num_iter + 1)
    avg_total_loss = avg_mel_loss + avg_linear_loss + avg_stop_loss

    # Plot Training Epoch Stats
    tb.add_scalar('TrainEpochLoss/TotalLoss', avg_total_loss, current_step)
    tb.add_scalar('TrainEpochLoss/LinearLoss', avg_linear_loss, current_step)
    tb.add_scalar('TrainEpochLoss/MelLoss', avg_mel_loss, current_step)
    tb.add_scalar('TrainEpochLoss/StopLoss', avg_stop_loss, current_step)
    tb.add_scalar('Time/EpochTime', epoch_time, epoch)

    return avg_linear_loss, current_step
Example #17
0
def evaluate(model, criterion, criterion_st, data_loader, current_step):
    """Evaluate the model on a validation loader and log results to ``tb``.

    Runs the whole loader under ``torch.no_grad()``, accumulates the linear,
    mel and stop losses, then logs spectrogram/alignment images and a sample
    audio clip for one randomly chosen item of the *last* batch.

    Reads the module-level names ``c`` (config), ``use_cuda`` and ``tb``.

    Args:
        model: network whose ``forward(text, mel)`` returns
            ``(mel_output, linear_output, alignments, stop_tokens)``.
        criterion: loss called as ``criterion(output, target, lengths)``.
        criterion_st: loss called as ``criterion_st(stop_tokens, stop_targets)``.
        data_loader: iterable of batches indexed as
            ``(text, text_lengths, linear, mel, mel_lengths, stop_targets)``;
            must expose ``dataset`` with an ``ap`` audio processor.
        current_step: global training step used as the x-axis for all summaries.

    Returns:
        The linear loss averaged over all validation batches.
    """
    model = model.eval()
    epoch_time = 0
    avg_linear_loss = 0
    avg_mel_loss = 0
    avg_stop_loss = 0
    print(" | > Validation")
    progbar = Progbar(len(data_loader.dataset) / c.batch_size)
    # Number of leading frequency bins (those below 3000 -- presumably Hz --
    # relative to the Nyquist frequency) that get an extra loss term.
    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
    with torch.no_grad():
        for num_iter, data in enumerate(data_loader):
            start_time = time.time()

            # setup input data
            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[2]
            mel_input = data[3]
            mel_lengths = data[4]
            stop_targets = data[5]

            # set stop targets view, we predict a single stop token per r frames prediction
            stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1)
            stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float()

            # dispatch data to GPU
            if use_cuda:
                text_input = text_input.cuda()
                mel_input = mel_input.cuda()
                mel_lengths = mel_lengths.cuda()
                linear_input = linear_input.cuda()
                stop_targets = stop_targets.cuda()

            # forward pass
            mel_output, linear_output, alignments, stop_tokens = \
                model.forward(text_input, mel_input)

            # loss computation
            stop_loss = criterion_st(stop_tokens, stop_targets)
            mel_loss = criterion(mel_output, mel_input, mel_lengths)
            linear_loss = 0.5 * criterion(linear_output, linear_input, mel_lengths) \
                          + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                            linear_input[:, :, :n_priority_freq],
                                            mel_lengths)
            loss = mel_loss + linear_loss + stop_loss

            step_time = time.time() - start_time
            epoch_time += step_time

            # update
            progbar.update(num_iter + 1, values=[('total_loss', loss.item()),
                                                 ('linear_loss', linear_loss.item()),
                                                 ('mel_loss', mel_loss.item()),
                                                 ('stop_loss', stop_loss.item())])

            avg_linear_loss += linear_loss.item()
            avg_mel_loss += mel_loss.item()
            avg_stop_loss += stop_loss.item()

    # Diagnostic visualizations: the tensors below still hold the last batch.
    idx = np.random.randint(mel_input.shape[0])
    const_spec = linear_output[idx].data.cpu().numpy()
    gt_spec = linear_input[idx].data.cpu().numpy()
    align_img = alignments[idx].data.cpu().numpy()

    const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap)
    gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap)
    align_img = plot_alignment(align_img)

    tb.add_image('ValVisual/Reconstruction', const_spec, current_step)
    tb.add_image('ValVisual/GroundTruth', gt_spec, current_step)
    tb.add_image('ValVisual/ValidationAlignment', align_img, current_step)

    # Sample audio
    audio_signal = linear_output[idx].data.cpu().numpy()
    data_loader.dataset.ap.griffin_lim_iters = 60
    audio_signal = data_loader.dataset.ap.inv_spectrogram(audio_signal.T)
    try:
        tb.add_audio('ValSampleAudio', audio_signal, current_step,
                     sample_rate=c.sample_rate)
    except Exception:
        # Best effort: a failed audio summary must not abort validation,
        # but Ctrl+C / SystemExit are no longer swallowed here.
        pass

    # compute average losses
    avg_linear_loss /= (num_iter + 1)
    avg_mel_loss /= (num_iter + 1)
    avg_stop_loss /= (num_iter + 1)
    # Use the averaged stop loss; the previous code added the last batch's
    # stop_loss tensor, mixing a per-batch value into an epoch average.
    avg_total_loss = avg_mel_loss + avg_linear_loss + avg_stop_loss

    # Plot Learning Stats
    tb.add_scalar('ValEpochLoss/TotalLoss', avg_total_loss, current_step)
    tb.add_scalar('ValEpochLoss/LinearLoss', avg_linear_loss, current_step)
    tb.add_scalar('ValEpochLoss/MelLoss', avg_mel_loss, current_step)
    tb.add_scalar('ValEpochLoss/Stop_loss', avg_stop_loss, current_step)

    return avg_linear_loss
Example #18
0
 def dl_progress(count, block_size, total_size):
     """Download progress hook backed by the shared ``progbar`` global.

     The first call (while ``progbar`` is still ``None``) only creates the
     bar sized to ``total_size``; every later call moves it to
     ``count * block_size``.
     """
     global progbar
     if progbar is not None:
         progbar.update(count * block_size)
         return
     progbar = Progbar(total_size)
Example #19
0
def process(opt):
    """Pool frame-level features into video-level and sub-video-level vectors.

    For every video found in the frame feature store, one pooled vector is
    written for the whole video, followed by one pooled vector per augmented
    frame subset produced by ``Frame_Level_Augmenter``.  The text file is then
    handed to ``text2bin`` and removed, and the video -> sub-video id mapping
    is saved as ``video2subvideo.txt`` in the output directory.

    Args:
        opt: options object providing ``rootpath``, ``collection``,
            ``feature``, ``stride`` (dash-separated integers, e.g. ``"2-4"``),
            ``overwrite`` and ``pooling_style`` (``'avg'`` or ``'max'``).

    Raises:
        ValueError: if ``opt.pooling_style`` is neither ``'avg'`` nor ``'max'``
            (previously this surfaced later as a NameError).
        SystemExit: with code 0 when ``checkToSkip`` reports that the output
            already exists and must not be overwritten.
    """
    rootpath = opt.rootpath
    collection = opt.collection
    feature = opt.feature
    stride = opt.stride
    overwrite = opt.overwrite
    pooling_style = opt.pooling_style

    feat_path = os.path.join(rootpath, collection, "FeatureData", feature)

    output_dir = os.path.join(rootpath, collection, "FeatureData", '%s-' % pooling_style + feature + "-stride%s" % stride)
    feat_combined_file = os.path.join(output_dir, "id_feat.txt")
    if checkToSkip(os.path.join(output_dir, "feature.bin"), overwrite):
        sys.exit(0)

    # Fail fast, before anything is created on disk.
    if pooling_style not in ('avg', 'max'):
        raise ValueError("unsupported pooling_style: %r" % (pooling_style,))
    makedirsforfile(feat_combined_file)

    # Parenthesised single-string print behaves the same on Python 2 and 3.
    print("Generate augmented frame-level features and operate mean pooling...")

    def pool(feats):
        """Reduce a list of frame vectors to one vector along axis 0."""
        feats = np.array(feats)
        if pooling_style == 'avg':
            return feats.mean(axis=0)
        return feats.max(axis=0)

    # Group frame numbers by video; frame ids look like "<video_id>_<frame_no>".
    feat_data = BigFile(feat_path)
    video2fmnos = {}
    for frame_id in feat_data.names:
        data = frame_id.strip().split("_")
        video_id = '_'.join(data[:-1])
        fm_no = data[-1]
        video2fmnos.setdefault(video_id, []).append(int(fm_no))

    # Rebuild the frame ids in ascending frame order.
    # NOTE(review): str(int(...)) drops leading zeros of the frame number --
    # confirm the stored ids are not zero-padded.
    video2frames = {}
    for video_id, fmnos in video2fmnos.items():
        video2frames[video_id] = [video_id + "_" + str(fm_no) for fm_no in sorted(fmnos)]

    # A real list (not a lazy map object) so it can be consumed repeatedly.
    stride_steps = [int(s) for s in stride.strip().split('-')]
    f_auger = Frame_Level_Augmenter(stride_steps)

    video2subvideo = {}
    progbar = Progbar(len(video2frames))
    with open(feat_combined_file, 'w') as fout:
        for video in video2frames:
            frame_ids = video2frames[video]

            # output the whole video level feature
            video2subvideo.setdefault(video, []).append(video)
            _, feats = feat_data.read(frame_ids)
            feat_vec = pool(feats)
            fout.write(video + " " + " ".join(map(str, feat_vec)) + '\n')

            # output the sub video level feature
            aug_index = f_auger.get_aug_index(len(frame_ids))  # get augmented frame list
            for counter, sub_index in enumerate(aug_index):
                sub_frames = [frame_ids[idx] for idx in sub_index]
                _, sub_feats = feat_data.read(sub_frames)
                feat_vec = pool(sub_feats)

                sub_name = video + "_sub%d" % counter
                video2subvideo.setdefault(video, []).append(sub_name)
                fout.write(sub_name + " " + " ".join(map(str, feat_vec)) + '\n')
            progbar.add(1)

    with open(os.path.join(output_dir, "video2subvideo.txt"), 'w') as f:
        f.write(str(video2subvideo))

    # feat_vec still holds the last pooled vector; its length is the feature
    # dimensionality passed to text2bin.
    text2bin(len(feat_vec), [feat_combined_file], output_dir, 1)
    # Remove the intermediate text file without going through a shell.
    os.remove(feat_combined_file)
Example #20
0
def process(options, collection):
    """Extract one feature vector per image of a collection into a text file.

    Image ids and paths are read from ``<rootpath>/<collection>/id.imagepath.txt``
    (two whitespace-separated columns per line).  Each successfully processed
    image yields one ``"<id> <f1> <f2> ..."`` line in ``id.feature.txt`` under
    the collection's ``FeatureData/<feat_name>`` directory.  Images that fail
    are collected and written to ``feature.fails.txt`` in the collection dir.

    Args:
        options: options object providing ``rootpath``, ``oversample``,
            ``model_prefix``, ``overwrite`` and ``gpu``.
        collection: name of the collection sub-directory under ``rootpath``.

    Returns:
        ``0`` when the work is skipped (output exists and ``overwrite`` is
        false, or no feature extractor could be created); ``None`` otherwise.
    """
    rootpath = options.rootpath
    oversample = options.oversample
    model_prefix = os.path.join(rootpath, options.model_prefix)
    sub_mean = model_prefix.find('resnext-101_rbps13k') >= 0
    logger.info('subtract mean? %d', sub_mean)
    layer = 'flatten0_output'
    batch_size = 1  # change the batch size will get slightly different feature vectors. So stick to batch size of 1.
    feat_name = get_feat_name(model_prefix, layer, oversample)
    feat_dir = os.path.join(rootpath, collection, 'FeatureData', feat_name)
    id_file = os.path.join(feat_dir, 'id.txt')
    feat_file = os.path.join(feat_dir, 'id.feature.txt')

    for x in [id_file, feat_file]:
        if os.path.exists(x):
            if not options.overwrite:
                logger.info('%s exists. skip', x)
                return 0
            else:
                logger.info('%s exists. overwrite', x)

    id_path_file = os.path.join(rootpath, collection, 'id.imagepath.txt')
    # Materialise the lines once: they are traversed twice below, which would
    # silently yield nothing the second time with a lazy map object.
    with open(id_path_file) as fr:
        data = [line.strip() for line in fr]
    img_ids = [x.split()[0] for x in data]
    filenames = [x.split()[1] for x in data]

    fe_mod = get_feat_extractor(model_prefix=model_prefix,
                                gpuid=options.gpu,
                                oversample=oversample)
    if fe_mod is None:
        return 0

    if not os.path.exists(feat_dir):
        os.makedirs(feat_dir)

    fails_id_path = []

    # A list so that len() works regardless of the Python version.
    im2path = list(zip(img_ids, filenames))
    success = 0
    fail = 0

    start_time = time.time()
    logger.info('%d images, %d done, %d to do', len(img_ids), 0, len(img_ids))
    progbar = Progbar(len(im2path))

    with open(feat_file, 'w') as fw:
        for imgid, impath in im2path:
            try:
                imid, features = extract_mxnet_feat(fe_mod, imgid, impath,
                                                    sub_mean, oversample)
                fw.write('%s %s\n' % (imid, ' '.join(['%g' % x
                                                      for x in features])))
                success += 1
            except Exception as e:
                # Keep going on per-image failures, but record why it failed.
                fail += 1
                logger.error('failed to process %s: %s', impath, e)
                logger.info('%d success, %d fail', success, fail)
                fails_id_path.append((imgid, impath))
            finally:
                progbar.add(1)

    logger.info('%d success, %d fail', success, fail)
    elapsed_time = time.time() - start_time
    logger.info('total running time %s',
                time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

    if fails_id_path:
        fail_path = os.path.join(rootpath, collection, 'feature.fails.txt')
        with open(fail_path, 'w') as fail_fw:
            for (imgid, impath) in fails_id_path:
                fail_fw.write('%s %s\n' % (imgid, impath))
Example #21
0
def train(model, criterion, data_loader, optimizer, epoch):
    """Run one training epoch and log statistics to the module-level ``tb``.

    Reads the module-level names ``c`` (config), ``args`` (for
    ``restore_step``), ``use_cuda``, ``tb`` and ``OUT_PATH``.

    Args:
        model: network whose ``forward(text, mel)`` returns
            ``(mel_output, linear_output, alignments)``.
        criterion: loss called as ``criterion(output, target, lengths)``.
        data_loader: iterable of batches indexed as
            ``(text, text_lengths, linear, mel, mel_lengths)``; must expose
            ``dataset`` with an ``ap`` audio processor.
        optimizer: optimizer whose learning rate is reset every step.
        epoch: zero-based epoch index, used to derive the global step.

    Returns:
        Tuple ``(avg_linear_loss, current_step)``: the epoch-averaged linear
        loss and the global step of the last iteration.
    """
    model = model.train()
    epoch_time = 0
    avg_linear_loss = 0
    avg_mel_loss = 0

    print(" | > Epoch {}/{}".format(epoch, c.epochs))
    progbar = Progbar(len(data_loader.dataset) / c.batch_size)
    # Number of leading frequency bins (those below 3000 -- presumably Hz --
    # relative to the Nyquist frequency) that get an extra loss term.
    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
    for num_iter, data in enumerate(data_loader):
        start_time = time.time()

        # setup input data
        text_input = data[0]
        text_lengths = data[1]
        linear_input = data[2]
        mel_input = data[3]
        mel_lengths = data[4]

        # Global step continues from the restored checkpoint, if any.
        current_step = num_iter + args.restore_step + \
            epoch * len(data_loader) + 1

        # setup lr
        current_lr = lr_decay(c.lr, current_step, c.warmup_steps)
        for params_group in optimizer.param_groups:
            params_group['lr'] = current_lr

        optimizer.zero_grad()

        # convert inputs to variables
        # NOTE(review): Variable/volatile are legacy (pre-0.4) PyTorch API;
        # `volatile=True` on a training target looks unintended -- confirm
        # against the PyTorch version this file targets.
        text_input_var = Variable(text_input)
        mel_spec_var = Variable(mel_input)
        mel_lengths_var = Variable(mel_lengths)
        linear_spec_var = Variable(linear_input, volatile=True)

        # dispatch data to GPU
        if use_cuda:
            text_input_var = text_input_var.cuda()
            mel_spec_var = mel_spec_var.cuda()
            mel_lengths_var = mel_lengths_var.cuda()
            linear_spec_var = linear_spec_var.cuda()

        # forward pass
        mel_output, linear_output, alignments =\
            model.forward(text_input_var, mel_spec_var)

        # loss computation
        mel_loss = criterion(mel_output, mel_spec_var, mel_lengths_var)
        # Linear loss is the mean of the full-band loss and the loss restricted
        # to the first n_priority_freq bins.
        linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \
            + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                              linear_spec_var[:, :, :n_priority_freq],
                              mel_lengths_var)
        loss = mel_loss + linear_loss

        # backpass and check the grad norm
        loss.backward()
        grad_norm, skip_flag = check_update(model, 0.5, 100)
        if skip_flag:
            # Drop this batch's gradients; nothing below is logged for it.
            optimizer.zero_grad()
            print(" | > Iteration skipped!!")
            continue
        optimizer.step()

        step_time = time.time() - start_time
        epoch_time += step_time

        # update
        progbar.update(num_iter + 1,
                       values=[('total_loss', loss.data[0]),
                               ('linear_loss', linear_loss.data[0]),
                               ('mel_loss', mel_loss.data[0]),
                               ('grad_norm', grad_norm)])
        avg_linear_loss += linear_loss.data[0]
        avg_mel_loss += mel_loss.data[0]

        # Plot Training Iter Stats
        tb.add_scalar('TrainIterLoss/TotalLoss', loss.data[0], current_step)
        tb.add_scalar('TrainIterLoss/LinearLoss', linear_loss.data[0],
                      current_step)
        tb.add_scalar('TrainIterLoss/MelLoss', mel_loss.data[0], current_step)
        tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                      current_step)
        tb.add_scalar('Params/GradNorm', grad_norm, current_step)
        tb.add_scalar('Time/StepTime', step_time, current_step)

        if current_step % c.save_step == 0:
            if c.checkpoint:
                # save model
                save_checkpoint(model, optimizer, linear_loss.data[0],
                                OUT_PATH, current_step, epoch)

            # Diagnostic visualizations (first sample of the batch)
            const_spec = linear_output[0].data.cpu().numpy()
            gt_spec = linear_spec_var[0].data.cpu().numpy()

            const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap)
            gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap)
            tb.add_image('Visual/Reconstruction', const_spec, current_step)
            tb.add_image('Visual/GroundTruth', gt_spec, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Visual/Alignment', align_img, current_step)

            # Sample audio
            audio_signal = linear_output[0].data.cpu().numpy()
            data_loader.dataset.ap.griffin_lim_iters = 60
            audio_signal = data_loader.dataset.ap.inv_spectrogram(
                audio_signal.T)
            # Audio logging is best effort: any failure is silently ignored.
            try:
                tb.add_audio('SampleAudio',
                             audio_signal,
                             current_step,
                             sample_rate=c.sample_rate)
            except:
                # print("\n > Error at audio signal on TB!!")
                # print(audio_signal.max())
                # print(audio_signal.min())
                pass

    # NOTE: the divisor counts every iteration, including skipped ones that
    # contributed nothing to the running sums.
    avg_linear_loss /= (num_iter + 1)
    avg_mel_loss /= (num_iter + 1)
    avg_total_loss = avg_mel_loss + avg_linear_loss

    # Plot Training Epoch Stats
    tb.add_scalar('TrainEpochLoss/TotalLoss', avg_total_loss, current_step)
    tb.add_scalar('TrainEpochLoss/LinearLoss', avg_linear_loss, current_step)
    tb.add_scalar('TrainEpochLoss/MelLoss', avg_mel_loss, current_step)
    tb.add_scalar('Time/EpochTime', epoch_time, epoch)
    epoch_time = 0

    return avg_linear_loss, current_step
Example #22
0
def main(args):
    """Set up an experiment folder and train a Tacotron model on LJSpeech.

    Loads the config from ``args.config_path``, creates the experiment output
    folder, builds the dataset/loader, model and Adam optimizer, optionally
    restores a checkpoint, then trains for ``c.epochs`` epochs while logging
    to TensorBoard and saving checkpoints / the best model.

    Reads the module-level name ``use_cuda``.

    Args:
        args: parsed command-line arguments providing ``config_path``,
            ``restore_step`` and ``restore_path``.
    """
    # setup output paths and read configs
    c = load_config(args.config_path)
    script_dir = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(script_dir, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # save config to tmp place to be loaded by subsequent modules.
    file_name = str(os.getpid())
    tmp_path = os.path.join("/tmp/", file_name+'_tts')
    # Context manager so the dump is flushed and the handle closed right away.
    with open(tmp_path, "wb") as tmp_file:
        pickle.dump(c, tmp_file)

    # setup tensorboard
    LOG_DIR = OUT_PATH
    tb = SummaryWriter(LOG_DIR)

    # Ctrl+C handler to remove empty experiment folder
    # (parameter named `signum` so it does not shadow the `signal` module)
    def signal_handler(signum, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(1)
    signal.signal(signal.SIGINT, signal_handler)

    # Setup the dataset
    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r,
                              c.sample_rate,
                              c.text_cleaner,
                              c.num_mels,
                              c.min_level_db,
                              c.frame_shift_ms,
                              c.frame_length_ms,
                              c.preemphasis,
                              c.ref_level_db,
                              c.num_freq,
                              c.power
                             )

    dataloader = DataLoader(dataset, batch_size=c.batch_size,
                            shuffle=True, collate_fn=dataset.collate_fn,
                            drop_last=True, num_workers=c.num_loader_workers)

    # setup the model
    model = Tacotron(c.embedding_size,
                     c.hidden_size,
                     c.num_mels,
                     c.num_freq,
                     c.r)

    # plot model on tensorboard
    # (currently unused: the ONNX export below is disabled)
    dummy_input = dataset.get_dummy_data()

    ## TODO: onnx does not support RNN fully yet
    # model_proto_path = os.path.join(OUT_PATH, "model.proto")
    # onnx.export(model, dummy_input, model_proto_path, verbose=True)
    # tb.add_graph_onnx(model_proto_path)

    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    if args.restore_step:
        checkpoint = torch.load(os.path.join(
            args.restore_path, 'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)
        # NOTE: start_epoch is informational only; the epoch loop below starts
        # at 0 and the global step is offset by args.restore_step instead.
        start_epoch = checkpoint['step'] // len(dataloader)
        # Resume best-model tracking from the checkpoint.  This value used to
        # be overwritten with inf before the training loop.
        best_loss = checkpoint['linear_loss']
    else:
        start_epoch = 0
        best_loss = float('inf')
        print("\n > Starting a new training")

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    # Number of leading frequency bins (those below 3000 -- presumably Hz --
    # relative to the Nyquist frequency) that get an extra loss term.
    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    # Build the length tensor on the device the model runs on; the CUDA type
    # was previously used unconditionally, which fails on CPU-only runs.
    length_tensor_type = torch.cuda.LongTensor if use_cuda else torch.LongTensor

    #lr_scheduler = ReduceLROnPlateau(optimizer, factor=c.lr_decay,
    #                               patience=c.lr_patience, verbose=True)
    epoch_time = 0
    for epoch in range(0, c.epochs):

        print("\n | > Epoch {}/{}".format(epoch, c.epochs))
        progbar = Progbar(len(dataset) / c.batch_size)

        for num_iter, data in enumerate(dataloader):
            start_time = time.time()

            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[2]
            mel_input = data[3]

            current_step = num_iter + args.restore_step + epoch * len(dataloader) + 1

            # setup lr
            current_lr = lr_decay(c.lr, current_step)
            for params_group in optimizer.param_groups:
                params_group['lr'] = current_lr

            optimizer.zero_grad()

            # Add a single frame of zeros to Mel Specs for better end detection
            #try:
            #    mel_input = np.concatenate((np.zeros(
            #        [c.batch_size, 1, c.num_mels], dtype=np.float32),
            #        mel_input[:, 1:, :]), axis=1)
            #except:
            #    raise TypeError("not same dimension")

            # convert inputs to variables
            text_input_var = Variable(text_input)
            mel_spec_var = Variable(mel_input)
            linear_spec_var = Variable(linear_input, volatile=True)

            # sort sequence by length.
            # TODO: might be unnecessary
            sorted_lengths, indices = torch.sort(
                     text_lengths.view(-1), dim=0, descending=True)
            sorted_lengths = sorted_lengths.long().numpy()

            # Reorder every input with the same permutation.
            text_input_var = text_input_var[indices]
            mel_spec_var = mel_spec_var[indices]
            linear_spec_var = linear_spec_var[indices]

            if use_cuda:
                text_input_var = text_input_var.cuda()
                mel_spec_var = mel_spec_var.cuda()
                linear_spec_var = linear_spec_var.cuda()

            input_lengths_var = torch.autograd.Variable(
                length_tensor_type(sorted_lengths))
            mel_output, linear_output, alignments =\
                model.forward(text_input_var, mel_spec_var,
                              input_lengths=input_lengths_var)

            mel_loss = criterion(mel_output, mel_spec_var)
            #linear_loss = torch.abs(linear_output - linear_spec_var)
            #linear_loss = 0.5 * \
                #torch.mean(linear_loss) + 0.5 * \
                #torch.mean(linear_loss[:, :n_priority_freq, :])
            linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
                    + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                      linear_spec_var[: ,: ,:n_priority_freq])
            loss = mel_loss + linear_loss
            # loss = loss.cuda()

            loss.backward()
            grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)  ## TODO: maybe no need
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
                                       ('linear_loss', linear_loss.data[0]),
                                       ('mel_loss', mel_loss.data[0]),
                                       ('grad_norm', grad_norm)])

            # Plot Learning Stats
            tb.add_scalar('Loss/TotalLoss', loss.data[0], current_step)
            tb.add_scalar('Loss/LinearLoss', linear_loss.data[0],
                          current_step)
            tb.add_scalar('Loss/MelLoss', mel_loss.data[0], current_step)
            tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                          current_step)
            tb.add_scalar('Params/GradNorm', grad_norm, current_step)
            tb.add_scalar('Time/StepTime', step_time, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Attn/Alignment', align_img, current_step)

            if current_step % c.save_step == 0:

                if c.checkpoint:
                    # save model
                    save_checkpoint(model, optimizer, linear_loss.data[0],
                                    OUT_PATH, current_step, epoch)

                # Diagnostic visualizations (first sample of the batch)
                const_spec = linear_output[0].data.cpu().numpy()
                gt_spec = linear_spec_var[0].data.cpu().numpy()

                const_spec = plot_spectrogram(const_spec, dataset.ap)
                gt_spec = plot_spectrogram(gt_spec, dataset.ap)
                tb.add_image('Spec/Reconstruction', const_spec, current_step)
                tb.add_image('Spec/GroundTruth', gt_spec, current_step)

                align_img = alignments[0].data.cpu().numpy()
                align_img = plot_alignment(align_img)
                tb.add_image('Attn/Alignment', align_img, current_step)

                # Sample audio
                audio_signal = linear_output[0].data.cpu().numpy()
                dataset.ap.griffin_lim_iters = 60
                audio_signal = dataset.ap.inv_spectrogram(audio_signal.T)
                try:
                    tb.add_audio('SampleAudio', audio_signal, current_step,
                                 sample_rate=c.sample_rate)
                except Exception:
                    # Best effort: report and keep training.  Ctrl+C is left
                    # to the SIGINT handler instead of being swallowed here.
                    print("\n > Error at audio signal on TB!!")
                    print(audio_signal.max())
                    print(audio_signal.min())


        # average loss after the epoch
        avg_epoch_loss = np.mean(
            progbar.sum_values['linear_loss'][0] / max(1, progbar.sum_values['linear_loss'][1]))
        best_loss = save_best_model(model, optimizer, avg_epoch_loss,
                                    best_loss, OUT_PATH,
                                    current_step, epoch)

        #lr_scheduler.step(loss.data[0])
        tb.add_scalar('Time/EpochTime', epoch_time, epoch)
        epoch_time = 0
Example #23
0
def evaluate(model, criterion, data_loader, current_step):
    """Run one validation pass over ``data_loader`` and log it to TensorBoard.

    For every batch the model is run in eval mode and two losses are
    computed with ``criterion``: a mel loss and a linear-spectrogram loss.
    The linear loss is the average of the loss over all frequency bins and
    the loss over the first ``n_priority_freq`` bins only.

    After the loop, one random item of the *last* batch is visualised
    (predicted spectrogram, ground-truth spectrogram, attention alignment)
    and a Griffin-Lim audio sample of it is written to TensorBoard, followed
    by the per-batch average losses.

    Args:
        model: network exposing ``eval()`` and
            ``forward(text, mel) -> (mel_out, linear_out, alignments)``.
        criterion: callable ``criterion(output, target, lengths)`` returning
            a loss Variable.
        data_loader: iterable of batches indexed as
            ``(text, text_lengths, linear, mel, mel_lengths, ...)``; must
            expose ``.dataset`` with an ``ap`` audio processor.
        current_step: global step used as the x-axis for TensorBoard logs.

    Returns:
        The linear loss averaged over all validation batches.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    model = model.eval()
    avg_linear_loss = 0
    avg_mel_loss = 0
    num_batches = 0

    print(" | > Validation")
    progbar = Progbar(len(data_loader.dataset) / c.batch_size)
    # Number of leading linear-spectrogram bins that get extra loss weight:
    # the 3000 / (sample_rate / 2) fraction of ``c.num_freq``.
    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
    for num_iter, data in enumerate(data_loader):
        num_batches = num_iter + 1

        # setup input data (data[1], the text lengths, is not needed here)
        text_input = data[0]
        linear_input = data[2]
        mel_input = data[3]
        mel_lengths = data[4]

        # convert inputs to variables; every input is volatile so that the
        # forward pass itself does not record an autograd graph during
        # validation (previously only the linear target was volatile).
        text_input_var = Variable(text_input, volatile=True)
        mel_spec_var = Variable(mel_input, volatile=True)
        mel_lengths_var = Variable(mel_lengths, volatile=True)
        linear_spec_var = Variable(linear_input, volatile=True)

        # dispatch data to GPU
        if use_cuda:
            text_input_var = text_input_var.cuda()
            mel_spec_var = mel_spec_var.cuda()
            mel_lengths_var = mel_lengths_var.cuda()
            linear_spec_var = linear_spec_var.cuda()

        # forward pass
        mel_output, linear_output, alignments =\
            model.forward(text_input_var, mel_spec_var)

        # loss computation
        mel_loss = criterion(mel_output, mel_spec_var, mel_lengths_var)
        linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \
            + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                              linear_spec_var[:, :, :n_priority_freq],
                              mel_lengths_var)
        loss = mel_loss + linear_loss

        # pull the scalar values out once and reuse them
        linear_loss_val = linear_loss.data[0]
        mel_loss_val = mel_loss.data[0]

        # update
        progbar.update(num_batches,
                       values=[('total_loss', loss.data[0]),
                               ('linear_loss', linear_loss_val),
                               ('mel_loss', mel_loss_val)])

        avg_linear_loss += linear_loss_val
        avg_mel_loss += mel_loss_val

    # Everything below reads variables bound inside the loop; fail with a
    # clear message instead of an UnboundLocalError when there was no batch.
    if num_batches == 0:
        raise ValueError(
            "evaluate() received an empty data_loader; nothing to validate.")

    # Diagnostic visualizations (random item from the last batch)
    idx = np.random.randint(mel_input.shape[0])
    const_spec = linear_output[idx].data.cpu().numpy()
    gt_spec = linear_spec_var[idx].data.cpu().numpy()
    align_img = alignments[idx].data.cpu().numpy()

    const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap)
    gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap)
    align_img = plot_alignment(align_img)

    tb.add_image('ValVisual/Reconstruction', const_spec, current_step)
    tb.add_image('ValVisual/GroundTruth', gt_spec, current_step)
    tb.add_image('ValVisual/ValidationAlignment', align_img, current_step)

    # Sample audio
    audio_signal = linear_output[idx].data.cpu().numpy()
    data_loader.dataset.ap.griffin_lim_iters = 60
    audio_signal = data_loader.dataset.ap.inv_spectrogram(audio_signal.T)
    try:
        tb.add_audio('ValSampleAudio',
                     audio_signal,
                     current_step,
                     sample_rate=c.sample_rate)
    except Exception as err:
        # Audio logging stays best-effort, but the failure is reported and
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print(" | > Error at audio signal on TB!! ({})".format(err))

    # compute average losses
    avg_linear_loss /= num_batches
    avg_mel_loss /= num_batches
    avg_total_loss = avg_mel_loss + avg_linear_loss

    # Plot Learning Stats
    tb.add_scalar('ValEpochLoss/TotalLoss', avg_total_loss, current_step)
    tb.add_scalar('ValEpochLoss/LinearLoss', avg_linear_loss, current_step)
    tb.add_scalar('ValEpochLoss/MelLoss', avg_mel_loss, current_step)
    return avg_linear_loss