Example #1
import itertools
import random

def list_data(
    dataset_dir,
    speaker_ids,
    noise_dirs,
    max_files=None,
    shuffle=True,
    augmentation_factor=1,
    oversample_noise=True,
):
    speech_dataset = AudioVisualDataset(dataset_dir)
    speech_subset = speech_dataset.subset(speaker_ids, max_files, shuffle)

    noise_dataset = AudioDataset(noise_dirs)
    noise_subset = noise_dataset.subset(max_files, shuffle)

    if not oversample_noise:
        n_files = min(len(speech_subset), len(noise_subset))
        speech_entries = speech_subset[:n_files]
        noise_entries = noise_subset[:n_files]
    else:
        speech_and_noise_entries = [
            (s, n) for s, n in zip(speech_subset, itertools.cycle(noise_subset))
        ]
        speech_entries, noise_entries = [
            list(x) for x in zip(*speech_and_noise_entries)
        ]

    # Copy before augmenting: "+=" extends a list in place, so aliasing
    # speech_entries here would also grow the source lists below.
    all_speech_entries = list(speech_entries)
    all_noise_file_paths = list(noise_entries)

    for i in range(augmentation_factor - 1):
        all_speech_entries += speech_entries
        all_noise_file_paths += random.sample(noise_entries, len(noise_entries))

    return all_speech_entries, all_noise_file_paths
Example #2
import random

def list_data(dataset_dir,
              speaker_ids,
              noise_dirs,
              max_files=None,
              shuffle=True,
              augmentation_factor=1):
    speech_dataset = AudioVisualDataset(dataset_dir)
    speech_subset = speech_dataset.subset(speaker_ids, max_files, shuffle)

    noise_dataset = AudioDataset(noise_dirs)
    noise_file_paths = noise_dataset.subset(max_files, shuffle)

    n_files = min(len(speech_subset), len(noise_file_paths))

    speech_entries = speech_subset[:n_files]
    noise_file_paths = noise_file_paths[:n_files]

    # Copy before augmenting to avoid growing the source lists in place.
    all_speech_entries = list(speech_entries)
    all_noise_file_paths = list(noise_file_paths)

    for i in range(augmentation_factor - 1):
        all_speech_entries += speech_entries
        all_noise_file_paths += random.sample(noise_file_paths,
                                              len(noise_file_paths))

    return all_speech_entries, all_noise_file_paths
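Note on both list_data variants above: the lists must be copied before the augmentation loop, because += extends a list in place. Binding all_speech_entries = speech_entries would alias the source list, which then doubles on every pass. A minimal standalone sketch of the pitfall and of the fix applied above (toy data only, no project dependencies assumed):

import random

# Buggy: "all_entries" aliases "entries", so the in-place "+=" also
# grows the source list (and doubles it on every pass).
entries = ["a", "b"]
all_entries = entries
all_entries += entries            # entries is now ["a", "b", "a", "b"] too

# Fixed: copy first, then extend the copy.
entries = ["a", "b"]
augmented = list(entries)
for _ in range(3 - 1):            # augmentation_factor = 3
    augmented += random.sample(entries, len(entries))
print(len(entries), len(augmented))   # 2 6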
Example #3
def main(log_dir, results_path):

    model = AudioDenoiserNet(cfg.SEQUENCE_LENGTH)
    # weight initialization
    model = model.apply(weight_init)

    if torch.cuda.is_available():
        model.cuda()

    batch_size = cfg.BATCH_SIZE
    learning_rate = cfg.LEARNING_RATE
    num_epochs = cfg.NUM_EPOCHS

    train_dataset = AudioDataset(cfg.DATA_DIR, "training")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=batch_size,
                                   shuffle=False,
                                   num_workers=2)

    val_dataset = AudioDataset(cfg.DATA_DIR, "validation")
    val_data_loader = DataLoader(val_dataset,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 num_workers=2)

    # hyperparameter_search(model, train_data_loader, val_data_loader, log_dir, num_epochs)
    train_loss, valid_loss = train_model(model, train_data_loader,
                                         val_data_loader, log_dir, batch_size,
                                         learning_rate, num_epochs)
Example #4
def main(csv_base):
    sequence_length = 8
    model = AudioDenoiserNet(sequence_length)
    # weight initialization
    model = model.apply(weight_init)

    if torch.cuda.is_available():
        model.cuda()

    batch_size = 512
    learning_rate = 0.001
    num_epochs = 30

    data_dir = "/local/mnt2/workspace2/tkuai/cnn_audio_denoiser/pytorch_dataset2/old_data"
    saving_figure = "/local/mnt2/workspace2/tkuai/cnn_audio_denoiser/pytorch_model2/results2/"

    train_dataset = AudioDataset(data_dir, "training")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=batch_size,
                                   shuffle=False,
                                   num_workers=2)

    val_dataset = AudioDataset(data_dir, "validation")
    val_data_loader = DataLoader(val_dataset,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 num_workers=2)

    train_loss, valid_loss = train_model(model, train_data_loader,
                                         val_data_loader, csv_base, batch_size,
                                         learning_rate, num_epochs)
    plot_loss(train_loss, valid_loss, saving_figure)
Example #5
def main(args):

    # initialize dataset
    audio_dataset = AudioDataset(args.manifest,
                                 batch_size=args.batch_size,
                                 root_dir=args.root_dir)

    audio_dataloader = audio_dataset.dataloader()

    # initialize attack class
    attack = Attack(args.model_path,
                    batch_size=args.batch_size,
                    lr_stage1=args.lr_stage1,
                    lr_stage2=args.lr_stage2,
                    num_iter_stage1=args.num_iter_stage1,
                    num_iter_stage2=args.num_iter_stage2)

    # run stage 1 of the attack
    attack.attack_stage1(audio_dataloader)

    for idx, batch in enumerate(audio_dataloader):
        for key, value in batch.items():
            try:
                print(key, ':', value.shape)
            except AttributeError:
                print(key, ':', value)

        # inspect only the first batch, then stop
        exit()
Example #6
    def test_audio_dataset(self):
        mel_ids = ['small_sample']
        text_dict = {'small_sample': 'Small sample text.'}
        cleaner = lambda x: x.lower()
        symbols = 'abcdefghijklmnopqrstuvwxyz. '
        tokenizer = Tokenizer(cleaners=cleaner, symbols=symbols)
        dataset = AudioDataset(mel_path=self.mel_path,
                               mel_ids=mel_ids,
                               text_dict=text_dict,
                               tokenizer=tokenizer)
        self.assertEqual(1, len(dataset))
        seq, mel, mel_id, mel_len = dataset[0]

        text = tokenizer.decode(seq)
        self.assertEqual('small sample text.', text)
        self.assertEqual((101, 80), mel.shape)
        self.assertEqual('small_sample', mel_id)
        self.assertEqual(101, mel_len)
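For orientation, here is a minimal sketch of the AudioDataset interface this test exercises. The one-.npy-file-per-id mel layout and the tokenizer.encode call are assumptions made for illustration, not the project's actual implementation:

from pathlib import Path

import numpy as np
from torch.utils.data import Dataset


class AudioDatasetSketch(Dataset):
    """Hypothetical stand-in pairing tokenized text with precomputed mels."""

    def __init__(self, mel_path, mel_ids, text_dict, tokenizer):
        self.mel_path = Path(mel_path)
        self.mel_ids = mel_ids
        self.text_dict = text_dict
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.mel_ids)

    def __getitem__(self, index):
        mel_id = self.mel_ids[index]
        # Assumed layout: one (frames, n_mels) array stored as <mel_id>.npy.
        mel = np.load(self.mel_path / f'{mel_id}.npy')
        seq = self.tokenizer.encode(self.text_dict[mel_id])
        return seq, mel, mel_id, mel.shape[0]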
Example #7
def test(save_epoch, batchsize, data_path, save_path, modeldir, cls_num):

    # Dataset definition
    dataset = AudioDataset(data_path)
    collator = AudioCollator(cls_num)

    # Model & Optimizer definition
    generator = Generator(cls_num=cls_num)
    generator.load_state_dict(
        torch.load(f"{modeldir}/generator_{save_epoch - 1}.model"))
    generator.eval()  # evaluation mode

    # Data loader
    dataloader = DataLoader(dataset,
                            batch_size=batchsize,
                            shuffle=False,
                            collate_fn=collator,
                            drop_last=True)
    dataloader = tqdm(dataloader)
    output = []

    # Evaluation
    for i, data in enumerate(dataloader):
        x_sp, x_label, y_label = data
        # Only the x -> y direction is needed at evaluation time.
        x_to_y = torch.cat([y_label, x_label], dim=1)

        # Generator inference
        y_eval = generator(x_sp, x_to_y)
        y_npy = y_eval.to('cpu').detach().numpy().flatten()

        # Save to List
        output.append(y_npy)

    # Writer: concatenate the per-batch chunks into one waveform first;
    # a stacked 2-D array would be written as multi-channel audio.
    out_array = np.concatenate(output)
    out_array = 0.8 * out_array / np.max(np.abs(out_array))  # normalization
    path = str(Path(save_path)) + '.wav'
    write_wav(path, out_array, sr=22050)
Example #8
        "n_epochs": 20,
        "dropout": [0.5, 0.3],
        "masking": [20, 10],
        "sample_rate": 22050,
        "n_mels": 128,
        "n_fft": 1024,
        "win_length": 512,
        "hop_length": 512,
        "augment": True
    }

    train_loader = DataLoader(AudioDataset(
        path=os.path.join("audio", "train"),
        sample_rate=config["sample_rate"],
        n_mels=config["n_mels"],
        n_fft=config["n_fft"],
        win_length=config["win_length"],
        hop_length=config["hop_length"],
        augment=config["augment"],
    ),
                              batch_size=config["batch_size"],
                              shuffle=True,
                              pin_memory=True)
    val_loader = DataLoader(AudioDataset(
        path=os.path.join("audio", "validation"),
        sample_rate=config["sample_rate"],
        n_mels=config["n_mels"],
        n_fft=config["n_fft"],
        win_length=config["win_length"],
        hop_length=config["hop_length"],
    ),
                            batch_size=config["batch_size"],
                            shuffle=False,
                            pin_memory=True)
Example #9
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
from logger import Logger
import torch.utils.data
from dataset import AudioDataset
from model import AudioCycleGAN
from hparams import hparams as opt
import time

# Audio dataset
dataset = AudioDataset("./data/preprocess")

# Data loader
data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                          batch_size=2,
                                          shuffle=True)
data_iter = iter(data_loader)
iter_per_epoch = len(data_loader)
# CycleGAN model for audio
model = AudioCycleGAN(opt)

logger = Logger('./logs')

# Loss and optimizer
model.setup(opt)  # regular setup: load and print networks; create schedulers

total_iters = 0  # the total number of training iterations
for epoch in range(
        opt.n_epochs, opt.niter + opt.niter_decay + 1
Example #10
def experiment(cfg, fold, use_pretrained, predict_on_private):
    print(fold)
    set_seed(cfg.seed)

    device = torch.device(cfg.device)

    datasets = {
        'train': AudioDataset(cfg.data_root, fold['train'], 'train'),
        'valid': AudioDataset(cfg.data_root, fold['valid'], 'valid'),
        'public_test': AudioDataset(cfg.data_root, fold['public_test'],
                                    'public_test'),
    }

    dataloaders = {
        'train': DataLoader(datasets['train'],
                            batch_size=cfg.batch_size,
                            shuffle=True,
                            collate_fn=datasets['train'].collate_fn),
        'valid': DataLoader(datasets['valid'],
                            batch_size=cfg.batch_size,
                            shuffle=False,
                            collate_fn=datasets['valid'].collate_fn),
        'public_test': DataLoader(datasets['public_test'],
                                  batch_size=cfg.batch_size,
                                  shuffle=False,
                                  collate_fn=datasets['public_test'].collate_fn),
    }

    if predict_on_private:
        datasets['private_test'] = AudioDataset(cfg.data_root,
                                                fold['private_test'],
                                                'private_test')
        dataloaders['private_test'] = DataLoader(
            datasets['private_test'],
            batch_size=cfg.batch_size,
            shuffle=False,
            collate_fn=datasets['private_test'].collate_fn)

    model_path = f'./predictions/{cfg.init_time}/{cfg.task}_{"_".join([str(i) for i in fold["train"]])}_pretrained_model.pt'

    if use_pretrained:
        print(f'Using pre-trained model from {model_path}')
        best_metric = -1.0
    else:
        best_metric = train(cfg, datasets, dataloaders, device, model_path)

    # make predictions
    test_predictions = {}
    predict(
        cfg, model_path, dataloaders['train'], device,
        f'./predictions/{cfg.init_time}/{cfg.task}_train_{"_".join([str(i) for i in fold["train"]])}_vggish.csv'
    )
    predict(
        cfg, model_path, dataloaders['valid'], device,
        f'./predictions/{cfg.init_time}/{cfg.task}_valid_{"_".join([str(i) for i in fold["valid"]])}_vggish.csv'
    )
    test_predictions['public_test'] = predict(
        cfg, model_path, dataloaders['public_test'], device,
        f'./predictions/{cfg.init_time}/{cfg.task}_public_test_trained_on_{"_".join([str(i) for i in fold["train"]])}_vggish.csv'
    )

    if predict_on_private:
        test_predictions['private_test'] = predict(
            cfg, model_path, dataloaders['private_test'], device,
            f'./predictions/{cfg.init_time}/{cfg.task}_private_test_trained_on_{"_".join([str(i) for i in fold["train"]])}_vggish.csv'
        )

    return best_metric, test_predictions
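The loaders above delegate batching to a dataset-provided collate_fn, the usual way to batch variable-length audio features that the default collator cannot stack. A minimal sketch of such a function (the (length, feat) tensor layout and integer labels are assumptions, not this project's actual collator):

import torch

def pad_collate(batch):
    """Zero-pad variable-length (length, feat) tensors to a common length."""
    features, labels = zip(*batch)
    lengths = torch.tensor([f.shape[0] for f in features])
    padded = torch.zeros(len(features), int(lengths.max()), features[0].shape[1])
    for i, f in enumerate(features):
        padded[i, :f.shape[0]] = f
    return padded, torch.tensor(labels), lengths

# Usage: DataLoader(dataset, batch_size=cfg.batch_size, collate_fn=pad_collate)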
Example #11
    parser.add_argument("--language", type=str, default="english")
    # mode = "melgan"
    # load_vqvae = False
    # load_melgan = False
    args = parser.parse_args()

    data_path = os.path.join(args.datadir, args.language)

    if not os.path.exists(os.path.join(args.ckpt_path, "logs")):
        os.makedirs(os.path.join(args.ckpt_path, "logs"))

    if args.mode == "vqvae":
        print("[VQVAE] Loading training data...")
        rec_train_dataset = AudioDataset(
            audio_files=Path(data_path) / "rec_train_files.txt",
            segment_length=hps.seg_len,
            sampling_rate=16000,
            mode='reconst')
        num_speaker = rec_train_dataset.get_speaker_num()
        speaker2id = rec_train_dataset.get_speaker2id()
        #test_set = AudioDataset(audio_files=Path(data_path) / "test_files.txt", segment_length=22050 * 4, sampling_rate=22050, augment=False)
        train_data_loader = DataLoader(rec_train_dataset,
                                       batch_size=hps.batch_size_vqvae,
                                       shuffle=True,
                                       num_workers=4,
                                       pin_memory=True)
        #test_data_loader = DataLoader(test_set, batch_size=1)
        trainer = Trainer(
            hps=hps,
            logger_path=os.path.join(args.ckpt_path, "logs"),
Example #12
def train(out_dir, inp_txt, num_threads, task, batch_size):

    torch.set_num_threads(num_threads)
    print('Number of threads: ', torch.get_num_threads())

    melspec_dir = os.path.normpath(out_dir) + '/melspec'

    print('Create directory to save models...')
    model_dir = os.path.normpath(out_dir) + '/' + f'{task}_model'
    os.makedirs(model_dir, exist_ok=True)

    print('Reading training list file...')
    ref_labels_dict, (train_fnames, val_fnames, train_labels, val_labels) =\
        get_train_val_data(inp_txt)
    with open(model_dir + '/label_ids.pkl', 'wb') as f:
        pickle.dump(ref_labels_dict, f)

    print('Creating PyTorch datasets...')
    train_dataset = AudioDataset(train_fnames, train_labels, melspec_dir)
    val_dataset = AudioDataset(val_fnames, val_labels, melspec_dir, False,
                               train_dataset.mean, train_dataset.std)

    mean, std = train_dataset.mean, train_dataset.std
    with open(model_dir + '/mean_std.pkl', 'wb') as f:
        pickle.dump((mean, std), f)

    # Two independently shuffled loaders provide the random pairs for mixup.
    train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    num_classes = CONFIG[task]['num_classes']

    model = MirexModel(num_classes)

    # Define optimizer, scheduler and loss criteria
    optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)
    criterion = nn.CrossEntropyLoss()

    cuda = False  # set True to train on a GPU
    device = torch.device('cuda:0' if cuda else 'cpu')
    print('Device: ', device)
    model = model.to(device)

    epochs = 100
    train_loss_hist = []
    valid_loss_hist = []
    lowest_val_loss = np.inf
    epochs_without_new_lowest = 0

    print('Training...')
    for i in range(epochs):

        start_time = time.time()
        this_epoch_train_loss = 0
        for i1, i2 in zip(train_loader_1, train_loader_2):

            # mixup---------
            x1, y1 = i1
            x2, y2 = i2

            alpha = 1
            mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])

            mvals = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
            inputs = (mvals * x1) + ((1 - mvals) * x2)

            y1_onehot = torch.nn.functional.one_hot(y1, num_classes).float()
            y2_onehot = torch.nn.functional.one_hot(y2, num_classes).float()
            mvals = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1))
            labels = (mvals * y1_onehot) + ((1 - mvals) * y2_onehot)
            # mixup ends ----------

            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                model = model.train()
                outputs = model(inputs)
                loss = mixup_cross_entropy_loss(outputs, labels)
                loss.backward()
                optimizer.step()
                this_epoch_train_loss += loss.detach().cpu().numpy()

        this_epoch_valid_loss = 0
        model = model.eval()
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            with torch.set_grad_enabled(False):
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                this_epoch_valid_loss += loss.detach().cpu().numpy()

        this_epoch_train_loss /= len(train_loader_1)
        this_epoch_valid_loss /= len(val_loader)

        train_loss_hist.append(this_epoch_train_loss)
        valid_loss_hist.append(this_epoch_valid_loss)

        if this_epoch_valid_loss < lowest_val_loss:
            lowest_val_loss = this_epoch_valid_loss
            torch.save(model.state_dict(), f'{model_dir}/best_model.pth')
            epochs_without_new_lowest = 0
        else:
            epochs_without_new_lowest += 1

        print(f'Epoch: {i+1}\ttrain_loss: {this_epoch_train_loss}\tval_loss: {this_epoch_valid_loss}\ttime: {(time.time()-start_time):.0f}s')

        # Early stopping: quit after 25 epochs without a new best.
        if epochs_without_new_lowest >= 25:
            break

        scheduler.step(this_epoch_valid_loss)

    return model_dir
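The mixup block in this training loop is the core idea: per-sample Beta(alpha, alpha) weights interpolate both the inputs and the one-hot targets. A self-contained sketch of just that step on toy tensors (the 4-D spectrogram batch shape is an assumption matching the reshape above):

import numpy as np
import torch
import torch.nn.functional as F

batch, num_classes = 4, 10
x1, x2 = torch.randn(batch, 1, 32, 32), torch.randn(batch, 1, 32, 32)
y1 = torch.randint(num_classes, (batch,))
y2 = torch.randint(num_classes, (batch,))

lam = torch.from_numpy(np.random.beta(1.0, 1.0, batch)).float()

# Broadcast the per-sample weights over the input and label shapes.
inputs = lam.view(-1, 1, 1, 1) * x1 + (1 - lam.view(-1, 1, 1, 1)) * x2
labels = (lam.view(-1, 1) * F.one_hot(y1, num_classes).float()
          + (1 - lam.view(-1, 1)) * F.one_hot(y2, num_classes).float())

print(inputs.shape, labels.shape)  # [4, 1, 32, 32] and [4, 10]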
Example #13
    SEED = 42

    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    preprocess = DataPreprocess(config['DataPreprocess']['sr'],
                                config['DataPreprocess']['max_length'],
                                config['DataPreprocess']['classes_mapping'])
    preprocess.get_train_data()
    preprocess.get_test_data()

    train_dataset = AudioDataset(path_to_sound_files='data/train/audio',
                                 path_to_csv='data/train/meta/train.csv')
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

    test_dataset = AudioDataset(path_to_sound_files='data/test/audio',
                                path_to_csv='data/test/meta/test.csv')
    test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = Resnet18Multi(config['Model']['num_classes'])
    model.to(device)

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    logging.info('Total num of model parameters: {}'.format(pytorch_total_params))

    optimizer = torch.optim.Adam(model.parameters(), lr=config['Model']['learning_rate'])
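The seeding block at the top of this example is commonly wrapped into a helper; Example #10 above calls exactly such a set_seed(cfg.seed). A minimal sketch under that assumption:

import random

import numpy as np
import torch


def set_seed(seed: int) -> None:
    """Seed every RNG the training loop touches, for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True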
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--log_dir',
                        type=str,
                        default='logs',
                        help='output log directory')
    parser.add_argument('--feature',
                        type=str,
                        choices=['melgram', 'mfcc'],
                        default='mfcc',
                        help='feature')
    parser.add_argument('--model_type',
                        type=str,
                        choices=['alex1d', 'alex2d', 'lstm', 'resnet'],
                        default='alex2d',
                        help='convolution type')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='training and valid batch size')
    parser.add_argument('--valid_ratio',
                        type=float,
                        default=0.1,
                        help='the ratio of validation data')
    parser.add_argument('--epochs',
                        type=int,
                        default=32,
                        help='number of epochs to train')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='learning rate')
    parser.add_argument('--seed', type=int, default=1234, help='random seed')

    args = parser.parse_args()

    print('log_dir:', args.log_dir)
    print('feature:', args.feature)
    print('model_type:', args.model_type)
    print('batch_size:', args.batch_size)
    print('valid_ratio:', args.valid_ratio)
    print('epochs:', args.epochs)
    print('lr:', args.lr)
    print('seed:', args.seed)

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if cuda:
        torch.cuda.manual_seed(args.seed)

    # Load the data lists as DataFrames
    train_df = pd.read_csv('./data/train.csv')
    test_df = pd.read_csv('./data/sample_submission.csv')

    # Convert the DataFrame labels to integer indices
    le = LabelEncoder()
    le.fit(np.unique(train_df.label))
    train_df['label_idx'] = le.transform(train_df['label'])
    num_classes = len(le.classes_)

    # Load the datasets
    # (with test=True, labels are not loaded)
    train_dataset = AudioDataset(train_df,
                                 './data/audio_train',
                                 feature=args.feature,
                                 model_type=args.model_type)

    test_dataset = AudioDataset(test_df,
                                './data/audio_test',
                                test=True,
                                feature=args.feature,
                                model_type=args.model_type)

    # Randomly split the training data into train and validation sets.
    # The seed is fixed so that CV ensembles are reproducible later.
    num_train = len(train_dataset)

    indices = list(range(num_train))
    split = int(args.valid_ratio * num_train)
    np.random.shuffle(indices)
    train_idx, valid_idx = indices[split:], indices[:split]

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               args.batch_size,
                                               sampler=train_sampler,
                                               num_workers=num_workers)

    # The validation data reuses a subset of train_dataset
    val_loader = torch.utils.data.DataLoader(train_dataset,
                                             args.batch_size,
                                             sampler=valid_sampler,
                                             num_workers=num_workers)

    # Read the test data in DataFrame order,
    # so use shuffle=False
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              args.batch_size,
                                              shuffle=False)

    # build model
    if args.model_type == 'alex2d':
        model = AlexNet2d(num_classes).to(device)
    elif args.model_type == 'alex1d':
        model = AlexNet1d(num_classes).to(device)
    elif args.model_type == 'lstm':
        model = ConvLSTM(num_classes).to(device)
    elif args.model_type == 'resnet':
        model = ResNet([2, 2, 2, 2]).to(device)
    else:
        print('Invalid model_type: %s' % args.model_type)
        exit(1)

    print(model)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    scheduler = CyclicLR(optimizer,
                         base_lr=0.0001,
                         max_lr=0.01,
                         step_size=10,
                         mode="exp_range")

    # Keep a history of the learning rate (for visualization)
    lr_list = []

    best_acc = 0.0
    best_model = None
    writer = SummaryWriter(args.log_dir)

    for epoch in range(1, args.epochs + 1):
        loss, acc = train(train_loader, model, criterion, optimizer)
        val_loss, val_acc = valid(val_loader, model, criterion)

        lr_list.append(scheduler.get_lr()[0])
        scheduler.step()

        # logging
        writer.add_scalar('train/loss', loss, epoch)
        writer.add_scalar('train/acc', acc, epoch)
        writer.add_scalar('valid/loss', val_loss, epoch)
        writer.add_scalar('valid/acc', val_acc, epoch)

        print(
            'Epoch [%d/%d] loss: %.5f acc: %.5f val_loss: %.5f val_acc: %.5f' %
            (epoch, args.epochs, loss, acc, val_loss, val_acc))

        if val_acc > best_acc:
            print('val_acc improved from %.5f to %.5f' % (best_acc, val_acc))
            best_acc = val_acc

            # remove the old model file
            if best_model is not None:
                os.remove(best_model)

            best_model = os.path.join(
                args.log_dir,
                'epoch%03d-%.3f-%.3f.pth' % (epoch, val_loss, val_acc))
            torch.save(model.state_dict(), best_model)

    # Evaluate the test data with the best model.
    # Also save the raw model outputs for later ensembling.
    print('best_model:', best_model)
    model.load_state_dict(
        torch.load(best_model, map_location=lambda storage, loc: storage))
    predictions = test(test_loader, model)
    np.save(os.path.join(args.log_dir, 'predictions.npy'),
            predictions.cpu().numpy())

    # Convert to the labels with the top-3 scores
    _, indices = predictions.topk(3)  # (N, 3)
    # Map indices back to label strings
    predicted_labels = le.classes_[indices]
    predicted_labels = [' '.join(lst) for lst in predicted_labels]
    test_df['label'] = predicted_labels
    test_df.to_csv(os.path.join(args.log_dir, 'submission.csv'), index=False)
Example #15
    vqvae_model = args.vqvae_model
    melgan_model = args.melgan_model
    save_path = args.save_path

    #os.remove(wav_save_path)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    #hps.seg_len = 16000 * 10
    eval_mode = "both"

    if eval_mode in ['vqvae', 'both']:
        encoding_dataset = AudioDataset(
            audio_files=Path(data_path) / "eval_files.txt",
            segment_length=2048 * 126,
            sampling_rate=16000,
            mode='reconst',
            augment=False,
            load_speech_id=True)
        #dataset = AudioDataset(audio_files=Path(data_path) / "eval_files.txt", segment_length=2048 * 126, sampling_rate=16000, mode=data_mode, augment=False, load_speech_id=True)
        num_src_speaker = AudioDataset(
            audio_files=Path(data_path) / "rec_train_files.txt",
            segment_length=2048 * 126,
            sampling_rate=16000,
            mode='reconst',
            augment=False).get_speaker_num()

    if eval_mode in ['melgan', 'both']:
        convert_dataset = AudioDataset(
            audio_files=Path(data_path) / "synthesis.txt",
            segment_length=2048 * 126,
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('log_dir', type=str, help='input log directory')
    parser.add_argument('model_file', type=str, help='input model file')
    parser.add_argument('--feature',
                        type=str,
                        choices=['melgram', 'mfcc'],
                        default='mfcc',
                        help='feature')
    parser.add_argument('--model_type',
                        type=str,
                        choices=['alex1d', 'alex2d', 'lstm', 'resnet'],
                        default='alex2d',
                        help='convolution type of the model')
    args = parser.parse_args()

    print('log_dir:', args.log_dir)
    print('model_file:', args.model_file)
    print('feature:', args.feature)
    print('model_type:', args.model_type)

    # load dataset
    train_df = pd.read_csv('./data/train.csv')
    test_df = pd.read_csv('./data/sample_submission.csv')

    le = LabelEncoder()
    le.fit(np.unique(train_df.label))
    train_df['label_idx'] = le.transform(train_df['label'])
    num_classes = len(le.classes_)

    test_dataset = AudioDataset(test_df,
                                './data/audio_test',
                                test=True,
                                feature=args.feature,
                                model_type=args.model_type)

    test_loader = torch.utils.data.DataLoader(test_dataset, 128, shuffle=False)

    # load model
    if args.model_type == 'alex2d':
        model = AlexNet2d(num_classes).to(device)
    elif args.model_type == 'alex1d':
        model = AlexNet1d(num_classes).to(device)
    elif args.model_type == 'lstm':
        model = ConvLSTM(num_classes).to(device)
    elif args.model_type == 'resnet':
        model = ResNet([2, 2, 2, 2]).to(device)
    else:
        print('Invalid model_type: %s' % args.model_type)
        exit(1)

    print(model)

    # Load the trained model weights
    model.load_state_dict(
        torch.load(args.model_file, map_location=lambda storage, loc: storage))

    # test time augmentation
    tta_predictions = test_time_augmentation(test_loader, model, num_aug=5)
    np.save(os.path.join(args.log_dir, 'tta_predictions.npy'),
            tta_predictions.cpu().numpy())

    # Convert to the labels with the top-3 scores
    _, indices = tta_predictions.topk(3)
    predicted_labels = le.classes_[indices]
    predicted_labels = [' '.join(lst) for lst in predicted_labels]
    test_df['label'] = predicted_labels
    test_df.to_csv(os.path.join(args.log_dir, 'tta_submission.csv'),
                   index=False)
Example #17
def train(epochs, batchsize, data_path, modeldir, cls_num, duration):
    # Dataset definition
    dataset = AudioDataset(data_path)
    collator = AudioCollator(cls_num)

    # Model & Optimizer definition
    generator = Generator(cls_num=cls_num)
    generator.cuda()
    generator.train()
    gen_opt = torch.optim.Adam(generator.parameters(),
                               lr=0.0002,
                               betas=(0.5, 0.999))

    discriminator = Discriminator(cls_num)
    discriminator.cuda()
    discriminator.train()
    dis_opt = torch.optim.Adam(discriminator.parameters(),
                               lr=0.0002,
                               betas=(0.5, 0.999))

    # Writer definition
    writer = tbx.SummaryWriter()

    iterations = 0

    for epoch in range(epochs):
        dataloader = DataLoader(dataset,
                                batch_size=batchsize,
                                shuffle=True,
                                collate_fn=collator,
                                drop_last=True)
        dataloader = tqdm(dataloader)

        for i, data in enumerate(dataloader):
            iterations += 1
            x_sp, x_label, y_label = data
            x_to_y = torch.cat([y_label, x_label], dim=1)
            y_to_x = torch.cat([x_label, y_label], dim=1)
            x_to_x = torch.cat([x_label, x_label], dim=1)

            # Discriminator update
            y_fake = generator(x_sp, x_to_y)

            # Adversarial loss
            dis_loss_real, dis_loss_fake = adversarial_loss_dis(
                discriminator, y_fake, x_sp, x_to_y, y_to_x)
            dis_loss = dis_loss_real + dis_loss_fake

            dis_opt.zero_grad()
            dis_loss.backward()
            dis_opt.step()

            write(writer, "dis_loss_real", dis_loss_real, iterations)
            write(writer, "dis_loss_fake", dis_loss_fake, iterations)

            # Generator update
            y_fake = generator(x_sp, x_to_y)
            x_fake = generator(y_fake, y_to_x)
            x_identity = generator(x_sp, x_to_x)

            # Adversarial loss
            gen_loss_fake = adversarial_loss_gen(discriminator, y_fake, x_to_y)

            # Cycle-consistency loss
            cycle_loss = cycle_consistency_loss(x_fake, x_sp)

            # Identity-mapping loss (applied only for the first `duration` epochs)
            if epoch < duration:
                identity_loss = identity_mapping_loss(x_identity, x_sp)
            else:
                identity_loss = torch.tensor(0.0)

            gen_loss = gen_loss_fake + cycle_loss + identity_loss

            gen_opt.zero_grad()
            gen_loss.backward()
            gen_opt.step()

            write(writer, "gen_loss_fake", gen_loss_fake, iterations)
            write(writer, "cycle_loss", cycle_loss, iterations)
            write(writer, "identity_loss", identity_loss, iterations)

            print(f"iteration: {iterations}")
            print(
                f"dis loss real: {dis_loss_real} dis loss fake: {dis_loss_fake}"
            )
            print(
                f"gen loss fake: {gen_loss_fake} cycle loss: {cycle_loss} identity loss: {identity_loss}"
            )

            # Save a checkpoint once per epoch (at the first batch).
            if i == 0:
                torch.save(generator.state_dict(),
                           f"{modeldir}/generator_{epoch}.model")
Example #18
            model, optimizer, scheduler = loadModel(model, adam, scheduler,
                                                    fileName, stage,
                                                    startEpoch == 1)
            if startEpoch != 1:
                print(
                    f"Successfully loaded model with last completed epoch as {startEpoch-1}"
                )

        else:
            raise Exception("No such file exists")
    else:
        model = Lipreader(stage)
        adam = optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.)
        scheduler = lrScheduler.LambdaLR(adam, lr_lambda=[updateLRFunc])

    trainDataset = AudioDataset("train")
    trainDataLoader = DataLoader(trainDataset,
                                 batch_size=config.data["batchSize"],
                                 shuffle=config.data["shuffle"],
                                 num_workers=config.data["workers"])
    validationDataset = AudioDataset("val")
    validationDataLoader = DataLoader(validationDataset,
                                      batch_size=config.data["batchSize"],
                                      shuffle=config.data["shuffle"],
                                      num_workers=config.data["workers"])

    trainCriterion = nn.CrossEntropyLoss() if isinstance(
        model.Backend, TemporalCNN) else NLLSequenceLoss()

    validationCriterion = temporalCNNValidator if isinstance(
        model.Backend, TemporalCNN) else gruValidator
Example #19
def recognize(args):
    model, LFR_m, LFR_n = Transformer.load_model(args.model_path)
    print(model)
    model.eval()
    model.cuda()
    char_list, sos_id, eos_id = process_dict(args.dict)
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id
    tr_dataset = AudioDataset('test', args.batch_size)
    path_list = tr_dataset.path_lst
    label_list = tr_dataset.han_lst
    num_data = tr_dataset.path_count
    ran_num = random.randint(0, num_data - 1)

    num = args.count
    words_num = 0
    word_error_num = 0
    seq_error = 0
    data = ''
    with torch.no_grad():
        for index in range(num):
            try:
                print('\nthe ', index + 1, 'th example.')
                data += 'the ' + str(index + 1) + 'th example.\n'
                index = (ran_num + index) % num_data
                standard_label = label_list[index]
                feature, label = get_fbank_and_hanzi_data(
                    index, args.feature_dim, char_list, path_list, label_list)
                if len(feature) > 1600:
                    continue
                # Renamed so it no longer shadows the builtin input().
                padded_input = build_LFR_features(feature, args.LFR_m,
                                                  args.LFR_n)
                padded_input = torch.from_numpy(padded_input).float()
                input_length = torch.tensor([padded_input.size(0)],
                                            dtype=torch.int)
                padded_input = padded_input.cuda()
                nbest_hyps = model.recognize(padded_input, input_length,
                                             char_list, args)
                pred_label = nbest_hyps[0]['yseq'][1:-1]
                pred_res = ''.join([char_list[index] for index in pred_label])
                print("stand:", label)
                print("pred :", pred_label)
                data += "stand:" + str(standard_label) + '\n'
                data += "pred :" + str(pred_res) + '\n'
                words_n = len(label)
                words_num += words_n
                word_distance = GetEditDistance(pred_label, label)
                if (word_distance <= words_n):
                    word_error_num += word_distance
                else:
                    word_error_num += words_n

                if pred_label != label:
                    seq_error += 1
            except ValueError:
                continue
    # These are accuracies (1 - error rate), so label them accordingly.
    print('Word accuracy = ', (1 - word_error_num / words_num) * 100, '%')
    print('Sentence accuracy = ', (1 - seq_error / args.count) * 100, '%')
    data += 'Word accuracy = ' + str((1 - word_error_num / words_num) * 100) + '%\n'
    data += 'Sentence accuracy = ' + str((1 - seq_error / args.count) * 100) + '%'
    with open('../../model_log/pred/test_' + str(args.count) + '.txt',
              'w',
              encoding='utf-8') as f:
        f.write(data)  # data is a single string, not a list of lines
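As the corrected labels above note, these final metrics are accuracies; a genuine word or character error rate divides the edit distance by the reference length. A minimal sketch, assuming GetEditDistance is a plain Levenshtein distance like this one:

def levenshtein(ref, hyp):
    """Edit distance between two sequences, using a single rolling row."""
    dp = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hyp, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1,        # deletion
                                     dp[j - 1] + 1,    # insertion
                                     prev + (r != h))  # substitution
    return dp[-1]

ref, hyp = list("你好世界"), list("你好时节")
cer = levenshtein(ref, hyp) / len(ref)  # character error rate
print(f"CER = {cer * 100:.1f}%")        # CER = 50.0%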