num_features = 64
    eps = 2**-24
    if args.model == 'crnn':
        # CRNN supports only 32 features
        num_features = 32
        eps = 1e-20

    duration = 5.0  # s
    sample_rate = 16000
    print("recording %0.1fs audio..." % duration)
    recorded_audio = sd.rec(int(duration * sample_rate),
                            samplerate=sample_rate,
                            channels=1,
                            blocking=True)
    print("recorded, replaying it before doing speech recognition...")
    sd.play(recorded_audio, samplerate=sample_rate, blocking=True)

    transform = Compose([
        ComputeMagSpectrogram(),
        ComputeMelSpectrogramFromMagSpectrogram(num_features=num_features,
                                                normalize=args.normalize,
                                                eps=eps)
    ])

    transcribe(
        transform({
            'samples': recorded_audio,
            'sample_rate': sample_rate,
            'text': ''
        }), num_features, args)
        LibriSpeech(name='train-clean-100'),
        LibriSpeech(name='train-clean-360'),
        LibriSpeech(name='train-other-500'),
        LibriSpeech(name='dev-clean',)
    ])
elif args.dataset == 'backgroundsounds':
    from datasets.background_sounds import BackgroundSounds
    dataset = BackgroundSounds(is_random=False)
elif args.dataset == 'bolorspeech':
    from datasets.bolor_speech import BolorSpeech
    dataset = ConcatDataset([
        BolorSpeech(name='train'),
        BolorSpeech(name='train2'),
        BolorSpeech(name='test'),
        BolorSpeech(name='demo'),
        BolorSpeech(name='annotation'),
        BolorSpeech(name='annotation-1111')
    ])
else:
    print("unknown dataset!")
    import sys
    sys.exit(1)


# Run every dataset item through LoadAudio -> ComputeMagSpectrogram and store
# the resulting 'input' array with np.save, at the item's 'fname' path with
# '.wav' replaced by '.npy'.
transform = Compose([LoadAudio(), ComputeMagSpectrogram()])
for sample in tqdm(dataset):
    # Derive the output path from the untransformed item.
    npy_path = sample['fname'].replace('.wav', '.npy')
    features = transform(sample)['input']
    np.save(npy_path, features)
Ejemplo n.º 3
0
                    default=0.0003,
                    help='learning rate for optimization')
args = parser.parse_args()

# Enable cuDNN benchmark mode only when a CUDA device is present.
use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    torch.backends.cudnn.benchmark = True

# Pick the dataset implementation (and its vocab / idx2char tables) from the
# --dataset option; anything other than 'librispeech' falls back to MBSpeech.
if args.dataset == 'librispeech':
    from datasets.libri_speech import LibriSpeech as SpeechDataset, vocab, idx2char
else:
    from datasets.mb_speech import MBSpeech as SpeechDataset, vocab, idx2char

# Two views of the same dataset class: the training view adds SpeedChange()
# to its transform pipeline, the validation view does not.
train_dataset = SpeechDataset(
    transform=Compose([LoadAudio(
    ), SpeedChange(), ExtractSpeechFeatures()]))
valid_dataset = SpeechDataset(
    transform=Compose([LoadAudio(), ExtractSpeechFeatures()]))
# Split by index: the last `args.batch_size` items go to valid_sampler, all
# earlier items to train_sampler.
indices = list(range(len(train_dataset)))
train_sampler = SubsetRandomSampler(indices[:-args.batch_size])
valid_sampler = SubsetRandomSampler(indices[-args.batch_size:])

# shuffle stays False because a sampler is supplied; the random order comes
# from SubsetRandomSampler instead.
train_data_loader = DataLoader(train_dataset,
                               batch_size=args.batch_size,
                               shuffle=False,
                               collate_fn=collate_fn,
                               num_workers=args.dataload_workers_nums,
                               sampler=train_sampler)
valid_data_loader = DataLoader(valid_dataset,
                               batch_size=args.batch_size,
                               shuffle=False,
                        type=str,
                        required=False,
                        help='link to KenLM 5-gram binary language model')
    parser.add_argument("--alpha",
                        type=float,
                        default=0.3,
                        help='alpha for CTC decode')
    parser.add_argument("--beta",
                        type=float,
                        default=1.85,
                        help='beta for CTC decode')
    args = parser.parse_args()

    duration = 5.0  #s
    sample_rate = 16000
    print("recording %0.1fs audio..." % duration)
    recorded_audio = sd.rec(int(duration * sample_rate),
                            samplerate=sample_rate,
                            channels=1,
                            blocking=True)
    print("recorded, replaying it before doing speech recognition...")
    sd.play(recorded_audio, samplerate=sample_rate, blocking=True)

    data = {'samples': recorded_audio, 'sample_rate': sample_rate, 'text': ''}
    data = Compose([ExtractSpeechFeatures()])(data)

    result = transcribe(data, args)

    print("Predicted:")
    print(result)
Ejemplo n.º 5
0
                                 alpha=args.alpha, beta=args.beta, cutoff_top_n=40, cutoff_prob=1.0, beam_width=1000)
    else:
        decoder = GreedyDecoder(labels=vocab)

    t = time.time()
    decoded_output, _ = decoder.decode(outputs)
    print("decode time: %.3fs" % (time.time() - t))

    return decoded_output[0][0]


if __name__ == '__main__':
    # Command-line entry point: transcribe the WAV file given as the positional
    # `audio` argument and print the result returned by transcribe().
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument("--checkpoint", type=str, required=True,
                     help='checkpoint file to test')
    cli.add_argument("--lm", type=str, required=False,
                     help='link to KenLM 5-gram binary language model')
    cli.add_argument("--alpha", type=float, default=0.3,
                     help='alpha for CTC decode')
    cli.add_argument("--beta", type=float, default=1.85,
                     help='beta for CTC decode')
    cli.add_argument("audio", help='a WAV file')
    args = cli.parse_args()

    # The sample starts as just the file path and an empty transcript; the
    # LoadAudio -> ExtractSpeechFeatures pipeline turns it into model input.
    pipeline = Compose([LoadAudio(), ExtractSpeechFeatures()])
    features = pipeline({'fname': args.audio, 'text': ''})

    result = transcribe(features, args)

    print("Predicted:")
    print(result)
Ejemplo n.º 6
0
from datasets import Compose, ComputeMelSpectrogram
from transcribe import transcribe


if __name__ == '__main__':
    # Command-line entry point: record a short clip from the default input
    # device, play it back, then hand its mel spectrogram to transcribe().
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument("--checkpoint", type=str, required=True,
                     help='checkpoint file to test')
    cli.add_argument("--model", choices=['jasper', 'w2l', 'crnn'], default='w2l',
                     help='choices of neural network')
    cli.add_argument("--lm", type=str, required=False,
                     help='link to KenLM 5-gram binary language model')
    cli.add_argument("--alpha", type=float, default=0.3,
                     help='alpha for CTC decode')
    cli.add_argument("--beta", type=float, default=1.85,
                     help='beta for CTC decode')
    args = cli.parse_args()

    sample_rate = 16000
    duration = 5.0  # s
    frame_count = int(duration * sample_rate)

    print("recording %0.1fs audio..." % duration)
    recorded_audio = sd.rec(frame_count,
                            samplerate=sample_rate,
                            channels=1,
                            blocking=True)
    print("recorded, replaying it before doing speech recognition...")
    sd.play(recorded_audio, samplerate=sample_rate, blocking=True)

    # Wrap the raw samples in the dict layout the transform expects, with an
    # empty transcript, and compute the mel spectrogram.
    to_mel = Compose([ComputeMelSpectrogram()])
    sample = to_mel({
        'samples': recorded_audio,
        'sample_rate': sample_rate,
        'text': '',
    })

    transcribe(sample, args)
def Train(train_root, train_csv, test_csv):
    """Train the 4-class breast density classifier and checkpoint the best models.

    Hyper-parameters are read from ``parse_args()``. After every epoch the
    model is evaluated on the test split; a checkpoint is written to
    ``<params stem>_loss.<ext>`` when the test loss improves and to
    ``<params stem>_acc.<ext>`` when the test accuracy improves, both inside
    ``args.checkpoint``.

    Args:
        train_root: Dataset root handed to ``breast_classify_inbreast`` for
            both the training and the test split.
        train_csv: CSV file describing the training samples.
        test_csv: CSV file describing the test samples.

    Raises:
        ValueError: If ``args.lr_policy`` is not 'StepLR', 'PolyLR' or 'None'.
    """
    # parameters
    args = parse_args()
    record_params(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_order
    torch.manual_seed(args.torch_seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.torch_seed)
    np.random.seed(args.torch_seed)
    random.seed(args.torch_seed)

    if args.cudnn == 0:
        cudnn.benchmark = False
    else:
        cudnn.benchmark = True
        cudnn.deterministic = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    num_classes = 4
    net = build_model(args.model_name, num_classes, args.pretrain)

    # resume
    name_parts = args.params_name.split('.')
    checkpoint_name_loss = os.path.join(
        args.checkpoint, name_parts[0] + '_loss.' + name_parts[-1])
    checkpoint_name_acc = os.path.join(
        args.checkpoint, name_parts[0] + '_acc.' + name_parts[-1])
    if args.resume != 0:
        logging.info('Resuming from checkpoint...')
        # map_location lets a checkpoint written on a GPU machine be resumed
        # on a CPU-only one (and vice versa).
        checkpoint = torch.load(checkpoint_name_loss, map_location=device)
        best_loss = checkpoint['loss']
        best_acc = checkpoint['acc']
        start_epoch = checkpoint['epoch']
        history = checkpoint['history']
        net.load_state_dict(checkpoint['net'])
    else:
        best_loss = float('inf')
        best_acc = 0.0
        start_epoch = 0
        history = {
            'train_loss': [],
            'train_acc': [],
            'test_loss': [],
            'test_acc': []
        }
    end_epoch = start_epoch + args.num_epoch

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        net = nn.DataParallel(net)
    net.to(device)

    # data
    img_size = args.img_size
    ## train: resize plus random flips/rotation
    train_aug = Compose([
        Resize(size=(img_size, img_size)),
        RandomHorizontallyFlip(),
        RandomVerticallyFlip(),
        RandomRotate(90),
        ToTensor(),
        Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])
    ## test: deterministic preprocessing only
    test_aug = Compose([
        Resize(size=(img_size, img_size)),
        ToTensor(),
        Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])

    train_dataset = breast_classify_inbreast(root=train_root,
                                             csv_file=train_csv,
                                             transform=train_aug)
    test_dataset = breast_classify_inbreast(root=train_root,
                                            csv_file=test_csv,
                                            transform=test_aug)

    # Despite its name, `weighted_sampling` only switches on per-class loss
    # weights; the loader itself is the same in both cases.
    if args.weighted_sampling == 1:
        weights = torch.FloatTensor([1.0, 1.0, 1.5, 5.0]).to(device)
    else:
        weights = None
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              num_workers=4,
                              shuffle=True)
    # Evaluation order is fixed so that the mean-of-batch-means test loss is
    # comparable between epochs when picking the best checkpoint.
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             num_workers=4,
                             shuffle=False)

    # loss function, optimizer and scheduler
    # `reduction='mean'` is the non-deprecated spelling of size_average=True.
    criterion = nn.NLLLoss(weight=weights, reduction='mean').to(device)

    optimizer = Adam(net.parameters(), lr=args.lr, amsgrad=True)

    ## scheduler
    if args.lr_policy == 'StepLR':
        scheduler = StepLR(optimizer, step_size=30, gamma=0.5)
    elif args.lr_policy == 'PolyLR':
        scheduler = PolyLR(optimizer, max_epoch=end_epoch, power=0.9)
    elif args.lr_policy == 'None':
        scheduler = None
    else:
        # Fail early with a clear message instead of a NameError on the
        # unbound scheduler at the first epoch.
        raise ValueError('Unsupported lr_policy: {!r}'.format(args.lr_policy))

    # training process
    logging.info('Start Training For Breast Density Classification')
    for epoch in range(start_epoch, end_epoch):
        ts = time.time()

        # train
        net.train()
        train_loss = 0.
        train_acc = 0.

        for inputs, targets in tqdm(train_loader, total=len(train_loader)):
            inputs = inputs.to(device)
            targets = targets.to(device).long()
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(F.log_softmax(outputs, dim=1), targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            # Number of correct predictions in this batch.
            train_acc += (outputs.argmax(dim=1) == targets).sum().item()

        # Step the LR schedule only after this epoch's optimizer updates;
        # stepping first skips the initial learning rate on PyTorch >= 1.1.
        if scheduler is not None:
            scheduler.step()

        # Loss is the mean of per-batch means; accuracy is per sample.
        train_loss_epoch = train_loss / max(len(train_loader), 1)
        train_acc_epoch = train_acc / len(train_loader.dataset)
        history['train_loss'].append(train_loss_epoch)
        history['train_acc'].append(train_acc_epoch)

        # test
        net.eval()
        test_loss = 0.
        test_acc = 0.

        with torch.no_grad():
            for inputs, targets in tqdm(test_loader, total=len(test_loader)):
                inputs = inputs.to(device)
                targets = targets.to(device).long()
                outputs = net(inputs)
                loss = criterion(F.log_softmax(outputs, dim=1), targets)
                test_acc += (outputs.argmax(dim=1) == targets).sum().item()
                test_loss += loss.item()

        test_loss_epoch = test_loss / max(len(test_loader), 1)
        test_acc_epoch = test_acc / len(test_loader.dataset)
        history['test_loss'].append(test_loss_epoch)
        history['test_acc'].append(test_acc_epoch)

        time_cost = time.time() - ts
        logging.info(
            'epoch[%d/%d]: train_loss: %.3f | train_acc: %.3f | test_loss: %.3f | test_acc: %.3f || time: %.1f'
            % (epoch + 1, end_epoch, train_loss_epoch, train_acc_epoch,
               test_loss_epoch, test_acc_epoch, time_cost))

        # save checkpoint
        improved_loss = test_loss_epoch < best_loss
        improved_acc = test_acc_epoch > best_acc
        if improved_loss or improved_acc:
            # Save the bare model's weights so the checkpoint loads without
            # the DataParallel wrapper.
            save_model = net.module if isinstance(net, nn.DataParallel) else net
            state = {
                'net': save_model.state_dict(),
                'loss': test_loss_epoch,
                'acc': test_acc_epoch,
                'epoch': epoch + 1,
                'history': history
            }

        if improved_loss:
            logging.info('Loss checkpoint Saving...')
            torch.save(state, checkpoint_name_loss)
            best_loss = test_loss_epoch

        if improved_acc:
            logging.info('Acc checkpoint Saving...')
            torch.save(state, checkpoint_name_acc)
            best_acc = test_acc_epoch
def _evaluate_breast_seg(net, loader, criterion, device):
    """Return ``(loss, dice)`` of ``net`` over ``loader`` without gradients.

    The loss is the sample-weighted mean of the per-batch criterion value.
    The dice value is the sum of ``Dice_fn`` over batches divided by the
    number of samples seen. The caller is responsible for putting ``net`` in
    the desired train/eval mode.
    """
    loss_total = 0.
    dice_total = 0.
    sample_count = 0
    with torch.no_grad():
        for imgs, _, targets in tqdm(loader, total=len(loader)):
            imgs = imgs.to(device)
            targets = targets.to(device)
            outputs = net(imgs)
            loss = criterion(outputs, targets).mean()
            batch_size = imgs.shape[0]
            sample_count += batch_size
            loss_total += loss.item() * batch_size
            # NOTE(review): dividing by the sample count below assumes Dice_fn
            # returns a per-batch sum; if it returns a batch mean the reported
            # dice is understated - confirm against Dice_fn.
            dice_total += Dice_fn(outputs, targets).item()
    return loss_total / float(sample_count), dice_total / float(sample_count)


def Train(train_root, train_csv, test_csv):
    """Train the 2-class breast segmentation model and keep the best checkpoint.

    Hyper-parameters are read from ``parse_args()``. After every epoch the
    model is evaluated (in eval mode) on both the test and the training
    loader; whenever the training dice improves, a checkpoint named
    ``<model>_r<repetition>_besttraindice.pkl`` is written to
    ``args.checkpoint``.

    Args:
        train_root: Dataset root handed to ``breast_seg`` for both splits.
        train_csv: CSV file describing the training samples.
        test_csv: CSV file describing the test samples.

    Raises:
        ValueError: If ``args.loss`` is not 'ce', 'dice' or 'cedice', or if
            ``args.lr_policy`` is not 'StepLR', 'PolyLR' or 'None'.
    """
    # parameters
    args = parse_args()

    # record
    record_params(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_order
    torch.manual_seed(args.torch_seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.torch_seed)
    np.random.seed(args.torch_seed)
    random.seed(args.torch_seed)

    if args.cudnn == 0:
        cudnn.benchmark = False
    else:
        cudnn.benchmark = True
        cudnn.deterministic = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    num_classes = 2

    net = build_model(args.model_name, num_classes)

    params_name = '{}_r{}.pkl'.format(args.model_name, args.repetition)
    # The checkpoint path never changes, so build it once up front.
    savecheckname = os.path.join(
        args.checkpoint,
        params_name.split('.pkl')[0] + '_besttraindice.' +
        params_name.split('.')[-1])
    start_epoch = 0
    history = {
        'train_loss': [],
        'test_loss': [],
        'train_dice': [],
        'test_dice': []
    }
    end_epoch = start_epoch + args.num_epoch

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        net = nn.DataParallel(net)
    net.to(device)

    # data (train and test use the same deterministic preprocessing)
    train_aug = Compose([
        Resize(size=(args.img_size, args.img_size)),
        ToTensor(),
        Normalize(mean=args.data_mean, std=args.data_std)
    ])
    test_aug = Compose([
        Resize(size=(args.img_size, args.img_size)),
        ToTensor(),
        Normalize(mean=args.data_mean, std=args.data_std)
    ])

    train_dataset = breast_seg(root=train_root,
                               csv_file=train_csv,
                               transform=train_aug)
    test_dataset = breast_seg(root=train_root,
                              csv_file=test_csv,
                              transform=test_aug)

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              num_workers=4,
                              shuffle=True,
                              drop_last=True)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             num_workers=4,
                             shuffle=False)

    # loss function, optimizer and scheduler
    cedice_weight = torch.tensor(args.cedice_weight)
    ceclass_weight = torch.tensor(args.ceclass_weight)
    diceclass_weight = torch.tensor(args.diceclass_weight)

    if args.loss == 'ce':
        criterion = CrossEntropyLoss2d(weight=ceclass_weight).to(device)
    elif args.loss == 'dice':
        criterion = MulticlassDiceLoss(weight=diceclass_weight).to(device)
    elif args.loss == 'cedice':
        criterion = CEMDiceLoss(cediceweight=cedice_weight,
                                ceclassweight=ceclass_weight,
                                diceclassweight=diceclass_weight).to(device)
    else:
        # Fail here instead of hitting a NameError on `criterion` at the
        # first training batch.
        raise ValueError('Do not have this loss: {!r}'.format(args.loss))

    optimizer = Adam(net.parameters(), lr=args.lr, amsgrad=True)

    ## scheduler
    if args.lr_policy == 'StepLR':
        scheduler = StepLR(optimizer, step_size=30, gamma=0.5)
    elif args.lr_policy == 'PolyLR':
        scheduler = PolyLR(optimizer, max_epoch=end_epoch, power=0.9)
    elif args.lr_policy == 'None':
        scheduler = None
    else:
        # Same reasoning as for the loss: avoid an unbound `scheduler` later.
        raise ValueError('Unsupported lr_policy: {!r}'.format(args.lr_policy))

    # training process
    logging.info('Start Training For Breast Seg')

    besttraindice = 0.

    for epoch in range(start_epoch, end_epoch):

        ts = time.time()

        net.train()

        for imgs, _, targets in tqdm(train_loader, total=len(train_loader)):
            imgs = imgs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            outputs = net(imgs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        # Both splits are scored in eval mode after the weight updates, so the
        # train metrics describe the end-of-epoch model.
        net.eval()
        test_loss_epoch, test_dice_epoch = _evaluate_breast_seg(
            net, test_loader, criterion, device)
        history['test_loss'].append(test_loss_epoch)
        history['test_dice'].append(test_dice_epoch)

        train_loss_epoch, train_dice_epoch = _evaluate_breast_seg(
            net, train_loader, criterion, device)
        history['train_loss'].append(train_loss_epoch)
        history['train_dice'].append(train_dice_epoch)

        time_cost = time.time() - ts
        logging.info(
            'epoch[%d/%d]: train_loss: %.3f | test_loss: %.3f | train_dice: %.3f | test_dice: %.3f || time: %.1f'
            % (epoch + 1, end_epoch, train_loss_epoch, test_loss_epoch,
               train_dice_epoch, test_dice_epoch, time_cost))

        if scheduler is not None:
            scheduler.step()

        # save checkpoint
        if train_dice_epoch > besttraindice:
            besttraindice = train_dice_epoch
            logging.info('Besttraindice Checkpoint {} Saving...'.format(epoch +
                                                                        1))

            # Save the bare model's weights so the checkpoint loads without
            # the DataParallel wrapper.
            save_model = net.module if isinstance(net, nn.DataParallel) else net
            state = {
                'net': save_model.state_dict(),
                'loss': test_loss_epoch,
                'dice': test_dice_epoch,
                'epoch': epoch + 1,
                'history': history
            }
            torch.save(state, savecheckname)