Example #1
def main():
    model = Tacotron().to(DEVICE)
    print('Model {} is working...'.format(model.name))
    print('{} threads are used...'.format(torch.get_num_threads()))
    ckpt_dir = os.path.join(args.logdir, model.name)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer,
                       step_size=args.lr_decay_step // 10,
                       gamma=0.933)  # 0.933**10 ≈ 0.5: lr roughly halves every lr_decay_step steps

    if not os.path.exists(ckpt_dir):
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    elif not os.path.exists(os.path.join(ckpt_dir, 'ckpt.csv')):
        shutil.rmtree(ckpt_dir)
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    else:
        print('Checkpoint directory already exists. Resuming training.')
        ckpt = pd.read_csv(os.path.join(ckpt_dir, 'ckpt.csv'),
                           sep=',',
                           header=None)
        ckpt.columns = ['models', 'loss']
        ckpt = ckpt.sort_values(by='loss', ascending=True)
        state = torch.load(os.path.join(ckpt_dir, ckpt.models.iloc[0]))  # lowest-loss checkpoint
        model.load_state_dict(state['model'])
        args.global_step = state['global_step']
        optimizer.load_state_dict(state['optimizer'])
        scheduler.load_state_dict(state['scheduler'])

    # model = torch.nn.DataParallel(model, device_ids=list(range(args.no_gpu))).to(DEVICE)

    dataset = SpeechDataset(args.data_path,
                            args.meta_train,
                            model.name,
                            mem_mode=args.mem_mode)
    validset = SpeechDataset(args.data_path,
                             args.meta_eval,
                             model.name,
                             mem_mode=args.mem_mode)
    data_loader = DataLoader(dataset=dataset,
                             batch_size=args.batch_size,
                             shuffle=True,
                             collate_fn=collate_fn,
                             drop_last=True,
                             pin_memory=True)
    valid_loader = DataLoader(dataset=validset,
                              batch_size=args.test_batch,
                              shuffle=False,
                              collate_fn=collate_fn,
                              pin_memory=True)

    writer = SummaryWriter(ckpt_dir)
    train(model,
          data_loader,
          valid_loader,
          optimizer,
          scheduler,
          batch_size=args.batch_size,
          ckpt_dir=ckpt_dir,
          writer=writer)
    return None
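The resume branch above picks the lowest-loss entry from a ckpt.csv index, but the code that writes that index lives in train() and is not shown. A minimal sketch of the save side, assuming the same two-column models,loss layout (the filename pattern and state keys here are illustrative, not taken from the original repo):

import os
import torch

def save_checkpoint(model, optimizer, scheduler, global_step, loss, ckpt_dir):
    fname = 'model-{:03d}k.pth.tar'.format(global_step // 1000)  # illustrative name
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'global_step': global_step},
               os.path.join(ckpt_dir, fname))
    # append a 'models,loss' row so the resume branch can find the best checkpoint
    with open(os.path.join(ckpt_dir, 'ckpt.csv'), 'a') as f:
        f.write('{},{:.6f}\n'.format(fname, loss))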
Example #2
def main():
    G = SSRN().to(DEVICE)
    D = MultiScaleDiscriminator().to(DEVICE)

    print('{} threads are used...'.format(torch.get_num_threads()))
    ckpt_dir = os.path.join(args.logdir, type(G).__name__)
    G_optim = torch.optim.Adam(G.parameters(), lr=args.lr)
    D_optim = torch.optim.Adam(D.parameters(), lr=args.lr)
    # scheduler = MultiStepLR(optimizer, milestones=[100000, 200000], gamma=0.5)

    if not os.path.exists(ckpt_dir):
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    else:
        print('Checkpoint directory already exists. Resuming training.')
        ckpt = sorted(
            glob.glob(
                os.path.join(ckpt_dir,
                             '{}-*k.pth.tar'.format(type(G).__name__))))
        state = torch.load(ckpt[-1])
        args.global_step = state['global_step']
        G.load_state_dict(state['model'])
        G_optim.load_state_dict(state['optimizer'])
        # ckpt = sorted(glob.glob(os.path.join(ckpt_dir, '{}-*k.pth'.format(type(D).__name__))))
        # state = torch.load(ckpt[-1])
        # D.load_state_dict(state['model'])
        # D_optim.load_state_dict(state['optimizer'])

    dataset = SpeechDataset(args.data_path,
                            args.meta_train,
                            type(G).__name__,
                            mem_mode=args.mem_mode)
    validset = SpeechDataset(args.data_path,
                             args.meta_eval,
                             type(G).__name__,
                             mem_mode=args.mem_mode)
    data_loader = DataLoader(dataset=dataset,
                             batch_size=args.batch_size,
                             shuffle=True,
                             collate_fn=collate_fn,
                             drop_last=True,
                             pin_memory=True,
                             num_workers=args.n_workers)
    valid_loader = DataLoader(dataset=validset,
                              batch_size=args.test_batch,
                              shuffle=False,
                              collate_fn=collate_fn)

    writer = SummaryWriter(ckpt_dir)
    train(G,
          D,
          data_loader,
          valid_loader,
          G_optim,
          D_optim,
          batch_size=args.batch_size,
          ckpt_dir=ckpt_dir,
          writer=writer)
    return None
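train() here receives both networks and both optimizers, which implies the usual alternating GAN update. A hedged sketch of a single step, assuming D returns one score tensor and using a hinge loss (the real trainer may return multi-scale outputs and add reconstruction or feature-matching terms):

import torch

def gan_step(G, D, G_optim, D_optim, coarse_in, real_out):
    # discriminator update on real vs. detached fake
    fake = G(coarse_in)
    d_loss = (torch.relu(1.0 - D(real_out)).mean()
              + torch.relu(1.0 + D(fake.detach())).mean())
    D_optim.zero_grad()
    d_loss.backward()
    D_optim.step()

    # generator update: push D's score on the fake up
    g_loss = -D(fake).mean()
    G_optim.zero_grad()
    g_loss.backward()
    G_optim.step()
    return d_loss.item(), g_loss.item()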
Example #3
def main(network):
    if network == 'text2mel':
        model = Text2Mel().to(DEVICE)
    elif network == 'ssrn':
        model = SSRN().to(DEVICE)
    else:
        print("Wrong network; choose one of {'text2mel', 'ssrn'}.")
        return
    print('Model {} is working...'.format(type(model).__name__))
    print('{} threads are used...'.format(torch.get_num_threads()))
    ckpt_dir = os.path.join(args.logdir, type(model).__name__)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = MultiStepLR(optimizer, milestones=[50000, 150000, 300000], gamma=0.5)

    if not os.path.exists(ckpt_dir):
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    else:
        print('Checkpoint directory already exists. Resuming training.')
        ckpt = sorted(glob.glob(os.path.join(ckpt_dir, '*k.pth.tar')))[-1]
        state = torch.load(ckpt)
        model.load_state_dict(state['model'])
        args.global_step = state['global_step']
        optimizer.load_state_dict(state['optimizer'])
        # scheduler.load_state_dict(state['scheduler'])

    # model = torch.nn.DataParallel(model, device_ids=list(range(args.no_gpu))).to(DEVICE)
    if type(model).__name__ == 'Text2Mel':
        if args.ga_mode:
            cfn_train, cfn_eval = t2m_ga_collate_fn, t2m_collate_fn
        else:
            cfn_train, cfn_eval = t2m_collate_fn, t2m_collate_fn
    else:
        cfn_train, cfn_eval = collate_fn, collate_fn

    dataset = SpeechDataset(args.data_path, args.meta_train, type(model).__name__, mem_mode=args.mem_mode, ga_mode=args.ga_mode)
    validset = SpeechDataset(args.data_path, args.meta_eval, type(model).__name__, mem_mode=args.mem_mode)
    data_loader = DataLoader(dataset=dataset, batch_size=args.batch_size,
                             shuffle=True, collate_fn=cfn_train,
                             drop_last=True, pin_memory=True)
    valid_loader = DataLoader(dataset=validset, batch_size=args.test_batch,
                              shuffle=False, collate_fn=cfn_eval, pin_memory=True)
    
    writer = SummaryWriter(ckpt_dir)
    train(model, data_loader, valid_loader, optimizer, scheduler,
          batch_size=args.batch_size, ckpt_dir=ckpt_dir, writer=writer)
    return None
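All of these loaders depend on collate functions defined elsewhere. For variable-length speech data the job is mostly zero-padding each batch to its longest item; a minimal sketch, assuming each dataset item is a (text, mel) pair of tensors with time on the first axis (the project's real collate functions likely return more fields):

import torch

def pad_collate(batch):
    # batch: list of (text, mel) pairs, shapes (T_text,) and (T_mel, n_mels)
    texts, mels = zip(*batch)
    text_pad = torch.zeros(len(batch), max(t.size(0) for t in texts),
                           dtype=texts[0].dtype)
    mel_pad = torch.zeros(len(batch), max(m.size(0) for m in mels),
                          mels[0].size(1))
    for i, (t, m) in enumerate(zip(texts, mels)):
        text_pad[i, :t.size(0)] = t
        mel_pad[i, :m.size(0)] = m
    return text_pad, mel_pad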
Example #4
def main(mode):
    t2m = Text2Mel().to(DEVICE)
    ssrn = SSRN().to(DEVICE)

    if mode == "train":
        dataset = SpeechDataset(args.data_path, args.meta_train, "Text2Mel", mem_mode=args.mem_mode)
    elif mode=="test":
        dataset = SpeechDataset(args.data_path, args.meta_test, "Text2Mel", mem_mode=args.mem_mode)
    elif mode=="eval":
        dataset = SpeechDataset(args.data_path, args.meta_eval, "Text2Mel", mem_mode=args.mem_mode)

    else:
        print('[ERROR] Please set a correct mode: train, test, or eval!')
        exit(1)

    data_loader = DataLoader(dataset=dataset, batch_size=args.mse_batch,
                             shuffle=False, collate_fn=t2m_collate_fn, pin_memory=True)

    ckpt = pd.read_csv(os.path.join(args.logdir, t2m.name, 'ckpt.csv'), sep=',', header=None)
    ckpt.columns = ['models', 'loss']
    ckpt = ckpt.sort_values(by='loss', ascending=True)
    state = torch.load(os.path.join(args.logdir, t2m.name, ckpt.models.iloc[0]))
    t2m.load_state_dict(state['model'])
    args.global_step = state['global_step']

    ckpt = pd.read_csv(os.path.join(args.logdir, ssrn.name, 'ckpt.csv'), sep=',', header=None)
    ckpt.columns = ['models', 'loss']
    ckpt = ckpt.sort_values(by='loss', ascending=True)
    state = torch.load(os.path.join(args.logdir, ssrn.name, ckpt.models.iloc[0]))
    ssrn.load_state_dict(state['model'])

    print('All models are loaded.')

    t2m.eval()
    ssrn.eval()
    
    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))
    return calculate_MSE(t2m=t2m, ssrn=ssrn, data_loader=data_loader, batch_size=args.mse_batch)
Example #5
def main():
    model = DCTTS(args).to(DEVICE)
    print('Model {} is working...'.format(args.model_name))
    print('{} threads are used...'.format(torch.get_num_threads()))
    ckpt_dir = os.path.join(args.logdir, args.model_name)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # scheduler = MultiStepLR(optimizer, milestones=[50000, 150000, 300000], gamma=0.5) #
    scheduler = LambdaLR(optimizer, lr_policy)

    if not os.path.exists(ckpt_dir):
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
        if args.pretrained_path is not None:
            print('Train with pretrained model {}'.format(args.pretrained_path))
            state = torch.load(args.pretrained_path)
            model.custom_load_state_dict(state['model'])
    else:
        print('Checkpoint directory already exists. Resuming training.')
        ckpt = sorted(glob.glob(os.path.join(ckpt_dir, '*k.pth.tar')))[-1]
        state = torch.load(ckpt)
        model.load_state_dict(state['model'])
        args.global_step = state['global_step']
        optimizer.load_state_dict(state['optimizer'])
        # scheduler.load_state_dict(state['scheduler'])

    # model = torch.nn.DataParallel(model, device_ids=list(range(args.no_gpu))).to(DEVICE)

    dataset = SpeechDataset(args.data_path, args.meta_train, mem_mode=args.mem_mode)
    validset = SpeechDataset(args.data_path, args.meta_eval, mem_mode=args.mem_mode)
    data_loader = DataLoader(dataset=dataset, batch_size=args.batch_size,
                             shuffle=True, collate_fn=t2m_ga_collate_fn,
                             drop_last=True, pin_memory=True)
    valid_loader = DataLoader(dataset=validset, batch_size=args.test_batch,
                              shuffle=False, collate_fn=t2m_ga_collate_fn, pin_memory=True)
    
    writer = SummaryWriter(ckpt_dir)
    train(model, data_loader, valid_loader, optimizer, scheduler,
          batch_size=args.batch_size, ckpt_dir=ckpt_dir, writer=writer)
    return None
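The lr_policy passed to LambdaLR above is defined elsewhere. LambdaLR multiplies the base learning rate by whatever the lambda returns for the current step, so a warmup-then-decay policy in the spirit of these trainers could look like this sketch (the warmup length is an assumption):

def lr_policy(step, warmup_steps=4000):
    # linear warmup to the base lr, then inverse-sqrt decay;
    # LambdaLR multiplies the base lr by this factor on every scheduler.step()
    if step < warmup_steps:
        return (step + 1) / warmup_steps
    return (warmup_steps / (step + 1)) ** 0.5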
Example #6
def main():
    ssrn = SSRN().to(DEVICE)

    mname = type(ssrn).__name__
    ckpt = sorted(
        glob.glob(os.path.join(args.logdir, mname, '{}-*k.pth'.format(mname))))
    state = torch.load(ckpt[-1])
    ssrn.load_state_dict(state['model'])

    if not os.path.exists(args.testdir):
        os.makedirs(args.testdir)

    validset = SpeechDataset(args.data_path,
                             args.meta_eval,
                             type(ssrn).__name__,
                             mem_mode=args.mem_mode)
    valid_loader = DataLoader(dataset=validset,
                              batch_size=args.test_batch,
                              shuffle=False,
                              collate_fn=collate_fn)

    evaluate(ssrn, valid_loader, args.test_batch)
    return None
Example #7
def main(DEVICE):
    """
    main function

    :param DEVICE: 'cpu' or 'gpu'

    """
    model = TPGST().to(DEVICE)

    print('Model {} is working...'.format(type(model).__name__))
    ckpt_dir = os.path.join(args.logdir, type(model).__name__)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = LambdaLR(optimizer, lr_policy)

    if not os.path.exists(ckpt_dir):
        os.makedirs(os.path.join(ckpt_dir, 'A', 'train'))
    else:
        print('Checkpoint directory already exists. Resuming training.')
        model_path = sorted(glob.glob(os.path.join(
            ckpt_dir, 'model-*.tar')))[-1]  # latest model
        state = torch.load(model_path)
        model.load_state_dict(state['model'])
        args.global_step = state['global_step']
        optimizer.load_state_dict(state['optimizer'])
        scheduler.last_epoch = state['scheduler']['last_epoch']
        scheduler.base_lrs = state['scheduler']['base_lrs']

    dataset = SpeechDataset(args.data_path,
                            args.meta,
                            mem_mode=args.mem_mode,
                            training=True)
    validset = SpeechDataset(args.data_path,
                             args.meta,
                             mem_mode=args.mem_mode,
                             training=False)
    data_loader = DataLoader(dataset=dataset,
                             batch_size=args.batch_size,
                             shuffle=True,
                             collate_fn=collate_fn,
                             drop_last=True,
                             pin_memory=True,
                             num_workers=args.n_workers)
    valid_loader = DataLoader(dataset=validset,
                              batch_size=args.test_batch,
                              shuffle=False,
                              collate_fn=collate_fn,
                              pin_memory=True)
    # torch.set_num_threads(4)
    print('{} threads are used...'.format(torch.get_num_threads()))

    writer = SummaryWriter(ckpt_dir)
    train(model,
          data_loader,
          valid_loader,
          optimizer,
          scheduler,
          batch_size=args.batch_size,
          ckpt_dir=ckpt_dir,
          writer=writer,
          DEVICE=DEVICE)
    return None
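Unlike the other examples, this resume branch restores last_epoch and base_lrs by hand instead of calling scheduler.load_state_dict, so the save side must have stored exactly those two fields. A sketch of the matching checkpoint writer (the filename follows the model-*.tar glob above; everything else is an assumption):

import os
import torch

def save_checkpoint(model, optimizer, scheduler, global_step, ckpt_dir):
    # stores exactly the fields the resume branch above reads back
    state = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'global_step': global_step,
        'scheduler': {'last_epoch': scheduler.last_epoch,
                      'base_lrs': scheduler.base_lrs},
    }
    torch.save(state, os.path.join(ckpt_dir, 'model-{:06d}.tar'.format(global_step)))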
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-model_path")
    parser.add_argument("-data")
    parser.add_argument("-data_path",
                        default='',
                        type=str,
                        help="path of data files")
    parser.add_argument("-prior_path",
                        default=None,
                        help="the path to load the final.occs file")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument("-out_file",
                        help="write out the log-probs to this file")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-sweep_size",
                        default=200,
                        type=float,
                        help="process n hours of data per sweep (default:60)")
    parser.add_argument("-frame_subsampling_factor",
                        default=1,
                        type=int,
                        help="the factor to subsample the features")
    parser.add_argument("-data_loader_threads",
                        default=4,
                        type=int,
                        help="number of workers for data loading")

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    config["sweep_size"] = args.sweep_size

    config["source_paths"] = list()
    data_config = dict()

    data_config["type"] = "Eval"
    data_config["wav"] = args.data

    config["source_paths"].append(data_config)
    config["data_path"] = args.data_path

    print("job starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)

    dataset = SpeechDataset(config)
    print(transform)
    test_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    test_only=True,
                                    global_mvn=True,
                                    transform=transform)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(test_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    model.to(device)

    assert os.path.isfile(
        args.model_path), "ERROR: model file {} does not exist!".format(
            args.model_path)

    checkpoint = th.load(args.model_path, map_location='cuda:0')
    state_dict = checkpoint['model']
    from collections import OrderedDict
    # strip the 'module.' prefix that DataParallel adds to every key
    if next(iter(state_dict)).startswith("module."):
        new_state_dict = OrderedDict((k[7:], v) for k, v in state_dict.items())
        model.load_state_dict(new_state_dict)
    else:
        model.load_state_dict(state_dict)
    print("=> loaded checkpoint '{}' ".format(args.model_path))

    log_prior = None
    if args.prior_path:
        prior = read_matrix(args.prior_path).numpy()
        log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])),
                              dtype=th.float)

    model.eval()
    with th.no_grad():
        with MatrixWriter("ark:" + args.out_file) as llout:
            for i, data in enumerate(test_dataloader):
                feat = data["x"]
                num_frs = data["num_frs"]
                utt_ids = data["utt_ids"]

                x = feat.to(th.float32)
                if args.frame_subsampling_factor > 1:
                    x = x.unfold(1, 1,
                                 args.frame_subsampling_factor).squeeze(-1)
                x = x.cuda()
                prediction = model(x)
                # save only unpadded part for each utt in batch
                for j in range(len(num_frs)):
                    loglikes = prediction[j, :, :].data.cpu()
                    loglikes_j = loglikes[:num_frs[j], :]
                    if log_prior is not None:
                        loglikes_j = loglikes_j - log_prior

                    llout[utt_ids[j][0]] = loglikes_j

                print("Process batch [{}/{}]".format(i + 1,
                                                     len(test_dataloader)))
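The prior handling above is the standard hybrid-ASR conversion: the network emits state posteriors, and subtracting the log prior turns them into scaled log-likelihoods for HMM decoding, since log p(x|s) = log p(s|x) - log p(s) + const. Restated compactly (mirroring the code above, with the state occupation counts from final.occs standing in for p(s)):

occs = read_matrix(args.prior_path).numpy()[0]  # state occupation counts
log_prior = th.tensor(np.log(occs / occs.sum()), dtype=th.float)
# scaled log-likelihood per frame: log p(x|s) ∝ log p(s|x) - log p(s)
scaled = loglikes_j - log_prior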
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-model_path")
    parser.add_argument("-data_path")
    parser.add_argument("-prior_path",
                        help="the path to load the final.occs file")
    parser.add_argument("-out_file",
                        help="write out the log-probs to this file")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument(
        "-trans_model",
        help="the HMM transistion model, used for lattice generation")
    parser.add_argument("-graph_dir", help="the decoding graph directory")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-sweep_size",
                        default=200,
                        type=float,
                        help="process n hours of data per sweep (default:60)")
    parser.add_argument("-data_loader_threads",
                        default=4,
                        type=int,
                        help="number of workers for data loading")

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    config["sweep_size"] = args.sweep_size

    config["source_paths"] = list()
    data_config = dict()

    data_config["type"] = "Eval"
    data_config["wav"] = args.data_path

    config["source_paths"].append(data_config)

    print("job starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)

    dataset = SpeechDataset(config)
    #data = trainset.__getitem__(0)
    test_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    test_only=True,
                                    global_mvn=True,
                                    transform=transform)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(test_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    model.to(device)

    assert os.path.isfile(
        args.model_path), "ERROR: model file {} does not exist!".format(
            args.model_path)

    checkpoint = th.load(args.model_path, map_location='cuda:0')
    state_dict = checkpoint['model']
    from collections import OrderedDict
    # strip the 'module.' prefix that DataParallel adds to every key
    if next(iter(state_dict)).startswith("module."):
        new_state_dict = OrderedDict((k[7:], v) for k, v in state_dict.items())
        model.load_state_dict(new_state_dict)
    else:
        model.load_state_dict(state_dict)
    print("=> loaded checkpoint '{}' ".format(args.model_path))

    HCLG = args.graph_dir + "/HCLG.fst"
    words_txt = args.graph_dir + "/words.txt"

    if not os.path.isfile(HCLG):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' % (HCLG))
        sys.exit(1)

    if not os.path.isfile(words_txt):
        sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' %
                         (words_txt))
        sys.exit(1)

    if os.path.isfile(args.trans_model):
        trans_model = kaldi_hmm.TransitionModel()
        with kaldi_util.io.xopen(args.trans_model) as ki:
            trans_model.read(ki.stream(), ki.binary)
    else:
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' %
                         (args.trans_model))
        sys.exit(1)

    prior = read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    # now we can setup the decoder
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = config["decoder_config"]["beam"]
    decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"]
    decoder_opts.max_active = config["decoder_config"]["max_active"]
    acoustic_scale = config["decoder_config"]["acoustic_scale"]
    decoder_opts.determinize_lattice = True  # produce a compact lattice
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        HCLG,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    model.eval()
    with th.no_grad():
        with kaldi_util.table.CompactLatticeWriter("ark:" +
                                                   args.out_file) as lat_out:
            for data in test_dataloader:
                feat = data["x"]
                num_frs = data["num_frs"]
                utt_ids = data["utt_ids"]

                x = feat.to(th.float32)
                x = x.cuda()

                prediction = model(x)

                for j in range(len(num_frs)):
                    loglikes = prediction[j, :, :].data.cpu()

                    loglikes_j = loglikes[:num_frs[j], :]
                    loglikes_j = loglikes_j - log_prior

                    decoder_out = asr_decoder.decode(
                        kaldi_matrix.Matrix(loglikes_j.numpy()))

                    key = utt_ids[j][0]
                    print(key, decoder_out["text"])

                    print("Log-like per-frame for utterance {} is {}".format(
                        key, decoder_out["likelihood"] / num_frs[j]))

                    # save lattice
                    lat_out[key] = decoder_out["lattice"]
Example #10
    eos_token = char_to_token['<eos>']

    # #test_dataset = SpeechDataset(test_df, dataset_dir)
    # #test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

    model = load_model(args.load_dir, args.epoch, device=DEVICE)

    num_sent = 10
    model.eval()
    model.tf_ratio = 0.9

    for i in range(num_sent):

        if args.first_ten:
            idx = i
        else:
            idx = random.randint(0, train_df.shape[0] - 1)  # randint is inclusive
        trial_dataset = SpeechDataset(train_df, root_dir, char_to_token)

        x, y = trial_dataset[idx]
        # plt.imshow(x[0,:,:].detach())

        # Model output
        target = y.unsqueeze(dim=0).to(DEVICE)
        data = x.permute(0, 2, 1).to(DEVICE)
        loss, output = model(data, target)
        print("True sent : ", decode_true_sent(y))
        print("Pred sent : ", decode_pred_sent(output))
        print("Loss :", loss.item())
        print("\n")
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-data", help="data yaml file")
    parser.add_argument("-dataPath",
                        default='',
                        type=str,
                        help="path of data files")
    parser.add_argument("-seed_model",
                        default='',
                        help="the seed nerual network model")
    parser.add_argument("-exp_dir", help="the directory to save the outputs")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument(
        "-ali_dir",
        help="the directory to load trans_model and tree used for alignments")
    parser.add_argument("-lang_dir",
                        help="the lexicon directory to load L.fst")
    parser.add_argument(
        "-chain_dir",
        help=
        "the directory to load trans_model, tree and den.fst for chain model")
    parser.add_argument("-lr", type=float, help="set the base learning rate")
    parser.add_argument(
        "-warmup_steps",
        default=4000,
        type=int,
        help="the number of warmup steps to adjust the learning rate")
    parser.add_argument("-xent_regularize",
                        default=0,
                        type=float,
                        help="cross-entropy regularization weight")
    parser.add_argument("-momentum",
                        default=0,
                        type=float,
                        help="set the momentum")
    parser.add_argument("-weight_decay",
                        default=1e-4,
                        type=float,
                        help="set the L2 regularization weight")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads",
                        default=0,
                        type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm",
                        default=5,
                        type=float,
                        help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size",
                        default=100,
                        type=float,
                        help="process n hours of data per sweep (default:100)")
    parser.add_argument("-num_epochs",
                        default=1,
                        type=int,
                        help="number of training epochs (default:1)")
    parser.add_argument(
        "-anneal_lr_epoch",
        default=2,
        type=int,
        help="start to anneal the learning rate from this epoch")
    parser.add_argument("-anneal_lr_ratio",
                        default=0.5,
                        type=float,
                        help="the ratio to anneal the learning rate ratio")
    parser.add_argument('-print_freq',
                        default=10,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('-save_freq',
                        default=1000,
                        type=int,
                        metavar='N',
                        help='save model frequency (default: 1000)')

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    config["sweep_size"] = args.sweep_size

    print("pytorch version:{}".format(th.__version__))

    with open(args.data) as f:
        data = yaml.safe_load(f)
        config["source_paths"] = [j for i, j in data['clean_source'].items()]
        if 'dir_noise' in data:
            config["dir_noise_paths"] = [
                j for i, j in data['dir_noise'].items()
            ]
        if 'rir' in data:
            config["rir_paths"] = [j for i, j in data['rir'].items()]
    config['data_path'] = args.dataPath

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    hvd.init()

    th.cuda.set_device(hvd.local_rank())

    print("Run experiments with world size {}".format(hvd.size()))

    dataset = SpeechDataset(config)
    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)
            dataset.transform = transform

    train_dataloader = SeqDataloader(dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.data_loader_threads,
                                     distributed=True,
                                     test_only=False)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    model = lstm.LSTMAM(model_config["feat_dim"], model_config["label_size"],
                        model_config["hidden_size"],
                        model_config["num_layers"], model_config["dropout"],
                        True)

    model.cuda()

    # setup the optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)
        state_dict = checkpoint['model']
        from collections import OrderedDict
        # strip the 'module.' prefix that DataParallel adds to every key
        if next(iter(state_dict)).startswith("module."):
            new_state_dict = OrderedDict((k[7:], v) for k, v in state_dict.items())
            model.load_state_dict(new_state_dict)
        else:
            model.load_state_dict(state_dict)
        print("=> loaded checkpoint '{}' ".format(args.seed_model))

    ali_model = args.ali_dir + "/final.mdl"
    ali_tree = args.ali_dir + "/tree"
    L_fst = args.lang_dir + "/L.fst"
    disambig = args.lang_dir + "/phones/disambig.int"

    den_fst = kaldi_fst.StdVectorFst.read(args.chain_dir + "/den.fst")
    chain_model_path = args.chain_dir + "/0.trans_mdl"
    chain_tree_path = args.chain_dir + "/tree"

    if os.path.isfile(chain_model_path):
        chain_trans_model = kaldi_hmm.TransitionModel()
        with kaldi_util.io.xopen(chain_model_path) as ki:
            chain_trans_model.read(ki.stream(), ki.binary)
    else:
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' %
                         (chain_model_path))
        sys.exit(1)

    chain_tree = kaldi_tree.ContextDependency()
    with kaldi_util.io.xopen(chain_tree_path) as ki:
        chain_tree.read(ki.stream(), ki.binary)

    # chain supervision options
    supervision_opts = kaldi_chain.SupervisionOptions()
    supervision_opts.convert_to_pdfs = True
    supervision_opts.frame_subsampling_factor = 3
    supervision_opts.left_tolerance = 5
    supervision_opts.right_tolerance = 5

    # chain training options
    chain_opts = kaldi_chain.ChainTrainingOptions()
    chain_opts.leaky_hmm_coefficient = 1e-4
    chain_opts.xent_regularize = args.xent_regularize

    # setup the aligner
    aligner = kaldi_align.MappedAligner.from_files(ali_model,
                                                   ali_tree,
                                                   L_fst,
                                                   None,
                                                   disambig,
                                                   None,
                                                   beam=10,
                                                   transition_scale=1.0,
                                                   self_loop_scale=0.1,
                                                   acoustic_scale=0.1)
    den_graph = kaldi_chain.DenominatorGraph(den_fst,
                                             model_config["label_size"])

    #encoder_layer = nn.TransformerEncoderLayer(512, 8)
    #print(encoder_layer)

    model.train()
    for epoch in range(args.num_epochs):

        # anneal learning rate
        if epoch > args.anneal_lr_epoch:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= args.anneal_lr_ratio

        run_train_epoch(model, optimizer, train_dataloader, epoch,
                        chain_trans_model, chain_tree, supervision_opts,
                        aligner, den_graph, chain_opts, args)

        # save model
        if hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/chain.model.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
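The parsed -warmup_steps flag is not used in the shown excerpt, but the usual Horovod recipe is to scale the base learning rate by the number of workers and ramp it up over the warmup period. A sketch of how those flags could feed such a schedule (an assumption, not code from the original):

def adjust_lr(optimizer, step, base_lr, warmup_steps):
    # linear warmup toward base_lr * world size, the common Horovod scaling
    scale = min(1.0, (step + 1) / warmup_steps) * hvd.size()
    for param_group in optimizer.param_groups:
        param_group['lr'] = base_lr * scale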
Example #12
    train_data_df = pd.read_csv(os.path.join(train_dir, 'train_data.csv'), 
                                skiprows=[0], 
                                header=None, 
                                names=['index', 'clip', 'sentence'])
    test_data_df = pd.read_csv(os.path.join(test_dir, 'test_data.csv'), 
                                skiprows=[0], 
                                header=None, 
                                names=['index', 'clip', 'sentence'])

    max_data_len = 2500
    max_sent_len = 100

    bs = int(input("Enter batch_size:"))

    train_dataset = SpeechDataset(train_data_df, train_dir, max_data_len, max_sent_len)
    train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)

    test_dataset = SpeechDataset(test_data_df, test_dir, max_data_len, max_sent_len)
    test_loader = DataLoader(test_dataset, batch_size=bs)


    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('device:', device)
    input_len = 201
    hidden_size = 50
    num_layers = 3
    output_shape = 28
    bidirectional = True
    model = BasicASR(input_len, hidden_size, num_layers, output_shape, bidirectional).to(device)
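The constants here follow from the front end: a magnitude spectrogram computed with n_fft=400 has 400 // 2 + 1 = 201 frequency bins (hence input_len = 201), and output_shape = 28 fits 26 letters plus space and a CTC blank. A sketch of the assumed feature extraction (n_fft and hop length are assumptions):

import torch

def spectrogram(wav, n_fft=400, hop_length=160):
    # returns a (201, time) magnitude spectrogram; 201 == n_fft // 2 + 1
    window = torch.hann_window(n_fft)
    stft = torch.stft(wav, n_fft=n_fft, hop_length=hop_length,
                      window=window, return_complex=True)
    return stft.abs()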
Example #13
    args = get_args()

    pkg = torch.load(args.model_file)
    model_config = pkg['model_config']
    vocab = load_vocab(args.vocab_file)
    id2token = [None] * len(vocab)
    for k, v in vocab.items():
        id2token[v] = k

    collate = Collate(model_config["left_context"],
                      model_config["right_context"],
                      model_config["skip_frame"], model_config["norm_mean"],
                      model_config["norm_var"])

    testset = SpeechDataset(args.data_file)
    test_loader = torch.utils.data.DataLoader(testset,
                                              collate_fn=collate,
                                              shuffle=False)

    # check dim match
    if model_config["feat_dim"] != testset[0]["feat"]["dim"]:
        raise ValueError(("Dim mismatch: " + "model {} vs. feat {}.").format(
            model_config["feat_dim"], testset[0]["feat"]["dim"]))

    model_load_timer = Timer()
    model_load_timer.tic()
    # build encoder and decoder
    if model_config["encoder"]["type"] == "BiRNN":
        encoder = BiRNN(model_config["encoder"])
    elif model_config["encoder"]["type"] == "BiRNN_Torch":
Example #14
def train_model(model_class, preprocess_fun, is_1d, reshape_size, BATCH_SIZE,
                epochs, CODER, preprocess_param={}, bagging_num=1,
                semi_train_path=None, pretrained=None, pretraining=False,
                MGPU=False):
    """
    :param model_class: model class. e.g. vgg, resnet, senet
    :param preprocess_fun: preprocess function. e.g. mel, mfcc, raw wave
    :param is_1d: boolean. True for conv1d models and false for conv2d
    :param reshape_size: int. only for conv2d, reshape the image size
    :param BATCH_SIZE: batch size.
    :param epochs: number of epochs
    :param CODER: string for saving and loading model/files
    :param preprocess_param: parameters for preprocessing function
    :param bagging_num: number of training per model, aka bagging models
    :param semi_train_path: path to semi supervised learning file.
    :param pretrained: path to pretrained model
    :param pretraining: boolean. if this is pretraining
    :param MGPU: whether using multiple gpus
    """
    # get_model() builds the model used for training
    def get_model(model=model_class, m=MGPU, pretrained=pretrained):
        # wrap the model in DataParallel when running on multiple GPUs
        mdl = torch.nn.DataParallel(model()) if m else model()
        if not pretrained:
            return mdl
        else:
            print("load pretrained model here...")
            # load the pretrained weights with torch.load()
            mdl.load_state_dict(torch.load(pretrained))
            if 'vgg' in pretrained:
                # for VGG models, freeze all parameters except the top layers
                # by setting requires_grad=False
                fixed_layers = list(mdl.features)
                for l in fixed_layers:
                    for p in l.parameters():
                        p.requires_grad = False
            return mdl

    label_to_int, int_to_label = get_label_dict()
    # repeat training bagging_num times
    for b in range(bagging_num):
        print("training model # ", b)

        # loss function used for training
        loss_fn = torch.nn.CrossEntropyLoss()

        # build the model and move it onto the GPU with .cuda()
        speechmodel = get_model()
        speechmodel = speechmodel.cuda()

        # accumulators for reporting accuracy during training
        total_correct = 0
        num_labels = 0
        start_time = time()

        # train for the given number of epochs
        for e in range(epochs):
            print("training epoch ", e)
            # after 10 epochs, cut the learning rate to 1/10
            learning_rate = 0.01 if e < 10 else 0.001
            # SGD optimizer with momentum
            optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, speechmodel.parameters()), lr=learning_rate, momentum=0.9, weight_decay=0.00001)

            # switch the model to training mode
            speechmodel.train()

            if semi_train_path:
                # semi-supervised training builds its file list differently
                # (covered in detail in the semi-supervised training section):
                # test data is added to train_list alongside the training files
                train_list, label_list = get_semi_list(words=label_to_int.keys(), sub_path=semi_train_path,
                                           test_ratio=choice([0.2, 0.25, 0.3, 0.35]))
                print("semi training list length: ", len(train_list))
            else:
                # supervised training: fetch the list of training files
                train_list, label_list, _ = get_wav_list(words=label_to_int.keys())

            if pretraining:
                traindataset = PreDataset(label_words_dict=label_to_int,
                                          add_noise=True, preprocess_fun=preprocess_fun, preprocess_param=preprocess_param,
                                          resize_shape=reshape_size, is_1d=is_1d)
            else:
                traindataset = SpeechDataset(mode='train', label_words_dict=label_to_int, wav_list=(train_list, label_list),
                                             add_noise=True, preprocess_fun=preprocess_fun, preprocess_param=preprocess_param,
                                             resize_shape=reshape_size, is_1d=is_1d)

            # build the data queue with a DataLoader; shuffle=True
            # re-randomizes the read order every epoch
            trainloader = DataLoader(traindataset, BATCH_SIZE, shuffle=True)

            # read batch_size training examples at a time from trainloader
            for batch_idx, batch_data in enumerate(trainloader):

                # spec (short for spectrogram) is the audio input; label is the target
                spec = batch_data['spec']
                label = batch_data['label']
                spec, label = Variable(spec.cuda()), Variable(label.cuda())

                # feed the input (spec) through the model to get predictions (y_pred)
                y_pred = speechmodel(spec)

                # compute the loss from the predictions and the targets
                loss = loss_fn(y_pred, label)
                optimizer.zero_grad()
                # backpropagate to get the parameter updates that reduce the loss
                loss.backward()
                # optimizer.step() applies the update, nudging the parameters
                # in the direction that lowers the loss
                optimizer.step()

                # take the argmax of y_pred to count correct predictions
                _, pred_labels = torch.max(y_pred.data, 1)
                correct = (pred_labels == label.data).sum()
                total_correct += correct
                num_labels += len(label)

            # periodically report the running training accuracy
            print("training accuracy:", 100. * total_correct / num_labels, time() - start_time)

        # save the trained model parameters
        create_directory("model")
        torch.save(speechmodel.state_dict(), "model/model_%s_%s.pth" % (CODER, b))

    if not pretraining:
        print("doing prediction...")
        softmax = Softmax(dim=1)

        # paths of the saved models, one per bagging run
        trained_models = ["model/model_%s_%s.pth" % (CODER, b) for b in range(bagging_num)]

        # build the test Dataset and its DataLoader queue
        _, _, test_list = get_wav_list(words=label_to_int.keys())
        testdataset = SpeechDataset(mode='test', label_words_dict=label_to_int, wav_list=(test_list, []),
                                    add_noise=False, preprocess_fun=preprocess_fun, preprocess_param=preprocess_param,
                                    resize_shape=reshape_size, is_1d=is_1d)
        testloader = DataLoader(testdataset, BATCH_SIZE, shuffle=False)

        for e, m in enumerate(trained_models):
            print("predicting ", m)
            speechmodel = get_model(m=MGPU)
            # load the trained weights with torch.load()
            speechmodel.load_state_dict(torch.load(m))
            # move the model to the GPU and switch to evaluation mode
            speechmodel = speechmodel.cuda()
            speechmodel.eval()

            test_fnames, test_labels = [], []
            pred_scores = []
            # generate predictions batch by batch over the test data
            for batch_idx, batch_data in enumerate(testloader):
                spec = Variable(batch_data['spec'].cuda())
                fname = batch_data['id']
                # y_pred holds the model's predictions for this test batch
                y_pred = softmax(speechmodel(spec))
                pred_scores.append(y_pred.data.cpu().numpy())
                test_fnames += fname

            # sum the probabilities from the bagging models to form the ensemble prediction
            if e == 0:
                final_pred = np.vstack(pred_scores)
                final_test_fnames = test_fnames
            else:
                final_pred += np.vstack(pred_scores)
                assert final_test_fnames == test_fnames

        # divide by the number of models; derive final labels from the averaged probabilities
        final_pred /= len(trained_models)
        final_labels = [int_to_label[x] for x in np.argmax(final_pred, 1)]

        # file names (test_fnames) for the Kaggle submission file
        test_fnames = [x.split("/")[-1] for x in final_test_fnames]
        labels = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'unknown', 'silence']
        # write the Kaggle submission file (file name and final prediction)
        create_directory("sub")
        pd.DataFrame({'fname': test_fnames,
                      'label': final_labels}).to_csv("sub/%s.csv" % CODER, index=False)

        # save the bagging ensemble's probabilities to a separate file,
        # for later ensembling with other models
        pred_scores = pd.DataFrame(np.vstack(final_pred), columns=labels)
        pred_scores['fname'] = test_fnames
        create_directory("pred_scores")
        pred_scores.to_csv("pred_scores/%s.csv" % CODER, index=False)
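get_label_dict() is not defined in this snippet, but Example #16 below builds the same mapping inline, so the helper plausibly reduces to:

def get_label_dict():
    # the ten command words; 'unknown' and 'silence' get the two extra ids
    labels = ['yes', 'no', 'up', 'down', 'left', 'right',
              'on', 'off', 'stop', 'go']
    label_to_int = dict(zip(labels, range(len(labels))))
    int_to_label = dict(zip(range(len(labels)), labels))
    int_to_label.update({len(labels): 'unknown', len(labels) + 1: 'silence'})
    return label_to_int, int_to_label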
Example #15
def main():
    args = parser.parse_args()
    cf = ConfigParser.ConfigParser()
    try:
        cf.read(args.conf)
    except Exception:
        print("config file does not exist")
        sys.exit(1)
    USE_CUDA = cf.getboolean('Training', 'use_cuda')
    try:
        seed = int(cf.get('Training', 'seed'))
    except Exception:
        seed = torch.cuda.initial_seed()
        cf.set('Training', 'seed', str(seed))
        cf.write(open(args.conf, 'w'))

    torch.manual_seed(seed)
    if USE_CUDA:
        torch.cuda.manual_seed(seed)

    log_dir = cf.get('Data', 'log_dir')
    log_file = os.path.join(log_dir, cf.get('Data', 'log_file'))
    logger = init_logger(log_file)

    # Define Model
    rnn_input_size = cf.getint('Model', 'rnn_input_size')
    rnn_hidden_size = cf.getint('Model', 'rnn_hidden_size')
    rnn_layers = cf.getint('Model', 'rnn_layers')
    rnn_type = RNN[cf.get('Model', 'rnn_type')]
    bidirectional = cf.getboolean('Model', 'bidirectional')
    batch_norm = cf.getboolean('Model', 'batch_norm')
    rnn_param = {
        "rnn_input_size": rnn_input_size,
        "rnn_hidden_size": rnn_hidden_size,
        "rnn_layers": rnn_layers,
        "rnn_type": rnn_type,
        "bidirectional": bidirectional,
        "batch_norm": batch_norm
    }
    num_class = cf.getint('Model', 'num_class')
    drop_out = cf.getfloat('Model', 'drop_out')

    model = CTC_Model(rnn_param=rnn_param,
                      num_class=num_class,
                      drop_out=drop_out)
    print("Model Structure:")
    logger.info("Model Structure:")
    for idx, m in enumerate(model.children()):
        print(idx, m)
        logger.info(str(idx) + "->" + str(m))

    data_dir = cf.get('Data', 'data_dir')
    batch_size = cf.getint("Training", 'batch_size')

    # Data Loader
    train_dataset = SpeechDataset(data_dir, data_set='train')
    dev_dataset = SpeechDataset(data_dir, data_set="dev")
    train_loader = SpeechDataLoader(train_dataset,
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=4,
                                    pin_memory=False)
    dev_loader = SpeechDataLoader(dev_dataset,
                                  batch_size=batch_size,
                                  shuffle=False,
                                  num_workers=4,
                                  pin_memory=False)

    # ensure the feats is equal to the rnn_input_Size
    assert train_dataset.n_feats == rnn_input_size

    # decoder for dev set
    decoder = GreedyDecoder(int2char,
                            space_idx=len(int2char) - 1,
                            blank_index=0)

    # Training
    init_lr = cf.getfloat('Training', 'init_lr')
    num_epoches = cf.getint('Training', 'num_epoches')
    end_adjust_acc = cf.getfloat('Training', 'end_adjust_acc')
    decay = cf.getfloat("Training", 'lr_decay')
    weight_decay = cf.getfloat("Training", 'weight_decay')

    params = {
        'num_epoches': num_epoches,
        'end_adjust_acc': end_adjust_acc,
        'seed': seed,
        'decay': decay,
        'learning_rate': init_lr,
        'weight_decay': weight_decay,
        'batch_size': batch_size,
        'n_feats': train_dataset.n_feats
    }
    print(params)

    if USE_CUDA:
        model = model.cuda()

    loss_fn = CTCLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=init_lr,
                                 weight_decay=weight_decay)

    # visualization for training
    from visdom import Visdom
    viz = Visdom()
    title = 'TIMIT LSTM_CTC Acoustic Model'

    opts = [
        dict(title=title + " Loss", ylabel='Loss', xlabel='Epoch'),
        dict(title=title + " Loss on Dev", ylabel='DEV Loss', xlabel='Epoch'),
        dict(title=title + ' CER on DEV', ylabel='DEV CER', xlabel='Epoch')
    ]
    viz_window = [None, None, None]

    count = 0
    adjust_rate_count = 0  # initialized here so the first elif branch cannot hit a NameError
    learning_rate = init_lr
    loss_best = 1000
    loss_best_true = 1000
    adjust_rate_flag = False
    stop_train = False
    adjust_time = 0
    acc_best = 0
    start_time = time.time()
    loss_results = []
    dev_loss_results = []
    dev_cer_results = []

    while not stop_train:
        if count >= num_epoches:
            break
        count += 1

        if adjust_rate_flag:
            learning_rate *= decay
            adjust_rate_flag = False
            for param in optimizer.param_groups:
                param['lr'] *= decay

        print("Start training epoch: %d, learning_rate: %.5f" %
              (count, learning_rate))
        logger.info("Start training epoch: %d, learning_rate: %.5f" %
                    (count, learning_rate))

        loss = train(model,
                     train_loader,
                     loss_fn,
                     optimizer,
                     logger,
                     print_every=20,
                     USE_CUDA=USE_CUDA)
        loss_results.append(loss)
        acc, dev_loss = dev(model,
                            dev_loader,
                            loss_fn,
                            decoder,
                            logger,
                            USE_CUDA=USE_CUDA)
        print("loss on dev set is %.4f" % dev_loss)
        logger.info("loss on dev set is %.4f" % dev_loss)
        dev_loss_results.append(dev_loss)
        dev_cer_results.append(acc)

        # adjust learning rate by dev_loss
        # adjust_rate_count counts consecutive epochs whose dev loss stays within
        # end_adjust_acc of the best; reaching 10 such epochs triggers a decay
        if dev_loss < (loss_best - end_adjust_acc):
            loss_best = dev_loss
            loss_best_true = dev_loss
            adjust_rate_count = 0
            acc_best = acc
            best_model_state = copy.deepcopy(model.state_dict())
            best_op_state = copy.deepcopy(optimizer.state_dict())
        elif (dev_loss < loss_best + end_adjust_acc):
            adjust_rate_count += 1
            if dev_loss < loss_best and dev_loss < loss_best_true:
                loss_best_true = dev_loss
                acc_best = acc
                best_model_state = copy.deepcopy(model.state_dict())
                best_op_state = copy.deepcopy(optimizer.state_dict())
        else:
            adjust_rate_count = 10

        print("adjust_rate_count: %d" % adjust_rate_count)
        print('adjust_time: %d' % adjust_time)
        logger.info("adjust_rate_count: %d" % adjust_rate_count)
        logger.info('adjust_time: %d' % adjust_time)

        if adjust_rate_count == 10:
            adjust_rate_flag = True
            adjust_time += 1
            adjust_rate_count = 0
            if loss_best > loss_best_true:
                loss_best = loss_best_true
            model.load_state_dict(best_model_state)
            optimizer.load_state_dict(best_op_state)

        if adjust_time == 8:
            stop_train = True

        time_used = (time.time() - start_time) / 60
        print("epoch %d done, dev acc is: %.4f, time_used: %.4f minutes" %
              (count, acc, time_used))
        logger.info(
            "epoch %d done, dev acc is: %.4f, time_used: %.4f minutes" %
            (count, acc, time_used))

        x_axis = range(count)
        y_axis = [
            loss_results[0:count], dev_loss_results[0:count],
            dev_cer_results[0:count]
        ]
        for x in range(len(viz_window)):
            if viz_window[x] is None:
                viz_window[x] = viz.line(
                    X=np.array(x_axis),
                    Y=np.array(y_axis[x]),
                    opts=opts[x],
                )
            else:
                viz.line(
                    X=np.array(x_axis),
                    Y=np.array(y_axis[x]),
                    win=viz_window[x],
                    update='replace',
                )

    print("End training, best dev loss is: %.4f, acc is: %.4f" %
          (loss_best_true, acc_best))
    logger.info("End training, best dev loss acc is: %.4f, acc is: %.4f" %
                (loss_best_true, acc_best))
    model.load_state_dict(best_model_state)
    optimizer.load_state_dict(best_op_state)
    best_path = os.path.join(log_dir,
                             'best_model' + '_dev' + str(acc_best) + '.pkl')
    cf.set('Model', 'model_file', best_path)
    cf.write(open(args.conf, 'w'))
    params['epoch'] = count

    torch.save(
        CTC_Model.save_package(model,
                               optimizer=optimizer,
                               epoch=params,
                               loss_results=loss_results,
                               dev_loss_results=dev_loss_results,
                               dev_cer_results=dev_cer_results), best_path)
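The GreedyDecoder used on the dev set is defined elsewhere, but CTC greedy decoding itself is small: argmax per frame, collapse repeats, drop blanks. A minimal sketch, assuming probs is a (time, num_classes) tensor and blank_index=0 as above:

def ctc_greedy_decode(probs, int2char, blank_index=0):
    best = probs.argmax(dim=1).tolist()
    out, prev = [], blank_index
    for idx in best:
        # emit only on label changes, and never the blank
        if idx != prev and idx != blank_index:
            out.append(int2char[idx])
        prev = idx
    return ''.join(out)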
Example #16
    'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go'
]
label_to_int = dict(zip(labels, range(len(labels))))
int_to_label = dict(zip(range(len(labels)), labels))
int_to_label.update({len(labels): 'unknown', len(labels) + 1: 'silence'})

# select the files used for training and validation depending on the mode
trn = 'input/trn.txt' if mode == 'cv' else 'input/trn_all.txt'
tst = 'input/val.txt' if mode == 'cv' else 'input/tst.txt'

trn = [line.strip() for line in open(trn, 'r').readlines()]
wav_list = [line.split(',')[-1] for line in trn]
label_list = [line.split(',')[0] for line in trn]
# build the SpeechDataset for training
traindataset = SpeechDataset(mode='train',
                             label_to_int=label_to_int,
                             wav_list=wav_list,
                             label_list=label_list)

start_time = time()
for e in range(epochs):
    print("training epoch ", e)
    # set the learning rate depending on the epoch
    learning_rate = 0.01 if e < 10 else 0.001
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                       speechmodel.parameters()),
                                lr=learning_rate,
                                momentum=0.9,
                                weight_decay=0.00001)
    # call .train() to put the model in training mode
    speechmodel.train()
Example #17
    if utils.TENSORBOARD_LOGGING == 1:
        utils.visulizer.set_writer(
            os.path.join(trainer_config["exp_dir"], 'log'))

    collate = Collate(model_config["left_context"],
                      model_config["right_context"],
                      model_config["skip_frame"], model_config["norm_mean"],
                      model_config["norm_var"])

    batch_frames = trainer_config["batch_frames"]
    valid_batch_size = 20
    if "multi_gpu" in trainer_config and trainer_config["multi_gpu"] == True:
        batch_frames *= torch.cuda.device_count()
        valid_batch_size *= torch.cuda.device_count()

    trainset = SpeechDataset(os.path.join(args.data_dir, "train.json"))
    validset = SpeechDataset(os.path.join(args.data_dir, "dev.json"))
    logger.info("Loaded {} utterances for training.".format(len(trainset)))
    logger.info("Loaded {} utterances for validation.".format(len(validset)))

    trainsampler = FrameBasedSampler(trainset, frame_num=batch_frames)
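    # FrameBasedSampler presumably packs utterances until each batch reaches
    # roughly frame_num total frames, keeping memory use stable across
    # variable-length utterances (hence batch_sampler below, not batch_size)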
    tr_loader = torch.utils.data.DataLoader(trainset,
                                            batch_sampler=trainsampler,
                                            collate_fn=collate,
                                            shuffle=False,
                                            num_workers=16,
                                            pin_memory=True)
    cv_loader = torch.utils.data.DataLoader(validset,
                                            collate_fn=collate,
                                            batch_size=valid_batch_size,
                                            num_workers=16,
                                            pin_memory=True)
Beispiel #18
0
                                                       lengths=output_lengths,
                                                       batch_first=True)

        output_padded = nn.utils.rnn.pad_packed_sequence(output,
                                                         batch_first=True)
        output, output_lengths = output_padded
        output = output.index_select(0, idx_unsort.to(output.device))
        output_lengths = output_lengths.index_select(
            0, idx_unsort.to(output_lengths.device))
        return output, output_lengths


if __name__ == "__main__":
    from data import FrameBasedSampler, Collate, SpeechDataset
    fn = "/home/baiye/Speech/las/egs/timit/data/test.json"
    dataset = SpeechDataset(fn)
    sampler = FrameBasedSampler(dataset)
    collate = Collate(left=0, right=0)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_sampler=sampler,
                                             collate_fn=collate,
                                             shuffle=False)
    dataiter = iter(dataloader)
    feats, feat_lengths, targets, target_lengths = next(dataiter)
    config = {
        "input_dim": 40,
        "hidden_size": 256,
        "num_layers": 3,
    }
    rnn = PyramidBiRNN(config)
    output, output_lengths = rnn(feats, feat_lengths)
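
# A minimal, self-contained sketch of the sort -> pack -> unsort idiom used in
# the forward pass above (shapes are illustrative): packed RNN input must be
# length-sorted, and idx_unsort restores the original batch order afterwards.
import torch
import torch.nn as nn

x = torch.randn(3, 5, 8)                 # (batch, time, feat), arbitrary order
lengths = torch.tensor([3, 5, 2])
sorted_lengths, idx_sort = torch.sort(lengths, descending=True)
idx_unsort = torch.argsort(idx_sort)

packed = nn.utils.rnn.pack_padded_sequence(x.index_select(0, idx_sort),
                                           sorted_lengths,
                                           batch_first=True)
lstm = nn.LSTM(input_size=8, hidden_size=4, batch_first=True)
out, _ = lstm(packed)
out, out_lengths = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
out = out.index_select(0, idx_unsort)               # original order restored
out_lengths = out_lengths.index_select(0, idx_unsort)
assert out.shape == (3, 5, 4) and out_lengths.tolist() == [3, 5, 2]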
Beispiel #19
0
def train_model(model_class,
                preprocess_fun,
                is_1d,
                reshape_size,
                BATCH_SIZE,
                epochs,
                CODER,
                preprocess_param={},
                bagging_num=1,
                semi_train_path=None,
                pretrained=None,
                pretraining=False,
                MGPU=False):
    """
    :param model_class: model class. e.g. vgg, resnet, senet
    :param preprocess_fun: preprocess function. e.g. mel, mfcc, raw wave
    :param is_1d: boolean. True for conv1d models and false for conv2d
    :param reshape_size: int. only for conv2d, reshape the image size
    :param BATCH_SIZE: batch size.
    :param epochs: number of epochs
    :param CODER: string for saving and loading model/files
    :param preprocess_param: parameters for preprocessing function
    :param bagging_num: number of training per model, aka bagging models
    :param semi_train_path: path to semi supervised learning file.
    :param pretrained: path to pretrained model
    :param pretraining: boolean. if this is pretraining
    :param MGPU: whether using multiple gpus
    """
    def get_model(model=model_class, m=MGPU, pretrained=pretrained):
        mdl = torch.nn.DataParallel(model()) if m else model()
        if not pretrained:
            return mdl
        else:
            print("load pretrained model here...")
            mdl.load_state_dict(torch.load(pretrained))
            if 'vgg' in pretrained:
                fixed_layers = list(mdl.features)
                for l in fixed_layers:
                    for p in l.parameters():
                        p.requires_grad = False
            return mdl

    label_to_int, int_to_label = get_label_dict()
    for b in range(bagging_num):
        print("training model # ", b)

        loss_fn = torch.nn.CrossEntropyLoss()

        speechmodel = get_model()
        speechmodel = speechmodel.cuda()

        total_correct = 0
        num_labels = 0
        start_time = time()

        for e in range(epochs):
            print("training epoch ", e)
            learning_rate = 0.01 if e < 10 else 0.001
            optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                               speechmodel.parameters()),
                                        lr=learning_rate,
                                        momentum=0.9,
                                        weight_decay=0.00001)
            speechmodel.train()
            if semi_train_path:
                train_list = get_semi_list(words=label_to_int.keys(),
                                           sub_path=semi_train_path,
                                           test_ratio=choice(
                                               [0.2, 0.25, 0.3, 0.35]))
                print("semi training list length: ", len(train_list))
            else:
                train_list, _ = get_wav_list(words=label_to_int.keys())

            if pretraining:
                traindataset = PreDataset(label_words_dict=label_to_int,
                                          add_noise=True,
                                          preprocess_fun=preprocess_fun,
                                          preprocess_param=preprocess_param,
                                          resize_shape=reshape_size,
                                          is_1d=is_1d)
            else:
                traindataset = SpeechDataset(mode='train',
                                             label_words_dict=label_to_int,
                                             wav_list=train_list,
                                             add_noise=True,
                                             preprocess_fun=preprocess_fun,
                                             preprocess_param=preprocess_param,
                                             resize_shape=reshape_size,
                                             is_1d=is_1d)
            trainloader = DataLoader(traindataset, BATCH_SIZE, shuffle=True)
            for batch_idx, batch_data in enumerate(trainloader):
                spec = batch_data['spec']
                label = batch_data['label']
                spec, label = Variable(spec.cuda()), Variable(label.cuda())
                y_pred = speechmodel(spec)
                _, pred_labels = torch.max(y_pred.data, 1)
                correct = (pred_labels == label.data).sum()
                loss = loss_fn(y_pred, label)

                total_correct += correct
                num_labels += len(label)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            print("training loss:", 100. * total_correct / num_labels,
                  time() - start_time)

        # save model
        create_directory("model")
        torch.save(speechmodel.state_dict(),
                   "model/model_%s_%s.pth" % (CODER, b))

    if not pretraining:
        print("doing prediction...")
        softmax = Softmax()

        trained_models = [
            "model/model_%s_%s.pth" % (CODER, b) for b in range(bagging_num)
        ]

        # prediction
        _, test_list = get_wav_list(words=label_to_int.keys())
        testdataset = SpeechDataset(mode='test',
                                    label_words_dict=label_to_int,
                                    wav_list=test_list,
                                    add_noise=False,
                                    preprocess_fun=preprocess_fun,
                                    preprocess_param=preprocess_param,
                                    resize_shape=reshape_size,
                                    is_1d=is_1d)
        testloader = DataLoader(testdataset, BATCH_SIZE, shuffle=False)

        for e, m in enumerate(trained_models):
            print("predicting ", m)
            speechmodel = get_model(m=MGPU)
            speechmodel.load_state_dict(torch.load(m))
            speechmodel = speechmodel.cuda()
            speechmodel.eval()

            test_fnames, test_labels = [], []
            pred_scores = []
            # do prediction and make a submission file
            for batch_idx, batch_data in enumerate(testloader):
                spec = Variable(batch_data['spec'].cuda())
                fname = batch_data['id']
                y_pred = softmax(speechmodel(spec))
                pred_scores.append(y_pred.data.cpu().numpy())
                test_fnames += fname

            if e == 0:
                final_pred = np.vstack(pred_scores)
                final_test_fnames = test_fnames
            else:
                final_pred += np.vstack(pred_scores)
                assert final_test_fnames == test_fnames

        final_pred /= len(trained_models)
        final_labels = [int_to_label[x] for x in np.argmax(final_pred, 1)]

        test_fnames = [x.split("/")[-1] for x in final_test_fnames]

        labels = [
            'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop',
            'go', 'unknown', 'silence'
        ]
        pred_scores = pd.DataFrame(np.vstack(final_pred), columns=labels)
        pred_scores['fname'] = test_fnames

        create_directory("pred_scores")
        pred_scores.to_csv("pred_scores/%s.csv" % CODER, index=False)

        create_directory("sub")
        pd.DataFrame({
            'fname': test_fnames,
            'label': final_labels
        }).to_csv("sub/%s.csv" % CODER, index=False)
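
# A hypothetical invocation of train_model (vgg2d and preprocess_mel are
# placeholder names, not from the original code): bag three conv2d models on
# mel-spectrogram features, then write averaged predictions to sub/vgg2d_mel.csv.
train_model(model_class=vgg2d,
            preprocess_fun=preprocess_mel,
            is_1d=False,
            reshape_size=128,
            BATCH_SIZE=64,
            epochs=15,
            CODER='vgg2d_mel',
            bagging_num=3)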
Beispiel #20
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-data", help="data yaml file")
    parser.add_argument("-data_path",
                        default='',
                        type=str,
                        help="path of data files")
    parser.add_argument("-seed_model", help="the seed nerual network model")
    parser.add_argument("-exp_dir", help="the directory to save the outputs")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument("-criterion",
                        type=str,
                        choices=["mmi", "mpfe", "smbr"],
                        help="set the sequence training crtierion")
    parser.add_argument(
        "-trans_model",
        help="the HMM transistion model, used for lattice generation")
    parser.add_argument(
        "-prior_path",
        help="the prior for decoder, usually named as final.occs in kaldi setup"
    )
    parser.add_argument(
        "-den_dir",
        help="the decoding graph directory to find HCLG and words.txt files")
    parser.add_argument("-lr", type=float, help="set the learning rate")
    parser.add_argument("-ce_ratio",
                        default=0.1,
                        type=float,
                        help="the ratio for ce regularization")
    parser.add_argument("-momentum",
                        default=0,
                        type=float,
                        help="set the momentum")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads",
                        default=0,
                        type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm",
                        default=5,
                        type=float,
                        help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size",
                        default=100,
                        type=float,
                        help="process n hours of data per sweep (default:60)")
    parser.add_argument("-num_epochs",
                        default=1,
                        type=int,
                        help="number of training epochs (default:1)")
    parser.add_argument('-print_freq',
                        default=10,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('-save_freq',
                        default=1000,
                        type=int,
                        metavar='N',
                        help='save model frequency (default: 1000)')

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    config['data_path'] = args.data_path

    config["sweep_size"] = args.sweep_size

    print("pytorch version:{}".format(th.__version__))

    with open(args.data) as f:
        data = yaml.safe_load(f)
        config["source_paths"] = [j for i, j in data['clean_source'].items()]

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    hvd.init()

    th.cuda.set_device(hvd.local_rank())

    print("Run experiments with world size {}".format(hvd.size()))

    dataset = SpeechDataset(config)
    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)
            dataset.transform = transform

    train_dataloader = SeqDataloader(dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.data_loader_threads,
                                     distributed=True,
                                     test_only=False)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    model.cuda()

    # setup the optimizer
    optimizer = th.optim.SGD(model.parameters(),
                             lr=args.lr,
                             momentum=args.momentum)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)
        state_dict = checkpoint['model']
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # strip the 'module.' prefix added by DataParallel
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)
        print("=> loaded checkpoint '{}' ".format(args.seed_model))
    else:
        sys.stderr.write('ERROR: The model file %s does not exist!\n' %
                         (args.seed_model))
        sys.exit(1)

    HCLG = args.den_dir + "/HCLG.fst"
    words_txt = args.den_dir + "/words.txt"
    silence_phones = args.den_dir + "/phones/silence.csl"

    if not os.path.isfile(HCLG):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' % (HCLG))
        sys.exit(1)

    if not os.path.isfile(words_txt):
        sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' %
                         (words_txt))
        sys.exit(1)

    if not os.path.isfile(silence_phones):
        sys.stderr.write('ERROR: The silence phone file %s does not exist!\n' %
                         (silence_phones))
        sys.exit(1)
    with open(silence_phones) as f:
        silence_ids = [int(i) for i in f.readline().strip().split(':')]

    if os.path.isfile(args.trans_model):
        trans_model = kaldi_hmm.TransitionModel()
        with kaldi_util.io.xopen(args.trans_model) as ki:
            trans_model.read(ki.stream(), ki.binary)
    else:
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' %
                         (args.trans_model))
        sys.exit(1)

    # now we can setup the decoder
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = config["decoder_config"]["beam"]
    decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"]
    decoder_opts.max_active = config["decoder_config"]["max_active"]
    acoustic_scale = config["decoder_config"]["acoustic_scale"]
    decoder_opts.determinize_lattice = False  # produce raw state-level lattices instead of compact lattices
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        HCLG,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    prior = kaldi_util.io.read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)
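    # Dividing by the state priors (occupancy counts) turns network posteriors
    # into scaled likelihoods for HMM decoding; it is kept in log space so it
    # can be subtracted directly from log-softmax outputs.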

    model.train()
    for epoch in range(args.num_epochs):

        run_train_epoch(model, optimizer, log_prior.cuda(), train_dataloader,
                        epoch, asr_decoder, trans_model, silence_ids, args)

        # save model
        if hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.se.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
Beispiel #21
0
    train_df = pd.read_csv(os.path.join(dataset_dir, 'train_df.csv'),
                           names=['path', 'sent'])
    train_df = train_df.dropna(how='any')
    print(train_df.head())
    # test_df = pd.read_csv('test_df.csv', names=['id', 'sent'])

    save_file = os.path.join('save', 'chars')
    chars = get_chars('chinese', save_file, train_df)
    char_to_token = {c: i for i, c in enumerate(chars)}
    token_to_char = {i: c for c, i in char_to_token.items()}
    sos_token = char_to_token['<sos>']
    eos_token = char_to_token['<eos>']
    pad_token = char_to_token['<pad>']

    train_dataset = SpeechDataset(train_df, dataset_dir, char_to_token)
    train_loader = AudioDataLoader(pad_token,
                                   train_dataset,
                                   batch_size=32,
                                   shuffle=True,
                                   drop_last=True)

    # #test_dataset = SpeechDataset(test_df, dataset_dir)
    # #test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

    input_size = 128  # num rows in the spectrogram
    hidden_dim = 64  # hidden units in each LSTM direction
    num_layers = 3
    dropout = 0.1
    layer_norm = False
    encoder = Listener(input_size,
Beispiel #22
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-exp_dir")
    parser.add_argument("-dataPath",
                        default='',
                        type=str,
                        help="path of data files")
    parser.add_argument("-train_config")
    parser.add_argument("-data_config")
    parser.add_argument("-lr",
                        default=0.0001,
                        type=float,
                        help="Override the LR in the config")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads",
                        default=0,
                        type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm",
                        default=5,
                        type=float,
                        help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size",
                        default=200,
                        type=float,
                        help="process n hours of data per sweep (default:200)")
    parser.add_argument("-num_epochs",
                        default=1,
                        type=int,
                        help="number of training epochs (default:1)")
    parser.add_argument("-global_mvn",
                        default=False,
                        type=bool,
                        help="if apply global mean and variance normalization")
    parser.add_argument(
        "-resume_from_model",
        type=str,
        help="the model from which you want to resume training")
    parser.add_argument("-dropout", type=float, help="set the dropout ratio")
    parser.add_argument("-aneal_lr_epoch",
                        default=2,
                        type=int,
                        help="start to aneal the learning rate from this epoch"
                        )  # aneal -> anneal?
    parser.add_argument("-aneal_lr_ratio",
                        default=0.5,
                        type=float,
                        help="the ratio to aneal the learning rate")
    parser.add_argument('-p',
                        '--print-freq',
                        default=100,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 100)')
    parser.add_argument('-hvd',
                        action='store_true',
                        help="whether to use horovod for training")

    args = parser.parse_args()

    with open(args.train_config) as f:
        config = yaml.safe_load(f)

    config["sweep_size"] = args.sweep_size
    with open(args.data_config) as f:
        data = yaml.safe_load(f)
        config["source_paths"] = [j for i, j in data['clean_source'].items()]
        if 'dir_noise' in data:
            config["dir_noise_paths"] = [
                j for i, j in data['dir_noise'].items()
            ]
        if 'rir' in data:
            config["rir_paths"] = [j for i, j in data['rir'].items()]

    config['data_path'] = args.dataPath

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    if args.hvd:
        import horovod.torch as hvd
        hvd.init()
        th.cuda.set_device(hvd.local_rank())
        print("Run experiments with world size {}".format(hvd.size()))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    trainset = SpeechDataset(config)
    train_dataloader = ChunkDataloader(trainset,
                                       batch_size=args.batch_size,
                                       distributed=args.hvd,
                                       num_workers=args.data_loader_threads)

    if args.global_mvn:
        transform = GlobalMeanVarianceNormalization()
        print("Estimating global mean and variance of feature vectors...")
        transform.learn_mean_and_variance_from_train_loader(
            trainset, trainset.stream_idx_for_transform, n_sample_to_use=2000)
        trainset.transform = transform
        print("Global mean and variance transform trained successfully!")

        with open(args.exp_dir + "/transform.pkl", 'wb') as f:
            pickle.dump(transform, f, pickle.HIGHEST_PROTOCOL)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    # Start training
    th.backends.cudnn.enabled = True
    if th.cuda.is_available():
        model.cuda()

    # optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    if args.hvd:
        # Broadcast parameters and optimizer state from rank 0 to all other processes.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        # Add Horovod Distributed Optimizer
        optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters())

    # criterion
    criterion = nn.CrossEntropyLoss(ignore_index=-100)
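    # ignore_index=-100 (also PyTorch's default for CrossEntropyLoss) makes
    # padded target frames contribute nothing to the loss or its gradients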

    start_epoch = 0
    if args.resume_from_model:

        assert os.path.isfile(args.resume_from_model), \
            "ERROR: model file {} does not exist!".format(args.resume_from_model)

        checkpoint = th.load(args.resume_from_model)
        state_dict = checkpoint['model']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(state_dict)
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' ".format(args.resume_from_model))

    model.train()
    for epoch in range(start_epoch, args.num_epochs):

        # anneal the learning rate
        if epoch > args.aneal_lr_epoch:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= args.aneal_lr_ratio

        run_train_epoch(model, optimizer, criterion, train_dataloader, epoch,
                        args)

        # save model
        if not args.hvd or hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
Beispiel #23
0
def test():
    args = parser.parse_args()
    cf = ConfigParser.ConfigParser()
    cf.read(args.conf)
    USE_CUDA = cf.getboolean('Training', 'USE_CUDA')
    model_path = cf.get('Model', 'model_file')
    data_dir = cf.get('Data', 'data_dir')
    beam_width = cf.getint('Decode', 'beam_width')
    package = torch.load(model_path)
    
    rnn_param = package["rnn_param"]
    num_class = package["num_class"]
    n_feats = package['epoch']['n_feats']
    drop_out = package['_drop_out']

    decoder_type = cf.get('Decode', 'decoder_type')
    data_set = cf.get('Decode', 'eval_dataset')

    test_dataset = SpeechDataset(data_dir, data_set=data_set)
    
    model = CTC_Model(rnn_param=rnn_param, num_class=num_class, drop_out=drop_out)
        
    test_loader = SpeechDataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=4, pin_memory=False)
    
    model.load_state_dict(package['state_dict'])
    model.eval()
    
    if USE_CUDA:
        model = model.cuda()

    if decoder_type == 'Greedy':
        decoder = GreedyDecoder(int2char, space_idx=len(int2char) - 1,
                                blank_index=0)
    else:
        decoder = BeamDecoder(int2char, beam_width=beam_width, blank_index=0,
                              space_idx=len(int2char) - 1)

    total_wer = 0
    total_cer = 0
    start = time.time()
    for data in test_loader:
        inputs, target, input_sizes, input_size_list, target_sizes = data 
        inputs = inputs.transpose(0,1)
        inputs = Variable(inputs, volatile=True, requires_grad=False)
        
        if USE_CUDA:
            inputs = inputs.cuda()
        
        inputs = nn.utils.rnn.pack_padded_sequence(inputs, input_size_list)
        probs = model(inputs)

        probs = probs.data.cpu()
        decoded = decoder.decode(probs, input_size_list)
        targets = decoder._unflatten_targets(target, target_sizes)
        labels = decoder._process_strings(decoder._convert_to_strings(targets))

        for x in range(len(labels)):
            print("origin : " + labels[x])
            print("decoded: " + decoded[x])
        cer = 0
        wer = 0
        for x in range(len(labels)):
            cer += decoder.cer(decoded[x], labels[x])
            wer += decoder.wer(decoded[x], labels[x])
            decoder.num_word += len(labels[x].split())
            decoder.num_char += len(labels[x])
        total_cer += cer
        total_wer += wer
    CER = float(total_cer) / decoder.num_char * 100  # edit distance / ref chars
    WER = float(total_wer) / decoder.num_word * 100  # edit distance / ref words
    print("Character error rate on test set: %.4f" % CER)
    print("Word error rate on test set: %.4f" % WER)
    end = time.time()
    time_used = (end - start) / 60.0
    print("time used for decode %d sentences: %.4f minutes." % (len(test_dataset), time_used))