Example #1
 def __init__(self, pase_cfg, pase_ckpt, pase_ft,
              num_inputs,
              pase_feats,
              save_path,
              global_mode=False,
              stft_cfg=None,
              stft_ckpt=None,
              name='PASEInjector'):
     super().__init__(name=name)
     self.pase = wf_builder(pase_cfg)
     if pase_ckpt is not None:
         self.pase.load_pretrained(pase_ckpt, load_last=True, verbose=True)
     """
     if num_inputs != pase_feats:
         # make a projector
         self.pase_W = nn.Conv1d(num_inputs, pase_feats, 1)
     """
     self.global_mode = global_mode
     if pase_ft:
         #self.saver = Saver(self, save_path,
         #                   prefix='PASE')
         self.pase.train()
     else:
         self.pase.eval()
     if stft_cfg is not None:
         stft_cfg['frontend_cfg'] = pase_cfg
         stft_cfg['frontend_ckpt'] = pase_ckpt
         self.stft_net = DCRegression(**stft_cfg)
         if stft_ckpt is not None:
             self.stft_net.load_pretrained(stft_ckpt, 
                                           load_last=True,
                                           verbose=True)
Example #2
 def __init__(self, frontend_cfg, num_outputs, 
              frontend_ckpt=None, ft_fe=False,
              rnn_size=512, rnn_layers=3,
              rnn_type='lstm',
              cuda=False,
              name='DCRegression'):
     super().__init__(name=name)
     self.frontend = wf_builder(frontend_cfg)
     if frontend_ckpt is not None:
         self.frontend.load_pretrained(frontend_ckpt,
                                       load_last=True,
                                       verbose=True)
     self.ft_fe = ft_fe
     ninp = self.frontend.emb_dim
     #self.rnn = nn.LSTM(ninp, rnn_size, rnn_layers,
     #                   batch_first=True, bidirectional=True)
     self.rnn = build_rnn_block(ninp, rnn_size, rnn_layers,
                                rnn_type, use_cuda=cuda)
     # Build skip connection adapter
     self.W = nn.Conv1d(ninp, 2 * rnn_size, 1)
     self.backend = nn.Sequential(
         nn.Conv1d(2 * rnn_size, 2 * rnn_size, 1),
         nn.ReLU(inplace=True),
         nn.Conv1d(2 * rnn_size, num_outputs, 1)
     )
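DCRegression declares a frontend, a bidirectional RNN, a 1x1 skip adapter W and a convolutional backend, but the forward pass is not shown. A hedged reconstruction of how these modules plausibly chain, assuming the frontend emits (B, emb_dim, T) and that build_rnn_block behaves like the commented-out batch-first bidirectional LSTM:

 # Hypothetical forward, inferred from the module shapes above.
 def forward(self, x):
     h = self.frontend(x)                   # (B, ninp, T)
     if not self.ft_fe:
         h = h.detach()                     # keep the frontend frozen
     out, _ = self.rnn(h.transpose(1, 2))   # (B, T, 2 * rnn_size)
     out = out.transpose(1, 2) + self.W(h)  # skip-connection adapter
     return self.backend(out)               # (B, num_outputs, T)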
Example #3
    def __init__(self,
                 pase_cfg,
                 pase_cp=None,
                 n_z=256,
                 proj_size=0,
                 ncoef=100,
                 sm_type='none'):
        super(global_MLP, self).__init__()

        self.encoder = wf_builder(pase_cfg)
        if pase_cp:
            self.encoder.load_pretrained(pase_cp,
                                         load_last=True,
                                         verbose=False)

        self.model = nn.Sequential(nn.Linear(ncoef, 512), nn.BatchNorm1d(512),
                                   nn.ReLU(inplace=True), nn.Linear(512, 512),
                                   nn.BatchNorm1d(512), nn.ReLU(inplace=True),
                                   nn.Linear(512, n_z))

        if proj_size > 0 and sm_type != 'none':
            if sm_type == 'softmax':
                self.out_proj = Softmax(input_features=n_z,
                                        output_features=proj_size)
            elif sm_type == 'am_softmax':
                self.out_proj = AMSoftmax(input_features=n_z,
                                          output_features=proj_size)
            else:
                raise NotImplementedError
Example #4
 def __init__(self, PASE_cfg, MLP_cfg, PASE_ckpt, context_left=0, context_right=0):
     super().__init__()
     input_dim = 4 * PASE_cfg['rnn_dim'] * (1 + context_left + context_right)
     self.context_left = context_left
     self.context_right = context_right
     
     self.pase = wf_builder(PASE_cfg)
     self.pase.load_pretrained(PASE_ckpt, load_last=True, verbose=False)
     self.decoder = MLP(MLP_cfg, input_dim)
Example #5
def load_pase_plus(PASE_FOLDER,
                   parameters='trained_model/PASE+_parameters.ckpt'):

    sys.path.append(PASE_FOLDER)
    from pase.models.frontend import wf_builder

    pase = wf_builder(join(PASE_FOLDER, 'cfg/frontend/PASE+.cfg'))
    pase.eval()
    pase.load_pretrained(parameters, load_last=True, verbose=True)
    return pase.to(CUDA0)
Example #6
    def __init__(self, PASE_cfg, LSTM_cfg, PASE_ckpt):
        super().__init__()

        self.pase = wf_builder(PASE_cfg)
        self.pase.load_pretrained(PASE_ckpt, load_last=True, verbose=False)

        input_dim = 4 * PASE_cfg['rnn_dim']
        
        self.decoder = nn.ModuleList([LSTM_cudnn(LSTM_cfg, input_dim),
                                      nn.Linear(136, 136)])
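nn.ModuleList only registers its children; it has no forward of its own, so the two decoder stages must be chained by hand. A short sketch, assuming LSTM_cudnn returns a (B, T, 136) tensor so it matches the 136-unit linear layer:

    # Hypothetical forward; LSTM_cudnn's return signature is assumed.
    def forward(self, feats):
        h = self.decoder[0](feats)   # recurrent stage
        return self.decoder[1](h)    # per-frame 136-dim projection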
Example #7
    def __init__(self, ckpt, config, **kwargs):
        super(UpstreamExpert, self).__init__() 

        self.pase = wf_builder(config)
        self.pase.load_pretrained(ckpt, load_last=True, verbose=False)

        # pseudo_input = torch.randn(1, 1, SAMPLE_RATE * EXAMPLE_SEC)
        # r = self.pase(pseudo_input) # size will be (1, 256, 625), which are 625 frames of 256 dims each
        self.output_dim = 256 # r.size(1)
        raise RuntimeError('There are some import errors with the PASE repo, see this issue: https://github.com/santi-pdp/pase/issues/114.')
Example #8
    def __init__(self, ckpt, model_config, **kwargs):
        super(UpstreamExpert, self).__init__()

        self.pase = wf_builder(model_config)
        self.pase.load_pretrained(ckpt, load_last=True, verbose=False)

        # PASE cannot easily switch between CPU/GPU for now
        self.pase.cuda()

        pseudo_input = torch.randn(1, 1, SAMPLE_RATE * EXAMPLE_SEC).cuda()
        self.output_dim = self.pase(pseudo_input).size(1)
Example #9
def retrieve_model_and_datasets(encoder_cfg, model_cfg, data_cfg, train_list,
                                valid_list, test_list):

    with open(model_cfg, 'r') as cfg_f:
        model_cfg = json.load(cfg_f)

    if encoder_cfg is not None:
        with open(encoder_cfg, 'r') as cfg_f:
            encoder_cfg = json.load(cfg_f)

    with open(data_cfg, 'r') as cfg_f:
        data_cfg = json.load(cfg_f)

    # prepare the three datasets; train, valid and test
    splits = [train_list, valid_list, test_list]
    cls_name = model_cfg.pop('name')
    dset_name = data_cfg.pop('name')
    if 'chunk_cfg' in data_cfg:
        chunker = SingleChunkWav(**data_cfg.pop('chunk_cfg'))
        data_cfg['chunker'] = chunker

    if encoder_cfg is not None:
        name = encoder_cfg.pop('name')

        if name == 'pase' or name == 'PASE':
            if 'ckpt' in encoder_cfg:
                ckpt = encoder_cfg.pop('ckpt')
            else:
                ckpt = None
            encoder = wf_builder(encoder_cfg)
            if ckpt is not None:
                encoder.load_pretrained(ckpt, load_last=True, verbose=True)
            model_cfg['frontend'] = encoder
        elif name == 'tdnn' or name == 'TDNN':
            model_cfg['xvector'] = True
            encoder = TDNN(**encoder_cfg)
            model_cfg['frontend'] = encoder
        else:
            raise ValueError('Unrecognized encoder: {}'.format(name))

    model = getattr(pmods, cls_name)(**model_cfg)
    datasets = []
    for si, split in enumerate(splits, start=1):
        if split is None:
            # skip this split (validation for instance)
            datasets.append(None)
        else:
            data_cfg['split_list'] = split
            if si >= len(splits) - 1 and 'chunker' in data_cfg:
                # remove the chunker for the valid and test splits
                del data_cfg['chunker']
            datasets.append(getattr(pdsets, dset_name)(**data_cfg))
    return model, datasets
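A hedged usage sketch of the helper above; every path below is a hypothetical placeholder, and passing None for a split list returns None in that slot:

# Hypothetical invocation; file names are placeholders.
model, (tr_dset, va_dset, te_dset) = retrieve_model_and_datasets(
    encoder_cfg='cfg/encoder.cfg',   # JSON with a 'name' key ('pase' or 'tdnn')
    model_cfg='cfg/model.cfg',       # JSON naming the pmods class under 'name'
    data_cfg='cfg/data.cfg',         # JSON naming the pdsets class under 'name'
    train_list='lists/train.scp',
    valid_list=None,                 # skip the validation split
    test_list='lists/test.scp')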
Example #10
    def __init__(self,
                 pase_cfg,
                 pase_cp=None,
                 n_z=256,
                 layers=[2, 2, 2, 2],
                 block=PreActBlock,
                 proj_size=0,
                 ncoef=23,
                 sm_type='none'):
        self.in_planes = 16
        super(ResNet_18, self).__init__()

        self.model = nn.ModuleList()

        self.model.append(
            nn.Sequential(
                nn.Conv2d(1,
                          16,
                          kernel_size=(2 * ncoef, 3),
                          stride=(1, 1),
                          padding=(0, 1),
                          bias=False), nn.BatchNorm2d(16), nn.ReLU()))

        self.model.append(self._make_layer(block, 64, layers[0], stride=1))
        self.model.append(self._make_layer(block, 128, layers[1], stride=2))
        self.model.append(self._make_layer(block, 256, layers[2], stride=2))
        self.model.append(self._make_layer(block, 512, layers[3], stride=2))

        self.initialize_params()

        self.pooling = SelfAttention(block.expansion * 512)

        self.post_pooling = nn.Sequential(
            nn.Conv1d(block.expansion * 512 * 2, 512, 1), nn.BatchNorm1d(512),
            nn.ReLU(inplace=True), nn.Conv1d(512, 512, 1), nn.BatchNorm1d(512),
            nn.ReLU(inplace=True), nn.Conv1d(512, n_z, 1))

        if proj_size > 0 and sm_type != 'none':
            if sm_type == 'softmax':
                self.out_proj = Softmax(input_features=n_z,
                                        output_features=proj_size)
            elif sm_type == 'am_softmax':
                self.out_proj = AMSoftmax(input_features=n_z,
                                          output_features=proj_size)
            else:
                raise NotImplementedError

        ## Load after initialize main model params
        self.encoder = wf_builder(pase_cfg)
        if pase_cp:
            self.encoder.load_pretrained(pase_cp,
                                         load_last=True,
                                         verbose=False)
Example #11
    def __init__(self,
                 pase_cfg,
                 pase_cp=None,
                 n_layers=4,
                 n_z=256,
                 proj_size=0,
                 ncoef=23,
                 sm_type='none'):
        super(pyr_rnn, self).__init__()

        self.model = nn.ModuleList(
            [nn.LSTM(2 * ncoef, 256, 1, bidirectional=True, batch_first=True)])

        for i in range(1, n_layers):
            self.model.append(
                nn.LSTM(256 * 2 * 2,
                        256,
                        1,
                        bidirectional=True,
                        batch_first=True))

        self.pooling = StatisticalPooling()

        self.post_pooling = nn.Sequential(nn.Conv1d(256 * 2 * 2 * 2, 512, 1),
                                          nn.BatchNorm1d(512),
                                          nn.ReLU(inplace=True),
                                          nn.Conv1d(512, 512, 1),
                                          nn.BatchNorm1d(512),
                                          nn.ReLU(inplace=True),
                                          nn.Conv1d(512, n_z, 1))

        self.initialize_params()

        self.attention = SelfAttention(512)

        if proj_size > 0 and sm_type != 'none':
            if sm_type == 'softmax':
                self.out_proj = Softmax(input_features=n_z,
                                        output_features=proj_size)
            elif sm_type == 'am_softmax':
                self.out_proj = AMSoftmax(input_features=n_z,
                                          output_features=proj_size)
            else:
                raise NotImplementedError

        self.encoder = wf_builder(pase_cfg)
        if pase_cp:
            self.encoder.load_pretrained(pase_cp,
                                         load_last=True,
                                         verbose=False)
Example #12
    def __init__(self,
                 pase_cfg,
                 pase_cp=None,
                 n_z=256,
                 proj_size=0,
                 ncoef=100,
                 sm_type='none'):
        super(TDNN, self).__init__()

        self.encoder = wf_builder(pase_cfg)
        if pase_cp:
            self.encoder.load_pretrained(pase_cp,
                                         load_last=True,
                                         verbose=False)

        self.model = nn.Sequential(
            nn.BatchNorm1d(2 * ncoef), nn.Conv1d(2 * ncoef, 512, 5, padding=2),
            nn.BatchNorm1d(512), nn.ReLU(inplace=True),
            nn.Conv1d(512, 512, 3, dilation=2, padding=2), nn.BatchNorm1d(512),
            nn.ReLU(inplace=True), nn.Conv1d(512,
                                             512,
                                             3,
                                             dilation=3,
                                             padding=3), nn.BatchNorm1d(512),
            nn.ReLU(inplace=True), nn.Conv1d(512, 512, 1), nn.BatchNorm1d(512),
            nn.ReLU(inplace=True), nn.Conv1d(512, 1500, 1),
            nn.BatchNorm1d(1500), nn.ReLU(inplace=True))

        self.pooling = StatisticalPooling()

        self.post_pooling = nn.Sequential(nn.Conv1d(3000, 512, 1),
                                          nn.BatchNorm1d(512),
                                          nn.ReLU(inplace=True),
                                          nn.Conv1d(512, 512, 1),
                                          nn.BatchNorm1d(512),
                                          nn.ReLU(inplace=True),
                                          nn.Conv1d(512, n_z, 1))

        if proj_size > 0 and sm_type != 'none':
            if sm_type == 'softmax':
                self.out_proj = Softmax(input_features=n_z,
                                        output_features=proj_size)
            elif sm_type == 'am_softmax':
                self.out_proj = AMSoftmax(input_features=n_z,
                                          output_features=proj_size)
            else:
                raise NotImplementedError
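The 3000-channel input of post_pooling follows from the pooling step: the TDNN trunk ends in 1500 channels and statistical pooling concatenates their temporal mean and standard deviation (2 x 1500 = 3000). A minimal stand-in, assuming StatisticalPooling matches the usual x-vector definition:

import torch

# Hypothetical stand-in for StatisticalPooling: concatenate per-channel
# temporal mean and std, keeping a length-1 time axis for the 1x1 convs.
def statistical_pooling(h):                 # h: (B, 1500, T)
    mu = h.mean(dim=2, keepdim=True)        # (B, 1500, 1)
    sigma = h.std(dim=2, keepdim=True)      # (B, 1500, 1)
    return torch.cat([mu, sigma], dim=1)    # (B, 3000, 1)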
Example #13
    def __init__(self, res_ckpt_path, pase_cfg_path, pase_ckpt_path):
        super().__init__()
        m3 = model.model3.SVHFNet()
        map_location = None if torch.cuda.is_available() else 'cpu'
        check_point = torch.load(res_ckpt_path, map_location=map_location)
        state_dict = check_point['net']
        m3.load_state_dict(state_dict)
        self.vis_stream = m3.vis_stream
        pase = wf_builder(pase_cfg_path)
        pase.load_pretrained(pase_ckpt_path, load_last=True, verbose=True)
        self.aud_stream = AudioStream(pase)

        self.fc8 = nn.Linear(3072, 1024)
        self.bn8 = nn.BatchNorm1d(1024)
        self.relu8 = nn.ReLU()
        self.fc9 = nn.Linear(1024, 512)
        self.bn9 = nn.BatchNorm1d(512)
        self.relu9 = nn.ReLU()
        self.fc10 = nn.Linear(512, 2)
Example #14
def main(opts):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # build network
    minions_cfg = pase_parser(opts.minions_cfg, do_losses=False)
    remove_Dcfg(minions_cfg)
    pase = wf_builder(opts.cfg)
    model = Waveminionet(minions_cfg=minions_cfg,
                         num_devices=0,
                         pretrained_ckpts=opts.ckpt,
                         z_minion=False,
                         frontend=pase)
    model.eval()
    model.to(device)
    transf = Reverb(['data/omologo_revs/IRs_2/IR_223108.imp'], ir_fmt='imp')
    minion = model.minions[0]
    minion.loss = None
    pase = model.frontend
    #print(opts.in_files)
    in_files = [os.path.join(opts.files_root, inf) for inf in opts.in_files]
    wavs = []
    wfiles = []
    max_len = 0
    print('Total batches: ', len(in_files) // opts.batch_size)
    with torch.no_grad():
        for wi, wfile in tqdm.tqdm(enumerate(in_files, start=1),
                                   total=len(in_files)):
            wfiles.append(wfile)
            wav, rate = sf.read(wfile)
            wavs.append(wav)
            if len(wav) > max_len:
                max_len = len(wav)
            if wi % opts.batch_size == 0 or wi >= len(in_files):
                lens = []
                batch = []
                for bi in range(len(wavs)):
                    P_ = max_len - len(wavs[bi])
                    lens.append(len(wavs[bi]))
                    if P_ > 0:
                        pad = np.zeros((P_))
                        wav_ = np.concatenate((wavs[bi], pad), axis=0)
                    else:
                        wav_ = wavs[bi]
                    wav = torch.FloatTensor(wav_)
                    wav_r = transf({'chunk': wav})
                    batch.append(wav_r['chunk'].view(1, 1, -1))
                batch = torch.cat(batch, dim=0)
                x = batch.to(device)
                h = pase(x)
                #print('frontend size: ', h.size())
                y = minion(h).cpu()
                for bi in range(len(wavs)):
                    bname = os.path.basename(wfiles[bi])
                    y_ = y[bi].squeeze().data.numpy()
                    y_ = y_[:lens[bi]]
                    sf.write(os.path.join(opts.out_path, '{}'.format(bname)),
                             y_, 16000)
                    x_ = x[bi].squeeze().data.numpy()
                    x_ = x_[:lens[bi]]
                    sf.write(
                        os.path.join(opts.out_path, 'input_{}'.format(bname)),
                        x_, 16000)
                max_len = 0
                wavs = []
                wfiles = []
                batch = None
    """
Example #15
options['dnn_use_batchnorm'] = 'True,True,True,True,True,False'
options['dnn_use_laynorm'] = 'False,False,False,False,False,False'
options['dnn_use_laynorm_inp'] = 'False'
options['dnn_use_batchnorm_inp'] = 'False'
options['dnn_act'] = 'linear,relu,relu,relu,relu,softmax'



device = get_freer_gpu()


# output file creation
text_file = open(output_file, "w")

# Loading PASE
pase = wf_builder(pase_cfg)
pase.load_pretrained(pase_model, load_last=True, verbose=False)
pase.to(device)
pase.eval()

# reading the training signals
print("Waveform reading...")
fea = {}
for wav_file in tr_lst:
    [signal, fs] = sf.read(data_folder + '/' + wav_file)
    signal = signal / np.max(np.abs(signal))
    signal = signal.astype(np.float32)

    fea_id = wav_file.split('/')[-2] + '_' + wav_file.split('/')[-1].split('.')[0]
    fea[fea_id] = torch.from_numpy(signal).float().to(device).view(1, 1, -1)
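With the waveforms cached as (1, 1, T) tensors, feature extraction is one forward call per utterance; a hedged continuation using the variables above:

# Hypothetical continuation: run the frozen encoder over each utterance.
with torch.no_grad():
    for fea_id, wav in fea.items():
        feats = pase(wav)   # (1, emb_dim, n_frames)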
Example #16
def main(opts):
    CUDA = torch.cuda.is_available() and not opts.no_cuda
    device = 'cuda' if CUDA else 'cpu'
    torch.manual_seed(opts.seed)
    random.seed(opts.seed)
    np.random.seed(opts.seed)
    if device == 'cuda':
        torch.cuda.manual_seed_all(opts.seed)
    spk2idx = load_spk2idx(opts.spk2idx)
    NSPK = len(set(spk2idx.values()))
    # Build Model
    fe = wf_builder(opts.fe_cfg)
    if opts.train:
        print('=' * 20)
        print('Entering TRAIN mode')
        print('=' * 20)
        with open(os.path.join(opts.save_path, 'train.opts'), 'w') as cfg_f:
            cfg_f.write(json.dumps(vars(opts), indent=2))
        # Open up guia and split valid
        with open(opts.train_guia) as tr_guia_f:
            tr_files = [l.rstrip() for l in tr_guia_f]

        if opts.test_guia is not None:
            with open(opts.test_guia) as te_guia_f:
                te_files = [l.rstrip() for l in te_guia_f]

        tr_files_, va_files = build_valid_list(tr_files,
                                               spk2idx,
                                               va_split=opts.va_split)
        # compute total samples dur
        beg_t = timeit.default_timer()
        tr_durs, sr = compute_utterances_durs(tr_files_, opts.data_root)
        va_durs, _ = compute_utterances_durs(va_files, opts.data_root)
        train_dur = np.sum(tr_durs)
        valid_dur = np.sum(va_durs)
        end_t = timeit.default_timer()
        print('Read tr/va {:.1f} s/{:.1f} s in {} s'.format(
            train_dur / sr, valid_dur / sr, end_t - beg_t))
        # Build Datasets
        dset = LibriSpkIDDataset(opts.data_root, tr_files_, spk2idx)
        va_dset = LibriSpkIDDataset(opts.data_root, va_files, spk2idx)
        cc = WavCollater(max_len=opts.max_len)
        #cc_vate = WavCollater(max_len=None)
        cc_vate = cc
        dloader = DataLoader(dset,
                             batch_size=opts.batch_size,
                             collate_fn=cc,
                             shuffle=True)
        va_dloader = DataLoader(va_dset,
                                batch_size=opts.batch_size,
                                collate_fn=cc_vate,
                                shuffle=False)
        tr_bpe = (train_dur // opts.max_len) // opts.batch_size
        va_bpe = (valid_dur // opts.max_len) // opts.batch_size
        if opts.test_guia is not None:
            te_dset = LibriSpkIDDataset(opts.data_root, te_files, spk2idx)
            te_dloader = DataLoader(te_dset,
                                    batch_size=opts.batch_size,
                                    collate_fn=cc_vate,
                                    shuffle=False)
        if opts.fe_ckpt is not None:
            fe.load_pretrained(opts.fe_ckpt, load_last=True, verbose=True)
        else:
            print('*' * 50)
            print('** WARNING: TRAINING WITHOUT PRETRAINED WEIGHTS FOR THE '
                  'FRONT-END **')
            print('*' * 50)
            # Enforce training the frontend
            opts.ft_fe = True
        model = select_model(opts, fe, NSPK)
        model.to(device)
        print(model)
        # Build optimizer and scheduler
        opt = select_optimizer(opts, model)
        sched = select_scheduler(opts, opt)
        # Make writer
        writer = SummaryWriter(opts.save_path)
        best_val_acc = 0
        # flag for saver
        best_val = False
        for epoch in range(1, opts.epoch + 1):
            train_epoch(dloader,
                        model,
                        opt,
                        epoch,
                        opts.log_freq,
                        writer=writer,
                        device=device,
                        bpe=tr_bpe)
            eloss, eacc = eval_epoch(va_dloader,
                                     model,
                                     epoch,
                                     opts.log_freq,
                                     writer=writer,
                                     device=device,
                                     bpe=va_bpe,
                                     key='valid')
            if opts.sched_mode == 'step':
                sched.step()
            else:
                sched.step(eacc)
            if eacc > best_val_acc:
                print('*' * 40)
                print('New best val acc: {:.3f} => {:.3f}.'
                      ''.format(best_val_acc, eacc))
                print('*' * 40)
                best_val_acc = eacc
                best_val = True
            model.save(opts.save_path, epoch - 1, best_val=best_val)
            best_val = False
            if opts.test_guia is not None:
                # Eval test on the fly whilst training/validating
                teloss, teacc = eval_epoch(te_dloader,
                                           model,
                                           epoch,
                                           opts.log_freq,
                                           writer=writer,
                                           device=device,
                                           key='test')
    if opts.test:
        print('=' * 20)
        print('Entering TEST mode')
        print('=' * 20)

        #fe = WaveFe(rnn_pool=opts.rnn_pool, emb_dim=opts.emb_dim)
        model = select_model(opts, fe, NSPK)
        model.load_pretrained(opts.test_ckpt, load_last=True, verbose=True)
        model.to(device)
        model.eval()
        with open(opts.test_guia) as te_guia_f:
            te_files = [l.rstrip() for l in te_guia_f]
            te_dset = LibriSpkIDDataset(opts.data_root, te_files, spk2idx)
            cc = WavCollater(max_len=None)
            te_dloader = DataLoader(
                te_dset,
                batch_size=1,
                #collate_fn=cc,
                shuffle=False)

            def filter_by_slens(T, slens, sfactor=160):
                dims = len(T.size())
                # extract each sequence by its length
                seqs = []
                for bi in range(T.size(0)):
                    slen = int(np.ceil(slens[bi] / sfactor))
                    if dims == 3:
                        seqs.append(T[bi, :, :slen])
                    else:
                        seqs.append(T[bi, :slen])
                return seqs

            with torch.no_grad():
                teloss = []
                teacc = []
                timings = []
                beg_t = timeit.default_timer()
                if opts.test_log_file is not None:
                    test_log_f = open(opts.test_log_file, 'w')
                    test_log_f.write('Filename\tAccuracy [%]\tError [%]\n')
                else:
                    test_log_f = None
                for bidx, batch in enumerate(te_dloader, start=1):
                    #X, Y, slen = batch
                    X, Y = batch
                    X = X.unsqueeze(1)
                    X = X.to(device)
                    Y = Y.to(device)
                    Y_ = model(X)
                    Y = Y.view(-1, 1).repeat(1, Y_.size(2))
                    #Y__seqs = filter_by_slens(Y_, slen)
                    #Y_seqs = filter_by_slens(Y, slen)
                    #assert len(Y__seqs) == len(Y_seqs)
                    #for sidx in range(len(Y__seqs)):
                    #    y_ = Y__seqs[sidx].unsqueeze(0)
                    #    y = Y_seqs[sidx].unsqueeze(0)
                    #    loss = F.nll_loss(y_, y)
                    #    teacc.append(accuracy(y_, y))
                    #    teloss.append(loss)
                    loss = F.nll_loss(Y_, Y)
                    acc = accuracy(Y_, Y)
                    if test_log_f:
                        test_log_f.write('{}\t{:.2f}\t{:.2f}\n' \
                                         ''.format(te_files[bidx - 1],
                                                   acc * 100,
                                                   100 - (acc * 100)))
                    teacc.append(accuracy(Y_, Y))
                    teloss.append(loss.item())
                    end_t = timeit.default_timer()
                    timings.append(end_t - beg_t)
                    beg_t = timeit.default_timer()
                    if bidx % 100 == 0 or bidx == 1:
                        mteloss = np.mean(teloss)
                        mteacc = np.mean(teacc)
                        mtimings = np.mean(timings)
                    print('Processed test file {}/{} mfiletime: {:.2f} s, '
                          'macc: {:.4f}, mloss: {:.2f}'
                          ''.format(bidx, len(te_dloader), mtimings, mteacc,
                                    mteloss),
                          end='\r')
                print()
                if test_log_f:
                    test_log_f.write('-' * 30 + '\n')
                    test_log_f.write('Test accuracy: ' \
                                     '{:.2f}\n'.format(np.mean(teacc) * 100))
                    test_log_f.write('Test error: ' \
                                     '{:.2f}\n'.format(100 - (np.mean(teacc) * 100)))
                    test_log_f.write('Test loss: ' \
                                     '{:.2f}\n'.format(np.mean(teloss)))
                    test_log_f.close()
                print('Test accuracy: {:.4f}'.format(np.mean(teacc)))
                print('Test loss: {:.2f}'.format(np.mean(teloss)))
Example #17
def get_pase_representations(pase_model, audio_path):
    y, fs = librosa.core.load(audio_path, sr=None)
    y = torch.tensor(y)[(None, ) * 2].to(
        device)  # unsqueeze twice at first dim
    pase_reps = pase_model(y)
    pase_reps = pase_reps.detach().cpu().numpy()
    return pase_reps


if __name__ == "__main__":
    audio_path = sys.argv[1]
    save_path = sys.argv[2]

    # load model
    pase = wf_builder('cfg/frontend/PASE+.cfg').eval()
    pase = pase.to(device)
    pase.load_pretrained('checkpoints/pase_pretrained.ckpt',
                         load_last=True,
                         verbose=True)

    # get list of speaker ids from VoxCeleb
    speaker_ids = os.listdir(audio_path)

    # get PASE representations for utterances from each speaker
    for speaker_id in tqdm(speaker_ids):
        os.makedirs(os.path.join(save_path, speaker_id), exist_ok=True)
        path_to_speaker = os.path.join(audio_path, speaker_id)

        video_ids = os.listdir(path_to_speaker)
        utt_idx = 1
Example #18
def train(opts):
    CUDA = torch.cuda.is_available() and not opts.no_cuda
    device = 'cuda' if CUDA else 'cpu'
    num_devices = 1
    np.random.seed(opts.seed)
    random.seed(opts.seed)
    torch.manual_seed(opts.seed)
    if CUDA:
        torch.cuda.manual_seed_all(opts.seed)
        num_devices = torch.cuda.device_count()
        print('[*] Using CUDA {} devices'.format(num_devices))
    else:
        print('[!] Using CPU')
    print('Seeds initialized to {}'.format(opts.seed))

    # ---------------------
    # Build Model
    frontend = wf_builder(opts.fe_cfg)
    minions_cfg = pase_parser(opts.net_cfg,
                              batch_acum=opts.batch_acum,
                              device=device,
                              frontend=frontend)
    model = Waveminionet(minions_cfg=minions_cfg,
                         adv_loss=opts.adv_loss,
                         num_devices=num_devices,
                         frontend=frontend)

    print(model)
    print('Frontend params: ', model.frontend.describe_params())
    model.to(device)
    trans = make_transforms(opts, minions_cfg)
    print(trans)
    if opts.dtrans_cfg is not None:
        with open(opts.dtrans_cfg, 'r') as dtr_cfg:
            dtr = json.load(dtr_cfg)
            #dtr['trans_p'] = opts.distortion_p
            dist_trans = config_distortions(**dtr)
            print(dist_trans)
    else:
        dist_trans = None
    # Build Dataset(s) and DataLoader(s)
    dataset = getattr(pase.dataset, opts.dataset)
    dset = dataset(opts.data_root,
                   opts.data_cfg,
                   'train',
                   transform=trans,
                   noise_folder=opts.noise_folder,
                   whisper_folder=opts.whisper_folder,
                   distortion_probability=opts.distortion_p,
                   distortion_transforms=dist_trans,
                   preload_wav=opts.preload_wav)
    dloader = DataLoader(dset,
                         batch_size=opts.batch_size,
                         shuffle=True,
                         collate_fn=DictCollater(),
                         num_workers=opts.num_workers,
                         pin_memory=CUDA)
    # Compute estimation of bpe. As we sample chunks randomly, we
    # should say that an epoch happened after seeing at least as many
    # chunks as total_train_wav_dur // chunk_size
    bpe = (dset.total_wav_dur // opts.chunk_size) // opts.batch_size
    opts.bpe = bpe
    if opts.do_eval:
        va_dset = dataset(opts.data_root,
                          opts.data_cfg,
                          'valid',
                          transform=trans,
                          noise_folder=opts.noise_folder,
                          whisper_folder=opts.whisper_folder,
                          distortion_probability=opts.distortion_p,
                          distortion_transforms=dist_trans,
                          preload_wav=opts.preload_wav)
        va_dloader = DataLoader(va_dset,
                                batch_size=opts.batch_size,
                                shuffle=False,
                                collate_fn=DictCollater(),
                                num_workers=opts.num_workers,
                                pin_memory=CUDA)
        va_bpe = (va_dset.total_wav_dur // opts.chunk_size) // opts.batch_size
        opts.va_bpe = va_bpe
    else:
        va_dloader = None
    # faster lr to MI
    #opts.min_lrs = {'mi':0.001}
    model.train_(dloader, vars(opts), device=device, va_dloader=va_dloader)
Example #19
                        '-e',
                        default='.wav',
                        help='file extension to search for in dataset folder')
    parser.add_argument('--batch_size', type=int, default=32)
    args = parser.parse_args()

    extension = args.extension
    path = args.path
    wav_files = get_files(path, extension)
    if hp.pase_cfg is None:
        raise ValueError('hp.pase_cfg must be set to build the PASE frontend')
    assert hp.pase_ckpt is not None
    CUDA = torch.cuda.is_available() and hp.cuda
    hp.device = 'cuda' if CUDA else 'cpu'
    # Load pase model
    pase = wf_builder(hp.pase_cfg)
    pase.load_pretrained(hp.pase_ckpt, load_last=True, verbose=True)
    pase.to(hp.device)
    pase.eval()
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    print(f'\n{len(wav_files)} {extension[1:]} files found in "{path}"\n')

    if len(wav_files) == 0:
        print('Please point wav_path in hparams.py to your dataset,')
        print('or use the --path option.\n')
    else:
        if not hp.ignore_tts:

            text_dict = ljspeech(path)
Example #20
    dnn_act = cfg['dnn_act']

    options = {}
    options['dnn_lay'] = dnn_lay
    options['dnn_drop'] = dnn_drop
    options['dnn_use_batchnorm'] = dnn_use_batchnorm
    options['dnn_use_laynorm'] = dnn_use_laynorm
    options['dnn_use_laynorm_inp'] = dnn_use_laynorm_inp
    options['dnn_use_batchnorm_inp'] = dnn_use_batchnorm_inp
    options['dnn_act'] = dnn_act

    # output file creation
    text_file = open(output_file, "w")

    # Loading pase
    pase = wf_builder(cfg_pase)
    pase.load_pretrained(pase_ckpt, load_last=True, verbose=False)
    pase.to(device)
    pase.eval()

    # reading the training signals
    print("Waveform reading...")

    # reading the dev signals
    fea_dev = {}
    for wav_file in dev_lst:
        [signal, fs] = sf.read(data_folder + '/' + wav_file)
        signal = signal / np.max(np.abs(signal))
        fea_id = wav_file.split('/')[-2] + '_' + wav_file.split('/')[-1].split('.')[0]
        fea_dev[fea_id] = torch.from_numpy(signal).float().to(device).view(1, 1, -1)
Example #21
def cluster(opts):
    CUDA = torch.cuda.is_available()
    device = 'cuda' if CUDA else 'cpu'
    num_devices = 1
    np.random.seed(opts.seed)
    random.seed(opts.seed)
    torch.manual_seed(opts.seed)
    if CUDA:
        torch.cuda.manual_seed_all(opts.seed)
        num_devices = torch.cuda.device_count()
        print('[*] Using CUDA {} devices'.format(num_devices))
    else:
        print('[!] Using CPU')
    fe = wf_builder(opts.fe_cfg)
    if opts.fe_ckpt is not None:
        fe.load_pretrained(opts.fe_ckpt, load_last=True, verbose=True)
    else:
        print('WARNING: No pretrained ckpt loaded for FE! Random clustering?')
    fe.to(device)
    fe.eval()
    trans = Compose(
        [ToTensor(),
         SingleChunkWav(opts.chunk_size, random_scale=False)])
    # Build Dataset(s) and DataLoader(s)
    dset = PairWavDataset(opts.data_root,
                          opts.data_cfg,
                          'train',
                          transform=trans)
    dloader = DataLoader(dset,
                         batch_size=opts.batch_size,
                         shuffle=True,
                         collate_fn=DictCollater(),
                         num_workers=opts.num_workers)
    # acumulate train chunks and do clustering on them,
    # with each chunk containing several frames
    X = []
    timings = []
    N = opts.num_samples // opts.batch_size
    beg_t = timeit.default_timer()
    diter = iter(dloader)  # build the iterator once, not on every step
    for bidx in range(1, N + 1):
        batch = next(diter)
        chunk = batch['chunk']
        y = fe(chunk.to(device)).mean(dim=2)
        X.append(y.view(-1, y.size(-1)).cpu().data.numpy())
        end_t = timeit.default_timer()
        timings.append(end_t - beg_t)
        beg_t = timeit.default_timer()
        if bidx % opts.log_freq == 0 or bidx >= N:
            print('Forwarded batch {:4d}/{:4d}, btime: {:.2f} s, '
                  'mbtime: {:.2f} s'.format(bidx, N, timings[-1],
                                            np.mean(timings)),
                  end='\r')
    print()
    X = np.concatenate(X, axis=0)
    print('Total X shape: ', X.shape)
    print('Running KMeans...')
    beg_t = timeit.default_timer()
    kmeans = KMeans(n_clusters=opts.k_clusters, n_jobs=opts.n_jobs,
                    verbose=0).fit(X)
    end_t = timeit.default_timer()
    print('Clusterized in {:.2f} s'.format(end_t - beg_t))
    print('Saving KMeans...')
    with open(os.path.join(opts.save_path, 'kmeans.pkl'), 'wb') as f:
        pickle.dump(kmeans, f)
    print('Finished program')
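Loading the clusters back is symmetric to the dump above; a short sketch, assuming new chunks are mean-pooled the same way as in the loop:

# Hypothetical follow-up: reload the fitted KMeans and label new chunks.
with open(os.path.join(opts.save_path, 'kmeans.pkl'), 'rb') as f:
    kmeans = pickle.load(f)
y = fe(chunk.to(device)).mean(dim=2)        # (B, emb_dim)
ids = kmeans.predict(y.cpu().data.numpy())  # one cluster id per chunk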
Example #22
 def build_pase(ckpt, model_config):
     pase = wf_builder(model_config)
     pase.load_pretrained(ckpt, load_last=True, verbose=False)
     return pase
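A hedged usage sketch of build_pase; both paths are placeholders, and the (batch, 1, samples) input layout follows the other examples on this page:

import torch

# Hypothetical usage; checkpoint and config paths are placeholders.
pase = build_pase('pase_pretrained.ckpt', 'cfg/frontend/PASE+.cfg')
pase.eval()
with torch.no_grad():
    wav = torch.randn(1, 1, 16000)  # one second of audio at 16 kHz
    feats = pase(wav)               # (1, emb_dim, n_frames)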
Example #23
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        adaptnet=adaptnet,
                        mode=hp.voc_mode).to(device)

    print(voc_model)
    trainable_params = list(voc_model.parameters())

    paths = Paths(hp.data_path, hp.voc_model_id, '')

    # Load pase model
    print('Building PASE...')
    if hp.pase_cfg is not None:
        # 2 PASEs: (1) Identifier extractor, (2) Content extractor
        pase_cntnt = wf_builder(hp.pase_cfg)
        if hp.pase_ckpt is not None:
            pase_cntnt.load_pretrained(hp.pase_ckpt,
                                       load_last=True,
                                       verbose=True)
        pase_cntnt.to(device)
        if conversion:
            pase_id = wf_builder(hp.pase_cfg)
            if hp.pase_ckpt is not None:
                pase_id.load_pretrained(hp.pase_ckpt,
                                        load_last=True,
                                        verbose=True)
            pase_id.to(device)
        if hp.pase_cntnt_ft:
            print('Setting Content PASE in TRAIN mode')
            pase_cntnt.train()
Example #24
def load_pase_plus(pase_folder=PASE_FOLDER,
                   parameters='trained_model/PASE+_parameters.ckpt'):
    pase = wf_builder(join(pase_folder, 'cfg/frontend/PASE+.cfg'))
    pase.eval()
    pase.load_pretrained(parameters, load_last=True, verbose=True)
    return pase.to(CUDA0)