Example #1
import os
import pickle as pkl

from sklearn_lvq import GmlvqModel  # assumed source of GmlvqModel (sklearn-lvq package)

# get_filenames, load_pkl, Dataset, get_error and gmlvq_covered_variance
# are project-local helpers.
def main(datapath, modelpath, idxs):
    dataset_names = get_filenames(datapath)
    print(f'using sets {dataset_names} from {datapath}')
    print('looking for idx files')
    idxs = load_pkl(idxs)
    idxs_train = idxs['train']
    idxs_test = idxs['test']

    for dataset_name in dataset_names:
        dataset: Dataset = load_pkl(os.path.join(datapath, dataset_name))
        X = dataset.data.detach().numpy()
        Y = dataset.get_labels_numerical()
        x_train, x_test = X[idxs_train], X[idxs_test]
        y_train, y_test = Y[idxs_train], Y[idxs_test]
        hiddim = dataset_name.split('/')[-1].split('_')[2]  # hidden-dim field of the filename
        print(f'training gmlvq on {hiddim} dim embedding')
        gmlvq = GmlvqModel()
        gmlvq.fit(x_train, y_train)
        train_error = get_error(gmlvq, x_train, y_train)
        test_error = get_error(gmlvq, x_test, y_test)
        # gmlvq_covered_variance returns a (dims, var, v, eig) tuple (cf. eval_gmlvq);
        # the whole tuple is stored under 'matrix_var'
        var = gmlvq_covered_variance(gmlvq, thresh=1, verbose=True)
        misc = {'train_error': train_error, 'test_error': test_error, 'matrix_var': var}
        print(f'adding misc data to gmlvq model {misc}')
        gmlvq.misc = misc
        modelname = f'gmlvq{hiddim}.pkl'
        print(f'saving model to {modelname}')
        with open(os.path.join(modelpath, modelname), 'wb') as f:
            pkl.dump(gmlvq, f)
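
get_error is a project-local helper; below is a minimal sketch of what it is assumed to compute (misclassification rate, using GmlvqModel's sklearn-style predict). The actual helper may differ:

import numpy as np

def get_error(model, x, y):
    # hypothetical sketch: fraction of misclassified samples
    y_pred = model.predict(x)  # GmlvqModel follows the sklearn estimator API
    return float(np.mean(y_pred != y))
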
Example #2
import os

import matplotlib.pyplot as plt
import numpy as np

# load_pkl, GmlvqModel and gmlvq_covered_variance are as in example #1.
def eval_gmlvq(data, modelnames, destination, verbose):
    del data  # the raw data is unused here; errors are read from model.misc

    for model in modelnames:
        m: GmlvqModel = load_pkl(model)
        name = os.path.basename(model).split('_')[0]
        thresh = 1
        dims, var, v, eig = gmlvq_covered_variance(m, thresh=thresh)
        f = plt.figure()
        ax1 = f.add_subplot(221)
        ax1.title.set_text(
            f'{name}\n#eigv explaining more \nthan {thresh}% variance = {dims}'
        )
        ax1.bar(np.arange(len(eig)), eig, width=.5)
        ax1.set_xticks(range(len(eig)))
        e = m.misc['test_error']
        print(f'{model}, {dims}, {e:.3f}')

        # relevance matrix Lambda = Omega^H Omega; zero the diagonal for display
        lmbd = m.omega_.conj().T.dot(m.omega_)
        np.fill_diagonal(lmbd, 0)
        ax2 = f.add_subplot(222)
        im = ax2.imshow(lmbd)
        f.colorbar(im, ax=ax2)
        ax2.title.set_text('Lambda')
        path = f'{destination}/{name}.pdf'
        plt.savefig(path, bbox_inches='tight')
        plt.close(f)  # free the figure before the next model
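
Example #1 stores the output of gmlvq_covered_variance and this example unpacks it as (dims, var, v, eig). A minimal sketch of what such a helper could look like, assuming it eigendecomposes the relevance matrix; the project's actual implementation is not shown:

import numpy as np

def gmlvq_covered_variance(model, thresh=1, verbose=False):
    # hypothetical sketch: eigen-spectrum of the GMLVQ relevance matrix.
    # Returns (dims, var, v, eig): the number of eigenvalues explaining more
    # than `thresh` percent of the variance, their cumulative share, the
    # eigenvectors and the normalised eigenvalues in percent.
    lmbd = model.omega_.conj().T.dot(model.omega_)
    eigval, v = np.linalg.eigh(lmbd)          # ascending order
    eig = eigval[::-1] / eigval.sum() * 100   # percent, descending
    v = v[:, ::-1]
    dims = int((eig > thresh).sum())
    var = float(eig[:dims].sum())
    if verbose:
        print(f'{dims} eigenvalues cover {var:.1f}% of the variance')
    return dims, var, v, eig
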
Example #3
import os
import pickle as pkl

import torch

# get_filenames, load_pkl, check_for_gpu, load_model and Dataset are
# project-local helpers.
def main(modelpath, datapath, dataset_name):
    modelnames = get_filenames(modelpath, substr='tensor')

    data = load_pkl(os.path.join(datapath, dataset_name))
    data.data = torch.tensor(data.data, dtype=torch.float)  # ensure a float32 tensor
    device = check_for_gpu()
    if device.type == 'cuda':
        data.data = data.data.to(device)

    for modelname in modelnames:
        model = load_model(modelpath,
                           modelname,
                           inference_only=True,
                           dev='cpu')
        model = model.to(device)  # keep the model on the same device as the data
        print('compute encoding')
        batch_size = 256
        chunks = []
        with torch.no_grad():  # inference only, so no autograd graph is needed
            for i in range(0, len(data.data), batch_size):
                _, h_t = model(data.data[i:i + batch_size])
                chunks.append(h_t.squeeze(0).to('cpu'))
        hidden_states = torch.cat(chunks, dim=0)
        print(f'done {len(hidden_states)} {hidden_states[0].shape}\n\n\n')
        print('saving')
        result = Dataset(data=hidden_states,
                         labels=data.labels,
                         labelranges=data.labelranges)
        with open(f'{datapath}/embedded/{modelname}_embedding.pkl_2', 'wb') as f:
            pkl.dump(result, f)
        del result
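
The GRUEncoder itself is not shown. Below is a hypothetical sketch consistent with how it is called above and in example #5: forward returns a (predictions, hidden-state) pair, and the hidden state of a single-layer GRU is squeezed along dim 0.

import torch
import torch.nn as nn

class GRUEncoder(nn.Module):
    # hypothetical sketch: a GRU whose last hidden state is both the
    # embedding and the input to a classification head
    def __init__(self, input_size, hidden_size, out_dim,
                 act_out=nn.Sigmoid, num_layers=1):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size,
                          num_layers=num_layers, batch_first=True)
        self.head = nn.Sequential(nn.Linear(hidden_size, out_dim), act_out())

    def forward(self, x):                # x: (batch, seqlen, nfeatures)
        _, h_n = self.gru(x)             # h_n: (num_layers, batch, hidden)
        return self.head(h_n[-1]), h_n   # (predictions, hidden state)
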
Example #4
import os

# eval_gmlvq is defined in example #2; eval_classifier, get_filenames,
# load_pkl and Dataset are project-local.
def main(datapath, modelpath, destination, idxs, modeltype):
    verbose = False
    if modeltype == 'gmlvq':
        evalmethod = eval_gmlvq
    elif modeltype == 'classifier':
        evalmethod = eval_classifier
    else:
        raise ValueError(f'--modeltype {modeltype} not recognized')

    os.makedirs(destination, exist_ok=True)  # raises OSError if the path cannot be created

    modelnames = [os.path.join(modelpath, x) for x in get_filenames(modelpath)]
    data: Dataset = load_pkl(datapath)
    evalmethod(data, modelnames, destination, verbose=verbose)
    return 0
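
The entry point that parses --modeltype and the other flags is not shown; here is a minimal argparse wrapper, with flag names assumed to mirror main()'s parameters:

import argparse

if __name__ == '__main__':
    # hypothetical entry point; flag names mirror main()'s parameters
    parser = argparse.ArgumentParser(description='Evaluate trained models.')
    parser.add_argument('--datapath', required=True)
    parser.add_argument('--modelpath', required=True)
    parser.add_argument('--destination', required=True)
    parser.add_argument('--idxs', default=None)
    parser.add_argument('--modeltype', choices=['gmlvq', 'classifier'],
                        required=True)
    args = parser.parse_args()
    main(args.datapath, args.modelpath, args.destination,
         args.idxs, args.modeltype)
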
Example #5
import os
import pickle as pkl

import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam

# load_pkl, Dataset, GRUEncoder, test_classifier and save_model are
# project-local helpers.
def main(datapath, modelpath):

    print(f'loading data from {datapath}')
    filename = os.path.splitext(os.path.basename(datapath))[0]
    datadir = os.path.dirname(datapath)
    dataset = load_pkl(datapath)  # Dataset from asr.util

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Additional Info when using cuda
    if device.type == 'cuda':
        print(torch.cuda.get_device_name(0))
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024**3, 1),
              'GB')
        print('Reserved: ', round(torch.cuda.memory_reserved(0) / 1024**3, 1),
              'GB')  # memory_cached was renamed to memory_reserved

    x = dataset.data
    Y = dataset.get_labels_categorical()
    seqlen, nfeatures = x.shape[1:]
    print('splitting up data into train and test sets')

    idxs = list(range(len(x)))
    np.random.shuffle(idxs)
    split_percent = 0.90
    split = int(split_percent * len(idxs))
    idxs_train = idxs[:split]
    idxs_test = idxs[split:]
    idx_path = os.path.join(datadir, f'idxs_{filename}.pkl')
    print(f'saving idxs to {idx_path}')
    with open(idx_path, 'wb') as f:
        pkl.dump({'train': idxs_train, 'test': idxs_test}, f)
    x_train = torch.tensor(x[idxs_train], dtype=torch.float)
    y_train = torch.tensor(Y[idxs_train], dtype=torch.float)
    x_test = torch.tensor(x[idxs_test], dtype=torch.float)
    y_test = torch.tensor(Y[idxs_test], dtype=torch.float)
    del x, Y, dataset

    n_samples, num_classes = y_train.shape

    x_train = x_train.to(device)
    y_train = y_train.to(device)
    y_test = y_test.to(device)
    x_test = x_test.to(device)

    # hidden_size = [25, 50, 75, 100, 150, 200, 250, 300, 400]
    hidden_size = [25, 250, 50]
    print(
        f'starting training of classifiers with hidden dims of {hidden_size}')

    for hid_s in hidden_size:
        print(f'training model with {hid_s} hidden dims')
        network = GRUEncoder(input_size=nfeatures,
                             hidden_size=hid_s,
                             out_dim=num_classes,
                             act_out=nn.Sigmoid,
                             num_layers=1)

        if device.type == 'cuda':
            network = network.to(device)

        optim = Adam(network.parameters())  # default lr=0.001
        loss_fun = nn.BCELoss()
        histories = []
        test_errors = []
        target_error = 57e-3  # stop once the test error drops below 5.7 %
        train = True

        batch_size = 256
        n_epochs, max_epochs = 0, 200
        print(
            f'test performances without training: {test_classifier(network.forward, x_test, y_test, batch_size)}'
        )

        while train:
            history = []

            for i in range(0, n_samples, batch_size):
                x, y = x_train[i:i + batch_size], y_train[i:i + batch_size]
                optim.zero_grad()
                y_pred, _ = network.forward(x)
                loss = loss_fun(y_pred, y)
                loss.backward()
                history.append(loss.item())  # store the float, not the graph-holding tensor
                optim.step()
                optim.step()
                if (i // batch_size) % 10 == 0:
                    print(
                        f'epoch {n_epochs} {i}:{i + batch_size}/{n_samples} loss {loss.item():.4f}',
                        end='\r',
                        flush=True)


            current_error = test_classifier(network.forward, x_test, y_test,
                                            512)
            print(f'\ntest error: {current_error} \n')
            test_errors.append(current_error)

            n_epochs += 1
            train = target_error < current_error and n_epochs < max_epochs
            histories.append(history)

        modelname = '_'.join([
            network.__class__.__name__, filename,
            str(hid_s),
            str(n_epochs),
            str(current_error)  # a tensor here yields the 'tensor(...)' fragment seen in example #3's filenames
        ])
        network.optimizer = optim
        network.history = histories
        network.test_errors = test_errors
        network.epochs_trained = n_epochs
        save_model(network, path=modelpath, modelname=modelname)
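
test_classifier is another project-local helper; here is a minimal sketch consistent with its use above (batched error rate over one-hot labels). The real helper may differ:

import torch

def test_classifier(forward_fn, x, y, batch_size):
    # hypothetical sketch: classification error of a (predictions, hidden)
    # forward function, evaluated over mini-batches
    wrong = 0
    with torch.no_grad():
        for i in range(0, len(x), batch_size):
            y_pred, _ = forward_fn(x[i:i + batch_size])
            wrong += (y_pred.argmax(dim=1)
                      != y[i:i + batch_size].argmax(dim=1)).sum().item()
    return wrong / len(x)
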