import sys

import matplotlib.pyplot as plt
import torch
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE

from ptsdae.sdae import StackedDenoisingAutoEncoder as SDAE
# NOTE: FromPickle (and the commented-out DuoBenchmark) are project-local
# dataset wrappers; import them from wherever they live in this repo.

if __name__ == '__main__':
    # #############################################################################
    dset = sys.argv[1]  # only used by the commented-out variants below
    #raw_data = DuoBenchmark('data/datasets/'+dset+'.csv')
    raw_data = FromPickle('data/embeddings/mouse-pca-15000-log1p-True.pickle')
    model = SDAE([raw_data.dims, 7500, 500, 2000, 50])
    #model.load_state_dict(torch.load('data/models/'+dset+'.pt'))
    model.load_state_dict(torch.load(sys.argv[1]))
    # PyTorch 0.3.x still requires inputs wrapped in a Variable; newer
    # versions accept plain tensors. (Note this check only inspects the
    # minor version number, so it assumes a 0.x release.)
    if int(torch.__version__.split('.')[1]) == 3:
        var = torch.autograd.variable.Variable(torch.Tensor(raw_data.data))
    else:
        var = torch.Tensor(raw_data.data)
    embedding = model.encoder(var).data.numpy()

    labels = DBSCAN().fit(embedding).labels_

    tsne_embedding = TSNE(n_components=2).fit_transform(embedding)

    # #############################################################################

    plt_file = 'data/plots/mouse_SDAE.pdf'

    plt.scatter(tsne_embedding[:, 0],
                tsne_embedding[:, 1],
                c=labels,
                s=1,
                marker=',')
    plt.savefig(plt_file)  # plt_file was defined but never used: write the plot out
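
The FromPickle loader above is project-local and not shown; here is a minimal sketch of what this script assumes it provides. Only the class name and its .data/.dims attributes come from the usage above; everything else is an assumption:

import pickle

class FromPickle:
    """Stand-in loader: expose a pickled 2-D array as .data, with .dims columns."""
    def __init__(self, path):
        with open(path, 'rb') as fh:
            self.data = pickle.load(fh)  # assumed to be a samples x features array
        self.dims = self.data.shape[1]   # input dimensionality for the SDAE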
Example 2

import uuid

import numpy as np
import seaborn as sns
import torch
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from tensorboardX import SummaryWriter  # or: from torch.utils.tensorboard import SummaryWriter
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader

import ptsdae.model as ae
from ptsdae.sdae import StackedDenoisingAutoEncoder
# CachedMNIST and cluster_accuracy are helpers local to this example's repo;
# import them from wherever they are defined there.
def main(cuda, batch_size, pretrain_epochs, finetune_epochs):
    writer = SummaryWriter()  # create the TensorBoard object

    # callback function to call during training, uses writer from the scope
    def training_callback(epoch, lr, loss, validation_loss):
        writer.add_scalars('data/autoencoder', {
            'lr': lr,
            'loss': loss,
            'validation_loss': validation_loss,
        }, epoch)

    ds_train = CachedMNIST(train=True, cuda=cuda)  # training dataset
    ds_val = CachedMNIST(train=False, cuda=cuda)  # evaluation dataset
    autoencoder = StackedDenoisingAutoEncoder([28 * 28, 500, 500, 2000, 10],
                                              final_activation=None)
    if cuda:
        autoencoder.cuda()
    print('Pretraining stage.')
    ae.pretrain(
        ds_train,
        autoencoder,
        cuda=cuda,
        validation=ds_val,
        epochs=pretrain_epochs,
        batch_size=batch_size,
        optimizer=lambda model: SGD(model.parameters(), lr=0.1, momentum=0.9),
        scheduler=lambda x: StepLR(x, 100, gamma=0.1),
        corruption=0.2)
    print('Training stage.')
    ae_optimizer = SGD(params=autoencoder.parameters(), lr=0.1, momentum=0.9)
    ae.train(ds_train,
             autoencoder,
             cuda=cuda,
             validation=ds_val,
             epochs=finetune_epochs,
             batch_size=batch_size,
             optimizer=ae_optimizer,
             scheduler=StepLR(ae_optimizer, 100, gamma=0.1),
             corruption=0.2,
             update_callback=training_callback)
    print('k-Means stage.')
    dataloader = DataLoader(ds_train, batch_size=1024, shuffle=False)
    kmeans = KMeans(n_clusters=10, n_init=20)
    autoencoder.eval()
    features = []
    actual = []
    for index, batch in enumerate(dataloader):
        if isinstance(batch, (tuple, list)) and len(batch) == 2:
            batch, value = batch  # if a ground-truth label is attached, split it off
            actual.append(value)
        if cuda:
            batch = batch.cuda(non_blocking=True)  # `async` became a keyword in Python 3.7
        batch = batch.squeeze(1).view(batch.size(0), -1)
        features.append(autoencoder.encoder(batch).detach().cpu())
    actual = torch.cat(actual).long().cpu().numpy()
    predicted = kmeans.fit_predict(torch.cat(features).numpy())
    reassignment, accuracy = cluster_accuracy(predicted, actual)
    print('Final k-Means accuracy: %s' % accuracy)
    predicted_reassigned = [reassignment[item]
                            for item in predicted]  # TODO numpify
    confusion = confusion_matrix(actual, predicted_reassigned)
    normalised_confusion = confusion.astype('float') / confusion.sum(
        axis=1)[:, np.newaxis]
    confusion_id = uuid.uuid4().hex
    sns.heatmap(normalised_confusion).get_figure().savefig('confusion_%s.png' %
                                                           confusion_id)
    print('Writing out confusion diagram with UUID: %s' % confusion_id)
    writer.add_embedding(
        torch.cat(features),
        metadata=predicted,
        label_img=ds_train.ds.train_data.float().unsqueeze(1),  # TODO bit ugly
        tag='predicted')
    writer.close()
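
The cluster_accuracy helper used above maps unsupervised cluster IDs onto ground-truth labels before scoring. A minimal sketch of the standard approach (Hungarian assignment over the contingency table), assuming the (predicted, actual) -> (reassignment, accuracy) signature seen in the call above:

import numpy as np
from scipy.optimize import linear_sum_assignment

def cluster_accuracy(predicted, actual):
    """Best one-to-one cluster->label mapping, plus the accuracy it achieves."""
    k = max(predicted.max(), actual.max()) + 1
    count = np.zeros((k, k), dtype=np.int64)
    for p, a in zip(predicted, actual):
        count[p, a] += 1                  # contingency table: cluster vs label
    row, col = linear_sum_assignment(count.max() - count)  # maximise matched pairs
    reassignment = dict(zip(row, col))    # predicted cluster id -> actual label
    accuracy = count[row, col].sum() / predicted.size
    return reassignment, accuracy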
Example 3

# Excerpt from a nested sweep: the elided outer loops iterate over dataset
# names (`ds_name`) and log1p settings (`log`); `model_dict`, `model_dir`,
# `scale_dataset` and `DuoBenchmark` come from the enclosing script, which
# also imports os, pickle, torch and SDAE.
            ds_path = os.path.join('data/datasets', ds_name + '.csv')
            dataset = DuoBenchmark(ds_path, log1p=log, split_head=False)

            for scale in [True]:
                # Do scaling second, as scale_dataset overwrites the existing
                # data in place (yes, I know this is bad design, but it's too
                # late now). Scale once per dataset here, rather than inside
                # the model loop, so the data is not re-scaled on every
                # inner iteration.
                if scale:
                    scale_dataset(dataset)
                mlist = model_dict[ds_name][log][scale]
                # Given all of the pre-existing conditions, cycle through
                # each of the models that matches these criteria.
                for entry in mlist:  # renamed from `model`, which is rebound below
                    filename = entry[0]
                    print(filename)
                    # get parameter information
                    model_path = os.path.join(model_dir, filename)
                    layers = entry[1]
                    # prepare the model
                    model = SDAE([dataset.dims] + layers)
                    model.load_state_dict(
                        torch.load(model_path, map_location='cpu'))
                    # generate the embedding
                    inputs = torch.Tensor(dataset.data)
                    embedding = model.encoder(inputs).data.numpy()
                    # save the embedding
                    with open(
                            os.path.join('data/sdae_embeddings',
                                         filename + '.pickle'), 'wb') as fh:
                        pickle.dump(embedding, fh, protocol=4)
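
For context, the elided outer loops and model_dict are assumed to look roughly like the following; every name, filename and layer list here is illustrative only, inferred from the indexing in the fragment above:

model_dir = 'data/models'
model_dict = {
    'mouse': {                 # keyed by dataset name ...
        True: {                # ... then by the log1p flag ...
            True: [            # ... then by the scaling flag
                # each entry is a (filename, hidden layer sizes) pair
                ('mouse-log-scale.pt', [7500, 500, 2000, 50]),
            ],
        },
    },
}

for ds_name in model_dict:
    for log in model_dict[ds_name]:
        ...  # body: the dataset loading and `for scale in [True]:` loop shown above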