Example No. 1
0
    def run(self):
        """Embed papers into 3D with t-SNE and publish the result.

        Combines title and abstract vectors, reduces them to three
        dimensions, attaches each paper's top category, and writes the
        output either to a local JSON file (DEBUG) or to S3.
        """
        self.log("Starting ReduceEmbeddingDimensionality")
        vectorizer = get_vectorizer(self._vectorizer_name)
        matrices = vectorizer.paper_matrix
        # Equal-weight blend of the abstract and title embeddings.
        combined = 0.5 * matrices['abstract'] + 0.5 * matrices['title']
        self.log(combined.shape)
        embedding = scale(
            TSNE(n_components=3, verbose=True).fit_transform(combined))
        dois = matrices['index_arr']
        id_map = matrices['id_map']
        papers_by_doi = {}
        memberships = CategoryMembership.objects.filter(paper__in=dois)
        for membership in self.progress(memberships):
            doi = membership.paper.pk
            row = id_map[doi]

            category_pk = membership.category.pk
            category_score = membership.score

            entry = papers_by_doi.get(doi)
            if entry is None:
                papers_by_doi[doi] = {
                    'doi': doi,
                    'title': membership.paper.title,
                    'point': embedding[row].tolist(),
                    'top_category': category_pk,
                    'published_at': json.dumps(membership.paper.published_at,
                                               cls=DjangoJSONEncoder),
                    'top_category_score': category_score,
                }
            elif entry['top_category_score'] <= category_score:
                # Keep the highest-scoring category for this paper.
                entry['top_category'] = category_pk
                entry['top_category_score'] = category_score
        output = {
            'papers': list(papers_by_doi.values()),
            'means': embedding.mean(axis=0).tolist(),
            'max': embedding.max(axis=0).tolist(),
            'min': embedding.min(axis=0).tolist(),
        }
        if settings.DEBUG:
            # Local development: drop the JSON next to the web assets.
            with open('../web/assets/embeddings_3d.json', 'w+') as f:
                json.dump(output, f)
        else:
            # Production: push the JSON to the configured S3 bucket.
            s3_bucket_client = S3BucketClient(
                aws_access_key=settings.AWS_ACCESS_KEY_ID,
                aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
                endpoint_url=settings.AWS_S3_ENDPOINT_URL,
                bucket=settings.AWS_STORAGE_BUCKET_NAME)
            s3_bucket_client.upload_as_json(settings.AWS_EMBEDDINGS_FILE_PATH,
                                            output)

        Paper.objects.all().update(visualized=True)

        self.log("ReduceEmbeddingDimensionality finished")
Example No. 2
0
        print('Input:\t\t', ip.data.cpu().numpy()[0])
        enc = encoder(ip)
        print('Encoding:\t', enc.data.cpu().numpy()[0])
        enc = channel_output(enc)
        print('Channel:\t', enc.data.cpu().numpy()[0])
        op = decoder(enc)
        print('Output:\t\t', torch.softmax(op, dim=1).data.cpu().numpy()[0])

if hp.constellation:  # to visualize encodings, etc.
    # Ensure the output directory exists; tolerate "already exists",
    # re-raise any other OS error.
    try:
        os.makedirs('Constellations')
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Identity matrix = one-hot inputs for all M messages, encoded in one pass.
    ip = torch.eye(hp.M, device=device)
    enc = encoder(ip).cpu().detach().numpy()

    # Reduce the encodings to 2D with t-SNE (default n_components=2),
    # then center each dimension and scale by the overall std.
    enc_emb = TSNE().fit_transform(enc).T
    enc_emb -= enc_emb.mean(axis=1).reshape(2, 1)
    enc_emb /= enc_emb.std()

    # Scatter the 2D constellation and save it as a PNG.
    plt.figure(dpi=250)
    plt.grid()
    plt.scatter(enc_emb[0], enc_emb[1])
    plt.title('Constellation for RBF ({0},{1})'.format(hp.n, hp.k))
    plt.savefig(join('Constellations', 'RBF({0},{1}).png'.format(hp.n, hp.k)))
    plt.show()

# `start` is presumably set at script startup — total wall-clock runtime.
print('Total time taken:{0:.2f} seconds'.format(time() - start))
Example No. 3
0
File: data.py Project: leesael/GSDT
def preprocess(data, data_in='../data/raw', data_out='../data/preprocessed'):
    """
    Preprocess a dataset based on its name.

    Reads the raw files under ``data_in``, converts them into numerical
    feature and label arrays, and saves train/test splits as ``.npy``
    files under ``data_out``. Raises ``ValueError`` for an unknown name.
    """
    if data == 'breast-cancer':
        frame = pd.read_csv(f'{data_in}/{data}/breast-cancer.data',
                            header=None)
        train_idx, test_idx = split(frame.shape[0])
        features = to_categorical_features(frame.iloc[:, 1:])
        labels = to_labels(frame[0])
    elif data == 'breast-cancer-wisconsin':
        frame = pd.read_csv(f'{data_in}/{data}/breast-cancer-wisconsin.data',
                            header=None)
        train_idx, test_idx = split(frame.shape[0])
        features = to_numerical_features(frame, train_idx, list(range(1, 10)))
        labels = to_labels(frame[10])
    elif data == 'heart-disease':
        frame = pd.read_csv(f'{data_in}/{data}/processed.cleveland.data',
                            header=None)
        train_idx, test_idx = split(frame.shape[0])
        # Mixed numeric and categorical columns, concatenated side by side.
        numeric = to_numerical_features(frame, train_idx, [0, 3, 4, 7, 9])
        categorical = to_categorical_features(frame, [1, 2, 5, 6, 8, 10, 12])
        features = np.concatenate((numeric, categorical), axis=1)
        # Any positive disease grade is treated as the positive class.
        labels = to_labels((frame[13] > 0).astype(np.int64))
    elif data == 'hepatitis':
        frame = pd.read_csv(f'{data_in}/{data}/hepatitis.data', header=None)
        train_idx, test_idx = split(frame.shape[0])
        numeric = to_numerical_features(frame, train_idx,
                                        [1, 14, 15, 16, 17, 18])
        categorical = to_categorical_features(frame, [2, 4, 5])
        features = np.concatenate((numeric, categorical), axis=1)
        labels = to_labels(frame[19])
    elif data == 'brain-tumor':
        frame = pd.read_csv(f'{data_in}/{data}/Dataset.csv')
        # Drop degenerate rows with zero area before splitting.
        frame = frame[frame['Area'] != 0].reset_index()
        train_idx, test_idx = split(frame.shape[0])
        shape_columns = [
            'Area', 'Perimeter', 'Convex Area', 'Solidity',
            'Equivalent Diameter', 'Major Axis', 'Minor Axis'
        ]
        features = to_numerical_features(frame, train_idx, shape_columns)
        labels = to_labels(frame['Class'])
    elif data == 'diabetes':
        frame = pd.read_csv(f'{data_in}/{data}/pima-indians-diabetes.csv',
                            header=None)
        train_idx, test_idx = split(frame.shape[0])
        # Zeros in the measurement columns denote missing values.
        frame.iloc[:, 1:-1] = frame.iloc[:, 1:-1].replace(0, np.nan)
        features = to_numerical_features(frame.iloc[:, 1:-1], train_idx)
        labels = to_labels(frame.iloc[:, -1])
    elif data == 'synthetic':
        # Built from the Wisconsin data: embed into 2D with t-SNE and
        # standardize the embedding.
        frame = pd.read_csv(
            f'{data_in}/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
            header=None)
        train_idx, test_idx = split(frame.shape[0])
        features = to_numerical_features(frame, train_idx, list(range(1, 10)))
        features = TSNE(random_state=0).fit_transform(features)
        features = (features - features.mean(axis=0)) / features.std(axis=0)
        labels = to_labels(frame[10])
    else:
        raise ValueError(data)

    # Report: name, sample count, feature count, number of classes.
    print(f'{data}\t{features.shape[0]}\t{features.shape[1]}\t'
          f'{labels.max() + 1}')

    trn_x, trn_y = features[train_idx], labels[train_idx]
    test_x, test_y = features[test_idx], labels[test_idx]

    os.makedirs(f'{data_out}/{data}', exist_ok=True)
    np.save(f'{data_out}/{data}/trn_x', trn_x)
    np.save(f'{data_out}/{data}/trn_y', trn_y)
    np.save(f'{data_out}/{data}/test_x', test_x)
    np.save(f'{data_out}/{data}/test_y', test_y)
plt.colorbar()
plt.rcParams['font.size'] = 10
# Annotate each sample on the 2D PCA score plot with its index label.
for sample_number in range(score.shape[0]):
    plt.text(score.iloc[sample_number, 0], score.iloc[sample_number, 1], score.index[sample_number],
             horizontalalignment='center', verticalalignment='top')
plt.xlabel('t_1 (PCA)')
plt.ylabel('t_2 (PCA)')
plt.show()

# t-SNE
# Optimize perplexity using the k3n-error
k3n_errors = []
for index, perplexity in enumerate(candidates_of_perplexity):
    print(index + 1, '/', len(candidates_of_perplexity))
    t = TSNE(perplexity=perplexity, n_components=2, init='pca', random_state=10).fit_transform(autoscaled_x)
    # Standardize the embedding (ddof=1: sample standard deviation).
    scaled_t = (t - t.mean(axis=0)) / t.std(axis=0, ddof=1)

    # k3n-error evaluated in both directions: input space -> embedding
    # and embedding -> input space.
    k3n_errors.append(
        sample_functions.k3n_error(autoscaled_x, scaled_t, k_in_k3n_error) + sample_functions.k3n_error(
            scaled_t, autoscaled_x, k_in_k3n_error))
# Plot k3n-error against each candidate perplexity.
plt.rcParams['font.size'] = 18
plt.scatter(candidates_of_perplexity, k3n_errors, c='blue')
plt.xlabel("perplexity")
plt.ylabel("k3n-errors")
plt.show()
# Select the perplexity with the smallest k3n-error.
optimal_perplexity = candidates_of_perplexity[np.where(k3n_errors == np.min(k3n_errors))[0][0]]
print('\nk3n-error による perplexity の最適値 :', optimal_perplexity)
# t-SNE
# Re-run t-SNE with the optimized perplexity and save the 2D embedding.
t = TSNE(perplexity=optimal_perplexity, n_components=2, init='pca', random_state=10).fit_transform(autoscaled_x)
t = pd.DataFrame(t, index=x.index, columns=['t_1 (t-SNE)', 't_2 (t-SNE)'])
t.to_csv('tsne_t.csv')
Example No. 5
0
            q = enc(images.cuda())
            z = q['styles'].value.cpu().detach().numpy()
        else:
            q = enc(images)
            z = q['styles'].value.data.detach().numpy()
        zs.append(z)
        ys.append(y.numpy())
# Stack the per-batch label and latent arrays collected above.
ys = np.concatenate(ys,0)
zs = np.concatenate(zs,0)


# run TSNE when number of latent dims exceeds 2
if NUM_STYLE > 2:
    from sklearn.manifold import TSNE
    zs2 = TSNE().fit_transform(zs)
    # Summary statistics of the 2D embedding.
    # NOTE(review): zs2_mean/zs2_std are not used in this view — confirm
    # they are consumed later in the file.
    zs2_mean = zs2.mean(0)
    zs2_std = zs2.std(0)
else:
    # Latents already fit in 2D; plot them as-is.
    zs2 = zs

# display a 2D plot of the digit classes in the latent space
fig = plt.figure(figsize=(6,6))
ax = plt.gca()

colors = []
for k in range(10):
    # Boolean mask selecting the samples of digit class k.
    m = (ys == k)
    p = ax.scatter(zs2[m, 0], zs2[m, 1], label='y=%d' % k, alpha=0.5, s=5)
    # Remember each class's plotted color (presumably reused in a later
    # figure — not visible here).
    colors.append(p.get_facecolor())
ax.legend()
Example No. 6
0
def main(args):
    """Compare linear DCA with DAPC on noisy, nonlinearly lifted Lorenz data.

    For each signal-to-noise ratio: generates noisy observations, trains a
    linear DCA baseline and a DAPC model, matches each model's latent codes
    back to the ground-truth dynamics, prints R2 reconstruction scores, and
    plots the reconstructions.
    """
    parser = get_parser()
    parser = DAPC.add_arguments(parser)
    args = parser.parse_args(args)

    # Seed both numpy and torch for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Handle multiple gpu issues.

    T = args.T
    fdim = args.fdim
    encoder_name = args.encoder_type
    params = ''  # NOTE(review): empty run tag -> snapshot name ".cpt"; confirm intended
    print(params)

    idim = 30 # lift projection dim
    noise_dim = 7 # noisify raw DCA  (NOTE(review): unused in this function)
    split_rate = args.split_rate # train/valid split
    snr_vals = [0.3, 1.0, 5.0]  # signal-to-noise ratios
    num_samples = 10000  # samples to collect from the lorenz system

    print("Generating ground truth dynamics ...")
    X_dynamics = gen_lorenz_data(num_samples)  # 10000 * 3

    noisy_model = DNN(X_dynamics.shape[1], idim, dropout=0.5)  # DNN lift projection: 3 -> 30 for d-DCA
    use_gpu = False
    if use_gpu:
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    dca_recons = []
    dapc_recons = []
    r2_vals = np.zeros((len(snr_vals), 2))  # obtain R2 scores for DCA and dDCA
    for snr_idx, snr in enumerate(snr_vals):
        print("Generating noisy data with snr=%.2f ..." % snr)
        X_clean, X_noisy = gen_nonlinear_noisy_lorenz(idim, T, snr, X_dynamics=X_dynamics, noisy_model=noisy_model, seed=args.seed)
        # Center the noisy observations.
        X_noisy = X_noisy - X_noisy.mean(axis=0)

        # Train/validation splits for clean, noisy, and true dynamics.
        X_clean_train, X_clean_val = split(X_clean, split_rate)
        X_noisy_train, X_noisy_val = split(X_noisy, split_rate)
        X_dyn_train, X_dyn_val = split(X_dynamics, split_rate)
        if not os.path.exists("runs"):
            os.mkdir("runs")

        # Break the long sequences into fixed-size chunks for training.
        chunk_size = 500
        X_train_seqs, L_train = chunk_long_seq(X_noisy_train, 30, chunk_size)
        X_valid_seqs, L_valid = chunk_long_seq(X_noisy_val, 30, chunk_size)
        X_clean_seqs, L_clean = chunk_long_seq(X_clean_val, 30, chunk_size)
        X_dyn_seqs, L_dyn = chunk_long_seq(X_dyn_val, 30, chunk_size)
        print(X_train_seqs[0].shape)

        # 0:500 test, 1000:1500 valid
        X_match = torch.from_numpy(_context_concat(X_noisy_val[1000:1500], 0)).float().to(device)
        Y_match = X_dyn_val[1000:1500]
        # Linear DCA
        print("Training {}".format(args.base_encoder_type))

        dca_model = DCA(d=fdim, T=T)
        dca_model.fit(X_train_seqs + X_valid_seqs[:1])
        X_dca = dca_model.transform(X_noisy_val[:500])
        # Reduce latent codes to 3D for matching/plotting when wider than 3.
        if X_dca.shape[1] > 3:
            X_dca = TSNE(n_components=3).fit_transform(X_dca)

        # deep DCA
        print("Training {}".format(encoder_name))
        dapc_model = DAPC(args.obj, idim, fdim, T, encoder_type=args.encoder_type,
                                                 ortho_lambda=args.ortho_lambda, recon_lambda=args.recon_lambda,
                                                 dropout=args.dropout, masked_recon=args.masked_recon,
                                                 args=args, device=device)

        dapc_model = fit_dapc(dapc_model, X_train_seqs, L_train, X_valid_seqs, L_valid, None, args.lr, use_gpu,
                batch_size=args.batchsize, max_epochs=args.epochs, device=device, snapshot=params + ".cpt", X_match=X_match, Y_match=Y_match, use_writer=False)

        # Encode the held-out noisy data with the trained DAPC model.
        X_dapc = dapc_model.encode(
            torch.from_numpy(_context_concat(X_noisy_val[:500], dapc_model.input_context)).float().to(device,
                                                            dtype=dapc_model.dtype)).cpu().numpy()
        if X_dapc.shape[1] > 3:
            X_dapc = TSNE(n_components=3).fit_transform(X_dapc)

        # Print the empirical covariance of the (centered) DAPC codes.
        print(np.matmul((X_dapc - X_dapc.mean(0)).T, (X_dapc - X_dapc.mean(0))) / X_dapc.shape[0])

        if not os.path.exists("pngs"):
            os.mkdir("pngs")

        # match DCA with ground-truth
        if not os.path.exists("npys"):
            os.mkdir("npys")
        np.save("npys/dapc_bases_{}.npy".format(params), X_dapc)
        print("Matching {}".format(args.base_encoder_type))
        X_dca_recon, _ = match(X_dca, X_dyn_val[:500], 15000, device)
        # match DAPC with ground-truth
        print("Matching {}".format(encoder_name))
        X_dapc_recon, _ = match(X_dapc, X_dyn_val[:500], 15000, device)

        # R2 of dca
        r2_dca = 1 - np.sum((X_dca_recon - X_dyn_val[:500]) ** 2) / np.sum(
                (X_dyn_val[:500] - np.mean(X_dyn_val[:500], axis=0)) ** 2)
        print("\nr2_dca:", r2_dca)
        # R2 of dapc
        r2_dapc = 1 - np.sum((X_dapc_recon - X_dyn_val[:500]) ** 2) / np.sum(
                (X_dyn_val[:500] - np.mean(X_dyn_val[:500], axis=0)) ** 2)
        print("r2_dapc:", r2_dapc)
        # store R2's
        r2_vals[snr_idx] = [r2_dca, r2_dapc]
        # store reconstructed signals
        dca_recons.append(X_dca_recon)
        dapc_recons.append(X_dapc_recon)

    if not os.path.exists("plots"):
        os.mkdir("plots")
    if not os.path.exists("plots/{}".format(params)):
        os.mkdir("plots/{}".format(params))

    plot_figs(dca_recons, dapc_recons, X_dyn_val[:500], X_clean_val[:500], X_noisy_val[:500], r2_vals, snr_vals, args.base_encoder_type,
              encoder_name, "plots/{}".format(params))
Example No. 7
0
def main(args):
    """Compare a DCA-style baseline with DAPC on noisy lifted Lorenz data.

    GPU variant of the experiment: selects CUDA devices from ``--gpuid``,
    logs training to TensorBoard, and for each signal-to-noise ratio trains
    a baseline model and a DAPC model, matches their latent codes to the
    ground-truth dynamics, saves covariance heatmaps, and reports R2 scores
    and plots.
    """
    parser = get_parser()
    parser = DAPC.add_arguments(parser)
    args = parser.parse_args(args)

    # Seed numpy and torch (CPU and CUDA) for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # Handle multiple gpu issues.
    gpuid = args.gpuid
    gpulist = parsegpuid(gpuid)
    # Restrict visible devices to the requested GPUs.
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(x) for x in gpulist])
    numGPUs = len(gpulist)
    print("Using %d gpus, CUDA_VISIBLE_DEVICES=%s" %
          (numGPUs, os.environ["CUDA_VISIBLE_DEVICES"]))

    T = args.T
    fdim = args.fdim
    encoder_name = args.encoder_type
    # Run tag encoding the hyperparameters; used for snapshots, npy/png
    # filenames, and the plots directory.
    params = 'obj={}_encoder={}_split={}_fdim={}_context={}_T={}_lr={}_bs={}_dropout={}_rate-lambda={}_ortho-lambda={}_recon-lambda={}_seed={}'.format(
        args.obj, encoder_name, args.split_rate, args.fdim, args.input_context,
        args.T, args.lr, args.batchsize, args.dropout, args.rate_lambda,
        args.ortho_lambda, args.recon_lambda, args.seed)
    if args.obj == "vae":
        # VAE objective carries extra prior/weighting hyperparameters.
        params = params + "_priorpi={}_dimpi={}_{}_{}_{}_{}".format(
            args.use_prior_pi, args.use_dim_pi, args.vae_alpha, args.vae_beta,
            args.vae_gamma, args.vae_zeta)
    print(params)

    idim = 30  # lift projection dim
    noise_dim = 7  # noisify raw DCA  (NOTE(review): unused in this function)
    split_rate = args.split_rate  # train/valid split
    snr_vals = [0.3, 1.0, 5.0]  # signal-to-noise ratios
    num_samples = 10000  # samples to collect from the lorenz system

    print("Generating ground truth dynamics ...")
    X_dynamics = gen_lorenz_data(num_samples)  # 10000 * 3

    noisy_model = DNN(X_dynamics.shape[1], idim,
                      dropout=0.5)  # DNN lift projection: 3 -> 30 for d-DCA
    use_gpu = True
    if use_gpu:
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    dca_recons = []
    dapc_recons = []
    r2_vals = np.zeros((len(snr_vals), 2))  # obtain R2 scores for DCA and dDCA
    for snr_idx, snr in enumerate(snr_vals):
        print("Generating noisy data with snr=%.2f ..." % snr)
        X_clean, X_noisy = gen_nonlinear_noisy_lorenz(idim,
                                                      T,
                                                      snr,
                                                      X_dynamics=X_dynamics,
                                                      noisy_model=noisy_model,
                                                      seed=args.seed)
        # Center the noisy observations.
        X_noisy = X_noisy - X_noisy.mean(axis=0)

        # Train/validation splits for clean, noisy, and true dynamics.
        X_clean_train, X_clean_val = split(X_clean, split_rate)
        X_noisy_train, X_noisy_val = split(X_noisy, split_rate)
        X_dyn_train, X_dyn_val = split(X_dynamics, split_rate)
        if not os.path.exists("runs"):
            os.mkdir("runs")
        # TensorBoard writer for this run.
        writer = SummaryWriter(
            create_writer_name('runs/dapc_{}'.format(params)))

        # Break the long sequences into fixed-size chunks for training.
        chunk_size = 500
        X_train_seqs, L_train = chunk_long_seq(X_noisy_train, 30, chunk_size)
        X_valid_seqs, L_valid = chunk_long_seq(X_noisy_val, 30, chunk_size)
        X_clean_seqs, L_clean = chunk_long_seq(X_clean_val, 30, chunk_size)
        X_dyn_seqs, L_dyn = chunk_long_seq(X_dyn_val, 30, chunk_size)

        # 0:500 test, 1000:1500 valid
        X_match = torch.from_numpy(_context_concat(X_noisy_val[1000:1500],
                                                   0)).float().to(device)
        Y_match = X_dyn_val[1000:1500]
        # Linear DCA
        print("Training {}".format(args.base_encoder_type))

        # Baseline: either a DAPC with the requested base encoder, or a
        # plain linear-encoder DCA objective.
        if args.base_encoder_type != "lin":
            dca_model = DAPC(args.obj,
                             idim,
                             fdim,
                             T,
                             encoder_type=args.base_encoder_type,
                             ortho_lambda=args.ortho_lambda,
                             recon_lambda=args.recon_lambda,
                             dropout=args.dropout,
                             masked_recon=args.masked_recon,
                             args=args,
                             device=device)
        else:
            dca_model = DAPC("dca",
                             idim,
                             fdim,
                             T,
                             encoder_type="lin",
                             ortho_lambda=10.0,
                             recon_lambda=0.0,
                             dropout=0.0,
                             masked_recon=False,
                             args=args)
        dca_model = fit_dapc(dca_model,
                             X_train_seqs,
                             L_train,
                             X_valid_seqs[:1],
                             L_valid[:1],
                             writer,
                             args.lr,
                             use_gpu,
                             batch_size=args.batchsize,
                             max_epochs=args.epochs,
                             device=device,
                             snapshot="lin_dca.cpt",
                             X_match=X_match,
                             Y_match=Y_match,
                             use_writer=False)

        # Encode held-out noisy data with the trained baseline model.
        X_dca = dca_model.encode(
            torch.from_numpy(
                _context_concat(X_noisy_val[:500],
                                dca_model.input_context)).float().to(
                                    device,
                                    dtype=dca_model.dtype)).cpu().numpy()
        # Reduce latent codes to 3D for matching/plotting when wider than 3.
        if X_dca.shape[1] > 3:
            X_dca = TSNE(n_components=3).fit_transform(X_dca)

        # deep DCA
        print("Training {}".format(encoder_name))
        dapc_model = DAPC(args.obj,
                          idim,
                          fdim,
                          T,
                          encoder_type=args.encoder_type,
                          ortho_lambda=args.ortho_lambda,
                          recon_lambda=args.recon_lambda,
                          dropout=args.dropout,
                          masked_recon=args.masked_recon,
                          args=args,
                          device=device)

        dapc_model = fit_dapc(dapc_model,
                              X_train_seqs,
                              L_train,
                              X_valid_seqs,
                              L_valid,
                              writer,
                              args.lr,
                              use_gpu,
                              batch_size=args.batchsize,
                              max_epochs=args.epochs,
                              device=device,
                              snapshot=params + ".cpt",
                              X_match=X_match,
                              Y_match=Y_match)

        # Encode held-out noisy data with the trained DAPC model.
        X_dapc = dapc_model.encode(
            torch.from_numpy(
                _context_concat(X_noisy_val[:500],
                                dapc_model.input_context)).float().to(
                                    device,
                                    dtype=dapc_model.dtype)).cpu().numpy()
        if X_dapc.shape[1] > 3:
            X_dapc = TSNE(n_components=3).fit_transform(X_dapc)

        # Print the empirical covariance of the (centered) DAPC codes.
        print(
            np.matmul((X_dapc - X_dapc.mean(0)).T,
                      (X_dapc - X_dapc.mean(0))) / X_dapc.shape[0])

        if not os.path.exists("pngs"):
            os.mkdir("pngs")
        # Save covariance heatmaps (posterior covariance too for VAE).
        if dapc_model.obj == "vae":
            ax = sns.heatmap(dapc_model.post_cov.detach().cpu().numpy(),
                             linewidth=0.05)
            plt.savefig("pngs/post_cov_heat_{}.png".format(params))
            plt.clf()
            ax = sns.heatmap(dapc_model.cov.detach().cpu().numpy(),
                             linewidth=0.05)
            plt.savefig("pngs/cov_heat_{}.png".format(params))
        else:
            ax = sns.heatmap(dapc_model.cov.detach().cpu().numpy(),
                             linewidth=0.05)
            plt.savefig("pngs/post_cov_heat_{}.png".format(params))

        # match DCA with ground-truth
        if not os.path.exists("npys"):
            os.mkdir("npys")
        np.save("npys/dapc_bases_{}.npy".format(params), X_dapc)
        print("Matching {}".format(args.base_encoder_type))
        X_dca_recon, _ = match(X_dca, X_dyn_val[:500], 15000, device)
        # match DAPC with ground-truth
        print("Matching {}".format(encoder_name))
        X_dapc_recon, _ = match(X_dapc, X_dyn_val[:500], 15000, device)

        # R2 of dca
        r2_dca = 1 - np.sum((X_dca_recon - X_dyn_val[:500])**2) / np.sum(
            (X_dyn_val[:500] - np.mean(X_dyn_val[:500], axis=0))**2)
        print("\nr2_dca:", r2_dca)
        # R2 of dapc
        r2_dapc = 1 - np.sum((X_dapc_recon - X_dyn_val[:500])**2) / np.sum(
            (X_dyn_val[:500] - np.mean(X_dyn_val[:500], axis=0))**2)
        print("r2_dapc:", r2_dapc)
        # store R2's
        r2_vals[snr_idx] = [r2_dca, r2_dapc]
        # store reconstructed signals
        dca_recons.append(X_dca_recon)
        dapc_recons.append(X_dapc_recon)

    if not os.path.exists("plots"):
        os.mkdir("plots")
    if not os.path.exists("plots/{}".format(params)):
        os.mkdir("plots/{}".format(params))

    plot_figs(dca_recons, dapc_recons, X_dyn_val[:500], X_clean_val[:500],
              X_noisy_val[:500], r2_vals, snr_vals, args.base_encoder_type,
              encoder_name, "plots/{}".format(params))