def run(self):
    """Project paper embeddings to 3-D with t-SNE and publish the result.

    Blends title and abstract embedding matrices, reduces them to three
    dimensions, attaches each paper's highest-scoring category, and writes
    the JSON payload either to a local web asset (DEBUG) or to S3.
    """
    self.log("Starting ReduceEmbeddingDimensionality")

    vectorizer = get_vectorizer(self._vectorizer_name)
    paper_matrix = vectorizer.paper_matrix
    # Equal-weight blend of the abstract and title embedding matrices.
    blended = 0.5 * paper_matrix['abstract'] + 0.5 * paper_matrix['title']
    self.log(blended.shape)

    points = TSNE(n_components=3, verbose=True).fit_transform(blended)
    points = scale(points)

    dois = paper_matrix['index_arr']
    id_map = paper_matrix['id_map']

    result = {}
    memberships = CategoryMembership.objects.filter(paper__in=dois)
    for membership in self.progress(memberships):
        doi = membership.paper.pk
        row = id_map[doi]
        category_pk = membership.category.pk
        score = membership.score
        entry = result.get(doi)
        if entry is None:
            result[doi] = {
                'doi': doi,
                'title': membership.paper.title,
                'point': points[row].tolist(),
                'top_category': category_pk,
                'published_at': json.dumps(membership.paper.published_at,
                                           cls=DjangoJSONEncoder),
                'top_category_score': score,
            }
        elif entry['top_category_score'] <= score:
            # Later membership with an equal-or-better score wins.
            entry['top_category'] = category_pk
            entry['top_category_score'] = score

    output = {
        'papers': list(result.values()),
        'means': points.mean(axis=0).tolist(),
        'max': points.max(axis=0).tolist(),
        'min': points.min(axis=0).tolist(),
    }

    if settings.DEBUG:
        # Local development: drop the payload next to the web assets.
        with open('../web/assets/embeddings_3d.json', 'w+') as f:
            json.dump(output, f)
    else:
        s3_bucket_client = S3BucketClient(
            aws_access_key=settings.AWS_ACCESS_KEY_ID,
            aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
            endpoint_url=settings.AWS_S3_ENDPOINT_URL,
            bucket=settings.AWS_STORAGE_BUCKET_NAME)
        s3_bucket_client.upload_as_json(settings.AWS_EMBEDDINGS_FILE_PATH,
                                        output)

    Paper.objects.all().update(visualized=True)
    self.log("ReduceEmbeddingDimensionality finished")
# Trace one sample through the autoencoder pipeline and print each stage.
print('Input:\t\t', ip.data.cpu().numpy()[0])
enc = encoder(ip)
print('Encoding:\t', enc.data.cpu().numpy()[0])
enc = channel_output(enc)
print('Channel:\t', enc.data.cpu().numpy()[0])
op = decoder(enc)
print('Output:\t\t', torch.softmax(op, dim=1).data.cpu().numpy()[0])

if hp.constellation:
    # Visualize the learned encodings as a 2-D constellation diagram.
    try:
        os.makedirs('Constellations')
    except OSError as e:
        # Only a pre-existing directory is acceptable; re-raise anything else.
        if e.errno != errno.EEXIST:
            raise

    # Encode every one-hot message, then embed to 2-D with t-SNE.
    ip = torch.eye(hp.M, device=device)
    enc = encoder(ip).cpu().detach().numpy()
    enc_emb = TSNE().fit_transform(enc).T
    # Center and normalize the embedding before plotting.
    enc_emb -= enc_emb.mean(axis=1).reshape(2, 1)
    enc_emb /= enc_emb.std()

    plt.figure(dpi=250)
    plt.grid()
    plt.scatter(enc_emb[0], enc_emb[1])
    plt.title('Constellation for RBF ({0},{1})'.format(hp.n, hp.k))
    plt.savefig(join('Constellations', 'RBF({0},{1}).png'.format(hp.n, hp.k)))
    plt.show()

print('Total time taken:{0:.2f} seconds'.format(time() - start))
def preprocess(data, data_in='../data/raw', data_out='../data/preprocessed'):
    """Preprocess the named dataset and store train/test splits.

    Reads the raw files from ``data_in``, converts columns into
    numerical/categorical feature arrays plus integer labels, and saves
    ``trn_x``/``trn_y``/``test_x``/``test_y`` arrays under
    ``data_out/<data>``.
    """
    if data == 'breast-cancer':
        df = pd.read_csv('{}/{}/breast-cancer.data'.format(data_in, data),
                         header=None)
        trn_idx, test_idx = split(df.shape[0])
        features = to_categorical_features(df.iloc[:, 1:])
        labels = to_labels(df[0])
    elif data == 'breast-cancer-wisconsin':
        df = pd.read_csv(
            '{}/{}/breast-cancer-wisconsin.data'.format(data_in, data),
            header=None)
        trn_idx, test_idx = split(df.shape[0])
        features = to_numerical_features(df, trn_idx, list(range(1, 10)))
        labels = to_labels(df[10])
    elif data == 'heart-disease':
        df = pd.read_csv(
            '{}/{}/processed.cleveland.data'.format(data_in, data),
            header=None)
        trn_idx, test_idx = split(df.shape[0])
        numeric = to_numerical_features(df, trn_idx, [0, 3, 4, 7, 9])
        categorical = to_categorical_features(df, [1, 2, 5, 6, 8, 10, 12])
        features = np.concatenate((numeric, categorical), axis=1)
        # Any positive disease grade is collapsed to a single positive class.
        labels = to_labels((df[13] > 0).astype(np.int64))
    elif data == 'hepatitis':
        df = pd.read_csv('{}/{}/hepatitis.data'.format(data_in, data),
                         header=None)
        trn_idx, test_idx = split(df.shape[0])
        numeric = to_numerical_features(df, trn_idx, [1, 14, 15, 16, 17, 18])
        categorical = to_categorical_features(df, [2, 4, 5])
        features = np.concatenate((numeric, categorical), axis=1)
        labels = to_labels(df[19])
    elif data == 'brain-tumor':
        df = pd.read_csv('{}/{}/Dataset.csv'.format(data_in, data))
        # Rows with zero area carry no usable measurements; drop them.
        df = df[df['Area'] != 0].reset_index()
        trn_idx, test_idx = split(df.shape[0])
        cols = [
            'Area', 'Perimeter', 'Convex Area', 'Solidity',
            'Equivalent Diameter', 'Major Axis', 'Minor Axis'
        ]
        features = to_numerical_features(df, trn_idx, cols)
        labels = to_labels(df['Class'])
    elif data == 'diabetes':
        df = pd.read_csv(
            '{}/{}/pima-indians-diabetes.csv'.format(data_in, data),
            header=None)
        trn_idx, test_idx = split(df.shape[0])
        # Zeros in the measurement columns denote missing values.
        df.iloc[:, 1:-1] = df.iloc[:, 1:-1].replace(0, np.nan)
        features = to_numerical_features(df.iloc[:, 1:-1], trn_idx)
        labels = to_labels(df.iloc[:, -1])
    elif data == 'synthetic':
        # The synthetic set is derived from the Wisconsin data: embed with
        # t-SNE, then standardize the 2-D coordinates.
        df = pd.read_csv(
            '{}/breast-cancer-wisconsin/breast-cancer-wisconsin.data'.format(
                data_in),
            header=None)
        trn_idx, test_idx = split(df.shape[0])
        features = to_numerical_features(df, trn_idx, list(range(1, 10)))
        features = TSNE(random_state=0).fit_transform(features)
        features = (features - features.mean(axis=0)) / features.std(axis=0)
        labels = to_labels(df[10])
    else:
        raise ValueError(data)

    print('{}\t{}\t{}\t{}'.format(data, features.shape[0], features.shape[1],
                                  labels.max() + 1))

    os.makedirs('{}/{}'.format(data_out, data), exist_ok=True)
    arrays = (('trn_x', features[trn_idx]), ('trn_y', labels[trn_idx]),
              ('test_x', features[test_idx]), ('test_y', labels[test_idx]))
    for name, arr in arrays:
        np.save('{}/{}/{}'.format(data_out, data, name), arr)
# Color scale and per-sample labels on the PCA score plot.
plt.colorbar()
plt.rcParams['font.size'] = 10
for sample_number in range(score.shape[0]):
    plt.text(score.iloc[sample_number, 0], score.iloc[sample_number, 1],
             score.index[sample_number],
             horizontalalignment='center',
             verticalalignment='top')
plt.xlabel('t_1 (PCA)')
plt.ylabel('t_2 (PCA)')
plt.show()

# t-SNE
# Optimize perplexity via the k3n-error criterion.
k3n_errors = []
for index, perplexity in enumerate(candidates_of_perplexity):
    print(index + 1, '/', len(candidates_of_perplexity))
    t = TSNE(perplexity=perplexity, n_components=2, init='pca',
             random_state=10).fit_transform(autoscaled_x)
    scaled_t = (t - t.mean(axis=0)) / t.std(axis=0, ddof=1)
    # k3n-error is evaluated in both directions (X -> t and t -> X).
    k3n_errors.append(
        sample_functions.k3n_error(autoscaled_x, scaled_t, k_in_k3n_error) +
        sample_functions.k3n_error(scaled_t, autoscaled_x, k_in_k3n_error))

plt.rcParams['font.size'] = 18
plt.scatter(candidates_of_perplexity, k3n_errors, c='blue')
plt.xlabel("perplexity")
plt.ylabel("k3n-errors")
plt.show()

# np.argmin finds the first minimizer in one pass — equivalent to, but
# clearer and cheaper than, np.where(k3n_errors == np.min(k3n_errors))[0][0].
optimal_perplexity = candidates_of_perplexity[int(np.argmin(k3n_errors))]
print('\nk3n-error による perplexity の最適値 :', optimal_perplexity)

# Final t-SNE with the optimized perplexity; save the 2-D scores to CSV.
t = TSNE(perplexity=optimal_perplexity, n_components=2, init='pca',
         random_state=10).fit_transform(autoscaled_x)
t = pd.DataFrame(t, index=x.index, columns=['t_1 (t-SNE)', 't_2 (t-SNE)'])
t.to_csv('tsne_t.csv')
# NOTE(review): fragment of a latent-space visualization script. The leading
# statements are the body of an `if` whose header lies outside this view
# (presumably a CUDA-availability check, given the `.cuda()` call and the
# `else:` branch), and the batch loop that feeds `zs`/`ys` also starts
# before this chunk — TODO confirm against the full file. Because the
# enclosing structure is cut off, the chunk is kept verbatim rather than
# reformatted. What is visible: style vectors are collected into `zs` and
# labels into `ys`, concatenated, reduced to 2-D with t-SNE when
# NUM_STYLE > 2, and scatter-plotted per digit class 0-9.
q = enc(images.cuda()) z = q['styles'].value.cpu().detach().numpy() else: q = enc(images) z = q['styles'].value.data.detach().numpy() zs.append(z) ys.append(y.numpy()) ys = np.concatenate(ys,0) zs = np.concatenate(zs,0) # run TSNE when number of latent dims exceeds 2 if NUM_STYLE > 2: from sklearn.manifold import TSNE zs2 = TSNE().fit_transform(zs) zs2_mean = zs2.mean(0) zs2_std = zs2.std(0) else: zs2 = zs # display a 2D plot of the digit classes in the latent space fig = plt.figure(figsize=(6,6)) ax = plt.gca() colors = [] for k in range(10): m = (ys == k) p = ax.scatter(zs2[m, 0], zs2[m, 1], label='y=%d' % k, alpha=0.5, s=5) colors.append(p.get_facecolor()) ax.legend()
# NOTE(review): CPU variant of the DAPC Lorenz-attractor experiment driver
# (compare the near-duplicate GPU variant below). Flow, as far as visible:
# seed numpy/torch; generate ground-truth Lorenz dynamics; lift 3 -> 30 dims
# through a DNN; then for each SNR in [0.3, 1.0, 5.0] build noisy data,
# split train/valid, chunk sequences, fit a linear DCA baseline and a DAPC
# model, t-SNE-reduce encodings above 3 dims, match both against the
# ground-truth dynamics, and accumulate R2 scores before plotting.
# The chunk is whitespace-mangled (the `X_dca = ` assignment is split across
# source lines) and the extent of the `for snr_idx, snr in enumerate(...)`
# loop body cannot be recovered with confidence, so the code is kept
# byte-identical rather than reformatted — TODO restore formatting from the
# upstream repository.
def main(args): parser = get_parser() parser = DAPC.add_arguments(parser) args = parser.parse_args(args) np.random.seed(args.seed) torch.manual_seed(args.seed) # Handle multiple gpu issues. T = args.T fdim = args.fdim encoder_name = args.encoder_type params = '' print(params) idim = 30 # lift projection dim noise_dim = 7 # noisify raw DCA split_rate = args.split_rate # train/valid split snr_vals = [0.3, 1.0, 5.0] # signal-to-noise ratios num_samples = 10000 # samples to collect from the lorenz system print("Generating ground truth dynamics ...") X_dynamics = gen_lorenz_data(num_samples) # 10000 * 3 noisy_model = DNN(X_dynamics.shape[1], idim, dropout=0.5) # DNN lift projection: 3 -> 30 for d-DCA use_gpu = False if use_gpu: device = torch.device("cuda:0") else: device = torch.device("cpu") dca_recons = [] dapc_recons = [] r2_vals = np.zeros((len(snr_vals), 2)) # obtain R2 scores for DCA and dDCA for snr_idx, snr in enumerate(snr_vals): print("Generating noisy data with snr=%.2f ..." % snr) X_clean, X_noisy = gen_nonlinear_noisy_lorenz(idim, T, snr, X_dynamics=X_dynamics, noisy_model=noisy_model, seed=args.seed) X_noisy = X_noisy - X_noisy.mean(axis=0) X_clean_train, X_clean_val = split(X_clean, split_rate) X_noisy_train, X_noisy_val = split(X_noisy, split_rate) X_dyn_train, X_dyn_val = split(X_dynamics, split_rate) if not os.path.exists("runs"): os.mkdir("runs") chunk_size = 500 X_train_seqs, L_train = chunk_long_seq(X_noisy_train, 30, chunk_size) X_valid_seqs, L_valid = chunk_long_seq(X_noisy_val, 30, chunk_size) X_clean_seqs, L_clean = chunk_long_seq(X_clean_val, 30, chunk_size) X_dyn_seqs, L_dyn = chunk_long_seq(X_dyn_val, 30, chunk_size) print(X_train_seqs[0].shape) # 0:500 test, 1000:1500 valid X_match = torch.from_numpy(_context_concat(X_noisy_val[1000:1500], 0)).float().to(device) Y_match = X_dyn_val[1000:1500] # Linear DCA print("Training {}".format(args.base_encoder_type)) dca_model = DCA(d=fdim, T=T) dca_model.fit(X_train_seqs + X_valid_seqs[:1]) X_dca = 
dca_model.transform(X_noisy_val[:500]) if X_dca.shape[1] > 3: X_dca = TSNE(n_components=3).fit_transform(X_dca) # deep DCA print("Training {}".format(encoder_name)) dapc_model = DAPC(args.obj, idim, fdim, T, encoder_type=args.encoder_type, ortho_lambda=args.ortho_lambda, recon_lambda=args.recon_lambda, dropout=args.dropout, masked_recon=args.masked_recon, args=args, device=device) dapc_model = fit_dapc(dapc_model, X_train_seqs, L_train, X_valid_seqs, L_valid, None, args.lr, use_gpu, batch_size=args.batchsize, max_epochs=args.epochs, device=device, snapshot=params + ".cpt", X_match=X_match, Y_match=Y_match, use_writer=False) X_dapc = dapc_model.encode( torch.from_numpy(_context_concat(X_noisy_val[:500], dapc_model.input_context)).float().to(device, dtype=dapc_model.dtype)).cpu().numpy() if X_dapc.shape[1] > 3: X_dapc = TSNE(n_components=3).fit_transform(X_dapc) print(np.matmul((X_dapc - X_dapc.mean(0)).T, (X_dapc - X_dapc.mean(0))) / X_dapc.shape[0]) if not os.path.exists("pngs"): os.mkdir("pngs") # match DCA with ground-truth if not os.path.exists("npys"): os.mkdir("npys") np.save("npys/dapc_bases_{}.npy".format(params), X_dapc) print("Matching {}".format(args.base_encoder_type)) X_dca_recon, _ = match(X_dca, X_dyn_val[:500], 15000, device) # match DAPC with ground-truth print("Matching {}".format(encoder_name)) X_dapc_recon, _ = match(X_dapc, X_dyn_val[:500], 15000, device) # R2 of dca r2_dca = 1 - np.sum((X_dca_recon - X_dyn_val[:500]) ** 2) / np.sum( (X_dyn_val[:500] - np.mean(X_dyn_val[:500], axis=0)) ** 2) print("\nr2_dca:", r2_dca) # R2 of dapc r2_dapc = 1 - np.sum((X_dapc_recon - X_dyn_val[:500]) ** 2) / np.sum( (X_dyn_val[:500] - np.mean(X_dyn_val[:500], axis=0)) ** 2) print("r2_dapc:", r2_dapc) # store R2's r2_vals[snr_idx] = [r2_dca, r2_dapc] # store reconstructed signals dca_recons.append(X_dca_recon) dapc_recons.append(X_dapc_recon) if not os.path.exists("plots"): os.mkdir("plots") if not os.path.exists("plots/{}".format(params)): 
os.mkdir("plots/{}".format(params)) plot_figs(dca_recons, dapc_recons, X_dyn_val[:500], X_clean_val[:500], X_noisy_val[:500], r2_vals, snr_vals, args.base_encoder_type, encoder_name, "plots/{}".format(params))
# NOTE(review): GPU variant of the DAPC Lorenz-attractor experiment driver
# (near-duplicate of the CPU variant above; candidates for consolidation once
# formatting is restored). Differences visible here: CUDA seeding and
# CUDA_VISIBLE_DEVICES setup from --gpuid, a fully populated `params` string,
# a TensorBoard SummaryWriter, a configurable base encoder for the DCA
# baseline (DAPC with encoder_type from args, or a linear "dca" objective),
# and seaborn heatmaps of the learned covariances. The chunk is
# whitespace-mangled (a print string is split mid-format across source
# lines) and the `for snr_idx, snr in enumerate(...)` loop extent cannot be
# recovered with confidence, so the code is kept byte-identical rather than
# reformatted — TODO restore formatting from the upstream repository.
def main(args): parser = get_parser() parser = DAPC.add_arguments(parser) args = parser.parse_args(args) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) # Handle multiple gpu issues. gpuid = args.gpuid gpulist = parsegpuid(gpuid) os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(x) for x in gpulist]) numGPUs = len(gpulist) print("Using %d gpus, CUDA_VISIBLE_DEVICES=%s" % (numGPUs, os.environ["CUDA_VISIBLE_DEVICES"])) T = args.T fdim = args.fdim encoder_name = args.encoder_type params = 'obj={}_encoder={}_split={}_fdim={}_context={}_T={}_lr={}_bs={}_dropout={}_rate-lambda={}_ortho-lambda={}_recon-lambda={}_seed={}'.format( args.obj, encoder_name, args.split_rate, args.fdim, args.input_context, args.T, args.lr, args.batchsize, args.dropout, args.rate_lambda, args.ortho_lambda, args.recon_lambda, args.seed) if args.obj == "vae": params = params + "_priorpi={}_dimpi={}_{}_{}_{}_{}".format( args.use_prior_pi, args.use_dim_pi, args.vae_alpha, args.vae_beta, args.vae_gamma, args.vae_zeta) print(params) idim = 30 # lift projection dim noise_dim = 7 # noisify raw DCA split_rate = args.split_rate # train/valid split snr_vals = [0.3, 1.0, 5.0] # signal-to-noise ratios num_samples = 10000 # samples to collect from the lorenz system print("Generating ground truth dynamics ...") X_dynamics = gen_lorenz_data(num_samples) # 10000 * 3 noisy_model = DNN(X_dynamics.shape[1], idim, dropout=0.5) # DNN lift projection: 3 -> 30 for d-DCA use_gpu = True if use_gpu: device = torch.device("cuda:0") else: device = torch.device("cpu") dca_recons = [] dapc_recons = [] r2_vals = np.zeros((len(snr_vals), 2)) # obtain R2 scores for DCA and dDCA for snr_idx, snr in enumerate(snr_vals): print("Generating noisy data with snr=%.2f ..." 
% snr) X_clean, X_noisy = gen_nonlinear_noisy_lorenz(idim, T, snr, X_dynamics=X_dynamics, noisy_model=noisy_model, seed=args.seed) X_noisy = X_noisy - X_noisy.mean(axis=0) X_clean_train, X_clean_val = split(X_clean, split_rate) X_noisy_train, X_noisy_val = split(X_noisy, split_rate) X_dyn_train, X_dyn_val = split(X_dynamics, split_rate) if not os.path.exists("runs"): os.mkdir("runs") writer = SummaryWriter( create_writer_name('runs/dapc_{}'.format(params))) chunk_size = 500 X_train_seqs, L_train = chunk_long_seq(X_noisy_train, 30, chunk_size) X_valid_seqs, L_valid = chunk_long_seq(X_noisy_val, 30, chunk_size) X_clean_seqs, L_clean = chunk_long_seq(X_clean_val, 30, chunk_size) X_dyn_seqs, L_dyn = chunk_long_seq(X_dyn_val, 30, chunk_size) # 0:500 test, 1000:1500 valid X_match = torch.from_numpy(_context_concat(X_noisy_val[1000:1500], 0)).float().to(device) Y_match = X_dyn_val[1000:1500] # Linear DCA print("Training {}".format(args.base_encoder_type)) if args.base_encoder_type != "lin": dca_model = DAPC(args.obj, idim, fdim, T, encoder_type=args.base_encoder_type, ortho_lambda=args.ortho_lambda, recon_lambda=args.recon_lambda, dropout=args.dropout, masked_recon=args.masked_recon, args=args, device=device) else: dca_model = DAPC("dca", idim, fdim, T, encoder_type="lin", ortho_lambda=10.0, recon_lambda=0.0, dropout=0.0, masked_recon=False, args=args) dca_model = fit_dapc(dca_model, X_train_seqs, L_train, X_valid_seqs[:1], L_valid[:1], writer, args.lr, use_gpu, batch_size=args.batchsize, max_epochs=args.epochs, device=device, snapshot="lin_dca.cpt", X_match=X_match, Y_match=Y_match, use_writer=False) X_dca = dca_model.encode( torch.from_numpy( _context_concat(X_noisy_val[:500], dca_model.input_context)).float().to( device, dtype=dca_model.dtype)).cpu().numpy() if X_dca.shape[1] > 3: X_dca = TSNE(n_components=3).fit_transform(X_dca) # deep DCA print("Training {}".format(encoder_name)) dapc_model = DAPC(args.obj, idim, fdim, T, encoder_type=args.encoder_type, 
ortho_lambda=args.ortho_lambda, recon_lambda=args.recon_lambda, dropout=args.dropout, masked_recon=args.masked_recon, args=args, device=device) dapc_model = fit_dapc(dapc_model, X_train_seqs, L_train, X_valid_seqs, L_valid, writer, args.lr, use_gpu, batch_size=args.batchsize, max_epochs=args.epochs, device=device, snapshot=params + ".cpt", X_match=X_match, Y_match=Y_match) X_dapc = dapc_model.encode( torch.from_numpy( _context_concat(X_noisy_val[:500], dapc_model.input_context)).float().to( device, dtype=dapc_model.dtype)).cpu().numpy() if X_dapc.shape[1] > 3: X_dapc = TSNE(n_components=3).fit_transform(X_dapc) print( np.matmul((X_dapc - X_dapc.mean(0)).T, (X_dapc - X_dapc.mean(0))) / X_dapc.shape[0]) if not os.path.exists("pngs"): os.mkdir("pngs") if dapc_model.obj == "vae": ax = sns.heatmap(dapc_model.post_cov.detach().cpu().numpy(), linewidth=0.05) plt.savefig("pngs/post_cov_heat_{}.png".format(params)) plt.clf() ax = sns.heatmap(dapc_model.cov.detach().cpu().numpy(), linewidth=0.05) plt.savefig("pngs/cov_heat_{}.png".format(params)) else: ax = sns.heatmap(dapc_model.cov.detach().cpu().numpy(), linewidth=0.05) plt.savefig("pngs/post_cov_heat_{}.png".format(params)) # match DCA with ground-truth if not os.path.exists("npys"): os.mkdir("npys") np.save("npys/dapc_bases_{}.npy".format(params), X_dapc) print("Matching {}".format(args.base_encoder_type)) X_dca_recon, _ = match(X_dca, X_dyn_val[:500], 15000, device) # match DAPC with ground-truth print("Matching {}".format(encoder_name)) X_dapc_recon, _ = match(X_dapc, X_dyn_val[:500], 15000, device) # R2 of dca r2_dca = 1 - np.sum((X_dca_recon - X_dyn_val[:500])**2) / np.sum( (X_dyn_val[:500] - np.mean(X_dyn_val[:500], axis=0))**2) print("\nr2_dca:", r2_dca) # R2 of dapc r2_dapc = 1 - np.sum((X_dapc_recon - X_dyn_val[:500])**2) / np.sum( (X_dyn_val[:500] - np.mean(X_dyn_val[:500], axis=0))**2) print("r2_dapc:", r2_dapc) # store R2's r2_vals[snr_idx] = [r2_dca, r2_dapc] # store reconstructed signals 
dca_recons.append(X_dca_recon) dapc_recons.append(X_dapc_recon) if not os.path.exists("plots"): os.mkdir("plots") if not os.path.exists("plots/{}".format(params)): os.mkdir("plots/{}".format(params)) plot_figs(dca_recons, dapc_recons, X_dyn_val[:500], X_clean_val[:500], X_noisy_val[:500], r2_vals, snr_vals, args.base_encoder_type, encoder_name, "plots/{}".format(params))