def dropout_variability(model, args, df=None):
    if args.cluster and (df is not None):
        kmeans = helpers.clustering(df, args)
    probs = []
    for i in range(args.num_preds):
        probs.append(eval_w_dropout.get_preds(model, DatasetType.Test)[0].cuda())
    probs = torch.stack(probs).cuda()
    mean = probs.mean(dim=0).cuda()
    var = torch.abs(probs - mean).sum(dim=0).sum(dim=1).cuda()
    # var = torch.pow(probs - mean, 2).sum(dim=0).sum(dim=1)
    sorted_var, sorted_ind = var.sort(descending=True)
    if args.cluster and (df is not None):
        top_var = torch.empty(args.inc)
        top_ind = torch.empty(args.inc)
        for i in range(args.inc):
            for j in range(len(sorted_var)):
                if kmeans[sorted_ind[j]] == i:
                    top_var[i] = sorted_var[j]
                    top_ind[i] = sorted_ind[j]
                    break
    else:
        top_var = sorted_var[:args.inc]
        top_ind = sorted_ind[:args.inc]
    subset = [int(idx) for idx in top_ind.cpu().numpy()]
    return subset, top_var.cpu().numpy().sum()
def dropout_margin(model, args, df=None):
    # MC-dropout margin sampling: average args.num_preds stochastic forward
    # passes and rank test examples by the gap between the two largest mean
    # class probabilities (smaller gap = more uncertain).
    if args.cluster and (df is not None):
        kmeans = helpers.clustering(df, args)
    probs = []
    for i in range(args.num_preds):
        probs.append(eval_w_dropout.get_preds(model, DatasetType.Test)[0].cuda())
    probs = torch.stack(probs).cuda()
    sorted_mean = probs.mean(dim=0).sort(descending=True)[0]
    margins = sorted_mean[:, 0] - sorted_mean[:, 1]
    sorted_m, sorted_ind = margins.sort(descending=False)
    if args.cluster and (df is not None):
        # take the smallest-margin example from each cluster
        top_m = torch.empty(args.inc)
        top_ind = torch.empty(args.inc)
        for i in range(args.inc):
            for j in range(len(sorted_m)):
                if kmeans[sorted_ind[j]] == i:
                    top_m[i] = sorted_m[j]
                    top_ind[i] = sorted_ind[j]
                    break
    else:
        top_m = sorted_m[:args.inc]
        top_ind = sorted_ind[:args.inc]
    subset = [int(idx) for idx in top_ind.cpu().numpy()]
    return subset, top_m.cpu().numpy().sum()
def entropy(model, args, df=None):
    '''
    input:
        model: trained model
        df: dataframe column of text documents
    '''
    # rank test examples by predictive entropy -sum(p * log p)
    if args.cluster and (df is not None):
        kmeans = helpers.clustering(df, args)
    preds = model.get_preds(DatasetType.Test)[0].cuda()
    entropies = (-preds * torch.log(preds)).sum(dim=1).cuda()
    sorted_e, sorted_ind = entropies.sort(0, descending=True)
    if args.cluster and (df is not None):
        # take the highest-entropy example from each cluster
        top_e = torch.empty(args.inc).cuda()
        top_ind = torch.empty(args.inc).cuda()
        for i in range(args.inc):
            for j in range(len(sorted_e)):
                if kmeans[sorted_ind[j]] == i:
                    top_e[i] = sorted_e[j]
                    top_ind[i] = sorted_ind[j]
                    break
    else:
        top_e = sorted_e[:args.inc]
        top_ind = sorted_ind[:args.inc]
    subset = [int(idx) for idx in top_ind.cpu().numpy()]
    return subset, top_e.cpu().numpy().sum()
def margin(data, model, softmax, args, df=None):
    '''
    input:
        data: dataset
        model: trained model
        softmax: softmax activation function
        df: dataframe of dataset
    '''
    model.eval()
    if args.cluster and (df is not None):
        kmeans = helpers.clustering(df['text'], args)
    # margins of probabilities lie in [0, 1], so 1 is a safe upper bound
    top_m = torch.ones(args.inc).cuda()
    top_ind = torch.empty(args.inc).cuda()
    text_field = data.fields['text']
    for i, example in enumerate(data):
        probs = helpers.get_single_probs(example, text_field, model, softmax, args).cuda()
        probs = probs.sort(descending=True)[0].cuda()
        margin_ = probs[0][0] - probs[0][1]
        if args.cluster and (df is not None):
            # keep the smallest margin seen so far in each cluster
            if margin_ < top_m[kmeans[i]]:
                top_m[kmeans[i]], top_ind[kmeans[i]] = margin_, i
        else:
            # replace the largest of the args.inc stored margins
            if margin_ < top_m.max():
                max_m, idx = torch.max(top_m, dim=0)
                top_m[idx], top_ind[idx] = margin_, i
    subset = [int(idx) for idx in top_ind.cpu().numpy()]
    return subset, top_m.cpu().detach().numpy().sum()
def dropout_entropy(model, args, df=None):
    if args.cluster and (df is not None):
        kmeans = helpers.clustering(df, args)
    probs = []
    for i in range(args.num_preds):
        probs.append(eval_w_dropout.get_preds(model, DatasetType.Test)[0].cuda())
    probs = torch.stack(probs).cuda()
    mean = probs.mean(dim=0).cuda()
    entropies = -(mean * torch.log(mean)).sum(dim=1).cuda()
    sorted_e, sorted_ind = entropies.sort(descending=True)
    if args.cluster and (df is not None):
        top_e = torch.empty(args.inc)
        top_ind = torch.empty(args.inc)
        for i in range(args.inc):
            for j in range(len(sorted_e)):
                if kmeans[sorted_ind[j]] == i:
                    top_e[i] = sorted_e[j]
                    top_ind[i] = sorted_ind[j]
                    break
    else:
        top_e = sorted_e[:args.inc]
        top_ind = sorted_ind[:args.inc]
    subset = [int(idx) for idx in top_ind.cpu().numpy()]
    return subset, top_e.cpu().numpy().sum()
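# A self-contained demo (plain PyTorch, random stand-in predictions; the helper
# name and the T/N/C shapes are illustrative, not part of the original module)
# of the MC-dropout statistics that dropout_variability, dropout_margin and
# dropout_entropy compute above.
def _demo_mc_dropout_scores():
    torch.manual_seed(0)
    T, N, C = 10, 4, 3                                   # passes, examples, classes
    probs = torch.softmax(torch.randn(T, N, C), dim=-1)  # T stochastic forward passes
    mean = probs.mean(dim=0)                             # mean predictive distribution
    variability = torch.abs(probs - mean).sum(dim=0).sum(dim=1)  # dropout_variability score
    top2 = mean.sort(dim=1, descending=True)[0]
    margins = top2[:, 0] - top2[:, 1]                    # dropout_margin score
    entropies = -(mean * torch.log(mean)).sum(dim=1)     # dropout_entropy score
    return variability, margins, entropies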
def dropout_variation(data, model, softmax, args, df=None):
    '''
    input:
        data: dataset
        model: trained model
        softmax: softmax activation function
        df: dataframe of dataset
    '''
    model.train()  # keep dropout active so repeated passes are stochastic
    if args.cluster and (df is not None):
        kmeans = helpers.clustering(df['text'], args)
    # variation ratios are non-negative, so -1 is a safe lower bound
    top_var = -torch.ones(args.inc).cuda()
    top_ind = torch.empty(args.inc).cuda()
    text_field = data.fields['text']
    for i, example in enumerate(data):
        probs = []
        for j in range(args.num_preds):
            probs.append(helpers.get_single_probs(example, text_field, model, softmax, args))
        probs = torch.stack(probs).cuda()
        mean = probs.mean(dim=0).cuda()
        # variation ratio of the mean predictive distribution
        var = 1 - mean.max()
        if args.cluster and (df is not None):
            # keep the largest variation ratio seen so far in each cluster
            if var > top_var[kmeans[i]]:
                top_var[kmeans[i]], top_ind[kmeans[i]] = var, i
        else:
            # replace the smallest of the args.inc stored scores
            if var > top_var.min():
                min_var, idx = torch.min(top_var, dim=0)
                top_var[idx], top_ind[idx] = var, i
    model.eval()
    subset = [int(idx) for idx in top_ind.cpu().numpy()]
    return subset, top_var.cpu().detach().numpy().sum()
def entropy(data, model, log_softmax, args, df=None):
    '''
    input:
        data: dataset
        model: trained model
        log_softmax: log softmax activation function
        df: dataframe of dataset
    '''
    model.eval()
    if args.cluster and (df is not None):
        kmeans = helpers.clustering(df['text'], args)
    # entropies are non-negative, so -1 is a safe lower bound
    top_e = -torch.ones(args.inc).cuda()
    top_ind = torch.empty(args.inc).cuda()
    text_field = data.fields['text']
    for i, example in enumerate(data):
        logPys = helpers.get_single_probs(example, text_field, model, log_softmax, args).cuda()
        # entropy from log-probabilities: -sum(exp(logp) * logp)
        entropy_ = -(logPys * torch.exp(logPys)).sum().cuda()
        if args.cluster and (df is not None):
            if entropy_ > top_e[kmeans[i]]:
                top_e[kmeans[i]], top_ind[kmeans[i]] = entropy_, i
        else:
            if entropy_ > top_e.min():
                min_e, idx = torch.min(top_e, dim=0)
                top_e[idx], top_ind[idx] = entropy_, i
    subset = [int(idx) for idx in top_ind.cpu().numpy()]
    return subset, top_e.cpu().detach().numpy().sum()
def variation_ratio(model, args, df=None):
    # rank test examples by variation ratio 1 - max(p)
    if args.cluster and (df is not None):
        kmeans = helpers.clustering(df, args)
    preds = model.get_preds(DatasetType.Test)[0].cuda()
    var = 1 - preds.max(dim=1)[0].cuda()
    sorted_var, sorted_ind = var.sort(0, descending=True)
    if args.cluster and (df is not None):
        top_var = torch.empty(args.inc).cuda()
        top_ind = torch.empty(args.inc).cuda()
        for i in range(args.inc):
            for j in range(len(sorted_var)):
                if kmeans[sorted_ind[j]] == i:
                    top_var[i] = sorted_var[j]
                    top_ind[i] = sorted_ind[j]
                    break
    else:
        top_var = sorted_var[:args.inc].cuda()
        top_ind = sorted_ind[:args.inc].cuda()
    subset = [int(idx) for idx in top_ind.cpu().numpy()]
    return subset, top_var.cpu().numpy().sum()
def margin(model, args, df=None):
    # rank test examples by the gap between the two largest class probabilities
    if args.cluster and (df is not None):
        kmeans = helpers.clustering(df, args)
    preds = model.get_preds(DatasetType.Test)[0].cuda()
    sorted_ = preds.sort(descending=True)[0].cuda()
    margins = sorted_[:, 0] - sorted_[:, 1]
    sorted_m, sorted_ind = margins.sort(0, descending=False)
    if args.cluster and (df is not None):
        # take the smallest-margin example from each cluster
        top_m = torch.empty(args.inc).cuda()
        top_ind = torch.empty(args.inc).cuda()
        for i in range(args.inc):
            for j in range(len(sorted_m)):
                if kmeans[sorted_ind[j]] == i:
                    top_m[i] = sorted_m[j]
                    top_ind[i] = sorted_ind[j]
                    break
    else:
        top_m = sorted_m[:args.inc]
        top_ind = sorted_ind[:args.inc]
    subset = [int(idx) for idx in top_ind.cpu().numpy()]
    return subset, top_m.cpu().numpy().sum()
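# Hypothetical usage sketch (not part of the original module): one selection
# round with the fastai-style scorers above. `learn` stands in for a trained
# Learner whose Test set holds the unlabelled pool and `pool_df` for the
# matching dataframe; `args` only needs the fields the scorers read.
def _demo_selection_round(learn, pool_df):
    from argparse import Namespace
    args = Namespace(cluster=False, inc=50, num_preds=10)
    subset, margin_sum = margin(learn, args)  # indices of the 50 smallest margins
    queried = pool_df.iloc[subset]            # rows to send for labelling
    remaining = pool_df.drop(pool_df.index[subset])
    return queried, remaining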
def variation_ratio(data, model, softmax, args, df=None):
    '''
    input:
        data: dataset
        model: trained model
        softmax: softmax activation function
        df: dataframe of dataset
    '''
    model.eval()
    if args.cluster and (df is not None):
        kmeans = helpers.clustering(df['text'], args)
    # variation ratios are non-negative, so -1 is a safe lower bound
    top_var = -torch.ones(args.inc).cuda()
    top_ind = torch.empty(args.inc).cuda()
    text_field = data.fields['text']
    for i, example in enumerate(data):
        probs = helpers.get_single_probs(example, text_field, model, softmax, args).cuda()
        # variation ratio: 1 - probability of the predicted class
        var = 1 - probs.max()
        if args.cluster and (df is not None):
            if var > top_var[kmeans[i]]:
                top_var[kmeans[i]], top_ind[kmeans[i]] = var, i
        else:
            if var > top_var.min():
                min_var, idx = torch.min(top_var, dim=0)
                top_var[idx], top_ind[idx] = var, i
    subset = [int(idx) for idx in top_ind.cpu().numpy()]
    return subset, top_var.cpu().detach().numpy().sum()
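# A tiny worked example (plain PyTorch; the helper name is illustrative) making
# the three single-pass scores concrete on one predictive distribution: higher
# entropy or variation ratio and a lower margin all mean less confidence, so
# such examples are queried first.
def _demo_single_pass_scores():
    p = torch.tensor([0.5, 0.3, 0.2])      # softmax output for one example
    entropy_ = -(p * torch.log(p)).sum()   # ~1.030 nats
    top2 = p.sort(descending=True)[0]
    margin_ = top2[0] - top2[1]            # 0.2
    var_ratio = 1 - p.max()                # 0.5
    return entropy_, margin_, var_ratio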
           latent=config['latent'],
           annealing=False,
           batch_size=batch_size,
           prefix=PREFIX,
           label=label,
           scale=config['scale'],
           patience=config['patience'])

print(res.shape)

### analyse the results: cluster the latent representation and score it
k = len(np.unique(label))
cl, _ = clustering(res, k=k)
dm = measure(cl, label)

# plot a 2-D visualisation of the latent space
fig = print_2D(points=res, label=label, id_map=id_map)
# fig.savefig('embryo.eps')
def vasc(expr,
         epoch=5000,
         latent=2,
         patience=50,
         min_stop=500,
         batch_size=32,
         var=False,
         prefix='test',
         label=None,
         log=True,
         scale=True,
         annealing=False,
         tau0=1.0,
         min_tau=0.5,
         rep=0):
    '''
    VASC: variational autoencoder for scRNA-seq datasets

    ============
    Parameters:
        expr: expression matrix (n_cells * n_features)
        epoch: maximum number of epochs, default 5000
        latent: dimension of latent variables, default 2
        patience: stop if the loss shows no significant decrease within *patience* epochs, default 50
        min_stop: minimum number of epochs, default 500
        batch_size: batch size for stochastic optimization, default 32
        var: whether to estimate the variance parameters, default False
        prefix: prefix used to store the results, default 'test'
        label: numpy array of true labels, default None
        log: whether log-transformation should be performed, default True
        scale: whether scaling (making values fall within [0,1]) should be performed, default True
        annealing: whether annealing should be performed for the Gumbel approximation, default False
        tau0: initial temperature for annealing, or the fixed temperature without annealing, default 1.0
        min_tau: minimal tau during annealing, default 0.5
        rep: not used

    =============
    Values:
        point: the dimension-*latent* result
        A file named (*prefix*_*latent*_res.h5): we prefer this file over the single
        return value for analysing results. It includes the following keys:
            POINTS: all intermediate latent results during the iterations
            LOSS: loss values during the training procedure
            RES*i*: i from 0 to 14 - hidden values, just for reference
        We recommend using POINTS and LOSS to select the final result according to
        the user's preference.
    '''
    # clamp negatives, then optionally log-transform and scale each cell to [0, 1]
    expr[expr < 0] = 0.0
    if log:
        expr = np.log2(expr + 1)
    if scale:
        for i in range(expr.shape[0]):
            expr[i, :] = expr[i, :] / np.max(expr[i, :])

    if rep > 0:
        expr_train = np.matlib.repmat(expr, rep, 1)  # requires `import numpy.matlib`
    else:
        expr_train = np.copy(expr)

    vae_ = VASC(in_dim=expr.shape[1], latent=latent, var=var)
    vae_.vaeBuild()

    points = []
    loss = []
    prev_loss = np.inf
    tau = tau0
    anneal_rate = 0.0003
    for e in range(epoch):
        cur_loss = prev_loss
        # anneal the Gumbel temperature every 100 epochs
        if e % 100 == 0 and annealing:
            tau = max(tau0 * np.exp(-anneal_rate * e), min_tau)
            print(tau)
        tau_in = np.ones(expr_train.shape, dtype='float32') * tau

        loss_ = vae_.vae.fit([expr_train, tau_in], expr_train,
                             epochs=1, batch_size=batch_size,
                             shuffle=True, verbose=0)
        train_loss = loss_.history['loss'][0]
        cur_loss = min(train_loss, cur_loss)
        loss.append(train_loss)

        res = vae_.ae.predict([expr, tau_in])
        points.append(res[5])
        if label is not None:
            k = len(np.unique(label))

        # every *patience* epochs: report progress, stop if the loss has
        # plateaued, and optionally score the current clustering
        if e % patience == 1:
            print("Epoch %d/%d" % (e + 1, epoch))
            print("Loss:" + str(train_loss))
            if abs(cur_loss - prev_loss) < 1 and e > min_stop:
                break
            prev_loss = train_loss
            if label is not None:
                try:
                    cl, _ = clustering(res[5], k=k)
                    measure(cl, label)
                except Exception:
                    print('Clustering error')

    # store all intermediate results for later inspection
    points = np.asarray(points)
    aux_res = h5py.File(prefix + '_' + str(latent) + '_res.h5', mode='w')
    aux_res.create_dataset(name='POINTS', data=points)
    aux_res.create_dataset(name='LOSS', data=loss)
    for count, r in enumerate(res):
        aux_res.create_dataset(name='RES' + str(count), data=r)
    aux_res.close()

    return res[5]
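# Hypothetical usage sketch (the data is a random stand-in and the shapes are
# placeholders; only the vasc() signature above is taken as given): embed an
# expression matrix into a 2-D latent space and inspect the result.
if __name__ == '__main__':
    demo_expr = np.random.rand(100, 500)       # stand-in for n_cells x n_features counts
    demo_points = vasc(demo_expr, epoch=600, latent=2,
                       annealing=True, prefix='demo',
                       log=True, scale=True)
    print(demo_points.shape)                   # expected: (100, 2) latent coordinates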