Example #1
def dropout_variability(model, args, df=None):
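    '''
        Select the args.inc test examples whose MC-dropout predictions deviate most from their mean.
        input:
        model: trained model
        df: dataframe column of text documents
    '''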
    if args.cluster and (df is not None): kmeans = helpers.clustering(df, args)
    probs = []
    for i in range(args.num_preds):
        probs.append(eval_w_dropout.get_preds(model, DatasetType.Test)[0].cuda())
    probs = torch.stack(probs).cuda()
    mean = probs.mean(dim=0).cuda()
    var = torch.abs(probs - mean).sum(dim=0).sum(dim=1).cuda()
    # var = torch.pow(preds - mean, 2).sum(dim=0).sum(dim=1)
    sorted_var, sorted_ind = var.sort(descending=True)
    if args.cluster and (df is not None):
        top_var = torch.empty(args.inc)
        top_ind = torch.empty(args.inc)
        for i in range(args.inc):
            for j in range(len(sorted_var)):
                if kmeans[sorted_ind[j]] == i:
                    top_var[i] = sorted_var[j]
                    top_ind[i] = sorted_ind[j]
                    break
    else:
        top_var = sorted_var[:args.inc]
        top_ind = sorted_ind[:args.inc]
                
    subset = list(top_ind.cpu().numpy())
    for i, idx in enumerate(subset): subset[i] = int(idx)
    return subset, top_var.cpu().numpy().sum()
Example #2
def dropout_margin(model, args, df=None):
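    '''
        Select the args.inc test examples with the smallest margin between the two most probable classes under the mean MC-dropout predictions.
        input:
        model: trained model
        df: dataframe column of text documents
    '''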
    if args.cluster and (df is not None): kmeans = helpers.clustering(df, args)
    probs = []
    for i in range(args.num_preds):
        probs.append(eval_w_dropout.get_preds(model, DatasetType.Test)[0].cuda())
    probs = torch.stack(probs).cuda()
    sorted_mean = probs.mean(dim=0).sort(descending=True)[0]
    margins = sorted_mean[:,0] - sorted_mean[:,1]
    sorted_m, sorted_ind = margins.sort(descending=False)
    if args.cluster and (df is not None):
        top_m = torch.empty(args.inc)
        top_ind = torch.empty(args.inc)
        for i in range(args.inc):
            for j in range(len(sorted_m)):
                if kmeans[sorted_ind[j]] == i:
                    top_m[i] = sorted_m[j]
                    top_ind[i] = sorted_ind[j]
                    break
    else:
        top_m = sorted_m[:args.inc]
        top_ind = sorted_ind[:args.inc]

    subset = list(top_ind.cpu().numpy())
    for i, idx in enumerate(subset): subset[i] = int(idx)
    return subset, top_m.cpu().numpy().sum()
Example #3
def entropy(model, args, df=None):
    '''
        input:
        model: trained model
        df: dataframe column of text documents
    '''
    if args.cluster and (df is not None): kmeans = helpers.clustering(df, args)
    preds = model.get_preds(DatasetType.Test)[0].cuda()
    entropy = (-preds*torch.log(preds)).sum(dim=1).cuda()
    sorted_e, sorted_ind = entropy.sort(0,True)
    if args.cluster and (df is not None):
        top_e = torch.empty(args.inc).cuda()
        top_ind = torch.empty(args.inc).cuda()
        for i in range(args.inc):
            for j in range(len(df)):
                if kmeans[sorted_ind[j]] == i:
                    top_e[i] = sorted_e[j]
                    top_ind[i] = sorted_ind[j]
                    break
    else:
        top_e = sorted_e[:args.inc]
        top_ind = sorted_ind[:args.inc]
                
    subset = list(top_ind.cpu().numpy())
    for i, idx in enumerate(subset): subset[i] = int(idx)
    return subset, top_e.cpu().numpy().sum()
Example #4
def margin(data, model, softmax, args, df=None):
    '''
        input:
        data: dataset
        model: trained model
        softmax: softmax activation function
        df: dataframe of dataset
    '''
    model.eval()
    if args.cluster and (df is not None): kmeans = helpers.clustering(df['text'], args) 
    top_m = torch.ones(args.inc).cuda()
    top_ind = torch.empty(args.inc).cuda()
    text_field = data.fields['text']
    
    for i, example in enumerate(data):
        logits = helpers.get_single_probs(example, text_field, model, softmax, args).cuda()
        logits = logits.sort(descending=True)[0].cuda()
        margin_ = logits[0][0] - logits[0][1]
        if args.cluster and (df is not None):
            if margin_ < top_m[kmeans[i]]: top_m[kmeans[i]], top_ind[kmeans[i]] = margin_, i
        else:
            if margin_ < top_m.max():
                max_m, idx = torch.max(top_m, dim=0)
                top_m[idx], top_ind[idx] = margin_, i
                
    subset = list(top_ind.cpu().numpy())
    for i in range(len(subset)):
        subset[i] = int(subset[i])
    return subset, top_m.cpu().detach().numpy().sum()
Example #5
def dropout_entropy(model, args, df=None):
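    '''
        Select the args.inc test examples with the highest entropy of the mean MC-dropout predictions.
        input:
        model: trained model
        df: dataframe column of text documents
    '''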
    if args.cluster and (df is not None): kmeans = helpers.clustering(df, args)
    probs = []
    for i in range(args.num_preds):
        probs.append(eval_w_dropout.get_preds(model, DatasetType.Test)[0].cuda())
    probs = torch.stack(probs).cuda()
    mean = probs.mean(dim=0).cuda()
    entropies = -(mean*torch.log(mean)).sum(dim=1).cuda()
    sorted_e, sorted_ind = entropies.sort(descending=True)
    if args.cluster and (df is not None):
        top_e = torch.empty(args.inc)
        top_ind = torch.empty(args.inc)
        for i in range(args.inc):
            for j in range(len(sorted_e)):
                if kmeans[sorted_ind[j]] == i:
                    top_e[i] = sorted_e[j]
                    top_ind[i] = sorted_ind[j]
                    break
    else:
        top_e = sorted_e[:args.inc]
        top_ind = sorted_ind[:args.inc]

    subset = list(top_ind.cpu().numpy())
    for i, idx in enumerate(subset): subset[i] = int(idx)
    return subset, top_e.cpu().numpy().sum()
Example #6
def dropout_variation(data, model, softmax, args, df=None):
    '''
        input:
        data: dataset
        model: trained model
        softmax: softmax activation function
        df: dataframe of dataset
    '''
    model.train()
    if args.cluster and (df is not None): kmeans = helpers.clustering(df['text'], args)
    top_var = -torch.ones(args.inc).cuda()
    top_ind = torch.empty(args.inc).cuda()
    text_field = data.fields['text']
    for i, example in enumerate(data):
        probs = []
        for j in range(args.num_preds): probs.append(helpers.get_single_probs(example, text_field, model, softmax, args))
        probs = torch.stack(probs).cuda()
        mean = probs.mean(dim=0).cuda()
        var = 1 - mean.max()
        if args.cluster and (df is not None):
            if var > top_var[kmeans[i]]: top_var[kmeans[i]], top_ind[kmeans[i]] = var, i
        else:
            if var > top_var.min():
                min_var, idx = torch.min(top_var, dim=0)
                top_var[idx], top_ind[idx] = var, i
                
    model.eval()
    subset = list(top_ind.cpu().numpy())
    for i in range(len(subset)):
        subset[i] = int(subset[i])
    return subset, top_var.cpu().detach().numpy().sum()
        
Example #7
def entropy(data, model, log_softmax, args, df=None):
    '''
        input:
        data: dataset
        model: trained model
        log_softmax: log softmax activation function        
        df: dataframe of dataset
    '''
    model.eval()
    if args.cluster and (df is not None): kmeans = helpers.clustering(df['text'], args)
    top_e = -torch.ones(args.inc).cuda()
    top_ind = torch.empty(args.inc).cuda()
    text_field = data.fields['text']
    
    for i, example in enumerate(data):
        logPys = helpers.get_single_probs(example, text_field, model, log_softmax, args).cuda()
        entropy_ = -(logPys*torch.exp(logPys)).sum().cuda()
        if args.cluster and df is not None:
            if entropy_ > top_e[kmeans[i]]: top_e[kmeans[i]], top_ind[kmeans[i]] = entropy_, i
        else:
            if entropy_ > top_e.min():
                min_e, idx = torch.min(top_e, dim=0)
                top_e[idx], top_ind[idx] = entropy_, i
        
    subset = list(top_ind.cpu().numpy())
    for i in range(len(subset)):
        subset[i] = int(subset[i])
    return subset, top_e.cpu().detach().numpy().sum()
Example #8
def variation_ratio(model, args, df=None):
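    '''
        Select the args.inc test examples with the highest variation ratio (1 minus the top predicted probability).
        input:
        model: trained model
        df: dataframe column of text documents
    '''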
    if args.cluster and (df is not None): kmeans = helpers.clustering(df, args)
    preds = model.get_preds(DatasetType.Test)[0].cuda()
    var = 1 - preds.max(dim=1)[0].cuda()
    sorted_var, sorted_ind = var.sort(0, descending=True)
    if args.cluster and (df is not None):
        top_var = torch.empty(args.inc).cuda()
        top_ind = torch.empty(args.inc).cuda()
        for i in range(args.inc):
            for j in range(len(df)):
                if kmeans[sorted_ind[j]] == i:
                    top_var[i] = sorted_var[j]
                    top_ind[i] = sorted_ind[j]
                    break
    else:
        top_var = sorted_var[:args.inc].cuda()
        top_ind = sorted_ind[:args.inc].cuda()
        
    subset = list(top_ind.cpu().numpy())
    for i, idx in enumerate(subset): subset[i] = int(idx)
    return subset, top_var.cpu().numpy().sum()
Example #9
def margin(model, args, df=None):
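    '''
        Select the args.inc test examples with the smallest margin between the two highest predicted probabilities.
        input:
        model: trained model
        df: dataframe column of text documents
    '''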
    if args.cluster and (df is not None): kmeans = helpers.clustering(df, args)
    preds = model.get_preds(DatasetType.Test)[0].cuda()
    sorted_ = preds.sort(descending=True)[0].cuda()
    #print(len(sorted_), len(sorted_[0]), len(sorted_[1]))
    margins = sorted_[:,0] - sorted_[:,1]
    sorted_m, sorted_ind = margins.sort(0,descending=False)
    #print(len(margins))
    if args.cluster and (df is not None):
        top_m = torch.empty(args.inc).cuda()
        top_ind = torch.empty(args.inc).cuda()
        for i in range(args.inc):
            for j in range(len(df)):
                if kmeans[sorted_ind[j]] == i:
                    top_m[i] = sorted_m[j]
                    top_ind[i] = sorted_ind[j]
                    break
    else:
        top_m = sorted_m[:args.inc]
        top_ind = sorted_ind[:args.inc]
        
    subset = list(top_ind.cpu().numpy())
    for i, idx in enumerate(subset): subset[i] = int(idx)
    return subset, top_m.cpu().numpy().sum()
Example #10
def variation_ratio(data, model, softmax, args, df = None):
    '''
        input:
        data: dataset
        model: trained model
        softmax: softmax activation function
        df: dataframe of dataset
    '''
    model.eval()
    if args.cluster and (df is not None):
        kmeans = helpers.clustering(df['text'], args)
        test_k = kmeans.cpu().detach().numpy()
        for i in range(args.inc):
            for j in range(len(data)):
                if test_k[j] == i:
                    print(i, 'check')
                    break
    top_var = -torch.ones(args.inc).cuda()
    top_ind = torch.empty(args.inc).cuda()
    text_field = data.fields['text']
    
    for i, example in enumerate(data):
        logits = helpers.get_single_probs(example, text_field, model, softmax, args).cuda()
        var = 1 - logits.max()
        if args.cluster and (df is not None):
            if var > top_var[kmeans[i]]: top_var[kmeans[i]], top_ind[kmeans[i]] = var, i
        else:
            if var > top_var.min():
                min_var, idx = torch.min(top_var, dim=0)
                top_var[idx], top_ind[idx] = var, i

    subset = list(top_ind.cpu().numpy())
    for i in range(len(subset)):
        subset[i] = int(subset[i])
    return subset, top_var.cpu().detach().numpy().sum()
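The selection functions above share a common contract: given a trained model and an args object with cluster, inc and (for the dropout variants) num_preds fields, they return a list of integer indices into the evaluation pool together with the summed uncertainty of the chosen items. Below is a minimal sketch of how one of them might drive an active-learning round; model, move_to_train and the parameter values are illustrative assumptions, not part of the original code.

from argparse import Namespace

def active_learning_loop(model, move_to_train, rounds=5):
    # Hypothetical args mirroring the fields used by the selection functions above.
    args = Namespace(cluster=False, inc=50, num_preds=10)
    for round_ in range(rounds):
        # pick the args.inc most uncertain test items under MC dropout
        subset, total_uncertainty = dropout_entropy(model, args, df=None)
        print('round %d: selected %d items, total uncertainty %.3f'
              % (round_, len(subset), total_uncertainty))
        # move the chosen indices into the training pool
        # (move_to_train is a hypothetical callback, not defined in these examples)
        move_to_train(subset)
        # ... retrain model on the enlarged training set before the next round ...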
Example #11
                    latent=config['latent'],
                    annealing=False,
                    batch_size=batch_size,
                    prefix=PREFIX,
                    label=label,
                    scale=config['scale'],
                    patience=config['patience'] 
                )
#            res_file = PREFIX+'_res.h5'
#            res_data = h5py.File( name=res_file,mode='r' )
#            dim2 = res_data['RES5']
#            print(np.max(dim2))
        
        print(res.shape)
        k = len( np.unique(label) )
        cl,_ = clustering( res,k=k)
        dm = measure( cl,label )
        
#            res_data.close()
        ### analysis results
        # plot loss
        
        # plot 2-D visualisation
        fig = print_2D( points=res,label=label,id_map=id_map )
#        fig.savefig('embryo.eps')
#        fig = print_2D( points=res_data['RES5'],label=label,id_map=id_map )
#        fig.show()
#        res_data.close()
#        time.sleep(30)
        #res_data.close()
    # plot NMI,ARI curve
Example #12
def vasc(expr,
         epoch=5000,
         latent=2,
         patience=50,
         min_stop=500,
         batch_size=32,
         var=False,
         prefix='test',
         label=None,
         log=True,
         scale=True,
         annealing=False,
         tau0=1.0,
         min_tau=0.5,
         rep=0):
    '''
    VASC: variational autoencoder for scRNA-seq datasets
    
    ============
    Parameters:
        expr: expression matrix (n_cells * n_features)
        epoch: maximum number of epochs, default 5000
        latent: dimension of latent variables, default 2
        patience: stop if the loss shows no significant decrease within *patience* epochs, default 50
        min_stop: minimum number of epochs, default 500
        batch_size: batch size for stochastic optimization, default 32
        var: whether to estimate the variance parameters, default False
        prefix: prefix to store the results, default 'test'
        label: numpy array of true labels, default None
        log: if log-transformation should be performed, default True
        scale: if scaling (making values within [0,1]) should be performed, default True
        annealing: if annealing should be performed for Gumbel approximation, default False
        tau0: initial temperature for annealing or temperature without annealing, default 1.0
        min_tau: minimal tau during annealing, default 0.5
        rep: not used
    
    =============
    Values:
        point: dimension-*latent* results
        A file named (*prefix*_*latent*_res.h5): we prefer to use this file, rather than the single return value, to analyse the results.
        This file includes the following keys:
            POINTS: all intermediated latent results during the iterations
            LOSS: loss values during the training procedure
            RES*i*: i from 0 to 14
                - hidden values just for reference
        We recommend using POINTS and LOSS to select the final results according to the user's preference.
    '''

    expr[expr < 0] = 0.0

    if log:
        expr = np.log2(expr + 1)
    if scale:
        for i in range(expr.shape[0]):
            expr[i, :] = expr[i, :] / np.max(expr[i, :])

#    if outliers:
#        o = outliers_detection(expr)
#        expr = expr[o==1,:]
#        if label is not None:
#            label = label[o==1]

    if rep > 0:
        expr_train = np.matlib.repmat(expr, rep, 1)
    else:
        expr_train = np.copy(expr)

    vae_ = VASC(in_dim=expr.shape[1], latent=latent, var=var)
    vae_.vaeBuild()
    #print_summary( vae_.vae )

    points = []
    loss = []
    prev_loss = np.inf
    #tau0 = 1.
    tau = tau0
    #min_tau = 0.5
    anneal_rate = 0.0003
    for e in range(epoch):
        cur_loss = prev_loss

        #mask = np.ones( expr_train.shape,dtype='float32' )
        #mask[ expr_train==0 ] = 0.0
        if e % 100 == 0 and annealing:
            tau = max(tau0 * np.exp(-anneal_rate * e), min_tau)
            print(tau)

        tau_in = np.ones(expr_train.shape, dtype='float32') * tau
        #print(tau_in.shape)

        loss_ = vae_.vae.fit([expr_train, tau_in],
                             expr_train,
                             epochs=1,
                             batch_size=batch_size,
                             shuffle=True,
                             verbose=0)
        train_loss = loss_.history['loss'][0]
        cur_loss = min(train_loss, cur_loss)
        loss.append(train_loss)
        #val_loss = -loss.history['val_loss'][0]
        res = vae_.ae.predict([expr, tau_in])
        points.append(res[5])
        if label is not None:
            k = len(np.unique(label))

        if e % patience == 1:
            print("Epoch %d/%d" % (e + 1, epoch))
            print("Loss:" + str(train_loss))
            if abs(cur_loss - prev_loss) < 1 and e > min_stop:
                break
            prev_loss = train_loss
            if label is not None:
                try:
                    cl, _ = clustering(res[5], k=k)
                    measure(cl, label)
                except Exception:
                    print('Clustering error')

    #
    ### analysis results
    #cluster_res = np.asarray( cluster_res )
    points = np.asarray(points)
    aux_res = h5py.File(prefix + '_' + str(latent) + '_res.h5', mode='w')
    #aux_res.create_dataset( name='EXPR',data=expr )
    #aux_res.create_dataset( name='CLUSTER',data=cluster_res )
    aux_res.create_dataset(name='POINTS', data=points)
    aux_res.create_dataset(name='LOSS', data=loss)
    count = 0
    for r in res:
        aux_res.create_dataset(name='RES' + str(count), data=r)
        count += 1
    aux_res.close()

    return res[5]
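A minimal usage sketch for vasc; the random expression matrix, label vector and parameter values below are placeholders for illustration, and the results file test_2_res.h5 described in the docstring is written as a side effect.

import numpy as np

# hypothetical expression matrix (n_cells x n_features) and optional true labels
expr = np.random.rand(200, 1000).astype('float32')
label = np.random.randint(0, 4, size=200)

# fit a 2-D latent embedding; intermediate points and losses also go to 'test_2_res.h5'
points = vasc(expr,
              epoch=1000,
              latent=2,
              patience=50,
              batch_size=32,
              prefix='test',
              label=label,
              log=True,
              scale=True,
              annealing=False)
print(points.shape)  # expected (n_cells, latent)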