def remove_undesirable_configs(self, res):
        """Removes undesirable configurations or performs other postprocessing adjustments to the list of configurations.
        For example, if we want to have the LDST filter and LGC algorithms to have the same parameter MU, we can remove configs where they do not match. Moreover, if we want
        to have the number of tuning iterations of LDST to be a fraction of the amount of noise, we can manually instanstiate this number over each list element.
        
            Args:
                res (List[Dict]): A list of all possible configs
            Returns:
            `List[dict]` An updated list of all possible configs. 
        """

        if self.FORCE_GTAM_LDST_SAME_MU:
            old_len = len(res)
            res = [x for x in res if \
                   not (x[ALG_PREFIX+"algorithm"]=="GTAM" and \
                   x[FILTER_PREFIX+"filter"]=="LDST" and x[ALG_PREFIX+"mu"] != x[FILTER_PREFIX+"mu"])]

            res = [x for x in res if \
                   not (x[ALG_PREFIX+"algorithm"]=="LGC" and \
                   x[FILTER_PREFIX+"filter"] in ["LDST","LGC_LVO"] and np.round((1-x[ALG_PREFIX+"alpha"])/x[ALG_PREFIX+"alpha"],4) != np.round(x[FILTER_PREFIX+"mu"],4))]

            LOG.debug("Number of configurations removed due to forcing GTAM/LGC to match the filter's mu param: {}"\
                      .format(old_len-len(res)),LOG.ll.SPECIFICATION)

        if self.TUNING_ITER_AS_NOISE_PCT:
            for x in res:
                if FILTER_PREFIX + "tuning_iter" in x:
                    x[FILTER_PREFIX + "tuning_iter"] = x[INPUT_PREFIX+"labeled_percent"] *\
                                         x[NOISE_PREFIX+"corruption_level"] *\
                                         x[FILTER_PREFIX+"tuning_iter"]
                    x[FILTER_PREFIX + "tuning_iter_as_pct"] = True
        return res
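
    # Usage sketch for the pruning rule above (hypothetical literal prefixes standing
    # in for ALG_PREFIX/FILTER_PREFIX, with FORCE_GTAM_LDST_SAME_MU enabled):
    #
    #   cfgs = [{"alg_algorithm": "GTAM", "flt_filter": "LDST", "alg_mu": 99.0, "flt_mu": 99.0},
    #           {"alg_algorithm": "GTAM", "flt_filter": "LDST", "alg_mu": 99.0, "flt_mu": 9.0}]
    #   cfgs = spec.remove_undesirable_configs(cfgs)  # -> only the first config survives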
    def run_all(self):

        CSV_PATH = os.path.join(CSV_FOLDER, self.get_spec_name() + '.csv')
        JOINED_CSV_PATH = os.path.join(CSV_FOLDER,
                                       self.get_spec_name() + '_joined.csv')

        cfgs = self.get_all_configs()
        cfgs_keys = set()
        for x in cfgs:
            cfgs_keys.update(x.keys())

        #List of produced output dicts
        output_dicts = list()

        cfgs_size = len(cfgs)

        has_written_already = False

        bar = progressbar.ProgressBar(maxval=cfgs_size)
        counter = 0
        bar.start()
        bar.update(0)

        for i in range(cfgs_size):
            print("PROGRESS: {}".format(i / cfgs_size))
            #Maybe suppress output
            nullwrite = open(os.devnull, 'w')
            oldstdout = sys.stdout
            if not self.DEBUG_MODE:
                sys.stdout = nullwrite

            output_dicts.append(self.run(cfgs[i]))

            sys.stdout = oldstdout
            nullwrite.close()
            #Append to csv if conditions are met
            if i == cfgs_size - 1 or i % self.WRITE_FREQ == 0:
                LOG.info("appending csv...", LOG.ll.SPECIFICATION)
                csv_exists = os.path.isfile(CSV_PATH)
                if self.OVERWRITE:
                    if csv_exists and has_written_already:
                        f_mode = 'a'
                    else:
                        f_mode = 'w'
                else:
                    if csv_exists:
                        f_mode = 'a'
                    else:
                        f_mode = 'w'
                LOG.debug("f_mode={}".format(f_mode), LOG.ll.SPECIFICATION)
                self._append_to_csv(output_dicts, CSV_PATH, f_mode, cfgs_keys)
                has_written_already = True
                output_dicts.clear()

            bar.update(i + 1)
        LOG.info(f"CSV saved at f{CSV_PATH}", LOG.ll.SPECIFICATION)
        aggregate_csv([CSV_PATH], JOINED_CSV_PATH)
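
The manual sys.stdout swap in run_all can also be written with contextlib.redirect_stdout from the standard library; a minimal sketch of the equivalent pattern (the DEBUG_MODE flag is an assumption standing in for self.DEBUG_MODE above):

import contextlib
import os

DEBUG_MODE = False  #assumption: stands in for self.DEBUG_MODE

def run_quietly(fn, *args):
    """Run fn, silencing stdout unless DEBUG_MODE is set."""
    if DEBUG_MODE:
        return fn(*args)
    with open(os.devnull, 'w') as nullwrite, contextlib.redirect_stdout(nullwrite):
        return fn(*args)  #stdout is restored and the devnull handle closed on exit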
Example #3
def _add_remaining_vars(f, kwargs, experiment):
    if isinstance(f, partial):
        f = f.func
    f_vars = list(signature(f).parameters.keys())

    LOG.debug("FUNCTION NECESSARY VARS:{}".format(f_vars), LOG.ll.HOOK)

    for k in f_vars:
        if k not in kwargs:
            kwargs[k] = getattr(experiment, k)
    return kwargs
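
A small usage sketch, assuming the module context above (LOG etc.) plus a hypothetical experiment object: missing keyword arguments are read off the experiment's attributes by name.

from functools import partial

class FakeExperiment:  #hypothetical stand-in for an experiment object
    alpha = 0.9
    num_iter = 1000

def classify(X, alpha, num_iter):
    pass

kwargs = _add_remaining_vars(partial(classify), {"X": None}, FakeExperiment())
#kwargs now also contains alpha=0.9 and num_iter=1000, pulled from the experiment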
Example #4
    def createVideo(self):
        if not self.create_video:
            return

        if self.steps_taken == 0:
            return
        LOG.info("Creating video...", LOG.ll.HOOK)
        video_command = "ffmpeg -r {} -y  -pattern_type glob -i '{}' -c:v libx264 -vf fps=25 -pix_fmt yuv420p '{}'".format(\
            self.steps_taken/(15.0*5.0),
            os.path.join(self.filename_dir,self.temp_subfolder_name,"*.png".format(self.str_len)),
            os.path.join(self.filename_dir,self.video_path)
            )
        LOG.debug(video_command, LOG.ll.HOOK)
        os.system(video_command)
        LOG.info("Created video...", LOG.ll.HOOK)
Example #5
def LGC_iter_TF(X,W,Y,labeledIndexes, alpha = 0.1,num_iter = 1000, hook=None):
    c = time.time()
    
    """ Set W to sparse if necessary, make copy of Y """
    W = sparse.csr_matrix(W)        
    Y = np.copy(Y)
    
    """ Convert W to tensor """
    W = convert_sparse_matrix_to_sparse_tensor(W)
    LOG.debug(W,LOG.ll.CLASSIFIER)
    
    """ Get degree Matrix """
    D =  tf.sparse.reduce_sum(W,axis=1)
    
    
    """ F_0 is a copy of the label matrix, but we erase the information on labeled Indexes """
    F_0 = np.copy(Y).astype(np.float32) 
    F_0[np.logical_not(labeledIndexes),:] = 0.0
    
    
    
    """
        CREATE S - Needed for LGC propagation
    """
    S =  get_S_fromtensor(W)
    
    
    """
    CREATE F variable
    """
    F = tf.Variable(np.copy(F_0).astype(np.float32),name="F")
    F_0 = tf.Variable(F_0)
    for _ in range(num_iter):
        F = (1-alpha)*F_0 + alpha*tf.sparse.sparse_dense_matmul(S,F)
    
    elapsed = time.time() - c
    LOG.info('Label Prop done in {:.2f} seconds'.format(elapsed),
             LOG.ll.CLASSIFIER)
    
    return F.numpy()
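
A dense NumPy sketch of the same LGC iteration on a toy 3-node chain graph (plain NumPy standing in for the sparse tensors; S is the symmetrically normalized affinity matrix, which is what get_S_fromtensor is assumed to produce):

import numpy as np

alpha, num_iter = 0.9, 100
W = np.array([[0., 1., 0.],
              [1., 0., 1.],
              [0., 1., 0.]])                   #toy chain graph
d = W.sum(axis=1)
S = W / np.sqrt(np.outer(d, d))                #D^{-1/2} W D^{-1/2}
Y = np.array([[1., 0.], [0., 0.], [0., 1.]])   #nodes 0 and 2 labeled, node 1 not
F = Y.copy()
for _ in range(num_iter):
    F = (1 - alpha) * Y + alpha * S @ F        #same update as the TF loop above
#F rows now hold propagated class scores; the middle node ends up tied between classes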
Example #6
    def __GTAM(self,X,W,Y,labeledIndexes,mu = 99.0,useEstimatedFreq=True,num_iter = None,
             constant_prop=False,hook=None):
        '''BEGIN initialization'''
        Y = self.CLEAN_UNLABELED_ROWS(Y, labeledIndexes)
        labeledIndexes = np.array(labeledIndexes)

        
        if not W.shape[0] == Y.shape[0]:
            raise ValueError("W,Y shape not compatible")
        
        num_labeled = Y[labeledIndexes].shape[0]
        num_unlabeled = Y.shape[0] - num_labeled
        num_classes = Y.shape[1]
        
        
        
        """ Estimate frequency of classes"""
        if isinstance(useEstimatedFreq,bool):
            if useEstimatedFreq == False:
                estimatedFreq = np.repeat(1/num_classes,num_classes)
            elif useEstimatedFreq == True:
                estimatedFreq = np.sum(Y[labeledIndexes],axis=0) / num_labeled
        LOG.debug("Estimated frequency: {}".format(estimatedFreq),LOG.ll.CLASSIFIER)

        
        
        
        D = gutils.deg_matrix(W, flat=True)
        #Identity matrix
        I = np.identity(W.shape[0])
        #Get graph laplacian
        L = gutils.lap_matrix(W, which_lap='sym')
        #Propagation matrix
        from scipy.linalg import inv as invert
        P = invert( I- 1/(1+mu) *(I-L) )*mu/(1+mu)
        
        P_t = P.transpose()
        #Matrix A
        A = ((P_t @ L) @ P) + mu* ((P_t - I) @ (P - I))
        A = 0.5*(A + A.transpose())
        
        if not hook is None:
            W = scipy.sparse.coo_matrix(W)
        
        Z = []
        Q = None
        
        
        #Determine the number of greedy labeling iterations
        if num_iter is None:
            num_iter = num_unlabeled
        else:
            num_iter = min(num_iter,num_unlabeled)
            
        id_min_line, id_min_col = -1,-1
        '''END initialization'''
        #######################################################################################
        '''BEGIN iterations'''
        for i in np.arange(num_iter):

            '''Z matrix - The binary values of current Y are replaced with their corresponding D entries.
                Then, we normalize each row so that row sums to its estimated influence
            '''
            ul = np.logical_not(labeledIndexes)
            
            Z = gutils.calc_Z(Y, labeledIndexes, D, estimatedFreq,weigh_by_degree=True)


            if Q is None:
                #Compute graph gradient
                Q = np.matmul(A,Z)
                if not hook is None:
                    Q_pure = np.copy(Q)
                
                Q[labeledIndexes,:] = np.inf
                
            else:
                Q[id_min_line,:] = np.inf
                d_sj = np.sum(Z[labeledIndexes,id_min_col])
                d_sj1 = d_sj + Z[id_min_line,id_min_col]
                Q[ul,id_min_col] =\
                 (d_sj/(d_sj1) * Q[ul,id_min_col]) + (Z[id_min_line,id_min_col]/d_sj1 * A[ul,id_min_line])
            
            #Find minimum unlabeled index
            
            if constant_prop:
                expectedNumLabels = estimatedFreq * sum(labeledIndexes)
                actualNumLabels = np.sum(Y[labeledIndexes],axis=0)
                class_to_label = np.argmax(expectedNumLabels-actualNumLabels)
                id_min_col = class_to_label
                id_min_line = np.argmin(Q[:,class_to_label])
            else:
                id_min = np.argmin(Q)
                id_min_line = id_min // num_classes
                id_min_col = id_min % num_classes
            
                
            
            #Update Y and labeledIndexes
            labeledIndexes[id_min_line] = True
            Y[id_min_line,id_min_col] = 1
            
            
            
            #Maybe plot current iteration
            
            
            if not hook is None:
                hook._step(step=i,Y=Y,labeledIndexes=labeledIndexes,P=P,Z=Z,Q=Q_pure,
                           id_min_line=id_min_line,id_min_col=id_min_col)
        '''END iterations'''    
        ######################################################################################################
        if self.return_labels:
            return np.asarray(Z)
        else:
            return np.asarray(P@Z)
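
    # A toy NumPy check of the propagation matrix built above,
    # P = mu/(1+mu) * inv(I - (1/(1+mu))*(I - L)), and of the symmetrization of A
    # (plain NumPy standing in for gutils, on a 3-node graph):
    #
    #   W = np.array([[0., 1., 1.], [1., 0., 0.], [1., 0., 0.]])
    #   d = W.sum(axis=1)
    #   L = np.eye(3) - W / np.sqrt(np.outer(d, d))   #symmetric normalized Laplacian
    #   mu, I = 99.0, np.eye(3)
    #   P = np.linalg.inv(I - (I - L) / (1 + mu)) * mu / (1 + mu)
    #   A = P.T @ L @ P + mu * (P.T - I) @ (P - I)
    #   assert np.allclose(0.5 * (A + A.T), (0.5 * (A + A.T)).T)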
    def __MR(self, X, W, Y, labeledIndexes, p, optimize_labels, hook=None):
        """
            -------------------------------------------------------------
                INITIALIZATION
            --------------------------------------------------------------
        """

        ORACLE_Y = Y.copy()
        Y = np.copy(Y)
        if Y.ndim == 1:
            Y = gutils.init_matrix(Y, labeledIndexes)
        Y[np.logical_not(labeledIndexes), :] = 0

        if not W.shape[0] == Y.shape[0]:
            raise ValueError("W,Y shape not compatible")

        l = np.reshape(np.array(np.where(labeledIndexes)), (-1))
        num_lab = l.shape[0]

        if not isinstance(p, int):
            p = int(p * num_lab)

        if p > Y.shape[0]:
            p = Y.shape[0]
            LOG.warn("Warning: p greater than the number of instances; clipping",
                     LOG.ll.CLASSIFIER)
        #W = gutils.scipy_to_np(W)
        #W =  0.5* (W + W.T)
        L = gutils.lap_matrix(W, which_lap='sym')
        D = gutils.deg_matrix(W, flat=True, pwr=-1.0)

        L = 0.5 * (L + L.T)

        def check_symmetric(a, tol=1e-8):
            return np.allclose(a, a.T, atol=tol)

        def is_pos_sdef(x):
            return np.all(np.linalg.eigvals(x) >= -1e-06)

        import scipy.sparse
        sym_err = L - L.T
        sym_check_res = np.all(np.abs(sym_err.data) < 1e-7)  # tune this value
        assert sym_check_res
        """---------------------------------------------------------------------------------------------------
                EIGENFUNCTION EXTRACTION
        ---------------------------------------------------------------------------------------------------
        """
        import time
        start_time = time.time()

        import os.path as osp
        from tf_labelprop.settings import INPUT_FOLDER

        cache_eigvec = osp.join(INPUT_FOLDER, 'eigenVectors.npy')
        cache_eigval = osp.join(INPUT_FOLDER, 'eigenValues.npy')

        eigenVectors, eigenValues = W.load_eigenfunctions(p)

        time_elapsed = time.time() - start_time
        LOG.info("Took {} seconds to calculate eigenvectors".format(
            int(time_elapsed)))
        idx = eigenValues.argsort()
        eigenValues = eigenValues[idx]
        LOG.debug(eigenValues)
        assert eigenValues[0] <= eigenValues[eigenValues.shape[0] - 1]
        eigenVectors = eigenVectors[:, idx]
        np.save(cache_eigval, arr=eigenValues)
        np.save(cache_eigvec, arr=eigenVectors)
        U = eigenVectors
        LAMBDA = eigenValues

        order = np.argsort(LAMBDA)
        U = U[:, order]
        LAMBDA = LAMBDA[order]

        import tensorflow as tf

        gpus = tf.config.experimental.list_physical_devices('GPU')

        #tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*8)])
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        """
        -------------------------------------------------------------------------
            Define Constants on GPU
        ------------------------------------------------------------------------------
        """
        U, X, Y = [tf.constant(x.astype(np.float32)) for x in [U, X, Y]]

        _U_times_U = tf.multiply(U, U)
        N = X.shape[0]

        def to_sp_diag(x):
            n = tf.cast(x.shape[0], tf.int64)
            indices = tf.concat([
                tf.range(n, dtype=tf.int64)[None, :],
                tf.range(n, dtype=tf.int64)[None, :]
            ],
                                axis=0)
            return tf.sparse.SparseTensor(indices=tf.transpose(indices),
                                          values=x,
                                          dense_shape=[n, n])

        @tf.function
        def smooth_labels(labels, factor=0.001):
            # smooth the labels
            labels = tf.cast(labels, tf.float32)
            labels *= (1 - factor)
            labels += (factor / tf.cast(tf.shape(labels)[0], tf.float32))
            # returned the smoothed labels
            return labels

        @tf.function
        def divide_by_row(x, eps=1e-07):
            x = tf.maximum(x, 0 * x)
            x = x + eps  # [N,C]    [N,1]
            return x / (tf.reduce_sum(x, axis=-1)[:, None])

        def spd_matmul(x, y):
            return tf.sparse.sparse_dense_matmul(x, y)

        def mult_each_row_by(X, by):
            """ Elementwise multiplies each row by a given row vector.
            
                For a 2D tensor, this also corresponds to multiplying each column by the respective scalar in the given row vector.
                
                Args:
                    X (Tensor)  
                    by (Tensor[shape=(C,)]): row vector
            
            """
            #[N,C] * [1,C]
            return X * by[None, :]

        def mult_each_col_by(X, by):
            #[N,C] * [N,1]
            return X * by[:, None]

        @tf.function
        def accuracy(y_true, y_pred):
            acc = tf.cast(
                tf.equal(tf.argmax(y_true, axis=-1),
                         tf.argmax(y_pred, axis=-1)), tf.float32)
            acc = tf.cast(acc, tf.float32)
            return tf.reduce_mean(acc)

        """
            -----------------------------------------------------------------------------
            DEFINE VARS
            --------------------------------------------------------------------------------
        """

        MU = tf.Variable(0.1, name="MU")

        LAMBDA = tf.constant(LAMBDA.astype(np.float32), name="LAMBDA")
        PI = tf.Variable(tf.ones(shape=(tf.shape(Y)[0], ), dtype=tf.float32),
                         name="PI")
        _l = LAMBDA.numpy()
        CUTOFF = tf.Variable(0.0, name='CUTOFF')
        CUTOFF_K = tf.Variable(1.0)

        @tf.function
        def get_alpha(MU):
            return tf.pow(2.0, -tf.math.reciprocal(tf.abs(100 * MU)))

        @tf.function
        def to_prob(x):
            return tf.nn.softmax(x, axis=1)

        @tf.function
        def cutoff(x):
            return 1.0 / (1.0 + tf.exp(-CUTOFF_K * (CUTOFF - x)))

        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Conv1D(8, kernel_size=5, padding='same'))
        model.add(tf.keras.layers.Activation('relu'))
        model.add(tf.keras.layers.Conv1D(8, kernel_size=5, padding='same'))
        model.add(tf.keras.layers.Activation('relu'))
        model.add(tf.keras.layers.Conv1D(1, kernel_size=3, padding='same'))

        model.add(tf.keras.layers.Flatten())
        """
            -----------------------------------------------------------------------------
            DEFINE FORWARD
            --------------------------------------------------------------------------------
        """

        @tf.function
        def forward(Y, U, PI, mode='train', remove_diag=True):
            if mode == 'train':
                U = tf.gather(U, indices=np.where(labeledIndexes)[0], axis=0)
                Y = tf.gather(Y, indices=np.where(labeledIndexes)[0], axis=0)
                #F = tf.gather(F,indices=np.where(labeledIndexes)[0],axis=0)

                PI = tf.gather(PI, indices=np.where(labeledIndexes)[0], axis=0)

            pi_Y = spd_matmul(to_sp_diag(tf.abs(PI)), Y)

            alpha = get_alpha(MU)
            """
                Maybe apply custom convolution to LAMBDA, otherwise just fit LGC's alpha using the corresponding filter 1/(1-alpha + alpha*lambda)
            """
            if not self.custom_conv:
                lambda_tilde = tf.math.reciprocal(1 - alpha + alpha * LAMBDA)
            else:
                #lambda_tilde = tf.math.reciprocal(1-alpha + alpha*LAMBDA)
                _lambda = (LAMBDA -
                           tf.reduce_mean(LAMBDA)) / tf.math.reduce_std(LAMBDA)
                lambda_tilde = tf.clip_by_value(
                    2 * tf.nn.sigmoid(
                        tf.reshape(model(_lambda[None, :, None]), (-1, ))), 0,
                    1)
                lambda_tilde = tf.sort(lambda_tilde, direction='DESCENDING')
            lambda_tilde = tf.reshape(divide_by_row(lambda_tilde[None, :]),
                                      (-1, ))

            _self_infl = mult_each_row_by(
                tf.square(U), by=lambda_tilde
            )  #Square each element of U, then dot product of each row with lambda_tilde
            _self_infl = tf.reduce_sum(_self_infl, axis=1)

            _P_op = U @ (mult_each_col_by(
                (tf.transpose(U) @ pi_Y), by=lambda_tilde))
            if not remove_diag:
                _diag_P_op = tf.zeros_like(
                    mult_each_col_by(pi_Y, by=_self_infl))
            else:
                _diag_P_op = mult_each_col_by(pi_Y, by=_self_infl)
            return divide_by_row(_P_op - _diag_P_op), lambda_tilde, pi_Y

        """
            -----------------------------------------------------------------------------
                DEFINE LOSSES and learning schedule
            --------------------------------------------------------------------------------
        """
        losses = {
            'xent':
            lambda y_, y: tf.reduce_mean(-tf.reduce_sum(y_ * tf.cast(
                tf.math.log(smooth_labels(y, factor=0.01)), tf.float32),
                                                        axis=[1])),
            'sq_loss':
            lambda y_, y: tf.reduce_mean(
                tf.reduce_sum(tf.square(y_ - y), axis=[1])),
            'abs_loss':
            lambda y_, y: tf.reduce_mean(
                tf.reduce_sum(tf.abs(y_ - y), axis=[1])),
            'hinge':
            lambda y_, y: tf.reduce_mean(
                tf.reduce_sum(tf.maximum(1. - y_ * y, tf.zeros_like(y)),
                              axis=1))
        }

        NUM_ITER = 700
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            0.5, decay_steps=200, decay_rate=0.9, staircase=False)

        opt = tf.keras.optimizers.Adam(0.05)

        Y_l = tf.gather(Y, indices=np.where(labeledIndexes)[0], axis=0)

        #import matplotlib.pyplot as plt
        #import matplotlib
        #matplotlib.use('tkagg')
        import pandas as pd
        """
            -----------------------------------------------------------------------------
            LEARNING
            --------------------------------------------------------------------------------
        """
        L = []
        df = pd.DataFrame()
        max_acc, min_loss = 0, np.inf
        counter_since_best = 0
        best_trainable_variables = None
        for i in range(NUM_ITER):
            #MU.assign(i)
            with tf.GradientTape() as t:
                # no need to watch a variable:
                # trainable variables are always watched
                pred_L, lambda_tilde, pi_Y = forward(Y, U, PI, mode='train')
                loss_sq = losses['sq_loss'](pred_L, Y_l)
                loss = losses['xent'](pred_L, Y_l)

                loss_xent = losses['xent'](pred_L, Y_l)

            acc = accuracy(Y_l, pred_L)
            _not_lab = np.where(np.logical_not(labeledIndexes))[0]
            acc_true = accuracy(
                tf.gather(ORACLE_Y, indices=_not_lab, axis=0),
                tf.gather(forward(Y, U, PI, mode='eval')[0],
                          indices=_not_lab,
                          axis=0))

            L.append(
                np.array([i, loss_sq, loss, loss_xent, acc,
                          acc_true])[None, :])
            """
                TRAINABLE VARIABLES GO HERE
            """
            if self.custom_conv:
                trainable_variables = model.weights
            else:
                trainable_variables = [MU]
            if optimize_labels:
                trainable_variables.append(PI)

            if acc > max_acc:
                best_trainable_variables = [
                    k.numpy() for k in trainable_variables
                ]
                max_acc = acc
                min_loss = loss
                counter_since_best = 0
            else:
                counter_since_best += 1
                if counter_since_best > 2000:
                    break
            """
                Apply gradients
            """
            gradients = t.gradient(loss, trainable_variables)
            opt.apply_gradients(zip(gradients, trainable_variables))
            """
                Project labels such that they sum up to the original amount
            """
            pi = PI.numpy()
            pi[labeledIndexes] = np.sum(
                labeledIndexes) * pi[labeledIndexes] / (np.sum(
                    pi[labeledIndexes]))
            PI.assign(pi)

            if i % 10 == 0:
                """ Print info """
                if not hook is None:
                    if self.hook_iter_mode == "labeled":
                        plot_y = np.zeros_like(Y)
                        plot_y[labeledIndexes] = Y_l.numpy()
                    else:
                        plot_y = tf.clip_by_value(
                            forward(Y, U, PI, mode='eval')[0], 0,
                            999999).numpy()
                    hook._step(step=i,
                               X=X,
                               W=W,
                               Y=plot_y,
                               labeledIndexes=labeledIndexes)
                alpha = get_alpha(MU)
                PI_l = tf.gather(PI,
                                 indices=np.where(labeledIndexes)[0],
                                 axis=0)
                LOG.info(
                    f"Acc: {acc.numpy():.3f}; ACC_TRUE:{acc_true.numpy():.3f}  Loss: {loss.numpy():.3f}; alpha = {alpha.numpy():.3f}; PI mean = {tf.reduce_mean(PI_l).numpy():.3f} "
                )

        #plt.scatter(range(lambda_tilde.shape[0]),np.log10(lambda_tilde/LAMBDA),s=2)
        #plt.show()
        if best_trainable_variables is not None:
            for k in range(len(trainable_variables)):
                trainable_variables[k].assign(best_trainable_variables[k])
        return tf.clip_by_value(forward(Y, U, PI, mode='eval')[0], 0,
                                999999).numpy()
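
A NumPy sketch of the spectral filter being fitted above when custom_conv is off: LGC's frequency response 1/(1 - alpha + alpha*lambda), evaluated on the Laplacian eigenvalues and then normalized to sum to one, mirroring the divide_by_row step:

import numpy as np

alpha = 0.9
lam = np.linspace(0.0, 2.0, 8)   #eigenvalues of the normalized Laplacian lie in [0, 2]
lambda_tilde = 1.0 / (1.0 - alpha + alpha * lam)
lambda_tilde = lambda_tilde / lambda_tilde.sum()   #normalize, as divide_by_row does
#Low-frequency (small lambda) eigenvectors receive the largest weights:
assert np.all(np.diff(lambda_tilde) <= 0)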
Example #8
    def run(self,hook_list=PLOT_HOOKS):
        for k,v in self.args.items():
            LOG.info("{}:{}".format(k,v),LOG.ll.EXPERIMENT)
        
        
        #Multiplex the arguments, allocating each to the correct step
        mplex = postprocess(keys_multiplex(self.args))
        
        
        #Get Hooks:
        hooks = select_and_add_hook(hook_list, mplex, self)
        LOG.info("Step 1: Read Dataset",LOG.ll.EXPERIMENT)
        
        #Select Input 
        self.X,self.W,  self.Y_true, self.labeledIndexes = select_input(**mplex["INPUT"])
        
        if self.W is None:
            self.W = select_affmat(**mplex["AFFMAT"]).generateAffMat(self.X,hook=hooks["AFFMAT"])
        
        
        
        if "know_estimated_freq" in mplex["ALG"].keys():
            if mplex["ALG"]['know_estimated_freq']:
                mplex["ALG"]["use_estimated_freq"] = np.sum(self.Y_true,axis=0) / self.Y_true.shape[0]
            mplex["ALG"].pop("know_estimated_freq")
            
        if "know_estimated_freq" in mplex["FILTER"].keys():
            if mplex["ALG"]['know_estimated_freq']:
                mplex["FILTER"]["use_estimated_freq"] = np.sum(self.Y_true,axis=0) / self.Y_true.shape[0]
            mplex["FILTER"].pop("know_estimated_freq")
        LOG.info("Step 2: Apply Noise",LOG.ll.EXPERIMENT)
        #Apply Noise
        self.Y_noisy = select_noise(**mplex["NOISE"]).corrupt(self.Y_true, self.labeledIndexes,hook=hooks["NOISE"])
        LOG.info("Step 3: Create Affinity Matrix",LOG.ll.EXPERIMENT)
        #Generate Affinity Matrix
        self.W = select_affmat(**mplex["AFFMAT"]).generateAffMat(self.X,hook=hooks["AFFMAT"])
        
        
        
        LOG.info("Step 4: Filtering",LOG.ll.EXPERIMENT)
        #Create Filter
        ft = select_filter(**mplex["FILTER"])
        self.ft = ft

        
        noisyIndexes = (np.argmax(self.Y_true,axis=1) != np.argmax(self.Y_noisy,axis=1))
        
        self.Y_filtered, self.labeledIndexes_filtered = ft.fit(self.X, self.Y_noisy, self.labeledIndexes, self.W, hook=hooks["FILTER"])
        
        
        LOG.info("Step 5: Classification",LOG.ll.EXPERIMENT)
        #Select Classifier 
        alg = select_classifier(**mplex["ALG"])
        #Get Classification
        self.F = alg.fit(self.X,self.W,self.Y_filtered,self.labeledIndexes_filtered,hook=hooks["ALG"])
        
        
        LOG.info("Step 6: Evaluation",LOG.ll.EXPERIMENT)
        LOG.debug("ALGORITHM settings:{}".format(mplex["ALG"]["algorithm"]),LOG.ll.EXPERIMENT)
        
        """ Accuracy. """
        acc = gutils.accuracy(gutils.get_pred(self.F), gutils.get_pred(self.Y_true))
        
        
        acc_unlabeled = gutils.accuracy(gutils.get_pred(self.F)[np.logical_not(self.labeledIndexes)],\
                                         gutils.get_pred(self.Y_true)[np.logical_not(self.labeledIndexes)])
        acc_labeled = gutils.accuracy(gutils.get_pred(self.F)[self.labeledIndexes],\
                                         gutils.get_pred(self.Y_true)[self.labeledIndexes])
        
        
        CMN_acc = gutils.accuracy(gutils.get_pred(gutils.class_mass_normalization(self.F,self.Y_filtered,self.labeledIndexes,normalize_rows=True)), gutils.get_pred(self.Y_true))
      
        
        """
            Log accuracy results and update output dictionary
        """
        def _log(msg):
            LOG.info(msg,LOG.ll.EXPERIMENT)
            
        _log("Accuracy: {:.3%} | {:.3%}".format(acc,1-acc))
        _log("Accuracy (unlabeled): {:.3%} |{:.3%}".format(acc_unlabeled,1-acc_unlabeled))
        _log("Accuracy (labeled): {:.3%} | {:.3%}".format(acc_labeled,1-acc_labeled))    
        _log("Accuracy w/ CMN: {:.3%} | {:.3%}".format(CMN_acc,1-CMN_acc))
        
        self.out_dict.update({OUTPUT_PREFIX + "acc" :acc})
        self.out_dict.update({OUTPUT_PREFIX + "acc_unlabeled" :acc_unlabeled})
        self.out_dict.update({OUTPUT_PREFIX + "acc_labeled" :acc_labeled})
        self.out_dict.update({OUTPUT_PREFIX + "CMN_acc" :CMN_acc})
        
        
        
        return self.out_dict
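
Class-mass normalization (behind the CMN_acc metric above) rescales each class column of F so that the unlabeled class mass matches priors estimated from the labeled set. A minimal NumPy sketch of that idea; the actual gutils.class_mass_normalization may differ in details:

import numpy as np

def cmn(F, Y, labeledIndexes):
    """Rescale class columns of F toward priors estimated from labeled rows."""
    q = Y[labeledIndexes].sum(axis=0) / Y[labeledIndexes].sum()  #class priors
    mass = F[~labeledIndexes].sum(axis=0)                        #unlabeled mass per class
    F = F * (q / np.maximum(mass, 1e-12))[None, :]
    return F / F.sum(axis=1, keepdims=True)                      #row-normalize

F = np.array([[0.8, 0.2], [0.6, 0.4], [0.3, 0.7]])
Y = np.array([[1., 0.], [0., 0.], [0., 1.]])
lab = np.array([True, False, True])
print(cmn(F, Y, lab))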
Example #9
    def __MR(self, X, W, Y, labeledIndexes, p, tuning_iter, hook=None):
        Y = np.copy(Y)
        if Y.ndim == 1:
            Y[np.logical_not(labeledIndexes)] = 0
            Y = gutils.init_matrix(Y, labeledIndexes)
        Y[np.logical_not(labeledIndexes), :] = 0
        if not W.shape[0] == Y.shape[0]:
            raise ValueError("W,Y shape not compatible")

        l = np.reshape(np.array(np.where(labeledIndexes)), (-1))
        num_lab = l.shape[0]

        if not isinstance(p, int):
            p = int(p * num_lab)
        if p > Y.shape[0]:
            p = Y.shape[0]
            LOG.warn("Warning: p greater than the number of instances; clipping",
                     LOG.ll.FILTER)

        W = scipy_to_np(W)
        L = gutils.lap_matrix(W, which_lap='sym')
        D = gutils.deg_matrix(W)

        def check_symmetric(a, tol=1e-8):
            return np.allclose(a, a.T, atol=tol)

        if check_symmetric(L):
            E = sp.eigh(L, D, eigvals=(1, p))[1]
        else:
            LOG.warn("Warning: Laplacian not symmetric", LOG.ll.FILTER)
            eigenValues, eigenVectors = sp.eig(L, D)
            idx = eigenValues.argsort()
            eigenValues = eigenValues[idx]
            assert eigenValues[0] <= eigenValues[eigenValues.shape[0] - 1]
            eigenVectors = eigenVectors[:, idx]
            E = eigenVectors[:, 1:(p + 1)]

        e_lab = E[labeledIndexes, :]
        """ TIKHONOV REGULARIZATION. Currently set to 0."""
        TIK = np.zeros(shape=e_lab.shape)
        try:
            A = np.linalg.inv(e_lab.T @ e_lab + TIK.T @ TIK) @ e_lab.T
        except np.linalg.LinAlgError:
            A = np.linalg.pinv(e_lab.T @ e_lab + TIK.T @ TIK) @ e_lab.T
        F = np.zeros(shape=Y.shape)

        y_m = np.argmax(Y, axis=1)[labeledIndexes]

        for i in range(Y.shape[1]):
            c = np.ones(num_lab)
            c[y_m != i] = -1
            a = A @ np.transpose(c)
            LOG.debug(a, LOG.ll.FILTER)
            for j in np.arange(F.shape[0]):
                F[j, i] = np.dot(a, E[j, :])

        ERmat = -1 * np.ones((Y.shape[0], ))

        Y_amax = np.argmax(Y, axis=1)
        for i in np.where(labeledIndexes)[0]:
            ERmat[i] = np.square(Y[i, Y_amax[i]] - F[i, Y_amax[i]])

        removed_Lids = np.argsort(ERmat)
        removed_Lids = removed_Lids[::-1]

        labeledIndexes = np.array(labeledIndexes)
        Y = np.copy(Y)
        for i in range(tuning_iter):
            labeledIndexes[removed_Lids[i]] = False
            if not hook is None:
                hook._step(step=i,
                           X=X,
                           W=W,
                           Y=Y,
                           labeledIndexes=labeledIndexes)

        return Y, labeledIndexes
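
The filter above scores each labeled point by the squared gap between its label and the eigenfunction reconstruction F, then unlabels the tuning_iter worst offenders. A toy NumPy sketch of just that removal ordering:

import numpy as np

#Squared reconstruction error per point (toy values; -1 marks unlabeled points)
ERmat = np.array([-1., 0.9, 0.1, 0.5])
labeled = np.array([False, True, True, True])

removed_Lids = np.argsort(ERmat)[::-1]   #worst-fitting labels first
tuning_iter = 2
for i in range(tuning_iter):
    labeled[removed_Lids[i]] = False     #unlabel the top offenders
print(labeled)                           #-> [False False  True False]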
Example #10
def apply_noise(Y, labeledIndexes, A, seed=None, deterministic=True):
    """ Corrupts a set percentage of initial labels with noise.
    
    Args:
        Y (`[NDArray[int].shape[N,C]`) : Matrix encoding initial beliefs.
        A (`[NDArray[int].shape[C,C]`): Transition probabilities between each class.
        labeledIndexes (`NDArray[bool].shape[N]`) : determines which indices are to be considered as labeled.
        seed (float) : Optional. Used to reproduce results. 
        
    Returns:
        `NDArray[int].shape[N,C]` : Belief matrix after corruption.
        
    """
    np.random.seed(seed)
    old_A = np.copy(np.asarray(A))
    if not np.all(old_A <= 1):
        LOG.debug(old_A, LOG.ll.NOISE)
        raise Exception("trans. mat has value >1")
    old_Y = np.copy(Y)
    is_flat = np.ndim(Y) == 1
    if is_flat:
        Y = gutils.init_matrix(Y, labeledIndexes)
    c = Y.shape[1]
    n = Y.shape[0]

    Y = Y[labeledIndexes, :]
    Y_flat = np.argmax(Y, axis=1)

    vec = np.random.RandomState(seed).permutation(Y.shape[0])
    assert vec is not None
    cursor = np.zeros((c), dtype=np.int32)

    if deterministic:
        A = transition_count_mat(Y, A)
    else:

        class_freq = [int(np.sum(Y[:, i])) for i in range(c)]

        num_clean = np.sum(labeledIndexes) * sum(
            [old_A[i, i] for i in range(c)]) / c

        num_clean = int(np.round(num_clean))
        num_noisy = np.sum(labeledIndexes) - num_clean

        ##########
        perm = np.random.permutation(Y.shape[0])[0:num_noisy]
        A = np.zeros((c, c))
        for i in range(c):
            A[i, i] = class_freq[i]

        for my_id in perm:
            j = np.argmax(Y[my_id, :])
            A[j, j] -= 1
            new_j = j
            while new_j == j:
                new_j = np.random.choice(c)
            A[j, new_j] += 1

        assert np.sum(A) == np.sum(labeledIndexes)
        LOG.debug(A, LOG.ll.NOISE)
        ###############

    for i in np.arange(Y_flat.shape[0]):
        current_class = Y_flat[vec[i]]
        while A[current_class, cursor[current_class]] == 0:
            cursor[current_class] += 1
            assert cursor[current_class] < c
        Y_flat[vec[i]] = cursor[current_class]
        A[current_class, cursor[current_class]] -= 1

    noisy_Y = np.zeros(shape=(n, c))
    labeledIndexes_where = np.where(labeledIndexes)[0]
    for l in range(Y_flat.shape[0]):
        noisy_Y[labeledIndexes_where[l], Y_flat[l]] = 1
    noisy_Y[np.logical_not(labeledIndexes), :] = old_Y[
        np.logical_not(labeledIndexes), :]
    LOG.info(
        "Changed a fraction {} of the labeled entries".format(
            np.round(1 - gutils.accuracy(np.argmax(Y, axis=1), Y_flat), 6)),
        LOG.ll.NOISE)

    if is_flat:
        old_Y[labeledIndexes] = np.argmax(noisy_Y[labeledIndexes], axis=1)
        return old_Y
    else:
        return noisy_Y
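
A usage sketch in the context of this module (apply_noise as defined above, with a hypothetical two-class uniform-noise transition matrix): unlabeled entries are untouched, and about half of the labeled entries change class.

import numpy as np

Y = np.array([0, 0, 1, 1, 0, 1])                             #flat labels, 2 classes
labeled = np.array([True, True, True, True, False, False])   #first four are labeled

#Keep 50% of labels clean, flip the remainder to another class
A = np.array([[0.5, 0.5],
              [0.5, 0.5]])

Y_noisy = apply_noise(Y, labeled, A, seed=42, deterministic=False)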
Example #11
    def __SIIS(self,X,W,Y,labeledIndexes,m,alpha,beta,rho,max_iter,hook=None):
        Y = self.CLEAN_UNLABELED_ROWS(Y, labeledIndexes)
        
        if not W.shape[0] == Y.shape[0]:
            raise ValueError("W,Y shape not compatible")
        
        if m is None:
            m = W.shape[0]
        
        c = Y.shape[1]
        
        D = gutils.deg_matrix(W, pwr=1.0)
        L = gutils.lap_matrix(W, which_lap='sym')
        
        U, SIGMA = W.load_eigenfunctions(m=m,remove_first_eig=False)
        
        U = scipy.sparse.csr_matrix(U)
        SIGMA =  _to_np(scipy.sparse.diags([SIGMA],[0]))
        
        J = gutils.labels_indicator(labeledIndexes)
        
        """ Edge matrix P, used in the smoothness term of the cost """
        P = SIISClassifier.edge_mat(W) 
        
        
        
        """ Initialize params """
        LAMB_1 = np.ones((P.shape[0],c))
        LAMB_2 = np.ones((Y.shape[0],c))
        mu = 1.0
        mu_max = 10000000.0
        eps = 1/(10000)
        
        """ Reusable matrices """
        JU = _to_np(J@U)
        PU = _to_np(P@U)
        PU_T = PU.transpose()
        JU_T = JU.transpose()
        
        
        
        A = np.zeros((m,c))
        Q = None
        B = None
        
        improvement  = 1
        iter = 0
        
        """ TODO: Tensorflow version 
            import tensorflow as tf
            with tf.Session() as sess:
                A = tf.Variable(1e-06*tf.ones((m,c),dtype=tf.float64))
                sess.run(tf.global_variables_initializer())
                
                C = tf.reduce_sum(tf.linalg.norm(tf.matmul(PU,A),axis=1)) +\
                 alpha*tf.reduce_sum(tf.linalg.norm(tf.matmul(_to_np(U)[labeledIndexes,:],A)-Y[labeledIndexes,:],axis=1)) +\
                 beta* tf.trace(tf.matmul(tf.matmul(tf.transpose(A),SIGMA),A))
                opt = tf.train.AdamOptimizer(learning_rate=0.5*1e-02)
                opt_min = opt.minimize(C)
                sess.run(tf.global_variables_initializer())
                for i in range(2000):
                    sess.run(opt_min)
                    LOG.debug(sess.run(C),LOG.ll.CLASSIFIER)
                LOG.debug(sess.run(C),LOG.ll.CLASSIFIER)    
                F = _to_np(U)@sess.run(A)
                
                LOG.debug(F.shape,LOG.ll.CLASSIFIER)
            
        
        """
        while iter <= max_iter and improvement > eps:
            
            """ Update Q """
            N = PU@A - (1/mu)*LAMB_1
            N_norm = np.linalg.norm(N, axis=1)
            
            
            to_zero = N_norm <= (1/mu)
            mult = ((N_norm - (1/mu))/N_norm)
            N = N * mult[:,np.newaxis]
            
            
            N[to_zero,:] = 0.0
            Q = N 
            
            """ Update B """
            M = JU@A - Y - (1/mu)*LAMB_2
            M_norm = np.linalg.norm(M,axis=1)
            to_zero = M_norm <= (alpha/mu)
            mult = ((M_norm - (alpha/mu))/M_norm)
            M = M * mult[:,np.newaxis]
            M[to_zero,:] = 0.0 
            B = M
            
            
            old_A = A
            """ Update A """
            
            A_inv_term = 2*beta*SIGMA + mu*PU_T@PU + mu*JU_T@JU
            A_inv_term = np.linalg.inv(A_inv_term) 
            A = A_inv_term @ \
                (PU_T@ LAMB_1 + JU_T@LAMB_2 +\
                  mu * PU_T@Q + mu* JU_T @ (B + Y) )
        
            """ Update Lagrangian coeffs """
            LAMB_1 = LAMB_1 + mu* (Q - PU@A)
            LAMB_2 = LAMB_2 + mu*(B- JU@A + Y)
            """ Update penalty coeffficients """
            mu = min(rho*mu,mu_max)
        
        
            
            denom = np.max(np.abs(old_A))
            if denom > 0:
                improvement = np.max(np.abs(A-old_A))/denom
                
            
            
            LOG.debug("Iter {}".format(iter),LOG.ll.CLASSIFIER)
            iter += 1
        
        C = np.sum(np.linalg.norm(PU@A,axis=1)) + alpha*np.sum(np.linalg.norm(JU@A - Y,axis=1)) +\
             beta*np.trace(A.T@SIGMA@A)
        LOG.debug("Iter {} - Cost {}".format(iter,C),LOG.ll.CLASSIFIER)
            
        
        F = U@A
            
            
        for i in range(F.shape[0]):
            mx = np.argmax(F[i,:])
            F[i,:] = 0.0
            F[i,mx] = 1.0
        
        
        return F
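
The Q and B updates above are two instances of the same row-wise group soft-threshold: shrink each row's L2 norm by a constant, zeroing rows that fall below it. A standalone NumPy sketch of that operator:

import numpy as np

def row_shrink(M, tau):
    """Group soft-thresholding: shrink each row's L2 norm by tau."""
    norms = np.linalg.norm(M, axis=1)
    scale = np.where(norms > tau, (norms - tau) / np.maximum(norms, 1e-12), 0.0)
    return M * scale[:, None]

M = np.array([[3., 4.],      #norm 5   -> rescaled to norm 4
              [0.3, 0.4]])   #norm 0.5 -> zeroed out
print(row_shrink(M, 1.0))    #the Q update uses tau=1/mu, the B update tau=alpha/mu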
Example #12
def calculate_statistics(df):
    """ Obtains a dataframe which 'merges' runs that share the same configuration but different ``'id'``.
        For each output variable (as in, one that uses the prefix :const:`experiment.prefixes.OUTPUT_PREFIX`), a few summary statistics are calculated:
         
         * A string with the comma-separated values.
         * The mean of the attribute.
         * The standard deviation of the attribute.
         * The median of the attribute.
         * The minimum value of the attribute.
         * The maximum value of the attribute.
         
    Args:
        df (pd.Dataframe): The original dataframe
    Returns:
        pd.Dataframe: A dataframe with statistics about runs that share same configuration.
    
    """
    
    '''
        out_dict (dict) : A dictionary with k,v pairs:
            
            k (str): a string uniquely identifying the runs that differ only in id
            v (dict): A dictionary with k1, v1 pairs:
            
                k1 (str): some output attribute
                v1: The value of the output attribute 
        
    '''
    out_dict = {}
    freq_dict = {}
    index_dict = {}
    
    rel_cols = [x  for x in df.columns if not (x.startswith(OUTPUT_PREFIX) or x == "id" or x == "index") ]
    out_cols = [x  for x in df.columns if x.startswith(OUTPUT_PREFIX) ]
    
    
    LOG.debug("rows:{}".format(df.shape[0]),LOG.ll.SPECIFICATION)
    
    for i in range(df.shape[0]):
        debug(i)
        k = str(df.loc[df.index[i],rel_cols].values)

        #Initialize with empty dicts
        if not k in out_dict.keys():
            out_dict[k] = {}
            freq_dict[k] = 0
            
        
        if index_dict.get(k,None) is None:
            index_dict[k] = i
        
        freq_dict[k] += 1
        for k1 in out_cols:            
            v1 = df.loc[df.index[i],k1]
            L = (out_dict[k]).get(k1,[])    
            L.append(v1)
            (out_dict[k])[k1] = L
    

    agg_cols =  [[x + "_mean", x + "_sd", x + "_values",\
                  x + "_min", x + "_max", x + "_median"] for x in out_cols]
    agg_cols = [item for sublist in  agg_cols for item in sublist]
    agg_cols += ["out_num_experiments"] + rel_cols
    
    key_list = list(index_dict.keys())
    
    new_df = pd.DataFrame(index=range(len(key_list)),columns=agg_cols)


    
    debug("Num keys:{}".format(len(key_list)))
    for i in range(len(key_list)):
        debug(i)
        k = key_list[i]
        new_df.loc[i,"out_num_experiments"] = freq_dict[k]

        new_df.loc[i,rel_cols] = df.loc[df.index[index_dict[k]],rel_cols]
        for k1 in out_cols:
            vals = out_dict[k][k1]
            new_df.loc[i, k1 + "_mean"] = np.mean(vals)
            new_df.loc[i, k1 + "_sd"] = np.std(vals)
            new_df.loc[i, k1 + "_values"] = ','.join([str(x) for x in vals])
            new_df.loc[i, k1 + "_min"] = min(vals)
            new_df.loc[i, k1 + "_max"] = max(vals)
            new_df.loc[i, k1 + "_median"] = np.median(vals)
        
        
    new_df = new_df.loc[:,[x not in ['acc','id','index'] for x in new_df.columns]]
    new_df = new_df.reindex(sorted(new_df.columns), axis=1)
    new_df = new_df.sort_values(by = list(new_df.columns))
    return new_df
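
A toy usage sketch, assuming OUTPUT_PREFIX == 'out_' (consistent with the 'out_num_experiments' column above): two runs that share a config and differ only in 'id' collapse into one row of summary statistics.

import pandas as pd

df = pd.DataFrame({
    "alg":     ["LGC", "LGC"],
    "id":      [1, 2],
    "out_acc": [0.90, 0.80],   #an OUTPUT_PREFIX column
})

stats = calculate_statistics(df)
#One row, with out_acc_mean=0.85, out_acc_sd=0.05, out_acc_values="0.9,0.8",
#out_acc_min=0.8, out_acc_max=0.9, out_acc_median=0.85, out_num_experiments=2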
Example #13
def info(msg):
    LOG.info(msg,LOG.ll.OUTPUT)
Example #14
def debug(msg):
    LOG.debug(msg,LOG.ll.OUTPUT)