Example 1
def save_results_for_KM(root_dir, res_dict, method_name, dat_name):
    """
       Save the clustering-quality results generated by K-means and its variants to a CSV file.
    """
    res_dir = os.path.join(
        root_dir, 'results', method_name,
        dat_name)  # get the result directory where the result is stored
    f_manager = FileManager(res_dir)
    f_path = os.path.join(res_dir, 'cls_quality.csv')
    f_manager.add_file(f_path)

    print f_path

    # Then, we save the results to one csv file like
    #        "seed_num"  "time"   "Purity"     "ARI"      "ACC"     "NMI" ...
    #     1      1         000      000         000        000       000  ...
    #     2      2         000      000         000        000       000  ...
    field_names = ['seed', 'time', 'Purity', 'ARI', 'ACC', 'NMI',
                   'd_W']  # fill out the field names for CSV

    with open(f_path,
              mode='wb') as csv_file:  # open the file, creating it if it does not exist
        writer = csv.DictWriter(
            csv_file, fieldnames=field_names
        )  # create a writer which maps the dictionaries onto output rows in CSV
        writer.writeheader()  # write the field names to the header
        for key in res_dict.keys():
            writer.writerow(res_dict[key])
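
A minimal usage sketch for the helper above (not part of the original code): the root directory, method name, dataset name, and metric values are made up, and the keys of res_dict are simply the seed numbers. Each value must be a dict whose keys match field_names, since csv.DictWriter rejects unknown fields by default.

# hypothetical usage of save_results_for_KM; paths and numbers are illustrative
res_dict = {
    1: {'seed': 1, 'time': 0.42, 'Purity': 0.81, 'ARI': 0.55, 'ACC': 0.78, 'NMI': 0.60, 'd_W': 0.01},
    2: {'seed': 2, 'time': 0.40, 'Purity': 0.83, 'ARI': 0.58, 'ACC': 0.80, 'NMI': 0.62, 'd_W': 0.02},
}
save_results_for_KM('/tmp/experiments', res_dict, 'kmeans', 'mnist0')
# -> writes /tmp/experiments/results/kmeans/mnist0/cls_quality.csv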
Example 2
def full_db():

    fm = FileManager('sqlite://')
    fm.init_db()

    for flow_id, object_name, size in FILES:
        parts = flow_id.split('/')
        dataset_id = '/'.join(parts[:2])
        object_name = '/'.join([dataset_id, object_name])
        FILE_TO_FLOW.setdefault(object_name, set()).add(flow_id)
        fm.add_file('bucket', object_name, PRIVACY[dataset_id], parts[0],
                    parts[0] + '_id', dataset_id, flow_id, size, now)

    return fm
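
full_db() relies on module-level fixtures (FILES, PRIVACY, FILE_TO_FLOW, now) that are not shown here. A hedged sketch of what they could look like, purely for illustration; the real test data and the exact add_file signature belong to the project's own FileManager:

# illustrative fixtures assumed by full_db(); flow ids follow an 'org/dataset/flow' pattern
from datetime import datetime

now = datetime.utcnow()
FILE_TO_FLOW = {}
FILES = [
    ('acme/sales/flow1', 'report.csv', 1024),
    ('acme/sales/flow2', 'report.csv', 2048),
]
PRIVACY = {'acme/sales': 'private'}

fm = full_db()  # a FileManager backed by an in-memory SQLite database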
Example 3
def gather_results_by_seeds(root_dir, seeds):
    ''' Gather the results generated by different initializations of the NCP methods.

    Args:
        root_dir: the parent directory that contains one sub-directory per seed
        seeds: a list of the seeds used
    Returns:
        None; CSV files collecting the results from all seeds are written under root_dir
    '''

    size = len(seeds)  # the number of initializations

    nmf_palm = np.ones((8000, size + 2)) * (-1)
    nmf_sncp = np.ones((8000, size + 2)) * (-1)
    onmf_cost_palm = np.ones((8000, size + 2)) * (-1)
    onmf_cost_sncp = np.ones((8000, size + 2)) * (-1)
    cost_palm = np.ones((8000, size + 2)) * (-1)
    cost_sncp = np.ones((8000, size + 2)) * (-1)
    WH_nr = np.ones((8000, size + 2)) * (-1)
    ortho_nr = np.ones((8000, size + 2)) * (-1)

    cls_acc = np.ones((8000, size + 2)) * (-1)
    for seed in seeds:
        res_path = os.path.join(root_dir, 'seed' + str(seed), 'res.csv')
        cls_path = os.path.join(root_dir, 'seed' + str(seed),
                                'cls_quality.csv')
        if not os.path.exists(res_path) or not os.path.exists(cls_path):
            raise ValueError('Error: the result path cannot be found!')

        df = pd.read_csv(res_path, header=0)
        # convert the DataFrame into a NumPy array
        res_arr = df.as_matrix()

        #print res_arr[0, 0]
        dim = res_arr.shape[0]

        # copy the NMF cost of PALM
        nmf_palm[0:dim, seed - 1] = res_arr[:, 0]
        # copy the cost of PALM
        cost_palm[0:dim, seed - 1] = res_arr[:, 1]
        # copy the NMF cost of SNCP
        nmf_sncp[0:dim, seed - 1] = res_arr[:, 2]
        # copy the ONMF cost of SNCP
        onmf_cost_sncp[0:dim, seed - 1] = res_arr[:, 3]
        # copy the ONMF cost of PALM
        onmf_cost_palm[0:dim, seed - 1] = res_arr[:, 4]
        # copy the cost of SNCP
        cost_sncp[0:dim, seed - 1] = res_arr[:, 5]
        # copy the WH_nr values (columns 12 + 13)
        WH_nr[0:dim - 1, seed - 1] = (res_arr[1:, 12].astype(float) +
                                      res_arr[1:, 13].astype(float))
        # copy the ortho_nr values
        ortho_nr[0:dim, seed - 1] = res_arr[:, 14]

        df2 = pd.read_csv(cls_path, header=0)
        cls_arr = df2.as_matrix()
        # copy the clustering accuracy of SNCP
        dim = cls_arr.shape[0]
        cls_acc[0:dim, seed - 1] = cls_arr[:, 5]

    f_manager = FileManager(root_dir)

    nmf_palm_path = os.path.join(root_dir, 'nmf_cost_palm.csv')
    f_manager.add_file(nmf_palm_path)
    m01 = np.mean(nmf_palm[:, 0:size], axis=1)
    std01 = np.std(nmf_palm[:, 0:size], axis=1)
    nmf_palm[:, size] = m01.T
    nmf_palm[:, size + 1] = std01.T
    np.savetxt(nmf_palm_path,
               np.asmatrix(nmf_palm),
               delimiter=',',
               fmt='%.30f')
    nmf_sncp_path = os.path.join(root_dir, 'nmf_cost_sncp.csv')
    f_manager.add_file(nmf_sncp_path)
    m02 = np.mean(nmf_sncp[:, 0:size], axis=1)
    std02 = np.std(nmf_sncp[:, 0:size], axis=1)
    nmf_sncp[:, size] = m02.T
    nmf_sncp[:, size + 1] = std02.T
    np.savetxt(nmf_sncp_path,
               np.asmatrix(nmf_sncp),
               delimiter=',',
               fmt='%.30f')
    onmf_palm_path = os.path.join(root_dir, 'onmf_cost_palm.csv')
    f_manager.add_file(onmf_palm_path)
    m1 = np.mean(onmf_cost_palm[:, 0:size], axis=1)
    std1 = np.std(onmf_cost_palm[:, 0:size], axis=1)
    onmf_cost_palm[:, size] = m1.T
    onmf_cost_palm[:, size + 1] = std1.T
    np.savetxt(onmf_palm_path,
               np.asmatrix(onmf_cost_palm),
               delimiter=',',
               fmt='%.30f')
    onmf_sncp_path = os.path.join(root_dir, 'onmf_cost_sncp.csv')
    f_manager.add_file(onmf_sncp_path)
    m2 = np.mean(onmf_cost_sncp[:, 0:size], axis=1)
    std2 = np.std(onmf_cost_sncp[:, 0:size], axis=1)
    onmf_cost_sncp[:, size] = m2.T
    onmf_cost_sncp[:, size + 1] = std2.T
    np.savetxt(onmf_sncp_path,
               np.asmatrix(onmf_cost_sncp),
               delimiter=',',
               fmt='%.30f')
    sncp_path = os.path.join(root_dir, 'cost_sncp.csv')
    f_manager.add_file(sncp_path)
    m3 = np.mean(cost_sncp[:, 0:size], axis=1)
    std3 = np.std(cost_sncp[:, 0:size], axis=1)
    cost_sncp[:, size] = m3.T
    cost_sncp[:, size + 1] = std3.T
    np.savetxt(sncp_path, np.asmatrix(cost_sncp), delimiter=',', fmt='%.30f')
    palm_path = os.path.join(root_dir, 'cost_palm.csv')
    f_manager.add_file(palm_path)
    m4 = np.mean(cost_palm[:, 0:size], axis=1)
    std4 = np.std(cost_palm[:, 0:size], axis=1)
    cost_palm[:, size] = m4.T
    cost_palm[:, size + 1] = std4.T
    np.savetxt(palm_path, np.asmatrix(cost_palm), delimiter=',', fmt='%.30f')
    WH_nr_path = os.path.join(root_dir, 'WH_NR.csv')
    f_manager.add_file(WH_nr_path)
    m5 = np.mean(WH_nr[:, 0:size], axis=1)
    std5 = np.std(WH_nr[:, 0:size], axis=1)
    WH_nr[:, size] = m5.T
    WH_nr[:, size + 1] = std5.T
    np.savetxt(WH_nr_path, np.asmatrix(WH_nr), delimiter=',', fmt='%.30f')
    ortho_nr_path = os.path.join(root_dir, 'ortho_NR.csv')
    f_manager.add_file(ortho_nr_path)
    m6 = np.mean(ortho_nr[:, 0:size], axis=1)
    std6 = np.std(ortho_nr[:, 0:size], axis=1)
    ortho_nr[:, size] = m6.T
    ortho_nr[:, size + 1] = std6.T
    np.savetxt(ortho_nr_path,
               np.asmatrix(ortho_nr),
               delimiter=',',
               fmt='%.30f')

    cls_acc_path = os.path.join(root_dir, 'cls_acc.csv')
    f_manager.add_file(cls_acc_path)
    m7 = np.mean(cls_acc[:, 0:size], axis=1)
    std7 = np.std(cls_acc[:, 0:size], axis=1)
    cls_acc[:, size] = m7.T
    cls_acc[:, size + 1] = std7.T
    np.savetxt(cls_acc_path, np.asmatrix(cls_acc), delimiter=',', fmt='%.30f')
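
The eight nearly identical blocks above all do the same thing: append the column-wise mean and standard deviation of the per-seed columns to the matrix and write it out. A small refactoring sketch of that pattern (the helper name is mine, not part of the original code):

def _save_with_stats(f_manager, out_path, mat, size):
    # append the mean and std of the first `size` columns, then dump the matrix as CSV
    mat[:, size] = np.mean(mat[:, 0:size], axis=1)
    mat[:, size + 1] = np.std(mat[:, 0:size], axis=1)
    f_manager.add_file(out_path)
    np.savetxt(out_path, np.asmatrix(mat), delimiter=',', fmt='%.30f')

# e.g. _save_with_stats(f_manager, os.path.join(root_dir, 'nmf_cost_palm.csv'), nmf_palm, size)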
Example 4
    def gen_inits_WH(self, init='random', seed=1, H_ortho=True):
        ''' Initialize the factors W and H for nonnegative matrix factorization.
        There are several options:
            1. random ------  generate W, H randomly
            2. kmeans ------  generate H from the cluster assignments obtained by K-means,
                              then W = data_mat * H (since H is orthogonal)
            3. nmf    ------  run sklearn NMF on the data matrix first to get W, H for initialization
            4. kmeans++ ----  use the heuristic k-means++ strategy to get a cluster assignment,
                              which is used for H, and W = data_mat * H

        Args:
            init (string): the method used for generating the initializations
            seed (int): the seed for the random number generator
            H_ortho (bool): whether H is projected onto an orthogonal (indicator-like) matrix
        Returns:
            numpy matrices W_init and H_init
        '''
        ortho = 'ortho' if H_ortho else ''
        data_name = self.data_kind + str(self.data_num)

        initW_path = os.path.join(self.root_dir, 'inits', data_name,
                                  'W' + str(seed) + '.csv')
        initH_path = os.path.join(self.root_dir, 'inits', data_name,
                                  'H' + '_' + ortho + str(seed) + '.csv')
        if os.path.exists(initW_path) and os.path.exists(initH_path):
            if seed < 100:
                W_init = self.read_data_from_csvfile(initW_path)
            H_init = self.read_data_from_csvfile(initH_path)
        else:
            (
                m, n
            ) = self.data_mat.shape  # get the size of data matrix to be decomposed

            np.random.seed(seed)
            if init == 'random':
                abs_mat = np.absolute(self.data_mat)
                #print np.any(abs_mat < 0)
                avg = np.sqrt(abs_mat.mean() / self.num_of_cls)
                print 'mean: ' + str(abs_mat.mean())
                print 'rank: ' + str(self.num_of_cls)
                print 'avg: ' + str(avg)
                W_init = np.asmatrix(avg * np.random.random(
                    (m, self.num_of_cls)))
                H_init = np.asmatrix(avg * np.random.random(
                    (n, self.num_of_cls)))
            elif init == 'kmeans':
                km = sklearn_KMeans(n_clusters=self.num_of_cls).fit(
                    self.data_mat.transpose())
                clusters = km.predict(self.data_mat.transpose())
                H_init = np.asmatrix(np.zeros((n, self.num_of_cls)))
                for i in range(len(clusters)):
                    H_init[i, clusters[i]] = 1
                H_init = H_init * np.diag(
                    np.diag(H_init.transpose() * H_init)**(-0.5))
                W_init = self.data_mat * H_init
            elif init == 'nmf':
                model = sklearn_NMF(n_components=self.num_of_cls,
                                    init='nndsvd',
                                    random_state=0)
                W = model.fit_transform(self.data_mat.transpose())
                H = model.components_
                H_init = np.asmatrix(W)
                W_init = np.asmatrix(H).transpose()
            elif init == 'kmeans++':
                print 'using k++ initialization....'
                data_mat = self.data_mat.transpose()
                initial_centroids = np.ones((self.num_of_cls, m)) * (-1)
                ind_list = []
                idx = np.random.choice(n)
                ind_list.append(idx)
                initial_centroids[0, :] = data_mat[idx, :]
                while len(ind_list) < self.num_of_cls:
                    cent = initial_centroids[0:len(ind_list), :]
                    D2 = np.array([
                        min([LA.norm(x - c)**2 for c in cent])
                        for x in data_mat
                    ])
                    probs = D2 / D2.sum()
                    cumprobs = probs.cumsum()
                    #r = random.random()
                    r = np.random.random()
                    idx = np.where(cumprobs >= r)[0][0]
                    ind_list.append(idx)
                    initial_centroids[len(ind_list) - 1, :] = data_mat[idx, :]
                print ind_list

                W_init = np.asmatrix(initial_centroids).transpose()
                distances = np.ones((n, self.num_of_cls)) * (-1)  # one row per sample
                for centroid_idx in range(self.num_of_cls):
                    for data_idx in range(n):
                        distances[data_idx, centroid_idx] = LA.norm(
                            data_mat[data_idx, :] -
                            initial_centroids[centroid_idx, :])

                cluster_assignments = np.argmin(distances, axis=1)
                temp_H = np.asmatrix(np.zeros((n, self.num_of_cls)))
                for j in range(n):
                    temp_H[j, cluster_assignments[j]] = 1

                #temp_H = np.diag(np.diag(temp_H * temp_H.transpose()) ** (-0.5)) * temp_H
                H_init = np.asmatrix(temp_H)

            else:
                raise ValueError(
                    'Error: invalid init parameter (random, kmeans, nmf, kmeans++)!'
                )

            H_init = np.asmatrix(H_init.transpose())

            if H_ortho:
                #H_init = np.asmatrix(H_init.transpose())
                (ha, hb) = H_init.shape
                ortho = LA.norm(
                    H_init * H_init.transpose() - np.asmatrix(np.eye(ha)),
                    'fro')
                print H_init * H_init.transpose()
                if ortho > 1e-6:
                    H = np.zeros((ha, hb))
                    ind = np.asarray(np.argmax(H_init, 0))[0, :]
                    for j in range(hb):
                        H[ind[j], j] = 1
                    H = np.asmatrix(H)
                    temp = np.diag(H * H.transpose())
                    if np.any(temp == 0):
                        print temp
                        raise ValueError("some rows of H are zeros!!!")
                    H = np.asmatrix(np.diag(temp**(-0.5))) * H
                    H_init = H

        if seed >= 100:
            np.random.seed(seed)
            (m, n) = self.data_mat.shape

            # find the largest cluster and list its sample indices
            cls_idx, cls_sizes = np.unique(self.true_labels,
                                           return_counts=True)
            s_id = cls_idx[np.argmax(cls_sizes)]
            id_list = np.where(self.true_labels == s_id)[0]
            print s_id
            print id_list

            dis_mat = pdist(self.data_mat.transpose())
            print np.argmin(dis_mat)
            print np.unravel_index(dis_mat.argmin(), dis_mat.shape)
            print np.where(dis_mat == np.min(dis_mat[np.nonzero(dis_mat)]))
            print 'select initial points -----'
            select_idx = [997, 998, 999]
            print select_idx
            #print id_list
            #select_idx = np.random.choice(id_list, self.num_of_cls, replace = False)

            W_init = self.data_mat[:, select_idx]
            #raise ValueError('TTEST!')
            W_init = np.asmatrix(W_init)
            print W_init.shape

            # save generated initializations
            f_manager = FileManager(self.root_dir)
            f_manager.add_file(initW_path)
            np.savetxt(initW_path, np.asmatrix(W_init), delimiter=',')
            f_manager.add_file(initH_path)
            np.savetxt(initH_path, np.asmatrix(H_init), delimiter=',')

        return np.asmatrix(W_init), np.asmatrix(H_init)
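
A hedged usage sketch for gen_inits_WH. The class these methods belong to is not named in the excerpts, so DataManager is a placeholder; the constructor arguments follow the __init__ shown in Example 5.

# hypothetical call; the class name, paths and parameters are assumptions
dm = DataManager('/tmp/experiments', is_real=True, data_kind='mnist', data_num=0)
W0, H0 = dm.gen_inits_WH(init='kmeans', seed=1, H_ortho=True)
# W0 is (features x clusters); H0 is (clusters x samples) with (near-)orthonormal rows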
Example 5
    def __init__(self, root_dir, is_real, data_kind, data_num, has_outliers=True,
                 dim_reduced=False, num_of_features=2000, num_of_samples=1000,
                 num_of_cls=10, seed=0):
        self.root_dir = root_dir
        self.is_real = is_real
        self.num_of_cls = num_of_cls
        self.data_kind = data_kind
        self.data_num = data_num
        self.has_outliers = has_outliers
        self.num_of_features = num_of_features
        self.num_of_samples = num_of_samples
        self.dim_reduced = dim_reduced
        dr_str = 'DR' if dim_reduced else ''
        outliers = 'otlrs' if has_outliers else ''
        if is_real:  # real-world datasets: read them from disk, or build and cache them from the originals
            if not self.data_kind in {'mnist', 'tdt2', 'tcga'}:
                raise ValueError(
                    'Error: other data kinds are not supported now!')
            data_path = os.path.join(
                self.root_dir, 'real_data', self.data_kind,
                'data' + dr_str + '#' + str(self.data_num) + '.csv')
            label_path = os.path.join(self.root_dir, 'real_data',
                                      self.data_kind,
                                      'label#' + str(self.data_num) + '.csv')
            print data_path
            print label_path
            print self.root_dir
            if os.path.exists(data_path):  # data file exists, just read it
                self.data_mat = self.read_data_from_csvfile(data_path)
                if not self.dim_reduced:
                    if self.data_kind in {'tdt2', 'mnist'}:
                        self.data_mat = self.data_mat.transpose()
                self.true_labels = self.read_data_from_csvfile(label_path)
                print 'labels shape: ' + str(self.true_labels.shape)
                if self.data_kind in {'tdt2', 'mnist'}:
                    self.true_labels = self.true_labels.transpose()
                self.true_labels = self.true_labels[
                    0, :]  # labels are stored as a matrix, so we just extract row 0
                self.existed = True
            else:  # in case the original dataset without dimension reduction exists
                print False
                orig_data_path = os.path.join(
                    self.root_dir, 'real_data', self.data_kind, 'data#' +
                    str(self.data_num) + '_seed' + str(seed) + '.csv')
                orig_label_path = os.path.join(
                    self.root_dir, 'real_data', self.data_kind, 'label#' +
                    str(self.data_num) + '_seed' + str(seed) + '.csv')
                if os.path.exists(orig_data_path):
                    data_mat = self.read_data_from_csvfile(orig_data_path)
                    #self.data_mat = self.data_mat.transpose()[:, 0:20001] # just for testing
                    labels = self.read_data_from_csvfile(orig_label_path)
                    print(data_mat.shape)
                    self.data_mat = data_mat
                    self.true_labels = labels.transpose()[0, :]
                    if self.dim_reduced:
                        self.data_mat = self.dim_reduction_by_spectral()
                        self.data_mat = self.data_mat.transpose()
                    f_manager = FileManager(self.root_dir)
                    f_manager.add_file(data_path)
                    np.savetxt(data_path,
                               np.asmatrix(self.data_mat),
                               delimiter=',')
                    f_manager.add_file(label_path)
                    np.savetxt(label_path,
                               np.asmatrix(self.true_labels),
                               delimiter=',')
                    self.existed = False
                else:
                    raise ValueError('Error: no available datasets')
        else:
            print('seed: ' + str(seed))
            np.random.seed(seed)  # set the seed
            # at first, we check whether the data file has been generated or not
            base_name = (self.data_kind + '#' + str(self.data_num) + '_' + dr_str +
                         '_' + str(self.num_of_features) + 'x' + str(self.num_of_samples) +
                         '_K' + str(self.num_of_cls) + '_seed' + str(seed))
            data_path = os.path.join(self.root_dir, 'synthetic_data',
                                     base_name + '.csv')
            label_path = os.path.join(self.root_dir, 'synthetic_data',
                                      base_name + '_label.csv')
            print data_path
            if os.path.exists(data_path):  # the data file exists, just read it
                self.data_mat = self.read_data_from_csvfile(data_path)
                self.true_labels = self.read_data_from_csvfile(label_path)
                self.true_labels = self.true_labels[
                    0, :]  # labels are stored as a matrix, so we just extract row 0
                self.existed = True
            else:
                if self.data_kind.startswith('syn'):
                    # generate synthetic data with the linear model
                    self.data_mat, self.true_labels = self.gen_data_with_noise(
                        self.num_of_features, self.num_of_samples,
                        self.num_of_cls, self.data_num, self.has_outliers)
                    if self.dim_reduced:
                        self.data_mat = self.dim_reduction_by_spectral()
                        self.data_mat = self.data_mat.transpose()
                elif self.data_kind.startswith('2d'):
                    self.data_mat, self.true_labels = self.gen_2Data_with_3clusters(
                        data_num=self.data_num)
                else:
                    raise ValueError('Error: no other synthetic datasets!')

                #print (self.root_dir)
                f_manager = FileManager(self.root_dir)
                f_manager.add_file(data_path)
                np.savetxt(data_path,
                           np.asmatrix(self.data_mat),
                           delimiter=',')
                f_manager.add_file(label_path)
                np.savetxt(label_path,
                           np.asmatrix(self.true_labels),
                           delimiter=',')
                self.existed = False

        print 'data_mat'
        print self.data_mat.shape
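
A hedged construction example for the synthetic branch of this constructor; the class name DataManager and all parameter values are illustrative, and data_mat is assumed to come back as (num_of_features x num_of_samples) from gen_data_with_noise.

# hypothetical synthetic-data construction; generated files are cached under root_dir/synthetic_data
dm = DataManager('/tmp/experiments', is_real=False, data_kind='syn', data_num=1,
                 num_of_features=2000, num_of_samples=1000, num_of_cls=10, seed=3)
print(dm.data_mat.shape)      # expected: (2000, 1000)
print(dm.true_labels.shape)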