Example #1
from numpy import unique as np_unique


def anonymize(df: "pandas.DataFrame") -> "pandas.DataFrame":
    """
    Remove private data from a dataframe

    Any column containing at least one piece of private data is removed from
    the dataframe. This is a naive solution but limits the possibility of
    false negatives.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to anonymize

    Returns
    -------
    pd.DataFrame
        The dataframe with columns containing private data removed
    """
    private_cols = []

    # each check_* helper (defined elsewhere in this module) returns the
    # names of the columns it flags as private
    checks = [check_addresses, check_emails, check_phonenumbers]
    for check in checks:
        new_private_cols = check(df)
        private_cols += new_private_cols

    # Get unique columns
    private_cols = np_unique(private_cols).tolist()

    # Drop columns
    return df.drop(private_cols, axis=1)
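
A minimal usage sketch (not part of the original example): the check_* helpers live elsewhere in the source module, so toy stand-ins are assumed here, with check_emails flagging any column that contains an "@".

import pandas as pd

def check_emails(df):
    # toy stand-in: flag any column with an "@" anywhere in its values
    return [c for c in df.columns if df[c].astype(str).str.contains("@").any()]

def check_addresses(df):
    return []  # stand-in only

def check_phonenumbers(df):
    return []  # stand-in only

people = pd.DataFrame({"name": ["Ada", "Bob"],
                       "email": ["ada@example.com", "bob@example.com"]})
print(anonymize(people).columns.tolist())  # ['name'] -- the email column is dropped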
Example #2
    def predict(self, X, post_analyze_distribution=False, verbose=1):
        df = pd_df(X)
        print("started prediction for ", self.cluster_model, " X(", X.shape,
              ")")

        # only KMeans exposes cluster centers; reset so a stale value is not reused
        self.kluster_centers = None
        if self.cluster_model == 'KMeans':
            # default vals for kmeans --> max_iter=300, 1e-4
            self.predictedKlusters = self.trained_model.predict(df).astype(
                float)
            self.kluster_centers = self.trained_model.cluster_centers_.astype(
                float)
        elif self.cluster_model in ('GMM_full', 'GMM_diag'):
            # default vals for gmm --> max_iter=100, 1e-3
            # _e_step is a private sklearn method returning (log_prob_norm, log_resp)
            _, log_resp = self.trained_model._e_step(df)
            self.predictedKlusters = log_resp.argmax(axis=1)
        elif self.cluster_model == 'Spectral':
            # SpectralClustering has no predict(); fit_predict re-clusters the data
            self.predictedKlusters = self.trained_model.fit_predict(X)

        self.kluster_centroids = get_cluster_centroids(
            X,
            self.predictedKlusters,
            kluster_centers=self.kluster_centers,
            verbose=0)

        if post_analyze_distribution:
            numOf_1_sample_bins, histSortedInv = analyzeClusterDistribution(
                self.predictedKlusters, self.n_clusters, verbose=1)
            unique_clust_cnt = len(np_unique(self.predictedKlusters))
            print("prediction completed for ",
                  self.cluster_model, " - unique_clust_cnt(",
                  str(unique_clust_cnt), "), numOf_1_sample_bins(",
                  str(numOf_1_sample_bins), ")")
        return np_asarray(self.predictedKlusters,
                          dtype=int), self.kluster_centroids
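
get_cluster_centroids is not shown on this page. A plausible minimal stand-in, assuming it averages the samples assigned to each label and simply passes through the model's own centers when those are available, could look like this:

from numpy import asarray as np_asarray, unique as np_unique, zeros as np_zeros

def get_cluster_centroids(X, labels, kluster_centers=None, verbose=0):
    # pass through precomputed centers (e.g. from KMeans) when available
    if kluster_centers is not None:
        return kluster_centers
    X = np_asarray(X, dtype=float)
    labels = np_asarray(labels, dtype=int)
    uniq = np_unique(labels)
    centroids = np_zeros((len(uniq), X.shape[1]))
    for i, k in enumerate(uniq):
        # centroid of cluster k = mean of the samples labelled k
        centroids[i] = X[labels == k].mean(axis=0)
    if verbose > 0:
        print("computed", len(uniq), "centroids")
    return centroids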
Example #3
    def fit(self,
            X,
            post_analyze_distribution=False,
            verbose=1,
            random_state=0):
        df = pd_df(np_array(X))

        curTol = 0.0001 if self.cluster_model == 'KMeans' else 0.01
        max_iter = 300 if self.cluster_model == 'KMeans' else 200

        numOf_1_sample_bins = 1
        unique_clust_cnt = 1
        expCnt = 0
        while (unique_clust_cnt == 1 or
               numOf_1_sample_bins - expCnt > 0) and expCnt < self.max_try_cnt:
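            # retry loop: re-runs while everything lands in one cluster or too
            # many single-sample clusters remain, loosening the tolerance and
            # raising max_iter on each attempt, up to max_try_cnt attempts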
            t = time()
            if expCnt > 0:
                if numOf_1_sample_bins > 0:
                    print("re-running ", self.cluster_model, " (attempt ",
                          str(expCnt), ") due to numOf_1_sample_bins(",
                          str(numOf_1_sample_bins), ")")
                if unique_clust_cnt == 1:
                    print("re-running ", self.cluster_model, " (attempt ",
                          str(expCnt), ") due to unique_clust_cnt==1")
            if verbose > 0:
                print('Clustering the featVec(', X.shape, ') with n_clusters(',
                      str(self.n_clusters),
                      ') and model = ', self.cluster_model, ", curTol(",
                      str(curTol), "), max_iter(", str(max_iter), "), at ",
                      datetime.now().strftime("%H:%M:%S"))
            self.kluster_centers = None
            self.predictedKlusters = None

            if self.cluster_model == 'KMeans':
                # default vals for kmeans --> max_iter=300, 1e-4
                self.trained_model = KMeans(init='k-means++',
                                            n_clusters=self.n_clusters,
                                            n_init=20,
                                            tol=curTol,
                                            max_iter=max_iter,
                                            random_state=random_state).fit(df)
                self.predictedKlusters = self.trained_model.labels_.astype(
                    float)
                self.kluster_centers = self.trained_model.cluster_centers_.astype(
                    float)
            elif self.cluster_model in ('GMM_full', 'GMM_diag'):
                # default vals for gmm --> max_iter=100, 1e-3
                cov_type = 'full' if self.cluster_model == 'GMM_full' else 'diag'
                self.trained_model = GaussianMixture(
                    n_components=self.n_clusters,
                    covariance_type=cov_type,
                    tol=curTol,
                    random_state=random_state,
                    max_iter=max_iter,
                    reg_covar=1e-4).fit(df)
                # _e_step is private sklearn API; evaluate it on the same frame
                # the model was fit on (the original passed the raw X here)
                _, log_resp = self.trained_model._e_step(df)
                self.predictedKlusters = log_resp.argmax(axis=1)
            elif self.cluster_model == 'Spectral':
                sc = SpectralClustering(n_clusters=self.n_clusters,
                                        affinity=self.spectral_affinity,
                                        random_state=random_state)
                self.trained_model = sc.fit(X)
                self.predictedKlusters = self.trained_model.labels_

            self.kluster_centroids = get_cluster_centroids(
                X,
                self.predictedKlusters,
                kluster_centers=self.kluster_centers,
                verbose=0)

            if post_analyze_distribution:
                numOf_1_sample_bins, histSortedInv = analyzeClusterDistribution(
                    self.predictedKlusters, self.n_clusters, verbose=verbose)
                unique_clust_cnt = len(np_unique(self.predictedKlusters))
                curTol = curTol * 10
                max_iter = max_iter + 50
                expCnt = expCnt + 1
            else:
                expCnt = self.max_try_cnt

            elapsed = time() - t
            if verbose > 0:
                print('Clustering done in (', getElapsedTimeFormatted(elapsed),
                      '), ended at ',
                      datetime.now().strftime("%H:%M:%S"))
        removeLastLine()
        if verbose > 0:
            print('Clustering completed with (',
                  np_unique(self.predictedKlusters).shape,
                  ') clusters,  expCnt(', str(expCnt), ')')
        # elif 'OPTICS' in clusterModel:
        #     N = featVec.shape[0]
        #     min_cluster_size = int(np.ceil(N / (n_clusters * 4)))
        #     pars = clusterModel.split('_')  # 'OPTICS_hamming_dbscan', 'OPTICS_russellrao_xi'
        #     #  metricsAvail = np.sort(['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
        #     #                'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
        #     #                'sokalsneath', 'sqeuclidean', 'yule',
        #     #                'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'])
        #     #  cluster_methods_avail = ['xi', 'dbscan']
        #     clust = ClusterOPT(min_samples=50, xi=.05, min_cluster_size=min_cluster_size, metric=pars[1], cluster_method=pars[2])
        #     clust.fit(featVec)
        #     predictedKlusters = cluster_optics_dbscan(reachability=clust.reachability_,
        #                                                core_distances=clust.core_distances_,
        #                                                ordering=clust.ordering_, eps=0.5)
        #     n1 = np.unique(predictedKlusters)
        #     print(clusterModel, ' found ', str(n1), ' uniq clusters')
        #     predictedKlusters = predictedKlusters + 1

        return self
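
analyzeClusterDistribution is also not shown on this page. A minimal sketch of what fit appears to rely on, assuming it returns the number of clusters holding exactly one sample plus the cluster-size histogram sorted largest first:

from numpy import sort as np_sort, unique as np_unique

def analyzeClusterDistribution(labels, n_clusters, verbose=0):
    _, counts = np_unique(labels, return_counts=True)
    histSortedInv = np_sort(counts)[::-1]            # largest cluster first
    numOf_1_sample_bins = int((counts == 1).sum())   # clusters with one sample
    if verbose > 0:
        print("cluster sizes:", histSortedInv.tolist())
    return numOf_1_sample_bins, histSortedInv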
Example #4
def extrude_edges(vertices, edges, faces, edge_mask, face_data, matrices):
    if not matrices:
        matrices = [Matrix()]
    if face_data:
        face_data_matched = repeat_last_for_length(face_data, len(faces))
    if edge_mask:
        edge_mask_matched = repeat_last_for_length(edge_mask, len(edges))

    if isinstance(edges, np_ndarray):
        if edge_mask:
            np_edges = edges[edge_mask_matched]
        else:
            np_edges = edges
    else:
        if edge_mask:
            np_edges = np_array(edges)[edge_mask_matched]
        else:
            np_edges = np_array(edges)
    if isinstance(vertices, np_ndarray):
        np_verts = vertices
    else:
        np_verts = np_array(vertices)

    affected_verts_idx = np_unique(np_edges)
    if len(matrices) == 1:
        # a single matrix: transform all affected vertices at once with numpy
        extruded_verts = matrix_apply_np(np_verts[affected_verts_idx],
                                         matrices[0])
        new_vertices = np_concatenate([np_verts, extruded_verts]).tolist()
    else:
        # several matrices: cycle them over the affected vertices one by one
        extruded_verts = [
            m @ Vector(v) for v, m in zip(np_verts[affected_verts_idx].tolist(),
                                          cycle(matrices))
        ]
        new_vertices = vertices + extruded_verts

    # top_edges duplicate the selected edges in the extruded copy; mid_edges
    # connect each original vertex to its extruded counterpart
    top_edges = np_edges + len(vertices)
    mid_edges = np_zeros((len(affected_verts_idx), 2), dtype=int)
    mid_edges[:, 0] = affected_verts_idx
    mid_edges[:, 1] = affected_verts_idx + len(vertices)
    extruded_edges_py = (np_concatenate([top_edges, mid_edges])).tolist()
    extruded_faces = np_zeros((len(np_edges), 4), dtype=int)
    extruded_faces[:, :2] = np_edges
    extruded_faces[:, 2] = top_edges[:, 1]
    extruded_faces[:, 3] = top_edges[:, 0]
    extruded_faces_py = extruded_faces.tolist()
    if isinstance(edges, np_ndarray):
        new_edges = np_concatenate([edges, top_edges, mid_edges]).tolist()
    else:
        new_edges = edges + extruded_edges_py

    if faces is not None and len(faces) > 0 and len(faces[0]) > 0:
        if isinstance(faces, np_ndarray):
            new_faces = np_concatenate([faces, extruded_faces]).tolist()
        else:
            new_faces = faces + extruded_faces_py
    else:
        new_faces = extruded_faces_py

    if face_data:
        bvh = bvh_tree_from_polygons(vertices,
                                     faces,
                                     all_triangles=False,
                                     epsilon=0.0,
                                     safe_check=True)
        mid_points = (np_verts[np_edges[:, 1]] + np_verts[np_edges[:, 0]]) / 2
        face_idx = [bvh.find_nearest(P)[2] for P in mid_points.tolist()]
        new_face_data = face_data_matched + [
            face_data_matched[p] for p in face_idx
        ]
    else:
        new_face_data = []

    return (new_vertices, new_edges, new_faces, extruded_verts,
            extruded_edges_py, extruded_faces_py, new_face_data)
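
A hedged usage sketch, assuming the Blender/Sverchok environment this function comes from (mathutils.Matrix plus the matrix_apply_np and related helpers) is importable: extruding two chained edges one unit along Z yields the swept side quads.

from mathutils import Matrix

verts = [(0, 0, 0), (1, 0, 0), (1, 1, 0)]
edges = [(0, 1), (1, 2)]
new_v, new_e, new_f, *_ = extrude_edges(verts, edges, [], [], [],
                                        [Matrix.Translation((0, 0, 1))])
print(len(new_v), len(new_e), len(new_f))  # 6 vertices, 7 edges, 2 quads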
Example #5
                sub_counts += 1
        h_s = hash(s[i + 1:i + 1 + sub_len])
    hash_results += sub_counts

print(s)
print('Substrings count by naive search: {}'.format(naive_results))
print('Substrings count by Rabin-Karp with hash(): {}'.format(hash_results))

# 2. Encode any string of three words using the Huffman algorithm.

from random import seed, choice
from numpy import unique as np_unique
from pandas import DataFrame as pd_DataFrame, concat as pd_concat

seed(42)

message = input('Enter any string: ')
message_list = list(message)

message_symb, message_freq = np_unique(message_list, return_counts=True)

df = pd_DataFrame({'s': message_symb, 'f': message_freq})
message_dict = dict(zip(message_symb, ['' for _ in range(len(message_symb))]))

while df.shape[0] >= 2:
    df.sort_values(by=['f'], inplace=True)  # by=['f', 's'], ascending=True
    # randomly pick which of the two rarest nodes receives the 0 bit
    i0, i1 = choice([[1, 0], [0, 1]])
    for s in message_dict:
        if s in df.iloc[i0].s:
            message_dict[s] = '0' + message_dict[s]
        if s in df.iloc[i1].s:
            message_dict[s] = '1' + message_dict[s]
    # merge the two rarest nodes; DataFrame.append was removed in pandas 2.0,
    # so build the merged row explicitly and concat it instead
    merged = df.iloc[0:2].sum()
    df = pd_concat([df, merged.to_frame().T], ignore_index=True)
    df = df.iloc[2:]
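
The snippet stops once the merge loop finishes; a natural continuation (not in the source) would emit the finished code table and the encoded message:

encoded = ''.join(message_dict[c] for c in message_list)
print('Huffman codes:', message_dict)
print('Encoded message:', encoded)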