from numpy import unique as np_unique


def anonymize(df: "pandas.DataFrame") -> "pandas.DataFrame":
    """Remove private data from a dataframe.

    Any column containing at least one piece of private data is removed
    from the dataframe. This is a naive solution but it limits the
    possibility of false negatives.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to anonymize

    Returns
    -------
    pd.DataFrame
        The dataframe with columns containing private data removed
    """
    private_cols = []
    checks = [check_addresses, check_emails, check_phonenumbers]
    for check in checks:
        new_private_cols = check(df)
        private_cols += new_private_cols
    # Keep each flagged column name only once
    private_cols = np_unique(private_cols).tolist()
    # Drop the flagged columns
    return df.drop(private_cols, axis=1)
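# Hedged usage sketch for anonymize(): the check_* helpers are not defined in
# this snippet, so minimal stand-ins are assumed here. check_emails flags a
# column when any cell matches a naive e-mail pattern; the other two checks are
# stubs. Column names and sample data are made up for illustration.
import re
import pandas as pd


def check_emails(frame):
    # Assumed contract: return the list of column names containing private data.
    pattern = re.compile(r"[^@\s]+@[^@\s]+\.[^@\s]+")
    return [col for col in frame.columns
            if frame[col].astype(str).str.contains(pattern).any()]


def check_addresses(frame):
    return []  # stub for the sketch


def check_phonenumbers(frame):
    return []  # stub for the sketch


sample = pd.DataFrame({"name": ["Ann", "Bob"],
                       "contact": ["ann@example.com", "bob@example.com"],
                       "score": [0.7, 0.9]})
print(anonymize(sample).columns.tolist())  # expected: ['name', 'score']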
def predict(self, X, post_analyze_distribution=False, verbose=1):
    df = pd_df(X)
    print("started prediction for ", self.cluster_model, " X(", X.shape, ")")
    if self.cluster_model == 'KMeans':
        # default vals for kmeans --> max_iter=300, tol=1e-4
        self.predictedKlusters = self.trained_model.predict(df).astype(float)
        self.kluster_centers = self.trained_model.cluster_centers_.astype(float)
    elif self.cluster_model == 'GMM_full':
        # default vals for gmm --> max_iter=100, tol=1e-3
        _, log_resp = self.trained_model._e_step(df)
        self.predictedKlusters = log_resp.argmax(axis=1)
    elif self.cluster_model == 'GMM_diag':
        _, log_resp = self.trained_model._e_step(df)
        self.predictedKlusters = log_resp.argmax(axis=1)
    elif self.cluster_model == 'Spectral':
        # SpectralClustering has no out-of-sample predict(); re-cluster the data
        self.predictedKlusters = self.trained_model.fit_predict(X)
    self.kluster_centroids = get_cluster_centroids(
        X, self.predictedKlusters,
        kluster_centers=self.kluster_centers, verbose=0)
    if post_analyze_distribution:
        numOf_1_sample_bins, histSortedInv = analyzeClusterDistribution(
            self.predictedKlusters, self.n_clusters, verbose=1)
        unique_clust_cnt = len(np_unique(self.predictedKlusters))
        print("prediction completed for ", self.cluster_model,
              " - unique_clust_cnt(", str(unique_clust_cnt),
              "), numOf_1_sample_bins(", str(numOf_1_sample_bins), ")")
    return np_asarray(self.predictedKlusters, dtype=int), self.kluster_centroids
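# get_cluster_centroids() is referenced above but not defined in this snippet.
# A minimal sketch of what it is assumed to do: reuse the model's own centers
# when they are available, otherwise average the samples of each predicted
# cluster. The signature mirrors the calls above; the behaviour is an assumption.
import numpy as np


def get_cluster_centroids(X, predictedKlusters, kluster_centers=None, verbose=0):
    if kluster_centers is not None:
        return np.asarray(kluster_centers)
    X = np.asarray(X)
    labels = np.asarray(predictedKlusters).astype(int)
    # One centroid per cluster id, in sorted label order
    return np.stack([X[labels == k].mean(axis=0) for k in np.unique(labels)])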
def fit(self, X, post_analyze_distribution=False, verbose=1, random_state=0):
    df = pd_df(np_array(X))
    curTol = 0.0001 if self.cluster_model == 'KMeans' else 0.01
    max_iter = 300 if self.cluster_model == 'KMeans' else 200
    numOf_1_sample_bins = 1
    unique_clust_cnt = 1
    expCnt = 0
    while (unique_clust_cnt == 1 or numOf_1_sample_bins - expCnt > 0) and expCnt < self.max_try_cnt:
        t = time()
        if expCnt > 0:
            if numOf_1_sample_bins > 0:
                print("running ", self.cluster_model, " for the ", str(expCnt),
                      " time due to numOf_1_sample_bins(", str(numOf_1_sample_bins), ")")
            if unique_clust_cnt == 1:
                print("running ", self.cluster_model, " for the ", str(expCnt),
                      " time due to unique_clust_cnt==1")
        if verbose > 0:
            print('Clustering the featVec(', X.shape, ') with n_clusters(',
                  str(self.n_clusters), ') and model = ', self.cluster_model,
                  ", curTol(", str(curTol), "), max_iter(", str(max_iter),
                  "), at ", datetime.now().strftime("%H:%M:%S"))
        self.kluster_centers = None
        self.predictedKlusters = None
        if self.cluster_model == 'KMeans':
            # default vals for kmeans --> max_iter=300, tol=1e-4
            self.trained_model = KMeans(init='k-means++', n_clusters=self.n_clusters,
                                        n_init=20, tol=curTol, max_iter=max_iter,
                                        random_state=random_state).fit(df)
            self.predictedKlusters = self.trained_model.labels_.astype(float)
            self.kluster_centers = self.trained_model.cluster_centers_.astype(float)
        elif self.cluster_model == 'GMM_full':
            # default vals for gmm --> max_iter=100, tol=1e-3
            self.trained_model = GaussianMixture(n_components=self.n_clusters,
                                                 covariance_type='full', tol=curTol,
                                                 random_state=random_state,
                                                 max_iter=max_iter, reg_covar=1e-4).fit(df)
            _, log_resp = self.trained_model._e_step(X)
            self.predictedKlusters = log_resp.argmax(axis=1)
        elif self.cluster_model == 'GMM_diag':
            self.trained_model = GaussianMixture(n_components=self.n_clusters,
                                                 covariance_type='diag', tol=curTol,
                                                 random_state=random_state,
                                                 max_iter=max_iter, reg_covar=1e-4).fit(df)
            _, log_resp = self.trained_model._e_step(X)
            self.predictedKlusters = log_resp.argmax(axis=1)
        elif self.cluster_model == 'Spectral':
            sc = SpectralClustering(n_clusters=self.n_clusters,
                                    affinity=self.spectral_affinity,
                                    random_state=random_state)
            self.trained_model = sc.fit(X)
            self.predictedKlusters = self.trained_model.labels_
        self.kluster_centroids = get_cluster_centroids(
            X, self.predictedKlusters,
            kluster_centers=self.kluster_centers, verbose=0)
        if post_analyze_distribution:
            numOf_1_sample_bins, histSortedInv = analyzeClusterDistribution(
                self.predictedKlusters, self.n_clusters, verbose=verbose)
            unique_clust_cnt = len(np_unique(self.predictedKlusters))
            curTol = curTol * 10
            max_iter = max_iter + 50
            expCnt = expCnt + 1
        else:
            expCnt = self.max_try_cnt
        elapsed = time() - t
        if verbose > 0:
            print('Clustering done in (', getElapsedTimeFormatted(elapsed),
                  '), ended at ', datetime.now().strftime("%H:%M:%S"))
            removeLastLine()
    if verbose > 0:
        print('Clustering completed with (', np_unique(self.predictedKlusters).shape,
              ') clusters, expCnt(', str(expCnt), ')')
    # elif 'OPTICS' in clusterModel:
    #     N = featVec.shape[0]
    #     min_cluster_size = int(np.ceil(N / (n_clusters * 4)))
    #     pars = clusterModel.split('_')  # 'OPTICS_hamming_dbscan', 'OPTICS_russellrao_xi'
    #     # metricsAvail = np.sort(['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice',
    #     #                         'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski',
    #     #                         'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
    #     #                         'sokalsneath', 'sqeuclidean', 'yule',
    #     #                         'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'])
    #     # cluster_methods_avail = ['xi', 'dbscan']
    #     clust = ClusterOPT(min_samples=50, xi=.05, min_cluster_size=min_cluster_size,
    #                        metric=pars[1], cluster_method=pars[2])
    #     clust.fit(featVec)
    #     predictedKlusters = cluster_optics_dbscan(reachability=clust.reachability_,
    #                                               core_distances=clust.core_distances_,
    #                                               ordering=clust.ordering_, eps=0.5)
    #     n1 = np.unique(predictedKlusters)
    #     print(clusterModel, ' found ', str(n1), ' uniq clusters')
    #     predictedKlusters = predictedKlusters + 1
    return self
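# Hedged end-to-end sketch of the fit()/predict() pair above. ClusterWrapper is
# an assumed name for the class these methods belong to, and the constructor
# arguments are assumptions; the methods themselves only need cluster_model,
# n_clusters, max_try_cnt (and spectral_affinity for 'Spectral') to be set.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=4, random_state=0)

cw = ClusterWrapper(cluster_model='KMeans', n_clusters=4, max_try_cnt=5)
cw.fit(X, post_analyze_distribution=False, verbose=0)
labels, centroids = cw.predict(X, post_analyze_distribution=False)
print(np.unique(labels), np.asarray(centroids).shape)  # 4 cluster ids, (4, 2) centers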
def extrude_edges(vertices, edges, faces, edge_mask, face_data, matrices):
    if not matrices:
        matrices = [Matrix()]
    if face_data:
        face_data_matched = repeat_last_for_length(face_data, len(faces))
    if edge_mask:
        edge_mask_matched = repeat_last_for_length(edge_mask, len(edges))

    if isinstance(edges, np_ndarray):
        if edge_mask:
            np_edges = edges[edge_mask_matched]
        else:
            np_edges = edges
    else:
        if edge_mask:
            np_edges = np_array(edges)[edge_mask_matched]
        else:
            np_edges = np_array(edges)
    if isinstance(vertices, np_ndarray):
        np_verts = vertices
    else:
        np_verts = np_array(vertices)

    affected_verts_idx = np_unique(np_edges)
    if len(matrices) == 1:
        extruded_verts = matrix_apply_np(np_verts[affected_verts_idx], matrices[0])
        new_vertices = np_concatenate([np_verts, extruded_verts]).tolist()
    else:
        extruded_verts = [m @ Vector(v) for v, m in
                          zip(np_verts[affected_verts_idx].tolist(), cycle(matrices))]
        new_vertices = vertices + extruded_verts

    top_edges = np_edges + len(vertices)
    mid_edges = np_zeros((len(affected_verts_idx), 2), dtype=int)
    mid_edges[:, 0] = affected_verts_idx
    mid_edges[:, 1] = affected_verts_idx + len(vertices)
    extruded_edges_py = (np_concatenate([top_edges, mid_edges])).tolist()

    extruded_faces = np_zeros((len(np_edges), 4), dtype=int)
    extruded_faces[:, :2] = np_edges
    extruded_faces[:, 2] = top_edges[:, 1]
    extruded_faces[:, 3] = top_edges[:, 0]
    extruded_faces_py = extruded_faces.tolist()

    if isinstance(edges, np_ndarray):
        new_edges = np_concatenate([edges, top_edges, mid_edges]).tolist()
    else:
        new_edges = edges + extruded_edges_py

    if faces and faces[0]:
        if isinstance(faces, np_ndarray):
            new_faces = np_concatenate([faces, extruded_faces]).tolist()
        else:
            new_faces = faces + extruded_faces_py
    else:
        new_faces = extruded_faces_py

    if face_data:
        bvh = bvh_tree_from_polygons(vertices, faces, all_triangles=False,
                                     epsilon=0.0, safe_check=True)
        mid_points = (np_verts[np_edges[:, 1]] + np_verts[np_edges[:, 0]]) / 2
        face_idx = [bvh.find_nearest(P)[2] for P in mid_points.tolist()]
        new_face_data = face_data_matched + [face_data_matched[p] for p in face_idx]
    else:
        new_face_data = []

    return (new_vertices, new_edges, new_faces,
            extruded_verts, extruded_edges_py, extruded_faces_py, new_face_data)
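# Hedged usage sketch for extrude_edges(): it assumes Blender's mathutils is
# importable (the function itself relies on Matrix/Vector) and that the helpers
# it calls (matrix_apply_np, repeat_last_for_length, bvh_tree_from_polygons) are
# available from the surrounding module. A unit square's edges are extruded one
# unit along +Z; edge_mask and face_data are left empty so those branches are
# skipped.
from mathutils import Matrix

verts = [(0, 0, 0), (1, 0, 0), (1, 1, 0), (0, 1, 0)]
edges = [(0, 1), (1, 2), (2, 3), (3, 0)]
faces = [[0, 1, 2, 3]]

lift = Matrix.Translation((0.0, 0.0, 1.0))
new_v, new_e, new_f, ext_v, ext_e, ext_f, _ = extrude_edges(
    verts, edges, faces, edge_mask=[], face_data=[], matrices=[lift])
print(len(new_v), len(new_e), len(new_f))  # 8 vertices, 12 edges, 5 faces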
            sub_counts += 1
        h_s = hash(s[i + 1:i + 1 + sub_len])
    hash_results += sub_counts

print(s)
print('Substrings count by naive search: {}'.format(naive_results))
print('Substrings count by Rabin-Karp with hash(): {}'.format(hash_results))

# 2. Encode any string of three words using the Huffman algorithm.
seed(42)
message = input('Enter any string: ')
message_list = list(message)
message_symb, message_freq = np_unique(message_list, return_counts=True)
df = pd_DataFrame({'s': message_symb, 'f': message_freq})
message_dict = dict(zip(message_symb, ['' for _ in range(len(message_symb))]))
while df.shape[0] >= 2:
    df.sort_values(by=['f'], inplace=True)  # by=['f', 's'], ascending=True
    i0, i1 = choice([[1, 0], [0, 1]])
    for s in message_dict:
        if s in df.iloc[i0].s:
            message_dict[s] = '0' + message_dict[s]
        if s in df.iloc[i1].s:
            message_dict[s] = '1' + message_dict[s]
    # Merge the two lightest nodes; DataFrame.append was removed in pandas 2.0,
    # so this code assumes pandas < 2.0
    df = df.append(df.iloc[0:2].sum(), ignore_index=True)
    df = df.iloc[2:]
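# Small follow-up sketch: once the while-loop above finishes, message_dict maps
# every symbol of the message to its Huffman code, so encoding is just the
# concatenation of per-symbol codes. Variable names reuse the ones above.
encoded = ''.join(message_dict[ch] for ch in message)
print('Huffman codes:', message_dict)
print('Encoded message ({} bits): {}'.format(len(encoded), encoded))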