def agglo(data):
    # Reduces `data` to 32 agglomerated features and stores the result in the
    # module-level name `agglo` (note: the global rebinds the same name as this
    # function, so the function is shadowed after the first call).
    global agglo
    import pandas as pd
    from sklearn import cluster
    agglo = cluster.FeatureAgglomeration(n_clusters=32)
    agglo.fit(data)
    agglo = agglo.transform(data)
    agglo = pd.DataFrame(data=agglo)
def FeatureAgglomeration(array, percent_samples):
    # Benchmarks FeatureAgglomeration on a growing number of target clusters,
    # using `percent_samples` of the training rows. Assumes `pct_features_list`,
    # `time` and `cluster` are available at module level.
    print("Feature Agglomeration", percent_samples * 100, "% of training data.")
    print("Features\tTime")
    array = array[:int(percent_samples * len(array))]
    for pct in pct_features_list:
        num_features = int(pct * len(array[0]))
        start = time()
        Y = cluster.FeatureAgglomeration(
            n_clusters=num_features).fit_transform(array)
        end = time()
        print(num_features, "\t", (end - start))
def agglomerate(dataset, features_number, clusters_number):
    app_logger.info(
        'STARTED [Feature Agglomeration] on {0} with features number = {1}'.
        format(dataset, features_number), extra=LOGGER_EXTRA_OBJECT)

    # Retrieve all features extracted by tsfresh from the pickles on disk
    current_dir = os.getcwd().split('\\')[-1]
    projet_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    if current_dir == projet_dir:
        all_features_train = pd.read_pickle(
            'Pickle/AllFeatures/Train/{0}.pkl'.format(dataset))
        all_features_test = pd.read_pickle(
            'Pickle/AllFeatures/Test/{0}.pkl'.format(dataset))
    else:
        all_features_train = pd.read_pickle(
            '../Pickle/AllFeatures/Train/{0}.pkl'.format(dataset))
        all_features_test = pd.read_pickle(
            '../Pickle/AllFeatures/Test/{0}.pkl'.format(dataset))

    app_logger.info(
        'All features (including target column) trainset shape: {0}'.format(
            all_features_train.shape), extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'All features (including target column) testset shape: {0}'.format(
            all_features_test.shape), extra=LOGGER_EXTRA_OBJECT)

    # Retrieve the independent columns of both sets and the known labels of the test set
    indipendent_columns_train = all_features_train.iloc[:, 1:]
    indipendent_columns_test = all_features_test.iloc[:, 1:]
    known_labels_test = all_features_test.iloc[:, 0]

    agglomeration = cluster.FeatureAgglomeration(n_clusters=features_number)
    agglomeration.fit(indipendent_columns_train)
    reduced_train = agglomeration.transform(indipendent_columns_train)
    reduced_test = agglomeration.transform(indipendent_columns_test)

    app_logger.info('Reduced train set: {0}'.format(reduced_train),
                    extra=LOGGER_EXTRA_OBJECT)
    app_logger.info('Reduced test set: {0}'.format(reduced_test),
                    extra=LOGGER_EXTRA_OBJECT)

    # Run k-means on the selected features
    test_feature_selection.testFeatureSelectionWithRepeatedKMeans(
        'AGGLOMERATION', features_number, dataset, reduced_train, reduced_test,
        clusters_number, known_labels_test)

    app_logger.info('ENDED [Feature Agglomeration] on {0}'.format(dataset),
                    extra=LOGGER_EXTRA_OBJECT)
def varclus_agglo(df_x):
    # Groups the columns of df_x into sqrt(n_features) clusters and returns a
    # DataFrame mapping each feature name to its cluster label.
    import numpy as np
    import pandas as pd
    from sklearn import cluster
    obs = len(df_x.columns)
    agglo = cluster.FeatureAgglomeration(n_clusters=int(np.sqrt(obs)))
    agglo.fit(df_x)
    varclus_agglo = pd.DataFrame(df_x.columns, columns=["feature_name"])
    varclus_agglo["cluster"] = agglo.labels_
    return varclus_agglo.sort_values(by=["cluster"],
                                     ascending=True).reset_index(drop=True)
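# Hedged usage sketch (added; not part of the source): builds a small random
# DataFrame and groups its columns with varclus_agglo. The column names and
# sizes are illustrative assumptions only.
if __name__ == "__main__":
    import numpy as np
    import pandas as pd
    rng = np.random.default_rng(0)
    toy_df = pd.DataFrame(rng.normal(size=(100, 9)),
                          columns=[f"x{i}" for i in range(9)])
    # Expected output: one row per feature with its assigned cluster id,
    # using n_clusters = int(sqrt(9)) = 3 as computed inside varclus_agglo.
    print(varclus_agglo(toy_df))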
def feature_agg(df, drop=None, components=4):
    # Agglomerates the columns of df into `components` features, optionally
    # keeping the columns listed in `drop` out of the reduction.
    if drop:
        keep = df[drop]
        df = df.drop(drop, axis=1)
    components = min(df.shape[1] - 1, components)
    agglo = cluster.FeatureAgglomeration(n_clusters=components)
    agglo.fit(df)
    df = pd.DataFrame(agglo.transform(df), index=df.index)
    df = df.add_prefix('feagg_')
    if drop:
        return pd.concat((keep, df), axis=1)
    else:
        return df
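# Hedged usage sketch (added; not from the source): calls feature_agg on a toy
# frame, keeping a hypothetical 'target' column out of the agglomeration via
# `drop`. feature_agg itself relies on module-level `pd` and `cluster` imports,
# which are assumed to exist in its file.
if __name__ == "__main__":
    import numpy as np
    import pandas as pd
    toy = pd.DataFrame(np.random.rand(50, 6),
                       columns=['f1', 'f2', 'f3', 'f4', 'f5', 'target'])
    out = feature_agg(toy, drop=['target'], components=3)
    print(out.columns.tolist())  # expect ['target', 'feagg_0', 'feagg_1', 'feagg_2']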
def feature_agglomeration(data):
    feature_agglomeration_program = cluster.FeatureAgglomeration(
        n_clusters=50, memory="cache/")
    # fit must run before labels_/children_ are available or transform is called
    feature_agglomeration_program.fit(data)
    print(feature_agglomeration_program.labels_)
    print(feature_agglomeration_program.children_)
    reduced_model = feature_agglomeration_program.transform(data)
    np.save("feature_agglo_model", reduced_model)

    # Map each original feature index to the cluster it was merged into
    feature_groups = collections.defaultdict(list)
    for index, value in enumerate(feature_agglomeration_program.labels_):
        feature_groups[int(value)].append(index)
    with open('feature_agglo_feature_clusters.json', 'w') as fp:
        json.dump(dict(feature_groups), fp)
    for key in feature_groups.keys():
        print("{}: {}".format(key, len(feature_groups[key])))
def components(K):
    # Sweep the number of agglomerated features from 1 to K-1 and plot the
    # train/test accuracy of an MLP classifier on the reduced data.
    Sum_of_squared_distances = []
    k = []
    accuracy_train = []
    accuracy_test = []
    score = []
    for i in range(1, K):
        print(i)
        agglo = cluster.FeatureAgglomeration(n_clusters=i,
                                             affinity="precomputed",
                                             linkage='complete')
        #X_new_train,y_new_train=transformer.fit(X_train,y_train)
        #X_new_test,y_new_test = transformer.transform(X_test,y_test)
        agglo.fit(X)
        X_reduced = agglo.transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X_reduced, y,
                                                            test_size=0.20)
        km = MLPClassifier(solver='lbfgs', alpha=1e-5,
                           hidden_layer_sizes=[7, 7, 7, 7, 7, 7, 7],
                           random_state=1)
        km.fit(X_train, y_train)
        #transformer1 = GaussianRandomProjection(n_components=i,eps=0.5)
        #transformer2 = GaussianRandomProjection(n_components=i,eps=0.6)
        label_train = km.predict(X_train)
        label_test = km.predict(X_test)
        # score each split against its own labels (the original re-fit the model
        # on the test data and swapped the train/test sets in the score calls)
        accu_train = km.score(X_train, y_train)
        accu_test = km.score(X_test, y_test)
        #score_train1=metrics.silhouette_score(X_new,label, metric='euclidean')
        #Sum_of_squared_distances.append(km.inertia_)
        k.append(i)
        accuracy_train.append(accu_train)
        accuracy_test.append(accu_test)
        #score.append(score_train1)
        #print(accuracy)
    k = np.array(k)
    Sum_of_squared_distances = np.array(Sum_of_squared_distances)
    score = np.array(score)
    accuracy_train = np.array(accuracy_train)
    accuracy_test = np.asarray(accuracy_test)
    #line1,=plt.plot(k, Sum_of_squared_distances, 'bx-',marker='o')
    #line2,=plt.plot(k,score,color='g',marker='o')
    line3, = plt.plot(k, accuracy_train, color='r', marker='o',
                      label='train_accuracy')
    line4, = plt.plot(k, accuracy_test, color='g', marker='o',
                      label='test_accuracy')
    #plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.xlabel('k')
    plt.legend()
    plt.ylabel('accuracy')
    #plt.ylim(0,1)
    plt.show()
    return None
def main():
    digits = datasets.load_digits()
    images = digits.images
    X = np.reshape(images, (len(images), -1))
    connectivity = image.grid_to_graph(*images[0].shape)

    agglo = cluster.FeatureAgglomeration(connectivity=connectivity,
                                         n_clusters=32)
    agglo.fit(X)
    X_reduced = agglo.transform(X)
    X_restored = agglo.inverse_transform(X_reduced)
    images_restored = np.reshape(X_restored, images.shape)

    plt.figure(1, figsize=(4, 3.5))
    plt.clf()
    plt.subplots_adjust(left=.01, right=.99, bottom=.01, top=.91)
    for i in range(4):
        plt.subplot(3, 4, i + 1)
        plt.imshow(images[i], cmap=plt.cm.gray, vmax=16,
                   interpolation='nearest')
        plt.xticks(())  # pass an empty tuple to hide ticks (a bare call is a no-op)
        plt.yticks(())
        if i == 1:
            plt.title('Original data')
        plt.subplot(3, 4, 4 + i + 1)
        plt.imshow(images_restored[i], cmap=plt.cm.gray, vmax=16,
                   interpolation='nearest')
        if i == 1:
            plt.title("Agglomerated data")
        plt.xticks(())
        plt.yticks(())

    plt.subplot(3, 4, 10)
    plt.imshow(np.reshape(agglo.labels_, images[0].shape),
               interpolation='nearest',
               cmap=plt.cm.nipy_spectral)  # plt.cm.spectral was removed in matplotlib 3
    plt.xticks(())
    plt.yticks(())
    plt.title('Labels')
    plt.show()
def train_drfs(train_x, train_y, eps=0.5, threshold="median"):
    n_samples, n_features, n_classes = \
        get_counts_tt(train_x, train_y)

    # pick number of components
    min_comp = random_projection.johnson_lindenstrauss_min_dim(
        n_samples=n_samples, eps=eps)
    min_comp = min(min_comp, n_features)

    # scale and agglomerate to min_comp
    #scaler = preprocessing.StandardScaler()
    scaler = preprocessing.QuantileTransformer()
    feat_agg = cluster.FeatureAgglomeration(n_clusters=min_comp)
    xtc = ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
    scaler2 = preprocessing.RobustScaler()
    #poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True)

    # train the model pipeline
    dr_pipe = pipeline.Pipeline([('scaler', scaler),
                                 ('feat_agg', feat_agg),
                                 ('scaler2', scaler2)])
    dr_pipe.fit(train_x)

    # transform train_x to train xtc
    train_x = dr_pipe.transform(train_x)

    # train the xtc
    xtc.fit(train_x, train_y)
    print("Feature importances:")
    print("\tMax:", max(xtc.feature_importances_))
    print("\tMin:", min(xtc.feature_importances_))
    #print(xtc.feature_importances_)

    # create the feature selection model from the xtc
    feat_sel = feature_selection.SelectFromModel(xtc, prefit=True,
                                                 threshold=threshold)

    # create the pipeline to reduce dim then feature select
    drfs_pipe = pipeline.Pipeline([('dr_pipe', dr_pipe),
                                   ('feat_sel', feat_sel)])

    return drfs_pipe
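# Hedged usage sketch (added; not from the source): assumes the module-level
# imports used by train_drfs (random_projection, preprocessing, cluster,
# ensemble, pipeline, feature_selection) and its helper get_counts_tt are
# available and behave as the unpacking above suggests. Fits the
# reduction + selection pipeline on synthetic data, then applies it to a
# held-out split.
if __name__ == "__main__":
    from sklearn import datasets, model_selection
    X_toy, y_toy = datasets.make_classification(
        n_samples=300, n_features=40, n_informative=10, random_state=0)
    X_tr, X_te, y_tr, y_te = model_selection.train_test_split(
        X_toy, y_toy, random_state=0)
    drfs_pipe = train_drfs(X_tr, y_tr, eps=0.5, threshold="median")
    print(drfs_pipe.transform(X_te).shape)  # (n_test_samples, n_selected_features)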
def performance_1(data):
    dataReduced = {
        "queries": transform_to_1(data["queries"]),
        "docs": transform_to_1(data["docs"]),
    }
    print("Preparing model")
    model = cluster.FeatureAgglomeration(n_clusters=384)
    print(dataReduced["docs"][0][:10])
    print("Fitting model")
    model.fit(data["docs"])
    dataNew = {
        "docs": model.transform(dataReduced["docs"]),
        "queries": model.transform(dataReduced["queries"]),
    }
    # is not 1bit
    print(dataNew["docs"][0][:10])
    return summary_performance(dataNew)
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, center_data, norm_data
from misc.retrieval_utils import rprec_a_ip, rprec_a_l2
import argparse
from sklearn import cluster

parser = argparse.ArgumentParser()
parser.add_argument('--data', default="/data/hp/dpr-c.embd_cn")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
data = read_pickle(args.data)

print("Preparing model")
model = cluster.FeatureAgglomeration(n_clusters=128)
print("Fitting model")
model.fit(data["docs"])
dataNew = {
    "docs": model.transform(data["docs"]),
    "queries": model.transform(data["queries"]),
}
val_ip_pca = rprec_a_ip(dataNew["queries"], dataNew["docs"],
                        data["relevancy"], data["relevancy_articles"],
                        data["docs_articles"], fast=True)
val_l2_pca = rprec_a_l2(dataNew["queries"],
import numpy as np
from sklearn import datasets, cluster
import IPython

digits = datasets.load_digits()
images = digits.images
X = np.reshape(images, (len(images), -1))

agglo = cluster.FeatureAgglomeration(n_clusters=32)
agglo.fit(X)
X_reduced = agglo.transform(X)
X_reduced.shape

IPython.embed()
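# Hedged follow-up (added; not part of the original session): load_digits gives
# 1797 samples with 64 pixel features, so X_reduced should be (1797, 32);
# inverse_transform assigns each original pixel its cluster's value again.
X_restored = agglo.inverse_transform(X_reduced)
print(X_reduced.shape)   # expected: (1797, 32)
print(X_restored.shape)  # expected: (1797, 64)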
def FeatureAgglomeration_cluster():
    global tfidf_matrix
    agglo = cluster.FeatureAgglomeration(n_clusters=32)
    agglo.fit(tfidf_matrix)
    return agglo.labels_
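# Hedged usage sketch (added; not from the source): FeatureAgglomeration_cluster
# reads a module-level `tfidf_matrix` and needs `from sklearn import cluster` at
# module scope; both are assumptions here. The synthetic corpus guarantees at
# least 32 distinct terms, since n_clusters=32 requires >= 32 features, and the
# sparse TF-IDF output is densified before fitting.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["term%d term%d term%d" % (i, (i + 1) % 40, (i + 2) % 40)
          for i in range(40)]
tfidf_matrix = TfidfVectorizer().fit_transform(corpus).toarray()
feature_labels = FeatureAgglomeration_cluster()
print(len(feature_labels))  # one cluster id per TF-IDF column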
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A leading `:` before the search list asks for estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO maybe add regular expression check
            ev = safe_eval_es(search_list)

            preprocessings = (
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0,
                                                n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))

            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
def preprocess(data, features):
    import pandas as pd
    import numpy as np

    np.random.seed(10)
    N = 3
    print(data.shape)

    # Randomly halve the rows N times to shrink the dataset
    row_reduce = data
    for i in range(0, N):
        remove_n = row_reduce.shape[0] // 2
        drop_indices = np.random.choice(row_reduce.index, remove_n,
                                        replace=False)
        row_reduce = row_reduce.drop(drop_indices)
    df = row_reduce
    df = df.astype({'DATA[0]': 'str', 'DATA[1]': 'str', 'DATA[2]': 'str',
                    'DATA[3]': 'str', 'DATA[4]': 'str', 'DATA[5]': 'str',
                    'DATA[6]': 'str', 'DATA[7]': 'str', 'Flag': 'str'})

    print("Preprocessing Y...")
    Y = df.iloc[:, 8]
    le = preprocessing.LabelEncoder()
    Y = le.fit_transform(Y)
    # pd.DataFrame(Y).to_csv("./attack_labels.csv")

    print("Preprocessing X...")
    X = df.iloc[:, 0:8]
    print(X.shape)

    # LabelEncoder object and fit it to each feature
    print("Encoding X...")
    le = preprocessing.LabelEncoder()
    X = X.apply(le.fit_transform)
    print(X.shape)

    row_reduce = None
    data = None
    df = None

    # OneHotEncoder object, and fit it to all data
    print("One-Hot Encoding X...")
    enc = preprocessing.OneHotEncoder()
    enc.fit(X)
    X = enc.transform(X).toarray()
    print(X.shape)

    from sklearn import datasets, cluster
    print("Performing Feature Agglomeration...")
    agglo = cluster.FeatureAgglomeration(n_clusters=features)
    agglo.fit(X)
    X_reduced = agglo.transform(X)
    print(X_reduced.shape)
    X = None

    return Y, X_reduced
# print(reducedDataSet.shape)
#
# labels = model.labels_
#
# print('labels')
# print(labels)
#
# # print('labels')
# # print(labels)
#
# # sil = metrics.silhouette_score(X, labels, metric='euclidean', sample_size=5000)

from sklearn.decomposition import PCA
import numpy as np

pca = cluster.FeatureAgglomeration(n_clusters=2)
pca.fit(X)

U, S, VT = np.linalg.svd(X - X.mean(0))
X_train_pca = pca.transform(X)
# NOTE: mean_ and components_ are PCA attributes; FeatureAgglomeration does not
# expose them, so the PCA-style reconstructions below only work when `pca` is a
# fitted sklearn.decomposition.PCA instance.
X_train_pca2 = (X - pca.mean_).dot(pca.components_.T)
X_projected = pca.inverse_transform(X_train_pca)
X_projected2 = X_train_pca.dot(pca.components_) + pca.mean_

loss = ((X - X_projected) ** 2).mean()
print(loss)
def get_search_params(params_builder):
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        assert (
            len(lst) == 2
        ), "Error, make sure there is one and only one colon in search parameter input."
        literal = lst[1].strip()
        param_name = lst[0].strip()

        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] +
                                  '__' + param_name] = ev
            else:
                # only for estimator eval, add `-` to the end of param
                # TODO maybe add regular expression check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] +
                                  '__' + param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            # TODO regular expression check?
            ev = safe_eval_es(literal)
            preprocessors = [
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.Imputer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0,
                                                n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0)
            ]
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj))
            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params
def dim_reduction(X, n_components=2, mode="MDS"):
    """Reduces the number of dimensions in which a dataset is defined.

    Arguments

    X       -   NumPy array with shape (N,M), where N is the number of
                observations, and M the number of features.

    Keyword Arguments

    n_components    -   Intended number of features after dimensionality
                        reduction. Default = 2

    mode    -   String that defines the type of dim reduction:
                - None
                - "PCA" principal component analysis
                - "ICA" independent component analysis
                - "FA" factor analysis
                - "TSNE" t-stochastic neighbour embedding
                - "UMAP" uniform manifold approximation and embedding
                - "RANDOMPROJECTION"
                - "FEATUREAGGLOMERATION"
                - "ISOMAP"
                - "LLE" local linear embedding
                - "HESSIAN" Hessian eigenmaps
                - "MLLE" modified local linear embedding
                - "LTSA" local tangent space alignment
                - "MDS" multi-dimensional scaling
                - "DICTIONARY" dictionary learning
                - "TSVD" truncated SVD (also known as "LSE")
                Default = "MDS"

    Returns

    X       -   NumPy array with shape (N, n_components): the observations
                projected into the reduced feature space.
    """

    # Make sure the mode is in all caps.
    if type(mode) == str:
        mode = mode.upper()

    # Copy X into a new matrix.
    X_ = numpy.copy(X)

    # None
    if mode is None or mode == "NONE":
        # Literally nothing happens here for now.
        print("Fart noise!")

    # Principal component analysis.
    elif mode == 'PCA':
        # Initialise a new PCA.
        pca = decomposition.PCA(n_components=n_components)
        # Fit the PCA with the data.
        pca.fit(X_)
        # Transform the data.
        X_ = pca.transform(X_)

    # Independent component analysis.
    elif mode == 'ICA':
        # Initialise a new ICA.
        ica = decomposition.FastICA(n_components=n_components)
        # Fit the ICA with the data.
        ica.fit(X_)
        # Transform the data.
        X_ = ica.transform(X_)

    # Factor analysis.
    elif mode == 'FA':
        # Initialise a new factor analysis.
        fa = decomposition.FactorAnalysis(n_components=n_components)
        # Perform the factor analysis on the data.
        fa.fit(X_)
        # Transform the data.
        X_ = fa.transform(X_)

    # T-distributed stochastic neighbour embedding.
    elif mode == 'TSNE':
        # Run several t-SNEs to find a good one.
        n_runs = 10
        Xs_ = []
        dkl = numpy.ones(n_runs, dtype=float) * numpy.inf
        print("Running %d t-SNEs to find lowest Kullback-Leibler divergence."
              % (n_runs))
        for i in range(n_runs):
            # Initialise a new t-distributed stochastic neighbouring embedding
            # (t-SNE) analysis.
            tsne = TSNE(n_components=n_components)
            # Copy the data into a new variable.
            Xs_.append(numpy.copy(X_))
            # Fit to and transform the data.
            Xs_[i] = tsne.fit_transform(Xs_[i])
            # Get the KL-divergence.
            dkl[i] = tsne.kl_divergence_
            print("\tCurrent KL-divergence = %.5f" % (dkl[i]))
        # Choose the solution with the lowest KL-divergence.
        X_ = numpy.copy(Xs_[numpy.argmin(dkl)])
        # Get rid of all the excess X copies.
        del Xs_

    # Uniform manifold approximation and projection.
    elif mode == 'UMAP':
        # Create a new UMAP instance.
        um = umap.UMAP(n_components=n_components, min_dist=0.01)
        # Fit and transform X.
        X_ = um.fit_transform(X_)

    # Gaussian random projection.
    elif mode == 'RANDOMPROJECTION':
        # Create a new GaussianRandomProjection instance.
        rp = GaussianRandomProjection(n_components=n_components)
        # Fit and transform X.
        X_ = rp.fit_transform(X_)

    # Feature agglomeration.
    elif mode == 'FEATUREAGGLOMERATION':
        # Create a new FeatureAgglomeration instance.
        fa = cluster.FeatureAgglomeration(n_clusters=n_components)
        # Fit and transform X.
        X_ = fa.fit_transform(X_)

    # Isomap.
    elif mode == 'ISOMAP':
        # Create a new Isomap instance.
        im = Isomap(n_components=n_components)
        # Fit and transform X.
        X_ = im.fit_transform(X_)

    # Locally linear embedding.
    elif mode == 'LLE':
        # Create a new LocallyLinearEmbedding instance.
        lle = LocallyLinearEmbedding(n_neighbors=10,
                                     n_components=n_components,
                                     method='standard',
                                     eigen_solver='dense')
        # Fit and transform X.
        X_ = lle.fit_transform(X_)

    # Hessian eigenmaps.
    elif mode == 'HESSIAN':
        # Create a new LocallyLinearEmbedding instance.
        hlle = LocallyLinearEmbedding(n_neighbors=10,
                                      n_components=n_components,
                                      method='hessian',
                                      eigen_solver='dense')
        # Fit and transform X.
        X_ = hlle.fit_transform(X_)

    # Modified locally linear embedding.
    elif mode == 'MLLE':
        # Create a new LocallyLinearEmbedding instance.
        mlle = LocallyLinearEmbedding(n_neighbors=10,
                                      n_components=n_components,
                                      method='modified',
                                      eigen_solver='dense')
        # Fit and transform X.
        X_ = mlle.fit_transform(X_)

    # Local tangent space alignment.
    elif mode == 'LTSA':
        # Create a new LocallyLinearEmbedding instance.
        ltsa = LocallyLinearEmbedding(n_neighbors=10,
                                      n_components=n_components,
                                      method='ltsa',
                                      eigen_solver='dense')
        # Fit and transform X.
        X_ = ltsa.fit_transform(X_)

    # Multi-dimensional scaling.
    elif mode == 'MDS':
        # Create a new MDS instance.
        mds = MDS(n_components=n_components)
        # Fit and transform X.
        X_ = mds.fit_transform(X_)

    # Dictionary learning.
    elif mode == "DICTIONARY":
        # Create a DictionaryLearning instance. The 'omp' algorithm
        # orthogonalises the whole thing, whereas a lasso solution with a low
        # alpha leaves a slightly more scattered solution.
        dictlearn = decomposition.DictionaryLearning(
            n_components=n_components,
            fit_algorithm='cd',
            transform_algorithm='lasso_cd',
            transform_alpha=0.1,
        )
        # Fit and transform X.
        X_ = dictlearn.fit_transform(X)

    # Truncated SVD (also known as 'latent semantic analysis', LSE).
    elif mode in ['TSVD', 'LSE']:
        tsvd = decomposition.TruncatedSVD(n_components=n_components)
        # Fit and transform X.
        X_ = tsvd.fit_transform(X)

    else:
        raise Exception("Unrecognised dimensionality reduction mode '%s'"
                        % (mode))

    return X_
def detect_anomaly(in_data, N_clusters, eng_id, threshold, N_features,
                   n_min=60, steps=80):
    """
    N_features: The number of features to extract from the PCA vector
    n_min = 60  # Minimum place to start the line fit
    steps = 80  # How many steps to take in fitting the line
    """
    # Some fixed parameters
    savgol_window_size = 81
    out_data = 'savgol_eng_' + str(eng_id) + "/"
    #n_min = 60  # Minimum place to start the line fit
    #threshold = 0.5  # In units of sigma

    try:
        # Create target Directory
        os.mkdir(out_data)
        #print("Directory ", out_data, " Created ")
    except FileExistsError:
        print("Directory ", out_data, " already exists")

    # Read in the data
    data = pd.read_csv('data/' + in_data, header=None, delim_whitespace=True)

    # Now we label the columns
    settings = [
        'operational_setting_1', 'operational_setting_2',
        'operational_setting_3'
    ]
    sensors = [
        'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
        'sensor_6', 'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10',
        'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15',
        'sensor_16', 'sensor_17', 'sensor_18', 'sensor_19', 'sensor_20',
        'sensor_21'
    ]
    cols = ['engine_num', 'time_cycles'] + settings + sensors
    data.columns = cols

    sensor_data = data.drop(settings, axis=1)
    sensor_data = sensor_data[sensor_data['engine_num'] == eng_id]
    sensor_data = sensor_data[sensors]

    # Now we examine the correlations
    eng1_data = sensor_data
    # These three sensors are flat lines
    eng1_data = eng1_data.drop(["sensor_1"], axis=1)
    eng1_data = eng1_data.drop(["sensor_18"], axis=1)
    eng1_data = eng1_data.drop(["sensor_19"], axis=1)
    corr = eng1_data.corr()
    corr = np.abs(corr)

    # plot the heatmap
    plt.clf()
    sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
    plt.title("Engine Number: " + str(eng_id))
    plt.plot()
    plt.savefig(out_data + "corr_data_full_" + in_data + "_sensor_" +
                str(int(eng_id)) + '.pdf', bbox_inches='tight')

    # Now we examine the correlations
    eng1_data = sensor_data
    # These three sensors are flat lines
    eng1_data = eng1_data.drop(["sensor_1"], axis=1)
    eng1_data = eng1_data.drop(["sensor_18"], axis=1)
    eng1_data = eng1_data.drop(["sensor_19"], axis=1)
    # Drop these correlated sensors
    eng1_data = eng1_data.drop(["sensor_5"], axis=1)
    eng1_data = eng1_data.drop(["sensor_6"], axis=1)
    eng1_data = eng1_data.drop(["sensor_10"], axis=1)
    eng1_data = eng1_data.drop(["sensor_16"], axis=1)
    corr = np.abs(eng1_data.corr())

    # plot the heatmap
    plt.clf()
    sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
    plt.plot()
    plt.title("Engine Number: " + str(eng_id))
    plt.savefig(out_data + "corr_data_sub_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf', bbox_inches='tight')

    #================================================================
    # Choose the N clusters
    #================================================================
    N_ind_sensors_label = []
    N_ind_sensors_name = []

    corr = np.abs(eng1_data.corr())
    M = np.asarray(corr.iloc[:, :])
    Z = linkage(M, 'single')

    plt.figure(figsize=(25, 10))
    labelsize = 20
    ticksize = 15
    plt.title('Hierarchical Clustering Dendrogram for Sensor Data',
              fontsize=labelsize)
    plt.xlabel('stock', fontsize=labelsize)
    plt.ylabel('distance', fontsize=labelsize)
    dendrogram(
        Z,
        leaf_rotation=90.,   # rotates the x axis labels
        leaf_font_size=8.,   # font size for the x axis labels
        labels=corr.columns)
    plt.yticks(fontsize=ticksize)
    plt.xticks(rotation=-90, fontsize=ticksize)
    plt.savefig(out_data + "sensor_dendrogram_sub_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf', bbox_inches='tight')

    # Lets generate clusters based on the data
    agglo = cluster.FeatureAgglomeration(n_clusters=N_clusters)
    agglo.fit(M)
    M_reduced = agglo.transform(M)
    cluster_label = agglo.labels_
    data_col = corr.columns

    # Now we find representatives of the N clusters
    # Initialize our array
    N_ind_sensors_label.append(cluster_label[0])
    N_ind_sensors_name.append(data_col[0])
    for k in range(1, len(cluster_label)):
        if (cluster_label[k] not in N_ind_sensors_label):
            N_ind_sensors_label.append(cluster_label[k])
            N_ind_sensors_name.append(data_col[k])

    # Now we examine the correlations
    eng1_data_ind = sensor_data[N_ind_sensors_name]
    corr = np.abs(eng1_data_ind.corr())

    # plot the heatmap
    plt.clf()
    sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
    plt.title("Engine Number: " + str(eng_id))
    plt.plot()
    plt.savefig(out_data + "_corr_data_N_sensors_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf', bbox_inches='tight')

    def rms(y):
        s = np.dot(y, y)
        s = s / float(len(y))
        s = np.sqrt(s)
        return s

    def max_peak(y):
        s = np.max(y)
        return s

    def line_int(y):
        s = 0.0
        for i in range(1, len(y)):
            s += np.abs(y[i] - y[i - 1])
        return s

    def energy(y):
        y = y - np.mean(y)
        s = np.dot(y, y)
        return s

    def std(y):
        s = np.std(y)
        return s

    def compute_property(func, y_vec):
        N = len(y_vec)
        y_func = np.zeros(N)
        for i in range(1, N + 1):
            yi = y_vec[0:i]
            fi = func(yi)
            y_func[i - 1] = fi
        return y_func

    # Next, we compute all features for all of the sensors and combine all of
    # the data into a large feature matrix.
    # Generate a new data frame with all of these new features
    X = pd.DataFrame()
    energy_set = ['energy_' + str(int(k)) for k in range(0, N_clusters)]
    rms_set = ['rms_' + str(int(k)) for k in range(0, N_clusters)]
    line_set = ['line_' + str(int(k)) for k in range(0, N_clusters)]
    max_set = ['max_' + str(int(k)) for k in range(0, N_clusters)]
    std_set = ['std_' + str(int(k)) for k in range(0, N_clusters)]

    for k in range(len(energy_set)):
        feature_name_1 = energy_set[k]
        feature_name_2 = rms_set[k]
        feature_name_3 = line_set[k]
        feature_name_4 = max_set[k]
        feature_name_5 = std_set[k]
        X[feature_name_1] = compute_property(energy,
                                             eng1_data_ind.iloc[0:, k].values)
        X[feature_name_2] = compute_property(rms,
                                             eng1_data_ind.iloc[0:, k].values)
        X[feature_name_3] = compute_property(line_int,
                                             eng1_data_ind.iloc[0:, k].values)
        X[feature_name_4] = compute_property(max_peak,
                                             eng1_data_ind.iloc[0:, k].values)
        X[feature_name_5] = compute_property(std,
                                             eng1_data_ind.iloc[0:, k].values)

    all_features = energy_set + rms_set + line_set + max_set + std_set

    # The feature matrix has high dimensionality; use PCA to find the first
    # two principal components of the feature matrix.
    #====================================================================
    # PCA Analysis
    #====================================================================
    pca = PCA(n_components=2)
    # Scale all of the features
    X = X.loc[:, all_features].values
    X = StandardScaler().fit_transform(X)
    principalComponents = pca.fit_transform(X)
    principalDf = pd.DataFrame(data=principalComponents,
                               columns=['pc1', 'pc2'])
    print('Explained Variance: ', pca.explained_variance_ratio_)

    V1 = principalDf['pc1']
    V2 = principalDf['pc2']

    plt.clf()
    plt.title("PCA 1")
    plt.plot(V1)
    plt.xlabel("Cycles", size=20)
    plt.ylabel("PCA [unitless]", size=20)
    plt.savefig(out_data + "PCA1_" + in_data + "_eng_" + str(int(eng_id)) +
                '.pdf', bbox_inches='tight')

    #====================================================================
    # Bayesian Fit
    #====================================================================
    #savgol_filter(y, 11, 3)  # window size 11, polynomial order 3
    x = np.arange(len(V1))  # use an array (a plain range breaks the line fit below)
    y1 = savgol_filter(V1, savgol_window_size, 3)
    y2 = savgol_filter(V2, savgol_window_size, 3)

    plt.clf()
    plt.title("PCA 1 Savgol Filter")
    plt.plot(x, y1)
    plt.xlabel("Cycles", size=20)
    plt.ylabel("PCA [unitless]", size=20)
    plt.savefig(out_data + "PCA1_filter_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf', bbox_inches='tight')

    def bayesian_fit(x, y):
        def lnlike(theta, x, y):
            a1, b, sigma = theta
            model = a1 * x + b
            inv_sigma2 = 1.0 / sigma**2
            return -0.5 * (np.sum((y - model)**2 * inv_sigma2 -
                                  np.log(inv_sigma2)))

        def lnprob(theta, x, y):
            #lp = lnprior(theta)
            #return lp + lnlike(theta, x, y, yerr)
            return lnlike(theta, x, y)

        nll = lambda *args: -lnlike(*args)
        result = op.minimize(nll, [1.0, 1.0, 1.0], args=(x, y))
        a1_ml, b_ml, sigma_ml = result["x"]

        ndim, nwalkers = 3, 8
        pos = [
            result["x"] + 1e-4 * np.random.randn(ndim)
            for i in range(nwalkers)
        ]
        sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y))
        sampler.run_mcmc(pos, 100)
        samples = sampler.chain[:, 50:, :].reshape((-1, ndim))
        return samples

    Nk = []
    a1_k = []
    a2_k = []
    a2_error_k = []
    a1_error_k = []
    sse_k = []

    n_max = len(y1)
    dh = int(float(n_max - n_min) / float(steps))
    for N in range(n_min, n_max, dh):
        x_N = x[0:N]
        y_N = y1[0:N]
        samples = bayesian_fit(x_N, y_N)
        a1_N = np.mean(samples[:, 0])
        b_N = np.mean(samples[:, 1])
        a1_k.append(a1_N)
        model_k = b_N + a1_N * np.asarray(x_N)
        sse = np.dot(y_N - model_k, y_N - model_k)
        a1_error_k = np.std(samples[:, 0])
        sse_k.append(sse)
        Nk.append(N)

    plt.clf()
    plt.title('Linear coefficient')
    plt.plot(Nk, a1_k)
    plt.ylabel('$a_1$', size=20)
    plt.xlabel('Cycles', size=20)
    plt.savefig(out_data + "linear_coeff_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf', bbox_inches='tight')

    plt.clf()
    plt.plot(Nk, sse_k, '-o', color='blue')
    plt.xlabel("Cycles", size=20)
    plt.ylabel('Residuals', size=20)
    plt.savefig(out_data + "residuals_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf', bbox_inches='tight')

    plt.clf()
    sse2_k = np.gradient(sse_k, 2)
    plt.title("Residual Acc. vs Cycle", size=20)
    plt.plot(Nk, sse2_k, '-o', color='green')
    plt.xlabel("Cycles", size=20)
    plt.ylabel("Residual Acceleration", size=20)
    plt.savefig(out_data + "residual_acc_" + in_data + "_eng_" +
                str(int(eng_id)) + '.pdf', bbox_inches='tight')

    #========================================================================
    # Now we determine the range of the anomalies
    # Apply the standard Scaler to the SSE-acceleration data
    sse2_k_median = np.median(sse2_k)
    sse2_k_std = np.std(sse2_k)
    sse2_k_scaled = np.abs(sse2_k - sse2_k_median) / sse2_k_std

    # Now normalize from zero to one
    sse2_k_max = np.max(sse2_k_scaled)
    sse2_k_min = np.min(sse2_k_scaled)
    sse2_k_scaled = (sse2_k_scaled - sse2_k_min) / (sse2_k_max - sse2_k_min)

    # Define a possible threshold for the failure region
    Nk_anom = []
    for i in range(len(sse2_k)):
        # Find all points that have an anomaly
        if (sse2_k_scaled[i] >= threshold):
            Nk_anom.append(Nk[i])

    if (len(Nk_anom) != 0):
        plt.clf()
        [
            plt.axvline(Ni, alpha=1.0, color='red', linewidth=2.0)
            for Ni in Nk_anom
        ]
        plt.axvspan(Nk_anom[0], Nk_anom[-1], alpha=0.2, color='purple')
        plt.plot(Nk, sse2_k_scaled, '-o', color='green')
        plt.title("Residual Acc. Scaled vs Cycle", size=20)
        plt.xlabel("Cycles", size=20)
        plt.ylabel("Residual Acceleration", size=20)
        plt.savefig(out_data + "_scaled_residual_acc_" + in_data + "_eng_" +
                    str(int(eng_id)) + '.pdf', bbox_inches='tight')

    for k in range(0, len(N_ind_sensors_name)):
        plt.clf()
        plt.title(N_ind_sensors_name[k], size=20)
        plt.plot(eng1_data_ind.iloc[:, k].values)
        if (len(Nk_anom) != 0):
            [
                plt.axvline(Ni, alpha=0.5, color='red', linewidth=3.0)
                for Ni in Nk_anom
            ]
            # plt.axvspan(Nk_anom[0], Nk_anom[-1], alpha=0.2, color='purple')
        plt.xlabel("Cycles", size=20)
        plt.legend()
        plt.savefig(out_data + "sensor_" + str(N_ind_sensors_name[k]) +
                    "_anomaly_" + in_data + "_eng_" + str(int(eng_id)) +
                    '.pdf', bbox_inches='tight')

    plt.clf()
    plt.plot(x, y1)
    if (len(Nk_anom) != 0):
        [
            plt.axvline(Ni, alpha=0.3, color='red', linewidth=3.0)
            for Ni in Nk_anom
        ]
        #plt.axvspan(Nk_anom[0], Nk_anom[-1], alpha=0.2, color='purple')
    plt.ylabel("PCA1 filter", size=20)
    plt.xlabel("Cycles", size=20)
    plt.savefig(out_data + "PCA_" + in_data + "_eng_" + str(int(eng_id)) +
                '.pdf', bbox_inches='tight')

    y_feature = x[-1] - x[N_features]
    x_feature = y1[0:N_features]

    return x_feature, y_feature
def transform(self, results_file='', short_texts_length=15):
    """
    Classify texts for each provider and save predictions
    :param results_file: path to previously computed predictions
    :param short_texts_length: length of short texts for different objects
    """
    if path.exists(results_file):
        self.load_results(results_file)
        return

    file_names = os.listdir(self.data_directory)
    paths = [self.data_directory + '/' + name for name in file_names]
    ids_vector = [name.split('-')[0] for name in file_names]
    categories_vector = [name.split('-')[1] for name in file_names]
    ratings_vector = [
        int(name.split('-')[2].split('.')[0]) for name in file_names
    ]

    #features = texts_to_vectors(paths)
    features, ratings_vector, categories_vector, ids_vector, paths = divide_texts(
        paths, ratings_vector, categories_vector, ids_vector,
        n=short_texts_length)

    # Feature Agglomeration
    if self.feature_agglomeration:
        agglomeration = cluster.FeatureAgglomeration(n_clusters=5)
        agglomeration.fit(features)
        features_reduced = agglomeration.transform(features)
        features = features_reduced

    self.unique_ratings = sorted(list(set(ratings_vector)))
    unique_ids = list(set(ids_vector))

    # Object selection
    if self.selection == 'none':
        selected_features = features
        selected_ids_vector = ids_vector
        selected_ratings_vector = ratings_vector
    elif self.selection == 'kmeans':
        selected_features, selected_ids_vector, selected_ratings_vector = self.selection_kmeans(
            ids_vector, ratings_vector, features)
    elif self.selection == 'random':
        selected_features, selected_ids_vector, selected_ratings_vector = self.selection_random(
            ids_vector, ratings_vector, features)
    elif self.selection == 'silhouette':
        selected_features, selected_ids_vector, selected_ratings_vector = self.selection_silhouette(
            ids_vector, ratings_vector, features, categories_vector)

    true_ratings_object = {}
    predicted_ratings_object = {}
    predicted_ratings_vector = []
    true_ratings_vector = []
    paths_object = {}
    ids_object = {}

    if self.algorithm == 'knn':
        model = KNeighborsClassifier(n_neighbors=3)
    elif self.algorithm == 'lr':
        model = linear_model.Lasso(alpha=0.1)
    else:
        model = RandomForestClassifier()

    for current_id in unique_ids:
        # Images for current_id to test set and other images to train set
        test_indexes = []
        train_indexes = []
        for index, img_id in enumerate(ids_vector):
            if img_id == current_id:
                test_indexes.append(index)
        for index, img_id in enumerate(selected_ids_vector):
            if img_id != current_id:
                train_indexes.append(index)

        train_X = selected_features[train_indexes, :]
        test_X = features[test_indexes, :]
        train_y = [selected_ratings_vector[j] for j in train_indexes]
        test_y = [ratings_vector[j] for j in test_indexes]
        if len(test_y) == 0:
            continue

        model.fit(train_X, train_y)
        predictions = model.predict(test_X)

        # Save to object
        predicted_ratings_object[current_id] = predictions
        true_ratings_object[current_id] = test_y
        paths_object[current_id] = [
            paths[test_index] for test_index in test_indexes
        ]
        ids_object[current_id] = [
            ids_vector[test_index] for test_index in test_indexes
        ]

        # Save to vector
        predicted_ratings_vector.extend(predictions)
        true_ratings_vector.extend(test_y)

    # Save to class properties
    self.predicted_ratings_object = predicted_ratings_object
    self.true_ratings_object = true_ratings_object
    self.predicted_ratings_vector = predicted_ratings_vector
    self.true_ratings_vector = true_ratings_vector
    self.paths_object = paths_object
    self.ids_object = ids_object

    # Save predictions to a file
    self.save_results(results_file)
# 4. Clustering with connectivity constraints
import matplotlib.pyplot as plt
from skimage.data import coins
from skimage.transform import rescale
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.cluster import AgglomerativeClustering

orig_coins = coins()

# Smooth the image with a Gaussian filter, then downscale it so it is easier to process
# smoothened_coins = gaussian_filter(orig_coins, sigma=2)  # this Gaussian filter is problematic
# rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect")
# X = np.reshape(rescaled_coins, (-1, 1))
# connectivity = grid_to_graph(*rescaled_coins.shape)

# 5. Feature agglomeration (merge similar features together).
# Still a bit fuzzy on this part; pasting the code here for now.
import numpy as np
from sklearn import datasets, cluster  # needed for load_digits and FeatureAgglomeration below

digits = datasets.load_digits()
images = digits.images
X = np.reshape(images, (len(images), -1))
connectivity = grid_to_graph(*images[0].shape)

agglo = cluster.FeatureAgglomeration(connectivity=connectivity, n_clusters=32)
agglo.fit(X)
X_reduced = agglo.transform(X)
X_approx = agglo.inverse_transform(X_reduced)
images_approx = np.reshape(X_approx, images.shape)
preprocess_pipeline = compose.ColumnTransformer([
    ('cat', categorical_pipeline, cat_features),
    ('num', numerical_pipeline, num_features)
])
house_train2 = preprocess_pipeline.fit_transform(house_train1)

# outlier detection pipeline
outlier_pipeline = pipeline.Pipeline([
    ('preprocess', preprocess_pipeline),
    ('outlier_estimator', ensemble.IsolationForest(contamination=0.01))
])
labels = outlier_pipeline.fit_predict(house_train1)
house_train1[labels == -1]

# add clustering label as new feature
# (note: fit_transform above returns an ndarray, so in practice house_train2
# may need to be wrapped in a DataFrame before a named column can be added)
clustering = cluster.AgglomerativeClustering(n_clusters=5)
clustering.fit(house_train2)
house_train2['house_group'] = clustering.labels_

# feature reduction by clustering related features
cluster_pipeline = pipeline.Pipeline([
    ('preprocess', preprocess_pipeline),
    ('cluster_features', cluster.FeatureAgglomeration(n_clusters=50))
])
cluster_data = cluster_pipeline.fit_transform(house_train1)

# feature reduction on top of correlation matrix
corr_matrix = np.corrcoef(house_train2, rowvar=False)
clustering = cluster.AgglomerativeClustering(n_clusters=10)
clustering.fit(corr_matrix)