def main(cm_file, perm_file, steps, labels_file, limit_classes=None,
         n_clusters=14):
    """Cluster the rows of a confusion matrix and print the resulting groups.

    The confusion matrix is loaded from a JSON file, each row is used as the
    feature vector of one class, and the rows are grouped with spectral
    clustering. The silhouette score and the members of every cluster are
    printed to stdout; nothing is returned.

    Args:
        cm_file: Path to a JSON file containing the confusion matrix.
        perm_file: Not used by this function; kept so existing callers work.
        steps: Not used by this function; kept so existing callers work.
        labels_file: Path to a JSON file with one label per matrix row. When
            the path is not an existing file, the row indices are used as
            labels instead.
        limit_classes: Not used by this function; kept so existing callers
            work.
        n_clusters: Number of clusters to form (hyperparameter). Defaults to
            the previously hard-coded value of 14.
    """
    # Load confusion matrix
    with open(cm_file) as f:
        cm = np.array(json.load(f))

    # Load labels, falling back to plain row indices
    if os.path.isfile(labels_file):
        with open(labels_file, "r") as f:
            labels = json.load(f)
    else:
        labels = list(range(len(cm)))

    spectral = SpectralClustering(n_clusters=n_clusters,
                                  eigen_solver='arpack',
                                  affinity="nearest_neighbors")
    spectral.fit(cm)
    if hasattr(spectral, 'labels_'):
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy, so the builtin is used directly.
        y_pred = spectral.labels_.astype(int)
    else:
        y_pred = spectral.predict(cm)

    sscore = silhouette_score(cm, y_pred)
    print("silhouette_score={} with {} clusters"
          .format(sscore, n_clusters))

    # Collect the labels that ended up in each cluster
    grouping = [[] for _ in range(n_clusters)]
    for label, y in zip(labels, y_pred):
        grouping[y].append(label)
    for group in grouping:
        print("  {}: {}".format(len(group), group))
class ClusterAndRegress():
    """Cluster low-resolution samples and fit one linear model per cluster.

    ``fit`` partitions the rows of ``Xlr`` with the selected clustering
    algorithm and trains a separate linear regression from ``Xlr`` to ``Xhr``
    inside every cluster. ``transform`` assigns new rows to clusters and
    applies the matching regressor.

    NOTE: ``transform`` calls ``self.clustering.predict``. scikit-learn's
    ``AgglomerativeClustering`` ('Ward') and ``SpectralClustering`` do not
    provide ``predict``, so those two options only support ``fit``.
    """

    def __init__(self, n_clusters=1000, clustering='MiniBatchKMeans'):
        """Create the clustering estimator.

        Args:
            n_clusters: Number of clusters, and therefore of regressors.
            clustering: One of 'MiniBatchKMeans', 'Birch', 'Ward' or
                'SpectralClustering'.

        Raises:
            ValueError: If ``clustering`` is not one of the supported names.
        """
        self.n_clusters = n_clusters
        if clustering == 'MiniBatchKMeans':
            self.batch_size = 10 * self.n_clusters
            self.clustering = MiniBatchKMeans(n_clusters=self.n_clusters,
                                              batch_size=self.batch_size)
        elif clustering == 'Birch':
            self.clustering = Birch(n_clusters=self.n_clusters)
        elif clustering == 'Ward':
            # Was ``self.clusters``, an attribute that is never defined.
            self.clustering = AgglomerativeClustering(
                n_clusters=self.n_clusters, linkage='ward')
        elif clustering == 'SpectralClustering':
            self.clustering = SpectralClustering(
                n_clusters=self.n_clusters,
                affinity='nearest_neighbors',
                eigen_solver='arpack')
        else:
            # Fail here instead of with an AttributeError on the first fit().
            raise ValueError(
                "Unknown clustering method: {!r}".format(clustering))
        self.regressors = [None] * n_clusters

    def fit(self, Xhr, Xlr):
        """Cluster ``Xlr`` and fit one regressor ``Xlr -> Xhr`` per cluster.

        Args:
            Xhr: 2-D array of regression targets, row-aligned with ``Xlr``.
            Xlr: 2-D array of samples used for clustering and as regression
                inputs.
        """
        y_pred = self.clustering.fit_predict(Xlr)
        for i in range(self.n_clusters):
            members = y_pred == i
            if not members.any():
                # A regression cannot be fitted on zero samples; leave the
                # slot empty instead of crashing on an empty cluster.
                self.regressors[i] = None
                continue
            regressor = linear_model.LinearRegression()
            regressor.fit(Xlr[members, :], Xhr[members, :])
            self.regressors[i] = regressor

    def transform(self, X):
        """Predict an output row for every row of ``X``.

        Args:
            X: 2-D array of samples.

        Returns:
            Array with the same shape as ``X`` holding the per-cluster
            regression output for each row.

        Raises:
            ValueError: If rows are assigned to a cluster for which no
                regressor was fitted.
        """
        y_lr = self.clustering.predict(X)
        p = np.zeros(X.shape)
        for c in range(self.n_clusters):
            index = y_lr == c
            if not index.any():
                # Nothing assigned to this cluster; avoid predicting on an
                # empty array.
                continue
            regressor = self.regressors[c]
            if regressor is None:
                raise ValueError(
                    "No regressor was fitted for cluster {}".format(c))
            p[index, :] = regressor.predict(X[index, :])
        return p
def SpectralClustering(trainSet, trainLabels, testSet):
    """Assign each row of ``testSet`` to one of two spectral clusters.

    Spectral clustering is unsupervised and, in scikit-learn, has no
    ``predict`` method for unseen samples, so the test set itself is
    clustered. The returned values are arbitrary cluster ids (0 or 1); they
    are not aligned with the values in ``trainLabels``.

    Args:
        trainSet: Unused; kept so existing callers keep working.
        trainLabels: Unused; kept so existing callers keep working.
        testSet: 2-D array-like of samples to cluster.

    Returns:
        Array with one cluster id per row of ``testSet``.
    """
    # This function has the same name as the scikit-learn class, so the
    # module-level name refers to the function itself. Import the estimator
    # under an alias to avoid calling ourselves recursively.
    from sklearn.cluster import SpectralClustering as _SklearnSpectralClustering

    classifier = _SklearnSpectralClustering(n_clusters=2,
                                            assign_labels="discretize",
                                            random_state=0)
    predictedLabels = classifier.fit_predict(testSet)
    return predictedLabels
class ClusteringPredictiveModel:
    """Cluster cases, then train one random-forest classifier per cluster.

    Cases are clustered from a precomputed affinity matrix. For every
    cluster, the rows of the event log that belong to the cluster's cases
    are encoded with ``LastStateEncoder`` and used to fit a dedicated
    ``RandomForestClassifier``.
    """

    def __init__(self, similarities, Activity_train, case_id_col, event_col,
                 label_col, timestamp_col, cat_cols, numeric_cols, n_clusters,
                 n_estimators, random_state=22, fillna=True,
                 pos_label="A_Pending"):
        """Store configuration and build the encoders and estimators.

        Args:
            similarities: Precomputed affinity matrix passed to the spectral
                clustering in ``fit``.
            Activity_train: Object whose ``index`` holds the training case
                ids, in the same order as the rows of ``similarities``
                (presumably a DataFrame - verify against the caller).
            case_id_col: Name of the case-id column in the event log.
            event_col: Name of the event column (used by the frequency
                encoder).
            label_col: Name of the label column.
            timestamp_col: Name of the timestamp column.
            cat_cols: Categorical columns for the last-state encoder.
            numeric_cols: Numeric columns for the last-state encoder.
            n_clusters: Number of clusters and of per-cluster classifiers.
            n_estimators: Number of trees in each random forest.
            random_state: Seed for KMeans and the random forests.
            fillna: Forwarded to ``LastStateEncoder``.
            pos_label: Label value regarded as the positive class.
        """
        # columns
        self.case_id_col = case_id_col
        self.label_col = label_col
        self.pos_label = pos_label

        self.n_clusters = n_clusters

        self.freq_encoder = FrequencyEncoder(case_id_col, event_col)
        self.data_encoder = LastStateEncoder(case_id_col, timestamp_col,
                                             cat_cols, numeric_cols, fillna)
        self.clusteringKMeans = KMeans(n_clusters, random_state=random_state)
        self.aff_matrix = similarities
        # The cluster count was hard-coded to 6 here, which disagreed with
        # the ``n_clusters`` classifiers created below and with the loops in
        # fit()/predict_proba().
        self.clustering = SpectralClustering(n_clusters=n_clusters,
                                             affinity='precomputed')
        self.clss = [
            RandomForestClassifier(n_estimators=n_estimators,
                                   random_state=random_state)
            for _ in range(n_clusters)
        ]
        self.data_freqs = 0
        self.Activity_train = Activity_train

    def fit(self, X, y=None):
        """Cluster the training cases and fit one classifier per cluster.

        Args:
            X: Event-log DataFrame containing ``case_id_col`` and
                ``label_col``.
            y: Ignored.

        Returns:
            self
        """
        # cluster cases according to the precomputed affinity matrix
        cluster_assignments = self.clustering.fit_predict(self.aff_matrix)
        train_case_ids = np.array(self.Activity_train.index)

        # train classifier for each cluster
        for cl in range(self.n_clusters):
            cases = list(train_case_ids[cluster_assignments == cl])
            tmp = X[X[self.case_id_col].isin(cases)]
            # NOTE(review): the encoder is only transform()-ed, never fitted,
            # in this class - confirm LastStateEncoder supports that.
            tmp = self.data_encoder.transform(tmp)
            self.clss[cl].fit(
                tmp.drop([self.case_id_col, self.label_col], axis=1),
                tmp[self.label_col])
        return self

    def predict_proba(self, X, Activity_test):
        """Predict class probabilities with the classifier of each cluster.

        As a side effect, ``self.actual`` is set to a DataFrame with the
        one-hot encoded true labels, and one count per cluster is printed.

        Args:
            X: Event-log DataFrame containing ``case_id_col`` and
                ``label_col``.
            Activity_test: Object whose ``index`` holds the test case ids; it
                is also passed to ``self.clustering.predict``.

        Returns:
            DataFrame with ``case_id_col`` plus one probability column per
            class; missing values are filled with 0.
        """
        # encode events as frequencies
        self.data_freqs = self.freq_encoder.transform(X)
        self.Activity_test = Activity_test

        # calculate closest clusters for each trace
        # NOTE(review): scikit-learn's SpectralClustering does not implement
        # predict(); confirm that self.clustering provides it, otherwise this
        # line raises AttributeError.
        cluster_assignments = self.clustering.predict(self.Activity_test)
        test_case_ids = np.array(self.Activity_test.index)

        # predict outcomes for each cluster
        cols = [self.case_id_col] + list(self.clss[0].classes_)
        preds = pd.DataFrame(columns=cols)
        self.actual = pd.DataFrame(columns=cols)
        for cl in range(self.n_clusters):
            # select cases belonging to given cluster
            cases = list(test_case_ids[cluster_assignments == cl])
            if len(cases) > 0:
                tmp = X[X[self.case_id_col].isin(cases)]

                # encode data attributes
                tmp = self.data_encoder.transform(tmp)

                # make predictions
                new_preds = pd.DataFrame(self.clss[cl].predict_proba(
                    tmp.drop([self.case_id_col, self.label_col], axis=1)))
                new_preds.columns = self.clss[cl].classes_
                # Take the ids from the encoded frame so that every
                # probability row is tagged with the case it was computed
                # from. ``cases`` follows the order of Activity_test.index,
                # which need not match the row order (or count) of ``tmp``.
                new_preds[self.case_id_col] = tmp[self.case_id_col].values
                preds = pd.concat([preds, new_preds], axis=0,
                                  ignore_index=True, sort=False)

                # extract actual label values
                actuals = pd.get_dummies(tmp[self.label_col])
                actuals[self.case_id_col] = tmp[self.case_id_col]
                self.actual = pd.concat([self.actual, actuals], axis=0,
                                        ignore_index=True, sort=False)
                print(' ', len(new_preds), end='')
            else:
                print(' 0', end='')
        print('')
        preds.fillna(0, inplace=True)
        self.actual.fillna(0, inplace=True)
        return preds
def _cluster_points(X, algoName, n_clusters):
    """Fit the requested clustering algorithm on X and return one label per row.

    Any name that is not recognised (including 'kmeans') falls back to KMeans.
    """
    if algoName == 'spectral':
        algo = SpectralClustering(n_clusters=n_clusters,
                                  eigen_solver='arpack',
                                  affinity='nearest_neighbors')
        algo.fit(X)
        return algo.labels_.astype(int)
    elif algoName == 'minikmeans':
        algo = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters,
                               n_init=10, max_no_improvement=10, verbose=0)
        algo.fit(X)
        return algo.predict(X)
    elif algoName == 'meanshift':
        bandwidth = estimate_bandwidth(X, quantile=0.3)
        algo = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        algo.fit(X)
        return algo.labels_.astype(int)
    elif algoName == 'affinity':
        algo = AffinityPropagation(damping=0.8)
        algo.fit(X)
        return algo.labels_.astype(int)
    elif algoName == 'agglo':
        connectivity = kneighbors_graph(X, n_neighbors=2, include_self=False)
        # NOTE(review): recent scikit-learn releases renamed ``affinity`` to
        # ``metric`` for AgglomerativeClustering - confirm against the
        # installed version.
        algo = AgglomerativeClustering(linkage='average',
                                       affinity='cityblock',
                                       n_clusters=n_clusters,
                                       connectivity=connectivity)
        algo.fit(X)
        return algo.labels_.astype(int)
    elif algoName == 'birch':
        algo = Birch(n_clusters=n_clusters, threshold=0.1,
                     branching_factor=10)
        algo.fit(X)
        return algo.labels_.astype(int)
    elif algoName == 'gaussian':
        algo = GaussianMixture(n_components=n_clusters,
                               covariance_type='full')
        algo.fit(X)
        return algo.predict(X)

    algo = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    algo.fit(X)
    return algo.predict(X)


def main(event, context):
    """Cluster the addresses of a request by latitude/longitude.

    The JSON document in ``event['body']`` is expected to contain:
      * ``data``: list of objects with ``address`` and
        ``location`` = {``lat``, ``lng``};
      * ``clusters`` (optional): number of clusters, default 2;
      * ``algo`` (optional): 'spectral', 'minikmeans', 'meanshift',
        'affinity', 'agglo', 'birch' or 'gaussian'; anything else, or
        leaving it out, selects KMeans.

    Args:
        event: Mapping that carries the request; only ``body`` is read.
        context: Not used.

    Returns:
        The value of ``createResponse`` applied either to
        ``{'error': 'no data'}`` (no body, or no addresses in it) or to a
        dict with the keys ``clusters``, ``algo``, ``addresses`` and
        ``clusterData``.
    """
    raw_body = event.get('body')
    if not raw_body:
        return createResponse({'error': 'no data'})

    body = json.loads(raw_body)
    dataObject = body.get('data') or []
    if not dataObject:
        # Nothing to cluster; the estimators would fail on an empty matrix.
        return createResponse({'error': 'no data'})

    # Parse the coordinates once and reuse them for the response below.
    coords = [
        (float(address['location']['lat']), float(address['location']['lng']))
        for address in dataObject
    ]
    X = np.array(coords, dtype=float).reshape(-1, 2)

    n_clusters = int(body.get('clusters', 2))
    algoName = body.get('algo', 'kmeans')

    # Previously every branch was an independent ``if`` and the trailing
    # ``else`` re-ran KMeans for every algorithm except 'gaussian'; the
    # helper dispatches to exactly one algorithm.
    prediction = _cluster_points(X, algoName, n_clusters)
    print(prediction)

    clusterData = [
        {
            'address': address['address'],
            'lat': lat,
            'lng': lng,
            'cluster': int(prediction[index]),
        }
        for index, (address, (lat, lng)) in enumerate(zip(dataObject, coords))
    ]
    dataWithMetaData = {
        'clusters': n_clusters,
        'algo': algoName,
        'addresses': len(dataObject),
        'clusterData': clusterData
    }
    return createResponse(dataWithMetaData)
# Normalise the features, then train and evaluate a k-nearest-neighbours
# classifier on a random 70/30 train/test split.
scaler = Normalizer().fit(X)
trainX = scaler.transform(X)

# Use the normalised features; previously the raw X was passed on and the
# scaler output was discarded.
traindata = np.array(trainX)
trainlabel = np.array(Y)

traindata, testdata, trainlabel, testlabel = model_selection.train_test_split(
    traindata, trainlabel, test_size=0.3)

model = KNeighborsClassifier()
model.fit(traindata, trainlabel)
print(model)

# make predictions
expected = testlabel
predicted = model.predict(testdata)

# summarize the fit of the model
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="binary")
precision = precision_score(expected, predicted, average="binary")
f1 = f1_score(expected, predicted, average="binary")

cm = metrics.confusion_matrix(expected, predicted)
print(cm)
# confusion_matrix puts true classes on the rows and predicted classes on
# the columns, so for binary labels the layout is [[TN, FP], [FN, TP]].
tpr = float(cm[1][1]) / np.sum(cm[1])  # TP / (TP + FN)
fpr = float(cm[0][1]) / np.sum(cm[0])  # FP / (FP + TN)
print("%.3f" % tpr)
print("%.3f" % fpr)
print("Accuracy")
print("%.3f" % accuracy)
# To visualize the K-Means clustering I wrote the code in the comment above,
# but it raised a ValueError:
# "Only sparse matrices with 32-bit integer indices are accepted. Got int64 indices."
# I therefore defined a convert_to_32bit_indices function, but that did not
# work either.
# -------------------------------------------------------------------------------
# Spectral Clustering
# For Spectral Clustering I run convert_to_64bit_indices to get rid of
# "RuntimeError: nnz of the result is too large". The original version still
# did not run: it called spectral.predict(), a method that scikit-learn's
# SpectralClustering does not provide. The labels are now taken from
# fit_predict() instead.


def convert_to_64bit_indices(x):
    """Cast the ``indptr`` and ``indices`` arrays of ``x`` to int64 in place.

    Args:
        x: Sparse matrix exposing ``indptr`` and ``indices`` arrays (for
            example a SciPy CSR/CSC matrix).

    Returns:
        The same object ``x``, with both index arrays stored as int64.
    """
    # np.asarray copies only when a dtype conversion is needed. The previous
    # ``np.array(..., copy=False)`` raises under NumPy 2 whenever the cast
    # (e.g. int32 -> int64) requires a copy.
    x.indptr = np.asarray(x.indptr, dtype=np.int64)
    x.indices = np.asarray(x.indices, dtype=np.int64)
    return x


X = convert_to_64bit_indices(X)

from sklearn.cluster import SpectralClustering

spectral = SpectralClustering(n_clusters=2, random_state=44, gamma=1.0, n_neighbors=10)
preds = spectral.fit_predict(X)
print(spectral.affinity_matrix_)

# Gaussian Mixture Model
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=2)
# Densify once and reuse the array for fitting, probabilities and scoring.
X_dense = X.toarray()
gmm_labels = gmm.fit_predict(X_dense)
proba_lists = gmm.predict_proba(X_dense)
print(f"The score of Gaussian Mixture model is: {gmm.score(X_dense)}")