def main(cm_file, perm_file, steps, labels_file, limit_classes=None,
         n_clusters=14):
    """Cluster the rows of a confusion matrix and print the resulting groups.

    The confusion matrix is read from a JSON file, clustered with spectral
    clustering (nearest-neighbours affinity, ARPACK eigen solver), and the
    silhouette score plus the members of every cluster are printed.

    Parameters
    ----------
    cm_file : str
        Path to a JSON file holding the confusion matrix.
    perm_file, steps, limit_classes
        Accepted for interface compatibility; not used by this function.
    labels_file : str
        Path to a JSON file holding the labels (presumably one per matrix
        row -- they are zipped with the per-row cluster assignments). If the
        file does not exist, the row indices are used as labels.
    n_clusters : int, optional
        Number of clusters to form (hyperparameter). Defaults to 14.
    """
    # Load confusion matrix
    with open(cm_file) as f:
        cm = np.array(json.load(f))

    # Load labels, falling back to plain row indices
    if os.path.isfile(labels_file):
        with open(labels_file, "r") as f:
            labels = json.load(f)
    else:
        labels = list(range(len(cm)))

    spectral = SpectralClustering(n_clusters=n_clusters,
                                  eigen_solver='arpack',
                                  affinity="nearest_neighbors")
    spectral.fit(cm)
    # SpectralClustering always exposes ``labels_`` after ``fit`` and has no
    # ``predict`` method. ``np.int`` was removed in NumPy 1.24; the builtin
    # ``int`` is the equivalent dtype.
    y_pred = spectral.labels_.astype(int)

    sscore = silhouette_score(cm, y_pred)
    print("silhouette_score={} with {} clusters"
          .format(sscore, n_clusters))

    # Collect the labels belonging to each cluster id
    grouping = [[] for _ in range(n_clusters)]
    for label, y in zip(labels, y_pred):
        grouping[y].append(label)
    for group in grouping:
        print("  {}: {}".format(len(group), group))
class ClusterAndRegress():
    """Cluster samples, then fit one linear regressor per cluster.

    ``fit`` clusters ``Xlr`` and, for every cluster, fits a linear regression
    from the rows of ``Xlr`` in that cluster to the matching rows of ``Xhr``
    (presumably low-/high-resolution feature pairs -- confirm with callers).

    NOTE: ``transform`` relies on ``self.clustering.predict``. Of the
    supported backends only MiniBatchKMeans and Birch offer ``predict`` in
    scikit-learn; 'Ward' and 'SpectralClustering' can be fitted but not used
    with ``transform``.
    """

    def __init__(self, n_clusters=1000, clustering='MiniBatchKMeans'):
        """Create the clustering backend.

        Parameters
        ----------
        n_clusters : int
            Number of clusters (and therefore of regressors).
        clustering : str
            One of 'MiniBatchKMeans', 'Birch', 'Ward', 'SpectralClustering'.

        Raises
        ------
        ValueError
            If ``clustering`` is not one of the supported names.
        """
        self.n_clusters = n_clusters
        if clustering == 'MiniBatchKMeans':
            self.batch_size = 10 * self.n_clusters
            self.clustering = MiniBatchKMeans(n_clusters=self.n_clusters,
                                              batch_size=self.batch_size)
        elif clustering == 'Birch':
            self.clustering = Birch(n_clusters=self.n_clusters)
        elif clustering == 'Ward':
            # Was ``self.clusters``, an attribute that is never set.
            self.clustering = AgglomerativeClustering(
                n_clusters=self.n_clusters, linkage='ward')
        elif clustering == 'SpectralClustering':
            self.clustering = SpectralClustering(
                n_clusters=self.n_clusters,
                affinity='nearest_neighbors',
                eigen_solver='arpack')
        else:
            # Previously an unknown name silently left ``self.clustering``
            # undefined and only failed later inside ``fit``.
            raise ValueError(
                "Unknown clustering method: {!r}".format(clustering))
        self.regressors = [None] * n_clusters

    def fit(self, Xhr, Xlr):
        """Cluster ``Xlr`` and fit one regressor ``Xlr -> Xhr`` per cluster.

        Clusters that receive no samples keep a ``None`` regressor instead of
        failing on an empty fit.
        """
        y_pred = self.clustering.fit_predict(Xlr)

        for i in range(self.n_clusters):
            members = y_pred == i
            if not members.any():
                # Nothing to fit for an empty cluster.
                self.regressors[i] = None
                continue
            regressor = linear_model.LinearRegression()
            regressor.fit(Xlr[members, :], Xhr[members, :])
            self.regressors[i] = regressor

    def transform(self, X):
        """Predict each row of ``X`` with the regressor of its cluster.

        The output has the same shape as ``X`` (this assumes the regression
        targets used in ``fit`` had as many columns as ``X`` -- TODO confirm).

        Raises
        ------
        RuntimeError
            If samples fall into a cluster that had no training samples.
        """
        y_lr = self.clustering.predict(X)
        p = np.zeros(X.shape)
        for c in range(self.n_clusters):
            index = y_lr == c
            if not index.any():
                # No sample of X belongs to this cluster; predicting on an
                # empty selection would raise.
                continue
            if self.regressors[c] is None:
                raise RuntimeError(
                    "No regressor was fitted for cluster {}".format(c))
            p[index, :] = self.regressors[c].predict(X[index, :])
        return p
# Example #3
# 0
def SpectralClustering(trainSet, trainLabels, testSet):
    """Assign spectral-clustering cluster ids to the samples in ``testSet``.

    Spectral clustering is unsupervised and transductive: scikit-learn's
    estimator has no ``predict`` method and does not learn from labels. The
    training and test samples are therefore clustered together and the
    cluster ids of the test samples are returned.

    Parameters
    ----------
    trainSet : array-like
        Training samples, one row per sample.
    trainLabels : array-like
        Unused; kept for interface compatibility with supervised classifiers.
    testSet : array-like
        Test samples, one row per sample, with the same columns as
        ``trainSet``.

    Returns
    -------
    numpy.ndarray
        One cluster id (0 or 1) per row of ``testSet``. The ids are arbitrary
        cluster indices and are not aligned with the values in
        ``trainLabels``.
    """
    # This function shadows the scikit-learn class of the same name, so the
    # class must be imported under an alias; calling ``SpectralClustering``
    # here would recurse into this function.
    from sklearn.cluster import SpectralClustering as _SkSpectralClustering
    import numpy as np

    train = np.asarray(trainSet)
    test = np.asarray(testSet)
    combined = np.vstack([train, test])

    model = _SkSpectralClustering(n_clusters=2,
                                  assign_labels="discretize",
                                  random_state=0)
    cluster_ids = model.fit_predict(combined)

    # The rows after the training samples belong to the test set.
    predictedLabels = cluster_ids[len(train):]
    return predictedLabels
# Example #4
# 0
class ClusteringPredictiveModel:
    """Cluster cases and train one random-forest classifier per cluster.

    Cases are grouped by spectral clustering on a precomputed affinity
    matrix; every cluster gets its own ``RandomForestClassifier`` trained on
    the encoded events of the cases in that cluster.
    """

    def __init__(self,
                 similarities,
                 Activity_train,
                 case_id_col,
                 event_col,
                 label_col,
                 timestamp_col,
                 cat_cols,
                 numeric_cols,
                 n_clusters,
                 n_estimators,
                 random_state=22,
                 fillna=True,
                 pos_label="A_Pending"):
        """Store configuration and build encoders, clusterer and classifiers.

        Parameters
        ----------
        similarities
            Precomputed affinity matrix handed to the spectral clustering in
            ``fit``.
        Activity_train
            Object whose ``index`` holds the case ids; the index is masked
            with the cluster assignments, so it is presumably aligned with
            the rows of ``similarities`` -- verify against the caller.
        case_id_col, event_col, label_col, timestamp_col
            Column names in the event data frame.
        cat_cols, numeric_cols
            Column lists forwarded to ``LastStateEncoder``.
        n_clusters : int
            Number of clusters and of per-cluster classifiers.
        n_estimators : int
            Number of trees of each random forest.
        random_state : int
            Seed for KMeans and the random forests.
        fillna : bool
            Forwarded to ``LastStateEncoder``.
        pos_label
            Stored on the instance; not used by the visible methods.
        """
        # columns
        self.case_id_col = case_id_col
        self.label_col = label_col
        self.pos_label = pos_label

        self.n_clusters = n_clusters

        self.freq_encoder = FrequencyEncoder(case_id_col, event_col)
        self.data_encoder = LastStateEncoder(case_id_col, timestamp_col,
                                             cat_cols, numeric_cols, fillna)
        self.clusteringKMeans = KMeans(n_clusters, random_state=random_state)
        self.aff_matrix = similarities
        # Use the requested number of clusters. This was hard-coded to 6,
        # which disagreed with the ``n_clusters`` classifiers created below
        # and with the ``range(self.n_clusters)`` loops in fit/predict_proba.
        self.clustering = SpectralClustering(n_clusters=n_clusters,
                                             affinity='precomputed')
        self.clss = [
            RandomForestClassifier(n_estimators=n_estimators,
                                   random_state=random_state)
            for _ in range(n_clusters)
        ]
        self.data_freqs = 0
        self.Activity_train = Activity_train

    def fit(self, X, y=None):
        """Cluster the training cases and fit one classifier per cluster.

        ``X`` is the event data frame; ``y`` is ignored (the label is read
        from ``X[self.label_col]``). Returns ``self``.
        """
        # Cluster the cases on the precomputed affinity matrix.
        cluster_assignments = self.clustering.fit_predict(self.aff_matrix)

        # Train a classifier for each cluster.
        train_case_ids = np.array(self.Activity_train.index)
        for cl in range(self.n_clusters):
            cases = list(train_case_ids[cluster_assignments == cl])
            tmp = X[X[self.case_id_col].isin(cases)]
            tmp = self.data_encoder.transform(tmp)
            self.clss[cl].fit(
                tmp.drop([self.case_id_col, self.label_col], axis=1),
                tmp[self.label_col])
        return self

    def predict_proba(self, X, Activity_test):
        """Predict class probabilities for the cases in ``X``.

        Each case is routed to the classifier of its cluster. As side effects
        ``self.data_freqs``, ``self.Activity_test`` and ``self.actual`` (the
        one-hot encoded true labels) are set, and the number of predictions
        per cluster is printed.

        Returns a data frame with the case id column plus one probability
        column per class; classes a cluster's classifier never saw are
        filled with 0.
        """
        # Encode events as frequencies (stored, not used for routing below).
        self.data_freqs = self.freq_encoder.transform(X)
        self.Activity_test = Activity_test

        # NOTE(review): scikit-learn's SpectralClustering does not provide a
        # ``predict`` method, so this call is expected to raise
        # AttributeError; an out-of-sample assignment strategy is needed.
        cluster_assignments = self.clustering.predict(self.Activity_test)

        # Predict outcomes for each cluster.
        cols = [self.case_id_col] + list(self.clss[0].classes_)
        preds = pd.DataFrame(columns=cols)
        self.actual = pd.DataFrame(columns=cols)
        test_case_ids = np.array(self.Activity_test.index)
        for cl in range(self.n_clusters):

            # Select cases belonging to the given cluster.
            cases = list(test_case_ids[cluster_assignments == cl])

            if len(cases) > 0:
                tmp = X[X[self.case_id_col].isin(cases)]

                # Encode data attributes.
                tmp = self.data_encoder.transform(tmp)

                # Make predictions.
                new_preds = pd.DataFrame(self.clss[cl].predict_proba(
                    tmp.drop([self.case_id_col, self.label_col], axis=1)))
                new_preds.columns = self.clss[cl].classes_
                # Assumes the encoder returns one row per case, in the same
                # order as ``cases`` -- TODO confirm.
                new_preds[self.case_id_col] = cases
                preds = pd.concat([preds, new_preds],
                                  axis=0,
                                  ignore_index=True,
                                  sort=False)

                # Extract actual label values.
                actuals = pd.get_dummies(tmp[self.label_col])
                actuals[self.case_id_col] = tmp[self.case_id_col]
                self.actual = pd.concat([self.actual, actuals],
                                        axis=0,
                                        ignore_index=True,
                                        sort=False)
                print(' ', len(new_preds), end='')
            else:
                print(' 0', end='')
        print('')
        preds.fillna(0, inplace=True)
        self.actual.fillna(0, inplace=True)

        return preds
# Example #5
# 0
def main(event, context):
    """Cluster the addresses of a request by latitude/longitude.

    ``event['body']`` is a JSON string with:

    * ``data``: list of items, each with ``address`` and
      ``location`` (``lat`` / ``lng``),
    * ``clusters`` (optional): number of clusters, default 2,
    * ``algo`` (optional): one of ``spectral``, ``minikmeans``, ``meanshift``,
      ``affinity``, ``agglo``, ``birch``, ``gaussian``; anything else
      (including a missing value) selects k-means.

    ``context`` is not used. Returns the value of ``createResponse`` for
    either the clustering result or ``{'error': 'no data'}`` when the event
    has no body.
    """
    if 'body' not in event:
        return createResponse({ 'error': 'no data'})
    body = json.loads(event['body'])
    dataObject = body['data']

    # Build the coordinate matrix in one step instead of growing it with
    # np.append (which copies the whole array on every iteration). The
    # reshape keeps the (0, 2) shape for an empty request.
    X = np.array(
        [[float(address['location']['lat']),
          float(address['location']['lng'])]
         for address in dataObject],
        dtype=float,
    ).reshape(-1, 2)

    n_clusters = 2
    if 'clusters' in body:
        n_clusters = body['clusters']

    algoName = 'kmeans'
    if 'algo' in body:
        algoName = body['algo']

    # A single if/elif chain: the original used independent ``if`` statements
    # whose final ``else`` belonged only to the 'gaussian' test, so every
    # other algorithm's result was overwritten by k-means.
    # ``np.int`` was removed in NumPy 1.24; builtin ``int`` is equivalent.
    if algoName == 'spectral':
        algo = SpectralClustering(n_clusters=n_clusters, eigen_solver='arpack', affinity='nearest_neighbors')
        algo.fit(X)
        prediction = algo.labels_.astype(int)
    elif algoName == 'minikmeans':
        algo = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters, n_init=10, max_no_improvement=10, verbose=0)
        algo.fit(X)
        prediction = algo.predict(X)
    elif algoName == 'meanshift':
        bandwidth = estimate_bandwidth(X, quantile=0.3)
        algo = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        algo.fit(X)
        prediction = algo.labels_.astype(int)
    elif algoName == 'affinity':
        algo = AffinityPropagation(damping=0.8)
        algo.fit(X)
        prediction = algo.labels_.astype(int)
    elif algoName == 'agglo':
        connectivity = kneighbors_graph(X, n_neighbors=2, include_self=False)
        # NOTE(review): newer scikit-learn releases renamed ``affinity`` to
        # ``metric`` for AgglomerativeClustering -- check the pinned version.
        algo = AgglomerativeClustering(linkage='average', affinity='cityblock', n_clusters=n_clusters, connectivity=connectivity)
        algo.fit(X)
        prediction = algo.labels_.astype(int)
    elif algoName == 'birch':
        algo = Birch(n_clusters=n_clusters, threshold=0.1, branching_factor=10)
        algo.fit(X)
        prediction = algo.labels_.astype(int)
    elif algoName == 'gaussian':
        algo = GaussianMixture(n_components=n_clusters, covariance_type='full')
        algo.fit(X)
        prediction = algo.predict(X)
    else:
        # Default / unknown algorithm: plain k-means. The model must be
        # fitted before predicting (the original no-'algo' path skipped fit).
        algo = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
        algo.fit(X)
        prediction = algo.predict(X)

    print(prediction)

    clusterData = [
        {
            'address': address['address'],
            'lat': float(address['location']['lat']),
            'lng': float(address['location']['lng']),
            'cluster': int(prediction[index]),
        }
        for index, address in enumerate(dataObject)
    ]

    dataWithMetaData = {
        'clusters': n_clusters,
        'algo': algoName,
        'addresses': len(dataObject),
        'clusterData': clusterData
    }
    return createResponse(dataWithMetaData)
# Example #6
# 0
# Train a k-nearest-neighbours classifier on X / Y (defined earlier in the
# script) and report accuracy plus true/false positive rates on a 30 %
# hold-out split.
#print(Y.head())
scaler = Normalizer().fit(X)
trainX = scaler.transform(X)
# NOTE(review): the normalized ``trainX`` is never used -- the model below is
# trained on the raw ``X``. Confirm whether ``np.array(trainX)`` was intended.
traindata = np.array(X)
trainlabel = np.array(Y)
traindata, testdata, trainlabel, testlabel = model_selection.train_test_split(
    traindata, trainlabel, test_size=0.3)
#print(testdata.shape)
#print(traindata.shape)

model = KNeighborsClassifier()
model.fit(traindata, trainlabel)
print(model)
# make predictions
expected = testlabel
predicted = model.predict(testdata)
#np.savetxt('res/predictedKNN.txt', predicted, fmt='%01d')
# summarize the fit of the model
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="binary")
precision = precision_score(expected, predicted, average="binary")
f1 = f1_score(expected, predicted, average="binary")

cm = metrics.confusion_matrix(expected, predicted)
print(cm)
# scikit-learn's confusion matrix is indexed [true label, predicted label],
# so for binary labels row 0 holds (TN, FP) and row 1 holds (FN, TP).
# The original computed cm[0][0]/sum(cm[0]) (the true-negative rate) as "tpr"
# and cm[1][1]/sum(cm[1]) (the true-positive rate) as "fpr".
tpr = float(cm[1][1]) / np.sum(cm[1])  # TP / (TP + FN)
fpr = float(cm[0][1]) / np.sum(cm[0])  # FP / (FP + TN)
print("%.3f" % tpr)
print("%.3f" % fpr)
print("Accuracy")
print("%.3f" % accuracy)
# To visualize the K-Means clustering, I wrote the code in the comment above,
# but it raised a ValueError:
# "Only sparse matrices with 32-bit integer indices are accepted. Got int64 indices."
# I therefore defined a convert_to_32bit_indices function, but that did not work either.
# -------------------------------------------------------------------------------

# Spectral Clustering

# For Spectral Clustering, I run the convert_to_64bit_indices function to get
# rid of "RuntimeError: nnz of the result is too large" so that Spectral Clustering
# can run properly. However, I still could not get Spectral Clustering to run.
def convert_to_64bit_indices(x):
    """Convert the index arrays of ``x`` to 64-bit integers, in place.

    ``x`` must expose ``indptr`` and ``indices`` attributes (as compressed
    sparse matrices do). Both are replaced by ``int64`` arrays and the same
    object is returned.
    """
    # np.asarray only copies when the dtype actually has to change.
    # ``np.array(..., copy=False)`` raises on NumPy >= 2.0 whenever a copy is
    # required, which is exactly the int32 -> int64 case handled here.
    x.indptr = np.asarray(x.indptr, dtype=np.int64)
    x.indices = np.asarray(x.indices, dtype=np.int64)
    return x
X = convert_to_64bit_indices(X)

from sklearn.cluster import SpectralClustering
# NOTE: with the default affinity ('rbf') only ``gamma`` is used;
# ``n_neighbors`` takes effect only for affinity='nearest_neighbors'.
spectral = SpectralClustering(n_clusters=2, random_state=44, gamma=1.0, n_neighbors=10)
# SpectralClustering has no ``predict`` method; ``fit_predict`` fits the
# model and returns the cluster label of every row of X.
preds = spectral.fit_predict(X)
print(spectral.affinity_matrix_)

# Gaussian Mixture Model
from sklearn.mixture import GaussianMixture
# GaussianMixture is given dense input here; densify the sparse matrix once
# instead of on every call.
X_dense = X.toarray()
gmm = GaussianMixture(n_components=2)
gmm_labels = gmm.fit_predict(X_dense)
proba_lists = gmm.predict_proba(X_dense)
print(f"The score of Gaussian Mixture model is: {gmm.score(X_dense)}")