Example #1
    def colorClustering(self, HSV):
        if "No matched user" in HSV:
            centroid = "error:No matched user or No matched purchase list"
        else:
            ## Number of clusters
            n_hsv, n_ctrl = len(self.hsv_list), 7
            if n_hsv == 1:
                n_clusters = 2
            elif n_hsv / n_ctrl <= 2:
                if n_hsv < 5:
                    n_clusters = n_hsv
                else:
                    n_clusters = 5
            else:
                n_clusters = (n_hsv // n_ctrl) + 5

            ## Build the clustering dataset
            X = HSV[:, :, :3].reshape(HSV.shape[0] * HSV.shape[1],
                                      HSV.shape[2])
            ## Algorithm selection: hierarchical clustering
            algorithm = AgglomerativeClustering(n_clusters=n_clusters,
                                                affinity="euclidean")
            ## Run the clustering
            with ignore_warnings(category=UserWarning):
                algorithm.fit(X)
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)

            ## Find the centroid of each cluster
            clf = NearestCentroid()
            centroid = clf.fit(X, y_pred).centroids_

        return centroid
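
The pattern above works because AgglomerativeClustering only exposes labels_ for the data it was fitted on (there is no predict method), and NearestCentroid is then fitted on those labels to recover one centroid per cluster. A minimal, self-contained sketch of that pattern on synthetic data (shapes and cluster count are illustrative, not taken from the original project):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import NearestCentroid

# Synthetic "pixels": 200 samples with 3 channels, standing in for HSV values
rng = np.random.RandomState(0)
X = rng.rand(200, 3)

algorithm = AgglomerativeClustering(n_clusters=4)
algorithm.fit(X)

# Hierarchical clustering only yields labels for the fitted data
y_pred = algorithm.labels_.astype(int)

# Recover one centroid per cluster, as colorClustering does
centroids = NearestCentroid().fit(X, y_pred).centroids_
print(centroids.shape)  # (4, 3)
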
def get_silhouette_score(df, X, n_clusters, model='KM'):
    '''
    Calculate silhouette score for clustered dataframe.

    :param df: dataframe to cluster
    :param X: dense binary array for silhouette scoring
    :param n_clusters: number of clusters for model to cluster data into
    :param model: the clustering algorithm to be applied to the data, default = 'KM' (k-modes)
    :returns: silhouette score
    '''
    # Initialize clusterer and set random state, if possible
    if model == 'AG':
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, affinity='cosine', linkage='average').fit(X)
        labels = clusterer.labels_
        sil_avg = silhouette_score(X, labels, metric='hamming')

    elif model == 'KM':
        clusterer = kmodes.KModes(n_clusters=n_clusters, n_init=5, init='Huang', verbose=1)
        labels = clusterer.fit_predict(df)
        sil_avg = silhouette_score(X, labels, metric='hamming')

    elif model == 'GM':
        clusterer = GaussianMixture(n_components=n_clusters, covariance_type='tied', max_iter=20, n_init=50, random_state=42, verbose=1).fit(X)
        labels = clusterer.predict(X)
        sil_avg = silhouette_score(X, labels, metric='hamming')

    else:
        raise ValueError("model must be one of 'AG', 'KM' or 'GM'")

    return sil_avg
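
A usage sketch for get_silhouette_score on hypothetical one-hot encoded data; the 'AG' branch is chosen so the optional kmodes dependency is not needed, and the affinity argument follows the function as written (an older scikit-learn API):

import numpy as np
import pandas as pd

# Hypothetical one-hot encoded answers: 100 respondents, 12 binary columns
rng = np.random.RandomState(42)
X = rng.randint(0, 2, size=(100, 12))
df = pd.DataFrame(X)

# df is only needed by the 'KM' branch; X is the dense binary array for scoring
score = get_silhouette_score(df, X, n_clusters=4, model='AG')
print("average silhouette (hamming): %.3f" % score)
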
Example #3
class PureStylo:
    
    def train(self, bookset):

        self.agg = AgglomerativeClustering(n_clusters = len(bookset))
        bookX = []

        for b in bookset:
            databook = ngrams(b, self.gramn)
            fdist = FreqDist(databook)
            common = fdist.most_common(100)

            # Keep only the n-gram counts: the n-gram tuples themselves are not
            # numeric, so AgglomerativeClustering cannot consume them directly.
            inputlist = [count for _, count in common]
            bookX.append(inputlist)

        # AgglomerativeClustering has no predict(); keep the fitted labels and a
        # nearest-centroid classifier (assumed imported from sklearn.neighbors)
        # so classify() can assign new books to one of the learned clusters.
        labels = self.agg.fit_predict(bookX)
        self.clf = NearestCentroid().fit(bookX, labels)


    def classify(self, book):
        grams = ngrams(book, self.gramn)
        fdist = FreqDist(grams)
        common = fdist.most_common(100)

        X = [count for _, count in common]

        return self.clf.predict([X])
        
    def __init__(self, gramn):
        self.gramn = gramn
Example #4
    def check_cluster_sizes_vs_hclust(self):
        print(f"here is the cluster sizes we're guessing w kmeans: {self.train_cluster_count_plot}")
        hclust = AgglomerativeClustering(n_clusters=len(self.train_clusters["cluster"].unique()))
        hclust.fit(self.X)
        hclust_clusters = pd.DataFrame(
            {"train_row": range(self.X.shape[0]),
             # AgglomerativeClustering has no predict(); use the labels_ from fit()
             "cluster": hclust.labels_})
        hclust_cluster_count_plot = ggplot(hclust_clusters, aes("cluster")) + \
                                        geom_bar()
        print(hclust_cluster_count_plot)
    def fitAndPredict(self):
        startTime = time.time()
        k = 10
        agglomerative = AgglomerativeClustering(n_clusters=k)
        agglomerative = agglomerative.fit(self.featureVectorList)
        # AgglomerativeClustering has no predict() or cluster_centers_; reuse the
        # labels_ assigned during fit() and derive centroids from them
        # (NearestCentroid is assumed imported, as in the first example above)
        self.kMeanslabels = agglomerative.labels_
        print("Clustered using AgglomerativeClustering in [%.3f seconds]" %
              (time.time() - startTime))
        self.kMeanscentroids = NearestCentroid().fit(
            self.featureVectorList, agglomerative.labels_).centroids_
        labels = agglomerative.labels_

        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)
        print('Estimated number of clusters: %d' % n_clusters_)
        print('Estimated number of noise points: %d' % n_noise_)
def my_Kmeans(x, y, k=4, time=10, return_NMI=False):

    x = np.array(x)
    x = np.squeeze(x)
    y = np.array(y)

    if len(y.shape) > 1:
        y = np.argmax(y, axis=1)
    estimator = AgglomerativeClustering(affinity='cosine',
                                        compute_full_tree='auto',
                                        connectivity=None,
                                        linkage='complete',
                                        memory=None,
                                        n_clusters=k)
    ARI_list = []  # adjusted_rand_score results
    NMI_list = []
    silhouette_score_list = []
    if time:
        for i in range(time):
            #            estimator.fit(x, y)
            y_pred = estimator.fit_predict(x, y)
            score = normalized_mutual_info_score(y, y_pred)
            NMI_list.append(score)
            s2 = adjusted_rand_score(y, y_pred)
            ARI_list.append(s2)
            # silhouette_score
            labels = estimator.labels_
            s3 = silhouette_score(x, labels, metric='euclidean')
            silhouette_score_list.append(s3)
        # print('NMI_list: {}'.format(NMI_list))
        score = sum(NMI_list) / len(NMI_list)
        s2 = sum(ARI_list) / len(ARI_list)
        s3 = sum(silhouette_score_list) / len(silhouette_score_list)
        print(
            'NMI (10 avg): {:.4f} , ARI (10avg): {:.4f}, silhouette(10avg): {:.4f}'
            .format(score, s2, s3))

    else:
        estimator.fit(x, y)
        # AgglomerativeClustering has no predict(); use the fitted labels_
        y_pred = estimator.labels_
        score = normalized_mutual_info_score(y, y_pred)
        print("NMI on all label data: {:.5f}".format(score))
    if return_NMI:
        return score
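
A usage sketch for my_Kmeans on three synthetic blobs; despite its name the helper wraps AgglomerativeClustering, and it assumes the metric functions and the (older-API) affinity argument used above are available in the surrounding module:

import numpy as np

# Three well-separated blobs of 20 points each, with integer labels 0/1/2
rng = np.random.RandomState(0)
x = np.vstack([rng.randn(20, 2) + offset for offset in (0, 5, 10)])
y = np.repeat([0, 1, 2], 20)

# Averages NMI, ARI and silhouette over 10 identical runs and prints them
nmi = my_Kmeans(x, y, k=3, time=10, return_NMI=True)
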
Example #7
def clustering(idTfidf, num_clu, term_num):
    docFeature = idTfidf
    vecTfidf = {}
    for file in idTfidf:
        row = np.zeros(len(idTfidf[file]), dtype=int)
        col = list(idTfidf[file].keys())
        val = list(idTfidf[file].values())
        vec = csc_matrix((np.array(val), (np.array(row), np.array(col))), shape=(1, term_num))
        vecTfidf[file] = vec.todense().tolist()[0]
    # print vecTfidf
    features = list(vecTfidf.values())
    # print features

    selection = 'GM'  # selecting model here!!! Options: AgglomerativeClustering as AC, SpectralClustering as SC, GMM

    if selection == 'AC':
        model = AC(n_clusters=num_clu, affinity='cosine', linkage='average')
    if selection == 'SC':
        model = SC(n_clusters=num_clu, affinity='cosine')
    if selection == 'GMM':
        model = GMM(n_components=num_clu, covariance_type='full')
    if selection == 'GM':
        model = GM(n_components=num_clu)
        model.fit(features)
        res = model.predict(features)
    else:
        res = model.fit_predict(features)

    resDic = {}
    doc_ids = list(docFeature.keys())
    for i in range(len(res)):
        # dict.has_key() and indexing dict.keys() are Python 2 only
        if res[i] not in resDic:
            resDic[res[i]] = []
        resDic[res[i]].append(int(doc_ids[i]))
    result = list(resDic.values())
    # print result
    with open('gt_GMRes.json', 'w') as f:
        f.write(json.dumps(result))

    return result
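
A usage sketch for clustering() with a tiny hand-made tf-idf dictionary (doc id -> {term index: weight}); it assumes GM refers to sklearn's GaussianMixture, as the selection comment suggests, and the call writes gt_GMRes.json as a side effect:

# Hypothetical tf-idf vectors over a 6-term vocabulary
idTfidf = {
    '0': {0: 0.9, 1: 0.1},
    '1': {0: 0.8, 1: 0.2},
    '2': {0: 0.7, 2: 0.3},
    '3': {3: 0.6, 4: 0.4},
    '4': {3: 0.5, 5: 0.5},
    '5': {4: 0.9, 5: 0.1},
}

groups = clustering(idTfidf, num_clu=2, term_num=6)
print(groups)  # e.g. [[0, 1, 2], [3, 4, 5]]
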
class clu_model():
    def __init__(self,method = 'kmeans'):
        self.method = method
        self.clu_model = None
        self.para = None
        
    def fit(self,x,para = None):
        if self.method == 'kmeans':
            self.clu_model = KMeans(para)
            self.para = para
            self.clu_model.fit(x)
             
        elif self.method == 'DBSCAN':  # density-based clustering
            self.clu_model = DBSCAN(para)
            self.para = para
            self.clu_model.fit(x)
        
        elif self.method == 'Agg':  # agglomerative clustering
            self.clu_model = AgglomerativeClustering(para)
            self.para = para
            self.clu_model.fit(x)
            
    def predict(self,x):
        # KMeans can predict() on new data; DBSCAN and AgglomerativeClustering
        # cannot, so fall back to the labels_ assigned during fit()
        if hasattr(self.clu_model, 'predict'):
            return self.clu_model.predict(x)
        return self.clu_model.labels_
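
A usage sketch for the clu_model wrapper; 'Agg' forwards para to AgglomerativeClustering's first positional argument (n_clusters), and predict() falls back to the fitted labels_ for models without a predict method:

import numpy as np

# Two obvious blobs of 30 points each
rng = np.random.RandomState(0)
x = np.vstack([rng.randn(30, 2), rng.randn(30, 2) + 8])

wrapper = clu_model(method='Agg')
wrapper.fit(x, para=2)
print(wrapper.predict(x))  # labels assigned during fit
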
full_data = np.empty((150, 3), dtype='float32')
for i in range(150):
    full_data[i, 0] = x[i, 0]
    full_data[i, 1] = x[i, 0]
    full_data[i, 2] = y[i]

np.random.shuffle(full_data)

training_data = full_data[:130, :]
testing_data = full_data[130:, :]

model = KMeans()
model.fit(training_data[:, :2], training_data[:, 2])

test_results = np.array(
    [model.predict(i[:2].reshape(1, -1)) for i in testing_data],
    dtype='float32')

correct = 0

for idx, _ in enumerate(test_results):
    if test_results[idx] == testing_data[idx, 2]:
        correct += 1

print('model Accuracy:\t{}'.format(correct / len(test_results)))
plt.scatter(x[:, 0], x[:, 1], c=y)
plt.show()

# Colour every point by its predicted cluster. full_data was shuffled, so
# predict on x directly to keep points and colours aligned.
plt.scatter(x[:, 0], x[:, 1], c=model.predict(x[:, :2]))
plt.show()
Example #10
class ClusteringSegmentation:
    def __init__(self, algorithm, n_clusters):
        """
        Initialize a clustering segmentation object.

        Args:
            algorithm: the algorithm to use for clustering segmentation ('kmeans', 'gmm', 'affinity', 'aglo' or 'spectral')
            n_clusters: the number of clusters
        """
        self.algorithm = algorithm.lower()
        self.n_clusters = n_clusters

        if self.algorithm == 'kmeans':
            self.model = KMeans(n_clusters=self.n_clusters,
                                max_iter=300,
                                tol=0.0001)
        elif self.algorithm == 'gmm':
            self.model = GaussianMixture(n_components=self.n_clusters,
                                         covariance_type='full',
                                         tol=0.0001,
                                         reg_covar=1e-06,
                                         max_iter=300)
        elif self.algorithm == "affinity":
            self.model = AffinityPropagation(affinity='euclidean',
                                             convergence_iter=15,
                                             damping=0.5,
                                             max_iter=200,
                                             preference=None,
                                             verbose=False)
        elif self.algorithm == 'aglo':
            pass
        elif self.algorithm == 'spectral':
            pass
        else:
            raise Exception("Algorithm is not yet implemented")

    def fit(self, image):
        """
        Compute parameters of the model.

        Args:
            image: a ndarray representing an image (x, y, color_dimension)

        Returns:
            Nothing
        """
        f_dim = image.shape[-1] if len(image.shape) > 2 else 1
        X = image.reshape(-1, f_dim)

        if self.algorithm == 'aglo':
            connectivity = img_to_graph(image)
            self.model = AgglomerativeClustering(n_clusters=self.n_clusters,
                                                 affinity='euclidean',
                                                 connectivity=connectivity,
                                                 compute_full_tree=False,
                                                 linkage='average')
        elif self.algorithm == 'spectral':
            return None

        self.model.fit(X)

    def predict(self, image):
        """
        Predict the cluster of each pixel of an image.

        Args:
            image: an ndarray representing an image (x, y, color_dim)

        Returns:
            an ndarray representing the image (x, y, cluster)

        """
        f_dim = image.shape[-1] if len(image.shape) > 2 else 1
        X = image.reshape(-1, f_dim)

        if self.algorithm == 'spectral':
            graph = img_to_graph(image)
            graph.data = np.exp(-graph.data / graph.data.std())

            X_clustered = spectral_clustering(graph,
                                              n_clusters=self.n_clusters,
                                              eigen_solver='arpack')
        else:
            # AgglomerativeClustering exposes labels_ rather than predict()
            if hasattr(self.model, 'predict'):
                X_clustered = self.model.predict(X)
            else:
                X_clustered = self.model.labels_

        return X_clustered.reshape(image.shape[0], image.shape[1])

    def fit_predict(self, image):
        """
        Compute parameters of the model and predict the cluster of each pixel of an image.

        Args:
            image: an ndarray representing an image (x, y, color_dim)

        Returns:
            an ndarray representing the image (x, y, cluster)

        """
        self.fit(image)
        return self.predict(image)
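
A usage sketch for ClusteringSegmentation on a small random RGB image; 'kmeans' is used so the fitted model supports predict():

import numpy as np

# A 16 x 16 RGB "image" with random pixel values
rng = np.random.RandomState(0)
image = rng.rand(16, 16, 3)

seg = ClusteringSegmentation('kmeans', n_clusters=3)
mask = seg.fit_predict(image)
print(mask.shape)  # (16, 16): one cluster id per pixel
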
Example #11
def train_cluster(data_type=0,
                  dimension_reduction=0,
                  cluster_way=0,
                  n_components=50,
                  threshold=2,
                  n_clusters=210,
                  branching_factor=50,
                  linkage=0,
                  max_iter=500,
                  eps=1.0):
    if data_type == 0:
        train_data = load_stage2_tf_idf("")
    elif data_type == 1:
        train_data = load_stage2_tf_idf("")
        nn_data = load_nn_stage2_features()
        train_data = pd.merge(train_data, nn_data, 'left', on="file_name")
    elif data_type == 2:
        train_data = load_nn_stage2_features()
    elif data_type == 3:
        train_data = load_stage2_tf_idf("1000")
        nn_data = load_nn_stage2_features()
        train_data = pd.merge(train_data, nn_data, 'left', on="file_name")
        dll = load_stage2_tf_idf("_dll")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_stage2_tf_idf("_hkey", "first")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_stage2_tf_idf("_hkey", "last")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        train_data.fillna(0, inplace=True)
    elif data_type == 4:
        train_data = load_stage2_tf_idf("1000")
        nn_data = load_nn_stage2_features()
        train_data = pd.merge(train_data, nn_data, 'left', on="file_name")
        dll = load_stage2_tf_idf("_dll")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_stage2_tf_idf("_hkey", "first")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_stage2_tf_idf("_hkey", "last")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_clustering_statics_files()
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        train_data.fillna(0, inplace=True)

    file_name = train_data["file_name"]
    train_data.drop(columns=["file_name"], inplace=True)
    X = StandardScaler(with_mean=False).fit_transform(train_data)
    origin_data = X

    if dimension_reduction == 0:
        pass
    elif dimension_reduction == 1:
        model = IncrementalPCA(n_components=n_components)
        X = model.fit_transform(X)
    elif dimension_reduction == 2:
        model = NMF(n_components=n_components,
                    init='random',
                    random_state=0,
                    max_iter=max_iter)
        X = model.fit_transform(X)
    elif dimension_reduction == 3:
        model = PCA(n_components=n_components)
        X = model.fit_transform(X)

    print(len(X[0]))
    if cluster_way == 0:
        mode = ["ward", "complete", "average", "single"]
        db = AgglomerativeClustering(n_clusters=n_clusters,
                                     linkage=mode[linkage]).fit(X)
        labels = db.labels_
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join(
            "predictions",
            "aggcl" + "_" + str(n_clusters) + "_" + str(data_type) + "_" +
            str(dimension_reduction) + "_" + str(n_components) + ".csv"),
                  index=False)
        print(len(set(labels)))
    elif cluster_way == 1:
        db = Birch(branching_factor=branching_factor,
                   n_clusters=n_clusters,
                   threshold=threshold).fit(X)
        labels = db.predict(X)
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join("predictions", "birch" + ".csv"), index=False)
        print(len(set(labels)))
    elif cluster_way == 2:
        db = hdbscan.HDBSCAN(min_cluster_size=40)
        db.fit(X)
        labels = db.labels_
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join("predictions", "hdb_40" + ".csv"), index=False)
        print(len(set(labels)))
    elif cluster_way == 3:
        db = DBSCAN(eps=eps, n_jobs=-1).fit(X)
        labels = db.labels_
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join(
            "predictions",
            "db" + "_" + str(eps) + "_" + str(dimension_reduction) + ".csv"),
                  index=False)
        print(len(set(labels)))
    elif cluster_way == 4:
        labels = np.zeros((len(file_name), ))
        pd.DataFrame(data={
            "id": file_name,
            "family_id": np.zeros((len(file_name), ))
        }).to_csv(os.path.join("predictions", "zeros" + ".csv"), index=False)
    elif cluster_way == 5:
        db = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
        labels = db.labels_
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join("predictions",
                               "kmeans" + str(n_clusters) + ".csv"),
                  index=False)
        print(len(set(labels)))
    elif cluster_way == 6:
        db = AffinityPropagation().fit(X)
        labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)

    scores = evaluate_cluster_performance(origin_data, labels)
    evaluate_cluster_performance(X, labels)
    return scores
Example #12
class geodata:
    '''
        a dedicated class to geolocation data ( longitude , latitude )
    '''
    def __init__(self, X, map_path=None):
        '''
        X : data array of shape ( N , 2 )  where columns are  :  ( longitude , latitude )
        map_path : file containing the map image on which we will scatter/plot  our geodata
        '''

        self.X = X
        self.map = plt.imread(map_path) if (map_path is not None) else None
        self.model = None
        self.n_clusters = 2
        # box frame for the map to fit the background correctly : ( left , right  , bottom , top )
        h = 0.005
        self.box = X[:, 0].min() - h, X[:, 0].max() + h, X[:, 1].min(
        ) - h, X[:, 1].max() + h

    # getters
    def getModel(self):
        return self.model

    def apply_clustering(self, model="kmeans", K=2, random_seed=0):
        '''
        :param model: clustering model ( kmeans | spectral | agglo )   , agglo stands for agglomerative
        :param K: number of clusters
        :param random_seed: random seed for random numbers generations
        '''

        # Keep K between 1 and N ( number of examples )
        if (K > self.X.shape[0]):
            K = self.X.shape[0]
        elif (K < 1):
            K = 1

        self.n_clusters = K

        if (model == "kmeans"):
            self.model = KMeans(n_clusters=K,
                                random_state=random_seed).fit(self.X)

        elif (model == "spectral"):
            self.model = SpectralClustering(n_clusters=K,
                                            random_state=random_seed,
                                            affinity="laplacian").fit(self.X)

        elif (model == "agglo"):
            self.model = AgglomerativeClustering(n_clusters=K,
                                                 linkage='complete').fit(
                                                     self.X)

        else:
            raise Exception(
                "the clustering model should be 'kmeans'|'spectral'|'agglo' or None for no clustering and nothing else "
            )

    def plot_data(self,
                  plot_type="scatter",
                  map_transparency=0.4,
                  figsize=(16, 10),
                  save=True,
                  map_name="Brisbane"):
        '''
        :param plot_type  : - will have no effect if clustering is None
                            - determines type of the clustering plot "scatter" | "regions" | "distances"
        :param map_transparency : float in range (0,1) , to determine the transparency
        :param figsize: the plot figure size , a tuple of int ( width , height ) proportion
        :param save: a boolean to choose to save the figures outputs on './outputs/specific_name.png'  or not
        :param map_name: the name of the map appearing in the figure title

        it plots the geodata according to the previous params
        '''

        ###########      general plots configuration for all types of plots    ##############
        fig, ax = plt.subplots(figsize=figsize)
        # => customized title
        map_str = " " if (map_name is None) else "on '" + map_name + "' map"
        plot_str = " simple plot " if (
            self.model is None) else " clustering plot "
        model_str = "" if (
            self.model is None
        ) else " using " + self.model.__class__.__name__ + " with n_clusters = " + str(
            self.n_clusters)
        title = 'Geolocation data' + plot_str + map_str + model_str

        ax.set_title(title)
        ax.set_ylabel('Latitude')
        ax.set_xlabel('Longitude')

        # =>  background Map
        if (map_transparency is not None and map_transparency > 0):
            # alpha for transparency
            ax.imshow(self.map,
                      extent=self.box,
                      alpha=map_transparency,
                      aspect='auto')

        ######################  plot different types of visualizations ########################

        # simple scatter
        if (self.model is None):
            ax.scatter(self.X[:, 0], self.X[:, 1])
        else:

            # labels
            if hasattr(self.model, 'labels_'):
                labels = self.model.labels_.astype(int)
            else:
                labels = self.model.predict(self.X)

            # preparing colors etc ...
            if (self.n_clusters <= 5):
                colors = [
                    '#0000ff', '#ff3300', '#00cc66', '#cc0099', '#00ffcc'
                ]
            else:
                # generate random colors for n_clusters
                colors = np.random.rand(self.n_clusters, 3)

            if (plot_type == "scatter"):

                for cluster, color in zip(range(self.n_clusters), colors):
                    ax.scatter(self.X[labels == cluster, 0],
                               self.X[labels == cluster, 1],
                               color=color)

                    if isinstance(self.model, KMeans):
                        centroids = self.model.cluster_centers_
                        ax.scatter(centroids[cluster, 0],
                                   centroids[cluster, 1],
                                   color=color,
                                   marker="o",
                                   edgecolors="black",
                                   s=300)

            elif (plot_type == "regions"):

                if not isinstance(self.model, KMeans):
                    raise Exception(
                        "It is impossible to plot regions for this model , we cannot retrieve the cluster membership of new data "
                    )

                #  PLOT REGIONS
                h = (self.box[1] - self.box[0]) / 100
                xx, yy = np.meshgrid(np.arange(self.box[0], self.box[1], h),
                                     np.arange(self.box[2], self.box[3], h))

                # Obtain labels for each point in mesh. Use last trained model.
                Z = self.model.predict(np.c_[xx.ravel(), yy.ravel()])

                Z = Z.reshape(xx.shape)

                my_color_map = LinearSegmentedColormap.from_list(
                    "my_color_map", colors)
                plt.imshow(Z,
                           interpolation='nearest',
                           extent=self.box,
                           cmap=my_color_map,
                           aspect='auto',
                           origin='lower',
                           alpha=0.4)

                # scatter points
                for cluster, color in zip(range(self.n_clusters), colors):
                    ax.scatter(self.X[labels == cluster, 0],
                               self.X[labels == cluster, 1],
                               color=color)

                # scatter centroids if Kmeans
                if isinstance(self.model, KMeans):
                    centroids = self.model.cluster_centers_
                    plt.scatter(centroids[:, 0],
                                centroids[:, 1],
                                marker='x',
                                s=400,
                                linewidths=5,
                                color='black',
                                zorder=10)

            elif (plot_type == "distances"):

                if isinstance(self.model, KMeans):
                    centroids = self.model.cluster_centers_
                    for cluster, color in zip(range(len(centroids)), colors):
                        ax.scatter(self.X[labels == cluster, 0],
                                   self.X[labels == cluster, 1],
                                   color=color)
                        ax.scatter(centroids[cluster, 0],
                                   centroids[cluster, 1],
                                   color=color,
                                   marker="o",
                                   edgecolors="blue",
                                   s=300)

                        for x in self.X[labels == cluster]:
                            plt.plot([centroids[cluster, 0], x[0]],
                                     [centroids[cluster, 1], x[1]],
                                     color=color)
                else:
                    raise Exception(
                        " distances plot requires clustering to be 'kmeans' to plot distances to centroids "
                    )

        ################################  save figures  #######################################
        if (save == True):
            if (not os.path.isdir("./outputs")):
                os.makedirs("./outputs/")
            plt.savefig('outputs/' + title + '.png')

        plt.show()
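
A usage sketch for geodata with synthetic (longitude, latitude) pairs; no map image is passed, so map_transparency is set to None to skip the background, and the coordinate ranges are arbitrary:

import numpy as np

rng = np.random.RandomState(0)
X = np.column_stack([rng.uniform(153.0, 153.1, 200),
                     rng.uniform(-27.5, -27.4, 200)])

geo = geodata(X)  # no map_path, so no background image
geo.apply_clustering(model="agglo", K=3)
geo.plot_data(plot_type="scatter", map_transparency=None, save=False)
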
    def agglomerative(self, score_df, col_name):
        agglomerative = AgglomerativeClustering(n_clusters = self.clust_num, affinity=self.affinity, linkage=self.linkage)
        # AgglomerativeClustering has no predict(); fit_predict returns the labels directly
        res_clusters = agglomerative.fit_predict(score_df[col_name])
        return res_clusters
Example #14
#print(Y.head())
scaler = Normalizer().fit(X)
trainX = scaler.transform(X)
traindata = np.array(X)
trainlabel = np.array(Y)
traindata, testdata, trainlabel, testlabel = model_selection.train_test_split(
    traindata, trainlabel, test_size=0.3)
#print(testdata.shape)
#print(traindata.shape)

model = KNeighborsClassifier()
model.fit(traindata, trainlabel)
print(model)
# make predictions
expected = testlabel
predicted = model.predict(testdata)
#np.savetxt('res/predictedKNN.txt', predicted, fmt='%01d')
# summarize the fit of the model
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="binary")
precision = precision_score(expected, predicted, average="binary")
f1 = f1_score(expected, predicted, average="binary")

cm = metrics.confusion_matrix(expected, predicted)
print(cm)
tpr = float(cm[0][0]) / np.sum(cm[0])
fpr = float(cm[1][1]) / np.sum(cm[1])
print("%.3f" % tpr)
print("%.3f" % fpr)
print("Accuracy")
print("%.3f" % accuracy)
Example #15
def unsupervised_clu(feature, part, model_selection):
    if part:
        if feature == 'graph':
            docFeature = json.loads(
                open('rmMultiPart1WOZeroGraph.json').read())
        if feature == 'doc2vec':
            docFeature = json.loads(open('rmMultiPart1Doc2vec.json').read())
        if feature == 'comb':
            walk = json.loads(open('rmMultiPart1WOZeroGraph.json').read())
            dv = json.loads(open('rmMultiPart1Doc2vec.json').read())
            docFeature = {}
            for doc in walk:
                val = walk[doc] + dv[doc]
                docFeature[doc] = val
        groundTruth = json.loads(open('rmMultiPart1CluInd.json').read())
        num_clu = len(groundTruth)  # number of clusters in each part
    else:
        rmMulti = True  # False #
        if rmMulti:
            if feature == 'graph':
                docFeature = json.loads(
                    open('rmMultiCluDatabaseWOZeroGraph.json').read())
            if feature == 'doc2vec':
                docFeature = json.loads(
                    open('rmMultiCluDatabaseDoc2vec.json').read())
            if feature == 'comb':
                walk = json.loads(
                    open('rmMultiCluDatabaseWOZeroGraph.json').read())
                dv = json.loads(open('rmMultiCluDatabaseDoc2vec.json').read())
                docFeature = {}
                for doc in walk:
                    val = walk[doc] + dv[doc]
                    docFeature[doc] = val
            groundTruth = json.loads(open('rmMultiGroundTruth.json').read())
            num_clu = len(
                groundTruth
            )  # number of clusters after removing documents appearing multi-cluster, #doc = 1274 (3 all 0s for walk)
        else:
            if feature == 'graph':
                docFeature = json.loads(
                    open('cluDatabaseWOZeroGraph.json').read())
            if feature == 'doc2vec':
                docFeature = json.loads(open('cluDatabaseDoc2vec.json').read())
            if feature == 'comb':
                walk = json.loads(open('cluDatabaseWOZeroGraph.json').read())
                dv = json.loads(open('cluDatabaseDoc2vec.json').read())
                docFeature = {}
                for doc in walk:
                    val = walk[doc] + dv[doc]
                    docFeature[doc] = val
            groundTruth = json.loads(open('groundTruth.json').read())
            num_clu = len(
                groundTruth
            )  # number of clusters before removing documents appearing multi-cluster, #doc = 1393 (3 all 0s for walk)

    features = list(docFeature.values())
    if model_selection == 'AC':
        model = AC(n_clusters=num_clu, affinity='cosine', linkage='average')
    if model_selection == 'SC':
        model = SC(n_clusters=num_clu, affinity='cosine')
    if model_selection == 'GMM':
        model = GMM(n_components=num_clu, covariance_type='full')
    if model_selection == 'KMeans':
        model = KMeans(n_clusters=num_clu)
    if model_selection == 'GM':
        model = GM(n_components=num_clu)
        model.fit(features)
        res = model.predict(features)
    else:
        res = model.fit_predict(features)
    resDic = {}
    doc_ids = list(docFeature.keys())
    for i in range(len(res)):
        # dict.has_key() and indexing dict.keys() are Python 2 only
        if res[i] not in resDic:
            resDic[res[i]] = []
        resDic[res[i]].append(int(doc_ids[i]))
    result = list(resDic.values())

    return (result, groundTruth)
Example #16

# In[137]:

from sklearn import decomposition
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# 3. Visualize the results to make them easier to inspect.
# Use PCA to reduce the 4-dimensional data to 2 dimensions for display
X, y = prepare_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = KMeans(n_clusters=3).fit(X_train)
pca = decomposition.PCA(n_components=2)
X = pca.fit_transform(X_test)
pos = pd.DataFrame()
plt.scatter(X[:, 0], X[:, 1], c=model.predict(X_test), s=50, cmap='rainbow')

# In[136]:

# Visualize the clusters in 3D with Axes3D
fig = plt.figure(1, figsize=(4, 3))
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
ax.scatter(X_train.values[:, 0],
           X_train.values[:, 1],
           X_train.values[:, 2],
           c=model.predict(X_train),
           cmap='rainbow')

ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
Example #17
class evaluate:
    def __init__(self, estimator_label, config, failed_file=False):
        self.estimator_label = estimator_label
        self.config = config
        self.loaded = self.load_estimator()

        self.res = {}
        if not failed_file:
            self.failed = open(estimator_label + "_failed.txt", mode="a")
            self.failed.flush()

    def run_all(self, path="./Datasets/processed/", verbose=False, nmi=False):

        if os.path.exists(path):

            allFiles = glob.glob(path + "*.csv")
            count_load = 0
            count_train = 0
            count_test = 0

            for dfile in allFiles:
                try:
                    data = pd.read_csv(dfile, header=None, na_values='?')
                    self.y = data.iloc[:, -1]

                    self.data = data.iloc[:, :-1]
                    filename_w_ext = os.path.basename(dfile)
                    print(filename_w_ext)
                    filename, file_extension = os.path.splitext(filename_w_ext)
                    self.data_label = filename
                    count_load += 1
                    if verbose:
                        print("loaded " + str(count_load) + " out of " +
                              str(len(allFiles)))
                    if self.data.isnull().values.any():
                        imp = SimpleImputer(missing_values=np.nan,
                                            strategy='mean')
                        imp = imp.fit(self.data)
                        self.data = pd.DataFrame(imp.transform(self.data))

                except:
                    print("couldn't load " + dfile)

                if self.loaded:
                    try:
                        self.fit_data()
                    except:
                        continue
                    if len(set(list(self.estimator.labels_))) == 1:
                        continue
                    count_train += 1
                    if verbose:
                        print("fitted  " + str(count_load) + " out of " +
                              str(len(allFiles)))

                    if len(list(set(self.estimator.labels_))) / len(
                            self.data) > 0.75:
                        continue
                    try:
                        Metric = self.eval_metrics(nmi)

                        self.res[self.data_label] = Metric

                        count_test += 1
                        if verbose:
                            print("evaluated " + str(count_load) + " out of " +
                                  str(len(allFiles)))

                    except:
                        print("evaluation problem", self.data_label,
                              self.config)
                        self.failed.write(
                            str(self.data_label) + " " + str(self.config))
                        self.failed.write("\n")
                        self.failed.flush()

                else:
                    print("model loading failed")
                    return False

        else:
            print(path + " doesn't exist")
            return False
        return True

    def load_estimator(self):
        if self.estimator_label.lower() == "kmeans":
            self.estimator = KMeans(init=self.config['init'],
                                    n_clusters=self.config['n_clusters'],
                                    algorithm=self.config["algorithm"],
                                    n_init=self.config['n_init'],
                                    max_iter=self.config["max_iter"])
            self.estimator_label = "kmeans"
            return True
        elif self.estimator_label.lower() == "meanshift":
            self.estimator = MeanShift(cluster_all=self.config["cluster_all"],
                                       bin_seeding=self.config["bin_seeding"],
                                       n_jobs=self.config["n_jobs"])
            return True
        elif self.estimator_label.lower() == "dbscan":
            self.estimator = DBSCAN(leaf_size=self.config["leaf_size"],
                                    metric=self.config["metric"],
                                    eps=self.config["eps"],
                                    min_samples=self.config["min_samples"])
            return True
        elif self.estimator_label.lower() == "affinitypropagation":
            self.estimator = AffinityPropagation(
                damping=self.config["damping"],
                convergence_iter=self.config["convergence_iter"],
                max_iter=self.config["max_iter"])
            return True
        elif self.estimator_label.lower() == "spectralclustering":
            self.estimator = SpectralClustering(
                n_clusters=self.config['n_clusters'],
                eigen_solver=self.config["eigen_solver"],
                affinity=self.config['affinity'],
                assign_labels=self.config["assign_labels"])
            return True
        elif self.estimator_label.lower() == "birch":
            self.estimator = Birch(
                n_clusters=self.config['n_clusters'],
                threshold=self.config["threshold"],
                branching_factor=self.config['branching_factor'])
            return True
        elif self.estimator_label.lower() == "optics":
            self.estimator = OPTICS(
                min_samples=self.config['min_samples'],
                cluster_method=self.config["cluster_method"],
                p=self.config['p'],
                n_jobs=self.config["n_jobs"])
            return True
        elif self.estimator_label.lower() == "gaussian":
            self.estimator = GaussianMixture(
                n_init=self.config['n_init'],
                init_params=self.config["init_params"],
                n_components=self.config['n_components'],
                covariance_type=self.config["covariance_type"])
            return True
        elif self.estimator_label.lower() == "agglomerativeclustering":
            self.estimator = AgglomerativeClustering(
                n_clusters=self.config['n_clusters'],
                linkage=self.config["linkage"])
            return True
        else:
            print("couldn't load model", self.estimator_label)
            return False

    def fit_data(self):
        self.estimator.fit(self.data)

    def predict_data(self):
        self.estimator.predict(self.data)

    def eval_metrics(self, nmi=False):
        if nmi:
            Metrics = {}
            Metrics["nmi"] = metrics.normalized_mutual_info_score(
                self.y, self.estimator.labels_)
            return Metrics

        sample_size = int(len(self.data) * 0.1)
        if sample_size < 100:
            sample_size = len(self.data)

        v = Validation(
            np.asmatrix(self.data).astype(float),
            list(self.estimator.labels_))

        Metrics = v.run_all()
        try:
            Ix = metric(self.data, self.estimator.labels_,
                        self.estimator.cluster_centers_)
            Metrics["IIndex"] = Ix.IIndex()
        except:
            Metrics["IIndex"] = "none"
        try:
            sdbw_c = sdbw(self.data, self.estimator.labels_,
                          self.estimator.cluster_centers_)
            Metrics["SDBW"] = sdbw_c.sdbw_score()
        except:
            Metrics["SDBW"] = "none"

        Metrics["ari"] = 0.0
        Metrics["ami"] = 0.0
        Metrics["nmi"] = metrics.normalized_mutual_info_score(
            self.y, self.estimator.labels_)
        Metrics["v_measure"] = 0.0
        try:
            Metrics["silhouette_score"] = metrics.silhouette_score(
                self.data,
                self.estimator.labels_,
                metric='euclidean',
                sample_size=sample_size,
                random_state=0)
        except:
            Metrics["silhouette_score"] = 0.0
        try:
            Metrics[
                "calinski_harabasz_score"] = metrics.calinski_harabasz_score(
                    self.data, self.estimator.labels_)

        except:
            Metrics["calinski_harabasz_score"] = 0.0
        '''
            sample_size=int(len(self.data)*0.1)
            if sample_size<100:
                  sample_size=len(self.data)
            Metrics={}
            Metrics["silhouette_score"] = metrics.silhouette_score(self.data, self.estimator.labels_, metric='euclidean', sample_size=sample_size,random_state=0)
            Metrics["calinski_harabasz_score"]= metrics.calinski_harabasz_score(self.data,  self.estimator.labels_) 
            Metrics["davies_bouldin_score"]=metrics.davies_bouldin_score(self.data,  self.estimator.labels_) 
            if self.estimator_label.lower()=="meanshift":
                   Metrics["SSE"]=len(self.estimator.cluster_centers_)

            if self.estimator_label.lower()=="kmeans":
                  araujo = metric(self.data, self.estimator.labels_, self.estimator.cluster_centers_)
                  Metrics["IIndex"] = 0# araujo.IIndex()
                  Metrics["SSE"]=self.estimator.inertia_
                  Metrics["nSSE"]=self.estimator.inertia_/(len(self.data)*len(self.data.columns))

                  labels_true=self.y

                  labels_true=np.array(labels_true)
                  Metrics["ARI"]=metrics.adjusted_rand_score(labels_true, self.estimator.labels_)  
                  Metrics["MIS"]=metrics.adjusted_mutual_info_score(labels_true, self.estimator.labels_)
                  Metrics["v_measure"]=metrics.v_measure_score(labels_true, self.estimator.labels_)

            else:
                  #Metrics["SSE"] = -1
                  Metrics["nSSE"] = -1
            '''

        return Metrics
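
A usage sketch for the evaluate class with a hypothetical configuration; the keys mirror those read by load_estimator() for 'agglomerativeclustering', and the dataset path is the default assumed by run_all():

config = {'n_clusters': 8, 'linkage': 'ward'}

ev = evaluate("agglomerativeclustering", config)
ev.run_all(path="./Datasets/processed/", verbose=True, nmi=True)
print(ev.res)  # per-dataset metric dictionaries
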
Example #18
def main():
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True,
                        default='input/', help="string, path to the input folder with the expression data, "
                                                  "default 'input/'")
    parser.add_argument('-ilr', '--input_lr', required=False,
                        default='input/', help="string, optional, path to the input folder with the ligands and "
                                               "receptors list, default 'input/'")
    parser.add_argument('-o', '--output', required=True,
                        default='output/', help="string, path to the output folder, default 'output/'")
    parser.add_argument('-d', '--dataType', required=True,
                        default='merfish', choices=['merfish', 'merfish_cell_line', 'starmap'],
                        help="string, type of expression data, 'merfish' for MERFISH hypothalamus data, "
                             "'merfish_cell_line' for MERFISH U-2 OS cells, 'starmap' for 'STARmap mPFC cells';"
                             "default 'merfish'")
    parser.add_argument('-g', '--gender', required=True,
                        default='Female', help="string, gender of input animal sample, default 'Female', put 'na' if "
                                               "not available")
    parser.add_argument('-b', '--behavior', required=True,
                        default='Naive', help="string, behavior of input animal sample, default 'Naive', put 'na' if "
                                              "not available")
    parser.add_argument('-c', '--cellType', required=True,
                        default='Excitatory', help="string, cell type that will be built a model for, "
                                                   "use \\ for white-space, e.g. 'OD\ Mature\ 2', default 'Excitatory'")
    parser.add_argument('-m', '--mode', required=True,
                        default='train', help="string, any of 'train', 'CV'; if 'train', then all data will be used "
                                              "for training and output a pickle file for learned parameters; if 'CV', "
                                              "then cross-validation will be conducted each time with an animal/sample "
                                              "left out and each CV run output a pickle file and prediction result, "
                                              "default 'train'")
    parser.add_argument('-c1', '--numLevel1', required=False,
                        default=1, help="integer, optional, number of classes at level 1, number of experts = "
                                        "number of classes "
                                        "at level 1 x number of classes at level 2, default 1")
    parser.add_argument('-c2', '--numLevel2', required=False,
                        default=5, help="integer, optional, number of classes at level 2, default 5")
    parser.add_argument('-e', '--epochs', required=False,
                        default=20, help="integer, optional, number of epochs to train MESSI, default 20")
    parser.add_argument('-gs', '--grid_search', required=False, type=str2bool,
                        default=False, help="boolean, optional, if conduct grid search for hyper-parameters, "
                                            "default False")
    parser.add_argument('-ns', '--n_sets', required=False,
                        default=3, help="integer, optional, number of CV sets for grid search, default 3")
    parser.add_argument('-r', '--numReplicates', required=False,
                        default=1, help="integer, optional, number of times to run with same set of parameters, "
                                        "default 1")
    parser.add_argument('-p', '--preprocess', required=False,
                        default='neighbor_cat', help="string, optional, the way to include neighborhood information; "
                                                     "neighbor_cat: include by concatenating them to the cell own "
                                                     "features; neighbor_sum: include by addinding to the cell own "
                                                     "features; anything without 'neighbor': no neighborhood "
                                                     "information will be used as features; 'baseline': only baseline "
                                                     "features; default 'neighbor_cat'")

    parser.add_argument('-tr', '--topKResponses', required=False,
                        default=None, help='integer, optional, number of top dispersed responses genes to model,'
                                           'default None (to include all response genes)')
    parser.add_argument('-ts', '--topKSignals', required=False,
                        default=None, help='integer, optional, number of top dispersed signalling genes to use as '
                                           'features, default None (to include all signalling genes)')
    # parser.add_argument('-rp', '--responsePrior', required=False,
    #                     default=None, help='string, optional, path to the response genes to be used, default None')
    # parser.add_argument('-sp', '--signalsPrior', required=False,
    #                     default=None, help='string, optional, path to the signalling genes to be used, default None')

    args = parser.parse_args()
    print(args)

    # set parameters for data
    input_path = args.input
    input_path_lr = args.input_lr
    output_path = args.output
    data_type = args.dataType
    sex = args.gender
    behavior = args.behavior
    behavior_no_space = behavior.replace(" ", "_")
    current_cell_type = args.cellType
    current_cell_type_no_space = current_cell_type.replace(" ", "_")

    # set parameters for model
    mode = args.mode
    grid_search = args.grid_search
    n_sets = int(args.n_sets)
    n_classes_0 = int(args.numLevel1)
    n_classes_1 = int(args.numLevel2)
    n_epochs = int(args.epochs)
    n_replicates = int(args.numReplicates)

    # set parameters for data processing
    preprocess = args.preprocess
    if args.topKResponses is not None:
        top_k_response = int(args.topKResponses)
    else:
        top_k_response = args.topKResponses
    if args.topKSignals is not None:
        top_k_regulator = int(args.topKSignals)
    else:
        top_k_regulator = args.topKSignals

    response_type = 'original'  # use raw values to fit the model

    if grid_search:
        condition = f"response_{top_k_response}_l1_{n_classes_0}_l2_grid_search"
    else:
        condition = f"response_{top_k_response}_l1_{n_classes_0}_l2_{n_classes_1}"

    # prepare to read data
    read_in_functions = {'merfish': [read_meta_merfish, read_merfish_data, get_idx_per_dataset_merfish],
                         'merfish_cell_line': [read_meta_merfish_cell_line, read_merfish_cell_line_data,
                                               get_idx_per_dataset_merfish_cell_line],
                         'starmap': [read_meta_starmap_combinatorial, read_starmap_combinatorial,
                                     get_idx_per_dataset_starmap_combinatorial]}

    # set data reading functions corresponding to the data type
    if data_type in ['merfish', 'merfish_cell_line', 'starmap']:
        read_meta = read_in_functions[data_type][0]
        read_data = read_in_functions[data_type][1]
        get_idx_per_dataset = read_in_functions[data_type][2]
    else:
        raise NotImplementedError(f"Now only support processing 'merfish', 'merfish_cell_line' or 'starmap'")

    # read in ligand and receptor lists
    l_u, r_u = get_lr_pairs(input_path=input_path_lr)  # may need to change to the default value

    # read in meta information about the dataset
    meta_all, meta_all_columns, cell_types_dict, genes_list, genes_list_u, \
    response_list_prior, regulator_list_prior = \
        read_meta(input_path, behavior_no_space, sex, l_u, r_u)  # TO BE MODIFIED: number of responses

    # get all available animals/samples
    all_animals = list(set(meta_all[:, meta_all_columns['Animal_ID']]))

    for _z in range(len(all_animals)):
        if mode == 'train':
            # only run once
            if _z == 0:
                test_animal = ''
            else:
                break
        else:
            test_animal = all_animals[_z]

        samples_test = np.array([test_animal])
        samples_train = np.array(list(set(all_animals) - {test_animal}))
        print(f"Test set is {samples_test}")
        print(f"Training set is {samples_train}")

        bregma = None
        # ------ read data ------
        idx_train, idx_test, idx_train_in_general, \
        idx_test_in_general, idx_train_in_dataset, \
        idx_test_in_dataset, meta_per_dataset_train, \
        meta_per_dataset_test = find_idx_for_train_test(samples_train, samples_test,
                                                        meta_all, meta_all_columns,
                                                        data_type, current_cell_type, get_idx_per_dataset,
                                                        return_in_general=False, bregma=bregma)

        # TBD: the current approach uses a lot of memory
        data_sets = []

        for animal_id, bregma in meta_per_dataset_train:
            hp, hp_cor, hp_genes = read_data(input_path, bregma, animal_id, genes_list, genes_list_u)

            if hp is not None:
                hp_columns = dict(zip(hp.columns, range(0, len(hp.columns))))
                hp_np = hp.to_numpy()
            else:
                hp_columns = None
                hp_np = None
            hp_cor_columns = dict(zip(hp_cor.columns, range(0, len(hp_cor.columns))))
            hp_genes_columns = dict(zip(hp_genes.columns, range(0, len(hp_genes.columns))))
            data_sets.append([hp_np, hp_columns, hp_cor.to_numpy(), hp_cor_columns,
                              hp_genes.to_numpy(), hp_genes_columns])
            del hp, hp_cor, hp_genes

        datasets_train = data_sets

        data_sets = []

        for animal_id, bregma in meta_per_dataset_test:
            hp, hp_cor, hp_genes = read_data(input_path, bregma, animal_id, genes_list, genes_list_u)

            if hp is not None:
                hp_columns = dict(zip(hp.columns, range(0, len(hp.columns))))
                hp_np = hp.to_numpy()
            else:
                hp_columns = None
                hp_np = None

            hp_cor_columns = dict(zip(hp_cor.columns, range(0, len(hp_cor.columns))))
            hp_genes_columns = dict(zip(hp_genes.columns, range(0, len(hp_genes.columns))))
            data_sets.append([hp_np, hp_columns, hp_cor.to_numpy(), hp_cor_columns,
                              hp_genes.to_numpy(), hp_genes_columns])
            del hp, hp_cor, hp_genes

        datasets_test = data_sets

        del data_sets

        # ------ pre-processing -------

        # construct neighborhood graph
        if data_type == 'merfish_RNA_seq':
            neighbors_train = None
            neighbors_test = None
        else:
            if data_type == 'merfish':
                dis_filter = 100
            else:
                dis_filter = 1e9

            neighbors_train = get_neighbors_datasets(datasets_train, "Del", k=10, dis_filter=dis_filter,
                                                     include_self=False)
            neighbors_test = get_neighbors_datasets(datasets_test, "Del", k=10, dis_filter=dis_filter,
                                                    include_self=False)
        # set parameters for different feature types
        lig_n = {'name': 'regulators_neighbor', 'helper': preprocess_X_neighbor_per_cell,
                 'feature_list_type': 'regulator_neighbor', 'per_cell': True, 'baseline': False,
                 'standardize': True, 'log': True, 'poly': False}
        rec_s = {'name': 'regulators_self', 'helper': preprocess_X_self_per_cell,
                 'feature_list_type': 'regulator_self', 'per_cell': True, 'baseline': False,
                 'standardize': True, 'log': True, 'poly': False}
        lig_s = {'name': 'regulators_neighbor_self', 'helper': preprocess_X_self_per_cell,
                 'feature_list_type': 'regulator_neighbor', 'per_cell': True, 'baseline': False,
                 'standardize': True, 'log': True, 'poly': False}
        type_n = {'name': 'neighbor_type', 'helper': preprocess_X_neighbor_type_per_dataset,
                  'feature_list_type': None, 'per_cell': False, 'baseline': False,
                  'standardize': True, 'log': False, 'poly': False}
        base_s = {'name': 'baseline', 'helper': preprocess_X_baseline_per_dataset, 'feature_list_type': None,
                  'per_cell': False, 'baseline': True, 'standardize': True, 'log': False, 'poly': False}

        if data_type == 'merfish_cell_line':
            feature_types = [lig_n, rec_s, base_s, lig_s]
        else:
            feature_types = [lig_n, rec_s, type_n, base_s, lig_s]

        # untransformed features
        X_trains, X_tests, regulator_list_neighbor, regulator_list_self = prepare_features(data_type, datasets_train,
                                                                                           datasets_test,
                                                                                           meta_per_dataset_train,
                                                                                           meta_per_dataset_test,
                                                                                           idx_train, idx_test,
                                                                                           idx_train_in_dataset,
                                                                                           idx_test_in_dataset,
                                                                                           neighbors_train,
                                                                                           neighbors_test,
                                                                                           feature_types,
                                                                                           regulator_list_prior,
                                                                                           top_k_regulator,
                                                                                           genes_list_u, l_u, r_u,
                                                                                           cell_types_dict)
        total_regulators = regulator_list_neighbor + regulator_list_self

        log_response = True  # take log transformation of the response genes
        Y_train, Y_train_true, Y_test, Y_test_true, response_list = prepare_responses(data_type, datasets_train,
                                                                                      datasets_test,
                                                                                      idx_train_in_general,
                                                                                      idx_test_in_general,
                                                                                      idx_train_in_dataset,
                                                                                      idx_test_in_dataset,
                                                                                      neighbors_train,
                                                                                      neighbors_test,
                                                                                      response_type, log_response,
                                                                                      response_list_prior,
                                                                                      top_k_response,
                                                                                      genes_list_u, l_u, r_u)
        if grid_search:
            X_trains_gs = copy.deepcopy(X_trains)
            Y_train_gs = copy.copy(Y_train)

        # transform features
        transform_features(X_trains, X_tests, feature_types)
        print(f"Minimum value after transformation can below 0: {np.min(X_trains['regulators_self'])}")

        # combine different types of features
        if data_type == 'merfish':
            num_coordinates = 3
        elif data_type == 'starmap' or data_type == 'merfish_cell_line':
            num_coordinates = 2
        else:
            num_coordinates = None

        if np.ndim(X_trains['baseline']) > 1 and np.ndim(X_tests['baseline']) > 1:
            X_train, X_train_clf_1, X_train_clf_2 = combine_features(X_trains, preprocess, num_coordinates)
            X_test, X_test_clf_1, X_test_clf_2 = combine_features(X_tests, preprocess, num_coordinates)
        elif np.ndim(X_trains['baseline']) > 1:
            X_train, X_train_clf_1, X_train_clf_2 = combine_features(X_trains, preprocess, num_coordinates)

        print(f"Dimension of X train is: {X_train.shape}")
        if mode == 'CV':
            print(f"Dimension of X test is: {X_test.shape}")

        # ------ modeling by MESSI ------
        for _i in range(0, n_replicates):

            # ------ set parameters ------
            model_name_gates = 'logistic'
            model_name_experts = 'mrots'

            soft_weights = True
            partial_fit_expert = True

            # specify default parameters for MESSI
            model_params = {'n_classes_0': n_classes_0,
                            'n_classes_1': n_classes_1,
                            'model_name_gates': model_name_gates,
                            'model_name_experts': model_name_experts,
                            'num_responses': Y_train.shape[1],
                            'soft_weights': soft_weights,
                            'partial_fit_expert': partial_fit_expert,
                            'n_epochs': n_epochs,
                            'tolerance': 3}

            print(f"Model parameters for training is {model_params}")

            # set up directory for saving the model
            sub_condition = f"{condition}_{model_name_gates}_{model_name_experts}"
            sub_dir = f"{data_type}/{behavior_no_space}/{sex}/{current_cell_type_no_space}/{preprocess}/{sub_condition}"
            current_dir = os.path.join(output_path, sub_dir)

            if not os.path.exists(current_dir):
                os.makedirs(current_dir)

            print(f"Model and validation results (if applicable) saved to: {current_dir}")

            if mode == 'CV':
                suffix = f"_{test_animal}_{_i}"
            else:
                suffix = f"_{_i}"

            if grid_search:
                # prepare input meta data
                if data_type == 'merfish':
                    meta_per_part = [tuple(i) for i in meta_per_dataset_train]
                    meta_idx = meta2idx(idx_train_in_dataset, meta_per_part)
                else:
                    meta_per_part, meta_idx = combineParts(samples_train, datasets_train, idx_train_in_dataset)

                # prepare parameters list to be tuned
                if data_type == 'merfish_cell_line':
                    current_cell_type_data = 'U-2_OS'
                elif data_type == 'starmap':
                    current_cell_type_data = 'STARmap_excitatory'
                else:
                    current_cell_type_data = current_cell_type

                params = {'n_classes_1': list(search_range_dict[current_cell_type_data]), 'soft_weights': [True, False],
                          'partial_fit_expert': [True, False]}

                keys, values = zip(*params.items())
                params_list = [dict(zip(keys, v)) for v in itertools.product(*values)]

                new_params_list = []
                for d in params_list:
                    if d['n_classes_1'] == 1:
                        if d['soft_weights'] and d['partial_fit_expert']:
                            # n_expert = 1, soft or hard are equivalent
                            new_params_list.append(d)
                    else:
                        if d['soft_weights'] == d['partial_fit_expert']:
                            new_params_list.append(d)
                ratio = 0.2

                # initialize with default values
                model_params_val = model_params.copy()
                model_params_val['n_epochs'] = 5  # increase for validation models to converge
                model_params_val['tolerance'] = 0
                print(f"Default model parameters for validation {model_params_val}")
                model = hme(**model_params_val)

                gs = gridSearch(params, model, ratio, n_sets, new_params_list)
                gs.generate_val_sets(samples_train, meta_per_part)
                gs.runCV(X_trains_gs, Y_train_gs, meta_per_part, meta_idx, feature_types, data_type,
                         preprocess)
                gs.get_best_parameter()
                print(f"Best params from grid search: {gs.best_params}")

                # modify the parameter setting
                for key, value in gs.best_params.items():
                    model_params[key] = value

                print(f"Model parameters for training after grid search {model_params}")

                filename = f"validation_results{suffix}.pickle"
                pickle.dump(gs, open(os.path.join(current_dir, filename), 'wb'))

            # ------ initialize the sample assignments ------

            if grid_search and 'n_classes_1' in params:
                model = AgglomerativeClustering(n_clusters=gs.best_params['n_classes_1'])
            else:
                model = AgglomerativeClustering(n_clusters=n_classes_1)

            model = model.fit(Y_train)
            hier_labels = [model.labels_]
            model_params['init_labels_1'] = hier_labels
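            # the hierarchical cluster labels on Y_train seed the level-1 expert
            # assignments of the mixture-of-experts (hme) model constructed below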

            # ------ construct MESSI  ------
            model = hme(**model_params)

            # train
            model.train(X_train, X_train_clf_1, X_train_clf_2, Y_train)
            # save the model
            filename = f"hme_model{suffix}.pickle"
            pickle.dump(model, open(os.path.join(current_dir, filename), 'wb'))

            # predict the left-out animal
            if mode == 'CV':

                Y_hat_final = model.predict(X_test, X_test_clf_1, X_test_clf_2)

                mae = abs(Y_test - Y_hat_final).mean(axis=1).mean()
                print(f"Mean absolute value for {test_animal} is {mae}")

                filename = f"test_predictions_{test_animal}_{_i}"
                np.save(os.path.join(current_dir, filename), Y_hat_final)
Exemple #19
0
predictedlabels = pd.DataFrame(predictedlabels)
predictedlabels = predictedlabels.iloc[:-5895]

score = silhouette_score(df, cluster.labels_, metric='euclidean')
print('Silhouette Score - Agglomerative Clustering:')
print(score)
print()

#applying MeanShift Clustering Algorithm
cluster = MeanShift(bandwidth=2).fit(df)

score = silhouette_score(df, cluster.labels_, metric='euclidean')
print('Silhouette Score - MeanShift Clustering:')
print(score)

y_predicted = cluster.predict(test_data)

accuracy = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy - MeanShift Clustering:')
print(accuracy)
print("Recall - MeanShift Clustering:")
print(
    metrics.recall_score(y_test,
                         y_predicted,
                         average='macro',
                         zero_division='warn'))
print("Precision - MeanShift Clustering:")
print(metrics.precision_score(y_test, y_predicted, average='macro'))
print("F1 - MeanShift Clustering:")
print(
    metrics.f1_score(y_test,
                     y_predicted,
                     average='macro'))
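
# Note (sketch): accuracy/recall/precision compare raw cluster ids against the class
# labels, so they depend on an arbitrary label numbering; a permutation-invariant
# score such as the adjusted Rand index avoids that assumption.
print("Adjusted Rand Index - MeanShift Clustering:")
print(metrics.adjusted_rand_score(y_test, y_predicted))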
            X = X_r_sales
            plot_num = 1
            params = default_base.copy()

            for n_cluster in params['clusters_range']:
                ac = AgglomerativeClustering(n_clusters=n_cluster, linkage=lkg)
                t0 = time.time()
                ac.fit(X)
                t1 = time.time()

                if hasattr(ac, 'labels_'):
                    y_pred = ac.labels_.astype(int)
                else:
                    y_pred = ac.predict(X)

                colors = np.array(
                    list(
                        islice(
                            cycle([
                                '#377eb8', '#ff7f00', '#4daf4a', '#f781bf',
                                '#a65628', '#984ea3', '#999999', '#e41a1c',
                                '#dede00'
                            ]), int(max(y_pred) + 1))))
                # add black color for outliers (if any)
                colors = np.append(colors, ["#000000"])

                n_cluster = len(set(y_pred)) - (1 if -1 in y_pred else 0)

                noise_rate = 0
Exemple #21
0
# & applying Hierarchial clustering to a set of numbers for k
dendrogram = hc.dendrogram(hc.linkage(X, method='ward', metric='euclidean'))
plt.title('Dendrogram')
plt.xlabel('')
plt.ylabel('Distances')
plt.show()
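
# Optional sketch: the same ward tree can be cut at a chosen number of clusters
# without refitting, using scipy's fcluster (this assumes `hc` above is
# scipy.cluster.hierarchy, as used for the dendrogram).
Z = hc.linkage(X, method='ward', metric='euclidean')
labels_from_tree = hc.fcluster(Z, t=2, criterion='maxclust') - 1  # relabel to 0..K-1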

###################### 3- Training ######################
K = 2
model = AgglomerativeClustering(n_clusters=K,
                                affinity='euclidean',
                                linkage='ward')
model.fit(X)

###################### 4- Testing ######################
y = model.labels_  # AgglomerativeClustering has no predict(); use the labels assigned during fit()

###################### 5- Visualization ######################
###### IMPORTANT NOTE: this visualization works for 2 dimensions only ######
colors = [
    'red', 'blue', 'lightcoral', 'indigo', 'gold', 'crimson', 'fuchsia',
    'peru', 'palegreen', 'lawngreen', 'olivedrab', 'yellow', 'darkseagreen',
    'tomato', 'orange', 'darkgreen', 'springgreen', 'darkred', 'teal',
    'midnightblue', 'brown', 'gray', 'darkviolet', 'aqua', 'purple',
    'orangered', 'turquoise', 'dodgerblue', 'deeppink'
]
for i in range(K):
    plt.scatter(X[y == i, 0],
                X[y == i, 1],
                s=100,
                c=colors[i],
Exemple #22
0
def cluster(features_map, clustering_algorithm, n_clusters, linkage='ward', affinity=None,
            eigen_solver=None, n_init=10, gamma=1.0, n_neighbors=10, eigen_tol=0.0,
            assign_labels='kmeans', eps=0.5, min_samples=5, algorithm='auto', p=None,
            compute_full_tree=True, random_state=None):
    if affinity is None:
        if clustering_algorithm == 0:
            affinity = 'euclidean'
        if clustering_algorithm == 1:
            affinity = 'rbf'

    # do clustering for every location we have in the dev set
    # read locations file
    df_locations = pd.read_csv(DATA_DIR + "poiNameCorrespondences.txt", sep="\t", header=None)
    # remove first column (names)
    locations = np.array(df_locations[1])

    score = []
    firstLoop = True
    for location in locations:
        # read the ground truth file for the images
        df_gt = pd.read_csv(DATA_DIR + GROUND_TRUTH_PATH + location + " dGT.txt", sep=",", header=None)
        # create a dictionary of the form { imageID : clusterID }
        truth = dict(zip(df_gt[0], df_gt[1]))

        # read in the image features
        features_df = []
        # work on a local selection bitmap for this location; the bitwise operations
        # below create new bitarrays, so the caller's features_map is left unchanged
        tmp_features_map = features_map
        if tmp_features_map >= ba.bitarray('1000000000'):
            # CM
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " CM.csv", sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0111111111')
        if tmp_features_map >= ba.bitarray('0100000000'):
            # CM3x3
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " CM3x3.csv", sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0011111111')
        if tmp_features_map >= ba.bitarray('0010000000'):
            # CN
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " CN.csv", sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0001111111')
        if tmp_features_map >= ba.bitarray('0001000000'):
            # CN3x3
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " CN3x3.csv", sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0000111111')
        if tmp_features_map >= ba.bitarray('0000100000'):
            # CSD
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " CSD.csv", sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0000011111')
        if tmp_features_map >= ba.bitarray('0000010000'):
            # GLRLM
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " GLRLM.csv", sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0000001111')
        if tmp_features_map >= ba.bitarray('0000001000'):
            # GLRLM3x3
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " GLRLM3x3.csv", sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0000000111')
        if tmp_features_map >= ba.bitarray('0000000100'):
            # HOG
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " HOG.csv", sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0000000011')
        if tmp_features_map >= ba.bitarray('0000000010'):
            # LBP
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " LBP.csv", sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0000000001')
        if tmp_features_map >= ba.bitarray('0000000001'):
            # LBP3x3
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " LBP3x3.csv", sep=",", header=None))

        # read the ids into an array
        ids = np.array(features_df[0][0])
        first = True
        for df_feature in features_df:
            # remove the first column with the image ids
            df_feature = df_feature.drop([0], axis=1)
            # build one array with all selected features concatenated column-wise
            if first:
                features = np.array(df_feature)
            else:
                features = np.concatenate((features, np.array(df_feature)), axis=1)
            first = False

        # min-max normalize every feature column
        features = (features - features.min(axis=0)) / (features.max(axis=0) - features.min(axis=0))
        # calculate pca components which are used instead of the real features
        pca = PCA(n_components=15)
        data = pca.fit_transform(features)
        # min-max normalize the component columns
        data = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0))

        # use feature array and number of clusters from above
        # use DBSCAN because it does not need the number of clusters
        if clustering_algorithm < 0:
            print("\n\nInvalid clustering algorithm: " + str(clustering_algorithm) + "!\n\n")
            return
        if clustering_algorithm == 0:
            model = AgglomerativeClustering(n_clusters=n_clusters, affinity=affinity, compute_full_tree=compute_full_tree, linkage=linkage)
        if clustering_algorithm == 1:
            model = SpectralClustering(n_clusters=n_clusters, eigen_solver=eigen_solver, random_state=random_state, n_init=n_init, gamma=gamma, affinity=affinity, n_neighbors=n_neighbors, eigen_tol=eigen_tol, assign_labels=assign_labels, n_jobs=1)
        if clustering_algorithm == 2:
            model = DBSCAN(eps=eps, min_samples=min_samples, algorithm=algorithm, p=p, n_jobs=1)
        if clustering_algorithm == 3:
            model = GaussianMixture(n_components=n_clusters)
        if clustering_algorithm > 3:
            print("\n\nInvalid clustering algorithm: " + str(clustering_algorithm) + "!\n\n")
            return
        # create dictionary { imageID, predictedCluster }
        if clustering_algorithm == 3:
            model.fit(data)
            prediction = dict(zip(ids, model.predict(data)))
        else:
            prediction = dict(zip(ids, model.fit_predict(data)))

        # there isn't a ground truth for every image, so only the subset with ground truth
        # is used for comparison; the predictions then line up with the truth values
        prediction_subset = {x: prediction[x] for x in truth.keys() if x in prediction}

        # calculate performance using adjusted rand score:
        ars = adjusted_rand_score(list(truth.values()), list(prediction_subset.values()))
        # move score from [-1;1] to [0;1] and add to score array
        score.append(ars / 2 + 0.5)

    # calculate statistics over all scores
    return {'min': min(score), 'mean': (sum(score)/len(score)), 'sd': np.std(score), 'median': st.median(score), 'max': max(score)}
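
# Minimal usage sketch (hypothetical values; assumes DATA_DIR, FEATURE_PATH,
# GROUND_TRUTH_PATH and the imports used inside cluster() are defined as in the
# snippet above). Each bit of features_map enables one descriptor, in the order
# CM, CM3x3, CN, CN3x3, CSD, GLRLM, GLRLM3x3, HOG, LBP, LBP3x3.
if __name__ == '__main__':
    selected_features = ba.bitarray('1100000000')  # CM + CM3x3 only
    stats = cluster(selected_features, clustering_algorithm=0, n_clusters=10)
    print(stats)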
Exemple #23
0
                    Y_test = np.array([score]).reshape(cnt + 1, 1)
                else:
                    X_test = np.concatenate(
                        (X_test, np.array(weight[idx[doc]]).reshape(1, d)),
                        axis=0).reshape(cnt + 1, d)
                    Y_test = np.concatenate(
                        (Y_test, np.array([score]).reshape(1, 1)),
                        axis=0).reshape(cnt + 1, 1)
                cnt += 1
                line = next(f)

            #call sklearn.Lasso()
            #clflasso = Lasso().fit(X_train, Y_train)

            print('predicting...')
            Y_hat = clfRand.predict(X_test)  #predict
            Y_hat = Y_hat[:, np.newaxis]
            MAE = np.mean(np.abs(Y_hat - Y_test))
            print('MAE: %f' % MAE)
            # print(Y_hat)

            for idx, doc in enumerate(test_set.keys()):
                if idx >= cnt:
                    break
                res.write(QID + ' ' + doc + ' ')
                res.write(str(float(Y_hat[idx])))
                res.write('\n')
            MAE_TOTAL += MAE / 50
            print(QID + ' MAE: %f' % MAE)
            print('===================================\n')
            res.write('\n MAE: %f \n' % MAE)
Exemple #24
0
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.25)

undersample = RandomUnderSampler(random_state=0)
X_train2, Y_train2 = undersample.fit_resample(X_train, Y_train)

oversample = SMOTE()
X_train3, Y_train3 = oversample.fit_resample(X_train,Y_train)

cls = KNeighborsClassifier(n_neighbors=1, metric='kulsinski')

l = [(X_train,Y_train),(X_train2,Y_train2),(X_train3,Y_train3)]
  
for i,j in l:
  cls.fit(i,j)
  Y_pred = cls.predict(X_test)
  print(classification_report(Y_test,Y_pred))

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

def plot_decision_boundaries(X, y, model_class, **model_params):
    """
    Function to plot the decision boundaries of a classification model.
    This uses just the first two columns of the data for fitting 
    the model as we need to find the predicted value for every point in 
    scatter plot.
    Arguments:
            X: Feature data as a NumPy-type array.
            y: Label data as a NumPy-type array.
def plot_silhouette(df, X, n_clusters, model='KM'):
    '''
    Plot silhouette sample scores for input dataframe.

    :param df: dataframe to cluster
    :param X: dense binary array for silhouette scoring
    :param n_clusters: number of clusters for model to cluster data into
    :param model: the clustering algorithm to be applied to the data, default = 'KM' (k-modes)
    :returns: None, saved plot of silhouette sample scores for each cluster
    '''
    fig = plt.figure(figsize=(8,6))
    ax = fig.add_subplot(111)
    ax.set_xlim([-0.6, 1])
    # Insert blank space between silhouette plots of individual clusters
    ax.set_ylim([0, len(df) + (n_clusters + 1) * 10])

    # Initialize clusterer and set random state, if possible
    if model == 'AG':
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, affinity='cosine', linkage='average').fit(X)
        labels = clusterer.labels_

    elif model == 'KM':
        clusterer = kmodes.KModes(n_clusters=n_clusters, n_init=3, init='Huang', verbose=1)
        labels = clusterer.fit_predict(df)

    elif model == 'GM':
        clusterer = GaussianMixture(n_components=n_clusters, covariance_type='tied', max_iter=20, n_init=50, random_state=42, verbose=1).fit(X)
        labels = clusterer.predict(X)

    # Compute the silhouette score (average value for all the samples) and the silhouette score for each sample
    silhouette_avg = silhouette_score(X, labels, metric='hamming')
    sample_silhouette_values = silhouette_samples(X, labels, metric='hamming')

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax.set_title('The silhouette plot for the various clusters')
    ax.set_xlabel('The silhouette coefficient values')
    ax.set_ylabel('Cluster label')

    # Add a vertical line for the average silhouette score of all values
    ax.axvline(x=silhouette_avg, color='red', linestyle='--')

    ax.set_yticks([])  # Clear the yaxis labels / ticks
    ax.set_xticks([-0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.title('Silhouette analysis for {} with {} clusters'.format(clusterer.__class__.__name__, n_clusters))

    plt.savefig('sil_{}_{}.png'.format(clusterer.__class__.__name__, n_clusters), dpi=200)
    plt.close()
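
# Minimal usage sketch (assumes the names used by plot_silhouette -- np, plt, cm,
# AgglomerativeClustering, silhouette_score, silhouette_samples -- are imported as in
# the snippet above): cluster a small synthetic binary dataframe and save the plot.
import pandas as pd

toy = np.random.randint(0, 2, size=(200, 12))
toy[:, 0] = 1  # avoid all-zero rows, which break the cosine affinity
toy_df = pd.DataFrame(toy)
plot_silhouette(toy_df, toy_df.to_numpy(), n_clusters=3, model='AG')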
Exemple #26
0
print(" répartition clusters avec K mean : ", df_kmean['label_cluster'].value_counts())

df_kmean.to_csv('fichier_kmean_7clust.csv',sep=',')




'''---------------- Building the agglomerative (bottom-up) hierarchical clustering ---------------'''



for i in range(2, 11):

    clust_ah = AgglomerativeClustering(n_clusters=i, affinity='euclidean', linkage='ward')

    clust = clust_ah.fit_predict(matrice_norme)
    silhouette = silhouette_score(matrice_norme, clust)

    print('For', i, 'clusters, the silhouette score is:', silhouette)



cah=AgglomerativeClustering(n_clusters=7,affinity='euclidean',linkage='ward')

cah.fit_predict(matrice_norme)
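# fit_predict also stores the cluster assignments on the estimator (cah.labels_),
# so they can be joined back onto the customer data if needed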

    
'''Test plotting the dendrogram'''
z=linkage(matrice, method='ward',metric='euclidean')
plt.figure(figsize=(10, 10))  
plt.title("Dendrogramme clustering clients")  
Exemple #27
0
         data = np.genfromtxt("./winequality-red.csv", dtype= np.float32, delimiter = ";", skip_header= 1)
         X=data[:, 0:11]
         
         cluster=int(input("Input the number of clusters: "))
         model = AgglomerativeClustering(n_clusters= cluster)
         model.fit(X)
         first=int(input("Input the number of the first wine: "))
         second=int(input("Input the number of the second wine: "))
         
 
         if model.labels_[first]== model.labels_[second]:
             print("Result : %d and %d are in the same cluster"%(first,second))
         else:
             print("Result : %d and %d are in the different cluster"%(first,second))
     if sel=='k':
         
         from sklearn.cluster import KMeans
         data = np.genfromtxt("./winequality-red.csv", dtype= np.float32, delimiter = ";", skip_header= 1)
         X=data[:, 0:11]
     
         cluster=int(input("Input the number of clusters: "))
         model = KMeans(n_clusters = cluster, random_state=0)
         model.fit(X)
         first=int(input("Input the number of the first wine: "))
         second=int(input("Input the number of the second wine: "))
         
         if model.predict([data[first, 0:11]])== model.predict([data[second, 0:11]]):
             print("Result : %d and %d are in the same cluster"%(first,second))
         else:
             print("Result : %d and %d are in the different cluster"%(first,second))