def transform(self, altitude, velocity, heading, wind_vel_y, wind_vel_x, loc_x,
                  loc_y):
        """
        Return the parameters to a multivariate normal distribution describing the ground impact probability under a
        ballistic descent.

        This function takes into account wind and returns the result in the NED frame with x, y corresponding to
        East and North respectively. The distribution takes in the location of the event in the existing NED frame
        and transforms the path-aligned event frame (PAEF) to the NED frame at the specified location.

        If passing an array, all other arrays must be the same shape. This is usually a single dimension of samples
        generated with scipy.stats.<some distribution>.rvs

        The method is as follows:
            1. The ballistic model returns one-dimensional results from the specified params; if these are arrays,
                then a number of samples is created.
            2. The heading(s) are rotated into the NED frame
            3. A vectorised operation is performed to firstly rotate the PAEF results into the NED frame
            4. The second part of the vectorised operation then multiplies the wind vector (in NED) by the time
                taken to impact the ground. This is then added to the first part.
            5. The samples are used to fit a multivariate Gaussian from which the parameters are generated.

        :param altitude: the altitude in metres
        :type altitude: float or np.array
        :param velocity: the velocity over the ground of the aircraft in the direction of flight in m/s
        :type velocity: float or np.array
        :param heading: the ground track bearing of the aircraft in deg (North is 000)
        :type heading: float or np.array
        :param wind_vel_x: the x component of the wind in m/s
        :type wind_vel_x: float or np.array
        :param wind_vel_y: the y component of the wind in m/s
        :type wind_vel_y: float or np.array
        :param loc_x: event x location
        :type loc_x: int
        :param loc_y: event y location
        :type loc_y: int
        :return: a tuple of (means, covariances) of the distribution, followed by the mean impact velocity and mean
            impact angle
        :rtype: tuple of (np.array of shape (2,), np.array of shape (2, 2)), float, float
        """
        # Compute impact distances and times in the PAE frame
        # The velocity vector is assumed to be aligned with path vector, hence v_y is 0
        d_i, v_i, a_i, t_i = self.bm.compute_ballistic_distance(altitude, velocity, 0)

        # Compensate for x,y axes being rotated compared to bearings
        theta = bearing_to_angle(heading)
        # Form the array structure required and transform
        arr = np.vstack((np.zeros(d_i.shape), d_i, t_i, theta, wind_vel_x, wind_vel_y))
        transformed_arr = np.apply_along_axis(paef_to_ned_with_wind, 0, arr)
        # Remove nan rows
        transformed_arr = transformed_arr[:, ~np.isnan(transformed_arr).all(axis=0)]
        gm = GaussianMixture()
        gm.fit_predict(transformed_arr.T)
        # If the event and NED origins match (both offsets are zero), no need to translate
        if not loc_x and not loc_y:
            means = gm.means_[0]
        else:
            means = gm.means_[0] + np.array([loc_x, loc_y])
        # Only a single mixture component is fitted here, so take the first (and
        # only) component's mean and covariance
        return (means, gm.covariances_[0]), v_i.mean(), a_i.mean()
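The helper paef_to_ned_with_wind is defined elsewhere in this project; the sketch below is a hypothetical reconstruction, assuming the column layout (0, d_i, t_i, theta, wind_x, wind_y) built above and a standard 2D rotation plus wind drift.

import numpy as np

def paef_to_ned_with_wind(col):
    # One sample per column: (y_paef=0, x_paef=d_i, t_i, theta, wind_x, wind_y)
    y_paef, x_paef, t_i, theta, wind_x, wind_y = col
    # Rotate the path-aligned impact point into the NED frame...
    rot = np.array([[np.cos(theta), -np.sin(theta)],
                    [np.sin(theta),  np.cos(theta)]])
    ned = rot @ np.array([x_paef, y_paef])
    # ...then add the drift the wind imparts over the descent time t_i.
    return ned + t_i * np.array([wind_x, wind_y])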
Example #2
def pos_posterior(ra_s, dec_s, number=2):
    func = GaussianMixture(n_components=number, covariance_type='full')
    samples = []
    for x, y in zip(ra_s, dec_s):
        samples.append(np.array([x, y]))
    func.fit_predict(samples)
    return func
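A hypothetical usage sketch for pos_posterior with synthetic sky positions (the RA/Dec values here are made up):

import numpy as np

rng = np.random.default_rng(0)
ra_s = rng.uniform(0.0, 360.0, 500)
dec_s = rng.uniform(-90.0, 90.0, 500)
gm = pos_posterior(ra_s, dec_s, number=2)
print(gm.means_)  # fitted component centres in (RA, Dec)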
Example #3
def gmm(model, dataloader, params):
    gm = GaussianMixture(n_components=model.num_clusters,
                         covariance_type=params['gmm_covariance_type'],
                         tol=params['gmm_tol'],
                         max_iter=params['gmm_max_iter'])
    gm.fit_predict(embedded_outputs(model, dataloader, params))
    weights = torch.from_numpy(gm.means_)
    model.clustering.set_weight(weights.to(params['device']))
Example #4
def run_km_em(perm_x, perm_y, dname, clstr):
    SSE_km_perm = []
    ll_em_perm = []
    acc_km_perm = []
    acc_em_perm = []
    adjMI_km_perm = []
    adjMI_em_perm = []
    homo_km_perm = []
    homo_em_perm = []
    comp_km_perm = []
    comp_em_perm = []
    silhou_km_perm = []
    bic_em_perm = []
    clk_time = []

    for k in clstr:
        st = perf_counter()  # from time import perf_counter; time.clock() was removed in Python 3.8
        km = KMeans(n_clusters=k, random_state=10)
        gmm = GMM(n_components=k, random_state=10)

        SSE_km_perm.append(-km.score(perm_x, km.fit_predict(perm_x)))
        ll_em_perm.append(gmm.score(perm_x, gmm.fit_predict(perm_x)))
        acc_km_perm.append(cluster_acc(perm_y, km.fit_predict(perm_x)))
        acc_em_perm.append(cluster_acc(perm_y, gmm.fit_predict(perm_x)))
        adjMI_km_perm.append(ami(perm_y, km.fit_predict(perm_x)))
        adjMI_em_perm.append(ami(perm_y, gmm.fit_predict(perm_x)))
        homo_km_perm.append(
            metrics.homogeneity_score(perm_y, km.fit_predict(perm_x)))
        homo_em_perm.append(
            metrics.homogeneity_score(perm_y, gmm.fit_predict(perm_x)))
        comp_km_perm.append(
            metrics.completeness_score(perm_y, km.fit_predict(perm_x)))
        comp_em_perm.append(
            metrics.completeness_score(perm_y, gmm.fit_predict(perm_x)))
        silhou_km_perm.append(
            metrics.silhouette_score(perm_x, km.fit_predict(perm_x)))
        bic_em_perm.append(gmm.bic(perm_x))
        clk_time.append(perf_counter() - st)
        print(k, perf_counter() - st)

    dbcluster = pd.DataFrame({
        'k': clstr,
        'SSE_km': SSE_km_perm,
        'll_em': ll_em_perm,
        'acc_km': acc_km_perm,
        'acc_em': acc_em_perm,
        'adjMI_km': adjMI_km_perm,
        'adjMI_em': adjMI_em_perm,
        'homo_km': homo_km_perm,
        'homo_em': homo_em_perm,
        'comp_km': comp_km_perm,
        'comp_em': comp_em_perm,
        'silhou_km': silhou_km_perm,
        'bic_em': bic_em_perm,
        'clk_time': clk_time
    })

    dbcluster.to_csv('./results/cluster_{}.csv'.format(dname), sep=',')
Example #5
def final_run():
    gmm = GaussianMixture(n_components=12,
                          n_init=20,
                          covariance_type='full',
                          random_state=4322)
    data, reverse_dict = helper_reg_season()
    labels = gmm.fit_predict(data)
    final_dict = {}
    for index in range(len(labels)):
        curr_category = labels[index]
        curr_row = data[index]
        curr_person = reverse_dict[repr(curr_row)]
        curr_row.append(curr_category)
        curr_row = [float(i) for i in curr_row]
        final_dict[curr_person] = curr_row
    with open('datasets/with_cat_reg_season_advanced.json', 'w') as fp:
        json.dump(final_dict, fp)

    with open("datasets/with_cat_reg_season_advanced.json") as fp:
        json_object = json.load(fp)
    keys = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    cat_dict = {key: [] for key in keys}
    for person in json_object:
        curr_person_data = json_object[person]
        curr_category = curr_person_data[-1]
        curr_lst = cat_dict[curr_category]
        curr_lst.append(person)
        cat_dict[curr_category] = curr_lst

    with open('datasets/final_data_reg_season.json', 'w') as fp:
        json.dump(cat_dict, fp)

    data, reverse_dict = helper_playoffs()
    labels = gmm.fit_predict(data)
    final_dict = {}
    for index in range(len(labels)):
        curr_category = labels[index]
        curr_row = data[index]
        curr_person = reverse_dict[repr(curr_row)]
        curr_row.append(curr_category)
        curr_row = [float(i) for i in curr_row]
        final_dict[curr_person] = curr_row
    with open('datasets/with_cat_playoffs_advanced.json', 'w') as fp:
        json.dump(final_dict, fp)

    with open("datasets/with_cat_playoffs_advanced.json") as fp:
        json_object = json.load(fp)
    keys = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    cat_dict = {key: [] for key in keys}
    for person in json_object:
        curr_person_data = json_object[person]
        curr_category = curr_person_data[-1]
        curr_lst = cat_dict[curr_category]
        curr_lst.append(person)
        cat_dict[curr_category] = curr_lst

    with open('datasets/final_data_playoffs.json', 'w') as fp:
        json.dump(cat_dict, fp)
Example #6
def nn_em_dimredux(data, labels, layers, set_name, pca_max_comp,
                   ica_components, grp_components, skb_k, em_clusters):

    pca = PCA(n_components=pca_max_comp)
    PCAreducedData = pca.fit_transform(data)

    ica = FastICA(n_components=ica_components)
    ICAreducedData = ica.fit_transform(data)

    grp = GRP(n_components=grp_components)
    GRPreducedData = grp.fit_transform(data)

    skb = SKB(f_classif, k=skb_k)
    SKBreducedData = skb.fit_transform(data, labels)

    redux_models = [pca, ica, grp, skb]
    reduced_data = [
        PCAreducedData, ICAreducedData, GRPreducedData, SKBreducedData
    ]

    PCA_scores = []
    ICA_scores = []
    GRP_scores = []
    SKB_scores = []
    for c in em_clusters:
        em = GMM(n_components=c)

        clustered = em.fit_predict(PCAreducedData).reshape(-1, 1)
        PCA_scores.append(run_nn(clustered, labels, layers))

        em = GMM(n_components=c)
        clustered = em.fit_predict(ICAreducedData).reshape(-1, 1)
        ICA_scores.append(run_nn(clustered, labels, layers))

        em = GMM(n_components=c)
        clustered = em.fit_predict(GRPreducedData).reshape(-1, 1)
        GRP_scores.append(run_nn(clustered, labels, layers))

        em = GMM(n_components=c)
        clustered = em.fit_predict(SKBreducedData).reshape(-1, 1)
        SKB_scores.append(run_nn(clustered, labels, layers))

    test_scores = [PCA_scores, ICA_scores, GRP_scores, SKB_scores]
    labels = ['PCA', 'ICA', 'GRP', 'SKB']
    a = np.arange(1, 5, 1)
    plot_xys2([em_clusters for i in a], test_scores, labels,
              set_name + ' NN Trained on EM from \n' + 'Dim Reduced Set',
              'Clusters', 'Testing Accuracy')
Example #7
def Gaussian_Mixture_model(X, k):
    model = GaussianMixture(n_components=k)  # build the Gaussian mixture model
    yhat = model.fit_predict(X)  # fit the GMM and label each point
    clusters = np.unique(yhat)  # array of the unique cluster labels
    for cluster in clusters:  # loop through the clusters and plot them
        row_ix = np.where(yhat == cluster)
        plt.scatter(X[row_ix, 0], X[row_ix, 1], s=5)

    print("GMM Accuracy")
    # Note: y_train is assumed to exist in the enclosing scope, and raw accuracy
    # is only meaningful if the cluster ids happen to align with the class labels.
    print(accuracy_score(y_train, yhat))
    plt.title("Gaussian Mixture Model")
    plt.show()
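A hypothetical usage sketch on synthetic blobs; y_train is a module-level name the function reads, so the sketch defines it before calling:

from sklearn.datasets import make_blobs

X_demo, y_train = make_blobs(n_samples=300, centers=3, random_state=0)
Gaussian_Mixture_model(X_demo, k=3)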
Example #8
def global_gmm(image, mask, patchSize=1, n=5, label_only=True):
    assert (type(image) is np.ndarray and image.dtype == np.uint8 and len(
        image.shape) == 2), "The input image has to be a uint8 2D numpy array."
    assert (type(mask) is np.ndarray and mask.dtype == np.uint8 and len(
        mask.shape) == 2), "The input mask has to be a uint8 2D numpy array."
    assert type(n) is int
    assert type(label_only) is bool
    assert type(patchSize) is int and patchSize > 0
    patch = (patchSize, patchSize)
    image_reduced = (image if patchSize == 1 else block_reduce(
        image, patch, np.mean, 255))
    mask_reduced = mask if patchSize == 1 else block_reduce(
        mask, patch, np.min)
    global_mean = int(np.mean(image_reduced[mask_reduced > 0]))
    label = np.zeros(mask_reduced.shape, dtype=np.uint8)
    levels = []
    fg_ind = mask_reduced.nonzero()
    while True:
        data = image_reduced[fg_ind].reshape((-1, 1))
        gmm = GaussianMixture(n_components=min(n, len(data)), random_state=123)
        prediction, means = gmm.fit_predict(data), gmm.means_
        min_label, min_mean = np.argmin(means), np.min(means)
        if min_mean >= global_mean:
            break
        levels.append(min_mean)
        min_ind = prediction == min_label
        label[fg_ind[0][min_ind], fg_ind[1][min_ind]] = len(levels)
        fg_ind = fg_ind[0][~min_ind], fg_ind[1][~min_ind]
    label_resized = (label if patchSize == 1 else cv2.resize(
        label, image.shape[::-1], interpolation=cv2.INTER_NEAREST))
    image_labeled = (None if label_only else img_as_ubyte(
        label2rgb(label_resized, image, bg_label=0)))
    return image_labeled, mask_reduced, label, levels
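A hypothetical usage sketch for global_gmm on a synthetic uint8 image; it assumes skimage's block_reduce and cv2 are importable, as the function itself requires.

import numpy as np

rng = np.random.default_rng(0)
img = rng.integers(0, 256, size=(256, 256), dtype=np.uint8)
msk = np.full((256, 256), 255, dtype=np.uint8)
_, mask_r, lab, levels = global_gmm(img, msk, patchSize=4, n=5)
print(len(levels), "levels darker than the global mean")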
Example #9
def fit_em(xs, k):
    gmm = GaussianMixture(n_components=k, covariance_type="full")
    labels = jnp.array(gmm.fit_predict(xs))
    mus = jnp.array(gmm.means_)
    covs = jnp.array(gmm.covariances_)
    weights = jnp.log(gmm.weights_)  # note: log of the mixing weights
    return labels, (mus, covs, weights)
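A hypothetical usage sketch; it assumes jax.numpy is available as jnp, as the snippet itself does.

import numpy as np

xs = np.random.RandomState(0).randn(500, 2)
labels, (mus, covs, log_weights) = fit_em(xs, k=3)
print(mus.shape, covs.shape, log_weights.shape)  # (3, 2) (3, 2, 2) (3,)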
Example #10
def gaussian_clustering(principal_components, principal_df):
    final_df = pd.concat([principal_df], axis=1)
    model = GaussianMixture(n_components=5)
    # fit model and predict clusters
    yhat = model.fit_predict(principal_components)
    # retrieve unique clusters
    clusters = unique(yhat)
    final_df['Segment'] = yhat  # label each row with its cluster assignment
    # create scatter plot for samples from each cluster
    for cluster in clusters:
        # get row indexes for samples with this cluster
        row_ix = where(yhat == cluster)
        # create scatter of these samples
        plt.scatter(principal_components[row_ix, 0],
                    principal_components[row_ix, 1],
                    s=75)
    final_df.rename({
        0: 'PC1',
        1: 'PC2',
        2: 'PC3',
        'y': 'Race'
    },
                    axis=1,
                    inplace=True)
    print(final_df)
    plt.title("Gaussian Clustering")
    add_race_labels(final_df)
    calc_silhouette(data=principal_components,
                    prediction=yhat,
                    n_clusters=len(clusters))
    return final_df
Example #11
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', type=str, required=True)
    parser.add_argument('-o', '--output', type=str, required=True)
    args = parser.parse_args()
    inpath, outpath = args.input, args.output

    header = np.loadtxt(inpath, delimiter=',', max_rows=1, dtype=str)
    data = np.loadtxt(inpath, delimiter=',', skiprows=1)

    fidx = np.argwhere(header == 'Fare')[0, 0]
    midx = np.argwhere(header == 'Trip Miles')[0, 0]

    data = data[np.argwhere(data[:, fidx] > 0.0).flatten(), :]
    data = data[np.argwhere(data[:, midx] > 0.0).flatten(), :]

    fare_per_mi = (data[:, fidx] / data[:, midx]).reshape((-1, 1))
    
    gm = GaussianMixture(n_components=2)
    res = gm.fit_predict(fare_per_mi)

    rargs = np.argwhere(res == 0).flatten()
    data = data[rargs]

    np.savetxt(outpath, data, fmt='%0.8f', delimiter=',', header=','.join(header), comments='')
Example #12
def initialize_clusters_with_gmm_results(X, n_clusters):
    """
	:param X: p class data, shape is n_samples * n_features
	:param n_clusters: number of Gaussian modes (typically set to 2, needs to be experimented with)
	:return: clusters: list with each element being a dictionary containing information for that cluster (like mu, cov)
	"""
    clusters = []

    # initialize with GMM results: means and covariances
    gmm = GaussianMixture(n_components=n_clusters, covariance_type='full')
    results = gmm.fit_predict(X)
    print("Show GMM initialization:")
    print(results)
    mu_k = gmm.means_

    # initialize with covariance matrix from the GMM results
    cov_mat = gmm.covariances_

    for i in range(n_clusters):
        clusters.append({
            'pi_k': 1.0 / n_clusters,
            'mu_k': mu_k[i],
            'cov_k': cov_mat[i]
        })

    return clusters, results
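A hypothetical usage sketch with two synthetic modes:

import numpy as np

rng = np.random.RandomState(0)
X_p = np.vstack([rng.randn(100, 2) - 3.0, rng.randn(100, 2) + 3.0])
clusters, assignments = initialize_clusters_with_gmm_results(X_p, n_clusters=2)
for c in clusters:
    print(c['pi_k'], c['mu_k'])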
Example #13
def em_clustering(ctx):
    """
    Gaussian Mixture function to be run on dataset.
    """
    covariance_type = 'spherical'
    n_components = 10

    print("loading data...")
    x_train, _, y_train = load_data(ctx.obj["data_folder"],
                                    shuffle_seed=ctx.obj["seed"])

    print("Running Gaussian Mixture...")

    model = GaussianMixture(n_components=n_components,
                            covariance_type=covariance_type,
                            verbose=2)

    labels_predicted = model.fit_predict(x_train)

    y_train = column_or_1d(y_train)

    score = metrics.adjusted_rand_score(y_train, labels_predicted)
    print(f"Adjusted Rand Score: {score}.")

    score = metrics.homogeneity_score(y_train, labels_predicted)
    print(f"Homogeneity Score: {score}.")

    score = metrics.completeness_score(y_train, labels_predicted)
    print(f"Completeness Score: {score}.")

    score = metrics.v_measure_score(y_train, labels_predicted)
    print(f"V Measure Score: {score}.")

    score = metrics.fowlkes_mallows_score(y_train, labels_predicted)
    print(f"Fowlkes Mallows Score: {score}.")
Example #14
def cluster(cls, vectors, vectorization_mode=TFIDF_MODE):
    if vectors is None:
        vectors = cls.vector_list
    gmm = GaussianMixture(n_components=cls.get_n_components(),
                          random_state=0)
    cls.label_list = gmm.fit_predict(vectors)
    cls.write_results_to_file(vectorization_mode)
Example #15
def movie_emcluster():
    # data preprocessing
    data = pd.read_csv("./movie_clu.csv")
    label = pd.DataFrame(data['title'])
    data = data[[
        'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
    ]]
    data_scaled = StandardScaler().fit_transform(data)
    df = pd.DataFrame(data_scaled)
    # data modeling
    for n in range(3, 8):
        model = GaussianMixture(n_components=n,
                                max_iter=20,
                                random_state=0,
                                covariance_type='spherical')
        y_predict = model.fit_predict(df)  # fit once and take the labels
        header = 'EM' + str(n)
        label[header] = y_predict
    request_data = {'clustering': []}
    for lab in label.values:
        request_data['clustering'].append({
            'title': str(lab[0]),
            'EM3': str(lab[1]),
            'EM4': str(lab[2]),
            'EM5': str(lab[3]),
            'EM6': str(lab[4]),
            'EM7': str(lab[5])
        })
    # print(request_data)
    response = requests.post(API_URL + 'cluster/emcluster/movie/',
                             data=json.dumps(request_data),
                             headers=headers)
    print(response.text)
Example #16
def call_silhout_(X, df, range_n_clusters):
    hyper_parm_turning = OrderedDict()
    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        # clusterer = MiniBatchKMeans(n_clusters=n_clusters,init='k-means++', random_state=10)
        # clusterer=clusterer = GaussianMixture(n_components=n_clusters, random_state=10)
        from sklearn.mixture import GaussianMixture
        # Predict GMM cluster membership
        clusterer = GaussianMixture(n_components=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)
        labels = "cluster_labels_{}".format(n_clusters)
        if not labels in df.keys():
            df[labels] = cluster_labels

        sample_dist_std = np.std(df.groupby(labels).size())
        sample_dist_avrg = np.median(df.groupby(labels).size())
        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        hyper_parm_turning.setdefault('n_clusters', []).append(n_clusters)
        hyper_parm_turning.setdefault('silhouette_avg', []).append(silhouette_avg)
        hyper_parm_turning.setdefault('sample_dist_std', []).append(sample_dist_std)
        hyper_parm_turning.setdefault('sample_dist_avrg', []).append(sample_dist_avrg)

        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

    return df, hyper_parm_turning
Example #17
def plot_latent(latent_vectors, y_test, y_pred):
    pca = PCA(2)  # alternatively: TSNE(2)
    X_pca = pca.fit_transform(latent_vectors)
    # Despite the variable names, this clusters with a GMM, not k-means.
    gm = GaussianMixture(10, tol=1e-6, max_iter=1000)
    pred = gm.fit_predict(X_pca)  # note: unused below; the plot shows y_pred

    df_latent = pd.DataFrame({
        "x1": X_pca[:, 0],
        "x2": X_pca[:, 1],
        "cat": y_test,
        "kmeans": y_pred,
    })

    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(20, 10))

    ax1.scatter(df_latent.x1, df_latent.x2, c=df_latent.cat, cmap="viridis")
    ax1.set_title("True Labels. VAE purity ()")

    ax2.scatter(df_latent.x1, df_latent.x2, c=df_latent.kmeans, cmap="viridis")
    ax2.set_title("Latent Clustering Labels. Purity (pred)")

    return f
Example #18
def gmm(Z,
        n_clusters=100,
        optimize=False,
        min_clusts=2,
        max_clusts=1000,
        clust_step=10,
        min_factor=1.01,
        random_state=None):
    if optimize:
        n_clusters = optimize_bic(Z,
                                  min_clusts=min_clusts,
                                  max_clusts=max_clusts,
                                  clust_step=clust_step)

    model = GaussianMixture(n_components=n_clusters,
                            covariance_type='spherical',
                            random_state=random_state)

    labels = model.fit_predict(Z)
    centers = model.means_
    scores = model.predict_proba(Z)

    return {
        'model': model,
        'labels': labels,
        'centers': centers,
        'spread': model.covariances_,
        'scores': scores,
        'n_components': len(set(labels)),
        'components': sorted(list(set(labels)))
    }
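A hypothetical usage sketch with a fixed cluster count (optimize left False, so optimize_bic is not needed):

import numpy as np

Z_demo = np.random.RandomState(0).randn(300, 4)
out = gmm(Z_demo, n_clusters=5, random_state=0)
print(out['n_components'], out['centers'].shape, out['scores'].shape)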
Example #19
def GMM(data, max_n_clusters=None, use_csi=True, random_state=0, **kwargs):
    """
    Finds cluster of users in data using Gaussian Mixture Models.

    :param data: pd.DataFrame with features for clustering indexed by users (sessions)
    :param max_n_clusters: maximal number of clusters for automatic selection for number of clusters.
        if None, then use n_clusters from arguments
    :param use_csi: if True, then cluster stability index will be calculated (may take a lot of time)
    :param random_state: random state for GaussianMixture clusterer
    :param kwargs: keyword arguments for sklearn.mixture.GaussianMixture
    :return: tuple of (np.array of cluster labels, metrics from calc_all_metrics)
    """
    if max_n_clusters is not None:
        kmargs = find_best_n_clusters(data, GaussianMixture, max_n_clusters,
                                      random_state, **kwargs)
    else:
        kmargs = {
            i: j
            for i, j in kwargs.items()
            if i in GaussianMixture().get_params()
        }
    kmargs.update({'random_state': random_state})
    km = GaussianMixture(**kmargs)
    cl = km.fit_predict(data.values)
    km.labels_ = cl  # expose labels_ like KMeans does, for the metric helpers
    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    return cl, metrics
Example #20
def time_series_clustering(
    df_features: pd.DataFrame,
    n_clusters: int = 3,
    normalize: bool = True,
) -> pd.DataFrame:
    """
    Clusters months into clusters using created features.
    However, standardizing procedures are executed before
    clustering, standardization/scaling is necessary for k-mean
    but unnecessary for other methods including Gaussian mixture.

    Returns the clustered label dataframe, which uses the same indices
    as df_features.
    """
    if normalize:
        # Normalize features.
        scaler = StandardScaler()
        norm_fea = scaler.fit_transform(df_features.values)
    else:
        norm_fea = df_features.values

    gmm = GaussianMixture(n_components=n_clusters)
    gmm_labels = gmm.fit_predict(norm_fea)
    df_gmm_labels = pd.DataFrame(data={"label": gmm_labels},
                                 index=df_features.index)
    return df_gmm_labels
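A hypothetical usage sketch with two years of made-up monthly features:

import numpy as np
import pandas as pd

df_feat = pd.DataFrame(np.random.RandomState(0).randn(24, 3),
                       columns=['f1', 'f2', 'f3'],
                       index=pd.period_range('2020-01', periods=24, freq='M'))
print(time_series_clustering(df_feat, n_clusters=3).head())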
Example #21
def main():
    train_df = concat([
        import_from_csv(TRAIN_PATH),
        import_from_csv(VALIDATION_PATH),
        import_from_csv(TEST_PATH)
    ])
    test_unlabeled_df = import_from_csv(TEST_UNLABELED_PATH)

    x_train, y_train = divide_data(
        train_df)  # labels are for generating scores
    test_gmm(x_train, y_train)

    clf = GaussianMixture(n_components=NUM_OF_CLUSTERS,
                          covariance_type='full',
                          init_params='random',
                          random_state=0)

    x_unlabeled_test = test_unlabeled_df[selected_features_without_label]
    y_unlabeled_test = import_from_csv(EXPORT_TEST_PREDICTIONS)["PredictVote"]

    y_unlabeled_pred = clf.fit_predict(x_unlabeled_test)
    print_cluster_distrebutions(NUM_OF_CLUSTERS, y_unlabeled_pred,
                                y_unlabeled_test)
    print_per_party_distrebution(NUM_OF_CLUSTERS, y_unlabeled_pred,
                                 y_unlabeled_test)

    blocks_dict = get_clsuters_to_blocks(x_unlabeled_test, y_unlabeled_test)
    blocks_cms = get_block_center_of_mass(blocks_dict, x_unlabeled_test,
                                          y_unlabeled_test)
    blocks_dist = get_closet_blocks_dict(blocks_cms)

    print(blocks_dist)
Example #22
        def gmmc(savename, temb):

            gmm = GaussianMixture(n_components=10)
            GMMC_pred = gmm.fit_predict(temb)

            score_ACC = ACC(10, Y, GMMC_pred)
            score_NMI = NMI(Y, GMMC_pred)

            _X, _Y = np.meshgrid(
                np.linspace(temb[:, 0].min(), temb[:, 0].max()),
                np.linspace(temb[:, 1].min(), temb[:, 1].max()))
            XX = np.array([_X.ravel(), _Y.ravel()]).T
            Z = gmm.score_samples(XX)
            Z = Z.reshape((50, 50))

            new_str = ("(%s)" %
                       savename) + this_str + "ACC = %8.6f, NMI = %8.6f" % (
                           score_ACC, score_NMI)

            # Generate UMAP img
            colormap = cm.get_cmap('tab10')
            plt.figure(figsize=(16, 12))
            vis = plt.scatter(temb[:, 0],
                              temb[:, 1],
                              c=Y,
                              cmap=colormap,
                              alpha=0.15)
            plt.contour(_X, _Y, Z)
            plt.colorbar(vis)
            plt.title(new_str)
            plt.savefig("%s/%s_HP%d.png" % (pthprefix, savename, cnt))

            result_log.write("%10s: ACC = %8.6f, NMI = %8.6f" %
                             (savename, score_ACC, score_NMI) + "\n")
            result_log.flush()
Example #23
def test_gaussian_mixture_fit_predict_n_init():
    # Check that fit_predict is equivalent to fit.predict, when n_init > 1
    X = np.random.RandomState(0).randn(1000, 5)
    gm = GaussianMixture(n_components=5, n_init=5, random_state=0)
    y_pred1 = gm.fit_predict(X)
    y_pred2 = gm.predict(X)
    assert_array_equal(y_pred1, y_pred2)
Example #24
def pca_gmm_and_dip(data, min_data):

    if data.shape[0] < min_data:
        return np.zeros(len(data), 'int32'), 1

    pca = PCA(n_components=1)
    score = pca.fit_transform(data)

    label = np.zeros(len(score), 'int32')

    p_val = dp(score[:, 0])[1]
    if p_val < 0.05:
        g = GaussianMixture(n_components=2)
        label_ = g.fit_predict(score)
        idx_0 = label_ == 0
        # a degenerate split: keep everything in a single cluster
        if np.sum(idx_0) == 0 or np.sum(~idx_0) == 0:
            return label, 1

        # recursively split each side, then merge the label ranges
        label_0 = pca_gmm_and_dip(data[idx_0], min_data)[0]
        label_1 = pca_gmm_and_dip(data[~idx_0], min_data)[0]
        label_1 += np.max(label_0) + 1
        label[idx_0] = label_0
        label[~idx_0] = label_1

    return label, p_val
Example #25
def visual(c, X, y):
  from sklearn.mixture import GaussianMixture
  cluster_object = GaussianMixture(n_components = c)
  y_pred = cluster_object.fit_predict(X)
  colors = ['red', 'green', 'blue', 'cyan', 'black', 'yellow', 'magenta', 'brown', 'orange', 'silver', 'goldenrod', 'olive', 'dodgerblue']
  clusters = np.unique(y_pred)
  print("Cluster Labels")
  print(clusters)
  print("Evaluation")
  evaluation_labels(y, y_pred)
  evaluation(X, y_pred)
  # plot the points coloured by their true labels
  for label in np.unique(y):
    row_idx = np.where(y == label)
    plt.scatter(X[row_idx, 0], X[row_idx, 1], label=label)
  plt.title('Dataset')
  plt.xlabel('X1')
  plt.ylabel('X2')
  plt.legend()
  plt.show()
  # plot the points coloured by their predicted clusters
  for cluster in clusters:
    row_idx = np.where(y_pred == cluster)
    plt.scatter(X[row_idx, 0], X[row_idx, 1], label=cluster)
  plt.title('Clusters')
  plt.xlabel('X1')
  plt.ylabel('X2')
  plt.legend()
  plt.show()
Example #26
def gaussian_mixtures(datas, labels):
    n_clusters = len(set(labels))
    gmm = GaussianMixture(n_components=n_clusters, covariance_type='diag')
    labels_pred = gmm.fit_predict(datas)
    # score = adjusted_mutual_info_score(labels, labels_pred)
    score = normalized_mutual_info_score(labels, labels_pred)
    return score
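A hypothetical usage sketch; on well-separated blobs the NMI should be close to 1.

from sklearn.datasets import make_blobs

X_b, y_b = make_blobs(n_samples=300, centers=4, random_state=1)
print(gaussian_mixtures(X_b, y_b))  # NMI in [0, 1]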
Example #27
def GaussianMixture(V, **kwargs):
    """Performs clustering on *V* by using Gaussian mixture models. The function uses :func:`sklearn.micture.GaussianMixture`. See sklearn documents 
    for details.

    :arg V: row-normalized eigenvectors for the purpose of clustering.
    :type V: :class:`numpy.ndarray`

    :arg n_clusters: specifies the number of clusters. 
    :type n_clusters: int
    """

    try:
        from sklearn.mixture import GaussianMixture
    except ImportError:
        raise ImportError('Use of this function (GaussianMixture) requires the '
                          'installation of sklearn.')
    
    n_components = kwargs.pop('n_components', None)
    if n_components is None:
        n_components = kwargs.pop('n_clusters', None)
        if n_components is None:
            n_components = 1

    n_init = kwargs.pop('n_init', 1)

    mixture = GaussianMixture(n_init=n_init, n_components=n_components, **kwargs).fit(V)

    return mixture.predict(V)  # predict with the already-fitted model instead of refitting
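A hypothetical usage sketch; random vectors stand in for the row-normalized eigenvectors.

import numpy as np

V = np.random.RandomState(0).randn(50, 3)
V /= np.linalg.norm(V, axis=1, keepdims=True)
print(GaussianMixture(V, n_clusters=2))  # one cluster label per row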
Example #28
def train(data:np.ndarray,
		  obs_len:int,
		  filter_name:str,
		  model_dir:str,
		  result_dir:str,
		  save_model:bool=True)->None:
	
	print('[Gaussian Mixture Clustering][train] creating model...')

	gmc = GaussianMixture(n_components=3,
						  covariance_type="full",
						  max_iter=1000,
						  tol=1e-5,
						  n_init=10,
						  random_state=7,
						  init_params="kmeans")

	print('[Gaussian Mixture Clustering][train] training...')

	_y = gmc.fit_predict(X=data)
	_y = np.expand_dims(_y, axis=1)

	print(f'[Gaussian Mixture Clustering][train] converged?:{gmc.converged_}')

	print('[Gaussian Mixture Clustering][train] params (center and covariance):')
	for i, m, c in zip(range(1, 4), gmc.means_, gmc.covariances_):
		print(f'\tc_{i}-> mean: {m}')
		print(f'\t\tcov: {c}')

	print('[Gaussian Mixture Clustering][train] results:')
	_c, _l = np.unique(_y, return_counts=True)
	for i, c in zip(_c,_l):
		print (f'\tc_{i}: {c}')

	if save_model:
		model_file=f'gmc_{obs_len}s_{filter_name}.pkl'
		print (f'[Gaussian Mixture Clustering][train] saving model ({model_file})...')
		with open(os.path.join(model_dir, model_file), 'wb') as f:
			pickle.dump(gmc, f)


	result_file = f'results_gmc_train_{obs_len}s_{filter_name}.csv'
	print (f'[Gaussian Mixture Clustering][train] saving results ({result_file})...')
	labels = ['mean_velocity', 
			  'mean_acceleration', 
			  'mean_deceleration', 
			  'std_lateral_jerk', 
			  'driving_style']

	result = np.concatenate((data, _y), axis=1)
	df = pd.DataFrame(data=result, columns=labels)
	df.to_csv(os.path.join(result_dir,result_file))

	result_file = result_file.replace('results', 'params').replace('csv', 'json')
	print (f'[Gaussian Mixture Clustering][train] saving results ({result_file})...')
	_d = {}
	_d['means'] = gmc.means_.tolist()
	_d['covariances'] = gmc.covariances_.tolist()
	with open(os.path.join(result_dir, result_file), 'w') as f:
		json.dump(_d, f)
Example #29
def cluster_and_nn(X, Y):
    print('Clustering data and running the neural network...')
    if args.adults:
        k = 2
    elif args.digits:
        k = 10
    else:
        raise ValueError('expected args.adults or args.digits to be set to choose k')
    print('Using {} clusters...'.format(k))
    # X = xtest
    # Y = ytest
    km = KMeans(n_clusters=k)
    em = GaussianMixture(n_components=k)
    #
    km_clusters = km.fit_predict(X)
    # km.fit(xtrain)
    em_clusters = em.fit_predict(X)
    # em.fit(xtrain)
    # set_trace()
    #
    nn = MLPClassifier(hidden_layer_sizes=(50, 50, 50))
    nn_kmeans = MLPClassifier(hidden_layer_sizes=(50, 50, 50))
    nn_em = MLPClassifier(hidden_layer_sizes=(50, 50, 50))
    #
    plot_learning_curve(nn, X, Y, 'control', 'blue', 'navy')
    #nn_kmeans.fit(KMeans(n_clusters=k).)
    if args.kmeans:
        plot_learning_curve(nn_kmeans, km_clusters.reshape(-1, 1), Y,
                            'k-means', 'green', 'darkgreen')
    # em doesn't include a transform method so we use the predict_proba instead
    # https://github.com/scikit-learn/scikit-learn/issues/7743
    if args.em:
        plot_learning_curve(nn_em, em_clusters.reshape(-1, 1), Y, 'em',
                            'violet', 'darkviolet')
    plt.title('Learning Curve: {} [k={}]'.format(data_set, k))
    plt.show()
Example #30
def gmm_seg(img, seed, random_seed=3):
    """
    Compute a threshold-based segmentation via a 2-component Gaussian mixture model.

    Parameters
    ----------
    img : cloudvolume.volumecutout.VolumeCutout
        The volume to segment.

    random_seed : int
        The random seed for the Gaussian mixture model.

    Returns
    -------
    labels : numpy.ndarray
        An array consisting of the pixelwise segmentation.

    """

    img_T1, img_T1_255 = get_img_T1(img)
    img_array = sitk.GetArrayFromImage(img_T1_255)
    flat_array = img_array.flatten().reshape(-1, 1)
    gmm = GaussianMixture(n_components=2, random_state=random_seed)
    y = gmm.fit_predict(flat_array)
    labels = y.reshape(img.shape).squeeze()
    if labels[seed] != 1:
        labels = abs(labels - 1)
    return labels