Example No. 1
def dmdbscan_algorithm(data, folder):
    """
        Function to find optimal distance for DBSCAN using DMDBSCAN algorithm.
        param:
            1. data - pandas DataFrame (10000, 82) or (10000, 3), where
                values are the mean spending of customers for every category
            2. folder - string path to save plot
        return:
            Float value of the optimal distance
    """
    # Create Nearest Neighbors model to find distance to the
    # first closest neighbor
    nn_model = sklearn.neighbors.NearestNeighbors(n_neighbors=2,
                                                  n_jobs=-1).fit(data)

    # Get and sort distances
    distances, indices = nn_model.kneighbors(data)
    distances = np.sort(distances, axis=0)[:, 1]

    # Find elbow (knee) on distances
    knee_loc = kneed.KneeLocator(distances,
                                 np.arange(len(distances)),
                                 curve="concave",
                                 direction="increasing",
                                 online=False,
                                 interp_method="polynomial")

    # Plot distances and optimal distance
    plotting.line_plotting(
        [np.arange(len(distances)), distances, knee_loc.knee],
        ["Distance", ""], "Optimal distance", folder)

    return knee_loc.knee
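
A minimal follow-up sketch (not part of the original excerpt), assuming `data` and `folder` are the objects described in the docstring: the returned distance is used as the eps radius for scikit-learn's DBSCAN.

from sklearn.cluster import DBSCAN

eps = dmdbscan_algorithm(data, folder)                   # optimal eps from the k-distance knee
dbscan_model = DBSCAN(eps=eps, min_samples=5).fit(data)  # min_samples=5 is an assumed choice
cluster_labels = dbscan_model.labels_                    # label -1 marks noise points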
Example No. 2
def main(file):
    # Read strain counts (column 0) and deviation values (column 3) from the
    # CSV, skipping the header row.
    dev_strain_cnt = []
    dev_dev = []
    with open(file) as dev:
        dev_csv = csv.reader(dev)
        for idx, row in enumerate(dev_csv):
            if idx == 0:
                continue
            dev_strain_cnt.append(int(row[0]))
            dev_dev.append(float(row[3]))

    # Sort both lists by strain count, then average every consecutive group of four values.
    dev_strain_cnt, dev_dev = zip(*sorted(zip(dev_strain_cnt, dev_dev)))
    dev_dev = list(dev_dev)
    devs = []
    devs_mean = []
    cnt = 0
    for dev in dev_dev:
        cnt += 1
        devs.append(dev)
        if cnt == 4:
            devs_mean.append(sum(devs) / len(devs))
            cnt = 0
            devs = []

    # Locate the knee of the averaged, decreasing curve; kn.knee is a
    # zero-based index into devs_mean.
    x = range(len(devs_mean))
    kn = kneed.KneeLocator(x,
                           devs_mean,
                           curve='convex',
                           direction='decreasing',
                           interp_method='polynomial')
    print(kn.knee + 1)
Example No. 3
def fit(self, X):
    # Fit one pre-built k-means model per candidate k, recording the SSE
    # (inertia) for every k and the silhouette coefficient for k >= 2.
    for k, model in enumerate(self.models, 1):
        model.fit(X)
        print(f"AutoKMeans: Finished cycle {k}")
        self.sse.append(model.inertia_)
        if k > 1:
            self.sil_coef.append(
                sk_metrics.silhouette_score(X, model.labels_))
    if self.do_plot:
        # Normalize both curves to [0, 1] so they can share one axis.
        _, ax = plt.subplots()
        ax.plot(range(1, self.max_clusters),
                np.array(self.sse) / max(self.sse),
                label="sse")
        ax.plot(range(2, self.max_clusters),
                np.array(self.sil_coef) / max(self.sil_coef),
                label="sil_coef")
        ax.legend()
    # Elbow of the SSE curve (convex and decreasing).
    kl = kneed.KneeLocator(range(1, self.max_clusters),
                           self.sse,
                           curve="convex",
                           direction="decreasing")
    if self.preferred == "SSE":
        print(
            f"SSE detection finds {kl.elbow} clusters "
            f"(silhouette coefficients say {np.argmax(self.sil_coef) + 2} clusters)"
        )
        return self.models[kl.elbow - 1]
    print(
        f"Silhouette coefficients find {np.argmax(self.sil_coef) + 2} clusters "
        f"(SSE detection says {kl.elbow} clusters)")
    # sil_coef[i] corresponds to k = i + 2; models[k - 1] is the model for k clusters.
    return self.models[np.argmax(self.sil_coef) + 2 - 1]
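
The excerpt only shows the fit method. As a rough usage sketch, assuming a hypothetical constructor along the lines of AutoKMeans(max_clusters=..., preferred=..., do_plot=...) that pre-builds self.models as k-means instances for k = 1 .. max_clusters - 1 (not shown above), a call might look like this.

auto = AutoKMeans(max_clusters=10, preferred="SSE", do_plot=True)  # hypothetical constructor
best_model = auto.fit(X)   # returns the k-means model at the detected SSE elbow
print(best_model.n_clusters)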
Example No. 4
def perform_k_means(data, num_salesmen):
    sse = []
    k_rng = range(1, num_salesmen + 1)

    for k in k_rng:
        km = KMeans(n_clusters=k)
        km.fit(data)
        sse.append(km.inertia_)

    # TODO improve the estimation of the best number of K. Maybe through tracking the slope of the curve
    """
    for i in range(0, len(sse)-1):
        m = 1-((sse[i+1]/sse[i])/1)
        print(m)
    """

    kn = kneed.KneeLocator(k_rng,
                           sse,
                           curve='convex',
                           direction='decreasing',
                           interp_method='interp1d')
    print("Best Estimated K: ", kn.elbow)
    km = KMeans(n_clusters=kn.elbow)
    y_predicted = km.fit_predict(data)

    plt.xlabel('K')
    plt.ylabel('Sum of squared error')
    plt.plot(k_rng, sse, 'bx-')

    return y_predicted
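
One caveat with this pattern: KneeLocator sets elbow to None when it cannot detect an elbow, in which case KMeans(n_clusters=kn.elbow) would fail. A small hedged guard (not part of the original) could replace the final fit, falling back to the full salesman count:

    # Fallback choice (num_salesmen) is an assumption, not from the original.
    best_k = kn.elbow if kn.elbow is not None else num_salesmen
    km = KMeans(n_clusters=best_k)
    y_predicted = km.fit_predict(data)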
Example No. 5
def main(args):
    # main routine
    os.chdir(args.output)
    # Both frequency files are required downstream, so exit if either is missing.
    if not os.path.isfile('dfreqstran_df.csv') or not os.path.isfile(
            'dfreqssel_var.csv'):
        print(
            "Missing essential files dfreqstran_df.csv and dfreqssel_var.csv!")
        os.sys.exit(-1)

    # Use LOWESS to smooth data points
    x = []
    y = []
    with open(args.strains, 'r') as fh:
        next(fh)
        for l in fh:
            segs = re.split(",", l)
            x.append(int(segs[1]))
            y.append(float(segs[3]))

    lowess = sm.nonparametric.lowess(y, x)
    l_x = list(zip(*lowess))[0]
    l_y = list(zip(*lowess))[1]

    # Collapse duplicate x values, keeping one smoothed y per strain count
    real = dict()
    for i, j in zip(l_x, l_y):
        real[i] = j

    r_x = list(real.keys())
    r_y = list(real.values())
    # Select x value from elbow plot and use that in desman
    #a = kneed.KneeLocator(x, y, curve='convex', direction='decreasing')
    #astrains = a.knee
    k = kneed.KneeLocator(r_x,
                          r_y,
                          curve='convex',
                          direction='decreasing',
                          S=1.0)
    strains = k.knee
    for i, j in zip(r_x, r_y):
        print(f'x:{i}\ty:{j}')

    print(f'Identified {strains} as the optimal strain count from the values')
    with open(args.strains + ".strain.count", 'w') as out:
        out.write(f'{args.output}\t{strains}\n')

    # Run desman on the data
    gamma, eta, tau = desmanRun(args.desman, 'dfreqssel_var.csv',
                                'dfreqstran_df.csv', int(strains))

    print(f'Desman stats: gamma = {gamma}, eta = {eta}, tau = {tau}')
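
The pattern above (LOWESS smoothing before knee detection) can be illustrated with a small self-contained sketch on synthetic data; only numpy, statsmodels, and kneed are assumed.

import numpy as np
import statsmodels.api as sm
import kneed

# Noisy, convex, decreasing curve standing in for the strain/deviation data.
x = np.arange(1, 21)
y = 1.0 / x + np.random.normal(0, 0.01, size=x.size)

# LOWESS returns an (n, 2) array of (x, smoothed y) pairs sorted by x.
smoothed = sm.nonparametric.lowess(y, x)
s_x, s_y = smoothed[:, 0], smoothed[:, 1]

knee = kneed.KneeLocator(s_x, s_y, curve='convex', direction='decreasing', S=1.0).knee
print(f'Estimated knee at x = {knee}')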
Example No. 6
def elbow_method(data, folder, max_clusters=102):
    """
        Function to find clusters number for k-means using Elbow method.
        param:
            1. data - pandas DataFrame (10000, 82) or (10000, 3), where
                values are the mean spending of customers for every category
            2. folder - string path to save plot
            3. max_clusters - int maximum number of clusters (default 102)
        return:
            Int value of the optimal number of clusters
    """
    elbow_results = []

    # Run k-means with the number of clusters ranging from 2 to
    # max_clusters - 1 and record the sum of squared distances of samples
    # to their closest cluster centers (inertia) as the elbow scores
    for clusters_number in range(2, max_clusters):
        kmeans_model = sklearn.cluster.KMeans(n_clusters=clusters_number,
                                              n_jobs=-1).fit(data)

        elbow_results.append(kmeans_model.inertia_)

    elbow_results = np.array(elbow_results)

    # Find the elbow (knee) on scores
    knee_loc = kneed.KneeLocator(np.arange(2, max_clusters),
                                 elbow_results,
                                 curve="convex",
                                 direction="decreasing",
                                 online=False,
                                 interp_method="polynomial")

    # Plot scores and optimal number of clusters
    plotting.line_plotting(
        [elbow_results,
         np.arange(2, max_clusters), knee_loc.knee],
        ["Clusters number", "Score"], "Elbow score", folder)

    return knee_loc.knee
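
A brief usage sketch (not part of the original), assuming `data` is the customer-spending DataFrame described in the docstring: the returned cluster count is used to fit the final model.

n_clusters = elbow_method(data, folder)
final_model = sklearn.cluster.KMeans(n_clusters=n_clusters).fit(data)
data["cluster"] = final_model.labels_   # assign each customer to a cluster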
Example No. 7
def elbow_cluster_analysis(X,
                           algorithm="pc-kmeans",
                           start=2,
                           end=15,
                           **params_):
    labels_list = []
    models = []
    for n_clusters in range(start, end + 1):
        with silence():
            # Forward n_clusters so each iteration fits a different model
            # (assumes cluster_analysis accepts an n_clusters keyword).
            labels, model = cluster_analysis(X,
                                             algorithm=algorithm,
                                             n_clusters=n_clusters,
                                             **params_)
        labels_list.append(labels)
        models.append(model)
        print(f"\033[KDid model with n_clusters={n_clusters}", end="\r")
    goodnesses = [goodness_of_model(model) for model in models]
    kl = kneed.KneeLocator(range(start, end + 1),
                           goodnesses,
                           curve="convex",
                           direction="increasing")
    n_clusters = kl.elbow
    print(f"ELBOW-{algorithm.upper()}: Found elbow at {n_clusters} clusters "
          f"(goodness: {goodnesses[n_clusters - start]:.2f})")
    return labels_list[n_clusters - start], models[n_clusters - start]
Example No. 8
    def data_clustering(self):
        """
        Dataset is divided into clusters so similar data get trained in a particular model.

        Input : N/A
        Output : Clusters will be generated and the dataset will be stored in a particular directory
        """
        try:
            self.logger.add_in_logs("chk", "Clustering process Initialized")
            self.logger.add_in_logs("chk", "finding number of clusters")
            self.wcss = []
            x = self.df.drop(["SalePrice"], axis=1)
            self.logger.add_in_logs(
                "inf", "saving a plot of wcss vs number of clusters ")
            for i in range(1, 30):
                model = KMeans(n_clusters=i)
                model.fit(x)
                self.wcss.append(model.inertia_)
            k = kneed.KneeLocator(range(1, 30),
                                  self.wcss,
                                  curve="convex",
                                  direction="decreasing")
            self.knee = k.knee
            self.logger.add_in_logs(
                "inf",
                str(self.knee) + " number of clusters will be formed")
            self.wcss = np.array(self.wcss) / max(self.wcss)
            plt.figure()
            plt.style.use("classic")
            plt.plot(range(1, 30),
                     self.wcss,
                     label="WCSS vs Inertia",
                     color="blue")
            plt.plot([self.knee, self.knee], [min(self.wcss), 1],
                     label="Number of cluster used",
                     color="black")
            plt.xlabel("No of cluster")
            plt.ylabel("WCSS")
            plt.legend(loc="upper right")
            plt.savefig(self.path + "/static/plots/Cluster.jpg")
            self.logger.add_in_logs("pas",
                                    "finding number of clusters completed")

            self.logger.add_in_logs("chk",
                                    "getting cluster number for dataset")
            model = KMeans(n_clusters=self.knee)
            x = self.df.drop(["SalePrice"], axis=1)
            model.fit(x)
            cluster_no = model.predict(x)
            self.df["Cluster"] = cluster_no
            self.logger.add_in_logs("inf",
                                    "saving clustering model in model file")
            pickle.dump(
                model,
                open(
                    self.path +
                    "/Model_files/Cluster_directory/Cluster.pickle", "wb"))

            for i in range(0, self.knee):
                self.logger.add_in_logs(
                    "inf",
                    str(i) + " cluster is exported in .csv file")
                self.df[self.df["Cluster"] == i].to_csv(
                    self.path + "/Input_files/Cluster_data/" + str(i) +
                    "_cluster.csv",
                    index=False)
            self.logger.add_in_logs("pas",
                                    "Exporting Clustered dataset Completed")

            self.logger.add_in_logs("pas", "Clustering process Completed")
        except Exception as e:
            self.logger.add_in_logs("ERR",
                                    "data preprocessing in data clustering")
            self.logger.add_in_logs(
                "LIN", "Error on line number : {}".format(
                    sys.exc_info()[-1].tb_lineno))
            self.logger.add_in_logs("TYP", str(e))
Example No. 9
    print(vWcsse)

# plotting the results onto a line graph, allowing us to observe 'The elbow'
print("\n*** Plot WCSSE ***")
plt.figure()
plt.plot(range(1, vIters), lWcsse)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSSE') #within cluster sum of squares error
plt.show()

# programmatically
#!pip install kneed
print("\n*** Find Best K ***")
import kneed
kl = kneed.KneeLocator(range(1, vIters), lWcsse, curve="convex", direction="decreasing")
vBestK = kl.elbow
print(vBestK)

# k means cluster model
print("\n*** Model Create & Train ***")
model = KMeans(n_clusters=vBestK, random_state=707)
model.fit(X)

# result
print("\n*** Model Results ***")
print(model.labels_)
df['PredKnn'] = model.labels_

# counts for knn
print("\n*** Counts For Knn ***")
Example No. 10
    kmeans_fit = kmeans.fit(new_r_dat)
    inertia_vals.append(kmeans_fit.inertia_)

pl.plot([*range(3, 15)], inertia_vals)
pl.xlabel('Number of clusters')
pl.ylabel('K-means inertia')
pl.vlines([4, 7], ymin=12000, ymax=18500, linestyle='dashed')
pl.show()

# %%

import kneed

kn = kneed.KneeLocator([*range(3, 15)],
                       inertia_vals,
                       curve='convex',
                       direction='decreasing',
                       interp_method='polynomial')

print(kn.knee)

kn_gini = kneed.KneeLocator([*range(3, 33)],
                            r_score_new,
                            curve='convex',
                            direction='decreasing',
                            interp_method='polynomial')
print(kn_gini.knee)

kn_gini_old = kneed.KneeLocator([*range(3, 33)],
                                r_score_old,
                                curve='convex',
                                direction='decreasing',
                                interp_method='polynomial')
print(kn_gini_old.knee)
Example No. 11
# Find the optimal number of clusters for the k-means classification
wcss = []
K = range(1, 11)
for i in K:
    k = KMeans(n_clusters=i,
               init='k-means++',
               max_iter=300,
               n_init=10,
               random_state=0)
    y_k = k.fit(x)
    wcss.append(k.inertia_)
print(wcss)

#find the elbow point
import kneed
kn = kneed.KneeLocator(K, wcss, curve='convex', direction='decreasing')
print("\nknee=", kn.knee)

#plot sum of squared distances
plt.plot(K, wcss, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances-wcss')
plt.title('Elbow Method to find Optimal k')
plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
plt.show()

for i in K:
    k = KMeans(n_clusters=3,
               init='k-means++',
               max_iter=300,
               n_init=10,