def main(argv):
    dbscan_heuristic_mode = False
    dpgmm_mode = False
    do_plot_clusters = False
    do_dump_clusters = False

    # parse command-line options
    try:
        opts, args = getopt.getopt(argv, "hegdp")
    except getopt.GetoptError:
        print('elviz_cluster.py [-h] [-e] [-g] [-d] [-p]')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('elviz_cluster.py [-h] [-e] [-g] [-d] [-p]')
            print('    -h = help, -e = run dbscan'
                  ' epsilon heuristic plot generation code')
            print('    -g = use a DPGMM for clustering')
            print('    -p = plot the clusters to a PDF file')
            print('    -d = dump the clusters to a text file')
            sys.exit()
        elif opt == '-e':
            dbscan_heuristic_mode = True
        elif opt == '-g':
            dpgmm_mode = True
        elif opt == '-p':
            do_plot_clusters = True
        elif opt == '-d':
            do_dump_clusters = True

    [elviz_data, combined_df] = read_pickle_or_CSVs(DATA_PICKLE, RAW_DATA_DIR)

    # Set up plotting limits
    print("determining plotting limits")
    limits = {"x": [combined_df['Average fold'].min(), MAX_AVG_FOLD],
              "y": [combined_df['Reference GC'].min(),
                    combined_df['Reference GC'].max()]}
    # The line below was replaced in favor of a fixed maximum (MAX_AVG_FOLD)
    # limits["x"] = [combined_df['Average fold'].min(), combined_df['Average fold'].max()]

    print("normalizing data prior to clustering")
    # fit the scaler on the combined data to retrieve the normalization parameters
    scaler = StandardScaler().fit(combined_df[CLUSTER_COLUMNS])

    if dbscan_heuristic_mode:
        print("making DBSCAN heuristic plots")
        dbscan_heuristic(elviz_data, scaler)
        sys.exit()

    print("serially processing files")
    for filename in elviz_data.keys():
        pdf_filename = filename.replace("csv", "pdf")
        # skip if the PDF already exists
        if os.path.isfile(RESULTS_DIR + pdf_filename):
            print("skipping file %s" % filename)
            continue
        print("processing file %s" % filename)
        df = elviz_data[filename]

        # create a multipage PDF for storing the plots
        with PdfPages(RESULTS_DIR + pdf_filename) as pdf:
            # group rows by the unique values of the taxonomy columns
            dfgb = df.groupby(['Kingdom', 'Phylum', 'Class', 'Order',
                               'Family', 'Genus', 'Species'])
            for key in dfgb.indices.keys():
                idx = dfgb.indices[key]
                tax_rows = df.iloc[idx]
                if len(tax_rows) < MIN_ROWS:
                    continue

                # normalize all dimensions used in clustering, e.g. GC, coverage, rpk;
                # reuse the scaler fit on all of the data for the transform
                tax_rows_cluster_columns = scaler.transform(tax_rows[CLUSTER_COLUMNS])

                if not dpgmm_mode:
                    db = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES)
                    db.fit(tax_rows_cluster_columns)
                    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
                    core_samples_mask[db.core_sample_indices_] = True
                    labels = db.labels_
                else:
                    db = mixture.DPGMM(n_components=DPGMM_N_COMPONENTS,
                                       n_iter=100, covariance_type='full',
                                       alpha=100, verbose=0)
                    db.fit(tax_rows_cluster_columns)
                    Y_ = db.predict(tax_rows_cluster_columns)
                    # iterate over the mixture components, skipping any that
                    # received no assignments (leftover from per-component plotting)
                    for i, (mean, covar) in enumerate(zip(
                            db.means_, db._get_covars())):
                        if not np.any(Y_ == i):
                            continue
                        # plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
                    labels = Y_
                    # the DPGMM has no notion of core samples; mark all rows as core
                    core_samples_mask = np.zeros_like(labels, dtype=bool)
                    core_samples_mask[:] = True

                # number of clusters in labels, ignoring noise (-1) if present
                n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
                if n_clusters_ < 1:
                    continue

                title = ', '.join(key)
                if do_plot_clusters:
                    plot_clusters(pdf,
                                  scaler.inverse_transform(tax_rows_cluster_columns),
                                  title, labels, core_samples_mask, limits)
                if do_dump_clusters:
                    dump_clusters(filename, key, labels, tax_rows[CONTIG_COLUMN])