def main(argv):
    dbscan_heuristic_mode = False
    dpgmm_mode = False
    do_plot_clusters = False
    do_dump_clusters = False

    # parse command-line options
    try:
        opts, args = getopt.getopt(argv, "hegdp")
    except getopt.GetoptError:
        print('elviz_cluster.py [-h] [-e] [-g] [-d] [-p]')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('elviz_cluster.py [-h] [-e] [-g] [-d] [-p]')
            print('    -h = help, -e = run dbscan'
                  ' epsilon heuristic plot generation code')
            print('    -g = use a DPGMM for clustering')
            print('    -p = plot the clusters to a PDF file')
            print('    -d = dump the clusters to a text file')
            sys.exit()
        elif opt == '-e':
            dbscan_heuristic_mode = True
        elif opt == '-g':
            dpgmm_mode = True
        elif opt == '-p':
            do_plot_clusters = True
        elif opt == '-d':
            do_dump_clusters = True

    [elviz_data, combined_df] = read_pickle_or_CSVs(DATA_PICKLE, RAW_DATA_DIR)

    # Set up plotting limits
    print("determining plotting limits")
    limits = {"x": [combined_df['Average fold'].min(), MAX_AVG_FOLD],
              "y": [combined_df['Reference GC'].min(),
                    combined_df['Reference GC'].max()]}
    # The line below was replaced in favor of a fixed maximum (MAX_AVG_FOLD)
    # limits["x"] = [combined_df['Average fold'].min(), combined_df['Average fold'].max()]

    print("normalizing data prior to clustering")
    # fit the scaler on the combined data to retrieve the normalization parameters
    scaler = StandardScaler().fit(combined_df[CLUSTER_COLUMNS])

    if dbscan_heuristic_mode:
        print("making DBSCAN heuristic plots")
        dbscan_heuristic(elviz_data, scaler)
        sys.exit()

    print("serially processing files")
    for filename in elviz_data.keys():
        pdf_filename = filename.replace("csv", "pdf")
        # skip if the PDF already exists
        if os.path.isfile(RESULTS_DIR + pdf_filename):
            print("skipping file %s" % filename)
            continue
        print("processing file %s" % filename)
        df = elviz_data[filename]

        # create a multipage PDF for storing the plots
        with PdfPages(RESULTS_DIR + pdf_filename) as pdf:
            # group rows by the unique values of the taxonomy columns
            dfgb = df.groupby(['Kingdom', 'Phylum', 'Class', 'Order',
                               'Family', 'Genus', 'Species'])
            for key in dfgb.indices.keys():
                idx = dfgb.indices[key]
                tax_rows = df.iloc[idx]
                if len(tax_rows) < MIN_ROWS:
                    continue

                # normalize all dimensions used in clustering, e.g. GC, coverage, rpk;
                # reuse the scaler fit on all of the data for the transform
                tax_rows_cluster_columns = scaler.transform(tax_rows[CLUSTER_COLUMNS])

                if not dpgmm_mode:
                    db = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES)
                    db.fit(tax_rows_cluster_columns)
                    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
                    core_samples_mask[db.core_sample_indices_] = True
                    labels = db.labels_
                else:
                    db = mixture.DPGMM(n_components=DPGMM_N_COMPONENTS,
                                       n_iter=100, covariance_type='full',
                                       alpha=100, verbose=0)
                    db.fit(tax_rows_cluster_columns)
                    Y_ = db.predict(tax_rows_cluster_columns)
                    # iterate over the mixture components, skipping any that
                    # received no assignments (leftover from per-component plotting)
                    for i, (mean, covar) in enumerate(zip(
                            db.means_, db._get_covars())):
                        if not np.any(Y_ == i):
                            continue
                        # plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
                    labels = Y_
                    # the DPGMM has no notion of core samples; mark all rows as core
                    core_samples_mask = np.zeros_like(labels, dtype=bool)
                    core_samples_mask[:] = True

                # number of clusters in labels, ignoring noise (-1) if present
                n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
                if n_clusters_ < 1:
                    continue

                title = ', '.join(key)
                if do_plot_clusters:
                    plot_clusters(pdf,
                                  scaler.inverse_transform(tax_rows_cluster_columns),
                                  title, labels, core_samples_mask, limits)
                if do_dump_clusters:
                    dump_clusters(filename, key, labels, tax_rows[CONTIG_COLUMN])