def run_clustering(X, k, dists_all):
    """Cluster ``X`` into ``k`` groups and score the resulting assignment.

    Parameters
    ----------
    X :
        Data handed to ``ksc.inc_ksc`` and to every ``metrics`` call.
    k :
        Number of clusters requested from ``ksc.inc_ksc``.
    dists_all :
        Passed unchanged to the intra/inter/beta-CV metrics
        (presumably precomputed pairwise distances -- confirm).

    Returns
    -------
    tuple
        ``(intra, inter, bcv, cost)``: element ``[0]`` of
        ``metrics.avg_intra_dist`` and of ``metrics.avg_inter_dist``, then the
        results of ``metrics.beta_cv`` and ``metrics.cost``.
    """
    # Only the assignment and the distances-to-centroid are needed here;
    # the centroids and the shifts are discarded.
    _, labels, _, centroid_dists = ksc.inc_ksc(X, k)

    avg_intra = metrics.avg_intra_dist(X, labels, dists_all)[0]
    avg_inter = metrics.avg_inter_dist(X, labels, dists_all)[0]
    beta_cv = metrics.beta_cv(X, labels, dists_all)
    total_cost = metrics.cost(X, labels, None, centroid_dists)

    return avg_intra, avg_inter, beta_cv, total_cost
def main(tseries_fpath, base_folder, k):
    """Cluster the series in ``tseries_fpath`` and write the results to disk.

    Parameters
    ----------
    tseries_fpath :
        Path of the time-series file given to ``ioutil.load_series``.
    base_folder :
        Folder that receives the output files. The index file ``train.dat``
        is read from its parent directory.
    k :
        Number of clusters; converted with ``int`` before use.

    Four files are written into ``base_folder``: ``cents.dat``,
    ``assign.dat``, ``shift.dat`` and ``dists_cent.dat``.
    """
    n_clusters = int(k)

    parent_dir = os.path.join(base_folder, "..")
    index_path = os.path.join(parent_dir, "train.dat")
    series = ioutil.load_series(tseries_fpath, index_path)

    centroids, labels, shifts, centroid_dists = ksc.inc_ksc(series, n_clusters)

    # (file name, values, savetxt format), written in this order.
    outputs = (
        ("cents.dat", centroids, "%.5f"),
        ("assign.dat", labels, "%d"),
        ("shift.dat", shifts, "%d"),
        ("dists_cent.dat", centroid_dists, "%.5f"),
    )
    for file_name, values, number_format in outputs:
        np.savetxt(os.path.join(base_folder, file_name), values,
                   fmt=number_format)
def main(tseries_fpath, base_folder, k):
    """Run ``ksc.inc_ksc`` on a series file and dump its four outputs.

    Parameters
    ----------
    tseries_fpath :
        Time-series file, loaded through ``ioutil.load_series``.
    base_folder :
        Destination folder for the result files; ``train.dat`` is looked up
        one directory above it and passed to the loader as the index file.
    k :
        Cluster count (anything ``int`` accepts).
    """
    def in_base(file_name):
        # Every output lives directly inside ``base_folder``.
        return os.path.join(base_folder, file_name)

    k = int(k)
    data = ioutil.load_series(
        tseries_fpath, os.path.join(base_folder, '..', 'train.dat'))

    result = ksc.inc_ksc(data, k)
    cents, memberships, offsets, dists_to_cent = result

    np.savetxt(in_base('cents.dat'), cents, fmt='%.5f')
    np.savetxt(in_base('assign.dat'), memberships, fmt='%d')
    np.savetxt(in_base('shift.dat'), offsets, fmt='%d')
    np.savetxt(in_base('dists_cent.dat'), dists_to_cent, fmt='%.5f')
def _save_line_plot(series, out_fpath):
    """Plot ``series`` as a solid black line with both axes hidden, save, close."""
    plt.plot(series, '-k')
    axes = plt.gca()
    axes.get_xaxis().set_visible(False)
    axes.get_yaxis().set_visible(False)
    plt.savefig(out_fpath)
    plt.close()


def main(tseries_fpath, k, plot_foldpath):
    """Cluster the series in ``tseries_fpath`` and plot every centroid.

    Parameters
    ----------
    tseries_fpath :
        Text file readable by ``np.genfromtxt``; its first column is dropped
        and the remaining columns are the series values.
    k :
        Number of clusters handed to ``ksc.inc_ksc``.
    plot_foldpath :
        Folder that receives the plots and the result files.

    For each centroid ``i`` three PDFs are written: ``i.pdf`` (as returned),
    ``i-peak-center.pdf`` (rolled so the maximum sits at the middle index) and
    ``i-min-first.pdf`` (rolled so the minimum sits at index 0). Afterwards
    ``cents.dat``, ``assign.dat``, ``shift.dat`` and ``dists_cent.dat`` are
    saved in the same folder.
    """
    import mkl
    mkl.set_num_threads(16)

    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:]

    # Rows that sum to zero get a small constant added to every entry.
    zero_rows = np.where(X.sum(axis=1) == 0)[0]
    X[zero_rows] += .001

    # The column slice above is a view; copy it into a fresh contiguous array
    # (presumably required by ksc.inc_ksc -- confirm).
    X = X.copy()

    cent, assign, shift, dists_cent = ksc.inc_ksc(X, k)

    for i, t_series in enumerate(cent):
        _save_line_plot(t_series, os.path.join(plot_foldpath, '%d.pdf' % i))

        # Roll the series so that its peak lands on the middle index.
        half = t_series.shape[0] // 2
        peak_centered = dist.shift(t_series, half - np.argmax(t_series),
                                   rolling=True)
        _save_line_plot(
            peak_centered,
            os.path.join(plot_foldpath, '%d-peak-center.pdf' % i))

        # Roll the series so that its minimum lands on index 0.
        min_first = dist.shift(t_series, 0 - np.argmin(t_series),
                               rolling=True)
        _save_line_plot(
            min_first,
            os.path.join(plot_foldpath, '%d-min-first.pdf' % i))

    np.savetxt(os.path.join(plot_foldpath, 'cents.dat'), cent, fmt='%.5f')
    np.savetxt(os.path.join(plot_foldpath, 'assign.dat'), assign, fmt='%d')
    np.savetxt(os.path.join(plot_foldpath, 'shift.dat'), shift, fmt='%d')
    np.savetxt(os.path.join(plot_foldpath, 'dists_cent.dat'), dists_cent,
               fmt='%.5f')
def cluster(T, num_clust=5):
    '''
    Runs the KSC algorithm on time series matrix T.

    Parameters
    ----------
    T : array-like of shape (row, time series length)
        The time series to cluster. The input is not modified; a shifted
        copy is what gets clustered.
    num_clust : int
        Number of clusters to create

    Returns
    -------
    cents, assign
        The first two values returned by ``ksc.inc_ksc`` (the centroids and
        the per-row cluster assignment).
    '''
    # Converting first lets plain nested lists through (list + float raises
    # TypeError). The addition always allocates a new array, so the caller's
    # data is never touched.
    # NOTE(review): the 1e-20 offset presumably guards against all-zero
    # series -- confirm.
    shifted = np.asarray(T) + 1e-20

    # Copies only when the sum is not already C-contiguous.
    shifted = np.ascontiguousarray(shifted)

    cents, assign, _, _ = ksc.inc_ksc(shifted, num_clust)
    return cents, assign
# NOTE(review): the `return` below is the tail of a definition that starts
# above the visible chunk; it is left untouched.
    return ksc_input


# Script entry point: read the daily-frequency CSV, cluster the prepared
# matrix with KSC, and write one (word, cluster) row per input row.
if __name__ == "__main__":
    df = pd.read_csv(
        "new_daily_frequencies/zika_jj_cols_daily_frequencies_new.csv",
        encoding="iso-8859-1")
    # df = pd.read_csv("Frequencies.csv")

    # Turn the raw frame into the KSC input, then copy it in C order before
    # handing it to ksc.inc_ksc.
    ksc_input = alter_inputs(df)
    ksc_input = ksc_input.copy(order='C')

    # Disabled min-max normalization experiment, kept for reference:
    # ksc_input_values = ksc_input.values
    #min_max_scaler = preprocessing.MinMaxScaler()
    #x_scaled = min_max_scaler.fit_transform(ksc_input)
    # ksc_input_normalized = pd.DataFrame(x_scaled)

    k = 3  # Number of clusters
    # Only `assign` is used below; the other three outputs are ignored.
    centers, assign, series_shifts, dists = ksc.inc_ksc(ksc_input, k)

    # Creating a Dataframe to store Hashtags and Cluster
    result_df = pd.DataFrame(columns=['Words', 'Cluster'])
    result_df['Words'] = df['Words']
    result_df['Cluster'] = assign

    # Written without the index column, encoded as UTF-8.
    result_df.to_csv(
        "new_daily_frequencies/Words_Daily_Frequencies_3Clusters.csv",
        index=False, encoding='utf-8')