def get_j_set(db_file, out_file, sort_list_out_file):
    """Select the syllables whose id contains 'j' and rank them by duration.

    Args:
        db_file: Path of the syllable database, read with
            ``Utility.load_obj``.  Every entry is indexed with the keys
            ``'id'``, ``'dur'`` and ``'stress'``.
        out_file: Path where the list of selected entries is saved.
        sort_list_out_file: Path where the ``(id, duration, stress)`` tuples
            are saved after ``Utility.sort_by_index(..., 1)``.
    """
    j_set_db = []
    j_set_sort_list = []

    for syl in Utility.load_obj(db_file):
        if 'j' not in syl['id']:
            continue
        j_set_db.append(syl)
        # The sort key is the summed duration with the first element of
        # syl['dur'] left out -- not just the last element, and it is 0 when
        # there is nothing to sum.
        dur = sum(d for idx, d in enumerate(syl['dur']) if idx != 0)
        j_set_sort_list.append((syl['id'], dur, syl['stress']))

    Utility.sort_by_index(j_set_sort_list, 1)
    print(j_set_sort_list)

    Utility.save_obj(j_set_sort_list, sort_list_out_file)
    Utility.save_obj(j_set_db, out_file)
def is_finish(rmse_list):
    """Report whether the two leading entries of ``rmse_list`` agree.

    With fewer than two entries the search is considered finished.
    Otherwise the list is handed to ``Utility.sort_by_index(rmse_list, 0)``
    and the first two entries are compared component by component: the
    result is False as soon as one pair differs by more than the
    module-level ``threshold``, True otherwise.
    """
    if len(rmse_list) < 2:
        print('Finish cause no more input')
        return True

    Utility.sort_by_index(rmse_list, 0)
    print(rmse_list)

    leading, following = rmse_list[0], rmse_list[1]
    exceeds = any(abs(x - y) > threshold for x, y in zip(leading, following))
    return not exceeds
# the histogram of the data n, bins, patches = plt.hist(errors_list, 100, normed=1, facecolor='green', alpha=0.75) plt.savefig('hist.eps') Utility.save_obj(errors, './errors_dict.pkl') rmse = np.sqrt(sklearn.metrics.mean_squared_error( true, dct_regen)) * 1200 / np.log(2) print 'Coeff {} all rmse : '.format(coeff), rmse Utility.sort_by_index(errors_tuple, 1) Utility.write_to_file_line_by_line('./errors_sorted.txt', errors_tuple) print len(syl_dct) base = '/work/w2/decha/Data/GPR_speccom_data/Interspeech2017/tone_separated/' Utility.make_directory(base) Utility.save_obj( syl_dct, '/work/w2/decha/Data/GPR_speccom_data/Interspeech2017/tone_separated/tone_all_dct_coeff_{}.pkl' .format(coeff)) for t in range(5):
def run(main_fig_path, out_path):
    """Write one LaTeX figure list per (merged vowel class, tone) pair.

    For every merged vowel class and every tone, the figure files found
    under ``<main_fig_path>/<sub class>/<tone>/`` are pooled over all sub
    classes of the merged class, sorted with
    ``Utility.sort_by_index(..., 1)`` (index 1 is the duration parsed from
    the file name), arranged in rows of ``fig_per_line`` figures and handed
    to ``Latext_Tool.plot_all_data``.

    Args:
        main_fig_path: Root directory holding ``<vowel type>/<tone>/``
            folders of figure files.
        out_path: Root directory for the generated ``.tex`` files; one
            sub-directory per merged vowel class is created in it.
    """
    tones = ['0', '1', '2', '3', '4']

    # Merged vowel class -> the per-class figure folders pooled into it.
    merged_vowel_types = [
        ('vvv', ['v', 'vv']),
        ('vvvn', ['vn', 'vvn']),
        ('vvvsg', ['vsg', 'vvsg']),
    ]

    # Caption colour for each group id returned by find_group.
    c = {-1: 'black', 0: 'blue', 1: 'red', 2: 'green', 3: 'yellow'}

    fig_per_line = 4

    # Example file name:
    # tscsd_manual_f17_22_tone_3_dur_15.025_syl_n-aa-m^_stress_0
    # Compiled once here instead of once per file.
    pattern = re.compile(
        r"""(?P<name>.+)_tone.+dur_(?P<dur>.+)_syl.+_stress_(?P<stress>.+).eps""",
        re.VERBOSE)

    for v, sub_types in merged_vowel_types:
        for t in tones:
            dire = '{}/{}/'.format(out_path, v)
            Utility.make_directory(dire)
            latext_out_file = '{}/stress-list_vowel-type-{}_Tone-{}.tex'.format(
                dire, v, t)

            # One list per (v, t): it must be created before the sub-class
            # loop so that every sub class contributes to the same output
            # file instead of only the last one.
            file_list = []
            for vi in sub_types:
                path = '{}/{}/{}/'.format(main_fig_path, vi, t)
                for f in Utility.list_file(path):
                    if f.startswith('.'):
                        continue
                    match = pattern.match(f)
                    if not match:
                        # Report file names that do not follow the scheme.
                        print(f)
                        continue
                    dur = float(match.group('dur'))
                    if int(match.group('stress')) == 1:
                        stress = 'Stress'
                    else:
                        stress = 'Unstress'
                    file_list.append(
                        ('{}/{}'.format(path, f), dur, stress,
                         match.group('name')))

            Utility.sort_by_index(file_list, 1)

            eps_list = []
            caption_list = []
            temp_eps = []
            temp_cap = []
            for file_path, dur, stress, name in file_list:
                # The group is looked up with the raw name, before the
                # underscores are escaped for LaTeX.
                color = c[find_group(t, v, name)]
                name = name.replace('_', '\\_')

                temp_eps.append(file_path)
                # Produces e.g. {\color{red} some\_name }
                temp_cap.append('{{\\color{{{}}} {} }}'.format(color, name))

                if fig_per_line == len(temp_eps):
                    eps_list.append(temp_eps)
                    caption_list.append(temp_cap)
                    temp_eps = []
                    temp_cap = []

            # The last, possibly partial (or empty) row is always appended.
            eps_list.append(temp_eps)
            caption_list.append(temp_cap)

            # A single row means fewer than fig_per_line figures were found:
            # nothing is written for this (v, t).
            if len(eps_list) == 1:
                continue

            Latext_Tool.plot_all_data(eps_list, caption_list, latext_out_file)
def get_data(path):
    """Split the clustered names under ``path`` into unstressed and stressed.

    Reads the GP model and the per-group name lists stored under ``path``,
    takes the last and second-to-last entries of the group list (after
    ``Utility.sort_by_index(sort_list, 1)``, i.e. keyed on group size) as
    the unstressed and stressed groups, saves their means, and assigns the
    names of every remaining group to one of the two lists by comparing
    ``-log`` RBF-kernel values against both means.

    Side effects: writes ``mean_of_unstress_stress.npy`` under ``path`` and
    fills the module-level ``out_dict`` with one score per name (1.0 for
    every stressed name, a min-max scaled value for unstressed names).

    Returns:
        The tuple ``(unstressed_list, stressed_list)``.

    NOTE(review): the original indentation of this function was lost; the
    nesting below is reconstructed from the statement order -- confirm.
    """
    # Loaded but not used anywhere below in this function.
    clustered_label = Utility.load_obj('{}/clustered_label.npy'.format(path))
    # Loaded but not used anywhere below in this function.
    best_measure_params = Utility.load_obj('{}/best_measure_params.npy'.format(path))

    name_index = np.array( Utility.load_obj('{}/name_index.npy'.format(path)) )

    m = Utility.load_obj('{}/GP_model.npy'.format(path))
    # presumably the latent-space mean, one row per name -- TODO confirm
    model = m.X.mean
    model = np.array(model)

    input_sensitivity = m.input_sensitivity()
    print 'Input sent : ', input_sensitivity

    # Collect (group id, number of names) for every group file that exists.
    group_list = ['-1', '0', '1', '2']
    sort_list = []
    for group in group_list:
        g = '{}/group_list/{}.npy'.format(path, group)
        if Utility.is_file_exist(g):
            names = Utility.load_obj(g)
            sort_list.append((group, len(names)))

    # Ordered on the member count; presumably ascending, so that the last
    # entry is the largest group -- TODO confirm against Utility.
    Utility.sort_by_index(sort_list, 1)
    print sort_list

    # Last entry -> unstressed, second-to-last entry -> stressed.
    unstressed_mean = get_means(sort_list, len(sort_list)-1, name_index, model, path)
    unstressed_list = Utility.load_obj(
        '{}/group_list/{}.npy'.format(path, sort_list[len(sort_list)-1][0]) )

    stressed_mean = get_means(sort_list, len(sort_list)-2, name_index, model, path)
    stressed_list = Utility.load_obj(
        '{}/group_list/{}.npy'.format(path, sort_list[len(sort_list)-2][0]) )

    Utility.save_obj(
        {'unstress_mean': unstressed_mean, 'stress_mean': stressed_mean},
        '{}/mean_of_unstress_stress.npy'.format(path) )

    # The kernel lengthscale is the reciprocal of the model's sensitivities.
    lengthscale=1/np.array(input_sensitivity, dtype=float)
    # NOTE(review): input dimension is hard-coded to 10 -- presumably the
    # latent dimensionality; confirm it matches len(lengthscale).
    kernel = GPy.kern.RBF(10, lengthscale=lengthscale, ARD=True)

    print len(unstressed_list), len(stressed_list)

    # Redistribute the names of every group except the last two entries
    # (those two are the reference groups chosen above).
    for idx, g in enumerate(sort_list):
        if idx == (len(sort_list)-2):
            break
        names = Utility.load_obj('{}/group_list/{}.npy'.format(path, g[0]))
        print len(names)
        latent_data = get_data_from_a_giving_name_index(names, name_index, model)
        # Column 0 is computed against unstressed_mean, column 1 against
        # stressed_mean.
        stu = np.array([unstressed_mean, stressed_mean])
        distance = -1*np.log(kernel.K( latent_data, stu ))
        for n, dis in zip(names, distance):
            # NOTE(review): a name goes to unstressed_list when its value
            # for unstressed_mean is the LARGER one -- confirm this is the
            # intended direction.
            if dis[0] > dis[1]:
                unstressed_list = np.append(unstressed_list, n)
            else :
                stressed_list = np.append(stressed_list, n)

    print len(unstressed_list), len(stressed_list)
    # print unstressed_list, stressed_list

    unstressed_data = get_data_from_a_giving_name_index(unstressed_list, name_index, model)
    # Computed but not used below.
    stressed_data = get_data_from_a_giving_name_index(stressed_list, name_index, model)

    # -log kernel value of every unstressed point against the stressed mean.
    unstress_distance = -1*np.log(kernel.K( unstressed_data, np.array([stressed_mean]) ))
    # A 0.0 is appended before scaling, so the value 0 takes part in the
    # min/max computation; the extra trailing element is dropped again by
    # the zip() with unstressed_list below.
    unstress_distance = np.append(unstress_distance, 0.0)
    # MinMaxScaler is given a column vector, hence the reshape to (n, 1)
    # and back to a flat array afterwards.
    unstress_distance = np.reshape(unstress_distance, (len(unstress_distance),1 ))
    min_max_scaler = preprocessing.MinMaxScaler()
    unstress_distance = min_max_scaler.fit_transform(unstress_distance)
    unstress_distance = np.reshape(unstress_distance, len(unstress_distance))
    # Invert the scaled value: the largest distance maps to 0, the smallest
    # to 1.
    unstress_distance = 1 - unstress_distance

    # Unstressed names get their inverted value rounded to four decimals.
    for nam, dis in zip(unstressed_list, unstress_distance):
        out_dict[nam] = float("{0:.4f}".format(dis))

    # Every stressed name gets the fixed score 1.0.
    for nam, dis in zip(stressed_list, [1.0] * len(stressed_list)):
        out_dict[nam] = dis

    return (unstressed_list, stressed_list)
child = '' for opt in base_opt: child = child + '{}:'.format(opt) if not child in rmse_list: optimal_queue.append((name, base_opt)) print rmse_list print '-------------------------' tups = [] for key in rmse_list: tups.append((key, rmse_list[key])) Utility.sort_by_index(tups, 1) print tups Utility.save_obj( tups, '{}/results_{}.pkl'.format(outname, strftime("%Y-%m-%d %H:%M:%S", gmtime()))) # for idx, oooo in enumerate(st) : # if (st[idx]-ed[idx]) == 0: # optimal_threshold.append(st[idx]) # continue # print 'start : ', st # rmse_list = []
def run(X, labels_true, path, dominant, inverselengthscale, stress_only=False, stress_list=None):
    """Grid-search DBSCAN on a kernel-derived distance and keep the best fit.

    The distance matrix is ``-log(K(X, X))`` for an ARD RBF kernel whose
    lengthscale is the reciprocal of ``inverselengthscale``.  Every
    ``(eps, min_samples)`` pair of the grid is scored with the V-measure
    against ``labels_true``; the pair that ends up last after
    ``Utility.sort_by_index(measure_list, 2)`` is refitted, reported,
    saved and plotted.

    Args:
        X: Samples, one row per sample.
        labels_true: Reference labels (1 and 0 are counted as stress and
            unstress).
        path: Output directory for ``best_measure_params.npy``,
            ``clustered_label.npy``, ``clustering_result.txt`` and the plot.
        dominant: Pair of column indices of ``X`` used as plot axes.
        inverselengthscale: Per-dimension values whose reciprocal is the
            kernel lengthscale.
        stress_only: When true, keep only the samples whose entry in
            ``stress_list`` equals 1.
        stress_list: Per-sample flags, only read when ``stress_only``.

    Returns:
        ``(labels_true, labels)`` where ``labels`` are the DBSCAN labels of
        the best parameter pair.

    Raises:
        IndexError: If no parameter pair of the grid could be evaluated.
    """
    X = np.array(X)
    labels_true = np.array(labels_true)

    if stress_only:
        print('stress_only')
        # np.asarray makes the comparison element-wise for plain lists too.
        stress_index = np.where(np.asarray(stress_list) == 1)
        print(stress_index)
        X = np.copy(X[stress_index])
        labels_true = np.copy(labels_true[stress_index])

    print('Stress : {}, Unstress: {}'.format(
        len(np.where(labels_true == 1)[0]),
        len(np.where(labels_true == 0)[0])))

    lengthscale = 1 / np.array(inverselengthscale, dtype=float)
    kernel = GPy.kern.RBF(len(X[0]), lengthscale=lengthscale, ARD=True)
    print(lengthscale)

    # Negative log of the kernel matrix is used as the precomputed distance.
    XX = -1 * np.log(kernel.K(X, X))

    incre = 0.00001  # step of the eps grid
    jncre = 1        # step of the min_samples grid

    measure_list = []  # (eps, min_samples, v_measure) per evaluated pair
    outfile = []       # lines of the text report

    xx_stats = (np.mean(XX), np.amin(XX), np.amax(XX))
    print('{} {}'.format(XX.shape, len(labels_true)))
    print('Mean, min, max')
    print('{} {} {}'.format(*xx_stats))

    outfile.append('Incre : {}'.format(incre))
    outfile.append('Mean, min, max')
    outfile.append('{}, {}, {}'.format(*xx_stats))

    # Both grids are walked from the largest value down to the smallest.
    for i in np.flipud(np.arange(0.00, 0.01, incre)):
        for j in np.flipud(np.arange(jncre, 40.0, jncre)):
            try:
                db = DBSCAN(eps=i, min_samples=j, metric='precomputed').fit(XX)
                score = metrics.v_measure_score(labels_true, db.labels_)
            except Exception:
                # Best effort: a pair that cannot be fitted or scored is
                # skipped.  KeyboardInterrupt/SystemExit still propagate.
                continue
            measure_list.append((i, j, score))

    Utility.sort_by_index(measure_list, 2)

    if not measure_list:
        # Nothing could be evaluated: fail with an explicit message instead
        # of an anonymous index error further down.
        message = 'Error: Cannot find best at : {}'.format(path)
        print(message)
        raise IndexError(message)

    best = measure_list[-1]
    v_best = best[2]
    print('Best : {}'.format(best))
    outfile.append('Best : {}'.format(best))

    # List every pair that reaches the same score as the chosen one.
    for m in measure_list:
        if m[2] == v_best:
            print(m)
            outfile.append('{}'.format(m))

    # Refit with the chosen pair; min_samples comes from a float grid, so
    # it is converted to int here.
    db = DBSCAN(
        eps=best[0],
        min_samples=int(best[1]),
        metric='precomputed').fit(XX)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Cluster ids carry no meaning, so accuracy is also measured against
    # the reference labels with 0 and 1 exchanged; the better one is kept.
    acc = accuracy_score(labels_true, labels)
    swap = np.copy(labels_true)
    stress_index = np.where(swap == 1)
    unstress_index = np.where(swap == 0)
    swap[stress_index] = 0
    swap[unstress_index] = 1
    acc_swap = accuracy_score(swap, labels)
    if acc_swap > acc:
        acc = acc_swap
    print('Accuracy score : {} / swap: {}'.format(acc, acc_swap))

    # Persist the chosen (eps, min_samples, v_measure) tuple itself, not
    # its position in measure_list.
    Utility.save_obj(best, '{}/best_measure_params.npy'.format(path))
    Utility.save_obj(labels, '{}/clustered_label.npy'.format(path))

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # Each metric is computed once and reported both on stdout and in the
    # text report.
    summary = [
        'Estimated number of clusters: %d' % n_clusters_,
        "Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels),
        "Completeness: %0.3f" % metrics.completeness_score(labels_true, labels),
        "V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels),
        "Adjusted Rand Index: %0.3f"
        % metrics.adjusted_rand_score(labels_true, labels),
        "Adjusted Mutual Information: %0.3f"
        % metrics.adjusted_mutual_info_score(labels_true, labels),
    ]
    for line in summary:
        print(line)
    outfile.extend(summary)

    Utility.write_to_file_line_by_line('{}/clustering_result.txt'.format(path), outfile)

    ##########################################################################
    # Plot result
    import matplotlib.pyplot as plt

    plt.clf()

    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'

        class_member_mask = (labels == k)

        # Core samples are drawn large, the other members small.
        xy = X[class_member_mask & core_samples_mask]
        plt.plot(xy[:, dominant[0]], xy[:, dominant[1]], 'o',
                 markerfacecolor=col, markeredgecolor='k', markersize=14)

        xy = X[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, dominant[0]], xy[:, dominant[1]], 'o',
                 markerfacecolor=col, markeredgecolor='k', markersize=6)

    plt.title('Estimated number of clusters: %d' % n_clusters_)

    figure_path = '{}/stress_unstress_clustering_lengthscale.eps'.format(path)
    print(figure_path)
    plt.savefig(figure_path)

    return labels_true, labels