def domanda_4():
    """Cluster ``dataset_212`` hierarchically into 9 clusters and draw them.

    The ``weighted`` name from the surrounding scope is forwarded to
    ``hierarchical_clustering`` and, when truthy, adds the " (v. pesata)"
    suffix to the plot title.  Only the first element of the returned
    triple (the clustering) is used.
    """
    print("Computing: hierchical clustering on dataset_212...")
    clustering_result, _, _ = hierarchical_clustering(dataset_212, 9, weighted)

    print("Drawing...")
    title = "Clustering gerarchico su 212 contee"
    if weighted:
        title += " (v. pesata)"
    draw_clustering(clustering_result, title)
def visualize_data(cluster_input, data, method=None, display_centers=False):
    """Load a data table, cluster it with the chosen method and plot it.

    Args:
        cluster_input: Forwarded to the clustering routine.  For the
            sequential (default) and hierarchical methods it is passed as
            the single extra argument; for k-means it is indexed, and
            ``cluster_input[0]`` / ``cluster_input[1]`` are passed as the two
            extra arguments (presumably number of clusters and number of
            iterations -- confirm against ``kmeans_clustering``).
        data: Source handed to ``load_data_table``.
        method: ``None`` for sequential clustering,
            ``'hierarchical_clustering'`` or ``'kmeans_clustering'``.
        display_centers: Forwarded to
            ``alg_clusters_matplotlib.plot_clusters``.

    Returns:
        None.  When ``method`` is not recognized an error line is printed
        and nothing is plotted.
    """
    data_table = load_data_table(data)

    # One singleton cluster per row of the table.
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    if method is None:
        cluster_list = sequential_clustering(singleton_list, cluster_input)
        print("Displaying", len(cluster_list), "sequential clusters")
    elif method == 'hierarchical_clustering':
        cluster_list = clustering.hierarchical_clustering(singleton_list,
                                                          cluster_input)
        print("Displaying", len(cluster_list), "hierarchical clusters")
    elif method == 'kmeans_clustering':
        cluster_list = clustering.kmeans_clustering(singleton_list,
                                                    cluster_input[0],
                                                    cluster_input[1])
        print("Displaying", len(cluster_list), "k-means clusters")
    else:
        print("ERROR: method entered into visualize_data not recognized")
        # No clustering was computed, so there is nothing to plot; falling
        # through would fail on the unbound ``cluster_list``.
        return

    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list,
                                          display_centers)
def run_example():
    """Cluster the ``DATA_111_URL`` table hierarchically and print distortion.

    Loads the table, wraps every row in a singleton ``alg_cluster.Cluster``,
    reduces the list to 9 clusters with
    ``alg_project3_solution.hierarchical_clustering`` and prints the value
    returned by ``ca.compute_distortion`` for the result.  Nothing is
    plotted.
    """
    data_table = load_data_table(DATA_111_URL)
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    cluster_list = alg_project3_solution.hierarchical_clustering(
        singleton_list, 9)

    # Single-argument print calls produce the same output on Python 2 and 3.
    print("Displaying %s hierarchical clusters" % len(cluster_list))
    print(ca.compute_distortion(cluster_list, data_table))
def domanda_1():
    """Cluster ``dataset_full`` hierarchically into 15 clusters and draw them.

    The ``weighted`` name from the surrounding scope is forwarded to
    ``hierarchical_clustering`` and, when truthy, adds the " (v. pesata)"
    suffix to the plot title.  Only the first element of the returned
    triple (the clustering) is used.
    """
    print("Computing: hierchical clustering on dataset_full...")
    clustering_result, _, _ = hierarchical_clustering(dataset_full, 15,
                                                      weighted)

    print("Drawing...")
    title = "Clustering gerarchico sull'intero dataset"
    if weighted:
        title += " (v. pesata)"
    draw_clustering(clustering_result, title)
def domanda_6():
    """Print the distortion of both clustering methods on ``dataset_212``.

    Runs ``hierarchical_clustering`` with 9 clusters and
    ``kmeans_clustering`` with arguments ``9, 5``, each with the ``weighted``
    name from the surrounding scope, and prints the third element of each
    returned triple as the distortion.
    """
    print("Computing: hierchical clustering on dataset_212...")
    _, _, hierarchical_distortion = hierarchical_clustering(
        dataset_212, 9, weighted)
    print("Distortion for hierchical clustering:", hierarchical_distortion)

    print("Computing: kmeans clustering on dataset_212...")
    _, _, kmeans_distortion = kmeans_clustering(dataset_212, 9, 5, weighted)
    print("Distortion for kmeans clustering:", kmeans_distortion)
def test_hierarchical24():
    """Check ``student.hierarchical_clustering`` on the 24-county table.

    For every target cluster count from 23 down to 2 the student function
    is run on a freshly built list of singleton clusters (it is allowed to
    mutate its input), and the resulting set of county tuples is compared
    with the expected set through a ``poc_simpletest.TestSuite``.
    """
    # Single-argument print calls behave the same on Python 2 and 3.
    print("")
    print("Testing hierarchical_clustering on 24 county set")

    # load small data table
    data_24_table = load_data_table(DATA_24_URL)

    # Test data of the form [size of output cluster, set of county tuples].
    # For quick debugging, trim this list to its first entry.
    hierdata_24 = [
        [23, set([('11001', '51013'), ('01073',), ('06059',), ('06037',),
                  ('06029',), ('06071',), ('06075',), ('08031',),
                  ('24510',), ('34013',), ('34039',), ('34017',),
                  ('36061',), ('36005',), ('36047',), ('36059',),
                  ('36081',), ('41051',), ('41067',), ('51840',),
                  ('51760',), ('55079',), ('54009',)])],
        [22, set([('11001', '51013'), ('36047', '36081'), ('01073',),
                  ('06059',), ('06037',), ('06029',), ('06071',),
                  ('06075',), ('08031',), ('24510',), ('34013',),
                  ('34039',), ('34017',), ('36061',), ('36005',),
                  ('36059',), ('41051',), ('41067',), ('51840',),
                  ('51760',), ('55079',), ('54009',)])],
        [21, set([('11001', '51013'), ('36005', '36061'),
                  ('36047', '36081'), ('01073',), ('06059',), ('06037',),
                  ('06029',), ('06071',), ('06075',), ('08031',),
                  ('24510',), ('34013',), ('34039',), ('34017',),
                  ('36059',), ('41051',), ('41067',), ('51840',),
                  ('51760',), ('55079',), ('54009',)])],
        [20, set([('11001', '51013'), ('36005', '36061'),
                  ('36047', '36081'), ('01073',), ('06059',), ('06037',),
                  ('06029',), ('06071',), ('06075',), ('08031',),
                  ('24510',), ('34039',), ('34013', '34017'), ('36059',),
                  ('41051',), ('41067',), ('51840',), ('51760',),
                  ('55079',), ('54009',)])],
        [19, set([('34013', '34017', '34039'), ('11001', '51013'),
                  ('36005', '36061'), ('36047', '36081'), ('01073',),
                  ('06059',), ('06037',), ('06029',), ('06071',),
                  ('06075',), ('08031',), ('24510',), ('36059',),
                  ('41051',), ('41067',), ('51840',), ('51760',),
                  ('55079',), ('54009',)])],
        [18, set([('34013', '34017', '34039'), ('11001', '51013'),
                  ('01073',), ('06059',), ('06037',), ('06029',),
                  ('06071',), ('06075',), ('08031',), ('24510',),
                  ('36059',), ('36005', '36047', '36061', '36081'),
                  ('41051',), ('41067',), ('51840',), ('51760',),
                  ('55079',), ('54009',)])],
        [17, set([('11001', '51013'), ('01073',), ('06059',), ('06037',),
                  ('06029',), ('06071',), ('06075',), ('08031',),
                  ('24510',), ('36059',),
                  ('34013', '34017', '34039', '36005', '36047', '36061',
                   '36081'),
                  ('41051',), ('41067',), ('51840',), ('51760',),
                  ('55079',), ('54009',)])],
        [16, set([('11001', '51013'), ('01073',), ('06059',), ('06037',),
                  ('06029',), ('06071',), ('06075',), ('08031',),
                  ('24510',),
                  ('34013', '34017', '34039', '36005', '36047', '36059',
                   '36061', '36081'),
                  ('41051',), ('41067',), ('51840',), ('51760',),
                  ('55079',), ('54009',)])],
        [15, set([('11001', '51013'), ('01073',), ('06059',), ('06037',),
                  ('06029',), ('06071',), ('06075',), ('08031',),
                  ('24510',),
                  ('34013', '34017', '34039', '36005', '36047', '36059',
                   '36061', '36081'),
                  ('41051', '41067'), ('51840',), ('51760',), ('55079',),
                  ('54009',)])],
        [14, set([('01073',), ('06059',), ('06037',), ('06029',),
                  ('06071',), ('06075',), ('08031',),
                  ('34013', '34017', '34039', '36005', '36047', '36059',
                   '36061', '36081'),
                  ('41051', '41067'), ('51840',), ('51760',), ('55079',),
                  ('54009',), ('11001', '24510', '51013')])],
        [13, set([('06037', '06059'), ('01073',), ('06029',), ('06071',),
                  ('06075',), ('08031',),
                  ('34013', '34017', '34039', '36005', '36047', '36059',
                   '36061', '36081'),
                  ('41051', '41067'), ('51840',), ('51760',), ('55079',),
                  ('54009',), ('11001', '24510', '51013')])],
        [12, set([('06037', '06059'), ('01073',), ('06029',), ('06071',),
                  ('06075',), ('08031',),
                  ('34013', '34017', '34039', '36005', '36047', '36059',
                   '36061', '36081'),
                  ('41051', '41067'), ('51760',), ('55079',), ('54009',),
                  ('11001', '24510', '51013', '51840')])],
        [11, set([('06029', '06037', '06059'), ('01073',), ('06071',),
                  ('06075',), ('08031',),
                  ('34013', '34017', '34039', '36005', '36047', '36059',
                   '36061', '36081'),
                  ('41051', '41067'), ('51760',), ('55079',), ('54009',),
                  ('11001', '24510', '51013', '51840')])],
        [10, set([('06029', '06037', '06059'), ('01073',), ('06071',),
                  ('06075',), ('08031',),
                  ('34013', '34017', '34039', '36005', '36047', '36059',
                   '36061', '36081'),
                  ('41051', '41067'), ('55079',), ('54009',),
                  ('11001', '24510', '51013', '51760', '51840')])],
        [9, set([('01073',), ('06029', '06037', '06059', '06071'),
                 ('06075',), ('08031',),
                 ('34013', '34017', '34039', '36005', '36047', '36059',
                  '36061', '36081'),
                 ('41051', '41067'), ('55079',), ('54009',),
                 ('11001', '24510', '51013', '51760', '51840')])],
        [8, set([('01073',), ('06029', '06037', '06059', '06071'),
                 ('06075',), ('08031',), ('41051', '41067'), ('55079',),
                 ('54009',),
                 ('11001', '24510', '34013', '34017', '34039', '36005',
                  '36047', '36059', '36061', '36081', '51013', '51760',
                  '51840')])],
        [7, set([('01073',), ('06029', '06037', '06059', '06071'),
                 ('06075',), ('08031',), ('41051', '41067'), ('55079',),
                 ('11001', '24510', '34013', '34017', '34039', '36005',
                  '36047', '36059', '36061', '36081', '51013', '51760',
                  '51840', '54009')])],
        [6, set([('06029', '06037', '06059', '06071', '06075'),
                 ('01073',), ('08031',), ('41051', '41067'), ('55079',),
                 ('11001', '24510', '34013', '34017', '34039', '36005',
                  '36047', '36059', '36061', '36081', '51013', '51760',
                  '51840', '54009')])],
        [5, set([('06029', '06037', '06059', '06071', '06075'),
                 ('08031',), ('41051', '41067'), ('01073', '55079'),
                 ('11001', '24510', '34013', '34017', '34039', '36005',
                  '36047', '36059', '36061', '36081', '51013', '51760',
                  '51840', '54009')])],
        [4, set([('06029', '06037', '06059', '06071', '06075'),
                 ('01073', '11001', '24510', '34013', '34017', '34039',
                  '36005', '36047', '36059', '36061', '36081', '51013',
                  '51760', '51840', '54009', '55079'),
                 ('08031',), ('41051', '41067')])],
        [3, set([('06029', '06037', '06059', '06071', '06075', '41051',
                  '41067'),
                 ('01073', '11001', '24510', '34013', '34017', '34039',
                  '36005', '36047', '36059', '36061', '36081', '51013',
                  '51760', '51840', '54009', '55079'),
                 ('08031',)])],
        [2, set([('01073', '11001', '24510', '34013', '34017', '34039',
                  '36005', '36047', '36059', '36061', '36081', '51013',
                  '51760', '51840', '54009', '55079'),
                 ('06029', '06037', '06059', '06071', '06075', '08031',
                  '41051', '41067')])],
    ]

    suite = poc_simpletest.TestSuite()

    for num_clusters, expected_county_tuple in hierdata_24:
        # Build the initial clusters anew for each case since the function
        # under test is allowed to mutate its input list.
        cluster_list = [
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4])
            for line in data_24_table
        ]

        # compute student answer
        student_clustering = student.hierarchical_clustering(cluster_list,
                                                             num_clusters)
        student_county_tuple = set_of_county_tuples(student_clustering)

        # Prepare test
        error_message = ("Testing hierarchical_clustering on 24 county table, "
                         "num_clusters = " + str(num_clusters))
        error_message += "\nStudent county tuples: " + str(student_county_tuple)
        error_message += "\nExpected county tuples: " + str(
            expected_county_tuple)
        suite.run_test(student_county_tuple == expected_county_tuple, True,
                       error_message)

    suite.report_results()
def cluster_features(contours, cnt_dicts, drawer, edge_type, do_draw=False):
    """Group contours whose size, shape and color labels all coincide.

    ``hierarchical_clustering`` is run once per feature ('size', 'shape',
    'color') on the values read from ``cnt_dicts``.  Each contour then gets
    the triple (size label, shape label, color label); contours sharing the
    same triple form a group, and triples that occur fewer than two times
    are discarded.

    Args:
        contours: Sequence indexed in parallel with ``cnt_dicts``.
        cnt_dicts: Sequence of mappings providing the 'size', 'shape' and
            'color' entries.
        drawer: Object providing ``blank_img``, ``draw_same_color`` and
            ``save``.
        edge_type: Forwarded to ``hierarchical_clustering`` and used in the
            descriptions passed to ``drawer.save``.
        do_draw: When true, the per-feature images and the final group image
            are saved through ``drawer``.

    Returns:
        A list of groups, each a list of the ``cnt_dicts`` entries that
        belong to it.
    """
    labels_by_feature = {}
    for feature_type in ['size', 'shape', 'color']:
        feature_values = [entry[feature_type] for entry in cnt_dicts]
        # One label per entry of feature_values.
        labels = hierarchical_clustering(feature_values, feature_type,
                                         edge_type, drawer, do_draw)
        labels_by_feature[feature_type] = labels

        if not do_draw:
            continue

        canvas = drawer.blank_img()
        for label in set(labels):
            same_label_contours = [
                contour for position, contour in enumerate(contours)
                if labels[position] == label
            ]
            canvas = drawer.draw_same_color(same_label_contours, canvas)
        drawer.save(
            canvas,
            'f_Feature{}_{}'.format(feature_type.capitalize(), edge_type))

    # Combine the three labels per contour, e.g. (0, 1, 1), (2, 0, 1).
    combine_labels = list(
        zip(labels_by_feature['size'], labels_by_feature['shape'],
            labels_by_feature['color']))

    # Collect the final groups; the image is drawn on every call and only
    # saved when do_draw is set.
    canvas = drawer.blank_img()
    groups_cnt_dicts = []
    for combined in set(combine_labels):
        if combine_labels.count(combined) < 2:
            continue

        member_positions = [
            position for position, label in enumerate(combine_labels)
            if label == combined
        ]
        groups_cnt_dicts.append([cnt_dicts[i] for i in member_positions])
        canvas = drawer.draw_same_color(
            [contours[i] for i in member_positions], canvas)

    if do_draw:
        drawer.save(canvas, 'g_OriginalResult_{}'.format(edge_type))
    return groups_cnt_dicts
def hier_dist(data_url):
    """Compute the distortion of hierarchical clustering for 20..6 clusters.

    The table at ``data_url`` is loaded and turned into singleton clusters.
    The cluster list is then reduced step by step, from 20 clusters down to
    6, feeding each result into the next call, and the distortion of every
    intermediate clustering is recorded.

    Returns:
        A dict mapping the number of clusters to the value returned by
        ``ca.compute_distortion`` for that clustering.
    """
    data_table = load_data_table(data_url)
    cluster_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    distortion_by_size = {}
    for num_clust in range(20, 5, -1):
        # Each round starts from the previous round's clustering.
        cluster_list = alg_project3_solution.hierarchical_clustering(
            cluster_list, num_clust)
        distortion_by_size[num_clust] = ca.compute_distortion(
            cluster_list, data_table)
    return distortion_by_size
def compute_q5_q6():
    """Print k-means and hierarchical distortion for the 111-county table.

    Both methods produce 9 clusters from the same list of singleton
    clusters; k-means is called with the extra argument 5.
    """
    table111 = viz_tools.load_data_table(DATA_111_URL)

    # Wrap every row in a singleton cluster.
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in table111
    ]

    # K-means must run first because clustering.hierarchical_clustering
    # mutates the list of clusters it is given.
    kmeans_clusters = clustering.kmeans_clustering(singleton_list, 9, 5)
    print("K-means Distortion: {}".format(
        compute_distortion(kmeans_clusters, table111)))

    hierarchical_clusters = clustering.hierarchical_clustering(
        singleton_list, 9)
    print("Hierarchical Distortion: {}".format(
        compute_distortion(hierarchical_clusters, table111)))
def test_compute_distortion():
    """Print k-means and hierarchical distortion for the 290-county table.

    Both methods produce 16 clusters from the same list of singleton
    clusters.  The values are only printed, not asserted; the reference
    values noted by the original author are 2.323x10^11 for k-means and
    2.575x10^11 for hierarchical clustering.
    """
    table290 = viz_tools.load_data_table(DATA_290_URL)

    # Wrap every row in a singleton cluster.
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in table290
    ]

    # K-means must run first because clustering.hierarchical_clustering
    # mutates the list of clusters it is given.
    # Test 2: expect 2.323x10^11
    kmeans_clusters = clustering.kmeans_clustering(singleton_list, 16, 5)
    print("K-means Distortion: {}".format(
        compute_distortion(kmeans_clusters, table290)))

    # Test 1: expect 2.575x10^11
    hierarchical_clusters = clustering.hierarchical_clustering(
        singleton_list, 16)
    print("Hierarchical Distortion: {}".format(
        compute_distortion(hierarchical_clusters, table290)))
def main():
    """Run the per-city pipeline: outliers, feature selection, KNN prediction.

    For every city returned by ``get_values_of``:

    1. Load the training features and labels filtered to that city.
    2. Call ``clustering.hierarchical_clustering`` repeatedly, dropping the
       rows it reports as outliers, until it reports ``None``.
    3. Merge features with labels and drop rows with missing values.
    4. Cluster the features, cross-validate a decision tree and use the tree
       regressor to pick the relevant features.
    5. Cross-validate KNN on those features, fit a ``KNeighborsRegressor``,
       predict on the test features, plot the prediction and write it to
       ``../data/predictions_for_<city>.csv``.

    Finally the selected features of every city are printed.
    """
    years = None
    features_excluded = ['week_start_date']
    _outliers = None
    cities = get_values_of("../data/dengue_features_train.csv", 'city')
    target = ['total_cases']
    all_revelant_features = {}

    for city in cities:
        # Filtering by values of the keys
        _filter = {'city': [city], 'year': years}

        # Load city data
        data = load_data("../data/dengue_features_train.csv",
                         filter_parameters=_filter,
                         excludes_features=features_excluded,
                         outliers=_outliers)

        # Load total cases by city, year and week of year
        data_labels = load_data("../data/dengue_labels_train.csv",
                                filter_parameters=_filter)

        # Adapt data for clustering
        data_test_hiech = data.drop(labels=['city', 'year'], axis=1,
                                    inplace=False)

        # Outliers will be deleted
        elements, outliers, cut = clustering.hierarchical_clustering(
            data=data_test_hiech)
        n_element = count_elements(elements)
        n_outliers = count_elements(outliers)
        total = n_element + n_outliers

        print('Analysis in: %s' % (city))
        total_outliers = []
        while outliers is not None:
            total_outliers += outliers
            data_test_hiech.drop(outliers, axis=0, inplace=True)
            elements, outliers, cut = clustering.hierarchical_clustering(
                data_test_hiech, cut=cut, first_total=total)
        if total_outliers:
            print('Auto-detected Outliers:')
            print(total_outliers)

        # Join data.  This is the same object as ``data``, so the in-place
        # drop also removes the outlier rows from ``data``.
        data_without_outliers = data
        data_without_outliers.drop(total_outliers, axis=0, inplace=True)
        merge_data = pd.merge(data_without_outliers, data_labels,
                              on=['city', 'year', 'weekofyear'], how='outer')
        merge_data.drop(labels=['city', 'year'], axis=1, inplace=True)
        merge_data.dropna(inplace=True)

        # Features clustering
        data_for_features = merge_data.drop(labels=target, axis=1)
        clustering.hierarchical_clustering_features(data_for_features)

        # Cross validation to select features
        feature_selected, max_deph = cros.cross_validation(
            merge_data, algorithm='DecisionTreeRegressor')

        # Regressor to select relevant features
        relevant_features = reg.tree_regressor(merge_data, max_deph,
                                               feature_selected, target, city)
        all_revelant_features[city] = relevant_features

        # For each city, one KNN model.
        # Cross validation to select the number of neighbors
        n_neighbors, X, y = cros.cross_validation(merge_data,
                                                  algorithm='KNN',
                                                  features=relevant_features,
                                                  target=target,
                                                  verbose=True)

        # ---------------------------------------------
        # prediction
        data_Test = load_data("../data/dengue_features_test.csv",
                              filter_parameters=_filter,
                              excludes_features=features_excluded,
                              outliers=_outliers)
        # Interpolate into a new frame instead of mutating a column
        # selection of data_Test in place.
        test = data_Test[relevant_features].interpolate(method='linear')

        knn = neighbors.KNeighborsRegressor(n_neighbors, weights='distance')
        prediction = knn.fit(X, y).predict(test)

        # show prediction
        print("\nPREDICTION:")
        xx = np.arange(len(prediction))
        plt.plot(xx, prediction, c='g', label='prediction')
        plt.axis('tight')
        plt.legend()
        plt.title("KNeighborsRegressor (k = %i, weights = '%s')" %
                  (n_neighbors, 'distance'))
        plt.show()

        # write the results in a csv file
        submission_data = load_data("../data/submission_format.csv",
                                    filter_parameters=_filter)
        # One output row per prediction.  The previous loop ranged over the
        # (empty) result list itself and therefore never wrote a row.
        # Assumes submission_data has at least as many rows as predictions
        # -- TODO confirm.
        final_data = [
            [
                submission_data.iloc[i]['city'],
                submission_data.iloc[i]['year'],
                submission_data.iloc[i]['weekofyear'],
                int(prediction[i]),
            ]
            for i in range(len(prediction))
        ]

        col = ["city", "year", "weekofyear", "total_cases"]
        df = pd.DataFrame(final_data, columns=col)
        df.to_csv('../data/predictions_for_' + city + '.csv',
                  index=False, sep=',', encoding='utf-8')
        # ---------------------------------------------

    print('\n\t [ SELECTED FEATURES ]')
    for key, value in all_revelant_features.items():
        print('City: %s, %2d features: \n\t %s' %
              (key, len(value), str(value)))
#pairwise sequence alignment results results = main_algorithm(df_encoded,gap,T,s,0) #reset indexes df_encoded = df_encoded.reset_index() #convert similarity matrix into distance matrix results['score'] = convert_to_distance_matrix(results['score']) #exception when all the scores are the same, in this case we continue with the next value of gap if((results['score']== 0).all()): #print('entrei') continue else: #hierarchical clustering Z = hierarchical_clustering(results['score'],method,gap,T,args.automatic,pp) #validation chosen = validation(M,df_encoded,results,Z,method,min_K,max_K+1,args.automatic,pp,gap,T) chosen_k = chosen[2] df_avgs = chosen[0] df_stds = chosen[1] chosen_results = df_avgs.loc[chosen_k] chosen_results['gap'] = gap concat_for_final_decision.append(chosen_results) ############################################################################ # RESULTS ############################################################################ #close pdf
import sys
sys.path.append('../../3_closest_pairs_&_clustering_algorithms')
import data.load_clusters as lc
import data.cluster as cl
import clustering as clr
import alg_clusters_matplotlib as cplot

# Script: cluster the 896-county table hierarchically and plot the result.
# Other tables mentioned by the original author: DATA_3108_URL, DATA_290_URL
data_table = lc.load_data_table(lc.DATA_896_URL)

# One singleton cluster per table row.
singleton_list = [
    cl.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
    for line in data_table
]

c = 7  # cluster count
cluster_list = clr.hierarchical_clustering(singleton_list, c)
cplot.plot_clusters(data_table, cluster_list, True)
def main(): first = True name_file = assign_name() prediction_path = '../predictions/' + name_file if not os.path.exists(prediction_path): os.makedirs(prediction_path) years = None features_excluded = ['week_start_date'] _outliers = None cities = get_values_of("../data/dengue_features_train.csv", 'city') target = 'total_cases' all_revelant_features = {} all_scores = [] modes = [ #'dropna', 'interpolate', 'mean', ['interpolate', 'mean'] ] #, ['interpolate', 'dropna']] for mode in modes: first = True scores_city = {} for city in cities: # Filtering by values of the keys _filter = {'city': [city], 'year': years} #Load city data data = load_data("../data/dengue_features_train.csv", filter_parameters=_filter, excludes_features=features_excluded, outliers=_outliers) # Load total cases by city, year and week of year data_labels = load_data("../data/dengue_labels_train.csv", filter_parameters=_filter) data_fill = data_fill_mode(data, mode) data_labels_fill = data_fill_mode(data_labels, mode) # Adapt data for clustering data_test_hiech = data_fill.drop(labels=['city', 'year'], axis=1, inplace=False) # Outliers will be deleted elements, outliers, cut = clustering.hierarchical_clustering( data=data_test_hiech, verbose=False) n_element = count_elements(elements) n_outliers = count_elements(outliers) total = n_element + n_outliers print 'Analysis in: %s on mode %s' % (city, str(mode)) total_outliers = [] while (outliers != None): total_outliers += outliers data_test_hiech.drop(outliers, axis=0, inplace=True) elements, outliers, cut = clustering.hierarchical_clustering( data_test_hiech, cut=cut, first_total=total, verbose=False) if total_outliers: print 'Auto-detected Outliers:' print total_outliers # Join data data_without_outliers = data_fill data_without_outliers.drop(total_outliers, axis=0, inplace=True) merge_data = pd.merge(data_without_outliers, data_labels_fill, on=['city', 'year', 'weekofyear'], how='inner') first_year = merge_data['year'].min() last_year = 
merge_data['year'].max() split_year = int(last_year - round((last_year - first_year) * 0.2)) # Features clustering data_for_features = merge_data.drop(labels=['city', 'total_cases'], axis=1) feature_groups = clustering.hierarchical_clustering_features( data_for_features, verbose=False) # Croos Validation for select features features_selected, max_deph = cros.cross_validation(merge_data, feature_groups, split_year, target=target) # Regressor for select relevant features relevant_features = reg.tree_regressor(merge_data, split_year, max_deph, features_selected, target, city, verbose=False) all_revelant_features[city] = relevant_features all_features = merge_data.columns.tolist()[1:-1] data_Test = load_data("../data/dengue_features_test.csv", filter_parameters=_filter, excludes_features=features_excluded, outliers=_outliers) # prediction prediction_knn, score_knn = predict.knn_prediction( merge_data, split_year, features_selected, target, data_Test, verbose=True) print('Score KNN on %s mode is : %.4f' % (mode, score_knn)) prediction_rf, score_rf = predict.rf_prediction(merge_data, split_year, all_features, target, data_Test, verbose=True) print('Score RandomForest on %s mode is : %.4f' % (mode, score_rf)) scores_city[city] = [(mode, 'Knn', score_knn), (mode, 'RF', score_rf)] # Load submission data file. submission_data = load_data("../data/submission_format.csv", filter_parameters=_filter) # wr ite the results in a csv file # Write result files. col = ["city", "year", "weekofyear", "total_cases"] write_result(col, submission_data, prediction_knn, prediction_rf, prediction_path, (name_file + str(mode)), first) first = False all_scores.append(scores_city) print all_scores """
def q10(plot_key):
    """Plot distortion against cluster count for k-means and hierarchical.

    For the dataset selected by ``plot_key`` the distortion of both
    clustering methods is computed for 20 down to 6 clusters and plotted as
    two curves.

    Args:
        plot_key: 111, 290 or 896, selecting ``DATA_111_URL``,
            ``DATA_290_URL`` or ``DATA_896_URL``.  Any other value yields
            the figure without curves, as before.

    Only the selected table is loaded and clustered; the earlier version
    loaded all three tables and built six cluster lists on every call.
    """
    # (key, data URL, k-means line style, hierarchical line style)
    plot_configs = (
        (111, DATA_111_URL, '-bo', '-co'),
        (290, DATA_290_URL, '-go', '-mo'),
        (896, DATA_896_URL, '-ro', '-yo'),
    )
    selected = next(
        (config for config in plot_configs if plot_key == config[0]), None)

    cluster_counts = list(range(20, 5, -1))
    kmeans_distortion, hier_distortion = [], []

    if selected is not None:
        key, data_url, kmeans_style, hier_style = selected
        table = viz_tools.load_data_table(data_url)

        def create_cluster(line):
            return alg_cluster.Cluster(set([line[0]]), line[1], line[2],
                                       line[3], line[4])

        # Separate inputs: each method gets its own cluster objects.
        kmeans_input = [create_cluster(line) for line in table]
        hier_input = [create_cluster(line) for line in table]

        for num in cluster_counts:
            kmeans_clusters = clustering.kmeans_clustering(
                kmeans_input, num, 5)
            # The same list object is passed on every round, as before.
            hier_clusters = clustering.hierarchical_clustering(
                hier_input, num)
            kmeans_distortion.append(
                compute_distortion(kmeans_clusters, table))
            hier_distortion.append(compute_distortion(hier_clusters, table))

    # Plot results
    plt.figure('Distortion for Different Clustering Methods')
    plt.title('Distortion for Different Clustering Methods: {} Points'.format(
        plot_key))
    plt.xlabel('Number of Clusters')
    plt.ylabel('Distortion')

    if selected is not None:
        plt.plot(cluster_counts, kmeans_distortion, kmeans_style,
                 markersize=1, label='K-means ({})'.format(key))
        plt.plot(cluster_counts, hier_distortion, hier_style,
                 markersize=1, label='Hierarchical ({})'.format(key))

    plt.legend(loc='best')
    plt.show()
import clustering
import alg_cluster
import imp

# Scratch script: exercises closest_pair_strip and hierarchical_clustering
# on four hand-made clusters and prints the results.
foo = imp.load_source('poc_simpletest', '../PoC/poc_simpletest.py')
foo.TestSuite()

# Cluster arguments after the (empty) set:
# horiz_pos, vert_pos, population, risk
# Single-argument print calls behave the same on Python 2 and 3.
print(clustering.closest_pair_strip([
    alg_cluster.Cluster(set([]), 1.0, 1.0, 1, 0),
    alg_cluster.Cluster(set([]), 1.0, 5.0, 1, 0),
    alg_cluster.Cluster(set([]), 1.0, 4.0, 1, 0),
    alg_cluster.Cluster(set([]), 1.0, 7.0, 1, 0)
], 1.0, 3.0))

# Recorded grader failures:
# closest_pair_strip([alg_cluster.Cluster(set([]), 0, 0, 1, 0),
#     alg_cluster.Cluster(set([]), 0, 1, 1, 0),
#     alg_cluster.Cluster(set([]), 0, 2, 1, 0),
#     alg_cluster.Cluster(set([]), 0, 3, 1, 0)], 0.0, 1.0)
#   expected one of the tuples in set([(1.0, 2, 3), (1.0, 0, 1), (1.0, 1, 2)])
#   but received (1.0, 0, 0)
# fast_closest_pair([alg_cluster.Cluster(set([]), 0, 0, 1, 0),
#     alg_cluster.Cluster(set([]), 1, 0, 1, 0),
#     alg_cluster.Cluster(set([]), 2, 0, 1, 0),
#     alg_cluster.Cluster(set([]), 3, 0, 1, 0)])
#   expected one of the tuples in set([(1.0, 1, 2), (1.0, 0, 1), (1.0, 2, 3)])
#   but received (Exception: TypeError)
#   "'tuple' object does not support item assignment"
#   at line 71, in fast_closest_pair

# print fast_closest_pair([alg_cluster.Cluster(set([]), 0, 0, 1, 0),
#     alg_cluster.Cluster(set([]), 1, 0, 1, 0),
#     alg_cluster.Cluster(set([]), 2, 0, 1, 0),
#     alg_cluster.Cluster(set([]), 3, 0, 1, 0)])
# print slow_closest_pair([alg_cluster.Cluster(set([]), 0, 0, 1, 0),
#     alg_cluster.Cluster(set([]), 1, 0, 1, 0),
#     alg_cluster.Cluster(set([]), 2, 0, 1, 0),
#     alg_cluster.Cluster(set([]), 3, 0, 1, 0)])

print(clustering.hierarchical_clustering([
    alg_cluster.Cluster(set([]), 1.0, 1.0, 1, 0),
    alg_cluster.Cluster(set([]), 1.0, 5.0, 1, 0),
    alg_cluster.Cluster(set([]), 1.0, 4.0, 1, 0),
    alg_cluster.Cluster(set([]), 1.0, 7.0, 1, 0)
], 2))
#pairwise sequence alignment results results = main_algorithm(df_encoded, gap, T, s, 0) #reset indexes df_encoded = df_encoded.reset_index() #convert similarity matrix into distance matrix results['score'] = convert_to_distance_matrix(results['score']) #exception when all the scores are the same, in this case we continue with the next value of gap if ((results['score'] == 0).all()): print('entrei') continue else: #hierarchical clustering Z = hierarchical_clustering(results['score'], method, gap) #validation chosen = validation(M, df_encoded, results, Z, method, min_K, max_K + 1) chosen_k = chosen[2] df_avgs = chosen[0] df_stds = chosen[1] chosen_results = df_avgs.loc[chosen_k] chosen_results['gap'] = gap concat_for_final_decision.append(chosen_results) df_final_decision = pd.concat(concat_for_final_decision, axis=1).T final_k_results = final_decision(df_final_decision)
def main():
    """Select the relevant features for every city and print them.

    For every city returned by ``get_values_of``:

    1. Load the training features and labels filtered to that city.
    2. Call ``clustering.hierarchical_clustering`` repeatedly, dropping the
       rows it reports as outliers, until it reports ``None``.
    3. Merge features with labels and drop rows with missing values.
    4. Cluster the features, run the cross validation and use the tree
       regressor to pick the relevant features.

    Finally the selected features of every city are printed.
    """
    years = None
    features_excluded = ['week_start_date']
    _outliers = None
    cities = get_values_of("../data/dengue_features_train.csv", 'city')
    all_revelant_features = {}

    for city in cities:
        # Filtering by values of the keys
        _filter = {'city': [city], 'year': years}

        # Load city data
        data = load_data("../data/dengue_features_train.csv",
                         filter_parameters=_filter,
                         excludes_features=features_excluded,
                         outliers=_outliers)

        # Load total cases by city, year and week of year
        data_labels = load_data("../data/dengue_labels_train.csv",
                                filter_parameters=_filter)

        # Adapt data for clustering
        data_test_hiech = data.drop(labels=['city', 'year'], axis=1,
                                    inplace=False)

        # Outliers will be deleted
        elements, outliers, cut = clustering.hierarchical_clustering(
            data=data_test_hiech)
        n_element = count_elements(elements)
        n_outliers = count_elements(outliers)
        total = n_element + n_outliers

        # Single-argument print calls behave the same on Python 2 and 3.
        print('\nOutliers in: %s \n\t' % (city))
        total_outliers = []
        while outliers is not None:
            total_outliers += outliers
            data_test_hiech.drop(outliers, axis=0, inplace=True)
            elements, outliers, cut = clustering.hierarchical_clustering(
                data_test_hiech, cut=cut, first_total=total)
        if total_outliers:
            print('Auto-detected Outliers:')
            print(total_outliers)

        # Join data.  This is the same object as ``data``, so the in-place
        # drop also removes the outlier rows from ``data``.
        data_without_outliers = data
        data_without_outliers.drop(total_outliers, axis=0, inplace=True)
        merge_data = pd.merge(data_without_outliers, data_labels,
                              on=['city', 'year', 'weekofyear'], how='outer')
        merge_data.drop(labels=['city', 'year'], axis=1, inplace=True)
        merge_data.dropna(inplace=True)

        # Features clustering
        data_for_features = merge_data.drop(labels=['total_cases'], axis=1)
        clustering.hierarchical_clustering_features(data_for_features)

        # Cross validation to select features
        feature_selected, max_deph = cros.cross_validation(merge_data)

        # Regressor to select relevant features
        relevant_features = reg.tree_regressor(merge_data, max_deph,
                                               feature_selected,
                                               'total_cases', city)
        all_revelant_features[city] = relevant_features

    print('\n\t [ SELECTED FEATURES ]')
    for key, value in all_revelant_features.items():
        print('City: %s, %2d features: \n\t %s' %
              (key, len(value), str(value)))