Example #1
def main():

    points = read_points_from_file(FILENAME)
    start_time = time.time()
    # TODO - remove all files in output folder (can be in makefile)

    k_means(N_KLUSTERS, points, N_THREADS)

    end_time = time.time()
    print("Execution time: %.3f" % (end_time - start_time))
    return
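Example #1 assumes several module-level names that are defined elsewhere in the script. A minimal sketch of that setup is below; the constant values, the point file format (one whitespace-separated point per line), and the body of read_points_from_file are assumptions for illustration, not part of the original code.

import time

FILENAME = "points.txt"   # assumed input file: one point per line
N_KLUSTERS = 8            # assumed cluster count (name taken from the snippet)
N_THREADS = 4             # assumed worker-thread count

def read_points_from_file(filename):
    # Parse one whitespace-separated point per non-empty line.
    with open(filename) as f:
        return [tuple(float(v) for v in line.split()) for line in f if line.strip()]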
Example #2
def model_parameters(data, structure, C_old, U_old, k):
    """
    input: data string, path to file
           structure list(int, list(floats), list(floats)),
                      number of non-hypertime dimensions, list of hypertime
                      radii and list of wavelengths
           C_old numpy array kxd, centres from last iteration
           U_old numpy array nxd, weights from last iteration
           k positive integer, number of clusters
    output: C numpy array kxd, matrix of k d-dimensional cluster centres
            U numpy array kxn, matrix of weights
            COV numpy array kxdxd, matrix of covariance matrices
            densities numpy array kx1, matrix of number of measurements
                                       belonging to every cluster
    uses: dio.create_X(), cl.k_means(), covariance_matrices()
    objective: to find model parameters
    """
    X = dio.create_X(data, structure)
    # test to find out if clusters are known from previous clustering
    try:
        len(U_old)
        ##### EXPERIMENT !!!
        #used_method = 'stable_init'  # originally 'prev_dim'
        used_method = 'random'
        ##### END OF EXPERIMENT !!!
    except TypeError:
        used_method = 'random'
    #print('type of initialization for clustering: ' + used_method)
    C, U, densities = cl.k_means(
        X,
        k,
        structure,
        method=used_method,
        version='hard',  # weight calculation
        fuzzyfier=1,  # weighting exponent
        iterations=100,
        C_in=C_old,
        U_in=U_old)
    COV = covariance_matrices(X, C, U, structure)
    return C, U, COV, densities
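Given the try/except fallback above, passing anything without a len() as U_old (for example None) selects random initialization, while reusing the previous C and U warm-starts the next run. A minimal call sketch under that reading, with data, structure, and k standing in for the caller's values:

# first pass: no previous model, so clustering is initialized randomly
C, U, COV, densities = model_parameters(data, structure, C_old=None, U_old=None, k=3)

# later passes can reuse the previous centres and weights
C, U, COV, densities = model_parameters(data, structure, C_old=C, U_old=U, k=3)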
Example #3
num_cluster_list = range(2, 3)  # candidate numbers of clusters to evaluate
SSE_list = np.zeros(len(num_cluster_list))
sil_list = np.zeros(len(num_cluster_list))
ch_list = np.zeros(len(num_cluster_list))

for i, num_cluster in enumerate(num_cluster_list):
    #  k-means
    centroid, assignment, cost, sorted_zooms, sil_avg_kmeans, SSE_sum_kmeans, ch_score_kmeans = clustering.k_means(
        zooms_stnd,
        projct2d,
        dist_mat,
        num_cluster,
        max_iter,
        randm_init,
        interval + 1,
        w,
        plot=True,
        sil=True,
        SSE=True,
        sort=True)
    print(
        'K-means clustering evaluation: Silhouette Index = %.3g; SSE = %.3g; C-H score = %.3g'
        % (sil_avg_kmeans, SSE_sum_kmeans, ch_score_kmeans))
    SSE_list[i] = SSE_sum_kmeans
    sil_list[i] = sil_avg_kmeans
    ch_list[i] = ch_score_kmeans

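Once the loop has filled the three score arrays, the candidate cluster counts can be compared directly. A minimal model-selection sketch, assuming the highest average silhouette is preferred; this step is not part of the original snippet:

best_idx = int(np.argmax(sil_list))   # index of the best silhouette score
best_k = num_cluster_list[best_idx]   # corresponding number of clusters
print('Best k by silhouette: %d (silhouette = %.3g, SSE = %.3g, C-H = %.3g)'
      % (best_k, sil_list[best_idx], SSE_list[best_idx], ch_list[best_idx]))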
#%%
Example #4
def main(dist_constant: int = None,
         file_path: str = '../data/matches.pkl',
         hero_weight: float = 0.05):
    if dist_constant is not None:
        print('main(): dist_constant is: ' + str(dist_constant))
    print('main(): Data file path: ' + file_path)
    print('main(): hero weight: ' + str(hero_weight))
    collector = data_collector.DataCollector(data_file_path=file_path)
    list_of_dota_matches = collector.read_dota_matches_from_file()
    print('main(): DataCollector found: ' + str(len(list_of_dota_matches)) +
          ' matches from file')
    hero_names_dict = data_collector.get_hero_names()

    # for match_dict in list_of_match_dicts:
    #     if match_dict['avg_mmr'] is not None:
    #         try:
    #             match = DotaMatch(match_dict, hero_names_dict)
    #             list_of_dota_matches.append(match)
    #             print(str(match))
    #         except json.decoder.JSONDecodeError:
    #             print('Caught json.decoder.JSONDecodeError Exception, ignoring match...')
    # print('Query returned ' + str(len(list_of_dota_matches)) + ' matches')

    # Create the initial cluster that contains all matches that were read from file
    initial_cluster = clustering.Cluster(matches=list_of_dota_matches,
                                         hero_names=hero_names_dict)
    initial_cluster.print_center()
    max_dist = 0
    num_of_matches = len(initial_cluster.matches)
    farthest_cluster_pair = 0, 0
    avg_dist = 0
    num_of_disjoint_pairs = 0
    if dist_constant is None:
        # Get all disjoint pair distances, and find the average distance between points as well as the maximum distance
        for i in range(num_of_matches):
            for j in range(i + 1, num_of_matches):
                dist = clustering.get_distance(initial_cluster.matches[i],
                                               initial_cluster.matches[j],
                                               hero_weight=hero_weight)
                avg_dist += dist
                num_of_disjoint_pairs += 1
                if dist > max_dist:
                    max_dist = dist
                    farthest_cluster_pair = i, j
                    # print('Distance is: ' + str(dist))

        avg_dist = (avg_dist / num_of_disjoint_pairs)
        print('Average distance between Dota matches: ' + str(avg_dist) +
              ' max dist is: ' + str(max_dist))
    else:
        avg_dist = dist_constant
        print('Using dist constant of: ' + str(avg_dist) + ' max dist is: ' +
              str(max_dist))

    divisive_clustering_clusters = []
    divisive_start_time = time.time()
    # Run divisive clustering with the initial cluster and use the avg_dist as the "user-defined" constant
    # Time the computation time
    clustering.run_divisive_clustering(initial_cluster,
                                       hero_names_dict,
                                       int(avg_dist),
                                       divisive_clustering_clusters,
                                       hero_weight=hero_weight)
    divisive_end_time = time.time()
    suggested_value_of_k = len(divisive_clustering_clusters)

    print('Num of clusters found in divisive clustering: ' +
          str(len(divisive_clustering_clusters)) +
          ' Time of computation: %s seconds. About to print centers...' %
          (divisive_end_time - divisive_start_time))
    for cluster in divisive_clustering_clusters:
        print('Number of matches in cluster: ' + str(len(cluster.matches)))
        cluster.print_center()

    reclustering_start_time = time.time()
    # Run K-means using the value of k found by divisive clustering, and using the clusters already found in that step
    # Time the computation time
    k_means_clusters = clustering.k_means(
        hero_names_dict=hero_names_dict,
        num_of_clusters=suggested_value_of_k,
        final_clusters=divisive_clustering_clusters,
        hero_weight=hero_weight)
    reclustering_end_time = time.time()
    print('K-Means with re-clustering found clusters for k=' +
          str(len(k_means_clusters)) +
          ' In %s seconds. About to print centers...' %
          (reclustering_end_time - reclustering_start_time))
    for cluster in k_means_clusters:
        print('Number of matches in cluster: ' + str(len(cluster.matches)))
        cluster.print_center()

    random_start_time = time.time()
    # To compare computation time, run K-means again with random initialization instead of the divisive-clustering seeds
    k_means_random_clusters = clustering.k_means(
        hero_names_dict=hero_names_dict,
        num_of_clusters=suggested_value_of_k,
        matches=list_of_dota_matches,
        hero_weight=hero_weight)
    random_end_time = time.time()
    print('K-Means with random initialization found clusters for k=' +
          str(len(k_means_random_clusters)) +
          ' In %s seconds. About to print centers...' %
          (random_end_time - random_start_time))
    for cluster in k_means_random_clusters:
        print('Number of matches in cluster: ' + str(len(cluster.matches)))
        cluster.print_center()
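The nested loop in main() enumerates every disjoint pair of matches to accumulate the average and maximum distance. The same computation can be written with itertools.combinations; a sketch of that alternative, reusing the clustering.get_distance helper from the example (the farthest pair's indices are not tracked here):

import itertools

pair_dists = [clustering.get_distance(a, b, hero_weight=hero_weight)
              for a, b in itertools.combinations(initial_cluster.matches, 2)]
avg_dist = sum(pair_dists) / len(pair_dists)
max_dist = max(pair_dists)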
 
Example #5
# extract clusters information from clean data
x_train_tmp, y_train_tmp, x_valid_tmp, y_valid_tmp, x_test_tmp, y_test_tmp = utils.generate_batches(
    filename='../data/power_consumption.csv',
    window=sequence_len,
    stride=stride,
    mode='validation',
    non_train_percentage=.3,
    val_rel_percentage=.8,
    normalize='maxmin01',
    time_difference=False,
    td_method=None)

# cluster info relative to the signal's value (cluster means)
clusters_info = clst.k_means(x_train_tmp, n_clusters)

# extract train and test
x_train, y_train, x_valid, y_valid, x_test, y_test = utils.generate_batches(
    filename='../data/power_consumption.csv',
    window=sequence_len,
    stride=stride,
    mode='validation',
    non_train_percentage=.3,
    val_rel_percentage=.8,
    normalize='maxmin01',
    time_difference=True,
    td_method=np.log2)

# cluster info relative to the time difference (cluster means)
clusters_info_td = clst.k_means(x_train, n_clusters_td)
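clst.k_means and utils.generate_batches are project-specific helpers, so the structure of clusters_info is not visible here. As a rough stand-in for "cluster means" over the training windows, a sketch using scikit-learn's KMeans; this is an assumption about what the helper computes, not its actual implementation:

from sklearn.cluster import KMeans
import numpy as np

flat = np.asarray(x_train_tmp).reshape(len(x_train_tmp), -1)   # one row per window
km = KMeans(n_clusters=n_clusters, n_init=10).fit(flat)
cluster_means = km.cluster_centers_.mean(axis=1)               # one mean signal value per cluster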
Example #6
        filename = "exp_results/%s_gridsize=%s_neighsize=200" % (lime_version,
                                                                 g)
        np.save(filename, qualities)

    del image_pool
# ---------------------------------------------------------------------------------------------

# Testing Lime# Clustering --------------------------------------------------------------------
elif lime_version == 'lime#C':
    ks_for_each_grid_size = [20, 17, 16, 26, 18]
    images = clustering.load_images(imgs_clustering)

    for k, g in zip(ks_for_each_grid_size, grid_sizes):
        und = clustering.undersample_images(images, g)
        flattened = clustering.flatten_images(und)
        inertia, lbls, centers = clustering.k_means(flattened, k)

        qualities = evaluation_measures.evaluate_explanations(
            'lime#C',
            model,
            bb_outcomes,
            imgs,
            gts,
            neigh_size=100,
            segmentation_fun=partial(gridSegmentation, g),
            clustering_labels=lbls)
        lime_version = 'lime#C-NEW'
        filename = "exp_results/%s_gridsize=%s_neighsize=200" % (lime_version,
                                                                 g)
        np.save(filename, qualities)
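clustering.load_images, undersample_images, flatten_images, and k_means are helpers from the project, so only the overall pattern (downscale each image, flatten it into a row vector, then run k-means) is visible here. A generic sketch of that pattern with scikit-learn, mirroring the (inertia, labels, centers) return of the example; it is an illustration of the pattern, not the project's implementation:

import numpy as np
from sklearn.cluster import KMeans

def flatten_then_cluster(images, k):
    # images: iterable of equally shaped arrays; each image becomes one row vector
    X = np.stack([np.asarray(img).ravel() for img in images])
    km = KMeans(n_clusters=k, n_init=10).fit(X)
    return km.inertia_, km.labels_, km.cluster_centers_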