Example #1
0
def test_agnes(db, k=2):
    """Cluster a shuffled sample of ``db`` with AgnesMax and plot the result.

    A deep copy of ``db`` is shuffled, so the caller's sequence keeps its
    order. The first 300 shuffled items are handed to ``AgnesMax`` together
    with ``k``, and whatever ``cluster()`` returns is passed to
    ``plot_clusters``.
    """
    shuffled = copy.deepcopy(db)
    random.shuffle(shuffled)
    sample = shuffled[:300]

    plot_clusters(AgnesMax(sample, k).cluster())
Example #2
0
def run_example():
    """
    Load the data table, group its rows into sequential clusters and
    draw the resulting clusters.

    The DESKTOP flag picks the drawing backend: matplotlib when it is
    true, simplegui otherwise.
    """
    data_table = load_data_table(DATA_3108_URL)

    # One cluster per row: the first field is the sole member of the
    # cluster's set, the next four fields are forwarded positionally.
    singleton_list = [
        cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    cluster_list = sequential_clustering(singleton_list, 15)
    print("Displaying {} sequential clusters".format(len(cluster_list)))

    # Alternative algorithms, kept disabled for reference:
    #cluster_list = algos.hierarchical_clustering(singleton_list, 9)
    #print "Displaying", len(cluster_list), "hierarchical clusters"

    #cluster_list = algos.kmeans_clustering(singleton_list, 9, 5)
    #print "Displaying", len(cluster_list), "k-means clusters"

    if not DESKTOP:
        # simplegui backend: use the toggle in the GUI to add cluster centers
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
        return

    # matplotlib backend: pass True instead of False to add cluster centers
    plot.plot_clusters(data_table, cluster_list, False)
Example #3
0
def main(argv):
    """Run the spam clustering pipeline from the command line.

    Steps, in order: parse the command-line options, create the parsed
    spam/ham resource files if they are missing, pre-process the spam
    messages, generate the requested features, train the clustering model,
    compute the silhouette score, plot the clusters and print the score.

    Args:
        argv: Not read. ``parse_args()`` is called without arguments, so the
            options are taken from ``sys.argv``.
            NOTE(review): confirm with the caller whether ``argv`` should be
            forwarded to ``parse_args`` instead.

    Raises:
        SystemExit: with status 1 when FEATURE is not a supported name
            (argparse itself exits with status 2 on malformed options).
    """
    supported_features = ['POS', 'BOW', 'BIGRAMS', 'TRIGRAMS', 'W2V',
                          'NAMED_ENTITIES', 'D2V']

    parser = argparse.ArgumentParser(description='SCV Spam Classification Viaduct')
    # The help text lists every value accepted by the check below (D2V was
    # accepted but previously undocumented).
    parser.add_argument('FEATURE', type=str, help='Desired feature upon which the clustering algorithm is going to be trained: POS, BOW, BIGRAMS, TRIGRAMS, W2V, NAMED_ENTITIES, D2V')
    parser.add_argument('k', type=int, help='K number for K-mean')
    parser.add_argument('-eo', '--english', action='store_true', help='Filter to work only with english.')
    parser.add_argument('-sw', '--stopword', action='store_true', help='Remove stopwords.')
    parser.add_argument('-p', '--punctuation', action='store_true', help='Remove punctuation marks.')
    parser.add_argument('-l', '--lematize', action='store_true', help='Lemmatize tokens.')
    # Parsed for CLI compatibility; this function does not read args.models.
    parser.add_argument('-m', '--models', action='store_true', help='Use already trained model on Models directory.')
    args = parser.parse_args()

    if args.FEATURE not in supported_features:
        print('Feature not recognized by program.')
        # Non-zero status so shells and scripts can detect the failure.
        sys.exit(1)

    # Build the parsed resource files on first use.
    if not os.path.isfile('../Resources/spam.txt'):
        prs.parse_raw_spam()
    if not os.path.isfile('../Resources/ham.txt'):
        prs.parse_raw_ham()

    spam_messages = pre_process(args.stopword, args.punctuation, args.lematize, isSpam=True)
    print('Data pre-processed')
    spam_features = generate_features(spam_messages, args.FEATURE, args.english)
    print('Features generated')
    results, labels = train_models_clustering(spam_messages, spam_features, args.FEATURE, args.k)
    print('Model trained')
    score = metrics.silhouette_score(results, labels, metric='euclidean')
    print('Plotting...')
    plot_clusters(results, args.k, labels, args.FEATURE)
    print('K-means with ' + str(args.k) + ' clusters using ' + str(args.FEATURE) + ' silhouette score: ' + str(score))
Example #4
0
def kmeans(principal_components,names,embeds,viz=True):
    """Fit a 12-cluster k-means++ model on ``embeds`` and return it.

    When ``viz`` is true the fitted model is also passed to the three
    plotting helpers (cluster plot with names, label distribution and 3D
    cluster plot), with ``principal_components`` supplied as ``pc``.
    """
    model = KMeans(n_clusters=12, init='k-means++').fit(embeds)

    if not viz:
        return model

    plot_clusters(
        model,
        pc=principal_components,
        text=True,
        names=names,
        n_names=15,
        figsize=(16, 4),
    )
    plot_label_dist(model, palette='Reds', figsize=(4, 2))
    plot_3d_clusters(
        [('Kmeans++', model)],
        pc=principal_components,
        figsize=(6, 4.5),
    )

    return model
Example #5
0
def getFlatLabels(model,embeds,
                    names,
                    urls,
                    large_cutoff=100,
                    medium_cutoff=75,
                    small_cutoff=50,
                    tiny_cutoff=30,
                    viz=False):
    """Cut a hierarchical clustering at four distance thresholds.

    For each cutoff, ``fcluster`` is called on ``model`` with
    ``criterion='distance'`` and the number of distinct flat clusters is
    printed.

    Args:
        model: Passed as the first argument of ``fcluster``.
        embeds: Data projected with a 30-component PCA for plotting; only
            read when ``viz`` is true.
        names: Forwarded to ``plot_clusters``; only read when ``viz`` is true.
        urls: Accepted for interface compatibility; not read here.
        large_cutoff, medium_cutoff, small_cutoff, tiny_cutoff: Distance
            thresholds, used in that order.
        viz: When true, plot and show the clusters for every cutoff.
            Previously the flag was ignored and the plots were always
            produced; it is now honoured.

    Returns:
        ``np.array`` built from the four label vectors, one row per cutoff
        in the order large, medium, small, tiny.
    """
    t_values = [('Large clusters', large_cutoff),
                ('Medium clusters', medium_cutoff),
                ('Small clusters', small_cutoff),
                ('Tiny clusters', tiny_cutoff)]

    agglom_labels = []
    for label, t_value in t_values:
        print(label, 'n:')
        clusters = fcluster(model, t=t_value, criterion='distance')
        print(len(np.unique(clusters)), '\n')
        agglom_labels.append(clusters)

    agglom_labels = np.array(agglom_labels)

    if viz:
        # The projection is only needed for plotting, so it is computed
        # inside the viz branch.
        principal_components = PCA(n_components=30).fit_transform(embeds)

        for (label, t_value), flat_labels in zip(t_values, agglom_labels):
            plot_clusters(model, pc=principal_components,
                          labels=flat_labels,
                          names=names,
                          text=True, figsize=(20, 4),
                          title=('{} derived from stopping at {} covariance'
                                 ).format(label, t_value))
            plt.show()

    return agglom_labels
Example #6
0
            dfs.append(density_stats)

            tgt_image_name = constants.analysis_config[
                'FIGURE_NAME_FORMAT_MTOC_ENRICHMENT'].format(
                    molecule_type=molecule_type)
            tgt_fp = pathlib.Path(
                constants.analysis_config['FIGURE_OUTPUT_PATH'].format(
                    root_dir=global_root_dir), tgt_image_name)
            plot.enrichment_violin_plot(density_stats,
                                        molecule_type,
                                        tgt_fp,
                                        groupby_key=conf[1],
                                        limit_threshold=OUTLIERS_THRESHOLD)
            logger.info("Created figure {}", tgt_fp)
            plot.plot_clusters(molecule_type,
                               density_stats.df,
                               peripheral_flag=peripheral_flag)

            tgt_image_name = constants.analysis_config[
                'FIGURE_NAME_FORMAT_MPI'].format(molecule_type=molecule_type)
            tgt_fp = pathlib.Path(
                constants.analysis_config['FIGURE_OUTPUT_PATH'].format(
                    root_dir=global_root_dir), tgt_image_name)
            plot.plot_MPI(density_stats, molecule_type, tgt_fp, use_mean=False)
            logger.info("Created figure {}", tgt_fp)

        if "original" in conf[0]:
            plot.plot_boxplot_MPI(dfs[0], dfs[1],
                                  constants.analysis_config['PROTEINS'],
                                  tp_mrna, tp_proteins)
Example #7
0
def test_dbscan(db, radius=0.3, min_pts=50):
    """Cluster ``db`` with DBScan, plot the clusters and print their count.

    ``radius`` and ``min_pts`` are forwarded unchanged to the ``DBScan``
    constructor.
    """
    clusters = DBScan(db, radius, min_pts).cluster()
    plot_clusters(clusters)

    cluster_count = len(clusters)
    print('Found %d clusters' % cluster_count)
Example #8
0
def test_kmeans(db, k=2):
    """Run KMeans on the first 500 items of ``db``, plot and return the clusters.

    ``k`` is forwarded unchanged to the ``KMeans`` constructor.
    """
    sample = db[:500]
    clusters = KMeans(sample, k).cluster()
    plot_clusters(clusters)
    return clusters
Example #9
0
# Grid search: fit one AgglomerativeClustering per configuration and score its
# predictions against the reference labels with the V-measure.
results = []

for configuration in parameter_grid:
    model = AgglomerativeClustering(**configuration)

    predicted_clusters = model.fit_predict(features)

    v_score = v_measure_score(labels, predicted_clusters)

    results.append({'params': configuration, 'score': v_score})

# Highest score first, so results[0] holds the winning configuration.
results = sorted(results, key=lambda k: k['score'], reverse=True)
best_params = results[0]['params']

# Refit with the best configuration to obtain the predictions that are
# scored and plotted below.
model = AgglomerativeClustering(**best_params)
predicted_clusters = model.fit_predict(features)

v_score = v_measure_score(labels, predicted_clusters)
# Store the value under its own name: assigning it to ``silhouette_score``
# would replace the scoring function with a float, and running this code a
# second time in the same session would then raise TypeError.
silhouette = silhouette_score(features, predicted_clusters)

print('V-measure score (external metric): {}'.format(v_score))
print('Silhouette score (internal metric): {}'.format(silhouette))

# Project the features onto two principal components for plotting.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(features)

plot_clusters(best_params['n_clusters'], principalComponents,
              predicted_clusters, plt, 'predicted_clusters')
plot_clusters(num_classes, principalComponents, labels, plt, 'true_clusters')

plt.show()
Example #10
0
# Save labels
# File layout: first line holds the cluster centers separated by spaces, then
# one "label core_card rho" line per entry of the zipped clone arrays.
with open(output_folder + "results.txt", "w") as f:
    f.write("".join("%i " % c for c in clone.centers))
    f.write("\n")
    for label, core, r in zip(clone.labels_, clone.core_card, clone.rho):
        f.write("%i %i %f\n" % (label, core, r))

# Display results
# > Statistics on unscaled data for better interpretability
# > If PCA was done, choose which data to visualize. Sometimes it makes sense
#   to look at the principal components, sometimes at the original coords...
if not pca:
    # Stats
    show_cluster_info(clone, original_coords, output_folder, headers)
    # Plot
    plot_clusters(clone, original_coords, output_folder, headers)
else:
    # Stats
    prompt = (
        "> Show statistics on:\n   1. Original coords (%i dimensions)\n   2. PCA coords (%i dimensions)\n   3. Both\n   > Choice: "
        % (len(headers), len(pca_headers)))
    show_original = -1
    while show_original not in (1, 2, 3):
        try:
            show_original = int(input(prompt))
        except ValueError:
            # A non-numeric answer is treated like any other invalid choice:
            # ask again instead of aborting with a traceback.
            show_original = -1
    if show_original == 1:
        show_cluster_info(clone, original_coords, output_folder, headers)
    elif show_original == 2:
        show_cluster_info(clone, coords, output_folder, pca_headers)
    else:
        show_cluster_info(clone, original_coords, output_folder, headers)
        show_cluster_info(clone, coords, output_folder, pca_headers)
Example #11
0
# Fit CLoNe on the data and report how long it took.
print("> Clustering %s..." % name)
t = time.time()
clone = CLoNe(pdc=pdc, verbose=False)
clone.fit(data)
elapsed = time.time() - t
print("> Done: %.2f sec" % elapsed)

# Pull the fitted attributes out of the model for the summary below.
centers, core_card = clone.centers, clone.core_card
labels, labels_all = clone.labels_, clone.labels_all
rho = clone.rho

# Summary table: one row per cluster center.
header = "  |  #center  |    Dens    #Core  |  # el  | -outl  |"
subh  =  "  |-----------|-------------------|--------|--------|"
top = "   " + "-" * (len(header) - 4)
print("\n".join([top, header, subh, top]))
for c, center in enumerate(centers):
    # Member counts come from boolean masks over the two label arrays.
    elem = len(labels_all[labels_all == c])
    outl = len(labels[labels == c])
    line = "  |%2i - %5i | %7.2f  %7i  | %6i | %6i |" % (
        c + 1, center + 1, rho[center], core_card[center], elem, outl)
    print(line)
print(top)

# Plot only when the data has at most three columns.
if data.shape[1] <= 3:
    plot_clusters(clone, data, ".")
else:
    print("> WARNING: data has more than 3 dimensions. Not plotting.")
        file.write("Cluster " + str(i) + ":\n" + str(c))
    file.close()

# Normalize data in all features (1e-5 padding is added because clustering works on [0,1) interval)
def normalize_features(data):
    """Return a copy of ``data`` with every column rescaled into (0, 1).

    Each column is shifted so that its minimum becomes 1e-5 and then divided
    by (shifted maximum + 1e-5), which keeps every value strictly between
    0 and 1.

    The input is left untouched: the work is done on a copy. Non-floating
    input (e.g. an integer array) is converted to float first, because the
    in-place arithmetic below cannot store fractional results in an integer
    array.

    Args:
        data: 2-D array-like with one feature per column.

    Returns:
        A new array of the same shape holding the normalized values.
    """
    normalized_data = np.array(data)  # np.array copies by default
    if not np.issubdtype(normalized_data.dtype, np.floating):
        normalized_data = normalized_data.astype(float)

    # Column-wise min/max (axis=0) replace the per-column Python loop.
    normalized_data -= normalized_data.min(axis=0) - 1e-5
    normalized_data *= 1 / (normalized_data.max(axis=0) + 1e-5)
    return normalized_data

def visualize_clusters(features, data, clusters, title, xi, tau, file):
    """Plot the clusters when there are at most two features.

    Args:
        features: Collection of features; nothing is plotted when it has
            more than two entries.
        data, clusters: Forwarded to ``plot_clusters``.
        title: Ignored. The plot title is always rebuilt from ``file``,
            ``tau`` and ``xi``.
        xi: Shown in the title and forwarded to ``plot_clusters``.
        tau: Shown in the title.
        file: Dataset name shown in the title.
    """
    if len(features) > 2:
        return

    # Separators added between the fields; the title previously ran them
    # together (e.g. "data.txttau = 0.1xi = 3").
    title = "Dataset: {}, tau = {}, xi = {}".format(file, tau, xi)
    plot_clusters(data, clusters, title, xi)

if __name__ == "__main__":
    xi = 3
    tau = 0.1
    ds_file = "data.txt"
    feature_columns = [4, 5, 6, 7]
    label_column = 3
    delimiter = ' '
    cluster_file = "clusters_info.txt"
    
    # ds_file = "mouse.csv"
    # feature_columns = [0, 1]
    # label_column = 2
    # xi = 3
    # tau = 0.1