def affinity(data):
    space = {'damping': hp.uniform('damping', 0.5, 0.99)}
    algo = partial(tpe.suggest, n_startup_jobs=10)
    best = fmin(hyper_affinity, space, algo=algo, max_evals=30)
    model = AffinityPropagation(damping=best['damping'])
    pred = model.fit_predict(data)  # fit once instead of refitting for every return value
    return best, pred, sil_score(data, pred), model
Example no. 2
def clustering(relation_all_df, all_keys, txt_name, method='AffinityPropagation', n_clusters=5):
    print('begin clustering')
    data = relation_all_df.iloc[:, :].values
    if method == 'AffinityPropagation':
        clustering = AffinityPropagation(damping=0.8).fit(data)
    elif method == 'AgglomerativeClustering':
        clustering = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')
        clustering.fit_predict(data)
    else:
        raise ValueError('unknown clustering method: %s' % method)

    res_dict = dict()
    for i in range(len(clustering.labels_)):
        res_dict.setdefault(clustering.labels_[i], []).append(all_keys[i])

    for k, v in res_dict.items():
        print(k,v)

    # for key in res_dict.keys():
    #     if type(key) is not str:
    #         res_dict[str(key)] = res_dict[key]
    #         del res_dict[key]

    with open(txt_name, 'w') as the_file:
        # the_file.write(json.dumps(list(res_dict.items())))
        if method == 'AgglomerativeClustering':
            the_file.write('n_clusters: \n')
            the_file.write(str(n_clusters) + '\n')
        for i in res_dict.keys():
            the_file.write(str(i) + '\n')
            the_file.write(','.join([str(x) for x in res_dict[i]]) + "\n")
Example no. 3
    def cluster(self, tracklet):
        # tracklet: [[pid, time, img_id, pseudo_id],...], only img_id is used
        ids = [arr[2] for arr in tracklet]  # materialize as a list; it is indexed again below
        # cluster = SpectralClustering(n_clusters=2, affinity='precomputed')
        track_features = []
        for i, img_i in enumerate(ids):
            track_features.append(self.feature[img_i])
        similarity = -euclidean_distances(track_features, squared=True)
        # cls = cluster.fit_predict(affinity)
        cluster = AffinityPropagation(preference=np.median(similarity))
        cls = cluster.fit_predict(track_features).reshape(-1)
        self.spectral_score += metrics.adjusted_rand_score(
            [info[0] for info in tracklet], cls)

        cls_cnt = len(set(cls))
        #
        # cluster = SpectralClustering(n_clusters=cls_cnt)
        # cls = cluster.fit_predict(track_features).reshape(-1)
        # self.ap_score += metrics.adjusted_rand_score([info[0] for info in tracklet], cls)

        cluster = KMeans(n_clusters=cls_cnt)
        cls = cluster.fit_predict(track_features).reshape(-1)
        self.kmeans_score += metrics.adjusted_rand_score(
            [info[0] for info in tracklet], cls)

        self.tracklet_cnt += 1
        return [(ids[i], cls[i]) for i in range(len(ids))]
Example no. 4
def get_w2v_field(zeta_res, model, zeta_scope, mode):

    if mode == 0:
        words = zeta_res.index[zeta_res[zeta_scope] > 0]
    else:
        words = zeta_res.index[zeta_res[zeta_scope] < 0]

    vecs = []
    for word in words:
        try:
            vecs.append(model[word])

        except KeyError:
            pass

    word_matrix = np.matrix(vecs)
    if mode == 0:
        clu = AffinityPropagation(
            preference=zeta_res[zeta_scope][zeta_res[zeta_scope] > 0])
    else:
        clu = AffinityPropagation(
            preference=zeta_res[zeta_scope][zeta_res[zeta_scope] < 0])
    clu.fit_predict(word_matrix)
    cluster_frame = pd.DataFrame(clu.cluster_centers_)
    cluster_frame["Category"] = mode

    return cluster_frame
Example no. 5
def AffProp(SM):

    af = AffinityPropagation(preference=None, affinity='precomputed')
    af.fit_predict(SM)
    cluster_centers_indices = af.cluster_centers_indices_
    labels_ = af.labels_

    n_clusters_ = len(cluster_centers_indices)
    return n_clusters_, labels_
Example no. 6
 def clusteringAlgorithm(self, typeOfAlgorithm): 
     #define the model
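     # NOTE: `x` used below is the dataset to cluster; it is assumed to be defined elsewhere (not shown in this snippet)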
     clusterPoint = None
     if typeOfAlgorithm == 'affinityPropagation': 
         model = AffinityPropagation(damping=0.9)
         model.fit(x)
         clusterPoint = model.predict(x)
     elif typeOfAlgorithm == 'agglomerativeClustering': 
         model = AgglomerativeClustering(n_clusters=2)
         model.fit(x)
         clusterPoint = model.fit_predict(x)
     elif typeOfAlgorithm == 'BIRCH': 
         model = Birch(threshold=0.01, n_clusters=2)
         model.fit(x)
         clusterPoint = model.predict(x)
     elif typeOfAlgorithm == "DBSCAN": 
         model = DBSCAN(eps=0.30, min_samples=9)
         clusterPoint = model.fit_predict(x)
     elif typeOfAlgorithm == "KMeans": 
         model = KMeans(n_clusters=2)
         model.fit(x)
         clusterPoint = model.predict(x)
     elif typeOfAlgorithm == "MiniBatchKMeans": 
         model = MiniBatchKMeans(n_clusters=2)
         model.fit(x)
         clusterPoint = model.predict(x)
     elif typeOfAlgorithm == "MeanShift": 
         model = MeanShift()
         model.fit(x)
         clusterPoint = model.fit_predict(x)
     elif typeOfAlgorithm == "OPTICS": 
         model = OPTICS(eps=0.8, min_samples=10)
         model.fit(x)
         clusterPoint = model.fit_predict(x)
     elif typeOfAlgorithm == "SpectralClustering":
         model = SpectralClustering(n_clusters=2)
         model.fit(x)   
         clusterPoint =model.fit_predict(x)
     elif typeOfAlgorithm == "GaussianMixture": 
         model = GaussianMixture(n_components=2)
         model.fit(x)
         clusterPoint = model.predict(x)
     else: 
         print('unknown clustering algorithm:', typeOfAlgorithm)
         return
     #retrieve unique clusters
     clusters = unique(clusterPoint)
     for cluster in clusters: 
         #get row indexes of the samples in this cluster
         rowIndexes = where(clusterPoint == cluster)
         #create scatter of these samples
         pyplot.scatter(x[rowIndexes, 0], x[rowIndexes, 1])
     pyplot.savefig('img/' + typeOfAlgorithm + '.png')
     pyplot.clf()
     pyplot.cla()
     pyplot.close()
def affinitypropagation(tfidf_matrix):
    ap_cluster = AffinityPropagation(damping=0.5,
                                     max_iter=200,
                                     convergence_iter=15,
                                     copy=True,
                                     preference=None,
                                     affinity='euclidean',
                                     verbose=False)
    labels = ap_cluster.fit_predict(tfidf_matrix)  # fit once and reuse the labels
    print('affinitypropagation number of clusters: ', end="")
    print(len(set(labels)))  # number of clusters
    return labels
def feat_based_cluster(uniquePhrases,
                       gold,
                       phraseFeats,
                       clutter,
                       damping,
                       affinity='cosine',
                       saveto=None):
    phrase_dic = read_phrase(uniquePhrases)
    gt_df = pd.read_csv(gold, encoding="utf-8")
    feat_matrix = np.load(phraseFeats)

    img_list = gt_df.image.unique()
    result = []
    for img in img_list:
        gt_df_img = gt_df.query('image == %i' % img)

        feats = []
        phrases = []
        for _, item in gt_df_img.iterrows():
            phrase = item.phrase
            phrases.append(phrase)
            feats.append(feat_matrix[phrase_dic[phrase]])

        feats = np.vstack(feats)

        labels = []
        if len(phrases) > 1:
            if affinity == 'cosine':
                similarity = cosine_similarity(feats)
                pref = np.percentile(similarity, clutter)
                af = AffinityPropagation(preference=pref,
                                         affinity='precomputed',
                                         damping=damping)
                labels = af.fit_predict(similarity)

            elif affinity == 'euclidean':
                distance = -euclidean_distances(feats, squared=True)
                pref = np.percentile(distance, clutter)
                af = AffinityPropagation(preference=pref, damping=damping)
                labels = af.fit_predict(feats)
            else:
                raise RuntimeError('invalid affinity metric')
            if np.isnan(labels).any():  # when af did not converge
                labels = np.arange(labels.size)
        else:
            labels = [1]

        for i in range(len(phrases)):
            result.append({
                'image': img,
                'phrase': phrases[i],
                'label': labels[i]
            })
    return pd.DataFrame(result)
Example no. 9
def clusterSamples(model,trainDataIn,testDataIn,params):
    
    if model == 'SOM':
        
        # Map size 
        msz0 = params[0]
        msz1 = params[1]

        #print('SOM size: ', trainDataIn.shape[0])
        sm = SOM.SOM('sm', trainDataIn, mapsize = [msz0, msz1],norm_method = 'var',initmethod='pca')
        sm.train(n_job = 1, shared_memory = 'no',verbose='off')
        #sm.set_data_labels(list(instancesCorpus))

        if params[2] == True:
            #print('Hitmap for CORPUS (train, red) and TARGET (test, blue) data:')        
            sm.hit_map(testDataIn)

        testData_proj = sm.project_data(testDataIn)
        trainData_proj = sm.project_data(trainDataIn)
        testData_loc = sm.ind_to_xy(testData_proj)[:,2]
        trainData_loc = sm.ind_to_xy(trainData_proj)[:,2]

        return trainData_loc, testData_loc, sm

    if model == 'AffinityPropagation':
        
        model = AffinityPropagation()
        model.fit(trainDataIn)
        
        return model.predict(trainDataIn), model.predict(testDataIn), model
    
    if model == 'DBSCAN':
        
        model = DBSCAN()
        model.fit(trainDataIn)
        
        return model.fit_predict(trainDataIn), model.fit_predict(testDataIn), model

    if model == 'KMeans':
        
        model = KMeans(n_clusters=params[0])
        model.fit(trainDataIn)
        
        return model.predict(trainDataIn), model.predict(testDataIn), model
    
    if model == 'AgglomerativeClustering':
        model = AgglomerativeClustering(n_clusters=params[0])
        model.fit(trainDataIn)
        
        return model.fit_predict(trainDataIn), model.fit_predict(testDataIn), model    
Example no. 10
def affinity_propagation(D, preference='median'):
    assert D.shape[0] == D.shape[1], 'Matrix is not square!'

    if preference in ['minimum', 'min']:
        preference = np.min(-D)
    elif preference in ['maximum', 'max']:
        preference = np.max(-D)
    elif preference in ['median', 'med']:
        preference = np.median(-D)
    else:
        raise ValueError('preference must be: {minimum, maximum, median}')

    clusterer = AffinityPropagation(affinity='precomputed', preference=preference)
    clusterer.fit_predict(-D)
    return clusterer
Example no. 11
def cluster(playlist):
    arq = 'Total ' + playlist + '.csv'
    n_clusters = 0
    Full_data = pd.read_csv(arq)
    Full_data = Full_data.dropna(axis=1, how='all')
    Full_data = Full_data.dropna(axis=0, how='any')
    ID = Full_data['id']
    Mode = Full_data['mode']
    length = Full_data['duration_ms']
    artist = Full_data['artist']
    Full_data = Full_data.drop(
        columns=['track', 'album_id', 'artist', 'id', 'mode'])
    Fdata = Full_data.values
    scaler = Scaler()
    data_u = scaler.fit_transform(Fdata)
    # pca_transf = PCA(0.8)
    # PCA_data = pca_transf.fit_transform(data_u)
    clusterer = AffinityPropagation(random_state=None, preference=-500)
    # clusterer = HDBSCAN(min_cluster_size=20)
    # clusterer = MeanShift()
    labels = clusterer.fit_predict(data_u)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    labels.shape = (len(labels), 1)
    Full_data['cluster'] = labels + 1
    Full_data['id'] = ID
    Full_data['mode'] = Mode
    Full_data['artist'] = artist
    Full_data['duration_ms'] = length
    # Full_data.sort_values(by='cluster')
    Full_data.to_csv('clustered.csv', index=False)
    # sns.pairplot(Full_data, hue="cluster", palette='YlGnBu')
    # plt.show()
    return n_clusters
Example no. 12
def execute(args):
    ##############################################################################
    if len(args) < 1:
        usage()
        sys.exit()

    names, labels_true, X = parse(args[0])
    indices = [int(i) for i in args[1:]]
    relevant_names = names[1:]
    if len(indices) > 0:
        X = np.asarray([[sample[i] for i in indices] for sample in X])
        relevant_names = [relevant_names[i] for i in indices]
    print "Clustering on", str(relevant_names) + "..."

    ##############################################################################
    # Compute Affinity Propagation
    af = AffinityPropagation(preference=-50)
    # cluster_centers_indices = af.cluster_centers_indices_
    # labels = af.labels_
    #
    # n_clusters_ = len(cluster_centers_indices)

    y_pred = af.fit_predict(X)
    if y_pred is None or len(y_pred) == 0 or isinstance(y_pred[0], np.ndarray):
        return 0
    counts = get_cluster_counts(labels_true, y_pred)
    print(counts)
Example no. 13
def visual(c, X, y):
  from sklearn.cluster import AffinityPropagation
  cluster_object = AffinityPropagation()
  y_pred = cluster_object.fit_predict(X)
  colors = ['red', 'green', 'blue', 'cyan', 'black', 'yellow', 'magenta', 'brown', 'orange', 'silver', 'goldenrod', 'olive', 'dodgerblue']
  clusters = np.unique(y_pred)
  print("Cluster Labels")
  print(clusters)
  print("Evaluation")
  evaluation_labels(y, y_pred)
  evaluation(X, y_pred)
  for cluster in clusters:
    row_idx = np.where(y == cluster)
    plt.scatter(X[row_idx, 0], X[row_idx, 1])
  plt.title('Dataset')
  plt.xlabel('X1')
  plt.ylabel('X2')
  plt.legend()
  plt.show()
  for cluster in clusters:
    row_idx = np.where(y_pred == cluster)
    plt.scatter(X[row_idx, 0], X[row_idx, 1])
  plt.title('Clusters')
  plt.xlabel('X1')
  plt.ylabel('X2')
  plt.legend()
  plt.show()
Example no. 14
def cluster_trajectories( curves ):
    """Given a list of curves, cluster_trajectories will cluster them."""
    n_curves = len(curves)
    X_2B_clstrd = np.zeros( (n_curves, 4) )
    X_2B_clstrd[:,0] = np.array( [ curves[k][0, 0] for k in range(n_curves) ] )
    X_2B_clstrd[:,1] = np.array( [ curves[k][1, 0] for k in range(n_curves) ] )
    X_2B_clstrd[:,2] = np.array( [ curves[k][0,-1] for k in range(n_curves) ] )
    X_2B_clstrd[:,3] = np.array( [ curves[k][1,-1] for k in range(n_curves) ] )
        
    for col in range( 4 ):
        X_2B_clstrd[:,col] /=  X_2B_clstrd[:,col].std()
        
    def distance_metric(a,b):
        #A distance metric on R^4 modulo the involution
        #(x1,x2,x3,x4) -> (x3,x4,x1,x2)
        d = lambda a,b : np.sqrt( np.sum( (a-b)**2 ) )
        T = lambda x: np.array([x[2],x[3],x[0],x[1]])
        return min( d(a,b) , d(T(a),b) )
    from sklearn.cluster import AffinityPropagation
    clusterer = AffinityPropagation(affinity='precomputed', convergence_iter=100)
    aff = np.zeros((n_curves, n_curves))
    for i in range(n_curves):
        for j in range(i+1,n_curves):
            aff[i,j] = np.exp(-distance_metric( X_2B_clstrd[i], X_2B_clstrd[j])**2)
            aff[j,i] = aff[i,j]

    #clusterer.Affinity = aff
    cluster_labels = clusterer.fit_predict(aff)
    out = []
    for label in set(cluster_labels):
        cluster = [curves[k] for k in range(n_curves) if cluster_labels[k] == label]
        out.append(cluster)
    return [align_cluster(c) for c in out]
Example no. 15
def community_detection_by_affinity(graph, weight_type=WeightType.ABSOLUTE):

    # Has a high impact on Girvan-Newman clustering
    graph = nx.algorithms.tree.mst.maximum_spanning_tree(
        Graph.to_undirected(graph))

    mat, node_to_int = prepare_matrix(graph)

    af = AffinityPropagation(preference=-50)
    labels = af.fit_predict(mat)

    inv_node_to_int = {v: k for k, v in node_to_int.items()}

    clusters = {}
    for index, lab in enumerate(labels):
        class_name = inv_node_to_int[index]
        if lab not in clusters:
            clusters[lab] = []
        clusters[lab].append(class_name)

    print(f"\nClusters: {clusters}")
    print(f"Total Clusters: {len(clusters)}")

    pos = nx.spring_layout(graph)
    nx.draw_networkx(graph,
                     pos=pos,
                     edgelist=[],
                     node_color=labels,
                     with_labels=True,
                     node_size=250,
                     font_size=8)

    return [cluster for cluster in clusters.values()]
    def fit_predict(self, preprocessed_documents: List[List[tuple]], documents_features: List[np.ndarray],
                    dataset: Dataset, hyperparams=None, task=None) -> List[Segmentation]:
        assert len(documents_features) == len(preprocessed_documents)

        x_scaled = []
        for doc_features in documents_features:
            x_scaled.append(StandardScaler().fit_transform(doc_features))

        predicted_label_lists = []

        for i in range(len(documents_features)):
            start_time = time.time()

            x = documents_features[i]  # documents_features[i]  x_scaled[i]
            true_n_clusters = dataset.segmentations[i].author_count

            assert x.shape[0] == len(preprocessed_documents[i])

            diarizer = AffinityPropagation(damping=hyperparams['damping'],
                                           preference=hyperparams['preference'],
                                           copy=True, affinity='euclidean',
                                           max_iter=100, convergence_iter=5)

            labels = diarizer.fit_predict(x).tolist()
            predicted_label_lists.append(labels)

            estimated_n_clusters = len(set(labels))

            print('Document', i + 1, '/', len(documents_features), x.shape, 'in', time.time() - start_time, 's', )
            print('Real author count = {}, estimated = {}'.format(true_n_clusters, estimated_n_clusters))
            print()

        return generate_segmentation(preprocessed_documents, documents_features,
                                     predicted_label_lists, dataset.documents, task=task)
Example no. 17
def plotly_embedding(value, key):
    time5 = time.perf_counter()  # time.clock() was removed in Python 3.8
    # standardisation
    x_min, x_max = np.min(value, 0), np.max(value, 0)
    reducer = (value - x_min) / (x_max - x_min)
    # clusterisation

    clusterer = AffinityPropagation()
    cluster_labels = clusterer.fit_predict(reducer)
    X_projected = reducer
    x, y, z = np.random.multivariate_normal(np.array([0, 0, 0]), np.eye(3),
                                            400).transpose()
    trace1 = go.Scatter3d(x=X_projected[:, 0],
                          y=X_projected[:, 1],
                          z=X_projected[:, 2],
                          mode='markers',
                          marker=dict(size=12,
                                      color=cluster_labels,
                                      colorscale='Paired',
                                      opacity=0.8))

    data = [trace1]
    layout = go.Layout(title="TSNE", margin=dict(l=0, r=0, b=0, t=0))
    fig = dict(data=data, layout=layout)
    plot(fig)
def group_clusters(roughHull):
    # AffinityPropagation algorithm:
    af = AffinityPropagation(preference=-100).fit(roughHull.squeeze(1))
    cluster_indicators = af.labels_  # labels from the fit above; no need to refit
    
    # map the list that the affinity provided to the current roughHull, then
    # find the mean point of all of them
    cluster_centers = []
    current_cluster = 0
    while(current_cluster < len(af.cluster_centers_indices_)):
        i=0
        cluster=[]
        for x in cluster_indicators:
            if(x == current_cluster):
                cluster.append(roughHull[i][0].tolist())
            i+=1
        # now find the average point between these
        average = [0, 0]
        for y in cluster:
            average = [average[0] + y[0], average[1] + y[1]]
        average = [average[0]/len(cluster), average[1]/len(cluster)]
        cluster_centers.append(np.array([average]))
        current_cluster += 1
    cc = np.array(cluster_centers)
    return cc
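As a side note, the label-to-point mapping and mean-point computation in the loop above can be written more compactly. The sketch below is illustrative only (the function name is made up) and assumes, as in group_clusters, that roughHull is an OpenCV-style point array of shape (N, 1, 2).

import numpy as np
from sklearn.cluster import AffinityPropagation

def group_clusters_vectorized(roughHull):
    # Cluster the hull points once, then take the mean point of each cluster.
    pts = roughHull.squeeze(1).astype(float)
    labels = AffinityPropagation(preference=-100).fit_predict(pts)
    centers = np.array([pts[labels == lab].mean(axis=0) for lab in np.unique(labels)])
    return centers[:, None, :]  # shape (k, 1, 2), matching the loop version above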
def score_based_cluster(gold, pair_score, affinity_save_path, clutter,
                        damping):
    gt_df = pd.read_csv(gold, encoding="utf-8")

    img_list = gt_df.image.unique()

    result = []
    for img in img_list:
        gt_df_img = gt_df.query('image == %i' % img)

        phrases = []
        for _, item in gt_df_img.iterrows():
            phrase = item.phrase
            phrases.append(phrase)

        scores = np.load(affinity_save_path + '/' + str(img) + '.npy')
        if scores.size > 1:
            pref = np.percentile(scores, clutter)
            af = AffinityPropagation(preference=pref,
                                     affinity='precomputed',
                                     damping=damping)
            labels = af.fit_predict(scores)
            if np.isnan(labels).any():  # when af did not converge
                labels = np.arange(labels.size)
        else:
            labels = [1]

        for i in range(len(phrases)):
            result.append({
                'image': img,
                'phrase': phrases[i],
                'label': labels[i]
            })
    return pd.DataFrame(result)
Example no. 21
def hyper_affinity(args):
    global basic_data
    global all_data
    ap = AffinityPropagation(damping = args['damping'])
    pred = ap.fit_predict(basic_data)
    temp = sil_score(all_data, pred)
    # print(args)
    return -temp
Example no. 22
def apply_affinity_prop_consort(include_transformed):
    (X, y) = extract.generate_labelled_data(
        valid_labels=['1'],
        label_type='consort',
        include_transformed=include_transformed)
    am = AffinityPropagation()
    preds = am.fit_predict(X)
    return (X, preds)
Example no. 23
def cluster_ap_blobs():
    clustering_blobs = AffinityPropagation(affinity='euclidean',
                                           convergence_iter=5,
                                           damping=0.9,
                                           preference=-10.0)
    y_blobs = clustering_blobs.fit_predict(X_blobs)
    plt.scatter(X_blobs[:, 0], X_blobs[:, 1], c=y_blobs)
    print(y_blobs)
def test_sparse_input_for_fit_predict():
    # Test to make sure sparse inputs are accepted for fit_predict
    # (non-regression test for issue #20049)
    af = AffinityPropagation(affinity="euclidean", random_state=42)
    rng = np.random.RandomState(42)
    X = csr_matrix(rng.randint(0, 2, size=(5, 5)))
    labels = af.fit_predict(X)
    assert_array_equal(labels, (0, 1, 1, 2, 3))
Example no. 25
    def fit(self, vectors: [int, float]) -> [int, int]:
        vectors_ = list(zip(*vectors))[1]
        cluster_model = AffinityPropagation(damping=0.96, max_iter=10000, convergence_iter=15)
        cluster = cluster_model.fit_predict(vectors_)

        show_two_dimensions_plot(vectors_, cluster)

        return [(i, label) for i, label in enumerate(cluster)]
Example no. 26
class APModel(ClusteringModel):
    def __init__(self, n_clusters):
        super().__init__()
        self.n_clusters = n_clusters
        self.ap = AffinityPropagation(verbose=True)

    def fit_predict(self, feat):
        pred = self.ap.fit_predict(feat)
        return pred
Example no. 27
def semantic_clusters(lemmas, unique=True):
    words = lemmas
    if unique:
        words = list(set(lemmas))
    words = _filter_w2v(words)
    m = np.array(_get_matrix(words))
    agg = AffinityPropagation(affinity="precomputed")
    u = agg.fit_predict(m)
    return _group_words(words, agg.labels_)
Example no. 28
def get_affinity_clusters(listings):
    """Returns a list of cluster IDs based on relative similarity between
    listings."""
    a = get_similarity_matrix(listings)

    clf = AffinityPropagation(affinity='precomputed')
    clusters = clf.fit_predict(a)

    return clusters
Example no. 29
def main():
    args = parse_arguments()
    verb2vec, subject2vec, object2vec = get_vectors(args.vector_path)
    lines, _, _, _ = get_dict_and_samples(args.input_path, args.min_count, args.first_n, args.step)
    concatenated = concat_vectors(lines, verb2vec, subject2vec, object2vec)
    print(f"Shape: {concatenated.shape}")
    ap = AffinityPropagation()
    result = ap.fit_predict(concatenated)
    groups = group_result(result, lines)
    print(f"Number of clusters: {len(groups)}")
Example no. 30
def affinity(data,damping):
    # metric_list = ['euclidean', 'manhattan', 'chebyshev']
    # ap = AffinityPropagation(damping = args['damping'])

    db = AffinityPropagation(damping=damping)
    pred = db.fit_predict(data)  # fit_predict already fits the model; no separate fit needed
    score = sil_score(data,pred)
    print(score)
    return db,pred,score
Example no. 31
 def build_families(self, smiles, affin_matrix):
     cluster = AffinityPropagation()
     cls = cluster.fit_predict(affin_matrix)
     fam = {}
     for a, b in zip(smiles, cls):
         if b in fam:
             fam[b].add(a)
         else:
             fam[b] = set({a})
     return fam
Example no. 32
def _AffinityPropagation(corpus, labels):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    affinity_propagation = AffinityPropagation(damping=.5,
                                               max_iter=200,
                                               convergence_iter=25,
                                               copy=False)
    result_affinity_propagation = affinity_propagation.fit_predict(X.toarray())
    print('AffinityPropagation:',
          normalized_mutual_info_score(result_affinity_propagation, labels))
def affinity_propagation():
    """
    AffinityPropagation creates clusters by sending messages between pairs of
    samples until convergence. The messages sent between pairs represent the
    suitability for one sample to be the exemplar of the other, which is updated
    in response to the values from other pairs. These updates occur iteratively
    until convergence, at which point the final exemplars are chosen and hence
    the final clustering is given.

    Algorithm:

    The messages sent between pairs belong to one of two categories. The first
    is the responsibility, r(i,k), which is the accumulated evidence that sample
    k should be the exemplar for sample i. The second is the availability, a(i,k),
    which is the accumulated evidence that sample i should choose sample k to be
    its exemplar, and considers the values for all other samples for which k should
    be an exemplar. In this case exemplars are chosen by samples if they are:

        - similar enough to many samples, and
        - chosen by many samples to be representative of themselves.
    """
    # Generate a generic data sample.
    n_samples = 300
    std = 0.3
    seed = 0
    centers = [ [-1., 0.], [0., 1.5], [1., 0.] ]
    data, target = make_blobs(n_samples = n_samples, centers = centers,
        cluster_std = std, random_state = seed)

    # Set the preference for each point: samples with large preference values
    # are more likely to be chosen as exemplars. The number of exemplars, i.e.,
    # clusters, is influenced by the input preference values. If preferences are
    # not passed as arguments, they will be set to the median of the input
    # similarities.
    # pref = [ np.random.randint(low = -50, high = 0) for x in range(n_samples)]
    pref = -50
    # Compute affinity propagation.
    clf = AffinityPropagation(preference = pref)
    aff_y = clf.fit_predict(data)
    # Find mismatches between predicted and true values.
    cnt = int(0)
    for idx in range(n_samples):
        if(target[idx] != aff_y[idx]): cnt += 1
    # Print results.
    print('Approximated number of clusters ', len(clf.cluster_centers_indices_))
    print('Accuracy ', float(n_samples - cnt) / float(n_samples))
    print('Homogeneity ', metrics.homogeneity_score(target, clf.labels_))
    print('Completeness ', metrics.completeness_score(target, clf.labels_))

    # Plot resulting clusters.
    plt.figure(figsize = (8,8))
    plt.scatter(data[:,0], data[:,1], c = aff_y, s = 50)
    plt.title('Affinity clustering')
    plt.show()
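The docstring above describes the responsibility r(i,k) and availability a(i,k) messages in words. For illustration only, here is a minimal NumPy sketch of those damped update rules on a similarity matrix whose diagonal holds the preferences; it is not the scikit-learn implementation and it omits convergence checks.

import numpy as np

def ap_messages(S, damping=0.5, max_iter=200):
    # S: (n, n) similarity matrix, with S[k, k] = preference of sample k.
    n = S.shape[0]
    R = np.zeros((n, n))  # responsibilities r(i, k)
    A = np.zeros((n, n))  # availabilities a(i, k)
    idx = np.arange(n)
    for _ in range(max_iter):
        # r(i,k) <- s(i,k) - max_{k' != k} [a(i,k') + s(i,k')]
        AS = A + S
        first = AS.argmax(axis=1)
        first_val = AS[idx, first]
        AS[idx, first] = -np.inf
        second_val = AS.max(axis=1)
        R_new = S - first_val[:, None]
        R_new[idx, first] = S[idx, first] - second_val
        R = damping * R + (1 - damping) * R_new
        # a(i,k) <- min(0, r(k,k) + sum_{i' not in {i,k}} max(0, r(i',k)))
        Rp = np.maximum(R, 0)
        Rp[idx, idx] = R[idx, idx]
        A_new = Rp.sum(axis=0)[None, :] - Rp
        diag = A_new[idx, idx].copy()
        A_new = np.minimum(A_new, 0)
        A_new[idx, idx] = diag  # a(k,k) is not clipped
        A = damping * A + (1 - damping) * A_new
    # each sample's exemplar maximizes a(i,k) + r(i,k)
    return (A + R).argmax(axis=1)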
Example no. 34
def cluster_articles():
  ms = MongoStore()
  articles = [a for a in ms.get_pending_articles()]

  if len(articles) > 0:

    tfidf = TfidfVectorizer(tokenizer=preprocess)


    good_articles = [article for article in articles 
                     if article["text_content"].strip() != ""]

    texts = [article["text_content"] for article in good_articles]

    X_tfidf = tfidf.fit_transform(texts)

    print(X_tfidf)

    ap = AffinityPropagation(damping=0.95, max_iter=4000, 
            convergence_iter=400, copy=True, preference=-4, 
            affinity='euclidean', verbose=True)

    C = ap.fit_predict(X_tfidf)
    print(X_tfidf.shape, C.shape)
    print(C)
    centers = ap.cluster_centers_indices_
    clusters = []
    for c, center in enumerate(centers):

        
        members = np.where(C == c)[0]
        K = cosine_similarity(X_tfidf[members], X_tfidf[center])
        member_sims = [(m, float(k)) for m, k in zip(members, K)]
        member_sims.sort(key=lambda x: x[1], reverse=True)

        cluster = {"articles": [], "date": datetime.now(), "summarized": False}

        if len([member for member, sim in member_sims if sim > .55]) >= 3:
            print(texts[center][:75].replace("\n", " "))

            for member, sim in member_sims:

                print "\t{:3.3f} ".format(sim), 
                print good_articles[member]["title"][:60].replace("\n", " ")
                cluster["articles"].append((good_articles[member]["_id"], sim))
        else:
            continue
        
        clusters.append(cluster)

    if len(clusters) > 0:
        ms.insert_clusters(clusters)

    ms.set_clustered_flag(articles)
Example no. 35
def evaluate_clustering():

    similarity_matrix = get_sense_similarity_submatrix(range(10000))
    matrix_size = len(similarity_matrix)
    print('got matrix')

    affinity_propagation = AffinityPropagation()
    labels1 = affinity_propagation.fit_predict(similarity_matrix)
    print('affinity propagation')

    dbscan = DBSCAN(min_samples=1)
    labels2 = dbscan.fit_predict(similarity_matrix)
    print('print dbscan')

    distance_matrix = np.ndarray((matrix_size, matrix_size))
    for i in range(matrix_size):
        for j in range(matrix_size):
            distance_matrix[i, j] = 1 - similarity_matrix[i, j]

    print(distance_matrix[1, 2])
    print(distance_matrix[1, 1])

    print('created distance matrix')

    cluster_map1 = cluster_evaluation.fpena_get_clusters(labels1)
    cluster_map2 = cluster_evaluation.fpena_get_clusters(labels2)

    print(cluster_map1)
    print(cluster_map2)

    sc1 = sklearn.metrics.silhouette_score(distance_matrix, labels1, metric='euclidean')
    sc2 = sklearn.metrics.silhouette_score(distance_matrix, labels2, metric='euclidean')
    sc5 = cluster_evaluation.fpena_evaluate(cluster_map1, distance_matrix)
    sc6 = cluster_evaluation.fpena_evaluate(cluster_map2, distance_matrix)

    num_elements1 = [len(values) for values in cluster_map1.values()]
    num_elements2 = [len(values) for values in cluster_map2.values()]
    print(num_elements1)
    print(num_elements2)

    print('Number of clusters Affinity Propagation: %f' % len(cluster_map1))
    print('Number of clusters DBSCAN: %f' % len(cluster_map2))
    print('Average elements per cluster Affinity Propagation: %f' % np.mean(num_elements1))
    print('Average elements per cluster DBSCAN: %f' % np.mean(num_elements2))
    print('Standard deviation per cluster Affinity Propagation: %f' % np.std(num_elements1))
    print('Standard deviation per cluster DBSCAN: %f' % np.std(num_elements2))
    print('Silhouette score Affinity Propagation (distance matrix): %f' % sc1)
    print('Silhouette score DBSCAN (distance matrix): %f' % sc2)
    print('Dunn index Affinity Propagation (distance matrix): %f' % sc5)
    print('Dunn index DBSCAN (distance matrix): %f' % sc6)
Example no. 36
File: geo.py Project: kedz/cuttsum
def geo_worker_(job_queue, result_queue, **kwargs):
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    geocache = get_resource_manager(u"GeoCacheResource")
    geoquery = GeoQuery(geocache.get_tsv_path())
    event = kwargs.get(u"event")

    while not job_queue.empty():
        try:
            string_tsv_path, geo_tsv_path = job_queue.get(block=False)

            with gzip.open(string_tsv_path, u"r") as f:
                string_df = pd.io.parsers.read_csv(f, sep="\t", quoting=3, header=0)

            loc_strings = [
                loc_string for loc_string in string_df[u"locations"].tolist() if not isinstance(loc_string, float)
            ]

            coords = []

            for loc_string in loc_strings:
                for location in loc_string.split(","):
                    coord = geoquery.lookup_location(location)
                    if coord is not None:
                        coords.append(coord)

            centers = set()
            if len(coords) > 0:
                coords = np.array(coords)
                D = -geoquery.compute_distances(coords[:, None], coords)
                ap = AffinityPropagation(affinity=u"precomputed")
                Y = ap.fit_predict(D)

                if ap.cluster_centers_indices_ is not None:
                    for center in ap.cluster_centers_indices_:
                        centers.add((coords[center][0], coords[center][1]))

                    centers = [{u"lat": lat, u"lng": lng} for lat, lng in centers]
                    centers_df = pd.DataFrame(centers, columns=[u"lat", u"lng"])

                    with gzip.open(geo_tsv_path, u"w") as f:
                        centers_df.to_csv(f, sep="\t", index=False, index_label=False, na_rep="nan")

            result_queue.put(None)
        except Queue.Empty:
            pass

    return True
Example no. 37
def mhd_cluster_trajectories( curves ):
    """Returns clusters based upon the modified Hausdorff distance."""
    n_curves = len(curves)
    from sklearn.cluster import AffinityPropagation
    from modified_Hausdorff_distance import modified_Hausdorff_distance as mhd
    clusterer = AffinityPropagation(affinity='precomputed', convergence_iter=100)
    aff = np.zeros((n_curves, n_curves))
    for i in range(n_curves):
        for j in range(i+1, n_curves):
            aff[i,j] = mhd( curves[i].transpose(), curves[j].transpose() )
            aff[j,i] = aff[i,j]

    #clusterer.Affinity = aff
    cluster_labels = clusterer.fit_predict(aff)
    out = []
    for label in set(cluster_labels):
        cluster = [curves[k] for k in range(n_curves) if cluster_labels[k] == label]
        out.append(cluster)
    return [align_cluster(c) for c in out]
Example no. 38
def plot_similarity_clusters(desc1, desc2, files, plot = None):
	"""
	find similar sounds using Affinity Propagation clusters

	:param desc1: first descriptor values
	:param desc2: second descriptor values
	:returns:
	  - euclidean_labels: labels of clusters
	""" 

	if plot == True:
		print((Fore.MAGENTA + "Clustering"))
	else:
		pass
         
	min_max = preprocessing.scale(np.vstack((desc1,desc2)).T, with_mean=False, with_std=False)          
	pca = PCA(n_components=2, whiten=True)
	y = pca.fit(min_max).transform(min_max)
	    
	euclidean = AffinityPropagation(convergence_iter=1800, affinity='euclidean')                           
	euclidean_labels= euclidean.fit_predict(y)

	if plot == True:

		time.sleep(5)  

		print((Fore.WHITE + "Each number represents the group to which the sound belongs as an exemplar of other(s). Group '0' is colored blue, group '1' is colored red, group '2' is colored yellow. Look at the plot to see which sounds are exemplars of others"))
		print(np.vstack((euclidean_labels,files)).T)

		time.sleep(6)

		plt.scatter(y[euclidean_labels==0,0], y[euclidean_labels==0,1], c='b')
		plt.scatter(y[euclidean_labels==1,0], y[euclidean_labels==1,1], c='r')
		plt.scatter(y[euclidean_labels==2,0], y[euclidean_labels==2,1], c='y')
		plt.scatter(y[euclidean_labels==3,0], y[euclidean_labels==3,1], c='g')
		plt.show()
	else:
		pass

	return euclidean_labels
Example no. 39
    def cluster(self, normalize=False):
        """
        Cluster the nodes based on the PMI similarity measure. The clustering algorithm used is affinity propagation,
        which automatically chooses the number of clusters.

        :param normalize: If true, then normalize the similarity measured (i.e., the PMI) to be between -1 and 1.
        :return: The cluster labels.
        """
        if normalize:
            # use normalized PMI for similarity metric
            similarity = self.pmi / -np.log(self.joint_probs)
            similarity[np.diag_indices_from(similarity)] = 1.0
        else:
            similarity = self.pmi
            similarity[np.diag_indices_from(similarity)] = 1.1 * similarity.max()
        clustering = AffinityPropagation(affinity='precomputed', verbose=self.verbose,
                                         preference=similarity.min())
        clusters = clustering.fit_predict(similarity)
        if self.verbose:
            print('Found', len(np.unique(clusters)), 'clusters.')

        return clusters
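As context for the docstring above, here is a minimal sketch of how such a PMI similarity matrix could be built from a symmetric co-occurrence count matrix and clustered with the same precomputed-affinity settings as the method. The helper name is illustrative only, and the sketch assumes every pair of nodes co-occurs at least once (no zero joint probabilities).

import numpy as np
from sklearn.cluster import AffinityPropagation

def cluster_by_pmi(cooc, normalize=False, verbose=False):
    joint = cooc / cooc.sum()                 # p(i, j)
    marg = joint.sum(axis=1)                  # p(i)
    pmi = np.log(joint / np.outer(marg, marg))
    if normalize:
        sim = pmi / -np.log(joint)            # normalized PMI, in [-1, 1]
        np.fill_diagonal(sim, 1.0)
    else:
        sim = pmi.copy()
        np.fill_diagonal(sim, 1.1 * sim.max())
    ap = AffinityPropagation(affinity='precomputed', verbose=verbose,
                             preference=sim.min())
    return ap.fit_predict(sim)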
Example no. 40
def create_tag_categories():
    """Cluster MSE tags in to categories using sklearn AffinityPropogation.

       Any existing category system in the database will be overwritten.
    """
    con = connect_db()
    cur = con.cursor()

    query = """
    SELECT T.id, T.name, COUNT(Q.question_id) AS count FROM
    (
        SELECT tags.id, tags.name, COUNT(qt.question_id) AS count FROM tags
        JOIN question_tags AS qt ON qt.tag_id=tags.id
        WHERE tags.name NOT IN ('advice', 'applications', 'big-list', 
        'education', 'intuition', 'learning', 'math-history', 'math-software',
        'reference-request', 'self-learning', 'soft-question', 'teaching',
        'alternative-proof-strategy', 'proof-writing', 'visualization',
        'alternative-proof', 'proof-strategy', 'proof-verification',
        'solution-verification', 'definition', 'examples-counterexamples',
        'mathematica', 'wolfram-alpha', 'maple', 'matlab', 'sage', 'octave',
        'floor-function', 'ceiling-function', 'article-writing', 'publishing',
        'combinatorial-species', 'gromov-hyperbolic-spaces', 'chemistry',
        'book-recommendation')
        GROUP BY tags.name
    ) AS T
    JOIN question_tags AS Q ON T.id=Q.tag_id
    GROUP BY T.id"""
    cur.execute(query)
    tag_ids = []
    tag_names = []
    tag_indices = dict()
    tag_name_indices = dict()
    counts = []
    for q in cur:
        tag_ids.append(q['id'])
        tag_names.append(q['name'])
        tag_indices[q['id']] = len(tag_ids) - 1
        tag_name_indices[q['name']] = len(tag_ids) - 1
        counts.append(q['count'])

    tag_ids = np.array(tag_ids)
    tag_names = np.array(tag_names)

    query = """
    SELECT t1.id AS tag1, t2.id AS tag2, COUNT(qt1.question_id) as count
    FROM question_tags AS qt1
    JOIN question_tags AS qt2 ON qt1.question_id=qt2.question_id
    JOIN tags AS t1 ON t1.id=qt1.tag_id
    JOIN tags AS t2 ON t2.id=qt2.tag_id
    WHERE t1.id IN ({taglist}) AND t2.id IN ({taglist})
    GROUP BY t1.name, t2.name""".format(taglist=','.join(str(i) for i in tag_ids))
    cur.execute(query)

    paircounts = [[0 for i in range(len(tag_ids))] for j in range(len(tag_ids))]
    for q in cur:
        t1 = q['tag1']
        i1 = tag_indices[t1]
        t2 = q['tag2']
        i2 = tag_indices[t2]
        c = q['count']
        if i1 == i2:
            paircounts[i1][i1] = int(c/2)
        else:
            paircounts[i1][i2] = c

    sim = np.array(paircounts, dtype=np.float_)

    cluster = AffinityPropagation(affinity='precomputed', damping=0.5)

    labels = cluster.fit_predict(sim)

    classes = sorted(list(set(labels)))

    catnames = {i:tag_names[cluster.cluster_centers_indices_[i]] for i in \
            range(len(cluster.cluster_centers_indices_))}
    cur.execute("DELETE FROM categories WHERE 1;")
    cur.execute("DELETE FROM tag_categories WHERE 1;")

    query = "INSERT INTO categories (id,name) VALUES "
    catnames = [tag_names[cluster.cluster_centers_indices_[c]] for c in classes]
    query += ','.join("({},'{}')".format(c,catnames[c]) for c in classes)
    cur.execute(query)

    query = "INSERT INTO tag_categories (tag_id, category_id) VALUES "
    query += ','.join("({},{})".format(tag_ids[i], labels[i]) for i \
            in range(len(labels)))
    cur.execute(query)
    con.commit()
def main(argv):
   inputFile = ''
   outputFile = ''
   imax = 0
   jmax = 0
   inputFile = sys.argv[1]
   outputFile = sys.argv[2]
   if (len(sys.argv) < 4):
      # pick a default value.
      thisDamping = .92
   else: 
      # The third argument contains parameters in the format of key1:value1|key2:value2. In this
      # case we are only expecting one: "damping"
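      # e.g. the third argument might be passed as "damping:0.92" (example value only)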
      paramList = sys.argv[3].split("|")
      for thisParam in paramList:
         # first and only parameter should be damping
         paramSplit = thisParam.split(":")
         if (paramSplit[0] == "damping"):
             thisDamping = float(paramSplit[1])
   print('Input file is:', inputFile)
   print('Output file is:', outputFile)
   print('thisDamping is:', str(thisDamping))

            
   with open(inputFile, 'r', newline='') as csvfile:
      csvReader = csv.reader(csvfile, delimiter=',',quotechar='|')
      # First line is the number of distinct nodes.
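      # e.g. an input file might start with a line "3" followed by rows like "a,b,0.8" (illustrative values)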
      headerRows = next(csvReader)
      imax = int(headerRows[0])
      jmax = int(headerRows[0])
      print(str(imax) + " " + str(jmax))

      # define the matrix         
      simMatrix = np.zeros((imax, jmax), dtype=np.float)
      currentNodeIndex = 0
      # We build a map between the matrix we want to build and the node identifiers
      # as we read in the rows.
      thisI = 0
      thisJ = 0
      nodeMap = dict()

      # we also want a list that maps the indices to the node names
      indexList = list()
      for row in csvReader:
         if (row[0] in nodeMap):
            thisI = nodeMap[row[0]]
         else:
            nodeMap[row[0]] = currentNodeIndex
            indexList.append(row[0])
            currentNodeIndex += 1

         if (row[1] in nodeMap):
            thisJ = nodeMap[row[1]]
         else:
            nodeMap[row[1]] = currentNodeIndex
            indexList.append(row[1])
            currentNodeIndex += 1

         # matrix is symmetric
         simMatrix[thisI, thisJ] = float(row[2])
         simMatrix[thisJ, thisI] = float(row[2])

      for i in range(0,imax):
         # Set all of the diagonals to 1
         simMatrix[i,i] = 1.

   db = AffinityPropagation(affinity='precomputed',damping=thisDamping)
   labels = db.fit_predict(simMatrix)

   # Number of clusters in labels, ignoring noise if present.
   n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

   #print('Estimated number of clusters: %d' % n_clusters_)
   print(labels, len(labels))

   with open(outputFile, 'w', newline='') as csvoutfile:
      csvWriter = csv.writer(csvoutfile, delimiter=',',quotechar='|')
      for i in range(0, imax):
         csvWriter.writerow([indexList[i], labels[i]])
print("size X",len(X))
#kernel = gaussian_kde(X_p.T)

pref = [(-(mvn.pdf(x,[0,0],[[1,0],[0,.1]])))*100 for x in X]

alpha = 1
dists = np.array([
                  -(
                    euclidean(
                            u/(alpha*-np.log(mvn.pdf(u,[0,0],[[.01,0],[0,.01]]))),
                             v/(alpha*-np.log(mvn.pdf(v,[0,0],[[.01,0],[0,.01]])))
                              )
                    
#                     (
#                       (-np.log(mvn.pdf(u,[0,0],[[1,0],[0,1]]))
#                       +(-np.log(mvn.pdf(v,[0,0],[[1,0],[0,1]])))
#                       )
#                     )
                ) for u in X for v in X]).reshape((len(X),len(X)))

ap = AffinityPropagation(affinity = "precomputed",
                            #preference=pref
                             )
labels = ap.fit_predict(dists)
print("n labels", len(set(labels)))
import matplotlib.pyplot as plt
cmap = dict((label,np.random.beta(1,1,3)) for label in labels)
for x,label in zip(X,labels):
    plt.scatter(x[0],x[1],color=cmap[label])
plt.show()
Example no. 43
def cluster_affinity_propagation(similarity_matrix, desired_keys=None):

    numpy_matrix = similarity_matrix_to_numpy(similarity_matrix, desired_keys)

    clusterer = AffinityPropagation()
    return clusterer.fit_predict(numpy_matrix)
Example no. 44
          data_thr.rateC, data_thr.rateCA]
Html_file = open("clustering_files/affinitypropagation.html", "w")

# consider only 10000 data (spectralclustering memory complexity):
ind = np.array(10000 * [1] + (X.shape[0] - 10000) * [0]).astype(bool)
ind = shuffle(ind)
data_thr10 = pd.DataFrame(X[ind])
data_thr10.columns = data.columns

scaler = StandardScaler()
X = scaler.fit_transform(X)

X = X[ind]

km = AffinityPropagation(damping=0.95)
preds = km.fit_predict(X)

print "components:", set(preds)
print np.bincount(preds)

data_thr10['preds'] = pd.Series(preds).astype("category")
color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
             "brown", "green", "orange"] * 25

title = str(np.bincount(preds))
TOOLS = "wheel_zoom,box_zoom,reset,box_select,pan"
plot_width = 900
plot_height = 300
x_name = 'rateCA'
y_name = 'rate'
xmin_p = np.percentile(data_thr10[x_name], 0.1)
Example no. 45
def main():
    options = docopt.docopt(__doc__)

    features_file = h5py.File(options['<keypoints>'])
    cap = cv2.VideoCapture(options['<video>'])

    frame_idx = -1
    tracks = None
    frame_pair = (None, None)
    tracking = Tracking()
    cluster_tracks = []
    video_writer = None
    clusters = []

    while options['--max-frames'] is None or frame_idx < int(options['--max-frames']):
        # Read in frame image
        rv, frame = cap.read()
        frame_idx += 1
        
        # If we failed to read in a frame, exit
        if not rv:
            break
        
        if options['--no-video']:
            output_frame = np.zeros_like(frame)
        else:
            output_frame = np.copy(frame)

        if video_writer is None:
            h, w = frame.shape[:2]
            video_writer = cv2.VideoWriter(options['<output>'], cv2.cv.FOURCC(*'MJPG'), 25, (w,h), )

        # Show progress
        if frame_idx % 100 == 0:
            print('Frame index: {0} => {1} tracks'.format(frame_idx, len(tracking.tracks)))
            
        # Convert to greyscale
        frame_gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
            
        # Update frame pair
        frame_pair = (frame_pair[1], frame_gray)
        
        # Work out where in the keypoints file, keypoints start and end
        frame_kp_start, n_kps = features_file['frames'][frame_idx]
        
        # Find keypoint locations and descriptors
        kp_locs = features_file['keypoints'][frame_kp_start:(frame_kp_start+n_kps)]
        kp_descs = features_file['descriptors'][frame_kp_start:(frame_kp_start+n_kps)]
        
        # Convert locations to image space
        kp_im_locs = np.array(kp_locs, dtype=np.float32)
        h, w = frame_gray.shape
        kp_im_locs[:,0] += 0.5*w
        kp_im_locs[:,1] += 0.5*h

        # Construct a list of key points
        kps = list(Keypoint(frame_idx, loc[:2], loc[2], desc) for loc, desc in zip(kp_im_locs, kp_descs))

        if options['--show-kps']:
            for kp in kps:
                x, y = kp.location
                cv2.circle(output_frame, (int(x), int(y)), 5, (0,0,200), lineType=cv2.CV_AA)
        
        # Track this frame's keypoints
        tracking.add_frame(frame_pair[0], frame_pair[1], frame_idx, kps)
        
        # All states and covariances for this frame
        frame_states, frame_covars, frame_track_kps = [], [], []
        for t in tracking.tracks:
            if t.final_frame_idx < frame_idx or t.initial_frame_idx > frame_idx:
                continue
            frame_states.append(t.states[frame_idx - t.initial_frame_idx])
            frame_covars.append(t.covariances[frame_idx - t.initial_frame_idx].copy())
            frame_track_kps.append(t.associated_keypoints[-1])

        # Draw trails if required
        trail_length = int(options['--trail-length'])
        if trail_length > 0:
            for t in tracking.tracks:
                if t.final_frame_idx <= frame_idx - trail_length or t.initial_frame_idx > frame_idx:
                    continue

                start_frame = frame_idx - trail_length + 1
                start_idx = start_frame - t.initial_frame_idx

                for s1, s2 in zip(t.states[start_idx:-1], t.states[start_idx+1:]):
                    cv2.line(output_frame, (int(s1[0]), int(s1[1])),
                            (int(s2[0]), int(s2[1])), (200,0,200),
                            lineType=cv2.CV_AA)
            
        # Convert states to an array
        frame_states = np.array(frame_states)

        if not options['--no-cluster']:
            # PDF of choosing kp uniformly from image
            h, w = frame.shape[:2]
            non_cluster_pdf = -30

            # Best existing cluster for each state and the associated PDF
            state_association = [(-1, non_cluster_pdf),] * frame_states.shape[0]

            # PDF of choosing states from each active cluster
            for c_idx, cluster in enumerate(clusters):
                # skip elderly clusters
                if cluster.last_update_frame_idx != frame_idx - 1:
                    continue

                cluster_mu, cluster_sigma = cluster.predict(frame_idx)

                for s_idx in xrange(len(state_association)):
                    s = frame_states[s_idx,:]
                    c = frame_covars[s_idx]
                    _, current_pdf = state_association[s_idx]

                    pdf = mv_gaussian_log_pdf(s, cluster_mu, cluster_sigma + c)[0]
                    if pdf > current_pdf:
                        state_association[s_idx] = (c_idx, pdf)

            # Go through associations
            unassigned_states, unassigned_covars = [], []
            cluster_states = [None,] * len(clusters)
            for s, c, assoc in zip(frame_states, frame_covars, state_association):
                c_idx = assoc[0]
                if c_idx < 0:
                    unassigned_states.append(s)
                    unassigned_covars.append(c)
                    continue
                
                if cluster_states[c_idx] is None:
                    cluster_states[c_idx] = [(s, c)]
                else:
                    cluster_states[c_idx].append((s, c))

            for cluster, assignment in zip(clusters, cluster_states):
                if assignment is None:
                    if cluster.final_frame_idx >= frame_idx - 3:
                        cluster.update(frame_idx)
                else:
                    states = np.array(list(s for s,c in assignment))

                    if states.shape[0] >= 2:
                        sigma = np.cov(states.T)
                    else:
                        sigma = cluster.covariances[-1].copy()

                    mu = np.mean(states, axis=0)
                    for _, cov in assignment:
                        sigma += cov

                    cluster.update(frame_idx, mu, sigma)

                    minx, maxx = states[:,0].min(), states[:,0].max()
                    miny, maxy = states[:,1].min(), states[:,1].max()

                    if maxx - minx > 300 or maxy - miny > 300:
                        continue

                    cv2.rectangle(output_frame,
                            (int(minx), int(miny)), (int(maxx), int(maxy)), (0,0,200), lineType=cv2.CV_AA)

                    state, cov = cluster.predict(frame_idx)
                    draw_cov(output_frame, cov[:2,:2], state[:2], (0,0,200), lineType=cv2.CV_AA)

        # Draw 'o' over each frame state
        sc = 10.0
        filtered_states = []
        filtered_covs = []

        for kp, s, c in zip(frame_track_kps, frame_states, frame_covars):
            # Extract sigmas
            sigmas = np.diag(np.linalg.cholesky(c))

            # Only sufficiently 'good' features pass
            if options['--max-position-sigma'] is not None:
                if np.any(sigmas[:2] > float(options['--max-position-sigma'])):
                    continue

            if options['--max-velocity-sigma'] is not None:
                if np.any(sigmas[2:4] > float(options['--max-velocity-sigma'])):
                    continue

            ## Only those with keypoints at this frame
            #if kp.frame_idx != frame_idx:
            #    continue
                
            # Only those with minimum velocity
            #speed = np.sqrt(np.sum(s[2:4]*s[2:4]))
            #if speed < 0.5:
            #    continue
            
            filtered_states.append(s)
            filtered_covs.append(c)

            if not options['--no-show-states']:
                draw_cov(output_frame, c[:2,:2], s[:2], (255,0,0), lineType=cv2.CV_AA)
                cv2.line(output_frame, (int(s[0]), int(s[1])), (int(s[0]+sc*s[2]),
                    int(s[1]+sc*s[3])), (0,200,0), lineType=cv2.CV_AA)
                draw_cov(output_frame, sc*sc*c[2:4,2:4], s[:2]+sc*s[2:4], (0,200,0), lineType=cv2.CV_AA)

        filtered_states = np.array(filtered_states)

        # Cluster unlabelled states
        if not options['--no-cluster'] and len(unassigned_states) > 4:
            cluster_states = np.copy(np.array(unassigned_states))
            cluster_covs = list(unassigned_covars)

            clustering = AffinityPropagation()
            labels = clustering.fit_predict(cluster_states)

            # Process labels
            for label in np.unique(labels):
                label_indices = np.nonzero(labels == label)[0]
                if label_indices.shape[0] < 2:
                    continue

                label_states = cluster_states[label_indices, :]
                label_covs = list(cluster_covs[i] for i in label_indices)
                
                mu = np.mean(label_states, axis=0)
                sigma = np.cov(label_states.T)

                for c in label_covs:
                    sigma += c

                new_cluster = Cluster()
                new_cluster.update(frame_idx, mu, sigma)
                clusters.append(new_cluster)

                draw_cov(output_frame, sigma[:2,:2], mu, (0,200,200), lineType=cv2.CV_AA)
        
        # Write output
        video_writer.write(output_frame)
        
    del video_writer
Example no. 46
build_class_labels()
num_classes = len(urls)



sim_matrix = np.zeros((num_classes, num_classes))
record_in_matrix(sim_matrix)
sim_matrix = np.sqrt(sim_matrix)

np.savetxt("sim_mat.txt", sim_matrix)


clst = AffinityPropagation(affinity='precomputed')
#clst = SpectralClustering(n_clusters=7,affinity='precomputed')
classes = clst.fit_predict(sim_matrix)


with open("ap/centers.txt", "w") as f:
    for cluster_id, center_idx in enumerate(clst.cluster_centers_indices_):
        f.write(all_urls[center_idx])
        f.write(" ")
        f.write(str(cluster_id))
        f.write("\n")


with open("ap/clusters.txt", "w") as f:
    for idx, cls in enumerate(classes):
        f.write(all_urls[idx])
        f.write(" ")
        f.write(str(cls))
Example no. 47
# R1 = C1.fit_predict(Gram)
# 
n = len(Gram)
Di = np.reshape(np.diag(Gram),(n,1))
M = Di.dot(np.ones((1,n)))

D = M + M.T - 2*Gram

C2 = AffinityPropagation(affinity='precomputed')
C1 = KMeans(n_clusters = 5)
C3 = AgglomerativeClustering(n_clusters=5, affinity='precomputed',linkage='average')
C4 = SpectralClustering(n_clusters=5,affinity='precomputed')
C5 = SpectralBiclustering(n_clusters=(5,5))

R1 = C1.fit_predict(D)
R2 = C2.fit_predict(D)
R3 = C3.fit_predict(D)
R4 = C4.fit_predict(Gram +11)
R5 = C5.fit(D)

print(R4)

modèle = TSNE(n_components=2,metric='precomputed')
Trans = modèle.fit_transform(D)

G_ACP = ACP(Gram,precomputed=True)

trace_ACP(G_ACP,[10]*5)
##

import propre_TSNE as pt
Example no. 49
# cluster3 = vectorLinspace([4,1],[7,9], num=50)
# cluster3 = cluster1 + np.random.normal(5,.1,cluster3.shape)
# cluster4 = vectorLinspace([-1,4],[-4,2], num=50)
# cluster4 = cluster1 + np.random.normal(-5,.1,cluster4.shape)

X = cluster1#np.append(cluster1,np.append(cluster2,np.append(cluster3,cluster4,axis=0),axis=0),axis=0)
print(X)
print(pearsonr(X[:,0],X[:,1]),spearmanr(X[:,0],X[:,1]))
dists = np.zeros((len(X),len(X)))
for i1,x1 in enumerate(X): 
    print(i1,"/",len(X))
    for i2,x2 in enumerate(X):
#        for i3,x3 in enumerate(X):
#            if i1 != i2 and i2 != i3 and i1 != i3:
#                 tmp = np.append(x1,np.append(x2,x3,axis=0),axis=0).reshape((-1,2))
#                 #print(tmp)
#                 c = spearmanr(tmp[:,0],tmp[:,1])[0]
        dists[i1,i2] = cosine(x1,x2)
print(dists)
from sklearn.cluster import AffinityPropagation
ap = AffinityPropagation(affinity="precomputed")
y_pred = ap.fit_predict(dists)
print(len(set(y_pred)))
cmap = dict((y,np.random.beta(1,1,3)) for y in y_pred)
import matplotlib.pyplot as plt
for x,y in zip(X,y_pred):
    #plt.annotate(y,x,color=cmap[y])
    pass
plt.scatter(X[:,0],X[:,1])
plt.scatter(cluster2[:,0],cluster2[:,1])
plt.show()
Example no. 50
from sklearn.cluster import AffinityPropagation

from sklearn.manifold import TSNE


dataset = pd.read_csv('~/data/gene_expr_170104.csv')
data = np.array(dataset)[:, 1:].astype(float).T




Y = TSNE().fit_transform(data)
clus = AffinityPropagation()

lab = clus.fit_predict(Y)

x, y  = Y.T



plt.scatter(x, y, alpha=0.9, c = plt.cm.Spectral(lab.astype(float) / lab.max()), edgecolors='none')
# for i, j, t in zip(x, y, range(x.shape[0])):
#     plt.text(i, j, t, color = 'purple')

plt.show()

x, y, = SOS(iterations=10, alpha=1, beta=0, delta=0, theta=3.5).fit_transform(data).T

plt.scatter(x, y, alpha=0.4, c = plt.cm.Spectral(lab.astype(float) / lab.max()), edgecolors='none')
# for i, j, t in zip(x, y, range(x.shape[0])):
Example no. 51
if embeddings.shape[1] != 2:
    print("tsne")
    tsne = TSNE(2)
    embeddings_transformed = tsne.fit_transform(embeddings)
else:
    embeddings_transformed = embeddings
    #tsne = TSNE(2)
    #embeddings_transformed = tsne.fit_transform(embeddings)

print("clustering")
c2c = [5,6,7,8]
labels = dict()
from sklearn.cluster import AffinityPropagation
ap = AffinityPropagation()
for c in c2c:
    labels[c] = ap.fit_predict([emb for emb,concept 
                                in zip(embeddings_transformed,concepts) if concept ==c])
print(labels)
print("plotting")
import matplotlib.pyplot as plt
import seaborn
cmap = dict((key,np.random.beta(1,1,3)) for key in cognate_classes)
counters = {5:0,6:0,7:0,8:0}
for asjp_word,emb,cognate_class,concept in zip(asjp_words,embeddings_transformed,cognate_classes,concepts):
#     plt.annotate(asjp_word,emb,color=cmap[cognate_class])
    if concept == 5:
        plt.subplot(2,2,1)
        label = labels[5][counters[5]]
        plt.annotate(asjp_word+"_"+str(label),emb,color=cmap[cognate_class])
        counters[5] += 1
        
    if concept == 6:
        for j in range(size_berlin):
            if i != j:
                matrix_berlin[i][j] = (list_of_berlin_person[i].distance_of_two_persons(list_of_berlin_person[j]))

    for i in range(size_newcomers):
        for j in range(size_newcomers):
            if i != j:
                matrix_newcomer[i][j] = (list_of_newcomer_person[i].distance_of_two_persons(list_of_newcomer_person[j]))

    print(matrix_berlin)
    print(matrix_newcomer)

    print('_____________________________________')
    clusterer.fit(matrix_newcomer, y=None)
    print('_____________________________________')
    clusterer.fit_predict(matrix_newcomer, y=None)
    print('_____________________________________')
    #
    af = AffinityPropagation().fit(matrix_newcomer)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    print(labels)
    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    # print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    # print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    # print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    # print("Adjusted Rand Index: %0.3f"% metrics.adjusted_rand_score(labels_true, labels))
    # print("Adjusted Mutual Information: %0.3f"% metrics.adjusted_mutual_info_score(labels_true, labels))
    # print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric='sqeuclidean'))