def clustering(self):
		# Calculate similarity matrix
		X = self.create_tfidf_vector()
		X = X.toarray()
		pca = PCA(n_components=300, copy=False)
		X = pca.fit(X).transform(X)
		S = cosine_similarity(X, X)
		# Run affinity propagation
		af = AffinityPropagation()
		af.fit(S)
		# Formulate result
		tmp_clusters = defaultdict(list)
		goal_clusters = defaultdict(list)
		cluster_centers_indices = af.cluster_centers_indices_
		labels = af.labels_
		for count, label in enumerate(labels):
			exemplar = self.goal_list[cluster_centers_indices[label]]
			tmp_clusters[exemplar].append(self.goal_list[count])
		# 2nd-layer clustering of each cluster
		for goal, item_list in tmp_clusters.items():
			subclusters = self.subcluster_by_editdistance(goal, item_list)
			for subgoal, items in subclusters.items():
				goal_clusters[subgoal] = items
		return goal_clusters
Example 2
File: ooc.py Project: audy/bfc
def main():
    '''
        >>> main() # stuff happens
    '''

    args = parse_args()
    setup_logging(args.log, verbose=args.verbose)

    chunks = sequence_chunk_generator(args.fasta_file,
                                      chunk_size=args.chunk_size)

    hasher = HashingVectorizer(analyzer='char',
                               n_features = 2 ** 18,
                               ngram_range=(args.ngram_min, args.ngram_max),
                               )

    estimator = AffinityPropagation()

    for chunk in chunks:

        logging.info('hashing chunk')
        chunk_vector = hasher.transform([ str(i.seq) for i in chunk ])

        logging.info('clustering')

        estimator.fit(chunk_vector)

        logging.info('got %s clusters' % len(set(estimator.labels_)))
Example 3
 def clusterAffinityPropagation(self):
     """
     Cluster the embeddings with affinity propagation
     :return:
     """
     affin = AffinityPropagation()
     affin.fit(self.emb1.m)
     aflabels1 = affin.labels_
     afclusters1 = dict()
     word2cluster1 = dict()
     for i,l in enumerate(aflabels1):
         points = afclusters1.setdefault(l,list())
         points.append(self.emb1.rd[i])
     for l,c in afclusters1.items():
         for w in c:
             word2cluster1[w] = l
     self.cluster1 = afclusters1
     self.word2cluster1 = word2cluster1
     affin.fit(self.emb2.m)
     aflabels2 = affin.labels_
     afclusters2 = dict()
     word2cluster2 = dict()
     for i,l in enumerate(aflabels2):
         points = afclusters2.setdefault(l,list())
         points.append(self.emb2.rd[i])
     for l,c in afclusters2.items():
         for w in c:
             word2cluster2[w] = l
     self.cluster2 = afclusters2
     self.word2cluster2 = word2cluster2
def affinity_propagation(crime_rows, column_names):
    """
        damping : float, optional, default: 0.5
            Damping factor between 0.5 and 1.
        convergence_iter : int, optional, default: 15
            Number of iterations with no change in the number of estimated 
            clusters that stops the convergence.
        max_iter : int, optional, default: 200
            Maximum number of iterations.
        preference : array-like, shape (n_samples,) or float, optional
            Preferences for each point - points with larger values of preferences 
            are more likely to be chosen as exemplars. 
            The number of exemplars, i.e., of clusters, is influenced by the input 
            preferences value. If the preferences are not passed as arguments, 
            they will be set to the median of the input similarities.
        affinity : string, optional, default=``euclidean``
            Which affinity to use. At the moment precomputed and euclidean are 
            supported. euclidean uses the negative squared euclidean distance 
            between points.
    """
    crime_xy = [crime[0:2] for crime in crime_rows]
    crime_info = [crime[2:] for crime in crime_rows]
    print("Running Affinity Propagation")
    # TODO: Parameterize this
    affinity_prop = AffinityPropagation()
    #affinity_propagation_labels = affinity_prop.fit_predict(crime_xy)
    affinity_prop.fit(random_sampling(crime_xy, num_samples=5000))
    affinity_propagation_labels = affinity_prop.predict(crime_xy)
    print("formatting....")
    return _format_clustering(affinity_propagation_labels, crime_xy, crime_info, 
            column_names)
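A sketch of what the "TODO: Parameterize this" above might look like, passing the options described in the docstring straight through (the parameter names match scikit-learn's AffinityPropagation; the values shown are just the library defaults, not choices made by the original project):

from sklearn.cluster import AffinityPropagation

affinity_prop = AffinityPropagation(damping=0.5,
                                    max_iter=200,
                                    convergence_iter=15,
                                    preference=None,
                                    affinity='euclidean')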
def execute(args):
  ##############################################################################
  if len(args) < 1:
    usage()
    sys.exit()

  names, labels_true, X = parse(args[0])
  indices = [int(i) for i in args[1:]]
  relevant_names = names[1:]
  if len(indices) > 0:
    X = np.asarray([[sample[i] for i in indices] for sample in X])
    relevant_names = [relevant_names[i] for i in indices]
  print "Clustering on", str(relevant_names) + "..."

  
  ##############################################################################
  # Compute Affinity Propagation
  af = AffinityPropagation(preference=-50)
  # cluster_centers_indices = af.cluster_centers_indices_
  # labels = af.labels_
  # 
  # n_clusters_ = len(cluster_centers_indices)

  y_pred = af.fit_predict(X)
  if y_pred is None or len(y_pred) == 0 or type(y_pred[0]) is np.ndarray:
    return 0
  counts = get_cluster_counts(labels_true, y_pred)
  print counts
Example 6
def clusterSimilarityWithSklearnAPC(data_file,damping=0.9,max_iter=200,convergence_iter=15,preference='min'):
    """
    Compare Sparse Affinity Propagation (SAP) result with SKlearn Affinity Propagation (AP) Clustering result.
    Please note that convergence condition for Sklearn AP is "no change in the number of estimated clusters",
    for SAP the condition is "no change in the cluster assignment". 
    So SAP may take more iterations, and there will be slight differences in the final cluster assignment (exemplars for each sample).
    """
    # loading data
    simi_mat=loadMatrix(data_file)
    simi_mat_dense=simi_mat.todense()

    # get preference
    if preference=='min':
        preference=np.min(simi_mat_dense)
    elif preference=='median':
        preference=np.median(simi_mat_dense)
    
    print('{0}, start SKlearn Affinity Propagation'.format(datetime.now()))
    af=AffinityPropagation(damping=damping, preference=preference, affinity='precomputed',verbose=True)
    af.fit(simi_mat_dense)
    cluster_centers_indices,labels = af.cluster_centers_indices_,af.labels_
    sk_exemplars=np.asarray([cluster_centers_indices[i] for i in labels])
    print('{0}, start Fast Sparse Affinity Propagation Cluster'.format(datetime.now()))
    sap=SAP(preference=preference,convergence_iter=convergence_iter,max_iter=max_iter,damping=damping,verboseIter=100)
    sap_exemplars=sap.fit_predict(simi_mat_dense)
    
    # Calculate similarity between sk_exemplars and sap_exemplars
    exemplars_similarity=sparseAP_cy.arrSamePercent(np.array(sk_exemplars), np.array(sap_exemplars))
    
    return exemplars_similarity
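For reference, a plain-NumPy stand-in for the exemplar comparison above, assuming sparseAP_cy.arrSamePercent returns the fraction of positions at which the two exemplar arrays agree:

import numpy as np

def arr_same_percent(a, b):
    # fraction of samples assigned to the same exemplar index by both methods
    a, b = np.asarray(a), np.asarray(b)
    return float(np.mean(a == b))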
def cluster(mat, doc_indices):
    X = mat[:, doc_indices].T
    # Other clustering algorithms can easily be swapped in:
    # http://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster
    clust = AffinityPropagation()
    clust.fit(X)
    return zip(doc_indices,  clust.labels_)
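As the comment notes, other scikit-learn clusterers with the same fit/labels_ interface can be dropped in. A minimal sketch (MiniBatchKMeans and the cluster count are illustrative substitutions, not part of the original snippet):

from sklearn.cluster import MiniBatchKMeans

def cluster_kmeans(mat, doc_indices, n_clusters=20):
    # documents become rows, exactly as in the function above
    X = mat[:, doc_indices].T
    clust = MiniBatchKMeans(n_clusters=n_clusters)
    clust.fit(X)
    return zip(doc_indices, clust.labels_)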
Example 8
def make_cluster_map(damping=0.992):
	test_labels, prediction = pickle.load(open(f_path_pred, 'rb'))
	prob_conf = np.zeros((121, 121))
	for l in range(121):
		inds = np.squeeze(np.array(np.where(test_labels == l)))
		class_conf = prediction[inds, :].mean(axis=0)
		prob_conf[l, :] = class_conf
	F = prob_conf
	D = (1-F)
	np.fill_diagonal(D, 0)
	D_p = 0.5*(D+D.T)


	clst = AP(damping=damping, # damping determines # of clusters
			  max_iter=500, 
			  convergence_iter=15, 
			  affinity='euclidean', 
			  verbose=False)
	clst.fit(D_p)
	print 'Number of clusters:', len(clst.cluster_centers_)
	membership = np.c_[range(121), clst.labels_]

	fine_to_coarse = dict(membership)
	coarse_to_fine = {l: [] for l in clst.labels_}
	for k, v in fine_to_coarse.items():
		coarse_to_fine[v].append(k)
		
	pickle.dump(coarse_to_fine, open(os.path.join(curdir, 'coarse_to_fine.p'), 'wb'))
	pickle.dump(fine_to_coarse, open(os.path.join(curdir, 'fine_to_coarse.p'), 'wb'))
Example 9
    def create_stratum(self, column_names, **kwargs):
        '''
        Use affinity propagation to find number of strata for each column. 
        column_names is a list of the covariates to be split into strata and 
        used for classification. This function adds a column to the data frame
        for each column as column_name_strata that gives the strata designation
        for that variable.  The whole data frame is returned.
        '''

        for colname in column_names:
            X = self.data[colname].reshape(-1, 1)
            
            if np.isnan(X).any():
                raise ValueError("There are NaN values in self.data[%s] that the \
                                  clustering algorithm can't handle" % colname)
                                  
            elif np.unique(self.data[colname]).shape[0] <=2:
                string_name = colname+'_strata'
                self.data[string_name] = self.data[colname].astype(int)
        
            else:
                af_model = AP(damping = 0.9)
                strata_groups = af_model.fit(X)
                
                #cluster_centers_indices = af.cluster_centers_indices_
                #n_clusters_ = len(cluster_centers_indices)
                
                string_name = colname+'_strata'
                self.data[string_name] = strata_groups.labels_
                
        return self.data
Example 10
def cluster(scope):
    # Setup data
    df = pd.read_sql('playtype_data', db_engine)

    # Manipulate data into scope
    if scope == 'Team':
        df = df.drop('Player', 1).groupby('Team', as_index=False).mean()
    elif scope == 'Player':
        df = df.drop('Team', 1)
    else:
        raise Exception('This is never supposed to happen')

    # Normalize the data
    df[FEATURES] = (df[FEATURES] - df[FEATURES].mean()) / (df[FEATURES].max() - df[FEATURES].min())

    # Run clustering
    clstr = AffinityPropagation()
    clstr.fit(df[FEATURES])

    # Clump results
    df['cluster'] = clstr.labels_
    df = df.sort('cluster')

    # Convert results to JSON for frontend
    return clusters_to_json(df, scope)
Example 11
def affinity_propagation_cluster_analysis(x,y,preference):
    # NOT WORKING BECAUSE I DONT REALLY UNDERSTAND WHAT IT DOES...
    # ADAPTED FROM:
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_affinity_propagation.html#example-cluster-plot-affinity-propagation-py
    X = np.hstack((x.reshape((x.shape[0],1)),y.reshape((y.shape[0],1))))
    af = AffinityPropagation()
    af = af.fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    
    #print("number of estimated clusters : %d" % n_clusters_)
    colors = 'bgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmyk' #cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for i in xrange(len(np.unique(labels))):
        my_members = labels == i
        cluster_center = X[cluster_centers_indices[i]]
        plt.scatter(X[my_members, 0], X[my_members, 1],s=90,c=colors[i],alpha=0.7)
        plt.scatter(cluster_center[0], cluster_center[1],marker='+',s=280,c=colors[i])
        for j in X[my_members]:
            plt.plot([cluster_center[0], j[0]], [cluster_center[1], j[1]], c=colors[i], linestyle='--')
    tolx = (X[:,0].max()-X[:,0].min())*0.03
    toly = (X[:,1].max()-X[:,1].min())*0.03
    plt.xlim(X[:,0].min()-tolx,X[:,0].max()+tolx)
    plt.ylim(X[:,1].min()-toly,X[:,1].max()+toly)
    plt.show()
    return labels
    def cluster(self, feat_mtx, df_lm_allusers):
        # clustering artists based on AffinityPropagation
        start = time.time()
        af = AffinityPropagation()
        af.fit(feat_mtx)
        self.labels = af.labels_
        self.af = af

        # adding cluster labels to least misery dataframe and sorting by rank and cluster
        #df_least_misery_clustered = self.df_least_misery.copy() --> changing to df_lm_allusers
        print 'number of labels: ', len(self.labels)
        print 'labels', self.labels
        
        # print 'least misery clustered length', len(df_least_misery_clustered)
        
        df_least_misery_clustered = df_lm_allusers.copy()
        print 'len df least misery: ', len(df_least_misery_clustered)
        
        df_least_misery_clustered['cluster'] = self.labels
        df_least_misery_clustered[['cluster', self.score_col]] = df_least_misery_clustered[['cluster', self.score_col]].astype(float)
        ''' will do different sorting if not using rank '''
        # now set to false as looking for highest score
        df_least_misery_clustered = df_least_misery_clustered.sort(['cluster', self.score_col], ascending = False)
        self.df_least_misery_clustered = df_least_misery_clustered
        end = time.time()
        print 'clustering completed in: ', end - start  
        return df_least_misery_clustered
Example 13
def cluster_trajectories( curves ):
    """Given a list of curves, cluster_trajectories will cluster them."""
    n_curves = len(curves)
    X_2B_clstrd = np.zeros( (n_curves, 4) )
    X_2B_clstrd[:,0] = np.array( [ curves[k][0, 0] for k in range(n_curves) ] )
    X_2B_clstrd[:,1] = np.array( [ curves[k][1, 0] for k in range(n_curves) ] )
    X_2B_clstrd[:,2] = np.array( [ curves[k][0,-1] for k in range(n_curves) ] )
    X_2B_clstrd[:,3] = np.array( [ curves[k][1,-1] for k in range(n_curves) ] )
        
    for col in range( 4 ):
        X_2B_clstrd[:,col] /=  X_2B_clstrd[:,col].std()
        
    def distance_metric(a,b):
        # A distance metric on R^4 modulo the involution
        # (x1, x2, x3, x4) -> (x3, x4, x1, x2)
        d = lambda a,b : np.sqrt( np.sum( (a-b)**2 ) )
        T = lambda x: np.array([x[2],x[3],x[0],x[1]])
        return min( d(a,b) , d(T(a),b) )
    from sklearn.cluster import AffinityPropagation
    clusterer = AffinityPropagation(affinity='precomputed', convergence_iter=100)
    aff = np.zeros((n_curves, n_curves))
    for i in range(n_curves):
        for j in range(i+1,n_curves):
            aff[i,j] = np.exp(-distance_metric( X_2B_clstrd[i], X_2B_clstrd[j])**2)
            aff[j,i] = aff[i,j]

    #clusterer.Affinity = aff
    cluster_labels = clusterer.fit_predict(aff)
    out = []
    for label in set( cluster_labels):
        cluster = map( lambda k: curves[k] , filter( lambda k: cluster_labels[k] == label , range( n_curves) ) )
        out.append( cluster )
    return map( align_cluster, out)
Example 14
def loadKmeansData(dataArrayTest,dataArrayTrain,k,m='load'):
    if m=='load':
        centroidRead=open('centroid','r')
        labelClusterRead=open('labelCluster','r')
        labelPreRead=open('labelPre','r')
        centroid=pickle.load(centroidRead)
        labelCluster=pickle.load(labelClusterRead)
        labelPre=pickle.load(labelPreRead)
    else:
        dataArrayTestNorm = preprocessing.normalize(dataArrayTest)
        dataArrayTrainNorm = preprocessing.normalize(dataArrayTrain)
        #clf=MiniBatchKMeans(init='k-means++', n_clusters=k, n_init=10)
        clf=AffinityPropagation()
        #clf=DBSCAN(min_samples=30)
        pre=clf.fit(dataArrayTrainNorm)


        centroid=pre.cluster_centers_
        centroidWrite=open('centroid','w')
        #pickle.dump(centroid,centroidWrite)

        labelCluster=pre.labels_
        labelClusterWrite=open('labelCluster','w')
        #pickle.dump(labelCluster,labelClusterWrite)

        labelPre=clf.predict(dataArrayTestNorm)
        labelPreWrite=open('labelPre','w')
        #pickle.dump(labelPre,labelPreWrite)

    return centroid,labelCluster,labelPre
Example 15
def clusterise_data(data_obj):
    """ Assigns a cluster label to each days present in the data received 
        using three different algorithms: MeanShift, Affinity Propagation, 
        or KMeans. 
        @param data_obj: List of dictionaries
    """
    L = len(data_obj)
    
    #Simply converts data_obj to a 2D list for computation
    List2D = [[None for _ in range(4)] for _ in range(L-1)]
    for i in range(L-1): #don't include current day
        #wake_up and sleep_duration are the most important factors
        List2D[i][0] = 5 * data_obj[i]["wake_up"]
        List2D[i][1] = 1 * data_obj[i]["sleep"]
        List2D[i][2] = 5 * data_obj[i]["sleep_duration"]
        List2D[i][3] = 0.5 * data_obj[i]["activity"]
    points = NumpyArray(List2D) #converts 2D list to numpyarray
        
    if ALGO == "Affinity Propagation":
        labels = AffinityPropagation().fit_predict(points)
    elif ALGO == "KMeans":
        labels = KMeans(init='k-means++', n_clusters=5, n_init=10).fit_predict(points)
    elif ALGO == "MeanShift":
        bandwidth = estimate_bandwidth(points, quantile=0.2, n_samples=20)
        labels = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit_predict(points)
    else:
        raise Exception("Algorithm not defined: "+str(ALGO))
        
    for i in range(L-1):
        data_obj[i]["cluster"] = labels[i]
    for unique_label in remove_duplicates(labels):
        debug_print(ALGO+": Cluster "+str(unique_label)+" contains "+str(labels.tolist().count(unique_label))+" data points")
    debug_print(ALGO+": Silhouette coefficient"+ str(metrics.silhouette_score(points, labels, metric='euclidean')*100)+"%")
    def classify_core(self, N_CLUSTERS, clusterType, data_for_trial_type, begin_time, end_time):

        BEGIN_TIME_FRAME = begin_time*self.griddy.TIME_GRID_SPACING
        END_TIME_FRAME = end_time*self.griddy.TIME_GRID_SPACING

        data = data_for_trial_type[:,BEGIN_TIME_FRAME:END_TIME_FRAME,self.griddy.VEL_X]

        labels = None
        if clusterType == 'kmeans':
            kmeans = KMeans(n_clusters=N_CLUSTERS)
            kmeans.fit(data)
            labels = kmeans.labels_
        elif clusterType == 'affinity_propagation':
            ap = AffinityPropagation(damping=0.75)
            ap.fit(data)
            labels = ap.labels_
            N_CLUSTERS = np.max(labels) + 1
        elif clusterType == 'DBSCAN':
            dbscan = DBSCAN()
            dbscan.fit(data)
            labels = dbscan.labels_
            N_CLUSTERS = np.max(labels)+1
            print 'N_CLUSTERS=' + str(N_CLUSTERS)
        elif clusterType == 'AgglomerativeClustering':
            ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
            ac.fit(data)
            labels = ac.labels_
        else:
            print 'ERROR: clusterType: ' + clusterType + ' is not recognized'

        return (labels, N_CLUSTERS)
def affinity_propagation(feature_matrix):
    
    sim = feature_matrix * feature_matrix.T
    sim = sim.todense()
    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_          
    return ap, clusters
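Note that with the default affinity='euclidean', the dense similarity matrix above is treated as an ordinary feature matrix (one row of similarities per sample). If the intent is to cluster on the precomputed similarities themselves, a hedged variant would be:

import numpy as np
from sklearn.cluster import AffinityPropagation

def affinity_propagation_precomputed(feature_matrix):
    # interpret the Gram matrix as precomputed similarities rather than features
    sim = np.asarray((feature_matrix * feature_matrix.T).todense())
    ap = AffinityPropagation(affinity='precomputed')
    ap.fit(sim)
    return ap, ap.labels_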
Example 18
def clustering_affinity_propagation(data_res):
    """
    Executes sklearn's affinity propagation function with the given data frame
    """
    af = AffinityPropagation()
    af.fit(data_res)

    predictions = af.predict(data_res)
    cluster_centers = af.cluster_centers_

    return predictions, cluster_centers, af
Example 19
    def cluster_prop(self, filtered_data):
        prop_dict={}

        for review in filtered_data:
            for dicti in review['line']:
                if not prop_dict.has_key(dicti["prop"][0]):
                    prop_dict[dicti["prop"][0]]={"freq":0,"data":[],"idx":[]}

                prop_dict[dicti["prop"][0]]['idx'].append(review['index'])
                prop_dict[dicti["prop"][0]]["freq"] += 1
                prop_dict[dicti["prop"][0]]["data"].append(dicti)

        d_list=[]
        word_list=[]

        for word in prop_dict:
            try:
                d_list.append(self.wmodel[word])
                word_list.append(word)
            except:
                pass

        Aprop = AffinityPropagation(damping=0.6, convergence_iter=100, max_iter=10000)
        Aprop.fit(d_list)
        cluster_dict = {}

        for idx, each in enumerate(Aprop.labels_):
            vec = d_list[idx]
            if not cluster_dict.has_key(each):
                cluster_dict[each] = {"word":[],"freq":0,"seed":"","sim":0.0}
            cluster_dict[each]["word"].append(word_list[idx])

        total_freq=0

        for each in cluster_dict.keys():
            target_group_id = each
            group_id = each

            last_group_id = target_group_id

            cluster_freq=0
            max_seed=""
            max_freq=0

            for idx,data in enumerate(cluster_dict[each]["word"]):
                cluster_freq+=prop_dict[data]["freq"]
                if prop_dict[data]["freq"] > max_freq:
                    max_freq=prop_dict[data]["freq"]
                    max_seed=data

            cluster_dict[each]["freq"]=cluster_freq
            cluster_dict[each]["seed"]=max_seed

        return (cluster_dict, prop_dict, Aprop)
Example 20
def cluster_concepts(context="location"):
    """
	Cluster related concepts of a specific type to different categories
	"""
    db = Database()
    concept_category = ConceptCategory()
    cmd = "SELECT * FROM %s" % (context)
    context_res = db.query_db(cmd)
    concept_list = []
    concept_matrix = []
    for item in context_res:
        concept_list = []
        concept_matrix = []
        if context == "action":
            context_id, context_chinese, context_name = item[:3]
        elif context == "location":
            context_id, context_name, context_chinese = item
        cmd = (
            "SELECT b.name, b.id FROM %s_concept AS a, concept AS b \
				WHERE a.%s_id = %s AND a.concept_id = b.id"
            % (context, context, context_id)
        )
        concept_res = db.query_db(cmd)
        if len(concept_res) == 0:
            continue
        for item in concept_res:
            concept, concept_id = item
            concept_vector = concept_category.concept_axes.row_named(concept)
            concept_list.append((concept_id, concept))
            concept_matrix.append(concept_vector)
            # Run affinity propagation
        S = cosine_similarity(concept_matrix, concept_matrix)
        af = AffinityPropagation()
        af.fit(S)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_
        count = 0
        clusters = defaultdict(list)
        for label in labels:
            clusters[concept_list[cluster_centers_indices[label]][1]].append(concept_list[count])
            count += 1
        category_num = 0
        for key, value in clusters.items():
            category_num += 1
            for concept in value:
                cmd = (
                    "UPDATE %s_concept SET category = %d WHERE \
						%s_id = %s AND concept_id = %s"
                    % (context, category_num, context, context_id, concept[0])
                )
                db.query_db(cmd)
                print concept[1].encode("utf-8") + " ",
            print ""
        print "----------" + context_chinese.encode("utf-8") + "----------"
def affinityprop(lngs, lats, city, cluster_diameter):
	city_area = city["area"]
	city_lng = city["lng"]
	city_lat = city["lat"]
	lngs = np.array(lngs)#*(math.cos(city["lat"])**2)

	affinity = AffinityPropagation(damping=0.75, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False)
	affinity.fit(np.array([lngs, lats]).transpose())
	cluster_labels = np.array(affinity.labels_)

	return labels_to_index(cluster_labels)
Example 22
def cluster_articles():
  ms = MongoStore()
  articles = [a for a in ms.get_pending_articles()]

  if len(articles) > 0:

    tfidf = TfidfVectorizer(tokenizer=preprocess)


    good_articles = [article for article in articles 
                     if article["text_content"].strip() != ""]

    texts = [article["text_content"] for article in good_articles]

    X_tfidf = tfidf.fit_transform(texts)

    print X_tfidf

    ap = AffinityPropagation(damping=0.95, max_iter=4000, 
            convergence_iter=400, copy=True, preference=-4, 
            affinity='euclidean', verbose=True)

    C = ap.fit_predict(X_tfidf)
    print X_tfidf.shape, C.shape
    print C
    centers = ap.cluster_centers_indices_
    clusters = []
    for c, center in enumerate(centers):

        
        members = np.where(C == c)[0]
        K = cosine_similarity(X_tfidf[members], X_tfidf[center])
        member_sims = [(m, float(k)) for m, k in zip(members, K)]
        member_sims.sort(key=lambda x: x[1], reverse=True)

        cluster = {"articles": [], "date": datetime.now(), "summarized": False}

        if len([member for member, sim in member_sims if sim > .55]) >= 3:
            print texts[center][:75].replace("\n", " ")

            for member, sim in member_sims:

                print "\t{:3.3f} ".format(sim), 
                print good_articles[member]["title"][:60].replace("\n", " ")
                cluster["articles"].append((good_articles[member]["_id"], sim))
        else:
            continue
        
        clusters.append(cluster)

    if len(clusters) > 0:
        ms.insert_clusters(clusters)

    ms.set_clustered_flag(articles)
def affinity_propagation():
    """
    AffinityPropagation creates clusters by sending messages between pairs of
    samples until convergence. The messages sent between pairs represent the
    suitability for one sample to be the exemplar of the other, which is updated
    in response to the values from other pairs. These updates occur iteratively
    until convergence, at which point the final exemplars are chosen and hence
    the final clustering is given.

    Algorithm:

    The message sent between pairs belongs to one of two categories. The first
    is the responsibility, r(i,k), which is the accumulated evidence that sample
    k should be the exemplar for sample i. The second is the availability, a(i,k),
    which is the accumulated evidence that sample i should choose sample k to be
    its exemplar, taking into account the support from all other samples for k
    being an exemplar. In this case exemplars are chosen by samples if they are:

        - similar enough to many samples, and
        - chosen by many samples to be representative of themselves.
    """
    # Generate a generic data sample.
    n_samples = 300
    std = 0.3
    seed = 0
    centers = [ [-1., 0.], [0., 1.5], [1., 0.] ]
    data, target = make_blobs(n_samples = n_samples, centers = centers,
        cluster_std = std, random_state = seed)

    # Set the preference for each point: samples with large preference values
    # are more likely to be chosen as exemplars. The number of exemplars, i.e.,
    # clusters, is influenced by the input preference values. If preferences are
    # not passed as arguments, they will be set to the median of the input
    # similarities.
    # pref = [ np.random.randint(low = -50, high = 0) for x in range(n_samples)]
    pref = -50
    # Compute affinity propagation.
    clf = AffinityPropagation(preference = pref)
    aff_y = clf.fit_predict(data)
    # Find mismatches between predicted and true values.
    cnt = int(0)
    for idx in range(n_samples):
        if(target[idx] != aff_y[idx]): cnt += 1
    # Print results.
    print('Approximated number of clusters ', len(clf.cluster_centers_indices_))
    print('Accuracy ', float(n_samples - cnt) / float(n_samples))
    print('Homogeneity ', metrics.homogeneity_score(target, clf.labels_))
    print('Completeness ', metrics.completeness_score(target, clf.labels_))

    # Plot resulting clusters.
    plt.figure(figsize = (8,8))
    plt.scatter(data[:,0], data[:,1], c = aff_y, s = 50)
    plt.title('Affinity clustering')
    plt.show()
Example 24
def get_label_res2(similar_matrix, n_subs):

    cluster = AffinityPropagation(damping = 0.75 , affinity = 'precomputed') # preference = -1000)# n_clusters = n_subs, affinity = 'precomputed')

    res = cluster.fit(similar_matrix)

    size_labels = len(set(res.labels_))
    assert size_labels < 10, size_labels
    assert size_labels > 1, size_labels

    print res.labels_
    return res.labels_
Example 25
	def do_issue(data, data_name):
		reduced_points, labels, km = reduce_npoints_kmeans(dataframe = data, dataset_name = dataset, data_name=data_name, n_datapoints = 1000, load_from_file = False)
		transformed_data, pca, components = calculate_pca(reduced_points, n_components=3)
		colormap = brewer2mpl.get_map('RdBu', 'diverging', 4, reverse=True)
		filename = figure_save_path + dataset + '_issue_29_1_%s_reduced_number_of_points.png'%data_name
		print "Making scatter plot of %s data for dataset %s, where the number of points have been reduced by K-Means clustering"%(data_name, dataset)
		make_color_grouped_scatter_plot(data_frame=transformed_data, x_name='d1', y_name='d2', color_by='d3', filename=filename, colormap=colormap)

		ap = AffinityPropagation(damping=affinity_damping)
		ap.fit(reduced_points)
		print "Making scatter plot of Affinity Propagation clusters of %s data for dataset %s"%(data_name, dataset)
		filename = figure_save_path + dataset + '_issue_29_2_%s_affinity.png'%data_name
		make_scatter_plot_for_labelled_data(data_frame=transformed_data, x_name='d1', y_name='d2', labels=ap.labels_, filename=filename, colormap = colormap, legend=True)	
    def cluster(self, feat_mtx):
        # clustering artists based on AffinityPropagation
        af = AffinityPropagation()
        af.fit(feat_mtx)
        self.labels = af.labels_
        self.af = af

        # adding cluster labels to least misery dataframe and sorting by rank and cluster
        df_least_misery_clustered = self.df_least_misery.copy()
        df_least_misery_clustered['cluster'] = self.labels
        df_least_misery_clustered[['cluster', self.score_col]] = df_least_misery_clustered[['cluster', self.score_col]].astype(float)
        ''' will do different sorting if not using rank '''
        df_least_misery_clustered = df_least_misery_clustered.sort(['cluster', self.score_col])
        return df_least_misery_clustered
Example 27
def affinity(DF):
    '''
    Calculate and plot the affinity propagation
    time-series clustering algorithm; return the partition.
    '''
    X = normaliseTimeseries(DF)
    A = AffinityPropagation(damping=0.5, max_iter=200, convergence_iter=15)

    af = A.fit(X)

    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)

    M = metrics.silhouette_score(X, labels, metric='sqeuclidean')
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Silhouette Coefficient: %0.3f" % M)

    fig, axes = plt.subplots(nrows=n_clusters_, figsize=(24, 18), sharex='all')

    colors = nColors(k=n_clusters_, cmap='spectral')
    ticks = ['Monday', 'Tuesday', 'Wednesday',
             'Thursday', 'Friday', 'Saturday', 'Sunday']

    dates = [datetime.datetime(year=2015, month=1, day=i,
                               hour=0, minute=0) for i in range(1, 8)]

    for k, col in zip(range(n_clusters_), colors):
        X.iloc[cluster_centers_indices[k], :].plot(
            lw=1, c=col, label=k, alpha=.5, ax=axes[k])

        X[labels == k].T.plot(lw=.5, c=col, alpha=0.2, ax=axes[k], legend=0)
        axes[k].set_title('cluster %d, %d zipcodes' %
                          (k, len(X[labels == k])), fontsize=16)

        axes[k].set_xticklabels([], minor=False)  # the default
        axes[k].set_xticklabels(ticks, minor=True)
        axes[k].set_yticklabels([], minor=False)

        for d in dates:
            axes[k].axvline(x=d, ymin=0, ymax=1, alpha=.5, linewidth=2)

    plt.tight_layout()

    result = DF.T
    result['label'] = labels
    result.reset_index(inplace=1)
    result.rename(columns={'index': 'postalCode'}, inplace=1)

    return result[['postalCode', 'label']]
Example 28
def evaluate_clustering():

    similarity_matrix = get_sense_similarity_submatrix(range(10000))
    matrix_size = len(similarity_matrix)
    print('got matrix')

    affinity_propagation = AffinityPropagation()
    labels1 = affinity_propagation.fit_predict(similarity_matrix)
    print('affinity propagation')

    dbscan = DBSCAN(min_samples=1)
    labels2 = dbscan.fit_predict(similarity_matrix)
    print('print dbscan')

    distance_matrix = np.ndarray((matrix_size, matrix_size))
    for i in range(matrix_size):
        for j in range(matrix_size):
            distance_matrix[i, j] = 1 - similarity_matrix[i, j]

    print(distance_matrix[1, 2])
    print(distance_matrix[1, 1])

    print('created distance matrix')

    cluster_map1 = cluster_evaluation.fpena_get_clusters(labels1)
    cluster_map2 = cluster_evaluation.fpena_get_clusters(labels2)

    print(cluster_map1)
    print(cluster_map2)

    sc1 = sklearn.metrics.silhouette_score(distance_matrix, labels1, metric='euclidean')
    sc2 = sklearn.metrics.silhouette_score(distance_matrix, labels2, metric='euclidean')
    sc5 = cluster_evaluation.fpena_evaluate(cluster_map1, distance_matrix)
    sc6 = cluster_evaluation.fpena_evaluate(cluster_map2, distance_matrix)

    num_elements1 = [len(values) for values in cluster_map1.values()]
    num_elements2 = [len(values) for values in cluster_map2.values()]
    print(num_elements1)
    print(num_elements2)

    print('Number of clusters Affinity Propagation: %f' % len(cluster_map1))
    print('Number of clusters DBSCAN: %f' % len(cluster_map2))
    print('Average elements per cluster Affinity Propagation: %f' % np.mean(num_elements1))
    print('Average elements per cluster DBSCAN: %f' % np.mean(num_elements2))
    print('Standard deviation per cluster Affinity Propagation: %f' % np.std(num_elements1))
    print('Standard deviation per cluster DBSCAN: %f' % np.std(num_elements2))
    print('Silhouette score Affinity Propagation (distance matrix): %f' % sc1)
    print('Silhouette score DBSCAN (distance matrix): %f' % sc2)
    print('Dunn index Affinity Propagation (distance matrix): %f' % sc5)
    print('Dunn index DBSCAN (distance matrix): %f' % sc6)
Example 29
    def affinity_propagation(self, affinity_matrix=None, sigma=1, **kwargs):
        """

        :param kwargs: damping=0.5, max_iter=200, convergence_iter=15, copy=True, preference=None, verbose=False
        :return:
        """
        if affinity_matrix is None:
            aff = rbf(self.dm.values, sigma)
        else:
            aff = affinity_matrix

        est = AffinityPropagation(affinity='precomputed', **kwargs)
        est.fit(aff.view(np.ndarray))
        return Partition(est.labels_)
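The rbf helper is not shown in this snippet; a common definition, assuming self.dm holds a pairwise distance matrix, is the Gaussian kernel below (an assumption about the missing code, not the original implementation):

import numpy as np

def rbf(distances, sigma=1):
    # Gaussian (RBF) kernel: turns pairwise distances into similarities in (0, 1]
    return np.exp(-np.asarray(distances, dtype=float) ** 2 / (2.0 * sigma ** 2))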
Example 30
def affinity_cluster(all_features):
    # cluster all features
    X = np.array(all_features)
    af = AffinityPropagation(verbose=True, preference=-50).fit(X)

    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_

    n_clusters_ = len(cluster_centers_indices)

    print ("Estimated number of clusters: %d" % n_clusters_)

    # refit the model on X and return the fitted estimator
    X_fit = af.fit(X)
    return X_fit, n_clusters_
Example 31
def test_affinity_propagation_random_state_warning():
    # test that a warning is raised when random_state is not defined.
    X = np.array([[0, 0], [1, 1], [-2, -2]])
    match = "'random_state' has been introduced in 0.23."
    with pytest.warns(FutureWarning, match=match):
        AffinityPropagation().fit(X)
Example 32
def affinityPropagation(Data,DataLabels):
    print("=======AffinityPropagation========")
    clustering = AffinityPropagation().fit(Data)
    printResult(clustering.labels_,DataLabels)
    def handleEndGame(self, event, replay):
        try:
            pdict = {}
            for player in replay.players:
                player.bases = {}
                player.base_cluster = {}
                pdict[player.team_id] = player

            old_frames = {p.pid: 0 for p in replay.players}
            for frame in self.keyframes:
                for player in replay.players:
                    player.bases[frame] = {}
                    player.base_cluster[frame] = {}
                    if frame > 0:
                        for f in range(old_frames[player.pid] + 1, frame):
                            player.bases[f] = player.bases[old_frames[
                                player.pid]]
                            player.base_cluster[f] = player.base_cluster[
                                old_frames[player.pid]]
                        old_frames[player.pid] = frame

                for f, ls in self.lookup.items():
                    if f <= frame:
                        locs, teamids, finishes, prefs = zip(
                            *list(ls.values()))
                        unit_ids = list(ls.keys())
                    else:
                        break
                locs = np.array(locs)
                prefs = np.array(prefs)
                finishes = np.array(finishes)
                self.logger.debug(
                    f"(frame {frame}): locs = {locs.tolist()} prefs = {prefs.tolist()}"
                )

                af = AffinityPropagation(
                    preference=[0 if p else -5000 for p in prefs],
                    random_state=None).fit(locs)
                count = 1
                while -1 in af.labels_ and count < 100:  # indicates clustering did not converge, so we retry until it does (giving up after 100 tries)
                    af = AffinityPropagation(
                        preference=[0 if p else -5000 for p in prefs],
                        random_state=None,
                        max_iter=5000).fit(locs)
                    count += 1
                if count > 1:
                    self.logger.warning(
                        f"(frame {frame}): Tried {count} times to achieve convergence --- {'FAILED' if count == 100 else 'SUCCEEDED'}"
                    )
                cluster_centers_indices = af.cluster_centers_indices_
                centers = af.cluster_centers_.tolist()
                labels = af.labels_
                self.logger.debug(f"(frame {frame}): labels = {labels}")
                n_clusters = len(cluster_centers_indices)

                # mining location? must be separate cluster
                new_centers = []
                for k in range(n_clusters):
                    # mining bases in this cluster
                    mining_locs = [(loc, finish) for loc, finish, pref in zip(
                        locs[labels == k], finishes[labels == k], prefs[
                            labels == k]) if pref]
                    if len(mining_locs) > 1:
                        # split up clusters with more than one mining base
                        self.logger.debug(
                            f"(frame {frame}): mining_locs = {mining_locs}")
                        original = min(filter(lambda x: x is not None,
                                              mining_locs),
                                       key=lambda x: x[1])
                        to_split = [
                            x for x in mining_locs
                            if x[0].tolist() != original[0].tolist()
                        ]
                        self.logger.debug(
                            f"(frame {frame}): original = {original}, to_split = {to_split}"
                        )
                        for i, (loc, finish) in enumerate(to_split):
                            new_label = n_clusters + i
                            self.logger.debug(
                                f"(frame {frame}): changing {labels[(locs == loc).all(axis=1).nonzero()]} to {new_label}"
                            )
                            labels[(locs == loc).all(
                                axis=1).nonzero()] = new_label
                            members = [(loc, finish) for loc, finish, pref in
                                       zip(locs[labels == k], finishes[
                                           labels == k], prefs[labels == k])
                                       if not pref]
                            for ml, mf in members:
                                if dist(ml, loc) == min(
                                        dist(ml, x[0])
                                        for x in [original] + to_split[:i] +
                                        to_split[i + 1:]) and mf >= finish:
                                    self.logger.debug(
                                        f"(frame {frame}): changing {labels[(locs == ml).all(axis=1).nonzero()]} to {new_label}"
                                    )
                                    labels[(locs == ml).all(
                                        axis=1).nonzero()] = new_label
                        new_centers.append(loc)

                for c in new_centers:
                    cluster_centers_indices = np.append(
                        cluster_centers_indices,
                        (locs == c).all(axis=1).nonzero())
                    n_clusters += 1

                # maximum distance
                new_centers = []
                for loc in locs:
                    if all(
                            dist(loc, c) / self.map_dim > 0.1 for c in centers
                    ):  # too far away from any cluster center, should be split
                        if any(
                                dist(loc, select_center(cs)) /
                                self.map_dim <= 0.1 for cs in new_centers
                        ):  # close to an already split building, merge
                            _, i = min((dist(loc, select_center(cs)), i)
                                       for i, cs in enumerate(new_centers))
                            labels[(locs == loc).all(
                                axis=1).nonzero()] = n_clusters + i
                            new_centers[i].append(tuple(loc))
                        else:  # start a new cluster
                            labels[(locs == loc).all(axis=1).nonzero(
                            )] = n_clusters + len(new_centers)
                            new_centers.append([tuple(loc)])

                for cs in new_centers:
                    central = select_center(cs)
                    cluster_centers_indices = np.append(
                        cluster_centers_indices,
                        (locs == central).all(axis=1).nonzero())
                    n_clusters += 1
                self.logger.debug(
                    f"(frame {frame}): set(labels) = {set(labels)} center indices = {cluster_centers_indices}"
                )
                base_types = {}
                for loc, label in zip(locs, labels):
                    if any(np.array_equal(loc, m) for m in self.mains):
                        base_types[label] = BaseType.MAIN
                    elif label not in base_types and (is_mining_loc(
                            self.resource_clusters, loc)):
                        base_types[label] = BaseType.EXPANSION
                for unit_id, loc, team_id, label in zip(
                        unit_ids, locs, teamids, labels):
                    pdict[team_id].bases[frame][unit_id] = loc
                    pdict[team_id].base_cluster[frame][unit_id] = BaseCluster(
                        label, locs[cluster_centers_indices[label]],
                        base_types.get(label, BaseType.PROXY))

        except:
            print(locs)
            print(replay.filename)
            traceback.print_exc()

        for player in replay.players:
            if frame < replay.frames:
                for f in range(frame + 1, replay.frames + 1):
                    player.bases[f] = player.bases[frame]
                    player.base_cluster[f] = player.base_cluster[frame]
            assert len(
                player.bases
            ) == replay.frames + 1, f"{len(player.bases)} base entries, {replay.frames} frames {sorted(player.bases.keys())}"
class AP(object):
    def __init__(self,
                 damping=.5,
                 max_iter=200,
                 convergence_iter=15,
                 copy=True,
                 preference=None,
                 affinity='euclidean',
                 verbose=False,
                 random_state='warn'):
        """

        Parameters
        ----------
        damping : float, optional
            Damping factor, between 0.5 and 1. The default is .5.
        max_iter : int, optional
            Maximum number of iterations. The default is 200.
        convergence_iter : int, optional
            Number of iterations with no change in the number of estimated
            clusters that stops the convergence. The default is 15.
        copy : bool, optional
            Whether to copy the input data. The default is True.
        preference : array-like or float, optional
            The default is None.
        affinity : {"euclidean", "precomputed"}, optional
            Euclidean distance or a precomputed affinity matrix.
            The default is 'euclidean'.
        verbose : bool, optional
            The default is False.
        random_state : int, RandomState instance or 'warn', optional
            The default is 'warn'.

        Returns
        -------
        None.

        """
        self.ap_cluster = AffinityPropagation(
            damping=damping,
            max_iter=max_iter,
            convergence_iter=convergence_iter,
            copy=copy,
            preference=preference,
            affinity=affinity,
            verbose=verbose,
            random_state=random_state)

    def fit(self, x, y=None):
        self.ap_cluster.fit(X=x, y=y)

    def fit_predict(self, x, y=None):
        return self.ap_cluster.fit_predict(X=x, y=y)

    def get_params(self, deep=True):
        return self.ap_cluster.get_params(deep=deep)

    def set_params(self, params):
        self.ap_cluster.set_params(**params)

    def predict(self, x):
        return self.ap_cluster.predict(X=x)

    def get_cluster_centers_indices(self):
        return self.ap_cluster.cluster_centers_indices_

    def get_cluster_centers(self):
        return self.ap_cluster.cluster_centers_

    def get_labels(self):
        return self.ap_cluster.labels_

    def get_affinity_matrix(self):
        return self.ap_cluster.affinity_matrix_

    def get_n_iter(self):
        return self.ap_cluster.n_iter_
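A minimal usage sketch of the wrapper above (the toy data and parameter values are illustrative assumptions, and it presumes a scikit-learn version whose AffinityPropagation accepts random_state):

import numpy as np

# two tight groups of 2-D points; affinity propagation should pick one exemplar per group
X = np.array([[0.0, 0.0], [0.1, 0.1], [5.0, 5.0], [5.1, 4.9]])
ap = AP(damping=0.9, random_state=0)
labels = ap.fit_predict(X)
print(labels, ap.get_cluster_centers_indices())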
Example 35
def ReadFilesClusters(file2):
    print(file2)
    # Reading the file
    file = open(file2, "r")
    lineName = file2.split('.')
    lineName = lineName[0].split('/')
    print("lineName[1]=" + lineName[1])

    ## Create folders
    newpath = r'C:/Users/gquis/Documents/Visual Studio 2015/Projects/Kmeans/Kmeans/clusters/' + lineName[
        1]
    if not os.path.exists(newpath):
        os.makedirs(newpath)

    # Read data from files
    file2 = open("clusters/size" + lineName[1] + ".txt", "r")
    line = file2.readlines()
    [n, dim] = [int(val) for val in line[0].split()]

    dataSubCluster = np.zeros(
        (n, dim))  # the sub-data belonging to one cluster goes here

    # The IDs of the incoming clusters have to be mapped

    IDs = np.zeros((n, ), dtype=int)  # IDs according to the full list
    idx = 0
    for lineG in file:
        #print(lineG)  # each line is a number
        line2 = lineG.split()
        dataSubCluster[idx] = X[int(lineG)]  # features obtained

        IDs[idx] = int(lineG)
        idx = idx + 1

    #print('printing dataSubCluster')
    #print(dataSubCluster)
    sizeCluster = len(dataSubCluster)
    ## Apply clustering again
    ##############################################################################
    # Handle clusters with a single element
    if (sizeCluster == 1):
        cluster_centers_indices = [0]
        #print(cluster_centers_indices)
        labels = [0]
        n_clusters_ = 1
    else:
        af = AffinityPropagation(preference=None).fit(dataSubCluster)
        cluster_centers_indices = af.cluster_centers_indices_
        #print(cluster_centers_indices)
        labels = af.labels_
        n_clusters_ = len(cluster_centers_indices)

    instring = r'clusters/' + lineName[1] + '/ClustersOutPut.txt'
    print('instring=' + instring)
    file = open(instring, "w")
    for i in range(0, len(labels)):
        print(
            "labels[" + str(IDs[i]) + "]=" + str(IDs[int(labels[i])])
        )  # Note: this assumes the label uses, as the ID, the order in which the item entered the clustering process.
        file.write(str(IDs[int(labels[i])]) + " " + str(IDs[i]) + '\n')
    file.close()

    ########## After this module, some validation will have to be written
    ## to take only the folders that came out with elements in their clusters,
    ## since the files for the index-merging process are in C++ and cannot be
    # created automatically (it is possible, but it would take a while).
    ####################### Part where we collect the clusters #####################
    file = open('clusters/' + lineName[1] + '/dim.dim', "w")
    file.write(str(dim))
    file.close()
    ################ Delete the .cluster, .tex and .pdf files ####################

    os.system(
        'cd clusters/' + lineName[1] +
        '/ && del *.cluster && del *.pdf && del *.log && del *.aux && del sizecluster*'
    )

    os.system('cd clusters/' + lineName[1] +
              '/ && g++ recogerElementosClusters.cpp && a.exe')

    ## Present the clusters by creating the .tex
    ################################################################################
    os.system('cd clusters/' + lineName[1] +
              '/ && python PresentClusters.py')  # builds the .tex file

    ## Generate the PDFs
    ###############################################################################
    # Here, if possible, we run the .tex commands
    os.system('cd clusters/' + lineName[1] +
              '/ && python generarPDF.py')  # only generates PDFs
# train_x=X[:train_len]
# train_y=labels[:train_len]
#
# test_x=X[train_len:]
# test_y=labels[train_len:]

# KMeans
km = KMeans(n_clusters=class_num)
km.fit(X)
pred_y = km.labels_

nmi = normalized_mutual_info_score(labels, pred_y)
print('KMeans NMI:{:.4f}'.format(nmi))

# AffinityPropagation
affinity_propagation = AffinityPropagation(damping=0.9, preference=-1)
affinity_propagation.fit(X)
pred_y = affinity_propagation.labels_

nmi = normalized_mutual_info_score(labels, pred_y)
print('AffinityPropagation NMI:{:.4f}'.format(nmi))

# Mean-shift
bandwidth = estimate_bandwidth(X, quantile=0.2)
mean_shift = MeanShift(bandwidth=0.8, bin_seeding=True)
mean_shift.fit(X)
pred_y = mean_shift.labels_

nmi = normalized_mutual_info_score(labels, pred_y)
print('Mean-shift NMI:{:.4f}'.format(nmi))
Example 37
def test_affinity_propagation_predict():
    # Test AffinityPropagation.predict
    af = AffinityPropagation(affinity="euclidean", random_state=63)
    labels = af.fit_predict(X)
    labels2 = af.predict(X)
    assert_array_equal(labels, labels2)
Example 38
Field = [
    str(''.join(letter)) for letter_array in mat['field']
    for letter in letter_array
]

## preprocessing
Y = UCData
attrind = np.array(range(1, 51) + range(62, 78, 3))
Field = [Field[i] for i in range(1, 51) + range(62, 78, 3)]
X = AttrData[:, attrind]
X[np.isnan(X)] = 0
scaler = preprocessing.StandardScaler().fit(X)
Xn = scaler.fit_transform(X)

### cluster
model = KMeans(init='k-means++', n_clusters=6, n_init=10, max_iter=1000)
model = AffinityPropagation(preference=-150, verbose=True)
#model = Birch(branching_factor=10, n_clusters=4, threshold=0.3, compute_labels=True)
model = MeanShift(bandwidth=estimate_bandwidth(X, quantile=0.1, n_samples=100),
                  bin_seeding=True)

label = SSRS.Cluster(X, model)

### classification
model = tree.DecisionTreeClassifier()
model = GaussianNB()
model = svm.SVC()
model = SGDClassifier()

Tp = SSRS.Classification_cross(XXn, T=label, nfold=10, model=model)
SSRS.plotErrorMap(label, Tp)
Example 39
#df = pd.read_csv('C:/Users/neshragh/ecounter/Affinity_Sample_SPY/1-Dataset_ecounter/data analysis/HourlyAP/29apriloneweek.csv')#one day
df = pd.read_csv(
    'C:/Users/neshragh/ecounter/Affinity_Sample_SPY/1-Dataset_ecounter/data analysis/monthIntervention.csv'
)  #one day
# #############################################################################

#Choose data for algorithm
#df = df.loc[(df.Date >= '5/30/2019') & (df.Date <= '5/5/2019')]
df = df.loc[(df['Date'] >= '4/29/2019') & (df.Date <= '5/4/2019')]
#df = df[df]
df = df.loc[(df.Time >= '07') & (df.Time <= '19')]
X = df.loc[df.index, ['Count', 'Position']].to_numpy()

# Compute Affinity Propagation
af = AffinityPropagation(preference=-4, damping=.95, max_iter=500).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)

#print('Estimated number of clusters: %d' % n_clusters_)
#print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
#print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
#print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
#print("Adjusted Rand Index: %0.3f"
#      % metrics.adjusted_rand_score(labels_true, labels))
#print("Adjusted Mutual Information: %0.3f"
#      % metrics.adjusted_mutual_info_score(labels_true, labels))
'''print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))'''
Example 40
class LexRank(object):
    def __init__(self,
                 similarity='cosine',
                 decay_window=20,
                 decay_alpha=0.25,
                 clustering='dbscan',
                 tagger='twitter',
                 useful_tags=[
                     'Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb',
                     'Conjunction', 'Josa', 'PreEomi', 'Eomi', 'Suffix',
                     'Alpha', 'Number'
                 ],
                 delimiters=['. ', '\n', '.\n'],
                 min_token_length=2,
                 stopwords=stopwords_ko,
                 no_below_word_count=2,
                 no_above_word_portion=0.85,
                 max_dictionary_size=None,
                 min_cluster_size=2,
                 similarity_threshold=0.85,
                 matrix_smoothing=False,
                 n_clusters=None,
                 compactify=True,
                 **kwargs):
        self.decay_window = decay_window
        self.decay_alpha = decay_alpha
        if similarity == 'cosine':  # very, very slow :(
            self.vectorizer = DictVectorizer()
            self.uniform_sim = self._sim_cosine
        elif similarity == 'jaccard':
            self.uniform_sim = self._sim_jaccard
        elif similarity == 'normalized_cooccurrence':
            self.uniform_sim = self._sim_normalized_cooccurrence
        else:
            raise LexRankError(
                "available similarity functions are: cosine, jaccard, normalized_cooccurrence"
            )
        self.sim = lambda sentence1, sentence2: self.decay(
            sentence1, sentence2) * self.uniform_sim(sentence1, sentence2)
        self.factory = SentenceFactory(tagger=tagger,
                                       useful_tags=useful_tags,
                                       delimiters=delimiters,
                                       min_token_length=min_token_length,
                                       stopwords=stopwords,
                                       **kwargs)
        if clustering == 'birch':
            self._birch = Birch(threshold=0.99, n_clusters=n_clusters)
            self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix)
        elif clustering == 'dbscan':
            self._dbscan = DBSCAN()
            self._clusterer = lambda matrix: self._dbscan.fit_predict(1 - matrix)
        elif clustering == 'affinity':
            self._affinity = AffinityPropagation()
            self._clusterer = lambda matrix: self._affinity.fit_predict(1 - matrix)
        elif clustering is None:
            self._clusterer = lambda matrix: [0 for index in range(matrix.shape[0])]
        else:
            raise LexRankError(
                "available clustering algorithms are: birch, markov, no-clustering(use `None`)"
            )
        self.no_below_word_count = no_below_word_count
        self.no_above_word_portion = no_above_word_portion
        self.max_dictionary_size = max_dictionary_size
        self.similarity_threshold = similarity_threshold
        self.min_cluster_size = min_cluster_size
        self.matrix_smoothing = matrix_smoothing
        self.compactify = compactify

    def summarize(self, text):
        self.sentences = self.factory.text2sentences(text)
        self.num_sentences = len(self.sentences)
        self.corpus = SentenceCorpus(self.sentences, self.no_below_word_count,
                                     self.no_above_word_portion,
                                     self.max_dictionary_size)
        self.model = TfidfModel(self.corpus.bows,
                                id2word=self.corpus.dictionary,
                                normalize=True)
        self.tfidfs = self.model[self.corpus.bows]
        self._inject_tfidfs()
        self._build_matrix()
        self._clustering()
        if self.compactify:
            self._compactify()
        self.graphs = []
        for i in range(self.num_clusters):
            graph = self.sentences2graph(self.clusters[i])
            pagerank = networkx.pagerank(graph, weight='weight')
            self.clusters[i] = sorted(pagerank, key=pagerank.get, reverse=True)
            self.graphs.append(graph)

    def _sim_jaccard(self, sentence1, sentence2):
        if sentence1 == sentence2:
            return 1
        p = sum((sentence1.counter & sentence2.counter).values())
        q = sum((sentence1.counter | sentence2.counter).values())
        return p / q if q else 0

    def _sim_cosine(self, sentence1, sentence2):
        if sentence1 == sentence2:
            return 1
        sentence1_tfidf = {
            word_id: tfidf
            for word_id, tfidf in sentence1.tfidf
        }
        sentence2_tfidf = {
            word_id: tfidf
            for word_id, tfidf in sentence2.tfidf
        }
        vector1, vector2 = self.vectorizer.fit_transform(
            [sentence1_tfidf, sentence2_tfidf]).toarray()
        return vector1.dot(vector2)

    def _sim_normalized_cooccurrence(self, sentence1, sentence2):
        if sentence1 == sentence2:
            return 1
        return len(set(sentence1.tokens) & set(sentence2.tokens)) / (
            math.log(len(sentence1.tokens)) + math.log(len(sentence2.tokens)))

    def decay(self, sentence1, sentence2):
        distance = abs(sentence1.index - sentence2.index)
        closeness = max(self.decay_window - distance, 0) / self.decay_window
        return math.pow(closeness, self.decay_alpha)

    def _inject_tfidfs(self):
        for index in range(self.num_sentences):
            bow = self.corpus.bows[index]
            self.sentences[index].bow = bow
            self.sentences[index].tfidf = self.model[bow]

    def _build_matrix(self):
        self.matrix = np.zeros((self.num_sentences, self.num_sentences))
        for sentence1 in self.sentences:
            for sentence2 in self.sentences:
                self.matrix[sentence1.index,
                            sentence2.index] = self.sim(sentence1, sentence2)
        if self.matrix_smoothing:
            for index in range(self.num_sentences):
                self.matrix[index, index] = 0
                self.matrix[index, index] = max(self.matrix[index])

    def sentences2graph(self, sentences):
        graph = networkx.Graph()
        graph.add_nodes_from(sentences)
        for sentence1 in sentences:
            for sentence2 in sentences:
                weight = self.matrix[sentence1.index, sentence2.index]
                if weight:
                    graph.add_edge(sentence1, sentence2, weight=weight)
        return graph

    def _clustered(self):
        self.clusters = [
            cluster for cluster in self.clusters
            if len(cluster) >= self.min_cluster_size
        ]
        self.num_clusters = len(self.clusters)
        self.clusters = sorted(self.clusters,
                               key=lambda cluster: len(cluster),
                               reverse=True)

    def _clustering(self):
        cls = self._clusterer(self.matrix)
        bucket = {}
        for index in range(len(cls)):
            key = str(cls[index])
            if key not in bucket:
                bucket[key] = []
            bucket[key].append(self.sentences[index])
        self.clusters = bucket.values()
        self._clustered()

    def _compactify(self):
        clusters = []
        for cluster in self.clusters:
            compact_cluster = []
            cluster_size = len(cluster)
            for i in range(cluster_size):
                cluster[i].duplicated = False
            for i in range(cluster_size):
                if cluster[i].duplicated:
                    continue
                for j in range(i + 1, cluster_size):
                    if cluster[j].duplicated:
                        continue
                    if self.uniform_sim(
                            cluster[i],
                            cluster[j]) > self.similarity_threshold:
                        cluster[j].duplicated = True
                compact_cluster.append(cluster[i])
            clusters.append(compact_cluster)
        self.clusters = clusters
        self._clustered()

    def _verbose(self):
        summaries = sorted(self.summaries, key=lambda sentence: sentence.index)
        return [sentence.text for sentence in summaries]

    def probe(self, k=None):
        if not hasattr(self, 'clusters'):
            raise LexRankError("summarize it first")
        if not k:
            k = max(2, self.num_clusters)
        if k < 0:
            raise LexRankError(
                "appropriate value for `k`: float(0 ~ 1) for compress rate, or natural number for exact number of sentences"
            )
        if k > self.num_sentences:
            raise LexRankError("this will not give a summarization")
        if k < 1:
            k = int(self.num_sentences * k)
        self.summaries = []
        ends = np.array([len(cluster) for cluster in self.clusters])
        drones = np.zeros(ends.shape)
        for i in range(self.num_clusters):
            self.summaries.append(self.clusters[i][0])
            drones[i] += 1
            if len(self.summaries) == k:
                return self._verbose()
        while True:
            branch = np.array([drones + 1, ends]).min(axis=0) / ends
            leach = int(branch.argmin())
            drone = int(drones[leach])
            self.summaries.append(self.clusters[leach][drone])
            drones[leach] += 1
            if len(self.summaries) == k:
                return self._verbose()
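# A minimal usage sketch for the LexRank class above (a sketch only: it assumes the
# module-level helpers SentenceFactory, SentenceCorpus, TfidfModel, stopwords_ko and
# LexRankError plus their konlpy/gensim dependencies are available, and
# 'sample_document.txt' is a placeholder path for a real Korean document).
sample_text = open('sample_document.txt', encoding='utf-8').read()  # placeholder input
lexrank = LexRank(similarity='jaccard', clustering='dbscan')
lexrank.summarize(sample_text)
print(lexrank.probe(3))  # three representative sentences, in document order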
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
import numpy as np
from sklearn.decomposition import PCA
from time import time
# #############################################################################
#sample data
digits = load_digits()
data = scale(digits.data)
n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))
labels_true = digits.target

# Compute Affinity Propagation
t0 = time()
af = AffinityPropagation(preference=-5000).fit(data)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
print('name\t\t\t\ttime\thomo\tcompl\tNMI')
print('%-15s\t%.2fs\t%.3f\t%.3f\t%.3f' % (
    'AffinityPropagation',
    (time() - t0),
    metrics.homogeneity_score(labels_true, labels),
    metrics.completeness_score(labels_true, labels),
    metrics.normalized_mutual_info_score(
        labels_true, labels, average_method='arithmetic'),
))
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle
Esempio n. 42
0
def test_affinity_propagation_pairwise_is_deprecated():
    afp = AffinityPropagation(affinity='precomputed')
    msg = r"Attribute _pairwise was deprecated in version 0\.24"
    with pytest.warns(FutureWarning, match=msg):
        afp._pairwise
Esempio n. 43
0
def test_affinity_propagation():
    # Affinity Propagation algorithm
    # Compute similarities
    S = -euclidean_distances(X, squared=True)
    preference = np.median(S) * 10
    # Compute Affinity Propagation
    cluster_centers_indices, labels = affinity_propagation(
        S, preference=preference, random_state=39)

    n_clusters_ = len(cluster_centers_indices)

    assert n_clusters == n_clusters_

    af = AffinityPropagation(preference=preference, affinity="precomputed",
                             random_state=28)
    labels_precomputed = af.fit(S).labels_

    af = AffinityPropagation(preference=preference, verbose=True,
                             random_state=37)
    labels = af.fit(X).labels_

    assert_array_equal(labels, labels_precomputed)

    cluster_centers_indices = af.cluster_centers_indices_

    n_clusters_ = len(cluster_centers_indices)
    assert np.unique(labels).size == n_clusters_
    assert n_clusters == n_clusters_

    # Test also with no copy
    _, labels_no_copy = affinity_propagation(S, preference=preference,
                                             copy=False, random_state=74)
    assert_array_equal(labels, labels_no_copy)

    # Test input validation
    with pytest.raises(ValueError):
        affinity_propagation(S[:, :-1])
    with pytest.raises(ValueError):
        affinity_propagation(S, damping=0)
    af = AffinityPropagation(affinity="unknown", random_state=78)
    with pytest.raises(ValueError):
        af.fit(X)
    af_2 = AffinityPropagation(affinity='precomputed', random_state=21)
    with pytest.raises(TypeError):
        af_2.fit(csr_matrix((3, 3)))
def worker(X, damping):
    method = AffinityPropagation(damping=damping)
    method.fit(X)
    key = methodName + "/length_" + length + "/" + deg + "/individuals/affinity_propagation_damping_" + str(
        damping)
    np.savetxt(key + "_labels.csv", method.labels_, fmt="%d")
print(type(X_test.iloc[1, 1]))
#X_train = X_train.fillna(0)
X_ktrain = X_train.values
y_ktrain = y_train.values
#print(X_train.head())

N = X_ktrain.shape[0]
affinity = np.zeros((N, N))
for i in range(N):
    affinity[i, :] = bdist(X_ktrain, X_ktrain[i], 5, 1e-3, 1e-25)

#Time
tsum = 0
t = time.process_time()

cluster = AffinityPropagation(damping=0.5, affinity='precomputed')
labels = cluster.fit_predict(affinity)
C = np.unique(labels).size
clusters = X_ktrain[cluster.cluster_centers_indices_]

# estimate positions for test data
pred, error3D, error2D, fdetect, cused, true_labels, acc_pred = position_route(
    method,
    X_ktrain,
    y_ktrain,
    x_test,
    y_test,
    clusters,
    labels,
    N=5,
    eps=1e-3)
Esempio n. 46
0
def Affinity_Propagation(X, Y):
    print("Affinity_Propagation")
    label_pred = AffinityPropagation().fit_predict(X)
    score(Y, label_pred)
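# A small illustrative sketch of driving Affinity_Propagation() above on synthetic
# blobs; the module's own score(Y, label_pred) helper is not shown, so a stand-in
# based on the adjusted Rand index is defined here purely for the demo.
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score


def score(y_true, y_pred):  # hypothetical stand-in for the module's score()
    print("adjusted Rand index: %.3f" % adjusted_rand_score(y_true, y_pred))


X_demo, y_demo = make_blobs(n_samples=150, centers=3, random_state=0)
Affinity_Propagation(X_demo, y_demo)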
Esempio n. 47
0
    'n_neighbors': 10,
    'n_clusters': 5
}
bandwidth = estimate_bandwidth(embedding, quantile=params['quantile'])
connectivity = kneighbors_graph(embedding,
                                n_neighbors=params['n_neighbors'],
                                include_self=False)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ward = AgglomerativeClustering(n_clusters=params['n_clusters'],
                               linkage='ward',
                               connectivity=connectivity)
spectral = SpectralClustering(n_clusters=params['n_clusters'],
                              eigen_solver='arpack',
                              affinity="nearest_neighbors")
dbscan = DBSCAN(eps=params['eps'])
affinity_propagation = AffinityPropagation(damping=params['damping'],
                                           preference=params['preference'])
average_linkage = AgglomerativeClustering(linkage="average",
                                          affinity="cityblock",
                                          n_clusters=params['n_clusters'],
                                          connectivity=connectivity)
birch = Birch(n_clusters=params['n_clusters'])
gmm = GaussianMixture(n_components=params['n_clusters'],
                      covariance_type='full')
clustering_algorithms = (('AffinityPropagation', affinity_propagation),
                         ('MeanShift', ms), ('SpectralClustering', spectral),
                         ('Ward', ward), ('AgglomerativeClustering',
                                          average_linkage), ('DBSCAN', dbscan),
                         ('Birch', birch), ('GaussianMixture', gmm))
#now plot everything
f, ax = plt.subplots(2, 4, figsize=(20, 15))
for idx, (name, algorithm) in enumerate(clustering_algorithms):
from sklearn.datasets import make_blobs
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
import numpy as np

# #############################################################################
#import data
f = open("./pc1-pc2-completetn-aromagroups.txt")
x = np.loadtxt(f, delimiter='\t', skiprows=1)
# create np array for data points
data = np.array(x).astype("float")

#data[i][j], i varies the row (chooses the coordinates [pc1, pc2] at row i)
#data[i][j], j varies the column (chooses between pc1 and pc2 respectively 0 or 1)

X = data
# #############################################################################
# Compute Affinity Propagation
af = AffinityPropagation(preference=-50).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)

print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

# #############################################################################
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle

plt.close('all')
Esempio n. 49
0
plt.show()

#k-means clustring
kmeans = KMeans(n_clusters=3).fit(X_trans)  #n_clusters=3
# Fitting the input data and getting the cluster labels
labels_k = kmeans.labels_
print(labels_k)

#AgglomerativeClustering
agglomerative = AgglomerativeClustering(n_clusters=3).fit(X_trans)
# Fitting the input data and getting the cluster labels
labels_agg = agglomerative.labels_
print(labels_agg)

#AffinityPropagation
affinity = AffinityPropagation().fit(X_trans)
# Fitting the input data and getting the cluster labels
labels_aff = affinity.labels_
print(labels_aff)
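# Illustrative only: the three label vectors above can be compared pairwise with
# the adjusted Rand index to quantify how closely the clusterings agree.
from sklearn.metrics import adjusted_rand_score
print(adjusted_rand_score(labels_k, labels_agg))
print(adjusted_rand_score(labels_k, labels_aff))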
'''
--------------------- Question 3 ---------------------
'''

#set parameter for Gridsearch
cv = 10
n_features = X_train.shape[1]

#KNeighborsClassifier
n_neighbors_range = np.arange(1, 20, 1)
param_grid_n = dict(n_neighbors=n_neighbors_range)  #set tuning parameter range
Esempio n. 50
0
def affinity(df_std):
	AffPro = AffinityPropagation(max_iter=300, preference=-50, verbose=True)
	aa = AffPro.fit(df_std)
	return aa
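# Usage sketch for affinity() above (the parameter name df_std suggests it expects
# a standardised frame); the toy DataFrame below is illustrative only.
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({'a': [1.0, 2.0, 10.0, 11.0, 20.0, 21.0],
                   'b': [0.5, 0.4, 5.0, 5.2, 9.8, 10.1]})
df_std = pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns)
model = affinity(df_std)
print(model.labels_)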
Esempio n. 51
0
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.cluster import (KMeans, AffinityPropagation, MeanShift,
                             SpectralClustering, AgglomerativeClustering, DBSCAN)
from sklearn.mixture import GaussianMixture
import numpy as np
from time import time

digits = load_digits()
# get the raw data
origin_data = digits.data
# get the labels of the raw data, i.e. which class each sample belongs to
labels = digits.target

# standardize the raw data
data = scale(origin_data)
# count how many distinct classes appear in the labels
n_classes = len(np.unique(labels))

km = KMeans(init='random', n_clusters=10)
ap = AffinityPropagation()
ms = MeanShift()
sc = SpectralClustering(n_clusters=10, gamma=0.1)
ac = AgglomerativeClustering(n_clusters=10, linkage='average')
whc = AgglomerativeClustering(n_clusters=10, linkage='ward')
db = DBSCAN()
gm = GaussianMixture(n_components=10)

print(82 * '_')
print('name\t\ttime\t\th_score\t\tc_score\t\tnmi')


def bench(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    print('%s\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f' %
Esempio n. 52
0
from sklearn.datasets import make_blobs
from sklearn.cluster import AffinityPropagation
import numpy as np

## centers of the generated test data
centers = [[1, 1], [-1, -1], [1, -1]]
## generate the data
Xn, labels_true = make_blobs(n_samples=150,
                             centers=centers,
                             cluster_std=0.5,
                             random_state=0)

simi = []
for m in Xn:
    ## similarity of this point to every other point, i.e. one row of the matrix
    temp = []
    for n in Xn:
        ## use negative Euclidean distance as the similarity
        s = -np.sqrt((m[0] - n[0])**2 + (m[1] - n[1])**2)
        temp.append(s)
    simi.append(temp)

p = -50  ## 3 centers
#p = np.min(simi)  ## 9 centers
#p = np.median(simi)  ## 13 centers

ap = AffinityPropagation(damping=0.5,
                         max_iter=500,
                         convergence_iter=30,
                         preference=p).fit(Xn)
cluster_centers_indices = ap.cluster_centers_indices_

for idx in cluster_centers_indices:
    print(Xn[idx])
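# Since `simi` above already holds the full matrix of negative Euclidean
# similarities, the same clustering can be sketched with a precomputed affinity
# instead of letting AffinityPropagation recompute distances from Xn.
ap_pre = AffinityPropagation(damping=0.5,
                             max_iter=500,
                             convergence_iter=30,
                             preference=p,
                             affinity='precomputed').fit(np.array(simi))
print(len(ap_pre.cluster_centers_indices_), "clusters from the precomputed similarity matrix")
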
train_data = np.loadtxt('Train_Data.csv', dtype=np.float32, delimiter=',')
train_labels = np.loadtxt('Train_Labels.csv', dtype=np.int32, delimiter=',')
test_data = np.loadtxt('Test_Data.csv', dtype=np.float32, delimiter=',')
test_labels = np.loadtxt('Test_Labels.csv', dtype=np.int32, delimiter=',')
class_names = ['1', '2', '3']


# Feature Selection
all_data = np.vstack((train_data,test_data))
all_data_labels=np.hstack((train_labels,test_labels))
sel = VarianceThreshold(threshold=0.90*(1-0.90))
all_data = sel.fit_transform(all_data)
all_data_size, _ = all_data.shape
_, feature_size = all_data.shape

clustering = AffinityPropagation(preference= -1200,damping=0.92).fit(all_data)



tmp = clustering.labels_
replace_all(tmp,0,10)
replace_all(tmp,1,20)
replace_all(tmp,2,30)


replace_all(tmp,10,1)
replace_all(tmp,20,3)
replace_all(tmp,30,2)


import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.DataStructs import FingerprintSimilarity

chem_map = pd.read_csv("data/chem_all.csv").to_numpy()
topn_smile = [chem_map[int(idx.split('_')[1]), 0] for idx in topn_idx]


sm = np.zeros((n, n))
for i in range(n):
    for j in range(i, n):
        m1, m2 = Chem.MolFromSmiles(topn_smile[i]), Chem.MolFromSmiles(topn_smile[j])
        sm[i, j] = FingerprintSimilarity(Chem.RDKFingerprint(m1), Chem.RDKFingerprint(m2))

sm = sm + sm.T - np.eye(n)

from sklearn.cluster import AffinityPropagation

af = AffinityPropagation().fit(sm)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)
print("{} clusters: ".format(n_clusters_))
print("    Center index: {}".format(cluster_centers_indices.tolist()))
print("    Labels: {}".format(labels.tolist()))

send = {'1EVZ': ['CHEM_833', 'CHEM_84524', 'CHEM_6372', 'CHEM_28096', 'CHEM_16023'],
        '1P33': ['CHEM_6372', 'CHEM_40322', 'CHEM_4109', 'CHEM_8472'],
        '3HQQ': ['CHEM_6372', 'CHEM_40322', 'CHEM_4109', 'CHEM_16498'],
        '1T10': ['CHEM_6372', 'CHEM_40322', 'CHEM_3777', 'CHEM_38064', 'CHEM_74497']}


Esempio n. 55
0
                        shell=True)
        if opts.match:
            subprocess.call([
                TOMTOM + ' -oc ' + opts.outfile + '-aggthresh=' + str(thresh) +
                ' ' + opts.outfile + '-aggthresh=' + str(thresh) + '.meme ' +
                MOUSEMEME
            ],
                            shell=True)
else:
    #opts.clustering == 'representative'
    #shrink motif entropy to end up with fewer clusters?
    preference = np.array(
        [1.0 / motif_entropy(motif_dict[motif]) for motif in motifs]) * 0.25

    agg_clust = AffinityPropagation(affinity='precomputed',
                                    preference=preference).fit(
                                        np.clip(similarity_cor, 0,
                                                np.max(similarity_cor)))
    seq_motifs = {}
    print('# clusters:', len(set(agg_clust.labels_)))
    for label_ind, cluster_center in enumerate(
            agg_clust.cluster_centers_indices_):
        seq_motifs[motifs[cluster_center]] = motif_dict[motifs[cluster_center]]
        with open(opts.outfile + '-' + motifs[cluster_center] + '.cluster',
                  'w') as f:
            for mind in np.where(agg_clust.labels_ == label_ind)[0]:
                f.write(motifs[mind] + '\n')
    with open(opts.outfile + '-affinityprop.motifs', 'w') as f:
        for key in seq_motifs.keys():
            f.write('>' + key + '\n')
            for i in range(seq_motifs[key].shape[0]):
                f.write('\t'.join([str(v)
Esempio n. 56
0
class FeatureClusterer(ClusterMixin, BaseEstimator):
    #clusters features together
    #like sklearn feature agglomeration, but can work on dataframes and tracks names of the features

    def __init__(self, base_model = 'default', scale = False):
        if base_model is None or base_model == 'default':
            self.base_model = AffinityPropagation()
        else:
            self.base_model = base_model
        self.scale = scale
        assert(hasattr(self.base_model, 'fit_predict'))

    def fit(self, x, y = None):
        if self.scale:
            x = StandardScaler().fit_transform(x)
        x = x.transpose()
        self.labels = self.base_model.fit_predict(x)
        self.labels = self.map_to_zero(self.labels)

    def map_to_zero(self, labels):
        labels -= labels.min()
        unique_labels = set(labels)
        n_labels = len(unique_labels)
        if n_labels == labels.max():
            return labels
        for i in range(n_labels):
            if i not in set(labels):
                args = np.argwhere(labels > i)
                labels[args] -= 1
        return labels

    def predict(self, x, y = None):
        index = list(x.index)
        x = x.transpose()
#        if hasattr(self.base_model, 'predict'):
#            labels = self.base_model.predict(x)
#        else:
#            labels = self.base_model.fit_predict(x)
#        labels = self.map_to_zero(labels)
#        print(sorted(set(labels)))
        is_df = isinstance(x, pd.DataFrame)

        groups = [[] for x in range(len(set(self.labels)))]
        group_names = [[] for x in range(len(set(self.labels)))]
        for pos, groupnum in enumerate(self.labels):
            if is_df:
                feature = x.iloc[pos]
                groups[groupnum].append(feature.values)
                group_names[groupnum].append(feature.name)
            else:
                groups[groupnum].append(x[pos])

        f_out = np.zeros((len(set(self.labels)), x.shape[1]))
        for row, vals in enumerate(groups):
            f_out[row] = np.mean(vals, axis = 0)
        x_out = f_out.transpose()
        group_names = [','.join(gn) for gn in group_names]
        if is_df:
            x_out = pd.DataFrame(x_out, index=index, columns = group_names)
        return x_out

    def fit_predict(self, x, y = None):
        self.fit(x)
        return self.predict(x)
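# A short usage sketch for FeatureClusterer above: cluster two nearly identical
# columns of a toy DataFrame together and get back one averaged column per feature
# group (the data below is illustrative only).
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
base = rng.randn(50)
toy = pd.DataFrame({'f1': base, 'f2': base + 0.01 * rng.randn(50),
                    'f3': rng.randn(50)})
fc = FeatureClusterer(scale=True)
reduced = fc.fit_predict(toy)
print(reduced.shape, list(reduced.columns))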
Esempio n. 57
0
class ComplexBuilder(object):
    def __init__(self, method="HDBSCAN"):
        ""
        if method == "OPTICS":
            self.clustering = OPTICS(min_samples=2,
                                     metric="precomputed",
                                     n_jobs=4)
        elif method == "AGGLOMERATIVE_CLUSTERING":
            self.clustering = AgglomerativeClustering(affinity="precomputed")
        elif method == "AFFINITY_PROPAGATION":
            self.clustering = AffinityPropagation(affinity="precomputed")
        elif method == "HDBSCAN":
            self.clustering = hdbscan.HDBSCAN(min_cluster_size=2)
        self.method = method

    def set_params(self, params):

        self.clustering.set_params(**params)

    def fit(self,
            X,
            metricColumns,
            scaler=None,
            inv=False,
            poolMethod="min",
            umapKwargs={
                "min_dist": 1e-7,
                "n_neighbors": 4,
                "random_state": 350
            },
            generateSquareMatrix=True,
            preCompEmbedding=None,
            useSquareMatrixForCluster=False,
            entryColumns=["E1", "E2"]):
        """
        Fits predicted interactions to potential macromolecular complexes.


        """
        pooledDistances = None
        if X is not None and generateSquareMatrix and preCompEmbedding is None:
            #  print("Generate Square Matrix ..")
            # print(scaler)
            X, labels, pooledDistances = self._makeSquareMatrix(
                X, metricColumns, scaler, inv, poolMethod, entryColumns)
            # print(X)
            print("Info :: Umap calculations started.")
            umapKwargs["metric"] = "precomputed"
            embed = umap.UMAP(**umapKwargs).fit_transform(X)
        elif preCompEmbedding is not None:
            embed = preCompEmbedding.values
            labels = preCompEmbedding.index.values
            pooledDistances = None
            print("Info :: Aligned UMAP was precomputed. ")
        elif not generateSquareMatrix:
            labels = X.index.values
            umapKwargs["metric"] = "correlation"
            embed = umap.UMAP(**umapKwargs).fit_transform(X)
        else:
            raise ValueError(
                "X and preCompEmbedding are both None. No data for UMAP.")

    #  print("done .. - starting clustering")
        if self.method == "OPTICS":
            clusterLabels = self.clustering.fit_predict(X)
            return clusterLabels, labels, X, self.clustering.reachability_[
                self.clustering.ordering_], self.clustering.core_distances_[
                    self.clustering.ordering_]
        elif self.method in [
                "AGGLOMERATIVE_CLUSTERING", "AFFINITY_PROPAGATION"
        ]:
            clusterResult = self.clustering.fit_predict(X)
            return clusterResult, labels, X, ["None"] * labels.size, [
                "None"
            ] * labels.size
        elif self.method == "HDBSCAN":
            if useSquareMatrixForCluster:
                self.set_params({"metric": "precomputed"})
                clusterResult = self.clustering.fit(X)
            else:
                clusterResult = self.clustering.fit(embed)
        # self.clustering.condensed_tree_.to_pandas()
            return clusterResult.labels_, labels, X, clusterResult.probabilities_, [
                "None"
            ] * labels.size, embed, pooledDistances

    def _makeSquareMatrix(self, X, metricColumns, scaler, inv, poolMethod,
                          entryColumns):

        if scaler is None:
            if poolMethod == "mean":
                X["meanDistance"] = X[metricColumns].mean(axis=1)
            elif poolMethod == "max":
                X["meanDistance"] = X[metricColumns].max(axis=1)
            elif poolMethod == "min":
                X["meanDistance"] = X[metricColumns].min(axis=1)
        else:
            if poolMethod == "mean":
                X["meanDistance"] = scaler(X[metricColumns]).mean(axis=1)
            elif poolMethod == "max":
                X["meanDistance"] = scaler(X[metricColumns]).max(axis=1)
            elif poolMethod == "min":
                X["meanDistance"] = scaler(X[metricColumns]).min(axis=1)

        if inv:
            X['meanDistance'] = 1 - X['meanDistance']

        X = X.dropna(subset=["meanDistance"])

        uniqueValues = np.unique(X[entryColumns])
        uniqueVDict = dict([(value, n)
                            for n, value in enumerate(uniqueValues)])
        nCols = nRows = uniqueValues.size
        print("Info :: Creating {} x {} distance matrix".format(nCols, nCols))
        matrix = np.full(shape=(nRows, nCols),
                         fill_value=2.0 if scaler is not None else 1.0)
        columnNames = entryColumns + ["meanDistance"]
        for row in X[columnNames].values:

            nRow = uniqueVDict[row[0]]
            nCol = uniqueVDict[row[1]]

            matrix[[nRow, nCol], [nCol, nRow]] = row[2]
        if scaler is not None:
            matrix = (matrix - np.min(matrix)) / (np.max(matrix) -
                                                  np.min(matrix))
        np.fill_diagonal(matrix, 0)

        return matrix, uniqueValues, X
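# Illustrative sketch of the distance-matrix step above on a toy interaction table;
# the E1/E2 entry columns and the 'apex' metric column are assumptions mirroring the
# defaults, and the distances are negated before clustering because
# AffinityPropagation with a precomputed affinity expects similarities.
import pandas as pd

toy = pd.DataFrame({"E1": ["A", "A", "B", "C"],
                    "E2": ["B", "C", "C", "D"],
                    "apex": [0.1, 0.8, 0.2, 0.9]})
builder = ComplexBuilder(method="AFFINITY_PROPAGATION")
dist_matrix, entries, _ = builder._makeSquareMatrix(toy, ["apex"], None, False,
                                                    "min", ["E1", "E2"])
print(entries)
print(builder.clustering.fit_predict(-dist_matrix))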
Esempio n. 58
0
 def __init__(self,
              similarity='cosine',
              decay_window=20,
              decay_alpha=0.25,
              clustering='dbscan',
              tagger='twitter',
              useful_tags=[
                  'Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb',
                  'Conjunction', 'Josa', 'PreEomi', 'Eomi', 'Suffix',
                  'Alpha', 'Number'
              ],
              delimiters=['. ', '\n', '.\n'],
              min_token_length=2,
              stopwords=stopwords_ko,
              no_below_word_count=2,
              no_above_word_portion=0.85,
              max_dictionary_size=None,
              min_cluster_size=2,
              similarity_threshold=0.85,
              matrix_smoothing=False,
              n_clusters=None,
              compactify=True,
              **kwargs):
     self.decay_window = decay_window
     self.decay_alpha = decay_alpha
     if similarity == 'cosine':  # very, very slow :(
         self.vectorizer = DictVectorizer()
         self.uniform_sim = self._sim_cosine
     elif similarity == 'jaccard':
         self.uniform_sim = self._sim_jaccard
     elif similarity == 'normalized_cooccurrence':
         self.uniform_sim = self._sim_normalized_cooccurrence
     else:
         raise LexRankError(
             "available similarity functions are: cosine, jaccard, normalized_cooccurrence"
         )
     self.sim = lambda sentence1, sentence2: self.decay(
         sentence1, sentence2) * self.uniform_sim(sentence1, sentence2)
     self.factory = SentenceFactory(tagger=tagger,
                                    useful_tags=useful_tags,
                                    delimiters=delimiters,
                                    min_token_length=min_token_length,
                                    stopwords=stopwords,
                                    **kwargs)
     if clustering == 'birch':
         self._birch = Birch(threshold=0.99, n_clusters=n_clusters)
         self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix
                                                                  )
     elif clustering == 'dbscan':
         self._dbscan = DBSCAN()
         self._clusterer = lambda matrix: self._dbscan.fit_predict(1 -
                                                                   matrix)
     elif clustering == 'affinity':
         self._affinity = AffinityPropagation()
         self._clusterer = lambda matrix: self._affinity.fit_predict(1 -
                                                                     matrix)
     elif clustering is None:
         self._clusterer = lambda matrix: [
             0 for index in range(matrix.shape[0])
         ]
     else:
         raise LexRankError(
             "available clustering algorithms are: birch, markov, no-clustering(use `None`)"
         )
     self.no_below_word_count = no_below_word_count
     self.no_above_word_portion = no_above_word_portion
     self.max_dictionary_size = max_dictionary_size
     self.similarity_threshold = similarity_threshold
     self.min_cluster_size = min_cluster_size
     self.matrix_smoothing = matrix_smoothing
     self.compactify = compactify
    def handleEndGame(self, event, replay):
        try:
            pdict = {}
            for player in replay.players:
                player.bases = {}
                pdict[player.team_id] = player

            step_size = int(20 * 22.4)
            for frame in range(0, replay.frames + 1, step_size):
                for player in replay.players:
                    player.bases[frame] = {}
                    if frame > 0:
                        for f in range(frame - step_size + 1, frame):
                            player.bases[f] = player.bases[frame - step_size]

                for f, ls in self.lookup.items():
                    if f <= frame:
                        locs, teamids, finishes, prefs = zip(*list(ls.values()))
                        unit_ids = list(ls.keys())
                    else:
                        break
                locs = np.array(locs)
                prefs = np.array(prefs)
                finishes = np.array(finishes)

                af = AffinityPropagation(preference=[0 if p else -5000 for p in prefs], random_state=None).fit(locs)
                cluster_centers_indices = af.cluster_centers_indices_
                centers = af.cluster_centers_.tolist()
                labels = af.labels_
                n_clusters = len(cluster_centers_indices)

                # mining location? must be separate cluster
                new_centers = []
                for k in range(n_clusters):
                    # mining bases in this cluster
                    mining_locs = [(loc, finish) for loc, finish, pref in zip(locs[labels == k], finishes[labels == k], prefs[labels == k]) if pref]
                    if len(mining_locs) > 1:
                        # split up clusters with more than one mining base
                        original = min(mining_locs, key=lambda x: x[1])
                        to_split = [x for x in mining_locs if x[0].tolist() != original[0].tolist()]
                        for i, (loc, finish) in enumerate(to_split):
                            new_label = n_clusters + i
                            labels[(locs == loc).all(axis=1).nonzero()] = new_label
                            members = [(loc, finish) for loc, finish, pref in zip(locs[labels == k], finishes[labels == k], prefs[labels == k]) if not pref]
                            for ml, mf in members:
                                if dist(ml, loc) == min(dist(ml, x[0]) for x in [original] + to_split[:i] + to_split[i + 1:]) and mf >= finish:
                                    labels[(locs == ml).all(axis=1).nonzero()] = new_label
                        new_centers.append(loc)

                for c in new_centers:
                    cluster_centers_indices = np.append(cluster_centers_indices, (locs == c).all(axis=1).nonzero())
                    n_clusters += 1

                # maximum distance
                new_centers = []
                for loc in locs:
                    if all(dist(loc, c) / self.map_dim > 0.1 for c in centers):  # too far away from any cluster center, should be split
                        if any(dist(loc, select_center(cs)) / self.map_dim <= 0.1 for cs in new_centers):  # close to an already split building, merge
                            _, i = min((dist(loc, select_center(cs)), i) for i, cs in enumerate(new_centers))
                            labels[(locs == loc).all(axis=1).nonzero()] = n_clusters + i
                            new_centers[i].append(tuple(loc))
                        else:  # start a new cluster
                            labels[(locs == loc).all(axis=1).nonzero()] = n_clusters + len(new_centers)
                            new_centers.append([tuple(loc)])

                for cs in new_centers:
                    central = select_center(cs)
                    cluster_centers_indices = np.append(cluster_centers_indices, (locs == central).all(axis=1).nonzero())
                    n_clusters += 1

                for unit_id, loc, team_id in zip(unit_ids, locs, teamids):
                    pdict[team_id].bases[frame][unit_id] = loc
        except:
            print(locs)
            print(replay.filename)
            traceback.print_exc()

        for player in replay.players:
            if frame < replay.frames:
                for f in range(frame + 1, replay.frames + 1):
                    player.bases[f] = player.bases[frame]
            assert len(player.bases) == replay.frames + 1, f"{len(player.bases)} base entries, {replay.frames} frames {sorted(player.bases.keys())}"
Esempio n. 60
0
def malek_gentleman():
    os.system("sudo modprobe bcm2835-v4l2")
    timeout=0
    i_file = "1"
    N = "1"
    sumq=[]
    buf_back=[]
    colq=[]
    count=[]
    initialisation=0
    initialisation+=1
    i=1
    VideoNumberInt=i
    VideoNumberString=i_file
    VideoName=VideoNumberString+'.mp4'
    
    cap = cv2.VideoCapture(0)  # open the video capture (camera 0)
    if( initialisation <= int(N)):
        count.append(0)  # count[i-1] is the number of frames read so far
    nframe = 300 #no of frames needed to initialize the background
    cols = 160
    rows = 160
    flag = 0
    move = 0
    avg = np.zeros([160,160],dtype=np.uint8)  # define an image 'avg' with every pixel set to zero; uint8 is an unsigned integer (0 to 255)
    avg_temp = np.zeros([160,160],np.uint)  # plain np.uint gives a wider integer range than uint8, used for the accumulating sum
    if( initialisation <= int(N)):
        sumq.append(np.zeros([160,160],np.uint)) # the same thing
    cur_back = np.zeros([160,160],dtype=np.uint8)  # define an image cur_back with every pixel set to zero
    if( initialisation <= int(N)):
        buf_back.append(np.zeros([160,160],dtype=np.uint8))  # define an image buf_back with every pixel set to zero
    if( initialisation <= int(N)):
        colq.append(Queue())  # define a FIFO queue of frames

    #to form clusters
    count_5=0
    arr=np.zeros(shape=(0, 2), dtype=np.uint8)  # define a two-column array 'arr' for detected centroids
    cur_cent=last_cent=[0,0]
    def dist(cur_cent, last_cent):
        dis=(cur_cent[0]-last_cent[0])**2 + (cur_cent[1] - last_cent[1])**2  # squared distance between the two centroids
        return dis
    cluster_centres_q = np.zeros(shape=(0,2),dtype=np.int64)  # queue of detected cluster centres, stored as (x, y) pairs
    ret, pure_img = cap.read()  # read the next frame into pure_img; ret is the boolean returned by read()
    #print("ker")
    #print(cap.isOpened())
    #print(ret)
    while(cap.isOpened() and ret==True):  # while the capture is open and a frame was read
            #print("im")
            #time.sleep(0.1)
            count[i-1] = count[i-1] + 1  # increment count[i-1] for every frame processed (it holds the number of frames processed so far)
            img = cv2.resize(pure_img,(160,160))  # resize to 160x160; this keeps the overall shape of the image, though some distortion is possible
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # convert the frame to grayscale
    
            colq[i-1].enqueue(gray)  # push the grayscale frame onto our queue
            if count[i-1] == nframe:
                print("Background fixed !")
            
            if(count[i-1] < nframe):
                    #avg[:]=0 #no need as avg is initialized to be zero matrix
                    sumq[i-1] = sumq[i-1] + gray  # accumulate the frames to build the background
                    print(str(count[i-1] //3)+" %")
            else:  # once we reach this branch the background model has been built
                    temp = colq[i-1].dequeue()  # pop the oldest background frame from the queue
                    sumq[i-1] = sumq[i-1] + gray - temp  # add what changed between the popped background frame and the current frame
                    avg_temp = sumq[i-1]/nframe  # divide by nframe to turn the running sum into an average

            high_value_indices = avg_temp>255  # since differences keep being added, some pixels can exceed 255, which makes no sense for an image
            avg_temp[high_value_indices] = 255  # clip them to 255
            avg=avg_temp
            avg=avg.astype(np.uint8)  # cast back to uint8 so it is a valid image again
            cur_back = avg 
            if(flag == 0):
                    #buf_back[:] = 0 #no need as buf_back is initialized to be zero matrix
                    flag = 10  # avoids the edge case where nframe = 1
            if(flag == 10 and count[i-1] >= nframe):
                    buf_back[i-1] = cur_back  # as soon as nframe frames have been seen, freeze the current background into buf_back
                    flag = 20 
    
            sub = cv2.absdiff(cur_back,buf_back[i-1])  # absolute difference between the current background estimate and the frozen background

            img_show = cv2.resize(img,(400,400)) 
        #img_show = img_show.astype(int)
        #print(img_show.shape)
        #print(img_show)
        #time.sleep(1)
            cv2.imshow("img",img_show) # affichage de l'image originale 

            gray_show = cv2.resize(gray,(400,400)) 
        #gray_show = gray_show.astype(int)
            cv2.imshow("gray",gray_show)

        #print(cur_back)
            cur_back_show = cv2.resize(cur_back,(400,400))
        #cur_back_show = cur_back_show.astype(int)
        #print(cur_back_show)
            cv2.imshow("cur_back_show",cur_back_show)

            buf_back_show = cv2.resize(buf_back[i-1],(400,400))
        #buf_back_show = buf_back_show.astype(int)
            cv2.imshow("buf_back_show",buf_back_show)

            sub = cv2.resize(sub,(400,400))
        #sub=sub.astype(int)
            cv2.imshow("Abandoned Objects",sub) # affichage de l'ivolution de la distortion du back

            ret_s,sub_t = cv2.threshold(sub,50,255,0)  # binary threshold: pixels above 50 are set to 255, the rest to 0
            mask = np.zeros(gray.shape,np.uint8)

            louka, contours, hier = cv2.findContours(sub_t,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)
        #im2, contours, hier = cv2.findContours(sub_t,cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)
            count_5 += 1
            malek=0;
            for cnt in contours:  # contours is a list of objects; each element holds the pixel coordinates that form that object's contour
                    print(cv2.contourArea(cnt))
                    if 300<cv2.contourArea(cnt)<5000:  # if the contour area is between 300 and 5000
                            #cv2.drawContours(sub,[cnt],0,(0,255,0),2) 
                            cv2.drawContours(mask,[cnt],0,255,-1) 
                            M = cv2.moments(cnt)
                            cx,cy = int(M['m10']/M['m00']), int(M['m01']/M['m00'])  # locate the centroid (centre of mass) of the object
                            file_output = open('output.txt', 'w')
                            file_output.write('video num : ')
                            file_output.write(i_file)#<------------------------
                            file_output.write(' ====> an anomaly was detected at the following position: x=')
                            file_output.write(str(cx))
                            file_output.write(',  y=')
                            file_output.write(str(cy))
                            file_output.close()
                            cv2.circle(sub,(cx,cy),10,255,-1) # 
                            if(count_5<=5):
                                    arr = np.append(arr, [[cx,cy]], axis=0)  # record the object's position in the image


            #print(arr)
            #print(count_5)

            if(count_5==5):
                    count_5 = 0
                    #print (len(arr))
                    if (len(arr) == 0):  # no object detected
                            pass
                    else:
                            affin = AffinityPropagation()
                            affin.fit_predict(arr)
                            centroids = affin.cluster_centers_
                            labels = affin.labels_
                            max_label_index = labels.argmax()
                            biggest_clust_ind = labels[max_label_index]
                            print(labels)
                            print(centroids)
                      #  print("len_labels",len(labels),"biggest cluster's index", biggest_clust_ind, "len_centroids", len(centroids))
                            print("+++++++++++++++++++++++++++++++++++++",type(biggest_clust_ind),"++++++++++++++++++++++++++++++++++++++++++")
                            if( type(biggest_clust_ind)!=np.ndarray):
                                    biggest_clust_cent = centroids[biggest_clust_ind]
                                    cx = np.uint8(biggest_clust_cent[0])
                                    cy = np.uint8(biggest_clust_cent[1])
                                    cv2.rectangle(sub,(cx-15,cy-15),(cx+15,cy+15),(255,255,255),1)
                                    cv2.rectangle(img,(cx-7,cy-7),(cx+7,cy+7),(0,255,0),2)
                                    cv2.drawContours(sub, contours, -1, (0,255,0), 3)
                                #finallly reinitializing the np_array arr
                                    last_cent = cur_cent
                                    cur_cent = [cx,cy]
                                    cluster_centres_q = np.append(cluster_centres_q, [cur_cent], axis=0)
                                    print("++++++++++++++++++++++++++++++++", len(cluster_centres_q),"++++++++++++++++++++++++++++++++++++")
                            else:
                                    pass
                            arr=np.zeros(shape=(0, 2), dtype=np.uint8)
            dista=dist(cur_cent, last_cent)
       # print("distance b/w centroid of last & current frame",dista)

         #if(0 < dista < 25 ):
            if(cur_cent==last_cent and last_cent!=[0,0]):
                    cv2.rectangle(sub,(cx-15,cy-15),(cx+15,cy+15),(255,255,255),1)
                    cv2.rectangle(img,(cx-7,cy-7),(cx+7,cy+7),(0,255,0),2)
                    print("---------------------------------------------------------------------------")

            if(len(cluster_centres_q)>=1):
                    temp_a = deepcopy(cluster_centres_q[0])
                    temp_b = cluster_centres_q[-1]
                    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
                    if(1==1):
                            print ("warning, abandoned object detected")
                            print("maleek yraheb b karim")
                            if timeout==0:
                                timeout = time.time() + 10
                            else:
                                if time.time() > timeout:
                                    break
                            #font = cv2.InitFont(cv2.CV_FONT_HERSHEY_SIMPLEX, 1, 1, 0, 3, 8) #Creates a font
                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text_x = cx-10 #position of text
                            text_y = cy-20 #position of text
                            cv2.putText(sub,"Warning", (text_x,text_y),font,1, (255,255,255)) #Draw the text
                    cluster_centres_q = cluster_centres_q[1:]


            avg_show = cv2.resize(avg,(400,400))
#           cv2.imshow("avg",avg_show)
            cv2.imshow("Abandoned Objects",sub)
            if(move==0):
                    move=1
                    cv2.moveWindow("gray", 400,20)
                    cv2.moveWindow("img", 0,20)
                    cv2.moveWindow("cur_back_show", 800,20)
                    cv2.moveWindow("buf_back_show", 400,420)
                    cv2.moveWindow("Abandoned Objects", 20,220)
                    cv2.moveWindow("avg", 800,420)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
            ret, pure_img = cap.read()  # read the next frame into pure_img; ret is the boolean returned by read()
                

    cap.release()
    cv2.destroyAllWindows() 
    return