Ejemplo n.º 1
0
def test_all(n,dim):
  """Compare ``fc.linkage_vector`` with distance-matrix input on random data.

  Boolean vectors are clustered with single linkage under the boolean
  metrics, integer vectors under the real-valued metrics, and the integer
  vectors are finally clustered with the 'ward', 'centroid' and 'median'
  methods.  Every result is passed to ``test`` along with the method name
  and the condensed distances computed by ``pdist``.

  Args:
    n: number of observations to generate.
    dim: number of coordinates per observation.

  Raises:
    AssertionError: if ``fc.linkage_vector`` modified the boolean input.
  """
  method = 'single'

  # metrics for boolean vectors
  # randint excludes its upper bound, so (0, 2) draws from {0, 1}; the
  # builtin ``bool`` replaces the ``np.bool`` alias removed from NumPy.
  pcd = np.array(np.random.randint(0, 2, (n,dim)), dtype=bool)
  pcd2 = pcd.copy()
  for metric in ('hamming', 'jaccard', 'yule', 'matching', 'dice', #'kulsinski',
                 'rogerstanimoto',
                 #'sokalmichener',
                 # exclude, bug in older Scipy versions
                 # http://projects.scipy.org/scipy/ticket/1486
                 'russellrao', 'sokalsneath',
                 #'kulsinski'
                 # exclude, bug in older Scipy versions
                 # http://projects.scipy.org/scipy/ticket/1484
                 ):
    sys.stdout.write("Metric: " + metric + "...")
    D = pdist(pcd, metric)
    Z2 = fc.linkage_vector(pcd, method, metric)
    # The clustering routine must leave the caller's array untouched.
    if np.any(pcd2!=pcd):
      raise AssertionError('Input array was corrupted.', pcd)
    test(Z2, method, D)

  # metrics for real vectors
  # Truncate the bound to an integer explicitly; coordinates are drawn from
  # the closed range [-bound, bound], hence the "+ 1" on the exclusive end.
  bound = int(math.sqrt(n))
  pcd = np.random.randint(-bound, bound + 1, (n,dim))
  for metric in ['euclidean', 'sqeuclidean', 'cityblock', 'chebychev', 'minkowski',
                 'cosine', 'correlation', 'hamming', 'jaccard',
                 #'canberra',
                 # exclude, bug in older Scipy versions
                 # http://projects.scipy.org/scipy/ticket/1430
                 'braycurtis', 'seuclidean', 'mahalanobis',
                 'user']:
    sys.stdout.write("Metric: " + metric + "...")
    if metric=='minkowski':
      p = np.random.uniform(1.,10.)
      sys.stdout.write("p: " + str(p) + "...")
      # Pass the exponent by keyword: recent SciPy releases no longer accept
      # extra positional arguments after the metric name.
      D = pdist(pcd, metric, p=p)
      Z2 = fc.linkage_vector(pcd, method, metric, p)
    elif metric=='user':
      # Euclidean metric as a user function
      fn = (lambda u, v: np.sqrt(((u-v)*(u-v).T).sum()))
      D = pdist(pcd, fn)
      Z2 = fc.linkage_vector(pcd, method, fn)
    else:
      D = pdist(pcd, metric)
      Z2 = fc.linkage_vector(pcd, method, metric)
    test(Z2, method, D)

  # Remaining vector methods, checked against the default pdist metric.
  D = pdist(pcd)
  for method in ['ward', 'centroid', 'median']:
    Z2 = fc.linkage_vector(pcd, method)
    test(Z2, method, D)
Ejemplo n.º 2
0
    def linkage(self, title_clusters, method='ward'):
        """Return the fastcluster linkage matrix of the titles' word vectors.

        The vectors are read from ``title_clusters.word_vector``.  If an
        ``AttributeError`` is raised while doing so, the input is first
        passed through ``apply_word_embedings`` and the computation is
        repeated on the result.
        """

        def _linkage_of(frame):
            # First component of the first element of every word-vector entry.
            vectors = np.array([entry[0][0] for entry in frame.word_vector])
            return fastcluster.linkage_vector(vectors, method=method)

        try:
            return _linkage_of(title_clusters)
        except AttributeError:
            return _linkage_of(apply_word_embedings(title_clusters))
Ejemplo n.º 3
0
    def linkage(self, title_clusters, method='ward', linkage_matrix=None):
        """Return a linkage matrix, reusing ``linkage_matrix`` when given.

        A supplied ``linkage_matrix`` is stored on ``self.linkage_matrix``
        and returned unchanged.  Otherwise the matrix is computed with
        fastcluster from ``title_clusters.word_vector``; if that raises
        ``AttributeError`` the input is first run through
        ``apply_word_embedings`` with ``self.model_name``.
        """
        if linkage_matrix is not None:
            self.linkage_matrix = linkage_matrix
            return linkage_matrix

        def _linkage_of(frame):
            vectors = np.array([entry[0][0] for entry in frame.word_vector])
            return fastcluster.linkage_vector(vectors, method=method)

        try:
            return _linkage_of(title_clusters)
        except AttributeError:
            embedded = apply_word_embedings(title_clusters,
                                            model_name=self.model_name)
            return _linkage_of(embedded)
Ejemplo n.º 4
0
def cluster_finder(polygons):
    """Return a matrix Z as described in scipy.cluster.hierarchy.linkage.

    The polygons are clustered with single linkage; the distance between two
    observations is ``polygons[i].distance(polygons[j])``.

    Args:
        polygons: indexable sequence of objects exposing ``distance(other)``.
    """
    def distfunc(u, v):
        # Each observation is a one-element row holding a polygon index.
        # Index the element explicitly: calling int() on a one-element
        # ndarray is deprecated in recent NumPy releases.
        return polygons[int(u[0])].distance(polygons[int(v[0])])

    # One column of indices 0..len(polygons)-1; the metric maps them back
    # to the polygons themselves.
    X = np.arange(len(polygons))[:, np.newaxis]
    return fastcluster.linkage_vector(X, method='single', metric=distfunc)
Ejemplo n.º 5
0
def test():
    """Check that fastcluster raises ``FloatingPointError`` on NaN input.

    Three situations are exercised on randomly sized data: a condensed
    distance matrix containing one NaN, a distance matrix of infinities for
    which a NaN appears as an updated distance, and vector data containing
    one NaN coordinate.

    Returns:
        True when every call raised ``FloatingPointError``.

    Raises:
        AssertionError: if a call returned without detecting the NaN.
    """
    # randint excludes its upper bound: 2 <= n <= 100, as with the removed
    # np.random.random_integers(2, 100).
    n = np.random.randint(2, 101)

    # Part 1: distance matrix input

    N = n * (n - 1) // 2
    D = np.random.rand(N)
    # Insert a single NaN value
    pos = np.random.randint(N)
    D[pos] = np.nan

    for method in [
            'single', 'complete', 'average', 'weighted', 'ward', 'centroid',
            'median'
    ]:
        try:
            fastcluster.linkage(D, method=method)
            raise AssertionError('fastcluster did not detect a NaN value!')
        except FloatingPointError:
            pass

    # Next: the original array does not contain a NaN, but a NaN occurs
    # as an updated distance.
    for method in ['average', 'weighted', 'ward', 'centroid', 'median']:
        try:
            fastcluster.linkage([np.inf, -np.inf, -np.inf], method=method)
            raise AssertionError('fastcluster did not detect a NaN value!')
        except FloatingPointError:
            pass

    # Part 2: vector input

    # 2 <= dim <= 12, matching the former random_integers(2, 12).
    dim = np.random.randint(2, 13)
    X = np.random.rand(n, dim)
    pos = (np.random.randint(n), np.random.randint(dim))
    # Insert a single NaN coordinate
    X[pos] = np.nan

    for method in ['single', 'ward', 'centroid', 'median']:
        try:
            fastcluster.linkage_vector(X, method=method)
            raise AssertionError('fastcluster did not detect a NaN value!')
        except FloatingPointError:
            pass

    return True
Ejemplo n.º 6
0
def fast_hierarchy(feat, distance, hmethod='single', **kwargs):
    """Cluster ``feat`` hierarchically and cut the tree at ``distance``.

    The linkage is computed by ``fastcluster.linkage_vector`` with method
    ``hmethod``; flat labels come from scipy's ``fcluster`` using the
    'distance' criterion.  Extra keyword arguments are accepted and ignored.
    """
    import fastcluster
    from scipy.cluster import hierarchy

    tree = fastcluster.linkage_vector(feat, method=hmethod)
    return hierarchy.fcluster(tree, distance, criterion='distance')
Ejemplo n.º 7
0
def test():
    """Verify that fastcluster reports NaN values with FloatingPointError.

    Covers a condensed distance matrix with one NaN entry, a matrix of
    infinities whose updated distances become NaN, and vector data with one
    NaN coordinate.  Returns True if every call raised as expected and
    raises AssertionError otherwise.
    """
    n = np.random.randint(2,100)

    # Part 1: distance matrix input

    n_pairs = n*(n-1)//2
    dists = np.random.rand(n_pairs)
    # Poison one randomly chosen entry with NaN
    dists[np.random.randint(n_pairs)] = np.nan

    for name in ['single', 'complete', 'average', 'weighted', 'ward',
                 'centroid', 'median']:
        try:
            fastcluster.linkage(dists, method=name)
        except FloatingPointError:
            pass
        else:
            raise AssertionError('fastcluster did not detect a NaN value!')

    # Next: no NaN in the input itself, but one arises when the distances
    # are updated.
    for name in ['average', 'weighted', 'ward', 'centroid', 'median']:
        try:
            fastcluster.linkage([np.inf,-np.inf,-np.inf], method=name)
        except FloatingPointError:
            pass
        else:
            raise AssertionError('fastcluster did not detect a NaN value!')

    # Part 2: vector input

    dim = np.random.randint(2,13)
    points = np.random.rand(n,dim)
    row = np.random.randint(n)
    col = np.random.randint(dim)
    # Poison one randomly chosen coordinate with NaN
    points[(row, col)] = np.nan

    for name in ['single', 'ward', 'centroid', 'median']:
        try:
            fastcluster.linkage_vector(points, method=name)
        except FloatingPointError:
            pass
        else:
            raise AssertionError('fastcluster did not detect a NaN value!')

    return True
Ejemplo n.º 8
0
def cal_cophenetic(C):
	"""Return the cophenetic correlation coefficient of the observations C.

	C is clustered with ``fc.linkage_vector`` (default arguments); the
	coefficient is the correlation between the original pairwise distances
	and the cophenetic distances of the resulting tree.
	"""
	print("=== calculate cophenetic correlation coefficient ===")
	tree = fc.linkage_vector(C)  # hierarchical clustering of the raw data
	pairwise = fc.pdist(C)  # original distances between observations
	cophenetic = cophenet(tree)  # cophenetic distances between observations
	# Off-diagonal entry of the 2x2 correlation matrix.
	return np.corrcoef(pairwise, cophenetic)[0,1]
Ejemplo n.º 9
0
    def linkage(self, x):
        """Run hierarchical clustering on ``x`` and store the result.

        The linkage matrix is kept in ``self._Z``; nothing is returned.

        :Parameters:
          x : 2d array_like object (N, P)
             vector data, N observations in R^P
        """
        linkage_matrix = fastcluster.linkage_vector(
            X=x, method=self._method, metric='euclidean', extraarg=None)
        self._Z = linkage_matrix
Ejemplo n.º 10
0
    def linkage(self, x):
        """Cluster the observations in ``x`` and remember the linkage matrix.

        Uses the method configured in ``self._method`` with the euclidean
        metric and stores the outcome in ``self._Z``.

        :Parameters:
          x : 2d array_like object (N, P)
             vector data, N observations in R^P
        """
        options = dict(X=x, method=self._method, metric='euclidean',
                       extraarg=None)
        self._Z = fastcluster.linkage_vector(**options)
def product_finder_fasttext(data_, k1= 30,topn_=4000, min_value_ = 0.95,expected_density = 1.1, sparse=False, clustering_algorithm = 'agglomerative'):   
    """Group rows of ``data_`` into products by clustering their word vectors.

    Two strategies are available via ``clustering_algorithm``:

    * ``'community'``: build a cosine-similarity matrix, turn its non-zero
      entries into a graph and label rows with multilevel communities.
    * ``'agglomerative'``: run ``fastcluster.linkage_vector`` on the vectors
      and cut the tree with ``fcluster`` at distance ``1 - min_value_``.

    The labels are stored in a ``product_id`` column and a per-product
    vector is stored in ``product_word_vector``.

    Returns a dict holding the clustered frame under ``'clustered_data'``
    plus ``'sim_matrix_density'`` (community) or ``'linkage_matrix'``
    (agglomerative).

    NOTE(review): ``sparse`` is never read; the similarity call below passes
    the literal ``sparse= False``.
    NOTE(review): any other ``clustering_algorithm`` value leaves
    ``cluster_labels`` unassigned, so the ``assign`` call raises NameError.
    """
    
    if clustering_algorithm == 'community':
        print('creating  cv_matrix')
        data = [i[0][0] for i in data_['word_vector']]
        cv_matrix = csr_matrix(np.array(data))
        print('calculating similarity matrix')
        s = time.time()
        cosine_sim = pairwise_cosine_sparse_sim(cv_matrix, topn = topn_, min_value=min_value_,expected_density = expected_density, sparse= False)
        print (str(time.time()-s)+'s for cosine similarity computing')
        s = time.time()
        print('generating similarities graph')
        # Every non-zero similarity becomes an edge between its row and column.
        sources, targets = cosine_sim.nonzero()
        g = Graph(list(zip(sources.tolist(), targets.tolist())))
        print (str(time.time()-s)+'s for graph generation')
        s= time.time()
        print('creating graph communities')
        # Edge weights grow exponentially with similarity, scaled by k1.
        clusters= g.community_multilevel(weights = np.exp(k1*cosine_sim.data))
        cluster_labels = clusters.membership
        Z = None
    if clustering_algorithm == 'agglomerative':        
        data = np.array([i[0][0] for i in data_['word_vector']])
        # NOTE(review): cv_matrix is not used in this branch because the
        # similarity computation below is commented out.
        cv_matrix = csr_matrix(np.array(data))
        print('calculating similarity matrix')
        s = time.time()
        #cosine_sim = pairwise_cosine_sparse_sim(cv_matrix, topn = cv_matrix.shape[0], min_value=min_value_,expected_density = expected_density, sparse= False)
        print (str(time.time()-s)+'s for cosine similarity computing')
        
        Z = fastcluster.linkage_vector(np.array(data))
        cluster_labels = scipy.cluster.hierarchy.fcluster(Z, 1-min_value_, criterion='distance', depth=2, R=None, monocrit=None)
        
    
    data_=data_.assign(product_id = cluster_labels)
    
    # One representative vector per product: the single member's vector, or
    # the average over all members.
    list_of_labels = list(set(data_['product_id']))
    product_word_vector={}
    for i in list_of_labels:
        by_column_dic_i = data_[data_.product_id == i]
        word_vectors = [vector[0] for vector in by_column_dic_i['word_vector']]
        if len(word_vectors) == 1:   
            product_word_vector[i] = word_vectors
        else:
            product_word_vector[i] = [np.average(np.array(word_vectors),axis=0)]
    
    data_= data_.assign(product_word_vector = 0)
    for i in product_word_vector.keys():
        # The one-element list is repeated once per member row so that every
        # row of the product receives the same vector.
        data_[data_.product_id == i] = data_[data_.product_id == i].assign(product_word_vector =  len(data_[data_.product_id == i])*product_word_vector[i])

    data_ = data_[['ad_title','ad_title_corpus','ad_id','product_id','word_vector','product_word_vector']]
    # NOTE(review): ``s`` was last reset before clustering started, so this
    # figure also covers the post-processing above.
    print (str(time.time()-s)+'s for clusters computation')
    
    if clustering_algorithm == 'community':
        return {'clustered_data':data_ , 'sim_matrix_density': cosine_sim.size/(cosine_sim.shape[0]*cosine_sim.shape[1])}
    if clustering_algorithm == 'agglomerative':
        return {'clustered_data':data_ , 'linkage_matrix': Z}
Ejemplo n.º 12
0
def fast_cluster(array, method, metric):
    """Compute a linkage for ``array`` with fastcluster.

    The vector routine ``linkage_vector`` handles single linkage with any
    metric and centroid/median/ward linkage with the euclidean metric; all
    other combinations go through ``fastcluster.linkage``.
    """
    import fastcluster

    if method == 'single':
        use_vector = True
    else:
        use_vector = (metric == 'euclidean'
                      and method in ('centroid', 'median', 'ward'))
    cluster = fastcluster.linkage_vector if use_vector else fastcluster.linkage
    return cluster(array, method=method, metric=metric)
Ejemplo n.º 13
0
def _calculate_linkage_fastcluster(array, metric='euclidean', method='single'):
    """Return the fastcluster linkage of ``array``.

    ``fastcluster.linkage_vector`` is chosen for single linkage (any metric)
    and for centroid/median/ward linkage with the euclidean metric; every
    other combination is delegated to ``fastcluster.linkage``.
    """
    needs_euclidean = ('centroid', 'median', 'ward')
    vector_ok = (metric == 'euclidean' and method in needs_euclidean) \
        or method == 'single'
    if not vector_ok:
        return fastcluster.linkage(array, method=method, metric=metric)
    return fastcluster.linkage_vector(array, method=method, metric=metric)
Ejemplo n.º 14
0
    def _calculate_linkage_fastcluster(self):
        """Compute the linkage of ``self.array`` with fastcluster.

        The vector routine is used for single linkage with any metric and
        for centroid/median/ward linkage with the euclidean metric.  In all
        other cases the condensed distances are computed with ``pdist`` and
        passed to ``fastcluster.linkage``.
        """
        import fastcluster

        euclidean_only = ("centroid", "median", "ward")
        vectorisable = (
            self.metric == "euclidean" and self.method in euclidean_only
        ) or self.method == "single"
        if vectorisable:
            return fastcluster.linkage_vector(
                self.array, method=self.method, metric=self.metric)

        # General case: cluster the condensed pairwise distances.
        condensed = distance.pdist(self.array, metric=self.metric)
        return fastcluster.linkage(condensed, method=self.method)
Ejemplo n.º 15
0
 def _calculate_linkage_fastcluster(self):
     """Compute the linkage of ``self.array`` with fastcluster.

     ``fastcluster.linkage_vector`` is used for single linkage with any
     metric and for centroid/median/ward linkage with the euclidean
     metric; everything else is delegated to ``fastcluster.linkage``.
     """
     import fastcluster

     if self.metric == "euclidean" and self.method in ("centroid", "median",
                                                       "ward"):
         cluster = fastcluster.linkage_vector
     elif self.method == "single":
         cluster = fastcluster.linkage_vector
     else:
         cluster = fastcluster.linkage
     return cluster(self.array, method=self.method, metric=self.metric)
Ejemplo n.º 16
0
 def _calculate_linkage_fastcluster(self):
     """Return the fastcluster linkage for ``self.array``.

     The vector implementation is selected when the method is 'single', or
     when the metric is euclidean and the method is one of centroid, median
     or ward; otherwise ``fastcluster.linkage`` is called.
     """
     import fastcluster

     call_args = dict(method=self.method, metric=self.metric)
     euclidean_vector = (self.metric == 'euclidean'
                         and self.method in ('centroid', 'median', 'ward'))
     if not (euclidean_vector or self.method == 'single'):
         return fastcluster.linkage(self.array, **call_args)
     return fastcluster.linkage_vector(self.array, **call_args)
Ejemplo n.º 17
0
    def test_custom_linkage(self):
        """A precomputed linkage passed via ``linkage`` is used unchanged.

        The expected linkage comes from fastcluster when it can be imported
        and from scipy otherwise; the plotter must expose that same linkage
        and the dendrogram derived from it.
        """
        try:
            import fastcluster

            expected_linkage = fastcluster.linkage_vector(
                self.x_norm, method="single", metric="euclidean")
        except ImportError:
            # fastcluster is optional; fall back to scipy.
            expected_linkage = hierarchy.linkage(
                distance.pdist(self.x_norm, metric="euclidean"),
                method="single")

        expected_dendrogram = hierarchy.dendrogram(
            expected_linkage, no_plot=True, color_list=["k"],
            color_threshold=-np.inf)

        plot_kws = self.default_kws.copy()
        plot_kws["linkage"] = expected_linkage
        plotter = mat._DendrogramPlotter(self.df_norm, **plot_kws)

        npt.assert_array_equal(plotter.linkage, expected_linkage)
        nt.assert_dict_equal(plotter.dendrogram, expected_dendrogram)
def applyHierarchicalClustering(X, n_clusters):
    """Cluster the rows of ``X`` with Ward linkage and return string labels.

    The tree is built by ``fastcluster.linkage_vector`` (ward, euclidean).
    ``find_distance_thres`` supplies the cut distance for ``n_clusters`` and
    ``fcluster`` turns it into flat labels.

    Args:
        X: observations; its ``index`` is reused for the result.
        n_clusters: value forwarded to ``find_distance_thres``.

    Returns:
        DataFrame indexed like ``X`` with one column ``'cluster'`` holding
        the labels as strings.
    """
    Z = fastcluster.linkage_vector(X, method='ward', metric='euclidean')

    # The former DataFrame copy of Z was never read, so it is no longer built.
    distance = find_distance_thres(n_clusters, Z, X)

    clusters = fcluster(Z, distance, criterion='distance')
    clusters = pd.DataFrame(data=clusters, index=X.index, columns=['cluster'])
    print("Number of distinct clusters: ", len(clusters['cluster'].unique()))

    # cluster number from int to string
    clusters['cluster'] = clusters['cluster'].apply(str)

    return clusters
Ejemplo n.º 19
0
 def _calculate_linkage_fastcluster(self):
     """Compute the linkage of ``self.array`` with fastcluster.

     Single linkage (any metric) and euclidean centroid/median/ward linkage
     go through ``fastcluster.linkage_vector``.  Otherwise the condensed
     distances from ``pdist`` are clustered with ``fastcluster.linkage``.
     """
     import fastcluster

     euclidean_only = ('centroid', 'median', 'ward')
     if (self.metric == 'euclidean' and self.method in euclidean_only) \
             or self.method == 'single':
         return fastcluster.linkage_vector(self.array,
                                           method=self.method,
                                           metric=self.metric)

     # No vector routine for this combination: go via the distance matrix.
     condensed = distance.pdist(self.array, metric=self.metric)
     return fastcluster.linkage(condensed, method=self.method)
Ejemplo n.º 20
0
    def test_custom_linkage(self):
        """The plotter must reuse a linkage supplied through ``linkage``.

        fastcluster computes the reference linkage when available, scipy
        otherwise; both the linkage and the dendrogram built from it are
        compared with what the plotter exposes.
        """
        try:
            import fastcluster

            reference = fastcluster.linkage_vector(
                self.x_norm, method='single', metric='euclidean')
        except ImportError:
            # Optional dependency missing: compute the same linkage with scipy.
            pairwise = distance.pdist(self.x_norm, metric='euclidean')
            reference = hierarchy.linkage(pairwise, method='single')

        reference_dendrogram = hierarchy.dendrogram(
            reference, no_plot=True, color_threshold=-np.inf)

        options = self.default_kws.copy()
        options['linkage'] = reference
        plotter = mat._DendrogramPlotter(self.df_norm, **options)

        npt.assert_array_equal(plotter.linkage, reference)
        nt.assert_dict_equal(plotter.dendrogram, reference_dendrogram)
Ejemplo n.º 21
0
def agglom_cluster(down, nclusters):
    """Performs agglomerative clustering on downsampled data

    as per paper, this is single linkage, L1 distance metric

    Args:
        down: downsampled observations to cluster.
        nclusters: maximum number of flat clusters ('maxclust' criterion).

    Returns:
        The flat cluster labels produced by ``fcluster``.
    """

    # see http://www.jstatsoft.org/v53/i09/paper for details on fastcluster
    # by Daniel Müllner, out of Carlsson's group

    # NOTE: `linkage_vector` with the built-in 'cityblock' metric avoids
    # forming the distance matrix explicitly.  Passing a Python callable as
    # the metric would also avoid it, but the overhead of calling back into
    # Python is too much.  If the vector routine fails, fall back to forming
    # the distance matrix and passing it to the linkage function.
    try:
        Z = linkage_vector(down, method = 'single', metric = 'cityblock')
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit still propagate.
        dist = pdist(down, metric = 'minkowski', p = 1)
        # The distance matrix is a temporary, so it may be overwritten.
        Z = linkage(dist, method = 'single', preserve_input = False)
    return fcluster(Z, nclusters, criterion = 'maxclust')
Ejemplo n.º 22
0
def showModelPerformanceHierarchical(X_train, y_train):
    """Cluster ``X_train`` with Ward linkage and print accuracy statistics.

    The tree is cut at the distance returned by
    ``find_hierarchical_clustering_distance_threshold`` for 23 clusters.
    The labels are printed, plotted with ``showClusterDistribution`` and
    scored against ``y_train`` by ``analyzeCluster``.
    """
    tree = fastcluster.linkage_vector(X_train, method='ward',
                                      metric='euclidean')

    # the result is 174
    threshold = find_hierarchical_clustering_distance_threshold(
        23, tree, X_train)

    labels = fcluster(tree, threshold, criterion='distance')
    labelled = pd.DataFrame(data=labels, index=X_train.index,
                            columns=['cluster'])
    print(labelled)
    n_distinct = len(labelled['cluster'].unique())
    print("Number of distinct clusters: ", n_distinct)

    showClusterDistribution(X_train, labels, 6, 'Evaporation', 'Rainfall')

    # analyzeCluster yields six values; only the last two are reported.
    (_by_cluster, _by_label, _most_freq, _accuracy_df,
     overall_accuracy, accuracy_by_label) = analyzeCluster(labelled, y_train)
    print("Accuracy by cluster from hierarchical clustering: \n",
          accuracy_by_label)
    print("Overall accuracy from hierarchical clustering: ",
          overall_accuracy)
    print("Standard deviation from hierarchical clustering: ",
          accuracy_by_label.std())
Ejemplo n.º 23
0
  def test_random_cluster(self):
    """Compare ``cluster_points_centroid`` with fastcluster on random points.

    For 1000 seeded random point sets the reference clustering (centroid
    linkage cut at distance 2.5) must match ``cluster_points_centroid``.
    The time spent in each implementation is accumulated along the way.
    """
    np.random.seed(1337)
    rounds = 1000
    cutoff = 2.5

    reference_seconds = 0.
    candidate_seconds = 0.

    for _ in range(rounds):
      count = int(np.random.uniform(2, 32))
      # Columns are drawn in the order x, y, vrel.
      columns = [
        np.random.uniform(-10, 50, (count, 1)),
        np.random.uniform(-5, 5, (count, 1)),
        np.random.uniform(-5, 5, (count, 1)),
      ]
      pts = np.hstack(columns)

      started = time.time()
      reference_tree = linkage_vector(pts, method='centroid')
      reference_idx = fcluster(reference_tree, cutoff, criterion='distance')
      reference_seconds += time.time() - started

      started = time.time()
      candidate_idx = cluster_points_centroid(pts, cutoff)
      candidate_seconds += time.time() - started

      self.assertTrue(same_clusters(reference_idx, candidate_idx))
Ejemplo n.º 24
0
    def handle_unlabeled(self,
                         data,
                         max_product_id,
                         clustering_algorithm='agglomerative'):
        """Embed unlabeled rows and group them into products.

        ``data`` is run through ``apply_word_embedings`` with
        ``self.model_name``.  With ``'agglomerative'`` the word vectors are
        clustered by Ward linkage and the ``fcluster`` labels (threshold
        0.2) are stored as ``product_id``.  With ``'community'`` the
        grouping is delegated to ``self.graph_communities``.
        """
        # NOTE(review): no return statement is visible in this snippet and
        # ``max_product_id`` is not read -- confirm against the full source.
        unknown_products = apply_word_embedings(data,
                                                model_name=self.model_name)

        if clustering_algorithm == 'agglomerative':
            vectors = np.array(
                [entry[0][0] for entry in unknown_products.word_vector])
            tree = fastcluster.linkage_vector(vectors, method='ward')
            labels = Cluster.hierarchy.fcluster(tree, 0.2)
            unknown_products = unknown_products.assign(product_id=labels)

        elif clustering_algorithm == 'community':
            community_options = dict(min_value_=0.8,
                                     topn_=400,
                                     k1=50,
                                     expected_density=0.1,
                                     graph_communities_df=None)
            unknown_products = self.graph_communities(unknown_products,
                                                      **community_options)
Ejemplo n.º 25
0
def radard_thread(gctx=None):
  """Run the radar daemon loop.

  Each iteration reads radar points from the car-specific RadarInterface,
  merges the latest 'live100' and 'model' messages, maintains one Track per
  radar point, clusters the tracks and publishes a 'live20' message (lead
  clusters) and a 'liveTracks' message (all tracks).

  The loop is ``while 1`` with no break, so the function does not return.
  ``gctx`` is accepted but not used in this function.

  NOTE(review): this is Python 2 code (print statements, xrange, indexing
  the result of dict.keys()).
  """
  set_realtime_priority(2)

  # wait for stats about the car to come in from controls
  cloudlog.info("radard is waiting for CarParams")
  CP = car.CarParams.from_bytes(Params().get("CarParams", block=True))
  mocked= CP.radarName == "mock"
  VM = VehicleModel(CP)
  cloudlog.info("radard got CarParams")

  # import the radar from the fingerprint
  cloudlog.info("radard is importing %s", CP.radarName)
  # NOTE(review): the import statement is assembled from CP.radarName and
  # executed with exec -- confirm the stored CarParams are trusted.
  exec('from selfdrive.radar.'+CP.radarName+'.interface import RadarInterface')

  context = zmq.Context()

  # *** subscribe to features and model from visiond
  poller = zmq.Poller()
  model = messaging.sub_sock(context, service_list['model'].port, conflate=True, poller=poller)
  live100 = messaging.sub_sock(context, service_list['live100'].port, conflate=True, poller=poller)

  PP = PathPlanner()
  RI = RadarInterface()

  last_md_ts = 0
  last_l100_ts = 0

  # *** publish live20 and liveTracks
  live20 = messaging.pub_sock(context, service_list['live20'].port)
  liveTracks = messaging.pub_sock(context, service_list['liveTracks'].port)

  path_x = np.arange(0.0, 140.0, 0.1)    # 140 meters is max

  # Time-alignment
  rate = 20.   # model and radar are both at 20Hz
  tsv = 1./rate
  v_len = 20         # how many speed data points to remember for t alignment with rdr data

  active = 0
  steer_angle = 0.
  steer_override = False

  tracks = defaultdict(dict)

  # Kalman filter stuff:
  ekfv = EKFV1D()
  speedSensorV = SimpleSensor(XV, 1, 2)

  # v_ego
  v_ego = None
  # Row 0 holds ego speeds, row 1 the matching timestamps (frame / rate).
  v_ego_array = np.zeros([2, v_len])
  v_ego_t_aligned = 0.

  rk = Ratekeeper(rate, print_delay_threshold=np.inf)
  while 1:
    rr = RI.update()

    # Radar points of this cycle keyed by track id:
    # [dRel shifted by RDR_TO_LDR, yRel, vRel, measured]
    ar_pts = {}
    for pt in rr.points:
      ar_pts[pt.trackId] = [pt.dRel + RDR_TO_LDR, pt.yRel, pt.vRel, pt.measured]

    # receive the live100s
    l100 = None
    md = None

    # Zero timeout: only take messages that are already waiting.
    for socket, event in poller.poll(0):
      if socket is live100:
        l100 = messaging.recv_one(socket)
      elif socket is model:
        md = messaging.recv_one(socket)

    if l100 is not None:
      active = l100.live100.active
      v_ego = l100.live100.vEgo
      steer_angle = l100.live100.angleSteers
      steer_override = l100.live100.steerOverride

      # Append the newest sample and drop the oldest, keeping v_len columns.
      v_ego_array = np.append(v_ego_array, [[v_ego], [float(rk.frame)/rate]], 1)
      v_ego_array = v_ego_array[:, 1:]

      last_l100_ts = l100.logMonoTime

    # Nothing can be computed until a first ego speed has been received.
    if v_ego is None:
      continue

    if md is not None:
      last_md_ts = md.logMonoTime

    # *** get path prediction from the model ***
    PP.update(v_ego, md)

    # run kalman filter only if prob is high enough
    if PP.lead_prob > 0.7:
      ekfv.update(speedSensorV.read(PP.lead_dist, covar=PP.lead_var))
      ekfv.predict(tsv)
      ar_pts[VISION_POINT] = (float(ekfv.state[XV]), np.polyval(PP.d_poly, float(ekfv.state[XV])),
                              float(ekfv.state[SPEEDV]), False)
    else:
      # Low lead probability: reset the filter to the raw vision estimate
      # and drop any vision point.
      ekfv.state[XV] = PP.lead_dist
      ekfv.covar = (np.diag([PP.lead_var, ekfv.var_init]))
      ekfv.state[SPEEDV] = 0.
      if VISION_POINT in ar_pts:
        del ar_pts[VISION_POINT]

    # *** compute the likely path_y ***
    if (active and not steer_override) or mocked:
      # use path from model (always when mocking as steering is too noisy)
      path_y = np.polyval(PP.d_poly, path_x)
    else:
      # use path from steer, set angle_offset to 0 it does not only report the physical offset
      path_y = calc_lookahead_offset(v_ego, steer_angle, path_x, VM, angle_offset=0)[0]

    # *** remove missing points from meta data ***
    # Popping while looping is safe only because Python 2 keys() returns a
    # list copy.
    for ids in tracks.keys():
      if ids not in ar_pts:
        tracks.pop(ids, None)

    # *** compute the tracks ***
    for ids in ar_pts:
      # ignore standalone vision point, unless we are mocking the radar
      if ids == VISION_POINT and not mocked:
        continue
      rpt = ar_pts[ids]

      # align v_ego by a fixed time to align it with the radar measurement
      cur_time = float(rk.frame)/rate
      v_ego_t_aligned = np.interp(cur_time - RI.delay, v_ego_array[1], v_ego_array[0])
      # Distance from the point to the nearest sample of the path.
      d_path = np.sqrt(np.amin((path_x - rpt[0]) ** 2 + (path_y - rpt[1]) ** 2))
      # add sign
      d_path *= np.sign(rpt[1] - np.interp(rpt[0], path_x, path_y))

      # create the track if it doesn't exist or it's a new track
      if ids not in tracks:
        tracks[ids] = Track()
      tracks[ids].update(rpt[0], rpt[1], rpt[2], d_path, v_ego_t_aligned, rpt[3], steer_override)

    # allow the vision model to remove the stationary flag if distance and rel speed roughly match
    if VISION_POINT in ar_pts:
      fused_id = None
      best_score = NO_FUSION_SCORE
      for ids in tracks:
        # Weighted distance to the vision point: the dRel difference is
        # halved and the yRel difference doubled.
        dist_to_vision = np.sqrt((0.5*(ar_pts[VISION_POINT][0] - tracks[ids].dRel)) ** 2 + (2*(ar_pts[VISION_POINT][1] - tracks[ids].yRel)) ** 2)
        rel_speed_diff = abs(ar_pts[VISION_POINT][2] - tracks[ids].vRel)
        tracks[ids].update_vision_score(dist_to_vision, rel_speed_diff)
        # The track with the lowest vision score below NO_FUSION_SCORE wins.
        if best_score > tracks[ids].vision_score:
          fused_id = ids
          best_score = tracks[ids].vision_score

      if fused_id is not None:
        tracks[fused_id].vision_cnt += 1
        tracks[fused_id].update_vision_fusion()

    if DEBUG:
      print "NEW CYCLE"
      if VISION_POINT in ar_pts:
        print "vision", ar_pts[VISION_POINT]

    idens = tracks.keys()
    track_pts = np.array([tracks[iden].get_key_for_cluster() for iden in idens])

    # If we have multiple points, cluster them
    if len(track_pts) > 1:
      link = linkage_vector(track_pts, method='centroid')
      cluster_idxs = fcluster(link, 2.5, criterion='distance')
      clusters = [None]*max(cluster_idxs)

      for idx in xrange(len(track_pts)):
        # The labels are used as 1-based, hence the shift to a list index.
        cluster_i = cluster_idxs[idx]-1

        if clusters[cluster_i] == None:
          clusters[cluster_i] = Cluster()
        clusters[cluster_i].add(tracks[idens[idx]])
    elif len(track_pts) == 1:
      # TODO: why do we need this?
      clusters = [Cluster()]
      clusters[0].add(tracks[idens[0]])
    else:
      clusters = []

    if DEBUG:
      for i in clusters:
        print i
    # *** extract the lead car ***
    lead_clusters = [c for c in clusters
                     if c.is_potential_lead(v_ego)]
    lead_clusters.sort(key=lambda x: x.dRel)
    lead_len = len(lead_clusters)

    # *** extract the second lead from the whole set of leads ***
    lead2_clusters = [c for c in lead_clusters
                      if c.is_potential_lead2(lead_clusters)]
    lead2_clusters.sort(key=lambda x: x.dRel)
    lead2_len = len(lead2_clusters)

    # *** publish live20 ***
    dat = messaging.new_message()
    dat.init('live20')
    dat.live20.mdMonoTime = last_md_ts
    dat.live20.canMonoTimes = list(rr.canMonoTimes)
    dat.live20.radarErrors = list(rr.errors)
    dat.live20.l100MonoTime = last_l100_ts
    # Clusters are sorted by dRel, so index 0 is the one with the smallest dRel.
    if lead_len > 0:
      lead_clusters[0].toLive20(dat.live20.leadOne)
      if lead2_len > 0:
        lead2_clusters[0].toLive20(dat.live20.leadTwo)
      else:
        dat.live20.leadTwo.status = False
    else:
      dat.live20.leadOne.status = False

    dat.live20.cumLagMs = -rk.remaining*1000.
    live20.send(dat.to_bytes())

    # *** publish tracks for UI debugging (keep last) ***
    dat = messaging.new_message()
    dat.init('liveTracks', len(tracks))

    for cnt, ids in enumerate(tracks.keys()):
      if DEBUG:
        print "id: %4.0f x:  %4.1f  y: %4.1f  vr: %4.1f d: %4.1f  va: %4.1f  vl: %4.1f  vlk: %4.1f alk: %4.1f  s: %1.0f" % \
          (ids, tracks[ids].dRel, tracks[ids].yRel, tracks[ids].vRel,
           tracks[ids].dPath, tracks[ids].vLat,
           tracks[ids].vLead, tracks[ids].vLeadK,
           tracks[ids].aLeadK,
           tracks[ids].stationary)
      dat.liveTracks[cnt].trackId = ids
      dat.liveTracks[cnt].dRel = float(tracks[ids].dRel)
      dat.liveTracks[cnt].yRel = float(tracks[ids].yRel)
      dat.liveTracks[cnt].vRel = float(tracks[ids].vRel)
      dat.liveTracks[cnt].aRel = float(tracks[ids].aRel)
      dat.liveTracks[cnt].stationary = tracks[ids].stationary
      dat.liveTracks[cnt].oncoming = tracks[ids].oncoming
    liveTracks.send(dat.to_bytes())

    rk.monitor_time()
Ejemplo n.º 26
0
def radard_thread(gctx=None):
  """Run the radar daemon loop: fuse radar points with the vision lead and publish leads.

  Each iteration reads the radar interface, the latest ``live100`` and
  ``model`` messages, updates the per-point ``Track`` objects, clusters
  them and publishes a ``liveTracks`` message followed by a ``live20``
  message.  The function never returns.

  Args:
    gctx: unused in this function; kept for interface compatibility.
  """
  import importlib

  set_realtime_priority(1)

  # wait for stats about the car to come in from controls
  cloudlog.info("radard is waiting for CarParams")
  CP = car.CarParams.from_bytes(Params().get("CarParams", block=True))
  cloudlog.info("radard got CarParams")

  # import the radar from the fingerprint; a real import instead of exec() on
  # a string built from CarParams
  cloudlog.info("radard is importing %s", CP.radarName)
  RadarInterface = importlib.import_module(
    'selfdrive.radar.' + CP.radarName + '.interface').RadarInterface

  context = zmq.Context()

  # *** subscribe to features and model from visiond
  model = messaging.sub_sock(context, service_list['model'].port)
  live100 = messaging.sub_sock(context, service_list['live100'].port)

  PP = PathPlanner()
  RI = RadarInterface()

  last_md_ts = 0
  last_l100_ts = 0

  # *** publish live20 and liveTracks
  live20 = messaging.pub_sock(context, service_list['live20'].port)
  liveTracks = messaging.pub_sock(context, service_list['liveTracks'].port)

  path_x = np.arange(0.0, 140.0, 0.1)    # 140 meters is max

  # Time-alignment
  rate = 20.   # model and radar are both at 20Hz
  tsv = 1./rate
  rdr_delay = 0.10   # radar data delay in s
  v_len = 20         # how many speed data points to remember for t alignment with rdr data

  enabled = 0
  steer_angle = 0.

  tracks = defaultdict(dict)

  # Kalman filter stuff:
  ekfv = EKFV1D()
  speedSensorV = SimpleSensor(XV, 1, 2)

  # v_ego history: row 0 holds speeds, row 1 the matching timestamps
  v_ego = None
  v_ego_array = np.zeros([2, v_len])

  rk = Ratekeeper(rate, print_delay_threshold=np.inf)
  while 1:
    rr = RI.update()

    # per-point layout: [dRel, yRel, vRel, aRel, <unused>, new-track flag, <unused>]
    ar_pts = {}
    for pt in rr.points:
      ar_pts[pt.trackId] = [pt.dRel + RDR_TO_LDR, pt.yRel, pt.vRel, pt.aRel, None, False, None]

    # receive the live100s
    l100 = messaging.recv_sock(live100)
    if l100 is not None:
      enabled = l100.live100.enabled
      v_ego = l100.live100.vEgo
      steer_angle = l100.live100.angleSteers

      # push the newest sample and drop the oldest to keep v_len columns
      v_ego_array = np.append(v_ego_array, [[v_ego], [float(rk.frame)/rate]], 1)
      v_ego_array = v_ego_array[:, 1:]

      last_l100_ts = l100.logMonoTime

    # nothing can be computed until the first ego speed has arrived
    if v_ego is None:
      continue

    md = messaging.recv_sock(model)
    if md is not None:
      last_md_ts = md.logMonoTime

    # *** get path prediction from the model ***
    PP.update(sec_since_boot(), v_ego, md)

    # run kalman filter only if prob is high enough
    if PP.lead_prob > 0.7:
      ekfv.update(speedSensorV.read(PP.lead_dist, covar=PP.lead_var))
      ekfv.predict(tsv)
      ar_pts[VISION_POINT] = (float(ekfv.state[XV]), np.polyval(PP.d_poly, float(ekfv.state[XV])),
                              float(ekfv.state[SPEEDV]), np.nan, last_md_ts, np.nan, sec_since_boot())
    else:
      # reset the filter to the raw vision measurement and drop the vision point
      ekfv.state[XV] = PP.lead_dist
      ekfv.covar = (np.diag([PP.lead_var, ekfv.var_init]))
      ekfv.state[SPEEDV] = 0.
      ar_pts.pop(VISION_POINT, None)

    # *** compute the likely path_y ***
    if enabled:    # use path from model path_poly
      path_y = np.polyval(PP.d_poly, path_x)
    else:          # use path from steer, set angle_offset to 0 since calibration does not exactly report the physical offset
      path_y = calc_lookahead_offset(v_ego, steer_angle, path_x, CP, angle_offset=0)[0]

    # *** remove missing points from meta data ***
    # iterate over a snapshot of the keys because entries are removed in the loop
    for ids in list(tracks.keys()):
      if ids not in ar_pts:
        tracks.pop(ids, None)

    # align v_ego by a fixed time to align it with the radar measurement
    # (identical for every point, so computed once per iteration)
    cur_time = float(rk.frame)/rate
    v_ego_t_aligned = np.interp(cur_time - rdr_delay, v_ego_array[1], v_ego_array[0])

    # *** compute the tracks ***
    for ids in ar_pts:
      # use either only the radar points or only the vision point
      if ids == VISION_POINT and not VISION_ONLY:
        continue
      elif ids != VISION_POINT and VISION_ONLY:
        continue
      rpt = ar_pts[ids]

      # distance from the point to the closest sample of the predicted path
      d_path = np.sqrt(np.amin((path_x - rpt[0]) ** 2 + (path_y - rpt[1]) ** 2))

      # create the track if it doesn't exist or it's a new track
      if ids not in tracks or rpt[5] == 1:
        tracks[ids] = Track()
      tracks[ids].update(rpt[0], rpt[1], rpt[2], d_path, v_ego_t_aligned)

      # allow the vision model to remove the stationary flag if distance and rel speed roughly match
      if VISION_POINT in ar_pts:
        dist_to_vision = np.sqrt((0.5*(ar_pts[VISION_POINT][0] - rpt[0])) ** 2 + (2*(ar_pts[VISION_POINT][1] - rpt[1])) ** 2)
        rel_speed_diff = abs(ar_pts[VISION_POINT][2] - rpt[2])
        tracks[ids].mix_vision(dist_to_vision, rel_speed_diff)

    # snapshot of the track ids, reused for publishing and clustering
    idens = list(tracks.keys())

    # publish tracks (debugging)
    dat = messaging.new_message()
    dat.init('liveTracks', len(tracks))
    for cnt, ids in enumerate(idens):
      track = tracks[ids]
      dat.liveTracks[cnt].trackId = ids
      dat.liveTracks[cnt].dRel = float(track.dRel)
      dat.liveTracks[cnt].yRel = float(track.yRel)
      dat.liveTracks[cnt].vRel = float(track.vRel)
      dat.liveTracks[cnt].aRel = float(track.aRel)
      dat.liveTracks[cnt].stationary = track.stationary
      dat.liveTracks[cnt].oncoming = track.oncoming
    liveTracks.send(dat.to_bytes())

    track_pts = np.array([tracks[iden].get_key_for_cluster() for iden in idens])

    # If we have multiple points, cluster them
    if len(track_pts) > 1:
      link = linkage_vector(track_pts, method='centroid')
      cluster_idxs = fcluster(link, 2.5, criterion='distance')
      clusters = [None]*max(cluster_idxs)

      # cluster labels are 1-based; create each Cluster lazily on first use
      for iden, cluster_idx in zip(idens, cluster_idxs):
        cluster_i = cluster_idx - 1
        if clusters[cluster_i] is None:
          clusters[cluster_i] = Cluster()
        clusters[cluster_i].add(tracks[iden])
    elif len(track_pts) == 1:
      # a single track is wrapped in its own cluster without running the linkage
      # NOTE(review): presumably because the linkage needs at least two observations - confirm
      clusters = [Cluster()]
      clusters[0].add(tracks[idens[0]])
    else:
      clusters = []

    # *** extract the lead car ***
    lead_clusters = [c for c in clusters
                     if c.is_potential_lead(v_ego)]
    lead_clusters.sort(key=lambda x: x.dRel)
    lead_len = len(lead_clusters)

    # *** extract the second lead from the whole set of leads ***
    lead2_clusters = [c for c in lead_clusters
                      if c.is_potential_lead2(lead_clusters)]
    lead2_clusters.sort(key=lambda x: x.dRel)
    lead2_len = len(lead2_clusters)

    # *** publish live20 ***
    dat = messaging.new_message()
    dat.init('live20')
    dat.live20.mdMonoTime = last_md_ts
    dat.live20.canMonoTimes = list(rr.canMonoTimes)
    dat.live20.l100MonoTime = last_l100_ts
    if lead_len > 0:
      lead_clusters[0].toLive20(dat.live20.leadOne)
      if lead2_len > 0:
        lead2_clusters[0].toLive20(dat.live20.leadTwo)
      else:
        dat.live20.leadTwo.status = False
    else:
      dat.live20.leadOne.status = False

    dat.live20.cumLagMs = -rk.remaining*1000.
    live20.send(dat.to_bytes())

    rk.monitor_time()
Ejemplo n.º 27
0
# Remove sparse columns (the variance-based drop below is currently disabled)
print(X_train.shape)
# drop_columns = X_train.columns[X_train.var(axis='index') <= 2]#  use 0.5 when filtering by variance
# X_train = X_train.drop(drop_columns, axis=1)
# print(X_train.shape)

# Keep only the first 880 rows
X_train = X_train[:880]
print(X_train.shape)

# Normalization (currently disabled)
# X_train = normalization(X_train)

# Linkage methods to run; only "ward" is enabled
methods = ["ward"]
# methods = ["ward", "single", "centroid", "median"]
for method in methods:
    Z = fastcluster.linkage_vector(X_train, method=method, metric="euclidean")

    Z_dataFrame = pd.DataFrame(
        data=Z,
        columns=["clusterOne", "clusterTwo", "distance", "newClusterSize"])

    # print(Z_dataFrame[:10])

    BINARY_SEARCH = 0
    LINER_SEARCH = 1
    if BINARY_SEARCH:
        # クラスター数を要素数の半分とした時の閾値を決めて評価
        (
            distance_threshold,
            clusters,
            X_train_hierClustered,
Ejemplo n.º 28
0
  def test_scipy_clustering(self):
    """Check centroid linkage and distance-based flat clusters against the stored references."""
    reference_linkage = linkage_vector(TRACK_PTS, method='centroid')
    reference_labels = fcluster(reference_linkage, 2.5, criterion='distance')

    # linkage matrix first, then the flat cluster labels cut at distance 2.5
    for actual, expected in ((reference_linkage, CORRECT_LINK),
                             (reference_labels, CORRECT_LABELS)):
      np.testing.assert_allclose(actual, expected)
def heatmap(x, row_header, column_header, row_method,
            column_method, row_metric, column_metric,
            color_gradient,
            filename,
            other_data=None,
            log=False, trad=False,
            level_row=0.4, level_column=0.5,
            folder=os.getcwd(),
            range_normalization=(-2,2), colorbar_ticks=[-2, 0, 2],
            colorbar_ticklabels=[r'$ <\mu-2 \sigma$', r'$\mu$', r'$> \mu+2 \sigma$'], colorbar_title='Feature range',
            title=None,
            save=False,
            show=True):
    """Cluster rows and/or columns of ``x`` hierarchically and draw a heatmap with dendrograms.

    The code is based in large part on these prototype methods:
    http://old.nabble.com/How-to-plot-heatmap-with-matplotlib--td32534593.html
    http://stackoverflow.com/questions/7664826/how-to-get-flat-clustering-corresponding-to-color-clusters-in-the-dendrogram-cre

    This version is meant to work with "big data" (starting with m=50,000):
    the linkage is computed with ``fastcluster.linkage_vector`` (see
    http://danifold.net/fastcluster.html) so that the full distance matrix
    is not stored in memory.

    Args:
        x: m by n ndarray (m rows/observations, n columns).
        row_header: labels of the rows; every label is converted to ``str``.
        column_header: labels of the columns; every label is converted to ``str``.
        row_method: linkage method for the rows, or None to skip row clustering.
        column_method: linkage method for the columns, or None to skip
            column clustering.
            NOTE(review): per the fastcluster documentation ``linkage_vector``
            accepts 'single', 'centroid', 'median' and 'ward' - confirm.
        row_metric: metric for the row linkage; None uses the library default.
        column_metric: metric for the column linkage; None uses the library
            default.
        color_gradient: one of 'red_white_blue', 'red_black_sky', 'OrRd',
            'red_black_blue', 'red_black_green', 'yellow_black_blue',
            'seismic', 'green_white_purple', 'coolwarm', 'YlOrRd'.
        filename: base name; used in full for the pickle file and truncated
            to its first 10 characters for the PDF.
        other_data: optional array drawn instead of ``x``.  Its rows are
            reordered like those of ``x``; if ``x`` is (n_row, n_columns)
            it should be (n_row, m_col).
        log: unused; kept for interface compatibility.
        trad: unused; kept for interface compatibility.
        level_row: fraction of the largest row merge distance used as the
            threshold for the flat row clusters.
        level_column: same as ``level_row`` for the columns.
        folder: directory where the PDF is written.
        range_normalization: (low, high) range of the color normalization;
            a negative ``low`` is replaced by 0 when ``x`` has no negative value.
        colorbar_ticks: tick positions of the color legend.
        colorbar_ticklabels: tick labels of the color legend.
        colorbar_title: title of the color legend.
        title: optional x-label of the heatmap axes.
        save: if true, pickle ``[row clusters, column clusters]`` to
            ``Flat_clusters_<filename>_<level_row>.pkl`` in the working directory.
        show: if true, call ``pylab.show()`` after saving the figure.

    Returns:
        numpy array of the flat row-cluster labels in the original row order
        (an array of 'NA' strings when ``row_method`` is None).

    Raises:
        ValueError: if ``color_gradient`` is not one of the supported names.
    """
    # Single print with the same text the former two print statements produced together
    print("\nPerforming hiearchical clustering using %s for columns and %s for rows %s %s"
          % (column_metric, row_metric, level_column, level_row))
    if numpy.any(numpy.isnan(x)):
        sys.stderr.write("WARNING, there are NaN values in the data. Hence distances with data elements that have NaN values will have value NaN, which might perturb the hierarchical clustering.")

    # for export: make sure every label is a plain string (also safe for empty headers)
    row_header = [str(el) for el in row_header]
    column_header = [str(el) for el in column_header]

    ### Define the color gradient to use based on the provided name
    if color_gradient == 'red_white_blue':
        cmap = pylab.cm.bwr
    elif color_gradient == 'red_black_sky':
        cmap = RedBlackSkyBlue()
    elif color_gradient == 'OrRd':
        cmap = pylab.cm.OrRd
    elif color_gradient == 'red_black_blue':
        cmap = RedBlackBlue()
    elif color_gradient == 'red_black_green':
        cmap = RedBlackGreen()
    elif color_gradient == 'yellow_black_blue':
        cmap = YellowBlackBlue()
    elif color_gradient == 'seismic':
        cmap = pylab.cm.seismic
    elif color_gradient == 'green_white_purple':
        cmap = pylab.cm.PiYG_r
    elif color_gradient == 'coolwarm':
        cmap = pylab.cm.coolwarm
    elif color_gradient == 'YlOrRd':
        cmap = pylab.cm.YlOrRd
    else:
        # fail before the expensive clustering instead of a NameError at plot time
        raise ValueError('Unknown color_gradient: %r' % (color_gradient,))

    ### Color normalization: do not waste the negative range on data without negative values
    norm_low, norm_high = range_normalization[0], range_normalization[1]
    if norm_low < 0 and not numpy.any(x < 0):
        norm_low = 0
    norm = mpl.colors.Normalize(norm_low, norm_high)

    ### Scale the Matplotlib window size
    default_window_hight = 8.5
    default_window_width = 12
    fig = pylab.figure(figsize=(default_window_width,default_window_hight)) ### could use m,n to scale here
    color_bar_w = 0.015 ### Sufficient size to show

    ## calculate positions for all elements
    # ax1, placement of dendrogram 1, on the left of the heatmap
    [ax1_x, ax1_y, ax1_w, ax1_h] = [0.05,0.22,0.2,0.6]   ### The second value controls the position of the matrix relative to the bottom of the view
    width_between_ax1_axr = 0.004
    height_between_ax1_axc = 0.004 ### distance between the top color bar axis and the matrix

    # axr, placement of row side colorbar
    axr_w = color_bar_w ### width of the side color bar - 0.015 when showing
    axr_x = ax1_x + ax1_w + width_between_ax1_axr
    axr_y = ax1_y; axr_h = ax1_h
    width_between_axr_axm = 0.004

    # axc, placement of column side colorbar
    axc_w = 0.5
    axc_h = color_bar_w ### hight of the top color bar - 0.015 when showing
    axc_x = axr_x + axr_w + width_between_axr_axm
    axc_y = ax1_y + ax1_h + height_between_ax1_axc
    height_between_axc_ax2 = 0.004

    # axm, placement of heatmap for the data matrix
    axm_x = axr_x + axr_w + width_between_axr_axm
    axm_y = ax1_y; axm_h = ax1_h
    axm_w = axc_w

    # ax2, placement of dendrogram 2, on the top of the heatmap
    ax2_h = 0.15 ### controls hight of the dendrogram
    ax2_x = axr_x + axr_w + width_between_axr_axm
    ax2_y = ax1_y + ax1_h + height_between_ax1_axc + axc_h + height_between_axc_ax2
    ax2_w = axc_w

    # axcb - placement of the color legend
    [axcb_x, axcb_y, axcb_w, axcb_h] = [0.07,0.88,0.18,0.04]

    # Compute and plot top dendrogram
    if column_method is not None:
        start_time = time.time()
        ax2 = fig.add_axes([ax2_x, ax2_y, ax2_w, ax2_h], frame_on=True)

        # only forward the metric when one was given, as is done for the rows
        column_kwargs = {} if column_metric is None else {'metric': column_metric}
        Y2 = fastcluster.linkage_vector(x.T, method=column_method, **column_kwargs)
        Z2 = sch.dendrogram(Y2)
        ind2 = sch.fcluster(Y2,level_column*max(Y2[:,2]),'distance') ### This is the default behavior of dendrogram
        ax2.set_xticks([]) ### Hides ticks
        ax2.set_yticks([])
        time_diff = str(round(time.time()-start_time,1))
        print('Column clustering completed in %s seconds' % time_diff)
    else:
        ind2 = ['NA']*len(column_header) ### Used for exporting the flat cluster data

    # Compute and plot left dendrogram.
    if row_method is not None:
        start_time = time.time()
        ax1 = fig.add_axes([ax1_x, ax1_y, ax1_w, ax1_h], frame_on=True) # frame_on may be False
        row_kwargs = {} if row_metric is None else {'metric': row_metric}
        Y1 = fastcluster.linkage_vector(x, method=row_method, **row_kwargs)
        Z1 = sch.dendrogram(Y1, orientation='right')
        ind1 = sch.fcluster(Y1,level_row*max(Y1[:,2]),'distance') ### This is the default behavior of dendrogram
        ax1.set_xticks([]) ### Hides ticks
        ax1.set_yticks([])
        time_diff = str(round(time.time()-start_time,1))
        print('Row clustering completed in %s seconds' % time_diff)
    else:
        ind1 = ['NA']*len(row_header) ### Used for exporting the flat cluster data

    if save:
        flat_clusters_path = 'Flat_clusters_{}_{}.pkl'.format(filename, level_row)
        print('Saving flat clusters in %s' % flat_clusters_path)
        # binary mode for pickle; the context manager closes the file even on error
        with open(flat_clusters_path, 'wb') as f:
            pickle.dump([ind1, ind2], f)

    # keep the labels in the original row order before they are reordered for plotting
    ind1_to_return = np.array(ind1)

    # Plot distance matrix.
    axm = fig.add_axes([axm_x, axm_y, axm_w, axm_h])  # axes for the data matrix
    xt = x
    if column_method is not None:
        idx2 = Z2['leaves'] ### apply the clustering for the array-dendrograms to the actual matrix data
        xt = xt[:,idx2]
        ind2 = ind2[idx2] ### reorder the flat cluster to match the order of the leaves the dendrogram
    if row_method is not None:
        idx1 = Z1['leaves'] ### apply the clustering for the gene-dendrograms to the actual matrix data
        xt = xt[idx1,:]   # xt is transformed x
        if other_data is not None:
            # NOTE(review): only the rows of other_data are reordered, never its columns
            other_data = other_data[idx1,:]

        ind1 = ind1[idx1] ### reorder the flat cluster to match the order of the leaves the dendrogram
    ### taken from http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python/3011894#3011894
    shown = xt if other_data is None else other_data
    im = axm.matshow(shown, aspect='auto', origin='lower', cmap=cmap, norm=norm) ### norm=norm added to scale coloring of expression with zero = white or black
    axm.set_xticks([]) ### Hides x-ticks
    axm.set_yticks([])

    # Add text
    if len(row_header) < 200: ### Don't write the row labels when there are 200 rows or more
        label_x = x.shape[1]-0.5
        for i in range(x.shape[0]):
            if row_method is not None:
                row_text = '  {}'.format(row_header[idx1[i]])
            else:
                row_text = ' {}'.format(row_header[i]) ### When not clustering rows
            axm.text(label_x, i, row_text, fontsize=6)

    # NOTE(review): with other_data the labels still come from column_header / idx2,
    # so other_data is assumed not to have more columns than x - confirm with callers
    column_decider = x if other_data is None else other_data
    if len(column_header) < 200:
        for i in range(column_decider.shape[1]):
            if column_method is not None:
                column_label = column_header[idx2[i]]
            else: ### When not clustering columns
                column_label = column_header[i]
            axm.text(i, -0.9, '{}'.format(column_label), rotation=270, verticalalignment="top", fontsize=6) # rotation could also be degrees

    # Plot colside colors
    # axc --> axes for column side colorbar
    if column_method is not None:
        column_counts = np.bincount(ind2)
        print('Number of clusters for columns  %s' % (column_counts,))
        axc = fig.add_axes([axc_x, axc_y, axc_w, axc_h])  # axes for column side colorbar
        # getting a gradient colormap for the column side colorbar
        cm = ColorMap()
        cr = cm.makeColorRamp(256, ["#FFFF00", "#FF0000"])
        # loop variable deliberately not named x: under Python 2 it would overwrite the parameter
        degrade = [cm.getColorFromMap(k, cr, 0, 10) for k in range(len(column_counts))]
        cmap_c = mpl.colors.ListedColormap(degrade)

        dc = numpy.array(ind2, dtype=int)
        dc.shape = (1,len(ind2))
        im_c = axc.matshow(dc, aspect='auto', origin='lower', cmap=cmap_c)
        axc.set_xticks([]) ### Hides ticks
        axc.set_yticks([])

    # Plot rowside colors
    # axr --> axes for row side colorbar
    if row_method is not None:
        print('Number of clusters for rows  %s' % (np.bincount(ind1),))
        axr = fig.add_axes([axr_x, axr_y, axr_w, axr_h])  # axes for row side colorbar
        dr = numpy.array(ind1, dtype=int)
        dr.shape = (len(ind1),1)
        # rainbow colormap for row side colorbar
        cmap_r = mpl.cm.gist_rainbow

        im_r = axr.matshow(dr, aspect='auto', origin='lower', cmap=cmap_r)
        axr.set_xticks([]) ### Hides ticks
        axr.set_yticks([])

    # Plot color legend
    axcb = fig.add_axes([axcb_x, axcb_y, axcb_w, axcb_h], frame_on=False)  # axes for colorbar
    axcb.set_title(colorbar_title, fontsize=15)
    cb = mpl.colorbar.ColorbarBase(axcb, cmap=cmap,norm=norm, orientation='horizontal',
                                   ticks=colorbar_ticks)
    cb.ax.set_xticklabels(colorbar_ticklabels, fontsize=15)

    # only the first 10 characters of the base name go into the PDF name
    filename = '%s/Clust_%s_%s_%s.pdf' % (folder, filename[:10],column_method,row_method)

    ### Render the graphic
    pylab.rcParams['font.size'] = 15
    if title is not None:
        axm.set_xlabel(title)
    pylab.savefig(filename)
    print('Exporting: %s' % filename)
    if show:
        pylab.show()
    return ind1_to_return
Ejemplo n.º 30
0
class TestClustermap:

    # Random state seeded from the characters of "clustermap"
    rs = np.random.RandomState(sum(ord(char) for char in "clustermap"))

    # 4 x 8 random matrix shifted by a per-column, then a per-row offset
    x_norm = rs.randn(4, 8) + np.arange(8)
    x_norm = (x_norm.T + np.arange(4)).T
    letters = pd.Series(list("ABCDEFGH"), name="letters")

    df_norm = pd.DataFrame(x_norm, columns=letters)

    # Keyword arguments shared by the tests
    default_kws = {
        "pivot_kws": None,
        "z_score": None,
        "standard_scale": None,
        "figsize": (10, 10),
        "row_colors": None,
        "col_colors": None,
        "dendrogram_ratio": .2,
        "colors_ratio": .03,
        "cbar_pos": (0, .8, .05, .2),
    }

    # Keyword arguments passed to ``plot`` by the tests
    default_plot_kws = {
        "metric": 'euclidean',
        "method": 'average',
        "colorbar_kws": None,
        "row_cluster": True,
        "col_cluster": True,
        "row_linkage": None,
        "col_linkage": None,
        "tree_kws": None,
    }

    # One color per row / per column of the fixture frame
    row_colors = color_palette('Set2', df_norm.shape[0])
    col_colors = color_palette('Dark2', df_norm.shape[1])

    if not _no_scipy:
        # Single linkage of the columns, via fastcluster when it is available
        if not _no_fastcluster:
            x_norm_linkage = fastcluster.linkage_vector(x_norm.T,
                                                        metric='euclidean',
                                                        method='single')
        else:
            x_norm_distances = distance.pdist(x_norm.T, metric='euclidean')
            x_norm_linkage = hierarchy.linkage(x_norm_distances,
                                               method='single')

        x_norm_dendrogram = hierarchy.dendrogram(x_norm_linkage,
                                                 no_plot=True,
                                                 color_threshold=-np.inf)
        x_norm_leaves = x_norm_dendrogram['leaves']
        df_norm_leaves = np.asarray(df_norm.columns[x_norm_leaves])

    def test_ndarray_input(self):
        """An ndarray input is stored as a DataFrame; four axes, no color axes."""
        grid = mat.ClusterGrid(self.x_norm, **self.default_kws)
        expected_frame = pd.DataFrame(self.x_norm)
        pdt.assert_frame_equal(grid.data, expected_frame)
        n_axes = len(grid.fig.axes)
        assert n_axes == 4
        assert grid.ax_row_colors is None
        assert grid.ax_col_colors is None

    def test_df_input(self):
        """A DataFrame input is kept equal in ``ClusterGrid.data``."""
        frame = self.df_norm
        grid = mat.ClusterGrid(frame, **self.default_kws)
        pdt.assert_frame_equal(grid.data, frame)

    def test_corr_df_input(self):
        """After plotting a correlation matrix the diagonal of ``data2d`` is all ones."""
        correlations = self.df_norm.corr()
        grid = mat.ClusterGrid(correlations, **self.default_kws)
        grid.plot(**self.default_plot_kws)
        diagonal_index = np.diag_indices_from(grid.data2d)
        diagonal = grid.data2d.values[diagonal_index]
        expected = np.ones(grid.data2d.shape[0])
        npt.assert_array_almost_equal(diagonal, expected)

    def test_pivot_input(self):
        """Long-form data plus ``pivot_kws`` is pivoted back to the wide frame."""
        wide = self.df_norm.copy()
        wide.index.name = 'numbers'
        long_form = pd.melt(wide.reset_index(),
                            var_name='letters',
                            id_vars='numbers')
        pivot_spec = {'index': 'numbers',
                      'columns': 'letters',
                      'values': 'value'}
        kwargs = dict(self.default_kws, pivot_kws=pivot_spec)
        grid = mat.ClusterGrid(long_form, **kwargs)

        pdt.assert_frame_equal(grid.data2d, wide)

    def test_colors_input(self):
        """Row and column colors are stored unchanged and add two axes (six in total)."""
        kwargs = dict(self.default_kws,
                      row_colors=self.row_colors,
                      col_colors=self.col_colors)

        grid = mat.ClusterGrid(self.df_norm, **kwargs)
        npt.assert_array_equal(grid.row_colors, self.row_colors)
        npt.assert_array_equal(grid.col_colors, self.col_colors)

        n_axes = len(grid.fig.axes)
        assert n_axes == 6

    def test_categorical_colors_input(self):
        """Categorical color Series are converted to RGB tuples by the grid."""
        categorical_rows = pd.Series(self.row_colors, dtype="category")
        categorical_cols = pd.Series(self.col_colors,
                                     dtype="category",
                                     index=self.df_norm.columns)

        kwargs = dict(self.default_kws,
                      row_colors=categorical_rows,
                      col_colors=categorical_cols)

        # expected values: every entry mapped through matplotlib's to_rgb
        expected_rows = [mpl.colors.to_rgb(color) for color in categorical_rows]
        expected_cols = [mpl.colors.to_rgb(color) for color in categorical_cols]

        grid = mat.ClusterGrid(self.df_norm, **kwargs)
        npt.assert_array_equal(grid.row_colors, expected_rows)
        npt.assert_array_equal(grid.col_colors, expected_cols)

        n_axes = len(grid.fig.axes)
        assert n_axes == 6

    def test_nested_colors_input(self):
        """Nested (two-level) color lists are stored as given; six axes are created."""
        nested_rows = [self.row_colors, self.row_colors]
        nested_cols = [self.col_colors, self.col_colors]
        kwargs = dict(self.default_kws,
                      row_colors=nested_rows,
                      col_colors=nested_cols)

        grid = mat.ClusterGrid(self.df_norm, **kwargs)
        npt.assert_array_equal(grid.row_colors, nested_rows)
        npt.assert_array_equal(grid.col_colors, nested_cols)

        n_axes = len(grid.fig.axes)
        assert n_axes == 6

    def test_colors_input_custom_cmap(self):
        """Row/column colors are kept when ``clustermap`` also gets a custom cmap."""
        kwargs = dict(self.default_kws,
                      cmap=mpl.cm.PRGn,
                      row_colors=self.row_colors,
                      col_colors=self.col_colors)

        grid = mat.clustermap(self.df_norm, **kwargs)
        npt.assert_array_equal(grid.row_colors, self.row_colors)
        npt.assert_array_equal(grid.col_colors, self.col_colors)

        n_axes = len(grid.fig.axes)
        assert n_axes == 6

    def test_z_score(self):
        """``z_score=1`` matches subtracting ``mean()`` and dividing by ``std()`` of the frame."""
        frame = self.df_norm.copy()
        expected = (frame - frame.mean()) / frame.std()
        kwargs = dict(self.default_kws, z_score=1)

        grid = mat.ClusterGrid(self.df_norm, **kwargs)
        pdt.assert_frame_equal(grid.data2d, expected)

    def test_z_score_axis0(self):
        """``z_score=0`` matches the same standardization applied to the transposed frame."""
        transposed = self.df_norm.copy().T
        standardized = (transposed - transposed.mean()) / transposed.std()
        expected = standardized.T
        kwargs = dict(self.default_kws, z_score=0)

        grid = mat.ClusterGrid(self.df_norm, **kwargs)
        pdt.assert_frame_equal(grid.data2d, expected)

    def test_standard_scale(self):
        """``standard_scale=1`` matches min-max scaling with the frame's ``min()``/``max()``."""
        frame = self.df_norm.copy()
        lowest = frame.min()
        expected = (frame - lowest) / (frame.max() - lowest)
        kwargs = dict(self.default_kws, standard_scale=1)

        grid = mat.ClusterGrid(self.df_norm, **kwargs)
        pdt.assert_frame_equal(grid.data2d, expected)

    def test_standard_scale_axis0(self):
        """``standard_scale=0`` matches min-max scaling applied to the transposed frame."""
        transposed = self.df_norm.copy().T
        lowest = transposed.min()
        scaled = (transposed - lowest) / (transposed.max() - lowest)
        expected = scaled.T
        kwargs = dict(self.default_kws, standard_scale=0)

        grid = mat.ClusterGrid(self.df_norm, **kwargs)
        pdt.assert_frame_equal(grid.data2d, expected)

    def test_z_score_standard_scale(self):
        """Requesting both ``z_score`` and ``standard_scale`` raises ValueError."""
        conflicting = dict(self.default_kws, z_score=True, standard_scale=True)
        with pytest.raises(ValueError):
            mat.ClusterGrid(self.df_norm, **conflicting)

    def test_color_list_to_matrix_and_cmap(self):
        """With ``axis=0`` each matrix row maps back to the color of its leaf."""
        # Note this uses the attribute named col_colors but tests row colors
        leaves = self.x_norm_leaves
        matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
            self.col_colors, leaves, axis=0)

        for position, leaf in enumerate(leaves):
            expected_color = self.col_colors[leaf]
            assert_colors_equal(cmap(matrix[position, 0]), expected_color)

    def test_nested_color_list_to_matrix_and_cmap(self):
        """With nested color lists and ``axis=0`` every level gets its own matrix column."""
        # Note this uses the attribute named col_colors but tests row colors
        color_levels = [self.col_colors, self.col_colors[::-1]]
        leaves = self.x_norm_leaves
        matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
            color_levels, leaves, axis=0)

        for position, leaf in enumerate(leaves):
            for level, level_colors in enumerate(color_levels):
                expected_color = level_colors[leaf]
                assert_colors_equal(cmap(matrix[position, level]),
                                    expected_color)

    def test_color_list_to_matrix_and_cmap_axis1(self):
        """With ``axis=1`` each matrix column maps back to the color of its leaf."""
        leaves = self.x_norm_leaves
        matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
            self.col_colors, leaves, axis=1)

        for position, leaf in enumerate(leaves):
            expected_color = self.col_colors[leaf]
            assert_colors_equal(cmap(matrix[0, position]), expected_color)

    def test_color_list_to_matrix_and_cmap_different_sizes(self):
        """Color levels of different lengths are rejected with ValueError."""
        mismatched_levels = [self.col_colors, self.col_colors * 2]
        with pytest.raises(ValueError):
            matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
                mismatched_levels, self.x_norm_leaves, axis=1)

    def test_savefig(self):
        """Plot a cluster grid and save it as PNG into a temporary file."""
        # Not sure if this is the right way to test....
        cg = mat.ClusterGrid(self.df_norm, **self.default_kws)
        cg.plot(**self.default_plot_kws)
        # the context manager closes (and thereby removes) the temporary file
        # instead of leaving it open until garbage collection
        with tempfile.NamedTemporaryFile() as tmp_file:
            cg.savefig(tmp_file, format='png')

    def test_plot_dendrograms(self):
        """Dendrogram axes hold one path per coordinate; ``data2d`` follows the leaf order."""
        grid = mat.clustermap(self.df_norm, **self.default_kws)

        row_paths = grid.ax_row_dendrogram.collections[0].get_paths()
        assert len(row_paths) == len(grid.dendrogram_row.independent_coord)
        col_paths = grid.ax_col_dendrogram.collections[0].get_paths()
        assert len(col_paths) == len(grid.dendrogram_col.independent_coord)

        # the plotted frame is the input reordered by both dendrograms
        row_order = grid.dendrogram_row.reordered_ind
        col_order = grid.dendrogram_col.reordered_ind
        expected = self.df_norm.iloc[row_order, col_order]
        pdt.assert_frame_equal(grid.data2d, expected)

    def test_cluster_false(self):
        """With clustering disabled the dendrogram axes are empty and the data is unchanged."""
        kwargs = dict(self.default_kws, row_cluster=False, col_cluster=False)

        grid = mat.clustermap(self.df_norm, **kwargs)
        dendrogram_axes = (grid.ax_row_dendrogram, grid.ax_col_dendrogram)

        # no dendrogram lines are drawn on either axis
        for axis in dendrogram_axes:
            assert len(axis.lines) == 0

        # and neither axis has ticks
        for axis in dendrogram_axes:
            assert len(axis.get_xticks()) == 0
            assert len(axis.get_yticks()) == 0

        pdt.assert_frame_equal(grid.data2d, self.df_norm)

    def test_row_col_colors(self):
        """Row and column color axes each hold exactly one collection."""
        kws = dict(self.default_kws,
                   row_colors=self.row_colors,
                   col_colors=self.col_colors)

        grid = mat.clustermap(self.df_norm, **kws)

        for colors_ax in (grid.ax_row_colors, grid.ax_col_colors):
            assert len(colors_ax.collections) == 1

    def test_cluster_false_row_col_colors(self):
        """Clustering off plus color annotations: empty trees, colors drawn."""
        kws = dict(self.default_kws,
                   row_cluster=False,
                   col_cluster=False,
                   row_colors=self.row_colors,
                   col_colors=self.col_colors)

        grid = mat.clustermap(self.df_norm, **kws)
        dendrogram_axes = (grid.ax_row_dendrogram, grid.ax_col_dendrogram)

        for dendro_ax in dendrogram_axes:
            assert len(dendro_ax.lines) == 0

        for dendro_ax in dendrogram_axes:
            assert len(dendro_ax.get_xticks()) == 0
            assert len(dendro_ax.get_yticks()) == 0

        # The color annotations are still rendered, one collection each.
        for colors_ax in (grid.ax_row_colors, grid.ax_col_colors):
            assert len(colors_ax.collections) == 1

        pdt.assert_frame_equal(grid.data2d, self.df_norm)

    def test_row_col_colors_df(self):
        """DataFrame color annotations expose their column names as labels."""
        row_names = ['row_1', 'row_2']
        col_names = ['col_1', 'col_2']

        kws = self.default_kws.copy()
        kws['row_colors'] = pd.DataFrame(
            {name: list(self.row_colors) for name in row_names},
            index=self.df_norm.index,
            columns=row_names)
        kws['col_colors'] = pd.DataFrame(
            {name: list(self.col_colors) for name in col_names},
            index=self.df_norm.columns,
            columns=col_names)

        grid = mat.clustermap(self.df_norm, **kws)

        # Row annotations are labelled along the x axis of their axes.
        shown_row_labels = [
            tick.get_text() for tick in grid.ax_row_colors.get_xticklabels()
        ]
        assert grid.row_color_labels == ['row_1', 'row_2']
        assert shown_row_labels == grid.row_color_labels

        # Column annotations are labelled along the y axis of their axes.
        shown_col_labels = [
            tick.get_text() for tick in grid.ax_col_colors.get_yticklabels()
        ]
        assert grid.col_color_labels == ['col_1', 'col_2']
        assert shown_col_labels == grid.col_color_labels

    def test_row_col_colors_df_shuffled(self):
        """DataFrame colors given in a different order are matched by label."""
        n_rows, n_cols = self.df_norm.shape

        # Even positions first, then odd positions.
        row_order = list(range(0, n_rows, 2)) + list(range(1, n_rows, 2))
        col_order = list(range(0, n_cols, 2)) + list(range(1, n_cols, 2))
        shuffled_inds = [self.df_norm.index[pos] for pos in row_order]
        shuffled_cols = [self.df_norm.columns[pos] for pos in col_order]

        kws = self.default_kws.copy()

        row_frame = pd.DataFrame({'row_annot': list(self.row_colors)},
                                 index=self.df_norm.index)
        kws['row_colors'] = row_frame.loc[shuffled_inds]

        col_frame = pd.DataFrame({'col_annot': list(self.col_colors)},
                                 index=self.df_norm.columns)
        kws['col_colors'] = col_frame.loc[shuffled_cols]

        grid = mat.clustermap(self.df_norm, **kws)

        # The stored colors must come back in the original order.
        assert list(grid.col_colors)[0] == list(self.col_colors)
        assert list(grid.row_colors)[0] == list(self.row_colors)

    def test_row_col_colors_df_missing(self):
        """A label missing from DataFrame colors is expected to come back white."""
        white = (1.0, 1.0, 1.0)
        first_row = self.df_norm.index[0]
        first_col = self.df_norm.columns[0]

        kws = self.default_kws.copy()

        row_frame = pd.DataFrame({'row_annot': list(self.row_colors)},
                                 index=self.df_norm.index)
        kws['row_colors'] = row_frame.drop(first_row)

        col_frame = pd.DataFrame({'col_annot': list(self.col_colors)},
                                 index=self.df_norm.columns)
        kws['col_colors'] = col_frame.drop(first_col)

        grid = mat.clustermap(self.df_norm, **kws)

        # The dropped first entry is expected as white, the rest unchanged.
        assert list(grid.col_colors)[0] == [white] + list(self.col_colors[1:])
        assert list(grid.row_colors)[0] == [white] + list(self.row_colors[1:])

    def test_row_col_colors_df_one_axis(self):
        """DataFrame color annotations work when given for one axis only."""
        row_names = ['row_1', 'row_2']
        col_names = ['col_1', 'col_2']

        # Rows only.
        row_kws = self.default_kws.copy()
        row_kws['row_colors'] = pd.DataFrame(
            {name: list(self.row_colors) for name in row_names},
            index=self.df_norm.index,
            columns=row_names)

        row_grid = mat.clustermap(self.df_norm, **row_kws)

        shown_row_labels = [
            tick.get_text()
            for tick in row_grid.ax_row_colors.get_xticklabels()
        ]
        assert row_grid.row_color_labels == ['row_1', 'row_2']
        assert shown_row_labels == row_grid.row_color_labels

        # Columns only.
        col_kws = self.default_kws.copy()
        col_kws['col_colors'] = pd.DataFrame(
            {name: list(self.col_colors) for name in col_names},
            index=self.df_norm.columns,
            columns=col_names)

        col_grid = mat.clustermap(self.df_norm, **col_kws)

        shown_col_labels = [
            tick.get_text()
            for tick in col_grid.ax_col_colors.get_yticklabels()
        ]
        assert col_grid.col_color_labels == ['col_1', 'col_2']
        assert shown_col_labels == col_grid.col_color_labels

    def test_row_col_colors_series(self):
        """Series color annotations expose the Series name as their label."""
        row_series = pd.Series(list(self.row_colors),
                               name='row_annot',
                               index=self.df_norm.index)
        col_series = pd.Series(list(self.col_colors),
                               name='col_annot',
                               index=self.df_norm.columns)
        kws = dict(self.default_kws,
                   row_colors=row_series,
                   col_colors=col_series)

        grid = mat.clustermap(self.df_norm, **kws)

        shown_row_labels = [
            tick.get_text() for tick in grid.ax_row_colors.get_xticklabels()
        ]
        assert grid.row_color_labels == ['row_annot']
        assert shown_row_labels == grid.row_color_labels

        shown_col_labels = [
            tick.get_text() for tick in grid.ax_col_colors.get_yticklabels()
        ]
        assert grid.col_color_labels == ['col_annot']
        assert shown_col_labels == grid.col_color_labels

    def test_row_col_colors_series_shuffled(self):
        """Series colors given in a different order are matched by label."""
        n_rows, n_cols = self.df_norm.shape

        # Even positions first, then odd positions.
        row_order = list(range(0, n_rows, 2)) + list(range(1, n_rows, 2))
        col_order = list(range(0, n_cols, 2)) + list(range(1, n_cols, 2))
        shuffled_inds = [self.df_norm.index[pos] for pos in row_order]
        shuffled_cols = [self.df_norm.columns[pos] for pos in col_order]

        kws = self.default_kws.copy()

        row_series = pd.Series(list(self.row_colors),
                               name='row_annot',
                               index=self.df_norm.index)
        kws['row_colors'] = row_series.loc[shuffled_inds]

        col_series = pd.Series(list(self.col_colors),
                               name='col_annot',
                               index=self.df_norm.columns)
        kws['col_colors'] = col_series.loc[shuffled_cols]

        grid = mat.clustermap(self.df_norm, **kws)

        # The stored colors must come back in the original order.
        assert list(grid.col_colors) == list(self.col_colors)
        assert list(grid.row_colors) == list(self.row_colors)

    def test_row_col_colors_series_missing(self):
        """A label missing from Series colors is expected to come back white."""
        white = (1.0, 1.0, 1.0)
        first_row = self.df_norm.index[0]
        first_col = self.df_norm.columns[0]

        kws = self.default_kws.copy()

        row_series = pd.Series(list(self.row_colors),
                               name='row_annot',
                               index=self.df_norm.index)
        kws['row_colors'] = row_series.drop(first_row)

        col_series = pd.Series(list(self.col_colors),
                               name='col_annot',
                               index=self.df_norm.columns)
        kws['col_colors'] = col_series.drop(first_col)

        grid = mat.clustermap(self.df_norm, **kws)

        # The dropped first entry is expected as white, the rest unchanged.
        assert list(grid.col_colors) == [white] + list(self.col_colors[1:])
        assert list(grid.row_colors) == [white] + list(self.row_colors[1:])

    def test_row_col_colors_ignore_heatmap_kwargs(self):
        """Heatmap cmap/norm/vmax settings must not alter annotation colors."""
        data = self.rs.uniform(0, 200, self.df_norm.shape)
        grid = mat.clustermap(data,
                              row_colors=self.row_colors,
                              col_colors=self.col_colors,
                              cmap="Spectral",
                              norm=mpl.colors.LogNorm(),
                              vmax=100)

        # Compare RGB only: the face colors carry a fourth (alpha) column.
        want_rows = np.array(self.row_colors)[
            grid.dendrogram_row.reordered_ind]
        have_rows = grid.ax_row_colors.collections[0].get_facecolors()[:, :3]
        assert np.array_equal(want_rows, have_rows)

        want_cols = np.array(self.col_colors)[
            grid.dendrogram_col.reordered_ind]
        have_cols = grid.ax_col_colors.collections[0].get_facecolors()[:, :3]
        assert np.array_equal(want_cols, have_cols)

    def test_row_col_colors_raise_on_mixed_index_types(self):
        """Labelled colors combined with ``self.x_norm`` data raise TypeError."""
        labelled_rows = pd.Series(list(self.row_colors),
                                  name="row_annot",
                                  index=self.df_norm.index)
        labelled_cols = pd.Series(list(self.col_colors),
                                  name="col_annot",
                                  index=self.df_norm.columns)

        with pytest.raises(TypeError):
            mat.clustermap(self.x_norm, row_colors=labelled_rows)

        with pytest.raises(TypeError):
            mat.clustermap(self.x_norm, col_colors=labelled_cols)

    def test_mask_reorganization(self):
        """The mask must be reordered in step with the clustered data."""
        kws = dict(self.default_kws, mask=self.df_norm > 0)

        grid = mat.clustermap(self.df_norm, **kws)

        # Mask and data share the same labels ...
        npt.assert_array_equal(grid.data2d.index, grid.mask.index)
        npt.assert_array_equal(grid.data2d.columns, grid.mask.columns)

        # ... and those labels follow the dendrogram ordering.
        row_order = grid.dendrogram_row.reordered_ind
        npt.assert_array_equal(grid.mask.index,
                               self.df_norm.index[row_order])

        col_order = grid.dendrogram_col.reordered_ind
        npt.assert_array_equal(grid.mask.columns,
                               self.df_norm.columns[col_order])

    def test_ticklabel_reorganization(self):
        """Custom tick labels must be reordered along with the clustering."""
        col_ticks = np.arange(self.df_norm.shape[1])
        # NOTE(review): ``.loc`` slicing includes its end label, so this may
        # select one more label than there are rows -- confirm it is intended.
        row_ticks = self.letters.loc[:self.df_norm.shape[0]]

        kws = dict(self.default_kws,
                   xticklabels=list(col_ticks),
                   yticklabels=row_ticks)

        grid = mat.clustermap(self.df_norm, **kws)

        shown_x = [tick.get_text()
                   for tick in grid.ax_heatmap.get_xticklabels()]
        shown_y = [tick.get_text()
                   for tick in grid.ax_heatmap.get_yticklabels()]

        # Expected labels: the inputs in dendrogram order, cast to 1-char text.
        want_x = col_ticks[grid.dendrogram_col.reordered_ind].astype("<U1")
        want_y = row_ticks[grid.dendrogram_row.reordered_ind].astype("<U1")

        npt.assert_array_equal(shown_x, want_x)
        npt.assert_array_equal(shown_y, want_y)

    def test_noticklabels(self):
        """Disabling tick labels leaves the heatmap without any label text."""
        kws = dict(self.default_kws, xticklabels=False, yticklabels=False)

        grid = mat.clustermap(self.df_norm, **kws)

        shown_x = [tick.get_text()
                   for tick in grid.ax_heatmap.get_xticklabels()]
        shown_y = [tick.get_text()
                   for tick in grid.ax_heatmap.get_yticklabels()]
        assert shown_x == []
        assert shown_y == []

    def test_size_ratios(self):
        """Larger ratio arguments must yield comparatively larger axes."""

        # The way that wspace/hspace work in GridSpec, the mapping from input
        # ratio to actual width/height of each axes is complicated, so this
        # test only asserts comparative relationships.

        def height(ax):
            return ax.get_position().height

        def width(ax):
            return ax.get_position().width

        # --- Scalar dendrogram/colors ratios -------------------------------
        small_kws = dict(self.default_kws,
                         dendrogram_ratio=.2,
                         colors_ratio=.03,
                         col_colors=self.col_colors,
                         row_colors=self.row_colors)
        large_kws = dict(small_kws, dendrogram_ratio=.3, colors_ratio=.05)

        small = mat.clustermap(self.df_norm, **small_kws)
        large = mat.clustermap(self.df_norm, **large_kws)

        assert height(large.ax_col_dendrogram) > height(small.ax_col_dendrogram)
        assert height(large.ax_col_colors) > height(small.ax_col_colors)
        assert height(large.ax_heatmap) < height(small.ax_heatmap)

        assert width(large.ax_row_dendrogram) > width(small.ax_row_dendrogram)
        assert width(large.ax_row_colors) > width(small.ax_row_colors)
        assert width(large.ax_heatmap) < width(small.ax_heatmap)

        # --- One versus two column color annotations -----------------------
        single_kws = dict(self.default_kws, col_colors=self.col_colors)
        double_kws = dict(single_kws,
                          col_colors=[self.col_colors, self.col_colors])

        single = mat.clustermap(self.df_norm, **single_kws)
        double = mat.clustermap(self.df_norm, **double_kws)

        assert height(double.ax_col_colors) > height(single.ax_col_colors)

        # --- Tuple dendrogram ratio: only the second entry differs ---------
        even_kws = dict(self.default_kws, dendrogram_ratio=(.2, .2))
        uneven_kws = dict(even_kws, dendrogram_ratio=(.2, .3))

        even = mat.clustermap(self.df_norm, **even_kws)
        uneven = mat.clustermap(self.df_norm, **uneven_kws)

        # Fails on pinned matplotlib?
        # assert (uneven.ax_row_dendrogram.get_position().width
        #         == even.ax_row_dendrogram.get_position().width)
        assert even.gs.get_width_ratios() == uneven.gs.get_width_ratios()

        assert height(uneven.ax_col_dendrogram) > height(even.ax_col_dendrogram)

    def test_cbar_pos(self):
        """``cbar_pos`` places the colorbar axes; ``None`` leaves it unset."""
        requested = (.2, .1, .4, .3)
        kws = dict(self.default_kws, cbar_pos=requested)

        grid = mat.clustermap(self.df_norm, **kws)
        bbox = grid.ax_cbar.get_position()
        # The tuple is compared as (corner x, corner y, width, height).
        assert pytest.approx(tuple(bbox.p0)) == requested[:2]
        assert pytest.approx(bbox.width) == requested[2]
        assert pytest.approx(bbox.height) == requested[3]

        kws["cbar_pos"] = None
        grid = mat.clustermap(self.df_norm, **kws)
        assert grid.ax_cbar is None

    def test_square_warning(self):
        """``square=True`` warns and leaves the heatmap position unchanged."""
        kws = self.default_kws.copy()
        plain = mat.clustermap(self.df_norm, **kws)

        with pytest.warns(UserWarning):
            kws["square"] = True
            squared = mat.clustermap(self.df_norm, **kws)

        plain_box = plain.ax_heatmap.get_position().get_points()
        squared_box = squared.ax_heatmap.get_position().get_points()
        assert np.array_equal(plain_box, squared_box)

    def test_clustermap_annotation(self):
        """Annotation texts show the clustered values formatted as ``.1f``."""
        # Exercise both ``annot=True`` and an explicit annotation frame.
        for annot in (True, self.df_norm):
            grid = mat.clustermap(self.df_norm, annot=annot, fmt=".1f")
            values = np.asarray(grid.data2d).flat
            for value, label in zip(values, grid.ax_heatmap.texts):
                assert label.get_text() == "{:.1f}".format(value)

    def test_tree_kws(self):
        """A color passed through ``tree_kws`` is applied to both trees."""
        rgb = (1, .5, .2)
        grid = mat.clustermap(self.df_norm, tree_kws={"color": rgb})

        for dendro_ax in (grid.ax_col_dendrogram, grid.ax_row_dendrogram):
            # Unpacking asserts there is exactly one collection on the axes.
            (tree,) = dendro_ax.collections
            assert tuple(tree.get_color().squeeze())[:3] == rgb
Ejemplo n.º 31
0
def get_single_inverse_image_clusters(inverseImage, coveringSetElements, dmax, debugMode, metricName='euclidean', clusterMethod='single'):
    """Cluster the points of one inverse image, merging only below ``dmax``.

    The points are clustered hierarchically with
    ``fastcluster.linkage_vector``; every merge whose distance is strictly
    smaller than ``dmax`` is applied, all others are ignored.

    Parameters
    ----------
    inverseImage : array-like with a ``shape`` attribute
        One row per data point; passed to ``fastcluster.linkage_vector``.
    coveringSetElements : sequence
        ``coveringSetElements[i]`` is the original identifier of row ``i``.
    dmax : number
        Merges at a distance ``>= dmax`` are not performed.
    debugMode : int
        When equal to 1, diagnostic information is printed.
    metricName : str, optional
        Metric forwarded to ``fastcluster.linkage_vector``.
    clusterMethod : str, optional
        Linkage method forwarded to ``fastcluster.linkage_vector``.

    Returns
    -------
    list of set
        One set of original identifiers per resulting cluster.  With no
        elements the result is ``[set()]``; with a single element it is a
        single singleton set.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    if debugMode == 1:
        print("")
        print("(function:  get_single_inverse_image_clusters)  ")

    # Clustering needs at least two data points in the inverse image.
    numElements = len(coveringSetElements)
    if numElements == 0:
        return [set()]
    if numElements == 1:
        return [{coveringSetElements[0]}]

    # Perform clustering in the inverse image.
    links = fastcluster.linkage_vector(inverseImage, method=clusterMethod, metric=metricName)

    # Total number of data points in this inverse image.
    N = inverseImage.shape[0]

    # Initial clusters: labels 0..N-1 match how the leaves are labelled in
    # 'links'; each value is a list holding the original point identifier.
    clusterDict = {i: [coveringSetElements[i]] for i in range(N)}

    # Column 2 of 'links' is the merging distance of the two clusters listed
    # in columns 0 and 1.  Row `index` creates the cluster labelled N + index.
    for index in range(N - 1):
        if links[index][2] < dmax:
            # The linkage array stores labels as floats; use plain ints as
            # dictionary keys.
            p1 = int(links[index][0])
            p2 = int(links[index][1])
            # Replace the two merged clusters by their concatenation.
            clusterDict[N + index] = clusterDict.pop(p1) + clusterDict.pop(p2)

    # Convert every cluster from a list to a set.  Build a real list: on
    # Python 3 ``dict.values()`` is a view that cannot be assigned into.
    clusters = []
    for members in clusterDict.values():
        memberSet = set(members)
        # If the lengths differ, an identifier occurred twice in a cluster.
        if len(members) != len(memberSet) and debugMode == 1:
            print("ERROR!! (function:  get_single_inverse_image_clusters):  listlength != setlength")
        clusters.append(memberSet)

    if debugMode == 1:
        print("links.shape = " + str(links.shape))
        print("number of data points in this inverse image = " + str(N))
        print("should = number of elements in coveringSetElements = " + str(len(coveringSetElements)))
        print("Number of clusters = " + str(len(clusters)))
        print("first merging distance = " + str(links[0][2]))
        print("last merging distance = " + str(links[N-2][2]))
        print("")

    return clusters
Ejemplo n.º 32
0
    labels = algorithm.labels_
    end_time = time.time()
    palette = sns.color_palette('deep', np.unique(labels).max() + 1)
    colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]
    plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds)
    frame = plt.gca()
    frame.axes.get_xaxis().set_visible(False)
    frame.axes.get_yaxis().set_visible(False)
    # plt.title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)
    # plt.text(-0.5, 0.7, 'Clustering took {:.2f} s'.format(end_time - start_time), fontsize=14)


# clusterer = hdbscan.HDBSCAN(min_cluster_size=1000, min_samples=100).fit(X)
# plot_clusters(X[:,:5], clusterer)

# Hierarchically cluster the rows of X with Ward linkage, then cut the tree
# into flat clusters (criterion 'maxclust' with a limit of 8).
# NOTE(review): fcluster is presumably scipy.cluster.hierarchy.fcluster --
# confirm against the file's imports.
link_mat = fastcluster.linkage_vector(X, method='ward')  ## fc_cluster
labels = fcluster(link_mat, 8, criterion='maxclust')
# plt.clf();plt.scatter(X.T[0], X.T[1], c=(labels==6).astype(np.int))

# Drop every sample assigned to flat cluster 6; the same boolean filter is
# applied to X, imgs and imgs_paths so the three stay aligned.
X = X[labels != 6, :]

## remove shots of sky for light normalization....
imgs = imgs[labels != 6]
imgs_paths = imgs_paths[labels != 6]

## generate additional features
# Each image is viewed as 24 x 38 x 3 and flattened to rows of 3 values, so
# the statistics below are taken per column of that (-1, 3) layout.
# NOTE(review): the three columns are presumably color channels -- confirm.
stdevs = [np.std(t.reshape((24, 38, 3)).reshape(-1, 3), axis=0) for t in imgs]
avg_vals = np.array(
    [np.mean(t.reshape((24, 38, 3)).reshape(-1, 3), axis=0) for t in imgs])

################ gmm ################
Ejemplo n.º 33
0
        fastcluster.linkage(D, method=method)
        raise AssertionError('fastcluster did not detect a NaN value!')
    except FloatingPointError:
        pass

# Next: the original array does not contain a NaN, but a NaN occurs
# as an updated distance.
# The input mixes +inf and -inf dissimilarities; each listed method is
# expected to signal the resulting NaN by raising FloatingPointError.
for method in ['average', 'weighted', 'ward', 'centroid', 'median']:
    try:
        fastcluster.linkage([np.inf,-np.inf,-np.inf], method=method)
        # Only reached when no FloatingPointError was raised; AssertionError
        # is not caught below, so it aborts the script.
        raise AssertionError('fastcluster did not detect a NaN value!')
    except FloatingPointError:
        pass

# Part 2: vector input

# np.random.random_integers is deprecated; randint excludes its upper bound,
# so (2, 13) draws from the same inclusive range 2..12.
dim = np.random.randint(2, 13)
X = np.random.rand(n,dim)
pos = (np.random.randint(n), np.random.randint(dim))
# Insert a single NaN coordinate
X[pos] = np.nan

# Every vector-input method must report the NaN via FloatingPointError.
for method in ['single', 'ward', 'centroid', 'median']:
    try:
        fastcluster.linkage_vector(X, method=method)
        # Only reached when no FloatingPointError was raised; AssertionError
        # is not caught below, so it aborts the script.
        raise AssertionError('fastcluster did not detect a NaN value!')
    except FloatingPointError:
        pass

print('OK.')
Ejemplo n.º 34
0
def test_all(n,dim):
    """Check ``fc.linkage_vector`` against ``pdist``-based reference data.

    Random data of shape ``(n, dim)`` is generated, first boolean and then
    integer-valued.  For every metric the dissimilarities ``D`` from
    ``pdist`` and the linkage ``Z2`` from ``fc.linkage_vector`` are handed to
    the helper ``test`` (defined elsewhere in this file).  Finally the
    'ward', 'centroid' and 'median' methods are checked with the default
    ``pdist`` metric.

    Raises
    ------
    AssertionError
        If ``linkage_vector`` modifies its boolean input, or reports a NaN
        although ``D`` contains none.
    """
    method = 'single'

    # metrics for boolean vectors
    # randint excludes its upper bound, so (0, 2) yields values in {0, 1}.
    # ``np.bool`` was only an alias of the builtin ``bool`` and has been
    # removed from NumPy, hence ``dtype=bool``.
    pcd = np.array(np.random.randint(0, 2, (n, dim)), dtype=bool)
    pcd2 = pcd.copy()

    for metric in ('hamming', 'jaccard', 'yule', 'matching', 'dice',
                   'rogerstanimoto',
                   #'sokalmichener',
                   # exclude, bug in Scipy
                   # http://projects.scipy.org/scipy/ticket/1486
                   'russellrao', 'sokalsneath',
                   #'kulsinski'
                   # exclude, bug in Scipy
                   # http://projects.scipy.org/scipy/ticket/1484
                   ):
        sys.stdout.write("Metric: " + metric + "...")
        D = pdist(pcd, metric)
        D = correct_for_zero_vectors(D, pcd, metric)

        try:
            Z2 = fc.linkage_vector(pcd, method, metric)
        except FloatingPointError:
            # If linkage_vector reported a NaN dissimilarity value,
            # check whether the distance matrix really contains NaN.
            if np.any(np.isnan(D)):
                print("Skip this test: NaN dissimilarity value.")
                continue
            else:
                raise AssertionError('"linkage_vector" erroneously reported NaN.')

        # The input must be left untouched by linkage_vector.
        if np.any(pcd2!=pcd):
            raise AssertionError('Input array was corrupted.', pcd)
        test(Z2, method, D)

    # metrics for real vectors
    # Integer coordinates in [-bound, bound]; the bound is truncated to an
    # int because randint needs integer limits (upper limit is exclusive).
    bound = int(math.sqrt(n))
    pcd = np.random.randint(-bound, bound + 1, (n, dim))
    for metric in ['euclidean', 'sqeuclidean', 'cityblock', 'chebychev',
                   'minkowski', 'cosine', 'correlation', 'hamming', 'jaccard',
                   'canberra',
                   # canberra: see bug in older Scipy versions
                   # http://projects.scipy.org/scipy/ticket/1430
                   'braycurtis', 'seuclidean', 'mahalanobis', 'user']:
        sys.stdout.write("Metric: " + metric + "...")
        if metric=='minkowski':
            p = np.random.uniform(1.,10.)
            sys.stdout.write("p: " + str(p) + "...")
            # Pass p by keyword: recent SciPy versions accept metric
            # parameters as keyword arguments only.
            D = pdist(pcd, metric, p=p)
            Z2 = fc.linkage_vector(pcd, method, metric, p)
        elif metric=='user':
            # Euclidean metric as a user function
            fn = (lambda u, v: np.sqrt(((u-v)*(u-v).T).sum()))
            D = pdist(pcd, fn)
            Z2 = fc.linkage_vector(pcd, method, fn)
        else:
            D = pdist(pcd, metric)
            D = correct_for_zero_vectors(D, pcd, metric)
            try:
                Z2 = fc.linkage_vector(pcd, method, metric)
            except FloatingPointError:
                # Only accept the NaN report when D really contains a NaN.
                if np.any(np.isnan(D)):
                    print("Skip this test: NaN dissimilarity value.")
                    continue
                else:
                    raise AssertionError(
                        '"linkage_vector" erroneously reported NaN.')

        test(Z2, method, D)

    # Remaining vector methods, checked with the default pdist metric.
    D = pdist(pcd)
    for method in ['ward', 'centroid', 'median']:
        Z2 = fc.linkage_vector(pcd, method)
        test(Z2, method, D)
Ejemplo n.º 35
0
class TestDendrogram(object):
    """Tests for ``mat._DendrogramPlotter`` and ``mat.dendrogram``.

    The expected linkage and dendrogram are computed once, at class
    definition time, from a fixed-seed random matrix.  Assertions are plain
    ``assert`` statements rather than nose helper functions.
    """

    # Fixed seed so the reference data is reproducible.
    rs = np.random.RandomState(sum(map(ord, "dendrogram")))

    # 4 x 8 random matrix with an added column trend and row trend.
    x_norm = rs.randn(4, 8) + np.arange(8)
    x_norm = (x_norm.T + np.arange(4)).T
    letters = pd.Series(["A", "B", "C", "D", "E", "F", "G", "H"],
                        name="letters")

    df_norm = pd.DataFrame(x_norm, columns=letters)

    # Reference linkage over the columns: fastcluster when it can be
    # imported, otherwise scipy.
    try:
        import fastcluster

        x_norm_linkage = fastcluster.linkage_vector(x_norm.T,
                                                    metric='euclidean',
                                                    method='single')
    except ImportError:
        x_norm_distances = distance.pdist(x_norm.T, metric='euclidean')
        x_norm_linkage = hierarchy.linkage(x_norm_distances, method='single')
    x_norm_dendrogram = hierarchy.dendrogram(x_norm_linkage, no_plot=True,
                                             color_list=['k'],
                                             color_threshold=-np.inf)
    x_norm_leaves = x_norm_dendrogram['leaves']
    df_norm_leaves = np.asarray(df_norm.columns[x_norm_leaves])

    # Keyword arguments shared by most plotter constructions below.
    default_kws = dict(linkage=None, metric='euclidean', method='single',
                       axis=1, label=True, rotate=False)

    def test_ndarray_input(self):
        """Plain ndarray input: data, linkage, ticks and labels."""
        p = mat._DendrogramPlotter(self.x_norm, **self.default_kws)
        npt.assert_array_equal(p.array.T, self.x_norm)
        pdt.assert_frame_equal(p.data.T, pd.DataFrame(self.x_norm))

        npt.assert_array_equal(p.linkage, self.x_norm_linkage)
        assert p.dendrogram == self.x_norm_dendrogram

        npt.assert_array_equal(p.reordered_ind, self.x_norm_leaves)

        npt.assert_array_equal(p.xticklabels, self.x_norm_leaves)
        npt.assert_array_equal(p.yticklabels, [])

        assert p.xlabel is None
        assert p.ylabel == ''

    def test_df_input(self):
        """DataFrame input: tick labels and axis label come from the columns."""
        p = mat._DendrogramPlotter(self.df_norm, **self.default_kws)
        npt.assert_array_equal(p.array.T, np.asarray(self.df_norm))
        pdt.assert_frame_equal(p.data.T, self.df_norm)

        npt.assert_array_equal(p.linkage, self.x_norm_linkage)
        assert p.dendrogram == self.x_norm_dendrogram

        npt.assert_array_equal(p.xticklabels,
                               np.asarray(self.df_norm.columns)[
                                   self.x_norm_leaves])
        npt.assert_array_equal(p.yticklabels, [])

        assert p.xlabel == 'letters'
        assert p.ylabel == ''

    def test_df_multindex_input(self):
        """MultiIndex labels are joined with '-' for the tick labels."""

        df = self.df_norm.copy()
        index = pd.MultiIndex.from_tuples([("A", 1), ("B", 2),
                                           ("C", 3), ("D", 4)],
                                          names=["letter", "number"])
        index.name = "letter-number"
        df.index = index
        kws = self.default_kws.copy()
        kws['label'] = True

        p = mat._DendrogramPlotter(df.T, **kws)

        xticklabels = ["A-1", "B-2", "C-3", "D-4"]
        xticklabels = [xticklabels[i] for i in p.reordered_ind]
        npt.assert_array_equal(p.xticklabels, xticklabels)
        npt.assert_array_equal(p.yticklabels, [])
        assert p.xlabel == "letter-number"

    def test_axis0_input(self):
        """axis=0 on the transposed frame gives the same clustering."""
        kws = self.default_kws.copy()
        kws['axis'] = 0
        p = mat._DendrogramPlotter(self.df_norm.T, **kws)

        npt.assert_array_equal(p.array, np.asarray(self.df_norm.T))
        pdt.assert_frame_equal(p.data, self.df_norm.T)

        npt.assert_array_equal(p.linkage, self.x_norm_linkage)
        assert p.dendrogram == self.x_norm_dendrogram

        npt.assert_array_equal(p.xticklabels, self.df_norm_leaves)
        npt.assert_array_equal(p.yticklabels, [])

        assert p.xlabel == 'letters'
        assert p.ylabel == ''

    def test_rotate_input(self):
        """rotate=True moves tick labels and axis label to the y axis."""
        kws = self.default_kws.copy()
        kws['rotate'] = True
        p = mat._DendrogramPlotter(self.df_norm, **kws)
        npt.assert_array_equal(p.array.T, np.asarray(self.df_norm))
        pdt.assert_frame_equal(p.data.T, self.df_norm)

        npt.assert_array_equal(p.xticklabels, [])
        npt.assert_array_equal(p.yticklabels, self.df_norm_leaves)

        assert p.xlabel == ''
        assert p.ylabel == 'letters'

    def test_rotate_axis0_input(self):
        """rotate=True with axis=0 keeps the reference leaf order."""
        kws = self.default_kws.copy()
        kws['rotate'] = True
        kws['axis'] = 0
        p = mat._DendrogramPlotter(self.df_norm.T, **kws)

        npt.assert_array_equal(p.reordered_ind, self.x_norm_leaves)

    def test_custom_linkage(self):
        """A linkage passed in by the caller is used as-is."""
        kws = self.default_kws.copy()

        # Build the custom linkage over the rows of x_norm, with the same
        # fastcluster/scipy fallback as the class-level reference data.
        try:
            import fastcluster

            linkage = fastcluster.linkage_vector(self.x_norm, method='single',
                                                 metric='euclidean')
        except ImportError:
            d = distance.pdist(self.x_norm, metric='euclidean')
            linkage = hierarchy.linkage(d, method='single')
        dendrogram = hierarchy.dendrogram(linkage, no_plot=True,
                                          color_list=['k'],
                                          color_threshold=-np.inf)
        kws['linkage'] = linkage
        p = mat._DendrogramPlotter(self.df_norm, **kws)

        npt.assert_array_equal(p.linkage, linkage)
        assert p.dendrogram == dendrogram

    def test_label_false(self):
        """label=False leaves ticks, tick labels and axis labels empty."""
        kws = self.default_kws.copy()
        kws['label'] = False
        p = mat._DendrogramPlotter(self.df_norm, **kws)
        assert p.xticks == []
        assert p.yticks == []
        assert p.xticklabels == []
        assert p.yticklabels == []
        assert p.xlabel == ""
        assert p.ylabel == ""

    def test_linkage_scipy(self):
        """The scipy code path matches a direct scipy computation."""
        p = mat._DendrogramPlotter(self.x_norm, **self.default_kws)

        scipy_linkage = p._calculate_linkage_scipy()

        from scipy.spatial import distance
        from scipy.cluster import hierarchy

        dists = distance.pdist(self.x_norm.T,
                               metric=self.default_kws['metric'])
        linkage = hierarchy.linkage(dists, method=self.default_kws['method'])

        npt.assert_array_equal(scipy_linkage, linkage)

    @skipif(_no_fastcluster)
    def test_fastcluster_other_method(self):
        """The 'average' method matches ``fastcluster.linkage``."""
        import fastcluster

        kws = self.default_kws.copy()
        kws['method'] = 'average'
        linkage = fastcluster.linkage(self.x_norm.T, method='average',
                                      metric='euclidean')
        p = mat._DendrogramPlotter(self.x_norm, **kws)
        npt.assert_array_equal(p.linkage, linkage)

    @skipif(_no_fastcluster)
    def test_fastcluster_non_euclidean(self):
        """A non-euclidean metric ('cosine') matches ``fastcluster.linkage``."""
        import fastcluster

        kws = self.default_kws.copy()
        kws['metric'] = 'cosine'
        kws['method'] = 'average'
        linkage = fastcluster.linkage(self.x_norm.T, method=kws['method'],
                                      metric=kws['metric'])
        p = mat._DendrogramPlotter(self.x_norm, **kws)
        npt.assert_array_equal(p.linkage, linkage)

    def test_dendrogram_plot(self):
        """The plotted dendrogram has the expected x limits and path count."""
        d = mat.dendrogram(self.x_norm, **self.default_kws)

        ax = plt.gca()
        xlim = ax.get_xlim()
        # 10 comes from _plot_dendrogram in scipy.cluster.hierarchy
        xmax = len(d.reordered_ind) * 10

        assert xlim[0] == 0
        assert xlim[1] == xmax

        assert len(ax.collections[0].get_paths()) == len(d.dependent_coord)

        plt.close('all')

    def test_dendrogram_rotate(self):
        """A rotated dendrogram uses an inverted y axis of the expected size."""
        kws = self.default_kws.copy()
        kws['rotate'] = True

        d = mat.dendrogram(self.x_norm, **kws)

        ax = plt.gca()
        ylim = ax.get_ylim()

        # 10 comes from _plot_dendrogram in scipy.cluster.hierarchy
        ymax = len(d.reordered_ind) * 10

        # Since y axis is inverted, ylim is (80, 0)
        # and therefore not (0, 80) as usual:
        assert ylim[1] == 0
        assert ylim[0] == ymax
        plt.close('all')

    def test_dendrogram_ticklabel_rotation(self):
        """Short labels stay at 0 degrees; long x labels are set to 90."""
        f, ax = plt.subplots(figsize=(2, 2))
        mat.dendrogram(self.df_norm, ax=ax)

        for t in ax.get_xticklabels():
            assert t.get_rotation() == 0

        plt.close(f)

        # Make every label ten times longer.
        df = self.df_norm.copy()
        df.columns = [str(c) * 10 for c in df.columns]
        df.index = [i * 10 for i in df.index]

        f, ax = plt.subplots(figsize=(2, 2))
        mat.dendrogram(df, ax=ax)

        for t in ax.get_xticklabels():
            assert t.get_rotation() == 90

        plt.close(f)

        f, ax = plt.subplots(figsize=(2, 2))
        # Pass the axes explicitly, as in the two calls above, instead of
        # relying on the new axes being pyplot's current axes.
        mat.dendrogram(df.T, axis=0, rotate=True, ax=ax)
        for t in ax.get_yticklabels():
            assert t.get_rotation() == 0
        plt.close(f)
Ejemplo n.º 36
0
    def search_engine(self,
                      raw_df,
                      centroids,
                      threshold=0.5,
                      min_sim=0.9,
                      model_name='model_fast_text_sg_40',
                      prod_id_column='product_id',
                      column_name_db='word_vector',
                      column_name_data='word_vector',
                      pre_computed_word_vectors=False,
                      min_amount_analogous=3,
                      clustering_algorithm='agglomerative'):
        '''
        Performs a search for similarity of word vectors from a dataframe in
        a precalculated reference DB, then groups the ads that could not be
        matched into candidate new products.

        raw_df is the unlabeled data; a ``category_id`` column filled with 0
            is added when it is missing
        centroids is the reference DB; the maximum of its ``product_id``
            column is used as the offset for newly created product ids
        threshold is the hierarchical clustering threshold distance (passed
            to ``fcluster``)
        min_sim is the minimum similarity in order to assign an ad_title to a
            product_id tag (forwarded to ``search_engine_fasttext``)
        model_name, column_name_db, column_name_data and
            pre_computed_word_vectors are forwarded unchanged to
            ``search_engine_fasttext``
        prod_id_column is accepted for interface compatibility but is not
            used by this method
        min_amount_analogous is both the minimum number of unlabeled ads
            required before clustering is attempted and the minimum
            ``counter`` a cluster needs to be kept as a new product
        clustering_algorithm is 'agglomerative' or 'community'; any other
            value leaves the unlabeled ads ungrouped (product_id stays -1)

        Side effects: sets ``self.new_existing_products`` and
        ``self.new_products`` on every call; ``self.dumped_products`` is only
        assigned when clustering is attempted.

        Returns the labeled frame (after the rename and column selection, but
        before the date columns are reworked); ``product_id == -1`` marks ads
        that were not matched.
        '''
        product_columns = [
            'product_id', 'starting_date', 'last_modified_date',
            'category_id', 'ad_title', 'counter', 'word_vector', 'ad_id'
        ]
        date_columns = ['date_max', 'date_min']

        last_product_id = max(centroids.product_id)

        if 'category_id' not in raw_df.columns:
            raw_df = raw_df.assign(category_id=0)

        test = search_engine_fasttext(
            raw_df,
            centroids,
            min_sim=min_sim,
            model_name=model_name,
            column_name_db=column_name_db,
            column_name_data=column_name_data,
            pre_computed_word_vectors=pre_computed_word_vectors)
        test = test.rename(columns={'product_id_fasttext': 'product_id'})
        test = test[[
            'date_min', 'date_max', 'category_id', 'ad_title', 'word_vector',
            'ad_id', 'product_id'
        ]]
        # ``assign``/``drop`` below return new frames, so ``data`` keeps
        # pointing at this pre-processing snapshot, which is what is returned.
        data = test
        print('{} unlabeled ads'.format(
            len(test[test.product_id == -1]['product_id'])))

        # Best effort: fall back to placeholder dates when the date columns
        # cannot be used.  ``except Exception`` (rather than a bare except)
        # keeps KeyboardInterrupt/SystemExit propagating.
        try:
            test = test.assign(last_modified_date=test.date_max,
                               starting_date=test.date_min)
        except Exception:
            test = test.assign(
                last_modified_date=datetime.datetime.today().strftime(
                    '%Y-%m-%d'),
                starting_date='2000-01-01')
        try:
            unknown_products = test[test.product_id == -1].assign(
                starting_date=test[test.product_id == -1].date_min)
        except Exception:
            print(test.product_id)
            unknown_products = test[test.product_id == -1].assign(
                starting_date=datetime.datetime.today().strftime('%Y-%m-%d'))

        # Drop the raw date columns if they are present; missing columns are
        # ignored instead of being handled through try/except.
        test = test.drop(date_columns, axis=1, errors='ignore')
        unknown_products = unknown_products.drop(
            date_columns, axis=1, errors='ignore')

        print('groupying new products')

        # Ads that matched an existing product, one counter unit per ad.
        self.new_existing_products = test[test.product_id != -1].assign(
            counter=1)
        self.new_existing_products = self.group_by_product(
            self.new_existing_products[product_columns])

        if len(unknown_products) >= min_amount_analogous:

            if clustering_algorithm == 'agglomerative':
                # NOTE(review): assumes every word_vector entry is nested so
                # that ``entry[0][0]`` is the actual vector -- confirm.
                unknown_data = np.array(
                    [i[0][0] for i in unknown_products.word_vector])
                cluster_ = fastcluster.linkage_vector(unknown_data,
                                                      method='ward')
                cluster_labels = Cluster.hierarchy.fcluster(
                    cluster_, threshold)
                unknown_products = unknown_products.assign(
                    product_id=cluster_labels)

            elif clustering_algorithm == 'community':
                unknown_products = self.graph_communities(
                    unknown_products,
                    min_value_=0.8,
                    topn_=400,
                    k1=50,
                    expected_density=0.1,
                    graph_communities_df=None)

            title_clusters_joinned = self.group_by_product(
                unknown_products, prod_id_column='product_id')
            enough_ads = title_clusters_joinned.counter >= min_amount_analogous
            new_products = title_clusters_joinned[enough_ads]
            dumped_products = title_clusters_joinned[
                title_clusters_joinned.counter < min_amount_analogous]
            # Shift cluster labels past the largest id already in the DB.
            new_products = new_products.assign(
                product_id=new_products.product_id.apply(
                    lambda x: x + last_product_id + 1))

            self.new_products = new_products
            self.dumped_products = dumped_products

            print(title_clusters_joinned)

        else:
            # Too few unlabeled ads to cluster: report no new products.
            # NOTE(review): self.dumped_products is not reset here, so it may
            # be missing or hold the value from a previous call.
            new_products = pd.DataFrame(columns=product_columns)
            self.new_products = new_products

        # Progress report; each line is best effort (e.g. dumped_products may
        # not exist yet).
        try:
            print(
                str(len(self.new_existing_products)) +
                ' products that already exist in data base')
        except Exception:
            pass
        try:
            print(str(len(self.new_products)) + ' new products found')
        except Exception:
            pass
        try:
            print(str(len(self.dumped_products)) + ' ads dumped')
        except Exception:
            pass

        # Re-index the result frames with consecutive integers.
        self.new_existing_products = self.new_existing_products.set_index(
            np.arange(len(self.new_existing_products)))
        self.new_products = self.new_products.set_index(
            np.arange(len(self.new_products)))

        try:
            self.dumped_products = self.dumped_products.set_index(
                np.arange(len(self.dumped_products)))
        except Exception:
            try:
                self.dumped_products = dumped_products.set_index(
                    np.arange(len(dumped_products)))
            except Exception:
                # Nothing was dumped in this call and no earlier value exists.
                pass

        return data
Ejemplo n.º 37
0
# Hierarchical agglomerative clustering

# Fortunately, it has been implemented by somebody, together with quite a nice
# analysis of the complexity and a comparison to other packages.
import fastcluster

# fastcluster has a nice website to go along with it:
# http://danifold.net/fastcluster.html

# The upshot is that linkage_vector is a much more efficient way to cluster
# vector data. But it isn't magic; always test when you have larger datasets.

from timeit import default_timer as timer

# Time the clustering of the first 1000 rows (first 7 columns only).
start = timer()
fastcluster.linkage_vector(mat_numeric[0:1000,:7])
end = timer()
# Elapsed seconds for 1000 rows
print(end - start) 

# Same measurement for the first 10000 rows.
start = timer()
fastcluster.linkage_vector(mat_numeric[0:10000,:7])
end = timer()
# Elapsed seconds for 10000 rows
print(end - start) 

# Hm, this won't scale linearly. Let's randomly sample rows so the data will be
# easier to work with.
# NOTE(review): the original comment said "sample 1000", but the code draws 100
# indices; np.random.randint samples with replacement, so rows may repeat.
# Assumes df and mat_numeric have the same number of rows -- TODO confirm.
np.random.seed(1)
samples = np.random.randint(0,df.shape[0], 100)
mat_sample = mat_numeric[samples, :]
Ejemplo n.º 38
0
def test_all(n, dim):
    """Run ``fc.linkage_vector`` against pdist-based references.

    Random ``n`` x ``dim`` data sets are generated (boolean first, then
    integer-valued), clustered with ``fc.linkage_vector`` and handed to the
    ``test`` helper together with the condensed distance matrix from
    ``pdist``.  ``test`` is defined elsewhere in the file; presumably it
    validates the linkage against the distances.

    Parameters
    ----------
    n : int
        Number of observations (rows) to generate.
    dim : int
        Number of dimensions (columns) per observation.

    Raises
    ------
    AssertionError
        If ``fc.linkage_vector`` modified the boolean input array.
    """
    method = 'single'

    # metrics for boolean vectors
    # np.random.random_integers is deprecated and the np.bool alias is not
    # available in every NumPy release, so use randint with the builtin bool.
    pcd = np.random.randint(0, 2, size=(n, dim), dtype=bool)
    pcd2 = pcd.copy()  # pristine copy used to detect in-place modification
    for metric in (
            'hamming',
            'jaccard',
            'yule',
            'matching',
            'dice',  #'kulsinski',
            'rogerstanimoto',
            #'sokalmichener',
            # exclude, bug in older Scipy versions
            # http://projects.scipy.org/scipy/ticket/1486
            'russellrao',
            'sokalsneath',
            #'kulsinski'
            # exclude, bug in older Scipy versions
            # http://projects.scipy.org/scipy/ticket/1484
    ):
        sys.stdout.write("Metric: " + metric + "...")
        D = pdist(pcd, metric)
        Z2 = fc.linkage_vector(pcd, method, metric)
        if np.any(pcd2 != pcd):
            raise AssertionError('Input array was corrupted.', pcd)
        test(Z2, method, D)

    # metrics for real vectors
    # Integer coordinates drawn uniformly from [-bound, bound] (inclusive);
    # randint's upper limit is exclusive, hence the ``+ 1``.
    bound = int(math.sqrt(n))
    pcd = np.random.randint(-bound, bound + 1, (n, dim))
    for metric in [
            'euclidean',
            'sqeuclidean',
            'cityblock',
            'chebychev',
            'minkowski',
            'cosine',
            'correlation',
            'hamming',
            'jaccard',
            #'canberra',
            # exclude, bug in older Scipy versions
            # http://projects.scipy.org/scipy/ticket/1430
            'braycurtis',
            'seuclidean',
            'mahalanobis',
            'user'
    ]:
        sys.stdout.write("Metric: " + metric + "...")
        if metric == 'minkowski':
            p = np.random.uniform(1., 10.)
            sys.stdout.write("p: " + str(p) + "...")
            # Pass the exponent by keyword: pdist no longer accepts it as a
            # positional argument in current SciPy releases.
            D = pdist(pcd, metric, p=p)
            Z2 = fc.linkage_vector(pcd, method, metric, p)
        elif metric == 'user':
            # Euclidean metric as a user function
            fn = (lambda u, v: np.sqrt(((u - v) * (u - v).T).sum()))
            D = pdist(pcd, fn)
            Z2 = fc.linkage_vector(pcd, method, fn)
        else:
            D = pdist(pcd, metric)
            Z2 = fc.linkage_vector(pcd, method, metric)
        test(Z2, method, D)

    # Remaining linkage methods, checked against plain Euclidean distances
    # (pdist's default metric) on the last integer data set.
    D = pdist(pcd)
    for method in ['ward', 'centroid', 'median']:
        Z2 = fc.linkage_vector(pcd, method)
        test(Z2, method, D)
Ejemplo n.º 39
0
class TestClustermap(object):
    """Tests for ``mat.ClusterGrid`` and ``mat.clustermap``.

    The class-level fixtures are built once, at class-definition time: a
    seeded random 4x8 array, the same data as a DataFrame with lettered
    columns, and a reference single-linkage clustering of the columns
    (computed with fastcluster when it is importable, with scipy otherwise).
    """
    # Seed derived from the string "clustermap" so the data is reproducible.
    rs = np.random.RandomState(sum(map(ord, "clustermap")))

    # 4 rows x 8 columns, with an added offset per column and per row.
    x_norm = rs.randn(4, 8) + np.arange(8)
    x_norm = (x_norm.T + np.arange(4)).T
    letters = pd.Series(["A", "B", "C", "D", "E", "F", "G", "H"],
                        name="letters")

    df_norm = pd.DataFrame(x_norm, columns=letters)
    try:
        import fastcluster

        x_norm_linkage = fastcluster.linkage_vector(x_norm.T,
                                                    metric='euclidean',
                                                    method='single')
    except ImportError:
        # Fallback when fastcluster is not installed.
        x_norm_distances = distance.pdist(x_norm.T, metric='euclidean')
        x_norm_linkage = hierarchy.linkage(x_norm_distances, method='single')
    x_norm_dendrogram = hierarchy.dendrogram(x_norm_linkage, no_plot=True,
                                             color_list=['k'],
                                             color_threshold=-np.inf)
    # Column order implied by the reference dendrogram.
    x_norm_leaves = x_norm_dendrogram['leaves']
    df_norm_leaves = np.asarray(df_norm.columns[x_norm_leaves])

    # Keyword arguments passed to ClusterGrid / clustermap by the tests.
    default_kws = dict(pivot_kws=None, z_score=None, standard_scale=None,
                       figsize=None, row_colors=None, col_colors=None)

    # Keyword arguments passed to ClusterGrid.plot by the tests.
    default_plot_kws = dict(metric='euclidean', method='average',
                            colorbar_kws=None,
                            row_cluster=True, col_cluster=True,
                            row_linkage=None, col_linkage=None)

    # One color per row / per column of df_norm.
    row_colors = color_palette('Set2', df_norm.shape[0])
    col_colors = color_palette('Dark2', df_norm.shape[1])

    def test_ndarray_input(self):
        """An ndarray becomes a DataFrame; four axes, no color axes."""
        cm = mat.ClusterGrid(self.x_norm, **self.default_kws)
        pdt.assert_frame_equal(cm.data, pd.DataFrame(self.x_norm))
        nt.assert_equal(len(cm.fig.axes), 4)
        nt.assert_equal(cm.ax_row_colors, None)
        nt.assert_equal(cm.ax_col_colors, None)

        plt.close('all')

    def test_df_input(self):
        """A DataFrame input is stored unchanged in ``data``."""
        cm = mat.ClusterGrid(self.df_norm, **self.default_kws)
        pdt.assert_frame_equal(cm.data, self.df_norm)

        plt.close('all')

    def test_corr_df_input(self):
        """After plotting a correlation matrix its diagonal is still ones."""
        df = self.df_norm.corr()
        cg = mat.ClusterGrid(df, **self.default_kws)
        cg.plot(**self.default_plot_kws)
        diag = cg.data2d.values[np.diag_indices_from(cg.data2d)]
        npt.assert_array_equal(diag, np.ones(cg.data2d.shape[0]))

        plt.close('all')

    def test_pivot_input(self):
        """Long-form data plus ``pivot_kws`` reproduces the wide frame."""
        df_norm = self.df_norm.copy()
        df_norm.index.name = 'numbers'
        df_long = pd.melt(df_norm.reset_index(), var_name='letters',
                          id_vars='numbers')
        kws = self.default_kws.copy()
        kws['pivot_kws'] = dict(index='numbers', columns='letters',
                                values='value')
        cm = mat.ClusterGrid(df_long, **kws)

        pdt.assert_frame_equal(cm.data2d, df_norm)

        plt.close('all')

    def test_colors_input(self):
        """Row/column colors are stored and add two axes (six in total)."""
        kws = self.default_kws.copy()

        kws['row_colors'] = self.row_colors
        kws['col_colors'] = self.col_colors

        cm = mat.ClusterGrid(self.df_norm, **kws)
        npt.assert_array_equal(cm.row_colors, self.row_colors)
        npt.assert_array_equal(cm.col_colors, self.col_colors)

        nt.assert_equal(len(cm.fig.axes), 6)
        plt.close('all')

    def test_nested_colors_input(self):
        """Nested (two-level) color lists are stored as given."""
        kws = self.default_kws.copy()

        row_colors = [self.row_colors, self.row_colors]
        col_colors = [self.col_colors, self.col_colors]
        kws['row_colors'] = row_colors
        kws['col_colors'] = col_colors

        cm = mat.ClusterGrid(self.df_norm, **kws)
        npt.assert_array_equal(cm.row_colors, row_colors)
        npt.assert_array_equal(cm.col_colors, col_colors)

        nt.assert_equal(len(cm.fig.axes), 6)
        plt.close('all')

    def test_colors_input_custom_cmap(self):
        """``clustermap`` accepts a custom ``cmap`` alongside color lists."""
        kws = self.default_kws.copy()

        kws['cmap'] = mpl.cm.PRGn
        kws['row_colors'] = self.row_colors
        kws['col_colors'] = self.col_colors

        cm = mat.clustermap(self.df_norm, **kws)
        npt.assert_array_equal(cm.row_colors, self.row_colors)
        npt.assert_array_equal(cm.col_colors, self.col_colors)

        nt.assert_equal(len(cm.fig.axes), 6)
        plt.close('all')

    def test_z_score(self):
        """``z_score=1`` matches (df - df.mean()) / df.std()."""
        df = self.df_norm.copy()
        df = (df - df.mean()) / df.std()
        kws = self.default_kws.copy()
        kws['z_score'] = 1

        cm = mat.ClusterGrid(self.df_norm, **kws)
        pdt.assert_frame_equal(cm.data2d, df)

        plt.close('all')

    def test_z_score_axis0(self):
        """``z_score=0`` matches the same standardization on the transpose."""
        df = self.df_norm.copy()
        df = df.T
        df = (df - df.mean()) / df.std()
        df = df.T
        kws = self.default_kws.copy()
        kws['z_score'] = 0

        cm = mat.ClusterGrid(self.df_norm, **kws)
        pdt.assert_frame_equal(cm.data2d, df)

        plt.close('all')

    def test_standard_scale(self):
        """``standard_scale=1`` matches min-max scaling of the frame."""
        df = self.df_norm.copy()
        df = (df - df.min()) / (df.max() - df.min())
        kws = self.default_kws.copy()
        kws['standard_scale'] = 1

        cm = mat.ClusterGrid(self.df_norm, **kws)
        pdt.assert_frame_equal(cm.data2d, df)

        plt.close('all')

    def test_standard_scale_axis0(self):
        """``standard_scale=0`` matches min-max scaling of the transpose."""
        df = self.df_norm.copy()
        df = df.T
        df = (df - df.min()) / (df.max() - df.min())
        df = df.T
        kws = self.default_kws.copy()
        kws['standard_scale'] = 0

        cm = mat.ClusterGrid(self.df_norm, **kws)
        pdt.assert_frame_equal(cm.data2d, df)

        plt.close('all')

    def test_z_score_standard_scale(self):
        """Requesting both ``z_score`` and ``standard_scale`` is an error."""
        kws = self.default_kws.copy()
        kws['z_score'] = True
        kws['standard_scale'] = True
        with nt.assert_raises(ValueError):
            # The constructor is expected to raise; its result is not needed.
            mat.ClusterGrid(self.df_norm, **kws)

        plt.close('all')

    def test_color_list_to_matrix_and_cmap(self):
        """A flat color list maps to an (n, 1) index matrix and a colormap."""
        matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
            self.col_colors, self.x_norm_leaves)

        colors_set = set(self.col_colors)
        col_to_value = dict((col, i) for i, col in enumerate(colors_set))
        matrix_test = np.array([col_to_value[col] for col in
                                self.col_colors])[self.x_norm_leaves]
        shape = len(self.col_colors), 1
        matrix_test = matrix_test.reshape(shape)
        cmap_test = mpl.colors.ListedColormap(colors_set)
        npt.assert_array_equal(matrix, matrix_test)
        npt.assert_array_equal(cmap.colors, cmap_test.colors)

        plt.close('all')

    def test_nested_color_list_to_matrix_and_cmap(self):
        """Nested color lists map to one matrix column per color list."""
        colors = [self.col_colors, self.col_colors]
        matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
            colors, self.x_norm_leaves)

        all_colors = set(itertools.chain(*colors))
        color_to_value = dict((col, i) for i, col in enumerate(all_colors))
        matrix_test = np.array(
            [color_to_value[c] for color in colors for c in color])
        shape = len(colors), len(colors[0])
        matrix_test = matrix_test.reshape(shape)
        # Reorder by the dendrogram leaves, then put color lists in columns.
        matrix_test = matrix_test[:, self.x_norm_leaves]
        matrix_test = matrix_test.T

        cmap_test = mpl.colors.ListedColormap(all_colors)
        npt.assert_array_equal(matrix, matrix_test)
        npt.assert_array_equal(cmap.colors, cmap_test.colors)

        plt.close('all')

    def test_color_list_to_matrix_and_cmap_axis1(self):
        """With ``axis=1`` a flat color list maps to a (1, n) matrix."""
        matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
            self.col_colors, self.x_norm_leaves, axis=1)

        colors_set = set(self.col_colors)
        col_to_value = dict((col, i) for i, col in enumerate(colors_set))
        matrix_test = np.array([col_to_value[col] for col in
                                self.col_colors])[self.x_norm_leaves]
        shape = 1, len(self.col_colors)
        matrix_test = matrix_test.reshape(shape)
        cmap_test = mpl.colors.ListedColormap(colors_set)
        npt.assert_array_equal(matrix, matrix_test)
        npt.assert_array_equal(cmap.colors, cmap_test.colors)

        plt.close('all')

    def test_savefig(self):
        """Saving a plotted grid as PNG to a temporary file does not raise."""
        # Not sure if this is the right way to test....
        cm = mat.ClusterGrid(self.df_norm, **self.default_kws)
        cm.plot(**self.default_plot_kws)
        # The context manager closes (and thereby removes) the temporary file
        # instead of leaving the handle open until garbage collection.
        with tempfile.NamedTemporaryFile() as tmp:
            cm.savefig(tmp, format='png')

        plt.close('all')

    def test_plot_dendrograms(self):
        """Both dendrograms are drawn and ``data2d`` follows their order."""
        cm = mat.clustermap(self.df_norm, **self.default_kws)

        nt.assert_equal(len(cm.ax_row_dendrogram.collections[0].get_paths()),
                        len(cm.dendrogram_row.independent_coord))
        nt.assert_equal(len(cm.ax_col_dendrogram.collections[0].get_paths()),
                        len(cm.dendrogram_col.independent_coord))
        data2d = self.df_norm.iloc[cm.dendrogram_row.reordered_ind,
                                   cm.dendrogram_col.reordered_ind]
        pdt.assert_frame_equal(cm.data2d, data2d)
        plt.close('all')

    def test_cluster_false(self):
        """Without clustering the dendrogram axes are empty, data untouched."""
        kws = self.default_kws.copy()
        kws['row_cluster'] = False
        kws['col_cluster'] = False

        cm = mat.clustermap(self.df_norm, **kws)
        nt.assert_equal(len(cm.ax_row_dendrogram.lines), 0)
        nt.assert_equal(len(cm.ax_col_dendrogram.lines), 0)

        nt.assert_equal(len(cm.ax_row_dendrogram.get_xticks()), 0)
        nt.assert_equal(len(cm.ax_row_dendrogram.get_yticks()), 0)
        nt.assert_equal(len(cm.ax_col_dendrogram.get_xticks()), 0)
        nt.assert_equal(len(cm.ax_col_dendrogram.get_yticks()), 0)

        pdt.assert_frame_equal(cm.data2d, self.df_norm)
        plt.close('all')

    def test_row_col_colors(self):
        """Each color axis holds exactly one collection."""
        kws = self.default_kws.copy()
        kws['row_colors'] = self.row_colors
        kws['col_colors'] = self.col_colors

        cm = mat.clustermap(self.df_norm, **kws)

        nt.assert_equal(len(cm.ax_row_colors.collections), 1)
        nt.assert_equal(len(cm.ax_col_colors.collections), 1)

        plt.close('all')

    def test_cluster_false_row_col_colors(self):
        """Color axes are still drawn when clustering is switched off."""
        kws = self.default_kws.copy()
        kws['row_cluster'] = False
        kws['col_cluster'] = False
        kws['row_colors'] = self.row_colors
        kws['col_colors'] = self.col_colors

        cm = mat.clustermap(self.df_norm, **kws)
        nt.assert_equal(len(cm.ax_row_dendrogram.lines), 0)
        nt.assert_equal(len(cm.ax_col_dendrogram.lines), 0)

        nt.assert_equal(len(cm.ax_row_dendrogram.get_xticks()), 0)
        nt.assert_equal(len(cm.ax_row_dendrogram.get_yticks()), 0)
        nt.assert_equal(len(cm.ax_col_dendrogram.get_xticks()), 0)
        nt.assert_equal(len(cm.ax_col_dendrogram.get_yticks()), 0)
        nt.assert_equal(len(cm.ax_row_colors.collections), 1)
        nt.assert_equal(len(cm.ax_col_colors.collections), 1)

        pdt.assert_frame_equal(cm.data2d, self.df_norm)
        plt.close('all')

    def test_mask_reorganization(self):
        """The mask is reordered exactly like the clustered data."""

        kws = self.default_kws.copy()
        kws["mask"] = self.df_norm > 0

        g = mat.clustermap(self.df_norm, **kws)
        npt.assert_array_equal(g.data2d.index, g.mask.index)
        npt.assert_array_equal(g.data2d.columns, g.mask.columns)

        npt.assert_array_equal(g.mask.index,
                               self.df_norm.index[
                                   g.dendrogram_row.reordered_ind])
        npt.assert_array_equal(g.mask.columns,
                               self.df_norm.columns[
                                   g.dendrogram_col.reordered_ind])

        plt.close("all")

    def test_ticklabel_reorganization(self):
        """Custom tick labels are reordered along with the clustered data."""

        kws = self.default_kws.copy()
        xtl = np.arange(self.df_norm.shape[1])
        kws["xticklabels"] = list(xtl)
        # ``.ix`` no longer exists in pandas; on this integer-indexed Series
        # it was label-based, so ``.loc`` is the exact equivalent (the end
        # label is included in the slice).
        ytl = self.letters.loc[:self.df_norm.shape[0]]
        kws["yticklabels"] = ytl

        g = mat.clustermap(self.df_norm, **kws)

        xtl_actual = [t.get_text() for t in g.ax_heatmap.get_xticklabels()]
        ytl_actual = [t.get_text() for t in g.ax_heatmap.get_yticklabels()]

        xtl_want = xtl[g.dendrogram_col.reordered_ind].astype("<U1")
        ytl_want = ytl[g.dendrogram_row.reordered_ind].astype("<U1")[::-1]

        npt.assert_array_equal(xtl_actual, xtl_want)
        npt.assert_array_equal(ytl_actual, ytl_want)

        plt.close("all")
Ejemplo n.º 40
0
def test_all(n,dim):
    """Run ``fc.linkage_vector`` against pdist-based references.

    Random ``n`` x ``dim`` data sets are generated (boolean first, then
    integer-valued), clustered with ``fc.linkage_vector`` and handed to the
    ``check`` helper together with the condensed distance matrix from
    ``pdist``.  ``check`` and ``correct_for_zero_vectors`` are defined
    elsewhere in the file; presumably the former validates the linkage and the
    latter adjusts distances involving all-zero vectors -- confirm there.

    A metric is skipped when ``fc.linkage_vector`` raises
    ``FloatingPointError`` and the reference distances really contain NaN.

    Parameters
    ----------
    n : int
        Number of observations (rows) to generate.
    dim : int
        Number of dimensions (columns) per observation.

    Raises
    ------
    AssertionError
        If ``fc.linkage_vector`` modified the boolean input array, or if it
        reported a NaN dissimilarity although the reference has none.
    """
    method = 'single'

    # metrics for boolean vectors
    # Use the builtin ``bool``: the ``np.bool`` alias is not available in
    # every NumPy release.
    pcd = np.random.randint(0, 2, size=(n,dim), dtype=bool)
    pcd2 = pcd.copy()  # pristine copy used to detect in-place modification

    for metric in ('hamming', 'jaccard', 'yule', 'matching', 'dice',
                   'rogerstanimoto',
                   #'sokalmichener',
                   # exclude, bug in Scipy
                   # http://projects.scipy.org/scipy/ticket/1486
                   'russellrao', 'sokalsneath',
                   #'kulsinski'
                   # exclude, bug in Scipy
                   # http://projects.scipy.org/scipy/ticket/1484
                   ):
        sys.stdout.write("Metric: " + metric + "...")
        D = pdist(pcd, metric=metric)
        D = correct_for_zero_vectors(D, pcd, metric)

        try:
            Z2 = fc.linkage_vector(pcd, method, metric)
        except FloatingPointError:
            # If linkage_vector reported a NaN dissimilarity value,
            # check whether the distance matrix really contains NaN.
            if np.any(np.isnan(D)):
                print("Skip this test: NaN dissimilarity value.")
                continue
            else:
                raise AssertionError(
                    '"linkage_vector" erroneously reported NaN.')

        if np.any(pcd2!=pcd):
            raise AssertionError('Input array was corrupted.', pcd)
        check(Z2, method, D)

    # metrics for real vectors
    # Integer coordinates; randint's upper limit is exclusive, hence ``+ 1``.
    bound = math.sqrt(n)
    pcd = np.random.randint(-bound, bound + 1, (n,dim))
    for metric in ['euclidean', 'sqeuclidean', 'cityblock', 'chebychev',
                   'minkowski', 'cosine', 'correlation', 'hamming', 'jaccard',
                   'canberra',
                   # canberra: see bug in older Scipy versions
                   # http://projects.scipy.org/scipy/ticket/1430
                   'braycurtis', 'seuclidean', 'mahalanobis', 'user']:
        sys.stdout.write("Metric: " + metric + "...")
        if metric=='minkowski':
            # Random exponent for the Minkowski metric.
            p = np.random.uniform(1.,10.)
            sys.stdout.write("p: " + str(p) + "...")
            D = pdist(pcd, metric=metric, p=p)
            Z2 = fc.linkage_vector(pcd, method, metric, p)
        elif metric=='user':
            # Euclidean metric as a user function
            fn = (lambda u, v: np.sqrt(((u-v)*(u-v).T).sum()))
            D = pdist(pcd, metric=fn)
            Z2 = fc.linkage_vector(pcd, method, fn)
        else:
            D = pdist(pcd, metric=metric)
            D = correct_for_zero_vectors(D, pcd, metric)
            try:
                Z2 = fc.linkage_vector(pcd, method, metric)
            except FloatingPointError:
                # Same NaN handling as for the boolean metrics above.
                if np.any(np.isnan(D)):
                    print("Skip this test: NaN dissimilarity value.")
                    continue
                else:
                    raise AssertionError(
                        '"linkage_vector" erroneously reported NaN.')

        check(Z2, method, D)

    # Remaining linkage methods, checked against plain Euclidean distances
    # (pdist's default metric) on the last integer data set.
    D = pdist(pcd)
    for method in ['ward', 'centroid', 'median']:
        Z2 = fc.linkage_vector(pcd, method)
        check(Z2, method, D)