Ejemplo n.º 1
0
def cameras():
    """Render the cameras page with one sample picture per person seen today.

    Loads every camera row and today's occurrences whose ``e1`` column is
    set, clusters the ``e1`` .. ``e128`` columns with OPTICS and, for every
    cluster holding at least four occurrences, picks one random occurrence.
    The ``human_picture`` path of that occurrence, with its first path
    component replaced by a leading ``/``, is passed to the template.

    Returns:
        The rendered ``cameras.html`` template.
    """
    conn, cursor = connect()
    try:
        cursor.execute("SELECT title, x, y, rstp, F, current_frame FROM cameras")
        camera_rows = cursor.fetchall()

        cursor.execute(
            "SELECT * FROM occurrences "
            "WHERE DATE(`timestamp`)=CURDATE() AND e1 IS NOT NULL"
        )
        today_occurrences = cursor.fetchall()
    finally:
        # Release the connection even when a query fails.
        conn.close()

    result_occurrences = []
    model = OPTICS()
    # OPTICS.fit raises ValueError when there are fewer samples than
    # min_samples, so a day with very few occurrences skips clustering.
    if len(today_occurrences) >= model.min_samples:
        embeddings = np.array(
            [[row["e%i" % i] for i in range(1, 129)] for row in today_occurrences]
        )
        model.fit(embeddings)
        indices = np.arange(len(today_occurrences))
        for label in range(np.max(model.labels_) + 1):
            person_indices = indices[model.labels_ == label]
            print(person_indices)
            # Small clusters are ignored.
            if len(person_indices) < 4:
                continue
            index = np.random.choice(person_indices)
            picture = today_occurrences[index]['human_picture']
            # Drop the first path component and make the path absolute.
            result_occurrences.append('/' + '/'.join(picture.split('/')[1:]))

    print(len(result_occurrences))

    return render_template('cameras.html', cameras=camera_rows, today_occurrences=result_occurrences)
Ejemplo n.º 2
0
    def optics(self, x, threshold = 0.01, min_samples = 0.01):
        """Fit an OPTICS model on ``x`` and return the fitted estimator.

        ``min_samples`` is interpreted as a fraction of ``len(x)``: the value
        handed to OPTICS is that fraction truncated to an int, but never
        less than 10. ``threshold`` is forwarded as ``eps``.

        NOTE(review): scikit-learn documents ``eps`` as used only when
        ``cluster_method='dbscan'``; with the default extraction method it
        may have no effect -- confirm the intended method.
        """
        neighbourhood_size = int(min_samples * len(x))
        if neighbourhood_size < 10:
            neighbourhood_size = 10
        return OPTICS(eps=threshold, min_samples=neighbourhood_size).fit(x)
Ejemplo n.º 3
0
def make_autoencoder(data, lr=0.001, enc_dim=100):
    """Build an autoencoder, its encoder and a cluster-probability model.

    The input shape comes from the module-level ``products_shape``. OPTICS is
    fitted on ``minmax.fit_transform(data)`` and the number of distinct
    labels it produces sets the width of the softmax output.

    NOTE(review): the label count includes the noise label ``-1`` when
    present -- confirm that noise is meant to be its own class.

    Returns:
        tuple: ``(autoencoder, encoder, probability_model, fitted_optics)``.
        Only the autoencoder is compiled (Adam with ``lr``, MAE loss, MSE
        metric).
    """
    # Autoencoder: input -> bottleneck -> reconstruction.
    features_in = Input(shape=products_shape, name='FeaturesInput')
    bottleneck = Dense(
        enc_dim,
        activation='relu',
        kernel_initializer=he_normal(1),
        name='AE_feature_reduction',
    )(features_in)
    reconstruction = Dense(
        products_shape[0], activation='relu', name='AE_3'
    )(bottleneck)

    # Clustering head, inspired by
    # https://www.frontiersin.org/articles/10.3389/fgene.2018.00585/full
    # OPTICS supplies the number of output classes for the softmax layer.
    opt = OPTICS()
    opt.fit(minmax.fit_transform(data))
    clusters = np.unique(opt.labels_).size
    print('Optimal number of cluster:', clusters)

    hidden = Dense(
        enc_dim // 2,
        activation='relu',
        kernel_initializer=he_normal(1),
    )(bottleneck)
    normalised = BatchNormalization()(hidden)
    membership = Dense(
        clusters, activation='softmax', name='Probability_Product'
    )(normalised)

    autoencoder_ = Model(inputs=features_in, outputs=reconstruction)
    encoder_ = Model(inputs=features_in, outputs=bottleneck)
    p_prob = Model(inputs=features_in, outputs=membership)

    autoencoder_.compile(
        optimizer=Adam(learning_rate=lr),
        loss='mae',
        metrics=['mse'],
    )

    return autoencoder_, encoder_, p_prob, opt
Ejemplo n.º 4
0
def optics(params):
    """Cluster a precomputed distance matrix with OPTICS.

    Args:
        params: Mapping with a ``"distance_path"`` entry naming a text file
            that ``np.loadtxt`` can read; it holds the pairwise distance
            matrix. Any value ``np.loadtxt`` accepts is allowed.

    Returns:
        tuple: One cluster label per sample; ``-1`` marks noise.
    """
    distance_path = params["distance_path"]
    print(distance_path)
    distance = np.loadtxt(distance_path, dtype=np.float32)
    print(distance.shape)

    # The input already holds distances, hence metric='precomputed'.
    op = OPTICS(eps=0.03, min_samples=10, metric='precomputed')
    print(op)

    op.fit(distance)
    labels = op.labels_
    print(labels, labels.shape)

    # Noise (-1) is not a cluster, so it is left out of the count.
    no_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print(no_clusters, "no_clusters")

    return tuple(labels.tolist())
Ejemplo n.º 5
0
    def setUp(self):
        """Fit OPTICS on six seeded Gaussian blobs and attach it to a TBH.

        Each blob is an ``(n, 3)`` array whose first column stays zero and
        whose last two columns hold the 2-D samples; only those two columns
        are clustered. The blobs are stored on ``self.tbhg.locH`` and the
        fitted estimator on ``self.tbhg.optics``.
        """
        n_points_per_cluster = 250
        np.random.seed(0)

        # (centre, spread) per blob; samples are drawn in this exact order
        # so the seeded random stream is consumed deterministically.
        blob_specs = (
            ([-5, -2], .8),
            ([4, -1], .1),
            ([0, -2], .2),
            ([-2, 3], .3),
            ([3, -2], 1.6),
            ([5, 6], 2),
        )
        blobs = []
        for centre, spread in blob_specs:
            blob = np.zeros((n_points_per_cluster, 3))
            blob[:, 1:3] = (
                centre + spread * np.random.randn(n_points_per_cluster, 2))
            blobs.append(blob)

        X = np.vstack([blob[:, 1:3] for blob in blobs])

        clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
        clust.fit(X)

        self.tbhg = modeling.TBH()
        self.tbhg.optics = clust
        self.tbhg.locH = tuple(blobs)
Ejemplo n.º 6
0
def optics_fit_predict(X, min_samples=50, cluster_method='dbscan', eps=2):
    """Fit OPTICS on ``X`` and return one cluster label per sample.

    Parameters
    ----------
    X               : array, shape (n_samples, n_features), or (n_samples, n_samples)
    min_samples     : The number of samples in a neighborhood for a point to be considered as a core point.
    cluster_method  : 'dbscan' by default. Other available: 'xi'
    eps             : The maximum distance between two samples for one to be considered as in the
                      neighborhood of the other. Only used when ``cluster_method`` is 'dbscan'.

    Returns
    -------
    labels: Prediction/labels (``-1`` marks noise).
            For 'xi' these are the labels found by the Xi extraction during
            fitting; for 'dbscan' they are extracted from the fitted
            reachability data at distance ``eps``.
    """
    cluster_method = str(cluster_method)
    opt = OPTICS(min_samples=min_samples, cluster_method=cluster_method)
    opt.fit(X)

    # Honour the requested method: previously the DBSCAN-style extraction
    # below was returned even when 'xi' had been asked for.
    if cluster_method == 'xi':
        return opt.labels_

    return cluster_optics_dbscan(reachability=opt.reachability_,
                                 core_distances=opt.core_distances_,
                                 ordering=opt.ordering_,
                                 eps=eps)
Ejemplo n.º 7
0
def optics_clustering(df, min_sample, min_cluster_size):
    """Cluster the rows of ``df`` with OPTICS and group ``pid`` values by cluster.

    Args:
        df: DataFrame to cluster; it must contain a ``pid`` column. A string
            column ``cluster_id`` is added to it in place.
        min_sample: Forwarded to OPTICS as ``min_samples``.
        min_cluster_size: Forwarded to OPTICS as ``min_cluster_size``.

    Returns:
        tuple: ``(df, cluster)`` where ``cluster`` is a list with one array
        of ``pid`` values per distinct cluster id, in order of first
        appearance (the noise id ``'-1'`` included).
    """
    clust = OPTICS(min_samples=min_sample,
                   xi=.05,
                   min_cluster_size=min_cluster_size)

    # Run the fit
    clust.fit(df)

    # labels_ is already aligned with the input rows; re-indexing it with
    # ordering_ would attach labels to the wrong rows.
    df['cluster_id'] = clust.labels_.astype(str)

    # One array of PIDs per cluster id.
    cluster = [
        np.array(df.pid.loc[df['cluster_id'] == cluster_no])
        for cluster_no in df['cluster_id'].unique()
    ]

    return df, cluster
Ejemplo n.º 8
0
def perform_optics_clustering(data, program_options: Options) -> ClusteredData:
    """Cluster ``data`` with OPTICS and wrap the outcome in ``ClusteredData``.

    Every non-negative label becomes a ``FULL_CLUSTER`` whose centre is the
    mean of its members. Each sample labelled ``-1`` is added separately as
    an ``UNCLASSIFIED_NODE_CLUSTER`` containing only itself.
    """
    result = ClusteredData(data, list(), program_options=program_options)

    fitted = OPTICS(min_samples=program_options.OPTICS_MIN_SAMPLES, n_jobs=-1)
    fitted.fit(data)
    labels = fitted.labels_

    for cluster_id in range(labels.max() + 1):
        members = data[labels == cluster_id]
        # OPTICS exposes no cluster centres, so use the mean of the members.
        centre = members.mean(axis=0)
        result.add_cluster(
            Cluster(cluster_centre=centre,
                    nodes=members,
                    cluster_type=ClusterType.FULL_CLUSTER,
                    program_options=program_options))

    # Nothing was labelled as noise: done.
    if labels.min() != -1:
        return result

    for node in data[labels == -1]:
        result.add_unclassified_node(
            Cluster(node, [node],
                    cluster_type=ClusterType.UNCLASSIFIED_NODE_CLUSTER,
                    program_options=program_options))

    return result
def get_clustered_data(data_matrix,
                       clustering_algorithm=model_constants.KMEANS,
                       distance_metric='euclidean',
                       num_clusters=3):
    """Fit the requested clustering algorithm and return its labels.

    The algorithm name is compared case-insensitively against the
    ``model_constants`` names; any unrecognised name falls back to KMeans.
    ``distance_metric`` is used by affinity propagation, DBSCAN, OPTICS and
    agglomerative clustering; ``num_clusters`` by Birch, agglomerative
    clustering and KMeans.

    Returns:
        tuple: ``(labels, fitted_estimator)``.
    """
    algorithm = clustering_algorithm.lower()

    if algorithm == model_constants.AFFINITY_PROP:
        estimator = AffinityPropagation(affinity=distance_metric)
    elif algorithm == model_constants.DBSCAN:
        estimator = DBSCAN(metric=distance_metric)
    elif algorithm == model_constants.OPTICS:
        estimator = OPTICS(metric=distance_metric)
    elif algorithm == model_constants.MEANSHIFT:
        estimator = MeanShift()
    elif algorithm == model_constants.BIRCH:
        estimator = Birch(n_clusters=num_clusters)
    elif algorithm == model_constants.AGGLOMERATIVE:
        estimator = AgglomerativeClustering(n_clusters=num_clusters,
                                            affinity=distance_metric)
    else:
        estimator = KMeans(n_clusters=num_clusters, random_state=42)

    estimator.fit(data_matrix)
    return estimator.labels_, estimator
Ejemplo n.º 10
0
def routes_cluster(df, orig, dest, params, color, size):
    '''Groups routes using the OPTICS clustering algorithm.

    Each route is the ``x``/``y`` of its ``orig`` row joined with the
    ``x``/``y`` of its ``dest`` row. The returned frame is re-ordered by
    the OPTICS ordering and gains the columns ``reachability_order``,
    ``reachability``, ``route_cluster`` (noise becomes NaN),
    ``reachability_plot`` and ``reachability_color``.

    NOTE(review): the re-indexing with ``ordering`` and ``ordering + n``
    presumes ``df`` is labelled 0..2n-1 with the n origin rows first --
    confirm against the caller.
    '''
    model = OPTICS(**params)
    endpoints = np.hstack((df.loc[orig, ['x', 'y']].values,
                           df.loc[dest, ['x', 'y']].values))
    model.fit(X=endpoints)

    n_routes = len(endpoints)
    order = model.ordering_.tolist()
    shifted = [pos + n_routes for pos in model.ordering_]
    df = df.loc[order + shifted, :]

    # Every per-route value is repeated for both halves of the frame.
    df['reachability_order'] = list(range(n_routes)) * 2
    df['reachability'] = model.reachability_[order].tolist() * 2
    df['route_cluster'] = model.labels_[order].tolist() * 2
    df['route_cluster'] = df['route_cluster'].replace(-1, np.nan)
    df['reachability_plot'] = df['route_cluster'] // size

    # Fill the gaps left by noise: backward first, then forward, and for
    # the origin rows before the destination rows in each pass.
    for fill in ('bfill', 'ffill'):
        for rows in (orig, dest):
            filled = df.loc[rows, 'reachability_plot'].fillna(method=fill)
            df.loc[rows, 'reachability_plot'] = filled

    df['reachability_color'] = (df['route_cluster'] % size).map(color)
    df['reachability_color'] = df['reachability_color'].fillna('(1,1,1)')
    return df
Ejemplo n.º 11
0
	def sub_cluster(wanted_gps, wanted_time, file_names, min_pic_num, show_idx=False):
		"""Cluster pictures by standardized position and time using OPTICS.

		Positions are converted with ``drs.lonlat2xyz`` and times with
		``drs.convert_datetime_seconds``; both are standardized (mean 0,
		unit deviation) and combined into 4-column samples. The labels come
		from ``find_best_thres`` applied to the fitted OPTICS model.

		Args:
			wanted_gps: Sequence of 3-item positions passed to ``drs.lonlat2xyz``.
			wanted_time: Timestamps passed to ``drs.convert_datetime_seconds``.
			file_names: File name of each sample, aligned with ``wanted_gps``.
			min_pic_num: Used as OPTICS ``min_samples`` and as the minimum
				size for a cluster to be kept.
			show_idx: When true, show a figure titled with the chosen
				threshold and the number of clusters.

		Returns:
			tuple: ``(clusters, labels, noise, unchosen)`` -- an array of
			file-name sets for clusters with at least ``min_pic_num``
			members, the label array, the list of noise file names, and
			an array of the clusters that were too small.
		"""

		# normalization
		wanted_xyz = [drs.lonlat2xyz(x[0], x[1], x[2]) for x in wanted_gps]
		norm_xyz = np.array((wanted_xyz - np.mean(wanted_xyz, 0)) / (np.std(wanted_xyz, 0) + np.array([1e-15, 1e-15, 1e-15])))
		wanted_secs = drs.convert_datetime_seconds(wanted_time)
		# The small constant guards against division by zero.
		norm_secs = np.array((wanted_secs - np.mean(wanted_secs)) / (np.std(wanted_secs) + np.array([1e-15, 1e-15, 1e-15])))
		norm_info = np.array([np.array([x[0], x[1], x[2], y[0]]) for x, y in zip(norm_xyz, norm_secs)])
		# OPTICS parameters are keyword-only in current scikit-learn, so
		# min_samples must be passed by name.
		clust = OPTICS(min_samples=min_pic_num)
		clust.fit(norm_info)
		img_cl_idx, res_eps, min_noise = find_best_thres(clust, len(wanted_gps))
		# plotting
		if show_idx:
			fig = plt.figure()
			ax = fig.add_subplot(111, projection='3d')
			title = "threshold: %f, number of clusters: %d" % (res_eps, len(np.unique(img_cl_idx)))
			ax.set_title(title)
			plt.show()
		# find file names in each cluster; slot 0 is reserved for noise (-1):
		res_fn = [[] for _ in np.unique(np.append(img_cl_idx, [-1]))]
		for idx, cl_idx in enumerate(img_cl_idx):
			try:
				res_fn[cl_idx + 1].append(file_names[idx])
			except:
				# NOTE(review): drops into the debugger on any failure;
				# confirm this is wanted outside interactive sessions.
				pdb.set_trace()

		res_noise = res_fn[0]
		res_fn = np.array(res_fn[1:])
		res_unchosen = res_fn[[len(x) < min_pic_num for x in res_fn]]
		res_fn = res_fn[[len(x) >= min_pic_num for x in res_fn]]
		return np.array([set(x) for x in res_fn]), np.array(img_cl_idx), res_noise, np.array(res_unchosen)
Ejemplo n.º 12
0
def test_correct_number_of_clusters():
    """OPTICS recovers the generated cluster count and exposes well-typed attributes."""
    # in 'auto' mode

    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)
    # Parameters chosen specifically for this task.
    clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=.1)
    clust.fit(X)

    # Count the clusters, leaving out the noise label when present.
    found = len(set(clust.labels_)) - int(-1 in clust.labels_)
    assert found == n_clusters

    # Every fitted attribute is one value per sample with the expected
    # dtype kind ('i' integer, 'f' float).
    expected_kinds = (
        ('labels_', 'i'),
        ('reachability_', 'f'),
        ('core_distances_', 'f'),
        ('ordering_', 'i'),
    )
    for attribute, kind in expected_kinds:
        values = getattr(clust, attribute)
        assert values.shape == (len(X),)
        assert values.dtype.kind == kind

    # The ordering is a permutation of the sample indices.
    assert set(clust.ordering_) == set(range(len(X)))
Ejemplo n.º 13
0
def cluster_optics(data_mat, metric='euclidean', min_samples=5,
                   max_eps=np.inf, n_jobs=32, save_dir=''):
    """ Fit OPTICS on a data matrix, print the elapsed time and optionally save the labels.

    :param data_mat: (array) data matrix
    :param metric: (str) distance metric to use in clustering
    :param min_samples: (int) minimum number of neighbours for core points
    :param max_eps: (float) maximum distance for OPTICS
    :param n_jobs: (int) number or jobs to spawn
    :param save_dir: (str) directory where the labels are written as a
        ``.npy`` file; nothing is written when empty
    :return: (model, array) trained OPTICS model and labels array
    """
    started = time.time()
    opt = OPTICS(min_samples=min_samples, metric=metric,
                 n_jobs=n_jobs, max_eps=max_eps)
    opt.fit(data_mat)
    print('Clustering took: {}'.format(time.time() - started))

    opt_labs = opt.labels_
    if not save_dir:
        return opt, opt_labs

    # The file name records the parameters that produced the labels.
    f_name = 'optics_labels_{}_ms{}_me{}'.format(metric, min_samples, max_eps)
    np.save(os.path.join(save_dir, f_name + '.npy'), opt_labs)
    return opt, opt_labs
Ejemplo n.º 14
0
    def find_pairs(self):
        """
        Uses the OPTICS algorithm to find clusters of similar securities
        within PCA component space. Once cluster labels are assigned, every
        unique pair of securities sharing a cluster is collected.

        Stores the pairs as a Series of tuples on ``self.pairs`` and the raw
        cluster labels on ``self.cluster_labels_``.

        Raises:
            ValueError: if ``self.returns_reduced`` is None, i.e.
                ``.reduce_PCA()`` has not been run yet.
        """

        if self.returns_reduced is None:
            # Implicit concatenation keeps the message free of the run of
            # spaces a backslash continuation inside the literal would embed.
            raise ValueError("returns_reduced not found: must run "
                             ".reduce_PCA() before this function")

        # Initialize and fit OPTICS cluster to PCA components
        clustering = OPTICS()
        clustering.fit(self.components_.T)

        # Create cluster data frame and identify trading pairs
        clusters = pd.DataFrame({
            'security': self.securities,
            'cluster': clustering.labels_
        })
        # clusters with label == -1 are 'noise'
        clusters = clusters[clusters['cluster'] != -1]

        # Group securities by cluster and flatten list of combination lists
        groups = clusters.groupby('cluster')
        combos = list(groups['security'].apply(combinations, 2))  # All pairs
        pairs = list(chain.from_iterable(combos))  # Flatten list of lists

        print(f"Found {len(pairs)} potential pairs")

        self.pairs = pd.Series(pairs)
        self.cluster_labels_ = clustering.labels_
Ejemplo n.º 15
0
def test_min_cluster_size_invalid2():
    """A min_cluster_size above the sample count is rejected for dense and sparse input."""
    too_large = len(X) + 1
    expected = "must be no greater than the "

    dense_model = OPTICS(min_cluster_size=too_large)
    with pytest.raises(ValueError, match=expected):
        dense_model.fit(X)

    sparse_model = OPTICS(min_cluster_size=too_large, metric="euclidean")
    with pytest.raises(ValueError, match=expected):
        sparse_model.fit(sparse.csr_matrix(X))
Ejemplo n.º 16
0
def test_min_cluster_size_invalid(min_cluster_size):
    """An invalid min_cluster_size is rejected at fit time for dense and sparse input."""
    expected = "must be a positive integer or a "

    dense_model = OPTICS(min_cluster_size=min_cluster_size)
    with pytest.raises(ValueError, match=expected):
        dense_model.fit(X)

    sparse_model = OPTICS(min_cluster_size=min_cluster_size,
                          metric="euclidean")
    with pytest.raises(ValueError, match=expected):
        sparse_model.fit(sparse.csr_matrix(X))
Ejemplo n.º 17
0
def cluster_embedded_maps_optics(aligned_maps):
    """Cluster maps with default OPTICS and return one label per map.

    Each map is flattened into a row; the stacked rows are cast to float64
    before fitting.
    """
    flattened = [xmap.flatten() for xmap in aligned_maps]
    embedding = np.vstack(flattened).astype(np.float64)
    return OPTICS().fit(embedding).labels_
Ejemplo n.º 18
0
def test_bad_reachability():
    """Fitting with a very small max_eps emits the 'all reachability values are inf' warning."""
    expected_warning = "All reachability values are inf. Set a larger max_eps."
    X, _ = make_blobs(n_samples=750,
                      centers=[[1, 1], [-1, -1], [1, -1]],
                      cluster_std=0.4,
                      random_state=0)

    with pytest.warns(UserWarning, match=expected_warning):
        OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015).fit(X)
Ejemplo n.º 19
0
def run_optics(data):
    """Fit OPTICS on ``data``, report cluster count and run time, and return the labels.

    Returns:
        The per-sample label array; ``-1`` marks noise.
    """
    print(">>> Running OPTICS")
    clf = OPTICS(min_samples=25, xi=.05, min_cluster_size=.02)
    tic = time.time()
    clf.fit(data)
    toc = time.time()
    # The noise label (-1) is not a cluster, so it is excluded from the
    # reported count.
    n_clusters = len(set(clf.labels_)) - int(-1 in clf.labels_)
    print("    OPTICS found {} clusters".format(n_clusters))
    print("    OPTICS took {:.2f} s".format(toc - tic))
    return clf.labels_
Ejemplo n.º 20
0
 def _do_optics(self):
     """
     Cluster the PCA representation with OPTICS using default parameters
     (the paper mentions no optimal ones) and remember the labels.
     :return:
     clusterings: The cluster label of each stock
     """
     labels = OPTICS().fit(self.__pca_repr).labels_
     self.__clusterings = labels
     return labels
Ejemplo n.º 21
0
def OPTICS_Clustering(X):
    """Preprocess ``X``, fit OPTICS and return the labels found during fitting.

    The returned labels are ``labels_`` of the fitted estimator (``-1``
    marks noise). The earlier DBSCAN-style extraction at ``eps=2`` was
    overwritten before being returned, so that unused work is dropped.
    """
    X = preprocess(X)
    cluster = OPTICS(min_samples=100, xi=.05, min_cluster_size=.05)
    cluster.fit(X)
    return cluster.labels_
Ejemplo n.º 22
0
def form_clusters():
    """Cluster saved features with OPTICS and show a two-panel figure.

    The features are loaded from a ``.npy`` file whose name is built from
    the module-level ``img_width`` and ``img_height``. The top panel is the
    reachability plot in cluster order; the bottom panel scatters the first
    two feature columns. Labels 0-4 get one colour each, noise is black.
    """
    features = np.load(
        "xception_features_with_dim_{}x{}".format(img_width, img_height) +
        '.npy')

    # (An earlier DBSCAN-based variant of this function was retired in
    # favour of OPTICS.)
    clust = OPTICS(min_samples=5, xi=.05)
    clust.fit(features)

    space = np.arange(len(features))
    ordered_reach = clust.reachability_[clust.ordering_]
    ordered_labels = clust.labels_[clust.ordering_]

    plt.figure(figsize=(10, 7))
    grid = gridspec.GridSpec(2, 1)
    ax1 = plt.subplot(grid[0, :])
    ax2 = plt.subplot(grid[1, 0])

    # One plot format per cluster label 0..4, shared by both panels.
    palette = ['g.', 'r.', 'b.', 'y.', 'c.']

    # Reachability plot
    for klass, fmt in enumerate(palette):
        in_class = ordered_labels == klass
        ax1.plot(space[in_class], ordered_reach[in_class], fmt, alpha=0.3)
    noise = ordered_labels == -1
    ax1.plot(space[noise], ordered_reach[noise], 'k.', alpha=0.3)
    # Horizontal reference lines at 2.0 and 0.5.
    ax1.plot(space, np.full_like(space, 2., dtype=float), 'k-', alpha=0.5)
    ax1.plot(space, np.full_like(space, 0.5, dtype=float), 'k-.', alpha=0.5)
    ax1.set_ylabel('Reachability (epsilon distance)')
    ax1.set_title('Reachability Plot')

    # OPTICS
    for klass, fmt in enumerate(palette):
        members = features[clust.labels_ == klass]
        ax2.plot(members[:, 0], members[:, 1], fmt, alpha=0.3)
    unassigned = clust.labels_ == -1
    ax2.plot(features[unassigned, 0],
             features[unassigned, 1],
             'k+',
             alpha=0.1)
    ax2.set_title('Automatic Clustering\nOPTICS')

    plt.tight_layout()
    plt.show()
 def _cluster_optics(self):
     """Fit OPTICS on ``self.data`` and return the labels chosen by ``_extract_best_optics``.

     ``self.min_clu_size`` is used both as ``min_cluster_size`` and as
     ``min_samples``. When ``self.estimated_k`` is truthy, the number of
     clusters (largest label plus one) is appended to ``self.cand_k``.
     """
     model = OPTICS(min_cluster_size=self.min_clu_size,
                    min_samples=self.min_clu_size,
                    metric=self.distance_metric,
                    leaf_size=len(self.data))
     model.fit(X=self.data)
     pred = self._extract_best_optics(model)
     if not self.estimated_k:
         return pred
     # Record this run's k among the candidate values.
     self.cand_k.append(1+max(pred))
     return pred
def clustering(the_image_autoencoded,
               the_image_shape,
               number_of_clusters,
               extra_parameters=""):
    """Cluster encoded pixels with OPTICS and return the labels as an image.

    OPTICS is fitted on ``the_image_autoencoded`` and labels are extracted
    DBSCAN-style at ``eps=0.5``. The first ``height * width`` labels are
    laid out row by row into a float array.

    Args:
        the_image_autoencoded: Samples to cluster, one per pixel in
            row-major order.
        the_image_shape: Sequence whose first two items are the image
            height and width.
        number_of_clusters: Unused; kept for interface compatibility.
        extra_parameters: Unused; kept for interface compatibility.

    Returns:
        numpy.ndarray: ``(height, width)`` float array of cluster labels
        (``-1`` marks noise).
    """
    print()
    print("***   OPTICS clustering   ***")
    print("---------------------------------")
    # https://scikit-learn.org/stable/modules/clustering.html
    # https://scikit-learn.org/stable/auto_examples/cluster/plot_optics.html
    # #sphx-glr-auto-examples-cluster-plot-optics-py
    # https://scikit-learn.org/stable/modules/clustering.html#optics

    print("Image shape: ", the_image_shape)

    print("OPTICS clustering")
    clust = OPTICS(min_samples=10, xi=.0005, min_cluster_size=.005)

    print("Running fit function for OPTICS clustering")
    clust.fit(the_image_autoencoded)

    # Only the eps=0.5 extraction is used for the result.
    labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                       core_distances=clust.core_distances_,
                                       ordering=clust.ordering_,
                                       eps=0.5)

    print("---------------------------")
    reachability = clust.reachability_[clust.ordering_]
    print("Reachability: ", reachability)
    print("---------------------------")

    print("Creating list for clustered data")
    rows, cols = the_image_shape[0], the_image_shape[1]
    clustered_data = np.zeros((rows, cols))
    print("Clustered data shape:  ", np.shape(clustered_data))

    # Row-major reshape replaces the element-by-element copy.
    clustered_data[:] = np.asarray(labels_050[:rows * cols]).reshape(rows, cols)

    return clustered_data
Ejemplo n.º 25
0
def test_min_cluster_size(min_cluster_size):
    """No cluster is smaller than min_cluster_size, given as a count or as a fraction."""
    redX = X[::2]  # reduce for speed
    n_samples = redX.shape[0]

    by_count = OPTICS(min_samples=9, min_cluster_size=min_cluster_size)
    by_count.fit(redX)
    assigned = by_count.labels_[by_count.labels_ != -1]
    cluster_sizes = np.bincount(assigned)
    if cluster_sizes.size:
        assert min(cluster_sizes) >= min_cluster_size

    # The same threshold expressed as a fraction must give the same labels.
    by_fraction = OPTICS(min_samples=9,
                         min_cluster_size=min_cluster_size / n_samples)
    by_fraction.fit(redX)
    assert_array_equal(by_count.labels_, by_fraction.labels_)
Ejemplo n.º 26
0
    def fit(self, X):
        """
        Apply the ST OPTICS algorithm
        ----------
        X : 2D numpy array with
            The first element of the array should be the time
            attribute as float. The following positions in the array are
            treated as spatial coordinates. The structure should look like this [[time_step1, x, y], [time_step2, x, y]..]
            For example 2D dataset:
            array([[0,0.45,0.43],
            [0,0.54,0.34],...])
        Returns
        -------
        self

        Raises
        ------
        ValueError
            If eps1, eps2 or min_samples is not positive.

        Notes
        -----
        An infinite ``max_eps`` or ``eps1`` is replaced on the instance by a
        finite value during fitting. The results are stored on ``labels``,
        ``reachability``, ``ordering``, ``core_distances``, ``predecessor``
        and ``cluster_hierarchy``.
        """
        # check if input is correct
        X = check_array(X)

        all_positive = (self.eps1 > 0.0
                        and self.eps2 > 0.0
                        and self.min_samples > 0.0)
        if not all_positive:
            raise ValueError('eps1, eps2, minPts must be positive')

        n_samples, n_columns = X.shape

        # Condensed pairwise distances: time attribute alone, then the
        # spatial attributes.
        time_dist = pdist(X[:, 0].reshape(n_samples, 1), metric=self.metric)
        euc_dist = pdist(X[:, 1:], metric=self.metric)

        # Pairs further apart in time than eps2 get a large filler distance
        # instead of their spatial distance.
        time_filter = math.pow(10, n_columns)
        dist = np.where(time_dist <= self.eps2, euc_dist, time_filter)

        # speeds up the ST OPTICS: cap infinite bounds just below the filler
        for bound in ('max_eps', 'eps1'):
            if np.isinf(getattr(self, bound)):
                setattr(self, bound, time_filter - 1)

        op = OPTICS(eps=self.eps1,
                    min_samples=self.min_samples,
                    metric='precomputed',
                    max_eps=self.max_eps,
                    cluster_method=self.cluster_method,
                    xi=self.xi,
                    n_jobs=self.n_jobs).fit(squareform(dist))

        self.labels = op.labels_
        self.reachability = op.reachability_
        self.ordering = op.ordering_
        self.core_distances = op.core_distances_
        self.predecessor = op.predecessor_
        self.cluster_hierarchy = op.cluster_hierarchy_

        return self
def train_topics(tweets, model, _min_samples=5):
    """Fit OPTICS on the quantified topics of ``tweets`` and pickle the model.

    Args:
        tweets: Tweets passed to ``get_topics_quantified`` with
            ``n_topics=1``; the first item of each result is clustered.
        model: Path of the file the fitted estimator is pickled to.
        _min_samples: Forwarded to OPTICS as ``min_samples``.

    Returns:
        None.
    """
    quantified = get_topics_quantified(tweets, n_topics=1)
    quantified = [x[0] for x in quantified]

    optics = OPTICS(min_samples=_min_samples)
    optics.fit(quantified)

    # The context manager closes the file even if pickling fails.
    with open(model, 'wb') as file:
        pickle.dump(optics, file)


### CREATE TWEET ARRAY EXAMPLES
# tweets = ["Commended for no longer saying 'China virus' Did US military bring #Covid19 to 7th Military World Games Oct18-27, 2019 Wuhan, China? Patient zero: Maatja Benassi US Athlete/Intelligence Officer? Did World's military take it back to their countries?", "BREAKING | Boris Johnson will get lung ventilation - health source sptnkne.ws/BWtv #SputnikBreaking @BorisJohnson"]
# #tweets = ["BREAKING | Boris Johnson will get lung ventilation - health source sptnkne.ws/BWtv #SputnikBreaking @BorisJohnson"]
# #tweets = ["Commended for no longer saying 'China virus' Did US military bring #Covid19 to 7th Military World Games Oct18-27, 2019 Wuhan, China? Patient zero: Maatja Benassi US Athlete/Intelligence Officer? Did World's military take it back to their countries?"]
# tweets = ["The legislative council belongs to the people of Hong Kong.Those people with ulterior motives indicated by forces hide behind the scenes laid seige to the legislative.The path of your darkness and the bright roads of the masses of the Hong Kong people will not inevitably coexist."]

### GET AND PRINT TOPICS
# topics = get_topics(tweets);
# print(topics);

### GET AND PRINT TOPIC VECTORS
# quantified = get_topics_quantified(tweets);
# print(quantified);

### CLUSTER TWEETS BASED ON TOPICS
# print(train_topics(tweets, "models/model_001.pickle", _min_samples=1));
# print(cluster_topics(tweets, "models/model_001.pickle"));

# # Read the CSV file mapping all tweet data to a motive.
# labeledDataPath = "../data/actors_and_motives.csv";
# df = pd.read_csv(labeledDataPath, usecols=["tweet_docs", "motive"], converters={"tweet_docs": lambda x: x.strip("[]").split(", ")});

# # Removes leading and ending quote characters
# df["tweet_docs"] = [[x.strip('\"') for x in df["tweet_docs"][i]] for i in range(len(df["tweet_docs"]))];

# MAX_LINES = 30;
# tweets = [];

# # Get all file paths and their associated motives from the dataframe.
# _files, _classes = [], [];
# for i in range(len(df["tweet_docs"])):
#     for j in range(len(df["tweet_docs"][i])):
#         data = pd.read_csv(df["tweet_docs"][i][j], usecols=["tweet_text"], nrows=MAX_LINES);
#         tweets += data["tweet_text"].tolist();

# train_topics(tweets, "models/model_001.pickle");

# tweets = ["Commended for no longer saying 'China virus' Did US military bring #Covid19 to 7th Military World Games Oct18-27, 2019 Wuhan, China? Patient zero: Maatja Benassi US Athlete/Intelligence Officer? Did World's military take it back to their countries?", "BREAKING | Boris Johnson will get lung ventilation - health source sptnkne.ws/BWtv #SputnikBreaking @BorisJohnson"]
# labels = cluster_topics(tweets, "models/model_001.pickle");
# connected = sorted(zip(labels, tweets));
# for label, tweet in connected:
#     print(str(label) + ": " + str(tweet));
Ejemplo n.º 28
0
def test_minimum_number_of_sample_check():
    """Fitting fails when min_samples exceeds the number of samples."""
    single_point = [[1, 1]]
    clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1)

    # One sample cannot satisfy min_samples=10.
    with pytest.raises(ValueError,
                       match="min_samples must be no greater than"):
        clust.fit(single_point)
Ejemplo n.º 29
0
def doClustering(X = None, y = None, initial = False, silent = True, numClusters = 4):
    """Cluster the transformed data and write the result.

    The data is read with ``rd.readTransformedData()``. Unless ``initial``
    equals ``False`` being untrue, i.e. on non-initial runs, an ITML metric
    is first learned from the feedback pairs and applied to the points.
    KMeans with ``numClusters`` clusters is the active algorithm; the OPTICS
    and DBSCAN branches are selected only by the local switches below.

    NOTE(review): the ``X`` and ``y`` arguments are overwritten by the data
    read from ``rd`` and are therefore never used -- confirm whether callers
    expect them to be honoured.

    Returns:
        int: always 1.
    """
    use_kmeans = True
    use_optics = False
    if not silent:
        print("- doClustering")

    X, y = rd.readTransformedData()

    points = X.iloc[:, 0:].values

    # metric learning from user feedback (skipped on the initial run)
    if initial == False:
        votesX, votesY = rd.readFeedbackData()
        pairs = [(points[row["id_punkt1"]], points[row["id_punkt2"]])
                 for _, row in votesX.iterrows()]

        itml = ITML()
        itml.fit(pairs, votesY)
        if not silent:
            print("Transform")

        points = itml.transform(points)

    labels_true = y
    if use_kmeans:
        labels = KMeans(n_clusters=numClusters, random_state=0).fit(points).labels_
        core_samples_mask = [0] * len(y)
    elif use_optics:
        opt = OPTICS(min_samples=30, xi=.05)
        opt.fit(points)
        labels = opt.labels_
        core_samples_mask = [0] * len(y)
    else:
        db = DBSCAN(eps=0.6, min_samples=5).fit(points)
        # Flag the samples DBSCAN reports as core samples.
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_

    writeClusteringResult(points, labels, labels_true, core_samples_mask)

    if not silent:
        print("+ doClustering")
    return 1
Ejemplo n.º 30
0
class OPTICS_algo_wrapper:
    """Thin fit/predict adapter around a fixed-parameter OPTICS estimator."""

    def __init__(self):
        # Last fitted data and its labels; empty until fit() is called.
        self.data = []
        self.indexes = []
        self.wrapped = OPTICS(min_samples=5, xi=.05, min_cluster_size=.05)

    def fit(self, data):
        """Fit the estimator on ``data`` and remember the data and its labels."""
        estimator = self.wrapped
        estimator.fit(data)
        self.data = data
        self.indexes = estimator.labels_

    def predict(self, data):
        """Return labels for ``data`` by calling ``fit_predict`` on the estimator.

        This refits the wrapped estimator on ``data``; ``self.data`` and
        ``self.indexes`` are not updated.
        """
        labels = self.wrapped.fit_predict(data)
        return labels
Ejemplo n.º 31
0
# Demo: fit OPTICS on six seeded 2-D Gaussian blobs and prepare the plots.
np.random.seed(0)
n_points_per_cluster = 250

# Each blob is a centre plus scaled standard-normal noise.
C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
C3 = [1, -2] + .2 * np.random.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .3 * np.random.randn(n_points_per_cluster, 2)
C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))

clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)

# Run the fit
clust.fit(X)

# DBSCAN-style label extractions from the fitted model at two eps values.
labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_, eps=0.5)
labels_200 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_, eps=2)

# Reachability values and labels re-indexed by the OPTICS ordering.
space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

# Figure with a 2x3 grid; the first axes spans the whole top row.
plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])