def get_cluster_medoid_positions_OPTICS(
    file_list, cgmodel, min_samples=5, xi=0.05,
    frame_start=0, frame_stride=1, frame_end=-1,
    output_format="pdb", output_dir="cluster_output", output_cluster_traj=False,
    plot_silhouette=True, plot_rmsd_hist=True, filter=True, filter_ratio=0.05):
    """
    Given PDB or DCD trajectory files and a coarse-grained model as input, this function performs OPTICS clustering on the poses in the trajectory, and returns a list of the coordinates for the medoid pose of each cluster.

    :param file_list: A list of PDB or DCD files to read and concatenate
    :type file_list: List( str )

    :param cgmodel: A CGModel() class object
    :type cgmodel: class
    
    :param min_samples: minimum number of samples in the neighborhood of a point for it to be considered a core point (includes the point itself)
    :type min_samples: int
    
    :param xi: OPTICS parameter setting the minimum steepness on the reachability plot that signifies a cluster boundary
    :type xi: float

    :param frame_start: First frame in each trajectory file to use for clustering.
    :type frame_start: int

    :param frame_stride: Advance by this many frames when reading trajectories.
    :type frame_stride: int

    :param frame_end: Last frame in each trajectory file to use for clustering.
    :type frame_end: int
    
    :param output_format: file format extension to write medoid coordinates to (default="pdb"); dcd is also supported
    :type output_format: str
    
    :param output_dir: directory to write clustering medoid and plot files
    :type output_dir: str
    
    :param output_cluster_traj: option to write the members of each cluster to separate trajectory files (default=False)
    :type output_cluster_traj: boolean
    
    :param plot_silhouette: option to create a silhouette plot (default=True)
    :type plot_silhouette: boolean
    
    :param plot_rmsd_hist: option to plot a histogram of the pairwise rmsd values (default=True)
    :type plot_rmsd_hist: boolean
    
    :param filter: option to apply neighborhood radius filtering to remove low-density data (default=True)
    :type filter: boolean
    
    :param filter_ratio: fraction of data points which pass through the neighborhood radius filter (default=0.05)
    :type filter_ratio: float

    :returns:
       - medoid_positions ( np.array( float * unit.nanometer ( n_clusters x num_particles x 3 ) ) ) - A 3D numpy array of poses corresponding to the medoids of all trajectory clusters.
       - cluster_sizes ( List ( int ) ) - A list of the number of members in each cluster
       - cluster_rmsd ( np.array ( float ) ) - A 1D numpy array of the rmsd (in cluster distance space) of samples to their cluster medoids
       - n_noise ( int ) - number of points classified as noise
       - silhouette_avg ( float ) - average silhouette score across all clusters
       - labels ( np.array ( int ) ) - cluster label assigned to each retained frame, with -1 denoting noise
       - original_indices ( np.array ( int ) ) - indices of the retained frames in the original concatenated trajectory
    """    
    
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    top_from_pdb = None
    if cgmodel is None:
        top_from_pdb = file_list[0]
    
    distances, traj_all, original_indices = get_rmsd_matrix(file_list, cgmodel, frame_start, frame_stride, frame_end, return_original_indices=True)
    
    if filter:
        # Filter distances:
        distances, dense_indices, filter_ratio_actual, original_indices = \
            filter_distances(distances, filter_ratio=filter_ratio, return_original_indices = True, original_indices = original_indices)
        
        traj_all = traj_all[dense_indices]

    
    if plot_rmsd_hist:
        # Plot rmsd histogram:
        distances_row = np.reshape(distances, (distances.shape[0]*distances.shape[1],1))
        
        # Remove the diagonal 0 elements:
        distances_row = distances_row[distances_row != 0]
        
        figure = plt.figure()
        n_out, bin_edges_out, patch = plt.hist(
            distances_row, bins=1000,density=True)
        plt.xlabel('rmsd')
        plt.ylabel('probability density')
        plt.savefig(f'{output_dir}/distances_rmsd_hist.pdf')
        plt.close()        
    
    
    # Cluster with sklearn OPTICS
    optic = OPTICS(min_samples=min_samples,xi=xi,cluster_method='xi',metric='precomputed').fit(distances)
    # This produces cluster labels from 0 to n_clusters-1, and assigns -1 to noise points
    
    # Get labels
    labels = optic.labels_
    
    # Number of clusters:
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    
    # Number of noise points:
    n_noise = list(labels).count(-1)
    
    # Get indices of frames in each cluster:
    cluster_indices = {}
    cluster_sizes = []   
    for k in range(n_clusters):
        cluster_indices[k] = np.argwhere(labels==k)[:,0]
        cluster_sizes.append(len(cluster_indices[k]))      
        
    # Get indices of frames classified as noise:
    noise_indices = np.argwhere(labels==-1)[:,0]
        
    # Find the structure closest to each center (medoid):
    # OPTICS/DBSCAN does not have a built-in function to transform to cluster-distance space,
    # as the centroids of the clusters are not physically meaningful in general. However, as
    # RMSD between structures is our only clustering feature, the cluster centers (regions of
    # high density) will likely be representative structures of each cluster.

    # Following the protocol outlined in MDTraj example:
    # http://mdtraj.org/1.9.3/examples/centroids.html
    
    # Create distance matrices within each cluster:
    distances_k = {}
    for k in range(n_clusters):
        distances_k[k] = np.zeros((cluster_sizes[k],cluster_sizes[k]))
        for i in range(cluster_sizes[k]):
            for j in range(cluster_sizes[k]):
                distances_k[k][i,j] = distances[cluster_indices[k][i],cluster_indices[k][j]]
    
    # Compute medoid based on similarity scores:
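    # (the medoid of cluster k is the frame i maximizing sum_j exp(-d_ij / sigma_k),
    #  where sigma_k is the standard deviation of that cluster's pairwise distances)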
    medoid_index = [] # Global index
    intra_cluster_medoid_index = [] # Index within cluster
    for k in range(n_clusters):
        intra_cluster_medoid_index.append(
            np.exp(-distances_k[k] / distances_k[k].std()).sum(axis=1).argmax()
        )
        # Here we need to use the global sample index to find the medoid structure:
        medoid_index.append(cluster_indices[k][intra_cluster_medoid_index[k]])
            
    medoid_xyz = np.zeros([n_clusters,traj_all.n_atoms,3])
    for k in range(n_clusters):
        medoid_xyz[k,:,:] = traj_all[medoid_index[k]].xyz[0]
        
    # Write medoids to file
    write_medoids_to_file(cgmodel, medoid_xyz, output_dir, output_format, top_from_pdb=top_from_pdb)
    medoid_positions = medoid_xyz * unit.nanometer

    if output_cluster_traj:
        write_clusters_to_file(labels, traj_all, output_dir, output_format)
    
    # Compute intra-cluster rmsd of samples to medoid based on structure rmsd
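    # (for each cluster this is the root-mean-square of the medoid's distances to all cluster members)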
    cluster_rmsd = np.zeros(n_clusters)
    
    for k in range(n_clusters):
        cluster_rmsd[k] = np.sqrt(((distances_k[k][intra_cluster_medoid_index[k]]**2).sum())/len(cluster_indices[k]))
    
    # Get silhouette scores
    try:
        silhouette_sample_values = silhouette_samples(distances, labels)
        silhouette_avg = np.mean(silhouette_sample_values[labels!=-1])
    
        if plot_silhouette:
            # Plot silhouette analysis
            plotfile = f"{output_dir}/silhouette_optics_min_sample_{min_samples}_xi_{xi}.pdf"
                
            make_silhouette_plot(
                optic, silhouette_sample_values, silhouette_avg,
                n_clusters, cluster_rmsd, cluster_sizes, plotfile
                )
    except ValueError:
        print("There are either no clusters, or no noise points identified. Try adjusting OPTICS min_samples, xi parameters")
        silhouette_avg = None
        
    return medoid_positions, cluster_sizes, cluster_rmsd, n_noise, silhouette_avg, labels, original_indices
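
# A minimal usage sketch (assumptions: "replica_1.pdb" and "replica_2.pdb" are
# hypothetical trajectory files; passing cgmodel=None reads the topology from
# the first PDB file, as handled above):
if __name__ == "__main__":
    (medoid_positions, cluster_sizes, cluster_rmsd, n_noise,
     silhouette_avg, labels, original_indices) = get_cluster_medoid_positions_OPTICS(
        ["replica_1.pdb", "replica_2.pdb"],
        cgmodel=None,
        min_samples=10,
        xi=0.05,
        frame_stride=10,
        output_dir="cluster_output",
    )
    print(f"{len(cluster_sizes)} clusters found; {n_noise} frames classified as noise")
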
                j += 1
                flag = True
            else:
                break

        if flag and traj[j - 1, 0] - traj[i, 0] > tThresh:  #
            styPt = np.mean(traj[i:j], axis=0)
            styPts.append(styPt)
            i = j
        else:
            i += 1
    return np.array(styPts)


if __name__ == '__main__':
    import util
    data_dir = "data_NCSU"
    # traces_files = [trace for trace in os.listdir(data_dir) if re.match(r'\d+\.trace',trace)]
    traces_files = util.get_trace_files(data_dir)
    locH = []
    for trace_file in traces_files:
        trace = np.loadtxt(trace_file)
        locH.append(detect_staypoints(trace, 90, 10))

    X = [h[:, 1:3] for h in locH]
    X = np.vstack(tuple(X))
    clust = OPTICS(min_samples=100, xi=.05, min_cluster_size=.05)
    clust.fit(X)
    util.plt_clusters(clust, X)
    print("hello")
Example #3
        # FeatureAgglomeration does not have fit_predict and fails in this version
        # 'FeatureAgglomeration_100'   : FeatureAgglomeration(n_clusters=100),
        # 'FeatureAgglomeration_150'   : FeatureAgglomeration(n_clusters=150),
        # 'FeatureAgglomeration_200'   : FeatureAgglomeration(n_clusters=200),
        # 'FeatureAgglomeration_250'   : FeatureAgglomeration(n_clusters=250),
        # 'FeatureAgglomeration_300'   : FeatureAgglomeration(n_clusters=300),
        # 'FeatureAgglomeration_350'   : FeatureAgglomeration(n_clusters=350),
        # 'FeatureAgglomeration_400'   : FeatureAgglomeration(n_clusters=400),
        'MiniBatchKMeans_100': MiniBatchKMeans(n_clusters=100),
        'MiniBatchKMeans_150': MiniBatchKMeans(n_clusters=150),
        'MiniBatchKMeans_200': MiniBatchKMeans(n_clusters=200),
        'MiniBatchKMeans_250': MiniBatchKMeans(n_clusters=250),
        'MiniBatchKMeans_300': MiniBatchKMeans(n_clusters=300),

        # 'OPTICS_0_5'                :OPTICS(eps = 0.5, min_samples = 2),
        'OPTICS_1_0': OPTICS(eps=1.5, min_samples=2),
        # 'OPTICS_1_5'                :OPTICS(eps = 2.0, min_samples = 2),
        # 'OPTICS_2_5'                :OPTICS(eps = 2.5, min_samples = 2),
        # 'OPTICS_3_0'                :OPTICS(eps = 3.0, min_samples = 2),
        'MeanShift_1_0': MeanShift(bandwidth=1.0),
        'MeanShift_1_5': MeanShift(bandwidth=1.5),
        'MeanShift_2_0': MeanShift(bandwidth=2.0),
        'MeanShift_2_5': MeanShift(bandwidth=2.5),
        'MeanShift_3_0': MeanShift(bandwidth=3.0),
    }

    # test all combinations
    results = []
    for model_key in models.keys():
        for df_key in dfs.keys():
# db = DBSCAN().fit(data)

# score=metrics.normalized_mutual_info_score(digits.target,db.labels_,average_method='arithmetic')
# print(score)
bench_show3(DBSCAN(),name="DBSCAN", data=data)


# OPTICS clustering
# clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
#
# # Run the fit
# clust.fit(data)
# score=metrics.normalized_mutual_info_score(digits.target,clust.labels_,average_method='arithmetic')
# print(score)

bench_show(OPTICS(min_samples=50, xi=.05, min_cluster_size=.05),name="OPTICS", data=data)

# Gaussian mixture model
#gmm = mixture.GaussianMixture(n_components=n_digits, covariance_type='full').fit(data)
# score=metrics.normalized_mutual_info_score(digits.target,gmm.predict(data),average_method='arithmetic')
# print(score)

bench_show2(mixture.GaussianMixture(n_components=n_digits, covariance_type='full'),name="Gaussian", data=data)

# Birch
# brc = Birch(branching_factor=50, n_clusters=n_digits, threshold=0.5, compute_labels=True)
# brc.fit(data)
# score=metrics.normalized_mutual_info_score(digits.target,brc.labels_,average_method='arithmetic')
# print(score)

bench_show2(Birch(branching_factor=50, n_clusters=n_digits, threshold=0.5, compute_labels=True),name="Birch", data=data)
Example #5
  
  vectors = []
  for word in sentence:
    if word in model:
      vectors.append(model[word])

  df_vectors = pd.DataFrame(vectors)
  # Average the word vectors so the whole sentence gets a single "average word vector"
  mean_vector = df_vectors.mean(axis=0).values.tolist()

  entry_vectors.append(mean_vector)

df['vector'] = entry_vectors

# Clustering
xi = .07
clust = OPTICS(min_samples=2, xi=xi)
labels = clust.fit_predict(entry_vectors)
df['label'] = labels

pd.set_option('display.max_colwidth', None) # show long strings in full (-1 is deprecated)

# Select columns
df = df.filter(items=['label', 'feed', 'entry'])
# Drop uncategorized rows
df = df[df['label'] >= 0]
# Sort
df = df.sort_values(by='label')

print(df.to_string())
Example #6
def generate_model(df: pd.DataFrame, label_column: Optional[str]) -> Dict:

    num_cols = get_numeric_columns(df=df)
    cat_cols = get_text_categorical_columns(df=df)

    if label_column is None:
        print('clustering')
        '''
        !!!Preprocessing!!!
        This applies to all processing throughout this file.
        I tried to reduce the data sets to data that can run on these very general data sets.
        An example of this may be removing all text columns for clustering, as there are more cases where this is a 
        good idea than it is not.
        Similar to how you'll see me use my models, I don't think this file will work very well for any data set, but 
        perhaps ok for a lot of data sets. I discuss this more with the models but there is truly no free lunch.
        '''

        # drop categorical columns
        for i in cat_cols:
            df = df.drop(i, axis=1)

        # drop nans
        for i in df.columns:
            df[i].fillna(value=0, inplace=True)
            df = df[df[i] != 0]

        # scale everything down for PCA
        for i in num_cols:
            df[i] = normalize_column(df_column=df[i])
        '''
        !!!Explanation!!!
        Using PCA for dimensionality reduction!
        by using n_components = 0.90, we are keeping 90% of the datasets variance within the amount of features
        projected will have 
        '''
        pca = PCA(n_components=0.90)
        projected = pca.fit_transform(X=df)
        '''
        !!!Models!!!
        This applies for all of my model sections.
        THERE IS NO FREE LUNCH!! 
        Not expecting these models to run perfectly, or even well for every single data set you throw at them.
        Though I did try to make them capable of being as general as possible. 
        For example, this includes setting my trees to be very shallow!
        I think if I can make sure no model is becoming too specific (like a deep tree would), I can perhaps swing at
        least an average of a generally low score for all my models, which for the possibility of any data set ever
        being thrown at it, I would be happy with those results. 
        For this reason, I chose my models based on trying to cover as much ground as I can. For example, with 
        classification I know Naive Bayes may not be great in some cases, that is why the other models are there; but 
        for cases where Naive Bayes is very useful, it will be there to shine! I believe trying to cover as many
        general data sets as possible, the files may perform mediocre on any data set you may throw at it (within 
        reason lol), once again I don't think there could be a file such as this that could run 99% on any dataset...
        and if someone does find it I think instead of submitting it for marks they should sell it for billions ;)
        I think the keyword here is generality!!
        '''
        '''
        !!!DBScan!!!
        '''

        eps = [0.0001, 0.001, 0.01, 0.1, 1, 10]
        mins = [10, 15, 20, 30]

        db_scores = []
        for i in eps:
            for n in mins:
                model = DBSCAN(eps=i, min_samples=n)
                clusters = model.fit(projected)
                score = metrics.silhouette_score(projected, model.labels_)
                db_scores.append(dict(model=model, score=score))

        best_db = dict(model=None, score=0)
        for i in range(len(db_scores)):
            if db_scores[i]['score'] > best_db['score']:
                best_db['score'] = db_scores[i]['score']
                best_db['model'] = db_scores[i]['model']
        '''
        MeanShift
        '''

        ms_scores = []
        bands = [2, 4, 6, 8, 10]
        for i in bands:
            model = MeanShift(bandwidth=i)
            clusters = model.fit(projected)
            score = metrics.silhouette_score(projected, model.labels_)
            ms_scores.append(dict(model=model, score=score))

        best_ms = dict(model=None, score=0)
        for i in range(len(ms_scores)):
            if ms_scores[i]['score'] > best_ms['score']:
                best_ms['score'] = ms_scores[i]['score']
                best_ms['model'] = ms_scores[i]['model']
        '''
        OPTICS
        '''

        o_scores = []
        eps = [0.0001, 0.001, 0.01, 0.1, 1, 10]
        mins = [10, 15, 20, 30]
        for i in eps:
            for n in mins:
                model = OPTICS(min_samples=n, max_eps=i)
                clusters = model.fit(projected)
                score = metrics.silhouette_score(projected, model.labels_)
                o_scores.append(dict(model=model, score=score))

        best_o = dict(model=None, score=0)
        for i in range(len(o_scores)):
            if o_scores[i]['score'] > best_o['score']:
                best_o['score'] = o_scores[i]['score']
                best_o['model'] = o_scores[i]['model']
        '''
        Hierarchical 
        '''

        hier_scores = []
        aff = ['euclidean', 'cosine', 'l1', 'l2', 'manhattan']
        for i in aff:
            model = AgglomerativeClustering(affinity=i, linkage='average')  # default 'ward' linkage only supports euclidean
            clusters = model.fit(projected)
            score = metrics.silhouette_score(projected, model.labels_)
            hier_scores.append(dict(model=model, score=score))
        best_h = dict(model=None, score=0)

        for i in range(len(hier_scores)):
            if hier_scores[i]['score'] > best_h['score']:
                best_h['score'] = hier_scores[i]['score']
                best_h['model'] = hier_scores[i]['model']
        '''
        Return Best Cluster!
        '''

        best_scores = np.array([
            best_db['score'], best_ms['score'], best_o['score'],
            best_h['score']
        ])
        best = best_scores.max()

        if best == best_h['score']:
            print(best_h)
            return best_h
        elif best == best_ms['score']:
            print(best_ms)
            return best_ms
        elif best == best_o['score']:
            print(best_o)
            return best_o
        else:
            print(best_db)
            return best_db

    elif label_column in num_cols:
        print('regressor')
        '''
        !!!Processing!!!
        '''

        # label encode non numeric
        for i in cat_cols:
            le = LabelEncoder()
            df[i] = le.fit_transform(df[i])

        # replace nans with mean for each column
        for i in df.columns:
            df[i].fillna(value=df[i].mean(), inplace=True)

        # partition df to x and y
        y = df[label_column]
        df = df.drop(label_column, axis=1)
        x = df
        '''
        !!!Models!!!
        '''

        X_train, X_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.30)
        '''
        !!!Decision Tree Regressor!!!
        '''
        dt_scores = []
        depths = [4, 6, 8, 10]
        splits = [4, 6, 8, 10]
        impurity = [0.2, 0.3, 0.4]
        for i in depths:
            for n in impurity:
                for m in splits:
                    model = DecisionTreeRegressor(max_depth=i,
                                                  min_impurity_decrease=n,
                                                  min_samples_split=m)
                    model.fit(X_train, y_train)
                    y_predict = model.predict(X_test)
                    score = model.score(X_test, y_test)
                    dt_scores.append(dict(model=model, score=score))

        best_dt = dict(model=None, score=0)
        for i in range(len(dt_scores)):
            if dt_scores[i]['score'] > best_dt['score']:
                best_dt['score'] = dt_scores[i]['score']
                best_dt['model'] = dt_scores[i]['model']
        '''
        !!!Random Forest Regressor!!!
        '''

        rf_scores = []
        estimators = [50, 70, 100, 120]
        for q in estimators:
            for i in depths:
                for n in impurity:
                    for m in splits:
                        model = RandomForestRegressor(n_estimators=q,
                                                      max_depth=i,
                                                      min_impurity_decrease=n,
                                                      min_samples_split=m)
                        model.fit(X_train, y_train)
                        y_predict = model.predict(X_test)
                        score = model.score(X_test, y_test)
                        rf_scores.append(dict(model=model, score=score))

        best_rf = dict(model=None, score=0)
        for i in range(len(rf_scores)):
            if rf_scores[i]['score'] > best_rf['score']:
                best_rf['score'] = rf_scores[i]['score']
                best_rf['model'] = rf_scores[i]['model']
        '''
        !!!KNeighbours Regressor!!!
        '''

        neighs = [3, 6, 9, 12, 15]
        kn_scores = []
        for i in neighs:
            model = KNeighborsRegressor(n_neighbors=i)
            model.fit(X_train, y_train)
            y_predict = model.predict(X_test)
            score = model.score(X_test, y_test)
            kn_scores.append(dict(model=model, score=score))

        best_kn = dict(model=None, score=0)
        for i in range(len(kn_scores)):
            if kn_scores[i]['score'] > best_kn['score']:
                best_kn['score'] = kn_scores[i]['score']
                best_kn['model'] = kn_scores[i]['model']
        '''
        !!!Return Best!!!
        '''

        best_scores = np.array(
            [best_dt['score'], best_rf['score'], best_kn['score']])
        best = best_scores.max()

        print(best_scores)

        if best == best_kn['score']:
            print(best_kn)
            return best_kn
        elif best == best_dt['score']:
            print(best_dt)
            return best_dt
        else:
            print(best_rf)
            return best_rf

    elif label_column in cat_cols:

        print('label_column is categorical!')
        '''
        !!!Preprocessing!!!
        '''

        # label encode non numeric
        for i in cat_cols:
            le = LabelEncoder()
            df[i] = le.fit_transform(df[i])

        # replace nans with mean for each column
        for i in df.columns:
            df[i].fillna(value=df[i].mean(), inplace=True)

        # partition df to x and y
        y_encoded = df[label_column]
        df = df.drop(label_column, axis=1)
        x = df
        '''
        !!!Models!!!
        '''
        '''
        !!!Explanation and Citation!!!
        In order to average out multiple instances of the model, I use k-fold cross validation for training and testing
        of the model.
        I used the from cross_val_score function from scikit learn model selection library to do this
        I used this Youtube tutorial to learn how to effectively use this method.
        https://www.youtube.com/watch?v=gJo0uNL-5Qw.
        Thus I would like to cite the Youtube user "codebasics" from Jan. 26, 2019 for helping me figure out this 
        library.
        '''
        '''
        Decision Tree
        '''

        dt_scores = []
        depths = [4, 6, 8, 10]
        splits = [4, 6, 8, 10]
        impurity = [0.2, 0.3, 0.4]
        for i in depths:
            for n in impurity:
                for m in splits:
                    model = DecisionTreeClassifier(max_depth=i,
                                                   min_impurity_decrease=n,
                                                   min_samples_split=m)
                    score = np.average(cross_val_score(model, x, y_encoded))
                    dt_scores.append(dict(model=model, score=score))

        best_dt = dict(model=None, score=0)
        for i in range(len(dt_scores)):
            if dt_scores[i]['score'] > best_dt['score']:
                best_dt['score'] = dt_scores[i]['score']
                best_dt['model'] = dt_scores[i]['model']
        '''
        Random Forest
        '''

        rf_scores = []
        estimators = [50, 70, 100, 120]
        for q in estimators:
            for i in depths:
                for n in impurity:
                    for m in splits:
                        model = RandomForestClassifier(n_estimators=q,
                                                       max_depth=i,
                                                       min_impurity_decrease=n,
                                                       min_samples_split=m)
                        score = np.average(cross_val_score(
                            model, x, y_encoded))
                        rf_scores.append(dict(model=model, score=score))

        best_rf = dict(model=None, score=0)
        for i in range(len(rf_scores)):
            if rf_scores[i]['score'] > best_rf['score']:
                best_rf['score'] = rf_scores[i]['score']
                best_rf['model'] = rf_scores[i]['model']
        '''
        K-Neighbours
        '''

        neighs = [3, 6, 9, 12, 15]
        kn_scores = []
        for i in neighs:
            model = KNeighborsClassifier(n_neighbors=i)
            score = np.average(cross_val_score(model, x, y_encoded))
            kn_scores.append(dict(model=model, score=score))

        best_kn = dict(model=None, score=0)
        for i in range(len(kn_scores)):
            if kn_scores[i]['score'] > best_kn['score']:
                best_kn['score'] = kn_scores[i]['score']
                best_kn['model'] = kn_scores[i]['model']
        '''
        GaussianNB
        '''

        gnb_scores = []
        model = GaussianNB()
        score = np.average(cross_val_score(model, x, y_encoded))

        gnb_scores.append(dict(model=model, score=score))
        best_gnb = gnb_scores[0]
        '''
        Return Best Classifier!
        '''

        best_scores = np.array([
            best_dt['score'], best_rf['score'], best_kn['score'],
            best_gnb['score']
        ])
        best = best_scores.max()

        if best == best_gnb['score']:
            print(best_gnb)
            return best_gnb
        elif best == best_kn['score']:
            print(best_kn)
            return best_kn
        elif best == best_dt['score']:
            print(best_dt)
            return best_dt
        else:
            print(best_rf)
            return best_rf

    else:
        print('Error: label_column is not valid.')
        print(
            'It must be relate to a column in df that is of categorical data type, numeric data type, '
            + 'or is simply None. Returning empty dict object.')
        return dict(model=None, final_score=None)
Example #7
plt.figure(figsize=(10, 10))
sns.scatterplot(principalDf["principal component 1"],
                principalDf["principal component 2"],
                hue=principalDf["kMeans"],
                markers="1",
                palette="Accent").set_title("PCA of kMeans analysis")

del (dataCopy, principalComponents, principalDf)

# OPTICS Clustering
# explanation of the methods is in the sklearn documentation

from sklearn.cluster import OPTICS

optics = OPTICS(min_samples=5, xi=.05, min_cluster_size=5)

dataCopy = data.copy()
del (dataCopy["Gate"])
resOptics = optics.fit_predict(dataCopy)
dataCopy["optics"] = resOptics

#sns.pairplot(dataCopy, diag_kind="kde", markers="1", hue = "optics")

dataCopy["optics"].value_counts()

#DBSCAN Clustering

from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=121, min_samples=10)
Example #8
plt.ylabel('n')
plt.xlabel('Epsilon distance')
# from the k-distance graph we see that eps = 5 or 6 is an appropriate value
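
# A minimal sketch of how such a k-distance graph can be computed (the original
# computation is not included in this excerpt; `clData` and `plt` are reused
# from the surrounding code, and n_neighbors=6 mirrors the min_samples used for
# DBSCAN below):
from sklearn.neighbors import NearestNeighbors
import numpy as np
kNeigh = NearestNeighbors(n_neighbors=6).fit(clData)
kDistances, _ = kNeigh.kneighbors(clData)
kDistances = np.sort(kDistances[:, -1])  # distance to each point's 6th neighbour (the point itself is the 1st)
plt.plot(kDistances, range(len(kDistances)))  # look for the "elbow" to choose eps
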
from sklearn.cluster import DBSCAN
from sklearn import metrics

dbscanClustering = DBSCAN(eps=5, min_samples=6).fit(clData)
dbscanLabels = dbscanClustering.labels_
cluster = list(dbscanLabels)
plotter(x, y, cluster, clusteringPlotsPath + "DBSCAN.pdf")
# points labeled -1 are noise

##############      OPTICS       ##############
from sklearn.cluster import OPTICS

opticsClustering = OPTICS(min_samples=50, xi=0.05, max_eps=10)
opticsLabels = opticsClustering.fit_predict(clData)
cluster = list(opticsLabels)

plotter(x, y, cluster, clusteringPlotsPath + "OPTICS.pdf")
# points labeled -1 are noise

###########       Hierarchical Clustering         ##############
from sklearn.cluster import AgglomerativeClustering

HierClustering = AgglomerativeClustering().fit(clData)
agglomerativeLabels = HierClustering.labels_
cluster = list(agglomerativeLabels)

plotter(x, y, cluster, clusteringPlotsPath + "Agglomerative_Clustering.pdf")
Example #9
def prepdf_or_featureselection(mydata, myfeature_importances=None, prep=True):

    #create location feature for data using optics clustering
    optics_df = mydata[['Latitude', 'Longitude']].copy()
    clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
    clust.fit(optics_df)
    #
    optics_df['clust_label'] = clust.labels_
    #
    location_max = np.max(optics_df.clust_label.unique())
    #optics labels noisy samples as -1 need to replace for successful onehotencoding
    optics_df['clust_label'].replace([-1], location_max + 1, inplace=True)
    #one hot encoding and combining to mydata
    enc = OneHotEncoder(categories='auto')

    optics_df_1hot = enc.fit_transform(optics_df[['clust_label']])

    location_labels = [
        'cluster' + str(l) for l in optics_df.clust_label.unique()
    ]

    optics_df_1hot = pd.DataFrame(optics_df_1hot.todense(),
                                  index=optics_df.index,
                                  columns=location_labels)
    # part 1 done: cluster columns added

    #print(mydata.shape[1])#39
    mydata = pd.concat([mydata, optics_df_1hot], axis=1)

    #print(mydata.shape[1])#42
    # drop unnecessary columns in our case
    mydata = mydata.drop([
        'city', 'Latitude', 'Longitude', 'change_hunits', 'studio_1000_1499',
        'studio_1500_more', 'studio_750_999', 'onebed_1000_1499',
        'onebed_1500_more', 'onebed_750_999', 'twobed_1000_1499',
        'twobed_1500_more', 'twobed_750_999', 'threebed_1000_1499',
        'threebed_1500_more', 'threebed_750_999'
    ],
                         axis=1)
    feature_data = mydata.copy()
    mydata = mydata.drop('med_rental_rate', axis=1)

    if prep:
        mydatacolumns = mydata.columns
        #print(mydata.shape[1])#37

        #prepare data section

        imputer = IterativeImputer(max_iter=10, random_state=22, min_value=0)
        mydata = imputer.fit_transform(mydata)
        # scale only the numerical attributes, which are everything but the columns appended earlier
        #print(len(location_labels),mydata.shape[1])
        num_attrbs = mydata.shape[1] - len(location_labels)

        ct_columns = list(range(num_attrbs))

        ct = ColumnTransformer([('scale1', RobustScaler(), ct_columns)],
                               remainder='passthrough')

        mydata = ct.fit_transform(mydata)
        myfeature_selection = 'onlyprep_selected'

    else:
        num_pipeline = Pipeline([('imputer',
                                  IterativeImputer(max_iter=10,
                                                   random_state=22,
                                                   min_value=0)),
                                 ('rob_scaler', RobustScaler())])
        #only num attributes
        num_attrbs = mydata.shape[1] - len(location_labels)
        ct_columns = list(range(num_attrbs))

        full_pipeline = ColumnTransformer([('num', num_pipeline, ct_columns)],
                                          remainder='passthrough')
        #Thanks to Aurelien Geron https://github.com/ageron for TopFeatureSelector
        k = 10
        feature_importances = myfeature_importances

        def indices_of_top_k(arr, k):
            return np.sort(np.argpartition(np.array(arr), -k)[-k:])

        class TopFeatureSelector(BaseEstimator, TransformerMixin):
            def __init__(self, feature_importances, k):
                self.feature_importances = feature_importances
                self.k = k

            def fit(self, X, y=None):
                self.feature_indices_ = indices_of_top_k(
                    self.feature_importances, self.k)
                return self

            def transform(self, X):
                return X[:, self.feature_indices_]

        prepare_select_and_predict_pipeline = Pipeline([
            ('preparation', full_pipeline),
            ('feature_selection', TopFeatureSelector(feature_importances, k)),
            ('rf_reg', RandomForestRegressor(random_state=22,
                                             n_estimators=100))
        ])

        param_grid = [{
            'feature_selection__k':
            list(range(1,
                       len(feature_importances) + 1))
        }]

        grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline,
                                        param_grid,
                                        cv=5,
                                        scoring='neg_mean_squared_error',
                                        n_jobs=-1)

        grid_search_prep.fit(feature_data.drop('med_rental_rate', axis=1),
                             feature_data['med_rental_rate'].copy())
        myfeature_selection = grid_search_prep.best_params_
        mydata = 'onlyprep_selected'
        mydatacolumns = 'onlyprep_selected'
    return {
        'mydata': mydata,
        'mydatacolumns': mydatacolumns,
        'myfeature_selection': myfeature_selection
    }
Example #10
# normalization
data = pre_processing.scaling(data)

#
# OPTICS
# Refs:
#  https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html
#  https://scikit-learn.org/stable/modules/clustering.html#optics
#  https://scikit-learn.org/stable/auto_examples/cluster/plot_optics.html#sphx-glr-auto-examples-cluster-plot-optics-py
#
start_time = time.time()
print("[i] OPTICS Clustering: min_samples = {} ...\n".format(min_samples))
clust = OPTICS(min_samples=min_samples,
               xi=0.1,
               min_cluster_size=0.1,
               n_jobs=4,
               algorithm="ball_tree")
# Run the fit
clust.fit(data)
print("successfully clustered!")
print("[i] Run Time: {}".format((time.time() - start_time)))

# Performs DBSCAN extraction for an arbitrary epsilon.
#Ref: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.cluster_optics_dbscan.html#sklearn.cluster.cluster_optics_dbscan

# 0.22 is the knee of the neighbor-distance curve
#labels_022 = cluster_optics_dbscan(reachability=clust.reachability_, core_distances=clust.core_distances_, ordering=clust.ordering_, eps=0.22)

# 0.8 is the epsilon used by the Indian author, who reported an F-score of 96%
#labels_080 = cluster_optics_dbscan(reachability=clust.reachability_, core_distances=clust.core_distances_, ordering=clust.ordering_, eps=0.8)
Example #11
def similarity(x, y):

    max_len = max(len(x[np.where(x > 0)]), len(y[np.where(y > 0)]))
    totals = np.add(x, y)
    total_incommon = len(totals[np.where(totals > 1)])

    result = 1 - (total_incommon / max(max_len, 0.000001)
                  )  # max ensures no division by 0.

    return result
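
# Note: to cluster with the custom `similarity` defined above, it could be
# passed directly as the metric (sklearn's OPTICS accepts a callable), e.g.
# OPTICS(min_samples=2, metric=similarity); the call below uses the built-in
# "minkowski" metric instead.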


clust = OPTICS(min_samples=2,
               min_cluster_size=2,
               metric="minkowski",
               n_jobs=10,
               cluster_method='dbscan',
               max_eps=0.5)

# Run the fit
res = clust.fit(data)

result = res.labels_  # cluster label for each sample (fit() returns the estimator itself)

d_testing = data.assign(cluster=result)
d_testing = d_testing.assign(concept=names)

csvRes = d_testing[["concept",
                    'cluster']].sort_values(by=['cluster', 'concept'],
                                            ascending=False)
csvRes = pd.merge(csvRes, cuis, on='concept')
Example #12
def test_clustering_results(z, edgeList, args):
    '''
    Try different clustering methods without known cell types
    '''
    try:
        # graph Louvain
        print("Louvain")
        listResult, size = generateLouvainCluster(edgeList)
        measure_clustering_results(z, listResult)
    except:
        pass

    try:
        # KMeans
        print("KMeans")
        clustering = KMeans(n_clusters=args.n_clusters, random_state=0).fit(z)
        listResult = clustering.predict(z)
        measure_clustering_results(z, listResult)
    except:
        pass

    # try:
    #     #Spectral Clustering
    #     print("SpectralClustering")
    #     clustering = SpectralClustering(n_clusters=args.n_clusters, assign_labels="discretize", random_state=0).fit(z)
    #     listResult = clustering.labels_.tolist()
    #     measure_clustering_results(z,listResult)
    # except:
    #     pass

    try:
        # AffinityPropagation
        print("AffinityPropagation")
        clustering = AffinityPropagation().fit(z)
        listResult = clustering.predict(z)
        measure_clustering_results(z, listResult)
    except:
        pass

    try:
        # AgglomerativeClustering
        print("AgglomerativeClustering")
        clustering = AgglomerativeClustering().fit(z)
        listResult = clustering.labels_.tolist()
        measure_clustering_results(z, listResult)
    except:
        pass

    try:
        # Birch
        print("Birch")
        clustering = Birch(n_clusters=args.n_clusters).fit(z)
        listResult = clustering.predict(z)
        measure_clustering_results(z, listResult)
    except:
        pass

    # #DBSCAN
    # print("DBSCAN")
    # clustering = DBSCAN().fit(z)
    # listResult = clustering.labels_.tolist()
    # measure_clustering_results(z,listResult)

    # FeatureAgglomeration
    # print("FeatureAgglomeration")
    # clustering = FeatureAgglomeration(n_clusters=args.n_clusters).fit(z)
    # listResult = clustering.labels_.tolist()
    # measure_clustering_results(z,listResult)

    # MeanShift
    # print("MeanShift")
    # clustering = MeanShift().fit(z)
    # listResult = clustering.predict(z)
    # measure_clustering_results(z,listResult)

    try:
        # OPTICS
        print("OPTICS")
        clustering = OPTICS().fit(z)
        listResult = clustering.labels_.tolist()
        measure_clustering_results(z, listResult)
    except:
        pass
Example #13
    def class3_output(hz_fft3, num_fft3):
        max_sum = 0
        max_index = -1
        min_sum = 0
        min_index = -1
        fft = []
        # Combine the maximum-frequency and peak-count lists into a single list
        for i in range(len(hz_fft3)):
            fft.append([hz_fft3[i], num_fft3[i]])
        fft = np.array(fft)

        # Normalize the list to min -1, max 1
        normal_fft = scipy.stats.zscore(fft).tolist()
        # From the normalized list, extract the entries with the smallest and largest sums
        for i in range(len(normal_fft)):
            sum_fft = normal_fft[i][0]+normal_fft[i][1]
            if max_sum < sum_fft:
                max_sum = sum_fft
                max_index = [normal_fft[i][0], normal_fft[i][1]]
            if min_sum > sum_fft:
                min_sum = sum_fft
                min_index = [normal_fft[i][0], normal_fft[i][1]]

        # List of data to be classified; each element is a list of floats
        vectors = np.array(normal_fft)
        # Cluster the data to be classified with a cluster count of 3
        clustering = OPTICS(**pram3).fit(vectors)

        # Plot each feature point according to its label
        label = clustering.labels_
        print(label)

        '''
        for i in range(len(normal_fft)):
            if label[i]==0:
                fft_0x.append(normal_fft[i][0])
                fft_0y.append(normal_fft[i][1])
            elif label[i]==1:
                fft_1x.append(normal_fft[i][0])
                fft_1y.append(normal_fft[i][1])
            else:
                fft_2x.append(normal_fft[i][0])
                fft_2y.append(normal_fft[i][1])
        '''

        # figure
        fig = plt.figure(figsize=(14,10))
        ax = fig.add_subplot(1, 1, 1)

        clist = ['gray', 'blue', 'orange', 'green', 'red', 'purple', 'brown', 'yellow']

        # plot
        for i in range(len(vectors)):
            ax.scatter(vectors[i,0], vectors[i,1], color=clist[label[i]+1], s=36)

        #plt.title('Method-2', fontsize=36)
        ax.set_xlabel('vector in x', fontsize=36)
        ax.set_ylabel('vector in y', fontsize=36)
        #plt.tick_params(labelsize=36)
        fig.show()
        fig.savefig('D:/opticalflow/evaluation/plt/class3/' + videoName[:-4] + '_' + algorithm + '_figure.png')

        return label.tolist()
test = test.apply(lambda row: np.radians(row))


# In[112]:



# min_samples is the number of neighboring points required for a point to be considered a 'core' point;
# core points are essentially points in the 'middle' (or at least the semi-central part) of a high-density zone.
# metric is the way the algorithm measures distance. Since we are working on a 'sphere' we use haversine
# (Earth is not really a sphere, but the error is acceptable, about 0.3%).
# max_eps is the maximum distance the algorithm will search for neighbors of a cluster;
# since the haversine function returns very small values (above, a distance of 0.04 converts to about 268 km),
# the value here is also really small. With 0.01 many points that weren't noise were labeled as such,
# and with 0.08 it takes way too long to run; I suggest lowering it to 0.05 and checking whether the results are good enough.
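
# `haversine_distance` is used below but not defined in this excerpt; a minimal
# sketch of a compatible callable metric could look like the following
# (assumption: each point is a [lat, lon] pair already converted to radians,
# as done with np.radians above; the result is in unit-sphere radians, so
# multiply by Earth's radius, ~6371 km, to get kilometres):
import numpy as np

def haversine_distance(p, q):
    dlat = q[0] - p[0]
    dlon = q[1] - p[1]
    a = np.sin(dlat / 2.0) ** 2 + np.cos(p[0]) * np.cos(q[0]) * np.sin(dlon / 2.0) ** 2
    return 2.0 * np.arcsin(np.sqrt(a))
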
clust = OPTICS(min_samples = 3, metric = haversine_distance, min_cluster_size= 3, max_eps = 0.08)


# In[ ]:


start_time = time.time()#Line to measure the time it takes to run the algorithm
clust.fit(test)#the process of clustering itself
print("--- %s seconds ---" % (time.time() - start_time))#Also to measure time


# In[96]:


space = np.arange(test.shape[0])#array with numbers from 0 to the number of cbgs
labels = np.asarray(clust.labels_)# cluster to which the data point belongs (has the same length and index that the main dataset)
                   delimiter=",")
        np.savetxt(results_folderName + labels_fileName,
                   np.concatenate([labels_train, labels_test], axis=0),
                   delimiter=",",
                   fmt='%s')
    if plotFeatures:
        plot_features(X_train[3, :], nTokens)
        plot_features_compare(X_train, y_train, nTokens)

    # remove outliers from train data:
    X_train_inliers = []
    y_train_inliers = []
    for label in labels_to_class_dict.values():
        X = X_train[y_train == label, :]
        y = y_train[y_train == label]
        clusters = OPTICS(min_samples=min_samples).fit_predict(X)
        X_train_inliers.append(X[clusters != -1, :])
        y_train_inliers.append(y[clusters != -1])
    X_train = np.concatenate(X_train_inliers, axis=0)
    y_train = np.concatenate(y_train_inliers)

    # scaling:
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # select good features:
    n_features = int(X_train_scaled.shape[1] * nFeatures_factor)
    n_neighbors = 10
    r = ReliefF(n_features_to_select=n_features, n_neighbors=n_neighbors)
    r.fit(X_train_scaled, y_train)
Example #16
df['Cluster_GM'] = gm.fit_predict(X)
''' birch '''
from sklearn.cluster import Birch
bh = Birch(threshold=0.01, branching_factor=100, n_clusters=9)
# fit model and predict clusters
df['Cluster_BH'] = bh.fit_predict(X).astype(str)
''' dbscan '''
from sklearn.cluster import DBSCAN
# define the model
db = DBSCAN(eps=0.3, min_samples=100)
# fit model and predict clusters
df['Cluster_DB'] = db.fit_predict(X)
''' optics '''
from sklearn.cluster import OPTICS
# define the model
op = OPTICS(xi=0.05, min_samples=10, min_cluster_size=0.1)
# fit model and predict clusters
df['Cluster_OP'] = op.fit_predict(X)
''' spectral '''
from sklearn.cluster import SpectralClustering
# define the model
sp = SpectralClustering(n_clusters=9)
# fit model and predict clusters
df['Cluster_SP'] = sp.fit_predict(X)

df['Cluster_SP'].value_counts(normalize=True)
''' TSNE '''
from sklearn.manifold import TSNE
tsne = TSNE(
    n_components=2,
    perplexity=100,
        if P["domain name"] == 'bickley_jet_domain':
            ax.tick_params(labelsize=8)
            if i==0: 
                plt.yticks(np.arange(-2000,4000,2000), np.arange(-2,4,2))
            else: plt.yticks([])
            plt.xticks(np.arange(0,25000,5000), np.arange(0,25,5))
        
    f.savefig(P["filename"] + "_Kmeans", dpi=300)
    

"""
OPTICS
"""

if P["OPTICS"]:
    optics_clustering = OPTICS(min_samples=P["MinPts"], metric="euclidean").fit(X_embedding)
    reachability = optics_clustering.reachability_
    core_distances = optics_clustering.core_distances_
    ordering = optics_clustering.ordering_
    predecessor = optics_clustering.predecessor_
    
    labels = []
    
    for op in P["optics_params"]:
        m, c = op[0], op[1]
        if m == "xi":
            l, _ = cluster_optics_xi(reachability, predecessor, ordering, P["MinPts"], xi=c)
        else:
            l = cluster_optics_dbscan(reachability=reachability,
                                                core_distances=core_distances,
                                               ordering=ordering, eps=c)
def do_clustering(target_csv, cluster_method):
    num_cluster = 24
    df_data = pd.read_csv(os.path.join(CONFIG.CSV_PATH, target_csv + '.csv'),
                          index_col=0,
                          header=0,
                          encoding='utf-8-sig')
    df_data.index.name = 'short_code'
    print(df_data.iloc[:100])
    print(df_data.shape)

    start_time = time.time()
    if cluster_method == 0:
        clustering = DBSCAN(eps=0.3, min_samples=1000)
        clustering.fit(df_data)
        csv_name = 'clustered_dbscan_' + target_csv + '.csv'
    elif cluster_method == 1:
        clustering = OPTICS(min_samples=1000, metric='cosine')
        clustering.fit(df_data)
        csv_name = 'clustered_optics_' + target_csv + '.csv'
    elif cluster_method == 2:
        clustering = AgglomerativeClustering(n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_ward_' + target_csv + '.csv'
    elif cluster_method == 3:
        clustering = AgglomerativeClustering(affinity='cosine',
                                             linkage='complete',
                                             n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_agglo_complete_' + target_csv + '.csv'
    elif cluster_method == 4:
        clustering = AgglomerativeClustering(affinity='cosine',
                                             linkage='single',
                                             n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_agglo_single_' + target_csv + '.csv'
    elif cluster_method == 5:
        clustering = Birch(n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_birch_' + target_csv + '.csv'
    elif cluster_method == 6:
        clustering = KMeans(n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_kmeans_' + target_csv + '.csv'
    elif cluster_method == 7:
        clustering = SpectralClustering(n_clusters=num_cluster,
                                        random_state=42,
                                        assign_labels='discretize')
        clustering.fit(df_data)
        csv_name = 'clustered_spectral_' + target_csv + '.csv'
    print("time elapsed for clustering: " + str(time.time() - start_time))
    print(clustering.get_params())
    print(clustering.labels_)
    count_percentage(clustering.labels_)
    result_df = pd.DataFrame(data=clustering.labels_,
                             index=df_data.index,
                             columns=['cluster'])

    start_time = time.time()
    print("calinski_harabasz_score: ",
          calinski_harabasz_score(df_data, result_df['cluster'].squeeze()))
    print("silhouette_score: ",
          silhouette_score(df_data, result_df['cluster'].squeeze()))
    print("davies_bouldin_score: ",
          davies_bouldin_score(df_data, result_df['cluster'].squeeze()))
    print("time elapsed for scoring: " + str(time.time() - start_time))
    result_df.to_csv(os.path.join(CONFIG.CSV_PATH, csv_name),
                     encoding='utf-8-sig')
Example #19
arguments = sys.argv[1:]

filePath = arguments[1]
eps = float(arguments[3])
minNumSamples = int(arguments[5])

#read in point cloud file in csv
inFile = np.genfromtxt(filePath, delimiter=',', skip_header=1)

#nonGround_coords:
X = inFile[:, 1:4]

startTime = time.time()

print("Running OPTICS.  \n")
testtree = OPTICS(eps=eps, min_samples=minNumSamples).fit(X)
"""# |  eps : float, optional given the label -1.
 |  The maximum distance between two samples for them to be considered
 |  as in the same neighborhood. This is also the largest object size
 |  expected within the dataset. Lower eps values can be used after
 |  OPTICS is run the first time, with fast returns of labels. Default
 |  value of "np.inf" will identify clusters across all scales; reducing
 |  eps will result in shorter run times.
 |  min_samples : int, optional
 |  The number of samples in a neighborhood for a point to be considered
 |  as a core point.
"""
timeElapsed = time.time() - startTime
print(
    "OPTICS.fit(eps = {0}, min_samples = {1}) time elapsed: ".format(
        eps, minNumSamples), timeElapsed, "\n")
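
# A minimal sketch of the "lower eps after the first run" idea quoted in the
# docstring above (eps / 2.0 is an illustrative value; `testtree` is the OPTICS
# model fitted above):
from sklearn.cluster import cluster_optics_dbscan
lowerEpsLabels = cluster_optics_dbscan(reachability=testtree.reachability_,
                                       core_distances=testtree.core_distances_,
                                       ordering=testtree.ordering_,
                                       eps=eps / 2.0)
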
    def __init__(self, **params):
        super().__init__('optics')

        self.model = OPTICS(**params)
Example #21
def execute_event_detection_procedure(task_id: int, task_name: str, min_x, min_y, max_x, max_y, look_back_hours: int, lang_code,
                                      min_cluster_size=10, st_clustering_max_eps=0.2, text_clustering_max_eps=0.4, verbose=True):

    global postgres_tweets, postgres_events, vectorizer, languages, exec_number
    exec_number = exec_number + 1
    
    end_date = datetime.now()
    start_date = end_date - timedelta(hours=int(look_back_hours))

    print("*"*60)
    print("*"*60)
    print(
        F"Process: {task_name} ({task_id}), Language: {lang_code}, Interval: {start_date} to {end_date}")
    print(F"Execution number: {exec_number}")

    if exec_number % 100 == 0:
        try:
          postgres_tweets.delete_old_tweets()
          print('Old tweets were deleted from the database.')
        except:
          print('Unable to delete old tweets.')

    if not lang_code in languages:
        print(f"The selected language ({lang_code}) is not supported.")
        print('Processing was terminated.')

    # Read data from database
    print("1. Read data from database.")
    df, num = postgres_tweets.read_data_from_postgres(
        start_date=start_date,
        end_date=end_date,
        min_x=min_x,
        min_y=min_y,
        max_x=max_x,
        max_y=max_y,
        lang=lang_code)

    if num <= 0:
        print('There was no record for processing.')
        print('Processing was terminated.')
        return
    if num <= min_cluster_size:
        print('There were not enough records for processing.')
        print('Processing was terminated.')
        return
    if verbose:
        print(F"Number of retrieved tweets: {num}")

    # convert to geodataframe
    print("2. convert to GeoDataFrame")
    gdf = add_geometry(df, crs=get_wgs84_crs())

    # get location vectors
    print("3. Tweet info")
    x = np.asarray(gdf.geometry.x)[:, np.newaxis]
    y = np.asarray(gdf.geometry.y)[:, np.newaxis]
    # get time vector
    t = np.asarray(gdf.created_at.dt.year * 365.2425 + gdf.created_at.dt.day)
    date_time = gdf.created_at.dt.to_pydatetime()
    # get tweet_id and user_id
    tweet_id = gdf.id.values
    user_id = gdf.user_id.values

    # Vectorize text
    print("4. Get text vector")
    clean_text = df.c.values
    text = df.text.values
    text_vect = None
    text_vect = vectorizer.vectorize(df.c.values, lang_code)
    # Added to debugging
    # if __debug__:
    #     text_vect_path = '~/temp/text.npy'
    #     os.makedirs('~/temp', exist_ok=True)
    #     if os.path.exists(text_vect_path):
    #         text_vect = np.load(text_vect_path)
    #     else:
    #         text_vect = vectorizer.vectorize(df.c.values, lang_code)
    #         np.save(text_vect_path, text_vect)
    # else:
    #     text_vect = vectorizer.vectorize(df.c.values, lang_code)

    # print(F"Shape of the vectorized tweets: {text_vect.shape}")

    # Text-based clustering
    print("5. Clustering - First-level: Text-based")
    start_time = time()
    optics_ = OPTICS(
        min_cluster_size=min_cluster_size,
        max_eps=text_clustering_max_eps,
        metric='precomputed')
    text_dist = np.absolute(cosine_distances(text_vect))
    optics_.fit(text_dist)
    time_taken = time() - start_time
    txt_clust_labels = optics_.labels_
    txt_clust_label_codes = np.unique(txt_clust_labels)
    num_of_clusters = len(txt_clust_label_codes[txt_clust_label_codes >= 0])
    if verbose:
        print(F'\tNumber of text based clusters: {num_of_clusters - 1}')
        print(F"\tTime: {math.ceil(time_taken)} seconds")
    if num_of_clusters <= 0:
        print("No first level cluster was detected.")
        print('Processing was terminated.')
        return

    # topic identification
    print("6. Identify topics")
    # TODO: We need to specify the maximum number of tweets that enter the clustering procedures
    identTopic = HDPTopicIdentification()
    identTopic.identify_topics(txt_clust_labels, clean_text)
    if verbose:
        identTopic.print_cluster_topics('\t')
    topics = identTopic.get_cluster_topics()

    clusters = []
    print("\n7. Clustering - Second-level: Spatiotemporal")
    for label in txt_clust_label_codes:
        if label >= 0:
            start_time = time()
            optics_ = OPTICS(
                min_cluster_size=min_cluster_size,
                max_eps=st_clustering_max_eps,
                metric='precomputed')
            _x = x[txt_clust_labels == label]
            _y = y[txt_clust_labels == label]
            _tweet_id = tweet_id[txt_clust_labels == label]
            _user_id = user_id[txt_clust_labels == label]
            # _x = StandardScaler().fit_transform(x[txt_clust_labels == label])
            # _y = StandardScaler().fit_transform(y[txt_clust_labels == label])
            _text = text[txt_clust_labels == label]
            _date_time = date_time[txt_clust_labels == label]
            st_vect = np.concatenate((_x,
                                      _y,
                                      #   t[txt_clust_labels==label],
                                      ), axis=1)
            st_dist = euclidean_distances(st_vect)
            optics_.fit(st_dist)
            time_taken = time() - start_time
            st_clust_labels = optics_.labels_
            st_clust_label_codes = np.unique(st_clust_labels)
            num_of_clusters = len(
                st_clust_label_codes[st_clust_label_codes >= 0])
            st_any_clust = num_of_clusters > 0

            for l in st_clust_label_codes[st_clust_label_codes >= 0]:
                topic = topics[label][3]
                topic_words = topics[label][4]
                points_text = _text[st_clust_labels == l].tolist()
                points_x = _x[st_clust_labels == l]
                points_y = _y[st_clust_labels == l]
                points_tweet_id = _tweet_id[st_clust_labels == l]
                points_user_id = _user_id[st_clust_labels == l]
                points_date_time = _date_time[st_clust_labels == l].tolist()
                lat_min = np.min(points_y)
                lat_max = np.max(points_y)
                lon_min = np.min(points_x)
                lon_max = np.max(points_x)
                dt_min = min(points_date_time)
                dt_max = max(points_date_time)
                if (len(np.unique(points_user_id)) > 1):
                    clusters.append({
                        'id': None,
                        'task_id': task_id,
                        'task_name': task_name,
                        'topic': topic,
                        'topic_words': topic_words,
                        'latitude_min': lat_min,
                        'latitude_max': lat_max,
                        'longitude_min': lon_min,
                        'longitude_max': lon_max,
                        'date_time_min': dt_min,
                        'date_time_max': dt_max,
                        'points': [{'cluster_id': None,
                                    'longitude': xx.item(),
                                    'latitude': yy.item(),
                                    'text': tt,
                                    'date_time': dd,
                                    'tweet_id': ti.item(),
                                    'user_id': ui.item()} for xx, yy, tt, dd, ti, ui in zip(points_x, points_y, points_text, points_date_time, points_tweet_id, points_user_id)]
                    })
    if verbose:
        print(F'\tNumber of spatial clusters: {len(clusters)}')
        print(F"\tTime: {math.ceil(time_taken)} seconds")

    print("8. Link clusters")
    num_new_cluster = 0
    num_updated_cluster = 0
    for cluster in clusters:
        # 8.1 Select clusters that coincide with the current time interval and extent
        db_clusters = postgres_events.get_clusters(
            cluster['latitude_min'],
            cluster['latitude_max'],
            cluster['longitude_min'],
            cluster['longitude_max'],
            cluster['date_time_min'],
            cluster['date_time_max']
        )
        # 8.2 Retrieve their points
        coverage_id = []
        coverage_ratio = []
        # 8.3 Compare the points of the old clusters and the new clusters
        for db_cluster in db_clusters:
            db_cluster_point_tweet_ids = np.array(
                postgres_events.get_cluster_point_tweet_ids(db_cluster['id']))
            cluster_point_tweet_ids = np.array(
                [point['tweet_id'] for point in cluster['points']])
            numerator = len(np.intersect1d(
                db_cluster_point_tweet_ids, cluster_point_tweet_ids))
            denominator = len(np.union1d(
                db_cluster_point_tweet_ids, cluster_point_tweet_ids))
            if denominator > 0:
                coverage_id.append(db_cluster['id'])
                coverage_ratio.append(numerator / denominator)

        # 8.4 Link the clusters with the highest cluster relation strength
        if len(coverage_id) > 0 and max(coverage_ratio) >= min_linking_ratio:
            coverage_ratio = np.array(coverage_ratio)
            coverage_id = np.array(coverage_id)
            cluster['id'] = np.max(
                coverage_id[coverage_ratio == max(coverage_ratio)]).item()
            db_cluster = None
            for db_clust in db_clusters:
                if db_clust['id'] == cluster['id']:
                    db_cluster = db_clust
            if db_cluster is not None:
                cluster['latitude_min'] = min(
                    cluster['latitude_min'], db_cluster['latitude_min'])
                cluster['latitude_max'] = max(
                    cluster['latitude_max'], db_cluster['latitude_max'])
                cluster['longitude_min'] = min(
                    cluster['longitude_min'], db_cluster['longitude_min'])
                cluster['longitude_max'] = max(
                    cluster['longitude_max'], db_cluster['longitude_max'])
                cluster['date_time_min'] = min(
                    cluster['date_time_min'], db_cluster['date_time_min'])
                cluster['date_time_max'] = max(
                    cluster['date_time_max'], db_cluster['date_time_max'])

            for i in range(0, len(cluster['points'])):
                cluster['points'][i]['cluster_id'] = cluster['id']

            num_updated_cluster += 1
        else:
            num_new_cluster += 1
        pass
    print(
        f'\t # updated clusters: {num_updated_cluster}, # new clusters: {num_new_cluster}')

    print("9. Save clusters")
    postgres_events.insert_clusters(clusters)

    print(F"Process {task_name} ({task_id}) finished.")
    print('*'*60)
    print("*"*60)
Example #22
    df[a] = df[a].cat.codes

features = df.columns

scaler = MinMaxScaler().fit(df[features])
scaled_df = pd.DataFrame(scaler.transform(df[features]))
scaled_df.columns = features

scaled_df.fillna(scaled_df.mean(), inplace=True)

fig = plt.figure(figsize=(7, 7))

index = 1

for s in [50, 60, 70, 80]:
    est = OPTICS(min_samples=s)
    est.fit(scaled_df)
    df['labels'] = est.labels_
    num_clusters = len(df['labels'].unique())

    sp = fig.add_subplot(2, 2, index)
    sp.set_xlabel('Ball Control')
    sp.set_ylabel('Interceptions')

    print("Broj klastera: %d" % num_clusters)
    print("Samples: %d " % s)
    print("Senka koeficijent: %f " % silhouette_score(scaled_df, est.labels_))

    for j in range(-1, num_clusters):
        if j == -1:
            label = 'noise'
Example #23
    def get_clusters(self, st_arr):

        import traceback

        import numpy as np
        from sklearn.cluster import DBSCAN, AffinityPropagation, OPTICS, MeanShift, AgglomerativeClustering, Birch
        from sklearn.cluster import KMeans, SpectralClustering
        from sklearn.neighbors import NearestNeighbors
        import hdbscan, pyamg
        #        import sklearn.utils
        from sklearn.preprocessing import StandardScaler
        from sklearn.datasets import make_blobs
        from sklearn.metrics.pairwise import haversine_distances
        import sys
        sys.path.insert(1, '../lib')
        import denclue, GDT.api, GDT.plot_tools

        try:
            if self.name == 'DBSCAN':
                clusterer = DBSCAN(eps=self.epsilon,
                                   min_samples=self.minimum_samples,
                                   algorithm=self.algorithm,
                                   metric=self.metric)

            elif self.name == 'HDBSCAN':
                clusterer = hdbscan.HDBSCAN(
                    min_samples=self.minimum_samples,
                    #                                            min_cluster_size=self.minimum_cluster_size,
                    min_cluster_size=self.minimum_samples,
                    cluster_selection_epsilon=self.epsilon,
                    metric=self.metric,
                    cluster_selection_method=self.cluster_method,
                    gen_min_span_tree=True,
                    prediction_data=True)

            elif self.name == 'AFFINITYPROPAGATION':
                if self.metric in ['haversine', 'precomputed']:
                    lat = np.array(st_arr[:, 0])
                    lon = np.array(st_arr[:, 1])
                    st_coords = np.column_stack((lat, lon))
                    st_arr = haversine_distances(np.radians(st_coords),
                                                 np.radians(st_coords))
                    clusterer = AffinityPropagation(
                        affinity=self.metric,
                        damping=0.5,
                        max_iter=self.maximum_iterations,
                        convergence_iter=15,
                        preference=None,
                        random_state=self.random_state,
                        #affinity='precomputed',
                    )
                elif self.metric in ['euclidean']:
                    clusterer = AffinityPropagation(
                        affinity=self.metric,
                        damping=0.5,
                        max_iter=self.maximum_iterations,
                        convergence_iter=15,
                        preference=None,
                        random_state=self.random_state,
                    )
                else:
                    raise ValueError(
                        'Invalid metric %s. Must be euclidean or haversine' %
                        self.metric)

            elif self.name == 'OPTICS':
                clusterer = OPTICS(
                    min_samples=self.minimum_samples,
                    #                                   min_cluster_size=self.minimum_cluster_size,
                    min_cluster_size=self.minimum_samples,
                    max_eps=self.epsilon,
                    eps=self.epsilon,
                    metric=self.metric,
                    cluster_method=self.cluster_method,
                    algorithm=self.algorithm)

            elif self.name == 'AGGLOMERATIVE':
                clusterer = AgglomerativeClustering(
                    distance_threshold=self.epsilon,
                    affinity=self.metric,
                    linkage='average',
                    n_clusters=None)
                if self.metric in ['haversine', 'precomputed']:
                    lat = np.array(st_arr[:, 0])
                    lon = np.array(st_arr[:, 1])
                    st_coords = np.column_stack((lat, lon))
                    st_arr = haversine_distances(np.radians(st_coords),
                                                 np.radians(st_coords))

#                    clusterer = AgglomerativeClustering(distance_threshold=self.epsilon,
#                                                        affinity=self.metric,
#                                                        n_clusters=None)
                elif self.metric in ['euclidean']:
                    pass
                else:
                    raise ValueError(
                        'Invalid metric %s. Must be euclidean, haversine, or precomputed'
                        % self.metric)

            elif self.name == 'DENCLUE':
                clusterer = denclue.DENCLUE(
                    h=None,
                    eps=self.epsilon,
                    min_density=self.minimum_cluster_size,
                    metric=self.metric)
                if self.fit_predict:
                    print(
                        'WARNING: DENCLUE does not have a fit_predict function. Switching to fit'
                    )
                    self.fit_predict = False

            elif self.name == 'BIRCH':
                clusterer = Birch(n_clusters=None, threshold=self.epsilon)

            elif self.name == 'MEANSHIFT':
                clusterer = MeanShift()

            elif self.name == 'KMEANS':
                scaler = StandardScaler()
                scaled_features = scaler.fit_transform(st_arr)
                ''' init="random" or "k-means++"
                    n_init=10 (Number of runs with different centroid seeds)
                    max_iter=300 (Maximum number of iterations for a single run)
                    random_state=5 (Determines random number generation for centroid initialization)
                '''
                clusterer = KMeans(
                    init='k-means++',
                    n_clusters=self.n_clusters,  # default=8
                    n_init=self.centroid_init,
                    max_iter=self.maximum_iterations,  # default=300
                    random_state=self.random_state)  # default=5

            elif self.name == 'SPECTRAL':
                clusterer = SpectralClustering(
                    assign_labels=self.algorithm,  # {'kmeans', 'discretize'}, default='kmeans'
                    random_state=self.random_state,  # default: 0
                    n_clusters=self.n_clusters,  # default=8
                    # {'nearest_neighbors','rbf','precomputed','precomputed_nearest_neighbors'}
                    affinity=self.metric,
                    n_neighbors=self.minimum_samples,  # number of neighbors to use; default=10
                    eigen_solver=self.cluster_method  # {'arpack', 'lobpcg', 'amg'}
                )
                if self.metric in [
                        'precomputed', 'precomputed_nearest_neighbors'
                ]:
                    lat = np.array(st_arr[:, 0])
                    lon = np.array(st_arr[:, 1])
                    st_coords = np.column_stack((lat, lon))
                    st_arr = haversine_distances(np.radians(st_coords),
                                                 np.radians(st_coords))

#d            elif self.name == 'NEARESTNEIGHBORS':
#d                clusterer = NearestNeighbors(n_neighbors=self.n_clusters,
#d                                             metric=self.metric,
#d                                             weights='distance',
#d                                             algorithm=self.algorithm,)

            else:
                print(
                    "Class cluster_data [get_clusters] something was not right"
                )

            X, _labels_true = make_blobs(n_samples=len(st_arr),
                                         centers=st_arr,
                                         cluster_std=self.cluster_std,
                                         random_state=self.random_state)

            if self.fit_predict:
                clusterer.fit_predict(np.radians(st_arr))
            else:
                clusterer.fit(np.radians(st_arr))

#        _core_samples_mask = np.zeros_like(clusterer.labels_, dtype=bool)
#        _core_samples_mask[clusterer.core_sample_indices_] = True

#            print(clusterer)

            cluster_centers = self.get_cluster_centers(self.name, clusterer)

            return clusterer.labels_, _labels_true, cluster_centers  #, _core_samples_mask

        except Exception as err:
            print("Class cluster_data [get_clusters] Error message:", err)
            print(traceback.format_exc())
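
The 'haversine'/'precomputed' branches above all follow the same pattern: convert [latitude, longitude] to radians, build a pairwise great-circle distance matrix, and hand that matrix to a clusterer that accepts precomputed distances. A minimal standalone sketch of the pattern (made-up coordinates, an assumed Earth radius of 6371 km, and OPTICS as the consumer) looks like this:

import numpy as np
from sklearn.cluster import OPTICS
from sklearn.metrics.pairwise import haversine_distances

# [latitude, longitude] in degrees; values are illustrative only.
coords_deg = np.array([[6.90, 79.86],
                       [6.91, 79.87],
                       [6.93, 79.85],
                       [7.29, 80.63],
                       [7.30, 80.64]])

# haversine_distances expects radians and returns distances in radians;
# multiplying by the Earth radius (~6371 km) converts them to kilometres.
dist_km = haversine_distances(np.radians(coords_deg)) * 6371.0

clusterer = OPTICS(min_samples=2, max_eps=5.0, metric='precomputed')
labels = clusterer.fit(dist_km).labels_
print(labels)  # cluster labels per point; -1 marks points treated as noise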
Example #24
def detect(file_path, space, deleted_features):
    """
    Detect outliers
    """
    start_time = time.time()
    print("==================================================")
    print("Outlier detection and treatment started ...")
    print("Space:", space)

    X = DataLoader.load(file_path)
    # X = pd.read_csv(file_path)

    if len(deleted_features) > 0:
        X = X.drop(deleted_features, axis=1, inplace=False)

    # Basic data cleaning
    X = data_cleaning_formatting(X)

    y_predicted = None
    params = space['params']
    error = dict()

    try:
        if space['model'] == "DBSCAN":
            model = DBSCAN(**params)
            y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x < 0 else 0, y_predicted))

        elif space['model'] == "OPTICS":
            model = OPTICS(**params)
            y_predicted = model.fit_predict(X)
            print(y_predicted)
            y_predicted = list(map(lambda x: 1 if x < 0 else 0, y_predicted))

        elif space['model'] == "EllipticEnvelope":
            model = EllipticEnvelope(**params)
            y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x == -1 else 0, y_predicted))

        elif space['model'] == "IsolationForest":
            model = IsolationForest(**params)
            with parallel_backend('threading'):
                y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x == -1 else 0, y_predicted))

        elif space['model'] == "OneClassSVM":
            model = OneClassSVM(**params)
            y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x == -1 else 0, y_predicted))

        elif space['model'] == "LocalOutlierFactor":
            model = LocalOutlierFactor(**params)
            with parallel_backend('threading'):
                y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x == -1 else 0, y_predicted))

        elif space['model'] == "zscore":
            model = ZScore(threshold=params['threshold'])
            y_predicted = model.fit_predict(X)

    except Exception as e:
        print("Error:", e)
        y_predicted = [0] * X.shape[0]
        error['detect_' + str(space)] = e

    if isinstance(y_predicted, list):
        y_predicted = np.array(y_predicted)

    time_taken = time.time() - start_time
    print("Time taken:", time_taken)

    return y_predicted
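
detect() expects a search-space dictionary with 'model' and 'params' keys. A hypothetical call might look like the following; the CSV path, dropped column, and parameter values are illustrative assumptions.

# Hypothetical usage of detect(); path, columns and parameters are made up.
space = {
    'model': 'OPTICS',
    'params': {'min_samples': 10, 'xi': 0.05},
}
outlier_flags = detect('data/measurements.csv', space, deleted_features=['id'])
print('Outliers found:', int(outlier_flags.sum()))  # 1 marks an outlier, 0 an inlier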
Example #25
import numpy as np
from sklearn.cluster import OPTICS, cluster_optics_dbscan

# Generate sample data

np.random.seed(0)
n_points_per_cluster = 250

C1 = [-5, -2] + 0.8 * np.random.randn(n_points_per_cluster, 2)
C2 = [4, -1] + 0.1 * np.random.randn(n_points_per_cluster, 2)
C3 = [1, -2] + 0.2 * np.random.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + 0.3 * np.random.randn(n_points_per_cluster, 2)
C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))

clust = OPTICS(min_samples=50, xi=0.05, min_cluster_size=0.05)

# Run the fit
clust.fit(X)

labels_050 = cluster_optics_dbscan(
    reachability=clust.reachability_,
    core_distances=clust.core_distances_,
    ordering=clust.ordering_,
    eps=0.5,
)
labels_200 = cluster_optics_dbscan(
    reachability=clust.reachability_,
    core_distances=clust.core_distances_,
    ordering=clust.ordering_,
    eps=2,
)
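
Both eps cuts above are taken from the same reachability profile, which can be inspected directly. A minimal sketch of that plot (matplotlib assumed available) follows.

# Sketch: visualise the reachability profile the two eps cuts are derived from.
import matplotlib.pyplot as plt

space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]

plt.plot(space, reachability, "k.", ms=2)
plt.axhline(y=0.5, color="b", linestyle="--", label="eps = 0.5")
plt.axhline(y=2.0, color="r", linestyle="--", label="eps = 2.0")
plt.xlabel("Points (cluster order)")
plt.ylabel("Reachability distance")
plt.legend()
plt.show()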
Example #26
    def update_chart(self):
        if self.last_results is None:
            return
        x = self.transfer(self.last_results[0].classes_φ)
        self.save_typical_button.setEnabled(True)
        cluster = OPTICS(min_samples=self.min_samples_input.value(),
                         min_cluster_size=self.min_cluster_size_input.value(),
                         xi=self.xi_input.value())
        flags = cluster.fit_predict(self.data_to_clustering)
        cmap = plt.get_cmap()

        self.clustering_axes.clear()
        flag_set = set(flags)
        for flag in flag_set:
            key = np.equal(flags, flag)
            if flag == -1:
                c = "#7a7374"
                label = self.tr("Not clustered")
            else:
                c = cmap(flag)
                label = self.tr("EM{0}").format(flag + 1)
            self.clustering_axes.plot(
                self.data_to_clustering[key][:, self.x_axis_combo_box.currentIndex()],
                self.data_to_clustering[key][:, self.y_axis_combo_box.currentIndex()],
                c="#ffffff00",
                marker=".",
                ms=8,
                mfc=c,
                mew=0.0,
                zorder=flag,
                label=label)
        if len(flag_set) < 6:
            self.clustering_axes.legend(loc="upper left")
        self.clustering_axes.set_xlabel(self.x_axis_combo_box.currentText())
        self.clustering_axes.set_ylabel(self.y_axis_combo_box.currentText())
        self.clustering_axes.set_title(self.tr("Clustering of end-members"))

        self.component_axes.clear()
        if self.xlog:
            self.component_axes.set_xscale("log")

        for flag in flag_set:
            if flag == -1:
                c = "#7a7374"
            else:
                c = cmap(flag)
            key = np.equal(flags, flag)
            for distribution in self.stacked_components[key]:
                self.component_axes.plot(x, distribution, c=c, zorder=flag)

            if flag != -1:
                typical = np.mean(self.stacked_components[key], axis=0)
                self.component_axes.plot(x,
                                         typical,
                                         c="black",
                                         zorder=1e10,
                                         ls="--",
                                         linewidth=1)
        self.component_axes.set_title(self.tr("Typical end-members"))
        self.component_axes.set_xlabel(self.xlabel)
        self.component_axes.set_ylabel(self.ylabel)
        self.component_axes.set_xlim(x[0], x[-1])
        self.component_axes.set_ylim(0, None)

        self.figure.tight_layout()
        self.canvas.draw()
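
Outside the Qt widget, the "typical end-member" drawn above is just the per-cluster mean of the stacked component distributions. A rough standalone sketch of that computation, assuming stacked_components is an (n_components, n_classes) array and flags the matching label array:

import numpy as np

def typical_components(stacked_components, flags):
    # One "typical" curve per cluster: the mean of its member distributions,
    # skipping components labelled -1 (not clustered), as in update_chart above.
    typicals = {}
    for flag in set(flags):
        if flag == -1:
            continue
        members = stacked_components[np.equal(flags, flag)]
        typicals[flag] = members.mean(axis=0)
    return typicals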
Example #27
clean_data = clean_data[clean_data.KM != -1]
data_no_km = clean_data.loc[:, clean_data.columns != 'KM']

# Plotting data without the noise
pca = PCA(n_components=2, random_state=1).fit_transform(data_no_km)
sns.scatterplot(x=pca[:, 0], y=pca[:, 1], hue=clean_data['KM'], palette='deep')
plt.show()

# Let us now attempt to apply the OPTICS algorithm to our cleaned dataset with the noise removed

# First let us remove the K-means clusters from the dataset
# clean_data.drop(['KM'], axis=1, inplace=True)

# min_samples=82 is set because the data after feature selection has 41 dimensions (2 x 41)
# eps=18.498287329980066 was determined beforehand from a k-neighbours plot and is hard-coded here to save time
opt = OPTICS(min_samples=82, eps=18.498287329980066,
             n_jobs=-1).fit_predict(data_no_km)
clean_data['OPT'] = opt
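
The comment above refers to a k-neighbours (k-distance) plot for choosing eps; the usual recipe is to sort the distance of every point to its k-th nearest neighbour and look for the "knee". A minimal sketch, reusing k = 82 from above:

# Sketch: k-distance plot commonly used to pick eps (look for the knee of the curve).
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

k = 82  # same value as min_samples above
nn = NearestNeighbors(n_neighbors=k).fit(data_no_km)
distances, _ = nn.kneighbors(data_no_km)
k_dist = np.sort(distances[:, -1])  # distance to the farthest of the k nearest neighbours

plt.plot(k_dist)
plt.xlabel('Points sorted by k-distance')
plt.ylabel('Distance to %d-th nearest neighbour' % k)
plt.show()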

# The OPTICS algorithm rejected most of the data (100000 points) as noise, while it split the remaining 30000
# into various small groups ranging from 1626 down to 84 members, as can be seen in the plot below
print(clean_data['OPT'].value_counts())

# Therefore we will discard OPTICS and keep K-Means

# Interpreting clusters using Random Forest and extracting the most important features

# And graphically showing how the clusters were sorted based on important features #(0.5, 1.3)
rf = RandomForestClassifier(random_state=1,
                            n_jobs=-1).fit(data_no_km, clean_data.KM)
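
The snippet is cut off just below; one common way to carry out the comment above (extracting the most important features from the fitted forest) is sketched here, with the 0.01 importance cut-off as an arbitrary assumption.

# Sketch (not from the original, which is truncated below): rank features by the
# forest's impurity-based importances; the 0.01 cut-off is an illustration value.
import pandas as pd

importances = pd.Series(rf.feature_importances_, index=data_no_km.columns)
importances = importances.sort_values(ascending=False)
print(importances.head(10))  # the ten most informative features
important_features = importances[importances > 0.01].index.tolist()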

selected_columns = list(
Example #28
    def save_typical(self, filename):
        assert self.last_results is not None
        if len(self.last_results) == 0:
            return
        cluster = OPTICS(min_samples=self.min_samples_input.value(),
                         min_cluster_size=self.min_cluster_size_input.value(),
                         xi=self.xi_input.value())
        classes_μm = self.last_results[0].classes_μm
        flags = cluster.fit_predict(self.data_to_clustering)
        flag_set = set(flags)
        typicals = []
        for flag in flag_set:
            if flag != -1:
                key = np.equal(flags, flag)
                typical = np.mean(self.stacked_components[key], axis=0)
                typicals.append(typical)

        wb = openpyxl.Workbook()
        prepare_styles(wb)
        ws = wb.active
        ws.title = self.tr("README")
        description = \
            """
            This Excel file was generated by QGrain ({0}).

            Please cite:
            Liu, Y., Liu, X., Sun, Y., 2021. QGrain: An open-source and easy-to-use software for the comprehensive analysis of grain size distributions. Sedimentary Geology 423, 105980. https://doi.org/10.1016/j.sedgeo.2021.105980

            It contains 2 + N_clusters sheets:
            1. The first sheet is the sum distributions of all component clusters.
            2. The second sheet holds the component distributions that are not in any cluster.
            3. The remaining sheets are the component distributions of each cluster, one sheet per cluster.

            The clustering algorithm is OPTICS, implemented by scikit-learn.
            https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html

            Clustering algorithm details
                min_samples={1}
                min_cluster_size={2}
                xi={3}
                others=default

            """.format(QGRAIN_VERSION,
                       self.min_samples_input.value(),
                       self.min_cluster_size_input.value(),
                       self.xi_input.value())

        def write(row, col, value, style="normal_light"):
            cell = ws.cell(row + 1, col + 1, value=value)
            cell.style = style

        lines_of_desc = description.split("\n")
        for row, line in enumerate(lines_of_desc):
            write(row, 0, line, style="description")
        ws.column_dimensions[column_to_char(0)].width = 200

        ws = wb.create_sheet(self.tr("Typical Components"))
        write(0, 0, self.tr("Typical Component"), style="header")
        ws.column_dimensions[column_to_char(0)].width = 16
        for col, value in enumerate(classes_μm, 1):
            write(0, col, value, style="header")
            ws.column_dimensions[column_to_char(col)].width = 10
        for row, distribution in enumerate(typicals, 1):
            if row % 2 == 0:
                style = "normal_dark"
            else:
                style = "normal_light"
            write(row, 0, self.tr("Component{0}").format(row), style=style)
            for col, value in enumerate(distribution, 1):
                write(row, col, value, style=style)
            QCoreApplication.processEvents()

        for flag in flag_set:
            if flag == -1:
                ws = wb.create_sheet(self.tr("Not Clustered"), 2)
            else:
                ws = wb.create_sheet(self.tr("Cluster{0}").format(flag + 1))

            write(0, 0, self.tr("Index"), style="header")
            ws.column_dimensions[column_to_char(0)].width = 16
            for col, value in enumerate(classes_μm, 1):
                write(0, col, value, style="header")
                ws.column_dimensions[column_to_char(col)].width = 10
            key = np.equal(flags, flag)
            for row, component in enumerate(self.stacked_components[key], 1):
                if row % 2 == 0:
                    style = "normal_dark"
                else:
                    style = "normal_light"
                write(row, 0, str(row), style=style)
                for col, value in enumerate(component, 1):
                    write(row, col, value, style=style)
                QCoreApplication.processEvents()

        wb.save(filename)
        wb.close()
Example #29
lon0 = np.array(lon0)
lat0 = np.array(lat0)

X = direct_embedding(lons, lats)
#%%
P = {}
P = {
    "MinPts": mins,
    "optics_params": [
        ["dbscan", 4000],
        ["xi", 0.002],
    ],
    "ylims": [200, 20000]
}

#%%
optics_clustering = OPTICS(min_samples=mins, metric="euclidean").fit(X)
reachability = optics_clustering.reachability_
core_distances = optics_clustering.core_distances_
ordering = optics_clustering.ordering_
predecessor = optics_clustering.predecessor_

#%%
np.savez('results/OPTICS_sp%d_smin%d' % (sp, mins),
         reachability=reachability,
         core_distances=core_distances,
         ordering=ordering,
         predecessor=predecessor,
         lon=lon0,
         lat=lat0)
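
The arrays saved above are exactly the inputs that scikit-learn's label-extraction helpers need, which is presumably why the "dbscan"/"xi" parameter pairs are stored in P. A sketch of re-deriving labels later from the saved file, reusing the eps=4000 and xi=0.002 values from P:

# Sketch: rebuild cluster labels from the saved OPTICS arrays without re-fitting.
import numpy as np
from sklearn.cluster import cluster_optics_dbscan, cluster_optics_xi

saved = np.load('results/OPTICS_sp%d_smin%d.npz' % (sp, mins))

# DBSCAN-style cut at eps = 4000 (the "dbscan" entry of P above)
labels_db = cluster_optics_dbscan(reachability=saved['reachability'],
                                  core_distances=saved['core_distances'],
                                  ordering=saved['ordering'],
                                  eps=4000)

# Xi-based extraction with xi = 0.002 (the "xi" entry of P above)
labels_xi, _ = cluster_optics_xi(reachability=saved['reachability'],
                                 predecessor=saved['predecessor'],
                                 ordering=saved['ordering'],
                                 min_samples=mins,
                                 xi=0.002)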
Example #30
    km = EARTHRADIUS * c
    return km
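
Only the last two lines of getDistanceByHaversine survive above. A self-contained sketch of the great-circle distance it appears to compute, assuming EARTHRADIUS is the Earth radius in kilometres and each row starts with (latitude, longitude) in degrees:

import numpy as np

EARTHRADIUS = 6371.0  # km, assumed value

def haversine_km(u, v):
    # Great-circle (haversine) distance in km between two points whose first two
    # entries are latitude and longitude in degrees (an assumption about the rows).
    lat1, lon1 = np.radians(u[:2])
    lat2, lon2 = np.radians(v[:2])
    a = (np.sin((lat2 - lat1) / 2.0) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2)
    c = 2.0 * np.arcsin(np.sqrt(a))
    return EARTHRADIUS * c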


def group_euclid(qw):
    return np.sqrt((qw['latitude'] - qw['latitude'].mean())**2 +
                   (qw['longitude'] - qw['longitude'].mean())**2).max()


df1 = pd.read_csv('df_concat.csv')
df2 = df1.drop(['Industry'], axis=1)

distance_matrix = squareform(
    pdist(df2, (lambda u, v: getDistanceByHaversine(u, v))))

db = OPTICS(min_samples=5, metric='precomputed')
y_db = db.fit_predict(distance_matrix)

df1['cluster'] = y_db

uf = df1[['Industry', 'cluster']].groupby('cluster')

unique_cluster = uf.nunique()

mean_cluster = df1[['longitude', 'latitude',
                    'cluster']].groupby('cluster').mean()

max_distance = df1[['longitude', 'latitude',
                    'cluster']].groupby('cluster').apply(group_euclid)

three_cluster = mean_cluster[unique_cluster['Industry'] == 3]