def get_clusters(clusterable_embedding_, min_cluster_size, min_samples):
    """Fit HDBSCAN on an embedding and return its soft-cluster memberships.

    :param clusterable_embedding_: data handed to ``HDBSCAN.fit``
    :param min_cluster_size: forwarded unchanged to ``hdbscan.HDBSCAN``
    :param min_samples: forwarded unchanged to ``hdbscan.HDBSCAN``
    :return: the result of ``hdbscan.all_points_membership_vectors`` for the
        fitted model
    """
    model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        # The model is fitted with prediction data so that membership vectors
        # can be requested from it afterwards.
        prediction_data=True,
    )
    fitted = model.fit(clusterable_embedding_)
    return hdbscan.all_points_membership_vectors(fitted)
def bsoid_hdbscan(umap_embeddings, hdbscan_params=HDBSCAN_PARAMS):
    """
    Trains HDBSCAN (unsupervised) given learned UMAP space.

    The minimum cluster size is swept from 0.6% to 2.0% of the number of
    instances (0.1% steps) and the fit producing the most unique labels is
    retained.

    :param umap_embeddings: 2D array, embedded UMAP space
    :param hdbscan_params: dict, HDBSCAN params in GLOBAL_CONFIG; unpacked as
        extra keyword arguments of ``hdbscan.HDBSCAN``
    :return assignments: ``labels_`` of the retained HDBSCAN fit
    :return soft_clusters: ``hdbscan.all_points_membership_vectors`` of the
        retained fit
    :return soft_assignments: argmax of ``soft_clusters`` along axis 1
    """
    n_instances = umap_embeddings.shape[0]
    # np.inf instead of np.infty: the ``infty`` alias was removed in NumPy 2.0.
    highest_numulab = -np.inf
    best_clf = None
    logging.info('Running HDBSCAN on {} instances in {} D space...'.format(*umap_embeddings.shape))
    for min_c in range(6, 21):
        trained_classifier = hdbscan.HDBSCAN(
            prediction_data=True,
            min_cluster_size=int(round(0.001 * min_c * n_instances)),
            **hdbscan_params).fit(umap_embeddings)
        n_unique_labels = len(np.unique(trained_classifier.labels_))
        # Strict '>' means ties keep the earlier (smaller min_cluster_size) fit.
        if n_unique_labels > highest_numulab:
            logging.info('Adjusting minimum cluster size to maximize cluster number...')
            highest_numulab = n_unique_labels
            best_clf = trained_classifier
    assignments = best_clf.labels_
    soft_clusters = hdbscan.all_points_membership_vectors(best_clf)
    soft_assignments = np.argmax(soft_clusters, axis=1)
    logging.info('Done predicting labels for {} instances in {} D space...'.format(*umap_embeddings.shape))
    return assignments, soft_clusters, soft_assignments
def cluster(self, distances, metric='euclidean', allow_single_cluster=False,
            prediction_data=False, min_cluster_size=2):
    """Cluster ``distances`` with HDBSCAN using tuned hyperparameters.

    Hyperparameters are tuned once per cluster selection method ("eom" and
    "leaf") through ``utils.hyperparameter_selection`` and the best candidate
    of each is picked with ``utils.best_validity``. The method whose
    integer-truncated ``validity_score`` is larger wins; ties go to "eom".

    When ``prediction_data`` is true, ``self.soft_clusters`` is set: to
    ``None`` for ``metric == 'precomputed'``, otherwise to the membership
    vectors of the fitted model. It is left untouched when ``prediction_data``
    is false.

    Returns the ``labels_`` of the fitted HDBSCAN model.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        ## Cluster on the UMAP embeddings and return soft clusters
        methods = ("eom", "leaf")
        tuned_by_method = {
            method: utils.hyperparameter_selection(
                distances, self.threads,
                metric=metric,
                method=method,
                allow_single_cluster=allow_single_cluster,
                starting_size=min_cluster_size)
            for method in methods
        }
        best_by_method = {
            method: utils.best_validity(tuned_by_method[method])
            for method in methods
        }

        # NOTE(review): int() truncates both scores before comparing them; if
        # validity_score is a fraction this makes most comparisons a tie that
        # "eom" wins - confirm against utils.best_validity.
        eom_score = int(best_by_method["eom"]["validity_score"])
        leaf_score = int(best_by_method["leaf"]["validity_score"])
        binning_method = "eom" if eom_score >= leaf_score else "leaf"
        best = best_by_method[binning_method]

        # Keyword arguments common to both HDBSCAN configurations below.
        shared_kwargs = dict(
            algorithm='best',
            alpha=1.0,
            cluster_selection_method=binning_method,
            metric=metric,
            min_cluster_size=int(best['min_cluster_size']),
            min_samples=int(best['min_samples']),
            allow_single_cluster=allow_single_cluster,
            core_dist_n_jobs=self.threads,
        )

        if metric == 'precomputed':
            clusterer = hdbscan.HDBSCAN(approx_min_span_tree=False, **shared_kwargs)
            clusterer.fit(distances)
            if prediction_data:
                # No membership vectors are computed for precomputed distances.
                self.soft_clusters = None
        else:
            clusterer = hdbscan.HDBSCAN(
                approx_min_span_tree=True,
                gen_min_span_tree=True,
                leaf_size=40,
                prediction_data=prediction_data,
                **shared_kwargs)
            clusterer.fit(distances)
            if prediction_data:
                self.soft_clusters = hdbscan.all_points_membership_vectors(clusterer)

        return clusterer.labels_
def auto_clust(sense_data: SenseData, umap_n_neighbors=2, umap_ndim=2,
               umap_min_dist=0.1, clust_min_samples=2, print_clust=False):
    """Project sense vectors with UMAP and cluster the projection with HDBSCAN.

    Reads ``sense_vecs``, ``sense_freqs`` and ``sense_labels`` from
    ``sense_data``. With ``print_clust`` set, every cluster is printed and the
    projection is drawn as a scatter plot coloured by cluster label.

    Returns a dict with keys:
        "projection": the UMAP projection
        "sense_clusters": cluster label -> list of
            ``(index, probability, frequency, label)`` tuples ordered by
            decreasing HDBSCAN probability
        "sense_freqs": sense frequencies as an array
        "cluster_freqs": sense frequencies weighted by soft membership and
            summed per cluster (empty array when every label is negative)
        "memberships": soft membership vectors (empty array when every label
            is negative)
    """
    sense_vectors = sense_data.sense_vecs
    reducer = umap.UMAP(n_components=umap_ndim,
                        n_neighbors=umap_n_neighbors,
                        metric='cosine',
                        min_dist=umap_min_dist,
                        random_state=4422)
    proj = reducer.fit_transform(sense_vectors)

    clust = hdbscan.HDBSCAN(min_cluster_size=2,
                            min_samples=clust_min_samples,
                            prediction_data=True).fit(proj)
    clabels = clust.labels_
    probs = clust.probabilities_
    sfreqs = np.array(sense_data.sense_freqs)
    slabels = sense_data.sense_labels

    if np.all(clabels < 0):
        # Every label is negative: skip the soft-membership computation.
        prob_mat = np.array([])
        clust_freq = np.array([])
    else:
        prob_mat = hdbscan.all_points_membership_vectors(clust)
        clust_freq = (sfreqs[:, np.newaxis] * prob_mat).sum(0)

    sense_clusters = {}
    for clust_idx in np.unique(clabels):
        members = np.flatnonzero(clabels == clust_idx)
        # Highest-probability members first; the sort is stable for ties.
        ranked = sorted(members, key=lambda i: probs[i], reverse=True)
        entries = [(i, probs[i], sfreqs[i], slabels[i]) for i in ranked]
        sense_clusters[clust_idx] = entries
        if print_clust:
            print("-- Cluster %d --" % (clust_idx, ))
            print("\n".join(f"[{idx:2d}] {prob:.2f}({freq:3d}): {label}"
                            for idx, prob, freq, label in entries))
            print("\n")

    if print_clust:
        # A 1-D projection is drawn along a constant y of 1.
        if proj.shape[1] == 1:
            y_values = np.ones(proj.shape[0])
        else:
            y_values = proj[:, 1]
        plt.scatter(proj[:, 0], y_values, c=clust.labels_, cmap="Set1")

    return {
        "projection": proj,
        "sense_clusters": sense_clusters,
        "sense_freqs": sfreqs,
        "cluster_freqs": clust_freq,
        "memberships": prob_mat
    }
def hierarchy(self):
    """Identify clusters by sweeping HDBSCAN's minimum cluster size.

    Does nothing until the "__Identify Clusters__" button is pressed. Then 25
    evenly spaced percentages between ``self.cluster_range[0]`` and
    ``self.cluster_range[1]`` are tried as minimum cluster size (percent of
    the number of rows of ``self.sampled_embeddings``), and the fit with the
    most unique labels is retained.

    Sets ``self.min_cluster_size``, ``self.assignments``, ``self.assign_prob``
    and ``self.soft_assignments``.
    """
    if not st.button("__Identify Clusters__"):
        return

    funfacts = randfacts.getFact()
    st.info('Identifying... Here is a random fact: ' + funfacts)

    n_instances = self.sampled_embeddings.shape[0]
    # np.inf instead of np.infty: the ``infty`` alias was removed in NumPy 2.0.
    max_num_clusters = -np.inf
    retained_hierarchy = None
    self.min_cluster_size = np.linspace(self.cluster_range[0], self.cluster_range[1], 25)
    for min_c in self.min_cluster_size:
        learned_hierarchy = hdbscan.HDBSCAN(
            prediction_data=True,
            min_cluster_size=int(round(min_c * 0.01 * n_instances)),
            **HDBSCAN_PARAMS).fit(self.sampled_embeddings)
        n_unique_labels = len(np.unique(learned_hierarchy.labels_))
        # Strict '>' means ties keep the earlier (smaller min_cluster_size) fit.
        if n_unique_labels > max_num_clusters:
            max_num_clusters = n_unique_labels
            retained_hierarchy = learned_hierarchy

    self.assignments = retained_hierarchy.labels_
    self.assign_prob = hdbscan.all_points_membership_vectors(retained_hierarchy)
    self.soft_assignments = np.argmax(self.assign_prob, axis=1)
    st.info('Done assigning labels for **{}** instances ({} minutes) '
            'in **{}** D space'.format(
                self.assignments.shape,
                round(self.assignments.shape[0] / 600),
                self.sampled_embeddings.shape[1]))
    st.balloons()
def fit_transform(self, dataset: Dataset, name: str, remove_disc: bool = True) -> TopicModel:
    """Embed the dataset's count matrix with UMAP and soft-cluster it with HDBSCAN.

    :param dataset: provides the matrix to embed through ``get_count_matrix()``
    :param name: name handed to ``TopicModel.from_array``
    :param remove_disc: when true, rows UMAP reports as disconnected vertices
        are removed from the embedding before clustering
    :return: a ``TopicModel`` built from an empty topic-word matrix and the
        HDBSCAN soft membership vectors as document-topic matrix
    """
    # WARNING: setting a seed for reproducibility make the algorithm run on a single core (-> slower)
    # A falsy seed (including 0) is treated as "no seed".
    seed = get_seed() if get_seed() else None

    # https://umap-learn.readthedocs.io/en/latest/index.html
    mapper = umap.UMAP(random_state=seed, **self.u_args).fit(dataset.get_count_matrix())

    embedding = mapper.embedding_
    if remove_disc:
        # WARNING: some points might be disconnected (np.inf)
        # NOTE(review): dropping rows here presumably leaves the document-topic
        # matrix with fewer rows than the dataset has documents - confirm that
        # TopicModel consumers account for this.
        disc = umap.utils.disconnected_vertices(mapper)
        embedding = embedding[~disc, :]

    # https://hdbscan.readthedocs.io/en/latest/index.html
    clusterer = hdbscan.HDBSCAN(prediction_data=True, **self.h_args).fit(embedding)
    # clusterer.labels_ holds the hard clusters (-1 for too noisy observations);
    # they are currently not returned.
    doc_topic_matrix = hdbscan.all_points_membership_vectors(clusterer)
    topic_word_matrix = np.array([])
    return TopicModel.from_array(name, topic_word_matrix, doc_topic_matrix)
def _cluster_embeddings(self,
                        umap_embeddings: np.ndarray,
                        documents: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray]:
    """ Cluster UMAP embeddings with HDBSCAN

    The fitted model is kept on ``self.cluster_model`` and its labels are
    written to the ``Topic`` column of ``documents``.

    Arguments:
        umap_embeddings: The reduced sentence embeddings with UMAP
        documents: Dataframe with documents and their corresponding IDs

    Returns:
        documents: Updated dataframe with documents and their corresponding IDs
                   and newly added Topics
        probabilities: The soft membership vectors of the fitted model, or
                       None when ``self.calculate_probabilities`` is falsy
    """
    model = hdbscan.HDBSCAN(min_cluster_size=self.min_topic_size,
                            metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True)
    self.cluster_model = model.fit(umap_embeddings)
    documents['Topic'] = self.cluster_model.labels_

    probabilities = (
        hdbscan.all_points_membership_vectors(self.cluster_model)
        if self.calculate_probabilities
        else None
    )

    self._update_topic_size(documents)
    logger.info("Clustered UMAP embeddings with HDBSCAN")
    return documents, probabilities
def hdbscan_with_knn(data, clf, thresh=None, mink_p=1.5, mink_kwargs=None):
    """Cluster with HDBSCAN, then label confident points by 1-NN on exemplars.

    A new HDBSCAN model is fitted on a copy of ``data`` using the settings of
    ``clf``. Points whose highest soft-cluster membership reaches ``thresh``
    are labelled by a 1-nearest-neighbour classifier trained on the HDBSCAN
    exemplars; every other point gets the outlier label -1.

    This is best-effort: if any step raises an ``Exception`` every point is
    labelled 0 instead. ``KeyboardInterrupt`` and ``SystemExit`` propagate.

    :param data: DataFrame of observations; it is not modified
    :param clf: object exposing ``min_cluster_size``, ``min_samples``,
        ``metric`` and ``cluster_selection_method``
    :param thresh: minimum top membership for a point to be labelled; a falsy
        value selects ``1 / max(2, number_of_exemplar_sets)``
    :param mink_p: ``p`` passed to ``wminkowski`` when ``clf.metric`` is
        ``'wminkowski'``
    :param mink_kwargs: extra keyword arguments for ``mink_weights``; ``None``
        means no extra arguments
    :return: the ``label`` column as a Series indexed like ``data``
    """
    df = data.copy()
    mc = clf.min_cluster_size
    ms = clf.min_samples
    metric = clf.metric
    clf_method = clf.cluster_selection_method

    try:
        # run hdbscan
        if metric == 'wminkowski':
            # mink_kwargs defaults to None, which cannot be unpacked with **.
            mw = mink_weights(df, **(mink_kwargs or {}))

            def weighted_minkowski(x, y):
                return wminkowski(x, y, p=mink_p, w=mw)

            metric = weighted_minkowski

        clusterer = HDBSCAN(min_cluster_size=mc,
                            min_samples=ms,
                            prediction_data=True,
                            metric=metric,
                            cluster_selection_method=clf_method).fit(df)
        exemplar_sets = list(clusterer.exemplars_)
        thresh = thresh if thresh else 1 / max(2, len(exemplar_sets))

        # get exemplars and labels: one integer label per exemplar set
        exemplars = np.concatenate(exemplar_sets)
        labels = np.concatenate([
            np.full(len(members), fill_value=i)
            for i, members in enumerate(exemplar_sets)
        ])

        # fit knn on exemplars
        knn = KNeighborsClassifier(n_neighbors=1).fit(exemplars, labels)

        # map top soft cluster probabilities to obs
        probs = np.max(all_points_membership_vectors(clusterer), axis=1)
        df['top_prob'] = pd.Series(probs, index=df.index)

        # assign all points to outlier class (label:-1)
        df['label'] = -1

        # take all points above a prob threshold
        obs = df.top_prob >= thresh

        # predict labels from fitted knn; with nothing above the threshold
        # there is nothing to predict and every point stays an outlier
        if obs.any():
            feature_columns = df.columns.drop(['top_prob', 'label'])
            df.loc[obs, 'label'] = knn.predict(df.loc[obs, feature_columns])
    except Exception:
        # Deliberate best-effort fallback: put everything in a single class.
        df['label'] = 0

    #----------------------- TO-DO -----------------------------
    # allow batch prediction
    # -- 1. assign points below thresh to outlier class
    # -- 2. take top n% of obs by cluster prob and predict label
    # -- 3. refit knn on assigned points
    # -- 4. repeat steps 2 & 3 for remaining percentage bins
    # allow for custom distance metrics and weight in hdbscan call
    return df.label
def test_hdbscan_all_points_membership_vectors():
    """Membership vectors of the first and last sample match reference values."""
    clusterer = HDBSCAN(prediction_data=True)
    clusterer.fit(X)
    vects = all_points_membership_vectors(clusterer)

    # (row index, expected membership vector) pairs checked in this order.
    reference_rows = (
        (0, [7.86400992e-002, 2.52734246e-001, 8.38299608e-002]),
        (-1, [8.09055344e-001, 8.35882503e-002, 1.07356406e-001]),
    )
    for row_index, expected in reference_rows:
        assert_array_almost_equal(vects[row_index], np.array(expected))
def make_clusters(self, min_size=11, metric='euclidean', use_soft_clustering=True):
    """Fit an HDBSCAN model on ``self.features``.

    The fitted model is stored on ``self.clusterer``. When
    ``use_soft_clustering`` is true the model is fitted with prediction data
    and its membership vectors are stored on ``self.soft_clusters``.

    :param min_size: currently unused; the minimum cluster size comes from
        ``MIN_CLUSTER_SIZE``.
        NOTE(review): honouring it would change results whenever
        ``MIN_CLUSTER_SIZE`` differs from the default 11 - confirm the intent.
    :param metric: distance metric handed to HDBSCAN
    :param use_soft_clustering: whether to compute soft cluster memberships
    """
    print("making clusters ..")
    self.use_soft_clustering = use_soft_clustering
    self.clusterer = hd.HDBSCAN(
        min_cluster_size=MIN_CLUSTER_SIZE,
        # The caller's metric is used; it used to be ignored in favour of a
        # hard-coded 'euclidean' (which is still the default).
        metric=metric,
        p=1,
        min_samples=1,
        cluster_selection_method='leaf',
        leaf_size=MIN_CLUSTER_SIZE * 2,
        prediction_data=use_soft_clustering)
    self.clusterer.fit(self.features)
    if use_soft_clustering:
        self.soft_clusters = hd.all_points_membership_vectors(self.clusterer)
    print("finished making clusters ..")
def predict(self, dim_reduced_vecs, outlier_labels, scores, contamination,
            min_cluster_size, allow_noise):
    """Cluster with HDBSCAN and score clusters and outliers against labels.

    :param dim_reduced_vecs: data to cluster
    :param outlier_labels: ground-truth labels the predictions are scored against
    :param scores: dict updated in place with ``cluster_n``, ``homogeneity``,
        ``completeness`` and ``v_measure`` and then passed to ``get_scores``
    :param contamination: currently unused.
        NOTE(review): the outlier cut-off is the fixed 0.9 quantile of the
        outlier scores - confirm whether it should depend on this argument.
    :param min_cluster_size: forwarded to ``HDBSCAN``
    :param allow_noise: when true the hard ``labels_`` (which may contain the
        noise label) are used; otherwise every point is assigned to the
        cluster with its highest soft membership
    :return: ``(scores, outlier_scores)`` where ``scores`` is what
        ``get_scores`` returned and ``outlier_scores`` is the model's
        ``outlier_scores_``
    """
    print("Clustering ...")
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                        prediction_data=True,
                        metric="euclidean").fit(dim_reduced_vecs)
    print("Get prediction data ...")
    # NOTE(review): fitting with prediction_data=True presumably already
    # generated this data, making the call redundant - confirm before removing.
    clusterer.generate_prediction_data()

    if allow_noise:
        cluster_pred = clusterer.labels_
    else:
        memberships = all_points_membership_vectors(clusterer)
        try:
            # Every column of the membership matrix is a real cluster (there
            # is no noise column), so the argmax runs over all columns.
            cluster_pred = np.argmax(memberships, axis=1)
        except IndexError:
            # Raised when the memberships are not 2-D, so there is no axis 1.
            print(
                "Got IndexError and will not enforce cluster membership (allow noise) ..."
            )
            print(memberships)
            cluster_pred = clusterer.labels_

    # scoring
    print("Get scores ...")
    # GLOSH: flag the points above the 0.9 quantile of outlier scores as -1.
    outlier_scores = clusterer.outlier_scores_
    threshold = pd.Series(outlier_scores).quantile(0.9)
    outlier_pred = np.where(outlier_scores > threshold, -1, 1)

    cluster_n = len(np.unique(clusterer.labels_))
    homogeneity = homogeneity_score(outlier_labels, cluster_pred)
    scores["cluster_n"] = cluster_n
    scores["homogeneity"] = homogeneity
    scores["completeness"] = completeness_score(outlier_labels, cluster_pred)
    scores["v_measure"] = v_measure_score(outlier_labels, cluster_pred)
    scores = get_scores(scores, outlier_labels, outlier_pred)
    print(f"Homogeneity - {homogeneity * 100:.1f} cluster_n - {cluster_n}")
    return scores, outlier_scores
def make_clusters(self):
    """Collect per-document MFCC medians and cluster them with HDBSCAN.

    Iterates over ``self.data.subdb`` and rebuilds ``self.aggregates`` (the
    ``aggregates -> mfcc -> median`` entry of each document), ``self.artists``
    (a ``{"name": ...}`` dict per document) and ``self.track_count`` (sum of
    the documents' ``track_count``). The fitted model is stored on
    ``self.clusterer`` and its soft memberships on ``self.soft_clusters``.
    """
    self.aggregates = []
    self.artists = []
    self.track_count = 0

    for doc_id in self.data.subdb:
        document = self.data.subdb.get(doc_id)
        mfcc_median = document["aggregates"]["mfcc"]["median"]
        self.aggregates.append(mfcc_median)
        self.artists.append({"name": document["name"]})
        self.track_count = self.track_count + document["track_count"]

    feature_matrix = np.array(self.aggregates)
    self.clusterer = hd.HDBSCAN(
        min_cluster_size=3,
        metric='euclidean',
        p=1,
        min_samples=1,
        cluster_selection_method='leaf',
        leaf_size=5,
        prediction_data=True,
    )
    self.clusterer.fit(feature_matrix)
    self.soft_clusters = hd.all_points_membership_vectors(self.clusterer)
def _set_cluster_member_colors(clusterer: HDBSCAN, soft: bool = True):
    """Pick one colour per point from a 'husl' palette.

    With ``soft`` each point takes the colour of the cluster with its highest
    soft membership. Otherwise it takes the colour of its hard label, and
    points with a negative label are grey. Every colour is then desaturated by
    the point's entry in ``clusterer.probabilities_``.

    :return: ``(cluster_member_colors, color_palette)``
    """
    unique_labels = np.unique(clusterer.labels_)
    n_clusters = np.size(unique_labels)

    # In hard mode the noise label gets no palette entry of its own.
    skip_noise_entry = (not soft) and (-1 in unique_labels)
    palette_size = n_clusters - 1 if skip_noise_entry else n_clusters
    color_palette = sns.color_palette('husl', palette_size)

    if soft:
        memberships = all_points_membership_vectors(clusterer)
        cluster_colors = [color_palette[np.argmax(row)] for row in memberships]
    else:
        noise_gray = (0.5, 0.5, 0.5)
        cluster_colors = [
            noise_gray if label < 0 else color_palette[label]
            for label in clusterer.labels_
        ]

    cluster_member_colors = [
        sns.desaturate(color, strength)
        for color, strength in zip(cluster_colors, clusterer.probabilities_)
    ]
    return cluster_member_colors, color_palette
def generate_groups(utterances, embeddings, metric='euclidean'):
    """Group utterances into intents by clustering their embeddings.

    HDBSCAN is fitted on the matrix of pairwise inner products of
    ``embeddings`` and each utterance is filed under its cluster label
    (``"-1"`` collects the unlabeled ones).

    :param utterances: sequence of utterances, aligned with ``embeddings``
    :param embeddings: one embedding vector per utterance
    :param metric: metric handed to ``hdbscan.HDBSCAN``
    :return: dict with
        "intents found": number of clusters found (0 when every utterance is
            unlabeled)
        "unlabeled messages": number of utterances with label -1
        "labeled messaged": number of utterances with any other label
        "total messages": ``len(utterances)``
        "message groups": label string -> list of ``{"phrase": str}`` dicts
    """
    clusterer = hdbscan.HDBSCAN(
        metric=metric,
        min_cluster_size=5,
        min_samples=2,
        cluster_selection_method='eom',
        alpha=0.8  # TODO: The docs say this should be left alone, and keep the default of 1, but playing with it seems to help, might be different with real data.
    ).fit(np.inner(embeddings, embeddings))
    labels = clusterer.labels_

    # Per-phrase confidences are not part of the output, so no soft
    # membership vectors are computed here.
    message_groups = defaultdict(list)
    for utterance, label in zip(utterances, labels):
        message_groups[str(label)].append({"phrase": str(utterance)})

    unlabeled_messages = list(labels).count(-1)
    total_messages = len(utterances)
    return {
        # Cluster labels are 0-based, so the cluster count is max + 1; this
        # also gives 0 (not -1) when everything is noise.
        "intents found": int(labels.max()) + 1,
        "unlabeled messages": int(unlabeled_messages),
        "labeled messaged": int(total_messages - unlabeled_messages),
        "total messages": int(total_messages),
        "message groups": message_groups
    }
def all_points_membership_vectors(self):
    """Return ``hdbscan.all_points_membership_vectors`` for ``self.hdbscan``."""
    fitted_model = self.hdbscan
    return hdbscan.all_points_membership_vectors(fitted_model)
def test_hdbscan_all_points_membership_vectors():
    """With ``min_cluster_size=200`` the membership vectors must be all zeros.

    The expected value is a 1-D zero array with one entry per row of the
    model's raw prediction data (presumably because no cluster can form on
    ``X`` at that size).
    """
    clusterer = HDBSCAN(prediction_data=True, min_cluster_size=200)
    clusterer.fit(X)
    vects = all_points_membership_vectors(clusterer)
    n_samples = clusterer.prediction_data_.raw_data.shape[0]
    assert_array_equal(vects, np.zeros(n_samples))
def cluster(self):
    """
    Cluster agents based on their traces.

    Only acts when ``self.learner_params["Clustering"] == 'Grouped'``; the
    algorithm is chosen by ``self.learner_params["Cluster_type"]``:

    * ``'KMedoids'``: k-medoids on precomputed distances for every k in
      ``range(2, min(6, self.number_agents - 1))``, keeping the k with the
      best average silhouette score.
    * ``'AgglomerativeClustering'``: complete-linkage agglomerative clustering
      on precomputed distances for every k in
      ``range(2, min(7, self.number_agents - 1))``, keeping the k with the
      best average silhouette score.
    * ``'HDBScan'``: HDBSCAN directly on the traces; each agent is assigned to
      the cluster with its highest soft membership.

    The input features come from ``read_clustering_data()`` when
    ``self.learner_params["Features"] == 'Normal'`` and from
    ``read_generated_clustering_data()`` when it is ``'Advanced'``.

    Sets ``self.number_clusters`` and ``self.clusters`` and then calls
    ``assign_clusters_to_agents()`` (KMedoids) or
    ``assign_clusters_to_agents_hdbscan()`` (the other two branches). For any
    other configuration the visible code does nothing.
    """
    if self.learner_params["Clustering"] == 'Grouped':
        if self.learner_params["Cluster_type"] == 'KMedoids':
            traces = None
            scheduling_profile = None
            # NOTE(review): for any other "Features" value both stay None and
            # are passed on as-is - confirm that cannot happen.
            if self.learner_params["Features"] == 'Normal':
                # Clustering using the 11 standard features (reward, day of week, hour of the day, etc.)
                traces, scheduling_profile = self.read_clustering_data()
            elif self.learner_params["Features"] == 'Advanced':
                # Clustering using the derived features
                traces, scheduling_profile = self.read_generated_clustering_data(
                )
            dtw_days_matching_of_profiles = self.get_sorted_average_amount_activity_per_day_per_profile(
                self.agent_profiles_params)
            distances = self.pre_calculate_distances(
                traces,
                scheduling_profile,
                dtw_days_matching_of_profiles,
                norm=False)
            K_Medoids = KMedoids()
            best_k = 0
            # Sentinel lower than any score the loop can produce.
            best_score = -1000000000
            best_clusters = None
            # The range is empty when self.number_agents is below 4; best_k
            # then stays 0 and best_clusters stays None.
            for k in range(2, min(6, self.number_agents - 1)):
                clusters, curr_medoids = K_Medoids.cluster(
                    distances=distances, k=k)
                silhouette_avg = silhouette_score(distances,
                                                  clusters,
                                                  metric="precomputed")
                print(clusters)
                print(
                    "__________________________________________________________________________"
                )
                print("For n_clusters =", k,
                      "The average silhouette_score is :", silhouette_avg)
                print(
                    "__________________________________________________________________________"
                )
                # Strict '>' keeps the smallest k among equal scores.
                if silhouette_avg > best_score:
                    best_score = silhouette_avg
                    best_clusters = clusters
                    best_k = k
            print(
                "__________________________________________________________________________"
            )
            print("Best K =", best_k,
                  "The best average silhouette_score is :", best_score)
            print(
                "__________________________________________________________________________"
            )
            print(best_clusters)
            self.number_clusters = best_k
            self.clusters = best_clusters
            self.assign_clusters_to_agents()
        elif self.learner_params[
                "Cluster_type"] == 'AgglomerativeClustering':
            traces = None
            scheduling_profile = None
            if self.learner_params["Features"] == 'Normal':
                # Clustering using the 11 standard features (reward, day of week, hour of the day, etc.)
                traces, scheduling_profile = self.read_clustering_data()
            elif self.learner_params["Features"] == 'Advanced':
                # Clustering using the derived features
                traces, scheduling_profile = self.read_generated_clustering_data(
                )
            dtw_days_matching_of_profiles = self.get_sorted_average_amount_activity_per_day_per_profile(
                self.agent_profiles_params)
            # Clustering using hard clustering and precomputed distances
            distances = self.pre_calculate_distances(
                traces,
                scheduling_profile,
                dtw_days_matching_of_profiles,
                norm=False)
            best_k = 0
            # Sentinel lower than any score the loop can produce.
            best_score = -1000000000
            best_clusters = None
            for k in range(2, min(7, self.number_agents - 1)):
                # Add parameters in config
                # NOTE(review): newer scikit-learn releases renamed the
                # `affinity` keyword to `metric` - confirm against the pinned
                # version.
                clusters = AgglomerativeClustering(
                    k, affinity='precomputed',
                    linkage='complete').fit_predict(distances)
                silhouette_avg = silhouette_score(distances,
                                                  clusters,
                                                  metric="precomputed")
                print(clusters)
                print(
                    "__________________________________________________________________________"
                )
                print("For n_clusters =", k,
                      "The average silhouette_score is :", silhouette_avg)
                print(
                    "__________________________________________________________________________"
                )
                # Strict '>' keeps the smallest k among equal scores.
                if silhouette_avg > best_score:
                    best_score = silhouette_avg
                    best_clusters = clusters
                    best_k = k
            print(
                "__________________________________________________________________________"
            )
            print("Best K =", best_k,
                  "The best average silhouette_score is :", best_score)
            print(
                "__________________________________________________________________________"
            )
            print(best_clusters)
            self.number_clusters = best_k
            self.clusters = best_clusters
            # NOTE(review): this branch uses the *_hdbscan assignment helper
            # rather than assign_clusters_to_agents() - confirm that is intended.
            self.assign_clusters_to_agents_hdbscan()
        elif self.learner_params["Cluster_type"] == 'HDBScan':
            traces = None
            scheduling_profile = None
            if self.learner_params["Features"] == 'Normal':
                # Clustering using the 11 standard features (reward, day of week, hour of the day, etc.)
                traces, scheduling_profile = self.read_clustering_data()
            elif self.learner_params["Features"] == 'Advanced':
                # Clustering using the derived features
                traces, scheduling_profile = self.read_generated_clustering_data(
                )
            # Only the commented-out precomputed-distance variant below uses
            # this value; the active code in this branch does not read it.
            dtw_days_matching_of_profiles = self.get_sorted_average_amount_activity_per_day_per_profile(
                self.agent_profiles_params)
            #clustering using hard clustering and precomputed distances
            # distances = self.pre_calculate_distances(traces,
            #                                          scheduling_profile,
            #                                          dtw_days_matching_of_profiles,
            #                                          norm=False)
            # cluster_labels = hdbscan.HDBSCAN(min_cluster_size=5, metric='precomputed').fit_predict(distances)
            # most_common = collections.Counter(cluster_labels).most_common(1)[0][0]
            # for i in range(0, len(cluster_labels)):
            #     if cluster_labels[i] == -1:
            #         cluster_labels[i] = most_common
            #clustering with soft clustering to deal with outliers (can't use precomputed distances with this method)
            # prediction_data is given as the non-empty string 'true', which
            # is truthy.
            clusterer = hdbscan.HDBSCAN(min_cluster_size=5,
                                        prediction_data='true',
                                        metric='euclidean').fit(traces)
            soft_clusters = hdbscan.all_points_membership_vectors(
                clusterer)
            # Assign every agent to the cluster with its highest membership.
            cluster_labels = [np.argmax(x) for x in soft_clusters]
            print("CLUSTER LABELS" + str(cluster_labels))
            self.number_clusters = len(set(cluster_labels))
            self.clusters = cluster_labels
            self.assign_clusters_to_agents_hdbscan()
numulab = [] min_cluster_range = np.linspace(cluster_range[0], cluster_range[1], 25) for min_c in min_cluster_range: trained_classifier = hdbscan.HDBSCAN( prediction_data=True, min_cluster_size=int(round(min_c * 0.01 * umap_embeddings.shape[0])), **HDBSCAN_PARAMS).fit(umap_embeddings) numulab.append(len(np.unique(trained_classifier.labels_))) if numulab[-1] > highest_numulab: st.info( 'Adjusting minimum cluster size to maximize cluster number...') highest_numulab = numulab[-1] best_clf = trained_classifier assignments = best_clf.labels_ soft_clusters = hdbscan.all_points_membership_vectors(best_clf) soft_assignments = np.argmax(soft_clusters, axis=1) st.info( 'Done assigning labels for **{}** instances in **{}** D space'.format( *umap_embeddings.shape)) with open( os.path.join(OUTPUT_PATH, str.join('', (MODEL_NAME, '_clusters.sav'))), 'wb') as f: joblib.dump([assignments, soft_clusters, soft_assignments], f) st.balloons() if last_run: with open( os.path.join(OUTPUT_PATH, str.join('', (MODEL_NAME, '_clusters.sav'))),
random_state=42).fit_transform(tfidf_vecs) print("Local outlier factor ...") # df["predicted"] = LocalOutlierFactor( # novelty=False, metric="euclidean", contamination=d["contamination"]).fit_predict(tfidf_vecs) clusterer = HDBSCAN(min_cluster_size=10, prediction_data=True).fit(dim_reduced_vecs) threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9) df["predicted"] = np.where(clusterer.outlier_scores_ > threshold, -1, 1) df["result"] = df.apply(lambda row: get_result(row), axis=1) title = df["result"].value_counts().to_string().replace("\n", "\t") title = f"m_clus: {clusterer.min_cluster_size} n_comp: {n_comps}" + title print(classification_report(df["outlier_label"], df["predicted"])) outlier_labels = df["outlier_label"] print(all_points_membership_vectors(clusterer)) cluster_labels = clusterer.labels_ if allow_noise else np.argmax( all_points_membership_vectors(clusterer)[:, 1:], axis=1) print(f"\nHomogeneity: {homogeneity_score(outlier_labels, cluster_labels)}") crosstab = pd.crosstab(cluster_labels, outlier_labels, normalize='index') print(f"\n\n {crosstab}") crosstab_abs = pd.crosstab(cluster_labels, outlier_labels) print(f"\n\n {crosstab_abs}") if showclusters: df["result"] = cluster_labels.astype(str) fig = create_show_graph(df, "text", coords_2d=vecs_2d, color="result") fig.update_layout(title=title) fig.show() # !! get imdb % of each cluster and homogeneity score
def run_streamlit_app(): # Introduction st.title('B-SOiD') st.header('An open-source machine learning app for parsing spatio-temporal patterns.') st.subheader('Extract behavior from pose for any organism, any camera angle! ' 'Note that keeping the checkboxes unchecked when not needed speeds up the processing.') demo_videos = { "Open-field, unrestrained, wild-type (Yttri lab @ CMU)": f"{os.path.join(BSOID_BASE_PROJECT_PATH, 'demo', 'ClusteredBehavior_aligned.mp4')}", "Open-field, tethered, OCD model (Ahmari lab @ UPitt)": f"{os.path.join(BSOID_BASE_PROJECT_PATH, 'demo', 'bsoid_grm_demo.mp4')}", } vid = st.selectbox("Notable examples, please contribute!", list(demo_videos.keys()), 0) with open(demo_videos[vid], 'rb') as video_file: # video_file = open(demo_vids[vid], 'rb') video_bytes = video_file.read() st.video(video_bytes) # Load previous run? if st.sidebar.checkbox("Load previous run? This resumes training, or can " "load previously trained network for new analysis.", False): OUTPUT_PATH = st.sidebar.text_input('Enter the prior run output directory:') try: os.listdir(OUTPUT_PATH) st.markdown(f'You have selected **{OUTPUT_PATH}** as your prior run root directory.') except FileNotFoundError: st.error('No such directory') MODEL_NAME = st.sidebar.text_input('Enter your prior run variable file prefix:') if MODEL_NAME: st.markdown(f'You have selected **{MODEL_NAME}_[contents].sav** as your prior variable files.') app_model_data_filename = f'{MODEL_NAME}_data.sav' app_model_features_filename = f'{MODEL_NAME}_features.sav' app_model_predictions_filename = f'{MODEL_NAME}_predictions.sav' app_model_clusters_filename = f'{MODEL_NAME}_clusters.sav' app_model_neuralnet_filename = f'{MODEL_NAME}_neuralnet.sav' else: st.error('Please enter a prefix name for prior run variable file.') last_run = True else: last_run = False if not last_run: # # Setting things up # # # BASE_PATH, TRAIN_FOLDERS, FPS, OUTPUT_PATH and MODEL_NAME designations st.subheader('Find your data') 
st.write('The __BASE PATH__ contains multiple nested directories.') BASE_PATH = st.text_input('Enter a DLC project "BASE PATH":', DLC_PROJECT_PATH) try: os.listdir(BASE_PATH) st.markdown( f'You have selected **{BASE_PATH}** as your root directory for training/testing sub-directories.') except FileNotFoundError: st.error('No such directory') st.write('The __sub-directory(ies)__ each contain one or more .csv files. ' 'Currently supporting _2D_ and _single_ animal.') TRAIN_FOLDERS = [] num_project_path_sub_directories = int(st.number_input('How many BASE_PATH/SUB-DIRECTORIES for training?', value=3)) st.markdown(f'Your will be training on **{num_project_path_sub_directories}** csv containing sub-directories.') for i in range(num_project_path_sub_directories): training_dir = st.text_input(f'Enter path to training directory NUMBER {i+1} within base path:') try: os.listdir(f'{BASE_PATH}{training_dir}') except FileNotFoundError: st.error('No such directory') if training_dir not in TRAIN_FOLDERS: TRAIN_FOLDERS.append(training_dir) st.markdown(f'You have selected **sub-directory(ies)** *{TRAIN_FOLDERS}*.') st.write('Average __frame-rate__ for these processed .csv files. ' 'Your pose estimation will be integrated over 100ms. 
' 'For most animal behaviors, static poses per 100ms appears to capture _sufficient information_ ' 'for behavioral clustering while maintaining _high temporal resolution._') FPS = int(st.number_input('What is your frame-rate?', value=60)) st.markdown(f'Your framerate is **{FPS}** frames per second.') st.write('The __output directory__ will store B-SOID clustering _variable_ files and .csv _analyses_.') OUTPUT_PATH = st.text_input('Enter an output directory:', value=config.OUTPUT_PATH) try: os.listdir(OUTPUT_PATH) st.markdown(f'You have selected **{OUTPUT_PATH}** to store results.') except FileNotFoundError: st.error('No such directory, was there a typo or did you forget to create one?') st.write('For each run, computed variables are stored as __.sav files__. ' 'If you type in the same variable prefix as last run, your _workspace_ will be loaded.') MODEL_NAME = st.text_input('Enter a variable file name prefix:') if MODEL_NAME: st.markdown(f'You have named **{MODEL_NAME}_XXX.sav** as the variable files.') else: st.error('Please enter a name for your variable file name prefix.') # Pre-processing st.subheader('__Pre-process__ the low-likelihood estimations as a representation of occlusion coordinates.') st.text_area('', ''' Within each .csv file, the algorithm finds the best likelihood cutoff for each body part. 
''') csv_rep = glob.glob(BASE_PATH + TRAIN_FOLDERS[0] + '/*.csv') # curr_df = pd.read_csv(csv_rep[0], low_memory=False) try: curr_df = pd.read_csv(csv_rep[0], low_memory=False) except IndexError as e: st.error('CSV file(s) was/were not found.') currdf = np.array(curr_df) BP = st.multiselect('Body parts to include', [*currdf[0, 1:-1:3]], [*currdf[0, 1:-1:3]]) BODYPARTS = [] for b in BP: index = [i for i, s in enumerate(currdf[0, 1:]) if b in s] if not index in BODYPARTS: BODYPARTS += index BODYPARTS.sort() if st.button("Start pre-processing"): filenames_list, rawdata_list, data_list, perc_rect_list = [], [], [], [] for idx_folder, folder in enumerate(TRAIN_FOLDERS): # Loop through folders f = io.get_filenames_csvs_from_folders_recursively_in_dlc_project_path(folder) my_bar = st.progress(0) for j, filename in enumerate(f): curr_df = pd.read_csv(filename, low_memory=False) curr_df_filt, perc_rect = feature_engineering.adaptive_filter_LEGACY(curr_df) rawdata_list.append(curr_df) perc_rect_list.append(perc_rect) data_list.append(curr_df_filt) filenames_list.append(filename) my_bar.progress(round((j + 1) / len(f) * 100)) training_data = np.array(data_list) with open(os.path.join(OUTPUT_PATH, app_model_data_filename), 'wb') as f: # with open(os.path.join(OUTPUT_PATH, str.join('', (MODEL_NAME, '_data.sav'))), 'wb') as f: f'{MODEL_NAME}_data.sav' joblib.dump([BASE_PATH, FPS, BODYPARTS, filenames_list, rawdata_list, training_data, perc_rect_list], f) st.info(f'Processed a total of **{len(data_list)}** CSV files, ' f'and compiled into a **{training_data.shape}** data list.') st.balloons() # with open(os.path.join(OUTPUT_PATH, app_model_data_filename), 'rb') as fr: # f'{MODEL_NAME}_data.sav' BASE_PATH, FPS, BODYPARTS, filenames, rawdata_list, training_data, perc_rect_list = joblib.load(fr) if st.checkbox('Show % body part processed per file?', False): st.write('This line chart shows __% body part below file-based threshold__') subllh_percent = pd.DataFrame(perc_rect_list) 
st.bar_chart(subllh_percent) # st.write('This allows you to scroll through and visualize raw vs processed data.') # if st.checkbox("Show raw & processed data?", False): # try: # ID = int(st.number_input('Enter csv/data-list index:', min_value=1, max_value=len(rawdata_li), value=1)) # st.markdown('This is file *{}*.'.format(filenames[ID - 1])) # st.write(rawdata_li[ID - 1]) # st.write(training_data[ID - 1]) # except: # pass if last_run: with open(os.path.join(config.OUTPUT_PATH, app_model_data_filename), 'rb') as fr: BASE_PATH, FPS, BODYPARTS, filenames, rawdata_list, training_data, perc_rect_list = joblib.load(fr) if st.checkbox('Show % body part processed per file?', False): st.write('This line chart shows __% body part below file-based threshold__') subllh_percent = pd.DataFrame(perc_rect_list) st.bar_chart(subllh_percent) st.markdown(f'**_CHECK POINT_**: Processed a total of **{len(rawdata_list)}** CSV files, ' f'and compiled into a **{training_data.shape}** data list.') st.write('This allows you to scroll through and visualize raw vs processed data.') if st.checkbox("Show raw & processed data?", False): try: ID = int( st.number_input('Enter csv/data-list index:', min_value=1, max_value=len(rawdata_list), value=1)) st.write(rawdata_list[ID - 1]) st.write(training_data[ID - 1]) except Exception as e: # TODO: med: exception is too generalized. Add note or make more specific. st.error(f'Error found: {repr(e)}.') pass # Feature extraction + UMAP st.subheader('Perform __dimensionality reduction__ to improve clustering.') st.text_area('', ''' For each body part, find the distance to all others, the angular change between these distances, and its displacement over time. That is A LOT of dimensions, so reducing it is necessary. 
''') if st.button("Start dimensionality reduction"): # TODO ********************** THIS IS A TOTAL REPEAT OF ANOTHER FEATURE EXTRACTION FUNCTION ************************ win_len = np.int(np.round(0.05 / (1 / FPS)) * 2 - 1) feats = [] my_bar = st.progress(0) for m in range(len(training_data)): data_range = len(training_data[m]) dis_r, dxy_r = [], [] for r in range(data_range): if r < data_range - 1: dis = [] for c in range(0, training_data[m].shape[1], 2): dis.append(np.linalg.norm(training_data[m][r + 1, c:c + 2] - training_data[m][r, c:c + 2])) dis_r.append(dis) dxy = [] for i, j in itertools.combinations(range(0, training_data[m].shape[1], 2), 2): dxy.append(training_data[m][r, i:i + 2] - training_data[m][r, j:j + 2]) dxy_r.append(dxy) dis_r = np.array(dis_r) dxy_r = np.array(dxy_r) dis_smth = [] dxy_eu = np.zeros([data_range, dxy_r.shape[1]]) ang = np.zeros([data_range - 1, dxy_r.shape[1]]) dxy_smth = [] ang_smth = [] for l in range(dis_r.shape[1]): dis_smth.append(likelihoodprocessing.boxcar_center(dis_r[:, l], win_len)) for k in range(dxy_r.shape[1]): for kk in range(data_range): dxy_eu[kk, k] = np.linalg.norm(dxy_r[kk, k, :]) if kk < data_range - 1: b_3d = np.hstack([dxy_r[kk + 1, k, :], 0]) a_3d = np.hstack([dxy_r[kk, k, :], 0]) c = np.cross(b_3d, a_3d) ang[kk, k] = np.dot(np.dot(np.sign(c[2]), 180) / np.pi, math.atan2(np.linalg.norm(c), np.dot(dxy_r[kk, k, :], dxy_r[kk + 1, k, :]))) dxy_smth.append(likelihoodprocessing.boxcar_center(dxy_eu[:, k], win_len)) ang_smth.append(likelihoodprocessing.boxcar_center(ang[:, k], win_len)) dis_smth = np.array(dis_smth) dxy_smth = np.array(dxy_smth) ang_smth = np.array(ang_smth) feats.append(np.vstack((dxy_smth[:, 1:], ang_smth, dis_smth))) my_bar.progress(round((m + 1) / len(training_data) * 100)) st.info(f'Done extracting features from a total of **{len(training_data)}** training ' f'CSV files. 
Now reducing dimensions...') for n in range(len(feats)): feats1 = np.zeros(len(training_data[n])) for k in range(round(FPS / 10), len(feats[n][0]), round(FPS / 10)): if k > round(FPS / 10): feats1 = np.concatenate((feats1.reshape(feats1.shape[0], feats1.shape[1]), np.hstack((np.mean((feats[n][0:dxy_smth.shape[0], range(k - round(FPS / 10), k)]), axis=1), np.sum((feats[n][dxy_smth.shape[0]:feats[n].shape[0], range(k - round(FPS / 10), k)]), axis=1))).reshape(len(feats[0]), 1)), axis=1) else: feats1 = np.hstack((np.mean((feats[n][0:dxy_smth.shape[0], range(k - round(FPS / 10), k)]), axis=1), np.sum((feats[n][dxy_smth.shape[0]:feats[n].shape[0], range(k - round(FPS / 10), k)]), axis=1))).reshape(len(feats[0]), 1) if n > 0: features_10fps = np.concatenate((features_10fps, feats1), axis=1) scaler = StandardScaler() scaler.fit(feats1.T) feats1_scaled = scaler.transform(feats1.T).T features_10fps_scaled = np.concatenate((features_10fps_scaled, feats1_scaled), axis=1) else: features_10fps = feats1 scaler = StandardScaler() scaler.fit(feats1.T) feats1_scaled = scaler.transform(feats1.T).T features_10fps_scaled = feats1_scaled # scaling is important as I've seen wildly different stdev/feat between sessions features_10fps_train = features_10fps_scaled.T mem = virtual_memory() if mem.available > features_10fps_scaled.shape[0] * features_10fps_scaled.shape[1] * 32 * 100 + 256_000_000: # TODO: low: magic variables trained_umap = umap.UMAP(**UMAP_PARAMS).fit(features_10fps_train) # n_neighbors removed, moved to config else: st.info('Detecting that you are running low on available memory for this ' 'computation, setting low_memory so will take longer.') trained_umap = umap.UMAP(low_memory=True, **UMAP_PARAMS).fit(features_10fps_train) umap_embeddings = trained_umap.embedding_ st.info(f'Done non-linear transformation of **{features_10fps_train.shape[0]}** instances ' f'from **{features_10fps_train.shape[1]}** D into **{umap_embeddings.shape[1]}** D.') with 
open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'wb') as file: joblib.dump([features_10fps, features_10fps_scaled, umap_embeddings], file) st.balloons() if last_run: with open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'rb') as fr: features_10fps, features_10fps_scaled, umap_embeddings = joblib.load(fr) st.markdown(f'**_CHECK POINT_**: Done non-linear transformation of **{features_10fps_scaled.shape[1]}** instances ' f'from **{features_10fps_scaled.shape[0]}** D into **{umap_embeddings.shape[1]}** D.') # HDBSCAN st.subheader('Perform density-based clustering.') st.text_area('', ''' The following slider allows you to adjust cluster number. The preset (0.5-1.5%) works for most large (> 25k instances) datasets. It is recommended to tweak this for cluster number > 40 or < 4. ''') cluster_range = st.slider('Select range of minimum cluster size in %', 0.01, 5.0, (0.4, 1.2)) st.markdown(f'Your minimum cluster size ranges between **{cluster_range[0]}%** and **{cluster_range[1]}%**.') if st.button("Start clustering"): with open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'rb') as fr: features_10fps, features_10fps_scaled, umap_embeddings = joblib.load(fr) highest_numulab = -np.infty numulab = [] min_cluster_range = np.linspace(cluster_range[0], cluster_range[1], 25) for min_c in min_cluster_range: trained_classifier = hdbscan.HDBSCAN(prediction_data=True, min_cluster_size=int(round(min_c * 0.01 * umap_embeddings.shape[0])), **HDBSCAN_PARAMS).fit(umap_embeddings) numulab.append(len(np.unique(trained_classifier.labels_))) if numulab[-1] > highest_numulab: st.info('Adjusting minimum cluster size to maximize cluster number...') highest_numulab = numulab[-1] best_clf = trained_classifier assignments = best_clf.labels_ # TODO: med: potential for reference before assignment soft_clusters = hdbscan.all_points_membership_vectors(best_clf) soft_assignments = np.argmax(soft_clusters, axis=1) st.info('Done assigning labels for **{}** instances in 
**{}** D space'.format(*umap_embeddings.shape)) with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'wb') as f: joblib.dump([assignments, soft_clusters, soft_assignments], f) st.balloons() if last_run: with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'rb') as fr: assignments, soft_clusters, soft_assignments = joblib.load(fr) st.markdown('**_CHECK POINT_**: Done assigning labels for ' '**{}** instances in **{}** D space'.format(*umap_embeddings.shape)) if st.checkbox("Show UMAP enhanced clustering plot?", True): st.write('Below are two cluster plots.') with open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'rb') as fr: # str.join('', (MODEL_NAME, '_feats.sav')) features_10fps, features_10fps_scaled, umap_embeddings = joblib.load(fr) with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'rb') as fr: # str.join('', (MODEL_NAME, '_clusters.sav')) assignments, soft_clusters, soft_assignments = joblib.load(fr) fig1, plt1 = visuals.plot_classes_bsoidapp(umap_embeddings[assignments >= 0], assignments[assignments >= 0]) plt1.suptitle('HDBSCAN assignment') st.pyplot(fig1) st.write('The __soft__ assignment disregards noise and attempts to fit all data points to assignments ' 'based on highest probability.') fig2, plt2 = visuals.plot_classes_bsoidapp(umap_embeddings[soft_assignments >= 0], soft_assignments[soft_assignments >= 0]) plt2.suptitle('HDBSCAN soft assignment') st.pyplot(fig2) st.subheader('Based on __soft__ assignment, train a neural network to _learn_ the rules.') st.text_area('', ''' Neural network will be trained on recognizing distance, angles, and speed. 
This is for our vision in closed-loop experiments ''') if st.button("Start training a behavioral neural network"): with open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'rb') as fr: # with open(os.path.join(OUTPUT_PATH, str.join('', (MODEL_NAME, '_feats.sav'))), 'rb') as fr: f'{MODEL_NAME}_feats.sav' features_10fps, features_10fps_scaled, umap_embeddings = joblib.load(fr) with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'rb') as fr: # with open(os.path.join(OUTPUT_PATH, str.join('', (MODEL_NAME, '_clusters.sav'))), 'rb') as fr: f'{MODEL_NAME}_clusters.sav' assignments, soft_clusters, soft_assignments = joblib.load(fr) features_10fps_train, feats_test, labels_train, labels_test = train_test_split( features_10fps.T, soft_assignments.T, test_size=HOLDOUT_PERCENT, random_state=config.RANDOM_STATE) st.info( f'Training feedforward neural network on randomly partitioned {(1 - HOLDOUT_PERCENT) * 100}% of training data...') classifier = MLPClassifier(**MLP_PARAMS) classifier.fit(features_10fps_train, labels_train) clf = MLPClassifier(**MLP_PARAMS) clf.fit(features_10fps.T, soft_assignments.T) nn_assignments = clf.predict(features_10fps.T) st.info(f'Done training feedforward neural network ' f'mapping **{features_10fps.T.shape}** features to **{soft_assignments.T.shape}** assignments.') scores = cross_val_score(classifier, feats_test, labels_test, cv=CROSSVALIDATION_K, n_jobs=-1) with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'wb') as f: # str.join('', (MODEL_NAME, '_neuralnet.sav')) joblib.dump([feats_test, labels_test, classifier, clf, scores, nn_assignments], f) st.balloons() if last_run: # app_model_neuralnet_filename with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'rb') as fr: # str.join('', (MODEL_NAME, '_neuralnet.sav')) feats_test, labels_test, classifier, clf, scores, nn_assignments = joblib.load(fr) st.markdown(f'**_CHECK POINT_**: Done training feedforward neural network ' f'mapping 
**{features_10fps.T.shape}** features to **{soft_assignments.T.shape}** assignments.') if st.checkbox(f"Show confusion matrix on {HOLDOUT_PERCENT * 100}% data?", False): with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'rb') as fr: # f'{MODEL_NAME}_neuralnet.sav' feats_test, labels_test, classifier, clf, scores, nn_assignments = joblib.load(fr) np.set_printoptions(precision=2) # TODO: low: move precision setting to top? titles_options = [("Non-normalized confusion matrix", None), ("Normalized confusion matrix", 'true'), ] titlenames = ["counts", "norm"] # TODO: unused variable j = 0 # TODO: unused variable st.write('Below are two confusion matrices - top: raw counts, bottom: probability. These matrices shows ' '**true positives in diagonal**, false negatives in rows, and false positives in columns') for title, normalize in titles_options: colormap = plot_confusion_matrix(classifier, feats_test, labels_test, cmap=plt.cm.Blues, normalize=normalize) colormap.ax_.set_title(title) j += 1 st.pyplot(colormap.figure_) st.write( 'If these are **NOT satisfactory**, either _increase_ the above minimum cluster size to ' 'remove noise subgroups, or include _more data_') if st.checkbox( "Show cross-validated accuracy on randomly selected {}% held-out test set?".format(HOLDOUT_PERCENT * 100), False): st.write( 'For **overall** machine learning accuracy, a part of the error could be _cleaning up_ clustering noise.') with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'rb') as fr: feats_test, labels_test, classifier, clf, scores, nn_assignments = joblib.load(fr) fig, plt_acc = visuals.plot_accuracy_bsoidapp(scores) st.pyplot(fig) st.write( 'If this is **NOT satisfactory**, either _increase_ the above minimum cluster size to ' 'remove noise subgroups, or include _more data_') st.subheader(f'If reasonable/satisfied, you may export analyses results to {OUTPUT_PATH}') txt5 = st.text_area('Result options descriptions:', ''' Input features: basic 
statistics of these extracted pairwise distance, angle, and speed features. Feature corresponding labels: these features time-locked to the labels. Soft assignment probabilities: if interested, the label probabilities of each time point. ''') result1_options = st.multiselect('What type of results do you want to export', ['Input features', 'Feature corresponding labels', 'Soft assignment probabilities', ], ['Feature corresponding labels', ], ) if st.button('Export'): if any('Input features' in o for o in result1_options): with open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'rb') as fr: # str.join('', (MODEL_NAME, '_feats.sav')) features_10fps, features_10fps_scaled, umap_embeddings = joblib.load(fr) timestr = time.strftime("_%Y%m%d_%H%M") feat_range, feat_med, p_cts, edges = statistics.feat_dist(features_10fps) f_range_df = pd.DataFrame(feat_range, columns=['5%tile', '95%tile']) f_med_df = pd.DataFrame(feat_med, columns=['median']) f_pcts_df = pd.DataFrame(p_cts) f_pcts_df.columns = pd.MultiIndex.from_product([f_pcts_df.columns, ['prob']]) f_edge_df = pd.DataFrame(edges) f_edge_df.columns = pd.MultiIndex.from_product([f_edge_df.columns, ['edge']]) f_dist_data = pd.concat((f_range_df, f_med_df, f_pcts_df, f_edge_df), axis=1) f_dist_data.to_csv((os.path.join(OUTPUT_PATH, f'feature_distribution_10Hz{timestr}.csv')), # str.join('', ('', timestr, '.csv')) index=True, chunksize=10000, encoding='utf-8') if any('Feature corresponding labels' in o for o in result1_options): with open(os.path.join(OUTPUT_PATH, app_model_features_filename), 'rb') as fr: features_10fps, features_10fps_scaled, umap_embeddings = joblib.load(fr) with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'rb') as fr: assignments, soft_clusters, soft_assignments = joblib.load(fr) with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'rb') as fr: feats_test, labels_test, classifier, clf, scores, nn_assignments = joblib.load(fr) timestr = time.strftime("_%Y%m%d_%H%M") 
length_nm, angle_nm, disp_nm = [], [], [] for i, j in itertools.combinations(range(int(np.sqrt(features_10fps.shape[0]))), 2): # TODO: low: remove range starts at 0, redundant? length_nm.append(['distance between points:', i + 1, j + 1]) angle_nm.append(['angular change for points:', i + 1, j + 1]) for i in range(int(np.sqrt(features_10fps.shape[0]))): disp_nm.append(['displacement for point:', i + 1, i + 1]) m_columns = np.vstack((length_nm, angle_nm, disp_nm)) feat_nm_df = pd.DataFrame(features_10fps.T, columns=m_columns) umaphdb_data = np.concatenate([umap_embeddings, assignments.reshape(len(assignments), 1), soft_assignments.reshape(len(soft_assignments), 1), nn_assignments.reshape(len(nn_assignments), 1), ], axis=1) multi_index_columns = pd.MultiIndex.from_tuples([ ('UMAP embeddings', 'Dimension 1'), ('', 'Dimension 2'), ('', 'Dimension 3'), ('HDBSCAN', 'Assignment No.'), ('HDBSCAN*SOFT', 'Assignment No.'), ('Neural Net', 'Assignment No.')], names=['Type', 'Frame@10Hz'], ) umaphdb_df = pd.DataFrame(umaphdb_data, columns=multi_index_columns) training_data = pd.concat((feat_nm_df, umaphdb_df), axis=1) soft_clust_prob = pd.DataFrame(soft_clusters) # TODO: ??? !!! 
training_data.to_csv((os.path.join(OUTPUT_PATH, f'features_labels_10Hz{timestr}.csv')), # str.join('', ('features_labels_10Hz', timestr, '.csv')) index=True, chunksize=10000, encoding='utf-8') if any('Soft assignment probabilities' in o for o in result1_options): with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'rb') as fr: assignments, soft_clusters, soft_assignments = joblib.load(fr) timestr = time.strftime("_%Y%m%d_%H%M") soft_clust_prob = pd.DataFrame(soft_clusters) soft_clust_prob.to_csv((os.path.join(OUTPUT_PATH, f'soft_cluster_prob_10Hz{timestr}.csv')), index=True, chunksize=10000, encoding='utf-8') st.balloons() if st.sidebar.checkbox('Behavioral structure visual analysis?', False): with open(os.path.join(OUTPUT_PATH, app_model_clusters_filename), 'rb') as fr: assignments, soft_clusters, soft_assignments = joblib.load(fr) with open(os.path.join(OUTPUT_PATH, app_model_predictions_filename), 'rb') as fr: # str.join('', (MODEL_NAME, '_predictions.sav'))), folders, folders_list, filenames, data_new, frameshift_labels = joblib.load(fr) selected_folder = st.sidebar.selectbox('select folder', [*folders]) try: indices = [i for i, s in enumerate(folders_list) if str(selected_folder) in s] tm_c_all, tm_p_all = [], [] for idx in indices: df_runlengths, df_dur_statistics, B, df_tm, B_norm = statistics.main_app( frameshift_labels[idx], len(np.unique(soft_assignments))) tm_c_all.append(B) tm_p_all.append(B_norm) tm_c_ave = np.nanmean(tm_c_all, axis=0) tm_p_ave = np.nanmean(tm_p_all, axis=0) diag = [tm_c_ave[i][i] for i in range(len(tm_c_ave))] diag_p = np.array(diag) / np.array(diag).max() node_sizes = [50 * i for i in diag_p] A = np.matrix(tm_p_ave) # TODO: med: numpy error: the matrix subclass is not the recommended way to represent matrices or deal with linear algebra (see https://docs.scipy.org/doc/numpy/user/numpy-for-matlab-users.html). Please adjust your code to use regular ndarray. 
np.fill_diagonal(A, 0) A_norm = A / A.sum(axis=1) where_are_NaNs = np.isnan(A_norm) A_norm[where_are_NaNs] = 0 fig = plt.figure() G = nx.from_numpy_matrix(A_norm, create_using=nx.MultiDiGraph()) pos = nx.layout.spring_layout(G) edge_colors = [G[u][v][0].get('weight') for u, v in G.edges()] nodes = nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='blue', with_label=True) edges = nx.draw_networkx_edges(G, pos, node_size=node_sizes, arrowstyle='->', arrowsize=8, edge_color=edge_colors, edge_cmap=plt.cm.Blues, width=1.5) lab_pos = [pos[i] + 0.005 for i in range(len(pos))] nx.draw_networkx_labels(G, lab_pos, font_size=10) pc = mpl.collections.PatchCollection(edges, cmap=plt.cm.Blues) pc.set_array(edge_colors) plt.colorbar(pc) ax = plt.gca() ax.set_axis_off() st.pyplot(fig) except: pass else: st.subheader('Making sense of these behaviors and bulk process old/new data.') txt = st.text_area('Process flow options:', ''' Generate predictions and corresponding videos: allows you to go video by video and analyze with visuals. Bulk process all csvs: once you have subjective definitions for labels, you can run predictions with high consistency. It will prompt for types of analysis to be exported. 
''') prediction_options = st.selectbox('Select an option:', ('Generate predictions and corresponding videos', 'Bulk process all csvs')) if prediction_options == 'Generate predictions and corresponding videos': csv_dir = st.text_input('Enter the testing data sub-directory within BASE PATH:') try: os.listdir(os.path.join(DLC_PROJECT_PATH, csv_dir)) # os.listdir(str.join('', (BASE_PATH, csv_dir))) st.markdown(f'You have selected **{csv_dir}** as your csv data sub-directory.') except FileNotFoundError: st.error('No such directory') csv_file = st.selectbox('Select the csv file', sorted(os.listdir(DLC_PROJECT_PATH + csv_dir))) vid_dir = st.text_input('Enter corresponding video directory (This can be outside of BASE PATH):') try: os.listdir(vid_dir) st.markdown(f'You have selected **{vid_dir}** as your video directory.') except FileNotFoundError: st.error('No such directory') vid_file = st.selectbox('Select the video (.mp4 or .avi)', sorted(os.listdir(vid_dir))) st.markdown(f'You have selected **{vid_file}** as your video matching **{csv_file}**.') csv_filename = os.path.basename(csv_file).rpartition('.')[0] try: os.mkdir(str.join('', (DLC_PROJECT_PATH, csv_dir, '/pngs'))) # TODO: low: refactor `str.join(...)` except FileExistsError: pass try: path_to_make = os.path.join(DLC_PROJECT_PATH, csv_dir, 'pngs', csv_filename) # path_to_make = f'{DLC_PROJECT_PATH}{csv_dir}{os.path.sep}pngs{os.path.sep}{csv_filename}' if not os.path.isdir(path_to_make): os.mkdir(path_to_make) # os.mkdir(str.join('', (DLC_PROJECT_PATH, csv_dir, '/pngs', '/', csv_filename))) except FileExistsError as fee: err = f'Error: {repr(fee)}' logger.error(err) pass frame_dir = os.path.join(DLC_PROJECT_PATH, csv_dir, 'pngs', csv_filename) # frame_dir = f'{DLC_PROJECT_PATH}{csv_dir}{os.path.sep}pngs{os.path.sep}{csv_filename}' # TODO: low: refactor `str.join(...)` st.markdown(f'You have created **{frame_dir}** as your PNG directory for video file {vid_file}.') probe = ffmpeg.probe(os.path.join(vid_dir, 
vid_file)) video_info = next(s for s in probe['streams'] if s['codec_type'] == 'video') width = int(video_info['width']) height = int(video_info['height']) num_frames = int(video_info['nb_frames']) bit_rate = int(video_info['bit_rate']) avg_frame_rate = round(int( video_info['avg_frame_rate'].rpartition('/')[0]) / int(video_info['avg_frame_rate'].rpartition('/')[2])) if st.button(f'Start frame extraction for {num_frames} frames at {avg_frame_rate} frames per second'): try: (ffmpeg.input(os.path.join(vid_dir, vid_file)) .filter('fps', fps=avg_frame_rate) .output(str.join('', (frame_dir, '/frame%01d.png')), video_bitrate=bit_rate, s=str.join('', (str(int(width * 0.5)), 'x', str(int(height * 0.5)))), sws_flags='bilinear', start_number=0) .run(capture_stdout=True, capture_stderr=True)) st.info(f'Done extracting **{num_frames}** frames from video **{vid_file}**.') except ffmpeg.Error as e: print('stdout:', e.stdout.decode('utf8')) print('stderr:', e.stderr.decode('utf8')) try: os.mkdir(str.join('', (DLC_PROJECT_PATH, csv_dir, '/mp4s'))) except FileExistsError: pass try: os.mkdir(str.join('', (DLC_PROJECT_PATH, csv_dir, '/mp4s', '/', csv_filename))) except FileExistsError: pass shortvid_dir = str.join('', (DLC_PROJECT_PATH, csv_dir, '/mp4s', '/', csv_filename)) st.markdown(f'You have created **{shortvid_dir}** as your .mp4 directory for ' f'group examples from video {vid_file}.') min_time = st.number_input('Enter minimum time for bout in ms:', value=100) min_frames = round(float(min_time) * 0.001 * float(FPS)) st.markdown(f'You have entered **{min_time} ms** as your minimum duration per bout, ' f'which is equivalent to **{min_frames} frames**.' 
f'(drop this down for more group representations)') number_examples = st.slider('Select number of non-repeated examples', 1, 10, 3) st.markdown('Your will obtain a maximum of **{number_examples}** non-repeated output examples per group.') out_fps = int(st.number_input('Enter output frame-rate:', value=30)) playback_speed = float(out_fps) / float(FPS) st.markdown(f'Your have selected to view these examples at **{out_fps} FPS**, which is ' f'equivalent to **{playback_speed}X speed**.') if st.button("Predict labels and create example videos"): with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'rb') as fr: feats_test, labels_test, classifier, clf, scores, nn_assignments = joblib.load(fr) curr_df = pd.read_csv(os.path.join(str.join('', (DLC_PROJECT_PATH, csv_dir, '/', csv_file))), low_memory=False) # TODO: low: rework this pathing... curr_df_filt, perc_rect = likelihoodprocessing.adaptive_filter_LEGACY(curr_df) test_data = [curr_df_filt] labels_frameshift = [] fs_labels = frameshift_labels = [] # TODO: low: change loop variable names so each is unique (clarity) for i in range(len(test_data)): feats_new = classify.bsoid_extract_app(test_data, FPS) labels = classify.bsoid_predict_app(feats_new, clf) for m in range(len(labels)): labels[m] = labels[m][::-1] labels_pad = -1 * np.ones([len(labels), len(max(labels, key=lambda x: len(x)))]) for n, l in enumerate(labels): labels_pad[n][0:len(l)] = l labels_pad[n] = labels_pad[n][::-1] if n > 0: labels_pad[n][0:n] = labels_pad[n - 1][0:n] labels_frameshift.append(labels_pad.astype(int)) for k in range(len(labels_frameshift)): labels_fs2 = [] for l in range(math.floor(FPS / 10)): labels_fs2.append(labels_frameshift[k][l]) frameshift_labels.append(np.array(labels_fs2).flatten('F')) st.info(f'Done frameshift-predicting **{csv_file}**.') # def create_labeled_vid_app(labels, crit, counts, output_fps, video_frames_directory, output_path) -> None: videoprocessing.create_labeled_example_videos_by_label( 
frameshift_labels[0], crit=int(min_frames), counts=int(number_examples), # TODO: high: why is only the first frameshift_labels indexed? output_fps=int(out_fps), video_frames_directory=frame_dir, output_path=shortvid_dir) st.balloons() if st.checkbox(f"Show example videos? (loading it up from {shortvid_dir})", False): example_vid = st.selectbox('Select the video (.mp4 or .avi)', sorted(os.listdir(shortvid_dir))) example_vid_file = open(os.path.join(str.join('', (shortvid_dir, os.path.sep, example_vid))), 'rb') st.markdown(f'You have selected **{example_vid}** as your video from {shortvid_dir}.') video_bytes = example_vid_file.read() st.video(video_bytes) if prediction_options == 'Bulk process all CSVs': st.write('Bulk processing will take some time for large datasets.' 'This includes a lot of files, long videos, and/or high frame-rates.') TEST_FOLDERS = [] num_project_path_sub_directories: int = int(st.number_input('How many sub-directories for bulk predictions?', value=3)) st.markdown(f'Your will be predicting on **{num_project_path_sub_directories}** csv containing sub-directories.') for i in range(num_project_path_sub_directories): test_dir = st.text_input(f'Enter path to test directory number {i+1} within base path:') try: os.listdir(str.join('', (DLC_PROJECT_PATH, test_dir))) os.listdir(f'{DLC_PROJECT_PATH}{test_dir}') except FileNotFoundError: st.error('No such directory') except Exception as e: err = f'Unexpected error found: {repr(e)}' st.error(err) logger.error(err) if test_dir not in TEST_FOLDERS: TEST_FOLDERS.append(test_dir) st.markdown(f'You have selected sub-directory(ies) **{TEST_FOLDERS}**.') FPS = int(st.number_input('What is your framerate for these csvs?', value=60)) # TODO: Q: 60=magic variable? st.markdown(f'Your frame-rate is **{FPS}** frames per second for these CSVs.') st.text_area('Select the analysis of interest to you. If in doubt, select all.', ''' Predicted labels with original pose: labels written into original .csv files (time-locked). 
Behavioral bout lengths in chronological order: the behaviors and its bouts over time. Behavioral bout statistics: basic statistics for these behavioral durations. Transition matrix: behavioral transitions based on Markov Decision Process. ''') result2_options = st.multiselect('What type of results do you want to export?', ['Predicted labels with original pose', 'Behavioral bout lengths in chronological order', 'Behavioral bout statistics', 'Transition matrix'], ['Predicted labels with original pose', 'Behavioral bout statistics']) if st.button("Begin bulk csv processing, potentially a long computation"): st.write('These B-SOiD csv files will be saved in the original pose estimation csv containing ' 'folders, under sub-directory BSOID.') with open(os.path.join(OUTPUT_PATH, app_model_neuralnet_filename), 'rb') as fr: feats_test, labels_test, classifier, clf, scores, nn_assignments = joblib.load(fr) folders, filenames, data_new, perc_rect = io.import_folders_app(DLC_PROJECT_PATH, TEST_FOLDERS, BODYPARTS) labels_frameshift, labels_fs2, frameshift_labels = [], [], [] # TODO: HIGH: RE-EVALUATE VARIABLE NAMES --> `labels_fs` and `fs_labels` <------- bar = st.progress(0) for i in range(len(data_new)): feats_new = classify.bsoid_extract_app([data_new[i]], FPS) labels = classify.bsoid_predict_app(feats_new, clf) for m in range(0, len(labels)): labels[m] = labels[m][::-1] labels_pad = -1 * np.ones([len(labels), len(max(labels, key=lambda x: len(x)))]) for n, l in enumerate(labels): labels_pad[n][0:len(l)] = l labels_pad[n] = labels_pad[n][::-1] if n > 0: labels_pad[n][0:n] = labels_pad[n - 1][0:n] labels_frameshift.append(labels_pad.astype(int)) bar.progress(round((i + 1) / len(data_new) * 100)) for k in range(len(labels_frameshift)): labels_fs2 = [] for l in range(math.floor(FPS / 10)): labels_fs2.append(labels_frameshift[k][l]) frameshift_labels.append(np.array(labels_fs2).flatten('F')) st.info(f'Done frameshift-predicting a total of **{len(data_new)}** files.') filenames 
= [] all_df = [] folders_list = [] for i, folder in enumerate(TEST_FOLDERS): # Loop through folders f = io.get_filenames_csvs_from_folders_recursively_in_dlc_project_path(folder) for j, filename in enumerate(f): curr_df = pd.read_csv(filename, low_memory=False) filenames.append(filename) folders_list.append(folder) all_df.append(curr_df) for i in range(len(frameshift_labels)): timestr = time.strftime("_%Y%m%d_%H%M_") csv_filename = os.path.basename(filenames[i]).rpartition('.')[0] fs_labels_pad = np.pad(frameshift_labels[i], (0, len(all_df[i]) - 2 - len(frameshift_labels[i])), 'edge') df2 = pd.DataFrame(fs_labels_pad, columns={'B-SOiD labels'}) df2.loc[len(df2)] = '' df2.loc[len(df2)] = '' # TODO: low: duplicate? df2 = df2.shift() df2.loc[0] = '' df2 = df2.shift() df2.loc[0] = '' frames = [df2, all_df[0]] xyfs_df = pd.concat(frames, axis=1) df_runlengths, df_dur_statistics, B, df_tm, B_norm = \ statistics.main_app(frameshift_labels[i], len(np.unique(nn_assignments))) try: os.mkdir(str.join('', (DLC_PROJECT_PATH, folders_list[i], '/BSOID'))) except FileExistsError: pass if any('Predicted labels with original pose' in o for o in result2_options): xyfs_filename = os.path.join(f'{DLC_PROJECT_PATH}{folders_list[i]}', 'BSOID', f'labels_pose_{FPS}Hz{timestr}{csv_filename}.csv') # xyfs_filename = os.path.join(DLC_PROJECT_PATH + folders_list[i] + '/BSOID', str.join('', ('', str(FPS), 'Hz', timestr, csv_filename, '.csv'))) xyfs_df.to_csv(xyfs_filename, index=True, chunksize=10000, encoding='utf-8') if any('Behavioral bout lengths in chronological order' in o for o in result2_options): df_runlengths.to_csv(os.path.join( str.join('', (DLC_PROJECT_PATH, folders_list[i], '/BSOID')), str.join('', ('bout_lengths_', str(FPS), 'Hz', timestr, csv_filename, '.csv'))), index=True, chunksize=10000, encoding='utf-8') if any('Behavioral bout statistics' in o for o in result2_options): df_dur_statistics.to_csv(os.path.join( str.join('', (DLC_PROJECT_PATH, folders_list[i], '/BSOID')), 
str.join('', ('bout_stats_', str(FPS), 'Hz', timestr, csv_filename, '.csv'))), index=True, chunksize=10000, encoding='utf-8') if any('Transition matrix' in o for o in result2_options): df_tm.to_csv(os.path.join( str.join('', (DLC_PROJECT_PATH, folders_list[i], '/BSOID')), str.join('', ('transitions_mat_', str(FPS), 'Hz', timestr, csv_filename, '.csv'))), index=True, chunksize=10000, encoding='utf-8') with open(os.path.join(OUTPUT_PATH, app_model_predictions_filename), 'wb') as f: joblib.dump([folders, folders_list, filenames, data_new, frameshift_labels], f) st.balloons() return