def upgma_triu(segments_ids, dm):
    """
    Perform UPGMA (average-linkage clustering) on the part of a stored distance
    matrix that corresponds to the given segments.

    :param segments_ids: an array of Segment IDs to cluster
    :param dm: a DistanceMatrix object (its ``chksum`` and ``triu`` attributes are
               read), or None when no distance matrix is available
    :return: when dm is given, two lists: indices is the positions of the leaves and
             distances the distances to the root (both produced by dist_from_root).
             When dm is None, a single list of zeros with one entry per requested
             segment is returned instead.
    """
    # Bail out before touching the database: nothing below is needed for the
    # fallback value.
    if dm is None:
        return [0] * len(segments_ids)

    all_segments_ids = np.array(
        list(Segment.objects.all().order_by('id').values_list('id', flat=True)))

    # The stored matrix is only usable if it was built from exactly the set of
    # segments that exists now.
    chksum = DistanceMatrix.calc_chksum(all_segments_ids)
    assert chksum == dm.chksum, \
        'DistanceMatrix checksum does not match the current set of Segment IDs'

    # Position of each requested ID inside the sorted list of all IDs.
    # NOTE(review): searchsorted does not verify membership -- this assumes every
    # entry of segments_ids exists in the Segment table; confirm against callers.
    mat_idx = np.searchsorted(all_segments_ids, segments_ids)

    # Expand to a square matrix, keep only the requested rows/columns, and
    # replace missing distances by 0 before condensing again for linkage().
    distmat = triu2mat(dm.triu)
    distmat = distmat[:, mat_idx][mat_idx, :]
    distmat[np.isnan(distmat)] = 0
    triu = mat2triu(distmat)

    tree = linkage(triu, method='average')
    indices, distances = dist_from_root(tree)
    return indices.tolist(), distances.tolist()
def perform_symprof(self, dist_triu, processed_measures, permuter):
    """
    Cluster the observations with recursive_simprof, caching both stages on disk.

    Two pickle files under /tmp are used, both keyed by ``self.feature_grouper``,
    ``self.max_deviation`` and ``self.pca_explained``:

    * the final result (``clusters_by_symprof``) -- if present it is returned as is;
    * the intermediate ``sub_clusters`` produced by divide_clusters.

    :param dist_triu: condensed distance matrix; expanded with triu2mat when the
                      sub-clusters have to be computed
    :param processed_measures: 2D array with one row per observation
    :param permuter: passed through unchanged to recursive_simprof
    :return: the list filled in by recursive_simprof (or loaded from the cache)
    """
    cache_key = (self.feature_grouper, self.max_deviation, self.pca_explained)
    clusters_by_simprof_pkl_file = \
        '/tmp/' + 'symprof-unsup-{}-{}-pca={}%.pkl'.format(*cache_key)
    clusters_by_cutoff_pkl_file = \
        '/tmp/' + 'cluster-unsup-{}-{}-pca={}%.pkl'.format(*cache_key)

    # NOTE(security): pickle.load runs arbitrary code embedded in the file, and
    # these paths are predictable names in /tmp. Only safe when other local
    # users are trusted.
    if os.path.isfile(clusters_by_simprof_pkl_file):
        with open(clusters_by_simprof_pkl_file, 'rb') as f:
            return pickle.load(f)['clusters_by_symprof']

    min_cluster_count = 10
    max_cluster_size = 1000

    if os.path.isfile(clusters_by_cutoff_pkl_file):
        with open(clusters_by_cutoff_pkl_file, 'rb') as f:
            sub_clusters = pickle.load(f)['sub_clusters']
    else:
        # The square matrix is only needed to split the data, so build it here
        # rather than unconditionally.
        nobs = processed_measures.shape[0]
        distmat = triu2mat(dist_triu)
        original_cluster = np.arange(nobs)
        sub_clusters = []
        # divide_clusters appends its results to sub_clusters in place.
        divide_clusters(distmat, original_cluster, max_cluster_size, sub_clusters,
                        min_cluster_count)
        with open(clusters_by_cutoff_pkl_file, 'wb') as f:
            pickle.dump(dict(sub_clusters=sub_clusters), f)

    clusters_by_symprof = []
    total = len(sub_clusters)
    for position, sub_cluster in enumerate(sub_clusters, 1):
        print('========================={}/{}========================'.
              format(position, total))
        # recursive_simprof appends its results to clusters_by_symprof in place.
        recursive_simprof(processed_measures, permuter, sub_cluster, clusters_by_symprof,
                          min_cluster_size=10, max_deviation=self.max_deviation,
                          is_structural=self.structural_checker)

    with open(clusters_by_simprof_pkl_file, 'wb') as f:
        pickle.dump(dict(clusters_by_symprof=clusters_by_symprof), f)

    return clusters_by_symprof
def calc_class_dist_by_adjacency(adjacency_mat, syl_label_enum_arr, return_triu=False, metric='euclidean'):
    """
    Turn an adjacency matrix between classes into pairwise distances between classes.

    The incoming matrix is "reversed": a pair (A,B) has a high value when the two are
    often found next to each other. It is flipped by subtracting it from its maximum,
    put on a log10 scale, each row is divided by the number of occurrences of the
    corresponding label, and the rows are then compared with ``pdist``.

    NOTE(review): after the flip the largest original entry becomes 0, so log10 yields
    -inf for it (the inf replacement happens before the log, not after) -- confirm
    whether this is intended.

    :param adjacency_mat: 2D array, one row per class label. Not modified.
    :param syl_label_enum_arr: iterable of enumerated labels; label i is expected to
                               be the integer i, for i in 0..number of distinct labels - 1
    :param return_triu: if True return the condensed distances from pdist,
                        otherwise expand them with triu2mat
    :param metric: metric name forwarded to pdist
    :return: condensed distance array or square distance matrix, see return_triu
    """
    ceiling = np.max(adjacency_mat)

    # Flip so that frequently adjacent pairs end up with a small value.
    flipped = ceiling - adjacency_mat
    flipped[np.isinf(flipped)] = ceiling + 1

    # Logarithmic scale keeps a few highly repeated pairs from dominating
    # the whole distance matrix.
    log_scaled = np.log10(flipped)

    # Occurrence count of each label, ordered by label number.
    occurrences = Counter(syl_label_enum_arr)
    per_label_count = np.array([occurrences[label] for label in range(len(occurrences))])

    # Row-wise normalisation by label frequency gives the coordinates of each class.
    coordinates = log_scaled / per_label_count[:, np.newaxis]

    condensed = distance.pdist(coordinates, metric)
    return condensed if return_triu else triu2mat(condensed)
def handle(self, source, matfile, niters, csv_filename, *args, **options):
    """
    Repeatedly score label prediction with k_nearest on random 90% subsets of the
    labelled rows and write the summary to a tab-separated file.

    :param source: which data to compute distances from: 'tsne' (``clusters`` in the
                   mat file), 'raw' (``dataset``) or 'norm' (z-scored ``dataset``)
    :param matfile: path of a .mat file containing ``sids``, ``clusters``,
                    ``dataset`` and ``labels``
    :param niters: number of random subsets to evaluate
    :param csv_filename: output path; defaults to 'knn_by_<source>.csv' when empty
    :return: None. One header line and one result line are written to the file.
    """
    assert source in ['tsne', 'raw', 'norm']

    saved = DotMap(loadmat(matfile))
    sids = saved.sids.ravel()
    clusters = saved.clusters
    dataset = saved.dataset
    # Normalise over all rows first so the z-scores do not depend on which
    # rows happen to be labelled.
    meas = zscore(dataset)
    labels = saved.labels

    # Keep only the rows that carry a label.
    haslabel_ind = np.where(labels != ' ')[0]
    labels = labels[haslabel_ind]
    labels = np.array([x.strip() for x in labels])
    sids = sids[haslabel_ind]
    clusters = clusters[haslabel_ind, :]
    # dataset/meas must be filtered the same way, otherwise the rows of the
    # distance matrix no longer line up with enum_labels for 'raw' and 'norm'.
    # NOTE(review): assumes dataset has one row per entry of labels, like clusters.
    dataset = dataset[haslabel_ind, :]
    meas = meas[haslabel_ind, :]

    unique_labels, enum_labels = np.unique(labels, return_inverse=True)
    nlabels = len(unique_labels)

    data_sources = {'tsne': clusters, 'raw': dataset, 'norm': meas}
    data = data_sources[source]
    distmat = triu2mat(pdist(data))

    label_prediction_scores = [0] * niters
    # Rows that are never filled stay NaN and are ignored by nanmean below.
    label_hitrates = np.full((niters, nlabels), np.nan)

    num_left_in = int(len(sids) * 0.9)

    if not csv_filename:
        csv_filename = 'knn_by_{}.csv'.format(source)

    with open(csv_filename, 'w', encoding='utf-8') as f:
        f.write('Label prediction mean\tstdev\t{}\n'.format(
            '\t'.join(unique_labels)))

        # np.int / np.float were removed in NumPy 1.24; the builtins are the
        # documented replacement.
        scrambled_syl_idx = np.arange(len(sids), dtype=int)

        bar = Bar('Running knn...', max=niters)
        for iteration in range(niters):
            # Draw a fresh random 90% subset for every iteration.
            np.random.shuffle(scrambled_syl_idx)
            train_syl_idx = scrambled_syl_idx[:num_left_in]
            train_y = enum_labels[train_syl_idx]
            trained_distmat = distmat[train_syl_idx, :][:, train_syl_idx]

            label_prediction_score, label_hits, label_misses = k_nearest(
                trained_distmat, train_y, nlabels, 1, 1)

            label_prediction_scores[iteration] = label_prediction_score
            # A label with neither hits nor misses yields NaN here, which the
            # nanmean below skips.
            label_hitrates[iteration, :] = \
                label_hits / (label_hits + label_misses).astype(float)
            bar.next()
        bar.finish()

        f.write('{}\t{}\t{}\n'.format(
            np.nanmean(label_prediction_scores),
            np.nanstd(label_prediction_scores),
            '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))