import numpy as np
from scipy.cluster.hierarchy import linkage

# Segment and DistanceMatrix are assumed to be the project's Django models;
# triu2mat, mat2triu and dist_from_root are project utilities (a sketch of the
# first two follows the function below)


def upgma_triu(segments_ids, dm):
    """
    Perform UPGMA given a distance matrix
    :param segments_ids: an array of Segment IDs
    :param dm: ID of a DistanceMatrix
    :return: two arrays: indices is the positions of the leaves and distances the distances to the root
    """
    if dm is None:
        # No distance matrix available: return the leaves in their given order,
        # all at zero distance from the root
        return list(range(len(segments_ids))), [0] * len(segments_ids)

    all_segments_ids = np.array(
        list(Segment.objects.all().order_by('id').values_list('id',
                                                               flat=True)))

    # The matrix must have been computed over the full, sorted set of segments
    chksum = DistanceMatrix.calc_chksum(all_segments_ids)
    assert chksum == dm.chksum

    # Map the requested segment IDs to their row/column positions in the full
    # matrix, then extract the corresponding sub-matrix
    mat_idx = np.searchsorted(all_segments_ids, segments_ids)
    triu = dm.triu
    distmat = triu2mat(triu)
    distmat = distmat[:, mat_idx][mat_idx, :]
    distmat[np.isnan(distmat)] = 0
    triu = mat2triu(distmat)

    # Average linkage is exactly UPGMA
    tree = linkage(triu, method='average')
    indices, distances = dist_from_root(tree)

    return indices.tolist(), distances.tolist()
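

# triu2mat and mat2triu are used throughout these examples but defined elsewhere
# in the project. A minimal sketch of what they are assumed to do -- converting
# between a condensed upper-triangle distance vector and a square matrix -- using
# scipy's squareform (an assumption, not the project's actual implementation):
from scipy.spatial.distance import squareform


def triu2mat(triu):
    # Expand a condensed distance vector into a full symmetric matrix
    return squareform(triu)


def mat2triu(mat):
    # Condense a symmetric matrix back into its upper-triangle vector
    return squareform(mat, checks=False)
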
    def perform_symprof(self, dist_triu, processed_measures, permuter):
        """
        Split the observations into sub-clusters of manageable size, then refine
        each with recursive SIMPROF. Both stages cache their results to /tmp as
        pickle files, so an interrupted run can resume where it left off.
        """
        clusters_by_simprof_pkl_name = 'symprof-unsup-{}-{}-pca={}%.pkl'.format(
            self.feature_grouper, self.max_deviation, self.pca_explained)
        clusters_by_simprof_pkl_file = '/tmp/' + clusters_by_simprof_pkl_name

        clusters_by_cutoff_pkl_name = 'cluster-unsup-{}-{}-pca={}%.pkl'.format(
            self.feature_grouper, self.max_deviation, self.pca_explained)
        clusters_by_cutoff_pkl_file = '/tmp/' + clusters_by_cutoff_pkl_name

        # The final (stage-2) result is cached: load it instead of recomputing
        if os.path.isfile(clusters_by_simprof_pkl_file):
            with open(clusters_by_simprof_pkl_file, 'rb') as f:
                saved = pickle.load(f)
                clusters_by_symprof = saved['clusters_by_symprof']
        else:
            nobs = processed_measures.shape[0]
            distmat = triu2mat(dist_triu)

            min_cluster_count = 10
            max_cluster_size = 1000

            # Stage 1: cut the observations into sub-clusters of bounded size,
            # loading from cache if this step has already been done
            if os.path.isfile(clusters_by_cutoff_pkl_file):
                with open(clusters_by_cutoff_pkl_file, 'rb') as f:
                    saved = pickle.load(f)
                    sub_clusters = saved['sub_clusters']
            else:
                original_cluster = np.arange(nobs)
                sub_clusters = []
                divide_clusters(distmat, original_cluster, max_cluster_size,
                                sub_clusters, min_cluster_count)

                with open(clusters_by_cutoff_pkl_file, 'wb') as f:
                    pickle.dump(dict(sub_clusters=sub_clusters), f)

            # Stage 2: refine each sub-cluster with recursive SIMPROF
            clusters_by_symprof = []
            for x, sub_cluster in enumerate(sub_clusters, 1):
                print('========================={}/{}========================'.
                      format(x, len(sub_clusters)))
                recursive_simprof(processed_measures,
                                  permuter,
                                  sub_cluster,
                                  clusters_by_symprof,
                                  min_cluster_size=10,
                                  max_deviation=self.max_deviation,
                                  is_structural=self.structural_checker)

            with open(clusters_by_simprof_pkl_file, 'wb') as f:
                pickle.dump(dict(clusters_by_symprof=clusters_by_symprof), f)

        return clusters_by_symprof
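

# The load-or-compute-then-dump pattern above appears twice in perform_symprof.
# A small hypothetical helper (not part of the project) could factor it out;
# a sketch only:
import os
import pickle


def load_or_compute(pkl_file, compute):
    # Return the pickled result if the cache file exists; otherwise compute,
    # cache and return it
    if os.path.isfile(pkl_file):
        with open(pkl_file, 'rb') as f:
            return pickle.load(f)
    result = compute()
    with open(pkl_file, 'wb') as f:
        pickle.dump(result, f)
    return result
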
def calc_class_dist_by_adjacency(adjacency_mat,
                                 syl_label_enum_arr,
                                 return_triu=False,
                                 metric='euclidean'):
    """
    Currently this distmat contains reversed distance, e.g a pair (A,B) has high "distance" if they're found adjacent
    to each other often -- so we need to reverse this.
    :param adjacency_mat:
    :param syl_label_enum_arr:
    :param return_triu:
    :param metric:
    :return:
    """
    max_distance = np.max(adjacency_mat)
    adjacency_mat = max_distance - adjacency_mat
    adjacency_mat[np.where(np.isinf(adjacency_mat))] = max_distance + 1

    # To stop a few highly repeated pairs from dominating the distance matrix,
    # convert to logarithmic scale
    adjacency_mat = np.log10(adjacency_mat)

    counter = Counter(syl_label_enum_arr)
    nlabels = len(counter)
    frequencies = np.array([counter[i] for i in range(nlabels)])

    # Normalise each row by how frequently its label occurs
    adjacency_mat_fw_norm = adjacency_mat / frequencies[:, None]
    # adjacency_mat_bw_norm = adjacency_mat / frequencies
    # coordinates = np.concatenate((adjacency_mat_fw_norm, adjacency_mat_bw_norm), axis=1)

    coordinates = adjacency_mat_fw_norm
    dist_triu = distance.pdist(coordinates, metric)
    if return_triu:
        return dist_triu

    distmat = triu2mat(dist_triu)
    return distmat
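

# A hedged usage sketch of calc_class_dist_by_adjacency, with labels and counts
# invented for illustration: count how often consecutive syllable labels occur
# together, then turn the counts into a class-level distance matrix. Note that
# the most frequent pair reverses to 0 and log10(0) is -inf, so real inputs are
# presumably scaled to avoid that.
import numpy as np

syl_label_enum_arr = np.array([0, 1, 0, 2, 1, 0, 2, 2, 1, 0])
nlabels = 3
adjacency_mat = np.zeros((nlabels, nlabels))
for a, b in zip(syl_label_enum_arr[:-1], syl_label_enum_arr[1:]):
    adjacency_mat[a, b] += 1

distmat = calc_class_dist_by_adjacency(adjacency_mat, syl_label_enum_arr)
print(distmat.shape)  # (3, 3)
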
    def handle(self, source, matfile, niters, csv_filename, *args, **options):
        # source selects which representation to evaluate: t-SNE coordinates
        # ('tsne'), raw features ('raw') or z-scored features ('norm')
        assert source in ['tsne', 'raw', 'norm']

        saved = DotMap(loadmat(matfile))
        sids = saved.sids.ravel()
        clusters = saved.clusters
        dataset = saved.dataset
        meas = zscore(dataset)
        labels = saved.labels
        # loadmat returns fixed-width, space-padded strings; an all-blank string
        # means the syllable is unlabelled
        haslabel_ind = np.where(labels != '                              ')[0]

        labels = labels[haslabel_ind]
        labels = np.array([x.strip() for x in labels])
        sids = sids[haslabel_ind]
        clusters = clusters[haslabel_ind, :]

        unique_labels, enum_labels = np.unique(labels, return_inverse=True)
        nlabels = len(unique_labels)

        data_sources = {'tsne': clusters, 'raw': dataset, 'norm': meas}

        data = data_sources[source]
        disttriu = pdist(data)
        distmat = triu2mat(disttriu)

        label_prediction_scores = [0] * niters
        label_hitss = [0] * niters
        label_missess = [0] * niters
        label_hitrates = np.empty((niters, len(unique_labels)))
        label_hitrates[:] = np.nan

        # Keep 90% of the syllables for training in each iteration
        num_left_in = int(len(sids) * 0.9)

        if not csv_filename:
            csv_filename = 'knn_by_{}.csv'.format(source)

        with open(csv_filename, 'w', encoding='utf-8') as f:
            f.write('Label prediction mean\tstdev\t{}\n'.format(
                '\t'.join(unique_labels)))
            scrambled_syl_idx = np.arange(len(sids), dtype=int)

            bar = Bar('Running knn...', max=niters)
            for iteration in range(niters):
                np.random.shuffle(scrambled_syl_idx)
                train_syl_idx = scrambled_syl_idx[:num_left_in]

                train_y = enum_labels[train_syl_idx]
                trained_distmat = distmat[train_syl_idx, :][:, train_syl_idx]

                label_prediction_score, label_hits, label_misses = k_nearest(
                    trained_distmat, train_y, nlabels, 1, 1)

                label_prediction_scores[iteration] = label_prediction_score
                label_hitss[iteration] = label_hits
                label_missess[iteration] = label_misses

                label_hitrate = label_hits / (label_hits +
                                              label_misses).astype(float)
                label_hitrates[iteration, :] = label_hitrate

                bar.next()
            bar.finish()

            f.write('{}\t{}\t{}\n'.format(
                np.nanmean(label_prediction_scores),
                np.nanstd(label_prediction_scores),
                '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
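

# k_nearest above is a project helper. A minimal sketch of what a leave-one-out
# 1-NN accuracy on a precomputed distance matrix could look like (simplified;
# the real helper also reports per-label hits and misses):
import numpy as np


def knn1_accuracy(distmat, y):
    # Predict each point's label from its nearest other point
    d = distmat.astype(float).copy()
    np.fill_diagonal(d, np.inf)  # a point must not be its own neighbour
    nearest = np.argmin(d, axis=1)
    return np.mean(y[nearest] == y)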