Example #1
 def flat_clusters(self, n=8, init=1, criterion='maxclust'):
     """
     Returns flat clusters from the linkage matrix :Z:
     """
     if criterion == 'distance':
         self.T = hierarchy.fcluster(self.Z, init, criterion='distance')
         a = 0
         while a < 20:
             if self.T.max() < n:
                 init = init - 0.02
                 a += 1
             elif self.T.max() > n:
                 init = init + 0.02
                 a += 1
             else:
                 self.L, self.M = hierarchy.leaders(self.Z, self.T)
                 return self.T
             self.T = hierarchy.fcluster(self.Z, init, criterion='distance')
         self.L, self.M = hierarchy.leaders(self.Z, self.T)
         return self.T
     elif criterion == 'inconsistent':
         # fcluster requires a threshold t; reuse init as the inconsistency cutoff
         self.T = hierarchy.fcluster(self.Z, init, criterion='inconsistent')
         self.L, self.M = hierarchy.leaders(self.Z, self.T)
         return self.T
     elif criterion == 'maxclust':
         self.T = hierarchy.fcluster(self.Z, t=n, criterion='maxclust')
         self.L, self.M = hierarchy.leaders(self.Z, self.T)
         return self.T
     else:
         print('Criterion not implemented')
         return 0
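
For reference, a minimal self-contained sketch of the fcluster/leaders pairing this method wraps (assumptions on my part: synthetic 2-D data and Ward linkage):

import numpy as np
from scipy.cluster import hierarchy

X = np.random.rand(30, 2)                             # hypothetical data
Z = hierarchy.linkage(X, method='ward')
T = hierarchy.fcluster(Z, t=4, criterion='maxclust')  # flat labels 1..4
L, M = hierarchy.leaders(Z, T)                        # L: leader node ids, M: their labels
print(L, M)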
Example #2
def optMDL(df):
    """Choose the cluster count (1..10) minimizing an MDL-style description length."""
    Z = getDist(df)
    tree = sc.to_tree(Z, rd=True)[1]
    minMDL = np.inf  # smallest description length seen so far
    optK = 0
    desLength = 0
    DList = []
    for n_cluster in range(1, 11, 1):  #range(df.shape[0]+1)
        N = fcluster(Z, n_cluster, criterion='maxclust')
        L, M = sc.leaders(Z, N)
        leaders = list(L)
        print(leaders)
        leafDict = {}

        for node in tree:
            if node.get_id() in leaders:
                key = node.get_id()

                if node.get_count() > 1:

                    dist = getleafdict(node)
                else:
                    dist = {key: 0}

                leafDict[key] = dist

        desLength = binning(leafDict) + n_cluster * np.log2(df.shape[0])
        DList.append(desLength)

        if desLength < minMDL:
            minMDL = desLength
            optK = n_cluster
    return optK, minMDL, DList
Example #3
 def test_leaders_single(self):
     # Tests leaders using a flat clustering generated by single linkage.
     X = hierarchy_test_data.Q_X
     Y = pdist(X)
     Z = linkage(Y)
     T = fcluster(Z, criterion='maxclust', t=3)
     Lright = (np.array([53, 55, 56]), np.array([2, 3, 1]))
     L = leaders(Z, T)
     assert_equal(L, Lright)
Example #4
def _hierarchical_clustering_post(table,
                                  model,
                                  num_clusters,
                                  cluster_col='prediction'):
    Z = model['model']
    mode = model['input_mode']
    if mode == 'matrix':
        distance_matrix = model['dist_matrix']
    out_table = model['linkage_matrix']

    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    if mode == 'original':
        prediction_table = table.copy()
    elif mode == 'matrix':
        prediction_table = distance_matrix
    prediction_table[cluster_col] = predict

    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])

    clusters_info_table = pd.DataFrame([])
    clusters_info_table[cluster_col] = M
    clusters_info_table['name_of_clusters'] = which_cluster
    clusters_info_table = clusters_info_table.sort_values(cluster_col)
    cluster_count = np.bincount(prediction_table[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    clusters_info_table['num_of_entities'] = list(cluster_count)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(
        strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |## Clusters Information
    |
    |{clusters_info_table}
    |
    """.format(display_params=dict2MD(model['parameters']),
               clusters_info_table=pandasDF2MD(clusters_info_table))))

    model = _model_dict('hierarchical_clustering_post')
    model['clusters_info'] = clusters_info_table
    model['_repr_brtc_'] = rb.get()

    return {'out_table': prediction_table, 'model': model}
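
The Z[:, 0] / Z[:, 1] lookup above works because every cluster except the final root appears exactly once in the first two columns of a linkage matrix. A minimal sketch (assuming synthetic data) illustrating this:

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, leaders

Z = linkage(np.random.rand(10, 2), method='average')
T = fcluster(Z, t=3, criterion='maxclust')
L, M = leaders(Z, T)
for leader in L:
    # each leader id occurs in exactly one merge row (none if it is the root)
    rows = np.where(Z[:, :2] == leader)[0]
    print(leader, M[L == leader][0], rows)

Note that the root node never appears in those columns, so code that allows a single flat cluster has to special-case it (as Example #10 below does).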
Example #5
def _hierarchical_clustering_post(table, model, num_clusters, cluster_col='prediction'):
    Z = model['model']
    input_cols = model['input_cols']
    params = model['parameters']
    out_table = model['outtable']
    predict = fcluster(Z, t=num_clusters, criterion='maxclust')
    out_table2 = table.copy()
    out_table2[cluster_col] = predict
    
    L, M = leaders(Z, predict)
    which_cluster = []
    for leader in L:
        if leader in Z[:, 0]:
            select_indices = np.where(Z[:, 0] == leader)[0][0]
            which_cluster.append(out_table['joined_column1'][select_indices])
        elif leader in Z[:, 1]:
            select_indices = np.where(Z[:, 1] == leader)[0][0]
            which_cluster.append(out_table['joined_column2'][select_indices])
    
    out_table3 = pd.DataFrame([])
    out_table3[cluster_col] = M
    out_table3['name_of_clusters'] = which_cluster
    out_table3 = out_table3.sort_values(cluster_col)
    cluster_count = np.bincount(out_table2[cluster_col])
    cluster_count = cluster_count[cluster_count != 0]
    out_table3['num_of_entities'] = list(cluster_count)
    
    rb = ReportBuilder()
    rb.addMD(strip_margin("""### Hierarchical Clustering Post Process Result"""))
    rb.addMD(strip_margin("""
    |### Parameters
    |
    |{display_params}
    |
    |## Clusters Information
    |
    |{out_table3}
    |
    """.format(display_params=dict2MD(params), out_table3=pandasDF2MD(out_table3))))

    model = _model_dict('hierarchical_clustering_post')
    model['report'] = rb.get()
    
    return {'out_table2' : out_table2, 'model': model}
Example #6
def _find_leaf_indxs_and_fig_posns(Z, ddata, ax):
    """
    Plot a dendrogram into axes `ax` and return, for each leaf cluster, the
    indices of the items that belong to that cluster
    """
    # find the lowest link
    y_merges = np.array(ddata["dcoord"])
    d_max = np.min(y_merges[y_merges > 0.0])

    d2 = Z[:, 2][np.argwhere(Z[:, 2] == d_max)[0][0] - 1]

    T = hc.fcluster(Z, t=d2, criterion="distance")
    L, M = hc.leaders(Z, T)

    assert set(L) == set(ddata["leaves"])

    # get the actual leaf from the indices (these were set by providing the
    # `leaf_label_func` above)
    leaf_indices_from_labels = np.array(
        [int(lab.get_text()) for lab in ax.get_xticklabels()]).tolist()

    ax.set_xticklabels(np.arange(len(leaf_indices_from_labels)))

    # work out which leaf each item (image) belongs to
    mapping = dict(zip(M, L))
    leaf_mapping = np.array(list(map(lambda i: mapping[i], T)))

    # counts per leaf
    # [(n, sum(leaf_mapping == n)) for n in L]

    tile_idxs_per_cluster = {}
    for tile_id, leaf_id in enumerate(leaf_mapping):
        cluster_id = leaf_indices_from_labels.index(leaf_id)
        cluster_tile_idxs = tile_idxs_per_cluster.setdefault(cluster_id, [])
        cluster_tile_idxs.append(tile_id)

    return tile_idxs_per_cluster
Example #7
def hierarchial_sentences(X, **kwargs):
    '''Perform hierarchical clustering on a vector of sentences.'''

    matrix = tfidf_matrix(X, **kwargs)
    # hierarchial clustering
    linkage     = sch.linkage(matrix, method = 'complete')
    cutoff      = kwargs.get('cutoff_coef', 0.45)*max(linkage[:,2])
    # create the plot
    fig = pylab.figure()
    axdendro  = fig.add_axes([0.09,0.1,0.2,0.8])

    dendro = sch.dendrogram(linkage, orientation='right', color_threshold=cutoff)

    axdendro.set_xticks([])
    axdendro.set_yticks([])

    # extract the indices
    indices = dendro['leaves']

    matrix = matrix[indices,:]
    matrix = matrix[:,indices]

    axmatrix = fig.add_axes([0.3,0.1,0.6,0.8])

    im = axmatrix.matshow(matrix, aspect='auto', origin='lower')
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])

    axcolor = fig.add_axes([0.91,0.1,0.02,0.8])
    pylab.colorbar(im, cax=axcolor)

    # flatten the clusters
    flat_clusters = sch.fcluster(linkage, cutoff, 'distance')
    leaders       = sch.leaders(linkage, flat_clusters)

    return {'fig': fig, 'flat': flat_clusters, 'leaders': leaders[1]}
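
tfidf_matrix is not defined in this example; since the code reorders both rows and columns of the returned matrix by the dendrogram leaves, it presumably yields a square document-document matrix. A plausible stand-in (an assumption, using scikit-learn):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_matrix(X, **kwargs):
    # hypothetical helper: square document-document cosine-similarity
    # matrix built from tf-idf vectors of the input sentences
    tfidf = TfidfVectorizer().fit_transform(X)
    return cosine_similarity(tfidf)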
Example #8
def dendrogram(da_embeddings,
               n_clusters_max=14,
               debug=False,
               ax=None,
               n_samples=10,
               show_legend=False,
               label_clusters=False,
               return_clusters=False,
               color="black",
               sampling_method="random",
               linkage_method="ward",
               **kwargs):
    """
    Additional kwargs will be passed to scipy.cluster.hierarchy.dendrogram
    """

    tile_dataset = ImageSingletDataset(
        data_dir=da_embeddings.data_dir,
        tile_type=da_embeddings.tile_type,
        stage=da_embeddings.stage,
    )

    if ax is None:
        fig, ax = plt.subplots(figsize=(14, 3))
    else:
        fig = ax.figure

    Z = hc.linkage(
        y=da_embeddings,
        method=linkage_method,
    )

    if color is not None:
        kwargs["link_color_func"] = lambda k: color

    # we want to label the leaf by the index of the leaf node, at least
    # initially. Below we will change the labels to have the count in each
    # leaf, but we don't know that number yet
    leaf_label_func = lambda i: str(i)
    kwargs["leaf_label_func"] = leaf_label_func

    ddata = hc.dendrogram(Z=Z,
                          truncate_mode="lastp",
                          p=n_clusters_max,
                          get_leaves=True,
                          **kwargs)

    if debug:
        for ii in range(len(ddata["icoord"])):

            bl, br = list(zip(ddata["icoord"][ii], ddata["dcoord"][ii]))[
                0::3]  # second and third are top left and right corners
            ax.scatter(*bl, marker="s", label=ii, s=100)
            ax.scatter(*br, marker="s", label=ii, s=100)

    # find the lowest link
    y_merges = np.array(ddata["dcoord"])
    d_max = np.min(y_merges[y_merges > 0.0])

    d2 = Z[:, 2][np.argwhere(Z[:, 2] == d_max)[0][0] - 1]

    if debug:
        plt.axhline(d_max, linestyle="--", color="grey")
        ax.legend()

    T = hc.fcluster(Z, t=d2, criterion="distance")
    L, M = hc.leaders(Z, T)

    assert set(L) == set(ddata["leaves"])

    # getting leaf locations
    # the order in `L` (leaders) above unfortunately is *not* same as the order
    # of points in icoord so instead we pick up the order from the actual
    # labels used
    bl_pts = np.array([
        np.asarray(ddata["icoord"])[:, 0],  # x at bottom-right corner
        np.asarray(ddata["dcoord"])[:, 0],  # y at bottom-right corner
    ])
    br_pts = np.array([
        np.asarray(ddata["icoord"])[:, -1],  # x at bottom-right corner
        np.asarray(ddata["dcoord"])[:, -1],  # y at bottom-right corner
    ])

    leaf_pts = np.append(bl_pts, br_pts, axis=1)
    # remove pts where y != 0 as these mark joins within the diagram and don't
    # connect to the edge
    leaf_pts = leaf_pts[:, ~(leaf_pts[1] > 0)]
    # sort by x-coordinate for leaf labels, so that the positions are in the
    # same order as the axis labels
    leaf_pts = leaf_pts[:, leaf_pts[0, :].argsort()]
    # get the actual leaf from the indices (these were set by providing the
    # `leaf_label_func` above)
    leaf_indices_from_labels = np.array(
        [int(lab.get_text()) for lab in ax.get_xticklabels()])
    # create mapping from the leaf indices to the (x,y)-points in the
    # dendrogram where these leaves terminate
    leaf_pts_mapping = dict(zip(leaf_indices_from_labels, leaf_pts.T))

    # work out which leaf each item (image) belongs to
    mapping = dict(zip(M, L))
    leaf_mapping = np.array(list(map(lambda i: mapping[i], T)))

    N_leaves = len(np.unique(leaf_mapping))

    # counts per leaf
    # [(n, sum(leaf_mapping == n)) for n in L]

    w_pad = 0.02
    size = (3.6 - (n_clusters_max - 1.0) * w_pad) / float(n_clusters_max)
    y_offset = 1.4
    if label_clusters:
        y_offset += 0.2

    for lid, leaf_id in enumerate(ddata["leaves"]):
        img_idxs_in_cluster = da_embeddings.tile_id.values[leaf_mapping ==
                                                           leaf_id].astype(int)
        if sampling_method == "random":
            try:
                img_idxs = np.random.choice(img_idxs_in_cluster,
                                            size=n_samples,
                                            replace=False)
            except ValueError:
                img_idxs = img_idxs_in_cluster
        elif sampling_method == "center_dist":
            emb_in_cluster = da_embeddings.sel(tile_id=img_idxs_in_cluster)
            d_emb = emb_in_cluster.mean(dim="tile_id") - emb_in_cluster
            center_dist = np.sqrt((d_emb ** 2.0).sum(dim="emb_dim"))
            emb_in_cluster["dist_to_center"] = center_dist
            img_idxs = emb_in_cluster.sortby(
                "dist_to_center").tile_id.values[:n_samples]
        else:
            raise NotImplementedError(sampling_method)

        def transform(coord):
            axis_to_data = fig.transFigure + ax.transData.inverted()
            data_to_axis = axis_to_data.inverted()
            return data_to_axis.transform(coord)

        leaf_xy = leaf_pts_mapping[leaf_id]
        xp, yh = transform(leaf_xy)

        if show_legend:
            ax.scatter(*leaf_xy, marker="s", label=lid, s=100)

        for n, img_idx in enumerate(img_idxs):
            img = tile_dataset.get_image(index=img_idx)

            ax1 = fig.add_axes([
                xp - 0.5 * size, yh - size * 1.1 * (n + y_offset), size, size
            ])
            ax1.set_aspect(1)
            ax1.axison = False
            ax1.imshow(img)

    ax.set_xticklabels(
        _fix_labels(ax=ax,
                    leaf_mapping=leaf_mapping,
                    label_clusters=label_clusters))

    if show_legend:
        ax.legend()

    if return_clusters:
        # instead of returning the actual indices of the leaves here (as were
        # used above) we remap so that they run from 0...N_leaves
        leaf_idxs_remapped = np.array(
            [list(leaf_indices_from_labels).index(i) for i in leaf_mapping])
        if not label_clusters:
            return ax, leaf_idxs_remapped
        else:
            return ax, _make_letter_labels(N_leaves)[leaf_idxs_remapped]
    else:
        return ax
Example #9
    def cut_tree(self,
                 t=None,
                 criterion='inconsistent',
                 depth=None,
                 cluster_min=3):
        """ Groups data into clusters based on the linkage matrix.

            --Input--

            t: float
                Threshold to be used by the criterion.

            criterion: str
                Criterion for grouping (cutting) the dendrogram.
                Can be either 'distance' or 'inconsistent'.

            depth: int
                Depth used when calculating the inconsistency coefficient of
                a branch. Uses all branches by default.

            cluster_min: int
                Minimum cluster size. Data in clusters below this size will be
                assigned as outliers.

            --Output--

            labels: list
                List of cluster labels (integers) for each data point.

            clusters: list
                Atoms objects grouped after cluster labels.
                The first group (list) is all the outliers.

            branches: list
                The branch index of each cluster.

            centroids: list
                The centroids of all clusters calculated as averaged features.

            cluster_energies: list
                The average energy of the structures in each cluster. The
                first element is the average energy of the outliers (an
                empty list if there are no outliers). Is an empty list if
                the data has no .get_potential_energy() attribute.

            avg_width: float
                Average cluster width. Can be used as an outlier threshold in
                the assign_to_cluster function.
        """
        if t is None:
            if criterion == 'distance':
                t = 0.7 * max(self.linkage_matrix[:, 2])
            elif criterion == 'inconsistent':
                t = 4.0
        if depth is None:
            depth = self.n_data

        labels = fcluster(self.linkage_matrix, t, criterion=criterion,
                          depth=depth)
        branches = leaders(self.linkage_matrix, labels)

        # Data in clusters below cluster_size are outliers with label = 0
        for label in sorted(set(labels)):
            cluster_size = sum([i == label for i in labels])
            if cluster_size < cluster_min:
                labels = np.where(labels == label, 0, labels)

                x = np.delete(branches[0], np.where(branches[1] == label))
                y = np.delete(branches[1], np.where(branches[1] == label))
                branches = (x, y)

        # Rearrange labels to fill gaps from assignment of outliers
        for label in sorted(set(labels)):
            if label < 2:
                continue
            while len(np.where(labels == label - 1)[0]) == 0:
                y = np.where(branches[1] == label, label - 1, branches[1])
                branches = (branches[0], y)
                labels = np.where(labels == label, label - 1, labels)
                label -= 1

        n_clusters = max(labels)
        print('Number of clusters: {}'.format(n_clusters))

        # Group data of equal cluster label and calculate centroid
        clusters = [[] for i in range(n_clusters + 1)]
        centroids = [[] for i in range(n_clusters)]
        i = 0
        for label, sample in zip(labels, self.data):
            clusters[label].append(sample)
            if label > 0:
                centroids[label - 1].append(self.feature_matrix[i])
            i += 1

        print('Number of outliers: {}'.format(len(clusters[0])))

        for i, c in enumerate(centroids):
            c_mean = dict()
            for key in sorted(self.feature_matrix[0].keys()):
                c_mean[key] = np.mean([x[key] for x in c], axis=0)
            centroids[i] = c_mean

        # Determine average width of clusters
        cluster_widths = []

        for branch in branches[0]:
            width = self.linkage_matrix[branch - self.n_data][-2]
            cluster_widths.append(width)
        print('Cluster widths: {}'.format(cluster_widths))

        # Calculate average cluster energies
        cluster_energies = []
        for cluster in clusters:
            try:  # Check if data has a potential energy
                self.data[0].get_potential_energy()
            except (IndexError, AttributeError):  # no data or no energy method
                break
            if len(cluster) == 0:
                mean_energy = []
            else:
                mean_energy = np.mean(
                    [x.get_potential_energy() for x in cluster])
            cluster_energies.append(mean_energy)

        avg_width = np.mean(cluster_widths)

        return (labels, clusters, branches, centroids, cluster_energies,
                avg_width)
Example #10
def _hierarchical_clustering_post(model, num_clusters, cluster_col='cluster'):
    if 'linkage_matrix' not in model:
        model_table = model['table_1']
        length = len(model_table) + 1
        tmp_table = model_table[[
            'clusters_joined1', 'clusters_joined2', 'height', 'frequency'
        ]]

        tmp = [
            i for i in tmp_table[['clusters_joined1', 'clusters_joined2'
                                  ]].values.flatten()
            if i.split("_")[0] != 'CL'
        ]
        label_encoder = preprocessing.LabelEncoder().fit(tmp)
        tmp_table['clusters_joined2'] = tmp_table['clusters_joined2'].apply(
            _change_name, length=length, encoder=label_encoder)
        tmp_table['clusters_joined1'] = tmp_table['clusters_joined1'].apply(
            _change_name, length=length, encoder=label_encoder)
        Z = tmp_table.values
        predict = fcluster(Z, t=num_clusters, criterion='maxclust')
        data_names = ['pt_' + str(i) for i in range(length)]
        prediction_table = pd.DataFrame()
        prediction_table['name'] = data_names
    else:
        Z = model['model']
        mode = model['input_mode']
        out_table = model['linkage_matrix']
        predict = fcluster(Z, t=num_clusters, criterion='maxclust')
        if mode == 'original':
            prediction_table = model['table']
        elif mode == 'matrix':
            prediction_table = model['dist_matrix'][['name']]
    if num_clusters == 1:
        prediction_table[cluster_col] = [
            1 for _ in range(len(prediction_table.index))
        ]
    else:
        prediction_table[cluster_col] = predict

    L, M = leaders(Z, predict)
    which_cluster = []
    if 'linkage_matrix' not in model:
        for leader in L:
            which_cluster.append('CL_' + str(2 * length - 1 - leader))
    else:
        for leader in L:
            if leader in Z[:, 0]:
                select_indices = np.where(Z[:, 0] == leader)[0][0]
                which_cluster.append(
                    out_table['joined column1'][select_indices])
            elif leader in Z[:, 1]:
                select_indices = np.where(Z[:, 1] == leader)[0][0]
                which_cluster.append(
                    out_table['joined column2'][select_indices])

    clusters_info_table = pd.DataFrame([])
    if num_clusters == 1 and 'linkage_matrix' in model:
        clusters_info_table[cluster_col] = [1]
        clusters_info_table['name of clusters'] = [
            out_table['name of clusters'][len(Z) - 1]
        ]
        clusters_info_table['number of entities'] = [
            out_table['number of original'][len(Z) - 1]
        ]
    else:
        clusters_info_table[cluster_col] = M
        clusters_info_table['name of clusters'] = which_cluster
        clusters_info_table = clusters_info_table.sort_values(cluster_col)
        cluster_count = np.bincount(prediction_table[cluster_col])
        cluster_count = cluster_count[cluster_count != 0]
        clusters_info_table['number of entities'] = list(cluster_count)
    if 'linkage_matrix' in model:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""# Hierarchical Clustering Post Process Result"""))
        rb.addMD(
            strip_margin("""
        |### Parameters
        |
        |{display_params}
        |
        |### Clusters Information
        |
        |{clusters_info_table}
        |
        """.format(display_params=dict2MD(model['parameters']),
                   clusters_info_table=pandasDF2MD(
                       clusters_info_table,
                       num_rows=len(clusters_info_table.index) + 1))))
    else:
        rb = BrtcReprBuilder()
        rb.addMD(
            strip_margin("""# Hierarchical Clustering Post Process Result"""))
        rb.addMD(
            strip_margin("""
        |
        |### Clusters Information
        |
        |{clusters_info_table}
        |
        """.format(clusters_info_table=pandasDF2MD(
                clusters_info_table,
                num_rows=len(clusters_info_table.index) + 1))))
    model = _model_dict('hierarchical_clustering_post_process')
    model['clusters_info'] = clusters_info_table
    model['_repr_brtc_'] = rb.get()

    return {'out_table': prediction_table, 'model': model}
Example #11
import numpy as np
import scipy.cluster.hierarchy as sch
import matplotlib.pylab as plt

X = np.array([[6], [12], [18], [24], [30], [42], [48]])
d = sch.distance.pdist(X)
Z = sch.linkage(d, method='single')
P = sch.dendrogram(Z)
plt.savefig('plot_dendrogram.png')
T = sch.fcluster(Z, 0.5 * d.max(), 'distance')
L, M = sch.leaders(Z, T)  # L: leader node ids, M: their flat-cluster labels
print(L, M)
Example #12
import numpy as np
import scipy.cluster.hierarchy as sch
import matplotlib.pylab as plt

X = np.loadtxt('cluster_test.txt')

# N = len(X)
# d = zeros((N,N))
#
# for i in range(N):
# 	for j in range(i+1, N):
# 		d[j, i] = d[i, j] = (sum((X[i, :]-X[j, :])**2))**0.5

d = sch.distance.pdist(X)

print(d.shape, X.shape)

Z = sch.linkage(d, method='complete')

P = sch.dendrogram(Z, orientation='right')

plt.show()
# plt.savefig('plot_dendrogram.png')

T = sch.fcluster(Z, 0.5 * d.max(), 'distance')

L, M = sch.leaders(Z, T)  # L: leader node ids, M: their flat-cluster labels
print(L, M)
Example #13
def adaptive_heartbeat_modelling(signal=None,
                                 sampling_rate=1000.,
                                 initial_length=0.6,
                                 residual_threshold=0.35,
                                 show=True):
    """Adaptive Heartbeat Modelling.

    Follows the approach by Paalasmaa et al. [Paal14]_. Only suitable here for 15s-long BCG.

    Parameters
    ----------
    signal : array
        Input unfiltered BCG signal.
    sampling_rate : int, float, optional
        Sampling frequency (Hz).
    initial_length : float, optional
        Initial length of the template.
    residual_threshold : float, optional
        Threshold for heartbeat interval selection.

    Returns
    -------
    template : array
        Heartbeat model.
    peaks : array
        Heartbeats location indices.

    References
    ----------
    .. [Paal14] J. Paalasmaa, H. Toivonen, M. Partinen,
    "Adaptive heartbeat modeling for beat-to-beat heart rate measurement in
    ballistocardiograms", IEEE journal of biomedical and health informatics, 2015

    """
    # check inputs
    if signal is None:
        raise TypeError("Please specify an input signal.")

    # ensure numpy
    signal = np.array(signal)
    sampling_rate = float(sampling_rate)

    #preprocessing
    signal -= np.mean(signal)
    filtered, _, _ = st.filter_signal(signal=signal,
                                      ftype='butter',
                                      band='lowpass',
                                      order=2,
                                      frequency=10,
                                      sampling_rate=sampling_rate)
    gaussian_filter_std = 0.1
    filtered -= si.gaussian_filter(filtered,
                                   gaussian_filter_std * sampling_rate)

    #D. Initial estimation of the heartbeat model
    #clustering
    filtered_grad = np.gradient(filtered)
    windows_center_p, _ = ss.find_peaks(filtered_grad)
    windows_center_n, _ = ss.find_peaks(-filtered_grad)
    windows_center = np.sort(
        np.concatenate((windows_center_p, windows_center_n)))
    windows, windows_center = extract_heartbeats(signal=filtered,
                                                 peaks=windows_center,
                                                 sampling_rate=sampling_rate,
                                                 before=initial_length / 2,
                                                 after=initial_length / 2)

    #clustering
    dist_matrix = ssd.pdist(windows)
    n = len(windows)
    linkage_matrix = sch.linkage(dist_matrix, method='complete')
    densest_4_cluster_indices, = np.where(linkage_matrix[:, 3] == 4)
    densest_4_cluster_index = densest_4_cluster_indices[0]
    leader_node = densest_4_cluster_index + n
    max_inconsistent_value = linkage_matrix[densest_4_cluster_index, 2]
    flat_clusters = sch.fcluster(linkage_matrix,
                                 max_inconsistent_value,
                                 criterion='distance')
    L, M = sch.leaders(linkage_matrix, flat_clusters)
    leaves, = np.where(flat_clusters == M[L == leader_node])

    windows, windows_center = extract_heartbeats(signal=filtered,
                                                 peaks=windows_center[leaves],
                                                 sampling_rate=sampling_rate,
                                                 before=1.25,
                                                 after=1.25)

    mu = np.mean(windows, axis=0)

    hvs_result = modified_heart_valve_signal(signal=signal,
                                             sampling_rate=sampling_rate)
    hvs = hvs_result['hvs']
    hvs_minima, _ = ss.find_peaks(-hvs)
    half_lengths = []
    for center in windows_center:
        half_lengths.append(min(center - hvs_minima[hvs_minima < center]))
        half_lengths.append(min(hvs_minima[hvs_minima > center] - center))

    half_len = min(half_lengths)
    mu = mu[int(len(mu) / 2) - half_len:int(len(mu) / 2) + half_len]
    mu_center = int(len(mu) / 2)

    #E/ Detecting heartbeat position candidates
    peaks = []
    ta = []
    tb = []

    for iteration in range(2):
        peaks = []
        ta = []
        tb = []
        half_len = int(initial_length * sampling_rate / 2)
        if (half_len > mu_center) | (half_len > len(mu) - mu_center):
            raise ValueError('Template is too short or badly centered')
        mu_corr = mu[mu_center - half_len:mu_center + half_len]
        corr = matchTemplate(filtered.astype('float32'),
                             mu_corr.astype('float32'), TM_CCORR_NORMED)
        corr = corr.flatten()
        candidates_pos, _ = ss.find_peaks(corr)
        corr_delay = -mu_center + half_len

        #F/Detecting beat-to-beat intervals
        half_len = int(1 * sampling_rate)
        if half_len > len(mu) - mu_center:
            mu2 = np.append(mu, np.zeros(2 * half_len - len(mu)))
        else:
            mu2 = mu[:int(2 * sampling_rate)]

        candidates_pos += corr_delay
        candidates_pos = candidates_pos[candidates_pos >= 0]

        #1) Initialize ta to the first candidate position
        ta_cand = candidates_pos[0]
        while ta_cand < candidates_pos[-1]:
            try:
                if ta_cand + int(2 * sampling_rate) > len(filtered):
                    raise Exception
                sa = filtered[ta_cand:ta_cand + int(2 * sampling_rate)]
                za = so.least_squares(
                    lambda z: np.mean(np.power(sa - z * mu2, 2)), 1).x[0]
                xa = za * mu2
                #2) Find candidates for tb
                tb_candidates = candidates_pos[np.logical_and(
                    ta_cand + int(0.4 * sampling_rate) < candidates_pos,
                    candidates_pos < ta_cand + int(2 * sampling_rate))]
                #3) find best tb or find another ta -> step 2)
                for tb_cand in tb_candidates:
                    if tb_cand + int(2 * sampling_rate) > len(filtered):
                        raise Exception
                    sb = filtered[tb_cand:tb_cand + int(2 * sampling_rate)]
                    zb = so.least_squares(
                        lambda z: np.mean(np.power(sb - z * mu2, 2)), 1).x[0]
                    xb = zb * mu2
                    xa_tmp = np.concatenate(
                        (xa,
                         np.zeros(
                             max([
                                 0, 2 * (tb_cand - ta_cand) -
                                 int(2 * sampling_rate)
                             ]))))
                    xb_tmp = np.concatenate((np.zeros(tb_cand - ta_cand), xb))
                    x = xa_tmp[:2 * (tb_cand - ta_cand)] + xb_tmp[:2 *
                                                                  (tb_cand -
                                                                   ta_cand)]
                    s = filtered[ta_cand:ta_cand + 2 * (tb_cand - ta_cand)]
                    eps = s - x

                    if (np.mean(np.power(eps, 2)) <
                            residual_threshold * np.mean(np.power(s, 2))) & (
                                max([za, zb]) < 2 * min([za, zb])):
                        ta.append(ta_cand)
                        tb.append(tb_cand)
                        peak_a = ta_cand + mu_center
                        peak_b = tb_cand + mu_center
                        if peak_a not in peaks:
                            peaks.append(peak_a)
                        peaks.append(peak_b)
                        ta_cand = tb_cand
                        break
                    else:
                        continue

                if ta_cand != tb_cand:
                    ta_candidates = candidates_pos[np.logical_and(
                        candidates_pos > ta_cand,
                        candidates_pos < ta_cand + int(2 * sampling_rate))]
                    ta_cand = ta_candidates[np.argmax(corr[ta_candidates -
                                                           corr_delay])]
            except Exception:
                break
        beats = dict(peaks=np.array(peaks), ta=np.array(ta), tb=np.array(tb))

        #G. re-estimation of the model with detected beat to beat intervals
        template_extraction = long_template_extraction(signal=filtered,
                                                       beats=beats,
                                                       mu_center=mu_center,
                                                       sampling_rate=1000.)
        try:
            mu = template_extraction['long_template']
            mu_center_new = template_extraction['long_template_center']
            mu = mu[mu_center_new - mu_center:]
        except KeyError:
            mu = template_extraction['short_template']
        peaks = beats['peaks']
        print('iteration no ', iteration, ': ', len(peaks), ' beats detected')

    #H. Accounting for abrupt changes of the heartbeat shape
    # to complete, with four different instances of the beat-to-beat detection method

    #I. Post-preprocessing
    # slightly different in our case : we added a smoother rather than the non linear filter explained in the paper

    if show:
        # extract templates
        templates, peaks = extract_heartbeats(signal=filtered,
                                              peaks=peaks,
                                              sampling_rate=sampling_rate,
                                              before=0.6,
                                              after=0.2)
        # compute heart rate
        hr_idx, hr = st.get_heart_rate(beats=peaks,
                                       sampling_rate=sampling_rate,
                                       smooth=True,
                                       size=3)
        # get time vectors
        length = len(signal)
        T = (length - 1) / sampling_rate
        ts = np.linspace(0, T, length, endpoint=True)
        ts_hr = ts[hr_idx]
        ts_tmpl = np.linspace(-0.4, 0.4, templates.shape[1], endpoint=False)

        plotting.plot_bcg(ts=ts,
                          raw=signal,
                          filtered=filtered,
                          jpeaks=peaks,
                          templates_ts=ts_tmpl,
                          templates=templates,
                          heart_rate_ts=ts_hr,
                          heart_rate=hr,
                          path=None,
                          show=True)

    return utils.ReturnTuple((mu, peaks), ('template', 'peaks'))
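
The densest-cluster selection in the clustering step above can be exercised on its own. A sketch under assumptions (random windows, at least one merge of exactly four members, distinct merge heights):

import numpy as np
import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as ssd

windows = np.random.rand(40, 60)                # hypothetical beat windows
Z = sch.linkage(ssd.pdist(windows), method='complete')
idx = np.where(Z[:, 3] == 4)[0][0]              # first (lowest) 4-member merge
leader_node = idx + len(windows)                # id of that cluster node
T = sch.fcluster(Z, Z[idx, 2], criterion='distance')
L, M = sch.leaders(Z, T)
members, = np.where(T == M[L == leader_node])   # window indices in that cluster
print(members)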
Example #14
def run_hdbscan_hai(hs,
                    labels,
                    method='hai',
                    show='save',
                    cut=0,
                    colormap=plt.cm.gist_rainbow):
    clusterer = hdbscan.HDBSCAN(metric='precomputed',
                                match_reference_implementation=True,
                                min_samples=1)
    clusterer.fit(hs)

    # Labels of objects as extracted from HDBSCAN*
    cluster_labels = clusterer.labels_

    print("Labels: ", cluster_labels)
    print("Unique Labels: ", np.unique(cluster_labels))

    # Linkage Matrix
    Z = clusterer.single_linkage_tree_.to_numpy()

    roots = leaders(Z, np.asarray(cluster_labels).astype('i'))

    print "Roots", map(str, roots[0])
    print "Roots", ', '.join(map(str, roots[0]))

    fosc_file = open(basedir + "/FOSC", "w+")
    fosc_file.write(', '.join(map(str, roots[0])))

    fosc_file.close()

    # Plot Settings
    # fig, ax1 = plt.subplots()
    # plt.title('HDBSCAN*')
    # plt.xlabel('mpts')
    # plt.ylabel('distance')

    # Extraction method: FOSC or threshold.
    if cut > 0:
        partitioning = fcluster(Z, cut, criterion='distance')
        # plt.axhline(y=cut, c='k')
    else:
        partitioning = cluster_labels + 1

    # Normalizes the colors according to the clusters found in the partitioning.
    norm = colors.Normalize(0, partitioning.max())

    dflt_col = "#cccccc"
    link_cols = {}
    for i, i12 in enumerate(Z[:, :2].astype(int)):
        c1, c2 = (link_cols[x] if x > len(Z) else dflt_col
                  if partitioning[x] == 0 else colormap(norm(partitioning[x]))
                  for x in i12)
        link_cols[i + 1 + len(Z)] = c1 if c1 == c2 else dflt_col

    # Creates the dendrogram.
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=6.,  # font size for the x axis labels
        labels=labels,
        count_sort=True,
        link_color_func=lambda x: colors.to_hex(link_cols[x]),
        above_threshold_color='grey')

    # Saves or shows the dendrogram.
    # if show == 'save':
    #     plt.savefig(plotdir + filename + '_hdbscan_' + method + '.png', dpi=600, bbox_inches='tight')
    #     plt.savefig(plotdir + filename + '_hdbscan_' + method + '.pdf', dpi=600, bbox_inches='tight')
    # else:
    #     plt.show()

    # Clears plot.
    # plt.gcf().clear()

    clusters, _ = np.unique(partitioning, return_counts=True)

    h = np.array(hs)

    medoids = []

    for c in clusters:
        if c != 0:
            medoids.append(compute_medoid(c, partitioning, h))

    medoids.sort()

    # Linkage Matrix in a Tree Format
    T = to_tree(Z)

    d3Dendro = dict(name=T.id, y=T.dist)

    add_node(T, d3Dendro, labels, h)
    json.dump(d3Dendro,
              open(resudir + filename + '_meta-hierarchy_.json', "w"),
              sort_keys=True,
              indent=4)

    return medoids, labels[medoids], partitioning[medoids], clusters.max()
Example #15
def fcluster_combine_leaves(Z,
                            t,
                            criterion="distance",
                            depth=2,
                            R=None,
                            monocrit=None):
    # AKA no leaf left behind

    # check if Z is a valid linkage matrix
    _ = hierarchy.is_valid_linkage(Z, throw=True)

    N = Z.shape[0] + 1

    # alternative: iteratively increase t, check for remaining leaves

    # move up the tree, merging leaf clusters until all leaves are merged into clusters
    T = hierarchy.fcluster(Z,
                           t,
                           criterion=criterion,
                           depth=depth,
                           R=R,
                           monocrit=monocrit)
    L, M = hierarchy.leaders(Z, T)
    leaf_leaders = list(L[L < N])

    # no leaf clusters
    if len(leaf_leaders) == 0:
        return T

    max_cluster = T.max()

    # iterate through all links
    for n, link in enumerate(
            Z[np.logical_or(*(np.in1d(Z[:, l], leaf_leaders)
                              for l in range(2))), :2].astype("i")):

        if n % 10 == 0:
            print(
                f"After {n} iterations, {len(leaf_leaders)} leaf leaders left with {len(np.unique(T))} total clusters"
            )

        # find linkages if link is between two leaf_leaders
        if all([l in leaf_leaders for l in link]):
            # make new cluster of leaf leaders
            max_cluster += 1
            T[link] = max_cluster

            # remove from list of leaf_leaders
            _ = [leaf_leaders.remove(l) for l in link]

        # find linkages of leaf leaders with any non-leaf node
        elif any([l in leaf_leaders for l in link]):

            # which one is the leaf leader?
            node_index = link[0] in leaf_leaders
            node, leaf = link[int(node_index)], link[int(not node_index)]

            # other node is a leader
            if node in L:
                downstream_leaders = [node]

            # node is not a leader, have to traverse down the tree until leaders are found
            else:
                # get hierarchy.ClusterNode representation of the node
                tree = hierarchy.to_tree(Z, rd=True)[1][node]

                def check_node(node, nodes_to_check, downstream_leaders, L):
                    """check if a node is a leader, else append successors to nodes_to_check"""
                    if node.id in L:
                        downstream_leaders.append(node.id)
                    else:
                        nodes_to_check.extend([node.left, node.right])
                    return nodes_to_check, downstream_leaders

                # initialize traversal
                downstream_leaders = []
                nodes_to_check = [tree.left, tree.right]

                while len(nodes_to_check) > 0:
                    n_ = nodes_to_check.pop(0)
                    if all([s is None for s in [n_.left, n_.right]]):
                        raise ValueError(
                            "While traversing the tree, a leaf node was reached"
                            f", node {n_.id}. In theory this should not occur."
                        )
                    nodes_to_check, downstream_leaders = check_node(
                        n_, nodes_to_check, downstream_leaders, L)

            # update T
            max_cluster += 1
            merge_clusters = M[np.in1d(L, downstream_leaders)]
            T[np.in1d(T, merge_clusters)] = max_cluster
            T[leaf] = max_cluster

            # remove from leaf_leaders
            _ = leaf_leaders.remove(leaf)

        else:
            continue

        # update L,M
        L, M = hierarchy.leaders(Z, T)

        if len(leaf_leaders) == 0:
            break

    leaf_leaders = list(L[L < N])

    # no leaf clusters
    if len(leaf_leaders) == 0:
        print(
            f"All leaf leaders combined, resulting in {len(np.unique(T))} total clusters"
        )

        # relabel
        unique, inverse = np.unique(T, return_inverse=True)

        return np.arange(0, unique.shape[0])[inverse]
    else:
        raise ValueError(f"Failed to merge leaf leaders {leaf_leaders}")