Example #1
0
def test_umap_transform_embedding_stability():
    """Test that transforming data does not alter the learned embeddings

    Issue #217 describes how using transform to embed new data using a
    trained UMAP transformer causes the fitting embedding matrix to change
    in cases when the new data has the same number of rows as the original
    training data.

    The random inputs are drawn from a locally seeded generator so that a
    failure can be reproduced with exactly the same data.
    """
    # Local, seeded generator: keeps the test reproducible without touching
    # the global numpy random state.
    rng = np.random.RandomState(42)

    data = iris.data[iris_selection]
    fitter = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(data)
    original_embedding = fitter.embedding_.copy()

    # The important point is that the new data has the same number of rows
    # as the original fit data
    new_data = rng.random_sample(data.shape)
    # Only the side effect on ``fitter.embedding_`` matters here, so the
    # transformed output is deliberately discarded.
    fitter.transform(new_data)

    assert_array_equal(
        original_embedding,
        fitter.embedding_,
        "Transforming new data changed the original embeddings",
    )

    # Example from issue #217
    a = rng.random_sample((1000, 10))
    b = rng.random_sample((1000, 5))

    model = UMAP()
    u1 = model.fit_transform(a[:, :5])
    u1_orig = u1.copy()
    assert_array_equal(u1_orig, model.embedding_)

    # Again, only the (absence of a) side effect on ``embedding_`` is checked.
    model.transform(b)
    assert_array_equal(u1_orig, model.embedding_)
def metrics(model, data_iterator):
    """
    Summary:
        Run the embed -> UMAP projection -> HDBSCAN clustering -> ad
        segmentation pipeline on every item of ``data_iterator`` and save two
        diagnostic plots per item (the 2-D projection scatter and the
        cluster-label sequence) into the directory returned by
        ``segment_ads``.

        Items whose UMAP projection raises are reported with ``print`` and
        skipped.

    Args:
        model: Not used anywhere in this function.
        data_iterator: Iterable yielding ``(data, labels)`` pairs. ``data``
            is passed to ``MP3_META`` and ``load_audio`` and is split on
            ``'/'``, so it is presumably an audio file path - confirm
            against the caller.

    Returns:
        None. The per-item lists built up below are filled in but never
        returned or read inside this function.
    """
    projector = UMAP(metric='euclidean', n_neighbors=200, low_memory=True)
    clusterer = hdbscan.HDBSCAN(min_samples=100, min_cluster_size=100)

    # Per-item bookkeeping.
    # NOTE(review): nothing consumes these lists in the visible code.
    ads_pred, ads_actual = [], []
    total_duration, pred_ads_duration = [], []

    for _idx, (data, labels) in tqdm(enumerate(data_iterator)):
        # Duration is recorded before the projection step, so it is kept even
        # for items that are skipped below.
        total_duration.append(MP3_META(data).info.length)

        aud_data = load_audio(data)
        # ``encoder`` is a module-level object, not the ``model`` argument.
        embeds, (aud_splits, _) = encoder.embed(aud_data, group=False)
        print(data, "Embed done")

        try:
            projs = projector.fit_transform(embeds)
            print(data, "Created Projections")
        except Exception as err:
            # Best effort: report the failure and move on to the next item.
            print(err)
            continue

        clusters = clusterer.fit_predict(projs)
        print(data, "Created Clusters")

        ad_dir, ads = segment_ads(aud_data, aud_splits, data, clusters)
        n_ads = len(ads)
        # NOTE(review): the factor 10 presumably is a per-ad duration in
        # seconds - confirm.
        pred_ads_duration.append(n_ads * 10)
        ads_pred.append(n_ads)
        ads_actual.append(labels)
        print(data, "Done segmenting ads")

        # Scatter of the first two projection dimensions, titled with the
        # cluster-size histogram.
        # NOTE(review): ``cmap`` is given without ``c``; possibly
        # ``c=clusters`` was intended - confirm.
        plt.scatter(projs[:, 0], projs[:, 1], cmap='Spectral')
        plt.title(str(Counter(clusters)))
        file_name = data.split('/')[-1]
        plt.savefig('{}/{}_umap.jpg'.format(ad_dir, file_name))
        plt.close()

        # Cluster label of every segment, in order.
        plt.plot(clusters)
        plt.savefig('{}/{}_hdb_labels.jpg'.format(ad_dir, file_name))
        plt.close()
Example #3
0
# Features and targets of the loaded dataset
X, y = dataset.data, dataset.target

# Generate the shape graph with KeplerMapper (lens built with projection=[0])
mapper = KeplerMapper(verbose=1)
lens = mapper.fit_transform(X, projection=[0])
graph = mapper.map(lens, X, nr_cubes=6, overlap_perc=0.2)

# Wrap the shape graph and the targets in a DyNeuGraph
dG = DyNeuGraph(G=graph, y=y)

# Register the lens and a set of networkx layout functions as custom layouts,
# in this order
for _layout, _layout_name in (
    (lens, 'lens'),
    (nx.spring_layout, 'nx.spring'),
    (nx.kamada_kawai_layout, 'nx.kamada_kawai'),
    (nx.spectral_layout, 'nx.spectral'),
    (nx.circular_layout, 'nx.circular'),
):
    dG.add_custom_layout(_layout, name=_layout_name)

# Configure the projections; UMAP is initialised with the PCA coordinates
pca = PCA(2, random_state=1)
tsne = TSNE(2, init='pca', random_state=1)
umap = UMAP(n_components=2, init=pca.fit_transform(X))

# Fit each projection on X in turn and register the result as a custom layout
for _reducer, _fit_kwargs, _layout_name in (
    (pca, {}, 'PCA'),
    (tsne, {}, 'TSNE'),
    (umap, {'y': None}, 'UMAP'),
):
    dG.add_custom_layout(
        _reducer.fit_transform(X, **_fit_kwargs), name=_layout_name)

# Visualize
dG.visualize(static=True, show=True)
# Keep only the rows whose (chunk + 1) is session 4 or 5
mask_sessions = df.chunks.add(1).isin([4, 5])
X = X[mask_sessions]
y = y.loc[mask_sessions, :]
target = target[mask_sessions]

# KeplerMapper instance used to build both the lens and the shape graph
mapper = KeplerMapper(verbose=1)

# Projection setup: UMAP initialised with the 2-component PCA coordinates
pca = PCA(2, random_state=1)
umap = UMAP(n_components=2, init=pca.fit_transform(X))

# The lens is the supervised UMAP embedding (fitted with y=target), passed
# through the mapper with projection=[0, 1]
lens = mapper.fit_transform(
    umap.fit_transform(X, y=target),
    projection=[0, 1],
)

# Shape graph over X, covered and clustered as configured below
graph = mapper.map(
    lens,
    X=X,
    cover=Cover(20, 0.5),
    clusterer=optimize_dbscan(X, k=3, p=100.0),
)

# Wrap the shape graph and the targets in a DyNeuGraph
dG = DyNeuGraph(G=graph, y=y)

# Register the lens and a set of networkx layout functions as custom layouts,
# in this order
for _layout, _layout_name in (
    (lens, 'lens'),
    (nx.spring_layout, 'nx.spring'),
    (nx.kamada_kawai_layout, 'nx.kamada_kawai'),
    (nx.spectral_layout, 'nx.spectral'),
):
    dG.add_custom_layout(_layout, name=_layout_name)
Example #5
0
class ClusteringWidget(QSplitter):
    """Splitter widget that embeds, clusters and displays map spectra.

    The left side stacks the cluster image and embedding scatter plot, the
    raw-spectrum plot and the cluster-mean plot. The right side stacks the
    parameter tree, the compute/save buttons and the list of maps.

    Embeddings are computed with UMAP or PCA (``computeEmbedding``) and
    clustered with KMeans (``computeCluster``). Per-cluster spectra can be
    written to CSV files (``saveCluster``).

    NOTE: two differently spelled index attributes coexist.
    ``selectMapidx`` is written by ``__init__``, ``updateMap`` and
    ``isMapOpen`` and read by ``saveCluster``. ``selectMapIdx`` (capital I)
    is written and read by ``cleanUp`` and ``updateRoiMask``. They are kept
    separate here so existing behaviour is unchanged.
    """

    def __init__(self, headermodel, selectionmodel):
        """Build the child widgets, the overlay items and connect signals.

        Args:
            headermodel: Item model of the open maps; used for the list view
                and queried through ``item(row)`` / ``rowCount()``.
            selectionmodel: Selection model shared with the list view; its
                ``selectionChanged`` signal drives ``updateMap`` and
                ``updateRoiMask``.
        """
        super().__init__()
        self.headermodel = headermodel
        self.selectionmodel = selectionmodel
        # init some values
        self.selectMapidx = 0
        self.embedding = None
        self.labels = None
        self.mean_spectra = None

        # split between cluster image and scatter plot
        self.image_and_scatter = QSplitter()
        # split between image&scatter and spec plot, vertical split
        self.leftsplitter = QSplitter()
        self.leftsplitter.setOrientation(Qt.Vertical)
        # split between params, buttons and map list, vertical split
        self.rightsplitter = QSplitter()
        self.rightsplitter.setOrientation(Qt.Vertical)

        self.clusterImage = MapViewWidget()
        self.clusterScatterPlot = ScatterPlotWidget()
        self.rawSpecPlot = SpectraPlotWidget()
        self.clusterMeanPlot = ClusterSpectraWidget()

        # ParameterTree
        self.parametertree = ClusteringParameters()
        self.parameter = self.parametertree.parameter

        # buttons layout
        self.buttons = QWidget()
        self.buttonlayout = QGridLayout()
        self.buttons.setLayout(self.buttonlayout)
        # set up buttons
        self.fontSize = 12
        font = QFont("Helvetica [Cronyx]", self.fontSize)
        self.computeBtn = QPushButton()
        self.computeBtn.setText('Compute clusters')
        self.computeBtn.setFont(font)
        self.saveBtn = QPushButton()
        self.saveBtn.setText('Save clusters')
        self.saveBtn.setFont(font)
        # add all buttons
        self.buttonlayout.addWidget(self.computeBtn)
        self.buttonlayout.addWidget(self.saveBtn)

        # Headers listview
        self.headerlistview = QListView()
        self.headerlistview.setModel(headermodel)
        self.headerlistview.setSelectionModel(
            selectionmodel)  # This might do weird things in the map view?
        self.headerlistview.setSelectionMode(QListView.SingleSelection)
        # add title to list view
        self.mapListWidget = QWidget()
        self.listLayout = QVBoxLayout()
        self.mapListWidget.setLayout(self.listLayout)
        mapListTitle = QLabel('Maps list')
        mapListTitle.setFont(font)
        self.listLayout.addWidget(mapListTitle)
        self.listLayout.addWidget(self.headerlistview)

        # assemble widgets
        self.image_and_scatter.addWidget(self.clusterImage)
        self.image_and_scatter.addWidget(self.clusterScatterPlot)
        self.leftsplitter.addWidget(self.image_and_scatter)
        self.leftsplitter.addWidget(self.rawSpecPlot)
        self.leftsplitter.addWidget(self.clusterMeanPlot)
        self.leftsplitter.setSizes([200, 50, 50])
        self.rightsplitter.addWidget(self.parametertree)
        self.rightsplitter.addWidget(self.buttons)
        self.rightsplitter.addWidget(self.mapListWidget)
        self.rightsplitter.setSizes([300, 50, 50])
        self.addWidget(self.leftsplitter)
        self.addWidget(self.rightsplitter)
        self.setSizes([500, 100])

        # setup ROI item
        sideLen = 10
        self.roi = PolyLineROI(positions=[[0, 0], [sideLen, 0],
                                          [sideLen, sideLen], [0, sideLen]],
                               closed=True)
        self.roi.hide()
        self.roiInitState = self.roi.getState()
        # set up mask item
        self.maskItem = ImageItem(np.ones((1, 1)),
                                  axisOrder="row-major",
                                  autoLevels=True,
                                  opacity=0.3)
        self.maskItem.hide()
        # set up select mask item
        self.selectMaskItem = ImageItem(np.ones((1, 1)),
                                        axisOrder="row-major",
                                        autoLevels=True,
                                        opacity=0.3,
                                        lut=np.array([[0, 0, 0], [255, 0, 0]]))
        self.selectMaskItem.hide()
        self.clusterImage.view.addItem(self.roi)
        self.clusterImage.view.addItem(self.maskItem)
        self.clusterImage.view.addItem(self.selectMaskItem)

        # Connect signals
        self.computeBtn.clicked.connect(self.computeEmbedding)
        self.saveBtn.clicked.connect(self.saveCluster)
        self.clusterImage.sigShowSpectra.connect(self.rawSpecPlot.showSpectra)
        self.clusterImage.sigShowSpectra.connect(
            self.clusterScatterPlot.clickFromImage)
        self.clusterScatterPlot.sigScatterRawInd.connect(
            self.rawSpecPlot.showSpectra)
        self.clusterScatterPlot.sigScatterClicked.connect(self.showClusterMean)
        self.clusterScatterPlot.sigScatterRawInd.connect(self.setImageCross)
        self.parametertree.sigParamChanged.connect(self.updateClusterParams)
        self.selectionmodel.selectionChanged.connect(self.updateMap)
        self.selectionmodel.selectionChanged.connect(self.updateRoiMask)

    def computeEmbedding(self):
        """Compute the embedding of the current map, then re-cluster.

        Reads the wavenumber ranges and embedding settings from the parameter
        tree, builds ``self.dataset`` (one row per spectrum, restricted to
        the selected wavenumbers), computes a UMAP or PCA embedding into
        ``self.embedding``, stores it on the current model item and calls
        ``computeCluster``.

        Returns early if no map is selected or if the 'Wavenumber Range'
        values cannot be grouped in pairs.
        """
        # get current map idx (isMapOpen also caches the map's data on self)
        if not self.isMapOpen():
            return
        msg.showMessage('Compute embedding.')
        # Select wavenumber region
        wavROIList = []
        for entry in self.parameter['Wavenumber Range'].split(','):
            try:
                wavROIList.append(val2ind(int(entry), self.wavenumbers))
            except Exception:
                # Entries that cannot be converted are skipped on purpose.
                # ``Exception`` (rather than a bare ``except``) keeps
                # KeyboardInterrupt/SystemExit from being swallowed.
                continue
        if len(wavROIList) % 2 != 0:
            msg.logMessage('"Wavenumber Range" values must be in pairs',
                           msg.ERROR)
            MsgBox('Clustering computation aborted.', 'error')
            return
        # Consecutive sorted values form inclusive [start, stop] index ranges
        wavROIList = sorted(wavROIList)
        wavROIidx = []
        for start, stop in zip(wavROIList[::2], wavROIList[1::2]):
            wavROIidx.extend(range(start, stop + 1))
        self.wavenumbers_select = self.wavenumbers[wavROIidx]
        self.N_w = len(self.wavenumbers_select)
        # get current dataset
        if self.selectedPixels is None:
            # no pixel selection: use every spectrum of the map
            n_spectra = len(self.data)
            self.dataset = np.zeros((n_spectra, self.N_w))
            for i in range(n_spectra):
                self.dataset[i, :] = self.data[i][wavROIidx]
        else:
            n_spectra = len(self.selectedPixels)
            self.dataset = np.zeros((n_spectra, self.N_w))
            for i in range(n_spectra):  # i: ith selected pixel
                row_col = tuple(self.selectedPixels[i])
                self.dataset[i, :] = self.data[self.rc2ind[row_col]][wavROIidx]
        # get parameters and compute embedding
        n_components = self.parameter['Components']
        if self.parameter['Embedding'] == 'UMAP':
            n_neighbors = self.parameter['Neighbors']
            metric = self.parameter['Metric']
            min_dist = np.clip(self.parameter['Min Dist'], 0, 1)
            self.umap = UMAP(n_neighbors=n_neighbors,
                             min_dist=min_dist,
                             n_components=n_components,
                             metric=metric,
                             random_state=0)
            self.embedding = self.umap.fit_transform(self.dataset)
        elif self.parameter['Embedding'] == 'PCA':
            # normalize and mean center
            if self.parameter['Normalization'] == 'L1':  # normalize
                data_norm = Normalizer(norm='l1').fit_transform(self.dataset)
            elif self.parameter['Normalization'] == 'L2':
                data_norm = Normalizer(norm='l2').fit_transform(self.dataset)
            else:
                data_norm = self.dataset
            # subtract mean
            data_centered = StandardScaler(
                with_std=False).fit_transform(data_norm)
            # Do PCA
            self.PCA = PCA(n_components=n_components)
            self.PCA.fit(data_centered)
            self.embedding = self.PCA.transform(data_centered)
        # NOTE: for any other 'Embedding' value self.embedding is left as is.
        # save embedding to standardModelItem
        self.item.embedding = self.embedding
        # update cluster map
        self.computeCluster()

    def computeCluster(self):
        """Cluster the current embedding with KMeans and refresh the views.

        Updates ``self.labels`` (1-based cluster labels), ``self.cluster_map``
        (label image, vertically flipped for display), ``self.dfGroups`` (one
        DataFrame of full spectra per cluster) and ``self.mean_spectra``
        (per-cluster mean over the selected wavenumbers). Then redraws the
        cluster image, the cluster-mean plot and the scatter plot.

        Does nothing when no embedding has been computed yet.
        """
        # check if embeddings exist
        if self.embedding is None:
            return
        msg.showMessage('Compute clusters.')
        # get num of clusters
        n_clusters = self.parameter['Clusters']
        # set colorLUT: n_clusters + 1 entries so that row 0 is also defined
        self.colorLUT = cm.get_cmap('viridis',
                                    n_clusters + 1).colors[:, :3] * 255
        # compute cluster
        cluster_object = KMeans(n_clusters=n_clusters,
                                random_state=0).fit(self.embedding)
        # shift labels to 1..n_clusters; 0 is what unselected pixels keep in
        # the zero-initialised cluster map below
        self.labels = cluster_object.labels_ + 1
        # update cluster image
        if self.selectedPixels is None:  # full map
            self.cluster_map = self.labels.reshape(self.imgShape[0],
                                                   self.imgShape[1])
        elif self.selectedPixels.size == 0:
            self.cluster_map = np.zeros((self.imgShape[0], self.imgShape[1]),
                                        dtype=int)
        else:
            self.cluster_map = np.zeros((self.imgShape[0], self.imgShape[1]),
                                        dtype=int)
            self.cluster_map[self.selectedPixels[:, 0],
                             self.selectedPixels[:, 1]] = self.labels
        self.cluster_map = np.flipud(self.cluster_map)
        self.clusterImage.setImage(self.cluster_map, levels=[0, n_clusters])
        # self.clusterImage.setImage(self.cluster_map)
        self.clusterImage._image = self.cluster_map
        self.clusterImage.rc2ind = self.rc2ind
        self.clusterImage.row, self.clusterImage.col = self.imgShape[
            0], self.imgShape[1]
        self.clusterImage.txt.setPos(self.clusterImage.col, 0)
        self.clusterImage.cross.show()
        # update cluster mean
        mean_spectra = []
        self.dfGroups = []
        # dataList holds the full-range spectra (all wavenumbers); dataIdx
        # holds the index of each row within self.data
        if self.selectedPixels is None:
            n_spectra = len(self.data)
            self.dataList = np.zeros((n_spectra, len(self.wavenumbers)))
            dataIdx = np.arange(n_spectra)
            for i in range(n_spectra):
                self.dataList[i] = self.data[i]
        else:
            n_spectra = len(self.selectedPixels)
            self.dataList = np.zeros((n_spectra, len(self.wavenumbers)))
            dataIdx = np.zeros(n_spectra, dtype=int)
            for i in range(n_spectra):  # i: ith selected pixel
                row_col = tuple(self.selectedPixels[i])
                dataIdx[i] = self.rc2ind[row_col]
                self.dataList[i] = self.data[dataIdx[i]]

        for ii in range(1, n_clusters + 1):
            sel = (self.labels == ii)
            # save each group spectra to a dataFrame
            self.dfGroups.append(
                pd.DataFrame(self.dataList[sel],
                             columns=self.wavenumbers.tolist(),
                             index=dataIdx[sel]))
            # the mean is taken over the wavenumber-restricted dataset
            this_mean = np.mean(self.dataset[sel, :], axis=0)
            mean_spectra.append(this_mean)
        self.mean_spectra = np.vstack(mean_spectra)
        self.clusterMeanPlot.setColors(self.colorLUT)
        self.clusterMeanPlot._data = self.mean_spectra
        self.clusterMeanPlot.wavenumbers = self.wavenumbers_select
        self.clusterMeanPlot.plotClusterSpectra()
        # update scatter plot
        self.updateScatterPlot()

    def saveCluster(self):
        """Write one CSV file per cluster next to the current map file.

        Files are named ``<map file name without extension>_cluster<k>.csv``
        with ``k`` starting at 1, and contain the matching DataFrame from
        ``self.dfGroups``. A message box reports the last written path.

        Does nothing until a clustering has completed.
        """
        # mean_spectra is always an attribute (set to None in __init__), so
        # it has to be compared with None rather than probed with hasattr
        if not hasattr(self, 'cluster_map') or self.mean_spectra is None:
            return
        filePath = self.pathList[self.selectMapidx]
        # get dirname and old filename (extension removed, whatever its length)
        dirName = os.path.dirname(filePath)
        baseName = os.path.splitext(os.path.basename(filePath))[0]
        newFilePath = None
        for k, group in enumerate(self.dfGroups, start=1):
            # save dataFrames to csv file
            newFilePath = os.path.join(dirName, f'{baseName}_cluster{k}.csv')
            group.to_csv(newFilePath)
        if newFilePath is not None:
            MsgBox(
                f'Cluster spectra groups were successfully saved at: {newFilePath}!'
            )

    def updateScatterPlot(self):
        """Redraw the embedding scatter plot, coloured by cluster label.

        The plotted columns are chosen by the 1-based 'X Component' and
        'Y Component' parameters. Does nothing without embedding and labels.
        """
        if (self.embedding is None) or (self.labels is None):
            return
        # get scatter x, y values
        self.clusterScatterPlot.scatterData = self.embedding[:, [
            self.parameter['X Component'] - 1, self.parameter['Y Component'] -
            1
        ]]
        # get colormapings
        brushes = [mkBrush(self.colorLUT[x, :]) for x in self.labels]
        # make plots
        if hasattr(self, 'scatterPlot'):
            self.clusterScatterPlot.plotItem.clearPlots()
        self.scatterPlot = self.clusterScatterPlot.plotItem.plot(
            self.clusterScatterPlot.scatterData,
            pen=None,
            symbol='o',
            symbolBrush=brushes)
        self.clusterScatterPlot.getViewBox().autoRange(padding=0.1)
        self.clusterScatterPlot.getNN()

    def updateClusterParams(self, name):
        """Recompute only what the changed parameter affects.

        Args:
            name: Name of the parameter that changed, as emitted by
                ``sigParamChanged``.
        """
        if name == 'Components':
            self.computeEmbedding()
        elif name == 'Clusters':
            self.computeCluster()
        elif name in ['X Component', 'Y Component']:
            self.updateScatterPlot()

    def updateMap(self):
        """React to a change of the selected map.

        If the newly selected item already carries an ``embedding``
        attribute the embedding is recomputed; otherwise the views are reset.
        """
        # get current map idx
        if not self.selectionmodel.selectedIndexes():  # no map is open
            return
        else:
            self.selectMapidx = self.selectionmodel.selectedIndexes()[0].row()
            # get current item
            self.item = self.headermodel.item(self.selectMapidx)
            if hasattr(self.item, 'embedding'):
                # compute embedding
                self.computeEmbedding()
            else:
                # reset custer image and plots
                self.cleanUp()

    def showClusterMean(self, i):
        """Highlight the mean-spectrum curve of the cluster that point belongs to.

        Args:
            i: Index into ``self.labels`` of the clicked scatter point.
        """
        if self.mean_spectra is None:
            return
        # labels are 1-based, curve indices 0-based
        self.clusterMeanPlot.curveHighLight(self.labels[i] - 1)

    def setImageCross(self, ind):
        """Move the cross marker and info text to the pixel of spectrum ``ind``.

        Args:
            ind: Spectrum index, mapped to (row, col) through ``self.ind2rc``.
        """
        row, col = self.ind2rc[ind]
        # update cross (rows are mirrored because the shown image is flipped)
        self.clusterImage.cross.setData([col + 0.5],
                                        [self.imgShape[0] - row - 0.5])
        # update text
        self.clusterImage.txt.setHtml(
            toHtml(f'Point: #{ind}', size=8) + toHtml(f'X: {col}', size=8) +
            toHtml(f'Y: {row}', size=8) + toHtml(
                f'Val: {self.clusterImage._image[self.imgShape[0] - row - 1, col] :d}',
                size=8))

    def cleanUp(self):
        """Reset the cluster image and clear all plots.

        Uses the selected row, or row 0 when nothing is selected but the
        model is not empty. Returns without doing anything for an empty model.
        """
        # NOTE: this writes ``selectMapIdx`` (capital I), not ``selectMapidx``
        if self.selectionmodel.hasSelection():
            self.selectMapIdx = self.selectionmodel.selectedIndexes()[0].row()
        elif self.headermodel.rowCount() > 0:
            self.selectMapIdx = 0
        else:
            return

        if hasattr(self,
                   'imgShapes') and (self.selectMapIdx < len(self.imgShapes)):
            # self.clusterImage.clear()
            # show an all-zero image with the shape of the selected map
            img = np.zeros((self.imgShapes[self.selectMapIdx][0],
                            self.imgShapes[self.selectMapIdx][1]))
            self.clusterImage.setImage(img=img)
        if hasattr(self, 'scatterPlot'):
            self.clusterScatterPlot.plotItem.clearPlots()
            self.clusterScatterPlot.scatterData = None
        self.rawSpecPlot.clearAll()
        self.rawSpecPlot._data = None
        self.clusterMeanPlot.clearAll()
        self.clusterMeanPlot._data = None

    def updateRoiMask(self):
        """Sync the ROI, auto-mask and select-mask overlays with the selected map.

        Each overlay state is read from the selected model item
        (``roiState``, ``maskState``, ``selectState``). Every state is
        handled as a pair: a visibility flag and the state or image to apply.
        """
        # NOTE: this writes ``selectMapIdx`` (capital I), not ``selectMapidx``
        if self.selectionmodel.hasSelection():
            self.selectMapIdx = self.selectionmodel.selectedIndexes()[0].row()
        elif self.headermodel.rowCount() > 0:
            self.selectMapIdx = 0
        else:
            return
        # Each section below is best effort: any failure (presumably an item
        # that does not carry the state yet - confirm) leaves the overlay
        # hidden or untouched instead of raising out of a Qt slot.
        # update roi
        try:
            roiState = self.headermodel.item(self.selectMapIdx).roiState
            if roiState[0]:  # roi on
                self.roi.show()
            else:
                self.roi.hide()
            # update roi state without emitting ROI change signals
            self.roi.blockSignals(True)
            self.roi.setState(roiState[1])
            self.roi.blockSignals(False)
        except Exception:
            self.roi.hide()
        # update automask
        try:
            maskState = self.headermodel.item(self.selectMapIdx).maskState
            self.maskItem.setImage(maskState[1])
            if maskState[0]:  # automask on
                self.maskItem.show()
            else:
                self.maskItem.hide()
        except Exception:
            pass
        # update selectMask
        try:
            selectMaskState = self.headermodel.item(
                self.selectMapIdx).selectState
            self.selectMaskItem.setImage(selectMaskState[1])
            if selectMaskState[0]:  # selectmask on
                self.selectMaskItem.show()
            else:
                self.selectMaskItem.hide()
        except Exception:
            pass

    def setHeader(self, field: str):
        """Load per-map metadata and spectra for every header in the model.

        Rebuilds the parallel lists ``wavenumberList``, ``imgShapes``,
        ``rc2indList``, ``ind2rcList``, ``pathList`` and ``dataSets`` from
        the headers, then resets the views.

        Args:
            field: Field name used to fetch the first event of each header.
        """
        self.headers = [
            self.headermodel.item(i).header
            for i in range(self.headermodel.rowCount())
        ]
        self.field = field
        self.wavenumberList = []
        self.imgShapes = []
        self.rc2indList = []
        self.ind2rcList = []
        self.pathList = []
        self.dataSets = []

        # get wavenumbers, imgShapes, rc2ind
        for header in self.headers:
            dataEvent = next(header.events(fields=[field]))
            self.wavenumberList.append(dataEvent['wavenumbers'])
            self.imgShapes.append(dataEvent['imgShape'])
            self.rc2indList.append(dataEvent['rc_index'])
            self.ind2rcList.append(dataEvent['index_rc'])
            self.pathList.append(dataEvent['path'])
            # get raw spectra
            data = None
            try:  # spectra datasets
                data = header.meta_array('spectra')
            except IndexError:
                # f-string so the actual field name shows up in the log
                msg.logMessage(
                    f'Header object contained no frames with field {field}.',
                    msg.ERROR)
            # NOTE(review): a header without spectra adds nothing here, so
            # dataSets can get out of step with the other per-map lists.
            if data is not None:
                self.dataSets.append(data)
        self.cleanUp()

    def isMapOpen(self):
        """Return whether a map is selected, caching its data on the widget.

        When a map is selected, the per-map entries (item, selected pixels,
        header, wavenumbers, index maps, image shape, spectra) are copied to
        attributes of this widget and of the scatter/spectra plots.

        Returns:
            bool: False if no index is selected, True otherwise.
        """
        if not self.selectionmodel.selectedIndexes():  # no map is open
            return False
        else:
            self.selectMapidx = self.selectionmodel.selectedIndexes()[0].row()
            # get current data
            self.item = self.headermodel.item(self.selectMapidx)
            self.selectedPixels = self.item.selectedPixels
            self.clusterScatterPlot.selectedPixels = self.selectedPixels
            self.currentHeader = self.headers[self.selectMapidx]
            self.wavenumbers = self.wavenumberList[self.selectMapidx]
            self.rc2ind = self.rc2indList[self.selectMapidx]
            self.ind2rc = self.ind2rcList[self.selectMapidx]
            self.clusterScatterPlot.ind2rc = self.ind2rc
            self.clusterScatterPlot.rc2ind = self.rc2ind
            self.imgShape = self.imgShapes[self.selectMapidx]
            self.data = self.dataSets[self.selectMapidx]
            self.rawSpecPlot.setHeader(self.currentHeader, 'spectra')
            if self.selectedPixels is not None:
                # lookups between a pixel's (row, col) and its position in
                # the list of selected pixels
                self.clusterScatterPlot.selPx_rc2ind = {
                    tuple(pixel): i
                    for i, pixel in enumerate(self.selectedPixels)
                }
                self.clusterScatterPlot.selPx_ind2rc = {
                    i: tuple(pixel)
                    for i, pixel in enumerate(self.selectedPixels)
                }

            return True
Example #6
0
	projection=[0, 1]) 
# Shape graph over X, built from the lens with the cover and clusterer below
graph = mapper.map(
    lens,
    X=X,
    cover=Cover(20, 0.5),
    clusterer=optimize_dbscan(X, k=3, p=100.0),
)

# Wrap the shape graph and the targets in a DyNeuGraph
dG = DyNeuGraph(G=graph, y=y)

# Register the lens and a set of networkx layout functions as custom layouts,
# in this order
for _layout, _layout_name in (
    (lens, 'lens'),
    (nx.spring_layout, 'nx.spring'),
    (nx.kamada_kawai_layout, 'nx.kamada_kawai'),
    (nx.spectral_layout, 'nx.spectral'),
    (nx.circular_layout, 'nx.circular'),
):
    dG.add_custom_layout(_layout, name=_layout_name)

# Configure the projections; UMAP is initialised with the PCA coordinates
pca = PCA(2, random_state=1)
tsne = TSNE(2, init='pca', random_state=1)
umap = UMAP(n_components=2, init=pca.fit_transform(X))

# Fit each projection on X in turn and register the result as a custom
# layout; UMAP is fitted twice, without and then with the target
for _reducer, _fit_kwargs, _layout_name in (
    (pca, {}, 'PCA'),
    (tsne, {}, 'TSNE'),
    (umap, {'y': None}, 'UMAP'),
    (umap, {'y': target}, 'Supervised UMAP'),
):
    dG.add_custom_layout(
        _reducer.fit_transform(X, **_fit_kwargs), name=_layout_name)

# Visualize
dG.visualize(static=True, show=True)