def single_run_repeatable_and_monotonic(self,
                                            aArg,
                                            oArg,
                                            algName,
                                            iArg,
                                            nBatch=1):
        """ Test a single call to bnpy.run, verify repeatability and monotonic.
        """
        self.pprintSingleRun(aArg, oArg, algName, iArg, nBatch)

        kwargs = self.makeAllKwArgs(aArg, oArg, algName, iArg, nBatch)
        model1, Info1 = bnpy.run(self.Data, arg2name(aArg), arg2name(oArg),
                                 algName, **kwargs)
        self.pprintResult(model1, Info1)

        loss_history = Info1['loss_history']
        if 'moVB' in algName:
            loss_history = loss_history[Info1['lap_history'] >= 1.0]
        isMonotonic = self.isMonotonicallyIncreasing(-1 * loss_history)
        try:
            assert isMonotonic
        except AssertionError:
            from IPython import embed
            embed()

        model2, Info2 = bnpy.run(self.Data, arg2name(aArg), arg2name(oArg),
                                 algName, **kwargs)
        self.pprintResult(model2, Info2)
        isRepeatable = np.allclose(Info1['loss_history'],
                                   Info2['loss_history'])
        assert isRepeatable
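# A minimal sketch of the monotonicity helper assumed above; the real
# test-harness method may differ (e.g. in how it tolerates numerical noise).
import numpy as np

def isMonotonicallyIncreasing(arr, atol=1e-8):
    """Return True if each entry is >= the previous one, up to atol."""
    arr = np.asarray(arr)
    return bool(np.all(np.diff(arr) >= -atol))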
Example #2
def fit_model(name, dataset):
    """

    :param name:
    :param dataset: bnpy.data.XData object
    :return:
    """
    gamma = 1.0  # Prior on dirichlet dispersion parameter
    sF = 1.0  # Prior covariance matrix is Identity * sF
    K = 5  # Numver of initial clusters

    workdir = tempfile.mkdtemp(prefix=name)
    outputdir = 'trymoves-K={K}-gamma={G}-ECovMat={Cov}-moves=birth,merge,shuffle/'.format(
        K=K, G=gamma, Cov=sF)
    output_path = os.path.join(workdir, outputdir)

    blockPrint()
    trained_model, info_dict = bnpy.run(dataset,
                                        'DPMixtureModel',
                                        'Gauss',
                                        'memoVB',
                                        output_path=output_path,
                                        nLap=100,
                                        nTask=1,
                                        nBatch=1,
                                        gamma0=gamma,
                                        sF=sF,
                                        ECovMat='eye',
                                        K=K,
                                        moves='birth,merge,shuffle')
    enablePrint()

    shutil.rmtree(workdir)
    return trained_model
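# Hypothetical usage of fit_model; the array shape and name prefix are
# illustrative only.
import numpy as np
import bnpy

X = np.random.randn(500, 2)      # 500 samples, 2 features
dataset = bnpy.data.XData(X)     # wrap as the bnpy.data.XData the docstring expects
model = fit_model('demo_', dataset)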
Example #3
    def fit(self, X, lengths):
        '''
        Fit the model to stacked observations X, split into sequences by lengths.

        # Example: load the mocap6 dataset from file
        import os
        dataset_path = os.path.join(bnpy.DATASET_PATH, 'mocap6')
        mocap6_dataset = bnpy.data.GroupXData.read_npz(os.path.join(dataset_path, 'dataset.npz'))
        '''
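        # Shift by one step so that row t of X is paired with row t of Xprev
        # (the previous observation); models that condition on the previous
        # timestep (e.g. AutoRegGauss) need both arrays.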
        Xprev = X[:-1, :]
        X = X[1:, :]
        doc_range = [0] + np.cumsum(lengths).tolist()
        dataset = bnpy.data.GroupXData(X, doc_range, None, Xprev)

        # Set the hyperparameters
        model, model_info = bnpy.run(dataset,
                                     self.alloModel,
                                     self.obsModel,
                                     self.varMethod,
                                     nLap=self.n_iteration,
                                     nTask=self.nTask,
                                     nBatch=self.nBatch,
                                     convergethr=self.convergethr,
                                     alpha=self.alpha,
                                     gamma=self.gamma,
                                     sF=self.sF,
                                     ECovMat=self.ECovMat,
                                     K=self.K,
                                     initname=self.initname)
        #       self.log_startprob = log_mask_zero(model.allocModel.get_init_prob_vector())
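        # NOTE: despite its name, log_startprob below holds normalized
        # probabilities, not log-probabilities (contrast the commented-out
        # log-space version above).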
        self.log_startprob = model.allocModel.get_active_comp_probs()
        self.log_startprob = self.log_startprob / sum(self.log_startprob)
        self.log_transmat = model.allocModel.get_trans_prob_matrix()
        self.model = model
        return self
Example #4
    def fit(self, X, lengths):
        Xprev = X

        doc_range = [0] + np.cumsum(lengths).tolist()

        dataset = bnpy.data.GroupXData(X, doc_range, None, Xprev)

        # Set the hyperparameters
        model, model_info = bnpy.run(
            dataset,
            self.alloModel,
            self.obsModel,
            self.varMethod,
            #output_path = os.path.join(model_save_path, 'results'),
            nLap=self.n_iteration,
            nTask=self.nTask,
            nBatch=self.nBatch,
            convergethr=self.convergethr,
            alpha=self.alpha,
            gamma=self.gamma,
            sF=self.sF,
            ECovMat=self.ECovMat,
            K=self.K,
            initname=self.initname)

        self.model = model
        return self
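# A small sketch of how doc_range partitions the stacked rows of X into
# sequences; the lengths used here are illustrative only.
import numpy as np

lengths = [3, 2]                               # two sequences: 3 rows, then 2
doc_range = [0] + np.cumsum(lengths).tolist()  # -> [0, 3, 5]
# sequence i occupies rows doc_range[i]:doc_range[i+1] of X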
Example #5
    def setUp(self):
        ''' Create a valid Data - model - LP - SS configuration
        '''
        # Make toy data
        Data = ToyHMMK4.get_data(12345, T=15, nDocTotal=3)
        self.Data = Data

        hmodel, Info = bnpy.run(Data,
                                'HDPHMM',
                                'Gauss',
                                'VB',
                                nLap=1,
                                K=6,
                                initname='randexamplesbydist',
                                alpha=0.5,
                                gamma=5.0,
                                ECovMat='eye',
                                sF=1.0,
                                kappa=1e-5,
                                doWriteStdOut=False,
                                doSaveToDisk=False)
        LP = hmodel.calc_local_params(Data, limitMemoryLP=0)
        assert 'mHtable' not in LP

        self.mPairIDs = [(0, 1), (2, 3), (4, 5), (1, 5), (3, 4)]
        SS = hmodel.get_global_suff_stats(Data,
                                          LP,
                                          doPrecompEntropy=1,
                                          doPrecompMergeEntropy=1,
                                          mPairIDs=self.mPairIDs)
        hmodel.update_global_params(SS)
        self.hmodel = hmodel
        self.origLP = LP
        self.origSS = SS.copy()
Example #6
    def run(self,
            data,
            mixModel='DPMixtureModel',
            obsModel='Gauss',
            alg='memoVB'):
        dp_model, dp_info_dict = bnpy.run(data,
                                          mixModel,
                                          obsModel,
                                          alg,
                                          K=self.K,
                                          output_path=self.output_path,
                                          nLap=self.nLap,
                                          nTask=self.nTask,
                                          nBatch=self.nBatch,
                                          sF=self.sF,
                                          ECovMat=self.ECovMat,
                                          m_startLap=self.m_startLap,
                                          initname=self.initname,
                                          moves=self.moves,
                                          b_startLap=self.b_startLap,
                                          b_Kfresh=self.b_Kfresh,
                                          doSaveToDisk=self.doSaveToDisk,
                                          gamma1=self.gamma1,
                                          gamma0=self.gamma0,
                                          Kmax=self.Kmax,
                                          taskID=self.taskID)
        return dp_model, dp_info_dict
Example #7
    def single_run_monotonic(self, aArg, oArg, algName, iArg):
        """ Test a single call to bnpy.run, verify monotonicity only.
        """
        self.pprintSingleRun(aArg, oArg, algName, iArg)

        kwargs = self.makeAllKwArgs(aArg, oArg, algName, iArg)
        model1, Info1 = bnpy.run(self.Data, arg2name(aArg), arg2name(oArg),
                                 algName, **kwargs)
        self.pprintResult(model1, Info1)
        isMonotonic = self.isMonotonic(Info1['evTrace'])
        assert isMonotonic
    def run_MOVBWithMoves_SegmentManySeq(
            self,
            aArg,
            oArg,
            moves='merge,delete,shuffle,seqcreate',
            algName='moVB',
            nWorkers=0,
            **kwargs):
        """ Execute single run with all moves enabled.

        Post Condition
        --------------
        Will raise AssertionError if any bad results detected.
        """
        self.Data.alwaysTrackTruth = 1
        Ktrue = np.unique(self.Data.TrueParams['Z']).size

        pprint(aArg)
        pprint(oArg)
        initArg = dict(**kwargs)
        pprint(initArg)

        viterbiPath = os.path.expandvars(
            '$BNPYROOT/bnpy/learnalg/extras/XViterbi.py')
        kwargs = self.makeAllKwArgs(aArg,
                                    oArg,
                                    initArg,
                                    moves=moves,
                                    nWorkers=nWorkers,
                                    customFuncPath=viterbiPath,
                                    doSaveToDisk=1,
                                    doWriteStdOut=1,
                                    printEvery=1,
                                    saveEvery=1000,
                                    **kwargs)

        kwargs['jobname'] += '-creationProposalName=%s' % (
            kwargs['creationProposalName'])
        model, Info = bnpy.run(self.Data, arg2name(aArg), arg2name(oArg),
                               algName, **kwargs)
        pprintResult(model, Info, Ktrue=Ktrue)
        try:
            assert model.allocModel.K == model.obsModel.K
            assert model.allocModel.K == Ktrue

        except AssertionError as e:
            pprintCommandToReproduceError(self.datasetArg, aArg, oArg, algName,
                                          **kwargs)
            assert model.allocModel.K == model.obsModel.K
            if model.allocModel.K != Ktrue:
                print('>>>>>> WHOA! Kfinal != Ktrue <<<<<<')
        print('')
        return Info
Example #9
    def single_run_repeatable_and_monotonic(self, aArg, oArg, algName, iArg):
        """ Test a single call to bnpy.run, verify repeatability and monotonic.
        """
        self.pprintSingleRun(aArg, oArg, algName, iArg)

        kwargs = self.makeAllKwArgs(aArg, oArg, algName, iArg)
        model1, Info1 = bnpy.run(self.Data, arg2name(aArg), arg2name(oArg),
                                 algName, **kwargs)
        self.pprintResult(model1, Info1)

        evTrace = Info1['evTrace']
        if 'moVB' in algName:
            evTrace = evTrace[Info1['lapTrace'] >= 1.0]
        isMonotonic = self.isMonotonic(evTrace)
        assert isMonotonic

        model2, Info2 = bnpy.run(self.Data, arg2name(aArg), arg2name(oArg),
                                 algName, **kwargs)
        self.pprintResult(model2, Info2)
        isRepeatable = np.allclose(Info1['evTrace'], Info2['evTrace'])
        assert isRepeatable
Example #10
    def fit(self, name='MultivariateAnalysis', verbose=False):
        """
        Fits the multivariate analysis

        :param name: Name for the output directory
        :return:
        """
        if self.og_data.shape[0] > self.og_data.shape[1]:
            print 'WARNING: Number of genes outnumbers samples. ' \
                  'Consider more stringent filtering.'

        # This is a pandas dataframe: genes x samples
        data = self.og_data
        if self.center:
            if self.verbose:
                print 'centering data'
            data = data.apply(lambda x: x - x.mean(), axis=1)

        data = data.T.values
        xdata = bnpy.data.XData(data)

        workdir = tempfile.mkdtemp(prefix="%s_" % name)
        output_dir = 'K={K}-gamma={G}-ECovMat={Cov}-moves=birth,merge,delete,shuffle/'.format(
            K=self.K, G=self.gamma, Cov=self.variance)
        output_path = os.path.join(workdir, output_dir)
        hmodel, info_dict = bnpy.run(xdata,
                                     'DPMixtureModel',
                                     'Gauss',
                                     'memoVB',
                                     nLap=1000,
                                     nTask=1,
                                     nBatch=1,
                                     gamma0=self.gamma,
                                     sF=self.variance,
                                     ECovMat='eye',
                                     K=self.K,
                                     initname='randexamplesbydist',
                                     moves='birth,merge,delete,shuffle',
                                     b_startLap=0,
                                     m_startLap=2,
                                     d_startLap=2,
                                     output_path=output_path,
                                     doWriteStdOut=verbose)
        self.hmodel = hmodel
        self.clusters = collections.defaultdict(list)
        for sample, cluster in zip(self.og_data.columns,
                                   self.get_assignments(self.og_data)):
            self.clusters[cluster].append(sample)
        return self.hmodel
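    # get_assignments is not shown in this example; a plausible sketch,
    # following the calc_local_params pattern used elsewhere on this page
    # (centering, if enabled, would need to mirror fit()):
    def get_assignments(self, df):
        xdata = bnpy.data.XData(df.T.values)   # samples x genes
        LP = self.hmodel.calc_local_params(xdata)
        return LP['resp'].argmax(axis=1)       # hard cluster label per sample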
    def run_MOVBWithMoves(self,
                          aArg,
                          oArg,
                          moves='merge',
                          algName='moVB',
                          nWorkers=0,
                          **kwargs):
        """ Execute single run with merge moves enabled.

        Post Condition
        --------------
        Will raise AssertionError if any bad results detected.
        """
        Ktrue = self.Data.TrueParams['K']
        pprint(aArg)
        pprint(oArg)
        initArg = dict(**kwargs)
        pprint(initArg)
        kwargs = self.makeAllKwArgs(aArg,
                                    oArg,
                                    initArg,
                                    moves=moves,
                                    nWorkers=nWorkers,
                                    **kwargs)
        model, Info = bnpy.run(self.Data, arg2name(aArg), arg2name(oArg),
                               algName, **kwargs)
        pprintResult(model, Info, Ktrue=Ktrue)

        afterFirstLapMask = Info['lapTrace'] >= 1.0
        evTraceAfterFirstLap = Info['evTrace'][afterFirstLapMask]
        isMonotonic = is_monotonic(evTraceAfterFirstLap, aArg=aArg)

        try:
            assert isMonotonic
            assert model.allocModel.K == model.obsModel.K
            assert model.allocModel.K == Ktrue

        except AssertionError as e:
            pprintCommandToReproduceError(self.datasetArg, aArg, oArg, algName,
                                          **kwargs)
            assert isMonotonic
            assert model.allocModel.K == model.obsModel.K
            if model.allocModel.K != Ktrue:
                print('>>>>>> WHOA! Kfinal != Ktrue <<<<<<')
        return Info
def bnpy_select_clusters(data, max_cells=50000):
    """
    Args:
        data: matrix of shape genes x cells

    Returns:
        selected k based on converged Gaussian DPMM, and
            the assigned labels.
    """
    # TODO: randomly sub-select max_cells
    selected_cell_ids = list(range(data.shape[1]))
    if max_cells < data.shape[1]:
        import random
        selected_cell_ids = random.sample(selected_cell_ids, max_cells)
    data = data[:, selected_cell_ids]
    tsvd = TruncatedSVD(8)
    data_tsvd = tsvd.fit_transform(log1p(cell_normalize(data)).T)
    data_dense_bnpy = bnpy.data.XData(data_tsvd)
    trained_model, info_dict = bnpy.run(
        data_dense_bnpy,
        'DPMixtureModel',
        'Gauss',
        'memoVB',
        #doSaveToDisk=False,
        doWriteStdOut=False,
        output_path='./temp',
        nLap=100,
        nTask=1,
        nBatch=1,
        sF=0.1,
        ECovMat='eye',
        K=10,
        initname='randexamples',
        moves='birth,merge,shuffle',
        m_startLap=5,
        b_startLap=2,
        b_Kfresh=4)
    selected_k = info_dict['K_history'][-1]
    results = trained_model.calc_local_params(data_dense_bnpy)
    cluster_labels = results['resp'].argmax(1)
    return selected_k, cluster_labels
pylab.tight_layout()

###############################################################################
#
# Training the model
# ------------------
# Let's do one single run of the VB algorithm.
#
# Using 10 clusters and the 'randexamples' initialization procedure.

trained_model, info_dict = bnpy.run(
    dataset,
    'FiniteMixtureModel',
    'Gauss',
    'VB',
    output_path='/tmp/AsteriskK8/helloworld-K=10/',
    nLap=100,
    sF=0.1,
    ECovMat='eye',
    K=10,
    initname='randexamples')

###############################################################################
#
# Loss function trace plot
# ------------------------
# We can plot the value of the loss function over iterations,
# starting after the first full pass over the dataset (first lap).
#
# As expected, we see monotonic decrease in the loss function's score
# after every subsequent iteration.
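
# A minimal sketch of the described trace plot, assuming pylab is imported
# and using the lap_history/loss_history keys seen elsewhere on this page.
pylab.figure()
pylab.plot(info_dict['lap_history'][1:], info_dict['loss_history'][1:], 'k.-')
pylab.xlabel('num. laps')
pylab.ylabel('loss')
pylab.tight_layout()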
Example #14
        # perform at most this many iterations at each document
        nCoordAscentItersLP=100,
        # stop local iters early when max change in doc-topic counts < this thr
        convThrLP=convThrLP,
        )

    for nBatch in [1, 16]:
        
        output_path = '/tmp/wiki/scalability-model=hdp_topic+mult-alg=memoized-nBatch=%d-nCoordAscentItersLP=%s-convThrLP=%.3g/' % (
                nBatch, local_step_kwargs['nCoordAscentItersLP'], convThrLP)

        trained_model, info_dict = bnpy.run(
            dataset, 'HDPTopicModel', 'Mult', 'memoVB',
            output_path=output_path,
            nLap=nLap, nBatch=nBatch, convThr=convThr,
            K=K, gamma=gamma, alpha=alpha, lam=lam,
            initname='randomlikewang', 
            moves='shuffle',
            traceEvery=traceEvery, printEvery=printEvery,
            **local_step_kwargs)


###############################################################################
# Plot: Training Loss and Laps Completed vs. Wallclock time
# ---------------------------------------------------------
#
# * Left column: Training Loss progress vs. wallclock time
# * Right column: Laps completed vs. wallclock time
#
# Remember: one lap is a complete pass through the entire training set (6400 docs)
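
# A rough sketch of the left-column plot. NOTE: 'elapsed_time_sec_history'
# is an assumed Info-dict key (only 'lap_history' and 'loss_history' are
# confirmed by the other examples on this page); adapt to your bnpy version.
pylab.plot(info_dict['elapsed_time_sec_history'],
           info_dict['loss_history'],
           '.-', label='nBatch=%d' % nBatch)
pylab.xlabel('elapsed time (sec)')
pylab.ylabel('training loss')
pylab.legend()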
        cur_ax_handle.set_xticks([-2, -1, 0, 1, 2])
        cur_ax_handle.set_yticks([-2, -1, 0, 1, 2])
        cur_ax_handle.set_xlabel("lap: %d" % lap_val)
    pylab.tight_layout()


###############################################################################
# Training from K=1 cluster
# -------------------------
# 
# Using 1 initial cluster, with birth and merge proposal moves.

K1_trained_model, K1_info_dict = bnpy.run(
    dataset, 'DPMixtureModel', 'Gauss', 'memoVB',
    output_path='/tmp/AsteriskK8/trymoves-K=1/',
    nLap=100, nTask=1, nBatch=1,
    sF=0.1, ECovMat='eye',
    K=1, initname='randexamples',
    moves='birth,merge,shuffle',
    m_startLap=5, b_startLap=2, b_Kfresh=4)

show_clusters_over_time(K1_info_dict['task_output_path'])

###############################################################################
# Training from K=4 clusters
# --------------------------
# 
# Now using 4 initial clusters, with birth and merge proposal moves.

K4_trained_model, K4_info_dict = bnpy.run(
    dataset, 'DPMixtureModel', 'Gauss', 'memoVB',
    output_path='/tmp/AsteriskK8/trymoves-K=4/',
Example #16
def run_synthetic_data_comparisons(
    D: int,
    K: int,
    N: int,
    var_scale: int,
    alpha: int,
    iters: int,
    burnout: int,
    repeats: int,
):

    results = {
        "method": [],
        "k_mae": [],
        "NMI": [],
        "ARI": [],
        "Time": [],
    }

    i = 0
    while i < repeats:

        # generate dataset
        data, labels = DPMMPython.generate_gaussian_data(N, D, K, var_scale)

        prior = niw(1, np.zeros(D), 100, np.eye(D) * 0.5)
        # run DPGMM

        if D == 2:
            start = timer()
            dpmm_splitnet_results = DPMMPython.fit(
                data,
                alpha,
                iterations=iters,
                burnout=burnout,
                verbose=False,
                init_type="splitnet_2d",
            )[0]
            dpmm_net_time = timer() - start

        elif D <= 10:
            start = timer()
            dpmm_splitnet_results = DPMMPython.fit(
                data,
                alpha,
                iterations=iters,
                burnout=burnout,
                verbose=False,
                init_type="splitnet_10d",
            )[0]
            dpmm_net_time = timer() - start

        else:
            start = timer()
            dpmm_splitnet_results = DPMMPython.fit(
                data,
                alpha,
                iterations=iters,
                burnout=burnout,
                verbose=False,
                init_type="splitnet_128d",
            )[0]
            dpmm_net_time = timer() - start

        if len(np.unique(dpmm_splitnet_results)) < K // 2:
            print("failed.")
        else:
            start = timer()
            dpmm_rand_results = DPMMPython.fit(
                data,
                alpha,
                iterations=iters,
                burnout=burnout,
                verbose=False,
                init_type="none",
            )[0]
            dpmm_rand_time = timer() - start

            start = timer()
            dpmm_kmeans_results = DPMMPython.fit(
                data,
                alpha,
                iterations=iters,
                burnout=burnout,
                verbose=False,
                init_type="kmeans",
            )[0]
            dpmm_kmeans_time = timer() - start

            # run kmeans
            start = timer()
            kmeans = KMeans(n_clusters=K).fit(data.T)
            kmeans_time = timer() - start
            kmeans_labels = kmeans.labels_

            # run GMM
            start = timer()
            gmm = GaussianMixture(n_components=K,
                                  covariance_type="full").fit(data.T)
            gmm_labels = gmm.predict(data.T)
            gmm_time = timer() - start

            # sklearn DPGMM
            start = timer()
            dpgmm = BayesianGaussianMixture(
                n_components=2 * K,
                covariance_type="full",
                weight_concentration_prior=alpha,
                weight_concentration_prior_type="dirichlet_process",
                mean_precision_prior=1e2,
                covariance_prior=1e0 * np.eye(D),
                init_params="kmeans",
                max_iter=iters,
                verbose=0,
            ).fit(data.T)
            dpgmm_labels = dpgmm.predict(data.T)
            dpgmmsk_time = timer() - start

            # moVB

            # pass data NxD
            data_bnpy = bnpy.data.XData(data.T)

            start = timer()
            model, run_info = bnpy.run(
                data_bnpy,
                "DPMixtureModel",
                "Gauss",
                "memoVB",
                nTask=1,
                nBatch=1,
                K=1,
                nLap=iters,
                moves="birth,merge,shuffle",
                gt=labels,
                gamma0=alpha,
            )

            moVB_time = timer() - start
            LP = model.calc_local_params(data_bnpy)
            moVB_labels = LP["resp"].argmax(axis=1)

            # calc metrics and aggregate
            results = add_results(results, "k-means", labels, kmeans_labels,
                                  kmeans_time)
            results = add_results(results, "EM-GMM", labels, gmm_labels,
                                  gmm_time)
            results = add_results(results, "DPGMM (SKlearn's)", labels,
                                  dpgmm_labels, dpgmmsk_time)
            results = add_results(results, "DPGMM-Random", labels,
                                  dpmm_rand_results, dpmm_rand_time)
            results = add_results(results, "DPGMM-k-means", labels,
                                  dpmm_kmeans_results, dpmm_kmeans_time)
            results = add_results(results, "DPGMM-SplitNet", labels,
                                  dpmm_splitnet_results, dpmm_net_time)
            results = add_results(results, "moVB", labels, moVB_labels,
                                  moVB_time)

            i += 1
            print(f"Finished iteration {i}")

    return results
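# Hypothetical invocation; argument values are illustrative only.
import pandas as pd

results = run_synthetic_data_comparisons(
    D=2, K=5, N=10000, var_scale=100, alpha=10,
    iters=100, burnout=5, repeats=10)
print(pd.DataFrame(results).groupby('method').mean())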
    # Set "reactivation" limits
    # So that each cluster is eligible again after 10 passes through the dataset
    # Or when its size changes by 400%
    m_nLapToReactivate=10,
    m_minPercChangeInNumAtomsToReactivate=400 * 0.01,
    # Specify how to rank pairs (determines order in which merges are tried)
    # 'obsmodel_elbo' means rank pairs by improvement to observation model ELBO
    m_pair_ranking_procedure='obsmodel_elbo',
    m_pair_ranking_direction='descending',
)

goodelbopairs_trained_model, goodelbopairs_info_dict = bnpy.run(
    dataset,
    'HDPHMM',
    'AutoRegGauss',
    'memoVB',
    output_path=output_path_starter +
    'trymerge-K=20-model=HDPHMM+ARMA-ECovMat=1*eye-merge_strategy=good_elbo_pairs/',
    moves='merge,shuffle',
    **dict(list(alg_kwargs.items()) + list(init_kwargs.items()) +
           list(hdphmm_kwargs.items()) + list(gauss_kwargs.items()) +
           list(goodelbopairs_merge_kwargs.items())))
K = goodelbopairs_trained_model.obsModel.K
start_prob_K = goodelbopairs_trained_model.allocModel.get_init_prob_vector()
trans_prob_KK = goodelbopairs_trained_model.allocModel.get_trans_prob_matrix()
prior = goodelbopairs_trained_model.obsModel.calcLogSoftEvMatrix_FromPost
post = goodelbopairs_trained_model.obsModel.Post

print("printing all data!")
print(goodelbopairs_trained_model.obsModel.Post.M)
print(goodelbopairs_trained_model.obsModel.Post.B)
print(goodelbopairs_trained_model.obsModel.Post)
print("printing finished!")
Example #18
import bnpy
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt

if __name__ == '__main__':
    ndim = 20
    train_path = "../raw_data/train.wav"
    train_wav, _ = librosa.load(train_path, sr=44100)
    train = librosa.feature.mfcc(train_wav, sr=44100, n_mfcc=ndim)
    colnames = ['mfcc' + str(i) for i in range(train.shape[0])]
    df = pd.DataFrame(data=train.T, columns=colnames)
    df.to_csv('../raw_data/train_mfcc.csv', index=False)

    hmodel, Rinfo = bnpy.run('../raw_data/train_mfcc.csv',
                             'FiniteMixtureModel',
                             'Gauss',
                             'EM',
                             K=3,
                             output_path='out/2/',
                             nLap=300,
                             minLaps=10)
    plt.plot(np.arange(0, ndim),
             hmodel.obsModel.get_mean_for_comp(0),
             label="Cluster 0")
    plt.plot(np.arange(0, ndim),
             hmodel.obsModel.get_mean_for_comp(1),
             label="Cluster 1")
    plt.plot(np.arange(0, ndim),
             hmodel.obsModel.get_mean_for_comp(2),
             label="Cluster 2")
    plt.xlabel('Feat.')
    plt.ylabel('Norm. Pow')
    plt.ylim(-100, 180)
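    # The snippet stops before rendering; a minimal finish, assuming an
    # interactive matplotlib backend:
    plt.legend()
    plt.show()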
Example #19
###############################################################################
#
# Run the VB+proposals algorithm
# with only merges and re-shuffling.
#
# Initialization: 10 topics, using randomlikewang

trained_model, info_dict = bnpy.run(
    dataset,
    'HDPTopicModel',
    'Mult',
    'memoVB',
    output_path='/tmp/bars_one_per_doc/' +
    'trymoves-model=hdp+mult-K=10-moves=merge,shuffle/',
    nLap=50,
    convergeThr=0.001,
    nBatch=1,
    K=10,
    initname='randomlikewang',
    alpha=0.5,
    lam=0.1,
    moves='merge,shuffle',
    **dict(list(merge_kwargs.items()) + list(local_step_kwargs.items())))

###############################################################################
#
#


def show_bars_over_time(task_output_path=None,
                        query_laps=[0, 1, 2, 5, None],
Example #20
# Start with too many clusters (K=25)

gamma = 5.0
sF = 5.0
K = 25

diag1_trained_model, diag1_info_dict = bnpy.run(
    dataset,
    'DPMixtureModel',
    'DiagGauss',
    'memoVB',
    output_path=(
        '/tmp/faithful/' +
        'trymoves-K=%d-gamma=%s-lik=DiagGauss-ECovMat=%s*eye-moves=none/' %
        (K, gamma, sF)),
    nLap=1000,
    nTask=1,
    nBatch=1,
    convergeThr=0.0001,
    gamma0=gamma,
    sF=sF,
    ECovMat='eye',
    K=K,
    initname='randexamplesbydist',
)
show_clusters_over_time(diag1_info_dict['task_output_path'])

###############################################################################
#
# *DiagGauss* observation model
# --------------------------------------
Example #21
pylab.tight_layout()

###############################################################################
#
# Cold-start model training
# -------------------------
# Let's do one single run of the VB algorithm.
#
# Using 10 clusters and the 'randexamples' initialization procedure.

cold_start_model, cold_info_dict = bnpy.run(
    dataset,
    'FiniteMixtureModel',
    'Gauss',
    'VB',
    output_path='/tmp/AsteriskK8/coldstart-K=10/',
    nLap=25,
    sF=0.1,
    ECovMat='eye',
    K=10,
    initname='randexamples')


###############################################################################
#
# Setup helper method to visualize clusters
# -----------------------------------------
# Here's a short function to show how clusters evolve during training.
def show_clusters_over_time(task_output_path=None,
                            query_laps=[0, 1, 2, 5, 10, None],
                            nrows=2):
    parser.add_argument('--K', type=int, default=200)
    parser.add_argument('--nnzPerRowLP', type=int, default=5)
    parser.add_argument('--convThrLP', type=float, default=-1.0)
    parser.add_argument('--nCoordAscentItersLP', type=int, default=50)
    parser.add_argument('--initLaps', type=int, default=2)
    args = parser.parse_args()

    if args.dataName == 'AdmixAsteriskK8':
        import AdmixAsteriskK8
        Data = AdmixAsteriskK8.get_data(nDocTotal=args.nDocTotal,
                                        nObsPerDoc=200)
        hmodel, Info = bnpy.run(Data,
                                'HDPTopicModel',
                                'Gauss',
                                'memoVB',
                                ECovMat='diagcovdata',
                                sF=0.1,
                                nLap=args.initLaps,
                                initname='randexamples',
                                K=args.K,
                                nBatch=1)
    else:
        import MixBarsK10V900
        Data = MixBarsK10V900.get_data(nDocTotal=args.nDocTotal,
                                       nWordsPerDoc=500)
        hmodel, Info = bnpy.run(Data,
                                'HDPTopicModel',
                                'Mult',
                                'memoVB',
                                lam=0.1,
                                nLap=args.initLaps,
                                initname='randexamples',
Example #23
    # Set "reactivation" limits
    # So that each cluster is eligible again after 10 passes through the dataset
    # Or when its size changes by 400%
    m_nLapToReactivate=10,
    m_minPercChangeInNumAtomsToReactivate=400 * 0.01,
    # Specify how to rank pairs (determines order in which merges are tried)
    # 'total_size' and 'descending' means try largest combined clusters first
    m_pair_ranking_procedure='total_size',
    m_pair_ranking_direction='descending',
)

allpairs_trained_model, allpairs_info_dict = bnpy.run(
    dataset,
    'HDPHMM',
    'DiagGauss',
    'memoVB',
    output_path=
    '/tmp/mocap6/trymerge-K=20-model=HDPHMM+DiagGauss-ECovMat=1*eye-merge_strategy=all_pairs/',
    moves='merge,shuffle',
    **dict(list(alg_kwargs.items()) + list(init_kwargs.items()) +
           list(hdphmm_kwargs.items()) + list(gauss_kwargs.items()) +
           list(allpairs_merge_kwargs.items())))

###############################################################################
#
# Large-Pairs : Try 5-largest-size pairs of merges every 10 laps
# --------------------------------------------------------------
#
# This is much cheaper than all pairs. Let's see how well it does.

largepairs_merge_kwargs = dict(
    m_startLap=10,
    # Set limits to number of merges attempted each lap.
Example #24
    b_Kfresh=5)

# output_path = os.path.join(bnpy.ROOT_PATH,
#                            "code/output/trymoves-model=hdp_topic+mult-K=5/")

# Start at 20 or 30 topics TODO

trained_model, info_dict = bnpy.run(
    dataset,
    'HDPTopicModel',
    'Mult',
    'memoVB',
    output_path='/tmp/hdp_topic+mult-K=5/',
    nLap=2000,
    convergeThr=0.01,
    nBatch=5,
    K=5,
    initname='randomlikewang',
    gamma=50.0,
    alpha=0.5,
    lam=0.1,
    moves='birth,merge,shuffle',
    **dict(list(local_step_kwargs.items()) + list(merge_kwargs.items()) +
           list(birth_kwargs.items())))

###############################################################################
#
# Setup: Helper function to plot topics at each stage of training


def show_top_words_over_time(task_output_path=None,
ECovMat = 'eye'

nLap = 200

###############################################################################
#
# Baseline: Mixture model with *DiagGauss* observation model
# ----------------------------------------------------------
#
# We'll take the best of 3 independent inits ('tasks')


mix_model, mix_info_dict = bnpy.run(
    dataset, 'FiniteMixtureModel', 'DiagGauss', 'memoVB',
    output_path='/tmp/mocap6/test-model=FiniteMixtureModel+DiagGauss-ECovMat=1*eye/',
    nLap=nLap, nTask=3, nBatch=1, convergeThr=0.0001,
    gamma=1.0,
    sF=sF, ECovMat=ECovMat,
    K=K, initname='randexamples',
    )


###############################################################################
#
# FiniteTopicModel with *DiagGauss* observation model
# ---------------------------------------------------
#
# We'll take the best of 3 independent inits ('tasks')


finite_model, finite_info_dict = bnpy.run(
    dataset, 'FiniteTopicModel', 'DiagGauss', 'memoVB',
Example #26
# Assumes diagonal covariances.
#
# No sparsity assumptions during training

K = 3  # n clusters
gamma = 50.0  # DP concentration param
sF = 0.1  # scale of expected covariance

full_trained_model, full_info_dict = bnpy.run(
    dataset,
    'DPMixtureModel',
    'DiagGauss',
    'VB',
    output_path='/tmp/faithful/demo_sparse_resp-K=3-lik=Gauss-ECovMat=5*eye/',
    nLap=1000,
    nTask=5,
    nBatch=1,
    convergeThr=0.0001,
    gamma0=gamma,
    sF=sF,
    ECovMat='eye',
    K=K,
    initname='randexamples',
)

# Add this model into the current plot
bnpy.viz.PlotComps.plotCompsFromHModel(full_trained_model)

###############################################################################
#
# Do inference with L=1 sparsity
# ------------------------------
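
# A minimal sketch, assuming nnzPerRowLP=1 enforces one-nonzero-per-row
# responsibilities (nnzPerRowLP appears as a bnpy option elsewhere on this
# page); everything else mirrors the dense run above.
sparse1_trained_model, sparse1_info_dict = bnpy.run(
    dataset,
    'DPMixtureModel',
    'DiagGauss',
    'VB',
    output_path='/tmp/faithful/demo_sparse_resp-K=3-L=1/',
    nLap=1000,
    nTask=5,
    nBatch=1,
    convergeThr=0.0001,
    gamma0=gamma,
    sF=sF,
    ECovMat='eye',
    K=K,
    initname='randexamples',
    nnzPerRowLP=1,
)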
Example #27
sF = 1.0  # Set observation model prior so E[covariance] = identity
ECovMat = 'eye'

###############################################################################
#
# DP mixture with *DiagGauss* observation model
# ---------------------------------------------

mixdiag_trained_model, mixdiag_info_dict = bnpy.run(
    dataset,
    'DPMixtureModel',
    'DiagGauss',
    'memoVB',
    output_path='/tmp/mocap6/showcase-K=20-model=DP+DiagGauss-ECovMat=1*eye/',
    nLap=50,
    nTask=1,
    nBatch=1,
    convergeThr=0.0001,
    gamma=gamma,
    sF=sF,
    ECovMat=ECovMat,
    K=K,
    initname='randexamples',
)

###############################################################################
#
# HDP-HMM with *DiagGauss* observation model
# -------------------------------------------
#
# Assume diagonal covariances.
#
Example #28
# Train LDA topic model
# ---------------------
# 
# Using 10 clusters and the 'randexamples' initialization procedure.

local_step_kwargs = dict(
    # perform at most this many iterations at each document
    nCoordAscentItersLP=100,
    # stop local iters early when max change in doc-topic counts < this thr
    convThrLP=0.001,
    )

trained_model, info_dict = bnpy.run(
    dataset, 'FiniteTopicModel', 'Mult', 'VB',
    output_path='/tmp/bars_one_per_doc/helloworld-model=topic+mult-K=10/',
    nLap=100, convergeThr=0.01,
    K=10, initname='randomlikewang',
    alpha=0.5, lam=0.1,
    **local_step_kwargs)

###############################################################################
#
# First, we can plot the loss function over time
# We'll skip the first few iterations, since performance is quite bad.
#

pylab.figure(figsize=FIG_SIZE)
pylab.plot(info_dict['lap_history'][1:], info_dict['loss_history'][1:], 'k.-')
pylab.xlabel('num. laps')
pylab.ylabel('loss')
pylab.tight_layout()
Example #29
        # Plot the current model
        cur_ax_handle = ax_handle_list.flatten()[plot_id]
        bnpy.viz.PlotComps.plotCompsFromHModel(
            cur_model,
            Data=dataset,
        )  #ax_handle=cur_ax_handle)
        cur_ax_handle.set_xticks([-2, -1, 0, 1, 2])
        cur_ax_handle.set_yticks([-2, -1, 0, 1, 2])
        cur_ax_handle.set_xlabel("lap: %d" % lap_val)
    pylab.tight_layout()
    pylab.savefig("results/covMat1.png")
    pylab.waitforbuttonpress()
    pylab.show()


K25_trained_model, K25_info_dict = bnpy.run(
    "msnbc_wh.csv",
    'FiniteMixtureModel',
    'Gauss',
    'EM',
    output_path='results/',
    nLap=500,
    nTask=1,
    nBatch=1,
    sF=0.1,
    moves='birth,merge,shuffle',
    K=10,
)

show_clusters_over_time(K25_info_dict['task_output_path'])
Example #30
X_csr_DV = dataset.getSparseDocTypeCountMatrix()
bnpy.viz.BarsViz.show_square_images(
    X_csr_DV[:10].toarray(), vmin=0, vmax=5)
#pylab.colorbar()
#pylab.clabel('word count')
pylab.tight_layout()

###############################################################################
#
# Let's do one single run of the VB algorithm.
# 
# Using 10 clusters and the 'randexamples' initialization procedure.

trained_model, info_dict = bnpy.run(
    dataset, 'FiniteTopicModel', 'Bern', 'VB',
    output_path='/tmp/bars_one_per_doc/helloworld-lik=bernoulli-K=10/',
    nLap=1000, convergeThr=0.0001,
    K=10,
    alpha=0.5, lambda1=0.1, lambda0=0.1)

###############################################################################
#
# First, we can plot the loss function over time
# We'll skip the first few iterations, since performance is quite bad.
#

pylab.figure(figsize=FIG_SIZE)
pylab.plot(info_dict['lap_history'][2:], info_dict['loss_history'][2:], 'k.-')
pylab.xlabel('num. laps')
pylab.ylabel('loss')
pylab.tight_layout()