Code Example #1
def initial_values(p, k, m):
    """Creates a list of lenght m with initial values.
    K-means generates cluster assignments and groupwise maximum likelihood
    estimates are the initial Dirichlet parameter values. """

    scaler = preprocessing.StandardScaler()  # center and scale the data
    p_scaled = scaler.fit_transform(pd.DataFrame(p))
    cat = numpy.shape(p)[1]

    index = 0
    inits = []
    while index < m:
        kmeans = KMeans(n_clusters=k, n_init=5)
        z_init = kmeans.fit_predict(p_scaled) + 1

        rho_mles = numpy.zeros((k, cat))
        try:
            for l in numpy.arange(1, k + 1):
                rho_mles[l - 1, :] = dirichlet.mle(p[z_init == l])
        except Exception:
            # fall back to random parameters if the MLE fails to converge
            for l in numpy.arange(1, k + 1):
                rho_mles[l - 1, :] = numpy.random.uniform(low=0.1,
                                                          high=5,
                                                          size=cat)

        alpha_init = numpy.random.uniform(1, 2, 1)
        beta_init = numpy.random.uniform(0.1, 0.5, 1)

        inits_m = [z_init, rho_mles, alpha_init, beta_init]
        inits.append(inits_m)
        index += 1

    return inits
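
A minimal driver sketch for initial_values, using synthetic data; it assumes the snippet's imports (numpy, pandas, scikit-learn, and the dirichlet package) are in scope:

import numpy

numpy.random.seed(0)
# two well-separated Dirichlet clusters, 100 points each
p = numpy.vstack([numpy.random.dirichlet([8, 1, 1], 100),
                  numpy.random.dirichlet([1, 1, 8], 100)])
inits = initial_values(p, k=2, m=3)    # three random restarts
z_init, rho_mles, alpha_init, beta_init = inits[0]
print(rho_mles)                        # one fitted alpha vector per cluster row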
Code Example #2
def computeNIPS(dimensions):
    dsname, data, features = getNips()
    
    # note: n_topics was renamed to n_components in newer scikit-learn releases
    lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    
    lda.fit(data)
    mixt = lda.transform(data)

    def normalize(data):
        mixtd = data
        # nudge exact ones/zeros off the boundary of the simplex
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd + 0.0  # adding 0.0 returns a fresh array
    
    mixt = normalize(mixt)
    print(data.shape)
    featureTypes = ["continuous"] * mixt.shape[1]
    
    domains = [[0,1]] * mixt.shape[1]
    
    print(domains)
        
    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=0.1,
                                 families = ['histogram'] * data.shape[1],
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=100)
        return spn
    
    
    stats = Stats(name=dsname)
    
    for train, test, i in kfolded(mixt, 10):
        print(i)
        #dirichlet_alphas = getDirichlet(train)
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        
        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))

     
    stats.save("results/nips/"+ dsname + "-" + str(dimensions) + ".json")   
Code Example #3
File: models.py Project: seffka/pychord_tools
def fit(self, segments):
    """
    Fits the model to the given chroma segments.
    :param segments: AnnotatedChromaSegment list
    """
    for k in self.kinds:
        chroma = self.preprocess(segments.chromas[segments.kinds == k])
        self.alphas[k] = dirichlet.mle(chroma)
Code Example #4
File: chroma.py Project: agangzz/dl4mir
def train_dirichlet(qual_obs):
    alphas = np.array([dirichlet.mle(obs) for obs in qual_obs])
    alphas = np.concatenate([circshift(alphas, 0, r)
                             for r in range(12)], axis=0)
    logl = [np.array([loglikelihood(obs, a) for a in alphas]).T
            for obs in qual_obs]
    y_pred = np.concatenate([logl[n].argmax(axis=1) for n in range(13)])
    y_true = np.concatenate([np.array([n]*len(logl[n])) for n in range(13)])
    print(score(y_true, y_pred))
Code Example #5
def randomize_dirichlet(X):
    # additive smoothing to avoid numerical problems
    M = X.shape[1]
    # eps = np.spacing(1)
    eps = 0.01 / M
    X = (X + eps) / (1 + M * eps)
    alpha = dirichlet.mle(X)
    N = X.shape[0]
    X = np.random.dirichlet(alpha, size=N)
    return X
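
The additive smoothing above matters because dirichlet.mle works with logarithms of the data, so an exact zero breaks the fit. A small sketch of the failure mode and the fix, with synthetic data:

import numpy as np
import dirichlet

np.random.seed(0)
X = np.random.dirichlet([2.0, 3.0, 4.0], 200)
X[0] = [0.0, 0.5, 0.5]                 # one row with an exact zero: log(0) = -inf
M = X.shape[1]
eps = 0.01 / M
X_smooth = (X + eps) / (1 + M * eps)   # rows still sum to 1 after smoothing
print(dirichlet.mle(X_smooth))         # converges; on raw X the zero breaks the fit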
Code Example #6
def train_dirichlet(qual_obs):
    alphas = np.array([dirichlet.mle(obs) for obs in qual_obs])
    alphas = np.concatenate([circshift(alphas, 0, r) for r in range(12)],
                            axis=0)
    logl = [
        np.array([loglikelihood(obs, a) for a in alphas]).T for obs in qual_obs
    ]
    y_pred = np.concatenate([logl[n].argmax(axis=1) for n in range(13)])
    y_true = np.concatenate([np.array([n] * len(logl[n])) for n in range(13)])
    print(score(y_true, y_pred))
Code Example #7
File: tune.py Project: nirg/mods_usr_eng
def _update_psi(self):
    new_psi = np.zeros((self.n_topics, self.n_eng_modes), dtype=np.float64)
    for k in range(self.n_topics):
        if np.sum(self.ndz_[:, k]) > 0.0:
            new_psi[k, :] = dir.mle(self.Y, weights=self.ndz_[:, k])
        else:
            new_psi[k, :] = self.eta
    self._psi_coefs = gamma(np.sum(new_psi, 1)) / np.prod(gamma(new_psi), 1)
    self.psi_ = new_psi
Code Example #8
File: eToT.py Project: JasonLC506/LDA_CF
def _etaUpdate(self, dataE):
    """
    Standard MLE estimation of eta for the Dirichlet distribution.
    The observations are the smoothed dataE rows for each word,
    replicated by their word-level topic counts.
    """
    dataE_smoothed = probNormalize(dataE + SMOOTH_FACTOR)
    eta_est = np.zeros([self.K, self.E])
    for k in range(self.K):
        obs = np.repeat(dataE_smoothed, self.TI[:, k].tolist(), axis=0)
        eta_est[k] = dirichlet.mle(obs)
    return eta_est
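
The np.repeat call above replicates each observation by its integer topic count, which is a way to feed weighted observations to dirichlet.mle. A toy sketch of the same trick with made-up counts:

import numpy as np
import dirichlet

np.random.seed(0)
obs = np.random.dirichlet([3.0, 2.0, 1.0], 50)   # 50 smoothed observations
counts = np.random.randint(0, 5, size=50)        # hypothetical per-row weights
weighted = np.repeat(obs, counts, axis=0)        # each row duplicated count times
print(dirichlet.mle(weighted))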
Code Example #9
File: models.py Project: seffka/pychord_tools
    def fit(self, segments):
        """
        Fits the model to the given chroma segments.
        :param segments: AnnotatedChromaSegment list
        """
        in_chroma_sums = dict()

        for k in self.kinds:
            chroma = self.preprocess(segments.chromas[segments.kinds == k])
            partition = [self.inDegreeDict[k], self.outDegreeDict[k]]
            in_chroma_sums[k] = amalgamate(partition, chroma).transpose()[0]
            in_chroma_composition = subcomposition(
                [[e] for e in self.inDegreeDict[k]], chroma).astype('float64')
            self.dirichlets[k] = dirichlet.mle(in_chroma_composition)
            out_chroma_composition = subcomposition(
                [[e] for e in self.outDegreeDict[k]], chroma).astype('float64')
            self.residualDirichletAlphas[k] = dirichlet.mle(
                out_chroma_composition)

        all_chords = np.concatenate(list(in_chroma_sums.values()))
        self.betaParams = beta.fit(all_chords, floc=0, fscale=1)
Code Example #10
def _etaUpdate(self):
    """
    Standard MLE estimation of eta for the Dirichlet distribution.
    The observations are the smoothed dataE rows for each word,
    replicated by their word-level topic counts.
    """
    dataE_smoothed = np.zeros([self.D, self.E])
    for d in self.dataE_smoothed:
        dataE_smoothed[d] = self.dataE_smoothed[d]
    eta_est = np.zeros([self.K, self.E])
    for k in range(self.K):
        obs = np.repeat(dataE_smoothed, self.TI[:, k].tolist(), axis=0)
        eta_est[k] = dirichlet.mle(obs)
    return eta_est
Code Example #11
def estimate_dirichlet_par(x_train):
    # input to dirichlet.mle is an N*K numpy array:
    # N = train_samples, K = dimension_of_each_input
    par = {}
    alpha = 0.001
    for i in x_train:
        x_train[i] = x_train[i] * 1.0
        x_train[i] += alpha                # additive smoothing: no exact zeros
        # L1-normalise so each row sums to 1 (the original used
        # np.linalg.norm, i.e. the L2 norm, which does not)
        x = np.sum(x_train[i], axis=1, keepdims=True)
        x_train[i] = x_train[i] / x

        par[i] = dirichlet.mle(x_train[i])
    return par
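
A toy invocation of estimate_dirichlet_par; the dict keys and count matrices are hypothetical:

import numpy as np

np.random.seed(0)
# per-class count matrices: 200 samples, 4 categories each
x_train = {'class_a': np.random.randint(0, 10, size=(200, 4)),
           'class_b': np.random.randint(0, 10, size=(200, 4))}
par = estimate_dirichlet_par(x_train)
print(par['class_a'])   # fitted alpha vector for that class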
Code Example #12
def getHydrochemLL():
    dsname, data, features = getHydrochem()

    print(data)
    print(data.shape)

    featureTypes = ["continuous"] * data.shape[1]

    domains = [[0, 1]] * data.shape[1]

    print(domains)
    families = ['piecewise'] * data.shape[1]

    #families = ['histogram'] * data.shape[1]
    #@memory.cache
    def learn(data, families, mininst, alpha, th):
        spn = SPN.LearnStructure(
            data,
            featureTypes=featureTypes,
            row_split_method=Splitting.Gower(),
            col_split_method=Splitting.RDCTest(threshold=th),
            domains=domains,
            alpha=alpha,
            families=families,
            # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
            # domains, families=families, row_split_method=Splitting.KmeansRows(),
            # col_split_method=Splitting.RDCTest(),
            min_instances_slice=mininst)
        return spn

    stats = Stats(name=dsname)

    alll = []
    for train, test, i in kfolded(data, 5):
        dirichlet_alphas = dirichlet.mle(train,
                                         method='meanprecision',
                                         maxiter=100000)
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test),
                                          alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))

        spn = learn(train, families, 10, 0.1, 0.1)
        ll = spn.root.eval(test)
        alll.append(numpy.mean(ll))
        stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))

    print(numpy.mean(alll))
    stats.save("results/hydrochems/" + dsname + ".json")
Code Example #13
def dirichlet_fit(sampled_probas, method='fixedpoint'):
    """
    Input:
        sampled_probas: tensor of shape (S #samples, B #examples, D #class_dim)
    Output:
        das: matrix of shape (B #examples, D #class_dim)
            where each row holds the alphas of a Dirichlet distribution
            fitted to the sampled_probas of that example
    """
    das = np.zeros((sampled_probas.shape[1], sampled_probas.shape[2]))
    for example_idx in range(sampled_probas.shape[1]):
        curr_samples = sampled_probas[:, example_idx]
        # pass method by keyword: the second positional argument of
        # dirichlet.mle is tol, not method
        alphas = dirichlet.mle(curr_samples, method=method)
        das[example_idx] = alphas
    return das
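
A shape-checking sketch for dirichlet_fit with synthetic probability samples (numpy and the dirichlet package are assumed imported as in the snippet's module):

import numpy as np

np.random.seed(0)
# 30 sampled probability vectors for each of 8 examples over 5 classes
sampled_probas = np.random.dirichlet([2.0] * 5, size=(30, 8))
das = dirichlet_fit(sampled_probas)
print(das.shape)   # (8, 5)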
Code Example #14
def dir_ave_MLE(n):
    # Compute the empirical average of the Dirichlet and CC MLEs
    # for Dirichlet-generated data
    dir_means = []
    CC_means = []
    for i in range(trials):
        dat = np.random.dirichlet(alpha_true, n)

        # the CC MLE is just the empirical mean
        CC_means.append(dat.mean(axis=0))
        try:
            alpha_hat = dirichlet.mle(dat)
            mean = alpha_hat / sum(alpha_hat)
            dir_means.append(mean)
        except Exception:
            print("WARNING: failed to converge")
    CC_means = np.array(CC_means)
    dir_means = np.array(dir_means)
    CC_mean = CC_means.mean(axis=0)
    dir_mean = dir_means.mean(axis=0)
    return CC_mean, dir_mean
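
dir_ave_MLE (and CC_ave_MLE below) read module-level globals such as trials and alpha_true that the snippets do not show. A hedged driver sketch with assumed values:

import numpy as np

trials = 50                               # assumption: number of repetitions
alpha_true = np.array([2.0, 5.0, 3.0])    # assumption: generating parameters
CC_mean, dir_mean = dir_ave_MLE(200)
# both estimates should approach the true mean alpha_true / sum(alpha_true)
print(CC_mean)
print(dir_mean)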
Code Example #15
def CC_ave_MLE(n):
    # Compute the empirical average of the Dirichlet and CC MLEs
    # for CC generated data
    dir_means = []
    CC_means = []
    lam = lam_true.repeat(n).reshape(K, n).transpose()

    for i in range(trials):
        dat = sample_mcb_naive_ordered(lam=lam)

        # the CC MLE is just the empirical mean
        CC_means.append(dat.mean(axis=0))
        try:
            alpha_hat = dirichlet.mle(dat)
            mean = alpha_hat / sum(alpha_hat)
            dir_means.append(mean)
        except Exception:
            print("WARNING: failed to converge")
    CC_means = np.array(CC_means)
    dir_means = np.array(dir_means)
    CC_mean = CC_means.mean(axis=0)
    dir_mean = dir_means.mean(axis=0)
    return CC_mean, dir_mean
Code Example #16
def estimateParameter(self):
    try:
        self.alpha = diri.mle(self.particles)
    except Exception:
        pass  # keep the previous alpha if the MLE fails to converge
    return self.alpha
Code Example #17
def randomize_dirichlet(X):
    alpha = dirichlet.mle(X)
    N = X.shape[0]
    X = np.random.dirichlet(alpha, size=N)
    return X
Code Example #18

print(y_train)

# t is assumed to be initialised (t = []) earlier in the truncated script;
# collect the L1-normalised rows of X_train whose label is 0
for sentence in range(len(X_train)):
  if y_train[sentence] == 0:
    total = 0
    for word in range(len(X_train[sentence])):
      total = total + X_train[sentence][word]
    t.append(X_train[sentence] / total)

# print(len(t))

b=numpy.asarray(t)
# print(b.shape)
a0=dirichlet.mle(b)
print(a0)

v=[]


# print(X_train)

# collect the L1-normalised rows of X_train whose label is 1
for sentence in range(len(X_train)):
  if y_train[sentence] == 1:
    total = 0
    for word in range(len(X_train[sentence])):
      total = total + X_train[sentence][word]
    v.append(X_train[sentence] / total)
# print(len(t))
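
Presumably the truncated script continues symmetrically for class 1; a hedged completion mirroring the class-0 fit above:

b1 = numpy.asarray(v)
a1 = dirichlet.mle(b1)
print(a1)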
Code Example #19
def test_mle(self, method):
    a0_fit = dirichlet.mle(self.D0, method=method)
    logl0_fit = dirichlet.loglikelihood(self.D0, a0_fit)
    assert (norm(self.a0 - a0_fit) / norm(self.a0) < 0.1)
    assert (abs((logl0_fit - self.logl0) / logl0_fit) < 0.01)
Code Example #20
def getAirPollution(dimensions):
    dsname, data, features = getAirQualityUCITimeless()
    
    idxmissing = data == -200
    
    data = data[:, numpy.sum(idxmissing,0) < 2000]
    idxmissing = data == -200
    data = data[numpy.sum(idxmissing,1) == 0, :]
    idxmissing = data == -200
    print(data.shape)
    
    _, mixt = getArchetypes(data, dimensions)
    
    if mixt is None:
        print( "no archetypes", dimensions)
        #0/0
        return
    
    def normalize(data):
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd+0.0
    
    mixt = normalize(mixt)
    print(data.shape)
    featureTypes = ["continuous"] * mixt.shape[1]
    
    domains = [[0,1]] * mixt.shape[1]
    
    print(domains)
        
    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=0.1,
                                 families = ['histogram'] * data.shape[1],
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 #min_instances_slice=int(data.shape[0]*0.01))
                                 min_instances_slice=200)
        return spn
    
    
    stats = Stats(name=dsname)
    
    for train, test, i in kfolded(mixt, 10):
        print(i)
        #dirichlet_alphas = getDirichlet(train)
        dirichlet_alphas = dirichlet.mle(train, method='meanprecision', maxiter=1000000)
        print("dirichlet done")
        ll = scipy.stats.dirichlet.logpdf(numpy.transpose(test), alpha=dirichlet_alphas)
        stats.add("DIRICHLET", Stats.LOG_LIKELIHOOD, numpy.sum(ll))
        stats.add("DIRICHLET", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll))
        
        spn = learn(train)
        print("spn done")
        ll = spn.root.eval(test)
        print(stats.add("SPN", Stats.LOG_LIKELIHOOD, numpy.sum(ll)))
        print(stats.add("SPN", Stats.MEAN_LOG_LIKELIHOOD, numpy.mean(ll)))

     
    stats.save("results/airpollution/"+ dsname + "-" + str(dimensions) + ".json")   
Code Example #21
def test6(data):
    print(data.shape)
    _, mixt = getArchetypes(data, 3)
    
    def normalize(data):
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd
    
    mixt = normalize(mixt)
    
    dirichlet_alphas = dirichlet.mle(mixt, method='meanprecision', maxiter=100000)
    
    featureTypes = ["continuous"] * mixt.shape[1]
    
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r, binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])

        domains.append(domain)
    print(domains)

    @memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=1,
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=50)
        return spn
    
    spn = learn(mixt)
    print(spn)
    
    spn_samples = numpy.zeros((data.shape[0], 3)) / 0  # 0/0 yields NaN: marks every entry as "to be sampled"
    a,spn_samples = spn.root.sample(spn_samples)
    
    spn_samples = normalize(spn_samples)
    
    
    
    #dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train), alpha=dirichlet_alphas)
    def plotDirichlet(data):
        data = data.reshape(-1, mixt.shape[1])
        result = scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)), alpha=dirichlet_alphas)
        return result
    
    def spnpdf(data):
        data = data.reshape(-1, mixt.shape[1])
        res = spn.root.eval(normalize(data))[0]
        return res
    
    xy_all = cartesian(mixt)
    
    
    filename = 'plots/dirichlet_mle.pdf'
    try:
        import os
        os.remove(filename)
    except OSError:
        pass
    pp = PdfPages(filename)
    
    # all
    fig = plt.figure()
    draw_pdf_contours_func(plotDirichlet)
    plt.title("dirichlet trained on all, original points")
    plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    
    numpy.random.seed(17)
    mixt_samples = numpy.random.dirichlet(dirichlet_alphas, data.shape[0])
    print(dirichlet_alphas)
    xy_samples = cartesian(mixt_samples)
    
    
    fig = plt.figure()
    draw_pdf_contours_func(plotDirichlet)
    plt.title("dirichlet trained on all, sampled points")
    plt.plot(xy_samples[:, 0], xy_samples[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    
    xy_spn_samples = cartesian(spn_samples)
    fig = plt.figure()
    draw_pdf_contours_func(spnpdf)
    plt.title("spn trained on all, original points")
    plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    
    
    xy_spn_samples = cartesian(spn_samples)
    fig = plt.figure()
    draw_pdf_contours_func(spnpdf)
    plt.title("spn trained on all, sampled points")
    plt.plot(xy_spn_samples[:, 0], xy_spn_samples[:, 1], 'ro', markersize=markersize)
    plt.colorbar()
    pp.savefig(fig)
    
    
    
    pp.close()
Code Example #22
def computeSimplexExperiment(dsname, data, dimensions, mixttype, min_instances_slice=700):
    if mixttype == "Archetype":
        _, mixt = getArchetypes(data, dimensions)
        if mixt is None:
            return ()
    elif mixttype == "LDA":
        lda = LatentDirichletAllocation(n_topics=dimensions, max_iter=50,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
    
        lda.fit(data)
        mixt = lda.transform(data)
    elif mixttype == "RandomSample":
        mixt = numpy.random.dirichlet((1,1,1), 20).transpose()
        print(mixt)
        0 / 0  # deliberate crash: the RandomSample path is debug-only
        
    print(mixt.shape)
    
    def normalize(data):
        mixtd = data
        mixtd[mixtd == 1] = mixtd[mixtd == 1] - 0.0000001
        mixtd[mixtd == 0] = mixtd[mixtd == 0] + 0.0000001
        mixtnorm = numpy.sum(mixtd, axis=1)
        mixtd = numpy.divide(mixtd, mixtnorm[:, None])
        return mixtd+0.0
    
    mixt = normalize(mixt)
    mixt_train, mixt_test = train_test_split(mixt, test_size=0.30, random_state=42)


    numpy.savetxt("mixt_train.csv", mixt_train)
    numpy.savetxt("mixt_test.csv", mixt_test)
    #0/0

    featureTypes = ["continuous"] * mixt.shape[1]
    
    domains = []
    for i, ft in enumerate(featureTypes):
        if ft == "continuous":
            r = (0.0, 1.0)
            fd = estimate_continuous_domain(mixt, i, range=r, binning_method=400)
            domain = numpy.array(sorted(fd.keys()))
        else:
            domain = numpy.unique(data[:, i])

        domains.append(domain)
    
    dirichlet_alphas = dirichlet.mle(mixt_train, method='meanprecision', maxiter=100000)

    #@memory.cache
    def learn(data):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.3),
                                 domains=domains,
                                 alpha=0.1,
                                 families = ['histogram'] * data.shape[1],
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=min_instances_slice)
        return spn
    #for the good pdf it was 700
    
    
    spn = learn(mixt_train)
    print(spn)
    def spnpdf(data):
        data = data.reshape(-1, mixt.shape[1])
        res = spn.root.eval(normalize(data))[0]
        return res
    
    print(dirichlet_alphas)
    
    def plotDirichlet(data):
        data = data.reshape(-1, mixt.shape[1])
        try:
            result = scipy.stats.dirichlet.logpdf(numpy.transpose(normalize(data)), alpha=dirichlet_alphas)
        except Exception:
            print(normalize(data))
            print(normalize(data)*1.0)
            print(normalize(data)+1)
            print(normalize(data)+0)
            0/0
        return result
    
    df_train = pandas.DataFrame()
    df_test = pandas.DataFrame()
    
    dtrain_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_train), alpha=dirichlet_alphas)
    dtest_fit = scipy.stats.dirichlet.logpdf(numpy.transpose(mixt_test), alpha=dirichlet_alphas)
    df_train["dirichlet_train"] = dtrain_fit
    df_test["dirichlet_test"] = dtest_fit
    
    spn_train_fit = spn.root.eval(mixt_train)
    spn_test_fit = spn.root.eval(mixt_test)
    df_train["spn_train"] = spn_train_fit
    df_test["spn_test"] = spn_test_fit
    

    
    if dimensions == 3:
        xy_train = cartesian(mixt_train)
        xy_test = cartesian(mixt_test)
        
        filename = 'plots/%s_%s.pdf' % (dsname, mixttype)
        try:
            import os
            os.remove(filename)
        except OSError:
            pass
        pp = PdfPages(filename)
        
        markersize = 1.0
        # all
#         fig = plt.figure()
#         plt.title("dirichlet, original points")
#         draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
#         #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
#         plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
#         plt.colorbar()
#         pp.savefig(fig)
        # train
        fig = plt.figure()
        plt.title("Dirichlet, train points")
        draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
        plt.plot(xy_train[:, 0], xy_train[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        
        # test
        fig = plt.figure()
        plt.title("Dirichlet, test points")
        draw_pdf_contours_func2(plotDirichlet, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],plotDirichlet)
        plt.plot(xy_test[:, 0], xy_test[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
    
        # all
#         fig = plt.figure()
#         plt.title("spn, original points")
#         draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
#         #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
# 
#         plt.plot(xy_all[:, 0], xy_all[:, 1], 'ro', markersize=markersize)
#         plt.colorbar()
#         pp.savefig(fig)
        
        # train
        fig = plt.figure()
        plt.title("SPN, train points")
        draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
        plt.plot(xy_train[:, 0], xy_train[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        
        # test
        fig = plt.figure()
        plt.title("SPN, test points")
        draw_pdf_contours_func2(spnpdf, vmin=-2, vmax=12)
        #draw_pdf_contours_func2(xy_all[:, 0], xy_all[:, 1],spnpdf)
        plt.plot(xy_test[:, 0], xy_test[:, 1], 'ro', markersize=markersize)
        #plt.colorbar()
        fig.tight_layout()
        pp.savefig(fig)
        pp.close()
    
    return ("name", dsname, "size", data.shape, "type", mixttype, "dims", dimensions,
            "spn_train_LL", numpy.mean(spn_train_fit), "dir_train_LL", numpy.mean(dtrain_fit),
            "spn_test_LL", numpy.mean(spn_test_fit), "dir_test_LL", numpy.mean(dtest_fit) ,
            "spn_#_sum_nodes", spn.n_sum_nodes(), "spn_#_prod_nodes", spn.n_prod_nodes(), "spn_#_layers", spn.n_layers()
            )
Code Example #23
File: numpy_helper.py Project: PeterZs/latenttrees
def randomize_dirichlet(X):
    alpha = dirichlet.mle(X)
    N = X.shape[0]
    X = np.random.dirichlet(alpha, size=N)
    return X
Code Example #24
def Dirich(vocab_length):
    #%% IMPORTING LIBRARIES AND FUNCTIONS
    import numpy as np
    import pandas as pd
    # import os
    import time
    # import nltk
    # import operator
    # from nltk.corpus import stopwords
    # from sklearn import model_selection
    import scipy.io
    from scipy.stats import dirichlet
    # from sklearn.naive_bayes import MultinomialNB
    from Feature_Extraction import Feature_Extractor
    from dirichlet import mle
    from dirichlet import loglikelihood
    fast_run = 0
    #%%  EXTRACTING FEATURES FROM DATA
    if fast_run == 0:
        Folder_Name = './20_news_small/'
        # Folder_Name='./20_newsgroups/'
        [x_train, x_test, y_train, y_test, x, y, train_score,
         test_score] = Feature_Extractor(Folder_Name, vocab_length)

        #%% SAVING THE VARIABLES AND IMPORTING THEM TO AVOID LONG EXTRACTION TIMES
        scipy.io.savemat('Extracted_Features/file_feat_x_train.mat',
                         mdict={'x_train': (x_train)})
        scipy.io.savemat('Extracted_Features/file_feat_lab_y_train.mat',
                         mdict={'y_train': (y_train)})
        scipy.io.savemat('Extracted_Features/file_feat_x_test.mat',
                         mdict={'x_test': (x_test)})
        scipy.io.savemat('Extracted_Features/file_feat_lab_y_test.mat',
                         mdict={'y_test': (y_test)})
        scipy.io.savemat('Extracted_Features/file_feat_lab_train_score.mat',
                         mdict={'train_score': (train_score)})
        scipy.io.savemat('Extracted_Features/file_feat_lab_test_score.mat',
                         mdict={'test_score': (test_score)})
        #%% LOADING .mat FILES
    temp1 = scipy.io.loadmat('Extracted_Features/file_feat_x_test.mat')
    temp2 = scipy.io.loadmat('Extracted_Features/file_feat_lab_y_test.mat')
    temp3 = scipy.io.loadmat('Extracted_Features/file_feat_x_train.mat')
    temp4 = scipy.io.loadmat('Extracted_Features/file_feat_lab_y_train.mat')
    temp5 = scipy.io.loadmat(
        'Extracted_Features/file_feat_lab_train_score.mat')
    temp6 = scipy.io.loadmat('Extracted_Features/file_feat_lab_test_score.mat')

    x_test = temp1["x_test"]
    y_test = temp2["y_test"]
    x_train = temp3["x_train"]
    y_train = temp4["y_train"]
    train_score = temp5["train_score"]
    test_score = temp6["test_score"]
    del temp1, temp2, temp3, temp4, temp5, temp6

    #%% PARAMETERS FOR LOOPING LATER
    [nmbr_of_files, vocab_length] = np.shape(x_train)
    unique_classes = np.unique(y_train)
    nmbr_of_classes = len(unique_classes)

    #%% NORMALISING TRAINING DATA
    normalising_factor = np.sum(x_train,
                                axis=1)  # count of all words in each document
    eta = 1.1625
    x_train = x_train + eta
    x_train = x_train / (normalising_factor[:, None] + eta * vocab_length)

    #%% COMPUTING OPTIMAL ALPHAS FOR EACH CLASS (MLE)
    alpha = np.zeros((nmbr_of_classes, vocab_length))
    for i in range(0, nmbr_of_classes):
        alpha[i][:] = mle(x_train[y_train[0][:] == i][:],
                          tol=1e-7,
                          method='meanprecision',
                          maxiter=100000)
    #initialising uniform alphas

    #%% RUNNING CLASSIFIER
    #NORMALISING THE INPUT TEST SAMPLES.
    [nmbr_of_files, vocab_length] = np.shape(x_test)
    sample_normalising_factor = np.sum(x_test, axis=1)
    #LAPLACE SMOOTHING x_test
    x_test = x_test + eta
    x_test = x_test / (sample_normalising_factor[:, None] + eta * vocab_length)
    #TESTING

    y_pred = np.zeros((1, nmbr_of_files))
    likelihoods = np.zeros((nmbr_of_classes, 1))
    for i in range(0, nmbr_of_files):
        test_sample = x_test[i][:]
        for j in range(0, nmbr_of_classes):
            likelihoods[j][:] = loglikelihood(
                x_test[i][:], alpha[j][:])  # log-likelihood under class j's alpha
        y_pred[0][i] = np.argmax(likelihoods)

    # np.random.dirichlet(alpha)
    # [nmbr_of_files,vocab_length]=np.shape(x_test)
    # multinom_matrix=np.zeros((nmbr_of_classes,vocab_length)) #Matrix holding the probability of each class raised to the power of frequency.
    # likelihood=np.zeros((nmbr_of_classes,1))
    # y_pred=np.zeros((1,nmbr_of_files))
    print("Classifying testing samples \n")

    #%% TESTING RESULTS

    diffrnce = y_pred - y_test
    diffrnce[diffrnce != 0] = 1

    incorrect = sum(diffrnce[0][:])
    accuracy = (1 - (incorrect / nmbr_of_files)) * 100

    # print("Classifier accuracy is: ",accuracy,"%\n")
    # train_score=train_score*100
    # test_score=test_score*100

    # print("In-built function gives: ",train_score[0][0], "% accuracy on training set and", test_score[0][0],"% accuracy on testing set")
    return accuracy, test_score, y_pred, y_test
Code Example #25
File: test_dirichlet.py Project: ericsuh/dirichlet
def test_mle(self, method):
    a0_fit = dirichlet.mle(self.D0, method=method)
    logl0_fit = dirichlet.loglikelihood(self.D0, a0_fit)
    assert (norm(self.a0 - a0_fit) / norm(self.a0) < 0.1)
    assert (abs((logl0_fit - self.logl0) / logl0_fit) < 0.01)
Code Example #26
import numpy as np
import pandas as pd
import dirichlet
import matplotlib.pyplot as plt



bodyshop = pd.read_csv("data/autoshop_ratings.csv", header=0)
bodyshop['total'] = bodyshop.iloc[:, 1:5].sum(axis=1)
bodyshop = bodyshop.loc[bodyshop['total'] > 0]
bodyshop['metric'] = (bodyshop['five'] * 5 + bodyshop['four'] * 4
                      + bodyshop['three'] * 3 + bodyshop['two'] * 2
                      + bodyshop['one']) / bodyshop['total']

bodyshop = bodyshop.sort_values(by='total', ascending=False)

K = 5
ITERATION = 50
a0 = np.array([100, 299, 100])
D0 = np.random.dirichlet(a0, 1000)
a0_fit = dirichlet.mle(D0)
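
With 1000 samples drawn from a0, the fitted vector should land close to the generating parameters; a short sanity check (sketch):

print(a0)                      # [100 299 100]
print(a0_fit)                  # expected to be close to a0
print(a0_fit / a0_fit.sum())   # fitted mean of the distribution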