Beispiel #1
0
    def _fit_dpgmm(self, x):
        # clustering
        k = max(self.crange)
        for r in xrange(self.repeats):
            # info
            if self.debug is True:
                print '\t[%s][c:%d][r:%d]' % (self.clus_type, k, r + 1),

            # fit and evaluate model
            model_kwargs = {}
            if 'alpha' in self.clus_kwargs:
                model_kwargs.update(alpha=self.clus_kwargs['alpha'])
            if 'conv_thresh' in self.clus_kwargs:
                model_kwargs.update(thresh=self.clus_kwargs['conv_thresh'])
            if 'max_iter' in self.clus_kwargs:
                model_kwargs.update(n_iter=self.clus_kwargs['max_iter'])

            model = DPGMM(n_components=k, covariance_type=self.cvtype,
                          **model_kwargs)
            model.fit(x)
            self._labels[r] = model.predict(x)
            self._parameters[r] = model.means_
            self._ll[r] = model.score(x).sum()

            # evaluate goodness of fit for this run
            #self._gof[r] = self.gof(x, self._ll[r], k)
            if self.gof_type == 'aic':
                self._gof[r] = model.aic(x)
            if self.gof_type == 'bic':
                self._gof[r] = model.bic(x)

            # debug
            if self.debug is True:
                print self._gof[r], model.n_components, model.weights_.shape[0]
Beispiel #2
0
    def _fit_dpgmm(self, x):
        # clustering
        k = max(self.crange)
        for r in xrange(self.repeats):
            # info
            if self.debug is True:
                print '\t[%s][c:%d][r:%d]' % (self.clus_type, k, r + 1),

            # fit and evaluate model
            model_kwargs = {}
            if 'alpha' in self.clus_kwargs:
                model_kwargs.update(alpha=self.clus_kwargs['alpha'])
            if 'conv_thresh' in self.clus_kwargs:
                model_kwargs.update(thresh=self.clus_kwargs['conv_thresh'])
            if 'max_iter' in self.clus_kwargs:
                model_kwargs.update(n_iter=self.clus_kwargs['max_iter'])

            model = DPGMM(n_components=k,
                          covariance_type=self.cvtype,
                          **model_kwargs)
            model.fit(x)
            self._labels[r] = model.predict(x)
            self._parameters[r] = model.means_
            self._ll[r] = model.score(x).sum()

            # evaluate goodness of fit for this run
            #self._gof[r] = self.gof(x, self._ll[r], k)
            if self.gof_type == 'aic':
                self._gof[r] = model.aic(x)
            if self.gof_type == 'bic':
                self._gof[r] = model.bic(x)

            # debug
            if self.debug is True:
                print self._gof[r], model.n_components, model.weights_.shape[0]
def main():
    if len(sys.argv) != 4:
        print(__doc__)
        return 1

    infile = sys.argv[1]
    N = int(sys.argv[2])    
    num_random = int(sys.argv[3])

    print("Reading in", infile)
    fullarr = np.loadtxt(fileinput.input(infile), delimiter = '\t')[:,:-7]

    stds = np.apply_along_axis(np.std, 0, fullarr)[:,np.newaxis].T
    means = np.apply_along_axis(np.mean, 0, fullarr)[:,np.newaxis].T
    stds[stds == 0] = 1.0

    num_lines = num_random
    fullarr = fullarr[np.random.choice(fullarr.shape[0], num_lines, replace=True),:]

    fullarr = (fullarr - means) / stds

    output = ''

    print("Parameter searching...")
    igmm = None
    best_score = -100000
    best_alpha = -1
    best_model = None
    for alpha in [0.01,0.1,1,10]: 
        print("Learning infinite GMM with N={}, alpha={}".format(N, alpha))
        output += "Learning infinite GMM with N={}, alpha={}\n".format(N, alpha)
        igmm = DPGMM(covariance_type='diag', n_components=N, alpha=alpha, init_params='wmc')
        igmm.fit(fullarr)
        score = igmm.score(fullarr)
        score = sum(score)/len(score)
        print('{}: {} with {} clusters'.format(alpha, score, igmm.n_components))
        output += '{}: {} with {} clusters\n'.format(alpha, score, igmm.n_components)

        if score > best_score:
            best_score = score
            best_alpha = alpha
            best_model = igmm

    print('Best alpha={}, score={}'.format(best_alpha, best_score))
    output += 'Best alpha={}, score={}\n'.format(best_alpha, best_score)
    with open('parameter_search_results.txt', 'a+') as outf:
        outf.write(output)
    
    return 0
Beispiel #4
0
def get_best_dpgmm(X, num_c, cv_type, alpha, iters, n_init, rand_state=None):
    best_bic = np.inf
    bic_dpgmm = None
    lbl_vec_dpgmm = np.zeros(X.shape[0])
    prob_vec_dpgmm = np.zeros(X.shape[0])
    log_prob_dpgmm = None
    for i in xrange(n_init):
        dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type, \
                        alpha=alpha, random_state=rand_state)
        dpgmm.fit(X)
        b = dpgmm.bic(X)
        if b < best_bic:
            bic_dpgmm = b
            lbl_vec = dpgmm.predict(X)
            prob_vec = dpgmm.predict_proba(X)
            log_prob_dpgmm = np.sum(dpgmm.score(X))
    return [lbl_vec, prob_vec, bic_dpgmm, log_prob_dpgmm]
def main():
    if len(sys.argv) != 5:
        print(__doc__)
        return 1

    infiles = glob(sys.argv[1])
    outfile = sys.argv[2]
    N = int(sys.argv[3])
    alpha = float(sys.argv[4])

    print("Reading in", len(infiles), "files")
    fullarr = np.loadtxt(fileinput.input(infiles), delimiter = '\t')[:,:-7]


    stds = np.apply_along_axis(np.std, 0, fullarr)[:,np.newaxis].T
    means = np.apply_along_axis(np.mean, 0, fullarr)[:,np.newaxis].T
    stds[stds == 0] = 1.0

    num_lines = 10000
    fullarr = fullarr[np.random.choice(fullarr.shape[0], num_lines, replace=True),:]

    fullarr = (fullarr - means) / stds


    print("Learning infinite GMM with N={}, alpha={}".format(N, alpha))

    igmm = DPGMM(covariance_type='diag', n_components=N, alpha=alpha, init_params='wmc')
    igmm.fit(fullarr)

    print("Infinite GMM trained, saving")

    with open(outfile + '_' + num_lines, 'wb') as out_model:
        pickle.dump(igmm, out_model)

    print("Score:", igmm.score(fullarr))
    print("Num Components:", igmm.n_components)
    
    return 0