コード例 #1
0
    def test_train(self, params='wm'):
        """Check that repeated single-iteration fits do not degrade the score.

        A reference mixture is configured from the fixture weights and means
        and used to sample a training set. A fresh model is then fitted one
        iteration at a time, and the smallest change between consecutive
        values of ``self.score`` must stay above ``self.threshold``.

        ``params`` is forwarded as ``init_params`` for the first fit and as
        the ``params`` attribute for every later fit.
        """
        # Reference model: parameters are installed by hand, not learned.
        generator = bernoullimm.BernoulliMM(n_components=self.n_components)
        generator.weights_ = self.weights
        generator.means_ = self.means
        generator.log_odds_, generator.log_inv_mean_sums_ = (
            bernoullimm._compute_log_odds_inv_means_sums(self.means))

        # Create a training set by sampling from the predefined distribution.
        X = generator.sample(n_samples=100)

        # A separate name for the learner keeps the two roles apart (the
        # original rebound a single variable for both).
        learner = self.model(n_components=self.n_components,
                             random_state=rng,
                             n_iter=1,
                             init_params=params)
        learner.fit(X)

        # Do one training iteration at a time so we can keep track of the
        # score and make sure it improves after each iteration. Clearing
        # init_params stops each fit from re-initialising the parameters.
        trainll = []
        for _ in range(5):
            learner.params = params
            learner.init_params = ''
            learner.fit(X)
            trainll.append(self.score(learner, X))

        learner.n_iter = 10
        learner.init_params = ''
        learner.params = params
        learner.fit(X)  # finish fitting

        delta_min = np.diff(trainll).min()
        self.assertTrue(
            delta_min > self.threshold,
            "The min nll increase is %f which is lower than the admissible"
            " threshold of %f. The likelihoods are %s."
            % (delta_min, self.threshold, trainll))
コード例 #2
0
    def test_train(self, params='wm'):
        """Check that repeated single-iteration fits do not degrade the score.

        A reference mixture is configured from the fixture weights and means
        and used to sample a training set. A fresh model is then fitted one
        iteration at a time, and the smallest change between consecutive
        values of ``self.score`` must stay above ``self.threshold``.

        ``params`` is forwarded as ``init_params`` for the first fit and as
        the ``params`` attribute for every later fit.
        """
        # Reference model: parameters are installed by hand, not learned.
        generator = bernoullimm.BernoulliMM(n_components=self.n_components)
        generator.weights_ = self.weights
        generator.means_ = self.means
        generator.log_odds_, generator.log_inv_mean_sums_ = (
            bernoullimm._compute_log_odds_inv_means_sums(self.means))

        # Create a training set by sampling from the predefined distribution.
        X = generator.sample(n_samples=100)

        # A separate name for the learner keeps the two roles apart (the
        # original rebound a single variable for both).
        learner = self.model(n_components=self.n_components,
                             random_state=rng,
                             n_iter=1,
                             init_params=params)
        learner.fit(X)

        # Do one training iteration at a time so we can keep track of the
        # score and make sure it improves after each iteration. Clearing
        # init_params stops each fit from re-initialising the parameters.
        trainll = []
        for _ in range(5):
            learner.params = params
            learner.init_params = ''
            learner.fit(X)
            trainll.append(self.score(learner, X))

        learner.n_iter = 10
        learner.init_params = ''
        learner.params = params
        learner.fit(X)  # finish fitting

        delta_min = np.diff(trainll).min()
        self.assertTrue(
            delta_min > self.threshold,
            "The min nll increase is %f which is lower than the admissible"
            " threshold of %f. The likelihoods are %s." %
            (delta_min, self.threshold, trainll))
コード例 #3
0
    def test_sample(self, n=100):
        """Sample ``n`` rows from a hand-configured model and check the shape.

        The model is never fitted: its means, weights and the log-odds terms
        derived from the means are assigned directly from the fixture.
        """
        mixture = self.model(n_components=self.n_components, random_state=rng)

        mixture.means_ = self.means
        derived_terms = bernoullimm._compute_log_odds_inv_means_sums(
            self.means)
        mixture.log_odds_, mixture.log_inv_mean_sums_ = derived_terms

        mixture.weights_ = self.weights

        # One row per requested sample, one column per feature.
        expected_shape = (n, self.n_features)
        drawn = mixture.sample(n)
        self.assertEqual(drawn.shape, expected_shape)
コード例 #4
0
    def test_sample(self, n=100):
        """Sample ``n`` rows from a hand-configured model and check the shape.

        The model is never fitted: its means, weights and the log-odds terms
        derived from the means are assigned directly from the fixture.
        """
        mixture = self.model(n_components=self.n_components, random_state=rng)

        mixture.means_ = self.means
        derived_terms = bernoullimm._compute_log_odds_inv_means_sums(
            self.means)
        mixture.log_odds_, mixture.log_inv_mean_sums_ = derived_terms

        mixture.weights_ = self.weights

        # One row per requested sample, one column per feature.
        expected_shape = (n, self.n_features)
        drawn = mixture.sample(n)
        self.assertEqual(drawn.shape, expected_shape)
コード例 #5
0
    def test_eval(self):
        """Check the shapes and consistency of what ``eval`` returns.

        Five observations are generated for each component. ``eval`` must
        return one value per observation plus a responsibility matrix whose
        rows sum to one and whose row-wise argmax is the generating
        component.
        """
        b = self.model(n_components=self.n_components,
                       random_state=rng)
        # Make sure the means are far apart so responsibilities.argmax()
        # picks the actual component used to generate the observations.
        b.means_ = self.means
        b.log_odds_, b.log_inv_mean_sums_ = (
            bernoullimm._compute_log_odds_inv_means_sums(self.means))
        b.weights_ = self.weights

        bernoulliidx = np.repeat(np.arange(self.n_components), 5)
        n_samples = len(bernoulliidx)
        # A uniform draw u satisfies P(u <= p) = p, so thresholding it at the
        # component means yields Bernoulli(p) features. The previous
        # standard-normal draw gave P = Phi(p) instead, which does not follow
        # the model being evaluated.
        # NOTE(review): assumes the means are probabilities in [0, 1] and
        # that rng is a numpy RandomState (or the numpy.random module).
        uniform_draws = rng.rand(n_samples, self.n_features)
        X = (uniform_draws <= b.means_[bernoulliidx]).astype(np.uint8)

        ll, responsibilities = b.eval(X)

        self.assertEqual(len(ll), n_samples)
        self.assertEqual(responsibilities.shape,
                         (n_samples, self.n_components))
        assert_array_almost_equal(responsibilities.sum(axis=1),
                                  np.ones(n_samples))
        assert_array_equal(responsibilities.argmax(axis=1), bernoulliidx)
コード例 #6
0
    def test_eval(self):
        """Check the shapes and consistency of what ``eval`` returns.

        Five observations are generated for each component. ``eval`` must
        return one value per observation plus a responsibility matrix whose
        rows sum to one and whose row-wise argmax is the generating
        component.
        """
        b = self.model(n_components=self.n_components, random_state=rng)
        # Make sure the means are far apart so responsibilities.argmax()
        # picks the actual component used to generate the observations.
        b.means_ = self.means
        b.log_odds_, b.log_inv_mean_sums_ = (
            bernoullimm._compute_log_odds_inv_means_sums(self.means))
        b.weights_ = self.weights

        bernoulliidx = np.repeat(np.arange(self.n_components), 5)
        n_samples = len(bernoulliidx)
        # A uniform draw u satisfies P(u <= p) = p, so thresholding it at the
        # component means yields Bernoulli(p) features. The previous
        # standard-normal draw gave P = Phi(p) instead, which does not follow
        # the model being evaluated.
        # NOTE(review): assumes the means are probabilities in [0, 1] and
        # that rng is a numpy RandomState (or the numpy.random module).
        uniform_draws = rng.rand(n_samples, self.n_features)
        X = (uniform_draws <= b.means_[bernoulliidx]).astype(np.uint8)

        ll, responsibilities = b.eval(X)

        self.assertEqual(len(ll), n_samples)
        self.assertEqual(responsibilities.shape,
                         (n_samples, self.n_components))
        assert_array_almost_equal(responsibilities.sum(axis=1),
                                  np.ones(n_samples))
        assert_array_equal(responsibilities.argmax(axis=1), bernoulliidx)
コード例 #7
0
def _prune_sparse_components(bmm, X, min_data_count):
    """Remove under-populated mixture components from ``bmm`` in place.

    While any component's summed responsibility over ``X`` is below
    ``min_data_count``, the component with the smallest sum is dropped, the
    remaining weights are renormalised and the model is refitted for one
    iteration starting from the surviving parameters.

    Returns the summed responsibilities of the components that remain.
    """
    component_counts = bmm.predict_proba(X).sum(0)
    # NOTE(review): nothing stops this loop from removing the last remaining
    # component when even that one is below the threshold -- confirm that
    # BernoulliMM tolerates that or that the inputs rule it out.
    while (component_counts < min_data_count).sum() > 0:
        n_use_components = len(component_counts) - 1
        use_components = np.ones(len(component_counts), dtype=bool)
        use_components[np.argmin(component_counts)] = False

        bmm.means_ = bmm.means_[use_components]
        bmm.weights_ = bmm.weights_[use_components]
        bmm.weights_ /= bmm.weights_.sum()
        bmm.n_components = n_use_components
        bmm.log_odds_, bmm.log_inv_mean_sums_ = (
            bernoullimm._compute_log_odds_inv_means_sums(bmm.means_))

        # An empty init_params keeps fit() from re-initialising, so the
        # single iteration starts from the surviving components.
        bmm.n_iter = 1
        bmm.init_params = ''
        bmm.fit(X)

        component_counts = bmm.predict_proba(X).sum(0)
        print(component_counts)
    return component_counts


def main(args):
    """Compute per-phone template averages and save them as arrays.

    For each phone named in the file ``args.phones`` the array stored at
    ``<args.data_prefix>/<phone>_<args.data_suffix>`` is loaded. With
    ``args.ncomponents == 1`` the template is the mean over the first axis;
    otherwise a ``BernoulliMM`` with ``args.ncomponents`` components is
    fitted using the ``EM`` section of the configuration (``n_init``,
    ``n_iter``, ``tol``, ``random_seed``) and components whose summed
    responsibility is below ``min_data_count`` are pruned.

    Four arrays, truncated to the number of templates actually produced, are
    written under ``args.out_prefix`` with the suffix ``args.out_suffix``:
    ``avgs_`` (templates), ``counts_`` (data count per template),
    ``weights_`` (mixture weight per template) and ``meta_`` (phone id and
    component index per template).

    ``args.v`` enables progress output and is also passed to the mixture
    model as ``verbose``.
    """
    # Close the config file once it is parsed instead of leaking the handle.
    with open(args.config, 'r') as config_file:
        config_d = configParserWrapper.load_settings(config_file)
    # ndmin=1 stops a one-line phone list from collapsing into a 0-d array,
    # on which len() and iteration would fail.
    phones = np.loadtxt(args.phones, dtype=str, ndmin=1)
    max_n_classifiers = len(phones) * args.ncomponents

    classifier_id = 0
    print("ncomponents = %d" % args.ncomponents)
    for phone_id, phone in enumerate(phones):
        if args.v:
            print("Working on phone %s which has id %d" % (phone, phone_id))
            print("classifier_id = %d" % classifier_id)
        X = np.load('%s/%s_%s' % (args.data_prefix,
                                  phone,
                                  args.data_suffix))

        if phone_id == 0:
            # The output buffers are allocated lazily because the template
            # shape is only known once the first data array is loaded.
            avgs = np.zeros((max_n_classifiers,) + X.shape[1:])
            counts = np.zeros(max_n_classifiers)
            # will keep track of which average belongs to which
            # phone and mixture component--this allows us to
            # drop mixture components if they are potentially
            # not helping
            weights = np.zeros(max_n_classifiers, dtype=float)
            meta = np.zeros((max_n_classifiers, 2), dtype=int)

        if args.ncomponents == 1:
            # Exactly one template per phone, so phone_id and classifier_id
            # coincide throughout this branch.
            avgs[phone_id] = X.mean(0)
            counts[phone_id] = X.shape[0]
            weights[phone_id] = 1
            meta[phone_id, 0] = phone_id
            meta[phone_id, 1] = 0
            classifier_id += 1
        else:
            em_settings = config_d['EM']
            bmm = bernoullimm.BernoulliMM(
                n_components=args.ncomponents,
                n_init=em_settings['n_init'],
                n_iter=em_settings['n_iter'],
                tol=em_settings['tol'],
                random_state=em_settings['random_seed'],
                verbose=args.v)
            bmm.fit(X)

            cur_counts = _prune_sparse_components(
                bmm, X, em_settings['min_data_count'])

            n_use_components = bmm.n_components
            # Restore the per-template shape of the input data.
            cur_means = bmm.means_.reshape(
                (n_use_components,) + avgs.shape[1:])

            stop = classifier_id + n_use_components
            avgs[classifier_id:stop] = cur_means
            counts[classifier_id:stop] = cur_counts
            weights[classifier_id:stop] = bmm.weights_
            meta[classifier_id:stop, 0] = phone_id
            meta[classifier_id:stop, 1] = np.arange(n_use_components)

            # make sure we move forward in the vector
            classifier_id = stop

    print("Total of %d models" % classifier_id)
    # Pruning may leave fewer templates than were allocated; save only the
    # rows that were filled in.
    np.save('%s/avgs_%s' % (args.out_prefix, args.out_suffix),
            avgs[:classifier_id])
    np.save('%s/counts_%s' % (args.out_prefix, args.out_suffix),
            counts[:classifier_id])
    np.save('%s/weights_%s' % (args.out_prefix, args.out_suffix),
            weights[:classifier_id])

    np.save('%s/meta_%s' % (args.out_prefix, args.out_suffix),
            meta[:classifier_id])