def test_train(self, params='wm'):
    """Check that the training log likelihood grows between EM iterations.

    A reference mixture is configured from the fixture's weights and
    means, data is sampled from it, and a fresh model is then trained on
    that data one iteration at a time while its score is recorded.
    """
    # Reference mixture used only to generate the training data.
    generator = bernoullimm.BernoulliMM(n_components=self.n_components)
    generator.weights_ = self.weights
    generator.means_ = self.means
    derived = bernoullimm._compute_log_odds_inv_means_sums(self.means)
    generator.log_odds_, generator.log_inv_mean_sums_ = derived

    # Create a training set by sampling from the predefined distribution.
    X = generator.sample(n_samples=100)

    learner = self.model(n_components=self.n_components,
                         random_state=rng,
                         n_iter=1,
                         init_params=params)
    learner.fit(X)

    # Do one training iteration at a time so we can keep track of
    # the log likelihood to make sure that it increases after each
    # iteration.
    trainll = []
    for _ in range(5):
        learner.params = params
        learner.init_params = ''
        learner.fit(X)
        trainll.append(self.score(learner, X))

    # Finish fitting with a larger iteration budget.
    learner.n_iter = 10
    learner.init_params = ''
    learner.params = params
    learner.fit(X)

    delta_min = np.diff(trainll).min()
    failure_message = (
        "The min nll increase is %f which is lower than the admissible"
        " threshold of %f. The likelihoods are %s."
        % (delta_min, self.threshold, trainll))
    self.assertTrue(delta_min > self.threshold, failure_message)
def test_train(self, params='wm'):
    """Check that the training log likelihood grows between EM iterations.

    A reference mixture is configured from the fixture's weights and
    means, data is sampled from it, and a fresh model is then trained on
    that data one iteration at a time while its score is recorded.
    """
    # Reference mixture used only to generate the training data.
    generator = bernoullimm.BernoulliMM(n_components=self.n_components)
    generator.weights_ = self.weights
    generator.means_ = self.means
    derived = bernoullimm._compute_log_odds_inv_means_sums(self.means)
    generator.log_odds_, generator.log_inv_mean_sums_ = derived

    # Create a training set by sampling from the predefined distribution.
    X = generator.sample(n_samples=100)

    learner = self.model(n_components=self.n_components,
                         random_state=rng,
                         n_iter=1,
                         init_params=params)
    learner.fit(X)

    # Do one training iteration at a time so we can keep track of
    # the log likelihood to make sure that it increases after each
    # iteration.
    trainll = []
    for _ in range(5):
        learner.params = params
        learner.init_params = ''
        learner.fit(X)
        trainll.append(self.score(learner, X))

    # Finish fitting with a larger iteration budget.
    learner.n_iter = 10
    learner.init_params = ''
    learner.params = params
    learner.fit(X)

    delta_min = np.diff(trainll).min()
    failure_message = (
        "The min nll increase is %f which is lower than the admissible"
        " threshold of %f. The likelihoods are %s."
        % (delta_min, self.threshold, trainll))
    self.assertTrue(delta_min > self.threshold, failure_message)
def test_sample(self, n=100):
    """Check that ``sample(n)`` returns an array of shape (n, n_features).

    The model is not fitted; its parameters are filled in directly from
    the fixture's means and weights before sampling.
    """
    mixture = self.model(n_components=self.n_components, random_state=rng)

    mixture.means_ = self.means
    derived = bernoullimm._compute_log_odds_inv_means_sums(self.means)
    mixture.log_odds_, mixture.log_inv_mean_sums_ = derived
    mixture.weights_ = self.weights

    drawn = mixture.sample(n)

    expected_shape = (n, self.n_features)
    self.assertEqual(drawn.shape, expected_shape)
def test_sample(self, n=100):
    """Check that ``sample(n)`` returns an array of shape (n, n_features).

    The model is not fitted; its parameters are filled in directly from
    the fixture's means and weights before sampling.
    """
    mixture = self.model(n_components=self.n_components, random_state=rng)

    mixture.means_ = self.means
    derived = bernoullimm._compute_log_odds_inv_means_sums(self.means)
    mixture.log_odds_, mixture.log_inv_mean_sums_ = derived
    mixture.weights_ = self.weights

    drawn = mixture.sample(n)

    expected_shape = (n, self.n_features)
    self.assertEqual(drawn.shape, expected_shape)
def test_eval(self):
    """Check the shapes and consistency of ``eval`` on labelled data.

    Five observations are generated for every mixture component, and the
    test asserts that ``eval`` returns one log likelihood per sample, a
    (n_samples, n_components) responsibility matrix whose rows sum to
    one, and that the most responsible component for each sample is the
    one that generated it.
    """
    b = self.model(n_components=self.n_components, random_state=rng)
    # Make sure the means are far apart so responsibilities.argmax()
    # picks the actual component used to generate the observations.
    b.means_ = self.means
    b.log_odds_, b.log_inv_mean_sums_ = (
        bernoullimm._compute_log_odds_inv_means_sums(self.means))
    b.weights_ = self.weights

    bernoulliidx = np.repeat(np.arange(self.n_components), 5)
    n_samples = len(bernoulliidx)

    # Draw Bernoulli(mean) observations: a uniform variate on [0, 1) is
    # <= p with probability p.  Thresholding standard-normal noise
    # (randn) instead would give P(x=1) = Phi(p), which does not follow
    # the component means and makes the argmax check below unreliable.
    uniform_draws = rng.rand(n_samples, self.n_features)
    X = (uniform_draws <= b.means_[bernoulliidx]).astype(np.uint8)

    ll, responsibilities = b.eval(X)

    self.assertEqual(len(ll), n_samples)
    self.assertEqual(responsibilities.shape,
                     (n_samples, self.n_components))
    assert_array_almost_equal(responsibilities.sum(axis=1),
                              np.ones(n_samples))
    assert_array_equal(responsibilities.argmax(axis=1), bernoulliidx)
def test_eval(self):
    """Check the shapes and consistency of ``eval`` on labelled data.

    Five observations are generated for every mixture component, and the
    test asserts that ``eval`` returns one log likelihood per sample, a
    (n_samples, n_components) responsibility matrix whose rows sum to
    one, and that the most responsible component for each sample is the
    one that generated it.
    """
    b = self.model(n_components=self.n_components, random_state=rng)
    # Make sure the means are far apart so responsibilities.argmax()
    # picks the actual component used to generate the observations.
    b.means_ = self.means
    b.log_odds_, b.log_inv_mean_sums_ = (
        bernoullimm._compute_log_odds_inv_means_sums(self.means))
    b.weights_ = self.weights

    bernoulliidx = np.repeat(np.arange(self.n_components), 5)
    n_samples = len(bernoulliidx)

    # Draw Bernoulli(mean) observations: a uniform variate on [0, 1) is
    # <= p with probability p.  Thresholding standard-normal noise
    # (randn) instead would give P(x=1) = Phi(p), which does not follow
    # the component means and makes the argmax check below unreliable.
    uniform_draws = rng.rand(n_samples, self.n_features)
    X = (uniform_draws <= b.means_[bernoulliidx]).astype(np.uint8)

    ll, responsibilities = b.eval(X)

    self.assertEqual(len(ll), n_samples)
    self.assertEqual(responsibilities.shape,
                     (n_samples, self.n_components))
    assert_array_almost_equal(responsibilities.sum(axis=1),
                              np.ones(n_samples))
    assert_array_equal(responsibilities.argmax(axis=1), bernoulliidx)
def _prune_sparse_components(bmm, X, min_data_count):
    """Remove under-supported mixture components from ``bmm`` and refit.

    The soft count of a component is the column sum of
    ``bmm.predict_proba(X)``.  While any count is below
    ``min_data_count`` the component with the smallest count is removed,
    the remaining weights are renormalised, the derived log-odds
    quantities are recomputed and the model is refit on ``X`` with
    ``n_iter = 1`` and ``init_params = ''``.

    ``bmm`` is modified in place.  Returns the soft counts of the
    components that remain.
    """
    component_counts = bmm.predict_proba(X).sum(0)
    # Stop at a single component: removing the last one would leave
    # empty parameter arrays and divide the weights by zero.
    while (len(component_counts) > 1
           and (component_counts < min_data_count).any()):
        keep = np.ones(len(component_counts), dtype=bool)
        keep[np.argmin(component_counts)] = False

        bmm.means_ = bmm.means_[keep]
        bmm.weights_ = bmm.weights_[keep]
        bmm.weights_ /= bmm.weights_.sum()
        bmm.n_components = len(component_counts) - 1
        bmm.log_odds_, bmm.log_inv_mean_sums_ = (
            bernoullimm._compute_log_odds_inv_means_sums(bmm.means_))

        # presumably an empty init_params makes fit() continue from the
        # parameters set above rather than re-initialising -- confirm
        # against bernoullimm.
        bmm.n_iter = 1
        bmm.init_params = ''
        bmm.fit(X)
        component_counts = bmm.predict_proba(X).sum(0)
    return component_counts


def main(args):
    """
    Get all the data matrices and process the data.

    For every phone listed in ``args.phones`` the array stored at
    ``<data_prefix>/<phone>_<data_suffix>`` is loaded and summarised
    either by its mean (``args.ncomponents == 1``) or by a Bernoulli
    mixture whose under-supported components are pruned.  The stacked
    results are saved with ``np.save`` under ``args.out_prefix``:

    * ``avgs_<out_suffix>``    -- one template per retained classifier
    * ``counts_<out_suffix>``  -- data count behind each template
    * ``weights_<out_suffix>`` -- mixture weight of each template
    * ``meta_<out_suffix>``    -- (phone id, component index) per template

    Attributes read from ``args``: ``config``, ``phones``,
    ``ncomponents``, ``v``, ``data_prefix``, ``data_suffix``,
    ``out_prefix`` and ``out_suffix``.

    Raises ``ValueError`` if the phone list is empty.
    """
    # Close the configuration file as soon as it has been parsed.
    with open(args.config, 'r') as config_file:
        config_d = configParserWrapper.load_settings(config_file)

    # loadtxt returns a 0-d array for a single-entry file; atleast_1d
    # keeps len() and iteration working in that case.
    phones = np.atleast_1d(np.loadtxt(args.phones, dtype=str))
    if len(phones) == 0:
        raise ValueError("no phones listed in %s" % args.phones)

    max_n_classifiers = len(phones) * args.ncomponents
    classifier_id = 0
    avgs = counts = weights = meta = None

    print("ncomponents = %d" % args.ncomponents)
    for phone_id, phone in enumerate(phones):
        if args.v:
            print("Working on phone %s which has id %d" % (phone, phone_id))
            print("classifier_id = %d" % classifier_id)

        X = np.load('%s/%s_%s' % (args.data_prefix,
                                  phone,
                                  args.data_suffix))

        if avgs is None:
            # The template shape is only known once the first data
            # matrix has been loaded.
            avgs = np.zeros((max_n_classifiers,) + X.shape[1:])
            counts = np.zeros(max_n_classifiers)
            # will keep track of which average belongs to which
            # phone and mixture component--this allows us to
            # drop mixture components if they are potentially
            # not helping
            weights = np.zeros(max_n_classifiers, dtype=float)
            meta = np.zeros((max_n_classifiers, 2), dtype=int)

        if args.ncomponents == 1:
            # A single component is just the data mean with weight one.
            n_use_components = 1
            cur_means = X.mean(0)
            cur_counts = X.shape[0]
            cur_weights = 1
        else:
            em_settings = config_d['EM']
            bmm = bernoullimm.BernoulliMM(
                n_components=args.ncomponents,
                n_init=em_settings['n_init'],
                n_iter=em_settings['n_iter'],
                tol=em_settings['tol'],
                random_state=em_settings['random_seed'],
                verbose=args.v)
            bmm.fit(X)

            cur_counts = _prune_sparse_components(
                bmm, X, em_settings['min_data_count'])
            print(cur_counts)

            n_use_components = bmm.n_components
            cur_means = bmm.means_.reshape(
                (n_use_components,) + avgs.shape[1:])
            cur_weights = bmm.weights_

        rows = slice(classifier_id, classifier_id + n_use_components)
        avgs[rows] = cur_means
        counts[rows] = cur_counts
        weights[rows] = cur_weights
        meta[rows, 0] = phone_id
        meta[rows, 1] = np.arange(n_use_components)

        # make sure we move forward in the vector
        classifier_id += n_use_components

    print("Total of %d models" % classifier_id)
    # Only the first classifier_id rows were filled; pruning can leave
    # the tail of the preallocated arrays unused.
    np.save('%s/avgs_%s' % (args.out_prefix, args.out_suffix),
            avgs[:classifier_id])
    np.save('%s/counts_%s' % (args.out_prefix, args.out_suffix),
            counts[:classifier_id])
    np.save('%s/weights_%s' % (args.out_prefix, args.out_suffix),
            weights[:classifier_id])
    np.save('%s/meta_%s' % (args.out_prefix, args.out_suffix),
            meta[:classifier_id])