Example #1
import numpy as np
from sklearn.naive_bayes import MultinomialNB


def deserialize_multinomial_nb(model_dict):
    # Rebuild the estimator from its saved hyperparameters. Note the
    # dict must be unpacked as keyword arguments, not passed positionally.
    model = MultinomialNB(**model_dict['params'])

    # Restore the fitted attributes learned during training.
    model.classes_ = np.array(model_dict['classes_'])
    model.class_count_ = np.array(model_dict['class_count_'])
    model.class_log_prior_ = np.array(model_dict['class_log_prior_'])
    model.feature_count_ = np.array(model_dict['feature_count_'])
    model.feature_log_prob_ = np.array(model_dict['feature_log_prob_'])

    return model
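This function assumes a dictionary produced by a matching serializer. A minimal sketch of such a counterpart, where serialize_multinomial_nb and the exact dict layout are assumptions rather than part of the original:

def serialize_multinomial_nb(model):
    # Hypothetical counterpart: capture hyperparameters and fitted
    # attributes as plain lists so the dict is JSON-serializable.
    return {
        'params': model.get_params(),
        'classes_': model.classes_.tolist(),
        'class_count_': model.class_count_.tolist(),
        'class_log_prior_': model.class_log_prior_.tolist(),
        'feature_count_': model.feature_count_.tolist(),
        'feature_log_prob_': model.feature_log_prob_.tolist(),
    }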
Example #2
import numpy as np
import pandas as pd
from sklearn.naive_bayes import BernoulliNB, MultinomialNB


def _naive_bayes_predict(table, model, suffix, display_log_prob=False,
                         prediction_col='prediction', prob_prefix='probability',
                         log_prob_prefix='log_probability'):
    # Support both key names used by different model versions.
    if 'features' in model:
        feature_cols = model['features']
    else:
        feature_cols = model['feature_cols']
    # check_col_type is a helper from the surrounding module.
    feature_names, features = check_col_type(table, feature_cols)

    if 'nb_model' in model:
        nb_model = model['nb_model']
    else:
        # Reconstruct the estimator from a serialized parameter table.
        model_table = model['table_1']
        if model_table.model_type[0] == 'multinomial':
            nb_model = MultinomialNB()
        else:
            nb_model = BernoulliNB()
        # Dummy fit so scikit-learn marks the estimator as fitted, then
        # overwrite the learned attributes with the stored parameters.
        nb_model.fit([[1]], [1])
        nb_model.classes_ = np.array([0, 1])
        nb_model.class_log_prior_ = model_table.pi.values
        nb_model.feature_log_prob_ = np.array(list(model_table.theta))

    prediction = nb_model.predict(features)
    if 'label_encoder' in model:
        # Map integer predictions back to the original class labels.
        label_encoder = model['label_encoder']
        prediction = label_encoder.inverse_transform(prediction)
        if suffix == 'label':
            suffixes = label_encoder.classes_
        else:
            suffixes = range(len(label_encoder.classes_))
    else:
        suffixes = [0, 1]

    prob = nb_model.predict_proba(features)
    prob_cols = ['{prefix}_{suffix}'.format(prefix=prob_prefix, suffix=suffix)
                 for suffix in suffixes]
    prob_df = pd.DataFrame(data=prob, columns=prob_cols)

    result = table
    result[prediction_col] = prediction

    if display_log_prob:
        log_prob = nb_model.predict_log_proba(features)
        logprob_cols = ['{prefix}_{suffix}'.format(prefix=log_prob_prefix, suffix=suffix)
                        for suffix in suffixes]
        logprob_df = pd.DataFrame(data=log_prob, columns=logprob_cols)
        result = pd.concat([result, prob_df, logprob_df], axis=1)
    else:
        result = pd.concat([result, prob_df], axis=1)

    return {'out_table': result}
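A minimal usage sketch, assuming a model dict that carries a fitted estimator under 'nb_model'. Since check_col_type is an external helper in the original module, a stand-in with an assumed signature is defined here; the toy data is illustrative:

import pandas as pd
from sklearn.naive_bayes import MultinomialNB

# Stand-in for the module's check_col_type helper (assumed behavior):
# return the column names and the feature matrix.
def check_col_type(table, feature_cols):
    return feature_cols, table[feature_cols].values

X = pd.DataFrame({'f1': [1, 0, 2, 3], 'f2': [0, 1, 1, 0]})
nb = MultinomialNB().fit(X[['f1', 'f2']], [0, 1, 1, 0])

model = {'feature_cols': ['f1', 'f2'], 'nb_model': nb}
out = _naive_bayes_predict(X, model, suffix='index')['out_table']
print(out[['prediction', 'probability_0', 'probability_1']])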
Example #3
    def runEM(self):
        ''' Initializes, then iteratively runs, the EM algorithm to cluster
            self.documents into self.n_categories different classes. '''
        self.initializeEM(self.randomize)
        initial_ll = loglikelihood(self.models[-1], self.documents)
        self.likelihoods.append(initial_ll)

        print("EM initial likelihood: %s" % initial_ll)

        for iter_n in range(self.max_iterations):
            done = False

            try:
                prev_likelihood = self.likelihoods[-1]
            except IndexError:
                prev_likelihood = float('-inf')

            nb = MultinomialNB(**self.kwargs)
            # Add a faked "classes_" attribute so scikit-learn treats the
            # estimator as fitted; only its shape matters here.
            nb.classes_ = np.ndarray((self.n_categories, ))
            # Inject the current parameters to actually "train" it.
            nb.class_log_prior_ = self.class_log_priors[-1]
            nb.feature_log_prob_ = self.feature_log_probs[-1]

            soft_predictions = self.e_step(nb)
            nb = self.m_step(soft_predictions)

            # TODO: can speed up by a factor of two by combining the
            # log-likelihood calculation with the soft prediction.
            ll = loglikelihood(nb, self.documents)

            self.models.append(nb)

            # Check the relative likelihood change for convergence.
            self.likelihoods.append(ll)
            if abs((ll - prev_likelihood) /
                   prev_likelihood) < LIKELIHOOD_EPSILON:
                done = True

            print("EM iteration %s of %s" % (iter_n, self.max_iterations), ll)

            if done:
                break
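The E-step above depends on a MultinomialNB whose fitted attributes are populated by hand so that predict_proba works without ever calling fit. A standalone sketch of that trick, with illustrative shapes and data (behavior may vary across scikit-learn versions):

import numpy as np
from sklearn.naive_bayes import MultinomialNB

n_categories, n_features = 3, 5

nb = MultinomialNB()
nb.classes_ = np.arange(n_categories)  # fake "fitted" marker
nb.class_log_prior_ = np.log(np.full(n_categories, 1.0 / n_categories))
# Random per-class feature distributions, normalized over the vocabulary.
theta = np.random.dirichlet(np.ones(n_features), size=n_categories)
nb.feature_log_prob_ = np.log(theta)

docs = np.random.randint(0, 4, size=(2, n_features))  # toy count vectors
print(nb.predict_proba(docs))  # soft class assignments, rows sum to 1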