def get_classifier(vocabulary):
    '''
    Train the classifier that assigns an abstracted sentence to a template.

    Every file in ``./Qdata/question/`` whose name starts with "【" provides
    training sentences (one per line, whitespace-stripped) for the integer
    label written between "【" and "】" in that file name. Each sentence is
    encoded as a binary vector with a 1 in every column whose vocabulary
    entry occurs in the sentence as a substring.

    :param vocabulary: sized iterable of vocabulary strings
    :return: a ComplementNB classifier fitted on the encoded sentences
    '''
    root = "./Qdata/question/"

    # Prepare the dataset: gather sentences and their template labels.
    sentences = []
    labels = []
    template_files = [name for name in os.listdir(root) if name[0] == "【"]
    for name in template_files:
        label = int(name[name.index("【") + 1:name.index("】")])
        with open(root + name, "r", encoding="utf-8") as f:
            stripped = [line.strip() for line in f]
        sentences.extend(stripped)
        labels.extend([label] * len(stripped))

    # Binary bag-of-words matrix: rows are sentences, columns are vocabulary.
    features = np.zeros((len(sentences), len(vocabulary)))
    for col, voc in enumerate(vocabulary):
        for row, sentence in enumerate(sentences):
            if voc in sentence:
                features[row, col] = 1

    classifier = ComplementNB()
    classifier.fit(features, labels)
    return classifier
Example #2
0
def realizar_treinamento(registros_de_treino, vetorizador):
    """Vectorize the training comments and fit a ComplementNB model on them.

    Records are read positionally: item 0 is the comment text and item 1 is
    the expected answer (label).

    :param registros_de_treino: re-iterable collection of records
    :param vetorizador: object exposing ``fit_transform``; it is fitted here
        on the training comments
    :return: the fitted ComplementNB model
    """
    comentarios = [registro[0] for registro in registros_de_treino]
    respostas = [registro[1] for registro in registros_de_treino]

    matriz_de_treino = vetorizador.fit_transform(comentarios)

    # Alternatives kept for reference: BernoulliNB(), MultinomialNB().
    return ComplementNB().fit(matriz_de_treino, respostas)
Example #3
0
class CNBTwoStepClassifier(ImbalancedTrainerInterface):
    """Two-stage Complement Naive Bayes classifier.

    Stage one (``clf_yn``) is trained on all samples, where the samples in
    the second partition returned by ``_partition`` are relabelled ``'not'``.
    Stage two (``clf_n``) is trained on that second partition only and is
    consulted whenever stage one answers ``'not'``.

    NOTE(review): ``_find_dominant_class`` and ``_partition`` are not visible
    here (presumably inherited); their exact semantics should be confirmed.
    """

    def __init__(self, alpha=1):
        """Create both stages with the same smoothing parameter ``alpha``."""
        self.alpha = alpha
        self.clf_yn = ComplementNB(alpha=alpha)
        self.clf_n = ComplementNB(alpha=alpha)

    def fit(self, X_train, y_train):
        """Fit both stages.

        :param X_train: matrix exposing ``toarray()``
        :param y_train: labels, wrapped into a ``pd.Series``
        """
        rows = X_train.toarray().tolist()
        targets = pd.Series(y_train)
        dominant = self._find_dominant_class(rows, targets)
        x_w, x_o, y_w, y_o = self._partition(rows, targets, dominant[0])
        # Stage one sees everything; the second partition collapses to 'not'.
        self.clf_yn.fit(x_w + x_o, y_w + ['not'] * len(y_o))
        # Stage two only learns to tell the second partition's classes apart.
        self.clf_n.fit(x_o, y_o)

    def predict(self, X_test):
        """Return a list of predictions, deferring 'not' rows to stage two."""
        first_pass = self.clf_yn.predict(X_test)
        return [
            self.clf_n.predict(sample)[0] if guess == 'not' else guess
            for guess, sample in zip(first_pass, X_test)
        ]

    def score(self, X_test, y_test):
        """Return the fraction of predictions that equal ``y_test``."""
        expected = pd.Series(y_test)
        predicted = self.predict(X_test)
        return np.mean(predicted == expected)
Example #4
0
def confusion_matrix():
    '''
    Fit a TF-IDF + ComplementNB model on the top 15 varieties and display its
    confusion matrix, normalised over the true labels.

    The vectorizer and model are hard-coded; swapping either one currently
    means editing this function.
    '''
    handler = Data_Handler('data/cleaned_data.csv')
    top_varieties = handler.get_top_num(15)
    stop_words = handler.stop_words

    texts_train, texts_test, y_train, y_test = train_test_split(
        top_varieties['description'], top_varieties['variety'])

    vectorizer = TfidfVectorizer(stop_words=stop_words)
    features_train = vectorizer.fit_transform(texts_train)
    features_test = vectorizer.transform(texts_test)

    model = ComplementNB()
    model.fit(features_train, y_train)

    # Fixed display order of the classes on both axes.
    display_order = [
        'Pinot Noir', 'Cabernet Sauvignon', 'Red Blend',
        'Bordeaux-style Red Blend', 'Syrah', 'Merlot', 'Zinfandel',
        'Sangiovese', 'Malbec', 'Nebbiolo', 'Rosé', 'Chardonnay',
        'Sauvignon Blanc', 'Riesling', 'White Blend'
    ]
    plot_confusion_matrix(
        model,
        features_test,
        y_test,
        normalize='true',
        xticks_rotation='vertical',
        labels=display_order,
        include_values=False,
    )
    plt.show()
Example #5
0
def get_optimal_values_ComplementNB(x_train, y_train, x_val, y_val):
    """Grid-search ComplementNB hyper-parameters against a validation set.

    Tries every combination of ``alpha`` in 0.0, 0.1, ..., 1.0 with
    ``fit_prior`` and ``norm`` in (True, False). Each model is fitted on the
    training data and scored by accuracy (as a percentage) on the validation
    data. The first combination reaching the highest score wins; if no
    combination scores above 0, the initial defaults
    (alpha=1.0, fit_prior=True, norm=False) are returned with a score of 0.

    :param x_train: training features
    :param y_train: training labels
    :param x_val: validation features
    :param y_val: validation labels
    :return: tuple ``(max_score, optimal_alpha, optimal_fit_prior,
        optimal_norm)``; the same four values are also printed
    """
    alphas = [x / 10 for x in range(0, 11)]
    fit_priors = [True, False]
    norms = [True, False]
    max_score = 0
    optimal_fit_prior = True
    optimal_alpha = 1.0
    optimal_norm = False

    # Evaluate every combination and keep the best-scoring one.
    for alpha in alphas:
        for fit_prior in fit_priors:
            for norm in norms:
                naive = ComplementNB(alpha=alpha,
                                     fit_prior=fit_prior,
                                     norm=norm)
                naive.fit(x_train, y_train)
                # Score once per combination and reuse the value.
                score = accuracy_score(y_val, naive.predict(x_val)) * 100
                if max_score < score:
                    max_score = score
                    optimal_alpha = alpha
                    optimal_fit_prior = fit_prior
                    optimal_norm = norm
    print(max_score, optimal_alpha, optimal_fit_prior, optimal_norm)
    return max_score, optimal_alpha, optimal_fit_prior, optimal_norm
Example #6
0
def test_cnb():
    """Check ComplementNB counts and weights on a four-document toy corpus.

    Fits ``ComplementNB(alpha=1.0)`` on hand-written count vectors, then
    compares ``feature_count_``, ``class_count_``, ``feature_all_`` and
    ``feature_log_prob_`` against values computed by hand below. Also checks
    that fitting on negative input raises ``ValueError``.
    """
    # Tests ComplementNB when alpha=1.0 for the toy example in Manning,
    # Raghavan, and Schuetze's "Introduction to Information Retrieval" book:
    # http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html

    # Training data points are:
    # Chinese Beijing Chinese (class: China)
    # Chinese Chinese Shanghai (class: China)
    # Chinese Macao (class: China)
    # Tokyo Japan Chinese (class: Japan)

    # Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo.
    X = np.array([[1, 1, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0],
                  [0, 1, 0, 1, 0, 0],
                  [0, 1, 1, 0, 0, 1]])

    # Classes are China (0), Japan (1).
    Y = np.array([0, 0, 0, 1])

    # Verify inputs are nonnegative.
    clf = ComplementNB(alpha=1.0)
    assert_raises(ValueError, clf.fit, -X, Y)

    clf.fit(X, Y)

    # Check that counts are correct.
    feature_count = np.array([[1, 3, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1]])
    assert_array_equal(clf.feature_count_, feature_count)
    class_count = np.array([3, 1])
    assert_array_equal(clf.class_count_, class_count)
    feature_all = np.array([1, 4, 1, 1, 1, 1])
    assert_array_equal(clf.feature_all_, feature_all)

    # Check that weights are correct. See steps 4-6 in Table 4 of
    # Rennie et al. (2003).
    # Row i is built from the counts of the *other* class (the complement of
    # class i), with +1 smoothing per feature and +6 (number of features) in
    # the denominator.
    theta = np.array([
        [
            (0 + 1) / (3 + 6),
            (1 + 1) / (3 + 6),
            (1 + 1) / (3 + 6),
            (0 + 1) / (3 + 6),
            (0 + 1) / (3 + 6),
            (1 + 1) / (3 + 6)
        ],
        [
            (1 + 1) / (6 + 6),
            (3 + 1) / (6 + 6),
            (0 + 1) / (6 + 6),
            (1 + 1) / (6 + 6),
            (1 + 1) / (6 + 6),
            (0 + 1) / (6 + 6)
        ]])

    # Expected weights: log of theta, each row divided by its own sum.
    weights = np.zeros(theta.shape)
    for i in range(2):
        weights[i] = np.log(theta[i])
        weights[i] /= weights[i].sum()

    # NOTE(review): this is an exact floating-point comparison, so it only
    # passes if the estimator performs the same arithmetic as above.
    assert_array_equal(clf.feature_log_prob_, weights)
def _complementnb(*,
                  train,
                  test,
                  x_predict=None,
                  metrics,
                  alpha=1.0,
                  fit_prior=True,
                  class_prior=None,
                  norm=False):
    """Fit a ComplementNB model, score it, and optionally predict new data.

    For more info visit:
        https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.ComplementNB.html#sklearn.naive_bayes.ComplementNB

    :param train: pair ``(features, labels)`` used for fitting
    :param test: pair ``(features, labels)`` used for scoring
    :param x_predict: optional features to predict after scoring
    :param metrics: one of ``'f1_score'``, ``'jaccard_score'`` or
        ``'accuracy_score'``
    :param alpha: forwarded to ``ComplementNB``
    :param fit_prior: forwarded to ``ComplementNB``
    :param class_prior: forwarded to ``ComplementNB``
    :param norm: forwarded to ``ComplementNB``
    :return: tuple ``(model_name, score, predictions)`` where ``predictions``
        is ``None`` when ``x_predict`` is ``None``
    :raises ValueError: if ``metrics`` is not one of the supported names
    """
    # Reject unknown metric names before training; previously they left the
    # score variable unbound and crashed only after the model had been fitted.
    if metrics not in ('f1_score', 'jaccard_score', 'accuracy_score'):
        raise ValueError(
            "metrics must be 'f1_score', 'jaccard_score' or "
            "'accuracy_score', got {!r}".format(metrics))

    model = ComplementNB(alpha=alpha,
                         fit_prior=fit_prior,
                         class_prior=class_prior,
                         norm=norm)
    model.fit(train[0], train[1])
    model_name = 'ComplementNB'
    y_hat = model.predict(test[0])

    if metrics == 'f1_score':
        accuracy = f1_score(test[1], y_hat)
    elif metrics == 'jaccard_score':
        accuracy = jaccard_score(test[1], y_hat)
    else:
        accuracy = accuracy_score(test[1], y_hat)

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
Example #8
0
def findBestFitCluster(orphanCorpus, corpusCluster={}):
    """
    Given a set of questions without a cluster and a set of existing clusters,
    fit a ComplementNB model on the clustered questions and compute, for each
    orphaned question, the predicted probability of every cluster.

    Parameters:
        orphanCorpus: iterable of dicts; each must provide a
            "question_vector" entry (see the sample below).
        corpusCluster: dict providing "question_vectors" (training features)
            and "clusterIds" (training labels). The default ``{}`` lacks both
            keys, so calling without this argument raises ``KeyError``.

    Returns:
        Nothing in the code visible here: ``predictions`` is computed but not
        returned. NOTE(review): the function looks truncated - confirm
        against the full source.
    """

    # Sample shape of corpusCluster:
    # corpusCluster = {
    #     "questions": [ 'and the moon too guys', 'lets show some or a lot of love for the moon!!' ],
    #     "question_vectors": [[], []],
    #     "clusterIds": [ '4', '4' ]
    # }

    # Sample shape of orphanCorpus:
    # orphanCorpus = [ {
    #         "id": 11, "question": 'Another one about the sun?', "question_vector": []
    #     },
    #     {
    #         "id": 33,
    #         "question": 'What is the distance from the sun though?', "question_vector": [] },
    #     {
    #         "id": 37,
    #         "question": 'what\'s the changing factors of the sun and moon together?', "question_vector": []
    # } ]

    # Fit the Naive Bayes model on the existing clusters
    clf = ComplementNB()
    clf.fit(corpusCluster["question_vectors"], corpusCluster["clusterIds"])

    # One row of per-cluster probabilities for each orphaned question
    predictions = clf.predict_proba(
        [doc["question_vector"] for doc in orphanCorpus])
def ComplementNB_classification(train,
                                test,
                                train_labels,
                                test_labels,
                                res=None):
    """
    Train a ComplementNB classifier, report on the test set, and record the
    result.

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict; the entry ``res["ComplementNB"]`` is set to a
        dict with the fitted model, its accuracy and its name. When omitted,
        a fresh dict is used for this call only.
    :return: tuple ``(score, model)`` - test accuracy and the fitted model
    """
    # A mutable default ({}) would be shared by every call and would keep the
    # last fitted model alive; create a new dict per call instead.
    if res is None:
        res = {}

    print("Classifying with Complement Nive Bayes...")

    complNB = ComplementNB()
    complNB.fit(train, train_labels)

    prediction = complNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "ComplementNB")
    score = complNB.score(test, test_labels)

    res["ComplementNB"] = {
        "model": complNB,
        "accuracy": score,
        "name": "ComplementNB"
    }
    print("Complement ended...")
    return score, complNB
Example #10
0
class ComplementNBImpl():
    """Wrapper that stores hyper-parameters and builds ``SKLModel`` in fit()."""

    def __init__(self,
                 alpha=1.0,
                 fit_prior=True,
                 class_prior=None,
                 norm=False):
        """Record the hyper-parameters; no model is created yet."""
        self._hyperparams = dict(
            alpha=alpha,
            fit_prior=fit_prior,
            class_prior=class_prior,
            norm=norm,
        )

    def fit(self, X, y=None):
        """Create a new ``SKLModel`` from the stored settings and fit it.

        ``y`` is forwarded only when it is not ``None``. Returns ``self``.
        """
        model = SKLModel(**self._hyperparams)
        self._sklearn_model = model
        if y is None:
            model.fit(X)
        else:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate to the fitted model's ``predict``."""
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the fitted model's ``predict_proba``."""
        return self._sklearn_model.predict_proba(X)
Example #11
0
def main():
    """Compare three Naive Bayes variants on an 80/20 split of the wine data.

    Prints the test accuracy (rounded to 4 decimals) of MultinomialNB,
    GaussianNB and ComplementNB, in that order.
    """
    # Iris or breast cancer dataset can be used too
    features, targets = datasets.load_wine(return_X_y=True)
    split = train_test_split(features,
                             targets,
                             test_size=0.2,
                             random_state=2405)
    x_train, x_test, y_train, y_test = split

    # Multinomial Naive Bayes
    mnb_accuracy = MultinomialNB().fit(x_train, y_train).score(x_test, y_test)
    print(f"MultinomialNB accuracy is {round(mnb_accuracy, 4)}")

    # Gaussian Naive Bayes
    gnb_accuracy = GaussianNB().fit(x_train, y_train).score(x_test, y_test)
    print(f"GaussianNB accuracy is {round(gnb_accuracy, 4)}")

    # Complement Naive Bayes
    cnb_accuracy = ComplementNB().fit(x_train, y_train).score(x_test, y_test)
    print(f"ComplementNB accuracy is {round(cnb_accuracy, 4)}")
Example #12
0
class bayes(object):
    """Naive Bayes model selected by a short code and fitted on construction.

    ``algorithm`` picks the estimator: ``'GNB'`` -> GaussianNB, ``'MNB'`` ->
    MultinomialNB, ``'BNB'`` -> BernoulliNB; any other value falls back to
    ComplementNB.
    """

    def __init__(self, data, target, algorithm="GNB"):
        """Store the inputs, build the chosen estimator and fit it."""
        self.algorithm = algorithm
        self.data = data
        self.target = target

        if algorithm == 'GNB':
            estimator_cls = GaussianNB
        elif algorithm == 'MNB':
            estimator_cls = MultinomialNB
        elif algorithm == 'BNB':
            estimator_cls = BernoulliNB
        else:
            estimator_cls = ComplementNB
        self.model = estimator_cls()

        self.model.fit(data, target)

    def save_model(self, path):
        """Serialize the current model to ``path`` with ``_joblib.dump``."""
        _joblib.dump(self.model, path)

    def load_model(self, path):
        """Replace the current model with the one stored at ``path``."""
        # NOTE: presumably joblib, whose load unpickles - only use trusted files.
        self.model = _joblib.load(path)

    def predict(self, x):
        """Return the model's predictions for ``x``."""
        return self.model.predict(x)
def pickling():
    '''
    Creates and pickles both the vectorizer and model for use in prediction.

    The TF-IDF vectorizer is fitted on the descriptions of the top 15
    varieties and written to ``pickles/text_vec.pkl``; a ComplementNB model
    fitted on the resulting matrix is written to ``pickles/model.pkl``.

    Parameters
    ----------
    None

    Returns
    ----------
    None
    '''

    wrangler = Data_Handler('data/cleaned_data.csv')
    stops = wrangler.stop_words
    df = wrangler.get_top_num(15)
    y = df['variety']

    vecto = TfidfVectorizer(stop_words=stops)
    X = vecto.fit_transform(df['description'])
    # Context managers guarantee the pickle files are flushed and closed;
    # the handles were previously left open.
    with open('pickles/text_vec.pkl', 'wb') as f:
        pickle.dump(vecto, f)

    model = ComplementNB()
    model.fit(X, y)
    with open('pickles/model.pkl', 'wb') as m:
        pickle.dump(model, m)
Example #14
0
def main(args):
    """Train and validate, or predict with, a ComplementNB model.

    Reads a pickled dict from ``<root>/data/<inFile>``; it must contain a
    ``'data'`` entry and, for training, a ``'target'`` entry. Results are
    written to ``<root>/data/<outFileName>.csv``.

    Training (``args.train`` truthy): a single shuffled split holds out
    ``args.ratio`` of the samples for validation, the model is fitted on the
    rest, accuracy and per-sample predictions are reported, and the model is
    dumped under ``<root>/model/``.

    Prediction (otherwise): a stored model is loaded and its predictions for
    ``data['data']`` are written to the CSV.

    :param args: namespace providing ``model_name``, ``root``, ``inFile``,
        ``outFileName``, ``train`` and, depending on the branch, ``ratio``,
        ``alpha``, ``fit_prior``, ``norm``, ``model_path``, ``model``
    """
    model_name = args.model_name
    model_dir = os.path.join(args.root, "model")  # get model dir
    data_dir = os.path.join(args.root, "data")  # get data dir

    data_path = os.path.join(data_dir, args.inFile)
    print('load data from' + data_path)

    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    # The context manager closes the handle, which was previously leaked.
    with open(data_path, 'rb') as data_file:
        data = pickle.load(data_file)
    out_path = os.path.join(data_dir, args.outFileName + '.csv')
    assert 'data' in data
    if args.train:
        ratio = args.ratio
        clf = ComplementNB(alpha=args.alpha,
                           fit_prior=args.fit_prior,
                           norm=args.norm)

        assert 'target' in data

        features = data['data']
        labels = data['target']

        # One random train/validation split with `ratio` held out.
        rs = ShuffleSplit(n_splits=1, test_size=ratio)
        train_index, val_index = next(rs.split(features, labels))

        x_train = features[train_index]
        x_test = features[val_index]

        y_train = labels[train_index]
        y_test = labels[val_index]

        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)

        # The accuracy
        print('Accuracy: \n', accuracy_score(y_test, y_pred))

        df = pd.DataFrame({
            'pred': y_pred,
            'target': y_test,
        })
        print(f'validation results save to:{args.outFileName}.csv')
        df.to_csv(out_path)
        print("Some results of validation:")
        print(df.head())

        # The file name used to interpolate an undefined name `model`, which
        # raised NameError after training and lost the fitted model.
        # NOTE(review): confirm the intended file-name pattern.
        model_path = os.path.join(model_dir, f'{model_name}.model')
        dump(clf, model_path)
    else:
        # TODO: How to Save the prediction?
        # NOTE(review): model_path is computed but the model is loaded from
        # args.model - confirm which of the two is intended.
        model_path = os.path.join(model_dir, args.model_path)
        clf = load(args.model)
        x = data['data']
        pred = clf.predict(x)
        df = pd.DataFrame({
            'pred': pred,
        })
        df.to_csv(out_path)
class NaiveBayes():
    """TF-IDF + ComplementNB author classifier, trained on construction.

    Loads tab-separated train/val/test/spurious files for the chosen
    ``division`` from ``../data/``, vectorizes the sentences with TF-IDF
    (n-grams from 1 up to ``ngram``) and fits ComplementNB on the encoded
    training authors. Rows of the spurious file whose ``work`` equals 36 are
    kept separately as ``df_epistles``.
    """

    def __init__(self, division="sents", ngram=1):

        def read_tsv(path, columns):
            # All data files are header-less and tab-separated.
            return pd.read_csv(path, sep='\t', names=columns)

        labelled_columns = ['sentence', 'author', 'work']
        self.df_train = read_tsv(f"../data/{division}_train.csv",
                                 labelled_columns)
        self.df_val = read_tsv(f"../data/{division}_val.csv",
                               labelled_columns)
        self.df_test = read_tsv(f"../data/{division}_test.csv",
                                labelled_columns)

        spurious = read_tsv(f"../data/{division}_spurious.csv",
                            ['sentence', 'work'])
        is_epistle = spurious['work'] == 36
        self.df_epistles = spurious[is_epistle]
        self.df_spurious = spurious[~is_epistle]

        stop_words = list(map(strip_accents, STOPS_LIST))
        self.tfidf = TfidfVectorizer(lowercase=False,
                                     stop_words=stop_words,
                                     ngram_range=(1, ngram))

        # The vocabulary is learned from the training sentences only.
        self.tfidf_train = self.tfidf.fit_transform(self.df_train['sentence'])
        self.tfidf_val = self.tfidf.transform(self.df_val['sentence'])
        self.tfidf_test = self.tfidf.transform(self.df_test['sentence'])
        self.tfidf_spurious = self.tfidf.transform(
            self.df_spurious['sentence'])
        self.tfidf_epistles = self.tfidf.transform(
            self.df_epistles['sentence'])

        # Authors are encoded with labels learned from the training split.
        self.label = LabelEncoder()
        self.author_train = self.label.fit_transform(self.df_train['author'])
        self.author_val = self.label.transform(self.df_val['author'])
        self.author_test = self.label.transform(self.df_test['author'])

        self.nb = ComplementNB()
        self.nb.fit(self.tfidf_train, self.author_train)

    def eval(self):
        """Print classification reports for the train and validation splits."""
        train_pred = self.nb.predict(self.tfidf_train)
        val_pred = self.nb.predict(self.tfidf_val)
        # NOTE(review): the test split is predicted but never reported.
        test_pred = self.nb.predict(self.tfidf_test)

        print(classification_report(self.author_train, train_pred))
        print(classification_report(self.author_val, val_pred))

    def predict(self):
        """Print how often the epistles and spurious works are labelled "Plato".

        For the epistles the predicted author labels are printed as well.
        """
        epistles_pred = self.nb.predict(self.tfidf_epistles)
        epistles_labels = self.label.inverse_transform(epistles_pred)
        print((epistles_labels == "Plato").mean())
        print(epistles_labels)

        spurious_pred = self.nb.predict(self.tfidf_spurious)
        spurious_labels = self.label.inverse_transform(spurious_pred)
        print((spurious_labels == "Plato").mean())
def run_compnb(x_train, x_test, y_train, y_test, x):
    '''Fit Complement Naive Bayes, record its results, return test predictions.

    The value returned by ``get_model_results`` is stored in the module-level
    ``model_dict`` under the key ``'compnb'``.
    '''
    logger.info("Running ComplementNB")
    model = ComplementNB()
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    results = get_model_results(model, x_test, y_test, predictions, x)
    model_dict['compnb'] = results
    return predictions
Example #17
0
class DocClfTfidfCNB():
    """Document classifier: TF-IDF features fed into Complement Naive Bayes.

    Documents are truncated to ``maxStringLength`` characters before
    vectorization. Vectorizer and smoothing settings (``maxdf``, ``mindf``,
    ``maxFeatures``, ``ngramrange``, ``alphasmooth``, ``minLength``) are read
    from module-level names.
    """

    def __init__(self, maxStringLength=MAXSTRINGLENGH,
                 firstStringLength=FIRSTSTRINGLENGTH):
        self.maxStringLength = maxStringLength
        self.firstStringLength = firstStringLength
        self.message = (
            "Complement Naive Bayes using TF-IDF with " + "%5d" % maxFeatures
            + " features "
            + " ngram-range " + "%2d" % ngramrange[0]
            + " to " + "%2d" % ngramrange[1]
            + " maxString Length " + "%6d" % self.maxStringLength)
        # Per-label confidence, filled in by confidence(). Kept under its own
        # name so it no longer overwrites the confidence() method.
        self.labelConfidence = {}

    def preprocess(self, x):
        """Return ``(truncated, beginnings)`` for the documents in ``x``.

        ``truncated`` holds each item cut to ``maxStringLength`` and
        ``beginnings`` each item cut to ``firstStringLength``.
        """
        xprocessed = [item[0:self.maxStringLength] for item in x]
        xbegin = [item[0:self.firstStringLength] for item in x]
        return xprocessed, xbegin

    def fit(self, x, y):
        """Fit the vectorizer and classifier; return training-set predictions."""
        # Build the term dictionary and the TF-IDF weights per document.
        xprocessed, xbegin = self.preprocess(x)
        self.vectorizer = TfidfVectorizer(max_df=maxdf, min_df=mindf,
                                          max_features=maxFeatures,
                                          ngram_range=ngramrange)
        xv = self.vectorizer.fit_transform(xprocessed)
        self.nbclf = ComplementNB(alpha=alphasmooth)
        self.nbclf.fit(xv, y)
        return self.nbclf.predict(xv)

    def predict(self, x):
        """Predict labels for a group of documents.

        Returns ``["No input"]`` when the first document is shorter than
        ``minLength``; otherwise the classifier's predictions. Errors from
        the vectorizer or classifier propagate to the caller.
        """
        if len(x[0]) < minLength:
            return ["No input"]
        xprocessed, xbegin = self.preprocess(x)
        xv = self.vectorizer.transform(xprocessed)
        return self.nbclf.predict(xv)

    def confidence(self, ytest, ytestpred):
        """Compute per-label confidence and return the confusion matrix.

        The confidence of a label is its diagonal entry divided by
        ``0.1 + column sum``; the 0.1 avoids division by zero for labels that
        were never predicted.
        """
        # Use one explicit label list for both the matrix and the dict so the
        # two stay aligned even if a predicted label is absent from ytest.
        labels = sorted(set(ytest) | set(ytestpred))
        conf_mat = confusion_matrix(ytest, ytestpred, labels=labels)
        self.labelConfidence = dict(
            zip(labels, conf_mat.diagonal() / (.1 + conf_mat.sum(axis=0))))
        return conf_mat

    def getConfidence(self, x, y):
        """Return the confidence stored for label ``y``, or -1.0 if unknown.

        ``x`` is accepted for interface compatibility and is not used.
        """
        try:
            return self.labelConfidence[y]
        except (AttributeError, KeyError, TypeError):
            # No confidence computed yet, unknown label, or unhashable label.
            return -1.0
def train_complement_naivebayes(params,
                                x_train,
                                y_train,
                                n_folds,
                                random_state,
                                stratified=True,
                                shuffle=True):
    """Cross-validate a ComplementNB model and print per-fold metrics.

    One model instance is created from ``params`` and refitted on every fold,
    so the returned model is the one fitted on the last fold's training part.

    :param params: keyword arguments for ``ComplementNB``
    :param x_train: training features; indexed with ``.iloc``
    :param y_train: training labels; indexed with ``.iloc``.
        NOTE(review): precision/recall are called with their defaults, which
        presumably requires binary labels - confirm.
    :param n_folds: number of cross-validation folds
    :param random_state: seed for shuffling; ignored when ``shuffle`` is False
    :param stratified: use ``StratifiedKFold`` instead of ``KFold``
    :param shuffle: whether to shuffle before splitting
    :return: the ComplementNB model as fitted on the last fold
    """
    # scikit-learn rejects a non-None random_state when shuffle is False, so
    # only forward the seed when it is actually used.
    split_seed = random_state if shuffle else None

    # Model and hyperparameter selection
    splitter_cls = StratifiedKFold if stratified else KFold
    kf = splitter_cls(n_splits=n_folds,
                      random_state=split_seed,
                      shuffle=shuffle)

    cnb_model = ComplementNB(**params)

    # Model Training
    for fold_number, (train_index, test_index) in enumerate(
            kf.split(x_train, y_train), start=1):
        # cross-validation splits train data into train and validation data
        print('\n Fold %d' % fold_number)

        x_train_cv, x_val_cv = x_train.iloc[train_index], x_train.iloc[
            test_index]
        y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[
            test_index]

        # refit the shared model on this fold's training part
        cnb_model.fit(x_train_cv, y_train_cv)

        # predict train and validation set and get eval metrics
        scores_cv = cnb_model.predict(x_train_cv)
        scores_val = cnb_model.predict(x_val_cv)

        # training evaluation
        train_pc = accuracy_score(y_train_cv, scores_cv)
        train_pp = precision_score(y_train_cv, scores_cv)
        train_re = recall_score(y_train_cv, scores_cv)
        print('\n train-Accuracy: %.6f' % train_pc)
        print(' train-Precision: %.6f' % train_pp)
        print(' train-Recall: %.6f' % train_re)

        # validation evaluation
        eval_pc = accuracy_score(y_val_cv, scores_val)
        eval_pp = precision_score(y_val_cv, scores_val)
        eval_re = recall_score(y_val_cv, scores_val)
        print('\n eval-Accuracy: %.6f' % eval_pc)
        print(' eval-Precision: %.6f' % eval_pp)
        print(' eval-Recall: %.6f' % eval_re)

    # return model for evaluation and prediction
    return cnb_model
Example #19
0
class Recommender(object):
    '''
    Wine recommender built from a TF-IDF vectorizer and two stacked
    classifiers: ComplementNB turns text features into class probabilities,
    and a RandomForestClassifier is trained on those probabilities.
    '''
    def __init__(self):
        self.nb = ComplementNB()
        self.rf = RandomForestClassifier()
        self.vecto = TfidfVectorizer()

    def _fit(self, data):
        '''
        Fit the vectorizer and both classifiers on the top 15 varieties.

        Parameters
        ----------
        data - The filepath to the data being fit.

        Returns
        ----------
        None
        '''
        top_varieties = Data_Handler(data).get_top_num(15)
        descriptions = top_varieties['description']
        varieties = top_varieties['variety']

        tfidf = self.vecto.fit_transform(descriptions)
        self.nb.fit(tfidf, varieties)

        # The forest learns from the Naive Bayes probabilities of the same
        # training rows.
        nb_probs = self.nb.predict_proba(tfidf)
        self.rf.fit(nb_probs, varieties)

    def predict(self, text):
        '''
        Run a single tasting note through the vectorizer and the stacked
        classifiers and return the five most probable varieties, most
        probable first.

        Parameters
        ----------
        text - str - The input tastings notes.

        Returns
        ----------
        top_five - The top five predicted varieties for recommendation,
                   taken from the forest's ``classes_``.
        '''
        tfidf = self.vecto.transform([text])
        nb_probs = self.nb.predict_proba(tfidf)
        rf_probs = self.rf.predict_proba(nb_probs)[0]
        # Indices sorted by descending probability; keep the first five.
        ranked = np.argsort(rf_probs)[::-1]
        return self.rf.classes_[ranked[:5]]
Example #20
0
def initFitNaiveBayes(xtrain, ytrain):
    """Create a ComplementNB with explicit settings, fit it and return it.

    Prints a completion message once fitting is done.
    """
    settings = {
        'alpha': 1.0,
        'class_prior': None,
        'fit_prior': True,
        'norm': False,
    }
    model = ComplementNB(**settings)
    model.fit(xtrain, ytrain)
    print("Naive Bayes Training: Done")
    return model
def CNB(train_x, train_y, test_x, test_y):  # print ComplementNB results
    """Fit ComplementNB and print its test-set predictions and accuracy.

    Predictions are printed in rows of 12 when their count is a positive
    multiple of 12 (120 predictions give the 10 x 12 layout); otherwise they
    are printed as a flat array instead of failing on the reshape.

    :param train_x: training features
    :param train_y: training labels
    :param test_x: test features
    :param test_y: test labels
    """
    cnb = ComplementNB()
    cnb.fit(train_x, train_y)
    pre_arr = cnb.predict(test_x)
    # A fixed reshape(10, 12) only works for exactly 120 test samples.
    if pre_arr.size and pre_arr.size % 12 == 0:
        pre_arr = pre_arr.reshape(-1, 12)

    print('ComplementNB의 테스트 세트 예측 :\n{}'.format(pre_arr))
    print('ComplementNB의 테스트 세트 정확도 : {0:0.2f}%'.format(
        cnb.score(test_x, test_y) * 100))
    print('------------------------------------------------------')
Example #22
0
def complement_bayes(train_data, test_data):
    """Fit ComplementNB on the selected feature columns and evaluate it.

    The label is the ``'state'`` column; features are the columns at
    positions ``FEATURES_INDICES``. Results are passed to ``evaluate``.
    """
    train_y = train_data['state']
    train_X = train_data.iloc[:, FEATURES_INDICES]
    test_y = test_data['state']
    test_X = test_data.iloc[:, FEATURES_INDICES]

    model = ComplementNB()
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    evaluate(model, test_X, test_y, predictions)
Example #23
0
def CNB_train(features, labels, ds):
    """
    Train a Complement Naive Bayes classifier (alpha=10.0) and save it as a
    pickle file named ``ds<ds>CNB_Classifier.pkl``.

    :param features: List of features from training set
    :param labels: List of labels from training set
    :param ds: Number of the dataset (int or str)
    """
    CNB_Classifier = ComplementNB(alpha=10.0)
    CNB_Classifier.fit(features, labels)
    # Format instead of concatenating so a numeric ds does not raise
    # TypeError; the result is unchanged for string values.
    save_classifier(CNB_Classifier, "ds{}CNB_Classifier.pkl".format(ds))
Example #24
0
def complementNB(tr_vec, tr_ans, val_vec, val_ans, te_vec):
    """Fit ComplementNB, print its validation score, and label the test set.

    :return: integer array, 1 where the predicted value exceeds 0.35 and 0
        elsewhere
    """
    from sklearn.naive_bayes import ComplementNB

    model = ComplementNB()
    model.fit(tr_vec, tr_ans)
    print(model.score(val_vec, val_ans))

    print('make predictions ...')
    predicted = model.predict(te_vec)
    # NOTE(review): predict() yields class labels, not probabilities, so the
    # 0.35 threshold looks like a leftover from a predict_proba variant.
    return (predicted > 0.35).astype(int)
Example #25
0
def train_naive_bayes(list_of_vector_label_pairs: list, binary_classification=False, alpha=1.0, norm=False):
    """Builds and trains a Naive Bayes classifier on a list of (vector, label) tuples.

    :param list_of_vector_label_pairs: training data as (vector, label) pairs
    :param binary_classification: forwarded to ``split_data_pairs``
    :param alpha: smoothing parameter passed to ``ComplementNB``
    :param norm: ``norm`` flag passed to ``ComplementNB``
    :return: the fitted ComplementNB classifier
    """
    # Repackage the data for training the classifier
    list_of_vector_tuples, list_of_labels = split_data_pairs(list_of_vector_label_pairs, binary_classification)
    # Make and fit the classifier. The arguments used to be ignored in favour
    # of hard-coded alpha=1.0 and norm=True; they are now honoured.
    classifier = ComplementNB(alpha=alpha, norm=norm)
    print("Please wait, training the Naive Bayes classifier now. . .")
    classifier.fit(list_of_vector_tuples, list_of_labels)
    print("Training complete.")
    return classifier
Example #26
0
def get_accuracy_of_selection(X, y):
    """Estimate ComplementNB accuracy for a feature selection.

    Runs a shuffled, stratified 25-fold cross-validation. In each fold the
    class prior is the relative frequency of labels 0 and 1 in the training
    part and is passed to ComplementNB as ``class_prior``.

    :param X: feature matrix, indexable by an array of row indices
    :param y: labels, indexable the same way; each label is used as an index
        into a two-element count list via ``int(label)``
    :return: tuple ``(mean, sd)`` of the per-fold accuracies
    """
    splitter = StratifiedKFold(n_splits=25, shuffle=True, random_state=None)

    fold_accuracies = []
    for train_index, test_index in splitter.split(X, y):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        # Empirical class frequencies of the training part.
        counts = [0, 0]
        for label in y_train:
            counts[int(label)] += 1
        n_observations = counts[0] + counts[1]
        priors = [counts[0] / n_observations, counts[1] / n_observations]

        model = ComplementNB(class_prior=priors)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Fraction of test samples predicted correctly in this fold.
        n_correct = (y_test == y_pred).sum()
        fold_accuracies.append(n_correct / y_test.size)

    return np.mean(fold_accuracies), np.std(fold_accuracies)
Example #27
0
class NBClassifier(super.abstract_classifier):
    """Classifier backed by a ComplementNB instance.

    The training data is stored at construction; ``train`` must be called
    before ``classify``.
    """

    def __init__(self, train_features, train_labels):
        """Keep the training data and create an unfitted ComplementNB."""
        self.nb_Member = ComplementNB()
        self.train_features = train_features
        self.train_labels = train_labels

    def train(self):
        """Fit the model; afterwards it is ready to classify."""
        features, labels = self.train_features, self.train_labels
        self.nb_Member.fit(features, labels)

    def classify(self, newVector):
        """Return the model's prediction for ``newVector``."""
        prediction = self.nb_Member.predict(newVector)
        return prediction
Example #28
0
def naive_bayes(x, y):
    """Fit ComplementNB and MultinomialNB on ``(x, y)`` and predict ``x``.

    Both models are evaluated on the data they were trained on.

    :return: tuple ``(complement_predictions, multinomial_predictions)``
    """
    complement_model = ComplementNB()
    multinomial_model = MultinomialNB()

    complement_predictions = complement_model.fit(x, y).predict(x)
    multinomial_predictions = multinomial_model.fit(x, y).predict(x)

    return complement_predictions, multinomial_predictions
 def naive_bayes(self, name="Train_Test"):
     """Fit ComplementNB on a 60/40 split of ``self.X``/``self.Y`` and log scores.

     The held-out 40% is predicted, scored with ``self.nbeval`` and the three
     returned values are logged at info level, prefixed with ``name``.
     """
     split = train_test_split(self.X,
                              self.Y,
                              test_size=0.4,
                              random_state=0)
     X_train, X_test, y_train, y_test = split

     model = ComplementNB()
     model.fit(X_train, y_train)
     predicted = model.predict(X_test)

     f, p, r = self.nbeval(y_test, predicted)
     self.logger.info(
         "{}: F score:{:.3f}\tP score:{:.3f}\tR score:{:.3f}.".format(
             name, f, p, r))
Example #30
0
def NB_accuracy_complement(X_train, X_test, y_train, y_test, fold):
    """Fit ComplementNB and print hold-out and cross-validation metrics.

    Prints the test accuracy, classification report, confusion matrix and
    mean squared error, followed by the mean cross-validated score (as a
    percentage) on the training data.

    :param X_train: training features
    :param X_test: test features
    :param y_train: training labels
    :param y_test: test labels
    :param fold: passed to ``cross_val_score`` as ``cv``
    """
    cnb = ComplementNB()
    cnb.fit(X_train, y_train)

    y_pred = cnb.predict(X_test)

    # The accuracy used to be computed and discarded; report it.
    print("accuracy_score: ", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print("mean_squared_error: ", mean_squared_error(y_test, y_pred))

    results = cross_val_score(cnb, X_train, y_train, cv=fold)
    # One score per fold, so len(results) is the real fold count; the message
    # no longer claims 5 folds regardless of `fold`.
    print("After {}-fold: ".format(len(results)), results.mean()*100)