def test_integrated_plot_numpy_named_arrays(self):
        model = naive_bayes.MultinomialNB()

        X = np.array([
            (1.1, 9.52, 1.23, 0.86, 7.89, 0.13),
            (3.4, 2.84, 8.65, 0.45, 7.43, 0.16),
            (1.2, 3.22, 6.56, 0.24, 3.45, 0.17),
            (3.8, 6.18, 2.45, 0.28, 2.53, 0.13),
            (5.1, 9.12, 1.06, 0.19, 1.43, 0.13),
            (4.4, 8.84, 4.97, 0.98, 1.35, 0.13),
            (3.2, 3.22, 5.03, 0.68, 3.53, 0.32),
            (7.8, 2.18, 6.87, 0.35, 3.25, 0.38),
        ],
                     dtype=[('a', '<f8'), ('b', '<f8'), ('c', '<f8'),
                            ('d', '<f8'), ('e', '<f8'), ('f', '<f8')])

        y = np.array([1, 1, 0, 1, 0, 0, 1, 0])

        visualizer = DecisionBoundariesVisualizer(model, features=['a', 'f'])
        visualizer.fit_draw_poof(X, y=y)
        self.assertEqual(visualizer.features_, ['a', 'f'])
        self.assert_images_similar(visualizer)
Example #2
def count_vector(train_x, valid_x, train_y, valid_y):
    count_vect = CountVectorizer(analyzer='word',
                                 lowercase=False,
                                 token_pattern=r'\w{1,}')
    # trainDF is assumed to be a module-level DataFrame holding the full text corpus
    count_vect.fit(trainDF['text'])
    xtrain_count = count_vect.transform(train_x)
    xvalid_count = count_vect.transform(valid_x)
    accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y,
                           xvalid_count)

    accuracy1 = train_model(linear_model.LogisticRegression(), xtrain_count,
                            train_y, xvalid_count)

    accuracy2 = train_model(svm.SVC(), xtrain_count, train_y, xvalid_count)

    accuracy3 = train_model(ensemble.RandomForestClassifier(), xtrain_count,
                            train_y, xvalid_count)

    accuracy4 = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(),
                            train_y, xvalid_count.tocsc())

    return accuracy, accuracy1, accuracy2, accuracy3, accuracy4
Example #3
class NB_pipelined:

    p = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', naive_bayes.MultinomialNB()),
    ])

    gs_clf = None

    def fit(self, train, y):
        text_clf = self.p.fit(train, y)
        parameters = {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__alpha': (1e-2, 1e-3),
        }
        self.gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
        self.gs_clf = self.gs_clf.fit(train, y)
        return self.gs_clf

    def predict(self, x):
        return self.gs_clf.predict(x)

    def save(self, p):
        if '.pk1' not in p:
            path = p + '.pk1'
        else:
            path = p
        with open(path, 'wb') as output:
            pickle.dump(self.gs_clf, output, pickle.HIGHEST_PROTOCOL)

    def load(self, p):
        if '.pk1' not in p:
            path = p + '.pk1'
        else:
            path = p
        with open(path, 'rb') as infile:
            self.gs_clf = pickle.load(infile)
Example #4
def test():
    train_x, test_x, train_y, test_y = load_data()
    alphas = np.logspace(-2, 5, num=200)
    train_scores = []
    test_scores = []
    for alpha in alphas:
        cls = naive_bayes.MultinomialNB(alpha=alpha)
        cls.fit(train_x, train_y)
        train_scores.append(cls.score(train_x, train_y))
        test_scores.append(cls.score(test_x, test_y))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, train_scores)
    ax.plot(alphas, test_scores)
    ax.legend(['Training Score', 'Testing Score'], loc='lower right')
    ax.set_xlabel("alpha")
    ax.set_ylabel("score")
    ax.set_ylim(0, 1.0)
    ax.set_title("MultinomialNB")
    ax.set_xscale("log")
    plt.show()
Example #5
def tf_idf_word(train_x, valid_x, train_y, valid_y):
    tfidf_vect = TfidfVectorizer(analyzer='word',
                                 token_pattern=r'\w{1,}',
                                 max_features=5000)
    tfidf_vect.fit(trainDF['text'])
    xtrain_tfidf = tfidf_vect.transform(train_x)
    xvalid_tfidf = tfidf_vect.transform(valid_x)
    accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y,
                           xvalid_tfidf)
    #print("NB, WordLevel TF-IDF: ", accuracy)
    accuracy1 = train_model(linear_model.LogisticRegression(), xtrain_tfidf,
                            train_y, xvalid_tfidf)
    #print("LR, WordLevel TF-IDF: ", accuracy1)
    accuracy2 = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
    #print("SVM, WordLevel TF-IDF: ", accuracy2)
    accuracy3 = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf,
                            train_y, xvalid_tfidf)
    #print("RF, WordLevel TF-IDF: ", accuracy3)
    accuracy4 = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(),
                            train_y, xvalid_tfidf.tocsc())
    #print("Xgb, WordLevel TF-IDF: ", accuracy4)
    return accuracy, accuracy1, accuracy2, accuracy3, accuracy4
Example #6
def model_builder():
    labels, texts = [], []
    #reading good and Bad dataset file
    with open("Dataset//BadWords.txt") as fp:
        data = fp.readlines()
        for abc in data:
            labels.append("0")
            texts.append(abc)
    with open("Dataset//Goodwords.txt") as fp:
        data = fp.readlines()
        for abc in data:
            labels.append("1")
            texts.append(abc)
    trainDF = pandas.DataFrame()
    trainDF['text'] = texts
    trainDF['label'] = labels

    # split the dataset into training and validation datasets
    train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
        trainDF['text'], trainDF['label'])

    # label encode the target variable
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.transform(valid_y)  # reuse the encoder fitted on the training labels
    # characters level tf-idf
    tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char',
                                             token_pattern=r'\w{1,}',
                                             ngram_range=(2, 3),
                                             max_features=5000)
    tfidf_vect_ngram_chars.fit(trainDF['text'])
    xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
    xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)

    accuracy = train_model(valid_y, naive_bayes.MultinomialNB(),
                           xtrain_tfidf_ngram_chars, train_y,
                           xvalid_tfidf_ngram_chars)

    return str(accuracy)
Example #7
def __available_classifiers():
    available_clfs = dict()
    # features of all available classifiers
    Classifier = collections.namedtuple('Classifier', [
        'idf', 'full_name', 'function_call', 'scaling_possible',
        'predict_proba', 'numeric_labels'
    ])
    available_clfs["svm"] = Classifier("svm", "Support Vector Machine",
                                       svm.SVC(probability=True), True, True,
                                       False)
    available_clfs["svm_gs1"] = Classifier(
        "svm", "Co-best SVM according to Skll Grid Search",
        svm.SVC(probability=True,
                kernel="sigmoid",
                C=0.1,
                coef0=0.01,
                gamma=0.01), True, True, False)
    available_clfs["svm_gs2"] = Classifier(
        "svm", "Co-best SVM according to Skll Grid Search",
        svm.SVC(probability=True,
                kernel="sigmoid",
                C=0.01,
                coef0=0.01,
                gamma=0.0), True, True, False)
    available_clfs["mnb"] = Classifier(
        "mnb", "Multinomial Naive Bayes", naive_bayes.MultinomialNB(), False,
        True, False
    )  # MNB can't do default scaling: ValueError: Input X must be non-negative
    available_clfs["knn"] = Classifier("knn", "k Nearest Neighbour",
                                       neighbors.KNeighborsClassifier(), True,
                                       True,
                                       False)  # knn can do feature scaling
    available_clfs["raf"] = Classifier(
        "raf", "Random Forest",
        ensemble.RandomForestClassifier(n_estimators=15,
                                        max_depth=5,
                                        oob_score=True), True, True, False)
    return available_clfs
Example #8
def run(fold):
    df = pd.read_csv("../inputs/IMDB_Dataset-folds.csv")

    # hold out the current fold for validation
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # bag-of-words features fitted on the training split only
    count_vec = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
    count_vec.fit(df_train.review)
    x_train = count_vec.transform(df_train.review)
    x_valid = count_vec.transform(df_valid.review)

    model = naive_bayes.MultinomialNB()
    model.fit(x_train, df_train.sentiment.values)

    acc = model.score(x_valid, df_valid.sentiment.values)
    print(f"Fold: {fold}, Accuracy: {acc}")
Example #9
    def __init__(self, trainDF):
        super().__init__()
        prePro = PreProcessor()
        self.pf = PlotFunctions()
        self.trainDF = trainDF
        self.X_train, self.X_test, self.y_train, self.y_test = \
            prePro.split_train_test(trainDF['cleaned_sentence'], trainDF['classification'], 0.4)
        self.X_test, self.X_cross, self.y_test, self.y_cross = \
            prePro.split_train_test(self.X_test, self.y_test, 0.5)

        self.all_scores = list()
        self.models = {
            'MultinomialNB':
            naive_bayes.MultinomialNB(alpha=0.767,
                                      class_prior=None,
                                      fit_prior=True),
            'ComplementNB':
            naive_bayes.ComplementNB(alpha=0.767,
                                     class_prior=None,
                                     fit_prior=True),
            'LogisticRegression':
            linear_model.LogisticRegression(solver='lbfgs')
        }
Example #10
def train_model_write(input_dataset, train_model_path, payload_col_name,
                      payload_label):

    #print(''+train_model_path)
    trainDF = load_cvs_dataset(input_dataset)
    txt_label = trainDF[payload_label]
    txt_text = trainDF[payload_col_name]
    model_input = count_vectorizer(txt_text, txt_label)
    naive = naive_bayes.MultinomialNB()
    accuracy = train_model(naive, model_input[0], model_input[1],
                           model_input[2], model_input[3])
    dirs = os.listdir(train_model_path)
    file_no = len(dirs)
    pickle.dump(
        naive,
        open(
            str(train_model_path) + "text_classifier-" + str(file_no) +
            ".pickle", "wb"))
    pickle.dump(
        model_input[4],
        open(
            str(train_model_path) + "tfidf-" + str(file_no) + ".pickle", "wb"))
    return accuracy * 100
Example #11
    def context_search(documents, ids, query):

        global ready_states, context_ids
        docs_new = [query]

        text_clf = pipeline.Pipeline([
            ('vect', feature_extraction.text.CountVectorizer()),
            ('tfidf', feature_extraction.text.TfidfTransformer()),
            ('clf', naive_bayes.MultinomialNB()),
        ])

        for i in range(1, 3):
            if len(documents) > 0 and len(ids) > 0 and len(documents) == len(ids):
                text_context = text_clf.fit(documents, ids)
                text_id = text_context.predict(docs_new)
                found_id = int(text_id[0])  # predict() returns a one-element array here
                if found_id not in context_ids:
                    context_ids.append(found_id)
                index = ids.index(found_id)
                del documents[index]
                del ids[index]

        ready_states.append(True)
Example #12
def train_sentiment_classifier(trainingtext):
    '''
    Trains a naive Bayes classifier on annotated sentiment data.
    Parameters:
        - trainingtext (.csv/.txt), needs to be annotated
    '''
    df = pd.read_csv(trainingtext, sep='\t', names=['liked', 'txt'])
    # vectorize words
    stopset = set(stopwords.words('english'))
    vectorizer = TfidfVectorizer(use_idf=True,
                                 lowercase=True,
                                 strip_accents='ascii',
                                 stop_words=stopset)
    # target
    y = df.liked
    # samples
    X = vectorizer.fit_transform(df.txt)
    # split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # train the naive bayes classifier
    clf = naive_bayes.MultinomialNB()
    clf.fit(X_train, y_train)
    return clf
Example #13
def bayes_model(model_type='m'):
    from sklearn import naive_bayes
    if model_type == 'b':
        model = naive_bayes.BernoulliNB(alpha=1.0,
                                        binarize=0.0,
                                        fit_prior=True,
                                        class_prior=None)
    elif model_type == 'g':
        model = naive_bayes.GaussianNB()  # Gaussian naive Bayes
    else:
        model = naive_bayes.MultinomialNB(alpha=1.0,
                                          fit_prior=True,
                                          class_prior=None)
    """
    文本分类问题常用MultinomialNB
    参数
    ---
        alpha:平滑参数
        fit_prior:是否要学习类的先验概率;false-使用统一的先验概率
        class_prior: 是否指定类的先验概率;若指定则不能根据参数调整
        binarize: 二值化的阈值,若为None,则假设输入由二进制向量组成
    """
    return model
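
A minimal usage sketch for the factory above; the toy count matrix and labels below are illustrative, not from the original source:

import numpy as np

# rows are documents, columns are word counts
X = np.array([[2, 1, 0],
              [3, 0, 1],
              [0, 2, 4],
              [1, 0, 5]])
y = np.array([0, 0, 1, 1])

clf = bayes_model('m')  # MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
clf.fit(X, y)
print(clf.predict(np.array([[1, 1, 0]])))  # predicted class for a new document
print(clf.class_log_prior_)                # learned log class priors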
Example #14
def analyze(stop_words_):
    if stop_words_ == "":
        vectorizer = feature_extraction.text.CountVectorizer(stop_words=None)
    else:
        vectorizer = feature_extraction.text.CountVectorizer(stop_words=stop_words_)

    X = vectorizer.fit_transform(data['title_reviews_combo'])

    x_train, x_test, y_train, y_test = model_selection.train_test_split(X, data['Recommended IND'])

    NB = naive_bayes.MultinomialNB()
    NB.fit(x_train, y_train)

    y_predict = NB.predict(x_test)
    unique, counts = np.unique(y_predict, return_counts=True)
    label_counts = dict(zip(unique, counts))  # distribution of predicted labels (not used further)

    score_train = NB.score(x_train, y_train)
    score_test = NB.score(x_test, y_test)
    recall_test = metrics.recall_score(y_test, NB.predict(x_test))
    precision_test = metrics.precision_score(y_test, NB.predict(x_test))

    return score_train, score_test, recall_test, precision_test
Example #15
def my_MultinomialNB_alpha(*data):
    train_x, test_x, train_y, test_y = data
    alphas = np.logspace(-2, 5, num=200)
    train_scores = []
    test_scores = []
    for alpha in alphas:
        cls = naive_bayes.MultinomialNB(alpha=alpha)
        cls.fit(train_x, train_y)
        train_scores.append(cls.score(X=train_x, y=train_y))
        test_scores.append(cls.score(X=test_x, y=test_y))

    # plot the scores against alpha
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, train_scores, label="Training Score")
    ax.plot(alphas, test_scores, label="Testing Score")
    ax.set_xlabel(r"$\alpha$")
    ax.set_xscale("log")
    ax.set_ylabel("score")
    ax.set_ylim(0, 1.0)
    ax.set_title("MultinomialNB")
    ax.legend(loc="best")
    plt.show()
Example #16
def test_MultinomialNB_alpha():
    alphas = np.logspace(-2, 5, num=20)
    train_score = []
    test_score = []

    for alpha in alphas:
        cls = naive_bayes.MultinomialNB(alpha=alpha)
        cls.fit(X_train, y_train)

        train_score.append(cls.score(X_train, y_train))
        test_score.append(cls.score(X_test, y_test))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, train_score, label="training score")
    ax.plot(alphas, test_score, label="testing score")
    ax.set_xlabel(r'$\alpha$')
    ax.set_ylabel('score')
    # ax.set_ylim(0, 1.0)
    ax.legend(loc='best')  # the line labels only appear once legend() is called
    ax.set_title("MultinomialNB")
    ax.set_xscale('log')
    plt.show()
Example #17
def execute_other_model(X_train, y_train, X_test, y_test):
    print('\n\033[1m\033[94m Accuracy Of Other Models: \033[0m\n')
    print(symptoms_name)

    user_input_preds = []
    models_accuracy.clear()
    user_input_result_val.clear()
    user_input_result_val.append(user_input)

    # Train each model one by one and get result accordingly
    prediction, accuracy, user_input_preds = train_model_linear_regression(
        X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[0], accuracy, user_input_preds)

    prediction, accuracy, user_input_preds = train_model_others(
        KNeighborsRegressor(n_neighbors=5), X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[1], accuracy, user_input_preds)

    prediction, accuracy, user_input_preds = train_model_others(
        svm.SVC(), X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[2], accuracy, user_input_preds)

    prediction, accuracy, user_input_preds = train_model_others(
        linear_model.LogisticRegression(), X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[3], accuracy, user_input_preds)

    prediction, accuracy, user_input_preds = train_model_others(
        ensemble.RandomForestClassifier(), X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[4], accuracy, user_input_preds)

    prediction, accuracy, user_input_preds = train_model_others(
        xgboost.XGBClassifier(), X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[5], accuracy, user_input_preds)

    prediction, accuracy, user_input_preds = train_model_others(
        naive_bayes.MultinomialNB(), X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[6], accuracy, user_input_preds)
Example #18
def main():
    data_column = 'text'
    processed_column = 'text_final'
    target = 'target'

    print("Preprocessing...")
    Corpus = sentence_preprocessing(pd.read_csv('../data/disaster-tweets.csv'),
                                    data_column, processed_column)
    Vectorizers = [TfidfVectorizer(max_features=5000), CountVectorizer()]
    Vectorizer_Columns = ["tfidf", "count"]
    Models = [
        naive_bayes.MultinomialNB(),
        svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto'),
        LogisticRegression()
    ]
    Accuracies = []

    print("Splitting data...")
    Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
        Corpus[processed_column], Corpus[target], test_size=0.3)
    for Model in Models:
        current_accuracies = []
        for index, Vectorizer in enumerate(Vectorizers):
            Corpus[Vectorizer_Columns[index]] = Corpus[processed_column]
            print("Vectorizing", "...")
            (Train_X_Vectorized,
             Test_X_Vectorized) = vectorize(Vectorizer, Corpus,
                                            Vectorizer_Columns[index], Train_X,
                                            Test_X)
            print("Generating predictions", "...")
            score = generate_predictions(Model, Train_X_Vectorized,
                                         Test_X_Vectorized, Train_Y, Test_Y)
            print(Model, " with vectorizer ", Vectorizer_Columns[index],
                  " Accuracy Score -> ", score)
            current_accuracies.append(round(score, 2))
        Accuracies.append(current_accuracies)
    print("Accuracies: ", Accuracies)
Example #19
def predict_model(train_df, test_df):
    train_data, train_y, test_data, test_y = data_processing(train_df, test_df)
    cv = CountVectorizer()
    train_tfmat = cv.fit_transform(train_data)

    tf = TfidfTransformer()
    train_x = tf.fit_transform(train_tfmat)

    test_tfmat = cv.transform(test_data)
    test_x = tf.transform(test_tfmat)

    model_nb = nb.MultinomialNB()
    model_lr = LogisticRegression()
    model_nn = MLPClassifier(hidden_layer_sizes=(100, 100), early_stopping=True)

    model_names = ['NN', 'NB', 'LR']
    models = [model_nn, model_nb, model_lr]

    for i, clf in enumerate(models):
        print("Model {}: {}".format(i + 1, model_names[i]))
        clf.fit(train_x, train_y)
        y_pred = clf.predict(test_x)
        result = eval_model(test_y, y_pred)
        print(result)
Example #20
def train_u():
    vu = TfidfVectorizer(use_idf=False, lowercase=False)
    doc = pd.read_csv(file_to_read, sep='\t', names=['review', 'sentiment'])

    class_categ_u = doc.sentiment  # positive and negative classes
    token_u = vu.fit_transform(doc.review)

    token_u_train, token_u_test, class_u_train, class_u_test = train_test_split(token_u, class_categ_u, random_state=40)

    # training the naive bayes classifier
    naive_train_u = naive_bayes.MultinomialNB()
    naive_train_u.fit(token_u_train, class_u_train)

    # training the logistic regression classifier
    log_train_u = LogisticRegression(penalty='l2', C=1)
    log_train_u.fit(token_u_train, class_u_train)

    print("Logistic Regression classifier accuracy with unnormalized data is %2.2f"
          % accuracy_score(class_u_test, log_train_u.predict(token_u_test)))

    print("Naive Bayes classifier accuracy with unnormalized data is %2.2f"
          % roc_auc_score(class_u_test, naive_train_u.predict(token_u_test)))

    return naive_train_u, log_train_u, vu
Example #21
def main():

    tic = time.time()
    vectorizer = text.CountVectorizer(ngram_range=(1, 1))
    X_train, y_train, X_test, y_test = load_text_data(sys.argv[1], sys.argv[2],
                                                      vectorizer)

    print("\nLOGISTIC REGRESSION CLASSIFIER")
    model_results(linear_model.LogisticRegression(), X_train, y_train, X_test,
                  y_test)

    print("\n\nNAIVE BAYES CLASSIFIER")
    model_results(naive_bayes.MultinomialNB(), X_train, y_train, X_test,
                  y_test)

    #print("\n\nK-NEIGHBORS CLASSIFIER")
    #model_results(neighbors.KNeighborsClassifier(), X_train, y_train, X_test, y_test)

    print("\n\nLINEAR SVC CLASSIFIER")
    model_results(svm.LinearSVC(), X_train, y_train, X_test, y_test)

    toc = time.time()
    print("\n" + str(int((toc - tic) // 60)) + "m " +
          str(int(toc - tic) % 60) + "s")
Example #22
def nb_classifier(X, y, sw=False, checkpoint=True):
    # stopwords
    stop_words = set(stopwords.words('english')) if sw else None
    # initialize model to vectorize
    vec = TfidfVectorizer(lowercase=True,
                          use_idf=True,
                          norm=None,
                          smooth_idf=False,
                          analyzer='word',
                          input='content',
                          stop_words=stop_words,
                          min_df=10,
                          max_features=20000)
    # initialize
    mnb_clf = naive_bayes.MultinomialNB()
    # Pipeline
    vec_nb = Pipeline([('vectorize', vec), ('mnb', mnb_clf)])
    # fit model
    vec_nb.fit(X, y)
    # save model
    if checkpoint:
        filename = '/Mining_The_Social_Web/models/nbtfidf.sav'
        joblib.dump(vec_nb, filename)
    return vec_nb
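
If the checkpoint branch above was taken, the saved pipeline can be restored with joblib and reused; a small sketch (the sample document is illustrative only):

import joblib

# load the vectorizer + MultinomialNB pipeline persisted by nb_classifier()
vec_nb = joblib.load('/Mining_The_Social_Web/models/nbtfidf.sav')
print(vec_nb.predict(["an example document to classify"]))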
Example #23
def build_NB_classifier(X_training, y_training):
    '''  
    Build a Naive Bayes classifier based on the training set X_training, 
     y_training, optimized for the hyperparameters passed.

    @param 
        X_training: X_training[i,:] is the ith example
        y_training: y_training[i] is the class label of X_training[i,:]

    @return
        nbc: the naive bayes classifier built in this function
        results: the dict of scores returned by cross validation, since 
            GridSearchCV would also return this but it cannot be used for 
            NB with no hyperparameter to optimize, and CV must be done before
            fitting takes place (and fitting happens here)
    '''
    
    print_clf_intro("NAIVE BAYES")
    
    # Instantiate a Multinomial NB classifier.
    nbc = naive_bayes.MultinomialNB()
    
    # Perform cross validation and store results. 
    results = model_selection.cross_validate(nbc, X_training, y_training, 
                                             return_train_score=True,
                                             scoring=['accuracy', 
                                                      'precision', 
                                                      'roc_auc', 
                                                      'recall',
                                                      'f1'])
    
    # Fit the data with X-training.
    nbc.fit(X_training, y_training)
    
    # Return the classifier object and CV results. 
    return nbc, results
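
The results dict returned by model_selection.cross_validate is keyed by the scoring names requested above (plus fit and score times), so a caller can summarise it roughly as follows; this sketch assumes X_training and y_training are already in scope:

nbc, results = build_NB_classifier(X_training, y_training)
print(sorted(results.keys()))            # fit_time, score_time, test_accuracy, train_accuracy, ...
print(results['test_accuracy'].mean())   # mean accuracy across the CV folds
print(results['test_roc_auc'].mean())    # mean ROC AUC across the CV folds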
Example #24
    def naive_bayes(self):

        from sklearn.model_selection import GridSearchCV

        parameters = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

        clf_log = naive_bayes.MultinomialNB()
        clf_log = GridSearchCV(clf_log, parameters, cv=5)

        accuracy, f1 = self.train_model(clf_log, self.train_x_count,
                                        self.train_y, self.valid_x_count)
        print("Logistic Regression (Count Vectors)", accuracy, f1)

        accuracy, f1 = self.train_model(clf_log, self.xtrain_tfidf,
                                        self.train_y, self.xvalid_tfidf)
        print("Logistic Regression)", accuracy, f1)

        accuracy, f1 = self.train_model(clf_log, self.xtrain_tfidf_ngram,
                                        self.train_y, self.xvalid_tfidf_ngram)
        print("Logistic Regression (TDIDF-ngram)", accuracy, f1)

        print("Best parameters! -", clf_log.best_params_)

        return clf_log
Example #25
def NBunnormalized(newfile):
    #reading file
    filename = "amazon_cells_labelled.txt"
    folder = pd.read_csv(filename, sep="\t", names=["docs", "class"])

    #Convert a collection of raw documents to a matrix of TF-IDF features.
    Vectwords = TfidfVectorizer(use_idf=False,
                                lowercase=False,
                                strip_accents="ascii")
    y = folder['class']

    #Learn vocabulary and idf, return term-document matrix.
    x = Vectwords.fit_transform(folder.docs)

    #Training data using given trained files
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=40)

    clf = nb.MultinomialNB()
    clf.fit(x_train, y_train)
    score = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
    predict = clf.predict(x_test)
    acc = accuracy_score(y_test, predict)

    #testing given file
    classifier = np.array(newfile)
    classifier_vect = Vectwords.transform(classifier)
    pre = clf.predict(classifier_vect)
    #print(pre)
    print("Accuracy: ", acc * 100)
    print("Score :", score)

    file = open("results-nb-u.txt", "a")
    for i in pre:
        print(i)
        file.write(str(i) + "\n")
    file.close()
Example #26
def bayes_predict(tr_smp, tr_lb, test_smp, all_wrd):
    Train_X = tr_smp
    Train_Y = tr_lb
    Test_X = test_smp

    # TF-IDF method
    Tfidf_vect = TfidfVectorizer(max_features=5000,
                                 strip_accents='unicode',
                                 ngram_range=(1, 3),
                                 max_df=0.9,
                                 min_df=5,
                                 sublinear_tf=True)
    Tfidf_vect.fit(all_wrd)

    Train_X = Tfidf_vect.transform(Train_X)
    Test_X = Tfidf_vect.transform(Test_X)

    # Normalization
    Train_X = normalize(Train_X, axis=1, norm='l1')
    Test_X = normalize(Test_X, axis=1, norm='l1')

    # Standardization
    #scaler = preprocessing.Normalizer()
    scaler = preprocessing.RobustScaler(quantile_range=(0.1, 0.9),
                                        with_centering=False)
    Train_X = scaler.fit_transform(Train_X)
    Test_X = scaler.transform(Test_X)  # reuse the scaler fitted on the training data

    model = naive_bayes.MultinomialNB(alpha=0.0001)
    model.fit(Train_X, Train_Y)

    # predict the labels on validation dataset
    predictions = model.predict(Test_X)
    predictions = make_ints(predictions)

    return predictions
Example #27
        train_vec = scaler.fit_transform(train_vec)
        test_vec = scaler.fit_transform(test_vec)

# Model training
if training_model == 'RF' or training_model == "BT":
    # Initialize the random Forest or bagged tree based the model chosen
    rfc = RFC(n_estimators=100,
              oob_score=True,
              max_features=(None if training_model == "BT" else "auto"))
    print("Training %s" %
          ("Random Forest" if training_model == "RF" else "bagged tree"))
    rfc = rfc.fit(train_vec, train_data.sentiment)
    print("OOB Score = ", rfc.oob_score_)  # oob_score_ is the fitted out-of-bag accuracy
    pred = rfc.predict(test_vec)
elif training_model == "NB":
    nb = naive_bayes.MultinomialNB()
    cv_score = cross_val_score(nb, train_vec, train_data.sentiment, cv=10)
    print("Training Naive Bayes")
    print("cv score = ", cv_score.mean())
    nb = nb.fit(train_vec, train_data.sentiment)
    pred = nb.predict(test_vec)

elif training_model == 'SVM':
    svc = svm.LinearSVC()
    param = {
        'C': [1e15, 1e13, 1e11, 1e9, 1e7, 1e5, 1e3, 1e1, 1e-1, 1e-3, 1e-5]
    }
    print("Training SVM")
    svc = GridSearchCV(svc, param, cv=10)
    svc = svc.fit(train_vec, train_data.sentiment)
    pred = svc.predict(test_vec)
                    is_neural_net=False):
        # fit the training dataset on the classifier
        classifier.fit(feature_vector_train, label)

        # predict the labels on validation dataset
        predictions = classifier.predict(feature_vector_valid)

        if is_neural_net:
            predictions = predictions.argmax(axis=-1)

        return metrics.accuracy_score(predictions, self.valid_y)


if __name__ == '__main__':
    start = TextProcessing()
    result = start.train_model(naive_bayes.MultinomialNB(), start.xtrain_count,
                               start.train_y, start.xvalid_count)
    print("naive_bayes, Count Vectors: ", result)
    result = start.train_model(naive_bayes.MultinomialNB(), start.xtrain_tfidf,
                               start.train_y, start.xvalid_tfidf)
    print("naive_bayes, WordLevel TF-IDF: ", result)
    print(
        '-------------------------------------------------------------------------------------------------------'
    )
    result = start.train_model(linear_model.LogisticRegression(),
                               start.xtrain_count, start.train_y,
                               start.xvalid_count)
    print("LogisticRegression, Count Vectors: ", result)
    result = start.train_model(linear_model.LogisticRegression(),
                               start.xtrain_tfidf, start.train_y,
                               start.xvalid_tfidf)
        current_features = np.zeros(len(partial_mapping))

        for instr in json_data['instructions']:
            mnemonic = instr.split(" ")[0].rstrip()

            current_features[
                partial_mapping[mnemonic] if mnemonic in
                partial_mapping else partial_mapping['<UNK>']] += 1

        train_x.append(current_features)
        train_y_opt.append(json_data['opt'])
        train_y_cmp.append(json_data['compiler'])

    train_input_file.close()

    opt_model = naive_bayes.MultinomialNB()
    opt_model.fit(train_x, train_y_opt)

    cmp_model = naive_bayes.MultinomialNB()
    cmp_model.fit(train_x, train_y_cmp)

    test_path = 'datasets/test_dataset_blind.jsonl'
    output_path = '1711234.csv'

    test_input_file = open(test_path, mode='r')
    output_file = open(output_path, mode='w')

    for json_line in test_input_file:
        json_data = json.loads(json_line)
        current_features = np.zeros(len(partial_mapping))
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model