def train_model(X, y_train):
    """
    Train the Multinomial Naive Bayes model.
    :param X: feature matrix
    :param y_train: training labels
    """
    model_NB = MNB()
    model_NB.fit(X, y_train)  # feed the feature matrix in directly
    # Defaults: MNB(alpha=1.0, class_prior=None, fit_prior=True)
    score = np.mean(
        cross_val_score(model_NB, X, y_train, cv=20, scoring='roc_auc'))
    logging.info(f'Multinomial Naive Bayes 20-fold CV score: {score}')
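
# Minimal usage sketch with toy data (hypothetical, not from the original;
# the 20-fold CV above needs at least 20 samples per class):
from sklearn.feature_extraction.text import CountVectorizer
_texts = ["good film great plot"] * 20 + ["bad film awful plot"] * 20
_y = [1] * 20 + [0] * 20
train_model(CountVectorizer().fit_transform(_texts), _y)
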
def main():

    # import the Preprocessing class from preprocess
    from preprocess import Preprocessing
    prepro = Preprocessing()

    # get data and labels from the Preprocessing class
    X, Y = prepro.split_data()

    # convert data into a feature list
    feature_set, label = make_dataset(X, Y)

    # split data into training and testing sets
    X_train, X_test, Y_train, Y_test = tts(feature_set, label, test_size=0.2)

    #making classifier object using Multinomial Naive Bayes
    classifier = MNB()

    # training the classifier with the training-data features and labels
    classifier.fit(X_train, Y_train)

    #testing the classifier
    predictions = classifier.predict(X_test)

    # calculate accuracy by comparing predictions with the test labels
    print("Accuracy of classifier:")
    print(accuracy_score(Y_test, predictions))

    #saving classifier in a file.
    with open('spam_classifier.mdl', 'wb') as scla:
        pickle.dump(classifier, scla)
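
# To reuse the saved model later, a small loading sketch (same 'spam_classifier.mdl' file as above):
def load_classifier(path='spam_classifier.mdl'):
    with open(path, 'rb') as f:
        return pickle.load(f)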
Example #3
 def mnb(self):
     clf = MNB()
     clf.fit(self.X_train, self.Y_Train)
     y_predict = clf.predict(self.X_Test)
     score = clf.score(self.X_Test, self.Y_Test)
     print("using mnb, score %s" % score)
     print(classification_report(self.Y_Test, y_predict))
Example #4
def main():
    df = pd.read_csv("/home/saxobeat/PythonML/MLCodes/Spambase/Dataset/spamdata.csv")
    features = df.iloc[:,0:57].values
    labels = df.iloc[:,57].values
    X_train, X_test, y_train, y_test = tts(features, labels, test_size=0.25, shuffle=True, random_state=8)
    models = []
    models.append(('LR', LR(solver='lbfgs', max_iter=2000, tol=0.0001)))
    models.append(('LDA', LDA()))
    models.append(('DTC', DTC()))
    models.append(('KNC', KNC()))
    models.append(('MNB', MNB()))
    models.append(('RFC', RFC(n_estimators=100)))
    models.append(('SVC', SVC(gamma='scale', kernel='rbf', probability=True)))
    x0 = np.linspace(0,1,10)
    plt.plot([0,1],[0,1],'k',linestyle='--')
    for name,model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)
        y_score = y_pred[:,1]
        fpr, tpr, thresholds = roc_curve(y_test, y_score)
        label = "{}({})".format(name,auc(fpr, tpr))
        plt.plot(fpr,tpr,label=label)
        plt.legend()
        # plt.legend(name)

    plt.title("Reciever Operating Characteristics")
    plt.grid()
    plt.cool()
    plt.xlabel("Fasle Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.savefig("roc.pdf")
Example #5
def pred():
    # Load fitted training data
    trainAfterFit = pickle.load(open("../picks/fittedTrainData.pkl", "rb"))
    # Load prediction column
    predCol = pickle.load(open("../picks/predCol", "rb"))
    # Load fitted test data
    testAfterFit = pickle.load(open("../picks/fittedTestData.pkl", "rb"))
    # Load test data
    test = pd.read_csv('../data/testData.tsv',
                       header=0,
                       delimiter="\t",
                       quoting=3)
    # Initialize MNB Classifier
    modelMNB = MNB()
    # Fit the classifier according to the given training data.
    modelMNB.fit(trainAfterFit, predCol)
    # Display stats for the MNB classifier: mean 20-fold cross-validation ROC AUC
    print(
        "20 Fold CV Score for Multinomial Naive Bayes: ",
        np.mean(
            cross_val_score(modelMNB,
                            trainAfterFit,
                            predCol,
                            cv=20,
                            scoring='roc_auc')))
    # Make prediction on fitted test data. These are Probability estimates. The returned estimates for all classes are ordered by the label of classes.
    MNBresult = modelMNB.predict_proba(testAfterFit)[:, 1]
    # Create and store predictions in DataFrame and csv
    MNBoutput = pd.DataFrame(data={"id": test["id"], "sentiment": MNBresult})
    MNBoutput.to_csv('../results/MNBPredictions.csv', index=False, quoting=3)


# if __name__ == '__main__':
#     main()
Example #6
def pipeline_predict(request):
    # guard against non-POST requests so predictTestData is always defined
    predictTestData = []
    if request.method == 'POST':
        preTest = request.POST.get('text_to_classif', '')
        predictTestData = preTest.split()
        print("*********************", predictTestData)
    py_pipeline = Pipeline([
        ("count", CV()),
        # ("tfid", TF()),
        ("multi", MNB())
    ])
    dbData = Data.objects.all()
    X_language_train = []
    y_language_train = []
    for each in dbData:
        xlt = each.text.split(", ")
        ylt = len(each.text.split(", ")) * [each.category]
        X_language_train.extend(xlt)
        y_language_train.extend(ylt)
    print(X_language_train, y_language_train)
    py_pipeline.fit(X_language_train, y_language_train)
    prediction = py_pipeline.predict(predictTestData)
    print("*********************", prediction)
    appearances = defaultdict(int)
    for curr in prediction:
        appearances[curr] += 1
    answer = max(appearances, key=appearances.get)
    print("*********************", answer)
    # score = py_pipeline.score(span_test_data, y)
    context = {'response': answer}
    return render(request, 'classifier_app/tindex.html', context)
Example #7
 def mnb(self):
     clf = MNB()
     clf.fit(self.X_train, self.Y_Train)
     print("20 Fold CV Score for Multinomial Naive Bayes: %f" % (np.mean(
         cross_val_score(
             clf, self.X_train, self.Y_Train, cv=20, scoring='roc_auc'))))
     self.best_clf = clf
     return clf
Example #8
def createClassifier(type, model_data):
    if type == "MNB":
        return Classifier(name='Multinomial Naive Bayes classifier', model=MNB(
                    alpha=model_data["alpha"], fit_prior=model_data["fit_prior"]))
    elif type == "LinearSVC":
        return Classifier(name='Linear-kernel SVM classifier', model=LinearSVC(
            tol=model_data["tol"], C=model_data["C"], penalty=model_data["penalty"], loss=model_data["loss"]))
    else:
        return False
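
# Usage sketch (hypothetical model_data values for the MNB branch):
# clf = createClassifier("MNB", {"alpha": 1.0, "fit_prior": True})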
Example #9
def build_mnb_model(X_train_dtm, y_train):
    """
    Builds the Multinomial Naive Bayes model
    :param X_train_dtm: training document-term matrix
    :param y_train: training target labels
    :return: fitted Multinomial Naive Bayes model
    """
    mnb = MNB()
    mnb.fit(X_train_dtm, y_train)
    return mnb
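
# Usage sketch (hypothetical `train_texts`; the document-term matrix comes from a CountVectorizer):
# from sklearn.feature_extraction.text import CountVectorizer
# vect = CountVectorizer()
# X_train_dtm = vect.fit_transform(train_texts)
# mnb = build_mnb_model(X_train_dtm, y_train)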
Example #10
 def _make_tfidf_NB_clf(self, **cfg):
     max_f = cfg.get('max_features', 1200)
     max_df = cfg.get('max_df', 0.7)
     sublin = cfg.get('sublin', True)
     vectorizer = Tfidf(stop_words='english',
                        norm='l2',
                        max_df=max_df,
                        max_features=max_f,
                        sublinear_tf=sublin)
     model = MNB()
     clf = Pipeline(steps=[('v', vectorizer), ('nb', model)])
     return clf
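
# Usage sketch (hypothetical `docs` and `labels`; the Pipeline consumes raw text directly):
# clf = self._make_tfidf_NB_clf(max_features=1200, max_df=0.7)
# clf.fit(docs, labels)
# preds = clf.predict(["some unseen text"])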
Example #11
def learnData(xData, yData, f_obj, MLtype):
    f_obj.write('Accuracy for {}:\n'.format(MLtype))
    for test in [0.10, 0.15, 0.20, 0.25]:
        xData_train, xData_test, yData_train, yData_test = tts(xData, yData, test_size=test, random_state=42)
        # elif chain: as separate ifs, the trailing else overwrote the LSVC and LR choices with MLPC
        if MLtype == 'LSVC': clf = LSVC()
        elif MLtype == 'LR': clf = LR()
        elif MLtype == 'MNB': clf = MNB()
        else: clf = MLPC()
        clf.fit(xData_train, yData_train)
        score = clf.score(xData_test, yData_test)
        f_obj.write('\ttest partition {} yields {} accuracy\n'.format(test, score))
    f_obj.write('\n')
Example #12
    def train(self):
        logging.info('-' * 20)
        logging.info('Start training the %s model', self.model)
        train_data = self.feature_extractor.extract_feature(
            self.data_loader.get_trainset())
        if self.model == 'GNB':
            # Gaussian naive bayes
            self.classifier = GNB()
        elif self.model == 'BNB':
            # Bernoulli naive bayes
            self.classifier = BNB()
            # self.tok = RT(r'\w+')
            # vectorizer = Vectorizer(tokenizer=self.tok.tokenize)
            # train_data = self.data_loader.get_trainset()
            # train_data = [vectorizer.fit_transform(train_data[0]).toarray(), train_data[1]]
            # self.vocabulary = vectorizer.get_feature_names()
        elif self.model == 'MNB':
            # Multinomial naive bayes
            self.classifier = MNB()
        elif self.model == 'LR':
            # Logistic regression
            param = {'C': [10, 5, 2, 1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01]}
            self.classifier = GS(cv=5,
                                 estimator=LR(penalty=self.penalty,
                                              max_iter=self.epoch,
                                              solver='liblinear'),
                                 param_grid=param)
        elif self.model == 'SVM':
            # Support vector machine
            self.penalty = self.penalty if self.penalty in ['l1', 'l2'] else 'l2'
            dual = self.penalty == 'l2'
            #self.classifier = SVM(penalty=self.penalty, C=self.c, max_iter=self.epoch, dual=dual)
            param = {'C': [10, 5, 2, 1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01]}
            self.classifier = GS(cv=5,
                                 estimator=SVM(penalty=self.penalty,
                                               dual=dual,
                                               max_iter=self.epoch),
                                 param_grid=param)

        elif self.model == 'R':
            # RandomGuess
            self.classifier = DC(strategy='stratified')
        else:
            logging.info('Unsupported model : %s', self.model)
            exit(0)

        self.classifier.fit(train_data[0], train_data[1])
        predictions = self.classifier.predict(train_data[0])
        # Note: this is accuracy on the training set itself.
        acc = evaluator.accuracy_score(train_data[1], predictions)
        return acc
Example #13
def OcToFr(data, CV, target, names):
    tf_transformer = TTransformer(use_idf=False).fit(data)
    X_train_tf = tf_transformer.transform(data)
    print(X_train_tf.shape)
    ttransformer = TTransformer()
    X_train_tfidf = ttransformer.fit_transform(data)

    Xtrain = X_train_tfidf
    clf = MNB().fit(Xtrain, target)
    docs_new = ['God is love', 'OpenGL in the GPU is fast']
    X_new_counts = CV.transform(docs_new)
    X_new_tfidf = ttransformer.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf)

    for doc, category in zip(docs_new, predicted):
        print('%r => %s' % (doc, names[category]))
Example #14
def MNBpredictor(X_train, y_train, X_test):
    ''' Input: training data, target, and test data.
    Output: predicted labels for the test data and the training accuracy.'''
    from sklearn.naive_bayes import MultinomialNB as MNB
    model = MNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    accuracy = metrics.accuracy_score(y_train, y_pred)
    # log loss needs probability estimates rather than hard labels
    logLoss = metrics.log_loss(y_train, model.predict_proba(X_train))

    y_pred = model.predict(X_test)
    modelName = model.__class__.__name__
    accModels[modelName] = accuracy
    predictions[modelName] = y_pred

    return y_pred, accuracy
Example #15
    def fit(self, max_features=1000):
        stemmer = SnowballStemmer('english')
        #names = list(self.shelf.keys())
        # we redefined names to only include seen non-renewals
        names = set([
            name for name, ren in zip(sample_df.insuredname, sample_df.renewal)
            if type(name) is str and name in seen_names and name in self.shelf
            and ren == 0
        ])
        t0 = time.time()
        train_cutoff = int(len(names) * 0.9)
        texts = [self.shelf[name]['results'] for name in names]
        train_texts = texts[:train_cutoff]
        test_texts = texts[train_cutoff:]
        ys = [int(self.shelf[name]['outcome']) for name in names]
        train_ys = ys[:train_cutoff]
        test_ys = ys[train_cutoff:]
        print("making train_tf")
        train_tf = self.vectorizer.fit_transform(train_texts)
        self.nb = MNB()
        self.nb.fit(train_tf.todense(), train_ys)
        train_yhats = self.nb.predict_proba(train_tf.todense())[:, 1]
        print("making test_tf")
        test_tf = self.vectorizer.fit_transform([text for text in test_texts])
        test_yhats = self.nb.predict_proba(test_tf.todense())[:, 1]
        print("train AUROC:", roc_auc_score(train_ys, train_yhats))
        print("test AUROC:", roc_auc_score(test_ys, test_yhats))

        text_clf = Pipeline([
            ('vect',
             CountVectorizer(max_features=10000, preprocessor=stemmer.stem)),
            ('tfidf', TfidfTransformer()),
            # ('clf', MNB()),
            ('clf',
             SGDClassifier(loss='log', penalty='elasticnet', alpha=0.00001))
        ])
        text_clf.fit(train_texts, train_ys)
        train_yhats = text_clf.predict_proba(train_texts)[:, 1]
        test_yhats = text_clf.predict_proba(test_texts)[:, 1]
        print("train AUROC:", roc_auc_score(train_ys, train_yhats))
        print("test AUROC:", roc_auc_score(test_ys, test_yhats))
Example #16
def naive_bayes(train_x, test_x, test, label):
    """Naive Bayes"""
    model_NB = MNB()  # (alpha=1.0, class_prior=None, fit_prior=True)
    # fit so the model can be used for prediction below
    model_NB.fit(train_x, label)

    print("Multinomial Naive Bayes 10-fold CV scores:  \n",
          cross_val_score(model_NB, train_x, label, cv=10, scoring='roc_auc'))
    print(
        "Multinomial Naive Bayes mean 10-fold CV score: ",
        np.mean(
            cross_val_score(model_NB, train_x, label, cv=10,
                            scoring='roc_auc')))

    test_predicted = np.array(model_NB.predict(test_x))
    submission_df = pd.DataFrame(data={
        'id': test['id'],
        'sentiment': test_predicted
    })
    print("Results:")
    print(submission_df.head(100))
Example #17
def OnRunModels(data):

    xtrain, xtest, ytrain, ytest = OnSplitData(data)

    knn = KNN(n_neighbors=10, metric='minkowski', p=2)
    svm = SVM.SVC(kernel='linear', C=1.0, gamma='auto')
    mnb = MNB()
    lreg = LREG(random_state=0)

    knn.fit(xtrain, ytrain)
    knn_preds = knn.predict(xtest)
    print(' KNN Results : ')
    print(' Classification Report')
    print(CREP(ytest, knn_preds))
    print(' Confusion Matrix')
    print(CMAT(ytest, knn_preds), '\n\n')

    svm.fit(xtrain, ytrain)
    svm_preds = svm.predict(xtest)
    print(' SVM Results : ')
    print(' Classification Report')
    print(CREP(ytest, svm_preds))
    print(' Confusion Matrix')
    print(CMAT(ytest, svm_preds), '\n\n')

    mnb.fit(xtrain, ytrain)
    mnb_preds = mnb.predict(xtest)
    print(' MNB Results : ')
    print(' Classification Report')
    print(CREP(ytest, mnb_preds))
    print(' Confusion Matrix')
    print(CMAT(ytest, mnb_preds), '\n\n')

    lreg.fit(xtrain, ytrain)
    lreg_preds = lreg.predict(xtest)
    print(' LREG Results : ')
    print(' Classification Report')
    print(CREP(ytest, lreg_preds))
    print(' Confusion Matrix')
    print(CMAT(ytest, lreg_preds), '\n\n')
Example #18
 def createClassifier(self, config):
     if self.classifier == "lr":
         return LogisticRegression(class_weight='balanced',
                                   penalty=config["penalty"],
                                   C=config["C"])
     elif self.classifier == "gnb":
         return GaussianNB()
     elif self.classifier == "gp":
         return GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)
     elif self.classifier == "mnb":
         return MNB(alpha=config["alpha"], fit_prior=config["fit_prior"])
     elif self.classifier == "svm":
         return SVC(C=config["C"],
                    kernel=config["kernel"],
                    class_weight='balanced')
     elif self.classifier == "rf":
         return RFC(n_estimators=config["n_estimators"],
                    class_weight='balanced')
     elif self.classifier == "dt":
         return DTC(criterion=config["criterion"], class_weight='balanced')
     elif self.classifier == "nbsvm":
         return NBSVM(C=config["C"], beta=config["beta"])
Example #19
def check_alphas(X, y):
    u"""Takes in an X matrix and a Y array of labels.
    Checks five possible alpha values; returns the
    classifier with the highest cross-validated score."""
    best = None
    best_score = None
    alphas = [1E-4, 1E-3, 1E-2, 1E-1, 1]
    for alpha in alphas:
        mnb = MNB(alpha=alpha)
        score = np.mean(cross_val_score(mnb, X, y, cv=10))
        print("alpha: ", alpha, "score: ", score)
        if best is None or score > best_score:
            best = mnb
            best_score = score
            best_alpha = alpha
    best.fit(X, y)
    print("our best score and our best alpha:")
    print(best_score, best_alpha)
    return best
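
# An equivalent alpha search via GridSearchCV (a sketch, not the author's code;
# refit=True, the default, retrains the best estimator on all of X, y):
from sklearn.model_selection import GridSearchCV

def check_alphas_gs(X, y):
    gs = GridSearchCV(MNB(), {'alpha': [1E-4, 1E-3, 1E-2, 1E-1, 1]}, cv=10)
    gs.fit(X, y)
    print("best alpha:", gs.best_params_['alpha'], "score:", gs.best_score_)
    return gs.best_estimator_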
Example #20
def train():
    X, y_train, voc = get_trainset()
    from sklearn.naive_bayes import MultinomialNB as MNB
    model_NB = MNB(alpha=1.0, class_prior=None, fit_prior=True)

    print('Start Training!\n')
    start = time.time()
    model_NB.fit(X, y_train)
    end = time.time()
    f = open('model2.pickle', 'wb')
    f.write(pickle.dumps(model_NB))
    f.close()
    print('Finish Training!\n')

    # sklearn.cross_validation was removed; use sklearn.model_selection instead
    from sklearn.model_selection import cross_val_score
    import numpy as np
    print(np.mean(cross_val_score(model_NB, X, y_train, cv=5, scoring='roc_auc')))

    f = open('voc_senti.pickle', 'wb')
    f.write(pickle.dumps(voc))
    f.close()

    return voc
Example #21
import numpy as np
import pandas as pd
import os, re, io, sys
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import LinearSVC as LSVC


models = [LSVC(), MNB(), LR(), RFC()]
modelNames = ['Linear SVC', 'MultinomialNB','LogisticRegression','RandomForestClassifier']
stats_name = 'stats_ML_models.txt'
########################## TRAIN AND TEST ON MODELS ###########################  
###############################################################################
def perform_train_test(pd_df, dataName, f_obj, X, y):
    pd_df['cat_id'] = pd.Series(pd_df[y].factorize()[0]).astype(int)
    id_label_map = dict(pd_df[['cat_id', y]].drop_duplicates().sort_values('cat_id').values)
    f_obj.write('Stats for dataset {}:\n'.format(dataName))
    for (index, model) in enumerate(models):
        X_train, X_test, y_train, y_test = train_test_split(pd_df[X], pd_df['cat_id'], test_size=0.2, random_state=0)
        X_train_obj = TfidfVectorizer().fit(X_train)  # TF-IDF vectorizer fitted to the training partition only
        X_train_tfidf = X_train_obj.transform(X_train)
        clf = model.fit(X_train_tfidf, y_train)
        X_test_tfidf = X_train_obj.transform(X_test)
        accuracy = clf.score(X_test_tfidf, y_test)
        pred_lab = [id_label_map[int(id_index)] for id_index in clf.predict(X_test_tfidf)]
        pred_df = pd.DataFrame(data={'Review Test': X_test, 'Label Test': pred_lab}, columns=['Review Test', 'Label Test'])
        pred_df.to_csv('pred_{}_for{}.csv'.format(dataName, modelNames[index]))
        f_obj.write('\tAccuracy of model {} with data {}: {}\n'.format(modelNames[index], dataName, accuracy))
    f_obj.write('\n')
Example #22
tfv.fit(train_text)


X_all = train_text + unlabeled_text + test_text
len_train = len(train_text)
len_unlabeled = len(unlabeled_text)
X_all = tfv.transform(X_all)
 
 
# split X_all back into the training, unlabeled, and test portions
# (slices are left-closed, right-open)
train_X = X_all[:len_train]
unlabeled_X = X_all[len_train:len_train+len_unlabeled]
test_X = X_all[len_train+len_unlabeled:]

# Defaults: MNB(alpha=1.0, class_prior=None, fit_prior=True)
'''
alpha : float, optional (default = 1.0)
    Laplace smoothing parameter (0 means no smoothing).
fit_prior : boolean, optional (default = True)
    If False, a uniform prior is used.
class_prior : optional (default = None)
    Prior probabilities of the classes. If specified, the priors
    are not adjusted according to the data.
'''
model_NB = MNB()
model_NB.fit(train_X, train_label)  # feed the feature matrix in directly

# Predict on the unlabeled and test sets; the output is for reference
print("predict")
unlabeled_label = model_NB.predict(unlabeled_X)
test_label = model_NB.predict(test_X)
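
# A possible follow-up (a sketch, not in the original): simple self-training,
# refitting on the pseudo-labels predicted for the unlabeled set.
import numpy as np
from scipy.sparse import vstack
X_aug = vstack([train_X, unlabeled_X])  # stack the sparse feature matrices
y_aug = np.concatenate([train_label, unlabeled_label])
model_NB.fit(X_aug, y_aug)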
Example #23
quest = TextClassification(train_filename=args.train, test_filename=args.test, categories=[])

x_train, y_train = quest.readFileAndCut(quest.train_filename)
x_test, y_test = quest.readFileAndCut(quest.test_filename)

train_tfidf, test_tfidf = quest.train_tf(x_train, x_test)

def execClassify(name, model):
    classifier = Classifier(name, model)
    classifier.fit(train_tfidf, y_train)
    classifier.test(y_test, test_tfidf)

if args.m == 'mnb':
    from sklearn.naive_bayes import MultinomialNB as MNB
    execClassify(name='Multinomial Naive Bayes classifier', model=MNB())
elif args.m == 'bnb':
    from sklearn.naive_bayes import BernoulliNB as BNB
    execClassify(name='Bernoulli Naive Bayes classifier', model=BNB())
elif args.m == 'linearSVC':
    from sklearn.svm import LinearSVC
    execClassify(name='Linear SVM classifier', model=LinearSVC())
elif args.m == 'dt':
    from sklearn.tree import DecisionTreeClassifier
    execClassify(name='Decision-tree classifier', model=DecisionTreeClassifier())
elif args.m == 'knn':
    from sklearn.neighbors import KNeighborsClassifier
    execClassify(name='KNN classifier', model=KNeighborsClassifier())
elif args.m == 'xgb':
    import xgboost as xgb
    dtrain = xgb.DMatrix(train_tfidf, label=y_train)
Example #24
    return words


if __name__ == "__main__":
    train = pd.read_csv(", ", header=0, delimiter="\t", quoting=3)
    test = pd.read_csv(", ", header=0, delimiter="\t", quoting=3)

    num_review = train["review"].size
    clean_train = []
    for i in range(0, num_review):
        clean_train.append(review_to_words(train["review"][i]))

    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)
    tran_data_f = vectorizer.fit_transform(clean_train)
    tran_data_f = tran_data_f.toarray()

    model_NB = MNB()
    model_NB.fit(tran_data_f, train["sentiment"])

    score = np.mean(
        cross_val_score(model_NB,
                        tran_data_f,
                        train["sentiment"],
                        cv=20,
                        scoring="roc_auc"))
    print("MultinomialNB score: %s" % score)
Example #25
        for i in range(len(line)):
            rows.append([ids[i], line[i]])
        # open the CSV once and write the header once; the original reopened
        # the file and rewrote the header and all rows on every iteration
        out = open(path.split('.')[0] + '.csv', 'a', newline='')
        csv_write = csv.writer(out, dialect='excel')
        csv_write.writerow(['id', 'sentiment'])
        for v in rows:
            csv_write.writerow(v)
        out.close()


# Classification with the Naive Bayes algorithm
from sklearn.naive_bayes import MultinomialNB as MNB

label = train['sentiment']

MNBmodle = MNB(alpha=1.0, class_prior=None, fit_prior=True)
svm_model = LinearSVC()  # SVM
knn = KNeighborsClassifier()  # k-nearest neighbors
mlp = MLPClassifier(hidden_layer_sizes=(30, 30, 30),
                    activation='logistic',
                    max_iter=100)  # multi-layer perceptron
clf = tree.DecisionTreeClassifier(criterion='gini')

print('train MNB')
begin = datetime.datetime.now()
MNBmodle.fit(train_x, label)
end = datetime.datetime.now()
k = end - begin
print('MNB training time:', k.total_seconds())

predict_save(MNBmodle, 'MNB.json')
Example #26
tfidf_vectorizer_pos = TfidfVectorizer()

data_tfidf_text = tfidf_vectorizer_text.fit_transform(data['text'])
data_tfidf_pos = tfidf_vectorizer_pos.fit_transform(data['posTags_string'])

data_tfidf = hstack([data_tfidf_pos, data_tfidf_text]).toarray()
data_tfidf = pd.DataFrame(data_tfidf)

split = int(len(data)*0.75)
y_train = data['isClickbait'][:split].values
y_test = data['isClickbait'][split:].values
X_train = data_tfidf[:split].values
X_test = data_tfidf[split:].values

svm = LinearSVC()
mnb = MNB()
lr = LR()
rf = RFC(n_estimators = 100)


models = {'Linear Support Vector': svm,
          'Multinomial Naive Bayes': mnb,
          'Logistic Regression': lr,
          'Random Forest': rf}

p = []
for n, m in models.items():
  m.fit(X_train, y_train)
  predictions = m.predict(X_test)
  p.append(predictions)
  print('%s : %.3f' % (n, accuracy_score(y_test, predictions)))
Example #27
filt = np.logical_not(y == '-99')
#filt = np.logical_not(np.logical_or(y == 'Unknown', y == '-99'))
#filt = np.logical_not(np.logical_or(np.logical_or(y == 'Unknown', y == '-99'), y == 'Various'))

y = y[filt]

#- Predictors
X = pd.get_dummies(X, dummy_na=True)
X = X[filt]

le_y = preprocessing.LabelEncoder()
y = le_y.fit_transform(y)

#%% Initialize model
clf = MNB()

#%% Cross Validation using Repeated 10-Fold

kf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=0)

scores = []
for train_idx, test_idx in kf.split(X, y):
    #print("TRAIN:", train_idx, "TEST:", test_idx)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    model = clf.fit(X_train, y_train)
    predictions = model.predict(X_test)
    scores.append(accuracy_score(y_test, predictions))
print("Model training complete!")
Example #28
from sklearn.datasets import load_svmlight_file as svmlight
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB as MNB, BernoulliNB as BNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

folderpath = r"D:\my_data"
folder = "C:/Users/hanfs/Desktop/data-mining-project/training_data_file.TF.txt"

#wondering if we should just hard-code each path file; probably fine since there are only 3
feature_vectors, targets = svmlight(folder)

###Lets Generate the Classifier items###
print("TF Data")
clf = MNB()
scores = cross_val_score(clf,
                         feature_vectors,
                         targets,
                         cv=5,
                         scoring='f1_macro')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

clf = BNB()
scores = cross_val_score(clf,
                         feature_vectors,
                         targets,
                         cv=5,
                         scoring='f1_macro')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Example #29
# THE SPARSITY CALCULATION
sparsity = (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))
print(sparsity)

# TFIDF TRANSFORMATION
tt = TT().fit(messages_bow)
tfidf4 = tt.transform(bow4)

print(tfidf4)

# TFIDF TRANSFORMATION OF BOW TRANSFORMATION (THE COUNT VECTORIZED MESSAGES)
messages_tfidf = tt.transform(messages_bow)

# NAIVE BAYES
spam_detect_model = MNB().fit(messages_tfidf,df['label'])
pred4 = spam_detect_model.predict(tfidf4)[0]
print(pred4)

pred = spam_detect_model.predict(messages_tfidf)

rate = np.mean(pred == df['label'])
print("Rate: {}\n".format(rate))

# TRAIN AND TEST
msg_train,msg_test,label_train,label_test = TTS(df['msg'],df['label'],test_size=0.3,random_state=64)

# PIPELINE - A WAY TO STORE DATA PREPARATION PIPELINE
pipe = Pipeline([
	('bow',CV(analyzer=text_process)), # COUNT VECTORIZER
	('tfidf',TT()), # TFIDF TRANSFORMER
Example #30
def main():

    # 1. Data Preparation
    data = numpy.loadtxt('mod_data.txt')
    labels = numpy.loadtxt('mod_labels.txt')

    test_data = numpy.loadtxt('mod_test_data.txt')
    test_labels = numpy.loadtxt('mod_test_label.txt')
    test = numpy.column_stack((test_data, test_labels))

    # 2. Create Sorted Data Vector by Label
    sorted_data = [[], [], [], [], [], [], [], [], [], []]
    idx = 0
    for row in data:
        sorted_data[int(labels[idx]) - 1].append(row)
        idx += 1
    sorted_data = numpy.asarray(sorted_data)

    # 3. Calculate Cluster Scores
    cluster_scores = [[], [], [], [], [], [], [], [], [], []]
    for idx in range(0, 10):
        cluster_scores[idx] = numpy.mean(sorted_data[idx], axis=0)
    cluster_scores = numpy.around(cluster_scores)

    # 4. Initial Training
    nb_classifier = MNB().fit(data, labels)

    # 5. Initial Enqueue of All New Agents
    new_queue = queue.Queue(0)  # Python 3: the stdlib module is `queue`, not `Queue`
    recycle_queue = queue.Queue(0)
    for v in test:
        new_queue.put(v)

    # 6. Event Loop
    itr = 1
    correct = 0
    for i in range(0, 500):
        print "[Iteration %d]" % itr
        agent = None
        # 6-1 New Agent Queue
        if not new_queue.empty():
            agent = new_queue.get()
            print("New Agent Dequeued")
        # 6-2 Recycled Agent Queue
        else:
            if not recycle_queue.empty():
                agent = recycle_queue.get()
                print("Recycled Agent Dequeued")
            # 6-3 Random Event
            else:
                n = random.random()
                if (n < 0.01):
                    print("Random Event!")
                    time.sleep(2)
                    continue

        # 6-4 Classification
        if agent is not None:
            # sklearn expects a 2D sample array, hence the reshape
            result = nb_classifier.predict(agent[0:-1].reshape(1, -1))[0]

        if int(result) == int(agent[-1]):
            correct += 1

        idx = int(result) - 1
        print("Label: %d" % int(result))

        # 6-5 Integrity Check (Tolerance to 10, Reject Threshold 10 ~ 1.7%)
        integrities = numpy.isclose(cluster_scores[idx], agent[0:-1], atol=50)
        accepted = numpy.bincount(integrities)[0] < 50
        print("Accepted: %r" % accepted)

        # 6-6 Re-train Classifier
        if accepted:
            # 6-6-1 Add Accepted Agent into Table
            sorted_data[idx].append(agent[0:-1])

            # 6-6-2 Recalculate Cluster Scores
            cluster_scores[idx] = numpy.mean(sorted_data[idx], axis=0)
            cluster_scores[idx] = numpy.around(cluster_scores[idx], decimals=5)

            # 6-6-3 Recheck Integrity of Individual Agent
            index = 0
            for row in sorted_data[idx]:
                row_integrities = numpy.isclose(cluster_scores[idx],
                                                row,
                                                atol=5)
                # use row_integrities here; the original reused `integrities` from step 6-5
                accepted = numpy.bincount(row_integrities)[0] < 5
                if not accepted:
                    recycle_queue.put(row)  # recycle the agent row itself, not the boolean flag
                    # numpy.delete returns a copy, so rebuild the cluster list instead
                    sorted_data[idx] = [r for r in sorted_data[idx] if r is not row]
                    print("Agent %d in Cluster %d Rejected. Placed in recycle_queue" % (
                        index, idx))
                index += 1

            # 6-6-4 Update Naive Bayes Classifier (Partial Fit)
            nb_classifier = nb_classifier.partial_fit(cluster_scores,
                                                      numpy.arange(1, 11))

        # 6-7 Enqueue in Recycled Queue
        else:
            recycle_queue.put(agent)

        print "New Agent Queue: %d\nRecycle Queue: %d\n" % (
            new_queue.qsize(), recycle_queue.qsize())
        itr += 1

    print "Correct: %d" % (correct)