def __init__(self, **config):
    linfo('config: %s' % config)
    self._path = config['train_path']
    self._feature_extract_config = config['feature']
    self._emoticon = config['emoticon']
    self.test_path = '../../test_data/bi_test_data'
    self.clf = BNB(fit_prior=True)
def train(self):
    logging.info('-' * 20)
    logging.info('Start training the %s model', self.model)
    train_data = self.feature_extractor.extract_feature(
        self.data_loader.get_trainset())
    if self.model == 'GNB':
        # Gaussian naive Bayes
        self.classifier = GNB()
    elif self.model == 'BNB':
        # Bernoulli naive Bayes
        self.classifier = BNB()
        # self.tok = RT(r'\w+')
        # vectorizer = Vectorizer(tokenizer=self.tok.tokenize)
        # train_data = self.data_loader.get_trainset()
        # train_data = [vectorizer.fit_transform(train_data[0]).toarray(), train_data[1]]
        # self.vocabulary = vectorizer.get_feature_names()
    elif self.model == 'MNB':
        # Multinomial naive Bayes
        self.classifier = MNB()
    elif self.model == 'LR':
        # Logistic regression with a grid search over C
        param = {'C': [10, 5, 2, 1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01]}
        self.classifier = GS(cv=5,
                             estimator=LR(penalty=self.penalty,
                                          max_iter=self.epoch,
                                          solver='liblinear'),
                             param_grid=param)
    elif self.model == 'SVM':
        # Linear support vector machine with a grid search over C
        self.penalty = self.penalty if self.penalty in ['l1', 'l2'] else 'l2'
        dual = self.penalty == 'l2'
        # self.classifier = SVM(penalty=self.penalty, C=self.c, max_iter=self.epoch, dual=dual)
        param = {'C': [10, 5, 2, 1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01]}
        self.classifier = GS(cv=5,
                             estimator=SVM(penalty=self.penalty,
                                           dual=dual,
                                           max_iter=self.epoch),
                             param_grid=param)
    elif self.model == 'R':
        # Random guess baseline
        self.classifier = DC(strategy='stratified')
    else:
        logging.info('Unsupported model: %s', self.model)
        exit(0)
    self.classifier.fit(train_data[0], train_data[1])
    predictions = self.classifier.predict(train_data[0])
    acc = evaluator.accuracy_score(train_data[1], predictions)
    return acc
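A hedged sketch of a matching test-set pass for the class above, assuming the data loader exposes a get_testset() counterpart to get_trainset() (not shown in the original):

def evaluate(self):
    # Hypothetical evaluation step; get_testset() is an assumed method name.
    test_data = self.feature_extractor.extract_feature(
        self.data_loader.get_testset())
    predictions = self.classifier.predict(test_data[0])
    return evaluator.accuracy_score(test_data[1], predictions)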
def __init__(self): """Create a persistent model file if there isn't one. If one exists, use it.""" self.file_path = 'models/decision_tree.joblib' self.include_hunger = False self.accuracy_metric_float = 0.0 self.accuracy = metrics.Accuracy() if path.exists(self.file_path): self.model = load(self.file_path) else: # self.model = compat.convert_sklearn_to_river( # estimator=MLPClassifier(random_state=1, max_iter=300), # classes=[0, 1] # ) self.model = compat.convert_sklearn_to_river( estimator=BNB(binarize=.1), classes=[0, 1] ) self.save_model()
x_train, y_train = quest.readFileAndCut(quest.train_filename)
x_test, y_test = quest.readFileAndCut(quest.test_filename)
train_tfidf, test_tfidf = quest.train_tf(x_train, x_test)


def execClassify(name, model):
    classifier = Classifier(name, model)
    classifier.fit(train_tfidf, y_train)
    classifier.test(y_test, test_tfidf)


if args.m == 'mnb':
    from sklearn.naive_bayes import MultinomialNB as MNB
    execClassify(name='Multinomial naive Bayes classifier', model=MNB())
elif args.m == 'bnb':
    from sklearn.naive_bayes import BernoulliNB as BNB
    execClassify(name='Bernoulli naive Bayes classifier', model=BNB())
elif args.m == 'linearSVC':
    from sklearn.svm import LinearSVC
    execClassify(name='Linear SVM classifier', model=LinearSVC())
elif args.m == 'dt':
    from sklearn.tree import DecisionTreeClassifier
    execClassify(name='Decision tree classifier', model=DecisionTreeClassifier())
elif args.m == 'knn':
    from sklearn.neighbors import KNeighborsClassifier
    execClassify(name='KNN classifier', model=KNeighborsClassifier())
elif args.m == 'xgb':
    import xgboost as xgb
    dtrain = xgb.DMatrix(train_tfidf, label=y_train)
    # The test labels are optional here; they are passed so the run can be evaluated.
    dtest = xgb.DMatrix(test_tfidf, label=y_test)
    # xgboost parameters
    param = {'max_depth': 6, 'eta': 0.5, 'eval_metric': 'merror',
             'silent': 1, 'objective': 'multi:softmax', 'num_class': 10}
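    # --- Hedged continuation of the xgb branch: a sketch of how training and scoring
    # --- might proceed with the param dict above. The number of boosting rounds and
    # --- the accuracy computation are assumptions, not the original code.
    import numpy as np
    bst = xgb.train(param, dtrain, num_boost_round=50)   # 50 rounds is an assumed value
    y_pred = bst.predict(dtest)                          # multi:softmax returns class labels
    acc = float(np.mean(y_pred == np.asarray(y_test, dtype=y_pred.dtype)))
    print('xgboost accuracy: %.4f' % acc)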
def naive_bayes(corpus_CV, classes):
    # Fit a Bernoulli naive Bayes model on the vectorised corpus.
    bnb = BNB()
    classes = np.ravel(classes)
    bnb.fit(corpus_CV, classes)
    return bnb
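A small usage sketch for naive_bayes(); the toy corpus, labels, and vectorizer settings are illustrative only:

from sklearn.feature_extraction.text import CountVectorizer

docs = ['good movie', 'bad movie', 'great film']              # toy corpus
labels = [1, 0, 1]                                            # toy labels
corpus_CV = CountVectorizer(binary=True).fit_transform(docs)  # binary counts suit BernoulliNB
bnb = naive_bayes(corpus_CV, labels)
print(bnb.predict(corpus_CV))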
'''######'''

'''###############################################################################'''
'''#################################Step 7########################################'''
'''Training using Bernoulli Naive-Bayesian model to learn a classifier on vital_seg'''
'''###############################################################################'''
'''
vital_seg = 2 * number of vital_seg * 1500
train     = 2 * number of vital_seg * 1500
'''

train = []
labels = []
for cluster_n in range(number_books):
    for seg in vital_seg[cluster_n]:
        train.append(seg.tolist())
        labels.append(cluster_n)

model3 = BNB(fit_prior=True)
model3 = model3.fit(train, labels)
print "STEP 7 done"

'''######'''
'''Step 7'''
'''######'''

'''################################################################'''
'''##########################Step 8################################'''
'''Classifying sentences with the trained classifier and calculating the score'''
'''################################################################'''
'''
auth_proba = [[.22, .05, .12, ... number of authors]
              [.22, .05, .12, ... number of authors]
              ... test_size ]
probability of a sentence written by each author
'''
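A hedged sketch of how Step 8 might obtain the auth_proba matrix described above via predict_proba on the trained model; the `test` variable is assumed to hold the held-out sentence vectors (not shown here):

# Assumes `test` is a list of held-out 1500-dim sentence vectors.
auth_proba = model3.predict_proba(test)    # shape: (test_size, number_books)
predicted_author = model3.predict(test)    # most probable author per sentence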
folder = "C:/Users/hanfs/Desktop/data-mining-project/training_data_file.TF.txt" #wondering if we should just hard code each path file but most likely because there is only 3 feature_vectors, targets = svmlight(folder) ###Lets Generate the Classifier items### print("TF Data") clf = MNB() scores = cross_val_score(clf, feature_vectors, targets, cv=5, scoring='f1_macro') print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) clf = BNB() scores = cross_val_score(clf, feature_vectors, targets, cv=5, scoring='f1_macro') print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) clf = KNeighborsClassifier() scores = cross_val_score(clf, feature_vectors, targets, cv=5, scoring='f1_macro') print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
import utils
import pickle
from os.path import isfile
from sklearn.naive_bayes import BernoulliNB as BNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

filename = '/usr/src/app/sentiment/models/pickles/BernoulliNB.pickle'

if not isfile(filename):
    train, test = train_test_split(utils.read_data(), test_size=0.2)
    train_embeddings = utils.combined_embeddings(train['text'].tolist())
    test_embeddings = utils.combined_embeddings(test['text'].tolist())

    clf = BNB(alpha=50)
    clf.fit(train_embeddings, train['sentiment'])

    prediction = clf.predict(test_embeddings)
    report = classification_report(test['sentiment'], prediction)
    print(report)

    with open(filename, 'wb') as f:
        pickle.dump(clf, f)
else:
    print('Already Trained!')
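A hedged sketch of reusing the pickled model for inference; it assumes utils.combined_embeddings accepts any list of raw texts, and the example inputs are placeholders:

# Hypothetical inference path: reload the pickle and score new texts.
with open(filename, 'rb') as f:
    clf = pickle.load(f)
texts = ['great product', 'would not buy again']   # placeholder inputs
print(clf.predict(utils.combined_embeddings(texts)))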
folderpath = r"D:\my_data" folder = "C:/Users/khada/Desktop/data-mining-project/training_data_file.TF.txt" feature_vectors, targets = load_svmlight_file(folder) X = feature_vectors X = X.astype(int) y = targets y = y.astype(int) #print(y) ##X_new1 = SelectKBest(chi2, k=100).fit(X, y)#returns 100 of the best of feature vectors according to the chi squared test ##X_new2 = SelectKBest(mutual_info_classif, k=100).fit_transform(X, y) ## clfm = MNB() clfb = BNB() clfk = KNeighborsClassifier() clfs = SVC() ##scores = cross_val_score(clf, feature_vectors, targets, cv=5, scoring='f1_macro') vals = [10, 25, 50, 100] #(x-axis: K; y-axis:fl_macro) ##I believe the K is the x_new(s) and the f1_macro is the scoring items received from the classifier for K in vals: X_new1 = SelectKBest(chi2, k=K).fit_transform( X, y ) #returns 100 of the best of feature vectors according to the chi squared test X_new2 = SelectKBest(mutual_info_classif, k=K).fit_transform(X, y) scoresm1 = cross_val_score(clfm, X_new1, targets, cv=K, scoring='f1_macro') scoresb1 = cross_val_score(clfb, X_new1, targets, cv=K, scoring='f1_macro')
# ETsC : sklearn.ensemble.ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import ExtraTreesClassifier as ETsC

_Models = {
    "LR": LR(),
    "LRCV": LRCV(),
    "LDA": LDA(),
    "QDA": QDA(),
    "KNC": KNC(),
    # "RNC": RNC(),
    "DTC": DTC(),
    "ETC": ETC(),
    "GNB": GNB(),
    # "BDNB": BDNB(),
    "MNB": MNB(),
    "BNB": BNB(),
    "LSVC": LSVC(),
    "SVC": SVC(),
    "NSVC": NSVC(),
    # "OCSVM": OCSVM()
}

# Compare the evaluation results
print("Comparing model results and visualising...:")
_Algorithm_CMP_Results = []
_Algorithm_CMP_Result_List = []
_Result_File.write("Model" + " " * 6 + "Mean (accuracy)" + " " * 12 + "Std (standard deviation)\n")
print("Model" + " " * 6 + "Mean (accuracy)" + " " * 12 + "Std (standard deviation)")
for _Each in _Models:
    cv_results = model_selection.cross_val_score(
        _Models[_Each], X=_X_Train, y=_Y_Train,
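        # Assumed continuation of the truncated call and loop body: the cv/scoring
        # values and the reporting format below are illustrative, not the original.
        cv=10, scoring='accuracy')
    _Algorithm_CMP_Results.append(cv_results)
    _Algorithm_CMP_Result_List.append(_Each)
    _Line = "%-10s %-22.6f %.6f" % (_Each, cv_results.mean(), cv_results.std())
    _Result_File.write(_Line + "\n")
    print(_Line)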
# ### Not good

# In[ ]:

from sklearn.naive_bayes import MultinomialNB as MNB
mnb = MNB()
mnb.fit(df_train, out_train)
mnb.score(df_test, out_test)

# ### Not good yet

# In[ ]:

from sklearn.naive_bayes import BernoulliNB as BNB
bnb = BNB()
bnb.fit(df_train, out_train)
bnb.score(df_test, out_test)

# ### Better, but still not close to 90%

# #### So the naive Bayes classifiers also do not do a good job on this task.
# #### Now I will experiment with tree-based classifiers.

# ## Decision Trees

# In[ ]:

from sklearn.model_selection import cross_val_score as cvs
from sklearn.tree import DecisionTreeClassifier as dtree

tr = dtree()
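A hedged next step for the decision tree, mirroring the scoring pattern used earlier in the notebook; the cv value is an assumption:

# In[ ]:

scores = cvs(tr, df_train, out_train, cv=5)   # cv=5 is an assumed value
print(scores.mean())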
path = '...'
os.chdir(path)

d = {}
test_size = 0.1

datasets = []
for i in os.listdir(os.getcwd()):
    datasets.append(i)

# Set the ids of the folds that lead to a different random_state of KFold,
# giving len(folds) * 10-cross-validation processes.
folds = [1, 2, 3, 4, 5, 7, 23, 66, 123, 2018]
# Otherwise, set just one seed in the folds list for a single 10-cross-validation procedure.
folds = [23]

learners = [SGD(loss='log'),
            SGD(loss='modified_huber'),
            SGD(loss='log', penalty='l1'),
            SGD(loss='log', penalty='elasticnet'),
            SGD(loss='modified_huber', penalty='l1'),
            SGD(loss='modified_huber', penalty='elasticnet'),
            MNB(),
            BNB()]

for t in learners:
    l = []
    print '#### \t', t, '\t ####'
    for x in range(0, len(datasets)):
        lea = copy.deepcopy(t)
        acc = []
        stdev = []
        dataframe = read_csv(datasets[x], skiprows=1, header=None)
        dataframe = dataframe.dropna()
        dataset = dataframe.values
        print
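A hedged continuation of the inner loop sketching the 10-fold cross-validation described in the comments above; the label-column layout and the KFold/cross_val_score imports are assumptions, not part of the original:

        # Assumes the label is the last column and that KFold / cross_val_score
        # are imported from sklearn.model_selection.
        X, y = dataset[:, :-1], dataset[:, -1]
        for seed in folds:
            kf = KFold(n_splits=10, shuffle=True, random_state=seed)
            scores = cross_val_score(lea, X, y, cv=kf, scoring='accuracy')
            acc.append(scores.mean())
            stdev.append(scores.std())
        l.append(sum(acc) / len(acc))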