def predict(self):
    """Self-train a Gaussian naive Bayes model on the unlabeled test set.

    The classifier is first fitted on ``self.training`` /
    ``self.trainingLabels``.  It then repeatedly labels ``self.test`` with
    its own predictions and feeds those pseudo-labelled samples back through
    ``partial_fit``, each weighted by ``self.Lambda``.  Iteration stops once
    the predictions have stayed unchanged for 20 consecutive rounds.

    Returns:
        The text produced by ``classification_report`` for the final
        predictions against ``self.testLabels`` (the report is also printed).
    """
    stable_rounds_required = 20
    stable_rounds = 0

    classifier = GaussianNB()
    classifier.fit(self.training, self.trainingLabels)

    # Pseudo-labelled samples are weighted by Lambda.  The builtin ``float``
    # replaces the ``np.float`` alias, which recent NumPy releases removed.
    unlabeled_weight = np.ones(len(self.test), dtype=float) * self.Lambda

    pseudo_labels = classifier.predict(self.test)
    print('Enhanced classifier...')
    while stable_rounds < stable_rounds_required:
        previous_labels = pseudo_labels
        # Fold the weighted pseudo-labelled data into the classifier.
        classifier.partial_fit(
            self.test,
            previous_labels,
            classes=['0', '1'],
            sample_weight=unlabeled_weight)
        pseudo_labels = classifier.predict(self.test)
        # Any change in the predictions restarts the stability count.
        if list(previous_labels) == list(pseudo_labels):
            stable_rounds += 1
        else:
            stable_rounds = 0

    pred_labels = classifier.predict(self.test)
    # Build the report once and reuse it for both printing and returning.
    report = classification_report(self.testLabels, pred_labels, digits=4)
    print('naive_bayes with EM algorithm:')
    print(report)
    return report
def test_gnb_sample_weight():
    """Check that GaussianNB honours sample weights in fit and partial_fit."""
    # Unit weights must reproduce the unweighted fit.
    unit_weights = np.ones(6)
    unweighted = GaussianNB().fit(X, y)
    unit_weighted = GaussianNB().fit(X, y, unit_weights)
    assert_array_almost_equal(unweighted.theta_, unit_weighted.theta_)
    assert_array_almost_equal(unweighted.sigma_, unit_weighted.sigma_)

    # Two incremental passes with half the weights each must match a single
    # pass with the full weights.
    weights = rng.rand(y.shape[0])
    half_weights = weights / 2
    single_pass = GaussianNB().fit(X, y, sample_weight=weights)
    two_passes = GaussianNB().partial_fit(
        X, y, classes=[1, 2], sample_weight=half_weights)
    two_passes.partial_fit(X, y, sample_weight=half_weights)
    assert_array_almost_equal(single_pass.theta_, two_passes.theta_)
    assert_array_almost_equal(single_pass.sigma_, two_passes.sigma_)

    # Duplicating rows must be equivalent to weighting each row by its
    # number of occurrences.
    picks = rng.randint(0, X.shape[0], 20)
    occurrences = np.bincount(picks, minlength=X.shape[0])
    duplicated = GaussianNB().fit(X[picks], y[picks])
    weighted = GaussianNB().fit(X, y, occurrences)
    assert_array_almost_equal(duplicated.theta_, weighted.theta_)
    assert_array_almost_equal(duplicated.sigma_, weighted.sigma_)
def main():
    """Construct the three naive Bayes estimators with annotated parameters.

    This is a parameter walkthrough: each estimator is only instantiated,
    nothing is trained and nothing is returned.
    """
    # GaussianNB has a single main parameter, ``priors``: the prior
    # probability P(Y=Ck) of every class.  It is not given by default, in
    # which case P(Y=Ck) = mk / m, where m is the number of training samples
    # and mk the number of training samples of class k.  When it is given,
    # ``priors`` takes precedence.
    nb = GaussianNB(priors=None)
    # For incremental training call ``nb.partial_fit(X, y, classes=...)``
    # once per batch.  The original snippet called ``nb.partial_fit()`` with
    # no arguments, which always raises TypeError, so the call was removed.

    # binarize:
    #     BernoulliNB has four parameters; three share their name and meaning
    #     with MultinomialNB.  The extra one, ``binarize``, may be a number or
    #     omitted.  When omitted, every feature is assumed to be binary
    #     already; otherwise values below the threshold fall into one class
    #     and values above it into the other.
    nb = BernoulliNB(alpha=1.0, fit_prior=True, class_prior=None, binarize=.0)

    # alpha:
    #     The smoothing constant (lambda > 0), usually 1, i.e. Laplace
    #     smoothing.  Keep the default unless the fit is poor, then try
    #     values slightly above or below 1.
    # fit_prior:
    #     Whether class priors are taken into account.  If False, every class
    #     gets the same prior.  Otherwise the priors come from ``class_prior``
    #     when given, or are estimated from the training set as
    #     P(Y=Ck) = mk / m.
    #
    #     fit_prior   class_prior   resulting prior
    #     False       ignored       P(Y=Ck) = 1 / k
    #     True        omitted       P(Y=Ck) = mk / m
    #     True        given         P(Y=Ck) = class_prior
    nb = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
def main():
    """Run a leave-one-chunk-out evaluation of GaussianNB over a chunked CSV.

    For each ``i`` in ``range(num_chunks)`` the CSV at ``INPUT_PATH`` is read
    again in chunks of ``chunk_size`` rows.  Chunk ``i`` is held out, all
    other chunks are fed to ``partial_fit``, and the classification report
    and confusion matrix for the held-out chunk are printed.
    """
    for held_out in range(num_chunks):
        chunks = pd.read_csv(INPUT_PATH, iterator=True, chunksize=chunk_size)
        model = GaussianNB()
        test_X = pd.DataFrame()
        test_y = pd.DataFrame()

        for position, chunk in enumerate(chunks):
            labels = chunk['failure']
            features = chunk.drop(labels=['failure'], axis=1)
            if position != held_out:
                model.partial_fit(features, labels, classes=[0, 1])
                continue
            # This chunk is the evaluation set for the current round.
            test_X, test_y = features, labels

        predictions = model.predict(test_X)
        print(classification_report(test_y, predictions, output_dict=True))
        print(confusion_matrix(test_y, predictions))
def predict(self):
    """Self-train GaussianNB by promoting the most confident predictions.

    A quarter of the labelled data fits the initial classifier; the other
    three quarters act as an unlabeled pool.  Each round ranks the pool by
    predicted probability and, five samples at a time, feeds the most
    confident ones back through ``partial_fit``: class '0' samples weighted
    by ``self.Lambda``, class '1' samples with weight 1.  The loop ends when
    five or fewer pool samples remain or a round promotes nothing.

    Returns:
        The ``classification_report`` text for ``self.test`` against
        ``self.testLabels`` (the report is also printed).
    """
    batch = 5  # number of pool samples promoted per step

    classifier = GaussianNB()
    X_train, X_test, y_train, y_test = train_test_split(
        self.training, self.trainingLabels, test_size=0.75, random_state=33)
    classifier.fit(X_train, y_train)

    # predict_proba columns are appended after the feature columns, so they
    # sit at n_features (class '0') and n_features + 1 (class '1').  The
    # original hard-coded 5 and 6, which is only right for 5 features.
    n_features = np.asarray(X_test).shape[1]
    p0_col, p1_col = n_features, n_features + 1

    print('Enhanced classifier...')
    # Checking the pool size up front avoids indexing row 4 of a short pool.
    while len(X_test) > batch:
        promoted = False
        probabilities = classifier.predict_proba(X_test)
        rows = np.hstack((X_test, probabilities))

        # Most confident class '0' candidates first.
        ranked = sorted(rows, key=lambda r: r[p0_col], reverse=True)
        if ranked[batch - 1][p0_col] > ranked[batch - 1][p1_col]:
            # A list comprehension keeps the slicing valid on Python 3,
            # where ``map`` returns an iterator.
            features = [r[:n_features] for r in ranked]
            classifier.partial_fit(
                features[:batch],
                ['0'] * batch,
                classes=['0', '1'],
                sample_weight=np.ones(batch, dtype=float) * self.Lambda)
            rows = ranked[batch:]
            X_test = features[batch:]
            promoted = True
            if len(X_test) <= batch:
                break

        # Most confident class '1' candidates first.  As in the original,
        # probabilities are not recomputed between the two steps.
        ranked = sorted(rows, key=lambda r: r[p1_col], reverse=True)
        if ranked[batch - 1][p0_col] < ranked[batch - 1][p1_col]:
            features = [r[:n_features] for r in ranked]
            classifier.partial_fit(
                features[:batch],
                ['1'] * batch,
                classes=['0', '1'],
                sample_weight=np.ones(batch, dtype=float))
            X_test = features[batch:]
            promoted = True

        # Without this guard a pool with fewer than five confident samples
        # per class would never shrink and the loop would spin forever.
        if not promoted:
            break

    pred_labels = classifier.predict(self.test)
    report = classification_report(self.testLabels, pred_labels, digits=4)
    print('naive_bayes with EM algorithm:')
    print(report)
    return report
def test():
    """Print the prediction of a batch-fitted and an incrementally fitted
    GaussianNB for the same query point on a six-sample toy data set."""
    samples = np.array([[-1, -1], [-2, -1], [-3, -2],
                        [1, 1], [2, 1], [3, 2]])
    labels = np.array([1, 1, 1, 2, 2, 2])
    query = [[-0.8, -1]]

    # fit() and partial_fit() both return the estimator itself.
    batch_model = GaussianNB().fit(samples, labels)
    print(batch_model.predict(query))

    incremental_model = GaussianNB().partial_fit(
        samples, labels, np.unique(labels))
    print(incremental_model.predict(query))
def bayes_test():
    """Fit GaussianNB on toy data with fit() and with partial_fit() and
    print each model's prediction for the point (-0.8, -1)."""
    points = np.array([[-1, -1], [-2, -1], [-3, -2],
                       [1, 1], [2, 1], [3, 2]])
    targets = np.array([1, 1, 1, 2, 2, 2])
    probe = [[-0.8, -1]]

    full_fit = GaussianNB()
    full_fit.fit(points, targets)
    print(full_fit.predict(probe))

    # partial_fit needs the complete class list on its first call.
    known_classes = np.unique(targets)
    online_fit = GaussianNB()
    online_fit.partial_fit(points, targets, known_classes)
    print(online_fit.predict(probe))
def main():
    """Print GaussianNB predictions for (-0.8, -1) after a batch fit and
    after an incremental fit on the module-level ``X`` / ``Y`` data."""
    probe = [[-0.8, -1]]

    # Gaussian naive Bayes classifier trained in one shot.
    batch_model = GaussianNB().fit(X, Y)
    # Vector of predictions.
    print(batch_model.predict(probe))

    # The same data supplied through the incremental API.
    online_model = GaussianNB().partial_fit(X, Y, np.unique(Y))
    print(online_model.predict(probe))
def trainGaussianNB(X,y,loadweights):
    """Train a GaussianNB model and persist it as a pickle.

    Args:
        X: Feature matrix.
        y: Labels; the classes are fixed to 0 and 1.
        loadweights: When truthy, training resumes from the classifier
            stored in ``weights/GaussianNB.pickle`` instead of a new one.

    The classifier receives ten ``partial_fit`` passes over the same data,
    is written back to the pickle file, and its accuracy on ``(X, y)`` is
    printed.
    """
    weights_path = 'weights/GaussianNB.pickle'
    print("Training GaussianNB...")

    if loadweights:
        with open(weights_path, 'rb') as handle:
            classifier = pickle.load(handle)
    else:
        classifier = GaussianNB()

    remaining_passes = 10
    while remaining_passes > 0:
        classifier.partial_fit(X, y, classes=[0, 1])
        remaining_passes -= 1

    with open(weights_path, 'wb') as handle:
        pickle.dump(classifier, handle, protocol=pickle.HIGHEST_PROTOCOL)

    accuracy = classifier.score(X, y)
    print(accuracy)
def test_gnb_partial_fit():
    """Check that partial_fit reproduces the parameters learned by fit."""
    all_classes = np.unique(y)
    reference = GaussianNB().fit(X, y)

    def assert_same_parameters(candidate):
        # Means, variances and priors must all agree with the reference.
        assert_array_almost_equal(reference.theta_, candidate.theta_)
        assert_array_almost_equal(reference.sigma_, candidate.sigma_)
        assert_array_almost_equal(reference.class_prior_,
                                  candidate.class_prior_)

    # Whole data set in a single incremental call.
    one_call = GaussianNB().partial_fit(X, y, all_classes)
    assert_same_parameters(one_call)

    # Even rows first, odd rows second.
    two_calls = GaussianNB().partial_fit(X[0::2, :], y[0::2], all_classes)
    two_calls.partial_fit(X[1::2], y[1::2])
    assert_same_parameters(two_calls)
class DrunkLearningNB(DrunkLearning):
    """DrunkLearning variant backed by an incrementally trained GaussianNB."""

    def __init__(self):
        super(DrunkLearningNB, self).__init__()
        self.filename = 'modelNB.pkl'
        self.clf = GaussianNB()

    def partial_fit(self, X, y):
        """Update the model with one sample and persist it.

        ``X`` is wrapped in an extra list so that a single feature vector
        becomes a one-row matrix; ``y`` is converted to an array as given.
        The classes are fixed to 0 and 1.  After the update the classifier
        is dumped to ``self.filename`` with compression level 9.
        """
        sample_matrix = np.array([X])
        targets = np.array(y)
        self.clf.partial_fit(sample_matrix, targets, [0, 1])
        joblib.dump(self.clf, self.filename, compress=9)
def nb_partialFit(X, Y):
    """Train GaussianNB on a sparse matrix in dense chunks of 10000 rows.

    Args:
        X: Sparse feature matrix (must support row slicing and
            ``toarray()``).
        Y: Labels aligned with the rows of ``X``; classes are 0 and 4.

    Returns:
        The fitted ``GaussianNB`` instance.

    The original computed ``int(m / chunk_size) - 1`` chunks, which silently
    skipped the last one to two chunks and left the model unfitted for fewer
    than 20000 rows.  All rows are now used, including a final short chunk.
    """
    chunk_size = 10000
    n_rows = X.shape[0]
    clf = GaussianNB()
    for start in tqdm(range(0, n_rows, chunk_size)):
        stop = start + chunk_size
        # toarray() gives a plain ndarray; todense() gives np.matrix, which
        # recent scikit-learn releases reject.
        clf.partial_fit(X[start:stop].toarray(), Y[start:stop],
                        classes=[0, 4])
    return clf
def test_mixednb_all_continuous():
    """MixedNB with only continuous features must agree with GaussianNB."""
    X, y, _types = _classification_task()
    labels = ['a', 'b', 'c']

    reference = GaussianNB()
    reference.partial_fit(X, y, classes=labels)
    expected = reference.predict_proba(X)

    # Declare all five columns as non-nominal.
    candidate = MixedNB(is_nominal=[False] * 5)
    candidate.partial_fit(X, y, classes=labels)
    actual = candidate.predict_proba(X)

    assert np.allclose(actual, expected)
def perform_naive_bayes(train_X, train_Y, test_X, test_Y):
    """Train GaussianNB in two halves and score it on the test set.

    The training data is split at ``ROWS // 2`` and fed to ``partial_fit``
    in two calls to limit peak memory.  Classes are fixed to 0..4.

    Returns:
        A tuple ``(f_beta, accuracy)`` where ``f_beta`` is the macro-averaged
        F-beta score with ``beta=0.1``.
    """
    # Integer division: ``ROWS / 2`` is a float on Python 3 and cannot be
    # used as a slice index.  On Python 2 the result is unchanged.
    partial_size = ROWS // 2

    gnb = GaussianNB()
    # The first partial_fit call must receive the complete class list.
    gnb.partial_fit(train_X[partial_size:], train_Y[partial_size:],
                    classes=np.arange(0, 5))
    gnb.partial_fit(train_X[:partial_size], train_Y[:partial_size])

    pred_Y = gnb.predict(test_X)
    # ``beta`` is passed by keyword; newer scikit-learn versions no longer
    # accept it positionally.
    f_beta = fbeta_score(test_Y, pred_Y, beta=0.1, average='macro')
    return f_beta, accuracy_score(test_Y, pred_Y)
def gaussian():
    """
    Gaussian naive Bayes demo: print the prediction for (-0.8, -1) from a
    model trained with fit() and from one trained with partial_fit().
    :return: None
    """
    features = np.array([[-1, -1], [-2, -1], [-3, -2],
                         [1, 1], [2, 1], [3, 2]])
    classes = np.array([1, 1, 1, 2, 2, 2])
    sample = [[-0.8, -1]]

    models = (
        GaussianNB().fit(features, classes),
        GaussianNB().partial_fit(features, classes, np.unique(classes)),
    )
    for model in models:
        print(model.predict(sample))
    return None
def predict(self):
    """Self-train GaussianNB by promoting confident pool samples.

    A quarter of the labelled data fits the initial classifier; the other
    three quarters act as an unlabeled pool.  Each round ranks the pool by
    the predicted probability of class '0' and promotes five samples at a
    time through ``partial_fit``: as class '0' weighted by ``self.Lambda``,
    or as class '1' with weight 1.  The loop ends once five or fewer pool
    samples remain.

    Returns:
        The predicted labels for ``self.test``.
    """
    # Column layout after hstack: five feature columns, then P('0'), P('1').
    n_features = 5
    p0_col, p1_col = 5, 6

    classifier = GaussianNB()
    X_train, X_test, y_train, y_test = train_test_split(
        self.training, self.trainingLabels, test_size=0.75, random_state=33)
    classifier.fit(X_train, y_train)

    print('Enhanced classifier...')
    while len(X_test) > 5:
        probabilities = classifier.predict_proba(X_test)
        X_test_labels = np.hstack((X_test, probabilities))

        ranked = sorted(X_test_labels, key=lambda r: r[p0_col], reverse=True)
        if ranked[4][p0_col] > ranked[4][p1_col]:
            # A list comprehension keeps the slicing valid on Python 3,
            # where ``map`` returns an iterator.
            features = [r[:n_features] for r in ranked]
            top = features[0:5]
            classifier.partial_fit(
                top, ['0', '0', '0', '0', '0'], classes=['0', '1'],
                # ``float`` replaces the removed ``np.float`` alias.
                sample_weight=np.ones(len(top), dtype=float) * self.Lambda)
            X_test_labels = ranked[5:]
            X_test = features[5:]
        if len(X_test) < 6:
            break

        # NOTE(review): this ranking still uses the class '0' probability,
        # so the five samples labelled '1' below are those most confident
        # for class '0'.  Sorting by the class '1' column looks intended;
        # behaviour is left unchanged pending confirmation.
        ranked = sorted(X_test_labels, key=lambda r: r[p0_col], reverse=True)
        if ranked[4][p0_col] <= ranked[4][p1_col]:
            features = [r[:n_features] for r in ranked]
            top = features[0:5]
            classifier.partial_fit(
                top, ['1', '1', '1', '1', '1'], classes=['0', '1'],
                sample_weight=np.ones(len(top), dtype=float) * 1)
            X_test_labels = ranked[5:]
            X_test = features[5:]

    pred_labels = classifier.predict(self.test)
    print('naive_bayes with EM algorithm:')
    return pred_labels
def train_NB():
    """Cross-validate GaussianNB on ``bugSample_encoded.csv`` (5 folds).

    The last CSV column is the label (classes 0..6), all other columns are
    features.  The mean accuracy over the folds is printed.

    The original reused one classifier for every fold, so each later fold
    was evaluated by a model that had already been trained on its test rows,
    and only the last fold's accuracy was reported.  A fresh classifier is
    now trained per fold and the accuracies are averaged.
    """
    data = pandas.read_csv("bugSample_encoded.csv").values
    X = np.array(data[:, 0:-1])
    y = np.array(data[:, -1])

    kf = KFold(n_splits=5)
    print(GaussianNB())

    fold_accuracies = []
    for train_index, test_index in kf.split(X):
        clf = GaussianNB()
        clf.partial_fit(X[train_index], y[train_index],
                        classes=[0, 1, 2, 3, 4, 5, 6])
        predicted = clf.predict(X[test_index])
        fold_accuracies.append(accuracy_score(y[test_index], predicted))

    acc = np.mean(fold_accuracies)
    print("Naive Bayes: ", acc)
def fit(self, X, y):
    """Train the expert ensemble online and track hold-out accuracy.

    The last sixth of ``(X, y)`` is held out; the remainder is streamed one
    sample at a time.  Every ``self.period`` samples, experts that
    mispredict are down-weighted by ``self.beta``, weights are normalised,
    ``removeExpert`` is called, and a new expert is created if the weighted
    vote was wrong.  Every expert and a reference GaussianNB are then
    updated with the sample.

    Returns:
        A tuple ``(sizes, acc, nb_acc)`` with one entry per streamed sample:
        the ensemble size, the ensemble's hold-out accuracy and the
        reference GaussianNB's hold-out accuracy.
    """
    train_size = len(X)
    # One split index for both slices.  The original computed the two
    # boundaries with different roundings, which on Python 3 could put one
    # sample in both the training stream and the hold-out set.
    split = train_size - int(train_size / 6)
    x_test, y_test = X[split:train_size], y[split:train_size]
    X, y = X[0:split], y[0:split]

    self.createExpert()
    # The class list is loop-invariant, so compute it once.
    classes = np.unique(y)
    # NOTE(review): the vote tally is never reset between samples, so votes
    # accumulate over the whole stream; confirm this is intended.
    predictions = np.zeros((len(classes), ))
    max_weight = 0
    acc = []
    nb_acc = []
    sizes = []
    nb = GaussianNB()

    for i, sample in enumerate(X):
        for j, exp in enumerate(self.experts):
            y_hat = int(self.getExpertPrediction(exp, sample))
            if (y_hat != y[i]) and (i % self.period == 0):
                self.weights[j] *= self.beta
                print(
                    str(self.experts[j]) + "th Expert Weight: " +
                    str(self.weights[j]))
            predictions[y_hat] += self.weights[j]
            max_weight = max(max_weight, self.weights[j])

        y_hat = np.array([np.argmax(predictions)])
        if i % self.period == 0:
            self.normalizeWeights(max_weight)
            self.removeExpert()
            if y_hat != y[i]:
                self.createExpert()

        # Reference model trained on exactly the same stream.
        nb.partial_fit([sample], [y[i]], classes)
        nb_pred = nb.predict(x_test)
        nb_acc.append(accuracy_score(y_test, nb_pred))

        for exp in self.experts:
            exp.partial_fit([sample], [y[i]], classes)
        sizes.append(len(self.experts))
        acc.append(accuracy_score(self.predict(x_test), y_test))

    return sizes, acc, nb_acc
def trainClassifier(partition_data,cols):
    """Fit a GaussianNB on the rows of one data partition.

    Args:
        partition_data: Indexable whose element 1 is an iterable of rows;
            each row holds the feature values followed by the label.
        cols: Object whose ``value`` attribute is the number of feature
            columns (and therefore the index of the label column).

    Returns:
        The GaussianNB fitted on this partition, with classes 0 and 1.
    """
    # TODO: change and run in parallel.
    rows = np.array(list(partition_data[1]))
    features = rows[:, 0:cols.value]
    targets = rows[:, cols.value]
    # The first call to partial_fit must list every class.
    return GaussianNB().partial_fit(features, targets, [0, 1])
def naiveBayesClassifier(data, output_dir):
    """
    Train a Gaussian naive Bayes classifier on the HDF5 feature files
    produced by the 'features' step and store it with joblib.

    Parameters
    ----------
    data : str
        Path to the working data directory containing the ``features``
        sub-directory
    output_dir : str
        Path to the output directory; the model is written to
        ``<output_dir>/classifiers/gnb.joblib``
    """
    features_dir = os.path.join(data, 'features')
    feature_files = [
        os.path.join(features_dir, name) for name in os.listdir(features_dir)
    ]

    clf = GaussianNB()
    # The file chosen by scan_for_start() seeds the model through fit().
    starting_index = scan_for_start(feature_files)
    with h5py.File(feature_files[starting_index], 'r') as hf:
        clf.fit(hf['vectors'][:], hf['labels'][:])
    # Remove it from the list so it is not trained on again.
    feature_files.pop(starting_index)

    for position, path in enumerate(feature_files, start=1):
        if position % 10 == 0:
            print("Training GNB on file {} of {}".format(
                position, len(feature_files) + 1))
        with h5py.File(path, 'r') as hf:
            vectors = hf['vectors'][:]
            labels = hf['labels'][:]
        clf.partial_fit(vectors, labels)

    output_path = os.path.join(output_dir, 'classifiers')
    if not os.path.isdir(output_path):
        os.makedirs(output_path)
    joblib.dump(clf, os.path.join(output_path, 'gnb.joblib'))
class GaussianBatchNB(TransformerMixin):
    """GaussianNB wrapper that densifies sparse input one batch at a time.

    ``batch_size`` rows are converted with ``toarray()`` per step, both when
    training (through ``partial_fit``) and when predicting.  ``classes`` is
    passed to every ``partial_fit`` call; extra positional and keyword
    arguments are forwarded to the ``GaussianNB`` constructor.
    """

    def __init__(self, batch_size, classes, *args, **kwargs):
        self._batch_size = batch_size
        self._classes = classes
        self._args = args
        self._kwargs = kwargs
        self._model = GaussianNB(*args, **kwargs)

    def fit(self, x, y, **fit_params):
        """Train a fresh model on ``(x, y)`` batch by batch; return self."""
        step = self._batch_size
        self._model = GaussianNB(*self._args, **self._kwargs)
        for stop in tqdm(range(step, x.shape[0] + step, step)):
            start = stop - step
            dense_rows = x[start:stop, :].toarray()
            self._model.partial_fit(
                dense_rows, y[start:stop], classes=self._classes)
        return self

    @staticmethod
    def transform(x, y=None, **fit_params):
        """Return ``x`` unchanged."""
        return x

    def predict(self, x):
        """Predict labels for ``x`` batch by batch as a flat array."""
        step = self._batch_size
        labels = [
            label
            for stop in tqdm(range(step, x.shape[0] + step, step))
            for label in self._model.predict(
                x[stop - step:stop, :].toarray()).tolist()
        ]
        return np.array(labels).ravel()

    def score(self, x, y):
        """Return the accuracy of ``predict(x)`` against ``y``."""
        return accuracy_score(y, self.predict(x))

    def __str__(self):
        return "GaussianBatchNB()"

    def __repr__(self):
        return self.__str__()
class TfidfGaussianNB:
    """TF-IDF text features fed to an incrementally trained GaussianNB.

    Args:
        nfeats: Maximum vocabulary size learned by the vectorizer.
        vocab: Optional fixed vocabulary for the vectorizer.
    """

    def __init__(self, nfeats=300, vocab=None):
        self.clf = GaussianNB()
        self.vectorizer = TfidfVectorizer(max_features=nfeats,
                                          dtype=np.float32,
                                          vocabulary=vocab)
        # Set once train() has fitted the vectorizer on the training corpus.
        self._vectorizer_fitted = False

    def train(self, train_data, train_labels, classes,
              feature_selection=False, percentile=100, batch_size=1000):
        """Fit the vectorizer once, then train the classifier in batches.

        Args:
            train_data: Sequence of documents.
            train_labels: Labels aligned with ``train_data``.
            classes: Complete class list, passed to ``partial_fit``.
            feature_selection: When true, keep only the top ``percentile``
                percent of terms by chi-squared score.
            percentile: Percentile used by the feature selection.
            batch_size: Number of documents densified per ``partial_fit``.
        """
        if feature_selection:
            selector = SelectPercentile(chi2, percentile=percentile)
            selector.fit(self.vectorizer.fit_transform(train_data),
                         train_labels)
            # ``vocabulary_`` maps each term to its column index and exists
            # whether or not a vocabulary was passed to the constructor.
            # The original read the ``vocabulary`` constructor argument,
            # which is None unless the caller supplied one.
            fitted_vocab = self.vectorizer.vocabulary_
            terms = np.array(sorted(fitted_vocab, key=fitted_vocab.get))
            new_vocab = list(terms[selector.get_support()])
            self.vectorizer = TfidfVectorizer(dtype=np.float32,
                                              vocabulary=new_vocab)
            print(len(self.vectorizer.vocabulary))

        # Fit once on the whole corpus so every batch shares the same
        # columns and IDF weights.  The original refitted per batch.
        self.vectorizer.fit(train_data)
        self._vectorizer_fitted = True

        for i in range(0, len(train_data), batch_size):
            print(i)
            X = self.vectorizer.transform(
                train_data[i:i + batch_size]).toarray()
            self.clf.partial_fit(X, train_labels[i:i + batch_size],
                                 classes=classes)

    def predict(self, data):
        """Return the predicted labels for the documents in ``data``."""
        if getattr(self, '_vectorizer_fitted', False):
            # Use the training statistics; never refit on prediction data.
            features = self.vectorizer.transform(data)
        else:
            # load_model() restores only the classifier, so an instance that
            # was never trained keeps the original fit_transform behaviour.
            features = self.vectorizer.fit_transform(data)
        return self.clf.predict(features.toarray())

    def load_model(self, filename):
        """Load the classifier from ``filename + '.p'``."""
        # NOTE(review): pickle can execute arbitrary code; only load model
        # files from trusted sources.
        with open(filename + ".p", 'rb') as fp:
            self.clf = pickle.load(fp)

    def save_model(self, filename):
        """Pickle the classifier (not the vectorizer) to ``filename + '.p'``."""
        # The context manager closes the handle the original left open.
        with open(filename + '.p', 'wb') as fp:
            pickle.dump(self.clf, fp)
def biased_naive_bayes(df, X_test, Y_test):
    """Train a GaussianNB that sees the failing rows more often.

    Rows with ``G3 == 0`` (fail) are fed to ``partial_fit`` before and after
    the rows with ``G3 == 1`` (pass).  The test accuracy is printed and
    ``confuse`` is called with the predictions on the full training frame.

    Note that the ``G3`` column is popped from ``df`` itself, so the
    caller's frame loses that column.

    Returns:
        The trained GaussianNB.
    """
    known_classes = [0, 1]

    X_fail = df.copy(deep=True).loc[df["G3"] == 0]
    X_pass = df.copy(deep=True).loc[df["G3"] == 1]

    # G3 is the target; what remains in each frame is the feature set.
    Y = df.pop("G3")
    Y_fail = X_fail.pop("G3")
    Y_pass = X_pass.pop("G3")
    X = df

    gnb = GaussianNB()

    def fit_failures():
        # NOTE(review): iterating the tuple (0, 3) runs the body twice;
        # range(0, 3) may have been intended.
        for _ in (0, 3):
            gnb.partial_fit(X_fail, Y_fail, known_classes)

    fit_failures()
    gnb.partial_fit(X_pass, Y_pass, known_classes)
    fit_failures()

    print("\n\nGuassian Naive Bayes (Boosted) Accuracy: ",
          gnb.score(X_test, Y_test))
    confuse(Y, gnb.predict(X))
    return gnb
def demoOne():
    """Walk through the main GaussianNB methods on a six-sample toy set.

    Prints, in order: the predicted class, the class probabilities, the log
    probabilities and the score for the point (-0.8, -1), then the results
    of ``partial_fit`` and ``set_params``.

    Returns:
        The toy ``(X, y)`` arrays.
    """
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    y = np.array([1, 1, 1, 2, 2, 2])
    probe = [[-0.8, -1]]

    clf = GaussianNB(priors=None)
    clf.fit(X, y)

    print(clf.predict(probe))
    print('predict_prob: ', clf.predict_proba(probe))
    print('predict_log_prob: ', clf.predict_log_proba(probe))
    # Scoring the probe against its own prediction.
    print(clf.score(probe, clf.predict(probe)))
    print(clf.partial_fit(X, y, classes=np.unique(y)))
    print(clf.set_params())
    return X, y
def getgrade():
    """Return the GaussianNB prediction for one fixed 14-value sample.

    ``train_model`` is still called on the module-level train/test split,
    but its result is not used for the prediction; a separate GaussianNB is
    fitted on the module-level ``x`` / ``y`` through ``partial_fit``.
    """
    model = train_model(x_train, y_train, x_test, y_test, GaussianNB)

    sample = [[499, 30, 30, 65, 43, 16, 47, 134, 3, 16, 59, 102, 65, 0]]
    gnb = GaussianNB()
    gnb.partial_fit(x, y, np.unique(y))
    return gnb.predict(sample)
def get_class_name(dataHandledPath):
    """Train GaussianNB on TF-IDF features of the pre-processed documents.

    Every file returned by ``getFilePath(dataHandledPath)`` holds a Python
    list literal of tokens; its class is looked up in ``all_class_name``
    from the name of the directory that contains it.  The prediction for
    the first document is printed.  Nothing is returned.
    """
    filePath = getFilePath(dataHandledPath)

    # Labels for this call are kept locally so a second call is not fitted
    # against leftovers in the module-level ``file_class`` list, which is
    # still extended for existing readers.
    labels = [all_class_name[path.split('/')[-2]] for path in filePath]
    file_class.extend(labels)
    print(file_class)

    allText = []
    for tfp in filePath:
        with codecs.open(tfp, 'rb') as co:
            raw = co.read()
        encodeInfo = chardet.detect(raw)
        text = raw.decode(encodeInfo["encoding"])
        # The file stores the processed token list as a Python literal.
        allText.append(" ".join(ast.literal_eval(text)))
    print(len(allText))

    # Build the TF-IDF model and its matrix form.
    tfidf = TfidfVectorizer()
    tfidfModel = tfidf.fit(allText)
    tfidfResult = tfidfModel.transform(allText)
    print(tfidfResult)
    # toarray() gives a plain ndarray; todense() gives np.matrix, which
    # recent scikit-learn releases reject.
    matrixResult = tfidfResult.toarray()

    model = GaussianNB()
    # One pass is enough.  The original repeated this same full-data call
    # once per document, which does not change the fitted means, variances
    # or priors but costs a quadratic amount of work.
    model.partial_fit(matrixResult, labels, list(all_class_name.values()))

    # Slice instead of index so the input stays two-dimensional.
    predicted = model.predict(matrixResult[:1])
    print(predicted)
def otrosEjemplos(ejemplo):
    """Run one of three small naive Bayes examples.

    Args:
        ejemplo: 0 for GaussianNB on hand-written data, 1 for GaussianNB on
            the Iris data set, 2 for MultinomialNB on random counts.  Any
            other value does nothing.

    The bare ``GaussianNB(priors=None)`` / ``MultinomialNB(...)`` expressions
    of the original were pasted interpreter output that built and discarded
    an estimator; they have been removed.
    """
    switcherExample = {
        0: "Ejemplo 1: Clasificador Gaussiano Naive Bayes--> Caso más sencillo con datos manuales",
        1: "Ejemplo 2: Clasificador Gaussiano Naive Bayes--> Caso Datos de Iris",
        2: "Ejemplo 3: Clasificador Multnomial Naive Bayes --> Caso más sencillo con datos random",
    }
    if ejemplo not in switcherExample:
        return

    if ejemplo == 0:
        # Example data.
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
        Y = np.array([1, 1, 1, 2, 2, 2])
        # Naive Bayes classifier trained in one shot.
        clf = GaussianNB()
        clf.fit(X, Y)
        print(clf.predict([[-0.8, -1]]))
        # The same data through the incremental API.
        clf_pf = GaussianNB()
        clf_pf.partial_fit(X, Y, np.unique(Y))
        print(clf_pf.predict([[-0.8, -1]]))
    elif ejemplo == 1:
        iris = datasets.load_iris()
        gnb = GaussianNB()
        y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
        print("Number of mislabeled points out of a total %d points : %d" %
              (iris.data.shape[0], (iris.target != y_pred).sum()))
    elif ejemplo == 2:
        X = np.random.randint(5, size=(6, 100))
        y = np.array([1, 2, 3, 4, 5, 6])
        clf = MultinomialNB()
        clf.fit(X, y)
        print(clf.predict(X[2:3]))
def train3(trainData, classes=None, batch_size=100):
    """Train GaussianNB incrementally and dump it to /tmp/sk_example.pkl.

    Args:
        trainData: Iterable of records with ``"features"`` and ``"label"``
            entries.
        classes: Complete list of class labels.  When omitted, the records
            are materialised into a list first so the labels can be
            collected.
        batch_size: Number of records per ``partial_fit`` call.

    Raises:
        ValueError: If ``trainData`` yields no records.

    The original never passed ``classes`` to the first ``partial_fit`` call
    (which scikit-learn requires), dropped the final incomplete batch, and
    hit an unbound ``model`` for fewer than 100 records.
    """
    if classes is None:
        trainData = list(trainData)
        classes = list({record["label"] for record in trainData})

    gnb = GaussianNB()
    fitted = False
    X = []
    y = []
    for record in trainData:
        X.append(record["features"])
        y.append(record["label"])
        if len(X) == batch_size:
            gnb.partial_fit(X, y, classes=classes)
            fitted = True
            X = []
            y = []

    # Train on whatever is left over after the last full batch.
    if X:
        gnb.partial_fit(X, y, classes=classes)
        fitted = True

    if not fitted:
        raise ValueError("trainData is empty; nothing to train on")
    joblib.dump(gnb, '/tmp/sk_example.pkl')
def genCharsChrom(genNum):
    """Create an individual wrapping a GaussianNB fitted on a random sample.

    Twenty indices are drawn with replacement from the first 50 rows of the
    module-level ``X`` / ``Y``.  The first row labelled 1 and the first row
    labelled -1 (if any) are appended so both classes are represented, and
    a GaussianNB is fitted on the selection.

    Args:
        genNum: Stored alongside the classifier in the individual.

    Returns:
        ``Individual([clf, genNum])``.
    """
    pool_size = 50
    # randint's upper bound is exclusive, so this matches the inclusive
    # range of the deprecated random_integers(0, 49, 20).
    picks = np.random.randint(0, pool_size, 20)

    for wanted in (1, -1):
        for i in range(pool_size):
            if Y[i] == wanted:
                picks = np.append(picks, [i])
                break

    clf = GaussianNB().partial_fit(X[picks], Y[picks], [1, -1])
    return Individual([clf, genNum])
def genCharsChrom():
    """Return a GaussianNB fitted on a random sample of the training pool.

    Twenty indices are drawn with replacement from the first 100 rows of
    the module-level ``X`` / ``Y``.  The first row labelled 1 and the first
    row labelled -1 (if any) are appended so both classes are represented,
    and a GaussianNB is fitted on the selection.

    Returns:
        The fitted classifier.
    """
    pool_size = 100
    # randint's upper bound is exclusive, so this matches the inclusive
    # range of the deprecated random_integers(0, 99, 20).
    picks = np.random.randint(0, pool_size, 20)

    for wanted in (1, -1):
        for i in range(pool_size):
            if Y[i] == wanted:
                picks = np.append(picks, [i])
                break

    clf = GaussianNB()
    clf = clf.partial_fit(X[picks], Y[picks], [1, -1])
    return clf
class MixedNB(BaseEstimator, ClassifierMixin):
    """Naive Bayes for a mix of continuous and nominal features.

    For continuous variables the likelihood of the features is assumed to be
    Gaussian and for nominal features the likelihood is multinomial.
    """

    def __init__(self, is_nominal=None, alpha=1., class_prior=None):
        """
        :param is_nominal: boolean array indicating which columns are nominal.
        For technical reasons, continuous variables must go first in X matrix!
        :param alpha: see MultinomialNB
        :param class_prior: see MultinomialNB
        """
        self.is_nominal = is_nominal
        # BaseEstimator.get_params() reads every constructor argument back
        # as an attribute, so these must be stored for get_params(), clone()
        # and repr() to work.
        self.alpha = alpha
        self.class_prior = class_prior
        self.multi_nb = MultinomialNB(alpha=alpha, class_prior=class_prior,
                                      fit_prior=class_prior is None)
        self.gauss_nb = GaussianNB(priors=class_prior)

    def _is_nominal(self, X):
        """Return the nominal-column mask as an array after validating it."""
        is_nominal = np.array(self.is_nominal)
        assert X.shape[1] == len(is_nominal), 'length of is_nominal array is not the same as number of features'
        assert all(sorted(is_nominal) == is_nominal), 'Continuous variables must go first in X matrix'
        return is_nominal

    def _gauss_variance(self):
        """Return the per-class feature variances of the Gaussian model.

        Newer scikit-learn versions call the attribute ``var_``; older ones
        only provide ``sigma_``.
        """
        if hasattr(self.gauss_nb, 'var_'):
            return self.gauss_nb.var_
        return self.gauss_nb.sigma_

    def fit(self, X, y, **kwargs):
        """Fit the sub-models on their respective columns; return self."""
        is_nominal = self._is_nominal(X)
        if any(is_nominal):
            self.multi_nb.fit(X[:, is_nominal], y, **kwargs)
        if any(~is_nominal):
            self.gauss_nb.fit(X[:, ~is_nominal], y, **kwargs)
        return self

    def partial_fit(self, X, y, **kwargs):
        """Incrementally fit the sub-models on their columns; return self."""
        is_nominal = self._is_nominal(X)
        if any(is_nominal):
            self.multi_nb.partial_fit(X[:, is_nominal], y, **kwargs)
        if any(~is_nominal):
            self.gauss_nb.partial_fit(X[:, ~is_nominal], y, **kwargs)
        return self

    def _multi_joint_log_likelihood(self, X):
        """Log-likelihood contribution of the nominal columns (0 if none)."""
        is_nominal = self._is_nominal(X)
        if all(~is_nominal):
            return 0
        return X[:, is_nominal].dot(self.multi_nb.feature_log_prob_.T)

    def _gauss_joint_log_likelihood(self, X):
        """Log-likelihood contribution of the continuous columns (0 if none)."""
        is_nominal = self._is_nominal(X)
        if all(is_nominal):
            return 0
        X = X[:, ~is_nominal]
        variance = self._gauss_variance()
        joint_log_likelihood = []
        for i in range(np.size(self.gauss_nb.classes_)):
            # Gaussian log-density summed over the features for class i.
            n_ij = - 0.5 * np.sum(np.log(2. * np.pi * variance[i, :]))
            n_ij -= 0.5 * np.sum(((X - self.gauss_nb.theta_[i, :]) ** 2) /
                                 (variance[i, :]), 1)
            joint_log_likelihood.append(n_ij)
        joint_log_likelihood = np.array(joint_log_likelihood).T
        return joint_log_likelihood

    @property
    def classes_(self):
        """Class labels, taken from whichever sub-model has been fitted."""
        if hasattr(self.multi_nb, 'classes_'):
            return self.multi_nb.classes_
        else:
            return self.gauss_nb.classes_

    @property
    def class_log_prior_(self):
        """Log class priors, clipped below at -1e10."""
        if hasattr(self.multi_nb, 'class_log_prior_'):
            return np.maximum(self.multi_nb.class_log_prior_, -1e10)
        else:
            return np.maximum(np.log(self.gauss_nb.class_prior_), -1e10)

    def predict_proba(self, X):
        """Return the class posteriors for every row of ``X``."""
        loglike = self._multi_joint_log_likelihood(X) + self._gauss_joint_log_likelihood(X) + self.class_log_prior_
        norm = logsumexp(loglike, axis=1)
        probs = np.exp(loglike - norm[:, np.newaxis])
        return probs

    def predict(self, X):
        """Return the most probable class for every row of ``X``."""
        probs = self.predict_proba(X)
        return self.classes_[np.argmax(probs, 1)]

    def score(self, X, y, sample_weight=None):
        """Return the (optionally weighted) accuracy on ``(X, y)``."""
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
import numpy as np
from sklearn.naive_bayes import GaussianNB

# Six-sample toy data set with two classes.
x = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

clf = GaussianNB()
# The original called predict() on an unfitted estimator, which raises
# NotFittedError; the model has to be fitted first.
clf.fit(x, y)
print(clf.predict([[-0.8, -1]]))

# The same data supplied through the incremental API.
clf_pf = GaussianNB()
clf_pf.partial_fit(x, y, np.unique(y))
print(clf_pf.predict([[-0.8, -1]]))
class Bayes(object):
    """GaussianNB demo that classifies the ratio of two noisy signals."""

    def __init__(self):
        self.clf = GaussianNB()

    def histo_plot(self, data):
        """Draw a 30-bin density histogram of ``data`` with a fitted normal
        curve on the current matplotlib axes."""
        (mu, sigma) = norm.fit(data)
        # ``density=True`` replaces the ``normed`` argument, which newer
        # matplotlib versions removed.
        _, bins, _ = plt.hist(data, 30, density=True, facecolor='green',
                              alpha=0.75)
        # Best-fit line; ``norm.pdf`` replaces the removed mlab.normpdf.
        plt.plot(bins, norm.pdf(bins, mu, sigma), 'r--', linewidth=2)

    def training(self):
        """Generate the training set, plot it and fit the classifier."""
        # Training data: a and b, where b is two, three or four times a.
        a = np.random.normal(1, 0.1, 3000)
        b = np.append(np.random.normal(2, 0.2, 1000),
                      np.random.normal(3, 0.3, 1000))
        b = np.append(b, np.random.normal(4, 0.4, 1000))
        # X: b/a, Y: type, 2/3/4
        X = b/a
        Y = np.append(np.ones(1000) * 2, np.ones(1000) * 3)
        Y = np.append(Y, np.ones(1000) * 4)

        # Histogram of b/a, one per type.
        self.histo_plot(X[:1000])
        self.histo_plot(X[1000:2000])
        self.histo_plot(X[2000:3000])
        plt.xlabel('b/a')
        plt.ylabel('Probability')
        plt.title('Histogram of the training set b/a')
        plt.grid(True)
        plt.show()

        # b/a together with the true types.
        plt.plot(X)
        plt.plot(Y, marker='o', markersize=5, label='Type')
        plt.legend()
        plt.title("Training Set b/a, and types")
        plt.ylabel("b/a")
        plt.xlabel("time (s)")
        plt.show()

        # Bayes classifier on the single ratio feature.
        X = X.reshape(-1, 1)
        self.clf.partial_fit(X, Y, np.unique(Y))
        print(self.clf.class_prior_)
        print(self.clf.theta_)
        # Newer scikit-learn versions expose the variances as ``var_``;
        # older ones only have ``sigma_``.
        if hasattr(self.clf, 'var_'):
            print(self.clf.var_)
        else:
            print(self.clf.sigma_)

    def testing(self):
        """Generate a test set, classify it and plot the results."""
        # Testing data, c and d.
        c = np.random.normal(2, 0.2, 300)
        d = np.append(np.random.normal(4, 0.4, 100),
                      np.random.normal(6, 1, 100))
        d = np.append(d, np.random.normal(8, 1, 100))
        # X1: testing set, Y1: predicted result.  A single vectorised call
        # replaces one predict() call per sample.
        X1 = d/c
        Y1 = self.clf.predict(X1.reshape(-1, 1))

        # Histogram of d/c, one per type.
        self.histo_plot(X1[:100])
        self.histo_plot(X1[100:200])
        self.histo_plot(X1[200:300])
        plt.xlabel('d/c')
        plt.ylabel('Probability')
        plt.title('Histogram of the testing set d/c')
        plt.grid(True)
        plt.show()

        # d/c together with the predicted types.
        plt.plot(X1)
        plt.plot(Y1, marker='o', markersize=5, label='Type')
        plt.legend()
        plt.title("Testing Set d/c, and types")
        plt.ylabel("d/c")
        plt.xlabel("time (s)")
        plt.show()
# 1. Shuffle the file indices so the files are visited in random order.
indx = list(range(30))
random.shuffle(indx)
rslt1, rslt2, rslt3 = [], [], []

from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()

# The first round uses file 0 for training and file 29 for scoring; it also
# registers the six classes with the classifier.
X, y = np.loadtxt(tif_name[0]), np.loadtxt(lab_name[0])
Xt, yt = np.loadtxt(tif_name[29]), np.loadtxt(lab_name[29])
clf.partial_fit(X, y, classes=np.arange(6).reshape(-1, 1))
scr = clf.score(Xt, yt)
rslt3.append(scr)

# Each later round trains on file n and scores on the mirrored file 29 - n.
for n in tqdm(indx[1:-15]):
    X, y = np.loadtxt(tif_name[n]), np.loadtxt(lab_name[n])
    Xt, yt = np.loadtxt(tif_name[29 - n]), np.loadtxt(lab_name[29 - n])
    clf.partial_fit(X, y)
    scr = clf.score(Xt, yt)
    rslt3.append(scr)
print(rslt3)
plt.rcParams.update({
    'ytick.labelsize': 20,
    'legend.fontsize': 20,
    'figure.titlesize': 15,
})

X_train, y_train, X_test, y_test = _Dataset()
col_test = X_test.columns
col_train = X_train.columns

# Standardise with statistics learned on the training split only, then wrap
# the arrays back into frames with the original column names.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=col_train)
X_test = pd.DataFrame(scaler.transform(X_test), columns=col_test)

# Uniform prior over the seven classes.
prior = [1 / 7] * 7

# Model trained with a single fit() call.
clf = GaussianNB(priors=prior)
clf.fit(X_train, y_train["label"])
print(clf.score(X_test, y_test["label"]))
y_predict = clf.predict(X_test)
confusion_matrix_Cisco(y_test["label"], y_predict)
Report_Matrix(y_test["label"], y_predict)

# The same data supplied through partial_fit().
clf_pf = GaussianNB(priors=prior)
clf_pf.partial_fit(X_train, y_train["label"], np.unique(y_train["label"]))
print(clf_pf.score(X_test, y_test["label"]))
y_predict_pf = clf_pf.predict(X_test)
confusion_matrix_Cisco(y_test["label"], y_predict_pf)
Report_Matrix(y_test["label"], y_predict_pf)
trainLabel1 = []
clf1 = GaussianNB()
# partial_fit must be told every class on its first call.  The original
# passed only the labels of the first batch, so a later batch containing a
# new emotion would raise ValueError.  Every class id the label map can
# produce is registered instead.
all_classes = unique(list(label.values()))
iter = 1
for sentence in sentences:
    text = sentence.string
    x = vec.vector(text)
    trainData.append(x)
    if sentence.get('opinionated') == 'N':
        emotion = 'none'
    else:
        emotion = sentence.get('emotion-1-type')
    trainLabel1.append(label[emotion])
    # Train in batches of 1000 sentences.
    if len(trainData) >= 1000:
        print('Training emotion 1...%d' % iter)
        clf1.partial_fit(array(trainData), array(trainLabel1), all_classes)
        trainData = []
        trainLabel1 = []
        iter += 1
# Train on the final, incomplete batch, which the original discarded.
if len(trainData) > 0:
    print('Training emotion 1...%d' % iter)
    clf1.partial_fit(array(trainData), array(trainLabel1), all_classes)
    trainData = []
    trainLabel1 = []
    iter += 1

print('Testing...')
output = []
# The context manager closes the file handle the original left open.
with open('../data/Testing data for Emotion Classification.xml') as xml_file:
    soup = BeautifulSoup(xml_file.read())
weibos = soup.find_all('weibo')
for weibo in weibos:
    weibo_id = weibo.get('id')
    # Only weibos with an id up to 6000 are processed.
    if int(weibo_id) > 6000:
        break
    text = ''
    sentences = weibo.find_all('sentence')
    for sentence in sentences:
        text += sentence.string + ' '
#-*- encoding:utf-8 -*-
r'''
Gaussian naive Bayes demo: the same toy data is learned once with fit()
and once with partial_fit(), and both models classify two query points.
'''
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])
queries = [[-0.8, -1], [2, 4]]

clf = GaussianNB()
print(clf.fit(X, Y))
#>>GaussianNB()
print(clf.predict(queries))
#>>[1 2]

# Incremental training: the data set is fed in chunks, which is useful when
# it is too large to handle at once.
clf_pf = GaussianNB()
print(clf_pf.partial_fit(X, Y, np.unique(Y)))
#>>GaussianNB()
print(clf_pf.predict(queries))
#>>[1 2]
import numpy as np
from sklearn.naive_bayes import GaussianNB

# Six-sample toy data set with two classes.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])

# The bare ``GaussianNB()`` expressions of the original were pasted
# interpreter output that built and discarded an estimator; they have been
# removed.
clf = GaussianNB()
clf.fit(X, Y)
print(clf.predict([[-0.8, -1]]))

# The same data supplied through the incremental API.
clf_pf = GaussianNB()
clf_pf.partial_fit(X, Y, np.unique(Y))
print(clf_pf.predict([[-0.8, -1]]))
import numpy as np
import pandas as pd
# GaussianNB was used below without ever being imported.
from sklearn.naive_bayes import GaussianNB

try:
    # ``sklearn.cross_validation`` was removed in newer scikit-learn; the
    # replacement module offers the same two functions used here.  The old
    # name is kept as an alias.
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation

print('---------- Naive Bayes -----------')
df = pd.read_csv("~/Desktop/My DM/Baltimore/Baltimore.csv", low_memory=False)
features = ["Month of the Crime", "Mean Temperature", "Mean Dew Point",
            "Mean Visibility", "Max Humidity", "Mean Wind Speed",
            "Max Sea Level"]
x = df[features]
y = df["Crime Type"]

print('Partial Fit - training classifier')
clf_pf = GaussianNB()
clf_pf.partial_fit(x, y, np.unique(y))

print('--Cross Validation--')
scores = cross_validation.cross_val_score(clf_pf, x, y, cv=5)
print(scores.mean())

print('--Random Split--')
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    x, y, test_size=0.2, random_state=0)
clf1 = GaussianNB().fit(X_train, Y_train)
print(clf1.score(X_test, Y_test))

# Test file
df_test = pd.read_csv("~/Desktop/My DM/Baltimore/Test_Baltimore.csv",
                      low_memory=False)
xt = df_test[features]
print('Partial Fit Predicted - '+str(clf_pf.predict(xt)))
print('Predict Probability - '+str(clf_pf.predict_proba(xt)))
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB

# Six-sample toy data set with two classes.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])

# The bare ``GaussianNB(priors=None)`` expressions of the original were
# pasted interpreter output that built and discarded an estimator; they have
# been removed.
clf = GaussianNB()
clf.fit(X, Y)
print(clf.predict([[-0.8, -1]]))

# The same data supplied through the incremental API.
clf_pf = GaussianNB()
clf_pf.partial_fit(X, Y, np.unique(Y))
print(clf_pf.predict([[-0.8, -1]]))

# Plot the two coordinates of the first sample as blue dots.
plt.plot(X[0], 'bo')
plt.ylabel('y-label')
plt.show()
class Evaluator(EvaluatorBase):
    '''
    Decides whether to sleep and whether to switch the screen off, using two
    GaussianNB classifiers fed with a sliding window of module metrics.

    This Evaluator requires the following for each module
    [model]
    screenfile naivebayes_screen_model
    sleepfile naivebayes_sleep_model
    history 5
    minsleeptime 240
    minscreentime 240
    '''

    def __init__(self, config):
        '''
        Constructor
        Accepts a module configuration dictionary
        '''
        self.config = config
        model_config = config.get("model", {})
        self.historylen = model_config.get("history", 5)
        # TODO: Discover these.
        # The module list must exist before the history is sized from it;
        # the original read self.modules one line before assigning it.
        self.modules = self.config.get("modules", []) + ['bias']
        self.history = [0.0] * (len(self.modules) * self.historylen)

        # os.path.join supplies the path separator the original string
        # concatenation was missing.
        home = os.path.expanduser("~")
        self.screenfile = model_config.get(
            "screenfile", os.path.join(home, "naivebayesscreenmodel"))
        self.screenclassifier = self._load_or_create_classifier(
            self.screenfile)
        self.sleepfile = model_config.get(
            "sleepfile", os.path.join(home, "naivebayessleepmodel"))
        self.sleepclassifier = self._load_or_create_classifier(self.sleepfile)

    def _history_vector(self):
        '''Return the history window as a single-row float32 matrix.'''
        # scikit-learn expects 2-D input: one row per sample.
        return np.asarray(self.history, dtype=np.float32).reshape(1, -1)

    def _load_or_create_classifier(self, path):
        '''Load a pickled classifier from path, or create a fresh one.'''
        if os.path.isfile(path):
            # pickle.load needs an open file object, not a path string.
            # NOTE(review): pickle can execute arbitrary code; the model
            # file must come from a trusted location.
            with open(path, 'rb') as handle:
                return pickle.load(handle)
        classifier = GaussianNB()
        # Register both classes up front.  The original fit() saw only
        # class 0, so a later partial_fit with label 1 would be rejected.
        classifier.partial_fit(self._history_vector(),
                               np.zeros(1, dtype=int), classes=[0, 1])
        return classifier

    def eval(self, metrics):
        '''
        Push the given metrics into the history window and return the
        decisions as {'sleep': ..., 'screenoff': ...}.
        '''
        # Preload expected inputs at 0.0; the bias input is always 1.0.
        inputs = dict.fromkeys(self.modules, 0.0)
        inputs['bias'] = 1.0
        # Update given inputs.
        for modulename, metric in metrics.items():
            inputs[modulename] = metric
        # Transform into a vector with a consistent order.
        inputvector = [inputs[name] for name in sorted(self.modules)]
        # Slide the window: drop the oldest block and append the newest.
        # The original dropped the last block instead, so the window never
        # moved.
        self.history = self.history[len(inputvector):] + inputvector

        npinvec = self._history_vector()
        sleepdecision = self.sleepclassifier.predict(npinvec)[0] == 1
        # The original asked the sleep classifier for this decision too.
        screendecision = self.screenclassifier.predict(npinvec)[0] == 1
        return {'sleep': sleepdecision, 'screenoff': screendecision}

    def update(self, timeslept, timescreenoff):
        '''
        Update both classifiers with the outcome of the last decision: the
        label is 1 when the observed duration exceeds the configured
        minimum (default 240), otherwise 0.
        '''
        # TODO: Use this knowledge about how long the sleep lasted to update
        # the ML model.
        model_config = self.config.get("model", {})
        features = self._history_vector()
        # ``int`` replaces the ``np.int`` alias removed from recent NumPy.
        slept_enough = np.asarray(
            [timeslept > model_config.get("minsleeptime", 240)], dtype=int)
        screen_off_enough = np.asarray(
            [timescreenoff > model_config.get("minscreentime", 240)],
            dtype=int)
        self.sleepclassifier.partial_fit(features, slept_enough)
        self.screenclassifier.partial_fit(features, screen_off_enough)
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind((HOST, PORT))
s.listen(5)

classes = [1, 2, 3, 4]
dump_path = "dump.pkl"

try:
    # Reuse the dumped classifier when one exists.
    clf = joblib.load(dump_path)
except IOError:
    # No dump yet: train from the base samples.
    base = np.loadtxt("base.txt")
    y_base = base[:, 0]            # column 0: delay in minutes
    X_base = base[:, [1, 2, 3]]    # columns 1 (wind speed), 2 (dew diff), 3 (sky cover)
    # NOTE(review): the return value is discarded; if this is scikit-learn's
    # normalize(), X_base is left unchanged -- confirm.
    normalize(X_base)

    # Translate the continuous delay into four discrete classes.  The masks
    # are taken before any assignment, so they see the original delays.
    up_to_5 = y_base <= 5
    up_to_30 = (y_base > 5) & (y_base <= 30)
    up_to_60 = (y_base > 30) & (y_base <= 60)
    above_60 = y_base > 60
    y_base[up_to_5] = 1
    y_base[up_to_30] = 2
    y_base[up_to_60] = 3
    y_base[above_60] = 4

    clf = GaussianNB()
    clf.partial_fit(X_base, y_base, classes)
    joblib.dump(clf, dump_path)

while True:
    # Receive data from the web app; each connection gets its own thread.
    clientsocket, address = s.accept()
    t = threading.Thread(target=worker, args=(clientsocket,))
    t.start()