def cross_validate(folds, method):
    """Run k-fold cross validation and print the mean emotion/sentiment accuracy.

    The songs returned by ``split_by_class()`` are expected under the keys
    ``'+'``, ``'0'`` and ``'-'``.  For every fold the same index window is cut
    out of each class as the test set and the remainder of each class forms
    the training set.

    Args:
        folds: Number of folds; values below 2 are replaced by 2 (with a
            message).
        method: ``'nb'`` evaluates ``NaiveBayes``, ``'sa'`` evaluates
            ``SimpleAveraging``, ``'r'`` only trains a ``NaiveBayes`` on all
            songs of the fold and contributes nothing to the averages.  Any
            other value leaves both averages at 0.

    Returns:
        None.  The averaged accuracies are printed.
    """
    if folds < 2:
        print('Must have at least 2 folds.. evaluating 2-fold cross validation')
        folds = 2

    # Width of one fold.  True division may give a fractional width; the
    # int() truncation below turns it into slice boundaries.
    # NOTE(review): the literal 100 presumably is the number of songs per
    # class -- confirm against split_by_class().
    test_size = 100 / folds

    songs_by_class = split_by_class()
    positive = songs_by_class['+']
    neutral = songs_by_class['0']
    negative = songs_by_class['-']

    sentiment_accuracy_sum = 0.0
    emotion_accuracy_sum = 0.0

    for f in range(folds):
        start = int(test_size * f)
        stop = int(test_size + test_size * f)

        test_set = positive[start:stop] + neutral[start:stop] + negative[start:stop]
        training_set = (positive[stop:] + positive[:start]
                        + neutral[stop:] + neutral[:start]
                        + negative[stop:] + negative[:start])

        if method == 'nb':
            nb = NaiveBayes()
            nb.train_model(training_set)
            sentiment_accuracy, emotion_accuracy = nb.evaluate_model(
                test_set, len(training_set))
        elif method == 'sa':
            sa = SimpleAveraging()
            avgs = sa.train(training_set)
            sentiment_accuracy, emotion_accuracy = sa.evaluate(test_set, avgs)
        # elif method == 'pool':
        #     pool = AffectPool(NaiveBayes(), SimpleAveraging())
        #     pool.simple_train(training_set)
        else:
            if method == 'r':
                nb = NaiveBayes()
                nb.train_model(test_set + training_set)
            # Nothing was evaluated in this fold, so nothing is accumulated.
            continue

        emotion_accuracy_sum += emotion_accuracy
        sentiment_accuracy_sum += sentiment_accuracy

    print("EMOTION ACCURACY ", emotion_accuracy_sum / folds,
          " SENTIMENT ACCURACY: ", sentiment_accuracy_sum / folds)
def computeNaiveBayes(args, dict_algorithms):
    """Run the naive Bayes model and store its result under ``"naive_bayes"``.

    Args:
        args: Options object.  ``args.debug`` switches the progress messages
            on, and the whole object is forwarded to ``NaiveBayes``.
        dict_algorithms: Mapping updated in place with whatever
            ``NaiveBayes(args).compute()`` returns.
    """
    if args.debug:
        # No trailing newline: the "ok!" below finishes the same console line.
        print("Running naive bayes...", end='')

    dict_algorithms["naive_bayes"] = NaiveBayes(args).compute()

    if args.debug:
        print("ok!")
def evaluate(trainfile, testfile):
    """Train a NaiveBayes model on ``trainfile`` and print its accuracy on ``testfile``.

    Both files are read as UTF-8, one document per line, split on whitespace.
    In the test file the first token of a line is the correct category and the
    remaining tokens are the words of the document.

    Args:
        trainfile: Path of the training data file.
        testfile: Path of the test data file.

    Raises:
        ZeroDivisionError: If the test file contains no non-blank line.
    """
    # Load the training data.
    with codecs.open(trainfile, "r", "utf-8") as fp:
        trainData = [line.rstrip().split() for line in fp]

    # Train the naive Bayes classifier.
    nb = NaiveBayes()
    nb.train(trainData)
    print(nb)

    # Evaluate on the test data.
    hit = 0
    numTest = 0
    with codecs.open(testfile, "r", "utf-8") as fp:
        for line in fp:
            temp = line.rstrip().split()
            if not temp:
                # A blank line carries no label; indexing temp[0] would fail.
                continue
            correct = temp[0]   # correct category
            words = temp[1:]    # document: a collection of words
            predict = nb.classify(words)  # predicted category
            if correct == predict:
                hit += 1        # prediction matches the correct category
            numTest += 1

    print("accuracy:", float(hit) / float(numTest))
def _populate(self, tweets):
    """
    :param tweets: A python dictionary containing trends as keys and list of
        tweets as values against each trend.
    :return: None

    Private helper that fills the inverted index: it records every trend,
    classifies the concatenated text of the trend's tweets, and files each
    tweet under the screen name of its author.
    """
    for trend, trend_tweets in tweets.items():
        self.trends.append(trend)
        self.totalTweets += len(trend_tweets)

        # Classify the trend from the joined text of all of its tweets.
        combined_text = " ".join(t.text for t in trend_tweets)
        classifier = NaiveBayes()
        classifier.loadModelFromDB()
        self.categories.append(classifier.classify(combined_text))

        # self.trends is not modified below, so the position is looked up once.
        trend_pos = self.trends.index(trend)
        for tweet in trend_tweets:
            handle = tweet.user.screen_name
            entry = (trend_pos, tweet)
            if handle in self.twitterHandles:
                # Known author: extend the posting list stored at the same
                # position as the handle.
                self.indexLists[self.twitterHandles.index(handle)].append(entry)
            else:
                # New author: handle and posting list are appended together so
                # both lists stay aligned by position.
                self.twitterHandles.append(handle)
                self.indexLists.append([entry])

    self.logger.debug(
        'Created and populated Inverted Index: Trends-{}, Tweets-{}'.format(
            len(self.trends), self.totalTweets))
def spamHamtoyExample() -> None:
    '''
    Train a naive Bayes classifier on the e-mails in the 'train' folder, print
    its most popular / most indicative ham and spam words and its log prior,
    then score the e-mails in the 'test' folder for several feature counts
    (1-50) and plot the score against the number of features.
    '''
    filedir = '../data/emails/'
    train_dir = os.path.join(filedir, 'train/')
    test_dir = os.path.join(filedir, 'test/')

    naivebay = NaiveBayes()
    naivebay.train(train_dir)

    numOfItemsToPrint = 4
    naivebay.printMostPopularHamWords(numOfItemsToPrint)
    naivebay.printMostPopularSpamWords(numOfItemsToPrint)
    naivebay.printMostindicativeHamWords(numOfItemsToPrint)
    naivebay.printMostindicativeSpamWords(numOfItemsToPrint)
    print('Model logPrior: {}'.format(naivebay.logPrior))

    features = [1, 2, 5, 10, 20, 30, 40, 50]
    accuracy = []
    for feature_count in features:
        score = naivebay.classifyAndEvaluateAllInFolder(test_dir, feature_count)
        accuracy.append(score)
        print(feature_count, "features, classification score:", score)

    # x-axis: number of features, y-axis: classification score.
    plt.figure("Naive results: #features vs classification error rate")
    plt.plot(features, accuracy)
    plt.grid(True)
    plt.xlabel('Number of Features')
    plt.ylabel('Classification Score')
    plt.show()
def crossValidation(data, N=num, randomize=False):
    """Return the mean accuracy of an N-fold cross validation of NaiveBayes.

    Fold ``n`` uses every record whose position ``i`` satisfies ``i % N == n``
    as test data and all other records as training data.  Each record is a
    sequence whose first element is the correct category and whose remaining
    elements are the words handed to the classifier.

    Args:
        data: Sequence of records.
        N: Number of folds (defaults to the module-level ``num``).
        randomize: When true, the records are shuffled before splitting.  The
            shuffle works on a copy, so the caller's sequence keeps its order.

    Returns:
        The unrounded average of the per-fold accuracies.

    Raises:
        ZeroDivisionError: If a fold has no test records (fewer than ``N``
            records) or ``N`` is 0.
    """
    if randomize:
        from random import shuffle
        data = list(data)  # do not reorder the caller's list as a side effect
        shuffle(data)

    accuracyList = []
    for n in range(N):
        # Split into training and test data for this fold.
        trainData = [d for i, d in enumerate(data) if i % N != n]
        testData = [d for i, d in enumerate(data) if i % N == n]

        nb = NaiveBayes()
        nb.train(trainData)

        # Count the test records whose predicted category is the correct one.
        hit = sum(1 for d in testData if d[0] == nb.classifier(d[1:]))
        accuracyList.append(float(hit) / float(len(testData)))

    return sum(accuracyList) / float(N)
def dev_train():
    """Train a NaiveBayes classifier on the documents in '../papers' and save it."""
    paper_docs = build_doc_set('../papers')

    processor = Processor()
    for paper in paper_docs:
        processor.process_document(paper)

    processor.clf.set_classifier(NaiveBayes())
    processor.clf.train(paper_docs)
    processor.save_classifier('saved_classifier-367-1')
def dev_train_test():
    """Train and test a new classifier on a directory of .txt documents.

    Builds the document set from '../papers', runs every document through a
    ``Processor``, installs a fresh ``NaiveBayes`` as the classifier and calls
    ``train_and_test`` with ``split=.07``.
    """
    docs = build_doc_set('../papers')
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print('Processing docset with %d docs...' % len(docs))

    driver = Processor()
    for d in docs:
        driver.process_document(d)

    driver.clf.set_classifier(NaiveBayes())
    driver.clf.train_and_test(docs, split=.07)
def test_naivebayes(traindata, testdata):
    """Train NaiveBayes on 10%..100% of ``traindata`` and print each test accuracy.

    Args:
        traindata: Training set exposing ``width``, ``height``, ``number``,
            ``labeldomain`` and ``orderedout(p)``.
        testdata: Test set exposing ``images`` and ``labels``.
    """
    # Raw pixel feature: every pixel may take the values 0, 0.5 or 1.
    pixel_count = traindata.width * traindata.height
    feature_domians = [list(np.arange(0, 1.1, 0.5)) for _ in range(pixel_count)]

    for percent in range(10, 101, 10):
        print("Training with %d" % int(percent * traindata.number * 0.01))
        classifier = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(percent)
        classifier.train(images, labels)
        predicted = classifier.classify(testdata.images)
        print(Accuracy(predicted, testdata.labels))
def test_naivebayes_labeling(self):
    """Check that every label in '../resource/corpus.tsv' is an integer of at most 5.

    Column 0 of each row is collected as text and column 1 as the integer
    label; a label above 5 raises ``Exception`` carrying the offending value.
    The labels must finally be convertible to a ``uint8`` array.
    """
    naivebayes = NaiveBayes()
    corpus = Serializer.load_csv('../resource/corpus.tsv')

    texts = []
    labels = []
    for record in corpus:
        texts.append(str(record[0]))
        label = int(record[1])
        if label > 5:
            raise Exception(label)
        labels.append(label)

    # Result is discarded; only the conversion itself is exercised.
    np.array(labels, dtype=np.uint8, ndmin=1)
def test_naivebayes_compare(self):
    """Check that ``predict_all`` maps each country name to itself.

    The human labels are taken from ``translate.country`` in 'ocr.json'; any
    prediction that differs from its input raises ``Exception``.
    """
    basepath = '../resource/'
    naivebayes = NaiveBayes()

    json_data = Serializer.load_json(os.path.join(basepath, 'ocr.json'))
    naivebayes.human_labels = json_data['translate']['country']

    x_list = [
        'ネツァワル王国',
        'カセドリア連合王国',
        'ゲブランド帝国',
        'ホルデイン王国',
        'エルソード王国',
    ]
    print(json_data['translate']['country'])

    predictions = naivebayes.predict_all(x_list)
    for position, predicted in enumerate(predictions):
        expected = x_list[position]
        if expected != predicted:
            raise Exception('compare x:{0},predict:{1}'.format(
                expected, predicted))
def main(flag=True):
    """Train and validate the spam classifier, or load a previously saved one.

    Args:
        flag: When true, the label index is read, split 80/20 into training
            and validation rows, a ``NaiveBayes`` model is trained and
            validated, and the model is pickled to 'bayes_model.obj'.  When
            false, the model is unpickled from that file instead.

    Returns:
        The trained or loaded model.
    """
    if flag:
        start = timer()
        # Load the labels of the e-mail data.
        label_df = pd.read_csv("./input/trec06c/full/index_bak", sep=' ..',
                               names=['label', 'filename'])
        for key in label_df['label'].unique():
            print(key, len(label_df[label_df['label'] == key]))

        train, valid = train_test_split(label_df, test_size=0.2,
                                        random_state=2018)
        normFilelen = train[train['label'] == 'ham'].shape[0]
        spamFilelen = train[train['label'] == 'spam'].shape[0]
        model = NaiveBayes(normFilelen, spamFilelen)
        # model.getStopWords()

        for _, row in tqdm(train.iterrows(), total=train.shape[0]):
            # Store the words that occur in each e-mail in the model.
            model.get_word_list('./input/trec06c' + row['filename'],
                                row['label'])
        # '%.2f' prints two decimals; the former '%2f' only set a minimum width.
        print('训练集学习完毕,已耗时%.2fs' % (timer() - start))

        for _, row in tqdm(valid.iterrows(), total=valid.shape[0]):
            # Reset the scratch entry before reading the next e-mail.
            if 'test' in model.wordDict:
                model.wordDict['test'].clear()
            model.get_word_list('./input/trec06c' + row['filename'], 'test')
            wordProbList = model.getTestWords(model.wordDict['test'])
            # Bayes probability from the words selected for this e-mail
            # (15 words according to the original note).
            trash_p = model.calBayes(wordProbList)
            if row['label'] == 'spam':
                if trash_p > 0.9:
                    model.validResult['TN'] += 1  # trash
                else:
                    model.validResult['FN'] += 1  # normal
            else:
                if trash_p > 0.9:
                    model.validResult['FP'] += 1  # trash
                else:
                    model.validResult['TP'] += 1  # normal

        model.calMetric()
        print('验证集处理完毕,已耗时%.2fs' % (timer() - start))
        with open('bayes_model.obj', 'wb') as model_file:
            pickle.dump(model, model_file)
    else:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load 'bayes_model.obj' files produced by this program.
        with open('bayes_model.obj', 'rb') as model_file:
            model = pickle.load(model_file)
        print("模型加载成功!")
    return model
def train(instances):
    """Train the model selected by ``args.algorithm`` on ``instances``.

    Args:
        instances: Training instances forwarded to the model's ``train``.

    Returns:
        The trained ``LambdaMeans`` ('lambda_means') or ``NaiveBayes``
        ('nb_clustering') model, or ``None`` for any other algorithm name.
    """
    print('starting training')

    algorithm = args.algorithm
    if algorithm == 'lambda_means':
        p = LambdaMeans(args.cluster_lambda, max_max_index,
                        args.clustering_training_iterations)
    elif algorithm == 'nb_clustering':
        p = NaiveBayes(args.num_clusters, max_max_index,
                       args.clustering_training_iterations)
    else:
        p = None

    if p is not None:
        p.train(instances)

    print('ending training')
    return p
def train(self, train_set):
    """Teaches the classifier with labeled data instances.

    Every document of ``train_set`` is first added to ``self.corpus``.  A
    ``NaiveBayes`` classifier is then trained on the corpus, and one extra
    ``NaiveBayes`` sub-classifier is trained for every corpus class that has
    more than one class of its own, stored in ``self.subclassifiers`` under
    that class's label.  Any other classifier is trained on
    ``(features, first label)`` pairs built from ``train_set``.

    Args:
        train_set: Sized iterable of labeled documents.
    """
    for d in train_set:
        self.corpus.add_doc(d)
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print('Training on %d documents...\n' % len(train_set))

    if isinstance(self.classifier, NaiveBayes):
        self.classifier.train(self.corpus)
        for c in self.corpus.get_classes():
            if len(c.get_classes()) > 1:
                subclassifier = NaiveBayes()
                subclassifier.train(c)
                self.subclassifiers[c.get_label()] = subclassifier
    else:  # for nltk classifiers
        labeled_feature_set = [(d.get_features(), d.get_labels()[0])
                               for d in train_set]
        self.classifier.train(labeled_feature_set)
    # Sklearn classifiers
def main():
    """Train the classifier on its training data and print one sample prediction."""
    nb = NaiveBayes()
    nb.load_data_training()
    nb.mulai_training()

    # TODO: [STEP-10] Try making a prediction!
    # If the weather is 'Hujan', the temperature is 'Dingin', the laziness
    # level is 'Tinggi' and the student woke up late ('Ya'), does the student
    # attend or skip class?
    sample = dict(
        nilai_cuaca='Hujan',
        nilai_suhu='Dingin',
        nilai_tingkat_malas='Tinggi',
        nilai_bangun_siang='Ya',
    )
    hasil_prediksi = nb.prediksi(**sample)

    print('=====================================')
    print('Hasil akhir prediksi = {}, dengan peluang sebesar {}%'.format(
        hasil_prediksi['hasil'], (hasil_prediksi['peluang'] * 100)))
def trainTrendClassifier():
    """
    :return: None

    Creates a NaiveBayes model, trains it on the categorized trends data
    found in the folder configured under ``config['training']['trends']``
    (with stop-word handling and stemming switched on) and stores the trained
    model in the database for later classification.
    """
    logger.debug("trainTrendsClassifier()")

    docs, labels = getData(config['training']['trends'])
    doc_count = str(len(docs))
    label_count = str(len(labels))
    logger.debug("documents: " + doc_count + ", labels: " + label_count)

    classifier = NaiveBayes()
    classifier.train(docs, labels, stopWordsFlag=True, stem=True)
    classifier.saveToDB()
def _plot_accuracy_std_mean(traindata, testdata, build_model, train_extra,
                            repeats, label, print_each_trial):
    """Plot std and mean of the test accuracy (in %) for 10%..100% of the data.

    For every percentage ``p`` the model is rebuilt and retrained ``repeats``
    times on ``traindata.shuffleout(p)``; the sample is drawn before the model
    is built, as in the original loops.

    Args:
        traindata: Training set exposing ``shuffleout(p)``.
        testdata: Test set exposing ``images`` and ``labels``.
        build_model: Zero-argument callable returning a fresh model.
        train_extra: Extra positional arguments for ``model.train`` after
            ``images, labels``.
        repeats: Number of trials per percentage (must be at least 1).
        label: Legend prefix; " std" and " mean" are appended.
        print_each_trial: If true print every trial's accuracy in percent,
            otherwise print the last raw accuracy once per percentage.
    """
    stds = []
    means = []
    percents = []
    for p in range(10, 101, 10):
        scores = []
        for _ in range(repeats):
            images, labels = traindata.shuffleout(p)
            model = build_model()
            model.train(images, labels, *train_extra)
            predicted = model.classify(testdata.images)
            a = Accuracy(predicted, testdata.labels)
            scores.append(a * 100)
            if print_each_trial:
                print(a * 100)
        stds.append(np.std(scores))
        means.append(np.mean(scores))
        percents.append(p)
        if not print_each_trial:
            print(a)
    plt.plot(percents, stds, label=label + " std")
    plt.plot(percents, means, label=label + " mean")


def stdmean():
    """Plot std/mean test accuracy of three classifiers on the digit and face data.

    Perceptron and NeuralNetwork are run 5 times per training percentage,
    NaiveBayes 3 times.  All curves go into one figure, which is shown at the
    end.
    """
    ratio = 0.8
    times = 5
    nb_times = 3  # NaiveBayes uses fewer trials than the other two models

    for name, loader in (("digit", dataloader_digit), ("face", dataloader_face)):
        print(name)
        traindata, testdata = loader()
        pixel_count = traindata.width * traindata.height
        prefix = name + "data "

        _plot_accuracy_std_mean(
            traindata, testdata,
            lambda: Perceptron(pixel_count, traindata.labeldomain),
            (3, ratio), times, prefix + "Perceptron", True)

        # Every pixel may take the values 0, 0.5 or 1.
        feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                           for _ in range(pixel_count)]
        _plot_accuracy_std_mean(
            traindata, testdata,
            lambda: NaiveBayes(feature_domians, traindata.labeldomain, 1),
            (), nb_times, prefix + "NaiveBayes", False)

        _plot_accuracy_std_mean(
            traindata, testdata,
            lambda: NeuralNetwork(
                (pixel_count, 15, 15, len(traindata.labeldomain)),
                traindata.labeldomain),
            (50, ratio), times, prefix + "NeuralNetwork", False)

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size precentage")
    # The curves are accuracy statistics in percent, not timings.
    plt.ylabel("accuracy (%)")
    plt.show()
from datetime import datetime
from flask import Flask, render_template, request
from search import Search
from naivebayes import NaiveBayes
from imagesearch import ImageSearch
from imagecaption import ImageCaption
from sklearn.model_selection import train_test_split
from zipfile import ZipFile

# Unpack the pre-processed data into the current working directory at import
# time, before any of the components below are initialised.
with ZipFile('pre_processed_data.zip', 'r') as zipObj:
    zipObj.extractall()

# Module-level singletons, each created and initialised once at import time.
search = Search()
search.init()
naive_bayes = NaiveBayes()
naive_bayes.init()
imagesearch = ImageSearch()
imagesearch.init()
image_caption = ImageCaption()
image_caption.init()

app = Flask(__name__)


@app.route("/")
def hello():
    """Serve the landing page: links to /search, /classify and /image_search."""
    return '<a href="/search">Search</a><br><a href="/classify">Classify</a><br><a href="/image_search">Image Search</a>'
def main(argv):
    """Learn or apply a NaiveBayes model of item-name words from a feature DB.

    Options (parsed with getopt): ``-d`` debug, ``-o path`` export the learned
    model, ``-i path`` import a model and predict, ``-r ratio`` prominence
    ratio, ``-s supports`` supporting features per word, ``-v vars`` variable
    types.  The first positional argument is the feature database; any further
    arguments restrict processing to those items.

    Without ``-i`` every item is learned; with ``-i`` every item is predicted.
    When neither ``-i`` nor ``-o`` is given, a second pass predicts every item
    with the model just learned.

    Returns:
        100 on a usage error, 1 if the output path already exists, else 0.

    NOTE(review): the block structure below was reconstructed from a source
    whose line breaks were lost -- confirm nesting against the original file.
    """
    # NOTE(review): fileinput is not used in the visible code.
    import fileinput
    import getopt

    def usage():
        """Print the command-line synopsis and return exit status 100."""
        print(f'usage: {argv[0]} '
              '[-d] [-o path] [-i path] [-r ratio] [-s supports] [-v vars] '
              'feats.db [items ...]')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'do:i:r:s:v:')
    except getopt.GetoptError:
        return usage()
    debug = 0  # NOTE(review): incremented by -d but not read in the visible code
    outpath = None
    inpath = None
    ratio = 0.5
    maxsupports = 3
    types = {}
    C = 0.4  # distance weight should be (1.5**d) ~= exp(0.4*d)
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-o':
            outpath = v
        elif k == '-i':
            inpath = v
        elif k == '-r':
            ratio = float(v)
        elif k == '-s':
            maxsupports = int(v)
        elif k == '-v':
            types = getvars(v)
    # Importing and exporting a model are mutually exclusive.
    assert inpath is None or outpath is None
    if outpath is not None and os.path.exists(outpath):
        print('Already exists: %r' % outpath)
        return 1
    if not args:
        return usage()
    dbpath = args.pop(0)
    db = FeatDB(dbpath)
    nallitems = len(db.get_items())
    defaultnames = getdefaultnames(types)
    items = db.get_items()
    if args:
        # Restrict processing to the items named on the command line.
        items = [(tid, item) for (tid, item) in items if item in args]

    def learn(tid, item, fids):
        """Add the item's features to the model under every word of its name."""
        name = stripid(item)
        words = splitwords(name)
        (count, _) = fids[0]
        feats = {feat: fc for (feat, (fc, _)) in fids.items()}
        for w in words:
            nb.adddict(w, count, feats)
        return True

    def predict(tid, item, fids):
        """Predict name words for the item; return True if the top one is in its name.

        On a miss, a report of '+'-prefixed lines (item, words, candidates,
        sources, score, supports) is printed and False is returned.
        """
        name = stripid(item)
        words = splitwords(name)
        (count, _) = fids[0]
        feats = {fid: fc for (fid, (fc, _)) in fids.items() if fid != 0}
        # Use only prominent features that appears more than a certain threshold.
        threshold = int(max(feats.values()) * ratio)
        f2 = [feat for (feat, fc) in feats.items() if threshold <= fc]
        # Take this item's own counts out of the model while querying, then
        # put them back (presumably so the item cannot vote for its own name).
        for w in words:
            nb.removedict(w, count, feats)
        cands = nb.getkeyfeats(f2)[:len(words)]
        for w in words:
            nb.adddict(w, count, feats)
        if not cands:
            return False
        cwords = [w for (_, w, _) in cands]
        topword = cwords[0]
        if topword in words:
            return True
        print('+ITEM', json.dumps(item))
        print('+WORDS', json.dumps(words))
        print('+CANDS', json.dumps(cwords))
        if item in defaultnames:
            print('+DEFAULT', json.dumps(defaultnames[item]))
        fids0 = db.get_feats(tid, source=True)
        # Collect the sources of this item's features, grouped by the first
        # field of each feature (0 for fid 0); only groups 0, 1, -1 are kept.
        srcs0 = {0: [], 1: [], -1: []}
        for (fid, (_, srcs)) in fids0.items():
            if fid == 0:
                d = 0
            else:
                d = db.get_feat(fid)[0]
            if d in srcs0:
                srcs0[d].extend(srcs)
        print(
            '+SOURCE',
            json.dumps([(d, list(set(srcs))) for (d, srcs) in srcs0.items()]))
        supports = []
        for (_, w, a) in cands:
            # Find top N features for each word.
            fs = []
            for (fid, c) in a[1:]:
                feat = db.get_feat(fid)
                assert feat is not None
                # A rarer feature overall means stronger indication.
                df = math.log(nallitems / db.get_numfeatitems(fid))
                # A more prominent feature for this category means stronger indication.
                ff = c / nb.fcount[fid][None]
                # Discount a "distant" feature from the subject.
                ds = math.exp(-C * abs(feat[0]))
                fs.append((ds * df * ff, fid, feat))
            fs = sorted(fs, reverse=True)[:maxsupports]
            score = sum(s for (s, _, _) in fs)
            # Find the variables that contains the same feature.
            ss = []
            for (_, fid, _) in fs:
                found = None
                (_, srcs0a) = fids0[0]
                (_, srcs0b) = fids0[fid]
                tids = db.get_featitems(fid)
                for tid1 in tids.keys():
                    if tid1 == tid:
                        continue
                    item1 = db.get_item(tid1)
                    name1 = stripid(item1)
                    if w not in splitwords(name1):
                        continue
                    fids1 = db.get_feats(tid1, source=True)
                    (_, srcs1a) = fids1[0]
                    (_, srcs1b) = fids1[fid]
                    # First other item sharing the feature whose name has w.
                    found = (srcs0a + srcs0b, item1, srcs1a + srcs1b)
                    break
                ss.append(found)
            supports.append((w, score, list(zip(fs, ss))))
        print('+SCORE', json.dumps(sum(score for (_, score, _) in supports)))
        print('+SUPPORTS', json.dumps(supports))
        print()
        return False

    nb = NaiveBayes()
    # Learn by default; switch to prediction when a model is imported.
    proc = learn
    if inpath is not None:
        print(f'Importing model: {inpath!r}', file=sys.stderr)
        with open(inpath, 'rb') as fp:
            nb.load(fp)
        proc = predict
    # n counts processed items, m those for which proc returned a true value.
    n = m = 0
    for (tid, item) in items:
        fids = db.get_feats(tid)
        n += 1
        if proc(tid, item, fids):
            m += 1
        sys.stderr.write('.')
        sys.stderr.flush()
    print(f'\nProcessed: {m}/{n}', file=sys.stderr)
    if outpath is not None:
        print(f'Exporting model: {outpath!r}', file=sys.stderr)
        with open(outpath, 'wb') as fp:
            nb.save(fp)
    if inpath is None and outpath is None:
        # Neither import nor export: evaluate the freshly learned model.
        for (tid, item) in items:
            fids = db.get_feats(tid)
            predict(tid, item, fids)
    return 0
# Read one JSON object per line from the file of the current classifier and
# store it in `reviews` paired with the classifier name.
# NOTE(review): the enclosing scope that defines `classifier`, `reviews`,
# `categories` and `target` is outside this view.
with open("winter-" + classifier + ".json") as json_file:
    for line in json_file:
        json_obj = json.loads(line)
        reviews += [(classifier, json_obj)]

# Create the model object selected by the --model argument.
model = args.model
if (model == "baseline"):
    model_obj = BaseLine(reviews, categories)
elif (model == "logreg"):
    model_obj = LogReg(reviews)
elif (model == "multinomialNB"):
    model_obj = NaiveBayes(reviews, "multinomial")
elif (model == "lda"):
    model_obj = TopicModel(reviews)
elif (model == "kNearestNeighbors"):
    model_obj = knn(reviews, target)
else:
    # Put additional models here; an unknown name ends the program.
    print("Argument Error: invalid model specified")
    sys.exit()
model_classified = []  # classifications stored here
reviews = []  # resetting reviews list to save memory
# Reading test data into reviews list
##datafile = "../data/weather.nominal.txt" ##pos_class = "play:yes" ##pos_class = "play:no" ##datafile = "haireyescolor.txt" ##pos_class = "Sex:Male" ##pos_class = "Sex:Female" ##datafile = "../data/cmc-full.txt" ##pos_class = "contraceptive-method:none" ##pos_class = "contraceptive-method:long-term" ##pos_class = "contraceptive-method:short-term" d = Data(datafile) prnb = NaiveBayes(d) ##prnb = MaxAPost(d) prnb.train() pos = 0.0 neg = 0.0 for (v, c_true) in d.test_set: if c_true == pos_class: pos += 1 else: neg += 1 result_pos = [] result_neg = [] result_dif = []
df['text'], df['is_spam'], test_size=0.2, random_state=191)
# NOTE(review): the line above is the tail of a call that starts outside this
# view (presumably the train/test split producing X_train, X_test, y_train,
# y_test) -- confirm against the full file.

# Report the size and per-class counts of the whole data set.
print('Data set:')
print('{} total'.format(df.shape[0]))
for t, t_name in zip(targets, target_names):
    print('{} {}'.format(len(df[df['is_spam'] == t]), t_name))

# Same report for the training portion.
print('\nTraining set:')
print('{} total'.format(len(X_train)))
for t, t_name in zip(targets, target_names):
    print('{} {}'.format(sum([y == t for y in y_train]), t_name))

# Same report for the test portion.
print('\nTest set:')
print('{} total'.format(len(X_test)))
for t, t_name in zip(targets, target_names):
    print('{} {}'.format(sum([y == t for y in y_test]), t_name))
print('')

# Build Classifier: one model without max_features and one limited to
# max_features=200, both evaluated on the same test set.
gvoc_model = NaiveBayes('General Vocabulary', X_train, y_train, targets,
                        target_names)
gvoc_model.train()
gvoc_model.evaluate(X_test, y_test, show_top_features=10)
rvoc_model = NaiveBayes('Reduced Vocabulary', X_train, y_train, targets,
                        target_names, max_features=200)
rvoc_model.train()
rvoc_model.evaluate(X_test, y_test, show_top_features=10)
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
    matches = np.sum(y_true == y_pred)
    return matches / len(y_true)


# Three 2-D blobs; 90% of the samples are held out as test data.
X, y = datasets.make_blobs(
    n_samples=1000,
    n_features=2,
    centers=3,
    cluster_std=1.0,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=123,
    return_centers=False,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.9, random_state=1234)

clf = NaiveBayes()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy(y_test, y_pred))

# Scatter the test points, coloured by their predicted class.
color_map = {0: 'r', 1: 'k', 2: 'g'}
label_color = [color_map[predicted] for predicted in y_pred]
plt.scatter(X_test[:, 0], X_test[:, 1], c=label_color)
plt.show()
def _plot_time_to_limit(traindata, testdata, build_model, ratio, limit,
                        max_rounds, label):
    """Plot how long an iteratively trained model needs to exceed ``limit``.

    For 20%..100% of the training data (``traindata.orderedout(p)``), a fresh
    model is trained one round at a time until its test accuracy exceeds
    ``limit`` or ``max_rounds`` rounds have run.  The elapsed wall-clock time
    (model construction included, data selection excluded) is plotted against
    the percentage.
    """
    import time

    durations = []
    percents = []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        start = time.time()
        model = build_model()
        for _ in range(max_rounds):
            model.train(images, labels, 1, ratio)
            predicted = model.classify(testdata.images)
            a = Accuracy(predicted, testdata.labels)
            print(a * 100)
            if a > limit:
                break
        # Taken on both exits of the loop, so a run that never reaches the
        # limit is timed too instead of reusing a stale or missing end time.
        end = time.time()
        durations.append(end - start)
        percents.append(p)
    plt.plot(percents, durations, label=label)


def _plot_naivebayes_time(traindata, testdata, label):
    """Plot the time of one NaiveBayes build/train/classify cycle per data size.

    The timed span covers model construction, ``orderedout(p)``, training,
    classification and the accuracy computation.
    """
    import time

    # Every pixel may take the values 0, 0.5 or 1.
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    durations = []
    percents = []
    for p in range(20, 101, 10):
        start = time.time()
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        predicted = nb.classify(testdata.images)
        a = Accuracy(predicted, testdata.labels)
        end = time.time()
        durations.append(end - start)
        percents.append(p)
        print(a)
    plt.plot(percents, durations, label=label)


def timeana():
    """Plot training time against data size for three classifiers on two data sets.

    Perceptron and NeuralNetwork are trained round by round (at most 200
    rounds) until the test accuracy exceeds 0.7; NaiveBayes is trained once.
    All curves go into one figure, which is shown at the end.
    """
    limit = 0.7
    ratio = 1
    times = 200

    for name, loader in (("digit", dataloader_digit), ("face", dataloader_face)):
        print(name)
        traindata, testdata = loader()
        pixel_count = traindata.width * traindata.height
        prefix = name + "data "

        _plot_time_to_limit(
            traindata, testdata,
            lambda: Perceptron(pixel_count, traindata.labeldomain),
            ratio, limit, times, prefix + "Perceptron")

        _plot_naivebayes_time(traindata, testdata, prefix + "NaiveBayes")

        _plot_time_to_limit(
            traindata, testdata,
            lambda: NeuralNetwork(
                (pixel_count, 15, 15, len(traindata.labeldomain)),
                traindata.labeldomain),
            ratio, limit, times, prefix + "NeuralNetwork")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size precentage")
    plt.ylabel("time(in second)")
    plt.show()
from data import Data
from naivebayes import NaiveBayes

# Data set to use; the alternatives are kept for manual switching.
filename = "datasets/weatherNominal.td"
## filename = "datasets/titanic.td"
## filename = "datasets/cmc.td"

# Load the data and print its report.
d = Data(filename)
d.report()

# Train the predictor on the data and display it.
pr = NaiveBayes(d)
pr.train()
pr.show()

# Print the predicted class next to the true class for every test instance.
for (v, c_true) in d.test_set:
    c_pred = pr.predict(v)[0]  # predict() returns a sequence; item 0 is printed as the class
    print(v, ":")
    print(" ", c_pred, "( true class:", c_true, ")")

# Ad-hoc single predictions (these match the attributes of the titanic data).
## print(pr.predict(("Class:1st","Sex:Female","Age:Child")))
## print(pr.predict(("Class:Crew","Sex:Female","Age:Child")))
def _plot_naivebayes_accuracy(traindata, testdata, feature_domians,
                              draw_sample, label):
    """Plot NaiveBayes test accuracy (in %) for 10%..100% of the training data.

    Args:
        traindata: Training set exposing ``number`` and ``labeldomain``.
        testdata: Test set exposing ``images`` and ``labels``.
        feature_domians: Per-feature value domains handed to ``NaiveBayes``.
        draw_sample: Callable ``p -> (images, labels)`` selecting ``p`` percent
            of the training data (``orderedout`` or ``shuffleout``).
        label: Legend label of the curve.
    """
    accuracies = []
    percents = []
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = draw_sample(p)
        nb.train(images, labels)
        predicted = nb.classify(testdata.images)
        a = Accuracy(predicted, testdata.labels)
        accuracies.append(a * 100)
        percents.append(p)
        print(a)
    plt.plot(percents, accuracies, label=label)


def test_naivebayes_argmax_all():
    """Plot NaiveBayes accuracy against training-set size for digit and face data.

    Each data set is sampled with ``orderedout`` and with ``shuffleout``,
    giving four curves in one figure, which is shown at the end.
    """
    traindata, testdata = dataloader_digit()
    # Digit pixels may take the values 0, 0.5 or 1.
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    _plot_naivebayes_accuracy(traindata, testdata, feature_domians,
                              traindata.orderedout, "digitdata order")
    _plot_naivebayes_accuracy(traindata, testdata, feature_domians,
                              traindata.shuffleout, "digitdata random")

    traindata, testdata = dataloader_face()
    # Face pixels are binary.
    feature_domians = [[0, 1]
                       for _ in range(traindata.width * traindata.height)]
    _plot_naivebayes_accuracy(traindata, testdata, feature_domians,
                              traindata.orderedout, "facedata order")
    _plot_naivebayes_accuracy(traindata, testdata, feature_domians,
                              traindata.shuffleout, "facedata random")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size precentage")
    plt.ylabel("accuracy")
    plt.show()
# Creating model objects model = args.model if (model == "baseline"): model_obj = BaseLine(reviews, { "winter": 0, "spring": 0, "summer": 0, "fall": 0 }) elif (model == "kNearestNeighbors"): model_obj = knn(reviews, target) elif (model == "logreg"): model_obj = LogReg(reviews) elif (model == "multinomialNB"): model_obj = NaiveBayes(reviews, "multinomial") elif (model == "gaussianNB"): model_obj = NaiveBayes(reviews, "gaussian") elif (model == "lda"): model_obj = TopicModel(reviews) else: # put additional models here. print("Argument Error: invalid model specified") sys.exit() model_classified = [] # classifications stored here reviews = [] # resetting reviews list to save memory # Reading test data into reviews list
def __init__(self):
    """Create the instance with a fresh ``NaiveBayes`` as its classifier."""
    self.classifier = NaiveBayes()
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt
from naivebayes import NaiveBayes


def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
    return np.sum(y_true == y_pred) / len(y_true)


# Synthetic two-class problem with 10 features; 20% is held out for testing.
X, y = datasets.make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=123,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123)

nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print("Naive Bayes classification accuracy", accuracy(y_test, predictions))
def naivebayes(trainf, testf):
    """Build a ``NaiveBayes`` from ``trainf`` and run ``classify`` on ``testf``.

    The result of ``classify`` is discarded; the function returns ``None``.
    """
    NaiveBayes(trainf).classify(testf)