def solveAssignment(training_file, test_file, k):
    """Train a boosted ensemble and write the test predictions to output.csv.

    Args:
        training_file: path passed to ``nb.readFile`` for the training set.
        test_file: path passed to ``nb.readFile`` for the test set.
        k: number of classifiers requested from ``formEnsembleClassifiers``.

    Each output row is ``[reference id, predicted class]``.
    """
    # Load both files as they are stored.
    _, training_class, training_data = nb.readFile(training_file)
    _, ref_ids, test_data = nb.readFile(test_file)

    # Attribute statistics computed over both data sets together.
    _, max_attribute_values = nb.findMaxNumAttributes(training_data, test_data)

    # Build the ensemble from the training rows and their class labels.
    kClassifiers, kClassifiers_errors = formEnsembleClassifiers(
        training_class, training_data, max_attribute_values, k)

    # The training-set pass is kept only for what ensembleClassify does when
    # it is given the true labels; its return value is not used here.
    ensembleClassify(training_data, training_class, kClassifiers,
                     kClassifiers_errors)
    test_predictions = ensembleClassify(test_data, [], kClassifiers,
                                        kClassifiers_errors)

    rows = [[ref_ids[position], label]
            for position, label in enumerate(test_predictions)]

    # NOTE(review): "wb" is the Python 2 csv convention; confirm interpreter.
    with open("output.csv", "wb") as out_file:
        csv.writer(out_file).writerows(rows)
def train(self):
    """Build and train the Naive Bayes model for each enabled n-gram mode.

    ``self.nb_uni`` is created when ``self.uni_gram`` equals True and
    ``self.nb_bi`` when ``self.bi_gram`` equals True; both are built from
    ``self.train_tweets``.
    """
    # The explicit comparison with True is kept on purpose: truthy values
    # that do not compare equal to True must not enable a mode.
    want_unigram = self.uni_gram == True
    if want_unigram:
        self.nb_uni = NaiveBayes.NaiveBayes(self.train_tweets)
        self.nb_uni.train()

    want_bigram = self.bi_gram == True
    if want_bigram:
        self.nb_bi = NaiveBayes.NaiveBayes(self.train_tweets, bi_gram=True)
        self.nb_bi.train()
def ensembleClassify(test_data, test_class, kClassifiers, kClassifiers_errors,
                     k=None):
    """Combine the ensemble's predictions by error-weighted voting.

    Args:
        test_data: rows to classify.
        test_class: true labels; when not equal to ``[]`` the measures are
            reported through ``nb.generateMeasures``.
        kClassifiers: the trained classifiers.
        kClassifiers_errors: error of each classifier, index-aligned with
            ``kClassifiers``.
        k: how many classifiers to use. Defaults to ``len(kClassifiers)``;
            the original read an undefined module-level ``k``.

    Returns:
        One predicted class per row of ``test_data``.
    """
    if k is None:
        k = len(kClassifiers)

    k_predictions = [nb.predictClass(test_data, kClassifiers[i])
                     for i in range(k)]

    # Each classifier votes with weight log((1 - e) / e).
    # NOTE(review): a classifier with zero error gets weight 0 (this avoids a
    # division by zero) and so has no say in the vote; confirm this is wanted.
    weights = []
    for ki in range(k):
        error = kClassifiers_errors[ki]
        if error == 0.0:
            weights.append(0)
        else:
            weights.append(math.log((1.0 - error) / error))

    boosted_predicted_class = []
    for i in range(len(test_data)):
        vote_probability = {}
        for ki in range(k):
            label = k_predictions[ki][i]
            vote_probability[label] = vote_probability.get(label, 0) + weights[ki]

        # Take the best-scoring label even when no total is positive; the
        # original started from 0 with a strict '>' and returned None then.
        if vote_probability:
            max_class = max(vote_probability, key=vote_probability.get)
        else:
            max_class = None
        boosted_predicted_class.append(max_class)

    if test_class != []:
        nb.generateMeasures(test_class, boosted_predicted_class)
    return boosted_predicted_class
def fitnessEvaluation(chromosomes, fileName):
    """Score every chromosome with the Naive Bayes fitness function.

    Args:
        chromosomes: iterable of gene sequences; a gene equal to 1 selects
            the column at that position.
        fileName: data file passed to ``NaiveBayes.loadCsv``.

    Returns:
        A list with one ``NaiveBayes.main`` result per chromosome, in order.
    """
    fitnessValues = []
    for row in chromosomes:
        # A fresh list per chromosome, so loadCsv never sees a list that is
        # later cleared and refilled for the next chromosome.
        choose = [position for position, gene in enumerate(row) if gene == 1]
        dataset = NaiveBayes.loadCsv(fileName, choose)
        fitnessValues.append(NaiveBayes.main(dataset))
    return fitnessValues
def main():
    """Run Naive Bayes topic classification on the tokenized documents.

    Topics are read one per line from ``topics.txt``. The training and test
    sets are tokenized with ``xr.tokenize``, vectorised against the word map
    built from the training set, and passed to ``nb.naive_bayes``.
    """
    with open('topics.txt') as topics_file:
        topic_list = [line.strip('\n') for line in topics_file]
    print(topic_list)

    tokenized_training_data = xr.tokenize("Training", topic_list, 200)
    tokenized_test_data = xr.tokenize("Test", topic_list, 50)

    # The vocabulary comes from the training data only.
    wordmap = xr.create_wordmap(tokenized_training_data)
    vector_training = xr.create_vector(tokenized_training_data, wordmap)
    vector_test = xr.create_vector(tokenized_test_data, wordmap)

    vocabulary_size = len(wordmap)
    nb.naive_bayes(vector_training, vector_test, topic_list, V=vocabulary_size)
def cross_validation_nb(folds_array):
    """Estimate Naive Bayes accuracy by cross-validation over the given folds.

    Each fold is used once as the test set while all other folds are merged
    into the training set. The number of folds is ``len(folds_array)``
    (previously hard-coded to 10).

    Args:
        folds_array: sequence of folds; each fold is a list of examples whose
            last element is the expected class.

    Returns:
        The fraction of correctly predicted examples as a float, or 0.0 when
        no example was evaluated.
    """
    corrects = 0
    incorrects = 0

    for test_index, test_data in enumerate(folds_array):
        # Everything except the held-out fold is training data.
        training_data = []
        for fold_index, fold in enumerate(folds_array):
            if fold_index != test_index:
                training_data.extend(fold)

        train = NaiveBayes.train_nb(training_data)

        for example in test_data:
            prediction = NaiveBayes.naive_bayes(example, train)
            # The class label is stored in the last position of the example.
            if prediction == example[-1]:
                corrects += 1
            else:
                incorrects += 1

    total = corrects + incorrects
    if total == 0:
        # Nothing to evaluate; avoid a ZeroDivisionError.
        return 0.0
    return float(corrects) / float(total)
def main():
    """Train the language model and print the best language per test document.

    Command-line options (all required):
        -arg1  path of the training documents (stored as ``trainPath``)
        -arg2  path of the test documents (stored as ``testPath``)
        -arg3  integer ``n`` passed to ``readDocuments``
        -arg4  float ``lamda`` passed to ``mostLikelyLanguage``
    """
    parser = argparse.ArgumentParser(description="Parse Values.")
    # argparse rejects a mix of an option string and a positional name in one
    # add_argument call, so the attribute name is supplied through ``dest``.
    parser.add_argument('-arg1', dest='trainPath', type=str, required=True)
    parser.add_argument('-arg2', dest='testPath', type=str, required=True)
    parser.add_argument('-arg3', dest='n', type=int, required=True)
    parser.add_argument('-arg4', dest='lamda', type=float, required=True)
    args = parser.parse_args()

    trainPath = args.trainPath
    testPath = args.testPath
    n = args.n
    lamda = args.lamda

    nbModel = NaiveBayes()
    inout = io.IO()
    trainSet = inout.readDocuments(trainPath, n)
    testSet = inout.readDocuments(testPath, n)
    nbModel.train(trainSet)

    for doc in testSet:
        bestLanguage = nbModel.mostLikelyLanguage(doc.text, lamda)
        # NOTE(review): the original concatenated the builtin ``id`` function,
        # which raises TypeError; assumes documents expose a string ``id``
        # attribute -- confirm against the document class.
        print(doc.id + "|" + bestLanguage)
def output_test_file(input_filename, output_filename):
    """Classify every test row with Naive Bayes and write the results as CSV.

    Args:
        input_filename: test data file, loaded with ``load_data(..., 'test')``.
        output_filename: destination CSV; a ``Survived, PassengerID`` header
            is written followed by one row per test record.
    """
    # The training data is loaded and binned before the classifier is built.
    train_data = load_data('train.csv', 'train')
    bin_data(train_data)

    NBClassifier = NaiveBayes([
        test_columns.PassengerId,
        test_columns.Sex,
        test_columns.Fare,
        test_columns.Pclass,
        test_columns.Age,
    ])

    test_data = load_data(input_filename, 'test')
    bin_data(test_data)

    # The context manager guarantees the file is flushed and closed; the
    # original never closed the handle it passed to csv.writer.
    # NOTE(review): 'wb' is the Python 2 csv convention; confirm interpreter.
    with open("%s" % output_filename, 'wb') as output_file:
        output_file_object = csv.writer(output_file)
        output_file_object.writerow(["Survived", "PassengerID"])
        for row in test_data:
            survived = "1" if NBClassifier.predict(row) == 1 else "0"
            output_file_object.writerow([survived, row[0]])
def cross_validation_nb(folds_array):
    """Estimate Naive Bayes accuracy by cross-validation over the given folds.

    Each fold is used once as the test set while all other folds are merged
    into the training set. The number of folds is ``len(folds_array)``
    (previously hard-coded to 10).

    Args:
        folds_array: sequence of folds; each fold is a list of examples whose
            last element is the expected class.

    Returns:
        The fraction of correctly predicted examples as a float, or 0.0 when
        no example was evaluated.
    """
    corrects = 0
    incorrects = 0

    for test_index, test_data in enumerate(folds_array):
        # Everything except the held-out fold is training data.
        training_data = []
        for fold_index, fold in enumerate(folds_array):
            if fold_index != test_index:
                training_data.extend(fold)

        train = NaiveBayes.train_nb(training_data)

        for example in test_data:
            prediction = NaiveBayes.naive_bayes(example, train)
            # The class label is stored in the last position of the example.
            if prediction == example[-1]:
                corrects += 1
            else:
                incorrects += 1

    total = corrects + incorrects
    if total == 0:
        # Nothing to evaluate; avoid a ZeroDivisionError.
        return 0.0
    return float(corrects) / float(total)
def recepcion():
    """Handle the form submission and render the menu page.

    On POST the five form fields are written one per line to ``rest.txt`` and
    the value of ``x.totalidad()`` is written to ``prob1.txt``. The menu
    template is rendered for every request method.
    """
    if request.method == 'POST':
        # Read the form before touching the file, so a missing field cannot
        # leave an open, truncated rest.txt behind.
        Age = request.form['inputAge']
        Sex = request.form['inputSex']
        Bp = request.form['inputBp']
        Ch = request.form['inputCh']
        Nak = request.form['inputNak']

        with open("rest.txt", "w") as restText:
            for value in (Age, Sex, Bp, Ch, Nak):
                restText.write(value)
                restText.write('\n')

        texto = str(x.totalidad())
        with open("prob1.txt", "w") as probText:
            probText.write(texto)

        print("1. Prob total desde readpython es: ", x.totalidad())
        print("2. Edad: ", Age)
        print("3. Nak: ", Nak)
    else:
        print("No se ha posteado nada")
    return render_template('menu.html')
def output_test_file(input_filename, output_filename):
    """Classify every test row with Naive Bayes and write the results as CSV.

    Args:
        input_filename: test data file, loaded with ``load_data(..., 'test')``.
        output_filename: destination CSV; a ``Survived, PassengerID`` header
            is written followed by one row per test record.
    """
    # The training data is loaded and binned before the classifier is built.
    train_data = load_data('train.csv', 'train')
    bin_data(train_data)

    NBClassifier = NaiveBayes([
        test_columns.PassengerId,
        test_columns.Sex,
        test_columns.Fare,
        test_columns.Pclass,
        test_columns.Age,
    ])

    test_data = load_data(input_filename, 'test')
    bin_data(test_data)

    # The context manager guarantees the file is flushed and closed; the
    # original never closed the handle it passed to csv.writer.
    # NOTE(review): 'wb' is the Python 2 csv convention; confirm interpreter.
    with open("%s" % output_filename, 'wb') as output_file:
        output_file_object = csv.writer(output_file)
        output_file_object.writerow(["Survived", "PassengerID"])
        for row in test_data:
            survived = "1" if NBClassifier.predict(row) == 1 else "0"
            output_file_object.writerow([survived, row[0]])
def testNaiveBayes():
    """Train on the iris matrix and print the training misclassification rate.

    The whole file is used both for training and for prediction; the last
    column of the matrix is compared against the predictions.
    """
    samples = np.mat(np.loadtxt(r"data\iris\iris.txt", delimiter=","))
    numbers = np.mat([0] * 4)

    classifier = NaiveBayes(1)
    classifier.train(samples, numbers)
    predicted = classifier.predict(samples)

    # Boolean row mask of the samples whose last column disagrees.
    wrong_rows = (samples[:, -1] != predicted).A.flatten()
    wrong_count = samples[wrong_rows, :].shape[0]
    print(wrong_count / samples.shape[0])
def _run_classifier(info, filename, split_text):
    """Validate one round of answers and run the chosen classifier.

    Returns True when the round ran, False when any answer was invalid
    (unknown classifier or dataset, or a value that is not a number).
    """
    if filename is None or info not in ('knn', 'naive bayes', 'bagging'):
        return False
    try:
        split = float(split_text) / 100
    except ValueError:
        return False
    if not 0 <= split <= 1:
        return False

    trainSet = []
    testSet = []
    createDataset(filename, trainSet, testSet, split)

    if info == 'knn':
        try:
            k = int(input("What value should k be? (# of nearest neighbors)\n"))
        except ValueError:
            return False
        KNN.run(trainSet, testSet, k)
    elif info == 'naive bayes':
        NaiveBayes.run(trainingSet=trainSet, testSet=testSet)
    else:
        try:
            k = int(input("What value should k be? (# of nearest neighbors)\n"))
            bagSize = int(input("How big should the bags be?\n"))
            bagNum = int(input("How many bags should I use?\n"))
        except ValueError:
            return False
        bagging(k, trainSet, testSet, bagSize, bagNum)
    return True


def run():
    'Main loop, it gets and processes user input until "bye".'
    print(
        "Hi there! My name is Mr. Rabbits! (\\_/) "
        "Welcome to Mr. Rabbits' Machine Learning Adventure! (^.^) "
        "Today we will be exploring the difference between c(> <) "
        "Naive Bayes classification and k-nearest neighbors. "
        "There are two datasets to choose from: "
        "Fisher's Iris flower data set or ________."
    )
    # Dataset codes accepted at the prompt, mapped to their files.
    dataset_files = {"FI": 'iris.csv', "BC": 'wdbc_clean.csv'}

    while True:
        info = input(
            "Please let me know which classifier you would like to explore: "
            "(type 'knn' or 'naive bayes' or 'bagging' or 'bye' to exit).\n")
        if info == 'bye':
            print('Goodbye! Bring me a carrot next time! :3"')
            return

        print(
            "Which dataset will you be exploring today? Fisher's iris flower dataset or Wisconsin breast cancer diagnostics?"
        )
        dataset = input("Type 'FI' or 'BC'\n")
        split = input(
            "What % of the dataset should be split into the training set? (type a value from 0 to 100)\n"
        )

        # Bad answers now take the "invalid input" path instead of crashing
        # or loading a dataset from an empty file name.
        if _run_classifier(info, dataset_files.get(dataset), split):
            print("Wow! That was fun. Let's do it again.\n")
        else:
            print(
                "Oops! There was some invalid input somewhere along the way.")
            print("Let's start from the top again.\n")
def _predict_with_round_model(model, example, max_index):
    """Return 1.0 or -1.0: the label one round's Naive Bayes model assigns."""
    positive = 1.0
    negative = 1.0
    for feature in range(1, max_index + 1):
        # Features missing from the example are looked up with value 0.
        pass_value = example[feature] if feature in example else 0
        key_prefix = str(feature) + ':' + str(pass_value) + ':'
        positive = positive * float(model[key_prefix + "+1"])
        negative = negative * float(model[key_prefix + "-1"])
    # Ties go to the positive class.
    if (float(positive * model['prior_positive'])
            >= float(negative * model['prior_negative'])):
        return 1.0
    return -1.0


def run_adaboost(training_data, testing_data, values_in_features, max_index):
    """Train AdaBoost over Naive Bayes models and label the testing examples.

    Args:
        training_data: list of example dicts; a ``'weight'`` entry is written
            into every example.
        testing_data: examples to label, keyed by feature index.
        values_in_features: passed to the Naive Bayes trainer.
        max_index: highest feature index (features run from 1 to max_index).

    Returns:
        A list of ``"+1"`` / ``"-1"`` strings, one per testing example.

    The number of rounds comes from the module-level ``total_rounds``.
    """
    round_error = []
    round_model = []

    # All examples start with the same weight.
    for example in training_data:
        example['weight'] = float(1) / float(len(training_data))

    for _ in range(total_rounds):
        sample_training_data = weighted_random_sampling(training_data)
        conditional_prob_model = NaiveBayes.train_naive_bayes_get_classifier(
            sample_training_data, values_in_features)
        round_model.append(conditional_prob_model)

        predictions = NaiveBayes.get_predictions_from_model(
            conditional_prob_model, training_data, max_index)
        error = find_error(training_data, predictions)
        round_error.append(error)

        # Stop once a round is no better than chance. As in the original, the
        # model and error of that round stay in the lists.
        if error >= 0.5:
            break

        training_data = update_weights(training_data, error, predictions)
        training_data = normalize_weights(training_data)

    round_alpha = calculate_weight_classifiers(round_error)
    total_classifiers_generated = len(round_error)

    adaboost_predictions = []
    for example in testing_data:
        # Start the weighted vote at zero. The original started at 1, which
        # pushed every example towards "+1". Ties still resolve to "+1".
        boosted_prediction = 0.0
        for round_index in range(total_classifiers_generated):
            label = _predict_with_round_model(
                round_model[round_index], example, max_index)
            boosted_prediction += float(round_alpha[round_index]) * label

        adaboost_predictions.append("+1" if boosted_prediction >= 0 else "-1")
    return adaboost_predictions
def runNBFace(percent, trainsize, testsize):
    """Train and evaluate the face Naive Bayes model on part of the data.

    Args:
        percent: share of the training features kept by ``xPercent``.
        trainsize: number of training images requested from ``extractImages``.
        testsize: number of test images requested from ``extractImages``.

    Returns:
        A tuple ``(percentCorrect result, training time in seconds)``.
    """
    trainingData, trainingLabels, testData, testLabels = extractImages(
        "faces", trainsize, testsize)

    extract = featureFuncLib.basicFeatureExtractorFace_2
    trainFeatures = [extract(image) for image in trainingData]
    testFeatures = [extract(image) for image in testData]
    trainFeatures, trainingLabels = xPercent(percent, trainFeatures,
                                             trainingLabels)

    # Only the training call is timed.
    started = time.time()
    givenTrue, pyTrue, givenFalse, pyFalse = NaiveBayes.trainFace(
        trainFeatures, trainingLabels)
    finished = time.time()

    predict = NaiveBayes.predictFace(testFeatures, testLabels, givenTrue,
                                     pyTrue, givenFalse, pyFalse)
    score = percentCorrect(testLabels, predict)
    return score, abs(finished - started)
def get_input(i,t, section_text, tokens, b_dic, headline): ''' User interface for supervised learning. Enables the user to classify sections of webpages, or confirm the current guess. Saves changes both to b-dic (in memory) and training_tsv file on disk. ''' Helpers.clear_screen() print headline Helpers.print_progress_bar(t,i) print "Token %d of %d" % (i,t) # Confirm that we have enough data to make guesses can_make_guesses = NaiveBayes.can_make_guesses(b_dic) if can_make_guesses: guess = NaiveBayes.extract_winner(NaiveBayes.guess(tokens,b_dic)) print "\nGuess is " + guess.upper() + "\n\n" # 'article' -> 'a' guess_key = "" for key in valid_categories.keys(): if guess == valid_categories[key]: guess_key = key assert guess_key != "" print "\n\n\n\n" + section_text print "\n*** *** *** *** *** *** *** *** *** *** *** *** *** *** \n" cmd = "" while True: input_msg = "[A] for article, [H] for headline, [S] for spam or junk content, [D] for date, [B] for byline:\n" if can_make_guesses: input_msg = "Hit enter to confirm guess of " + guess + " or " + input_msg cmd = raw_input(input_msg) # 'if not cmd' enables user to just hit enter, and it updates cmd to # the guess value if not cmd and can_make_guesses: cmd = guess_key else: cmd = cmd.lower() if cmd in valid_categories.keys(): # Save token classification to b_dic b_dic = NaiveBayes.train(tokens,valid_categories[cmd], b_dic) # Save to the tab file as well, so that that the b_dic can be rebuilt Helpers.append_file_utf(("," + cmd ), training_tsv) break else: print "Error: '", cmd.upper(), "' is an invalid command. Please try again." return b_dic
def adaboostWithNaiveBayes(data, label, datatype):
    """Boost Naive Bayes classifiers with AdaBoost-style sample re-weighting.

    Args:
        data: training data; ``shape(data)`` is read as (dimension, count).
        label: labels, multiplied with the predictions in the weight update.
        datatype: passed through to ``NaiveBayes.naiveBayes``.

    Returns:
        ``(classifiersweight, classifiers, classifiersdata, classifierslabel)``
        holding the classifier weights, the sample distribution after each
        step, and the data and labels used by each classifier. Returns None
        when the first classifier's error rate exceeds 0.5.
    """
    datadim, datanum = shape(data)

    def reweight(weights, alpha, predictions):
        # Update the sample distribution in place, then normalise it.
        for idx in range(datanum):
            weights[idx] = weights[idx] * math.exp(
                -alpha * label[idx] * predictions[idx])
        total = sum(weights)
        for idx in range(datanum):
            weights[idx] = weights[idx] / total

    # Initial sample weights, i.e. the initial data distribution.
    sampleweights = [1 / datanum] * datanum

    classifiersweight = []                  # weight of each classifier
    classifiers = [list(sampleweights)]     # sample distribution per step
    classifiersdata = [data]                # data used by each classifier
    classifierslabel = [label]              # labels matching that data

    # The first classifier is trained on the original data set.
    result, errorrate = NaiveBayes.naiveBayes(data, label, data, label,
                                              datatype)
    if (errorrate > 0.5):
        print('初始分类器不满足要求')
        return

    cweight = 0.5 * math.log((1 - errorrate) / errorrate, math.e)
    classifiersweight.append(cweight)
    reweight(sampleweights, cweight, result)
    classifiers.append(list(sampleweights))
    print('第 1 个分类器的误差:', errorrate)

    # Train the remaining classifiers on data drawn by generateData.
    T = 2
    for iteration in range(1, T):
        newdata, newlabel, gindices = generateData(data, label, sampleweights)
        result, errorrate = NaiveBayes.naiveBayes(data, label, newdata,
                                                  newlabel, datatype)
        print('第', iteration + 1, '个分类器的误差:', errorrate)
        classifiersdata.append(newdata)
        classifierslabel.append(newlabel)

        cweight = 0.5 * math.log((1 - errorrate) / errorrate, math.e)
        classifiersweight.append(cweight)
        reweight(sampleweights, cweight, result)
        classifiers.append(list(sampleweights))

    return classifiersweight, classifiers, classifiersdata, classifierslabel
def emailTest2():
    """Classify 10 randomly held-out e-mails with a multinomial Naive Bayes.

    Reads ``email/spam/1..25.txt`` (class 1) and ``email/ham/1..25.txt``
    (class 0), holds out 10 random documents as the test set, trains
    ``MultinomialNB`` on the other 40 and prints the predicted class of every
    test document.
    """
    def read_words(path):
        # Close every file after reading; the original leaked all 50 handles.
        with open(path, 'r') as handle:
            return NaiveBayes.textParse(handle.read())

    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # 1 = spam, 0 = ham; spam is read first, as before.
        for pattern, email_class in (('email/spam/%d.txt', 1),
                                     ('email/ham/%d.txt', 0)):
            wordList = read_words(pattern % i)
            docList.append(wordList)
            fullText.append(wordList)
            classList.append(email_class)

    vocabList = NaiveBayes.createVocabList(docList)

    # Index lists: 10 of the 50 documents are moved to the test set.
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])

    testSampleText = [fullText[index] for index in testSet]

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(NaiveBayes.bagOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    clf = MultinomialNB()
    model = clf.fit(np.array(trainMat), np.array(trainClasses))

    for testEntry in testSampleText:
        testDoc = np.array(NaiveBayes.bagOfWords2Vec(vocabList, testEntry))
        testResult = model.predict(np.array(testDoc).reshape(1, -1))[0]
        print('The testSample is: ', testEntry, '\n')
        print('It is classified as : ', testResult, '\n')
        print('------------------------------------------------')
def predicting_using_naivebayes(train_mat_with, train_por_with, test_mat_with, test_por_with):
    """Run the NABE classifier on the Mat and Por data sets and print scores.

    For each subject the probabilities are computed from its training set and
    the F-score and accuracy are measured on its test set. Always returns 0.
    """
    # Intermediate probabilities: (total size, negative, positive).
    mat_params = NABE.calculate_probability_with(train_mat_with)
    por_params = NABE.calculate_probability_with(train_por_with)

    # Final scores on the test sets.
    totalsize, probability_n, probability_p = mat_params
    f_score_mat, accuracy_mat = NABE.naive_bayes_with(
        totalsize, probability_n, probability_p, test_mat_with)
    totalsize, probability_n, probability_p = por_params
    f_score_por, accuracy_por = NABE.naive_bayes_with(
        totalsize, probability_n, probability_p, test_por_with)

    report = 'NABE %s With G1, G2: Accuracy: %s f_score: %s'
    print(report % ('Mat', accuracy_mat, f_score_mat))
    print(report % ('Por', accuracy_por, f_score_por))
    return 0
def binary_naive_bayes():
    """Train the binary Naive Bayes model and print its evaluation statistics.

    The cleaned data is split with ``split(df, 0, int(0.3 * row count))``
    into a test and a training frame; a binary document vector is built from
    the training frame.
    """
    model = nb.NaiveBayesModel()
    clean = cn.DataCLean()
    doc_vector = dv.DocumentVector()

    df_clean, uniqueWords = clean.Clean()
    split_end = int(.3 * (df_clean['class'].count()))
    df_clean_test, df_clean_train = split(df_clean, 0, split_end)

    docVector = doc_vector.binary_docvector(df_clean_train, uniqueWords)
    (df_WordGivenPI, df_WordGivenNoPi, Prob_PI, Prob_NoPI,
     numWordsInPI, numWordsInNoPI) = model.TrainModel(docVector, uniqueWords)

    predict_df, test_data = model.predict(
        Prob_PI, Prob_NoPI, uniqueWords, df_WordGivenPI, df_WordGivenNoPi,
        numWordsInPI, numWordsInNoPI, df_clean_test, clean)

    print(
        "--------------Binary Naive Bayes Accuracy Stats---------------------------"
    )
    stats = em.Evaluate()
    TP, FN, TN, FP = stats.confusion_matrix(test_data, predict_df)

    # Each metric is computed right before it is printed, one after another.
    metrics = (
        ("Accuracy = ", lambda: stats.Accuracy(TP, TN, FP, FN)),
        ("Precision = ", lambda: stats.Precision(TP, FP)),
        ("Recall = ", lambda: stats.Recall(TP, FN)),
        ("fScore = ", lambda: stats.fScore(TP, FN, FP)),
        ("True Negative = ", lambda: stats.TrueNegative(TN, FP)),
    )
    for caption, compute in metrics:
        print(caption, compute())
    print(
        "---------------------------------------------------------------------"
    )
def handle_data(train_set, test_set):
    '''
    Evaluate three models - decision tree, knn and naive bayes - and write
    the decision tree followed by the accuracies to OUTPUT_FILE.
    :param train_set: training set, split into data, labels, attributes and
        the label key
    :param test_set: test set, split into data and labels
    '''
    train_data, train_data_labels, attributes, label_key = split_train_data(
        train_set)
    test_data, test_data_labels = split_test_data(test_set)

    decision_tree = DecisionTree.Model()
    knn = Knn.Model()
    naive_bayes = NaiveBayes.Model()

    def evaluate(algorithm):
        # Fit on the training data, predict the test set, and format the
        # accuracy with two digits after the decimal point.
        algorithm.set_data(train_data, train_data_labels,
                           [label_key, attributes])
        prediction = algorithm.predict(test_data)
        return "{0:.2f}".format(get_acc(prediction, test_data_labels))

    accuracies = [evaluate(algorithm)
                  for algorithm in (decision_tree, knn, naive_bayes)]

    # The tree is written first, the accuracies go to the same file.
    decision_tree.get_tree().write_tree(OUTPUT_FILE)
    write_accuracies(OUTPUT_FILE, accuracies)
def s(x):
    """Stack the base models' outputs and return the final model's prediction.

    Logistic regression, SVM and Naive Bayes each return two vectors for
    ``x``. The six vectors become the columns of one matrix (in that order),
    which is passed to ``model.predict``.
    """
    log1, log2 = logistic_regression.predict(x)
    svm1, svm2 = SVM.predict(x)
    nb1, nb2 = NaiveBayes.predict(x)

    # Turn every vector into a single column before joining them side by side.
    columns = tuple(
        vector.reshape(len(vector), 1)
        for vector in (log1, log2, svm1, svm2, nb1, nb2))
    stacked = np.concatenate(columns, axis=1)
    return model.predict(stacked)
def test():
    """Run a 5-fold evaluation of the Naive Bayes model and print the averages.

    Each fold uses a different 200-row window, passed to ``split`` as
    ``(start, end)``. The original never advanced the window, so all five
    iterations evaluated the same split.
    """
    model = nb.NaiveBayesModel()
    final_df, df = model.extract('E:/DATA/Sem8/fyp/merge.csv')
    stats = em.Evaluate()

    fold_size = 200
    accuracy = []
    precision = []
    recall = []
    fscore = []

    for fold in range(5):
        start = fold * fold_size
        df_test, df_train = split(final_df, start, start + fold_size)
        print(df_train)

        li_clean_text = model.clean_data(df_train)
        uniqueWords = model.make_unique_li(li_clean_text)
        # NOTE(review): the document vector is built from final_df rather
        # than df_train, and df_test is never passed on; confirm intended.
        docVector = model.binary_docvector(final_df, uniqueWords)
        (df_WordGivenPI, df_WordGivenNoPi, Prob_PI, Prob_NoPI,
         numWordsInPI, numWordsInNoPI) = model.TrainModel(docVector,
                                                          uniqueWords)
        predict_df, test_data = model.Predict(
            Prob_PI, Prob_NoPI, uniqueWords, df_WordGivenPI,
            df_WordGivenNoPi, numWordsInPI, numWordsInNoPI)

        TP, FN, TN, FP = stats.confusion_matrix(test_data, predict_df)
        accuracy.append(stats.Accuracy(TP, TN, FP, FN))
        precision.append(stats.Precision(TP, FP))
        recall.append(stats.Recall(TP, FN))
        fscore.append(stats.fScore(TP, FN, FP))

    print("accuracy = ", Average(accuracy))
    print("precison = ", Average(precision))
    print("recall = ", Average(recall))
    print("f-score = ", Average(fscore))
def filter_article_div(score_dic, article_div, article_dic):
    '''
    Remove every child of article_div that the classifier does not label as
    article text, and store the best-scoring headline in article_dic.

    score_dic maps each child to its rankings; the winner is chosen with
    NaiveBayes.extract_winner. Returns article_dic.
    '''
    headline_max = 0

    # Iterate over a snapshot: extract() changes article_div.contents, which
    # made the original skip elements and repeat the loop three times.
    for child in list(article_div.contents):
        rankings = score_dic[child]
        guess = NaiveBayes.extract_winner(rankings)

        # If several elements are guessed to be the headline, keep the one
        # with the highest headline score.
        if guess == "headline" and rankings['headline'] > headline_max:
            # Call get_text(); the original stored the bound method itself.
            article_dic['headline'] = child.get_text()
            headline_max = rankings['headline']

        if guess != "article":
            child.extract()

    return article_dic
def prediction():
    """Classify the e-mail stored in the session and redirect to the results.

    The message is read from ``session['email_msg']``, turned into a full
    feature matrix and passed to ``nb.predict``; the first element of the
    result is stored in ``session['status']``.
    """
    message = str(session['email_msg'])

    word_list = utils.clean_message(message)
    print(word_list)
    word_df = utils.make_dataframe([word_list])
    print(word_df.index[0])

    sparse_df = np.array(utils.make_sparse_matrix(word_df, word_index))
    full_df = np.array(nb.make_full_matrix(sparse_df, VOCAB_SIZE))

    output = nb.predict(full_df)
    session['status'] = output[0]
    # The original had a second return (render_template) after this one; it
    # could never run and has been removed.
    return redirect(url_for("results"))
def main(): """ main method :return: """ # get data train_df, test_df = generate_df("data/review_polarity/txt_sentoken") # separate training data into data, train_labels train_labels = pd.DataFrame(train_df["category"]) train_df = train_df["text"] # create model nb = NaiveBayes.NaiveBayes() # train nb.fit(train_df, train_labels) # predict output = nb.predict(test_df) # check accuracy df = pd.DataFrame() df['guess'] = output['guess'] df['actual'] = test_df['category'] df['correct'] = df['guess'] == df['actual'] print df print np.mean(df['correct'])
def given_real_data_test():
    """Train the classifier on sampled patient records and print its output.

    Loads the patient and transcript CSV files, keeps transcripts with
    positive Height, Weight and BMI, takes the first record per patient and
    samples 500 female and 500 male rows for training. The results are only
    printed for manual inspection; the test always passes.
    """
    # read_csv(index_col=0, parse_dates=True) is the documented replacement
    # for DataFrame.from_csv, which recent pandas releases no longer provide.
    patients = pandas.read_csv(
        './data/training_SyncPatient.csv', index_col=0,
        parse_dates=True).reset_index()
    transcripts = pandas.read_csv(
        './data/training_SyncTranscript.csv', index_col=0,
        parse_dates=True).reset_index()

    transcripts = transcripts[transcripts['Height'] > 0]
    transcripts = transcripts[transcripts['Weight'] > 0]
    transcripts = transcripts[transcripts['BMI'] > 0]

    joined_df = patients.merge(transcripts, on='PatientGuid', how='inner')
    final_df = joined_df.groupby('PatientGuid').first().reset_index()

    # .loc selects by index label, which is what the removed .ix did for the
    # integer index produced by reset_index().
    female_set = final_df.loc[np.random.choice(
        final_df[final_df['Gender'] == 'F'].index, 500)]
    male_set = final_df.loc[np.random.choice(
        final_df[final_df['Gender'] == 'M'].index, 500)]

    # Column 2 is used as the class label, columns 8-10 as the observation.
    training_data = [(x[2], (x[8], x[9], x[10])) for x in female_set.values]
    training_data += [(x[2], (x[8], x[9], x[10])) for x in male_set.values]

    classifier = NaiveBayes.Classifier()
    for class_label, input_data in training_data:
        classifier.train(classification=class_label, observation=input_data)

    # Manual verification
    pprint.pprint(classifier._calculate_model_parameters())

    print("Men")
    print(classifier.classify(observation=(71.3, 210.0, 23.509)))
    print(classifier.classify(observation=(66.0, 268.8, 27.241999999999997)))
    print(classifier.classify(observation=(65.0, 284.0, 30.616)))

    print("Women")
    print(classifier.classify(observation=(60.5, 151.0, 29.002)))
    print(classifier.classify(observation=(60.0, 148.0, 28.901)))
    print(classifier.classify(observation=(60.0, 134.923, 26.346999999999998)))

    assert True, "Always pass until we want to manually evaluate."
def main(): # trainFile = "../../data/spambase/missing_values/{}_percent_missing_train.txt" # testFile = "../../data/spambase/missing_values/{}_percent_missing_test.txt" set_printoptions(threshold='nan') Accuracy_train = ones(10) Accuracy_test = ones(10) for i in range(10): print "Working on data with {} testing set".format(i) # Step 1: loading data print "Loading data..." # trainX, trainY, testX, testY = util.loadData(trainFile.format(i*10), testFile.format(i*10)) # data = loadtxt('../../data/spambase/spambase.data', delimiter=',') # trainX, trainY, testX, testY = util.initialData(data) ##### gammas data = loadtxt('../../data/spambase/spambase.data', delimiter=',') trainX, trainY, testX, testY = util.initialGammaData(data, i) # # Step 2: training data print "Training data..." # model = NaiveBayes.train(trainX, trainY) # model = NaiveBayes.train_missing_value(trainX, trainY) ##### gammas model = NaiveBayes.train_gamma(trainX, trainY) # # Step 3: predict test data print "Predicting data..." # predict_y = NaiveBayes.test(testX, model) # predict_y = NaiveBayes.test_missing_value(testX, model) ##### gammas # train_y = NaiveBayes.test_gamma(trainX, model) test_y = NaiveBayes.test_gamma(testX, model) # # # Step 4: Calculate the Accuracy. print "Accuracy..." # accuracy = sum(predict_y == testY) / float(testY.size) # print "Accuracy on testing : {:.2f}%".format(accuracy*100) # print "....Done...." ##### gammas Accuracy_train[i] = sum(train_y == trainY) / float(trainY.size) print "Accuracy on training : {:.2f}%".format(Accuracy_train[i]*100) Accuracy_test[i] = sum(test_y == testY) / float(testY.size) print "Accuracy on test : {:.2f}%".format(Accuracy_test[i]*100) print "Total average accuracy on training: {:.2f}%".format(mean(Accuracy_train)*100) print "Total average accuracy on testing: {:.2f}%".format(mean(Accuracy_test)*100)
def __init__(self, filename,classifier='NaiveBayes'):
    """Load a tab-separated corpus and build the vocabulary from it.

    Column 0 of the file holds the labels and column 1 the text. The
    ``classifier`` argument is accepted but not read: a Naive Bayes
    classifier is always created.
    """
    self.classifier = NB.NaiveBayes()
    self.filename = filename

    # No header row; quoting=3 is csv.QUOTE_NONE, so quote characters in the
    # text are kept as ordinary characters.
    frame = pd.read_csv(filename, header=None, delimiter="\t", quoting=3)
    self.corpus = frame[1]
    self.labels = frame[0]

    self.build_vocab(self.corpus)
def tarea1(entrenamiento, prueba):
    """Plot the Naive Bayes and the Bayes classifiers for the given data.

    The training data is split into two parts with ``Main().split`` and both
    classifiers are built from the same arguments.
    """
    (t_0, t_1) = Main().split(entrenamiento)

    naive_bayes_model = NaiveBayes.NaiveBayes(entrenamiento, t_1, t_0, prueba)
    naive_bayes_model.plot()

    bayes_model = Bayes.Bayes(entrenamiento, t_1, t_0, prueba)
    bayes_model.plot()
def given_one_observation_for_two_classes_test():
    """Check classification after a single observation per class.

    NOTE(review): the assertions expect ``None`` while their message talks
    about the nearest class; confirm which of the two is intended.
    """
    classifier = NaiveBayes.Classifier()
    for class_label, value in (('a class', 0), ('b class', 100)):
        classifier.train(classification=class_label, observation=value)

    for probe in (23.2, 73.2):
        classification = classifier.classify(observation=probe)
        assert classification is None, "Should classify as the nearest class."
def main():
    """
    Loads data into partitions, creates a Naive Bayes model based on the
    train data, runs the model on the test data, and prints its accuracy and
    confusion matrix (rows are true labels, columns are predictions).
    """
    opts = util.parse_args()
    train_partition, test_partition = util.read_arff(opts.filename)
    nb_model = NaiveBayes(train_partition)

    examples = test_partition.data
    total = len(examples)
    total_correct = 0
    K = test_partition.K
    confusion_matrix = np.zeros((K, K), int)

    for example in examples:
        y_hat = nb_model.classify(example.features)
        y = example.label
        confusion_matrix[y][y_hat] += 1
        if y_hat == y:
            total_correct += 1

    # An empty test partition used to raise ZeroDivisionError here.
    accuracy = round(total_correct / total, 6) if total else 0.0
    accuracy_str = "Accuracy: " + str(accuracy) + " ("
    correct_str = str(total_correct) + " out of " + str(total) + " correct)"
    print(accuracy_str + correct_str)

    # Every column is right-aligned in a field of `stretch` characters.
    stretch = 8
    prediction_labels = " " + "".join(
        str(y_hat + 1).rjust(stretch) for y_hat in range(K))
    top_row = " " + "-" * (stretch * K)

    table = ""
    for y in range(K):
        table += " " + str(y + 1) + "|"
        table += "".join(
            str(confusion_matrix[y][y_hat]).rjust(stretch)
            for y_hat in range(K))
        table += "\n"

    print("\n\n prediction")
    print(prediction_labels)
    print(top_row)
    print(table)
def run():
    """Run the whole pipeline: load, clean, split, predict, report and log.

    The user is asked to confirm the chosen file and ratios. Answering "n"
    asks again; the pipeline then runs once with the confirmed settings. The
    original recursed on "n" and afterwards also ran with the rejected ones.
    """
    while True:
        file, training, test = get_input()
        print("File Being Used is " + file)
        print("Ratio of Training Data Being Used: " + str(training))
        print("Ratio of Testing Data Being Used: " + str(test))
        cont = input("Continue with this data? (Y/n)")
        if cont != "n":
            break

    print("IMPORTING DATA")
    data = nb.get_data_set(file)
    print("CLEANING DATA")
    data = nb.clean_data_set(data)
    print("IMPORTING CLEANED DATA")
    data = nb.get_data_set(data)

    print("SPLITTING THE DATA INTO TRAINING AND TEST DATA")
    training_data, testing_data = nb.split_data(data, training)
    print("LENGTH OF TRAINING DATA -> " + str(len(training_data)))
    print("LENGTH OF TESTING DATA -> " + str(len(testing_data)))

    print("CREATING CLASS SUMMARY")
    summary = nb.class_summary(training_data)
    print("MAKING PREDICTIONS ON TESTING DATA BASED OFF OF MODEL")
    testing = nb.prediction(summary, testing_data)

    accuracy, right, items = nb.accuracy(testing_data, testing)
    print("ACCURACY IS -> {:2}".format(accuracy))
    print("AMOUNT CORRECT IS -> {}".format(right))
    print("OUT OF -> {}".format(items))

    print("WRITING TO LOG")
    # The context manager closes the log even if the write fails.
    with open("log.txt", "w") as log:
        log.write("ACCURACY -> " + str(accuracy) + "\n" + "AMOUNT CORRECT -> " +
                  str(right) + "\n" + "OUT OF -> " + str(items))
def test(self):
    """Test on artificial data."""

    def getfeatures(text):
        """Function for tests: the distinct whitespace-separated tokens."""
        unique_tokens = set(text.split())
        return list(unique_tokens)

    bayes = NaiveBayes.NaiveBayes(getfeatures)
    # Hand-written (feature, class) -> count fixture.
    bayes.feature_count = {
        ('terms,', 'C1'): 1, ('considers', 'C2'): 1, ('independently', 'C3'): 1,
        ('each', 'C1'): 1, ('that', 'C1'): 1, ('the', 'C3'): 1,
        ('on', 'C1'): 1, ('features', 'C1'): 1, ('and', 'C3'): 1,
        ('is', 'C2'): 1, ('feature.', 'C2'): 1, ('For', 'C2'): 1,
        ('fruit', 'C2'): 1, ('features,', 'C2'): 1, ('classifier', 'C2'): 1,
        ('(or', 'C2'): 2, ('these', 'C1'): 1, ('the', 'C2'): 2,
        ('particular', 'C2'): 1, ('may', 'C2'): 1, ('Bayes', 'C2'): 1,
        ('all', 'C2'): 1, ('feature', 'C2'): 1, ('apple', 'C3'): 1,
        ('naive', 'C2'): 1, ('depend', 'C1'): 1, ('other', 'C2'): 2,
        ('if', 'C3'): 1, ('contribute', 'C3'): 1, ('any', 'C2'): 1,
        ('these', 'C2'): 1, ('4"', 'C3'): 1, ('classifier', 'C1'): 1,
        ('other', 'C1'): 1, ('of', 'C1'): 1, ('assumes', 'C1'): 1,
        ('Bayes', 'C1'): 1, ('Even', 'C1'): 1, ('presence', 'C1'): 1,
        ('the', 'C1'): 2, ('a', 'C2'): 3, ('upon', 'C1'): 1,
        ('that', 'C3'): 1, ('example,', 'C2'): 1, ('properties', 'C3'): 1,
        ('this', 'C3'): 1, ('to', 'C2'): 1, ('In', 'C1'): 1,
        ('round,', 'C3'): 1, ('about', 'C3'): 1, ('absence)', 'C2'): 2,
        ('of', 'C2'): 3, ('diameter.', 'C3'): 1, ('existence', 'C1'): 1,
        ('be', 'C3'): 1, ('considered', 'C3'): 1, ('a', 'C1'): 1,
        ('it', 'C3'): 1, ('an', 'C3'): 1, ('or', 'C1'): 1,
        ('if', 'C1'): 1, ('presence', 'C2'): 1, ('is', 'C3'): 1,
        ('to', 'C3'): 2, ('unrelated', 'C2'): 1, ('red,', 'C3'): 1,
        ('probability', 'C3'): 1, ('naive', 'C1'): 1, ('class', 'C2'): 1,
        ('in', 'C3'): 1, ('simple', 'C1'): 1,
    }
    bayes.class_count = {'C1': 2, 'C2': 3, 'C3': 2}

    # Queried (feature, class) pairs, paired positionally with the expected
    # featprob() results below.
    feat_cats = [
        ('of', 'C2'), ('to', 'C3'), ('features', 'C1'), ('Bayes', 'C1'),
        ('of', 'C1'), ('to', 'C5'), ('features', 'C3'), ('Bayes', 'C2'),
    ]
    probs = [0.0, 0.0, -0.6931, -0.6931, -0.6931, -1e+300, -7.6009, -1.0986]

    for (feature, category), expected in zip(feat_cats, probs):
        actual = featprob(bayes, feature, category)
        self.assertAlmostEqual(actual, expected, 4)
def spamTest():
    """Hold-out evaluation of the Naive Bayes spam classifier.

    Reads e-mails 1..25 from both the spam and the ham directory (label 1 for
    spam, 0 for ham), builds the vocabulary, moves 10 randomly chosen
    documents into a test set, trains on the remaining ones and prints the
    error rate measured on the test set.

    Changes from the earlier version: every e-mail file is closed after
    reading, the index pool is a real list so deleting from it also works on
    Python 3, and the never-read ``fullTest`` accumulator is gone.
    """
    #read datas from text to docList and classList
    docList = []
    classList = []
    sources = ((r'Datas\email\spam\%d.txt', 1), (r'Datas\email\ham\%d.txt', 0))
    for i in range(1, 26):
        # Keep the original interleaving: spam i, then ham i.
        for pattern, label in sources:
            with open(pattern % i) as email_file:
                wordList = textParse(email_file.read())
            docList.append(wordList)
            classList.append(label)
    vocabList = nb.createVocabList(docList)

    #move Data from trainingSet to testSet
    # list(...) is required: a Python 3 range object does not support `del`.
    trainingSet = list(range(len(docList)))
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    #generate train Mat and Classes
    trainMat = [nb.setOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]

    #train NaiveBayes classifier
    p0V, p1V, pSpam = nb.trainNaiveBayes0(array(trainMat), array(trainClasses))

    #test classifier use testSet
    errorCount = 0
    for docIndex in testSet:
        wordVector = nb.setOfWords2Vec(vocabList, docList[docIndex])
        if nb.classifyNaiveBayse(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1

    # Single-argument print: identical output on Python 2 and Python 3.
    print('%s %s' % ('the error rate is : ', float(errorCount) / len(testSet)))
def main():
    """Train a two-label Naive Bayes model and classify one sample.

    Features are extracted from two hard-coded training files, one per label
    ("Company" and "Unrelated"). The share of "Company" features in the
    combined training set is printed, the examples are added to the model,
    and the label predicted for the features ``["Apple", "Mac"]`` is printed.
    """
    nb = NaiveBayes()

    ## Training Set
    # Raw strings: the paths hold sequences such as backslash-U and
    # backslash-u, which a normal literal rejects on Python 3.
    rTrainFeatures = getFeatures(r"C:\Users\John\Documents\BRMSentimentAnalysis\data\MSFTSmall.txt")
    uTrainFeatures = getFeatures(r"C:\Users\John\Documents\BRMSentimentAnalysis\data\unrelated.txt")
    features = [rTrainFeatures, uTrainFeatures]
    labels = ["Company", "Unrelated"]

    # Fraction of all training features that belong to the "Company" set.
    f = float(len(rTrainFeatures)) / (len(rTrainFeatures) + len(uTrainFeatures))
    print(f)

    nb.addExamples(features, labels)

    ## Test Set
    test_features = ["Apple", "Mac"]
    # Single-argument print: identical output on Python 2 and Python 3.
    print("%s %s %s" % (
        "The test example",
        test_features,
        "should be labeled 'Company', and is in fact labeled: {0}".format(nb.classify(test_features)),
    ))
def storeTweetSentiment(self, tweet):
    """Classify one tweet's sentiment and append a row to ``FFsentiment.csv``.

    The row holds the related topic term (element 1 of
    ``self.isTweetRelatedFF(tweet)``), the tweet text, the sentiment returned
    by ``NaiveBayes.sentimentClassify`` and the state from
    ``getTweetState(tweet)``.

    This is best-effort: any ordinary exception is reported on stdout and
    swallowed, so one bad tweet does not abort the caller. KeyboardInterrupt
    and SystemExit are no longer swallowed.
    """
    try:
        related_topic_term = self.isTweetRelatedFF(tweet)[1]
        text = tweet['text']
        sentiment = NaiveBayes.sentimentClassify(text, sentimentTokenizer, stopwords, sentimentClassifier)
        state = getTweetState(tweet)
        # Open per call in append mode and close deterministically; the
        # earlier code leaked one file handle per stored tweet.
        with open('FFsentiment.csv', 'a') as sentiment_file:
            sentiment_file_writer = csv.writer(sentiment_file, lineterminator='\n')
            sentiment_file_writer.writerow((related_topic_term, text, sentiment, state))
    except Exception as exc:
        # Report the exception class, as sys.exc_info()[0] did before.
        print("<p>Error: %s</p>" % type(exc))
def learnClassifer(self):
    """Fit a NaiveBayes model on the feature factory's table and return it pickled.

    Each row of ``self.featureFactory.datatable`` becomes one instance with
    ``cases`` set to 1, its values keyed by the column index as a string, and
    the label taken from ``self.featureFactory.classes`` at the same
    position. Every attribute name seen (once per row and column, so with
    repeats) is passed to ``model.set_real`` before training. The trained
    model is stored on ``self.model``.

    Returns the pickled model encoded with ``'string_escape'``.
    """
    model = NaiveBayes()
    attributes = []
    for row_index in range(len(self.featureFactory.datatable)):
        instance = {}
        instance['cases'] = 1
        instance['attributes'] = {}
        row = self.featureFactory.datatable[row_index]
        for column, value in enumerate(row):
            column_name = str(column)
            instance['attributes'][column_name] = value
            attributes.append(column_name)
        instance['label'] = self.featureFactory.classes[row_index]
        model.add_instances(instance)
    model.set_real(attributes)
    model.train()
    self.model = model
    pickled_model = pickle.dumps(model)
    return pickled_model.encode('string_escape')
def saveCounts(): with open('total-counts.csv', 'w') as counts: counts= csv.DictWriter(counts, states.keys()) counts.writeheader() counts.writerow(states) if __name__ == '__main__': #geolocator = GoogleV3() tweets = open('test.json', 'w', encoding='utf-8') tweets.close() sentiment_file_writer = csv.writer(open('FFsentiment.csv','w'), lineterminator='\n') sentimentClassifier = NaiveBayes.getSentimentClassifier() print("Sentiment Classifier Created") sentimentTokenizer = happytokenizer.TweetTokenizer() #This handles Twitter authentication and the connection to Twitter Streaming API Tweetlistener = StdOutListener() auth = OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_token, access_token_secret) stream = Stream(auth, Tweetlistener) #This line filter Twitter Streams to capture data posted from US in English with utf-8 encoding while True: try: stream.filter(languages=['en'], async=False, locations=[-125,25,-65,48]) except:
def main():
    """Run the YELP review-usefulness experiments from start to finish.

    Loads the training reviews, builds the column vector of 'useful' vote
    counts, draws the helpfulness distribution graph, forks one child process
    per regression configuration and waits for all of them, then runs the
    adjective/adverb regression with testing, one further test configuration,
    and finally the Naive Bayes category scoring.
    """
    trainingFiles, testFiles = getYELPFiles()
    print 'importing ~230,000 reviews...'
    reviewsList = importJSONFiles([trainingFiles[REVIEW]])[0]
    print 'import finished'
    # construct list "y" of scores
    scoreVector = np.asmatrix([review['votes']['useful'] for review in reviewsList]).T
    # GENERATE GRAPH
    #################################
    Graphs.helpfulDist(reviewsList)
    #################################
    # CONCURRENT REGRESSION CONFIGURATIONS
    ###############################################################################################
    # Each os.fork() returns 0 in the child, which runs one configuration and
    # exits; the parent keeps the child's pid and moves on to the next fork.
    # NOTE(review): exit() is given the float RMSLE rather than an integer
    # status, and os.waitpid returns a (pid, status) pair, so RMSLE1..RMSLE7
    # below presumably do not hold the RMSLE values -- confirm intent.
    pid1 = os.fork()
    if pid1 == 0:
        weightVector, RMSLE = regSentences(reviewsList, scoreVector) # RMSLE = 0.6447
        exit(RMSLE)
    pid2 = os.fork()
    if pid2 == 0:
        weightVector, RMSLE = regLines(reviewsList, scoreVector) # RMSLE = 0.6382
        exit(RMSLE)
    pid3 = os.fork()
    if pid3 == 0:
        weightVector, RMSLE = regLinesSqrLines(reviewsList, scoreVector) # RMSLE = 0.6371
        exit(RMSLE)
    pid4 = os.fork()
    if pid4 == 0:
        weightVector, RMSLE = regLinesLogLines(reviewsList, scoreVector) # RMSLE = 0.6365
        exit(RMSLE)
    pid5 = os.fork()
    if pid5 == 0:
        weightVector, RMSLE = regLinesSentences(reviewsList, scoreVector) # RMSLE = 0.6320
        exit(RMSLE)
    pid6 = os.fork()
    if pid6 == 0:
        weightVector, RMSLE = regUserScores(reviewsList, scoreVector, trainingFiles) # RMSLE = 0.5330
        exit(RMSLE)
    # NOTE(review): the same configuration as the pid6 child is run again
    # here, sequentially in the parent, before the last fork -- confirm this
    # duplicate is intended.
    weightVector, RMSLE = regUserScores(reviewsList, scoreVector, trainingFiles) # RMSLE = 0.5330
    pid7 = os.fork()
    if pid7 == 0:
        weightVector, RMSLE = regLogLinesLogSentences(reviewsList, scoreVector) # RMSLE = 0.6340
        exit(RMSLE)
    # Block until every child has finished.
    RMSLE1 = os.waitpid(pid1,0)
    RMSLE2 = os.waitpid(pid2,0)
    RMSLE3 = os.waitpid(pid3,0)
    RMSLE4 = os.waitpid(pid4,0)
    RMSLE5 = os.waitpid(pid5,0)
    RMSLE6 = os.waitpid(pid6,0)
    RMSLE7 = os.waitpid(pid7,0)
    ###############################################################################################
    # REGRESSION (with testing) ON ADJECTIVES AND ADVERBS RMSLE = 0.6329
    #################################################################################
    # CONCURRENT training (set desired number of training reviews to use inside the method)
    weightVector, RMSLE = concurrentFeatureExtractor(reviewsList, scoreVector)
    # SEQUENTIAL training (set desired number of training reviews to use inside the method)
    # The sequential result overwrites the concurrent one; only it is tested.
    weightVector, RMSLE = regLinesAdjAdv(reviewsList, scoreVector)
    # concurrent testing
    TestSet.testConcAdjAdv(testFiles, weightVector)
    #################################################################################
    # 2 other possible test configurations
    #################################################################
    weightVector, RMSLE = regLinesSentences(reviewsList, scoreVector)
    TestSet.testLinesSentences(testFiles, weightVector)
    #################################################################
    # NAIVE BAYES
    ####################################################
    NaiveBayes.probScoreGivenCategories(trainingFiles)
    ####################################################
    print '\nGot to the end, Terminating...'
# # # Main program # # #
##############################
# Usage: <script> <data file> <examples file> <algorithm>
# An <algorithm> other than 'NB' has every "NN" removed and the remainder is
# passed to int() as the k for KNN.knearest (e.g. "3NN" -> 3).

#Read information from the command line
file = sys.argv[1]
examples = sys.argv[2]
algorithm = sys.argv[3]

#Check which algorithm will be used
if algorithm != 'NB':
    algorithm = algorithm.replace("NN", "")

#Read data and train it for Naive Bayes
data = Helper.readfile(file)
train = NaiveBayes.train_nb(data)

#Read example data
# The context manager closes the examples file once every line is processed.
with open(examples, 'r') as f:
    #Test every example
    for line in f:
        # A blank line (e.g. a trailing newline) would make float('') raise.
        if not line.strip():
            continue
        array_line = line.split(',')
        row = [float(value) for value in array_line]
        #Apply the algorithm
        if algorithm != 'NB':
            print(KNN.knearest(int(algorithm), data, row))
def formEnsembleClassifiers(training_class, training_data, max_attribute_values, k, max_run=5):
    """Build ``k`` Naive Bayes classifiers with an AdaBoost-style loop.

    Every training tuple starts with the same weight. In each of the ``k``
    rounds the training set is resampled with replacement according to the
    current weights (``prefixScan`` + ``drawRandomPD``), a classifier is
    built on the sample and its error is measured with ``findMiError``.
    Samples are redrawn until a classifier with error below 0.5 is found;
    that classifier and its error are kept, and the tuple weights are updated
    through ``assignNewTupleWeights``.

    Args:
        training_class: class labels, parallel to ``training_data``.
        training_data: training tuples.
        max_attribute_values: forwarded unchanged to ``nb.makeClassifier``.
        k: number of boosting rounds, i.e. classifiers to return.
        max_run: accepted for backward compatibility but currently unused.
            There is no cap on the number of resampling attempts per round,
            so a round never ends if no sample reaches error < 0.5.

    Returns:
        ``(ensemble_classifiers, errors_Mi)``: the accepted classifiers and
        their errors, in round order.
    """
    num_training_data = len(training_class)
    ensemble_classifiers = []
    errors_Mi = []
    tuple_weights = [1.0 / len(training_data) for _ in training_data]

    for _ in range(k):
        # Redraw until the round produces a better-than-chance classifier.
        # Rejected classifiers are simply dropped: nothing ever read the
        # per-round history lists the earlier code kept.
        while True:
            new_training_data = []
            new_training_class = []
            prefixed_weights = prefixScan(tuple_weights)
            for _ in range(num_training_data):
                pick_id = drawRandomPD(prefixed_weights)
                new_training_data.append(training_data[pick_id])
                new_training_class.append(training_class[pick_id])

            Mi = nb.makeClassifier(new_training_class, new_training_data, max_attribute_values)
            predicted_class = nb.predictClass(new_training_data, Mi)
            # NOTE(review): predictions and labels here belong to the
            # resampled tuples while tuple_weights is indexed by original
            # tuple position -- confirm findMiError and
            # assignNewTupleWeights expect this pairing.
            eMi = findMiError(predicted_class, new_training_class, tuple_weights)
            if eMi < 0.5:
                ensemble_classifiers.append(Mi)
                errors_Mi.append(eMi)
                break

        print(eMi)

        new_tuple_weights = assignNewTupleWeights(tuple_weights, eMi, predicted_class, new_training_class)
        # A None result keeps the previous weights for the next round. The
        # old `rk -= 1` that accompanied this case had no effect on a
        # `for ... in range(...)` loop, so the round was never repeated.
        if new_tuple_weights is not None:
            tuple_weights = new_tuple_weights

    return ensemble_classifiers, errors_Mi
# Toy training set: one three-value numeric row per person, labelled in
# `labs` at the same position.
data = [
    [5.92, 190, 11],
    [5.58, 170, 12],
    [5.92, 165, 10],
    [5, 100, 6],
    [5.5, 150, 8],
    [5.42, 130, 7],
    [5.75, 150, 9],
    [6, 180, 12],
    [7, 220, 11],
]
labs = [
    "male", "male", "male",
    "female", "female", "female", "female",
    "male", "male",
]

# Rows to classify.
pred_data = [
    [6, 130, 8],
    [7, 199, 12],
    [5.42, 170, 8],
    [5.8, 220, 11],
]

# Train the local implementation, then print one prediction per line.
node, prior_prob = NaiveBayes.train(data, labs)
output = NaiveBayes.predict(node, prior_prob, pred_data)
for predicted_value in output:
    print(predicted_value)

### NOW TESTING ITS OUTPUT COMPARED TO sklearn.naive_bayes implementation.
X = array(data)
y = array(labs)
gnb = GaussianNB()
classifier = gnb.fit(X, y)
features_prob_product_negative = float(features_prob_product_negative) * float(round_model[current_round_ctr][string_lookup]) if (float(features_prob_product_positive*round_model[current_round_ctr]['prior_positive']) >= float(features_prob_product_negative*round_model[current_round_ctr]['prior_negative'])): predicted_label = "+1" boosted_prediction = float(boosted_prediction) + float(float(round_alpha[current_round_ctr]) * float(predicted_label)) if boosted_prediction > 0: final_prediction = "+1" else: final_prediction = "-1" adaboost_predictions.append(final_prediction) #print "Done with Adaboost predictions." return adaboost_predictions if __name__ == "__main__": if(len(sys.argv)) != 3: print NaiveBayes.usage("NBAdaBoost.py") sys.exit(1) else: train_file_name = sys.argv[1] test_file_name = sys.argv[2] train_file = open(train_file_name,"r") test_file = open(test_file_name,"r") training_data,testing_data,values_in_features,max_index = NaiveBayes.process_files(train_file,test_file) train_file.close() test_file.close() adaboost_predictions = run_adaboost(training_data,training_data,values_in_features,max_index) NaiveBayes.print_metrics(training_data,adaboost_predictions) adaboost_predictions = run_adaboost(training_data,testing_data,values_in_features,max_index) NaiveBayes.print_metrics(testing_data,adaboost_predictions)
else: if (int(df[df['날짜'] >= issueDate].tail(1)['종가']) < int(df[df['날짜'] < issueDate].head(1)['종가'])): print('down') docList.append(wordList) classList.append(0) else: print('hold') docList.append(wordList) classList.append(0) except: continue else: pass vocaList = NaiveBayes.createVocabList(docList) trainMat = [] for postinDoc in docList: trainMat.append(NaiveBayes.setOfWords2Vec(vocaList, postinDoc)) print('vocaList : ', vocaList) print('trainMat : ', trainMat) print('testEntry : ', testEntry) p0V, p1V, pAb = NaiveBayes.trainNB0(array(trainMat), array(classList)) # testEntry = ['카카오', '인공지능', '알파고'] thisDoc = array(NaiveBayes.setOfWords2Vec(vocaList, testEntry)) print(testIssueDate, ' 일자의 ', testTitle, '기사 이후 주가는 ?\n', NaiveBayes.classifyNB(thisDoc, p0V, p1V, pAb)) else: print("Error Code:" + rescode)