def smoothing_eval():
    """Sweep the multinomial smoothing factor upward and log accuracy per step.

    Starting from 0.5 and stepping by 0.05, retrains/evaluates the classifier
    each step and appends "acc separation incorrects factor" to
    smoothing_eval.txt.  Stops as soon as the error count rises compared with
    the previous step ("best implies least incorrect").
    Uses module-level train_docs/train_labels/eval_docs/eval_labels.
    """
    # best implies least incorrect
    best_one = float("inf")   # incorrect count for the current factor
    best_two = 0              # incorrect count for the previous factor
    bayes.multinomial_factor = 0.5
    smoothing_increment = 0.05
    while True:
        best_two = best_one
        bayes.multinomial_factor += smoothing_increment
        classification_rules = bayes.train_nb(train_docs, train_labels)
        best_guess = bayes.classify_documents(eval_docs, classification_rules,
                                              bayes.unique_set(train_labels))
        error_cases, acc, incorrects, seperation = bayes.accuracy(
            eval_labels, best_guess, eval_docs)
        best_one = incorrects
        # Append one result line per factor; context manager guarantees close.
        with open("smoothing_eval.txt", "a+") as log_file:
            log_file.write(
                str(acc) + " " + str(seperation) + " " + str(incorrects) +
                " " + str(bayes.multinomial_factor) + "\n")
        print(bayes.multinomial_factor)
        print(str(best_one) + " " + str(best_two))
        # BUG FIX: the original ended the loop body with `continue`, which is a
        # no-op inside `while True` — the sweep never terminated.  Stop once
        # the error count gets worse than the previous step.
        if best_one > best_two:
            break
def access():
    """Run 10 random 70/30 train/test splits and print per-run and mean accuracy.

    Loads the dataset named by the module-level `_filename`, trains the
    spam/ham Naive Bayes model on each split, and averages accuracy over the
    10 runs.  Uses the module-level `bayes` helpers.
    """
    totalAcc = 0.0
    # Loading the corpus and building the vocabulary are split-independent,
    # so hoist them out of the 10-iteration loop.
    labels, samples = bayes.load_data(_filename)
    dic = bayes.buildVocabularyList(samples)
    for i in range(10):
        # splitData presumably re-randomizes the 70/30 split each run — TODO confirm.
        trainIdx, testIdx = bayes.splitData(samples.index, 0.7)
        trainVecs, trainLabels, pw = bayes.buildVecsAndLabels(
            samples[trainIdx], labels[trainIdx], dic)
        spamPro, pamPro = bayes.bayesian(trainVecs, trainLabels, dic)
        acc = bayes.accuracy(samples[testIdx], labels[testIdx], dic,
                             spamPro, pamPro, pw)
        totalAcc += acc
        # FIX: Python-2 print statement — the rest of the file uses print().
        print('accuracy = %s' % acc)
    averageAcc = float(totalAcc) / 10
    print('average accuracy is %f' % averageAcc)
def stopword_eval(stopwords):
    """Ablation study: drop stopwords one at a time (from the end) and log accuracy.

    After each removal the classifier is retrained and evaluated, and a line
    "acc incorrects remaining-stopwords" is appended to stopword_eval.txt.
    NOTE: mutates the `stopwords` list in place, by design — each iteration
    evaluates the progressively shorter list.
    Uses module-level train_docs/train_labels/eval_docs/eval_labels.
    """
    for i in reversed(range(0, len(stopwords))):
        # BUG FIX: `stopwords.remove(stopwords[i])` deletes the FIRST equal
        # element, which is the wrong one when the list contains duplicates.
        # Delete positionally instead.
        del stopwords[i]
        classification_rules = bayes.train_nb(train_docs, train_labels)
        best_guess = bayes.classify_documents(eval_docs, classification_rules,
                                              bayes.unique_set(train_labels))
        error_cases, acc, incorrects, seperation = bayes.accuracy(
            eval_labels, best_guess, eval_docs)
        # Context manager guarantees the log is flushed/closed every iteration.
        with open("stopword_eval.txt", "a+") as log_file:
            log_file.write(
                str(acc) + " " + str(incorrects) + " " +
                " ".join(stopwords) + "\n")
def synonym_eval():
    """Ablation study over the synonym table: vary how many rows are applied.

    For each i, rebuilds bayes.synonoyms from synonyms.txt (keeping only rows
    with index > i — NOTE(review): this keeps the *tail* of the file; confirm
    that is the intended direction of the sweep), retrains/evaluates, and
    appends "acc incorrects <json synonym map>" to synonym_eval.txt.
    Uses module-level train_docs/train_labels/eval_docs/eval_labels.
    """
    # manually set max range each time
    for i in range(0, 430):
        bayes.synonoyms = {}
        with open("synonyms.txt", 'r') as syn_file:
            raw_synonoyms = syn_file.readlines()
        for index, line in enumerate(raw_synonoyms):
            if index > i:
                # Row format: RHS<TAB>lhs1,lhs2,... — map every LHS to the RHS.
                line = line.replace("\n", "").upper().split("\t")
                lhs = line[1].split(",")
                rhs = line[0]
                for key in lhs:
                    bayes.synonoyms[key] = rhs
        classification_rules = bayes.train_nb(train_docs, train_labels)
        best_guess = bayes.classify_documents(eval_docs, classification_rules,
                                              bayes.unique_set(train_labels))
        error_cases, acc, incorrects, seperation = bayes.accuracy(
            eval_labels, best_guess, eval_docs)
        with open("synonym_eval.txt", "a+") as log_file:
            # BUG FIX: the original wrote " ".join(json.dumps(...)), which
            # space-separates every CHARACTER of the JSON string.  Write the
            # JSON document itself.
            log_file.write(
                str(acc) + " " + str(incorrects) + " " +
                json.dumps(bayes.synonoyms) + "\n")
train_dinner_dict = bayes.fea_and_class(train_dinner)
# ---calculate the mean and cov according to class------------------------
train_breakfast_summaries = bayes.normalsummarize(train_breakfast_dict)
train_lunch_summaries = bayes.normalsummarize(train_lunch_dict)
train_dinner_summaries = bayes.normalsummarize(train_dinner_dict)
# ---predict the class and get a list of classes--------------------------
predictions = []
for i in range(len(val_x_list)):
    # Each validation row starts with its meal type; strip it and predict
    # with the matching per-meal model.
    # BUG FIX: the three tests were independent `if`s, but the first branch
    # deletes element 0, so the later tests re-examined the SHIFTED row and
    # could match (and mutate) it a second time.  The branches are mutually
    # exclusive — use elif.
    if val_x_list[i][0] == "breakfast":
        del val_x_list[i][0]
        prediction = bayes.normalpredict(train_breakfast_dict,
                                         train_breakfast_summaries,
                                         val_x_list[i])
    elif val_x_list[i][0] == "lunch":
        del val_x_list[i][0]
        prediction = bayes.normalpredict(train_lunch_dict,
                                         train_lunch_summaries,
                                         val_x_list[i])
    elif val_x_list[i][0] == "dinner":
        del val_x_list[i][0]
        prediction = bayes.normalpredict(train_dinner_dict,
                                         train_dinner_summaries,
                                         val_x_list[i])
    # NOTE(review): if the label is none of the three, the previous row's
    # `prediction` is re-appended — confirm the data guarantees a match.
    predictions.append(prediction)
# calculate the accuracy using the validation and predictions--------------
del val["Type"]
val_array = np.array(val)
val_list = val_array.tolist()
accuracy += bayes.accuracy(val_list, predictions)
# presumably accumulated over 1000 outer repetitions — TODO confirm divisor.
average_accuracy = accuracy / 1000.0
# FIX: Python-2 print statement — the rest of the file uses print().
print(average_accuracy)
# Load the stopword list and normalize every entry to bare upper-case tokens.
with open("stopwords.txt", 'r') as stop_file:
    bayes.filter = [entry.replace("\n", "").upper()
                    for entry in stop_file.readlines()]

# Load the synonym table: each row is RHS<TAB>lhs1,lhs2,...; map every LHS
# token onto its canonical RHS form.
with open("synonyms.txt", 'r') as syn_file:
    raw_synonoyms = syn_file.readlines()
for index, line in enumerate(raw_synonoyms):
    fields = line.replace("\n", "").upper().split("\t")
    canonical = fields[0]
    for alias in fields[1].split(","):
        bayes.synonoyms[alias] = canonical
print(bayes.synonoyms)

# Train on the training split, classify the evaluation split, and report
# every misclassified document.
classification_rules = bayes.train_nb(train_docs, train_labels)
best_guess = bayes.classify_documents(eval_docs, classification_rules,
                                      bayes.unique_set(train_labels))
error_cases, acc, incorrects, seperation = bayes.accuracy(
    eval_labels, best_guess, eval_docs)
for error in error_cases:
    print(error)