def getSolution(self):
    classifier = NaiveBayesClassifier()
    solution = ''.join([
        classifier.getClassification(jchar_CSV)
        for jchar_CSV in self.jchar_CSV_list
    ])
    return solution

def summarize_text(self, sites, articles):
    if len(articles) < 1:
        return ["Not enough information about player"]
    summary_methods = NaiveBayesClassifier()
    return summary_methods.get_summary(sites, articles)

def NBCTest():
    nbc = NaiveBayesClassifier()
    sites = [
        "https://www.theplayerstribune.com/doublelift-league-of-legends-everyone-else-is-trash/"
    ]
    article_extractor = articles.ArticleExtractor('doublelift', 'league of legends', 5)
    #sites = article_extractor.get_websites()
    article = [article_extractor.parse_websites(site) for site in sites]
    string_text = list_to_string(article)
    string_text = string_text.replace("', '", ' ')
    string_text = string_text.replace('", "', ' ')
    summary = nbc.get_summary(string_text, 5)
    print("Summary:")
    for sentence in summary:
        print(u'\u2022 ' + sentence.lstrip("[]1234567890',.\" "))

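# NBCTest calls a list_to_string helper that is not shown in this snippet.
# A minimal sketch, assuming (from the replace()/lstrip() cleanup above,
# which strips list brackets, quotes, and separators) that the helper simply
# stringifies the nested list of parsed articles:
def list_to_string(fragments):
    # str() keeps the list punctuation; the caller strips it afterwards.
    return str(fragments)
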
import glob
import random
import re
from collections import Counter

def main():
    data = []
    for verdict in ['spam', 'not_spam']:
        for files in glob.glob(PATH + verdict + "/*")[:500]:
            is_spam = verdict == 'spam'
            with open(files, "r", encoding='utf-8', errors='ignore') as f:
                for line in f:
                    if line.startswith("Subject:"):
                        subject = re.sub("^Subject: ", "", line).strip()
                        data.append((subject, is_spam))
    random.seed(0)
    train_data, test_data = split_data(data, 0.75)
    classifier = NaiveBayesClassifier()
    classifier.train(train_data)
    print("Spam" if classifier.classify("Get free laptops now!") > 0.5 else "Not Spam")
    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]
    count = Counter((is_spam, spam_probability > 0.5)
                    for _, is_spam, spam_probability in classified)
    spammiest_hams, hammiest_spams = most_misclassified(classified)
    print("Accuracy: ", accuracy(count))
    print("Precision: ", precision(count))
    print("Recall: ", recall(count))
    print("\nTop 5 falsely classified as spam:\n\n", spammiest_hams)
    print("\nTop 5 falsely classified as not spam:\n\n", hammiest_spams)
    print("\nSpammiest words: ", spammiest_word(classifier))

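# main() above leans on several helpers this snippet does not define
# (most_misclassified and spammiest_word are omitted here too). Minimal
# sketches follow; the exact originals may differ, but these match how the
# helpers are called: the Counter keys are (actual_is_spam, predicted_is_spam)
# pairs, so the standard metric definitions apply, and split_data is the
# probabilistic split used in "Data Science from Scratch".
def split_data(data, prob):
    # Each row goes to the training split with probability `prob`.
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results

def accuracy(count):
    correct = count[(True, True)] + count[(False, False)]
    return correct / sum(count.values())

def precision(count):
    predicted_spam = count[(True, True)] + count[(False, True)]
    return count[(True, True)] / predicted_spam

def recall(count):
    actual_spam = count[(True, True)] + count[(True, False)]
    return count[(True, True)] / actual_spam
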
def spamFilterChecker():
    print('Received the JAVA Request!')
    # Get the text data from the JAVA Program.
    req_data = abhishek_request.get_json()
    text_to_be_classified = req_data['text_to_be_classified']
    print(text_to_be_classified)
    # ----------------------------------------------------------------------
    # Train a Naive Bayes classifier on the local spam corpus.
    # ----------------------------------------------------------------------
    data = []
    for verdict in ['spam', 'not_spam']:
        for files in glob.glob(PATH + verdict + "/*")[:500]:
            is_spam = verdict == 'spam'
            with open(files, "r", encoding='utf-8', errors='ignore') as f:
                for line in f:
                    if line.startswith("Subject:"):
                        subject = re.sub("^Subject: ", "", line).strip()
                        data.append((subject, is_spam))
    random.seed(0)
    train_data, test_data = split_data(data, 0.80)
    classifier = NaiveBayesClassifier()
    classifier.train(train_data)
    value = classifier.classify(text_to_be_classified)
    # The print below treats value as the probability of ham, so values
    # under the 0.9 threshold are labelled spam.
    if value < 0.9:
        json_response = '{"email_class": "spam"}'
    else:
        json_response = '{"email_class": "ham"}'
    print("====================================================")
    print("POSSIBILITY OF HAM : ", value)
    print(json_response)
    print("====================================================")
    return json_response

import pandas

from naive_bayes_classifier import NaiveBayesClassifier

df = pandas.read_csv("weather.csv", sep=";")
naive_bayes = NaiveBayesClassifier(df, 0.6)
naive_bayes.train_algorithm()
accuracy = naive_bayes.test_algorithm()
print("Algorithm Accuracy : " + str(accuracy) + " %")
tuple_data = ('Overcast', 'Hot', 'Normal', True)
prediction = naive_bayes.predict(tuple_data)
print("Prediction for " + str(tuple_data) + " is " + prediction)

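# weather.csv is not shown. Judging from the (Outlook, Temperature, Humidity,
# Windy) shape of tuple_data, it is presumably the classic play-tennis
# dataset, semicolon-separated; a hypothetical first few rows might look like:
#
#   Outlook;Temperature;Humidity;Windy;Play
#   Sunny;Hot;High;False;No
#   Overcast;Hot;High;False;Yes
#
# The exact column names and label values here are assumptions, not taken
# from the original file.
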
svm_classifier = SvmClassifier()
svm_classifier.setSvm(svm)
# RF configuration
rf = RfModule()
rf_classifier = RfClassifier()
rf_classifier.setRf(rf)
# Decision tree configuration
dt = DecisionTreeModule()
dt_classifier = DecisionTreeClassifier()
dt_classifier.setDecisionTree(dt)
# Naive Bayes configuration
naive_bayes = NaiveBayesModule()
naive_bayes_classifier = NaiveBayesClassifier()
naive_bayes_classifier.setNaiveBayes(naive_bayes)
# LSTM configuration
lstm = LstmModule()
lstm.setInputLength(20)
lstm.setNumberExamples(1000)
lstm_classifier = LstmClassifier()
lstm_classifier.setLstm(lstm)
# Neural network configuration
rna = RnaModule()
rna.setNumberNeuronsImputLayer(20)
rna.setActivationFunctionImputLayer("tanh")
rna.setImputDimNeurons(20)
rna.setNumberNeuronsHiddenLayer(20)

class TestClassifier(unittest.TestCase):
    def setUp(self):
        self.examples = {'university': ['''Abbottabad Public School , also commonly referred to as APS and Railway Public School , is a private , all boys , boarding school for , 7th to 12th grade students , located in Abbottabad , Pakistan .''']}
        self.classifier = NaiveBayesClassifier(self.examples)

    def test_create_vocabulary(self):
        self.classifier.vocabulary.should.contain('private')

    def test_vocabulary_size(self):
        self.classifier.vocabulary_size.should.eql(28)

    def test_subset_of_documents_with_target_value(self):
        len(self.classifier.get_documents_with_target_value('university')).should.eql(1)

    def test_text_of_documents(self):
        documents = self.classifier.get_documents_with_target_value('university')
        self.classifier.get_text(documents).should.contain('private')

    def test_text_distinct_words(self):
        documents = self.classifier.get_documents_with_target_value('university')
        text = self.classifier.get_text(documents)
        self.classifier.get_text_diff_words_count(text).should.eql(28)

    def test_example_count(self):
        self.classifier.get_example_count().should.eql(1)

    def test_occurrences_of_word_count(self):
        documents = self.classifier.get_documents_with_target_value('university')
        text = self.classifier.get_text(documents)
        self.classifier.occurrences_count(',', text).should.eql(7)

    def test_learn(self):
        self.classifier.learn()

    def test_word_positions_in_doc(self):
        documents = self.classifier.get_documents_with_target_value('university')
        len(self.classifier.word_positions(documents[0])).should.eql(38)

    def test_classify(self):
        self.classifier.learn()
        self.classifier.classify(self.examples['university'][0]).should.eql('university')

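# The .should assertions above come from the sure library, which attaches
# these helpers to every object once imported. Assuming the usual imports at
# the top of the test module, the suite runs with the standard unittest
# entry point:
import unittest

import sure  # noqa: F401 -- imported for its .should side effects

if __name__ == '__main__':
    unittest.main()
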
X_train = []
y_train = []
X_test = []
y_test = []
for i in range(len(train)):
    y_train.append(train[i][0])
    X_train.append(train[i][1:])
for i in range(len(test)):
    y_test.append(test[i][0])
    X_test.append(test[i][1:])

# Instantiate the Naive Bayes classifier model made from scratch
model = NaiveBayesClassifier()
# Fit the model
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Print the accuracy of the model
print("NaiveBayesClassifier accuracy: {0:.3f}".format(model.accuracy(y_test, y_pred)))

# Instantiate the Gaussian Naive Bayes classifier model from Scikit-learn
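# The comment above announces a Scikit-learn comparison that is cut off in
# this excerpt. A minimal sketch, reusing the same X/y lists and the standard
# sklearn.naive_bayes.GaussianNB API:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

sk_model = GaussianNB()
sk_model.fit(X_train, y_train)
sk_pred = sk_model.predict(X_test)
print("GaussianNB accuracy: {0:.3f}".format(accuracy_score(y_test, sk_pred)))
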
time_period = "daily"
keywords = "hillary-trump"
training_required = 0
# log('Failed to load configurations; error in function loadConfig; exiting...', 'error')
# print('FAILED to load configurations')

# check if the model dump file is missing
#if training_required == 0:
#    if not os.path.exists(classifier):
#        training_required = 1

# train a model if required
if training_required:
    tweets = []
    nb = NaiveBayesClassifier(tweets, keywords, time_period, training_data,
                              classifier, training_required)

# create the HBase tweets table if missing
print("Checking database tables...")
try:
    connection = happybase.Connection('localhost')
    connection.create_table('tweets',
                            {'keyword': dict(max_versions=10),
                             'sentiment': dict(max_versions=10),
                             'tweet': dict(max_versions=10)})
    print("...OK")
except Exception:
    print("Table already exists")
    # log('Table already exists; skipping creation.')

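# For context, a row could later be written to the three column families
# created above using happybase's standard Table.put API; the row key and
# values here are purely illustrative, not taken from the original script:
table = connection.table('tweets')
table.put(b'tweet-0001', {b'keyword:term': b'hillary-trump',
                          b'sentiment:label': b'positive',
                          b'tweet:text': b'example tweet body'})
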
# glob.glob returns every filename that matches the wildcarded path
for fn in glob.glob(path):
    is_spam = "ham" not in fn
    with open(fn, "r") as file:
        for line in file:
            if line.startswith("Subject:"):
                # remove the leading "Subject: " and keep what's left
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))

random.seed(0)
train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)

# triplets (subject, actual is_spam, predicted spam probability)
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

# assume that spam_probability > 0.5 corresponds to spam prediction
# and count the combinations of (actual is_spam, predicted is_spam)
counts = Counter((is_spam, spam_probability > 0.5)
                 for _, is_spam, spam_probability in classified)

# sort by spam_probability from smallest to largest
classified.sort(key=lambda row: row[2])

# the highest predicted spam probabilities among the non-spams
# (filter returns a lazy iterator in Python 3, so materialize it before slicing)
spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]

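# The natural counterpart (the lowest predicted spam probabilities among the
# actual spams) is not shown in this excerpt; a sketch consistent with the
# ascending sort order above:
hammiest_spams = list(filter(lambda row: row[1], classified))[:5]
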
# The opening of this helper is missing from the excerpt; the header and the
# unique_data line below are reconstructed (assumed) from how it is called.
def boolize_data(array):
    unique_data = sorted(set(array))  # assumed: distinct values, ordered
    mid_point = len(unique_data) // 2
    high = unique_data[mid_point:]
    low = unique_data[:mid_point]
    fixed_array = []
    for element in array:
        if element in low:
            fixed_array.append(False)
        elif element in high:
            fixed_array.append(True)
    return fixed_array

naive_bayes_classifier_random_50_dataframe_1 = DataFrame.from_array(
    [boolize_data(row) for row in dataframe.to_array()],
    dataframe.columns)
naive_bayes_classifier = NaiveBayesClassifier(
    dataframe=naive_bayes_classifier_random_50_dataframe_1,
    dependent_variable='Survived')
naive_bayes_classifier_classifications = get_classifications(
    naive_bayes_classifier, testing_dataframe)
for row in naive_bayes_classifier_classifications:
    print(row)
print('\n')

max_depth_5_decision_tree = DecisionTree(
    dataframe=dataframe,
    class_name='Survived',
    features=[column for column in dataframe.columns
              if column != 'Survived'],
    max_depth=5)
max_depth_5_decision_tree.fit()
max_depth_5_decision_tree_classifications = get_classifications(
    max_depth_5_decision_tree, testing_dataframe)

import pandas as pd

from naive_bayes_classifier import NaiveBayesClassifier

if __name__ == '__main__':
    df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
    df = df.sample(frac=1.0)
    spam_data, legit_data = [], []
    for _, row in df.iterrows():
        if row['v1'] == 'spam':
            spam_data.append(row['v2'])
        else:
            legit_data.append(row['v2'])
    NB_classifier = NaiveBayesClassifier()
    spam_train = spam_data[:int(len(spam_data) * 2 / 3)]
    spam_test = spam_data[int(len(spam_data) * 2 / 3):]
    legit_train = legit_data[:int(len(legit_data) * 2 / 3)]
    legit_test = legit_data[int(len(legit_data) * 2 / 3):]
    NB_classifier.train(spam_train, legit_train)
    spam_accuracy = 0
    legit_accuracy = 0
    for text in spam_test:
        prediction = NB_classifier.predict(text)
        spam_accuracy += prediction
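    # The excerpt cuts off here. A sketch of the rest of the evaluation,
    # assuming predict() returns 1 for spam and 0 for legitimate mail (which
    # is what the running spam_accuracy sum above implies):
    for text in legit_test:
        prediction = NB_classifier.predict(text)
        legit_accuracy += 1 - prediction
    print('spam accuracy: {:.3f}'.format(spam_accuracy / len(spam_test)))
    print('legit accuracy: {:.3f}'.format(legit_accuracy / len(legit_test)))
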
df = DataFrame.from_array(
    [
        [False, False, False],
        [True, True, True],
        [True, True, True],
        [False, False, False],
        [False, True, False],
        [True, True, True],
        [True, False, False],
        [False, True, False],
        [True, False, True],
        [False, True, False]
    ],
    columns=['errors', 'links', 'scam']
)
naive_bayes = NaiveBayesClassifier(df, dependent_variable='scam')

print("Testing Probabilities")
assert naive_bayes.probability('scam', True) == 0.4
assert naive_bayes.probability('scam', False) == 0.6
print("passed")

print("Testing Conditional Probabilities")
assert naive_bayes.conditional_probability(('errors', True), given=('scam', True)) == 1.0
assert naive_bayes.conditional_probability(('links', False), given=('scam', True)) == 0.25
assert naive_bayes.conditional_probability(('errors', True), given=('scam', False)) == 0.16666666666666666
assert naive_bayes.conditional_probability(('links', False), given=('scam', False)) == 0.5
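# Working through one posterior by hand from the numbers asserted above: for
# an observation with errors=True and links=False, the unnormalized naive
# Bayes scores are
#
#   P(scam=True)  * P(errors=True | scam=True)  * P(links=False | scam=True)
#     = 0.4 * 1.0 * 0.25  = 0.10
#   P(scam=False) * P(errors=True | scam=False) * P(links=False | scam=False)
#     = 0.6 * (1/6) * 0.5 = 0.05
#
# so scam=True is twice as likely (0.10 / 0.15 = 2/3 after normalizing).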