def predict(self, s):
    """Classify the text *s* with the trained two-class Naive Bayes model.

    The text is stripped of every character that is not an ASCII letter or
    a space, passed through ``collapse_terms`` of a fresh
    ``prepro.preprocess`` object, tokenized, lower-cased, and reduced to the
    tokens found in ``self.vocab``.

    Scores are accumulated as sums of logarithms rather than as a product
    of raw probabilities: a product of many values below 1 underflows to
    0.0 for long inputs, which would make both classes compare equal and
    force the result to 0 regardless of the evidence.

    Returns:
        1 when the class-2 score is strictly greater than the class-1
        score, otherwise 0 (ties included).

    Raises:
        ZeroDivisionError: if ``self.count_c1 + self.count_c2`` is zero.
    """
    import math  # local import so the block does not depend on file-level imports

    def _log(value):
        # A zero probability maps to -inf, mirroring how a zero factor
        # zeroes out the whole product in linear space.
        return math.log(value) if value > 0 else float('-inf')

    obb = prepro.preprocess()
    s = re.sub('[^A-Za-z ]', '', s)
    s = obb.collapse_terms(s)
    lowered = (w.lower() for w in word_tokenize(s))
    tokens = [w for w in lowered if w in self.vocab]

    # Class priors, derived from the stored per-class counts.
    total = float(self.count_c1 + self.count_c2)
    score_c1 = _log(self.count_c1 / total)
    score_c2 = _log(self.count_c2 / total)

    for token in tokens:
        score_c1 += _log(self.prob_c1[token])
        score_c2 += _log(self.prob_c2[token])

    return 1 if score_c1 < score_c2 else 0
def on_post(self, req, resp):
    """Score every text in the request and return the per-label scores.

    The request media must be an object with a ``texts`` list. Each entry
    is read with ``.get('id')`` and ``.get('text')``; the text is run
    through ``preprocess`` and handed to ``self.classifier.predict``
    together with ``self.k``.

    The response media is a list of ``{'id': ..., 'scores': {...}}``
    objects, one per input entry, in input order.

    Raises:
        falcon.HTTPBadRequest: if the body is not an object or ``texts``
            is missing or not a list.
        falcon.HTTPInternalServerError: if prediction fails for any entry.
    """
    media = req.media
    items = media.get('texts') if isinstance(media, dict) else None
    if not isinstance(items, list):
        # Malformed client input is a client error; without this check the
        # loop below would fail with an unhandled TypeError/AttributeError.
        raise falcon.HTTPBadRequest(
            title='Invalid request',
            description='The request body must be an object with a "texts" list.')

    results = []
    for item in items:
        tid = item.get('id')
        text = item.get('text')
        try:
            prediction = self.classifier.predict(preprocess(text), self.k)
        except Exception as e:
            self.logger.error(
                'Error occurred during prediction: {}'.format(e))
            raise falcon.HTTPInternalServerError(
                title='Internal server error',
                description=
                'The service was unavailable. Please try again later.')
        # Drop the first 9 characters of every label.
        # NOTE(review): presumably a fixed prefix such as '__label__' --
        # confirm against the classifier's label format.
        scores = {label[9:]: score for label, score in prediction}
        results.append({'id': tid, 'scores': scores})
    resp.media = results
from prepro import preprocess
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize

# Global data: the flattened result of preprocess() is unpacked into exactly
# three names.
# NOTE(review): assumes preprocess() returns (train, test, features) in that
# order -- confirm against prepro.
data_train, data_test, features = np.array(preprocess()).ravel()
# Prior probability of spam and ham (module-level defaults).
pspam = 0.0
pham = 0.0
# Per-feature counter: every feature maps to a two-element list starting at
# [0, 0].
# NOTE(review): which slot is spam and which is ham is not visible here.
counter_words = dict((key, [0, 0]) for key in features)
# Per-feature probability tables for spam and ham, initialised to 0.0.
prob_spam = dict((key, 0.0) for key in features)
prob_ham = dict((key, 0.0) for key in features)


def prior_prob():
    """Count truthy and falsy 'CLASS' entries of data_train and derive the spam ratio."""
    total = data_train.shape[0]
    spamcounter = 0
    hamcounter = 0
    # A truthy CLASS value is counted as spam, anything else as ham.
    for value in data_train['CLASS']:
        if value:
            spamcounter += 1
        else:
            hamcounter += 1
    # NOTE(review): no `global pspam` declaration precedes this assignment, so
    # it binds a function-local name and the module-level `pspam` keeps its
    # 0.0 default -- confirm whether that is intended.
    pspam = spamcounter / float(total)
print("Confusion matrix : ") mat = confusion_matrix(self.test_data['CLASS'], op) acc = accuracy_score(self.test_data['CLASS'], op) fs = f1_score(self.test_data['CLASS'], op) print(mat) print("Accuracy : " + str(acc * 100) + "%") print("F-Score : " + str(fs)) print("MCC : " + str(matthews_corrcoef(self.test_data['CLASS'], op))) def run(self): op = [] for idx, d in self.data.iterrows(): op.append(self.predict(d['CONTENT'])) print("Result on Whole Dataset : ") print("Confusion matrix : ") mat = confusion_matrix(self.data['CLASS'], op) acc = accuracy_score(self.data['CLASS'], op) fs = f1_score(self.data['CLASS'], op) print(mat) print("Accuracy : " + str(acc * 100) + "%") print("F-Score : " + str(fs)) print("MCC : " + str(matthews_corrcoef(self.data['CLASS'], op))) if __name__ == "__main__": ob1 = prepro.preprocess(['data'], ['CLASS', 'CONTENT']) data = ob1.read_and_clean() ob2 = NBC(data) ob2.train() ob2.test_run()
from prepro import preprocess
from NB import NBC

if __name__ == "__main__":
    # Read and clean the dataset through the preprocess helper.
    cleaned = preprocess('data').read_and_clean()

    # Build the Naive Bayes classifier with split_size=0.2, then train it and
    # print both the test-split report and the whole-dataset report.
    model = NBC(cleaned, split_size=0.2)
    model.train()
    model.test_run()
    model.run()
from nltk.stem.porter import PorterStemmer
import prepro as p
import out as o

file_name = input(
    "Please enter the name of Queries file, you can also input the address of file:"
)

# Tokenize the query data: query_data maps a 1-based line number to whatever
# p.preprocess returns for that line.
query_data = dict()
count = 1
# The context manager guarantees the file is closed, even if preprocessing
# raises part-way through.
with open(file_name, "r") as query:
    for query_line in query:
        k = p.preprocess(query_line)
        query_data[count] = k
        count += 1

# Term frequency per query: qindex[term][query_number] is the number of times
# the term occurs in that query.
qindex = dict()
for key, value in query_data.items():
    for each in value:
        occurrences = qindex.setdefault(each, {})
        occurrences[key] = occurrences.get(key, 0) + 1