class Classifier:
    """Text classifier backed by a Naive Bayes model.

    The model is trained once, at construction time, from
    ``./data/train.csv``.
    """

    def __init__(self):
        # ``with`` releases the handle even if training raises.
        with open("./data/train.csv") as fp:
            self.cl = NaiveBayesClassifier(fp, format="csv")

    def test(self):
        """Classify a fixed smoke-test sentence and return its label."""
        return self.cl.classify("This is a test sentence")

    def classify(self, text):
        """Return the label the model assigns to ``text``."""
        return self.cl.classify(text)

    def n_classify(self, text):
        """Return a JSON string listing every label with probability >= 0.10.

        The payload has the shape ``{"sentiments": [{label: prob}, ...]}``.
        """
        dist = self.cl.prob_classify(text)
        sentiments = []
        for label in dist.samples():
            prob = dist.prob(label)  # look the probability up once per label
            if prob >= .10:
                sentiments.append({label: prob})
        return json.dumps({"sentiments": sentiments})

    def accuracy(self):
        """Return a JSON string with the accuracy on the train and test CSVs."""
        with open('./data/train.csv') as fp:
            train_accuracy = self.cl.accuracy(fp, format="csv")
        with open('./data/test.csv') as fp:
            test_accuracy = self.cl.accuracy(fp, format="csv")
        return json.dumps({"train_accuracy": train_accuracy,
                           "test_accuracy": test_accuracy})

    def labels(self):
        """Return a JSON string with the labels known to the model."""
        return json.dumps({"labels": self.cl.labels()})
class Model(object):
    """Wrapper around an incrementally trained Naive Bayes classifier."""

    def __init__(self, name='Guess', config=None):
        """Create an untrained model.

        Args:
            name: Display name stored on the instance.
            config: Optional configuration dict; a new empty dict is used
                when omitted.
        """
        self.name = name
        # A ``{}`` default would be one dict shared by every instance.
        self.config = {} if config is None else config
        self.clf = NaiveBayesClassifier([])

    def train(self, training_data):
        """Update the classifier with ``training_data``.

        Each example is read with ``.get('text')`` and ``.get('label')``;
        a missing key yields ``None`` for that field.
        """
        safe_training = [
            (example.get('text'), example.get('label'))
            for example in training_data
        ]
        self.clf.update(safe_training)

    def evaluate(self, text):
        """Return ``(label, probability)`` for the most likely label."""
        # One pass: the distribution's maximum is the predicted label.
        prob_dist = self.clf.prob_classify(text)
        label = prob_dist.max()
        return label, prob_dist.prob(label)

    def get_classes(self):
        """Return the labels known to the classifier."""
        return self.clf.labels()

    def save(self):
        """Not implemented yet."""
        pass

    def load(self):
        """Not implemented yet."""
        pass
def PLN_CSV(text):
    """Score ``text`` as positive or negative with a Naive Bayes model.

    The model is trained on ``analysistext/pln/feelings.csv`` (``;``
    separated, no header row).

    Returns:
        A one-element list holding a dict with ``dist_prob_max``,
        ``dist_prob_positivo`` and ``dist_prob_negativo``; an empty list
        when the training data cannot be loaded.
    """
    feelings_list = []
    try:
        feelings = pd.read_csv('analysistext/pln/feelings.csv', sep=';', header=None)
        clf = NaiveBayesClassifier(feelings.values, format="csv")
    except (OSError, ValueError):
        # Missing, unreadable or unparseable file: there is no model to
        # query, so report it and return the empty result instead of
        # failing later on a ``None`` classifier.
        print("Não conseguiu abrir o arquivo ou ele não existe")
        return feelings_list

    # Probability breakdown for the text.
    dist_prob = clf.prob_classify(text)
    feelings_list.append({
        "dist_prob_max": dist_prob.max(),
        "dist_prob_positivo": dist_prob.prob('positivo'),
        "dist_prob_negativo": dist_prob.prob('negativo'),
    })
    return feelings_list
class HelpLabeler(object):
    """Label help requests with a Naive Bayes model trained from ``HELP_DATA``.

    Every text that is looked up and not yet in the data file is stored
    there with the label ``'unknown'``.
    """

    HELP_DATA = 'help_data.json'

    def __init__(self):
        with open(self.HELP_DATA, 'r') as fp:
            self.c = NaiveBayesClassifier(fp, format="json")
        # Keep a text -> label map so new texts can be appended later.
        with open(self.HELP_DATA, 'r') as fp:
            self.help_json = {i['text']: i['label'] for i in json.load(fp)}

    def get_label(self, text, lower_placeholders=None):
        """Return the label for ``text``, or ``None`` below 0.7 confidence.

        ``lower_placeholders`` is accepted for compatibility and not used.
        """
        text = text.lower()
        self.save_help(text)
        prob_dist = self.c.prob_classify(text)
        label = prob_dist.max()
        prob = round(prob_dist.prob(label), 2)
        return label if prob > 0.7 else None

    def save_help(self, lower_text):
        """Record ``lower_text`` as ``'unknown'`` if it is not known yet."""
        if lower_text in self.help_json:
            # Nothing changed, so there is no reason to rewrite the file.
            return
        self.help_json[lower_text] = 'unknown'
        with open(self.HELP_DATA, 'w') as fp:
            json.dump([{'text': k, 'label': v}
                       for k, v in self.help_json.items()], fp, indent=4)
class NBClassifier:
    """Naive Bayes classifier whose training set is persisted as JSON."""

    def __init__(self, train_data_file):
        """Train the classifier from the JSON file ``train_data_file``."""
        self._train_data_file = train_data_file
        with open(self._train_data_file, 'r') as f:
            self._cl = NaiveBayesClassifier(f, format="json")

    def update_train_set(self, sentence):
        """Add ``sentence`` to the training set and persist it.

        ``sentence`` must expose ``str_sentence`` and ``label``.
        """
        new_data = [(sentence.str_sentence, sentence.label)]
        self._cl.update(new_data)
        self._save_data_to_file()

    def _save_data_to_file(self):
        """Write the whole training set back to the training file."""
        import json

        records = [
            {"text": str(el[0]), "label": str(el[1])}
            for el in self._cl.train_set
        ]
        # json.dump escapes quotes and backslashes in the text; 'w'
        # truncates so no stale bytes are left after the new content.
        with open(self._train_data_file, 'w') as f:
            json.dump(records, f)

    def prob_classify(self, sentence):
        """Return the most probable label for ``sentence``."""
        return self._cl.prob_classify(sentence).max()
def enginemongo(text):
    """Answer ``text`` with a classifier trained on ``db.trainingset``.

    Each stored document contributes a ``(question, answer)`` training
    pair. Diagnostic output is printed along the way.

    Returns:
        ``{"answer_key": ..., "answer_prob": ...}`` for the answer with the
        highest probability after rounding to two decimals.
    """
    from textblob.classifiers import NaiveBayesClassifier

    tsarr = [(doc["question"], doc["answer"]) for doc in db.trainingset.find()]
    print(tsarr)

    cl = NaiveBayesClassifier(tsarr)
    prob_dist = cl.prob_classify(text)
    print("TEST:", text, " ", prob_dist, " ", prob_dist.max())

    maxprob, maxanswer = 0, ""
    # NOTE(review): the per-answer print is assumed to run for every sample,
    # not only when a new maximum is found - confirm against the original.
    for answer in prob_dist.samples():
        rounded = round(prob_dist.prob(answer), 2)
        if rounded > maxprob:
            maxprob, maxanswer = rounded, answer
        print(answer, ":", rounded)

    print(cl.show_informative_features())
    print("RISPOSTA:", maxanswer, " --- ", maxprob)
    print(cl.extract_features(text))
    print("---------------------------------------")
    return {"answer_key": maxanswer, "answer_prob": maxprob}
class HelpLabeler(object):
    """Classify help texts using the labelled examples in ``HELP_DATA``.

    Texts seen for the first time are appended to the data file with the
    label ``'unknown'``.
    """

    HELP_DATA = 'help_data.json'

    def __init__(self):
        with open(self.HELP_DATA, 'r') as fp:
            self.c = NaiveBayesClassifier(fp, format="json")
        # Second pass over the file: build the text -> label lookup.
        with open(self.HELP_DATA, 'r') as fp:
            self.help_json = {}
            for entry in json.load(fp):
                self.help_json[entry['text']] = entry['label']

    def get_label(self, text, lower_placeholders=None):
        """Return the predicted label, or ``None`` when confidence <= 0.7.

        ``lower_placeholders`` is kept for compatibility; it is not used.
        """
        text = text.lower()
        self.save_help(text)
        prob_dist = self.c.prob_classify(text)
        label = prob_dist.max()
        if round(prob_dist.prob(label), 2) > 0.7:
            return label
        return None

    def save_help(self, lower_text):
        """Store ``lower_text`` with label ``'unknown'`` when it is new."""
        if lower_text not in self.help_json:
            self.help_json[lower_text] = 'unknown'
            # Only a new entry changes the data, so only then rewrite it.
            with open(self.HELP_DATA, 'w') as fp:
                json.dump([{
                    'text': k,
                    'label': v
                } for k, v in self.help_json.items()], fp, indent=4)
def classifier(something):
    """Classify each sentence of ``something`` with a model from training.csv.

    Every sentence and its label are printed.

    Returns:
        The label of the last sentence, or ``None`` when the text has no
        sentences.
    """
    with open("training.csv") as csvfile:
        # Each CSV row is a list and is used as one training example.
        train = list(csv.reader(csvfile))

    cl = NaiveBayesClassifier(train)
    blob = TextBlob(something, classifier=cl)

    # NOTE(review): the return is assumed to sit after the loop (label of
    # the last sentence) - confirm against the original indentation.
    label = None
    for s in blob.sentences:
        label = s.classify()  # classify once, reuse for print and return
        print("\n\n\n" + str(s))
        print("\n" + str(label))
    return label
def getresult():
    """Classify the JSON body of a POST request and return the top label.

    The classifier is trained from ``pt_1.csv`` on every call. The
    probabilities for the labels ``"Ex"`` and ``"In"`` are printed.
    """
    # NOTE(review): the whole body is assumed to be nested under the POST
    # check, so other methods return None - confirm.
    if request.method != "POST":
        return None

    print("########", request.args)
    payload = json.loads(request.data.decode('utf-8'))
    print(payload)

    with open("pt_1.csv", "r", encoding="utf8") as fp:
        c1 = NaiveBayesClassifier(fp)

    prob_list = c1.prob_classify(payload)
    print(prob_list.max())
    ab = prob_list.max()
    for tag, name in (("Ex ", "Ex"), ("In", "In")):
        print(tag, round(prob_list.prob(name), 3))
    return str(ab)
def traintestclassifier(train_dataset, test_dataset, classifystring):
    """Train on ``train_dataset`` and print a short report.

    Prints the probabilities of ``classifystring`` for the labels ``"dep"``
    and ``"happy"``, then the accuracy on ``test_dataset``.
    """
    model = NaiveBayesClassifier(train_dataset)
    distribution = model.prob_classify(classifystring)

    report = (
        ("dep", "is the probability of this sentence being depressing"),
        ("happy", "is the probability of this sentence being happy"),
    )
    for label, message in report:
        print(round(distribution.prob(label), 2), message)
    print(model.accuracy(test_dataset), "Is the accuracy")
def classify(text):
    """Return the most likely label for ``text``, or ``None`` when unsure.

    A label is only returned when its probability is above 0.5. The model
    is built from the module-level ``train`` data and ``extract`` function.
    """
    model = NaiveBayesClassifier(train, feature_extractor=extract)
    distribution = model.prob_classify(text)
    best = distribution.max()
    return best if distribution.prob(best) > 0.5 else None
def server_annotation(self, article: str, model: str) -> List[Tuple[str]]:
    """Split ``article`` into sentences and annotate each with a sentiment.

    The classifier is trained on the data returned by ``self._load(model)``.
    Each result is ``(sentence, tag, probability)`` where ``tag`` is
    ``"pos"`` for class ``"1"``, ``"neu"`` for class ``"2"`` and ``"neg"``
    otherwise (reported with the probability of class ``"3"``).
    """
    # init: load saved model
    _, annotated_data = self._load(model)
    classifier = NaiveBayesClassifier(
        [(datum[0], datum[1]) for datum in annotated_data]
    )

    # 1: paragraphing
    paragraphs = self._paragrapher(article)

    # 2: pre-cleansing (paragraphs that come back empty are dropped)
    paragraphs = [self._pre_cleanse_text(paragraph) for paragraph in paragraphs]
    paragraphs = [paragraph for paragraph in paragraphs if paragraph]

    # 3: segmenting
    sentences = []
    for paragraph in paragraphs:
        sentences.extend(self._sentence_segmentor(paragraph))

    # 4: post-cleansing (sentences that come back empty are dropped)
    sentences = [self._post_cleanse_text(sentence) for sentence in sentences]
    sentences = [sentence for sentence in sentences if sentence]

    # 5: lemmatising
    sentences = [self._lemmatise_sentence(sentence) for sentence in sentences]

    # 6: annotating
    tags = {"1": "pos", "2": "neu"}
    annotated = []
    for sentence in sentences:
        classification = classifier.prob_classify(sentence)
        top = classification.max()
        if top in tags:
            tag, key = tags[top], top
        else:
            tag, key = "neg", "3"
        annotated.append((sentence, tag, round(classification.prob(key), 2)))
    return annotated
def run_test(train, test, name):
    """Train on ``train``, evaluate on ``test`` and record the results.

    Prints the accuracy and confusion matrix, appends both to
    ``test_results.txt`` and saves a confusion-matrix plot to
    ``plots/<name>.pdf``.

    Args:
        train: Iterable of ``(text, label)`` training pairs.
        test: Sequence of ``(text, label)`` evaluation pairs.
        name: Title used in the results file, plot title and file name.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Training...")
    cll = NaiveBayesClassifier(train)
    print("Done training\n")

    accuracy = cll.accuracy(test)
    print("Accuracy: " + str(accuracy))

    # get matching lists of predicted and true labels
    pred_labels = []
    true_labels = []
    for obj in test:
        pred_labels.append(cll.prob_classify(obj[0]).max())
        true_labels.append(obj[1])

    # transform our labels to numbers
    labels = cll.labels()
    label_num = {label: index for index, label in enumerate(labels)}

    # match our predicted and true labels with the number representations
    true_label_nums = [label_num[label] for label in true_labels]
    pred_label_nums = [label_num[label] for label in pred_labels]

    cm = confusion_matrix(true_label_nums, pred_label_nums)
    print(cm)
    print("\n")

    with open("test_results.txt", "a") as tr:
        tr.write(str(name) + "\n")
        tr.write(str(accuracy) + "\n")
        tr.write(str(cm))
        tr.write("\n\n")

    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    plt.title("Confusion Matrix For "+name)
    fig.colorbar(cax)
    ax.set_xticklabels(['']+labels)
    ax.set_yticklabels(['']+labels)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.savefig('plots/'+name+'.pdf', bbox_inches='tight')
    # Release the figure; pyplot otherwise keeps it alive across calls.
    plt.close(fig)
def classify_v1(text):
    """Clean ``text`` and classify it with a model from train_dataset.csv.

    Args:
        text: Raw input string.

    Returns:
        The predicted label, or ``None`` when cleaning leaves nothing to
        classify.
    """
    # basic_cleanning takes a string; per the original comment it returns
    # a list.
    text = bc.basic_cleanning(text)
    if text == []:
        # Nothing to classify: return explicitly instead of reaching the
        # final return with no result bound.
        return None

    with open('train_dataset.csv') as csv_file:
        cl = NaiveBayesClassifier(csv_file, format="csv")
    return cl.classify(text)
def main():
    """Train and inspect a sentiment classifier on hellopeter_labelled.csv.

    Column 8 of each row is the text and column 13 the sentiment, which is
    mapped onto ``pos`` / ``neu`` / ``neg``; rows with any other sentiment
    are skipped. The first 1000 samples train the model, the rest are
    printed and classified one by one.
    """
    sentiment_to_label = {
        'strongly positive': 'pos',
        'positive': 'pos',
        'neutral': 'neu',
        'negative': 'neg',
        'strongly negative': 'neg',
    }

    data = []
    # Binary mode is what the Python 2 csv module expects.
    # NOTE(review): on Python 3 this needs text mode with newline=''.
    with open('hellopeter_labelled.csv', 'rb') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            label = sentiment_to_label.get(row[13])
            if label is not None:
                data.append((row[8], label))

    train = data[:1000]
    # Start at 1000: slicing from 1001 left sample 1000 in neither set.
    test = data[1000:]

    # Single-argument print(...) behaves the same on Python 2 and 3.
    for innf in test:
        print(innf)

    cl = NaiveBayesClassifier(train)
    for tnew in test:
        print('%%%%%%%')
        print(' ')
        print(tnew[0])
        print(tnew[1])
        print('%%%%%%%')
        print('#######')
        prob_class = cl.prob_classify(tnew[0])
        print('----max prob---')
        print(prob_class.max())
        print('-----+ve-----')
        print(prob_class.prob("pos"))
        print('-----neutral-----')
        print(prob_class.prob("neu"))
        print('------ve-----')
        print(prob_class.prob("neg"))

    # NOTE(review): the accuracy is computed but never shown or returned.
    cl.accuracy(test)
def main():
    """Build a sentiment classifier from hellopeter_labelled.csv and dump it.

    Text comes from column 8 and the sentiment from column 13. The five
    sentiment grades are folded into ``pos``, ``neu`` and ``neg``; rows
    with any other value are ignored. Samples 0-999 are used for training
    and every later sample for the printed evaluation.
    """
    data = []
    # Binary mode is what the Python 2 csv module expects.
    # NOTE(review): on Python 3 this needs text mode with newline=''.
    with open('hellopeter_labelled.csv', 'rb') as csvfile:
        rows = list(csv.reader(csvfile, delimiter=','))

    for row in rows:
        sentiment = row[13]
        if sentiment in ('strongly positive', 'positive'):
            data.append((row[8], 'pos'))
        elif sentiment == 'neutral':
            data.append((row[8], 'neu'))
        elif sentiment in ('negative', 'strongly negative'):
            data.append((row[8], 'neg'))

    train = data[:1000]
    # The test split starts right where training ends; slicing from 1001
    # silently dropped sample 1000.
    test = data[1000:]

    # Single-argument print(...) behaves the same on Python 2 and 3.
    for innf in test:
        print(innf)

    cl = NaiveBayesClassifier(train)
    for tnew in test:
        print('%%%%%%%')
        print(' ')
        print(tnew[0])
        print(tnew[1])
        print('%%%%%%%')
        print('#######')
        prob_class = cl.prob_classify(tnew[0])
        print('----max prob---')
        print(prob_class.max())
        print('-----+ve-----')
        print(prob_class.prob("pos"))
        print('-----neutral-----')
        print(prob_class.prob("neu"))
        print('------ve-----')
        print(prob_class.prob("neg"))

    # NOTE(review): the accuracy is computed but never shown or returned.
    cl.accuracy(test)
def ClassifyDirectory(Dictionary_Excel_File, News_Text_Directory):
    """Classify all ``.txt`` files of a directory as one merged document.

    Args:
        Dictionary_Excel_File: Excel file whose first sheet has the columns
            ``FWD`` (text) and ``Result`` (number) in its first two columns.
            Rows are read until an empty ``FWD`` or a NaN ``Result``; a
            ``Result`` above 0 is labelled ``Positive``, otherwise
            ``Negative``.
        News_Text_Directory: Directory holding the ``.txt`` files.

    Prints the probabilities of both labels and the final result; returns
    ``None``.
    """
    # Merge all text files into one string called FILE
    chunks = []
    for file_name in os.listdir(News_Text_Directory):
        if file_name.endswith(".txt"):
            # os.path.join also works when the directory has no trailing
            # separator.
            with open(os.path.join(News_Text_Directory, file_name), 'rt') as news:
                chunks.append(news.read())
    FILE = "".join(chunks)

    # clean text data from punctuation
    to_remove = "0123456789;.:?,#+%*/\t[]><'" + '"'
    table = {ord(char): ' ' for char in to_remove}
    FILE = FILE.translate(table)
    FILE = re.sub(' +', ' ', FILE)
    FILE = re.sub('\n ', '\n', FILE)
    FILE = re.sub('\n+', '\n', FILE)

    # extract the 'EXCEL' file data into a training data
    EXCEL = pd.read_excel(Dictionary_Excel_File, sheet_name=0, usecols=[0, 1])
    train = []
    for fwd, result in zip(EXCEL['FWD'], EXCEL['Result']):
        if fwd == "" or math.isnan(result):
            break
        train.append((fwd, "Positive" if result > 0.0 else "Negative"))

    # train the classifier with training data
    CL = NaiveBayesClassifier(train)

    # Classify the merged text file
    Result_Probability = CL.prob_classify(FILE)
    Result = Result_Probability.max()
    print("Positive Probability: " + str(
        round(Result_Probability.prob("Positive") * 100, 2)) + "%, Negative Probability: " + str(
        round(Result_Probability.prob("Negative") * 100, 2)) + "%")
    print("Final Result: ", Result)
    return
class InputLabeler(object):
    """Label input texts with a Naive Bayes model trained from LABELS_DATA."""

    LABELS_DATA = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                               'labels_data.json')

    def __init__(self):
        with open(self.LABELS_DATA, 'r') as fp:
            self.c = NaiveBayesClassifier(fp, format="json")
        # Keep a text -> label map next to the trained classifier.
        with open(self.LABELS_DATA, 'r') as fp:
            self.labels_json = {i['text']: i['label'] for i in json.load(fp)}

    def get_num_labels(self):
        """Return the number of distinct labels."""
        return len(self.get_labels())

    def get_labels(self):
        """Return the set of distinct labels."""
        # A dict view has no .sort() on Python 3, and ordering is
        # irrelevant for a set anyway.
        return set(self.labels_json.values())

    def get_label(self, text):
        """Return the label for ``text``, or ``None`` below 0.7 confidence."""
        text = text.lower()
        prob_dist = self.c.prob_classify(text)
        label = prob_dist.max()
        prob = round(prob_dist.prob(label), 2)
        return label if prob > 0.7 else None

    def save_placeholder(self, text):
        """Record ``text`` with label ``'unknown'`` if it is not known yet."""
        if text in self.labels_json:
            # Nothing changed, so there is no reason to rewrite the file.
            return
        self.labels_json[text] = 'unknown'
        with open(self.LABELS_DATA, 'w') as fp:
            json.dump([{
                'text': k,
                'label': v
            } for k, v in self.labels_json.items()], fp, indent=4)
def engine(text):
    """Classify ``text`` with a model trained from the cached ``train.csv``.

    When ``train.csv`` does not exist it is downloaded from ``url_train``
    first. Diagnostic output (probabilities, informative features,
    extracted features) is printed.

    Returns:
        The predicted label for ``text``.

    Raises:
        requests.HTTPError: If the download answers with an error status.
    """
    from textblob.classifiers import NaiveBayesClassifier

    url_train = "https://"
    file_train = "train.csv"

    if not os.path.isfile(file_train):
        print("Train loaded from Request:", url_train)
        response = requests.get(url_train, stream=True)
        # Fail loudly: an error page must never be stored as training data.
        response.raise_for_status()
        # Download to a side file and move it into place afterwards, so an
        # interrupted transfer is not mistaken for a valid cache next time.
        partial_path = file_train + ".part"
        with open(partial_path, 'wb') as handle:
            for block in response.iter_content(1024):
                handle.write(block)
        os.replace(partial_path, file_train)
        print("Request DONE")
    else:
        print("Train loaded from cache:", file_train)

    with open(file_train, 'r', encoding="utf8") as fp:
        cl = NaiveBayesClassifier(fp)

    prob_dist = cl.prob_classify(text)
    print("TEST:", text, " ", prob_dist, " ", prob_dist.max())
    for a in prob_dist.samples():
        print(a, ":", round(prob_dist.prob(a), 2))
    print(cl.show_informative_features())
    aa = cl.extract_features(text)
    print(aa)
    print("---------------------------------------")
    return cl.classify(text)
class TBSentiment(Model):
    """Sentiment model built on TextBlob's Naive Bayes classifier.

    Starts untrained; feed it data with :meth:`train` before calling
    :meth:`classify`.
    """

    def __init__(self):
        self.cl = NaiveBayesClassifier([])

    def classify(self, comment):
        """Return ``(label, confidence)`` for the most likely label."""
        distribution = self.cl.prob_classify(comment)
        best_label = distribution.max()
        return best_label, distribution.prob(best_label)

    def train(self, data, eval=None, d_print=False):
        """Update the classifier with custom data.

        Args:
            data (:obj:`list` of :obj:`tuple`): Pairs of
                (comment, polarity in ["pos", "neg"]).
            eval: Not used.
            d_print: Not used.
        """
        self.cl.update(data)

    def test(self, data):
        """Placeholder for evaluation on custom data; returns ``None``.

        Args:
            data (:obj:`list` of :obj:`tuple`): Pairs of
                (comment, polarity in ["pos", "neg"]).
        """
        return None
def naive_bayes_classify(data):
    """Train and evaluate a Naive Bayes model on ``data``.

    ``data`` must provide the columns ``text`` and ``type``. Words that
    occur in fewer than five texts are removed. Rows 0-2999 train the
    model and rows 3000-3599 are used to print accuracy and weighted F1.
    """
    class_to_predict = 'type'  # product importance
    records = data[['text', class_to_predict]].to_records(index=False)
    all_data = [tuple(record) for record in records]

    # Document frequency: in how many texts does each word appear?
    text_counts = {}
    for text, _ in all_data:
        for word in set(text.split()):
            text_counts[word] = text_counts.get(word, 0) + 1

    # Keep frequent words only; every kept word is prefixed with a space.
    all_data = [
        (''.join(' ' + word for word in text.split() if text_counts[word] >= 5),
         label)
        for text, label in all_data
    ]
    print('Finished preprocessing!')

    training_corpus = all_data[:3000]
    test_corpus = all_data[3000:3600]

    model = NBC(training_corpus, verbose=True)
    print('Done training!')
    print('Accuracy: ' + str(model.accuracy(test_corpus)))

    y_pred = [model.prob_classify(item[0]).max() for item in test_corpus]
    y_true = [item[1] for item in test_corpus]
    print('F1 score: ' + str(f1_score(y_true, y_pred, average='weighted')))
class InputLabeler(object):
    """Classify input texts using the labelled examples in LABELS_DATA."""

    LABELS_DATA = 'labels_data.json'

    def __init__(self):
        with open(self.LABELS_DATA, 'r') as fp:
            self.c = NaiveBayesClassifier(fp, format="json")
        # Second pass over the file: build the text -> label lookup.
        with open(self.LABELS_DATA, 'r') as fp:
            self.labels_json = {}
            for entry in json.load(fp):
                self.labels_json[entry['text']] = entry['label']

    def get_num_labels(self):
        """Return how many distinct labels exist."""
        return len(self.get_labels())

    def get_labels(self):
        """Return the distinct labels as a set."""
        # Calling .sort() on dict.values() fails on Python 3 and has no
        # effect on the resulting set.
        return set(self.labels_json.values())

    def get_label(self, text):
        """Return the predicted label, or ``None`` when confidence <= 0.7."""
        text = text.lower()
        prob_dist = self.c.prob_classify(text)
        label = prob_dist.max()
        if round(prob_dist.prob(label), 2) > 0.7:
            return label
        return None

    def save_placeholder(self, text):
        """Store ``text`` with label ``'unknown'`` when it is new."""
        if text not in self.labels_json:
            self.labels_json[text] = 'unknown'
            # Only a new entry changes the data, so only then rewrite it.
            with open(self.LABELS_DATA, 'w') as fp:
                json.dump([{'text': k, 'label': v}
                           for k, v in self.labels_json.items()], fp, indent=4)
import data_sets

# Active corpus: the "subte" data set. To switch to the English one, use
# data_sets.en_train / data_sets.en_test for train / test below.
train = data_sets.subte_train
test = data_sets.subte_test

# Sample sentences (English alternatives: "I feel amazing!" and
# "This one's a doozy.").
tx_cl = "El subte esta demorado"
tx_prob = "El subte funciona bien"

# Single-argument print(...) behaves the same on Python 2 and 3.
cl = NaiveBayesClassifier(train)
print(cl.classify(tx_cl))
print(cl.classify("El subte funciona bien"))

prob_dist = cl.prob_classify(tx_prob)
print(prob_dist.max())
print(round(prob_dist.prob("pos"), 2))
print(round(prob_dist.prob("neg"), 2))

# Evaluate on the test split that matches the training corpus; the
# English test set was used here although the model is trained on subte.
print(cl.accuracy(test))
print(cl.show_informative_features(5))

# Using TextBlob
blob = TextBlob("No funca por que hay obras para mejorar la cosa", classifier=cl)
print(blob.sentiment)
print(blob.classify())
blob = TextBlob("El subte funciona normal", classifier=cl)
print(blob.sentiment)
def mainQuery(query):
    """Classify ``query`` as a ``"generic"`` or ``"tech"`` question.

    A Naive Bayes classifier is trained on the built-in example questions
    on every call.

    Returns:
        ``"tech"``, ``"generic"``, or ``None`` if the top label is neither.
    """
    generic_questions = (
        "Let's go",
        "You never wanted to go out with 'me, did you?",
        "Who knows?",
        "What annoys you?",
        "you've heard of him?",
        "What were you doing?",
        "Thank you anyway",
        "No problem",
        "She okay?",
        "Yes, I have a question.",
        "What is your question?",
        "What are your hobbies?",
        "You know how sometimes you just become this 'persona'? And you don't know how to quit?",
        "what's up?",
        "sup people? I see the weather's getting better over there, Ben.",
        "how are you doing?",
        "Hi",
        "Hello",
        "Hey",
        "How's you?",
        "Have you heard the news?",
        "i had the same problem your having so thats my i made my own.",
        "What is your favorite book?",
        "good night",
        "good morning",
        "good afternoon",
        "good evening",
        "So what's your favorite color?",
        "What good stuff?",
        "what's new?",
        "How's life?",
        "That is good to hear",
        "I am doing well, how about you?",
        "I am doing well, how about you?",
        "I'm also good.",
        "What are you then?",
        "What are you working on?",
        "Who are you?",
        "What is it like?",
        "How do you work?",
        "Who is your appointment with?",
        "What languages do you like to use?",
    )
    technical_questions = (
        "Clearpass is extended to IT systems using which API?",
        "Which browsers are supported for ClearPass?",
        "Which virtualization platforms is supported by Clearpass?",
        "name the authentication/authorization sources used by clearpass.",
        "does Clearpass use ipv6 or ipv4 addressing?",
        "how many sessioons can be provided by ClearPass C2000 Hardware Appliance?",
        "how does Admin/Operator access security?",
        "Virtual Appliances are supported on which platforms?",
        "Name the ClearPass Hardware Appliance Ports.",
        "What is the expansion of OCSP?",
        "what are the active Profiling Methods?",
        "What are cookies?",
        "what does dynamic authorisation mean?",
        "Which standard the clearpass Guest is built on?",
        "which protocol is used by the NAS to authenticate the user ?",
        "Which network connectivity is provisioned for Clearpass Guest?",
        "What is NAS?",
        "What are the possible states of a session?",
        "what does dynamic authorisation mean?",
        "Which standard the clearpass Guest is built on?",
        "Which network connectivity is provisioned for Clearpass Guest?",
        "What is the use of airgroup?",
        "What are cookies used for?",
        'Is Windows Server 2008 "Server Core" appropriate for a SQL Server instance?',
        "Is there any list of the network devices supported by clearpass for 802.1x auth",
        "How can I Block my users from installing new virtual machines",
        "Is there any list of medical devices compatible with clearpass ?",
        "what are Good branching and merging tutorials for TortoiseSVN?",
        "how to Add scripting functionality to .NET applications",
        "why is VMWare Server Under Linux Secondary NIC connection",
        "Setting up Continuous Integration with SVN",
        "Does CruiseControl.NET run on IIS 7.0?",
        "what to do when there are users in both Edmonton and Toronto that access the same “Corpnet” Wireless LAN.",
        "how to Powering Off the ClearPass Hardware Appliance?",
        "what are the Supported Hypervisors for clearpass?",
    )

    # Generic examples first, then technical ones; duplicates are kept.
    training_set = (
        [(question, 'generic') for question in generic_questions]
        + [(question, 'tech') for question in technical_questions]
    )
    Qclassifier = NaiveBayesClassifier(training_set)

    top_label = Qclassifier.prob_classify(query).max()
    return top_label if top_label in ("tech", "generic") else None
# Testing accuracy with a testset
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg'),
]
print(cls.accuracy(test))

# Classifying single sentences
for _sentence in ("Their burgers are amazing", "I don't like their pizza."):
    print(cls.classify(_sentence))

# Classifying a whole TextBlob with the same classifier
from textblob import TextBlob
blob = TextBlob(
    "The beer was amazing. "
    "But the hangover was horrible. My boss was not happy.",
    classifier=cls,
)
print(blob.classify())

# Fetching label probabilities
prob_dist = cls.prob_classify("Their burgers are amazing")
print(prob_dist.max())
for _label in ("pos", "neg"):
    print(round(prob_dist.prob(_label), 2))
] test = [('I am still waiting for a call back', 'neg'), ('Im an accountant and I had a question about balancing reports', 'pos'), ('declining everything', 'neg'), ('I have been waiting on hold for 20 minutes', 'neg'), ('This problem is still not resolved', 'neg') ] from textblob.classifiers import NaiveBayesClassifier cl = NaiveBayesClassifier(train) cl.classify("im in offline mode") prob_dist = cl.prob_classify("im in offline mode") prob_dist.max() round(prob_dist.prob("pos"), 2) round(prob_dist.prob("neg"), 2) cl.classify("we are busy with dinner service and need help") prob_dist = cl.prob_classify("we are busy with dinner service and need help") prob_dist.max() round(prob_dist.prob("pos"), 2) round(prob_dist.prob("neg"), 2) polarity=[]
('I feel amazing!', 'pos'), ('Gary is a friend of mine.', 'pos'), ("I can't believe I'm doing this.", 'neg')] print test print train cl = NaiveBayesClassifier(train) # Learning classifier with NaiveBayesClassifier # Classifying Text ( Call the classify(text) method to use the classifier.) test_check = cl.classify("This is an amazing library!") print test_check # You can get the label probability distribution with the prob_classify(text) method. prob_dist = cl.prob_classify("This one's a doozy.") print prob_dist.max() print round(prob_dist.prob("pos"), 2) print round(prob_dist.prob("neg"), 2) print prob_dist.prob("pos") print prob_dist.prob("neg") blob = TextBlob("The beer is good. But the hangover is horrible.", classifier=cl) print blob.classify() # Evaluating Classifiers (To compute the accuracy on our test set, use the accuracy(test_data) method.) print cl.accuracy(test) # Updating Classifiers with New Data (Use the update(new_data) method to update a classifier with new training data.)
('thank you', 'pos'), ('thank you', 'pos'), ('quick question about how to add a user', 'pos'), ('monthly subscription charge question', 'pos')] test = [('I am still waiting for a call back', 'neg'), ('Im an accountant and I had a question about balancing reports', 'pos'), ('declining everything', 'neg'), ('I have been waiting on hold for 20 minutes', 'neg'), ('This problem is still not resolved', 'neg')] from textblob.classifiers import NaiveBayesClassifier cl = NaiveBayesClassifier(train) cl.classify("im in offline mode") prob_dist = cl.prob_classify("im in offline mode") prob_dist.max() round(prob_dist.prob("pos"), 2) round(prob_dist.prob("neg"), 2) cl.classify("we are busy with dinner service and need help") prob_dist = cl.prob_classify("we are busy with dinner service and need help") prob_dist.max() round(prob_dist.prob("pos"), 2) round(prob_dist.prob("neg"), 2) polarity = [] def GetPolarity(string):
'''Dataset source: Abdulla N. A., Mahyoub N. A., Shehab M., Al-Ayyoub M.,
"Arabic Sentiment Analysis: Corpus-based and Lexicon-based", IEEE conference
on Applied Electrical Engineering and Computing Technologies (AEECT 2013),
December 3-12, 2013, Amman, Jordan. (Accepted for Publication).'''

# creating Naive Bayes Classifier
from textblob.classifiers import NaiveBayesClassifier

# NOTE(review): the classifier and accuracy() are given file paths here;
# some TextBlob versions expect open file objects instead - confirm
# against the installed version.
cl = NaiveBayesClassifier("train.csv", format="csv")

# Test model with its two labels
# (single-argument print(...) behaves the same on Python 2 and 3)
print(cl.classify(u" احسن علاج هذا"))

# second cl model test
prob_dist = cl.prob_classify(u"ك يوم يا ظالم,")
print(prob_dist.max())
print(prob_dist.prob("positive"))
print(prob_dist.prob("negative"))

# compute the accuracy on our test set
print("accuracy on the test set:{} ".format(cl.accuracy("testing.csv", format="csv")))

# display a listing of the most informative features.
cl.show_informative_features(5)

# add new data
new_data = [
    (u"كلام صحيح من شان هيك الدول اللي ما فيها بطالة والمجتمعات المفتوحة بتقل فيها المشاكل النفسية", 'positive'),
    (u"لا طبعا التقرب الى الله هو خير علاج للحالات النفسية", 'positive'),
    (u"تفائلوا بالخير تجدوه", 'positive'),
    (u"يا ترى الحكومه بدها تزيد دعم المواطن الي الله يكون في عونه", 'negative'),
]
with open('test.json', 'r') as test_file: model1_accuracy = model1.accuracy(test_file, format=None) print("model1 accuray = '%s' " %model1_accuracy) ############################################################################### ############################################################################### print("#################################################") text3 = "We did not like his results." #probability_classification_chosen = 'neg' #probability_positive = '0.11' #probability_negative = '0.89' print("Assessing text = " + text3) model1_prob_dist = model1.prob_classify(text3) probability_classification_chosen = model1_prob_dist.max() print("probability_classification = '%s' " %probability_classification_chosen) probability_positive = round(model1_prob_dist.prob("pos"), 2) print("probability_positive = '%s' " %probability_positive) probability_negative = round(model1_prob_dist.prob("neg"), 2) print("probability_negative = '%s' " %probability_negative) text_classification = "NOT-PROPERLY-CLASSIFIED" if probability_positive<=0.55 and probability_negative<=0.55: text_classification = "NEUTRAL" elif probability_positive>0.55 and probability_positive<=0.75 and probability_negative<=0.55: text_classification = "SLIGHTLY-POSITIVE"
b = row['timestamp'].replace('-',' ').replace(':',' ').split() b = [int(x) for x in b] time = datetime.datetime(b[0],b[1],b[2],b[3],b[4],b[5]) # Skip if not at time yet if start_time > time: continue # Break if past endtime if end_time < time: break # Add to sentiment n+=1 newline=row['text'].decode('utf-8') prob_dist=cl.prob_classify(newline) line_sent = prob_dist.max() if line_sent==' pos': sentiment+=1 elif line_sent==' neg': sentiment-=1 # If interval has been reach, start a new bin if abs(time.minute - minutes) >= interval: # Record time variables times.append(tot_time) tot_time += interval labels.append(str(time.hour) + ':' + str(time.minute)) minutes = time.minute
import data_sets

# The model is built from the "subte" corpus; data_sets.en_train and
# data_sets.en_test are the English alternative.
train = data_sets.subte_train
test = data_sets.subte_test

# Sentences to classify (English samples would be "I feel amazing!" and
# "This one's a doozy.").
tx_cl = "El subte esta demorado"
tx_prob = "El subte funciona bien"

cl = NaiveBayesClassifier(train)

# Single-argument print(...) behaves the same on Python 2 and 3.
print(cl.classify(tx_cl))
print(cl.classify("El subte funciona bien"))

prob_dist = cl.prob_classify(tx_prob)
print(prob_dist.max())
print(round(prob_dist.prob("pos"), 2))
print(round(prob_dist.prob("neg"), 2))

# Score against the subte test split selected above; the English test
# set does not match the corpus the model was trained on.
print(cl.accuracy(test))
print(cl.show_informative_features(5))

# Using TextBlob
blob = TextBlob("No funca por que hay obras para mejorar la cosa", classifier=cl)
print(blob.sentiment)
print(blob.classify())

blob = TextBlob("El subte funciona normal", classifier=cl)
print(blob.sentiment)
print(blob.classify())
class TestNaiveBayesClassifier(unittest.TestCase):
    """Tests for NaiveBayesClassifier construction, prediction and inspection."""

    def setUp(self):
        # Every test starts from a classifier trained on the shared train_set.
        self.classifier = NaiveBayesClassifier(train_set)

    def _assert_trained_from_file(self, source, **init_kwargs):
        """Train on ``source`` and check the prediction and stored text type."""
        trained = NaiveBayesClassifier(source, **init_kwargs)
        assert_equal(trained.classify("I feel happy this morning"), 'pos')
        first_sentence = trained.train_set[0][0]
        assert_true(isinstance(first_sentence, unicode))

    def test_default_extractor(self):
        sentence = "I feel happy this morning."
        actual = self.classifier.extract_features(sentence)
        assert_equal(actual, basic_extractor(sentence, train_set))

    def test_classify(self):
        label = self.classifier.classify("I feel happy this morning")
        assert_equal(label, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        tokens = ["I", "feel", "happy", "this", "morning"]
        assert_equal(self.classifier.classify(tokens), "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        tokenized = [(doc.split(), label) for doc, label in train_set]
        from_tokens = NaiveBayesClassifier(tokenized)
        assert_equal(from_tokens.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        dist = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(dist.max(), "positive")
        assert_true(dist.prob("positive") > dist.prob("negative"))

    def test_accuracy(self):
        score = self.classifier.accuracy(test_set)
        assert_true(isinstance(score, float))

    def test_update(self):
        # Snapshot probability and corpus size, add one example, compare.
        before = self.classifier.prob_classify("lorem ipsum")
        size_before = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        size_after = len(self.classifier.train_set)
        after = self.classifier.prob_classify("lorem ipsum")
        assert_true(after.prob("positive") > before.prob("positive"))
        assert_equal(size_before + 1, size_after)

    def test_labels(self):
        known = self.classifier.labels()
        for expected in ("positive", "negative"):
            assert_true(expected in known)

    def test_show_informative_features(self):
        # Smoke test only: the call must not raise.
        self.classifier.show_informative_features()

    def test_informative_features(self):
        top = self.classifier.informative_features(3)
        assert_true(isinstance(top, list))
        assert_true(isinstance(top[0], tuple))

    def test_custom_feature_extractor(self):
        custom = NaiveBayesClassifier(train_set, custom_extractor)
        custom.classify("Yay! I'm so happy it works.")
        assert_equal(custom.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        self._assert_trained_from_file(CSV_FILE, format="csv")

    def test_init_with_csv_file_without_format_specifier(self):
        self._assert_trained_from_file(CSV_FILE)

    def test_init_with_json_file(self):
        self._assert_trained_from_file(JSON_FILE, format="json")

    def test_init_with_json_file_without_format_specifier(self):
        self._assert_trained_from_file(JSON_FILE)

    def test_accuracy_on_a_csv_file(self):
        score = self.classifier.accuracy(CSV_FILE)
        assert_true(isinstance(score, float))

    def test_accuracy_on_json_file(self):
        score = self.classifier.accuracy(JSON_FILE)
        assert_true(isinstance(score, float))

    def test_init_with_tsv_file(self):
        self._assert_trained_from_file(TSV_FILE)

    def test_init_with_bad_format_specifier(self):
        # An unknown format name must be rejected at construction time.
        assert_raises(ValueError, NaiveBayesClassifier, CSV_FILE,
                      format='unknown')

    def test_repr(self):
        expected = "<NaiveBayesClassifier trained on {0} instances>".format(len(train_set))
        assert_equal(repr(self.classifier), expected)
# Train on newTrMerged, which is built earlier in the script.
cl = NaiveBayesClassifier(newTrMerged)
print("end training")

# Load the test file and pair every review with its ID.
test_df = read_csv('test1_org.csv')
tr_ID = test_df['ID']  # [:5]
tr_review = test_df['review']  # [:5]
newTestMerged = zip(tr_review, tr_ID)

# Write one (ID, probability of label "1") row per test review.
# NOTE(review): 'wb' is the Python 2 way to open a csv output file.
with open('result.csv', 'wb') as csvfile:
    resultwriter = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    resultwriter.writerow(("ID", "Predicted"))
    # Predicted labels are collected alongside the probabilities written out.
    emptyCl = []
    for review_text, review_id in newTestMerged:
        emptyCl.append(cl.classify(review_text))
        prob_pos = cl.prob_classify(review_text).prob("1")
        resultwriter.writerow((review_id, prob_pos))

print("done in %fs" % (time() - t0))
# Score the model on the already-open test_file.
model_accuracy = model.accuracy(test_file, format=None)
# NOTE(review): "accuray" is misspelled in the emitted message; it is kept
# verbatim here because the text is runtime output.
print("model accuray = '%s' " % model_accuracy)

###############################################################################
# CREATING A NEUTRAL CLASS FROM POSITIVE AND NEGATIVE
###############################################################################
print("#################################################")
text3 = "We did not like his results."
# Values recorded by the original author for this sentence:
#   chosen label 'neg', positive '0.11', negative '0.89'
print("Assessing text = " + text3)

# Only the two rounded class probabilities are needed for the coarse label;
# the per-value debug prints are disabled in this variant.
model_prob_dist = model.prob_classify(text3)
probability_positive = round(model_prob_dist.prob("pos"), 2)
probability_negative = round(model_prob_dist.prob("neg"), 2)

# Neither class above 0.55 -> NEUTRAL; a positive score in (0.55, 0.75] with
# a weak negative score -> SLIGHTLY-POSITIVE; otherwise keep the fallback.
text_classification = "NOT-PROPERLY-CLASSIFIED"
if max(probability_positive, probability_negative) <= 0.55:
    text_classification = "NEUTRAL"
elif 0.55 < probability_positive <= 0.75 and probability_negative <= 0.55:
    text_classification = "SLIGHTLY-POSITIVE"
# For every name listed in yelplinks.txt, read the matching file under
# wordstopolarity/ line by line and score each line with two classifiers
# (`naive` and `decision`, both defined outside this fragment), mapping each
# verdict to +1 for "pos" and -1 otherwise.
with open("yelplinks.txt") as f:
    array= f.readlines()
for line in array:
    # Splitting on the newline leaves the bare entry in line1[0].
    line1=line.split('\n')
    openfile= "wordstopolarity/"+line1[0]+".txt"
    # NOTE(review): neither output file is written to or closed in the part
    # of the loop visible here - confirm they are closed further down.
    outputfile = open ("polarity/"+line1[0]+".txt" , "w+")
    outputfiletest = open ("polaritytesting/"+line1[0]+".txt" , "w+")
    # k counts the lines processed for the current entry.
    k=0
    with open(openfile) as s:
        # This inner loop reuses the name `line`, shadowing the outer loop
        # variable; line1 was captured before, so the entry name is kept.
        for line in s:
            text = line
            # Progress report every 200 lines.
            if k % 200 ==0 :
                print line1[0]+ "\t"+str (k)
            k=k+1
            # Naive Bayes: most probable label and its rounded probability.
            naivebayes=naive.prob_classify(text)
            naivebayes_max=naivebayes.max()
            naivebayes_prob=round(naivebayes.prob(naivebayes_max), 3)
            naivebayes_value=0
            if str(naivebayes_max)=="pos":
                naivebayes_value= 1
            else:
                naivebayes_value= -1
            # Second classifier: works on extracted features and takes a
            # batch, hence the one-element list.
            decisionTest=[(review_features(text))]
            decisionTree=decision.classify_many(decisionTest)
            decisionTree_value=0
            if str(decisionTree[0])=="pos":
                decisionTree_value=1
            else:
                decisionTree_value= -1
# # w= Word('running')
# print w.lemmatize()

# Text classification demo: train a Naive Bayes classifier on a small
# hand-labelled corpus, then inspect its predictions.
train = [
    ('I love this sandwich.', 'pos'),
    ('this is an amazing place!', 'pos'),
    ('I feel very good about these beers.', 'pos'),
    ('this is my best work.', 'pos'),
    ("what an awesome view", 'pos'),
    ('I do not like this restaurant', 'neg'),
    ('I am tired of this stuff.', 'neg'),
    ("I can't deal with this", 'neg'),
    ('he is my sworn enemy!', 'neg'),
    ('my boss is horrible.', 'neg'),
]
test = [
    ('the beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg'),
]

# print(x) with a single argument behaves the same on Python 2 and 3.
cl = NaiveBayesClassifier(train)
print(cl.classify("This is an amazing library!"))
print(cl.accuracy(test))

# show_informative_features() prints its own table and returns None, so it is
# called directly; wrapping it in print emitted a stray "None" line.
cl.show_informative_features(5)

# Most probable label for one sentence, taken from the full distribution.
prob_dist = cl.prob_classify("This one's a doozy.")
print(prob_dist.max())
# Accounts with a known category, loaded from the two "historical" lists
# (one account per line).  The files are read inside `with` blocks so the
# handles are closed even if reading fails; the original left them open.
historicos = {}
with open('util/politicos-historico.txt', 'r') as fPolH:
    for item in fPolH:
        historicos[item.strip()] = 'politico'
with open('util/medios-historico.txt', 'r') as fMedH:
    for item in fMedH:
        historicos[item.strip()] = 'medio'

print('\nClasificando:')
clasifSalida = {}
for item in clasificaEsto:
    if item in historicos:
        # A known account keeps its historical category.
        clasifSalida[item] = historicos[item]
    else:
        prob_dist = clasificador.prob_classify(clasificaEsto[item])
        best_label = prob_dist.max()
        # Accept the prediction only when its probability rounds to 1 at
        # three decimals; any doubt falls back to 'ciudadano'.
        if round(prob_dist.prob(best_label), 3) == 1:
            clasifSalida[item] = best_label
        else:
            clasifSalida[item] = 'ciudadano'

# Single-argument print(...) gives the same output on Python 2 and 3 and
# matches the call form already used above.
print('Leyendo lista completa de usuarios...')
with open(sys.argv[2], 'r') as fUserList:
    for item in fUserList:
        item = item.strip()
        # Users not classified above get their historical category when one
        # exists, otherwise the default 'ciudadano'.
        if item not in clasifSalida:
            clasifSalida[item] = historicos.get(item, 'ciudadano')
# Relabel pol_df rows as 'pos'/'neg' from pol_mask, train and pickle a Naive
# Bayes classifier on the combined pol_df/adj_df rows, then build a
# text -> scaled-score lexicon.
# NOTE(review): the bare expressions below (pol_df, nb_training,
# prob_dist.max(), ...) look like notebook cell outputs; in a plain script
# they have no effect - confirm.

# Work on a copy of the 'labels' column and store the result as column 'etc'.
pol_labels = pol_df['labels'].copy()
pol_labels[pol_mask] = 'pos'
pol_labels[~pol_mask] = 'neg'
pol_df['etc'] = pol_labels
pol_df
# A set is used, so duplicate (row[0], row[2]) pairs collapse to one example.
# NOTE(review): rows are indexed by position; presumably row[0] is the text
# and row[2] the pos/neg label - verify against the frame layouts.
nb_training = set()
for i, row in pol_df.iterrows():
    nb_training.add((row[0], row[2]))
for i, row in adj_df.iterrows():
    nb_training.add((row[0], row[2]))
nb_training
nbc = NaiveBayesClassifier(nb_training)
# Quick manual check of the trained classifier on one phrase.
prob_dist = nbc.prob_classify('trump hates racism')
prob_dist.max()
prob_dist.prob('neg')
# Persist the trained classifier.
nb_name = 'naivebayesclassifier.pkl'
with open(nb_name, 'wb') as f:
    pickle.dump(nbc, f)
# Map row[0] to row[1] / 10.0 rounded to three decimals.  pol_df is processed
# last, so its value wins for any key present in both frames.
lin_reg_training = {}
for i, row in adj_df.iterrows():
    lin_reg_training[row[0]] = round(row[1] / 10.0, 3)
for i, row in pol_df.iterrows():
    lin_reg_training[row[0]] = round(row[1] / 10.0, 3)
with open('sentiment_lexicon.pkl', 'wb') as f:
# Convert every record in `data` to a plain list and add it to `ol`.
ol.extend(d.tolist() for d in data)
print(ol)

# First 60 records train the classifier; records from index 29900 onwards
# are used to measure accuracy.
train = ol[:60]
test = ol[29900:]
cl = NaiveBayesClassifier(train)
accuracy = cl.accuracy(test)
print("Accuracy: {0}".format(accuracy))

# Load foo.csv as a raw array and keep the second column of every row.
res = pd.read_csv('foo.csv').values
print(res)
pl = [r[1] for r in res]
print(pl)

# NOTE(review): the whole list `pl` is passed in a single call, so one
# distribution is produced for all rows together rather than one per row -
# confirm this is intended.
pred = cl.prob_classify(pl)
print(pred.max())

# Show 5 most informative features
#cl.show_informative_features(5)
class Emote(object): emoteClassOn = False # Is Emote being used as a library or class? runningScript = False # Or is Emote being run as a script directly? firstTime = True # Emote running for the first time? pickledOn = False # Is a pickled database detected? SQLDataOn = False # Is a SQL database detected? fullCount = "" # The string result detailing the full amount of classifications (sorted by type and frequency) that the current training database contains writtenAnalysis = False # Turn writte analysis on? levelsAnalysis = True # Turn full levels analysis on? defaultCorpus = "" # What's the default corpus? # connectDB = sqlite3.connect('base_corpus.db') # Using SQL db for base corpus texts def __init__(self, message="", pre_result="", prob_dist=0, prob_dist_max=0, positive=0, negative=0, joy=0, anger=0, love=0, hate=0, certainty=0, boredom=0, intensity=0, regret=0, challenging=0, agreeable=0, desire=0, calm=0, sarcastic=0, emphatic=0, pride=0, accusative=0, admiration=0, inquisitive=0, modest=0, instructive=0, ambivalence=0, vulgarity=0, train=[], cl=NaiveBayesClassifier([]), punctCountDict={}, wordCount=0, sentenceCount=0, normalizedProbValues={}, sentences=[], sentencesProbValues=[], massResults=[]): self.train = train # PLACE THE TRAINING DATA (TUPLES) IN SELF.TRAIN BELOW self.train = [] # self.message = message self.punctCountDict = punctCountDict self.wordCount = wordCount self.sentenceCount = sentenceCount self.pre_result = pre_result self.prob_dist = prob_dist self.prob_dist_max = prob_dist_max self.positive = positive self.negative = negative self.joy = joy self.anger = anger self.love = love self.hate = hate self.certainty = certainty self.boredom = boredom self.intensity = intensity self.regret = regret self.challenging = challenging self.agreeable = agreeable self.desire = desire self.calm = calm self.sarcastic = sarcastic self.emphatic = emphatic self.pride = pride self.accusative = accusative self.admiration = admiration self.inquisitive = 
inquisitive self.modest = modest self.instructive = instructive self.ambivalence = ambivalence self.vulgarity = vulgarity self.prob_dist = prob_dist self.prob_dist_max = prob_dist_max self.cl = cl self.normalizedProbValues = normalizedProbValues self.sentences = sentences self.sentencesProbValues = sentencesProbValues self.massResults = massResults def getInput(self, _message): global firstTime global runningScript global emoteClassOn if runningScript == True: if firstTime == False: self.message = input('\n\tWrite message to be analyzed: ') _message = self.message self.countPunct(_message) self.countWordSent(_message) self.runAnalysis(_message) else: print( """\n\tNow starting Emote as a script. Use Emote Mass Analyzer to break down a text into individual sentence classifications, or import Emote as a library.""") firstTime = False self.initialTrain() else: if firstTime == True: # print("\nFIRST TIME IS TRUE") print("\n\tRunning Emote as a library..") self.message = _message emoteClassOn = True self.countPunct(_message) self.countWordSent(_message) self.runAnalysis(_message) else: # print("\nFIRST TIME IS FALSE") emoteClassOn = False self.message = _message self.countPunct(_message) self.countWordSent(_message) self.runAnalysis(_message) def initialTrain(self): # For interchangable corpuses.. 
uncomment code modifying selectedCorpus # selectedCorpus = input('\n\tEnter the name of the corpus file to load (Press enter to load default, from base_corpus.py): ') global defaultCorpus global pickledOn global SQLDataOn global SQLData global connectDB global fullCount # ` = str(self.train) fullDatabase = str(self.train) countPositive = fullDatabase.count("'positive')", 0, len(fullDatabase)) countNegative = fullDatabase.count("'negative')", 0, len(fullDatabase)) countLove = fullDatabase.count("'love')", 0, len(fullDatabase)) countHate = fullDatabase.count("'hate')", 0, len(fullDatabase)) countJoy = fullDatabase.count("'joy')", 0, len(fullDatabase)) countAnger = fullDatabase.count("'anger')", 0, len(fullDatabase)) countCertainty = fullDatabase.count("'certainty'", 0, len(fullDatabase)) countConfusion = fullDatabase.count("'confusion'", 0, len(fullDatabase)) countAmusement = fullDatabase.count("'amusement'", 0, len(fullDatabase)) countBoredom = fullDatabase.count("'boredom'", 0, len(fullDatabase)) countIntensity = fullDatabase.count("'intensity'", 0, len(fullDatabase)) countRegret = fullDatabase.count("'regret'", 0, len(fullDatabase)) countAgreeable = fullDatabase.count("'agreeable'", 0, len(fullDatabase)) countChallenging = fullDatabase.count("'challenging'", 0, len(fullDatabase)) countDesire = fullDatabase.count("'desire'", 0, len(fullDatabase)) countCalm = fullDatabase.count("'calm'", 0, len(fullDatabase)) countEmphatic = fullDatabase.count("'emphatic'", 0, len(fullDatabase)) countSarcastic = fullDatabase.count("'sarcastic'", 0, len(fullDatabase)) countInstructive = fullDatabase.count("'instructive'", 0, len(fullDatabase)) countAccusative = fullDatabase.count("'accusative'", 0, len(fullDatabase)) countAdmiration = fullDatabase.count("'admiration'", 0, len(fullDatabase)) countInquisitive = fullDatabase.count("'inquisitive'", 0, len(fullDatabase)) countModest = fullDatabase.count("'modest'", 0, len(fullDatabase)) countPride = fullDatabase.count("'pride'", 0, 
len(fullDatabase)) countAmbivalence = fullDatabase.count("'ambivalence'", 0, len(fullDatabase)) countVulgarity = fullDatabase.count("'vulgarity'", 0, len(fullDatabase)) fullCount = "\n\tNumbers and types of classifications in loaded database: \n"+ "\t\tPositive: " + str(countPositive) + "\t" + "Negative: " + str(countNegative) + \ "\t\tJoy: " + str(countJoy) + "\t\t" + "Anger: " + str(countAnger) + "\t\tCertainty: " + str(countCertainty) + "\t" + "Confusion: " + str(countConfusion) + \ "\t\tCertainty: " + str(countCertainty) + "\t" + "Confusion: " + str(countConfusion) + "\t\tAmusement: " + str(countAmusement) + "\t" + "Boredom: " + str(countBoredom) + \ "\t\tIntensity: " + str(countIntensity) + "\t" + "Regret: " + str(countRegret) + "\t\tAgreeable: " + str(countAgreeable) + "\t" + "Challenging: " + str(countChallenging) + \ "\t\tDesire: " + str(countDesire) + "\t" + "Calm: " + str(countCalm) + "\t\tEmphatic: " + str(countEmphatic) + "\t" + "Sarcastic: " + str(countSarcastic) + \ "\t\tInstructive: " + str(countInstructive) + "\t" + "Accusative: " + str(countAccusative) + "\t\tAdmiration: " + str(countAdmiration) + "\t" + "Inquisitive: " + str(countInquisitive) + \ "\t\tAdmiration: " + str(countAdmiration) + "\t" + "Inquisitive: " + str(countInquisitive) + "\t\tAmbivalence: " + str(countAmbivalence) + "\t" + "Vulgarity: " + str(countVulgarity) print( """\n\tNumbers and types of classifications in database to be loaded: \n""" ) print("\t\tPositive: " + str(countPositive) + "\t" + "Negative: " + str(countNegative)) print("\t\tLove: " + str(countLove) + "\t\t" + "Hate: " + str(countHate)) print("\t\tJoy: " + str(countJoy) + "\t\t" + "Anger: " + str(countAnger)) print("\t\tCertainty: " + str(countCertainty) + "\t" + "Confusion: " + str(countConfusion)) print("\t\tAmusement: " + str(countAmusement) + "\t" + "Boredom: " + str(countBoredom)) print("\t\tIntensity: " + str(countIntensity) + "\t" + "Regret: " + str(countRegret)) print("\t\tAgreeable: " + str(countAgreeable) + 
"\t" + "Challenging: " + str(countChallenging)) print("\t\tDesire: " + str(countDesire) + "\t" + "Calm: " + str(countCalm)) print("\t\tEmphatic: " + str(countEmphatic) + "\t" + "Sarcastic: " + str(countSarcastic)) print("\t\tInstructive: " + str(countInstructive) + "\t" + "Accusative: " + str(countAccusative)) print("\t\tAdmiration: " + str(countAdmiration) + "\t" + "Inquisitive: " + str(countInquisitive)) print("\t\tModest: " + str(countModest) + "\t" + "Pride: " + str(countPride)) print("\t\tAmbivalence: " + str(countAmbivalence) + "\t" + "Vulgarity: " + str(countVulgarity)) # if selectedCorpus != defaultCorpus and selectedCorpus != "": # defaultCorpus = selectedCorpus # elif selectedCorpus == "": # defaultCorpus = defaultCorpus # else: # defaultCorpus = "base_corpus.py" selectedCorpus = defaultCorpus try: path = os.getcwd() path = os.path.join(path, 'data', 'base_corpus.pickle') with open(path, 'rb') as fp: size = os.path.getsize(path) if size > 0: pickledOn = True print("\n\tPickled data found!") else: pass fp.close() except IOError as err: pickledOn = False path = os.getcwd() print( "\n\tNo pickled data found.. now creating and loading pickle.." ) # If corpus text in SQL db.. # try: # path = os.getcwd() # path = os.path.join(path, '../data', 'base_corpus.db') # with open(path, 'r') as fp: # SQLDataOn = True # size = os.path.getsize(path) # if size > 5: # SQLDataOn = True # print("\n\tNo SQL found.") # else: # SQLDataOn = False # print("\n\tSQL found!") # fp.close() # except IOError as err: # SQLDataOn = False # print("\n\tNo SQL data found.. 
now creating and loading SQL.") # SHELVE STUFF # READING TRAINING DATA FROM FILE DEFAULTCORPUS if pickledOn == False: # Code below takes training data from text file input # path = os.getcwd() # path = os.path.join(path, 'data', 'base_corpus.py') # shelvedData = shelve.open('base_corpus.db') # if shelvedData: # pickledOn = True # with open(path, 'r') as fp: # print(fp) # fp = open(path,'r').read().tt('\n') # self.train = fp.readlines() # temp = [line[:-1] for line in self.train] # print(temp) # self.train = self.train.rstrip("\r\n") # for i in self.train: # i = i.encode('ascii', 'backslashreplace') # i = i.rstrip("\r\n") # print(i) # lines = tuple(open(path, 'r', encoding = 'utf-8')) # lines = lines.strip() # print(str(lines)) # self.train = lines # print(self.train) print("\n\tOpening training data.") # if SQLDataOn == False: # self.sendToSQL() # currentTime = datetime.datetime.now().time() # print("\n\n\tTIME NEW DATABASE STARTED TRAINING: ", currentTime) # print("""\n\tStarting NaiveBayesClassifer training for """ + str(len(self.train)) + """ supervised classifications.. the initial training period will take a while.""") # elif SQLDataOn == True: # self.parseFromSQL() random.seed(1) random.shuffle(self.train) self.cl = NaiveBayesClassifier(self.train) print("\n\tTraining now..") # shelvedData["base"] = cl # SHELF vs PICKLE path = os.getcwd() path = os.path.join(path, 'data', 'base_corpus.pickle') fp = open(path, 'wb') print("\n\tLoaded training data into pickle file.") pickle.dump(self.cl, fp, protocol=pickle.HIGHEST_PROTOCOL) fp.close() print( "\n\tPickling complete, and will be loaded as the default database corpus next time, skipping the training period." 
) currentTime = datetime.datetime.now().time() print("\n\n\tTIME NEW DATABASE FINISHED TRAINING AND SAVING: ", currentTime) # shelvedData.close() # SHELF vs PICKLE if pickledOn == True: try: # shelvedData = shelve.open("base_corpus.dat") # SHELF VS PICKLE path = os.getcwd() path = os.path.join(path, 'data', 'base_corpus.pickle') fp = open(path, "rb") self.cl = pickle.load(fp) fp.close() print("\n\tTraining has been loaded from the selected corpus.") print("\t\t" + fullCount) except IOError as err: print( "\n\tError training pickle file.. system will exit. Go into the directory, delete the corrupt pickle file, and retry this script to train a new copy." ) sys.exit() pass if emoteClassOn == True: self.runAnalysis(_message) else: self.getInput(_message) # If corpus data was stored in SQL.. # def sendToSQL(self): # c.execute("DROP TABLE IF EXISTS Base") # c.execute("CREATE TABLE Base (Date_Sorted TEXT, Source TEXT, Message TEXT);") # for i in self.train: # # print(i) # try: # c.execute("INSERT INTO Base VALUES (?, ?, ?);", ('11-05-2016', 'general', i)) # connectDB.commit() # print(i) # except: # print('err') # pass # c.close() # def parseFromSQL(self): # global SQLData # global connectDB # print("Training data from SQL..") # try: # # connectDB.row_factory = sqlite3.Row # c.execute("SELECT Message FROM base WHERE 1") # # connectDB.text_factory = lambda x: x.decode("utf-8") # all_rows = cursor.fetchall() # # line = re.sub('[!@#$]', '', line) # # all_rows = [row[0].strip for row in cursor.fetchall()] # # for r in all_rows: # # temp_row = r[0] # # temp_row = temp_row.strip() # # temp_row = re.sub('\r\n', '', temp_row) # # temp_row = re.sub('\\', '', temp_row) # # temp_row = unicodedata.normalize('NFKD', temp_row).encode('ascii','ignore') # # print(temp_row) # # temp_row = temp_row.replace("\\","") # # SQLData.append(unicodedata.normalize('NFKD', temp_row)) # # SQLData.append(str(temp_row).strip()) # except: # pass def countPunct(self, _message): numberCount = 0 periodCount 
= 0 commaCount = 0 exclamationPtCount = 0 questionMkCount = 0 for char in _message: if char.isdigit() == True: numberCount += 1 elif char == '.': periodCount += 1 elif char == ',': commaCount += 1 elif char == '!': exclamationPtCount += 1 elif char == '?': questionMkCount += 1 else: pass self.punctCountDict = { "numbers": numberCount, "periods_end": periodCount, "question_marks": questionMkCount, "exclamation_points": exclamationPtCount, "commas": commaCount } return self.punctCountDict def countWordSent(self, _message): _messageSplitWords = _message.split() _messageSplitSent = sent_tokenize(_message) self.wordCount = len(_messageSplitWords) # print("\n\tWord count in message: " + str(self.wordCount)) self.sentenceCount = len(_messageSplitSent) # print("\n\tSentence count in message: " + str(self.sentenceCount)) return self.wordCount, self.sentenceCount def split_into_sentences(self, _message): # global firstTime sentenceTempValStore = [] self.normalizedProbValues = [] # if firstTime == False: self.sentences = sent_tokenize(_message) if len(self.sentences) > 1: for i in self.sentences: self.runAnalysis(str(i)) self.sentencesProbValues.append(self.normalizedProbValues) return self.sentencesProbValues else: pass def analyzeCSV(self, path): csvData = [] csvTextData = [] file = open(path, 'r') csv_file = csv.reader(file, delimiter=",") for row in csv_file: csvData.append(row[0]) csvTextData.append(row[1]) file.close() print("\n\t", csvData) print("\n\t", csvTextData) print("\n\t", csvTextData) print("\n\t", csvData) self.massResults = [] for i in range(len(csvTextData)): self.runAnalysis(csvTextData[i]) print(emote.normalizedProbValues) self.massResults.append(self.normalizedProbValues) path = os.getcwd() path = os.path.join(path, 'static', 'results.csv') csvFile = open('static/results.csv', 'w', newline='') for i in range(len(self.massResults)): # with open('static/results.csv', 'w', newline='') as csvFile: csvIndRowList = [] csvResults = csv.writer(csvFile, 
delimiter=',') csvIndRowList.append(csvData[i]) csvIndRowList.append(csvTextData[i]) csvIndRowList.append(self.massResults[i][0]) csvIndRowList.append(self.massResults[i][1]) csvIndRowList.append(self.massResults[i][2]) csvIndRowList.append(self.massResults[i][3]) csvIndRowList.append(self.massResults[i][4]) csvIndRowList.append(self.massResults[i][5]) print("\n\tROW LIST", csvIndRowList) csvResults.writerow(csvIndRowList) csvFile.close() return csvResults, csvFile, self.massResults def runAnalysis(self, _message): global emoteClassOn global firstTime global runningScript if firstTime == True and emoteClassOn == True: print( "\n\n\tFirst time running analysis.. load pickle data. The initial analysis will be slower because of the loading." ) path = os.getcwd() # path = os.path.join(path, '/Users/johnny/Documents/GitHub/emote/data', 'base_corpus.pickle') # path = os.getcwd() path = os.path.join(path, 'data', 'base_corpus.pickle') fp = open(path, 'rb') self.cl = pickle.load(fp) fp.close() emoteClassOn = False firstTime = False # print("\n\tAnalyzing " + "'"+str(_message)+"'" +"..") self.prob_dist = self.cl.prob_classify(_message) self.prob_dist_max = self.prob_dist.max() self.positive = round(self.prob_dist.prob("positive"), 4) self.negative = round(self.prob_dist.prob("negative"), 4) self.joy = round(self.prob_dist.prob("joy"), 4) self.anger = round(self.prob_dist.prob("anger"), 4) self.love = round(self.prob_dist.prob("love"), 4) self.hate = round(self.prob_dist.prob("hate"), 4) self.certainty = round(self.prob_dist.prob("certainty"), 4) self.confusion = round(self.prob_dist.prob("confusion"), 4) self.amusement = round(self.prob_dist.prob("amusement"), 4) self.boredom = round(self.prob_dist.prob("boredom"), 4) self.intensity = round(self.prob_dist.prob("intensity"), 4) self.regret = round(self.prob_dist.prob("regret"), 4) self.agreeable = round(self.prob_dist.prob("agreeable"), 4) self.challenging = round(self.prob_dist.prob("challenging"), 4) self.desire = 
round(self.prob_dist.prob("desire"), 4) self.calm = round(self.prob_dist.prob("calm"), 4) self.emphatic = round(self.prob_dist.prob("emphatic"), 4) self.sarcastic = round(self.prob_dist.prob("sarcastic"), 4) self.instructive = round(self.prob_dist.prob("instructive"), 4) self.accusative = round(self.prob_dist.prob("accusative"), 4) self.admiration = round(self.prob_dist.prob("admiration"), 4) self.inquisitive = round(self.prob_dist.prob("inquisitive"), 4) self.modest = round(self.prob_dist.prob("modest"), 4) self.pride = round(self.prob_dist.prob("pride"), 4) self.ambivalence = round(self.prob_dist.prob("ambivalence"), 4) self.vulgarity = round(self.prob_dist.prob('vulgarity'), 4) valueList = [ self.positive, self.negative, self.joy, self.anger, self.love, self.hate, self.certainty, self.confusion, self.amusement, self.boredom, self.intensity, self.regret, self.agreeable, self.challenging, self.desire, self.calm, self.emphatic, self.sarcastic, self.instructive, self.accusative, self.admiration, self.inquisitive, self.modest, self.ambivalence, self.vulgarity ] posNegAbsVal = math.fabs(self.positive - self.negative) if posNegAbsVal <= .25: self.positive = self.positive * math.sqrt( self.positive) * math.sqrt(self.positive) * math.sqrt( self.positive) * math.sqrt(self.positive) self.negative = self.negative * math.sqrt( self.negative) * math.sqrt(self.negative) * math.sqrt( self.negative) * math.sqrt(self.negative) else: pass if runningScript == True: # print("\n") # print("\n\tProbability Values Pre-Normalization: ") # print("\tStrongest Emotion: " + self.prob_dist_max) # print("\tPositive: " + str(self.positive) + "\tNegative: " + str(self.negative)) # print("\tJoy: " + str(self.joy) + "\tAnger: " + str(self.anger)) # print("\tLove: " + str(self.love) + "\tHate: " + str(self.hate)) # print("\tCertainty: " + str(self.certainty) + "\tConfusion: " + str(self.confusion)) # print("\tAmusement: " + str(self.amusement) + "\tBoredom: " + str(self.boredom)) # 
print("\tIntensity: " + str(self.intensity) + "\tRegret: " + str(self.regret)) # print("\tAgreeable: " + str(self.agreeable) + "\tChallenging: " + str(self.challenging)) # print("\tDesire: " + str(self.desire) + "\tCalm: " + str(self.calm)) # print("\tEmphatic: " + str(self.emphatic) + "\tSarcastic: " + str(self.sarcastic)) # print("\tInstructive: " + str(self.instructive) + "\tAccusative: " + str(self.accusative)) # print("\tAdmiration: " + str(self.admiration) + "\tInquisitive: " + str(self.inquisitive)) # print("\tModest: " + str(self.modest) + "\tPride: " + str(self.pride)) # print("\tAmbivalence: " + str(self.ambivalence) + "\tVulgarity: " + str(self.vulgarity)) # print("\n") # pdData = [{'positive': self.positive, 'negative' : self.negative, 'joy' : self.joy, 'anger' : self.anger, # 'love': self.love, 'hate' : self.hate, 'certainty' : self.certainty, 'confusion' : self.confusion, # 'amusement' : self.amusement, 'boredom' : self.boredom, 'intensity' : self.intensity, 'regret' : self.regret, # 'agreeable': self.agreeable, 'challenging' : self.challenging, 'desire' : self.desire, 'calm' : self.calm, # 'emphatic' : self.emphatic, 'sarcastic' : self.sarcastic, 'instructive' : self.instructive, 'accusative' : self.accusative, # 'admiration' : self.admiration, 'inquisitive' : self.inquisitive, 'modest' : self.modest, 'pride' : self.pride, # 'ambivalence' : self.ambivalence, 'vulgarity' : self.vulgarity}] self.normalizedProbValues = pd.Series({ 'positive': self.positive, 'negative': self.negative, 'joy': self.joy, 'anger': self.anger, 'love': self.love, 'hate': self.hate, 'certainty': self.certainty, 'confusion': self.confusion, 'amusement': self.amusement, 'boredom': self.boredom, 'intensity': self.intensity, 'regret': self.regret, 'agreeable': self.agreeable, 'challenging': self.challenging, 'desire': self.desire, 'calm': self.calm, 'emphatic': self.emphatic, 'sarcastic': self.sarcastic, 'instructive': self.instructive, 'accusative': self.accusative, 'admiration': 
self.admiration, 'inquisitive': self.inquisitive, 'modest': self.modest, 'pride': self.pride, 'ambivalence': self.ambivalence, 'vulgarity': self.vulgarity }) # self.normalizedProbValues = pd.DataFrame(pdData).astype(np.float32) # print("\n\t",self.normalizedProbValues) # print("\n\t", self.normalizedProbValues.describe()) self.normalizeProbabilityPunctuation(_message) # return self.normalizedProbValues # return self.prob_dist, self.prob_dist_max, self.positive, self.negative, self.joy, self.anger, self.love, self.hate, self.certainty, self.confusion, self.amusement, self.boredom, self.intensity, self.regret, self.agreeable, self.challenging, self.desire, self.calm, self.emphatic, self.sarcastic, self.instructive, self.accusative, self.admiration, self.inquisitive, self.modest, self.ambivalence, self.vulgarity # return self.normalizedProbValues, self.prob_dist, self.prob_dist_max, self.positive, self.negative, self.joy, self.anger, self.love, self.hate, self.certainty, self.confusion, self.amusement, self.boredom, self.intensity, self.regret, self.agreeable, self.challenging, self.desire, self.calm, self.emphatic, self.sarcastic, self.instructive, self.accusative, self.admiration, self.inquisitive, self.modest, self.ambivalence, self.vulgarity else: # pdData = [{'positive': self.positive, 'negative' : self.negative, 'joy' : self.joy, 'anger' : self.anger, # 'love': self.love, 'hate' : self.hate, 'certainty' : self.certainty, 'confusion' : self.confusion, # 'amusement' : self.amusement, 'boredom' : self.boredom, 'intensity' : self.intensity, 'regret' : self.regret, # 'agreeable': self.agreeable, 'challenging' : self.challenging, 'desire' : self.desire, 'calm' : self.calm, # 'emphatic' : self.emphatic, 'sarcastic' : self.sarcastic, 'instructive' : self.instructive, 'accusative' : self.accusative, # 'admiration' : self.admiration, 'inquisitive' : self.inquisitive, 'modest' : self.modest, 'pride' : self.pride, # 'ambivalence' : self.ambivalence, 'vulgarity' : 
self.vulgarity}] self.normalizedProbValues = pd.Series({ 'positive': self.positive, 'negative': self.negative, 'joy': self.joy, 'anger': self.anger, 'love': self.love, 'hate': self.hate, 'certainty': self.certainty, 'confusion': self.confusion, 'amusement': self.amusement, 'boredom': self.boredom, 'intensity': self.intensity, 'regret': self.regret, 'agreeable': self.agreeable, 'challenging': self.challenging, 'desire': self.desire, 'calm': self.calm, 'emphatic': self.emphatic, 'sarcastic': self.sarcastic, 'instructive': self.instructive, 'accusative': self.accusative, 'admiration': self.admiration, 'inquisitive': self.inquisitive, 'modest': self.modest, 'pride': self.pride, 'ambivalence': self.ambivalence, 'vulgarity': self.vulgarity }) self.normalizeProbabilityPunctuation(_message) # return self.normalizedProbValues # return self.prob_dist, self.prob_dist_max, self.positive, self.negative, self.joy, self.anger, self.love, self.hate, self.certainty, self.confusion, self.amusement, self.boredom, self.intensity, self.regret, self.agreeable, self.challenging, self.desire, self.calm, self.emphatic, self.sarcastic, self.instructive, self.accusative, self.admiration, self.inquisitive, self.modest, self.ambivalence, self.vulgarity return self.normalizedProbValues, self.prob_dist, self.prob_dist_max, self.positive, self.negative, self.joy, self.anger, self.love, self.hate, self.certainty, self.confusion, self.amusement, self.boredom, self.intensity, self.regret, self.agreeable, self.challenging, self.desire, self.calm, self.emphatic, self.sarcastic, self.instructive, self.accusative, self.admiration, self.inquisitive, self.modest, self.ambivalence, self.vulgarity def normalizeProbabilityPunctuation(self, _message): # print("\n\t", self.punctCountDict) # print("\tNow normalizing probability based on punctuation count..") 
############################################################################################################################################################ # Base values below. Variables will be scaled off of linearly increasing relationships based off these values below, to determine different probability ranges. minWordCountRange = 0 minSentenceCountRange = 0 maxWordCountRange = 50 maxSentenceCountRange = 3 maxCommaCountRange = 6 msgWordCountLeveler = 0 msgSentenceCountLeveler = 0 punctSlidingThreshold = 1 # Code below contains the actual sliding algorithm for probability normalization through punctuation # START (The values in this if-then don't need to be sliding (mapped to a range), because anything longer than 50 words or 2 sentences will be considered "long"). # This part of the algorithm is also not adjusted by the leveler, because the progression does not scale well enough based off the original values without manipulation. # Manipulation come from the msgWordCountLeveler and msgSentenceCountLeveler variables if minWordCountRange < self.wordCount < maxWordCountRange and minSentenceCountRange < self.sentenceCount <= maxSentenceCountRange: # print("\tProbability normalization based off of the first level of scaling.") punctSlidingThreshold = 1 # Emphatic sentences more likely more likely (deep analytical thinking) # Values below are mapped to linearly scaling variables (to save having to numbers manually and repeatedly, of course). # PunctSlidingThreshold not used for commas for this instance case because multiplying by 1 does not give a high enough threshold # if minWordCountRange < self.wordCount < maxWordCountRange and self.sentenceCount >= maxSentenceCountRange and self.punctCountDict['commas'] <= 3: # print("\tLong, slow writing, with many commas.") # elif minWordCountRange < self.wordCount < maxWordCountRange and self.sentenceCount < maxSentenceCountRange and self.punctCountDict['commas'] <= 3: # print("\tQuick, rapid writing. 
Many short sentences, few commas.") # else: # pass if self.punctCountDict['numbers'] >= punctSlidingThreshold: # More informative or descriptive message more likely # print("\tNumbers detected.") pass elif self.punctCountDict['periods_end'] >= punctSlidingThreshold: # print("\tPeriods detected.") pass elif self.punctCountDict['question_marks'] >= punctSlidingThreshold: if self.inquisitive <= .1: self.inquisitive = .1 else: self.inquisitive = self.inquisitive / math.sqrt( self.inquisitive ) * self.punctCountDict['question_marks'] # print("\tQuestions detected.") elif self.punctCountDict[ 'exclamation_points'] >= punctSlidingThreshold: if self.intensity <= .1: self.intensity = .1 else: self.intensity = self.intensity / math.sqrt( self.intensity ) * self.punctCountDict['exclamation_points'] # print("\tExclamations detected.") elif self.punctCountDict['commas'] >= punctSlidingThreshold * 1.5: # print("\tCommas detected.") pass else: pass # END # START if self.wordCount > maxWordCountRange or minSentenceCountRange > maxSentenceCountRange: # print("\tProbability normaliziation based off of a proportionally increased level of scaling from word / sentence count.") msgWordCountLeveler = int(self.wordCount / maxWordCountRange) msgSentenceCountLeveler = int(self.sentenceCount / maxSentenceCountRange) minWordCountRange = 1 * msgWordCountLeveler minSentenceCountRange = 1 * msgSentenceCountLeveler maxWordCountRange = maxWordCountRange * msgWordCountLeveler maxSentenceCountRange = minSentenceCountRange * msgSentenceCountLeveler # Make sure we're not dividing by 0 if msgSentenceCountLeveler < 1: msgSentenceCountLeveler = 1 punctSlidingThreshold = int( (punctSlidingThreshold * (msgSentenceCountLeveler * msgWordCountLeveler / msgSentenceCountLeveler))) if minWordCountRange < self.wordCount < maxWordCountRange and minSentenceCountRange < self.sentenceCount < maxSentenceCountRange: # Emphatic sentences more likely more likely (deep analytical thinking) # print("\tLong sentence 
detected.") # Punctuation threshold for commas are slightly higher than end marks, so they are multiplied by 1.5 # if minWordCountRange < self.wordCount < maxWordCountRange and self.sentenceCount >= maxSentenceCountRange and self.commas < int(punctSlidingThreshold) * 1.5: # print("\tQuick, rapid writing. Many short sentences, few commas.") # if minWordCountRange < self.wordCount < maxWordCountRange and self.sentenceCount < maxSentenceCountRange and self.commas >= int(punctSlidingThreshold) * 1.5: # print("\tLong, slow writing, with many commas.") if self.punctCountDict['numbers'] >= punctSlidingThreshold: # More informative or descriptive message more likely # print("\tNumbers detected.") pass elif self.punctCountDict[ 'periods_end'] >= punctSlidingThreshold: # print("\tPeriods detected.") pass elif self.punctCountDict[ 'question_marks'] >= punctSlidingThreshold: if self.inquisitve <= .1: self.inquisitve = .1 else: self.inquisitive = self.inquisitive / math.sqrt( self.inquisitive ) * self.punctCountDict['question_marks'] # print("\tQuestions detected.") elif self.punctCountDict[ 'exclamation_points'] >= punctSlidingThreshold: if self.intensity <= .1: self.intensity = .1 else: self.intensity = self.intensity / math.sqrt( self.intensity ) * self.punctCountDict['exclamation_points'] # print("\tExclamations detected.") elif self.punctCountDict[ 'commas'] >= punctSlidingThreshold * 1.5: # print("\tCommas detected.") pass else: pass # END ############################################################################################################################################################ # print("\n\tProbability Values Post-Normalization Counting Punctuation: ") # print(self.normalizedProbValues) # self.normalizeProbabilityOpposites(_message) self.normalizedProbValues = pd.Series({ 'positive': self.positive, 'negative': self.negative, 'joy': self.joy, 'anger': self.anger, 'love': self.love, 'hate': self.hate, 'certainty': self.certainty, 'confusion': self.confusion, 
'amusement': self.amusement, 'boredom': self.boredom, 'intensity': self.intensity, 'regret': self.regret, 'agreeable': self.agreeable, 'challenging': self.challenging, 'desire': self.desire, 'calm': self.calm, 'emphatic': self.emphatic, 'sarcastic': self.sarcastic, 'instructive': self.instructive, 'accusative': self.accusative, 'admiration': self.admiration, 'inquisitive': self.inquisitive, 'modest': self.modest, 'pride': self.pride, 'ambivalence': self.ambivalence, 'vulgarity': self.vulgarity }) self.normalizeProbability(_message) # return self.normalizedProbValues def normalizeProbability(self, _message): normalizedProbValTemp = self.normalizedProbValues self.normalizedProbValues = preprocessing.RobustScaler( with_centering=True, with_scaling=True, quantile_range=(50.0, 100.0), copy=True).fit_transform(normalizedProbValTemp) normalizedProbValTemp = self.normalizedProbValues self.normalizedProbValues = preprocessing.StandardScaler( with_mean=False, with_std=False).fit_transform(normalizedProbValTemp) normalizedProbValTemp = self.normalizedProbValues self.normalizedProbValues = preprocessing.normalize( normalizedProbValTemp, norm='max') normalizedProbValTemp = self.normalizedProbValues self.normalizedProbValues = np.array(normalizedProbValTemp).tolist() normalizedProbValTemp = self.normalizedProbValues # LIST BELOW IS SORTED ALPHABETICALLY BECAUSE OF HOW NUMPY DOES IT normalizedAccusative = normalizedProbValTemp[0][0] normalizedAdmiration = normalizedProbValTemp[0][1] normalizedAgreeable = normalizedProbValTemp[0][2] normalizedAmbivalence = normalizedProbValTemp[0][3] normalizedAmusement = normalizedProbValTemp[0][4] normalizedAnger = normalizedProbValTemp[0][5] normalizedBoredom = normalizedProbValTemp[0][6] normalizedCalm = normalizedProbValTemp[0][7] normalizedCertainty = normalizedProbValTemp[0][8] normalizedChallenging = normalizedProbValTemp[0][9] normalizedConfusion = normalizedProbValTemp[0][10] normalizedDesire = normalizedProbValTemp[0][11] 
normalizedEmphatic = normalizedProbValTemp[0][12] normalizedHate = normalizedProbValTemp[0][13] normalizedInquisitive = normalizedProbValTemp[0][14] normalizedInstructive = normalizedProbValTemp[0][15] normalizedIntensity = normalizedProbValTemp[0][16] normalizedJoy = normalizedProbValTemp[0][17] normalizedLove = normalizedProbValTemp[0][18] normalizedModest = normalizedProbValTemp[0][19] normalizedNegative = normalizedProbValTemp[0][20] normalizedPositive = normalizedProbValTemp[0][21] normalizedPride = normalizedProbValTemp[0][22] normalizedRegret = normalizedProbValTemp[0][23] normalizedSarcastic = normalizedProbValTemp[0][24] normalizedVulgarity = normalizedProbValTemp[0][25] self.positive = float(round(normalizedPositive, 3) * 100) self.negative = float(round(normalizedNegative, 3) * 100) self.joy = float(round(normalizedJoy, 3) * 100) self.anger = float(round(normalizedAnger, 3) * 100) self.love = float(round(normalizedLove, 3) * 100) self.hate = float(round(normalizedHate, 3) * 100) self.certainty = float(round(normalizedCertainty, 3) * 100) self.confusion = float(round(normalizedConfusion, 3) * 100) self.amusement = float(round(normalizedAmusement, 3) * 100) self.boredom = float(round(normalizedBoredom, 3) * 100) self.intensity = float(round(normalizedIntensity, 3) * 100) self.regret = float(round(normalizedRegret, 3) * 100) self.agreeable = float(round(normalizedAgreeable, 3) * 100) self.challenging = float(round(normalizedChallenging, 3) * 100) self.desire = float(round(normalizedDesire, 3) * 100) self.calm = float(round(normalizedCalm, 3) * 100) self.emphatic = float(round(normalizedEmphatic, 3) * 100) self.sarcastic = float(round(normalizedSarcastic, 3) * 100) self.instructive = float(round(normalizedInstructive, 3) * 100) self.accusative = float(round(normalizedAccusative, 3) * 100) self.admiration = float(round(normalizedAdmiration, 3) * 100) self.inquisitive = float(round(normalizedInquisitive, 3) * 100) self.modest = float(round(normalizedModest, 3) 
* 100) self.pride = float(round(normalizedPride, 3) * 100) self.ambivalence = float(round(normalizedAmbivalence, 3) * 100) self.vulgarity = float(round(normalizedVulgarity, 3) * 100) normalizedProbValTemp = {} normalizedProbValTemp['positive'] = self.positive normalizedProbValTemp['negative'] = self.negative normalizedProbValTemp['joy'] = self.joy normalizedProbValTemp['anger'] = self.anger normalizedProbValTemp['love'] = self.love normalizedProbValTemp['hate'] = self.hate normalizedProbValTemp['certainty'] = self.certainty normalizedProbValTemp['confusion'] = self.confusion normalizedProbValTemp['amusement'] = self.amusement normalizedProbValTemp['boredom'] = self.boredom normalizedProbValTemp['intensity'] = self.intensity normalizedProbValTemp['regret'] = self.regret normalizedProbValTemp['agreeable'] = self.agreeable normalizedProbValTemp['challenging'] = self.challenging normalizedProbValTemp['desire'] = self.desire normalizedProbValTemp['calm'] = self.calm normalizedProbValTemp['emphatic'] = self.emphatic normalizedProbValTemp['sarcastic'] = self.sarcastic normalizedProbValTemp['instructive'] = self.instructive normalizedProbValTemp['accusative'] = self.accusative normalizedProbValTemp['admiration'] = self.admiration normalizedProbValTemp['inquisitive'] = self.inquisitive normalizedProbValTemp['modest'] = self.modest normalizedProbValTemp['pride'] = self.pride normalizedProbValTemp['ambivalence'] = self.ambivalence normalizedProbValTemp['vulgarity'] = self.vulgarity # print("\n\n\t", normalizedProbValTemp) self.normalizedProbValues = normalizedProbValTemp normalizedProbValTemp = sorted(self.normalizedProbValues.items(), key=operator.itemgetter(1), reverse=True) self.normalizedProbValues = normalizedProbValTemp self.normalizedProbValues = list(self.normalizedProbValues) print("\n\t", self.normalizedProbValues) if runningScript == True: self.getInput(_message) return self.normalizedProbValues, self.positive, self.negative, self.joy, self.anger, self.love, 
self.hate, self.certainty, self.confusion, self.amusement, self.boredom, self.intensity, self.regret, self.agreeable, self.challenging, self.desire, self.calm, self.emphatic, self.sarcastic, self.instructive, self.accusative, self.admiration, self.inquisitive, self.modest, self.ambivalence, self.vulgarity # self.normalizeProbabilityPunctuation(_message) return self.normalizedProbValues, self.positive, self.negative, self.joy, self.anger, self.love, self.hate, self.certainty, self.confusion, self.amusement, self.boredom, self.intensity, self.regret, self.agreeable, self.challenging, self.desire, self.calm, self.emphatic, self.sarcastic, self.instructive, self.accusative, self.admiration, self.inquisitive, self.modest, self.ambivalence, self.vulgarity
# Tag every utterance with the label of the list it came from.
experience_utterances = [(x, 'experience') for x in experience_utterances]
environment_utterances = [(x, 'environment') for x in environment_utterances]
working_on_utterances = [(x, 'working') for x in working_on_utterances]

# All three are plain lists at this point, so concatenation flattens them.
training_set = (experience_utterances
                + environment_utterances
                + working_on_utterances)

classifier = NaiveBayesClassifier(training_set)
# show_informative_features() prints its own table, so it is called on its
# own rather than being passed to print().
classifier.show_informative_features()
print(classifier.labels())

# Utterances that should not match any of the trained labels.
bogus_utterances = (
    'if you going to use nltk u may want to check this out spacy .io',
    'sup people? I see the weather\'s getting better over there, Ben.',
    'i had the same problem your having so thats my i made my own.',
    'try http, instead of https'
)

# TODO: Figure out how to make this stronger
dual_utterance = ('how long have you been coding and what IDE do you use',)

test_utterances = ('what are you making',
                   'hey that nyancat is cool, how do you get that?')

# Print each test utterance with its most likely label and that label's
# probability.
for t in test_utterances:
    prob_dist = classifier.prob_classify(t)
    best_label = prob_dist.max()
    print(t, '\n', best_label, prob_dist.prob(best_label))
class TestNaiveBayesClassifier(unittest.TestCase): def setUp(self): self.classifier = NaiveBayesClassifier(train_set) def test_default_extractor(self): text = "I feel happy this morning." assert_equal(self.classifier.extract_features(text), basic_extractor(text, train_set)) def test_classify(self): res = self.classifier.classify("I feel happy this morning") assert_equal(res, 'positive') assert_equal(len(self.classifier.train_set), len(train_set)) def test_classify_a_list_of_words(self): res = self.classifier.classify(["I", "feel", "happy", "this", "morning"]) assert_equal(res, "positive") def test_train_from_lists_of_words(self): # classifier can be trained on lists of words instead of strings train = [(doc.split(), label) for doc, label in train_set] classifier = NaiveBayesClassifier(train) assert_equal(classifier.accuracy(test_set), self.classifier.accuracy(test_set)) def test_prob_classify(self): res = self.classifier.prob_classify("I feel happy this morning") assert_equal(res.max(), "positive") assert_true(res.prob("positive") > res.prob("negative")) def test_accuracy(self): acc = self.classifier.accuracy(test_set) assert_true(isinstance(acc, float)) def test_update(self): res1 = self.classifier.prob_classify("lorem ipsum") original_length = len(self.classifier.train_set) self.classifier.update([("lorem ipsum", "positive")]) new_length = len(self.classifier.train_set) res2 = self.classifier.prob_classify("lorem ipsum") assert_true(res2.prob("positive") > res1.prob("positive")) assert_equal(original_length + 1, new_length) def test_labels(self): labels = self.classifier.labels() assert_true("positive" in labels) assert_true("negative" in labels) def test_show_informative_features(self): feats = self.classifier.show_informative_features() def test_informative_features(self): feats = self.classifier.informative_features(3) assert_true(isinstance(feats, list)) assert_true(isinstance(feats[0], tuple)) def test_custom_feature_extractor(self): cl = 
NaiveBayesClassifier(train_set, custom_extractor) cl.classify("Yay! I'm so happy it works.") assert_equal(cl.train_features[0][1], 'positive') def test_init_with_csv_file(self): with open(CSV_FILE) as fp: cl = NaiveBayesClassifier(fp, format="csv") assert_equal(cl.classify("I feel happy this morning"), 'pos') training_sentence = cl.train_set[0][0] assert_true(isinstance(training_sentence, unicode)) def test_init_with_csv_file_without_format_specifier(self): with open(CSV_FILE) as fp: cl = NaiveBayesClassifier(fp) assert_equal(cl.classify("I feel happy this morning"), 'pos') training_sentence = cl.train_set[0][0] assert_true(isinstance(training_sentence, unicode)) def test_init_with_json_file(self): with open(JSON_FILE) as fp: cl = NaiveBayesClassifier(fp, format="json") assert_equal(cl.classify("I feel happy this morning"), 'pos') training_sentence = cl.train_set[0][0] assert_true(isinstance(training_sentence, unicode)) def test_init_with_json_file_without_format_specifier(self): with open(JSON_FILE) as fp: cl = NaiveBayesClassifier(fp) assert_equal(cl.classify("I feel happy this morning"), 'pos') training_sentence = cl.train_set[0][0] assert_true(isinstance(training_sentence, unicode)) def test_init_with_custom_format(self): redis_train = [('I like turtles', 'pos'), ('I hate turtles', 'neg')] class MockRedisFormat(formats.BaseFormat): def __init__(self, client, port): self.client = client self.port = port @classmethod def detect(cls, stream): return True def to_iterable(self): return redis_train formats.register('redis', MockRedisFormat) mock_redis = mock.Mock() cl = NaiveBayesClassifier(mock_redis, format='redis', port=1234) assert_equal(cl.train_set, redis_train) def test_data_with_no_available_format(self): mock_fp = mock.Mock() mock_fp.read.return_value = '' assert_raises(FormatError, lambda: NaiveBayesClassifier(mock_fp)) def test_accuracy_on_a_csv_file(self): with open(CSV_FILE) as fp: a = self.classifier.accuracy(fp) assert_equal(type(a), float) def 
test_accuracy_on_json_file(self): with open(CSV_FILE) as fp: a = self.classifier.accuracy(fp) assert_equal(type(a), float) def test_init_with_tsv_file(self): with open(TSV_FILE) as fp: cl = NaiveBayesClassifier(fp) assert_equal(cl.classify("I feel happy this morning"), 'pos') training_sentence = cl.train_set[0][0] assert_true(isinstance(training_sentence, unicode)) def test_init_with_bad_format_specifier(self): assert_raises(ValueError, lambda: NaiveBayesClassifier(CSV_FILE, format='unknown')) def test_repr(self): assert_equal(repr(self.classifier), "<NaiveBayesClassifier trained on {0} instances>".format(len(train_set)))
# Small hand-labelled sentiment training set.
train = [
    ('I love this sandwich.', 'pos'),
    ('this is an amazing place!', 'pos'),
    ('I feel very good about these beers.', 'pos'),
    ('this is my best work.', 'pos'),
    ("what an awesome view", 'pos'),
    ('I do not like this restaurant', 'neg'),
    ('I am tired of this stuff.', 'neg'),
    ("I can't deal with this", 'neg'),
    ('he is my sworn enemy!', 'neg'),
    ('my boss is horrible.', 'neg')
]

cl = NaiveBayesClassifier(train)
prob_dist = cl.prob_classify("How you doing")
cl.show_informative_features(5)

# Active/passive sentence pairs whose part-of-speech tags are printed below.
txt_A = TextBlob("He can climb the mountain")
txt_B = TextBlob("The mountain can be climbed by him")
txt_C = TextBlob("He is doing his homework")
txt_D = TextBlob("The homework is being done by him")

# Parenthesised single-argument print() gives the same output on Python 2
# and Python 3.
for blob in (txt_A, txt_B, txt_C, txt_D):
    print(blob.tags)
class Bot(object):
    """Base chat bot: loads scripts, routes events to listeners and commands.

    Chat engines subclass this and implement the methods at the bottom of the
    class that raise ``CoreException`` (plus ``_setup`` and
    ``_get_next_event``).  Scripts register behaviour through the decorator
    methods ``on``, ``add_command``, ``learn``, ``add_help``, ``on_start`` and
    ``on_schedule``.
    """

    instance = None
    engine = 'default'

    def __init__(self, start_web_app=False):
        self.module_path = ''
        self.memory: memory.Memory = None
        # List of (kwargs, coroutine function) pairs checked for every event.
        self.event_listeners = []
        self._web_events = []
        self._on_start = []
        self._user_id = ''
        self._user_name = ''
        self.help = help.Help()
        # Saves all sentences to learn for a function.
        self._learn_map: List[Tuple[List[str], 'function']] = []
        # Built lazily in start() from _learn_map.
        self._classifier: NaiveBayesClassifier = None
        self._web_app = None
        if start_web_app:
            self._web_app = self.make_web_app()

    @staticmethod
    def make_web_app():
        """Creates a web application.

        Returns:
            web.Application.
        """
        log.info('Creating a web app')
        return web.Application([(r'/health_check', HealthCheck)])

    def _start_web_app(self):
        """Creates a web server on WEB_PORT and WEB_PORT_SSL"""
        if not self._web_app:
            return
        log.info('Listing on port %s' % WEB_PORT)
        self._web_app.listen(WEB_PORT)
        if not WEB_NO_SSL:
            try:
                self._web_app.listen(
                    WEB_PORT_SSL,
                    ssl_options={
                        # Generate these in your entrypoint
                        "certfile": "/tmp/alphabot.pem",
                        "keyfile": "/tmp/alphabot.key"
                    })
            except ValueError as e:
                log.error(e)
                log.error(
                    'Failed to start SSL web app on %s. To disable - set WEB_NO_SSL',
                    WEB_PORT_SSL)

    def _setup(self):
        """Engine specific setup hook; the base implementation does nothing."""
        pass

    def add_web_handler(self, path, handler):
        """Adds a Handler to a web app.

        Args:
            path (string): Path where the handler should be served.
            handler (web.RequestHandler): Handler to use.

        Raises:
            WebApplicationNotAvailable
        """
        if not self._web_app:
            raise WebApplicationNotAvailable
        self._web_app.add_handlers('.*', [(path, handler)])

    async def setup(self, memory_type, script_paths):
        """Set up memory, run the engine specific setup, then load scripts."""
        await self._setup_memory(memory_type=memory_type)
        # Engine specific setup.  The base _setup() is a plain function that
        # returns None, which cannot be awaited; only await what engines
        # actually return.
        engine_setup = self._setup()
        if engine_setup is not None:
            await engine_setup
        await self._gather_scripts(script_paths)

    async def _setup_memory(self, memory_type='dict'):
        """Instantiate and set up the memory backend named by ``memory_type``.

        Raises:
            InvalidOptions: if ``memory_type`` is not a known backend.
        """
        # TODO: memory module should provide this mapping.
        memory_map = {
            'dict': memory.MemoryDict,
            'redis': memory.MemoryRedis,
        }
        memoryclass = memory_map.get(memory_type)
        if not memoryclass:
            raise InvalidOptions('Memory type "%s" is not available.' %
                                 memory_type)
        self.memory = memoryclass()
        await self.memory.setup()

    def load_all_modules_from_dir(self, dirname):
        """Import every module found directly inside ``dirname``.

        A module that fails to import is logged and reported to
        DEBUG_CHANNEL; loading continues with the next module.
        """
        # Imported here because only this method needs them.
        import importlib.machinery
        import importlib.util

        log.debug('Loading modules from "%s"' % dirname)
        for _, package_name, _ in pkgutil.iter_modules([dirname]):
            self.module_path = "%s/%s" % (dirname, package_name)
            log.debug("Importing '%s'" % package_name)
            try:
                # find_spec/exec_module replace finder.find_module() and
                # loader.load_module(); find_module() no longer exists on
                # finders in Python 3.12+.
                spec = importlib.machinery.PathFinder.find_spec(
                    package_name, [dirname])
                if spec is None:
                    raise ImportError('No module named %r in %s' %
                                      (package_name, dirname))
                module = importlib.util.module_from_spec(spec)
                # Register before executing, as load_module() did.
                sys.modules[package_name] = module
                try:
                    spec.loader.exec_module(module)
                except BaseException:
                    # Do not leave a half-initialised module behind.
                    sys.modules.pop(package_name, None)
                    raise
            except Exception as e:
                log.critical('Could not load `%s`. Error follows.' %
                             package_name)
                log.critical(e, exc_info=1)
                traceback_text = traceback.format_exc()
                asyncio.ensure_future(
                    self.send(
                        'Could not load `%s` from %s.' %
                        (package_name, dirname), DEBUG_CHANNEL))
                asyncio.ensure_future(
                    self.send(traceback_text, DEBUG_CHANNEL))

    async def _gather_scripts(self, script_paths=None):
        """Load user scripts from ``script_paths``, then the default scripts."""
        log.info('Gathering scripts...')
        if not script_paths:
            log.warning('Warning! You did not specify any scripts to load.')
        else:
            for path in script_paths:
                log.info('Gathering functions from %s' % path)
                self.load_all_modules_from_dir(path)

        # TODO: Add a flag to control these
        log.info('Installing default scripts...')
        pwd = os.path.dirname(os.path.realpath(__file__))
        self.load_all_modules_from_dir("{path}/{default}".format(
            path=pwd, default=DEFAULT_SCRIPT_DIR))

    def _event(self, payload):
        log.info('Adding an event on top of the stack: %s' % payload)
        self._web_events.append(payload)

    async def _get_next_event(self):
        """Return the next event; the base implementation returns nothing."""
        pass

    async def start(self):
        """Run the on-start hooks, then dispatch events forever.

        For each event with text, the learned-sentence classifier is tried
        first: if its best match is confident enough the matched function is
        scheduled and the regular listeners are skipped.  Otherwise every
        listener whose kwargs match the event is scheduled.
        """
        if self._web_app:
            log.info('Starting web app.')
            self._start_web_app()

        log.info('Executing the start scripts.')
        for func in self._on_start:
            log.debug('On Start: %s' % func.__name__)
            await func()

        log.info('Bot started! Listening to events.')

        while True:
            event = await self._get_next_event()

            log.debug('Received event: %s' % event)
            log.debug('Checking against %s listeners' %
                      len(self.event_listeners))

            # Not every event carries text, and with nothing learned there
            # are no labels to classify against.
            text = event.get('text')
            if text and self._learn_map:
                if not self._classifier:
                    learn_map = [(sentence, cmd)
                                 for sentences, cmd in self._learn_map
                                 for sentence in sentences]
                    self._classifier = NaiveBayesClassifier(learn_map)
                choices = self._classifier.prob_classify(text)
                func = choices.max()
                prob = choices.prob(func)
                log.debug(
                    f'NLTK matched `{func.__name__}` function at {int(prob * 100)}%'
                )
                message = await self.event_to_chat(event)
                # Direct messages need less certainty than channel chatter.
                min_prob = 0.65 if message.is_direct else 0.95
                if prob > min_prob:
                    asyncio.ensure_future(func(message))
                    continue  # Do not loop through event listeners!

            # Note: Copying the event_listeners list here to prevent
            # mid-loop modification of the list.
            for kwargs, func in list(self.event_listeners):
                match = self._check_event_kwargs(event, kwargs)
                log.debug('Function %s requires %s. Match: %s' %
                          (func.__name__, kwargs, match))
                if match:
                    future = func(event=event)
                    asyncio.ensure_future(future)
            # TODO: add a way to detect if any of these were "REAL" Match
            # then execute the NLP part if none matched.

    async def wait_for_event(self, **event_args):
        """Wait for, and return, the next event matching ``event_args``.

        A temporary listener is registered for the duration of the wait and
        removed afterwards, including when the wait is cancelled.
        """
        matched = asyncio.get_event_loop().create_future()

        async def mark_true(event):
            # Only the first matching event is reported.
            if not matched.done():
                matched.set_result(event)

        listener = (event_args, mark_true)
        log.info('Creating a temporary listener for %s' % (event_args, ))
        self.event_listeners.append(listener)
        try:
            return await matched
        finally:
            log.info('Deleting the temporary listener for %s' %
                     (event_args, ))
            self.event_listeners.remove(listener)

    def add_listener(self, chat, **kwargs):
        """Forward matching events (type 'message' by default) to ``chat.hear``."""
        log.info('Adding chat listener...')

        async def cmd(event):
            message = await self.event_to_chat(event)
            asyncio.ensure_future(chat.hear(message))

        # Uniquely identify this `cmd` to delete later.
        cmd._listener_chat_id = id(chat)

        if 'type' not in kwargs:
            kwargs['type'] = 'message'

        self._register_function(kwargs, cmd)

    def _remove_listener(self, chat):
        """Remove the listener added for ``chat``; no-op if there is none."""
        match = None
        # Have to search all the event_listeners here
        for kw, cmd in self.event_listeners:
            if (hasattr(cmd, '_listener_chat_id')
                    and cmd._listener_chat_id == id(chat)):
                match = (kw, cmd)
        # Without this guard an unknown chat would call list.remove(None)
        # and raise ValueError.
        if match is not None:
            self.event_listeners.remove(match)

    def _check_event_kwargs(self, event, kwargs):
        """Check that all expected kwargs were satisfied by the event."""
        return dict_subset(event, kwargs)

    # Decorators to be used in development of scripts

    def on_start(self, cmd):
        """Decorator: run ``cmd`` (awaited) once when the bot starts."""
        self._on_start.append(cmd)
        return cmd

    def _register_function(self, kwargs, cmd):
        log.debug('New Listener: %s => %s()' % (kwargs, cmd.__name__))
        self.event_listeners.append((kwargs, cmd))

    def on(self, **kwargs):
        """This decorator will invoke your function with the raw event."""

        def decorator(cmd):
            self._register_function(kwargs, cmd)
            return cmd

        return decorator

    def add_command(self, regex, direct=False):
        """Will convert the raw event into a message object for your function."""

        def decorator(cmd):
            # Register some basic help using the regex.
            self.help.update(cmd, regex)

            async def wrapper(event):
                message = await self.event_to_chat(event)
                matches_regex = message.matches_regex(regex)
                log.debug('Command %s should match the regex %s' %
                          (cmd.__name__, regex))
                if not matches_regex:
                    return False

                if direct and not message.is_direct:
                    return False

                log.debug(f"Executing {cmd.__name__}")
                await cmd(message=message, **message.regex_group_dict)
                return True

            wrapper.__name__ = 'wrapped:%s' % cmd.__name__

            self._register_function({'type': 'message'}, wrapper)
            return cmd

        return decorator

    def learn(self, sentences: List[str], direct=False):
        """Learn sentences for a command.

        :param sentences: list of example sentences that should trigger the
            decorated command
        :param direct: accepted but not used by this method
        :return: a decorator that records the command and returns it unchanged
        """

        def decorator(cmd):
            self._learn_map.append((sentences, cmd))
            return cmd

        return decorator

    def add_help(self, desc=None, usage=None, tags=None):
        """Decorator: attach help text (description, usage, tags) to a command."""

        def decorator(cmd):
            self.help.update(cmd, usage=usage, desc=desc, tags=tags)
            return cmd

        return decorator

    def on_schedule(self, **schedule_keywords):
        """Invoke bot command on a schedule.

        Leverages APScheduler for asyncio.
        http://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html#api

        year (int|str) - 4-digit year
        month (int|str) - month (1-12)
        day (int|str) - day of the (1-31)
        week (int|str) - ISO week (1-53)
        day_of_week (int|str) - number or name of weekday (0-6 or mon,tue,wed,thu,fri,sat,sun)
        hour (int|str) - hour (0-23)
        minute (int|str) - minute (0-59)
        second (int|str) - second (0-59)
        start_date (datetime|str) - earliest possible date/time to trigger on (inclusive)
        end_date (datetime|str) - latest possible date/time to trigger on (inclusive)
        timezone (datetime.tzinfo|str) - time zone to use for the date/time calculations
        (defaults to scheduler timezone)
        """

        if 'second' not in schedule_keywords:
            # Default is every second. We don't want that.
            schedule_keywords['second'] = '0'

        def decorator(cmd):
            log.info('New Schedule: cron[%s] => %s()' %
                     (schedule_keywords, cmd.__name__))
            scheduler.add_job(cmd, trigger='cron', **schedule_keywords)
            return cmd

        return decorator

    # Functions that scripts can tell bot to execute.
    # Each must be provided by the chat engine subclass.

    async def event_to_chat(self, event) -> 'Chat':
        raise CoreException('Chat engine "%s" is missing event_to_chat(...)' %
                            (self.__class__.__name__))

    async def api(self, text, to):
        raise CoreException('Chat engine "%s" is missing api(...)' %
                            (self.__class__.__name__))

    async def send(self, text, to, extra=None) -> 'Chat':
        raise CoreException('Chat engine "%s" is missing send(...)' %
                            (self.__class__.__name__))

    async def _update_channels(self):
        raise CoreException(
            'Chat engine "%s" is missing _update_channels(...)' %
            (self.__class__.__name__))

    def get_channel(self, name) -> 'Channel':
        raise CoreException('Chat engine "%s" is missing get_channel(...)' %
                            (self.__class__.__name__))

    def find_channels(self, pattern):
        raise CoreException('Chat engine "%s" is missing find_channels(...)' %
                            (self.__class__.__name__))
] if __name__ == "__main__": # print "Initiallizing classifier... (training...)" # train_positive() # train_negative() # print train_set # classifier = NaiveBayesClassifier(train_set) # with open('./texts/words.txt', 'r') as fp: # classifier = NaiveBayesClassifier(fp, format="csv") # print classifier.accuracy(test_set) # print classifier.show_informative_features() classifier = NaiveBayesClassifier(train_set) print train_set print classifier.accuracy(test_set) print classifier.show_informative_features() print "Ready " while 1: try: line = sys.stdin.readline() prob_dist = classifier.prob_classify(line.lower()) print prob_dist.max() print "PROB POS: " + str(round(prob_dist.prob("pos"), 2)) print "PROB NEG: " + str(round(prob_dist.prob("neg"), 2)) except KeyboardInterrupt: break if not line: break