def __init__(self, db_pool=None):
    '''
    Initialise the database access object.

    When a connection pool is supplied, it is stored and used for all
    calls; otherwise a single temporary connection is opened from the
    DB_HOST / DB_USER / DB_PWD / DB_NAME environment variables.

    Raises KeyError if db_pool is None and any of the four environment
    variables is missing.
    '''
    # Exactly one of self.pool / self.conn ends up set; the other stays None.
    self.pool = None
    self.conn = None
    if db_pool is None:
        Logger.warn('can not get database connection pool from env')
        Logger.info(
            'create a new temporary database connection for this call')
        # NOTE(review): this per-call connection is never explicitly
        # closed here — confirm the caller (or Connection itself)
        # handles cleanup.
        self.conn = Connection(
            host=os.environ['DB_HOST'],
            user=os.environ['DB_USER'],
            password=os.environ['DB_PWD'],
            database=os.environ['DB_NAME'],
        )
    else:
        self.pool = db_pool
class NaiveBayers():
    '''
    Unigram Naive Bayes sentiment classifier.

    Trains on tab-separated labelled benchmark files (sentence\\tlabel,
    label 1 = positive, 0 = negative) plus the rt-polarity corpora, and
    classifies sentences as positive / negative / neutral.
    '''

    def __init__(self, verbose=True, training_cases=2500, testcases=500):
        self.verbose = verbose
        self.training_cases = training_cases
        self.testcases = testcases
        self.training = list()
        self.test = list()
        # word -> [count in positive class, count in negative class]
        self.frequency = dict()
        self.stop_words = self.get_stop_words()
        # total word occurrences per class (denominators for smoothing)
        self.positive_words = 0
        self.negative_words = 0
        self.positive_sentence_count = 0
        self.negative_sentence_count = 0
        self.total_sentences = 0
        self.logger = Logger('NaiveBayers', 'NaiveBayers.log')
        self.filenames = [
            'res\\benchmark\\yelp_labelled.txt',
            'res\\benchmark\\amazon_cells_labelled.txt',
            'res\\benchmark\\imdb_labelled.txt'
        ]

    def _print(self, message):
        '''Print the message only when verbose mode is on.'''
        if self.verbose:
            print(message)

    def clean(self, sentence):
        '''Lower-case the sentence and blank out ignored punctuation.'''
        ignore_characters = '''\t\n&"`~@#$%^*;+=<>//.,()[]{}:;!?'''
        sentence = self.replace_characters(sentence, ignore_characters)
        return sentence.lower().strip()

    def tokenise(self, sentence):
        '''Clean the sentence and split it into non-empty word tokens.'''
        sentence = self.clean(sentence)
        tokens = sentence.split(' ')
        filtered_tokens = list()
        for token in tokens:
            if len(token.strip()) != 0:
                filtered_tokens.append(token)
        return filtered_tokens

    def replace_characters(self, text, characters):
        '''Replace every occurrence of each character with a space.'''
        for char in characters:
            text = text.replace(char, ' ')
        return text

    def get_data(self):
        '''Load labelled sentences and split into training / test sets.'''
        data = list()
        for filename in self.filenames:
            self._print("Filename : " + filename)
            for datum in tqdm(self.load_data_from_file(filename)):
                sentence, label = datum.split('\t')
                label = int(label)
                sentence = self.clean(sentence)
                data.append([sentence, label])
        # first training_cases rows train, last testcases rows test
        self.training = data[:self.training_cases]
        self.test = data[-self.testcases:]

    def load_data_from_file(self, filename, encoding="utf8"):
        '''Return the file content as a list of lines.'''
        with open(filename, encoding=encoding) as file:
            data = file.readlines()
        return data

    def get_kgrams(self, sentence, k=1):
        '''Return all length-k slices (k-grams) of a token list.'''
        grams = list()
        for i in range(len(sentence)):
            grams.append(sentence[i:i + k])
            if i + k >= len(sentence):
                break
        return grams

    def train(self):
        '''Count unigram frequencies and derive smoothed probabilities.'''
        self.find_frequency_unigrams()
        self.train_from_negative_sentences()
        self.train_from_positive_sentences()
        self.find_probablility_unigrams()
        self.logger.info("Training completed")
        self.logger.info("Number of positive sentences : " +
                         str(self.positive_sentence_count))
        self.logger.info("Number of negative sentences : " +
                         str(self.negative_sentence_count))

    def classify(self, sentence):
        '''
        Classify a sentence.

        Returns ("positive", 1), ("negative", 0) or — when the two class
        probabilities are exactly equal — ("neutral", -1).
        '''
        sentence = self.preprocess(sentence)
        # start from the class priors
        positive_probablity = self.positive_sentence_count / self.total_sentences
        negative_probablity = self.negative_sentence_count / self.total_sentences
        self.logger.debug("sentence : " + str(sentence))
        self.logger.debug("words considered : ")
        for word in sentence:
            word = word[0]
            # unseen words contribute a neutral factor of 1 to both classes
            word_positive_probability, word_negative_probability = \
                self.probablility.get(word, (1, 1))
            self.logger.debug("word : " + word +
                              " word_positive_probability : " +
                              str(word_positive_probability) +
                              " word_negative_probability : " +
                              str(word_negative_probability))
            positive_probablity *= word_positive_probability
            negative_probablity *= word_negative_probability
        self.logger.debug("positive_probablity : " + str(positive_probablity))
        self.logger.debug("negative_probablity : " + str(negative_probablity))
        if positive_probablity > negative_probablity:
            self.logger.debug("sentence is positive")
            return ("positive", 1)
        if negative_probablity > positive_probablity:
            self.logger.debug("sentence is negative")
            return ("negative", 0)
        # Bug fix: the original fell off the end and returned None on a
        # tie, which crashed test_classifier when unpacking the result.
        self.logger.debug("sentence is neutral")
        return ("neutral", -1)

    def test_classifier(self):
        '''Classify the held-out test set and log accuracy statistics.'''
        correct, wrong = 0, 0
        total = len(self.test)
        for sentence, actual_label in self.test:
            verdict, label = self.classify(sentence)
            if label == actual_label:
                correct += 1
            else:
                wrong += 1
        self.logger.info("correct : " + str(correct))
        self.logger.info("wrong : " + str(wrong))
        self.logger.info("total : " + str(total))
        self.logger.info("accuracy : " + str(int((correct / total) * 100)))

    def get_stop_words(self):
        '''Load the English stop-word list as a set of words.'''
        data = self.load_data_from_file('res\\eng_stop_words.txt')
        return set([datum.replace('\n', '') for datum in data])

    def remove_stop_words(self, sentence):
        '''Drop stop words from a token list.'''
        filtered_words = list()
        for word in sentence:
            if word in self.stop_words:
                continue
            filtered_words.append(word)
        return filtered_words

    def find_probablility_unigrams(self):
        '''Compute Laplace-smoothed per-class probabilities per word.'''
        self.probablility = dict()
        for word in self.frequency:
            positive_probablity = (self.frequency[word][0] + 1) / (
                self.positive_words + len(self.frequency))
            negative_probablity = (self.frequency[word][1] + 1) / (
                self.negative_words + len(self.frequency))
            self.probablility[word] = [
                positive_probablity, negative_probablity
            ]

    def preprocess(self, sentence):
        '''Tokenise a sentence and wrap each token as a unigram list.'''
        sentence = self.tokenise(sentence)
        sentence = self.get_kgrams(sentence, k=1)
        return sentence

    def train_from_negative_sentences(self):
        '''Accumulate unigram counts from the negative polarity corpus.'''
        negative_files = ['res\\rt-polaritydata\\rt-polarity-neg.txt']
        for filename in negative_files:
            new_sentences = self.load_data_from_file(filename)
            for sentence in new_sentences:
                sentence = self.preprocess(sentence)
                self.negative_sentence_count += 1
                for word in sentence:
                    word = word[0]
                    if word not in self.frequency:
                        self.frequency[word] = [0, 0]
                    self.frequency[word][1] += 1
                    self.negative_words += 1

    def train_from_positive_sentences(self):
        '''Accumulate unigram counts from the positive polarity corpus.'''
        positive_files = ['res\\rt-polaritydata\\rt-polarity-pos.txt']
        for filename in positive_files:
            new_sentences = self.load_data_from_file(filename)
            for sentence in new_sentences:
                sentence = self.preprocess(sentence)
                self.positive_sentence_count += 1
                for word in sentence:
                    word = word[0]
                    if word not in self.frequency:
                        self.frequency[word] = [0, 0]
                    self.frequency[word][0] += 1
                    self.positive_words += 1

    def find_frequency_unigrams(self):
        '''Accumulate unigram counts from the labelled training set.'''
        for sentence, label in self.training:
            self.total_sentences += 1
            sentence = self.preprocess(sentence)
            if label == 1:  # positive sentence
                self.positive_sentence_count += 1
                for word in sentence:
                    word = word[0]
                    if word not in self.frequency:
                        self.frequency[word] = [0, 0]
                    self.frequency[word][0] += 1
                    self.positive_words += 1
            elif label == 0:  # negative sentence
                self.negative_sentence_count += 1
                for word in sentence:
                    word = word[0]
                    if word not in self.frequency:
                        self.frequency[word] = [0, 0]
                    self.frequency[word][1] += 1
                    self.negative_words += 1
class BagOfWordSentiment():
    '''
    Rule-based sentiment classifier using bags of positive / negative /
    negation / double-negation words loaded from text files.

    Phrases of up to no_of_grams words are matched against the bags,
    longest first; matched phrases are removed from the sentence so they
    are not counted twice.
    '''

    def __init__(self, no_of_grams=4, verbose=True, no_of_testcases=1000):
        self.verbose = verbose
        self.logger = Logger('BagOfWordSentiment',
                             'logs\\bag_of_words.log',
                             is_verbose=self.verbose)
        # maximum phrase length considered when scoring
        self.no_of_grams = no_of_grams
        # each category keeps exact words plus a "collection" of prefix
        # stems (entries that contained '*' in the data files)
        self.double_negations, self.double_negations_collection = set(), set()
        self.negations, self.negation_collection = set(), set()
        self.positive_words, self.positive_word_collection = set(), set()
        self.negative_words, self.negative_word_collection = set(), set()
        self.no_of_testcases = no_of_testcases
        self.positve_test_bag = list()
        self.negative_test_bag = list()

    def ready(self):
        '''Load word bags; must be called before classify().'''
        self.logger.info("Bag of words loading")
        self.load_data()
        self.logger.info("Bag of words ready")

    def classify(self, sentence):
        '''
        classifies the sentence to positve or negative or neutral using
        bag of words method

        Returns a (label, code, score) tuple where code is 1 positive,
        0 negative, -1 neutral.
        '''
        positive_score, negative_score = self.find_score(sentence)
        if positive_score > negative_score:
            self.logger.info("sentence - " + sentence + " - is positive")
            return ("positive", 1, positive_score)
        if positive_score < negative_score:
            self.logger.info("sentence - " + sentence + " - is negative")
            return ("negative", 0, negative_score)
        if positive_score == negative_score:
            self.logger.info("sentence - " + sentence + " - is neutral")
            return ("neutral", -1, positive_score)

    def find_score(self, sentence):
        '''
        finds positive and negative score for a given sentence

        Longer k-grams are tried first; when a phrase matches a bag it
        is blanked out of the sentence so shorter sub-phrases cannot
        score again.
        '''
        positive_score, negative_score = 0, 0
        self.logger.info("sentence : " + sentence)
        sentence = self.tokenise(sentence)
        self.logger.info("tokenised sentence after cleaning : " +
                         str(sentence))
        kgrams = list()
        for k in range(self.no_of_grams, 0, -1):
            kgrams.extend(self.get_kgrams(sentence, k))
        for kgram in kgrams:
            phrase = ' '.join(kgram)
            # sentence flips between list and string forms each pass:
            # joined here for substring matching / replacement,
            # re-tokenised at the bottom of the loop
            sentence = ' '.join(sentence)
            if phrase in sentence:
                self.logger.info("considering phrase '" + phrase + "' from '" +
                                 sentence + "'")
                #check this phrase for double negation
                contains_double_negation, remaining_phrase = self.is_double_negation(
                    phrase)
                if contains_double_negation:
                    # double negation keeps the polarity of the remainder
                    if self.is_positive(remaining_phrase):
                        positive_score += 1
                        sentence = sentence.replace(phrase, ' ')
                        sentence = self.tokenise(sentence)
                        self.logger.info(
                            "double negation of positive phrase : " + phrase)
                        continue
                    if self.is_negative(remaining_phrase):
                        negative_score += 1
                        sentence = sentence.replace(phrase, ' ')
                        sentence = self.tokenise(sentence)
                        self.logger.info(
                            "double negation of negative phrase : " + phrase)
                        continue
                #check this phrase for negations
                contains_negation, remaining_phrase = self.is_negation(phrase)
                if contains_negation:
                    # single negation flips the polarity of the remainder
                    if self.is_positive(remaining_phrase):
                        negative_score += 1
                        sentence = sentence.replace(phrase, ' ')
                        sentence = self.tokenise(sentence)
                        self.logger.info("negation of positive phrase : " +
                                         phrase)
                        continue
                    if self.is_negative(remaining_phrase):
                        positive_score += 1
                        sentence = sentence.replace(phrase, ' ')
                        sentence = self.tokenise(sentence)
                        self.logger.info("negation of negative phrase : " +
                                         phrase)
                        continue
                #check for positive phrase
                if self.is_positive(phrase):
                    positive_score += 1
                    sentence = sentence.replace(phrase, ' ')
                    sentence = self.tokenise(sentence)
                    self.logger.info("positive phrase : " + phrase)
                    continue
                #check for negative phrase
                if self.is_negative(phrase):
                    negative_score += 1
                    sentence = sentence.replace(phrase, ' ')
                    sentence = self.tokenise(sentence)
                    self.logger.info("negative phrase : " + phrase)
                    continue
                self.logger.info("cannot deduce sentiment from phrase '" +
                                 phrase + "'")
            # back to token-list form for the next iteration's join
            sentence = self.tokenise(sentence)
        return positive_score, negative_score

    def is_double_negation(self, phrase):
        '''
        checks whether a word is in bag of double negations

        Returns (matched, remaining_phrase) where remaining_phrase is
        the phrase with the matched double negation stripped.
        '''
        for double_negation in self.double_negations:
            double_negation = double_negation + " "
            if phrase.startswith(double_negation):
                remaining_phrase = phrase.replace(double_negation, '')
                return True, remaining_phrase
        for double_negation in self.double_negations_collection:
            if phrase.startswith(double_negation):
                phrase_length = len(phrase.split(" "))
                double_negation_length = len(double_negation.split(" "))
                diff = phrase_length - double_negation_length
                if diff <= 0:
                    # phrase is nothing but the negation — no remainder
                    return False, phrase
                remaining_phrase = ' '.join(phrase.split(" ")[-diff:])
                return True, remaining_phrase
        return False, phrase

    def is_negation(self, phrase):
        '''
        checks whether a word is in bag of negations

        Returns (matched, remaining_phrase), mirroring
        is_double_negation.
        '''
        for negation in self.negations:
            negation = negation + " "
            if phrase.startswith(negation):
                remaining_phrase = phrase.replace(negation, '')
                return True, remaining_phrase
        for negation in self.negation_collection:
            if phrase.startswith(negation):
                phrase_length = len(phrase.split(" "))
                negation_length = len(negation.split(" "))
                diff = phrase_length - negation_length
                if diff <= 0:
                    return False, phrase
                remaining_phrase = ' '.join(phrase.split(" ")[-diff:])
                return True, remaining_phrase
        return False, phrase

    def is_positive(self, word):
        '''
        checks whether a word is in bag of positive words
        (exact match first, then prefix match against word stems)
        '''
        if word in self.positive_words:
            return True
        for positive_word in self.positive_word_collection:
            if word.startswith(positive_word):
                return True
        return False

    def is_negative(self, word):
        '''
        checks whether a word is in bag of negative words
        (exact match first, then prefix match against word stems)
        '''
        if word in self.negative_words:
            return True
        for negative_word in self.negative_word_collection:
            if word.startswith(negative_word):
                return True
        return False

    def get_kgrams(self, sentence, k=1):
        ''' return list of kgrams from a given sentence '''
        grams = list()
        for i in range(len(sentence)):
            grams.append(sentence[i:i + k])
            if i + k >= len(sentence):
                break
        return grams

    def load_data(self):
        ''' loads the data necessary for analysis '''
        double_negation_files = [
            'res\\bag_of_words_dataset\\double_negation.txt'
        ]
        negations_files = ['res\\bag_of_words_dataset\\negation.txt']
        positive_word_files = ['res\\bag_of_words_dataset\\positive_words.txt']
        negative_word_files = ['res\\bag_of_words_dataset\\negative_words.txt']
        self.double_negations, self.double_negations_collection = self.get_words(
            self.load_data_from_files(double_negation_files))
        self.negations, self.negation_collection = self.get_words(
            self.load_data_from_files(negations_files))
        self.positive_words, self.positive_word_collection = self.get_words(
            self.load_data_from_files(positive_word_files))
        self.negative_words, self.negative_word_collection = self.get_words(
            self.load_data_from_files(negative_word_files))
        self.logger.info("words loaded")
        self.logger.info("double negations : " + str(
            len(self.double_negations) +
            len(self.double_negations_collection)))
        self.logger.info(
            "negations : " +
            str(len(self.negations) + len(self.negation_collection)))
        self.logger.info(
            "positive words : " +
            str(len(self.positive_words) + len(self.positive_word_collection)))
        self.logger.info(
            "negative words : " +
            str(len(self.negative_words) + len(self.negative_word_collection)))

    def get_words(self, input_words):
        '''
        cleans the input words and group them into set of words and set
        of mulitple word set(words that have different forms)

        Entries containing '*' are treated as stems and returned in the
        second set for prefix matching.
        '''
        words = set()
        multiple_words = set()
        for word in input_words:
            word = word.replace('\n', '').replace('(1)', '').replace("'", '')
            word = word.replace('_', ' ').replace('-', ' ').strip().lower()
            if '*' in word:
                word = word.replace('*', '')
                multiple_words.add(word.strip())
                continue
            words.add(word)
        return words, multiple_words

    def tokenise(self, sentence):
        ''' split the sentence into words '''
        sentence = self.clean(sentence)
        tokens = sentence.split(' ')
        filtered_tokens = list()
        for token in tokens:
            if len(token.strip()) != 0:
                filtered_tokens.append(token)
        return filtered_tokens

    def clean(self, sentence):
        ''' clean the sentence by removing ignored characters '''
        ignore_characters = '''\t\n&"`~@#$%^*;+=<>//.,()[]{!}?:;_-'''
        sentence = sentence.lower().strip()
        # NOTE(review): remove_stop_words re-reads the stop-word file on
        # every call — consider caching; left as-is here.
        sentence = self.remove_stop_words(sentence)
        sentence = self.replace_characters(sentence, ignore_characters)
        sentence = sentence.replace("'", '')
        return sentence.lower().strip()

    def remove_stop_words(self, sentence):
        '''Drop stop words (loaded from file) from a sentence string.'''
        stop_words = self.load_data_from_files(
            ['res\\bag_of_words_dataset\\refined_stop_words.txt'])
        sentence = sentence.split(" ")
        stop_word_set = set()
        for stop_word in stop_words:
            stop_word_set.add(
                stop_word.replace('\n', '').replace('\t', '').strip())
        new_sentence = list()
        for word in sentence:
            if word not in stop_word_set:
                new_sentence.append(word)
        return ' '.join(new_sentence)

    def replace_characters(self, text, characters):
        ''' replace the specified characters from text to blank spaces '''
        for char in characters:
            text = text.replace(char, ' ')
        return text

    def load_data_from_files(self, filenames, encoding="utf8"):
        ''' load the data as a list from the specified filenames '''
        data = list()
        for filename in filenames:
            with open(filename, encoding=encoding) as file:
                data.extend(file.readlines())
        return data

    def find_accuracy(self):
        '''Score the classifier on the sampled test bags; returns
        (accuracy, total, correct, wrong).'''
        self.load_test_cases()
        self.create_test_set()
        correct, wrong = 0, 0
        total = len(self.positve_test_bag) + len(self.negative_test_bag)
        _correct, _wrong = self.test_for_bag(self.positve_test_bag,
                                             actual_result=1)
        correct += _correct
        wrong += _wrong
        _correct, _wrong = self.test_for_bag(self.negative_test_bag,
                                             actual_result=0)
        correct += _correct
        wrong += _wrong
        self.accuracy = (correct / total) * 100
        self.logger.info("total test sentences : " + str(total))
        self.logger.info("correct output : " + str(correct))
        self.logger.info("wrong output : " + str(wrong))
        self.logger.info("accuracy (%) : " + str(int(self.accuracy)))
        return (self.accuracy, total, correct, wrong)

    def test_for_bag(self, bag, actual_result):
        '''Classify every sentence in a bag against the expected label;
        returns (correct, wrong). Verbose logging is suspended during
        the run.'''
        self.logger.is_verbose = False
        correct, wrong = 0, 0
        for sentence in bag:
            result = self.classify(sentence=sentence)
            if result[1] == actual_result:
                correct += 1
            else:
                wrong += 1
        self.logger.is_verbose = True
        self.logger.debug("total test sentences in bag : " + str(len(bag)))
        self.logger.debug("correct output : " + str(correct))
        self.logger.debug("wrong output : " + str(wrong))
        self.logger.debug("accuracy (%) : " +
                          str(int((correct / len(bag)) * 100)))
        return correct, wrong

    def create_test_set(self):
        '''
        randomly selects test sentences from positive and negative bags
        and making a uniform distribution of test sentences
        '''
        from numpy import random as np_random
        count = self.no_of_testcases // 2
        while (count != 0):
            # NOTE(review): np.random.random_integers is deprecated and
            # removed in modern NumPy — randint is the replacement;
            # confirm the pinned NumPy version before upgrading.
            index = np_random.random_integers(low=0,
                                              high=len(self.positive_bag) - 1)
            self.positve_test_bag.append(self.positive_bag.pop(index))
            index = np_random.random_integers(low=0,
                                              high=len(self.negative_bag) - 1)
            self.negative_test_bag.append(self.negative_bag.pop(index))
            count -= 1
        self.logger.info("test sentences selected")
        self.logger.info(
            "Total sentences for testing : " +
            str(len(self.positve_test_bag) + len(self.negative_test_bag)))
        self.logger.info("positive sentences for testing : " +
                         str(len(self.positve_test_bag)))
        self.logger.info("negative sentences for testing : " +
                         str(len(self.negative_test_bag)))

    def load_test_cases(self):
        '''
        loads the positive and negative sentences from filenames
        specified
        '''
        mixed_bag_paths = [
            'res\\dataset\\uci_dataset\\yelp_labelled.txt',
            'res\\dataset\\uci_dataset\\amazon_cells_labelled.txt',
            'res\\dataset\\uci_dataset\\imdb_labelled.txt'
        ]
        #followed training sets contain hard testcases
        positive_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-pos.txt'
        ]
        # NOTE(review): single backslashes below (\d, \p) happen to be
        # valid literals but are inconsistent with the escaped paths
        # above — confirm intent before normalising.
        negative_bag_paths = [
            'res\dataset\polarity_dataset\\rt-polarity-neg.txt'
        ]
        #uncomment below two lines not to include difficult testcases
        # positive_bag_paths = []
        # negative_bag_paths = []
        self.positive_bag, self.negative_bag = list(), list()
        count_positive, count_negative = 0, 0
        for filename in mixed_bag_paths:
            for mixed_data in self.load_data_from_files([filename]):
                sentence, label = mixed_data.split('\t')
                label = int(label)
                if label == 1:  #if sentence is positive
                    self.positive_bag.append(sentence)
                    count_positive += 1
                else:
                    self.negative_bag.append(sentence)
                    count_negative += 1
        self.logger.debug("sentences from mixed bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))
        self.logger.debug("negative sentences : " + str(count_negative))
        count_positive = 0
        for filename in positive_bag_paths:
            for sentence in self.load_data_from_files([filename]):
                self.positive_bag.append(sentence)
                count_positive += 1
        self.logger.debug("sentences from positive bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))
        count_negative = 0
        for filename in negative_bag_paths:
            for sentence in self.load_data_from_files([filename]):
                self.negative_bag.append(sentence)
                count_negative += 1
        self.logger.debug("sentences from negative bag imported")
        self.logger.debug("negative sentences : " + str(count_negative))
        self.logger.debug("sentences imported")
        self.logger.debug("Total sentences : " +
                          str(len(self.positive_bag) +
                              len(self.negative_bag)))
        self.logger.debug("positive sentences : " +
                          str(len(self.positive_bag)))
        self.logger.debug("negative sentences : " +
                          str(len(self.negative_bag)))
class Comparer():
    '''
    Runs the NaiveBayes and BagOfWordSentiment classifiers plus
    TextBlob over the Naive Bayes test bags, tallies accuracy for each,
    and writes the results to JSON and an HTML report.
    '''

    def __init__(self, no_of_testcases=100, verbose=True, nb=None, bw=None):
        self.logger = Logger('Comparer',
                             'logs\\comparer.log',
                             is_verbose=verbose)
        self.load_html_structure()
        # build fresh classifiers unless ready-made ones were injected
        if nb is None:
            self.nb = NaiveBayes(verbose=False,
                                 test_set_count=no_of_testcases,
                                 no_of_grams=4)
            self.nb.ready()
        else:
            self.nb = nb
            self.nb.logger.is_verbose = False
        if bw is None:
            self.bw = BagOfWordSentiment(verbose=False, no_of_grams=4)
            self.bw.ready()
        else:
            self.bw = bw
            self.bw.logger.is_verbose = False
        self.no_of_testcases = no_of_testcases
        # per-classifier tallies: nb = Naive Bayes, bw = bag of words,
        # tb = textblob
        self.nb_correct, self.bw_correct, self.tb_correct = 0, 0, 0
        self.nb_wrong, self.bw_wrong, self.tb_wrong = 0, 0, 0
        self.nb_accuracy, self.bw_accuracy, self.tb_accuracy = 0, 0, 0
        self.counter = 0
        self.testcases = dict()

    def ready(self):
        '''Fetch the test bags from the Naive Bayes classifier.'''
        self.positive_test_bag = self.nb.get_positive_test_bag()
        self.negative_test_bag = self.nb.get_negative_test_bag()

    def compare(self):
        '''
        compares sentiment analysis done through Naive Bayes and bag of
        words method with popular text processing library textblob.

        Fills the HTML template placeholders, records summary entries
        in self.testcases and persists everything via store_results().
        '''
        self.test_for_bag(self.positive_test_bag, 1)
        self.test_for_bag(self.negative_test_bag, 0)
        # NOTE: len(self.testcases) here equals the number of sentences
        # classified, since the summary keys below are added afterwards
        self.nb_accuracy = (self.nb_correct / len(self.testcases)) * 100
        self.bw_accuracy = (self.bw_correct / len(self.testcases)) * 100
        self.tb_accuracy = (self.tb_correct / len(self.testcases)) * 100
        self.logger.info("Naive Bayes classifier")
        self.logger.info("Correct classification : " + str(self.nb_correct))
        self.logger.info("Wrong classification : " + str(self.nb_wrong))
        self.logger.info("Accuracy classification : " +
                         str(int(self.nb_accuracy)))
        self.logger.info("Bag of Words classifier")
        self.logger.info("Correct : " + str(self.bw_correct))
        self.logger.info("Wrong classification : " + str(self.bw_wrong))
        self.logger.info("Accuracy : " + str(int(self.bw_accuracy)))
        self.logger.info("textblob classifier")
        self.logger.info("Correct : " + str(self.tb_correct))
        self.logger.info("Wrong classification : " + str(self.tb_wrong))
        self.logger.info("Accuracy : " + str(int(self.tb_accuracy)))
        # substitute the summary placeholders in the HTML report
        self.file_html = self.file_html.replace("@nb_right",
                                                str(self.nb_correct))
        self.file_html = self.file_html.replace("@bw_right",
                                                str(self.bw_correct))
        self.file_html = self.file_html.replace("@tb_right",
                                                str(self.tb_correct))
        self.file_html = self.file_html.replace("@nb_wrong",
                                                str(self.nb_wrong))
        self.file_html = self.file_html.replace("@bw_wrong",
                                                str(self.bw_wrong))
        self.file_html = self.file_html.replace("@tb_wrong",
                                                str(self.tb_wrong))
        self.file_html = self.file_html.replace("@nb_accuracy",
                                                str(int(self.nb_accuracy)))
        self.file_html = self.file_html.replace("@bw_accuracy",
                                                str(int(self.bw_accuracy)))
        self.file_html = self.file_html.replace("@tb_accuracy",
                                                str(int(self.tb_accuracy)))
        self.file_html = self.file_html.replace("@total_sentences",
                                                str(len(self.testcases)))
        self.testcases["nb_results"] = {
            "correct": self.nb_correct,
            "wrong": self.nb_wrong,
            "accuracy": self.nb_accuracy
        }
        self.testcases["bw_results"] = {
            "correct": self.bw_correct,
            "wrong": self.bw_wrong,
            "accuracy": self.bw_accuracy
        }
        self.testcases["tb_results"] = {
            "correct": self.tb_correct,
            "wrong": self.tb_wrong,
            "accuracy": self.tb_accuracy
        }
        self.store_results()

    def store_results(self):
        '''Write the collected results to JSON and the HTML report.'''
        with open('output\\comparison_data.json', 'w',
                  encoding="utf-8") as file_pointer:
            json.dump(self.testcases, file_pointer)
        with open('output\\output.html', 'w',
                  encoding="utf-8") as file_pointer:
            file_pointer.write(self.file_html)

    def test_for_bag(self, bag, actual_result):
        '''
        Classify each sentence in the bag with all three classifiers,
        record the outcomes, and append a filled table row to the HTML
        report. bag holds token lists, hence the join.
        '''
        for sentence in bag:
            sentence = ' '.join(sentence)
            nb_result = self.nb.classify(sentence)
            bw_result = self.bw.classify(sentence)
            tb_result = self.classify_using_textblob(sentence)
            self.counter += 1
            self.testcases[self.counter] = {
                "sentence": sentence,
                "actual": actual_result,
                "nb_result": list(nb_result),
                "bw_result": list(bw_result),
                "tb_result": list(tb_result)
            }
            # fill one HTML table row per sentence
            temp_html = self.html_structure
            temp_html = temp_html.replace("@sentence", str(sentence))
            temp_html = temp_html.replace("@actual_label", str(actual_result))
            temp_html = temp_html.replace("@nb_prediction", str(nb_result[1]))
            temp_html = temp_html.replace("@bw_prediction", str(bw_result[1]))
            temp_html = temp_html.replace("@tb_prediction", str(tb_result[1]))
            temp_html = temp_html.replace("@nb_label", str(nb_result[0]))
            temp_html = temp_html.replace("@bw_label", str(bw_result[0]))
            temp_html = temp_html.replace("@tb_label", str(tb_result[0]))
            temp_html = temp_html.replace("@nb_score", str(nb_result[2]))
            temp_html = temp_html.replace("@bw_score", str(bw_result[2]))
            temp_html = temp_html.replace("@tb_score", str(tb_result[2]))
            self.file_html = self.file_html + temp_html
            if nb_result[1] == actual_result:
                self.nb_correct += 1
            else:
                self.nb_wrong += 1
            if bw_result[1] == actual_result:
                self.bw_correct += 1
            else:
                self.bw_wrong += 1
            if tb_result[1] == actual_result:
                self.tb_correct += 1
            else:
                self.tb_wrong += 1

    def classify_using_textblob(self, sentence):
        '''
        classifies the sentence using textblob library

        Maps TextBlob's polarity (first element of the sentiment tuple)
        to the same (label, code, score) shape as the other classifiers.
        '''
        text_blob = TextBlob(sentence)
        polarity = text_blob.sentiment[0]
        if polarity > 0:
            return ("positive", 1, polarity)
        if polarity < 0:
            return ("negative", 0, polarity)
        return ("neutral", -1, polarity)

    def load_html_structure(self):
        '''
        stores the data from dictionary to html file

        (loads the row template and report header used by compare())
        '''
        with open('res\\table_structure.html', 'r') as myfile:
            self.html_structure = myfile.read()
        with open('res\\table_header.html', 'r') as myfile:
            self.file_html = myfile.read()
class NaiveBayes():
    '''
    implementation of Naive Bayes classifer

    Trains on phrases (1..no_of_grams-word k-grams) from labelled UCI
    sentences plus the rt-polarity corpora, with Laplace smoothing.
    '''

    def __init__(self, verbose=True, test_set_count=500, no_of_grams=1):
        self.logger = Logger('NaiveBayes',
                             'logs\\NaiveBayes.log',
                             is_verbose=verbose)
        self.verbose = verbose
        # bookkeeping counters keyed by human-readable names
        self.counts = dict()
        self.positive_bag = []
        self.negative_bag = []
        self.positve_test_bag = []
        self.negative_test_bag = []
        self.counts["test set"] = test_set_count
        self.counts["positive phrases"] = 0
        self.counts["negative phrases"] = 0
        self.counts["total sentences"] = 0
        self.counts["positive sentences"] = 0
        self.counts["negative sentences"] = 0
        self.no_of_grams = no_of_grams
        # phrase -> [positive-class count, negative-class count]
        self.phrase_occurrences = dict()
        # phrase -> [P(phrase|positive), P(phrase|negative)]
        self.phrase_probabilities = dict()

    def ready(self):
        '''Load data, carve out the test set and train the model.'''
        self.logger.info("starting Naive Bayers classifier")
        self.load_data()
        self.create_test_set()
        self.fit()
        self.logger.info("Naive Bayers classifier ready.")

    def classify(self, sentence):
        '''
        classifies a given sentence to positive or negative class

        Returns (label, code, probability) with code 1 positive,
        0 negative, -1 neutral.
        '''
        positive_probablity, negative_probablity = self.find_conditional_probability(
            sentence)
        # NOTE(review): a probability of exactly 1 is treated below as
        # "no evidence for that class"; with priors < 1 it is unclear
        # when this can occur — confirm the intended sentinel semantics.
        if positive_probablity == 1 and negative_probablity != 1:
            self.logger.info("sentence - " + sentence + " - is negative")
            return ("negative", 0, negative_probablity)
        if positive_probablity != 1 and negative_probablity == 1:
            self.logger.info("sentence - " + sentence + " - is positive")
            return ("positive", 1, positive_probablity)
        if positive_probablity > negative_probablity:
            self.logger.info("sentence - " + sentence + " - is positive")
            return ("positive", 1, positive_probablity)
        if negative_probablity > positive_probablity:
            self.logger.info("sentence - " + sentence + " - is negative")
            return ("negative", 0, negative_probablity)
        if negative_probablity == positive_probablity:  #unable to classify a sentence
            self.logger.info("sentence - " + sentence + " - is neutral")
            self.logger.info("no sense can be deduced from this sentence")
            return ("neutral", -1, positive_probablity)

    def find_conditional_probability(self, sentence):
        '''
        finds the conditional probablity for a given sentence from
        phrase_probabilities

        Starts from the class priors, then multiplies in the smoothed
        probability of each known phrase (longest k-grams first),
        removing matched phrases so they are not recounted.
        '''
        sentence_str = sentence
        sentence = self.preprocess(sentence)
        sentence_positive_probablity = 1
        sentence_negative_probablity = 1
        positive_class_probability = self.counts[
            "positive sentences"] / self.counts["total sentences"]
        negative_class_probability = self.counts[
            "negative sentences"] / self.counts["total sentences"]
        sentence_positive_probablity *= positive_class_probability
        sentence_negative_probablity *= negative_class_probability
        kgrams = list()
        for k in range(self.no_of_grams, 0, -1):
            kgrams.extend(self.get_kgrams(sentence, k))
        for kgram in kgrams:  #this give around 80%
            phrase = ' '.join(kgram)
            # sentence alternates between string (for matching) and
            # token-list (via preprocess) forms inside this loop
            sentence = ' '.join(sentence)
            if phrase in sentence and phrase in self.phrase_probabilities:
                phrase_positive_probability, phrase_negative_probability = self.phrase_probabilities[
                    phrase]
                count = sentence.count(phrase)
                self.logger.info(phrase + " " +
                                 str(phrase_positive_probability) + " " +
                                 str(phrase_negative_probability) + " " +
                                 str(count))
                sentence_positive_probablity *= phrase_positive_probability**count
                sentence_negative_probablity *= phrase_negative_probability**count
                sentence = sentence.replace(phrase, ' ')
            sentence = self.preprocess(sentence)
        # alternative implementation kept for reference:
        # for kgram in kgrams: #this give 75%
        #     phrase = ' '.join(kgram)
        #     if phrase in self.phrase_probabilities:
        #         phrase_positive_probability, phrase_negative_probability = self.phrase_probabilities[phrase]
        #         self.logger.debug(phrase + " " + str(phrase_positive_probability) + " " + str(phrase_negative_probability))
        #         sentence_positive_probablity *= phrase_positive_probability
        #         sentence_negative_probablity *= phrase_negative_probability
        return sentence_positive_probablity, sentence_negative_probablity

    def fit(self):
        '''
        trains the model with sentences in positive and negative bags
        '''
        self.logger.info("training started")
        self.logger.info("total sentences : " +
                         str(self.counts["total sentences"]))
        self.logger.info("positive sentences : " +
                         str(self.counts["positive sentences"]))
        self.logger.info("negative sentences : " +
                         str(self.counts["negative sentences"]))
        self.get_occurrences_from_bags()
        self.logger.info("calculated occurrences")
        self.logger.info("unique phrases : " +
                         str(len(self.phrase_occurrences)))
        self.logger.info("phrases in positive class : " +
                         str(self.counts["positive phrases"]))
        self.logger.info("phrases in negative class : " +
                         str(self.counts["negative phrases"]))
        self.get_conditional_probabilities()
        self.logger.info("conditional probality for phrases calculated")
        self.logger.info("training completed")

    def get_conditional_probabilities(self):
        '''
        calculates the conditional probability for phrase|positive class
        and phrase|negative class

        Uses add-one (Laplace) smoothing over the unique phrase count.
        '''
        total_unique_phrases = len(self.phrase_occurrences)
        for phrase in self.phrase_occurrences:
            positive_probablity = (self.phrase_occurrences[phrase][0] + 1) / (
                self.counts["positive phrases"] + total_unique_phrases)
            negative_probablity = (self.phrase_occurrences[phrase][1] + 1) / (
                self.counts["negative phrases"] + total_unique_phrases)
            self.phrase_probabilities[phrase] = [
                positive_probablity, negative_probablity
            ]

    def get_occurrences_from_bags(self):
        ''' calculates the occurrences of the phrases '''
        self.get_occurrences_from_positive_bag()
        self.get_occurrences_from_negative_bag()

    def get_occurrences_from_positive_bag(self):
        '''
        calculates the occurrences of unigram, bigram, trigram and
        quadgram from positive bag
        '''
        for sentence in self.positive_bag:
            kgrams = list()
            for k in range(1, self.no_of_grams + 1):
                kgrams.extend(self.get_kgrams(sentence, k))
            for kgram in kgrams:
                phrase = ' '.join(kgram)
                self.counts["positive phrases"] += 1
                if phrase not in self.phrase_occurrences:
                    self.phrase_occurrences[phrase] = [
                        0, 0
                    ]  #[word occurrence in positive class, word occurrence in negative class]
                self.phrase_occurrences[phrase][0] += 1

    def get_occurrences_from_negative_bag(self):
        '''
        calculates the occurrences of unigram, bigram, trigram and
        quadgram from negative bag
        '''
        for sentence in self.negative_bag:
            kgrams = list()
            for k in range(1, self.no_of_grams + 1):
                kgrams.extend(self.get_kgrams(sentence, k))
            for kgram in kgrams:
                phrase = ' '.join(kgram)
                self.counts["negative phrases"] += 1
                if phrase not in self.phrase_occurrences:
                    self.phrase_occurrences[phrase] = [0, 0]
                self.phrase_occurrences[phrase][1] += 1

    def get_kgrams(self, sentence, k=1):
        ''' return list of kgrams from a given sentence '''
        grams = list()
        for i in range(len(sentence)):
            grams.append(sentence[i:i + k])
            if i + k >= len(sentence):
                break
        return grams

    def create_test_set(self):
        '''
        randomly selects test sentences from positive and negative bags
        and making a uniform distribution of test sentences
        '''
        from numpy import random as np_random
        count = self.counts["test set"] // 2
        while (count != 0):
            # NOTE(review): np.random.random_integers is deprecated and
            # removed in modern NumPy — randint is the replacement;
            # confirm the pinned NumPy version before upgrading.
            index = np_random.random_integers(low=0,
                                              high=len(self.positive_bag) - 1)
            self.positve_test_bag.append(self.positive_bag.pop(index))
            index = np_random.random_integers(low=0,
                                              high=len(self.negative_bag) - 1)
            self.negative_test_bag.append(self.negative_bag.pop(index))
            count -= 1
        self.logger.info("test sentences selected")
        self.logger.info(
            "Total sentences for testing : " +
            str(len(self.positve_test_bag) + len(self.negative_test_bag)))
        self.logger.info("positive sentences for testing : " +
                         str(len(self.positve_test_bag)))
        self.logger.info("negative sentences for testing : " +
                         str(len(self.negative_test_bag)))
        # counts are refreshed after the test split removed sentences
        self.counts["positive sentences"] = len(self.positive_bag)
        self.counts["negative sentences"] = len(self.negative_bag)
        self.counts["total sentences"] = len(self.positive_bag) + len(
            self.negative_bag)

    def load_data(self):
        '''
        loads the positive and negative sentences from filenames
        specified
        '''
        mixed_bag_paths = [
            'res\\dataset\\uci_dataset\\yelp_labelled.txt',
            'res\\dataset\\uci_dataset\\amazon_cells_labelled.txt',
            'res\\dataset\\uci_dataset\\imdb_labelled.txt'
        ]
        positive_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-pos.txt'
        ]
        negative_bag_paths = [
            'res\\dataset\\polarity_dataset\\rt-polarity-neg.txt'
        ]
        count_positive, count_negative = 0, 0
        for filename in mixed_bag_paths:
            for mixed_data in self.load_data_from_file(filename):
                sentence, label = mixed_data.split('\t')
                label = int(label)
                sentence = self.preprocess(sentence)
                if label == 1:  #if sentence is positive
                    self.positive_bag.append(sentence)
                    count_positive += 1
                else:
                    self.negative_bag.append(sentence)
                    count_negative += 1
        self.logger.debug("sentences from mixed bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))
        self.logger.debug("negative sentences : " + str(count_negative))
        count_positive = 0
        for filename in positive_bag_paths:
            for sentence in self.load_data_from_file(filename):
                sentence = self.preprocess(sentence)
                self.positive_bag.append(sentence)
                count_positive += 1
        self.logger.debug("sentences from positive bag imported")
        self.logger.debug("positive sentences : " + str(count_positive))
        count_negative = 0
        for filename in negative_bag_paths:
            for sentence in self.load_data_from_file(filename):
                sentence = self.preprocess(sentence)
                self.negative_bag.append(sentence)
                count_negative += 1
        self.logger.debug("sentences from negative bag imported")
        self.logger.debug("negative sentences : " + str(count_negative))
        self.counts["positive sentences"] = len(self.positive_bag)
        self.counts["negative sentences"] = len(self.negative_bag)
        self.counts["total sentences"] = len(self.positive_bag) + len(
            self.negative_bag)
        self.logger.info("sentences imported")
        self.logger.info("Total sentences : " +
                         str(self.counts["total sentences"]))
        self.logger.info("positive sentences : " +
                         str(self.counts["positive sentences"]))
        self.logger.info("negative sentences : " +
                         str(self.counts["negative sentences"]))

    def load_data_from_file(self, filename, encoding="utf8"):
        ''' load the data as a list from the specified filename '''
        with open(filename, encoding=encoding) as file:
            data = file.readlines()
        return data

    def preprocess(self, sentence):
        '''
        preprocess the sentence and return as a list of words
        '''
        sentence = self.tokenise(sentence)
        #sentence = self.remove_stop_words(sentence)
        return sentence

    def tokenise(self, sentence):
        ''' convert the sentence to list of words '''
        sentence = self.clean(sentence)
        tokens = sentence.split(' ')
        filtered_tokens = list()
        for token in tokens:
            if len(token.strip()) != 0:
                filtered_tokens.append(token.strip())
        return filtered_tokens

    def clean(self, sentence):
        ''' clean sentence by removing the ignored characters '''
        ignore_characters = '''\t\n&"`~@#$%^*;+=<>//.,()[]{}:;!?'''
        sentence = self.replace_characters(sentence, ignore_characters)
        return sentence.lower().strip()

    def replace_characters(self, text, characters):
        '''
        replaces the specified characters in text with blank space
        '''
        for char in characters:
            text = text.replace(char, ' ')
        return text

    def get_positive_test_bag(self):
        '''Return the held-out positive test sentences (token lists).'''
        return self.positve_test_bag

    def get_negative_test_bag(self):
        '''Return the held-out negative test sentences (token lists).'''
        return self.negative_test_bag

    def test_for_fish_guitar(self):
        '''Tiny hard-coded smoke test: trains on four toy sentences and
        classifies "line guitar jazz jazz"; returns the classify tuple.'''
        positive_sentences = [
            "fish smoked fish", "fish line", "fish haul smoked"
        ]
        negative_sentences = ["guitar jazz line"]
        self.positive_bag = [
            sentence.split(" ") for sentence in positive_sentences
        ]
        self.negative_bag = [
            sentence.split(" ") for sentence in negative_sentences
        ]
        self.counts["total sentences"] = len(self.positive_bag) + len(
            self.negative_bag)
        self.counts["positive sentences"] = len(self.positive_bag)
        self.counts["negative sentences"] = len(self.negative_bag)
        self.get_occurrences_from_bags()
        self.get_conditional_probabilities()
        test_sentence = "line guitar jazz jazz"
        result = self.classify(sentence=test_sentence)
        self.logger.info(str(result))
        return result

    def find_accuracy(self):
        '''Score the classifier on both held-out test bags and log the
        aggregate accuracy.'''
        correct, wrong = 0, 0
        total = len(self.positve_test_bag) + len(self.negative_test_bag)
        _correct, _wrong = self.test_for_bag(self.positve_test_bag,
                                             actual_result=1)
        correct += _correct
        wrong += _wrong
        _correct, _wrong = self.test_for_bag(self.negative_test_bag,
                                             actual_result=0)
        correct += _correct
        wrong += _wrong
        self.accuracy = (correct / total) * 100
        self.logger.info("total test sentences : " + str(total))
        self.logger.info("correct output : " + str(correct))
        self.logger.info("wrong output : " + str(wrong))
        self.logger.info("accuracy (%) : " + str(int(self.accuracy)))

    def test_for_bag(self, bag, actual_result):
        '''Classify every token-list sentence in a bag against the
        expected label; returns (correct, wrong). A None result from
        classify is logged and counted as wrong.'''
        self.logger.is_verbose = False
        correct, wrong = 0, 0
        for sentence in bag:
            sentence = ' '.join(sentence)
            result = self.classify(sentence=sentence)
            if result is None:
                self.logger.info("result is none : " + str(sentence))
                wrong += 1
                continue
            if result[1] == actual_result:
                correct += 1
            else:
                wrong += 1
        self.logger.is_verbose = True
        self.logger.debug("total test sentences in bag : " + str(len(bag)))
        self.logger.debug("correct output : " + str(correct))
        self.logger.debug("wrong output : " + str(wrong))
        self.logger.debug("accuracy (%) : " +
                          str(int((correct / len(bag)) * 100)))
        return correct, wrong
# Flask entry point: builds the app, enables CORS, and wires each REST
# resource class to its route via flask_restful_swagger_2.
from flask_cors import CORS
from flask import Flask
from flask_restful_swagger_2 import Api
from resources.login.login import Login
from resources.sender.sender import Sender
from resources.register.register import Register
from resources.verify.verify import Verify
from resources.puzzle.puzzle import Puzzle
from resources.balance.balance import Balance
from utilities.logger import Logger

# Module-level logger named after this module (project-local Logger class).
logger = Logger(__name__)
app = Flask(__name__)
# Allow cross-origin requests on every route.
CORS(app)
# Swagger-enabled API wrapper around the Flask app.
api = Api(app, api_version='0.1')

# Route registration: one resource class per endpoint.
api.add_resource(Login, "/login")
api.add_resource(Register, "/register")
api.add_resource(Puzzle, "/puzzle")
# Verification takes the username and numeric PIN as path segments.
api.add_resource(Verify, "/verify/<string:user>/<int:pin>")
api.add_resource(Sender, "/send")
api.add_resource(Balance, "/user/balance")

if __name__ == '__main__':
    logger.info('Starting API')
    # NOTE(review): 0.0.0.0 binds the dev server on all interfaces —
    # confirm this is intended outside local development.
    app.run(host="0.0.0.0", port=5000)