def swn_pipeline(s: str, label_diff_thresh=0, contractions=None, emoti_dict=None, special=None, count_nouns=True):
    """Pipeline that takes in a sentence string and returns its sentiment label.

    Parameters
    ----------
    s: str
        sentence to find the sentiment of
    DEFAULTS:
    label_diff_thresh: int = 0
        how far the positive and negative score must differ for the sentence
        to be considered not neutral (forwarded to ``swn_label``)
    contractions: dict = None
        dictionary of contractions mapped to their expanded forms; when None
        the plain tokenize / POS-tag / stopword-removal path is used instead
        of the negation-removal pipeline
    emoti_dict: dict = None
        dictionary of emoticons mapped to their label (forwarded to
        ``extra_preprocessing``)
    special: dict = None
        dictionary of specialized lexicon used to score words on top of
        SentiWordNet; None means an empty lexicon
    count_nouns: bool = True
        determines whether or not to include nouns in the scoring

    Returns
    -------
    tuple
        ``(label, [])`` when pre-processing already resolved the sentence to
        one of the label strings "0", "1" or "-1"; otherwise whatever
        ``swn_label`` returns (callers unpack it as ``(label, word_scores)``).
    """
    # Build a fresh dict per call instead of sharing one mutable default
    # object between every invocation.
    if special is None:
        special = {}

    ## preliminary pre-processing (emoticons, phrases, etc)
    s = extra_preprocessing(s, emoti_dict)
    # pre-processing produced a label directly: nothing left to score
    if s in ("0", "1", "-1"):
        return (s, [])

    ## text processing
    if contractions is None:
        # contractions are not being expanded
        tokens = tp.tokenize_words(s)
        pos_tokens = tp.simple_pos_tag(tokens)
        pos_tokens = tp.remove_pos_stopwords(pos_tokens)
    else:
        # expand/remove negation contractions
        pos_tokens = rneg.removeNegationsPipeline(s, contractions)

    return swn_label(pos_tokens, diff_thresh=label_diff_thresh, special=special, count_nouns=count_nouns)
def predict(test_list: list, special=None):
    """Classify every sentence in test_list and return the list of labels.

    Parameters
    ----------
    test_list: [str]
        list of strings to be labeled
    DEFAULTS:
    special: dict = None
        special lexicon dictionary to use when scoring words; None means an
        empty lexicon

    Returns
    -------
    labels: list
        one label per item of test_list, in the same order; each label is the
        first element of the tuple returned by ``swn_pipeline``
    """
    # Normalise here (rather than forwarding None) so a fresh dict is used per
    # call and swn_pipeline always receives a real dictionary.
    if special is None:
        special = {}

    contractions = rneg.getContractions()  # create contractions
    emoti_dict = emoticons.load_emoticon_sentiment()  # create emoti_dict

    return [
        swn_pipeline(item, contractions=contractions, emoti_dict=emoti_dict, special=special)[0]
        for item in test_list
    ]
def _senti_record_error_words(err_bow, word_scores, true_label, output, log_words):
    """Tally every scored word of one mislabeled item into err_bow (mutated in place)."""
    for word, pos_score, neg_score, pos in word_scores:
        # count how many erroneous items the word appeared in; scores and
        # part of speech are taken from the first occurrence
        if word in err_bow:
            err_bow[word]["count"] += 1
        else:
            err_bow[word] = {
                "count": 1,
                "pos_score": pos_score,
                "neg_score": neg_score,
                "pos_count": 0,
                "neg_count": 0,
                "neutral_count": 0,
                "part_of_speech": pos,
            }

        # count which label the item was supposed to have
        if true_label == "-1":
            err_bow[word]["neg_count"] += 1
        elif true_label == "1":
            err_bow[word]["pos_count"] += 1
        else:
            err_bow[word]["neutral_count"] += 1

        if log_words:
            output.write(
                "word: {}; p-o-s: {}; pos_score: {}; neg_score: {}\n".format(word, pos, pos_score, neg_score)
            )


def _senti_adjusted_scores(word, info, flip_dict):
    """Return corrected (pos, neg) scores for an error-prone word, or None to leave it unchanged."""
    pos_count = info["pos_count"]
    neg_count = info["neg_count"]
    neutral_count = info["neutral_count"]
    pos_score = info["pos_score"]
    neg_score = info["neg_score"]

    # word should have been neutral (or positive/negative evidence is tied)
    if (neutral_count > pos_count and neutral_count > neg_count) or pos_count == neg_count:
        return (0.0, 0.0)

    # word had no score yet: add it with an initial score on the correct side
    if pos_score is None:
        return (0.5, 0.0) if pos_count > neg_count else (0.0, 0.5)

    # word should have been positive/negative but its scores say the opposite
    if (pos_count > neg_count and pos_score < neg_score) or (pos_count < neg_count and pos_score > neg_score):
        # if it was flipped before, flip again only when this orientation
        # caused a bigger error than the one recorded at the last flip
        if word not in flip_dict or flip_dict[word] < info["count"]:
            flip_dict[word] = info["count"]
            return (neg_score, pos_score)
        # Otherwise keep the current scores. The previous code fell through
        # here and stored whatever scores the preceding word had produced.
        return None

    # scores point the right way but not strongly enough: widen the gap
    if pos_count > neg_count:
        return (min(pos_score + gv.score_increment, 1.0), neg_score)
    return (pos_score, min(neg_score + gv.score_increment, 1.0))


def senti_train(
    train_list: list,
    special_dict=None,
    iterations=5,
    stop_iter_threshold=0.0,
    diff_thresh=0,
    word_err_threshold=0.2,
    output_filename="outputNew.txt",
    learn_nouns=False,
    count_nouns=True,
):
    """Train on train_list to create a specialized lexicon dictionary.

    Each iteration builds a bag of words counting all the words that appear
    in the incorrectly labeled tweets. Words that take part in more than
    ``word_err_threshold * <errors in this iteration>`` errors get their
    positive/negative scores zeroed, initialized, inverted or increased,
    depending on which label the erroneous tweets should have had.
    This is repeated ``iterations`` times or until the error rate is no
    longer above ``stop_iter_threshold``.

    Parameters
    ----------
    train_list: list[list]
        in form [[label1, tweet1], [label2, tweet2], ...]
        Labels are compared against the strings "1", "0" and "-1"
        (positive, neutral, negative respectively)
    DEFAULTS:
    special_dict: dict = None
        specialized lexicon used to score words on top of SentiWordNet.
        None starts from an empty lexicon. A dictionary passed in is updated
        in place and is the same object that is returned.
    iterations: int = 5
        maximum number of passes over train_list
    stop_iter_threshold: float = 0.0
        training stops once the error rate is not above this value
    diff_thresh: int = 0
        how far the positive and negative score must differ to be considered
        not neutral
    word_err_threshold: float = 0.2
        fraction of an iteration's errors a word must exceed to have its
        scores changed
    output_filename: str = "outputNew.txt"
        file the debugging log is written to (overwritten)
    learn_nouns: bool = False
        whether words tagged "n" may be added to the specialized dictionary
    count_nouns: bool = True
        whether nouns are included in the scoring; also enables the per-word
        lines in the debugging log

    Returns
    -------
    tuple(dict, list)
        dict is the specialized sentiment lexicon mapping word -> (pos, neg)
        list holds the error rate of each iteration

    Raises
    ------
    TypeError
        if iterations is not an int
    ValueError
        if iterations, stop_iter_threshold, word_err_threshold or diff_thresh
        is negative
    """
    # bool is an int subclass but is not a meaningful iteration count
    if isinstance(iterations, bool) or not isinstance(iterations, int):
        raise TypeError("Expected 'iterations' to be of type int but was of type {}".format(type(iterations)))
    if iterations < 0:
        raise ValueError("Parameter 'iterations' must be >= 0")
    if stop_iter_threshold < 0:
        raise ValueError("Parameter 'stop_iter_threshold' must be >= 0")
    if word_err_threshold < 0:
        raise ValueError("Parameter 'word_err_threshold' must be >= 0")
    if diff_thresh < 0:
        raise ValueError("Parameter 'diff_thresh' must be >= 0")

    # Fresh dictionary per call when none is supplied, so learned entries do
    # not leak between calls through a shared default object.
    special = {} if special_dict is None else special_dict
    total_items = len(train_list)
    err_list = []  # list of error rates at each iteration

    # the context manager closes the log even if training raises
    with open(output_filename, "w", encoding="utf8") as output:
        output.write(
            "Training on {} items with a specialized dictionary of {} items\n".format(total_items, len(special))
        )
        output.write(
            "Iterations={}; Stopping Error Threshold={}; word error to be in dictionary threshold = {}\n".format(
                iterations, stop_iter_threshold, word_err_threshold
            )
        )
        output.write(
            "Words must be {} apart to not be neutral. Learning nouns set to {} and counting nounse set to = {}\n".format(
                diff_thresh, learn_nouns, count_nouns
            )
        )
        output.flush()

        contractions = rneg.getContractions()
        emoti_dict = emoticons.load_emoticon_sentiment()

        it = 0  # current iteration
        err_rate = 1  # current error rate (initialized to 1: assume everything is wrong)
        flip_dict = {}  # word -> error count at its last flip (keeps the orientation with lower error)

        while it < iterations and err_rate > stop_iter_threshold:
            err_count = 0  # how many items were mislabeled in this iteration
            err_bow = {}  # word -> counts and scores gathered from mislabeled items

            # look for words that appear in mislabeled items
            for item in train_list:
                # word_scores has tuples in form (word, positive score, negative score, part_of_speech)
                prediction, word_scores = swn_pipeline(
                    item[1],
                    contractions=contractions,
                    label_diff_thresh=diff_thresh,
                    special=special,
                    emoti_dict=emoti_dict,
                    count_nouns=count_nouns,
                )
                if prediction != item[0]:
                    err_count += 1
                    _senti_record_error_words(err_bow, word_scores, item[0], output, count_nouns)
                    output.write("label: {} ==> {} for tweet ({})\n".format(prediction, item[0], item[1]))
                    output.write("__________________________________________________________\n")
                    sys.stdout.flush()

            # adjust scores of words that caused a lot of errors
            for word, info in err_bow.items():
                if info["count"] > word_err_threshold * err_count and (
                    learn_nouns or info["part_of_speech"] != "n"
                ):
                    new_scores = _senti_adjusted_scores(word, info, flip_dict)
                    if new_scores is not None:
                        special[word] = new_scores

            # an empty training set has no errors; avoid dividing by zero
            err_rate = err_count / total_items if total_items else 0.0
            it += 1
            err_list.append(err_rate)

            print("current special dictionary:", special)
            print("it: {}, err_rate: {}".format(it - 1, err_rate))
            print("###########################################################")
            output.write("current special dictionary: {}\n".format(special))
            output.write("it: {}, err_rate: {}\n".format(it - 1, err_rate))
            output.write("###########################################################\n")
            output.flush()

    return special, err_list