def test(self, testing_data):
    """
    Tests the precision/recall of the model

    Parameters:
        testing_data (array): data on which to test the model

    Returns:
        None
    """
    for row in testing_data:
        # Copy the priors per row; a plain dict() would share the nested
        # per-variable dicts, and the updates below would then mutate the
        # model's trained priors.
        u_priors = {var: dict(p) for var, p in self.priors.items()}

        tokenized_turn1 = tokenize(row["turn1"]["text"])
        tokenized_turn2 = tokenize(row["turn2"]["text"])
        tokenized_turn3 = tokenize(row["turn3"]["text"])
        conv = tokenized_turn1 + tokenized_turn2 + tokenized_turn3

        # Pass 1: classify against the whole conversation and feed the
        # normalized scores back in as updated priors. _classify is
        # assumed to return scores ordered like self.variable_dimensions.
        parsed_message = flatten([
            ngrams_and_remove_stop_words(x, self.ngram_choice)
            for x in [tokenized_turn1, tokenized_turn2, tokenized_turn3]
        ])
        for var in self.variables:
            classification = normalize(self._classify(
                self.ngrams[var], parsed_message, conv, u_priors[var], var))
            for i, e in enumerate(self.variable_dimensions):
                u_priors[var][e] = classification[i]

        # Pass 2: refine the priors using turn 1 alone.
        parsed_message = ngrams_and_remove_stop_words(
            tokenized_turn1, self.ngram_choice)
        for var in self.variables:
            classification = normalize(self._classify(
                self.ngrams[var], parsed_message, tokenized_turn1,
                u_priors[var], var))
            for i, e in enumerate(self.variable_dimensions):
                u_priors[var][e] = classification[i]

        # Final pass: classify turn 3 per variable, then map the combined
        # per-variable classification to an emotion label.
        parsed_message = ngrams_and_remove_stop_words(
            tokenized_turn3, self.ngram_choice)
        var_classification = {var: "" for var in self.variables}
        for var in self.variables:
            var_classification[var] = self._classify(
                self.ngrams[var], parsed_message, tokenized_turn3,
                u_priors[var], var, False)

        self.true.append(row["turn3"]["emotion"])
        emo_class = self.__map_to_emotion(var_classification)
        self.pred.append(emo_class[0])

    super(AVModel, self).calculate_scores()
def test(self, testing_data):
    """
    Tests the precision/recall of the model

    Parameters:
        testing_data (array): data on which to test the model

    Returns:
        None
    """
    for var in self.variables:
        self.true[var] = []
        self.pred[var] = []

    for row in testing_data:
        # Copy the priors per row; a plain dict() would share the nested
        # per-variable dicts, and the updates below would then mutate the
        # model's trained priors.
        u_priors = {var: dict(p) for var, p in self.priors.items()}

        tokenized_turn1 = tokenize(row["turn1"]["text"])
        tokenized_turn2 = tokenize(row["turn2"]["text"])
        tokenized_turn3 = tokenize(row["turn3"]["text"])
        conv = tokenized_turn1 + tokenized_turn2 + tokenized_turn3

        # Pass 1: classify against the whole conversation and feed the
        # normalized scores back in as updated priors. __classify is
        # assumed to return scores ordered like self.variable_dimensions.
        parsed_message = flatten([
            ngrams_and_remove_stop_words(x, self.ngram_choice)
            for x in [tokenized_turn1, tokenized_turn2, tokenized_turn3]
        ])
        for var in self.variables:
            classification = normalize(self.__classify(
                self.ngrams[var], parsed_message, conv, u_priors[var], var))
            for i, e in enumerate(self.variable_dimensions):
                u_priors[var][e] = classification[i]

        # Pass 2: refine the priors using turn 1 alone.
        parsed_message = ngrams_and_remove_stop_words(
            tokenized_turn1, self.ngram_choice)
        for var in self.variables:
            classification = normalize(self.__classify(
                self.ngrams[var], parsed_message, tokenized_turn1,
                u_priors[var], var))
            for i, e in enumerate(self.variable_dimensions):
                u_priors[var][e] = classification[i]

        # Final pass: classify turn 3 per variable and record the true
        # and predicted dimension labels.
        parsed_message = ngrams_and_remove_stop_words(
            tokenized_turn3, self.ngram_choice)
        for var in self.variables:
            true_dim = self.variable_dimensions[
                int(row["turn3"]["appraisals"][var])]
            self.true[var].append(true_dim)
            classification = self.__classify(
                self.ngrams[var], parsed_message, tokenized_turn3,
                u_priors[var], var, False)
            self.pred[var].append(classification)

    self.calculate_scores()
def test(self, testing_data):
    """
    Tests the precision/recall of the model

    Parameters:
        testing_data (array): data on which to test the model

    Returns:
        None
    """
    for row in testing_data:
        # Per-row copy of the priors (the values are floats here, so a
        # shallow copy is enough).
        u_priors = dict(self.priors)

        tokenized_turn1 = tokenize(row["turn1"]["text"])
        tokenized_turn2 = tokenize(row["turn2"]["text"])
        tokenized_turn3 = tokenize(row["turn3"]["text"])
        conv = tokenized_turn1 + tokenized_turn2 + tokenized_turn3

        # Pass 1: classify against the whole conversation and feed the
        # normalized scores back in as updated priors. __classify is
        # assumed to return scores ordered like self.emotions.
        parsed_message = flatten([
            ngrams_and_remove_stop_words(x, self.ngram_choice)
            for x in [tokenized_turn1, tokenized_turn2, tokenized_turn3]
        ])
        classification = normalize(
            self.__classify(self.ngrams, parsed_message, conv, u_priors))
        for i, e in enumerate(self.emotions):
            u_priors[e] = classification[i]

        # Pass 2: refine the priors using turn 1 alone.
        parsed_message = ngrams_and_remove_stop_words(
            tokenized_turn1, self.ngram_choice)
        classification = normalize(
            self.__classify(self.ngrams, parsed_message, tokenized_turn1,
                            u_priors))
        for i, e in enumerate(self.emotions):
            u_priors[e] = classification[i]

        # Final pass: classify turn 3 and record the true and predicted
        # labels.
        self.true.append(row["turn3"]["emotion"])
        parsed_message = ngrams_and_remove_stop_words(
            tokenized_turn3, self.ngram_choice)
        classification = self.__classify(
            self.ngrams, parsed_message, tokenized_turn3, u_priors, False)
        self.pred.append(str(classification))

    self.calculate_scores()
def _train_by_variable(self, training_set, variable, data_points=None):
    """
    Calculates the counts for each unigram and priors for each
    classification

    Parameters:
        training_set (array): training data used to train the model
        variable (string): variable in use in training

    Returns:
        None: the computed counts and priors are stored on the model
    """
    # data_points defaults to None rather than {} to avoid a shared
    # mutable default argument.
    words = {}
    words_totals = {dim: 0 for dim in self.variable_dimensions}
    tense_totals = {dim: 0 for dim in self.variable_dimensions}
    pronoun_totals = {dim: 0 for dim in self.variable_dimensions}
    words_vocab = set()
    tense_vocab = set()
    pronoun_vocab = set()

    for row in training_set:
        for turn in ["turn1", "turn2", "turn3"]:
            # The appraisal value is an integer index into the
            # variable's dimensions.
            true_dim = self.variable_dimensions[
                int(row[turn]["appraisals"][variable])]
            tokenized_res = tokenize(row[turn]["text"])

            # Count tense and pronoun features from part-of-speech tags.
            pos = parts_of_speech(tokenized_res)
            tense_vocab, tense_totals, pronoun_vocab, pronoun_totals = \
                self.__build_pos_counts(
                    pos, tense_vocab, variable, true_dim, tense_totals,
                    pronoun_vocab, pronoun_totals)

            # Count n-gram features with stop words removed.
            res = ngrams_and_remove_stop_words(
                tokenized_res, self.ngram_choice)
            words_vocab, words, words_totals = self.__build_word_counts(
                res, words_vocab, words, true_dim, words_totals)

    # The prior for each dimension is its share of all observations.
    denom = sum(words_totals.values())
    self.priors[variable] = {
        dim: float(words_totals[dim]) / float(denom)
        for dim in self.variable_dimensions
    }
    self.__calculate_probabilities(
        words, words_totals, words_vocab, tense_totals, tense_vocab,
        pronoun_totals, pronoun_vocab, variable)
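# A minimal sketch of the training-row layout _train_by_variable appears
# to expect, inferred from the accesses above. The variable name
# "pleasantness" and all values are hypothetical; each appraisal value is
# an integer index into self.variable_dimensions.
#
#     row = {
#         "turn1": {"text": "I finally got the job!",
#                   "appraisals": {"pleasantness": 2}},
#         "turn2": {"text": "congrats!",
#                   "appraisals": {"pleasantness": 2}},
#         "turn3": {"text": "thanks, I am thrilled",
#                   "appraisals": {"pleasantness": 2}},
#     }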
def train(self, training_data):
    """
    Builds a trained Emotions model

    Parameters:
        training_data (array): training data used to train the model

    Returns:
        None: the trained counts and priors are stored on the model
    """
    words = {}
    words_vocab = set()
    tense_vocab = set()
    pronoun_vocab = set()
    words_totals = {emotion: 0 for emotion in self.emotions}
    tense_totals = {emotion: 0 for emotion in self.emotions}
    pronoun_totals = {emotion: 0 for emotion in self.emotions}

    for row in training_data:
        for turn in ["turn1", "turn2", "turn3"]:
            true_emotion = row[turn]["emotion"]
            tokenized_res = tokenize(row[turn]["text"])

            # Count tense and pronoun features from part-of-speech tags.
            pos = parts_of_speech(tokenized_res)
            tense_vocab, tense_totals, pronoun_vocab, pronoun_totals = \
                self.__build_pos_counts(
                    pos, tense_vocab, true_emotion, tense_totals,
                    pronoun_vocab, pronoun_totals)

            # Count n-gram features with stop words removed.
            res = ngrams_and_remove_stop_words(
                tokenized_res, self.ngram_choice)
            words_vocab, words, words_totals = self.__build_word_counts(
                res, words_vocab, words, true_emotion, words_totals)

    # The prior for each emotion is its share of all observations.
    sum_totals = sum(words_totals.values())
    self.priors = {
        emotion: float(words_totals[emotion]) / float(sum_totals)
        for emotion in self.emotions
    }
    self.__calculate_probabilities(
        words, words_totals, words_vocab, tense_totals, tense_vocab,
        pronoun_totals, pronoun_vocab)
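# A minimal usage sketch, assuming a model class that exposes the train
# and test methods above and rows shaped like
# {"turnN": {"text": ..., "emotion": ...}}. The class name and the
# constructor argument are hypothetical:
#
#     model = EmotionsModel(ngram_choice="unigram")
#     model.train(training_rows)
#     model.test(testing_rows)  # fills model.true / model.pred and
#                               # computes precision/recall scores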