def initialize_dictionaries(self, p_set):
    """
    Initialize dictionaries with the textual inputs in the PredictorSet object.

    p_set - PredictorSet object that has had data fed in
    Returns True on success.
    Raises util_functions.InputError if p_set lacks a _type attribute or is
    not of the "train" type.
    """
    success = False
    if not hasattr(p_set, '_type'):
        # Bug fix: this branch previously reused the "train type" message,
        # which was misleading when the object was not a PredictorSet at all.
        error_message = "needs to be a PredictorSet object with a _type attribute."
        log.exception(error_message)
        raise util_functions.InputError(p_set, error_message)
    if p_set._type != "train":
        error_message = "needs to be an essay set of the train type."
        log.exception(error_message)
        raise util_functions.InputError(p_set, error_message)

    # Guard against division by zero when no essay sets are present.
    div_length = len(p_set._essay_sets)
    if div_length == 0:
        div_length = 1

    # Ensures that even with a large amount of input textual features,
    # training time stays reasonable: split a 200-feature budget evenly
    # across the essay sets.
    max_feats2 = int(math.floor(200 / div_length))
    for current_set in p_set._essay_sets:
        extractor = FeatureExtractor()
        extractor.initialize_dictionaries(current_set, max_feats2=max_feats2)
        self._extractors.append(extractor)
    self._initialized = True
    success = True
    return success
def initialize_dictionaries(self, e_set, max_feats2=200):
    """
    Initializes dictionaries from an essay set object.
    Dictionaries must be initialized prior to using this to extract features.

    e_set - an input essay set (must be of the "train" type)
    max_feats2 - cap on vocabulary size passed to util_functions.get_vocab
    Returns "ok" as a confirmation of initialization.
    Raises util_functions.InputError for a non-essay-set or non-train input.
    """
    # Guard clauses replace the original nested if/else pyramid.
    if not hasattr(e_set, '_type'):
        raise util_functions.InputError(
            e_set, "wrong input. need an essay set object")
    if e_set._type != "train":
        raise util_functions.InputError(
            e_set, "needs to be an essay set of the train type.")

    # Total character count across all essays, hoisted once: it was
    # previously recomputed three times as the denominator of the
    # per-character rates below.
    total_chars = float(sum(len(t) for t in e_set._text))

    # Normal text (unstemmed) useful words/bigrams
    nvocab = util_functions.get_vocab(
        e_set._text, e_set._score, max_feats2=max_feats2)
    # Stemmed and spell corrected vocab useful words/ngrams
    svocab = util_functions.get_vocab(
        e_set._clean_stem_text, e_set._score, max_feats2=max_feats2)
    # Dictionaries (vectorizers) trained on the proper vocab
    self._normal_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=nvocab)
    self._stem_dict = CountVectorizer(ngram_range=(1, 2), vocabulary=svocab)
    self.dict_initialized = True

    # Average spelling errors in set. Needed later for spelling detection.
    self._mean_spelling_errors = sum(e_set._spelling_errors) / float(
        len(e_set._spelling_errors))
    self._spell_errors_per_character = sum(e_set._spelling_errors) / total_chars

    # Gets the number and positions of grammar errors
    good_pos_tags, bad_pos_positions = self._get_grammar_errors(
        e_set._pos, e_set._text, e_set._tokens)
    self._grammar_errors_per_character = sum(good_pos_tags) / total_chars

    # Generate bag of words features
    bag_feats = self.gen_bag_feats(e_set)
    # Sum of a row of bag of words features (topical words in an essay)
    f_row_sum = numpy.sum(bag_feats[:, :])
    # Average index of how "topical" essays are
    self._mean_f_prop = f_row_sum / total_chars
    return "ok"
def gen_feats(self, p_set):
    """
    Generates features based on an input p_set.

    p_set - PredictorSet
    Returns a 2d numpy array: the textual feature matrices from each
    extractor concatenated column-wise, followed by the numeric features.
    Raises util_functions.InputError if dictionaries were never initialized.
    """
    if not self._initialized:
        error_message = "Dictionaries have not been initialized."
        log.exception(error_message)
        raise util_functions.InputError(p_set, error_message)

    # One feature matrix per essay set, produced by the matching extractor.
    textual_features = [
        self._extractors[i].gen_feats(current_set)
        for i, current_set in enumerate(p_set._essay_sets)
    ]
    textual_matrix = numpy.concatenate(textual_features, axis=1)
    predictor_matrix = numpy.array(p_set._numeric_features)
    # Removed leftover debug print() calls of the matrix shapes.
    overall_matrix = numpy.concatenate((textual_matrix, predictor_matrix),
                                       axis=1)
    return overall_matrix.copy()
def update_prompt(self, prompt_text):
    """
    Update the default prompt string, which is "".

    prompt_text should be a string.
    Returns the (cleaned) prompt as a confirmation.
    Raises util_functions.InputError if prompt_text is not a string.
    """
    # isinstance is the idiomatic type check; the original compared
    # `type(prompt_text) == type("text")`, which also rejected str
    # subclasses.
    if not isinstance(prompt_text, str):
        raise util_functions.InputError(
            prompt_text, "Invalid prompt. Need to enter a string value.")
    self._prompt = util_functions.sub_chars(prompt_text)
    return self._prompt
def gen_bag_feats(self, e_set):
    """
    Generates bag of words features from an input essay set and trained
    FeatureExtractor.  Generally called by gen_feats.

    e_set - EssaySet object
    Returns a 2d numpy array of features (stemmed counts followed by
    normal-text counts, column-wise).
    Raises util_functions.InputError if dictionaries are not initialized.
    """
    # Guard clause: the dictionaries are created by initialize_dictionaries.
    if not hasattr(self, '_stem_dict'):
        raise util_functions.InputError(self, "Dictionaries must be initialized prior to generating bag features.")

    # Count-vectorize the stemmed/spell-corrected text and the raw text,
    # then place the two count matrices side by side.
    stem_counts = self._stem_dict.transform(e_set._clean_stem_text)
    normal_counts = self._normal_dict.transform(e_set._text)
    combined = numpy.concatenate(
        (stem_counts.toarray(), normal_counts.toarray()), axis=1)
    return combined.copy()
def add_essay(self, essay_text, essay_score, essay_generated=0):
    """
    Add new (essay_text,essay_score) pair to the essay set.
    essay_text must be a string.
    essay_score must be an int.
    essay_generated should not be changed by the user.
    Returns a confirmation that essay was added.

    NOTE(review): this is Python 2 code (`basestring`).  The ascii
    encode/decode dance below is best-effort sanitization; on failure the
    errors are logged and the type check afterwards decides whether to
    raise.  Also note the visible span assigns `ret` but does not show a
    `return ret` — presumably the return sits just past this chunk; confirm
    against the full file.
    """
    # Get maximum current essay id, or set to 0 if this is the first essay added
    if (len(self._id) > 0):
        max_id = max(self._id)
    else:
        max_id = 0
    # Verify that essay_score is an int, essay_text is a string, and essay_generated equals 0 or 1
    try:
        # Strip non-ascii characters; essays that become too short after
        # stripping are replaced with a sentinel string.
        essay_text = essay_text.encode('ascii', 'ignore')
        if len(essay_text) < 5:
            essay_text = "Invalid essay."
    except:
        log.exception("Could not parse essay into ascii.")
    try:
        # Try conversion of types; failure is only logged here because the
        # isinstance check below will reject bad values anyway.
        essay_score = int(essay_score)
        essay_text = str(essay_text)
    except:
        # Nothing needed here, will return error in any case.
        log.exception(
            "Invalid type for essay score : {0} or essay text : {1}".
            format(type(essay_score), type(essay_text)))
    if isinstance(essay_score,int) and isinstance(essay_text, basestring)\
            and (essay_generated == 0 or essay_generated == 1):
        self._id.append(max_id + 1)
        self._score.append(essay_score)
        # Clean text by removing non digit/work/punctuation characters
        try:
            essay_text = str(essay_text.encode('ascii', 'ignore'))
        except:
            # Fallback for byte strings that are not valid ascii: decode as
            # utf-8 with replacement first, then re-encode to ascii.
            essay_text = (essay_text.decode('utf-8', 'replace')).encode(
                'ascii', 'ignore')
        cleaned_essay = util_functions.sub_chars(essay_text).lower()
        # Truncate overly long essays to keep downstream processing bounded.
        if (len(cleaned_essay) > MAXIMUM_ESSAY_LENGTH):
            cleaned_essay = cleaned_essay[0:MAXIMUM_ESSAY_LENGTH]
        self._text.append(cleaned_essay)
        # Spell correct text using aspell
        cleaned_text, spell_errors, markup_text = util_functions.spell_correct(
            self._text[len(self._text) - 1])
        self._clean_text.append(cleaned_text)
        self._spelling_errors.append(spell_errors)
        self._markup_text.append(markup_text)
        # Tokenize text
        self._tokens.append(
            nltk.word_tokenize(self._clean_text[len(self._clean_text) - 1]))
        # Part of speech tag text.  NOTE(review): tagging uses a plain
        # split(" ") of the cleaned text rather than the nltk tokens stored
        # above, so token and POS lists can disagree in length — confirm
        # this is intentional before relying on alignment.
        self._pos.append(
            nltk.pos_tag(self._clean_text[len(self._clean_text) - 1].split(" ")))
        self._generated.append(essay_generated)
        # Stem spell corrected text
        porter = nltk.PorterStemmer()
        por_toks = " ".join(
            [porter.stem(w) for w in self._tokens[len(self._tokens) - 1]])
        self._clean_stem_text.append(por_toks)
        ret = "text: " + self._text[len(self._text) - 1] + " score: " + str(essay_score)
    else:
        raise util_functions.InputError(
            essay_text, "arguments need to be in format "
            "(text,score). text needs to be string,"
            " score needs to be int.")
def add_row(self, numeric_features, textual_features, target):
    """
    Validate and append one row of training data.

    numeric_features - list of values coercible to float
    textual_features - list of values coercible to ascii strings
    target - numeric target value
    Raises util_functions.InputError on any type/length problem.
    Note: the input lists are coerced in place (floats / ascii strings),
    matching the original behavior.
    """
    # --- basic type validation -------------------------------------------
    if not isinstance(target, (int, long, float)):
        error_message = "Target is not a numeric value."
        log.exception(error_message)
        raise util_functions.InputError(target, error_message)
    if not isinstance(numeric_features, list):
        error_message = "Numeric features are not a list."
        log.exception(error_message)
        raise util_functions.InputError(numeric_features, error_message)
    if not isinstance(textual_features, list):
        error_message = "Textual features are not a list."
        log.exception(error_message)
        raise util_functions.InputError(textual_features, error_message)

    # --- length consistency with previously added rows -------------------
    if self._numeric_features:
        if len(numeric_features) != len(self._numeric_features[-1]):
            error_message = "Numeric features are an improper length."
            log.exception(error_message)
            raise util_functions.InputError(numeric_features, error_message)
    if self._textual_features:
        if len(textual_features) != len(self._textual_features[-1]):
            error_message = "Textual features are an improper length."
            log.exception(error_message)
            raise util_functions.InputError(textual_features, error_message)

    # --- element-wise coercion (in place) --------------------------------
    for idx, value in enumerate(numeric_features):
        try:
            numeric_features[idx] = float(value)
        except:
            error_message = "Numeric feature {0} not numeric.".format(value)
            log.exception(error_message)
            raise util_functions.InputError(numeric_features, error_message)
    for idx, value in enumerate(textual_features):
        try:
            textual_features[idx] = str(value.encode('ascii', 'ignore'))
        except:
            error_message = "Textual feature {0} not string.".format(value)
            log.exception(error_message)
            raise util_functions.InputError(textual_features, error_message)

    # --- lazily create one essay set per textual feature column ----------
    if not self._textual_features:
        for _ in textual_features:
            self._essay_sets.append(
                essay_set.EssaySet(essaytype=self._type))

    # Record the row.
    self._numeric_features.append(numeric_features)
    self._textual_features.append(textual_features)
    self._target.append(target)

    # Feed each textual feature into its corresponding essay set.
    for column, text in enumerate(textual_features):
        self._essay_sets[column].add_essay(text, target)