Exemple #1
0
    def initialize_dictionaries(self, p_set):
        """
        Initialize one FeatureExtractor per textual input of a PredictorSet.

        p_set - PredictorSet object that has had data fed in; its _type must
                be "train".

        Returns True when at least one extractor was initialized. Returns
        False when p_set holds no essay sets; in that case this object's
        state is left untouched.
        Raises util_functions.InputError if p_set has no _type attribute or
        its _type is not "train".
        """
        # A missing _type and a non-"train" _type are rejected with the same
        # message, so a single check covers both cases.
        if getattr(p_set, '_type', None) != "train":
            error_message = "needs to be an essay set of the train type."
            # No exception is being handled here, so log.error is used;
            # log.exception is only meant to be called from an except block.
            log.error(error_message)
            raise util_functions.InputError(p_set, error_message)

        essay_sets = p_set._essay_sets

        # Ensures that even with a large amount of input textual features,
        # training time stays reasonable: the budget of 200 is split evenly
        # across the essay sets (guarding against division by zero).
        max_feats2 = 200 // max(len(essay_sets), 1)

        # Build the extractors in a local list and publish them only once all
        # of them have been initialized. This keeps self._extractors aligned
        # index-for-index with p_set._essay_sets even when this method is
        # called more than once, and avoids flagging the object as
        # initialized when a later extractor fails.
        extractors = []
        for e_set in essay_sets:
            extractor = FeatureExtractor()
            extractor.initialize_dictionaries(e_set, max_feats2=max_feats2)
            extractors.append(extractor)

        if not extractors:
            return False

        self._extractors = extractors
        self._initialized = True
        return True
 def initialize_dictionaries(self, e_set, max_feats2=200):
     """
     Train this extractor's vocabularies and baseline statistics on an essay set.
     Dictionaries must be initialized with this method before the extractor
     is used to extract features.
     e_set - input essay set; its _type must be "train"
     max_feats2 - forwarded to util_functions.get_vocab for both vocabularies
     Returns "ok" as a confirmation of initialization.
     Raises util_functions.InputError when e_set has no _type attribute or
     is not of the train type.
     """
     # Reject unusable input first so the main path stays flat
     if not hasattr(e_set, '_type'):
         raise util_functions.InputError(
             e_set, "wrong input. need an essay set object")
     if e_set._type != "train":
         raise util_functions.InputError(
             e_set, "needs to be an essay set of the train type.")

     # Vocabulary built from the normal (unstemmed) text
     normal_vocab = util_functions.get_vocab(
         e_set._text, e_set._score, max_feats2=max_feats2)
     # Vocabulary built from the stemmed and spell corrected text
     stem_vocab = util_functions.get_vocab(
         e_set._clean_stem_text, e_set._score, max_feats2=max_feats2)

     # Unigram/bigram counters restricted to the vocabularies above
     self._normal_dict = CountVectorizer(
         ngram_range=(1, 2), vocabulary=normal_vocab)
     self._stem_dict = CountVectorizer(
         ngram_range=(1, 2), vocabulary=stem_vocab)
     self.dict_initialized = True

     # Average spelling errors in set. needed later for spelling detection
     spelling_error_total = sum(e_set._spelling_errors)
     self._mean_spelling_errors = spelling_error_total / float(
         len(e_set._spelling_errors))

     # Total number of characters over all essays; every per-character
     # statistic below is normalized by this value
     total_characters = float(sum([len(text) for text in e_set._text]))
     self._spell_errors_per_character = (
         spelling_error_total / total_characters)

     # Only the first value returned by _get_grammar_errors is used here;
     # its sum is normalized per character
     grammar_result = self._get_grammar_errors(
         e_set._pos, e_set._text, e_set._tokens)
     first_grammar_value = grammar_result[0]
     self._grammar_errors_per_character = (
         sum(first_grammar_value) / total_characters)

     # Generate bag of words features and add up every entry of the matrix
     bag_feats = self.gen_bag_feats(e_set)
     bag_total = numpy.sum(bag_feats[:, :])
     # Average index of how "topical" essays are
     self._mean_f_prop = bag_total / total_characters

     return "ok"
Exemple #3
0
    def gen_feats(self, p_set):
        """
        Generate the combined feature matrix for an input p_set.

        p_set - PredictorSet; its essay sets are matched by position with the
                extractors stored on this object.

        Returns a new numpy array made of the textual features produced by
        each extractor followed by p_set's numeric features, joined
        column-wise (axis=1).
        Raises util_functions.InputError if the dictionaries have not been
        initialized.
        """
        if not self._initialized:
            error_message = "Dictionaries have not been initialized."
            # No exception is being handled here, so log.error is used;
            # log.exception is only meant to be called from an except block.
            log.error(error_message)
            raise util_functions.InputError(p_set, error_message)

        # Extractor i handles essay set i; indexing (rather than zip) keeps
        # the IndexError if there are fewer extractors than essay sets.
        textual_features = [
            self._extractors[i].gen_feats(e_set)
            for i, e_set in enumerate(p_set._essay_sets)
        ]

        textual_matrix = numpy.concatenate(textual_features, axis=1)
        predictor_matrix = numpy.array(p_set._numeric_features)

        # numpy.concatenate always allocates a fresh array, so the result can
        # be handed back directly without an additional copy.
        return numpy.concatenate((textual_matrix, predictor_matrix), axis=1)
Exemple #4
0
 def update_prompt(self, prompt_text):
     """
     Update the default prompt string, which is "".
     prompt_text should be a string (str or a subclass of str).
     Returns the stored prompt as a confirmation.
     Raises util_functions.InputError if prompt_text is not a string.
     """
     # isinstance is used instead of an exact type comparison so that
     # subclasses of str are accepted as well
     if not isinstance(prompt_text, str):
         raise util_functions.InputError(
             prompt_text, "Invalid prompt. Need to enter a string value.")

     # The prompt is stored after being passed through util_functions.sub_chars
     self._prompt = util_functions.sub_chars(prompt_text)
     return self._prompt
Exemple #5
0
 def gen_bag_feats(self, e_set):
     """
     Build the bag of words feature array for an essay set.
     Requires a trained FeatureExtractor (one that has a _stem_dict).
     Generally called by gen_feats.
     e_set - EssaySet object
     Returns a dense array: the stemmed-text features followed, column-wise,
     by the normal-text features.
     Raises util_functions.InputError if the dictionaries are missing.
     """
     # Bail out early when the dictionaries were never set up
     if not hasattr(self, '_stem_dict'):
         raise util_functions.InputError(self, "Dictionaries must be initialized prior to generating bag features.")

     # Transform both text variants first, then densify them
     stem_counts = self._stem_dict.transform(e_set._clean_stem_text)
     normal_counts = self._normal_dict.transform(e_set._text)
     dense_parts = (stem_counts.toarray(), normal_counts.toarray())

     combined = numpy.concatenate(dense_parts, axis=1)
     return combined.copy()
Exemple #6
0
    def add_essay(self, essay_text, essay_score, essay_generated=0):
        """
        Add new (essay_text,essay_score) pair to the essay set.

        essay_text must be a string.
        essay_score must be an int (a value int() can convert is accepted).
        essay_generated should not be changed by the user; it must be 0 or 1.

        Returns a confirmation string containing the stored text and score.
        Raises util_functions.InputError if the arguments fail validation.
        """
        # Get maximum current essay id, or set to 0 if this is the first essay added
        max_id = max(self._id) if self._id else 0

        # Best-effort reduction to ascii; text shorter than 5 characters is
        # replaced by a placeholder. Failures are logged and validation below
        # decides whether the value is still usable.
        try:
            essay_text = essay_text.encode('ascii', 'ignore')
            if len(essay_text) < 5:
                essay_text = "Invalid essay."
        except Exception:
            log.exception("Could not parse essay into ascii.")

        try:
            # Try conversion of types
            essay_score = int(essay_score)
            essay_text = str(essay_text)
        except Exception:
            # Nothing else needed here: the validation below raises the error
            log.exception(
                "Invalid type for essay score : {0} or essay text : {1}".
                format(type(essay_score), type(essay_text)))

        # Verify that essay_score is an int, essay_text is a string, and
        # essay_generated equals 0 or 1
        arguments_valid = (isinstance(essay_score, int)
                           and isinstance(essay_text, basestring)
                           and essay_generated in (0, 1))
        if not arguments_valid:
            raise util_functions.InputError(
                essay_text, "arguments need to be in format "
                "(text,score). text needs to be string,"
                " score needs to be int.")

        # Reduce to ascii once more, falling back to a utf-8 decode when the
        # direct encode fails
        try:
            essay_text = str(essay_text.encode('ascii', 'ignore'))
        except Exception:
            essay_text = (essay_text.decode('utf-8', 'replace')).encode(
                'ascii', 'ignore')

        # Clean the text with util_functions.sub_chars, lower-case it and cap
        # its length at MAXIMUM_ESSAY_LENGTH
        cleaned_essay = util_functions.sub_chars(essay_text).lower()
        cleaned_essay = cleaned_essay[:MAXIMUM_ESSAY_LENGTH]

        # Spell correct the cleaned text
        cleaned_text, spell_errors, markup_text = util_functions.spell_correct(
            cleaned_essay)
        # Tokenize text
        tokens = nltk.word_tokenize(cleaned_text)
        # Part of speech tag text (split on single spaces)
        pos_tags = nltk.pos_tag(cleaned_text.split(" "))
        # Stem spell corrected text
        porter = nltk.PorterStemmer()
        stemmed_text = " ".join([porter.stem(w) for w in tokens])

        # Everything has been computed, so record the essay now. Appending
        # only at this point keeps the parallel lists the same length even if
        # one of the processing steps above raises.
        self._id.append(max_id + 1)
        self._score.append(essay_score)
        self._text.append(cleaned_essay)
        self._clean_text.append(cleaned_text)
        self._spelling_errors.append(spell_errors)
        self._markup_text.append(markup_text)
        self._tokens.append(tokens)
        self._pos.append(pos_tags)
        self._generated.append(essay_generated)
        self._clean_stem_text.append(stemmed_text)

        # Return the confirmation promised by the docstring
        return "text: " + cleaned_essay + " score: " + str(essay_score)
Exemple #7
0
    def add_row(self, numeric_features, textual_features, target):
        """
        Validate one row of data and append it to the predictor set.

        numeric_features - list of values that float() can convert
        textual_features - list of strings
        target - numeric value (int, long or float) for the row

        Each row must contain as many numeric features, and as many textual
        features, as the row added before it. When the first row is added,
        one essay set is created per textual feature; every textual feature
        is then added to its essay set with target as the score.

        The lists passed in are not modified: converted copies are stored.

        Raises util_functions.InputError when any of the checks fails.
        """
        def reject(value, message):
            # Validation failure with no exception being handled, so this is
            # logged with log.error rather than log.exception.
            log.error(message)
            raise util_functions.InputError(value, message)

        #Basic input checking
        if not isinstance(target, (int, long, float)):
            reject(target, "Target is not a numeric value.")

        if not isinstance(numeric_features, list):
            reject(numeric_features, "Numeric features are not a list.")

        if not isinstance(textual_features, list):
            reject(textual_features, "Textual features are not a list.")

        #Do some length checking for parameters
        if (self._numeric_features
                and len(self._numeric_features[-1]) != len(numeric_features)):
            reject(numeric_features,
                   "Numeric features are an improper length.")

        if (self._textual_features
                and len(self._textual_features[-1]) != len(textual_features)):
            reject(textual_features,
                   "Textual features are an improper length.")

        #Now check to see if text features and numeric features are individually correct
        # Conversions go into new lists so that a failure part-way through
        # does not leave the caller's list half converted.
        numeric_row = []
        for value in numeric_features:
            try:
                numeric_row.append(float(value))
            except Exception:
                error_message = "Numeric feature {0} not numeric.".format(
                    value)
                log.exception(error_message)
                raise util_functions.InputError(numeric_features,
                                                error_message)

        textual_row = []
        for value in textual_features:
            try:
                textual_row.append(str(value.encode('ascii', 'ignore')))
            except Exception:
                error_message = "Textual feature {0} not string.".format(
                    value)
                log.exception(error_message)
                raise util_functions.InputError(textual_features,
                                                error_message)

        #Create essay sets for textual features if needed
        if not self._textual_features:
            for _ in textual_row:
                self._essay_sets.append(
                    essay_set.EssaySet(essaytype=self._type))

        #Add numeric and textual features
        self._numeric_features.append(numeric_row)
        self._textual_features.append(textual_row)

        #Add targets
        self._target.append(target)

        #Add textual features to essay sets
        for i, text in enumerate(textual_row):
            self._essay_sets[i].add_essay(text, target)