Beispiel #1
0
 def generate_additional_essays(self, e_text, e_score, dictionary=None, max_syns=3):
     """
     Substitute synonyms to generate extra essays from an existing one.
     This is done to increase the amount of training data.
     Should only be used with lowest scoring essays.

     e_text is the text of the original essay.
     e_score is the score of the original essay; every generated essay is added with this score.
     dictionary is an optional collection of words eligible for replacement.  When None,
     every token that has enough synonyms is replaced.
     max_syns is the number of additional essays to generate, and also the maximum number of
     synonyms kept per token.  Do not set too high.

     Essay number i is built from the ORIGINAL tokens by replacing each eligible token with its
     i-th synonym (tokens with fewer than i + 1 synonyms are left unchanged).
     Nothing is returned; each generated essay is passed to self.add_essay(text, e_score, 1).
     """
     # Re-seed the module-level RNG so the sampled synonyms are reproducible between runs.
     # NOTE: this resets the global `random` state for the whole process.
     random.seed(1)
     e_toks = nltk.word_tokenize(e_text)

     # One synonym list per token, capped at max_syns entries.
     all_syns = []
     for word in e_toks:
         synonyms = util_functions.get_wordnet_syns(word)
         if len(synonyms) > max_syns:
             synonyms = random.sample(synonyms, max_syns)
         all_syns.append(synonyms)

     new_essays = []
     for i in range(max_syns):
         # Work on a copy: assigning e_toks directly would alias the list, so substitutions
         # from earlier rounds would leak into later essays and into the dictionary check.
         syn_toks = list(e_toks)
         for z, candidates in enumerate(all_syns):
             if len(candidates) > i and (dictionary is None or e_toks[z] in dictionary):
                 syn_toks[z] = candidates[i]
         new_essays.append(" ".join(syn_toks))

     for essay in new_essays:
         self.add_essay(essay, e_score, 1)
Beispiel #2
0
    def gen_prompt_feats(self, e_set):
        """
        Generates prompt based features from an essay set object and its prompt.
        Generally called internally by gen_feats.

        e_set - EssaySet object; this method reads e_set._prompt and e_set._tokens.

        Returns a numpy array with one row per entry of e_set._tokens and four columns:
          0 - number of essay tokens that also occur in the tokenized prompt
          1 - column 0 divided by the essay's token count
          2 - number of essay tokens that occur among the WordNet synonyms of the prompt tokens
          3 - column 2 divided by the essay's token count
        An essay with no tokens uses a divisor of 1, so its proportions are 0.
        """
        prompt_toks = nltk.word_tokenize(e_set._prompt)

        # Build the lookup sets once: membership against a set is constant time, whereas the
        # plain lists were scanned in full for every token of every essay.
        # Assumes tokens and synonyms are hashable (strings) -- TODO confirm against callers.
        prompt_vocab = set(prompt_toks)
        expand_vocab = set(chain.from_iterable(
            util_functions.get_wordnet_syns(word) for word in prompt_toks))

        prompt_overlap = []
        prompt_overlap_prop = []
        expand_overlap = []
        expand_overlap_prop = []
        for toks in e_set._tokens:
            # Avoid dividing by zero for empty essays.
            tok_length = float(len(toks) or 1)
            in_prompt = sum(1 for tok in toks if tok in prompt_vocab)
            in_expand = sum(1 for tok in toks if tok in expand_vocab)
            prompt_overlap.append(in_prompt)
            prompt_overlap_prop.append(in_prompt / tok_length)
            expand_overlap.append(in_expand)
            expand_overlap_prop.append(in_expand / tok_length)

        # Stack the four feature lists as rows, then transpose so each essay becomes a row.
        prompt_arr = numpy.array((prompt_overlap, prompt_overlap_prop, expand_overlap, expand_overlap_prop)).transpose()

        # transpose() returns a view; hand back an independent array.
        return prompt_arr.copy()
Beispiel #3
0
 def generate_additional_essays(self,
                                e_text,
                                e_score,
                                dict=None,
                                max_syns=3):
     """
     Substitute synonyms to generate extra essays from an existing one.
     This is done to increase the amount of training data.
     Should only be used with lowest scoring essays.

     e_text is the text of the original essay.
     e_score is the score of the original essay; it is reused for every generated essay.
     dict is an optional collection of words that may be replaced.  When None, any token
     with enough synonyms is replaced.  (The name shadows the builtin but is kept because
     callers may pass it by keyword.)
     max_syns is the number of additional essays to generate and the cap on synonyms kept
     per token.  Do not set too high.

     The i-th generated essay replaces each eligible token of the ORIGINAL essay with that
     token's i-th synonym; tokens without an i-th synonym stay as they are.
     Returns nothing; every generated essay is handed to self.add_essay(text, e_score, 1).
     """
     # Fixed seed keeps the synonym sampling reproducible.
     # NOTE: this resets the process-wide `random` state.
     random.seed(1)
     e_toks = nltk.word_tokenize(e_text)

     # Collect at most max_syns synonyms for each token, in token order.
     all_syns = []
     for word in e_toks:
         synonyms = util_functions.get_wordnet_syns(word)
         if len(synonyms) > max_syns:
             synonyms = random.sample(synonyms, max_syns)
         all_syns.append(synonyms)

     new_essays = []
     for i in range(max_syns):
         # Copy the token list.  Binding syn_toks to e_toks itself would mutate the original
         # tokens, so each round would build on the previous round's replacements and the
         # membership test below would see already-substituted words.
         syn_toks = e_toks[:]
         for z, original_tok in enumerate(e_toks):
             replaceable = dict is None or original_tok in dict
             if len(all_syns[z]) > i and replaceable:
                 syn_toks[z] = all_syns[z][i]
         new_essays.append(" ".join(syn_toks))

     for essay_text in new_essays:
         self.add_essay(essay_text, e_score, 1)
Beispiel #4
0
    def gen_prompt_feats(self, e_set):
        """
        Generates prompt based features from an essay set object and its prompt.
        Generally called internally by gen_feats.

        e_set - EssaySet object; e_set._prompt and e_set._tokens are read.

        Returns a numpy array holding one row per entry in e_set._tokens, with columns:
          0 - count of essay tokens found in the tokenized prompt
          1 - that count as a proportion of the essay's token count
          2 - count of essay tokens found among WordNet synonyms of the prompt tokens
          3 - that count as a proportion of the essay's token count
        Empty essays are divided by 1 instead of 0, giving proportions of 0.
        """
        prompt_toks = nltk.word_tokenize(e_set._prompt)
        expand_syns = chain.from_iterable(
            util_functions.get_wordnet_syns(word) for word in prompt_toks)

        # Convert both vocabularies to sets up front so each token lookup is O(1) rather than
        # a linear scan of a list repeated for every token of every essay.
        # Assumes tokens and synonyms are hashable strings -- TODO confirm.
        prompt_lookup = set(prompt_toks)
        expand_lookup = set(expand_syns)

        prompt_overlap = []
        prompt_overlap_prop = []
        expand_overlap = []
        expand_overlap_prop = []
        for essay_toks in e_set._tokens:
            tok_length = len(essay_toks)
            if tok_length == 0:
                # Prevent a ZeroDivisionError for essays without tokens.
                tok_length = 1
            prompt_hits = len([tok for tok in essay_toks if tok in prompt_lookup])
            expand_hits = len([tok for tok in essay_toks if tok in expand_lookup])
            prompt_overlap.append(prompt_hits)
            prompt_overlap_prop.append(prompt_hits / float(tok_length))
            expand_overlap.append(expand_hits)
            expand_overlap_prop.append(expand_hits / float(tok_length))

        # Four feature rows -> transpose -> one row per essay.
        prompt_arr = numpy.array((prompt_overlap, prompt_overlap_prop, expand_overlap, expand_overlap_prop)).transpose()

        # Copy so the caller receives a standalone array rather than a transposed view.
        return prompt_arr.copy()