def generate_additional_essays(self, e_text, e_score, dictionary=None, max_syns=3):
    """
    Substitute synonyms to generate extra essays from existing ones.
    This is done to increase the amount of training data.
    Should only be used with lowest scoring essays.

    Up to max_syns variants of the essay are built. In variant i, every
    eligible token that has more than i candidate synonyms is replaced by its
    i-th candidate; all other tokens keep their original text. Each variant is
    then registered through self.add_essay(text, e_score, 1).

    e_text is the text of the original essay.
    e_score is the score of the original essay.
    dictionary is a fixed dictionary (list) of words to replace; when it is
        None (the default) every token is eligible for replacement.
    max_syns defines the maximum number of additional essays to generate.
        Do not set too high.
    """
    # Fixed seed keeps the sampled synonyms reproducible between runs.
    # NOTE: this reseeds the module-level random generator.
    random.seed(1)
    e_toks = nltk.word_tokenize(e_text)

    # all_syns[z] holds at most max_syns candidate synonyms for e_toks[z].
    all_syns = []
    for word in e_toks:
        synonyms = util_functions.get_wordnet_syns(word)
        if len(synonyms) > max_syns:
            synonyms = random.sample(synonyms, max_syns)
        all_syns.append(synonyms)

    new_essays = []
    for i in range(max_syns):
        # Work on a copy: assigning e_toks directly would alias the list, so
        # every variant would overwrite the original tokens and the
        # dictionary test below would run against already-substituted words.
        syn_toks = list(e_toks)
        for z, candidates in enumerate(all_syns):
            if len(candidates) > i and (dictionary is None or e_toks[z] in dictionary):
                syn_toks[z] = candidates[i]
        new_essays.append(" ".join(syn_toks))

    # Third argument is passed through unchanged from the original code;
    # presumably it marks the essay as generated -- confirm against add_essay.
    for essay in new_essays:
        self.add_essay(essay, e_score, 1)
def gen_prompt_feats(self, e_set):
    """
    Generates prompt based features from an essay set object and internal prompt variable.
    Generally called internally by gen_feats

    Returns an array of prompt features with one row per entry of
    e_set._tokens and four columns:
        0 - count of essay tokens that also appear among the prompt tokens
        1 - column 0 divided by the essay's token count
        2 - count of essay tokens that appear among the synonyms returned by
            util_functions.get_wordnet_syns for the prompt tokens
        3 - column 2 divided by the essay's token count
    An essay with no tokens uses a divisor of 1, so its proportions are 0.

    e_set - EssaySet object
    """
    prompt_tok_list = nltk.word_tokenize(e_set._prompt)

    # Sets give constant-time membership tests in the per-token loops below;
    # the counts are the same as with list membership.
    # Assumes tokens and synonyms are hashable (strings) -- TODO confirm.
    prompt_toks = set(prompt_tok_list)
    expand_syns = set(chain.from_iterable(
        util_functions.get_wordnet_syns(word) for word in prompt_tok_list
    ))

    prompt_overlap = []
    prompt_overlap_prop = []
    expand_overlap = []
    expand_overlap_prop = []
    for essay_toks in e_set._tokens:
        # Avoid dividing by zero for an essay with no tokens.
        tok_length = float(len(essay_toks) or 1)

        in_prompt = sum(1 for tok in essay_toks if tok in prompt_toks)
        prompt_overlap.append(in_prompt)
        prompt_overlap_prop.append(in_prompt / tok_length)

        in_expanded = sum(1 for tok in essay_toks if tok in expand_syns)
        expand_overlap.append(in_expanded)
        expand_overlap_prop.append(in_expanded / tok_length)

    # Stack the four feature lists as rows, then transpose so that each essay
    # becomes a row. transpose() yields a view, so copy() returns an
    # independent array to the caller.
    prompt_arr = numpy.array(
        (prompt_overlap, prompt_overlap_prop, expand_overlap, expand_overlap_prop)
    ).transpose()
    return prompt_arr.copy()
def generate_additional_essays(self, e_text, e_score, dict=None, max_syns=3):
    """
    Substitute synonyms to generate extra essays from existing ones.
    This is done to increase the amount of training data.
    Should only be used with lowest scoring essays.

    Up to max_syns variants of the essay are built. In variant i, every
    eligible token that has more than i candidate synonyms is replaced by its
    i-th candidate; all other tokens keep their original text. Each variant is
    then registered through self.add_essay(text, e_score, 1).

    e_text is the text of the original essay.
    e_score is the score of the original essay.
    dict is a fixed dictionary (list) of words to replace; when it is None
        (the default) every token is eligible for replacement.
    max_syns defines the maximum number of additional essays to generate.
        Do not set too high.
    """
    # NOTE: the parameter name "dict" shadows the builtin inside this method.
    # It is kept because callers may pass it by keyword.

    # Fixed seed keeps the sampled synonyms reproducible between runs.
    # NOTE: this reseeds the module-level random generator.
    random.seed(1)
    e_toks = nltk.word_tokenize(e_text)

    # all_syns[z] holds at most max_syns candidate synonyms for e_toks[z].
    all_syns = []
    for word in e_toks:
        synonyms = util_functions.get_wordnet_syns(word)
        if len(synonyms) > max_syns:
            synonyms = random.sample(synonyms, max_syns)
        all_syns.append(synonyms)

    new_essays = []
    for i in range(max_syns):
        # Work on a copy: assigning e_toks directly would alias the list, so
        # every variant would overwrite the original tokens and the
        # dictionary test below would run against already-substituted words.
        syn_toks = list(e_toks)
        for z, candidates in enumerate(all_syns):
            if len(candidates) > i and (dict is None or e_toks[z] in dict):
                syn_toks[z] = candidates[i]
        new_essays.append(" ".join(syn_toks))

    # Third argument is passed through unchanged from the original code;
    # presumably it marks the essay as generated -- confirm against add_essay.
    for essay in new_essays:
        self.add_essay(essay, e_score, 1)
def gen_prompt_feats(self, e_set):
    """
    Generates prompt based features from an essay set object and internal prompt variable.
    Generally called internally by gen_feats

    Returns an array of prompt features with one row per entry of
    e_set._tokens and four columns:
        0 - count of essay tokens that also appear among the prompt tokens
        1 - column 0 divided by the essay's token count
        2 - count of essay tokens that appear among the synonyms returned by
            util_functions.get_wordnet_syns for the prompt tokens
        3 - column 2 divided by the essay's token count
    An essay with no tokens uses a divisor of 1, so its proportions are 0.

    e_set - EssaySet object
    """
    prompt_tok_list = nltk.word_tokenize(e_set._prompt)

    # Sets give constant-time membership tests in the per-token loops below;
    # the counts are the same as with list membership.
    # Assumes tokens and synonyms are hashable (strings) -- TODO confirm.
    prompt_toks = set(prompt_tok_list)
    expand_syns = set(chain.from_iterable(
        util_functions.get_wordnet_syns(word) for word in prompt_tok_list
    ))

    prompt_overlap = []
    prompt_overlap_prop = []
    expand_overlap = []
    expand_overlap_prop = []
    for essay_toks in e_set._tokens:
        # Avoid dividing by zero for an essay with no tokens.
        tok_length = float(len(essay_toks) or 1)

        in_prompt = sum(1 for tok in essay_toks if tok in prompt_toks)
        prompt_overlap.append(in_prompt)
        prompt_overlap_prop.append(in_prompt / tok_length)

        in_expanded = sum(1 for tok in essay_toks if tok in expand_syns)
        expand_overlap.append(in_expanded)
        expand_overlap_prop.append(in_expanded / tok_length)

    # Stack the four feature lists as rows, then transpose so that each essay
    # becomes a row. transpose() yields a view, so copy() returns an
    # independent array to the caller.
    prompt_arr = numpy.array(
        (prompt_overlap, prompt_overlap_prop, expand_overlap, expand_overlap_prop)
    ).transpose()
    return prompt_arr.copy()