def CalCosDist(self, ans_sentencelist, std_sen):
    """Return the best cosine-style match of STD_SEN among student sentences.

    For every student sentence a similarity score qs / sqrt(q * s) is
    computed, where the sums run over the words of the standard sentence
    only (q: squared standard frequencies, s: squared student frequencies,
    qs: their products). When a standard word has zero frequency in the
    student sentence and term expansion is enabled, the most frequent
    student synonym (and, failing that, ancestor/hypernym term) is used in
    its place, with its frequency multiplied by a scale factor.

    Args:
        ans_sentencelist: iterable of student sentence hashes; each is read
            via the keys 'StuSVec' and 'No' (presumably word -> frequency
            and a sentence number; confirm against the caller).
        std_sen: standard sentence hash, read via the key 'KeySVec'
            (presumably word -> frequency).

    Returns:
        Tuple (max_cos, match_sen). match_sen is None and max_cos is 0 when
        no sentence scores above zero.

    Side effects:
        - stu_sen['ExpTerms'] is set on each scored sentence when either
          expansion option is enabled ("std->stu" strings).
        - match_sen['Selected'] is set to True when
          self.only_match_sentence_once is set; such sentences are skipped
          in later calls.
    """
    # TODO: Rework the already-matched check to be in terms of words not
    # sentences (e.g., in case student just gives one long sentence).
    # TODO: Scale ancestor weight by degree of generality.
    debug_print("Answer.CalCosDist%s" % str((ans_sentencelist, std_sen)), level=6)
    match_sen = None
    max_cos = 0
    apply_term_expansion = (self.apply_synonym_expansion or self.apply_ancestor_expansion)
    std_vec = std_sen['KeySVec']
    # Materialize the keys so the same list is safely reused in the loops
    # below and in list_difference (keys() is a live view under Python 3).
    all_std_words = list(std_vec.keys())

    for stu_sen in ans_sentencelist:
        # Make sure student sentence not already matched
        # Note: 'in' replaces dict.has_key, which no longer exists in Python 3.
        if self.only_match_sentence_once and ('Selected' in stu_sen):
            debug_print("Ignoring already matched sentence %s" % stu_sen['No'])
            continue
        stu_vec = stu_sen['StuSVec']

        # Compute measure for current sentence
        q, s, qs = 0, 0, 0
        exp_terms = []
        for std_word in all_std_words:
            std_freq = std_vec[std_word]
            stu_freq = stu_vec.get(std_word, 0)

            # If a standard word doesn't occur in the student sentence, then
            # apply term expansion by checking for most frequent synonym
            # and/or ancestor term that does occur.
            # Note: Ancestor terms might be too general, so not checked if
            # synonym found. Also, expansions omit standard terms to avoid
            # counting evidence twice.
            if (stu_freq == 0) and apply_term_expansion:
                stu_word = std_word
                scale_factor = 1.0

                # Check synonyms (e.g., attorney for lawyer), excluding words in standard
                if self.apply_synonym_expansion:
                    debug_print("Checking for synonym of standard term '%s' among student terms" % std_word, level=5)
                    synonyms = list_difference(wordnet.get_synonyms(std_word), all_std_words)
                    exp_word = find_most_freq_term(synonyms, stu_vec)
                    if exp_word and (exp_word != stu_word):
                        # Note: Uses frequency from student vector for synonym term
                        stu_word = exp_word
                        scale_factor = self.synonym_scale_factor
                        debug_print("Using (student) synonym '%s' to match (standard) word '%s'" % (exp_word, std_word), level=4)

                # Check ancestors (e.g., professional for lawyer), excluding words in standard
                if self.apply_ancestor_expansion and (stu_word == std_word):
                    debug_print("Checking for ancestor of standard term '%s' among student terms" % std_word, level=5)
                    ancestors = list_difference(wordnet.get_hypernym_terms(std_word, self.max_ancestor_links), all_std_words)
                    exp_word = find_most_freq_term(ancestors, stu_vec)
                    if exp_word and (exp_word != stu_word):
                        # As before, uses frequency from student vector for expansion term
                        stu_word = exp_word
                        # Ancestor matches get their own weight when the object
                        # defines one; otherwise the synonym weight is used, as
                        # this code did previously.
                        scale_factor = getattr(self, 'ancestor_scale_factor', None)
                        if scale_factor is None:
                            scale_factor = self.synonym_scale_factor
                        debug_print("Using (student) ancestor term '%s' to match (standard) word '%s'" % (exp_word, std_word), level=4)

                # Update frequency and make note of expansion for posthoc diagnosis
                if stu_word != std_word:
                    stu_freq = stu_vec[stu_word] * scale_factor
                    debug_print("Scaled frequency score from %f to %f" % (stu_vec[stu_word], stu_freq), level=7)
                    exp_terms.append(std_word + "->" + stu_word)

            # Do component-wise update
            debug_print("deltas: q=%f s=%f qs=%f" % (std_freq * std_freq, stu_freq * stu_freq, std_freq * stu_freq), level=6)
            q += std_freq * std_freq
            s += stu_freq * stu_freq
            qs += std_freq * stu_freq
            debug_print("q=%f s=%f qs=%f" % (q, s, qs), level=7)

        # Guard against division by zero when either vector is all zeros
        if q == 0 or s == 0:
            qs_cos = 0
        else:
            qs_cos = qs / (math.sqrt(q * s))
        if apply_term_expansion:
            stu_sen['ExpTerms'] = exp_terms

        # Update max score; sentences without any positive-frequency word
        # are never selected.
        if qs_cos > max_cos and any(stu_vec[word] > 0 for word in stu_vec):
            max_cos = qs_cos
            match_sen = stu_sen

    # Optionally remove the matched sentence from further consideration
    if self.only_match_sentence_once and match_sen:
        match_sen['Selected'] = True
    debug_print("Answer.CalCosDist(%s,_) => %s" % (str(ans_sentencelist), str((max_cos, match_sen))), level=6)
    return max_cos, match_sen
def CalCosDist(self, ans_sentencelist, std_sen):
    """Return the best cosine-style match of STD_SEN among student sentences.

    For every student sentence a similarity score qs / sqrt(q * s) is
    computed, where the sums run over the words of the standard sentence
    only (q: squared standard frequencies, s: squared student frequencies,
    qs: their products). When a standard word has zero frequency in the
    student sentence and term expansion is enabled, the most frequent
    student synonym (and, failing that, ancestor/hypernym term) is used in
    its place, with its frequency multiplied by the corresponding scale
    factor.

    Args:
        ans_sentencelist: iterable of student sentence hashes; each is read
            via the keys 'StuSVec' and 'No' (presumably word -> frequency
            and a sentence number; confirm against the caller).
        std_sen: standard sentence hash, read via the key 'KeySVec'
            (presumably word -> frequency).

    Returns:
        Tuple (max_cos, match_sen, best_matching_stu_words). The last item
        lists the student words (possibly expansion terms) that contributed
        a positive product for the winning sentence. When no sentence
        scores above zero the result is (0, None, []).

    Side effects:
        - With self.only_match_word_tokens_once, a working copy of each
          sentence's frequencies is kept under 'StuSVecTemp' and the
          matched words of the winning sentence are zeroed in that copy,
          so they cannot be matched again in later calls.
        - stu_sen['ExpTerms'] is set on each scored sentence when either
          expansion option is enabled ("std->stu" strings).
        - match_sen['Selected'] is set to True when
          self.only_match_sentence_once is set; such sentences are skipped
          in later calls.
    """
    # TODO: Rework the already-matched check to be in terms of words not
    # sentences (e.g., in case student just gives one long sentence).
    # TODO: Scale ancestor weight by degree of generality.
    debug_print("Answer.CalCosDist%s" % str((ans_sentencelist, std_sen)), level=6)
    match_sen = None
    max_cos = 0
    best_matching_stu_words = []
    apply_term_expansion = (self.apply_synonym_expansion or self.apply_ancestor_expansion)
    std_vec = std_sen['KeySVec']
    # Materialize the keys so the same list is safely reused in the loops
    # below and in list_difference (keys() is a live view under Python 3).
    all_std_words = list(std_vec.keys())

    # Setup the hash key to use for looking up student frequencies
    stu_freq_master_key = 'StuSVec'
    stu_freq_lookup_key = 'StuSVecTemp' if self.only_match_word_tokens_once else stu_freq_master_key

    for stu_sen in ans_sentencelist:
        # Create bookkeeping hash when sentence encountered first time during
        # single-word-token matching.
        # Note: new temp hash used (e.g., 'StuSVecTemp'), which shadows the
        # input version during calculations. 'in' replaces dict.has_key,
        # which no longer exists in Python 3.
        if self.only_match_word_tokens_once and (stu_freq_lookup_key not in stu_sen):
            stu_sen[stu_freq_lookup_key] = dict(stu_sen[stu_freq_master_key])
            debug_print("stu_sen[stu_freq_lookup_key] (len=%d): %s" % (len(stu_sen[stu_freq_lookup_key]), stu_sen[stu_freq_lookup_key]), 6)

        # Make sure student sentence not already matched
        if self.only_match_sentence_once and ('Selected' in stu_sen):
            debug_print("Ignoring already matched sentence %s" % stu_sen['No'], 4)
            continue
        stu_vec = stu_sen[stu_freq_lookup_key]

        # Compute measure for current sentence
        q, s, qs = 0, 0, 0
        exp_terms = []
        matching_stu_words = []
        for word in all_std_words:
            std_freq = std_vec[word]
            stu_freq = stu_vec.get(word, 0)
            std_word = word
            stu_word = std_word

            # If a standard word doesn't occur in the student sentence, then
            # apply term expansion by checking for most frequent synonym
            # and/or ancestor term that does occur.
            # Note: Ancestor terms might be too general, so not checked if
            # synonym found. Also, expansions omit standard terms to avoid
            # counting evidence twice.
            if (stu_freq == 0) and apply_term_expansion:
                scale_factor = 1.0

                # Check synonyms (e.g., attorney for lawyer), excluding words in standard
                if self.apply_synonym_expansion:
                    debug_print("Checking for synonym of standard term '%s' among student terms" % std_word, level=5)
                    synonyms = list_difference(wordnet.get_synonyms(std_word), all_std_words)
                    exp_word = find_most_freq_term(synonyms, stu_vec)
                    if exp_word and (exp_word != stu_word):
                        # Note: Uses frequency from student vector for synonym term
                        stu_word = exp_word
                        scale_factor = self.synonym_scale_factor
                        debug_print("Using (student) synonym '%s' to match (standard) word '%s'" % (exp_word, std_word), level=4)

                # Check ancestors (e.g., professional for lawyer), excluding words in standard
                # Note: an earlier revision passed self.max_ancestor_links as a
                # depth limit; the current call relies on the helper's default.
                if self.apply_ancestor_expansion and (stu_word == std_word):
                    debug_print("Checking for ancestor of standard term '%s' among student terms" % std_word, level=5)
                    ancestors = list_difference(wordnet.get_hypernym_terms(std_word), all_std_words)
                    exp_word = find_most_freq_term(ancestors, stu_vec)
                    if exp_word and (exp_word != stu_word):
                        # As before, uses frequency from student vector for expansion term
                        stu_word = exp_word
                        scale_factor = self.ancestor_scale_factor
                        debug_print("Using (student) ancestor term '%s' to match (standard) word '%s'" % (exp_word, std_word), level=4)

                # Update frequency and make note of expansion for posthoc diagnosis
                if stu_word != std_word:
                    stu_freq = stu_vec[stu_word] * scale_factor
                    debug_print("Scaled frequency score from %f to %f" % (stu_vec[stu_word], stu_freq), level=7)
                    exp_terms.append(std_word + "->" + stu_word)

            # Do component-wise update
            debug_print("deltas: q=%f s=%f qs=%f w=%s" % (std_freq * std_freq, stu_freq * stu_freq, std_freq * stu_freq, word), level=6)
            q += std_freq * std_freq
            s += stu_freq * stu_freq
            qs += std_freq * stu_freq
            debug_print("q=%f s=%f qs=%f" % (q, s, qs), level=7)
            # Remember which student word produced evidence for this term
            if std_freq * stu_freq > 0:
                matching_stu_words.append(stu_word)

        # Guard against division by zero when either vector is all zeros
        if q == 0 or s == 0:
            qs_cos = 0
        else:
            qs_cos = qs / (math.sqrt(q * s))
        if apply_term_expansion:
            stu_sen['ExpTerms'] = exp_terms

        # Update max score; sentences without any positive-frequency word
        # (e.g., all tokens already consumed) are never selected.
        if qs_cos > max_cos and any(stu_vec[term] > 0 for term in stu_vec):
            max_cos = qs_cos
            match_sen = stu_sen
            best_matching_stu_words = matching_stu_words

    # Optionally, remove sentences or individual words matched from further consideration
    if match_sen:
        if self.only_match_sentence_once:
            match_sen['Selected'] = True
        if self.only_match_word_tokens_once:
            for word in best_matching_stu_words:
                match_sen[stu_freq_lookup_key][word] = 0
    debug_print("Answer.CalCosDist(%s,_) => %s" % (str(ans_sentencelist), str((max_cos, match_sen, best_matching_stu_words))), level=6)
    return max_cos, match_sen, best_matching_stu_words