Ejemplo n.º 1
0
    def CalCosDist(self, ans_sentencelist, std_sen):
        """Find the student sentence that best matches a standard sentence.

        For each student sentence, a cosine score is computed over the words
        of the standard sentence only: the standard frequencies come from
        std_sen['KeySVec'] and the student frequencies from
        stu_sen['StuSVec'] (0 when the word is absent). When a standard word
        is missing from the student sentence and term expansion is enabled, a
        synonym (and, failing that, an ancestor term) present in the student
        sentence may stand in for it, with its frequency scaled down.

        Args:
            ans_sentencelist: iterable of student sentence hashes; each has a
                'StuSVec' word->frequency hash and a 'No' entry (used only in
                a debug message).
            std_sen: standard sentence hash with a 'KeySVec' word->frequency
                hash.

        Returns:
            Tuple (max_cos, match_sen): the best score found and the student
            sentence hash that produced it (None when no sentence scored
            above 0).

        Side effects:
            - When term expansion is enabled, every scored student sentence
              gets stu_sen['ExpTerms'] set to its list of "std->stu"
              expansion strings.
            - When self.only_match_sentence_once is set, the winning sentence
              gets match_sen['Selected'] = True so later calls skip it.
        """
        debug_print("Answer.CalCosDist%s" % str((ans_sentencelist, std_sen)),
                    level=6)
        match_sen = None
        max_cos = 0
        apply_term_expansion = (self.apply_synonym_expansion
                                or self.apply_ancestor_expansion)
        std_vec = std_sen['KeySVec']
        # Materialize the keys so helpers get a real list on Python 2 and 3.
        all_std_words = list(std_vec.keys())
        for stu_sen in ans_sentencelist:
            # Make sure student sentence not already matched
            # TODO: Rework the already-matched check to be in terms of words not sentences (e.g., in case student just gives one long sentence).
            if self.only_match_sentence_once and 'Selected' in stu_sen:
                debug_print("Ignoring already matched sentence %s" %
                            stu_sen['No'])
                continue
            stu_vec = stu_sen['StuSVec']
            # Compute measure for current sentence
            q, s, qs = 0, 0, 0
            exp_terms = []
            for std_word in all_std_words:
                # If a standard word doesn't occur in the student sentence, then apply term expansion
                # by checking for most frequent synonym and/or ancestor term that does occur.
                # Note: Ancestor terms might be too general, so not checked if synonym found.
                # Also, expansions omit standard terms to avoid counting evidence twice.
                # TODO: Scale ancestor weight by degree of generality.
                std_freq = std_vec[std_word]
                stu_freq = stu_vec.get(std_word, 0)
                if (stu_freq == 0) and apply_term_expansion:
                    stu_word = std_word
                    scale_factor = 1.0
                    # Check synonyms (e.g., attorney for lawyer), excluding words in standard
                    if self.apply_synonym_expansion:
                        debug_print(
                            "Checking for synonym of standard term '%s' among student terms"
                            % std_word,
                            level=5)
                        synonyms = list_difference(
                            wordnet.get_synonyms(std_word), all_std_words)
                        exp_word = find_most_freq_term(synonyms, stu_vec)
                        if exp_word and (exp_word != stu_word):
                            # Note: Uses frequency from student vector for synonym term
                            stu_word = exp_word
                            scale_factor = self.synonym_scale_factor
                            debug_print(
                                "Using (student) synonym '%s' to match (standard) word '%s'"
                                % (exp_word, std_word),
                                level=4)
                    # Check ancestors (e.g., professional for lawyer), excluding words in standard
                    if self.apply_ancestor_expansion and (stu_word == std_word):
                        debug_print(
                            "Checking for ancestor of standard term '%s' among student terms"
                            % std_word,
                            level=5)
                        ancestors = list_difference(
                            wordnet.get_hypernym_terms(
                                std_word, self.max_ancestor_links),
                            all_std_words)
                        exp_word = find_most_freq_term(ancestors, stu_vec)
                        if exp_word and (exp_word != stu_word):
                            # As before, uses frequency from student vector for expansion term
                            stu_word = exp_word
                            # NOTE(review): ancestor matches are scaled by the synonym factor;
                            # a dedicated ancestor factor may be intended -- confirm.
                            scale_factor = self.synonym_scale_factor
                            debug_print(
                                "Using (student) ancestor term '%s' to match (standard) word '%s'"
                                % (exp_word, std_word),
                                level=4)
                    # Update frequency and make note of expansion for posthoc diagnosis
                    if stu_word != std_word:
                        stu_freq = stu_vec[stu_word] * scale_factor
                        debug_print("Scaled frequency score from %f to %f" %
                                    (stu_vec[stu_word], stu_freq),
                                    level=7)
                        exp_terms.append(std_word + "->" + stu_word)
                # Do component-wise update
                debug_print("deltas: q=%f s=%f qs=%f" %
                            (std_freq * std_freq, stu_freq * stu_freq,
                             std_freq * stu_freq),
                            level=6)
                q += std_freq * std_freq
                s += stu_freq * stu_freq
                qs += std_freq * stu_freq
                debug_print("q=%f s=%f qs=%f" % (q, s, qs), level=7)
            # Guard against division by zero when either vector is empty
            if q == 0 or s == 0:
                qs_cos = 0
            else:
                qs_cos = qs / (math.sqrt(q * s))
            if apply_term_expansion:
                stu_sen['ExpTerms'] = exp_terms

            # Update max score; a sentence with no positive-frequency word can never win
            if qs_cos > max_cos and any(freq > 0 for freq in stu_vec.values()):
                max_cos = qs_cos
                match_sen = stu_sen
        if self.only_match_sentence_once and match_sen:
            match_sen['Selected'] = True
        debug_print("Answer.CalCosDist(%s,_) => %s" %
                    (str(ans_sentencelist), str((max_cos, match_sen))),
                    level=6)
        return max_cos, match_sen
Ejemplo n.º 2
0
    def CalCosDist(self, ans_sentencelist, std_sen):
        """Find the student sentence that best matches a standard sentence.

        For each student sentence, a cosine score is computed over the words
        of the standard sentence only: the standard frequencies come from
        std_sen['KeySVec'] and the student frequencies from the student
        frequency hash (0 when the word is absent). When a standard word is
        missing from the student sentence and term expansion is enabled, a
        synonym (and, failing that, an ancestor term) present in the student
        sentence may stand in for it, with its frequency scaled down.

        When self.only_match_word_tokens_once is set, student frequencies are
        read from a per-sentence working copy stored under 'StuSVecTemp'
        (created from 'StuSVec' on first use) so that matched words can be
        zeroed without touching the input vector.

        Args:
            ans_sentencelist: iterable of student sentence hashes; each has a
                'StuSVec' word->frequency hash and a 'No' entry (used only in
                a debug message).
            std_sen: standard sentence hash with a 'KeySVec' word->frequency
                hash.

        Returns:
            Tuple (max_cos, match_sen, best_matching_stu_words): the best
            score found, the student sentence hash that produced it (None
            when no sentence scored above 0), and the student words (possibly
            expansion terms) that contributed to that score.

        Side effects:
            - May add 'StuSVecTemp' to student sentences (see above).
            - When term expansion is enabled, every scored student sentence
              gets stu_sen['ExpTerms'] set to its list of "std->stu"
              expansion strings.
            - When self.only_match_sentence_once is set, the winning sentence
              gets match_sen['Selected'] = True so later calls skip it.
            - When self.only_match_word_tokens_once is set, the winning
              sentence's matched words are zeroed in its working copy.
        """
        debug_print("Answer.CalCosDist%s" % str((ans_sentencelist, std_sen)), level=6)
        match_sen = None
        max_cos = 0
        best_matching_stu_words = []
        apply_term_expansion = (self.apply_synonym_expansion or self.apply_ancestor_expansion)
        std_vec = std_sen['KeySVec']
        # Materialize the keys so helpers get a real list on Python 2 and 3.
        all_std_words = list(std_vec.keys())
        # Setup the hash key to use for looking up student frequencies
        stu_freq_master_key = 'StuSVec'
        stu_freq_lookup_key = 'StuSVecTemp' if self.only_match_word_tokens_once else stu_freq_master_key

        for stu_sen in ans_sentencelist:
            # Create bookkeeping hash when sentence encountered first time during single-word-token matching
            # Note: new temp hash used (e.g., 'StuSVecTemp'), which shadows the input version during calculations.
            if self.only_match_word_tokens_once and (stu_freq_lookup_key not in stu_sen):
                stu_sen[stu_freq_lookup_key] = dict(stu_sen[stu_freq_master_key])
            stu_vec = stu_sen[stu_freq_lookup_key]
            debug_print("stu_sen[stu_freq_lookup_key] (len=%d): %s" % (len(stu_vec), stu_vec), 6)
            assert len(stu_vec) == len(stu_sen[stu_freq_master_key])
            # Make sure student sentence not already matched
            # TODO: Rework the already-matched check to be in terms of words not sentences (e.g., in case student just gives one long sentence).
            if self.only_match_sentence_once and 'Selected' in stu_sen:
                debug_print("Ignoring already matched sentence %s" % stu_sen['No'], 4)
                continue
            # Compute measure for current sentence
            q, s, qs = 0, 0, 0
            exp_terms = []
            matching_stu_words = []
            for std_word in all_std_words:
                # If a standard word doesn't occur in the student sentence, then apply term expansion
                # by checking for most frequent synonym and/or ancestor term that does occur.
                # Note: Ancestor terms might be too general, so not checked if synonym found.
                # Also, expansions omit standard terms to avoid counting evidence twice.
                # TODO: Scale ancestor weight by degree of generality.
                std_freq = std_vec[std_word]
                stu_freq = stu_vec.get(std_word, 0)
                # Initialized outside the expansion branch: also recorded for direct matches below
                stu_word = std_word
                if (stu_freq == 0) and apply_term_expansion:
                    scale_factor = 1.0
                    # Check synonyms (e.g., attorney for lawyer), excluding words in standard
                    if self.apply_synonym_expansion:
                        debug_print("Checking for synonym of standard term '%s' among student terms" % std_word, level=5)
                        synonyms = list_difference(wordnet.get_synonyms(std_word), all_std_words)
                        exp_word = find_most_freq_term(synonyms, stu_vec)
                        if exp_word and (exp_word != stu_word):
                            # Note: Uses frequency from student vector for synonym term
                            stu_word = exp_word
                            scale_factor = self.synonym_scale_factor
                            debug_print("Using (student) synonym '%s' to match (standard) word '%s'" % (exp_word, std_word), level=4)
                    # Check ancestors (e.g., professional for lawyer), excluding words in standard
                    if self.apply_ancestor_expansion and (stu_word == std_word):
                        debug_print("Checking for ancestor of standard term '%s' among student terms" % std_word, level=5)
                        ancestors = list_difference(wordnet.get_hypernym_terms(std_word), all_std_words)
                        exp_word = find_most_freq_term(ancestors, stu_vec)
                        if exp_word and (exp_word != stu_word):
                            # As before, uses frequency from student vector for expansion term
                            stu_word = exp_word
                            scale_factor = self.ancestor_scale_factor
                            debug_print("Using (student) ancestor term '%s' to match (standard) word '%s'" % (exp_word, std_word), level=4)
                    # Update frequency and make note of expansion for posthoc diagnosis
                    if stu_word != std_word:
                        stu_freq = stu_vec[stu_word] * scale_factor
                        debug_print("Scaled frequency score from %f to %f" % (stu_vec[stu_word], stu_freq), level=7)
                        exp_terms.append(std_word + "->" + stu_word)
                # Do component-wise update
                debug_print("deltas: q=%f s=%f qs=%f w=%s" % (std_freq * std_freq, stu_freq * stu_freq, std_freq * stu_freq, std_word), level=6)
                q += std_freq * std_freq
                s += stu_freq * stu_freq
                qs += std_freq * stu_freq
                debug_print("q=%f s=%f qs=%f" % (q, s, qs),level=7)
                # Remember which student words contributed, so they can be consumed later
                if std_freq * stu_freq > 0:
                    matching_stu_words.append(stu_word)
            # Guard against division by zero when either vector is empty
            if q == 0 or s == 0:
                qs_cos = 0
            else:
                qs_cos = qs / (math.sqrt(q * s))
            if apply_term_expansion:
                stu_sen['ExpTerms'] = exp_terms

            # Update max score; a sentence with no positive-frequency word can never win
            if qs_cos > max_cos and any(freq > 0 for freq in stu_vec.values()):
                max_cos = qs_cos
                match_sen = stu_sen
                best_matching_stu_words = matching_stu_words
        # Optionally, remove sentences or individual words matched from further consideration
        if match_sen:
            if self.only_match_sentence_once:
                match_sen['Selected'] = True
            if self.only_match_word_tokens_once:
                for word in best_matching_stu_words:
                    match_sen[stu_freq_lookup_key][word] = 0
        debug_print("Answer.CalCosDist(%s,_) => %s" % (str(ans_sentencelist), str((max_cos, match_sen, best_matching_stu_words))), level=6)
        return max_cos, match_sen, best_matching_stu_words