Beispiel #1
0
def experiment1(glove_file="../data/glove_vectors/glove.6B.100d.txt",
                question_dir="../data/all_sat/seven_sat_raw.txt",
                question_fraction=1):
    """Compare whole-sentence context with before/after-blank context.

    For every single-blank question, the vector of the first correct-answer
    word is compared (via ``cosine``) with the averaged vectors of
    (a) the whole sentence, (b) the text before the blank and (c) the text
    after the blank, with English stop words removed in each case.
    A question is printed only when the whole-sentence distance is NOT
    strictly smaller than both the before and the after distance.

    Args:
        glove_file: Path handed to ``Glove`` to load the word vectors.
        question_dir: Path handed to ``loadQuestions``.
        question_fraction: Share of the loaded questions to examine, taken
            from the start of the list. The default of 1 examines all of
            them; the resulting count is clamped to the number of
            questions available.

    Returns:
        None. Results are written to standard output.
    """
    blank = "____"

    # Look at text before blank
    def getBeforeBlankText(sentence):
        # partition() returns the whole sentence when no blank is present;
        # slicing with find() == -1 silently dropped the last character.
        return sentence.partition(blank)[0]

    # Look at text after blank
    def getAfterBlankText(sentence):
        # Empty string when no blank is present.
        return sentence.partition(blank)[2]

    # Build the stop-word lookup once instead of re-reading the corpus list
    # for every token that gets filtered.
    stop_words = frozenset(stopwords.words('english'))

    def withoutStopWords(tokens):
        return [token for token in tokens if token not in stop_words]

    # Each print() below receives one pre-formatted string, so the output is
    # the same under the Python 2 print statement and the Python 3 function.
    print("Loading Questions")
    questions = loadQuestions(question_dir)

    print("num questions:  %s" % len(questions))

    print("Loading Glove None")
    glove = Glove(glove_file,
                  delimiter=" ",
                  header=False,
                  quoting=csv.QUOTE_NONE,
                  v=False)

    print("Experimenting on %d percent of questions" %
          int(round(question_fraction * 100)))
    limit = min(len(questions),
                int(math.floor(len(questions) * question_fraction)))

    # Text enclosed by two blanks means the question has more than one blank.
    multi_blank = re.compile('____(.*?)____', re.DOTALL)

    for i in range(limit):
        question = questions[i]

        # only want single blanks for now
        if multi_blank.search(question.text):
            continue

        answer_words = getStrippedAnswerWords(question.getCorrectWord())
        answer_vec = glove.getVec(answer_words[0])

        # NOTE(review): assumes question.getSentence() yields word tokens and
        # that getAverageVec() expects a sequence of words - confirm.
        total_vec = glove.getAverageVec(
            withoutStopWords(question.getSentence()))
        # The before/after texts are plain strings; split them into words so
        # the stop-word filter and the average work on words rather than on
        # single characters.
        before_vec = glove.getAverageVec(
            withoutStopWords(getBeforeBlankText(question.text).split()))
        after_vec = glove.getAverageVec(
            withoutStopWords(getAfterBlankText(question.text).split()))

        total_distance = cosine(answer_vec, total_vec)
        # A context vector with at most two entries is treated as unusable
        # (presumably what getAverageVec returns when nothing could be
        # averaged - confirm); the fallback distance is 2.
        before_distance = cosine(answer_vec,
                                 before_vec) if len(before_vec) > 2 else 2
        after_distance = cosine(answer_vec,
                                after_vec) if len(after_vec) > 2 else 2

        # Skip questions where the whole-sentence model is strictly closer
        # to the answer than both partial contexts.
        if total_distance < before_distance and total_distance < after_distance:
            continue  # comment this out to print for every question
        print("%s %s" % (question.text, answer_words[0]))
        print("total distance: %s" % total_distance)
        print("before distance:  %s" % before_distance)
        print("after distance:  %s" % after_distance)
        print("\n\n")
Beispiel #2
0
def experiment1(glove_file="../data/glove_vectors/glove.6B.100d.txt", question_dir="../data/all_sat/seven_sat_raw.txt", question_fraction=1):
	"""Compare whole-sentence context with before/after-blank context.

	For every single-blank question, the vector of the first correct-answer
	word is compared (via ``cosine``) with the averaged vectors of
	(a) the whole sentence, (b) the text before the blank and (c) the text
	after the blank, with English stop words removed in each case.
	A question is printed only when the whole-sentence distance is NOT
	strictly smaller than both the before and the after distance.

	Args:
		glove_file: Path handed to ``Glove`` to load the word vectors.
		question_dir: Path handed to ``loadQuestions``.
		question_fraction: Share of the loaded questions to examine, taken
			from the start of the list. The default of 1 examines all of
			them; the resulting count is clamped to the number of
			questions available.

	Returns:
		None. Results are written to standard output.
	"""
	blank = "____"

	# Look at text before blank
	def getBeforeBlankText(sentence):
		# partition() returns the whole sentence when no blank is present;
		# slicing with find() == -1 silently dropped the last character.
		return sentence.partition(blank)[0]

	# Look at text after blank
	def getAfterBlankText(sentence):
		# Empty string when no blank is present.
		return sentence.partition(blank)[2]

	# Build the stop-word lookup once instead of re-reading the corpus list
	# for every token that gets filtered.
	stop_words = frozenset(stopwords.words('english'))

	def withoutStopWords(tokens):
		return [token for token in tokens if token not in stop_words]

	# Each print() below receives one pre-formatted string, so the output is
	# the same under the Python 2 print statement and the Python 3 function.
	print("Loading Questions")
	questions = loadQuestions(question_dir)

	print("num questions:  %s" % len(questions))

	print("Loading Glove None")
	glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, v=False)

	print("Experimenting on %d percent of questions" % int(round(question_fraction * 100)))
	limit = min(len(questions), int(math.floor(len(questions) * question_fraction)))

	# Text enclosed by two blanks means the question has more than one blank.
	multi_blank = re.compile('____(.*?)____', re.DOTALL)

	for i in range(limit):
		question = questions[i]

		# only want single blanks for now
		if multi_blank.search(question.text):
			continue

		answer_words = getStrippedAnswerWords(question.getCorrectWord())
		answer_vec = glove.getVec(answer_words[0])

		# NOTE(review): assumes question.getSentence() yields word tokens and
		# that getAverageVec() expects a sequence of words - confirm.
		total_vec = glove.getAverageVec(withoutStopWords(question.getSentence()))
		# The before/after texts are plain strings; split them into words so
		# the stop-word filter and the average work on words rather than on
		# single characters.
		before_vec = glove.getAverageVec(withoutStopWords(getBeforeBlankText(question.text).split()))
		after_vec = glove.getAverageVec(withoutStopWords(getAfterBlankText(question.text).split()))

		total_distance = cosine(answer_vec, total_vec)
		# A context vector with at most two entries is treated as unusable
		# (presumably what getAverageVec returns when nothing could be
		# averaged - confirm); the fallback distance is 2.
		before_distance = cosine(answer_vec, before_vec) if len(before_vec) > 2 else 2
		after_distance = cosine(answer_vec, after_vec) if len(after_vec) > 2 else 2

		# Skip questions where the whole-sentence model is strictly closer
		# to the answer than both partial contexts.
		if total_distance < before_distance and total_distance < after_distance:
			continue  # comment this out to print for every question
		print("%s %s" % (question.text, answer_words[0]))
		print("total distance: %s" % total_distance)
		print("before distance:  %s" % before_distance)
		print("after distance:  %s" % after_distance)
		print("\n\n")