Example No. 1
0
def load_gre_sentence(sentence_path, lsi, word2id, algorithm=TOTAL_SIMILARITY):
    """Load all GRE sentence-completion tasks and select answers based on LSA.

    Each task in the file is a group of lines terminated by a blank line.
    Within a line, candidate words appear inside parentheses; the text
    outside parentheses is the sentence context.  One similarity score is
    produced per line, and once all 5 scores of a task are collected the
    highest-scoring option is chosen and appended to the global ``cal_ans``.

    :param sentence_path: path to the GRE sentence-task file
    :param lsi: trained LSI/LSA model used for similarity scoring
    :param word2id: mapping from word to model vocabulary id
    :param algorithm: scoring strategy selector (one of the
        ``TOTAL_SIMILARITY*`` constants); unknown values add no score
    """
    task_scores = []
    task_count = 0
    # 'with' guarantees the input file is closed even if scoring raises
    # (the original leaked the handle on any exception).
    with open(sentence_path) as task_file:
        for line in task_file:
            if not line.strip():
                # A blank line ends a task; score it once all 5 options are in.
                if len(task_scores) == 5:
                    max_score, index = mathutils.max(task_scores)
                    chosen = index2answer(index)
                    print('score : ' + str(max_score) + ' answer: ' + chosen)
                    cal_ans.append(chosen)
                    task_scores = []
                    task_count += 1
                continue

            sentence = ""   # line text with the parenthesized spans removed
            text = ""       # like sentence, but the '(' markers are kept
            options = []    # candidate words found inside parentheses
            open_idx = -1
            in_parens = False
            for i, ch in enumerate(line):
                if ch == '(':
                    open_idx = i
                    text += ch
                    in_parens = True
                elif ch == ')':
                    options.append(line[open_idx + 1:i])
                    in_parens = False
                elif not in_parens:
                    sentence += ch
                    text += ch

            context = nltk.word_tokenize(sentence)
            blanks = [nltk.word_tokenize(option) for option in options]

            if algorithm == TOTAL_SIMILARITY:
                task_scores.append(calculate_total_similarity(
                    lsi, word2id=word2id, blanks=blanks, context=context))
            elif algorithm == TOTAL_SIMILARITY_WITH_COMBINATION:
                task_scores.append(calculate_total_similarity_with_combination(
                    lsi, word2id=word2id, blanks=blanks, context=context))
            elif algorithm == TOTAL_SIMILARITY_K_MAX:
                task_scores.append(calculate_total_similarity_by_k_max(
                    lsi, word2id=word2id, blanks=blanks, context=context))
            elif algorithm == TOTAL_SIMILARITY_WITH_RAKE:
                # RAKE variant scores against the raw text (with '(' markers).
                task_scores.append(calculate_total_similarity_with_rake(
                    lsi, text, word2id, blanks))
    print(task_count)
Example No. 2
0
def load_gre_sentence(sentence_path, lsi, word2id, algorithm=TOTAL_SIMILARITY, k=11,
                      score_path='/Users/junchen/Documents/CSCI544/project/wiki_data/w2v_score.txt'):
    """Load all GRE sentence-completion tasks and select answers based on LSA.

    Each task in the file is a group of lines terminated by a blank line.
    Within a line, candidate words appear inside parentheses; the text
    outside parentheses is the sentence context.  One similarity score is
    produced per line; once all 5 scores of a task are collected, the
    per-task normalized scores are written to ``score_path`` and the
    highest-scoring option is recorded in the global ``cal_ans``.

    :param sentence_path: path to the GRE sentence-task file
    :param lsi: trained model used for similarity scoring
    :param word2id: mapping from word to model vocabulary id
    :param algorithm: scoring strategy selector (one of the
        ``TOTAL_SIMILARITY*`` / ``K_MAX_TOTAL_SIMILARITY`` constants)
    :param k: number of top similarities kept by the k-max / RAKE variants
    :param score_path: where the normalized per-option scores are written
        (parameterized; the default preserves the original hard-coded path)
    """
    task_scores = []
    task_count = 0
    # 'with' guarantees both files are closed even if scoring raises
    # (the original leaked both handles on any exception).
    with open(sentence_path) as task_file, open(score_path, 'w') as score_file:
        for line in task_file:
            if not line.strip():
                # A blank line ends a task; score it once all 5 options are in.
                if len(task_scores) == 5:
                    # Shift each score by +1 and normalize so the 5 values
                    # of this task sum to 1, then dump them one per line.
                    total = sum(score + 1.0 for score in task_scores)
                    for score in task_scores:
                        score_file.write(str((score + 1.0) / total) + '\r\n')
                    score_file.write('\r\n')
                    max_score, index = mathutils.max(task_scores)
                    chosen = index2answer(index)
                    # Overwrite in place on re-runs once cal_ans is full.
                    if len(cal_ans) == len(answer):
                        cal_ans[task_count] = chosen
                    else:
                        cal_ans.append(chosen)
                    task_scores = []
                    task_count += 1
                continue

            sentence = ""   # line text with the parenthesized spans removed
            text = ""       # like sentence, but the '(' markers are kept
            options = []    # candidate words found inside parentheses
            open_idx = -1
            in_parens = False
            for i, ch in enumerate(line):
                if ch == '(':
                    open_idx = i
                    text += ch
                    in_parens = True
                elif ch == ')':
                    options.append(line[open_idx + 1:i])
                    in_parens = False
                elif not in_parens:
                    sentence += ch
                    text += ch

            context = nltk.word_tokenize(sentence)
            blanks = [nltk.word_tokenize(option) for option in options]

            if algorithm == TOTAL_SIMILARITY:
                task_scores.append(calculate_total_similarity(
                    lsi, word2id=word2id, blanks=blanks, context=context))
            elif algorithm == TOTAL_SIMILARITY_WITH_COMBINATION:
                task_scores.append(calculate_total_similarity_with_combination(
                    lsi, word2id=word2id, blanks=blanks, context=context))
            elif algorithm == K_MAX_TOTAL_SIMILARITY:
                task_scores.append(calculate_total_similarity_by_k_max(
                    lsi, word2id=word2id, blanks=blanks, context=context, k=k))
            elif algorithm == TOTAL_SIMILARITY_WITH_RAKE:
                # RAKE variant scores against the raw text (with '(' markers).
                task_scores.append(calculate_total_similarity_with_rake(
                    lsi, text, word2id, blanks, k))
    print(task_count)