コード例 #1
0
def get_predictions(test_data):
    """Compute predictions for the test set using the two scoring heuristics.

    The file is read line by line. Every non-blank line must contain one
    candidate fill word in square brackets (e.g. ``[word]``); a sentence
    occupies OPTIONS_PER_SENTENCE consecutive lines, one per candidate.
    The first line of each group supplies the sentence context: the bracketed
    word is removed and the leading token (the question number) is dropped.

    When the module-level STEMMING flag is set, the options and the context
    words are passed through ``stem`` before scoring, and the winning stems
    are mapped back to the original (unstemmed) options.

    Keyword arguments:
    test_data -- path of the test file

    Returns a list with one ``(best_option_1, best_option_2)`` tuple per
    complete sentence, holding the two values returned by
    ``get_best_option``. With stemming enabled the tuple is ``(None, None)``
    unless both returned values are truthy.

    Raises ValueError if a non-blank line has no bracketed option.
    """
    # Raw string: the original non-raw literal relied on invalid escape
    # sequences (\[, \d, \-) being passed through, which warns on modern Python.
    option_pattern = re.compile(r"\[([\d\w'\-,]+)\]")
    # Position (within a group of lines) of the last option of a sentence;
    # equals the previously hard-coded 4 when OPTIONS_PER_SENTENCE is 5.
    last_position = OPTIONS_PER_SENTENCE - 1

    predictions = []
    options = []  # stores the different options possible for a single sentence
    words_in_sentence = []
    with open(test_data) as f:
        i = 0  # counts option lines only, so blank lines do not shift groups
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                continue  # tolerate blank lines (e.g. at the end of the file)
            match = option_pattern.search(line)
            if match is None:
                raise ValueError(
                    "line %d of %s has no bracketed option: %r"
                    % (line_number, test_data, line))
            option = match.group(1)
            position = i % OPTIONS_PER_SENTENCE
            i += 1

            if position == 0:
                options = [option]
                line = line.replace("[%s]" % option, "")  # remove fill word from sentence
                words_in_sentence = line.split()[1:]  # 1st cell contains the question number
            else:
                options.append(option)

            if position != last_position:
                continue  # still collecting the options of this sentence

            if not STEMMING:
                best_option_1, best_option_2 = get_best_option(options, words_in_sentence)
                predictions.append((best_option_1, best_option_2))
                continue

            stemmed_options = [stem(candidate) for candidate in options]
            stemmed_words_in_sentence = [stem(word) for word in words_in_sentence]
            best_option_1, best_option_2 = get_best_option(stemmed_options, stemmed_words_in_sentence)
            if best_option_1 and best_option_2:
                # Translate the winning stems back to the original options.
                best_option_index_1 = stemmed_options.index(best_option_1)
                best_option_index_2 = stemmed_options.index(best_option_2)
                predictions.append((options[best_option_index_1], options[best_option_index_2]))
            else:
                predictions.append((None, None))
    return predictions
コード例 #2
0
def _select_best_options(options, words_in_sentence):
    """Return the pair of winning options for one sentence.

    Calls ``get_best_option`` directly when STEMMING is off. Otherwise the
    options and context words are stemmed first and the winning stems are
    mapped back to the original options; ``(None, None)`` is returned unless
    both values returned by ``get_best_option`` are truthy.
    """
    if not STEMMING:
        best_option_1, best_option_2 = get_best_option(
            options, words_in_sentence)
        return best_option_1, best_option_2

    stemmed_options = [stem(candidate) for candidate in options]
    stemmed_words_in_sentence = [stem(word) for word in words_in_sentence]
    best_option_1, best_option_2 = get_best_option(
        stemmed_options, stemmed_words_in_sentence)
    if not (best_option_1 and best_option_2):
        return None, None
    best_option_index_1 = stemmed_options.index(best_option_1)
    best_option_index_2 = stemmed_options.index(best_option_2)
    return options[best_option_index_1], options[best_option_index_2]


def get_predictions(test_data):
    """Compute predictions for the test set using the two scoring heuristics.

    Each non-blank line of the file must contain one candidate fill word in
    square brackets (e.g. ``[word]``). A sentence spans OPTIONS_PER_SENTENCE
    consecutive lines, one per candidate; the first of them provides the
    sentence words, taken after removing the bracketed word and dropping the
    leading token (the question number).

    Keyword arguments:
    test_data -- path of the test file

    Returns a list with one ``(best_option_1, best_option_2)`` tuple per
    complete sentence. With the module-level STEMMING flag set, scoring is
    done on stemmed words and a tuple may be ``(None, None)``.

    Raises ValueError if a non-blank line has no bracketed option.
    """
    # Raw string avoids the invalid escape sequences (\[, \d, \-) that the
    # original non-raw literal depended on.
    option_pattern = re.compile(r"\[([\d\w'\-,]+)\]")

    predictions = []
    options = []  # stores the different options possible for a single sentence
    words_in_sentence = []
    option_lines_seen = 0  # blank lines are not counted, so groups stay aligned
    with open(test_data) as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                continue  # e.g. a trailing empty line
            match = option_pattern.search(line)
            if match is None:
                raise ValueError(
                    "line %d of %s has no bracketed option: %r"
                    % (line_number, test_data, line))
            option = match.group(1)

            position = option_lines_seen % OPTIONS_PER_SENTENCE
            option_lines_seen += 1

            if position == 0:
                # First line of a sentence: start a new option list and
                # extract the sentence words.
                options = [option]
                line = line.replace("[%s]" % option,
                                    "")  # remove fill word from sentence
                words_in_sentence = line.split()[1:]
            else:
                options.append(option)

            # The last option of a sentence was hard-coded as position 4;
            # deriving it keeps the function correct if the constant changes.
            if position == OPTIONS_PER_SENTENCE - 1:
                predictions.append(
                    _select_best_options(options, words_in_sentence))
    return predictions
コード例 #3
0
def filter_stem(input_path, output_path):
    """Write a copy of the input file in which every word is stemmed.

    Each input line is split on whitespace, every resulting word is passed
    through ``stem``, and the results are written as one output line joined
    by single spaces and terminated by a newline.

    Keyword arguments:
    input_path -- input file path
    output_path -- output file path (opened for writing)
    """
    with open(input_path) as source, open(output_path, 'w') as target:
        for raw_line in source:
            stemmed_words = [stem(token) for token in raw_line.split()]
            stemmed_line = " ".join(stemmed_words)
            target.write(stemmed_line + '\n')
コード例 #4
0
 def term_normalize(term):
     """Return the stem of ``term`` after dropping non-letters and lowercasing.

     Characters for which ``isalpha()`` is false are removed, the remaining
     characters are lowercased, and the result is passed through ``stem``.
     """
     letters_only = ''.join([char for char in term if char.isalpha()])
     lowered = letters_only.lower()
     return stem(lowered)