Esempio n. 1
0
def compute_correlation_score_match(matrix, game_set_file,
                                    output_file_clues_matched,
                                    output_file_solutions_matched):
    """Solve each game in game_set_file and correlate association scores
    with clue-match counts and guessed/missed outcomes.

    Writes two tab-separated reports:
      - output_file_clues_matched: one line per game with the best
        solution's scores_sum and the number of clues it matched.
      - output_file_solutions_matched: scores_sum values in two columns,
        split by whether the best solution equalled the gold solution.

    Fixes over the previous version: the unused load of
    corpora.PAISA_LEX_FREQ_FILE (wasted file I/O) and the unused local
    'scores' have been removed; behavior is otherwise unchanged.
    """
    import corpora
    import lexicon
    # Score assigned to a (clue, solution) pair absent from the matrix.
    unfound_pair_score = matrix.get_min_association_score()
    nouns_lex_freq_dict = lexicon.loadLexFreqFromFile(
        corpora.PAISA_SOSTANTIVI_FREQ_FILE)
    # Nouns sorted by descending corpus frequency; [0] is the fallback guess.
    most_freq_nouns = [
        item[0]
        for item in sorted(nouns_lex_freq_dict.items(), key=lambda x: -x[1])
    ]
    game_set = read_game_set_tab(game_set_file)
    output_lines_clues_matched = ['\t'.join(['Scores', 'Matched'])]
    scores_guessed = []
    scores_missed = []
    for game_words in game_set:
        clues = game_words[:5]
        gold_solution = game_words[5]
        result = getBestWordAssociationGroups(matrix,
                                              clues,
                                              unfound_pair_score,
                                              nBest=100)
        if not result:
            # No candidate found: fall back to the most frequent noun and
            # score as if all five clue pairs were unfound.
            best_solution = most_freq_nouns[0]
            clues_matched_count = 0
            scores_sum = unfound_pair_score * 5
        else:
            best_result = result[0]
            best_solution = best_result['ranked_solution']
            clues_matched_count = best_result['clues_matched_count']
            scores_sum = best_result['scores_sum']
        if best_solution == gold_solution:
            scores_guessed.append(scores_sum)
        else:
            scores_missed.append(scores_sum)
        output_lines_clues_matched.append('{}\t{}'.format(
            scores_sum, clues_matched_count))
    with open(output_file_clues_matched, 'w') as f_out:
        print_write(f_out, '\n'.join(output_lines_clues_matched))
    with open(output_file_solutions_matched, 'w') as f_out:
        print_write(f_out,
                    '\t'.join(['Scores Guessed', 'Scores Missed']) + '\n')
        # The two columns may have different lengths; emit rows until the
        # longer one is exhausted.
        max_lines = max(len(scores_guessed), len(scores_missed))
        for i in range(max_lines):
            if i < len(scores_guessed):
                print_write(f_out, str(scores_guessed[i]))
            if i < len(scores_missed):
                print_write(f_out, '\t' + str(scores_missed[i]))
            print_write(f_out, '\n')
Esempio n. 2
0
def getAggettiviSetFromPaisa(min_freq, inflected):
    """Return the set of Paisà adjectives with frequency >= min_freq.

    When inflected is true, every adjective that appears in the lemma ->
    inflections dictionary also contributes its inflected forms.
    """
    import lexicon
    freq_table = lexicon.loadLexFreqFromFile(PAISA_AGGETTIVI_FREQ_FILE)
    frequent = [word for word, freq in freq_table.items() if freq >= min_freq]
    adjectives = set(frequent)
    if inflected:
        inflections_by_lemma = getLemmasInflectionsDict()
        for lemma in frequent:
            adjectives.update(inflections_by_lemma.get(lemma, ()))
    return adjectives
Esempio n. 3
0
def getSostantiviSetFromPaisa(min_freq, inflected):
    """Return the set of Paisà nouns with frequency >= min_freq.

    When inflected is true, every noun that appears in the lemma ->
    inflections dictionary also contributes its inflected forms.
    """
    import lexicon
    freq_table = lexicon.loadLexFreqFromFile(PAISA_SOSTANTIVI_FREQ_FILE)
    frequent = [word for word, freq in freq_table.items() if freq >= min_freq]
    nouns = set(frequent)
    if inflected:
        inflections_by_lemma = getLemmasInflectionsDict()
        for lemma in frequent:
            nouns.update(inflections_by_lemma.get(lemma, ()))
    return nouns
Esempio n. 4
0
def builDizAugmentedPaisa(lexPosFreqFile, lexPosBaseFile, min_freq,
                          output_file):
    """Augment a base lexicon with frequent corpus forms that differ from a
    known word only in their final vowel.

    Writes the enlarged lexicon to output_file and, for each word added, an
    "origin->new" line to output_file + '_log'.
    """
    import lexicon
    vowels = list('aeiou')
    log_path = output_file + '_log'
    pos_freq = lexicon.loadLexFreqFromFile(lexPosFreqFile)
    diz_base = lexicon.loadLexiconFromFile(lexPosBaseFile)
    # Stems (word minus final vowel) of every base word ending in a vowel.
    stems = {w[:-1] for w in diz_base if len(w) > 1 and w[-1] in vowels}
    by_freq_desc = sorted(pos_freq.items(), key=lambda item: -item[1])
    with open(log_path, 'w') as log_file:
        for word, freq in by_freq_desc:
            # Candidate must be frequent, longer than one char, new to the
            # lexicon, end in a vowel, and share a stem with a base word.
            if freq < min_freq or len(word) <= 1:
                continue
            if word in diz_base or word[-1] not in vowels:
                continue
            if word[:-1] not in stems:
                continue
            diz_base.add(word)
            # Some same-stem word is guaranteed to exist: the stem came from
            # diz_base. 'cand != word' skips the entry just added.
            origin = next(cand for cand in diz_base
                          if cand[:-1] == word[:-1] and cand != word
                          and len(cand) == len(word) and cand[-1] in vowels)
            log_file.write('{}->{}\n'.format(origin, word))
    lexicon.printLexiconToFile(diz_base, output_file)
Esempio n. 5
0
def batch_solver(matrix,
                 game_set_file,
                 output_file,
                 nBest=100,
                 extra_search=False):
    """Solve every game in game_set_file and write a tab-separated report.

    Each output line contains: the 5 clues, the best solution, how many
    clues it matched, its per-clue scores and their sum, up to nBest-1
    remaining candidate solutions, and the elapsed time in milliseconds.

    If extra_search is True and fewer than 100 candidates were found, the
    clues are morphologically normalized and the search is repeated with
    the normalized clues appended to the result list.
    """
    import time
    import corpora
    import lexicon
    from lexicon import morph_normalize_word
    unfound_pair_score = matrix.get_min_association_score()
    lex_freq_dict = lexicon.loadLexFreqFromFile(corpora.PAISA_LEX_FREQ_FILE)
    nouns_lex_freq_dict = lexicon.loadLexFreqFromFile(
        corpora.PAISA_SOSTANTIVI_FREQ_FILE)
    # Nouns by descending corpus frequency: fallback guess and list fillers.
    most_freq_nouns = [
        item[0]
        for item in sorted(nouns_lex_freq_dict.items(), key=lambda x: -x[1])
    ]
    game_set = read_game_set_tab(game_set_file)
    output_lines_clues_matched = []
    for game_words in game_set:
        start_time = time.time()
        clues = game_words[:5]
        result = getBestWordAssociationGroups(matrix, clues,
                                              unfound_pair_score, nBest)
        if extra_search and len(result) < 100:
            morphed_clues = [
                morph_normalize_word(c, lex_freq_dict) for c in clues
            ]
            if morphed_clues != clues:
                result += getBestWordAssociationGroups(matrix, morphed_clues,
                                                       unfound_pair_score,
                                                       nBest)
                # resorting results (omitting if we want to give more relevance to unmorphed clues)
                # result = sorted(result, key=lambda r: r['scores_sum'])
        if not result:
            # Nothing found: guess the most frequent noun, sentinel scores.
            best_solution = most_freq_nouns[0]
            clues_matched_count = 0
            scores = -9999
            scores_sum = -9999
            remaining_solutions = most_freq_nouns[1:nBest]
        else:
            best_result = result[0]
            other_results = result[1:]
            best_solution = best_result['ranked_solution']
            clues_matched_count = best_result['clues_matched_count']
            scores = best_result['scores']
            scores_sum = best_result['scores_sum']
            remaining_solutions = [r['ranked_solution'] for r in other_results]
            if len(remaining_solutions) < (nBest - 1):
                # Pad with frequent nouns not already proposed.
                # BUG FIX: the original tested membership against the list
                # of result dicts ('n not in other_results'), which is never
                # True for a noun string, so already-listed solutions could
                # be padded in again. Test against the solution strings.
                missing_count = nBest - 1 - len(remaining_solutions)
                missing_nouns = [
                    n for n in most_freq_nouns
                    if n != best_solution and n not in remaining_solutions
                ][:missing_count]
                remaining_solutions += missing_nouns
        remaining_solutions_str = ', '.join(remaining_solutions)
        # BUG FIX: round the milliseconds, not the seconds. The original
        # int(round(delta) * 1000) reported every sub-half-second game as
        # 0 ms and quantized everything to whole seconds.
        elapsed_time = int(round((time.time() - start_time) * 1000))
        report_fields = clues + [
            best_solution, clues_matched_count, scores, scores_sum,
            remaining_solutions_str, elapsed_time
        ]
        output_lines_clues_matched.append('\t'.join(
            [str(x) for x in report_fields]))
    print('Input lines: {}'.format(len(game_set)))
    print('Output lines: {}'.format(len(output_lines_clues_matched)))
    with open(output_file, 'w') as f_out:
        print_write(f_out, '\n'.join(output_lines_clues_matched))
def coverage():
    """Compute coverage of the 100-game set by the frequency lexicon and
    write the result to COVERAGE_WORD_GAME100_FILE."""
    freq_lexicon = lexicon.loadLexFreqFromFile(LEX_FREQ_FILE)
    scorer.computeCoverageOfGameWordLex(freq_lexicon,
                                        corpora.GAME_SET_100_FILE,
                                        COVERAGE_WORD_GAME100_FILE)