def build_and_eval():
    """Build the lexicons and association matrix, then evaluate the matrix.

    Steps, in order:
      1. Create OUTPUT_DIR.
      2. Load three lexicon files (DIZ_POLI_WORD_SORTED_FILE and the
         augmented SOSTANTIVI / AGGETTIVI files). The full lexicon is the
         set of all three; the solution lexicon is the set of the last two.
         Both are written to disk.
      3. Compute the coverage of the GAME_SET_100 game words.
      4. Build the association matrix from PAISA_RAW_INFO, score it and
         write it to MATRIX_FILE.
      5. Evaluate the matrix on GAME_SET_100 and on the two NLP4FUN dev
         files.
    """
    utility.make_dir(OUTPUT_DIR)

    print('Building lexicon')
    load_lexicon = lexicon.loadLexiconFromFile
    poli_words = list(load_lexicon(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_words = list(
        load_lexicon(corpora.DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE))
    agg_words = list(
        load_lexicon(corpora.DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE))
    full_lexicon = set(poli_words + sost_words + agg_words)
    solution_lexicon = set(sost_words + agg_words)

    lexicon.printLexiconToFile(full_lexicon, LEX_FREQ_FILE)
    lexicon.printLexiconToFile(solution_lexicon, SOLUTION_LEX_FREQ_FILE)

    print('Computing coverage')
    scorer.computeCoverageOfGameWordLex(
        full_lexicon,
        solution_lexicon,
        corpora.GAME_SET_100_FILE,
        COVERAGE_WORD_GAME100_FILE,
    )

    print('Building association matrix')    
    assoc_matrix = matrix_dict.Matrix_Dict(full_lexicon, solution_lexicon)
    assoc_matrix.add_patterns_from_corpus(corpora.PAISA_RAW_INFO)
    assoc_matrix.compute_association_scores()
    assoc_matrix.write_matrix_to_file(MATRIX_FILE)

    print('Eval')
    evaluate = scorer.evaluate_kbest_MeanReciprocalRank
    evaluate(assoc_matrix, corpora.GAME_SET_100_FILE, EVAL_WORD_GAME100_FILE)
    evaluate(assoc_matrix, corpora.NLP4FUN_DEV_TSV_v2_tv_FILE,
             EVAL_NLP4FUN_DEV_TV_FILE)
    evaluate(assoc_matrix, corpora.NLP4FUN_DEV_TSV_v2_bg_FILE,
             EVAL_NLP4FUN_DEV_BG_FILE)
# Example #2
# 0
def build_and_eval():
    """Build an association matrix from the DIZ_POLI lexicon and evaluate it.

    The lexicon loaded from DIZ_POLI_WORD_SORTED_FILE is used both as the
    full lexicon and as the solution lexicon (the very same object). The
    matrix is fed with DE_MAURO_POLIREMATICHE_INFO patterns plus the
    polirematiche bigrams, scored, written to MATRIX_FILE and evaluated on
    GAME_SET_100.
    """
    utility.make_dir(OUTPUT_DIR)
    print('\nBuilding lexicon')

    word_lexicon = lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE)
    # Same object on purpose: no separate solution lexicon in this variant.
    solution_lexicon = word_lexicon
    # The string below is a disabled alternative lexicon construction; it is
    # a bare string expression and has no effect at runtime.
    '''
    poli_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(corpora.getSostantiviSetFromPaisa(min_freq=100, inflected=True))
    print('\nSize of sostantivi lex: {}'.format(len(sost_lexicon)))
    agg_lexicon = list(corpora.getAggettiviSetFromPaisa(min_freq=100, inflected=True))
    print('\nSize of agg lex: {}'.format(len(agg_lexicon)))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    lex_solution_set =  set(sost_lexicon+agg_lexicon)
    #lex_solution_set = lex_set
    '''

    print('\nComputing lex coverage')
    scorer.computeCoverageOfGameWordLex(
        word_lexicon,
        solution_lexicon,
        corpora.GAME_SET_100_FILE,
        COVERAGE_WORD_GAME100_FILE,
    )

    print('\nBuilding association matrix')
    assoc_matrix = Matrix(word_lexicon, solution_lexicon)
    assoc_matrix.add_patterns_from_corpus(corpora.DE_MAURO_POLIREMATICHE_INFO)
    corpora.addBigramFromPolirematicheInMatrix(assoc_matrix, weight=1)
    #corpora.addBigramFromCompunds(matrix, lex_set, min_len=4, weight=10)
    assoc_matrix.compute_association_scores()
    assoc_matrix.write_matrix_to_file(MATRIX_FILE)

    print('\nEval')
    scorer.evaluate_kbest_MeanReciprocalRank(
        assoc_matrix, corpora.GAME_SET_100_FILE, EVAL_WORD_GAME100_FILE)
def check_dizionario_base_base_coverage():
    """Write a game-word coverage report for the base dictionary.

    Each word from ``get_dizionario_base_set(False)`` is mapped to a
    frequency of 1, and that mapping is passed to the coverage scorer
    together with GAME_SET_100_FILE. The report path is DE_MAURO_PATH with
    'diz_base_game_coverage.txt' appended.
    """
    import corpora
    import scorer
    base_words = get_dizionario_base_set(False)
    # Uniform frequency of 1 for every word.
    unit_freq = dict.fromkeys(base_words, 1)
    report_path = DE_MAURO_PATH + 'diz_base_game_coverage.txt'
    scorer.computeCoverageOfGameWordLex(
        unit_freq, corpora.GAME_SET_100_FILE, report_path)
def build_and_eval():
    """Build lexicons, a multi-corpus association matrix, and evaluate it.

    The full lexicon is the set of words from DIZ_POLI_WORD_SORTED_FILE
    and the augmented SOSTANTIVI / AGGETTIVI files; the solution lexicon
    is the set of the latter two. Both are written to disk and game-word
    coverage is computed. The matrix then receives patterns from several
    corpora (each with its own weight where one is given), the
    polirematiche bigrams and the compound bigrams, before being scored,
    saved to MATRIX_FILE and evaluated on GAME_SET_100.
    """
    utility.make_dir(OUTPUT_DIR)

    print('Building lexicon')

    load_lexicon = lexicon.loadLexiconFromFile
    poli_words = list(load_lexicon(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_words = list(
        load_lexicon(corpora.DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE))
    agg_words = list(
        load_lexicon(corpora.DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE))
    full_lexicon = set(poli_words + sost_words + agg_words)
    solution_lexicon = set(sost_words + agg_words)
    # The string below is a disabled alternative lexicon construction; it is
    # a bare string expression and has no effect at runtime.
    '''
    poli_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(corpora.getSostantiviSetFromPaisa(min_freq=1000, inflected=True))
    print('\nSize of sostantivi lex: {}'.format(len(sost_lexicon)))
    agg_lexicon = list(corpora.getAggettiviSetFromPaisa(min_freq=1000, inflected=True))
    print('\nSize of agg lex: {}'.format(len(agg_lexicon)))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    '''

    lexicon.printLexiconToFile(full_lexicon, LEX_FREQ_FILE)
    lexicon.printLexiconToFile(solution_lexicon, SOLUTION_LEX_FREQ_FILE)

    print('Computing lex coverage')
    scorer.computeCoverageOfGameWordLex(
        full_lexicon,
        solution_lexicon,
        corpora.GAME_SET_100_FILE,
        COVERAGE_WORD_GAME100_FILE,
    )

    print('Building association matrix')
    assoc_matrix = Matrix_Dict(full_lexicon, solution_lexicon)
    add_patterns = assoc_matrix.add_patterns_from_corpus
    add_patterns(corpora.PAISA_RAW_INFO)
    add_patterns(corpora.DE_MAURO_POLIREMATICHE_INFO, weight=DE_MAURO_WEIGHT)
    add_patterns(corpora.PROVERBI_INFO, weight=PROVERBI_WEIGHT)
    add_patterns(corpora.ITWAC_RAW_INFO, weight=1)
    add_patterns(corpora.WIKI_IT_TITLES_INFO, weight=WIKI_IT_WEIGHT)
    #matrix.add_patterns_from_corpus(corpora.WIKI_IT_TEXT_INFO, weight=1)
    corpora.addBigramFromPolirematicheInMatrix(assoc_matrix, DE_MAURO_WEIGHT)
    corpora.addBigramFromCompunds(
        assoc_matrix, full_lexicon, min_len=4, weight=COMPOUNDS_WEIGHT)
    assoc_matrix.compute_association_scores()
    assoc_matrix.write_matrix_to_file(MATRIX_FILE)

    print('Eval')
    scorer.evaluate_kbest_MeanReciprocalRank(
        assoc_matrix, corpora.GAME_SET_100_FILE, EVAL_WORD_GAME100_FILE)
def check_dizionario_polirematiche_base_coverage():
    """Extend the base dictionary with polirematiche words and report coverage.

    Every entry of ``get_polirematiche_set(True)`` is tokenized with
    ``patterns_extraction.tokenizeLine`` and split on whitespace; the
    resulting words are added to the base dictionary set. The sorted result
    is written to DIZ_POLI_WORD_SORTED_FILE, then a coverage report against
    GAME_SET_100_FILE is produced at DE_MAURO_PATH with
    'diz_poli_game_coverage.txt' appended, using a frequency of 1 per word.
    """
    import corpora
    import scorer
    import patterns_extraction
    vocabulary = get_dizionario_base_set(False)
    for phrase in get_polirematiche_set(True):
        # Add each whitespace-separated token of the tokenized phrase.
        vocabulary.update(patterns_extraction.tokenizeLine(phrase).split())
    write_lexicon_to_file(sorted(vocabulary), DIZ_POLI_WORD_SORTED_FILE)
    report_path = DE_MAURO_PATH + 'diz_poli_game_coverage.txt'
    unit_freq = dict.fromkeys(vocabulary, 1)
    scorer.computeCoverageOfGameWordLex(
        unit_freq, corpora.GAME_SET_100_FILE, report_path)
# Example #6
# 0
def build_and_eval():
    """Build a frequency lexicon and association matrix, then evaluate.

    The lexicon comes from DIZ_POLI_WORD_SORTED_FILE, with every word
    given a frequency of 1. The frequency lexicon is written to
    LEX_FREQ_FILE and used for the coverage report. The matrix is built
    from the lexicon alone (no solution lexicon is passed), fed with
    PAISA_RAW_INFO and DE_MAURO_POLIREMATICHE_INFO patterns plus the
    polirematiche bigrams, then scored, saved and evaluated on
    GAME_SET_100.
    """
    utility.make_dir(OUTPUT_DIR)

    print('Building lexicon')
    word_lexicon = lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE)
    # Uniform frequency of 1 for every word.
    unit_freq = dict.fromkeys(word_lexicon, 1)
    #lex_solution_set =  lexicon.loadLexiconFromFile(corpora.DIZIONARIO_BASE_SOSTANTIVI_FILE)
    lexicon_size = len(word_lexicon)
    print('Lex size: {}'.format(lexicon_size))
    lexicon.printLexFreqToFile(unit_freq, LEX_FREQ_FILE)

    print('Computing coverage')
    scorer.computeCoverageOfGameWordLex(
        unit_freq, corpora.GAME_SET_100_FILE, COVERAGE_WORD_GAME100_FILE)

    print('Building association matrix')
    assoc_matrix = matrix_dict.Matrix_Dict(lex_set=word_lexicon)  # lex_solution_set
    add_patterns = assoc_matrix.add_patterns_from_corpus
    add_patterns(corpora.PAISA_RAW_INFO)
    add_patterns(corpora.DE_MAURO_POLIREMATICHE_INFO, weight=DE_MAURO_WEIGHT)
    corpora.addBigramFromPolirematicheInMatrix(assoc_matrix, DE_MAURO_WEIGHT)
    assoc_matrix.compute_association_scores()
    assoc_matrix.write_matrix_to_file(MATRIX_FILE)

    print('Eval')
    scorer.evaluate_kbest_MeanReciprocalRank(
        assoc_matrix, corpora.GAME_SET_100_FILE, EVAL_WORD_GAME100_FILE)
def coverage():
    """Recompute the GAME_SET_100 coverage report from the saved lexicon.

    Loads the frequency lexicon from LEX_FREQ_FILE and passes it to the
    coverage scorer, which is given COVERAGE_WORD_GAME100_FILE as the
    report destination.
    """
    saved_freq = lexicon.loadLexFreqFromFile(LEX_FREQ_FILE)
    scorer.computeCoverageOfGameWordLex(
        saved_freq,
        corpora.GAME_SET_100_FILE,
        COVERAGE_WORD_GAME100_FILE,
    )