def build_and_eval():
    """Build the lexicons and the association matrix, then evaluate the matrix.

    Steps, in order:
      1. Call ``utility.make_dir(OUTPUT_DIR)``.
      2. Load three word lists (``DIZ_POLI_WORD_SORTED_FILE`` and the two
         ``*_AUGMENTED_PAISA_FILE`` lists).  The full lexicon is the union
         of all three; the solution lexicon is the union of the last two
         only.  Both lexicons are printed to file.
      3. Compute the lexicon coverage against ``GAME_SET_100_FILE``.
      4. Build a ``Matrix_Dict`` fed with ``PAISA_RAW_INFO``, compute its
         association scores and write it to ``MATRIX_FILE``.
      5. Run ``scorer.evaluate_kbest_MeanReciprocalRank`` on three game sets.
    """
    utility.make_dir(OUTPUT_DIR)

    print('Building lexicon')
    poli_words = list(
        lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_words = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE))
    agg_words = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE))
    full_lexicon = set(poli_words + sost_words + agg_words)
    # Only the sostantivi/aggettivi lists contribute to the solution lexicon.
    solution_lexicon = set(sost_words + agg_words)
    lexicon.printLexiconToFile(full_lexicon, LEX_FREQ_FILE)
    lexicon.printLexiconToFile(solution_lexicon, SOLUTION_LEX_FREQ_FILE)

    print('Computing coverage')
    scorer.computeCoverageOfGameWordLex(
        full_lexicon,
        solution_lexicon,
        corpora.GAME_SET_100_FILE,
        COVERAGE_WORD_GAME100_FILE,
    )

    print('Building association matrix')
    assoc_matrix = matrix_dict.Matrix_Dict(full_lexicon, solution_lexicon)
    assoc_matrix.add_patterns_from_corpus(corpora.PAISA_RAW_INFO)
    assoc_matrix.compute_association_scores()
    assoc_matrix.write_matrix_to_file(MATRIX_FILE)

    print('Eval')
    # (game set, report file) pairs, evaluated in this order.
    eval_runs = (
        (corpora.GAME_SET_100_FILE, EVAL_WORD_GAME100_FILE),
        (corpora.NLP4FUN_DEV_TSV_v2_tv_FILE, EVAL_NLP4FUN_DEV_TV_FILE),
        (corpora.NLP4FUN_DEV_TSV_v2_bg_FILE, EVAL_NLP4FUN_DEV_BG_FILE),
    )
    for game_file, report_file in eval_runs:
        scorer.evaluate_kbest_MeanReciprocalRank(
            assoc_matrix, game_file, report_file)
def build_and_eval():
    """Build a polirematiche-based association matrix and evaluate it.

    The lexicon is loaded from ``DIZ_POLI_WORD_SORTED_FILE`` and the same
    object is used as solution lexicon.  Coverage is computed against
    ``GAME_SET_100_FILE``; a ``Matrix`` is then fed with
    ``DE_MAURO_POLIREMATICHE_INFO`` and with the polirematiche bigrams
    (weight 1), scored, written to ``MATRIX_FILE`` and evaluated on the
    100-game set.
    """
    utility.make_dir(OUTPUT_DIR)

    print('\nBuilding lexicon')
    # Both names are bound to the very same lexicon object.
    vocabulary = solution_vocabulary = lexicon.loadLexiconFromFile(
        corpora.DIZ_POLI_WORD_SORTED_FILE)

    # Disabled alternative lexicon construction, kept for reference:
    #
    #   poli_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    #   sost_lexicon = list(corpora.getSostantiviSetFromPaisa(min_freq=100, inflected=True))
    #   print('\nSize of sostantivi lex: {}'.format(len(sost_lexicon)))
    #   agg_lexicon = list(corpora.getAggettiviSetFromPaisa(min_freq=100, inflected=True))
    #   print('\nSize of agg lex: {}'.format(len(agg_lexicon)))
    #   lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    #   lex_solution_set = set(sost_lexicon+agg_lexicon)
    #   #lex_solution_set = lex_set

    print('\nComputing lex coverage')
    scorer.computeCoverageOfGameWordLex(
        vocabulary,
        solution_vocabulary,
        corpora.GAME_SET_100_FILE,
        COVERAGE_WORD_GAME100_FILE,
    )

    print('\nBuilding association matrix')
    assoc_matrix = Matrix(vocabulary, solution_vocabulary)
    assoc_matrix.add_patterns_from_corpus(corpora.DE_MAURO_POLIREMATICHE_INFO)
    corpora.addBigramFromPolirematicheInMatrix(assoc_matrix, weight=1)
    # Disabled:
    # corpora.addBigramFromCompunds(assoc_matrix, vocabulary, min_len=4, weight=10)
    assoc_matrix.compute_association_scores()
    assoc_matrix.write_matrix_to_file(MATRIX_FILE)

    print('\nEval')
    scorer.evaluate_kbest_MeanReciprocalRank(
        assoc_matrix, corpora.GAME_SET_100_FILE, EVAL_WORD_GAME100_FILE)
def check_dizionario_base_base_coverage():
    """Write a game-word coverage report for the base dictionary.

    Every entry returned by ``get_dizionario_base_set(False)`` is mapped to
    a frequency of 1, and that mapping is handed to
    ``scorer.computeCoverageOfGameWordLex`` together with
    ``corpora.GAME_SET_100_FILE``.  The report path is ``DE_MAURO_PATH``
    with ``'diz_base_game_coverage.txt'`` appended.
    """
    # Imports are kept at function level, as in the original code.
    import corpora
    import scorer

    # Uniform frequency of 1 for every dictionary entry.
    unit_frequencies = dict.fromkeys(get_dizionario_base_set(False), 1)
    coverage_report_path = DE_MAURO_PATH + 'diz_base_game_coverage.txt'
    scorer.computeCoverageOfGameWordLex(
        unit_frequencies, corpora.GAME_SET_100_FILE, coverage_report_path)
def build_and_eval():
    """Build the multi-corpus association matrix and evaluate it.

    The full lexicon is the union of the ``DIZ_POLI_WORD_SORTED_FILE`` list
    and the two ``*_AUGMENTED_PAISA_FILE`` lists; the solution lexicon is
    the union of the latter two only.  Both are printed to file and the
    coverage against ``GAME_SET_100_FILE`` is computed.  A ``Matrix_Dict``
    is then fed with several corpora (each with its own weight), with the
    polirematiche bigrams and with the compound bigrams, before being
    scored, written to ``MATRIX_FILE`` and evaluated on the 100-game set.
    """
    utility.make_dir(OUTPUT_DIR)

    print('Building lexicon')
    poli_words = list(
        lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_words = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE))
    agg_words = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE))
    full_lexicon = set(poli_words + sost_words + agg_words)
    solution_lexicon = set(sost_words + agg_words)

    # Disabled alternative lexicon construction, kept for reference:
    #
    #   poli_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    #   sost_lexicon = list(corpora.getSostantiviSetFromPaisa(min_freq=1000, inflected=True))
    #   print('\nSize of sostantivi lex: {}'.format(len(sost_lexicon)))
    #   agg_lexicon = list(corpora.getAggettiviSetFromPaisa(min_freq=1000, inflected=True))
    #   print('\nSize of agg lex: {}'.format(len(agg_lexicon)))
    #   lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)

    lexicon.printLexiconToFile(full_lexicon, LEX_FREQ_FILE)
    lexicon.printLexiconToFile(solution_lexicon, SOLUTION_LEX_FREQ_FILE)

    print('Computing lex coverage')
    scorer.computeCoverageOfGameWordLex(
        full_lexicon,
        solution_lexicon,
        corpora.GAME_SET_100_FILE,
        COVERAGE_WORD_GAME100_FILE,
    )

    print('Building association matrix')
    assoc_matrix = Matrix_Dict(full_lexicon, solution_lexicon)
    # PAISA is added without an explicit weight (the method's default applies).
    assoc_matrix.add_patterns_from_corpus(corpora.PAISA_RAW_INFO)
    # Remaining corpora, each with an explicit weight, added in this order.
    weighted_corpora = (
        (corpora.DE_MAURO_POLIREMATICHE_INFO, DE_MAURO_WEIGHT),
        (corpora.PROVERBI_INFO, PROVERBI_WEIGHT),
        (corpora.ITWAC_RAW_INFO, 1),
        (corpora.WIKI_IT_TITLES_INFO, WIKI_IT_WEIGHT),
    )
    for corpus_info, corpus_weight in weighted_corpora:
        assoc_matrix.add_patterns_from_corpus(corpus_info, weight=corpus_weight)
    # Disabled:
    # assoc_matrix.add_patterns_from_corpus(corpora.WIKI_IT_TEXT_INFO, weight=1)
    corpora.addBigramFromPolirematicheInMatrix(assoc_matrix, DE_MAURO_WEIGHT)
    corpora.addBigramFromCompunds(
        assoc_matrix, full_lexicon, min_len=4, weight=COMPOUNDS_WEIGHT)
    assoc_matrix.compute_association_scores()
    assoc_matrix.write_matrix_to_file(MATRIX_FILE)

    print('Eval')
    scorer.evaluate_kbest_MeanReciprocalRank(
        assoc_matrix, corpora.GAME_SET_100_FILE, EVAL_WORD_GAME100_FILE)
def check_dizionario_polirematiche_base_coverage():
    """Extend the base dictionary with polirematiche words and report coverage.

    Each entry of ``get_polirematiche_set(True)`` is passed through
    ``patterns_extraction.tokenizeLine`` and split on whitespace; the
    resulting words are added to the set returned by
    ``get_dizionario_base_set(False)``.  The sorted, extended dictionary is
    written to ``DIZ_POLI_WORD_SORTED_FILE``, and a coverage report against
    ``corpora.GAME_SET_100_FILE`` is produced with every word given a
    frequency of 1.
    """
    # Imports are kept at function level, as in the original code.
    import corpora
    import scorer
    import patterns_extraction

    dizionario = get_dizionario_base_set(False)
    for expression in get_polirematiche_set(True):
        # Add every whitespace-separated token of the tokenized expression.
        dizionario.update(patterns_extraction.tokenizeLine(expression).split())

    write_lexicon_to_file(sorted(dizionario), DIZ_POLI_WORD_SORTED_FILE)

    coverage_report_path = DE_MAURO_PATH + 'diz_poli_game_coverage.txt'
    # Uniform frequency of 1 for every dictionary entry.
    unit_frequencies = dict.fromkeys(dizionario, 1)
    scorer.computeCoverageOfGameWordLex(
        unit_frequencies, corpora.GAME_SET_100_FILE, coverage_report_path)
def build_and_eval():
    """Build the DIZ_POLI-based association matrix and evaluate it.

    The lexicon is loaded from ``DIZ_POLI_WORD_SORTED_FILE`` and every word
    is given a frequency of 1; that frequency mapping is printed to
    ``LEX_FREQ_FILE`` and used for the coverage report against
    ``GAME_SET_100_FILE``.  A ``Matrix_Dict`` restricted to the lexicon is
    then fed with ``PAISA_RAW_INFO``, the weighted
    ``DE_MAURO_POLIREMATICHE_INFO`` corpus and the polirematiche bigrams,
    scored, written to ``MATRIX_FILE`` and evaluated on the 100-game set.
    """
    utility.make_dir(OUTPUT_DIR)

    print('Building lexicon')
    vocabulary = lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE)
    # Uniform frequency of 1 for every lexicon entry.
    unit_frequencies = dict.fromkeys(vocabulary, 1)
    # Disabled:
    # lex_solution_set = lexicon.loadLexiconFromFile(corpora.DIZIONARIO_BASE_SOSTANTIVI_FILE)
    print('Lex size: {}'.format(len(vocabulary)))
    lexicon.printLexFreqToFile(unit_frequencies, LEX_FREQ_FILE)

    print('Computing coverage')
    scorer.computeCoverageOfGameWordLex(
        unit_frequencies,
        corpora.GAME_SET_100_FILE,
        COVERAGE_WORD_GAME100_FILE,
    )

    print('Building association matrix')
    # No solution lexicon is passed here (lex_solution_set is disabled above).
    assoc_matrix = matrix_dict.Matrix_Dict(lex_set=vocabulary)
    assoc_matrix.add_patterns_from_corpus(corpora.PAISA_RAW_INFO)
    assoc_matrix.add_patterns_from_corpus(
        corpora.DE_MAURO_POLIREMATICHE_INFO, weight=DE_MAURO_WEIGHT)
    corpora.addBigramFromPolirematicheInMatrix(assoc_matrix, DE_MAURO_WEIGHT)
    assoc_matrix.compute_association_scores()
    assoc_matrix.write_matrix_to_file(MATRIX_FILE)

    print('Eval')
    scorer.evaluate_kbest_MeanReciprocalRank(
        assoc_matrix, corpora.GAME_SET_100_FILE, EVAL_WORD_GAME100_FILE)
def coverage():
    """Recompute the game-word coverage report from the saved lexicon.

    Loads the frequency lexicon from ``LEX_FREQ_FILE`` and passes it to
    ``scorer.computeCoverageOfGameWordLex`` with
    ``corpora.GAME_SET_100_FILE`` and ``COVERAGE_WORD_GAME100_FILE``.
    """
    saved_frequencies = lexicon.loadLexFreqFromFile(LEX_FREQ_FILE)
    scorer.computeCoverageOfGameWordLex(
        saved_frequencies,
        corpora.GAME_SET_100_FILE,
        COVERAGE_WORD_GAME100_FILE,
    )