Example #1
def build_and_eval():
    utility.make_dir(OUTPUT_DIR)

    print('Building lexicon')
    poli_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE))
    agg_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    lex_solution_set = set(sost_lexicon + agg_lexicon)
    
    lexicon.printLexiconToFile(lex_set, LEX_FREQ_FILE)
    lexicon.printLexiconToFile(lex_solution_set, SOLUTION_LEX_FREQ_FILE)

    print('Computing coverage')
    scorer.computeCoverageOfGameWordLex(lex_set, lex_solution_set, corpora.GAME_SET_100_FILE, COVERAGE_WORD_GAME100_FILE)

    print('Building association matrix')
    matrix = matrix_dict.Matrix_Dict(lex_set, lex_solution_set)
    matrix.add_patterns_from_corpus(corpora.PAISA_RAW_INFO)
    matrix.compute_association_scores()
    matrix.write_matrix_to_file(MATRIX_FILE)

    print('Eval')
    scorer.evaluate_kbest_MeanReciprocalRank(matrix, corpora.GAME_SET_100_FILE, EVAL_WORD_GAME100_FILE)
    scorer.evaluate_kbest_MeanReciprocalRank(matrix, corpora.NLP4FUN_DEV_TSV_v2_tv_FILE, EVAL_NLP4FUN_DEV_TV_FILE)
    scorer.evaluate_kbest_MeanReciprocalRank(matrix, corpora.NLP4FUN_DEV_TSV_v2_bg_FILE, EVAL_NLP4FUN_DEV_BG_FILE)
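
scorer.evaluate_kbest_MeanReciprocalRank is not shown on this page. For reference, Mean Reciprocal Rank over k-best guesses averages 1/rank of the gold solution across games; the sketch below is a minimal, hypothetical illustration of the metric only, not the repository's scorer (which also reads game files and writes a report).

# Hypothetical sketch of Mean Reciprocal Rank over k-best candidate lists:
# each game scores 1/rank of its gold solution, or 0 if it is missing.
def mean_reciprocal_rank(kbest_lists, solutions):
    total = 0.0
    for candidates, gold in zip(kbest_lists, solutions):
        if gold in candidates:
            total += 1.0 / (candidates.index(gold) + 1)  # ranks start at 1
    return total / len(solutions)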
Example #2
def build_and_eval():
    utility.make_dir(OUTPUT_DIR)

    print('Building lexicon')

    poli_lexicon = list(
        lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE))
    agg_lexicon = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    lex_solution_set = set(sost_lexicon + agg_lexicon)
    '''
    poli_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(corpora.getSostantiviSetFromPaisa(min_freq=1000, inflected=True))
    print('\nSize of sostantivi lex: {}'.format(len(sost_lexicon)))
    agg_lexicon = list(corpora.getAggettiviSetFromPaisa(min_freq=1000, inflected=True))
    print('\nSize of agg lex: {}'.format(len(agg_lexicon)))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    '''

    lexicon.printLexiconToFile(lex_set, LEX_FREQ_FILE)
    lexicon.printLexiconToFile(lex_solution_set, SOLUTION_LEX_FREQ_FILE)

    print('Computing lex coverage')
    scorer.computeCoverageOfGameWordLex(lex_set, lex_solution_set,
                                        corpora.GAME_SET_100_FILE,
                                        COVERAGE_WORD_GAME100_FILE)

    print('Building association matrix')
    matrix = Matrix_Dict(lex_set, lex_solution_set)
    matrix.add_patterns_from_corpus(corpora.PAISA_RAW_INFO)
    matrix.add_patterns_from_corpus(corpora.DE_MAURO_POLIREMATICHE_INFO,
                                    weight=DE_MAURO_WEIGHT)
    matrix.add_patterns_from_corpus(corpora.PROVERBI_INFO,
                                    weight=PROVERBI_WEIGHT)
    matrix.add_patterns_from_corpus(corpora.ITWAC_RAW_INFO, weight=1)
    matrix.add_patterns_from_corpus(corpora.WIKI_IT_TITLES_INFO,
                                    weight=WIKI_IT_WEIGHT)
    #matrix.add_patterns_from_corpus(corpora.WIKI_IT_TEXT_INFO, weight=1)
    corpora.addBigramFromPolirematicheInMatrix(matrix, DE_MAURO_WEIGHT)
    corpora.addBigramFromCompunds(matrix,
                                  lex_set,
                                  min_len=4,
                                  weight=COMPOUNDS_WEIGHT)
    matrix.compute_association_scores()
    matrix.write_matrix_to_file(MATRIX_FILE)

    print('Eval')
    scorer.evaluate_kbest_MeanReciprocalRank(matrix, corpora.GAME_SET_100_FILE,
                                             EVAL_WORD_GAME100_FILE)
Example #3
def build_and_eval():
    utility.make_dir(OUTPUT_DIR)
    print('\nBuilding lexicon')

    lex_set = lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE)
    lex_solution_set = lex_set
    '''
    poli_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(corpora.getSostantiviSetFromPaisa(min_freq=100, inflected=True))
    print('\nSize of sostantivi lex: {}'.format(len(sost_lexicon)))
    agg_lexicon = list(corpora.getAggettiviSetFromPaisa(min_freq=100, inflected=True))
    print('\nSize of agg lex: {}'.format(len(agg_lexicon)))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    lex_solution_set =  set(sost_lexicon+agg_lexicon)
    #lex_solution_set = lex_set
    '''

    print('\nComputing lex coverage')
    scorer.computeCoverageOfGameWordLex(lex_set, lex_solution_set,
                                        corpora.GAME_SET_100_FILE,
                                        COVERAGE_WORD_GAME100_FILE)

    print('\nBuilding association matrix')
    matrix = Matrix(lex_set, lex_solution_set)
    matrix.add_patterns_from_corpus(corpora.DE_MAURO_POLIREMATICHE_INFO)
    corpora.addBigramFromPolirematicheInMatrix(matrix, weight=1)
    #corpora.addBigramFromCompunds(matrix, lex_set, min_len=4, weight=10)
    matrix.compute_association_scores()
    matrix.write_matrix_to_file(MATRIX_FILE)

    print('\nEval')
    scorer.evaluate_kbest_MeanReciprocalRank(matrix, corpora.GAME_SET_100_FILE,
                                             EVAL_WORD_GAME100_FILE)
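
compute_association_scores is not defined on this page. A common way to turn co-occurrence counts into association scores is pointwise mutual information (PMI); the sketch below is an assumption about the general idea, not the actual Matrix implementation.

import math

# Hypothetical PMI sketch: pair_counts maps (w1, w2) -> co-occurrence count,
# word_counts maps w -> count, total is the number of observed pairs.
def pmi_scores(pair_counts, word_counts, total):
    scores = {}
    for (w1, w2), count in pair_counts.items():
        p_pair = count / total
        p_w1 = word_counts[w1] / total
        p_w2 = word_counts[w2] / total
        scores[(w1, w2)] = math.log(p_pair / (p_w1 * p_w2))
    return scores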
Example #4
def build():
    utility.make_dir(OUTPUT_DIR)

    print('Building lexicon')

    poli_lexicon = list(
        lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE))
    agg_lexicon = list(
        lexicon.loadLexiconFromFile(
            corpora.DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    lex_solution_set = set(sost_lexicon + agg_lexicon)

    lexicon.printLexiconToFile(lex_set, LEX_FREQ_FILE)
    lexicon.printLexiconToFile(lex_solution_set, SOLUTION_LEX_FREQ_FILE)

    def add_patterns_from_corpus(corpus_info):
        lines_extractor = corpora.extract_lines(corpus_info)
        source = corpus_info['name']
        patterns_count = 0
        print("Adding patterns from source: {}".format(source))
        tot_lines = corpus_info['lines']
        for n, line in enumerate(lines_extractor, 1):
            patterns_count += patterns_extraction.addPatternsFromLineInMongo(
                line, lex_set, source)
            if n % 1000 == 0:
                sys.stdout.write("Progress: {0:.1f}%\r".format(
                    float(n) * 100 / tot_lines))
                sys.stdout.flush()
        print('Extracted patterns: {}'.format(patterns_count))

    # print('Computing lex coverage')
    # scorer.computeCoverageOfGameWordLex(lex_set, lex_solution_set, corpora.GAME_SET_100_FILE, COVERAGE_WORD_GAME100_FILE)

    print('Adding patterns in db')
    add_patterns_from_corpus(corpora.DE_MAURO_POLIREMATICHE_INFO)
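
patterns_extraction.addPatternsFromLineInMongo is not shown here. Assuming pymongo and a hypothetical collection keyed on word pair and source, the Mongo side could look roughly like this sketch (the real pattern extraction is likely richer than plain bigrams):

# Hypothetical sketch: upsert adjacent in-lexicon word pairs from one line
# into a MongoDB collection, tagged with the corpus source.
def add_patterns_from_line(line, lex_set, source, collection):
    tokens = line.lower().split()
    added = 0
    for w1, w2 in zip(tokens, tokens[1:]):
        if w1 in lex_set and w2 in lex_set:
            collection.update_one(
                {'w1': w1, 'w2': w2, 'source': source},
                {'$inc': {'count': 1}}, upsert=True)
            added += 1
    return added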
Example #5
def build_and_eval():
    utility.make_dir(OUTPUT_DIR)
    print('Building lexicon')
    poli_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZ_POLI_WORD_SORTED_FILE))
    sost_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE))
    agg_lexicon = list(lexicon.loadLexiconFromFile(corpora.DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE))
    lex_set = set(poli_lexicon + sost_lexicon + agg_lexicon)
    lexicon_freq = {w: 1 for w in lex_set}
    lex_solution_set = set(sost_lexicon + agg_lexicon)
    print('Lex size: {}'.format(len(lex_set)))
    print('Solution Lex size: {}'.format(len(lex_solution_set)))
    lexicon.printLexFreqToFile(lexicon_freq, LEX_FREQ_FILE)
    # solution_lexicon_freq = {w: 1 for w in lex_solution_set}
    print('Computing lex_set coverage')
    scorer.computeCoverageOfGameWordLex(lex_set, lex_solution_set, corpora.GAME_SET_100_FILE, COVERAGE_WORD_GAME100_FILE)
    print('Building association matrix')
    matrix = matrix_dict.Matrix_Dict(lex_set, lex_solution_set)
    matrix.add_patterns_from_corpus(corpora.PAISA_RAW_INFO)
    matrix.add_patterns_from_corpus(corpora.DE_MAURO_POLIREMATICHE_INFO, weight=DE_MAURO_WEIGHT)
    corpora.addBigramFromPolirematicheInMatrix(matrix, DE_MAURO_WEIGHT)
    matrix.compute_association_scores()
    matrix.write_matrix_to_file(MATRIX_FILE)
    print('Eval')
    scorer.evaluate_kbest_MeanReciprocalRank(matrix, corpora.GAME_SET_100_FILE, EVAL_WORD_GAME100_FILE)
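
computeCoverageOfGameWordLex is also not shown on this page. Conceptually it reports how much of the game set the lexicons cover; a minimal hypothetical sketch, assuming each game provides clue words and one solution word:

# Hypothetical coverage sketch: fraction of clue words found in lex_set and
# fraction of solutions found in lex_solution_set over a list of games.
def coverage(games, lex_set, lex_solution_set):
    clue_total = sum(len(g['clues']) for g in games)
    clue_hits = sum(1 for g in games for w in g['clues'] if w in lex_set)
    sol_hits = sum(1 for g in games if g['solution'] in lex_solution_set)
    return clue_hits / clue_total, sol_hits / len(games)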
Example #6
def buildDizSostantiviAugmentedPaisaInflected():
    import lexicon
    lemma_inflections_dict = getLemmasInflectionsDict()
    diz_base_inflected = [
        [
            DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_FILE,
            DIZIONARIO_SOSTANTIVI_AUGMENTED_PAISA_INFLECTED_FILE
        ],
        [
            DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_FILE,
            DIZIONARIO_AGGETTIVI_AUGMENTED_PAISA_INFLECTED_FILE
        ]
    ]
    for diz_base_file, diz_inflected_file in diz_base_inflected:
        word_set = lexicon.loadLexiconFromFile(diz_base_file)  # set
        inflected_words = set()
        for lemma in word_set:
            if lemma in lemma_inflections_dict:
                inflected_words.update(lemma_inflections_dict[lemma])
        word_set.update(inflected_words)
        lexicon.printLexiconToFile(inflected_words, diz_inflected_file)
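
getLemmasInflectionsDict is not shown; the function above only assumes it maps a lemma to a collection of inflected forms. A hypothetical illustration of the expected shape:

# Hypothetical shape of the lemma -> inflections mapping consumed above.
lemma_inflections_dict = {
    'gatto': {'gatti', 'gatta', 'gatte'},   # noun: cat
    'rosso': {'rossi', 'rossa', 'rosse'},   # adjective: red
}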
Example #7
def builDizAugmentedPaisa(lexPosFreqFile, lexPosBaseFile, min_freq,
                          output_file):
    import lexicon
    vowels = set('aeiou')
    output_file_log = output_file + '_log'
    paisa_pos_lex_freq = lexicon.loadLexFreqFromFile(lexPosFreqFile)
    diz_base = lexicon.loadLexiconFromFile(lexPosBaseFile)
    diz_sostantivi_prefix = set()
    for w in diz_base:
        if len(w) > 1 and w[-1] in vowels:
            diz_sostantivi_prefix.add(w[:-1])
    with open(output_file_log, 'w') as f_out:
        for w, f in sorted(paisa_pos_lex_freq.items(), key=lambda x: -x[1]):
            # Accept a frequent word that matches a dictionary entry up to
            # its final vowel (e.g. an inflected form of a known lemma).
            if (f >= min_freq and len(w) > 1 and w not in diz_base
                    and w[:-1] in diz_sostantivi_prefix and w[-1] in vowels):
                diz_base.add(w)
                origin = next(o for o in diz_base
                              if o[:-1] == w[:-1] and o != w
                              and len(o) == len(w) and o[-1] in vowels)
                f_out.write('{}->{}\n'.format(origin, w))
    lexicon.printLexiconToFile(diz_base, output_file)