def calculate_AMs(collocation_matrix):
    """Score every cell of a collocation matrix with LLR and local MI.

    For each ``row`` in ``collocation_matrix`` and each ``col`` in
    ``collocation_matrix[row]`` the values handed to ``comp.LLR`` and
    ``comp.localMI`` are:

    * ``c1``  -- ``first_order_freqs[row][0]``
    * ``c2``  -- ``freq_list[rank_map[col]] * wordcount_ratio``
    * ``c12`` -- ``collocation_matrix[row][col] * first_order_freqs[row][1]``
    * ``tf``  -- ``sum(config_data[CORPUS_FREQUENCIES])``

    Cells whose ``c12`` is not greater than zero are scored 0 without calling
    ``comp``. The scores of one row are collected in a list, in the iteration
    order of that row's columns, and the two ``row -> list`` tables are
    written as JSON files whose paths start with ``config_data[PROJECT_PATH]``.

    The function reads the module-level names ``first_order_freqs``,
    ``freq_list``, ``rank_map``, ``wordcount_ratio``, ``config_data``,
    ``CORPUS_FREQUENCIES``, ``PROJECT_PATH``, ``comp`` and ``json``.

    Args:
        collocation_matrix: Nested container indexed as
            ``collocation_matrix[row][col]``; iterating it yields the row
            keys and iterating ``collocation_matrix[row]`` yields the column
            keys.

    Returns:
        None. The results are only written to disk.
    """
    llr_scores = {}
    local_mi_scores = {}

    # The corpus total is identical for every cell, so it is summed once here
    # instead of once per (row, col) pair.
    tf = sum(config_data[CORPUS_FREQUENCIES])

    for row in collocation_matrix:
        c1 = first_order_freqs[row][0]
        row_llr = llr_scores[row] = []
        row_local_mi = local_mi_scores[row] = []

        for col in collocation_matrix[row]:
            c2 = freq_list[rank_map[col]] * wordcount_ratio
            c12 = collocation_matrix[row][col] * first_order_freqs[row][1]

            if c12 > 0:
                row_llr.append(comp.LLR(c1, c2, c12, tf))
                row_local_mi.append(comp.localMI(c1, c2, c12, tf))
            else:
                # Nothing co-occurred: store 0 rather than calling comp.
                row_llr.append(0)
                row_local_mi.append(0)

    print("printing output files")

    with open(config_data[PROJECT_PATH]+"sentence_collocation_matrix_with_freqs_LLR.json", "w", encoding="utf-8") as f:
        json.dump(llr_scores, f)

    with open(config_data[PROJECT_PATH]+"sentence_collocation_matrix_with_freqs_locMI.json", "w", encoding="utf-8") as f:
        json.dump(local_mi_scores, f)
        if "full_count" in data[tag] and data[tag]["full_count"] > 0:
            f2_freqs.update({word: data[tag]["full_count"]})

# Score tables keyed by syntactic pattern, one per association measure.
LLR, PMI, locMI = {}, {}, {}

print(f2_freqs)

for synpat in syn_list:
    pattern = syn_list[synpat]

    # Patterns whose word has no entry in f2_freqs are left out of all tables.
    if pattern["word"] not in f2_freqs:
        continue

    f2 = f2_freqs[pattern["word"]]
    f12 = pattern["f12"]

    LLR[synpat] = comp.LLR(f1, f2, f12, tf)
    PMI[synpat] = comp.PMI(f1, f2, f12, tf)
    locMI[synpat] = comp.localMI(f1, f2, f12, tf)

# Each table is written as "<pattern><TAB><score>", one pattern per line.
with open(config_data[PROJECT_PATH] + "syntactic_synpat_associations_LLR.tsv",
          "w", encoding="utf-8") as f:
    for key, score in LLR.items():
        f.write("\t".join((key, str(score))) + "\n")

with open(config_data[PROJECT_PATH] + "syntactic_synpat_associations_PMI.tsv",
          "w", encoding="utf-8") as f:
    for key, score in PMI.items():
        f.write("\t".join((key, str(score))) + "\n")
Beispiel #3
0
# Build one score vector per row of `data` for each association measure.
for row in data:
    # Every row gets an (initially empty) vector in all five tables, even
    # when it turns out to have no "full_count".
    LLR_vectors[row] = {}
    PMI_vectors[row] = {}
    locMI_vectors[row] = {}
    LPPMI_vectors[row] = {}
    LPlocMI_vectors[row] = {}

    if "full_count" not in data[row]:
        print("no count", row)
        continue

    f1 = data[row]["full_count"]

    for col in word_frequencies:
        # Only columns whose frequency exceeds the threshold are scored.
        if word_frequencies[col] > F_THRESHOLD:
            f2 = word_frequencies[col]
            # The "LP" variants add 1 to each count and len(word_frequencies)
            # to the total (presumably add-one smoothing -- TODO confirm).
            adjusted_total = tf + len(word_frequencies)

            if col in data[row]:
                f12 = data[row][col]
                LLR_vectors[row][col] = comp.LLR(f1, f2, f12, tf)
                LPlocMI_vectors[row][col] = comp.localMI(f1 + 1, f2 + 1, f12 + 1, adjusted_total)
                LPPMI_vectors[row][col] = comp.PMI(f1 + 1, f2 + 1, f12 + 1, adjusted_total)
                locMI_vectors[row][col] = comp.localMI(f1, f2, f12, tf)
                PMI_vectors[row][col] = comp.PMI(f1, f2, f12, tf)
            else:
                # No joint count for this pair: the unadjusted measures are
                # set to 0, the "LP" variants use a joint count of 1.
                LPPMI_vectors[row][col] = comp.PMI(f1 + 1, f2 + 1, 1, adjusted_total)
                LPlocMI_vectors[row][col] = comp.localMI(f1 + 1, f2 + 1, 1, adjusted_total)
                LLR_vectors[row][col] = 0
                PMI_vectors[row][col] = 0
                locMI_vectors[row][col] = 0

with open(config_data[PROJECT_PATH]+"syntactic_LLR_vectors_"+str(F_THRESHOLD)+".json", "w", encoding="utf-8") as f:
Beispiel #4
0
print("bulding AM matrices")

for i in f_matrix:
    # Open an empty row `i` in each of the five association-measure matrices.
    for matrix in (LLR_matrix, PMI_matrix, locMI_matrix,
                   LPPMI_matrix, LPlocMI_matrix):
        matrix[i] = {}

    for j in f_matrix[i]:
        f1, f2, f12 = full_counts[i], full_counts[j], f_matrix[i][j]
        # Total used by the "LP" variants: tf increased by len(wordlist).
        adjusted_total = tf + len(wordlist)

        if f12 > 0:
            LLR_matrix[i][j] = comp.LLR(f1, f2, f12, tf)
            PMI_matrix[i][j] = comp.PMI(f1, f2, f12, tf)
            locMI_matrix[i][j] = comp.localMI(f1, f2, f12, tf)
            LPPMI_matrix[i][j] = comp.PMI(f1 + 1, f2 + 1, f12 + 1, adjusted_total)
            LPlocMI_matrix[i][j] = comp.localMI(f1 + 1, f2 + 1, f12 + 1, adjusted_total)
        else:
            # Zero joint count: the unadjusted measures are stored as 0.
            LLR_matrix[i][j] = 0
            PMI_matrix[i][j] = 0
            locMI_matrix[i][j] = 0

            LPPMI_matrix[i][j] = comp.PMI(f1 + 1, f2 + 1, 1, adjusted_total)
            LPlocMI_matrix[i].update(
print("calculating association measures")

# The corpus total is identical for every cell, so it is summed once here
# instead of once per (row, col) pair.
tf = sum(config_data[CORPUS_FREQUENCIES])

# For every row, collect one LLR and one local-MI score per column, in the
# iteration order of that row's columns.
for row in collocation_matrix:
    c1 = first_order_freqs[row][0]
    LLR[row] = []
    locMI[row] = []

    for col in collocation_matrix[row]:
        c2 = freq_list[rank_map[col]] * wordcount_ratio
        c12 = collocation_matrix[row][col] * first_order_freqs[row][1]

        if c12 > 0:
            LLR[row].append(comp.LLR(c1, c2, c12, tf))
            locMI[row].append(comp.localMI(c1, c2, c12, tf))
        else:
            # Nothing co-occurred: store 0 rather than calling comp.
            LLR[row].append(0)
            locMI[row].append(0)

print("printing output files")

with open(config_data[PROJECT_PATH]+"sentence_collocation_matrix_with_freqs_LLR.json", "w", encoding="utf-8") as f:
    json.dump(LLR, f)

with open(config_data[PROJECT_PATH]+"sentence_collocation_matrix_with_freqs_locMI.json", "w", encoding="utf-8") as f:
    json.dump(locMI, f)