Exemple #1
0
 def score_bigrams(ns_counts):
     """Lazily yield ``(bigram, score)`` pairs for every counted bigram.

     Bigrams are visited in the order produced by
     ``ns_counts[2].most_common()``.  For each one the score is
     ``BigramAssocMeasures.likelihood_ratio`` called with:

     * the bigram's own count, looked up in ``ns_counts[2]`` under
       ``(bigram[0], bigram[1])``;
     * the counts of its two members, looked up in ``ns_counts[1]`` under the
       1-tuples ``(bigram[0],)`` and ``(bigram[1],)``;
     * ``ns_counts[0][()]`` (presumably the overall total -- confirm against
       whatever builds ``ns_counts``).

     All lookups happen inside the loop, so nothing besides ``ns_counts[2]``
     is touched when there are no bigrams.
     """
     for pair, _ in ns_counts[2].most_common():
         left, right = pair[0], pair[1]
         joint = ns_counts[2][(left, right)]
         member_counts = (ns_counts[1][(left,)], ns_counts[1][(right,)])
         total = ns_counts[0][()]
         yield pair, BigramAssocMeasures.likelihood_ratio(
             joint, member_counts, total)
def count_statistics(candidates, bigram_corpus_size, trigram_corpus_size):
    """Score every candidate object with n-gram association measures.

    ``candidates`` is walked as ``candidates[word][linkage]`` -> iterable of
    objects.  Each object is read for ``abs_freq``, ``first_word``,
    ``second_word`` and ``third_word``.  Objects with a truthy ``third_word``
    are scored with ``TrigramAssocMeasures``; all others are scored with
    ``BigramAssocMeasures``.  The scores are written onto the objects in
    place (``dice``, ``chi``, ``t_score``, ``pmi``, ...).

    ``bigram_corpus_size`` and ``trigram_corpus_size`` are handed to the
    measures as the "any bigram" / "any trigram" totals.

    A word that appears as ``first_word``/``second_word``/``third_word`` but
    is not itself a key of ``candidates`` raises ``KeyError``.
    """
    print('=== Counting association measure ===')

    def joined(left, right):
        # Key under which a two-word combination is stored in `frequencies`.
        return left + '_' + right

    # Pass 1: build one frequency table holding
    #   * per word: the summed abs_freq of every object filed under it, and
    #   * per bigram object: its abs_freq under the key "first_second".
    frequencies = {}
    for word in candidates:
        by_linkage = candidates[word]
        running_total = 0
        for linkage in by_linkage:
            for obj in by_linkage[linkage]:
                running_total += obj.abs_freq
                if not obj.third_word:
                    frequencies[
                        joined(obj.first_word, obj.second_word)
                    ] = obj.abs_freq
        frequencies[word] = running_total

    # Pass 2: turn the frequencies into marginals and score each object.
    for word in candidates:
        by_linkage = candidates[word]
        for linkage in by_linkage:
            for obj in by_linkage[linkage]:
                if obj.third_word:
                    # Trigram marginals.
                    joint = obj.abs_freq  # (w1, w2, w3)
                    first_total = frequencies[obj.first_word]  # (w1, *, *)
                    second_total = frequencies[obj.second_word]  # (*, w2, *)
                    third_total = frequencies[obj.third_word]  # (*, *, w3)
                    # Pair counts fall back to 0 when the pair was never
                    # recorded as a bigram object in pass 1.
                    pair_totals = (
                        frequencies.get(
                            joined(obj.first_word, obj.second_word), 0),
                        frequencies.get(
                            joined(obj.first_word, obj.third_word), 0),
                        frequencies.get(
                            joined(obj.second_word, obj.third_word), 0),
                    )
                    marginals = (
                        joint,
                        pair_totals,
                        (first_total, second_total, third_total),
                        trigram_corpus_size,
                    )

                    # Dice is computed by hand from the single-word totals.
                    obj.dice = 3 * float(joint) / float(
                        first_total + second_total + third_total)
                    obj.chi = TrigramAssocMeasures.chi_sq(*marginals)
                    obj.jaccard = TrigramAssocMeasures.jaccard(*marginals)
                    # NOTE: no `likelihood_ratio` (or `fisher`) attribute is
                    # assigned for trigrams; only bigram objects get those.
                    obj.mi = TrigramAssocMeasures.mi_like(*marginals)
                    obj.pmi = TrigramAssocMeasures.pmi(*marginals)
                    obj.poisson_stirling = (
                        TrigramAssocMeasures.poisson_stirling(*marginals))
                    obj.t_score = TrigramAssocMeasures.student_t(*marginals)
                else:
                    # Bigram marginals: (w1, w2), ((w1, *), (*, w2)), total.
                    marginals = (
                        obj.abs_freq,
                        (
                            frequencies[obj.first_word],
                            frequencies[obj.second_word],
                        ),
                        bigram_corpus_size,
                    )
                    obj.dice = BigramAssocMeasures.dice(*marginals)
                    obj.chi = BigramAssocMeasures.chi_sq(*marginals)
                    obj.t_score = BigramAssocMeasures.student_t(*marginals)
                    obj.poisson_stirling = (
                        BigramAssocMeasures.poisson_stirling(*marginals))
                    obj.pmi = BigramAssocMeasures.pmi(*marginals)
                    obj.mi = BigramAssocMeasures.mi_like(*marginals)
                    obj.likelihood_ratio = (
                        BigramAssocMeasures.likelihood_ratio(*marginals))
                    obj.jaccard = BigramAssocMeasures.jaccard(*marginals)
                    obj.fisher = BigramAssocMeasures.fisher(*marginals)