Example #1
0
def interpolate_ngram_collection(ngram_collection, corpus, start_token,
                                 end_token, inf_constant):
    """Score every sentence in ``corpus`` by combining all n-gram orders.

    Each sentence is padded with ``__build_word_list`` and cut into n-tuples,
    where ``n`` is ``len(ngram_collection)``. Every tuple contributes

        log2(lambds[0]) + log2(sum(2 ** log_prob for each n-gram order))

    to the sentence total; the per-order ``log_prob`` values are looked up in
    ``ngram_collection`` with the matching element of the tuple's powerset.

    Args:
        ngram_collection: Sequence with one lookup table per n-gram order.
            Entry ``i`` is indexed with ``powerset[i]`` and its values are
            used as base-2 log probabilities. Entry 0 is the table consulted
            by ``is_unkown_ngram`` to detect unknown words.
        corpus: Iterable of sentences to score.
        start_token: Forwarded to ``__build_word_list``.
        end_token: Forwarded to ``__build_word_list``.
        inf_constant: Converted with ``float`` and used as the score of the
            whole sentence as soon as one tuple is reported as unknown.

    Returns:
        A list with one total per sentence, in corpus order. A sentence that
        yields no scorable tuples keeps the initial total of ``0``.
    """
    # One table per order, so the collection size is also the tuple size.
    n = len(ngram_collection)
    lambds = lambdas_from_ngrams(ngram_collection)

    results = []
    for sentence in corpus:
        word_list = __build_word_list(sentence, start_token, end_token, n)
        n_tuple = gramer.ngram_from_word_list(word_list, n)

        total = 0
        for sentence_tuple in n_tuple:
            powerset = powerset_from_collection(sentence_tuple)

            try:
                # An unknown unigram invalidates the whole powerset, so the
                # sentence gets the sentinel score and scoring stops here.
                if is_unkown_ngram(powerset[0], ngram_collection[0]):
                    total = float(inf_constant)
                    break

                # Log probability of this tuple under every n-gram order.
                log_probs = [
                    table[powerset[order]]
                    for order, table in enumerate(ngram_collection)
                ]

                # Leave log space, add the probabilities, and go back.
                # NOTE(review): only lambds[0] is applied, as one common
                # factor for every order; the other weights returned by
                # lambdas_from_ngrams are never read. Confirm this is the
                # intended interpolation.
                probs = [2 ** log_prob for log_prob in log_probs]
                total += (math.log(lambds[0], 2)
                          + math.log(np.sum(probs), 2))
            except (LookupError, ArithmeticError, ValueError):
                # Best effort: a tuple that cannot be scored (n-gram missing
                # from a table, or a value outside log's domain) is skipped.
                # Anything else, such as a NameError, a TypeError or a
                # KeyboardInterrupt, now propagates instead of being hidden.
                continue

        results.append(total)

    return results
Example #2
0
def score(ngram_p, n, corpus, start_token, end_token):
    """Return one accumulated score per sentence of ``corpus``.

    Every sentence is padded with ``__build_word_list``, split into n-tuples
    by ``gramer.ngram_from_word_list``, and the ``ngram_p`` entries of those
    tuples are added up. No fallback is applied here: if a tuple is missing
    from ``ngram_p``, whatever error that lookup raises reaches the caller.

    Args:
        ngram_p: Lookup table indexed by n-tuple; its values are summed.
        n: Tuple size, forwarded to both helper calls.
        corpus: Iterable of sentences to score.
        start_token: Forwarded to ``__build_word_list``.
        end_token: Forwarded to ``__build_word_list``.

    Returns:
        A list of per-sentence totals, in corpus order (``0`` for a sentence
        that produces no tuples).
    """

    def _sentence_total(sentence):
        # Pad the sentence, then walk its n-tuples accumulating the scores.
        padded_words = __build_word_list(sentence, start_token, end_token, n)
        running = 0
        for gram in gramer.ngram_from_word_list(padded_words, n):
            running += ngram_p[gram]
        return running

    return [_sentence_total(sentence) for sentence in corpus]
def score(ngram_p, n, corpus, start_token, end_token):
    """Compute the summed ``ngram_p`` value of each sentence in ``corpus``.

    Args:
        ngram_p: Lookup table indexed by n-tuple; its values are added up.
        n: Tuple size, passed to ``__build_word_list`` and to
            ``gramer.ngram_from_word_list``.
        corpus: Iterable of sentences to score.
        start_token: Passed to ``__build_word_list``.
        end_token: Passed to ``__build_word_list``.

    Returns:
        A list with one total per sentence, in the order the sentences were
        given. A sentence without tuples contributes ``0``. Lookups are not
        guarded, so a tuple absent from ``ngram_p`` surfaces as the error
        raised by that lookup.
    """
    sentence_totals = []

    for line in corpus:
        # Pad the sentence and break it into its n-tuples in one step.
        grams = gramer.ngram_from_word_list(
            __build_word_list(line, start_token, end_token, n), n)

        subtotal = 0
        for gram in grams:
            subtotal += ngram_p[gram]

        sentence_totals.append(subtotal)

    return sentence_totals
def interpolate_ngram_collection(ngram_collection, corpus, start_token, end_token, inf_constant):
    """Score every sentence in ``corpus`` by combining all n-gram orders.

    Each sentence is padded with ``__build_word_list`` and cut into n-tuples,
    where ``n`` is ``len(ngram_collection)``. Every tuple contributes

        log2(lambds[0]) + log2(sum(2 ** log_prob for each n-gram order))

    to the sentence total; the per-order ``log_prob`` values are looked up in
    ``ngram_collection`` with the matching element of the tuple's powerset.

    Args:
        ngram_collection: Sequence with one lookup table per n-gram order.
            Entry ``i`` is indexed with ``powerset[i]`` and its values are
            used as base-2 log probabilities. Entry 0 is the table consulted
            by ``is_unkown_ngram`` to detect unknown words.
        corpus: Iterable of sentences to score.
        start_token: Forwarded to ``__build_word_list``.
        end_token: Forwarded to ``__build_word_list``.
        inf_constant: Converted with ``float`` and used as the score of the
            whole sentence as soon as one tuple is reported as unknown.

    Returns:
        A list with one total per sentence, in corpus order. A sentence that
        yields no scorable tuples keeps the initial total of ``0``.
    """
    # One table per order, so the collection size is also the tuple size.
    n = len(ngram_collection)
    lambds = lambdas_from_ngrams(ngram_collection)

    results = []
    for sentence in corpus:
        word_list = __build_word_list(sentence, start_token, end_token, n)
        n_tuple = gramer.ngram_from_word_list(word_list, n)

        total = 0
        for sentence_tuple in n_tuple:
            powerset = powerset_from_collection(sentence_tuple)

            try:
                # An unknown unigram invalidates the whole powerset, so the
                # sentence gets the sentinel score and scoring stops here.
                if is_unkown_ngram(powerset[0], ngram_collection[0]):
                    total = float(inf_constant)
                    break

                # Log probability of this tuple under every n-gram order.
                log_probs = [
                    table[powerset[order]]
                    for order, table in enumerate(ngram_collection)
                ]

                # Leave log space, add the probabilities, and go back.
                # NOTE(review): only lambds[0] is applied, as one common
                # factor for every order; the other weights returned by
                # lambdas_from_ngrams are never read. Confirm this is the
                # intended interpolation.
                probs = [2 ** log_prob for log_prob in log_probs]
                total += (math.log(lambds[0], 2)
                          + math.log(np.sum(probs), 2))
            except (LookupError, ArithmeticError, ValueError):
                # Best effort: a tuple that cannot be scored (n-gram missing
                # from a table, or a value outside log's domain) is skipped.
                # Anything else, such as a NameError, a TypeError or a
                # KeyboardInterrupt, now propagates instead of being hidden.
                continue

        results.append(total)

    return results