Beispiel #1
def seq_prob(w_all, n, n_grams, n_min_1_grams):
    """Return the probability of the word sequence ``w_all`` under an n-gram model.

    The sequence is cut into n-grams with ``parse_ngrams`` and the
    per-n-gram probabilities are multiplied together with ``product``.

    Args:
        w_all: the words of the sequence, passed straight to ``parse_ngrams``.
        n: order of the model (1 = unigram, 2 = bigram, ...).
        n_grams: n-gram counts, handed to ``rel_prob`` / ``cond_prob``.
        n_min_1_grams: (n-1)-gram counts, handed to ``cond_prob``.

    Returns:
        Whatever ``product`` returns for the list of per-n-gram
        probabilities.
    """
    parsed_n_grams = parse_ngrams(w_all, n)

    # Compare by value: ``n is 1`` tests object identity and only happens to
    # work because CPython caches small integers.
    if n == 1:
        # Unigram model: no history to condition on.
        return product(
            [rel_prob(ng.split(), n_grams) for ng in parsed_n_grams])

    # Each n-gram is a space-separated string; its history is every word
    # except the last one.
    return product([
        cond_prob(ng.split(),
                  ng.split()[0:-1], n_grams, n_min_1_grams)
        for ng in parsed_n_grams
    ])
Beispiel #2
def seq_prob_add1(w_all, n, n_grams, n_min_1_grams, unigrams):
    """Return the probability of ``w_all`` using add-1 smoothing.

    Applies add-1 smoothing to the bi-gram model.

    Args:
        w_all: the words of the sequence, passed straight to ``parse_ngrams``.
        n: order of the model.
        n_grams: n-gram counts.
        n_min_1_grams: (n-1)-gram counts.
        unigrams: forwarded unchanged to the smoothed conditional
            probability helper (presumably the vocabulary size -- confirm
            against the caller).

    Returns:
        Whatever ``product`` returns for the list of per-n-gram
        probabilities.
    """
    parsed_n_grams = parse_ngrams(w_all, n)

    # Compare by value: ``n is 1`` tests object identity and only happens to
    # work because CPython caches small integers.
    if n == 1:
        # The unigram branch uses the plain relative frequency.
        return product(
            [rel_prob(ng.split(), n_grams) for ng in parsed_n_grams])

    # NOTE(review): the head of this call was lost from the file; the helper
    # name ``cond_prob_add1`` is reconstructed from the alignment of the
    # surviving continuation line -- confirm it matches the real helper.
    return product([
        cond_prob_add1(ng.split(),
                       ng.split()[0:-1], n_grams, n_min_1_grams, unigrams)
        for ng in parsed_n_grams
    ])
Beispiel #3
def seq_prob(w_all, n, n_grams, n_min_1_grams):
    """Return the probability of ``w_all`` after padding it with START markers.

    ``n - 1`` ``'START'`` tokens are placed in front of the sequence so that
    the first real word also has a full history. No STOP token is added
    here.

    Unlike the earlier version, the padding is applied to a copy: the
    caller's list is left untouched, so calling this function repeatedly on
    the same sequence no longer stacks extra START tokens.

    Args:
        w_all: the words of the sequence (any iterable of tokens).
        n: order of the model (1 = unigram, 2 = bigram, ...).
        n_grams: n-gram counts, handed to ``rel_prob`` / ``cond_prob``.
        n_min_1_grams: (n-1)-gram counts, handed to ``cond_prob``.

    Returns:
        Whatever ``product`` returns for the list of per-n-gram
        probabilities.
    """
    # ``['START'] * k`` is empty for k <= 0, matching the old loop, which
    # simply did not run in that case.
    padded = ['START'] * (n - 1) + list(w_all)

    parsed_n_grams = parse_ngrams(padded, n)

    # Compare by value: ``n is 1`` tests object identity and only happens to
    # work because CPython caches small integers.
    if n == 1:
        return product(
            [rel_prob(ng.split(), n_grams) for ng in parsed_n_grams])

    # Each n-gram is a space-separated string; its history is every word
    # except the last one.
    return product([
        cond_prob(ng.split(), ng.split()[0:-1], n_grams, n_min_1_grams)
        for ng in parsed_n_grams
    ])
Beispiel #4
def seq_prob_gt(w_all, n, n_grams, n_min_1_grams, N, unigrams):
    """Compute the sequence probability after Good-Turing has been performed.

    N-grams of the sequence that do not occur in ``n_grams`` each contribute
    a factor ``N[1] / (N[0] * len(n_grams))``; the remaining n-grams
    contribute their conditional probability.

    Args:
        w_all: the words of the sequence, passed straight to ``parse_ngrams``.
        n: order of the model.
        n_grams: n-gram counts, keyed by the n-gram string.
        n_min_1_grams: (n-1)-gram counts.
        N: indexable with 0 and 1 (presumably frequency-of-frequency
            counts -- confirm against the caller).
        unigrams: not used by the visible code; kept for interface
            compatibility.

    Returns:
        The probability of the sequence.
    """
    prob = 1
    parsed_n_grams = parse_ngrams(w_all, n)
    unseen = sum(1 for ng in parsed_n_grams if ng not in n_grams)

    if unseen:
        # One identical factor per unseen n-gram.
        prob = (float(N[1]) / (N[0] * len(n_grams)))**unseen
    if unseen == len(parsed_n_grams):
        # Nothing was seen, so there are no further factors to multiply in.
        return prob

    # NOTE(review): the head of this call was lost from the file; the helper
    # name ``cond_prob_gt`` is reconstructed from the alignment of the
    # surviving continuation line -- confirm it matches the real helper.
    prob = prob * product([
        cond_prob_gt(ng.split(),
                     ng.split()[0:-1], n_grams, n_min_1_grams)
        for ng in parsed_n_grams if ng in n_grams
    ])
    return prob
Beispiel #5
    if not args.train_file or not args.n or not args.test_file or args.n is not 2:
            'Missing required arguments or n is not 2 (assignment is for bigrams)'

    # split and flatten array
    # sentences is list of sentences that start with START and end with STOP
    sentences = get_sentences(
        add_start_stop(args.train_file, args.n if not args.m else 1))
    test_sentences = get_sentences(
        add_start_stop(args.test_file, args.n if not args.m else 1))

    n_grams = Counter(
        list(chain(*[parse_ngrams(sen, args.n) for sen in sentences])))
    n_min_1_grams = Counter(
        list(chain(*[parse_ngrams(sen, args.n - 1) for sen in sentences])))

    unigrams = len(n_min_1_grams)

    probs = calc_probabilities_seq_file(test_sentences, args.n, n_grams,
                                        n_min_1_grams, unigrams,

    percentagenonzero = 100 * float(
        len([prob for prob in probs if probs[prob] != 0])) / len(probs)
    print('{} % of {} have a nonzero probability'.format(
        percentagenonzero, len(probs)))
    print('{} most likely sentences:'.format(args.m))
    print_ngrams(sort_ngrams_bidirectional(probs, True), args.m)
Beispiel #6
        if idx is m:
        idx += 1
        print '{} {}'.format(word, freq)
if __name__ == "__main__":
    # Command-line entry point: count the n-grams of a corpus, print the
    # total number of n-gram occurrences, then print the n-grams themselves.
    parser = ArgumentParser(description='Assignment A, Step 1')
    parser.add_argument('-corpus', dest='input_file', type=str,
                        help='Path to corpus file')
    parser.add_argument('-n', dest='n', type=int,
                        help='Length of word-sequences to process (n-grams)')
    parser.add_argument('-m', dest='m', type=int, default=None,
                        help='Number of n-grams to show in output')
    args = parser.parse_args()

    # Fail with a usage message instead of crashing inside a helper when a
    # mandatory option is left out.
    if args.input_file is None or args.n is None:
        parser.error('both -corpus and -n are required')

    lines = read_words(args.input_file)
    n_grams_frequency = Counter(parse_ngrams(lines, args.n))
    freq_sum = sum(n_grams_frequency.values())
    # Call form of print with a single argument behaves the same under
    # Python 2 and Python 3.
    print('sum: {}'.format(freq_sum))
    # sort n_grams by value in descending order
    n_grams_frequency = sort_ngrams(n_grams_frequency)
    print_ngrams(n_grams_frequency, args.m)
a1_step2.py000644 000765 000024 00000014531 12470142143 013515 0ustar00markusstaff000000 000000 # -*- coding: utf-8 -*-
Created on Tue Feb 10 11:36:38 2015

@author: markus
Beispiel #7
    if args.scored_perms and args.n is not 2:
        exit('n must be 2 when using permutations')

    if not args.cond_file and not args.m and not args.seq_file and not args.scored_perms:
        exit('What shall I do?')

    # split and flatten array
    # sentences is list of sentences that start with START and end with STOP
    sentences = get_sentences(
        add_start_stop(args.input_file, args.n if not args.m else 1))


    n_grams = Counter(
        list(itertools.chain(*[parse_ngrams(sen, args.n)
                               for sen in sentences])))
    n_min_1_grams = Counter(
                *[parse_ngrams(sen, args.n - 1) for sen in sentences])))

    # when n=1 n_min_1_grams would become a dict instead of a Counter. To keep
    # stuff consistent...
    if not n_min_1_grams:
        n_min_1_grams = Counter()

    # if wished, print m most bigrams
    if args.m:
        print_ngrams(sort_ngrams(n_grams), args.m)