Beispiel #1
0
def seq_prob(w_all, n, n_grams, n_min_1_grams):
    """
    Computes the probability of a word sequence under an n-gram model.

    The sequence is cut into n-grams by parse_ngrams and the probability of
    every n-gram is multiplied together with product.

    w_all         -- the word sequence, handed unchanged to parse_ngrams
    n             -- order of the model; 1 selects the unigram branch
    n_grams       -- n-gram statistics, forwarded to rel_prob / cond_prob
                     (presumably a Counter of n-gram strings; verify
                     against the caller)
    n_min_1_grams -- (n-1)-gram statistics, forwarded to cond_prob

    Returns the value product yields for the per-n-gram probabilities.
    """
    parsed_n_grams = parse_ngrams(w_all, n)

    # Compare by value. `n is 1` tests object identity, which only happens to
    # work for CPython's cached small ints and is a SyntaxWarning since 3.8.
    if n == 1:
        return product(
            [rel_prob(ng.split(), n_grams) for ng in parsed_n_grams])

    # Each n-gram is conditioned on its history: all of its words but the last.
    return product([
        cond_prob(ng.split(), ng.split()[:-1], n_grams, n_min_1_grams)
        for ng in parsed_n_grams
    ])
Beispiel #2
0
def seq_prob_add1(w_all, n, n_grams, n_min_1_grams, unigrams):
    """
    Applies add-1 smoothing to the bi-gram model

    The sequence is cut into n-grams by parse_ngrams and the probability of
    every n-gram is multiplied together with product.

    w_all         -- the word sequence, handed unchanged to parse_ngrams
    n             -- order of the model; 1 selects the unigram branch
    n_grams       -- n-gram statistics, forwarded to rel_prob /
                     cond_prob_add1
    n_min_1_grams -- (n-1)-gram statistics, forwarded to cond_prob_add1
    unigrams      -- forwarded unchanged to cond_prob_add1 (presumably the
                     vocabulary size used by the smoothing; verify against
                     the caller)

    Note that the unigram branch (n == 1) uses the plain rel_prob and is
    therefore not smoothed.
    """
    parsed_n_grams = parse_ngrams(w_all, n)

    # Compare by value. `n is 1` tests object identity, which only happens to
    # work for CPython's cached small ints and is a SyntaxWarning since 3.8.
    if n == 1:
        return product(
            [rel_prob(ng.split(), n_grams) for ng in parsed_n_grams])

    # Each n-gram is conditioned on its history: all of its words but the last.
    return product([
        cond_prob_add1(ng.split(), ng.split()[:-1], n_grams, n_min_1_grams,
                       unigrams)
        for ng in parsed_n_grams
    ])
Beispiel #3
0
def seq_prob(w_all, n, n_grams, n_min_1_grams):
    """
    Computes the probability of a word sequence under an n-gram model.

    The sequence is padded with n-1 'START' tokens in front and n-1 'STOP'
    tokens at the end, cut into n-grams by parse_ngrams, and the probability
    of every n-gram is multiplied together with product.

    w_all         -- the words of the sequence; it is NOT modified, the
                     padding is applied to a copy
    n             -- order of the model; 1 selects the unigram branch and
                     adds no padding
    n_grams       -- n-gram statistics, forwarded to rel_prob / cond_prob
    n_min_1_grams -- (n-1)-gram statistics, forwarded to cond_prob

    Returns the value product yields for the per-n-gram probabilities.
    """
    m = n - 1

    # Pad a copy rather than inserting into w_all itself: mutating the
    # caller's list would stack additional START/STOP tokens on every
    # repeated call with the same sequence. List repetition with m <= 0
    # yields an empty list, so no padding is added in that case.
    padded = ['START'] * m + list(w_all) + ['STOP'] * m

    parsed_n_grams = parse_ngrams(padded, n)

    # Compare by value. `n is 1` tests object identity, which only happens to
    # work for CPython's cached small ints and is a SyntaxWarning since 3.8.
    if n == 1:
        return product(
            [rel_prob(ng.split(), n_grams) for ng in parsed_n_grams])

    # Each n-gram is conditioned on its history: all of its words but the last.
    return product([
        cond_prob(ng.split(), ng.split()[:-1], n_grams, n_min_1_grams)
        for ng in parsed_n_grams
    ])
Beispiel #4
0
def seq_prob_gt(w_all, n, n_grams, n_min_1_grams, N, unigrams):
    """
    Returns the probability of a sequence once good-turing has been applied.

    The n-grams of the sequence are split into those present in n_grams and
    those that are not. Every missing n-gram contributes the factor
    N[1] / (N[0] * len(n_grams)); every present n-gram contributes the value
    of cond_prob_gt. The unigrams argument is accepted but not used here.
    """
    sequence_ngrams = parse_ngrams(w_all, n)
    known = [gram for gram in sequence_ngrams if gram in n_grams]
    missing_count = len(sequence_ngrams) - len(known)

    result = 1
    if missing_count:
        # one identical factor per n-gram that is absent from n_grams
        result = (float(N[1]) / (N[0] * len(n_grams)))**missing_count

    # nothing left to multiply in when no n-gram of the sequence is known
    if not known:
        return result

    known_probs = []
    for gram in known:
        words = gram.split()
        known_probs.append(
            cond_prob_gt(words, words[0:-1], n_grams, n_min_1_grams))
    return result * product(known_probs)
Beispiel #5
0
    # INPUT CHECKS
    # Both files and n are mandatory, and n has to equal 2. The comparison is
    # done by value (`!=`): `is not 2` tests object identity, which only
    # happens to work for CPython's cached small ints and is a SyntaxWarning
    # since Python 3.8.
    if not args.train_file or not args.n or not args.test_file or args.n != 2:
        parser.print_help()
        exit(
            'Missing required arguments or n is not 2 (assignment is for bigrams)'
        )

    # split and flatten array
    # sentences is list of sentences that start with START and end with STOP
    sentences = get_sentences(
        add_start_stop(args.train_file, args.n if not args.m else 1))
    test_sentences = get_sentences(
        add_start_stop(args.test_file, args.n if not args.m else 1))

    # Count every n-gram and (n-1)-gram over all training sentences.
    # chain.from_iterable flattens the per-sentence results without unpacking
    # them into call arguments or building an intermediate list.
    n_grams = Counter(
        chain.from_iterable(parse_ngrams(sen, args.n) for sen in sentences))
    n_min_1_grams = Counter(
        chain.from_iterable(
            parse_ngrams(sen, args.n - 1) for sen in sentences))

    # number of distinct (n-1)-grams; with n == 2 these are the unigrams
    unigrams = len(n_min_1_grams)

    probs = calc_probabilities_seq_file(test_sentences, args.n, n_grams,
                                        n_min_1_grams, unigrams,
                                        args.smoothing)

    # Share of entries with a nonzero probability. An empty result is
    # reported as 0.0 instead of raising ZeroDivisionError.
    nonzero = sum(1 for prob in probs if probs[prob] != 0)
    percentagenonzero = 100 * float(nonzero) / len(probs) if probs else 0.0
    print('{} % of {} have a nonzero probability'.format(
        percentagenonzero, len(probs)))
    print('{} most likely sentences:'.format(args.m))
    print_ngrams(sort_ngrams_bidirectional(probs, True), args.m)
Beispiel #6
0
        if idx is m:
            break
        idx += 1
        print '{} {}'.format(word, freq)
    
if __name__ == "__main__":
    # Script entry point: parse the command-line options, count the n-grams
    # of the corpus and print their total followed by the sorted n-grams.
    
    # None of the options is marked as required, so each attribute of args
    # is None when its option is omitted.
    parser = ArgumentParser(description='Assignment A, Step 1')
    parser.add_argument('-corpus', dest ='input_file', type=str, help='Path to corpus file')
    parser.add_argument('-n', dest='n', type=int, help='Length of word-sequences to process (n-grams)')
    parser.add_argument('-m', dest='m', type=int, default=None, help='Number of n-grams to show in output')
    args = parser.parse_args()
    
    # Read the corpus and count how often each n-gram of length args.n occurs.
    lines = read_words(args.input_file)
    n_grams_frequency = Counter(parse_ngrams(lines, args.n))
    
    # Total number of n-gram occurrences (sum over all counts).
    freq_sum = sum(n_grams_frequency.values())
    print 'sum: {}'.format(freq_sum)
    
    # sort n_grams by value in descending order
    n_grams_frequency = sort_ngrams(n_grams_frequency)
    
    # args.m is passed on as the output limit; it is None when -m is not given.
    print_ngrams(n_grams_frequency, args.m)
# a1_step2.py
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 10 11:36:38 2015

@author: markus
"""
Beispiel #7
0
    # Scored permutations require n == 2 (see the exit message). The
    # comparison is done by value (`!=`): `is not 2` tests object identity,
    # which only happens to work for CPython's cached small ints and is a
    # SyntaxWarning since Python 3.8.
    if args.scored_perms and args.n != 2:
        exit('n must be 2 when using permutations')

    # At least one of the task options has to be given.
    if not (args.cond_file or args.m or args.seq_file or args.scored_perms):
        parser.print_help()
        exit('What shall I do?')

    # split and flatten array
    # sentences is list of sentences that start with START and end with STOP
    sentences = get_sentences(
        add_start_stop(args.input_file, args.n if not args.m else 1))

    # Count every n-gram and (n-1)-gram over all sentences.
    # chain.from_iterable flattens the per-sentence results without unpacking
    # them into call arguments or building an intermediate list.
    n_grams = Counter(
        itertools.chain.from_iterable(
            parse_ngrams(sen, args.n) for sen in sentences))
    n_min_1_grams = Counter(
        itertools.chain.from_iterable(
            parse_ngrams(sen, args.n - 1) for sen in sentences))

    # Counter(...) above always yields a Counter, so this only swaps an empty
    # Counter for a fresh empty one; kept as a defensive reset for the case
    # where no (n-1)-grams were produced (e.g. n=1).
    if not n_min_1_grams:
        n_min_1_grams = Counter()

    # if wished, print the m most frequent n-grams
    if args.m:
        print('n-grams:')
        print_ngrams(sort_ngrams(n_grams), args.m)