Code example #1
    @classmethod
    def setUpClass(cls):

        text = [list("abcd"), list("egdbe")]
        cls.trigram_counter = NgramCounter(
            (everygrams(sent, max_len=3) for sent in text))
        cls.bigram_counter = NgramCounter(
            (everygrams(sent, max_len=2) for sent in text))
Code example #2
File: test_counter.py Project: vishalbelsare/nltk
    @classmethod
    def setup_class(self):
        text = [list("abcd"), list("egdbe")]
        self.trigram_counter = NgramCounter(
            everygrams(sent, max_len=3) for sent in text)
        self.bigram_counter = NgramCounter(
            everygrams(sent, max_len=2) for sent in text)
        self.case = unittest.TestCase()
Code example #3
def prune_counter(counter, order, threshold=10):
    new_counter = NgramCounter()
    new_counter._counts[1] = counter[1]
    for i in range(2, order + 1):
        new_counter._counts[i] = prune_cond_dist(counter[i],
                                                 threshold=threshold)
    return new_counter
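
Example #3 relies on a prune_cond_dist helper that the snippet does not show. A minimal sketch of what such a helper might look like, assuming it drops every context whose total count falls below the threshold (the name and behavior are inferred, not taken from the original project):

from nltk.probability import ConditionalFreqDist

# Hypothetical reconstruction of the prune_cond_dist helper assumed above;
# it is not part of the original snippet.
def prune_cond_dist(cond_dist, threshold=10):
    pruned = ConditionalFreqDist()
    for context in cond_dist.conditions():
        # keep only contexts whose total count reaches the threshold
        if cond_dist[context].N() >= threshold:
            pruned[context] = cond_dist[context]
    return pruned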
Code example #4
    def test_train_on_illegal_sentences(self):
        str_sent = ["Check", "this", "out", "!"]
        list_sent = [["Check", "this"], ["this", "out"], ["out", "!"]]

        with self.assertRaises(TypeError):
            NgramCounter([str_sent])

        with self.assertRaises(TypeError):
            NgramCounter([list_sent])
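
Both constructor calls raise because NgramCounter requires every ngram to be a tuple: str_sent is a list of plain strings and list_sent a list of lists. A minimal sketch of a valid equivalent input, built with nltk.util.bigrams, which yields tuples:

from nltk.lm import NgramCounter
from nltk.util import bigrams

str_sent = ["Check", "this", "out", "!"]
counter = NgramCounter([bigrams(str_sent)])  # bigrams() yields tuples
print(counter[("this",)]["out"])  # 1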
Code example #5
File: test_counter.py Project: vishalbelsare/nltk
    def test_train_on_unigrams(self):
        words = list("abcd")
        counter = NgramCounter([[(w, ) for w in words]])

        assert not counter[3]
        assert not counter[2]
        self.case.assertCountEqual(words, counter[1].keys())
Code example #6
    def test_train_on_unigrams(self):
        words = list("abcd")
        counter = NgramCounter([[(w, ) for w in words]])

        self.assertFalse(bool(counter[3]))
        self.assertFalse(bool(counter[2]))
        six.assertCountEqual(self, words, counter[1].keys())
Code example #7
    def test_train_on_mix(self):
        mixed_sent = [("a", "b"), ("c", "d"), ("e", "f", "g"), ("h", )]
        counter = NgramCounter([mixed_sent])
        unigrams = ["h"]
        bigram_contexts = [("a", ), ("c", )]
        trigram_contexts = [("e", "f")]

        six.assertCountEqual(self, unigrams, counter[1].keys())
        six.assertCountEqual(self, bigram_contexts, counter[2].keys())
        six.assertCountEqual(self, trigram_contexts, counter[3].keys())
Code example #8
File: test_counter.py Project: vishalbelsare/nltk
class TestNgramCounter:
    """Tests for NgramCounter that only involve lookup, no modification."""
    @classmethod
    def setup_class(self):
        text = [list("abcd"), list("egdbe")]
        self.trigram_counter = NgramCounter(
            everygrams(sent, max_len=3) for sent in text)
        self.bigram_counter = NgramCounter(
            everygrams(sent, max_len=2) for sent in text)
        self.case = unittest.TestCase()

    def test_N(self):
        assert self.bigram_counter.N() == 16
        assert self.trigram_counter.N() == 21

    def test_counter_len_changes_with_lookup(self):
        assert len(self.bigram_counter) == 2
        self.bigram_counter[50]
        assert len(self.bigram_counter) == 3

    def test_ngram_order_access_unigrams(self):
        assert self.bigram_counter[1] == self.bigram_counter.unigrams

    def test_ngram_conditional_freqdist(self):
        case = unittest.TestCase()
        expected_trigram_contexts = [
            ("a", "b"),
            ("b", "c"),
            ("e", "g"),
            ("g", "d"),
            ("d", "b"),
        ]
        expected_bigram_contexts = [("a", ), ("b", ), ("d", ), ("e", ),
                                    ("c", ), ("g", )]

        bigrams = self.trigram_counter[2]
        trigrams = self.trigram_counter[3]

        self.case.assertCountEqual(expected_bigram_contexts,
                                   bigrams.conditions())
        self.case.assertCountEqual(expected_trigram_contexts,
                                   trigrams.conditions())
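
The class above exercises NgramCounter's lookup semantics: an integer key selects an n-gram order, order 1 is a plain FreqDist, and every higher order is a ConditionalFreqDist keyed by context tuples. A standalone sketch of the same behavior, checked against the counts asserted above:

from nltk.lm import NgramCounter
from nltk.util import everygrams

text = [list("abcd"), list("egdbe")]
counter = NgramCounter(everygrams(sent, max_len=3) for sent in text)

print(counter.N())              # 21 ngrams of orders 1-3 in total
print(counter[1]["b"])          # unigram count of "b": 2
print(counter[2][("a",)]["b"])  # count of the bigram ("a", "b"): 1
print(counter[("a",)]["b"])     # tuple keys address a context directly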
Code example #9
def SetUpUnigramModel():
    newsListOne = []
    with open("combined.txt", 'r', encoding='utf-8', errors='ignore') as outfile:
        newslist = json.load(outfile)
    for news in newslist:
        newsListOne.extend(news)
    text = ' '.join([str(elem) for elem in newsListOne])
    tokenized_text = [list(map(str.lower, nltk.word_tokenize(sent))) for sent in nltk.sent_tokenize(text)]
    text_unigrams = [ngrams(sent, 1) for sent in tokenized_text]
    unigram_counter_model = NgramCounter(text_unigrams)
    return unigram_counter_model
Code example #10
def SetUpUnigramModel():
    if not os.path.isfile('combined.txt'):
        raise FileNotFoundError('combined.txt')
    with io.open('combined.txt', encoding='utf8') as fin:
        text1 = fin.read()
    tokenized_text = [
        list(map(str.lower, nltk.word_tokenize(sent)))
        for sent in nltk.sent_tokenize(text1)
    ]
    #print(tokenized_text)
    text_unigrams = [ngrams(sent, 1) for sent in tokenized_text]
    unigram_counter_model = NgramCounter(text_unigrams)
    return unigram_counter_model
Code example #11
File: functions.py Project: qfournier/syscall_args
def nltk_ngram(call, vocab, n):
    """Compute n-grams using the nltk library.

    Args:
        call (list): list of system call name (as integer) sequences
        vocab (list): mapping from integer to system call name
        n (int): the n-gram order

    Returns:
        dict: mapping {context: most frequent following word}
    """
    # convert sequences of integers into sequences of strings and count
    # n-grams with NLTK
    counter = NgramCounter([ngrams([vocab[w] for w in s], n) for s in call])
    # store predictions in a dictionary {context: prediction}
    return {
        context: max(counter[context].items(), key=operator.itemgetter(1))[0]
        for context in it.product(vocab, repeat=n - 1) if counter[context]
    }
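
A hypothetical invocation of nltk_ngram; the vocab and call values below are invented for illustration, and the imports mirror what the function body needs:

import itertools as it
import operator

from nltk.lm import NgramCounter
from nltk.util import ngrams

vocab = ["open", "read", "close"]  # index -> syscall name
call = [[0, 1, 1, 2], [0, 1, 2]]   # two integer-encoded sequences
print(nltk_ngram(call, vocab, n=2))
# {('open',): 'read', ('read',): 'close'}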
Code example #12
from nltk.util import ngrams
import os
import io
import nltk
from nltk.lm import NgramCounter
#text_unigrams = [ngrams(sent, 1) for sent in text]

if __name__ == '__main__':
    if not os.path.isfile('combined.txt'):
        raise FileNotFoundError('combined.txt')
    with io.open('combined.txt', encoding='utf8') as fin:
        text = fin.read()
    tokenized_text = [
        list(map(str.lower, nltk.word_tokenize(sent)))
        for sent in nltk.sent_tokenize(text)
    ]
    #print(tokenized_text)
    text_unigrams = [ngrams(sent, 1) for sent in tokenized_text]
    unigram_counts = NgramCounter(text_unigrams)
    print(unigram_counts['අද'])
Code example #13
    def test_None(self):
        test = NgramCounter(None)
        self.assertNotIn(2, test)
        self.assertEqual(test[1], FreqDist())
Code example #14
    def test_empty_list(self):
        test = NgramCounter([])
        self.assertNotIn(2, test)
        self.assertEqual(test[1], FreqDist())
Code example #15
    def test_empty_string(self):
        test = NgramCounter("")
        self.assertNotIn(2, test)
        self.assertEqual(test[1], FreqDist())
Code example #16
    def setUp(self):
        self.counter = NgramCounter()
Code example #17
def count_ngrams_and_vocab(corpus, n=3, unk_cutoff=10):
    tokenized_text = [list(map(str.lower, word_tokenize(sent))) for sent in corpus]
    training_ngrams, padded_sents = padded_everygram_pipeline(n, tokenized_text)
    return NgramCounter(training_ngrams), Vocabulary(padded_sents, unk_cutoff=unk_cutoff)
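
A hypothetical call on a toy corpus (the two sentences are made up for illustration):

from nltk import word_tokenize
from nltk.lm import NgramCounter, Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline

corpus = ["The cat sat.", "The cat slept."]
counter, vocab = count_ngrams_and_vocab(corpus, n=2, unk_cutoff=1)
print(counter.N())          # everygram count, padding symbols included
print(vocab.lookup("cat"))  # 'cat' survives with unk_cutoff=1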
Code example #18
    def test_train_on_bigrams(self):
        bigram_sent = [("a", "b"), ("c", "d")]
        counter = NgramCounter([bigram_sent])

        self.assertFalse(bool(counter[3]))
Code example #19
def process_reviews(file_name):
    positive_texts, negative_texts, first_sent = read_reviews(file_name)

    # There are 150 positive reviews and 150 negative reviews.
    # print(len(positive_texts))
    # print(len(negative_texts))
    pos = []
    poswords = []
    neg = []
    negwords = []
    for i in range(0, len(positive_texts)):
        p = normalize(word_tokenize(positive_texts[i]))
        for item in p:
            poswords.append(item)
        pos.append(p)
        n = normalize(word_tokenize(negative_texts[i]))
        for item in n:
            negwords.append(item)
        neg.append(n)
    pu = open("POSITIVE-unigram-freq.txt", 'w', encoding="utf-8")
    nu = open("NEGATIVE-unigram-freq.txt", 'w', encoding="utf-8")
    pb = open("POSITIVE-bigram-freq.txt", 'w', encoding="utf-8")
    nb = open("NEGATIVE-bigram-freq.txt", 'w', encoding="utf-8")

    fdist = FreqDist(word for word in poswords)
    print(fdist["the"])
    print(fdist["wine"])
    print(fdist["list"])
    pos_unigrams = [ngrams(sent, 1) for sent in pos]
    pos_bigrams = [ngrams(sent, 2) for sent in pos]
    pos_trigrams = [ngrams(sent, 3) for sent in pos]
    pos_4grams = [ngrams(sent, 4) for sent in pos]
    pos_5grams = [ngrams(sent, 5) for sent in pos]

    pos_counts = NgramCounter(pos_unigrams + pos_bigrams + pos_trigrams +
                              pos_4grams + pos_5grams)

    neg_unigrams = [ngrams(sent, 1) for sent in neg]
    neg_bigrams = [ngrams(sent, 2) for sent in neg]
    neg_trigrams = [ngrams(sent, 3) for sent in neg]
    neg_4grams = [ngrams(sent, 4) for sent in neg]
    neg_5grams = [ngrams(sent, 5) for sent in neg]

    neg_counts = NgramCounter(neg_unigrams + neg_bigrams + neg_trigrams +
                              neg_4grams + neg_5grams)

    p1 = pos_counts[1]
    p2 = pos_counts[2]
    p3 = pos_counts[3]
    p4 = pos_counts[4]
    p5 = pos_counts[5]

    print(fdist.N())
    print(p2[('restaurant', 'excellent')])
    n1 = neg_counts[1]
    n2 = neg_counts[2]
    n3 = neg_counts[3]
    n4 = neg_counts[4]
    n5 = neg_counts[5]

    unigramout(p1, pu)
    unigramout(n1, nu)
    bigramout(p2, pb)
    bigramout(n2, nb)

    pu.close()
    nu.close()
    pb.close()
    nb.close()

    postext = nltk.Text(poswords)
    negtext = nltk.Text(negwords)
    postext.collocations()
    negtext.collocations()
    return
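
This example depends on helpers (read_reviews, normalize, unigramout, bigramout) that the listing does not show. A plausible sketch of the two output helpers, assuming they write tab-separated counts, most frequent first:

# Hypothetical reconstructions; not part of the original snippet.
def unigramout(freq_dist, out):
    # NgramCounter stores order 1 as a plain FreqDist keyed by word
    for word, count in freq_dist.most_common():
        out.write(f"{word}\t{count}\n")

def bigramout(cond_dist, out):
    # higher orders are ConditionalFreqDists keyed by context tuples
    for context in cond_dist.conditions():
        for word, count in cond_dist[context].most_common():
            out.write(f"{context[0]} {word}\t{count}\n")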
Code example #20
def get_subset_from_counter(counter, order):
    new_counter = NgramCounter()
    for i in range(order):
        new_counter._counts[i + 1] = counter[i + 1]

    return new_counter
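
Note that get_subset_from_counter reaches into NgramCounter's private _counts attribute. A sketch of an alternative that stays on the public API by re-emitting the stored ngrams (slower, but independent of internals):

from nltk.lm import NgramCounter

def get_subset_via_public_api(counter, order):
    # re-emit every stored ngram instead of copying counter._counts
    def iter_ngrams():
        # order 1 is stored as a plain FreqDist keyed by word
        for word, count in counter.unigrams.items():
            yield from [(word,)] * count
        # orders >= 2 are ConditionalFreqDists keyed by context tuples
        for n in range(2, order + 1):
            cfd = counter[n]
            for context in cfd.conditions():
                for word, count in cfd[context].items():
                    yield from [context + (word,)] * count

    new_counter = NgramCounter()
    new_counter.update([iter_ngrams()])
    return new_counter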
Code example #21
File: test_counter.py Project: vishalbelsare/nltk
    @classmethod
    def setup_class(self):
        self.counter = NgramCounter()
        self.case = unittest.TestCase()
Code example #22
from nltk.corpus import inaugural
from nltk.util import ngrams
from nltk.lm import NgramCounter, Vocabulary
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE

from nltk.lm import Lidstone
from nltk.lm import Laplace
from nltk.lm import KneserNeyInterpolated

# Exercise 1

president_unigrams = {}

for president in inaugural.fileids():
    text_unigrams = [ngrams(sent, 1) for sent in inaugural.sents(president)]
    ngram_counts = NgramCounter(text_unigrams)
    president_unigrams[president] = ngram_counts.N()

inverse_unigrams = [(value, key) for key, value in president_unigrams.items()]
print(max(inverse_unigrams)[1],
      max(inverse_unigrams)[0])  # longest address: Harrison in 1841
print(min(inverse_unigrams)[1],
      min(inverse_unigrams)[0])  # shortest address: Washington in 1793

president_vocabulary = {}

for president in inaugural.fileids():
    vocab = Vocabulary(inaugural.words(president), unk_cutoff=2)
    president_vocabulary[president] = len(vocab)

inverse_vocabulary = [(value, key)
                      for key, value in president_vocabulary.items()]
Code example #23
from nltk.lm import NgramCounter, Vocabulary
from nltk.lm.preprocessing import padded_everygram_pipeline
import pickle

model_dir = '../../data/ngrams/'

with open(f'{model_dir}tokenized_text.pickle', 'rb') as file:
    tokenized_text = pickle.load(file)

training_ngrams, padded_sents = padded_everygram_pipeline(3, tokenized_text)
counter = NgramCounter(training_ngrams)
vocabulary = Vocabulary(padded_sents, unk_cutoff=10)

with open(f'{model_dir}counter.pickle', 'wb') as file:
    pickle.dump(counter, file)

with open(f'{model_dir}vocabulary.pickle', 'wb') as file:
    pickle.dump(vocabulary, file)
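
Loading the pickled counter and vocabulary back is symmetric:

import pickle

model_dir = '../../data/ngrams/'

with open(f'{model_dir}counter.pickle', 'rb') as file:
    counter = pickle.load(file)

with open(f'{model_dir}vocabulary.pickle', 'rb') as file:
    vocabulary = pickle.load(file)

print(counter.N(), len(vocabulary))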
Code example #24
# The def line is missing from the original snippet; the name and the i + 1
# indexing (orders start at 1, matching get_subset_from_counter below) are
# reconstructed from the body.
def count_ngrams(counter, order):
    s = 0
    for i in range(order):
        s += len(counter[i + 1])
    return s


# Return a subset of ngram of lower order
def get_subset_from_counter(counter, order):
    new_counter = NgramCounter()
    for i in range(order):
        new_counter._counts[i + 1] = counter[i + 1]

    return new_counter


ngram = 7
fname = "lm_7gram_counter.pkl"

if __name__ == "__main__":
    counter = NgramCounter()
    for p in range(1, 100):
        print("file {}".format(p))
        fnum = "0000" + str(p) if p < 10 else "000" + str(p)
        fn = PATH.BASE_DIR + '../data/1blm/training-monolingual.tokenized.shuffled/news.en-' + fnum + '-of-00100'
        counter = update_counter(counter, ngram, fn)

    with open(fname, 'wb') as fout:
        pickle.dump(counter, fout)

    print("Completed.")