コード例 #1
0
 def __init__(self, corpus):
     """
     Record the unigram statistics and the sentence count of a corpus.

     :param corpus: corpus over which to create the language model
     """
     # calculate_unigrams() returns a pair; its two items are stored as
     # `unigrams` and `corpus_unigram_length` respectively.
     unigrams, unigram_length = UnigramCalculator(corpus).calculate_unigrams()
     self.unigrams = unigrams
     self.corpus_unigram_length = unigram_length
     self.corpus_sentence_length = len(corpus.get_sentences())
コード例 #2
0
class NgramCalculatorContainer:
    """
    Groups a unigram calculator with one n-gram calculator per order 2..n and
    routes each lookup to the calculator matching the length of the query.
    """

    def __init__(self, corpus, n):
        """
        :param corpus: corpus handed to every underlying calculator
        :param n: highest n-gram order for which a calculator is created
        """
        self.n = n
        self.corpus_sentence_length = len(corpus.get_sentences())

        self.unigram_calculator = UnigramCalculator(corpus)
        self.corpus_unigram_length = self.unigram_calculator.total_unigrams

        # The calculator for order k is stored at index k - 2.
        # NOTE(review): the two positional booleans are defined by
        # NgramCalculator; their meaning is not visible here - confirm.
        self.ngram_calculators = [
            NgramCalculator(corpus, order, True, False)
            for order in range(2, n + 1)
        ]

    def get_ngram_count(self, ngram):
        """
        Gets the count of an n-gram in the corpus
        :param ngram: n-gram (non-empty sequence of tokens) to look up
        :return: the corpus sentence count when the n-gram both starts and
                 ends with '<s>'; otherwise the count reported by the
                 calculator whose order equals the n-gram's length
        """
        starts_with_marker = ngram[0] == '<s>'
        if starts_with_marker and ngram[-1] == '<s>':
            # special case where pregram can be <s>
            return self.corpus_sentence_length

        order = len(ngram)
        if order == 1:  # base case
            return self.unigram_calculator.get_ngram_count(ngram)

        return self.ngram_calculators[order - 2].get_ngram_count(ngram)

    def get_pregram_instances(self, pregram):
        """
        Delegates the pregram lookup to the matching calculator
        :param pregram: sequence of tokens preceding the predicted token
        :return: whatever the selected calculator's get_pregram_instances
                 returns for this pregram
        """
        history = len(pregram)
        # An empty pregram goes to the unigram calculator; a pregram of
        # length k goes to the calculator of order k + 1 (index k - 1).
        calculator = (
            self.ngram_calculators[history - 1]
            if history
            else self.unigram_calculator
        )
        return calculator.get_pregram_instances(pregram)
コード例 #3
0
def q2_calculate_unique_unigrams_in_reuters_training():
    """
    Prints the percentage of unique unigrams in the Reuters training corpus,
    once as reported by UnigramCalculator and once by BasicNgramCalculator.
    """
    training_corpus = ReutersTrainingCorpus()
    by_unigram = UnigramCalculator(training_corpus)
    by_ngram = BasicNgramCalculator(training_corpus)

    # Both figures are computed before anything is printed.
    # NOTE(review): the argument 1 presumably selects n-gram order 1 - confirm.
    report = (
        ("Percentage of unique unigrams (unigram calculator): ",
         by_unigram.get_percentage_unique_unigrams()),
        ("Percentage of unique unigrams (ngram calculator): ",
         by_ngram.get_percentage_unique_ngrams(1)),
    )
    for label, percentage in report:
        print(label, percentage)
コード例 #4
0
    def __init__(self, corpus, n):
        """
        Builds a unigram calculator plus one n-gram calculator per order 2..n.

        :param corpus: corpus handed to every underlying calculator
        :param n: highest n-gram order for which a calculator is created
        """
        self.n = n
        self.corpus_sentence_length = len(corpus.get_sentences())

        self.unigram_calculator = UnigramCalculator(corpus)
        self.corpus_unigram_length = self.unigram_calculator.total_unigrams

        # The calculator for order k ends up at index k - 2.
        # NOTE(review): the two positional booleans are defined by
        # NgramCalculator; their meaning is not visible here - confirm.
        self.ngram_calculators = [
            NgramCalculator(corpus, order, True, False)
            for order in range(2, n + 1)
        ]
コード例 #5
0
def q2_calculate_unique_unigrams_in_reuters_training():
    """
    Reports the unique-unigram percentage of the Reuters training corpus
    twice: from UnigramCalculator and from BasicNgramCalculator.
    """
    reuters = ReutersTrainingCorpus()
    unigram_source = UnigramCalculator(reuters)
    ngram_source = BasicNgramCalculator(reuters)

    # Compute both percentages first, then print them in the same order.
    # NOTE(review): the argument 1 presumably selects n-gram order 1 - confirm.
    from_unigram = unigram_source.get_percentage_unique_unigrams()
    from_ngram = ngram_source.get_percentage_unique_ngrams(1)

    lines = [
        ("Percentage of unique unigrams (unigram calculator): ", from_unigram),
        ("Percentage of unique unigrams (ngram calculator): ", from_ngram),
    ]
    for heading, figure in lines:
        print(heading, figure)
コード例 #6
0
    def __init__(self, corpus, n):
        """
        Sets up the calculators used for lookups of every order up to n.

        :param corpus: corpus passed to each calculator that is created
        :param n: highest n-gram order for which a calculator is created
        """
        self.n = n
        self.corpus_sentence_length = len(corpus.get_sentences())

        self.unigram_calculator = UnigramCalculator(corpus)
        self.corpus_unigram_length = self.unigram_calculator.total_unigrams

        # Orders 2..n are added in ascending order, so order k sits at k - 2.
        # NOTE(review): the two positional booleans are defined by
        # NgramCalculator; their meaning is not visible here - confirm.
        self.ngram_calculators = []
        self.ngram_calculators.extend(
            NgramCalculator(corpus, order, True, False)
            for order in range(2, n + 1)
        )
コード例 #7
0
class TestUnigramCalculator:
    """Checks UnigramCalculator against a default-constructed Corpus."""

    # Created once when the class body runs and shared by all test methods.
    corpus = Corpus()
    unigram_calculator = UnigramCalculator(corpus)

    def test_calculate_unigrams(self):
        """The calculator reports 20 total unigrams and the corpus' unigrams."""
        calculator = self.unigram_calculator
        assert 20 == calculator.total_unigrams
        assert self.corpus.unigrams == calculator.unigrams

    def test_get_percentage_unique_unigrams(self):
        """The reported unique-unigram percentage equals 16 / 18."""
        # NOTE(review): the denominator is 18 while total_unigrams is 20;
        # the reason is not visible here - confirm against the fixture.
        expected = 16 / 18
        assert expected == self.unigram_calculator.get_percentage_unique_unigrams()
コード例 #8
0
class NgramCalculatorContainer:
    """
    Holds a unigram calculator and one n-gram calculator for each order from
    2 up to n, and forwards lookups to the one that fits the query length.
    """

    def __init__(self, corpus, n):
        """
        :param corpus: corpus passed to each calculator that is created
        :param n: highest n-gram order for which a calculator is created
        """
        self.n = n
        self.corpus_sentence_length = len(corpus.get_sentences())

        self.unigram_calculator = UnigramCalculator(corpus)
        self.corpus_unigram_length = self.unigram_calculator.total_unigrams

        # Orders 2..n are added in ascending order, so order k sits at k - 2.
        # NOTE(review): the two positional booleans are defined by
        # NgramCalculator; their meaning is not visible here - confirm.
        self.ngram_calculators = []
        self.ngram_calculators.extend(
            NgramCalculator(corpus, order, True, False)
            for order in range(2, n + 1)
        )

    def get_ngram_count(self, ngram):
        """
        Gets the count of an n-gram in the corpus
        :param ngram: n-gram (non-empty sequence of tokens) to find
        :return: the corpus sentence count if the n-gram's first and last
                 tokens are both '<s>'; otherwise the count returned by the
                 calculator for an n-gram of this length
        """
        # special case where pregram can be <s>
        if ngram[0] == '<s>':
            if ngram[-1] == '<s>':
                return self.corpus_sentence_length

        token_total = len(ngram)
        if token_total == 1:  # base case
            delegate = self.unigram_calculator
        else:
            delegate = self.ngram_calculators[token_total - 2]
        return delegate.get_ngram_count(ngram)

    def get_pregram_instances(self, pregram):
        """
        Forwards the pregram lookup to the calculator that fits its length
        :param pregram: sequence of tokens preceding the predicted token
        :return: the result of get_pregram_instances on the unigram calculator
                 for an empty pregram, otherwise on the calculator stored at
                 index len(pregram) - 1
        """
        context_size = len(pregram)
        if context_size != 0:
            higher_order = self.ngram_calculators[context_size - 1]
            return higher_order.get_pregram_instances(pregram)

        return self.unigram_calculator.get_pregram_instances(pregram)