Example #1
from math import log2

def calculate_linear_perplexity(linear_model, lambda1, lambda2, lambda3,
                                eval_path) -> float:
    exponent = 0
    path_to_unked_data = write_new_training_data(eval_path,
                                                 linear_model.get_vocab(),
                                                 EVAL_UNKED_DATA_)
    eval_stream = open(path_to_unked_data, "r")
    sentence = eval_stream.readline()
    corpus_size = 0
    ngram_size = 3
    while sentence:
        sentence_tokens = pad_sentence(sentence, ngram_size - 1).split()
        # count the stop token, but not the start padding
        corpus_size += len(sentence.split()) + 1
        for i in range(len(sentence_tokens)):
            if i + ngram_size - 1 < len(sentence_tokens):
                ngram_key = tuple(sentence_tokens[i:i + ngram_size])
                exponent += log2(
                    linear_model.get_prob(ngram_key, lambda1, lambda2, lambda3))

        sentence = eval_stream.readline()

    # Perplexity = 2 ** (-l), where l is the average log2 probability per token
    perplexity = 2**-(exponent / corpus_size)
    eval_stream.close()
    return round(perplexity, 4)
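
The three weights suggest `get_prob` performs linear interpolation over unigram, bigram, and trigram estimates. A minimal sketch of such a method, assuming the model keeps MLE probability tables named `unigram_probs`, `bigram_probs`, and `trigram_probs` (hypothetical names, and the pairing of each lambda with each order is a guess, not confirmed by this excerpt):

    def get_prob(self, ngram_key, lambda1, lambda2, lambda3):
        # Interpolated trigram estimate:
        #   P(w3 | w1, w2) = l1 * P(w3) + l2 * P(w3 | w2) + l3 * P(w3 | w1, w2)
        # where the lambdas are expected to sum to 1.
        w1, w2, w3 = ngram_key
        return (lambda1 * self.unigram_probs.get((w3,), 0.0) +
                lambda2 * self.bigram_probs.get((w2, w3), 0.0) +
                lambda3 * self.trigram_probs.get((w1, w2, w3), 0.0))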
Example #2
    def extract_vocab(self, training_data_path):

        # set up our model structure
        self.unigram_counter = {}  # use a dictionary for convenience

        training_data_file = open(training_data_path, "r")  # open a handle to our training corpus (read mode)
        sentence = training_data_file.readline()  # get the first sentence

        while sentence:
            sentence = pad_sentence(sentence)
            self.consume_sentence(sentence)  # update our model with this sentence
            sentence = training_data_file.readline()  # go to the next line
        training_data_file.close()
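
`consume_sentence` is not shown in this excerpt. Since `extract_vocab` only maintains `self.unigram_counter`, a plausible sketch (the method body is an assumption) is:

    def consume_sentence(self, sentence):
        # tally each whitespace-separated token into the unigram counts
        for token in sentence.split():
            self.unigram_counter[token] = self.unigram_counter.get(token, 0) + 1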
Example #3
from math import inf, log2

def calculate_ngram_perplexity(eval_path: str,
                               vocab: dict,
                               probs: dict,
                               ngram_size: int,
                               smoothed=False,
                               report_mode=False) -> float:
    exponent = 0
    path_to_unked_data = write_new_training_data(eval_path, vocab,
                                                 EVAL_UNKED_DATA_)
    eval_stream = open(path_to_unked_data, "r")
    sentence = eval_stream.readline()
    corpus_size = 0
    while sentence:
        sentence_tokens = pad_sentence(sentence, ngram_size - 1).split()
        # count the stop token, but not the start padding
        corpus_size += len(sentence.split()) + 1
        for i in range(len(sentence_tokens)):
            # only score windows that fit a full-size ngram; without this
            # guard the trailing slices yield short tuples that would be
            # wrongly treated as unseen ngrams
            if i + ngram_size - 1 < len(sentence_tokens):
                ngram_key = tuple(sentence_tokens[i:i + ngram_size])
                if ngram_key in probs:
                    exponent += log2(probs[ngram_key])
                elif not smoothed:
                    # unseen ngram: an unsmoothed model assigns zero probability
                    exponent += -inf
                else:
                    # universal smoothing -- in-vocabulary words we never saw
                    # together fall back to the ngram-sized all-UNK entry
                    exponent += log2(probs[get_unk_tuple(ngram_size)])

        sentence = eval_stream.readline()

    # Perplexity = 2 ** (-l), where l is the average log2 probability per token
    perplexity = 2**-(exponent / corpus_size)

    if report_mode:
        print("Perplexity Score: {}".format(perplexity))

    eval_stream.close()
    return round(perplexity, 4)
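
The smoothing branch above relies on `get_unk_tuple`, which is not shown here. Given that its result is used as a key into `probs`, a plausible definition (assuming the same `UNK_` constant used in Example #5) is:

def get_unk_tuple(ngram_size):
    # an ngram-sized key made entirely of UNK tokens; its probability acts
    # as the fallback mass for in-vocabulary words never seen together
    return tuple([UNK_] * ngram_size)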
Example #4
    def extract_ngrams(self, training_data_path, ngram_size):
        training_data = open(training_data_path, "r")
        sentence = training_data.readline()
        # walk each sentence, counting every full-size ngram and its prefix
        while sentence:
            # pad each sentence with start and stop markers
            sentence_tokens = pad_sentence(sentence, ngram_size - 1).split()
            for i in range(len(sentence_tokens)):
                # only count windows that fit a full-size ngram before the
                # end of the sentence
                if i < len(sentence_tokens) - (ngram_size - 1):
                    ngram_key = tuple(sentence_tokens[i:i + ngram_size - 1])  # the (n-1)-token prefix
                    ngram_tuple = tuple(sentence_tokens[i:i + ngram_size])  # the full ngram

                    self.ngram_key_occurrence[ngram_key] = \
                        self.ngram_key_occurrence.get(ngram_key, 0) + 1  # how often we've seen the prefix
                    self.ngram_sighting[ngram_tuple] = \
                        self.ngram_sighting.get(ngram_tuple, 0) + 1  # how often we've seen the full ngram
                    self.total_ngrams += 1  # running total of ngrams observed
            sentence = training_data.readline()
        training_data.close()
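
`extract_ngrams` only accumulates counts. A typical follow-up step, not shown in this excerpt (the method name is an assumption), turns them into maximum-likelihood probabilities by dividing each ngram count by the count of its (n-1)-token prefix:

    def compute_probs(self):
        # MLE estimate: P(w_n | prefix) = count(prefix + w_n) / count(prefix)
        probs = {}
        for ngram_tuple, count in self.ngram_sighting.items():
            prefix = ngram_tuple[:-1]
            probs[ngram_tuple] = count / self.ngram_key_occurrence[prefix]
        return probs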
Example #5
from math import log2

def calculate_perplexity(eval_path: str,
                         probs: dict,
                         report_mode=False) -> float:
    exponent = 0
    eval_stream = open(eval_path, "r")
    sentence = eval_stream.readline()
    corpus_size = 0
    while sentence:
        sentence = pad_sentence(sentence)
        for token in sentence.split():
            exponent += log2(probs.get(token, probs[UNK_]))
            corpus_size += 1
        sentence = eval_stream.readline()

    # Perplexity = 2 ** (-l), where l is the average log2 probability per token
    perplexity = 2**-(exponent / corpus_size)

    if report_mode:
        print("Perplexity Score: {}".format(perplexity))

    eval_stream.close()
    return round(perplexity, 4)
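
A small usage sketch, with a hypothetical eval file path and a toy unigram table that includes the `UNK_` and `STOP_` entries the function depends on:

probs = {"the": 0.5, "cat": 0.25, STOP_: 0.2, UNK_: 0.05}
score = calculate_perplexity("eval.txt", probs, report_mode=True)
# prints the perplexity score and returns it rounded to four decimal places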
Example #6
    def test_pad_sentence_adds_starts_if_asked(self):
        self.assertEqual(START_ + " sentence " + STOP_,
                         pad_sentence("sentence", 1))
        self.assertEqual(START_ + " " + START_ + " sentence " + STOP_,
                         pad_sentence("sentence", 2))
        self.assertEqual(
            START_ + " " + START_ + " " + START_ + " sentence " + STOP_,
            pad_sentence("sentence", 3))
Example #7
    def test_pad_sentence_adds_whitespace_and_special_stop(self):
        self.assertEqual("sentence " + STOP_, pad_sentence("sentence"))