Ejemplo n.º 1
0
def encode_text(storage: WordStorage, text: tuple) -> tuple:
    """Translate every word of ``text`` into the id ``storage`` reports for it.

    Args:
        storage: Word storage that is queried through ``get_id``.
        text: Tuple of words to encode.

    Returns:
        A tuple holding ``storage.get_id(word)`` for each word, in input order.
    """
    validation.ensure_type((storage, WordStorage), (text, tuple))
    validation.ensure_not_empty(storage)

    return tuple(map(storage.get_id, text))
Ejemplo n.º 2
0
def tokenize_by_sentence(text: str) -> tuple:
    """Split ``text`` into lowercase tokens with ``<END>`` after each sentence.

    Every character that is not a word character, whitespace, or one of
    ``. ! ?`` is removed first. Each run that starts with an ASCII uppercase
    letter and continues with word/whitespace characters is then lowercased
    and followed by ``' <END>'``; one trailing ``.``, ``!`` or ``?`` is
    consumed together with the run. The result is split on whitespace.

    Args:
        text: Raw text to tokenize.

    Returns:
        A tuple of tokens.
    """
    validation.ensure_type((text, str))
    validation.ensure_not_empty((text, ))

    def _close_sentence(match):
        # Lowercase the captured sentence body and append the end marker.
        return match.group(1).lower() + ' <END>'

    without_punctuation = re.sub(r'[^.!?\w\s]', '', text)
    marked = re.sub(r'([A-Z][\w\s]+)[.!?]?', _close_sentence, without_punctuation)
    return tuple(marked.split())
Ejemplo n.º 3
0
    def _put_word(self, word: str) -> int:
        """Return the id stored for ``word``, registering the word if needed.

        A word that is not yet in ``self.storage`` receives the current size
        of ``self.storage`` as its id and is recorded in both ``self.storage``
        and ``self.reversed_storage``.

        Args:
            word: Word to look up or register.

        Returns:
            The id associated with ``word``.
        """
        validation.ensure_type((word, str))
        validation.ensure_not_empty(word)

        if word in self.storage:
            return self.storage[word]

        new_id = len(self.storage)
        self.storage[word] = new_id
        self.reversed_storage[new_id] = word
        return self.storage[word]
Ejemplo n.º 4
0
    def _generate_next_word(self, context: tuple) -> int:
        """Ask the parent generator for a next word, shrinking the context on failure.

        Tries whose ``size`` exceeds ``len(context) + 1`` are removed from
        ``self._n_gram_tries`` (the attribute itself is reassigned). Each
        remaining trie is then installed as ``self._n_gram_trie`` in turn and
        the parent implementation is queried; the first truthy result is
        returned. After every unsuccessful attempt the last element of the
        context is dropped.

        Args:
            context: Tuple of word ids preceding the word to generate.

        Returns:
            The first truthy value produced by the parent implementation, or
            ``None`` when no trie yields one.
        """
        validation.ensure_type((context, tuple))
        validation.ensure_not_empty(context)

        size_limit = len(context) + 1
        usable_tries = []
        for candidate in self._n_gram_tries:
            if candidate.size <= size_limit:
                usable_tries.append(candidate)
        self._n_gram_tries = usable_tries

        for candidate in self._n_gram_tries:
            self._n_gram_trie = candidate
            next_word = super()._generate_next_word(context)
            # NOTE(review): a falsy result (including a word id of 0, if that
            # is a valid id) is treated as "nothing found" — confirm intended.
            if next_word:
                return next_word
            context = context[:-1]
        return None
Ejemplo n.º 5
0
def decode_text(storage: WordStorage, encoded_text: tuple) -> tuple:
    """Turn a sequence of word ids back into capitalised sentences.

    The ids are mapped to words through ``storage.get_word``, joined with
    single spaces, and the joined string is cut at every ``' <END> '``
    marker.

    Args:
        storage: Word storage queried through ``get_word``.
        encoded_text: Tuple of word ids to decode.

    Returns:
        A tuple of sentences, each passed through ``str.capitalize``. The
        piece that follows the final ``' <END> '`` marker is not included.
    """
    validation.ensure_type(
        (storage, WordStorage),
        (encoded_text, tuple),
    )
    validation.ensure_not_empty(storage, encoded_text)

    words = [storage.get_word(word_id) for word_id in encoded_text]
    # The trailing space lets a closing '<END>' token match ' <END> ' as well.
    joined = ' '.join(words) + ' '
    pieces = joined.split(' <END> ')

    # Everything after the last marker (possibly an empty string) is dropped.
    return tuple(sentence.capitalize() for sentence in pieces[:-1])
Ejemplo n.º 6
0
    def generate_text(self, context: tuple, number_of_sentences: int) -> tuple:
        """Extend ``context`` with ``number_of_sentences`` generated sentences.

        Each sentence comes from ``self._generate_sentence``. Its first
        ``self._n_gram_trie.size - 1`` items are skipped when it is appended,
        and the tail of the accumulated text becomes the context for the
        next sentence.

        Args:
            context: Tuple the generation starts from.
            number_of_sentences: How many sentences to generate.

        Returns:
            The starting context followed by all generated items.
        """
        validation.ensure_type(
            (context, tuple),
            (number_of_sentences, int),
        )
        validation.ensure_not_empty(context)
        validation.ensure_correct_int(number_of_sentences)

        generated = context
        prefix_length = self._n_gram_trie.size - 1
        sentences_left = number_of_sentences
        while sentences_left > 0:
            sentence = self._generate_sentence(context)
            generated = generated + sentence[prefix_length:]
            context = generated[-prefix_length:]
            sentences_left -= 1
        return generated
Ejemplo n.º 7
0
def _trie_to_dict(trie) -> dict:
    """Collect the attributes of an n-gram trie that are written to disk."""
    return {
        'encoded_text': trie.encoded_text,
        'n_gram_frequencies': trie.n_gram_frequencies,
        'n_grams': trie.n_grams,
        'size': trie.size,
        'uni_grams': trie.uni_grams,
    }


def save_model(model: NGramTextGenerator, path_to_saved_model: str):
    """Write ``model`` to ``path_to_saved_model`` as a pretty-printed dict literal.

    The written dictionary has two keys: ``'name'`` (the class name of
    ``model``) and ``'data'``. ``'data'`` holds the attributes of
    ``model._n_gram_trie`` under ``'_n_gram_trie'`` and the two mappings of
    ``model._word_storage`` under ``'_word_storage'``. When the class name
    is ``'BackOffGenerator'``, every trie in ``model._n_gram_tries`` is
    stored as well under ``'_n_gram_tries'``.

    Args:
        model: Generator whose state is saved.
        path_to_saved_model: Destination file path; the file is overwritten.
    """
    validation.ensure_type(
        (model, NGramTextGenerator),
        (path_to_saved_model, str),
        )
    validation.ensure_not_empty((path_to_saved_model, ))

    model_name = type(model).__name__

    data = {
        '_n_gram_trie': _trie_to_dict(model._n_gram_trie),
        '_word_storage': {
            'storage': model._word_storage.storage,
            'reversed_storage': model._word_storage.reversed_storage,
        },
    }

    # Only the back-off generator keeps an extra collection of tries.
    if model_name == 'BackOffGenerator':
        data['_n_gram_tries'] = [
            _trie_to_dict(trie) for trie in model._n_gram_tries
        ]

    model_dict = {'name': model_name, 'data': data}

    with open(path_to_saved_model, 'w') as file:
        pprint(model_dict, stream=file, compact=True)
Ejemplo n.º 8
0
    def _generate_sentence(self, context: tuple) -> tuple:
        """Grow ``context`` word by word until ``<END>`` is produced.

        At most 20 words are generated with ``self._generate_next_word``,
        each time using the last ``self._n_gram_trie.size - 1`` items of the
        sentence so far as context. If none of those words is the id of
        ``'<END>'``, that id is appended after the 20th word.

        Args:
            context: Tuple the sentence starts with.

        Returns:
            The context followed by the generated word ids; the last item is
            always the id of ``'<END>'``.
        """
        validation.ensure_type((context, tuple))
        validation.ensure_not_empty(context)

        sentence = context
        window = self._n_gram_trie.size - 1
        end_id = self._word_storage.get_id('<END>')

        for _ in range(20):
            next_word = self._generate_next_word(sentence[-window:])
            sentence = sentence + (next_word, )
            if next_word == end_id:
                return sentence

        # Word limit reached without an end marker: close the sentence here.
        return sentence + (end_id, )
Ejemplo n.º 9
0
    def _generate_next_word(self, context: tuple) -> int:
        """Pick the stored word with the highest maximum likelihood for ``context``.

        ``self._word_storage.update_reversed_storage()`` is called first, then
        every entry yielded by iterating ``self._word_storage.reversed_storage``
        is scored with ``self._calculate_maximum_likelihood``. Each entry is
        scored exactly once.

        Args:
            context: Tuple of length ``self._n_gram_trie.size - 1`` preceding
                the word to generate.

        Returns:
            The entry with the highest likelihood; when several share the
            highest value, the one encountered first during iteration.

        Raises:
            ValueError: If ``reversed_storage`` yields no entries (raised by
                ``max`` on an empty iterable).
        """
        validation.ensure_type((context, tuple))
        validation.ensure_not_empty(context)
        validation.ensure_length(context, self._n_gram_trie.size - 1)

        self._word_storage.update_reversed_storage()

        # ``max`` returns the first maximal item, which matches taking the
        # first candidate recorded for the top likelihood value.
        return max(
            self._word_storage.reversed_storage,
            key=lambda word: self._calculate_maximum_likelihood(word, context),
        )
Ejemplo n.º 10
0
def _restore_trie(trie_data: dict):
    """Rebuild an ``NGramTrie`` from a saved dictionary of its attributes."""
    trie = NGramTrie(trie_data['size'], trie_data['encoded_text'])
    trie.n_gram_frequencies = trie_data['n_gram_frequencies']
    trie.n_grams = trie_data['n_grams']
    trie.uni_grams = trie_data['uni_grams']
    return trie


def load_model(path_to_saved_model: str) -> NGramTextGenerator:
    """Recreate a text generator from a file holding a Python dict literal.

    The file content is parsed with ``ast.literal_eval``. The parsed
    dictionary must provide ``'name'`` (generator class name) and ``'data'``
    with ``'_n_gram_trie'`` and ``'_word_storage'`` entries; a
    ``'BackOffGenerator'`` additionally needs ``'_n_gram_tries'``.

    Args:
        path_to_saved_model: Path of the file to read.

    Returns:
        A ``NGramTextGenerator``, ``LikelihoodBasedTextGenerator`` or
        ``BackOffGenerator`` instance, depending on the stored name.

    Raises:
        ValueError: If the stored name is none of the supported generators.
    """
    validation.ensure_type((path_to_saved_model, str))
    validation.ensure_not_empty((path_to_saved_model, ))

    with open(path_to_saved_model, 'r') as file:
        model_dict = ast.literal_eval(file.read())

    name = model_dict['name']
    data = model_dict['data']

    main_trie = _restore_trie(data['_n_gram_trie'])

    word_storage = WordStorage()
    word_storage.storage = data['_word_storage']['storage']
    word_storage.reversed_storage = data['_word_storage']['reversed_storage']

    if name == 'NGramTextGenerator':
        return NGramTextGenerator(word_storage, main_trie)

    if name == 'LikelihoodBasedTextGenerator':
        return LikelihoodBasedTextGenerator(word_storage, main_trie)

    if name == 'BackOffGenerator':
        # The back-off tries are built under their own name so the main trie
        # restored above is the one handed to the generator.
        back_off_tries = [
            _restore_trie(trie_data) for trie_data in data['_n_gram_tries']
        ]
        return BackOffGenerator(word_storage, main_trie, *back_off_tries)

    raise ValueError(f'Unknown model type: {name!r}')
Ejemplo n.º 11
0
    def get_id(self, word: str) -> int:
        """Look up the id recorded for ``word`` in ``self.storage``.

        Args:
            word: Word whose id is requested.

        Returns:
            The value stored under ``word`` in ``self.storage``.
        """
        validation.ensure_type((word, str))
        validation.ensure_not_empty(word)

        word_id = self.storage[word]
        return word_id