Ejemplo n.º 1
0
def test_convert_unicode_word_ignores_if_set():
    """ convert_unicode_word ignores Unicode words if set.
    """
    wg = WordGenerator([], allow_unicode_text=False)

    result = wg.convert_unicode_word(u'č')
    assert result == (False, ''), '{}'.format(result)
Ejemplo n.º 2
0
def test_convert_unicode_word():
    """ convert_unicode_word converts Unicode words correctly.
    """
    wg = WordGenerator([], allow_unicode_text=True)

    result = wg.convert_unicode_word(u'č')
    assert result == (True, u'\u010d'), '{}'.format(result)
Ejemplo n.º 3
0
    def __init__(self, vocabulary, fixed_length, custom_wordgen=None,
                 ignore_sentences_with_only_custom=False, masking_value=0,
                 unknown_value=1):
        """ Needs a dictionary as input for the vocabulary.
        """

        if len(vocabulary) > np.iinfo('uint16').max:
            raise ValueError('Dictionary is too big ({} tokens) for the numpy '
                             'datatypes used (max limit={}). Reduce vocabulary'
                             ' or adjust code accordingly!'
                             .format(len(vocabulary), np.iinfo('uint16').max))

        # Shouldn't be able to modify the given vocabulary
        self.vocabulary = deepcopy(vocabulary)
        self.fixed_length = fixed_length
        self.ignore_sentences_with_only_custom = ignore_sentences_with_only_custom
        self.masking_value = masking_value
        self.unknown_value = unknown_value

        # Initialized with an empty stream of sentences that must then be fed
        # to the generator at a later point for reusability.
        # A custom word generator can be used for domain-specific filtering etc
        if custom_wordgen is not None:
            assert custom_wordgen.stream is None
            self.wordgen = custom_wordgen
            self.uses_custom_wordgen = True
        else:
            self.wordgen = WordGenerator(None, allow_unicode_text=True,
                                         ignore_emojis=False,
                                         remove_variation_selectors=True,
                                         break_replacement=True)
            self.uses_custom_wordgen = False
Ejemplo n.º 4
0
def test_check_ascii():
    """ check_ascii recognises ASCII words properly.
        In Python 3 all string are Unicode
    """
    if not IS_PYTHON2:
        return

    wg = WordGenerator([])
    assert wg.check_ascii('ASCII')
    assert not wg.check_ascii('ščřžýá')
    assert not wg.check_ascii('❤ ☀ ☆ ☂ ☻ ♞ ☯ ☭ ☢')
Ejemplo n.º 5
0
    def __init__(self, vocabulary, fixed_length, custom_wordgen=None,
                 ignore_sentences_with_only_custom=False, masking_value=0,
                 unknown_value=1):
        """ Needs a dictionary as input for the vocabulary.
        """

        if len(vocabulary) > np.iinfo('uint16').max:
            raise ValueError('Dictionary is too big ({} tokens) for the numpy '
                             'datatypes used (max limit={}). Reduce vocabulary'
                             ' or adjust code accordingly!'
                             .format(len(vocabulary), np.iinfo('uint16').max))

        # Shouldn't be able to modify the given vocabulary
        self.vocabulary = deepcopy(vocabulary)
        self.fixed_length = fixed_length
        self.ignore_sentences_with_only_custom = ignore_sentences_with_only_custom
        self.masking_value = masking_value
        self.unknown_value = unknown_value

        # Initialized with an empty stream of sentences that must then be fed
        # to the generator at a later point for reusability.
        # A custom word generator can be used for domain-specific filtering etc
        if custom_wordgen is not None:
            assert custom_wordgen.stream is None
            self.wordgen = custom_wordgen
            self.uses_custom_wordgen = True
        else:
            self.wordgen = WordGenerator(None, allow_unicode_text=True,
                                         ignore_emojis=False,
                                         remove_variation_selectors=True,
                                         break_replacement=True)
            self.uses_custom_wordgen = False
Ejemplo n.º 6
0
def test_only_unicode_accepted():
    """ Non-Unicode strings raise a ValueError.
        In Python 3 all string are Unicode
    """
    if not IS_PYTHON2:
        raise ValueError("You are using python 3 so this test should always pass")

    sentences = [
        u'Hello world',
        u'I am unicode',
        'I am not unicode',
        ]

    wg = WordGenerator(sentences)
    for w in wg:
        pass
Ejemplo n.º 7
0
class SentenceTokenizer():
    """ Create numpy array of tokens corresponding to input sentences.
        The vocabulary can include Unicode tokens.
    """
    def __init__(self,
                 vocabulary,
                 fixed_length,
                 custom_wordgen=None,
                 ignore_sentences_with_only_custom=False,
                 masking_value=0,
                 unknown_value=1):
        """ Needs a dictionary as input for the vocabulary.
        """

        if len(vocabulary) > np.iinfo('uint16').max:
            raise ValueError('Dictionary is too big ({} tokens) for the numpy '
                             'datatypes used (max limit={}). Reduce vocabulary'
                             ' or adjust code accordingly!'.format(
                                 len(vocabulary),
                                 np.iinfo('uint16').max))

        # Shouldn't be able to modify the given vocabulary
        self.vocabulary = deepcopy(vocabulary)
        self.fixed_length = fixed_length
        self.ignore_sentences_with_only_custom = ignore_sentences_with_only_custom
        self.masking_value = masking_value
        self.unknown_value = unknown_value

        # Initialized with an empty stream of sentences that must then be fed
        # to the generator at a later point for reusability.
        # A custom word generator can be used for domain-specific filtering etc
        if custom_wordgen is not None:
            assert custom_wordgen.stream is None
            self.wordgen = custom_wordgen
            self.uses_custom_wordgen = True
        else:
            self.wordgen = WordGenerator(None,
                                         allow_unicode_text=True,
                                         ignore_emojis=False,
                                         remove_variation_selectors=True,
                                         break_replacement=True)
            self.uses_custom_wordgen = False

    def tokenize_sentences(self,
                           sentences,
                           reset_stats=True,
                           max_sentences=None):
        """ Converts a given list of sentences into a numpy array according to
            its vocabulary.

        # Arguments:
            sentences: List of sentences to be tokenized.
            reset_stats: Whether the word generator's stats should be reset.
            max_sentences: Maximum length of sentences. Must be set if the
                length cannot be inferred from the input.

        # Returns:
            Numpy array of the tokenization sentences with masking,
            infos,
            stats

        # Raises:
            ValueError: When maximum length is not set and cannot be inferred.
        """

        if max_sentences is None and not hasattr(sentences, '__len__'):
            raise ValueError('Either you must provide an array with a length'
                             'attribute (e.g. a list) or specify the maximum '
                             'length yourself using `max_sentences`!')
        n_sentences = (max_sentences
                       if max_sentences is not None else len(sentences))

        if self.masking_value == 0:
            tokens = np.zeros((n_sentences, self.fixed_length), dtype='uint16')
        else:
            tokens = (np.ones(
                (n_sentences, self.fixed_length), dtype='uint16') *
                      self.masking_value)

        if reset_stats:
            self.wordgen.reset_stats()

        # With a custom word generator info can be extracted from each
        # sentence (e.g. labels)
        infos = []

        # Returns words as strings and then map them to vocabulary
        self.wordgen.stream = sentences
        next_insert = 0
        n_ignored_unknowns = 0
        for s_words, s_info in self.wordgen:
            s_tokens = self.find_tokens(s_words)

            if (self.ignore_sentences_with_only_custom and np.all(
                [True if t < len(SPECIAL_TOKENS) else False
                 for t in s_tokens])):
                n_ignored_unknowns += 1
                continue

            if len(s_tokens) > self.fixed_length:
                s_tokens = s_tokens[:self.fixed_length]

            tokens[next_insert, :len(s_tokens)] = s_tokens

            infos.append(s_info)
            next_insert += 1

        # For standard word generators all sentences should be tokenized
        # this is not necessarily the case for custom wordgenerators as they
        # may filter the sentences etc.
        if not self.uses_custom_wordgen and not self.ignore_sentences_with_only_custom:
            assert len(
                sentences) == next_insert  # I can't figure out what this does
            #pass
        else:
            # adjust based on actual tokens received
            tokens = tokens[:next_insert]
            infos = infos[:next_insert]

        return tokens, infos, self.wordgen.stats

    def find_tokens(self, words):
        assert len(words) > 0
        tokens = []
        for w in words:
            try:
                tokens.append(self.vocabulary[w])
            except KeyError:
                tokens.append(self.unknown_value)
        return tokens

    def split_train_val_test(self,
                             sentences,
                             info_dicts,
                             split_parameter=[0.7, 0.1, 0.2],
                             extend_with=0):
        """ Splits given sentences into three different datasets: training,
            validation and testing.

        # Arguments:
            sentences: The sentences to be tokenized.
            info_dicts: A list of dicts that contain information about each
                sentence (e.g. a label).
            split_parameter: A parameter for deciding the splits between the
                three different datasets. If instead of being passed three
                values, three lists are passed, then these will be used to
                specify which observation belong to which dataset.
            extend_with: An optional parameter. If > 0 then this is the number
                of tokens added to the vocabulary from this dataset. The
                expanded vocab will be generated using only the training set,
                but is applied to all three sets.

        # Returns:
            List of three lists of tokenized sentences,

            List of three corresponding dictionaries with information,

            How many tokens have been added to the vocab. Make sure to extend
            the embedding layer of the model accordingly.
        """

        # If passed three lists, use those directly
        if isinstance(split_parameter, list) and \
                all(isinstance(x, list) for x in split_parameter) and \
                len(split_parameter) == 3:

            # Helper function to verify provided indices are numbers in range
            def verify_indices(inds):
                return list(
                    filter(
                        lambda i: isinstance(i, numbers.Number) and i < len(
                            sentences), inds))

            ind_train = verify_indices(split_parameter[0])
            ind_val = verify_indices(split_parameter[1])
            ind_test = verify_indices(split_parameter[2])
        else:
            # Split sentences and dicts
            ind = list(range(len(sentences)))
            ind_train, ind_test = train_test_split(
                ind, test_size=split_parameter[2])
            ind_train, ind_val = train_test_split(ind_train,
                                                  test_size=split_parameter[1])

        # Map indices to data
        train = np.array([sentences[x] for x in ind_train])
        test = np.array([sentences[x] for x in ind_test])
        val = np.array([sentences[x] for x in ind_val])

        info_train = np.array([info_dicts[x] for x in ind_train])
        info_test = np.array([info_dicts[x] for x in ind_test])
        info_val = np.array([info_dicts[x] for x in ind_val])

        added = 0
        # Extend vocabulary with training set tokens
        if extend_with > 0:
            wg = WordGenerator(train)
            vb = VocabBuilder(wg)
            vb.count_all_words()
            added = extend_vocab(self.vocabulary, vb, max_tokens=extend_with)

        # Wrap results
        result = [self.tokenize_sentences(s)[0] for s in [train, val, test]]
        result_infos = [info_train, info_val, info_test]
        # if type(result_infos[0][0]) in [np.double, np.float, np.int64, np.int32, np.uint8]:
        #     result_infos = [torch.from_numpy(label).long() for label in result_infos]

        return result, result_infos, added

    def to_sentence(self, sentence_idx):
        """ Converts a tokenized sentence back to a list of words.

        # Arguments:
            sentence_idx: List of numbers, representing a tokenized sentence
                given the current vocabulary.

        # Returns:
            String created by converting all numbers back to words and joined
            together with spaces.
        """
        # Have to recalculate the mappings in case the vocab was extended.
        ind_to_word = {ind: word for word, ind in self.vocabulary.items()}

        sentence_as_list = [ind_to_word[x] for x in sentence_idx]
        cleaned_list = [x for x in sentence_as_list if x != 'CUSTOM_MASK']
        return " ".join(cleaned_list)
Ejemplo n.º 8
0
    def split_train_val_test(self,
                             sentences,
                             info_dicts,
                             split_parameter=[0.7, 0.1, 0.2],
                             extend_with=0):
        """ Splits given sentences into three different datasets: training,
            validation and testing.

        # Arguments:
            sentences: The sentences to be tokenized.
            info_dicts: A list of dicts that contain information about each
                sentence (e.g. a label).
            split_parameter: A parameter for deciding the splits between the
                three different datasets. If instead of being passed three
                values, three lists are passed, then these will be used to
                specify which observation belong to which dataset.
            extend_with: An optional parameter. If > 0 then this is the number
                of tokens added to the vocabulary from this dataset. The
                expanded vocab will be generated using only the training set,
                but is applied to all three sets.

        # Returns:
            List of three lists of tokenized sentences,

            List of three corresponding dictionaries with information,

            How many tokens have been added to the vocab. Make sure to extend
            the embedding layer of the model accordingly.
        """

        # If passed three lists, use those directly
        if isinstance(split_parameter, list) and \
                all(isinstance(x, list) for x in split_parameter) and \
                len(split_parameter) == 3:

            # Helper function to verify provided indices are numbers in range
            def verify_indices(inds):
                return list(
                    filter(
                        lambda i: isinstance(i, numbers.Number) and i < len(
                            sentences), inds))

            ind_train = verify_indices(split_parameter[0])
            ind_val = verify_indices(split_parameter[1])
            ind_test = verify_indices(split_parameter[2])
        else:
            # Split sentences and dicts
            ind = list(range(len(sentences)))
            ind_train, ind_test = train_test_split(
                ind, test_size=split_parameter[2])
            ind_train, ind_val = train_test_split(ind_train,
                                                  test_size=split_parameter[1])

        # Map indices to data
        train = np.array([sentences[x] for x in ind_train])
        test = np.array([sentences[x] for x in ind_test])
        val = np.array([sentences[x] for x in ind_val])

        info_train = np.array([info_dicts[x] for x in ind_train])
        info_test = np.array([info_dicts[x] for x in ind_test])
        info_val = np.array([info_dicts[x] for x in ind_val])

        added = 0
        # Extend vocabulary with training set tokens
        if extend_with > 0:
            wg = WordGenerator(train)
            vb = VocabBuilder(wg)
            vb.count_all_words()
            added = extend_vocab(self.vocabulary, vb, max_tokens=extend_with)

        # Wrap results
        result = [self.tokenize_sentences(s)[0] for s in [train, val, test]]
        result_infos = [info_train, info_val, info_test]
        # if type(result_infos[0][0]) in [np.double, np.float, np.int64, np.int32, np.uint8]:
        #     result_infos = [torch.from_numpy(label).long() for label in result_infos]

        return result, result_infos, added
Ejemplo n.º 9
0
class SentenceTokenizer():
    """ Create numpy array of tokens corresponding to input sentences.
        The vocabulary can include Unicode tokens.
    """
    def __init__(self, vocabulary, fixed_length, custom_wordgen=None,
                 ignore_sentences_with_only_custom=False, masking_value=0,
                 unknown_value=1):
        """ Needs a dictionary as input for the vocabulary.
        """

        if len(vocabulary) > np.iinfo('uint16').max:
            raise ValueError('Dictionary is too big ({} tokens) for the numpy '
                             'datatypes used (max limit={}). Reduce vocabulary'
                             ' or adjust code accordingly!'
                             .format(len(vocabulary), np.iinfo('uint16').max))

        # Shouldn't be able to modify the given vocabulary
        self.vocabulary = deepcopy(vocabulary)
        self.fixed_length = fixed_length
        self.ignore_sentences_with_only_custom = ignore_sentences_with_only_custom
        self.masking_value = masking_value
        self.unknown_value = unknown_value

        # Initialized with an empty stream of sentences that must then be fed
        # to the generator at a later point for reusability.
        # A custom word generator can be used for domain-specific filtering etc
        if custom_wordgen is not None:
            assert custom_wordgen.stream is None
            self.wordgen = custom_wordgen
            self.uses_custom_wordgen = True
        else:
            self.wordgen = WordGenerator(None, allow_unicode_text=True,
                                         ignore_emojis=False,
                                         remove_variation_selectors=True,
                                         break_replacement=True)
            self.uses_custom_wordgen = False

    def tokenize_sentences(self, sentences, reset_stats=True, max_sentences=None):
        """ Converts a given list of sentences into a numpy array according to
            its vocabulary.

        # Arguments:
            sentences: List of sentences to be tokenized.
            reset_stats: Whether the word generator's stats should be reset.
            max_sentences: Maximum length of sentences. Must be set if the
                length cannot be inferred from the input.

        # Returns:
            Numpy array of the tokenization sentences with masking,
            infos,
            stats

        # Raises:
            ValueError: When maximum length is not set and cannot be inferred.
        """

        if max_sentences is None and not hasattr(sentences, '__len__'):
            raise ValueError('Either you must provide an array with a length'
                             'attribute (e.g. a list) or specify the maximum '
                             'length yourself using `max_sentences`!')
        n_sentences = (max_sentences if max_sentences is not None
                       else len(sentences))

        if self.masking_value == 0:
            tokens = np.zeros((n_sentences, self.fixed_length), dtype='uint16')
        else:
            tokens = (np.ones((n_sentences, self.fixed_length), dtype='uint16')
                      * self.masking_value)

        if reset_stats:
            self.wordgen.reset_stats()

        # With a custom word generator info can be extracted from each
        # sentence (e.g. labels)
        infos = []

        # Returns words as strings and then map them to vocabulary
        self.wordgen.stream = sentences
        next_insert = 0
        n_ignored_unknowns = 0
        for s_words, s_info in self.wordgen:
            s_tokens = self.find_tokens(s_words)

            if (self.ignore_sentences_with_only_custom and
                np.all([True if t < len(SPECIAL_TOKENS)
                        else False for t in s_tokens])):
                n_ignored_unknowns += 1
                continue
            if len(s_tokens) > self.fixed_length:
                s_tokens = s_tokens[:self.fixed_length]
            tokens[next_insert,:len(s_tokens)] = s_tokens
            infos.append(s_info)
            next_insert += 1

        # For standard word generators all sentences should be tokenized
        # this is not necessarily the case for custom wordgenerators as they
        # may filter the sentences etc.
        if not self.uses_custom_wordgen and not self.ignore_sentences_with_only_custom:
            assert len(sentences) == next_insert
        else:
            # adjust based on actual tokens received
            tokens = tokens[:next_insert]
            infos = infos[:next_insert]

        return tokens, infos, self.wordgen.stats

    def find_tokens(self, words):
        assert len(words) > 0
        tokens = []
        for w in words:
            try:
                tokens.append(self.vocabulary[w])
            except KeyError:
                tokens.append(self.unknown_value)
        return tokens

    def split_train_val_test(self, sentences, info_dicts,
                             split_parameter=[0.7, 0.1, 0.2], extend_with=0):
        """ Splits given sentences into three different datasets: training,
            validation and testing.

        # Arguments:
            sentences: The sentences to be tokenized.
            info_dicts: A list of dicts that contain information about each
                sentence (e.g. a label).
            split_parameter: A parameter for deciding the splits between the
                three different datasets. If instead of being passed three
                values, three lists are passed, then these will be used to
                specify which observation belong to which dataset.
            extend_with: An optional parameter. If > 0 then this is the number
                of tokens added to the vocabulary from this dataset. The
                expanded vocab will be generated using only the training set,
                but is applied to all three sets.

        # Returns:
            List of three lists of tokenized sentences,

            List of three corresponding dictionaries with information,

            How many tokens have been added to the vocab. Make sure to extend
            the embedding layer of the model accordingly.
        """

        # If passed three lists, use those directly
        if isinstance(split_parameter, list) and \
                all(isinstance(x, list) for x in split_parameter) and \
                len(split_parameter) == 3:

            # Helper function to verify provided indices are numbers in range
            def verify_indices(inds):
                return list(filter(lambda i: isinstance(i, numbers.Number)
                            and i < len(sentences), inds))

            ind_train = verify_indices(split_parameter[0])
            ind_val = verify_indices(split_parameter[1])
            ind_test = verify_indices(split_parameter[2])
        else:
            # Split sentences and dicts
            ind = list(range(len(sentences)))
            ind_train, ind_test = train_test_split(ind, test_size=split_parameter[2])
            ind_train, ind_val = train_test_split(ind_train, test_size=split_parameter[1])

        # Map indices to data
        train = np.array([sentences[x] for x in ind_train])
        test = np.array([sentences[x] for x in ind_test])
        val = np.array([sentences[x] for x in ind_val])

        info_train = np.array([info_dicts[x] for x in ind_train])
        info_test = np.array([info_dicts[x] for x in ind_test])
        info_val = np.array([info_dicts[x] for x in ind_val])

        added = 0
        # Extend vocabulary with training set tokens
        if extend_with > 0:
            wg = WordGenerator(train)
            vb = VocabBuilder(wg)
            vb.count_all_words()
            added = extend_vocab(self.vocabulary, vb, max_tokens=extend_with)

        # Wrap results
        result = [self.tokenize_sentences(s)[0] for s in [train, val, test]]
        result_infos = [info_train, info_val, info_test]
        # if type(result_infos[0][0]) in [np.double, np.float, np.int64, np.int32, np.uint8]:
        #     result_infos = [torch.from_numpy(label).long() for label in result_infos]

        return result, result_infos, added

    def to_sentence(self, sentence_idx):
        """ Converts a tokenized sentence back to a list of words.

        # Arguments:
            sentence_idx: List of numbers, representing a tokenized sentence
                given the current vocabulary.

        # Returns:
            String created by converting all numbers back to words and joined
            together with spaces.
        """
        # Have to recalculate the mappings in case the vocab was extended.
        ind_to_word = {ind: word for word, ind in self.vocabulary.items()}

        sentence_as_list = [ind_to_word[x] for x in sentence_idx]
        cleaned_list = [x for x in sentence_as_list if x != 'CUSTOM_MASK']
        return " ".join(cleaned_list)
Ejemplo n.º 10
0
from __future__ import print_function, unicode_literals
import example_helper
import json
from torchmoji.create_vocab import extend_vocab, VocabBuilder
from torchmoji.word_generator import WordGenerator

new_words = ['#zzzzaaazzz', 'newword', 'newword']
word_gen = WordGenerator(new_words)
vb = VocabBuilder(word_gen)
vb.count_all_words()

with open('../model/vocabulary.json') as f:
    vocab = json.load(f)

print(len(vocab))
print(vb.word_counts)
extend_vocab(vocab, vb, max_tokens=1)

# 'newword' should be added because it's more frequent in the given vocab
print(vocab['newword'])
print(len(vocab))
Ejemplo n.º 11
0
import copy
from nltk.tokenize import TweetTokenizer
import random
from torchmoji.word_generator import WordGenerator

warnings.filterwarnings('ignore')

random.seed(3)
np.random.seed(3)

torch_helper = TorchHelper()

tokenizer = TweetTokenizer()
wordgen = WordGenerator(None,
                        allow_unicode_text=True,
                        ignore_emojis=False,
                        remove_variation_selectors=True,
                        break_replacement=True)

#NAG, OAG, CAG
task1_weights = [0.53151412, 1.78422392, 1.79174652]
#NGEN, GEN
task2_weights = [0.58438203, 3.46271605]

class_weights1 = torch.FloatTensor(task1_weights)
class_weights2 = torch.FloatTensor(task2_weights)

start_epoch = 0
batch_size = C.batch_size
max_epochs = 200
learning_rate = 0.00001
Ejemplo n.º 12
0
    PATH_OUR = '{}/{}/{}'.format(DIR, dset, FILENAME_OUR)
    PATH_COMBINED = '{}/{}/{}'.format(DIR, dset, FILENAME_COMBINED)

    with open(PATH_RAW, 'rb') as dataset:
        if IS_PYTHON2:
            data = pickle.load(dataset)
        else:
            data = pickle.load(dataset, fix_imports=True)

    # Decode data
    try:
        texts = [unicode(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    wg = WordGenerator(texts)
    vb = VocabBuilder(wg)
    vb.count_all_words()

    # Calculate max length of sequences considered
    # Adjust batch_size accordingly to prevent GPU overflow
    lengths = [len(tokenize(t)) for t in texts]
    maxlen = roundup(np.percentile(lengths, 80.0))

    # Extract labels
    labels = [x['label'] for x in data['info']]

    convert_dataset(PATH_OWN, 50000, {})
    convert_dataset(PATH_OUR, 0, vocab)
    convert_dataset(PATH_COMBINED, 10000, vocab)
Ejemplo n.º 13
0
def test_convert_unicode_chars():
    """ convert_unicode_word correctly converts accented characters.
    """
    wg = WordGenerator([], allow_unicode_text=True)
    result = wg.convert_unicode_word(u'ěščřžýáíé')
    assert result == (True, u'\u011b\u0161\u010d\u0159\u017e\xfd\xe1\xed\xe9'), '{}'.format(result)
Ejemplo n.º 14
0
def test_unicode_sentences_ignored_if_set():
    """ Strings with Unicode characters tokenize to empty array if they're not allowed.
    """
    sentence = [u'Dobrý den, jak se máš?']
    wg = WordGenerator(sentence, allow_unicode_text=False)
    assert wg.get_words(sentence[0]) == []