Example 1
    def __init__(self, texts, padding, reverse=False, num_words=None):
        # Call the constructor of the parent Keras Tokenizer class.
        Tokenizer.__init__(self, num_words=num_words, char_level=False)

        self.fit_on_texts(texts)

        self.index_to_words = dict(
            zip(self.word_index.values(), self.word_index.keys()))
        self.tokens = self.texts_to_sequences(texts)

        if reverse:
            for count, g in enumerate(self.tokens, 0):
                self.tokens[count] = list(reversed(g))
            truncating = 'pre'

        else:
            truncating = 'post'

        self.num_tokens = [len(g) for g in self.tokens]
        # Fixed maximum sequence length used for padding / truncation.
        self.max_tokens = 24

        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)
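
All of the examples on this page show only the constructor of a wrapper class that subclasses the Keras Tokenizer. A minimal, self-contained sketch of the scaffolding they assume follows; the class name TokenizerWrap, the tensorflow.keras import paths and the sample data are illustrative assumptions, not taken from the original sources.

# Assumed scaffolding -- the snippets above and below only show __init__.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


class TokenizerWrap(Tokenizer):
    """Wrap the Keras Tokenizer so texts are tokenized and padded on init."""

    def __init__(self, texts, padding, reverse=False, num_words=None):
        # Call the constructor of the parent Keras Tokenizer class.
        Tokenizer.__init__(self, num_words=num_words, char_level=False)
        self.fit_on_texts(texts)                      # build the vocabulary
        self.tokens = self.texts_to_sequences(texts)  # texts -> integer ids
        if reverse:
            self.tokens = [list(reversed(x)) for x in self.tokens]
        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=24,
                                           padding=padding,
                                           truncating='pre' if reverse else 'post')


texts = ["the cat sat on the mat", "a short text"]
tokenizer = TokenizerWrap(texts, padding='pre', reverse=True)
print(tokenizer.tokens_padded.shape)  # (2, 24)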
Example 2
    def __init__(self, texts, num_words=None):
        Tokenizer.__init__(self, num_words=num_words)
        self.fit_on_texts(texts)
        self.tokens = self.texts_to_sequences(texts)
        self.tokens_length = [len(x) for x in self.tokens]
        self.max_tokens = np.mean(self.tokens_length) + \
            2 * np.std(self.tokens_length)
        self.max_tokens = int(self.max_tokens)

        self.tokens_padded = pad_sequences(
            self.tokens, maxlen=self.max_tokens, truncating='post')
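
The mean + 2 * std rule above chooses a padded length that covers most sequences while keeping the padded matrix small; sequences longer than that are truncated. A quick self-contained check of the arithmetic, with made-up sequence lengths:

# Illustration of the length heuristic; these sequence lengths are made up.
import numpy as np

tokens_length = [8, 12, 15, 9, 30, 11, 10, 14]
max_tokens = int(np.mean(tokens_length) + 2 * np.std(tokens_length))
print(max_tokens)  # 26 -> sequences longer than 26 tokens are truncated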
Example 3
    def __init__(self, texts, padding,
                 reverse=False, num_words=None):
        """
        :param texts: List of strings. This is the data-set.
        :param padding: Either 'post' or 'pre' padding.
        :param reverse: Boolean whether to reverse token-lists.
        :param num_words: Max number of words to use.
        """

        Tokenizer.__init__(self, num_words=num_words)

        # Create the vocabulary from the texts.
        self.fit_on_texts(texts)

        # Create inverse lookup from integer-tokens to words.
        self.index_to_word = dict(zip(self.word_index.values(),
                                      self.word_index.keys()))

        # Convert all texts to lists of integer-tokens.
        # Note that the sequences may have different lengths.
        self.tokens = self.texts_to_sequences(texts)

        if reverse:
            # Reverse the token-sequences.
            self.tokens = [list(reversed(x)) for x in self.tokens]
        
            # Sequences that are too long should now be truncated
            # at the beginning, which corresponds to the end of
            # the original sequences.
            truncating = 'pre'
        else:
            # Sequences that are too long should be truncated
            # at the end.
            truncating = 'post'

        # The number of integer-tokens in each sequence.
        self.num_tokens = [len(x) for x in self.tokens]

        # Max number of tokens to use in all sequences.
        # We will pad / truncate all sequences to this length.
        # This is a compromise so we save a lot of memory and
        # only have to truncate maybe 5% of all the sequences.
        self.max_tokens = np.mean(self.num_tokens) \
                          + 2 * np.std(self.num_tokens)
        self.max_tokens = int(self.max_tokens)

        # Pad / truncate all token-sequences to the given length.
        # This creates a 2-dim numpy matrix that is easier to use.
        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)
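
For reference, a small self-contained illustration of how the padding and truncating arguments of pad_sequences interact; the token values are made up:

# 'pre' padding adds zeros at the front; 'post' truncating cuts from the end.
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokens = [[5, 8, 2], [3, 9, 4, 7, 1, 6]]
print(pad_sequences(tokens, maxlen=4, padding='pre', truncating='post'))
# [[0 5 8 2]
#  [3 9 4 7]]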
Example 4
    def __init__(self, texts, num_words=None):
        """
        :param texts: List of strings with the data-set.
        :param num_words: Max number of words to use.
        """

        Tokenizer.__init__(self, num_words=num_words)

        # Create the vocabulary from the texts.
        self.fit_on_texts(texts)

        # Create inverse lookup from integer-tokens to words.
        self.index_to_word = dict(
            zip(self.word_index.values(), self.word_index.keys()))
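
The index_to_word lookup built here is typically used to turn integer tokens back into text. The helper below is not part of the example; it is a hedged sketch added for illustration:

# Hypothetical helper built on the index_to_word lookup shown above.
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(["the cat sat on the mat"])
index_to_word = dict(zip(tokenizer.word_index.values(),
                         tokenizer.word_index.keys()))


def tokens_to_string(tokens):
    """Convert a list of integer tokens back into a space-joined string."""
    # Token 0 is the padding value and has no word, so it is skipped.
    return " ".join(index_to_word[t] for t in tokens if t != 0)


print(tokens_to_string([1, 2, 3]))  # "the cat sat"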
Example 5
    def __init__(self, nt):

        Tokenizer.__init__(self)
        if nt == 3:
            self.dic = [
                a + b + c for a in 'ATCG' for b in 'ATCG' for c in 'ATCG'
            ]
        elif nt == 2:
            self.dic = [a + b for a in 'ATCG' for b in 'ATCG']
        elif nt == 1:
            self.dic = [a for a in 'ATCG']
        else:
            self.dic = []
        self.fit_on_texts(self.dic)
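
Example 5 fits the tokenizer on every possible 1-, 2- or 3-letter combination of the nucleotides A, T, C and G, so each k-mer gets a stable integer id. A self-contained sketch of the idea for nt = 3; splitting a DNA sequence into k-mers is an assumption here, not shown in the example:

# Illustration of the k-mer vocabulary idea (nt = 3); the sequence is made up.
from tensorflow.keras.preprocessing.text import Tokenizer

kmers = [a + b + c for a in 'ATCG' for b in 'ATCG' for c in 'ATCG']  # 64 3-mers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(kmers)

# Split a DNA sequence into non-overlapping 3-mers (assumed preprocessing).
sequence = "ATGCGTACC"
text = " ".join(sequence[i:i + 3] for i in range(0, len(sequence), 3))
print(tokenizer.texts_to_sequences([text]))  # one integer id per 3-mer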
Example 6
    def __init__(self, texts, padding, reverse=False, num_words=None):

        Tokenizer.__init__(self, num_words=num_words)
        self.fit_on_texts(texts)
        self.index_to_word = dict(
            zip(self.word_index.values(), self.word_index.keys()))
        self.tokens = self.texts_to_sequences(texts)
        if reverse:
            self.tokens = [list(reversed(x)) for x in self.tokens]
            truncating = 'pre'
        else:
            truncating = 'post'
        self.num_tokens = [len(x) for x in self.tokens]
        self.max_tokens = np.mean(
            self.num_tokens) + 2 * np.std(self.num_tokens)
        self.max_tokens = int(self.max_tokens)
        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)
Example 7
    def __init__(self, texts, num_words=None):
        Tokenizer.__init__(self, num_words=num_words)
        self.fit_on_texts(texts)
        self.index_to_word = dict(zip(self.word_index.values(),
                                      self.word_index.keys()))
Example 8
    def __init__(self, **tokenizer_params):
        Tokenizer.__init__(self, **tokenizer_params)
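
Example 8 simply forwards arbitrary keyword arguments to the Keras Tokenizer. A hedged usage sketch; the class name TokenizerWrapper and the parameter values are illustrative:

# Forwarding standard Keras Tokenizer parameters through a thin wrapper.
from tensorflow.keras.preprocessing.text import Tokenizer


class TokenizerWrapper(Tokenizer):
    def __init__(self, **tokenizer_params):
        Tokenizer.__init__(self, **tokenizer_params)


tok = TokenizerWrapper(num_words=10000, oov_token='<UNK>', lower=True)
tok.fit_on_texts(["some training text", "another line of text"])
print(tok.texts_to_sequences(["totally unknown words"]))  # unseen words map to the OOV index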