def __init__(self, texts, padding, reverse=False, num_words=None):
    # Inheritance of the Tokenizer class from keras.
    Tokenizer.__init__(self, num_words=num_words, char_level=False)

    # Build the vocabulary from the texts.
    self.fit_on_texts(texts)

    # Inverse lookup from integer-tokens to words.
    self.index_to_words = dict(
        zip(self.word_index.values(), self.word_index.keys()))

    # Convert all texts to lists of integer-tokens.
    self.tokens = self.texts_to_sequences(texts)

    if reverse:
        # Reverse each token-sequence in place.
        for count, g in enumerate(self.tokens):
            self.tokens[count] = list(reversed(g))

        # Truncate over-long sequences at the beginning, which
        # corresponds to the end of the original sequences.
        truncating = 'pre'
    else:
        # Truncate over-long sequences at the end.
        truncating = 'post'

    # Number of integer-tokens in each sequence.
    self.num_tokens = [len(g) for g in self.tokens]

    # Fixed length to which all sequences are padded / truncated.
    self.max_tokens = 24

    self.tokens_padded = pad_sequences(self.tokens,
                                       maxlen=self.max_tokens,
                                       padding=padding,
                                       truncating=truncating)
def __init__(self, texts, num_words=None):
    Tokenizer.__init__(self, num_words=num_words)

    # Build the vocabulary from the texts.
    self.fit_on_texts(texts)

    # Convert all texts to lists of integer-tokens.
    self.tokens = self.texts_to_sequences(texts)

    # Pad / truncate all sequences to mean + 2 standard deviations
    # of the sequence lengths, which keeps most sequences intact
    # while bounding the width of the padded matrix.
    self.tokens_length = [len(x) for x in self.tokens]
    self.max_tokens = np.mean(self.tokens_length) + \
        2 * np.std(self.tokens_length)
    self.max_tokens = int(self.max_tokens)

    self.tokens_padded = pad_sequences(
        self.tokens, maxlen=self.max_tokens, truncating='post')
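# A quick standalone check of the mean + 2*std heuristic used
# above: it bounds the padded-matrix width while leaving the large
# majority of sequences untruncated. The `lengths` array below is
# illustrative data, not taken from the original code.
import numpy as np

lengths = np.array([4, 7, 9, 12, 15, 22, 40])
max_tokens = int(np.mean(lengths) + 2 * np.std(lengths))
coverage = np.mean(lengths <= max_tokens)
print(max_tokens, coverage)  # 38 and ~0.86 for this small sample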
def __init__(self, texts, padding, reverse=False, num_words=None):
    """
    :param texts: List of strings. This is the data-set.
    :param padding: Either 'post' or 'pre' padding.
    :param reverse: Boolean whether to reverse token-lists.
    :param num_words: Max number of words to use.
    """
    Tokenizer.__init__(self, num_words=num_words)

    # Create the vocabulary from the texts.
    self.fit_on_texts(texts)

    # Create inverse lookup from integer-tokens to words.
    self.index_to_word = dict(zip(self.word_index.values(),
                                  self.word_index.keys()))

    # Convert all texts to lists of integer-tokens.
    # Note that the sequences may have different lengths.
    self.tokens = self.texts_to_sequences(texts)

    if reverse:
        # Reverse the token-sequences.
        self.tokens = [list(reversed(x)) for x in self.tokens]

        # Sequences that are too long should now be truncated
        # at the beginning, which corresponds to the end of
        # the original sequences.
        truncating = 'pre'
    else:
        # Sequences that are too long should be truncated
        # at the end.
        truncating = 'post'

    # The number of integer-tokens in each sequence.
    self.num_tokens = [len(x) for x in self.tokens]

    # Max number of tokens to use in all sequences.
    # We will pad / truncate all sequences to this length.
    # This is a compromise so we save a lot of memory and
    # only have to truncate maybe 5% of all the sequences.
    self.max_tokens = np.mean(self.num_tokens) + 2 * np.std(self.num_tokens)
    self.max_tokens = int(self.max_tokens)

    # Pad / truncate all token-sequences to the given length.
    # This creates a 2-dim numpy matrix that is easier to use.
    self.tokens_padded = pad_sequences(self.tokens,
                                       maxlen=self.max_tokens,
                                       padding=padding,
                                       truncating=truncating)
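# A usage sketch for the documented variant above. The class name
# `TokenizerWrap` is an assumption (only __init__ is shown), and
# the imports are the usual homes of Tokenizer / pad_sequences:
#
#     import numpy as np
#     from tensorflow.keras.preprocessing.text import Tokenizer
#     from tensorflow.keras.preprocessing.sequence import pad_sequences

texts = ['the cat sat on the mat',
         'the dog chased the cat across the yard']
tokenizer = TokenizerWrap(texts=texts, padding='pre',
                          reverse=True, num_words=10000)
print(tokenizer.tokens_padded.shape)  # (2, max_tokens)
print(tokenizer.index_to_word[1])     # most frequent word: 'the'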
def __init__(self, texts, num_words=None):
    """
    :param texts: List of strings with the data-set.
    :param num_words: Max number of words to use.
    """
    Tokenizer.__init__(self, num_words=num_words)

    # Create the vocabulary from the texts.
    self.fit_on_texts(texts)

    # Create inverse lookup from integer-tokens to words.
    self.index_to_word = dict(
        zip(self.word_index.values(), self.word_index.keys()))
def __init__(self, nt):
    Tokenizer.__init__(self)

    # Enumerate every possible k-mer of length `nt` over the
    # nucleotides A, T, C, G, then fit the vocabulary on them so
    # each k-mer gets a fixed integer token.
    if nt == 3:
        self.dic = [a + b + c
                    for a in 'ATCG'
                    for b in 'ATCG'
                    for c in 'ATCG']
    elif nt == 2:
        self.dic = [a + b for a in 'ATCG' for b in 'ATCG']
    elif nt == 1:
        self.dic = list('ATCG')
    else:
        self.dic = []

    self.fit_on_texts(self.dic)
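# A usage sketch for the k-mer variant above (the class name
# `KmerTokenizer` is an assumption). A DNA string is split into
# overlapping 3-mers joined by spaces, so the inherited
# texts_to_sequences() can map each 3-mer to its integer token.
tokenizer = KmerTokenizer(nt=3)

seq = 'ATCGGATC'
kmers = ' '.join(seq[i:i + 3] for i in range(len(seq) - 2))
# 'ATC TCG CGG GGA GAT ATC'
tokens = tokenizer.texts_to_sequences([kmers])[0]
# Six integers; the first equals the last because 'ATC' repeats.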
def __init__(self, texts, padding, reverse=False, num_words=None):
    Tokenizer.__init__(self, num_words=num_words)
    self.fit_on_texts(texts)

    self.index_to_word = dict(
        zip(self.word_index.values(), self.word_index.keys()))

    self.tokens = self.texts_to_sequences(texts)

    if reverse:
        self.tokens = [list(reversed(x)) for x in self.tokens]
        truncating = 'pre'
    else:
        truncating = 'post'

    self.num_tokens = [len(x) for x in self.tokens]
    self.max_tokens = np.mean(
        self.num_tokens) + 2 * np.std(self.num_tokens)
    self.max_tokens = int(self.max_tokens)

    self.tokens_padded = pad_sequences(self.tokens,
                                       maxlen=self.max_tokens,
                                       padding=padding,
                                       truncating=truncating)
def __init__(self, texts, num_words=None):
    Tokenizer.__init__(self, num_words=num_words)
    self.fit_on_texts(texts)
    self.index_to_word = dict(zip(self.word_index.values(),
                                  self.word_index.keys()))
def __init__(self, **tokenizer_params):
    # Pass any keyword arguments straight through to the Keras
    # Tokenizer (e.g. num_words, oov_token, filters).
    Tokenizer.__init__(self, **tokenizer_params)
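# A usage sketch for the pass-through variant above (the class
# name `TokenizerWrapper` is an assumption): any Keras Tokenizer
# keyword argument can be forwarded, e.g. num_words or oov_token.
tokenizer = TokenizerWrapper(num_words=5000, oov_token='<unk>')
tokenizer.fit_on_texts(['the cat sat on the mat'])
print(tokenizer.texts_to_sequences(['the cat flew']))
# The unseen word 'flew' maps to the <unk> token's id.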