def preprocess(train_data_file, word_index_file, num_words):
    """Loads a NumPy .npz file and processes the data.

    Pads the arrays so they all have the same length, then creates an
    integer tensor of shape max_length * num_reviews. An embedding layer
    capable of handling this shape is then used as the first layer in the
    network.

    Args:
        train_data_file: (str) Location of the file.
        word_index_file: (str) Location of the JSON file with index information.
        num_words: (int) Number of words to get from the IMDB dataset.

    Returns:
        A tuple of training and test data.
    """
    (train_data, train_labels), (test_data, test_labels) = _load_data(
        path=train_data_file, num_words=num_words)
    word_index = _get_word_index(word_index_file)

    # Standardize the lengths for training.
    train_data = pad_sequences(train_data,
                               value=word_index['<PAD>'],
                               padding='post',
                               maxlen=SENTENCE_SIZE)

    # Standardize the lengths for test.
    test_data = pad_sequences(test_data,
                              value=word_index['<PAD>'],
                              padding='post',
                              maxlen=SENTENCE_SIZE)

    return (train_data, train_labels), (test_data, test_labels)
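# --- Hedged usage sketch (not from the original project): illustrates the
# post-padding with an explicit pad value that preprocess() relies on. The
# import path and the toy sequences are assumptions; 0 stands in for
# word_index['<PAD>'].
from tensorflow.keras.preprocessing.sequence import pad_sequences as _pad

_toy = [[11, 23], [5, 9, 42, 7]]
print(_pad(_toy, value=0, padding='post', maxlen=5))
# -> [[11 23  0  0  0]
#     [ 5  9 42  7  0]]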
def input_fn(texts, labels, tokenizer, batch_size, mode):
    # Transform text to a sequence of integers.
    x = tokenizer.texts_to_sequences(texts)

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded at the beginning and sequences longer are truncated at the
    # beginning.
    x = sequence.pad_sequences(x, maxlen=MAX_SEQUENCE_LENGTH)

    # Default settings for training.
    num_epochs = None
    shuffle = True

    # Override if this is eval.
    if mode == tf.estimator.ModeKeys.EVAL:
        num_epochs = 1
        shuffle = False

    return tf.estimator.inputs.numpy_input_fn(
        x,
        y=labels,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        queue_capacity=50000)
def get_dataset():
    (x_train, y_train), (_, _) = imdb.load_data(num_words=max_features)
    x_train = sequence.pad_sequences(x_train, maxlen=80)
    ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    ds = ds.repeat()
    ds = ds.map(lambda x, y: (x, tf.cast(y, tf.int32)))
    ds = ds.batch(32, drop_remainder=True)
    return ds
def get_sentence_data(train_path_list, test_path_list):
    train_sentence_list = get_sentence_list(train_path_list)
    train_data = pd.DataFrame({'sentence': train_sentence_list,
                               'label': [0] * 1000 + [1] * 1000})

    test_sentence_list = get_sentence_list(test_path_list)
    test_data = pd.DataFrame({'sentence': test_sentence_list,
                              'label': [0] * 1000 + [1] * 1000})

    clean_train_sentences = []
    for sentence in train_data['sentence']:
        clean_train_sentences.append(preprocessing(sentence, remove_stopwords=True))

    clean_test_sentences = []
    for sentence in test_data['sentence']:
        clean_test_sentences.append(preprocessing(sentence, remove_stopwords=True))

    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(clean_train_sentences)
    train_text_sequences = tokenizer.texts_to_sequences(clean_train_sentences)
    test_text_sequences = tokenizer.texts_to_sequences(clean_test_sentences)

    MAX_SEQUENCE_LENGTH = 3817
    X_train = pad_sequences(train_text_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    X_test = pad_sequences(test_text_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

    # clean_train_df = pd.DataFrame({'sentence': clean_train_sentences, 'label': train_data['label']})
    # clean_test_df = pd.DataFrame({'sentence': clean_test_sentences, 'label': test_data['label']})

    y_train = np.array(train_data['label'])
    print('Shape of X_train: ', X_train.shape)
    print('Shape of y_train: ', y_train.shape)
    np.save(data_path + 'X_train', X_train)
    np.save(data_path + 'y_train', y_train)

    y_test = np.array(test_data['label'])
    print('Shape of X_test: ', X_test.shape)
    print('Shape of y_test: ', y_test.shape)
    np.save(data_path + 'X_test', X_test)
    np.save(data_path + 'y_test', y_test)

    print('finished saving data')
    return tokenizer
def tokenize(self, sent):
    if len(self._train_corpus_tokens_) > 0:
        input_len = len(self._train_corpus_tokens_[0])
    else:
        input_len = len(self._test_corpus_tokens_[0])
    output = np.asarray(
        pad_sequences(self.tok.texts_to_sequences([sent]),
                      maxlen=input_len,
                      truncating='post')[0])
    print(output)
    return output
def predict_rnn(question):
    tokenizer = load(path + 'tokenizer_ref.pkl')
    X_token = tokenizer.texts_to_sequences([my_data.str_clean(question)])
    X_token = pad_sequences(X_token, maxlen=max_tokens,
                            padding=pad, truncating=pad).tolist()
    result = predict_rnn_token(X_token)
    logging.info('predict rnn: ' + question + ' result')
    return result
def predict(cls, input):
    """For the input, do the predictions and return them.

    Args:
        input (a single news headline): The data on which to do the predictions.
    """
    clf = cls.get_model()
    seq = tokenizer.texts_to_sequences([input])
    d = pad_sequences(seq, maxlen=MAX_LEN)
    prediction = clf.predict_classes(np.array(d))
    return get_class_label(prediction)
def evaluate(test_file, sess, actions, actions_len, max_sentence_len,
             utterance_ph, all_utterance_len_ph, response_ph, response_len,
             y_pred):
    each_test_run = len(actions) // 3
    acc1 = [0.0] * 10
    rank1 = 0.0
    cnt = 0
    print('evaluating')
    with open(test_file, encoding="utf8") as f:
        lines = f.readlines()
        low = 0
        history, true_utt = build_evaluate_data(lines)
        history, history_len = multi_sequences_padding(history, max_sentence_len)
        true_utt_len = np.array(
            get_sequences_length(true_utt, maxlen=max_sentence_len))
        true_utt = np.array(
            pad_sequences(true_utt, padding='post', maxlen=max_sentence_len))
        history, history_len = np.array(history), np.array(history_len)
        feed_dict = {
            utterance_ph: history,
            all_utterance_len_ph: history_len,
            response_ph: true_utt,
            response_len: true_utt_len
        }
        true_scores = sess.run(y_pred, feed_dict=feed_dict)
        true_scores = true_scores[:, 1]
        for i in range(true_scores.shape[0]):
            all_candidate_scores = []
            for j in range(3):
                feed_dict = {
                    utterance_ph: np.concatenate(
                        [history[low:low + 1]] * each_test_run, axis=0),
                    all_utterance_len_ph: np.concatenate(
                        [history_len[low:low + 1]] * each_test_run, axis=0),
                    response_ph: actions[each_test_run * j:each_test_run * (j + 1)],
                    response_len: actions_len[each_test_run * j:each_test_run * (j + 1)]
                }
                candidate_scores = sess.run(y_pred, feed_dict=feed_dict)
                all_candidate_scores.append(candidate_scores[:, 1])
            all_candidate_scores = np.concatenate(all_candidate_scores, axis=0)
            pos1 = np.sum(true_scores[i] + 1e-8 < all_candidate_scores)
            if pos1 < 10:
                acc1[pos1] += 1
            rank1 += pos1
            low += 1
        cnt += true_scores.shape[0]
    print([a / cnt for a in acc1])  # rank top 1 to top 10 acc
    print(rank1 / cnt)  # average rank
    print(np.sum(acc1[:3]) * 1.0 / cnt)  # top 3 acc
def get_4chan(lookback_list, tokenizer, model):
    biz = py4chan.Board('biz')
    threads = biz.get_all_threads()

    thread_list = []
    post_list = []
    timestamp_list = []
    for thread in threads:
        posts = [post.text_comment for post in thread.replies]
        timestamps = [post.timestamp for post in thread.replies]
        topics = [thread.topic.text_comment for post in thread.replies]
        for post in posts:
            post_list.append(post.strip('>'))
        for ts in timestamps:
            timestamp_list.append(ts)
        for topic in topics:
            thread_list.append(topic)

    post_df = pd.DataFrame(timestamp_list, columns=['Timestamp'])
    post_df['Thread'] = pd.Series(thread_list)
    post_df['Text'] = pd.Series(post_list)

    max_val = max(lookback_list)
    placeholder = get_lookback(max_val)
    if type(placeholder) == datetime.datetime:
        placeholder = time.mktime(placeholder.timetuple())
    start = datetime.datetime.fromtimestamp(placeholder)

    unique_posts = post_df.drop_duplicates(keep='first', inplace=False)
    unique_comment_seqs = tokenizer.texts_to_sequences(
        unique_posts['Text'].values)
    padded_seqs = pad_sequences(unique_comment_seqs, maxlen=12)
    original_seqs = padded_seqs.shape[0]

    batch_size = model.input_shape[0]
    filler = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
    while padded_seqs.shape[0] % batch_size != 0:
        padded_seqs = np.vstack((padded_seqs, filler))
    final_data = np.vstack(
        (padded_seqs, np.zeros(shape=(batch_size * 10, 12))))

    preds = model.predict(final_data, batch_size=128, verbose=0)
    origs = preds[:original_seqs]
    unique_posts['Negative'] = origs[:, 0]
    unique_posts['Positive'] = origs[:, 1]
    unique_posts['Net_Sentiment'] = unique_posts['Positive'] - unique_posts['Negative']

    timeframe_lists = [
        unique_posts[unique_posts['Timestamp'] >= dt_to_int(get_lookback(ph))]
        for ph in lookback_list
    ]

    for lb in lookback_list:
        lb = get_lookback(lb)
        if type(lb) == datetime.datetime:
            lb = time.mktime(lb.timetuple())
        timing = datetime.datetime.fromtimestamp(lb)
        print(f'Cryptocurrency 4chan posts from {timing} to now.')

    return timeframe_lists
def read_tokens_v2(token_user_ids_path):
    token_ids_set = []
    token_len_set = []
    with open(token_user_ids_path, mode="rt", encoding="utf-8") as fhu:
        user_utt = fhu.readline()
        counter = 0
        while user_utt:
            counter += 1
            if counter % 10000 == 0:
                print("  reading %s, line %d" % (token_user_ids_path, counter))
                sys.stdout.flush()
            user_utt = user_utt.replace("\n", "")
            source_ids = user_utt.split("\u241D")
            cid = int(source_ids[0])
            token_seq = source_ids[1].split("\u241E")
            token_len = len(token_seq)

            # q1_batch = []
            # q2_batch = []
            # label_batch = []
            #
            # for q1, q2, label in batch:
            #     q1_length, q2_length = len(q1), len(q2)
            #
            #     q1_padding = [PAD_INDEX] * (max_len - q1_length)
            #     q2_padding = [PAD_INDEX] * (max_len - q2_length)
            #
            #     q1 = list(map(int, q1))
            #     q2 = list(map(int, q2))
            #
            #     q1_pad_seq, q2_pad_seq = (q1 + q1_padding), (q2 + q2_padding)
            #     q1_pad_seq, q2_pad_seq = q1_pad_seq[:max_len], q2_pad_seq[:max_len]
            #
            #     # input embed stuff
            #     q1_pad_seq = emb_vector[q1_pad_seq]
            #     q2_pad_seq = emb_vector[q2_pad_seq]
            #
            # # q1_pad_seq = pad_sequences(q1, maxlen=m

            token_seq = pad_sequences([token_seq],
                                      maxlen=config.buckets[0],
                                      padding='post')

            # for idx in range(len(token_seq)):
            #     if token_seq[idx] >= config.input_vocab_size:
            #         token_seq[idx] = config.UNK_ID
            #     if token_seq[idx] >= config.get('input_vocab_size'):
            #         token_seq[idx] = config.get('UNK_ID')

            token_ids_set.append([cid, token_seq])
            token_len_set.append([cid, token_len])
            user_utt = fhu.readline()
    return dict(token_ids_set), dict(token_len_set)
def input_data_for_model(input_shape):
    # Load the data.
    input_data = load_data()
    # Process the data.
    data_processing()

    # Load the dictionaries.
    with open(CONSTANTS[1], 'rb') as f:
        word_dictionary = pickle.load(f)
    with open(CONSTANTS[2], 'rb') as f:
        inverse_word_dictionary = pickle.load(f)
    with open(CONSTANTS[3], 'rb') as f:
        label_dictionary = pickle.load(f)
    with open(CONSTANTS[4], 'rb') as f:
        output_dictionary = pickle.load(f)

    vocab_size = len(word_dictionary.keys())
    label_size = len(label_dictionary.keys())

    # Process the input data.
    aggregate_function = lambda input: [
        (word, pos, label)
        for word, pos, label in zip(input['word'].values.tolist(),
                                    input['pos'].values.tolist(),
                                    input['tag'].values.tolist())
    ]
    grouped_input_data = input_data.groupby('sent_no').apply(aggregate_function)
    sentences = [sentence for sentence in grouped_input_data]

    x = [[word_dictionary[word[0]] for word in sent] for sent in sentences]
    x = sequence.pad_sequences(maxlen=input_shape, sequences=x, padding='post', value=0)
    y = [[label_dictionary[word[2]] for word in sent] for sent in sentences]
    y = sequence.pad_sequences(maxlen=input_shape, sequences=y, padding='post', value=0)
    y = [to_categorical(label, num_classes=label_size + 1) for label in y]

    return x, y, output_dictionary, vocab_size, label_size, inverse_word_dictionary
def label_correlation(df, label):
    # Construct the label positive correlation and negative correlation.
    # goals, goals_word = get_tficf(df['Goals'].tolist())
    # targets, targets_word = get_tficf(df['Targets'].tolist())
    # indicators, indicators_word = get_tficf(df['Indicators'].tolist())
    # tficf = np.concatenate([goals, targets, indicators], axis=-1)

    ## Indicators level feature
    tficf, word_0 = get_tficf(df['Indicators'].tolist())
    level_0 = defaultdict(lambda: defaultdict(list))
    i = 0
    for idx, row in df.iterrows():
        level_0[row['goal_no']][row['target_no']].append(tficf[i])
        i += 1
    nT, nI = (max([len(x) for x in level_0.values()]),
              max([max([len(i) for i in x.values()]) for x in level_0.values()]))
    level_0 = sequence.pad_sequences(
        [sequence.pad_sequences([i for i in x.values()],
                                maxlen=nI, dtype='float32',
                                padding='post', truncating='post')
         for x in level_0.values()],
        maxlen=nT, dtype='float32', padding='post', truncating='post')
    np.save("level_0.npy", level_0)
    json.dump(word_0, open('word_0.json', 'w', encoding='utf-8'), indent=4)

    ## Targets level feature
    label_l1 = df.groupby('Targets').agg({'goal_no': 'first',
                                          'Goals': 'first',
                                          'Indicators': '. '.join,
                                          'target_no': ','.join}).reset_index()
    label_l1 = label_sequence(label_l1, label)
    tficf, word_1 = get_tficf((label_l1['Targets'] + ' ' + label_l1['Indicators']).tolist())
    level_1 = defaultdict(list)
    i = 0
    for idx, row in label_l1.iterrows():
        level_1[row['goal_no']].append(tficf[i])
        i += 1
    level_1 = sequence.pad_sequences([x for x in level_1.values()],
                                     maxlen=nT, dtype='float32',
                                     padding='post', truncating='post')
    np.save("level_1.npy", level_1)
    json.dump(word_1, open('word_1.json', 'w', encoding='utf-8'), indent=4)

    ## Goals level feature
    label_l2 = label_l1.groupby('Goals').agg({'goal_no': 'first',
                                              'Targets': '. '.join,
                                              'Indicators': '. '.join}).reset_index()
    label_l2 = label_sequence(label_l2, label)
    level_2, word_2 = get_tficf((label_l2['Goals'] + ' ' + label_l2['Targets'] + ' ' +
                                 label_l2['Indicators']).tolist())
    np.save("level_2.npy", level_2)
    json.dump(word_2, open('word_2.json', 'w', encoding='utf-8'), indent=4)

    return level_0, level_1, level_2, word_0, word_1, word_2
def sequence_vectorize(train_texts, val_texts, k=1):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.
        k: int, suffix used when naming the saved tokenizer file.

    # Returns
        x_train, x_val, word_index: vectorized training and validation
            texts and word index dictionary.
    """
    print('Tokenizing')
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)

    print('Vectorizing')
    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded at the beginning and sequences longer are truncated at the
    # beginning.
    print('Padding/Truncating Sequences')
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)

    # Save the tokenizer to disk.
    print('Saving Tokenizer')
    token_config = tokenizer.to_json()
    with open('amazon_sepcnn_' + str(k) + 'k_tokenizer.json', 'w') as f:
        f.write(token_config)

    return x_train, x_val, tokenizer.word_index
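# --- Hedged sketch (illustration only, not from the original project): shows
# the default pad_sequences behaviour that sequence_vectorize() relies on --
# shorter sequences are padded at the beginning and longer ones truncated at
# the beginning. The toy texts and maxlen are assumptions.
from tensorflow.keras.preprocessing import text as _text, sequence as _sequence

_tok = _text.Tokenizer(num_words=1000)
_tok.fit_on_texts(['the movie was great', 'terrible plot but great acting'])
_seqs = _tok.texts_to_sequences(['the movie was great', 'terrible plot'])
print(_sequence.pad_sequences(_seqs, maxlen=4))  # second row is pre-padded with zeros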
def split_and_zero_padding(df, max_seq_length):
    logging.info('Padding sequence')

    # Split to dicts.
    x = {'left': df['current_n'], 'right': df['prior_n']}

    # Zero padding.
    dataset = dict()
    for i, index in itertools.product([x], ['left', 'right']):
        dataset[index] = pad_sequences(i[index],
                                       padding='pre',
                                       truncating='post',
                                       maxlen=max_seq_length)

    return dataset
def transform(self, text_list):
    # Transform text to a sequence of integers.
    text_list = [self._clean_line(txt) for txt in text_list]
    text_sequence = self._tokenizer.texts_to_sequences(text_list)

    # Fix sequence length to max value. Sequences shorter than the length
    # are padded at the beginning and sequences longer are truncated at the
    # beginning.
    padded_text_sequence = sequence.pad_sequences(
        text_sequence, maxlen=self._max_sequence_length)
    return padded_text_sequence
def transform(self, X):
    res = self.tokenizer.texts_to_sequences([str(word) for word in X])
    max_len = len(max(X, key=len))
    if max_len > self.max_sequence_len:
        max_len = self.max_sequence_len
    res = sequence.pad_sequences(res, maxlen=max_len)
    global INPUT_SHAPE
    INPUT_SHAPE = res.shape[1:]
    return res
def pad_one_sequence(sequence, length, dtype='int32', padding='post',
                     truncating='pre', value=0.0):
    sequence = tf.expand_dims(sequence, axis=0)
    sequence_padded = pad_sequences(sequence, length, dtype, padding,
                                    truncating, value)
    sequence_padded = tf.squeeze(sequence_padded, axis=0)
    return sequence_padded
def test_pad_sequences(self):
    a = [[1], [1, 2], [1, 2, 3]]

    # Test padding.
    b = preprocessing_sequence.pad_sequences(a, maxlen=3, padding='pre')
    self.assertAllClose(b, [[0, 0, 1], [0, 1, 2], [1, 2, 3]])
    b = preprocessing_sequence.pad_sequences(a, maxlen=3, padding='post')
    self.assertAllClose(b, [[1, 0, 0], [1, 2, 0], [1, 2, 3]])

    # Test truncating.
    b = preprocessing_sequence.pad_sequences(a, maxlen=2, truncating='pre')
    self.assertAllClose(b, [[0, 1], [1, 2], [2, 3]])
    b = preprocessing_sequence.pad_sequences(a, maxlen=2, truncating='post')
    self.assertAllClose(b, [[0, 1], [1, 2], [1, 2]])

    # Test value.
    b = preprocessing_sequence.pad_sequences(a, maxlen=3, value=1)
    self.assertAllClose(b, [[1, 1, 1], [1, 1, 2], [1, 2, 3]])
def build_model_input(self, data):
    model_input = {name: data[name] for name in self.sparse_features}
    if self.variable_length_features:
        for feat in self.variable_length_features:
            pad_variable_length_features = pad_sequences(
                data[feat],
                maxlen=self.variable_length_features_max_len[feat],
                padding='post',
            )
            model_input[feat] = pad_variable_length_features
    return model_input
def __init__(self, timit_root):
    self.max_label_len = 0

    # Load the dataset.
    training_root = os.path.join(timit_root, 'TRAIN')
    test_root = os.path.join(timit_root, 'TEST')
    (self.ph_org_train, self.train_input_length, self.train_label_length,
     self.x_train, self.y_train) = self.load_split_timit_data(training_root)
    (self.ph_org_test, self.test_input_length, self.test_label_length,
     self.x_test, self.y_test) = self.load_split_timit_data(test_root)

    self.normalize_xs()

    self.train_padded_ph = pad_sequences(self.y_train,
                                         maxlen=self.max_label_len,
                                         padding='post',
                                         value=len(self.phonemes))
    self.test_padded_ph = pad_sequences(self.y_test,
                                        maxlen=self.max_label_len,
                                        padding='post',
                                        value=len(self.phonemes))
def padding(sentences: List[List[int]], pad: int = None) -> List[List[int]]:
    """
    This method pads sentences converted with text2id.

    :param sentences: converted sentences to pad
    :param pad: maximum padding length; if None, pad to the longest sentence
    :return: the padded sentences
    """
    return pad_sequences(sentences, maxlen=pad, truncating="post", padding="post")
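# --- Hedged sketch (illustration only): with pad=None the helper above pads to
# the longest sentence; with an explicit pad it post-pads and post-truncates.
# Assumes pad_sequences is already imported in this module, as padding() requires.
print(padding([[1, 2], [3, 4, 5, 6]]))          # [[1 2 0 0] [3 4 5 6]]
print(padding([[1, 2], [3, 4, 5, 6]], pad=3))   # [[1 2 0] [3 4 5]]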
def sent2oh(sentence, language='en', se=False, reverse=False):
    oh = list()
    if language == 'en':
        sequence = en_token.texts_to_sequences([sentence])
        sequence = pad_sequences(sequence, padding='post', maxlen=max_en_len)  # add padding
        if reverse == True:
            sequence = sequence[:, ::-1]
        for seq in sequence:
            oh.append(en_oh[seq])
    elif language == 'fr' and se == True:
        sequence = fr_se_token.texts_to_sequences([sentence])
        sequence = pad_sequences(sequence, padding='post', maxlen=max_fr_se_len)
        if reverse == True:
            sequence = sequence[:, ::-1]
        for seq in sequence:
            oh.append(fr_se_oh[seq])
    return np.array(oh)
def tokenizer(text):
    token_text = token.texts_to_sequences(text)
    token_text = pad_sequences(token_text, maxlen=max_items,
                               padding='pre', truncating='pre')
    result = model.predict(token_text)[0]
    print(result)
    for item in result:
        print(item)
    end_result = "Negative"
    if result >= 0.5:
        end_result = "Positive"
    return end_result
def convertData(x):
    top_words = 15000
    tokenizer = Tokenizer(num_words=top_words)
    tokenizer.fit_on_texts(x)
    max_tokens = 228
    x_train_tokens = tokenizer.texts_to_sequences(x)
    x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,
                                padding='pre', truncating='pre')
    return x_train_pad
def _input_text_to_pad_id(text, vocab_to_ids, tokenizer):
    data_id = [
        vocab_to_ids[token] if token in vocab_to_ids else WordVector.UNK_ID
        for token in tokenizer.tokenize(text.upper())
    ]
    data = sequence.pad_sequences([data_id],
                                  maxlen=MAXLEN,
                                  truncating='post',
                                  padding='post',
                                  value=WordVector.PAD_ID)
    return {'input': data}
def text_to_sequences(self):
    # Tokenize (fit) on the training data; oov_token is set so new words in
    # X_test are not ignored.
    word_tokenizer = Tokenizer(oov_token=True)
    word_tokenizer.fit_on_texts(self.X_train)

    # Length of the vocabulary.
    self.vocab_length = len(word_tokenizer.word_index) + 1

    # texts_to_sequences on both the training and the testing data.
    embedded_sentences_train = word_tokenizer.texts_to_sequences(self.X_train)
    embedded_sentences_test = word_tokenizer.texts_to_sequences(self.X_test)

    word_count = lambda sentence: len(word_tokenize(sentence))
    longest_sentence = max(self.X_train, key=word_count)
    self.length_long_sentence = len(word_tokenize(longest_sentence))

    self.padded_sentences_train = pad_sequences(embedded_sentences_train,
                                                self.length_long_sentence,
                                                padding='post')
    self.padded_sentences_test = pad_sequences(embedded_sentences_test,
                                               self.length_long_sentence,
                                               padding='post')

    embeddings_dictionary = dict()
    for line in self.glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions
    self.glove_file.close()

    # Embedding matrix.
    self.embedding_matrix = np.zeros((self.vocab_length, 300))
    for word, index in word_tokenizer.word_index.items():
        embedding_vector = embeddings_dictionary.get(word)
        if embedding_vector is not None:
            self.embedding_matrix[index] = embedding_vector
def sequence_vectorize(train_texts, val_texts, number_of_features, max_sequence_length):
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=number_of_features)
    tokenizer.fit_on_texts(train_texts)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > max_sequence_length:
        max_length = max_sequence_length

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded at the beginning and sequences longer are truncated at the
    # beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)

    return x_train, x_val, tokenizer.word_index
def prepare_data(self):
    """
    Main prepare-data routine.

    :return: the padded test data
    """
    (_, _), (x_test, y_test) = imdb.load_data(num_words=self.flags.vocab_size)

    # Build word index and reverse word index.
    self.build_word_index()
    self.build_reverse_word_index()

    self.x_test = x_test
    x_test = pad_sequences(x_test,
                           maxlen=250,
                           value=self.word_index['<PAD>'],
                           padding='post')
    return x_test
def Text_Pipeline(Data, tokenizer, MAX_LENGTH=50):
    """
    Tokenizes and pads the sequences.

    :param Data: text array
    :param tokenizer: tokenizer object
    :param MAX_LENGTH: maximum sequence length
    :return pads: padded sequences
    """
    seqs = tokenizer.texts_to_sequences(Data)
    pads = pad_sequences(seqs, maxlen=MAX_LENGTH)
    return pads
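# --- Hedged usage sketch (illustration only): Text_Pipeline() expects an
# already fitted Keras tokenizer. The toy corpus and MAX_LENGTH are assumptions,
# and pad_sequences is assumed to be imported elsewhere in this module.
from tensorflow.keras.preprocessing.text import Tokenizer as _Tokenizer

_demo_tok = _Tokenizer(num_words=100)
_demo_tok.fit_on_texts(['good service', 'bad service and slow delivery'])
print(Text_Pipeline(['bad delivery'], _demo_tok, MAX_LENGTH=4).shape)  # (1, 4)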
def split_and_zero_padding(df, max_seq_length):
    # Split to dicts.
    X = {'left': df['question1_n'], 'right': df['question2_n']}

    # Zero padding.
    for dataset, side in itertools.product([X], ['left', 'right']):
        dataset[side] = pad_sequences(dataset[side],
                                      padding='pre',
                                      truncating='post',
                                      maxlen=max_seq_length)

    return dataset
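# --- Hedged sketch (illustration only): split_and_zero_padding() above expects
# a DataFrame whose question columns already hold lists of token ids. The toy
# frame is an assumption; pad_sequences and itertools are assumed to be
# imported elsewhere in this module.
import pandas as _pd

_toy_df = _pd.DataFrame({'question1_n': [[4, 8, 2], [7]],
                         'question2_n': [[9, 1], [3, 5, 6, 2]]})
_split = split_and_zero_padding(_toy_df, max_seq_length=3)
print(_split['left'])   # pre-padded:     [[4 8 2] [0 0 7]]
print(_split['right'])  # post-truncated: [[0 9 1] [3 5 6]]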
def vectorize_texts(self, texts: List[str]) -> array:
    if self.tokenizer is not None:
        vectorized_texts: List[List[int]] = self.tokenizer.texts_to_sequences(texts)
    else:
        raise UntrainedBrainError
    padded_vectors: array = sequence.pad_sequences(
        vectorized_texts, maxlen=self._max_sequence_length)
    return padded_vectors
def text_to_tokens(self, text, reverse=False, padding=False):
    tokens = self.texts_to_sequences([text])
    tokens = np.array(tokens)
    if reverse:
        tokens = np.flip(tokens, axis=1)
        truncating = 'pre'
    else:
        truncating = 'post'
    if padding:
        tokens = pad_sequences(tokens,
                               maxlen=self.max_tokens,
                               padding='pre',
                               truncating=truncating)
    return tokens
def encode_x_batch(self, x_batch):
    return pad_sequences([self.encode_x(x) for x in x_batch],
                         maxlen=self.length_range[1])