def main():
    text_tweets = pd.read_csv('../data/tweets_data.csv', delimiter='\t')
    X = text_tweets.Text.values
    print('All data has been loaded')

    SENTENCE_LENGTH = 167
    NUM = 100000
    tokenizer = Tokenizer(num_words=NUM)
    tokenizer.fit_on_texts(X)
    X_seq = get_sequences(tokenizer, X, SENTENCE_LENGTH)
    print('Input data has been tokenized')

    with open('../output/model.pkl', 'rb') as file:
        model = pickle.load(file)
    model.load_weights('../output/cnn-frozen-embeddings-37.hdf5')
    print('Model has been loaded')

    classes = np.array(['anger', 'happiness', 'love', 'neutral', 'sadness'])
    predictions = model.predict(X_seq)
    predicted_ix = np.apply_along_axis(lambda x: np.argmax(x), 1, predictions)
    text_tweets['class_prediction'] = pd.Series(
        np.apply_along_axis(lambda x: classes[x], 0, predicted_ix))
    text_tweets.to_csv('../output/predictions.csv', sep='\t')
    print('Predictions have been saved')
def process_data():
    path = '/content/drive/My Drive/Colab Notebooks/alexa_toy.json'
    # path = os.getcwd() + '/alexa_toy.json'
    with open(path) as f:
        data = json.load(f)

    # extract text and label
    text, label = [], []
    for k, v in data.items():
        for x in v['content']:
            text.append(x['message'].lower())
            label.append(x['sentiment'])

    # convert labels to index
    index, label_id = 0, {}
    for x in np.unique(label):
        label_id[x] = index
        index += 1
    label = [label_id[x] for x in label]

    # process text (for convenience, used keras tools)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    text = tokenizer.texts_to_sequences(text)
    text = pad_sequences(text, maxlen=50)

    train_x, test_x, train_y, test_y = train_test_split(
        text, label, test_size=0.05, shuffle=False, random_state=42)
    print('training size : {} \t test size : {}'.format(len(train_y), len(test_y)))

    return train_x, test_x, train_y, test_y, tokenizer
def createModel(self, text):
    self.embeddings_index = {}
    f = open(os.path.join(GLOVE_DIR, 'glove.840B.300d.txt'), encoding='utf')
    for line in f:
        values = line.split()
        word = ''.join(values[:-300])
        # word = values[0]
        coefs = np.asarray(values[-300:], dtype='float32')
        self.embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(self.embeddings_index))

    tokenizer = Tokenizer(num_words=self.MAX_NB_WORDS, lower=False)
    tokenizer.fit_on_texts(text)
    self.word_index = tokenizer.word_index
    pickle.dump(self.word_index, open("../Models/DeId/word_index.pkl", 'wb'))

    self.embedding_matrix = np.zeros(
        (len(self.word_index) + 1, self.EMBEDDING_DIM))
    print(self.embedding_matrix.shape)
    for word, i in self.word_index.items():
        embedding_vector = self.embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            self.embedding_matrix[i] = embedding_vector

    self.embedding_layer = Embedding(len(self.word_index) + 1,
                                     self.EMBEDDING_DIM,
                                     weights=[self.embedding_matrix],
                                     input_length=70,
                                     trainable=True)

    self.model = Sequential()
    self.model.add(self.embedding_layer)
    self.model.add(
        Bidirectional(
            LSTM(150, dropout=0.3, recurrent_dropout=0.6, return_sequences=True)))
    # merge modes: {'sum', 'mul', 'concat', 'ave', None}
    self.model.add(
        Bidirectional(
            LSTM(60, dropout=0.2, recurrent_dropout=0.5, return_sequences=True)))
    self.model.add(
        SeqSelfAttention(attention_activation='sigmoid', attention_width=12))
    self.model.add(TimeDistributed(Dense(
        9, activation='softmax')))  # a dense layer as suggested by neuralNer
    self.model.compile(loss="categorical_crossentropy",
                       optimizer='rmsprop',
                       metrics=['accuracy'])
    self.model.summary()
class DLModel(BenchmarkedModel):
    def __init__(self):
        super().__init__()
        max_features = 1024
        model = Sequential()
        model.add(Embedding(max_features, output_dim=256))
        model.add(LSTM(128))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation="sigmoid"))
        model.compile(loss="binary_crossentropy",
                      optimizer="rmsprop",
                      metrics=["accuracy"])
        self.clf = model
        self.vectorizer = Tokenizer()

    def fit(self, data, labels):
        self.vectorizer.fit_on_texts(data)
        processed_data = self.vectorizer.texts_to_matrix(data, mode="count")
        self.clf.fit(processed_data, labels, batch_size=16, epochs=10)

    def predict(self, data):
        processed_data = self.vectorizer.texts_to_matrix(data, mode="count")
        return self.clf.predict(processed_data)
def prepare_data(data_set, length=None):
    # tokenize the data set
    bodies_tokenizer, headlines_tokenizer = (Tokenizer(), Tokenizer())

    # find the max length of each dataset
    bodies_max_length = 0
    headlines_max_length = 0
    if not length:
        bodies_max_length = data_set['articleBody'].map(lambda x: len(x.split())).max()
        headlines_max_length = data_set['Headline'].map(lambda x: len(x.split())).max()
    else:
        bodies_max_length = length[0]
        headlines_max_length = length[1]

    # fit the tokenizers on the data set
    bodies_tokenizer.fit_on_texts(data_set['articleBody'])
    headlines_tokenizer.fit_on_texts(data_set['Headline'])

    # convert the texts to sequences
    bodies_sequences = bodies_tokenizer.texts_to_sequences(data_set['articleBody'])
    headlines_sequences = headlines_tokenizer.texts_to_sequences(data_set['Headline'])

    # pad the sequences to the max length
    bodies_sequences = pad_sequences(bodies_sequences, maxlen=bodies_max_length,
                                     padding='post', truncating='post')
    headlines_sequences = pad_sequences(headlines_sequences, maxlen=headlines_max_length,
                                        padding='post', truncating='post')

    return (bodies_sequences, headlines_sequences,
            bodies_tokenizer.word_index, headlines_tokenizer.word_index,
            data_set['Stance'])
def train(self, X, y=None):
    X, y = self.augment_instances(X, y)
    # X_text = self.text_repr_model.fit_transform(X[:, self.args.TEXT_COL])
    X_text = X[:, self.args.TEXT_COL]

    self.max_features = 4000
    self.tokenizer = Tokenizer(num_words=self.max_features)
    self.tokenizer.fit_on_texts(X_text)
    X_text = self.tokenizer.texts_to_sequences(X_text)
    X_text = self.tokenizer.sequences_to_texts(X_text)

    self.text_rep_model = self.build_fit_w2v(X_text)
    X_text = self.transform_text_to_w2v(self.text_rep_model, X_text)
    X_all_feats = self.augment_features(X_text, X)

    pca = PCA(n_components=self.num_clusters, random_state=self.args.random_state)
    pca.fit(X_all_feats)

    model = KMeans(init=pca.components_,
                   n_clusters=self.num_clusters,
                   n_init=1,
                   random_state=self.args.random_state)
    model.fit(X_all_feats)
    self.clf_model = model
def predict():
    test = pd.read_csv(os.path.join(data_path, 'test.csv'))
    # print(test.head())
    test = test.reindex(np.random.permutation(test.index))
    # capture the hashes after shuffling so they stay aligned with the predictions
    test_hash = test['unique_hash']
    test = test[['text', 'drug']]
    test['text_comb'] = test['text'] + test['drug']
    test.text_comb = test.text_comb.apply(remove_stopwords)
    # print(test.head())

    tk = Tokenizer(num_words=NB_WORDS,
                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                   lower=True,
                   split=" ")
    tk.fit_on_texts(test.text_comb)
    test_seq = tk.texts_to_sequences(test.text_comb)
    test_oh = one_hot_seq(test_seq)

    reg_model = models.load_model('./data/reg_model.h5')
    prediction = reg_model.predict_classes(test_oh)

    submission = pd.DataFrame({
        'unique_hash': test_hash,
        'sentiment': prediction,
    })
    submission.to_csv('./data/dl_submission.csv', index=False)
def main():
    reviews_df = get_data()
    print(reviews_df)

    # tokenize all content
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(reviews_df['Review'])
    num_encoder_tokens = len(tokenizer.word_index) + 1

    # create training and testing vars
    X_train, X_test, y_train, y_test = train_test_split(
        reviews_df['Review'], reviews_df['Numeric_Label'],
        test_size=0.2, shuffle=True)

    max_review_length = 500
    X_train = get_encoded_padded_content(tokenizer, X_train, max_review_length)
    X_test = get_encoded_padded_content(tokenizer, X_test, max_review_length)
    # print(X_train)
    # print(y_train)

    # create the model
    embedding_vector_length = 32
    model = Sequential()
    model.add(Embedding(num_encoder_tokens, embedding_vector_length,
                        input_length=max_review_length))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    model_history = model.fit(X_train, y_train, epochs=3, batch_size=4,
                              validation_split=0.2)
    print(model_history.history)

    # final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1] * 100))
def __init__(self, master=None):
    super().__init__(master)
    self.r = sr.Recognizer()
    self.lexical = LexicalAnalysis.LexicalAnalysis()
    self.master = master
    self.pack()
    self.create_widgets()
    self.running = False
    self.text = ""
    self.text_sequence = None
    self.stemmer = WordNetLemmatizer()

    df = pandas.read_csv(
        "D:\\PycharmProjects\\ThesisWork\\Data\\EmotionDetection\\%_by_Emo_Full_Data_data (1).csv")
    df['Tweet'] = df['Tweet'].apply(self.clean)

    MAX_NB_WORDS = 50000  # maximum vocabulary size kept by the tokenizer
    self.MAX_SEQUENCE_LENGTH = 250
    self.tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                               filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                               lower=True)
    self.tokenizer.fit_on_texts(df['Tweet'].values)

    # integer replacement
    X = self.tokenizer.texts_to_sequences(df['Tweet'].values)
    X = pad_sequences(X, maxlen=self.MAX_SEQUENCE_LENGTH)

    # gets categorical values for the labels
    Y = pandas.get_dummies(df['Emotion']).values

    self.neuralNetwork = NeuralNetwork.NeuralNetwork(X.shape[1], 4)
    self.neuralNetwork.fit(X, Y)
def make_dictionaries(file_path, src_dict_path=None, tgt_dict_path=None,
                      encoding="utf-8", min_freq=5, **kwargs):
    if not os.path.isdir(file_path):
        sents, chunks = _parse_data(open(file_path, 'r', encoding=encoding))
    else:
        sents, chunks = _parse_data_from_dir(file_path)

    src_tokenizer = Tokenizer(**kwargs)
    tgt_tokenizer = Tokenizer(**kwargs)
    src_tokenizer.fit_on_texts(sents)
    tgt_tokenizer.fit_on_texts(chunks)

    # shrink num_words so that tokens occurring fewer than min_freq times
    # fall outside the effective vocabulary
    src_sub = sum(map(lambda x: x[1] < min_freq, src_tokenizer.word_counts.items()))
    tgt_sub = sum(map(lambda x: x[1] < min_freq, tgt_tokenizer.word_counts.items()))
    src_tokenizer.num_words = len(src_tokenizer.word_index) - src_sub
    tgt_tokenizer.num_words = len(tgt_tokenizer.word_index) - tgt_sub

    if src_dict_path is not None:
        save_dictionary(src_tokenizer, src_dict_path, encoding=encoding)
    if tgt_dict_path is not None:
        save_dictionary(tgt_tokenizer, tgt_dict_path, encoding=encoding)

    return src_tokenizer, tgt_tokenizer
def data_preprocess(text_data, text_label):
    text_sentence = []
    temp = []
    for i in text_data:
        k = jieba.lcut(i)
        text_sentence.append(k)
        # collect the unique tokens seen so far
        temp += [j for j in k if j not in temp]

    # build the vocabulary
    tokenizer = Tokenizer(num_words=len(temp))
    tokenizer.fit_on_texts(text_sentence)

    # convert the texts to integer sequences
    text_sentence = tokenizer.texts_to_sequences(text_sentence)
    text_sentence = pad_sequences(text_sentence, maxlen=64, padding='post')

    # labels
    text_label = to_categorical(text_label)

    # extract the pretrained word vectors
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 60), dtype=np.float32)
    file = open('wiki.zh.text.vector', encoding='utf-8')
    file = file.readlines()
    for text in file:
        text = text.split()
        if text[0] in temp:
            embedding_matrix[tokenizer.word_index[text[0]]] = np.asarray(
                text[1:], dtype=np.float32)

    return text_sentence, text_label, tokenizer, embedding_matrix
def define_sequences(raw_string, seq_length):
    '''
    Encodes an input string into a format the RNN can read
    (converts characters to their corresponding integers).

    Input:
        raw_string = data in the form of a string
        seq_length = integer, number of preceding characters the RNN
                     uses to make the next prediction
    Output:
        X = input data for the RNN, shape (N, seq_length, 1)
        y = labels for the RNN (as an overlay mask), shape (N, #unique symbols)
        c_indices = dictionary of available symbols in the input text
    '''
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(raw_string)
    c_indices = tokenizer.word_index

    X = []
    y = []
    for i in range(len(raw_string) - seq_length):
        inputseq = raw_string[i:i + seq_length]
        outputseq = raw_string[i + seq_length]
        X.append([c_indices[char] for char in inputseq])
        y.append(c_indices[outputseq])

    return X, y, c_indices
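# Hedged usage sketch (not part of the original code): one plausible way to turn the
# lists returned by define_sequences into the (N, seq_length, 1) inputs and one-hot
# labels described in its docstring. Assumes numpy and keras.utils.to_categorical;
# the toy corpus is made up and kept lowercase so every character is indexed.
import numpy as np
from keras.utils import to_categorical

toy_text = "hello world, hello rnn"
X, y, c_indices = define_sequences(toy_text, seq_length=5)
X = np.reshape(X, (len(X), 5, 1)) / float(len(c_indices))   # scale integer codes to [0, 1]
y = to_categorical(y, num_classes=len(c_indices) + 1)       # char indices start at 1, hence +1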
def prueba_2():
    cantidad_twits = 10

    # define class tweets
    test = load_test()
    twits = preprocesing(test[:cantidad_twits, 0])
    print(f"\ntwiters:\n{twits}")

    # define class labels
    labels = test[:cantidad_twits, 1].astype('float32')
    print(f"\nlabels:\n{labels}")

    # prepare tokenizer
    t = Tokenizer()
    t.fit_on_texts(twits)
    vocab_size = len(t.word_index) + 1

    # integer encode the documents
    encoded_twits = t.texts_to_sequences(twits)
    print(f"\nencoded_twits:\n{encoded_twits}")

    # pad documents to the length of the longest tweet
    max_length = max(len(twit) for twit in encoded_twits)
    padded_twits = pad_sequences(encoded_twits, maxlen=max_length, padding='post')
    print(f"\npadded_twits:\n{padded_twits}")

    # load the whole embedding into memory
    embeddings_index = dict()
    f = open('fasttext.es.300.txt')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))

    # create a weight matrix for words in training docs
    embedding_matrix = np.zeros((vocab_size, 300))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # define model
    model = Sequential()
    e = Embedding(vocab_size, 300, weights=[embedding_matrix],
                  input_length=max_length, trainable=False)
    model.add(e)
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))

    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # summarize the model
    print(model.summary())

    # fit the model
    model.fit(padded_twits, labels, epochs=50, verbose=0)

    # evaluate the model
    loss, accuracy = model.evaluate(padded_twits, labels, verbose=0)
    print('Accuracy: %f' % (accuracy * 100))
def create_keras_tokenizer(captions):
    """Create a Tokenizer and fit it on the set of captions."""
    list_of_captions = captions_to_list(captions)
    keras_tokenizer = Tokenizer()
    keras_tokenizer.fit_on_texts(list_of_captions)
    return keras_tokenizer
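# Hedged usage sketch (illustrative, not from the original source): encoding captions
# with the tokenizer returned above; a 'captions' collection and captions_to_list are
# assumed to exist exactly as the function above expects.
keras_tokenizer = create_keras_tokenizer(captions)
vocab_size = len(keras_tokenizer.word_index) + 1   # +1 reserves index 0 for padding
encoded_captions = keras_tokenizer.texts_to_sequences(captions_to_list(captions))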
def tokenizer(text):
    max_num_words = 8000
    tokenize = Tokenizer(num_words=max_num_words)
    tokenize.fit_on_texts(text)
    # print(tokenize.word_index)
    vocab_size = len(tokenize.word_index) + 1
    text2int = tokenize.texts_to_sequences(text)
    max_ln = np.max([len(cap) for cap in text2int])
    return [tokenize, text2int, vocab_size, max_ln]
def __init__(self, articles: Articles, max_article_length: int):
    self.tokenizer = Tokenizer()
    self.tokenizer.fit_on_texts(articles.title_and_summary())
    self.max_article_length: int = max_article_length
    self.sequences = self.transform_to_sequences(articles)
    self.voc_size = len(self.tokenizer.word_index) + 1  # +1 because we pad with 0.
    self.document_count = self.tokenizer.document_count
def get_tokenized_data(self, max_sentence_len):
    sents, is_intent = self.get_data()
    # token_list = data['sentence'].apply(get_tokens)
    token_list = [get_tokens(sent) for sent in sents]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(token_list)
    X, Y = self.get_netio(is_intent, token_list, max_sentence_len, tokenizer)
    return X, Y, tokenizer
def predict(input_sentence):
    # sentence = "A lot of good things are happening. We are respected again throughout the world, and that's a great thing"
    max_features = 2000
    tokenizer = Tokenizer(num_words=max_features, split=' ')
    tokenizer.fit_on_texts(input_sentence)
    X = tokenizer.texts_to_sequences(input_sentence)
    X = pad_sequences(X, maxlen=28)
    sentiment = model.predict(X, batch_size=1, verbose=2)[0]
    print(sentiment)
def preprocess_text(text, padding='post'):
    # tokenize
    tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    # padding
    sequences = pad_sequences(sequences, padding=padding)
    return sequences, len(tokenizer.word_index)
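# Hedged usage sketch (illustrative only): preprocess_text relies on a module-level
# MAX_VOCAB_SIZE constant; the value and sample texts below are assumptions.
MAX_VOCAB_SIZE = 20000
sample_texts = ["the service was great", "the food was not good at all"]
padded, n_words = preprocess_text(sample_texts, padding='post')
print(padded.shape, n_words)  # (2, longest sample length), size of the fitted vocabulary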
def get_data_as_one_hot(num_words, data_location='data/data', labels_location='data/labels'):
    data, labels = read_data_and_labels(data_location, labels_location)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(data)
    one_hot = tokenizer.texts_to_matrix(data, mode='binary')
    encoded_labels = np.asarray(labels).astype('float32')
    print('Returning encoded text, labels and tokenizer')
    return one_hot, encoded_labels, tokenizer
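# Hedged usage sketch (illustrative, not from the original source): loading the binary
# bag-of-words matrix with the default data/labels locations.
one_hot, encoded_labels, tokenizer = get_data_as_one_hot(num_words=10000)
print(one_hot.shape)  # (n_samples, 10000): one column per kept word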
def initialize(self, query_manager, emb_path):
    self.tokenizer = Tokenizer()
    self.word_embedding = WordEmbedding(embfile=emb_path)
    self.query_list = query_manager.query_list
    query_list = [query.query for query in self.query_list]
    self.tokenizer.fit_on_texts(query_list)
    self.tokenizer.fit_on_texts(query_manager.corpus)
    self.word_embedding.create_embedding_matrix(self.tokenizer)
def predict_intent(self, text):
    prepared_text = DataHandler.get_preprocessed_message(text)
    tokenizer = Tokenizer(num_words=vocabulary_size)
    tokenizer.fit_on_texts(prepared_text)
    X_temp = tokenizer.texts_to_sequences(prepared_text)
    X = pad_sequences(X_temp, padding='post', maxlen=max_input_length)
    print(X)
    result = self.intent_classifier.predict_classes(X)
    print("Res = " + str(result))
    predicted = most_common(result).item()
    print(predicted)
    return predicted
def get_data_as_padded_sequences(num_words, max_length, data_location='data/data', labels_location='data/labels'):
    data, labels = read_data_and_labels(data_location, labels_location)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(data)
    sequences = tokenizer.texts_to_sequences(data)
    sequences = pad_sequences(sequences, maxlen=max_length)
    encoded_labels = np.asarray(labels).astype('float32')
    return sequences, encoded_labels, tokenizer
def run_tokenizer(train, test):
    logger.info('Fitting tokenizer')
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(
        list(train['comment_text']) + list(test['comment_text']))
    # X_train = tokenizer.texts_to_sequences(list(train['comment_text']))
    # X_test = tokenizer.texts_to_sequences(list(test['comment_text']))
    # X_train = pad_sequences(X_train, maxlen=MAX_LEN)
    # X_test = pad_sequences(X_test, maxlen=MAX_LEN)
    # word_index = tokenizer.word_index
    return tokenizer  # X_train, X_test, word_index
def __init__(self, wine_dataset: WineDataSet, max_len, topn_varieties: int = 7, balance_class=False):
    filter_list = wine_dataset.varieties_count()['variety'][:topn_varieties].tolist()
    filtered_df = wine_dataset.data[wine_dataset.data['variety'].isin(filter_list)]

    if balance_class:
        aux_df = deepcopy(filtered_df)
        d = aux_df.groupby('variety')
        d = d.apply(lambda x: x.sample(d.size().min()).reset_index(drop=True))
        d = d.reset_index(drop=True)
        filtered_df = d
        del aux_df, d

    wine_embeddings_filter = filtered_df.index.values
    self._varieties_list = from_array(filter_list)
    self._wine_embeddings_filter = from_array(wine_embeddings_filter)
    self._variety2index = {
        variety: index for index, variety in enumerate(filter_list)
    }
    self._index2variety = {
        index: variety for index, variety in enumerate(filter_list)
    }

    # self._X = wine_embeddings[wine_embeddings_filter].compute()
    self._X = deepcopy(filtered_df['description_cleaned'].tolist())
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(self._X)
    self._X_tokenizer = tokenizer
    self._X = tokenizer.texts_to_sequences(self._X)
    self._X = pad_sequences(self._X, maxlen=84, padding="pre", truncating="post")

    self._index2word = self._X_tokenizer.index_word
    self._word2index = self._X_tokenizer.word_index
    self._index2word.update({0: 'pad'})
    self._word2index.update({'pad': 0})

    self._Y = deepcopy(filtered_df['variety'])
    self._Y.replace(self._variety2index, inplace=True)
    self._Y = np.array(self._Y.tolist())
def closure(mu):
    (x_train, y_train), (_, _) = imdb.load_data()
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_sequences(x_train)
    x_train = tokenizer.sequences_to_matrix(x_train, mode="tfidf")
    # Note: svd_solver='full' is needed on the GPU server
    x_train = PCA(n_components=100, svd_solver='full').fit_transform(x_train)
    ds = {"data": x_train, "target": y_train}
    # apply noise and return
    res = preprocess_and_noise(dataset=ds, mu=mu)
    return res
def createModel(self, text):
    self.embeddings_index = {}
    f = open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        self.embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(self.embeddings_index))

    tokenizer = Tokenizer(num_words=self.MAX_NB_WORDS, lower=False)
    tokenizer.fit_on_texts(text)
    self.word_index = tokenizer.word_index

    self.embedding_matrix = np.zeros(
        (len(self.word_index) + 1, self.EMBEDDING_DIM))
    print(self.embedding_matrix.shape)
    for word, i in self.word_index.items():
        embedding_vector = self.embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            self.embedding_matrix[i] = embedding_vector

    self.embedding_layer = Embedding(len(self.word_index) + 1,
                                     self.EMBEDDING_DIM,
                                     weights=[self.embedding_matrix],
                                     input_length=70,
                                     trainable=False)

    self.model = Sequential()
    self.model.add(self.embedding_layer)
    self.model.add(
        Bidirectional(
            LSTM(200, dropout=0.3, recurrent_dropout=0.7, return_sequences=True)))
    # merge modes: {'sum', 'mul', 'concat', 'ave', None}
    # self.model.add(TimeDistributed(Bidirectional(LSTM(60, dropout=0.2, recurrent_dropout=0.5, return_sequences=True))))
    # self.model.add(TimeDistributed(Dense(50, activation='relu')))
    self.model.add(TimeDistributed(Dense(
        9, activation='softmax')))  # a dense layer as suggested by neuralNer
    # crf = CRF(17, sparse_target=True)
    # self.model.add(crf)
    # self.model.compile(loss=crf_loss, optimizer='adam', metrics=[crf_viterbi_accuracy])
    self.model.compile(loss="categorical_crossentropy",
                       optimizer='rmsprop',
                       metrics=['accuracy'])
    self.model.summary()
def topXInSet(outFile, x):
    with open('reducedCombined(no gov).txt', 'r', encoding='utf-8') as file, \
            open(outFile, 'w', encoding='utf-8') as target:
        f = file.readlines()
        random.shuffle(f)

        tk = Tokenizer()
        tk.fit_on_texts(f)

        tfList = []
        start = perf_counter()

        # x = 6  -> 9000 words
        # x = 8  -> 7000 words
        # x = 10 -> 5500 words
        # x = 15 -> 3500 words
        # x = 20 -> 2750 words
        stopNum = round(1 / x * (len(tk.word_index)))

        for n in range(len(f)):
            keep = True
            for word in f[n].split():
                if keep is True:
                    for num, entry in enumerate(list(tk.word_index.keys())):
                        word = ''.join(
                            c for c in word
                            if c not in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n').lower()
                        if entry == word:
                            break
                        if num == stopNum:
                            keep = False
                            break
            tfList.append(keep)

        stop = perf_counter()

        i = 0
        for tf in tfList:
            if tf:
                i = i + 1
        print("Trues:", i)
        print("Time to finish:", stop - start)

        for n in range(len(f)):
            if tfList[n] is True:
                target.write(f[n])

        print("\n" + str(stopNum))
        print(len(f))
        print(len(tfList))
def tokenize_http_status(data):
    if config.SAVE:
        tokenizer = Tokenizer(num_words=20, filters='', oov_token=0)
        tokenizer.fit_on_texts(data.astype(str))
        save_tokenizer(tokenizer, "status")
    else:
        tokenizer = load_tokenizer("status")
    data = tokenizer.texts_to_sequences(data.astype(str))
    data = numpy.array(data)
    return data
def run(self):
    self.dataset.load()
    X = self.dataset.X_train_labeled['moment'].values
    X = np.append(X, self.dataset.X_train_unlabeled['moment'].values, axis=0)
    X = np.append(X, self.dataset.X_test['moment'].values, axis=0)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X)
    self.build_embedding(tokenizer.word_index)
def tokenlize_text(max_num_words, max_seq_length, x_train):
    """Tokenize text.

    Vectorize a text corpus by transforming each text into a sequence of integers.

    Args:
        max_num_words: Int, max number of words in the dictionary.
        max_seq_length: Int, the length of each text sequence; pad if shorter, trim if longer.
        x_train: List containing text data.

    Returns:
        x_train: Tokenized input data.
        word_index: Dictionary mapping each word to its tokenized index.
    """
    from keras_preprocessing.sequence import pad_sequences
    from keras_preprocessing.text import Tokenizer

    print("tokenizing texts...")
    tokenizer = Tokenizer(num_words=max_num_words)
    tokenizer.fit_on_texts(x_train)
    sequences = tokenizer.texts_to_sequences(x_train)
    word_index = tokenizer.word_index
    x_train = pad_sequences(sequences, maxlen=max_seq_length)
    print("data read and converted to %d-length sequences" % max_seq_length)
    return x_train, word_index
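# Hedged usage sketch (illustrative only; the sample texts below are made up):
sample = ["keras tokenizers map words to integers", "padded sequences share one length"]
x, word_index = tokenlize_text(max_num_words=1000, max_seq_length=10, x_train=sample)
print(x.shape)          # (2, 10)
print(len(word_index))  # number of distinct words seen in the corpus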