import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
tqdm.pandas()  # registers .progress_apply on pandas objects


def load_and_prec():
    train_df = pd.read_csv("../input/train.csv")
    test_df = pd.read_csv("../input/test.csv")
    print("Train shape : ", train_df.shape)
    print("Test shape : ", test_df.shape)

    # lowercase
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: x.lower())
    test_df["question_text"] = test_df["question_text"].progress_apply(lambda x: x.lower())

    # clean the text
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: clean_text(x))
    test_df["question_text"] = test_df["question_text"].progress_apply(lambda x: clean_text(x))

    # clean numbers
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: clean_numbers(x))
    test_df["question_text"] = test_df["question_text"].progress_apply(lambda x: clean_numbers(x))

    # clean misspellings
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: replace_typical_misspell(x))
    test_df["question_text"] = test_df["question_text"].progress_apply(lambda x: replace_typical_misspell(x))

    # fill up the missing values
    train_X = train_df["question_text"].fillna("_##_").values
    test_X = test_df["question_text"].fillna("_##_").values

    # tokenize the sentences (fit the vocabulary on the training text only)
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    # pad the sentences
    train_X = pad_sequences(train_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    # get the target values
    train_y = train_df["target"].values

    # shuffle the data
    np.random.seed(SEED)
    trn_idx = np.random.permutation(len(train_X))
    train_X = train_X[trn_idx]
    train_y = train_y[trn_idx]

    return train_X, test_X, train_y, tokenizer.word_index
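A minimal usage sketch, assuming the cleaning helpers (clean_text, clean_numbers, replace_typical_misspell) are defined elsewhere in the kernel; the configuration values below are illustrative stand-ins for the module-level globals the function reads, not values from the original.

# Hypothetical configuration; the original defines these globals elsewhere.
max_features = 50000  # vocabulary-size cap passed to the Tokenizer
maxlen = 100          # every question is padded/truncated to this length
SEED = 42             # illustrative seed for the shuffle

train_X, test_X, train_y, word_index = load_and_prec()
print(train_X.shape, test_X.shape, train_y.shape, len(word_index))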
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

tokenizer = Tokenizer()

data = open('/tmp/irish-lyrics-eof.txt').read()
corpus = data.lower().split('\n')

tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

# build n-gram token sequences: every prefix (length >= 2) of every line
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

# pre-pad each input sequence up to the max sequence length
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
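The snippet imports Sequential, Embedding, Bidirectional, LSTM, Dense, and Adam but never uses them; a sketch of the step those imports point to follows. Splitting each padded sequence into predictors (all tokens but the last) and a label (the last token) follows directly from the pre-padding above; the layer sizes, learning rate, and epoch count are illustrative assumptions, not values from the original.

# Predictors are every token but the last; the label is the final token.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# Illustrative model: one token of context lost to the label, hence -1.
model = Sequential([
    Embedding(total_words, 64, input_length=max_sequence_len - 1),
    Bidirectional(LSTM(20)),
    Dense(total_words, activation='softmax'),
])
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.01),
              metrics=['accuracy'])
# model.fit(xs, ys, epochs=100)  # hyperparameters here are assumptions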
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

sentences = [
    'i love my dog',
    'I love my cat',
]

tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# the Tokenizer lowercases by default, so 'i' and 'I' share one index
word_index = tokenizer.word_index
print(word_index)  # {'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}
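An illustrative extension, not in the original: the fitted tokenizer can turn the sentences into integer sequences with texts_to_sequences, and passing oov_token to the constructor reserves index 1 for out-of-vocabulary words so unseen words are kept rather than silently dropped.

# Refit with an OOV token; it claims index 1, shifting the other words.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)  # [[2, 3, 4, 5], [2, 3, 4, 6]]

# 'manatee' was never seen during fitting, so it maps to the OOV index 1
test_seq = tokenizer.texts_to_sequences(['i love my manatee'])
print(test_seq)   # [[2, 3, 4, 1]]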