def input_ready(self, df, tokenize=False, **kwargs):
    if tokenize:
        _df = self.tokenize(df)
    else:
        _df = df.copy()
    # Map tokens to word indices (1 = unknown) and pad to a fixed length
    X = _df['TOKENS'].apply(
        lambda x: [self.w_idx.get(w, 1) for w in x]).values
    X = pad_sequences(X, maxlen=self.max_seq_len,
                      padding='post', truncating='post')
    X = [X[:, i].astype('int32') for i in range(X.shape[1])]
    # Aspect indices
    asp = _df[self.aspcol].apply(
        lambda w: int32(self.asp_idx[w])).values.tolist()
    # Lexicon features, optionally taken from a second (test) lexicon
    use_2nd_lex = kwargs.get('use_second_lexicon', False)
    if use_2nd_lex:
        lx = _df['TOKENS'].apply(
            lambda x: [self.test_lx_idx.get(w, 0) for w in x]).values
        print('Using second lexicon.')
    else:
        lx = _df['TOKENS'].apply(
            lambda x: [self.lx_idx.get(w, 0) for w in x]).values
    lx = pad_sequences(lx, maxlen=self.max_seq_len,
                       padding='post', truncating='post')
    lx = [lx[:, i].astype('int32') for i in range(lx.shape[1])]
    # One-hot encoded class labels
    y = get_dummies(_df[self.clscol].astype(str)).values.astype('float32')
    return X, asp, lx, y
def batchfy_fn(data):
    x1 = [d[0] for d in data]
    x2 = [d[1] for d in data]
    y = [d[2] for d in data]
    max_len1 = max([len(x) for x in x1])
    max_len2 = max([len(x) for x in x2])
    return (sequence.pad_sequences(x1, maxlen=max_len1, padding='post'),
            sequence.pad_sequences(x2, maxlen=max_len2, padding='post'),
            y)
def batchfy_fn(data):
    x = [d[0] for d in data]
    y = [d[1] for d in data]
    max_len = max(map(len, x))
    # if max_len % 5 != 0:
    #     max_len += 5 - (max_len % 5)
    return sequence.pad_sequences(x, maxlen=max_len, padding='post'), y
def pad_by_buckets(self, init_df):
    buckets = sorted(self.intervals.keys())
    bucket = {}
    for b in buckets:
        _df = init_df.loc[init_df.BUCKET == b][[
            self.enc_col + '_TOK', self.dec_col + '_TOK', self.dec_col + '_TL'
        ]]
        if not _df.empty:
            _enc_symbs = self.tok2sym(_df[self.enc_col + '_TOK'])
            _dec_symbs = self.tok2sym(_df[self.dec_col + '_TOK'])
            enc_mlen = max(self.intervals[b])
            dec_mlen = _df[self.dec_col + '_TL'].max()
            enc_symbs = pad_sequences(_enc_symbs, maxlen=enc_mlen)
            # pad decoder sequences to the decoder max length for this bucket
            dec_symbs = pad_sequences(_dec_symbs, maxlen=dec_mlen)
            bucket[b] = (enc_symbs, dec_symbs)
    return bucket
def preprocess_input_sequences(self, data):
    if not self.args.use_char_embedding:
        documents, questions, answer_spans = data
    else:
        documents, questions, documents_char, questions_char, answer_spans = data
        documents_char_ok = pad_sequences(documents_char, maxlen=self.d_len,
                                          dtype="int32", padding="post",
                                          truncating="post")
        questions_char_ok = pad_sequences(questions_char, maxlen=self.q_len,
                                          dtype="int32", padding="post",
                                          truncating="post")
    documents_ok = pad_sequences(documents, maxlen=self.d_len, dtype="int32",
                                 padding="post", truncating="post")
    questions_ok = pad_sequences(questions, maxlen=self.q_len, dtype="int32",
                                 padding="post", truncating="post")
    # FIXME: the span positions cannot be used directly as array indices here,
    # because they are counted in characters, not words.
    answer_start = [
        np.array([int(i == answer_span[0]) for i in range(self.d_len)])
        for answer_span in answer_spans
    ]
    answer_end = [
        np.array([int(i == answer_span[1]) for i in range(self.d_len)])
        for answer_span in answer_spans
    ]
    if self.args.use_char_embedding:
        return (documents_ok, questions_ok, documents_char_ok, questions_char_ok,
                np.asarray(answer_start), np.asarray(answer_end))
    else:
        return (documents_ok, questions_ok,
                np.asarray(answer_start), np.asarray(answer_end))
def tokenize(self, comments):
    print('Comments shape is {}'.format(comments.shape))
    token = Tokenizer(num_words=self.vocab_size)
    token.fit_on_texts(comments)
    tokenized_comments = token.texts_to_sequences(comments)
    tokenized_comments = sequence.pad_sequences(
        sequences=tokenized_comments,
        maxlen=self.max_sentence_len,
        padding='post',
        value=0)
    return tokenized_comments
def preprocess_input_sequences(self, data): """ preprocess,pad to fixed length. """ documents, questions, answer, candidates = data questions_ok = pad_sequences(questions, maxlen=self.q_len, dtype="int32", padding="post", truncating="post") documents_ok = pad_sequences(documents, maxlen=self.d_len, dtype="int32", padding="post", truncating="post") candidates_ok = pad_sequences(candidates, maxlen=self.A_len, dtype="int32", padding="post", truncating="post") y_true = np.zeros_like(candidates_ok) y_true[:, 0] = 1 return questions_ok, documents_ok, candidates_ok, y_true
def __generate_seq(self, seed_int_encode, n_chars=250, diversity=0.2):
    '''
    Generate text using the current LSTM model.

    Input:
        @seed_int_encode: Seed sequence encoded as integers
        @n_chars: Number of characters to generate
        @diversity: How randomized the character selection should be
    Output:
        @return: 'n_chars' characters of generated text
    '''
    # Begin with the seed sequence
    int_encode = seed_int_encode
    # Translate the seed text
    start_chars = [self.indices_char[x] for x in int_encode]
    message = ''.join(start_chars)
    # Generate a fixed number of characters
    for _ in range(n_chars):
        # Truncate sequences to a fixed length
        int_encode = pad_sequences([int_encode], maxlen=self.seq_len,
                                   truncating='pre')[0]
        # One-hot encode
        hot_encode = to_categorical(int_encode, num_classes=self.vocab_size)
        # Change shape from (seq_len, vocab) to (1, seq_len, vocab),
        # since the LSTM expects a batched tensor input
        hot_encode = np.expand_dims(hot_encode, 0)
        # Predict the next character
        preds = self.model.predict(hot_encode, verbose=0)[0]
        yhat = self.__sample(preds, diversity)
        # Append the int encoding to continue the recurrent predictions
        int_encode = np.append(int_encode, yhat)
        # Keep track of the full message generated
        message += self.indices_char[yhat]
    # Return the generated message
    return message
def prepare(self, X, Y, emb_model, seq_length=200, stratify='n',
            test_split=0.2, emb_dim=100):
    # Prepare data for use in the NN.
    # Convert text to sequences and create a word index for the embedding matrix
    from tensorflow.contrib.keras.api.keras.preprocessing.text import Tokenizer
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X)
    X_seq = tokenizer.texts_to_sequences(X)
    word_idx = tokenizer.word_index

    from tensorflow.contrib.keras.api.keras.preprocessing import sequence
    X_seq = sequence.pad_sequences(X_seq, maxlen=seq_length)

    # Encode labels as one-hot vectors
    from sklearn.preprocessing import LabelBinarizer
    label_encoder = LabelBinarizer()
    Y_coded = label_encoder.fit_transform(Y)

    # Create the train/test split
    from sklearn.model_selection import train_test_split
    if stratify == 'y':
        x_train, x_test, y_train, y_test = train_test_split(
            X_seq, Y_coded, test_size=test_split, random_state=141289,
            stratify=Y_coded)
    else:
        x_train, x_test, y_train, y_test = train_test_split(
            X_seq, Y_coded, test_size=test_split, random_state=141289)

    # Build the embedding matrix from the passed embedding model
    import numpy as np
    embedding_mat = np.zeros((len(word_idx) + 1, emb_dim))
    for w, i in word_idx.items():
        try:
            embedding_vector = emb_model[w]
            embedding_mat[i] = embedding_vector
        except KeyError:
            # print("no " + w + " pos " + str(i))
            pass
    return x_train, x_test, y_train, y_test, embedding_mat, tokenizer, label_encoder
def input_ready(self, df, tokenize=False, **kwargs):
    if tokenize:
        _df = self.tokenize(df)
    else:
        _df = df.copy()
    # Use w_idx to get each word's index (1 = unknown)
    X = _df['TOKENS'].apply(
        lambda x: [self.w_idx[w] if w in self.w_idx else 1 for w in x]).values
    # Pad sequences to a fixed length
    X = pad_sequences(X, maxlen=self.max_seq_len,
                      padding='post', truncating='post')
    # Reshape data into [[batch, feats_t1], [batch, feats_t2], ...]
    X = [X[:, i].astype('int32') for i in range(X.shape[1])]
    # Create one-hot encoded labels
    y = get_dummies(_df[self.clscol].astype(str)).values.astype('float32')
    return X, y
def predict(out_path, txt, top=1):
    import pickle
    import os
    if os.path.isfile(os.path.join(out_path, 'token_enc.pkl')):
        with open(os.path.join(out_path, 'token_enc.pkl'), 'rb') as f:
            tokenizer, seq_len, language = pickle.load(f)

        # Do the preprocessing bit
        from nltk.corpus import stopwords
        from nltk.stem.snowball import SnowballStemmer
        stopwords = stopwords.words(language)
        stemmer = SnowballStemmer(language)
        import re
        r = re.compile(r'[\W]', re.U)
        txt = r.sub(' ', txt)
        txt = re.sub('[\\s]+', ' ', txt)
        txt = [
            ' '.join(
                stemmer.stem(w.lower())
                for w in txt.split() if w not in stopwords)
        ]

        # Convert text to a padded sequence
        txt_seq = tokenizer.texts_to_sequences(txt)
        from tensorflow.contrib.keras.api.keras.preprocessing import sequence
        txt_seq = sequence.pad_sequences(txt_seq, maxlen=seq_len)

        # Load the NN model and predict
        from tensorflow.contrib.keras.api.keras.models import load_model
        model = load_model(os.path.join(out_path, 'CNN1d.h5'))
        output = model.predict(txt_seq)

        # Create binary vectors for the top x predictions
        sorted_idx = (-output).argsort()
        import numpy as np
        label = np.zeros((top, len(output[0])))
        for i in range(0, top):
            label[i][sorted_idx[0][i]] = 1

        # Convert to text labels
        with open(os.path.join(out_path, 'label_enc.pkl'), 'rb') as f:
            label_decoder = pickle.load(f)
        return label_decoder.inverse_transform(label)
    else:
        return "Invalid output path!"
def forward(self, x, y):
    max_len = x.shape[1]
    if max_len % self.stride:
        max_len += self.stride - (max_len % self.stride)
    x = sequence.pad_sequences(x, maxlen=max_len, padding='post')
    x = LongTensor(x)
    mask = torch.where(x > 0,
                       torch.ones_like(x, dtype=torch.float32),
                       torch.zeros_like(x, dtype=torch.float32))
    x_embed = self.embedding(x)
    x_embed = self.dropout(x_embed)
    # reduce n-grams over the sequence
    outputs, h, reduced_mask = self.reduce_ngram(
        x_embed, mask)  # (seq_len, batch, hidden_size * num_directions)
    # output_maxpooled = self.gather_rnnstate(outputs, reduced_mask)
    output_maxpooled, _ = torch.max(outputs, 1)
    # output_maxpooled = h.view(h.shape[1], -1)
    class_prob = self.linear(output_maxpooled)
    return class_prob, F.dropout(output_maxpooled)
def forward(self, x, y):
    max_len = x.shape[1]
    if max_len % self.stride:
        max_len += self.stride - (max_len % self.stride)
    x = sequence.pad_sequences(x, maxlen=max_len, padding='post')
    x = LongTensor(x)
    mask = torch.where(x > 0,
                       torch.ones_like(x, dtype=torch.float32),
                       torch.zeros_like(x, dtype=torch.float32))
    x_embed = self.embedding(x)
    x_embed = self.dropout(x_embed)
    x_embed2 = self.embedding(torch.zeros_like(x).cuda() + torch.cat(
        [torch.zeros(size=[x.shape[0], 1], dtype=torch.long).cuda(), x[:, 1:]], -1))
    x_embed3 = self.embedding(torch.zeros_like(x).cuda() + torch.cat(
        [x[:, :-1], torch.zeros(size=[x.shape[0], 1], dtype=torch.long).cuda()], -1))
    x_embed = torch.cat([x_embed2, x_embed, x_embed3], -1)
    x_embed = x_embed @ self.params
    outputs, (h, c) = self.rnn1(x_embed)
    output_maxpooled, _ = torch.max(outputs, 1)
    # output_maxpooled = h.view(h.shape[1], -1)
    class_prob = self.linear(output_maxpooled)
    return class_prob, F.dropout(output_maxpooled)
def input_ready(self, df, tokenize=False, **kwargs):
    if tokenize:
        _df = self.tokenize(df)
    else:
        _df = df.copy()
    X = _df['TOKENS'].apply(
        lambda x: [self.w_idx[w] if w in self.w_idx else 1 for w in x]).values
    X = pad_sequences(X, maxlen=self.max_seq_len,
                      padding='post', truncating='post')
    X = [X[:, i].astype('int32') for i in range(X.shape[1])]
    asp = _df[self.aspcol].apply(
        lambda w: int32(self.asp_idx[w])).values.tolist()
    y = get_dummies(_df[self.clscol].astype(str)).values.astype('float32')
    return X, asp, y
def get_next_batch(self, mode, idx):
    """Return the next batch of data samples."""
    batch_size = self.args.batch_size
    if mode == "train":
        dataset_x = self.train_x
        dataset_y = self.train_y
        sample_num = self.train_nums
    elif mode == "valid":
        dataset_x = self.valid_x
        dataset_y = self.valid_y
        sample_num = self.valid_nums
    else:
        dataset_x = self.test_x
        dataset_y = self.test_y
        sample_num = self.test_nums
    if mode == "train":
        start = self.train_idx[idx] * batch_size
        stop = (self.train_idx[idx] + 1) * batch_size
    else:
        start = idx * batch_size
        stop = ((idx + 1) * batch_size
                if start < sample_num and (idx + 1) * batch_size < sample_num
                else len(dataset_x))
    document = [self.getitem(dataset_x, i) for i in range(start, stop)]
    data = {
        "document:0": sequence.pad_sequences(document, maxlen=self.max_len,
                                             padding="post"),
        "y_true:0": dataset_y[start:stop]
    }
    samples = stop - start
    if (len(document) != len(dataset_y[start:stop])
            or len(dataset_y[start:stop]) != samples):
        print(len(document), len(dataset_y[start:stop]), samples)
    return data, samples
def batcher(params, batch):
    # Previous averaged-word-vector implementation:
    # batch = [sent if sent != [] else ['.'] for sent in batch]
    # embeddings = []
    #
    # for sent in batch:
    #     sentvec = []
    #     for word in sent:
    #         if word in params.word_vec:
    #             sentvec.append(params.word_vec[word])
    #     if not sentvec:
    #         vec = np.zeros(params.wvec_dim)
    #         sentvec.append(vec)
    #     sentvec = np.mean(sentvec, 0)
    #     embeddings.append(sentvec)
    #
    # embeddings = np.vstack(embeddings)
    # return embeddings
    batch_idx = word2id_bathed(batch, params.word2id)
    max_len1 = max([len(x) for x in batch_idx])
    batch_idx = sequence.pad_sequences(batch_idx, maxlen=max_len1, padding='post')
    embedding = params.model(batch_idx)
    return embedding
targets = data_set[target_col].values
targets = to_categorical(targets, 11)

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(features)
features_seq = tokenizer.texts_to_sequences(features)
word_index = tokenizer.word_index

X_train, X_test, y_train, y_test = train_test_split(features_seq, targets,
                                                    random_state=55,
                                                    test_size=0.20)
X_train = sequence.pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=maxlen, padding='post')

print('X - Train', np.shape(X_train))
print('X - Test ', np.shape(X_test))
print('Y - Train', np.shape(y_train))
print('Y - Test ', np.shape(y_test))

train_input_fn = tf.estimator.inputs.numpy_input_fn(x={'x': X_train},
                                                    y=y_train,
                                                    batch_size=32,
                                                    num_epochs=None,
                                                    shuffle=True)
test_input_fn = tf.estimator.inputs.numpy_input_fn(x={'x': X_test},
                                                   y=y_test,
                                                   num_epochs=1,
def load_test_data():
    list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
    X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
    return X_test
def load_train_data(list_sentences):
    list_tokenized = tokenizer.texts_to_sequences(list_sentences)
    X_train = sequence.pad_sequences(list_tokenized, maxlen=maxlen)
    return X_train
    t = t[i:]
texts.append(t)
f.close()
labels.append(label_id)

print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
train_y = train_df['target'].values
train_y = train_y.reshape(len(train_y), 1)

# create a mapping from the words to the embedding vectors
embeddings_index = dict(
    get_coefs(*o.split(" "))
    for o in open(FLAGS.glove_path, encoding='utf-8'))
vocab_size = len(embeddings_index.keys())
print('vocab size :', vocab_size)

tokenizer = Tokenizer(num_words=vocab_size, filters='', lower=False)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
# val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)
train_X = pad_sequences(train_X, maxlen=FLAGS.max_sentence_len)
# val_X = pad_sequences(val_X, maxlen=FLAGS.max_sentence_len)
test_X = pad_sequences(test_X, maxlen=FLAGS.max_sentence_len)

all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]
del all_embs

word_index = tokenizer.word_index
# only want at most vocab_size words in our vocabulary
nb_words = min(vocab_size, len(word_index)) + 1
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
# insert embeddings that exist into our matrix
for word, i in word_index.items():
vocab_size = 95000  # words in vocabulary
maxlen = 100  # max words to use per question

# fill up the missing values
train_X = train["question_text"].fillna("_##_").values
val_X = validation["question_text"].fillna("_##_").values
test_X = test["question_text"].fillna("_##_").values

# Use Keras to tokenize and pad sequences
tokenizer = Tokenizer(num_words=vocab_size, filters='', lower=False)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

# Get the response
train_y = train['target'].values
val_y = validation['target'].values

all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# only want at most vocab_size words in our vocabulary
nb_words = min(vocab_size, len(word_index))