def preprocess_input_sequences(self, data):
    """
    Pad every input field to its fixed length and build the target matrix.

    ``data`` is unpacked as ``(documents, questions, answer, candidates)``.
    Questions, documents and candidates are post-padded / post-truncated as
    int32 to ``self.q_len``, ``self.d_len`` and ``self.A_len`` respectively.
    The ``answer`` field is unpacked but not used by this method.

    Returns ``(questions, documents, candidates, y_true)``; ``y_true`` has
    the shape and dtype of the padded candidates, with column 0 set to 1 and
    every other entry 0.
    """
    documents, questions, answer, candidates = data

    # Shared padding policy: fixed width, zeros appended / tail dropped.
    pad_opts = dict(dtype="int32", padding="post", truncating="post")

    padded_questions = pad_sequences(questions, maxlen=self.q_len, **pad_opts)
    padded_documents = pad_sequences(documents, maxlen=self.d_len, **pad_opts)
    padded_candidates = pad_sequences(candidates, maxlen=self.A_len, **pad_opts)

    # Target marks the first candidate slot of every row.
    # NOTE(review): presumably the correct candidate is always stored first
    # by the data pipeline — confirm against the caller.
    targets = np.zeros_like(padded_candidates)
    targets[:, 0] = 1

    return padded_questions, padded_documents, padded_candidates, targets
def preprocess_input_sequences(self, data):
    """
    Pad documents and questions to their fixed lengths.

    ``data`` is unpacked as ``(documents, questions, answer_spans)``.
    Documents are padded to ``self.d_len`` and questions to ``self.q_len``
    (int32, zeros appended, over-long rows cut at the end).

    Returns ``(documents, questions, answer_spans)`` with ``answer_spans``
    passed through untouched.
    """
    documents, questions, answer_spans = data

    def _to_fixed_width(rows, width):
        # Post-pad / post-truncate one field to ``width`` columns.
        return pad_sequences(
            rows,
            maxlen=width,
            dtype="int32",
            padding="post",
            truncating="post",
        )

    padded_documents = _to_fixed_width(documents, self.d_len)
    padded_questions = _to_fixed_width(questions, self.q_len)
    return padded_documents, padded_questions, answer_spans
def predict(_t_w, _id, _word_index, _model, _model_shape1):
    """
    Run the two-input model on a single tokenised text.

    Parameters:
        _t_w: iterable of word tokens for one text.
        _id: key used to look up this sample's second input in the
            module-level ``diction`` mapping.
        _word_index: object whose ``.item()`` returns a ``{word: index}``
            mapping (presumably a 0-d NumPy object array produced by
            ``np.save``/``np.load`` — confirm against the caller).
        _model: model exposing ``predict(inputs, batch_size=...)``.
        _model_shape1: length the word-index sequence is padded to.

    Returns:
        The first (and only) row of the model's prediction.
    """
    # Unwrap the vocabulary once instead of once per word.
    vocabulary = _word_index.item()

    # Unknown words map to index 0.
    sec = [vocabulary.get(w, 0) for w in _t_w]

    data = pad_sequences([sec], maxlen=_model_shape1)
    # Fix: index with the `_id` argument; the original used the builtin `id`
    # (or whatever global happened to shadow it) rather than the parameter.
    data_gen = pad_sequences([diction[_id]], maxlen=2)

    prediction = _model.predict([data, data_gen], batch_size=1)
    return prediction[0]
def load_data():
    """
    Load the IMDB dataset and bring every review to a common length.

    Reads the module-level ``max_features`` (vocabulary cap passed to
    ``imdb.load_data``) and ``maxlen`` (width passed to
    ``sequence.pad_sequences``), printing progress and the resulting shapes.

    Returns:
        ``[x_train, y_train, x_test, y_test]`` with both ``x`` arrays padded.
    """
    print('Loading data...')
    train_split, test_split = imdb.load_data(num_words=max_features)
    x_train, y_train = train_split
    x_test, y_test = test_split
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    return [x_train, y_train, x_test, y_test]
def input_fn():
    """
    Build one batch of ``(features, labels)`` tensors from in-memory data.

    Uses ``x_in``, ``y_in``, ``max_length``, ``epochs``, ``shuffle`` and
    ``batch_size`` from the enclosing scope.

    Returns:
        ``(dict_x, y)`` where ``dict_x`` holds the padded sequences under
        ``'x'`` and their clipped lengths under
        ``rnn_common.RNNKeys.SEQUENCE_LENGTH_KEY``.
    """
    # Effective length of each sequence: min(actual length, max_length).
    raw_lengths = np.array([len(seq) for seq in x_in])
    clipped_lengths = np.minimum(raw_lengths, max_length).astype('int32')

    # Pad at the end of each sequence so the real tokens stay at the front.
    padded = sequence.pad_sequences(x_in, maxlen=max_length, padding='post')

    # Dataset over the in-memory arrays, repeated `epochs` times.
    dataset = tf.contrib.data.Dataset.from_tensor_slices(
        (padded, clipped_lengths, y_in))
    dataset = dataset.repeat(epochs)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=10000)
    dataset = dataset.batch(batch_size)

    # Pull the next batch from a one-shot iterator.
    features, lengths, labels = dataset.make_one_shot_iterator().get_next()
    dict_x = {
        'x': features,
        rnn_common.RNNKeys.SEQUENCE_LENGTH_KEY: lengths,
    }
    return dict_x, labels
def to_sequence(texts, window=5, maxlen=None):
    """
    Tokenise ``texts`` and pad each sequence with ``window - 1`` zeros per side.

    Parameters:
        texts: collection of strings; it is iterated more than once, so it
            must not be a one-shot generator.
        window: context window size; ``window - 1`` padding positions are
            added after and before every sequence.
        maxlen: sentence length before window padding. When ``None`` it is
            the length of the longest tokenised text (0 for an empty corpus).

    Returns:
        ``(seqs, tokenizer, nb_words, padded_len)`` where ``seqs`` is the
        padded integer array, ``tokenizer`` the fitted ``Tokenizer``,
        ``nb_words`` the vocabulary size plus one, and ``padded_len`` equals
        ``maxlen + 2 * (window - 1)``.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    nb_words = len(tokenizer.word_index) + 1

    # Tokenise once; the original repeated this work when maxlen was None.
    seqs = tokenizer.texts_to_sequences(texts)

    if maxlen is None:
        # `or [0]` keeps max() from raising on an empty corpus.
        maxlen = max([len(seq) for seq in seqs] or [0])

    right_padded_len = maxlen + window - 1
    padded_len = right_padded_len + window - 1

    # NOTE(review): in the collapsed source it is unclear whether these log
    # lines sat inside the `if maxlen is None` branch; they are emitted
    # unconditionally here — confirm.
    logger.info('Maximum sentence length: {}'.format(maxlen))
    logger.info('Padded sentence length: {}'.format(padded_len))
    logger.info('Number of words: {}'.format(nb_words))

    # First extend on the right, then on the left, so both ends receive
    # `window - 1` padding positions.
    seqs = sequence.pad_sequences(seqs, padding='post', maxlen=right_padded_len)
    seqs = sequence.pad_sequences(seqs, padding='pre', maxlen=padded_len)

    return seqs, tokenizer, nb_words, padded_len
def preprocess_input_sequences(self, data):
    """
    Pad documents/questions and one-hot encode the answer span boundaries.

    ``data`` is unpacked as ``(documents, questions, answer_spans)``; each
    span is indexed at ``[0]`` (start) and ``[1]`` (end).

    Returns ``(documents, questions, answer_start, answer_end)``. The two
    answer arrays hold one row of length ``self.d_len`` per span, with a 1 at
    the span's start (resp. end) position and 0 elsewhere; a position outside
    ``range(self.d_len)`` yields an all-zero row.
    """
    documents, questions, answer_spans = data

    pad_opts = dict(dtype="int32", padding="post", truncating="post")
    padded_documents = pad_sequences(documents, maxlen=self.d_len, **pad_opts)
    padded_questions = pad_sequences(questions, maxlen=self.q_len, **pad_opts)

    def _indicator(position):
        # Row of length d_len that is 1 exactly where the index equals position.
        return np.array([int(slot == position) for slot in range(self.d_len)])

    # Two separate passes over the spans, starts first, then ends.
    start_rows = [_indicator(span[0]) for span in answer_spans]
    end_rows = [_indicator(span[1]) for span in answer_spans]

    return (
        padded_documents,
        padded_questions,
        np.asarray(start_rows),
        np.asarray(end_rows),
    )
def input_fn():
    """
    Build one batch of ``(features, labels)`` tensors, optionally bucketed.

    Uses ``x_in``, ``y_in``, ``max_length``, ``epochs``, ``shuffle``,
    ``batch_size`` and ``batch_by_seq_len`` from the enclosing scope, plus the
    helpers ``_length_bin`` and ``_make_batch``.

    Returns:
        ``(dict_x, y)`` where ``dict_x`` holds the padded sequences under
        ``'x'`` and their clipped lengths under
        ``rnn_common.RNNKeys.SEQUENCE_LENGTH_KEY``.
    """
    # Effective length of each sequence: min(actual length, max_length).
    raw_lengths = np.array([len(seq) for seq in x_in])
    clipped_lengths = np.minimum(raw_lengths, max_length).astype('int32')

    # Per the original author's note, DynamicRNNEstimator relies on
    # `rnn_common.select_last_activations` (https://goo.gl/L8jtfh), so the
    # padding must go at the END of each sequence rather than the default
    # beginning (https://goo.gl/NVjJgT).
    padded = sequence.pad_sequences(x_in, maxlen=max_length, padding='post')

    # Dataset over the in-memory arrays, repeated `epochs` times.
    dataset = tf.contrib.data.Dataset.from_tensor_slices(
        (padded, clipped_lengths, y_in))
    dataset = dataset.repeat(epochs)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=10000)

    if not batch_by_seq_len:
        dataset = dataset.batch(batch_size)
    else:
        # Hand-rolled bucketing by sequence length: group examples whose
        # lengths fall in the same bin so each batch holds sequences of
        # similar length (https://goo.gl/y67FQm).
        dataset = dataset.group_by_window(
            key_func=lambda _x, length, _y: _length_bin(length, max_length),
            reduce_func=_make_batch,
            window_size=batch_size)

    # Pull the next batch from a one-shot iterator.
    features, lengths, labels = dataset.make_one_shot_iterator().get_next()
    dict_x = {
        'x': features,
        rnn_common.RNNKeys.SEQUENCE_LENGTH_KEY: lengths,
    }
    return dict_x, labels
def multi_sequences_padding(all_sequences, config):
    """
    Bring every dialogue to a fixed number of fixed-length utterances.

    Parameters:
        all_sequences: iterable of dialogues, each a list of token-id lists
            (one per utterance). The input is NOT modified.
        config: object providing ``max_num_utterance`` (utterances kept per
            dialogue) and ``max_length_q`` (tokens kept per utterance).

    Returns:
        ``(padded_sequences, sequences_length)``. ``padded_sequences`` holds
        one post-padded array per dialogue. ``sequences_length`` holds the
        matching per-utterance lengths from ``get_sequences_length``, with 0
        for utterances added as padding.

    Dialogues shorter than ``max_num_utterance`` get all-zero utterances
    appended; longer ones keep only their LAST ``max_num_utterance``
    utterances.
    """
    max_num_utterance = config.max_num_utterance
    max_sentence_len = config.max_length_q
    pad_utterance = [0] * max_sentence_len

    padded_sequences = []
    sequences_length = []
    for sequences in all_sequences:
        lengths = get_sequences_length(sequences, maxlen=max_sentence_len)
        missing = max_num_utterance - len(sequences)

        if missing > 0:
            # Build new lists rather than using `+=`: the original extended
            # the caller's dialogue list in place, silently mutating the
            # argument.
            sequences = list(sequences) + [pad_utterance] * missing
            lengths = list(lengths) + [0] * missing
        else:
            # Keep the most recent utterances.
            sequences = sequences[-max_num_utterance:]
            lengths = lengths[-max_num_utterance:]

        padded_sequences.append(
            pad_sequences(sequences, padding='post', maxlen=max_sentence_len))
        sequences_length.append(lengths)

    return padded_sequences, sequences_length
# Finally, vectorize the text samples into a 2D integer tensor.
#
# Tokenizer signature as recorded by the original author:
#   Tokenizer(num_words=None, filters=(punctuation, tab and newline),
#             lower=True, split=' ', char_level=False, **kwargs)
# num_words caps how many vocabulary entries the model uses (the author notes
# that the downloaded embedding file offers 400000 words).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

# fit_on_texts(texts) updates the tokenizer's internal vocabulary from a list
# (or generator) of strings. Author's notes on what is available afterwards:
#   - word_index:     {word: index}, ordered by frequency (smaller index =
#                     more frequent); 174047 unique words on this corpus.
#                     NOTE(review): the original note says indices run from 0;
#                     Keras normally reserves index 0, so they presumably
#                     start at 1 — confirm.
#   - index_docs:     {word index: number of documents containing the word}.
#   - document_count: number of samples processed so far.
#   - word_counts:    {word: total occurrences over all samples};
#                     len(word_counts) is the number of unique words.
#   - word_docs:      {word: number of samples containing it}; a word is
#                     counted at most once per sample.
#   - text_to_word_sequence: converts one long string into a list of words.
# The frequency ordering is obtained by sorting the counts, highest first:
#   wcounts = list(self.word_counts.items())
#   wcounts.sort(key=lambda x: x[1], reverse=True)
#   sorted_voc = [wc[0] for wc in wcounts]
tokenizer.fit_on_texts(texts)

# Turn each text into a list of integers, one per word. Only the `num_words`
# most frequent words (and only words known to the tokenizer) are kept.
# Per the author's notes: 19997 samples; every id is below 20000, which can
# be checked with `for sq in sequences: np.array(sq).max()`.
sequences = tokenizer.texts_to_sequences(texts)

# Full vocabulary seen in the corpus (not limited by num_words).
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad every sequence to the same length. With maxlen given, longer sequences
# are truncated to maxlen; truncation and padding both act on the beginning
# of the sequence by default ('pre'), and 'post' variants are available.
# Per the author's notes the result has shape (19997, 1000).
# NOTE(review): each row holds the sample's trailing MAX_SEQUENCE_LENGTH
# tokens (default 'pre' truncation), not its "most frequent" words as the
# original note suggested — confirm.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the integer class id of each sample (ids 0..19 per the
# author's note).
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Split the data into a training set and a validation set: shuffle samples
# and labels with the same random permutation so pairs stay aligned.
indices = np.arange(data.shape[0])  # indices of all samples
np.random.shuffle(indices)  # shuffle the indices in place
data = data[indices]  # reorder samples
labels = labels[indices]  # reorder one-hot labels identically
# Number of samples held out, as a fraction TEST_SPLIT of the dataset.
num_test_samples = int(TEST_SPLIT * data.shape[0])
# Load raw texts, their labels and the label-name index.
texts, labels, labels_index = read_data(filename, filename_v)
# Sanity check: longest and shortest raw text, measured with len().
print(max([len(t) for t in texts]))
print(min([len(t) for t in texts]))

# Build the vocabulary, keeping at most MAX_NB_WORDS words when encoding.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# Sanity check: longest and shortest encoded sequence, in tokens.
print(max([len(t) for t in sequences]))
print(min([len(t) for t in sequences]))

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# No maxlen is passed, so the padded width is decided by pad_sequences and
# read back below as MAX_SEQUENCE_LENGTH.
data = pad_sequences(sequences)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
MAX_SEQUENCE_LENGTH = data.shape[1]
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
# (samples and labels are reordered with the same random permutation)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Word -> embedding vector mapping, to be filled from the GloVe file.
embeddings_index = {}
# NOTE(review): the handle is opened without a `with` block; it must be
# closed by code outside this excerpt — confirm.
f = open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding="utf-8")
# NOTE(review): the next four statements are the tail of a file-reading loop
# whose header lies before this excerpt. Their original nesting cannot be
# recovered from here — confirm the indentation against the full file.
t = t[i:]
texts.append(t)
f.close()
labels.append(label_id)

print('Found %s texts.' % len(texts))

# Data preprocessing
# NOTE(review): `nb_words` is the older spelling of this Tokenizer argument;
# later Keras releases name it `num_words` — confirm the targeted version.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Fixed-width integer matrix, one row per text.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the integer label ids.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
# (samples and labels are reordered with the same random permutation)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# The last nb_validation_samples rows are held out for validation.
# NOTE(review): if nb_validation_samples evaluates to 0, `[:-0]` selects
# nothing and `[-0:]` selects everything, i.e. the split is inverted.
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
# Two independent vocabularies: one for `texts`, one for `texts_gen`.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer_gen = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
tokenizer_gen.fit_on_texts(texts_gen)
sequences = tokenizer.texts_to_sequences(texts)
sequences_gen = tokenizer_gen.texts_to_sequences(texts_gen)
# Sanity check: longest and shortest encoded text, in tokens.
print(max([len(t)for t in sequences]))
print(min([len(t)for t in sequences]))

word_index = tokenizer.word_index
word_index_gen = tokenizer_gen.word_index
print('Found %s unique tokens.' % len(word_index))
print('Found %s unique tokens.' % len(word_index_gen))

# No maxlen is passed, so each padded width is decided by pad_sequences;
# the width of `data` is read back below as MAX_SEQUENCE_LENGTH.
data = pad_sequences(sequences)
data_gen = pad_sequences(sequences_gen)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
MAX_SEQUENCE_LENGTH = data.shape[1]
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
# The same permutation is applied to data, labels and data_gen so the three
# stay row-aligned.
# NOTE(review): this assumes data_gen has exactly as many rows as data —
# confirm that texts and texts_gen are parallel lists.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
data_gen = data_gen[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Experiment configuration.
TEST_SPLIT = 0.2
INIT_SEED = 2017
GLOBAL_SEED = 2018
MAXLEN = 80  # width every review is padded / truncated to below
BATCH_SIZE = 128
TEST_BATCH_SIZE = 512

# In[2]:

# Load IMDB with the vocabulary capped at NB_WORDS (defined outside this
# excerpt) and bring every review to MAXLEN tokens.
(X_train, Y_train), (X_test, Y_test) = imdb.load_data(num_words=NB_WORDS)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=MAXLEN)
X_test = sequence.pad_sequences(X_test, maxlen=MAXLEN)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

# In[3]:


# NOTE(review): only the beginning of this class is visible in this excerpt;
# the constructor presumably continues past the last line shown.
class Model(nn.Module):
    def __init__(self, nb_words, hidden_size=128, embedding_size=128, n_layers=1, wdrop=0.25, odrop=0.25, edrop=0.1, idrop=0.25, variational=False, standard_dropout=False, batch_first=True):
        super(Model, self).__init__()
        # Remember which dropout flavour was requested.
        self.standard_dropout = standard_dropout
        # LockedDropout is built with the same batch_first layout flag.
        self.lockdrop = LockedDropout(batch_first=batch_first)
def __init__(self, init_seed, maxlen, nb_words, skip_top, test_split):
    """
    Load the scaledata reviews and build padded train/test splits.

    Parameters:
        init_seed: seed used for the (identical) shuffle of texts and ratings.
        maxlen: chunk length; each review is cut into consecutive pieces of
            at most ``maxlen`` tokens and every piece is padded to ``maxlen``.
        nb_words: word ids ``>= nb_words`` are replaced by the OOV id.
        skip_top: word ids ``< skip_top`` are replaced by the OOV id.
        test_split: fraction of the chunks placed in the test set.

    Sets ``X_train``/``X_test`` (padded integer arrays) and
    ``Y_train``/``Y_test`` (ratings standardised with the TRAINING mean and
    standard deviation, stored as ``mean_y_train``/``std_y_train``).
    """
    # Reserved ids: 0 (padding), 1 (start), 2 (OOV); real words start at 3.
    self.start_char = 1
    self.oov_char = 2
    self.index_from = 3

    files = [
        "Dennis+Schwartz", "James+Berardinelli", "Scott+Renshaw",
        "Steve+Rhodes"
    ]
    texts, ratings = [], []
    for author in files:
        with open("data/scaledata/" + author + "/subj." + author, "r") as f:
            texts += list(f)
        with open("data/scaledata/" + author + "/rating." + author, "r") as f:
            ratings += list(f)

    tokenizer = text.Tokenizer(filters='')
    tokenizer.fit_on_texts(texts)
    X = tokenizer.texts_to_sequences(texts)
    Y = [float(rating) for rating in ratings]

    # Shuffle data: re-seeding before each shuffle applies the same
    # permutation to X and Y (they have one entry per review each).
    np.random.seed(init_seed)
    np.random.shuffle(X)
    np.random.seed(init_seed)
    np.random.shuffle(Y)

    # Parse data: prepend the start marker and shift word ids past the
    # reserved range.
    X = [[self.start_char] + [w + self.index_from for w in x] for x in X]

    # Cut each review into chunks of at most `maxlen` tokens; every chunk
    # inherits the review's rating.
    new_X = []
    new_Y = []
    for x, y in zip(X, Y):
        for i in range(0, len(x), maxlen):
            new_X.append(x[i:i + maxlen])
            new_Y.append(y)
    Y = np.array(new_Y)

    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters: 0 (padding), 1 (start), 2 (OOV)
    # The chunks stay a plain list of lists: the original wrapped them in
    # np.array first, which is unnecessary and raises ValueError for ragged
    # input on recent NumPy versions.
    X = [[
        self.oov_char if (w >= nb_words or w < skip_top) else w for w in x
    ] for x in new_X]

    # Index of the first test chunk.
    split = int(len(X) * (1 - test_split))

    self.X_train = X[:split]
    self.Y_train = Y[:split]
    self.mean_y_train = np.mean(self.Y_train)
    self.std_y_train = np.std(self.Y_train)
    self.Y_train = (self.Y_train - self.mean_y_train) / self.std_y_train

    self.X_test = X[split:]
    self.Y_test = Y[split:]
    # Test targets are standardised with the training statistics.
    self.Y_test = (self.Y_test - self.mean_y_train) / self.std_y_train

    print(len(self.X_train), 'train sequences')
    print(len(self.X_test), 'test sequences')

    print("Pad sequences (samples x time)")
    self.X_train = sequence.pad_sequences(self.X_train, maxlen=maxlen)
    self.X_test = sequence.pad_sequences(self.X_test, maxlen=maxlen)
    print('X_train shape:', self.X_train.shape)
    print('X_test shape:', self.X_test.shape)
# Score each input line with the model and append the result to the
# submission file.
# NOTE(review): the nesting below is reconstructed from a whitespace-collapsed
# source — confirm it against the original layout. The row counter `i` is
# initialised outside this excerpt.
with open(filename, 'r', encoding="utf-8") as f:
    # Opened in append mode, so the header line below is written on every run.
    with open(os.path.join(SAVE_DIR, 'submissionFile'), 'a') as sf:
        sf.write('ID,class1,class2,class3,class4,class5,class6,class7,class8,class9\n')
        for line in f:
            # Skip the row seen while i == 0 (presumably the input header).
            if i>0:
                # Each row has the form "<id>||<text>".
                text= line[line.find('||') + 2:]
                id =int(line[:line.find('||')])
                t_w = text_to_word_sequence(text)
                sec = []
                sequences = []
                # Map words to vocabulary ids; unknown words become 0.
                for w in t_w:
                    sec.append(word_index.item().get(w, 0))
                sequences.append(sec)
                data = pad_sequences([sec], maxlen=model_shape1)
                prediction = model.predict(data, batch_size=1)
                outputstr = str(id)
                j = 0
                # The first prediction entry (j == 0) is not written; the
                # remaining ones are formatted with two decimals.
                for p_i in prediction[0]:
                    if j > 0:
                        outputstr += "," + "%.2f" % p_i
                    j += 1
                print(outputstr)
                sf.write(outputstr + '\n')
            i+=1
            # Stop once NUM_ROWS_FROM_TEXT rows have been read.
            if i>=NUM_ROWS_FROM_TEXT :
                break
print("saved in "+ os.path.join(SAVE_DIR, 'submissionFile'))
from tensorflow.contrib.keras.python.keras.datasets import imdb
from tensorflow.contrib.keras.python.keras.layers import Embedding, SimpleRNN, Dropout, Dense, Activation, LSTM, GRU
from tensorflow.contrib.keras.python.keras.models import Sequential
from tensorflow.contrib.keras.python.keras.preprocessing import sequence

# Hyper-parameters.
max_features = 20000  # vocabulary cap passed to imdb.load_data and Embedding
maxlen = 100  # width every review is padded / truncated to
batch_size = 32

# Load IMDB and bring every review to `maxlen` tokens.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Model: embedding -> LSTM -> dropout -> single sigmoid unit.
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
# Alternative recurrent layers, currently disabled:
#model.add(SimpleRNN(128))
#model.add(GRU(128))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

# Binary cross-entropy matches the single sigmoid output.
model.compile(loss='binary_crossentropy', optimizer='adam')