Example no. 1
    def input_ready(self, df, tokenize=False, **kwargs):
        if tokenize:
            _df = self.tokenize(df)
        else:
            _df = df.copy()
        X = _df['TOKENS'].apply(
            lambda x: [self.w_idx.get(w, 1) for w in x]).values
        X = pad_sequences(X,
                          maxlen=self.max_seq_len,
                          padding='post',
                          truncating='post')
        X = [X[:, i].astype('int32') for i in range(X.shape[1])]

        asp = _df[self.aspcol].apply(
            lambda w: int32(self.asp_idx[w])).values.tolist()

        use_2nd_lex = kwargs.get('use_second_lexicon', False)
        if use_2nd_lex:
            lx = _df['TOKENS'].apply(
                lambda x: [self.test_lx_idx.get(w, 0) for w in x]).values
            print('Using second lexicon.')
        else:
            lx = _df['TOKENS'].apply(
                lambda x: [self.lx_idx.get(w, 0) for w in x]).values
        lx = pad_sequences(lx,
                           maxlen=self.max_seq_len,
                           padding='post',
                           truncating='post')
        lx = [lx[:, i].astype('int32') for i in range(lx.shape[1])]

        y = get_dummies(_df[self.clscol].astype(str)).values.astype('float32')

        return X, asp, lx, y
Example no. 2
    def batchfy_fn(data):
        # Collate a list of (x1, x2, y) samples into two padded batches plus labels.
        x1 = [d[0] for d in data]
        x2 = [d[1] for d in data]
        y = [d[2] for d in data]
        max_len1 = max(len(x) for x in x1)
        max_len2 = max(len(x) for x in x2)

        x1_padded = sequence.pad_sequences(x1, maxlen=max_len1, padding='post')
        x2_padded = sequence.pad_sequences(x2, maxlen=max_len2, padding='post')
        return x1_padded, x2_padded, y
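A minimal usage sketch of the collate function above (assumptions: batchfy_fn is reachable as a plain function, each sample is a tuple of two variable-length integer lists plus a label, and the toy values below are made up for illustration):

    toy_batch = [([1, 2, 3], [4, 5], 0),
                 ([6], [7, 8, 9, 10], 1)]
    x1_padded, x2_padded, labels = batchfy_fn(toy_batch)
    # x1_padded has shape (2, 3), x2_padded has shape (2, 4); labels is the plain list [0, 1]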
Example no. 3
 def batchfy_fn(data):
     x = [d[0] for d in data]
     y = [d[1] for d in data]
     max_len = max(map(len, x))
     # if max_len % 5 != 0:
     #     max_len += 5 - (max_len % 5)
     return sequence.pad_sequences(x, maxlen = max_len, padding = 'post'), y
Example no. 4
 def pad_by_buckets(self, init_df):
     buckets = sorted(self.intervals.keys())
     bucket = {}
     for b in buckets:
         _df = init_df.loc[init_df.BUCKET == b][[
             self.enc_col + '_TOK', self.dec_col + '_TOK',
             self.dec_col + '_TL'
         ]]
         if not _df.empty:
             _enc_symbs = self.tok2sym(_df[self.enc_col + '_TOK'])
             _dec_symbs = self.tok2sym(_df[self.dec_col + '_TOK'])
             enc_mlen = max(self.intervals[b])
             dec_mlen = _df[self.dec_col + '_TL'].max()
             enc_symbs = pad_sequences(_enc_symbs, maxlen=enc_mlen)
             dec_symbs = pad_sequences(_dec_symbs, maxlen=dec_mlen)  # assumption: decoder side is padded to its own max length (dec_mlen)
             bucket.update({b: (enc_symbs, dec_symbs)})
     return bucket
Example no. 5
    def preprocess_input_sequences(self, data):
        if not self.args.use_char_embedding:
            documents, questions, answer_spans = data
        else:
            documents, questions, documents_char, questions_char, answer_spans = data
            documents_char_ok = pad_sequences(documents_char,
                                              maxlen=self.d_len,
                                              dtype="int32",
                                              padding="post",
                                              truncating="post")
            questions_char_ok = pad_sequences(questions_char,
                                              maxlen=self.q_len,
                                              dtype="int32",
                                              padding="post",
                                              truncating="post")

        documents_ok = pad_sequences(documents,
                                     maxlen=self.d_len,
                                     dtype="int32",
                                     padding="post",
                                     truncating="post")
        questions_ok = pad_sequences(questions,
                                     maxlen=self.q_len,
                                     dtype="int32",
                                     padding="post",
                                     truncating="post")

        # FIXME: cannot use the array here because the position is counted in characters, not words
        answer_start = [
            np.array([int(i == answer_span[0]) for i in range(self.d_len)])
            for answer_span in answer_spans
        ]
        answer_end = [
            np.array([int(i == answer_span[1]) for i in range(self.d_len)])
            for answer_span in answer_spans
        ]
        if self.args.use_char_embedding:
            return documents_ok, questions_ok, documents_char_ok, questions_char_ok, np.asarray(
                answer_start), np.asarray(answer_end)
        else:
            return documents_ok, questions_ok, np.asarray(
                answer_start), np.asarray(answer_end)
Example no. 6
    def tokenize(self, comments):
        print('Comments shape is {}'.format(comments.shape))

        token = Tokenizer(num_words=self.vocab_size)
        token.fit_on_texts(comments)

        tokenized_comments = token.texts_to_sequences(comments)

        tokenized_comments = sequence.pad_sequences(
            sequences=tokenized_comments,
            maxlen=self.max_sentence_len,
            padding='post',
            value=0)

        return tokenized_comments
Example no. 7
    def preprocess_input_sequences(self, data):
        """
        Preprocess: pad each field to a fixed length.
        """
        documents, questions, answer, candidates = data

        questions_ok = pad_sequences(questions,
                                     maxlen=self.q_len,
                                     dtype="int32",
                                     padding="post",
                                     truncating="post")
        documents_ok = pad_sequences(documents,
                                     maxlen=self.d_len,
                                     dtype="int32",
                                     padding="post",
                                     truncating="post")
        candidates_ok = pad_sequences(candidates,
                                      maxlen=self.A_len,
                                      dtype="int32",
                                      padding="post",
                                      truncating="post")
        y_true = np.zeros_like(candidates_ok)
        y_true[:, 0] = 1
        return questions_ok, documents_ok, candidates_ok, y_true
Example no. 8
    def __generate_seq(self, seed_int_encode, n_chars=250, diversity=0.2):
        '''
        Generate text using the current LSTM model
        
        Input:
            @seed_int_encode: Seed sequence encoded as integers
            @n_chars: Number of characters to generate
            @diversity: How randomized the character selection should be
        Output:
            @return: 'n_chars' characters of generated text
        '''

        # Begin with seed sequence
        int_encode = seed_int_encode

        # Translate seed text
        start_chars = [self.indices_char[x] for x in int_encode]
        message = ''.join(start_chars)

        # generate a fixed number of characters
        for _ in range(n_chars):

            # truncate sequences to a fixed length
            int_encode = pad_sequences([int_encode],
                                       maxlen=self.seq_len,
                                       truncating='pre')[0]

            # one hot encode
            hot_encode = to_categorical(int_encode,
                                        num_classes=self.vocab_size)

            # Change shape from: (seq_len, vocab)
            # to: (1, seq_len, vocab)
            # Since LSTM requires a tensor input
            hot_encode = np.expand_dims(hot_encode, 0)

            # Predict next character
            preds = self.model.predict(hot_encode, verbose=0)[0]
            yhat = self.__sample(preds, diversity)

            # Append int encoding to continue recurrent predictions
            int_encode = np.append(int_encode, yhat)

            # Keep track of full message generated
            message += self.indices_char[yhat]

        # Return generated message
        return message
Example no. 9
    def prepare(self,
                X,
                Y,
                emb_model,
                seq_length=200,
                stratify='n',
                test_split=0.2,
                emb_dim=100):
        #prepare data for use in NN
        #Convert text to sequences and create word index for use in creating embedding matrix
        from tensorflow.contrib.keras.api.keras.preprocessing.text import Tokenizer
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(X)
        X_seq = tokenizer.texts_to_sequences(X)
        word_idx = tokenizer.word_index
        from tensorflow.contrib.keras.api.keras.preprocessing import sequence
        X_seq = sequence.pad_sequences(X_seq, maxlen=seq_length)

        #encode labels in 1h vector
        from sklearn.preprocessing import LabelBinarizer
        label_encoder = LabelBinarizer()
        Y_coded = label_encoder.fit_transform(Y)

        #create test and train split
        from sklearn.model_selection import train_test_split
        if stratify == 'y':
            x_train, x_test, y_train, y_test = train_test_split(
                X_seq,
                Y_coded,
                test_size=test_split,
                random_state=141289,
                stratify=Y_coded)
        else:
            x_train, x_test, y_train, y_test = train_test_split(
                X_seq, Y_coded, test_size=test_split, random_state=141289)

        #learn embedding matrix from the passed model
        import numpy as np
        embedding_mat = np.zeros((len(word_idx) + 1, emb_dim))
        for w, i in word_idx.items():
            try:
                embedding_vector = emb_model[w]
                embedding_mat[i] = embedding_vector
            except KeyError:
                pass  # print("no " + w + " pos " + str(i))
        return x_train, x_test, y_train, y_test, embedding_mat, tokenizer, label_encoder
Example no. 10
 def input_ready(self, df, tokenize=False, **kwargs):
     if tokenize:
         _df = self.tokenize(df)
     else:
         _df = df.copy()
     X = _df['TOKENS'].apply(
         lambda x: [self.w_idx[w] if w in self.w_idx else 1
                    for w in x]).values  # Use w_idx to get word index
     X = pad_sequences(X,
                       maxlen=self.max_seq_len,
                       padding='post',
                       truncating='post')  # Pad sequences
     # Reshape data into [[batch, feats_t1], [batch, feats_t2], ...]
     X = [X[:, i].astype('int32') for i in range(X.shape[1])]
     # Create one-hot encoded labels
     y = get_dummies(_df[self.clscol].astype(str)).values.astype('float32')
     return X, y
Example no. 11
def predict(out_path, txt, top=1):
    import pickle
    import os
    if os.path.isfile(os.path.join(out_path, 'token_enc.pkl')):
        with open(os.path.join(out_path, 'token_enc.pkl'), 'rb') as f:
            tokenizer, seq_len, language = pickle.load(f)

        #do preprocessing bit
        from nltk.corpus import stopwords
        from nltk.stem.snowball import SnowballStemmer
        stopwords = stopwords.words(language)
        stemmer = SnowballStemmer(language)
        import re
        r = re.compile(r'[\W]', re.U)
        txt = r.sub(' ', txt)
        txt = re.sub('[\\s]+', ' ', txt)
        txt = [
            ' '.join(
                stemmer.stem(w.lower()) for w in txt.split()
                if w not in stopwords)
        ]

        #convert text to sequence
        txt_seq = tokenizer.texts_to_sequences(txt)
        from tensorflow.contrib.keras.api.keras.preprocessing import sequence
        txt_seq = sequence.pad_sequences(txt_seq, maxlen=seq_len)

        #load NN model and predict
        from tensorflow.contrib.keras.api.keras.models import load_model
        model = load_model(os.path.join(out_path, 'CNN1d.h5'))
        output = model.predict(txt_seq)

        #create binary sequences for top x predictions
        sorted_idx = (-output).argsort()
        import numpy as np
        label = np.zeros((top, len(output[0])))
        for i in range(0, top):
            label[i][sorted_idx[0][i]] = 1

        #convert to txt labels
        with open(os.path.join(out_path, 'label_enc.pkl'), 'rb') as f:
            label_decoder = pickle.load(f)
        return label_decoder.inverse_transform(label)
    else:
        return "Invalid output path!"
Example no. 12
 def forward(self, x, y):
     max_len = x.shape[1]
     if max_len % self.stride:
         max_len += self.stride - (max_len % self.stride)
         x = sequence.pad_sequences(x, maxlen=max_len, padding='post')
     x = LongTensor(x)
     mask = torch.where(x > 0, torch.ones_like(x, dtype=torch.float32),
                        torch.zeros_like(x, dtype=torch.float32))
     x_embed = self.embedding(x)
     x_embed = self.dropout(x_embed)
     # reduce
     outputs, h, reduced_mask = self.reduce_ngram(
         x_embed, mask)  # (seq_len, batch, hidden_size * num_directions)
     # output_maxpooled = self.gather_rnnstate(outputs, reduced_mask)
     output_maxpooled, _ = torch.max(outputs, 1)
     # output_maxpooled = h.view(h.shape[1], -1)
     class_prob = self.linear(output_maxpooled)
     return class_prob, F.dropout(output_maxpooled)
Example no. 13
 def forward(self, x, y):
     max_len = x.shape[1]
     if max_len % self.stride:
         max_len += self.stride - (max_len % self.stride)
         x = sequence.pad_sequences(x, maxlen = max_len, padding = 'post')
     x = LongTensor(x)
     mask = torch.where(x > 0, torch.ones_like(x, dtype = torch.float32), torch.zeros_like(x, dtype = torch.float32))
     x_embed = self.embedding(x)
     x_embed = self.dropout(x_embed)
     x_embed2 = self.embedding(torch.zeros_like(x).cuda() + torch.cat([torch.zeros(size = [x.shape[0], 1], dtype = torch.long).cuda(), x[:, 1:]], -1))
     x_embed3 = self.embedding(torch.zeros_like(x).cuda() + torch.cat([x[:, :-1], torch.zeros(size = [x.shape[0], 1], dtype = torch.long).cuda()], -1))
     x_embed = torch.cat([x_embed2, x_embed, x_embed3], -1)
     x_embed = x_embed @ self.params
     outputs, (h, c) = self.rnn1(x_embed)
     output_maxpooled, _ = torch.max(outputs, 1)
     # output_maxpooled = h.view(h.shape[1], -1)
     class_prob = self.linear(output_maxpooled)
     return class_prob, F.dropout(output_maxpooled)
Example no. 14
    def input_ready(self, df, tokenize=False, **kwargs):
        if tokenize:
            _df = self.tokenize(df)
        else:
            _df = df.copy()
        X = _df['TOKENS'].apply(
            lambda x: [self.w_idx[w] if w in self.w_idx else 1
                       for w in x]).values
        X = pad_sequences(X,
                          maxlen=self.max_seq_len,
                          padding='post',
                          truncating='post')
        X = [X[:, i].astype('int32') for i in range(X.shape[1])]

        asp = _df[self.aspcol].apply(
            lambda w: int32(self.asp_idx[w])).values.tolist()

        y = get_dummies(_df[self.clscol].astype(str)).values.astype('float32')

        return X, asp, y
Example no. 15
 def get_next_batch(self, mode, idx):
     """
     return next batch of data samples
     """
     batch_size = self.args.batch_size
     if mode == "train":
         dataset_x = self.train_x
         dataset_y = self.train_y
         sample_num = self.train_nums
     elif mode == "valid":
         dataset_x = self.valid_x
         dataset_y = self.valid_y
         sample_num = self.valid_nums
     else:
         dataset_x = self.test_x
         dataset_y = self.test_y
         sample_num = self.test_nums
     if mode == "train":
         start = self.train_idx[idx] * batch_size
         stop = (self.train_idx[idx] + 1) * batch_size
     else:
         start = idx * batch_size
         stop = (idx + 1) * batch_size if start < sample_num and (
             idx + 1) * batch_size < sample_num else len(dataset_x)
     document = [self.getitem(dataset_x, i) for i in range(start, stop)]
     data = {
         "document:0":
         sequence.pad_sequences(document,
                                maxlen=self.max_len,
                                padding="post"),
         "y_true:0":
         dataset_y[start:stop]
     }
     samples = stop - start
     if len(document) != len(dataset_y[start:stop]) or len(
             dataset_y[start:stop]) != samples:
         print(len(document), len(dataset_y[start:stop]), samples)
     return data, samples
Example no. 16
def batcher(params, batch):
    # batch = [sent if sent != [] else ['.'] for sent in batch]
    # embeddings = []
    #
    # for sent in batch:
    #     sentvec = []
    #     for word in sent:
    #         if word in params.word_vec:
    #             sentvec.append(params.word_vec[word])
    #     if not sentvec:
    #         vec = np.zeros(params.wvec_dim)
    #         sentvec.append(vec)
    #     sentvec = np.mean(sentvec, 0)
    #     embeddings.append(sentvec)
    #
    # embeddings = np.vstack(embeddings)
    # return embeddings
    batch_idx = word2id_bathed(batch, params.word2id)
    max_len1 = max([len(x) for x in batch_idx])
    batch_idx = sequence.pad_sequences(batch_idx,
                                       maxlen=max_len1,
                                       padding='post')
    embedding = params.model(batch_idx)
    return embedding
Example no. 17
targets = data_set[target_col].values

targets = to_categorical(targets, 11)

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(features)

features_seq = tokenizer.texts_to_sequences(features)
word_index = tokenizer.word_index

X_train, X_test, y_train, y_test = train_test_split(features_seq,
                                                    targets,
                                                    random_state=55,
                                                    test_size=0.20)

X_train = sequence.pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=maxlen, padding='post')

print('X - Train ', np.shape(X_train))
print('X - Test ', np.shape(X_test))
print('Y - Train', np.shape(y_train))
print('Y - Test', np.shape(y_test))

train_input_fn = tf.estimator.inputs.numpy_input_fn(x={'x': X_train},
                                                    y=y_train,
                                                    batch_size=32,
                                                    num_epochs=None,
                                                    shuffle=True)
test_input_fn = tf.estimator.inputs.numpy_input_fn(x={'x': X_test},
                                                   y=y_test,
                                                   num_epochs=1,
                                                   shuffle=False)  # assumption: evaluation input is not shuffled

    def load_test_data():

        list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
        X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
        return X_test

 def load_train_data(list_sentences):
     list_tokenized = tokenizer.texts_to_sequences(list_sentences)
     X_train = sequence.pad_sequences(list_tokenized, maxlen=maxlen)
      return X_train

                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)

print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]

train_y = train_df['target'].values
train_y = train_y.reshape(len(train_y), 1)

# creates a mapping from the words to the embedding vectors
embeddings_index = dict(
    get_coefs(*o.split(" ")) for o in open(FLAGS.glove_path, encoding='utf-8'))
vocab_size = len(embeddings_index.keys())
print('vocab size :', vocab_size)

tokenizer = Tokenizer(num_words=vocab_size, filters='', lower=False)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
# val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

train_X = pad_sequences(train_X, maxlen=FLAGS.max_sentence_len)
# val_X = pad_sequences(val_X, maxlen=FLAGS.max_sentence_len)
test_X = pad_sequences(test_X, maxlen=FLAGS.max_sentence_len)

all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]
del all_embs

word_index = tokenizer.word_index
# only want at most vocab_size words in our vocabulary
nb_words = min(vocab_size, len(word_index)) + 1
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

# insert embeddings that exist into our matrix
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

vocab_size = 95000  # words in vocabulary
maxlen = 100  # max words to use per question

# fill up the missing values
train_X = train["question_text"].fillna("_##_").values
val_X = validation["question_text"].fillna("_##_").values
test_X = test["question_text"].fillna("_##_").values

# Use Keras to tokenize and pad sequences
tokenizer = Tokenizer(num_words=vocab_size, filters='', lower=False)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

# Get the response
train_y = train['target'].values
val_y = validation['target'].values

all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# only want at most vocab_size words in our vocabulary
nb_words = min(vocab_size, len(word_index))