Example No. 1
def preprocess(train_data_file, word_index_file, num_words):
  """Loads Numpy file .npz format and process its the data.

  Pad the arrays so they all have the same length, then create an integer
  tensor of shape max_length * num_reviews. Then we use an embedding layer
  capable of handling this shape as the first layer in our network.

  Args:
    train_data_file: (str) Location of file.
    word_index_file: (str) Location of JSON file with index information.
    num_words: (int) Number of words to get from IMDB dataset.

  Returns:
    A tuple of training and test data.
  """
  (train_data, train_labels), (test_data, test_labels) = _load_data(
      path=train_data_file, num_words=num_words)
  word_index = _get_word_index(word_index_file)
  # Standardize the lengths for training.
  train_data = pad_sequences(train_data, value=word_index['<PAD>'],
                             padding='post', maxlen=SENTENCE_SIZE)
  # Standardize the lengths for test.
  test_data = pad_sequences(test_data, value=word_index['<PAD>'],
                            padding='post', maxlen=SENTENCE_SIZE)
  return (train_data, train_labels), (test_data, test_labels)
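A minimal sketch of the padding behaviour this example relies on, assuming only the standard Keras pad_sequences API and using 0 in place of the '<PAD>' index:

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy reviews already mapped to integer ids; 0 stands in for '<PAD>'.
reviews = [[11, 42, 7], [5, 9]]
print(pad_sequences(reviews, value=0, padding='post', maxlen=5))
# [[11 42  7  0  0]
#  [ 5  9  0  0  0]]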
Example No. 2
def input_fn(texts, labels, tokenizer, batch_size, mode):
    # Transform text to sequence of integers
    x = tokenizer.texts_to_sequences(texts)

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x = sequence.pad_sequences(x, maxlen=MAX_SEQUENCE_LENGTH)

    # default settings for training
    num_epochs = None
    shuffle = True

    # override if this is eval
    if mode == tf.estimator.ModeKeys.EVAL:
        num_epochs = 1
        shuffle = False

    return tf.estimator.inputs.numpy_input_fn(
        x,
        y=labels,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        queue_capacity=50000
    )
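The returned callable targets the TF 1.x Estimator API. A hedged usage sketch (estimator, train_texts, train_labels and tokenizer are placeholder names, not defined in the example above):

train_input_fn = input_fn(train_texts, train_labels, tokenizer,
                          batch_size=128, mode=tf.estimator.ModeKeys.TRAIN)
estimator.train(input_fn=train_input_fn, steps=1000)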
Example No. 3
def get_dataset():
    (x_train, y_train), (_, _) = imdb.load_data(num_words=max_features)

    x_train = sequence.pad_sequences(x_train, maxlen=80)

    ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    ds = ds.repeat()
    ds = ds.map(lambda x, y: (x, tf.cast(y, tf.int32)))
    ds = ds.batch(32, drop_remainder=True)
    return ds
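Because the pipeline calls repeat() with no count, the dataset is infinite, so each epoch must be bounded explicitly. A sketch, assuming a compiled Keras model named model:

ds = get_dataset()
# The IMDB training split has 25,000 reviews; with drop_remainder=True that is 25000 // 32 full batches.
model.fit(ds, epochs=3, steps_per_epoch=25000 // 32)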
Example No. 4
def get_sentence_data(train_path_list, test_path_list):
    train_sentence_list = get_sentence_list(train_path_list)
    train_data = pd.DataFrame({'sentence': train_sentence_list, 'label': [0] * 1000 + [1] * 1000})

    test_sentence_list = get_sentence_list(test_path_list)
    test_data = pd.DataFrame({'sentence': test_sentence_list, 'label': [0] * 1000 + [1] * 1000})


    clean_train_sentences = []
    for sentence in train_data['sentence']:
        clean_train_sentences.append(preprocessing(sentence, remove_stopwords=True))

    clean_test_sentences = []
    for sentence in test_data['sentence']:
        clean_test_sentences.append(preprocessing(sentence, remove_stopwords=True))

    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(clean_train_sentences)
    train_text_sequences = tokenizer.texts_to_sequences(clean_train_sentences)
    test_text_sequences = tokenizer.texts_to_sequences(clean_test_sentences)

    MAX_SEQUENCE_LENGTH = 3817

    X_train = pad_sequences(train_text_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    X_test = pad_sequences(test_text_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

    # clean_train_df = pd.DataFrame({'sentence': clean_train_sentences, 'label': train_data['label']})
    # clean_test_df = pd.DataFrame({'sentence': clean_test_sentences, 'label': test_data['label']})

    y_train = np.array(train_data['label'])
    print('Shape of X_train: ', X_train.shape)
    print('Shape of y_train: ', y_train.shape)
    np.save(data_path + 'X_train', X_train)
    np.save(data_path + 'y_train', y_train)

    y_test = np.array(test_data['label'])
    print('Shape of X_test: ', X_test.shape)
    print('Shape of y_test: ', y_test.shape)
    np.save(data_path + 'X_test', X_test)
    np.save(data_path + 'y_test', y_test)
    print('finished saving data')
    ###################################
    return tokenizer
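Since np.save appends the .npy extension automatically, the arrays written above can be read back later roughly as follows (assuming the same data_path):

X_train = np.load(data_path + 'X_train.npy')
y_train = np.load(data_path + 'y_train.npy')
X_test = np.load(data_path + 'X_test.npy')
y_test = np.load(data_path + 'y_test.npy')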
Example No. 5
    def tokenize(self, sent):
        if len(self._train_corpus_tokens_) > 0:
            input_len = len(self._train_corpus_tokens_[0]) 
        else:
            input_len = len(self._test_corpus_tokens_[0])

        seqs = self.tok.texts_to_sequences([sent])
        output = np.asarray(pad_sequences(seqs, maxlen=input_len, truncating='post')[0])
        print(output)
        return output
Example No. 6
def predict_rnn(question):
    tokenizer = load(path + 'tokenizer_ref.pkl')
    X_token = tokenizer.texts_to_sequences([my_data.str_clean(question)])
    X_token = pad_sequences(X_token,
                            maxlen=max_tokens,
                            padding=pad,
                            truncating=pad).tolist()
    result = predict_rnn_token(X_token)
    logging.info('predict rnn: %s -> %s', question, result)
    return result
Example No. 7
    def predict(cls, input):
        """For the input, do the predictions and return them.

        Args:
            input (a single news headline): The data on which to do the predictions. """
        clf = cls.get_model()
        seq = tokenizer.texts_to_sequences([input])
        d = pad_sequences(seq, maxlen=MAX_LEN)
        prediction = clf.predict_classes(np.array(d))
        return get_class_label(prediction)
Example No. 8
def evaluate(test_file, sess, actions, actions_len, max_sentence_len,
             utterance_ph, all_utterance_len_ph, response_ph, response_len,
             y_pred):
    each_test_run = len(actions) // 3
    acc1 = [0.0] * 10
    rank1 = 0.0
    cnt = 0
    print('evaluating')

    with open(test_file, encoding="utf8") as f:
        lines = f.readlines()
        low = 0
        history, true_utt = build_evaluate_data(lines)
        history, history_len = multi_sequences_padding(history,
                                                       max_sentence_len)
        true_utt_len = np.array(
            get_sequences_length(true_utt, maxlen=max_sentence_len))
        true_utt = np.array(
            pad_sequences(true_utt, padding='post', maxlen=max_sentence_len))
        history, history_len = np.array(history), np.array(history_len)
        feed_dict = {
            utterance_ph: history,
            all_utterance_len_ph: history_len,
            response_ph: true_utt,
            response_len: true_utt_len
        }
        true_scores = sess.run(y_pred, feed_dict=feed_dict)
        true_scores = true_scores[:, 1]
        for i in range(true_scores.shape[0]):
            all_candidate_scores = []
            for j in range(3):
                feed_dict = {
                    utterance_ph:
                    np.concatenate([history[low:low + 1]] * each_test_run,
                                   axis=0),
                    all_utterance_len_ph:
                    np.concatenate([history_len[low:low + 1]] * each_test_run,
                                   axis=0),
                    response_ph:
                    actions[each_test_run * j:each_test_run * (j + 1)],
                    response_len:
                    actions_len[each_test_run * j:each_test_run * (j + 1)]
                }
                candidate_scores = sess.run(y_pred, feed_dict=feed_dict)
                all_candidate_scores.append(candidate_scores[:, 1])
            all_candidate_scores = np.concatenate(all_candidate_scores, axis=0)
            pos1 = np.sum(true_scores[i] + 1e-8 < all_candidate_scores)
            if pos1 < 10:
                acc1[pos1] += 1
            rank1 += pos1
            low += 1
        cnt += true_scores.shape[0]
    print([a / cnt for a in acc1])  # rank top 1 to top 10 acc
    print(rank1 / cnt)  # average rank
    print(np.sum(acc1[:3]) * 1.0 / cnt)  # top 3 acc
Example No. 9
def get_4chan(lookback_list, tokenizer, model):
    biz = py4chan.Board('biz')
    threads = biz.get_all_threads()
    thread_list = []
    post_list = []
    timestamp_list = []
    for thread in threads:
        posts = [post.text_comment for post in thread.replies]
        timestamps = [post.timestamp for post in thread.replies]
        topics = [thread.topic.text_comment for post in thread.replies]
        for post in posts:
            post_list.append(post.strip('>'))
        for ts in timestamps:
            timestamp_list.append(ts)
        for topic in topics:
            thread_list.append(topic)

    post_df = pd.DataFrame(timestamp_list, columns=['Timestamp'])
    post_df['Thread'] = pd.Series(thread_list)
    post_df['Text'] = pd.Series(post_list)

    max_val = max(lookback_list)
    placeholder = get_lookback(max_val)

    if isinstance(placeholder, datetime.datetime):
        placeholder = time.mktime(placeholder.timetuple())
    start = datetime.datetime.fromtimestamp(placeholder)

    unique_posts = post_df.drop_duplicates(keep='first', inplace=False)
    unique_comment_seqs = tokenizer.texts_to_sequences(
        unique_posts['Text'].values)
    padded_seqs = pad_sequences(unique_comment_seqs, maxlen=12)
    original_seqs = padded_seqs.shape[0]
    batch_size = model.input_shape[0]
    filler = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
    while padded_seqs.shape[0] % batch_size != 0:
        padded_seqs = np.vstack((padded_seqs, filler))
    final_data = np.vstack(
        (padded_seqs, np.zeros(shape=(batch_size * 10, 12))))
    preds = model.predict(final_data, batch_size=128, verbose=0)
    origs = preds[:original_seqs]
    unique_posts['Negative'] = origs[:, 0]
    unique_posts['Positive'] = origs[:, 1]
    unique_posts[
        'Net_Sentiment'] = unique_posts['Positive'] - unique_posts['Negative']
    timeframe_lists = [
        unique_posts[unique_posts['Timestamp'] >= dt_to_int(get_lookback(ph))]
        for ph in lookback_list
    ]
    for lb in lookback_list:
        lb = get_lookback(lb)
        if isinstance(lb, datetime.datetime):
            lb = time.mktime(lb.timetuple())
        timing = datetime.datetime.fromtimestamp(lb)
        print(f'Cryptocurrency 4chan posts from {timing} to now.')
    return timeframe_lists
Example No. 10
def read_tokens_v2(token_user_ids_path):
	token_ids_set = []
	token_len_set = []

	with open(token_user_ids_path, mode="rt", encoding="utf-8") as fhu:
		user_utt = fhu.readline()
		counter = 0
		while user_utt:
			counter += 1
			if counter % 10000 == 0:
				print("  reading %s, line %d" % (token_user_ids_path, counter))
				sys.stdout.flush()

			user_utt = user_utt.replace("\n", "")
			source_ids = user_utt.split("\u241D")
			cid = int(source_ids[0])
			token_seq = source_ids[1].split("\u241E")
			token_len = len(token_seq)

			token_seq = pad_sequences([token_seq], maxlen=config.buckets[0], padding='post')

			token_ids_set.append([cid, token_seq])
			token_len_set.append([cid, token_len])

			user_utt = fhu.readline()

	return dict(token_ids_set), dict(token_len_set)
Example No. 11
def input_data_for_model(input_shape):

    # Load the data
    input_data = load_data()
    # Process the data
    data_processing()
    # Load the dictionaries
    with open(CONSTANTS[1], 'rb') as f:
        word_dictionary = pickle.load(f)
    with open(CONSTANTS[2], 'rb') as f:
        inverse_word_dictionary = pickle.load(f)
    with open(CONSTANTS[3], 'rb') as f:
        label_dictionary = pickle.load(f)
    with open(CONSTANTS[4], 'rb') as f:
        output_dictionary = pickle.load(f)
    vocab_size = len(word_dictionary.keys())
    label_size = len(label_dictionary.keys())

    # Process the input data
    aggregate_function = lambda input: [
        (word, pos, label)
        for word, pos, label in zip(input['word'].values.tolist(), input[
            'pos'].values.tolist(), input['tag'].values.tolist())
    ]

    grouped_input_data = input_data.groupby('sent_no').apply(
        aggregate_function)
    sentences = [sentence for sentence in grouped_input_data]

    x = [[word_dictionary[word[0]] for word in sent] for sent in sentences]
    x = sequence.pad_sequences(maxlen=input_shape,
                               sequences=x,
                               padding='post',
                               value=0)
    y = [[label_dictionary[word[2]] for word in sent] for sent in sentences]
    y = sequence.pad_sequences(maxlen=input_shape,
                               sequences=y,
                               padding='post',
                               value=0)
    y = [to_categorical(label, num_classes=label_size + 1) for label in y]

    return x, y, output_dictionary, vocab_size, label_size, inverse_word_dictionary
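A short sketch of the shapes this produces, assuming the dictionaries and data files referenced above are in place and that input_shape=60 is an arbitrary illustrative value: x is a 2-D id matrix, while y is a list of per-sentence one-hot matrices that is usually stacked before training.

x, y, output_dictionary, vocab_size, label_size, inverse_word_dictionary = input_data_for_model(60)
y = np.array(y)
print(x.shape)  # (num_sentences, 60)
print(y.shape)  # (num_sentences, 60, label_size + 1)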
Example No. 12
def label_correlation(df, label):
    # construct the label positive correlation and negative correlation
    # goals, goals_word = get_tficf(df['Goals'].tolist())
    # targets, targets_word = get_tficf(df['Targets'].tolist())
    #indicators, indicators_word = get_tficf(df['Indicators'].tolist())
    # tficf = np.concatenate([goals, targets, indicators], axis=-1)
    ## Indicators level feature
    tficf, word_0 = get_tficf(df['Indicators'].tolist())
    level_0 = defaultdict(lambda: defaultdict(list))
    i = 0
    for idx, row in df.iterrows():
        level_0[row['goal_no']][row['target_no']].append(tficf[i])
        i += 1
    nT, nI = max([len(x) for x in level_0.values()]), max([max([len(i) for i in x.values()]) for x in level_0.values()])
    level_0 = sequence.pad_sequences(
        [sequence.pad_sequences(
            [i for i in x.values()],
            maxlen=nI, dtype='float32', padding='post', truncating='post') for x in level_0.values()],
                maxlen=nT, dtype='float32', padding='post', truncating='post')
    np.save("level_0.npy", level_0)
    json.dump(word_0, open('word_0.json', 'w', encoding='utf-8'), indent=4)

    ## Targets level feature
    label_l1 = df.groupby('Targets').agg({'goal_no':'first','Goals':'first','Indicators':'. '.join,'target_no':','.join}).reset_index()
    label_l1 = label_sequence(label_l1, label)
    tficf, word_1 = get_tficf((label_l1['Targets']+' '+label_l1['Indicators']).tolist())
    level_1 = defaultdict(list)
    i = 0
    for idx, row in label_l1.iterrows():
        level_1[row['goal_no']].append(tficf[i])
        i+=1
    level_1 = sequence.pad_sequences([x for x in level_1.values()], maxlen=nT, dtype='float32', padding='post', truncating='post')
    np.save("level_1.npy", level_1)
    json.dump(word_1, open('word_1.json', 'w', encoding='utf-8'), indent=4)

    ## Goals level feature
    label_l2 = label_l1.groupby('Goals').agg({'goal_no': 'first','Targets': '. '.join, 'Indicators': '. '.join}).reset_index()
    label_l2 = label_sequence(label_l2, label)
    level_2, word_2 = get_tficf((label_l2['Goals']+' '+label_l2['Targets']+' '+label_l2['Indicators']).tolist())
    np.save("level_2.npy", level_2)
    json.dump(word_2, open('word_2.json', 'w', encoding='utf-8'), indent=4)
    return level_0, level_1, level_2, word_0, word_1, word_2
Example No. 13
def sequence_vectorize(train_texts, val_texts, k=1):
    """Vectorizes texts as sequence vectors.

    1 text = 1 sequence vector with fixed length.

    # Arguments
        train_texts: list, training text strings.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val, word_index: vectorized training and validation
            texts and word index dictionary.
    """
    print('Tokenizing')
    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=TOP_K)
    tokenizer.fit_on_texts(train_texts)
    print('Vectorizing')
    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > MAX_SEQUENCE_LENGTH:
        max_length = MAX_SEQUENCE_LENGTH

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    print('Padding/Truncating Sequences')
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)

    # Save Tokenizer to Disk
    print('Saving Tokenizer')
    tokenConfig = tokenizer.to_json()
    with open('amazon_sepcnn_' + str(k) + 'k_tokenizer.json', 'w') as f:
        f.write(tokenConfig)

    return x_train, x_val, tokenizer.word_index
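Because the tokenizer is persisted as JSON, it can be restored at inference time with the matching Keras helper; a sketch assuming the file written for k=1:

from tensorflow.keras.preprocessing import text

with open('amazon_sepcnn_1k_tokenizer.json') as f:
    tokenizer = text.tokenizer_from_json(f.read())
new_sequences = tokenizer.texts_to_sequences(['an unseen review to score'])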
Example No. 14
def split_and_zero_padding(df, max_seq_length):
    logging.info('Padding sequence')
    # Split to dicts
    x = {'left': df['current_n'], 'right': df['prior_n']}

    # Zero padding
    dataset = dict()
    for i, index in itertools.product([x], ['left', 'right']):
        dataset[index] = pad_sequences(i[index], padding='pre', truncating='post', maxlen=max_seq_length)

    return dataset
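A toy call (pandas assumed imported as pd, alongside the itertools and logging the function already uses) makes the 'pre' padding / 'post' truncating combination concrete:

df = pd.DataFrame({'current_n': [[1, 2, 3]], 'prior_n': [[4]]})
batch = split_and_zero_padding(df, max_seq_length=4)
# batch['left']  -> [[0 1 2 3]]
# batch['right'] -> [[0 0 0 4]]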
Example No. 15
    def transform(self, text_list):
        # Transform text to sequence of integers
        text_list = [self._clean_line(txt) for txt in text_list]
        text_sequence = self._tokenizer.texts_to_sequences(text_list)

        # Fix sequence length to max value. Sequences shorter than the length
        # are padded in the beginning and sequences longer are truncated
        # at the beginning.
        padded_text_sequence = sequence.pad_sequences(
            text_sequence, maxlen=self._max_sequence_length)
        return padded_text_sequence
Example No. 16
    def transform(self, X):
        res = self.tokenizer.texts_to_sequences([str(word) for word in X])
        max_len = len(max(X, key=len))
        if max_len > self.max_sequence_len:
            max_len = self.max_sequence_len

        res = sequence.pad_sequences(res, maxlen=max_len)
        global INPUT_SHAPE
        INPUT_SHAPE = res.shape[1:]

        return res
Example No. 17
def pad_one_sequence(sequence,
                     length,
                     dtype='int32',
                     padding='post',
                     truncating='pre',
                     value=0.0):
    sequence = tf.expand_dims(sequence, axis=0)
    sequence_padded = pad_sequences(sequence, length, dtype, padding,
                                    truncating, value)
    sequence_padded = tf.squeeze(sequence_padded, axis=0)
    return sequence_padded
Example No. 18
    def test_pad_sequences(self):
        a = [[1], [1, 2], [1, 2, 3]]

        # test padding
        b = preprocessing_sequence.pad_sequences(a, maxlen=3, padding='pre')
        self.assertAllClose(b, [[0, 0, 1], [0, 1, 2], [1, 2, 3]])
        b = preprocessing_sequence.pad_sequences(a, maxlen=3, padding='post')
        self.assertAllClose(b, [[1, 0, 0], [1, 2, 0], [1, 2, 3]])

        # test truncating
        b = preprocessing_sequence.pad_sequences(a, maxlen=2, truncating='pre')
        self.assertAllClose(b, [[0, 1], [1, 2], [2, 3]])
        b = preprocessing_sequence.pad_sequences(a,
                                                 maxlen=2,
                                                 truncating='post')
        self.assertAllClose(b, [[0, 1], [1, 2], [1, 2]])

        # test value
        b = preprocessing_sequence.pad_sequences(a, maxlen=3, value=1)
        self.assertAllClose(b, [[1, 1, 1], [1, 1, 2], [1, 2, 3]])
Example No. 19
    def build_model_input(self, data):
        model_input = {name: data[name] for name in self.sparse_features}
        if self.variable_length_features:
            for feat in self.variable_length_features:
                pad_variable_length_features = pad_sequences(
                    data[feat],
                    maxlen=self.variable_length_features_max_len[feat],
                    padding='post',
                )
                model_input[feat] = pad_variable_length_features
        return model_input
Example No. 20
    def __init__(self, timit_root):
        self.max_label_len = 0

        # load the dataset
        training_root = os.path.join(timit_root, 'TRAIN')
        test_root = os.path.join(timit_root, 'TEST')

        self.ph_org_train, self.train_input_length, self.train_label_length, self.x_train, self.y_train = self.load_split_timit_data(
            training_root)
        self.ph_org_test, self.test_input_length, self.test_label_length, self.x_test, self.y_test = self.load_split_timit_data(
            test_root)
        self.normalize_xs()
        self.train_padded_ph = pad_sequences(self.y_train,
                                             maxlen=self.max_label_len,
                                             padding='post',
                                             value=len(self.phonemes))
        self.test_padded_ph = pad_sequences(self.y_test,
                                            maxlen=self.max_label_len,
                                            padding='post',
                                            value=len(self.phonemes))
Example No. 21
def padding(sentences: List[List[int]], pad: int = None) -> List[List[int]]:
    """
    Pads sentences that have been converted with text2id.
    :param sentences: converted sentences to pad
    :param pad: maximum padding length
    :return: the padded sentences
    """
    return pad_sequences(sentences,
                         maxlen=pad,
                         truncating="post",
                         padding="post")
Example No. 22
def sent2oh(sentence, language='en', se=False, reverse=False):
    oh = list()
    if language == 'en':
        sequence = en_token.texts_to_sequences([sentence])
        sequence = pad_sequences(sequence, padding='post',
                                 maxlen=max_en_len)  # add padding
        if reverse:
            sequence = sequence[:, ::-1]
        for seq in sequence:
            oh.append(en_oh[seq])
    elif language == 'fr' and se == True:
        sequence = fr_se_token.texts_to_sequences([sentence])
        sequence = pad_sequences(sequence,
                                 padding='post',
                                 maxlen=max_fr_se_len)
        if reverse:
            sequence = sequence[:, ::-1]
        for seq in sequence:
            oh.append(fr_se_oh[seq])
    return np.array(oh)
Example No. 23
def tokenizer(text):
    token_text = token.texts_to_sequences(text)
    token_text = pad_sequences(token_text, maxlen=max_items, padding='pre', truncating='pre')
    result = model.predict(token_text)[0]
    print(result)
    for item in result:
        print(item)
    end_result = "Negative"
    if result >= 0.5:
        end_result = "Positive"
    return end_result
Example No. 24
def convertData(x):
    top_words = 15000
    tokenizer = Tokenizer(num_words=top_words)
    tokenizer.fit_on_texts(x)
    max_tokens = 228
    x_train_tokens = tokenizer.texts_to_sequences(x)
    x_train_pad = pad_sequences(x_train_tokens,
                                maxlen=max_tokens,
                                padding='pre',
                                truncating='pre')
    return x_train_pad
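Usage sketch (the two review strings are purely illustrative); every input comes back as a row of length max_tokens:

x_pad = convertData(['great movie', 'terrible plot'])
print(x_pad.shape)  # (2, 228)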
Example No. 25
def _input_text_to_pad_id(text, vocab_to_ids, tokenizer):
    data_id = [
        vocab_to_ids[token] if token in vocab_to_ids else WordVector.UNK_ID
        for token in tokenizer.tokenize(text.upper())
    ]
    data = sequence.pad_sequences([data_id],
                                  maxlen=MAXLEN,
                                  truncating='post',
                                  padding='post',
                                  value=WordVector.PAD_ID)
    return {'input': data}
Example No. 26
    def text_to_sequences(self):
        # tokenize (fitting) on the training data (set oov_token to True so new words in
        # X_test are not ignored)
        word_tokenizer = Tokenizer(oov_token=True)
        word_tokenizer.fit_on_texts(self.X_train)

        # length of the vocabulary
        self.vocab_length = len(word_tokenizer.word_index) + 1

        # text_to_sequences on both the training and the testing
        embedded_sentences_train = word_tokenizer.texts_to_sequences(
            self.X_train)
        embedded_sentences_test = word_tokenizer.texts_to_sequences(
            self.X_test)

        word_count = lambda sentence: len(word_tokenize(sentence))
        longest_sentence = max(self.X_train, key=word_count)
        self.length_long_sentence = len(word_tokenize(longest_sentence))

        self.padded_sentences_train = pad_sequences(embedded_sentences_train,
                                                    self.length_long_sentence,
                                                    padding='post')
        self.padded_sentences_test = pad_sequences(embedded_sentences_test,
                                                   self.length_long_sentence,
                                                   padding='post')

        embeddings_dictionary = dict()
        for line in self.glove_file:
            records = line.split()
            word = records[0]
            vector_dimensions = np.asarray(records[1:], dtype='float32')
            embeddings_dictionary[word] = vector_dimensions

        self.glove_file.close()

        # embedding matrix
        self.embedding_matrix = np.zeros((self.vocab_length, 300))
        for word, index in word_tokenizer.word_index.items():
            embedding_vector = embeddings_dictionary.get(word)
            if embedding_vector is not None:
                self.embedding_matrix[index] = embedding_vector
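A hedged follow-on sketch of where the matrix built above usually ends up, assuming from tensorflow.keras.layers import Embedding and a hypothetical helper on the same class:

    def build_embedding_layer(self):
        # Hypothetical helper: plugs the prepared matrix into a frozen Embedding layer.
        return Embedding(input_dim=self.vocab_length,
                         output_dim=300,
                         weights=[self.embedding_matrix],
                         input_length=self.length_long_sentence,
                         trainable=False)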
Example No. 27
def sequence_vectorize(train_texts, val_texts, number_of_features, max_sequence_length):

    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=number_of_features)
    tokenizer.fit_on_texts(train_texts)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > max_sequence_length:
        max_length = max_sequence_length

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index
Example No. 28
    def prepare_data(self):
        """
        Main data preparation.
        :return: the padded test sequences
        """
        (_, _), (x_test, y_test) = imdb.load_data(num_words=self.flags.vocab_size)
        # build word index and reverse word index
        self.build_word_index()
        self.build_reverse_word_index()
        self.x_test = x_test
        x_test = pad_sequences(x_test, maxlen=250, value=self.word_index['<PAD>'], padding='post')
        return x_test
Example No. 29
def Text_Pipeline(Data, tokenizer, MAX_LENGTH=50):
    """
    tokenizing and padding the sequences
    :param Data: text array
    :param tokenizer: tokenizer object
    :param MAX_LENGTH: sequnence sequence length
    :return pads :padded sequences
    """
    seqs = tokenizer.texts_to_sequences(Data)

    pads = pad_sequences(seqs, maxlen=MAX_LENGTH)
    return pads
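Usage sketch (tokenizer is assumed to be a Tokenizer already fitted on the training corpus):

pads = Text_Pipeline(['first headline', 'second headline'], tokenizer, MAX_LENGTH=50)
print(pads.shape)  # (2, 50)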
Example No. 30
def split_and_zero_padding(df, max_seq_length):
    # Split to dicts
    X = {'left': df['question1_n'], 'right': df['question2_n']}

    # Zero padding
    for dataset, side in itertools.product([X], ['left', 'right']):
        dataset[side] = pad_sequences(dataset[side],
                                      padding='pre',
                                      truncating='post',
                                      maxlen=max_seq_length)

    return dataset
Example No. 31
    def vectorize_texts(self, texts: List[str]) -> array:

        if self.tokenizer is not None:
            vectorized_texts: List[
                List[int]] = self.tokenizer.texts_to_sequences(texts)
        else:
            raise UntrainedBrainError

        padded_vectors: array = sequence.pad_sequences(
            vectorized_texts, maxlen=self._max_sequence_length)

        return padded_vectors
    def text_to_tokens(self, text, reverse=False, padding=False):
        tokens = self.texts_to_sequences([text])
        tokens = np.array(tokens)
        if reverse:
            tokens = np.flip(tokens, axis=1)
            truncating = 'pre'
        else:
            truncating = 'post'

        if padding:
            tokens = pad_sequences(tokens, maxlen=self.max_tokens, padding='pre', truncating=truncating)
        return tokens
    def encode_x_batch(self, x_batch):
        return pad_sequences([self.encode_x(x) for x in x_batch],
                             maxlen=self.length_range[1])