Example #1
import os

# The call below matches the Keras Tokenizer API (num_words, oov_token),
# so that import is assumed here.
from tensorflow.keras.preprocessing.text import Tokenizer


def read(self):
    # Read the PTB validation split shipped alongside this module.
    with open(os.path.dirname(__file__) + '/data/ptb.valid.txt') as file:
        lines = file.readlines()
    # Keep the 9999 most frequent words; oov_token must be a string, not
    # an int, so unknown words map to a dedicated '<unk>' index.
    tokenizer = Tokenizer(num_words=9999, oov_token='<unk>')
    tokenizer.fit_on_texts(lines)
    self.seqs = tokenizer.texts_to_sequences(lines)
    return self.seqs
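For reference, a minimal, self-contained sketch with toy data (assuming the Keras Tokenizer, as above) of what texts_to_sequences returns: each line becomes a list of integer word indices, with index 1 reserved for the OOV token.

from tensorflow.keras.preprocessing.text import Tokenizer

toy_lines = ['the cat sat', 'the dog sat down']
tok = Tokenizer(num_words=100, oov_token='<unk>')
tok.fit_on_texts(toy_lines)
print(tok.texts_to_sequences(toy_lines))     # e.g. [[2, 4, 3], [2, 5, 3, 6]]
print(tok.texts_to_sequences(['the bird']))  # unseen 'bird' maps to OOV index 1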
Example #2

import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from sklearn.model_selection import train_test_split
from Tokenizer import Tokenizer

# Load the sarcasm-headlines dataset (one JSON record per line) and keep
# the first 10000 examples.
data = pd.read_json('raw_data/data.json', lines=True)
raw_text = data['headline'].values[:10000]
raw_labels = data['is_sarcastic'].values[:10000]

# Fit the custom Tokenizer on the headlines, convert them to integer
# sequences, and zero-pad every sequence to the longest headline's length.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(raw_text)
tokenized_headlines = tokenizer.transform(raw_text)
max_length = max(len(x) for x in tokenized_headlines)
padded_headlines = tf.keras.preprocessing.sequence.pad_sequences(
    tokenized_headlines, maxlen=max_length, padding='post')
# Persist the fitted tokenizer so the same vocabulary can be reused later.
with open('glove_embedding/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

# One-hot encode the binary sarcastic/not-sarcastic labels.
onehot_labels = tf.keras.utils.to_categorical(raw_labels, num_classes=2)

# Hold out 40% of the examples as a test set.
train_features, test_features, train_labels, test_labels = train_test_split(
    np.array(padded_headlines), np.array(onehot_labels), test_size=0.4)

# Save the splits as NumPy arrays for the training script.
np.save('processed_data/x.npy', train_features)
np.save('processed_data/y.npy', train_labels)
np.save('processed_data/test_x.npy', test_features)
np.save('processed_data/test_y.npy', test_labels)
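A downstream training or inference script can then reload these artifacts. A minimal sketch (the model itself is not part of this example, so nothing below is prescribed by it):

import pickle

import numpy as np

# Reload the preprocessed splits saved above.
train_x = np.load('processed_data/x.npy')
train_y = np.load('processed_data/y.npy')

# Reload the fitted tokenizer; the custom Tokenizer class must be
# importable (e.g. on PYTHONPATH) for unpickling to succeed.
with open('glove_embedding/tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

print(train_x.shape, train_y.shape)  # e.g. (6000, max_length) and (6000, 2)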