Example #1
import numpy as np
from tensorflow.keras.datasets import reuters


def reuters_raw(max_features=20000):

    index_offset = 3  # word index offset

    (x_train, y_train), (x_test, y_test) = reuters.load_data(
        num_words=max_features, index_from=index_offset
    )
    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    word_to_id = reuters.get_word_index()
    word_to_id = {k: (v + index_offset) for k, v in word_to_id.items()}
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2

    id_to_word = {value: key for key, value in word_to_id.items()}
    x_train = list(
        map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_train)
    )
    x_test = list(
        map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_test)
    )
    x_train = np.array(x_train, dtype=str)
    x_test = np.array(x_test, dtype=str)
    return (x_train, y_train), (x_test, y_test)
Example #2
    def _get_word_index(self):
        word_index = reuters.get_word_index()
        word_index = {k: (v + 3) for k, v in word_index.items()}
        word_index["<PAD>"] = 0
        word_index["<START>"] = 1
        word_index["<UNK>"] = 2  # unknown
        word_index["<UNUSED>"] = 3

        return word_index
Example #3
plt.ylabel('number of samples')
plt.show()

# A bit of EDA: a variation of countplot

fig, axe = plt.subplots(ncols=1)
fig.set_size_inches(12, 5)
sns.countplot(x=y_train, ax=axe)

# A bit of EDA: unique elements and their counts
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
print("Frequency of each class: ")
print(np.asarray((unique_elements, counts_elements)))

# download reuter word index
word_index = reuters.get_word_index(path="reuters_word_index.json")

# word index of 'the'
word_index['the']

# word index of 'it'
word_index['it']

# build the index-to-word mapping (indices are offset by 3 for the reserved tokens)
index_to_word = {index + 3: word for word, index in word_index.items()}

# retrieving words by their indices
print(index_to_word[4])
print(index_to_word[16])

# save the Reuters articles as per-class text files
def save_reuters():
    OUT_DIR = 'reuters'

    # Load data from keras API
    (x_train, y_train), (x_test, y_test) = reuters.load_data()

    # get word index
    word_index = reuters.get_word_index()
    # make dictionary to reference index
    word_list = {(value + 3): key for key, value in word_index.items()}
    # define an invalid string for the reserved indices so they can be removed later
    INVALID_STR = '#$%'
    word_list[0] = INVALID_STR
    word_list[1] = INVALID_STR
    word_list[2] = INVALID_STR

    # class names, taken from https://github.com/keras-team/keras/issues/12072
    class_list = [
        'cocoa', 'grain', 'veg-oil', 'earn', 'acq', 'wheat', 'copper',
        'housing', 'money-supply', 'coffee', 'sugar', 'trade', 'reserves',
        'ship', 'cotton', 'carcass', 'crude', 'nat-gas', 'cpi', 'money-fx',
        'interest', 'gnp', 'meal-feed', 'alum', 'oilseed', 'gold', 'tin',
        'strategic-metal', 'livestock', 'retail', 'ipi', 'iron-steel',
        'rubber', 'heat', 'jobs', 'lei', 'bop', 'zinc', 'orange', 'pet-chem',
        'dlr', 'gas', 'silver', 'wpi', 'hog', 'lead'
    ]

    # make train/test dirs and class dirs
    for cid, class_name in enumerate(class_list):
        os.makedirs(os.path.join(OUT_DIR, 'train',
                                 '{:02d}_{}'.format(cid, class_name)),
                    exist_ok=True)
        os.makedirs(os.path.join(OUT_DIR, 'test',
                                 '{:02d}_{}'.format(cid, class_name)),
                    exist_ok=True)

    # convert train data
    for num, (x_data, y_data) in enumerate(zip(x_train, y_train)):
        # make file path
        fpath = os.path.join(OUT_DIR, 'train',
                             '{:02d}_{}'.format(y_data, class_list[y_data]),
                             'train_{:05d}.txt'.format(num))
        with open(fpath, mode='w', encoding='utf-8') as f:
            # convert indices and join words with space
            word_org = ' '.join(word_list[inx] for inx in x_data)
            # remove invalid strings
            word_org = word_org.replace(INVALID_STR + ' ', '')
            # save text
            f.write(word_org)

    # convert test data
    for num, (x_data, y_data) in enumerate(zip(x_test, y_test)):
        # make file path
        fpath = os.path.join(OUT_DIR, 'test',
                             '{:02d}_{}'.format(y_data, class_list[y_data]),
                             'test_{:05d}.txt'.format(num))
        with open(fpath, mode='w', encoding='utf-8') as f:
            # convert indices and join words with space
            word_org = ' '.join(word_list[inx] for inx in x_data)
            # remove invalid strings
            word_org = word_org.replace(INVALID_STR + ' ', '')
            # save text
            f.write(word_org)

    print()
    print('Saved to ' + OUT_DIR + '/')
    print()
# Commented out IPython magic to ensure Python compatibility.
from tensorflow.keras.datasets import reuters

# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
(X_train, Y_train), (X_test, Y_test) = reuters.load_data(num_words=1000,
                                                         test_split=0.2)

print('Train data : {}'.format(len(X_train)))
print('Test data : {}'.format(len(X_test)))
num_classes = max(Y_train) + 1
print('Number of classes : {}'.format(num_classes))

word_index = reuters.get_word_index()
"""우리는  11,288개에 달하는 news를 46개의 topic에 맞춰서 classify해볼 것이다. 그래서 Word one hot encoding 및 모델 생성을 위한 모듈을 위에서 import 해주었고 아래 그림에서 데이터 전처리를 수행한다. Pad_sequences 등의 전처리 모듈을 이용한다. 또한 모델의 학습에 overfitting을 방지하는 EarlyStopping도 추가해준다. 이후 LSTM을 이용하여 모델을 생성하고 모델을 학습한다."""

index_word = {}
for key, value in word_index.items():
    index_word[value] = key

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

max_len = 100
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
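# The snippet above stops after padding. A minimal sketch of the LSTM model and
# training step that the description refers to might look like this; the layer
# sizes, optimizer, and epoch/batch settings below are assumptions, not values
# from the original.
y_train_onehot = to_categorical(Y_train)
y_test_onehot = to_categorical(Y_test)

model = Sequential()
model.add(Embedding(1000, 128))  # vocabulary capped by num_words=1000 above
model.add(LSTM(128))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(X_train, y_train_onehot,
                    batch_size=128, epochs=20,
                    validation_data=(X_test, y_test_onehot),
                    callbacks=[early_stopping])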
Example #6
    def get_text(self, data):
        # map word -> id from Keras, then invert it to id -> word
        word_id_index = reuters.get_word_index()
        id_word_index = {idx: word for word, idx in word_id_index.items()}
        # indices are offset by 3 for the reserved padding/start/unknown tokens
        return ' '.join(id_word_index.get(i - 3, '?') for i in data)
Example #7
import numpy as np
import string
import textwrap
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, Dense, Dropout, Input, LSTM, GRU, Bidirectional, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.datasets import reuters
"""Uses the [Reuters newswire](https://keras.io/api/datasets/reuters/) classification dataset, which has text paired with 46 topics as labels. You can see what these labels represent [here](https://martin-thoma.com/nlp-reuters/)."""

(X_train, y_train), (_, _) = reuters.load_data()

# https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
# Needed to encode our own articles later

word_dict = reuters.get_word_index()
word_dict = {k: (v + 3) for k, v in word_dict.items()}
word_dict["<PAD>"] = 0
word_dict["<START>"] = 1
word_dict["<UNK>"] = 2
word_dict["<UNUSED>"] = 3

vocab_size = len(word_dict.keys())

# Needed to decode training data into readable text

inverse_word_dict = {value: key for key, value in word_dict.items()}

X_train = pad_sequences(X_train)
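# inverse_word_dict is built above but never used in the snippet; a minimal
# sketch of decoding one (padded) training article back to text. Reserved
# indices show up as their placeholder tokens such as <PAD> and <START>.
decoded_article = ' '.join(inverse_word_dict.get(i, '<UNK>') for i in X_train[0])
print(decoded_article[:500])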
Example #8
def decode_review(index_review):
    word_index = reuters.get_word_index()
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
    # i - 3 because 0, 1, 2 are reserved indices for "padding", "start of sequence" and "unknown"
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in index_review])
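# Hypothetical usage sketch (the load_data call and variable names below are
# assumptions; the original snippet only defines the function):
from tensorflow.keras.datasets import reuters

(train_data, train_labels), _ = reuters.load_data(num_words=10000)
print(decode_review(train_data[0]))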
Example #9
    def run_reuters():
        # Extract useful data from dataset
        print('Extracting the Reuters dataset')
        (train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

        print(f"There are {len(train_data)} training examples and {len(test_data)} testing examples")

        # Illustration of the input data
        print(
            f'In this dataset the labels denote the topic of the piece. There are 46 topics represented, and they '
            f'are mutually exclusive.\nHaving taken the top 10,000 most-used words, no word index will exceed 10,000.\n'
            f'Max Index = {max([max(sequence) for sequence in train_data])}')

        print(
            f"For the sake of illustration, let's decode an article back to English (not printed here, for easier reading)")
        word_index = reuters.get_word_index()
        reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
        decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
        # print(decoded_review)

        # Encoding the inputs
        print("In order to pass these lists of integers into a neural network we must first encode them as tensors of "
              "uniform length.\nIn this example we'll use one-hot encoding, done manually for the sake of understanding.")

        def vectorise_sequences(sequences, dimension=10000):
            ret = np.zeros((len(sequences), dimension))
            for i, sequence in enumerate(sequences):
                ret[i, sequence] = 1
                if i < 1:
                    print(f"\n{sequence} => {ret[i]}\n")
            return ret

        x_train = vectorise_sequences(train_data)
        x_test = vectorise_sequences(test_data)

        print("For the labels this time around, there are a few options. A very common option is one-hot-encoding, for "
              "which Keras has an in-built function (a manual version is included in the code for educational purposes)")

        def to_one_hot(labels, dimension=46):
            ret = np.zeros((len(labels), dimension))
            for i, label in enumerate(labels):
                ret[i, label] = 1
            return ret

        one_hot_train_labels = to_categorical(train_labels)
        one_hot_test_labels = to_categorical(test_labels)

        # Design and compile the model
        print("Now to build the network, this time using parameters with greater configurability")
        model = models.Sequential()
        model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
        model.add(layers.Dense(64, activation='relu'))
        model.add(layers.Dense(46, activation='softmax'))

        model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001), loss='categorical_crossentropy',
                      metrics=[metrics.categorical_accuracy])

        # Divide the training data
        print("Creating a validation set for greater insight during training")
        x_val = x_train[:1000]  # Taking the 1st 1000 samples for validation
        partial_x_train = x_train[1000:]  # Leaving everything from 1000 onwards for training
        y_val = one_hot_train_labels[:1000]  # Taking the 1st 1000 labels for validation
        partial_y_train = one_hot_train_labels[1000:]  # Leaving everything from 1000 onwards for training

        # Train the model
        print("Begin training the model:")
        history = model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=512, validation_data=(x_val, y_val))
        history_dict = history.history

        print(f"\nNote that the history returned by the fit function has a 'history' member which is a dictionary. "
              f"The keys are: {history_dict.keys()}")  # ['loss', 'categorical_accuracy', 'val_loss', 'val_categorical_accuracy']

        # Prepare to plot the training and validation information
        loss_values = history_dict['loss']
        val_loss_values = history_dict['val_loss']
        acc_values = history_dict['categorical_accuracy']
        val_acc_values = history_dict['val_categorical_accuracy']

        epochs = range(1, len(history_dict['categorical_accuracy']) + 1)
        plt.plot(epochs, loss_values, 'bo', label='Training Loss')
        plt.plot(epochs, val_loss_values, 'b', label='Validation Loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()

        plt.clf()
        plt.plot(epochs, acc_values, 'bo', label='Training Accuracy')
        plt.plot(epochs, val_acc_values, 'b', label='Validation Accuracy')
        plt.title('Training and Validation Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.show()

        # Evaluate the model
        print("\nAfter reviewing each plot, evaluate the performance of the model on new data")
        results = model.evaluate(x_test, one_hot_test_labels)
        print(f"Evaluation Results: Loss = {results[0]}    Accuracy = {results[1] * 100}%")
Example #10
# plt.hist([len(s) for s in x_train], bins=50)
# plt.show()

# distribution of y
unique_elements, count_elements = np.unique(y_train, return_counts=True)
print('Distribution of y: ',
      dict(zip(unique_elements,
               count_elements)))  # zip pairs each class with its count, e.g. 0 with 55, 1 with 432
print('=============================================================')

# plt.hist(y_train, bins = 46)
# plt.show()

# word index for x
word_to_index = reuters.get_word_index()  # only available for the Keras datasets
print(word_to_index)
print(type(word_to_index))
print('=============================================================')

# swap keys and values
index_to_word = {}

for key, value in word_to_index.items():
    index_to_word[value] = key

# after swapping keys and values
print(index_to_word)
print(index_to_word[1])  # the
print(len(index_to_word))  # 30979
print(index_to_word[30979])  # northerly
Example #11
import numpy as np
from tensorflow.keras.datasets import reuters

(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

class_names = ['cocoa','grain','veg-oil','earn','acq','wheat','copper','housing','money-supply',
   'coffee','sugar','trade','reserves','ship','cotton','carcass','crude','nat-gas',
   'cpi','money-fx','interest','gnp','meal-feed','alum','oilseed','gold','tin',
   'strategic-metal','livestock','retail','ipi','iron-steel','rubber','heat','jobs',
   'lei','bop','zinc','orange','pet-chem','dlr','gas','silver','wpi','hog','lead']

print(f"label {class_names[train_labels[0]]}")

print(train_data[0])
print(train_labels[0])

word_to_indx = reuters.get_word_index()
print(word_to_indx)

inverted_word_index = dict([(value, key) for key, value in word_to_indx.items()])
# subtract 3 because indices 0-2 are reserved for padding/start/unknown
text_news = " ".join(inverted_word_index.get(i - 3, "?") for i in train_data[0])
print(text_news)

def bag_of_words(text_samples, max_elements=10000):
    # multi-hot encode each sequence of word indices into a fixed-size vector
    output = np.zeros(shape=(len(text_samples), max_elements))
    for i, sequence in enumerate(text_samples):
        output[i, sequence] = 1
    return output


x_train = bag_of_words(train_data)
x_test = bag_of_words(test_data)
Example #12
import numpy as np
import pandas as pd
import torch
import torch.utils.data as data_utils
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# `preprocess` (used in the binary branch below) is assumed to be defined elsewhere.


def load_data_set(type, max_len, vocab_size, batch_size):
    """
	Loads the dataset. Keras Imdb dataset for binary classifcation. Keras reuters dataset for multiclass classification

	Args:
	type   : {bool} 0 for binary classification returns imdb dataset. 1 for multiclass classfication return reuters set
	max_len: {int} timesteps used for padding
	vocab_size: {int} size of the vocabulary
	batch_size: batch_size
	Returns:
	train_loader: {torch.Dataloader} train dataloader
	x_test_pad  : padded tokenized test_data for cross validating
	y_test      : y_test
	word_to_id  : {dict} words mapped to indices


	"""
    INDEX_FROM = 3  # reserved indices: 0=<PAD>, 1=<START>, 2=<UNK>, 3=<EOS>
    if not bool(type):

        NUM_WORDS = vocab_size  # only use the top vocab_size words
        dataset = pd.read_csv('df_raw_text2.csv')

        dataset = dataset[~dataset.TEXTOS.isnull()]
        dataset.drop_duplicates(subset="DOCS_ID", keep='first', inplace=True)
        dataset = dataset[dataset.V_AMB != 's/d']
        dataset.drop_duplicates(subset="TEXTOS", keep='first', inplace=True)

        texts1 = [' '.join(txt.splitlines())
                  for txt in dataset['TEXTOS']]  # remove newlines
        texts2 = [preprocess(txt)
                  for txt in texts1]  # preprocess, i.e. remove punctuation, lowercase, etc.

        t = Tokenizer(num_words=10000,
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      lower=True,
                      split=" ",
                      char_level=False,
                      oov_token=1)

        t.fit_on_texts(texts2)
        x_test_seq = t.texts_to_sequences(texts2)

        word_to_id = {k: v for k, v in t.word_index.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<UNK>"] = 1
        id_to_word = {value: key for key, value in word_to_id.items()}

        y = [1 if elem == 'si' else 0 for elem in dataset.VIOLENCIA_DE_GENERO]

        x = np.array(x_test_seq)
        y = np.array(y)

        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            train_size=.75,
                                                            random_state=1)

        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)

        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.DoubleTensor))
        train_loader = data_utils.DataLoader(train_data,
                                             batch_size=batch_size,
                                             drop_last=True)
        return train_loader, x_test_pad, y_test, word_to_id

    else:
        from tensorflow.keras.datasets import reuters

        train_set, test_set = reuters.load_data(path="reuters.npz",
                                                num_words=vocab_size,
                                                skip_top=0,
                                                index_from=INDEX_FROM)
        x_train, y_train = train_set[0], train_set[1]
        x_test, y_test = test_set[0], test_set[1]
        word_to_id = reuters.get_word_index(path="reuters_word_index.json")
        word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        word_to_id['<EOS>'] = 3
        id_to_word = {value: key for key, value in word_to_id.items()}
        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)

        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.LongTensor))
        train_loader = data_utils.DataLoader(train_data,
                                             batch_size=batch_size,
                                             drop_last=True)
        return train_loader, train_set, test_set, x_test_pad, word_to_id
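# A minimal usage sketch for the multiclass (Reuters) branch. The argument
# values chosen here are assumptions, not values from the original snippet.
train_loader, train_set, test_set, x_test_pad, word_to_id = load_data_set(
    type=1, max_len=400, vocab_size=10000, batch_size=32)

for batch_x, batch_y in train_loader:
    print(batch_x.shape, batch_y.shape)  # e.g. torch.Size([32, 400]) torch.Size([32])
    break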