Example #1
def embToEnglish(str):
    word_index = reuters.get_word_index()
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
    decoded_review = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in str])
    return decoded_review
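A quick usage sketch (assumed, not from the source): load a newswire with reuters.load_data and decode it.

from keras.datasets import reuters

(train_data, train_labels), _ = reuters.load_data(num_words=10000)
print(embToEnglish(train_data[0]))  # prints the first newswire as readable text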
Example #2
    def load_data(self, sample_size=None):
        print('Load Data...')
        (X_train, y_train), (X_test, y_test) = reuters.load_data(
            start_char=None, index_from=None, nb_words=self.word_vocab_size)
        if sample_size:
            sample_indices_train = random.sample(range(len(X_train)),
                                                 sample_size)
            X_train = itemgetter(*sample_indices_train)(X_train)
            y_train = itemgetter(*sample_indices_train)(y_train)

            sample_indices_test = random.sample(range(len(X_test)),sample_size)
            X_test = itemgetter(*sample_indices_test)(X_test)
            y_test = itemgetter(*sample_indices_test)(y_test)
        index_word = dict((v, k) for k, v in reuters.get_word_index().items())
        X_train_char = [[index_word[idx] for idx in x] for x in X_train]
        X_test_char = [[index_word[idx] for idx in x] for x in X_test]
        X_test_char, X_train_char, vocab_char_size = \
            self.tokenize(X_test_char, X_train_char)
        X_test, X_train, X_test_char, X_train_char = \
            self.pad(X_test_char, X_train_char,X_test, X_train)
        nb_classes = max(np.max(y_train), np.max(y_test)) + 1
        Y_train = np_utils.to_categorical(y_train, nb_classes)
        Y_test = np_utils.to_categorical(y_test, nb_classes)
        return X_train, X_train_char, Y_train, X_test, X_test_char, Y_test, \
               vocab_char_size, nb_classes
Example #3
def decode_words():
    word_index = reuters.get_word_index()
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
    decoded_words = ' '.join(
        [reverse_word_index.get(i - 3, "?") for i in train_data[0]])
    return decoded_words
Example #4
def reuters_to_hdf5(file_name):
    if os.path.exists(file_name):
        return

    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=1000)
    word_index = reuters.get_word_index(path="reuters_word_index.json")
    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    num_classes = max(numpy.max(y_train), numpy.max(y_test)) + 1  # 46 topics in the Reuters dataset
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    x_train = x_train.astype('object')
    x_test = x_test.astype('object')

    x = numpy.concatenate((x_train, x_test), axis=0)
    x = sequence.pad_sequences(x, maxlen=400)
    y = numpy.concatenate((y_train, y_test), axis=0)
    #    y = sequence.pad_sequences(y, maxlen=400)

    with h5py.File(file_name, 'w') as f:
        f.create_dataset('x', data=x, compression='gzip')
        f.create_dataset('y', data=y, compression='gzip')
        dictionary = f.create_group('dictionary')
        for key in word_index:
            dictionary[key] = word_index[key]
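A minimal sketch (assumed usage, not part of the source) of reading such a file back with h5py; the file name is whatever was passed to reuters_to_hdf5:

import h5py

with h5py.File('reuters.h5', 'r') as f:   # hypothetical file name
    x = f['x'][:]                          # padded sequences, shape (11228, 400)
    y = f['y'][:]                          # one-hot labels
    word_index = {key: int(f['dictionary'][key][()]) for key in f['dictionary']}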
Example #5
def decodeToText(textSequence):
    wordIndex = reuters.get_word_index()
    reverseWordIndex = dict([(value, key)
                             for (key, value) in wordIndex.items()])
    decodedText = ' '.join(
        [reverseWordIndex.get(i - 3, '?') for i in textSequence])
    return decodedText
Example #6
def running_retuter(modelname):
    maxlen = 400
    max_words = 10000

    # 1. Loading started
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words, test_split=0.2)

    word_index = reuters.get_word_index(path="reuters_word_index.json")
    num_classes = np.max(y_train) + 1

    # 2. pad_sequences
    keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)

    if modelname == 'cnn':
        x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen)
        x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen)
        y_train = keras.utils.to_categorical(y_train, num_classes)
        y_test = keras.utils.to_categorical(y_test, num_classes)

    elif modelname == 'nn':
        tokenizer = Tokenizer(num_words=max_words)
        x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
        x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

        y_train = keras.utils.to_categorical(y_train, num_classes)
        y_test = keras.utils.to_categorical(y_test, num_classes)

    bulidModel(modelname, num_classes, x_test, y_test, x_train, y_train)
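bulidModel is not shown in this example; a minimal sketch of what such a helper could look like (an assumption, with the argument order taken from the call above):

def bulidModel(modelname, num_classes, x_test, y_test, x_train, y_train):
    model = keras.models.Sequential()
    if modelname == 'cnn':
        # embedding + 1-D convolution over the padded index sequences
        model.add(keras.layers.Embedding(10000, 50, input_length=400))
        model.add(keras.layers.Conv1D(100, 5, activation='relu'))
        model.add(keras.layers.GlobalMaxPooling1D())
    else:
        # dense network over the binary bag-of-words matrix
        model.add(keras.layers.Dense(512, activation='relu',
                                     input_shape=(x_train.shape[1],)))
        model.add(keras.layers.Dropout(0.5))
    model.add(keras.layers.Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_train, y_train, batch_size=32, epochs=2, validation_split=0.1)
    print(model.evaluate(x_test, y_test, verbose=0))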
Example #7
def decode_data(sentence_vec):
    word_index = reuters.get_word_index()
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
    decoded_newswire = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in sentence_vec])
    return decoded_newswire
Example #8
def transpose_word(train_data):
    word_index = reuters.get_word_index()
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
    decoded_newswire = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    return decoded_newswire
Example #9
def keras_reuters_info():
    (X_train, y_train), (X_test, y_test) = reuters.load_data(path=os.path.join(
        root_path, "data", "reuters", "reuters.npz"),
                                                             skip_top=0,
                                                             maxlen=None,
                                                             test_split=0.2,
                                                             seed=113,
                                                             start_char=1,
                                                             oov_char=2,
                                                             index_from=3)
    logger.info(X_train.shape)
    logger.info(y_train.shape)
    logger.info(X_test.shape)
    logger.info(y_test.shape)

    word_index = reuters.get_word_index(
        os.path.join(root_path, "data", "reuters", "reuters_word_index.json"))
    logger.info(word_index)

    num_words = max(max([len(x)
                         for x in X_train]), max([len(x) for x in X_test])) + 1
    num_classify = max(max(y_train), max(y_test)) + 1
    num_vocab = max(max([max(x)
                         for x in X_train]), max([max(x) for x in X_test])) + 1

    logger.info("num_words {0}".format(num_words))
    logger.info("num_classify {0}".format(num_classify))
    logger.info("num_voc {0}".format(num_vocab))
Example #10
def decode(wire_list):
    word_index = reuters.get_word_index()
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])

    # Try to get the key in the reverse_word_index with a default value of '?' and join with spaces
    decoded_newswire = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in wire_list])
    return decoded_newswire
Example #11
def reuters_news_wire_texts():
    (x_train, y_train), (x_test, y_test) = reuters.load_data()
    wordDict = {y: x for x, y in reuters.get_word_index().items()}
    texts = []
    for x in x_train:
        texts.append(" ".join([
            wordDict.get(index - 3) for index in x
            if wordDict.get(index - 3) is not None
        ]))
    return texts, y_train
Example #12
def decode_sample(datapoint, word_index=None, reverse_word_index=None):
	
	if word_index is None:
		word_index = reuters.get_word_index()
	
	if reverse_word_index is None:
		reverse_word_index = dict([(v,k) for (k,v) in word_index.items()])

	text = ' '.join(reverse_word_index.get(i-3,'?') for i in datapoint)
	return text
Example #13
def num2word(input_data):
    word_index = reuters.get_word_index()

    print('Dictionary size: ' + str(len(word_index)))

    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])

    result = ' '.join(reverse_word_index.get(i - 3, '?') for i in input_data)

    return result
Example #14
def train():
    (train_data,
     train_labels), (test_data,
                     test_labels) = reuters.load_data(num_words=10000)
    print(len(train_data))
    print(len(train_labels))
    print(train_data[10])
    word_index = reuters.get_word_index()
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
    decoded_newswire = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    print(train_labels[10])

    x_train = vectorize_sequences(train_data)
    x_test = vectorize_sequences(test_data)

    # one_hot_train_labels = to_one_hot(train_labels)
    # one_hot_test_labels  = to_one_hot(test_labels)

    one_hot_train_labels = to_categorical(train_labels)
    one_hot_test_labels = to_categorical(test_labels)

    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(10000, )))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(46, activation='softmax'))

    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    x_val = x_train[:1000]
    partial_x_train = x_train[1000:]

    y_val = one_hot_train_labels[:1000]
    partial_y_train = one_hot_train_labels[1000:]

    history = model.fit(partial_x_train,
                        partial_y_train,
                        epochs=20,
                        batch_size=512,
                        validation_data=(x_val, y_val))

    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(loss) + 1)
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
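vectorize_sequences (and the commented-out to_one_hot) are assumed to be defined elsewhere in that project; a minimal sketch consistent with the calls above:

import numpy as np

def vectorize_sequences(sequences, dimension=10000):
    # multi-hot encode each index sequence into a fixed-length vector
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results

def to_one_hot(labels, dimension=46):
    # one-hot encode the 46 Reuters topic labels
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results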
Example #15
    def decode_review(self):
        word_index = reuters.get_word_index()  # dictionary mapping words to integer index

        # Map integer indices to words
        reverse_word_index = dict([(value, key)
                                   for (key, value) in word_index.items()])

        # Indices offset by 3, as 0, 1 and 2 reserved for padding, start of sequence and unknown
        decoded_newswire = ' '.join(
            [reverse_word_index.get(i - 3, '?') for i in self.train_data[0]])
        print(decoded_newswire)
Example #16
def word_map():
    global reverse_word_index
    # A dictionary mapping words to an integer index
    word_index = reuters.get_word_index()
    # The first indices are reserved
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
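With the shifted index built here, a sample decodes without the usual i - 3 offset; a hypothetical usage sketch:

(train_data, train_labels), _ = reuters.load_data(num_words=10000)
word_map()
print(' '.join(reverse_word_index.get(i, '?') for i in train_data[0]))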
Example #17
def main():
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None,
                                                             test_split=0.2)
    word_index = reuters.get_word_index(path="reuters_word_index.json")
    print('# of Training Samples: {}'.format(len(x_train)))
    print('# of Test Samples: {}'.format(len(x_test)))

    num_classes = max(y_train) + 1
    print('# of Classes: {0}'.format(num_classes))

    max_words = 10000

    tokenizer = Tokenizer(num_words=max_words)
    x_train = tokenizer.sequences_to_matrix(x_train, mode='count')
    x_test = tokenizer.sequences_to_matrix(x_test, mode='count')

    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    print(x_train[0])
    print(len(x_train[0]))
    print(max(x_train[0]))

    print(y_train[0])
    print(len(y_train[0]))

    model = Sequential()
    model.add(Dense(512, input_shape=(max_words, )))
    # model.add(Activation('relu'))
    model.add(Activation('exponential'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.metrics_names)

    batch_size = 32
    epochs = 2

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_split=0.1)
    score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)

    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
Example #18
def test_reuters():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict)
Example #20
def multi_dataset_test():
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
        assert len(x_train) == len(y_train) == 60000
        assert len(x_test) == len(y_test) == 10000
        (x_train, y_train), (x_test, y_test) = boston_housing.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict)
        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        assert len(x_train) == len(y_train) == 60000
        assert len(x_test) == len(y_test) == 10000
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict)
        (x_train, y_train), (x_test, y_test) = cifar10.load_data()
        cifarDefaultTrainLength = 50000
        cifarDefaultTestLength = 10000
        assert len(x_train) == len(y_train) == cifarDefaultTrainLength
        assert len(x_test) == len(y_test) == cifarDefaultTestLength

        (x_train, y_train), (x_test, y_test) = cifar100.load_data('fine')
        cifarFineTrainLength = 50000
        cifarFineTestLength = 10000
        assert len(x_train) == len(y_train) == cifarFineTrainLength
        assert len(x_test) == len(y_test) == cifarFineTestLength

        (x_train, y_train), (x_test, y_test) = cifar100.load_data('coarse')
        cifarCoarseTrainLength = 50000
        cifarCoarseTestLength = 10000
        assert len(x_train) == len(y_train) == cifarCoarseTrainLength
        assert len(x_test) == len(y_test) == cifarCoarseTestLength
Example #21
def train():
    (train_data,
     train_labels), (test_data,
                     test_labels) = reuters.load_data(num_words=10000)

    word_index = reuters.get_word_index()
    reverse_word_index = dict([(value, key)
                               for (key, value) in word_index.items()])
    # Note that our indices were offset by 3
    # because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
    decoded_newswire = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in train_data[0]])

    # Our vectorized training data
    x_train = vectorize_sequences(train_data)
    # Our vectorized test data
    x_test = vectorize_sequences(test_data)

    # Our vectorized training labels
    one_hot_train_labels = to_one_hot(train_labels)
    # Our vectorized test labels
    one_hot_test_labels = to_one_hot(test_labels)

    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(10000, )))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(46, activation='softmax'))

    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    x_val = x_train[:1000]
    partial_x_train = x_train[1000:]

    y_val = one_hot_train_labels[:1000]
    partial_y_train = one_hot_train_labels[1000:]

    return model.fit(partial_x_train,
                     partial_y_train,
                     epochs=20,
                     batch_size=512,
                     validation_data=(x_val, y_val))
Example #22
def load_retures_keras(text=False):

    from keras.datasets import reuters
    from keras.preprocessing.sequence import pad_sequences

    max_words = 10000

    print('Loading data...')
    (x, y), (_, _) = reuters.load_data(num_words=max_words, test_split=0.0)

    if not text:
        num_classes = np.max(y) + 1
        print(num_classes, 'classes')
        print('Vectorizing sequence data...')
        x = pad_sequences(x, maxlen=250)
        print('x_train shape:', x.shape)

        return x.astype(float), y
    else:
        word_index = reuters.get_word_index()

        word_index = {k: (v + 3) for k, v in word_index.items()}
        word_index["<PAD>"] = 0
        word_index["<START>"] = 1
        word_index["<UNK>"] = 2  # unknown
        word_index["<UNUSED>"] = 3

        reverse_word_index = dict([(value, key)
                                   for (key, value) in word_index.items()])

        def decode_review(text):
            return ' '.join([reverse_word_index.get(i, '?') for i in text])

        all_sentence = []
        for sent in x:
            all_sentence.append(decode_review(sent))

        return np.array(all_sentence), y
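A hypothetical usage note showing the two return modes:

x_num, y = load_retures_keras(text=False)  # (num_samples, 250) float matrix of padded indices
texts, y = load_retures_keras(text=True)   # NumPy array of decoded newswire strings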
Example #23
def main():
    # model parameters:
    maxlen = 400
    max_words = 10000
    batch_size = 32
    epochs = 20
    embedding_dims = 50
    cnn_filters = 100
    cnn_kernel_size = 5
    dense_hidden_dims = 200

    # 1. Loading started
    (x_train, y_train), (x_test,
                         y_test) = reuters.load_data(num_words=max_words,
                                                     test_split=0.2)
    word_index = reuters.get_word_index(path="reuters_word_index.json")

    num_classes = max(y_train) + 1
    # 2. pad_sequences
    x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen)
    x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen)

    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    # 3. Build CNN model...
    model = Sequential()
    model.add(Embedding(max_words, embedding_dims, input_length=maxlen))
    model.add(Dropout(0.2))
    model.add(
        Conv1D(cnn_filters,
               cnn_kernel_size,
               padding='valid',
               activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(dense_hidden_dims, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='sigmoid'))
    model.summary()

    # 4. compile network
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    # 5.  train model
    history = model.fit(x_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_split=0.1)

    # 6. evaluate model
    loss_and_metrics = model.evaluate(x_test, y_test, batch_size, verbose=1)
    print('Test loss:{}\nTest accuracy:{}'.format(loss_and_metrics[0],
                                                  loss_and_metrics[1]))

    # Create a graph of accuracy and loss over time
    history_dict = history.history
    history_dict.keys()

    acc = history_dict['categorical_accuracy']
    val_acc = history_dict['val_categorical_accuracy']
    loss = history_dict['loss']
    val_loss = history_dict['val_loss']

    epochs = range(1, len(acc) + 1)
    # "bo" is for "blue dot"
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
Example #24
def decode_newswire(newswire):
    word_index = reuters.get_word_index()
    reverse_word_index = dict([(value, word) for (word, value) in word_index.items()])
    decoded_newswire = ' '.join([reverse_word_index.get(value - 3, '?') for value in newswire])
    print(decoded_newswire)
Example #25
def print_newswire(newswire):
    word_index = reuters.get_word_index()
    word_lookup = dict([(value, key) for (key, value) in word_index.items()])
    print(' '.join([word_lookup.get(i - 3, '?') for i in newswire]))
Example #26
def load_data_set(type, max_len, vocab_size, batch_size):
    """
    Loads a dataset: the Keras IMDB dataset for binary classification, or the
    Keras Reuters dataset for multiclass classification.

    Args:
        type      : {bool} 0 returns the IMDB set (binary classification),
                    1 returns the Reuters set (multiclass classification)
        max_len   : {int} timesteps used for padding
        vocab_size: {int} size of the vocabulary
        batch_size: batch size
    Returns:
        train_loader: {torch.DataLoader} train dataloader
        x_test_pad  : padded, tokenized test data for cross-validation
        y_test      : y_test
        word_to_id  : {dict} words mapped to indices
    """

    INDEX_FROM = 3  # word index offset
    if not bool(type):
        NUM_WORDS = vocab_size  # only use the top `vocab_size` words
 
        train_set,test_set = imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM)
        x_train,y_train = train_set[0],train_set[1]
        x_test,y_test = test_set[0],test_set[1]
        word_to_id = imdb.get_word_index()
        word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
 
        id_to_word = {value:key for key,value in word_to_id.items()}
        x = np.concatenate([x_train, x_test])
        y = np.concatenate([y_train, y_test])
        n_train = x.shape[0] - 1000
        n_valid = 1000
 
        x_train = x[:n_train]
        y_train = y[:n_train]
        x_test = x[n_train:n_train+n_valid]
        y_test = y[n_train:n_train+n_valid]
 
 
        #embeddings = load_glove_embeddings("../../GloVe/glove.6B.50d.txt",word_to_id,50)
        x_train_pad = pad_sequences(x_train,maxlen=max_len)
        x_test_pad = pad_sequences(x_test,maxlen=max_len)
 
 
        train_data = data_utils.TensorDataset(torch.from_numpy(x_train_pad).type(torch.LongTensor),torch.from_numpy(y_train).type(torch.DoubleTensor))
        train_loader = data_utils.DataLoader(train_data,batch_size=batch_size,drop_last=True)
        return train_loader,x_test_pad,y_test,word_to_id
       
    else:
        from keras.datasets import reuters
 
        train_set,test_set = reuters.load_data(path="reuters.npz",num_words=vocab_size,skip_top=0,index_from=INDEX_FROM)
        x_train,y_train = train_set[0],train_set[1]
        x_test,y_test = test_set[0],test_set[1]
        word_to_id = reuters.get_word_index(path="reuters_word_index.json")
        word_to_id = {k:(v+3) for k,v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        word_to_id['<EOS>'] = 3
        id_to_word = {value:key for key,value in word_to_id.items()}
        x_train_pad = pad_sequences(x_train,maxlen=max_len)
        x_test_pad = pad_sequences(x_test,maxlen=max_len)
 
 
        train_data = data_utils.TensorDataset(torch.from_numpy(x_train_pad).type(torch.LongTensor),torch.from_numpy(y_train).type(torch.LongTensor))
        train_loader = data_utils.DataLoader(train_data,batch_size=batch_size,drop_last=True)
        return train_loader,train_set,test_set,x_test_pad,word_to_id
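A hypothetical usage sketch for the Reuters branch (assumes PyTorch is installed and reuses the sizes above):

train_loader, train_set, test_set, x_test_pad, word_to_id = load_data_set(
    type=1, max_len=400, vocab_size=10000, batch_size=32)
for x_batch, y_batch in train_loader:
    print(x_batch.shape, y_batch.shape)  # torch.Size([32, 400]) torch.Size([32])
    break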
Example #27
def decodeToWords(sequence):
	wordIndex=reuters.get_word_index()
	revIndex=dict(
		[(value,key)for (key,value) in wordIndex.items()])
	decWords=" ".join([revIndex.get(i-3,'?') for i in sequence])
	return decWords
Example #28
# 1. Data
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=10000,
                                                         test_split=0.2)
# Load only the 10,000 most frequently used words
print("x_train.shape, x_test.shape: ", x_train.shape, x_test.shape)
# x_train.shape, x_test.shape:  (8982,) (2246,)
print("y_train.shape, y_test.shape: ", y_train.shape, y_test.shape)
# y_train.shape, y_test.shape:  (8982,) (2246,)
print("First training newswire: \n", x_train[0])
# Prints only the word indices, as a list
print("Label of the first training newswire: \n", y_train[0])
# Prints only the label index

# Check which word each integer in x_train stands for
word_index = reuters.get_word_index()
print("word_index of the x data: \n", word_index)
# Prints the index of every word, as a dictionary

# Convert the indices back into words
from keras.preprocessing.text import Tokenizer
token = Tokenizer()
token.fit_on_texts(reuters.get_word_index())  # fit_on_texts returns None, so don't assign the result
word = token.sequences_to_texts(x_train[0:1])
print("Words of the first x_train sample: \n", word)

# Want to check the shape of x_train?
# Each sample is a plain Python list, so it has no shape attribute
print(len(x_train[0]))  # 87

# Print the number of categories in y
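The snippet breaks off after this comment; a likely continuation (an assumption) would be:

import numpy as np
print("Number of categories in y:", len(np.unique(y_train)))  # 46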
Example #29
def translate(sentence):
    word_index = reuters.get_word_index()
    reverse_index = dict([(value, key) for (key, value) in word_index.items()])
    decoded = ' '.join([reverse_index.get(i - 3, '*') for i in sentence])
    return decoded
Example #30

import keras

from keras.datasets import reuters
# Restrict the vocabulary to the 10,000 most frequent words
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

# Convert a sample's list of integers back into words
word_index = reuters.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# Note that our indices were offset by 3
# because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
decoded_newswire = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
print(decoded_newswire)



# Prepare the data: vectorize the sequences
import numpy as np

def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results

# Our vectorized training data
x_train = vectorize_sequences(train_data)
# Our vectorized test data
x_test = vectorize_sequences(test_data)
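A tiny worked example (hypothetical input, dimension reduced to 8) of what vectorize_sequences produces:

print(vectorize_sequences([[1, 3, 5]], dimension=8))
# [[0. 1. 0. 1. 0. 1. 0. 0.]]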
Example #31
    model.add(Dropout(0.25))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
    model.summary()
    return model


import pandas as pd
from keras.datasets import reuters
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence

#word_dict = imdb.get_word_index()
word_dict = reuters.get_word_index()


def encode_sentence(text):
    result = []
    arr = text_to_word_sequence(
        text, lower=True, split=" ")  # returns list of words (like split)
    for word in arr:
        w = encode_word(word)
        if w is not None:
            result.append(w)
    return result


def encode_word(word):
    # The source snippet is truncated here; the assumed intent, given that
    # encode_sentence filters out None, is:
    if word not in word_dict:
        return None
    return word_dict[word]
Example #32
import keras
from keras.datasets import reuters
#Using TensorFlow backend.
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)
word_index = reuters.get_word_index(path="reuters_word_index.json")

print('# of Training Samples: {}'.format(len(x_train)))
print('# of Test Samples: {}'.format(len(x_test)))

num_classes = max(y_train) + 1
print('# of Classes: {}'.format(num_classes))
# of Training Samples: 8982
# of Test Samples: 2246
# of Classes: 46
index_to_word = {}
for key, value in word_index.items():
    index_to_word[value] = key
print(' '.join([index_to_word[x] for x in x_train[0]]))
print(y_train[0])

from keras.preprocessing.text import Tokenizer

max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print(x_train[0])
Example #33
    theano.config.optimizer = 'fast_run'  # use 'fast_compile' or 'None' for debugging
    theano.config.linker = 'py'
    theano.config.floatX = 'float32'

    print('initialising...')

    V = 1001
    E = 12
    total_trainset = 10000
    total_iterations = 9000
    train_x_entropy = 0

    (X_train, y_train), (X_test, y_test) = reuters.load_data(path="reuters.pkl", \
                nb_words=None, skip_top=0, maxlen=None, test_split=0.1, seed=10086)

    word_map_tmp = reuters.get_word_index(path="reuters_word_index.pkl")
    word_dict = dict((v, k) for k, v in word_map_tmp.items())
    word_dict[0] = "<UNK>"

    def real_words(l, eos):
        sent = []
        for word in l:
            if word == eos :
                sent.append("<EOS>")
            elif word > eos:
                sent.append(word_dict[0])
            else:
                sent.append(word_dict[word])

        return sent
Example #34
from keras.datasets import reuters

(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)
# Load the data from the dataset built into Keras.
# Cap the vocabulary size by keeping only the 10,000 most frequent words.
# The training and test splits are returned as two (data, labels) pairs.


# At this point each split is an array of Objects: every Object has a different
# length, so this is not an array of arrays.
# Each Object is an ordered sequence of word indices.
# This is not yet the form we ultimately need, which is a NumPy matrix.
# The length of the array tells how many samples there are.


word_index = reuters.get_word_index()
# A dictionary of words and indices: the keys are words, the values are integers.
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# Invert the key/value relationship of the dictionary.
# The indices have to be shifted back by 3, because the first three indices are
# reserved for "padding", "start of sequence" and "unknown".
decoded_newswire = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
# For every index, look up the corresponding word and join the words with spaces.
# The decoded text is rough, but the important part is knowing how to convert back.



#preparing the data

def main():

    from keras.datasets import reuters

    (train_data,
     train_labels), (test_data,
                     test_labels) = reuters.load_data(num_words=10000)

    print('\n\nFirst sample of the Reuters training dataset:\n', train_data[0])

    word_index = reuters.get_word_index()
    inverse_word_index = dict((v, k) for k, v in word_index.items())

    news_article_0 = ' '.join(
        [inverse_word_index.get(i - 3, '?') for i in train_data[0]])
    print('Corresponding news article:\n', news_article_0, '\n\n')

    train_data = vectorize(train_data)
    test_data = vectorize(test_data)

    train_labels = to_categorical(train_labels)
    test_labels = to_categorical(test_labels)

    model = build_model()
    model.compile(optimizer=RMSprop(lr=1e-3),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    validation_data = train_data[:1000]
    validation_labels = train_labels[:1000]

    partial_train_data = train_data[1000:]
    partial_train_labels = train_labels[1000:]

    history = model.fit(partial_train_data,
                        partial_train_labels,
                        epochs=20,
                        batch_size=512,
                        verbose=0,
                        validation_data=(validation_data, validation_labels))

    plot_loss_and_accuracy(history)

    model = build_model()
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(train_data,
              train_labels,
              epochs=9,
              batch_size=512,
              verbose=0,
              validation_data=(test_data, test_labels))

    evaluation = model.evaluate(test_data, test_labels)
    print(
        'Loss on test data:{evaluation[0]}\nAccuracy on test data: {evaluation[1]}\n'
        .format(**locals()))

    predictions = model.predict(test_data)
    print(
        'Topic with highest probability for the first news article in the test dataset:\n',
        np.argmax(predictions[0]))
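build_model, vectorize and plot_loss_and_accuracy are assumed to be defined elsewhere; a minimal build_model sketch consistent with the 46-class setup used above:

from keras import models, layers

def build_model():
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(46, activation='softmax'))
    return model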