Example #1
def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "imdb/texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()

    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    sequences_reverse = [list(reversed(seq)) for seq in sequences]

    x = pad_sequences(sequences, maxlen=max_len)
    x_reverse = pad_sequences(sequences_reverse, maxlen=max_len)

    word_index = tokenizer.word_index
    embeddings_index = {}
    wordX = np.load(FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"), mode='r+'))
    allwords = pickle.load(FileIO(os.path.join(FLAGS.buckets, "glove/words.pkl"), mode='r+'))
    for i in range(len(allwords)):
        embeddings_index[allwords[i]] = wordX[i, :]
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector

    y_train = np.zeros((25000,), dtype=np.float32)
    y_test = np.zeros((25000,), dtype=np.float32)
    y_train[12500:25000] = np.ones((12500,), dtype=np.float32)
    y_test[12500:25000] = np.ones((12500,), dtype=np.float32)

    x_seq = np.zeros((50000, (max_len - 3) * 4), dtype=np.int64)
    for i in range(50000):
        for j in range(max_len - 3):
            x_seq[i, j * 4] = x[i, j]
            x_seq[i, j * 4 + 1] = x[i][j + 1] + num_words
            x_seq[i, j * 4 + 2] = x[i][j + 2] + num_words * 2
            x_seq[i, j * 4 + 3] = x[i][j + 3] + num_words * 3

    x_train_0 = x[:25000]
    x_train_1 = x_reverse[:25000]
    x_train_2 = x_seq[:25000]
    x_test_0 = x[25000:]
    x_test_1 = x_reverse[25000:]
    x_test_2 = x_seq[25000:]

    result = []

    indice = np.arange(25000)
    np.random.shuffle(indice)
    result.append(x_train_0[indice])
    result.append(x_train_1[indice])
    result.append(x_train_2[indice])
    result.append(x_test_0[indice])
    result.append(x_test_1[indice])
    result.append(x_test_2[indice])
    result.append(y_train[indice])
    result.append(y_test[indice])
    
    result.append(embedding_matrix)
    return result
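A note on this example: it relies on module-level names that are not shown here (FLAGS.buckets, num_words, max_len) plus the usual imports. A minimal sketch of that assumed preamble, with placeholder values, could look like this:

# Assumed preamble for the snippet above; the constant values are placeholders.
import os
import pickle

import numpy as np
import tensorflow as tf
from tensorflow.python.lib.io.file_io import FileIO
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

FLAGS = tf.app.flags.FLAGS  # TF 1.x flags API, as typically used alongside FileIO
tf.app.flags.DEFINE_string("buckets", "", "input directory (local path or gs:// bucket)")
num_words = 30000  # vocabulary size kept by the Tokenizer (placeholder)
max_len = 400      # padded sequence length (placeholder)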
Example #2
    def update_datasets(self, filter=None):
        if filter is None:
            filter = self._filter

        file_list = []
        log.info("Updateing datasets from file list: %s", self._source_file)
        if self._source_file.startswith("gs://"):
            log.info("Using tensorflow for IO")
            from tensorflow.python.lib.io.file_io import FileIO
            input_file = FileIO(self._source_file, "r")
            log.info("Tensorflow reported size: %d", input_file.size())
        else:
            input_file = open(self._source_file)

        lines = input_file.readlines()
        for line in lines:
            fpath = line.strip()
            parts = fpath.split("/")
            file_name = parts[-1]
            directory_name = "/".join(parts[:-1])
            match = self._re.match(file_name)
            if not match:
                continue
            match_components = match.groupdict()
            dataset_path = self._prepend_path + fpath
            dataset_id = self.update_dataset(match_components=match_components,
                                             dataset_path=dataset_path)
            dataset = self.get_dataset_by_id(dataset_id)
            if not filter(dataset_id, match_components, dataset):
                self.remove_dataset_by_id(dataset_id)
        input_file.close()
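The filter callback used above receives (dataset_id, match_components, dataset) and should return a truthy value to keep the dataset; a falsy return removes it again. A hypothetical filter, assuming self._re defines a "year" capture group:

def keep_recent(dataset_id, match_components, dataset):
    # match_components is the groupdict() extracted from the file name.
    return match_components.get("year") == "2017"

# catalog.update_datasets(filter=keep_recent)  # `catalog` is a hypothetical instance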
Example #3
    def update_datasets(self, filter=None):
        if filter is None:
            filter = self._filter

        close_file = True
        log.info("Updateing datasets from file list: %s", self._input_source)
        if hasattr(self._input_source, 'read'):
            input_file = self._input_source
            close_file = False
        elif isinstance(self._input_source,
                        str) and self._input_source.startswith("gs://"):
            log.info("Using tensorflow for IO")
            from tensorflow.python.lib.io.file_io import FileIO
            input_file = FileIO(self._input_source, "r")
            log.info("Tensorflow reported size: %d", input_file.size())
        else:
            input_file = open(self._input_source)

        lines = input_file.readlines()
        for line in lines:
            fpath = line.strip()
            parts = fpath.split("/")
            file_name = parts[-1]
            match = self._re.match(file_name)
            if not match:
                continue
            match_components = match.groupdict()
            dataset_path = self._prepend_path + fpath
            dataset_id = self.update_dataset(match_components=match_components,
                                             dataset_path=dataset_path)
            dataset = self.get_dataset_by_id(dataset_id)
            if not filter(dataset_id, match_components, dataset):
                self.remove_dataset_by_id(dataset_id)
        if close_file:
            input_file.close()
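Unlike the previous variant, this version also accepts an already-open, file-like input source (anything with a read() method) and then leaves closing it to the caller. A usage sketch, where `catalog` stands for a hypothetical instance of the surrounding class:

import io

file_list = io.StringIO("data/run1/sample_001.csv\ndata/run1/sample_002.csv\n")
catalog._input_source = file_list   # detected via hasattr(..., 'read')
catalog.update_datasets()           # close_file stays False, so the stream remains open
file_list.close()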
Example #4
    def update_datasets(self, filter=None):
        if filter is None:
            filter = self._filter

        file_list = []
        log.info("Updateing datasets from file list: %s", self._source_file)
        if self._source_file.startswith("gs://"):
            log.info("Using tensorflow for IO")
            from tensorflow.python.lib.io.file_io import FileIO
            input_file = FileIO(self._source_file, "r")
            log.info("Tensorflow reported size: %d", input_file.size())
        else:
            input_file = open(self._source_file)

        lines = input_file.readlines()
        for line in lines:
            fpath = line.strip()
            parts = fpath.split("/")
            file_name = parts[-1]
            directory_name = "/".join(parts[:-1])
            match = self._re.match(file_name)
            if not match:
                continue
            match_components = match.groupdict()
            dataset_path = self._prepend_path + fpath
            dataset_id = self.update_dataset(match_components=match_components, dataset_path=dataset_path)
            dataset = self.get_dataset_by_id(dataset_id)
            if not filter(dataset_id, match_components, dataset):
                self.remove_dataset_by_id(dataset_id)
        input_file.close()
Example #5
def prepare_train():
    print("prepare training data")
    f = FileIO(os.path.join(FLAGS.buckets, 'texts.pkl'), 'rb')
    text1 = pickle.load(f)
    text1 = text1[:25000]
    f.close()
    f = FileIO(os.path.join(FLAGS.buckets, 'texts_unsup.pkl'), 'rb')
    text2 = pickle.load(f)
    f.close()
    texts = text1 + text2

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts)
    sequence = tokenizer.texts_to_sequences(texts)
    sequence_pad = pad_sequences(sequence,
                                 maxlen=MAX_DOCUMENT_LENGTH + 1,
                                 dtype=np.int32,
                                 padding='post',
                                 truncating='post')
    seq_len = []
    for i in range(len(sequence)):
        r = len(sequence[i])
        if r < MAX_DOCUMENT_LENGTH:
            seq_len.append(r)
        else:
            seq_len.append(MAX_DOCUMENT_LENGTH)
    x_1 = sequence_pad[:, :-1]

    y_ = sequence_pad[:, 1:]
    return x_1, seq_len, y_
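The final slices build next-token language-model pairs: x_1 holds the input token at each position and y_ the token one step ahead. A toy illustration with made-up token IDs and MAX_DOCUMENT_LENGTH = 5:

import numpy as np
from keras.preprocessing.sequence import pad_sequences

seq = [[5, 9, 2, 7]]                   # one tokenized document
padded = pad_sequences(seq, maxlen=6,  # MAX_DOCUMENT_LENGTH + 1
                       dtype=np.int32, padding='post', truncating='post')
x_1 = padded[:, :-1]  # [[5 9 2 7 0]]  input at each time step
y_ = padded[:, 1:]    # [[9 2 7 0 0]]  target (next token) at each step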
Example #6
def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()

    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    # sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    sequences = []
    for i in range(50000):
        t = []
        tokens = texts[i].lower().split(' ')
        for j in range(len(tokens)):
            index = word_index.get(tokens[j], 0)
            if index < num_words:
                t.append(index)
            else:
                t.append(0)
        sequences.append(t)

    print('Found %s unique tokens.' % len(word_index))

    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500,), dtype=np.float32)

    Xtrain = np.zeros((25000, (max_len - 3) * 4), dtype=np.int64)
    Xtest = np.zeros((25000, (max_len - 3) * 4), dtype=np.int64)
    for i in range(25000):
        for j in range(max_len - 3):
            Xtrain[i, j * 4] = data1[i, j]
            Xtrain[i, j * 4 + 1] = data1[i][j + 1] + num_words
            Xtrain[i, j * 4 + 2] = data1[i][j + 2] + num_words * 2
            Xtrain[i, j * 4 + 3] = data1[i][j + 3] + num_words * 3
    for i in range(25000):
        for j in range(max_len - 3):
            Xtest[i, j * 4] = data2[i, j]
            Xtest[i, j * 4 + 1] = data2[i][j + 1] + num_words
            Xtest[i, j * 4 + 2] = data2[i][j + 2] + num_words * 2
            Xtest[i, j * 4 + 3] = data2[i][j + 3] + num_words * 3

    indice = np.arange(25000)
    np.random.shuffle(indice)
    Xtrain = Xtrain[indice]
    Ytrain = Ytrain[indice]
    Xtest = Xtest[indice]
    Ytest = Ytest[indice]
    return Xtrain, Ytrain, Xtest, Ytest
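The nested loops encode overlapping 4-grams by shifting the k-th token of each window into its own ID range (adding k * num_words), so a single embedding table of size num_words * 4 keeps the positions apart. A toy walk-through for one row, with made-up values:

import numpy as np

num_words = 10
row = np.array([3, 7, 2, 9, 4])                # one padded document, max_len = 5
x_row = np.zeros(((len(row) - 3) * 4,), dtype=np.int64)
for j in range(len(row) - 3):
    x_row[j * 4 + 0] = row[j]                  # 1st token of the window
    x_row[j * 4 + 1] = row[j + 1] + num_words  # 2nd token, offset by num_words
    x_row[j * 4 + 2] = row[j + 2] + num_words * 2
    x_row[j * 4 + 3] = row[j + 3] + num_words * 3
# x_row -> [ 3 17 22 39  7 12 29 34]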
Example #7
def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()

    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    # word_index = tokenizer.word_index
    # sequences = []
    # for i in range(50000):
    #     t = []
    #     tokens = texts[i].lower().split(' ')
    #     for j in range(len(tokens)):
    #         index = word_index.get(tokens[j], 0)
    #         if index < num_words:
    #             t.append(index)
    #         else:
    #             t.append(0)
    #     sequences.append(t)

    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500,), dtype=np.float32)

    Xtrain = np.zeros((25000, (max_len - 1) * 2), dtype=np.int64)
    Xtest = np.zeros((25000, (max_len - 1) * 2), dtype=np.int64)
    for i in range(25000):
        for j in range(max_len - 1):
            Xtrain[i, j * 2] = data1[i, j]
            Xtrain[i, j * 2 + 1] = data1[i][j + 1] + num_words
    for i in range(25000):
        for j in range(max_len - 1):
            Xtest[i, j * 2] = data2[i, j]
            Xtest[i, j * 2 + 1] = data2[i][j + 1] + num_words

    indice = np.arange(25000)
    np.random.shuffle(indice)
    Xtrain = Xtrain[indice]
    Ytrain = Ytrain[indice]
    Xtest = Xtest[indice]
    Ytest = Ytest[indice]
    return Xtrain, Ytrain, Xtest, Ytest
Example #8
def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "rt/text.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    sequences_reverse = [list(reversed(seq)) for seq in sequences]

    x = pad_sequences(sequences, maxlen=max_len)
    x_reverse = pad_sequences(sequences_reverse, maxlen=max_len)

    word_index = tokenizer.word_index
    embeddings_index = {}
    wordX = np.load(FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"), mode='r+'))
    allwords = pickle.load(FileIO(os.path.join(FLAGS.buckets, "glove/words.pkl"), mode='r+'))
    for i in range(len(allwords)):
        embeddings_index[allwords[i]] = wordX[i, :]
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector

    y = np.zeros((num_data,), dtype=np.float32)
    y[5331:] = np.ones((5331,), dtype=np.float32)

    x_seq = np.zeros((num_data, (max_len - 2) * 3), dtype=np.int64)
    for i in range(num_data):
        for j in range(max_len - 2):
            x_seq[i, j * 3] = x[i, j]
            x_seq[i, j * 3 + 1] = x[i][j + 1] + num_words
            x_seq[i, j * 3 + 2] = x[i][j + 2] + num_words * 2

    result = []
    indice = np.arange(num_data)
    np.random.shuffle(indice)
    result.append(x[indice])
    result.append(x_reverse[indice])
    result.append(x_seq[indice])
    result.append(y[indice])
    
    result.append(embedding_matrix)
    return result
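The embedding_matrix construction maps every tokenizer rank below num_words to its GloVe vector and leaves all other rows (including row 0, the padding index) at zero. A toy version with a made-up vocabulary and 2-dimensional vectors:

import numpy as np

num_words = 5
embeddings_index = {"good": np.array([0.1, 0.2]), "bad": np.array([0.3, 0.4])}
word_index = {"good": 1, "bad": 2, "obscure": 7}  # tokenizer ranks (1-based)

embedding_matrix = np.zeros((num_words, 2))
for word, i in word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None and i < num_words:  # ranks >= num_words are skipped
        embedding_matrix[i] = vec
# rows 1 and 2 now hold the GloVe vectors; rows 0, 3 and 4 stay zero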
Example #9
def get_input():
    f = FileIO(os.path.join(FLAGS.buckets, "20news/texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()

    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.fit_on_texts(texts[:num_train])
    sequences = tokenizer.texts_to_sequences(texts)
    sequences_reverse = [list(reversed(seq)) for seq in sequences]

    x = pad_sequences(sequences, maxlen=max_len)
    x_reverse = pad_sequences(sequences_reverse, maxlen=max_len)

    word_index = tokenizer.word_index
    embeddings_index = {}
    wordX = np.load(
        FileIO(os.path.join(FLAGS.buckets, "glove/embedding.300d.npy"),
               mode='r+'))
    allwords = pickle.load(
        FileIO(os.path.join(FLAGS.buckets, "glove/words.pkl"), mode='r+'))
    for i in range(len(allwords)):
        embeddings_index[allwords[i]] = wordX[i, :]
    embedding_matrix = np.zeros((num_words, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None and i < num_words:
            embedding_matrix[i] = embedding_vector

    y_train = np.load(
        FileIO(os.path.join(FLAGS.buckets, "20news/Ytrain.npy"), mode='r+'))
    y_train = to_categorical(y_train)
    y_test = np.load(
        FileIO(os.path.join(FLAGS.buckets, "20news/Ytest.npy"), mode='r+'))
    y_test = to_categorical(y_test)

    x_seq = np.zeros((num_train + num_test, (max_len - 2) * 3), dtype=np.int64)
    for i in range(num_train + num_test):
        for j in range(max_len - 2):
            x_seq[i, j * 3] = x[i, j]
            x_seq[i, j * 3 + 1] = x[i][j + 1] + num_words
            x_seq[i, j * 3 + 2] = x[i][j + 2] + num_words * 2

    x_train_0 = x[:num_train]
    x_train_1 = x_reverse[:num_train]
    x_train_2 = x_seq[:num_train]
    x_test_0 = x[num_train:]
    x_test_1 = x_reverse[num_train:]
    x_test_2 = x_seq[num_train:]

    result = []

    indice1 = np.arange(num_train)
    np.random.shuffle(indice1)
    indice2 = np.arange(num_test)
    np.random.shuffle(indice2)

    result.append(x_train_0[indice1])
    result.append(x_train_1[indice1])
    result.append(x_train_2[indice1])
    result.append(x_test_0[indice2])
    result.append(x_test_1[indice2])
    result.append(x_test_2[indice2])
    result.append(y_train[indice1])
    result.append(y_test[indice2])

    result.append(embedding_matrix)
    return result
Example #10
def main():
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()
    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts[0:25000])
    # print(texts[0])
    # sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    sequences = []
    for i in range(50000):
        t = []
        tokens = texts[i].lower().split(' ')
        for j in range(len(tokens)):
            index = word_index.get(tokens[j], 0)
            if index < num_words:
                t.append(index)
            else:
                t.append(0)
        sequences.append(t)

    print('Found %s unique tokens.' % len(word_index))

    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000, ), dtype=np.float32)
    Ytest = np.zeros((25000, ), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500, ), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500, ), dtype=np.float32)

    Xtrain1 = np.zeros((25000, (max_len - 2) * 3), dtype=np.int64)
    Xtest1 = np.zeros((25000, (max_len - 2) * 3), dtype=np.int64)
    for i in range(25000):
        for j in range(max_len - 2):
            Xtrain1[i, j * 3] = data1[i, j]
            Xtrain1[i, j * 3 + 1] = data1[i][j + 1] + num_words
            Xtrain1[i, j * 3 + 2] = data1[i][j + 2] + num_words * 2
    for i in range(25000):
        for j in range(max_len - 2):
            Xtest1[i, j * 3] = data2[i, j]
            Xtest1[i, j * 3 + 1] = data2[i][j + 1] + num_words
            Xtest1[i, j * 3 + 2] = data2[i][j + 2] + num_words * 2

    Xtrain2 = np.zeros((25000, (max_len - 1) * 2), dtype=np.int64)
    Xtest2 = np.zeros((25000, (max_len - 1) * 2), dtype=np.int64)
    for i in range(25000):
        for j in range(max_len - 1):
            Xtrain2[i, j * 2] = data1[i, j]
            Xtrain2[i, j * 2 + 1] = data1[i][j + 1] + num_words
    for i in range(25000):
        for j in range(max_len - 1):
            Xtest2[i, j * 2] = data2[i, j]
            Xtest2[i, j * 2 + 1] = data2[i][j + 1] + num_words

    indice1 = np.arange(25000)
    np.random.shuffle(indice1)
    Xtrain1 = Xtrain1[indice1]
    Xtrain2 = Xtrain2[indice1]
    Ytrain = Ytrain[indice1]

    indice2 = np.arange(25000)
    np.random.shuffle(indice2)
    Xtest1 = Xtest1[indice2]
    Xtest2 = Xtest2[indice2]
    Ytest = Ytest[indice2]
    print('begin to build model ...')
    input1 = Input(shape=((max_len - 2) * 3, ))
    embedding1 = Embedding(num_words * 3,
                           embedding_dimension,
                           input_length=(max_len - 2) * 3,
                           init='orthogonal')(input1)
    x = AveragePooling1D(pool_length=3)(embedding1)
    x = GlobalMaxPooling1D()(x)

    input2 = Input(shape=((max_len - 1) * 2, ))
    embedding2 = Embedding(num_words * 2,
                           embedding_dimension,
                           input_length=(max_len - 1) * 2,
                           init='orthogonal')(input2)
    y = AveragePooling1D(pool_length=2, stride=2)(embedding2)
    y = GlobalMaxPooling1D()(y)
    z = merge([x, y], mode='concat')
    # model.add(Dropout(0.5))
    output = Dense(1, activation='sigmoid')(z)

    model = Model(input=[input1, input2], output=output)
    model.compile(loss='binary_crossentropy',
                  optimizer='nadam',
                  metrics=['accuracy'])
    model.fit([Xtrain1, Xtrain2],
              Ytrain,
              batch_size=32,
              nb_epoch=20,
              verbose=2,
              validation_data=([Xtest1, Xtest2], Ytest))
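This model is written against the Keras 1.x API (merge, init=, pool_length=, nb_epoch=). A rough sketch of the same two-branch architecture in the Keras 2 functional API, assuming num_words, max_len and embedding_dimension are defined as above:

from keras.initializers import Orthogonal
from keras.layers import (AveragePooling1D, Dense, Embedding,
                          GlobalMaxPooling1D, Input, concatenate)
from keras.models import Model

input1 = Input(shape=((max_len - 2) * 3,))
x = Embedding(num_words * 3, embedding_dimension,
              embeddings_initializer=Orthogonal())(input1)
x = AveragePooling1D(pool_size=3)(x)
x = GlobalMaxPooling1D()(x)

input2 = Input(shape=((max_len - 1) * 2,))
y = Embedding(num_words * 2, embedding_dimension,
              embeddings_initializer=Orthogonal())(input2)
y = AveragePooling1D(pool_size=2, strides=2)(y)
y = GlobalMaxPooling1D()(y)

output = Dense(1, activation='sigmoid')(concatenate([x, y]))
model = Model(inputs=[input1, input2], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
# model.fit(..., epochs=20, ...) replaces nb_epoch=20 when training.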
Example #11
def main():
    global ngram
    f = FileIO(os.path.join(FLAGS.buckets, "texts.pkl"), mode='r+')
    texts = pickle.load(f)
    f.close()

    tokenizer = Tokenizer(nb_words=num_words)
    tokenizer.filters = ''
    tokenizer.fit_on_texts(texts[0:25000])
    sequences = tokenizer.texts_to_sequences(texts)
    # word_index = tokenizer.word_index
    # sequences = []
    # for i in range(50000):
    #     t = []
    #     tokens = texts[i].lower().split(' ')
    #     for j in range(len(tokens)):
    #         index = word_index.get(tokens[j], 0)
    #         if index < num_words:
    #             t.append(index)
    #         else:
    #             t.append(0)
    #     sequences.append(t)

    # print('Found %s unique tokens.' % len(word_index))

    data1 = pad_sequences(sequences[0:25000], maxlen=max_len)
    data2 = pad_sequences(sequences[25000:50000], maxlen=max_len)
    Ytrain = np.zeros((25000,), dtype=np.float32)
    Ytest = np.zeros((25000,), dtype=np.float32)
    Ytrain[12500:25000] = np.ones((12500,), dtype=np.float32)
    Ytest[12500:25000] = np.ones((12500,), dtype=np.float32)

    Xtrain = np.zeros((25000, (max_len - ngram + 1) * ngram), dtype=np.int64)
    Xtest = np.zeros((25000, (max_len - ngram + 1) * ngram), dtype=np.int64)

    id_range = np.arange(max_len - ngram + 1)
    for i in range(ngram):
        Xtrain[:, id_range * ngram + i] = data1[:, id_range + i] + num_words * i
        Xtest[:, id_range * ngram + i] = data2[:, id_range + i] + num_words * i

    print('begin to build model ...')
    main_input = Input(shape=((max_len - ngram + 1) * ngram,))
    # embedding1 = Embedding(num_words * ngram, word_dim, embeddings_initializer=keras.initializers.Orthogonal())(main_input)
    embedding1 = Embedding(num_words * ngram, word_dim)(main_input)
    x = AveragePooling1D(pool_size=ngram)(embedding1)
    x = GlobalMaxPooling1D()(x)

    weight = np.ones((word_dim, 1), dtype=np.float64)
    weight[int(word_dim / 2):] = -1 * np.ones([int(word_dim / 2), 1], dtype=np.float64)
    output = Dense(1,
                   weights=[weight, np.zeros([1])],
                   trainable=False,
                   activation='sigmoid')(x)

    model = Model(input=main_input, output=output)
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    model.fit([Xtrain], Ytrain,
              batch_size=32,
              shuffle=True,
              nb_epoch=15,
              verbose=2,
              validation_data=([Xtest], Ytest))
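The output layer above is frozen with a fixed sign-split weight vector, so the pre-sigmoid score is simply the sum of the first word_dim / 2 pooled features minus the sum of the rest; only the embedding layer is trained. A toy look at the weight construction, with a made-up word_dim:

import numpy as np

word_dim = 6
weight = np.ones((word_dim, 1), dtype=np.float64)
weight[word_dim // 2:] = -1.0
# weight.ravel() -> [ 1.  1.  1. -1. -1. -1.]
# With trainable=False, the Dense layer just adds the first half of the
# pooled vector and subtracts the second half before the sigmoid.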