Example #1
 def _load_data(self, train_set, test_set):
     datasets = {'train': train_set, 'test': test_set}
     # vectorize
     # add offset of 2 for PAD and OOV
     self._tokens_vocab.add_vocab_offset(2)
     self._chars_vocab.add_vocab_offset(2)
     self._tags_vocab.add_vocab_offset(1)
     vec_data = {}
     for f in datasets.keys():
         vec_data[f] = self._prepare_vectors(datasets[f])
     for f in datasets.keys():
         tokens, words, intents, tags = vec_data[f]
         x = pad_sequences(tokens, maxlen=self.sentence_len)
         _w = []
         for s in words:
             _s = pad_sequences(s, maxlen=self.word_len)
             sentence = np.asarray(_s)[-self.sentence_len:]
             if sentence.shape[0] < self.sentence_len:
                 sentence = np.vstack((np.zeros((self.sentence_len - sentence.shape[0],
                                                 self.word_len)), sentence))
             _w.append(sentence)
         w = np.asarray(_w)
         _y = pad_sequences(tags, maxlen=self.sentence_len)
         y = one_hot_sentence(_y, self.label_vocab_size)
         i = one_hot(intents, self.intent_size)
         self.vecs[f] = [x, w, i, y]
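The offset of 2 reserves index 0 for padding and index 1 for out-of-vocabulary tokens, so the 0 that pad_sequences fills in never collides with a real token id. A minimal sketch of that convention (the names PAD, OOV, vocab and encode are illustrative, not from the snippet above), assuming the standard keras.preprocessing.sequence.pad_sequences:

from keras.preprocessing.sequence import pad_sequences

PAD, OOV = 0, 1                    # reserved indices
vocab = {'hello': 2, 'world': 3}   # real token ids start at 2 (offset of 2)

def encode(tokens):
    return [vocab.get(t, OOV) for t in tokens]  # unknown tokens map to OOV

seqs = [encode(['hello', 'world']), encode(['hello', 'unseen', 'world'])]
print(pad_sequences(seqs, maxlen=5))
# [[0 0 0 2 3]
#  [0 0 2 1 3]]   <- left-padded with 0, which is PAD by construction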
Example #2
def lstm(trainData, trainMark, testData, embedding_dim, embedding_matrix, maxlen, output_len):
    # Pad the data so that every sequence has the same length
    trainData = list(sequence.pad_sequences(trainData, maxlen=maxlen,
                                            dtype='float64'))  # pad_sequences returns a numpy array: long sequences are truncated, short ones padded with 0 (safe here because index 0 maps to an all-zero value below)
    testData = list(sequence.pad_sequences(testData, maxlen=maxlen,
                                           dtype='float64'))  # same padding/truncation for the test data

    # Build the LSTM model
    model = Sequential()  # a linear stack of layers; pass a list of layers or add them one by one with .add()
    # model.add(Dense(256, input_shape=(train_total_vova_len,)))   # a fully connected input layer could be used instead
    model.add(Embedding(len(embedding_matrix), embedding_dim, weights=[embedding_matrix], mask_zero=False,
                        input_length=maxlen))  # input layer: maps high-dimensional one-hot indices to a low-dimensional embedding; the first argument is the largest input index + 1, the second is the embedding dimension
    # LSTM layer, the core of the model
    model.add(LSTM(256))  # 256 LSTM units; the input dimension is inferred from the Embedding output
    model.add(Dropout(0.5))  # randomly drop connections at each update to reduce overfitting
    model.add(Dense(output_len))  # fully connected output layer with output_len units; the input dimension is inferred from the LSTM layer
    model.add(Activation('softmax'))  # softmax activation on the output
    # Compile the model: categorical_crossentropy (also known as log loss) with the SGD optimizer
    model.compile(loss='categorical_crossentropy', optimizer='sgd')

    # Train the model; note that without the padding above the arrays would have different lengths and fitting would fail
    X = np.array(list(trainData))  # training inputs
    print("X:", X)
    Y = np.array(list(trainMark))  # labels
    print("Y:", Y)
    # batch_size: number of samples per gradient update
    # nb_epoch: number of passes over the training data
    model.fit(X, Y, batch_size=200, nb_epoch=10)  # X and Y are numpy arrays (or lists of numpy arrays for multi-input models)

    # Predict on the test data
    A = np.array(list(testData))  # test inputs
    print("A:", A)
    classes = model.predict(A)  # predicted outputs
    return classes
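As the comments above describe, pad_sequences forces a single length per batch: longer sequences are truncated (from the front by default) and shorter ones are padded with zeros. A quick sketch of that behaviour with the standard Keras helper:

from keras.preprocessing import sequence

data = [[1, 2, 3, 4, 5, 6], [7, 8]]
print(sequence.pad_sequences(data, maxlen=4))
# [[3 4 5 6]      <- truncated from the front ('pre' is the default)
#  [0 0 7 8]]     <- zero-padded on the left
print(sequence.pad_sequences(data, maxlen=4, padding='post', truncating='post'))
# [[1 2 3 4]
#  [7 8 0 0]]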
Example #3
def pad_graph(gr, s0pad=s0pad, s1pad=s1pad):
    """ pad sequences in the graph """
    gr['si0'] = pad_sequences(gr['si0'], maxlen=s0pad, truncating='pre', padding='post')
    gr['si1'] = pad_sequences(gr['si1'], maxlen=s1pad, truncating='pre', padding='post')
    gr['f0'] = pad_3d_sequence(gr['f0'], maxlen=s0pad, nd=nlp.flagsdim)
    gr['f1'] = pad_3d_sequence(gr['f1'], maxlen=s1pad, nd=nlp.flagsdim)
    gr['score'] = np.array(gr['score'])
Example #4
def bidirectional_lstm(X_train, y_train, X_test, y_test):
    X_train = sequence.pad_sequences(X_train, maxlen=max_len)
    X_test = sequence.pad_sequences(X_test, maxlen=max_len)
    lstm = LSTM(output_dim=64)
    gru = GRU(output_dim=64)  # original examples was 128, we divide by 2 because results will be concatenated
    brnn = Bidirectional(forward=lstm, backward=gru)
    print X_train.shape, y_train.shape
    print X_test.shape, y_test.shape

    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=max_len))
    model.add(brnn)  # try using another Bidirectional RNN inside the Bidirectional RNN. Inception meets callback hell.
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', class_mode="binary")
    # model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    print("Train...")
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=4, validation_data=(X_test, y_test), show_accuracy=True)
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True)
    print('Test score:', score)
    print('Test accuracy:', acc)
    pred_labels = model.predict_classes(X_test)
    # print pred_labels
    accuracy = accuracy_score(y_test, pred_labels)
    precision, recall, f1, supp = precision_recall_fscore_support(y_test, pred_labels, average='weighted')
    print precision, recall, f1, supp

    return accuracy, precision, recall, f1
Example #5
def load_data(data_source):
    assert data_source in ["keras_data_set", "local_dir"], "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words, start_char=None,
                                                              oov_char=None, index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"
    else:
        x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
        vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
        y = y.argmax(axis=1)

        # Shuffle data
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x = x[shuffle_indices]
        y = y[shuffle_indices]
        train_len = int(len(x) * 0.9)
        x_train = x[:train_len]
        y_train = y[:train_len]
        x_test = x[train_len:]
        y_test = y[train_len:]

    return x_train, y_train, x_test, y_test, vocabulary_inv
Example #6
def imdb_lstm():
    max_features = 20000
    maxlen = 80  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')
    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, 128, dropout=0.2))
    model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # try using different optimizers and different optimizer configs
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

    print('Train...')
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=15,
                        validation_data=(X_test, y_test))
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
Example #7
def test_pad_sequences_vector():
    a = [[[1, 1]],
         [[2, 1], [2, 2]],
         [[3, 1], [3, 2], [3, 3]]]

    # test padding
    b = pad_sequences(a, maxlen=3, padding='pre')
    assert_allclose(b, [[[0, 0], [0, 0], [1, 1]],
                        [[0, 0], [2, 1], [2, 2]],
                        [[3, 1], [3, 2], [3, 3]]])
    b = pad_sequences(a, maxlen=3, padding='post')
    assert_allclose(b, [[[1, 1], [0, 0], [0, 0]],
                        [[2, 1], [2, 2], [0, 0]],
                        [[3, 1], [3, 2], [3, 3]]])

    # test truncating
    b = pad_sequences(a, maxlen=2, truncating='pre')
    assert_allclose(b, [[[0, 0], [1, 1]],
                        [[2, 1], [2, 2]],
                        [[3, 2], [3, 3]]])

    b = pad_sequences(a, maxlen=2, truncating='post')
    assert_allclose(b, [[[0, 0], [1, 1]],
                        [[2, 1], [2, 2]],
                        [[3, 1], [3, 2]]])

    # test value
    b = pad_sequences(a, maxlen=3, value=1)
    assert_allclose(b, [[[1, 1], [1, 1], [1, 1]],
                        [[1, 1], [2, 1], [2, 2]],
                        [[3, 1], [3, 2], [3, 3]]])
Example #8
 def _fit_internal(self, X_train, y_train):
     if self.X_train is None:
         self.X_train = sequence.pad_sequences(X_train, maxlen=self.padding_size)
     else:
         self.X_train = numpy.vstack((self.X_train, sequence.pad_sequences(X_train, maxlen=self.padding_size)))
     self.y_train = numpy.append(self.y_train, y_train)
     print self.X_train.shape, self.y_train.shape
Example #9
def prepare_lstm_data(train, test, filter_fn=None):
  def prepare_numeric(data):
    X = []
    y = []
    names = []
    for card in data:
        X.append(np.concatenate((card.types, card.colors, [card.power, card.toughness, card.loyalty])))
        y.append(card.cost)
        names.append(card.name)
    return np.asarray(X), np.asarray(y), np.asarray(names)

  if filter_fn:
    train = filter(filter_fn, train)
    test = filter(filter_fn, test)
   
  X_train_text = [ card.tokens for card in train ]
  X_test_text = [ card.tokens for card in test ] 
  X_train_text = sequence.pad_sequences(np.asarray(X_train_text), MAX_LEN)
  X_test_text = sequence.pad_sequences(X_test_text, MAX_LEN)
  X_train_numeric, y_train, _ = prepare_numeric(train)
  X_test_numeric, y_test, y_test_names = prepare_numeric(test)

  #Combine text+numeric data
  X_train = map(np.asarray, [X_train_text, X_train_numeric])
  X_test = map(np.asarray, [X_test_text, X_test_numeric])
  return X_train, np.asarray(y_train), X_test, np.asarray(y_test), y_test_names
Example #10
def process_format_model_in(in_out_pairs, max_len, batch_size, pad='pre', cut='pre'):
    """
    Format the input/output pairs so that they match the model's expected input.
    :param in_out_pairs: [(s1, s2, label), (word id list, list, str), ...]
    :param max_len: length of the longest (tokenized) sequence
    :param batch_size:
    :param pad: padding mode passed to pad_sequences
    :param cut: truncating mode passed to pad_sequences
    :return: ({'source1': S1, 'source2': S2}, y)
    S1.shape == S2.shape: 2d numpy array
    y.shape == (batch_size, 1)
    """
    S1 = []
    S2 = []
    y = []
    for in_out_pair in in_out_pairs:
        S1.append(in_out_pair[0])
        S2.append(in_out_pair[1])
        y.append(int(in_out_pair[2]))

    # lists of list => 2d numpy array
    S1 = pad_sequences(S1, maxlen=max_len, padding=pad, truncating=cut)
    S2 = pad_sequences(S2, maxlen=max_len, padding=pad, truncating=cut)

    # binary classification problem
    y = np.asarray(y, dtype=np.int16).reshape(batch_size, 1)
    return {'source1': S1, 'source2': S2}, y
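A toy call of the helper above, illustrating the shapes described in the docstring (this assumes numpy and pad_sequences are imported at module level, as the snippet implies; the pairs data is made up):

pairs = [([3, 5, 2], [4, 1], '1'),
         ([7], [9, 9, 9], '0')]
inputs, y = process_format_model_in(pairs, max_len=4, batch_size=2)
print(inputs['source1'].shape, inputs['source2'].shape, y.shape)
# (2, 4) (2, 4) (2, 1)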
Example #11
def imdb_test():
    # set parameters:
    max_features = 5000  # vocabulary size
    maxlen = 200  # pad/truncate each review to this length
    batch_size = 16
    nb_epoch = 10

    print('Loading data...')
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features,
                                                          test_split=0.2)

    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    nb_classes = 2
    y_train = np_utils.to_categorical(y_train, nb_classes)
    y_test = np_utils.to_categorical(y_test, nb_classes)

    model = imdb_cnn()
    plot(model, to_file='./images/imdb_model.png')

    # try using different optimizers and different optimizer configs
    # model.compile(loss='binary_crossentropy', optimizer='adagrad', class_mode="binary")
    model.compile(loss='categorical_crossentropy', optimizer='adagrad')

    print("Train...")
    early_stopping = EarlyStopping(monitor='val_loss', patience=5)
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_data=(X_test, y_test),
              show_accuracy=True, callbacks=[early_stopping])
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True)
    print('Test score:', score)
    print('Test accuracy:', acc)
Example #12
def run_keras_cnn_example():
	# set parameters:
	max_features = 5000
	maxlen = 100
	batch_size = 32
	embedding_dims = 100
	nb_filter = 250
	filter_length = 3
	hidden_dims = 250
	nb_epoch = 2

	print('Loading data...')
	(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features,
														  test_split=0.2)
	print(len(X_train), 'train sequences')
	print(len(X_test), 'test sequences')

	print('Pad sequences (samples x time)')
	X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
	X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
	print('X_train shape:', X_train.shape)
	print('X_test shape:', X_test.shape)

	print('Build model...')
	model = Sequential()

	# we start off with an efficient embedding layer which maps
	# our vocab indices into embedding_dims dimensions
	model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
	model.add(Dropout(0.25))

	# we add a Convolution1D, which will learn nb_filter
	# word group filters of size filter_length:
	model.add(Convolution1D(nb_filter=nb_filter,
							filter_length=filter_length,
							border_mode='valid',
							activation='tanh',
							subsample_length=1))
	# we use standard max pooling (halving the output of the previous layer):
	model.add(MaxPooling1D(pool_length=2))

	# We flatten the output of the conv layer,
	# so that we can add a vanilla dense layer:
	model.add(Flatten())

	# We add a vanilla hidden layer:
	model.add(Dense(hidden_dims))
	model.add(Dropout(0.25))
	model.add(Activation('tanh'))

	# We project onto a single unit output layer, and squash it with a sigmoid:
	model.add(Dense(1))
	model.add(Activation('sigmoid'))

	model.compile(loss='binary_crossentropy',
				  optimizer='rmsprop',
				  class_mode='binary')
	model.fit(X_train, y_train, batch_size=batch_size,
			  nb_epoch=nb_epoch, show_accuracy=True,
			  validation_data=(X_test, y_test))
Example #13
 def build(self, vocabulary, q_length, a_length):
     self.xq_data = [map(lambda x: vocabulary[x], terms) for terms in self.xq_data]
     self.xa_data = [map(lambda x: vocabulary[x], terms) for terms in self.xa_data]
     self.xq_np = sequence.pad_sequences(self.xq_data, maxlen = q_length)
     self.xa_np = sequence.pad_sequences(self.xa_data, maxlen = a_length)
     self.y_np = np.array(self.labels)
     self.built = True
Example #14
def main():
    top_words = 5000  # Keep only the 5000 most frequent words in the dataset.
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

    # Keras requires all sequences to have the same length (padding index 0 carries no information).
    max_review_length = 500
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

    embedding_length = 32
    input_seq = Input(shape=(500,))
    a = Embedding(top_words, embedding_length,
                  input_length=max_review_length)(input_seq)
    b, state_h, state_c = LSTM(100, return_state=True,
                               return_sequences=True)(a)
    c = AttentionLayerV2(attention_depth=4)(b)
    d = Dropout(0.5)(c)
    e = Dense(1, activation='sigmoid')(d)
    model = Model(inputs=[input_seq], outputs=[e])
    model.compile(loss='binary_crossentropy',
                  optimizer='nadam',
                  metrics=['accuracy'])
    model.summary()
    # print(model.predict(np.ones((10, 500))))
    model.fit(X_train, y_train, epochs=5, batch_size=64)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1]*100))

    model.save_weights('model_weights.h5')
Example #15
	def generate_training_batches():
		index = 0         #IF I understand correctly, state of index will be saved because it's local
		while True:
			remaining = len(train_input_doc) - index
			input_slice = []
			target_slice = []
			if remaining >= batch_size:
				input_doc_slice = train_input_doc[index:(index + batch_size)]
				input_query_slice = train_input_query[index:(index + batch_size)]
				target_slice = train_target_word[index:(index + batch_size)]
				index += batch_size
			else:
				input_doc_slice = train_input_doc[index:]
				input_doc_slice += train_input_doc[:(batch_size - remaining)]
				input_query_slice = train_input_query[index:]
				input_query_slice += train_input_query[:(batch_size - remaining)]
				target_slice = train_target_word[index:]
				target_slice += train_target_word[:(batch_size - remaining)]
				index = batch_size - remaining
			x_train_doc = sequence.pad_sequences(input_doc_slice, maxlen = maxdoclen)
			x_train_query = sequence.pad_sequences(input_query_slice, maxlen = maxquerylen)
			x_train = np.concatenate((x_train_doc, x_train_query), axis = 1)
			y_train = np.zeros((batch_size, vocab_size))
			y_train[np.arange(batch_size), np.array(target_slice)] = 1
			yield {'input': x_train, 'output': y_train}
Example #16
def lstm_model(X_train, y_train, X_test, y_test):
    X_train = sequence.pad_sequences(X_train, maxlen=max_len, padding='post')
    X_test = sequence.pad_sequences(X_test, maxlen=max_len, padding='post')
    print X_train.shape, y_train.shape
    print X_test.shape, y_test.shape

    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=max_len))
    model.add(LSTM(128))  # try using a GRU instead, for fun
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # print X_train.shape, y_train.shape
    # print X_test.shape, y_test.shape

    model.compile(loss='binary_crossentropy', optimizer='adam', class_mode="binary")

    print("Train...")
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=4, validation_data=(X_test, y_test), show_accuracy=True)
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True)
    print('Test score:', score)
    print('Test accuracy:', acc)
    pred_labels = model.predict_classes(X_test)
    # print pred_labels
    accuracy = accuracy_score(y_test, pred_labels)
    precision, recall, f1, supp = precision_recall_fscore_support(y_test, pred_labels, average='weighted')
    print precision, recall, f1, supp

    return accuracy, precision, recall, f1
Example #17
def evaluate_recurrent_model(dataset,num_classes):
    (X_train, Y_train), (X_test, Y_test) = dataset
    max_features = 20000
    maxlen = 125  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32

    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')
    print("Pad sequences (samples x time) with maxlen %d"%maxlen)
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=maxlen))
    model.add(GRU(512))  # a GRU is used here instead of an LSTM
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    # try using different optimizers and different optimizer configs
    model.compile(loss='categorical_crossentropy',optimizer='adam')

    print("Train...")
    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=15,
              validation_data=(X_test, Y_test), show_accuracy=True)
    score, acc = model.evaluate(X_test, Y_test,
                                batch_size=batch_size,
                                show_accuracy=True)
    if verbose:
        print('Test score:', score)
        print('Test accuracy:', acc)
    return acc
Example #18
	def generate_test_batches():
		index = 0
		while True:
			remaining = len(test_input_doc) - index
			input_slice = []
			target_slice = []
			if remaining >= batch_size:
				input_doc_slice = test_input_doc[index:(index + batch_size)]
				input_query_slice = test_input_query[index:(index + batch_size)]
				target_slice = test_target_word[index:(index + batch_size)]
				index += batch_size
			else:
				input_doc_slice = test_input_doc[index:]
				input_doc_slice += test_input_doc[:(batch_size - remaining)]
				input_query_slice = test_input_query[index:]
				input_query_slice += test_input_query[:(batch_size - remaining)]
				target_slice = test_target_word[index:]
				target_slice += test_target_word[:(batch_size - remaining)]
				index = batch_size - remaining
			x_test_doc = sequence.pad_sequences(input_doc_slice, maxlen = maxdoclen)
			x_test_query = sequence.pad_sequences(input_query_slice, maxlen = maxquerylen)
			x_test = np.concatenate((x_test_doc, x_test_query), axis = 1)
			y_test = np.zeros((batch_size, vocab_size))
			y_test[np.arange(batch_size), np.array(target_slice)] = 1
			yield {'input': x_test, 'output': y_test}
Example #19
def test_dan_original():
    max_features = 20000
    maxlen = 100  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32

    print("Loading data...")
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)
    print(len(X_train), "train sequences")
    print(len(X_test), "test sequences")

    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    model = dan_original(max_features)

    # try using different optimizers and different optimizer configs
    model.compile(loss="binary_crossentropy", optimizer="adagrad", class_mode="binary")

    print("Train...")
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=3, validation_data=(X_test, y_test), show_accuracy=True)
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True)
    print("Test score:", score)
    print("Test accuracy:", acc)
Example #20
def evaluate_conv_model(dataset, num_classes, maxlen=125, embedding_dims=250, max_features=5000,
                        nb_filter=300, filter_length=3, num_hidden=250, dropout=0.25,
                        verbose=True, pool_length=2, with_lstm=False):
    (X_train, Y_train), (X_test, Y_test) = dataset
    
    batch_size = 32
    nb_epoch = 5

    if verbose:
        print('Loading data...')
        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')
        print('Pad sequences (samples x time)')
    
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

    if verbose:
        print('X_train shape:', X_train.shape)
        print('X_test shape:', X_test.shape)
        print('Build model...')

    model = Sequential()
    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
    model.add(Dropout(dropout))

    # we add a Convolution1D, which will learn nb_filter
    # word group filters of size filter_length:
    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode='valid',
                            activation='relu',
                            subsample_length=1))
    if pool_length:
        # we use standard max pooling (halving the output of the previous layer):
        model.add(MaxPooling1D(pool_length=2))
    if with_lstm:
        model.add(LSTM(125))
    else:
        # We flatten the output of the conv layer,
        # so that we can add a vanilla dense layer:
        model.add(Flatten())

        #We add a vanilla hidden layer:
        model.add(Dense(num_hidden))
        model.add(Activation('relu'))
        model.add(Dropout(dropout))

    # We project onto a single unit output layer, and squash it with a sigmoid:
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',optimizer='adam')
    model.fit(X_train, Y_train, batch_size=batch_size,nb_epoch=nb_epoch, show_accuracy=True,validation_split=0.1)
    score = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1 if verbose else 0, show_accuracy=True)
    if verbose:
        print('Test score:',score[0])
        print('Test accuracy:', score[1])
    predictions = model.predict_classes(X_test,verbose=1 if verbose else 0)
    return predictions,score[1]
Example #21
def seq_driver(vocab, postIndexs, commentIndexs):
    vocab = ['0'] + vocab + ['UNK', 'END']
    # index 0 is reserved as a "none" feature; it is unclear whether this causes problems in the final fully connected layer

    max_features = len(vocab)
    #for i in range(max_features):
    #    print i, vocab[i]

    maxPostLen = max(map(len, (x for x in postIndexs)))
    maxCommentLen = max(map(len, (x for x in commentIndexs)))
    maxlen = max(maxPostLen, maxCommentLen)

    X = pad_sequences(postIndexs, maxlen, 'int32', 'post', 'post')
    Y = pad_sequences(commentIndexs, maxlen, 'int32', 'post', 'post')
    #Y = pad_sequences(commentIndexs, maxlen)

    #print 'after padd'
    #batch_test(X, Y, max_features, maxlen)
    def to_one_hot(id):
        zeros = [0] * max_features
        zeros[id] = 1
        return zeros
    xs = np.asarray(X)
    Y = map(lambda x: map(to_one_hot, x), Y)
    ys = np.asarray(Y)
    print('maxfeature, maxlen: ',  max_features, maxlen)
    print("XS Shape: ", xs.shape)
    print("YS Shape: ", ys.shape)
    seq2seq(xs, ys, max_features, maxlen, vocab)
Example #22
def TrainModel_Data(X,Y):
	X_train = sequence.pad_sequences(np.array(X), maxlen=maxlen)
	X_test = sequence.pad_sequences(np.array(X[:100]), maxlen=maxlen)

	y_train = np.array(Y)
	y_test  = np.array(Y[:100])

	print('Build model...')

	model = Sequential()
	model.add(Embedding(max_features, embedding_size, input_length=maxlen))
	model.add(Dropout(0.25))
	model.add(Convolution1D(nb_filter=nb_filter,
	                        filter_length=filter_length,
	                        border_mode='valid',
	                        activation='relu',
	                        subsample_length=nb_classes))
	model.add(MaxPooling1D(pool_length=pool_length))
	model.add(LSTM(lstm_output_size))
	model.add(Dense(nb_classes))
	model.add(Activation('sigmoid'))
	model.compile(loss='categorical_crossentropy',optimizer='adam')
	print('Train...')
	model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,validation_data=(X_test, y_test))
	score = model.evaluate(X_test, y_test)
	print('Test score:', score)
	SaveModel(model)
	return model
Example #23
def smartpadding(X, word_idx, max_sentence_length):
    newX = []
    for i,story in enumerate(X):
        sentences = splitter([word_idx["."], word_idx["?"] ], list(story))
        sentences = filter(lambda a: a != [], sentences)
        new_sentence = []
        #print(i)
        #print(story)
        #print(sentences)
        for sentence in sentences:
            if(sentence == []):
                continue
            sentence = np.array(sentence)
            #print (sentence[-1])
            s = sentence[sentence > 0]
            if(max_sentence_length < len(s)):
                print("Maximum sentence length is not maximum, found" , len(s), max_sentence_length)

            s = pad_sequences([s], maxlen=max_sentence_length, padding="pre")[0]
            #print(s.shape)
            #print(s)

            new_sentence.extend(s)

        newX.append(new_sentence)

    print ("maxlen", max(map(len, newX)))
    newX = pad_sequences(newX, maxlen=max(map(len, newX)),padding = "pre")
    print(newX.shape)
    return newX
Example #24
def train():
    # load the dataset but only keep the top n words, zero the rest
    (X_train, Y_train), (X_test, Y_test) = imdb.load_data(nb_words=top_words)
    # truncate and pad input sequences
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

    # create the model
    embedding_vector_length = 32
    model = Sequential()
    model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
    model.add(Dropout(0.2))
    model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu'))
    model.add(MaxPooling1D(pool_length=2))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=2, batch_size=64)

    # Final evaluation of the model
    scores = model.evaluate(X_test, Y_test, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1]*100))
    model.save("imdb_%0.2f.pkl" % scores[1])
Example #25
def vectorize_data(filenames, maxlen=100, max_charlen=20, output_label_size=6, output_label_dict=None,
                   output_type="boundary", return_chars=False):
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp.get_documents(filename):
            for seq in pp.get_sequences(doc):
                x = []
                x_char = []
                y = []
                for token in seq:
                    x.append(1 + token.word_index) # Add 1 to include token for padding
                    if return_chars:
                        x_char.append((1 + np.array(token.char_seq)).tolist()) # Add 1 to include token for padding
                    if output_type == "category":
                        y_idx = 1 + output_label_dict.get(token.c_label, -1) # Add 1 to include token for padding
                    else:
                        y_idx = 1 + output_label_dict.get(token.b_label, -1) # Add 1 to include token for padding
                    y.append(y_idx) # Add 1 to include token for padding
                X.append(x)
                if return_chars:
                    padded_sequence = pad_sequences([[] for k in xrange(maxlen - len(x_char))], maxlen=max_charlen).tolist() +\
                            pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                    X_char.append(padded_sequence)
                Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)
    
    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y
Example #26
def extract_representation(maxlen, settings, model=None, seed=107, test_split=0.2):
	# load data
	save_block, save_sep, yes_dir, no_dir = get_file_paths(train=True)
	(X,y) = ISTapps.load_ISTapps(maxlen, separate=True, save_file=save_block, 
		yes_directory=yes_dir, no_directory=no_dir, seed=seed)
	X = np.asarray(X)
	y = np.asarray(y)
	split_point = int(len(X)*(1-test_split))
	X_train_prime = X[:split_point]
	y_train_prime = y[:split_point]
	X_test_prime = X[split_point:]
	y_test_prime = y[split_point:]
	# convert train set into a huge block of sequences and shuffle again
	(X_train, y_train) = ISTapps.extract_from_apps(X_train_prime, 
		y_train_prime, maxlen, seed, shuffle=True)
	# convert model for LSTM success comparison
	(X_test, y_test) = ISTapps.extract_from_apps(X_test_prime, 
		y_test_prime, maxlen, seed, shuffle=True)
	if not model :
		model = tune_model(X_train, y_train, X_test, y_test, settings)
	# return training data as a shuffled sentence block and separated by app
	X_train_block = model.predict(X_train)
	X_train_rep = [model.predict(sequence.pad_sequences(app,maxlen)) 
		for app in X_train_prime]
	X_test_rep = [model.predict(sequence.pad_sequences(app,maxlen)) 
		for app in X_test_prime]
	assert(len(X_train_rep) == len(y_train_prime))
	assert(len(X_test_rep) == len(y_test_prime))

	return (X_train_block, y_train), (X_train_rep, y_train_prime), (X_test_rep, y_test_prime), model
Example #27
 def vectorize(self, slist, emb, spad=60):
     """ build an spad-ed matrix of word indices from a list of
     token sequences; returns an si, sj tuple of indices in vocab
     and emb respectively """
     silist = []
     sjlist = []
     for s in slist:
         si = []
         sj = []
         for t in s:
             if self.icase:
                 t = t.lower()
             if t in self.word_idx:
                 si.append(self.word_idx[t])
                 sj.append(0)
             elif emb is not None and t in emb.w:
                 si.append(0)
                 sj.append(emb.w[t])
             else:
                 si.append(1)  # OOV
                 sj.append(0)
         silist.append(si)
         sjlist.append(sj)
     if spad is not None:
         return (pad_sequences(silist, maxlen=spad, truncating='post', padding='post'),
                 pad_sequences(sjlist, maxlen=spad, truncating='post', padding='post'))
     else:
         return (silist, sjlist)
Example #28
def fitmodel(model, X_train, X_test, y_train, y_test, epoch=30):
	X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
	X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
	score = model.evaluate(X_test, y_test, batch_size=32)
	print "score before eval: ", score
	model.fit(X_train, y_train, batch_size=32, nb_epoch=epoch)
	score = model.evaluate(X_test, y_test, batch_size=32)
	print "score after eval: ", score
Example #29
def preprocessingCaption(_cap, wordtoidx):

    _cap = map(lambda cap: [wordtoidx[word] for word in cap.lower().split(' ')[:-1] if word in wordtoidx], _cap)
    max_steps = np.max(map(lambda x: len(str(x).split(' ')),_cap)) # 79
    _cap = Sequence.pad_sequences(_cap, maxlen=max_steps+1, padding='post') # ndarray
    _cap = Sequence.pad_sequences(_cap, maxlen=max_steps+2, padding='pre')

    return _cap, max_steps
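The two pad_sequences calls above first post-pad (or truncate) every caption to max_steps+1 and then pre-pad to max_steps+2, which leaves a 0 at both the start and the end of each row. A small sketch of that effect on a made-up caption, assuming the standard Keras pad_sequences:

from keras.preprocessing.sequence import pad_sequences

cap = [[4, 8, 15]]                                     # one toy caption as word indices
step1 = pad_sequences(cap, maxlen=4, padding='post')   # [[ 4  8 15  0]]
step2 = pad_sequences(step1, maxlen=5, padding='pre')  # [[ 0  4  8 15  0]]
print(step2)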
Example #30
def pad_sentences(question1_word_sequences, question2_word_sequences, is_duplicate, maxlen):
    q1_data = pad_sequences(question1_word_sequences, maxlen=maxlen)
    q2_data = pad_sequences(question2_word_sequences, maxlen=maxlen)
    labels = np.array(is_duplicate, dtype=int)
    print('Shape of question1 data tensor:', q1_data.shape)
    print('Shape of question2 data tensor:', q2_data.shape)
    print('Shape of label tensor:', labels.shape)
    return q1_data, q2_data, labels
Example #31
 def _process(self, X_temp, indexes):
     data_ids = self.tokenizer.texts_to_sequences(X_temp)
     max_length = self.max_length
     batch_x = sequence.pad_sequences(data_ids, maxlen=max_length, padding='post')
     batch_y = self.labels[indexes]
     return batch_x, batch_y
Example #32
                         for seq in hindi_seqlist]

with open(os.path.join(base_dir, "model", model_name + "_hindi2index.pickle"),
          mode="wb") as file:
    pickle.dump(hindi2index, file)

with open(os.path.join(base_dir, "model", model_name + "_eng2index.pickle"),
          mode="wb") as file:
    pickle.dump(eng2index, file)

# max sequence length
max_len = 40

# sequence padding
eng_padded_seq = pad_sequences(maxlen=max_len,
                               sequences=encoded_eng_seqlist,
                               padding="post",
                               value=eng2index[" "])
hindi_padded_seq = pad_sequences(maxlen=max_len,
                                 sequences=encoded_hindi_seqlist,
                                 padding="post",
                                 value=hindi2index[" "])

#one hot encoding of hindi sequence
y = [
    to_categorical(seq, num_classes=len(hindi2index))
    for seq in hindi_padded_seq
]

eng_train, eng_test, y_train, y_test = train_test_split(eng_padded_seq,
                                                        y,
                                                        test_size=0.05)
Example #33
word2vec_model.save('my_word2vec_model_256_false.model')'''
x_train = []
for comment in table_x:
    s = comment[0].split(',', 1)[1]
    s_cut = jieba.cut(s)
    temp = []
    for word in s_cut:
        if word in my_word2vec_model.wv.vocab:
            temp.append(my_word2vec_model[word])
        #else : temp.append(my_word2vec_model["oov"])

    x_train.append(temp)

x_train = pad_sequences(x_train,
                        maxlen=48,
                        dtype='int32',
                        padding='post',
                        truncating='post',
                        value=my_word2vec_model[" "])
##########################  building model #################################################
model = Sequential()
model.add(
    LSTM(256,
         return_sequences=True,
         input_length=48,
         input_dim=256,
         dropout=0.5,
         recurrent_dropout=0.5,
         kernel_initializer='he_normal'))
model.add(
    LSTM(256,
         return_sequences=False,
Example #34
encoded_labels = t.texts_to_sequences(y)
test_encoded_docs = t.texts_to_sequences(test_x)
test_encoded_labels = t.texts_to_sequences(test_y)
word_index = t.word_index
index_word = {v:k for k, v in word_index.items()}

def decode_sequence(seq):
	decoded = ""
	for s in seq:
		if not s==0:
			decoded = decoded + index_word[int(s)] + " "
	return decoded.strip()	
	

# pad documents to max_len words (max_len should be the maximum sequence length)
padded_docs = pad_sequences(encoded_docs, maxlen=max_len, padding='post')
padded_labels = pad_sequences(encoded_labels, maxlen=max_len, padding='post')
test_padded_docs = pad_sequences(test_encoded_docs, maxlen=max_len, padding='post')
test_padded_labels = pad_sequences(test_encoded_labels, maxlen=max_len, padding='post')

x = padded_docs
y = padded_labels
test_x = test_padded_docs
test_y = test_padded_labels


def eval_batch(x_train, y_train, x_test, y_test, classifier, components, no_clusters, dimensionality):

	cluster_finder = cluster.KMeans(n_clusters=no_clusters)
	
	if classifier=='mbk':
Example #35
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x


#Removing Numbers
train['text'] = train['text'].apply(lambda x: remove_numbers(x))
test['text'] = test['text'].apply(lambda x: remove_numbers(x))

## Tokenize the sentences
tokenizer = Tokenizer(num_words=MAX_WORD_TO_USE)
tokenizer.fit_on_texts(list(train['text']))
train_X = tokenizer.texts_to_sequences(train['text'])
test_X = tokenizer.texts_to_sequences(test['text'])
## Pad the sentences
train_X = pad_sequences(train_X, maxlen=MAX_LEN)
test_X = pad_sequences(test_X, maxlen=MAX_LEN)
#Converting target to one-hot format
train_y = pd.get_dummies(train['label']).values
test_y = pd.get_dummies(test['label']).values

#words_dict is a dictionary like this:
#words_dict = {'the':5,'among':20,'interest':578}
#words_dict includes words and their corresponding numbers.
words_dict = tokenizer.word_index

#Present working directory
working_dir = os.getcwd()

EMBEDDING_FILE = '../glove.6B.{}d.txt'.format(EMBED_SIZE)
Example #36
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)
onehotencoder = OneHotEncoder(sparse=False)
y_train = onehotencoder.fit_transform(y_train.reshape(-1, 1))
y_test = onehotencoder.transform(y_test.reshape(-1, 1))

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(dataDF['text'])

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

embedding_dim = 300
embedding_matrix = create_embedding_matrix(glove_model, tokenizer.word_index,
                                           embedding_dim)

model = Sequential()
model.add(
    Embedding(vocab_size,
              embedding_dim,
              weights=[embedding_matrix],
              input_length=maxlen,
              trainable=True))

model.add(layers.GlobalMaxPool1D())
Example #37
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    # Augmenting X_train and X_test with n-grams features
    X_train = add_ngram(X_train, token_indice, ngram_range)
    X_test = add_ngram(X_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(
        np.mean(list(map(len, X_train)), dtype=int)))
    print('Average test sequence length: {}'.format(
        np.mean(list(map(len, X_test)), dtype=int)))

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))

# we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
model.add(GlobalAveragePooling1D())
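As the comment says, GlobalAveragePooling1D simply averages the embedding vectors over the time axis. A numpy-only sketch of the equivalent computation (ignoring masking), to make the shape change explicit:

import numpy as np

emb_out = np.random.rand(2, 5, 8)   # (batch, time, embedding_dims): 2 documents, 5 positions, 8-dim embeddings
pooled = emb_out.mean(axis=1)       # what GlobalAveragePooling1D computes when no mask is used
print(pooled.shape)                 # (2, 8): one averaged embedding per document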
Example #38
                            seed=1234,
                            iter=25)
w2v_model.save('w2v_model.pkl')
tokenizer = Tokenizer(num_words=len(word_set))
tokenizer.fit_on_texts(corpus)

train_q1 = tokenizer.texts_to_sequences(train_q1)
train_q2 = tokenizer.texts_to_sequences(train_q2)

test_q1 = tokenizer.texts_to_sequences(test_q1)
test_q2 = tokenizer.texts_to_sequences(test_q2)

dev_q1 = tokenizer.texts_to_sequences(dev_q1)
dev_q2 = tokenizer.texts_to_sequences(dev_q2)

train_pad_q1 = pad_sequences(train_q1, maxlen=MAX_SEQUENCE_LENGTH)
train_pad_q2 = pad_sequences(train_q2, maxlen=MAX_SEQUENCE_LENGTH)

test_pad_q1 = pad_sequences(test_q1, maxlen=MAX_SEQUENCE_LENGTH)
test_pad_q2 = pad_sequences(test_q2, maxlen=MAX_SEQUENCE_LENGTH)

dev_pad_q1 = pad_sequences(dev_q1, maxlen=MAX_SEQUENCE_LENGTH)
dev_pad_q2 = pad_sequences(dev_q2, maxlen=MAX_SEQUENCE_LENGTH)

embedding_matrix = np.zeros([len(tokenizer.word_index) + 1, EMB_DIM])

for word, idx in tokenizer.word_index.items():
    try:
        embedding_matrix[idx, :] = w2v_model.wv[word]
    except:
        print('1')
Example #39
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])

# Tokenization
token = Tokenizer()
token.fit_on_texts(docs)
print(token.word_index)
# {'너무': 1, '참': 2, '재밌어요': 3, '최고에요': 4, '잘': 5, '만든': 6, '영화에요': 7, '추천하고': 8, '싶은': 9, '영화': 10, '입니다': 11,
# '한번': 12, '더': 13, '보고': 14, '싶네요': 15, '글쎄요': 16, '별로에요': 17,
# '생각보다': 18, '지루해요': 19, '연기가': 20, '어색해요': 21, '재미없어요': 22, '재미없다': 23, '재밌네요': 24}
# More frequent words are assigned lower indices.

x = token.texts_to_sequences(docs)
print(x)
#[[1, 3], [4], [2, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15], [16], [17], [18, 19], [20, 21], [22], [1, 23], [2, 24]]

pad_x = pad_sequences(x, padding='pre', value=0)
print(pad_x)

word_size = len(token.word_index) + 1
print(word_size)

model = Sequential()
model.add(Embedding(25, 10, input_length=4))
model.add(Conv1D(10, 2))
model.add(Conv1D(10, 2))
model.add(MaxPool1D())
# model.add(Embedding(word_size,10,input_length=4))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.summary()
Example #40
def train_LSTM_Cross_Domain(tweets_train, tweets_test, vocab, MAX_SEQUENCE_LENGTH):
        a, p, r, f1 = 0., 0., 0., 0.
        a1, p1, r1, f11 = 0., 0., 0., 0.
        pn,rn,fn = 0.,0.,0.
        sentence_len = MAX_SEQUENCE_LENGTH
        batch_size =128
        
        X_train, y_train = gen_sequence(tweets_train,vocab,'categorical')
        X_test, y_test = gen_sequence(tweets_test,vocab,'binary')
        
        X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
        X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

        
        y_train = np.array(y_train)
        y_train = y_train.reshape((len(y_train), 1))
        X_temp = np.hstack((X_train, y_train))
        
        
        model = lstm_model(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

        if INITIALIZE_WEIGHTS_WITH == "glove":
            weights = get_embedding_weights(vocab)
            model.layers[0].set_weights([weights])
        elif INITIALIZE_WEIGHTS_WITH == "random":
            shuffle_weights(model)
        else:
            print ("ERROR!")
            return
        for epoch in range(EPOCHS):
            for X_batch in batch_gen(X_temp, BATCH_SIZE):
                x = X_batch[:, :sentence_len]
                y_temp = X_batch[:, sentence_len]

                try:
                    y_temp = np_utils.to_categorical(y_temp, num_classes=3)
                except Exception as e:
                    print (e)
                #print (x.shape, y_temp.shape)
                loss, acc = model.train_on_batch(x, y_temp, class_weight=None)
                #print (loss, acc)
                
        temp = model.predict_on_batch(X_test)
        y_pred_aux = np.argmax(temp, axis=1)
        y_pred=[]
        for i in y_pred_aux:
            if i == 2:
                y_pred.append(1)
            else:
                y_pred.append(i)
                
#         print (classification_report(y_test, y_pred))
#         print (precision_recall_fscore_support(y_test, y_pred))

        wordEmb = model.layers[0].get_weights()[0]

        word2vec_model = create_model(wordEmb,vocab)
        
        tweets_train = select_tweets_whose_embedding_exists(tweets_train, word2vec_model)
        tweets_test = select_tweets_whose_embedding_exists(tweets_test, word2vec_model)
        
        X_train, y_train = gen_data(tweets_train,word2vec_model,'categorical')
        X_test, y_test = gen_data(tweets_test,word2vec_model,'binary')
        
        precision, recall, f1_score, acc, p_weighted, p_macro, r_weighted, r1_macro, f1_weighted, f11_macro = gradient_boosting_classifier(X_train, y_train, X_test, y_test, 'cross')
        a += acc
        p += p_weighted
        p1 += p_macro
        r += r_weighted
        r1 += r1_macro
        f1 += f1_weighted
        f11 += f11_macro
        pn += precision
        rn += recall
        fn += f1_score
        print_scores(p, p1, r,r1, f1, f11,pn, rn, fn, 1)
Example #41
i=0
y_valid=np.zeros((len(label_valid),max(label_valid)+1))
for x in label_valid:
    y_valid[i][x]=1
    i=i+1


t = Tokenizer()
t.fit_on_texts(input_train)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(input_train)
#print(encoded_docs)
# pad documents to the length of the longest document
max_length = max(len_finder)
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
#print(padded_docs)
# load the whole embedding into memory
embeddings_index = dict()
f = open("G:\\NLP\\Dataset\\GloVe\\glove.6B.100d.txt", encoding="utf8")
for line in f:
 values = line.split()
 word = values[0]
 coefs = asarray(values[1:], dtype='float32')
 embeddings_index[word] = coefs
f.close()
#print('Loaded %s word vectors.' % len(embeddings_index))
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
 embedding_vector = embeddings_index.get(word)
Example #42
# Create a tokenizer
#==============================================================================


tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, lower=True )

tokenizer.fit_on_texts(docs)

sequences = tokenizer.texts_to_sequences(docs)

word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

# convert text to sequence of tokens and pad them to ensure equal length vectors 
x = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)



#==============================================================================
# Training, testing and validation
#==============================================================================
seed =1000

x_train, x_test, y_train, y_test = train_test_split(x, dummy_y, train_size=0.7, random_state=seed)

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.7, random_state=seed)


'''
Example #43
T = list(itertools.chain(*T))

# Generate a dictionary of valid characters
valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X + T)))}

max_features = len(valid_chars) + 1

maxlen = np.max([len(x) for x in X])
print(maxlen)

# Convert characters to int and pad
X1 = [[valid_chars[y] for y in x] for x in X]

T1 = [[valid_chars[y] for y in x] for x in T]

X_train = sequence.pad_sequences(X1, maxlen=maxlen)

X_test = sequence.pad_sequences(T1, maxlen=maxlen)

y_train = np.array(trainlabel)
y_test = np.array(testlabel)

hidden_dims = 128
nb_filter = 128
filter_length = 2
embedding_vecor_length = 128
pool_length = 2
lstm_output_size = 70

model = Sequential()
model.add(Embedding(max_features, embedding_vecor_length, input_length=maxlen))
Example #44
# 1-b set test and train
train_x = train['x'] 
train_y = train['y']
test_x = test['x']
test_y = test['y']

# 2-1 Tokenize the data
from keras.preprocessing.text import Tokenizer
token = Tokenizer(7000)
token.fit_on_texts(train_x)

x_train_seq = token.texts_to_sequences(train_x)
x_test_seq = token.texts_to_sequences(test_x)

# 2-2 set max length of data
x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test = sequence.pad_sequences(x_test_seq, maxlen=100)
#%%
from s
from keras.models import Sequential
from keras.layers.core import Dense,Dropout,Activation,Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, 
import matplotlib.pyplot as plt

modelRNN = Sequential()  # build the model
# The Embedding layer converts the list of word indices into a list of dense vectors
modelRNN.add(Embedding(output_dim=4,   # output dimension: each word index becomes a 4-dimensional vector
     input_dim=7000,  # input dimension: the 7000-word dictionary built by the Tokenizer above
     input_length=100)) # each sequence has been padded/truncated to 100 indices
Example #45
    def train(self, n_epochs=10):
        if not hasattr(self, 'model'):
            self.create_model()
            self.compile_model()

        self.serialize_class_data()
        self.serialize_model()

        validation_split = 0.1
        split_at = int(len(self.X_train) * (1. - validation_split))
        x, val_x = self.X_train[:split_at], self.X_train[split_at:]
        y, val_y = self.Y_train[:split_at], self.Y_train[split_at:]
        training_loss_history = []
        validation_loss_history = []

        for epoch in range(n_epochs):
            print('Epoch', epoch)
            training_loss = []
            end = int(float(len(x)) / self.batch_size)
            progbar = Progbar(end)
            for i in range(0, len(x), self.batch_size):
                inp = sequence.pad_sequences(x[i:i+self.batch_size],
                                             maxlen=self.maxlen)
                out = y[i:i+self.batch_size]
                loss = self.model.train_on_batch(inp, out)
                training_loss.append(loss)
                j = int(float(i) / self.batch_size)
                if j % 16 == 0:
                    progbar.update(j)
            progbar.update(end)
                
            # test on validation set
            validation_loss = []
            print()
            print('Evaluating on validation set:')
            end = int(float(len(val_x)) / self.batch_size)
            progbar = Progbar(end)
            for i in range(0, len(val_x), self.batch_size):
                inp = sequence.pad_sequences(val_x[i:i+self.batch_size],
                                             maxlen=self.maxlen)
                out = val_y[i:i+self.batch_size]
                output = self.model.test_on_batch(inp, out)
                validation_loss.append(output)
                j = int(float(i) / self.batch_size)
                if j % 16 == 0:
                    progbar.update(j)
            progbar.update(end)

            training_loss_history.append(np.mean(training_loss))
            validation_loss_history.append(np.mean(validation_loss))
            filename = op.join(self.serialization_dir,
                               'weights_epoch%d.h5' % epoch)
            self.model.save_weights(filename, overwrite=True)
            print()
            print ('Mean training loss: %5.3f; mean validation loss: %5.3f\n' %
                   (training_loss_history[-1], validation_loss_history[-1]))
            if (len(validation_loss_history) > 1 and 
                validation_loss_history[-1] >= validation_loss_history[-2]):
                break

        self.training_history = (map(float, training_loss_history),
                                 map(float, validation_loss_history))
Example #46
def prepare_text(text):
    text_clean = process_texts(text)
    text_word_sequences = tokenizer.texts_to_sequences(text_clean)
    input_text = pad_sequences(text_word_sequences, maxlen = config.MAX_SEQUENCE_LENGTH, padding = 'post')

    return input_text
Example #47
                   str(iteration + 1)):
            iteration += 1
            print("\n\n\n\nMaking nueral Network for iteration:", iteration)

            #Making Training and Testing Data
            X_Train = [Features[x] for x in train_index]
            X_Test = [Features[x] for x in test_index]
            radicalTrain = [Radical[x] for x in train_index]
            radicalTest = [Radical[x] for x in test_index]

            tokenisedTrain = tokenizer.texts_to_sequences(X_Train)
            tokenisedTest = tokenizer.texts_to_sequences(X_Test)

            max_review_length = 180
            X_Train = sequence.pad_sequences(tokenisedTrain,
                                             maxlen=max_review_length,
                                             padding='post')
            X_Test = sequence.pad_sequences(tokenisedTest,
                                            maxlen=max_review_length,
                                            padding='post')

            #Radical
            radicalModel = Sequential()
            radicalModel.add(
                Embedding(vocabSize,
                          100,
                          input_length=max_review_length,
                          weights=[embedding_matrix],
                          trainable=False))
            radicalModel.add(GRU(100, dropout=0.2, recurrent_dropout=0.2))
            radicalModel.add(Dense(1, activation='sigmoid'))
        print("批判性: ", score)


if __name__ == '__main__':
    comment_judgements = readJudgementsFromFile()
    comments, judgements = get_comment_and_judgement(comment_judgements)
    word_dataset = build_word_dataset(comment_judgements)
    word_index_dict = build_up_word_index_dict(word_dataset)

    comments = [comment_to_indices(comment, word_index_dict) for comment in comments]

    judgements = np.array(judge_to_one_hot(judgements))

    # -----Preparing the training and testing data-----
    trainAmount = int(len(comments) * 0.6)
    data = pad_sequences(comments, maxlen=max_seq_len, dtype='float32')

    random_mask = np.arange(len(data))
    np.random.shuffle(random_mask)
    data = data[random_mask]
    judgements = judgements[random_mask]

    train_data = data[:trainAmount]
    train_labels = judgements[:trainAmount]
    test_data = data[trainAmount:]
    test_labels = judgements[trainAmount:]
    validation_data = test_data[:200]
    validation_labels = test_labels[:200]

    print("Train Data's shape: ", train_data.shape)
    print("Train Labels' shape: ", train_labels.shape)
Example #49
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    if args['debug']:
        df_train = df_train.iloc[:30000]
        texts_train = df_train['question_text']
    else:
        logger.info('Preprocess text')
        texts_train = preprocess_text(df_train, return_df=False)
    seq_train, tokenizer = tokenize_texts(texts_train, logger)
    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)

    label_train = df_train['target'].values.reshape(-1, 1)

    embed_types = [0, 1, 2]

    logger.info('Start multiprocess nlp feature extraction and embedding matrices loading')
    with mp.Pool(processes=2) as p:
        results = p.map(parallel_apply, [
            (extract_nlp_features, (df_train,)),
            (load_multiple_embeddings, (tokenizer.word_index, embed_types, args['debug']))
        ])

    df_train_extracted = results[0]
    embedding_matrices = results[1]
    embedding_matrix = np.concatenate(
        [np.array([embedding_matrices[i] for i in [0, 1, 2]]).mean(0)] + [
            embedding_matrices[j] for j in [1]
        ], axis=1
    )

    nlp_columns = ['total_length', 'n_capitals', 'n_words', 'n_puncts', 'n_?', 'n_!', 'n_you']
    for col in nlp_columns:
        scaler = StandardScaler()
        df_train_extracted[col] = scaler.fit_transform(
            df_train_extracted[col].values.astype(np.float32).reshape(-1, 1)).reshape(-1, )

    x_nlp = [df_train_extracted[col].values.reshape(-1, 1) for col in nlp_columns]
    nlp_size = len(x_nlp)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    trigger = TRIGGER

    if args['debug']:
        epochs = 3
        n_splits = 2
    else:
        epochs = EPOCHS
        n_splits = KFOLD

    logger.info('Start training and evaluation loop')

    model_specs = [
        {'nlp_layer_types': ({'activation': 'relu', 'dim': 16, 'dropout': 0.2},
                             {'activation': 'relu', 'dim': 16, 'dropout': 0.2}),
         'rnn_layer_types': ({'type': 'lstm', 'dim': 64, 'num_layers': 1, 'dropout': 0.0},
                             {'type': 'gru', 'dim': 64, 'num_layers': 1, 'dropout': 0.0}),
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3}
                               ),
         'gamma': 2.0, 'alpha': 0.75, 'combined': False, 'weight': 1.0},
        {'nlp_layer_types': ({'activation': 'relu', 'dim': 16, 'dropout': 0.2},
                             {'activation': 'relu', 'dim': 16, 'dropout': 0.2}),
         'rnn_layer_types': ({'type': 'lstm', 'dim': 64, 'num_layers': 1, 'dropout': 0.0},
                             {'type': 'gru', 'dim': 64, 'num_layers': 1, 'dropout': 0.0}),
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3}
                               ),
         'gamma': 2.0, 'alpha': 0.50, 'combined': False, 'weight': 1.0},
        {'nlp_layer_types': ({'activation': 'relu', 'dim': 16, 'dropout': 0.2},
                             {'activation': 'relu', 'dim': 16, 'dropout': 0.2}),
         'rnn_layer_types': ({'type': 'lstm', 'dim': 64, 'num_layers': 1, 'dropout': 0.0},
                             {'type': 'gru', 'dim': 64, 'num_layers': 1, 'dropout': 0.0}),
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3}
                               ),
         'gamma': 2.0, 'alpha': 0.75, 'combined': True, 'weight': 1.0},
        {'nlp_layer_types': ({'activation': 'relu', 'dim': 16, 'dropout': 0.2},
                             {'activation': 'relu', 'dim': 16, 'dropout': 0.2}),
         'rnn_layer_types': ({'type': 'lstm', 'dim': 64, 'num_layers': 1, 'dropout': 0.0},
                             {'type': 'gru', 'dim': 64, 'num_layers': 1, 'dropout': 0.0}),
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3}
                               ),
         'gamma': 2.0, 'alpha': 0.75, 'combined': True, 'weight': 5.0},
        {'nlp_layer_types': ({'activation': 'relu', 'dim': 16, 'dropout': 0.2},
                             {'activation': 'relu', 'dim': 16, 'dropout': 0.2}),
         'rnn_layer_types': ({'type': 'lstm', 'dim': 64, 'num_layers': 1, 'dropout': 0.0},
                             {'type': 'gru', 'dim': 64, 'num_layers': 1, 'dropout': 0.0}),
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3}
                               ),
         'gamma': 2.0, 'alpha': 0.50, 'combined': True, 'weight': 5.0},
        {'nlp_layer_types': ({'activation': 'relu', 'dim': 16, 'dropout': 0.2},
                             {'activation': 'relu', 'dim': 16, 'dropout': 0.2}),
         'rnn_layer_types': ({'type': 'lstm', 'dim': 64, 'num_layers': 1, 'dropout': 0.0},
                             {'type': 'gru', 'dim': 64, 'num_layers': 1, 'dropout': 0.0}),
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3}
                               ),
         'gamma': 2.0, 'alpha': 0.75, 'combined': True, 'weight': 3.0},
    ]
    model_name_base = 'NLPFeaturesDeepRNN'

    for spec_id, spec in enumerate(model_specs):
        model_name = model_name_base + f'_specId={spec_id}'

        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
        oof_mv_preds = np.zeros(len(seq_train))
        oof_preds_proba = np.zeros(len(seq_train))
        oof_opt_preds = np.zeros(len(seq_train))
        oof_reopt_preds = np.zeros(len(seq_train))
        results_list = []
        for fold, (index_train, index_valid) in enumerate(skf.split(label_train, label_train)):
            logger.info(f'Fold {fold + 1} / {KFOLD} - create dataloader and build model')
            x_train = {
                'text': seq_train[index_train].astype(int),
                'nlp': [x[index_train] for x in x_nlp]
            }
            x_valid = {
                'text': seq_train[index_valid].astype(int),
                'nlp': [x[index_valid] for x in x_nlp]
            }
            y_train = label_train[index_train].astype(np.float32)
            y_valid = label_train[index_valid].astype(np.float32)

            model = NLPFeaturesDeepRNN(embedding_matrix, PADDING_LENGTH, nlp_size,
                                       embed_drop=0.2, mask=True,
                                       nlp_layer_types=spec['nlp_layer_types'],
                                       rnn_layer_types=spec['rnn_layer_types'],
                                       upper_layer_types=spec['upper_layer_types'])

            steps_per_epoch = seq_train[index_train].shape[0] // batch_size
            scheduler_trigger_steps = steps_per_epoch * trigger
            step_size = steps_per_epoch * (epochs - trigger) // NUM_SNAPSHOTS

            if spec['combined']:
                criterion_type = 'bce_focal'
            else:
                criterion_type = 'focal'

            config = {
                'epochs': epochs,
                'batch_size': batch_size,
                'output_device': output_device,
                'criterion_type': criterion_type,
                'criteria_weights': [1.0, spec['weight']],
                'criterion_gamma': spec['gamma'],
                'criterion_alpha': spec['alpha'],
                'optimizer': 'adam',
                'optimizer_lr': 0.003,
                'num_snapshots': NUM_SNAPSHOTS,
                'scheduler_type': 'cyclic',
                'base_lr': 0.0005,
                'max_lr': 0.003,
                'step_size': step_size,
                'scheduler_mode': 'triangular',
                'scheduler_gamma': 0.9,
                'scheduler_trigger_steps': scheduler_trigger_steps,
                'sampler_type': 'normal',
                'seed': SEED
            }

            trainer = Trainer(model, logger, config)
            eval_results = trainer.train_and_eval_fold(x_train, y_train, x_valid, y_valid, fold)

            fold_results = calculate_fold_metrics(eval_results, label_train[index_valid].reshape(-1,))
            results_list.append(fold_results)

            message = f'Fold {fold + 1} / {KFOLD} has been done.\n'

            message += f'Majority Voting - F1: {fold_results["oof_mv_f1"]}, '
            message += f'Precision: {fold_results["oof_mv_precision"]}, Recall: {fold_results["oof_mv_recall"]}\n'

            message += f'Optimized - F1: {fold_results["oof_opt_f1"]}, '
            message += f'Precision: {fold_results["oof_opt_precision"]}, Recall: {fold_results["oof_opt_recall"]}\n'

            message += f'Re-optimized - F1: {fold_results["oof_reopt_f1"]}, '
            message += f'Precision: {fold_results["oof_reopt_precision"]}, Recall: {fold_results["oof_reopt_recall"]}\n'

            message += f'Focal Loss: {fold_results["oof_focal_loss"]}, '
            message += f'Optimized Threshold: {fold_results["oof_opt_threshold"]}, '
            message += f'Re-optimized Threshold: {fold_results["oof_reopt_threshold"]}'
            logger.post(message)

            eval_results_addition = {
                'date': datetime.now(),
                'script_name': SCRIPT_NAME,
                'spec_id': spec_id,
                'model_name': model_name,
                'fold_id': fold
            }
            for res in eval_results:
                res.update(eval_results_addition)
                # post_to_snapshot_metrics_table(data=res, project_id=BQ_PROJECT_ID, dataset_name=BQ_DATASET)

            fold_results_addition = {
                'date': datetime.now(),
                'script_name': SCRIPT_NAME,
                'spec_id': spec_id,
                'model_name': model_name,
                'fold_id': fold
            }
            fold_results.update(fold_results_addition)
            post_to_fold_metrics_table(fold_results, project_id=BQ_PROJECT_ID, dataset_name=BQ_DATASET)

            oof_mv_preds[index_valid] = fold_results['oof_mv_preds']
            oof_opt_preds[index_valid] = fold_results['oof_opt_preds']
            oof_reopt_preds[index_valid] = fold_results['oof_reopt_preds']
            oof_preds_proba[index_valid] = fold_results['oof_preds_proba']

        results = calculate_total_metrics(results_list)

        results_addition = {
            'date': datetime.now(),
            'script_name': SCRIPT_NAME,
            'spec_id': spec_id,
            'model_name': model_name
        }
        results.update(results_addition)

        if args['save_preds']:
            save_path = DATA_DIR.joinpath(f'predictions/{SCRIPT_NAME + "_" + model_name + ".pkl"}')
            predictions = {
                'proba': oof_preds_proba,
                'mv': oof_mv_preds,
                'opt': oof_opt_preds,
                'reopt': oof_reopt_preds
            }
            joblib.dump(predictions, str(save_path))

        post_to_total_metrics_table(results, project_id=BQ_PROJECT_ID, dataset_name=BQ_DATASET)

        logger.post(f'Spec ID: {spec_id}\nModel Spec: {spec}')

        message = 'KFold training and evaluation has been done.\n'
        message += f'Majority Voting - F1: avg = {results["mv_f1_avg"]}, std = {results["mv_f1_std"]}, '
        message += f'Precision: {results["mv_precision_avg"]}, Recall: {results["mv_recall_avg"]}\n'

        message += f'Optimized - F1: avg = {results["opt_f1_avg"]}, std = {results["opt_f1_std"]}, '
        message += f'Precision: {results["opt_precision_avg"]}, Recall: {results["opt_recall_avg"]}\n'

        message += f'Re-optimized - F1: avg = {results["reopt_f1_avg"]}, std = {results["reopt_f1_std"]}, '
        message += f'Precision: {results["reopt_precision_avg"]}, Recall: {results["reopt_recall_avg"]}\n'

        mv_thresholds = ", ".join([str(th) for th in results["mv_thresholds_avg"]])

        message += f'Focal Loss: {results["focal_loss_avg"]}, '
        message += f'Optimized Threshold: {results["opt_threshold_avg"]}, '
        message += f'Re-optimized Threshold: {results["reopt_threshold_avg"]}\n'
        message += f'Majority Voting Thresholds: {mv_thresholds}'
        logger.post(message)
Example #50
0
         left_word4, phonetic_input]
    all_outputs = [outputs, out1, out2, out3, out4, out5, out6]

    model = Model(inputs=all_inputs, outputs=all_outputs)
    opt = Adam()

    return model


X_vocab_len = 90
X_max_len = 18
n1, n2, n3, n4, n5, n7, _ = pickle.load(open('pickle-dumps/n', 'rb'))

# print("Zero padding .. ")
X_wrds_inds = pad_sequences(X_wrds_inds,
                            maxlen=X_max_len,
                            dtype='int32',
                            padding='post')
X_left1 = pad_sequences(X_left1,
                        maxlen=X_max_len,
                        dtype='int32',
                        padding='post')
X_left2 = pad_sequences(X_left2,
                        maxlen=X_max_len,
                        dtype='int32',
                        padding='post')
X_left3 = pad_sequences(X_left3,
                        maxlen=X_max_len,
                        dtype='int32',
                        padding='post')
X_left4 = pad_sequences(X_left4,
                        maxlen=X_max_len,
Example #51
0
df.drop(df.columns[1], axis=1, inplace=True)
df.info()

X = df.v2
Y = df.v1
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1, 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.05)
max_words = 10000
max_len = 128
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)


def RNN():
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(max_words, 50, input_length=max_len)(inputs)
    layer = GRU(64)(layer)
    layer = Dense(256, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model
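The snippet defines RNN() but stops before training; here is a minimal sketch of how it might be compiled and fit on sequences_matrix, where the optimizer, batch size, and epoch count are assumptions rather than values from the original:

from keras.optimizers import RMSprop

model = RNN()
model.summary()
model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
model.fit(sequences_matrix, Y_train, batch_size=128, epochs=10, validation_split=0.2)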

import pickle
with open(os.path.join('data', 'save', "sequences.txt"), "wb") as _fp:
    pickle.dump(sequences, _fp)


# Load encoded_sentences
with open(os.path.join('data', 'save', "sequences.txt"), "rb") as _fp:
    sequences = pickle.load(_fp)
    gc.collect()


# Padding
from keras.preprocessing.sequence import pad_sequences

max_length = max([len(s) for s in sequences])
X = pad_sequences(sequences, maxlen=max_length, padding='post')

np.save(os.path.join('data', 'save', "X.npy"), X)
X = np.load(os.path.join('data', 'save', "X.npy"))


# Creating X with word embeddings
unique_words = len(word_index)
total_words = unique_words + 1
skipped_words = 0
embedding_dim = 100
embedding_matrix = np.zeros((total_words,embedding_dim))

for word, index in tokenizer.word_index.items():
    try:
        embedding_vector = model[word]
Example #53
0
def prepare_data(filepath, num_data_points=40000, vocab_size=4000, max_length=500):
    train_set_proportion = 0.9
    train_size = int(num_data_points * train_set_proportion)

    print("Preparing Data...")
    current_file = open(filepath, "rb")
    x = current_file.read()
    current_file.close()

    x = x.decode("utf-8")
    x = x.splitlines()
    random.shuffle(x)
    x = x[:num_data_points]
    labels = []
    reviews = []

    reTokenizer = RegexpTokenizer(r'\w+')

    for i in x:
        separated = i.split(" ", 1)
        labels.append(separated[0])
        reviews.append(separated[1])

    for i in range(len(labels)):
        labels[i] = int(labels[i] == '__label__1')

    all_words = []
    for i in range(len(reviews)):
        tokens = reTokenizer.tokenize(reviews[i])
        reviews[i] = []
        for word in tokens:
            word = word.lower()
            all_words.append(word)
            reviews[i].append(word)

    vocab_pickle_location = os.path.join(vocab_directory, "all_words.pkl")

    if not os.path.isdir(vocab_directory):
        print("Error: vocab_directory doesn't exist!")
    else:
        all_words = pickle.load(open(vocab_pickle_location, 'rb'))
        all_words = all_words[:vocab_size]

    word2int = {all_words[i][0]: i + 1 for i in range(vocab_size)}

    # int2word = {x: y for y, x in word2int.items()}
    # dict_as_list = list(word2int)

    def review2intlist(rev_text):
        int_list = []
        for word in rev_text:
            if word in word2int.keys():
                int_list.append(word2int[word])
        return int_list

    X = []
    for i in range(len(reviews)):
        X.append(review2intlist(reviews[i]))
    X = sequence.pad_sequences(X, maxlen=max_length)

    LSTM_inputs = np.zeros(shape=(max_length, num_data_points), dtype=np.float32)
    for i in range(num_data_points):
        LSTM_inputs[:, i] = X[i]
    LSTM_inputs = LSTM_inputs.T

    LSTM_outputs = np.zeros(shape=num_data_points)
    for i in range(num_data_points):
        LSTM_outputs[i] = labels[i]

    x_train, y_train = LSTM_inputs[:train_size], LSTM_outputs[:train_size]
    x_test, y_test = LSTM_inputs[train_size:], LSTM_outputs[train_size:]

    half_test_size = int(len(y_test)/2)
    x_valid = x_test[:half_test_size]
    y_valid = y_test[:half_test_size]
    x_test = x_test[half_test_size:]
    y_test = y_test[half_test_size:]

    print("Finished preparing data...")
    return x_train, y_train, x_test, y_test, x_valid, y_valid
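A hedged usage sketch of prepare_data; the file path is a placeholder for a fastText-style file with one '__label__X review text' line per review, and it assumes the pickled vocabulary in vocab_directory is available:

x_train, y_train, x_test, y_test, x_valid, y_valid = prepare_data(
    "reviews.txt",                       # placeholder path, not from the original script
    num_data_points=40000, vocab_size=4000, max_length=500)
print(x_train.shape, y_train.shape)      # expected: (36000, 500) (36000,)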
Example #54
0
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)

########################################
## generate leaky features
########################################

train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)
    def pad(self, data, len=None):
        from keras.preprocessing.sequence import pad_sequences
        return pad_sequences(data, maxlen=len, padding='post',
                             truncating='post', value=0)
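A quick self-contained check of the truncation behavior this helper pins down; by default pad_sequences drops the front of an over-long sequence, while truncating='post' drops the end (the toy sequence is made up):

from keras.preprocessing.sequence import pad_sequences

seq = [[1, 2, 3, 4, 5]]
print(pad_sequences(seq, maxlen=3))                     # default truncating='pre' keeps the tail: [[3 4 5]]
print(pad_sequences(seq, maxlen=3, truncating='post'))  # 'post' keeps the head: [[1 2 3]]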
Example #56
0
def y_label_change(k,maxlen,sen_label,sen):
    sen=sen[0:k]
    tokenizer = Tokenizer(num_words=None)
    tokenizer.fit_on_texts(sen)   # fit the tokenizer on the sentences
    word_sequence = tokenizer.texts_to_sequences(sen)  # convert the texts into index sequences
    train_data = pad_sequences(word_sequence, maxlen=maxlen, padding="post")    # pad at the end; the default would pad at the front
    word_index = tokenizer.word_index   # word-to-index mapping
    # word_index['PAD']=0
    # word_index['UNK']=1
    print(train_data.shape)
    print("word",word_index)
    #model =  KeyedVectors.load_word2vec_format('./model/text.model.bin', binary=True)
    model=gensim.models.Word2Vec.load('./model/ner.model')
    embedding_matrix = np.zeros((len(word_index) + 1, 100))
    for word, i in word_index.items():
        if word in model:
            embedding_matrix[i] = np.asarray(model[word])
        else:
            # words not found in the embedding model stay all-zeros
            embedding_matrix[i] = np.asarray(0)

    print("t",embedding_matrix.shape)
    tag = ['O', 'B-TIM', 'I-TIM', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-COM', 'I-COM',
           'B-PRO', 'I-PRO', 'B-JOB', 'I-JOB', 'B-PER', 'I-PER']
    Y = sen_label[0:k]  # the steps below convert the 2-D tag matrix into integer class labels
    # map each tag string to its index in `tag`; unknown values are left unchanged
    tag_to_index = {t: i for i, t in enumerate(tag)}
    for a in range(len(Y)):
        for b in range(len(Y[a])):
            Y[a][b] = tag_to_index.get(Y[a][b], Y[a][b])
    print(Y)
    Y = pad_sequences(Y, maxlen=maxlen, padding="post")
    # print("labelsall",np.array(labels_all).shape)
    num_class=len(set(list(tag)))
    print(num_class)
    Y=np.expand_dims(Y, 2)
    return Y,tag,embedding_matrix,word_index,num_class,train_data
Example #57
0
job_detail_pd[
    'Job_Description_key_word'] = job_detail_pd.Job_Description.apply(
        key_word_extract)

#  -------------------------- build the dictionary -------------------------------
# build a dictionary of the 2000 most frequent words
token = Tokenizer(num_words=2000)
token.fit_on_texts(
    job_detail_pd['Job_Description_key_word'])  # words are ranked by frequency; the top 2000 enter the dictionary

# use the token dictionary to turn the text into lists of word indices
Job_Description_Seq = token.texts_to_sequences(
    job_detail_pd['Job_Description_key_word'])

# truncate or pad every index list to length 50; preprocessing step before the embedding layer
Job_Description_Seq_Padding = sequence.pad_sequences(Job_Description_Seq,
                                                     maxlen=50)  # pad every sequence to length 50
x_train = Job_Description_Seq_Padding
y_train = job_detail_pd['label'].tolist()  # convert the label column to a Python list

# ------------------ class --------------------
from keras.layers import Input
from keras.models import Model

inputs = Input(shape=(50, ))


class JobModel(keras.Model):
    def __init__(self):
        super(JobModel, self).__init__()
        self.embedding = Embedding(output_dim=32, input_dim=2000)
        self.conv1 = Conv1D(256, 3, activation='relu')
Example #58
0
        y=librosa.effects.harmonic(X), sr=sample_rate).T,
                           axis=0)
    tonnetz_std = np.std(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),
                                                 sr=sample_rate).T,
                         axis=0)
    return (mfcc_mean, chroma_mean, mel_mean, contrast_mean, tonnetz_mean,
            mfcc_std, chroma_std, mel_std, contrast_std, tonnetz_std)


for fn in files:
    print("Process...", fn)
    try:
        print('process..', fn)
        feature_lld = extract_lld(fn)
        feature_hfs = extract_hfs(fn)
    except Exception as e:
        print('cannot open', fn)
        traceback.print_exc()
        sys.exit(3)

    lld_features = np.hstack(feature_lld)
    hfs_features = np.hstack(feature_hfs)
    feat_lld.append(lld_features)
    feat_hfs.append(hfs_features)

#feat_np = np.array(feat)
feat_lld = np.array(feat_lld)
feat_lld = sequence.pad_sequences(feat_lld, dtype='float64')
np.save('../data/song_librosa.npy', feat_lld)
np.save('../data/song_librosa_hfs.npy', feat_hfs)
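The dtype='float64' argument matters here: pad_sequences defaults to int32, which would truncate the real-valued librosa features to integers. A small illustration with made-up values:

import numpy as np
from keras.preprocessing import sequence

toy_feats = [np.array([0.25, 1.75]), np.array([3.5])]
print(sequence.pad_sequences(toy_feats, dtype='float64'))  # keeps the floats: [[0.25 1.75] [0. 3.5]]
print(sequence.pad_sequences(toy_feats))                   # default int32 truncates: [[0 1] [0 3]]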
Example #59
0
def test_tokenize(tokenizer, sents, MAX_SEQUENCE_LENGTH=500):
    sequences = tokenizer.texts_to_sequences(sents)
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return text


def encode_docs(tokenizer, max_length, docs):
    # integer encode
    encoded = tokenizer.texts_to_sequences(docs)
    # pad sequences
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded
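A hedged usage sketch for encode_docs; the two-document corpus and max_length below are made up:

from keras.preprocessing.text import Tokenizer

docs = ["the cat sat on the mat", "the dog barked loudly"]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
padded = encode_docs(tokenizer, max_length=8, docs=docs)
print(padded.shape)   # (2, 8)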