# Imports assumed from the calls below; convert_csv_list and create_model are
# project helpers defined elsewhere in the repo.
import pickle

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report


def train_CNN(filepath):
	texts, labels = convert_csv_list( filepath )
	tokenizer = Tokenizer(num_words=300)
	tokenizer.fit_on_texts(texts)
	sequences = tokenizer.texts_to_sequences(texts)
	word_index = tokenizer.word_index
	vocab_size = len(word_index)
	data = pad_sequences(sequences, maxlen=322)
	print("Length of training data {}".format(len(data)))
	print("Shape of data {}".format(data.shape))
	labels_cat = np.array(labels)

	kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=12)
	cvscores = []
	models = []
	test_data = []

	""" Ready to train """
	print(" data shape {}".format(data.shape))
	# print(" train shape {}".format(labels.shape))
	for train,test in kfold.split(data,labels):
		# Only a single CNN layer with one filter size is used here; combining several
		# filter sizes on the same embedding output needs the functional API (see the
		# sketch after this function).
		Y = labels_cat[train]
		Y_test = labels_cat[test]
		model = create_model(vocab_size+2,100,322,(3,),256,0.3)
		model.fit(data[train],Y,epochs=15,batch_size=128)
		scores = model.evaluate(data[test],Y_test,verbose=1)
		print("{} {}".format(model.metrics,scores))
		cvscores.append(scores[1])
		models.append(model)
		test_data.append(test)
		
	max_index = np.array(cvscores).argmax()
	model = models[max_index]
	t_data = test_data[max_index]
	predicted = model.predict(data[t_data])
	print(np.round(predicted))
	print(labels_cat[t_data])
	print(classification_report(labels_cat[t_data],np.round(predicted)))
	# print(confusion_matrix(np.argmax(labels_cat[t_data]),np.argmax(np.round(predicted))))
	model.save('./models/cl_CNN.h5')
	pickle.dump(tokenizer,open('./models/tokenizer_CNN.p','wb'))
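create_model is a project helper that never appears in these snippets. Going only by how it is called above, create_model(vocab_size, embedding_dim, input_length, filter_sizes, num_filters, dropout), here is a minimal sketch of a compatible text CNN. It is an assumption, not the repo's actual implementation, and it uses the functional API, which does allow several parallel filter sizes on one embedding output (the restriction mentioned in the comments applies to a plain Sequential stack). The sigmoid output assumes binary labels, to match the np.round(predicted) usage above.

# Sketch only: a plausible create_model() matching the call signature used here.
from keras.models import Model
from keras.layers import (Input, Embedding, Conv1D, GlobalMaxPooling1D,
                          Dropout, Dense, concatenate)


def create_model(vocab_size, embedding_dim, input_length, filter_sizes,
                 num_filters, dropout_rate, embedding_matrix=None):
    inputs = Input(shape=(input_length,))
    weights = [embedding_matrix] if embedding_matrix is not None else None
    x = Embedding(vocab_size, embedding_dim, input_length=input_length,
                  weights=weights)(inputs)
    # One Conv1D + global max-pooling branch per filter size, e.g. (3,) or (3, 4, 5).
    branches = []
    for size in filter_sizes:
        conv = Conv1D(num_filters, size, activation='relu')(x)
        branches.append(GlobalMaxPooling1D()(conv))
    merged = branches[0] if len(branches) == 1 else concatenate(branches)
    merged = Dropout(dropout_rate)(merged)
    outputs = Dense(1, activation='sigmoid')(merged)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model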
Example #2
    #labels = to_categorical(np.asarray(labels))  # this converts [0,0,1,1] to [[1..],[1...],[0 1 0...]..]
    #print(labels)
    #labels = labels[indices]

    kfold = StratifiedKFold(n_splits=30, shuffle=True, random_state=12)
    cvscores = []
    models = []
    test_data = []
    """ Ready to train """
    print(" data shape {}".format(data.shape))
    print(" train shape {}".format(labels.shape))
    for train, test in kfold.split(data, labels):
        # Only a single CNN layer with one filter size; combining several filter sizes
        # on the same embedding output needs the functional API.
        Y = labels_cat[train]
        Y_test = labels_cat[test]
        model = create_model(vocab_size, 100, 50, (3, ), 256, 0.3)
        model.fit(data[train], Y, epochs=80, batch_size=16)
        scores = model.evaluate(data[test], Y_test, verbose=1)
        print("{} {}".format(model.metrics, scores))
        cvscores.append(scores[2])
        models.append(model)
        test_data.append(test)
    print(cvscores)
    max_index = np.array(cvscores).argmax()
    model = models[max_index]
    t_data = test_data[max_index]
    predicted = model.predict(data[t_data])
    print(np.round(predicted))
    print(labels_cat[t_data])
    print(classification_report(labels_cat[t_data], np.round(predicted)))
    print(
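The commented-out confusion_matrix line in Example #1 calls np.argmax without an axis argument, which would collapse the whole array to a single index. Assuming the labels here are one-hot encoded and sklearn's confusion_matrix is available, a sketch of the intended per-class comparison (not part of the original snippet):

from sklearn.metrics import confusion_matrix

# Convert one-hot label rows and probability rows to class indices before comparing.
y_true = labels_cat[t_data].argmax(axis=1)
y_pred = predicted.argmax(axis=1)
print(confusion_matrix(y_true, y_pred))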
Example #3
# 		fh.close()
# 		texts = texts+lines
# 		[labels.append(class_id) for x in lines]
# 		class_id += 1

# Imports assumed from the calls below; read_inputs and create_model are project
# helpers defined elsewhere, and read_inputs is expected to populate the
# module-level texts and labels lists.
import pickle

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

if __name__ == '__main__':
    read_inputs('./data')
    tokenizer = Tokenizer(num_words=500)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    vocab_size = len(word_index)
    data = pad_sequences(sequences, maxlen=50)
    print("Length of training data {}".format(len(data)))
    print("Shape of data {}".format(data.shape))
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    print("Indices {}".format(indices))
    data = data[indices]
    # to_categorical turns integer labels such as [0, 0, 1, 1] into one-hot rows
    # such as [[1, 0], [1, 0], [0, 1], [0, 1]].
    labels = to_categorical(np.asarray(labels))
    print(labels)
    labels = labels[indices]
    # A single CNN layer with one filter size; combining several filter sizes on
    # the same embedding output needs the functional API.
    model = create_model(vocab_size, 100, 50, (3, ), 256, 0.5)
    """ Ready to train """
    model.fit(data, labels, epochs=500, batch_size=16)
    model.save('./keras_saved_model/intent_model.h5')
    pickle.dump(tokenizer, open('./keras_saved_model/tokenizer.p', 'wb'))
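Once the model and tokenizer are saved, classifying a new utterance only needs the same preprocessing: tokenize with the saved Tokenizer, pad to the same maxlen=50, and take the argmax of the prediction. A usage sketch, reusing the paths from Example #3 (the sample sentence is only an illustration):

import pickle

import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

model = load_model('./keras_saved_model/intent_model.h5')
tokenizer = pickle.load(open('./keras_saved_model/tokenizer.p', 'rb'))

sentence = "book a table for two tonight"  # hypothetical input
seq = tokenizer.texts_to_sequences([sentence])
padded = pad_sequences(seq, maxlen=50)     # same maxlen as at training time
probs = model.predict(padded)
print("predicted class index:", np.argmax(probs, axis=1)[0])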
Example #4
    #labels = labels[indices]

    kfold = StratifiedKFold(n_splits=30, shuffle=True, random_state=12)
    cvscores = []
    models = []
    test_data = []
    """ Ready to train """
    print(" data shape {}".format(data.shape))
    print(" train shape {}".format(labels.shape))
    for train, test in kfold.split(data, labels):
        # Only a single CNN layer with one filter size; combining several filter sizes
        # on the same embedding output needs the functional API.
        Y = labels_cat[train]
        Y_test = labels_cat[test]
        model = create_model(vocab_size,
                             100,
                             50, (3, ),
                             256,
                             0.3,
                             embedding_matrix=glove.embedding_matrix)
        model.fit(data[train], Y, epochs=80, batch_size=16)
        scores = model.evaluate(data[test], Y_test, verbose=1)
        print("{} {}".format(model.metrics, scores))
        cvscores.append(scores[2])
        models.append(model)
        test_data.append(test)
    print(cvscores)
    max_index = np.array(cvscores).argmax()
    model = models[max_index]
    t_data = test_data[max_index]
    predicted = model.predict(data[t_data])
    print(np.round(predicted))
    print(labels_cat[t_data])
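Example #4 passes glove.embedding_matrix into create_model, but the glove object itself is not shown. A minimal sketch of how such a matrix is commonly built from the tokenizer's word_index and a pre-trained GloVe vectors file (the file path, dimension, and helper name are assumptions, not the repo's code):

import numpy as np


def build_embedding_matrix(word_index, glove_path='glove.6B.100d.txt', dim=100):
    # Read GloVe vectors: each line is "word v1 v2 ... v100".
    vectors = {}
    with open(glove_path, encoding='utf-8') as fh:
        for line in fh:
            parts = line.split()
            vectors[parts[0]] = np.asarray(parts[1:], dtype='float32')
    # Row i holds the vector for the word with tokenizer index i; row 0 stays
    # zero for padding, as do words missing from GloVe.
    matrix = np.zeros((len(word_index) + 1, dim))
    for word, idx in word_index.items():
        if word in vectors:
            matrix[idx] = vectors[word]
    return matrix


embedding_matrix = build_embedding_matrix(word_index)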