def run_classifer(X_train, s_train, y_train, X_test, s_test, y_test):
    """Train and evaluate a 2-hidden-layer softmax MLP news-source classifier.

    X_* are the documents / document feature matrices, s_* are hand-crafted
    style-feature rows (converted to arrays below), y_* are the class labels
    (15 classes).  Behavior is driven by command-line flags:
      sys.argv[4] == 'true'  -> use the custom StemTokenizer
      sys.argv[5] != 'true'  -> disable stop-word removal
      sys.argv[7]            -> feature set: 'word' | 'topic' | 'style' | 'all'
    Prints test accuracy and per-class precision/recall/F-score/support.

    NOTE(review): function name has a typo ("classifer"); kept as-is so
    existing callers keep working.
    """
    s_train = np.array(s_train)  # samples x features
    s_test = np.array(s_test)
    num_labels = 15   # size of the softmax output layer
    batch_size = 100
    # Build the stop-word list: NLTK English stop words plus their stems,
    # plus stemmed forms the vectorizer previously complained about.
    stemmer = sb.SnowballStemmer('english')
    swlist = sw.words('english')
    swlist += [stemmer.stem(w) for w in swlist]
    swlist += [
        "'d", "'s", 'abov', 'ani', 'becaus', 'befor', 'could', 'doe',
        'dure', 'might', 'must', "n't", 'need', 'onc', 'onli', 'ourselv',
        'sha', 'themselv', 'veri', 'whi', 'wo', 'would', 'yourselv'
    ]  # complained about not having these as stop words
    # Stemmed publication names are also treated as stop words so the model
    # cannot key on an outlet's own name appearing in its articles.
    pubs = [
        'buzzfe', 'buzzf', 'npr', 'cnn', 'vox', 'reuter', 'breitbart',
        'fox', 'guardian', 'review', 'theatlant'
    ]
    punct = [
    ]  # [':', '..', '“', '@', '%', ';', '→', ')', '#', '(', '*', '&', '[', ']', '…', '?','—', '‘', '$'] #gonna leave these in for now
    swlist += pubs
    swlist += punct
    # sys.argv[4]: 'true' -> stem while tokenizing; otherwise default tokenizer
    if sys.argv[4].lower() == 'true':
        tkzr = StemTokenizer()
    else:
        tkzr = None
    # sys.argv[5]: anything but 'true' -> no stop-word filtering at all
    if sys.argv[5].lower() != 'true':
        swlist = []
    # what features are we using? (sys.argv[7])
    if sys.argv[7].lower() == 'word':
        # tf-idf weighted bag of words
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train = tfidf_transformer.transform(X_train)
        X_test = tfidf_transformer.transform(X_test)
    elif sys.argv[7].lower() == 'topic':
        # 10 LDA topic proportions per document (fit on raw counts)
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train = lda_model.transform(X_train)
        X_test = lda_model.transform(X_test)
    elif sys.argv[7].lower() == 'style':
        # hand-crafted style features only
        X_train = csr_matrix(s_train)
        X_test = csr_matrix(s_test)
    elif sys.argv[7].lower() == 'all':
        # tf-idf + LDA topics + style features, horizontally stacked
        count_vect = CountVectorizer(stop_words=swlist, tokenizer=tkzr)
        count_vect.fit(X_train)
        X_train = count_vect.transform(X_train)
        X_test = count_vect.transform(X_test)
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer.fit(X_train)
        X_train_tf = tfidf_transformer.transform(X_train)
        X_test_tf = tfidf_transformer.transform(X_test)
        print(type(X_train_tf))
        # NOTE: LDA is fitted on the raw count matrix, not the tf-idf one
        lda_model = LatentDirichletAllocation(n_components=10)
        lda_model.fit(X_train)
        X_train_lda = lda_model.transform(X_train)
        X_test_lda = lda_model.transform(X_test)
        print(type(X_train_lda))
        X_train = csr_matrix(
            sparse.hstack(
                [X_train_tf, csr_matrix(X_train_lda), csr_matrix(s_train)]))
        X_test = csr_matrix(
            sparse.hstack(
                [X_test_tf, csr_matrix(X_test_lda), csr_matrix(s_test)]))
        print(type(X_train))
        # sparse.save_npz("X_train" + sys.argv[6] + ".npz", X_train)
        # sparse.save_npz("X_test" + sys.argv[6] + ".npz", X_test)
    else:
        sys.exit('unknown features')
    # One-hot encode the labels for categorical cross-entropy
    encoder = LabelBinarizer()
    encoder.fit(y_train)
    y_train = encoder.transform(y_train)
    y_test = encoder.transform(y_test)
    # np.save('X_train.npy', X_train)
    # np.save('X_test.npy', X_test)
    # np.save('y_train.npy', y_train)
    # np.save('y_test.npy', y_test)
    # sparse.save_npz("y_train" + sys.argv[6] + ".npz", y_train)
    # sparse.save_npz("y_test" + sys.argv[6] + ".npz", y_test)
    # load everything back
    # X_train = sparse.load_npz("X_train.npz")
    input_dim = X_train.shape[1]
    # Two 512-unit ReLU hidden layers with 30% dropout, softmax output
    model = Sequential()
    model.add(Dense(512, input_shape=(input_dim, )))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    # 5 epochs, holding out 10% of the training data for validation
    history = model.fit(X_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=5,
                        verbose=1,
                        validation_split=0.1)
    # model.model.save(sys.argv[6] + '.h5')
    # X_train = np.load('X_train.npy')
    # X_test = np.load('X_test.npy')
    # y_train = np.load('y_train.npy')
    # y_test = np.load('y_test.npy')
    # model = keras.models.load_model(sys.argv[6] + '.h5')
    score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
    print('Test accuracy:', score[1])
    y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
    predicted = np.argmax(y_pred, axis=1)
    # per-class precision / recall / F-score / support
    p, r, fs, s = precision_recall_fscore_support(np.argmax(y_test, axis=1),
                                                  predicted)
    print(p, r, fs, s)
# --- fragment: tail of an EarlyStopCallback method (the enclosing "def",
# presumably on_epoch_end, starts before this chunk) ---------------------
        if logs is None:
            logs = {}
        # Stop training once validation accuracy exceeds 98%.
        # NOTE(review): logs.get('val_accuracy') returns None when the key is
        # absent, making this comparison raise TypeError on Python 3 — confirm
        # the key is always present, or default it.
        if (logs.get('val_accuracy') > 0.98):
            # NOTE(review): "valudation" typo left untouched (runtime string)
            print("reach 98% accuracy in valudation")
            self.model.stop_training = True

# --- BMI demo script: normalise height/weight and one-hot the labels ----
escb1 = EarlyStopCallback()
csv = pd.read_csv('data/demo74_bmi.csv')
print(csv.shape)
print(csv.head(n=10))
# rough scaling into ~[0, 1]: heights /200 cm, weights /100 kg
csv['height'] = csv['height'] / 200
csv['weight'] = csv['weight'] / 100
encoder = LabelBinarizer()
transformedLabel = encoder.fit_transform(csv['label'])
print(csv['label'][:10])
print(transformedLabel[:10])
# first 90000 rows train, remainder test (positional split, no shuffle)
DATA_FOR_TEST = 90000
test_csv = csv[DATA_FOR_TEST:]
test_part = test_csv[['weight', 'height']]
test_answer = transformedLabel[DATA_FOR_TEST:]
train_csv = csv[:DATA_FOR_TEST]
train_part = train_csv[['weight', 'height']]
train_answer = transformedLabel[:DATA_FOR_TEST]
print(test_part.shape, train_part.shape, test_answer.shape,
      train_answer.shape)
# model definition continues past this chunk
model = Sequential()
model.add(Dense(10, activation='relu', input_shape=(2, )))
# Script fragment: load every image under DIRECTORY/<category>/ into memory,
# one-hot encode the category labels, and make a stratified train/test split.
data = []
labels = []
for category in CATEGORIES:
    path = os.path.join(DIRECTORY, category)
    for img in os.listdir(path):
        img_path = os.path.join(path, img)
        # 224x224 is the network's expected input size; preprocess_input
        # applies the matching model-family normalisation
        image = load_img(img_path, target_size=(224, 224))
        image = img_to_array(image)
        image = preprocess_input(image)
        data.append(image)
        labels.append(category)

# perform one-hot encoding on the labels
lb = LabelBinarizer()
labels = lb.fit_transform(labels)
# NOTE(review): with exactly two categories LabelBinarizer yields a single
# column and to_categorical expands it to two; with >2 categories this pair
# would double-encode — confirm CATEGORIES is binary.
labels = to_categorical(labels)
data = np.array(data, dtype="float32")
labels = np.array(labels)
# stratified 80/20 split, fixed seed for reproducibility
(trainX, testX, trainY, testY) = train_test_split(data,
                                                  labels,
                                                  test_size=0.20,
                                                  stratify=labels,
                                                  random_state=42)

# construct the training image generator for data augmentation
# (argument list continues past this chunk)
aug = ImageDataGenerator(rotation_range=20, zoom_range=0.15,
def main(argv=None):
    """Train a MiniVGGNet steering-direction classifier on driving-log images.

    Loads and preprocesses the dataset, splits train/validation, trains with
    light augmentation, prints a classification report, plots the loss and
    accuracy curves, and saves the trained model to ``options.model``.

    Args:
        argv: command-line argument list; defaults to ``sys.argv[1:]``.

    Returns:
        0 on success (suitable as a process exit code).
    """
    if argv is None:
        argv = sys.argv[1:]
    options = parse_args(argv)

    print("[INFO] loading images...")
    loader = SimpleDatasetLoader(preprocessors=[
        SimplePreprocessor(width=img_cols, height=img_rows),
        ImageToArrayPreprocessor(),
    ])
    data, labels = loader.load(
        driving_log_path=options.driving_log,
        data_path=options.dataset,
        verbose=True,
    )
    data = data.astype('float32')
    # FIX: removed a leftover "import ipdb; ipdb.set_trace()" debugger
    # breakpoint here — it suspended every (possibly non-interactive) run.

    # # horizontal reflection for augmentation
    # data = np.append(data, data[:, :, ::-1], axis=0)
    # labels = np.append(labels, -labels, axis=0)

    # split train and validation (shuffle first; fixed seeds for repeatability)
    data, labels = shuffle(data, labels)
    x_train, x_test, y_train, y_test = train_test_split(
        data,
        labels,
        random_state=13,
        test_size=0.1,
    )
    # x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    # x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)

    # one-hot encode the three steering classes; fit on train only
    lb = LabelBinarizer()
    y_train = lb.fit_transform(y_train)
    y_test = lb.transform(y_test)
    label_names = ['straight', 'left', 'right']

    # mild augmentation — steering labels are direction-sensitive, so no
    # horizontal flipping
    aug = ImageDataGenerator(
        rotation_range=1,
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.2,
        horizontal_flip=False,
        fill_mode="nearest",
    )

    print('[INFO] compiling model...')
    # model = NvidiaNet.build(width=img_cols, height=img_rows, depth=1)
    # model = TinyNet.build(width=img_cols, height=img_rows, depth=1)
    # model = ShallowNet.build(width=img_cols, height=img_rows, depth=1, classes=len(label_names))
    model = MiniVGGNet.build(width=img_cols,
                             height=img_rows,
                             depth=1,
                             classes=len(label_names))
    # SGD with Nesterov momentum and linear LR decay over the training run
    opt = SGD(lr=learning_rate,
              momentum=0.9,
              decay=learning_rate / nb_epoch,
              nesterov=True)
    model.compile(
        loss='categorical_crossentropy',
        metrics=['accuracy'],
        optimizer=opt,
    )
    history = model.fit_generator(
        aug.flow(x_train, y_train, batch_size=batch_size),
        nb_epoch=nb_epoch,
        steps_per_epoch=(len(x_train) // batch_size),
        verbose=1,
        validation_data=(x_test, y_test),
    )

    # per-class precision/recall/F1 on the held-out split
    predictions = model.predict(x_test, batch_size=batch_size)
    print(
        classification_report(
            y_test.argmax(axis=1),
            predictions.argmax(axis=1),
            target_names=label_names,
        ))

    # loss (left axis) and accuracy (right axis) on one figure
    plt.style.use("ggplot")
    fig, ax_acc = plt.subplots(1, 1)
    ax_acc.set_xlabel("Epoch #")
    ax_loss = ax_acc.twinx()
    ax_loss.grid(None)
    ax_loss.set_ylabel("Loss")
    ax_acc.grid(None)
    ax_acc.set_ylabel("Accuracy")
    ax_acc.set_ylim([0, 1])
    ax_loss.plot(np.arange(0, nb_epoch),
                 history.history["loss"],
                 label="train_loss")
    ax_loss.plot(np.arange(0, nb_epoch),
                 history.history["val_loss"],
                 label="val_loss")
    # NOTE(review): "acc"/"val_acc" are the old Keras 1.x/2.x history keys;
    # newer Keras uses "accuracy"/"val_accuracy" — matches the fit_generator
    # API era used above.
    ax_acc.plot(np.arange(0, nb_epoch),
                history.history["acc"],
                label="train_acc")
    ax_acc.plot(np.arange(0, nb_epoch),
                history.history["val_acc"],
                label="val_acc")
    fig.suptitle("Training Loss and Accuracy")
    fig.legend()
    plt.show()

    model.save(options.model)
    return 0
def load_encoder(data, column="ocean_proximity"):
    """One-hot encode ``data[column]`` with a freshly fitted LabelBinarizer.

    Args:
        data: mapping/frame supporting column access via ``data[column]``.
        column: name of the categorical column to encode.

    Returns:
        The binarized (one-hot) array; the fitted encoder is discarded.
    """
    # NOTE(review): a new encoder is fitted on every call, so the output
    # column order depends on the categories present in *data* — callers
    # should not rely on a fixed layout across datasets.
    binarizer = LabelBinarizer()
    return binarizer.fit_transform(data[column])