def get_conf_mat(set_name):
    conf_mat_filename = "temp/conf_mat_{}.h5".format(set_name)
    if not os.path.exists(conf_mat_filename):
        # Prepare data generator
        data_folder = os.path.join(Glb.images_folder, "Bal_v14", "Ind-0", set_name)
        data_iterator = Glb_Iterators.get_iterator(data_folder, div255_resnet="div255", shuffle=False)

        # Load model
        model_filename = "model_clsf_from_isVisible_20210415_gpu1.h5"
        print("Loading model {}".format(model_filename))
        now = time.time()
        model = load_model(os.path.join(Glb.results_folder, model_filename))  # 83% test accuracy
        print("Loaded in {} sec".format(time.time() - now))

        # Predict highest classes and get conf_mat
        print("Predicting...")
        now = time.time()
        (y_pred, y_true) = cm.get_pred_actual_classes(model, data_iterator)
        del model
        print("Predicted in {} sec".format(time.time() - now))

        conf_mat = confusion_matrix(y_true=y_true, y_pred=y_pred)
        pickle.dump(conf_mat, open(conf_mat_filename, 'wb'))
        print("Saved conf mat {}".format(set_name))
    else:
        conf_mat = pickle.load(open(conf_mat_filename, 'rb'))
        print("Loaded conf mat {}".format(set_name))

    # Sanity check: should be ~83% (Test), ~49.8% (Val)
    acc = np.sum([conf_mat[i, i] for i in range(194)]) / np.sum(conf_mat)  # 194 = number of classes
    print("Acc: {}".format(acc))
    return conf_mat
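# Usage sketch (assumption: called from the same script where Glb, Glb_Iterators, cm and
# sklearn's confusion_matrix are already imported; set names follow the folder layout above):
#conf_mat_val = get_conf_mat("Val")
#conf_mat_test = get_conf_mat("Test")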
#set_name = "Test" #set_name = "Train" set_name = "Val" #dist_method = "manhattan" #dist_method = "euclidean" dist_method = "cosine" #dist_method = "rbf" #linkage_method='centroid' linkage_method='single' #linkage_method='complete' data_iterator = Glb_Iterators.get_iterator(os.path.join( r"C:\IsKnown_Images_IsVisible\Bal_v14\Ind-0", set_name), "div255", batch_size=batch_size) # Total number of images cnt_imgs = len(data_iterator.classes) cnt_classes = len(data_iterator.class_indices) act_filename = act_filename_pattern.format(set_name) #if not os.path.exists (act_filename): # # Allocate buffer for storing activations and labels # act_prelast = np.zeros ((cnt_imgs, prelast_output_shape), dtype=np.float32) # lbls = np.zeros ((cnt_imgs), dtype=np.int) # # cntr = 0 # now = datetime.now() # # # Save activations # for X,y in data_iterator:
    4: os.path.join(Glb.results_folder, "model_clsf_from_isVisible_20210614_gpu0_hier4.h5")
}

data_folders = {
    0: os.path.join(Glb.images_folder, "Bal_v14", "Ind-0", "Test"),
    1: os.path.join(Glb.images_folder, "Bal_v14", "Ind-1", "Test"),
    2: os.path.join(Glb.images_folder, "Bal_v14", "Ind-2", "Test"),
    3: r"D:\IsKnown_Images\Bal_102030_v14_Ind-3\Ind-3\Test",
    4: r"D:\IsKnown_Images\Bal_102030_v14_Ind-4\Ind-4\Test"
}

for hier_lvl in range(0, 1):
    # Prep data, model
    model = load_model(model_filenames[hier_lvl])
    data_folder = data_folders[hier_lvl]
    data_iterator = Glb_Iterators.get_iterator(data_folder, div255_resnet="div255", shuffle=False)

    # Predict
    preds = model.predict(data_iterator)
    pred_classes = np.argmax(preds, axis=1)
    actual_classes = data_iterator.classes

    # Accuracy, F-score
    acc = accuracy_score(y_true=actual_classes, y_pred=pred_classes)
    f1 = f1_score(y_true=actual_classes, y_pred=pred_classes, average="macro")
    print("Model Hier-{}. Acc={}, F1={}".format(hier_lvl, acc, f1))
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))


def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


hier_lvl = 0
set_name = "Test"
batch_size = 32
div255_resnet = "div255"

img_filepath = os.path.join(Glb.images_folder, "Bal_v14", "Ind-{}".format(hier_lvl), set_name)
data_iterator = Glb_Iterators.get_iterator(img_filepath, div255_resnet=div255_resnet, batch_size=batch_size)

# All file names to a list
allfiles_path = []
for barcode_path in os.listdir(img_filepath):
    allfiles_path += [
        os.path.join(img_filepath, barcode_path, filepath)
        for filepath in os.listdir(os.path.join(img_filepath, barcode_path))
    ]

now = time.time()
for i, (X, y) in enumerate(data_iterator):
    #print("batch {}/{}".format(i, len(data_iterator)))
    #if i+1 >= 0:  #len(data_iterator):
    if i + 1 >= len(data_iterator):
        break  # Keras iterators loop forever; stop after one full pass
df_prodnames = pd.read_csv("df_prods_194_translated.csv", header=0)["product"].tolist()
df_classes = pd.read_csv("df_prods_194_translated.csv", header=0)["class"].tolist()

model_filename = os.path.join(
    Glb.results_folder,
    "model_clsf_from_isVisible_20210415_gpu1.h5")  # 83% test accuracy, Hier-0
model = load_model(model_filename)

data_folder = os.path.join(Glb.images_folder, "Bal_v14", "Ind-{}".format(hier_lvl), set_name)
data_iterator = Glb_Iterators.get_iterator(data_folder, div255_resnet="div255",
                                           batch_size=350, target_size=256, shuffle=False)
total_classes = len(data_iterator.class_indices)
actual_classes = data_iterator.classes

now = time.time()
preds = model.predict(data_iterator, steps=len(data_iterator))
print("Predicted in {} sec".format(time.time() - now))
pred_classes = np.argmax(preds, axis=1)

# Sanity check: overall accuracy
acc = len(np.where(pred_classes == actual_classes)[0]) / len(actual_classes)
total_errors = len(np.where(pred_classes != actual_classes)[0])
print("{} accuracy: {}. Total errors: {}/{}".format(set_name, acc, total_errors,
                                                    len(actual_classes)))
def trainModel(epochs, bn_layers, dropout_layers, l2_layers, padding, target_size, dense_sizes,
               architecture, conv_layers_over_5, use_maxpool_after_conv_layers_after_5th,
               version, load_existing, gpu_id, model_filename, lc_filename, data_dir):
    # Trains a model
    #   epochs - number of max epochs to train (subject to early stopping)
    #   bn_layers - list of indexes of Dense layers (-1 and down) and CNN layers (1 and up) where Batch Norm should be applied
    #   dropout_layers - list of indexes of Dense layers (-1 and down) where Dropout should be applied
    #   l2_layers - list of indexes of Dense layers (-1 and down) where L2 regularization should be applied
    #   padding - changed to "same" to keep 2^n feature map sizes
    #   dense_sizes - dictionary of dense layer sizes (count of neurons)
    #   architecture - one of: Model_6classes_c4_d3_v1, Model_6classes_c5_d2_v1, Model_6classes_c5_d3_v1
    #   conv_layers_over_5 - number of convolutional layers after the 5th
    #   use_maxpool_after_conv_layers_after_5th - list of booleans: whether to use max pooling after each conv layer beyond the 5th
    #   version - used to name a learning curve file
    #   load_existing - whether to load an existing model file (model_filename) and keep training it
    #   gpu_id - id of the GPU used; logged to the metrics CSV
    #   model_filename - where the best model is saved (ModelCheckpoint) or loaded from
    #   lc_filename - CSV file for the learning curve (CSVLogger)
    #   data_dir - folder containing Train/Val/Test subfolders
    # Returns:
    #   model: trained Keras model
    #
    # To call:
    #   model = Train_v1.trainModel(epochs=20)

    crop_range = 1  # number of pixels to crop image (if size is 235, crops are 0-223, 1-224, ..., 11-234)
    #target_size = 224
    batch_size = 32
    #datasrc = "visible"

    # Manually copied to C: to speed up training
    #data_dir = os.path.join(Glb.images_folder, "Bal_v14", "Ind-{}".format(hier_lvl))
    data_dir_train = os.path.join(data_dir, "Train")
    data_dir_val = os.path.join(data_dir, "Val")
    data_dir_test = os.path.join(data_dir, "Test")

    train_iterator = Glb_Iterators.get_iterator(data_dir_train, "div255")
    val_iterator = Glb_Iterators.get_iterator(data_dir_val, "div255")
    test_iterator = Glb_Iterators.get_iterator(
        data_dir_test, "div255", shuffle=False)  # don't shuffle in order to get proper actual/prediction pairs

    Softmax_size = len(train_iterator.class_indices)
    dense_sizes["d-1"] = Softmax_size

    #model_filename = os.path.join(Glb.results_folder,
    #                              "model_clsf_from_isVisible_{}_gpu{}_hier{}.h5".format(date.today().strftime("%Y%m%d"), gpu_id, hier_lvl))
    #lc_filename = os.path.join(Glb.results_folder,
    #                           "lc_clsf_from_isVisible_{}_gpu{}_hier{}.csv".format(date.today().strftime("%Y%m%d"), gpu_id, hier_lvl))

    # Create or load model
    if not load_existing:
        print("Creating model")
        prepModel = modelVersions_dic[architecture]
        prep_model_params = {
            "input_shape": (target_size, target_size, 3),
            "bn_layers": bn_layers,
            "dropout_layers": dropout_layers,
            "l2_layers": l2_layers,
            "padding": padding,
            "dense_sizes": dense_sizes,
            "conv_layers_over_5": conv_layers_over_5,
            "use_maxpool_after_conv_layers_after_5th": use_maxpool_after_conv_layers_after_5th
        }
        model = prepModel(**prep_model_params)
    else:
        print("Loading model")
        #model_filename = r"J:\Visible_models\6class\model_6classes_v" + str(version) + ".h5"
        model = load_model(model_filename)

    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),  # default LR: 0.001
        metrics=['accuracy'])
    model.summary()

    callback_earlystop = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=10,
                                       verbose=1, mode='max', restore_best_weights=True)
    callback_csv_logger = CSVLogger(lc_filename, separator=",", append=False)
    mcp_save = ModelCheckpoint(model_filename, save_best_only=True, monitor='val_accuracy', mode='max')

    model.fit(train_iterator,
              steps_per_epoch=len(train_iterator),
              epochs=epochs,
              verbose=2,
              validation_data=val_iterator,
              validation_steps=len(val_iterator),
              callbacks=[callback_csv_logger, callback_earlystop, mcp_save])

    # Evaluation on test set (1 frame)
    test_metrics = model.evaluate(test_iterator)
    print("Test: {}".format(test_metrics))

    print("Evaluating F1 test set (1 frame)")
    y_pred = model.predict(test_iterator)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true = test_iterator.classes
    test_acc = accuracy_score(y_true=y_true, y_pred=y_pred_classes)
    test_f1 = f1_score(y_true=y_true, y_pred=y_pred_classes, average='macro')
    print("acc:{}, f1:{}".format(test_acc, test_f1))

    # Metrics to CSV
    df_metrics = pd.DataFrame(
        data={
            "gpu": [gpu_id],
            "datetime": [datetime.now().strftime("%Y%m%d %H:%M:%S")],
            "data_dir": [data_dir],
            "test_acc": [test_acc],
            "test_f1": [test_f1]
        })
    df_metrics_filename = os.path.join(Glb.results_folder, "metrics_mrg.csv")
    df_metrics.to_csv(df_metrics_filename, index=False, header=False, mode='a')

    #print("Evaluation on validation set (1 frame)")
    #val_metrics = model.evaluate(val_iterator)
    #print("Val: {}".format(val_metrics))

    return model
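# Usage sketch (assumption: illustrative hyperparameters only; architecture names come from
# modelVersions_dic, and file names follow the patterns commented out inside trainModel):
#model = trainModel(epochs=100, bn_layers=[1, 2, -2], dropout_layers=[-2], l2_layers=[],
#                   padding="same", target_size=256, dense_sizes={"d-2": 1024, "d-1": None},
#                   architecture="Model_6classes_c5_d3_v1", conv_layers_over_5=2,
#                   use_maxpool_after_conv_layers_after_5th=[True, True],
#                   version=1, load_existing=False, gpu_id=0,
#                   model_filename=os.path.join(Glb.results_folder, "model_clsf_example.h5"),
#                   lc_filename=os.path.join(Glb.results_folder, "lc_clsf_example.csv"),
#                   data_dir=os.path.join(Glb.images_folder, "Bal_v14", "Ind-0"))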
def put_prelast_act_to_file(model_filename, act_filename_pattern, hier_lvl, set_name, incl_filenames):
    model = load_model(model_filename)
    act_filename = act_filename_pattern.format(set_name, hier_lvl,
                                               "filenames" if incl_filenames else "nofilenames")

    # Data iterator
    batch_size = 128
    #set_name = "Test"
    #set_name = "Train"
    #set_name = "Val"
    #hier_lvl = 0
    #hier_lvl = 1
    #hier_lvl = 2
    #hier_lvl = 3
    #hier_lvl = 4

    # Which layer is needed?
    # model.summary()
    prelast_dense_layer = model.layers[-2]  #model.layers[dense_layer_ids[-2]]
    prelast_func_activation = function([model.input], [prelast_dense_layer.output])
    prelast_output_shape = prelast_dense_layer.output_shape[1]

    #data_iterator = Glb_Iterators.get_iterator(os.path.join(Glb.images_folder, "Bal_v14", "Ind-{}".format(hier_lvl), set_name), "div255", batch_size=batch_size)
    data_folder = os.path.join(Glb.images_folder, "Bal_v14", "Ind-{}".format(hier_lvl), set_name)
    print("Datafolder:{}".format(data_folder))
    if incl_filenames:
        data_iterator = Glb_Iterators.get_iterator_incl_filenames(
            data_folder=data_folder, batch_size=batch_size, target_size=256)
    else:
        data_iterator = Glb_Iterators.get_iterator(data_folder=data_folder, div255_resnet="div255",
                                                   batch_size=batch_size, target_size=256, shuffle=True)

    #cntr = 0
    now = datetime.now()
    all_filenames = []

    # Save activations
    #for X, y in data_iterator:
    for cntr, batch_tuple in enumerate(data_iterator):
        if incl_filenames:
            (X, y, filenames) = batch_tuple
        else:
            (X, y) = batch_tuple

        if cntr == 0:
            if incl_filenames:
                cnt_imgs = len(Glb_Iterators.all_filepaths)
            else:
                cnt_imgs = len(data_iterator.classes)
            # Allocate buffer for storing activations and labels
            act_prelast = np.zeros((cnt_imgs, prelast_output_shape), dtype=np.float32)
            lbls = np.zeros((cnt_imgs), dtype=np.int32)

        cnt_samples_in_batch = y.shape[0]
        #print("Batch {}/{}".format(cntr, len(data_iterator)))
        print("Batch {}/{}".format(cntr, Glb_Iterators.len_iterator if incl_filenames else len(data_iterator)))
        act_prelast[(cntr * batch_size):(cntr * batch_size + cnt_samples_in_batch), :] = prelast_func_activation([X])[0]
        lbls[(cntr * batch_size):(cntr * batch_size + cnt_samples_in_batch)] = np.argmax(y, axis=1)
        if incl_filenames:
            all_filenames += filenames

        if not incl_filenames and (cntr + 1) >= len(data_iterator):
            break

    print("Total seconds: {}".format((datetime.now() - now).seconds))

    if incl_filenames:
        pickle.dump((act_prelast, lbls, all_filenames), open(act_filename, 'wb'))
    else:
        pickle.dump((act_prelast, lbls), open(act_filename, 'wb'))
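# Usage sketch (assumption: the file name pattern below is illustrative only; the real pattern
# is supplied by the caller and must contain three placeholders for set name, hier level and the
# filenames/nofilenames suffix):
#put_prelast_act_to_file(
#    model_filename=os.path.join(Glb.results_folder, "model_clsf_from_isVisible_20210415_gpu1.h5"),
#    act_filename_pattern="temp/act_prelast_{}_hier{}_{}.h5",
#    hier_lvl=0, set_name="Val", incl_filenames=True)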