def make_hist_top1(self, hist_pattern):
    """Plot overlaid histograms of top-1 probabilities for known vs. unknown samples.

    hist_pattern: output image path; may contain up to 2 ``{}`` placeholders
        filled with the generation timestamp and the number of known classes.
    """
    # Data generators for both sample sets
    known_iter = cm.get_data_iterator(self.known_data_folder, self.target_size, is_categorical=True)
    print("self.unknown_data_folder: {}".format(self.unknown_data_folder))
    unknown_iter = cm.get_data_iterator(self.unknown_data_folder, self.target_size, is_categorical=False)

    # Top-1 probabilities for every sample, timed
    print("Getting predictions...")
    started = datetime.now()
    known_top1 = cm.get_preds_top1(self.model, known_iter)
    unknown_top1 = cm.get_preds_top1(self.model, unknown_iter)
    print("Predictions done in {} sec".format((datetime.now() - started).total_seconds()))

    # Nudge probabilities that are (numerically) equal to 1 slightly above 1
    # so they occupy their own histogram bin instead of merging with ~1.0 values.
    eps = 1e-7

    def _separate_ones(probs):
        return np.array([p + 0.01 if p > 1 - eps else p for p in probs])

    known_top1 = _separate_ones(known_top1)
    unknown_top1 = _separate_ones(unknown_top1)

    # 200-bin semi-transparent histograms, one per sample set
    plt.hist(known_top1, 200, alpha=0.5, label='known ({} samples)'.format(len(known_top1)))
    plt.hist(unknown_top1, 200, alpha=0.5, label='unknown ({} samples)'.format(len(unknown_top1)))
    plt.title("Top1 probability distribution for Known vs. Unknown")
    plt.legend(loc='upper right')
    # plt.show()

    hist_file = hist_pattern.format(datetime.now().strftime("%Y%m%d %H%M%S"), known_iter.num_classes)
    plt.savefig(hist_file)
    print("Hist at: {}".format(hist_file))
def calc_save_prelast_activations(self, prelast_activations_file_name):
    """Compute pre-last dense-layer activations for the known data set and pickle them.

    Saves the tuple ``(known_classes, known_activations)`` to
    ``prelast_activations_file_name`` with ``pickle``.
    """
    known_data_iterator = cm.get_data_iterator(self.known_data_folder, self.target_size, is_categorical=True)
    # NOTE(review): this iterator is currently unused (its consumer was disabled);
    # kept because cm.get_data_iterator may scan/validate the folder as a side
    # effect - confirm before removing.
    unknown_data_iterator = cm.get_data_iterator(self.unknown_data_folder, self.target_size, is_categorical=False)

    (known_classes, _, known_activations) = cm.get_prelast_dense_activations(
        self.model, known_data_iterator, is_categorical=True)

    # FIX: use a context manager so the file handle is closed even if
    # pickle.dump raises (the original open/close pair leaked it on error).
    with open(prelast_activations_file_name, 'wb') as act_file:
        pickle.dump((known_classes, known_activations), act_file)
    print("Results saved to file {}".format(prelast_activations_file_name))
    return
def calc_save_last_activations(self, last_activations_file_name):
    """Predict on known and unknown data sets and pickle both prediction arrays.

    Saves the tuple ``(known_preds, unknown_preds)`` to
    ``last_activations_file_name`` with ``pickle``.
    """
    # make sure model is loaded
    self.__load_model()
    known_data_iterator = cm.get_data_iterator(
        self.known_data_folder, self.target_size, is_categorical=True)
    unknown_data_iterator = cm.get_data_iterator(
        self.unknown_data_folder, self.target_size, is_categorical=False)

    now = datetime.now()
    known_preds = cm.get_preds(self.model, known_data_iterator)
    print("Got known predictions in {} sec".format((datetime.now() - now).total_seconds()))
    now = datetime.now()
    unknown_preds = cm.get_preds(self.model, unknown_data_iterator)
    print("Got unknown predictions in {} sec".format((datetime.now() - now).total_seconds()))

    # FIX: use a context manager so the file handle is closed even if
    # pickle.dump raises (the original open/close pair leaked it on error).
    with open(last_activations_file_name, 'wb') as act_file:
        pickle.dump((known_preds, unknown_preds), act_file)
    print("Results saved to file {}".format(last_activations_file_name))
    return
def __process_leaf_folder(self, meansigmas_dic, data_folder, distances_file_name, is_categorical):
    """Append per-(sample, hypothetical class) distance rows to a CSV file.

    For every sample in ``data_folder`` and every class in ``meansigmas_dic``,
    appends one row ``[is_selected, actual, euclidean, mahalanobis, cosine]``
    to ``distances_file_name``.

    meansigmas_dic: mapping class_id -> (mus, sigmas) of pre-last activations.
    is_categorical: True when folder labels are real known classes.

    NOTE(review): another method with this exact name is defined later in the
    class; in Python the later definition overrides this one - confirm which
    variant is actually intended to be live.
    """
    data_iterator = cm.get_data_iterator(data_folder, self.target_size, is_categorical=is_categorical)
    (actual, top1, prelast_activations) = cm.get_prelast_dense_activations(
        self.model, data_iterator, is_categorical=is_categorical)

    # FIX: enumerate instead of a manual counter, and a plain if-statement for
    # the progress print (original used a side-effecting conditional
    # expression: `print(...) if i % 100 == 0 else 0`).
    for i, (sample_actual, sample_top1, sample_prelast_activations) in enumerate(
            zip(actual, top1, prelast_activations)):
        # Hypothetically, customer chooses each possible product
        for chosen_id in range(len(meansigmas_dic)):
            (chosen_mus, chosen_sigmas) = meansigmas_dic[chosen_id]
            # Squared euclidean distance to the chosen class centroid
            dist = np.sum(np.square((sample_prelast_activations - chosen_mus)))
            # How many sigmas each dimension deviates from the mean
            # (epsilon keeps zero sigmas from causing division by zero)
            dist_mahalanobis = np.sum(
                np.square((sample_prelast_activations - chosen_mus) / (chosen_sigmas + 1e-7)))
            # cosine distance
            dist_cosine = scipy.spatial.distance.cosine(sample_prelast_activations, chosen_mus)
            # Append one result row per hypothetical class choice
            is_selected = chosen_id == sample_actual if is_categorical else 0
            df_distances = pd.DataFrame(data=[
                np.hstack([is_selected, sample_actual, dist, dist_mahalanobis, dist_cosine])
            ])
            df_distances.to_csv(distances_file_name, header=None, index=None, mode='a')
        if i % 100 == 0:
            print("Processed {} files".format(i))
def __process_leaf_folder(self, meansigmas_dic, known_or_unknown, data_folder, distances_file_name):
    """Append one distance row per sample (vs. its top-1 class centroid) to a CSV.

    For every sample in ``data_folder``, appends one row
    ``[known_or_unknown, top1_class, euclidean, mahalanobis, cosine]``
    to ``distances_file_name``, measuring distances to the centroid of the
    sample's own top-1 predicted class.

    meansigmas_dic: mapping class_id -> (mus, sigmas) of pre-last activations.
    known_or_unknown: tag written into each row (e.g. a known/unknown flag).

    NOTE(review): this overrides the earlier method of the same name in this
    class - confirm the earlier variant is intentionally dead code.
    """
    data_iterator = cm.get_data_iterator(data_folder, self.target_size, is_categorical=False)
    (_, top1, prelast_activations) = cm.get_prelast_dense_activations(
        self.model, data_iterator, is_categorical=False)

    # FIX: enumerate instead of a manual counter, and a plain if-statement for
    # the progress print (original used a side-effecting conditional
    # expression: `print(...) if i % 100 == 0 else 0`).
    for i, (sample_top1, sample_prelast_activations) in enumerate(zip(top1, prelast_activations)):
        (top1_mus, top1_sigmas) = meansigmas_dic[sample_top1]
        # Squared euclidean distance to the top-1 class centroid
        dist = np.sum(np.square((sample_prelast_activations - top1_mus)))
        # How many sigmas each dimension deviates from the mean
        # (epsilon keeps zero sigmas from causing division by zero)
        dist_mahalanobis = np.sum(
            np.square((sample_prelast_activations - top1_mus) / (top1_sigmas + 1e-7)))
        # cosine distance
        dist_cosine = scipy.spatial.distance.cosine(sample_prelast_activations, top1_mus)
        # Append one result row for this sample
        df_distances = pd.DataFrame(data=[
            np.hstack([known_or_unknown, sample_top1, dist, dist_mahalanobis, dist_cosine])
        ])
        df_distances.to_csv(distances_file_name, header=None, index=None, mode='a')
        if i % 100 == 0:
            print("Processed {} files".format(i))
def make_roc_top1(self, roc_file_pattern):
    """Build and save a known-vs-unknown ROC curve from top-1 probabilities.

    roc_file_pattern: output image path; may contain up to 2 ``{}``
        placeholders filled with the generation timestamp and the number of
        known classes.
    """
    # Data generators for both sample sets
    known_iter = cm.get_data_iterator(self.known_data_folder, self.target_size, is_categorical=True)
    unknown_iter = cm.get_data_iterator(self.unknown_data_folder, self.target_size, is_categorical=False)

    # Top-1 probabilities for every sample, timed
    print("Getting predictions...")
    started = datetime.now()
    preds_known = cm.get_preds_top1(self.model, known_iter)
    preds_unknown = cm.get_preds_top1(self.model, unknown_iter)
    print("Predictions done in {} sec".format((datetime.now() - started).total_seconds()))

    # Single score vector; known samples form the positive class (label 1)
    y_pred = np.concatenate((preds_known, preds_unknown))
    y_true = np.concatenate((np.ones(len(preds_known)), np.zeros(len(preds_unknown))))

    (fpr, tpr, thresholds) = roc_curve(y_score=y_pred, y_true=y_true)
    roc_auc = auc(fpr, tpr)

    # Scan every ROC threshold for the one yielding the best plain accuracy
    accuracy_scores = [
        accuracy_score(y_true, [1 if m > thresh else 0 for m in y_pred])
        for thresh in thresholds
    ]
    best_acc_ind = np.argmax(accuracy_scores)
    best_acc = accuracy_scores[best_acc_ind]
    print("Threshold to use = {}".format(thresholds[best_acc_ind]))

    # Draw the ROC curve and the chance diagonal
    plt.figure()
    plt.plot(fpr, tpr, color='green', lw=2, label='ROC AUC = %0.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=0.5, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")

    # Mark the best-accuracy operating point
    plt.plot(fpr[best_acc_ind], tpr[best_acc_ind], marker="s", color="red")
    plt.text(fpr[best_acc_ind] + 0.02, tpr[best_acc_ind] - 0.02,
             "Best accuracy: {:.1f}%".format(best_acc * 100))

    roc_file = roc_file_pattern.format(datetime.now().strftime("%Y%m%d %H%M%S"), known_iter.num_classes)
    plt.savefig(roc_file)
    print("ROC at: {}".format(roc_file))
    return
# Extract and persist last / pre-last activations for every
# (version, hierarchy level, extraction set) combination.
for version in Visible_versions:
    for hier in Hier_lvls:
        for the_set in Extract_sets:
            # Reload the classifier keyed by version + hierarchy level
            model_file_key = "v{}_Ind-{}".format(version, hier)
            model = load_model(os.path.join(models_path, tc.clsfs[model_file_key]))

            # Iterator over this combination's leaf data folder
            set_data_folder = os.path.join(
                data_folder, "v{}".format(version), "Ind-{}".format(hier), the_set)
            data_iter = cm.get_data_iterator(data_folder=set_data_folder,
                                             target_size=target_size,
                                             is_categorical=True,
                                             is_resnet=is_resnet)

            # Compute and persist last-layer activations
            calc_save_last_activations(
                model, data_iter, last_activations_filepattern.format(version, hier, the_set))

            # Compute and persist pre-last-layer activations
            calc_save_prelast_activations(
                model, data_iter, prelast_activations_filepattern.format(version, hier, the_set))
def make_conf_mat(
        self,
        conf_mat_pattern,  # confusion matrix output path; up to 2 {} placeholders: timestamp, #classes
        products_names_file  # NULLABLE; csv file w/o header of structure [name, barcode, ...]
):
    """Predict on ``self.data_folder`` and save an annotated confusion-matrix heatmap.

    Folder names are treated as barcodes; when ``products_names_file`` is
    given, barcodes are replaced with product names (truncated to 15 chars).
    """
    # Prepare data generator
    data_iterator = cm.get_data_iterator(self.data_folder, self.target_size, is_categorical=True)
    # Predict highest classes
    (y_pred, y_true) = cm.get_pred_actual_classes(self.model, data_iterator)

    # Get product names (folder names are barcodes)
    df_products = None
    if products_names_file is not None:
        df_products = pd.read_csv(products_names_file, header=None, dtype=str)

    # Replace barcodes with product names, if names passed
    prod_names = list(data_iterator.class_indices.keys())
    print("sample barcodes {} (tot: {})".format(prod_names[:2], len(prod_names)))
    if df_products is not None:
        # NOTE(review): raises IndexError if a barcode is absent from the
        # names file - confirm inputs are always complete.
        prod_names = [
            df_products.loc[df_products[1] == barcode, 0].values[0]
            for barcode in prod_names
        ]
        print("sample products {} (tot: {})".format(prod_names[:2], len(prod_names)))
    # Shorten to 15 characters
    prod_names = [prod[0:15] for prod in prod_names]

    # Result confusion matrix file
    conf_mat_file = conf_mat_pattern.format(
        datetime.now().strftime("%Y%m%d %H%M%S"), data_iterator.num_classes)

    # When 0 images of certain labels, add 1 manually to avoid badly formatted conf mat
    for lbl in range(len(prod_names)):
        if lbl not in y_true:
            y_true = np.append(y_true, lbl)
            y_pred = np.append(y_pred, lbl)

    # BUG FIX: the second figsize component was `int(len(prod_names)) / 15`
    # (int applied to len only, then float division) while the first was
    # `int(len(prod_names) / 15)`. Both dimensions now use the same integer
    # size, clamped to >= 1 so small class counts cannot produce a
    # zero-size figure.
    fig_side = max(1, int(len(prod_names) / 15))
    plt.figure(figsize=(fig_side, fig_side), dpi=80)
    conf_mat = confusion_matrix(y_true=y_true, y_pred=y_pred)
    print("Shape: {}".format(conf_mat.shape))
    ax = sns.heatmap(conf_mat, annot=True, cbar=False, annot_kws={'size': 5}, fmt='g')
    ax.set_xticks(np.arange(len(prod_names)) + 0.5)
    ax.set_yticks(np.arange(len(prod_names)) + 0.5)
    ax.set_yticklabels(prod_names, horizontalalignment='right', rotation=0, size=5)
    ax.set_xticklabels(prod_names, horizontalalignment='right', rotation=90, size=5)
    ax.set_xlabel("PREDICTED", weight="bold")
    ax.set_ylabel("ACTUAL", weight="bold")
    plt.tight_layout()
    plt.savefig(conf_mat_file)
    plt.close()
    print("Conf mat at: {}".format(conf_mat_file))