def load_info(self):
    '''
    Populate and return self.iterInfo.

    When self.iterInfoPath already exists on disk, the stored object is
    loaded from it. Otherwise a new IterInfo is built from the unlabeled
    folder, the unlabeled index path and the loop folder, the loop folder is
    created and the new object is written to self.iterInfoPath.

    NOTE(review): the source line was flattened; the folder creation and the
    save are assumed to belong to the "file missing" branch only - confirm.
    '''
    if not self.iterInfoPath.is_file():
        # Nothing persisted yet: start from a fresh state and store it
        self.iterInfo = IterInfo(self.unlabeledFolder,
                                 self.unlabeledIndexPath,
                                 self.loopFolder)
        dirs.create_folder(self.loopFolder)
        utils.save_pickle(self.iterInfo, self.iterInfoPath)
        return self.iterInfo

    # A previous run left its state on disk: reuse it
    self.iterInfo = utils.load_pickle(self.iterInfoPath)
    return self.iterInfo
def dataset_inference_unlabeled(dataset_path, data_transforms, model_path, save_path,
                                batch_size=64, force=False, seed=None, verbose=True):
    '''
    Perform inference on an unlabeled dataset, using a csv Index file as reference.

    dataset_path: path handed to IndexManager to load the index.
    data_transforms: transforms forwarded to _model_inference.
    model_path: model location forwarded to _model_inference.
    save_path: pickle file used both as cache and as output destination.
    batch_size: batch size forwarded to _model_inference.
    force: Boolean
        If force is False, search for an existing output file and use it, if it exists.
        If force is True or output file doesn't exist, compute dataset output and save to file.
        An existing output file with zero entries is treated as missing.
    seed: accepted for interface compatibility; not used inside this function.
    verbose: Boolean. If True, print progress information.

    Returns the outputs object, either loaded from save_path or freshly
    computed by _model_inference (and saved to save_path).
    '''
    if os.path.isfile(save_path) and not force:
        outputDf = utils.load_pickle(save_path)
        if len(outputDf) > 0:
            return outputDf

    unlabelIndex = IndexManager(dataset_path)

    # Drop duplicated files
    unlabelIndex.index = dutils.remove_duplicates(unlabelIndex.index, "FrameHash")

    # Drop missing or corrupt images
    unlabelIndex.index = dutils.check_df_files(unlabelIndex.index, utils.check_empty_file, "FramePath")

    imagePathList = unlabelIndex.index["FramePath"].values
    datasetLen = len(imagePathList)

    if verbose:
        print("\nUnlabeled set inference")
        print("\nDataset information: ")
        print("\t", datasetLen, "images.")

    # Label list for an unlabeled dataset (bit of a hack? is there a better way?)
    # _model_inference expects one label per image, so placeholder zeros are passed.
    labelList = np.zeros(datasetLen)

    outputDf = _model_inference(imagePathList, data_transforms, labelList, model_path, batch_size)

    ## Save output to pickle file
    if verbose:
        print("\nSaving outputs file to ", save_path)
    outputDf.to_pickle(save_path)

    # Return the fresh outputs so both code paths (cached / computed) hand the
    # caller the same kind of object, matching dataset_inference_val.
    return outputDf
def dataset_inference_val(dataset_path, data_transforms, model_path, save_path,
                          batch_size=64, force=False, seed=None, verbose=True):
    '''
    Run the model over the validation image folder and store the outputs.

    dataset_path: root folder read through datasets.ImageFolder.
    data_transforms: transforms given to ImageFolder and to _model_inference.
    model_path: model location forwarded to _model_inference.
    save_path: pickle file used both as cache and as output destination.
    batch_size: batch size forwarded to _model_inference.
    force: Boolean
        When False and save_path already holds a non-empty result, that
        result is returned and no inference is run. When True, or when no
        usable file exists, outputs are computed and written to save_path.
    seed: accepted for interface compatibility; not used inside this function.
    verbose: Boolean. If True, print dataset and progress information.

    Returns the outputs object (cached or freshly computed).
    '''
    cacheUsable = os.path.isfile(save_path) and not force
    if cacheUsable:
        cachedDf = utils.load_pickle(save_path)
        if len(cachedDf) > 0:
            return cachedDf

    # Collect image paths and class labels from the dataset folder
    valDataset = datasets.ImageFolder(str(dataset_path),
                                      transform=data_transforms,
                                      is_valid_file=utils.check_empty_file)
    samples = valDataset.imgs
    sampleCount = len(samples)
    labelList = valDataset.targets
    # Each entry of imgs is a (path, class index) pair; keep the path column
    imagePathList = np.array(samples)[:, 0]

    if verbose:
        print("Validation set inference.")
        print("\nDataset information: ")
        print("\t", sampleCount, "images.")
        print("\nClasses: ")
        for className, classIdx in valDataset.class_to_idx.items():
            print("\t{}: {}".format(classIdx, className))

    resultDf = _model_inference(imagePathList, data_transforms, labelList, model_path, batch_size)

    ## Save output to pickle file
    if verbose:
        print("\nSaving outputs file to ", save_path)
    resultDf.to_pickle(save_path)
    return resultDf
def bow_matrix(train_text, test_text, max_features, load_path=None, save_path=None):
    """Build bag-of-words count matrices for the train and test documents.

    The vectorizer uses an identity preprocessor and an identity tokenizer,
    so each document in train_text / test_text is consumed exactly as given
    (i.e. it is expected to already be a sequence of tokens).

    Args:
        train_text: iterable of training documents.
        test_text: iterable of test documents.
        max_features: forwarded to CountVectorizer(max_features=...).
        load_path: if truthy, a stored vocabulary is loaded from this path
            and used instead of fitting on train_text.
        save_path: if truthy, the vocabulary in use is saved to this path.

    Returns:
        (new_train_df, new_test_df, vocabulary): two DataFrames of token
        counts whose columns are the vocabulary terms ordered by column
        index, and the term -> column index mapping.
    """
    vectorizer = CountVectorizer(max_features=max_features,
                                 preprocessor=lambda x: x,
                                 tokenizer=lambda x: x)

    if load_path:
        # NOTE(review): load_pickle presumably unpickles the file; only point
        # load_path at vocabulary files from a trusted source.
        vectorizer.vocabulary_ = utils.load_pickle(load_path)
        features_train = vectorizer.transform(train_text).toarray()
    else:
        features_train = vectorizer.fit_transform(train_text).toarray()
    features_test = vectorizer.transform(test_text).toarray()

    # Bound on both the load and the fit paths so the return below never
    # depends on which branch ran.
    vocabulary = vectorizer.vocabulary_

    # Column names are the terms ordered by their column index. Deriving them
    # from the mapping avoids CountVectorizer.get_feature_names(), which was
    # removed in scikit-learn 1.2, while producing the same ordering.
    feature_names = sorted(vocabulary, key=vocabulary.get)

    new_train_df = pd.DataFrame(data=features_train, columns=feature_names)
    new_test_df = pd.DataFrame(data=features_test, columns=feature_names)

    if save_path:
        utils.save_pickle(vocabulary, save_path)

    return new_train_df, new_test_df, vocabulary
# Persist the index of unlabeled images that were not manually labeled
unlabelNoManualIndex.to_csv(unlabelNoManualPath, index=False)

# Inference is skipped whenever an outputs file is already present
print("\nSTEP: Perform inference on remaining unlabeled set.")
if fullOutputPath.is_file():
    print("Output file already exists: {}\nSkipping inference.".format(fullOutputPath))
else:
    mutils.dataset_inference_unlabeled(unlabelNoManualPath,
                                       dataTransforms['val'],
                                       modelPath,
                                       fullOutputPath,
                                       batch_size=inferBatchSize,
                                       seed=seed,
                                       verbose=True)

print("\nUsing thresholds:\nUpper: {:.4f}\nLower: {:.4f}".format(upperThresh, lowerThresh))

## Perform automatic labeling
print("\nSTEP: Automatic labeling.")
unlabeledNoManualIndex = pd.read_csv(unlabelNoManualPath)
# NOTE(review): pickleData is not read again in the visible code; kept because
# later parts of the script may rely on it - confirm before removing.
pickleData = utils.load_pickle(fullOutputPath)

outputs, imgHashes, _ = dutils.load_outputs_df(fullOutputPath)
# Only the first output column is used for thresholding
outputs = outputs[:, 0]

print("\nAutomatic labeling with upper positive ratio {:.1f}%:".format(upperThreshPercent*100))
autoIndex = dutils.automatic_labeling(outputs,
                                      imgHashes,
                                      unlabeledNoManualIndex,
                                      upperThresh,
                                      lowerThresh,
                                      rede,
                                      target_class=target_class)
autoIndex.to_csv(autoLabelIndexPath, index=False)

plot_outputs_histogram(outputs,
                       lower_thresh=lowerThresh,
                       upper_thresh=upperThresh,
                       title="Unlabeled Outputs Histogram",
                       save_path=unlabelHistogramPath,
                       log=True,
                       show=False)

## Merge labeled sets
print("\nMerge auto and manual labeled sets.")
/ "history_{}_no_finetune_{}_epochs_rede_{}_iteration_{}.pickle".format(datasetName, epochs, rede, iteration) resultsFolder = Path(dirs.results) / historyPath.stem nameEnd = "history_{}_epochs_rede_{}_iteration_{}.pdf".format(epochs, rede, iteration) lossName = "loss_" + nameEnd accName = "accuracy_" + nameEnd f1Name = "f1_" + nameEnd if not(historyPath.is_file()): print("History file does not exist.\nFile:\n", historyPath) print("\nExiting program.") exit() dirs.create_folder(resultsFolder) history = utils.load_pickle(historyPath) print(history.keys()) valLoss = history['loss-val'] trainLoss = history['loss-train'] trainAcc = history['acc-train'] valAcc = history['acc-val'] trainF1 = np.array((history['f1-train']))[:, 0] valF1 = np.array((history['f1-val']))[:, 0] plot_model_history([trainLoss, valLoss], data_labels=["Train Loss", "Val Loss"], xlabel="Epochs", ylabel="Loss", title="Training loss history", save_path=resultsFolder / lossName, show=False) plot_model_history([trainAcc, valAcc], data_labels=["Train Acc", "Val Acc"], xlabel="Epochs", ylabel="Acc", title="Training accuracy history", save_path=resultsFolder / accName,
fileLen = entryDf.shape[0] entryDf['Class'] = entryDf['FramePath'].apply(get_class) entryDf['Rede'] = [rede]*fileLen entryDf['Validation'] = [val_type]*fileLen entryDf['Dataset'] = [net_type]*fileLen entryDf['Set'] = entryDf['FramePath'].apply(get_set) print(entryDf.groupby('Class').count()) if allDatasets is None: allDatasets = entryDf else: allDatasets = pd.concat([allDatasets, entryDf], ignore_index=True) utils.save_pickle(allDatasets, dfPath) else: allDatasets = utils.load_pickle(dfPath) print(allDatasets.groupby('Rede').count()) print() print(allDatasets.groupby('Dataset').count()) print(allDatasets.groupby('Set').count()) targetNet = 'reference' tablePath = Path(dirs.results) / 'dataset_counts_sets_{}.xlsx'.format(targetNet) index = allDatasets['Dataset'] == targetNet view = allDatasets.loc[index, :] index = allDatasets['Rede'] == 1 view = allDatasets.loc[index, :] index = allDatasets['Validation'] == 'ref'
indexPath = Path(dirs.iter_folder) / \ "full_dataset_softmax/iteration_{}/unlabeled_images_iteration_{}.csv".format(iteration-1, iteration-1) savedModelsFolder = Path( dirs.saved_models) / "full_dataset_rede_{}_softmax/iteration_{}".format( rede, iteration) outputPath = savedModelsFolder / \ "outputs_full_dataset_{}_epochs_rede_{}_iteration_{}.pickle".format(epochs, rede, iteration) newIndexPath = Path(dirs.iter_folder) / \ "full_dataset/iteration_{}/automatic_labeled_images_iteration_{}.csv".format(iteration, iteration) idealUpperThresh = 0.8923 # Ratio 99% idealLowerThresh = 0.0904 # Ratio 1% indexDf = pd.read_csv(indexPath) pickleData = utils.load_pickle(outputPath) indexDf = dutils.remove_duplicates(indexDf, "FrameHash") outputs, imgHashes, _ = dutils.load_outputs_df(outputPath) outputs = outputs[:, 0] indexDf.set_index("FrameHash", drop=False, inplace=True) print("\nAutomatic labeling with upper positive ratio 99%:") posHashes, negHashes = dutils.automatic_labeling(outputs, imgHashes, idealUpperThresh, idealLowerThresh) newLabeledIndex = dutils.get_classified_index(indexDf, posHashes,