def run(self):
    self._is_running = True
    # TODO: sanitize classlist - currently, erroneous (possibly dangerous) classnames will reach the SQL request untouched
    if not path.exists('./instances/'):
        makedirs('./instances/')
    workdir = './instances/{}/'.format(self.instanceName)
    if not path.exists(workdir):
        makedirs(workdir)

    dataset_creator = DatasetCreator(
        self.db.cursor(),
        workdir=workdir,
        class_config=self.trainingargs['class_config'])
    dataset_dataframe = dataset_creator.get_dataset_dataframe()
    self.trainingargs['class_config'] = dataset_dataframe['label'].unique()

    checkpointarg = self.trainingargs['checkpoint_name']
    checkpointarg = './instances/{}/out/'.format(checkpointarg) if checkpointarg else None
    if checkpointarg and not path.exists(checkpointarg):
        checkpointarg = None

    instance = TrainingInstance(
        dataset_dataframe=dataset_dataframe,
        checkpoint_path=checkpointarg)

    # start training
    model, history = instance.train()
    self.insertModel(history, self.instanceName)
    TrainingThread.save_model(model, output_directory=workdir)
    logging.info('Training Complete for model {}'.format(self.instanceName))
    self._is_running = False
    exit()
class TestDatasetCreator(unittest.TestCase):

    def setUp(self):
        self.imageSource = Mock()
        self.datasetSplitter = Mock()
        self.target = DatasetCreator(self.imageSource, self.datasetSplitter)

    def test_ReadAndSplit(self):
        images = Mock()
        dataset = Mock()
        self.imageSource.load.return_value = images
        self.datasetSplitter.split.return_value = dataset

        result = self.target.buildDataset(datasetSplitIn=[0.6, 0.2, 0.2])

        self.imageSource.load.assert_called_with()
        self.datasetSplitter.split.assert_called_with(images, [0.6, 0.2, 0.2])
        self.assertEqual(dataset, result)

    def test_callsPreprocessorIfInformed(self):
        images = Mock()
        dataset = Mock()
        processedDataset = Mock()
        preprocessor = Mock()
        self.imageSource.load.return_value = images
        self.datasetSplitter.split.return_value = dataset
        preprocessor.process.return_value = processedDataset
        self.target = DatasetCreator(self.imageSource, self.datasetSplitter, preprocessor)

        result = self.target.buildDataset(datasetSplitIn=[0.6, 0.2, 0.2])

        self.imageSource.load.assert_called_with()
        self.datasetSplitter.split.assert_called_with(images, [0.6, 0.2, 0.2])
        preprocessor.process.assert_called_with(dataset)
        self.assertEqual(processedDataset, result)
def CreateDatasetFromRosbag(rosbagName, pickleName, isBebop=True, start_frame=None, end_frame=None):
    """Converts a rosbag to a format suitable for training/testing.

    If start_frame and end_frame are unknown, FrameSelector will help you choose how to trim the video.

    Parameters
    ----------
    rosbagName : str
        The file location of the rosbag
    pickleName : str
        Name of the new .pickle file
    isBebop : bool, optional
        True if you want an RGB dataset for the Bebop, False if you want a Himax-tailored dataset
    start_frame : int, optional
        If known, the timestamp in ns of the frame you wish to start from
    end_frame : int, optional
        If known, the timestamp in ns of the frame you wish to finish at
    """
    dc = DatasetCreator(rosbagName)
    if (start_frame is None) or (end_frame is None):
        start_frame, end_frame = dc.FrameSelector()

    if isBebop:
        dc.CreateBebopDataset(0, pickleName, start_frame, end_frame)
    else:
        dc.CreateHimaxDataset(config.himax_delay, pickleName, start_frame, end_frame)
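A minimal usage sketch for CreateDatasetFromRosbag, assuming the config module and the Hand data layout used elsewhere in this file; the subject name and paths are illustrative, not part of the original code:

# Hypothetical driver; subject name and paths are illustrative assumptions.
subject_name = "davide1"
rosbagName = config.folder_path + "/data/Hand/" + subject_name + ".bag"
pickleName = config.folder_path + "/data/Hand/" + subject_name + "Hand.pickle"

# With both frame bounds left as None, FrameSelector is invoked to pick the trim points interactively.
CreateDatasetFromRosbag(rosbagName, pickleName, isBebop=True, start_frame=None, end_frame=None)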
def CreatePickle(subject_name, rosbagfolder, delay=0):
    rosbagName = rosbagfolder + subject_name + ".bag"
    pickleName = config.folder_path + "/../Pickles/16_4_2020/" + subject_name + ".pickle"
    dc = DatasetCreator(rosbagName)
    dc.CreateHimaxDataset(delay, pickleName, "/image_raw", "optitrack/gapuino", ["optitrack/head"])
def ProcessAllFilesInFolder(rosbag_folder, pickle_folder):
    files = os.listdir(rosbag_folder)
    for f in files:
        # print(f)
        dc = DatasetCreator(rosbag_folder + f)
        pickleName = pickle_folder + os.path.splitext(os.path.basename(f))[0] + ".pickle"
        # print(pickleName)
        dc.CreateHimaxDataset(0, pickleName, "/image_raw", "optitrack/gapuino", ["optitrack/head"])
def main():
    # subject_name = "davide1"
    # rosbagName = config.folder_path + "/data/Hand/" + subject_name + ".bag"
    # pickleName = config.folder_path + "/data/Hand/" + subject_name + "Hand.pickle"
    # CreateDatasetFromRosbag(rosbagName, pickleName, isBebop=True, start_frame=None, end_frame=None)

    subject_name = "session3"
    rosbagName = config.folder_path + "/data/compressed/" + subject_name + ".bag"
    pickleName = config.folder_path + "/data/compressed/" + subject_name + ".pickle"
    # CreateDatasetFromDarioRosbag(rosbagName, pickleName, start_frame=None, end_frame=None)

    dc = DatasetCreator(rosbagName)
    # dc.CreateBebopDataset(0, pickleName, "bebop/image_raw/compressed", "optitrack/drone", ["optitrack/head", "optitrack/hand"])
    dc.CreateHimaxDataset(config.himax_delay, pickleName, "himax_camera", "bebop/image_raw/compressed",
                          "optitrack/drone", ["optitrack/head", "optitrack/hand"])
def training_and_classification_with_kfold_cross_validation(collection_name, k):
    '''Training and classification of an autotagger using k-fold cross validation'''
    _split_metadata_and_features(collection_name, k)
    for i in range(1, k + 1):
        # Create a gaia dataset with the training set
        print "----------------------- DATASET CREATION (FOLD %d)-----------------------" % i
        training_features = 'train/%s_features__fold%d.tsv' % (collection_name, i)
        chunk_size = 5000
        dataset_suffix = "fold%d" % i
        replace_dataset = True
        dataset_creator = DatasetCreator(collection_name)
        dataset_creator.create(training_features, chunk_size, dataset_suffix, replace_dataset)

        # Feature selection over the gaia dataset
        print "----------------------- FEATURE SELECTION (FOLD %d)-----------------------" % i
        dataset = 'dbs/%s__fold%d.db' % (collection_name, i)
        pca_covered_variance = 75
        include_highlevel = True
        feature_selector = FeatureSelector()
        feature_selector.select(dataset, pca_covered_variance, include_highlevel)

        # Autotag a given test set
        print "----------------------- AUTOTAGGING (FOLD %d)-----------------------" % i
        dataset = 'transformed_dbs/%s__fold%d.db' % (collection_name, i)
        training_metadata = 'train/%s_metadata__fold%d.tsv' % (collection_name, i)
        test_features = 'test/%s_features__fold%d.tsv' % (collection_name, i)
        output_binary = 'test/%s_output_binary__fold%d.tsv' % (collection_name, i)
        output_affinity = 'test/%s_output_affinity__fold%d.tsv' % (collection_name, i)
        metric = 'LC'
        num_sim = 18
        threshold = 0.2
        autotagger = Autotagger()
        autotagger.train(dataset, training_metadata)
        autotagger.classify(test_features, output_binary, metric, num_sim, threshold, ranked=False)
        autotagger.classify(test_features, output_affinity, metric, num_sim, threshold, ranked=True)
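A hedged invocation example; the collection name is illustrative and it assumes the per-fold train/ and test/ files produced by _split_metadata_and_features are in place:

# Hypothetical call: 5-fold cross validation over a collection named 'my_collection'
training_and_classification_with_kfold_cross_validation('my_collection', 5)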
def CreateDatasetFromDarioRosbag(rosbagName, pickleName, start_frame=None, end_frame=None):
    """Converts Dario's rosbag to a format suitable for training/testing.

    If start_frame and end_frame are unknown, FrameSelector will help you choose how to trim the video.

    Parameters
    ----------
    rosbagName : str
        The file location of the rosbag
    pickleName : str
        Name of the new .pickle file
    start_frame : int, optional
        If known, the timestamp in ns of the frame you wish to start from
    end_frame : int, optional
        If known, the timestamp in ns of the frame you wish to finish at
    """
    dc = DatasetCreator(rosbagName)
    if (start_frame is None) or (end_frame is None):
        start_frame, end_frame = dc.FrameSelector(True)

    dc.CreateBebopDarioDataset(0, pickleName, start_frame, end_frame)
def TestDatasetCreator():
    subject_name = "davide1"
    dc = DatasetCreator(config.folder_path + "/data/Hand/" + subject_name + ".bag")
    start_frame, end_frame = dc.FrameSelector()
    dc.CreateBebopDataset(
        0, config.folder_path + "/data/Hand/" + subject_name + "Hand.pickle",
        start_frame, end_frame)

    subject_name = "davide2"
    dc2 = DatasetCreator(config.folder_path + "/data/Hand/" + subject_name + ".bag")
    start_frame, end_frame = dc2.FrameSelector()
    dc2.CreateBebopDataset(
        0, config.folder_path + "/data/Hand/" + subject_name + "Hand.pickle",
        start_frame, end_frame)

    folderPath = config.folder_path + "/data/Hand/"
    fileList = ["davide1Hand.pickle", "davide2Hand.pickle"]
    DatasetCreator.JoinPickleFiles(
        fileList, config.folder_path + "/data/Hand/DavideHand.pickle", folderPath)
calibOutputDir = join(
    calibOutputBaseDir,
    'NFilt_{}_weights_{}_DDW_{}_{}_regFactor_{}'.format(
        filtsize, lossWeights, DDx_new, loss_function, regFactor))
mkdir(calibOutputDir)
calibOutputPath = join(calibOutputDir, 'outputFilters.rawImage')

if use_tfrecords:
    trainFilePaths = recordhandler.ConvertDatabaseToTFRecords(
        trainPath, join(trainPath, 'tfrecords'), maxExamples=maxNExamples)
    validFilePaths = recordhandler.ConvertDatabaseToTFRecords(
        validPath, join(validPath, 'tfrecords'), maxExamples=maxNExamples)
else:
    # get train database
    myCreator = DatasetCreator(trainPath, NCube=NCube, NDD=NDD, maxNExamples=maxNExamples)
    myCreator.cropDDWidth(DDx_new)
    train_database = myCreator.getDataset()

    # get validation database
    myCreator = DatasetCreator(validPath, NCube=NCube, NDD=NDD, maxNExamples=maxNExamples)
    myCreator.cropDDWidth(DDx_new)
    valid_database = myCreator.getDataset()

    assert (train_database['Cubes'].shape[0] % batchSize == 0)
    assert (valid_database['Cubes'].shape[0] % batchSize == 0)
def CreateWithSampleGroupingSplitter(imageSource, preprocessor=None, GetSampleNumberFunction=None):
    if GetSampleNumberFunction is None:
        GetSampleNumberFunction = GetSampleNumberFromFilename()
    return DatasetCreator(imageSource=imageSource,
                          datasetSplitter=SampleGroupingDatasetSplitter(GetSampleNumberFunction),
                          preprocessor=preprocessor)
def CreateWithFileGroupingSplitter(imageSource, numFilePerImage, preprocessor=None):
    return DatasetCreator(imageSource=imageSource,
                          datasetSplitter=FileGroupingDatasetSplitter(numFilePerImage),
                          preprocessor=preprocessor)
def CreateWithPredicateSplitter(imageSource, imgNumbersInValid, imgNumbersInTest, preprocessor=None):
    return DatasetCreator(imageSource=imageSource,
                          datasetSplitter=PredicateDatasetSplitter(
                              shouldBeInValid=FileNumberRegexMatcher(imgNumbersInValid),
                              shouldBeInTest=FileNumberRegexMatcher(imgNumbersInTest)),
                          preprocessor=preprocessor)
def CreateWithSplitter(imageSource, datasetSplitter, preprocessor=None):
    return DatasetCreator(imageSource=imageSource,
                          datasetSplitter=datasetSplitter,
                          preprocessor=preprocessor)
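A brief wiring sketch for these factory helpers, assuming an imageSource exposing load() and a preprocessor exposing process() as exercised by the unit tests above; my_image_source and my_preprocessor are placeholders, not concrete classes from this codebase:

# Hypothetical usage; the two objects stand in for real ImageSource / Preprocessor implementations.
creator = CreateWithFileGroupingSplitter(my_image_source, numFilePerImage=3, preprocessor=my_preprocessor)
dataset = creator.buildDataset(datasetSplitIn=[0.6, 0.2, 0.2])  # train/valid/test fractions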
def JoinPickles(fileList, picklename):
    picklefolder = config.folder_path + "/../Pickles/16_4_2020/"
    # fileList = {"Clip1.pickle", "Clip2.pickle", "Clip3.pickle", "Clip4.pickle", "Clip5.pickle", "Clip6.pickle"}
    DatasetCreator.JoinPickleFiles(fileList, picklefolder + picklename, picklefolder)
def calibEstimatorSanityTest2_createData():
    logfiledir = _LOG_FILE_DIR
    validDir = join(logfiledir, 'Valid')
    trainDir = join(logfiledir, 'Train')

    # define sizes for tests:
    sysPaths = SystemSettings.getSystemPaths('Server')
    sysDims = SystemSettings.getSystemDimensions()
    NCube = sysDims.NCube      # Cube [y, x, lambda] image size
    NDD = list(sysDims.NDD)    # DD [y, x] image size
    NFilt = sysDims.NFilt      # number of coefficients to be estimated for each lambda filter
    DDx_new = sysDims.DDx_new  # the amount of data influenced by a filter of size 300
    NChannels = NCube[2]

    numTrainExamples = 1000
    numValidExamples = 200
    NCube_train = (numTrainExamples * NCube[0], 1, NCube[1], NCube[2])
    NCube_valid = (numValidExamples * NCube[0], 1, NCube[1], NCube[2])
    # Cube_train = np.random.standard_normal(NCube_train).astype(dtype=np.float32)
    # Cube_valid = np.random.standard_normal(NCube_valid).astype(dtype=np.float32)

    dataCreator = DatasetCreator(directory=sysPaths.trainPath, NCube=NCube, NDD=NDD)
    dataCreator.cropDDWidth(DDx_crop=DDx_new)
    train_dataset = dataCreator.getDataset()
    Cube_train = train_dataset['Cubes']
    # Cube_std = np.std(Cube_train)
    # Cube_train = Cube_train / Cube_std
    del train_dataset
    # print('calibEstimatorSanityTest2_createData: Cube: std: {}, mean: {}, min: {}, max: {}'.format(
    #     np.std(Cube_train), np.mean(Cube_train), np.min(Cube_train), np.max(Cube_train)))

    dataCreator = DatasetCreator(directory=sysPaths.validPath, NCube=NCube, NDD=NDD)
    dataCreator.cropDDWidth(DDx_crop=DDx_new)
    valid_dataset = dataCreator.getDataset()
    Cube_valid = valid_dataset['Cubes']
    # Cube_valid = Cube_valid / Cube_std
    del valid_dataset

    Filts_GT = np.squeeze(imhand.readImage(_FILTERS_GT_PATH))
    # crop Filts_GT to the shape of NFilt
    crop_remove_size = int((Filts_GT.shape[1] - NFilt) / 2)
    Filts_GT = Filts_GT[1:32, crop_remove_size:crop_remove_size + NFilt]
    # Filts_GT = np.random.normal(loc=0.0, scale=1.0, size=(31, 301)).astype(dtype=np.float32)
    print('calibEstimatorSanityTest2_createData: Filters size: ({}x{})'.format(
        Filts_GT.shape[0], Filts_GT.shape[1]))

    NDD[1] = DDx_new  # directly use DDx_new instead of the original size, which is too big
    DD_train = np.zeros((NCube_train[0], 1, NDD[1], 1), np.float32)
    DD_valid = np.zeros((NCube_valid[0], 1, NDD[1], 1), np.float32)

    # create the DD (Y) image:
    cEst = CalibEstimator(NX=NCube, NY=NDD, L=NChannels, NFilt=NFilt,
                          learningRate=0.01, batchSize=128, a0=Filts_GT)
    cEst.setModeEval()
    cEst.createNPArrayDatasets()
    cEst.buildModel()
    DD_train = cEst.eval(Xeval=Cube_train, Yeval=DD_train)
    DD_valid = cEst.eval(Xeval=Cube_valid, Yeval=DD_valid)
    cEst.resetModel()

    # save results:
    # filters:
    filters_str = join(logfiledir, 'filters_GT.rawImage')
    imhand.writeImage(Filts_GT, filters_str)

    # save training data:
    for ii in range(numTrainExamples):
        cube_str = join(trainDir, 'Img_{}_Cube.rawImage'.format(ii))
        DD_str = join(trainDir, 'Img_{}_DD.rawImage'.format(ii))
        imhand.writeImage(np.squeeze(Cube_train[ii * 256:(ii + 1) * 256, :, :, :]), cube_str)
        imhand.writeImage(np.squeeze(DD_train[ii * 256:(ii + 1) * 256, :]), DD_str)

    # save validation data:
    for ii in range(numValidExamples):
        cube_str = join(validDir, 'Img_{}_Cube.rawImage'.format(ii))
        DD_str = join(validDir, 'Img_{}_DD.rawImage'.format(ii))
        imhand.writeImage(np.squeeze(Cube_valid[ii * 256:(ii + 1) * 256, :, :, :]), cube_str)
        imhand.writeImage(np.squeeze(DD_valid[ii * 256:(ii + 1) * 256, :]), DD_str)
def calibEstimatorSanityTest2(subfold=None):
    logfiledir = _LOG_FILE_DIR
    validDir = join(logfiledir, 'Valid')
    trainDir = join(logfiledir, 'Train')

    # define sizes for tests:
    sysDims = SystemSettings.getSystemDimensions()
    NCube = sysDims.NCube      # Cube [y, x, lambda] image size
    NDD = list(sysDims.NDD)    # DD [y, x] image size
    NFilt = sysDims.NFilt      # number of coefficients to be estimated for each lambda filter
    DDx_new = sysDims.DDx_new  # the amount of data influenced by a filter of size 300
    NChannels = NCube[2]
    NDD[1] = DDx_new  # directly use DDx_new instead of the original size, which is too big

    # get train database
    myCreator = DatasetCreator(trainDir, NCube=NCube, NDD=NDD, maxNExamples=-1)
    train_database = myCreator.getDataset()

    # get validation database
    myCreator = DatasetCreator(validDir, NCube=NCube, NDD=NDD, maxNExamples=-1)
    valid_database = myCreator.getDataset()

    Filts_GT = imhand.readImage(join(logfiledir, 'filters_GT.rawImage'))

    train_dict = {
        'Xtrain': train_database['Cubes'],
        'Ytrain': train_database['DDs'],
        'Xvalid': valid_database['Cubes'],
        'Yvalid': valid_database['DDs']
    }

    Cube_train = train_dict['Xtrain']
    print('calibEstimatorSanityTest2: Cube: std: {}, mean: {}, min: {}, max: {}'.format(
        np.std(Cube_train), np.mean(Cube_train), np.min(Cube_train), np.max(Cube_train)))

    if subfold is None:
        outFold = logfiledir
    else:
        outFold = join(logfiledir, subfold)
        mkdir(outFold)

    # run a training network and check the output weights
    # estimate calibration:
    cEst = CalibEstimator(NX=NCube, NY=NDD, L=NChannels, NFilt=NFilt,
                          learningRate=0.01, batchSize=100, numEpochs=10,
                          logfiledir=outFold, optimizer='gd')
    cEst.createNPArrayDatasets()
    cEst.buildModel()
    cEst.train(DBtype='NPArray', DBargs=train_dict)
    Filts_Calib = cEst.getCalibratedWeights()
    imhand.writeImage(Filts_Calib, join(outFold, 'Filters_Calib.rawImage'))

    diff = np.squeeze(Filts_Calib) - np.squeeze(Filts_GT)
    maxAbsDiff = np.max(np.abs(diff))
    error = np.sum(np.square(diff)) / diff.size
    print('error norm: {}, max abs error: {}'.format(error, maxAbsDiff))
    cEst.resetModel()
# You should have received a copy of the GNU General Public License
# along with music-autotagging-msordo. If not, see <http://www.gnu.org/licenses/>.

# Written by Mohamed Sordo (@neomoha)
# Email: mohamed ^dot^ sordo ^at^ gmail ^dot^ com
# Website: http://msordo.weebly.com

import os, sys, argparse

from DatasetCreator import DatasetCreator

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Create a Gaia dataset given a list of feature files')
    parser.add_argument('collection_name', help='Name of the collection')
    parser.add_argument('--training-features', default=None,
                        help='A file containing paths to the features of the audios used for training (default="train/COLLECTIONNAME_features.tsv")')
    parser.add_argument('--chunk-size', type=int, default=5000,
                        help='The dataset will be created in chunks of N songs at a time (default=5000)')
    parser.add_argument('--dataset-suffix', default=None,
                        help='Suffix to add to the dataset filename (useful when doing k-fold cross validation, for example) (default=None)')
    parser.add_argument('-r', '--replace-dataset', help='Replace the old dataset (if it exists)', action="store_true")
    args = parser.parse_args()

    if args.training_features is None:
        args.training_features = "train/" + args.collection_name + "_features.tsv"
    if not os.path.exists(args.training_features):
        print "Training features file '%s' not found" % args.training_features
        sys.exit(-1)

    print args
    dataset_creator = DatasetCreator(args.collection_name)
    dataset_creator.create(args.training_features, args.chunk_size, args.dataset_suffix, args.replace_dataset)
def setUp(self):
    self.imageSource = Mock()
    self.datasetSplitter = Mock()
    self.target = DatasetCreator(self.imageSource, self.datasetSplitter)
def Create(imageSource, preprocessor=None):
    return DatasetCreator(imageSource=imageSource,
                          datasetSplitter=ClassBalancingDatasetSplitter(),
                          preprocessor=preprocessor)
def ConvertDatabaseToTFRecords(inFolder, outFolder, maxExamples=-1):
    print('ConvertDatabaseToTFRecords:')
    print('input folder: ' + inFolder)
    print('output folder: ' + outFolder)
    print('maxExamples: ' + str(maxExamples))

    # get the cube and dd file lists:
    sysdims = getSystemDimensions()
    if not isdir(outFolder):
        mkdir(outFolder)
    dataCreator = DatasetCreator(directory=inFolder, NCube=sysdims.NCube,
                                 NDD=sysdims.NDD, maxNExamples=maxExamples)
    CubeFiles, DDFiles, Filenames = dataCreator.getFileLists()

    # crop dd image indices:
    x_dd_start = int((sysdims.NDD[1] - sysdims.DDx_new) / 2)
    x_dd_end = x_dd_start + sysdims.DDx_new

    # initialize output
    outFiles = []
    outFilePath = join(outFolder, 'database_DDW{}.tfrecords'.format(sysdims.DDx_new))
    if isfile(outFilePath):
        # the records file already exists; return it without rewriting:
        return [outFilePath]

    writer = tf.python_io.TFRecordWriter(outFilePath)

    # iterate over all paths:
    for cubepath, ddpath, filename in zip(CubeFiles, DDFiles, Filenames):
        # read images:
        cubeim = imhand.readImage(cubepath)
        ddim = imhand.readImage(ddpath)[:, x_dd_start:x_dd_end, :]
        cubeheight = cubeim.shape[0]
        cubewidth = cubeim.shape[1]
        cubechannels = cubeim.shape[2]
        ddheight = ddim.shape[0]
        ddwidth = ddim.shape[1]

        for ii in range(cubeheight):
            # convert image stripes to strings:
            cube_raw = cubeim[ii, :, :].tostring()
            dd_raw = ddim[ii, :, :].tostring()
            # create a feature:
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'cubewidth': _int64_feature(cubewidth),
                    'cubechannels': _int64_feature(cubechannels),
                    'ddwidth': _int64_feature(ddwidth),
                    'Cube': _bytes_feature(cube_raw),
                    'DD': _bytes_feature(dd_raw)
                }))
            # write the serialized example to the file:
            writer.write(example.SerializeToString())

    # close file:
    writer.close()
    return [outFilePath]
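A read-back sketch for the records written above, assuming TensorFlow 1.x (matching the tf.python_io writer) and float32 pixel data; ReadFirstStripe is a hypothetical helper, not part of the original module:

import numpy as np
import tensorflow as tf

def ReadFirstStripe(tfrecordPath, dtype=np.float32):
    # Decode the first serialized example and recover the Cube/DD stripes written by ConvertDatabaseToTFRecords.
    # dtype is an assumption; adjust it if imhand.readImage produces a different pixel type.
    for record in tf.python_io.tf_record_iterator(tfrecordPath):
        example = tf.train.Example.FromString(record)
        feat = example.features.feature
        cubewidth = int(feat['cubewidth'].int64_list.value[0])
        cubechannels = int(feat['cubechannels'].int64_list.value[0])
        ddwidth = int(feat['ddwidth'].int64_list.value[0])
        cube = np.frombuffer(feat['Cube'].bytes_list.value[0], dtype=dtype).reshape(cubewidth, cubechannels)
        dd = np.frombuffer(feat['DD'].bytes_list.value[0], dtype=dtype).reshape(ddwidth, -1)
        return cube, dd
    return None, None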
parser.add_argument('--chunk-size', type=int, default=5000,
                    help='The dataset will be created in chunks of N songs at a time (default=5000)')
parser.add_argument('--dataset-suffix', default=None,
                    help='Suffix to add to the dataset filename (useful when doing k-fold cross validation, for example) (default=None)')
parser.add_argument('-r', '--replace-dataset', help='Replace the old dataset (if it exists)', action="store_true")
args = parser.parse_args()

if args.training_features is None:
    args.training_features = "train/" + args.collection_name + "_features.tsv"
if not os.path.exists(args.training_features):
    print "Training features file '%s' not found" % args.training_features
    sys.exit(-1)

print args
dataset_creator = DatasetCreator(args.collection_name)
dataset_creator.create(args.training_features, args.chunk_size, args.dataset_suffix, args.replace_dataset)