def processDatasetSplit(train_source_path, test_source_path, logger=None):
    """Preprocess two separate WAV directories into training and test datasets.

    Parameters
    ----------
    train_source_path : path of the training WAV data
    test_source_path  : path of the test WAV data
    logger            : optional logger; when None a module logger is used
                        (the original raised AttributeError on the default
                        ``logger=None`` because ``logger.info`` was called
                        unconditionally)

    Returns
    -------
    (X_training, y_training, valid_frames_training,
     X_test, y_test, valid_frames_test) as produced by
    ``preprocessWavs.preprocess_dataset`` for each directory.
    """
    import logging
    # Fall back to a real logger so the documented default (logger=None)
    # does not crash on the .info() calls below.
    if logger is None:
        logger = logging.getLogger(__name__)

    logger.info(' Training data: %s ', train_source_path)
    # nbMFCCs and debug_size are module-level settings defined elsewhere in this file.
    X_training, y_training, valid_frames_training = preprocessWavs.preprocess_dataset(
        source_path=train_source_path, logger=logger, nbMFCCs=nbMFCCs, debug=debug_size)

    logger.info(' Test data: %s', test_source_path)
    X_test, y_test, valid_frames_test = preprocessWavs.preprocess_dataset(
        source_path=test_source_path, logger=logger, nbMFCCs=nbMFCCs, debug=debug_size)

    return X_training, y_training, valid_frames_training, X_test, y_test, valid_frames_test
def processDataset(FRAC_TRAINING, data_source_path, logger=None):
    """Preprocess one WAV directory and randomly split it into train/test sets.

    Parameters
    ----------
    FRAC_TRAINING    : fraction (0..1) of the data kept for training
                       (TOTAL = TRAINING + TEST = TRAIN + VAL + TEST)
    data_source_path : path of the WAV data to preprocess
    logger           : optional logger; when None a module logger is used
                       (the original crashed on the default ``logger=None``)

    Returns
    -------
    (X_training, y_training, valid_frames_training,
     X_test, y_test, valid_frames_test)
    """
    import logging
    if logger is None:
        logger = logging.getLogger(__name__)

    logger.info(' Data: %s ', data_source_path)
    # nbMFCCs and debug_size are module-level settings defined elsewhere in this file.
    X_all, y_all, valid_frames_all = preprocessWavs.preprocess_dataset(
        source_path=data_source_path, nbMFCCs=nbMFCCs, logger=logger, debug=debug_size)
    assert len(X_all) == len(y_all) == len(valid_frames_all)
    logger.info(' Loading data complete.')

    logger.debug('Type and shape/len of X_all')
    logger.debug('type(X_all): {}'.format(type(X_all)))
    logger.debug('type(X_all[0]): {}'.format(type(X_all[0])))
    logger.debug('type(X_all[0][0]): {}'.format(type(X_all[0][0])))
    logger.debug('type(X_all[0][0][0]): {}'.format(type(X_all[0][0][0])))

    logger.info('Creating Validation index ...')
    total_size = len(X_all)
    total_training_size = int(math.ceil(FRAC_TRAINING * total_size))  # TRAINING = TRAIN + VAL
    test_size = total_size - total_training_size

    # BUG FIX: draw test indices from the WHOLE dataset. The original sampled
    # from range(0, total_training_size), so the tail of the dataset could
    # never appear in the test split, and random.sample raised ValueError
    # whenever test_size > total_training_size. random.sample over a range
    # already yields plain ints, so no int() conversion pass is needed.
    test_idx = random.sample(range(total_size), test_size)

    # ensure that the test set isn't empty when debugging with tiny datasets
    if DEBUG:
        test_idx[0] = 0
        test_idx[1] = 1

    # set gives O(1) membership tests in the split loop below (was O(m) per item)
    test_idx = set(test_idx)

    logger.info('Separating test and training set ...')
    X_training, y_training, valid_frames_training = [], [], []
    X_test, y_test, valid_frames_test = [], [], []
    for i in range(total_size):
        if i in test_idx:
            X_test.append(X_all[i])
            y_test.append(y_all[i])
            valid_frames_test.append(valid_frames_all[i])
        else:
            X_training.append(X_all[i])
            y_training.append(y_all[i])
            valid_frames_training.append(valid_frames_all[i])

    assert len(X_test) == test_size
    assert len(X_training) == total_training_size

    return X_training, y_training, valid_frames_training, X_test, y_test, valid_frames_test
def processDataset(FRAC_TRAINING, data_source_path, logger=None):
    """Preprocess a whole WAV directory and return it as a single (test) dataset.

    NOTE(review): this redefinition SHADOWS the train/test-splitting
    ``processDataset`` defined earlier in this file — only this version is
    reachable at runtime. One of the two should be renamed; not done here
    because that would break callers. ``FRAC_TRAINING`` is accepted only for
    signature compatibility and is unused in this version.

    Parameters
    ----------
    FRAC_TRAINING    : unused (kept for signature compatibility)
    data_source_path : path of the WAV data to preprocess
    logger           : optional logger; when None a module logger is used
                       (the original crashed on the default ``logger=None``)

    Returns
    -------
    (X_test, y_test, valid_frames_test)
    """
    import logging
    if logger is None:
        logger = logging.getLogger(__name__)

    logger.info(' Data: %s ', data_source_path)
    # nbMFCCs is a module-level setting; debug=None disables debug truncation here.
    X_test, y_test, valid_frames_test = preprocessWavs.preprocess_dataset(
        source_path=data_source_path, nbMFCCs=nbMFCCs, logger=logger, debug=None)
    assert len(X_test) == len(y_test) == len(valid_frames_test)
    logger.info(' Loading data complete.')

    logger.debug('Type and shape/len of X_test')
    logger.debug('type(X_test): {}'.format(type(X_test)))
    logger.debug('type(X_test[0]): {}'.format(type(X_test[0])))
    logger.debug('type(X_test[0][0]): {}'.format(type(X_test[0][0])))
    logger.debug('type(X_test[0][0][0]): {}'.format(type(X_test[0][0][0])))

    return X_test, y_test, valid_frames_test
def preprocessLabeledWavs(wavDir, store_dir, name):
    """Convert a directory of labeled WAVs (already fixed) into typed arrays.

    Parameters
    ----------
    wavDir    : directory of labeled WAV files to preprocess
    store_dir : accepted but not used in this function
    name      : accepted but not used in this function

    Returns
    -------
    (X, y, valid_frames) cast to float32 / int32 / int32 respectively.
    """
    # fixWavs is assumed to have run already; here we only convert to pkl form.
    # nbMFCCs and logger_evaluate are module-level names defined elsewhere in this file.
    features, labels, frames = preprocessWavs.preprocess_dataset(
        source_path=wavDir, nbMFCCs=nbMFCCs, logger=logger_evaluate)

    # Cast each component to the dtype expected downstream.
    features = preprocessWavs.set_type(features, 'float32')
    labels = preprocessWavs.set_type(labels, 'int32')
    frames = preprocessWavs.set_type(frames, 'int32')

    return features, labels, frames