def preprocess_datasets(start_i=0, overwrite=False, verbose=True):
    dataset_ids = get_downloaded_dataset_ids(Preprocess.RAW)
    num_datasets = len(dataset_ids)
    for i in range(start_i, num_datasets):
        dataset_id = dataset_ids[i]
        if verbose:
            print('Preprocessing {} of {} (dataset_id: {})'.format(
                i + 1, num_datasets, dataset_id))
        # Don't waste time reading the dataset if it's already been preprocessed
        if is_dataset_file(dataset_id, Preprocess.WHITENED) and not overwrite:
            if verbose:
                print('Already preprocessed, so skipping')
            continue
        d = read_dataset_and_log(dataset_id, Preprocess.RAW)
        if d is None:
            continue
        X, y, categorical = d['X'], d['y'], d['categorical']
        # Convert to ndarray if not already
        X = to_ndarray(X)
        write_preprocessed_dataset_dict(X, y, categorical, dataset_id,
                                        Preprocess.ONEHOT)
        write_preprocessed_dataset_dict(X, y, categorical, dataset_id,
                                        Preprocess.STANDARDIZED)
        write_preprocessed_dataset_dict(X, y, categorical, dataset_id,
                                        Preprocess.ROBUST)
        write_preprocessed_dataset_dict(X, y, categorical, dataset_id,
                                        Preprocess.WHITENED)
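# The Preprocess enum used above is defined elsewhere in the project. As a
# hedged, minimal sketch (an assumption, not the project's actual definition),
# it could look roughly like this: one member per on-disk preprocessing
# variant, with a string .value used in filenames and log messages.
from enum import Enum


class Preprocess(Enum):
    RAW = 'raw'
    ONEHOT = 'onehot'
    STANDARDIZED = 'standardized'
    ROBUST = 'robust'
    WHITENED = 'whitened'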
def read_raw_downloaded_datasets(start_i=0):
    dataset_ids = get_downloaded_dataset_ids()
    num_datasets = len(dataset_ids)
    for i in range(start_i, num_datasets):
        print('{} of {}'.format(i + 1, num_datasets), end='\t')
        dataset_id = dataset_ids[i]
        read_dataset_and_log(dataset_id, verbose=True)
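# read_dataset_and_log is assumed to come from the project's dataset I/O
# helpers. A minimal sketch of the contract the callers above rely on: return
# the dataset dict on success, print the failure and return None otherwise.
# read_dataset below is a placeholder name, not the project's real reader.
def read_dataset_and_log_sketch(dataset_id, preprocess=Preprocess.RAW, verbose=False):
    try:
        d = read_dataset(dataset_id, preprocess=preprocess)  # placeholder reader
        if verbose:
            print('Read dataset {} ({} rows)'.format(dataset_id, len(d['y'])))
        return d
    except Exception as e:
        print('Failed to read dataset {}: {}'.format(dataset_id, e))
        return None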
def purge_bad_datasets(start_i=0, redownload=True, verbose=True):
    dataset_ids = get_downloaded_dataset_ids()
    num_datasets = len(dataset_ids)
    for i in range(start_i, num_datasets):
        print('{} of {}'.format(i + 1, num_datasets), end='\t')
        dataset_id = dataset_ids[i]
        read_dataset_and_purge(dataset_id, redownload=redownload, verbose=verbose)
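# read_dataset_and_purge is likewise assumed from the project's dataset I/O
# helpers. A hedged sketch of the behaviour purge_bad_datasets appears to rely
# on (drop datasets whose raw file no longer parses, optionally fetching a
# fresh copy); delete_dataset_file and download_dataset are placeholder names,
# not the project's real API.
def read_dataset_and_purge_sketch(dataset_id, redownload=True, verbose=True):
    d = read_dataset_and_log(dataset_id, Preprocess.RAW)
    if d is not None:
        return d  # Dataset reads fine; nothing to purge
    if verbose:
        print('Purging unreadable dataset {}'.format(dataset_id))
    delete_dataset_file(dataset_id, Preprocess.RAW)  # placeholder helper
    if redownload:
        download_dataset(dataset_id)                 # placeholder helper
    return None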
def read_all_datasets():
    dataset_ids = get_downloaded_dataset_ids()
    num_datasets = len(dataset_ids)
    for preprocess in Preprocess:
        print('### Reading {} preprocessed datasets ###'.format(preprocess.value))
        start_i = 0
        for i in range(start_i, num_datasets):
            print('{} of {}'.format(i + 1, num_datasets), end='\t')
            dataset_id = dataset_ids[i]
            read_dataset_and_log(dataset_id, preprocess=preprocess, verbose=True)
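# Hypothetical usage (not part of the original module): the drivers above are
# presumably run in this order once the raw datasets have been downloaded.
if __name__ == '__main__':
    read_raw_downloaded_datasets()  # Sanity-check that every raw file parses
    purge_bad_datasets()            # Drop (and optionally redownload) unreadable datasets
    preprocess_datasets()           # Write the ONEHOT/STANDARDIZED/ROBUST/WHITENED variants
    read_all_datasets()             # Verify that every preprocessed variant reads back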
import json
import os

from checksum import compute_checksum_from_dataset

# Change action below to recompute all checksums and overwrite the existing JSON data.
action = ['check', 'update'][0]

# Load existing checksum data
checksum_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'checksum_preprocessed_data.json')
try:
    with open(checksum_path, 'r') as fp:
        checksums = json.load(fp)
except FileNotFoundError:
    checksums = {}

# Compute checksums from data files and compare against or update existing checksums.
dataset_ids = get_downloaded_dataset_ids()
for dataset_id in dataset_ids:
    for preprocess in [Preprocess.ONEHOT, Preprocess.STANDARDIZED,
                       Preprocess.ROBUST, Preprocess.WHITENED]:
        checksum_key = f'{preprocess.value}/{dataset_id}'
        if action == 'check':
            if checksum_key not in checksums:
                print(f'Preprocessed data {preprocess.value}/{dataset_id} '
                      f'was found on disk but was not expected.')
            else:
                with open(get_dataset_filename(dataset_id, preprocess=preprocess),