Code example #1
def preprocess_datasets(start_i=0, overwrite=False, verbose=True):
    dataset_ids = get_downloaded_dataset_ids(Preprocess.RAW)
    num_datasets = len(dataset_ids)
    for i in range(start_i, num_datasets):
        dataset_id = dataset_ids[i]
        if verbose:
            print('Preprocessing {} of {} (dataset_id: {})'
                  .format(i + 1, num_datasets, dataset_id))
        # Don't waste time reading dataset if it's already been preprocessed
        if is_dataset_file(dataset_id, Preprocess.WHITENED) and not overwrite:
            if verbose: print('Already preprocessed, so skipping')
            continue
        d = read_dataset_and_log(dataset_id, Preprocess.RAW)
        if d is None: continue
        X, y, categorical = d['X'], d['y'], d['categorical']
        # convert to ndarray if not already
        X = to_ndarray(X)
        write_preprocessed_dataset_dict(X, y, categorical, dataset_id,
                                        Preprocess.ONEHOT)
        write_preprocessed_dataset_dict(X, y, categorical, dataset_id,
                                        Preprocess.STANDARDIZED)
        write_preprocessed_dataset_dict(X, y, categorical, dataset_id,
                                        Preprocess.ROBUST)
        write_preprocessed_dataset_dict(X, y, categorical, dataset_id,
                                        Preprocess.WHITENED)
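
The to_ndarray call above is a project helper whose implementation is not shown here. As a rough sketch (an assumption, not the actual code), it presumably densifies scipy sparse matrices and coerces anything else to a plain NumPy array:

import numpy as np
import scipy.sparse

def to_ndarray(X):
    # Hypothetical sketch of the helper used in code example #1:
    # densify sparse matrices, otherwise coerce to an ndarray.
    if scipy.sparse.issparse(X):
        return X.toarray()
    return np.asarray(X)
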
Code example #2
def read_raw_downloaded_datasets(start_i=0):
    dataset_ids = get_downloaded_dataset_ids()
    num_datasets = len(dataset_ids)
    for i in range(start_i, num_datasets):
        print('{} of {}'.format(i + 1, num_datasets), end='\t')
        dataset_id = dataset_ids[i]
        read_dataset_and_log(dataset_id, verbose=True)
Code example #3
def purge_bad_datasets(start_i=0, redownload=True, verbose=True):
    dataset_ids = get_downloaded_dataset_ids()
    num_datasets = len(dataset_ids)
    for i in range(start_i, num_datasets):
        print('{} of {}'.format(i + 1, num_datasets), end='\t')
        dataset_id = dataset_ids[i]
        read_dataset_and_purge(dataset_id,
                               redownload=redownload,
                               verbose=verbose)
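
A typical maintenance pass might chain these helpers: first purge (or re-download) datasets that fail to read, then regenerate the preprocessed variants. The module name below is hypothetical; the functions are the ones defined in code examples #1 and #3.

# Hypothetical usage; 'dataset_utils' is an assumed module name.
from dataset_utils import purge_bad_datasets, preprocess_datasets

purge_bad_datasets(redownload=True)              # replace or drop unreadable downloads
preprocess_datasets(start_i=0, overwrite=False)  # then preprocess anything still missing
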
Code example #4
def read_all_datasets():
    dataset_ids = get_downloaded_dataset_ids()
    num_datasets = len(dataset_ids)
    for preprocess in Preprocess:
        print('### Reading {} preprocessed datasets ###'.format(
            preprocess.value))
        start_i = 0
        for i in range(start_i, num_datasets):
            print('{} of {}'.format(i + 1, num_datasets), end='\t')
            dataset_id = dataset_ids[i]
            read_dataset_and_log(dataset_id,
                                 preprocess=preprocess,
                                 verbose=True)
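
All of the examples above reference a Preprocess enum whose definition is not shown. Judging from the members used (RAW, ONEHOT, STANDARDIZED, ROBUST, WHITENED) and the .value strings printed in code example #4, it is presumably something along these lines (a sketch; the string values are assumed):

from enum import Enum

class Preprocess(Enum):
    # Hypothetical reconstruction of the enum referenced in these examples;
    # the member names come from the calls above, the string values are guesses.
    RAW = 'raw'
    ONEHOT = 'onehot'
    STANDARDIZED = 'standardized'
    ROBUST = 'robust'
    WHITENED = 'whitened'
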
Code example #5
import json
import os

from checksum import compute_checksum_from_dataset
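# compute_checksum_from_dataset comes from the project's checksum module and is
# not shown here; a plausible sketch (an assumption, not the real code) would
# hash the dataset arrays:
#
#     import hashlib
#     def compute_checksum_from_dataset(d):
#         h = hashlib.md5()
#         h.update(d['X'].tobytes())  # assumes X and y are numpy arrays
#         h.update(d['y'].tobytes())
#         return h.hexdigest()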

# Change action below to recompute all checksums and overwrite the existing JSON data.
action = ['check', 'update'][0]

# Load existing checksum data
checksum_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'checksum_preprocessed_data.json')
try:
    with open(checksum_path, 'r') as fp:
        checksums = json.load(fp)
except FileNotFoundError:
    checksums = {}

# Compute checksums from data files and compare against or update existing checksums.
dataset_ids = get_downloaded_dataset_ids()
for dataset_id in dataset_ids:
    for preprocess in [
            Preprocess.ONEHOT, Preprocess.STANDARDIZED, Preprocess.ROBUST,
            Preprocess.WHITENED
    ]:
        checksum_key = f'{preprocess.value}/{dataset_id}'
        if action == 'check':
            if checksum_key not in checksums:
                print(
                    f'Preprocessed data {preprocess.value}/{dataset_id} was found on disk but was not expected.'
                )
            else:
                with open(
                        get_dataset_filename(dataset_id,
                                             preprocess=preprocess),