-
Notifications
You must be signed in to change notification settings - Fork 4
/
datasets.py
74 lines (62 loc) · 2.65 KB
/
datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
import numpy as np
import logging
import config, utils, data_transforms
logger = logging.getLogger(__name__)
def load_dataset(name):
    """
    Load dataset defined with name in SETTINGS json from cache. If not in cache
    create the dataset and store it in the cache.

    The cache is consulted first through utils.load_from_cache. When that call
    raises IOError the dataset is built with make_dataset, written back with
    utils.save_to_cache, and then returned.

    Args:
        name - a string of a valid dataset defined in SETTINGS

    Returns:
        the dataset, either as read from the cache or as freshly created

    Raises:
        Only an IOError raised by the cache read is handled here; any other
        exception from the cache read, from make_dataset or from
        utils.save_to_cache propagates to the caller.
    """
    try:
        dataset = utils.load_from_cache(name)
    except IOError:
        # Not available from the cache: build it and store it for next time.
        dataset = make_dataset(name)
        utils.save_to_cache(dataset, name)
    else:
        # Kept outside the try body so the IOError handler guards nothing but
        # the cache read itself. Arguments are passed lazily to the logger.
        logger.info('Loaded dataset %s from cache', name)
    return dataset
def make_dataset(name):
    """
    Create the dataset defined with name in SETTINGS json. A dataset definition
    in SETTINGS takes the form:

        { "DatasetName": {
            "input_data": ["NamesOfAnyDatasetsRequiredAsInput", ...],
            "transforms": [[ "TransformName1", { "TransformArg1": Value1,
                                                 "TransformArg2": Value2, ... }],
                           ... ]
            }
        }

    "DatasetName" is the name that will be used for the dataset throughout the
    model. This name is used for accessing configuration information, and when
    passing the dataset as input into other dataset definitions or model
    definitions.

    "input_data" is a list of other dataset names that are required as input
    for the creation of the dataset. Each one is obtained with load_dataset.

    "transforms" is a list of lists. Each sub-list must have length 2 and
    contain a transform name, which is a valid name of a function defined in
    data_transforms.py, and a dict of arguments required to pass into the
    transform. All transforms defined in data_transforms.py must have the
    structure:

        def transform_name(input_data, **args):
            ...

    The "transforms" list allows for chaining of transforms for creating
    complex datasets. The input for the first transform in the chain is always
    defined in "input_data": when exactly one input dataset is listed it is
    passed directly, otherwise the list of loaded datasets (in the order
    listed) is passed. The next transform in the chain takes the output of the
    previous transform as input_data.

    Args:
        name - a string of a valid dataset defined in SETTINGS

    Returns:
        the output of the last transform in the chain (or the loaded input
        data unchanged if "transforms" is empty)

    Raises:
        AttributeError - if a transform name is not defined in data_transforms
    """
    cfgs = config.dataset_configs(name)
    data = [load_dataset(ds) for ds in cfgs['input_data']]
    if len(data) == 1:
        # A single input is handed to the first transform as-is, not wrapped
        # in a one-element list.
        data = data[0]
    logger.info('Creating dataset %s', name)
    for tname, args in cfgs['transforms']:
        try:
            transform = getattr(data_transforms, tname)
        except AttributeError:
            # One literal inside parentheses: a backslash continuation inside
            # the string would embed the next line's indentation in the
            # message.
            raise AttributeError(
                'Unable to find transform %s in data_transforms.py' % tname)
        logger.info('Applying %s on %s', tname, name)
        data = transform(data, **args)
    return data