def __init__(self, train_examples, test_examples=None, add_generated_examples=True):
    """Set up the dataset from already loaded train and test examples.

    Arguments:
    train_examples -- examples passed to ``self.prepare_dataset`` to build the training set.
    test_examples -- examples passed to ``self.prepare_dataset`` to build the test set.
    add_generated_examples -- forwarded unchanged to ``self.prepare_dataset`` for both sets.
    """
    self.logger.info("Loading DS from files...")
    self.augmenter = SignalAugmenter(augmentation_start=0.1, augmentation_end=0.9)

    # NOTE(review): test_examples defaults to None but is always handed to
    # prepare_dataset and its result is dereferenced below -- confirm that
    # prepare_dataset copes with None.
    train_set = self.prepare_dataset(train_examples, add_generated_examples)
    test_set = self.prepare_dataset(test_examples, add_generated_examples)

    # Swap keys and values of label_id_mapping so ids can be looked up directly.
    self.id_label_mapping = dict(
        (value, key) for key, value in self.label_id_mapping.items()
    )

    # Features are flattened before being stored; labels are stored as they come.
    self.X_train = self.flatten2d(train_set.features)
    self.y_train = train_set.labels
    self.X_test = self.flatten2d(test_set.features)
    self.y_test = test_set.labels

    # Summary numbers derived from the mapping and the stored arrays.
    self.num_labels = len(self.id_label_mapping)
    train_shape = self.X_train.shape
    self.num_features = train_shape[1]
    self.num_train_examples = train_shape[0]
    self.num_test_examples = self.X_test.shape[0]
def __init__(self, directory):
    """Build the train and test arrays from the examples found via ``directory``.

    The examples returned by ``self.load_examples(directory)`` are shuffled and
    split using ``TRAIN_RATIO``. Each part is then augmented, shuffled again and
    scaled before its features are flattened and stored on the instance.
    """
    self.logger.info("Loading DS from files...")
    self.augmenter = SignalAugmenter(augmentation_start=0.1, augmentation_end=0.9)

    all_examples = self.load_examples(directory)
    all_examples.shuffle()
    train_part, test_part = all_examples.split(self.TRAIN_RATIO)

    # Both parts go through the same steps, training part first.
    prepared = []
    for message, subset in (
        ("Augmented `train` with %d examples, %d originally", train_part),
        ("Augmented `test` with %d examples, %d originally", test_part),
    ):
        grown = self.augmenter.augment_examples(subset, 400)
        # Single parenthesised argument: prints the same text as a print statement.
        print(message % (grown.num_examples - subset.num_examples, subset.num_examples))
        grown.shuffle()
        grown.scale_features(self.Feature_Range, self.Feature_Mean)
        prepared.append(grown)
    train_set, test_set = prepared

    # Swap keys and values of label_mapping so ids can be looked up directly.
    self.human_labels = dict(
        (value, key) for key, value in self.label_mapping.items()
    )

    self.X_train = self.flatten2d(train_set.features)
    self.y_train = train_set.labels
    self.X_test = self.flatten2d(test_set.features)
    self.y_test = test_set.labels

    self.num_labels = len(self.human_labels)
    train_shape = self.X_train.shape
    self.num_features = train_shape[1]
    self.num_train_examples = train_shape[0]
    self.num_test_examples = self.X_test.shape[0]
class AccelerationDataset(object):
    """Dataset containing examples based on acceleration data and their labels."""

    logger = logging.getLogger("analysis.AccelerationDataset")

    TRAIN_RATIO = 0.8

    # This defines the range of the values the accelerometer measures
    Feature_Range = 8000
    Feature_Mean = 0

    # Augmenter used to increase the number of training examples
    augmenter = None

    # Number of examples
    num_train_examples = None
    num_test_examples = None

    # Number of classes
    num_labels = None
    num_features = None

    # Indicator if the data has been loaded yet
    # NOTE(review): nothing in this class ever sets this to True.
    initialized = False

    # Mapping of integer class labels to strings (human_labels) and of
    # string labels to integer ids (label_mapping). Both are rebound per
    # instance before use, so the class-level dicts are never mutated.
    human_labels = {}
    label_mapping = {}

    def human_label_for(self, label_id):
        """Convert a label id into a human readable string label."""
        return self.human_labels[label_id]

    def save_labels(self, filename):
        """Store the label <--> id mapping to file.

        One label is written per line, ordered by ascending label id, so the
        id is defined by the line number.
        """
        labels = [self.human_labels[label_id] for label_id in sorted(self.human_labels)]
        # The context manager closes the file even if a write fails.
        with open(filename, "wb") as f:
            for label in labels:
                f.write("%s\n" % label)

    @staticmethod
    def _find_csv_files(root_directory):
        """Return the paths of all files below root_directory whose path ends in "csv"."""
        csv_files = []
        for dirpath, _, filenames in os.walk(root_directory):
            for name in filenames:
                # dirpath already starts with root_directory; joining the root
                # a second time would break relative directories.
                candidate = os.path.join(dirpath, name)
                if os.path.isfile(candidate) and candidate.endswith("csv"):
                    csv_files.append(candidate)
        return csv_files

    def load_examples(self, path):
        """Load examples contained in the path into an example collection.

        Examples need to be stored in CSVs. The label mapping of the instance
        is reset and rebuilt: every label seen for the first time gets the next
        free integer id.

        Arguments:
        path -- can be directory or zipfile.
                If zipfile, it will be extracted to a tmp path with prefix
                /tmp/muvr-training-
        """
        self.label_mapping = {}

        if os.path.isdir(path):
            root_directory = path
        else:
            # Zip file - extract to temp root_directory first
            # NOTE(review): the temporary directory is not removed afterwards.
            root_directory = tempfile.mkdtemp(prefix="/tmp/muvr-training-")
            with zipfile.ZipFile(path, "r") as archive:
                archive.extractall(root_directory)

        Xs = []
        ys = []
        for csv_file in self._find_csv_files(root_directory):
            X, label = self.load_example(csv_file)
            label_id = self.label_mapping.setdefault(label, len(self.label_mapping))
            Xs.append(X)
            ys.append(label_id)

        return ExampleColl(Xs, ys)

    @staticmethod
    def load_example(filename):
        """Load a single example from a CSV file.

        The first two columns of a row are joined with "/" to form the label
        (the label of the last row wins); the remaining columns are parsed as
        floats. Returns a tuple of the transposed value matrix and the label.

        Raises ValueError if the file yields no rows.
        """
        # Binary mode is what the Python 2 csv module expects.
        with open(filename, "rb") as csvfile:
            dialect = csv.Sniffer().sniff(csvfile.read(1024))
            csvfile.seek(0)
            label = None
            rows = []
            for row in csv.reader(csvfile, dialect):
                label = row[0] + "/" + row[1]
                rows.append(row[2:])

        if not rows:
            raise ValueError("No rows found in example file %s" % filename)

        # One row per CSV line before transposing, one row per column after.
        return np.asarray(rows, dtype=float).T, label

    def _augment_and_scale(self, examples, name):
        """Augment, shuffle and scale one part of the data; name is used for the progress message."""
        # NOTE(review): the meaning of 400 is defined by SignalAugmenter.augment_examples.
        augmented = self.augmenter.augment_examples(examples, 400)
        # Single parenthesised argument: valid and identical on Python 2 and 3.
        print(
            "Augmented `%s` with %d examples, %d originally"
            % (name, augmented.num_examples - examples.num_examples, examples.num_examples)
        )
        augmented.shuffle()
        augmented.scale_features(self.Feature_Range, self.Feature_Mean)
        return augmented

    # Load label mapping and train / test data from disk.
    def __init__(self, directory):
        """Load the dataset data from the directory.

        The loaded examples are shuffled and split using TRAIN_RATIO. Both
        parts are augmented, shuffled and scaled, and their features are
        flattened before being stored on the instance.
        """
        self.logger.info("Loading DS from files...")
        self.augmenter = SignalAugmenter(augmentation_start=0.1, augmentation_end=0.9)

        examples = self.load_examples(directory)
        examples.shuffle()
        train, test = examples.split(self.TRAIN_RATIO)

        augmented_train = self._augment_and_scale(train, "train")
        augmented_test = self._augment_and_scale(test, "test")

        # Invert label -> id (built by load_examples) into id -> label.
        self.human_labels = {v: k for k, v in self.label_mapping.items()}

        self.X_train = self.flatten2d(augmented_train.features)
        self.y_train = augmented_train.labels
        self.X_test = self.flatten2d(augmented_test.features)
        self.y_test = augmented_test.labels

        self.num_labels = len(self.human_labels)
        self.num_features = self.X_train.shape[1]
        self.num_train_examples = self.X_train.shape[0]
        self.num_test_examples = self.X_test.shape[0]

    @staticmethod
    def flatten2d(npa):
        """Flatten every dimension after the first, giving a 2D array."""
        return npa.reshape((npa.shape[0], -1))

    def _data_iterator(self, X, y):
        """Wrap features and labels into a DataIterator with one-hot labels."""
        return DataIterator(
            X=X,
            y=y,
            nclass=self.num_labels,
            make_onehot=True,
            lshape=(self.num_features, 1, 1),
        )

    # Get the dataset ready for Neon training
    def train(self):
        """Provide neon data iterator for training purposes."""
        return self._data_iterator(self.X_train, self.y_train)

    def test(self):
        """Provide neon data iterator for testing purposes."""
        return self._data_iterator(self.X_test, self.y_test)