def test_process_csv_file(self):
    """Parse the test CSV through Dataset.Builder and verify the row count
    and the number of categories gathered for the categorical column."""
    schema = Schema.from_csv(csv_path=self.test_csv_file_path)
    input = Input(schema)
    input.add_categorical_column('col_0')
    rows = Dataset.Builder(input=input,
                           name="test",
                           root_dir=self.test_dir,
                           parallelism_level=2)._process_csv_files()
    self.assertEqual(len(rows), 10)
    for column in input.columns:
        if column.name == 'col_0':
            # BUG FIX: the original used assertTrue(len(...), 4). assertTrue
            # treats its second argument as the failure *message*, so the
            # assertion passed whenever the category list was merely
            # non-empty. assertEqual pins the intended category count.
            self.assertEqual(len(column.metadata.categories), 4)
def create_test_dataset(test_dir, test_csv_file_path, dataset_name, header=False, is_related_path=False):
    """Build and return a test Dataset from a CSV file.

    Columns col_0/col_1/col_5 (or their '_h' variants when `header` is True)
    are mapped to categorical / numeric / 2-D image columns respectively, and
    columns 2..4 are merged into a single vector column 'col_vector'.

    :param test_dir: root directory the dataset is built into
    :param test_csv_file_path: path of the source CSV file
    :param dataset_name: name given to the built dataset
    :param header: whether the CSV has a header row (switches column names
                   to the '*_h' variants)
    :param is_related_path: passed through to Img2DColumn (image paths are
                            relative to the CSV location when True)
    :return: the Dataset produced by Dataset.Builder(...).build()
    """
    col_0 = 'col_0'
    col_1 = 'col_1'
    col_5 = 'col_5'
    if header:
        col_0 = 'col_0_h'
        col_1 = 'col_1_h'
        col_5 = 'col_5_h'
    schema = Schema.from_csv(csv_path=test_csv_file_path, header=header)
    schema.merge_columns_in_range('col_vector', (2, 4))
    # Renamed from `input` to avoid shadowing the builtin of the same name.
    dataset_input = Input(schema)
    dataset_input.add_categorical_column(col_0)
    dataset_input.add_numeric_column(col_1)
    dataset_input.add_vector_column('col_vector')
    img2d = Img2DColumn(is_related_path=is_related_path)
    dataset_input.add_column(col_5, img2d)
    return Dataset.Builder(dataset_input, dataset_name, test_dir, parallelism_level=2).build()
def test_write_read_record_raw_img_false(self):
    """Round-trip one processed CSV row through the HDF5 RecordWriter and
    RecordReader (with is_raw_img=False) and verify the image column
    deserializes back to the original image on disk."""
    schema = Schema.from_csv(csv_path=self.test_csv_file_path)
    schema.merge_columns_in_range('col_vector', (2, 4))
    input = Input(schema)
    input.add_categorical_column('col_0')
    # Attach pre-built categorical metadata to col_0.
    # NOTE(review): `categories` is not defined in this method; it appears
    # to come from an enclosing module/class scope -- confirm before relying
    # on it.
    for col in input.columns:
        if col.name == 'col_0':
            cat_meta = CategoricalColumnMetadata()
            cat_meta._categories = categories
            col.metadata = cat_meta
    input.add_numeric_column('col_1')
    input.add_vector_column('col_vector')
    input.add_column("col_5", Img2DColumn(pre_transforms=[], post_transforms=[], is_raw_img=False))
    os.makedirs(os.path.join(self.test_dir, Dataset.DATA_DIR_NAME))
    writer = RecordWriter.factory('HDF5', self.test_dir, input.columns)
    # Take the first data row of the CSV and strip whitespace from each cell.
    first_row = Schema.read_n_rows(
        csv_file_path=self.test_csv_file_path, delimiter=",", rows_number=1)[0]
    csv_row = [cell.strip() for cell in first_row]
    # Serialize every column, persist record 0, then read it back.
    serialized = {col.name: col.process_on_write(csv_row) for col in input.columns}
    writer.write(serialized, 0)
    reader = RecordReader.factory('HDF5', self.test_dir)
    record = reader.read(0)
    deserialized = {col.name: col.process_on_read(record) for col in input.columns}
    img_deserialized = deserialized['col_5']
    img_original = skimgio.imread(self.test_img_file_path)
    self.assertTrue(np.array_equal(img_deserialized, img_original))
# Demo script: build (or reload) a test image dataset and print the shapes of
# one training batch. Written for Python 2 (uses the `print` statement).
import os
import glob

# NOTE(review): `datasets_base_path` is only assigned in the commented-out
# line below -- as written, the glob call raises NameError. Restore the
# assignment (or define the path some other way) before running.
# datasets_base_path = app_flask.config['DATASETS_BASE_PATH']

# Look for any previously built dataset under the base path.
lstDB = glob.glob('%s/test-*' % datasets_base_path)
numDB = len(lstDB)
if numDB < 1:
    # No existing dataset: build one from the bundled test CSV.
    path_csv = '../../../../data-test/dataset-image2d/simple4c_test/test-csv-v1.csv'
    if not os.path.isfile(path_csv):
        raise Exception('Cant find file [%s]' % path_csv)
    schema = Schema.from_csv(path_csv, header=True, delimiter=',')
    schema.merge_columns_in_range('col_vector', (2, 4))
    schema.print_data()
    # Rename the 'path' schema column to 'image' before wiring up the Input.
    schema['path'] = 'image'
    schema.print_columns()
    input = Input(schema=schema)
    input.add_categorical_column("label")
    input.add_vector_column('col_vector')
    # is_related_path=True: image paths are resolved relative to the CSV.
    img2d = Img2DColumn(is_related_path=True)
    input.add_column("image", img2d)
    dataset = Dataset.Builder(input, "test", datasets_base_path, parallelism_level=2).build()
else:
    # Reuse the first dataset found instead of rebuilding.
    dataset = Dataset.load(lstDB[0])
dataShapes = dataset.shapes()
data = dataset.get_train_batch(5)
# Print the shape of each column in the batch (accesses the private
# `_data` attribute directly to enumerate the keys).
for k in data._data.keys():
    print '%s : %s' % (k, data[k].shape)