def test_escape_dots(self):
    data = CSVparser.to_dict(current_dir + '/mockups/csv/test_dot_escaping.csv')
    self.assertEqual(data, [{
        "Data$Column2": "data2",
        "Data\uff0eColumn1": "data1"
    }])

def test_latin_1(self):
    data = CSVparser.to_dict(current_dir + '/mockups/csv/test_unicode.csv')
    self.assertEqual(data, [{
        "column1": "data1",
        "column2": "data2"
    }, {
        "column1": "data3",
        "column2": "Siberië"
    }])

def test_numerical(self):
    data = CSVparser.to_dict(current_dir + '/mockups/csv/test_numbers.csv')
    self.assertEqual(data, [{
        "column1": "data1",
        "num": 1
    }, {
        "column1": "data3",
        "num": 2
    }])

def test_simple(self):
    data = CSVparser.to_dict(current_dir + '/mockups/csv/test_simple.csv')
    self.assertEqual(data, [{
        "column1": "data1",
        "column2": "data2"
    }, {
        "column1": "data3",
        "column2": "data4"
    }])

def test_special_char_escaping(self):
    data = CSVparser.to_dict(current_dir + '/mockups/schema/specialCharacterTest/test.csv')
    schema = SchemaGenerator.generate_schema(data)
    self.assertDictEqual(deep_sort(schema), deep_sort({
        'type': 'array',
        'items': {
            'type': 'object',
            'properties': {
                'Data\uff0eColumn1': {'type': 'string'},
                'Data$Column2': {'type': 'string'}
            },
            'required': ['Data\uff0eColumn1', 'Data$Column2']
        }
    }))
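# NOTE: the deep_sort helper used in test_special_char_escaping is not shown in
# this section. A minimal sketch of what such a helper could look like is given
# below as an assumption (deep_sort_sketch is a hypothetical name, not the
# project's actual implementation): it recursively normalizes nested dicts and
# lists so that two schemas compare equal regardless of element order, e.g. in
# the 'required' list.
def deep_sort_sketch(obj):
    """Hypothetical order-insensitive normalizer for nested dicts/lists."""
    if isinstance(obj, dict):
        # Sort dict items by key and normalize the values recursively
        return {key: deep_sort_sketch(value) for key, value in sorted(obj.items())}
    if isinstance(obj, list):
        # Sort list elements by their repr so heterogeneous items stay comparable
        return sorted((deep_sort_sketch(item) for item in obj), key=repr)
    return obj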
def test_schema_generator(self):
    data = CSVparser.to_dict(current_dir + '/mockups/csv/test_simple.csv')
    schema = SchemaGenerator.generate_schema(data)
    self.assertEqual(schema, {
        'type': 'array',
        'items': {
            'type': 'object',
            'properties': {
                'column1': {'type': 'string'},
                'column2': {'type': 'string'}
            },
            'required': ['column1', 'column2']
        }
    })

def test_numpy_float_error(self):
    data = CSVparser.to_dict(current_dir + '/mockups/schema/numpy-float64/float64.csv')
    schema = SchemaGenerator.generate_schema(data)
    self.assertEqual(schema, {
        'type': 'array',
        'items': {
            'type': 'object',
            'properties': {
                'id': {'type': 'number'},
                'put': {'type': 'number'}
            },
            'required': ['id', 'put']
        }
    })
def run(file_path):
    # Init logging and database
    init_logging()
    client, file_col, schema_col, source_data_col = init_mongodb(config)

    # Set up counters and file index
    successfully_ingested_files = 0
    file_counter = 0
    file_list = DirLister.get_file_list_recursive(file_path)
    logging.info('Processing %d files from %s' % (len(file_list), file_path))

    for file in file_list:
        file_counter += 1
        ProgressBar.update_progress(file_counter / len(file_list), ('Processing file %s' % file))

        # Get the file stats
        document = {
            'stats': FileStatter.stats(file),
            'filePath': file,
            '_id': file,
            'hash': FileStatter.sha1_from_file(file)
        }

        # Load the data or skip the file if unable
        if file.lower().endswith('.mif'):
            try:
                data = MIFparser.to_dict(file)
            except ValueError as e:
                # If the data loading doesn't work out, just log the error and skip the file
                logging.error(e)
                continue
        elif file.lower().endswith('.mid'):
            # .mid files are processed along with their parent .mif file
            logging.debug('Skipping .mid file.')
            continue
        else:
            try:
                data = CSVparser.to_dict(file)
            except ValueError as e:
                # If the data loading doesn't work out, just log the error and skip the file
                logging.error('CSV parsing error on file %s: %s' % (file, e))
                continue

        # Generate the schema and try to ingest it
        try:
            schema_data = SchemaGenerator.generate_schema(data)
        except Exception as e:
            logging.error('Schema error on file %s: %s' % (file, e))
            continue

        schema_hash = FileStatter.sha1(schema_data)
        schema = {
            '_id': schema_hash,
            'schema': schema_data,
        }
        try:
            schema_col.insert_one(schema)
        except DuplicateKeyError:
            logging.debug('Schema %s was previously processed' % schema_hash)
        except Exception as e:
            # If the schema ingest doesn't work out, just log the error and skip the file
            logging.error('Ingest schema error on file %s: %s' % (file, e))
            continue

        # Store the source data
        source_data_doc_sha1 = FileStatter.sha1(data)
        source_data_doc = {'_id': source_data_doc_sha1, 'data': data}
        try:
            source_data_col.insert_one(document=source_data_doc)
        except DuplicateKeyError:
            logging.debug('Sourcedata with sha1 %s was previously processed' % source_data_doc_sha1)
        except Exception as e:
            logging.error('Ingest source data error on file %s: %s' % (file, e))
            continue

        # Finalize the file document with the data reference and the schema reference
        document['data'] = source_data_doc_sha1
        document['schema'] = schema['_id']
        try:
            file_col.insert_one(document=document)
        except DuplicateKeyError:
            # Skip to the next file
            logging.warning('File %s was previously processed, skipping' % file)
            continue
        except Exception as e:
            logging.error('Ingest file metadata error on file %s: %s' % (file, e))
            continue

        logging.debug('File %s was successfully ingested' % file)
        successfully_ingested_files += 1

    logging.info('Finished!')
    logging.info('Successfully ingested %d files of %d' % (successfully_ingested_files, len(file_list)))
    client.close()
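# A minimal command-line entry point for run() is sketched below; the argument
# name, help text, and module layout are assumptions for illustration, not part
# of the original ingest script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Ingest CSV/MIF files into MongoDB.')
    parser.add_argument('file_path', help='Directory to scan recursively for input files')
    args = parser.parse_args()

    run(args.file_path)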
def test_data_hasher_from_schema_dict(self):
    dictionary = CSVparser.to_dict(current_dir + '/mockups/schema/caseInsensitiveTest/test.csv')
    sha1 = FileStatter.sha1(SchemaGenerator.generate_schema(dictionary))
    self.assertEqual(sha1, 'a59a9b5c48657c3828c4c308cd057997aa7927fb')

def test_empty_csv(self):
    with self.assertRaisesRegex(ValueError, 'empty or invalid'):
        CSVparser.to_dict(current_dir + '/mockups/csv/empty.csv')

def test_no_csv(self):
    with self.assertRaisesRegex(ValueError, 'invalid csv'):
        CSVparser.to_dict(current_dir + '/mockups/csv/test_no_csv.txt')

def test_non_unicode(self):
    data = CSVparser.to_dict(current_dir + '/mockups/csv/test_non_unicode.csv')
    self.assertEqual(data[0]['OMSCHRIJF'], 'mal voor reliëf')