def process(filename, root_path, name, desc, about, preview=False, check=False, allowEmpty=True):
    """
    The recipe for grabbing the file and pushing it to FluidDB
    """
    # Turn the raw input file into a list structure holding the records we
    # will import into FluidDB.
    raw_data = clean_data(filename)
    number_of_records = len(raw_data)
    logger.info('Raw filename: %r' % filename)
    logger.info('Root namespace path: %r' % root_path)
    logger.info('About tag field key: %r' % about)
    logger.info('%d records found' % number_of_records)
    if preview:
        # Dry run: log/return a description of what WOULD be created,
        # without touching FluidDB.
        logger.info('Generating preview...')
        lines = [
            "Preview of processing %r\n" % filename,
            "The following namespaces/tags will be generated.\n",
        ]
        lines.extend(get_preview(raw_data, root_path))
        lines.append("\n%d records will be imported into FluidDB\n" %
                     number_of_records)
        result = "\n".join(lines)
        logger.info(result)
        return result
    elif check:
        # Validate the records against the template taken from the first
        # entry and report any problems found.
        logger.info('Validating %r\n' % filename)
        errors, warnings = validate(raw_data)
        report = []
        if errors:
            report.append("The following ERRORS were found:\n")
            report.extend(errors)
            report.append('\n')
        if warnings:
            report.append("The following WARNINGS were generated:\n")
            report.extend(warnings)
        result = "\n".join(report) if report else "Validation passed ok"
        logger.info(result)
        return result
    else:
        # The real thing: push every record into FluidDB.
        process_data_list(raw_data, root_path, name, desc, about, allowEmpty)
        return "Processed %d records" % number_of_records
] # the Python list of dictionaries you want to process root_path = 'test/foo'# Namespace where imported namespaces/tags are created name = 'dataset_name' # used when creating namespace/tag descriptions desc = 'Plain English dataset description' # exactly what it says about = 'foo' # field whose value to use for the about tag preview = False # True will cause flimp to print out the preview # Make magic happen... process_data_list(data_list, root_path, name, desc, about) # You can also validate the list to check for dictionaries that don't match # the "template" taken from the first entry in the list. # missing = missing fields, extras = extra fields not in the template - both # are lists of instances of these problems. missing, extras = validate(data_list) # In the case of cleaning csv data you have several ways to normalise / clean # the input def clean_header(header): """ A function that takes a column name header and normalises / cleans it into something we'll use as the name of a tag """ # remove leading/trailing whitespace, replace inline whitespace with # underscore and any slashes with dashes. return header.strip().replace(' ', '_').replace('/', '-') def clean_row_item(item): """ A function that takes the string value of an individual item of data that
def test_validate(self):
    """
    Exercises validate() against well-formed data, data with a missing
    field, data with an extra field, and data whose nested dictionaries
    differ from the template record.
    """
    # Well-formed: every record matches the template (the first record).
    records = [
        {'foo': 'a', 'bar': {'baz': 'b'}, 'bof': 'c'},
        {'foo': 'x', 'bar': {'baz': 'y'}, 'bof': 'z'},
    ]
    missing, extras = validate(records)
    self.assertEqual([], missing)  # no problem
    self.assertEqual([], extras)  # no problem

    # Second record lacks the 'bof' field present in the template.
    records = [
        {'foo': 'a', 'bar': {'baz': 'b'}, 'bof': 'c'},
        {'foo': 'x', 'bar': {'baz': 'y'}},
    ]
    missing, extras = validate(records)
    self.assertEqual([], extras)  # no problem
    self.assertEqual(1, len(missing))
    self.assertTrue("Field 'bof'" in missing[0])

    # Second record carries a 'qux' field the template doesn't have.
    records = [
        {'foo': 'a', 'bar': {'baz': 'b'}, 'bof': 'c'},
        {'foo': 'x', 'bar': {'baz': 'y'}, 'bof': 'z', 'qux': 'quux'},
    ]
    missing, extras = validate(records)
    self.assertEqual([], missing)  # no problem
    self.assertEqual(1, len(extras))
    self.assertTrue("Field 'qux' in record" in extras[0])

    # Validation must recurse into sub-dictionaries: 'quux' replaces
    # 'baz' inside 'bar', so one extra AND one missing are reported.
    records = [
        {'foo': 'a', 'bar': {'baz': 'b'}, 'bof': 'c'},
        {'foo': 'x', 'bar': {'quux': 'bif'}, 'bof': 'z'},
    ]
    missing, extras = validate(records)
    self.assertEqual(1, len(extras))
    self.assertTrue("Field 'quux' in record" in extras[0])
    self.assertEqual(1, len(missing))
    self.assertTrue("Field 'baz' in record" in missing[0])
] # the Python list of dictionaries you want to process root_path = 'test/foo' # Namespace where imported namespaces/tags are created name = 'dataset_name' # used when creating namespace/tag descriptions desc = 'Plain English dataset description' # exactly what it says about = 'foo' # field whose value to use for the about tag preview = False # True will cause flimp to print out the preview # Make magic happen... process_data_list(data_list, root_path, name, desc, about) # You can also validate the list to check for dictionaries that don't match # the "template" taken from the first entry in the list. # missing = missing fields, extras = extra fields not in the template - both # are lists of instances of these problems. missing, extras = validate(data_list) # In the case of cleaning csv data you have several ways to normalise / clean # the input def clean_header(header): """ A function that takes a column name header and normalises / cleans it into something we'll use as the name of a tag """ # remove leading/trailing whitespace, replace inline whitespace with # underscore and any slashes with dashes. return header.strip().replace(' ', '_').replace('/', '-') def clean_row_item(item):