Example #1
0
def process(filename,
            root_path,
            name,
            desc,
            about,
            preview=False,
            check=False,
            allowEmpty=True):
    """
    The recipe for grabbing the file and pushing it to FluidDB
    """
    # Normalise the raw input file into a list of items to import into
    # FluidDB, and log the run parameters up front.
    raw_data = clean_data(filename)
    number_of_records = len(raw_data)
    logger.info('Raw filename: %r' % filename)
    logger.info('Root namespace path: %r' % root_path)
    logger.info('About tag field key: %r' % about)
    logger.info('%d records found' % number_of_records)

    if preview:
        # Dry run: describe what would be created, import nothing.
        logger.info('Generating preview...')
        lines = ["Preview of processing %r\n" % filename,
                 "The following namespaces/tags will be generated.\n"]
        lines.extend(get_preview(raw_data, root_path))
        lines.append("\n%d records will be imported into FluidDB\n" %
                     number_of_records)
        result = "\n".join(lines)
        logger.info(result)
        return result

    if check:
        # Validation-only run: report errors/warnings, import nothing.
        logger.info('Validating %r\n' % filename)
        report = []
        errors, warnings = validate(raw_data)
        if errors:
            report.append("The following ERRORS were found:\n")
            report.extend(errors)
            report.append('\n')
        if warnings:
            report.append("The following WARNINGS were generated:\n")
            report.extend(warnings)
        result = "\n".join(report) if report else "Validation passed ok"
        logger.info(result)
        return result

    # Neither preview nor check: actually push the records to FluidDB.
    process_data_list(raw_data, root_path, name, desc, about, allowEmpty)
    return "Processed %d records" % number_of_records
Example #2
0
def process(filename, root_path, name, desc, about, preview=False,
            check=False, allowEmpty=True):
    """
    The recipe for grabbing the file and pushing it to FluidDB
    """
    # Turn the raw input file into the list of records to be imported, then
    # log the key parameters of this run.
    raw_data = clean_data(filename)
    number_of_records = len(raw_data)
    for message in ('Raw filename: %r' % filename,
                    'Root namespace path: %r' % root_path,
                    'About tag field key: %r' % about,
                    '%d records found' % number_of_records):
        logger.info(message)

    if not (preview or check):
        # The real thing: import every record into FluidDB.
        process_data_list(raw_data, root_path, name, desc, about, allowEmpty)
        return "Processed %d records" % number_of_records

    if preview:
        # Dry run: build a human-readable summary of what would happen.
        logger.info('Generating preview...')
        summary = ["Preview of processing %r\n" % filename]
        summary.append("The following namespaces/tags will be generated.\n")
        summary.extend(get_preview(raw_data, root_path))
        summary.append("\n%d records will be imported into FluidDB\n" %
                       number_of_records)
        result = "\n".join(summary)
    else:
        # check == True: validate the records and report any problems.
        logger.info('Validating %r\n' % filename)
        report = []
        errors, warnings = validate(raw_data)
        if errors:
            report.append("The following ERRORS were found:\n")
            report.extend(errors)
            report.append('\n')
        if warnings:
            report.append("The following WARNINGS were generated:\n")
            report.extend(warnings)
        result = "\n".join(report) if report else "Validation passed ok"
    logger.info(result)
    return result
Example #3
0
] # the Python list of dictionaries you want to process
root_path = 'test/foo'# Namespace where imported namespaces/tags are created
name = 'dataset_name' # used when creating namespace/tag descriptions 
desc = 'Plain English dataset description' # exactly what it says
about = 'foo' # field whose value to use for the about tag
preview = False # True will cause flimp to print out the preview

# Make magic happen...
process_data_list(data_list, root_path, name, desc, about)

# You can also validate the list to check for dictionaries that don't match
# the "template" taken from the first entry in the list.

# missing = missing fields, extras = extra fields not in the template - both
# are lists of instances of these problems.
missing, extras = validate(data_list)

# In the case of cleaning csv data you have several ways to normalise / clean
# the input
def clean_header(header):
    """
    A function that takes a column name header and normalises / cleans it into
    something we'll use as the name of a tag
    """
    # Trim surrounding whitespace, then map spaces to underscores and slashes
    # to dashes in a single translation pass.
    return header.strip().translate(str.maketrans({' ': '_', '/': '-'}))

def clean_row_item(item):
    """
    A function that takes the string value of an individual item of data that
Example #4
0
 def test_validate(self):
     """
     validate() must report missing and extra fields, including those in
     nested sub-dictionaries, using the first record as the template.
     """
     # All records share the template structure: nothing to report.
     records = [
         {'foo': 'a', 'bar': {'baz': 'b'}, 'bof': 'c'},
         {'foo': 'x', 'bar': {'baz': 'y'}, 'bof': 'z'},
     ]
     missing, extras = validate(records)
     self.assertEqual([], missing)  # no problem
     self.assertEqual([], extras)  # no problem
     # Second record lacks 'bof': exactly one missing-field report.
     records = [
         {'foo': 'a', 'bar': {'baz': 'b'}, 'bof': 'c'},
         {'foo': 'x', 'bar': {'baz': 'y'}},
     ]
     missing, extras = validate(records)
     self.assertEqual([], extras)  # no problem
     self.assertEqual(1, len(missing))
     self.assertTrue("Field 'bof'" in missing[0])
     # Second record carries an unexpected 'qux': one extra-field report.
     records = [
         {'foo': 'a', 'bar': {'baz': 'b'}, 'bof': 'c'},
         {'foo': 'x', 'bar': {'baz': 'y'}, 'bof': 'z', 'qux': 'quux'},
     ]
     missing, extras = validate(records)
     self.assertEqual([], missing)  # no problem
     self.assertEqual(1, len(extras))
     self.assertTrue("Field 'qux' in record" in extras[0])
     # Sub-dictionaries are validated too: replacing 'baz' with 'quux'
     # inside 'bar' yields one missing and one extra report.
     records = [
         {'foo': 'a', 'bar': {'baz': 'b'}, 'bof': 'c'},
         {'foo': 'x', 'bar': {'quux': 'bif'}, 'bof': 'z'},
     ]
     missing, extras = validate(records)
     self.assertEqual(1, len(extras))
     self.assertTrue("Field 'quux' in record" in extras[0])
     self.assertEqual(1, len(missing))
     self.assertTrue("Field 'baz' in record" in missing[0])
Example #5
0
]  # the Python list of dictionaries you want to process
root_path = 'test/foo'  # Namespace where imported namespaces/tags are created
name = 'dataset_name'  # used when creating namespace/tag descriptions
desc = 'Plain English dataset description'  # exactly what it says
about = 'foo'  # field whose value to use for the about tag
preview = False  # True will cause flimp to print out the preview

# Make magic happen...
process_data_list(data_list, root_path, name, desc, about)

# You can also validate the list to check for dictionaries that don't match
# the "template" taken from the first entry in the list.

# missing = missing fields, extras = extra fields not in the template - both
# are lists of instances of these problems.
missing, extras = validate(data_list)


# In the case of cleaning csv data you have several ways to normalise / clean
# the input
def clean_header(header):
    """
    A function that takes a column name header and normalises / cleans it into
    something we'll use as the name of a tag
    """
    # First drop leading/trailing whitespace, then substitute the characters
    # that are awkward in tag names: spaces -> underscores, slashes -> dashes.
    trimmed = header.strip()
    underscored = trimmed.replace(' ', '_')
    return underscored.replace('/', '-')


def clean_row_item(item):
Example #6
0
 def test_validate(self):
     """
     Exercise validate(): clean data, a missing field, an extra field, and
     mismatches inside nested sub-dictionaries.
     """
     # Both records match the template taken from the first one.
     data = [
         dict(foo='a', bar=dict(baz='b'), bof='c'),
         dict(foo='x', bar=dict(baz='y'), bof='z'),
     ]
     missing, extras = validate(data)
     self.assertEqual([], missing)  # clean
     self.assertEqual([], extras)  # clean
     # Dropping 'bof' from the second record produces one missing report.
     data = [
         dict(foo='a', bar=dict(baz='b'), bof='c'),
         dict(foo='x', bar=dict(baz='y')),
     ]
     missing, extras = validate(data)
     self.assertEqual([], extras)  # clean
     self.assertEqual(1, len(missing))
     self.assertTrue("Field 'bof'" in missing[0])
     # Adding an unknown 'qux' field produces one extras report.
     data = [
         dict(foo='a', bar=dict(baz='b'), bof='c'),
         dict(foo='x', bar=dict(baz='y'), bof='z', qux='quux'),
     ]
     missing, extras = validate(data)
     self.assertEqual([], missing)  # clean
     self.assertEqual(1, len(extras))
     self.assertTrue("Field 'qux' in record" in extras[0])
     # Nested dictionaries are checked as well: swapping 'baz' for 'quux'
     # inside 'bar' is reported as both a missing and an extra field.
     data = [
         dict(foo='a', bar=dict(baz='b'), bof='c'),
         dict(foo='x', bar=dict(quux='bif'), bof='z'),
     ]
     missing, extras = validate(data)
     self.assertEqual(1, len(extras))
     self.assertTrue("Field 'quux' in record" in extras[0])
     self.assertEqual(1, len(missing))
     self.assertTrue("Field 'baz' in record" in missing[0])