Example #1
 def test_escape_dots(self):
     data = CSVparser.to_dict(current_dir +
                              '/mockups/csv/test_dot_escaping.csv')
     self.assertEqual(data, [{
         "Data$Column2": "data2",
         "Data\uff0eColumn1": "data1"
     }])
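The expected keys above show the escaping rule these tests rely on: '.' is
presumably replaced because it is reserved in MongoDB field names, and the
parser substitutes U+FF0E (FULLWIDTH FULL STOP) while leaving '$' alone. A
minimal sketch of that rule, with escape_key as a hypothetical helper rather
than CSVparser's actual code:

    def escape_key(key):
        # Assumption: '.' in a CSV header becomes U+FF0E so the key is a
        # legal MongoDB field name; '$' is kept as-is.
        return key.replace('.', '\uff0e')

    assert escape_key('Data.Column1') == 'Data\uff0eColumn1'
    assert escape_key('Data$Column2') == 'Data$Column2'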
Example #2
 def test_latin_1(self):
     data = CSVparser.to_dict(current_dir + '/mockups/csv/test_unicode.csv')
     self.assertEqual(data, [{
         "column1": "data1",
         "column2": "data2"
     }, {
         "column1": "data3",
         "column2": "Siberiƫ"
     }])
Example #3
 def test_numerical(self):
     data = CSVparser.to_dict(current_dir + '/mockups/csv/test_numbers.csv')
     self.assertEqual(data, [{
         "column1": "data1",
         "num": 1
     }, {
         "column1": "data3",
         "num": 2
     }])
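Note that "num" comes back as the integer 1 rather than the string "1", so
the parser evidently coerces numeric-looking cells. A rough sketch of such a
coercion step (coerce is a hypothetical helper, not CSVparser's actual code):

    def coerce(cell):
        # Assumption: integer-looking cells become int, decimal-looking
        # cells become float, and everything else stays a string.
        try:
            return int(cell)
        except ValueError:
            try:
                return float(cell)
            except ValueError:
                return cell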
Example #4
 def test_simple(self):
     data = CSVparser.to_dict(current_dir + '/mockups/csv/test_simple.csv')
     self.assertEqual(data, [{
         "column1": "data1",
         "column2": "data2"
     }, {
         "column1": "data3",
         "column2": "data4"
     }])
Example #5
 def test_special_char_escaping(self):
     data = CSVparser.to_dict(current_dir + '/mockups/schema/specialCharacterTest/test.csv')
     schema = SchemaGenerator.generate_schema(data)
     self.assertDictEqual(deep_sort(schema), deep_sort({
         'type': 'array',
         'items': {
             'type': 'object',
             'properties': {
                 'Data\uff0eColumn1': {'type': 'string'},
                 'Data$Column2': {'type': 'string'}
             },
             'required': ['Data\uff0eColumn1', 'Data$Column2']
         }
     }))
Example #6
 def test_schema_generator(self):
     data = CSVparser.to_dict(current_dir + '/mockups/csv/test_simple.csv')
     schema = SchemaGenerator.generate_schema(data)
     self.assertEqual(schema, {
         'type': 'array',
         'items': {
             'type': 'object',
             'properties': {
                 'column1': {'type': 'string'},
                 'column2': {'type': 'string'}
             },
             'required': ['column1', 'column2']
         }
     })
Example #7
 def test_numpy_float_error(self):
     data = CSVparser.to_dict(current_dir + '/mockups/schema/numpy-float64/float64.csv')
     schema = SchemaGenerator.generate_schema(data)
     self.assertEqual(schema, {
         'type': 'array',
         'items': {
             'type': 'object',
             'properties': {
                 'id': {'type': 'number'},
                 'put': {'type': 'number'}
             },
             'required': ['id', 'put']
         }
     })
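The test name points at a regression where numpy.float64 values leaked out of
the parser. Since numpy.float64 subclasses Python's float, a check based on
numbers.Number maps it to 'number' like any plain float. A minimal sketch of
the schema shape these tests imply (sketch_schema is hypothetical, not the
real SchemaGenerator):

    import numbers

    def sketch_schema(rows):
        # Assumption: column types are inferred from the first row, and
        # every non-numeric value is treated as a string.
        def json_type(value):
            return 'number' if isinstance(value, numbers.Number) else 'string'
        properties = {k: {'type': json_type(v)} for k, v in rows[0].items()}
        return {
            'type': 'array',
            'items': {
                'type': 'object',
                'properties': properties,
                'required': sorted(properties),
            },
        }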
Example #8
import logging

from pymongo.errors import DuplicateKeyError

# CSVparser, MIFparser, SchemaGenerator, FileStatter, DirLister, ProgressBar,
# init_logging, init_mongodb and config are project-local helpers assumed to
# be in scope here.
def run(file_path):
    # Init logging and database
    init_logging()
    client, file_col, schema_col, source_data_col = init_mongodb(config)

    # Set up counters and file index
    successfully_ingested_files = 0
    file_counter = 0
    file_list = DirLister.get_file_list_recursive(file_path)

    logging.info('Processing %d files from %s' % (len(file_list), file_path))

    for file in file_list:
        file_counter += 1
        ProgressBar.update_progress(file_counter / len(file_list),
                                    'Processing file %s' % file)

        # Get the file stats
        document = {
            'stats': FileStatter.stats(file),
            'filePath': file,
            '_id': file,
            'hash': FileStatter.sha1_from_file(file)
        }

        # Load the data or skip if unable
        if file.lower().endswith('.mif'):
            try:
                data = MIFparser.to_dict(file)
            except ValueError as e:
                logging.error(e)
                # If the data can't be loaded, log the error and skip the file
                continue
        elif file.lower().endswith('.mid'):
            logging.debug('Skipping .mid file.')
            continue  # .mid files are processed along with their parent .mif file
        else:
            try:
                data = CSVparser.to_dict(file)
            except ValueError as e:
                logging.error('CSV parsing error on file %s: %s' % (file, e))
                # If the data can't be loaded, log the error and skip the file
                continue

        # Generate the schema and try to ingest it
        try:
            schema_data = SchemaGenerator.generate_schema(data)
        except Exception as e:
            logging.error('Schema error on file %s: %s' % (file, e))
            continue

        schema_hash = FileStatter.sha1(schema_data)
        schema = {
            '_id': schema_hash,
            'schema': schema_data,
        }

        try:
            schema_col.insert_one(schema)
        except DuplicateKeyError:
            logging.debug('Schema %s was previously processed' % schema_hash)
        except Exception as e:
            logging.error('Ingest schema error on file %s: %s' % (file, e))
            # If the schema can't be ingested, log the error and skip the file
            continue

        # Store the source data
        source_data_doc_sha1 = FileStatter.sha1(data)
        source_data_doc = {'_id': source_data_doc_sha1, 'data': data}

        try:
            source_data_col.insert_one(document=source_data_doc)
        except DuplicateKeyError:
            logging.debug('Sourcedata with sha1 %s was previously processed' %
                          source_data_doc_sha1)
        except Exception as e:
            logging.error('Ingest source data error on file %s: %s' %
                          (file, e))
            continue

        # Finalize the file document with the data reference and the schema reference
        document['data'] = source_data_doc_sha1
        document['schema'] = schema['_id']

        try:
            file_col.insert_one(document=document)
        except DuplicateKeyError:
            logging.warning('File %s was previously processed, skipping' %
                            file)
            # Skip to next file
            continue
        except Exception as e:
            logging.error('Ingest file metadata error on file %s: %s' %
                          (file, e))
            continue

        logging.debug('File %s was successfully ingested' % file)
        successfully_ingested_files += 1

    logging.info('Finished!')
    logging.info('Successfully ingested %d files of %d' %
                 (successfully_ingested_files, len(file_list)))
    client.close()
Example #9
 def test_data_hasher_from_schema_dict(self):
     dictionary = CSVparser.to_dict(current_dir + '/mockups/schema/caseInsensitiveTest/test.csv')
     sha1 = FileStatter.sha1(SchemaGenerator.generate_schema(dictionary))
     self.assertEqual(sha1, 'a59a9b5c48657c3828c4c308cd057997aa7927fb')
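For that digest to be stable across runs, FileStatter.sha1 presumably
serializes the dict deterministically before hashing. A plausible sketch of
such a helper (an assumption about FileStatter's internals, not its actual
code):

    import hashlib
    import json

    def sha1(obj):
        # Assumption: sort_keys yields a canonical JSON serialization, so
        # equal dicts always produce the same digest.
        payload = json.dumps(obj, sort_keys=True).encode('utf-8')
        return hashlib.sha1(payload).hexdigest()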
Example #10
 def test_empty_csv(self):
     with self.assertRaisesRegex(ValueError, 'empty or invalid'):
         CSVparser.to_dict(current_dir + '/mockups/csv/empty.csv')
Example #11
 def test_no_csv(self):
     with self.assertRaisesRegex(ValueError, 'invalid csv'):
         CSVparser.to_dict(current_dir + '/mockups/csv/test_no_csv.txt')
Example #12
 def test_non_unicode(self):
     data = CSVparser.to_dict(current_dir +
                              '/mockups/csv/test_non_unicode.csv')
     self.assertEqual(data[0]['OMSCHRIJF'], 'mal voor reliëf')