def test_invalid_zip_file(postgres, db_conn, tmpdir, mocked_config, monkeypatch):
    """Test that invalid zip files are properly handled."""
    # Create an empty file with a .zip extension whose contents are not a valid zip archive
    valid_csv_file_path = str(tmpdir.join('operator1_with_rat_info_20160701_20160731.csv'))
    invalid_file_zip_path = valid_csv_file_path[:-3] + 'zip'
    with open(valid_csv_file_path, 'w'):
        pass
    os.rename(valid_csv_file_path, invalid_file_zip_path)

    catalog_config_dict = {
        'prospectors': [
            {
                'file_type': 'operator',
                'paths': [str(tmpdir)],
                'schema_filename': 'OperatorImportSchema.csvs'
            },
            {
                'file_type': 'operator',
                'paths': [invalid_file_zip_path],
                'schema_filename': 'OperatorImportSchema_v2.csvs'
            }
        ],
        'perform_prevalidation': True
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)

    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli, obj={'APP_CONFIG': mocked_config}, catch_exceptions=False)
    assert result.exit_code == 0

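# Note: the renamed file above is an "invalid zip" because its bytes are not a zip archive,
# regardless of its .zip extension. For reference, the standard library can detect the same
# condition directly (illustrative only; the catalog's own validation logic may differ):
#
#     assert not zipfile.is_zipfile(invalid_file_zip_path)
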
def test_non_unique_path_failure(mocked_config, logger, tmpdir, monkeypatch):
    """Test that config parsing fails if the same path is defined two or more times."""
    catalog_config_dict = {
        'prospectors': [
            {
                'file_type': 'operator',
                'paths': [str(tmpdir.join('operator.zip'))],
                'schema_filename': 'OperatorImportSchema_v2.csvs'
            },
            {
                'file_type': 'operator',
                'paths': [
                    str(tmpdir.join('operator.zip')),
                    str(tmpdir.join('operator1.zip'))
                ],
                'schema_filename': 'OperatorImportSchema_v2.csvs'
            }
        ],
        'perform_prevalidation': False
    }

    # 'operator.zip' appears in two prospectors, so config parsing must fail
    with pytest.raises(Exception) as ex:
        CatalogConfig(ignore_env=True, **catalog_config_dict)
    assert 'The paths specified in the catalog config are not globally unique' in str(ex.value)

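# The uniqueness rule exercised above is equivalent to the following minimal sketch. This is a
# hypothetical reimplementation for illustration only; CatalogConfig performs its own validation
# internally when parsing the config.
def _paths_are_globally_unique(prospectors):
    """Return True if no path appears more than once across all prospectors."""
    all_paths = [p for prospector in prospectors for p in prospector['paths']]
    return len(all_paths) == len(set(all_paths))
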
def test_perform_prevalidation_option(postgres, db_conn, tmpdir, monkeypatch, mocked_config):
    """Test that pre-validation is not performed if the option is turned off in the config."""
    files_to_zip = ['unittest_data/operator/operator1_with_rat_info_20160701_20160731.csv']
    zip_files_to_tmpdir(files_to_zip, tmpdir)

    catalog_config_dict = {
        'prospectors': [
            {
                'file_type': 'operator',
                'paths': [str(tmpdir.join('operator1_with_rat_info_20160701_20160731.zip'))],
                'schema_filename': 'OperatorImportSchema_v2.csvs'
            }
        ],
        'perform_prevalidation': False
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)

    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli, obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0

    # This test checks that when pre-validation is disabled, it is skipped during cataloging.
    # The is_valid_format field will be NULL in that scenario, as tested below. The scenario with
    # pre-validation enabled is implicitly tested in the test_all_files_are_harvested test case.
    with db_conn.cursor() as cursor:
        cursor.execute('SELECT is_valid_format FROM data_catalog WHERE filename = '
                       '\'operator1_with_rat_info_20160701_20160731.zip\'')
        assert cursor.fetchone().is_valid_format is None

def test_non_zip_files_are_not_harvested(postgres, db_conn, tmpdir, mocker, mocked_config, monkeypatch):
    """Test non-zip files are not cataloged."""
    catalog_config_dict = {
        'prospectors': [
            {
                'file_type': 'operator',
                'paths': [str(tmpdir)],
                'schema_filename': 'OperatorImportSchema.csvs'
            }
        ],
        'perform_prevalidation': False
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)

    # Mock os.listdir call to return the unzipped test file
    mocker.patch.object(os, 'listdir', new_callable=mocker.MagicMock(
        return_value=[os.path.abspath(os.path.dirname(__file__)),
                      'unittest_data/operator/operator1_with_rat_info_20160701_20160731.csv']))

    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli, obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0

    with db_conn.cursor() as cursor:
        cursor.execute('SELECT COUNT(*) FROM data_catalog WHERE filename = '
                       '\'operator1_with_rat_info_20160701_20160731.csv\'')
        assert cursor.fetchone()[0] == 0

def test_file_specified_explicitly_is_cataloged_correctly(postgres, db_conn, tmpdir, mocked_config,
                                                          monkeypatch):
    """Test that if a file is specified explicitly, it is pre-validated using the correct schema."""
    files_to_zip = ['unittest_data/operator/operator1_with_rat_info_20160701_20160731.csv']
    zip_files_to_tmpdir(files_to_zip, tmpdir)

    catalog_config_dict = {
        'prospectors': [
            {
                'file_type': 'operator',
                'paths': [str(tmpdir)],
                'schema_filename': 'OperatorImportSchema.csvs'
            },
            {
                'file_type': 'operator',
                'paths': [str(tmpdir.join('operator1_with_rat_info_20160701_20160731.zip'))],
                'schema_filename': 'OperatorImportSchema_v2.csvs'
            }
        ],
        'perform_prevalidation': True
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)

    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli, obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0

    # The explicitly-specified file should be pre-validated against the v2 schema,
    # not the schema of the directory prospector that also covers it
    with db_conn.cursor() as cursor:
        cursor.execute('SELECT is_valid_format FROM data_catalog WHERE filename = '
                       '\'operator1_with_rat_info_20160701_20160731.zip\'')
        assert cursor.fetchone().is_valid_format

def test_num_records_uncompressed_size(mocked_config, tmpdir, monkeypatch, flask_app, api_version):
    """Test import status info in catalog api.

    - num_records: the number of lines in the file minus the header.
    - uncompressed_size_bytes.
    """
    here = path.abspath(path.dirname(__file__))
    data_dir = path.join(here, 'unittest_data/operator')
    valid_csv_operator_data_file_name = 'operator1_20160701_20160731.csv'
    valid_csv_operator_data_file = path.join(data_dir, valid_csv_operator_data_file_name)

    # Create a zip file inside a temp dir
    valid_zip_operator_data_file_path = str(tmpdir.join('operator1_20160701_20160731.zip'))
    with zipfile.ZipFile(valid_zip_operator_data_file_path, 'w',
                         compression=zipfile.ZIP_DEFLATED) as valid_csv_operator_data_file_zfile:
        # zipfile write() method supports an extra argument (arcname) which is the
        # archive name to be stored in the zip file
        valid_csv_operator_data_file_zfile.write(valid_csv_operator_data_file,
                                                 valid_csv_operator_data_file_name)

    catalog_config_dict = {
        'prospectors': [
            {
                'file_type': 'operator',
                'paths': [valid_zip_operator_data_file_path],
                'schema_filename': 'OperatorImportSchema_v2.csvs'
            }
        ],
        'perform_prevalidation': False
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)

    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli, obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0

    # Call APIs
    if api_version == 'v1':
        rv = flask_app.get(url_for('{0}.catalog_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))
        assert data[0]['filename'] == 'operator1_20160701_20160731.zip'
        assert data[0]['num_records'] == 20
        assert data[0]['uncompressed_size_bytes'] == 1066
        assert data[0]['compressed_size_bytes'] == 400
    else:  # api version 2.0
        rv = flask_app.get(url_for('{0}.catalog_get_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))['files']
        assert data[0]['filename'] == 'operator1_20160701_20160731.zip'
        assert data[0]['num_records'] == 20
        assert data[0]['uncompressed_size_bytes'] == 1066
        assert data[0]['compressed_size_bytes'] == 400

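# For reference, the figures asserted above can be reproduced with the standard library alone:
# num_records is the line count of the archived CSV minus the header row, and the two size
# fields map to zipfile.ZipInfo.file_size and compress_size. This is an illustrative sketch,
# not the catalog's actual implementation.
def _zip_catalog_stats(zip_path, member_name):
    """Return (num_records, uncompressed_size_bytes, compressed_size_bytes) for one zip member."""
    with zipfile.ZipFile(zip_path) as zf:
        info = zf.getinfo(member_name)
        with zf.open(member_name) as member:
            num_lines = sum(1 for _ in member)  # iterating a zip member yields its lines
    return num_lines - 1, info.file_size, info.compress_size
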
def test_catalog(per_test_postgres, tmpdir, db_user, mocked_config, monkeypatch):
    """Test that dirbs-catalog works with the security role created based on the abstract role."""
    files_to_zip = ['unittest_data/operator/operator1_with_rat_info_20160701_20160731.csv']
    zip_files_to_tmpdir(files_to_zip, tmpdir)

    catalog_config_dict = {
        'prospectors': [
            {
                'file_type': 'operator',
                'paths': [str(tmpdir.join('operator1_with_rat_info_20160701_20160731.zip'))],
                'schema_filename': 'OperatorImportSchema_v2.csvs'
            }
        ],
        'perform_prevalidation': False
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)

    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    monkeypatch.setattr(mocked_config.db_config, 'user', db_user)
    result = runner.invoke(dirbs_catalog_cli, obj={'APP_CONFIG': mocked_config})

    # Only roles with catalog privileges may run the command successfully
    if db_user in ['dirbs_poweruser_login', 'dirbs_catalog_user']:
        assert result.exit_code == 0
    else:
        assert result.exit_code != 0

def test_import_status(db_conn, mocked_config, tmpdir, monkeypatch, flask_app, api_version, logger,
                       mocked_statsd, metadata_db_conn):
    """Test import status info in catalog api.

    - import_status:
        - ever_imported_successfully: true or false
        - most_recent_import: status

    Generate an MD5 hash of the file during import and store it in job_metadata. Then, when
    cataloging, look at the most recent import job in the job_metadata table where the file had
    the same MD5 hash and look up the status.
    ever_imported_successfully will be true if there is any successful import (joining on the
    file's MD5); most_recent_import returns the status of the most recent import (joining on
    the file's MD5).
    """
    # Step 1
    # Try to import something successfully to get most_recent_import = success
    # and test the md5 created in the abstract importer using the dirbs-import cli command
    here = path.abspath(path.dirname(__file__))
    data_dir = path.join(here, 'unittest_data/operator')
    valid_csv_operator_data_file_name = 'operator1_20160701_20160731.csv'
    valid_csv_operator_data_file = path.join(data_dir, valid_csv_operator_data_file_name)

    # Create a zip file inside a temp dir
    valid_zip_operator_data_file_path = str(tmpdir.join('operator1_20160701_20160731.zip'))
    with zipfile.ZipFile(valid_zip_operator_data_file_path, 'w') as valid_csv_operator_data_file_zfile:
        # zipfile write() method supports an extra argument (arcname) which is the
        # archive name to be stored in the zip file
        valid_csv_operator_data_file_zfile.write(valid_csv_operator_data_file,
                                                 valid_csv_operator_data_file_name)

    runner = CliRunner()
    result = runner.invoke(dirbs_import_cli, ['operator', 'Operator1', '--disable-rat-import',
                                              '--disable-region-check', '--disable-home-check',
                                              valid_zip_operator_data_file_path],
                           obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0

    catalog_config_dict = {
        'prospectors': [
            {
                'file_type': 'operator',
                'paths': [valid_zip_operator_data_file_path],
                'schema_filename': 'OperatorImportSchema_v2.csvs'
            }
        ],
        'perform_prevalidation': False
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)

    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli, obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0

    # Call APIs
    if api_version == 'v1':
        rv = flask_app.get(url_for('{0}.catalog_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))
        assert data[0]['import_status']['most_recent_import'] == 'success'
        assert data[0]['import_status']['ever_imported_successfully'] is True

        with db_conn.cursor() as cursor:
            cursor.execute('SELECT md5 FROM data_catalog')
            md5 = cursor.fetchone().md5

        # Step 2
        with db_conn.cursor() as cursor:
            cursor.execute('TRUNCATE TABLE job_metadata')

        # status error
        job_metadata_importer(db_conn=db_conn, command='dirbs-import', run_id=10, subcommand='operator',
                              status='error', start_time='2017-08-15 01:15:39.54785+00',
                              extra_metadata={'input_file_md5': md5})
        # status in progress, most recent
        job_metadata_importer(db_conn=db_conn, command='dirbs-import', run_id=11, subcommand='operator',
                              status='running', start_time='2017-08-15 01:15:40.54785+00',
                              extra_metadata={'input_file_md5': md5})

        # Call API
        rv = flask_app.get(url_for('{0}.catalog_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))
        assert data[0]['import_status']['most_recent_import'] == 'running'
        assert data[0]['import_status']['ever_imported_successfully'] is False
        assert len(data) == 1

        # Step 3: try a different order
        with db_conn.cursor() as cursor:
            cursor.execute('TRUNCATE TABLE job_metadata')

        # status success
        job_metadata_importer(db_conn=db_conn, command='dirbs-import', run_id=13, subcommand='gsma',
                              status='success', start_time='2017-08-15 01:15:39.54785+00',
                              extra_metadata={'input_file_md5': md5})
        # status error, most recent
        job_metadata_importer(db_conn=db_conn, command='dirbs-import', run_id=14, subcommand='gsma',
                              status='error', start_time='2017-08-15 01:15:40.54785+00',
                              extra_metadata={'input_file_md5': md5})

        # Call API
        rv = flask_app.get(url_for('{0}.catalog_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))
        assert data[0]['import_status']['most_recent_import'] == 'error'
        assert data[0]['import_status']['ever_imported_successfully'] is True
        assert len(data) == 1
    else:  # api version 2.0
        rv = flask_app.get(url_for('{0}.catalog_get_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))['files']
        assert data[0]['import_status']['most_recent_import'] == 'success'
        assert data[0]['import_status']['ever_imported_successfully'] is True

        with db_conn.cursor() as cursor:
            cursor.execute('SELECT md5 FROM data_catalog')
            md5 = cursor.fetchone().md5

        # Step 2
        with db_conn.cursor() as cursor:
            cursor.execute('TRUNCATE TABLE job_metadata')

        # status error
        job_metadata_importer(db_conn=db_conn, command='dirbs-import', run_id=10, subcommand='operator',
                              status='error', start_time='2017-08-15 01:15:39.54785+00',
                              extra_metadata={'input_file_md5': md5})
        # status in progress, most recent
        job_metadata_importer(db_conn=db_conn, command='dirbs-import', run_id=11, subcommand='operator',
                              status='running', start_time='2017-08-15 01:15:40.54785+00',
                              extra_metadata={'input_file_md5': md5})

        # Call API
        rv = flask_app.get(url_for('{0}.catalog_get_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))['files']
        assert data[0]['import_status']['most_recent_import'] == 'running'
        assert data[0]['import_status']['ever_imported_successfully'] is False
        assert len(data) == 1

        # Step 3: try a different order
        with db_conn.cursor() as cursor:
            cursor.execute('TRUNCATE TABLE job_metadata')

        # status success
        job_metadata_importer(db_conn=db_conn, command='dirbs-import', run_id=13, subcommand='gsma',
                              status='success', start_time='2017-08-15 01:15:39.54785+00',
                              extra_metadata={'input_file_md5': md5})
        # status error, most recent
        job_metadata_importer(db_conn=db_conn, command='dirbs-import', run_id=14, subcommand='gsma',
                              status='error', start_time='2017-08-15 01:15:40.54785+00',
                              extra_metadata={'input_file_md5': md5})

        # Call API
        rv = flask_app.get(url_for('{0}.catalog_get_api'.format(api_version)))
        assert rv.status_code == 200
        data = json.loads(rv.data.decode('utf-8'))['files']
        assert data[0]['import_status']['most_recent_import'] == 'error'
        assert data[0]['import_status']['ever_imported_successfully'] is True
        assert len(data) == 1

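# The md5 column used above to join data_catalog against job_metadata is a content hash of the
# input file. A minimal sketch of how such a digest can be computed with the standard library
# (the abstract importer's real implementation may differ, e.g. in chunk size):
def _file_md5(file_path, chunk_size=8192):
    """Return the hex MD5 digest of a file, read in chunks to bound memory use."""
    import hashlib
    md5 = hashlib.md5()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()
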
def test_all_files_are_harvested(postgres, db_conn, tmpdir, logger, monkeypatch, mocked_config):
    """Test all input files are correctly harvested and cataloged."""
    files_to_zip = ['unittest_data/operator/operator1_with_rat_info_20160701_20160731.csv',
                    'unittest_data/gsma/sample_gsma_import_list_anonymized.txt',
                    'unittest_data/stolen_list/sample_stolen_list.csv',
                    'unittest_data/registration_list/sample_registration_list.csv',
                    'unittest_data/pairing_list/sample_pairinglist.csv',
                    'unittest_data/golden_list/sample_golden_list.csv']
    zip_files_to_tmpdir(files_to_zip, tmpdir)

    catalog_config_dict = {
        'prospectors': [
            {
                'file_type': 'operator',
                'paths': [str(tmpdir.join('operator1_with_rat_info_20160701_20160731.zip'))],
                'schema_filename': 'OperatorImportSchema_v2.csvs'
            },
            {
                'file_type': 'gsma_tac',
                'paths': [str(tmpdir.join('sample_gsma_import_list_anonymized.zip'))],
                'schema_filename': 'GSMASchema.csvs'
            },
            {
                'file_type': 'stolen_list',
                'paths': [str(tmpdir.join('sample_stolen_list.zip'))],
                'schema_filename': 'StolenListSchema.csvs'
            },
            {
                'file_type': 'pairing_list',
                'paths': [str(tmpdir.join('sample_pairinglist.zip'))],
                'schema_filename': 'PairingListSchema.csvs'
            },
            {
                'file_type': 'registration_list',
                'paths': [str(tmpdir.join('sample_registration_list.zip'))],
                'schema_filename': 'RegistrationListSchema.csvs'
            },
            {
                'file_type': 'golden_list',
                'paths': [str(tmpdir.join('sample_golden_list.zip'))],
                'schema_filename': 'GoldenListSchemaData.csvs'
            }
        ],
        'perform_prevalidation': True
    }

    catalog_config = CatalogConfig(ignore_env=True, **catalog_config_dict)
    monkeypatch.setattr(mocked_config, 'catalog_config', catalog_config)

    # Run dirbs-catalog using db args from the temp postgres instance
    runner = CliRunner()
    result = runner.invoke(dirbs_catalog_cli, obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0

    with db_conn.cursor() as cursor:
        cursor.execute('SELECT * FROM data_catalog')
        res = [(res.filename, res.file_type, res.compressed_size_bytes, res.is_valid_zip,
                res.is_valid_format, res.extra_attributes) for res in cursor.fetchall()]
    assert ('operator1_with_rat_info_20160701_20160731.zip', 'operator', 797, True, True,
            {'filename_check': True}) in res
    assert ('sample_gsma_import_list_anonymized.zip', 'gsma_tac', 1083, True, True, {}) in res
    assert ('sample_stolen_list.zip', 'stolen_list', 529, True, True, {}) in res
    assert ('sample_registration_list.zip', 'registration_list', 858, True, True, {}) in res
    assert ('sample_pairinglist.zip', 'pairing_list', 312, True, True, {}) in res
    assert ('sample_golden_list.zip', 'golden_list', 474, True, True, {}) in res

    # Run dirbs-catalog again to verify that no new files are discovered
    result = runner.invoke(dirbs_catalog_cli, obj={'APP_CONFIG': mocked_config})
    assert result.exit_code == 0
    assert 'Data catalog is already up-to-date!' in logger_stream_contents(logger)