def test_database_store():
    setup_main_database()
    with tempfile.TemporaryDirectory() as tmpdir:
        metadata_db = MetadataDB(tmpdir)
        metadata_db.create_session_metadata("Test", True, "http://www.test.com",
                                            "2018-01-01-10-00-00")

        # test this store
        assert not database.is_store_done("test_source", "2018-01-01-10-00-00", True)

        # start it!
        store_id = database.start_store("test_source", "2018-01-01-10-00-00",
                                        True, metadata_db)

        # test this store
        assert database.get_id_of_store("test_source", "2018-01-01-10-00-00",
                                        True) == store_id

        # end it!
        database.end_store(store_id)

        # test this store
        assert database.is_store_done("test_source", "2018-01-01-10-00-00", True)

        # test other stores aren't marked as done
        assert not database.is_store_done("test_source", "2018-01-01-10-00-00", False)  # not a sample
        assert not database.is_store_done("test_source", "2027-01-01-10-00-00", True)  # different version
        assert not database.is_store_done("a_different_source", "2018-01-01-10-00-00", True)  # different source
def test_basic():
    with tempfile.TemporaryDirectory() as tmpdir:
        fetcher = Basic(tmpdir)

        metadata_file = join(tmpdir, 'test', 'v1', 'metadb.sqlite3')
        assert exists(metadata_file)

        metadata_db = MetadataDB(join(tmpdir, 'test', 'v1'))
        data = metadata_db.get_dict()
        assert data['publisher_name'] == 'test'
        assert data['base_url'] == 'test_url'
        assert data['session_start_datetime']
        del data, metadata_db

        fetcher.run_gather()

        metadata_db = MetadataDB(join(tmpdir, 'test', 'v1'))
        data = metadata_db.get_dict()
        assert data['gather_start_datetime']
        assert data['gather_finished_datetime']
        assert data['gather_success']
        del data, metadata_db

        fetcher.run_fetch()

        assert exists(join(tmpdir, 'test', 'v1', 'file1.json'))
        metadata_db = MetadataDB(join(tmpdir, 'test', 'v1'))
        data = metadata_db.get_dict()
        assert data['file_status']
        assert data['fetch_success']
        assert data['fetch_start_datetime']
        assert data['fetch_finished_datetime']
        del data, metadata_db

        setup_main_database()
        fetcher.run_store()
def test_bad_url():
    with tempfile.TemporaryDirectory() as tmpdir:
        fetcher = BadUrls(tmpdir)
        fetcher.run_gather()
        fetcher.run_fetch()

        metadata_db = MetadataDB(join(tmpdir, 'test', 'v1'))
        data = metadata_db.get_dict()
        assert not data['fetch_success']
        for value in data['file_status'].values():
            assert not value['fetch_success']
            assert value['fetch_errors']
def test_exception_gather():
    with tempfile.TemporaryDirectory() as tmpdir:
        fetcher = ExceptionGather(tmpdir)
        fetcher.run_gather()

        metadata_db = MetadataDB(join(tmpdir, 'test', 'v1'))
        data = metadata_db.get_dict()
        assert not data['gather_success']
        assert data['gather_errors'] == 'IndexError()'
        assert data['gather_finished_datetime']

        with pytest.raises(Exception):
            fetcher.run_fetch()
def test_bad_fetch_exception():
    with tempfile.TemporaryDirectory() as tmpdir:
        fetcher = BadFetchException(tmpdir)
        fetcher.run_gather()
        fetcher.run_fetch()

        metadata_db = MetadataDB(join(tmpdir, 'test', 'v1'))
        data = metadata_db.get_dict()
        assert data['gather_success']
        assert data['gather_finished_datetime']
        assert not data['fetch_success']
        assert not data['file_status']['file1.json']['fetch_success']
        assert data['file_status']['file1.json']['fetch_start_datetime']
        assert data['file_status']['file1.json']['fetch_finished_datetime']
        assert data['file_status']['file1.json']['fetch_errors'] == [
            "Exception('Whoops',)"
        ]

        with pytest.raises(Exception):
            fetcher.run_store()
def test_checks_records_error():
    setup_main_database()
    with tempfile.TemporaryDirectory() as tmpdir:
        metadata_db = MetadataDB(tmpdir)
        metadata_db.create_session_metadata("Test", True, "http://www.test.com",
                                            "2018-01-01-10-00-00")
        metadata_db.add_filestatus({
            'filename': 'test1.json',
            'url': 'http://www.test.com',
            'data_type': 'record_package'
        })

        # store details
        source_session_id = database.start_store("test_source",
                                                 "2018-01-01-10-00-00", True,
                                                 metadata_db)
        for data in metadata_db.list_filestatus():
            with database.add_file(source_session_id, data) as database_file:
                database_file.insert_record({'record': 'totally'}, {
                    'version': '0.1-does-not-exist',
                    'extensions': []
                })
        database.end_store(source_session_id)

        # Don't like hard-coding the ID here; it relies on the DB assigning 1
        # to this new row, but I think we can assume that.
        record_id = 1

        # Test
        assert not database.is_record_check_done(record_id)
        assert not database.is_check_done(source_session_id)

        # check!
        for data in metadata_db.list_filestatus():
            checks.check_file(source_session_id, data)

        # Test
        assert database.is_record_check_done(record_id)
        assert database.is_check_done(source_session_id)

        with database.engine.begin() as connection:
            s = sa.sql.select([database.record_check_error_table])
            result = connection.execute(s)
            data = result.fetchone()
            assert 'The schema version in your data is not valid. Accepted values:' in data['error']
def test_checks_releases():
    setup_main_database()
    with tempfile.TemporaryDirectory() as tmpdir:
        metadata_db = MetadataDB(tmpdir)
        metadata_db.create_session_metadata("Test", True, "http://www.test.com",
                                            "2018-01-01-10-00-00")
        metadata_db.add_filestatus({
            'filename': 'test1.json',
            'url': 'http://www.test.com',
            'data_type': 'release_package'
        })

        # store details
        source_session_id = database.start_store("test_source",
                                                 "2018-01-01-10-00-00", True,
                                                 metadata_db)
        for data in metadata_db.list_filestatus():
            with database.add_file(source_session_id, data) as database_file:
                database_file.insert_release({'release': 'totally'},
                                             {'extensions': []})
        database.end_store(source_session_id)

        # Don't like hard-coding the ID here; it relies on the DB assigning 1
        # to this new row, but I think we can assume that.
        release_id = 1

        # Test
        assert not database.is_release_check_done(release_id)

        # check!
        for data in metadata_db.list_filestatus():
            checks.check_file(source_session_id, data)

        # Test
        assert database.is_release_check_done(release_id)

        with database.engine.begin() as connection:
            s = sa.sql.select([database.release_check_table])
            result = connection.execute(s)
            data = result.fetchone()
            assert data['cove_output']['file_type'] == 'json'
            assert len(data['cove_output']['validation_errors']) > 0
def test_database_store_file():
    setup_main_database()
    with tempfile.TemporaryDirectory() as tmpdir:
        metadata_db = MetadataDB(tmpdir)
        metadata_db.create_session_metadata("Test", True, "http://www.test.com",
                                            "2018-01-01-10-00-00")
        metadata_db.add_filestatus({
            'filename': 'test1.json',
            'url': 'http://www.test.com',
            'data_type': 'record_package'
        })

        # start it!
        store_id = database.start_store("test_source", "2018-01-01-10-00-00",
                                        True, metadata_db)

        # test
        file_id = database.get_id_of_store_file(store_id,
                                                {'filename': 'test1.json'})
        assert file_id == 1
class SourceStatus:

    def __init__(self, base_dir, source_id, output_directory=None,
                 sample=False, data_version=None):
        self.base_dir = base_dir
        self.source_id = source_id
        self.sample = sample

        # Make sure the output directory is fully specified, including the
        # sample suffix (if applicable).
        self.output_directory = output_directory or source_id
        if not self.output_directory:
            raise AttributeError('An output directory needs to be specified')
        if self.sample and not self.output_directory.endswith('_sample'):
            self.output_directory += '_sample'

        # Load all versions if possible and pick an existing one.
        all_versions = sorted(os.listdir(os.path.join(base_dir, self.output_directory)), reverse=True) \
            if os.path.exists(os.path.join(base_dir, self.output_directory)) else []

        if data_version and data_version in all_versions:
            # The version specified is valid.
            self.data_version = data_version
        elif data_version:
            # The version specified is invalid!
            raise AttributeError('A version was specified that does not exist')
        elif len(all_versions) > 0:
            # Resume the latest version.
            self.data_version = all_versions[0]
        else:
            # Should not happen...
            raise AttributeError(
                'The source and/or version is unavailable in the output directory')

        # Build the full directory and make sure it exists.
        self.full_directory = os.path.join(base_dir, self.output_directory,
                                           self.data_version)
        if not os.path.exists(self.full_directory):
            raise AttributeError('Full directory does not exist!')

        # Misc
        self.metadata_db = MetadataDB(self.full_directory)

    def is_gather_finished(self):
        metadata = self.metadata_db.get_session()
        return bool(metadata['gather_finished_datetime'])

    def get_gather_progress_as_text(self):
        out = []
        metadata = self.metadata_db.get_session()
        if metadata['gather_start_datetime']:
            out.append("Started " + metadata['gather_start_datetime'].strftime("%c"))
        return "\n".join(out)

    def is_fetch_finished(self):
        metadata = self.metadata_db.get_session()
        return bool(metadata['fetch_finished_datetime'])

    def get_fetch_progress_as_text(self):
        file_statuses = self.metadata_db.list_filestatus()
        count_finished = 0
        current_out = []
        for file_status in file_statuses:
            if file_status['fetch_finished_datetime'] is not None:
                count_finished += 1
            if file_status['fetch_start_datetime'] and file_status['fetch_finished_datetime'] is None:
                current_out.append("Filename: " + file_status['filename'])
                current_out.append("URL: " + file_status['url'])
                current_out.append("Data Type: " + file_status['data_type'])
                current_out.append("Encoding: " + file_status['encoding'])
                current_out.append("Started: " + file_status['fetch_start_datetime'].strftime("%c"))
        out = "Finished " + str(count_finished) + " out of " + str(len(file_statuses)) + " files.\n"
        if current_out:
            return out + "\nIn Progress:\n" + "\n".join(current_out)
        else:
            return out

    def is_store_finished(self):
        return database.is_store_done(self.source_id, self.data_version,
                                      self.sample)

    def get_store_progress_as_text(self):
        return 'Store is in progress'

    def is_check_finished(self):
        source_session_id = database.get_id_of_store(self.source_id,
                                                     self.data_version,
                                                     self.sample)
        return database.is_check_done(source_session_id)

    def get_check_progress_as_text(self):
        return 'Check is in progress'
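# Illustrative usage sketch, not part of the codebase: one way SourceStatus
# might be polled to report progress for a source. The helper name and its
# arguments are assumptions for the example; only the SourceStatus methods
# shown above are real.
def print_source_status(base_dir, source_id, sample=False):
    status = SourceStatus(base_dir, source_id, sample=sample)
    if status.is_gather_finished():
        print("Gather: finished")
    else:
        print("Gather:\n" + status.get_gather_progress_as_text())
    if status.is_fetch_finished():
        print("Fetch: finished")
    else:
        print("Fetch:\n" + status.get_fetch_progress_as_text())
    print("Store finished:", status.is_store_finished())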
class Source:
    publisher_name = None
    url = None
    output_directory = None
    source_id = None
    sample = False
    data_version = None

    """It is possible to pass extra arguments. This specifies a list of the
    extra arguments possible. Each item in the list should be a dict with the
    keys:
    * name - a name compatible with argparse. Names should be unique across
      all sources, so include a prefix of some kind.
    * help - a help string for argparse
    """
    argument_definitions = []

    def __init__(self, base_dir, remove_dir=False, publisher_name=None,
                 url=None, output_directory=None, sample=False,
                 data_version=None, new_version=False):
        self.base_dir = base_dir
        self.sample = sample

        self.publisher_name = publisher_name or self.publisher_name
        if not self.publisher_name:
            raise AttributeError('A publisher name needs to be specified')

        # Make sure the output directory is fully specified, including the
        # sample suffix (if applicable).
        self.output_directory = output_directory or self.output_directory or self.source_id
        if not self.output_directory:
            raise AttributeError('An output directory needs to be specified')
        if self.sample and not self.output_directory.endswith('_sample'):
            self.output_directory += '_sample'

        # Load all versions if possible; pick an existing one or set a new one.
        all_versions = sorted(os.listdir(os.path.join(base_dir, self.output_directory)), reverse=True) \
            if os.path.exists(os.path.join(base_dir, self.output_directory)) else []

        if self.data_version:
            pass
        elif data_version and data_version in all_versions:
            # The version specified is valid.
            self.data_version = data_version
        elif data_version:
            # The version specified is invalid!
            raise AttributeError('A version was specified that does not exist')
        elif new_version or len(all_versions) == 0:
            # New version.
            self.data_version = datetime.datetime.utcnow().strftime(
                '%Y-%m-%d-%H-%M-%S')
        elif len(all_versions) > 0:
            # Resume the latest version.
            self.data_version = all_versions[0]
        else:
            # Should not happen...
            raise AttributeError(
                'The version is unavailable in the output directory')

        # Build the full directory and make sure it exists.
        self.full_directory = os.path.join(base_dir, self.output_directory,
                                           self.data_version)

        exists = os.path.exists(self.full_directory)
        try:
            if exists and remove_dir:
                os.rmdir(self.full_directory)
                exists = False
            if not exists:
                os.makedirs(self.full_directory)
        except OSError:
            raise RuntimeError(
                "Error: Write permission is needed on the directory specified (or project dir). %s"
                % self.full_directory)

        # Misc
        self.url = url or self.url
        self.metadata_db = MetadataDB(self.full_directory)
        self.metadata_db.create_session_metadata(
            publisher_name=self.publisher_name,
            sample=self.sample,
            url=self.url,
            data_version=self.data_version)

    """Returns a list with one dict per URL to download. Each dict includes
    url, filename, data_type and more."""

    def gather_all_download_urls(self):
        raise NotImplementedError()

    def set_arguments(self, arguments):
        pass

    def run_gather(self):
        metadata = self.metadata_db.get_session()

        if metadata['gather_success']:
            return

        self.metadata_db.update_session_gather_start()

        try:
            for info in self.gather_all_download_urls():
                self.metadata_db.add_filestatus(info)
        except Exception as e:
            error = repr(e)
            stacktrace = traceback.format_exception(*sys.exc_info())
            self.metadata_db.update_session_gather_end(False, error, stacktrace)
            return

        self.metadata_db.update_session_gather_end(True, None, None)

    def run_fetch(self):
        metadata = self.metadata_db.get_session()

        if metadata['fetch_success']:
            return

        if not metadata['gather_success']:
            raise Exception('Can not run fetch without a successful gather')

        self.metadata_db.update_session_fetch_start()

        failed = False
        stop = False
        while not stop:
            stop = True
            for data in self.metadata_db.list_filestatus():
                if data['fetch_success']:
                    continue

                self.metadata_db.update_filestatus_fetch_start(data['filename'])
                try:
                    to_add_list, errors = self.save_url(
                        data['filename'], data,
                        os.path.join(self.full_directory, data['filename']))
                    if to_add_list:
                        stop = False
                        for info in to_add_list:
                            self.metadata_db.add_filestatus(info)
                except Exception as e:
                    errors = [repr(e)]

                if errors:
                    self.metadata_db.update_filestatus_fetch_end(
                        data['filename'], False, errors)
                    failed = True
                else:
                    self.metadata_db.update_filestatus_fetch_end(
                        data['filename'], True)

        self.metadata_db.update_session_fetch_end(not failed)

    """Uploads the fetched data as record/release rows to the database."""

    def run_store(self):
        metadata = self.metadata_db.get_session()

        if not metadata['fetch_success']:
            raise Exception('Can not run store without a successful fetch')

        if database.is_store_done(self.source_id, self.data_version,
                                  self.sample):
            return

        source_session_id = database.start_store(
            self.source_id, self.data_version, self.sample, self.metadata_db)

        for data in self.metadata_db.list_filestatus():

            if data['data_type'].startswith('meta'):
                continue

            if database.is_store_file_done(source_session_id, data):
                continue

            with database.add_file(source_session_id, data) as database_file:

                try:
                    with open(os.path.join(self.full_directory,
                                           data['filename']),
                              encoding=data['encoding']) as f:
                        file_json_data = json.load(f)
                except Exception as e:
                    # TODO better way of dealing with this?
                    raise e

                objects_list = []
                if data['data_type'] in ('record_package_list_in_results',
                                         'release_package_list_in_results'):
                    objects_list.extend(file_json_data['results'])
                elif data['data_type'] in ('record_package_list',
                                           'release_package_list'):
                    objects_list.extend(file_json_data)
                else:
                    objects_list.append(file_json_data)

                del file_json_data

                for json_data in objects_list:
                    error_msg = ''
                    data_list = []
                    if not isinstance(json_data, dict):
                        error_msg = "Can not process data in file {} as JSON is not an object".format(
                            data['filename'])
                    elif data['data_type'] in ('release_package',
                                               'release_package_list_in_results',
                                               'release_package_list'):
                        if 'releases' not in json_data:
                            error_msg = "Release list not found in file {}".format(
                                data['filename'])
                        elif not isinstance(json_data['releases'], list):
                            error_msg = "Release list which is not a list found in file {}".format(
                                data['filename'])
                        else:
                            data_list = json_data['releases']
                    elif data['data_type'] in ('record_package',
                                               'record_package_list_in_results',
                                               'record_package_list'):
                        if 'records' not in json_data:
                            error_msg = "Record list not found in file {}".format(
                                data['filename'])
                        elif not isinstance(json_data['records'], list):
                            error_msg = "Record list which is not a list found in file {}".format(
                                data['filename'])
                        else:
                            data_list = json_data['records']
                    else:
                        error_msg = "data_type not a known type"

                    if error_msg:
                        raise Exception(error_msg)

                    package_data = {}
                    for key, value in json_data.items():
                        if key not in ('releases', 'records'):
                            package_data[key] = value

                    for row in data_list:
                        if not isinstance(row, dict):
                            raise Exception(
                                "Row in data is not an object in file {}".format(
                                    data['filename']))
                        if data['data_type'] in ('record_package',
                                                 'record_package_list_in_results',
                                                 'record_package_list'):
                            database_file.insert_record(row, package_data)
                        else:
                            database_file.insert_release(row, package_data)

        database.end_store(source_session_id)

    def save_url(self, file_name, data, file_path):
        return [], save_content(data['url'], file_path)

    def run_check(self):
        if not database.is_store_done(self.source_id, self.data_version,
                                      self.sample):
            raise Exception('Can not run check without a successful store')

        source_session_id = database.get_id_of_store(self.source_id,
                                                     self.data_version,
                                                     self.sample)

        for data in self.metadata_db.list_filestatus():
            if data['data_type'].startswith('meta'):
                continue
            check_file(source_session_id, data)

    """Gather, Fetch, Store and Check data from this publisher."""

    def run_all(self):
        self.run_gather()
        self.run_fetch()
        self.run_store()
        self.run_check()
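# Illustrative subclass, an assumption rather than code shipped with this
# project: the minimum a concrete source implements is
# gather_all_download_urls(), returning one dict per file with the keys that
# run_fetch()/run_store() rely on (filename, url, data_type, encoding). The
# publisher name, URLs and source_id below are made up for the example;
# save_url() is inherited and downloads data['url'] via save_content().
class ExamplePublisher(Source):
    publisher_name = 'Example Publisher'
    url = 'http://www.example.com'
    source_id = 'example_publisher'

    # Example of argument_definitions as documented on Source: names should be
    # unique across sources, so they carry a source-specific prefix.
    argument_definitions = [{
        'name': '--example-publisher-year',
        'help': 'only gather download URLs for the given year',
    }]

    def gather_all_download_urls(self):
        return [{
            'filename': 'all.json',
            'url': 'http://www.example.com/api/releases.json',
            'data_type': 'release_package',
            'encoding': 'utf-8',
        }]

# With a subclass like this, ExamplePublisher(base_dir).run_all() would then
# gather, fetch, store and check the publisher's data in one call.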