Example #1
def test_database_store():
    setup_main_database()
    with tempfile.TemporaryDirectory() as tmpdir:
        metadata_db = MetadataDB(tmpdir)
        metadata_db.create_session_metadata("Test", True,
                                            "http://www.test.com",
                                            "2018-01-01-10-00-00")

        # test this store
        assert not database.is_store_done("test_source", "2018-01-01-10-00-00",
                                          True)

        # start it!
        store_id = database.start_store("test_source", "2018-01-01-10-00-00",
                                        True, metadata_db)

        # test this store
        assert database.get_id_of_store("test_source", "2018-01-01-10-00-00",
                                        True) == store_id

        # end it!
        database.end_store(store_id)

        # test this store
        assert database.is_store_done("test_source", "2018-01-01-10-00-00",
                                      True)

        # test other stores aren't marked as done
        assert not database.is_store_done("test_source", "2018-01-01-10-00-00",
                                          False)  # Not a Sample
        assert not database.is_store_done("test_source", "2027-01-01-10-00-00",
                                          True)  # Different version
        assert not database.is_store_done("a_different_source",
                                          "2018-01-01-10-00-00",
                                          True)  # Different source
Example #2
def test_basic():
    with tempfile.TemporaryDirectory() as tmpdir:
        fetcher = Basic(tmpdir)
        metadata_file = join(tmpdir, 'test', 'v1', 'metadb.sqlite3')
        assert exists(metadata_file)

        metadata_db = MetadataDB(join(tmpdir, 'test', 'v1'))
        data = metadata_db.get_dict()
        assert data['publisher_name'] == 'test'
        assert data['base_url'] == 'test_url'
        assert data['session_start_datetime']
        del data, metadata_db

        fetcher.run_gather()
        metadata_db = MetadataDB(join(tmpdir, 'test', 'v1'))
        data = metadata_db.get_dict()
        assert data['gather_start_datetime']
        assert data['gather_finished_datetime']
        assert data['gather_success']
        del data, metadata_db

        fetcher.run_fetch()
        assert exists(join(tmpdir, 'test', 'v1', 'file1.json'))

        metadata_db = MetadataDB(join(tmpdir, 'test', 'v1'))
        data = metadata_db.get_dict()
        assert data['file_status']
        assert data['fetch_success']
        assert data['fetch_start_datetime']
        assert data['fetch_finished_datetime']
        del data, metadata_db

        setup_main_database()
        fetcher.run_store()
Example #3
def test_bad_url():
    with tempfile.TemporaryDirectory() as tmpdir:
        fetcher = BadUrls(tmpdir)
        fetcher.run_gather()
        fetcher.run_fetch()

        metadata_db = MetadataDB(join(tmpdir, 'test', 'v1'))
        data = metadata_db.get_dict()
        assert not data['fetch_success']
        for value in data['file_status'].values():
            assert not value['fetch_success']
            assert value['fetch_errors']
Example #4
def test_exception_gather():
    with tempfile.TemporaryDirectory() as tmpdir:
        fetcher = ExceptionGather(tmpdir)
        fetcher.run_gather()

        metadata_db = MetadataDB(join(tmpdir, 'test', 'v1'))
        data = metadata_db.get_dict()
        assert not data['gather_success']
        assert data['gather_errors'] == 'IndexError()'
        assert data['gather_finished_datetime']
        assert data['gather_errors']

        with pytest.raises(Exception):
            fetcher.run_fetch()
Example #5
    def __init__(self,
                 base_dir,
                 source_id,
                 output_directory=None,
                 sample=False,
                 data_version=None):

        self.base_dir = base_dir
        self.source_id = source_id
        self.sample = sample

        # Make sure the output directory is fully specified, including sample bit (if applicable)
        self.output_directory = output_directory or source_id
        if not self.output_directory:
            raise AttributeError('An output directory needs to be specified')

        if self.sample and not self.output_directory.endswith('_sample'):
            self.output_directory += '_sample'

        # Load all versions if possible and pick an existing one to resume.
        output_path = os.path.join(base_dir, self.output_directory)
        all_versions = (sorted(os.listdir(output_path), reverse=True)
                        if os.path.exists(output_path) else [])

        if data_version and data_version in all_versions:  # Version specified is valid
            self.data_version = data_version
        elif data_version:  # Version specified is invalid!
            raise AttributeError('A version was specified that does not exist')
        elif len(all_versions) > 0:  # Get the latest version to resume
            self.data_version = all_versions[0]
        else:  # Should not happen...
            raise AttributeError(
                'The source and/or version is unavailable in the output directory'
            )

        # Build full directory, make sure it exists
        self.full_directory = os.path.join(base_dir, self.output_directory,
                                           self.data_version)
        if not os.path.exists(self.full_directory):
            raise AttributeError('Full Directory does not exist!')

        # Misc
        self.metadata_db = MetadataDB(self.full_directory)
Example #6
def test_bad_fetch_exception():
    with tempfile.TemporaryDirectory() as tmpdir:
        fetcher = BadFetchException(tmpdir)
        fetcher.run_gather()
        fetcher.run_fetch()

        metadata_db = MetadataDB(join(tmpdir, 'test', 'v1'))
        data = metadata_db.get_dict()
        assert data['gather_success']
        assert data['gather_finished_datetime']
        assert not data['fetch_success']
        assert not data['file_status']['file1.json']['fetch_success']
        assert data['file_status']['file1.json']['fetch_start_datetime']
        assert data['file_status']['file1.json']['fetch_finished_datetime']
        assert data['file_status']['file1.json']['fetch_errors'] == [
            "Exception('Whoops',)"
        ]

        with pytest.raises(Exception):
            fetcher.run_store()
Example #7
def test_checks_records_error():
    setup_main_database()
    with tempfile.TemporaryDirectory() as tmpdir:
        metadata_db = MetadataDB(tmpdir)
        metadata_db.create_session_metadata("Test", True,
                                            "http://www.test.com",
                                            "2018-01-01-10-00-00")
        metadata_db.add_filestatus({
            'filename': 'test1.json',
            'url': 'http://www.test.com',
            'data_type': 'record_package'
        })

        # store details
        source_session_id = database.start_store("test_source",
                                                 "2018-01-01-10-00-00", True,
                                                 metadata_db)
        for data in metadata_db.list_filestatus():
            with database.add_file(source_session_id, data) as database_file:
                database_file.insert_record({'record': 'totally'}, {
                    'version': '0.1-does-not-exist',
                    'extensions': []
                })
        database.end_store(source_session_id)

        record_id = 1
        # Hard-coding the ID relies on the DB assigning 1 to this new row,
        # which is a safe assumption for a fresh test database.

        # Test
        assert not database.is_record_check_done(record_id)
        assert not database.is_check_done(source_session_id)

        # check!
        for data in metadata_db.list_filestatus():
            checks.check_file(source_session_id, data)

        # Test
        assert database.is_record_check_done(record_id)
        assert database.is_check_done(source_session_id)

        with database.engine.begin() as connection:
            s = sa.sql.select([database.record_check_error_table])
            result = connection.execute(s)
            data = result.fetchone()

        assert 'The schema version in your data is not valid. Accepted values:' in data['error']
Example #8
def test_checks_releases():
    setup_main_database()
    with tempfile.TemporaryDirectory() as tmpdir:
        metadata_db = MetadataDB(tmpdir)
        metadata_db.create_session_metadata("Test", True,
                                            "http://www.test.com",
                                            "2018-01-01-10-00-00")
        metadata_db.add_filestatus({
            'filename': 'test1.json',
            'url': 'http://www.test.com',
            'data_type': 'release_package'
        })

        # store details
        source_session_id = database.start_store("test_source",
                                                 "2018-01-01-10-00-00", True,
                                                 metadata_db)
        for data in metadata_db.list_filestatus():
            with database.add_file(source_session_id, data) as database_file:
                database_file.insert_release({'release': 'totally'},
                                             {'extensions': []})
        database.end_store(source_session_id)

        release_id = 1
        # Hard-coding the ID relies on the DB assigning 1 to this new row,
        # which is a safe assumption for a fresh test database.

        # Test
        assert not database.is_release_check_done(release_id)

        # check!
        for data in metadata_db.list_filestatus():
            checks.check_file(source_session_id, data)

        # Test
        assert database.is_release_check_done(release_id)

        with database.engine.begin() as connection:
            s = sa.sql.select([database.release_check_table])
            result = connection.execute(s)
            data = result.fetchone()

        assert data['cove_output']['file_type'] == 'json'
        assert len(data['cove_output']['validation_errors']) > 0
Example #9
def test_database_store_file():
    setup_main_database()
    with tempfile.TemporaryDirectory() as tmpdir:
        metadata_db = MetadataDB(tmpdir)
        metadata_db.create_session_metadata("Test", True,
                                            "http://www.test.com",
                                            "2018-01-01-10-00-00")
        metadata_db.add_filestatus({
            'filename': 'test1.json',
            'url': 'http://www.test.com',
            'data_type': 'record_package'
        })

        # start it!
        store_id = database.start_store("test_source", "2018-01-01-10-00-00",
                                        True, metadata_db)

        # test
        file_id = database.get_id_of_store_file(store_id,
                                                {'filename': 'test1.json'})
        assert file_id == 1
Example #10
class SourceStatus:
    def __init__(self,
                 base_dir,
                 source_id,
                 output_directory=None,
                 sample=False,
                 data_version=None):

        self.base_dir = base_dir
        self.source_id = source_id
        self.sample = sample

        # Make sure the output directory is fully specified, including sample bit (if applicable)
        self.output_directory = output_directory or source_id
        if not self.output_directory:
            raise AttributeError('An output directory needs to be specified')

        if self.sample and not self.output_directory.endswith('_sample'):
            self.output_directory += '_sample'

        # Load all versions if possible and pick an existing one to resume.
        output_path = os.path.join(base_dir, self.output_directory)
        all_versions = (sorted(os.listdir(output_path), reverse=True)
                        if os.path.exists(output_path) else [])

        if data_version and data_version in all_versions:  # Version specified is valid
            self.data_version = data_version
        elif data_version:  # Version specified is invalid!
            raise AttributeError('A version was specified that does not exist')
        elif len(all_versions) > 0:  # Get the latest version to resume
            self.data_version = all_versions[0]
        else:  # Should not happen...
            raise AttributeError(
                'The source and/or version is unavailable in the output directory'
            )

        # Build full directory, make sure it exists
        self.full_directory = os.path.join(base_dir, self.output_directory,
                                           self.data_version)
        if not os.path.exists(self.full_directory):
            raise AttributeError('Full Directory does not exist!')

        # Misc
        self.metadata_db = MetadataDB(self.full_directory)

    def is_gather_finished(self):
        metadata = self.metadata_db.get_session()
        return bool(metadata['gather_finished_datetime'])

    def get_gather_progress_as_text(self):

        out = []
        metadata = self.metadata_db.get_session()

        if metadata['gather_start_datetime']:
            out.append("Started " +
                       metadata['gather_start_datetime'].strftime("%c"))

        return "\n".join(out)

    def is_fetch_finished(self):
        metadata = self.metadata_db.get_session()
        return bool(metadata['fetch_finished_datetime'])

    def get_fetch_progress_as_text(self):

        file_statuses = self.metadata_db.list_filestatus()
        count_finished = 0
        current_out = []

        for file_status in file_statuses:
            if file_status['fetch_finished_datetime'] is not None:
                count_finished += 1

            if file_status['fetch_start_datetime'] and file_status[
                    'fetch_finished_datetime'] is None:
                current_out.append("Filename: " + file_status['filename'])
                current_out.append("URL: " + file_status['url'])
                current_out.append("Data Type: " + file_status['data_type'])
                current_out.append("Encoding: " + file_status['encoding'])
                current_out.append(
                    "Started: " +
                    file_status['fetch_start_datetime'].strftime("%c"))

        out = "Finished " + str(count_finished) + " out of " + str(
            len(file_statuses)) + " files.\n"
        if current_out:
            return out + "\nIn Progress:\n" + "\n".join(current_out)
        else:
            return out

    def is_store_finished(self):
        return database.is_store_done(self.source_id, self.data_version,
                                      self.sample)

    def get_store_progress_as_text(self):
        return 'Store is in progress'

    def is_check_finished(self):
        source_session_id = database.get_id_of_store(self.source_id,
                                                     self.data_version,
                                                     self.sample)
        return database.is_check_done(source_session_id)

    def get_check_progress_as_text(self):
        return 'Check is in progress'
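
A minimal usage sketch, assuming a source named 'test_source' with at least one version directory already on disk (the base path is illustrative):

status = SourceStatus('/data', 'test_source')
if status.is_gather_finished():
    print(status.get_fetch_progress_as_text())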
Example #11
    def __init__(self,
                 base_dir,
                 remove_dir=False,
                 publisher_name=None,
                 url=None,
                 output_directory=None,
                 sample=False,
                 data_version=None,
                 new_version=False):

        self.base_dir = base_dir
        self.sample = sample

        self.publisher_name = publisher_name or self.publisher_name
        if not self.publisher_name:
            raise AttributeError('A publisher name needs to be specified')

        # Make sure the output directory is fully specified, including sample bit (if applicable)
        self.output_directory = output_directory or self.output_directory or self.source_id
        if not self.output_directory:
            raise AttributeError('An output directory needs to be specified')

        if self.sample and not self.output_directory.endswith('_sample'):
            self.output_directory += '_sample'

        # Load all versions if possible, pick an existing one or set a new one.
        output_path = os.path.join(base_dir, self.output_directory)
        all_versions = (sorted(os.listdir(output_path), reverse=True)
                        if os.path.exists(output_path) else [])

        if self.data_version:
            pass
        elif data_version and data_version in all_versions:  # Version specified is valid
            self.data_version = data_version
        elif data_version:  # Version specified is invalid!
            raise AttributeError('A version was specified that does not exist')
        elif new_version or len(all_versions) == 0:  # New Version
            self.data_version = datetime.datetime.utcnow().strftime(
                '%Y-%m-%d-%H-%M-%S')
        elif len(all_versions) > 0:  # Get the latest version to resume
            self.data_version = all_versions[0]
        else:  # Should not happen...
            raise AttributeError(
                'The version is unavailable in the output directory')

        # Build full directory, make sure it exists
        self.full_directory = os.path.join(base_dir, self.output_directory,
                                           self.data_version)

        exists = os.path.exists(self.full_directory)

        try:
            if exists and remove_dir:
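                # Note: os.rmdir only removes empty directories; a non-empty
                # version directory raises OSError here.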
                os.rmdir(self.full_directory)
                exists = False

            if not exists:
                os.makedirs(self.full_directory)
        except OSError:
            raise RuntimeError(
                "Error: Write permission is needed on the directory specified (or project dir). %s"
                % self.full_directory)

        # Misc

        self.url = url or self.url

        self.metadata_db = MetadataDB(self.full_directory)

        self.metadata_db.create_session_metadata(
            publisher_name=self.publisher_name,
            sample=self.sample,
            url=self.url,
            data_version=self.data_version)
Example #12
class Source:
    publisher_name = None
    url = None
    output_directory = None
    source_id = None
    sample = False
    data_version = None
    """It is possible to pass extra arguments.

    This specifies a list of the extra arguments possible. Each item in the list should be a dict with the keys:
      *  name - a name compatible with argparse. Names should be unique across all sources, so include a prefix of some kind.
      *  help - a help string for argparse
    """
    argument_definitions = []
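    # Example (hypothetical values): a subclass might declare
    #
    # argument_definitions = [
    #     {'name': '--example-year', 'help': 'Only download this year'},
    # ]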

    def __init__(self,
                 base_dir,
                 remove_dir=False,
                 publisher_name=None,
                 url=None,
                 output_directory=None,
                 sample=False,
                 data_version=None,
                 new_version=False):

        self.base_dir = base_dir
        self.sample = sample

        self.publisher_name = publisher_name or self.publisher_name
        if not self.publisher_name:
            raise AttributeError('A publisher name needs to be specified')

        # Make sure the output directory is fully specified, including sample bit (if applicable)
        self.output_directory = output_directory or self.output_directory or self.source_id
        if not self.output_directory:
            raise AttributeError('An output directory needs to be specified')

        if self.sample and not self.output_directory.endswith('_sample'):
            self.output_directory += '_sample'

        # Load all versions if possible, pick an existing one or set a new one.
        output_path = os.path.join(base_dir, self.output_directory)
        all_versions = (sorted(os.listdir(output_path), reverse=True)
                        if os.path.exists(output_path) else [])

        if self.data_version:
            pass
        elif data_version and data_version in all_versions:  # Version specified is valid
            self.data_version = data_version
        elif data_version:  # Version specified is invalid!
            raise AttributeError('A version was specified that does not exist')
        elif new_version or len(all_versions) == 0:  # New Version
            self.data_version = datetime.datetime.utcnow().strftime(
                '%Y-%m-%d-%H-%M-%S')
        elif len(all_versions) > 0:  # Get the latest version to resume
            self.data_version = all_versions[0]
        else:  # Should not happen...
            raise AttributeError(
                'The version is unavailable in the output directory')

        # Build full directory, make sure it exists
        self.full_directory = os.path.join(base_dir, self.output_directory,
                                           self.data_version)

        exists = os.path.exists(self.full_directory)

        try:
            if exists and remove_dir:
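                # Note: os.rmdir only removes empty directories; a non-empty
                # version directory raises OSError here.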
                os.rmdir(self.full_directory)
                exists = False

            if not exists:
                os.makedirs(self.full_directory)
        except OSError:
            raise RuntimeError(
                "Error: Write permission is needed on the directory specified (or project dir). %s"
                % self.full_directory)

        # Misc

        self.url = url or self.url

        self.metadata_db = MetadataDB(self.full_directory)

        self.metadata_db.create_session_metadata(
            publisher_name=self.publisher_name,
            sample=self.sample,
            url=self.url,
            data_version=self.data_version)

    """Returns an array with objects for each url.

    The return objects includes url,filename,type and more."""

    def gather_all_download_urls(self):
        raise NotImplementedError()
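
    # A subclass might implement it like this (hypothetical sketch; the keys
    # mirror those passed to add_filestatus in the tests above):
    #
    # def gather_all_download_urls(self):
    #     return [{
    #         'filename': 'file1.json',
    #         'url': self.url + '/file1.json',
    #         'data_type': 'release_package',
    #         'encoding': 'utf-8',
    #     }]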

    def set_arguments(self, arguments):
        pass

    def run_gather(self):
        metadata = self.metadata_db.get_session()

        if metadata['gather_success']:
            return

        self.metadata_db.update_session_gather_start()

        try:
            for info in self.gather_all_download_urls():
                self.metadata_db.add_filestatus(info)
        except Exception as e:
            error = repr(e)
            stacktrace = traceback.format_exception(*sys.exc_info())
            self.metadata_db.update_session_gather_end(False, error,
                                                       stacktrace)
            return

        self.metadata_db.update_session_gather_end(True, None, None)

    def run_fetch(self):
        metadata = self.metadata_db.get_session()

        if metadata['fetch_success']:
            return

        if not metadata['gather_success']:
            raise Exception('Cannot run fetch without a successful gather')

        self.metadata_db.update_session_fetch_start()

        failed = False
        stop = False

        while not stop:
            stop = True
            for data in self.metadata_db.list_filestatus():

                if data['fetch_success']:
                    continue

                self.metadata_db.update_filestatus_fetch_start(
                    data['filename'])
                try:
                    to_add_list, errors = self.save_url(
                        data['filename'], data,
                        os.path.join(self.full_directory, data['filename']))
                    if to_add_list:
                        stop = False
                        for info in to_add_list:
                            self.metadata_db.add_filestatus(info)

                except Exception as e:
                    errors = [repr(e)]

                if errors:
                    self.metadata_db.update_filestatus_fetch_end(
                        data['filename'], False, errors)
                    failed = True
                else:
                    self.metadata_db.update_filestatus_fetch_end(
                        data['filename'], True)

        self.metadata_db.update_session_fetch_end(not failed)

    """Uploads the fetched data as record rows to the Database"""

    def run_store(self):
        metadata = self.metadata_db.get_session()

        if not metadata['fetch_success']:
            raise Exception('Cannot run store without a successful fetch')

        if database.is_store_done(self.source_id, self.data_version,
                                  self.sample):
            return

        source_session_id = database.start_store(self.source_id,
                                                 self.data_version,
                                                 self.sample, self.metadata_db)

        for data in self.metadata_db.list_filestatus():

            if data['data_type'].startswith('meta'):
                continue

            if database.is_store_file_done(source_session_id, data):
                continue

            with database.add_file(source_session_id, data) as database_file:

                try:
                    with open(os.path.join(self.full_directory,
                                           data['filename']),
                              encoding=data['encoding']) as f:
                        file_json_data = json.load(f)
                except Exception:
                    # TODO better way of dealing with this?
                    raise

                objects_list = []
                if data['data_type'] in ('record_package_list_in_results',
                                         'release_package_list_in_results'):
                    objects_list.extend(file_json_data['results'])
                elif data['data_type'] in ('record_package_list',
                                           'release_package_list'):
                    objects_list.extend(file_json_data)
                else:
                    objects_list.append(file_json_data)

                del file_json_data

                for json_data in objects_list:
                    # Raise as soon as a problem is found, before any keys of
                    # a malformed package are accessed.
                    if not isinstance(json_data, dict):
                        raise Exception(
                            "Cannot process data in file {} as JSON is not an object"
                            .format(data['filename']))

                    if data['data_type'] in ('release_package',
                                             'release_package_list_in_results',
                                             'release_package_list'):
                        if 'releases' not in json_data:
                            raise Exception(
                                "Release list not found in file {}".format(
                                    data['filename']))
                        if not isinstance(json_data['releases'], list):
                            raise Exception(
                                "Release list which is not a list found in file {}"
                                .format(data['filename']))
                        data_list = json_data['releases']
                    elif data['data_type'] in ('record_package',
                                               'record_package_list_in_results',
                                               'record_package_list'):
                        if 'records' not in json_data:
                            raise Exception(
                                "Record list not found in file {}".format(
                                    data['filename']))
                        if not isinstance(json_data['records'], list):
                            raise Exception(
                                "Record list which is not a list found in file {}"
                                .format(data['filename']))
                        data_list = json_data['records']
                    else:
                        raise Exception("data_type not a known type")
                    package_data = {}
                    for key, value in json_data.items():
                        if key not in ('releases', 'records'):
                            package_data[key] = value

                    for row in data_list:
                        if not isinstance(row, dict):
                            raise Exception(
                                "Row in data is not an object {}".format(
                                    data['filename']))

                        if data['data_type'] in ('record_package',
                                                 'record_package_list_in_results',
                                                 'record_package_list'):
                            database_file.insert_record(row, package_data)
                        else:
                            database_file.insert_release(row, package_data)

        database.end_store(source_session_id)

    def save_url(self, file_name, data, file_path):
        return [], save_content(data['url'], file_path)

    def run_check(self):
        if not database.is_store_done(self.source_id, self.data_version,
                                      self.sample):
            raise Exception('Cannot run check without a successful store')

        source_session_id = database.get_id_of_store(self.source_id,
                                                     self.data_version,
                                                     self.sample)

        for data in self.metadata_db.list_filestatus():

            if data['data_type'].startswith('meta'):
                continue

            check_file(source_session_id, data)

    """Gather, Fetch, Store and Check data from this publisher."""

    def run_all(self):
        self.run_gather()
        self.run_fetch()
        self.run_store()
        self.run_check()
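
A minimal end-to-end sketch with a hypothetical subclass (the source_id, publisher name, URL and base path are illustrative; run_store and run_check also expect the main database to be set up, as the tests' setup_main_database() calls suggest):

class ExampleSource(Source):
    source_id = 'example_source'
    publisher_name = 'Example Publisher'
    url = 'http://www.example.com'

    def gather_all_download_urls(self):
        return [{
            'filename': 'file1.json',
            'url': self.url + '/file1.json',
            'data_type': 'release_package',
            'encoding': 'utf-8',
        }]

source = ExampleSource('/data', sample=True)
source.run_all()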