Example no. 1
def end_to_end_test_handler(msg):
    """Request to run E2E tests.

    Returns a message with the newly generated dynamic workflow in the header.

    :param msg:
    :return:
    """
    now = datetime.datetime.utcnow()
    start_timestamp = int(now.replace(microsecond=0).timestamp())
    header = msg.get('header', {})

    # Set process id before call to logger.configure
    header['process_id'] = header.get('process_id',
                                      f"{start_timestamp}.e2e_test")

    logger.configure(msg, 'E2E Test')
    logger.info("Clear any previous test data")

    e2etest = E2ETest(header['process_id'])
    e2etest.cleartests()
    logger.info("Start E2E Test")

    return {
        'header': {
            **header,
            'timestamp': now.isoformat(),
            'workflow': e2etest.get_workflow(),
        },
        'contents': ''
    }
Example no. 2
def distribute_file(conn_info, filename):
    """
    Copy the checked file to its final location

    Check and copy are implemented as an indivisible action:
    if the check is OK, the file is copied to its final location in one go.
    The time between the check and the copy is kept as short as possible,
    so no extra workflow step (with possible queueing) has been introduced.

    :param conn_info:
    :param filename:
    :return:
    """
    # Remove export dir from filename to get destination file name
    dst = re.sub(rf'^{EXPORT_DIR}/', '', filename)

    # Copy the file to the destination location
    logger.info(f"Distribute to {dst}")
    conn_info['connection'].copy_object(CONTAINER_BASE, filename,
                                        f"{CONTAINER_BASE}/{dst}")

    # Do not delete the file from its temporary location because a re-run would cause missing file errors

    # Cleanup any date files at the destination location
    cleanup_datefiles(conn_info['connection'], CONTAINER_BASE, dst)
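The destination name is simply the filename with the export directory prefix stripped off. A minimal standalone sketch of that derivation, assuming a hypothetical EXPORT_DIR value of "export" and an illustrative catalogue path:

import re

EXPORT_DIR = "export"  # hypothetical value, for illustration only

filename = f"{EXPORT_DIR}/meetbouten/meetbouten.csv"
# Strip the export prefix to obtain the final destination path
dst = re.sub(rf'^{EXPORT_DIR}/', '', filename)
assert dst == "meetbouten/meetbouten.csv"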
Example no. 3
def apply_events(storage, last_events, start_after, stats):
    """Apply any unhandled events to the database

    :param storage: GOB (events + entities)
    :param last_events: { source_id: last_event, ... } for the current entities
    :param start_after: the id of the last event that has been applied to the storage
    :param stats: update statistics for this action
    :return:
    """
    with ActiveGarbageCollection(
            "Apply events"), storage.get_session() as session:
        logger.info("Apply events")

        PROCESS_PER = 10000
        add_event_tids = set()
        with ProgressTicker("Apply events", PROCESS_PER) as progress:
            unhandled_events = storage.get_events_starting_after(
                start_after, PROCESS_PER)
            while unhandled_events:
                with EventApplicator(storage) as event_applicator:
                    for event in unhandled_events:
                        progress.tick()

                        gob_event, count, applied_events = event_applicator.apply(
                            event, last_events, add_event_tids)
                        action = gob_event.action
                        stats.add_applied(action, count)
                        start_after = event.eventid

                        # Remove event from session, to avoid trying to update event db object
                        session.expunge(event)

                    event_applicator.apply_all()

                unhandled_events = storage.get_events_starting_after(
                    start_after, PROCESS_PER)
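apply_events processes events in batches of PROCESS_PER, advancing the start_after cursor to the id of the last applied event until an empty batch is returned. A generic sketch of that cursor-based loop, with fetch_batch and apply_event as hypothetical stand-ins for the storage and applicator calls:

PROCESS_PER = 10000

def process_in_batches(fetch_batch, apply_event, start_after):
    # fetch_batch(start_after, limit) is assumed to return at most `limit`
    # events with an id greater than `start_after` (hypothetical signature).
    events = fetch_batch(start_after, PROCESS_PER)
    while events:
        for event in events:
            apply_event(event)
            start_after = event.eventid  # advance the cursor
        events = fetch_batch(start_after, PROCESS_PER)
    return start_after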
Example no. 4
def _download_sources(conn_info, directory,
                      filenames) -> List[Tuple[str, str]]:
    """

    :param conn_info:
    :param directory:
    :param filenames: list of tuples (dst_path, src_filename)
    :return:
    """
    path = Path(directory)
    path.mkdir(exist_ok=True)

    src_files = []

    for dst_path, filename in filenames:
        src_file_info, src_file = _get_file(conn_info, filename)

        temp_file = os.path.join(directory, dst_path)
        path = Path(os.path.dirname(temp_file))
        path.mkdir(exist_ok=True, parents=True)

        with open(temp_file, "wb") as f:
            f.write(src_file)
        src_files.append((dst_path, temp_file))

    logger.info(f"{len(src_files)} source files downloaded")

    return src_files
Example no. 5
def _propose_check_file(proposals, filename, obj_info, obj):
    """
    Build a proposal to check the given file

    :param proposals: dict in which the generated proposal is stored
    :param filename: Name of the file to check
    :param obj_info: Current file object info
    :param obj: Current file object
    :return: None (the proposal is stored in proposals under its proposal key)
    """
    proposal_key = filename
    for src, dst in _REPLACEMENTS.items():
        # heuristic method to convert variable values to a variable name
        if re.search(dst, filename):
            proposal_key = re.sub(dst, src, proposal_key)

    # Base the proposal on the analysis of the current file
    analysis = _get_analysis(obj_info, obj)
    analysis["age_hours"] = 24

    proposal = {}
    for key, value in analysis.items():
        if key in _MAXIMUM_VALUES:
            proposal[key] = [0, value]
        elif key in _MINIMUM_VALUES:
            proposal[key] = [value, None]
        elif key in _ABSOLUTE_VALUES:
            proposal[key] = [value]
        else:
            # Within limits
            low, high = _get_low_high(value)
            proposal[key] = [low, high]

    logger.info(f"Proposal generated for {proposal_key}")
    proposals[proposal_key] = proposal
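The generated proposal maps each analysed key to a margin list: keys in _MAXIMUM_VALUES get [0, value], keys in _MINIMUM_VALUES get [value, None], keys in _ABSOLUTE_VALUES get [value], and all other keys get a [low, high] band from _get_low_high. A purely illustrative proposal with hypothetical keys and observed values:

proposal = {
    "max_key": [0, 3],         # key in _MAXIMUM_VALUES: at most the observed 3
    "min_key": [24, None],     # key in _MINIMUM_VALUES: at least the observed 24
    "abs_key": ["id;name"],    # key in _ABSOLUTE_VALUES: must match exactly
    "other_key": [950, 1050],  # otherwise: low/high band around the observed value
}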
Example no. 6
def _process_events(storage, events, stats):
    """Store and apply events

    :param storage: GOB (events + entities)
    :param events: the events to process
    :param stats: update statistics for this action
    :return:
    """
    # Get the max eventid of the entities and the last eventid of the events
    entity_max_eventid, last_eventid = get_event_ids(storage)
    logger.info(
        f"Events are at {last_eventid or 0:,}, model is at {entity_max_eventid or 0:,}"
    )

    # Get all source_id - last_event combinations to check for validity and existence
    with storage.get_session():
        last_events = storage.get_last_events(
        )  # { source_id: last_event, ... }

    if is_corrupted(entity_max_eventid, last_eventid):
        logger.error("Model is inconsistent! data is more recent than events")
    elif entity_max_eventid == last_eventid:
        logger.info("Model is up to date")
        # Add new events
        return _store_events(storage, last_events, events, stats)
    else:
        logger.warning("Model is out of date, Further processing has stopped")
Example no. 7
def _store_events(storage, last_events, events, stats):
    """Store events in GOB

    Only valid events are stored; other events are skipped (with an associated warning).
    The events are added to the database in bulk.

    :param storage: GOB (events + entities)
    :param events: the events to process
    :param stats: update statistics for this action
    :return:
    """
    with ActiveGarbageCollection("Store events"), storage.get_session():
        # Use a session to commit all or rollback on any error
        logger.info("Store events")

        with ProgressTicker("Store events", 10000) as progress, \
                EventCollector(storage, last_events) as event_collector:

            for event in events:
                progress.tick()

                if event_collector.collect(event):
                    stats.store_event(event)
                else:
                    stats.skip_event(event)
Example no. 8
def prepare_relate(msg):
    """
    The starting point of the relate process. A relate job is split into individual relate jobs at
    attribute level. If only a catalog is present in the message, all collections of that catalog are related.
    When a job that has already been split is received, the relation name is added and the job is forwarded
    to the next step of the relate process, where the relations are made.

    :param msg: a message from the broker containing the catalog and collections (optional)
    :return: the result message of the relate preparation step
    """
    header = msg.get('header', {})
    catalog_name = header.get('catalogue')
    collection_name = header.get('collection')
    attribute_name = header.get('attribute')

    application = "GOBRelate"
    msg["header"] = {
        **msg.get("header", {}),
        "version": "0.1",
        "source": "GOB",
        "application": application,
        "entity": collection_name
    }

    timestamp = datetime.datetime.utcnow().isoformat()

    msg["header"].update({
        "timestamp": timestamp,
    })

    logger.configure(msg, "RELATE")

    if not catalog_name or not collection_name or not attribute_name:
        # A job is split when the catalog, collection or attribute is not provided
        logger.info("Splitting relate job")

        _split_job(msg)
        msg['header']['is_split'] = True

        return publish_result(msg, [])
    else:
        # If the job has all attributes, add the relation name and forward to the next step in the relate process
        logger.info(f"** Relate {catalog_name} {collection_name} {attribute_name}")

        relation_name = get_relation_name(GOBModel(), catalog_name, collection_name, attribute_name)

        msg["header"].update({
            "catalogue": "rel",
            "collection": relation_name,
            "entity": relation_name,
            "original_catalogue": catalog_name,
            "original_collection": collection_name,
            "original_attribute": attribute_name,
        })

        return msg
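Illustrative message headers for both branches, using catalogue/collection/attribute values borrowed from the update_materialized_view example in this listing (all other details omitted):

# Only a catalogue: the job is split into per-attribute relate jobs
split_header = {"catalogue": "meetbouten"}

# Fully specified: the relation name is added and the job is forwarded
forward_header = {
    "catalogue": "meetbouten",
    "collection": "meetbouten",
    "attribute": "ligt_in_buurt",
}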
Example no. 9
    def result(self):
        if self.fatal:
            raise GOBException(
                f"Quality assurance failed for {self.entity_name}"
            )

        if self.duplicates:
            raise GOBException(f"Duplicate primary key(s) found in source: "
                               f"[{', '.join([str(dup) for dup in self.duplicates])}]")

        logger.info("Quality assurance passed")
Example no. 10
    def _log_intro(self):
        """
        If any unique columns have been defined, log an informational message stating which columns of the file are checked for unique values

        :return:
        """
        if self.unique_cols:
            unique_cols = ", ".join([str(cols) for cols in self.unique_cols])
            logger.info(
                f"Checking {self.filename} for unique column values in columns {unique_cols}"
            )
Example no. 11
    def end_of_workflow(self, msg):
        logger.configure(msg, "WORKFLOW")
        on_complete = msg['header'].pop('on_workflow_complete', None)
        if on_complete is not None:
            if not isinstance(on_complete, dict) or not all([key in on_complete for key in ['exchange', 'key']]):
                logger.error("on_workflow_complete should be a dict with keys 'exchange' and 'key'")
            else:
                publish(on_complete['exchange'], on_complete['key'], msg)
                logger.info(f"Publish on_workflow_complete to {on_complete['exchange']} with {on_complete['key']}")

        logger.info("End of workflow")
        job_end(msg["header"].get("jobid"))
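A minimal sketch of a message that triggers the on_workflow_complete publication; the jobid, exchange and key values are hypothetical:

msg = {
    "header": {
        "jobid": 123,
        # Must be a dict containing both 'exchange' and 'key'; otherwise an
        # error is logged and nothing is published.
        "on_workflow_complete": {"exchange": "some.exchange", "key": "some.key"},
    }
}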
Example no. 12
def cleanup_datefiles(connection, container, filename):
    """Delete previous files from ObjectStore.

    The file with filename is not deleted.
    """
    cleanup_pattern = get_cleanup_pattern(filename)
    if cleanup_pattern == filename:
        # No dates in filename, nothing to do
        return

    logger.info(f'Clean previous files for {filename}.')

    for item in get_full_container_list(connection, container):
        if re.match(cleanup_pattern,
                    item['name']) and item['name'] != filename:
            delete_object(connection, container, item)
            logger.info(f'File {item["name"]} deleted.')
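The implementation of get_cleanup_pattern is not shown here; a plausible sketch, assuming dates are embedded in filenames as YYYYMMDD, is to replace each date with a wildcard so that older dated variants of the same file can be matched:

import re

def get_cleanup_pattern_sketch(filename):
    """Hypothetical stand-in for get_cleanup_pattern."""
    return re.sub(r'\d{8}', r'\\d{8}', filename)

# A filename without a date is returned unchanged, which is how the caller
# detects the "nothing to do" case.
assert get_cleanup_pattern_sketch("meetbouten/MBT_20200101.csv") == r"meetbouten/MBT_\d{8}.csv"
assert get_cleanup_pattern_sketch("meetbouten/MBT.csv") == "meetbouten/MBT.csv"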
Example no. 13
def handle_import_object_msg(msg):
    logger.configure(msg, "IMPORT OBJECT")
    logger.info("Start import object")
    importer = MappinglessConverterAdapter(msg['header'].get('catalogue'),
                                           msg['header'].get('entity'),
                                           msg['header'].get('entity_id_attr'))
    entity = importer.convert(msg['contents'])

    return {
        'header': {
            **msg['header'],
            'mode': ImportMode.SINGLE_OBJECT.value,
            'collection': msg['header'].get('entity'),
        },
        'summary': logger.get_summary(),
        'contents': [entity]
    }
Example no. 14
def handle_brp_regression_test_msg(msg):
    logger.configure(msg, 'BRP Regression test')

    results = BrpRegression(logger).run()
    writer = ObjectstoreResultsWriter(results, 'regression_tests/results/brp')
    writer.write()
    logger.info(
        "Written test results to Objectstore at regression_tests/results/brp")

    return {
        'header': {
            **msg.get('header', {}),
            'timestamp':
            datetime.datetime.utcnow().isoformat(),
        },
        'summary': logger.get_summary(),
    }
Example no. 15
    def enrich(self, entity):
        for column, specs in self.enrich_spec.items():
            enricher = self.enrichers[specs["type"]]
            value, logging = enricher["func"](storage=self.storage,
                                              data=entity,
                                              specs=specs,
                                              column=column,
                                              assigned=self.assigned)

            if specs.get("dry_run", False) and value != entity.get(column):
                logger.info(
                    f"Enrich dry run: Generated value {value} for entity {entity[specs['on']]}"
                )
            else:
                entity[column] = value

            if logging:
                logger.info(logging)
Example no. 16
def on_workflow_progress(msg):
    """
    Process a workflow progress message

    The progress report is START, OK or FAIL
    :param msg: The message that contains the progress info
    :return: None
    """
    status = msg['status']
    step_info = step_status(msg['jobid'], msg['stepid'], status)
    if step_info and status in [STATUS_OK, STATUS_FAIL]:
        logger.configure(msg, "WORKFLOW")
        logger.info(
            f"Duration {str(step_info.end - step_info.start).split('.')[0]}")
        if status == STATUS_FAIL:
            logger.error(f"Program error: {msg['info_msg']}")
            logger.info("End of workflow")
    hooks.on_workflow_progress(msg)
Example no. 17
def kafka_produce_handler(msg):
    logger.configure(msg, "KAFKA_PRODUCE")
    logger.info("Produce Kafka events")

    catalogue = msg.get('header', {}).get('catalogue')
    collection = msg.get('header', {}).get('collection')

    assert catalogue and collection, "Missing catalogue or collection in header"

    event_producer = KafkaEventProducer(catalogue, collection, logger)
    event_producer.produce()

    return {
        'header': msg['header'],
        'summary': {
            'produced': event_producer.total_cnt,
        }
    }
Example no. 18
def check_relations(src_catalog_name, src_collection_name, src_field_name):
    """
    Check relations for any dangling relations

    A relation can be dangling because it exists without any bronwaarde,
    or because the bronwaarde cannot be matched with any referenced entity.

    :param src_catalog_name:
    :param src_collection_name:
    :param src_field_name:
    :return: None
    """

    name = f"{src_collection_name} {src_field_name}"

    # Only include sources where not none_allowed
    sources = GOBSources().get_field_relations(src_catalog_name,
                                               src_collection_name,
                                               src_field_name)
    check_sources = [
        source['source'] for source in sources
        if not source.get('none_allowed', False)
    ]

    if not check_sources:
        logger.info(
            f"All sources for {src_catalog_name} {src_collection_name} {src_field_name} allow empty "
            f"relations. Skipping check.")
        return

    # Only filter on sources when necessary (i.e. when there are multiple sources with different values for
    # none_allowed)
    check_sources = check_sources if len(sources) != len(
        check_sources) else None
    missing_query = _get_relation_check_query("missing", src_catalog_name,
                                              src_collection_name,
                                              src_field_name, check_sources)
    _query_missing(missing_query, QA_CHECK.Sourcevalue_exists, name)

    dangling_query = _get_relation_check_query("dangling", src_catalog_name,
                                               src_collection_name,
                                               src_field_name, check_sources)

    _query_missing(dangling_query, QA_CHECK.Reference_exists, name)
Example no. 19
    def connect(self):  # noqa: C901
        """The first step of every import is a technical step. A connection need to be setup to
        connect to a database, filesystem, API, ...

        :return:
        """

        # Get manually added config, or config based on application name
        datastore_config = self.source.get(
            'application_config') or get_datastore_config(
                self.source['application'])

        read_config = {**self.source.get('read_config', {}), 'mode': self.mode}
        self.datastore = DatastoreFactory.get_datastore(
            datastore_config, read_config)
        self.datastore.connect()

        logger.info(
            f"Connection to {self.app} {self.datastore.user} has been made.")
Example no. 20
def _check_file(check, filename, stats):
    """
    Test if all checks that have been defined for the given file are OK

    :param check: Checks to apply to the statistics
    :param filename: Name of the file to check
    :param stats: Statistics of the file
    :return: True if all checks succeed
    """
    total_result = True
    _check_uniqueness(check)

    for key, margin in check.items():
        # Get corresponding value for check
        if key not in stats:
            logger.warning(f"Value missing for {key} check in {filename}")
            continue
        value = stats[key]
        if len(margin) == 1:
            result = value == margin[0]
            formatted_margin = f"= {_fmt(margin[0])}"
        elif margin[0] is None:
            result = value <= margin[1]
            formatted_margin = f"<= {_fmt(margin[1])}"
        elif margin[1] is None:
            result = value >= margin[0]
            formatted_margin = f">= {_fmt(margin[0])}"
        else:
            result = margin[0] <= value <= margin[1]
            formatted_margin = f"{_fmt(margin[0])} - {_fmt(margin[1])}"
        total_result = total_result and result

        # Report any errors for the given filename as a group
        str_value = f"{value:,.2f}".replace(
            ".00", "") if type(value) in [float, int] else value
        extra_data = {'id': filename, 'data': {key: str_value}}
        if result:
            extra_data['id'] += " OK"
            logger.info("OK", extra_data)
        else:
            extra_data['data']['margin'] = formatted_margin
            logger.error("Check FAIL", extra_data)
    return total_result
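The margin lists follow four conventions: a single value means exact equality, [None, max] an upper bound, [min, None] a lower bound, and [min, max] an inclusive range. A standalone sketch of that evaluation logic (hypothetical helper name, illustrative values, no logging):

def evaluate_margin(value, margin):
    """Evaluate a statistic against a margin list, mirroring _check_file."""
    if len(margin) == 1:
        return value == margin[0]           # absolute value
    if margin[0] is None:
        return value <= margin[1]           # maximum only
    if margin[1] is None:
        return value >= margin[0]           # minimum only
    return margin[0] <= value <= margin[1]  # within range

assert evaluate_margin(24, [24])
assert evaluate_margin(10, [None, 24])
assert evaluate_margin(30, [24, None])
assert not evaluate_margin(50, [0, 42])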
Example no. 21
    def dump_collection(self, schema, catalog_name, collection_name, force_full=False):
        """
        Dump a catalog collection into a remote database in the given schema

        If the dump fails, the operation is retried up to MAX_TRIES times,
        with a wait of RETRY_TIMEOUT seconds between tries.

        :param schema:
        :param catalog_name:
        :param collection_name:
        :param force_full:
        :return:
        """
        tries = 0
        while tries < Dumper.MAX_TRIES:
            tries += 1
            logger.info(f"Try {tries}: dump {catalog_name} - {collection_name}")
            if self.try_dump_collection(schema, catalog_name, collection_name, force_full):
                # On Successful dump
                return
            # Wait RETRY_TIMEOUT seconds before next try
            time.sleep(self.RETRY_TIMEOUT)
        logger.error(f'Export {catalog_name}-{collection_name} failed after {Dumper.MAX_TRIES} tries')
Example no. 22
def update_materialized_view(msg):
    """Updates materialized view for a relation for a given catalog, collection and attribute or relation name.

    Expects a message with headers:
    - catalogue
    - collection (if catalogue is 'rel' this should be the relation_name)
    - attribute (optional if catalogue is 'rel')

    examples of correct headers that are functionally equivalent:
    header = {
        "catalogue": "meetbouten",
        "collection": "meetbouten",
        "attribute": "ligt_in_buurt",
    }
    header = {
        "catalogue": "rel",
        "collection": "mbn_mbt_gbd_brt_ligt_in_buurt",
    }

    :param msg:
    :return:
    """
    header = msg.get('header', {})
    catalog_name = header.get('catalogue')
    collection_name = header.get('collection')
    attribute_name = header.get('attribute')

    logger.configure(msg, "UPDATE_VIEW")
    storage_handler = GOBStorageHandler()

    view = _get_materialized_view(catalog_name, collection_name, attribute_name)
    view.refresh(storage_handler)
    logger.info(f"Update materialized view {view.name}")

    timestamp = datetime.datetime.utcnow().isoformat()
    msg['header'].update({
        "timestamp": timestamp
    })

    return msg
Example no. 23
    def replace_header_references(uniques: list, header: list):
        """
        Replaces column names in a uniques list with column indexes (1-based)

        Example, with header A;B;C;D;E;F:
            replace_header_references(['A', 'B', 'D'], header) => [1, 2, 4]
            replace_header_references([1, 2, 5], header) => [1, 2, 5]  # Leave as is

        :param uniques:
        :param header:
        :return:
        """
        replaced = [
            header.index(col) + 1 if isinstance(col, str) else col
            for col in uniques
        ]

        if uniques != replaced:
            logger.info(
                f"Interpreting columns {str(uniques)} as {str(replaced)}")

        return replaced
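The core mapping can be reproduced standalone; a minimal sketch with a hypothetical six-column header:

header = ["A", "B", "C", "D", "E", "F"]
uniques = ["A", "B", "D"]

# Column names are replaced by their 1-based positions;
# numeric references would be left untouched.
replaced = [
    header.index(col) + 1 if isinstance(col, str) else col
    for col in uniques
]
assert replaced == [1, 2, 4]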
Example no. 24
def data_consistency_test_handler(msg):
    """Request to run data consistency tests.

    :param msg:
    :return:
    """
    catalog = msg['header'].get('catalogue')
    collection = msg['header'].get('collection')
    application = msg['header'].get('application')
    msg['header']['entity'] = msg['header'].get('entity', collection)

    logger.configure(msg, 'Data consistency test')

    assert all([catalog, collection
                ]), "Expecting header attributes 'catalogue' and 'collection'"
    id = f"{catalog} {collection} {application or ''}"
    # No return value. Results are captured by logger.
    logger.info(f"Data consistency test {id} started")
    try:
        DataConsistencyTest(catalog, collection, application).run()
    except GOBConfigException as e:
        logger.error(f"Dataset connection failed: {str(e)}")
    except (NotImplementedCatalogError, NotImplementedApplicationError,
            GOBException) as e:
        logger.error(f"Dataset test failed: {str(e)}")
    else:
        logger.info(f"Data consistency test {id} ended")

    return {
        'header': {
            **msg.get('header', {}),
            'timestamp':
            datetime.datetime.utcnow().isoformat(),
        },
        'summary': logger.get_summary(),
    }
Example no. 25
    def try_dump_collection(self, schema, catalog_name, collection_name, force_full=False):
        """
        Try to dump the given catalog collection in the given schema

        The dump is performed by issuing an API POST request to the GOB API.

        :param schema:
        :param catalog_name:
        :param collection_name:
        :return:
        """
        url = f"{self.dump_api}/dump/{catalog_name}/{collection_name}/"
        data = {
            "db": self.db_config,
            "schema": schema,
            "include_relations": False,
            "force_full": force_full,
        }
        headers = {
            "Content-Type": "application/json"
        }

        logger.info(f"Dump {catalog_name} - {collection_name} (schema: {schema})")
        start_request = time.time()
        success = False
        try:
            result = requests.post(
                url=url,
                json=data,
                headers=self.update_headers(url, headers),
                stream=True
            )

            last_line = ""
            start_line = time.time()
            for line in result.iter_lines(chunk_size=1):
                last_line = line.decode()
                end_line = time.time()
                logger.info(f"{last_line} ({(end_line - start_line):.2f} / {(end_line - start_request):.2f} secs)")
                start_line = time.time()
        except Exception as e:
            logger.warning(f'Export {catalog_name}-{collection_name} failed: {str(e)}')
        else:
            success = re.match(r'Export completed', last_line) is not None
            if not success:
                logger.warning(f'Export {catalog_name}-{collection_name} completed with errors')
        finally:
            end_request = time.time()
            logger.info(f"Elapsed time: {(end_request - start_request):.2f} secs")
        return success
Example no. 26
def process_relate(msg: dict):
    """
    This function starts the actual relate process. The message is checked for completeness and the Relater
    builds the new or updated relations and returns the result to be compared as if it were the result
    of an import job.

    :param msg: a message from the broker containing the catalog and collections (optional)
    :return: the result message of the relate process
    """
    logger.configure(msg, "RELATE SRC")

    _check_message(msg)
    header = msg.get('header')

    logger.info("Relate table started")

    full_update = header.get('mode', "update") == "full"

    if full_update:
        logger.info("Full relate requested")

    updater = Relater(header[CATALOG_KEY], header[COLLECTION_KEY], header[ATTRIBUTE_KEY])

    filename, confirms = updater.update(full_update)

    logger.info("Relate table completed")

    relation_name = get_relation_name(GOBModel(), header[CATALOG_KEY], header[COLLECTION_KEY], header[ATTRIBUTE_KEY])

    result_msg = {
        "header": {
            **msg["header"],
            "catalogue": "rel",
            "collection": relation_name,
            "entity": relation_name,
            "source": "GOB",
            "application": "GOB",
            "version": RELATE_VERSION,
            "timestamp": msg.get("timestamp", datetime.datetime.utcnow().isoformat()),
        },
        "summary": logger.get_summary(),
        "contents_ref": filename,
        "confirms": confirms,
    }

    return result_msg
Example no. 27
def full_update(msg):
    """Store the events for the current dataset

    :param msg: the message containing the events to be stored
    :return: Result message
    """
    logger.configure(msg, "UPDATE")
    logger.info(
        f"Update to GOB Database {GOBStorageHandler.user_name} started")

    # Interpret the message header
    message = ImportMessage(msg)
    metadata = message.metadata

    storage = GOBStorageHandler(metadata)
    model = f"{metadata.source} {metadata.catalogue} {metadata.entity}"
    logger.info(f"Store events {model}")

    # Get events from message
    events = msg["contents"]

    # Gather statistics of update process
    stats = UpdateStatistics()

    _process_events(storage, events, stats)

    # Build result message
    results = stats.results()

    stats.log()
    logger.info(f"Store events {model} completed", {'data': results})

    results.update(logger.get_summary())

    # Return the result message, with no log, no contents but pass-through any confirms
    message = {
        "header": msg["header"],
        "summary": results,
        "contents": None,
        "confirms": msg.get('confirms')
    }
    return message
Example no. 28
def check_relation(msg):
    """
    Check for any dangling relations

    :param msg:
    :return:
    """
    header = msg.get('header', {})
    catalog_name = header.get('original_catalogue')
    collection_name = header.get('original_collection')
    attribute_name = header.get('original_attribute')

    model = GOBModel()

    logger.configure(msg, "RELATE_CHECK")
    logger.info("Relate check started")

    collection = model.get_collection(catalog_name, collection_name)
    assert collection is not None, f"Invalid catalog/collection combination {catalog_name}/{collection_name}"

    reference = model._extract_references(collection['attributes']).get(attribute_name)

    try:
        is_very_many = reference['type'] == fully_qualified_type_name(VeryManyReference)
        check_function = check_very_many_relations if is_very_many else check_relations
        check_function(catalog_name, collection_name, attribute_name)
    except Exception as e:
        _log_exception(f"{attribute_name} check FAILED", e)

    logger.info("Relation conflicts check started")
    check_relation_conflicts(catalog_name, collection_name, attribute_name)

    logger.info("Relate check completed")

    return {
        "header": msg["header"],
        "summary": logger.get_summary(),
        "contents": None
    }
Example no. 29
def test(catalogue):
    """
    Test export files for a given catalogue

    :param catalogue: catalogue to test
    :return: None
    """
    logger.info(f"Test export for catalogue {catalogue}")

    logger.info("Connect to Objectstore")

    config = get_datastore_config(GOB_OBJECTSTORE)
    datastore = DatastoreFactory.get_datastore(config)
    datastore.connect()
    container_name = CONTAINER_BASE

    logger.info(f"Load files from {container_name}")
    conn_info = {
        "connection": datastore.connection,
        "container": container_name
    }

    # Get test definitions for the given catalogue
    checks = _get_checks(conn_info, catalogue)

    # Make proposals for any missing test definitions
    proposals = {}
    for config in _export_config[catalogue]:
        resolve_config_filenames(config)

        for name, product in config.products.items():
            filenames = [product['filename']] + [
                product['filename']
                for product in product.get('extra_files', [])
            ]

            for filename in filenames:
                # Check the previously exported file at its temporary location
                obj_info, obj = _get_file(
                    conn_info, f"{EXPORT_DIR}/{catalogue}/{filename}")

                # Clone check so that changes to the check file don't affect other runs
                check = copy.deepcopy(_get_check(checks, filename))

                # Report results with the name of the matched file
                matched_filename = obj_info['name'] if obj_info else filename

                if obj_info is None:
                    logger.error(f"File {filename} MISSING")
                elif check:
                    stats = _get_analysis(obj_info, obj, check)
                    if _check_file(check, matched_filename, stats):
                        logger.info(f"Check {matched_filename} OK")
                        # Copy the file to its final location
                        distribute_file(conn_info, matched_filename)
                    else:
                        logger.info(f"Check {matched_filename} FAILED")
                    _propose_check_file(proposals, filename, obj_info, obj)
                else:
                    logger.warning(f"File {filename} UNCHECKED")
                    # Do not copy unchecked files
                    _propose_check_file(proposals, filename, obj_info, obj)

    # Write out any missing test definitions
    _write_proposals(conn_info, catalogue, checks, proposals)
Example no. 30
def compare(msg):
    """Compare new data in msg (contents) with the current data

    :param msg: The new data, including header and summary
    :return: result message
    """
    logger.configure(msg, "COMPARE")
    header = msg.get('header', {})
    mode = header.get('mode', FULL_UPLOAD)
    logger.info(
        f"Compare (mode = {mode}) to GOB Database {GOBStorageHandler.user_name} started"
    )

    # Parse the message header
    message = ImportMessage(msg)
    metadata = message.metadata

    # Get the model for the collection to be compared
    gob_model = GOBModel()
    entity_model = gob_model.get_collection(metadata.catalogue,
                                            metadata.entity)

    # Initialize a storage handler for the collection
    storage = GOBStorageHandler(metadata)
    model = f"{metadata.source} {metadata.catalogue} {metadata.entity}"
    logger.info(f"Compare {model}")

    stats = CompareStatistics()

    tmp_table_name = None
    with storage.get_session():
        with ProgressTicker("Collect compare events", 10000) as progress:
            # Check any dependencies
            if not meets_dependencies(storage, msg):
                return {
                    "header": msg["header"],
                    "summary": logger.get_summary(),
                    "contents": None
                }

            enricher = Enricher(storage, msg)
            populator = Populator(entity_model, msg)

            # If there are no records in the database all data are ADD events
            initial_add = not storage.has_any_entity()
            if initial_add:
                logger.info("Initial load of new collection detected")
                # Write ADD events directly, without using a temporary table
                contents_writer = ContentsWriter()
                contents_writer.open()
                # Pass a None confirms_writer because only ADD events are written
                collector = EventCollector(contents_writer,
                                           confirms_writer=None,
                                           version=entity_model['version'])
                collect = collector.collect_initial_add
            else:
                # Collect entities in a temporary table
                collector = EntityCollector(storage)
                collect = collector.collect
                tmp_table_name = collector.tmp_table_name

            for entity in msg["contents"]:
                progress.tick()
                stats.collect(entity)
                enricher.enrich(entity)
                populator.populate(entity)
                collect(entity)

            collector.close()

    if initial_add:
        filename = contents_writer.filename
        confirms = None
        contents_writer.close()
    else:
        # Compare entities from temporary table
        with storage.get_session():
            diff = storage.compare_temporary_data(tmp_table_name, mode)
            filename, confirms = _process_compare_results(
                storage, entity_model, diff, stats)

    # Build result message
    results = stats.results()

    logger.info(f"Compare {model} completed", {'data': results})

    results.update(logger.get_summary())

    message = {
        "header": msg["header"],
        "summary": results,
        "contents_ref": filename,
        "confirms": confirms
    }

    return message