Example #1
    def test_load_dataset(self):

        data_archive = eurobis_dataset.EurobisDataset()
        data_archive.load_dataset(self.test_dataset)

        assert len(data_archive.event_recs) == self.test_dataset_events
        assert len(data_archive.occurrence_recs) == self.test_dataset_occurences
        assert len(data_archive.emof_recs) == self.test_dataset_emof
        assert data_archive.imis_das_id == self.test_imis_no

        # Every eMoF record must land in exactly one index bucket
        sum_len = 0
        for key in data_archive.emof_indices:
            sum_len += len(data_archive.emof_indices[key])

        assert sum_len == len(data_archive.emof_recs)

        # Second dataset where core = 1
        test_dataset2 = 558
        data_archive2 = eurobis_dataset.EurobisDataset()
        data_archive2.load_dataset(test_dataset2)

        sum_len = 0
        for key in data_archive2.emof_indices:
            sum_len += len(data_archive2.emof_indices[key])

        assert sum_len == len(data_archive2.emof_recs)
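
The last assertion checks an invariant: every eMoF record lands in exactly one bucket of emof_indices, so the bucket sizes sum to the total record count. That holds whenever the index is built by plain grouping. Below is a minimal sketch of such a grouping; build_emof_index is a hypothetical name, and the key layout is borrowed from the lookups in Example #6 (the real EurobisDataset may build it differently).

from collections import defaultdict

def build_emof_index(emof_recs):
    """Hypothetical sketch: group eMoF records by their parent key."""
    index = defaultdict(list)
    for rec in emof_recs:
        # Key layout mirrors the lookups in Example #6
        key = f"{rec['dataprovider_id']}_" \
              f"{'NULL' if rec['eventID'] is None else rec['eventID']}_" \
              f"{'NULL' if rec['occurrenceID'] is None else rec['occurrenceID']}"
        index[key].append(rec)
    # Each record is appended exactly once, so
    # sum(len(v) for v in index.values()) == len(emof_recs)
    return index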
Example #2
    def test_get_eml(self):
        """ retrieving EML from the IMIS service to extract
            areas after having queried the DB for dataset info """

        data_archive = eurobis_dataset.EurobisDataset()
        data_archive.get_provider_data(self.test_dataset)
        data_archive.get_areas_from_eml(data_archive.imis_das_id)

        this.logger.info(data_archive.areas)
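
The tests log through this.logger instead of a bare module-level logger. A minimal sketch of that idiom, assuming each module configures itself roughly like this (the project's actual logging setup may differ):

import logging
import sys

# "this" is the module object itself, so this.logger refers to the same
# attribute whether it is accessed from inside or outside the module.
this = sys.modules[__name__]
this.logger = logging.getLogger(__name__)
this.logger.addHandler(logging.StreamHandler())
this.logger.setLevel(logging.INFO)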
Example #3
    def test_get_eml_negative(self):
        """ Verify that the solution can handle cases when EML data
            is not available (it does not fail for that reason) """

        fake_imis_das_id = 10000
        data_archive = eurobis_dataset.EurobisDataset()
        data_archive.get_areas_from_eml(fake_imis_das_id)

        # Negative
        this.logger.info(
            f"Found {'no interesting' if data_archive.areas is None else len(data_archive.areas)} "
            f"areas in IMIS Dataset N. {fake_imis_das_id}")
        # Positive
        data_archive.get_areas_from_eml(self.test_imis_no)
        this.logger.info(
            f"Found {'no interesting' if data_archive.areas is None else len(data_archive.areas)} "
            f"areas in IMIS Dataset N. {self.test_imis_no}")
Example #4
    def test_query_builder_emof(self):
        """ Retrieving eMof records in a dataset from MS SQL
            using the assembled query string """

        data_archive = eurobis_dataset.EurobisDataset()

        sql_string = data_archive.query_builder_emof(self.test_dataset)

        # Try to execute it; reuse the open connection if there is one
        if not mssql_db_functions.conn:
            conn = mssql_db_functions.open_db()
        else:
            conn = mssql_db_functions.conn

        cursor = conn.cursor()
        cursor.execute(sql_string)

        columns = [column[0] for column in cursor.description]
        records = []
        for row in cursor:
            records.append(dict(zip(columns, row)))

        assert len(records) == self.test_dataset_emof
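
The columns/zip loop above is the standard pyodbc recipe for materialising rows as dicts. Factored into a reusable helper (fetch_all_as_dicts is our name, not the project's):

def fetch_all_as_dicts(conn, sql_string):
    """Hypothetical helper: execute a query and return each row as a
    column-name -> value dict, exactly as the test does inline."""
    cursor = conn.cursor()
    cursor.execute(sql_string)
    columns = [column[0] for column in cursor.description]
    return [dict(zip(columns, row)) for row in cursor]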
Example #5
def dataset_qc_labeling(dataset_id,
                        disable_index=True,
                        with_logging=True,
                        pool_no=0):
    """ Processes an eurobis dataset if it is passed as a dataset_id,
        shall popup a file chooser dialog if this is None
        :param dataset_id (The dataset identifier from the dataproviderstable)
        :param disable_index: Whether we are eventually allowed to disable the index at this level
        :param with_logging (every QC passed is printed)
        """

    if dataset_id is None:
        this.logger.warning(
            "Call to dataset_qc_labeling with no dataset_id")
        return None

    data_archive = eurobis_dataset.EurobisDataset()
    data_archive.load_dataset(dataset_id)

    if with_logging:
        this.logger.info(f"--------------------------------------------------")
        this.logger.info(
            f"Loaded dataset {data_archive.dataset_name}, id = {data_archive.dataprovider_id} "
        )
        this.logger.info(
            f"Number of event records: {len(data_archive.event_recs)}")
        this.logger.info(
            f"Number of occurrence records: {len(data_archive.occurrence_recs)}"
        )
        this.logger.info(
            f"Number of emof records: {len(data_archive.emof_recs)}")
        this.logger.info(f"Interesting areas: {data_archive.areas}")
        this.logger.info(f"Imis dataset ID: {data_archive.imis_das_id}")
        this.logger.info(
            f"Good metadata: {'OK' if data_archive.goodmetadata else 'Not OK'}")
        this.logger.info(
            f"Type of core records: "
            f"{'Event' if data_archive.darwin_core_type == data_archive.EVENT else 'Occurrence'}")
        this.logger.info(f"Poolno: {pool_no}")
        this.logger.info(f"--------------------------------------------------")

    # Starting the QCs:
    # After loading, measure processing time
    time_start = time.time()

    # Proceed top-down...
    if data_archive.darwin_core_type == data_archive.EVENT:
        this.logger.info(f"1A. Event")
        # For each event record: QC the event, then its occurrence records
        # (which recurse into the eMoF), then the event's own eMoF,
        # and OR all the results into the record
        for record in data_archive.event_recs:
            # qc_event also takes care of the eMoF attached to the event
            qc_ev = qc_event(record, data_archive)
            record["qc"] |= qc_ev

            # Generate key and lookup occurrences...
            key = f"{record['dataprovider_id']}_{record['eventID']}"
            if key in data_archive.occ_indices:
                for occ_record in data_archive.occ_indices[key]:
                    # qc_occurrence also takes care of the eMoF attached to the occurrence
                    qc_occ = qc_occurrence(occ_record, data_archive)
                    # Check that the combination of event and occurrence has
                    # the required fields; assign the result to the occurrence
                    # (consequence of email 24/01/2021)
                    occ_record["qc"] |= required_fields.check_ev_occ_required(
                        record, occ_record, False)
                    occ_record["qc"] |= qc_occ  # ensure it is assigned, not just calculated
                    occ_record["qc"] |= qc_ev  # occurrences also inherit the parent event QC (email 24/01/2021)
                    # qc_ev |= qc_occ  # No aggregation upwards (email 24/01/2021)

                # No longer true after email 24/01/2021
                # Needs to propagate the REQUIRED FIELDS CHECK for the event and its occurrences
                # qc_req_agg = [record]
                # qc_req_agg.extend(data_archive.occ_indices[key])
                # record["qc"] |= required_fields.check_aggregate(qc_req_agg)

    else:  # Only occurrence and emof records
        this.logger.info(f"1B. Occurrence and eMoF")
        for occ_record in data_archive.occurrence_recs:
            # The QC is either 0 or a QC mask - eMoF records are handled
            # inside the occurrence QC
            qc_occurrence(occ_record, data_archive)

    # Are there any lookups left to do (any record type)?
    if len(data_archive.records_for_lookup):
        location.check_xy(data_archive.records_for_lookup)
        data_archive.pyxylookup_counter += 1
        dateTimeObj = datetime.now()
        this.logger.debug(
            f"{dateTimeObj}: Lookups C: {data_archive.pyxylookup_counter}")

        # Propagate the QC of looked-up occurrence records to their parent event
        if data_archive.darwin_core_type == data_archive.EVENT:
            for looked_up_record in data_archive.records_for_lookup:
                if looked_up_record["DarwinCoreType"] == data_archive.OCCURRENCE:
                    key = f"{looked_up_record['dataprovider_id']}_{looked_up_record['eventID']}"
                    if key in data_archive.event_indices:
                        data_archive.event_indices[key][0]["qc"] |= looked_up_record["qc"]

        # Empty the list
        data_archive.records_for_lookup = []

    # Disable the QC index - if necessary
    if disable_index:
        if len(data_archive.event_recs) + len(
                data_archive.occurrence_recs) > data_archive.INDEX_TRESHOLD:
            eurobis_dataset.EurobisDataset.disable_qc_index()

    # RECORDS UPDATE!
    this.PROCESS_BATCH_SIZE = 1000  # Shall commit at every batch

    # EVENTS
    if len(data_archive.event_recs):
        # Getting the splits
        split_events_lists = misc.split_in_chunks(data_archive.event_recs,
                                                  this.PROCESS_BATCH_SIZE)

        for idx, process_batch in enumerate(split_events_lists):
            eurobis_dataset.EurobisDataset.update_record_qc(
                process_batch, idx, this.PROCESS_BATCH_SIZE,
                data_archive.dataprovider_id, data_archive.EVENT)

    # OCCURRENCES
    if len(data_archive.occurrence_recs):
        # Getting the splits
        split_occurrences_lists = misc.split_in_chunks(
            data_archive.occurrence_recs, this.PROCESS_BATCH_SIZE)
        for idx, process_batch in enumerate(split_occurrences_lists):
            eurobis_dataset.EurobisDataset.update_record_qc(
                process_batch, idx, this.PROCESS_BATCH_SIZE,
                data_archive.dataprovider_id, data_archive.OCCURRENCE)

    # REBUILD QC index
    if disable_index:
        if len(data_archive.event_recs) + len(
                data_archive.occurrence_recs) > data_archive.INDEX_TRESHOLD:
            eurobis_dataset.EurobisDataset.rebuild_qc_index()

    duration = time.time() - time_start
    # Dataset QC finished, taking note of the time.

    if with_logging:
        this.logger.info(
            f"Total net processing time for dataset {data_archive.dataprovider_id} "
            f"({data_archive.dataset_name}): {duration:.2f} s")
Example #6
def process_random_record(with_logging=True):
    """ select a random dataset, then a random core event from it and perform QC """

    # This selects 1 dataset having less than 10000 events/occurrences
    # reported in the dataproviders table.
    # To select event-based datasets only, add "where d.core = 2" between
    # "e.dataprovider_id" and "group by".
    sql_random_dataset = "SELECT TOP 1 d.id, count(e.dataprovider_id) FROM dataproviders d " \
                         "inner join eurobis e on d.id = e.dataprovider_id group by d.id " \
                         "having count(e.dataprovider_id) < 10000 ORDER BY NEWID()"

    # Go and get the id!
    dataset_id = None

    # Connect to the database to get dataset list
    if not mssql.conn:
        mssql.open_db()

    if mssql.conn is None:
        # Should find a way to exit and advise the caller
        this.logger.error("No connection to DB, nothing can be done!")
        exit(1)
    else:
        # Fetch one random dataset id
        cur = mssql.conn.cursor()
        cur.execute(sql_random_dataset)
        dataset = cur.fetchone()
        dataset_id = dataset[0]

    data_archive = eurobis_dataset.EurobisDataset()
    data_archive.load_dataset(dataset_id)

    if with_logging:
        this.logger.info(f"--------------------------------------------------")
        this.logger.info(
            f"Loaded dataset {data_archive.dataset_name}, id = {data_archive.dataprovider_id} "
        )
        this.logger.info(
            f"Number of event records: {len(data_archive.event_recs)}")
        this.logger.info(
            f"Number of occurrence records: {len(data_archive.occurrence_recs)}"
        )
        this.logger.info(
            f"Number of emof records: {len(data_archive.emof_recs)}")
        this.logger.info(f"Interesting areas: {data_archive.areas}")
        this.logger.info(f"Imis dataset ID: {data_archive.imis_das_id}")
        this.logger.info(
            f"Type of core records: "
            f"{'Event' if data_archive.darwin_core_type == data_archive.EVENT else 'Occurrence'}")
        this.logger.info(f"--------------------------------------------------")

    # Now everything is in data_archive; select a random CORE record and its
    # children, calculate the QC and display all records involved in the result.
    # Proceed top-down as in the pipeline...
    if data_archive.darwin_core_type == data_archive.EVENT:
        # select random core event:
        record_idx = randint(0, len(data_archive.event_recs) - 1)
        record = data_archive.event_recs[record_idx]

        # make sure we start "empty" (0, so that the |= below works)
        record["qc"] = 0

        # Perform basic QC:
        qc_ev = mssql_pipeline.qc_event(record, data_archive)
        record["qc"] |= qc_ev  # Make sure it is stamped

        # Generate key and lookup occurrences...
        key = f"{record['dataprovider_id']}_{record['eventID']}"
        if key in data_archive.occ_indices:
            for occ_record in data_archive.occ_indices[key]:
                # qc_occurrence also takes care of the eMoF attached to the occurrence
                qc_occ = mssql_pipeline.qc_occurrence(occ_record, data_archive)
                qc_occ |= required_fields.check_ev_occ_required(
                    record, occ_record, False)
                occ_record['qc'] |= qc_occ  # also give to the occurrence record
                occ_record['qc'] |= qc_ev  # inherits the event QC (email 24/01/2021)

            # No longer true as per email 24/01/2021
            # Needs to propagate the REQUIRED FIELDS CHECK for the event and its occurrences
            # qc_req_agg = [record]
            # qc_req_agg.extend(data_archive.occ_indices[key])
            # record["qc"] |= required_fields.check_aggregate(qc_req_agg)
            # qc_ev |= record["qc"]

        # Are there any lookups left to do (any record type)?
        if len(data_archive.records_for_lookup):
            location.check_xy(data_archive.records_for_lookup)

            # Need to propagate the (new) QC of the events down to the occurrence records
            for looked_up_record in data_archive.records_for_lookup:
                if looked_up_record["DarwinCoreType"] == data_archive.EVENT:
                    key = f"{looked_up_record['dataprovider_id']}_{looked_up_record['eventID']}"
                    if key in data_archive.occ_indices:
                        for occ_record in data_archive.occ_indices[key]:
                            occ_record["qc"] |= looked_up_record["qc"]

        this.logger.info(f"Calculated quality mask: {qc_ev}, consisting of:")
        this.logger.info(
            f"QC NUMBERS: -------------> {QCFlag.decode_numbers(record['qc'])}"
        )
        this.logger.info(
            f"QC FLAG NAMES: ----------> {QCFlag.decode_mask(record['qc'])}")
        this.logger.info(f"--------------------------------------------------")
        this.logger.info(f"Event Record: {record}")
        this.logger.info(f"--------------------------------------------------")

        if key in data_archive.occ_indices:
            for occ_record in data_archive.occ_indices[key]:
                this.logger.info(f"Occurrence Record: {occ_record}")
                this.logger.info(
                    f"Calculated quality mask: {occ_record['qc']}, consisting of:"
                )
                this.logger.info(
                    f"QC NUMBERS: -------------> {QCFlag.decode_numbers(occ_record['qc'])}"
                )
                this.logger.info(
                    f"QC FLAG NAMES: ----------> {QCFlag.decode_mask(occ_record['qc'])}"
                )
                this.logger.info(
                    f"--------------------------------------------------")
                key_o = f"{occ_record['dataprovider_id']}_" \
                        f"{'NULL' if occ_record['eventID'] is None else occ_record['eventID']}_" \
                        f"{'NULL' if occ_record['occurrenceID'] is None else occ_record['occurrenceID']}"
                if key_o in data_archive.emof_indices:
                    for emof in data_archive.emof_indices[key_o]:
                        this.logger.info(f"eMoF Record: {emof}")
                        this.logger.info(
                            f"--------------------------------------------------"
                        )

        if key in data_archive.emof_indices:
            for emof in data_archive.emof_indices[key]:
                this.logger.info(f"eMoF Record for event: {emof}")
                this.logger.info(
                    f"--------------------------------------------------")

    else:
        # The QC is either 0 or a QC mask
        record_idx = randint(0, len(data_archive.occurrence_recs) - 1)
        record = data_archive.occurrence_recs[record_idx]
        qc_occ = mssql_pipeline.qc_occurrence(record, data_archive)

        # Are there any lookups left to do (any record type)?
        if len(data_archive.records_for_lookup):
            location.check_xy(data_archive.records_for_lookup)

            for lookup_record in data_archive.records_for_lookup:
                record['qc'] |= lookup_record["qc"]
                qc_occ |= lookup_record["qc"]

            data_archive.records_for_lookup = []

        this.logger.info(f"Calculated quality mask: {qc_occ}, consisting of:")
        this.logger.info(
            f"QC NUMBERS: -------------> {QCFlag.decode_numbers(qc_occ)}")
        this.logger.info(
            f"QC FLAG NAMES: ----------> {QCFlag.decode_mask(qc_occ)}")
        this.logger.info(f"--------------------------------------------------")
        this.logger.info(f"Occurrence Record: {record}")
        this.logger.info(f"--------------------------------------------------")

        key_o = f"{record['dataprovider_id']}_NULL_" \
                f"{'NULL' if record['occurrenceID'] is None else record['occurrenceID']}"
        if key_o in data_archive.emof_indices:
            for emof in data_archive.emof_indices[key_o]:
                this.logger.info(f"eMoF Record: {emof}")
                this.logger.info(
                    f"--------------------------------------------------")