def test_load_dataset(self):
    """Load the configured test dataset and verify record counts and eMoF indexing."""
    archive = eurobis_dataset.EurobisDataset()
    archive.load_dataset(self.test_dataset)

    assert len(archive.event_recs) == self.test_dataset_events
    assert len(archive.occurrence_recs) == self.test_dataset_occurences
    assert len(archive.emof_recs) == self.test_dataset_emof
    assert archive.imis_das_id == self.test_imis_no

    # Every eMoF record must be reachable through the index buckets.
    indexed_total = sum(len(bucket) for bucket in archive.emof_indices.values())
    assert indexed_total == len(archive.emof_recs)

    # Second dataset where core = 1
    dataset_id_core1 = 558
    archive_core1 = eurobis_dataset.EurobisDataset()
    archive_core1.load_dataset(dataset_id_core1)
    indexed_total = sum(len(bucket) for bucket in archive_core1.emof_indices.values())
    assert indexed_total == len(archive_core1.emof_recs)
def test_get_eml(self):
    """Query the DB for dataset/provider info, then retrieve the EML from the
    IMIS service and extract the interesting areas."""
    archive = eurobis_dataset.EurobisDataset()
    archive.get_provider_data(self.test_dataset)
    archive.get_areas_from_eml(archive.imis_das_id)
    this.logger.info(archive.areas)
def test_get_eml_negative(self):
    """Verify EML retrieval tolerates a missing dataset (it must not fail)
    and still works for a known-good IMIS dataset id."""
    fake_imis_das_id = 10000  # Negative case: no EML available for this id
    archive = eurobis_dataset.EurobisDataset()

    # First the bogus id (negative), then the configured real one (positive).
    for das_id in (fake_imis_das_id, self.test_imis_no):
        archive.get_areas_from_eml(das_id)
        found = 'No interesting' if archive.areas is None else len(archive.areas)
        this.logger.info(f"Found {found} areas in IMIS Dataset N,{das_id}")
def test_query_builder_emof(self):
    """Build the eMoF SQL query for the test dataset, run it against MS SQL and
    check the row count matches the known eMoF count.

    Fixes over the previous version: the cursor is now closed (it was leaked on
    the shared module-level connection) and the record list is built with a
    comprehension instead of a manual append loop.
    """
    data_archive = eurobis_dataset.EurobisDataset()
    sql_string = data_archive.query_builder_emof(self.test_dataset)

    # Reuse the module-level connection when present; open one otherwise.
    if not mssql_db_functions.conn:
        conn = mssql_db_functions.open_db()
    else:
        conn = mssql_db_functions.conn

    cursor = conn.cursor()
    try:
        cursor.execute(sql_string)
        columns = [column[0] for column in cursor.description]
        records = [dict(zip(columns, row)) for row in cursor]
    finally:
        cursor.close()  # do not leak the cursor on the shared connection

    assert len(records) == self.test_dataset_emof
def dataset_qc_labeling(dataset_id, disable_index=True, with_logging=True, pool_no=0):
    """Run the full QC pipeline over one EurOBIS dataset and write the QC flags back.

    Loads the dataset's event / occurrence / eMoF records, computes the QC bitmask
    for each record (event QC is propagated down to its occurrences), resolves any
    pending x/y lookups, and finally updates the records in the DB in batches.

    :param dataset_id: The dataset identifier from the dataproviders table; if None,
                       a warning is logged and the function returns None
    :param disable_index: Whether we are eventually allowed to disable the QC index
                          at this level (only done above INDEX_TRESHOLD records)
    :param with_logging: every QC passed is printed
    :param pool_no: worker pool number, used for logging only
    """
    if dataset_id is None:
        this.logger.warning("WARNING: Call to dataset_qc_labeling with no dataset_id ")
        return None

    # Load every record of the dataset (events, occurrences, eMoF) plus its metadata.
    data_archive = eurobis_dataset.EurobisDataset()
    data_archive.load_dataset(dataset_id)

    if with_logging:
        this.logger.info(f"--------------------------------------------------")
        this.logger.info(f"Loaded dataset {data_archive.dataset_name}, id = {data_archive.dataprovider_id} ")
        this.logger.info(f"Number of event records: {len(data_archive.event_recs)}")
        this.logger.info(f"Number of occurrence records: {len(data_archive.occurrence_recs)}")
        this.logger.info(f"Number of emof records: {len(data_archive.emof_recs)}")
        this.logger.info(f"Interesting areas: {data_archive.areas}")
        this.logger.info(f"Imis dataset ID: {data_archive.imis_das_id}")
        this.logger.info(f"Good metadata: {'OK' if data_archive.goodmetadata == True else 'Not OK'}")
        this.logger.info(f"Type of core records: {'Event' if data_archive.darwin_core_type == 2 else 'Occurrence'}")
        this.logger.info(f"Poolno: {pool_no}")
        this.logger.info(f"--------------------------------------------------")

    # Starting the QCs:
    # After loading, measure processing time
    time_start = time.time()

    # Proceed top-down...
    if data_archive.darwin_core_type == data_archive.EVENT:
        this.logger.info(f"1A. Event")
        # For all event records, qc event, then occurrence records
        # (which shall recurse into eMof), then own eMof and then "or" all
        for record in data_archive.event_recs:
            # qc_event shall also take care of emof for event
            # this.logger.info(f"1A. Event")
            qc_ev = qc_event(record, data_archive)
            record["qc"] |= qc_ev
            # Generate key and lookup occurrences...
            key = f"{record['dataprovider_id']}_{record['eventID']}"
            if key in data_archive.occ_indices:
                for occ_record in data_archive.occ_indices[key]:
                    # qc_occurrence sall also take care of emof for occurrence
                    qc_occ = qc_occurrence(occ_record, data_archive)
                    # Check that the combination of event and occurrence have the required fields. Assign to occurrence
                    # Consequence of email 24/01/2021
                    occ_record["qc"] |= required_fields.check_ev_occ_required(record, occ_record, False)
                    occ_record["qc"] |= qc_occ  # make sure it is assigned other than just calculated
                    occ_record["qc"] |= qc_ev  # Occurrence also inherit 'father' event qc (email 24/01/2021)
                    # qc_ev |= qc_occ  # No aggregation upwards (email 24/01/2021)

            # No longer true after email 24/01/2021
            # Needs to propagate the REQUIRED FIELDS CHECK for the event and its occurrences
            # qc_req_agg = [record]
            # qc_req_agg.extend(data_archive.occ_indices[key])
            # record["qc"] |= required_fields.check_aggregate(qc_req_agg)
    else:
        # Only occurrence and emof records
        this.logger.info(f"1B. Occurence and emof")
        for occ_record in data_archive.occurrence_recs:
            # The QC is either 0 or a QC mask - emof are considered inside the occurrence
            qc_occurrence(occ_record, data_archive)

    # Are there any lookups left to do (any record type)
    if len(data_archive.records_for_lookup):
        location.check_xy(data_archive.records_for_lookup)
        data_archive.pyxylookup_counter += 1
        dateTimeObj = datetime.now()
        this.logger.debug(f"{dateTimeObj}: Lookups C: {data_archive.pyxylookup_counter}")

        # Must propagate the QC of these records (in case)
        if data_archive.darwin_core_type == data_archive.EVENT:
            for looked_up_record in data_archive.records_for_lookup:
                if looked_up_record["DarwinCoreType"] == data_archive.OCCURRENCE:
                    key = f"{looked_up_record['dataprovider_id']}_{looked_up_record['eventID']}"
                    if key in data_archive.event_indices:
                        # event_indices maps a key to a list; the core event sits at [0]
                        data_archive.event_indices[key][0]["qc"] |= looked_up_record["qc"]

        # Empty the list
        data_archive.records_for_lookup = []

    # Disable QC index - if necessary (only worth it for big datasets)
    if disable_index:
        if len(data_archive.event_recs) + len(data_archive.occurrence_recs) > data_archive.INDEX_TRESHOLD:
            eurobis_dataset.EurobisDataset.disable_qc_index()

    # RECORDS UPDATE!
    this.PROCESS_BATCH_SIZE = 1000  # Shall commit at every batch

    # EVENTS
    if len(data_archive.event_recs):
        # Getting the splits
        split_events_lists = misc.split_in_chunks(data_archive.event_recs, this.PROCESS_BATCH_SIZE)
        for idx, process_batch in enumerate(split_events_lists):
            eurobis_dataset.EurobisDataset.update_record_qc(
                process_batch, idx, this.PROCESS_BATCH_SIZE,
                data_archive.dataprovider_id, data_archive.EVENT)

    # OCCURRENCES
    if len(data_archive.occurrence_recs):
        # Getting the splits
        split_occurrences_lists = misc.split_in_chunks(data_archive.occurrence_recs, this.PROCESS_BATCH_SIZE)
        for idx, process_batch in enumerate(split_occurrences_lists):
            eurobis_dataset.EurobisDataset.update_record_qc(
                process_batch, idx, this.PROCESS_BATCH_SIZE,
                data_archive.dataprovider_id, data_archive.OCCURRENCE)

    # REBUILD QC index (mirror of the disable above)
    if disable_index:
        if len(data_archive.event_recs) + len(data_archive.occurrence_recs) > data_archive.INDEX_TRESHOLD:
            eurobis_dataset.EurobisDataset.rebuild_qc_index()

    duration = time.time() - time_start
    # Dataset QC finished, taking note of the time.
    if with_logging:
        this.logger.info(f"Total net processing time for {data_archive.dataprovider_id} : "
                         f"{data_archive.dataset_name} in: {duration} ")
def process_random_record(with_logging=True):
    """Select a random dataset, then a random core record from it, perform QC and log the results.

    Diagnostic helper: picks one dataset with fewer than 10000 core records,
    QCs one random core record (plus its occurrences and eMoF when the dataset
    is event-based), and logs the decoded QC mask together with every record
    that contributed to it.

    :param with_logging: if True, the loaded dataset's summary info is printed
    """
    # To select a specific type of record,
    # This selects 1 dataset with less than 10000 events/occurrences reported in the dataproviders table
    # To select Event based data sets, add this between e.dataprovider_id and group_by: where d.core = 2
    sql_random_dataset = f"SELECT TOP 1 d.id, count(e.dataprovider_id) FROM dataproviders d " \
                         f" inner join eurobis e on d.id = e.dataprovider_id group by d.id " \
                         f" having count(e.dataprovider_id) < 10000 ORDER BY NEWID()"

    # Go and get the id!
    dataset_id = None

    # Connect to the database to get dataset list
    if not mssql.conn:
        mssql.open_db()

    if mssql.conn is None:
        # Should find a way to exit and advice
        this.logger.error("No connection to DB, nothing can be done! ")
        exit(0)
    else:
        # Fetch a random set of datasets
        cur = mssql.conn.cursor()
        cur.execute(sql_random_dataset)
        dataset = cur.fetchone()
        dataset_id = dataset[0]

    # Load the whole dataset (events, occurrences, eMoF) into memory.
    data_archive = eurobis_dataset.EurobisDataset()
    data_archive.load_dataset(dataset_id)

    if with_logging:
        this.logger.info(f"--------------------------------------------------")
        this.logger.info(f"Loaded dataset {data_archive.dataset_name}, id = {data_archive.dataprovider_id} ")
        this.logger.info(f"Number of event records: {len(data_archive.event_recs)}")
        this.logger.info(f"Number of occurrence records: {len(data_archive.occurrence_recs)}")
        this.logger.info(f"Number of emof records: {len(data_archive.emof_recs)}")
        this.logger.info(f"Interesting areas: {data_archive.areas}")
        this.logger.info(f"Imis dataset ID: {data_archive.imis_das_id}")
        this.logger.info(f"Type of core records: {'Event' if data_archive.darwin_core_type == 2 else 'Occurrence'}")
        this.logger.info(f"--------------------------------------------------")
        pass

    # Now everything is in data_archive, we must select a random CORE record, and its children, calculate QC and
    # display all records that originate that reasoning.
    # Proceed top-down as in pipeline ...
    if data_archive.darwin_core_type == data_archive.EVENT:
        # select random core event:
        record_idx = randint(0, len(data_archive.event_recs) - 1)
        record = data_archive.event_recs[record_idx]

        # make sure we start at "Empty"
        # NOTE(review): "qc" is set to None and then OR-assigned below; plain
        # "None |= int" raises TypeError — presumably qc_event's return type
        # (or an earlier assignment) makes this safe. Verify.
        record["qc"] = None

        # Perform basic QC:
        qc_ev = mssql_pipeline.qc_event(record, data_archive)
        record["qc"] |= qc_ev  # Make sure it is stamped

        # Generate key and lookup occurrences...
        key = f"{record['dataprovider_id']}_{record['eventID']}"
        if key in data_archive.occ_indices:
            for occ_record in data_archive.occ_indices[key]:
                # qc_occurrence sall also take care of emof for occurrence
                qc_occ = mssql_pipeline.qc_occurrence(occ_record, data_archive)
                qc_occ |= required_fields.check_ev_occ_required(record, occ_record, False)
                occ_record['qc'] |= qc_occ  # also give to occurrence record
                occ_record['qc'] |= qc_ev  # Inherits the event QC (email 24/01/2021)

            # No longer true as per email 24/01/2021
            # Needs to propagate the REQUIRED FIELDS CHECK for the event and its occurrences
            # qc_req_agg = [record]
            # qc_req_agg.extend(data_archive.occ_indices[key])
            # record["qc"] |= required_fields.check_aggregate(qc_req_agg)
            # qc_ev |= record["qc"]

        # Are there any lookups left to do (any record type)
        if len(data_archive.records_for_lookup):
            location.check_xy(data_archive.records_for_lookup)

            # Need to propagate the (new) QC of the events down to the occurrences records
            for looked_up_record in data_archive.records_for_lookup:
                if looked_up_record["DarwinCoreType"] == data_archive.EVENT:
                    key = f"{looked_up_record['dataprovider_id']}_{looked_up_record['eventID']}"
                    if key in data_archive.occ_indices:
                        for occ_record in data_archive.occ_indices[key]:
                            occ_record["qc"] |= looked_up_record["qc"]

        # Report the event's QC mask, decoded both as numbers and flag names.
        this.logger.info(f"Calculated quality mask: {qc_ev}, consisting of:")
        this.logger.info(f"QC NUMBERS: -------------> {QCFlag.decode_numbers(record['qc'])}")
        this.logger.info(f"QC FLAG NAMES: ----------> {QCFlag.decode_mask(record['qc'])}")
        this.logger.info(f"--------------------------------------------------")
        this.logger.info(f"Event Record: {record}")
        this.logger.info(f"--------------------------------------------------")

        # Report every child occurrence, with its own QC and eMoF records.
        if key in data_archive.occ_indices:
            for occ_record in data_archive.occ_indices[key]:
                this.logger.info(f"Occurrence Record: {occ_record}")
                this.logger.info(f"Calculated quality mask: {occ_record['qc']}, consisting of:")
                this.logger.info(f"QC NUMBERS: -------------> {QCFlag.decode_numbers(occ_record['qc'])}")
                this.logger.info(f"QC FLAG NAMES: ----------> {QCFlag.decode_mask(occ_record['qc'])}")
                this.logger.info(f"--------------------------------------------------")

                # eMoF index key: dataprovider_eventID_occurrenceID ('NULL' where absent)
                key_o = f"{occ_record['dataprovider_id']}_" \
                        f"{'NULL' if occ_record['eventID'] is None else occ_record['eventID']}_" \
                        f"{'NULL' if occ_record['occurrenceID'] is None else occ_record['occurrenceID']}"
                if key_o in data_archive.emof_indices:
                    for emof in data_archive.emof_indices[key_o]:
                        this.logger.info(f"eMoF Record: {emof}")
                        this.logger.info(f"--------------------------------------------------")

        # eMoF records attached directly to the event itself.
        if key in data_archive.emof_indices:
            for emof in data_archive.emof_indices[key]:
                this.logger.info(f"eMoF Record for event: {emof}")
                this.logger.info(f"--------------------------------------------------")
    else:
        # The QC is either 0 or a QC mask
        record_idx = randint(0, len(data_archive.occurrence_recs) - 1)
        record = data_archive.occurrence_recs[record_idx]

        qc_occ = mssql_pipeline.qc_occurrence(record, data_archive)

        # Are there any lookups left to do (any record type)?
        if len(data_archive.records_for_lookup):
            location.check_xy(data_archive.records_for_lookup)
            for lookup_record in data_archive.records_for_lookup:
                record['qc'] |= lookup_record["qc"]
                qc_occ |= lookup_record["qc"]

            data_archive.records_for_lookup = []

        this.logger.info(f"Calculated quality mask: {qc_occ}, consisting of:")
        this.logger.info(f"QC NUMBERS: -------------> {QCFlag.decode_numbers(qc_occ)}")
        this.logger.info(f"QC FLAG NAMES: ----------> {QCFlag.decode_mask(qc_occ)}")
        this.logger.info(f"--------------------------------------------------")
        this.logger.info(f"Occurrence Record: {record}")
        this.logger.info(f"--------------------------------------------------")

        # Occurrence-core datasets have no eventID, hence the fixed 'NULL' segment.
        key_o = f"{record['dataprovider_id']}_NULL_" \
                f"{'NULL' if record['occurrenceID'] is None else record['occurrenceID']}"
        if key_o in data_archive.emof_indices:
            for emof in data_archive.emof_indices[key_o]:
                this.logger.info(f"eMoF Record: {emof}")
                this.logger.info(f"--------------------------------------------------")