Example no. 1
    def disable_qc_index(cls):
        """ Disable non-clustered index on QC
            important - this depends on the index name being kept the same in the
            variable sql_disable_index below
            """

        # Check DB connection...
        if not mssql.conn:
            mssql.open_db()

        if mssql.conn is None:
            # Should find a way to exit and advise (maybe raise an exception)
            return "Fail, no connection"
        else:
            try:
                sql_disable_index = "ALTER INDEX IX_eurobis_qc ON dbo.eurobis DISABLE;"
                cursor = mssql.conn.cursor()
                cursor.execute(sql_disable_index)
                mssql.conn.commit()
                this.logger.debug(f"Non clustered index on qc disabled")
                mssql.close_db()
                return "Success"
            except Error as e:
                mssql.close_db()
                return f"Failed to disable clustered index: {e}"
Example no. 2
    def get_provider_data(self, das_prov_id):
        """ Obtain the dataset info from the dataproviders table
            and stores that into the instance fields
            :param - das_prov_id - Dataset id """

        if not mssql.conn:
            mssql.open_db()

        if mssql.conn is None:
            # Should find a way to exit and advise
            pass
        else:
            # Can start querying the DB using the dataset provider ID
            sql_str = f"select * from dataproviders d where id = {das_prov_id};"

            # Fetch the provider line
            cur = mssql.conn.cursor()
            cur.execute(sql_str)
            # Column names, used to build a dict from the fetched row
            provider_fields = [
                description[0] for description in cur.description
            ]

            provider = cur.fetchone()

            self.provider_record = dict(zip(provider_fields, provider))
            self.imis_das_id = self.provider_record['IMIS_DasID']
            self.dataprovider_id = self.provider_record[
                'id']  # Already known from das_prov_id, kept for convenience
            self.darwin_core_type = self.provider_record['core']
            self.dataset_name = self.provider_record['name']
            # Should see if more is needed
            mssql.close_db()
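
The zip of cursor.description column names with the fetched row, used here and in the snippets below, is what turns a positional row into a field-name dictionary. A small illustration with made-up values:

# Illustration only: how a fetched row becomes provider_record (values are made up).
provider_fields = ['id', 'name', 'core', 'IMIS_DasID']      # derived from cur.description
provider = (447, 'Some dataset', 'Event', 5678)             # as returned by cur.fetchone()
provider_record = dict(zip(provider_fields, provider))
# -> {'id': 447, 'name': 'Some dataset', 'core': 'Event', 'IMIS_DasID': 5678}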
Example no. 3
    def get_ev_occ_records(self, das_prov_id):
        """ retrieves event and occurrence records for the dataset in das_id from SQL Server
            also builds search indexes to easily retrieve dependant records
            :param das_prov_id """
        if not mssql.conn:
            mssql.open_db()

        if mssql.conn is None:
            # Should find a way to exit and advise
            pass
        else:
            # get records
            sql_string = self.query_builder_eve_occur(das_prov_id)
            cursor = mssql.conn.cursor()
            cursor.execute(sql_string)

            columns = [column[0] for column in cursor.description]
            records = []
            for row in cursor:
                records.append(dict(zip(columns, row)))

            # We proceed differently in accordance with the type of CORE record...
            if self.darwin_core_type == self.EVENT:
                for record in records:
                    if record['DarwinCoreType'] == self.OCCURRENCE:
                        self.occurrence_recs.append(record)
                        if record['eventID'] is not None:  # To link we need an eventID
                            # Make a key - records are linked by the eventID
                            key = f"{record['dataprovider_id']}_{record['eventID']}"
                            if key in self.occ_indices:
                                # All the records with this key shall be in a list at 'key'
                                self.occ_indices[key].append(record)
                            else:
                                self.occ_indices[key] = [record]
                    elif record['DarwinCoreType'] == self.EVENT:
                        self.event_recs.append(record)
                        # If the core type is event, then eventID is the key - used only for reverse lookups.
                        key = f"{record['dataprovider_id']}_{record['eventID']}"
                        self.event_indices[key] = [record]

            else:  # Occurrence records (Datasets with core_type = OCCURRENCE do not have events - verified)
                for record in records:
                    if record['DarwinCoreType'] == self.OCCURRENCE:
                        self.occurrence_recs.append(record)
                    else:  # Should really never happen !!
                        self.event_recs.append(record)
            mssql.close_db()
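
The occ_indices dictionary built above is keyed by "{dataprovider_id}_{eventID}", so the occurrences that depend on a given event can later be fetched without rescanning the whole record list. The lookup code itself is not part of these snippets; a hedged sketch of how it could be used:

# Sketch (assumed usage): retrieving the occurrence records linked to one event record.
def occurrences_for_event(ds, event_record):
    """ ds is an instance on which get_ev_occ_records has already run. """
    key = f"{event_record['dataprovider_id']}_{event_record['eventID']}"
    return ds.occ_indices.get(key, [])   # empty list when the event has no occurrences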
Example no. 4
    def get_emof_records(self, das_prov_id):
        """ retrieves measurementorfact records for the dataset in das_id from SQL Server
            also building search indexes to easily retrieve dependant records
            NOTE: mof records do not exist for records that have eventID and occurrenceID NULL
            :param das_prov_id"""

        if not mssql.conn:
            mssql.open_db()

        if mssql.conn is None:
            # Should find a way to exit and advise
            pass
        else:
            # get records
            sql_string = self.query_builder_emof(das_prov_id)
            cursor = mssql.conn.cursor()
            cursor.execute(sql_string)

            columns = [column[0] for column in cursor.description]
            records = []
            for row in cursor:
                records.append(dict(zip(columns, row)))

            for record in records:
                self.emof_recs.append(record)

                if self.darwin_core_type == self.EVENT:

                    key = f"{record['dataprovider_id']}_{'NULL' if record['eventID'] is None else record['eventID']}_" \
                          f"{'NULL' if record['occurrenceID'] is None else record['occurrenceID']}"
                else:
                    # In datasets with core = Occurrence the occurrence record may carry a value in eventID
                    # while the eMoF record has NULL there, so the eventID part of the key is forced to NULL.
                    key = f"{record['dataprovider_id']}_NULL_" \
                          f"{'NULL' if record['occurrenceID'] is None else record['occurrenceID']}"

                if key in self.emof_indices:
                    # All the records with this key shall be in a list at 'key'
                    self.emof_indices[key].append(record)
                else:
                    self.emof_indices[key] = [record]
            mssql.close_db()
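
The eMoF key mirrors the previous one but has three parts, with the literal 'NULL' standing in for a missing eventID or occurrenceID; for occurrence-core datasets the eventID part is always forced to 'NULL'. A sketch of the matching lookup, under the same assumptions as above:

# Sketch (assumed usage): retrieving the eMoF records attached to one occurrence record.
def emof_for_occurrence(ds, occ_record):
    if ds.darwin_core_type == ds.EVENT:
        event_part = 'NULL' if occ_record['eventID'] is None else occ_record['eventID']
    else:
        event_part = 'NULL'   # occurrence-core datasets: the eMoF rows carry NULL in eventID
    occ_part = 'NULL' if occ_record['occurrenceID'] is None else occ_record['occurrenceID']
    key = f"{occ_record['dataprovider_id']}_{event_part}_{occ_part}"
    return ds.emof_indices.get(key, [])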
Example no. 5
    def rebuild_qc_index(cls):
        """ Rebuild non-clustered index on QC
            important - this depends on the index name as per disable_qc_index """

        # Check DB connection...
        if not mssql.conn:
            mssql.open_db()

        if mssql.conn is None:
            # Should find a way to exit and advise (maybe raise an exception)
            return "Fail, no connection"
        else:
            try:
                sql_disable_index = "ALTER INDEX IX_eurobis_qc ON dbo.eurobis REBUILD;"
                cursor = mssql.conn.cursor()
                cursor.execute(sql_disable_index)
                mssql.conn.commit()
                this.logger.debug(f"Non clustered index on qc rebuilt")
                mssql.close_db()
                return "Success"
            except Error as e:
                mssql.close_db()
                return f"Failed to rebuild clustered index: {e}"
Example no. 6
def process_dataset_list(pool_no,
                         dataset_id_list,
                         from_pool=False,
                         with_logging=False):
    """ Processes a list of DwCA archives, ideally to be called in parallel
        :param pool_no - Pool number to take track of the pools
        :param dataset_id_list (The list of datasets to be processed)
        :param from_pool: If it comes from a multiprocessing pool, then disabling of the QC index is taken care of
        :param with_logging (Logging enabled or not) """
    # Prints pool data
    start = time.time()
    if with_logging:
        this.logger.info(f"Pool {pool_no} started")

    # Connect to the database, each pool should have its own connection
    conn = None

    mssql.close_db()
    # Make sure db connection is ours
    if not mssql.conn:
        conn = mssql.open_db()

    if conn is None:
        # Should find a way to exit and advise
        this.logger.error("No connection to DB, nothing can be done!")
        return pool_no, len(dataset_id_list)  # Count all datasets in this pool as failed

    # Disable index on QC once
    if not from_pool:
        eurobis_dataset.EurobisDataset.disable_qc_index()

    errors = 0

    for dataset_id in dataset_id_list:
        start_file = time.time()

        if with_logging:
            this.logger.info(
                f"Pool Number: {pool_no}, processsing dataset {dataset_id} ")

        try:
            dataset_qc_labeling(dataset_id, False, with_logging, pool_no)
        except Exception:
            errors += 1
            this.logger.error(traceback.format_exc())
            this.logger.warning(
                f"WARNING: Pool Number: {pool_no}, processsing dataset {dataset_id} FAILED "
            )

        if with_logging:
            this.logger.info(
                f"Processed dataset {dataset_id} in  {time.time() - start_file}"
            )

    # REBUILD index on QC once
    if not from_pool:
        eurobis_dataset.EurobisDataset.rebuild_qc_index()

    if with_logging:
        this.logger.info(f"Pool {pool_no} completed in {time.time() - start}")

    return pool_no, errors
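
process_dataset_list can also be run directly, in which case from_pool=False makes it disable and rebuild the QC index itself. A minimal invocation sketch with made-up dataset ids:

# Sketch: processing one list of dataset ids in the current process (no multiprocessing pool).
if __name__ == "__main__":
    pool_no, errors = process_dataset_list(0, [447, 589, 1220],   # dataset ids are illustrative
                                           from_pool=False,
                                           with_logging=True)
    if errors:
        this.logger.warning(f"{errors} datasets failed in pool {pool_no}")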
Example no. 7
    def update_record_qc(cls, records, batch_update_count, batch_size, ds_id,
                         record_type):
        """ Shall update a batch of records from a dataset
            update queries shall be built record by record and sent to the DB in batches for execution
            :param records : A list of the records being updated
            :param batch_update_count: The dataset's batch number
            :param batch_size : Here only used to report the status of the update
            :param ds_id: dataset being processed
            :param record_type : For query optimization, occurrence and event records may be treated differently
            also used for printing
            NOTE: Special methods are provided to DISABLE the index on QC and REBUILD it after the updates.
            These improve vastly the query run time.
            """

        # Check DB connection...
        if not mssql.conn:
            mssql.open_db()

        if mssql.conn is None:
            # Should find a way to exit and advise (maybe raise an exception)
            return "Fail, no connection"
        else:
            record_count = len(records)
            # sql_update = f"BEGIN TRAN; \n"
            sql_update = ""
            for record in records:
                # Compose update query
                # physloc = bytes.hex(record['physloc'])

                # Note The fields other than physloc and dataprovider_id are used to optimize
                # the update queries execution plans and thus to reduce browsing the records
                # and using the existing indexes on the eurobis table. Observed speed improvements
                # are between 2.5 and 5 times faster.

                # This is a temporary fix - some qc values are set to None.
                if record['qc'] is None:
                    record['qc'] = 0

                sql_update += f"{cls.sql_update_start}{record['qc']}{cls.sql_update_middle} {record['dataprovider_id']}"
                """
                Disabled - using auto_id now.
                if record['decimalLatitude'] is not None:
                    sql_update = f"{sql_update}{cls.sql_if_lat}{record['decimalLatitude']}"
                else:
                    sql_update = f"{sql_update} AND Latitude IS NULL "

                if record['decimalLongitude'] is not None:
                    sql_update = f"{sql_update} {cls.sql_if_lon}{record['decimalLongitude']}"
                else:
                    sql_update = f"{sql_update} AND Longitude IS NULL "

                if record_type == EurobisDataset.EVENT:
                    if record['eventID'] is not None and misc.is_clean_for_sql(record['eventID']):
                        sql_update = f"{sql_update} {cls.sql_if_event_id}'{record['eventID']}'"
                """

                # sql_update = f"{sql_update} {cls.sql_update_end} 0x{physloc} \n"
                sql_update = f"{sql_update} {cls.sql_update_end} {record['auto_id']} \n"

            try:
                # sql_update += f"COMMIT TRAN;\n"
                cursor = mssql.conn.cursor()
                cursor.execute(sql_update)
                mssql.conn.commit()
                rec_type = "EVENT" if record_type == EurobisDataset.EVENT else "OCCURRENCE"
                dateTimeObj = datetime.now()
                this.logger.debug(
                    f"{dateTimeObj}: {rec_type} records update count: {batch_update_count * batch_size + record_count} "
                    f"of dataset {ds_id};")
                batch_update_count += 1
                return "Success"
            except Error as e:
                return f"Fail, batch {batch_update_count} not updated, exception {str(e)}"
            finally:
                # close_db runs in a finally block so it is not skipped by the returns above;
                # it keeps the batches in separate transactions.
                mssql.close_db()
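
Putting the pieces together, update_record_qc is meant to be driven in batches, with the QC index disabled once before the updates and rebuilt once afterwards (see disable_qc_index and rebuild_qc_index above). A hedged sketch of such a driver, with an illustrative batch size:

# Sketch of the assumed calling pattern; BATCH_SIZE and the helper are illustrative.
BATCH_SIZE = 500


def update_dataset_qc(ds):
    """ ds is an EurobisDataset-like instance with event_recs/occurrence_recs already loaded. """
    EurobisDataset.disable_qc_index()                        # once, before the updates
    for rec_type, recs in ((EurobisDataset.EVENT, ds.event_recs),
                           (EurobisDataset.OCCURRENCE, ds.occurrence_recs)):
        for start in range(0, len(recs), BATCH_SIZE):
            batch = recs[start:start + BATCH_SIZE]
            outcome = EurobisDataset.update_record_qc(
                batch, start // BATCH_SIZE, BATCH_SIZE, ds.dataprovider_id, rec_type)
            if outcome != "Success":
                this.logger.error(outcome)
    EurobisDataset.rebuild_qc_index()                        # once, after the updates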
Example no. 8
def do_db_multi_random_percent(percent):
    """ Example of processing multiple datasets at the same time in
        order to exploit the computing resources (cores) performs a random
        selection of 2% of available datasets having less than 4000 records """

    start_time = time.time()

    # Now set to a percent of datasets...
    # sql_random_percent_of_datasets = f"SELECT id FROM dataproviders WHERE {percent} >= CAST(CHECKSUM(NEWID(), id) " \
    #                                  "& 0x7fffffff AS float) / CAST (0x7fffffff AS int)"

    # This selects percent from SMALL datasets (less than 4000 events/occurrences)
    sql_random_percent_of_datasets = f"select a.id, a.displayname, a.rec_count from " \
                                     f"(select d.id, d.displayname, rec_count from dataproviders d " \
                                     f"inner join eurobis e on e.dataprovider_id = d.id " \
                                     f"where rec_count <= 4000 group by d.id, d.displayname, rec_count) a " \
                                     f"where {percent} >= CAST(CHECKSUM(NEWID(), id) & 0x7fffffff AS float) " \
                                     f"/ CAST (0x7fffffff AS int) order by id "

    dataset_ids = []
    dataset_names = []

    # Dedicate to the task all processors minus the reserved ones: 1 for the OS,
    # plus 2 for SQL Server when it runs locally; fall back to a single core if too few remain.
    reserve_cpus = 1 + (0 if not mssql.server_local else 2)

    if mp.cpu_count() > reserve_cpus:
        n_cpus = mp.cpu_count() - reserve_cpus
    else:
        n_cpus = 1

    pool = mp.Pool(n_cpus)

    # Connect to the database to get dataset list
    if not mssql.conn:
        mssql.open_db()

    if mssql.conn is None:
        # Should find a way to exit and advise
        this.logger.error("No connection to DB, nothing can be done! ")
        exit(0)
    else:
        # Fetch a random set of datasets
        cur = mssql.conn.cursor()
        cur.execute(sql_random_percent_of_datasets)
        for row in cur:
            dataset_ids.append(row[0])
            # Tuples of (id, name, record count)
            dataset_names.append((row[0], row[1], row[2]))

    mssql.close_db()

    # Retrieved list, now need to split
    dataset_id_lists = misc.split_list(dataset_ids, n_cpus)
    dataset_names_lists = misc.split_list(dataset_names, n_cpus)

    result_pool = []
    for i, dataset_id_list in enumerate(dataset_id_lists):
        this.logger.info(f"Pool {i} will process {dataset_names_lists[i]} ")
        result_pool.append(pool.apply_async(process_dataset_list, args=(i, dataset_id_list, True, True)))

    for r in result_pool:
        res = r.get()
        if res[1] > 0:
            this.logger.warning(f"Pool {res[0]} failed to process {res[1]} datasets")

    pool.terminate()
    pool.join()

    this.logger.info(f"Started at: {start_time}. All processes have completed after {time.time() - start_time}")