Exemple #1
0
def populate_is_exotic_be_field(conn, config_parser, exotic_status_source):
    """Set the exotic_be flag of every row of the taxonomy table.

    The alien taxa listed in the GBIF checklist ``exotic_status_source`` are
    retrieved and matched (via _find_exotic_taxa) against the rows of the
    taxonomy table. The column exotic_be is then rewritten for the whole
    table: true for the matched ids, false for all the other rows.

    Every progress message is both printed and logged.

    :param conn: connection passed on to execute_sql_from_jinja_string
    :param config_parser: not used by this function
    :param exotic_status_source: datasetKey of the GBIF checklist
    """

    def _announce(message):
        # Same message on screen and in the log file.
        print(message)
        logging.info(message)

    _announce(f"We'll now retrieve the GBIF checklist containing the exotic taxa in Belgium, datasetKey: {exotic_status_source}.")

    start_time = time.time()
    # get alien taxa from GRIIS Belgium checklist
    alien_taxa = _get_alien_taxa(datasetKey=exotic_status_source)
    end_time = time.time()

    _announce(f"Retrieved {len(alien_taxa)} exotic taxa in {round(end_time-start_time)}s.")

    taxon_cur = execute_sql_from_jinja_string(conn, "SELECT * FROM taxonomy", dict_cursor=True)

    _announce(f"We'll now update exotic_be field for {taxon_cur.rowcount} taxa of the taxonomy table.")

    start_time = time.time()

    # Index the taxonomy rows by gbifId, keeping only the fields handed to
    # _find_exotic_taxa.
    kept_fields = ('id', 'gbifId', 'scientificName', 'parentId', 'acceptedId')
    taxa_to_check = {}
    for row in taxon_cur:
        taxa_to_check[row['gbifId']] = {field: row[field] for field in kept_fields}

    # ids (taxonomy table) of the exotic taxa; each call returns the list to
    # use for the next one
    exotic_taxa_ids = []
    for exotic_taxon in alien_taxa:
        exotic_taxa_ids = _find_exotic_taxa(exotic_taxon=exotic_taxon,
                                            taxa=taxa_to_check,
                                            exotic_taxa_list=exotic_taxa_ids,
                                            depth=0)

    _announce(f"{len(exotic_taxa_ids)} exotic taxa found in taxonomy.")

    if len(exotic_taxa_ids) > 0:
        # set exotic_be = True for exotic taxa and False for the others
        template = (' UPDATE taxonomy SET "exotic_be" = '
                    ' CASE WHEN "id" IN {{ ids | inclause }} THEN true'
                    ' ELSE false END')
    else:
        # No id to put in the IN clause: every row is simply reset to false.
        template = ' UPDATE taxonomy SET "exotic_be" = false'
    update_exotic_be_cur = execute_sql_from_jinja_string(conn, sql_string=template, context={'ids': exotic_taxa_ids})

    end_time = time.time()

    _announce(f"Field exotic_be updated for {update_exotic_be_cur.rowcount} taxa in taxonomy in {round(end_time - start_time, 2)}s.")
Exemple #2
0
def _update_taxonomy_if_needed(conn, taxon_in_taxonomy, taxon, depth=0):
    """Update the taxonomy row of ``taxon`` when its stored values differ.

    Only the fields present in ``taxon`` are compared with the stored row
    ``taxon_in_taxonomy``. When at least one of them differs, an UPDATE is
    issued for the differing fields (plus gbifId), selecting the row by
    gbifId. Neither input dict is modified.

    :param conn: connection passed on to execute_sql_from_jinja_string
    :param taxon_in_taxonomy: dict with the current content of the row
    :param taxon: dict with the new values; must contain 'gbifId'
    :param depth: recursion level (used for log indentation)
    :return: the taxonomy id when an UPDATE was issued, None when the row
        was already up to date
    """
    # GBIF knows about this taxon, and so do we. Do we need to update or do
    # we already have the latest data?
    gbifId = taxon['gbifId']

    taxonomyId = taxon_in_taxonomy.get('id')
    taxonomy_fields_to_compare = {k: taxon_in_taxonomy[k] for k in taxon}

    if taxon == taxonomy_fields_to_compare:
        print_indent(
            f"Taxon {taxon['scientificName']} already present in taxonomy (id = {taxonomyId}).",
            depth)
        # NOTE(review): the id is known here but None is returned, as
        # before - confirm that callers do not expect the id in this case.
        return None

    # Keys whose stored value differs from the new one, in the order of taxon.
    changed_keys = [
        key for key, new_value in taxon.items()
        if taxonomy_fields_to_compare[key] != new_value
    ]
    old_values = {key: taxonomy_fields_to_compare[key] for key in changed_keys}
    new_values = {key: taxon[key] for key in changed_keys}

    print("Fields - values to change:")
    for key, value in old_values.items():
        print(key, value)
    print("New fields - values:")
    for key, value in new_values.items():
        print(key, value)

    # Work on a separate dict so that the caller's taxon keeps all its keys.
    context_to_query = dict(new_values)
    context_to_query['gbifId'] = gbifId

    # Doubled braces produce the literal Jinja delimiters: "col" = {{ col }}.
    # Column names are interpolated as-is, so the keys of taxon are assumed
    # to be trusted column names.
    assignments = ", ".join(
        f'"{col}" = {{{{ {col} }}}}' for col in context_to_query)
    template = """ UPDATE taxonomy SET """ \
               + assignments \
               + """ WHERE "gbifId" = {{ gbifId }} """
    execute_sql_from_jinja_string(conn,
                                  sql_string=template,
                                  context=context_to_query)
    return taxonomyId
Exemple #3
0
def _update_match_info(conn, match_info, scientificname_row_id):
    # update scientificname with info about match and taxonomyId
    match_info = {k: v for k, v in match_info.items() if v is not None}
    template = """ UPDATE scientificname SET """ \
               + ", ".join([f'"{i}"' + ' = ' + '{{ ' + str(i) + ' }}' for i in match_info.keys()]) \
               + """ WHERE "id" = {{ id }} """
    data = match_info.copy()
    data['id'] = scientificname_row_id
    execute_sql_from_jinja_string(conn, sql_string=template, context=data)
def populate_scientificname_annex(conn, config_parser, annex_file):
    """Fill the table scientificnameannex with the taxa of the annex file.

    The option taxa-limit (section scientificname_annex of the configuration)
    caps the import: when it holds a number n, only the first n taxa are
    inserted; when it is an empty string, all the taxa are inserted.

    A progress line is rewritten on screen every 20 insertions; the totals
    are printed and logged at the end.
    """
    annex_names = _get_annex(path=annex_file)
    names_count_msg = f"Number of taxa listed in official annexes and ordinances: {len(annex_names)}"
    print(names_count_msg)
    logging.info(names_count_msg)

    configured_limit = config_parser.get('scientificname_annex', 'taxa-limit')
    n_taxa_max = int(configured_limit) if len(configured_limit) > 0 else len(annex_names)

    # One row per annex entry: column names come from the entry keys.
    insert_sql = """INSERT INTO scientificnameannex ({{ col_names | surround_by_quote | join(', ') | sqlsafe 
            }}) VALUES {{ values | inclause }} """

    start = time.time()
    counter_insertions = 0
    for record in annex_names.values():
        if counter_insertions >= n_taxa_max:
            break
        execute_sql_from_jinja_string(conn,
                                      insert_sql,
                                      context={
                                          'col_names': tuple(record.keys()),
                                          'values': tuple(record.values())
                                      })
        counter_insertions += 1
        if counter_insertions % 20 != 0:
            continue
        # running infos on screen (no logging)
        seconds_spent = time.time() - start
        seconds_left = seconds_spent / counter_insertions * (n_taxa_max - counter_insertions)
        print(
            f"\r{counter_insertions}/{n_taxa_max} taxa inserted in scientificnameannex in"
            f" {round(seconds_spent, 2)}s."
            f" Expected time to go: {round(seconds_left, 2)}s.",
            end="",
            flush=True)

    # Logging and statistics
    end = time.time()
    total_msg = f"Total number of taxa inserted in scientificnameannex: {counter_insertions}"
    print(total_msg)
    logging.info(total_msg)
    duration_msg = f"Table scientificnameannex populated in {round(end - start)}s."
    print(duration_msg)
    logging.info(duration_msg)
Exemple #5
0
def _insert_or_get_rank(conn, rank_name):
    """ Return the id of the row of the rank table named ``rank_name``

    A single statement tries to insert the name; when the name is already
    there (conflict on name), the existing row is selected instead.

    The id of the inserted or existing row is returned """

    upsert_sql = """WITH ins AS (
        INSERT INTO rank(name)
        VALUES ({{ rank_name }})         -- input value
        ON CONFLICT(name) DO NOTHING
        RETURNING rank.id
        )
    SELECT id FROM ins
    UNION  ALL
    SELECT id FROM rank          -- 2nd SELECT never executed if INSERT successful
    WHERE name = {{ rank_name }}  -- input value a 2nd time
    LIMIT  1;"""

    # dict_cursor=True so that the column can be read by name below
    cursor = execute_sql_from_jinja_string(
        conn,
        sql_string=upsert_sql,
        context={'rank_name': rank_name},
        dict_cursor=True,
    )
    row = cursor.fetchone()
    return row['id']
Exemple #6
0
def _insert_new_entry_taxonomy(conn, taxon):
    """Insert ``taxon`` into the taxonomy table and return its id.

    The keys of ``taxon`` are used as column names and its values as the row
    values. The id (PK) of the new row is then read back via its gbifId.

    The function attribute ``counter`` is incremented on success; it has to
    be initialised outside of this function.

    :param conn: connection passed on to execute_sql_from_jinja_string
    :param taxon: dict column name -> value; must contain 'gbifId'
    :return: the id of the inserted row
    :raises AssertionError: if no row or more than one row is found for the
        gbifId after the insertion
    """
    gbifId = taxon['gbifId']

    # insert taxon in taxonomy table
    execute_sql_from_jinja_string(
        conn,
        """INSERT INTO taxonomy ({{ col_names | surround_by_quote | join(', ') | sqlsafe }}) VALUES {{ values | inclause }}""",
        {
            'col_names': tuple(taxon.keys()),
            'values': tuple(taxon.values())
        })
    # get id (PK) in taxonomy
    cur = execute_sql_from_jinja_string(
        conn, """SELECT id FROM taxonomy WHERE "gbifId" = {{ gbifId }}""",
        {'gbifId': gbifId})
    rows = cur.fetchall()
    # fetchall() gives an empty sequence (not None) when nothing matches, so
    # the number of rows is what has to be checked.
    assert len(rows) > 0, f"Taxon with gbifId {gbifId} not inserted into the taxonomy table."
    assert len(rows) <= 1, \
        f"Too many taxa returned for gbifId = {gbifId}. Duplicates in taxonomy table."

    _insert_new_entry_taxonomy.counter += 1
    return rows[0][0]
Exemple #7
0
def _get_taxon_from_taxonomy_by_gbifId(conn, gbif_id):
    """Look up the taxonomy table by gbifId and return the row as a dict.

    The keys are the column names of the result, e.g.
    {'id': 1, 'gbifId': 5, 'scientificName': 'Fungi', 'rankId': 1, 'acceptedId': None, 'parentId': None}

    When no row matches, every column is mapped to None:
    {'id': None, 'gbifId': None, ...}
    """
    cursor = execute_sql_from_jinja_string(
        conn,
        sql_string="""SELECT * FROM taxonomy WHERE "gbifId" = {{ gbifId }} """,
        context={'gbifId': gbif_id})
    rows = cursor.fetchall()
    # first item of each description entry is the column name
    column_names = [column[0] for column in cursor.description]

    assert len(rows) <= 1, f"Multiple taxa with gbifId = {gbif_id} in taxonomy."

    if len(rows) != 1:
        return dict.fromkeys(column_names)
    return dict(zip(column_names, rows[0]))
Exemple #8
0
def populate_vernacular_names(conn,
                              config_parser,
                              empty_only,
                              filter_lang=None):
    """Load vernacular names from GBIF into the vernacularname table.

    :param conn: connection passed on to execute_sql_from_jinja_string
    :param config_parser: configuration; the option taxa-limit of section
        vernacular_names limits the number of taxa processed (no LIMIT
        clause when it is empty)
    :param empty_only: if true, only process the taxa currently without
        vernacular names; otherwise process all entries of the taxonomy table
    :param filter_lang: list of language codes (ISO 639-1 Code), or None for
        no filtering (default)
    """
    if empty_only:
        taxa_selection_sql = """SELECT *
                                FROM taxonomy
                                WHERE NOT EXISTS (SELECT vernacularname."taxonomyId" FROM vernacularname WHERE taxonomy.id = vernacularname."taxonomyId") {% if limit %} LIMIT {{ limit }} {% endif %}"""
    else:
        taxa_selection_sql = """SELECT * FROM taxonomy {% if limit %} LIMIT {{ limit }} {% endif %}"""

    limit = config_parser.get('vernacular_names', 'taxa-limit')
    cur = execute_sql_from_jinja_string(conn,
                                        sql_string=taxa_selection_sql,
                                        context={'limit': limit},
                                        dict_cursor=True)

    msg = f"We'll now load vernacular names for {cur.rowcount} entries in the taxonomy table. Languages: "
    if filter_lang is not None:
        msg += ", ".join(filter_lang)
    print(msg)
    logging.info(msg)

    # Create dictionary mapping 3-letter codes (as stored in GBIF) and 2-letter codes
    if filter_lang is not None:
        filter_lang_dict = _iso639_1_to_2_dict(filter_lang)
    else:
        filter_lang_dict = None

    # Get list of 3-letters format languages
    languages3 = None
    if filter_lang_dict is not None:
        languages3 = list(filter_lang_dict.keys())

    total_vernacularnames_counter = 0
    total_taxa_counter = 0
    start_time = time.time()

    for taxon in cur:
        taxonomy_id = taxon['id']
        gbif_taxon_id = taxon['gbifId']

        total_taxa_counter += 1

        vns = _get_vernacular_names_gbif(gbif_taxon_id, languages3=languages3)
        for vernacular_name in vns:
            name = vernacular_name.get('vernacularName')
            gbif_language = vernacular_name.get('language')
            if filter_lang_dict is None:
                # No filter means no code mapping is available: subscripting
                # None would raise a TypeError, so the code is kept as given
                # by GBIF.
                # NOTE(review): confirm the "language" column accepts the
                # code format returned by GBIF.
                lang_code = gbif_language
            else:
                lang_code = filter_lang_dict[gbif_language]
            source = vernacular_name.get('source')
            if source is None:
                print(
                    f"Warning: vernacular name {name} for taxon with ID: {taxonomy_id} without source. Contact GBIF: https://github.com/gbif/gbif-api/issues/56"
                )
            msg = f"Now saving '{name}'({lang_code}) for taxon with ID: {taxonomy_id} (source: {source})"
            print(msg)
            logging.info(msg)

            insert_template = """INSERT INTO vernacularname("taxonomyId", "language", "name", "source") VALUES ({{ taxonomy_id}}, {{ lang_code }}, {{ name }}, {{ source }})"""
            execute_sql_from_jinja_string(conn,
                                          sql_string=insert_template,
                                          context={
                                              'taxonomy_id': taxonomy_id,
                                              'lang_code': lang_code,
                                              'name': name,
                                              'source': source
                                          })
            total_vernacularnames_counter += 1

    end_time = time.time()

    msg = f"Done loading {total_vernacularnames_counter} (for {total_taxa_counter} taxa) vernacular names in {round(end_time - start_time)}s."
    print(msg)
    logging.info(msg)
Exemple #9
0
def populate_annex_scientificname(conn, config_parser, annex_file):
    """ Populate the table annexscientificname

    If taxa-limit in configuration file is not an empty string but a number
    n, then the first n taxa are imported into the table.

    For each entry flagged as a scientific name (isScientificName is True),
    the matching scientificname row is inserted or retrieved with
    insert_or_get_scientificnameid and its id is stored in the column
    scientificNameId of the new annexscientificname row.
    """
    annex_names = _load_annex_data_from_file(path=annex_file)

    message_n_names_in_annex_file = f"Number of taxa listed in official annexes and ordinances: {len(annex_names)}"
    print(message_n_names_in_annex_file)
    logging.info(message_n_names_in_annex_file)

    n_taxa_max = config_parser.get('annex_scientificname', 'taxa-limit')
    if len(n_taxa_max) > 0:
        n_taxa_max = int(n_taxa_max)
    else:
        n_taxa_max = len(annex_names)

    # insert in annexscientificname
    template = """INSERT INTO annexscientificname ({{ col_names | surround_by_quote | join(', ') | sqlsafe 
            }}) VALUES {{ values | inclause }} """

    start = time.time()
    counter_insertions = 0
    for annex_entry in annex_names:
        if counter_insertions >= n_taxa_max:
            break

        dict_for_annexscientificname = {
            k: annex_entry[k]
            for k in FIELDS_ANNEXSCIENTIFICNAME
        }
        if dict_for_annexscientificname['isScientificName'] is True:
            # the remaining fields of the entry describe the scientific name
            dict_for_scientificname = {
                k: annex_entry[k]
                for k in annex_entry.keys() - FIELDS_ANNEXSCIENTIFICNAME
            }
            # an empty authorship is stored as NULL
            if dict_for_scientificname['authorship'] == '':
                dict_for_scientificname['authorship'] = None
            id_scn = insert_or_get_scientificnameid(
                conn,
                scientific_name=dict_for_scientificname['scientificName'],
                authorship=dict_for_scientificname['authorship'])
            dict_for_annexscientificname['scientificNameId'] = id_scn

        execute_sql_from_jinja_string(
            conn,
            template,
            context={
                'col_names': tuple(dict_for_annexscientificname.keys()),
                'values': tuple(dict_for_annexscientificname.values())
            })
        counter_insertions += 1

        # running infos on screen (no logging)
        if counter_insertions % 20 == 0:
            elapsed_time = time.time() - start
            expected_time = elapsed_time / counter_insertions * (
                n_taxa_max - counter_insertions)
            info_message = "\r" + \
                           f"{counter_insertions}/{n_taxa_max} taxa inserted in annexscientificname in" + \
                           f" {round(elapsed_time, 2)}s." + \
                           f" Expected time to go: {round(expected_time, 2)}s."
            print(info_message, end="", flush=True)

    # Logging and statistics
    end = time.time()
    # leading newline: the progress line above is printed without one
    n_taxa_inserted = f"\nTotal number of taxa inserted in annexscientificname: {counter_insertions}"
    print(n_taxa_inserted)
    logging.info(n_taxa_inserted)
    duration_message = f"Table annexscientificname populated in {round(end - start)}s."
    print(duration_message)
    logging.info(duration_message)
Exemple #10
0
def deduplicate_taxon(conn, config_parser):
    """Replace duplicated taxa by their substitute and delete the duplicates.

    The mapping old taxon id -> new taxon id is read from the JSON file named
    by the option config-filename (section deduplicate_taxon), located next
    to this script. For each pair, the rows of the biodiv tables that refer
    to the old id are repointed to the new id, then the old taxon is deleted.

    :param conn: connection passed on to execute_sql_from_jinja_string and
        used as a context manager around all the statements
        (NOTE(review): presumably a transaction scope - confirm for the
        driver in use)
    :param config_parser: configuration giving the name of the JSON file
    """
    config_path = os.path.join(
        __location__,
        config_parser.get('deduplicate_taxon', 'config-filename'))
    with open(config_path, 'r') as fp:
        taxon_to_replace = json.load(fp)

    # Executed in this order for every pair: all the references (including
    # child taxa that point to the record to be deleted) are moved first,
    # the old taxon is deleted last.
    statements = (
        "UPDATE biodiv.commontaxa SET nptaxonid = {{ new_id }} WHERE nptaxonid = {{ old_id }};",
        "UPDATE biodiv.media SET taxonid = {{ new_id }} WHERE taxonid = {{ old_id }};",
        "UPDATE biodiv.identifiablespecies SET taxonid = {{ new_id }} WHERE taxonid = {{ old_id }};",
        "UPDATE biodiv.occurence SET identifiablespeciesid = {{ new_id }} WHERE identifiablespeciesid = {{ old_id }};",
        "UPDATE biodiv.speciesannex SET taxonid = {{ new_id }} WHERE taxonid = {{ old_id }};",
        "UPDATE biodiv.taxon SET parentid = {{ new_id }} WHERE parentid = {{ old_id }};",
        "DELETE FROM biodiv.taxon WHERE id = {{old_id }};",
    )

    with conn:
        for old_id, new_id in taxon_to_replace.items():
            print(f"Will replace taxon {old_id} by {new_id}")
            for q in statements:
                execute_sql_from_jinja_string(conn,
                                              q,
                                              context={
                                                  'new_id': new_id,
                                                  'old_id': old_id
                                              })
    print('DONE')