Example #1
# Sample script that reads a JSON configuration file for ESGF quality control
# flags and publishes the information to the Solr index

import logging
from esgfpy.update.utils import update_solr
from pprint import pprint
import json
from urllib.request import urlopen

logging.basicConfig(level=logging.INFO)

# constants
# SOLR_URL = 'http://esgf-node.jpl.nasa.gov:8984/solr'
SOLR_URL = 'http://localhost:8984/solr'
INDICATORS_URL = ('https://raw.githubusercontent.com/EarthSystemCoG/'
                  'esgfpy-publish/master/esgfpy/obs4mips/'
                  'obs4mips_indicators.json')
# INDICATORS_URL = ('https://raw.githubusercontent.com/PCMDI/'
#                   'obs4MIPs-cmor-tables/master/src/tt/obs4MIPs-indicators.json')
# INDICATORS_URL = ('file:///Users/cinquini/tmp/obs4mips_indicators.json')

# read climate indicators file
with urlopen(INDICATORS_URL) as response:  # closed automatically on exit
    json_data = json.loads(response.read())
pprint(json_data)

# publish to Solr
update_solr(json_data, update='set', solr_url=SOLR_URL, solr_core='datasets')
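
# note: update_solr (see Example #3) takes a dictionary mapping a Solr query
# string to field/value updates, so the indicators file presumably follows
# the same structure; a hypothetical sketch (dataset id and field name are
# made up, not taken from the actual file):
#
# {
#     "id:obs4mips.SOME-INSTITUTE.SOME-SOURCE.v1|some-node.example.org": {
#         "quality_control_flags": ["green"]
#     }
# }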
Example #2
def check_replicas(project, start_datetime=None, stop_datetime=None,
                   dry_run=False):
    '''
    Checks replicas for a specific project.
    By default it will check datasets that have changed in the past week.
    start_datetime, stop_datetime must be strings in the format
    "2017-01-07T00:00:00.831Z".
    '''

    # compute the default time window at call time: default arguments are
    # evaluated only once, when the function is defined, so calling
    # datetime.now() in the signature would freeze the window at import time
    if start_datetime is None:
        start_datetime = datetime.datetime.strftime(
            datetime.datetime.now() - datetime.timedelta(
                days=LAST_NUMBER_OF_DAYS), '%Y-%m-%dT%H:%M:%SZ')
    if stop_datetime is None:
        stop_datetime = datetime.datetime.strftime(
            datetime.datetime.now(), '%Y-%m-%dT%H:%M:%SZ')

    logging.info("Checking replicas start datetime=%s stop datetime=%s "
                 "dry_run=%s" % (start_datetime, stop_datetime, dry_run))

    # 0) retrieve the latest list of ESGF index nodes
    # query: https://esgf-node.jpl.nasa.gov/esg-search/search/?offset=0
    #  &limit=0&type=Dataset&facets=index_node&format=application%2Fsolr%2Bjson
    query_params = [("offset", "0"),
                    ("limit", "0"),
                    ("type", "Dataset"),
                    ("facets", "index_node"),
                    ("format", "application/solr+json")]
    jobj = query_esgf(query_params, esgf_index_node_url)
    # select the even elements of the list (starting at index 0):
    # "index_node":["esg-dn1.nsc.liu.se", 78954, "esg.pik-potsdam.de",66899,
    #               "esgdata.gfdl.noaa.gov",5780,...]
    index_nodes = jobj['facet_counts']['facet_fields']['index_node'][0::2]
    logging.debug("Querying index nodes: %s" % index_nodes)

    # counter
    num_datasets_updated = 0

    # 1) query all remote index nodes for the latest primary datasets
    # that have changed in the given time period
    fields = ['id', 'master_id', 'version', '_timestamp']
    for index_node in index_nodes:

        try:

            remote_slave_solr_url = 'https://%s/solr' % index_node
            logging.info("Querying Solr=%s for datasets with project=%s "
                         "start_datetime=%s stop_datetime=%s" % (
                             remote_slave_solr_url, project,
                             start_datetime, stop_datetime))
            query1 = ('project:%s&replica:false&latest:true'
                      '&_timestamp:[%s TO %s]' % (
                          project, start_datetime, stop_datetime))
            docs1 = query_solr(query1, fields, solr_url=remote_slave_solr_url,
                               solr_core='datasets')
            if len(docs1) > 0:
                logging.info("\tFound %s datasets that have changed, "
                             "checking local Solr for replicas" % len(docs1))

        except urllib.error.HTTPError:
            logging.error("Error querying index node "
                          "%s" % remote_slave_solr_url)
            docs1 = []

        # 2) query local index for replicas of the same datasets
        # that are flagged with latest='true'
        for doc1 in docs1:
            v1 = int(doc1['version'])
            master_id = doc1['master_id']
            dataset_id1 = doc1['id']
            _timestamp1 = doc1['_timestamp']
            logging.info("\tChecking local Solr=%s for replica of dataset=%s "
                         "version=%s _timestamp=%s" % (
                             local_master_solr_url, dataset_id1, v1,
                             _timestamp1))

            query2 = 'master_id:%s&replica:true&latest:true' % master_id
            docs2 = query_solr(query2, fields, solr_url=local_master_solr_url,
                               solr_core='datasets')

            # check local 'latest' replica
            for doc2 in docs2:

                # compare versions
                v2 = int(doc2['version'])
                # master_id2 = doc2['master_id']
                dataset_id2 = doc2['id']

                # remote primary has newer version -->
                # local replica must be updated
                if v1 > v2:
                    logging.warn("\t\tFound newer version: %s for dataset: %s "
                                 "at site: %s" % (
                                     v2, master_id, remote_slave_solr_url))
                    logging.warn("\t\tUpdating status of local dataset: %s to "
                                 "latest=false" % dataset_id2)

                    # FIXME
                    # 3) set latest flag of local replica to false
                    # for datasets, files, aggregations
                    if not dry_run:
                        update_dict = {'id:%s' % dataset_id2: {
                            'latest': ['false']}}
                        update_solr(update_dict,
                                    update='set',
                                    solr_url=local_master_solr_url,
                                    solr_core='datasets')
                        update_dict = {'dataset_id:%s' % dataset_id2: {
                            'latest': ['false']}}
                        update_solr(update_dict,
                                    update='set',
                                    solr_url=local_master_solr_url,
                                    solr_core='files')
                        update_solr(update_dict,
                                    update='set',
                                    solr_url=local_master_solr_url,
                                    solr_core='aggregations')

                    num_datasets_updated += 1

    logging.info("Total number of local replica updated="
                 "%s" % num_datasets_updated)
Example #3
import logging
logging.basicConfig(level=logging.INFO)

from esgfpy.update.utils import update_solr

# must target the Solr master
solr_url = 'http://localhost:8984/solr'

# SET field values (i.e. override values for fields with the same name,
# or insert new fields if they do not already exist)
update_dict = {
    'project:obs4MIPs': {
        'location': ['Pasadena'],
        'realm': ['atmosphere']
    }
}
update_solr(update_dict, update='set', solr_url=solr_url, solr_core='datasets')

# ADD values to existing fields, or add new fields with values
update_dict = {
    'project:obs4MIPs&source_id:MLS': {
        'location': ['Boulder'],
        'stratus': ['cumulus']
    }
}
update_solr(update_dict, update='add', solr_url=solr_url, solr_core='datasets')

# REMOVE existing fields: set their values to None or an empty list
update_dict = {'project:obs4MIPs': {'location': [], 'stratus': None}}
update_solr(update_dict, update='set', solr_url=solr_url, solr_core='datasets')

# transfer the value of a field to a new named field
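# the original example ends at the comment above; a minimal sketch of one
# way to do this with the same helpers, assuming query_solr (the query
# helper used in Example #2) is available in scope, and using a
# hypothetical new field name 'site':
docs = query_solr('project:obs4MIPs', ['id', 'location'],
                  solr_url=solr_url, solr_core='datasets')
for doc in docs:
    update_dict = {
        'id:%s' % doc['id']: {
            'site': doc.get('location', []),   # copy value to the new field
            'location': []                     # empty list removes the old field
        }
    }
    update_solr(update_dict, update='set',
                solr_url=solr_url, solr_core='datasets')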
Example #4
import logging
from esgfpy.update.utils import update_solr

logging.basicConfig(level=logging.DEBUG)

SOLR_URL = 'http://localhost:8984/solr'

# associate supplementary data to datasets
# http://esgf-data.jpl.nasa.gov/thredds/fileServer/obs4MIPs/technotes/cltTechNote_MODIS_L3_C5_200003-201109.pdf
myDict = {
    'id:obs4mips.NASA-JPL.TES.tro3.mon.v20110608|esgf-data.jpl.nasa.gov': {
        'xlink': [
            'http://esgf-data.jpl.nasa.gov/thredds/fileServer/'
            'obs4MIPs/technotes/tro3TechNote_TES_L3_tbd_200507-200912'
            '.pdf|TES Ozone Technical Note|technote',
            'https://esgf-data.jpl.nasa.gov/thredds/fileServer/'
            'obs4MIPs/supplementary_data/TES-SUPPLEMENTARY.zip|'
            'TES Supplementary Data|supdata'
        ]
    },
}

update_solr(myDict, update='set', solr_url=SOLR_URL, solr_core='datasets')
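
# note: each xlink value is a pipe-delimited triple of the form
# url|title|type (here type is 'technote' or 'supdata')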