Example #1
def get_urls(file_functional_id):
    """returns a prioritized list of [url,protocol] where each url can supply the specified file"""

    try:
        result = sdquicksearch.run(parameter=[
            'limit=4',
            'fields=%s' % url_fields, 'type=File',
            'instance_id=%s' % file_functional_id
        ],
                                   post_pipeline_mode=None)
    except Exception as e:
        sdlog.debug("SDNEXTUR-015",
                    "exception %s.  instance_id=%s" % (e, file_functional_id))
        raise e

    li = result.get_files()
    sdlog.info(
        "SDNEXTUR-016",
        "sdquicksearch returned %s sets of file urls: %s" % (len(li), li))
    if li == []:
        # No urls found. Try again, but wildcard the file id. (That leads to a string search on all
        # fields for the wildcarded file id, rather than a match of the instance_id field only.)
        result = sdquicksearch.run(parameter=[
            'limit=4',
            'fields=%s' % url_fields, 'type=File',
            'instance_id=%s' % file_functional_id + '*'
        ],
                                   post_pipeline_mode=None)
        li = result.get_files()
        sdlog.info(
            "SDNEXTUR-017",
            "sdquicksearch 2nd call %s sets of file urls: %s" % (len(li), li))
    # result looks like
    # [ {protocol11: url11, protocol12: url12, attached_parameters: dict, score: number,
    #    type: 'File', size: number}, {another dict of the same format}, {another dict}, ... ]
    # with no more than limit=4 items in the list, and no more than three protocols.
    # We'll return something like urlps = [ [url1,protocol1], [url2,protocol2],... ]
    # The return value could be an empty list.
    # Note: These nested lists are ugly; it's just a quick way to code something up.

    urlps = []
    for dic in li:
        urlps += [[dic[key], key] for key in dic.keys()
                  if key.find('url_') >= 0 and dic[key].find('//None') < 0]
        # ... protocol keys are one of 'url_opendap', 'url_http', 'url_gridftp'
        # The search for //None bypasses an issue with the SOLR lookup where there is no
        # url_gridftp possibility.

    return prioritize_urlps(urlps)
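
The prioritize_urlps helper called on the last line is not shown on this page. A minimal sketch of what such a helper could look like, assuming a fixed protocol preference order (the ordering below is an assumption, not taken from the project):

# Hypothetical sketch only -- the real prioritize_urlps is not shown here.
# Assumed preference: HTTP first, then OpenDAP, then GridFTP.
PROTOCOL_PRIORITY = {'url_http': 0, 'url_opendap': 1, 'url_gridftp': 2}

def prioritize_urlps(urlps):
    """Sort [url, protocol] pairs by the assumed protocol preference."""
    return sorted(urlps, key=lambda urlp: PROTOCOL_PRIORITY.get(urlp[1], 99))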
Example #2
def use_file_timestamp_if_dataset_timestamp_is_missing(d):

    if 'timestamp' not in d:
        # timestamp doesn't exist in ESGF for this dataset

        # hack
        #
        # Use one of the dataset's file timestamps (chosen arbitrarily, since the
        # files in a dataset do not always share the same timestamp), because the
        # dataset's own timestamp is missing in ESGF.

        # Note
        #     We do not filter replica in the query below in case the master host is not up
        result=sdquicksearch.run(parameter=['limit=1','fields=instance_id,timestamp,type','type=File','dataset_id=%s'%d['instance_id']],post_pipeline_mode=None)
        if len(result.files)>0:
            file=result.files[0]

            if 'timestamp' in file:

                d['timestamp']=file['timestamp']

                sdlog.info("SDTIMEST-001","Dataset timestamp set from one dataset's file's timestamp (dataset_functional_id=%s,file_functional_id=%s)"%(d['instance_id'],file['instance_id']))
            else:
                raise SDException("SDTIMEST-008","Timestamp missing in both dataset and dataset's file(s) (%s)"%d['instance_id'])
        else:
            raise SDException("SDTIMEST-011","Dataset exist in ESGF, but is empty (%s)"%d['instance_id'])
Example #3
def print_remote_sample(project):
    """Print one random file of given project."""
    result=sdquicksearch.run(parameter=['project=%s'%project,"limit=1"])
    for f in result.files:
        print "%s" % (f['local_path'])
        print "%s|%s" % (f['file_functional_id'],f['data_node'])
        print
Example #4
def get_datasets(
    stream=None,
    parameter=None,
    post_pipeline_mode='dataset',
    dry_run=False
):  # TODO: maybe remove the parameter argument everywhere, as there is a mess in get_selection_file_buffer because of default/forced parameters (i.e. len(parameter) is non-zero even if no parameter args are set on the CLI)

    if parameter is None:
        parameter = []

    assert (stream is None) or (
        len(parameter) < 1
    )  # this is to prevent using stream and parameter together
    assert post_pipeline_mode != 'file'

    if len(parameter) > 0:
        sddeferredbefore.add_forced_parameter(parameter, 'type', 'Dataset')
    elif stream is not None:
        sddeferredbefore.add_forced_parameter(stream, 'type', 'Dataset')

    result = sdquicksearch.run(stream=stream,
                               parameter=parameter,
                               post_pipeline_mode=post_pipeline_mode,
                               dry_run=dry_run)
    return result.get_files()
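
A possible call to get_datasets; the facet names and values below are illustrative placeholders, not taken from the examples on this page:

# Illustrative usage only -- facet names/values are placeholders.
datasets = get_datasets(parameter=['project=CMIP5', 'experiment=historical', 'limit=10'])
for d in datasets:
    print(d['instance_id'])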
Example #5
def print_remote_sample(project):
    """Print one random file of given project."""
    result = sdquicksearch.run(parameter=['project=%s' % project, "limit=1"])
    for f in result.get_files():
        print "%s" % (f['local_path'])
        print "%s|%s" % (f['file_functional_id'], f['data_node'])
        print
Example #6
def test_index_hosts():
    print "ESGF indexes benchmark"
    print "======================"
    print ""

    ProgressThread.start(running_message='Building test query.. ',spinner_type=0,sleep=0.2,end_message=None)

    #parameter=get_test_query()
    parameter=get_random_test_query()

    parameter.append("limit=0")

    test_queries=sdpipeline.build_queries(parameter=parameter,index_host='<index_host>',load_default=False)

    ProgressThread.stop()

    test_query=test_queries[0]
    print "Test query"
    print "----------"
    print "%s"%test_query['url']
    print ""

    ProgressThread.start(running_message='Test running.. ',spinner_type=0,sleep=0.2,end_message=None)

    li=[]
    for index_host in sdindex.index_host_list:
        result=sdquicksearch.run(index_host=index_host,parameter=parameter)
        li.append([index_host,result.num_found,result.call_duration if result.call_duration>=1 else 0.1])

    ProgressThread.stop()
    print "Result"
    print "------"
    li=sorted(li, key=lambda record: record[2])
    print tabulate(li,headers=['Index host','File count','Call duration (seconds)'],tablefmt="plain")
Example #7
def get_urls(file_functional_id):
    result = sdquicksearch.run(parameter=[
        'limit=1',
        'fields=%s' % url_fields, 'type=File',
        'instance_id=%s' % file_functional_id
    ],
                               post_pipeline_mode=None)
    li = result.get_files()
    if len(li) > 0:
        file_ = li[0]

        # remove non url attributes
        file_.pop('attached_parameters', None)

        urls = file_

    else:
        sdlog.info(
            "SDNEXTUR-090", "File not found (file_functional_id=%s)" %
            (file_functional_id, ))
        raise sdexception.FileNotFoundException()

    return urls
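
A short usage sketch for this variant; the file identifier below is a made-up placeholder, and the exception handling relies on the sdexception module already used by the surrounding code:

# Illustrative usage only -- the identifier below is a placeholder.
try:
    urls = get_urls('cmip5.output1.INSTITUTE.model.experiment.file.nc')
    for protocol, url in urls.items():
        print("%s -> %s" % (protocol, url))
except sdexception.FileNotFoundException:
    print("no matching file found in the index")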
Example #8
def get_sample_files(project, limit):
    """This func retrieves a list of sample files for the given project."""

    result = sdquicksearch.run(
        parameter=['type=File',
                   'project=%s' % project,
                   "limit=%s" % limit],
        post_pipeline_mode='file')
    return result
Example #9
def get_files(stream=None,parameter=[],post_pipeline_mode='file',dry_run=False): # TODO: maybe remove the parameter argument everywhere, as there is a mess in get_selection_file_buffer because of default/forced parameters (i.e. len(parameter) is non-zero even if no parameter args are set on the CLI)

    assert (stream is None) or (len(parameter)<1) # this is to prevent using stream and parameter together

    if len(parameter)>0:
        sddeferredbefore.add_forced_parameter(parameter,'type','File')
    elif stream is not None:
        sddeferredbefore.add_forced_parameter(stream,'type','File')

    result=sdquicksearch.run(stream=stream,parameter=parameter,post_pipeline_mode=post_pipeline_mode,dry_run=dry_run)
    return result.files
Example #10
 def do_add(self, file):
     self.parameter = ["limit=1", "instance_id=%s" % file]
     self.complete_parameter()
     result = sdquicksearch.run(parameter=self.parameter)
     if len(result.files) == 1:
         f = result.files[0]
         if f['status'] == sdconst.TRANSFER_STATUS_NEW:
             sdenqueue.run(result.files)
             print "File successfully enqueued"
         else:
             print "File already enqueued"
     elif len(result.files) == 0:
         print "File not found"
Example #11
 def do_add(self,file):
     self.parameter=["limit=1","instance_id=%s"%file]
     self.complete_parameter()
     result=sdquicksearch.run(parameter=self.parameter)
     if len(result.files)==1:
         f=result.files[0]
         if f['status']==sdconst.TRANSFER_STATUS_NEW:
             sdenqueue.run(result.files)
             print "File successfully enqueued"
         else:
             print "File already enqueued"
     elif len(result.files)==0:
         print "File not found"
Example #12
def fill_missing_dataset_timestamp(dataset_without_timestamp):
    """This funcs set the dataset timestamp.

    Notes
        - This func DO NOT commit.
        - In ESFG, timestamp differs from replica to replica, and so, as there
          is no dataset replica concept in 'sdt', it's really a hack, because
          we set the timestamp randomly (i.e. dataset's timestamp in
          Synda installation at user A may differ to dataset's timestamp
          in Synda installation at user B (because the timestamp for the
          dataset may have been retrieved from replica X in the case of user A
          and from replica Y in the case of user B (and X replica's timestamp
          may differ from Y replica's timestamp))). Anyway, in the end, we
          hope that the timestamp random is on a much smaller scale than the
          version-to-version time interval scale, so to be able to detect which
          version is the latest ! And yes: all this mess is because version exists
          in different formats ('v1', 'v20140318'..).
    """

    # Retrieve timestamps from ESGF
    # Note
    #     We do not filter replica in the query below in case the master host is not up
    result = sdquicksearch.run(parameter=[
        'limit=1',
        'fields=%s' % timestamp_fields, 'type=Dataset',
        'instance_id=%s' % dataset_without_timestamp.dataset_functional_id
    ],
                               post_pipeline_mode=None)
    li = result.get_files()

    # check if dataset has been found in ESGF
    if len(li) > 0:
        d = li[0]
    else:
        raise SDException(
            "SDTIMEST-800",
            "%s dataset does not exist in ESGF (or the index used does not list it)"
            % dataset_without_timestamp.dataset_functional_id)

    # use file's timestamp if dataset's timestamp is not set in ESGF
    # (this is needed, because some dataset in ESGF have NO timestamp...)
    use_file_timestamp_if_dataset_timestamp_is_missing(d)

    # update timestamp in DB
    dataset = sddatasetdao.get_dataset(dataset_functional_id=d['instance_id'])
    dataset.timestamp = d['timestamp']
    sddatasetdao.update_dataset(dataset, commit=False, keys=['timestamp'])
Example #13
def get_data_nodes(instance_id,replica_scalar):
    """Return one or more data_nodes depending on the 'replica' flag."""

    parameter=['limit=50','type=Dataset','instance_id=%s'%instance_id,'replica=%s'%replica_scalar]

    # debug
    #print parameter

    result=sdquicksearch.run(parameter=parameter,post_pipeline_mode=None,dry_run=False)
    if result.num_result>0:

        datanodes=[]
        for d in result.files:
            datanodes.append(d['data_node'])

        return datanodes
    else: 
        return []
Example #14
    def do_select(self,arg):
        # -- SQL mode (non-documented as not sure if this is here to stay) -- #
        tokens=arg.split()

        column=tokens[0]

        table=tokens[2]
        where_clause=tokens[4:]

        parameter=where_clause

        dry_run=sdsessionparam.get_value('dry_run')

        parameter.append('type=%s'%table.title())
        result=sdquicksearch.run(parameter=parameter,dry_run=dry_run)

        for f in result.files:
            if column in f:
                print f[column]
Example #15
def get_urls(file_functional_id):
    """returns a prioritized list of [url,protocol] where each url can supply the specified file"""

    result=sdquicksearch.run(parameter=['limit=4','fields=%s'%url_fields,'type=File','instance_id=%s'%file_functional_id],post_pipeline_mode=None)
    li=result.get_files()
    sdlog.info("JFPNEXTUR-05","sdquicksearch returned %s sets of file urls: %s"%(len(li),li))
    # result looks like
    # [ {protocol11: url11, protocol12: url12, attached_parameters: dict, score: number,
    #    type: 'File', size: number}, {another dict of the same format}, {another dict}, ... ]
    # with no more than limit=4 items in the list, and no more than three protocols.  
    # We'll return something like urlps = [ [url1,protocol1], [url2,protocol2],... ]
    # The return value could be an empty list.
    # Note: These nested lists are ugly; it's just a quick way to code something up.

    urlps = []
    for dic in li:
        urlps += [ [dic[key],key] for key in dic.keys() if key.find('url_')>=0 ]
        # ... protocol keys are one of 'url_opendap', 'url_http', 'url_gridftp'

    return prioritize_urlps( urlps )
Example #16
def get_data_nodes(instance_id,replica):
    """Return one or more data_nodes depending on the 'replica' flag."""

    parameter=['limit=50','type=Dataset','instance_id=%s'%instance_id]

    if replica is not None:
        parameter.append('replica=%s'%replica)

    # debug
    #print parameter

    result=sdquicksearch.run(parameter=parameter,post_pipeline_mode=None,dry_run=False)
    if result.count()>0:

        datanodes=[]
        for d in result.get_files():
            datanodes.append(d['data_node'])

        return datanodes
    else: 
        return []
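
A possible call to this version of get_data_nodes; the dataset identifier below is a placeholder, and passing None for replica skips the replica filter as shown in the code above:

# Illustrative usage only -- the instance_id below is a placeholder.
nodes = get_data_nodes('cmip5.output1.INSTITUTE.model.experiment.v20140318', None)
if nodes:
    print("dataset available on: %s" % ', '.join(nodes))
else:
    print("dataset not found on any data node")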
Example #17
def use_file_timestamp_if_dataset_timestamp_is_missing(d):

    if 'timestamp' not in d:
        # timestamp doesn't exist in ESGF for this dataset

        # hack
        #
        # Use one of the dataset's file timestamps (chosen arbitrarily, since the
        # files in a dataset do not always share the same timestamp), because the
        # dataset's own timestamp is missing in ESGF.

        # Note
        #     We do not filter replica in the query below in case the master host is not up
        result = sdquicksearch.run(parameter=[
            'limit=1',
            'fields=%s' % timestamp_fields, 'type=File',
            'dataset_id=%s' % d['instance_id']
        ],
                                   post_pipeline_mode=None)
        li = result.get_files()
        if len(li) > 0:
            file = li[0]

            if 'timestamp' in file:

                d['timestamp'] = file['timestamp']

                sdlog.info(
                    "SDTIMEST-001",
                    "Dataset timestamp set from one dataset's file's timestamp (dataset_functional_id=%s,file_functional_id=%s)"
                    % (d['instance_id'], file['instance_id']))
            else:
                raise SDException(
                    "SDTIMEST-008",
                    "Timestamp missing in both dataset and dataset's file(s) (%s)"
                    % d['instance_id'])
        else:
            raise SDException(
                "SDTIMEST-011",
                "Dataset exist in ESGF, but is empty (%s)" % d['instance_id'])
Example #18
def fill_missing_dataset_timestamp(dataset_without_timestamp):
    """This funcs set the dataset timestamp.

    Notes
        - This func DO NOT commit.
        - In ESFG, timestamp differs from replica to replica, and so, as there
          is no dataset replica concept in 'sdt', it's really a hack, because
          we set the timestamp randomly (i.e. dataset's timestamp in
          Synda installation at user A may differ to dataset's timestamp
          in Synda installation at user B (because the timestamp for the
          dataset may have been retrieved from replica X in the case of user A
          and from replica Y in the case of user B (and X replica's timestamp
          may differ from Y replica's timestamp))). Anyway, in the end, we
          hope that the timestamp random is on a much smaller scale than the
          version-to-version time interval scale, so to be able to detect which
          version is the latest ! And yes: all this mess is because version exists
          in different formats ('v1', 'v20140318'..).
    """

    # Retrieve timestamps from ESGF
    # Note
    #     We do not filter replica in the query below in case the master host is not up
    result=sdquicksearch.run(parameter=['limit=1','fields=%s'%timestamp_fields,'type=Dataset','instance_id=%s'%dataset_without_timestamp.dataset_functional_id],post_pipeline_mode=None)
    li=result.get_files()

    # check if dataset has been found in ESGF
    if len(li)>0:
        d=li[0]
    else:
        raise SDException("SDTIMEST-800","%s dataset does not exist in ESGF (or the index used does not list it)"%dataset_without_timestamp.dataset_functional_id)

    # use file's timestamp if dataset's timestamp is not set in ESGF
    # (this is needed, because some dataset in ESGF have NO timestamp...)
    use_file_timestamp_if_dataset_timestamp_is_missing(d)

    # update timestamp in DB
    dataset=sddatasetdao.get_dataset(dataset_functional_id=d['instance_id'])
    dataset.timestamp=d['timestamp']
    sddatasetdao.update_dataset(dataset,commit=False,keys=['timestamp'])
Example #19
    def do_select(self, arg):
        # -- SQL mode (non-documented as not sure if this is here to stay) -- #

        import sdsessionparam, sdquicksearch

        tokens = arg.split()

        column = tokens[0]

        table = tokens[2]
        where_clause = tokens[4:]

        parameter = where_clause

        dry_run = sdsessionparam.get_value('dry_run')

        parameter.append('type=%s' % table.title())
        result = sdquicksearch.run(parameter=parameter, dry_run=dry_run)

        for f in result.files:
            if column in f:
                print f[column]
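
Given the token positions used above (tokens[0] as the column, tokens[2] as the table, tokens[4:] as the where clause, all split from the text following the select keyword), an input of the following shape would be parsed as shown; the facet value is a placeholder:

# Illustrative only -- typed at the interactive prompt; the facet value is a placeholder.
#   select local_path from file where project=CMIP5 limit=1
#   tokens[0]  -> column       = 'local_path'
#   tokens[2]  -> table        = 'file' (title-cased to 'File', so 'type=File' is appended)
#   tokens[4:] -> where_clause = ['project=CMIP5', 'limit=1']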
Example #20
def get_sample_datasets(project,limit):
    """This func retrieves a list of sample datasets for the given project."""

    result=sdquicksearch.run(parameter=['type=Dataset','project=%s'%project,"limit=%s"%limit],post_pipeline_mode='dataset')
    return result
Example #21
def get_sample_files(project,limit):
    """This func retrieves a list of sample files for the given project."""

    result=sdquicksearch.run(parameter=['type=File','project=%s'%project,"limit=%s"%limit],post_pipeline_mode=None)
    return result
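
A possible way to use the sample helpers above, assuming the result object exposes get_files() as in the other examples on this page; the project name is a placeholder:

# Illustrative usage only -- 'CMIP5' is a placeholder project name.
result = get_sample_files('CMIP5', 5)
for f in result.get_files():
    print(f['file_functional_id'])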