Example #1
def call_sessionmaker(root):
    from sqlalchemy.orm import sessionmaker
    from sqlalchemy import create_engine
    from esgcet.config import loadConfig, initLogging, registerHandlers

    # init_file = "../scripts/esg.ini"
    init_file = None  # Load installed init file
    echoSql = True

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    root.engine = create_engine(config.getdburl('extract'),
                                echo=root.echoSql,
                                pool_recycle=3600)
    initLogging('extract', override_sa=root.engine)
    Session = sessionmaker(bind=root.engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    root.config = config
    root.Session = Session
    root.projectName = None
    root.firstFile = None
    root.dmap = None
    root.extraFields = None
    root.directoryMap = None
    root.datasetMapfile = None
    root.filefilt = None
Example #2
def call_sessionmaker(root):
    from sqlalchemy.orm import sessionmaker
    from sqlalchemy import create_engine
    from esgcet.config import loadConfig, initLogging, registerHandlers

    # init_file = "../scripts/esg.ini"
    init_file = None                    # Load installed init file
    echoSql = True

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    root.engine = create_engine(config.getdburl('extract'), echo=root.echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=root.engine)
    Session = sessionmaker(bind=root.engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    root.config = config
    root.Session = Session
    root.projectName = None
    root.firstFile = None
    root.dmap = None
    root.extraFields = None
    root.directoryMap = None
    root.datasetMapfile = None
    root.filefilt = None
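A minimal usage sketch for call_sessionmaker (the attribute-holder class below is hypothetical; any object that accepts attribute assignment works). Note that the function reads root.echoSql rather than its local echoSql, so that attribute must exist before the call.

class PublisherState(object):
    echoSql = False              # read by call_sessionmaker as root.echoSql

root = PublisherState()
call_sessionmaker(root)          # attaches config, engine and Session to root
session = root.Session()         # SQLAlchemy session bound to root.engine
# query or publish work would go here
session.close()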
Example #3
def initdb(init_file=None, echoSql=False, log_filename=None):
    global dbengine

    config = loadConfig(init_file)
    if dbengine is None:
        dbengine = create_engine(config.getdburl("extract"), echo=echoSql, pool_recycle=3600)
    initLogging("extract", override_sa=dbengine, log_filename=log_filename)
    Session = sessionmaker(bind=dbengine, autoflush=True, autocommit=False)
    return config, Session
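A hedged sketch of how initdb() is typically consumed, assuming the module-level imports (loadConfig, create_engine, sessionmaker, initLogging) and the global dbengine implied by the example:

config, Session = initdb(echoSql=False)
session = Session()              # SQLAlchemy session bound to the shared dbengine
try:
    pass                         # query or publish against the esgcet database here
finally:
    session.close()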
Example #4
def main(argv):

    try:
        args, lastargs = getopt.getopt(
            argv, "", ['config-section=', 'echo', 'recursive='])
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    configSection = "srmls"
    echo = False
    recurse = True
    for flag, arg in args:
        if flag == '--config-section':
            configSection = arg
        elif flag == '--echo':
            echo = True
        elif flag == '--recursive':
            recurse = (arg.lower() == "yes")

    if len(lastargs) == 0:
        print "No directory specified."
        print usage
        sys.exit(0)

    if recurse:
        recurseOption = "-recursive"
    else:
        recurseOption = ""

    config = loadConfig(None)
    command = config.get(configSection, 'srmls')
    offline_proxy = config.get(configSection, 'srm_server')
    archive = config.get(configSection, 'srm_archive')
    srm_prefix = "%s?SFN=%s" % (offline_proxy, archive)
    echo_args = "-storageinfo %s -s '%s?SFN=%s%s'" % (
        recurseOption, offline_proxy, archive, lastargs[0])
    command_args = "-storageinfo %s -s %s?SFN=%s%s" % (
        recurseOption, offline_proxy, archive, lastargs[0])

    if echo:
        print '%s %s' % (command, echo_args)
        sys.exit(0)

    try:
        f = subprocess.Popen([command, command_args],
                             stdout=subprocess.PIPE).stdout
    except:
        raise ESGPublishError(
            "Error running command '%s %s': check configuration option 'srmls'"
            % (command, command_args))

    for path, size in SRMIterator(f, srm_prefix):
        print path, size
    f.close()
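The three options read above come from the [srmls] config section (or the section named by --config-section). A hypothetical sketch of such a section; the command path, server URL and archive path are placeholders, not values from the source:

[srmls]
srmls = /usr/local/srm-client/bin/srm-ls
srm_server = srm://srm.example.org:6288/srm/v2/server
srm_archive = /garchive/cmip5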
Example #5
def initdb(init_file=None, echoSql=False, log_filename=None):
    global dbengine

    config = loadConfig(init_file)
    if dbengine is None:
        dbengine = create_engine(config.getdburl('extract'),
                                 echo=echoSql,
                                 pool_recycle=3600)
    initLogging('DEFAULT', override_sa=dbengine, log_filename=log_filename)
    Session = sessionmaker(bind=dbengine, autoflush=True, autocommit=False)
    return config, Session
Example #6
def gc_mvgood( topdir, gcdir ):
    """second step of gc, move good files from /scratch/_gc/ to /scratch/."""
    config = loadConfig(None)
    engine = sqlalchemy.create_engine(config.get('replication', 'esgcet_db'), echo=False,
                                      pool_recycle=3600)

    # os.walk isn't going to work very well.  I would have to parse the path to identify
    # the abs_path, which encodes the facets and version of the dataset, etc.
    # It's easier to start with those pieces of the path, and stick them together...

    for gcdsdir in glob.glob( gcdir ):
        fac1dir = gcdsdir[ len(os.path.join(topdir,'scratch/_gc/')): ]  # one choice of facets
        # ...gcdsdir is the root directories for the dataset now in .../scratch/_gc/...
        # Below this directory are ones for versions and variables, and possibly bad? directories
        # for files which failed a checksum.
        if fac1dir.endswith("withdrawn"):   # leave this facet directory in _gc, all versions
            # rare, seen for LASG probably it's a name change done by hand
            continue
        versiondirs = os.listdir( gcdsdir )  # should be version directories e.g. v20120913/
        for versd in versiondirs:
            verspath = os.path.join(gcdsdir,versd)
            if not os.path.isdir(verspath): continue
            if not check_versiondir( versd ):
                raise Exception("%s does not look like a version directory"%versd)
            vardirs = os.listdir(verspath)
            mvstatus = False  # True if any file in this dataset+version should be moved
                              # back to scratch/.
            for vard in vardirs:
                varpath = os.path.join(verspath,vard)
                dirpath = varpath
                if not os.path.isdir(varpath): continue
                filenames = os.listdir(varpath) # mostly files, may also have bad? directories
                for filename in filenames:
                    filep = os.path.join(varpath,filename)
                    if os.path.isfile(filep):
                        abspath = os.path.join( fac1dir, versd, vard, filename )
                        mvstatus = mvstatus or mvgood2scratch( filename, abspath, dirpath, engine )
            if mvstatus is True:
                # A file was moved back to scratch, others in the same dataset+version should be
                # moved.
                for vard in vardirs:
                    varpath = os.path.join(verspath,vard)
                    dirpath = varpath
                    if not os.path.isdir(varpath): continue
                    filenames = os.listdir(varpath) # mostly files, may also have bad? directories
                    for filename in filenames:
                        filep = os.path.join(varpath,filename)
                        if os.path.isfile(filep):
                            mv2scratch( filename, dirpath )
                        if os.path.isdir(filep) and filename.find('bad')==0:
                            # bad* subdirectories hold files which failed a checksum;
                            # move those files back to scratch/ as well.
                            for filen in os.listdir(filep):
                                if os.path.isfile(os.path.join(filep,filen)):
                                    mv2scratch( filen, filep )
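For orientation, a commented sketch of the directory layout gc_mvgood() walks under scratch/_gc/; the facet path, version, variable and file names are purely illustrative:

# <topdir>/scratch/_gc/<facet path>/v20120913/cl/cl_Amon_SomeModel_expt_r1i1p1_200101-200512.nc
#
#   gcdsdir = <topdir>/scratch/_gc/<facet path>      (one glob match of gcdir)
#   fac1dir = <facet path>                           (everything after scratch/_gc/)
#   versd   = v20120913                              (validated by check_versiondir)
#   vard    = cl                                     (variable directory; bad* subdirectories may also appear here)
#   abspath = <facet path>/v20120913/cl/<filename>   (abs_path form used by the replica database)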
Example #7
def main(argv):

    try:
        args, lastargs = getopt.getopt(argv, "", ['config-section=', 'echo', 'recursive='])
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    configSection = "srmls"
    echo = False
    recurse = True
    for flag, arg in args:
        if flag=='--config-section':
            configSection = arg
        elif flag=='--echo':
            echo = True
        elif flag=='--recursive':
            recurse = (arg.lower()=="yes")

    if len(lastargs)==0:
        print "No directory specified."
        print usage
        sys.exit(0)

    if recurse:
        recurseOption = "-recursive"
    else:
        recurseOption = ""

    config = loadConfig(None)
    command = config.get(configSection, 'srmls')
    offline_proxy = config.get(configSection, 'srm_server')
    archive = config.get(configSection, 'srm_archive')
    srm_prefix = "%s?SFN=%s"%(offline_proxy, archive)
    echo_args = "-storageinfo %s -s '%s?SFN=%s%s'"%(recurseOption, offline_proxy, archive, lastargs[0])
    command_args = "-storageinfo %s -s %s?SFN=%s%s"%(recurseOption, offline_proxy, archive, lastargs[0])

    if echo:
        print '%s %s'%(command, echo_args)
        sys.exit(0)

    try:
        f = subprocess.Popen([command, command_args], stdout=subprocess.PIPE).stdout
    except:
        raise ESGPublishError("Error running command '%s %s': check configuration option 'srmls'"%(command, command_args))
        
    for path, size in SRMIterator(f, srm_prefix):
        print path, size
    f.close()
Example #8
def download_list( facets=facets_default, downloadlist="download-mon", tempfile="esg.tmp",\
                   reuse_tempfile=False, forcedl=False,\
                   statusfile=None, head='/cmip5/scratch', serviceurl="", limit=0 ):
    """generates a download list suitable for Estani's Download.py.
    Inputs are: facets, a dictionary of 9 facets which specify the datasets to be downloaded,
    (You can generate facets with the function dataset2facets)
    downloadlist, name of the file to write to,
    tmpfile, name of a temporary file (used for output from esgquery_index)
    (for debugging purposes you can set reuse_tempfile=True to use a file previously generated)
    forcedl, set True to force download even of files we already have
    statusfile, name of a file to which warnings and debugging information are written; defaults to stdout
    head, the first part of the download path (after which is the abs_path of the replica database)
    limit, the maximum number of files to process; 0 for no limit (mainly for testing)
    serviceurl, the argument of esgquery_index
    """
    if statusfile==None: statusfile = sys.stdout
    else: statusfile = open(statusfile,'a+')

    # 
    # Put the institute name in its P2P form for esgquery_index.
    # For example, esgquery_index wants CCCMA, not CCCma; and sometimes a space rather than a hyphen.
    institute = facets['institute']
    if institute in institute_p.keys():
        institute = institute_p[institute]
    facets['institute'] = institute
    # Also fix the project name - it's CMIP5 for the P2P system, cmip5 for the gateway system.
    facets['project'] = facets['project'].upper()   # same for cmip5/CMIP5
    # Also, in the one case I know of where this is a problem, put the model in its P2P form:
    if facets['model']=='inmcm4':
        facets['model'] = 'INM-CM4'

    # The fields to get with esgquery:
    fields = facets.keys()+[ 'url', 'latest', 'variable', 'title', 'size', 'checksum', 'checksum_type' ]

    # ...note about versions: the version field (when you do esgquery_index on a file) is file_version,
    # not the dataset version which is normally meant by "version".  We can get the dataset version by
    # extracting the dataset_id field for the file, then calling
    # "esgquery_index -q id=<dataset_id> --fields version -p".
    # But it will be simpler to extract the dataset version from a file_id, as is done below.

    pp = pprint.PrettyPrinter(stream=statusfile)
    if not reuse_tempfile:
        ffiles = open(tempfile,'w')

        arg1= "-q "+ ','.join([ i+"='"+facets[i]+"'" for i in facets.keys() if facets[i] is not '%' ])
        arg2="--fields "+','.join(fields)+" --type f -p"
        arg3 = "" # various options
        if len(serviceurl)>0:
            arg3 = arg3 + " --serviceurl "+serviceurl
        if limit>0:
            arg3 = arg3 + " --limit %d"%(limit)
        subprocess.call(['echo','esgquery_index',arg1,arg2,arg3],stdout=ffiles)
        print 'esgquery_index'+' '+arg1+' '+arg2+' '+arg3
        subprocess.call(['esgquery_index'+' '+arg1+' '+arg2+' '+arg3],stdout=ffiles,shell=True)
        ffiles.close()
    ffiles = open(tempfile)
    dll = []  # each list entry will be a dictionary, download information for one file
    # Thus I'm assuming that esgquery_index is sorted by the file id.  If this ever turns out not
    # to be true, I can sort it or make dll a dict.
    fileid = None
    for line in ffiles:
        cols=line.split('|')
        # len(cols)=1:header or footer.  otherwise,usually:
        # cols[0]='',cols[1]=dataset.file,cols[2]=host,cols[3]=field,cols[4:len-1]=value,cols[len-1]=garbage
        # The original id (first column) is cols[1]+'.'+cols[2].
        if len(cols)<6: continue   # probably 6 columns is real; anything less means header or footer
        if cols[1]!=fileid:    # first hit on this file.
            fileid = cols[1]
            fd = {'fileid':lstrip(rstrip(fileid))}
            dll.append(fd)
        field = lstrip(rstrip(cols[3]))
        fd[field] = [ lstrip(rstrip(val)) for val in cols[4:len(cols)-1] ]

    # pp.pprint(dll)

    fdllist = open(downloadlist,'w')
    dllist = []
    for fd in dll:
        # Form this file's line in download list, first separately getting its fields (columns).
        # Don't bother if this isn't the latest version of the file:
        if fd['latest'][0]!='true':
            # print "file",fd['fileid'],"is not the latest"
            continue
        out0 = fd['url'][0]
        out2 = fd['size'][0]
        out3 = fd.get( 'checksum', ['DUMMY'] )[0]
        out4 = fd.get( 'checksum_type', ['md5'] )[0].lower()
        # When we write to a file, out1 will be the target path, of the form  head/project/product/institute/...
        # But for now, out1 will be abs_path, of the form project/product/institute/...
        # ...institute/model/experiment/time_frequency/realm/cmor_table/ensemble/<version>/variable/title
        # To get the version, take advantage of the fact that the fileid always begins with the same fields,
        # project.product.institute.model.experiment.time_frequency.realm.cmor_table.ensemble.version.filename:
        version = fd['fileid'].split('.')[9]
        institute = fd['institute'][0]
        # Put the institute name in its gateway form.
        if institute in institute_g.keys():
            institute = institute_g[institute]
        model = fd['model'][0]
        if model=="INM-CM4" or model=="inm-cm4":
            model = "inmcm4"
        out1 = '/'.join( [ fd['project'][0].lower(), fd['product'][0], institute, fd['model'][0],\
                           fd['experiment'][0], fd['time_frequency'][0], fd['realm'][0], fd['cmor_table'][0],\
                           fd['ensemble'][0], version, fd['variable'][0], fd['title'][0] ] )
        dllist.append([ out0, out1, out2, out3, out4 ]) 
        # fdllist.write('\t'.join([ out0, head+'/'+out1, out2, out3, out4 ])+'\n')
    

    # At this point we have in dllist a download list, constructed from the output of esgquery_index
    # (i.e. the P2P system).
    # Now we shall compare it with files known to the replication database.  If we already have the
    # file, don't get it again.  If we have the exact same file stored under a different version number
    # (very common), re-use it.  If the file _isn't_ in the database, issue a warning and don't get it
    # (because we wouldn't be able to keep track of it after downloading it).

    config = loadConfig(None)
    engine = sqlalchemy.create_engine(config.get('replication', 'esgcet_db'), echo=False, pool_recycle=3600)

    # files0 is the list of files we want but already have, expressed as an abs_path; from the replication database.
    # Note:  this could be merged with the check for older files - would save a database access
    # but increase code complexity a little.
    # Note: unfortunately, esgquery_index and postgres seem to do output sorting and hence limits
    # a little differently.
    dstr = facets2dataset(facets)
    if not forcedl:
        sql0 = "SELECT abs_path FROM replica.files WHERE dataset_name LIKE '"+dstr+"' AND status>=30;"
        files0 = engine.execute( sql.text( sql0 ) ).fetchall()
        files0 = [ f[0] for f in files0 ]
        # pp.pprint "From %s files0=\n"%(sql0)
        # pp.pprint(files0)
        # Of course, we don't want files we already have, so take them out of the download list:
        # Making a set out of files0 should convert the deletion process from O(n^2) to O(n), with
        # bigger constants; I'm not sure whether it's worth it.  Usually 1,000<n<100,000.
        # Also, the database and esgquery_index have different naming conventions, e.g. institute
        # CCCma/CCCMA so the comparison has to be made case-insensitive (at least; but I believe that
        # urls and file paths derived from the P2P system will hyphenate the same way).
        sfiles0 = set([f.lower() for f in files0])
        ldllist0 = len(dllist)
        dllist = [ row for row in dllist if row[1].lower().replace('inm-cm4','inmcm4') not in sfiles0 ]
        #...if there are any more mismatch cases not handled by lower(), then I'll have to do
        # a more complicated fix - break up into facets, substitute with tables, then recombine.
        statusfile.write("dllist reduced from %d to %d lines by removing files we already have\n"%\
                         (ldllist0, len(dllist)) )

    # files1 is the files we want and don't have; from the replication database.
    # It should correspond to the download list, but probably doesn't because they're based on different
    # harvests.
    # I don't want to deal with files which are missing from the database.  Rather than try
    # to fix the database, we'll take them out of the download list too.
    if forcedl:
        sql1 = "SELECT abs_path FROM replica.files WHERE dataset_name LIKE '"+dstr+"';"
    else:
        sql1 = "SELECT abs_path FROM replica.files WHERE dataset_name LIKE '"+dstr+"' AND status<30;"
    print sql1
    files1 = engine.execute( sql.text( sql1 ) ).fetchall()
    files1 = [ f[0] for f in files1 ]
    # pp.pprint "From %s files1=\n"%(sql0)
    # pp.pprint(files1)
    sfiles1 = set([f.lower() for f in files1])
    ldllist0 = len(dllist)
    # pp.pprint(dllist)
    print [ row[1].lower().replace('inm-cm4','inmcm4') for row in dllist ][0]
    dllist2 = [ row for row in dllist if row[1].lower().replace('inm-cm4','inmcm4') in sfiles1 ]
    #...if there are any more mismatch cases not handled by lower(), then I'll have to do
    # a more complicated fix - break up into facets, substitute with tables, then recombine.
    statusfile.write(("dllist reduced from %d to %d lines by removing files not known to the replication"+\
                     " database.\n")%(ldllist0, len(dllist2)) )
    if len(dllist2)<ldllist0:
        statusfile.write( "WARNING: This change discards the following download list files.\n" )
        statusfile.write( "Maybe it's time for another harvest!\n" )
        if statusfile!=sys.stdout:  # don't write too much to the screen
            pp.pprint( [ row[1] for row in dllist if row[1].lower().replace('inm-cm4','inmcm4') not in sfiles1 ] )
            #...if there are any more mismatch cases not handled by lower(), then I'll have to do
            # a more complicated fix - break up into facets, substitute with tables, then recombine.
        else:
            statusfile.write("(filenames not printed)\n")
    dllist = dllist2

    # If there are no output limits and the relevant harvests are up-to-date, then there should be a
    # 1:1 correspondence between files1 and dllist (because no file should appear twice in either one).
    # So check for that.
    if limit<=0:
        if len(files1)!=len(dllist):
            statusfile.write("WARNING: esgquery and database produced different numbers of files to download!")
            statusfile.write(" esgquery: %d;  database: %d\n"%( len(dllist), len(files1) ) )
            if statusfile!=sys.stdout:
                print "WARNING: esgquery and database produced different numbers of files to download!",\
                      len(dllist), len(files1)
    # _Maybe_ these sorts will help in finding the row efficiently in the older-version search below;
    # this needs investigation if it matters.
    files1.sort( key=( lambda i: i.lower() ) )
    dllist.sort( key=( lambda i: i[1].lower() ) )

    # files2 is the same as files1 but with the SQL wildcard % in place of the version
    # example of the following:
    # f = cmip5/output1/CCCma/CanCM4/decadal2008/mon/atmos/Amon/r1i1p1/v20111027/cl/cl_etc.nc
    # fs=[cmip5,output1,CCCma,CanCM4,decadal2008,mon,atmos,Amon,r1i1p1,v20111027,cl,cl_etc.nc]
    # fs=[cmip5,output1,CCCma,CanCM4,decadal2008,mon,atmos,Amon,r1i1p1,%,cl,cl_etc.nc]
    # g = cmip5/output1/CCCma/CanCM4/decadal2008/mon/atmos/Amon/r1i1p1/%/cl/cl_etc.nc
    files2=files1[:]
    for i in range(len(files2)):
        fs = files1[i].split('/')
        fs[9] = '%'
        files2[i] = '/'.join(fs)
    # pp.pprint "From %s files2=\n"%(sql0)
    # pp.pprint(files2)

    # Now look for older versions of each file in dllist/files1/files2:
    nnomatch = 0
    for i in range(len(files2)):
        fil2 = files2[i]
        fil1 = files1[i]
        sql2 = "SELECT abs_path,checksum,checksum_type FROM replica.files WHERE abs_path LIKE '"+fil2+\
               "' AND status>=30;"
        hvf = engine.execute( sql.text( sql2 ) ).fetchall()   # list of (abs_path,checksum,checksum_type)
        for fi in hvf:
            # If abs_path is in the download list, this is a file we have which is the same as a file we
            # want, other than version number.  If the checksum matches, we don't have to download -
            # just copy from one version's directory to the new version's directory.
            # Of course, don't bother to do anything if the dllist already refers to another local copy.

            row = next((r for r in dllist if r[1]==fil1),None)  # the row which matches fil1; None if no match
            # ...the above use of a generator expression will only search until the row is found.
            # Thanks to ionous blog: http://dev.ionous.net/2009/01/python-find-item-in-list.html
            # >>> I would like it to start at the previous match; that will usually get the next
            # >>> match in just one try if the lists are sorted first; look into this later.
            if row==None:
                # The database has a file, abs_path==fil1, which the P2P system (i.e. dllist) doesn't
                # know about.  That is, the P2P and gateway systems are inconsistent with one another.
                # This shouldn't happen, but often does...
                if statusfile!=sys.stdout:  # don't write too much to the screen!
                    statusfile.write( "WARNING, can't find match for database file %s\n"%(fil1) )
                    nnomatch = nnomatch+1
                continue
            statusfile.write( fil1+'\n' )
            pp.pprint( row )
            statusfile.write( fi[0]+'\n' )
            if fi[1].upper()==row[3].upper() and fi[1].upper()!='DUMMY' and\
                   fi[2].lower()==row[4].lower() and row[0].find("file")!=0:
                # checksums match, aren't "DUMMY", so change the download url to do a local copy
                # >>>> for the moment, assume that we know where the file is.<<<<
                # >>>> later, check that it's here, and if not look other places...
                row[0] = "file://"+head+'/'+fi[0]

    statusfile.write("%d failures to find a P2P file matching a file needed by the database\n"%(nnomatch))
    if statusfile!=sys.stdout:
        print "%d failures to find a P2P file matching a file needed by the database\n"%(nnomatch)
    for row in dllist:
        fdllist.write('\t'.join([ row[0], head+'/'+row[1], row[2], row[3], row[4] ])+'\n')
    ffiles.close()
    fdllist.close()
    if statusfile is not sys.stdout: statusfile.close()
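A hypothetical invocation of download_list(); the nine facet values are placeholders modeled on the worked path example in the comments above, and a facet set to '%' acts as a wildcard (such facets are dropped from the esgquery_index query):

facets = {'project': 'cmip5', 'product': 'output1', 'institute': 'CCCma',
          'model': 'CanCM4', 'experiment': 'decadal2008', 'time_frequency': 'mon',
          'realm': 'atmos', 'cmor_table': 'Amon', 'ensemble': 'r1i1p1'}
download_list(facets=facets, downloadlist='download-mon',
              statusfile='download-mon.status', limit=100)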
Example #9
def preoutput(argv):
    # This is what was formerly all of main() except the output section.
    global DEFAULT_QUERY_SERVICE

    try:
        args, lastargs = getopt.getopt(
            argv,
            "d:ho:pq:t:v",
            [
                "count",
                "delimiter=",
                "facet-query=",
                "facets=",
                "fields=",
                "format=",
                "free-text=",
                "help",
                "limit=",
                "pretty-print",
                "service-url=",
                "type=",
                "verbose",
            ],
        )
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    # Get the search URL from the publisher configuration if possible
    try:
        from esgcet.config import loadConfig

        config = loadConfig(None)
        DEFAULT_QUERY_SERVICE = config.get("DEFAULT", "solr_search_service_url", default=DEFAULT_QUERY_SERVICE)
    except:
        pass

    allFacets = False  # facets=*
    countOnly = False
    delim = None
    facets = []
    facetValues = None
    fields = []
    format = DEFAULT_FORMAT
    freetext = None
    includeId = False
    objtype = DATASET
    offset = 0
    outpath = sys.stdout
    outpathIsStdout = True
    prettyPrint = False
    service = DEFAULT_QUERY_SERVICE
    userLimit = MAX_RECORDS
    verbose = False
    for flag, arg in args:
        if flag == "--count":
            countOnly = True
        elif flag in ["-d", "--delimiter"]:
            delim = arg
            prettyPrint = False
        elif flag == "--facets":
            facetList = arg.split(",")
            facetValues = [item.strip() for item in facetList]
            allFacets = facetValues[0] == "*"
        elif flag == "--fields":
            fieldList = arg.split(",")
            fields = [item.strip() for item in fieldList]
        elif flag == "--format":
            if arg not in ["narrow", "wide"]:
                raise RuntimeError("Invalid format: %s" % arg)
            format = arg
        elif flag in ["-h", "--help"]:
            print usage
            sys.exit(0)
        elif flag == "--limit":
            try:
                userLimit = int(arg)
            except:
                raise RuntimeError("Invalid limit: %s" % arg)
        elif flag == "-o":
            outpath = open(arg, "w")
            outpathIsStdout = False
        elif flag in ["-p", "--pretty-print"]:
            prettyPrint = True
        elif flag in ["-q", "--facet-query"]:
            queries = arg.split(",")
            for q in queries:
                f, v = q.split("=")
                facets.append((f.strip(), v.strip()))
        elif flag == "--service-url":
            service = arg
        elif flag in ["-t", "--free-text"]:
            freetext = arg
        elif flag == "--type":
            try:
                objtype = typeCode[arg]
            except:
                raise RuntimeError("Invalid return type: %s" % arg)
        elif flag in ["-v", "--verbose"]:
            verbose = True

    # If returning id only, use wide format
    if fields == ["id"]:
        format = "wide"
        includeId = True

    # For facet value queries, use wide format.
    if facetValues is not None:
        format = "wide"

    # While remaining data:
    fullResults = []
    numFound = 0
    moredata = True
    nread = 0
    nleft = userLimit
    offset = 0
    chunksize = DEFAULT_CHUNKSIZE
    while moredata:

        # Formulate a query
        if not (countOnly or facetValues is not None):
            limit = min(nleft, chunksize)
        else:
            limit = 0
        query = formulateQuery(
            facets, fields, format, freetext, objtype, service, offset, limit, facetValues=facetValues
        )
        if verbose:
            print >> sys.stderr, "Query: ", query

        # Read a chunk
        chunk = readChunk(service, query)

        # Parse the response. For facet value searches, parse the response trailer
        if facetValues is None:
            results, numFound, numResults = parseResponse(chunk, includeId)
            fullResults.extend(results)
        else:
            numResults = 0
            if allFacets:
                fullResults, numFound = parseHeader(chunk)
            else:
                fullResults, numFound = parseTrailer(chunk, facetValues, includeId)

        # More data if some results were found and the number of records read < total
        nread += numResults
        nleft -= limit
        moredata = (numResults > 0) and (nread < min(numFound, userLimit))
        offset += limit

    ##    print "jfp fullResults as list of (id,field,value) ="
    ##    pprint.pprint(fullResults)
    ##    results_ids = set([a for a,b,c in fullResults])
    ##    fullResults_dicts = [(a, { b:c for a,b,c in fullResults if a1==a }) for a1 in results_ids]
    ##    print "jfp fullResults as list of (id,dict_of_field:value) ="
    ##    pprint.pprint( fullResults_dicts )
    # TO DO: return values which aren't used here, really should be computed in main
    return (
        fullResults,
        countOnly,
        facetValues,
        allFacets,
        prettyPrint,
        delim,
        outpath,
        numFound,
        outpathIsStdout,
        format,
    )
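A hypothetical argument list that the option parsing above accepts; the facet names, field names and limit are placeholders, and the --type code follows the usage seen with esgquery_index elsewhere on this page:

argv = ['--facet-query', 'project=CMIP5,model=CanCM4',
        '--fields', 'variable,version',
        '--type', 'f',
        '--limit', '10',
        '--pretty-print']
(fullResults, countOnly, facetValues, allFacets, prettyPrint,
 delim, outpath, numFound, outpathIsStdout, format) = preoutput(argv)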
Example #10
def main():

    """Uses the esg.ini file options:
        - thredds_file_services
              to get a Globus endpoint UUID
        - thredds_root
              to find a directory with THREDDS xml catalogs
    """

    loadConfig(None)
    config = getConfig()
    if config is None:
        raise ESGPublishError('No configuration file found')

    # By default thredds_root is: /esg/content/thredds/esgcet
    thredds_root = config.get('DEFAULT', 'thredds_root')
    thredds_file_services = getThreddsServiceSpecs(config, 'DEFAULT', 'thredds_file_services')
    # parameters needed to re-harvest the THREDDS catalogs
    thredds_url = config.get('DEFAULT', 'thredds_url')
    hessian_service_certfile = config.get('DEFAULT', 'hessian_service_certfile')
    hessian_service_url = config.get('DEFAULT', 'hessian_service_url')
    esgf_harvesting_service_url = hessian_service_url.replace('remote/secure/client-cert/hessian/publishingService','ws/harvest')

    thredds_root_up = os.path.normpath(os.path.join(thredds_root, '..'))
    globus_base = None
    for service in thredds_file_services:
        if service[2] == 'Globus':
            globus_base = service[1]
    if globus_base is None:
        print 'No Globus file service specified in %s\n'\
              'Add Globus file service to the thredds_file_services variable in the form:\n'\
              '        Globus | globus:<UUID_of_Globus_endpoint_pointing_to_your_data_node_GridFTP_server> | Globus | fileservice\n'\
              'A UUID assigned to the endpoint can be found on https://globus.org/' % os.environ['ESGINI']
        sys.exit(1)

    print '\n'\
          'ESGINI: %s\n'\
          'THREDDS root: %s\n'\
          'THREDDS url: %s\n'\
          'Globus service base: %s\n'\
          'ESGF harvesting service url: %s\n'\
          'X.509 user credential: %s\n'\
          '' % (os.environ['ESGINI'], thredds_root, thredds_url, globus_base, esgf_harvesting_service_url, hessian_service_certfile)

    if not globus_base.endswith('/'):
        print 'Globus service base must end with "/". Set Globus service base correctly in\n'\
              '%s and run the script again.' % os.environ['ESGINI']
        sys.exit(1)

    print 'The script recursively goes through xml files in %s\n'\
          'looking for datasets that were published without Globus file service and adds\n'\
          'Globus access to the datasets. If a dataset was published with Globus file\n'\
          'service configured, the script skips such a dataset leaving a corresponding xml\n'\
          'file unmodified. The script reinitializes THREDDS and requests the Hessian\n'\
          'service to harvest the updated xml files. Because the Hessian service requires SSL\n'\
          'authentication, the X.509 certificate, %s,\n'\
          'should be valid and obtained by a user who has the publisher role in all\n'\
          'projects.\n'\
          'It is strongly advised that you make a copy of the entire %s\n'\
          'directory prior to running this script.' % (thredds_root_up, hessian_service_certfile, thredds_root_up)

    while True:
        sys.stdout.write("Do you want to continue? [y/N]")
        line = sys.stdin.readline().rstrip()
        if line == '' or line == 'n' or line == 'N':
            sys.exit(0)
        if line == 'y' or line == 'Y':
            break

    process(thredds_root, thredds_root_up, globus_base, thredds_url, esgf_harvesting_service_url, hessian_service_certfile)
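For reference, a hypothetical fragment of the esg.ini options this script reads; the Globus entry follows the form printed in the error message above (the endpoint UUID and the HTTPServer line are placeholders), and the Globus service base must end with '/':

[DEFAULT]
thredds_root = /esg/content/thredds/esgcet
thredds_file_services =
    HTTPServer | /thredds/fileServer/ | HTTPServer | fileservice
    Globus | globus:<UUID_of_Globus_endpoint_pointing_to_your_data_node_GridFTP_server>/ | Globus | fileservice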
Example #11
#!/usr/local/cdat/bin/python
"""Manages access to the replica DB"""
import sqlalchemy
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, Float, String, sql, ForeignKey, orm
import metaconfig
import os

import logging
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

#jfp copied from replica_manager.py:
from esgcet.config import loadConfig, initLogging
config = loadConfig(None)
#Final destination of files (the archive).  Typically this comes from ~/.esgcet/esg.ini
archive_root0 = config.get('replication', 'archive_root0')  # on gdo2: /cmip5/data
archive_root1 = config.get('replication', 'archive_root1')  # on gdo2: /css01-cmip5/data
archive_root2 = config.get('replication', 'archive_root2')  # on gdo2: /css02-cmip5/data
#temporary destinations of files and other data while completing the datasets
replica_root0 = config.get('replication', 'replica_root0')   # on gdo2: /cmip5/scratch
replica_root1 = config.get('replication', 'replica_root1')   # on gdo2: /css01-cmip5/scratch
replica_root2 = config.get('replication', 'replica_root2')   # on gdo2: /css02-cmip5/scratch
# not used: map_dir= os.path.join(replica_root, 'map')
#jfp was files_dir= os.path.join(replica_root, 'files')
files_dir0 = replica_root0                               # on gdo2: /cmip5/scratch
files_dir1 = replica_root1                               # on gdo2: /css01-cmip5/scratch
files_dir2 = replica_root2                               # on gdo2: /css02-cmip5/scratch

def __get_config(section, key):
    try:
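The [replication] options read above live in esg.ini; a hedged sketch using the gdo2 paths mentioned in the comments (the database URL is a made-up placeholder):

[replication]
esgcet_db = postgresql://user:password@localhost:5432/esgcet
archive_root0 = /cmip5/data
archive_root1 = /css01-cmip5/data
archive_root2 = /css02-cmip5/data
replica_root0 = /cmip5/scratch
replica_root1 = /css01-cmip5/scratch
replica_root2 = /css02-cmip5/scratch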
Example #12
def main(argv):

    try:
        args, lastargs = getopt.getopt(argv, "", ['config-section=', 'echo', 'help', 'recursive='])
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    configSection = "msls"
    echo = False
    recurse = True
    for flag, arg in args:
        if flag=='--config-section':
            configSection = arg
        elif flag=='--echo':
            echo = True
        elif flag=='--help':
            print usage
            sys.exit(0)
        elif flag=='--recursive':
            recurse = (arg.lower()=="yes")

    if len(lastargs)==0:
        print "No directory specified."
        print usage
        sys.exit(0)

    if recurse:
        recurseOption = "R"
    else:
        recurseOption = ""

    config = loadConfig(None)
    command = config.get(configSection, 'msls')
    path = lastargs[0]
    command_args = "-l%s"%recurseOption

    if echo:
        print '%s %s %s'%(command, command_args, path)
        sys.exit(0)

    try:
        errout = subprocess.Popen([command, command_args, path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout
    except:
        raise ESGPublishError("Error running command '%s %s': check configuration option 'msls'"%(command, command_args))
    lines = errout.readlines()
    errout.close()

    directory = path
    for line in lines:

        line = line.strip()

        # Skip blank lines
        if len(line)==0:
            continue

        # File
        elif line[0]=='-':
            fields = line.split()
            fullpath = os.path.join(directory, fields[-1])
            print fullpath, fields[4]

        # Directory
        elif line[0]=='/' and line[-1]==':':
            directory = line[:-1]

        # Error
        elif line[0]=='/':
            raise ESGPublishError("Error: %s"%line)

        # Skip
        else:
            continue
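A hypothetical fragment of the ls -l style listing the loop above parses, and what it would print (the path is fields[-1] joined to the current directory, the size is fields[4]):

# msls -lR output (illustrative only):
#
#   /mss/archive/run1:
#   -rw-r--r--   1 user   grp    1048576 Jan 01 2012 tas_Amon_SomeModel_expt_r1i1p1.nc
#
# printed by the loop:
#
#   /mss/archive/run1/tas_Amon_SomeModel_expt_r1i1p1.nc 1048576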
Example #13
def getData(inpath, extra_metadata):
    # pdb.set_trace()
    f = cdms2.open(inpath)

    # get a handle to the main module so we can call his routines
    main = sys.modules['__main__']
 
    # load info from esg.ini
    cfg = loadConfig("esg.ini")

    # load info from this run's transient config
    x = SaneConfigParser({})
    x.read(extra_metadata)

    # pdb.set_trace()
    output = dif_switch()
    
    for mode in ['thredds_aggregation_services',
                 'thredds_file_services',
                 'thredds_offline_services']:
        s = splitRecord(cfg.get('DEFAULT', mode))[0]
        iform(output, 'service/')
        iform(output, 'serviceType="%s"' % s[0])
        iform(output, 'base="%s"' % s[1])
        iform(output, 'name="%s"' % s[2])
        iform(output, 'desc="%s"' % DEFAULT_THREDDS_SERVICE_DESCRIPTIONS[s[0]])
        iform(output, 'property/')
        iform(output, 'name="requires_authorization"')
        iform(output,
              'value="%s"' % DEFAULT_THREDDS_SERVICE_AUTH_REQUIRED[s[0]], -1)
        for app in DEFAULT_THREDDS_SERVICE_APPLICATIONS[s[0]]:
            iform(output, '  property/')
            iform(output, 'name="application"')
            iform(output, 'value="%s"' % app, -1)

        iform(output, "", 0)
    
    iform(output, 'property/')
    iform(output, 'name="catalog_version"')
    iform(output, 'value="2"', -1)

    iform(output, 'dataset/')
    iform(output, 'restrictAccess="esg-user"')
    project = x.get('DEFAULT', 'project')

    iform(output, "ID=%s" % safe_quote(safe_interpolate(cfg,
                                                'project:' + project,
                                                'dataset_id',
                                                x)))
          
    iform(output, "name=%s" % safe_quote(safe_interpolate(cfg,
                                                  'project:' + project,
                                                  'dataset_name_format',
                                                  x)))

    for name, value in x.items('DEFAULT'):
        iform(output, 'property/')
        iform(output, 'name="%s"' % name)
        iform(output, 'value="%s"' % value, -1)
        
    iform(output, 'metadata/')
    iform(output, 'variables/')
    # pdb.set_trace()

    for v in f.variables.keys():
        iform(output, 'variable/')
        iform(output, 'name="%s"' % safe_getattr(f.variables[v], "id", "name"))

        vname = vocabulary_name(f.variables[v])
        iform(output, 'vocabulary_name="%s"' % vname)
        iform(output, 'units="%s"' % safe_getattr(f.variables[v],
                                                  "units"))
        iform(output, safe_getattr(f.variables[v], "long_name"), -1)
        
    for v in f.axes.keys():
        iform(output, 'variable/')
        iform(output, 'name="%s"' % v)
        vname = vocabulary_name(f.axes[v])
        iform(output, 'vocabulary_name="%s"' % vname)
        iform(output, 'units="%s"' % safe_getattr(f.axes[v], "units"))
        iform(output, safe_getattr(f.axes[v], "long_name"), -1)

    # pdb.set_trace()
    try:
        n = output.name
        if n == '<stdout>':
            rval = None
        else:
            rval = 'file:' + n
    except:
        rval = 'string:' + output.getvalue()

    return rval
Example #14
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:ehi:o:p:", ['dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',\
            'filter=', 'help', 'max-threads=', 'offline', 'output=', 'project=', 'property=', 'read-directories', 'read-files',\
            'service=', 'use-version-dir', 'version='])
    except getopt.error:
        print sys.exc_value
        return

    if len(lastargs)==0:
        print 'No directory specified'
        return

    appendMap = None
    datasetName = None
    datasetTechNotesURL = None
    datasetTechNotesTitle = None
    filefilt = '.*\.nc$'
    init_file = None
    offline = False
    output = sys.stdout
    projectName = None
    properties = {}
    readFiles = False
    service = None
    max_threads = 4
    version_dir = False
    use_version = None
    
    for flag, arg in args:
        if flag=='-a':
            if os.path.exists(arg):
                appendMap = readDatasetMap(arg)
            else:
                appendMap = {}
            output = open(arg, 'a')
        elif flag=='--dataset':
            datasetName = arg
        elif flag=='--dataset-tech-notes':
            datasetTechNotesURL = arg
        elif flag=='--dataset-tech-notes-title':
            datasetTechNotesTitle = arg
        elif flag=='--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='-i':
            init_file = arg
        elif flag=='--max-threads':
            max_threads = int(arg)
        elif flag in ['-o', '--output']:
            output = open(arg, 'w')
        elif flag=='--offline':
            offline = True
        elif flag=='--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag=='--read-files':
            readFiles = True
        elif flag=='--service':
            service = arg
        elif flag=='--use-version-dir':
            version_dir = True
        elif flag=='--version':
            version_dir = True
            if not re.match('^[0-9]+$', arg[0]): # e.g. 'vYYYYMMDD'
                use_version = arg[1:]
            else:
                use_version = arg
    
    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            warning("No project name specified!")
            multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties, datasetName=datasetName, use_version=version_dir)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()

        datasetMapVersion = {}
        if version_dir:
            # check for version directory
            for dataset_id in keys:
                ds_id_version = dataset_id.split('#')
                if len(ds_id_version) == 2:
                    ds_id, ds_version = ds_id_version
                    if not re.match('^[0-9]+$', ds_version):
                        warning("Version must be an integer. Skipping version %s of dataset %s."%(ds_version, ds_id))
                        continue
                    if use_version and ds_version != use_version:
                        continue
                    if ds_id in datasetMapVersion:
                        datasetMapVersion[ds_id].append(ds_version)
                    else:
                        datasetMapVersion[ds_id] = [ds_version]
                else:
                    error("No version directory found. Skipping dataset %s."%dataset_id)

            if datasetMapVersion:
                keys = datasetMapVersion.keys()
                keys.sort()
            else:
                if use_version:
                    error("Version %s not found. No datasets to process."%use_version)
                else:
                    error("No datasets to process.")
                return

        for dataset_id in keys:
            skip_dataset = False
            dataset_id_version = dataset_id
            path_version = None
            # if multiple versions of the same dataset available use latest version
            if version_dir:
                path_version = sorted(datasetMapVersion[dataset_id])[-1]
                if len(datasetMapVersion[dataset_id]) > 1:
                    info("Multiple versions for %s (%s), processing latest (%s)"%(dataset_id, datasetMapVersion[dataset_id], path_version))
                dataset_id_version = '%s#%s'%(dataset_id, path_version)

            direcTuple = datasetMap[dataset_id_version]
            direcTuple.sort()
            mapfile_md = {}

            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet

                    mapfile_md[filepath] = [size]
                    mapfile_md[filepath].append("mod_time=%f"%float(mtime))

                    extraStuff = "mod_time=%f"%float(mtime)

                    if datasetTechNotesURL is not None:
                        mapfile_md[filepath].append('dataset_tech_notes=%s'%datasetTechNotesURL)
                        if datasetTechNotesTitle is not None:
                            mapfile_md[filepath].append('dataset_tech_notes_title=%s'%datasetTechNotesTitle)

            if checksumClient is not None:
                pool = ThreadPool(processes=max_threads)
                args = [(filepath, checksumClient) for filepath in mapfile_md]
                checksum_list = pool.map(calc_checksum_wrapper, args)

                for entry in checksum_list:
                    if not entry[1]:
                        error('Calculation of checksum for file %s failed. Skipping dataset %s ...'%(entry[0], dataset_id))
                        skip_dataset = True     # skip entire dataset if we have one file without checksum
                        break
                    mapfile_md[entry[0]].append('checksum=%s'%entry[1])
                    mapfile_md[entry[0]].append('checksum_type=%s'%checksumType)

            for fpath in mapfile_md:
                mapfile_line = '%s | %s | %d'%(dataset_id_version, fpath, mapfile_md[fpath][0])

                for md in mapfile_md[fpath][1:]:
                    mapfile_line+=' | %s'%md

                # Print the map entry if:
                # - Checksum exists for all files of dataset (in case checksumming is enabled)
                # - The map is being created, not appended, or
                # - The existing map does not have the dataset, or
                # - The existing map has the dataset, but not the file.
                if path_version:
                    ds_id = (dataset_id, int(path_version))
                else:
                    ds_id = (dataset_id, -1)
                if not skip_dataset and ( (appendMap is None) or (not appendMap.has_key(ds_id)) or (( fpath, "%d"% mapfile_md[fpath][0]) not in appendMap[ds_id]) ):
                    print >>output, mapfile_line

    else:                               # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s"%projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s "%listerSection
        commandArgs += " ".join(lastargs)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f"%float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or ((filepath, "%d"%size) not in appendMap[dsetName]):
                print >>output, "%s | %s | %d %s"%(dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
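For illustration, one line of the mapfile this scan would emit, following the '%s | %s | %d' format with the optional metadata columns appended; every value below is a made-up placeholder:

# cmip5.output1.CCCma.CanCM4.decadal2008.mon.atmos.Amon.r1i1p1#20120913 | /esg/data/tas_Amon_CanCM4_decadal2008_r1i1p1_200901-201812.nc | 254892 | mod_time=1354751289.000000 | checksum=<checksum value> | checksum_type=SHA256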
Example #15
def main(argv):

    try:
        args, lastargs = getopt.getopt(
            argv, "", ['config-section=', 'echo', 'help', 'recursive='])
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    configSection = "hsi"
    echo = False
    recurse = True
    for flag, arg in args:
        if flag == '--config-section':
            configSection = arg
        elif flag == '--echo':
            echo = True
        elif flag == '--help':
            print usage
            sys.exit(0)
        elif flag == '--recursive':
            recurse = (arg.lower() == "yes")

    if len(lastargs) == 0:
        print "No directory specified."
        print usage
        sys.exit(0)

    if recurse:
        recurseOption = "R"
    else:
        recurseOption = ""

    config = loadConfig(None)
    command = config.get(configSection, 'hsi')
    path = lastargs[0]
    command_args = "ls -1s%s %s" % (recurseOption, path)

    if echo:
        print '%s %s' % (command, command_args)
        sys.exit(0)

    try:
        errout = subprocess.Popen([command, command_args],
                                  stderr=subprocess.PIPE).stderr
    except:
        raise ESGPublishError(
            "Error running command '%s %s': check configuration option 'hsi'" %
            (command, command_args))
    lines = errout.readlines()
    errout.close()

    printit = False
    for line in lines:
        if printit:
            if line[0] == '*':
                raise Exception("Error accessing %s: %s" % (path, line))
            if line[0] != '-':
                fields = line.split()
                print fields[1], fields[0]
        else:
            printit = (line[0:8] == "Username")
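A hypothetical slice of the hsi listing read from stderr above, and the corresponding output; the parser starts after the line beginning with 'Username' and prints fields[1] then fields[0] of each listing line:

#   Username: someuser  UID: 1234
#   512    /hpss/archive/run1/tas_Amon_SomeModel_expt_r1i1p1.nc
#
# printed by the loop:
#
#   /hpss/archive/run1/tas_Amon_SomeModel_expt_r1i1p1.nc 512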
Example #16
def main(argv):

    try:
        args, lastargs = getopt.getopt(argv, "hi:", [
            'database-delete', 'database-only', 'echo-sql', 'map=',
            'no-republish', 'no-thredds-reinit', 'skip-gateway', 'skip-index',
            'las', 'log=', 'rest-api', 'skip-thredds', 'sync-thredds',
            'use-list='
        ])
    except getopt.error:
        print sys.exc_value
        return

    deleteAll = False
    datasetMap = None
    deleteDset = False
    unpublishOnGateway = False
    echoSql = False
    init_file = None
    gatewayOp = DELETE
    las = False
    log_filename = None
    republish = True
    restApi = None
    thredds = True
    syncThredds = False
    useList = False
    threddsReinit = True
    for flag, arg in args:
        if flag == '--database-delete':
            deleteDset = True
        elif flag == '--database-only':
            gatewayOp = NO_OPERATION
            thredds = False
            deleteDset = True
        elif flag == '--echo-sql':
            echoSql = True
        elif flag in ['-h', '--help']:
            return
        elif flag == '-i':
            init_file = arg
        elif flag == '--map':
            datasetMap = readDatasetMap(arg)
        elif flag == '--skip-gateway':
            gatewayOp = NO_OPERATION
        elif flag == '--skip-index':
            gatewayOp = NO_OPERATION
        elif flag == '--las':
            las = True
        elif flag == '--log':
            log_filename = arg
        elif flag == '--no-republish':
            republish = False
        elif flag == '--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--rest-api':
            restApi = True
        elif flag == '--skip-thredds':
            thredds = False
        elif flag == '--sync-thredds':
            syncThredds = True
        elif flag == '--use-list':
            useList = True
            useListPath = arg

    if gatewayOp != NO_OPERATION and unpublishOnGateway:
        gatewayOp = UNPUBLISH

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'),
                           echo=echoSql,
                           pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    if config is None:
        raise ESGPublishError("No configuration file found.")
    threddsRoot = config.get('DEFAULT', 'thredds_root')

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    if datasetMap is None:
        if not useList:
            datasetNames = [parseDatasetVersionId(item) for item in lastargs]
        else:
            if useListPath == '-':
                namelist = sys.stdin
            else:
                namelist = open(useListPath)
            datasetNames = []
            for line in namelist.readlines():
                versionId = parseDatasetVersionId(line.strip())
                datasetNames.append(versionId)
    else:
        datasetNames = datasetMap.keys()
        datasetNames.sort()
    result = deleteDatasetList(datasetNames,
                               Session,
                               gatewayOp,
                               thredds,
                               las,
                               deleteDset,
                               deleteAll=deleteAll,
                               republish=republish,
                               reinitThredds=threddsReinit,
                               restInterface=restApi)

    # Republish previous versions as needed. This will happen if the latest version
    # was deleted from the database, and is not
    # the only version. In this case the previous version will be rescanned to generate the aggregations.
    if republish:
        statusDict, republishList = result
        if len(republishList) > 0:

            # Register project handlers.
            registerHandlers()

            info("Republishing modified datasets:")
            republishDatasetNames = [
                generateDatasetVersionId(dsetTuple)
                for dsetTuple in republishList
            ]
            dmap, offline = queryDatasetMap(republishDatasetNames, Session)
            datasetNames = dmap.keys()
            datasets = iterateOverDatasets(None,
                                           dmap,
                                           None,
                                           republishList,
                                           Session,
                                           "time",
                                           UPDATE_OP,
                                           None, {},
                                           offline, {},
                                           forceAggregate=True)
            republishOp = (gatewayOp != NO_OPERATION)  # Don't republish if skipping the gateway op
            result = publishDatasetList(datasetNames,
                                        Session,
                                        publish=republishOp,
                                        thredds=thredds)

    # Synchronize database and THREDDS catalogs
    if syncThredds:
        threddsRoot = config.get('DEFAULT', 'thredds_root')

        # Make a dictionary of catalogs from the database
        session = Session()
        subcatalogs = session.query(Catalog).select_from(
            join(Catalog, Dataset,
                 Catalog.dataset_name == Dataset.name)).all()
        catdict = {}
        for catalog in subcatalogs:
            location = os.path.join(threddsRoot, catalog.location)
            catdict[location] = 1
        session.close()

        # Scan all XML files in the threddsroot
        os.path.walk(threddsRoot, cleanupCatalogs, catdict)
def main(argv):

    try:
        args, lastargs = getopt.getopt(argv, "a:cdehi:m:p:ru", ['append', 'create', 'dataset=', 'delete-files', 'echo-sql', 'experiment=', 'filter=', 'help', 'keep-version', 'log=', 'map=', 'message=', 'model=', 'offline',  'parent=', 'per-time', 'per-variable', 'project=', 'property=', 'publish', 'new-version=', 'no-thredds-reinit', 'noscan', 'read-directories', 'read-files', 'rename-files', 'replace', 'replica=', 'rest-api', 'service=', 'set-replica', 'summarize-errors', 'thredds', 'thredds-reinit', 'update', 'use-existing=', 'use-list=', 'validate=', 'version-list=', 'nodbwrite'])
    except getopt.error:
        print sys.exc_value
        return

    aggregateDimension = "time"
    datasetMapfile = None
    datasetName = None
    echoSql = False
    filefilt = '.*\.nc$'
    init_file = None
    initcontext = {}
    keepVersion = False
    las = False
    log_filename = None
    masterGateway = None
    message = None
    offline = False
    parent = None
    perVariable = None
    projectName = None
    properties = {}
    publish = False
    publishOnly = False
    publishOp = CREATE_OP
    readFiles = False
    rescan = False
    rescanDatasetName = []
    restApi = None
    schema = None
    service = None
    summarizeErrors = False
    testProgress1 = testProgress2 = None
    thredds = False
    threddsReinit = None
    version = None
    versionList = None
    nodbwrite = False

    for flag, arg in args:
        if flag=='-a':
            aggregateDimension = arg
        elif flag=='--append':
            publishOp = UPDATE_OP
        elif flag in ['-c', '--create']:
            publishOp = CREATE_OP
        elif flag=='--dataset':
            datasetName = arg
        elif flag in ['-d', '--delete-files']:
            publishOp = DELETE_OP
        elif flag=='--echo-sql':
            echoSql = True
        elif flag=='--experiment':
            initcontext['experiment'] = arg
        elif flag=='--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='-i':
            init_file = arg
        elif flag=='--keep-version':
            keepVersion = True
        elif flag=='--log':
            log_filename = arg
        elif flag=='--map':
            datasetMapfile = arg
        elif flag in ['-m', '--message']:
            message = arg
        elif flag=='--model':
            initcontext['model'] = arg
        elif flag=='--nodbwrite':
            nodbwrite = True
        elif flag=='--new-version':
            try:
                version = string.atoi(arg)
                if version <=0:
                    raise ValueError
            except ValueError:
                raise ESGPublishError("Version number must be a positive integer: %s"%arg)
        elif flag=='--no-thredds-reinit':
            threddsReinit = False
        elif flag=='--noscan':
            publishOnly = True
        elif flag=='--offline':
            offline = True
        elif flag=='--parent':
            parent = arg
        elif flag=='--per-time':
            perVariable = False
        elif flag=='--per-variable':
            perVariable = True
        elif flag=='--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag=='--publish':
            publish = True
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag=='--read-files':
            readFiles = True
        elif flag=='--rename-files':
            publishOp = RENAME_OP
        elif flag in ['-r', '--replace']:
            publishOp = REPLACE_OP
        elif flag=='--replica':
            masterGateway = arg
            warning("The --replica option is deprecated. Use --set-replica instead")
        elif flag=='--rest-api':
            restApi = True
        elif flag=='--service':
            service = arg
        elif flag=='--set-replica':
            masterGateway = 'DEFAULT'
        elif flag=='--summarize-errors':
            summarizeErrors = True
        elif flag=='--thredds':
            thredds = True
        elif flag=='--thredds-reinit':
            threddsReinit = True
        elif flag in ['-u', '--update']:
            publishOp = UPDATE_OP
        elif flag=='--use-existing':
            rescan = True
            rescanDatasetName.append(arg)
        elif flag=='--use-list':
            rescan = True
            if arg=='-':
                namelist=sys.stdin
            else:
                namelist = open(arg)
            for line in namelist.readlines():
                line = line.strip()
                if len(line)>0 and line[0]!='#':
                    rescanDatasetName.append(line)
        elif flag=='--validate':
            schema = arg
            restApi = True
        elif flag=='--version-list':
            versionList = arg

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    if version is not None and versionList is not None:
        raise ESGPublishError("Cannot specify both --new-version and --version-list")

    if versionList is not None:
        version = {}
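        # Assumed layout of the version list file: one "<dataset_id> | <version>"
        # pair per line; the dataset id below is a hypothetical example.
        #   cmip5.output1.CENTER.MODEL.historical.mon.atmos.Amon.r1i1p1 | 20130214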
        f = open(versionList)
        lines = f.readlines()
        f.close()
        for line in lines:
            line = line.strip()
            dsid, vers = line.split('|')
            dsid = dsid.strip()
            vers = int(vers.strip())
            version[dsid] = vers

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
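        # A dataset map file (typically written by esgscan_directory) is assumed to
        # contain one "dataset_id | file_path | size [| extra_field=value ...]" entry
        # per line; readDatasetMap also parses the optional extra fields here.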
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if len(lastargs)==0:
                print "No directories specified."
                return

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=props, datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=props, datasetName=datasetName)
            datasetNames = [(item,-1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
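            # dmap is keyed by (dataset_name, -1) and maps to a list of
            # (path, size-as-string) tuples, e.g. (hypothetical values):
            #   dmap[("cmip5.output1.CENTER.MODEL.expt", -1)] = [("/archive/.../tas.nc", "1048576")]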
            listerSection = getOfflineLister(config, "project:%s"%projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s "%listerSection
            commandArgs += " ".join(lastargs)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
                size, mtime = sizet
                if dmap.has_key((dsetName,-1)):
                    dmap[(dsetName,-1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName,-1)] = [(filepath, str(size))]

            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames)==0:
        warning("No datasets found.")
        min_version = -1
    else:
        min_version = sorted(datasetNames, key=lambda x: x[1])[0][1]

    # Must specify version for replications
    if min_version == -1 and masterGateway is not None and version is None and versionList is None:
        raise ESGPublishError("Must specify version with --new-version (or --version-list) for replicated datasets")
    
    # Iterate over datasets
    if not publishOnly:

#        pdb.set_trace()

        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, publishOp, filefilt, initcontext, offline, properties, keepVersion=keepVersion, newVersion=version, extraFields=extraFields, masterGateway=masterGateway, comment=message, readFiles=readFiles, nodbwrite=nodbwrite)


    if (not nodbwrite):
        result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent, service=service, perVariable=perVariable, reinitThredds=threddsReinit, restInterface=restApi, schema=schema)
    # print `result`

    if summarizeErrors:
        print 'Summary of errors:'
        for name,versionno in datasetNames:
            dset = Dataset.lookup(name, Session)
            print dset.get_name(Session), dset.get_project(Session), dset.get_model(Session), dset.get_experiment(Session), dset.get_run_name(Session)
            if dset.has_warnings(Session):
                print '=== Dataset: %s ==='%dset.name
                for line in dset.get_warnings(Session):
                    print line
Example #18
0
def main(argv):

    global DEFAULT_QUERY_SERVICE
    global DEFAULT_WGET_SERVICE

    try:
        args, lastargs = getopt.getopt(argv, "d:ho:pq:t:v", [
            'count', 'delimiter=', 'facet-query=', 'facets=', 'fields=',
            'format=', 'free-text=', 'help', 'limit=', 'offset=',
            'pretty-print', 'service-url=', 'type=', 'verbose'
        ])
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    # Get the search URL from the publisher configuration if possible
    try:
        from esgcet.config import loadConfig

        config = loadConfig(None)
        DEFAULT_QUERY_SERVICE = config.get("DEFAULT",
                                           "solr_search_service_url",
                                           default=DEFAULT_QUERY_SERVICE)
        DEFAULT_WGET_SERVICE = config.get("DEFAULT",
                                          "solr_wget_service_url",
                                          default=DEFAULT_WGET_SERVICE)
    except:
        pass

    allFacets = False  # facets=*
    countOnly = False
    delim = None
    facets = []
    facetValues = None
    fields = []
    format = DEFAULT_FORMAT
    freetext = None
    includeId = False
    objtype = DATASET
    offset = 0
    outpath = sys.stdout
    outpathIsStdout = True
    prettyPrint = False
    service = DEFAULT_QUERY_SERVICE
    userLimit = MAX_RECORDS
    verbose = False
    wgetScript = False
    wgetService = DEFAULT_WGET_SERVICE
    for flag, arg in args:
        if flag == '--count':
            countOnly = True
        elif flag in ['-d', '--delimiter']:
            delim = arg
            prettyPrint = False
        elif flag == '--facets':
            facetList = arg.split(',')
            facetValues = [item.strip() for item in facetList]
            allFacets = (facetValues[0] == '*')
        elif flag == '--fields':
            fieldList = arg.split(',')
            fields = [item.strip() for item in fieldList]
        elif flag == '--format':
            if arg not in ['narrow', 'wide', 'wget', 'xml']:
                raise RuntimeError("Invalid format: %s" % arg)
            format = arg
            if arg == 'wget':
                wgetScript = True
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag == '--limit':
            try:
                userLimit = int(arg)
            except:
                raise RuntimeError("Invalid limit: %s" % arg)
        elif flag == '-o':
            outpath = open(arg, 'w')
            outpathIsStdout = False
        elif flag in ['-p', '--pretty-print']:
            prettyPrint = True
        elif flag == '--offset':
            offset = int(arg)
        elif flag in ['-q', '--facet-query']:
            queries = arg.split(',')
            for q in queries:
                f, v = q.split('=')
                facets.append((f.strip(), v.strip()))
        elif flag == '--service-url':
            service = arg
            wgetService = arg
        elif flag in ['-t', '--free-text']:
            freetext = arg
        elif flag == '--type':
            try:
                objtype = typeCode[arg]
            except:
                raise RuntimeError("Invalid return type: %s" % arg)
        elif flag in ['-v', '--verbose']:
            verbose = True

    # If a wget script is requested:
    # - Object type is File
    # - fields are url
    # - limit is min(1000, limit)
    if wgetScript:
        objtype = FILE
        fields = ['url']
        userLimit = min(userLimit, WGET_MAX_RECORDS)
        countOnly = False
        allFacets = False
        facetValues = None
        query = formulateQuery(facets,
                               fields,
                               format,
                               freetext,
                               objtype,
                               wgetService,
                               offset,
                               userLimit,
                               facetValues=facetValues)
        if verbose:
            print >> sys.stderr, 'Query: ', query
        downloadResult(query, outpath, outpathIsStdout)
        return

    # XML output
    if format == 'xml':
        userLimit = min(userLimit, DEFAULT_CHUNKSIZE)
        query = formulateQuery(facets,
                               fields,
                               format,
                               freetext,
                               objtype,
                               service,
                               offset,
                               userLimit,
                               facetValues=facetValues)
        if verbose:
            print >> sys.stderr, 'Query: ', query
        downloadResult(query, outpath, outpathIsStdout)
        return

    # If returning id only, use wide format
    if fields == ['id']:
        format = 'wide'
        includeId = True

    # For facet value queries, use wide format.
    if facetValues is not None:
        format = 'wide'

    # While remaining data:
    fullResults = []
    numFound = 0
    moredata = True
    nread = 0
    nleft = userLimit
    chunksize = DEFAULT_CHUNKSIZE
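    # Page through the Solr response: each pass requests at most 'chunksize'
    # records starting at 'offset', until all matches (numFound) or the
    # user-requested limit have been read.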
    while moredata:

        # Formulate a query
        if not (countOnly or facetValues is not None):
            limit = min(nleft, chunksize)
        else:
            limit = 0
        query = formulateQuery(facets,
                               fields,
                               format,
                               freetext,
                               objtype,
                               service,
                               offset,
                               limit,
                               facetValues=facetValues)
        if verbose:
            print >> sys.stderr, 'Query: ', query

        # Read a chunk
        chunk = readChunk(service, query)

        # Parse the response. For facet value searches, parse the response trailer
        if facetValues is None:
            scoreInFields = ('score' in fields)
            results, numFound, numResults = parseResponse(
                chunk, includeId, scoreInFields)
            fullResults.extend(results)
        else:
            numResults = 0
            if allFacets:
                fullResults, numFound = parseHeader(chunk)
            else:
                fullResults, numFound = parseTrailer(chunk, facetValues,
                                                     includeId)

        # More data if some results were found and the number of records read < total
        nread += numResults
        nleft -= limit
        moredata = (numResults > 0) and (nread < min(numFound, userLimit))
        offset += limit

    # Output the results
    if not (countOnly or facetValues is not None):
        outputResults(fullResults,
                      format,
                      prettyPrint=prettyPrint,
                      printHeaders=True,
                      delimiter=delim,
                      out=outpath)
    elif facetValues is not None:
        for valueList, head in zip(fullResults[0], fullResults[1]):
            if allFacets:
                header = (head, )
            else:
                header = (head, 'count')
            outputFacetResults(valueList,
                               header,
                               prettyPrint=prettyPrint,
                               printHeaders=True,
                               delimiter=delim,
                               out=outpath)
    else:
        print numFound

    if not outpathIsStdout:
        outpath.close()
Example #19
0
def main(argv):

    try:
        args, lastargs = getopt.getopt(
            argv, "", ['config-section=', 'echo', 'help', 'recursive='])
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    configSection = "msls"
    echo = False
    recurse = True
    for flag, arg in args:
        if flag == '--config-section':
            configSection = arg
        elif flag == '--echo':
            echo = True
        elif flag == '--help':
            print usage
            sys.exit(0)
        elif flag == '--recursive':
            recurse = (arg.lower() == "yes")

    if len(lastargs) == 0:
        print "No directory specified."
        print usage
        sys.exit(0)

    if recurse:
        recurseOption = "R"
    else:
        recurseOption = ""

    config = loadConfig(None)
    command = config.get(configSection, 'msls')
    path = lastargs[0]
    command_args = "-l%s" % recurseOption

    if echo:
        print '%s %s %s' % (command, command_args, path)
        sys.exit(0)

    try:
        errout = subprocess.Popen([command, command_args, path],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT).stdout
    except:
        raise ESGPublishError(
            "Error running command '%s %s': check configuration option 'msls'"
            % (command, command_args))
    lines = errout.readlines()
    errout.close()

    directory = path
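    # The loop below assumes "ls -l"-style recursive output, e.g. (hypothetical lines):
    #   /archive/run1:
    #   -rw-r--r--  1 user grp  1048576 Jan  1  2015 tas_Amon_MODEL_historical_r1i1p1.nc
    # File entries start with '-' (field 5 is the size in bytes); directory headers
    # start with '/' and end with ':'.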
    for line in lines:

        line = line.strip()

        # Skip blank lines
        if len(line) == 0:
            continue

        # File
        elif line[0] == '-':
            fields = line.split()
            fullpath = os.path.join(directory, fields[-1])
            print fullpath, fields[4]

        # Directory
        elif line[0] == '/' and line[-1] == ':':
            directory = line[:-1]

        # Error
        elif line[0] == '/':
            raise ESGPublishError("Error: %s" % line)

        # Skip
        else:
            continue
Example #20
0
def main():
    """Uses the esg.ini file options:
        - thredds_file_services
              to get a Globus endpoint UUID
        - thredds_root
              to find a directory with THREDDS xml catalogs
    """

    loadConfig(None)
    config = getConfig()
    if config is None:
        raise ESGPublishError('No configuration file found')

    # By default thredds_root is: /esg/content/thredds/esgcet
    thredds_root = config.get('DEFAULT', 'thredds_root')
    thredds_file_services = getThreddsServiceSpecs(config, 'DEFAULT',
                                                   'thredds_file_services')
    # parameters needed to re-harvest the THREDDS catalogs
    thredds_url = config.get('DEFAULT', 'thredds_url')
    hessian_service_certfile = config.get('DEFAULT',
                                          'hessian_service_certfile')
    hessian_service_url = config.get('DEFAULT', 'hessian_service_url')
    esgf_harvesting_service_url = hessian_service_url.replace(
        'remote/secure/client-cert/hessian/publishingService', 'ws/harvest')

    thredds_root_up = os.path.normpath(os.path.join(thredds_root, '..'))
    globus_base = None
    for service in thredds_file_services:
        if service[2] == 'Globus':
            globus_base = service[1]
    if globus_base is None:
        print 'No Globus file service specified in %s\n'\
              'Add Globus file service to the thredds_file_services variable in the form:\n'\
              '        Globus | globus:<UUID_of_Globus_endpoint_pointing_to_your_data_node_GridFTP_server> | Globus | fileservice\n'\
              'A UUID assigned to the endpoint can be found on https://globus.org/' % os.environ['ESGINI']
        sys.exit(1)

    print '\n'\
          'ESGINI: %s\n'\
          'THREDDS root: %s\n'\
          'THREDDS url: %s\n'\
          'Globus service base: %s\n'\
          'ESGF harvesting service url: %s\n'\
          'X.509 user credential: %s\n'\
          '' % (os.environ['ESGINI'], thredds_root, thredds_url, globus_base, esgf_harvesting_service_url, hessian_service_certfile)

    if not globus_base.endswith('/'):
        print 'Globus service base must end with "/". Set Globus service base correctly in\n'\
              '%s and run the script again.' % os.environ['ESGINI']
        sys.exit(1)

    print 'The script recursively goes through xml files in %s\n'\
          'looking for datasets that were published without Globus file service and adds\n'\
          'Globus access to the datasets. If a dataset was published with Globus file\n'\
          'service configured, the script skips such a dataset leaving a corresponding xml\n'\
          'file unmodified. The script reinitializes THREDDS and requests Hessian service\n'\
          'to harvest the updated xml files. Because Hessian service requires SSL\n'\
          'authentication, the X.509 certificate, %s,\n'\
          'should be valid and obtained by a user who has the publisher role in all\n'\
          'projects.\n'\
          'It is strongly advised that you make a copy of the entire %s\n'\
          'directory prior to running this script.' % (thredds_root_up, hessian_service_certfile, thredds_root_up)

    while True:
        sys.stdout.write("Do you want to continue? [y/N]")
        line = sys.stdin.readline().rstrip()
        if line == '' or line == 'n' or line == 'N':
            sys.exit(0)
        if line == 'y' or line == 'Y':
            break

    process(thredds_root, thredds_root_up, globus_base, thredds_url,
            esgf_harvesting_service_url, hessian_service_certfile)
Example #21
0
def main(argv):

	global DB_Dict, TDS_Dict, Ref_XML_Errors, VERBOSE, SOLR_HTTP, rprt_fl, DBNoneValueURL, DB_Dict_Redund

	try:
		opts, args = getopt.getopt(argv, "ahve:m:p:l:", ['all', 'help', 'verbose', 'DT', 'SD', 'ST', 'log=', 'project=',  'model=', 'experiment='])
	except getopt.GetoptError:
		print sys.exc_value
		print usage
		sys.exit(1)
	
	CMP_FLG = 0	 
	Task_Message = "comparison all metadata sources: DB, TDS, Solr" 
	proj_cnstr = None
	model_cnstr = None
	exper_cnstr = None

	ts = time.time()
	st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')	
	fl_stmp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d.%H:%M:%S')	
	rprt_file_name = "meta_synchro." + fl_stmp + ".log"
	constr_categ = ["project","model","experiment"]
	if len(args) > 0:
		print "Wrong argument: ", args[0]
		print usage
		exit(1)

	if len(opts) == 0:
		print usage
		exit(0)
		 
	for opt, arg in opts:
		if opt in ['-a', '--all']:
			proj_cnstr = None
			model_cnstr = None
			exper_cnstr = None
		elif opt in ['--DT']:
			CMP_FLG = 1
			Task_Message = "comparison DB <-> TDS" 
		elif opt in ['--SD']:
			CMP_FLG = 2
			Task_Message = "comparison Solr <-> DB" 
		elif opt in ['--ST']:
			CMP_FLG = 3
			Task_Message = "comparison Solr <-> TDS" 
		elif opt in ['-p', '--project']:
			proj_cnstr = arg
		elif opt in ['-m', '--model']:
			model_cnstr = arg
		elif opt in ['-e', '--experiment']:
			exper_cnstr = arg
		elif opt in ['-l', '--log']:
			rprt_file_name = arg
		elif opt in ['-v', '--verbose']:
			VERBOSE = True
		elif opt in ['-h', '--help']:
			print help_message
			exit(0)
		else:
			print "Wrong option: ", opt 
			print usage
			exit(1)
			
	rprt_fl = open(rprt_file_name,'w')
	rprt_fl.write(st)

	config = loadConfig(init_dir)
	
	engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
	Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)
	session = Session()

	# TDS_Dict = {ds_name:xml_ref_file}
	# DB_Dict = {ds_id:ds_name}
	# Solr_DS_Lst = [ds] 

	# DB_TDS_DIFF_DS = {"dataset_name ds_id":["Y", "N"], "dataset_name xml_rel_path":["N", "Y"]}
	# e.g.: {"cmip5.output1.NOAA-GFDL.GFDL-ESM2M.1pctCO2.mon.atmos.Amon.r1i1p2.v20130214 51" ["N","Y"], 
	#        "cmip5.output1.NOAA-GFDL.GFDL-ESM2M.abrupt4xCO2.day.atmos.day.r1i1p1 5213" ["Y", "N"], ...}
	# DB_TDS_Shared_DS_Dict = {"DB_dataset_id rel_xmlpath" : TDS ref_xml}
	
	DB_DS_URLS = {}
	DB_TDS_DIFF_DS = {}  
	Solr_Only_Files_Dict = {}
	DB_Only_Files_Dict = {}
	TDS_Only_Files_Dict = {}
	
	constr = (proj_cnstr, model_cnstr, exper_cnstr)

	if proj_cnstr is None and model_cnstr is None and exper_cnstr is None:	
		cnstr_msg = "No constraints: all datasets are going to be verified."
	else:
		cnstr_msg = "\nConstraints chosen: "
		for i in range(len(constr)):
			if constr[i] is not None:	
				cnstr_msg = cnstr_msg + "\n\t" + constr_categ[i] +": " + constr[i]
	print st			
	print "\n=================> Started ", Task_Message
	print "report ->", rprt_file_name
	print cnstr_msg	
		
	rprt_fl.write("\n\n"+Task_Message)
	rprt_fl.write("\n"+cnstr_msg+"\n")	

	if CMP_FLG != 3:  # PostgreSQL
		(DB_Dict, DB_Dict_Redund) = getDBDatasetDict(session, constr)
		print "\n=================> DB Dictionary (size= " + str(len(DB_Dict)) +")"
		if VERBOSE:
			print "\n".join((str(k) + " : " + DB_Dict[k]) for k in DB_Dict.keys())		
		rprt_fl.write("\n\n=================> DB Datasets (size= " + str(len(DB_Dict)) +")\n")
		if VERBOSE:
			rprt_fl.write( "\n".join((str(k) + " : " + DB_Dict[k]) for k in DB_Dict.keys()) )
	
		if len(DB_Dict_Redund)>0:
			rprt_fl.write("=====> DB table 'dataset' contains records (" + str(len(DB_Dict_Redund)) +\
						  ") with different ids but the same dataset name/version: =====")
			rprt_fl.write( "\n".join( (k + " " + DB_Dict_Redund[k]) for k in DB_Dict_Redund.keys() ) ) 
			
	if CMP_FLG != 2:   # THREDDS
		(TDS_Dict, TDS_Dict_Redund, TDS_Dict_Broken) = getTDSDatasetDict(config, constr)
		print "\n=================> TDS Dictionary (size= " + str(len(TDS_Dict)) +")"		
		rprt_fl.write("\n\n=================> TDS Datasets (size= " + str(len(TDS_Dict)) +")\n")
		if VERBOSE:
			print "\n".join((k + " : " + TDS_Dict[k]) for k in TDS_Dict.keys())		
			rprt_fl.write( "\n".join((k + " : " + TDS_Dict[k]) for k in TDS_Dict.keys()) )
		if len(TDS_Dict_Redund)>1:
			rprt_fl.write("=====> TDS_Dict_Redund: (the same dataset names are listed multple times in main TDS catalog: =====>\n") 
			rprt_fl.write("\n".join( (k + " : " + TDS_Dict_Redund[k]) for k in TDS_Dict_Redund.keys() ) )
		if len(TDS_Dict_Broken)>0:
			rprt_fl.write("=====> TDS_dataset names do not correspond to reference catalog xmls in main catalog. There are " + str (len(TDS_Dict_Broken)) + " records: =====>\n") 
			rprt_fl.write("\n".join( (k + " " + TDS_Dict_Broken[k]) for k in TDS_Dict_Broken.keys() ) ) 

	if CMP_FLG != 1:   # Solr
		(Solr_MultiVersion_DS, Solr_DS_Lst) = getSolrDatasetList(constr)
		print "\n=================> Solr Dataset List (size= " + str(len(Solr_DS_Lst)) + ")"
		if VERBOSE:
			print "\n".join(ds for ds in Solr_DS_Lst)		
		rprt_fl.write("\n\n=================> Solr Datasets (size= " + str(len(Solr_DS_Lst)) + ")\n")
		if VERBOSE:
			rprt_fl.write("\n".join(ds for ds in Solr_DS_Lst))
		if len(Solr_MultiVersion_DS) > 0:
			rprt_fl.write(" \n\n =======> These Datasets are reperesented in Solr in multiversions (only last version is used in comparison): =====>\n")
			rprt_fl.write("\n".join( (ds + " : [" + ", ".join(v for v in Solr_MultiVersion_DS[ds]) + "]") for ds in Solr_MultiVersion_DS.keys()))

	if (CMP_FLG == 1 or CMP_FLG == 0) and (len(DB_Dict)>0 or len(TDS_Dict)>0):
		DB_THREDDS_Comparison(DB_Dict,TDS_Dict, config, session)
	if (CMP_FLG == 2 or CMP_FLG == 0) and (len(DB_Dict)>0 or len(Solr_DS_Lst)>0):
		SOLR_DB_Comparison(Solr_DS_Lst, DB_Dict, SOLR_HTTP, session)	
	if (CMP_FLG == 3 or CMP_FLG == 0) and (len(TDS_Dict)>0 or len(Solr_DS_Lst)>0):
		SOLR_TDS_Comparison(Solr_DS_Lst, TDS_Dict, SOLR_HTTP, config)
		
	ts = time.time()
	st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
	print "\n",st
	rprt_fl.write("\n"+st)
	rprt_fl.close()
	session.close()
Example #22
0
def main(argv):

    global DEFAULT_QUERY_SERVICE
    global DEFAULT_WGET_SERVICE

    try:
        args, lastargs = getopt.getopt(argv, "d:ho:pq:t:v", ['count', 'delimiter=', 'facet-query=', 'facets=', 'fields=', 'format=', 'free-text=', 'help', 'limit=', 'offset=', 'pretty-print', 'service-url=', 'type=', 'verbose'])
    except getopt.error:
        print sys.exc_value
        print usage
        sys.exit(0)

    # Get the search URL from the publisher configuration if possible
    try:
        from esgcet.config import loadConfig

        config = loadConfig(None)
        DEFAULT_QUERY_SERVICE = config.get("DEFAULT", "solr_search_service_url", default=DEFAULT_QUERY_SERVICE)
        DEFAULT_WGET_SERVICE = config.get("DEFAULT", "solr_wget_service_url", default=DEFAULT_WGET_SERVICE)
    except:
        pass

    allFacets = False                   # facets=*
    countOnly = False
    delim = None
    facets = []
    facetValues = None
    fields = []
    format = DEFAULT_FORMAT
    freetext = None
    includeId = False
    objtype = DATASET
    offset = 0
    outpath = sys.stdout
    outpathIsStdout = True
    prettyPrint = False
    service = DEFAULT_QUERY_SERVICE
    userLimit = MAX_RECORDS
    verbose = False
    wgetScript = False
    wgetService = DEFAULT_WGET_SERVICE
    for flag, arg in args:
        if flag=='--count':
            countOnly = True
        elif flag in ['-d', '--delimiter']:
            delim = arg
            prettyPrint = False
        elif flag=='--facets':
            facetList = arg.split(',')
            facetValues = [item.strip() for item in facetList]
            allFacets = (facetValues[0]=='*')
        elif flag=='--fields':
            fieldList = arg.split(',')
            fields = [item.strip() for item in fieldList]
        elif flag=='--format':
            if arg not in ['narrow', 'wide', 'wget', 'xml']:
                raise RuntimeError("Invalid format: %s"%arg)
            format = arg
            if arg=='wget':
                wgetScript = True
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='--limit':
            try:
                userLimit = int(arg)
            except:
                raise RuntimeError("Invalid limit: %s"%arg)
        elif flag=='-o':
            outpath = open(arg, 'w')
            outpathIsStdout = False
        elif flag in ['-p', '--pretty-print']:
            prettyPrint = True
        elif flag=='--offset':
            offset = int(arg)
        elif flag in ['-q', '--facet-query']:
            queries = arg.split(',')
            for q in queries:
                f,v = q.split('=')
                facets.append((f.strip(), v.strip()))
        elif flag=='--service-url':
            service = arg
            wgetService = arg
        elif flag in ['-t', '--free-text']:
            freetext = arg
        elif flag=='--type':
            try:
                objtype = typeCode[arg]
            except:
                raise RuntimeError("Invalid return type: %s"%arg)
        elif flag in ['-v', '--verbose']:
            verbose = True

    # If a wget script is requested:
    # - Object type is File
    # - fields are url
    # - limit is min(1000, limit)
    if wgetScript:
        objtype = FILE
        fields = ['url']
        userLimit = min(userLimit, WGET_MAX_RECORDS)
        countOnly = False
        allFacets = False
        facetValues = None
        query = formulateQuery(facets, fields, format, freetext, objtype, wgetService, offset, userLimit, facetValues=facetValues)
        if verbose:
            print >>sys.stderr, 'Query: ', query
        downloadResult(query, outpath, outpathIsStdout)
        return

    # XML output
    if format=='xml':
        userLimit = min(userLimit, DEFAULT_CHUNKSIZE)
        query = formulateQuery(facets, fields, format, freetext, objtype, service, offset, userLimit, facetValues=facetValues)
        if verbose:
            print >>sys.stderr, 'Query: ', query
        downloadResult(query, outpath, outpathIsStdout)
        return

    # If returning id only, use wide format
    if fields==['id']:
        format = 'wide'
        includeId = True

    # For facet value queries, use wide format.
    if facetValues is not None:
        format = 'wide'

    # While remaining data:
    fullResults = []
    numFound = 0
    moredata = True
    nread = 0
    nleft = userLimit
    chunksize = DEFAULT_CHUNKSIZE
    while moredata:

        # Formulate a query
        if not (countOnly or facetValues is not None):
            limit = min(nleft, chunksize)
        else:
            limit = 0
        query = formulateQuery(facets, fields, format, freetext, objtype, service, offset, limit, facetValues=facetValues)
        if verbose:
            print >>sys.stderr, 'Query: ', query

        # Read a chunk
        chunk = readChunk(service, query)

        # Parse the response. For facet value searches, parse the response trailer
        if facetValues is None:
            scoreInFields = ('score' in fields)
            results, numFound, numResults = parseResponse(chunk, includeId, scoreInFields)
            fullResults.extend(results)
        else:
            numResults = 0
            if allFacets:
                fullResults, numFound = parseHeader(chunk)
            else:
                fullResults, numFound = parseTrailer(chunk, facetValues, includeId)

        # More data if some results were found and the number of records read < total
        nread += numResults
        nleft -= limit
        moredata = (numResults>0) and (nread<min(numFound, userLimit))
        offset += limit

    # Output the results
    if not (countOnly or facetValues is not None):
        outputResults(fullResults, format, prettyPrint=prettyPrint, printHeaders=True, delimiter=delim, out=outpath)
    elif facetValues is not None:
        for valueList, head in zip(fullResults[0], fullResults[1]):
            if allFacets:
                header = (head,)
            else:
                header=(head,'count')
            outputFacetResults(valueList, header, prettyPrint=prettyPrint, printHeaders=True, delimiter=delim, out=outpath)
    else:
        print numFound

    if not outpathIsStdout:
        outpath.close()