Example #1
def updateCollections(gw_name):
    import gateway
    gw_server = gateway.getGatewayInfo()[gw_name]['server']
    
    db = getSession()
    datasets = {}
    for col in db.query(Collection).filter(Collection.state=='published').filter(Collection.gateway==gw_server):
        for dataset in gateway.main(('-g %s --parent %s -do' % (col.gateway, col.id)).split()):
            datasets[dataset['id']] = col.id
    
    #search for orphans of this gateway
    for dataset in db.query(Dataset).filter(Dataset.parent_gateway==gw_server).filter(Dataset.parent_id==None):
        if dataset.id in datasets:
            dataset.parent_id = datasets[dataset.id]
        else:
            #perhaps the top level collection was retracted!
            print "Dataset has no parent", dataset.id
    
    db.commit()
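A minimal driver sketch for the function above, assuming updateCollections lives in a module importable as harvester_example (the module name is a placeholder) and that gateway.getGatewayInfo() returns a dict keyed by gateway name, as in Example #2 below:

#Hypothetical driver: re-link orphaned datasets on every known gateway.
#The module name harvester_example is an assumption, not part of the original code.
import gateway
import harvester_example

def update_all_collections():
    for gw_name in gateway.getGatewayInfo():
        harvester_example.updateCollections(gw_name)

if __name__ == '__main__':
    update_all_collections()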
Example #2
def __update(db):
    import gateway
    db.open()

    
    gateways = gateway.getGatewayInfo()
    
    known_active = ['WDCC', 'PCMDI', 'BADC', 'NCI', 'NCAR']

    #only reparse datasets once in a while
    import datetime
    timestamp = "modtime > '{0}'".format(datetime.datetime.now() - datetime.timedelta(days=30))
    global log

    def handle_datasets(datasets, gateway):
        #only process if not empty
        if not datasets: return datasets

        total = db._session.query(DatasetDAO).filter(DatasetDAO.parent==datasets[0]['parent']).count()
        if len(datasets) < total:
            #Some datasets are not available anymore, we should scan everything again
            log.error('Datasets were deleted for parent %s, please delete them from the db.', datasets[0]['parent'])

        known = set([ds[0] for ds in db._session.query(DatasetDAO.dataset_id).filter(DatasetDAO.gateway==gateway['server']).filter(timestamp)])
        new_datasets = []
        
        #get datasets not already checked within a given period
        for dataset in datasets:
            if dataset['id'] not in known: new_datasets.append(dataset)

        log.info('Processing %s new datasets (skipping %s).', len(new_datasets), len(datasets) - len(new_datasets))
        return new_datasets
            
    global __to_ingest
    __to_ingest = []

    def handle_result(dataset, files, gateway):  
        global __to_ingest
        #store size in MB only
        dataset_size = sum([int(f['size']) for f in files]) >> 20
        files_count = len(files)

        #we assume all files in dataset have the same access points
        for endpoint in files[0]['endpoints']:                 
            dao = DatasetDAO( dataset, endpoint, gateway, size=dataset_size, files_count=files_count)
            dao.markAsUpdated()
            __to_ingest.append(dao)

        #update if batch's full        
        if len(__to_ingest) > 25:
            db.addAll(__to_ingest, overwrite=True)
            __to_ingest = []


    for gw_name in gateways:
        #only parse gateways that are known to be active...
        if gw_name not in known_active: continue

        gw_data = gateways[gw_name]
        log.info('Processing %s', gw_name)


        __to_ingest=[]
        
        try: 
            gateway.getCurrentDatasets(gw_name, filter_datasets=handle_datasets, callback_result=handle_result, continue_on_errors=True)
    
            #make sure we don't leave any last ones
            db.addAll(__to_ingest, overwrite=True)
        except:
            import sys
            log.error('There was an error contacting gateway %s: %s',gw_name, str(sys.exc_info()[:3]))
            raise

    db._session.commit()
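The callback contract used by gateway.getCurrentDatasets above can be shown with a much smaller, hypothetical caller: filter_datasets receives the list of dataset dicts plus the gateway record and returns the subset to fetch, while callback_result is invoked once per dataset with its file list. A minimal sketch, assuming only the gateway module from these examples:

#Hypothetical simplified caller for the same callback-based API used by __update above:
#keep every dataset and tally per-dataset file counts as results arrive.
import gateway

def count_gateway_files(gw_name):
    totals = {}

    def keep_all(datasets, gw):
        #same signature as handle_datasets above; no filtering here
        return datasets

    def tally(dataset, files, gw):
        #same signature as handle_result above
        totals[dataset['id']] = len(files)

    gateway.getCurrentDatasets(gw_name, filter_datasets=keep_all,
                               callback_result=tally, continue_on_errors=True)
    return totals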
Example #3
def processGateway(gw_name):
    import urllib2, re, xml.dom.minidom, gateway
    db = getSession()
    gw_url = gateway.getGatewayInfo()[gw_name]['url']
    gw_server = gateway.getGatewayInfo()[gw_name]['server']
    
    hlog.debug("jfp gw_url=%s, gw_server=%s",gw_url,gw_server)
    try:
        cmip5_id = _getCMIP5Id(gw_url)
    except:
        if gw_name == 'NCI':
            hlog.warn("_getCMIP5Id failed; but recognize gateway and will use 'cmip5'")
            cmip5_id = 'cmip5'
        else:
            print 'No CMIP5 found for Gateway %s. Check manually.(3)' % gw_name
            return
    hlog.debug("jfp cmip5_id=%s",cmip5_id)
     
    #get all toplevel collections from gateway
    gw_top = {}
    hlog.debug("jfp in processGateway, will call gateway.main with args %s",
               ('-g %s -co' % gw_name).split())
    collections = gateway.main(('-g %s -co' % gw_name).split())
    if collections==None: collections=[]
    if collections==2:   # quick fix; gateway.main should throw an exception instead
        collections=[]
    for tlc in collections:
        gw_top[tlc['id']] = tlc
    
    #get already known collections
    db_col = {}
    for col in db.query(Collection).filter(Collection.gateway==gw_server).all():
        #within the gateway these are unique
        db_col[col.id] = col
    hlog.debug("jfp db_col=%s",db_col)
    
    db_ds = {}
    for ds in getLatestDatasetsQuery(gw_server).filter(Dataset.parent_gateway==gw_server).all():
        db_ds[ds.id] = ds
    
    #now get all CMIP5 datasets
    page_url = '%s/project/%s/dataset/feed.atom' % (gw_url, cmip5_id)
    hlog.debug("jfp in processGateway, about to open %s",page_url)
    # jfp 2012.09.11 This url is unavailable at NCI; the harvest will always fail.
    #    if page_url.find('nci.org.au')>-1:  # jfp unreliable site, try the url twice (doesn't help)
    #        try:
    #            print "first attempt..."
    #            page = urllib2.urlopen(page_url,None,120).read()
    #        except Exception as e:
    #            print "...failed: ",e
    #        print "second attempt..."
    try:
        page = urllib2.urlopen(page_url).read()
    except Exception as e:
        print "exception opening %s in processGateway: %s" % (page_url,e)
        raise e
    dom = xml.dom.minidom.parseString(page)
    counter = 0 #commit after a bunch
    existing_ds = {}
    hlog.debug("jfp %s dom entries",len(dom.getElementsByTagName('entry')))
    for entry in dom.getElementsByTagName('entry'):
        id = entry.getElementsByTagName('title')[0].childNodes[0].data
        timestamp = entry.getElementsByTagName('updated')[0].childNodes[0].data
        last_update = datetime.datetime(*time.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")[:6])
        #top level and normal are mixed!
        if id in gw_top: 
            #this is a top level for cmip5!
            print "Top level found", id 
            if id in db_col:
                #update
                col = db_col[id]
                if last_update > col.modtime:
                    #we know this collection was modified! (not that we care now...)
                    print "Collection modified! was %s now is %s" % (col.modtime, last_update)
                    col.modtime = last_update
            else:
                #add new collection
                metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split())
                if metadata==None: continue
                db.add(Collection(gateway=gw_server,id=id,modtime=last_update,state=metadata['state']))
            continue

        #remember this dataset for later
        existing_ds[id] = True

        if id in db_ds:
            #we know this normal dataset! Check if it has changed
            if  db_ds[id].modtime == last_update:
                #old news...
                hlog.debug("Unchanged dataset %s, modtime=%s",id,last_update)
                continue
            print "Changed dataset found", id, db_ds[id].modtime, last_update 
            hlog.info( "Changed dataset found %s %s %s", id, db_ds[id].modtime, last_update )
            #something got changed!
            old_ds = db_ds[id]
            old_ds.modtime = last_update
        else:
            print "New dataset found", id, " on ", time.ctime()
            hlog.info( "New dataset found %s on %s" %(id,time.ctime()) )
            old_ds = None
        #new dataset version or something changed!
        metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split())
        if metadata==None or metadata==2: continue
        hlog.debug("version %s",metadata['version'])
         
        #version work around
        if metadata['state'] == 'retracted':
            print "retracted dataset"
            #this got retracted!
            if old_ds and old_ds.state != metadata['state']:
                #state changed!
                old_ds.state = metadata['state']
            continue
        if not metadata['catalog'] or not metadata['version']:
            print "Can't parse this, no catalog or version!!", metadata
            continue
            
        files = gateway.main(('-g %s --parent %s -fo' % (gw_name, id)).split())
        if files==None: files=[]
        filecount = len(files)
        if filecount > 0:
            size = sum([int(f['size']) for f in files])
            #we assume this is per dataset defined, and not per file
            ep = files[filecount/2]['endpoints']
            if ep:
                types = [e['type'] for e in ep]
            else:
                types = []
        else:
            #empty dataset?! There are some...
            size = 0
            types = []

        if old_ds and int(metadata['version']) == old_ds.version:
            print "Same version was updated!!"

            to_check_update = [('access_http', 'HTTPServer' in types),
                ('access_gridftp', 'GridFTP' in types), ('access_opendap', 'OPeNDAP' in types),
                ('filecount', filecount), ('size', size)]
            report = ""
            for var, value in to_check_update:
                old_value = old_ds.__dict__[var]
                if old_value != value:
                    #report and update
                    report += "Changed %s from %s to %s, " % (var, old_value, value)
                    old_ds.__dict__[var] = value
            if report:
                hlog.info(report)
            continue    #Use old_ds instead of creating a new one.
        elif old_ds:
            #new version
            print "New version found %s, last one was %s; on %s" %\
               (metadata['version'], old_ds.version, time.ctime())
            hlog.info( "New version found %s, last one was %s; on %s" %\
               (metadata['version'], old_ds.version, time.ctime()) )
        
        #Definitely a new version of either an existing dataset or a new one.
        try:  #jfp added try/except
            db.add(Dataset(id=id, version=int(metadata['version']), catalog=metadata['catalog'],\
                           state=metadata['state'], filecount=filecount, size=size, access_http=\
                           ('HTTPServer' in types), access_gridftp=('GridFTP' in types),\
                           access_opendap=('OPeNDAP' in types), modtime=last_update, parent_gateway=gw_server))
            db.flush()  # jfp will slow things down, but we'll catch problems right away
        except sqlalchemy.exc.IntegrityError:   #jfp added try/except
            print "exception adding dataset id=",id," version=",metadata['version']," state=",metadata['state']
            print "catalog=",metadata['catalog']," modtime=",last_update," parent_gateway=",gw_server
            print "access_http=",('HTTPServer' in types)," access_gridftp=",('GridFTP' in types)
            print sys.exc_info()[:2]
            db.rollback()  # jfp mandatory after a failed flush!
            # raise  #jfp Now should be able to continue with other datasets.
        if counter > 20:
            #db.commit()
            counter = 0
        else:
            counter += 1
    #db.commit()

    #Now we must find missing ones, so we delete them properly
    for col in db_col.values():
        for dataset in col.datasets:
            if not dataset.id in existing_ds:
                if dataset.state == 'published':
                    dataset.state = 'retracted'
                print "dataset %s was removed by %s" % (dataset.id,time.ctime())
                hlog.info( "dataset %s was removed by %s" %(dataset.id,time.ctime()) )
    #db.commit()
    # print "jfp finished with loop over db_col.values()"
    datasets = {}
    for col in db.query(Collection).filter(Collection.state=='published').filter(Collection.gateway==gw_server):
        gdatasets = gateway.main(('-g %s --parent %s -do' % (col.gateway, col.id)).split())
        if gdatasets==None: gdatasets=[]
        for dataset in gdatasets:
            datasets[dataset['id']] = col.id
    
    for d in db.new:
        if d.id in datasets:
            d.parent_id = datasets[d.id]
        else:
            print "problem with", d
    db.commit()
    print "jfp committed"
Example #4
def processGatewayOld(gw_name, fast=True):
    """Old method for harvesting gateways"""
    import urllib2, re, xml, gateway
    db = getSession()
    gw_url = gateway.getGatewayInfo()[gw_name]['url']
    gw_server = gateway.getGatewayInfo()[gw_name]['server']
    #skip these
    skip_top_level = []

    try:
        cmip5_id = _getCMIP5Id(gw_url)
    except:
        print 'No CMIP5 found for Gateway %s. Check manually.(2)' % gw_name
        hlog.warn( 'No CMIP5 found for Gateway %s. Check manually.(2)' % gw_name )
        return
    
    #get already known collections
    db_col = {}
    for col in db.query(Collection).filter(Collection.gateway==gw_server).all():
        #within the gateway these are unique
        db_col[col.id] = col

    #now get known datasets
    db_ds = {}
    for ds in getLatestDatasetsQuery(gw_server).filter(Dataset.parent_gateway==gw_server).all():
        db_ds['%s#%s' % (ds.id, ds.version)] = ds

    counter = 0
    for col in _getCMIP5Collections(gw_url, cmip5_id):
        hlog.info( "Processing Collection %s on %s" %(col,time.ctime()) )
        if col in skip_top_level:
            print "Skipping"
            hlog.info( "Skipping, time is %s" % (time.ctime()) )
            continue

        if not col in db_col:
            #new collection!
            hlog.info("New collection %s on %s" % (col,time.ctime()))
            md = gateway.main(('-g %s --parent %s -mo' % (gw_name, col)).split())
            if md==None: continue
            #use a placeholder date for the update so we know later on which
            #ones still need to be updated
            db.add(Collection(id=col, gateway=gw_server,state=md['state'],
                modtime=dummy_date))

        existing_ds = {}
        datasets = gateway.main(('-g %s --parent %s -do' % (gw_name,col)).split())
        if datasets==None: datasets=[]
        for dataset in datasets:
            ds_key = '%s#%s' % (dataset['id'], dataset['version'])

            #store for later
            existing_ds[ds_key] = True

            if ds_key in db_ds:
                old_ds = db_ds[ds_key]
                #should we update? (for now don't...)
                #if int(dataset['version']) == old_ds.version:
                    #same version... we might want to check... but in the common case this won't be necessary
                    #and is extremely expensive for this old way of getting things
                #    continue
            else:
                old_ds = None

            #Avoid reparsing already parsed datasets. They might change (e.g. they can be
            #retracted), so they should still be re-parsed once in a while.
            if fast and old_ds: continue

            print "Processing dataset", ds_key, " on ", time.ctime()
	    hlog.info( "Processing dataset %s on %s" %(ds_key,time.ctime()) )
            metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, dataset['id'])).split())
            if not metadata:
                continue
            #version work around 
            if metadata['state'] == 'retracted':
                print "retracted dataset"
                hlog.info( "retracted dataset" )
                #this got retracted!
                if old_ds and old_ds.state != metadata['state']:
                    #state changed!
                    old_ds.state = metadata['state']
                continue
            if not metadata['catalog'] or not metadata['version']:
                print "Can't parse this, no catalog or version!!", metadata
                hlog.info( "Can't parse this, no catalog or version!! %s" %s (metadata) )
                continue


            #this is new!
            files = gateway.main(('-g %s --parent %s -fo' % (gw_name,dataset['id'])).split())
            if files==None: files=[]
            filecount = len(files)
            if filecount > 0:
                size = sum([int(f['size']) for f in files])
                #we assume this is per dataset defined, and not per file
                # use some file in the middle for this
                ep = files[filecount/2]['endpoints']
                if ep:
                    types = [e['type'] for e in ep]
                else:
                    types = []
            else:
                #empty dataset?! There are some...
                size = 0
                types = []
            if old_ds:
                #we will need to update the existing one
                old_ds.access_http=('HTTPServer' in types)
                old_ds.access_gridftp=('GridFTP' in types)
                old_ds.access_opendap=('OPeNDAP'in types)
                 
            else:
                db.add(Dataset(id=dataset['id'], version=int(metadata['version']), catalog=metadata['catalog'], 
                state=metadata['state'], filecount=filecount, size=size, access_http=('HTTPServer' in types),
                access_gridftp=('GridFTP' in types), access_opendap=('OPeNDAP' in types), 
                modtime=dummy_date, parent_gateway=gw_server, parent_id=col))
            if counter > 20:
                db.commit()
                counter = 0
            else:
                counter += 1
            # db.commit() #jfp temporary extra commit, to aid debugging


        if col in db_col:
            print col, len(db_col[col].datasets), len(existing_ds)
            hlog.info( "collection,lengths: %s, %s, %s on %s" %\
                       ( col, len(db_col[col].datasets), len(existing_ds), time.ctime() ) )
            for dataset in db_col[col].datasets:
                ds_key = '%s#%s' % (dataset.id, dataset.version)
                if not ds_key in existing_ds:
                    print "dataset %s was deleted" % ds_key
                    hlog.info( "dataset %s was deleted" % ds_key )
                    db.delete(dataset)
                    #if dataset.state == 'published':
                        #dataset.state = 'retracted'

    #commit the rest of the changes
    db.commit()
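The fast flag above decides whether datasets already stored in the database are re-parsed; a periodic full re-scan would therefore call the function with fast=False. A minimal sketch, assuming it runs in the same module as processGatewayOld and reusing the known-active gateway names from Example #2:

#Hypothetical periodic full re-scan with the old harvester: fast=False disables the
#"skip already-known datasets" shortcut so retractions and state changes are re-checked.
import gateway

def full_rescan_old(active=('WDCC', 'PCMDI', 'BADC', 'NCI', 'NCAR')):
    for gw_name in gateway.getGatewayInfo():
        if gw_name in active:
            processGatewayOld(gw_name, fast=False)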
Example #5
def processGateway(gw_name):
    import urllib2, re, xml.dom.minidom, gateway
    db = getSession()
    gw_url = gateway.getGatewayInfo()[gw_name]['url']
    gw_server = gateway.getGatewayInfo()[gw_name]['server']
    
    try:
        cmip5_id = _getCMIP5Id(gw_url)
    except:
        print 'No CMIP5 found for Gateway %s. Check manually.' % gw_name
        return
     
    #get all toplevel collections from gateway
    gw_top = {}
    for tlc in gateway.main(('-g %s -co' % gw_name).split()):
        gw_top[tlc['id']] = tlc
    
    #get already known collections
    db_col = {}
    for col in db.query(Collection).filter(Collection.gateway==gw_server).all():
        #within the gateway these are unique
        db_col[col.id] = col
    
    db_ds = {}
    for ds in getLatestDatasetsQuery(gw_server).filter(Dataset.parent_gateway==gw_server).all():
        db_ds[ds.id] = ds
    
    #now get all CMIP5 datasets
    page = urllib2.urlopen('%s/project/%s/dataset/feed.atom' % (gw_url, cmip5_id)).read()
    dom = xml.dom.minidom.parseString(page)
    counter = 0 #commit after a bunch
    existing_ds = {}
    for entry in dom.getElementsByTagName('entry'):
        id = entry.getElementsByTagName('title')[0].childNodes[0].data
        timestamp = entry.getElementsByTagName('updated')[0].childNodes[0].data
        last_update = datetime.datetime(*time.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")[:6])
        #top level and normal are mixed!
        if id in gw_top: 
            #this is a top level for cmip5!
            print "Top level found", id 
            if id in db_col:
                #update
                col = db_col[id]
                if last_update > col.modtime:
                    #we know this collection was modified! (not that we care now...)
                    print "Collection modified! was %s now is %s" % (col.modtime, last_update)
                    col.modtime = last_update
            else:
                #add new collection
                metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split())
                db.add(Collection(gateway=gw_server,id=id,modtime=last_update,state=metadata['state']))
            continue

        #remember this dataset for later
        existing_ds[id] = True

        if id in db_ds:
            #we know this normal dataset! Check if it has changed
            if  db_ds[id].modtime == last_update:
                #old news...
                continue
            print "Changed dataset found", id, db_ds[id].modtime, last_update 
            #something got changed!
            old_ds = db_ds[id]
        else:
            print "New dataset found", id
            old_ds = None
        #new dataset version or something changed!
        metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split())
            
         
        #version work around
        if metadata['state'] == 'retracted':
            print "retracted dataset"
            #this got retracted!
            if old_ds and old_ds.state != metadata['state']:
                #state changed!
                old_ds.state = metadata['state']
            continue
        if not metadata['catalog'] or not metadata['version']:
            print "Can't parse this, no catalog or version!!", metadata
            continue
            
        files = gateway.main(('-g %s --parent %s -fo' % (gw_name, id)).split())
        filecount = len(files)
        if filecount > 0:
            size = sum([int(f['size']) for f in files])
            #we assume this is per dataset defined, and not per file
            ep = files[filecount/2]['endpoints']
            if ep:
                types = [e['type'] for e in ep]
            else:
                types = []
        else:
            #empty dataset?! There are some...
            size = 0
            types = []
        if old_ds and int(metadata['version']) == old_ds.version:
            print "Same version was updated!!"

            to_check_update = [('access_http', 'HTTPServer' in types),
                ('access_gridftp', 'GridFTP' in types), ('access_opendap', 'OPeNDAP' in types),
                ('filecount', filecount), ('size', size)]
            report = ""
            for var, value in to_check_update:
                old_value = old_ds.__dict__[var]
                if old_value != value:
                    #report and update
                    report += "Changed %s from %s to %s, " % (var, old_value, value)
                    old_ds.__dict__[var] = value
            if report:
                print report
            continue    #Use old_ds instead of creating a new one.
        elif old_ds:
            #new version
            print "New version found %s, last one was %s" %  (metadata['version'], old_ds.version)
        
        #Definitely a new version of either an existing dataset or a new one.
        db.add(Dataset(id=id, version=int(metadata['version']), catalog=metadata['catalog'], state=metadata['state'],
                filecount=filecount, size=size, access_http=('HTTPServer' in types), access_gridftp=('GridFTP' in types),
                access_opendap=('OPeNDAP' in types), modtime=last_update, parent_gateway=gw_server))
        if counter > 20:
            #db.commit()
            counter = 0
        else:
            counter += 1
    #db.commit()
    
    #Now we must find missing ones, so we delete them properly
    for col in db_col.values():
        for dataset in col.datasets:
            if not dataset.id in existing_ds:
                if dataset.state == 'published':
                    dataset.state = 'retracted'
                print "dataset %s was removed" % dataset.id
    #db.commit()
    datasets = {}
    for col in db.query(Collection).filter(Collection.state=='published').filter(Collection.gateway==gw_server):
        for dataset in gateway.main(('-g %s --parent %s -do' % (col.gateway, col.id)).split()):
            datasets[dataset['id']] = col.id
    
    for d in db.new:
        if d.id in datasets:
            d.parent_id = datasets[d.id]
        else:
            print "problem with", d
    db.commit()