def test_main(self):
    loop_stub = Mock()
    router_stub = Mock()
    radio_stub = Mock()
    gateway.Router.return_value = router_stub
    gateway.Radio.return_value = radio_stub
    gateway.Radio.send_packet = {}
    gateway.asyncio.get_event_loop.return_value = loop_stub
    router_stub.connect_to_message_queue.return_value = 'Future'

    gateway.main()

    gateway.initialize_gpio.assert_called_once_with()
    gateway.Radio.assert_called_once_with()
    gateway.Router.assert_called_once_with()
    loop_stub.run_until_complete.assert_called_once_with('Future')
    router_stub.set_send_packet.assert_called_once_with(radio_stub.send_packet)
    gateway.poll.assert_called_once_with(loop_stub, radio_stub, router_stub)
    loop_stub.run_forever.assert_called_once_with()
    loop_stub.close.assert_called_once_with()
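# Hedged sketch (not part of the original test module): test_main above assumes that
# gateway.initialize_gpio, gateway.Radio, gateway.Router, gateway.poll and gateway.asyncio
# have already been replaced by Mock objects. One plausible way to arrange that is a
# setUp()/tearDown() pair like the one below; the patch targets are inferred from the
# attribute names used in the test, not taken from the original suite.
# from unittest.mock import patch  # assumed import

def setUp(self):
    self.patchers = [
        patch('gateway.initialize_gpio'),
        patch('gateway.Radio'),
        patch('gateway.Router'),
        patch('gateway.poll'),
        patch('gateway.asyncio'),
    ]
    for p in self.patchers:
        p.start()

def tearDown(self):
    for p in self.patchers:
        p.stop()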
def testIPSL():
    import gateway

    # connect to db
    db = ReplicaDB('sqlite:///ipsl2.db')

    # datasets to replicate
    datasets = ["cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.day.atmos.cfDay.r1i1p1",
                "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.day.atmos.day.r1i1p1",
                "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.fx.atmos.fx.r0i0p0",
                "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.fx.land.fx.r0i0p0",
                "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.fx.ocean.fx.r0i0p0",
                "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.3hr.atmos.3hr.r1i1p1",
                "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.6hr.atmos.6hrLev.r1i1p1",
                "cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.6hr.atmos.6hrPlev.r1i1p1"]

    for ds in datasets:
        cmd = '-g BADC -of --parent {0}'.format(ds)
        files = gateway.main(cmd.split(' '))
        print len(files)
        print files[0]
        db.add_all(files)
def setUp(self):
    self.config = testing.setUp()

    import ConfigParser
    from gateway.models import DBSession
    from gateway.models import initialize_sql

    config = ConfigParser.ConfigParser()
    config.readfp(open('testing.ini'))
    db_string = config.get('app:gateway', 'db_string')
    initialize_sql(db_string)

    from gateway import main
    from webtest import TestApp
    app = main(None, **{'db_string': db_string,
                        'mako.directories': config.get('app:gateway', 'mako.directories')})
    self.testapp = TestApp(app)
    self.session = DBSession()
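# Hedged companion to the setUp above: a minimal tearDown plus a smoke test against the
# WSGI app wrapped by WebTest. The '/' route is hypothetical, and DBSession.remove()
# assumes DBSession is a scoped_session (as in the usual Pyramid + SQLAlchemy scaffold);
# neither detail comes from the original test suite.
def tearDown(self):
    from gateway.models import DBSession
    DBSession.remove()
    testing.tearDown()

def test_index_page(self):
    res = self.testapp.get('/', status=200)   # hypothetical route
    self.assertEqual(res.status_int, 200)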
def updateCollections(gw_name):
    import gateway

    gw_server = gateway.getGatewayInfo()[gw_name]['server']
    db = getSession()

    datasets = {}
    for col in db.query(Collection).filter(Collection.state == 'published').filter(Collection.gateway == gw_server):
        for dataset in gateway.main(('-g %s --parent %s -do' % (col.gateway, col.id)).split()):
            datasets[dataset['id']] = col.id

    # search for orphans of this gateway
    for dataset in db.query(Dataset).filter(Dataset.parent_gateway == gw_server).filter(Dataset.parent_id == None):
        if dataset.id in datasets:
            dataset.parent_id = datasets[dataset.id]
        else:
            # perhaps the top level collection was retracted!
            print "Dataset has no parent", dataset.id

    db.commit()
def processGateway(gw_name):
    import urllib2, re, xml, gateway

    db = getSession()
    gw_url = gateway.getGatewayInfo()[gw_name]['url']
    gw_server = gateway.getGatewayInfo()[gw_name]['server']
    hlog.debug("jfp gw_url=%s, gw_server=%s", gw_url, gw_server)

    try:
        cmip5_id = _getCMIP5Id(gw_url)
    except:
        if gw_name == 'NCI':
            hlog.warn("_getCMIP5Id failed; but recognize gateway and will use 'cmip5'")
            cmip5_id = 'cmip5'
        else:
            print 'No CMIP5 found for Gateway %s. Check manually.(3)' % gw_name
            return
    hlog.debug("jfp cmip5_id=%s", cmip5_id)

    # get all top-level collections from the gateway
    gw_top = {}
    hlog.debug("jfp in processGateway, will call gateway.main with args %s",
               ('-g %s -co' % gw_name).split())
    collections = gateway.main(('-g %s -co' % gw_name).split())
    if collections == None:
        collections = []
    if collections == 2:  # quick fix; gateway.main should throw an exception instead
        collections = []
    for tlc in collections:
        gw_top[tlc['id']] = tlc

    # get already known collections
    db_col = {}
    for col in db.query(Collection).filter(Collection.gateway == gw_server).all():
        # within the gateway these are unique
        db_col[col.id] = col
    hlog.debug("jfp db_col=%s", db_col)

    db_ds = {}
    for ds in getLatestDatasetsQuery(gw_server).filter(Dataset.parent_gateway == gw_server).all():
        db_ds[ds.id] = ds

    # now get all CMIP5 datasets
    page_url = '%s/project/%s/dataset/feed.atom' % (gw_url, cmip5_id)
    hlog.debug("jfp in processGateway, about to open %s", page_url)
    # jfp 2012.09.11 This url is unavailable at NCI; the harvest will always fail.
    # if page_url.find('nci.org.au') > -1:  # jfp unreliable site, try the url twice (doesn't help)
    #     try:
    #         print "first attempt..."
    #         page = urllib2.urlopen(page_url, None, 120).read()
    #     except Exception as e:
    #         print "...failed: ", e
    #         print "second attempt..."
    try:
        page = urllib2.urlopen(page_url).read()
    except Exception as e:
        print "exception opening %s in processGateway: %s" % (page_url, e)
        raise e
    dom = xml.dom.minidom.parseString(page)

    counter = 0  # commit after a bunch
    existing_ds = {}
    hlog.debug("jfp %s dom entries", len(dom.getElementsByTagName('entry')))
    for entry in dom.getElementsByTagName('entry'):
        id = entry.getElementsByTagName('title')[0].childNodes[0].data
        timestamp = entry.getElementsByTagName('updated')[0].childNodes[0].data
        last_update = datetime.datetime(*time.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")[:6])

        # top level and normal datasets are mixed!
        if id in gw_top:
            # this is a top level for cmip5!
            print "Top level found", id
            if id in db_col:
                # update
                col = db_col[id]
                if last_update > col.modtime:
                    # we know this collection was modified! (not that we care now...)
                    print "Collection modified! was %s now is %s" % (col.modtime, last_update)
                    col.modtime = last_update
            else:
                # add new collection
                metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split())
                if metadata == None:
                    continue
                db.add(Collection(gateway=gw_server, id=id, modtime=last_update, state=metadata['state']))
            continue

        # remember this dataset for later
        existing_ds[id] = True

        if id in db_ds:
            # we know this normal dataset! Check if it has changed
            if db_ds[id].modtime == last_update:
                # old news...
                hlog.debug("Unchanged dataset %s, modtime=%s", id, last_update)
                continue
            print "Changed dataset found", id, db_ds[id].modtime, last_update
            hlog.info("Changed dataset found %s %s %s", id, db_ds[id].modtime, last_update)
            # something got changed!
            old_ds = db_ds[id]
            old_ds.modtime = last_update
        else:
            print "New dataset found", id, " on ", time.ctime()
            hlog.info("New dataset found %s on %s" % (id, time.ctime()))
            old_ds = None

        # new dataset version or something changed!
        metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split())
        if metadata == None or metadata == 2:
            continue
        hlog.debug("version %s", metadata['version'])

        # version work-around
        if metadata['state'] == 'retracted':
            print "retracted dataset"
            # this got retracted!
            if old_ds and old_ds.state != metadata['state']:
                # state changed!
                old_ds.state = metadata['state']
            continue
        if not metadata['catalog'] or not metadata['version']:
            print "Can't parse this, no catalog or version!!", metadata
            continue

        files = gateway.main(('-g %s --parent %s -fo' % (gw_name, id)).split())
        if files == None:
            files = []
        filecount = len(files)
        if filecount > 0:
            size = sum([int(f['size']) for f in files])
            # we assume this is per dataset defined, and not per file
            ep = files[filecount / 2]['endpoints']
            if ep:
                types = [e['type'] for e in ep]
            else:
                types = []
        else:
            # empty dataset?! There are some...
            size = 0
            types = []

        if old_ds and int(metadata['version']) == old_ds.version:
            print "Same version was updated!!"
            to_check_update = [('access_http', 'HTTPServer' in types),
                               ('access_gridftp', 'GridFTP' in types),
                               ('access_opendap', 'OPeNDAP' in types),
                               ('filecount', filecount),
                               ('size', size)]
            for var, value in to_check_update:
                report = ""
                old_value = old_ds.__dict__[var]
                if old_value != value:
                    # report and update
                    report += "Changed %s from %s to %s, " % (var, old_value, value)
                    old_ds.__dict__[var] = value
            continue  # Use old_ds instead of creating a new one.
        elif old_ds:
            # new version
            print "New version found %s, last one was %s; on %s" % \
                (metadata['version'], old_ds.version, time.ctime())
            hlog.info("New version found %s, last one was %s; on %s" %
                      (metadata['version'], old_ds.version, time.ctime()))

        # Definitely a new version of either an existing dataset or a new one.
        try:  # jfp added try/except
            db.add(Dataset(id=id, version=int(metadata['version']), catalog=metadata['catalog'],
                           state=metadata['state'], filecount=filecount, size=size,
                           access_http=('HTTPServer' in types), access_gridftp=('GridFTP' in types),
                           access_opendap=('OPeNDAP' in types), modtime=last_update,
                           parent_gateway=gw_server))
            db.flush()  # jfp will slow things down, but we'll catch problems right away
        except sqlalchemy.exc.IntegrityError:  # jfp added try/except
            print "exception adding dataset id=", id, " version=", metadata['version'], " state=", metadata['state']
            print "catalog=", metadata['catalog'], " modtime=", last_update, " parent_gateway=", gw_server
            print "access_http=", ('HTTPServer' in types), " access_gridftp=", ('GridFTP' in types)
            print sys.exc_info()[:2]
            db.rollback()  # jfp mandatory after a failed flush!
            # raise  # jfp Now should be able to continue with other datasets.

        if counter > 20:
            #db.commit()
            counter = 0
        else:
            counter += 1
    #db.commit()

    # Now we must find missing ones, so we delete them properly
    for col in db_col.values():
        for dataset in col.datasets:
            if not dataset.id in existing_ds:
                if dataset.state == 'published':
                    dataset.state = 'retracted'
                    print "dataset %s was removed by %s" % (dataset.id, time.ctime())
                    hlog.info("dataset %s was removed by %s" % (dataset.id, time.ctime()))
    #db.commit()
    # print "jfp finished with loop over db_col.values()"

    datasets = {}
    for col in db.query(Collection).filter(Collection.state == 'published').filter(Collection.gateway == gw_server):
        gdatasets = gateway.main(('-g %s --parent %s -do' % (col.gateway, col.id)).split())
        if gdatasets == None:
            gdatasets = []
        for dataset in gdatasets:
            datasets[dataset['id']] = col.id

    for d in db.new:
        if d.id in datasets:
            d.parent_id = datasets[d.id]
        else:
            print "problem with", d

    db.commit()
    print "jfp committed"
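# Hedged driver sketch: one plausible way to run the harvest above over every configured
# gateway. processGateway, gateway.getGatewayInfo and hlog come from the code above; the
# function name processAllGateways and the catch-and-log error handling are assumptions,
# not taken from the original scripts.
def processAllGateways():
    import gateway
    for gw_name in gateway.getGatewayInfo():
        try:
            processGateway(gw_name)
        except Exception as e:
            hlog.error("harvest of gateway %s failed: %s", gw_name, e)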
def processGatewayOld(gw_name, fast=True):
    """Old method for harvesting gateways"""
    import urllib2, re, xml, gateway

    db = getSession()
    gw_url = gateway.getGatewayInfo()[gw_name]['url']
    gw_server = gateway.getGatewayInfo()[gw_name]['server']

    # skip these
    skip_top_level = []

    try:
        cmip5_id = _getCMIP5Id(gw_url)
    except:
        print 'No CMIP5 found for Gateway %s. Check manually.(2)' % gw_name
        hlog.warn('No CMIP5 found for Gateway %s. Check manually.(2)' % gw_name)
        return

    # get already known collections
    db_col = {}
    for col in db.query(Collection).filter(Collection.gateway == gw_server).all():
        # within the gateway these are unique
        db_col[col.id] = col

    # now get known datasets
    db_ds = {}
    for ds in getLatestDatasetsQuery(gw_server).filter(Dataset.parent_gateway == gw_server).all():
        db_ds['%s#%s' % (ds.id, ds.version)] = ds

    counter = 0
    for col in _getCMIP5Collections(gw_url, cmip5_id):
        hlog.info("Processing Collection %s on %s" % (col, time.ctime()))
        if col in skip_top_level:
            print "Skipping"
            hlog.info("Skipping, time is %s" % (time.ctime()))
            continue
        if not col in db_col:
            # new collection!
            hlog.info("New collection %s on %s" % (col, time.ctime()))
            md = gateway.main(('-g %s --parent %s -mo' % (gw_name, col)).split())
            if md == None:
                continue
            # use a fictional date for the update so we know later on which
            # should be updated later
            db.add(Collection(id=col, gateway=gw_server, state=md['state'], modtime=dummy_date))

        existing_ds = {}
        datasets = gateway.main(('-g %s --parent %s -do' % (gw_name, col)).split())
        if datasets == None:
            datasets = []
        for dataset in datasets:
            ds_key = '%s#%s' % (dataset['id'], dataset['version'])
            # store for later
            existing_ds[ds_key] = True
            if ds_key in db_ds:
                old_ds = db_ds[ds_key]
                # should we update? (for now don't...)
                #if int(dataset['version']) == old_ds.version:
                #    same version... we might want to check... but in the common case this won't be
                #    necessary and is extremely expensive for this old way of getting things
                #    continue
            else:
                old_ds = None

            # Avoid reparsing already parsed datasets. They might change! e.g. they can be retracted.
            # They should be parsed once in a while
            if fast and old_ds:
                continue

            print "Processing dataset", ds_key, " on ", time.ctime()
            hlog.info("Processing dataset %s on %s" % (ds_key, time.ctime()))
            metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, dataset['id'])).split())
            if not metadata:
                continue

            # version work-around
            if metadata['state'] == 'retracted':
                print "retracted dataset"
                hlog.info("retracted dataset")
                # this got retracted!
                if old_ds and old_ds.state != metadata['state']:
                    # state changed!
                    old_ds.state = metadata['state']
                continue
            if not metadata['catalog'] or not metadata['version']:
                print "Can't parse this, no catalog or version!!", metadata
                hlog.info("Can't parse this, no catalog or version!! %s" % (metadata))
                continue

            # this is new!
            files = gateway.main(('-g %s --parent %s -fo' % (gw_name, dataset['id'])).split())
            if files == None:
                files = []
            filecount = len(files)
            if filecount > 0:
                size = sum([int(f['size']) for f in files])
                # we assume this is per dataset defined, and not per file
                # use some file in the middle for this
                ep = files[filecount / 2]['endpoints']
                if ep:
                    types = [e['type'] for e in ep]
                else:
                    types = []
            else:
                # empty dataset?! There are some...
                size = 0
                types = []

            if old_ds:
                # we will need to update the existing one
                old_ds.access_http = ('HTTPServer' in types)
                old_ds.access_gridftp = ('GridFTP' in types)
                old_ds.access_opendap = ('OPeNDAP' in types)
            else:
                db.add(Dataset(id=dataset['id'], version=int(metadata['version']),
                               catalog=metadata['catalog'], state=metadata['state'],
                               filecount=filecount, size=size,
                               access_http=('HTTPServer' in types),
                               access_gridftp=('GridFTP' in types),
                               access_opendap=('OPeNDAP' in types),
                               modtime=dummy_date, parent_gateway=gw_server, parent_id=col))
            if counter > 20:
                db.commit()
                counter = 0
            else:
                counter += 1
        # db.commit()  # jfp temporary extra commit, to aid debugging

        if col in db_col:
            print col, len(db_col[col].datasets), len(existing_ds)
            hlog.info("collection,lengths: %s, %s, %s on %s" %
                      (col, len(db_col[col].datasets), len(existing_ds), time.ctime()))
            for dataset in db_col[col].datasets:
                ds_key = '%s#%s' % (dataset.id, dataset.version)
                if not ds_key in existing_ds:
                    print "dataset %s was deleted" % ds_key
                    hlog.info("dataset %s was deleted" % ds_key)
                    db.delete(dataset)
                    #if dataset.state == 'published':
                    #    dataset.state = 'retracted'

    # commit the rest of the changes
    db.commit()
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    import getopt
    try:
        args, lastargs = getopt.getopt(argv, "g:D:e:dvqh", ['help', 'gateway-url=', 'parent='])
    except getopt.error:
        print sys.exc_info()[:3]
        return 1

    # init values
    db_name = 'replica.db'
    gatewayURL = gatewayName = regex = None
    parent_set = False
    gw_args = "-o"

    # parse arguments
    for flag, arg in args:
        if flag == '-g':
            gw_args = '%s -g %s' % (gw_args, arg)
        elif flag == '--gateway-url':
            gw_args = '%s --gateway-url %s' % (gw_args, arg)
        elif flag == '--parent':
            gw_args = '%s --parent %s' % (gw_args, arg)
            parent_set = True
        elif flag == '-D':
            db_name = arg
        elif flag == '-e':
            import re
            regex = re.compile(arg)
        # elif flag == '-x': retrieve_xml = True
        # elif flag == '-o': retrieve_object = True
        elif flag == '-d':
            log.setLevel(logging.DEBUG)
        elif flag == '-v':
            log.setLevel(logging.INFO)
        elif flag == '-q':
            log.setLevel(logging.CRITICAL)  # quiet mode; the logging module has no NONE level
        elif flag == '-h' or flag == '--help':
            return 1

    if not parent_set:
        gw_args = '-A ' + gw_args
        log.warn('Top level collection not set. Trying all known top-level collections (set with --parent)')
    else:
        gw_args = '-d ' + gw_args

    # get datasets
    log.info('Retrieving datasets from Gateway')
    log.debug('cmd: %s', gw_args)
    ds = gateway.main(gw_args.split(' '))
    if ds:
        log.debug('Total datasets: %s', len(ds))
    else:
        log.error('No Dataset was found!')
        return 0
    if regex:
        ds = [d['id'] for d in ds if regex.search(d['id'])]

    # prepare DB
    log.debug('DB: %s', db_name)
    log.debug('Entries: %s', len(ds))
    db = ReplicaDB('sqlite:///%s' % db_name)
    db.open()

    for d in ds:
        cmd = '{0} -of --parent {1}'.format(gw_args[3:], d)
        log.debug('getting files with %s', cmd)
        files = gateway.main(cmd.split(' '))
        if files:
            log.info('Adding %s files for dataset %s', len(files), d)
            db.add_all(files)
        else:
            log.error('no file found!')

    return 0
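# Hedged usage sketch: how this replication front-end might be invoked as a script.
# The option letters match the getopt spec above; the gateway name 'BADC', the script
# name and the dataset id in the example command line are illustrative only.
if __name__ == '__main__':
    # e.g.  python replicate.py -g BADC --parent cmip5.output1.IPSL.IPSL-CM5A-LR.aqua4K.fx.atmos.fx.r0i0p0 -D replica.db -v
    sys.exit(main())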
def processGateway(gw_name):
    import urllib2, re, xml, gateway

    db = getSession()
    gw_url = gateway.getGatewayInfo()[gw_name]['url']
    gw_server = gateway.getGatewayInfo()[gw_name]['server']

    try:
        cmip5_id = _getCMIP5Id(gw_url)
    except:
        print 'No CMIP5 found for Gateway %s. Check manually.' % gw_name
        return

    # get all top-level collections from the gateway
    gw_top = {}
    for tlc in gateway.main(('-g %s -co' % gw_name).split()):
        gw_top[tlc['id']] = tlc

    # get already known collections
    db_col = {}
    for col in db.query(Collection).filter(Collection.gateway == gw_server).all():
        # within the gateway these are unique
        db_col[col.id] = col

    db_ds = {}
    for ds in getLatestDatasetsQuery(gw_server).filter(Dataset.parent_gateway == gw_server).all():
        db_ds[ds.id] = ds

    # now get all CMIP5 datasets
    page = urllib2.urlopen('%s/project/%s/dataset/feed.atom' % (gw_url, cmip5_id)).read()
    dom = xml.dom.minidom.parseString(page)

    counter = 0  # commit after a bunch
    existing_ds = {}
    for entry in dom.getElementsByTagName('entry'):
        id = entry.getElementsByTagName('title')[0].childNodes[0].data
        timestamp = entry.getElementsByTagName('updated')[0].childNodes[0].data
        last_update = datetime.datetime(*time.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")[:6])

        # top level and normal datasets are mixed!
        if id in gw_top:
            # this is a top level for cmip5!
            print "Top level found", id
            if id in db_col:
                # update
                col = db_col[id]
                if last_update > col.modtime:
                    # we know this collection was modified! (not that we care now...)
                    print "Collection modified! was %s now is %s" % (col.modtime, last_update)
                    col.modtime = last_update
            else:
                # add new collection
                metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split())
                db.add(Collection(gateway=gw_server, id=id, modtime=last_update, state=metadata['state']))
            continue

        # remember this dataset for later
        existing_ds[id] = True

        if id in db_ds:
            # we know this normal dataset! Check if it has changed
            if db_ds[id].modtime == last_update:
                # old news...
                continue
            print "Changed dataset found", id, db_ds[id].modtime, last_update
            # something got changed!
            old_ds = db_ds[id]
        else:
            print "New dataset found", id
            old_ds = None

        # new dataset version or something changed!
        metadata = gateway.main(('-g %s --parent %s -mo' % (gw_name, id)).split())

        # version work-around
        if metadata['state'] == 'retracted':
            print "retracted dataset"
            # this got retracted!
            if old_ds and old_ds.state != metadata['state']:
                # state changed!
                old_ds.state = metadata['state']
            continue
        if not metadata['catalog'] or not metadata['version']:
            print "Can't parse this, no catalog or version!!", metadata
            continue

        files = gateway.main(('-g %s --parent %s -fo' % (gw_name, id)).split())
        filecount = len(files)
        if filecount > 0:
            size = sum([int(f['size']) for f in files])
            # we assume this is per dataset defined, and not per file
            ep = files[filecount / 2]['endpoints']
            if ep:
                types = [e['type'] for e in ep]
            else:
                types = []
        else:
            # empty dataset?! There are some...
            size = 0
            types = []

        if old_ds and int(metadata['version']) == old_ds.version:
            print "Same version was updated!!"
            to_check_update = [('access_http', 'HTTPServer' in types),
                               ('access_gridftp', 'GridFTP' in types),
                               ('access_opendap', 'OPeNDAP' in types),
                               ('filecount', filecount),
                               ('size', size)]
            for var, value in to_check_update:
                report = ""
                old_value = old_ds.__dict__[var]
                if old_value != value:
                    # report and update
                    report += "Changed %s from %s to %s, " % (var, old_value, value)
                    old_ds.__dict__[var] = value
            continue  # Use old_ds instead of creating a new one.
        elif old_ds:
            # new version
            print "New version found %s, last one was %s" % (metadata['version'], old_ds.version)

        # Definitely a new version of either an existing dataset or a new one.
        db.add(Dataset(id=id, version=int(metadata['version']), catalog=metadata['catalog'],
                       state=metadata['state'], filecount=filecount, size=size,
                       access_http=('HTTPServer' in types), access_gridftp=('GridFTP' in types),
                       access_opendap=('OPeNDAP' in types), modtime=last_update,
                       parent_gateway=gw_server))
        if counter > 20:
            #db.commit()
            counter = 0
        else:
            counter += 1
    #db.commit()

    # Now we must find missing ones, so we delete them properly
    for col in db_col.values():
        for dataset in col.datasets:
            if not dataset.id in existing_ds:
                if dataset.state == 'published':
                    dataset.state = 'retracted'
                    print "dataset %s was removed" % dataset.id
    #db.commit()

    datasets = {}
    for col in db.query(Collection).filter(Collection.state == 'published').filter(Collection.gateway == gw_server):
        for dataset in gateway.main(('-g %s --parent %s -do' % (col.gateway, col.id)).split()):
            datasets[dataset['id']] = col.id

    for d in db.new:
        if d.id in datasets:
            d.parent_id = datasets[d.id]
        else:
            print "problem with", d

    db.commit()