def call_local(self):
    '''
    Sort out and return local_file.

    This comes from the URL and local_dir and ends in .store
    '''
    if self.indb():
        if callable(self.local):
            self.msg(f"**unexpected method for self.local {self.local}")
        else:
            return self.local

    kwargs = fdict(self.__dict__.copy())
    if 'local_dir' in kwargs and \
            (kwargs['local_dir'] is not None) and \
            len(kwargs['local_dir']) > 0:
        self.local_dir = list_resolve(kwargs['local_dir'])
    if (self.local_dir is None) or (len(self.local_dir) == 0):
        self.local_dir = list_resolve(self.db_dir)
    self.local_file = Path(self.local_dir[0], self.as_posix().split("://")[1])
    # replace ' ' with '_' so the path is shell-safe
    self.local_file = Path(str(self.local_file).replace(' ', '_'))
    suffix = self.local_file.suffix
    self.local_file = self.local_file.with_suffix(suffix + '.store')
    self.check_path(self.local_file.parent)
    self.local_file.parent.mkdir(parents=True, exist_ok=True)
    return self.local_file
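# A minimal standalone sketch of the path derivation in call_local(),
# using only pathlib. The URL, local_dir and expected output below are
# illustrative assumptions, not values fixed by the class.
from pathlib import Path

url = 'https://e4ftl01.cr.usgs.gov/MOTA/MCD15A3H.006/file name.hdf'
local_dir = 'work'
# strip the scheme and root the remainder under local_dir
local_file = Path(local_dir, url.split('://')[1])
# replace ' ' with '_' so the path is shell-safe
local_file = Path(str(local_file).replace(' ', '_'))
# append .store to the existing suffix
local_file = local_file.with_suffix(local_file.suffix + '.store')
print(local_file)
# work/e4ftl01.cr.usgs.gov/MOTA/MCD15A3H.006/file_name.hdf.store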
def main():
    if False:
        u = 'https://e4ftl01.cr.usgs.gov/MOTA/MCD15A3H.006/2003.12.11/MCD15A3H.A2003345.h09v06.006.2015084002115.hdf'
        url = URL(u)
        data = url.read_bytes()
        ofile = Path('data', url.name)
        osize = ofile.write_bytes(data)
        assert osize == 3365255
        print('passed')

    if False:
        u = 'https://e4ftl01.cr.usgs.gov/MOTA/MCD15A3H.006/2003.12.11'
        url = URL(u)
        files = url.glob('*0.hdf', pre_filter=True)
        print(files)

    if True:
        u = 'https://e4ftl01.cr.usgs.gov'
        import os
        os.environ['CACHE_FILE'] = 'data/database.db'

        url = URL(u, verbose=True, db_file='data/new_db.txt', local_dir='work')
        rlist = url.glob('MOT*/MCD15A3H.006/2003.12.11/*0.hdf', pre_filter=True)
        for i, r in enumerate(rlist):
            print(i)
            # we can save on declaring a new URL by passing the old one
            u = URL(r, **(fdict(url.__dict__.copy())))
            data = u.read_bytes()
            # update the database
            u.flush()
def _convert_to_abs(self, ilist):
    # this is slow and may not be needed
    self.msg(f'parsing URLs from html file {len(ilist)} items')
    return [self.update(*[str(self), l.rstrip('/#')],
                        **(fdict(self.__dict__.copy()))) for l in ilist]
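# For orientation, the same relative-to-absolute idea sketched with the
# standard library alone; _convert_to_abs() instead goes through
# self.update() so each result keeps the parent URL's keyword state.
# The base URL and href list here are illustrative.
from urllib.parse import urljoin

base = 'https://e4ftl01.cr.usgs.gov/MOTA/MCD15A3H.006/'
hrefs = ['2003.12.11/', '2003.12.15/#']
print([urljoin(base, h.rstrip('/#')) for h in hrefs])
# ['https://e4ftl01.cr.usgs.gov/MOTA/MCD15A3H.006/2003.12.11',
#  'https://e4ftl01.cr.usgs.gov/MOTA/MCD15A3H.006/2003.12.15']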
def init(self, **kwargs):
    self.__dict__.update(ginit(self, **kwargs))
    if 'database' in self.__dict__ and type(self.database) == Database:
        # already have a database stored
        pass
    else:
        self.database = Database(self.db_file,
                                 **(fdict(self.__dict__.copy())))
def glob(self, pattern, pre_filter=True):
    '''
    Iterate over this subtree and yield all existing files (of any
    kind, including directories) matching the given relative pattern.

    The URL here then needs to return lxml html code.

    Positional arguments:
       pattern : pattern to search for, e.g. */2021.*.01
                 only the wildcards * and ? are considered at present
    '''
    u = self
    url = str(u)
    if url[-1] == '/':
        url = url[:-1]
    url = self.update(url, pattern)

    # check in database
    store_url = url
    store_flag = 'query'
    olist = self.database.get_from_db(store_flag, store_url)
    if olist is not None:
        if type(olist) is list:
            return [self.update(o) for o in olist]
        return [self.update(olist)]

    # start at the top and expand the pattern one segment at a time
    uc = np.array(url.parts)
    for i, w in enumerate(uc[1:]):
        if i == 0:
            base_list = [self.update(uc[0])]
        new_list = []
        for b in base_list:
            # glob the next pattern segment against this base
            glob = self.update(b)._glob(w, pre_filter=pre_filter)
            new_list = new_list + glob
        base_list = np.unique(np.array(new_list, dtype=object).flatten())
    base_list = np.unique(np.array(base_list, dtype=object))
    olist = list(np.array([self.update(i) for i in base_list]).flatten())
    self.dedate()
    for l in olist:
        l.init(**(fdict(self.__dict__.copy())))

    # cache this in case we want to re-use it
    cache = {store_flag: {str(store_url): [str(i) for i in olist]}}
    self.database.set_db(cache)

    if type(olist) is list:
        return [self.update(o) for o in olist]
    return [self.update(olist)]
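# glob() above expands the pattern one path segment at a time: each
# wildcard segment is matched against the children of the current set of
# bases, and the survivors become the bases for the next segment. A
# minimal in-memory analogue of that walk (fnmatch stands in for _glob();
# the tree and names are made up for illustration):
from fnmatch import fnmatch

def expand(bases, segments, listdir):
    for seg in segments:
        bases = [f'{b}/{c}' for b in bases
                 for c in listdir(b) if fnmatch(c, seg)]
    return bases

tree = {'root': ['2003.12.11', '2004.01.01'],
        'root/2003.12.11': ['a0.hdf', 'b1.hdf']}
print(expand(['root'], ['2003.*', '*0.hdf'], lambda b: tree.get(b, [])))
# ['root/2003.12.11/a0.hdf']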
def __init__(self, **kwargs):
    kwargs['defaults'] = {
        'store_msg'  : [],
        'database'   : None,
        'product'    : 'MCD15A3H',
        'tile'       : 'h08v06',
        'log'        : None,
        'day'        : '01',
        'doy'        : None,
        'month'      : '*',
        'sds'        : None,
        'year'       : "2019",
        'site'       : 'https://e4ftl01.cr.usgs.gov',
        'size_check' : False,
        'noclobber'  : True,
        'local_dir'  : 'work',
        'local_file' : None,
        'db_file'    : None,
        'db_dir'     : 'work',
        'verbose'    : False,
        'stderr'     : sys.stderr
    }
    self.__dict__.update(ginit(self, **kwargs))
    if 'database' in self.__dict__ and type(self.database) == Database:
        # already have a database stored
        pass
    else:
        self.database = Database(
            self.db_file,
            **(fdict(self.__dict__.copy(), ignore=['db_dir', 'db_file'])))
    self.translateoptions = gdal.TranslateOptions(
        gdal.ParseCommandLine("-of Gtiff -co COMPRESS=LZW"))

    # list of tiles
    if type(self.tile) is str:
        self.tile = [self.tile]
    if type(self.sds) is str:
        self.sds = [self.sds]
    if self.sds is not None:
        self.msg(f'initial SDS {self.sds}')
        self.required_sds = self.sds

    # for most transactions, we want all SDS,
    # so self.sds should reflect that
    self.sds = None
    response = self.database.get_from_db('SDS', self.product)
    if response:
        self.msg("found SDS names in database")
        self.sds = response
        self.msg(self.sds)
        # require them all
        if 'required_sds' not in self.__dict__:
            self.required_sds = self.sds
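# A hedged sketch of the defaults pattern used in __init__ above: values
# in kwargs['defaults'] are overridden by any user-supplied kwargs, and
# the merged result seeds self.__dict__. ginit() is assumed to behave
# roughly like this plain-dict merge.
defaults = {'product': 'MCD15A3H', 'tile': 'h08v06', 'verbose': False}
user_kwargs = {'tile': ['h17v03', 'h18v03'], 'verbose': True}
merged = {**defaults, **user_kwargs}
print(merged)
# {'product': 'MCD15A3H', 'tile': ['h17v03', 'h18v03'], 'verbose': True}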
def update(self, *args, **kwargs):
    '''update args in object'''
    if '_cache_original' not in self.__dict__:
        self._cache_original = self.__dict__.copy()

    # whether we specify the full URL in update or not
    if ('full_url' in kwargs) and (kwargs['full_url'] == True):
        args = list(args)
    else:
        args = [str(self)] + list(args)
    url = super(URL, self).__new__(self, *args)
    url.is_clone = True
    url.__dict__ = fdict(self._cache_original.copy())
    return url
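# Sketch of the clone-with-snapshot idea behind update(): the first call
# caches the original instance state, and every clone is rebuilt from
# that snapshot so options survive repeated updates. Box is a toy class,
# not part of the module.
class Box:
    def __init__(self, **kw):
        self.__dict__.update(kw)

    def update(self, **kw):
        if '_cache_original' not in self.__dict__:
            self._cache_original = self.__dict__.copy()
        clone = Box(**self._cache_original)
        clone.__dict__.update(kw)
        return clone

b = Box(verbose=True, local_dir='work')
c = b.update(local_dir='data')
print(b.local_dir, c.local_dir, c.verbose)
# work data True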
def __init__(self, **kwargs):
    kwargs['defaults'] = {
        'store_msg'  : [],
        'database'   : None,
        'product'    : 'MCD15A3H',
        'tile'       : 'h08v06',
        'log'        : None,
        'day'        : '01',
        'doy'        : None,
        'month'      : '*',
        'sds'        : None,
        'year'       : "2019",
        'site'       : 'https://e4ftl01.cr.usgs.gov',
        'size_check' : False,
        'noclobber'  : True,
        'local_dir'  : 'work',
        'local_file' : None,
        'db_file'    : None,
        'db_dir'     : 'work',
        'verbose'    : False,
        'stderr'     : sys.stderr
    }
    self.__dict__.update(ginit(self, **kwargs))
    if 'database' in self.__dict__ and type(self.database) == Database:
        # already have a database stored
        pass
    else:
        self.database = Database(
            self.db_file,
            **(fdict(self.__dict__.copy(), ignore=['db_dir', 'db_file'])))
    self.translateoptions = gdal.TranslateOptions(
        gdal.ParseCommandLine("-of Gtiff -co COMPRESS=LZW"))

    # list of tiles
    if type(self.tile) is str:
        self.tile = [self.tile]
    if type(self.sds) is str:
        self.sds = [self.sds]
def call_local(self):
    '''
    Sort out and return local_file.

    This comes from the URL and local_dir and ends in .store
    '''
    kwargs = fdict(self.__dict__.copy())
    if 'local_dir' in kwargs and \
            (kwargs['local_dir'] is not None) and \
            len(kwargs['local_dir']) > 0:
        self.local_dir = list_resolve(kwargs['local_dir'])
    if (self.local_dir is None) or (len(self.local_dir) == 0):
        self.local_dir = list_resolve(self.db_dir)
    self.local_file = Path(self.local_dir[-1],
                           str(self.with_scheme(''))[2:]).absolute()
    # replace ' ' with '_' so the path is shell-safe
    self.local_file = Path(str(self.local_file).replace(' ', '_'))
    suffix = self.local_file.suffix
    self.local_file = self.local_file.with_suffix(suffix + '.store')
    self.check_path(self.local_file.parent)
    self.local_file.parent.mkdir(parents=True, exist_ok=True)
    return self.local_file
def stitch_date(self, year, doy, get_files=False, test=False):
    '''stitch data for date'''
    year = int(year)
    doy = int(doy)
    dater = (datetime.datetime(year, 1, 1) +
             datetime.timedelta(doy - 1)).strftime('%Y %m %d').split()
    self.year = f'{year}'
    self.month = f'{str(int(dater[1])):0>2s}'
    self.day = f'{str(int(dater[2])):0>2s}'

    d = self.__dict__.copy()
    fd = fdict(d)
    # don't need to read it
    fd['no_read'] = True
    ofilebase = f"{self.product}/data.__SDS__." + \
                f"{'_'.join(self.tile)}.{self.year}.{self.month}.{self.day}"

    hdf_urls = self.get_url(**(fd))
    if not (len(hdf_urls) and (type(hdf_urls[0]) == URL)):
        if get_files:
            return None, None
        return [None]

    if 'db_file' in self.__dict__:
        if 'database' not in self.__dict__:
            # load database
            d = self.__dict__.copy()
            self.database = Database(
                self.db_file, **(fdict(d, ignore=['db_dir', 'db_file'])))

    if not test and not get_files:
        # look up in db
        warp_args = None
        dstNodata = None
        step = 1
        store_flag = 'modis'
        kwargs = {'year': self.year, 'doy': doy, 'day': self.day,
                  'month': self.month, 'step': step,
                  'warp_args': warp_args, 'product': self.product,
                  'dstNodata': dstNodata, 'tile': self.tile}
        mkey = json.dumps(kwargs)
        # this is an hdf file
        response = self.database.get_from_db(store_flag, mkey)
        if response and self.noclobber:
            # test
            if self.test_ok(response[0]):
                # safe to return
                self.msg(f'positive response from database')
                ofiles = response
                return ofiles
            else:
                msg = f'WARNING: invalid entry {response[0]} in database {str(self.db_file)}'
                print(msg)
                self.msg(msg)

    try:
        hdf_files = [str(f.local()) for f in hdf_urls]
    except Exception:
        for f in hdf_urls:
            d = f.read_bytes()
        hdf_files = [str(f.local()) for f in hdf_urls]

    if get_files:
        sds = self.get_sds(hdf_files, do_all=False)
        return hdf_files, sds

    sds = self.get_sds(hdf_files, do_all=True)
    if sds == []:
        for f in hdf_urls:
            d = f.read_bytes()
        hdf_files = [str(f.local()) for f in hdf_urls]
        sds = self.get_sds(hdf_files, do_all=True)

    # early return if we just want sds
    if test == True:
        return sds

    if len(sds) == 0:
        # failed to get SDS: need to download an example file
        for f in hdf_urls:
            d = f.read_bytes()
        hdf_files = [str(f.local()) for f in hdf_urls]
        sds = self.get_sds(hdf_files, do_all=True)

    ofiles = []
    if len(sds) > len(self.sds):
        self.msg(f"ERROR in product {self.product} specification of SDS")
        self.msg(f"all SDS claimed to be: {len(self.sds)}")
        self.msg(self.sds)
        self.msg(f"But request for {len(sds)} SDSs made")
        self.msg(sds)
        sys.exit(1)

    for i, sd in enumerate(sds):
        ofile = f'{ofilebase.replace("__SDS__", self.sds[i])}.vrt'.replace(' ', '_')
        spatial_file = Path(f"{self.local_dir[0]}", ofile)
        spatial_file.parent.mkdir(parents=True, exist_ok=True)
        g = gdal.BuildVRT(spatial_file.as_posix(), sds[i])
        if not g:
            d = self.__dict__
            print(f"problem building dataset for {spatial_file} with {fdict(d)}")
            sys.exit(1)
        del g
        ofiles.append(Path(spatial_file).absolute().as_posix())

    # store in db
    cache = {store_flag: {mkey: ofiles}}
    #self.database.set_db(cache, write=True)
    return ofiles
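# The (year, doy) handling at the top of stitch_date() is worth checking
# in isolation: day-of-year 32 of 2019 should map to 2019 02 01. A pure
# standard-library check, no class state involved:
import datetime

year, doy = 2019, 32
d = datetime.datetime(year, 1, 1) + datetime.timedelta(doy - 1)
print(d.strftime('%Y %m %d').split())
# ['2019', '02', '01']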
def __init__(self, args, **kwargs):
    '''
    kwargs setup and organisation of local_dir and db_dir

    args are database files
    '''
    defaults = {
        'verbose'  : False,
        'db_dir'   : list_resolve(['~/.url_db']),
        'db_file'  : None,
        'log'      : None,
        'database' : None,
        'stderr'   : sys.stderr,
    }
    defaults.update(kwargs)
    old_db = defaults['database']
    self.__dict__.update(defaults)

    if ('database' in self.__dict__) and (type(self.database) is Database):
        try:
            print("WARNING: shouldn't be here ... ")
            # in case a database object was passed
            this = self.database.__dict__
            self.__dict__.update(fdict(this))
            if type(old_db) is dict:
                self.database.update(old_db)
        except Exception:
            pass

    if self.log is not None:
        try:
            self.stderr = Path(self.log).open("a")
            if self.verbose:
                try:
                    msg = f"database: log file {self.log}"
                    self.store_msg.append(msg)
                    print(msg, file=sys.stderr)
                except Exception:
                    pass
        except Exception:
            self.stderr = sys.stderr
            self.msg(f"WARNING: failure to open log file {self.log}")

    if type(self.db_file) is str:
        self.db_file = [self.db_file]

    # database files
    if self.db_file is None:
        self.db_file = args
    if (self.db_file is not None) and type(self.db_file) is not list:
        self.db_file = [self.db_file]
    if (self.db_dir is not None) and type(self.db_dir) is not list:
        self.db_dir = [self.db_dir]

    # there may be a cache
    if 'CACHE_FILE' in os.environ and os.environ['CACHE_FILE'] is not None:
        db_file = [str(l) for l in list_resolve(os.environ['CACHE_FILE'])]
        self.msg(f'using cache {db_file}')
        if self.db_file is None:
            self.db_file = db_file
        else:
            self.db_file = list_resolve(self.db_file + db_file)

    # in case still none
    if (self.db_file is None) or \
            ((type(self.db_file) is list) and len(self.db_file) == 0):
        # in case self.db_dir is none
        if (self.db_dir is None) or \
                ((type(self.db_dir) is list) and len(self.db_dir) == 0):
            self.db_dir = list_resolve([Path('~', '.url_db')])
        self.db_file = [Path(d, '.db.yml') for d in self.db_dir]

    if type(self.db_file) is str:
        self.db_file = [self.db_file]
    self.db_file = list_resolve([Path(f) for f in self.db_file])
    self.db_dir = [Path(d).parent for d in self.db_file]

    if self.database and len(self.database.keys()):
        self.msg('getting database from command line')
    else:
        self.database = self.set_db(dict(self.get_db()))
    self.init_database = self.database.copy()
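# The CACHE_FILE hook above lets an environment variable contribute an
# extra database file on top of whatever was passed in. A standalone
# illustration of that precedence (file names are illustrative):
import os

os.environ['CACHE_FILE'] = 'data/database.db'
db_file = ['data/new_db.txt']
if os.environ.get('CACHE_FILE'):
    db_file = db_file + [os.environ['CACHE_FILE']]
print(db_file)
# ['data/new_db.txt', 'data/database.db']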
def __init__(self, args, **kwargs):
    '''
    kwargs setup and organisation of local_dir and db_dir

    args are database files
    '''
    defaults = {
        'verbose'  : False,
        'db_dir'   : None,
        'db_file'  : None,
        'log'      : None,
        'database' : None,
        'stderr'   : sys.stderr,
    }
    # try to read from ~/.url_db/init.yml
    initfile = Path('~/.url_db/init.yml').expanduser().absolute()
    if initfile.exists():
        #self.msg(f'reading init file {initfile.as_posix()}')
        with initfile.open('r') as f:
            info = yaml.safe_load(f)
    else:
        info = {}
    defaults.update(info)
    defaults.update(kwargs)
    old_db = defaults['database']
    self.__dict__.update(defaults)

    if ('database' in self.__dict__) and (type(self.database) is Database):
        try:
            print("WARNING: shouldn't be here ... ")
            # in case a database object was passed
            this = self.database.__dict__
            self.__dict__.update(fdict(this))
            if type(old_db) is dict:
                self.database.update(old_db)
        except Exception:
            pass

    if self.log is not None:
        try:
            self.stderr = Path(self.log).open("a")
            if self.verbose:
                try:
                    msg = f"database: log file {self.log}"
                    self.store_msg.append(msg)
                    print(msg, file=sys.stderr)
                except Exception:
                    pass
        except Exception:
            self.stderr = sys.stderr
            self.msg(f"WARNING: failure to open log file {self.log}")

    if type(self.db_file) is str:
        self.db_file = [self.db_file]

    # database files
    if self.db_file is None:
        self.db_file = args
    if (self.db_file is not None) and type(self.db_file) is not list:
        self.db_file = [self.db_file]
    if (self.db_dir is not None) and type(self.db_dir) is not list:
        self.db_dir = [self.db_dir]

    # there may be a cache, unless an init file was read
    if info == {} and 'CACHE_FILE' in os.environ and \
            os.environ['CACHE_FILE'] is not None:
        db_file = [str(l) for l in list_resolve(os.environ['CACHE_FILE'])]
        #self.msg(f'using cache {db_file}')
        if self.db_file is None:
            self.db_file = db_file
        else:
            self.db_file = list_resolve(self.db_file + db_file)

    if (type(self.db_dir) is list) and len(self.db_dir) == 0:
        self.db_dir = None
    if (type(self.db_file) is list) and len(self.db_file) == 0:
        self.db_file = None
    if type(self.db_file) is str:
        self.db_file = [self.db_file]
    if type(self.db_dir) is str:
        self.db_dir = [self.db_dir]

    # writeable db_files
    can_write = False
    if self.db_file is not None:
        # i.e. we apparently have something
        for d in self.db_file:
            try:
                Path(d).touch()
                can_write = True
            except Exception:
                pass

    # in case still none, or nothing writeable
    if (not can_write) or (self.db_file is None):
        # in case self.db_dir is none
        if self.db_dir is None:
            self.db_dir = list_resolve([Path('~', '.url_db')])
        if self.db_file is None:
            self.db_file = [Path(d, '.db.yml') for d in self.db_dir]
        else:
            self.db_file.extend([Path(d, '.db.yml') for d in self.db_dir])

    self.db_file = list_resolve([Path(f) for f in self.db_file])
    self.db_dir = [Path(d).parent for d in self.db_file]

    if self.database and len(self.database.keys()):
        self.msg('getting database from command line')
    else:
        self.database = self.set_db(dict(self.get_db()))
    self.init_database = self.database.copy()
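# The writeability probe above touches each candidate db_file and falls
# back to a default when nothing is writeable. A minimal standalone
# version of that probe (first_writable is a hypothetical helper, not
# part of the module):
from pathlib import Path
import tempfile

def first_writable(candidates):
    # return the first path we can actually create/touch, else None
    for f in candidates:
        try:
            Path(f).touch()
            return Path(f)
        except OSError:
            pass
    return None

tmp = Path(tempfile.mkdtemp())
print(first_writable(['/no/such/dir/db.yml', tmp / 'db.yml']))
# .../db.yml  (the second candidate; the first directory does not exist)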
def stitch_date(self, year, doy):
    '''stitch data for date'''
    year = int(year)
    doy = int(doy)
    dater = (datetime.datetime(year, 1, 1) +
             datetime.timedelta(doy - 1)).strftime('%Y %m %d').split()
    self.year = f'{year}'
    self.month = f'{str(int(dater[1])):0>2s}'
    self.day = f'{str(int(dater[2])):0>2s}'

    d = self.__dict__.copy()
    hdf_urls = self.get_url(**(fdict(d)))
    if not (len(hdf_urls) and (type(hdf_urls[0]) == URL)):
        return [None]

    if 'db_file' in self.__dict__:
        if 'database' not in self.__dict__:
            # load database
            d = self.__dict__.copy()
            self.database = Database(
                self.db_file, **(fdict(d, ignore=['db_dir', 'db_file'])))

    # look up in db
    this_set = f"{self.product}.{'_'.join(self.tile)}.{self.year}.{self.month}.{self.day}"
    store_flag = 'modis'
    response = self.database.get_from_db(store_flag, this_set)
    if response and self.noclobber:
        # test
        if self.test_ok(response[0]):
            # safe to return
            self.msg(f'positive response from database')
            ofiles = response
            return ofiles
        else:
            msg = f'WARNING: invalid entry {response[0]} in database {str(self.db_file)}'
            print(msg)
            self.msg(msg)

    for f in hdf_urls:
        d = f.read_bytes()
    hdf_files = [str(f.local()) for f in hdf_urls]

    sds = self.get_sds(hdf_files, do_all=True)
    ofiles = []
    if len(sds) > len(self.sds):
        self.msg(f"ERROR in product {self.product} specification of SDS")
        self.msg(f"all SDS claimed to be: {len(self.sds)}")
        self.msg(self.sds)
        self.msg(f"But request for {len(sds)} SDSs made")
        self.msg(sds)
        sys.exit(1)

    for i, sd in enumerate(sds):
        ofile = f"data.{self.sds[i]}." + \
                f"{'_'.join(self.tile)}.{self.year}.{self.month}.{self.day}.vrt"
        ofile = ofile.replace(' ', '_')
        spatial_file = Path(f"{self.local_dir[0]}", ofile)
        g = gdal.BuildVRT(spatial_file.as_posix(), sds[i])
        if not g:
            d = self.__dict__
            print(f"problem building dataset for {spatial_file} with {fdict(d)}")
            sys.exit(1)
        del g
        ofiles.append(Path(spatial_file).absolute().as_posix())

    # store in db
    cache = {store_flag: {this_set: ofiles}}
    self.database.set_db(cache, write=True)
    return ofiles
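# The mosaicking step in stitch_date() relies on gdal.BuildVRT to stitch
# one SDS across tiles into a single virtual raster. A minimal standalone
# use, assuming GDAL's Python bindings are installed; the input list is a
# placeholder for any readable rasters:
from osgeo import gdal

src = ['work/tile_h17v03.vrt', 'work/tile_h18v03.vrt']  # hypothetical inputs
g = gdal.BuildVRT('work/data.Lai_500m.2019.02.01.vrt', src)
if not g:
    raise RuntimeError('gdal.BuildVRT failed')
del g  # dereference the dataset to flush the VRT to disk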