def test_urlQueryParser():
    # the order of the query parameters in the result is not guaranteed, so accept both orderings
    assert anc.urlQueryParser('www.somepage.foo', {'foo': 'bar', 'page': 1}) in [
        'www.somepage.foo?foo=bar&page=1',
        'www.somepage.foo?page=1&foo=bar'
    ]
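# A minimal sketch of how a query parser like anc.urlQueryParser could be built on top of the
# standard library; this is an assumption for illustration, not the project's actual
# implementation. Since dictionary iteration order is not guaranteed across all Python versions,
# the test above accepts both parameter orderings.
from urllib.parse import urlencode

def url_query_parser_sketch(url, querydict):
    # join the base url and the url-encoded key/value pairs with a '?'
    return '{0}?{1}'.format(url, urlencode(querydict))

# url_query_parser_sketch('www.somepage.foo', {'foo': 'bar', 'page': 1})
# -> 'www.somepage.foo?foo=bar&page=1'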
def catch(self, sensor, osvtype='POE', start=None, stop=None):
    """
    check a server for files

    Parameters
    ----------
    sensor: str or list
        The S1 mission(s):

        - 'S1A'
        - 'S1B'
        - ['S1A', 'S1B']
    osvtype: {'POE', 'RES'}
        the type of orbit files required
    start: str
        the date to start searching for files in format YYYYmmddTHHMMSS
    stop: str
        the date to stop searching for files in format YYYYmmddTHHMMSS

    Returns
    -------
    list
        the URLs of the remote OSV files
    """
    # a dictionary for storing the url arguments
    query = {}

    if osvtype == 'POE':
        query['product_type'] = 'AUX_POEORB'
    elif osvtype == 'RES':
        query['product_type'] = 'AUX_RESORB'
    else:
        raise RuntimeError("osvtype must be either 'POE' or 'RES'")

    if sensor in ['S1A', 'S1B']:
        query['sentinel1__mission'] = sensor
    elif sorted(sensor) == ['S1A', 'S1B']:
        pass
    else:
        raise RuntimeError('unsupported input for parameter sensor')

    # the collection of files to be returned
    collection = []

    # set the defined date or the date of the first existing OSV file otherwise;
    # the query filters on both validity_start and validity_stop, so no padding
    # of the search window is needed here
    if start is not None:
        date_start = datetime.strptime(start, '%Y%m%dT%H%M%S').strftime('%Y-%m-%dT%H:%M:%S')
    else:
        date_start = '2014-07-31'
    # set the defined date or the current date otherwise
    if stop is not None:
        date_stop = datetime.strptime(stop, '%Y%m%dT%H%M%S').strftime('%Y-%m-%dT%H:%M:%S')
    else:
        date_stop = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

    # append the time frame to the query dictionary
    query['validity_start__gte'] = date_start
    query['validity_stop__lte'] = date_stop

    print('searching for new {} files'.format(osvtype))

    target = urlQueryParser(self.url, query).replace('%3A', ':')
    print(target)

    # follow the paginated JSON response until the last page is reached
    while target is not None:
        response = requests.get(target, timeout=self.timeout).json()
        remotes = [item['remote_url'] for item in response['results']]
        collection += remotes
        target = response['next']

    # in case the type 'RES' is selected then only return those files covering
    # a time period not covered by any POE file
    if osvtype == 'RES' and self.maxdate('POE', 'stop') is not None:
        collection = [x for x in collection
                      if self.date(x, 'start') > self.maxdate('POE', 'stop')]
    return collection
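# A standalone sketch of the pagination pattern used in the catch() version above: keep
# requesting the 'next' link of the JSON response until it is null. The endpoint URL is a
# placeholder and the 'results'/'remote_url'/'next' field names mirror the method above;
# treat them as assumptions rather than a documented API contract.
import requests

def collect_paginated(url, timeout=20):
    collection = []
    while url is not None:
        response = requests.get(url, timeout=timeout).json()
        # gather the download links of the current page
        collection += [item['remote_url'] for item in response['results']]
        # 'next' points to the following page and is None on the last one
        url = response['next']
    return collection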
def catch(self, sensor, osvtype='POE', start=None, stop=None):
    """
    check a server for files

    Parameters
    ----------
    sensor: str or list
        The S1 mission(s):

        - 'S1A'
        - 'S1B'
        - ['S1A', 'S1B']
    osvtype: {'POE', 'RES'}
        the type of orbit files required
    start: str
        the date to start searching for files
    stop: str
        the date to stop searching for files

    Returns
    -------
    list
        the URLs of the remote OSV files
    """
    address, outdir = self._typeEvaluate(osvtype)

    # a dictionary for storing the url arguments
    query = {'page': 1}

    # a list of pages to be searched; will be extended during url readout
    pages = [1]

    if sensor in ['S1A', 'S1B']:
        query['sentinel1__mission'] = sensor
    elif sorted(sensor) == ['S1A', 'S1B']:
        pass
    else:
        raise RuntimeError('unsupported input for parameter sensor')

    # the collection of files to be returned
    files = []

    # set the defined date or the date of the first existing OSV file otherwise;
    # two days are added/subtracted from the defined start and stop dates since the
    # online query only allows searching by the start time; hence, if e.g. the start
    # date is 2018-01-01T000000, the query would not return the corresponding file,
    # whose start date is 2017-12-31 (V20171231T225942_20180102T005942)
    if start is not None:
        date_start = datetime.strptime(start, '%Y%m%dT%H%M%S')
        date_start = (date_start - timedelta(days=2)).strftime('%Y-%m-%d')
    else:
        date_start = '2014-07-31'
    # set the defined date or the current date otherwise
    if stop is not None:
        date_stop = datetime.strptime(stop, '%Y%m%dT%H%M%S')
        date_stop = (date_stop + timedelta(days=2)).strftime('%Y-%m-%d')
    else:
        date_stop = time.strftime('%Y-%m-%d')

    # pattern for scanning urlopen response for links to OSV files
    pattern_url = 'http.*{}'.format(self.pattern)

    # append the time frame to the query dictionary
    query['validity_start'] = '{0}..{1}'.format(date_start, date_stop)

    print('searching for new {} files'.format(osvtype))

    # iterate through the url pages and look for files
    while len(pages) > 0:
        # parse the url
        subaddress = urlQueryParser(address, query)
        # read the remote content
        try:
            response = urlopen(subaddress, context=self.sslcontext).read().decode('utf-8')
            print(subaddress)
        except IOError as e:
            raise RuntimeError(e)

        if query['page'] == 1:
            # read all existing pages from the url return of the first page
            pages_str = re.findall('page=[0-9]+', response)
            pages = list(set([int(x.strip('page=')) for x in pages_str]))
        else:
            # delete the page from the list of pages yet to be searched
            del pages[pages.index(query['page'])]

        # list all osv files found on the page
        remotes = sorted(set(re.findall(pattern_url, response)))

        # do a more accurate filtering of the time stamps
        if start is not None:
            remotes = [x for x in remotes if self.date(x, 'stop') >= start]
        if stop is not None:
            remotes = [x for x in remotes if self.date(x, 'start') <= stop]

        # filter files already existing in the files collection
        selection = [x for x in remotes if x not in files]
        if len(selection) > 0:
            # append the found files to the collection
            files += selection

        # increment the url page
        query['page'] += 1

    # in case the type 'RES' is selected then only return those files covering
    # a time period not covered by any POE file
    if osvtype == 'RES':
        files = [x for x in files
                 if self.date(x, 'start') > self.maxdate('POE', 'stop')]
    return files
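# A self-contained demonstration of the two-day padding applied to the search window above:
# the server-side query matches on the validity start time only, so the window is widened
# before the precise per-file filtering with self.date() is applied. The sample dates are
# made up for illustration.
from datetime import datetime, timedelta

start, stop = '20180101T000000', '20180103T000000'
date_start = (datetime.strptime(start, '%Y%m%dT%H%M%S') - timedelta(days=2)).strftime('%Y-%m-%d')
date_stop = (datetime.strptime(stop, '%Y%m%dT%H%M%S') + timedelta(days=2)).strftime('%Y-%m-%d')
print('{0}..{1}'.format(date_start, date_stop))  # 2017-12-30..2018-01-05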
def catch(self, osvtype='POE', start=None, stop=None):
    """
    check a server for files

    Parameters
    ----------
    osvtype: {'POE', 'RES'}
        the type of orbit files required
    start: str
        the date to start searching for files
    stop: str
        the date to stop searching for files

    Returns
    -------
    list
        the URLs of the remote OSV files
    """
    address, outdir = self._typeEvaluate(osvtype)

    # a dictionary for storing the url arguments
    query = {'page': 1}

    # the collection of files to be returned
    files = []

    # set the defined date or the date of the first existing OSV file otherwise
    if start is not None:
        date_start = datetime.strptime(start, '%Y%m%dT%H%M%S').strftime('%Y-%m-%d')
    else:
        date_start = '2014-08-22'
    # set the defined date or the current date otherwise
    if stop is not None:
        date_stop = datetime.strptime(stop, '%Y%m%dT%H%M%S').strftime('%Y-%m-%d')
    else:
        date_stop = time.strftime('%Y-%m-%d')

    # pattern for scanning urlopen response for links to OSV files
    pattern_url = 'http.*{}'.format(self.pattern)

    # append the time frame to the query dictionary
    query['validity_start'] = '{0}..{1}'.format(date_start, date_stop)

    print('searching for new {} files'.format(osvtype))

    # iterate through the url pages and look for files
    while True:
        # parse the url
        subaddress = urlQueryParser(address, query)
        # read the remote content
        try:
            response = urlopen(subaddress, context=self.sslcontext).read().decode('utf-8')
            print(subaddress)
        except IOError as e:
            raise RuntimeError(e)

        # list all osv files found on the page
        remotes = sorted(set(re.findall(pattern_url, response)))

        # do a more accurate filtering of the time stamps
        if start is not None:
            remotes = [x for x in remotes if self.date(x, 'stop') > start]

        # filter files already existing in the files collection
        selection = [x for x in remotes if x not in files]

        # stop the loop if no more files are found on the current url page
        if len(selection) == 0:
            break
        else:
            # append the found files to the collection and increment the url page
            files += selection
            query['page'] += 1

    # in case the type 'RES' is selected then only return those files covering
    # a time period not covered by any POE file
    if osvtype == 'RES':
        files = [x for x in files
                 if self.date(x, 'stop') > self.maxdate('POE', 'stop')]
    return files
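# A self-contained illustration of the regex scan used above to extract OSV download links
# from an HTML listing page; the sample HTML and the file name pattern are made up for the
# example and only approximate what self.pattern would contain.
import re

pattern = r'S1A_OPER_AUX_POEORB.*\.EOF'
pattern_url = 'http.*{}'.format(pattern)
html = ('<a href="http://example.com/S1A_OPER_AUX_POEORB_OPOD_20180102T120000_'
        'V20171231T225942_20180102T005942.EOF">download</a>')
print(re.findall(pattern_url, html))
# ['http://example.com/S1A_OPER_AUX_POEORB_OPOD_20180102T120000_V20171231T225942_20180102T005942.EOF']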