def _get_url_for_timerange(self, timerange, **kwargs):
    """
    Returns a URL to the GOES data for the specified date.

    Parameters
    ----------
    timerange : `~sunpy.time.TimeRange`
        The time range you want the files for.

    Returns
    -------
    `list`
        The URL(s) for the corresponding timerange.
    """
    timerange = TimeRange(timerange.start.strftime('%Y-%m-%d'), timerange.end)
    if timerange.end < parse_time("1999/01/15"):
        goes_file = "%Y/go{satellitenumber:02d}%y%m%d.fits"
    elif timerange.start < parse_time("1999/01/15") and timerange.end >= parse_time("1999/01/15"):
        return self._get_overlap_urls(timerange)
    else:
        goes_file = "%Y/go{satellitenumber}%Y%m%d.fits"
    goes_pattern = f"https://umbra.nascom.nasa.gov/goes/fits/{goes_file}"
    satellitenumber = kwargs.get("satellitenumber",
                                 self._get_goes_sat_num(timerange.start))
    goes_files = Scraper(goes_pattern, satellitenumber=satellitenumber)
    return goes_files.filelist(timerange)

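# A hypothetical invocation of the method above; `client` stands in for an
# instance of the GOES client class and satellite number 7 is an arbitrary
# example, both assumptions rather than part of the source.
from sunpy.time import TimeRange

tr = TimeRange('1995/06/01', '1995/06/02')
urls = client._get_url_for_timerange(tr, satellitenumber=7)
# The range ends before 1999/01/15, so the two-digit-year pattern
# "%Y/go{satellitenumber:02d}%y%m%d.fits" is the one scraped.
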
def testURL_patternMilliseconds():
    s = Scraper('fd_%Y%m%d_%H%M%S_%e.fts')
    # NOTE: the assertion below sometimes fails at random; the cause is not
    # understood, though comparing with `== True` seems to fail a bit less.
    assert s._URL_followsPattern('fd_20130410_231211_119.fts')
    assert not s._URL_followsPattern('fd_20130410_231211.fts.gz')
    assert not s._URL_followsPattern('fd_20130410_ar_231211.fts.gz')

def test_filelist_url_missing_directory():
    # Asserts solution to ticket #2684.
    # Attempting to access data for the year 1960 results in a 404,
    # so no files are returned.
    pattern = 'http://lasp.colorado.edu/eve/data_access/evewebdataproducts/level2/%Y/%j/'
    s = Scraper(pattern)
    timerange = TimeRange('1960/01/01 00:00:00', '1960/01/02 00:00:00')
    assert len(s.filelist(timerange)) == 0

def _get_url_for_timerange(self, timerange, **kwargs):
    """
    Returns the URL for Fermi/GBM data for the given date.

    Parameters
    ----------
    timerange : `sunpy.time.TimeRange`
        The time range for which to download the data.

    Returns
    -------
    `str`:
        The URL(s) for the time of interest.
    """
    # Check for the detector keyword; if not given, default to detector 5.
    if 'detector' in kwargs:
        det = _check_detector(kwargs['detector'])
    else:
        det = 'n5'
    # Check for the resolution keyword - either CSPEC or CTIME;
    # the default type is CSPEC.
    if 'resolution' in kwargs:
        data_type = _check_type(kwargs['resolution'])
    else:
        data_type = 'cspec'
    gbm_pattern = ('https://heasarc.gsfc.nasa.gov/FTP/fermi/data/gbm/daily/'
                   '%Y/%m/%d/current/glg_{data_type}_{det}_%y%m%d_v00.pha')
    gbm_files = Scraper(gbm_pattern, data_type=data_type, det=det)
    urls = gbm_files.filelist(timerange)
    return urls

def testDirectoryRangeFalse():
    s = Scraper('%Y%m%d/%Y%m%d_%H.fit.gz')
    directory_list = ['20091230/', '20091231/', '20100101/',
                      '20090102/', '20090103/']
    timerange = TimeRange('2009/12/30', '2010/01/03')
    assert s.range(timerange) != directory_list

def search(self, *args, **kwargs):
    """
    Query this client for a list of results.

    Parameters
    ----------
    \\*args: `tuple`
        `sunpy.net.attrs` objects representing the query.
    \\*\\*kwargs: `dict`
        Any extra keywords to refine the search.

    Returns
    -------
    A `QueryResponse` instance containing the query result.
    """
    baseurl, pattern, matchdict = self.pre_search_hook(*args, **kwargs)
    scraper = Scraper(baseurl, regex=True)
    tr = TimeRange(matchdict['Start Time'], matchdict['End Time'])
    filesmeta = scraper._extract_files_meta(tr, extractor=pattern,
                                            matcher=matchdict)
    filesmeta = sorted(filesmeta, key=lambda k: k['url'])
    metalist = []
    for i in filesmeta:
        rowdict = self.post_search_hook(i, matchdict)
        metalist.append(rowdict)
    return QueryResponse(metalist, client=self)

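# Hypothetical usage sketch of the generic search flow above: ExampleClient
# stands in for a concrete GenericClient subclass and is an assumption, not
# a real sunpy class.
from sunpy.net import attrs as a

client = ExampleClient()
results = client.search(a.Time('2020-01-01', '2020-01-02'))
# `results` is a QueryResponse; each row was built by post_search_hook from
# the metadata that Scraper._extract_files_meta pulled out of matching URLs.
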
def test_regex_data():
    prefix = r'https://gong2.nso.edu/oQR/zqs/'
    pattern = prefix + r'%Y%m/mrzqs%y%m%d/mrzqs%y%m%dt%H%Mc(\d){4}_(\d){3}\.fits.gz'
    s = Scraper(pattern, regex=True)
    timerange = TimeRange('2020-01-05', '2020-01-06T16:00:00')
    assert s._URL_followsPattern(prefix + '202001/mrzqs200106/mrzqs200106t1514c2226_297.fits.gz')
    assert len(s.filelist(timerange)) == 37

def _get_time_for_url(self, urls):
    eve = Scraper(BASEURL)
    times = list()
    for url in urls:
        t0 = eve._extractDateURL(url)
        # Each file covers a full day, so a one-day range is hard-coded.
        times.append(TimeRange(t0, t0 + datetime.timedelta(days=1)))
    return times

def testDirectoryRange():
    s = Scraper('%Y/%m/%d/%Y%m%d_%H.fit.gz')
    directory_list = ['2009/12/30/', '2009/12/31/', '2010/01/01/',
                      '2010/01/02/', '2010/01/03/']
    timerange = TimeRange('2009-12-30', '2010-01-03')
    assert s.range(timerange) == directory_list

def testFilesRange_sameDirectory_months_remote():
    pattern = ('http://www.srl.caltech.edu/{spacecraft}/DATA/{instrument}/'
               'Ahead/1minute/AeH%y%b.1m')
    s = Scraper(pattern, spacecraft='STEREO', instrument='HET')
    startdate = parse_time((2007, 8, 1))
    enddate = parse_time((2007, 9, 10))
    timerange = TimeRange(startdate, enddate)
    assert len(s.filelist(timerange)) == 2

def test_ftp():
    pattern = 'ftp://solar-pub.nao.ac.jp/pub/nsro/norh/data/tcx/%Y/%m/tca%y%m%d'
    s = Scraper(pattern)
    timerange = TimeRange('2016/5/18 15:28:00', '2016/5/20 16:30:50')
    urls = s.filelist(timerange)
    assert urls[0] == ('ftp://solar-pub.nao.ac.jp'
                       '/pub/nsro/norh/data/tcx/2016/05/tca160519')
    assert len(urls) == 2

def testDirectoryRegex():
    # Test for Windows, where '\' is a path separator and not part of the regex.
    s = Scraper('scheme://a.url.with/a/few/forward/slashes/andbacklash\\inthename.ext',
                regex=True)
    timerange = TimeRange('2019-02-01', '2019-02-03')
    directory = s.range(timerange)
    assert directory == ['scheme://a.url.with/a/few/forward/slashes/']

def _get_time_for_url(self, urls):
    # Extract the three-letter frequency label (e.g. 'tca') from the first URL.
    freq = urls[0].split('/')[-1][0:3]
    crawler = Scraper(BASEURL, freq=freq)
    times = list()
    for url in urls:
        t0 = crawler._extractDateURL(url)
        # Each file covers a full day, so a one-day range is hard-coded.
        times.append(TimeRange(t0, t0 + TimeDelta(1*u.day)))
    return times

def testDirectoryRange_Month():
    s = Scraper('%Y%m/%d/%j_%H.txt')
    startdate = parse_time((2008, 2, 20, 10))
    enddate = parse_time((2008, 3, 2, 5))
    timerange = TimeRange(startdate, enddate)
    assert len(s.range(timerange)) == 12
    startdate = parse_time((2009, 2, 20, 10))
    enddate = parse_time((2009, 3, 2, 5))
    timerange = TimeRange(startdate, enddate)
    assert len(s.range(timerange)) == 11

def test_filelist_relative_hrefs():
    # The page opened by the scraper for the pattern below contains some links
    # which don't have hrefs.
    pattern = 'http://www.bbso.njit.edu/pub/archive/%Y/%m/%d/bbso_halph_fr_%Y%m%d_%H%M%S.fts'
    s = Scraper(pattern)
    timerange = TimeRange('2016/5/18 15:28:00', '2016/5/18 16:30:00')
    assert s.domain == 'http://www.bbso.njit.edu/'
    # hrefs here are relative to the domain, not to the directory they appear in;
    # this checks that `Scraper.filelist` returns file URLs relative to the domain.
    fileurls = s.filelist(timerange)
    assert fileurls[1] == s.domain + 'pub/archive/2016/05/18/bbso_halph_fr_20160518_160033.fts'

def _get_url_for_timerange(self, timerange, **kwargs):
    """
    Returns a list of URLs corresponding to the value of the input timerange.

    Parameters
    ----------
    timerange : `sunpy.time.TimeRange`
        Time range for which data is to be downloaded.

    Returns
    -------
    urls : list
        List of URLs corresponding to the requested time range.
    """
    # We allow queries with no Wavelength but error here so that the query
    # does not get passed to VSO and spit out garbage.
    if 'wavelength' not in kwargs.keys() or not kwargs['wavelength']:
        raise ValueError(
            "Queries to NORH should specify either 17GHz or 34GHz as a Wavelength, "
            "see https://solar.nro.nao.ac.jp/norh/doc/manuale/node65.html")
    else:
        wavelength = kwargs['wavelength']

    # If wavelength is a single value GenericClient will have made it a
    # Quantity in the kwargs.
    if not isinstance(wavelength, u.Quantity):
        raise ValueError(
            "Wavelength to NORH must be one value not {}.".format(wavelength))

    wavelength = wavelength.to(u.GHz, equivalencies=u.spectral())
    if wavelength == 34 * u.GHz:
        freq = 'tcz'
    elif wavelength == 17 * u.GHz:
        freq = 'tca'
    else:
        raise ValueError(
            "NORH data can be downloaded for 17GHz or 34GHz, "
            "see https://solar.nro.nao.ac.jp/norh/doc/manuale/node65.html")

    # If the start of the time range is not at 00:00, round it down to the
    # start of the day so that the daily archive file covering the requested
    # range is included.
    if timerange.start.strftime('%M-%S') != '00-00':
        timerange = TimeRange(timerange.start.strftime('%Y-%m-%d'),
                              timerange.end)
    norh = Scraper(BASEURL, freq=freq)
    # TODO: warn the user that some files may not have been listed, e.g.
    # tca160504_224657 on ftp://solar-pub.nao.ac.jp/pub/nsro/norh/data/tcx/2016/05/
    # as it doesn't follow the pattern.
    return norh.filelist(timerange)

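# A minimal sketch of why `u.spectral()` is used above: it lets the
# Wavelength attr arrive in any spectral unit before conversion to GHz.
import astropy.units as u

freq = (1.76 * u.cm).to(u.GHz, equivalencies=u.spectral())
# freq is roughly 17.03 GHz; note that the equality checks above are exact,
# so only values that convert to exactly 17 GHz or 34 GHz are accepted.
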
def testExtractDates_usingPattern():
    # Standard pattern
    s = Scraper('data/%Y/%m/%d/fits/swap/swap_00174_fd_%Y%m%d_%H%M%S.fts.gz')
    testURL = 'data/2014/05/14/fits/swap/swap_00174_fd_20140514_200135.fts.gz'
    timeURL = parse_time((2014, 5, 14, 20, 1, 35))
    assert s._extractDateURL(testURL) == timeURL
    # Not-full repeated pattern
    s = Scraper('data/%Y/fits/swap/swap_00174_fd_%Y%m%d_%H%M%S.fts.gz')
    testURL = 'data/2014/fits/swap/swap_00174_fd_20140514_200135.fts.gz'
    timeURL = parse_time((2014, 5, 14, 20, 1, 35))
    assert s._extractDateURL(testURL) == timeURL

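# A minimal sketch (an assumed illustration, not Scraper's actual
# implementation) of the idea tested above: translate strptime directives
# into regex groups, locate the date substring in the URL, and parse it
# back with `datetime.strptime`.
import re
from datetime import datetime

DIRECTIVES = {'%Y': r'\d{4}', '%m': r'\d{2}', '%d': r'\d{2}',
              '%H': r'\d{2}', '%M': r'\d{2}', '%S': r'\d{2}'}

def extract_date(pattern, url):
    regex = pattern
    for directive, digits in DIRECTIVES.items():
        regex = regex.replace(directive, digits)
    matched = re.search(regex, url).group(0)
    return datetime.strptime(matched, pattern)

assert extract_date('swap_00174_fd_%Y%m%d_%H%M%S',
                    'data/2014/05/14/fits/swap/swap_00174_fd_20140514_200135.fts.gz'
                    ) == datetime(2014, 5, 14, 20, 1, 35)
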
def testFilesRange_sameDirectory_local():
    # Fails due to an IsADirectoryError, wrapped in a URLError, after `requests`
    # tries to open a directory as a binary file.
    s = Scraper('/'.join(['file:/', rootdir, 'EIT', 'efz%Y%m%d.%H%M%S_s.fits']))
    startdate = parse_time((2004, 3, 1, 4, 0))
    enddate = parse_time((2004, 3, 1, 6, 30))
    assert len(s.filelist(TimeRange(startdate, enddate))) == 3
    startdate = parse_time((2010, 1, 10, 20, 30))
    enddate = parse_time((2010, 1, 20, 20, 30))
    assert len(s.filelist(TimeRange(startdate, enddate))) == 0

def get_files(wave):
    sswbrowser_pattern = (
        'https://hesperia.gsfc.nasa.gov/sdo/aia/2014/06/11/20140611_0528-0547/'
        'ssw_cutout_%Y%m%d_%H%M%S_aia_{wave}_.fts')
    ssw = Scraper(sswbrowser_pattern, wave=wave)
    timerange = TimeRange('2014-06-11 05:30:00', '2014-06-11 05:36:00')
    files = ssw.filelist(timerange)
    return files

def testFilesRange_sameDirectory_remote():
    pattern = ('http://solarmonitor.org/data/%Y/%m/%d/'
               'fits/{instrument}/'
               '{instrument}_00174_fd_%Y%m%d_%H%M%S.fts.gz')
    s = Scraper(pattern, instrument='swap')
    startdate = parse_time((2014, 5, 14, 0, 0))
    enddate = parse_time((2014, 5, 14, 6, 30))
    timerange = TimeRange(startdate, enddate)
    assert len(s.filelist(timerange)) == 2
    startdate = parse_time((2014, 5, 14, 21, 0))
    enddate = parse_time((2014, 5, 14, 23, 30))
    timerange = TimeRange(startdate, enddate)
    assert len(s.filelist(timerange)) == 0

def testExtractDates_notSeparators_andSimilar():
    s = Scraper('data/%Y/Jun%b%d_%H%M%S')
    testURL = 'data/2014/JunJun14_200135'
    timeURL = parse_time((2014, 6, 14, 20, 1, 35))
    assert s._extractDateURL(testURL) == timeURL
    testURL = 'data/2014/JunMay14_200135'
    timeURL = parse_time((2014, 5, 14, 20, 1, 35))
    assert s._extractDateURL(testURL) == timeURL
    # And testing with the month afterwards:
    s = Scraper('data/%Y/%dJun%b_%H%M%S')
    testURL = 'data/2014/14JunJun_200135'
    timeURL = parse_time((2014, 6, 14, 20, 1, 35))
    assert s._extractDateURL(testURL) == timeURL

def _get_metalist_fn(self, matchdict, baseurl, pattern):
    """
    Helper that builds the list of OrderedDicts, one per matched file.
    """
    metalist = []
    scraper = Scraper(baseurl, regex=True)
    tr = TimeRange(matchdict["Start Time"], matchdict["End Time"])
    filemeta = scraper._extract_files_meta(tr, extractor=pattern,
                                           matcher=matchdict)
    for i in filemeta:
        rowdict = self.post_search_hook(i, matchdict)
        metalist.append(rowdict)
    return metalist

def search(self, *args, **kwargs):
    baseurl, pattern, matchdict = self.pre_search_hook(*args, **kwargs)
    metalist = []
    for obs in matchdict['Observatory']:
        scraper = Scraper(baseurl.format(obs=self.observatory_map[obs.title()]),
                          regex=True)
        tr = TimeRange(matchdict['Start Time'], matchdict['End Time'])
        filesmeta = scraper._extract_files_meta(tr, extractor=pattern,
                                                matcher=matchdict)
        for i in filesmeta:
            rowdict = self.post_search_hook(i, matchdict)
            metalist.append(rowdict)
    return QueryResponse(metalist, client=self)

def search(self, *args, **kwargs):
    supported_waves = [94, 131, 171, 195, 284, 304] * u.Angstrom
    all_waves = []
    matchdict = self._get_match_dict(*args, **kwargs)
    req_wave = matchdict.get('Wavelength', None)
    if req_wave is not None:
        wmin = req_wave.min.to(u.Angstrom, equivalencies=u.spectral())
        wmax = req_wave.max.to(u.Angstrom, equivalencies=u.spectral())
        req_wave = a.Wavelength(wmin, wmax)
        for wave in supported_waves:
            if wave in req_wave:
                all_waves.append(int(wave.value))
    else:
        all_waves = [int(i.value) for i in supported_waves]
    all_satnos = matchdict.get('SatelliteNumber')
    all_levels = matchdict.get('Level')
    metalist = []
    # Iterate over every combination of the possible attr values.
    for satno in all_satnos:
        for level in all_levels:
            for wave in all_waves:
                formdict = {'wave': wave, 'SatelliteNumber': satno}
                if str(level) == '1b':
                    formdict['elem'] = 'fe'
                    if wave == 304:
                        formdict['elem'] = 'he'
                    baseurl = self.baseurl1b
                    pattern = self.pattern1b
                elif str(level) == '2':
                    baseurl = self.baseurl2
                    pattern = self.pattern2
                else:
                    raise ValueError(f"Level {level} is not supported.")
                # Format baseurl using Level, SatelliteNumber and Wavelength.
                urlpattern = baseurl.format(**formdict)
                scraper = Scraper(urlpattern)
                tr = TimeRange(matchdict['Start Time'], matchdict['End Time'])
                filesmeta = scraper._extract_files_meta(tr, extractor=pattern)
                for i in filesmeta:
                    rowdict = self.post_search_hook(i, matchdict)
                    metalist.append(rowdict)
    return QueryResponse(metalist, client=self)

def testURL_patternMillisecondsZeroPadded():
    # Asserts solution to ticket #1954.
    # Milliseconds must be zero-padded in order to match URL lengths.
    now_mock = Mock(return_value=datetime.datetime(2019, 4, 19, 0, 0, 0, 4009))
    with patch('sunpy.util.scraper.datetime', now=now_mock):
        s = Scraper('fd_%Y%m%d_%H%M%S_%e.fts')
    now_mock.assert_called_once()
    assert s.now == 'fd_20190419_000000_004.fts'

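# %e here is a Scraper-specific directive for milliseconds (it is not a
# standard Python strftime code). A minimal sketch, assumed equivalent to
# the padding behaviour the test above asserts:
import datetime

now = datetime.datetime(2019, 4, 19, 0, 0, 0, 4009)
milliseconds = f'{now.microsecond // 1000:03d}'  # '004', zero-padded to 3 digits
filename = now.strftime('fd_%Y%m%d_%H%M%S_') + milliseconds + '.fts'
assert filename == 'fd_20190419_000000_004.fts'
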
def _get_url_for_timerange(self, timerange, **kwargs):
    """
    Return URL(s) for corresponding timerange.

    Parameters
    ----------
    timerange : `~sunpy.time.TimeRange`
        The time range you want the files for.

    Returns
    -------
    `list`
        The URL(s) for the corresponding timerange.
    """
    lyra_pattern = ('http://proba2.oma.be/lyra/data/bsd/%Y/%m/%d/'
                    'lyra_%Y%m%d-000000_lev{level}_std.fits')
    lyra_files = Scraper(lyra_pattern, level=kwargs.get('level', 2))
    urls = lyra_files.filelist(timerange)
    return urls

def _get_url_for_timerange(self, timerange, **kwargs):
    """
    Return a list of URLs corresponding to the value of the input timerange.

    Parameters
    ----------
    timerange : `sunpy.time.TimeRange`
        Time range for which data is to be downloaded.

    Returns
    -------
    urls : list
        List of URLs corresponding to the requested time range.
    """
    # If the start of the time range is not at 00:00, round it down to the
    # start of the day so that the daily archive file covering the requested
    # range is included.
    if timerange.start.time() != datetime.time(0, 0):
        timerange = TimeRange('{:%Y-%m-%d}'.format(timerange.start),
                              timerange.end)
    eve = Scraper(BASEURL)
    return eve.filelist(timerange)

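# A minimal sketch of the truncation above: widening the range back to
# midnight ensures the daily file covering the first (partial) day is still
# matched. The dates are arbitrary example values.
from sunpy.time import TimeRange

tr = TimeRange('2016/05/18 15:28:00', '2016/05/20 16:30:50')
tr = TimeRange(tr.start.strftime('%Y-%m-%d'), tr.end)
# tr.start is now 2016-05-18 00:00:00
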
def search(self, *args, **kwargs):
    """
    Query this client for a list of results.

    Parameters
    ----------
    *args: `tuple`
        `sunpy.net.attrs` objects representing the query.
    **kwargs: `dict`
        Any extra keywords to refine the search.

    Returns
    -------
    A `QueryResponse` instance containing the query result.
    """
    matchdict = self._get_match_dict(*args, **kwargs)
    req_wave = matchdict.get('Wavelength', None)
    receivers = RECEIVER_FREQUENCIES.keys()
    if req_wave is not None:
        receivers = self._check_wavelengths(req_wave)

    metalist = []
    start_year = matchdict['Start Time'].datetime.year
    end_year = matchdict['End Time'].datetime.year
    tr = TimeRange(matchdict['Start Time'], matchdict['End Time'])
    for receiver in receivers:
        for year in range(start_year, end_year + 1):
            urlpattern = self.baseurl.format(Wavelength=receiver, year=year,
                                             ext=RECEIVER_EXT[receiver])
            scraper = Scraper(urlpattern, regex=True)
            filesmeta = scraper._extract_files_meta(tr, extractor=self.pattern)
            for i in filesmeta:
                rowdict = self.post_search_hook(i, matchdict)
                metalist.append(rowdict)
    return QueryResponse(metalist, client=self)

def test_extract_files_meta():
    baseurl0 = 'ftp://solar-pub.nao.ac.jp/pub/nsro/norh/data/tcx/%Y/%m/{freq}%y%m%d'
    extractpattern0 = '{}/tcx/{year:4d}/{month:2d}/{wavelength}{:4d}{day:2d}'
    s0 = Scraper(baseurl0, freq='tca')
    timerange0 = TimeRange('2020/1/1', '2020/1/2')
    metalist0 = s0._extract_files_meta(timerange0, extractpattern0)
    assert metalist0[0]['wavelength'] == 'tca'
    assert metalist0[1]['day'] == 2

    prefix = r'https://gong2.nso.edu/oQR/zqs/'
    baseurl1 = prefix + r'%Y%m/mrzqs%y%m%d/mrzqs%y%m%dt%H%Mc(\d){4}_(\d){3}\.fits.gz'
    extractpattern1 = '{}/zqs/{:6d}/mrzqs{:6d}/mrzqs{:6d}t{:4d}c{CAR_ROT:4d}_{:3d}.fits.gz'
    s1 = Scraper(baseurl1, regex=True)
    timerange1 = TimeRange('2020-01-05', '2020-01-05T16:00:00')
    metalist1 = s1._extract_files_meta(timerange1, extractpattern1)
    urls = s1.filelist(timerange1)
    assert metalist1[3]['CAR_ROT'] == 2226
    assert metalist1[-1]['url'] == urls[-1]

def testDirectoryRange_single():
    s = Scraper('%Y%m%d/%H_%M.csv')
    startdate = parse_time((2010, 10, 10, 5, 0))
    enddate = parse_time((2010, 10, 10, 7, 0))
    timerange = TimeRange(startdate, enddate)
    assert len(s.range(timerange)) == 1

def testDirectoryRangeHours():
    s = Scraper('%Y%m%d_%H/%H%M.csv')
    timerange = TimeRange('2009-12-31T23:40:00', '2010-01-01T01:15:00')
    assert len(s.range(timerange)) == 3  # 3 directories (1 per hour)

def testNoDateDirectory():
    s = Scraper('mySpacecraft/myInstrument/xMinutes/aaa%y%b.ext')
    directory_list = ['mySpacecraft/myInstrument/xMinutes/']
    timerange = TimeRange('2009/11/20', '2010/01/03')
    assert s.range(timerange) == directory_list

def testURL_pattern():
    s = Scraper('fd_%Y%m%d_%H%M%S.fts')
    assert s._URL_followsPattern('fd_20130410_231211.fts')
    assert not s._URL_followsPattern('fd_20130410_231211.fts.gz')
    assert not s._URL_followsPattern('fd_20130410_ar_231211.fts.gz')

def testDirectoryDatePatternFalse():
    s = Scraper('%Y/%m/%d/%Y%m%d_%H%M%S_59.fit.gz')
    testpath = '2013/03/05/20140305_013000_59.fit.gz'
    d = parse_time((2014, 3, 5, 1, 30))
    assert not s.matches(testpath, d)

def testNoDirectory():
    s = Scraper('files/%Y%m%d_%H%M.dat')
    startdate = parse_time((2010, 1, 10, 20, 30))
    enddate = parse_time((2010, 1, 20, 20, 30))
    timerange = TimeRange(startdate, enddate)
    assert len(s.range(timerange)) == 1

def testDirectoryObsPattern():
    s = Scraper('%y%m%d/{observatory}_%Y%m%d.fits', observatory='SDO')
    testpath = '140305/SDO_20140305.fits'
    d = parse_time((2014, 3, 5))
    assert s.matches(testpath, d)

def testExtractDates_notSeparators():
    s = Scraper('data/%Y/%m/swap%m%d_%H%M%S')
    testURL = 'data/2014/05/swap0514_200135'
    timeURL = parse_time((2014, 5, 14, 20, 1, 35))
    assert s._extractDateURL(testURL) == timeURL