def testFilesRange_sameDirectory_local():
    # Scrape local EIT sample files whose names all live in one directory.
    pattern = '/'.join(['file:/', str(rootdir), 'EIT', 'efz%Y%m%d.%H%M%S_s.fits'])
    scraper = Scraper(pattern)
    # Three sample files fall inside this morning window of 2004-03-01.
    in_range = TimeRange(parse_time((2004, 3, 1, 4, 0)),
                         parse_time((2004, 3, 1, 6, 30)))
    assert len(scraper.filelist(in_range)) == 3
    # No sample data exists for January 2010, so the listing is empty.
    out_of_range = TimeRange(parse_time((2010, 1, 10, 20, 30)),
                             parse_time((2010, 1, 20, 20, 30)))
    assert len(scraper.filelist(out_of_range)) == 0
def testFilesRange_sameDirectory_remote():
    # SWAP images on solarmonitor.org; `instrument` fills the URL template twice.
    pattern = ('http://solarmonitor.org/data/%Y/%m/%d/'
               'fits/{instrument}/'
               '{instrument}_00174_fd_%Y%m%d_%H%M%S.fts.gz')
    scraper = Scraper(pattern, instrument='swap')
    # Two files exist in the early-morning window of 2014-05-14 ...
    morning = TimeRange(parse_time((2014, 5, 14, 0, 0)),
                        parse_time((2014, 5, 14, 6, 30)))
    assert len(scraper.filelist(morning)) == 2
    # ... and none in the late-evening window of the same day.
    evening = TimeRange(parse_time((2014, 5, 14, 21, 0)),
                        parse_time((2014, 5, 14, 23, 30)))
    assert len(scraper.filelist(evening)) == 0
def test_regex_data():
    """Regex-mode Scraper against the GONG zqs synoptic-map archive.

    Fix: the trailing extension in the pattern was written ``\\.fits.gz``,
    leaving the second dot unescaped so it matched *any* character
    (e.g. ``fitsXgz``); it is now escaped as ``\\.fits\\.gz``.
    """
    prefix = r'https://gong2.nso.edu/oQR/zqs/'
    pattern = prefix + r'%Y%m/mrzqs%y%m%d/mrzqs%y%m%dt%H%Mc(\d){4}_(\d){3}\.fits\.gz'
    s = Scraper(pattern, regex=True)
    timerange = TimeRange('2020-01-05', '2020-01-06T16:00:00')
    # A known real filename must satisfy the compiled pattern.
    assert s._URL_followsPattern(prefix + '202001/mrzqs200106/mrzqs200106t1514c2226_297.fits.gz')
    # 37 files exist on the remote server within this window.
    assert len(s.filelist(timerange)) == 37
def test_filelist_url_missing_directory():
    # Asserts solution to ticket #2684.
    # Attempting to access data for the year 1960 results in a 404, so no files are returned.
    pattern = 'http://lasp.colorado.edu/eve/data_access/evewebdataproducts/level2/%Y/%j/'
    scraper = Scraper(pattern)
    window = TimeRange('1960/01/01 00:00:00', '1960/01/02 00:00:00')
    assert len(scraper.filelist(window)) == 0
def test_ftp():
    # Exercise the FTP branch of the scraper against the NoRH daily archive.
    pattern = 'ftp://solar-pub.nao.ac.jp/pub/nsro/norh/data/tcx/%Y/%m/tca%y%m%d'
    scraper = Scraper(pattern)
    window = TimeRange('2016/5/18 15:28:00', '2016/5/20 16:30:50')
    found = scraper.filelist(window)
    # Two daily files fall in the window; spot-check the first URL.
    assert found[0] == ('ftp://solar-pub.nao.ac.jp'
                        '/pub/nsro/norh/data/tcx/2016/05/tca160519')
    assert len(found) == 2
def testFilesRange_sameDirectory_months_remote():
    # Monthly-cadence STEREO/HET files; keyword args fill the URL template.
    pattern = ('http://www.srl.caltech.edu/{spacecraft}/DATA/{instrument}/'
               'Ahead/1minute/AeH%y%b.1m')
    scraper = Scraper(pattern, spacecraft='STEREO', instrument='HET')
    window = TimeRange(parse_time((2007, 8, 1)), parse_time((2007, 9, 10)))
    # August and September 2007 each contribute one monthly file.
    assert len(scraper.filelist(window)) == 2
def test_filelist_relative_hrefs():
    # the url opened by the scraper from below pattern contains some links which don't have hrefs
    pattern = 'http://www.bbso.njit.edu/pub/archive/%Y/%m/%d/bbso_halph_fr_%Y%m%d_%H%M%S.fts'
    scraper = Scraper(pattern)
    window = TimeRange('2016/5/18 15:28:00', '2016/5/18 16:30:00')
    assert scraper.domain == 'http://www.bbso.njit.edu/'
    # hrefs are relative to domain here, not to the directory they are present in
    # this checks that `scraper.filelist` returns fileurls relative to the domain
    file_urls = scraper.filelist(window)
    assert file_urls[1] == scraper.domain + 'pub/archive/2016/05/18/bbso_halph_fr_20160518_160033.fts'
def test_extract_files_meta():
    """``Scraper._extract_files_meta`` parses metadata out of matched URLs.

    Fix: the GONG regex ended ``\\.fits.gz`` with the second dot unescaped
    (matching any character); it is now escaped as ``\\.fits\\.gz``.  The
    `parse`-style extract patterns treat dots literally and are unchanged.
    """
    # NoRH: restrict the wavelength token ('tca'/'tcz') via the `matcher` arg.
    baseurl0 = r'ftp://solar-pub.nao.ac.jp/pub/nsro/norh/data/tcx/%Y/%m/(\w){3}%y%m%d'
    extractpattern0 = '{}/tcx/{year:4d}/{month:2d}/{wave}{:4d}{day:2d}'
    s0 = Scraper(baseurl0, regex=True)
    timerange0 = TimeRange('2020/1/1 4:00', '2020/1/2')
    matchdict = {'wave': ['tca', 'tcz']}
    metalist0 = s0._extract_files_meta(timerange0, extractpattern0, matcher=matchdict)
    assert metalist0[0]['wave'] == 'tca'
    assert metalist0[3]['wave'] == 'tcz'
    assert metalist0[1]['day'] == 2

    # GONG zqs: extract the Carrington rotation number and check that the
    # metadata entries track the same URLs `filelist` returns.
    prefix = r'https://gong2.nso.edu/oQR/zqs/'
    baseurl1 = prefix + r'%Y%m/mrzqs%y%m%d/mrzqs%y%m%dt%H%Mc(\d){4}_(\d){3}\.fits\.gz'
    extractpattern1 = ('{}/zqs/{year:4d}{month:2d}/mrzqs{:4d}{day:2d}/mrzqs{:6d}t'
                       '{hour:2d}{minute:2d}c{CAR_ROT:4d}_{:3d}.fits.gz')
    s1 = Scraper(baseurl1, regex=True)
    timerange1 = TimeRange('2020-01-05', '2020-01-05T16:00:00')
    metalist1 = s1._extract_files_meta(timerange1, extractpattern1)
    urls = s1.filelist(timerange1)
    assert metalist1[3]['CAR_ROT'] == 2226
    assert metalist1[-1]['url'] == urls[-1]