def run(self):
    try:
        from parfive import Downloader
    except ImportError:
        log.error(
            "The parfive package needs to be installed to download files with gammapy download"
        )
        return
    if self.listfiles:
        log.info("Content will be downloaded in {}".format(self.outfolder))

    dl = Downloader(progress=self.progress)

    for rec in self.listfiles:
        url = self.listfiles[rec]["url"]
        path = self.outfolder / self.listfiles[rec]["path"]
        md5 = ""
        if "hashmd5" in self.listfiles[rec]:
            md5 = self.listfiles[rec]["hashmd5"]
        retrieve = True
        if md5 and path.exists():
            md5local = hashlib.md5(path.read_bytes()).hexdigest()
            if md5local == md5:
                retrieve = False
        if retrieve:
            dl.enqueue_file(url, path=str(path.parent))

    try:
        dl.download()
    except Exception as ex:
        log.error("Failed to download files.")
        log.error(ex)
def run(self): try: from parfive import Downloader except ImportError: log.error("To use gammapy download, install the parfive package!") return if self.listfiles: log.info(f"Content will be downloaded in {self.outfolder}") dl = Downloader(progress=self.progress, file_progress=False) for rec in self.listfiles: url = self.listfiles[rec]["url"] path = self.outfolder / self.listfiles[rec]["path"] md5 = "" if "hashmd5" in self.listfiles[rec]: md5 = self.listfiles[rec]["hashmd5"] retrieve = True if md5 and path.exists(): md5local = hashlib.md5(path.read_bytes()).hexdigest() if md5local == md5: retrieve = False if retrieve: dl.enqueue_file(url, path=str(path.parent)) log.info(f"{dl.queued_downloads} files to download.") res = dl.download() log.info(f"{len(res)} files downloaded.") for err in res.errors: _, _, exception = err log.error(f"Error: {exception}")
def mas_helio():
    """
    Get some MAS heliospheric data files. These are taken from CR2210, which
    is used for PSP data comparisons in the documentation examples.
    """
    mas_helio_dir = download_dir / 'mas_helio'
    mas_helio_dir.mkdir(parents=True, exist_ok=True)
    base_url = 'http://www.predsci.com/data/runs/cr2210-medium/hmi_masp_mas_std_0201/helio/{var}002.hdf'

    # Create a downloader to queue the files to be downloaded
    dl = Downloader()

    vars = ['rho', 'vr', 'br']
    for var in vars:
        file = mas_helio_dir / f'{var}002.hdf'
        if file.exists():
            continue
        else:
            remote_file = base_url.format(var=var)
            dl.enqueue_file(remote_file, path=mas_helio_dir)

    # Download the files
    if dl.queued_downloads > 0:
        dl.download()
    return mas_helio_dir.resolve()
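# A possible way to use mas_helio(), assuming download_dir points at a writable
# cache directory as in the original module:
helio_dir = mas_helio()
for hdf_file in sorted(helio_dir.glob('*.hdf')):
    print(hdf_file.name)   # rho002.hdf, vr002.hdf, br002.hdf once downloaded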
def fetch(self, qres, path=None, overwrite=False, progress=True,
          downloader=None, wait=True):
    """
    Download a set of results.

    Parameters
    ----------
    qres : `~sunpy.net.dataretriever.QueryResponse`
        Results to download.
    path : `str` or `pathlib.Path`, optional
        Path to the download directory, or file template including the
        ``{file}`` string which will be replaced with the filename.
    overwrite : `bool` or `str`, optional
        Determine how to handle downloading if a file already exists with the
        same name. If `False` the file download will be skipped and the path
        returned to the existing file, if `True` the file will be downloaded
        and the existing file will be overwritten, if `'unique'` the filename
        will be modified to be unique.
    progress : `bool`, optional
        If `True` show a progress bar showing how many of the total files
        have been downloaded. If `False`, no progress bar will be shown.
    downloader : `parfive.Downloader`, optional
        The download manager to use.
    wait : `bool`, optional
        If `False` ``downloader.download()`` will not be called. Only has any
        effect if ``downloader`` is not `None`.

    Returns
    -------
    results : `parfive.Results`
    """
    if path is not None:
        path = Path(path)
    urls = [qrblock.url for qrblock in qres.blocks]
    filenames = [url.split('/')[-1] for url in urls]
    paths = self._get_full_filenames(qres, filenames, path)
    dl_set = True
    if not downloader:
        dl_set = False
        downloader = Downloader(progress=progress, overwrite=overwrite)
    for url, filename in zip(urls, paths):
        downloader.enqueue_file(url, filename=filename)
    if dl_set and not wait:
        return
    return downloader.download()
def fetch(self, qres, path=None, overwrite=False, progress=True,
          downloader=None, wait=True):
    """
    Download a set of results.

    Parameters
    ----------
    qres : `~sunpy.net.dataretriever.QueryResponse`
        Results to download.
    path : `str` or `pathlib.Path`, optional
        Path to the download directory, or file template including the
        ``{file}`` string which will be replaced with the filename.
    progress : `bool`, optional
        If `True` show a progress bar showing how many of the total files
        have been downloaded. If `False`, no progress bar will be shown.
    overwrite : `bool` or `str`, optional
        Determine how to handle downloading if a file already exists with the
        same name. If `False` the file download will be skipped and the path
        returned to the existing file, if `True` the file will be downloaded
        and the existing file will be overwritten, if `'unique'` the filename
        will be modified to be unique.
    downloader : `parfive.Downloader`, optional
        The download manager to use.
    wait : `bool`, optional
        If `False` ``downloader.download()`` will not be called. Only has any
        effect if ``downloader`` is not `None`.

    Returns
    -------
    results : `parfive.Results`
    """
    if path is not None:
        path = Path(path)
    urls = [qrblock.url for qrblock in qres]
    filenames = [url.split('/')[-1] for url in urls]
    paths = self._get_full_filenames(qres, filenames, path)
    dl_set = True
    if not downloader:
        dl_set = False
        downloader = Downloader(progress=progress, overwrite=overwrite)
    for url, filename in zip(urls, paths):
        downloader.enqueue_file(url, filename=filename)
    if dl_set and not wait:
        return
    return downloader.download()
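# A sketch of how the downloader/wait parameters of the fetch() methods above
# are meant to be combined: pass one shared Downloader to several fetch() calls
# with wait=False so files are only queued, then run a single batch download.
# `client`, `results_a` and `results_b` are hypothetical stand-ins for a real
# client instance and its query responses.
from parfive import Downloader

dl = Downloader(progress=True, overwrite=False)
client.fetch(results_a, path="./data", downloader=dl, wait=False)
client.fetch(results_b, path="./data", downloader=dl, wait=False)
files = dl.download()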
def download_files_parfive(list_of_files,
                           pathy="/Users/laurahayes/ionospheric_work/ionospheric-analysis/vlf_codes/vlf_bas_files/"):
    dl = Downloader()
    for f in list_of_files:
        filename = f.split('/')[-1]
        dl.enqueue_file(f, path=pathy)
    files = dl.download()
def download(self, url, path):
    downloader = Downloader()
    path = Path(path)
    filename = path.name
    directory = path.parent
    downloader.enqueue_file(url, directory, filename)
    try:
        downloader.download()
    except Exception as e:
        raise DownloaderError from e
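# A possible call to the wrapper above, catching the module's DownloaderError.
# The instance name `fetcher`, the URL and the local path are placeholders:
try:
    fetcher.download("https://example.com/archive/data.tar.gz", "/tmp/data.tar.gz")
except DownloaderError as exc:
    print(f"download failed: {exc.__cause__}")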
def test_multipart_with_error(multipartserver, tmp_path):
    multipartserver.callback = partial(error_on_nth_request, 3)
    dl = Downloader(progress=False)
    max_splits = 5
    dl.enqueue_file(multipartserver.url, path=tmp_path, max_splits=max_splits)
    files = dl.download()
    assert len(files) == 0
    assert len(files.errors) == 1
    assert isinstance(files.errors[0].exception, MultiPartDownloadError)

    expected_file = tmp_path / "testfile.txt"
    assert not expected_file.exists()
def test_multipart(multipartserver, tmp_path):
    dl = Downloader(progress=False)
    max_splits = 5
    dl.enqueue_file(multipartserver.url, path=tmp_path, max_splits=max_splits)
    files = dl.download()

    # Verify we transferred all the content
    with open(files[0], "rb") as fobj:
        assert fobj.read() == b"a" * 100

    # Assert that we made the expected number of requests
    assert len(multipartserver.requests) == max_splits + 1
    assert "HTTP_RANGE" not in multipartserver.requests[0]
    for split_req in multipartserver.requests[1:]:
        assert "HTTP_RANGE" in split_req
def main():
    args = parse_args(sys.argv[1:])
    downloader = Downloader(max_conn=args.max_conn,
                            file_progress=not args.no_file_progress,
                            overwrite=args.overwrite)

    for url in args.urls:
        downloader.enqueue_file(url, path=args.directory)

    results = downloader.download()
    for i in results:
        print(i)

    err_str = ''
    for err in results.errors:
        err_str += f'{err.url} \t {err.exception}\n'
    if err_str:
        sys.exit(err_str)
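# main() above is a command-line entry point; a minimal invocation would pass
# one or more URLs, with any option flags mapping onto the argparse attributes
# used above (max_conn, no_file_progress, overwrite, directory -- the exact
# flag spellings are an assumption here, not confirmed by the snippet):
#
#   parfive https://example.com/a.fits https://example.com/b.fits
#
if __name__ == "__main__":
    main()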
def getenvironment(self):
    try:
        from parfive import Downloader
    except ImportError:
        log.error("To use gammapy download, install the parfive package!")
        return
    dl = Downloader(progress=False, file_progress=False)
    filename_env = "gammapy-" + self.release + "-environment.yml"
    url_file_env = BASE_URL + "/install/" + filename_env
    filepath_env = str(self.outfolder / filename_env)
    dl.enqueue_file(url_file_env, path=filepath_env)
    try:
        log.info(f"Downloading {url_file_env}")
        Path(filepath_env).parent.mkdir(parents=True, exist_ok=True)
        dl.download()
    except Exception as ex:
        log.error(ex)
        exit()
def download_beacon(dtime):
    datestr = dtime.strftime('%Y%m%d')
    for time in ['000530', '001615', '005530', '010530', '011530', '043530']:
        url = ('https://stereo-ssc.nascom.nasa.gov/pub/beacon/ahead/secchi/img'
               f'/euvi/{datestr}/{datestr}_{time}_n7euA.fts')
        print(url)
        dl = Downloader()
        dl.enqueue_file(url, path=map_path(dtime).parent)
        files = dl.download()
        if len(files.errors):
            continue
        pathlib.Path(files[0]).replace(map_path(dtime))
        return

    raise RuntimeError(f'No EUVI beacon map available for {dtime}')
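# A possible call to download_beacon(), assuming map_path() resolves a datetime
# to the local file the beacon map should end up at, as in the original module;
# the date itself is a placeholder:
import datetime

download_beacon(datetime.datetime(2020, 1, 15))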
def fetch(self, query_results: QueryResponseTable, *, path: os.PathLike = None,
          downloader: parfive.Downloader, **kwargs):
    """
    Fetch asdf files describing the datasets.

    Parameters
    ----------
    query_results:
        Results to download.
    path : `str` or `pathlib.Path`, optional
        Path to the download directory
    downloader : `parfive.Downloader`
        The download manager to use.
    """
    # This logic is being upstreamed into Fido hopefully in 2.1rc4
    if path is None:
        path = Path(config.get('downloads', 'download_dir')) / '{file}'  # pragma: no cover
    elif isinstance(path, (str, os.PathLike)) and '{file}' not in str(path):
        path = Path(path) / '{file}'  # pragma: no cover
    else:
        path = Path(path)  # pragma: no cover
    path = path.expanduser()

    if not len(query_results):
        return

    for row in query_results:
        url = f"{self._BASE_DOWNLOAD_URL}/asdf?datasetId={row['Dataset ID']}"
        # Set max_splits here as the metadata streamer doesn't like accept-range at the moment.
        downloader.enqueue_file(url, filename=partial(self._make_filename, path, row), max_splits=1)
def main():
    args = parser.parse_args()
    dataset = args.dataset

    # Load annotations
    annFile = '../annotations/instances_{}.json'.format(dataset)
    assert os.path.isfile(annFile)
    cocoPath = './{}'.format(dataset)
    try:
        os.mkdir(cocoPath)
    except FileExistsError:
        pass

    # Init coco
    coco = COCO(annFile)
    personCatID = coco.getCatIds(catNms=['person'])[0]
    cocoImageIds = coco.getImgIds(catIds=personCatID)

    print("Putting all urls into big list!")
    urls = []
    for i in cocoImageIds:
        cocoImg = coco.loadImgs(i)[0]
        annIds = coco.getAnnIds(imgIds=cocoImg["id"], catIds=personCatID, iscrowd=None)
        annotation = coco.loadAnns(annIds)[0]
        if annotation["iscrowd"] == 0:
            urls.append(cocoImg["coco_url"])

    print("Enqueueing download of {} items".format(len(urls)))
    dl = Downloader()
    for url in urls:
        dl.enqueue_file(url, path=cocoPath)

    print("Downloading files...")
    dl.download()
def fetch(self, qres, path=None, error_callback=None, **kwargs):
    """
    Download a set of results.

    Parameters
    ----------
    qres : `~sunpy.net.dataretriever.QueryResponse`
        Results to download.

    Returns
    -------
    Results Object
    """
    urls = [qrblock.url for qrblock in qres]

    filenames = []
    local_filenames = []

    for i, [url, qre] in enumerate(zip(urls, qres)):
        name = url.split('/')[-1]
        # temporary fix !!! coz All QRBs have same start_time values
        day = Time(qre.time.start.strftime('%Y-%m-%d')) + TimeDelta(i * u.day)

        if name not in filenames:
            filenames.append(name)

        if name.endswith('.gz'):
            local_filenames.append('{}SRS.txt'.format(day.strftime('%Y%m%d')))
        else:
            local_filenames.append(name)

    # Files to be actually downloaded
    paths = self._get_full_filenames(qres, filenames, path)

    # Those files that will be present after get returns
    local_paths = self._get_full_filenames(qres, local_filenames, path)

    # remove duplicate urls. This will make paths and urls to have same number of elements.
    # OrderedDict is required to maintain ordering because it will be zipped with paths later
    urls = list(OrderedDict.fromkeys(urls))

    dobj = Downloader(max_conn=5)

    for aurl, fname in zip(urls, paths):
        dobj.enqueue_file(aurl, filename=fname)

    paths = dobj.download()

    outfiles = []
    for fname, srs_filename in zip(local_paths, local_filenames):
        name = fname.name

        past_year = False
        for i, fname2 in enumerate(paths):
            fname2 = pathlib.Path(fname2)

            if fname2.name.endswith('.txt'):
                continue

            year = fname2.name.split('_SRS')[0]

            if year in name:
                TarFile = tarfile.open(fname2)
                filepath = fname.parent
                member = TarFile.getmember('SRS/' + srs_filename)
                member.name = name
                TarFile.extract(member, path=filepath)
                TarFile.close()
                outfiles.append(fname)
                past_year = True
                break

        if past_year is False:
            outfiles.append(fname)

    paths.data = list(map(str, outfiles))
    return paths
def euvi_pch_data_download(rootpath='', start_date='2007/05/01', end_date='2019/01/01'):
    # Crawl through and scrape the EUVI wavelet images
    url_head = 'http://sd-www.jhuapl.edu/secchi/wavelets/fits/'

    start_date = parse_time(start_date).to_datetime()
    end_date = parse_time(end_date).to_datetime()

    resp = request.urlopen(url_head)
    soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'), features="lxml")
    subs = [link.text for link in soup.find_all('a', href=True) if link.text.endswith('/')]
    substime = [datetime.datetime.strptime(s, '%Y%m/') for s in subs]
    gooddate = [s >= (start_date - datetime.timedelta(days=start_date.day - 1)) and (s <= end_date)
                for s in substime]
    url_subdir1 = [parse.urljoin(url_head, sub_dir) for sub_dir, gd in zip(subs, gooddate) if gd]

    dl = Downloader()

    if not rootpath:
        save_dir = os.path.abspath(os.path.curdir)
    else:
        save_dir = os.path.abspath(rootpath)

    # crawling until a full list has been generated
    for subdir1 in url_subdir1:
        resp = request.urlopen(subdir1)
        soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'), features="lxml")
        subs = [link.text for link in soup.find_all('a', href=True) if link.text.endswith('/')]
        url_subdir2 = [parse.urljoin(subdir1, sub_dir) for sub_dir in subs]

        for subdir2 in url_subdir2:
            resp = request.urlopen(subdir2)
            soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'), features="lxml")
            subs = [link.text for link in soup.find_all('a', href=True) if link.text.endswith('/')]
            url_subdir3 = [parse.urljoin(subdir2, sub_dir) for sub_dir in subs]

            for subdir3 in url_subdir3:
                subs = []
                resp = request.urlopen(subdir3)
                soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'), features="lxml")
                subs = [link.text for link in soup.find_all('a', href=True) if link.text.endswith('.fts.gz')]

                if len(subs) > 1:
                    image_url = [parse.urljoin(subdir3, sub_dir) for sub_dir in subs]
                    # save_path = [os.path.join(save_dir, subdir3.split('fits/')[1], path) for path in subs]
                    save_path = [os.path.join(save_dir, subdir3.split('fits/')[1]) for path in subs]
                    image_times = [datetime_from_euvi_filename(path) for path in subs]

                    # grab every 4 hours
                    dt = list(np.logical_not([np.mod((time - image_times[0]).seconds, 14400)
                                              for time in image_times]))
                    if np.sum(dt) < 6:
                        dt2 = list(np.logical_not([np.mod((time - image_times[1]).seconds, 14400)
                                                   for time in image_times]))
                        if len(dt2) > len(dt):
                            dt = dt2

                    st = [tt >= start_date for tt in image_times]
                    et = [tt <= end_date for tt in image_times]
                    goodness = [(aa and bb and cc) for aa, bb, cc in zip(dt, st, et)]

                    if np.sum(goodness):
                        os.makedirs(os.path.join(save_dir, subdir3.split('fits/')[1]), exist_ok=True)

                        # download each image
                        for good_image, image_loc, image_destination in zip(goodness, image_url, save_path):
                            if good_image:
                                dl.enqueue_file(image_loc, path=image_destination)

                        files = dl.download()
                        print('Downloaded EUVI ' + wavelength_from_euvi_filename(files[0]) +
                              'images for ' + image_times[0].strftime('%Y-%m-%d'))
                else:
                    print('Too few images detected in: ', subdir3)
def get_request(self, requests, path=None, overwrite=False, progress=True,
                downloader=None, wait=True):
    """
    Query JSOC to see if the request(s) is ready for download.

    If the request is ready for download, it will then download it.

    Parameters
    ----------
    requests : `~drms.ExportRequest`, `str`, `list`
        `~drms.ExportRequest` objects or `str` request IDs or lists returned
        by `~sunpy.net.jsoc.jsoc.JSOCClient.request_data`.
    path : `str`
        Path to save data to, defaults to SunPy download dir.
    progress : `bool`, optional
        If `True` show a progress bar showing how many of the total files
        have been downloaded. If `False`, no progress bar will be shown.
    overwrite : `bool` or `str`, optional
        Determine how to handle downloading if a file already exists with the
        same name. If `False` the file download will be skipped and the path
        returned to the existing file, if `True` the file will be downloaded
        and the existing file will be overwritten, if `'unique'` the filename
        will be modified to be unique.
    downloader : `parfive.Downloader`, optional
        The download manager to use.
    wait : `bool`, optional
        If `False` ``downloader.download()`` will not be called. Only has any
        effect if ``downloader`` is not `None`.

    Returns
    -------
    res : `~sunpy.net.download.Results`
        A `~sunpy.net.download.Results` instance or `None` if no URLs to
        download.
    """
    c = drms.Client()

    # Convert Responses to a list if not already
    if isinstance(requests, str) or not isiterable(requests):
        requests = [requests]

    # Ensure all the requests are drms ExportRequest objects
    for i, request in enumerate(requests):
        if isinstance(request, str):
            r = c.export_from_id(request)
            requests[i] = r

    # We only download if all are finished
    if not all([r.has_succeeded() for r in requests]):
        raise NotExportedError("Can not download as not all the requests "
                               "have been exported for download yet.")

    # Ensure path has a {file} in it
    if path is None:
        default_dir = config.get("downloads", "download_dir")
        path = os.path.join(default_dir, '{file}')
    elif isinstance(path, str) and '{file}' not in path:
        path = os.path.join(path, '{file}')

    paths = []
    for request in requests:
        for filename in request.data['filename']:
            # Ensure we don't duplicate the file extension
            ext = os.path.splitext(filename)[1]
            if path.endswith(ext):
                fname = path.strip(ext)
            else:
                fname = path
            fname = fname.format(file=filename)
            fname = os.path.expanduser(fname)
            paths.append(fname)

    dl_set = True
    if not downloader:
        dl_set = False
        downloader = Downloader(progress=progress, overwrite=overwrite)

    urls = []
    for request in requests:
        if request.status == 0:
            for index, data in request.data.iterrows():
                url_dir = request.request_url + '/'
                urls.append(urllib.parse.urljoin(url_dir, data['filename']))

    if urls:
        if progress:
            print_message = "{0} URLs found for download. Full request totalling {1}MB"
            print(print_message.format(len(urls), request._d['size']))
        for aurl, fname in zip(urls, paths):
            downloader.enqueue_file(aurl, filename=fname)

    if dl_set and not wait:
        return Results()

    results = downloader.download()
    return results
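# A hedged sketch of calling get_request() above with a previously exported
# JSOC request ID; the ID string and download path are placeholders:
from sunpy.net.jsoc import JSOCClient

client = JSOCClient()
files = client.get_request("JSOC_20230101_001", path="./jsoc_data")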