def to_dataframe(self):
    """ Load JSON data from the store url(s) and merge results into a Pandas dataframe

    Returns
    -------
    :class:`pandas.DataFrame`
    """
    results = []
    urls = self.url
    if isinstance(urls, str):
        urls = [urls]  # Make sure we deal with a list
    for url in urls:
        js = self.fs.open_json(url)
        if isinstance(js, str):  # Skip urls that did not return a parsed JSON document
            continue
        df = self.json2dataframe(js)
        df = df.reset_index()
        df = df.rename(columns=self.key_map)
        df = df[[value for value in self.key_map.values() if value in df.columns]]
        results.append(df)
    results = [r for r in results if r is not None]  # Only keep non-empty results
    if len(results) > 0:
        df = pd.concat(results, ignore_index=True)
        df.sort_values(by=['TIME', 'PRES'], inplace=True)
        df = df.set_index(['N_POINTS'])
        # df['N_POINTS'] = np.arange(0, len(df['N_POINTS']))  # Re-index to avoid duplicate values
        return df
    else:
        raise DataNotFound("CAN'T FETCH ANY DATA !")
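# Usage sketch (illustrative, not part of the original source): assuming `store`
# is an instance of this fetcher with `url`, `fs` and `key_map` already set up,
# the whole set of remote JSON documents is materialized in one call:
#
#     df = store.to_dataframe()   # one row per measurement, indexed by N_POINTS
#     df[['TIME', 'PRES']].head()
#
# A DataNotFound exception signals that none of the urls returned a usable
# JSON document.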
def open_dataframe(self, search_cls):
    """ Run a search on an Argo index file and return a Pandas dataframe with results

    Parameters
    ----------
    search_cls: Class instance inheriting from index_filter_proto

    Returns
    -------
    :class:`pandas.DataFrame`
    """
    uri = search_cls.uri()
    with self.open_index() as f:
        if self.cache and (self.in_cache(self.fs['search'].fs, uri) or self.in_memory(self.fs['search'].fs, uri)):
            # print('Search already in memory, loading:', uri)
            with self.fs['search'].open(uri, "r") as of:
                df = self.res2dataframe(of.read())
        else:
            # print('Running search from scratch ...')

            # Run search:
            results = search_cls.run(f)
            if not results:
                raise DataNotFound(
                    "No Argo data in the index correspond to your search criteria."
                    "\nSearch URI: %s" % uri)

            # and save results for caching:
            if self.cache:
                with self.fs['search'].open(uri, "w") as of:
                    of.write(results)  # This happens in memory
                self.fs['search'].fs.save_cache()

            df = self.res2dataframe(results)
    return df
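# Illustrative sketch (an assumption, not from the original source): the minimal
# protocol `open_dataframe` relies on. Only ``uri()`` and ``run(f)`` are called
# here; the class name and float number below are hypothetical:
#
#     class WMOFilter:  # would inherit from index_filter_proto
#         def uri(self):
#             return "argo-index/wmo=6902746"  # unique key used to cache results
#
#         def run(self, index_file):
#             # Return matching index lines as a single string (or None if no match)
#             lines = [l for l in index_file if "6902746" in l]
#             return "".join(lines) if lines else None
#
#     df = index_store.open_dataframe(WMOFilter())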
def open_mfjson(self, urls,
                max_workers=112,
                method: str = 'thread',
                progress: bool = False,
                preprocess=None,
                errors: str = 'ignore',
                *args, **kwargs):
    """ Open multiple json urls

    This is a parallelized version of ``open_json``.
    Uses a thread pool by default for parallelization.

    Parameters
    ----------
    urls: list(str)
    max_workers: int
        Maximum number of threads or processes.
    method:
        The parallelization method to execute calls asynchronously:

        - 'thread' (Default): use a pool of at most ``max_workers`` threads
        - 'process': use a pool of at most ``max_workers`` processes
        - (XFAIL) Dask client object: use a Dask distributed client object

        Use 'seq' to simply open data sequentially
    progress: bool
        Display a progress bar (False by default; not available for the dask client method)
    preprocess: callable, optional
        If provided, call this function on each json set
    errors: str
        Controls how to handle a failing url: 'ignore' (default) warns and skips it,
        'silent' skips it quietly, any other value re-raises the exception

    Returns
    -------
    list
    """
    strUrl = lambda x: x.replace("https://", "").replace("http://", "")

    if not isinstance(urls, list):
        urls = [urls]

    results = []
    failed = []
    if method in ['thread', 'process']:
        if method == 'thread':
            ConcurrentExecutor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
        else:
            if max_workers == 112:  # Default sentinel: fall back to the number of CPUs for a process pool
                max_workers = multiprocessing.cpu_count()
            ConcurrentExecutor = concurrent.futures.ProcessPoolExecutor(max_workers=max_workers)

        with ConcurrentExecutor as executor:
            future_to_url = {
                executor.submit(self._mfprocessor_json,
                                url,
                                preprocess=preprocess,
                                *args, **kwargs): url
                for url in urls
            }
            futures = concurrent.futures.as_completed(future_to_url)
            if progress:
                futures = tqdm(futures, total=len(urls))

            for future in futures:
                data = None
                try:
                    data = future.result()
                except Exception as e:
                    failed.append(future_to_url[future])
                    if errors == 'ignore':
                        warnings.warn(
                            "\nSomething went wrong with this url: %s\nException raised: %s"
                            % (strUrl(future_to_url[future]), str(e.args)))
                    elif errors == 'silent':
                        pass
                    else:
                        raise
                finally:
                    results.append(data)

    # elif type(method) == distributed.client.Client:
    #     # Use a dask client:
    #     futures = method.map(self._mfprocessor_json, urls, preprocess=preprocess, *args, **kwargs)
    #     results = method.gather(futures)

    elif method in ['seq', 'sequential']:
        if progress:
            urls = tqdm(urls, total=len(urls))

        for url in urls:
            data = None
            try:
                data = self._mfprocessor_json(url, preprocess=preprocess, *args, **kwargs)
            except Exception as e:
                failed.append(url)
                if errors == 'ignore':
                    warnings.warn(
                        "\nSomething went wrong with this url: %s\nException raised: %s"
                        % (strUrl(url), str(e.args)))
                elif errors == 'silent':
                    pass
                else:
                    raise
            finally:
                results.append(data)

    else:
        raise InvalidMethod(method)

    # Post-process results
    results = [r for r in results if r is not None]  # Only keep non-empty results
    if len(results) > 0:
        return results
    else:
        raise DataNotFound(urls)
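# Usage sketch (illustrative, hypothetical urls): open several JSON documents in
# parallel with the default thread pool, skipping urls that fail with a warning:
#
#     urls = ["https://api.example.org/float/%i.json" % wmo for wmo in wmos]
#     data = store.open_mfjson(urls,
#                              method='thread',                   # or 'process' / 'seq'
#                              progress=True,                     # tqdm progress bar
#                              preprocess=lambda js: js['data'],  # applied per document
#                              errors='ignore')                   # warn and skip failures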
def open_mfdataset(self, urls,
                   concat_dim='row',
                   max_workers: int = 112,
                   method: str = 'thread',
                   progress: bool = False,
                   concat: bool = True,
                   preprocess=None,
                   errors: str = 'ignore',
                   *args, **kwargs):
    """ Open multiple urls as a single xarray dataset.

    This is a version of the ``open_dataset`` method that is able to handle a list of
    urls/paths sequentially or in parallel.

    Uses a thread pool by default for parallelization.

    Parameters
    ----------
    urls: list(str)
        List of url/path to open
    concat_dim: str
        Name of the dimension to use to concatenate all datasets (passed to :class:`xarray.concat`)
    max_workers: int
        Maximum number of threads or processes
    method: str
        The parallelization method to execute calls asynchronously:

        - ``thread`` (Default): use a pool of at most ``max_workers`` threads
        - ``process``: use a pool of at most ``max_workers`` processes
        - (XFAIL) a :class:`distributed.client.Client` object: use a Dask distributed client

        Use ``seq`` to simply open data sequentially
    progress: bool
        Display a progress bar (False by default)
    concat: bool
        Concatenate results in a single :class:`xarray.Dataset` (True by default),
        otherwise return the list of datasets
    preprocess: callable, optional
        If provided, call this function on each dataset prior to concatenation
    errors: str
        Controls how to handle a failing url: 'ignore' (default) warns and skips it,
        'silent' skips it quietly, any other value re-raises the exception

    Returns
    -------
    :class:`xarray.Dataset`
    """
    if not isinstance(urls, list):
        urls = [urls]

    results = []
    failed = []
    if method in ['thread', 'process']:
        if method == 'thread':
            ConcurrentExecutor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
        else:
            if max_workers == 112:  # Default sentinel: fall back to the number of CPUs for a process pool
                max_workers = multiprocessing.cpu_count()
            ConcurrentExecutor = concurrent.futures.ProcessPoolExecutor(max_workers=max_workers)

        with ConcurrentExecutor as executor:
            future_to_url = {
                executor.submit(self._mfprocessor_dataset,
                                url,
                                preprocess=preprocess,
                                *args, **kwargs): url
                for url in urls
            }
            futures = concurrent.futures.as_completed(future_to_url)
            if progress:
                futures = tqdm(futures, total=len(urls))

            for future in futures:
                data = None
                try:
                    data = future.result()
                except Exception as e:
                    failed.append(future_to_url[future])
                    if errors == 'ignore':
                        warnings.warn(
                            "\nSomething went wrong with this url: %s\nException raised: %s"
                            % (future_to_url[future].replace("https://", "").replace("http://", ""),
                               str(e.args)))
                    elif errors == 'silent':
                        pass
                    else:
                        raise
                finally:
                    results.append(data)

    # elif type(method) == distributed.client.Client:
    #     # Use a dask client:
    #     futures = method.map(self._mfprocessor_dataset, urls, preprocess=preprocess, *args, **kwargs)
    #     results = method.gather(futures)

    elif method in ['seq', 'sequential']:
        if progress:
            urls = tqdm(urls, total=len(urls))

        for url in urls:
            data = None
            try:
                data = self._mfprocessor_dataset(url, preprocess=preprocess, *args, **kwargs)
            except Exception as e:
                failed.append(url)
                if errors == 'ignore':
                    warnings.warn(
                        "\nSomething went wrong with this url: %s\nException raised: %s"
                        % (url.replace("https://", "").replace("http://", ""), str(e.args)))
                elif errors == 'silent':
                    pass
                else:
                    raise
            finally:
                results.append(data)

    else:
        raise InvalidMethod(method)

    # Post-process results
    results = [r for r in results if r is not None]  # Only keep non-empty results
    if len(results) > 0:
        if concat:
            # ds = xr.concat(results, dim=concat_dim, data_vars='all', coords='all', compat='override')
            ds = xr.concat(results, dim=concat_dim, data_vars='minimal', coords='minimal', compat='override')
            return ds
        else:
            return results
    else:
        raise DataNotFound(urls)
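# Usage sketch (illustrative, hypothetical paths): open a batch of netCDF sources
# with one process per CPU core and concatenate them along the 'row' dimension;
# with ``concat=False`` the list of per-url datasets is returned instead:
#
#     ds = store.open_mfdataset(urls,
#                               concat_dim='row',
#                               method='process',  # max_workers falls back to cpu_count()
#                               preprocess=lambda ds: ds[['PRES', 'TEMP', 'PSAL']],
#                               errors='raise')    # fail fast on the first bad url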