Code Example #1
    def to_dataframe(self):
        """ """
        results = []
        urls = self.url
        if isinstance(urls, str):
            urls = [urls]  # Make sure we deal with a list
        for url in urls:
            js = self.fs.open_json(url)
            if isinstance(js, str):
                continue
            df = self.json2dataframe(js)
            df = df.reset_index()
            df = df.rename(columns=self.key_map)
            df = df[[value for value in self.key_map.values() if value in df.columns]]
            results.append(df)

        results = [r for r in results if r is not None]  # Only keep non-empty results
        if len(results) > 0:
            df = pd.concat(results, ignore_index=True)
            df.sort_values(by=['TIME', 'PRES'], inplace=True)
            df = df.set_index(['N_POINTS'])
            # df['N_POINTS'] = np.arange(0, len(df['N_POINTS']))  # Re-index to avoid duplicate values
            return df
        else:
            raise DataNotFound("CAN'T FETCH ANY DATA !")
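The column-mapping and concatenation pattern used above can be reproduced on its own. The sketch below is a minimal, self-contained illustration of that pattern; the ``key_map`` and the in-memory json records are invented for the example and are not part of the library:

import pandas as pd

# Hypothetical column mapping, in the spirit of ``self.key_map``:
key_map = {"time": "TIME", "pres": "PRES", "temp": "TEMP", "n_points": "N_POINTS"}

# Stand-ins for the per-url json payloads returned by ``self.fs.open_json``:
payloads = [
    [{"time": "2020-01-01", "pres": 5.0, "temp": 12.1, "n_points": 0},
     {"time": "2020-01-01", "pres": 10.0, "temp": 11.8, "n_points": 1}],
    [{"time": "2020-01-02", "pres": 5.0, "temp": 12.3, "n_points": 2}],
]

results = []
for js in payloads:
    df = pd.DataFrame(js)  # stand-in for ``self.json2dataframe(js)``
    df = df.rename(columns=key_map)
    df = df[[v for v in key_map.values() if v in df.columns]]  # keep mapped columns only
    results.append(df)

df = pd.concat(results, ignore_index=True)
df.sort_values(by=["TIME", "PRES"], inplace=True)
df = df.set_index(["N_POINTS"])
print(df)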
Code Example #2
    def open_dataframe(self, search_cls):
        """ Run a search on an Argo index file and return a Pandas dataframe with results

        Parameters
        ----------
        search_cls: Class instance inheriting from index_filter_proto

        Returns
        -------
        :class:`pandas.DataFrame`
        """
        uri = search_cls.uri()
        with self.open_index() as f:
            if self.cache and (self.in_cache(self.fs['search'].fs, uri)
                               or self.in_memory(self.fs['search'].fs, uri)):
                # print('Search already in memory, loading:', uri)
                with self.fs['search'].open(uri, "r") as of:
                    df = self.res2dataframe(of.read())
            else:
                # print('Running search from scratch ...')
                # Run search:
                results = search_cls.run(f)
                if not results:
                    raise DataNotFound(
                        "No Argo data in the index correspond to your search criteria.\nSearch URI: %s"
                        % uri)
                # and save results for caching:
                if self.cache:
                    with self.fs['search'].open(uri, "w") as of:
                        of.write(results)  # This happens in memory
                    self.fs['search'].fs.save_cache()
                df = self.res2dataframe(results)
        return df
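The cache-or-run logic of ``open_dataframe`` boils down to: look the search URI up in a cache, reuse the stored raw results on a hit, otherwise run the search, raise when it comes back empty, and store the results before parsing them. The sketch below illustrates that flow with a plain dict standing in for the ``fs['search']`` store; ``cached_search`` and the dummy search callable are hypothetical names, not library API:

# Plain-dict stand-in for the ``fs['search']`` cache store; names are illustrative only.
search_cache = {}

def cached_search(uri, run_search, use_cache=True):
    """Return raw search results for ``uri``, reading from the cache when possible."""
    if use_cache and uri in search_cache:
        # Cache hit: reuse the stored raw results
        return search_cache[uri]
    # Cache miss: run the search from scratch
    results = run_search()
    if not results:
        raise ValueError("No data correspond to your search criteria.\nSearch URI: %s" % uri)
    if use_cache:
        # Save the raw results so the next identical search is free
        search_cache[uri] = results
    return results

# Dummy search returning a raw index excerpt; a real call would parse this into a DataFrame:
raw = cached_search("wmo=6902746", lambda: "file_a.nc\nfile_b.nc")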
Code Example #3
    def open_mfjson(self,
                    urls,
                    max_workers=112,
                    method: str = 'thread',
                    progress: bool = False,
                    preprocess=None,
                    errors: str = 'ignore',
                    *args,
                    **kwargs):
        """ Open multiple json urls

            This is a parallelized version of ``open_json``.
            Use a Threads Pool by default for parallelization.

            Parameters
            ----------
            urls: list(str)
            max_workers: int
                Maximum number of threads or processes.
            method:
                The parallelization method to execute calls asynchronously:
                    - 'thread' (Default): use a pool of at most ``max_workers`` threads
                    - 'process': use a pool of at most ``max_workers`` processes
                    - (XFAIL) Dask client object: use a Dask distributed client object

                Use 'seq' to simply open data sequentially
            progress: bool
                Display a progress bar (False by default; not available for the dask client method)
            preprocess: (callable, optional)
                If provided, call this function on each json set

            Returns
            -------
            list()
        """
        strUrl = lambda x: x.replace("https://", "").replace("http://", "")

        if not isinstance(urls, list):
            urls = [urls]

        results = []
        failed = []
        if method in ['thread', 'process']:
            if method == 'thread':
                ConcurrentExecutor = concurrent.futures.ThreadPoolExecutor(
                    max_workers=max_workers)
            else:
                if max_workers == 112:
                    max_workers = multiprocessing.cpu_count()
                ConcurrentExecutor = concurrent.futures.ProcessPoolExecutor(
                    max_workers=max_workers)

            with ConcurrentExecutor as executor:
                future_to_url = {
                    executor.submit(self._mfprocessor_json,
                                    url,
                                    preprocess=preprocess,
                                    *args,
                                    **kwargs): url
                    for url in urls
                }
                futures = concurrent.futures.as_completed(future_to_url)
                if progress:
                    futures = tqdm(futures, total=len(urls))

                for future in futures:
                    data = None
                    try:
                        data = future.result()
                    except Exception as e:
                        failed.append(future_to_url[future])
                        if errors == 'ignore':
                            warnings.warn(
                                "\nSomething went wrong with this url: %s\nException raised: %s"
                                % (strUrl(future_to_url[future]), str(e.args)))
                            pass
                        elif errors == 'silent':
                            pass
                        else:
                            raise
                    finally:
                        results.append(data)

        # elif type(method) == distributed.client.Client:
        #     # Use a dask client:
        #     futures = method.map(self._mfprocessor_json, urls, preprocess=preprocess, *args, **kwargs)
        #     results = method.gather(futures)

        elif method in ['seq', 'sequential']:
            if progress:
                urls = tqdm(urls, total=len(urls))

            for url in urls:
                data = None
                try:
                    data = self._mfprocessor_json(url,
                                                  preprocess=preprocess,
                                                  *args,
                                                  **kwargs)
                except Exception as e:
                    failed.append(url)
                    if errors == 'ignore':
                        warnings.warn(
                            "\nSomething went wrong with this url: %s\nException raised: %s"
                            % (strUrl(url), str(e.args)))
                        pass
                    elif errors == 'silent':
                        pass
                    else:
                        raise
                finally:
                    results.append(data)

        else:
            raise InvalidMethod(method)

        # Post-process results
        results = [r for r in results
                   if r is not None]  # Only keep non-empty results
        if len(results) > 0:
            return results
        else:
            raise DataNotFound(urls)
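The parallel branch of ``open_mfjson`` follows the standard ``concurrent.futures`` recipe: submit one task per url, map each future back to its url, and collect results as they complete, warning rather than raising on failures when ``errors='ignore'``. Here is a stripped-down, runnable sketch of that recipe; ``fetch`` is a toy stand-in for ``_mfprocessor_json`` and the urls are made up:

import concurrent.futures
import warnings

def fetch(url):
    """Toy stand-in for ``_mfprocessor_json``: fail on one url, succeed on the others."""
    if "bad" in url:
        raise IOError("cannot reach %s" % url)
    return {"url": url, "data": [1, 2, 3]}

urls = ["https://example.org/a.json",
        "https://example.org/bad.json",
        "https://example.org/b.json"]
results, failed = [], []

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Map each future back to its url so failures can be reported meaningfully:
    future_to_url = {executor.submit(fetch, url): url for url in urls}
    for future in concurrent.futures.as_completed(future_to_url):
        try:
            results.append(future.result())
        except Exception as e:
            # errors='ignore' behaviour: warn, remember the url, keep going
            failed.append(future_to_url[future])
            warnings.warn("Something went wrong with %s: %s" % (future_to_url[future], e))

print(len(results), "succeeded,", len(failed), "failed")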
Code Example #4
    def open_mfdataset(self,
                       urls,
                       concat_dim='row',
                       max_workers: int = 112,
                       method: str = 'thread',
                       progress: bool = False,
                       concat: bool = True,
                       preprocess=None,
                       errors: str = 'ignore',
                       *args,
                       **kwargs):
        """ Open multiple urls as a single xarray dataset.

            This is a version of the ``open_dataset`` method that is able to handle a list of urls/paths
            sequentially or in parallel.

            Use a Threads Pool by default for parallelization.

            Parameters
            ----------
            urls: list(str)
                List of url/path to open
            concat_dim: str
                Name of the dimension to use to concatenate all datasets (passed to :class:`xarray.concat`)
            max_workers: int
                Maximum number of threads or processes
            method: str
                The parallelization method to execute calls asynchronously:
                    - ``thread`` (Default): use a pool of at most ``max_workers`` threads
                    - ``process``: use a pool of at most ``max_workers`` processes
                    - (XFAIL) a :class:`distributed.client.Client` object: use a Dask distributed client object

                Use 'seq' to simply open data sequentially
            progress: bool
                Display a progress bar (False by default)
            preprocess: callable (optional)
                If provided, call this function on each dataset prior to concatenation

            Returns
            -------
            :class:`xarray.Dataset`

        """
        if not isinstance(urls, list):
            urls = [urls]

        results = []
        failed = []
        if method in ['thread', 'process']:
            if method == 'thread':
                ConcurrentExecutor = concurrent.futures.ThreadPoolExecutor(
                    max_workers=max_workers)
            else:
                if max_workers == 112:
                    max_workers = multiprocessing.cpu_count()
                ConcurrentExecutor = concurrent.futures.ProcessPoolExecutor(
                    max_workers=max_workers)

            with ConcurrentExecutor as executor:
                future_to_url = {
                    executor.submit(self._mfprocessor_dataset,
                                    url,
                                    preprocess=preprocess,
                                    *args,
                                    **kwargs): url
                    for url in urls
                }
                futures = concurrent.futures.as_completed(future_to_url)
                if progress:
                    futures = tqdm(futures, total=len(urls))

                for future in futures:
                    data = None
                    try:
                        data = future.result()
                    except Exception as e:
                        failed.append(future_to_url[future])
                        if errors == 'ignore':
                            warnings.warn(
                                "\nSomething went wrong with this url: %s\nException raised: %s"
                                % (future_to_url[future].replace(
                                    "https://", "").replace("http://",
                                                            ""), str(e.args)))
                            pass
                        elif errors == 'silent':
                            pass
                        else:
                            raise
                    finally:
                        results.append(data)

        # elif type(method) == distributed.client.Client:
        #     # Use a dask client:
        #     futures = method.map(self._mfprocessor_dataset, urls, preprocess=preprocess, *args, **kwargs)
        #     results = method.gather(futures)

        elif method in ['seq', 'sequential']:
            if progress:
                urls = tqdm(urls, total=len(urls))

            for url in urls:
                data = None
                try:
                    data = self._mfprocessor_dataset(url,
                                                     preprocess=preprocess,
                                                     *args,
                                                     **kwargs)
                except Exception as e:
                    failed.append(url)
                    if errors == 'ignore':
                        warnings.warn(
                            "\nSomething went wrong with this url: %s\nException raised: %s"
                            % (url.replace("https://", "").replace(
                                "http://", ""), str(e.args)))
                        pass
                    elif errors == 'silent':
                        pass
                    else:
                        raise
                finally:
                    results.append(data)

        else:
            raise InvalidMethod(method)

        # Post-process results
        results = [r for r in results
                   if r is not None]  # Only keep non-empty results
        if len(results) > 0:
            if concat:
                # ds = xr.concat(results, dim=concat_dim, data_vars='all', coords='all', compat='override')
                ds = xr.concat(results,
                               dim=concat_dim,
                               data_vars='minimal',
                               coords='minimal',
                               compat='override')
                return ds
            else:
                return results
        else:
            raise DataNotFound(urls)
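When ``concat=True``, the per-url datasets are merged with :func:`xarray.concat` using ``data_vars='minimal'`` and ``coords='minimal'`` (only variables that already carry the concatenation dimension are concatenated) and ``compat='override'`` (conflicting non-concatenated variables are taken from the first dataset without comparison). The sketch below shows that call on two tiny in-memory datasets; the variable names are invented for the example:

import xarray as xr

# Two small datasets standing in for per-url results of ``_mfprocessor_dataset``:
ds1 = xr.Dataset({"TEMP": ("row", [12.1, 11.8])}, coords={"row": [0, 1]})
ds2 = xr.Dataset({"TEMP": ("row", [12.3])}, coords={"row": [2]})

# Same concatenation options as in ``open_mfdataset``:
ds = xr.concat([ds1, ds2],
               dim="row",
               data_vars="minimal",
               coords="minimal",
               compat="override")
print(ds)  # TEMP now has 3 values along the 'row' dimension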