Example 1
def _maybe_download_dataset(refresh_days, **kwargs):
    """
    Check if the given dataset is already on disk and how old it is,
    and download again from the SimFin server if it needs to be refreshed.

    :param refresh_days: Integer with number of days before refreshing data.
    :param kwargs: Keyword args for the dataset details.
    :return: Boolean whether data was downloaded (True) or not (False).
    """

    # Name of the dataset; this is just the filename without an extension.
    dataset_name = _filename_dataset(**kwargs, extension=None)

    # Full path for the local data-file.
    path = _path_dataset(**kwargs)

    # Full path for the downloaded file.
    download_path = _path_download_dataset(**kwargs)

    # URL to SimFin's server where the file is located.
    url = _url_dataset(**kwargs)

    return _maybe_download(name=dataset_name, path=path,
                           download_path=download_path,
                           url=url, refresh_days=refresh_days)
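
Usage sketch: the call below mirrors how `load` (Example 3) invokes this
helper, but the dataset/variant/market values are illustrative and assume
the private simfin helpers are in scope:

# Re-download 'income'/'annual'/'us' if the local copy is missing or
# more than 30 days old; returns True if a download actually happened.
was_downloaded = _maybe_download_dataset(refresh_days=30,
                                         dataset='income',
                                         variant='annual',
                                         market='us')
print('Downloaded:', was_downloaded)
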
Example 2
    def _cache_args(self, datasets, cache_ids=None):
        """
        Create and return a dict with the arguments for the `cache`
        wrapper-function.

        :param datasets:
            List of tuples with dataset names and variants. If any of these
            dataset-files is newer than the cache-file, then the cache must
            be refreshed.

        :param cache_ids:
            List of arguments such as strings, booleans, ints, etc. that
            can be used to uniquely identify a cache-file.

        :return:
            Dict with arguments for the `cache` wrapper-function.
        """

        # Create list of file-paths for the datasets.
        dataset_paths = [
            _path_dataset(dataset=dataset + self._dataset_extension,
                          variant=variant,
                          market=self._market) for dataset, variant in datasets
        ]

        # List of arguments used to uniquely identify the cache-file.
        # (cache_ids defaults to None to avoid a mutable default argument.)
        all_cache_ids = self._cache_ids + (cache_ids or [])

        # Convert to a single string which may be quite long.
        all_cache_ids_str = '-'.join(map(str, all_cache_ids))

        # Convert to a short and nearly unique string by hashing it.
        name_hash = hashlib.sha1(all_cache_ids_str.encode('utf-8')).hexdigest()
        name_hash = name_hash[:8]

        # Create dict with arguments for the @cache function-wrapper.
        args = {
            'cache_name': name_hash,
            'cache_refresh': dataset_paths,
            'cache_format': self._cache_format
        }

        return args
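
The hashing step above can be reproduced on its own. A minimal sketch of how
a list of cache-ids collapses to a short, nearly unique filename component
(the ids themselves are made up):

import hashlib

# Any mix of strings, booleans, ints, etc. used to identify the cache-file.
cache_ids = ['us', 'annual', True, 30]

# Join into one (possibly long) string, then hash it down to 8 hex chars.
cache_ids_str = '-'.join(map(str, cache_ids))
name_hash = hashlib.sha1(cache_ids_str.encode('utf-8')).hexdigest()[:8]

# Prints an 8-character hex string that is stable for the same cache-ids.
print(name_hash)
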
Example 3
def load(dataset, variant=None, market=None,
         parse_dates=None, index=None, refresh_days=30):
    """
    Load the dataset from local disk and return it as a Pandas DataFrame.
    If the dataset does not exist on local disk, or if it is too old, then it
    is automatically downloaded from the SimFin server.

    This is the main function for downloading and loading datasets. It is
    specialized by several so-called partial function definitions below,
    such as :obj:`~simfin.load.load_income` and
    :obj:`~simfin.load.load_shareprices`, which merely preset some of its
    arguments for convenience.

    A dataset is specified by its name, e.g. 'income' for Income Statements,
    its variant, e.g. 'annual' for annual reports, and the market, e.g. 'us'
    for the USA. All datasets have a name, but only some of them have options
    for variants and markets. For a full list of available datasets see:
    https://simfin.com/data/bulk

    All datasets are saved on disk as CSV-files, where the columns define
    the data-items such as Ticker, Revenue and Net Income, and the rows are
    data-points or records. The number of columns is typically fairly small
    (less than 100) but the number of rows may be thousands or even millions.

    This function can automatically parse and convert columns that contain
    strings with dates into proper date-types. You do this by passing the
    column-names as the argument `parse_dates`.

    This function can also use one or more of the columns as an index for
    the resulting Pandas DataFrame. You do this by passing the column-names
    as the argument `index`. The index will be sorted in ascending order.

    :param dataset:
        String with the name of the dataset (always lowercase).

        Examples:
            - 'income': Income statements.
            - 'balance': Balance sheets.
            - 'cashflow': Cash-flow statements.
            - 'shareprices': Share-prices.
            - 'companies': Company details.
            - 'industries': Sector and industry details.
            - 'markets': Market details.

    :param variant:
        String with the dataset's variant (always lowercase).
        The valid options depend on the dataset.

        Examples for datasets 'income', 'balance', and 'cashflow':
            - 'annual': Annual financial reports.
            - 'quarterly': Quarterly financial reports.
            - 'ttm': Trailing-Twelve-Months (TTM) reports.

        Valid options for dataset 'shareprices':
            - 'latest': Latest share-prices (small data-file).
            - 'daily': Daily share-prices (large data-file).

    :param market:
        String for the dataset's market (always lowercase).

        This is used to group the entire database into smaller sections
        for individual markets such as USA, Germany, etc.

        Examples of valid options:
            - 'us': USA
            - 'de': Germany
            - 'sg': Singapore

        Some datasets such as 'industries' do not support the market-keyword
        and will generate a server-error if the market is set.

    :param parse_dates:
        String or list of strings with column-names that contain dates
        to be parsed. This depends on the dataset.
        For fundamental data it is [REPORT_DATE, PUBLISH_DATE, RESTATED_DATE].
        For shareprices it is [DATE].
        Format is always assumed to be YYYY-MM-DD.

    :param index:
        String or list of strings with column-names that will be used as index.
        The index will automatically be sorted in ascending order.

    :param refresh_days:
        Integer with the number of days before data is downloaded again.

        The data is updated daily on the SimFin server so you would normally
        use refresh_days >= 1. Free datasets are updated less frequently so
        you would normally use refresh_days >= 30 for free datasets.

        A value of refresh_days == 0 means the data is downloaded regardless
        of the age of the data-files saved on disk.

    :return:
        Pandas DataFrame with the data.
    """

    assert dataset is not None

    # Convert dataset name, variant, and market to lower-case.
    dataset = dataset.lower()
    if variant is not None:
        variant = variant.lower()
    if market is not None:
        market = market.lower()

    # Dict with dataset arguments.
    dataset_args = {'dataset': dataset, 'variant': variant, 'market': market}

    # Download file if it does not exist on local disk, or if it is too old.
    _maybe_download_dataset(**dataset_args, refresh_days=refresh_days)

    # Lambda function for converting strings to dates. Format: YYYY-MM-DD
    date_parser = lambda x: pd.to_datetime(x, yearfirst=True, dayfirst=False)

    # Print status message.
    print('- Loading from disk ... ', end='')

    # Full path for the CSV-file on local disk.
    path = _path_dataset(**dataset_args)

    # Load dataset into Pandas DataFrame.
    df = pd.read_csv(path, sep=';', header=0,
                     parse_dates=parse_dates, date_parser=date_parser)

    # Set the index and sort the data.
    if index is not None:
        # Set the index.
        df.set_index(index, inplace=True)

        # Sort the rows of the DataFrame according to the index.
        df.sort_index(ascending=True, inplace=True)

    # Print status message.
    print('Done!')

    return df
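
A usage sketch of `load` from user code. `sf.set_data_dir` and
`sf.set_api_key` are real simfin setup calls, but the data-directory path
and the exact column names passed to `parse_dates` / `index` are assumptions
based on the docstring above, not verified against the live datasets:

import simfin as sf

# Directory where the CSV-files are saved (path is illustrative).
sf.set_data_dir('~/simfin_data/')

# API-key for the free datasets.
sf.set_api_key('free')

# Load annual Income Statements for the US market, parse the date-columns,
# and index the DataFrame by Ticker and Report Date (assumed column names).
df = sf.load(dataset='income', variant='annual', market='us',
             parse_dates=['Report Date', 'Publish Date', 'Restated Date'],
             index=['Ticker', 'Report Date'])

print(df.head())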