def _maybe_download_dataset(refresh_days, **kwargs):
    """
    Ensure the dataset-file is fresh on local disk: check whether it is
    already there and how old it is, and download it again from the
    SimFin server if it needs to be refreshed.

    :param refresh_days: Integer with number of days before refreshing data.
    :param kwargs: Keyword args for the dataset details.
    :return: Boolean whether data was downloaded (True) or not (False).
    """
    # Resolve the dataset's name (filename without extension), its local
    # file-paths and its URL on the SimFin server, then delegate the
    # age-check and download to the generic helper.
    return _maybe_download(name=_filename_dataset(**kwargs, extension=None),
                           path=_path_dataset(**kwargs),
                           download_path=_path_download_dataset(**kwargs),
                           url=_url_dataset(**kwargs),
                           refresh_days=refresh_days)
def _cache_args(self, datasets, cache_ids=None):
    """
    Create and return a dict with the arguments for the `cache`
    wrapper-function.

    :param datasets:
        List of tuples with dataset names and variants. If any of these
        dataset-files is newer than the cache-file, then the cache must
        be refreshed.

    :param cache_ids:
        List of arguments such as strings, booleans, ints, etc. that can
        be used to uniquely identify a cache-file. Defaults to an empty
        list when None.

    :return:
        Dict with arguments for the `cache` wrapper-function.
    """
    # Use a None sentinel instead of a mutable default argument ([]),
    # which would be shared between all calls of this method.
    if cache_ids is None:
        cache_ids = []

    # Create list of file-paths for the datasets.
    dataset_paths = [_path_dataset(dataset=dataset + self._dataset_extension,
                                   variant=variant,
                                   market=self._market)
                     for dataset, variant in datasets]

    # List of arguments used to uniquely identify the cache-file.
    all_cache_ids = self._cache_ids + cache_ids

    # Convert to a single string which may be quite long.
    all_cache_ids_str = '-'.join(map(str, all_cache_ids))

    # Convert to a short and nearly unique string by hashing it.
    name_hash = hashlib.sha1(all_cache_ids_str.encode('utf-8')).hexdigest()
    name_hash = name_hash[:8]

    # Create dict with arguments for the @cache function-wrapper.
    args = {'cache_name': name_hash,
            'cache_refresh': dataset_paths,
            'cache_format': self._cache_format}

    return args
def load(dataset, variant=None, market=None, parse_dates=None, index=None,
         refresh_days=30):
    """
    Load the dataset from local disk and return it as a Pandas DataFrame.

    If the dataset does not exist on local disk, or if it is too old, then
    it is automatically downloaded from the SimFin server.

    This is the main function for downloading and loading datasets. It is
    specialized in several so-called partial function definitions below,
    such as :obj:`~simfin.load.load_income` and
    :obj:`~simfin.load.load_shareprices`, which merely set some of the
    arguments in this function for convenience.

    A dataset is specified by its name e.g. 'income' for Income Statements,
    its variant e.g. 'annual' for annual reports, and the market e.g. 'us'
    for USA. All datasets have a name, but only some of them have options
    for variants and markets. For a full list of available datasets see:
    https://simfin.com/data/bulk

    All datasets are saved on disk as CSV-files, where the columns define
    the data-items such as Ticker, Revenue and Net Income, and the rows are
    data-points or records.

    This function can automatically parse and convert columns that contain
    strings with dates into proper date-types (`parse_dates`), and use one
    or more of the columns as a sorted index (`index`).

    :param dataset:
        String with the name of the dataset (always lowercase).
        Examples: 'income', 'balance', 'cashflow', 'shareprices',
        'companies', 'industries', 'markets'.

    :param variant:
        String with the dataset's variant (always lowercase). The valid
        options depend on the dataset, e.g. 'annual', 'quarterly', 'ttm'
        for fundamentals, or 'latest', 'daily' for shareprices.

    :param market:
        String for the dataset's market (always lowercase), e.g. 'us',
        'de', 'sg'. Some datasets such as 'industries' do not support the
        market-keyword and will generate a server-error if it is set.

    :param parse_dates:
        String or list of strings with column-names that contain dates to
        be parsed. Format is always assumed to be YYYY-MM-DD.

    :param index:
        String or list of strings with column-names that will be used as
        index, which is then sorted in ascending order.

    :param refresh_days:
        Integer with the number of days before data is downloaded again.
        A value of 0 means the data is downloaded regardless of the age
        of the data-files saved on disk.

    :return:
        Pandas DataFrame with the data.
    """
    assert dataset is not None

    # Convert dataset name, variant, and market to lower-case.
    dataset = dataset.lower()
    if variant is not None:
        variant = variant.lower()
    if market is not None:
        market = market.lower()

    # Dict with dataset arguments.
    dataset_args = {'dataset': dataset, 'variant': variant, 'market': market}

    # Download file if it does not exist on local disk, or if it is too old.
    _maybe_download_dataset(**dataset_args, refresh_days=refresh_days)

    # Print status message.
    print('- Loading from disk ... ', end='')

    # Full path for the CSV-file on local disk.
    path = _path_dataset(**dataset_args)

    # Load dataset into Pandas DataFrame. Dates are always YYYY-MM-DD, so
    # pass an explicit `date_format` instead of a per-element `date_parser`
    # lambda, which was deprecated and removed in Pandas 2.0 and was also
    # much slower than vectorized parsing with a fixed format.
    df = pd.read_csv(path, sep=';', header=0,
                     parse_dates=parse_dates, date_format='%Y-%m-%d')

    # Set the index and sort the data.
    if index is not None:
        # Set the index.
        df.set_index(index, inplace=True)

        # Sort the rows of the DataFrame according to the index.
        df.sort_index(ascending=True, inplace=True)

    # Print status message.
    print('Done!')

    return df