import datetime

import iso3166
import pandas


def download_owid(run_date: pandas.Timestamp = None) -> pandas.DataFrame:
    """Downloads the OurWorldInData COVID-19 dataset.

    Parameters
    ----------
    run_date : pandas.Timestamp, optional
        the date for which to download the data; defaults to today
        (downloading historic data is not yet implemented)

    Raises
    ------
    ValueError
        when a run_date in the future is passed
    NotImplementedError
        when a run_date earlier than today is passed
    """
    if run_date is None:
        run_date = pandas.Timestamp.today()
    if run_date.date() > datetime.date.today():
        raise ValueError("Run date is in the future. Nice try.")
    if run_date.date() < datetime.date.today():
        # TODO: implement downloading of historic data
        raise NotImplementedError(
            "Downloading with a run_date is not yet supported. "
            f"Today: {datetime.date.today()}, run_date: {run_date}"
        )
    df_raw = pandas.read_csv(
        "https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.csv?raw=true",
        parse_dates=["date"],
    ).rename(columns={"iso_code": "iso_alpha3"})
    # Map ISO alpha-3 codes to alpha-2; unknown codes become None.
    df_raw["iso_alpha2"] = [
        iso3166.countries.get(alpha3).alpha2 if alpha3 in iso3166.countries else None
        for alpha3 in df_raw.iso_alpha3
    ]
    df_raw["region"] = "all"
    return df_raw.set_index(["iso_alpha2", "region", "date"])
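# A minimal usage sketch for download_owid; it assumes the OWID CSV is reachable
# and that it carries the "new_cases" and "new_tests" columns OWID publishes.
# The ("US", "all") index lookup is illustrative.
if __name__ == "__main__":
    df = download_owid(run_date=pandas.Timestamp.today())
    # Rows are indexed by (iso_alpha2, region, date).
    print(df.loc[("US", "all")][["new_cases", "new_tests"]].tail())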
import datetime

import pandas as pd

idx = pd.IndexSlice


def get_raw_covidtracking_data(run_date: pd.Timestamp):
    """ Gets the current daily CSV from COVIDTracking """
    if run_date.date() > datetime.date.today():
        raise ValueError("Run date is in the future. Nice try.")
    if run_date.date() < datetime.date.today():
        # TODO: implement downloading of historic data
        raise NotImplementedError(
            "Downloading with a run_date is not yet supported. "
            f"Today: {datetime.date.today()}, run_date: {run_date}")
    url = "https://covidtracking.com/api/v1/states/daily.csv"
    data = pd.read_csv(url).rename(columns={"state": "region"})
    data["date"] = pd.to_datetime(data["date"], format="%Y%m%d")
    data = data.set_index(["region", "date"]).sort_index()
    # Too little data or unreliable reporting in the data source.
    df_raw = data.drop(["MP", "GU", "AS", "PR", "VI"])
    # The data in these columns is unreliable, but it will be corrected by the
    # process_covidtracking_data function; here we just add the columns so the
    # original data is kept.
    for region in df_raw.reset_index().region.unique():
        df_raw.loc[idx[region, :], "new_cases"] = df_raw.xs(region).positive.diff().values
        df_raw.loc[idx[region, :], "new_tests"] = df_raw.xs(region).total.diff().values
    # Calculate the sum over all states
    # (groupby replaces the removed DataFrame.sum(level=...)).
    df_all = df_raw.groupby(level="date").sum()
    df_all.insert(0, column="region", value="all")
    df_all = df_all.reset_index().set_index(["region", "date"])
    df_merged = pd.concat([df_raw, df_all]).sort_index()
    return df_merged
import datetime as dt
from os import path

import pandas as pd


def load_to_cache(self, start_time: pd.Timestamp, end_time: pd.Timestamp):
    """
    Downloads data covering the (end_time - start_time) day range from the
    server into CSV files.
    start_time - start time (type: pd.Timestamp)
    end_time - end time (type: pd.Timestamp)
    """
    # TODO: rework this, since bitmex does not publish the last day completely
    # assert end_time < (pd.Timestamp.today() - dt.timedelta(days=1))
    assert start_time < end_time
    if start_time.date() == end_time.date():
        if not self.check_cache(start_time):
            self.load_bar_day(start_time).to_csv(
                path.join(self.path_cash, self.symbol, self.data_frequency,
                          start_time.strftime("%Y-%m-%d") + ".csv"))
    else:
        # closed='left' excludes end_time's day from the download range.
        for day in pd.date_range(start_time, end_time, freq="D", closed="left"):
            if not self.check_cache(day):
                self.load_bar_day(day).to_csv(
                    path.join(self.path_cash, self.symbol, self.data_frequency,
                              day.strftime("%Y-%m-%d") + ".csv"))
import bisect

import pandas as pd

# `get_min_cal`, `concat_date_time`, `C`, and `REG_CN` come from the
# surrounding qlib codebase.


def cal_sam_minute(x: pd.Timestamp, sam_minutes: int, region: str = REG_CN) -> pd.Timestamp:
    """
    Align minute-level data to a down sampled calendar,
    e.g. align 10:38 to 10:35 at the 5-minute level (10:30 at the 10-minute level).

    Parameters
    ----------
    x : pd.Timestamp
        datetime to be aligned
    sam_minutes : int
        align to the `sam_minutes` minute-level calendar
    region : str
        region, for example "cn", "us"

    Returns
    -------
    pd.Timestamp:
        the datetime after alignment
    """
    cal = get_min_cal(C.min_data_shift, region)[::sam_minutes]
    idx = bisect.bisect_right(cal, x.time()) - 1
    _date, new_time = x.date(), cal[idx]
    return concat_date_time(_date, new_time)
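# A self-contained sketch of the same bisect-based down sampling, using a
# hand-built minute calendar instead of qlib's get_min_cal; the 09:30 session
# start and the 5-minute sampling are illustrative assumptions.
import bisect
import datetime

import pandas as pd

_minute_cal = [
    (datetime.datetime(2000, 1, 1, 9, 30) + datetime.timedelta(minutes=i)).time()
    for i in range(120)
]
_sampled_cal = _minute_cal[::5]  # 09:30, 09:35, 09:40, ...

x = pd.Timestamp("2021-06-01 10:38:00")
# bisect_right returns the first slot after x's time; step back one to align down.
_idx = bisect.bisect_right(_sampled_cal, x.time()) - 1
print(pd.Timestamp.combine(x.date(), _sampled_cal[_idx]))  # 2021-06-01 10:35:00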
from pandas import Timestamp


def weather_adjustment(start, end, meter, basis_dates):
    '''
    Provides the gross adjustment factor for weather adjusted baselines

    Parameters:
        start (str) : A str coercible to Timestamp for the start of the event
        end (str) : A str coercible to Timestamp for the end of the event
        meter (dataframe): A dataframe consisting of datetime and load values
        basis_dates (list) : A list of dates

    Return:
        float : The weather adjustment factor
    '''
    start = Timestamp(start)
    end = Timestamp(end)
    # adjustment window: the two hours starting four hours before the event
    adj_hrs = list(range(start.hour - 4, start.hour - 2))
    # average usage in the adjustment window on the day of the event
    adj_usage = meter[(meter.date == start.date()) & (meter.hour.isin(adj_hrs))]
    adj_usage = adj_usage.groupby('hour').mean()
    adj_usage = adj_usage.mean()
    # average usage in the adjustment window over the basis days
    adj_basis = meter[(meter.date.isin(basis_dates)) & (meter.hour.isin(adj_hrs))]
    adj_basis = adj_basis.groupby('hour').mean()
    adj_basis = adj_basis.mean()
    return adj_usage / adj_basis
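# A toy run of weather_adjustment; the meter layout (date/hour/kW columns)
# matches what the function filters on, the load values are made up, and a
# pandas version (< 2.0) that drops non-numeric columns in groupby().mean()
# is assumed.
import pandas as pd

_hours = list(range(24))
_days = [pd.Timestamp("2021-07-01").date(), pd.Timestamp("2021-07-02").date()]
_meter = pd.DataFrame({
    "date": [d for d in _days for _ in _hours],
    "hour": _hours * len(_days),
    "kW": [100.0 + h for _ in _days for h in _hours],  # made-up load shape
})
_gaf = weather_adjustment(start="2021-07-02 14:00", end="2021-07-02 18:00",
                          meter=_meter, basis_dates=[_days[0]])
print(_gaf.kW)  # event-day vs. basis-day usage ratio over hours 10-11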
@classmethod
def is_finished_kdata_timestamp(cls, timestamp: pd.Timestamp, level: IntervalLevel):
    """
    :param timestamp: the timestamp that could be recorded in kdata of the level
    :type timestamp: pd.Timestamp
    :param level: the interval level of the kdata
    :type level: zvt.domain.common.IntervalLevel
    :return: whether the kdata bar at the timestamp is finished
    :rtype: bool
    """
    timestamp = pd.Timestamp(timestamp)
    for t in cls.get_interval_timestamps(timestamp.date(), timestamp.date(), level=level):
        if is_same_time(t, timestamp):
            return True
    return False
import pandas as pd

# PDF_PATH is a module-level pathlib.Path constant.


def make_report_files_path(report_name: str, date: pd.Timestamp):
    """Creates the report folder and returns the paths to the pdf and xlsx report files."""
    date = date.date()
    file_name = f"{report_name} {date}"
    report_folder = PDF_PATH / f"{file_name}"
    if not report_folder.exists():
        report_folder.mkdir(parents=True)
    return report_folder / f"{date}.pdf", report_folder / f"{date}.xlsx"
from typing import Tuple

import pandas as pd


def clip_time_range(self, start_time: pd.Timestamp,
                    end_time: pd.Timestamp) -> Tuple[pd.Timestamp, pd.Timestamp]:
    start_date = start_time.date()
    val_start = concat_date_time(start_date, self.start_time)
    val_end = concat_date_time(start_date, self.end_time)
    # NOTE: `end_date` should not be used, because the `end_date` is for slicing
    # and may fall in the next day.
    # Assumption: start_time and end_time are for intraday trading, so it is OK
    # to use only start_date.
    return max(val_start, start_time), min(val_end, end_time)
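# A worked illustration of the clipping above with concrete values, assuming
# self.start_time/self.end_time are datetime.time session bounds (09:30/15:00
# here) and that concat_date_time behaves like pd.Timestamp.combine.
import datetime

import pandas as pd

_session_start, _session_end = datetime.time(9, 30), datetime.time(15, 0)
_start, _end = pd.Timestamp("2021-06-01 08:00"), pd.Timestamp("2021-06-01 13:00")
_val_start = pd.Timestamp.combine(_start.date(), _session_start)
_val_end = pd.Timestamp.combine(_start.date(), _session_end)
# The requested range [08:00, 13:00] is clipped to [09:30, 13:00].
print(max(_val_start, _start), min(_val_end, _end))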
from concurrent import futures
from typing import Any, Dict, List

import apimoex
import pandas as pd


def _download_many(self, aliases: List[str], reg_date: pd.Timestamp) -> List[Dict[str, Any]]:
    # One thread per ticker; each fetches daily candles from the registration
    # date up to the last history date.
    with futures.ThreadPoolExecutor(max_workers=len(aliases)) as executor:
        rez = [
            executor.submit(
                apimoex.get_market_candles,
                self._session,
                ticker,
                start=reg_date.date(),
                end=self.LAST_HISTORY_DATE.date(),
            )
            for ticker in aliases
        ]
        data = []
        for future in rez:
            data.extend(future.result())
    return self._clean_up(data)
import pandas as pd

# `validate_data_is_type` and `validate_data_is_time_column` are validation
# helpers from the surrounding module.


def _adjust_date(column: pd.Series, start_from: pd.Timestamp):
    validate_data_is_type(column, pd.Series)
    validate_data_is_time_column(column)
    # Times at or after start_from's time-of-day get start_from's date;
    # earlier times are treated as belonging to the following day.
    pre_start = column[column.dt.time < start_from.time()]
    post_start = column[column.dt.time >= start_from.time()]
    post_start = pd.to_datetime(
        post_start.apply(lambda x: f'{start_from.date()} {x.time()}'))
    day_after = start_from.date() + pd.Timedelta(days=1)
    pre_start = pd.to_datetime(
        pre_start.apply(lambda x: f'{day_after} {x.time()}'))
    output = pd.concat((pre_start, post_start))
    assert output.shape == column.shape
    return output
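# What the rebasing does, on a tiny series: times at/after start_from's
# time-of-day keep its date, earlier times roll to the next day (this assumes
# the validate_* helpers above accept a plain datetime series).
import pandas as pd

_column = pd.Series(pd.to_datetime([
    "2000-01-01 23:00",  # >= 22:00 -> gets start_from's date
    "2000-01-01 01:00",  # < 22:00  -> rolled to the day after
]))
print(_adjust_date(_column, pd.Timestamp("2021-03-05 22:00")))
# -> 2021-03-06 01:00 and 2021-03-05 23:00 (pre-start times listed first)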
import logging

import pandas as pd


def isRebalanceTriggered(self, current_date: pd.Timestamp = None, log_flag: bool = True) -> bool:
    """Checks if a rebalance is triggered at the current backtesting date
    (indicated by `get_datetime`).

    Keyword Arguments:
        current_date {pd.Timestamp} -- Current date to use as an override (default: {None}).
        log_flag {bool} -- Flag for logging (default: {True}).

    Returns:
        bool -- True if rebalance is triggered, False otherwise.
    """
    # Override current date with sim date if not provided
    if current_date is None:
        current_date = get_datetime()

    # Checking if we are in the rebalance week
    if (self.reb_week_start < current_date.day <= self.reb_week_end):
        # Flag if the specific weekday matches
        # (day_name() replaces the removed Timestamp.weekday_name attribute)
        is_triggered = (current_date.day_name() == config.rebalance_trigger['day'])

        # Handling the wildcard day '*' (trigger at most once per month)
        if ((config.rebalance_trigger['day'] == '*')
                and (self.last_month_rebalance != current_date.month)):
            # Set flag to true and remember this month's rebalance
            is_triggered = True
            self.last_month_rebalance = current_date.month

        # Log and return
        if is_triggered and log_flag:
            logging.info('ETF Portfolio rebalance triggered on {0} ({1})'
                         .format(current_date.day_name(), current_date.date()))
        return is_triggered

    # Not in the rebalance week
    return False
import datetime
import io
import logging

import numpy
import pandas
import requests

_log = logging.getLogger(__name__)


def get_data_FR(run_date: pandas.Timestamp) -> pandas.DataFrame:
    """ Retrieve daily CSV from
    https://www.data.gouv.fr/fr/datasets/donnees-relatives-aux-resultats-des-tests-virologiques-covid-19
    for all French regions.

    Limitations:
    * Data by regions only contain tests for which residence regions of tested
      people could be known. Hence, countrywide data contain more tests than the
      sum of all regions.
    * Data transmission can sometimes exceed 9 days. Indicators are updated
      daily on test results reception.

    Parameters
    ----------
    run_date : pandas.Timestamp
        use the data as it was released on that day

    Returns
    -------
    result : pandas.DataFrame
        [region, date]-indexed table that has rows for every region & date
        combination in [2020-05-13, run_date - 1]. Contains columns "new_cases"
        and "new_tests" that are the number of NEW positives / total tests for
        each (day, region) couple. The "all" region is the sum over all regions.
    """
    if run_date.date() > datetime.date.today():
        raise ValueError("Run date is in the future. Nice try.")
    if run_date.date() < datetime.date.today():
        # TODO: implement downloading of historic data
        raise NotImplementedError(
            "Downloading with a run_date is not yet supported. "
            f"Today: {datetime.date.today()}, run_date: {run_date}")

    content = requests.get(
        "https://www.data.gouv.fr/fr/datasets/r/001aca18-df6a-45c8-89e6-f82d689e6c01",
        verify=False,
    ).content
    data = pandas.read_csv(
        io.StringIO(content.decode("utf-8")),
        sep=";",
        dtype={"reg": str},
        parse_dates=["jour"],
        usecols=["reg", "jour", "P", "T", "cl_age90"],
    ).rename(columns={
        "reg": "region",
        "jour": "date",
        "cl_age90": "ageclass",
        "P": "new_cases",
        "T": "new_tests",
    })

    # Drop data by age class (the '0' age class is the sum of all age classes)
    # and truncate data after run_date.
    data = (
        data[data.ageclass == 0]
        .drop("ageclass", axis=1)
        .set_index("date")
        .sort_index()
        .truncate(after=run_date - pandas.DateOffset(1))
        .reset_index()
        .set_index(["region", "date"])
        .sort_index()
    )

    # Compute and append national data, and restrict to existing regions to get
    # rid of data errors that creep in from the original link
    # (pandas.concat replaces the removed DataFrame.append).
    df_all = data.reset_index(level=1).groupby("date").sum().reset_index()
    df_all["region"] = "all"
    true_region_codes = get_regions_metadata()[0].keys()
    data = (
        pandas.concat([data, df_all.set_index(["region", "date"])])
        .loc[true_region_codes]
        .sort_index()
    )

    assert isinstance(data, pandas.DataFrame)
    assert data.index.names == ("region", "date")
    assert "new_cases" in data.columns, f"Columns were: {data.columns}"
    assert "new_tests" in data.columns, f"Columns were: {data.columns}"
    for col in ["new_cases", "new_tests"]:
        if any(data[col] < 0):
            _log.warning(
                f"Column {col} has {sum(data[col] < 0)} negative entries!! Overriding with NaN..."
            )
            data.loc[data[col] < 0, col] = numpy.nan
    return data
from random import randint

from pandas import Timestamp

# Excerpt from an ID-number generator: `expiry_year`, `verbosity`, `authority`,
# `number`, `birth_date_ts`, `nationality` and `generateChecksum` are defined
# earlier in the original script.
expiry_month = randint(1, 12)
daysInMonth = Timestamp(1990, expiry_month, 1).daysinmonth
expiry_day = randint(1, daysInMonth)
expiry_date_ts = Timestamp(expiry_year, expiry_month, expiry_day)
if verbosity > 1:
    print('no expiry_date given, generated expiry_date_ts is "{}"'.format(
        expiry_date_ts))
if verbosity > 1:
    print('')
if verbosity > 0:
    print('Authority is {}.'.format(authority))
    print('Number is {}'.format(number))
    print('Date of birth is {}'.format(birth_date_ts.date()))
    print('Date of expiry is {}'.format(expiry_date_ts.date()))
    print('Nationality is {}'.format(nationality))
    print('')

blocks = [
    authority + number,
    birth_date_ts.strftime('%y%m%d'),
    expiry_date_ts.strftime('%y%m%d'),
    nationality
]
# blocks = ['T220001293', '6408125', '2010315', 'D']
for i, block in enumerate(blocks):
    if len(block) > 3:
        blocks[i] += str(generateChecksum(block))
def ts2str(ts: pd.Timestamp) -> str:
    return str(ts.date())
from pandas import Timestamp, Timedelta, date_range

# `get_holidays` and `perf_calc` are helpers from the same module;
# `weather_adjustment` is defined above.


def nyiso_cbl(meter, event_start, event_end, look_back, event_type='weekday'):
    '''
    Calculates the NYISO customer baseline (CBL) given the input parameters

    Parameters:
        meter (dataframe): A dataframe consisting of datetime and load values
        event_start (str) : A str coercible to Timestamp for the start of the event
        event_end (str) : A str coercible to Timestamp for the end of the event
        look_back (int) : An integer specifying the number of days to look back
        event_type (str) : A string specifying the type of event (weekday, sunday, saturday)

    Returns:
        tuple : A tuple of dataframes which give the baseline and the
                performance for the event hours
    '''
    start = Timestamp(event_start)
    end = Timestamp(event_end)
    event_hours = date_range(start, end, freq='H').hour.tolist()
    event_hours = event_hours[:-1]  # accounting for hour ending

    # get max lookback days
    window_start = start.date() - Timedelta(look_back, unit='days')
    datelist = date_range(window_start, periods=look_back).date.tolist()
    data = meter[meter.date.isin(datelist)]

    # TODO: weekend cbl logic
    if event_type == 'weekday':
        days = list(range(1, 6))
    if event_type == 'saturday':
        days = [6]
    if event_type == 'sunday':
        days = [7]

    # get the seed values
    seed_data = data[data.hour.isin(event_hours)]
    seed_data = seed_data[seed_data['date'] != start.date()]
    seed_data = seed_data.groupby(['date', 'hour']).mean().reset_index()
    seed_value = seed_data['kW'].max() * 0.25

    # identify the low usage days and days of the wrong weekday type
    low_usage = seed_data.groupby(['date']).mean()
    low_usage_dates = low_usage[low_usage.kW < seed_value].index.tolist()
    rm_day = [d for d in seed_data.date.to_list() if not d.isoweekday() in days]
    rm_day = list(set(rm_day))

    # get dates and holidays to exclude
    exclude = get_holidays(start.year)
    exclude.extend(low_usage_dates)
    exclude.extend([start.date() - Timedelta(1, unit='day')])
    exclude.extend(rm_day)

    # get cbl basis days (at most the 10 most recent eligible days)
    max_days = seed_data.date.unique().tolist()
    days_to_keep = [d for d in max_days if d not in exclude]
    days_to_keep.sort(reverse=True)
    if len(days_to_keep) > 10:
        cbl_basis = days_to_keep[:10]
    else:
        cbl_basis = days_to_keep

    # get averages and rank them, pick the top 5 of the averages
    averages = seed_data.groupby('date').mean()
    averages = averages[averages.index.isin(cbl_basis)]
    averages['rank'] = averages['kW'].rank(ascending=False)
    baseline_dates = averages[averages['rank'] <= 5].index.tolist()

    # calculate baseline as average of the hours for the selected days
    baseline = data[data.date.isin(baseline_dates)]
    baseline = baseline.groupby('hour').mean()

    # actual values during event day
    event_day = meter[meter.dttm >= start.floor('24H')]
    event_day = event_day[event_day.dttm < start.ceil('24H')]
    event_day = event_day.groupby(['id', 'hour']).mean().reset_index()
    event_day['baseline'] = baseline.kW

    # get adjustment factor
    gaf = weather_adjustment(start=start, end=end, meter=meter,
                             basis_dates=cbl_basis)

    # get the adjusted baseline
    event_day['adjustment'] = event_day.baseline * gaf.kW

    # calculate the event performance per hour
    perf = perf_calc(event_day, event_hours)
    return event_day, perf
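# The CBL day-selection step in isolation: keep up to the 10 most recent
# eligible days, then pick the 5 with the highest average load. The per-day
# averages below are made up.
import pandas as pd

_averages = pd.Series(
    [52.0, 48.0, 61.0, 45.0, 57.0, 50.0, 49.0, 63.0, 44.0, 55.0, 40.0, 70.0],
    index=pd.date_range("2021-07-01", periods=12).date,
)
_cbl_basis = sorted(_averages.index, reverse=True)[:10]  # 10 most recent days
_ranked = _averages.loc[_cbl_basis].rank(ascending=False)
print(_ranked[_ranked <= 5].index.tolist())  # the 5 highest-usage basis days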
import datetime
import io
import logging

import numpy
import pandas
import requests

_log = logging.getLogger(__name__)


def get_data_BE(run_date: pandas.Timestamp) -> pandas.DataFrame:
    """ Retrieve daily (run_date) regions and append national data (key 'all') to it.

    Parameters
    ----------
    run_date : pandas.Timestamp
        date for which the data shall be downloaded

    Returns
    -------
    df : pandas.DataFrame
        table with columns as required by rtlive/data.py API
    """

    def redistribute(group: pandas.DataFrame, col: str) -> pandas.Series:
        # Spread the 'Nan' region's counts over the three real regions in
        # proportion to their shares (the shared denominator is computed from
        # the original values, before any region is updated).
        gdata = group.groupby('REGION')[col].sum()
        total = gdata.loc['Brussels'] + gdata.loc['Flanders'] + gdata.loc['Wallonia']
        gdata.loc['Brussels'] += gdata.loc['Nan'] * (gdata.loc['Brussels'] / total)
        gdata.loc['Flanders'] += gdata.loc['Nan'] * (gdata.loc['Flanders'] / total)
        gdata.loc['Wallonia'] += gdata.loc['Nan'] * (gdata.loc['Wallonia'] / total)
        gdata.drop(index='Nan', inplace=True)
        gdata = gdata.fillna(0).round(0).astype(int)
        return gdata

    if run_date.date() > datetime.date.today():
        raise ValueError('Run date is in the future. Nice try.')
    if run_date.date() < datetime.date.today():
        # TODO: implement downloading of historic data
        raise NotImplementedError(
            'Downloading with a run_date is not yet supported. '
            f'Today: {datetime.date.today()}, run_date: {run_date}')

    # Download data from Sciensano
    content = requests.get(
        'https://epistat.sciensano.be/Data/COVID19BE_tests.csv',
        verify=False,
    ).content
    df_tests = pandas.read_csv(
        io.StringIO(content.decode('utf-8')),
        sep=',',
        parse_dates=['DATE'],
        usecols=['DATE', 'REGION', 'PROVINCE', 'TESTS_ALL_POS', 'TESTS_ALL'],
    ).rename(columns={'DATE': 'date'})

    # Reformat data into the Rtlive.de format at country level ('all')
    df_tests_per_all_day = (
        df_tests
        .assign(region='all')
        .groupby('date', as_index=True)
        .agg(new_cases=('TESTS_ALL_POS', 'sum'),
             new_tests=('TESTS_ALL', 'sum'),
             region=('region', 'first'))
    )
    df_tests_per_all_day = (
        df_tests_per_all_day.reset_index().set_index(['region', 'date']).sort_index()
    )

    # Redistribute the NaNs in TESTS_ALL_POS over Flanders, Wallonia and Brussels
    df_tests_positive = (
        df_tests.fillna('Nan')
        .groupby(['date'])
        .apply(redistribute, 'TESTS_ALL_POS')
        .stack()
        .reset_index()
        .rename(columns={'REGION': 'region', 0: 'new_cases'})
    )

    # Redistribute the NaNs in TESTS_ALL over Flanders, Wallonia and Brussels
    df_tests_all = (
        df_tests.fillna('Nan')
        .groupby(['date'])
        .apply(redistribute, 'TESTS_ALL')
        .stack()
        .reset_index()
        .rename(columns={'REGION': 'region', 0: 'new_tests'})
    )

    # Combine the total number of tests and the number of positive tests into a basetable
    df_tests_per_region_day = pandas.concat(
        [df_tests_all, df_tests_positive['new_cases']], axis=1
    ).set_index(['region', 'date'])

    # Tests per province (ignore the NaNs for the moment)
    df_tests_per_province_day = (
        df_tests[df_tests['REGION'] != 'Brussels']
        .groupby(['PROVINCE', 'date'], as_index=False)
        .agg(new_cases=('TESTS_ALL_POS', 'sum'), new_tests=('TESTS_ALL', 'sum'))
        .rename(columns={'PROVINCE': 'region'})
        .set_index(['region', 'date'])
    )
    df_tests_per_province_day.index.names = ['region', 'date']

    # Combine the results at country level with region level
    data = pandas.concat([
        df_tests_per_all_day, df_tests_per_region_day, df_tests_per_province_day
    ], axis=0).sort_index()
    data.index = data.index.set_levels(
        data.index.levels[0].map(BE_REGION_INPUT_ABBR.get), 'region')

    assert isinstance(data, pandas.DataFrame)
    assert data.index.names == ('region', 'date')
    assert 'new_cases' in data.columns, f'Columns were: {data.columns}'
    assert 'new_tests' in data.columns, f'Columns were: {data.columns}'
    for col in ['new_cases', 'new_tests']:
        if any(data[col] < 0):
            _log.warning(
                f'Column {col} has {sum(data[col] < 0)} negative entries!! Overriding with NaN...'
            )
            data.loc[data[col] < 0, col] = numpy.nan
    return data
import pandas as pd


def print_header_terminal(date: pd.Timestamp):
    # Prints a "Calculando dia <date>" ("calculating day") header to the terminal.
    text = "Calculando dia {}".format(date.date())
    print("=" * len(text))
    print(text)
import io

import numpy as np
import pandas as pd


def download(self, start_date: pd.Timestamp = None, end_date: pd.Timestamp = None) -> int:
    # refresh cache
    if start_date is not None:
        start_date = pd.Timestamp(start_date)
    start_date = start_date or self.min_date()
    cache = dict()
    for cfg in self.config:
        symbol = cfg.download_cfg.symbol
        params = dict(
            symbol=symbol,
            data="daily",
            maxrecords=np.busday_count(start_date.date(), pd.Timestamp.today().date()) + 1,
            volume="contract",
            order="asc",
            dividends="false",
            backadjust="false",
            daystoexpiration=1,
            contractroll="expiration",
        )
        self.logger.info(
            f"Downloading {symbol} from {self.__class__.__name__} {self.name()}"
        )
        resp = self.http_get(
            "https://www.barchart.com/proxies/timeseries/queryeod.ashx",
            params=params,
        )
        df_barchart = pd.read_csv(io.StringIO(resp.data.decode('utf-8')), header=None)
        df_barchart.columns = [
            "symbol", "as_of", "open", "high", "low",
            TypeColumn.close.value, "volume", "oi",
        ][:len(df_barchart.columns)]
        df_barchart.as_of = pd.to_datetime(df_barchart.as_of)
        df_barchart = df_barchart[df_barchart.as_of >= start_date]
        df = df_barchart.loc[:, ["open", "high", "low", TypeColumn.close.value, "as_of"]]

        # Store OHLC: pivot df so the OHLC values are split by row.
        df_melt = df.melt(id_vars="as_of", var_name="type", value_name="price")
        df_melt['market'] = self.name()
        for column_name, column_value in cfg.commodity_cfg.__dict__.items():
            df_melt[column_name] = column_value
        product = cfg.download_cfg.product
        df_melt['product'] = product
        expiry = cfg.download_cfg.expiry
        if expiry is not None:
            maturity = pd.to_datetime(expiry)
            df_melt['offset'] = (product_to_date(maturity, product)
                                 - product_to_date(df_melt.as_of.dt, product))
        else:
            # If there is no maturity, it is assumed to be a stock or a spot value.
            df_melt['offset'] = 0
        cache[symbol] = df_melt
    concat_df = pd.concat(cache.values(), axis=0)
    cache_df = pd.pivot_table(concat_df, values="price", index="as_of",
                              columns=df_index_columns)
    self.cache = cache_df
    return super().download(start_date, end_date)
import logging

import pandas as pd
import pyEX
import pytz

log = logging.getLogger(__name__)


def _minute_dataframe_for_date(self, ticker: str,
                               start_timestamp: pd.Timestamp) -> pd.DataFrame:
    ret_df = pd.DataFrame()
    df = pyEX.chartDF(ticker, timeframe='1d', date=start_timestamp)
    if df.empty:
        return ret_df
    df = df.reset_index()
    df['volume'] = df['volume'].astype('int')
    df['date'] = df['date'].astype('str')
    df['minute'] = df['minute'].astype('str')
    df['datet'] = df['date'] + ' ' + df['minute']
    df['dividend'] = 0.0
    df['split'] = 1.0
    # Keep only the market-wide columns; drop the per-exchange ones.
    df.drop([
        'date', 'minute', 'average', 'changeOverTime', 'close', 'high',
        'label', 'low', 'marketAverage', 'marketChangeOverTime',
        'marketNotional', 'marketNumberOfTrades', 'notional',
        'numberOfTrades', 'open', 'volume'
    ], axis=1, level=None, inplace=True, errors='ignore')
    df.rename(columns={
        'datet': 'date',
        'marketClose': 'close',
        'marketHigh': 'high',
        'marketLow': 'low',
        'marketOpen': 'open',
        'marketVolume': 'volume'
    }, inplace=True)
    df.date = pd.to_datetime(df.date, errors='coerce', utc=False,
                             infer_datetime_format=True)
    df = df[~df.date.isnull()]
    df.set_index('date', drop=True, append=False, inplace=True,
                 verify_integrity=True)
    # Localize the naive Eastern timestamps, then convert to UTC.
    utc = pytz.utc
    nytz = pytz.timezone('US/Eastern')
    df = df.tz_localize(nytz, axis=0, level=None, copy=False, ambiguous='raise')
    df.index = df.index.tz_convert(utc)
    if not (pd.Series(['close', 'high', 'low', 'open']).isin(df.columns).all()):
        log.info(
            "Skipping {0} for {1}, not all columns ({2}) received".format(
                ticker, start_timestamp.date(), str(df.columns)))
        return ret_df
    df = self._fixna(df, ticker)
    df.index = df.index.tz_convert(None)
    # Re-arrange the columns.
    ret_df = df[self._cols]
    return ret_df