def get_dates_or_ytd(cls, self, fpath, dt):
    """Work out whether specific dates or a full YTD pull is required.

    Reads the parquet file at ``fpath`` and records the outcome on ``self``:

    * empty frame: the file is deleted and ``self.need_ytd`` is set to True
      (``self.df`` is not assigned in this case);
    * newest ``date`` equals ``dt``: ``self.need_data`` is set to False;
    * otherwise the business days up to ``dt`` that are absent from the file
      are collected as ``'%Y%m%d'`` strings. More than 10 of them sets
      ``self.need_ytd``; 10 or fewer are stored on ``self.dts_need``.

    ``cls`` is accepted but not used by the body.
    """
    local_df = pd.read_parquet(fpath)

    # Nothing usable on disk: discard the file and fall back to YTD
    if local_df.empty:
        os.remove(fpath)
        self.need_ytd = True
        return

    # Local data already reaches the requested date, nothing to fetch
    if local_df['date'].max() == dt:
        self.need_data = False
        self.df = local_df
        return

    # Business-day calendar for this year, trimmed to dates on/before dt
    calendar = getDate.get_bus_days(testing=False, this_year=True)
    calendar = calendar[calendar['date'].dt.date <= dt].copy(deep=True)

    already_stored = calendar['date'].isin(local_df['date'])
    absent_stamps = calendar['date'][~already_stored]
    dts_need = []
    for stamp in absent_stamps:
        dts_need.append(stamp.date().strftime('%Y%m%d'))

    if self.testing:
        self.class_print(dts_need)

    # Too many gaps to fill one by one: request the whole year instead
    if len(dts_need) > 10:
        self.need_ytd = True
    else:
        self.dts_need = dts_need

    self.df = local_df
def scraped_ee_dates(verbose=False, hist=False, current_year=True):
    """Run ScrapedEE for every candidate business day lacking a local file.

    Candidate days are chosen from the business-day calendar by their
    ``bday_diff`` value (computed by ``getDate.get_bus_day_diff``):

    * ``hist`` falsy: ``bday_diff`` between 0 and 15 inclusive;
    * ``hist`` and ``current_year``: ``bday_diff`` < 15 and same year as
      the ``iex_eod`` date;
    * ``hist`` and not ``current_year``: ``bday_diff`` < 15.

    A day is scraped only when its expected parquet path does not exist.
    Failures are reported through ``help_print_arg`` and do not stop the
    loop. With ``verbose`` the list of scraped dates is printed at the end.
    """
    dt = getDate.query('iex_eod')

    # With 15 or fewer days left in the year, use the multi-year calendar
    # and keep only this year onwards
    days_left_in_year = 365 - dt.timetuple().tm_yday
    if days_left_in_year > 15:
        bdays = getDate.get_bus_days(this_year=True)
    else:
        bdays = getDate.get_bus_days(this_year=False)
        bdays = bdays[bdays['date'].dt.year >= dt.year].copy()

    bdays['current_date'] = pd.to_datetime(getDate.query('iex_close'))
    bdays['bday_diff'] = getDate.get_bus_day_diff(
        bdays, 'current_date', 'date')

    if not hist:
        keep = bdays['bday_diff'].between(0, 15)
    elif current_year:
        keep = (bdays['bday_diff'] < 15) & (bdays['date'].dt.year == dt.year)
    else:
        keep = bdays['bday_diff'] < 15
    pos_days = bdays[keep].copy()

    bpath = Path(baseDir().path, 'economic_data', 'analyst_earnings')
    fpath_dir = bpath.joinpath(f"_{str(dt.year)}")

    def _expected_fpath(row):
        return f"{fpath_dir}/_{str(row['date'].date())}.parquet"

    pos_days['fpath'] = pos_days.apply(_expected_fpath, axis=1)
    pos_days['fpath_exists'] = pos_days['fpath'].astype(str).map(
        os.path.exists)
    dt_need = pos_days[~pos_days['fpath_exists']]

    dt_list = []
    for stamp in dt_need['date']:
        try:
            ScrapedEE(dt=stamp.date())
            # Random pause between requests
            sleep(randint(5, 15))
            dt_list.append(stamp.date())
        except Exception as e:
            help_print_arg(f"scraped_ee_dates {type(e)} {str(e)}")

    if verbose:
        help_print_arg(str(dt_list))
def get_missing_dates(df):
    """Return this year's business days that do not appear in ``df['date']``.

    ``df`` must have a ``date`` column. The result is the slice of the
    business-day calendar (dates on or before ``getDate.query('iex_eod')``)
    whose dates are absent from ``df``, with an extra ``dt_format`` column
    holding each date as a ``'%Y%m%d'`` string.
    """
    calendar = getDate.get_bus_days(this_year=True)
    cutoff = getDate.query('iex_eod')

    on_or_before_cutoff = calendar['date'].dt.date <= cutoff
    calendar = calendar[on_or_before_cutoff].copy()

    dates_present = df['date'].unique()
    is_absent = ~calendar['date'].isin(dates_present)

    dts_missing = calendar[is_absent].copy()
    dts_missing['dt_format'] = dts_missing['date'].dt.strftime('%Y%m%d')
    return dts_missing
def get_last_30_intradays():
    """Queue 'iex_intraday' tasks for recent business days with no m1 data.

    Looks at the business days falling within the trailing 30 calendar days
    (ending at ``getDate.query('iex_eod')``), drops the ones already present
    in the ``iex_intraday_m1`` dataset, and submits one
    ``execute_func.delay('iex_intraday', dt=...)`` call per remaining day.

    If ``app.tasks`` cannot be imported nothing is queued; the failure is
    reported through ``help_print_arg`` instead of being silently ignored.
    """
    bsdays = getDate.get_bus_days()
    dt_today = getDate.query('iex_eod')
    dt_30 = dt_today - timedelta(days=30)

    day_values = bsdays['date'].dt.date
    in_window = (day_values >= dt_30) & (day_values <= dt_today)
    days = bsdays[in_window]

    df_m1 = serverAPI('iex_intraday_m1').df
    already_stored = days['date'].isin(df_m1['date'].unique())
    days_tget = days[~already_stored].copy()

    # The try deliberately spans the loop as well as the import, so the
    # set of swallowed errors is exactly what it was before.
    try:
        from app.tasks import execute_func
        for dt in days_tget['date']:
            execute_func.delay('iex_intraday', dt=dt)
    except ModuleNotFoundError as e:
        # Best effort: task queue not available in this environment
        help_print_arg(f"get_last_30_intradays: {str(e)}")
def get_missing_sec_master_idx(sma_df=False):
    """Fetch sec master index files for business days not yet in the index.

    Parameters
    ----------
    sma_df : pandas.DataFrame or False
        Master index of all dates, with a ``date`` column that
        ``pd.to_datetime(..., unit='ms')`` can convert. When anything other
        than a DataFrame is passed, the frame is loaded from
        ``serverAPI('sec_master_all')``. The passed frame is not modified.

    For every business day of this year (on or before
    ``getDate.query('iex_eod')``) that is absent from the index,
    ``secMasterIdx(hist_date='YYYYMMDD')`` is called. Errors for a single
    date are reported through ``help_print_arg`` and the loop continues.
    """
    if not isinstance(sma_df, pd.DataFrame):
        sma_df = serverAPI('sec_master_all').df

    # Convert into a local instead of writing back into the (possibly
    # caller-owned) frame. Keep the values datetime-like: calling .tolist()
    # on a NumPy datetime64[ns] array yields plain integers, which isin()
    # on a datetime column may not match.
    known_dates = pd.to_datetime(sma_df['date'], unit='ms').unique()

    bus_days = getDate.get_bus_days(this_year=True)
    dt = getDate.query('iex_eod')
    bus_days = bus_days[bus_days['date'].dt.date <= dt].copy()

    dts_missing = bus_days[~bus_days['date'].isin(known_dates)].copy()
    dts_missing['dt_format'] = dts_missing['date'].dt.strftime('%Y%m%d')

    for hist_date in tqdm(dts_missing['dt_format']):
        try:
            secMasterIdx(hist_date=hist_date)
            # Short pause between consecutive requests
            sleep(.5)
        except Exception as e:
            msg = f"get_missing_sec_master_idx: {str(e)}"
            help_print_arg(msg)
def _find_dated_parquet(fpath_dir, f_pre, f_suf, dt, this_year):
    """Return the newest existing date-stamped parquet path, or None."""
    bus_days = getDate.get_bus_days(this_year=this_year)
    eligible = (bus_days[bus_days['date'].dt.date <= dt]
                .sort_values(by=['date'], ascending=False))
    stamps = list(eligible['date'])

    # File-name stems are tried in this priority order; every date is
    # checked for one stem style before moving on to the next style.
    stem_styles = (
        lambda ts: f"_{ts.date()}",                       # _YYYY-MM-DD
        lambda ts: f"_{ts.year}",                         # _YYYY
        lambda ts: f"_{ts.date().strftime('%Y%m%d')}",    # _YYYYMMDD
    )
    for make_stem in stem_styles:
        for ts in stamps:
            candidate = Path(
                fpath_dir, f"{f_pre}{make_stem(ts)}{f_suf}.parquet")
            if candidate.exists():
                return candidate
    return None


def get_most_recent_fpath(fpath_dir, f_pre='', f_suf='', dt='',
                          this_year=True, second=False):
    """Get the most recent parquet path in a directory.

    Parameters
    ----------
    fpath_dir : path-like
        Directory to search.
    f_pre, f_suf : str
        Text placed before / after the date stamp in the file name
        (``{f_pre}_<date>{f_suf}.parquet``).
    dt : date-like, optional
        Latest date to consider. When falsy, ``getDate.query('iex_close')``
        is used.
    this_year : bool
        Forwarded to ``getDate.get_bus_days`` for the first search.
    second : bool
        Internal flag marking the retry; when True no further retry is made.

    Returns
    -------
    pathlib.Path or False
        The newest matching file. If the first search finds nothing, the
        search is retried once with ``this_year=False`` using the same
        prefix, suffix and date. If that also fails, the last ``*.parquet``
        file of the directory in sorted name order is returned, or False
        when the directory holds no parquet files.
    """
    if not dt:
        # No date passed: default to the iex_close date
        dt = getDate.query('iex_close')

    found = _find_dated_parquet(fpath_dir, f_pre, f_suf, dt, this_year)
    if found:
        return found

    if not second:
        # Retry over the wider calendar, keeping the caller's prefix,
        # suffix and date so the same file names are searched. The retry
        # also performs the glob fallback, so its result is final.
        path_to_return = get_most_recent_fpath(
            fpath_dir, f_pre=f_pre, f_suf=f_suf, dt=dt,
            this_year=False, second=True)
        if path_to_return:
            help_print_arg(f"get_most_recent_fpath: first failed. Returning {str(path_to_return)}")
        return path_to_return

    # Last resort: any parquet file in the directory.
    msg_1 = "Directory empty or path doesn't follow format '_dt.parquet'. Returning first path"
    msg_2 = f": {fpath_dir}"
    help_print_arg(f"{msg_1} {msg_2}")

    # glob() yields entries in arbitrary order; sort so the choice is
    # deterministic. The log text above is left unchanged even though the
    # last entry is the one returned.
    paths = sorted(Path(fpath_dir).glob('*.parquet'))
    if paths:
        return paths[-1]
    return False