def holiday_column(airline_df, start='2014-01-01', end='2018-12-31'): '''Takes airline df and returns df with column indicating if date of flight is a US Federal Holiday''' #Pull Holiday Dates and convert to Spark DF with timestamp column cal = USFederalHolidayCalendar() holidays = cal.holidays(start, end).to_pydatetime() holidays_df = pd.DataFrame(pd.DataFrame(holidays)[0].astype('string')) schema = StructType([StructField('Holiday_Date', StringType())]) holidays_sc = spark.createDataFrame(holidays_df, schema) holidays_sc = holidays_sc.select( to_timestamp(holidays_sc.Holiday_Date, 'yyyy-MM-dd').alias('holiday_date')) #Join holidays to airlines holiday_joined_df = airline_df.join( holidays_sc, (airline_df.fl_date == holidays_sc.holiday_date), 'left') #Change date column to binary holiday_joined_df = holiday_joined_df.withColumn( "Holiday", (f.col('holiday_date').isNotNull()).cast("integer")) #Drop redundant holiday_date column holiday_joined_df = holiday_joined_df.drop(holiday_joined_df.holiday_date) return holiday_joined_df
def __init__( self, path: str = None, data: pd.DataFrame = pd.DataFrame(), holdings: pd.DataFrame = pd.DataFrame(), ): """Initialize the portfolio with holdings and market data. Args: path: The name of an a directory containing holdings and market data. If this is None then the portfolio must be described by the data and holdings arguments. data: A DataFrame containing market close data for a set of financial instruments over a period of time. holdings: A DataFrame containing the number of shares held of the set of symbols in data over a time period corresponding to that of data. """ if len(data) > 0 and len(holdings) > 0: logger.debug("Data and holdings arguments are set.") self.data = data self.holdings = holdings else: logger.debug("Data and holdings are not set.") if path is None: path = Path().home() / ".portfolio" / "data" self.path = Path(path) if not self.path.is_dir(): logger.info("%s is not a directory, creating it...", self.path) self.path.mkdir() self.data_file = self.path / "prices.feather" self.holdings_file = self.path / "holdings.feather" try: # The feather format does not support date indices, # so set the Date colemn to be the index. self.holdings = pd.read_feather( self.holdings_file).set_index("date") except FileNotFoundError: logger.info("No stored holdings found.") self.holdings = holdings symbols = self.holdings.columns try: # The feather format does not support date indices, # so set the Date colemn to be the index. self.data = pd.read_feather(self.data_file).set_index("date") except FileNotFoundError: logger.info("Market data is not stored .") calendar = USFederalHolidayCalendar() today = pd.Timestamp.today().normalize() last_business_day = min(today - BDay(1), self.data.index.max() + BDay(1)) holidays = calendar.holidays(last_business_day, today) if (today not in self.data.index and last_business_day not in self.data.index and holidays.empty): self.data = self.get_market_data( symbols, start=last_business_day, end=today, ) self.holdings = self.holdings.reindex(self.data.index, method="ffill").dropna()
def pandas_holidays(): from pandas.tseries.holiday import USFederalHolidayCalendar cal = USFederalHolidayCalendar() holidays = cal.holidays(start='2014-01-01', end='2014-12-31').to_pydatetime() if datetime.datetime(2014, 1, 1) in holidays: print(True)
def calc_holidays(start_date=datetime.datetime(2012, 10, 1), end_date=datetime.datetime.now()): cal = USFederalHolidayCalendar() holidays = np.ones(cal.holidays().shape[0]) holidays = pd.Series(holidays, index=cal.holidays()).resample("D").fillna(0.0) h = holidays[start_date:end_date].values h = np.where(h)[0] - 1 return h
def near_holiday(self): calendar = USFederalHolidayCalendar() holidays = calendar.holidays() df1 = self.df_raw[['id', 'date_account_created']] df1.date_account_created = pd.to_datetime(df1.date_account_created) df1.date_account_created = pd.to_datetime(df1.date_account_created.dt.date) df1['near_holiday'] = (df1.date_account_created.isin(holidays + timedelta(days=1)) | df1.date_account_created.isin(holidays - timedelta(days=1))) return df1[['id', 'near_holiday']]
def get_holiday_list(start_yr, start_mon, start_day, end_yr, end_mon, end_day): calendar = USFederalHolidayCalendar() holiday_list = calendar.holidays(datetime.datetime(start_yr, 11, 1), datetime.datetime(end_yr, 5, 1)) holidays = [] for holiday in holiday_list: holidays.append(holiday) return holidays
def isHoliday(d): cal = USFederalHolidayCalendar() y_str = d.strftime("%Y") holidays = cal.holidays(start=y_str+'-01-01', end=y_str+'-12-31').to_pydatetime() if d in holidays: return 1 else: return 0
def holiday_lists(): cal = USFederalHolidayCalendar() holidays = cal.holidays(start='2021-01-01', end='2021-12-31').to_pydatetime() hs = [] for h in holidays: hs.append(h.strftime("%Y/%m/%d")) return hs
def next_business_day(date): cal = USFederalHolidayCalendar() holidays = cal.holidays() date = date + BDay(1) date = date.to_pydatetime() while date in holidays: date = date + BDay(1) date = date.to_pydatetime() return date
def __init__(self,epsPath,pricePath,daysNeed = 3): self.raw_eps = pd.read_csv(epsPath) self.price = pd.read_csv(pricePath) self.daysNeed = daysNeed self.__getEps() minDate = str(min(self.epsBefore['DATE'],self.epsAfter['DATE'])) maxDate = str(max(self.epsBefore['DATE'],self.epsAfter['DATE'])) cal = USFederalHolidayCalendar() self.holidays = cal.holidays(start=minDate, end=maxDate).to_pydatetime()
def judgeOpenDaysInRange(from_date, to_date): cal = USFederalHolidayCalendar() holidays = cal.holidays(from_date, to_date) duedays = pd.bdate_range(from_date, to_date) df = pd.DataFrame() df['date'] = duedays df['holiday'] = duedays.isin(holidays) opendays = df[df['holiday'] == False] return opendays
def create_ts_calendar(df, ts_settings, additional_events=None): """ df: pandas df ts_settings: dict Parameters for time series project additional_events: pandas df(optional) df of additional events to add to calendar Returns: -------- Calendar of events """ date_col = ts_settings['date_col'] cal = USFederalHolidayCalendar(AbstractHolidayCalendar) black_friday = Holiday( "Black Friday", month=11, day=1, offset=[pd.DateOffset(weekday=TH(4)), pd.DateOffset(1)]) cyber_monday = Holiday( "Cyber Monday", month=11, day=1, offset=[pd.DateOffset(weekday=TH(4)), pd.DateOffset(4)]) cal.rules.append(black_friday) cal.rules.append(cyber_monday) cal.rules[9] = Holiday('Christmas', month=12, day=25) cal.rules[4] = Holiday('July 4th', month=7, day=4) events = pd.DataFrame( cal.holidays( start=pd.to_datetime(df[date_col].min()), end=pd.to_datetime(df[date_col].max()) + dt.timedelta(days=365), return_name=True, )) events = events.reset_index() events.columns = ['Date', 'Event'] if additional_events is not None: assert additional_events.shape[ 1] == 2, 'additional events must be a df with 2 columns' additional_events.columns = ['Date', 'Event'] additional_events['Date'] = pd.to_datetime(additional_events['Date']) events = events.append(additional_events) events['Date'] = [ dt.datetime.strftime(pd.to_datetime(date), '%Y-%m-%d') for date in events['Date'] ] return events.drop_duplicates().sort_values(by='Date').reset_index( drop=True)
def get_path(self, initial_values, t_start, t_end, freq, interest_rate, seed=None): # check initial value os the right size and dimension #print("getting path " , freq.) # set up parameters of the pathway model covar = stats.corr_to_cov(self.correlation_matrix, self.sigmas) covar_param = param.SimpleArrayParam(covar) chol = linalg.cholesky(covar, lower=True) chol[chol < 1.0e-9] = 0.0 cholesky_param = param.SimpleArrayParam(chol) r_param = param.SimpleParam(interest_rate) # create the index of times that are in market hours between # requested times timeindex = pd.date_range(start=t_start, end=t_end, freq=freq) # get the frequency (in seconds) now before removing non-market times freq_in_secs = pd.to_timedelta(timeindex.freq, unit='s').total_seconds() # only trading hours timeindex = timeindex[timeindex.indexer_between_time('09:30', '16:00')] # only weekdays timeindex = timeindex[~(timeindex.dayofweek > 4)] # remove fed holidays cal = USFederalHolidayCalendar() hols = cal.holidays(start=timeindex.min(), end=timeindex.max()) timeindex = timeindex[~timeindex.isin(hols)] # get array of time in yearly units and get stock pathways times = np.arange( 0, len(timeindex)) * freq_in_secs / MARKET_SECONDS_PER_YEAR # seed and create the pathway generator object np.random.seed(seed=seed) path_maker = path.GeometricDiffusionManyAsset(self.generator, r_param, covar_param, cholesky_param) if len(times) > 0: s_paths = path_maker.get_single_timed_path(initial_values, times) else: raise RunTimeError('Trying to generate stocks on empty time list') # put all data into a pandas Dataframe stocks_df = pd.DataFrame(index=timeindex, data=s_paths, columns=self.stock_names) print("internal : ", stocks_df.groupby(stocks_df.index.dayofweek).sum()) np.random.seed(seed=None) return stocks_df
def __init__(self, epsPath, pricePath, daysNeed=3): self.raw_eps = pd.read_csv(epsPath) self.price = pd.read_csv(pricePath) self.daysNeed = daysNeed self.__getEps() minDate = str(min(self.epsBefore['DATE'], self.epsAfter['DATE'])) maxDate = str(max(self.epsBefore['DATE'], self.epsAfter['DATE'])) cal = USFederalHolidayCalendar() self.holidays = cal.holidays(start=minDate, end=maxDate).to_pydatetime()
def create_date_feature_is_public_holiday(df, date, start, end, country = 'US'): """ create date feature: is holiday only support USFederal Holiday """ cal = USFederalHolidayCalendar() holidays = cal.holidays(start=start, end=end).to_pydatetime() foo = lambda x: 1 if x in holidays else 0 df['is_holiday'] = df[date].map(foo) return df
def find_holidays(base_year): ''' Using Pandas calendar, find all 10 US Federal Holidays, plus California's Cesar Chavez Day (March 31). ''' yr = str(base_year) cal = USFederalHolidayCalendar() holidays = cal.holidays(start=yr + '-01-01', end=yr + '-12-31').to_pydatetime() return [d.strftime('%m-%d') for d in holidays] + ['03-31']
def getHolidays(dates): holidaysVectorReturn = [] cal = USFederalHolidayCalendar() holidays = cal.holidays(start='2015-01-01', end='2017-12-31').to_pydatetime() for idx, date in enumerate(dates): if date in holidays: holidaysVectorReturn.append(1) else: holidaysVectorReturn.append(0) return holidaysVectorReturn
def __init__(self, config_name): self.today = datetime.datetime.now().date() self._holidays = USFederalHolidayCalendar() self.config_name = config_name self._set_config() self.data = self._pull_data() self._clean_data() if len(self.data) != 0: self._amend_data() self._store_data()
def test_calendar(self): calendar = USFederalHolidayCalendar() holidays = calendar.holidays(self.start_date, self.end_date) holidays_1 = calendar.holidays(self.start_date.strftime("%Y-%m-%d"), self.end_date.strftime("%Y-%m-%d")) holidays_2 = calendar.holidays(Timestamp(self.start_date), Timestamp(self.end_date)) self.assertEqual(list(holidays.to_pydatetime()), self.holiday_list) self.assertEqual(list(holidays_1.to_pydatetime()), self.holiday_list) self.assertEqual(list(holidays_2.to_pydatetime()), self.holiday_list)
def build_features(data, features): #Timestamp First Active data['timestamp_first_active'] = data['timestamp_first_active'].astype(str) data['timestamp_first_active_date'] = data['timestamp_first_active'].str[:8] data['timestamp_first_active_date'] = pd.to_datetime(data['timestamp_first_active_date'], format='%Y%m%d') data['tfa_month'] = data['timestamp_first_active_date'].map(lambda x : x.month) data['tfa_year'] = data['timestamp_first_active_date'].map(lambda x : x.year) data['tfa_day'] = data['timestamp_first_active_date'].map(lambda x : x.day) data['tfa_dayofyear'] = data.timestamp_first_active_date.dt.dayofyear data['tfa_dayofweek'] = data.timestamp_first_active_date.dt.dayofweek data['tfa_week'] = data.timestamp_first_active_date.dt.week data['tfa_quarter'] = data.timestamp_first_active_date.dt.quarter features.extend(['tfa_day','tfa_month','tfa_year','tfa_dayofyear','tfa_dayofweek','tfa_week','tfa_quarter']) #TFA Holidays #calendar = USFederalHolidayCalendar() #tfa_holidays = calendar.holidays(start=data['timestamp_first_active_date'].min(),end=data['timestamp_first_active_date'].max()) #for i in range(len(tfa_holidays)): #data['tfa_holiday_diff_'+str(i)] = data['timestamp_first_active_date'].map(lambda x : (x-tfa_holidays[i]).days) #data['tfa_holiday_diff_'+str(i)] = data['tfa_holiday_diff_'+str(i)].map(holiday_transform) #data_dummy = pd.get_dummies(data['tfa_holiday_diff_'+str(i)],prefix='tfa_holiday_diff_'+str(i)) #features.extend(data_dummy.columns.values) #data.drop(['tfa_holiday_diff_'+str(i)],axis=1,inplace=True) #data = pd.concat((data,data_dummy),axis=1) #features.extend('tfa_holiday_diff_'+str(i)) #Date Account Created data['date_account_created'] = pd.to_datetime(data['date_account_created']) data['dac_month'] = data['date_account_created'].map(lambda x : x.month) data['dac_year'] = data['date_account_created'].map(lambda x : x.year) data['dac_day'] = data['date_account_created'].map(lambda x : x.day) data['dac_dayofyear'] = data.date_account_created.dt.dayofyear data['dac_dayofweek'] = data.date_account_created.dt.dayofweek data['dac_week'] = data.date_account_created.dt.week data['dac_quarter'] = data.date_account_created.dt.quarter features.extend(['dac_year','dac_month','dac_day','dac_dayofyear','dac_dayofweek','dac_week','dac_quarter']) #DAC Holidays calendar = USFederalHolidayCalendar() dac_holidays = calendar.holidays(start=data['date_account_created'].min(),end=data['date_account_created'].max()) for i in range(len(dac_holidays)): data['dac_holiday_diff_'+str(i)] = data['date_account_created'].map(lambda x : (x-dac_holidays[i]).days) data['dac_holiday_diff_'+str(i)] = data['dac_holiday_diff_'+str(i)].map(holiday_transform) features.extend(['dac_holiday_diff_'+str(i)]) #Days Difference Between TFA and DAC data['days_diff'] = (data['date_account_created'] - data['timestamp_first_active_date']).dt.days #data['days_diff'] = data['days_diff'].map(holiday_transform) features.extend(['days_diff']) data.drop(['date_account_created','timestamp_first_active','timestamp_first_active_date'],axis=1,inplace=True) other_features = ['gender', 'signup_method','signup_flow','language','affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser'] for f in other_features: data_dummy = pd.get_dummies(data[f],prefix=f) features.extend(data_dummy.columns.values) data.drop([f],axis=1,inplace=True) data = pd.concat((data,data_dummy),axis=1) return data
def __init__(self, data_df, date): # df = pd.read_csv(path, date_parser='Date') self.date = date df = data_df.copy() # df = pd.read_csv(path, date_parser='Date') # df = data_df.copy() test = df[[ 'Date', 'Hour', 'Weekday', 'Month', 'Load', 'Mean_Temp', 'Mean_Humi', 'RIV_Temp', 'RIV_Humi', 'LAX_Temp', 'LAX_Humi', 'USC_Temp', 'USC_Humi', 'WJF_Temp', 'WJF_Humi', 'TRM_Temp', 'TRM_Humi' ]] test.loc[:, 'RIV_Temp_Log'] = np.log(df['RIV_Temp']) test.loc[:, 'Load_Log'] = np.log(df['Load']) test['Load_Lag_48'] = test['Load_Log'].shift(48, axis=0) # test['Temp_Lag_48'] = test['Mean_Temp'].shift(48, axis=0) test['Humi_Lag_48'] = test['Mean_Humi'].shift(48, axis=0) test['RIV_Temp_Log_Lag_48'] = test['RIV_Temp_Log'].shift(48, axis=0) # test['RIV_Temp_Lag_48']= test['RIV_Temp'].shift(48, axis=0) cal = USFederalHolidayCalendar() holidays = cal.holidays(start='2014-01-01', end=str(datetime.datetime.now()), return_name=True) holidays = pd.DataFrame(holidays) holidays = holidays.reset_index() holidays = holidays.rename(columns={'index': "Date", 0: 'Holiday'}) holidays['Date'] = pd.to_datetime(holidays['Date']) test['Date'] = pd.to_datetime(test['Date']) lm_data = test.loc[49:len(test), ].merge(holidays, how='left', on='Date') lm_data['Holiday'] = lm_data['Holiday'].fillna("Not Holiday") lm_data[["Hour", "Weekday", "Month", "Holiday"]] = lm_data[["Hour", "Weekday", "Month", "Holiday"]].astype('category') DateTime = pd.DataFrame( lm_data.apply(lambda line: pd.to_datetime(line['Date']) + datetime. timedelta(hours=line['Hour']), axis=1)) DateTime.columns = ['DateTime'] self.lm_data = pd.concat([DateTime, lm_data], axis=1) self.lm_data.set_index('DateTime', inplace=True)
def DR_Temp_data_cleaning(dataframe): ''' inplace change of the dataframe, for the structure purpose, return this dataframe ''' dataframe['Date'] = pd.to_datetime(dataframe['Date']) test = dataframe[[ 'Date', 'Hour', 'Weekday', 'Month', 'Load', 'Mean_Temp', 'Mean_Humi', 'RIV_Temp', 'RIV_Humi', 'LAX_Temp', 'LAX_Humi', 'USC_Temp', 'USC_Humi', 'WJF_Temp', 'WJF_Humi', 'TRM_Temp', 'TRM_Humi' ]] test.loc[:, 'RIV_Temp_Log'] = np.log(dataframe['RIV_Temp']) test.loc[:, 'LAX_Temp_Log'] = np.log(dataframe['LAX_Temp']) test.loc[:, 'USC_Temp_Log'] = np.log(dataframe['USC_Temp']) test.loc[:, 'WJF_Temp_Log'] = np.log(dataframe['WJF_Temp']) test.loc[:, 'TRM_Temp_Log'] = np.log(dataframe['TRM_Temp']) test.loc[:, 'Mean_Temp_Log'] = np.log(dataframe['TRM_Temp']) test.loc[:, 'Load_Log'] = np.log(dataframe['Load']) test['Load_Lag_48'] = test['Load_Log'].shift(48, axis=0) test['Humi_Lag_48'] = test['Mean_Humi'].shift(48, axis=0) test['RIV_Temp_Log_Lag_48'] = test['RIV_Temp_Log'].shift(48, axis=0) test['LAX_Temp_Log_Lag_48'] = test['LAX_Temp_Log'].shift(48, axis=0) test['USC_Temp_Log_Lag_48'] = test['USC_Temp_Log'].shift(48, axis=0) test['WJF_Temp_Log_Lag_48'] = test['WJF_Temp_Log'].shift(48, axis=0) test['TRM_Temp_Log_Lag_48'] = test['TRM_Temp_Log'].shift(48, axis=0) test['Mean_Temp_Log_Lag_48'] = test['Mean_Temp_Log'].shift(48, axis=0) cal = USFederalHolidayCalendar() holidays = cal.holidays(start='2014-01-01', end=str(datetime.datetime.now()), return_name=True) holidays = pd.DataFrame(holidays) holidays = holidays.reset_index() holidays = holidays.rename(columns={'index': "Date", 0: 'Holiday'}) holidays['Date'] = pd.to_datetime(holidays['Date']) test['Date'] = pd.to_datetime(test['Date']) lm_data = test.loc[49:len(test), ].merge(holidays, how='left', on='Date') lm_data['Holiday'] = lm_data['Holiday'].fillna("Not Holiday") lm_data[["Hour", "Weekday", "Month", "Holiday"]] = lm_data[["Hour", "Weekday", "Month", "Holiday"]].astype('category') DateTime = pd.DataFrame( lm_data.apply(lambda line: pd.to_datetime(line['Date']) + datetime. timedelta(hours=line['Hour']), axis=1)) DateTime.columns = ['DateTime'] lm_data = pd.concat([DateTime, lm_data], axis=1) lm_data.set_index('DateTime', inplace=True) return lm_data
def predict(dict): """ This function pickles the model and generates the test features by passing the dict parameter. """ folder_selected = folderSelect() filename = os.path.join(folder_selected, "model.joblib") if filename is None: train_and_persist() else: with open(filename, "rb") as f: model = load(f) testDf = pd.DataFrame(dict, index=[0]) testDf["hr"] = testDf["date"].dt.hour testDf["yr"] = testDf["date"].dt.year - testDf["date"].dt.year.min() testDf["mnth"] = testDf["date"].dt.month testDf["season"] = testDf["date"].map(get_season) testDf["weekday"] = testDf["date"].dt.weekday testDf["dteday"] = testDf["date"].dt.day testDf["dteday"] = pd.to_datetime(testDf["dteday"]) cal = USFederalHolidayCalendar() holidays = pd.to_datetime(cal.holidays(start="2011-01-01", end="2011-06-30")) testDf["holiday"] = pd.to_datetime(testDf["date"]).dt.date in holidays testDf["workingday"] = pd.to_datetime(testDf["date"]).dt.date not in holidays testDf["holiday"] = testDf["holiday"].map(lambda x: 1 if x else 0) testDf["workingday"] = testDf["workingday"].map(lambda x: 1 if x else 0) t_max , t_min = 50, -8 # This is for tempretaure normalization testDf["temp"] = (testDf["temperature_C"] - t_min)/(t_max-t_min) # We divide humidity by 100 to scale it between 0 and 1 testDf["hum"] = testDf["humidity"]/100 testDf = testDf.drop(columns=["temperature_C", "humidity"]) # Convert the data type to eithwe category or to float testDf = generating_new_features(testDf) testDf = pd.get_dummies(testDf) # Finally start with Machine Learning test = testDf.drop(columns=["date", "dteday", "feeling_temperature_C"]) # savedir = Path.home() # filename = os.path.join(savedir, "model.joblib") # with open(filename, "rb") as f: # model = load(f) # print("done!") # f.close() train_X, train_y, test_X, test_y = prepare_train_data() train_features = train_X.columns.values test = fill_missing_features(test, train_features) pred = model.predict(test) pred = pred.astype(int) print(" Rounded predictions:\n", pred)
def getStartDate(): # get 200 business days since yesterday but leave room in case this is run on weekend yesterday = date.today() - timedelta(days=1) start_date = yesterday - BDay(202) # find the number of holidays between yesterday and 200 days ago and subtract that to get real start date calendar = USFederalHolidayCalendar() holidays = calendar.holidays(start_date, yesterday) start_date = start_date - BDay(len(holidays.tolist())) s = pd.Series(start_date) return s.map(pd.Timestamp.date)[0]
def test_calendar(self): calendar = USFederalHolidayCalendar() holidays = calendar.holidays(self.start_date, self.end_date) holidays_1 = calendar.holidays(self.start_date.strftime('%Y-%m-%d'), self.end_date.strftime('%Y-%m-%d')) holidays_2 = calendar.holidays(Timestamp(self.start_date), Timestamp(self.end_date)) assert list(holidays.to_pydatetime()) == self.holiday_list assert list(holidays_1.to_pydatetime()) == self.holiday_list assert list(holidays_2.to_pydatetime()) == self.holiday_list
def validate(date): cal = USFederalHolidayCalendar() holidays = cal.holidays(date) # check recent holidays adjusted = False if date in holidays: #create offset, where offset replace BDay as bussiness day offset to account for holiday offset = CustomBusinessDay(calendar=cal) date += offset adjusted = True return date, adjusted
def assign_workdays(self, data, start, end, speed=False): cal = USFederalHolidayCalendar() holidays = cal.holidays(start=start, end=end) if speed: num_process = mp.cpu_count() chunk_size = int(data.shape[0] / num_process) chunks = [data.loc[data.index[i: i + chunk_size]] for i in range(0, data.shape[0], chunk_size)] pools = mp.Pool(num_process) res = pools.map( partial(self._workday_process, holidays=holidays), chunks) return pd.concat(res) else: return self._workday_process(data, holidays)
def add_time_windows(df): """Adds time windows as target variable, based on delivery date and time. Calculates number of business days (excl weekends, shipment date, federal holidays) for delivery. Adds delivery time to number of business days to get time-in-transit. Creates window thresholds based on discussion with Jose. Lastly assign time windows to shipments based on time-in-transit. Args: df (pandas dataframe obj): Pandas dataframe that must contain shipment_date, delivery_date, delivery_time. Returns: pandas dataframe obj: Pandas dataframe with new columns days_in_transit, days_taken_float (time-in-transit), Y (target variable i.e. time window) """ print("Adding time windows i.e. target variable...") print(f"Starting with {report_df_stats(df)}") start_time = time.time() # Calculate days in transit (exclude shipment date, holidays, weekends) start_date = df['shipment_date'].min() end_date = df['shipment_date'].max() calendar = USFederalHolidayCalendar() holidays = calendar.holidays(start_date, end_date).date.tolist() shipment_dates = [d.date() for d in df['shipment_date']] delivery_dates = [d.date() for d in df['delivery_date']] # -1 because we will add transit time df['days_in_transit'] = np.busday_count( shipment_dates, delivery_dates, holidays=holidays) - 1 # Convert days in transit/delivery time to days taken (with decimals)) # e.g. if parcel reaches at 12.00pm on 2nd business day, days taken is 1.5 delivery_percentage_of_day = [ (timedelta.total_seconds(d) / timedelta(days=1).total_seconds()) for d in df['delivery_time'] ] df['days_taken_float'] = df['days_in_transit'] + delivery_percentage_of_day # Keep rows from -1 to 5 days in transit. The rest are rare occurrences. max_days_to_keep = 5 df = df[df['days_in_transit'].isin(np.arange(-1, max_days_to_keep))] # Assign time windows time_window_thresholds = create_time_window_thresholds() tqdm.pandas(desc="Assign time window") df['Y'] = df.progress_apply(lambda x: assign_time_window( x['days_taken_float'], time_window_thresholds), axis=1) print(f"Ending with {report_df_stats(df)}.") utilities.print_elapsed_time(start_time) return df
def format_subway_data(path_to_csv_file: str): # read data, convert to datetime df = pd.read_csv(path_to_csv_file) df["servicedate"] = pd.to_datetime(df["servicedate"]) # add holidays cal = USFederalHolidayCalendar() holidays = cal.holidays(start=df["servicedate"].min(), end=df["servicedate"].max()) # mark as holiday and weekday df["holiday"] = df["servicedate"].dt.date.astype("datetime64").isin( holidays.date) df["weekday"] = df["servicedate"].dt.dayofweek # define peak, mark weekdays, convert service date back conditions = [(df["holiday"] == False) & (df["weekday"] < 5)] choices = ["peak"] df["peak"] = np.select(conditions, choices, default="offpeak") df["week"] = df["servicedate"].dt.isocalendar().week df["year"] = df["servicedate"].dt.isocalendar().year df["servicedate"] = df["servicedate"].dt.date.astype(str) # select date of the week dates = df[df["weekday"] == 0] dates = dates[["servicedate", "week", "year"]].drop_duplicates() # limit data to just peak, merge back dates final = df[df["peak"] == "peak"] final = final.groupby(["year", "week", "route_or_line" ])["sum"].mean().round().reset_index() final = final.merge(dates, on=["week", "year"], how="left") # get list of bus routes routelist = list(set(final["route_or_line"].tolist())) # create dict output = {} # write out each set of routes to dict for route in routelist: dftemp = final[final["route_or_line"] == route] dictdata = (dftemp[["servicedate", "sum"]].rename(columns={ "servicedate": "date", "sum": "riders" }).to_dict(orient="records")) output[route] = dictdata return output
def validate ( date ): cal = USFederalHolidayCalendar() holidays = cal.holidays(date) # check recent holidays adjusted = False if date in holidays: #create offset, where offset replace BDay as bussiness day offset to account for holiday offset = CustomBusinessDay(calendar=cal) date+=offset adjusted = True return date, adjusted
def test_calendar(self): calendar = USFederalHolidayCalendar() holidays = calendar.holidays(self.start_date, self.end_date) holidays_1 = calendar.holidays( self.start_date.strftime('%Y-%m-%d'), self.end_date.strftime('%Y-%m-%d')) holidays_2 = calendar.holidays( Timestamp(self.start_date), Timestamp(self.end_date)) assert list(holidays.to_pydatetime()) == self.holiday_list assert list(holidays_1.to_pydatetime()) == self.holiday_list assert list(holidays_2.to_pydatetime()) == self.holiday_list
def generate_calendar(year, drop_index=False): ''' Simple function to generate a calendar containing US holidays, weekdays and holiday weeks. ''' from pandas.tseries.offsets import YearEnd from pandas.tseries.holiday import USFederalHolidayCalendar start_date = pd.to_datetime('1/1/'+str(year)) end_date = start_date + YearEnd() DAT = pd.date_range(str(start_date), str(end_date), freq='D') MO = [d.strftime('%B') for d in DAT] holidays = USFederalHolidayCalendar().holidays(start=start_date, end=end_date) cal_df = pd.DataFrame({'date':DAT, 'month':MO}) cal_df['year'] = [format(d, '%Y') for d in DAT] cal_df['weekday'] = [format(d, '%A') for d in DAT] cal_df['is_weekday'] = cal_df.weekday.isin(['Monday','Tuesday','Wednesday','Thursday','Friday']) cal_df['is_weekday'] = cal_df['is_weekday'].astype(int) cal_df['is_holiday'] = cal_df['date'].isin(holidays) cal_df['is_holiday'] = cal_df['is_holiday'].astype(int) cal_df['is_holiday_week'] = cal_df.is_holiday.rolling(window=7,center=True,min_periods=1).sum() cal_df['is_holiday_week'] = cal_df['is_holiday_week'].astype(int) if not drop_index: cal_df.set_index('date', inplace=True) return cal_df
def addExogenousVariables(df): holidays = USFederalHolidayCalendar().holidays(start=df.index.min(), end=df.index.max()) df['holiday'] = df.index.isin(holidays).astype(int) df['week_number'] = df.index.week df['day_of_week'] = df.index.dayofweek return df
def get_calendar(): """Summary Returns: TYPE: Description """ return CustomBusinessDay(calendar=USFederalHolidayCalendar())
def predict_future_price(df, lookback_days, forcast_days, model, sc): """ Prediction of future stock price """ # calculate forcast dates and indexs, used for plot B_DAY = CustomBusinessDay(calendar=USFederalHolidayCalendar()) # business day forcast_dates = [df['Date'].iloc[-1]+i*B_DAY for i in range(1,forcast_days+1)] forcast_indexs = [df['Date'].index[-1]+i for i in range(1,forcast_days+1)] # input X for forcast X_scaled = df[['Scaled_Close']][-lookback_days:].values X_scaled = X_scaled.reshape((1, lookback_days, 1)) # prediction the scaled price with model forcast_price_scaled = model.predict(X_scaled) # transform back to the normal price forcast_prices = sc.inverse_transform(forcast_price_scaled) # create the forcast_dataframe forcast_dataframe = pd.DataFrame({ 'index' : forcast_indexs, 'Date' : forcast_dates, 'Adj Close' : forcast_prices[0], 'Scaled_Close' : forcast_price_scaled[0] }) forcast_dataframe = forcast_dataframe.set_index('index', drop=True) return forcast_dataframe
def describe_date(amd_data): """ Add columns for weekday, month, and holiday. Based on existing timestamp column in amd_data dataframe. """ holidays = USFederalHolidayCalendar().holidays() if type(amd_data) == dd.DataFrame: amd_data = amd_data.assign(weekday=amd_data.timestamp.map( lambda x: x.weekday() > 4, meta=('weekday', 'bool'))) amd_data = amd_data.assign(month=amd_data.timestamp.map( lambda x: x.month, meta=('month', 'int'))) amd_data = amd_data.assign(holiday=amd_data.timestamp.map( lambda x: x.date() in holidays, meta=('holiday', 'bool'))) if type(amd_data) == pd.DataFrame: amd_data['weekday'] = amd_data.timestamp.map( lambda x: x.weekday() > 4) amd_data['month'] = amd_data.timestamp.map(lambda x: x.month) amd_data['holiday'] = amd_data.timestamp.map( lambda x: x in holidays) return amd_data
def Converter_Monthly_Daily(Name_O, Indicator, start_date): usb = CustomBusinessDay(calendar=USFederalHolidayCalendar()) rng = pd.date_range(start=start_date, end="1/1/2018", freq=usb) Name_Daily = pd.DataFrame({'DATE': rng}) Name_Daily['DATE'] = pd.to_datetime(Name_Daily.DATE) Name_Daily['Year'] = Name_Daily.DATE.dt.year Name_Daily['Month'] = Name_Daily.DATE.dt.month Name_Daily['Day'] = Name_Daily.DATE.dt.day #Name_Daily = Name_Daily.drop(columns=['DATE']) year = Name_Daily['Year'].unique() year = np.append(year, [2018]) arr = [] index = 0 for y in year: newY = Name_Daily.loc[Name_Daily.Year == y] for m in range(1, 13): numberofDAYS = len(newY.loc[(newY.Month == m)]) for day in range(0, numberofDAYS): arr.append(Name_O[Indicator][index] + (Name_O[Indicator][index + 1] - Name_O[Indicator][index]) * day / numberofDAYS) index = index + 1 #Name_Daily = Name_Daily.drop(columns=['Year','Month','Day']) Name_Daily[Indicator] = arr #out_filename = "Economy_Indicators_Data"+ "\\" + Indicator + '_Daily_Insoo.csv' #Name_Daily.to_csv(out_filename,sep=',', encoding='utf-8') return Name_Daily
def test_calendar(transform): start_date = datetime(2012, 1, 1) end_date = datetime(2012, 12, 31) calendar = USFederalHolidayCalendar() holidays = calendar.holidays(transform(start_date), transform(end_date)) expected = [ datetime(2012, 1, 2), datetime(2012, 1, 16), datetime(2012, 2, 20), datetime(2012, 5, 28), datetime(2012, 7, 4), datetime(2012, 9, 3), datetime(2012, 10, 8), datetime(2012, 11, 12), datetime(2012, 11, 22), datetime(2012, 12, 25) ] assert list(holidays.to_pydatetime()) == expected
def get_first_bday_of_month(mnth=None, yr=None): ''' Return the first business day of the current month if no variables provided Return the first business day of the month and year provided if variables provided Tests: In [188]: config.get_first_bday_of_month(12,2015) Out[188]: datetime.date(2015, 12, 1) In [189]: config.get_first_bday_of_month(11,2015) Out[189]: datetime.date(2015, 11, 2) In [190]: config.get_first_bday_of_month(10,2015) Out[190]: datetime.date(2015, 10, 1) In [191]: config.get_first_bday_of_month(1,2016) Out[191]: datetime.date(2016, 1, 4) In [192]: config.get_first_bday_of_month(8,2015) Out[192]: datetime.date(2015, 8, 3) :param mnth: :param yr: :return: ''' from calendar import monthrange from pandas.tseries.holiday import USFederalHolidayCalendar from pandas.tseries.offsets import CustomBusinessDay if yr is None or mnth is None: yr = pd.datetime.now().year if pd.datetime.now().month != 1 else pd.datetime.now().year - 1 mnth = pd.datetime.now().month - 1 if pd.datetime.now().month != 1 else 12 else: yr = yr if mnth != 1 else yr - 1 mnth = mnth - 1 if mnth != 1 else 12 end_last = monthrange(yr, mnth) end_last = pd.Timestamp('%s/%s/%s' % (mnth, end_last[1], yr)).date() cal = USFederalHolidayCalendar() holidays = cal.holidays(start=end_last - pd.tseries.offsets.Day(60), end=end_last + pd.tseries.offsets.Day(60)).to_pydatetime() bday_cus = CustomBusinessDay(holidays=holidays) return (end_last + bday_cus).date()
if beginning_range <= date_of_interest <= end_range: days_since_school_year_start = (date_of_interest - beginning_range).days #for 2015/2016 School year beginning_range = datetime.date(2015, 8, 31) end_range = datetime.date(2016, 8, 29) if beginning_range <= date_of_interest <= end_range: days_since_school_year_start = (date_of_interest - beginning_range).days datekey_list.append(weather_datekey) days_since_school_year_start_list.append(days_since_school_year_start) days_since_school_year_start_db = {'days_since_school_year_start' : pd.Series(days_since_school_year_start_list, index=datekey_list)} days_since_school_year_start_df = pd.DataFrame(days_since_school_year_start_db) #Account for major US federal holidays cal = USFederalHolidayCalendar() holidays = cal.holidays(start='2013-01-01', end='2016-12-31').to_pydatetime() datekey_list = [] holidays_list = [] #for weather_datekey in weather_df.index.values: for weather_datekey in weather_df.datekey: year = int(str(weather_datekey)[:4]) month = int(str(weather_datekey)[4:6]) day = int(str(weather_datekey)[6:]) date_of_interest = datetime.date(year, month, day) if date_of_interest in holidays: holidays_list.append(1) else: holidays_list.append(0) datekey_list.append(weather_datekey) holidays_db = {'holidays' : pd.Series(holidays_list, index=datekey_list)}
def make_regression_features_oct(df): '''Create features for regression given a dataframe ''' from pandas.tseries.holiday import USFederalHolidayCalendar x_dayssincestart = df.dayssincestart x_dayofweek = df.dayofweek x_dayofweek = pd.get_dummies(x_dayofweek,prefix='dayofweek') x_dayofmonth = df.dayofmonth x_dayofmonth = pd.get_dummies(x_dayofmonth, prefix='dayofmonth') x_week1 = pd.Series([1 if day < 8 else 0 for day in df.dayofmonth],name='week1') x_week2 = pd.Series([1 if (day >= 8 and day < 16) else 0 for day in df.dayofmonth],name='week2') x_week3 = pd.Series([1 if (day >= 16 and day < 23) else 0 for day in df.dayofmonth],name='week3') x_week4 = pd.Series([1 if day >= 23 else 0 for day in df.dayofmonth],name='week4') x_isweekend = pd.Series([1 if (day == 5 or day == 6) else 0 for day in df.dayofweek],name='isweekend') x_istueswed = pd.Series([1 if (day == 1 or day == 2) else 0 for day in df.dayofweek],name='istueswed') x_isfrisat = pd.Series([1 if (day == 4 or day == 5) else 0 for day in df.dayofweek],name='isfrisat') #get holidays calendar = USFederalHolidayCalendar() holidays = calendar.holidays(start=df.datetime.min(), end=df.datetime.max()) x_isholiday = pd.Series([1 if day in holidays.tolist() else 0 for day in df.datetime],name='x_isholiday') #find 3-day weekends x_isholidaywknd = pd.Series(np.zeros(len(x_isholiday)),name='x_isholidaywknd') x_weekendandholiday = x_isweekend + x_isholiday for i in range(1, len(x_weekendandholiday)-2): if x_weekendandholiday[i] == 1: if x_weekendandholiday[i+1] == 1 and x_weekendandholiday[i+2] ==1: x_isholidaywknd[i] = 1 x_isholidaywknd[i+1] = 1 x_isholidaywknd[i+2] = 1 x_week1tueswed = x_week1 & x_istueswed x_week2tueswed = x_week2 & x_istueswed x_week3tueswed = x_week3 & x_istueswed x_week4tueswed = x_week4 & x_istueswed x_week1frisat = x_week1 & x_isfrisat x_week2frisat = x_week2 & x_isfrisat x_week3frisat = x_week3 & x_isfrisat x_week4frisat = x_week4 & x_isfrisat x_week1wknd = x_week1 & x_isweekend x_week2wknd = x_week2 & x_isweekend x_week3wknd = x_week3 & x_isweekend x_week4wknd = x_week4 & x_isweekend X_vars = pd.concat([x_dayssincestart,x_dayofweek,x_dayofmonth, x_week1,x_week2,x_week3,x_week4, x_isweekend,x_istueswed,x_isfrisat, x_isholiday,x_isholidaywknd, x_week1tueswed,x_week2tueswed,x_week3tueswed,x_week4tueswed, x_week1frisat,x_week2frisat,x_week3frisat,x_week4frisat, x_week1wknd,x_week2wknd,x_week3wknd,x_week4wknd], axis=1) return X_vars
#add a dummy variable for weekend day merged_master['weekend_day'] = np.where((merged_master['day_of_week']==5)|(merged_master['day_of_week']==6),1,0) #drop day_of_week field merged_master.drop('day_of_week', axis=1, inplace=True) #add a column for month and create month dummies merged_master['month'] = merged_master.index.month dummy_df = pd.get_dummies(merged_master['month'], prefix='month') merged_master = pd.concat([merged_master,dummy_df], axis=1) merged_master.drop('month', axis=1, inplace=True) #add a column for holiday merged_master['date'] = merged_master.index.date merged_master['date'] = pd.to_datetime(merged_master['date']) cal = USFederalHolidayCalendar() holidays = cal.holidays(start=merged_master.date.min(), end=merged_master.date.max()) merged_master['holiday'] = merged_master['date'].isin(holidays).astype(int) merged_master.drop('date', axis=1, inplace=True) #add a column for days since first date in dataset merged_master['days_since_first_date'] = (merged_master.index.date - (merged_master.index.date.min())).astype('timedelta64[D]').astype(int) #Drop any rows with null values merged_master.dropna(axis=0, inplace=True) #rearrange column order merged_master['station_id'] = merged_master['from_station_id'] merged_master.drop(['from_station_id'], axis=1, inplace=True)