def get_first_barrier_touches(sep_featured, sep, ptSl: tuple, trgt: pd.Series, minRet: float):
    ewmstd = sep_featured["ewmstd"]
    # I only consider the case where there is a timeout
    timeout = sep_featured["timeout"]  # NEED TO ADD
    side_long = pd.Series(1.0, index=ewmstd.index)

    # 3) Form events object, apply stop loss on t1
    events = pd.concat({
        'timeout': timeout,
        'ewmstd': ewmstd,
        'side': side_long
    }, axis=1)  # .dropna(subset=["trgt"])

    df0 = pandas_mp_engine(callback=triple_barrier_search, atoms=sep_featured, \
        data={'sep': sep}, molecule_key='sep_sampled', split_strategy='ticker', \
        num_processes=1, molecules_per_process=1, ptSl=ptSl, minRet=minRet)

    # Drop events where none of the barriers were touched (should be extremely rare, if it happens at all)
    events["earliest_touch"] = df0.dropna(how='all').min(axis=1)  # pd.min ignores NaN; this becomes the timestamp of the earliest barrier touch
    events = events.drop("side", axis=1)
    return events
def get_events_metalabaling(close, tEvents, ptSl, trgt, minRet, numThreads, t1=False, side=None):
    # 1) Get target
    trgt = trgt.loc[tEvents]
    trgt = trgt[trgt > minRet]  # minRet

    # 2) Get t1 (max holding period)
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=tEvents)

    # 3) Form events object, apply stop loss on t1
    if side is None:
        side_, ptSl_ = pd.Series(1.0, index=trgt.index), [ptSl[0], ptSl[0]]
    else:
        side_, ptSl_ = side.loc[trgt.index], ptSl[:2]

    events = pd.concat({
        "t1": t1,
        "trgt": trgt,
        "side": side_
    }, axis=1).dropna(subset=["trgt"])

    df0 = pandas_mp_engine(callback=apply_ptsl_on_t1, pdObj=("molecule", events.index), numThreads=numThreads, \
        close=close, events=events, ptSl=ptSl_)

    events["t1"] = df0.dropna(how="all").min(axis=1)  # pd.min ignores NaN
    if side is None:
        events = events.drop("side", axis=1)
    return events
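# Both event builders in this module delegate the horizontal-barrier search to
# apply_ptsl_on_t1, which is imported from elsewhere and not shown in this file.
# The function below is only a sketch of what such a per-molecule helper is assumed
# to do, following Lopez de Prado's snippet 3.2; it is suffixed with _sketch so it
# does not shadow the real import and is not the project's implementation.
def _apply_ptsl_on_t1_sketch(close, events, ptSl, molecule):
    events_ = events.loc[molecule]
    out = events_[["t1"]].copy(deep=True)

    # Horizontal barrier levels, scaled by the per-event target; 0 disables the barrier
    pt = ptSl[0] * events_["trgt"] if ptSl[0] > 0 else pd.Series(index=events_.index, dtype=float)
    sl = -ptSl[1] * events_["trgt"] if ptSl[1] > 0 else pd.Series(index=events_.index, dtype=float)

    for loc, t1 in events_["t1"].fillna(close.index[-1]).items():
        df0 = close[loc:t1]                                      # price path from event start to vertical barrier
        df0 = (df0 / close[loc] - 1) * events_.at[loc, "side"]   # path of returns, signed by position side
        out.loc[loc, "sl"] = df0[df0 < sl[loc]].index.min()      # earliest stop-loss touch (NaT if never touched)
        out.loc[loc, "pt"] = df0[df0 > pt[loc]].index.min()      # earliest profit-taking touch (NaT if never touched)
    return out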
def getEvents(close: pd.Series, tEvents: pd.DatetimeIndex, ptSl: tuple, trgt: pd.Series, minRet: float, numThreads: int, t1=False):
    """
    Finds the time of the first barrier touch.

    close:      A pandas series of prices.
    tEvents:    The pandas time index containing the timestamps that will seed every triple barrier.
                These are the timestamps selected by the sampling procedures discussed in Chapter 2, Section 2.5.
    ptSl:       A non-negative float that sets the width of the two barriers. A 0 value means that the respective
                horizontal barrier (profit taking and/or stop loss) will be disabled.
    t1:         A pandas series with the timestamps of the vertical barriers. Pass False to disable vertical barriers.
    trgt:       A pandas series of targets, expressed in terms of absolute returns.
    minRet:     The minimum target return required for running a triple barrier search.
    """
    # Some of this preparation might be done before calling this function.
    # 1) Get target
    trgt = trgt.loc[tEvents]
    trgt = trgt[trgt > minRet]  # drops the events whose target is at or below minRet

    # 2) Get t1 (max holding period)
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=tEvents)

    # 3) Form events object, apply stop loss on t1
    side_ = pd.Series(1.0, index=trgt.index)  # always assume a long position when first labeling for side

    events = pd.concat({
        't1': t1,
        'trgt': trgt,
        'side': side_
    }, axis=1).dropna(subset=["trgt"])

    df0 = pandas_mp_engine(callback=apply_ptsl_on_t1, pdObj=('molecule', events.index), numThreads=numThreads, \
        close=close, events=events, ptSl=[ptSl, ptSl])  # barriers are symmetric when first labeling for side

    # Drop events where none of the barriers were touched (should be extremely rare, if it happens at all)
    events["t1"] = df0.dropna(how='all').min(axis=1)  # pd.min ignores NaN; events["t1"] becomes the timestamp of the earliest barrier touch
    events = events.drop("side", axis=1)
    return events
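# A usage sketch of getEvents on synthetic data. The toy price path, the inline
# volatility target, the weekly event seeds and numThreads=1 are illustrative
# assumptions, not part of the project's pipeline; this only shows how the arguments
# documented above fit together.
def _getEvents_usage_example():
    idx = pd.bdate_range("2018-01-02", periods=250)
    close = pd.Series([100 + 0.05 * i + ((-1) ** i) * 0.3 for i in range(len(idx))], index=idx)  # toy price series

    trgt = close.pct_change().ewm(span=20).std()   # simple volatility-based target
    tEvents = close.index[::5]                     # seed a barrier every fifth business day

    # Vertical barrier 20 business days after each event (capped at the last observation)
    t1_locs = [min(loc + 20, len(close) - 1) for loc in close.index.searchsorted(tEvents)]
    t1 = pd.Series(close.index[t1_locs], index=tEvents)

    events = getEvents(close=close, tEvents=tEvents, ptSl=1.0, trgt=trgt,
                       minRet=0.005, numThreads=1, t1=t1)
    print(events.head())  # columns: t1 (first barrier touch) and trgt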
def test_sep_featured():
    global save_path, cache_dir
    num_processes = 6

    print("\n\nSEP_FEATURED - OLD METHOD\n\n")

    sep = pd.read_csv("../datasets/testing/sep.csv", parse_dates=["date"], index_col="date", low_memory=False)
    sf1_art = pd.read_csv("../datasets/testing/sf1_art.csv", parse_dates=["calendardate", "datekey", "reportperiod"], \
        index_col="calendardate", low_memory=False)
    metadata = pd.read_csv("../datasets/sharadar/METADATA_PURGED.csv", parse_dates=["firstpricedate"], low_memory=False)
    tb_rate = pd.read_csv("../datasets/macro/t_bill_rate_3m.csv", parse_dates=["date"], index_col="date")

    sep_extended = pandas_mp_engine(callback=extend_sep_for_sampling, atoms=sep, \
        data={"sf1_art": sf1_art, "metadata": metadata}, \
        molecule_key='sep', split_strategy='ticker', \
        num_processes=num_processes, molecules_per_process=1)

    sep_extended.sort_values(by=["ticker", "date"], ascending=True, inplace=True)

    sep_adjusted = pandas_mp_engine(callback=dividend_adjusting_prices_backwards, atoms=sep_extended, data=None, \
        molecule_key='sep', split_strategy='ticker', \
        num_processes=num_processes, molecules_per_process=1)

    sep_adjusted_plus_returns = pandas_mp_engine(callback=add_weekly_and_12m_stock_returns, atoms=sep_adjusted, data=None, \
        molecule_key='sep', split_strategy='ticker', \
        num_processes=num_processes, molecules_per_process=1)

    sep_adjusted_plus_returns.sort_values(by=["ticker", "date"], ascending=True, inplace=True)

    sep_prepared = pandas_mp_engine(callback=add_equally_weighted_weekly_market_returns, atoms=sep_adjusted_plus_returns, data=None, \
        molecule_key='sep', split_strategy='date', \
        num_processes=num_processes, molecules_per_process=1)

    sep_prepared.sort_values(by=["ticker", "date"], ascending=True, inplace=True)

    sep_prepared_plus_indmom = pandas_mp_engine(callback=add_indmom, atoms=sep_prepared, data=None, \
        molecule_key='sep', split_strategy='industry', \
        num_processes=num_processes, molecules_per_process=1)

    sep_prepared_plus_indmom.sort_values(by=["ticker", "date"], inplace=True)
    # sep_prepared_plus_indmom.to_csv("../datasets/testing/sep_prepared.csv")

    sep_sampled = pandas_mp_engine(callback=rebase_at_each_filing_sampling, atoms=sep_prepared_plus_indmom, data=None, \
        molecule_key='observations', split_strategy='ticker', num_processes=num_processes, molecules_per_process=1, \
        days_of_distance=20)

    sep_sampled.sort_values(by=["ticker", "date"], ascending=True, inplace=True)

    sep_featured = pandas_mp_engine(callback=add_sep_features, atoms=sep_sampled, \
        data={'sep': sep_prepared_plus_indmom, "sf1_art": sf1_art}, molecule_key='sep_sampled', split_strategy='ticker', \
        num_processes=num_processes, molecules_per_process=1)

    sep_featured.sort_values(by=["ticker", "date"], ascending=True, inplace=True)

    tbm_labeled_sep = pandas_mp_engine(callback=add_labels_via_triple_barrier_method, atoms=sep_featured, \
        data={'sep': sep_prepared_plus_indmom}, molecule_key='sep_featured', split_strategy='ticker', \
        num_processes=num_processes, molecules_per_process=1, ptSl=[1, -1], min_ret=None)

    tbm_labeled_sep.sort_values(by=["ticker", "date"], ascending=True, inplace=True)

    erp_labeled_sep = pandas_mp_engine(callback=equity_risk_premium_labeling, atoms=tbm_labeled_sep, \
        data=None, molecule_key='sep_featured', split_strategy='ticker', \
        num_processes=num_processes, molecules_per_process=1, tb_rate=tb_rate)

    erp_labeled_sep.sort_values(by=["ticker", "date"], ascending=True, inplace=True)

    sep_featured = erp_labeled_sep
    sep_featured.sort_values(by=["ticker", "date"], inplace=True)
    sep_featured.to_csv(save_path + "/sep_featured.csv")  # I really think this is the correct result
save_path + "/sep_featured.csv") # I really think this is the correct result #______________________CHAINING MP ENGINE____________________________ print("\n\nSEP_FEATURED - NEW METHOD\n\n") sep_featured_2 = generate_sep_featured( num_processes=num_processes, cache_dir=cache_dir, tb_rate=tb_rate, sep_path= "../datasets/testing/sep.csv", # paths relative to the engine I think sf1_art_path="../datasets/testing/sf1_art.csv", metadata_path="../datasets/sharadar/METADATA_PURGED.csv", resume=False) sep_featured_2 = sep_featured_2.sort_values(by=["ticker", "date" ]) # Should not need this sep_featured_2.to_csv(save_path + "/sep_featured_2.csv") """ sep_featured = sep_featured.fillna("NA") sep_featured_2 = sep_featured_2.fillna("NA") eq_result = sep_featured.eq(sep_featured_2) eq_result.to_csv("./testing_datasets/eq_result_sep_featured.csv") """ assert sep_featured.shape[0] == sep_featured_2.shape[0] assert sep_featured.shape[1] == sep_featured_2.shape[1] failed = False pos = None errors = [] len_sep_featured = len(sep_featured) for index in range(0, len_sep_featured): for column in sep_featured.columns: correct_val = sep_featured.iloc[index][column] if isinstance(correct_val, str): if correct_val != sep_featured_2.iloc[index][column]: failed = True pos = (index, column) errors.append(pos) elif isinstance(correct_val, pd.Timestamp) or isinstance( correct_val, pd.Timedelta): if str(correct_val) != str(sep_featured_2.iloc[index][column]): failed = True pos = (index, column) errors.append(pos) elif math.isnan(correct_val): if not math.isnan(sep_featured_2.iloc[index][column]): failed = True pos = (index, column) errors.append(pos) else: if correct_val != pytest.approx( sep_featured_2.iloc[index][column]): failed = True pos = (index, column) errors.append(pos) if failed == True: print("Shape: ", sep_featured.shape, sep_featured_2.shape) for pos in errors: print("Failed at position: ", pos, " Corr: ", sep_featured.iloc[pos[0]][pos[1]], "Othr: ", sep_featured_2.iloc[pos[0]][pos[1]]) assert len(errors) == 0
def testing_sf1_featured():
    global save_path, cache_dir
    num_processes = 6

    print("\n\nSF1_FEATURED - OLD METHOD\n\n")

    sf1_art = pd.read_csv("../datasets/testing/sf1_art_no_duplicates.csv", parse_dates=["calendardate", "datekey"], \
        index_col="calendardate", low_memory=False)
    sf1_arq = pd.read_csv("../datasets/testing/sf1_arq_no_duplicates.csv", parse_dates=["calendardate", "datekey"], \
        index_col="calendardate", low_memory=False)
    metadata = pd.read_csv("../datasets/sharadar/METADATA_PURGED.csv", parse_dates=["firstpricedate"], low_memory=False)

    sf1_art = sf1_art.sort_values(by=["ticker", "calendardate", "datekey"])
    sf1_arq = sf1_arq.sort_values(by=["ticker", "calendardate", "datekey"])

    sf1_featured = pandas_mp_engine(callback=add_sf1_features, atoms=sf1_art, \
        data={"sf1_arq": sf1_arq, 'metadata': metadata}, molecule_key='sf1_art', split_strategy='ticker', \
        num_processes=num_processes, molecules_per_process=1)

    sf1_featured = sf1_featured.sort_values(by=["ticker", "calendardate", "datekey"])

    sf1_featured = pandas_mp_engine(callback=add_industry_sf1_features, atoms=sf1_featured, \
        data={'metadata': metadata}, molecule_key='sf1_art', split_strategy='industry', \
        num_processes=num_processes, molecules_per_process=1)

    sf1_featured = sf1_featured.sort_values(by=["ticker", "calendardate", "datekey"])
    sf1_featured.to_csv(save_path + "/sf1_featured.csv")

    print("\n\nSF1_FEATURED - NEW METHOD\n\n")

    sf1_featured_2 = generate_sf1_featured(
        num_processes=num_processes,
        cache_dir=cache_dir,
        sf1_art_path="../datasets/testing/sf1_art_no_duplicates.csv",
        sf1_arq_path="../datasets/testing/sf1_arq_no_duplicates.csv",
        metadata_path="../datasets/sharadar/METADATA_PURGED.csv",
        resume=False)

    sf1_featured_2 = sf1_featured_2.sort_values(by=["ticker", "calendardate", "datekey"])
    sf1_featured_2.to_csv(save_path + "/sf1_featured_2.csv")

    assert sf1_featured.shape[0] == sf1_featured_2.shape[0]
    assert sf1_featured.shape[1] == sf1_featured_2.shape[1]

    failed = False
    pos = None
    errors = []
    len_sf1_featured = len(sf1_featured)

    for index in range(0, len_sf1_featured):
        for column in sf1_featured.columns:
            correct_val = sf1_featured.iloc[index][column]
            if isinstance(correct_val, str):
                if correct_val != sf1_featured_2.iloc[index][column]:
                    failed = True
                    pos = (index, column)
                    errors.append(pos)
            elif isinstance(correct_val, pd.Timestamp) or isinstance(correct_val, pd.Timedelta):
                if str(correct_val) != str(sf1_featured_2.iloc[index][column]):
                    failed = True
                    pos = (index, column)
                    errors.append(pos)
            elif math.isnan(correct_val):
                if not math.isnan(sf1_featured_2.iloc[index][column]):
                    failed = True
                    pos = (index, column)
                    errors.append(pos)
            else:
                if correct_val != pytest.approx(sf1_featured_2.iloc[index][column]):
                    failed = True
                    pos = (index, column)
                    errors.append(pos)

    if failed:
        print("Shape: ", sf1_featured.shape, sf1_featured_2.shape)
        for pos in errors:
            print("Failed at position: ", pos, " Corr: ", sf1_featured.iloc[pos[0]][pos[1]], "Othr: ", \
                  sf1_featured_2.iloc[pos[0]][pos[1]], sf1_featured.iloc[pos[0]]["datekey"], sf1_featured.iloc[pos[0]]["ticker"], \
                  sf1_featured_2.iloc[pos[0]]["datekey"], sf1_featured_2.iloc[pos[0]]["ticker"])

    assert len(errors) == 0
        return ps
    else:
        return np.nan


if __name__ == "__main__":
    sf1_art = pd.read_csv("./datasets/testing/sf1_art.csv", parse_dates=["datekey", \
        "calendardate", "reportperiod"], index_col="calendardate")
    sf1_art = sf1_art.sort_values(by="datekey", ascending=True)
    sf1_art["datekey"] = sf1_art.index

    sf1_arq = pd.read_csv("./datasets/testing/sf1_arq.csv", parse_dates=["datekey", \
        "calendardate", "reportperiod"], index_col="calendardate")
    sf1_arq = sf1_arq.sort_values(by="datekey", ascending=True)

    metadata = pd.read_csv("./datasets/sharadar/SHARADAR_TICKERS_METADATA.csv", \
        parse_dates=["firstpricedate"])

    sf1_art_featured = pandas_mp_engine(callback=add_sf1_features, atoms=sf1_art, \
        data={"sf1_arq": sf1_arq, 'metadata': metadata}, molecule_key='sf1_art', split_strategy='ticker', \
        num_processes=1, molecules_per_process=1)

    sf1_art_aapl = sf1_art_featured.loc[sf1_art_featured.ticker == "AAPL"]
    sf1_art_ntk = sf1_art_featured.loc[sf1_art_featured.ticker == "NTK"]

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(sf1_art_aapl)
def finalize_dataset(metadata, sep_featured=None, sf1_featured=None, num_processes=6):
    # 1. Drop duplicate filings, keeping the latest
    sf1_featured = sf1_featured.drop_duplicates(subset=["ticker", "datekey"], keep="last")

    # 2. Select features from SEP, SF1 etc.
    selected_features = base_cols + selected_sf1_features + selected_industry_sf1_features + selected_sep_features
    dataset = merge_datasets(sep_featured, sf1_featured, selected_features)

    # 3. Make all values numeric
    dataset["age"] = pd.to_timedelta(dataset["age"])
    dataset["age"] = dataset["age"].dt.days  # pd.to_numeric(dataset["age"].apply())

    # dataset.to_csv("./datasets/ml_ready_live/dataset_with_nans.csv", index=False)

    """
    merged_length = len(dataset)
    merged_cols = set(dataset.columns)
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print("Nan status after merge")
        print("Dataset length: ", merged_length)
        print(dataset.isnull().sum())
    """

    # 4. Drop columns with too many missing values
    columns_to_drop = ["saleinv", "pchsale_pchinvt", "pchsaleinv", "rd", "herf"]
    dataset = dataset.drop(columns_to_drop, axis=1)
    dataset = dataset.replace([np.inf, -np.inf], np.nan)

    features = list(set(dataset.columns) - set(labels) - set(base_cols) - set(["industry"]))

    # 5. Calculate mean and std of each feature for each size category over the whole market
    # Size classifications: nano < $50m; micro < $300m; small < $2bn; mid < $10bn; large < $200bn; mega >= $200bn
    dataset = dataset.dropna(axis=0, subset=["mve"])

    dataset["size"] = np.nan
    dataset.loc[dataset.mve < math.log(50e6), "size"] = "nano"
    dataset.loc[(dataset.mve >= math.log(50e6)) & (dataset.mve < math.log(300e6)), "size"] = "micro"
    dataset.loc[(dataset.mve >= math.log(300e6)) & (dataset.mve < math.log(2e9)), "size"] = "small"
    dataset.loc[(dataset.mve >= math.log(2e9)) & (dataset.mve < math.log(10e9)), "size"] = "mid"
    dataset.loc[(dataset.mve >= math.log(10e9)) & (dataset.mve < math.log(200e9)), "size"] = "large"
    dataset.loc[dataset.mve >= math.log(200e9), "size"] = "mega"

    nano_dataset = dataset.loc[dataset["size"] == "nano"]
    micro_dataset = dataset.loc[dataset["size"] == "micro"]
    small_dataset = dataset.loc[dataset["size"] == "small"]
    mid_dataset = dataset.loc[dataset["size"] == "mid"]
    large_dataset = dataset.loc[dataset["size"] == "large"]
    mega_dataset = dataset.loc[dataset["size"] == "mega"]

    print(features)

    size_rvs = {}
    for feature in features:
        size_rvs[feature] = {
            "nano": (nano_dataset[feature].mean(), nano_dataset[feature].std()),
            "micro": (micro_dataset[feature].mean(), micro_dataset[feature].std()),
            "small": (small_dataset[feature].mean(), small_dataset[feature].std()),
            "mid": (mid_dataset[feature].mean(), mid_dataset[feature].std()),
            "large": (large_dataset[feature].mean(), large_dataset[feature].std()),
            "mega": (mega_dataset[feature].mean(), mega_dataset[feature].std()),
        }
    # 6. Fix NaNs and drop rows
    dataset = pandas_mp_engine(
        callback=fix_nans_and_drop_rows,
        atoms=dataset,
        data={"metadata": metadata},
        molecule_key="dataset",
        split_strategy="industry_new",
        num_processes=num_processes,
        molecules_per_process=1,
        features=features,
        size_rvs=size_rvs
    )

    dataset["erp_1m_direction"] = np.sign(dataset["erp_1m"])

    dataset = dataset.loc[dataset.primary_label_tbm != 0]

    """
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print("\n\nNan Status After fixing Nans:")
        print("New dataset length: ", len(dataset))
        print("Percentage dropped: ", ((merged_length - len(dataset))/merged_length) * 100)
        print("Dropped columns: ", merged_cols.difference(set(dataset.columns)))
        print(dataset.isnull().sum())
        print(dataset.describe())
    """

    return dataset
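# The six market-cap buckets in finalize_dataset are assigned with one boolean mask per
# class. Since mve is a log market value, the same classification can also be written as
# a single pd.cut over log-scaled breakpoints; this is only an illustrative sketch of
# that equivalent formulation, not code used by the pipeline.
def _classify_size_sketch(mve):
    # Breakpoints in dollars: $50m, $300m, $2bn, $10bn, $200bn (mve is assumed to be log market value)
    bins = [-np.inf] + [math.log(x) for x in (50e6, 300e6, 2e9, 10e9, 200e9)] + [np.inf]
    labels = ["nano", "micro", "small", "mid", "large", "mega"]
    # right=False makes each interval left-closed, matching the >= / < masks in finalize_dataset
    return pd.cut(mve, bins=bins, labels=labels, right=False)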
            i8_advertising_intensity = 1 if ((art_row_cur["sgna"] / art_row_cur["assetsavg"]) >
                industry_means.at[caldate_cur, "industry_mean_advertising_intensity"]) else 0

            ms = i1_roa_above_avg + i2_cf_roa_above_avg + i3_ncfo_exceeds_netinc + i6_rnd_intensity + \
                i7_capex_indensity + i8_advertising_intensity

            return ms
    else:
        return np.nan


if __name__ == "__main__":
    sf1_art = pd.read_csv("./datasets/testing/sf1_art.csv", index_col="datekey", \
        parse_dates=["datekey", "calendardate"])
    metadata = pd.read_csv("./datasets/sharadar/SHARADAR_TICKERS_METADATA.csv", index_col="ticker", \
        parse_dates=["firstpricedate"])

    sf1_art["datekey"] = pd.to_datetime(sf1_art["datekey"])
    sf1_art["calendardate"] = pd.to_datetime(sf1_art["calendardate"])
    metadata["firstpricedate"] = pd.to_datetime(metadata["firstpricedate"])

    sep = pandas_mp_engine(callback=add_equally_weighted_weekly_market_returns, atoms=sep, data=None, \
        molecule_key='sep', split_strategy='date', \
        num_processes=4, molecules_per_process=1)