def cleaner(df, window=75):
    """Remove outlier price rows from a single-symbol OHLCV frame.

    First drops rows with zero volume or any near-zero price, then keeps
    only rows whose close lies within +/- 3 rolling standard deviations
    of the rolling mean of the close.

    Args:
        df: price DataFrame with open/high/low/close/volume columns.
        window: rolling sample window size in rows (default 75; was a
            hard-coded constant, now a backward-compatible parameter).

    Returns:
        Filtered copy of df with 'zscore', 'mean' and 'std' columns added.
    """
    logging.info("Cleaning outliers.")
    # drop rows with no volume & prices less than epsilon
    eps = 0.01
    # BUG FIX: the original tested df.close twice and never checked df.low,
    # so rows with a near-zero low survived the filter.
    df = df.loc[(df.open > eps) & (df.close > eps) & (df.high > eps) &
                (df.low > eps) & (df.volume > 0)].copy()
    # keep everything inside +/- `devs` std deviations of the rolling mean
    devs = 3  # std devs
    # global z-score is computed for inspection only; filtering below uses
    # the rolling statistics
    df['zscore'] = (df.close - df.close.mean()) / df.close.std(ddof=0)
    df['zscore'] = df['zscore'].abs()
    df['mean'] = df['close'].rolling(window, center=True).mean()
    df['std'] = df['close'].rolling(window, center=True).std()
    # rows where the rolling stats are NaN (window edges) compare False
    # and are dropped as well
    df = df[(df.close <= df['mean'] + devs * df['std']) &
            (df.close >= df['mean'] - devs * df['std'])]
    return df
def run_buy_sell_analyze(cfg, yearspan):
    """Run the buy-sell simulation followed by the analysis step for every
    (budget, hold-time) pair listed in the config.

    Args:
        cfg: config section with 'analyze_min_trades', 'hold_times_list',
             'budget_list' and 'low_price_cutoff' entries.
        yearspan: year span (int) identifying the filtered prices files.
    """
    min_trades = int(cfg['analyze_min_trades'])
    low_price_cutoff = float(cfg['low_price_cutoff'])
    budgets = [item.strip() for item in cfg['budget_list'].strip().split(",")]
    holds = [item.strip() for item in cfg['hold_times_list'].strip().split(",")]
    for b in budgets:
        for h in holds:
            logging.info(f"Running buy-sell with {h} days and {b} dollars...")
            # fee is fixed at 0 dollars for the batch runs
            buy_sell_v3(int(b), 0, int(h), float(low_price_cutoff), yearspan)
            logging.info(f"Running analyze with {h} days and {b} dollars...")
            analyze(h, b, yearspan, min_trades)
def filter_symbols(prices_start_date, prices_end_date):
    """Return the list of symbols whose listing dates cover the whole
    analysis window [prices_start_date, prices_end_date]."""
    summary_input_file = SUMMARY_INPUT_FILE
    if not os.path.exists(summary_input_file):
        logging.critical("Location file not found: " + summary_input_file)
        sys.exit()
    try:
        logging.info("Reading: " + summary_input_file)
        stox_df = pd.read_table(summary_input_file, sep=',')
        stox_df['stock_from_date'] = pd.to_datetime(stox_df['stock_from_date'])
        stox_df['stock_to_date'] = pd.to_datetime(stox_df['stock_to_date'])
    except Exception as e:
        logging.warning("Not parsed: " + summary_input_file + "\n" + str(e))
        sys.exit()
    # keep only symbols listed before the window starts and delisted
    # (if ever) after it ends
    covers_span = ((stox_df['stock_from_date'] <= prices_start_date) &
                   (stox_df['stock_to_date'] >= prices_end_date))
    return stox_df.loc[covers_span, 'symbol'].tolist()
def clean_prices(cfg):
    """Clean outliers for every symbol in the raw prices file, appending
    each cleaned group to the cleaned-prices CSV.

    Args:
        cfg: config section providing 'rolling_sample_window'.
    """
    prices_input_file = RAW_PRICES_INPUT_FILE
    prices_output_file = CLEANED_PRICES_FILE
    # clean up the existing output file (ignore !exists error)
    try:
        os.remove(prices_output_file)
    except OSError:
        pass
    # load prices grouped by symbol
    grouped = load_df(prices_input_file).groupby('symbol')
    write_header = True  # only the first appended group carries the header
    # Clean outliers per symbol.  NOTE: iterating a groupby already yields
    # each group; the redundant get_group() lookup was removed.
    for symbol, sym_df in grouped:
        logging.info(f"Cleaning outliers for {symbol}")
        sym_df = clean_outliers(sym_df, cfg['rolling_sample_window'])
        with open(prices_output_file, 'a') as f:
            sym_df.to_csv(f, index=False, sep=",", header=write_header)
        write_header = False
def run_make_blacklist(cfg):
    """Prompt for a year span and budget, then build the blacklist file
    using the configured hold times."""
    raw = input("input: year-span budget => ").split()
    yearspan = int(raw[0])
    budget = int(raw[1])
    logging.info("Running blacklist...")
    make_blacklist(budget, cfg['hold_times_list'], yearspan)
def run_analysis():
    """Prompt for analysis parameters and run the analyze step.

    Year span, hold days and budget stay strings: analyze() only uses
    them to build file names.
    """
    input_string = input(
        'input: year-span hold_days budget_dollars min_trades => ')
    logging.info("Running analysis...")
    yearspan, hold_days, budget_dollars, raw_min = input_string.split()[:4]
    analyze(hold_days, budget_dollars, yearspan, int(raw_min))
def load_df(df_file):
    """Load a comma-separated table into a DataFrame.

    Exits the process if the file cannot be parsed.
    """
    try:
        logging.info("Reading " + df_file)
        frame = pd.read_table(df_file, sep=',')
        logging.info("df shape " + str(frame.shape))
    except Exception as e:
        logging.critical("Not parsed: " + df_file + "\n" + str(e))
        sys.exit()
    return frame
def load_earnings(fname):
    """Load the earnings CSV into a DataFrame.

    Exits the process if the file cannot be parsed.
    """
    try:
        logging.info("Reading " + fname)
        frame = pd.read_table(fname, sep=',')
        logging.info("Earnings df shape " + str(frame.shape))
    except Exception as e:
        logging.critical("Not parsed: " + fname + "\n" + str(e))
        sys.exit()
    return frame
def load_prices(prices_file):
    """Load the prices CSV with the 'date' column parsed to datetimes.

    Exits the process if the file cannot be parsed.
    """
    try:
        logging.info("Reading " + prices_file)
        frame = pd.read_table(prices_file, sep=',')
        frame['date'] = pd.to_datetime(frame['date'])
        logging.info("Prices df shape " + str(frame.shape))
    except Exception as e:
        logging.critical("Not parsed: " + prices_file + "\n" + str(e))
        sys.exit()
    return frame
def main():
    """Interactive menu loop: dispatch the user's selection until 'q'."""
    # make the output dir if needed
    pathlib.Path(STOX_DATA_DIR).mkdir(exist_ok=True)

    def _delete_log(_cfg):
        # flush/close handlers before touching the log file
        logging.shutdown()
        if os.path.exists("log-stox.log"):
            os.remove("log-stox.log")
            logging.info("Log deleted.")

    # menu-reply -> handler; every handler takes the cfg from show_menu
    actions = {
        '0': _delete_log,
        '1': rm_stoxdir,
        '2': run_clean_prices,
        '3': write_symbols,
        '4': run_prices_filter,
        '5': run_buy_sell,
        '6': lambda _cfg: run_analysis(),
        '7': run_price_plot,
        '8': run_cleaner_test,
        # NOTE(review): run_buy_sell_analyze is defined with (cfg, yearspan)
        # but was invoked here with cfg only -- confirm the intended span.
        '9': lambda cfg: run_buy_sell_analyze(cfg),
        '10': run_make_blacklist,
        '11': run_auto,
    }

    reply = "none"
    while reply != "q":
        # pass the last reply in to menu to show last command
        reply, cfg = show_menu(reply)
        handler = actions.get(reply)
        if handler is not None:
            handler(cfg)
def load_config():
    """Read stox.ini from the working directory and return the parser.

    Exits the process if the file cannot be read.
    """
    ini_filename = "stox.ini"
    logging.info("Reading config from: " + ini_filename)
    parser = configparser.ConfigParser()
    try:
        parser.read(ini_filename)
    except Exception as e:
        logging.critical("Error reading .ini file: " + ini_filename)
        logging.critical("Exception: " + str(type(e)) + " " + str(e))
        sys.exit()
    return parser
def filter_prices(in_df, prices_start_date, prices_end_date, yearspan):
    """Filter a prices frame down to the ranked symbols and date window
    for one year span, then write it to the filtered-prices CSV.

    Args:
        in_df: full prices DataFrame (not modified; a copy is filtered).
        prices_start_date, prices_end_date: inclusive date bounds.
        yearspan: span length used to name the input/output files.
    """
    symbols_input_file = FILTERED_SYMBOLS_PREFIX + str(yearspan) + "years.csv"
    filtered_prices_output_file = (FILTERED_PRICES_PREFIX + str(yearspan) +
                                   "years.csv")
    # load the ranked symbol list for this span
    wanted = load_df(symbols_input_file)['symbol'].tolist()
    # filter on symbols
    out_df = in_df.copy()
    out_df = out_df[out_df['symbol'].isin(wanted)]
    logging.info("Filtered symbols df shape " + str(out_df.shape))
    # filter on date range
    logging.info("Filtering by date range.")
    out_df['date'] = pd.to_datetime(out_df['date'])
    in_window = ((out_df['date'] >= prices_start_date) &
                 (out_df['date'] <= prices_end_date))
    out_df = out_df[in_window].sort_values(['symbol', 'date'])
    logging.info("Filtered dates df shape " + str(out_df.shape))
    # write filtered prices
    logging.info("Writing filtered prices to " + filtered_prices_output_file)
    out_df.to_csv(filtered_prices_output_file, index=False)
def run_buy_sell(cfg):
    """Prompt for buy-sell parameters and run one buy-sell simulation.

    Reads year span, hold days, budget and fee from the user; the low
    price cutoff comes from the config.
    """
    low_price_cutoff = float(cfg['low_price_cutoff'])
    input_string = input(
        'input: year-span hold_days budget_dollars fee_dollars => ')
    args = input_string.split()
    yearspan = int(args[0])
    hold_days = int(args[1])
    budget_dollars = int(args[2])
    # BUG FIX: the fourth field is the fee (see the prompt); it was bound to
    # `min_trades`, leaving `fee_dollars` undefined and raising NameError
    # on the call below.
    fee_dollars = int(args[3])
    logging.info("Running buy-sell...")
    buy_sell_v3(budget_dollars, fee_dollars, hold_days, low_price_cutoff,
                yearspan)
def run_prices_filter(cfg):
    """Prompt for a year span and write the filtered prices file for it.

    The window ends at cfg['analysis_end_date'] and extends back
    years * 365 days.
    """
    years_back = input("input: number of prior years => ")
    logging.info("Running prices filter...")
    prices_df = load_df(CLEANED_PRICES_FILE)
    # get the timestamps for the year span
    years_int = int(years_back)
    end_yr, end_mo, end_d = (
        int(part) for part in cfg['analysis_end_date'].strip().split('-'))
    prices_end_date = pd.Timestamp(end_yr, end_mo, end_d)
    prices_start_date = prices_end_date - pd.Timedelta(days=years_int * 365)
    print(f'start: {str(prices_start_date)} end: {str(prices_end_date)}')
    # BUG FIX: previously passed the undefined name `in_df` (NameError);
    # the loaded cleaned-prices frame is what must be filtered.
    filter_prices(prices_df, prices_start_date, prices_end_date, years_int)
def main():
    """Report which symbols have forward / reverse splits in the raw prices.

    NOTE(review): another `def main()` exists earlier in this file; the
    later definition shadows it at import time -- confirm which is intended
    (these may be chunks from separate scripts).
    """
    proc_starttime = datetime.now()
    logging.info("Processing started: " + str(proc_starttime))
    register_matplotlib_converters()
    # load prices grouped per symbol
    grouped = load_prices(RAW_PRICES_INPUT_FILE).groupby('symbol')
    num_splits = 0   # symbols with any split coefficient above 1.0
    num_rsplits = 0  # symbols with any reverse split (coefficient below 1.0)
    for name, group in grouped:
        hi_coeff = group['split_coefficient'].max()
        lo_coeff = group['split_coefficient'].min()
        if hi_coeff > 1.0:
            num_splits += 1
        if lo_coeff < 1.0:
            num_rsplits += 1
            logging.info(str(name) + " r split " + str(lo_coeff))
    logging.info(str(num_splits) + " symbols with splits > 1")
    logging.info(str(num_rsplits) + " symbols with splits < 1")
    proc_endtime = datetime.now()
    logging.info(("Total Processing time (min): " + str(
        (proc_endtime - proc_starttime).total_seconds() / 60.0)))
def run_auto(cfg):
    """Run the full pipeline end to end: clean prices, then for every
    configured year span rank symbols, filter prices, run buy-sell plus
    analysis, and build a blacklist per budget.

    Args:
        cfg: config section with years_list, budget_list, hold_times_list,
             symbols_limit and analysis_end_date entries.
    """
    # BUG FIX: pd.datetime was deprecated and removed in pandas 2.x;
    # use the stdlib datetime already used elsewhere in this file.
    start_time = datetime.now()
    logging.info("Auto run started at " + str(start_time))
    run_clean_prices(cfg)
    logging.info("loading cleaned prices...")
    prices_df = load_df(CLEANED_PRICES_FILE)
    logging.info("Cleaned prices df shape " + str(prices_df.shape))
    # append the benchmark symbols to the prices_df
    logging.info("appending benchmarks")
    for benchsym in BENCHMARK_SYMBOLS:
        bench_input_file = RAW_DATA_DIR + benchsym + ".csv"
        logging.info("Reading: " + bench_input_file)
        bench_df = pd.read_table(bench_input_file, sep=',')
        logging.info("bench_df shape " + str(bench_df.shape))
        bench_df['date'] = pd.to_datetime(bench_df['date'])
        # BUG FIX: DataFrame.append was removed in pandas 2.0 -- use concat
        prices_df = pd.concat([prices_df, bench_df], ignore_index=True)
    logging.info("Cleaned prices with benchmarks df shape " +
                 str(prices_df.shape))
    years_list = cfg['years_list'].strip().split(',')
    symbols_limit = int(cfg['symbols_limit'])
    for years_back in years_list:
        # get the timestamps for the year span
        years_int = int(years_back)
        end_yr, end_mo, end_d = (
            int(part) for part in cfg['analysis_end_date'].strip().split('-'))
        prices_end_date = pd.Timestamp(end_yr, end_mo, end_d)
        prices_start_date = prices_end_date - pd.Timedelta(
            days=years_int * 365)
        print(f'start: {str(prices_start_date)} end: {str(prices_end_date)}')
        # make the filtered symbols list
        sort_symbols_by_eps(prices_start_date, prices_end_date, years_int,
                            symbols_limit, BENCHMARK_SYMBOLS)
        # filter prices on date span
        logging.info("writing filtered prices")
        filter_prices(prices_df, prices_start_date, prices_end_date,
                      years_int)
        run_buy_sell_analyze(cfg, years_int)
        budget_lst = cfg['budget_list'].strip().split(",")
        for budget in budget_lst:
            make_blacklist(float(budget.strip()), cfg['hold_times_list'],
                           years_int)
    end_time = datetime.now()
    logging.info("Auto run finished at " + str(end_time))
    logging.info("Elapsed time: " + str(end_time - start_time))
    input("DONE with AUTO > \n")
def sort_symbols_by_eps(prices_start_date, prices_end_date, yearspan, limit,
                        benchmarks):
    """Rank symbols by mean EPS over the window and write the top `limit`
    symbols (plus benchmarks) to the filtered symbols file.

    Args:
        prices_start_date, prices_end_date: inclusive earnings date bounds.
        yearspan: span length used to name the output file.
        limit: keep at most this many ranked symbols.
        benchmarks: symbols always prepended (given avg_eps 999 so they
            sort to the top).

    Returns:
        The 'symbol' column of the written DataFrame.
    """
    earnings_input_file = EARNINGS_INPUT_FILE
    sorted_symbols_output_file = (FILTERED_SYMBOLS_PREFIX + str(yearspan) +
                                  "years.csv")
    # load earnings
    earn_df = load_earnings(earnings_input_file)
    logging.info("Filtering by date range.")
    earn_df['date'] = pd.to_datetime(earn_df['date'])
    # filter by start / end dates
    earn_df = earn_df[(earn_df['date'] >= prices_start_date) &
                      (earn_df['date'] <= prices_end_date)]
    logging.info("Filtered dates df shape " + str(earn_df.shape))
    earn_df = earn_df.groupby('symbol')
    logging.info(str(len(earn_df)) + " symbols groups found")
    # get list of symbols whose valid listing dates cover the period
    logging.info("Filtering symbols (keep symbols covering the date span)")
    symbols_span_list = filter_symbols(prices_start_date, prices_end_date)
    logging.info(f"{len(symbols_span_list)} symbols cover the date span "
                 f"for {str(yearspan)} year history")
    # build [symbol, avg_eps] rows for symbols that cover the span;
    # a set makes the membership test O(1) instead of O(n) per symbol
    span_set = set(symbols_span_list)
    lst = [[symbol, grp['eps'].mean()] for symbol, grp in earn_df
           if symbol in span_set]
    symsort_df = pd.DataFrame(lst, columns=['symbol', 'avg_eps'])
    symsort_df = symsort_df.sort_values('avg_eps', ascending=False)
    # apply symbols limit
    symsort_df = symsort_df[:limit]
    # add the benchmark symbols (avg_eps 999 keeps them at the top)
    bdf = pd.DataFrame([{"symbol": b, "avg_eps": 999} for b in benchmarks])
    # BUG FIX: DataFrame.append was removed in pandas 2.0 -- use concat
    symsort_df = pd.concat([bdf, symsort_df])
    # write sorted symbols
    logging.info(f"Writing {len(symsort_df)} sorted symbols to " +
                 sorted_symbols_output_file)
    symsort_df.to_csv(sorted_symbols_output_file, index=False)
    return symsort_df['symbol']
def analyze(hold_days, budget, yearspan, min_trades):
    """Summarize buy-sell results per symbol into the analysis CSV.

    For each symbol the output row records trade counts, the fraction of
    profitable ("black") trades, average return/gain/loss, and the
    buy/sell details of the single best and worst trades.

    Args:
        hold_days: hold period of the buy-sell run (names the files).
        budget: budget of the buy-sell run (names the files).
        yearspan: year span of the buy-sell run (names the files).
        min_trades: symbols with fewer trades than this are dropped.
    """
    # input and output file names are keyed by the run parameters
    file_postfix = str(yearspan) + "years_" + str(hold_days) + "days_" + str(
        budget) + "dollars.csv"
    buy_sell_input_file = BUY_SELL_RESULTS_PREFIX + file_postfix
    analysis_output_file = ANALYSIS_FILE_PREFIX + file_postfix
    # clean up the existing output file (ignore !exists error)
    try:
        os.remove(analysis_output_file)
    except OSError:
        pass
    # load buy-sell results and group by symbol
    try:
        logging.info(f"Reading {buy_sell_input_file}")
        bsr_df = pd.read_table(buy_sell_input_file, sep=",")
        #bsr_df['buy_date'] = pd.to_datetime(bsr_df['buy_date'])
        #bsr_df['sell_date'] = pd.to_datetime(bsr_df['sell_date'])
        bsr_df = bsr_df.groupby('symbol')
        logging.info("Found " + str(len(bsr_df)) + " symbols in buy-sell data.")
    except Exception as e:
        logging.warning("Not parsed: " + buy_sell_input_file + "\n" + str(e))
        sys.exit()
    results_lst = []  # output rows buffered between flushes to disk
    qmax = 100000  # output queue max before a flush
    write_header = True  # results file header (write once flag)
    symnum = 0  # progress counter
    numsyms = len(bsr_df)
    for symbol, sym_df in bsr_df:
        try:
            num_trades = len(sym_df)
            # "black" = profitable trades, "red" = losing trades.
            # NOTE(review): trades with gain_total == 0.0 land in neither
            # bucket -- confirm that is intended.
            blk_trades_df = sym_df[sym_df['gain_total'] > 0.0]
            red_trades_df = sym_df[sym_df['gain_total'] < 0.0]
            num_black = len(blk_trades_df)
            pct_black = float(num_black) / float(num_trades)
            num_red = len(red_trades_df)
            avg_return = sym_df["gain_total"].mean()
            avg_gain = blk_trades_df["gain_total"].mean()
            avg_loss = red_trades_df["gain_total"].mean()
            # best single trade and its buy/sell details
            # (idxmax raises on an all-red symbol; the broad except below
            # logs it and skips the symbol)
            max_gain = blk_trades_df["gain_total"].max()
            mg_idx = blk_trades_df['gain_total'].idxmax()
            mg_buy_date = blk_trades_df.loc[mg_idx, 'buy_date']
            mg_buy_price = blk_trades_df.loc[mg_idx, 'buy_price']
            mg_sell_date = blk_trades_df.loc[mg_idx, 'sell_date']
            mg_sell_price = blk_trades_df.loc[mg_idx, 'sell_price']
            # worst single trade and its buy/sell details
            max_loss = red_trades_df["gain_total"].min()
            ml_idx = red_trades_df['gain_total'].idxmin()
            ml_buy_date = red_trades_df.loc[ml_idx, 'buy_date']
            ml_buy_price = red_trades_df.loc[ml_idx, 'buy_price']
            ml_sell_date = red_trades_df.loc[ml_idx, 'sell_date']
            ml_sell_price = red_trades_df.loc[ml_idx, 'sell_price']
            row = [
                symbol, num_trades, pct_black, num_black, num_red, avg_return,
                avg_gain, avg_loss, max_gain, mg_buy_date, mg_buy_price,
                mg_sell_date, mg_sell_price, max_loss, ml_buy_date,
                ml_buy_price, ml_sell_date, ml_sell_price
            ]
            # drop low numbers of trades
            if num_trades >= min_trades:
                results_lst.append(row)
            else:
                logging.info(
                    f"dropped symbol {symbol} for low trade occurrences.")
            # periodically write the results list
            if len(results_lst) >= qmax:
                logging.info(
                    f"Writing {qmax} results to {analysis_output_file}")
                append_analysis_csv(analysis_output_file, results_lst,
                                    write_header)
                write_header = False
                results_lst = []
            symnum += 1  # keep track of how many symbols have been processed
            logging.info(
                f"{symbol} \t\t[{symnum} of {numsyms}] \tpct_black: " +
                f"{pct_black:.1f} avg_return: {avg_return:.2f} ")
        except Exception as e:
            logging.error("Exception in analyze " + str(e))
    # final csv update
    if len(results_lst) > 0:
        logging.info(
            f"Writing {len(results_lst)} results to {analysis_output_file}")
        append_analysis_csv(analysis_output_file, results_lst, write_header)
def run_clean_prices(cfg):
    """Log the configured rolling window and run the price cleaning step."""
    logging.info(f"Cleaning with window = {cfg['rolling_sample_window']}")
    clean_prices(cfg)
def make_blacklist(budget, holds, yearspan):
    """Build the blacklist file: symbols profitable (> pct_cutoff black
    trades) across EVERY hold time, with per-hold pct_black columns and
    average return, sorted by average pct_black.

    Args:
        budget: budget dollars used to name the per-hold analysis files.
        holds: comma-separated hold-time list (e.g. "4,9,14,...").
        yearspan: year span used to name the analysis files.
    """
    # one analysis file per hold time
    file_list = []
    holds_lst = holds.split(",")
    for hold in holds_lst:
        hold = hold.strip()
        file_list.append(ANALYSIS_FILE_PREFIX + str(yearspan) + "years_" +
                         str(hold) + "days_" + str(int(budget)) +
                         "dollars.csv")
    pct_cutoff = 0.51  # minimum fraction of profitable trades to keep
    df_list = []  # keep each df
    # seed keep_symbols from the first file's profitable symbols
    logging.info(f"Loading {file_list[0]}")
    df = pd.read_table(file_list[0], sep=",")
    logging.info(f"file0 df shape: {df.shape}")
    df = df[df.pct_black > pct_cutoff]
    logging.info(f"file0 df > pct cutoff shape: {df.shape}")
    df_list.append(df)
    keep_symbols = set(df['symbol'].tolist())
    drop_symbols = set()
    # intersect with each remaining file: a symbol must pass the cutoff
    # in every hold time to stay
    for file in file_list[1:]:
        logging.info(f"Checking symbols in {file}")
        df = pd.read_table(file, sep=",")
        df = df[df['pct_black'] > pct_cutoff]
        check_symbols = df['symbol'].tolist()
        for k in keep_symbols:
            if k not in check_symbols:
                drop_symbols.add(k)
        df_list.append(df)
    print(f"{len(drop_symbols)} symbols getting dropped.")
    #logging.info(str(drop_symbols))
    for d in drop_symbols:
        keep_symbols.remove(d)
    print(f"{len(keep_symbols)} symbols kept.")
    print(f"{len(df_list)} dfs made")
    # build a df with the pct_black results across all holds
    # NOTE(review): these column names assume exactly seven hold times
    # (4,9,14,19,30,60,90) in `holds` -- confirm against the config.
    cols = [
        'symbol', 'd4', 'd9', 'd14', 'd19', 'd30', 'd60', 'd90', 'avg_return'
    ]
    rows_list = []
    i = 0
    # NOTE(review): iterating a set gives no deterministic row order here;
    # the final sort below makes the output order stable anyway.
    for symbol in keep_symbols:
        rtn_sum = 0  # to make average return
        row = [symbol]
        for df in df_list:
            rtn_sum += df.loc[df.symbol == symbol, 'avg_return'].values[0]
            pct_blk = df.loc[df.symbol == symbol, 'pct_black'].values[0]
            row.append(pct_blk)
        avg_return = rtn_sum / float(len(file_list))
        row.append(avg_return)
        rows_list.append(row)
        i += 1
        if ((i % 100) == 0):
            logging.info(f"processing symbol {i} of {len(keep_symbols)}")
    # blacklist df
    logging.info(f"Building blacklist df with {len(rows_list)} rows")
    bl_df = pd.DataFrame(rows_list, columns=cols)
    # add column for avg pct_blk (mean of the seven per-hold columns)
    bl_df['avg_pct_blk'] = bl_df.iloc[:, 1:8].mean(axis=1)
    bl_df = bl_df.sort_values('avg_pct_blk', ascending=False)
    logging.info(f"bl_df shape {bl_df.shape}")
    print(bl_df.head())
    bl_file = BLACKLIST_FILE_PREFIX + str(yearspan) + "years_" + str(
        int(budget)) + "dollars.csv"
    bl_df.to_csv(bl_file, index=False, float_format='%.3f')
    print(f"Wrote {bl_file}")
def save_config(config):
    """Write the given ConfigParser back to stox.ini in the working dir."""
    ini_filename = "stox.ini"
    with open(ini_filename, 'w') as fh:
        config.write(fh)
    logging.info("Saved " + ini_filename)
def test_cleaner(cfg):
    """Run the outlier cleaner on one symbol/date-range and plot raw vs
    cleaned close prices in two stacked panels.

    cfg['cleaner_test_params'] holds "SYMBOL YYYY-MM-DD YYYY-MM-DD"
    (symbol, start date, end date), e.g. "IBM 2009-01-01 2019-01-01".
    Writes the raw and cleaned frames to TSVs and saves the plot as PNG.
    """
    register_matplotlib_converters()
    param_list = cfg['cleaner_test_params'].split(" ")
    if len(param_list) < 3:
        # BUG FIX: logging.warn was deprecated and removed in Python 3.13
        logging.warning("Plot params malformed. Skipping plot.")
        return
    logging.info(f"Using symbol {param_list[0].strip()}")
    logging.info(f" from {param_list[1]}")
    logging.info(f" to {param_list[2]}")
    symbol = param_list[0].strip()
    start_yr, start_mo, start_d = (int(p) for p in param_list[1].split('-'))
    end_yr, end_mo, end_d = (int(p) for p in param_list[2].split('-'))
    date_start = pd.Timestamp(start_yr, start_mo, start_d)
    date_end = pd.Timestamp(end_yr, end_mo, end_d)
    # use the per-symbol raw cache file if it exists, else the full raw file
    raw_symbol_file = STOX_DATA_DIR + symbol + "_raw.csv"
    if os.path.exists(raw_symbol_file):
        prices_input_file = raw_symbol_file
    else:
        prices_input_file = RAW_PRICES_INPUT_FILE
    try:
        logging.info("Reading " + prices_input_file)
        prices_df = pd.read_table(prices_input_file, sep=',')
        prices_df['date'] = pd.to_datetime(prices_df['date'])
        logging.info("Prices df shape " + str(prices_df.shape))
    except Exception as e:
        logging.critical("Not parsed: " + prices_input_file + "\n" + str(e))
        sys.exit()
    # get group for this symbol
    logging.info("Filtering on symbol")
    df = prices_df.groupby('symbol').get_group(symbol)
    # write the raw cache file for this symbol (all time frames)
    if not os.path.exists(raw_symbol_file):
        logging.info(f"Writing raw file for {symbol}")
        df.to_csv(raw_symbol_file, index=False, sep=",", float_format='%.3f')
    # filter on date range
    logging.info("Filtering on date range")
    df = df[(df['date'] >= date_start) & (df['date'] <= date_end)]
    df = df.sort_values(['date'])
    # write raw df to file (span_str computed once; it was built twice)
    span_str = (date_start.strftime("%Y-%m-%d") + "_" +
                date_end.strftime("%Y-%m-%d"))
    csv_name = STOX_DATA_DIR + symbol + "_" + span_str + "_raw.csv"
    df.to_csv(csv_name, index=False, sep="\t", float_format='%.3f')
    # run the cleaner and write its output
    cdf = cleaner(df)
    csv_name = STOX_DATA_DIR + symbol + "_" + span_str + "_cleantest.csv"
    cdf.to_csv(csv_name, index=False, sep="\t", float_format='%.3f')
    # plot raw (top) vs cleaned (bottom) close prices
    fig, axs = plt.subplots(nrows=2, sharex=True)
    plt.suptitle(symbol, fontsize=10)
    axs[0].set_title('Raw', {'fontsize': 10})
    axs[0].scatter(df['date'].tolist(), df['close'], color='blue', s=2)
    axs[1].set_title('Cleaned', {'fontsize': 10})
    axs[1].scatter(cdf['date'].tolist(), cdf['close'], color='green', s=2)
    plt_filename = STOX_DATA_DIR + symbol + "_" + span_str + ".png"
    plt.savefig(plt_filename)
    plt.show()
def run_price_plot(cfg):
    """Menu action: plot one symbol's prices per the configured params."""
    logging.info("Running price plot...")
    plot_price(cfg)
def plot_price(cfg):
    """Plot open/close prices for one symbol over a date range.

    cfg['plot_params'] holds "SYMBOL YYYY-MM-DD YYYY-MM-DD" (symbol,
    start date, end date). Writes the filtered rows to a TSV and saves
    the scatter plot as a PNG in the stox data directory.
    """
    param_list = cfg['plot_params'].split(" ")
    if len(param_list) < 3:
        # BUG FIX: logging.warn was deprecated and removed in Python 3.13
        logging.warning("Plot params malformed. Skipping plot.")
        return
    logging.info(f"Plotting symbol {param_list[0].strip()}")
    logging.info(f" from {param_list[1]}")
    logging.info(f" to {param_list[2]}")
    register_matplotlib_converters()
    prices_input_file = CLEANED_PRICES_FILE
    try:
        logging.info("Reading " + prices_input_file)
        prices_df = pd.read_table(prices_input_file, sep=',')
        prices_df['date'] = pd.to_datetime(prices_df['date'])
        logging.info("Prices df shape " + str(prices_df.shape))
    except Exception as e:
        logging.critical("Not parsed: " + prices_input_file + "\n" + str(e))
        sys.exit()
    # param string [symbol start-date end-date]
    symbol = param_list[0].strip()
    start_yr, start_mo, start_d = (int(p) for p in param_list[1].split('-'))
    end_yr, end_mo, end_d = (int(p) for p in param_list[2].split('-'))
    date_start = pd.Timestamp(start_yr, start_mo, start_d)
    date_end = pd.Timestamp(end_yr, end_mo, end_d)
    # filter on date range
    logging.info("Filtering on date range")
    df = prices_df[(prices_df['date'] >= date_start) &
                   (prices_df['date'] <= date_end)]
    df = df.sort_values(['date'])
    # get group for this symbol
    logging.info("Filtering on symbol")
    df = df.groupby('symbol').get_group(symbol)
    # write df to file
    span_str = (date_start.strftime("%Y-%m-%d") + "_" +
                date_end.strftime("%Y-%m-%d"))
    csv_name = STOX_DATA_DIR + symbol + "_" + span_str + ".csv"
    df.to_csv(csv_name, index=False, sep="\t", float_format='%.3f')
    # plot open (green) and close (blue) price
    fig = plt.figure()
    plt.suptitle(symbol, fontsize=10)
    plt.scatter(df['date'].tolist(), df['open'], color='green', s=2)
    plt.scatter(df['date'].tolist(), df['close'], color='blue', s=2)
    plt_filename = STOX_DATA_DIR + symbol + "_" + span_str + ".png"
    plt.savefig(plt_filename)
    plt.show()
def rm_stoxdir(cfg):
    """Delete every file matching *.* in the stox data directory.

    Args:
        cfg: unused; kept so the menu can call all actions uniformly.

    NOTE(review): the "*.*" glob only matches names containing a dot --
    confirm extensionless files never land in this directory.
    """
    for entry in Path(STOX_DATA_DIR).glob("*.*"):
        entry.unlink()
    logging.info("Removed stox data.")
def buy_sell_v3(budget_dollars, fee_dollars, hold_days, low_price_cutoff,
                yearspan):
    """Simulate buying each symbol on every trading day and selling after
    `hold_days` trading days, appending completed trades to the buy-sell
    results CSV for this (yearspan, hold, budget) combination.

    Args:
        budget_dollars: dollars available per buy.
        fee_dollars: per-trade fee passed through to sell_row().
        hold_days: trading days to hold before selling.
        low_price_cutoff: passed through to append_csv() for filtering.
        yearspan: year span selecting the filtered prices input file.
    """
    prices_input_file = FILTERED_PRICES_PREFIX + str(yearspan) + "years.csv"
    hold_str = str(hold_days)
    budget_dollars_str = str(int(budget_dollars))
    buy_sell_output_postfix = str(
        yearspan
    ) + "years_" + hold_str + "days_" + budget_dollars_str + "dollars.csv"
    buy_sell_output_file = BUY_SELL_RESULTS_PREFIX + buy_sell_output_postfix
    # clean up the existing output file (ignore !exists error)
    try:
        os.remove(buy_sell_output_file)
    except OSError:
        pass
    # load prices and group by symbol
    try:
        logging.info("Reading: " + prices_input_file)
        stox_df = pd.read_table(prices_input_file, sep=',')
        stox_df['date'] = pd.to_datetime(stox_df['date'])
        stox_df = stox_df.groupby('symbol')
        logging.info("Found " + str(len(stox_df)) + " symbols in price data.")
    except Exception as e:
        logging.warning("Not parsed: " + prices_input_file + "\n" + str(e))
        sys.exit()
    numsyms = len(stox_df)  # total number of symbols
    cant_afford = set()  # set of symbols whose unit share price exceeds budget
    penny_stocks = set()  # set of low price symbols
    results_lst = []  # completed transactions
    write_header = True  # results file header (write once flag)
    # Loop over each symbol
    symnum = 1  # symbol idx
    for symbol, sym_df in stox_df:
        max_gain = max_loss = 0.0  # per-symbol extremes, logging only
        pending_lst = []  # has buy attributes for each buy date
        # TODO make sure the sym_df is in date order ascending
        row_idx = 0
        for row in sym_df.itertuples():
            # attempt a buy on every trading day
            buy_price, shares_bought, status = buy(row, budget_dollars)
            split_coeff = row.split_coefficient
            if status == 'price_high':
                cant_afford.add(symbol)  # todo: count these
            elif status == 'price_low':
                penny_stocks.add(symbol)  # todo: count these
            # add current row to pending sale list
            pending_lst.append([
                symbol, row_idx, row.date, shares_bought, buy_price,
                split_coeff
            ])
            # Once the pending sales list is full (hold period elapsed for
            # the oldest buy), start creating results list
            if row_idx >= hold_days:
                result_row = sell_row(row, pending_lst, fee_dollars)
                returns = float(result_row[11])  # just for logging
                if returns > max_gain:
                    max_gain = returns
                if returns < max_loss:
                    max_loss = returns
                # add the result row to results list
                results_lst.append(result_row)
                # remove the sold row (oldest pending buy)
                del pending_lst[0]
            row_idx += 1
        # periodically write the results list
        qmax = 100000
        if len(results_lst) >= qmax:
            logging.info(f"Writing {qmax} results to {buy_sell_output_file}")
            append_csv(buy_sell_output_file, results_lst, write_header,
                       low_price_cutoff)
            write_header = False
            results_lst = []
        symnum += 1  # keep track of how many symbols have been processed
        logging.info(f"{symbol} \t\t[{symnum} of {numsyms}] \ttrade days: " +
                     f"{str(len(sym_df))} max_gain: {max_gain:.2f} " +
                     f"max_loss: {max_loss:.2f}")
    # final csv update
    if len(results_lst) > 0:
        logging.info(
            f"Writing {len(results_lst)} results to {buy_sell_output_file}")
        append_csv(buy_sell_output_file, results_lst, write_header,
                   low_price_cutoff)
    logging.info("Zero shares bought (price exceeds budget): " +
                 str(cant_afford))
    logging.info("Zero shares bought (price too low): " + str(penny_stocks))
def run_cleaner_test(cfg):
    """Menu action: run the cleaner self-test/plot for one symbol."""
    logging.info("Running cleaner test...")
    test_cleaner(cfg)