Example #1
def cleaner(df):

    logging.info("Cleaning outliers.")
    # drop rows with zero volume or any price at or below epsilon
    eps = 0.01
    df = df.loc[(df.open > eps) & (df.close > eps) & (df.high > eps) &
                (df.low > eps) & (df.volume > 0)].copy()

    # keep everything inside +/- 3 std deviations of the rolling mean
    # rolling price sampling window
    window = 75
    devs = 3  # std devs

    df['zscore'] = (df.close - df.close.mean()) / df.close.std(ddof=0)
    df['zscore'] = df['zscore'].abs()
    df['mean'] = df['close'].rolling(window, center=True).mean()
    df['std'] = df['close'].rolling(window, center=True).std()
    df = df[(df.close <= df['mean'] + devs * df['std'])
            & (df.close >= df['mean'] - devs * df['std'])]

    #df = df[(df.zscore) <= devs]

    #print(str(df[1500:1600]))
    #sys.exit()

    #df = df.dropna()
    #df.to_csv("clean_outliers.csv")
    #df = df.drop(['mean', 'std'], axis=1)

    return df
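A minimal sketch of the same rolling mean/std band filter on synthetic data (pandas and numpy assumed; the window and values here are illustrative, not the production settings above):

import numpy as np
import pandas as pd

# synthetic close prices with two injected outliers
rng = np.random.default_rng(0)
close = pd.Series(100 + rng.normal(0, 1, 300).cumsum())
close.iloc[50] = 500.0   # spike high
close.iloc[200] = 1.0    # spike low

window, devs = 21, 3
mean = close.rolling(window, center=True).mean()
std = close.rolling(window, center=True).std()

# keep points inside mean +/- devs * std; window-edge rows where the
# rolling stats are NaN compare False and are dropped as well
mask = (close <= mean + devs * std) & (close >= mean - devs * std)
clean = close[mask]
print(f"kept {len(clean)} of {len(close)} points")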
Example #2
def run_buy_sell_analyze(cfg, yearspan):

    min_trades = int(cfg['analyze_min_trades'])

    holds_cfg = cfg['hold_times_list'].strip()
    budget_cfg = cfg['budget_list'].strip()

    holds_lst = holds_cfg.split(",")
    budget_lst = budget_cfg.split(",")

    low_price_cutoff = float(cfg['low_price_cutoff'])

    for budget in budget_lst:
        b = budget.strip()

        for hold in holds_lst:
            h = hold.strip()

            info_str = f"Running buy-sell with {h} days and {b} dollars..."
            logging.info(info_str)

            buy_sell_v3(int(b), 0, int(h), float(low_price_cutoff), yearspan)

            info_str = f"Running analyze with {h} days and {b} dollars..."
            logging.info(info_str)

            analyze(h, b, yearspan, min_trades)
Example #3
def filter_symbols(prices_start_date, prices_end_date):

    summary_input_file = SUMMARY_INPUT_FILE

    if not os.path.exists(summary_input_file):
        logging.critical("Location file not found: " +
                            summary_input_file)
        sys.exit()

    try:
        logging.info("Reading: " + summary_input_file)
        stox_df = pd.read_table(summary_input_file, sep=',')
        stox_df['stock_from_date'] = pd.to_datetime(stox_df['stock_from_date'])
        stox_df['stock_to_date'] = pd.to_datetime(stox_df['stock_to_date'])

    except Exception as e:
        logging.warning("Not parsed: " + summary_input_file + "\n" + str(e))
        sys.exit()
        
    # drop any symbols that don't cover at least the analysis window
    stox_df = stox_df[(stox_df['stock_from_date'] <= prices_start_date) &
                    (stox_df['stock_to_date'] >= prices_end_date)]


    return stox_df['symbol'].tolist()
Example #4
def clean_prices(cfg):

    prices_input_file = RAW_PRICES_INPUT_FILE
    prices_output_file = CLEANED_PRICES_FILE

    # clean up the existing output file (ignore !exists error)
    try:
        os.remove(prices_output_file)
    except OSError:
        pass

    # load prices
    prices_df = load_df(prices_input_file).groupby('symbol')

    write_header = True

    # Clean outliers
    for symbol, sym_df in prices_df:

        logging.info(f"Cleaning outliers for {symbol}")
        sym_df = clean_outliers(sym_df, cfg['rolling_sample_window'])

        with open(prices_output_file, 'a') as f:
            sym_df.to_csv(f, index=False, sep=",", header=write_header)
            write_header = False
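A minimal sketch of the header-once append pattern used above, where only the first group writes the CSV header (file name and columns are illustrative):

from pathlib import Path
import pandas as pd

Path("example_out.csv").unlink(missing_ok=True)  # start fresh, as clean_prices does

df = pd.DataFrame({
    "symbol": ["A", "A", "B"],
    "close":  [1.0, 1.1, 2.0],
})

write_header = True
for symbol, sym_df in df.groupby("symbol"):
    # the first group writes the header, later groups append rows only
    sym_df.to_csv("example_out.csv", mode="a", index=False, header=write_header)
    write_header = False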
Example #5
def run_make_blacklist(cfg):

    input_str = input("input: year-span budget =>  ")
    args = input_str.split()
    yearspan = int(args[0])
    budget = int(args[1])
    holds = cfg['hold_times_list']

    logging.info("Running blacklist...")
    make_blacklist(budget, holds, yearspan)
Example #6
def run_analysis():

    input_string = input(
        'input: year-span hold_days budget_dollars min_trades =>  ')
    logging.info("Running analysis...")
    args = input_string.split()
    yearspan = args[0]
    hold_days = args[1]
    budget_dollars = args[2]
    min_trades = int(args[3])
    analyze(hold_days, budget_dollars, yearspan, min_trades)
Example #7
def load_df(df_file):

    try:
        logging.info("Reading " + df_file)
        df = pd.read_table(df_file, sep=',')
        logging.info("df shape " + str(df.shape))

    except Exception as e:
        logging.critical("Not parsed: " + df_file + "\n" + str(e))
        sys.exit()

    return df
Example #8
def load_earnings(fname):

    try:
        logging.info("Reading " + fname)
        earn_df = pd.read_table(fname, sep=',')
        logging.info("Earnings df shape " + str(earn_df.shape))
        
    except Exception as e: 
        logging.critical("Not parsed: " + fname + "\n" + str(e))
        sys.exit()   

    return earn_df
Example #9
def load_prices(prices_file):

    try:
        logging.info("Reading " + prices_file)
        prices_df = pd.read_table(prices_file, sep=',')
        prices_df['date'] = pd.to_datetime(prices_df['date'])
        logging.info("Prices df shape " + str(prices_df.shape))

    except Exception as e:
        logging.critical("Not parsed: " + prices_file + "\n" + str(e))
        sys.exit()

    return prices_df
Example #10
def main():

    # make the output dir if needed
    stox_dir = STOX_DATA_DIR
    pathlib.Path(stox_dir).mkdir(exist_ok=True)

    reply = "none"
    while reply != "q":

        # pass the last reply in to menu to show last command
        reply, cfg = show_menu(reply)

        if reply == '0':
            logging.info("Deleting log file.")
            logging.shutdown()
            if os.path.exists("log-stox.log"):
                os.remove("log-stox.log")
            #input("OK >")

        elif reply == '1':
            rm_stoxdir(cfg)

        elif reply == "2":
            run_clean_prices(cfg)

        elif reply == "3":
            write_symbols(cfg)

        elif reply == "4":
            run_prices_filter(cfg)

        elif reply == "5":
            run_buy_sell(cfg)

        elif reply == "6":
            run_analysis()

        elif reply == "7":
            run_price_plot(cfg)

        elif reply == "8":
            run_cleaner_test(cfg)

        elif reply == "9":
            run_buy_sell_analyze(cfg)

        elif reply == "10":
            run_make_blacklist(cfg)

        elif reply == "11":
            run_auto(cfg)
Example #11
def load_config():

    # Config parser
    ini_filename = "stox.ini"
    logging.info("Reading config from: " + ini_filename)
    config = configparser.ConfigParser()
    try:
        config.read(ini_filename)
    except Exception as e:
        logging.critical("Error reading .ini file: " + ini_filename)
        logging.critical("Exception: " + str(type(e)) + " " + str(e))
        sys.exit()

    return config
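The other functions here index cfg with plain keys (for example cfg['analyze_min_trades']), which suggests the cfg they receive is one section of this parser. A minimal sketch, assuming a hypothetical section name "stox" since the real stox.ini layout isn't shown:

import configparser

config = configparser.ConfigParser()
config.read("stox.ini")

cfg = config["stox"]  # hypothetical section name; values come back as strings
hold_times = [h.strip() for h in cfg["hold_times_list"].split(",")]
min_trades = int(cfg["analyze_min_trades"])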
Example #12
def filter_prices(in_df, prices_start_date, prices_end_date, yearspan):

    prices_df = in_df.copy()

    symbols_input_file = FILTERED_SYMBOLS_PREFIX + str(yearspan) + "years.csv"
    filtered_prices_output_file = FILTERED_PRICES_PREFIX + str(
        yearspan) + "years.csv"

    # load symbols
    symbols_df = load_df(symbols_input_file)
    symbols = symbols_df['symbol'].tolist()

    # filter on symbols
    prices_df = prices_df[prices_df['symbol'].isin(symbols)]
    logging.info("Filtered symbols df shape " + str(prices_df.shape))

    # filter on date range
    logging.info("Filtering by date range.")
    prices_df['date'] = pd.to_datetime(prices_df['date'])
    prices_df = prices_df[(prices_df['date'] >= prices_start_date) & (
        prices_df['date'] <= prices_end_date)].sort_values(['symbol', 'date'])

    logging.info("Filtered dates df shape " + str(prices_df.shape))

    # write filtered prices
    logging.info("Writing filtered prices to " + filtered_prices_output_file)
    prices_df.to_csv(filtered_prices_output_file, index=False)
Example #13
def run_buy_sell(cfg):

    low_price_cutoff = float(cfg['low_price_cutoff'])

    input_string = input(
        'input: year-span hold_days budget_dollars fee_dollars =>  ')

    args = input_string.split()
    yearspan = int(args[0])
    hold_days = int(args[1])
    budget_dollars = int(args[2])
    fee_dollars = int(args[3])

    logging.info("Running buy-sell...")
    buy_sell_v3(budget_dollars, fee_dollars, hold_days, low_price_cutoff,
                yearspan)
Example #14
def run_prices_filter(cfg):

    years_back = input("input: number of prior years => ")

    logging.info("Running prices filter...")
    prices_df = load_df(CLEANED_PRICES_FILE)

    # get the timestamps for the year span
    years_int = int(years_back)

    end_date_str = cfg['analysis_end_date'].strip()
    end_list = end_date_str.split('-')
    end_yr = int(end_list[0])
    end_mo = int(end_list[1])
    end_d = int(end_list[2])
    prices_end_date = pd.Timestamp(end_yr, end_mo, end_d)

    prices_start_date = prices_end_date - pd.Timedelta(days=years_int * 365)
    print(f'start: {str(prices_start_date)}  end: {str(prices_end_date)}')

    filter_prices(prices_df, prices_start_date, prices_end_date, years_int)
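The start date above is derived by subtracting whole 365-day years from the configured end date; a minimal sketch of that arithmetic (dates are illustrative, and leap days make the span drift slightly):

import pandas as pd

end = pd.Timestamp(2019, 1, 1)
years_back = 3
start = end - pd.Timedelta(days=years_back * 365)
print(start, end)  # 2016-01-02 00:00:00 2019-01-01 00:00:00 (2016 was a leap year)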
Example #15
def main():

    proc_starttime = datetime.now()
    logging.info("Processing started: " + str(proc_starttime))

    prices_input_file = RAW_PRICES_INPUT_FILE

    register_matplotlib_converters()

    # load prices
    prices_df = load_prices(prices_input_file)

    # which symbols have splits?
    prices_df = prices_df.groupby('symbol')
    num_splits = num_rsplits = 0
    for name, group in prices_df:
        max_split = group['split_coefficient'].max()
        min_split = group['split_coefficient'].min()
        if max_split > 1.0:
            #logging.info(str(name) + " max split " + str(max_split))
            num_splits += 1
        if min_split < 1.0:
            num_rsplits += 1
            logging.info(str(name) + " r split " + str(min_split))

    logging.info(str(num_splits) + " symbols with splits > 1")
    logging.info(str(num_rsplits) + " symbols with splits < 1")

    # plot split coeff
    #grpname = "BRK.A"
    #grp = prices_df.get_group(grpname)
    #plot_price(grpname, grp, pd.Timestamp(2009,1,1), pd.Timestamp(2019, 1, 1))

    proc_endtime = datetime.now()
    logging.info(("Total Processing time (min): " + str(
        (proc_endtime - proc_starttime).total_seconds() / 60.0)))
Example #16
def run_auto(cfg):

    startTime = pd.Timestamp.now()
    logging.info("Auto run started at " + str(startTime))

    run_clean_prices(cfg)

    logging.info("loading cleaned prices...")
    prices_df = load_df(CLEANED_PRICES_FILE)

    logging.info("Cleaned prices df shape " + str(prices_df.shape))

    # append the benchmark symbols to the prices_df
    logging.info("appending benchmarks")
    for benchsym in BENCHMARK_SYMBOLS:
        bench_input_file = RAW_DATA_DIR + benchsym + ".csv"
        logging.info("Reading: " + bench_input_file)
        bench_df = pd.read_table(bench_input_file, sep=',')
        logging.info("bench_df shape " + str(bench_df.shape))
        bench_df['date'] = pd.to_datetime(bench_df['date'])
        prices_df = pd.concat([prices_df, bench_df], ignore_index=True)

    logging.info("Cleaned prices with benchmarks df shape " +
                 str(prices_df.shape))

    years = cfg['years_list'].strip()
    years_list = years.split(',')
    symbols_limit = int(cfg['symbols_limit'])

    for years_back in years_list:

        # get the timestamps for the year span
        years_int = int(years_back)

        end_date_str = cfg['analysis_end_date'].strip()
        end_list = end_date_str.split('-')
        end_yr = int(end_list[0])
        end_mo = int(end_list[1])
        end_d = int(end_list[2])
        prices_end_date = pd.Timestamp(end_yr, end_mo, end_d)

        prices_start_date = prices_end_date - pd.Timedelta(days=years_int *
                                                           365)
        print(f'start: {str(prices_start_date)}  end: {str(prices_end_date)}')

        # make the filtered symbols list
        sort_symbols_by_eps(prices_start_date, prices_end_date, years_int,
                            symbols_limit, BENCHMARK_SYMBOLS)

        # filter prices on date span
        logging.info("writing filtered prices")
        filter_prices(prices_df, prices_start_date, prices_end_date, years_int)

        run_buy_sell_analyze(cfg, years_int)

        budget_cfg = cfg['budget_list'].strip()
        budget_lst = budget_cfg.split(",")
        for budget in budget_lst:
            b = float(budget.strip())
            make_blacklist(b, cfg['hold_times_list'], years_int)

    endTime = pd.Timestamp.now()
    logging.info("Auto run finished at " + str(endTime))
    logging.info("Elapsed time: " + str(endTime - startTime))

    input("DONE with AUTO > ")
Example #17
def sort_symbols_by_eps(prices_start_date, prices_end_date, yearspan, limit, benchmarks):

    earnings_input_file = EARNINGS_INPUT_FILE
    sorted_symbols_output_file = FILTERED_SYMBOLS_PREFIX + str(yearspan) + "years.csv"
        
    # load earnings
    earn_df = load_earnings(earnings_input_file)

    logging.info("Filtering by date range.")
    earn_df['date'] = pd.to_datetime(earn_df['date'])

    #Filter by start / end dates
    earn_df = earn_df[(earn_df['date'] >= prices_start_date) &
                   (earn_df['date'] <= prices_end_date)]

    logging.info("Filtered dates df shape " + str(earn_df.shape))

    earn_df = earn_df.groupby('symbol')
    logging.info(str(len(earn_df)) + " symbols groups found")

    # get list of symbols whose valid listing dates cover the start-end period
    logging.info("Filtering symbols (keep symbols covering the date span)")
    symbols_span_list = filter_symbols(prices_start_date, prices_end_date)
    logging.info(f"{len(symbols_span_list)} symbols cover the date span for {str(yearspan)} year history")

    # make df of sorted symbols
    cols = ['symbol', 'avg_eps']
    lst = []
    for symbol, symbol_grp in earn_df:
        # keep only symbols that pass the symbols listing span
        if symbol in symbols_span_list:
            #logging.info("Getting mean eps for " + symbol)
            avg_eps = symbol_grp['eps'].mean()
            lst.append([symbol, avg_eps])
        
    symsort_df = pd.DataFrame(lst, columns=cols)
    #symsort_df = symsort_df.dropna()
    symsort_df = symsort_df.sort_values('avg_eps', ascending=False)

    # apply symbols limit
    symsort_df = symsort_df[:limit]

    # add the benchmark symbols
    rows_list = []
    for bsym in benchmarks:
        bdict= {"symbol":bsym, "avg_eps":999}
        rows_list.append(bdict)

    bdf = pd.DataFrame(rows_list)
    symsort_df = pd.concat([bdf, symsort_df])

    # write sorted symbols
    logging.info(f"Writing {len(symsort_df)} sorted symbols to " + 
                 sorted_symbols_output_file)

    symsort_df.to_csv(sorted_symbols_output_file, index=False)
    return symsort_df['symbol']
Example #18
def analyze(hold_days, budget, yearspan, min_trades):

    file_postfix = str(yearspan) + "years_" + str(hold_days) + "days_" + str(
        budget) + "dollars.csv"

    buy_sell_input_file = BUY_SELL_RESULTS_PREFIX + file_postfix

    analysis_output_file = ANALYSIS_FILE_PREFIX + file_postfix

    # clean up the existing output file (ignore !exists error)
    try:
        os.remove(analysis_output_file)
    except OSError:
        pass

    # load buy-sell results and group by symbol
    try:
        logging.info(f"Reading {buy_sell_input_file}")
        bsr_df = pd.read_table(buy_sell_input_file, sep=",")
        #bsr_df['buy_date'] = pd.to_datetime(bsr_df['buy_date'])
        #bsr_df['sell_date'] = pd.to_datetime(bsr_df['sell_date'])
        bsr_df = bsr_df.groupby('symbol')
        logging.info("Found " + str(len(bsr_df)) +
                     " symbols in buy-sell data.")

    except Exception as e:
        logging.warning("Not parsed: " + buy_sell_input_file + "\n" + str(e))
        sys.exit()

    results_lst = []  # output rows
    qmax = 100000  # output queue max
    write_header = True  # results file header (write once flag)
    symnum = 0
    numsyms = len(bsr_df)

    for symbol, sym_df in bsr_df:

        try:

            num_trades = len(sym_df)
            blk_trades_df = sym_df[sym_df['gain_total'] > 0.0]
            red_trades_df = sym_df[sym_df['gain_total'] < 0.0]
            num_black = len(blk_trades_df)
            pct_black = float(num_black) / float(num_trades)
            num_red = len(red_trades_df)

            avg_return = sym_df["gain_total"].mean()
            avg_gain = blk_trades_df["gain_total"].mean()
            avg_loss = red_trades_df["gain_total"].mean()

            max_gain = blk_trades_df["gain_total"].max()
            mg_idx = blk_trades_df['gain_total'].idxmax()
            mg_buy_date = blk_trades_df.loc[mg_idx, 'buy_date']
            mg_buy_price = blk_trades_df.loc[mg_idx, 'buy_price']
            mg_sell_date = blk_trades_df.loc[mg_idx, 'sell_date']
            mg_sell_price = blk_trades_df.loc[mg_idx, 'sell_price']

            max_loss = red_trades_df["gain_total"].min()
            ml_idx = red_trades_df['gain_total'].idxmin()
            ml_buy_date = red_trades_df.loc[ml_idx, 'buy_date']
            ml_buy_price = red_trades_df.loc[ml_idx, 'buy_price']
            ml_sell_date = red_trades_df.loc[ml_idx, 'sell_date']
            ml_sell_price = red_trades_df.loc[ml_idx, 'sell_price']

            row = [
                symbol, num_trades, pct_black, num_black, num_red, avg_return,
                avg_gain, avg_loss, max_gain, mg_buy_date, mg_buy_price,
                mg_sell_date, mg_sell_price, max_loss, ml_buy_date,
                ml_buy_price, ml_sell_date, ml_sell_price
            ]

            # drop low numbers of trades
            if num_trades >= min_trades:
                results_lst.append(row)
            else:
                logging.info(
                    f"dropped symbol {symbol} for low trade occurrences.")

            # periodically write the results list
            if len(results_lst) >= qmax:
                logging.info(
                    f"Writing {qmax} results to {analysis_output_file}")
                append_analysis_csv(analysis_output_file, results_lst,
                                    write_header)
                write_header = False
                results_lst = []

            symnum += 1  # keep track of how many symbols have been processed
            logging.info(
                f"{symbol} \t\t[{symnum} of {numsyms}] \tpct_black: " +
                f"{pct_black:.1f} avg_return: {avg_return:.2f} ")

        except Exception as e:
            logging.error("Exception in analyze " + str(e))

    # final csv update
    if len(results_lst) > 0:
        logging.info(
            f"Writing {len(results_lst)} results to {analysis_output_file}")
        append_analysis_csv(analysis_output_file, results_lst, write_header)
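The max-gain and max-loss rows above are located with idxmax/idxmin and then read back with loc; a minimal sketch of that pattern (column names follow the function, values are illustrative):

import pandas as pd

trades = pd.DataFrame({
    "gain_total": [5.0, -2.0, 12.0],
    "buy_date":   ["2018-01-02", "2018-02-05", "2018-03-01"],
    "sell_date":  ["2018-01-20", "2018-02-19", "2018-03-16"],
})

mg_idx = trades["gain_total"].idxmax()   # index label of the best trade
print(trades.loc[mg_idx, "buy_date"])    # 2018-03-01
print(trades.loc[mg_idx, "sell_date"])   # 2018-03-16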
Example #19
def run_clean_prices(cfg):
    logging.info(f"Cleaning with window = {cfg['rolling_sample_window']}")
    clean_prices(cfg)
Example #20
def make_blacklist(budget, holds, yearspan):

    file_list = []
    holds_lst = holds.split(",")
    for hold in holds_lst:
        hold = hold.strip()
        file_list.append(ANALYSIS_FILE_PREFIX + str(yearspan) + "years_" +
                         str(hold) + "days_" + str(int(budget)) +
                         "dollars.csv")

    pct_cutoff = 0.51

    df_list = []  # keep each df

    logging.info(f"Loading {file_list[0]}")
    df = pd.read_table(file_list[0], sep=",")
    logging.info(f"file0 df shape: {df.shape}")

    df = df[df.pct_black > pct_cutoff]
    logging.info(f"file0 df > pct cutoff shape: {df.shape}")

    df_list.append(df)

    keep_symbols = set(df['symbol'].tolist())
    drop_symbols = set()

    for file in file_list[1:]:
        logging.info(f"Checking symbols in {file}")
        df = pd.read_table(file, sep=",")
        df = df[df['pct_black'] > pct_cutoff]
        check_symbols = df['symbol'].tolist()
        for k in keep_symbols:
            if k not in check_symbols:
                drop_symbols.add(k)

        df_list.append(df)

    print(f"{len(drop_symbols)} symbols getting dropped.")
    #logging.info(str(drop_symbols))

    for d in drop_symbols:
        keep_symbols.remove(d)

    print(f"{len(keep_symbols)} symbols kept.")
    print(f"{len(df_list)} dfs made")

    # build a df with the pct_black results across all holds
    cols = [
        'symbol', 'd4', 'd9', 'd14', 'd19', 'd30', 'd60', 'd90', 'avg_return'
    ]
    rows_list = []
    i = 0
    for symbol in keep_symbols:
        rtn_sum = 0  # to make average return
        row = [symbol]
        for df in df_list:

            rtn_sum += df.loc[df.symbol == symbol, 'avg_return'].values[0]

            pct_blk = df.loc[df.symbol == symbol, 'pct_black'].values[0]
            row.append(pct_blk)

        avg_return = rtn_sum / float(len(file_list))
        row.append(avg_return)
        rows_list.append(row)
        i += 1
        if ((i % 100) == 0):
            logging.info(f"processing symbol {i} of {len(keep_symbols)}")

    # blacklist df
    logging.info(f"Building blacklist df with {len(rows_list)} rows")
    bl_df = pd.DataFrame(rows_list, columns=cols)

    # add column for avg pct_blk
    bl_df['avg_pct_blk'] = bl_df.iloc[:, 1:8].mean(axis=1)
    bl_df = bl_df.sort_values('avg_pct_blk', ascending=False)

    logging.info(f"bl_df shape {bl_df.shape}")
    print(bl_df.head())

    bl_file = BLACKLIST_FILE_PREFIX + str(yearspan) + "years_" + str(
        int(budget)) + "dollars.csv"
    bl_df.to_csv(bl_file, index=False, float_format='%.3f')
    print(f"Wrote {bl_file}")
Example #21
def save_config(config):

    ini_filename = "stox.ini"
    with open(ini_filename, 'w') as configfile:
        config.write(configfile)
        logging.info("Saved " + ini_filename)
Example #22
def test_cleaner(cfg):

    register_matplotlib_converters()

    param_list = cfg['cleaner_test_params'].split(" ")
    if len(param_list) < 3:
        logging.warning("Plot params malformed. Skipping plot.")
        return
    else:
        logging.info(f"Using symbol {param_list[0].strip()}")
        logging.info(f"  from {param_list[1]}")
        logging.info(f"  to {param_list[2]}")

    # param string [symbol start-date end-date]
    #   e.g. IBM 2009-01-01 2019-01-01
    symbol = param_list[0].strip()

    start_list = param_list[1].split('-')
    start_yr = int(start_list[0])
    start_mo = int(start_list[1])
    start_d = int(start_list[2])

    end_list = param_list[2].split('-')
    end_yr = int(end_list[0])
    end_mo = int(end_list[1])
    end_d = int(end_list[2])

    date_start = pd.Timestamp(start_yr, start_mo, start_d)
    date_end = pd.Timestamp(end_yr, end_mo, end_d)

    # use group file if exists else use raw file
    raw_symbol_file = STOX_DATA_DIR + symbol + "_raw.csv"

    if os.path.exists(raw_symbol_file):
        prices_input_file = raw_symbol_file
    else:
        prices_input_file = RAW_PRICES_INPUT_FILE

    try:
        logging.info("Reading " + prices_input_file)
        prices_df = pd.read_table(prices_input_file, sep=',')
        prices_df['date'] = pd.to_datetime(prices_df['date'])
        logging.info("Prices df shape " + str(prices_df.shape))

    except Exception as e:
        logging.critical("Not parsed: " + prices_input_file + "\n" + str(e))
        sys.exit()

    # get group for this symbol
    logging.info("Filtering on symbol")
    df = prices_df.groupby('symbol').get_group(symbol)

    # write the raw file for this symbol (all time frames)
    if not os.path.exists(raw_symbol_file):
        logging.info(f"Writing raw file for {symbol}")
        df.to_csv(raw_symbol_file, index=False, sep=",", float_format='%.3f')

    # filter on date range
    logging.info("Filtering on date range")
    df = df[(df['date'] >= date_start) & (df['date'] <= date_end)]
    df = df.sort_values(['date'])

    # write raw df to file
    span_str = (date_start.strftime("%Y-%m-%d") + "_" +
                date_end.strftime("%Y-%m-%d"))
    csv_name = STOX_DATA_DIR + symbol + "_" + span_str + "_raw.csv"
    df.to_csv(csv_name, index=False, sep="\t", float_format='%.3f')

    # test cleaner
    cdf = cleaner(df)

    # write cdf to file
    span_str = (date_start.strftime("%Y-%m-%d") + "_" +
                date_end.strftime("%Y-%m-%d"))
    csv_name = STOX_DATA_DIR + symbol + "_" + span_str + "_cleantest.csv"
    cdf.to_csv(csv_name, index=False, sep="\t", float_format='%.3f')

    # PLOT
    fig, axs = plt.subplots(nrows=2, sharex=True)
    plt.suptitle(symbol, fontsize=10)

    axs[0].set_title('Raw', {'fontsize': 10})
    axs[0].scatter(df['date'].tolist(), df['close'], color='blue', s=2)

    axs[1].set_title('Cleaned', {'fontsize': 10})
    axs[1].scatter(cdf['date'].tolist(), cdf['close'], color='green', s=2)

    plt_filename = STOX_DATA_DIR + symbol + "_" + span_str + ".png"
    plt.savefig(plt_filename)
    plt.show()
Example #23
def run_price_plot(cfg):
    logging.info("Running price plot...")
    plot_price(cfg)
Example #24
def plot_price(cfg):

    param_list = cfg['plot_params'].split(" ")
    if len(param_list) < 3:
        logging.warning("Plot params malformed. Skipping plot.")
        return
    else:
        logging.info(f"Plotting symbol {param_list[0].strip()}")
        logging.info(f"  from {param_list[1]}")
        logging.info(f"  to {param_list[2]}")

    register_matplotlib_converters()
    
    prices_input_file = CLEANED_PRICES_FILE
    #prices_input_file = cfg['raw_data_dir'] + cfg['raw_prices_input_file']
    try:
        logging.info("Reading " + prices_input_file)
        prices_df = pd.read_table(prices_input_file, sep=',')
        prices_df['date'] = pd.to_datetime(prices_df['date'])
        logging.info("Prices df shape " + str(prices_df.shape))
        
    except Exception as e: 
        logging.critical("Not parsed: " + prices_input_file + "\n" + str(e))
        sys.exit()   

    # param string [symbol start-date end-date] 
    #   e.g. IBM 2009-01-01 2019-01-01
    symbol = param_list[0].strip()

    start_list = param_list[1].split('-')
    start_yr = int(start_list[0])
    start_mo = int(start_list[1])
    start_d = int(start_list[2])

    end_list = param_list[2].split('-')
    end_yr = int(end_list[0])
    end_mo = int(end_list[1])
    end_d = int(end_list[2])

    date_start = pd.Timestamp(start_yr, start_mo, start_d)
    date_end = pd.Timestamp(end_yr, end_mo, end_d)
   

    # filter on date range
    logging.info("Filtering on date range")
    df = prices_df[(prices_df['date'] >= date_start) & (prices_df['date'] <= date_end)]
    df = df.sort_values(['date'])

    # get group for this symbol
    logging.info("Filtering on symbol")
    df = df.groupby('symbol').get_group(symbol)

    # write df to file
    span_str = (date_start.strftime("%Y-%m-%d") + "_" +
        date_end.strftime("%Y-%m-%d"))
    csv_name = STOX_DATA_DIR + symbol + "_" + span_str + ".csv"
    df.to_csv(csv_name, index=False, sep="\t", float_format='%.3f')

    # plot open/close price
    fig = plt.figure()
    plt.suptitle(symbol, fontsize=10)
    plt.scatter(df['date'].tolist(), df['open'], color='green', s=2)
    plt.scatter(df['date'].tolist(), df['close'], color = 'blue', s=2)

    plt_filename = STOX_DATA_DIR + symbol + "_" + span_str + ".png"
    plt.savefig(plt_filename)
    plt.show()
Example #25
def rm_stoxdir(cfg):
    stox_dir = STOX_DATA_DIR
    for p in Path(stox_dir).glob("*.*"):
        p.unlink()
    logging.info("Removed stox data.")
Example #26
def buy_sell_v3(budget_dollars, fee_dollars, hold_days, low_price_cutoff,
                yearspan):

    prices_input_file = FILTERED_PRICES_PREFIX + str(yearspan) + "years.csv"

    hold_str = str(hold_days)
    budget_dollars_str = str(int(budget_dollars))
    buy_sell_output_postfix = str(
        yearspan
    ) + "years_" + hold_str + "days_" + budget_dollars_str + "dollars.csv"
    buy_sell_output_file = BUY_SELL_RESULTS_PREFIX + buy_sell_output_postfix

    # clean up the existing output file (ignore !exists error)
    try:
        os.remove(buy_sell_output_file)
    except OSError:
        pass

    # load prices and group by symbol
    try:
        logging.info("Reading: " + prices_input_file)
        stox_df = pd.read_table(prices_input_file, sep=',')
        stox_df['date'] = pd.to_datetime(stox_df['date'])
        stox_df = stox_df.groupby('symbol')
        logging.info("Found " + str(len(stox_df)) + " symbols in price data.")

    except Exception as e:
        logging.warning("Not parsed: " + prices_input_file + "\n" + str(e))
        sys.exit()

    numsyms = len(stox_df)  # total number of symbols
    cant_afford = set()  # set of symbols whose unit share price exceeds budget
    penny_stocks = set()  # set of low price symbols

    results_lst = []  # completed transactions
    write_header = True  # results file header (write once flag)

    # Loop over each symbol
    symnum = 1  # symbol idx

    for symbol, sym_df in stox_df:

        max_gain = max_loss = 0.0
        pending_lst = []  # has buy attributes for each buy date

        # TODO make sure the sym_df is in date order ascending

        row_idx = 0
        for row in sym_df.itertuples():

            buy_price, shares_bought, status = buy(row, budget_dollars)
            split_coeff = row.split_coefficient

            if status == 'price_high':
                cant_afford.add(symbol)  # todo: count these
            elif status == 'price_low':
                penny_stocks.add(symbol)  # todo: count these

            # add current row to pending sale list
            pending_lst.append([
                symbol, row_idx, row.date, shares_bought, buy_price,
                split_coeff
            ])

            # Once the pending sales list is full, start creating results list
            if row_idx >= hold_days:

                result_row = sell_row(row, pending_lst, fee_dollars)

                returns = float(result_row[11])

                # just for logging
                if returns > max_gain: max_gain = returns
                if returns < max_loss: max_loss = returns

                # add the result row to results list
                results_lst.append(result_row)

                # remove the sold row
                del pending_lst[0]

            row_idx += 1

        # periodically write the results list
        qmax = 100000
        if len(results_lst) >= qmax:
            logging.info(f"Writing {qmax} results to {buy_sell_output_file}")
            append_csv(buy_sell_output_file, results_lst, write_header,
                       low_price_cutoff)
            write_header = False
            results_lst = []

        symnum += 1  # keep track of how many symbols have been processed
        logging.info(f"{symbol} \t\t[{symnum} of {numsyms}] \ttrade days: " +
                     f"{str(len(sym_df))} max_gain: {max_gain:.2f} " +
                     f"max_loss: {max_loss:.2f}")

    # final csv update
    if len(results_lst) > 0:
        logging.info(
            f"Writing {len(results_lst)} results to {buy_sell_output_file}")
        append_csv(buy_sell_output_file, results_lst, write_header,
                   low_price_cutoff)

    logging.info("Zero shares bought (price exceeds budget): " +
                 str(cant_afford))
    logging.info("Zero shares bought (price too low): " + str(penny_stocks))
Example #27
def run_cleaner_test(cfg):
    logging.info("Running cleaner test...")
    test_cleaner(cfg)