To make comparison fair we take only four patterns thus ((0, (3,)), (1, (3,))) ((0, (4,)), (1, (4,))) ((0, (3,)), (1, (4,))) ((0, (2,)), (1, (2,))) These patterns all have an immediate and simple trend They move almost identically (with one slight exception) We now visualize the results like we always do ''' #%% st, d = import_dat('./output/UNHTRV/UNH_TRV_5y_close.dat') d_og = d ct = st.copy() #this gives 48% coverage of line 0 and 1 ct = add(((0, (3,)), (1, (3,))),ct,d) ct = add(((0, (4,)), (1, (4,))),ct,d) ct = add(((0, (3,)), (1, (4,))),ct,d) ct = add(((0, (2,)), (1, (2,))),ct,d) patterns = {} for key, value in ct.items(): if value[1]>1:
import numpy as np
import pandas as pd
from bokeh.io import curdoc, show
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, Grid, HoverTool, LinearAxis, Range1d
from bokeh.models.glyphs import MultiLine
from bokeh.plotting import figure, output_file, show
from add_rem import add
from cover import cov_order
from import_data import import_dat
from output_gen import load_dictionary, painter
from pattern_finder import finder

#%%
# Read the UNH/MCD dataset. `import_dat` hands back two objects: `st`, which
# is copied below as the starting code table, and `d`, which is passed to
# every `add` call (presumably the dataset itself -- confirm in import_data).
st, d = import_dat('./output/UNHMCD/UNH_MCD_5y_close.dat')
d_og = d  # second reference to the loaded data (same object, not a copy)
ct = st.copy()

# Patterns labelled "index patterns" by the original author.
index_patterns = (
    ((0, (4,)), (1, (4,))),
    ((0, (4,)), (1, (3,))),
    ((0, (2,)), (1, (4,))),
    ((0, (5, 4)), (1, (4,))),
    ((0, (4,)), (1, (5, 3))),
)

# Patterns labelled "stock patterns" by the original author.
stock_patterns = (
    ((0, (3,)), (1, (3,))),
    ((0, (3,)), (1, (4,))),
)

# Feed the patterns to `add` one at a time, index patterns first, threading
# the returned code table through each call.
for pattern in index_patterns + stock_patterns:
    ct = add(pattern, ct, d)
from add_rem import add from cover import cov_order from import_data import import_dat from output_gen import load_dictionary, painter #%% d = load_dictionary('./output/dowj_5y_close.dict') ct = {k: v for k, v in d.items() if v[1] > 1} ct_df = pd.DataFrame.from_dict(ct, orient='index', columns=['Support', 'Length', 'Time']) #%% st, d = import_dat('./output/dowj_5y_close.dat') #%% d_og = d ct = st.copy() #this gives 26% coverage of line 1 and 2` ct = add(((1, (4, )), (2, (4, ))), ct, d) ct = add(((1, (4, )), (2, (3, ))), ct, d) ct = add(((1, (2, )), (2, (4, ))), ct, d) ct = add(((1, (5, 4)), (2, (4, ))), ct, d) ct = add(((1, (4, )), (2, (5, 3))), ct, d) patterns = {} for key, value in ct.items():
def create_codetables(data,codetable):
    """Build the original and the covered codetable as two DataFrames.

    The support stored in a codetable file counts every occurrence of a
    pattern independently, including occurrences that overlap with other
    patterns. This function re-covers the dataset pattern by pattern (in
    the order given by ``cov_order``) so the resulting support reflects
    what each pattern actually covers.

    Arguments:
        data -- value passed straight to ``import_dat`` to load the dataset
        codetable -- value passed straight to ``load_dictionary``; the loaded
            mapping is indexed as ``(support, total_length, timespan)``

    Returns:
        tuple -- ``(original_ct, covered_ct)``; both are DataFrames with the
        columns ``support``, ``length`` and ``time``, sorted ascending on
        ``support`` and then reversed.
    """
    st, d = import_dat(data)
    codetable = load_dictionary(codetable)

    # Only patterns whose length field exceeds 1 (non-singletons) take part
    # in the ordered cover below.
    multi_patterns = {
        pattern: info for pattern, info in codetable.items() if info[1] > 1
    }

    covered = Counter()
    for pattern in cov_order(multi_patterns):
        # p2 returns the updated dataset and how often the pattern was used
        # (per the original comment it "paints" covered cells with 0).
        d, usage = p2(pattern, d)
        covered[pattern] = usage

    # Whatever is still > 0 after covering is counted as a singleton, keyed
    # by (row index, (value,)).
    for row_idx, row in enumerate(d):
        for element in row:
            if element > 0:
                covered[((row_idx, (element,)),)] += 1

    # Turn every count into (support, total_length, timespan); codetable
    # entries that never showed up in the cover get a support of 0.
    for pattern, info in codetable.items():
        support = covered[pattern] if pattern in covered else 0
        covered[pattern] = (support, info[1], info[2])

    # `covered` is now the covered codetable and `codetable` the original.
    # This can be checked by comparing codetable[((0,(2,)),)] with
    # covered[((0,(2,)),)].
    column_names = ['support', 'length', 'time']
    original_ct = pd.DataFrame.from_dict(
        codetable, orient='index', columns=column_names)
    original_ct = original_ct.sort_values('support')[::-1]
    covered_ct = pd.DataFrame.from_dict(
        covered, orient='index', columns=column_names)
    covered_ct = covered_ct.sort_values('support')[::-1]
    return original_ct, covered_ct
# Load the stored code table and keep only entries whose length field is
# larger than 1 (i.e. drop the singletons).
ct = load_dictionary('./output/workbook9/PEP_KO_5y_close.dict')
ct = {k: v for k, v in ct.items() if v[1] > 1}

# One row per pattern: the pattern itself becomes the index, and the three
# fields of its value tuple become the Support / Length / Time columns.
ct_df = pd.DataFrame({
    'index': list(ct.keys()),
    'Support': [entry[0] for entry in ct.values()],
    'Length': [entry[1] for entry in ct.values()],
    'Time': [entry[2] for entry in ct.values()],
}).set_index('index')

# Highest support first (ascending sort, then reversed).
ct_df = ct_df.sort_values('Support')[::-1]

st, d = import_dat('./output/workbook9/PEP_KO_5y_close.dat')
d_og = d  # second reference to the loaded data (same object, not a copy)
ct = st.copy()

# Author's note: this gives 33% coverage of line 0 and 1.
# Add the patterns ranked 0, 2, 4 and 5 by support to the code table.
ranked_patterns = list(ct_df.index)
for rank in (0, 2, 4, 5):
    ct = add(ranked_patterns[rank], ct, d)

# Collect the non-singleton entries of the resulting code table.
patterns = {key: value for key, value in ct.items() if value[1] > 1}
def make_plot():
    """Plot the 2016 bitcoin series coloured by the pattern covering each point.

    Steps, all driven by fixed file names in the working directory:

    1. Load ``./bitcoin2016.dict``, drop singletons and take the ten
       patterns with the highest support.
    2. Add those patterns to a copy of the code table returned by
       ``import_dat('./bitcoin2016.dat')``.
    3. Paint the dataset with a distinct marker per pattern (``painter``)
       and translate the markers back into patterns / colours.
    4. Combine this with ``./bitcoin2016.xlsx`` and draw two linked Bokeh
       figures (prices on top, volume below) into ``bitcoindaily.html``.

    Returns:
        None. The plot is written to ``bitcoindaily.html`` and opened via
        ``show``.

    NOTE(review): ``plot_width``/``plot_height`` and ``HoverTool(names=...)``
    look like the Bokeh 2.x API -- confirm the pinned Bokeh version before
    upgrading.
    """
    columns = ('<OPEN>', '<HIGH>', '<LOW>', '<CLOSE>', '<VOL>')

    # Load the dictionary to see the patterns and remove the singletons
    btc_dict = load_dictionary('./bitcoin2016.dict')
    btc_dict = {k: v for k, v in btc_dict.items() if v[1] > 1}

    # Turn it into a dataframe, highest support first
    df = pd.DataFrame.from_dict(btc_dict,
                                orient='index',
                                columns=['Support', 'Length', 'Time'])
    df = df.sort_values(by='Support')[::-1]

    st, d = import_dat('./bitcoin2016.dat')
    ct = st.copy()
    for key in list(df.index[:10]):
        ct = add(key, ct, d)

    # Non-singleton entries of the resulting code table
    patterns = {key: value for key, value in ct.items() if value[1] > 1}

    # Paint the dataset: every pattern gets its own negative marker
    # (-100, -200, -400, ...) so it can be mapped back afterwards.
    val_d = {}
    sign = -100
    for pattern in cov_order(patterns):
        d = painter(pattern, d, sign)
        val_d[sign] = pattern
        sign *= 2
    # Cells that do not hold a marker are labelled 'None'
    d2 = [[val_d.get(cell, 'None') for cell in row] for row in d]

    df2 = pd.read_excel('./bitcoin2016.xlsx')
    df2['Date'] = pd.to_datetime(df2['<DATE>'])
    df2['ToolTipDates'] = df2.Date.map(lambda x: x.strftime("%d %b %y"))

    colors = [
        '#e6194B', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#42d4f4',
        '#f032e6', '#e6beff', '#9A6324', '#800000', '#000075'
    ]
    P_TO_COLOR = {x: colors[i] for i, x in enumerate(patterns)}
    P_TO_COLOR['None'] = '#f1f1f1'

    # Row i of the painted dataset belongs to columns[i]
    for i, col in enumerate(columns):
        df2[f'pattern{col}'] = [str(cell) for cell in d2[i]]
        df2[f'color{col}'] = [P_TO_COLOR[cell] for cell in d2[i]]

    # For every column we generate the line we need; xyc's result is
    # indexed as (xs, ys, colors) below.
    lines = {
        col: xyc(df2, 'Date', f'og{col}', f'color{col}')
        for col in columns
    }

    # Tooltip labels: labOPEN, labHIGH, ... mirror the pattern columns
    for col in columns:
        df2[f'lab{col[1:-1]}'] = df2[f'pattern{col}']

    source2 = ColumnDataSource(df2)
    output_file('bitcoindaily.html')

    # --- price figure -----------------------------------------------------
    p = figure(x_axis_type='datetime',
               plot_width=1440,
               plot_height=600,
               title="Bitcoin Stock Price")
    # Invisible circles give the hover tools something to snap to.
    # (The original added the 'low' glyph a second time further down; that
    # duplicate is dropped here.)
    for name in ('open', 'close', 'high', 'low'):
        p.circle(x='Date',
                 y=f'og<{name.upper()}>',
                 name=name,
                 alpha=0,
                 source=source2,
                 size=3)
    for line_name, col in (('q', '<OPEN>'), ('e', '<HIGH>'),
                           ('ee', '<LOW>'), ('w', '<CLOSE>')):
        segments = lines[col]
        p.multi_line(name=line_name,
                     xs=segments[0],
                     ys=segments[1],
                     color=segments[2],
                     line_width=3)

    # --- volume figure (shares the x range with the price figure) ----------
    q = figure(x_range=p.x_range,
               x_axis_type='datetime',
               plot_width=1440,
               plot_height=200,
               title="Stock Volume",
               y_axis_type='linear')
    q.circle(x='Date',
             y='og<VOL>',
             name='VOL',
             alpha=0,
             source=source2,
             size=3)
    vol_segments = lines['<VOL>']
    q.multi_line(name='qw',
                 xs=vol_segments[0],
                 ys=vol_segments[1],
                 color=vol_segments[2],
                 line_width=3)

    # --- hover tools --------------------------------------------------------
    hover_opts = dict(mode="vline",
                      line_policy='nearest',
                      point_policy='snap_to_data')
    # Full detail tooltip, attached to the 'low' glyph
    p.add_tools(
        HoverTool(names=['low'],
                  tooltips=[
                      ('Date : ', '@ToolTipDates'),
                      ('Low Price : ', '@{og<LOW>}{0.2f}'),
                      ('Low Pattern : ', '@labLOW'),
                      ('High Price : ', '@{og<HIGH>}{0.2f}'),
                      ('High Pattern : ', '@labHIGH'),
                      ('Open Price : ', '@{og<OPEN>}{0.2f}'),
                      ('Open Pattern : ', '@labOPEN'),
                      ('Close Price : ', '@{og<CLOSE>}{0.2f}'),
                      ('Close Pattern : ', '@labCLOSE'),
                  ],
                  **hover_opts))
    # One small "which series is this" tooltip per price series
    for name in ('open', 'close', 'high', 'low'):
        p.add_tools(
            HoverTool(names=[name],
                      tooltips=[
                          ('Name', name.capitalize()),
                      ],
                      **hover_opts))
    # The labels used to read "High Price"/"High Pattern" although the
    # values shown are the volume and its pattern.
    q.add_tools(
        HoverTool(names=['VOL'],
                  tooltips=[
                      ('Date : ', '@ToolTipDates'),
                      ('Volume : ', '@{og<VOL>}{0.2f}'),
                      ('Volume Pattern : ', '@labVOL'),
                  ],
                  **hover_opts))

    show(column(p, q))
def _write_codetable(ct, path):
    """Write every code table entry to *path*, one 'key <tab> value' per line."""
    with open(path, 'w') as o:
        for key, value in ct.items():
            print(f'{key} \t {value}', file=o)


def pyccoli(filename, cpu=0, *, output_generation=False):
    """Grow and prune a code table until its MDL score stops improving.

    Starting from the table returned by ``import_dat``, each generation
    runs ``ditto_plus`` followed by ``ditto_min`` and compares the new
    ``mdl_calc`` score with the best one so far.

    Arguments:
        filename -- passed to ``import_dat``; also embedded verbatim in the
            names of the output files
        cpu -- amount of cores to be used, 0 uses all (forwarded to
            ``ditto_plus`` / ``ditto_min``)
        output_generation -- keyword-only, default ``False``. When false
            the final code table is written once, to ``./ct_{filename}.txt``
            and pickled to ``./{filename}.dict``. When true, every improving
            step is dumped to ``./plus_{filename}_{gen}.txt`` /
            ``./min_{filename}_{gen}.txt`` instead, and no pickle is written
            (this was previously a hard-coded local that was always False).

    Returns:
        None; results are only written to disk.

    NOTE(review): ``filename`` is interpolated into the output paths as-is,
    so a value containing directories yields paths such as
    ``./ct_./dir/x.dat.txt`` -- confirm callers pass a bare file name.
    NOTE(review): ``mdl_calc`` is now evaluated once per step instead of
    twice with identical arguments; this assumes it has no side effects.
    """
    st, d = import_dat(filename)
    cand = product(st, st)
    ct = st.copy()
    mdl = mdl_calc(ct, d, st)
    print(f'Original MDL:\t\t\t{mdl}')

    gen = 0
    while True:
        gen += 1
        ct, used = ditto_plus(cand, st, ct, d, mdl, cpu)

        if output_generation:
            # Check (and dump) after the growing step ...
            new_mdl = mdl_calc(ct, d, st)
            if not new_mdl < mdl:
                print('Finished')
                break
            mdl = new_mdl
            _write_codetable(ct, f'./plus_{filename}_{gen}.txt')

            # ... and again after the pruning step.
            ct = ditto_min(st, ct, d, cpu)
            new_mdl = mdl_calc(ct, d, st)
            if not new_mdl < mdl:
                print('Finished')
                break
            mdl = new_mdl
            _write_codetable(ct, f'./min_{filename}_{gen}.txt')
        else:
            # Only the score after pruning decides whether to continue.
            ct = ditto_min(st, ct, d, cpu)
            new_mdl = mdl_calc(ct, d, st)
            if not new_mdl < mdl:
                _write_codetable(ct, f'./ct_{filename}.txt')
                with open(f'./{filename}.dict', 'wb') as out_pickle:
                    pickle.dump(ct,
                                out_pickle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                print('Finished')
                break
            mdl = new_mdl

        # Next generation's candidates come from the current table and the
        # patterns ditto_plus reported as used.
        cand = product(ct, used)