def main(args: argparse.Namespace):
    label, *_ = data.read_csv(args.label)
    pred, *_ = data.read_csv(args.prediction)

    # Look up the error criterion by name (e.g. RMSE)
    critic = criterion.get(args.method)()
    result = critic(label[:, 2], pred[:, 2])
    print(f'{args.method}: {result}')
def main(args: argparse.Namespace):
    result = Path(args.result)
    result.mkdir(exist_ok=True, parents=True)

    # Read whole csv file
    content, header = data.read_csv(args.dataset)

    # Split query
    for query in args.split:
        try:
            # Each query must include {name, begin condition, end condition}
            name, begin, end = query.split(',')
            begin, end = map(int, (begin, end))
            cropped = content[(begin <= content[:, -1])
                              & (content[:, -1] <= end)]
            data.to_csv(str(result.joinpath(f'{name}.csv')),
                        cropped,
                        header=header)
        except ValueError as e:
            print(f'Error: {e}')
            print('format mismatch, each query must be '
                  '[name,begin_condition,end_condition]')
            print(f'But got `{query}`')
            continue
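# Hypothetical invocation of the splitter above (the flag names
# --dataset/--result/--split are inferred from the argparse attributes the
# function reads; the begin/end values are whatever integer condition the
# last csv column holds):
#
#   python scripts/split.py --dataset ratings.csv --result out \
#       --split train,0,80 test,81,100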
def main(args: argparse.Namespace):
    # Reproducibility matters: an experiment that cannot be reproduced
    # cannot support any conclusion, so fix the random seed before
    # anything else.
    seed(args.seed)

    # Load the dataset. Two loading paths are provided:
    # - load the whole csv and split train/test by condition (slow)
    # - load separate train and test csv files (faster)
    #   (use scripts/split.py to split train/test by condition)
    if args.dataset:
        dataset = data.Dataset(args.dataset)
        train, test = dataset.split_train_test(args.mode)
        test_header = dataset.rating_headers
    else:
        train, train_header = data.read_csv(args.train)
        test, test_header = data.read_csv(args.test)

    # Look up the error criterion (e.g. RMSE)
    critic = criterion.get(args.criterion)()

    # Fit the model on the training data
    model = Recommender(factors=args.factor,
                        epochs=args.epoch,
                        mean=args.mean,
                        derivation=args.dev,
                        lr=args.lr,
                        reg=args.reg)
    model.fit(train[:, :2], train[:, 2])

    # Predict on the test data and compute the error
    predictions = model.predict(test[:, :2])
    error = critic(predictions, test[:, 2])
    print(f'RMSE: {error}')

    # Save predictions
    test[:, 2] = predictions
    data.to_csv(args.result, test, header=test_header)
def main(args: argparse.Namespace):
    # Share the loaded data with the pool workers via module globals
    global train, test, test_header

    # Reproducibility matters: an experiment that cannot be reproduced
    # cannot support any conclusion, so fix the random seed before
    # anything else.
    seed(args.seed)

    # Load the dataset. Two loading paths are provided:
    # - load the whole csv and split train/test by condition (slow)
    # - load separate train and test csv files (faster)
    #   (use scripts/split.py to split train/test by condition)
    if args.dataset:
        dataset = data.Dataset(args.dataset)
        train, test = dataset.split_train_test(args.mode)
        test_header = dataset.rating_headers
    else:
        train, train_header = data.read_csv(args.train)
        test, test_header = data.read_csv(args.test)

    # Enumerate the search space; optionally subsample it for random search
    params = list(product(*param_space.values()))
    if args.size:
        indexes = np.random.choice(len(params), args.size, replace=False)
        params = [params[i] for i in indexes]
    print(f'Search space: {len(params)}')
    print(f'Param: {param_space}')

    with Pool(args.cpu or cpu_count()) as pool:
        results = pool.map(wrapper, params)

    best, *_ = sorted(results, key=lambda x: x[1])
    print(f'Best RMSE: {best[1]}')
    print(f'param: {best[0]}')
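# The search loop above assumes a module-level `param_space` dict and a
# `wrapper` function, both defined elsewhere in the script. A minimal,
# hypothetical sketch of what they could look like (the keys and candidate
# values here are illustrative, not the script's actual search space):
param_space = {
    'factor': [8, 16, 32],
    'epoch': [10, 20],
    'lr': [0.002, 0.005],
    'reg': [0.02, 0.05],
}

def wrapper(param):
    # Train one model for a single parameter combination and return
    # (param, rmse) so the caller can sort results by error.
    factor, epoch, lr, reg = param
    model = Recommender(factors=factor, epochs=epoch, lr=lr, reg=reg)
    model.fit(train[:, :2], train[:, 2])
    predictions = model.predict(test[:, :2])
    rmse = np.sqrt(np.mean((predictions - test[:, 2]) ** 2))
    return param, rmse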
                    help='year of data to play, default=2018')
parser.add_argument('--commission',
                    type=float,
                    default=DEFAULT_COMMISSION,
                    help='commission size, default=0.00025')
parser.add_argument('--cuda',
                    default=False,
                    action='store_true',
                    help='enable cuda')
args = parser.parse_args()
device = 'cuda' if args.cuda else 'cpu'

try:
    from lib import data
    play_data = data.read_csv(file_name='data/000001_%d.csv' % args.year)
except ModuleNotFoundError:
    play_data = (pd.read_csv('data/prices_%d.csv' % args.year, index_col=0),
                 pd.read_csv('data/factors_%d.csv' % args.year, index_col=0))

env = environ.StockEnv(play_data,
                       bars_count=BARS_COUNT,
                       commission=args.commission,
                       reset_on_sell=False,
                       random_ofs_on_reset=False)
net = models.DQNConv1d(env.observation_space.shape, env.action_space.n)

datestr = date(2019, 2, 2).strftime('%Y-%m-%d')
save_path = os.path.join('saves', datestr)
state_dict = torch.load(os.path.join(save_path, 'best_mean_val.pth'),
                        map_location=lambda storage, loc: storage)
net.load_state_dict(state_dict)
from lib import data, model, common, ML, visualize
import numpy as np
import graphviz
from sklearn.ensemble import RandomForestClassifier

PATH = "/home/chris/projects/Kaggle/heart_200719/data/heart.csv"
TYPE_LIST = [
    "num", "mc", "mc", "num", "num", "mc", "mc", "num", "mc", "num", "mc",
    "mc", "mc"
]
MAX_DATA_COL = 13

# read the df
df = data.read_csv(PATH)

# get the feature name list
feature_list = []
for i, key in enumerate(df.keys()):
    feature_list.append(key)
    if i == (MAX_DATA_COL - 1):
        break

# shuffle data
seed_arr = data.shuffle(data.df2array(df))

# split the data into training set and testing set
train_set, test_set = data.split_data(seed_arr, percentage=0.8)

randomForest = ML.RandomForest(tolerance=0.05,
                               min_element=10,
                               max_depth=100,
                               num_col_blocked_each_step=2,
def test_read_csv(self): prices = data.read_csv("WIKI-CSCO.csv") self.assertIsInstance(prices, data.Prices)
                    help='enable cuda')
parser.add_argument('--colab',
                    default=False,
                    action='store_true',
                    help='enable colab hosted runtime')
parser.add_argument('--double',
                    default=False,
                    action='store_true',
                    help='enable double DQN')
args = parser.parse_args()
device = torch.device('cuda' if args.cuda else 'cpu')

try:
    from lib import data
    train_data = data.read_csv(file_name='data/000001_2017.csv')
    val_data = data.read_csv(file_name='data/000001_2018.csv')
except ModuleNotFoundError:
    train_data = (pd.read_csv('data/prices_2017.csv', index_col=0),
                  pd.read_csv('data/factors_2017.csv', index_col=0))
    val_data = (pd.read_csv('data/prices_2018.csv', index_col=0),
                pd.read_csv('data/factors_2018.csv', index_col=0))

env = environ.StockEnv(train_data, bars_count=BARS_COUNT, reset_on_sell=True)
env = gym.wrappers.TimeLimit(env, max_episode_steps=1000)
env_test = environ.StockEnv(train_data, bars_count=BARS_COUNT,
                            reset_on_sell=True)
env_test = gym.wrappers.TimeLimit(env_test, max_episode_steps=1000)
MAIN_PATH = "../docs/3" NET_SAVE_PATH = MAIN_PATH + '/checkpoint' RUNS_SAVE_PATH = MAIN_PATH + "/runs/" + dt_string NET_FILE = "checkpoint-3100000.data" LOAD_NET = False TRAIN_ON_GPU = True BATCH_SIZE = 512 lr = 0.001 CHECKPOINT_STEP = 100000 PRINT_EVERY = 50000 SCALAR_VISUALIZE_EVERY = 1000 EMBEDDING_VISUALIZE_EVERY = 100000 MOVING_AVERAGE_STEP = 1000 # read file col_names, raw_data = data.read_csv(path=DATA_PATH) # define batch generator domain_col = raw_data[7] # x = company name codomain_col = raw_data[3] # y = issue name batch_generator = data.Batch_Generator() generator_prepare = batch_generator.prepare_generator( domain_col=domain_col, codomain_col=codomain_col) # define model fc_model = models.FC_Embed(len(generator_prepare.domain_int2vocab), len(generator_prepare.codomain_int2vocab), embedding_size=3, train_on_gpu=TRAIN_ON_GPU) if LOAD_NET: print("Loading net params...")
EPSILON_START = 1.0
EPSILON_STOP = 0.1
EPSILON_STEPS = 1000000
CHECKPOINT_EVERY_STEP = 50000
VALIDATION_EVERY_STEP = 10000
load_net = True
load_fileName = "checkpoint-950000.data"
saves_path = "C:\\Users\\user\\python_jupyter\\book_Hands_On_Reinforcement_Learning_Pytorch\\cmk_chapter8\\2_LSTM\\checkpoint"

if __name__ == "__main__":
    device = torch.device("cuda")
    stock_data = data.read_csv(
        "C:\\Users\\user\\python_jupyter\\book_Hands_On_Reinforcement_Learning_Pytorch\\cmk_chapter8\\2_LSTM\\data\\0005.HK.csv")

    # create the training and val set
    train_set, val_set = data.split_data(stock_data, percentage=0.8)
    train_set = {"train": train_set}
    val_set = {"eval": val_set}

    env = environ.StocksEnv(train_set, bars_count=BARS_COUNT,
                            reset_on_close=True, state_1d=False, volumes=True)
    env = wrappers.TimeLimit(env, max_episode_steps=1000)
    env_val = environ.StocksEnv(val_set, bars_count=BARS_COUNT,
                                reset_on_close=True, state_1d=False,
                                volumes=True)
    # env_val = wrappers.TimeLimit(env_val, max_episode_steps=1000)

    # create neural network
    net = models.SimpleLSTM(input_size=5, n_hidden=512, n_layers=2,
                            drop_prob=0.5, actions_n=3, train_on_gpu=True,
                            batch_first=True).to(device)

    # load the network
def test_read_csv(self): prices = data.read_csv("data/YNDX_160101_161231.csv") self.assertIsInstance(prices, data.Prices)
def worker(net, device, train_queue, proc_idx, save_path):
    try:
        from lib import data
        train_data = data.read_csv(file_name='data/000001_2018.csv')
    except ModuleNotFoundError:
        train_data = (pd.read_csv('data/prices_2018.csv', index_col=0),
                      pd.read_csv('data/factors_2018.csv', index_col=0))

    env = environ.StockEnv(train_data, bars_count=BARS_COUNT,
                           reset_on_sell=True)
    env = gym.wrappers.TimeLimit(env, max_episode_steps=1000)
    agt = agent.ProbabilityAgent(lambda x: net(x)[0],
                                 apply_softmax=True,
                                 device=device)
    exp_source = experience.ExperienceSource(env, agt, GAMMA,
                                             steps_count=REWARD_STEPS)

    batch = []
    total_reward = []
    total_steps = []
    reward_buf = []
    steps_buf = []
    frame_idx = 0
    frame_prev = 0
    ts = time.time()
    best_mean_reward = None
    stats = collections.defaultdict(list)

    file_name = os.path.splitext(os.path.basename(__file__))[0]
    file_name = file_name.split('_')[-1]
    proc_name = 'worker_%d' % proc_idx
    writer = SummaryWriter(os.path.join('runs', file_name, proc_name))
    logging.basicConfig(level=logging.INFO,
                        format='%(levelname)s:%(message)s',
                        handlers=[
                            logging.FileHandler(
                                os.path.join(save_path, 'console.log')),
                            logging.StreamHandler()
                        ])

    for exp in exp_source:
        frame_idx += 1
        batch.append(exp)
        if len(batch) < GRAD_BATCH:
            continue

        # Compute the A2C loss on the collected batch and backpropagate
        net.zero_grad()
        loss_val_v, loss_policy_v, loss_entropy_v = helper.a2c_loss(
            batch, net, GAMMA**REWARD_STEPS, ENTROPY_BETA, device)
        batch.clear()
        loss_v = loss_entropy_v + loss_val_v + loss_policy_v
        loss_v.backward()
        nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)

        # Ship the gradients to the parent process instead of stepping here
        grads = [
            param.grad.data.cpu().numpy() if param.grad is not None else None
            for param in net.parameters()
        ]
        train_queue.put(grads)

        stats['loss_value'].append(loss_val_v)
        stats['loss_policy'].append(loss_policy_v)
        stats['loss_entropy'].append(loss_entropy_v)
        stats['loss_total'].append(loss_v)
        for stat in stats:
            if len(stats[stat]) >= STATS_GROUPS:
                writer.add_scalar(stat,
                                  torch.mean(torch.stack(stats[stat])).item(),
                                  frame_idx)
                stats[stat].clear()

        ep_reward, ep_steps = exp_source.pop_episode_result()
        if ep_reward:
            print('Worker_%d: %d done, Episode reward: %.4f, Episode step: %d'
                  % (proc_idx, frame_idx, ep_reward, ep_steps))
            reward_buf.append(ep_reward)
            steps_buf.append(ep_steps)
            if len(reward_buf) == REWARD_GROUPS:
                reward = np.mean(reward_buf)
                steps = np.mean(steps_buf)
                reward_buf.clear()
                steps_buf.clear()
                total_reward.append(reward)
                total_steps.append(steps)
                speed = (frame_idx - frame_prev) / (time.time() - ts)
                frame_prev = frame_idx
                ts = time.time()
                mean_reward = np.mean(total_reward[-100:])
                mean_step = np.mean(total_steps[-100:])
                logging.info(
                    '%d done, mean reward %.3f, mean step %d, speed %d f/s'
                    % (frame_idx, mean_reward, mean_step, speed))
                writer.add_scalar('speed', speed, frame_idx)
                writer.add_scalar('reward', reward, frame_idx)
                writer.add_scalar('reward_100', mean_reward, frame_idx)
                writer.add_scalar('steps', steps, frame_idx)
                writer.add_scalar('steps_100', mean_step, frame_idx)
                if best_mean_reward is None or best_mean_reward < mean_reward:
                    torch.save(
                        net.state_dict(),
                        os.path.join(save_path,
                                     'best_mean_reward-%.3f.pth' % mean_reward))
                    if best_mean_reward is not None:
                        logging.info(
                            'Worker_%d: Best mean value updated %.3f -> %.3f'
                            % (proc_idx, best_mean_reward, mean_reward))
                    best_mean_reward = mean_reward
    writer.close()
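# The worker above only *produces* gradients; a parent process (not shown in
# this snippet) consumes train_queue. A minimal, hypothetical sketch of that
# consumer step, assuming `net`, `optimizer`, and `device` are the shared
# network, its optimizer, and the torch device:
import torch

def apply_worker_grads(net, optimizer, train_queue, device):
    # One gradient list per parameter, as produced by the worker
    grads = train_queue.get()
    for param, grad in zip(net.parameters(), grads):
        if grad is not None:
            param.grad = torch.tensor(grad, device=device)
    optimizer.step()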
from lib import data, model, common, ML, visualize
import numpy as np
import graphviz
from sklearn import tree

READ_SAME = False
MAX_DATA_COL = 13
PATH = "/home/chris/projects/Kaggle/heart_200719/data/heart.csv"
TYPE_LIST = [
    "num", "mc", "mc", "num", "num", "mc", "mc", "num", "mc", "num", "mc",
    "mc", "mc"
]

decisionTree = ML.DecisionTree(tolerance=0.00, min_element=1, max_depth=100)

if READ_SAME:
    df = data.read_csv(
        "/home/chris/projects/Kaggle/heart_200719/data/heart_debug.csv")
    seed_arr = data.df2array(df)
    # get the feature name list
    feature_list = []
    for i, key in enumerate(df.keys()):
        feature_list.append(key)
else:
    # read the df
    df = data.read_csv(PATH)
    # get the feature name list
    feature_list = []
    for i, key in enumerate(df.keys()):
        feature_list.append(key)