import json

import pandas as pd

from src.utils.path_manager import PathManager


def make_submission_file(id, pred, test_ids, data_eval):
    print("\n\n")
    print("id:", id)
    subm_name = str(id)  # tolerate non-string ids

    pred = pd.Series(pred).round(8)

    # The same prediction is written for each of the six target months.
    subm = pd.DataFrame()
    subm["ParcelId"] = test_ids
    subm["201610"] = pred
    subm["201611"] = pred
    subm["201612"] = pred
    subm["201710"] = pred
    subm["201711"] = pred
    subm["201712"] = pred

    print("submission")
    print(subm)

    subm_path = PathManager().get_submission_dir() + subm_name + ".csv"
    subm.to_csv(subm_path, index=False)

    subm_metadata = PathManager().get_submission_dir() + subm_name + ".json"
    with open(subm_metadata, 'w') as file:
        submission_data = {}
        submission_data["id"] = id
        submission_data["score"] = ""
        json.dump(submission_data, file)
def save(self):
    # Persist the metrics, the interactive plot and the raw predictions,
    # all keyed by this evaluation's id.
    results_file_path = PathManager().get_results_data_eval_dir() + str(self.id) + ".json"
    with open(results_file_path, 'w') as file:
        json.dump(self.result_dict(), file)

    plot_file_path = PathManager().get_results_plot_dir() + str(self.id) + ".html"
    self.plot(show=False, save=True, file_name=plot_file_path)

    result_df_file_path = PathManager().get_results_predictions_eval_dir() + str(self.id) + ".csv"
    self.result_df.to_csv(result_df_file_path, index=False)
def submission(model, norm, feat_selection, inputation, new_features, subm_name):
    dao = DAO(new_features=new_features)

    if norm:
        train = dao.get_normalized_data(dataset="train", inputation=inputation,
                                        max_na_count_columns=0.05)
        test = dao.get_normalized_data(dataset="test", inputation=inputation,
                                       max_na_count_columns=1)
        print(len(test))
    else:
        train = dao.get_data(cols_type="numeric", dataset="train",
                             max_na_count_columns=0.05)
        test = dao.get_data(cols_type="numeric", dataset="test",
                            max_na_count_columns=0.05)

    test_ids = test.index.tolist()

    if feat_selection is None:
        feat_selection_name = ""
    else:
        feat_selection_name = feat_selection.__name__
        # Restrict both sets to the selected columns (plus the target on train).
        columns = feat_selection(train)
        train_columns = columns + [TARGET]
        train = train[train_columns]
        test = test[columns]

    ev = Evaluator(model=model)
    pred = ev.run(train, test, abs_target=False)
    pred = pd.Series(pred).round(10)

    subm = pd.DataFrame()
    subm["ParcelId"] = test_ids
    subm["201610"] = pred
    subm["201611"] = pred
    subm["201612"] = pred
    subm["201710"] = pred
    subm["201711"] = pred
    subm["201712"] = pred

    subm_path = PathManager().get_submission_dir() + subm_name + ".csv"
    subm.to_csv(subm_path, index=False)

    subm_metadata = PathManager().get_submission_dir() + subm_name + ".json"
    with open(subm_metadata, 'w') as file:
        submission_dict = {}
        submission_dict["submission_name"] = subm_name
        submission_dict["norm"] = norm
        submission_dict["feat_selection"] = feat_selection_name
        submission_dict["model"] = model.get_model_name()
        submission_dict["inputation"] = inputation
        submission_dict["score"] = ""
        json.dump(submission_dict, file)
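# Hedged usage sketch for submission(). `SomeModel` is a stand-in for any
# wrapper exposing get_model_name() plus the train/predict interface that
# Evaluator expects; the argument values mirror options used elsewhere in
# this repo.
#
# submission(model=SomeModel(),
#            norm=True,
#            feat_selection=select_by_corr_thresh,  # correlation filter, defined below
#            inputation="fill_0",                   # one of "drop", "fill_0", "column_mean", "column_mean_fine"
#            new_features=["knn-longitude-latitude"],
#            subm_name="somemodel_norm_corr_fill0")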
import json

from src.utils.path_manager import PathManager


def get_data_eval(id):
    filepath = PathManager().get_results_data_eval_dir() + id + ".json"
    with open(filepath, 'r') as file:
        json_data_eval = json.load(file)
    return json_data_eval
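# Example (hedged): fetch one stored evaluation by id; "mae" and "r2" are
# metric keys the repo's result dicts carry (see the aggregation script below).
#
# eval_dict = get_data_eval("1234567890")  # hypothetical id
# print(eval_dict["mae"], eval_dict["r2"])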
def predict(self, df_test):
    # Round-trip through a temporary CSV because h2o imports frames from disk.
    test_path = PathManager().get_temp_dir() + "test_temp.csv"
    df_test[self.use_cols].to_csv(test_path, index=False)
    h2o_df_test = h2o.import_file(test_path)

    h2o_pred = self.model.predict(h2o_df_test)
    pred = pd.read_csv(StringIO(h2o_pred.get_frame_data()), sep=",")["predict"]
    return pred.tolist()
def train(self, df_train, target_name):
    use_cols = df_train.columns.tolist()  # columns from train
    use_cols.remove(target_name)          # remove target from train dataset
    parcelid_index = df_train.index.tolist()

    train_path = PathManager().get_temp_dir() + "train_temp.csv"
    df_train.to_csv(train_path, index=False)
    h2o_df_train = h2o.import_file(train_path)

    self.model.train(x=use_cols, y=target_name, training_frame=h2o_df_train)

    self.df_train = df_train
    self.target_name = target_name
    self.use_cols = use_cols
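# Hedged end-to-end sketch of this wrapper's train/predict cycle.
# `H2OWrapper` is a stand-in name for the class these methods belong to;
# the target column follows the repo's "logerror" convention.
#
# import h2o
# h2o.init()                                  # the wrapper assumes a running H2O cluster
# wrapper = H2OWrapper()
# wrapper.train(df_train, target_name="logerror")
# predictions = wrapper.predict(df_test)      # plain Python list, aligned with df_test rows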
class _Agent(abc.ABC):
    """ Abstract agent class.

    An agent unifies the three cornerstones of the system:
     - an environment in which the agent acts
     - a policy that it uses to make decisions about how to act
     - a representation module that converts an environment state into a latent representation.

    Implementations of the agent class provide methods for training the latter components for the purpose
    of acting in the environment.
    """

    representation_learner: _RepresentationLearner
    policy: _Policy
    environments: List[Env]

    @abc.abstractmethod
    def __init__(self, representation_learner: _RepresentationLearner, policy: _Policy, environments: List[Env]):
        self.environments = environments
        self.policy = policy
        self.representation_learner = representation_learner
        self.start_episode = 0
        self.path_manager = PathManager()
        self.logger = Logger('logs', self.get_config_name())

        # check that the environments are given as a list
        if not isinstance(environments, list):
            raise ValueError(
                "Need to provide a list of environments. For single-environment training, provide a single-element list.")

        # check that all environments share the same action space
        if not len(set(env.action_space.n for env in self.environments)) == 1:
            raise ValueError("All environments need to have the same number of available actions!")

        # check that all environments share the same state space
        if not len(set(env.observation_space.shape for env in self.environments)) == 1:
            raise ValueError("All environments need to have the same state dimensionality!")

    @abc.abstractmethod
    def train_agent(self, episodes: int, ckpt_to_load: str = None, episodes_per_saving: int = None,
                    plot_every: int = None, log: bool = False) -> None:
        """
        Train the agent for some number of episodes. The max length of an episode is specified in the
        environment. Optionally save or load checkpoints from previous trainings.

        :param episodes: the number of episodes
        :param ckpt_to_load: checkpoint to load. Default: None
        :param episodes_per_saving: number of episodes between checkpoint saves. Default: None
        :param plot_every: number of steps between plots of the space representation
        :param log: whether logging is done. Default: False
        """
        raise NotImplementedError

    def act(self, current_state: Tensor, env: Env) -> Tuple[Tensor, float, bool]:
        """
        Make the agent choose an action given the current state. This implies encoding the state if the
        representation learner is capable of doing so.

        :param current_state: current state of the environment
        :return: next state of the environment along with the reward and a flag that indicates whether
                 the episode is finished
        """
        action = self.policy.choose_action_policy(current_state)
        next_state, step_reward, env_done, _ = step_env(action, env)
        return next_state, step_reward, env_done

    def report_progress(self, episode, total_episodes, start_time, last_rewards, last_repr_losses,
                        last_policy_losses):
        numb_reported_episodes = len(last_rewards)
        print(f"\t|-- {int(round(episode / total_episodes * 100)):3d}% ({episode}); "
              f"r-avg: {(sum(last_rewards) / numb_reported_episodes):8.2f}; "
              f"r-peak: {int(max(last_rewards)):4d}; "
              f"r-slack: {int(min(last_rewards)):4d}; "
              f"r-median: {int(statistics.median(last_rewards)):4d}; "
              f"Avg. repr_loss: {sum(last_repr_losses) / numb_reported_episodes:10.4f}; "
              f"Avg. policy_loss: {sum(last_policy_losses) / numb_reported_episodes:15.4f}; "
              f"Time elapsed: {(time.time() - start_time) / 60:6.2f} min; "
              f"Eps: {self.policy.memory_epsilon_calculator.value(self.policy.total_steps_done - self.policy.memory_delay):.5f}")

    def test(self, env: Env, numb_runs: int = 1, render: bool = False, visual=True) -> None:
        """
        Run tests in the environment using the current policy without exploration.

        :param numb_runs: number of test runs to perform.
        :param render: whether to render the environment
        """
        all_rewards = []
        fig = plt.figure(figsize=(10, 6))
        for i in range(numb_runs):
            plt.clf()
            ims = []
            done = False
            state = reset_env(env)
            step = 0
            total_reward = 0
            while not done:
                state, reward, done = self.act(state, env)
                step += 1
                total_reward += reward
                if visual:
                    ims.append([plt.imshow(state.cpu(), cmap="binary", origin="upper", animated=True)])
                if render:
                    env.render()
            all_rewards.append(total_reward)
            print(f"Tested episode {i} took {step} steps and gathered a reward of {total_reward}.")

            if not render and visual:
                ani = animation.ArtistAnimation(fig, ims, blit=True, repeat_delay=1000)
                ani.save(self.path_manager.get_data_dir(f'{env.__class__.__name__}_testrun_{i}.gif'),
                         writer="pillow", fps=15)
        print(f'Average max score after {numb_runs} testruns: {sum(all_rewards) / len(all_rewards)} '
              f'with a peak of {max(all_rewards)} at episode {all_rewards.index(max(all_rewards))}')

    def get_config_name(self):
        return "_".join([self.__class__.__name__,
                         "_".join([env.spec.id for env in self.environments]),
                         self.representation_learner.__class__.__name__,
                         self.policy.__class__.__name__])

    def save(self, episode: int, save_repr_learner: bool = True, save_policy_learner: bool = True) -> None:
        ckpt_dir = self.path_manager.get_ckpt_dir(self.get_config_name())

        if save_repr_learner:
            save_checkpoint(self.representation_learner.current_state(), episode, ckpt_dir, 'repr')

        if save_policy_learner:
            save_checkpoint(self.policy.get_current_training_state(), episode, ckpt_dir, 'policy')

    def load(self, ckpt_dir: str, load_repr_learner: bool = True, load_policy_learner: bool = True,
             gpu: bool = True) -> None:

        if load_repr_learner and load_policy_learner:
            self.start_episode = apply_checkpoint(ckpt_dir, policy=self.policy,
                                                  repr=self.representation_learner, gpu=gpu)
        elif load_repr_learner:
            self.start_episode = apply_checkpoint(ckpt_dir, repr=self.representation_learner, gpu=gpu)
        elif load_policy_learner:
            self.start_episode = apply_checkpoint(ckpt_dir, policy=self.policy, gpu=gpu)
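# A minimal concrete subclass sketch (hypothetical, not part of the repo): it
# satisfies _Agent's abstract contract without doing any learning, and reuses
# act(), save() and load() from the base class.
class RandomBaselineAgent(_Agent):

    def __init__(self, representation_learner: _RepresentationLearner, policy: _Policy, environments: List[Env]):
        super().__init__(representation_learner, policy, environments)

    def train_agent(self, episodes: int, ckpt_to_load: str = None, episodes_per_saving: int = None,
                    plot_every: int = None, log: bool = False) -> None:
        if ckpt_to_load is not None:
            self.load(ckpt_to_load)

        for episode in range(self.start_episode, episodes):
            env = self.environments[episode % len(self.environments)]
            state = reset_env(env)
            done = False
            while not done:
                # No learning step here; a real agent would also update the
                # representation learner and the policy from the transitions.
                state, reward, done = self.act(state, env)

            if episodes_per_saving is not None and episode % episodes_per_saving == 0:
                self.save(episode)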
class Logger(object):

    def __init__(self, log_dir: str, config: str):
        """Create a summary writer logging to log_dir."""
        self.path_manager = PathManager()
        out_dir = self.path_manager.get_subdir_under_root(os.path.join(log_dir, config))
        self.writer = tf.summary.FileWriter(out_dir,
                                            filename_suffix='_{}'.format(self.path_manager.start_timestamp))

    def scalar_summary_dict(self, info: dict, step: int):
        """Log a set of scalar variables."""
        for tag, value in info.items():
            self.scalar_summary(tag, value, step)

    def scalar_summary(self, tag: str, value: float, step: int):
        """Log a scalar variable."""
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
        self.writer.add_summary(summary, step)

    def image_summary(self, tag, images, step):
        """Log a list of images."""
        img_summaries = []
        for i, img in enumerate(images):
            # Write the image to a string
            s = BytesIO()
            scipy.misc.toimage(img).save(s, format="png")

            # Create an Image object
            img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(),
                                       height=img.shape[0],
                                       width=img.shape[1])

            # Create a Summary value
            img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum))

        # Create and write Summary
        summary = tf.Summary(value=img_summaries)
        self.writer.add_summary(summary, step)

    def histo_summary(self, tag, values, step, bins=1000):
        """Log a histogram of the tensor of values."""
        # Create a histogram using numpy
        counts, bin_edges = np.histogram(values, bins=bins)

        # Fill the fields of the histogram proto
        hist = tf.HistogramProto()
        hist.min = float(np.min(values))
        hist.max = float(np.max(values))
        hist.num = int(np.prod(values.shape))
        hist.sum = float(np.sum(values))
        hist.sum_squares = float(np.sum(values ** 2))

        # Drop the start of the first bin
        bin_edges = bin_edges[1:]

        # Add bin edges and counts
        for edge in bin_edges:
            hist.bucket_limit.append(edge)
        for c in counts:
            hist.bucket.append(c)

        # Create and write Summary
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
        self.writer.add_summary(summary, step)
        self.writer.flush()
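if __name__ == "__main__":
    # Hedged usage sketch: log a few scalars through the TF1 summary API above.
    # The config string is illustrative only.
    logger = Logger('logs', 'demo_config')
    for step in range(10):
        logger.scalar_summary('reward', float(step), step)
    logger.scalar_summary_dict({'repr_loss': 0.5, 'policy_loss': 1.2}, step=10)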
good_cols.remove("logerror") picked_cols = [] for index, row in use_df_corr.loc[good_cols][good_cols].iterrows(): # print(index) use_row = row[row.index != index] high_correlateds = use_row[use_row > corr_threshold].index.tolist() for high_correlated in high_correlateds: if high_correlated in good_cols and not high_correlated in picked_cols: good_cols.remove(high_correlated) picked_cols.append(index) return good_cols if __name__ == "__main__": new_features_list = listdir(PathManager().get_new_features_dir()) new_features_list = [[new_features.replace(".csv", "")] for new_features in new_features_list] print("new_features_list:", new_features_list) dao = DAO(train_file_name="train_complete_2016.csv", new_features=["knn-longitude-latitude"]) df = dao.get_normalized_data(max_na_count_columns=0.05) df = df.dropna() print(select_by_corr_thresh(df)) print(df.columns.tolist()) #good_cols: ['longitude--latitude', 'bedroomcnt', 'structuretaxvaluedollarcnt', 'yearbuilt']
class DAO:

    def __init__(self, train_file_name=TRAIN_2016_DATA_FILE_NAME, test_file_name=TEST_2016_DATA_FILE_NAME,
                 new_features=[]):
        self.pm = PathManager()

        train_df_file_path = self.pm.get_data_dir(train_file_name)
        self.data_train = self.load_data(train_df_file_path, new_features=new_features)

        test_df_file_path = self.pm.get_data_dir(test_file_name)
        self.data_test = self.load_data(test_df_file_path, new_features=new_features)

    def load_data(self, df_file_path, new_features=[]):
        df = pd.read_csv(df_file_path, low_memory=False)
        df = df.set_index(df["parcelid"])
        del df["parcelid"]

        # Left-join each engineered feature file on parcelid.
        for new_feature in new_features:
            path = PathManager().get_new_features_dir() + new_feature + ".csv"
            new_feature_df = pd.read_csv(path, low_memory=False)
            new_feature_df = new_feature_df.set_index(new_feature_df["parcelid"])
            df = df.merge(new_feature_df, left_index=True, right_index=True, how="left")

        gc.collect()
        return df

    def get_data(self, cols_type=None, inputation=None, dataset="train", max_na_count_columns=1):
        '''
        cols_type: None or 'numeric'.
            None: returns all columns
            'numeric': returns only numeric columns

        max_na_count_columns: NA threshold for the maximum proportion of NAs per column.
            Example: 1 returns columns whose NA proportion is less than or equal to 100%.
            Example: 0.25 returns columns whose NA proportion is less than or equal to 25%.
        '''
        if dataset == "train":
            use_data = self.data_train
        elif dataset == "test":
            use_data = self.data_test

        if cols_type == "numeric":
            numeric_cols = self.infer_numeric_cols(use_data)
            use_data = use_data[numeric_cols]

        use_cols = self.less_na_cols(use_data, threshold=max_na_count_columns)
        gc.collect()

        df = use_data[use_cols]

        if inputation == "drop":
            df = df.dropna()
        elif inputation == "fill_0":
            df = df.fillna(0)
        elif inputation == "column_mean":
            df = col_mean_inputer(df)
        elif inputation == "column_mean_fine":
            df = col_mean_inputer_fine(df)

        return df

    def get_normalized_data(self, dataset="train", inputation=None, max_na_count_columns=1):
        '''
        Returns normalized data. Only numeric columns are returned.

        IMPORTANT: the default value of inputation means that remaining ROWS with any NA values are removed.

        max_na_count_columns: NA threshold for the maximum proportion of NAs per COLUMN.
            Example: 1 returns COLUMNS whose NA proportion is less than or equal to 100%.
            Example: 0.25 returns COLUMNS whose NA proportion is less than or equal to 25%.
        '''
        df = self.get_data(cols_type="numeric", inputation=inputation, dataset=dataset,
                           max_na_count_columns=max_na_count_columns)

        if dataset == "train":
            target = df["logerror"]
            del df["logerror"]

        parcelid_index = df.index

        # Min-max scale every column to [0, 1], then restore the index and target.
        x = df.values
        min_max_scaler = preprocessing.MinMaxScaler()
        x_scaled = min_max_scaler.fit_transform(x)
        df_norm = pd.DataFrame(x_scaled)
        df_norm.columns = df.columns
        gc.collect()

        df_norm = df_norm.set_index(parcelid_index)

        if dataset == "train":
            df_norm["logerror"] = target.tolist()

        return df_norm

    def infer_numeric_cols(self, df):
        numeric_cols = []
        for col in df.columns:
            try:
                df[col].astype("float")
                numeric_cols.append(col)
            except ValueError:
                pass
        return numeric_cols

    def less_na_cols(self, data, threshold=1):
        '''
        Return names of columns whose NA proportion is less than or equal to threshold.
        '''
        na_df = pd.Series(data.isnull().sum() / len(data)).sort_values(ascending=False)
        cols = na_df[na_df <= threshold].index.tolist()
        return cols
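if __name__ == "__main__":
    # Hedged usage sketch, mirroring the feature-selection __main__ block above:
    # requires the repo's data files to be present.
    dao = DAO(train_file_name="train_complete_2016.csv",
              new_features=["knn-longitude-latitude"])
    numeric_train = dao.get_data(cols_type="numeric", dataset="train",
                                 max_na_count_columns=0.05)
    normalized_train = dao.get_normalized_data(dataset="train", inputation="drop",
                                               max_na_count_columns=0.05)
    print(normalized_train.shape)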
import pandas as pd
import json
from os import listdir

from src.utils.path_manager import PathManager

pd.set_option('display.float_format', lambda x: '%.7f' % x)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

data_eval_file_paths = listdir(PathManager().get_results_data_eval_dir())

if len(data_eval_file_paths) == 0:
    raise Exception("No results found")

evals = []
for file_path in data_eval_file_paths:
    with open(PathManager().get_results_data_eval_dir() + file_path, "r") as file:
        content = json.load(file)
        evals.append(content)

evals_df = pd.DataFrame(evals).sort_values(by="mae").reset_index()
evals_df.to_csv(PathManager().get_results_dir() + "evals_df.csv", index=False)

evals_df = pd.DataFrame(evals).sort_values(by="r2", ascending=False)
use_evals = evals_df[(evals_df["abs"].astype(str) != "True")]

print("all_cols:", evals_df.columns.tolist())
print()

use_cols = [
    "cols_type", "feat_selection", "inputation", "model_name", "norm", "abs",