def train_and_evaluate(train_dir, eval_dir, config, model_dir=None):
    """Run the estimator's combined training and evaluation loop.

    Args:
        train_dir (string): Path of the training directory.
        eval_dir (string): Path of the evaluation directory.
        config (configparser): Config file containing the different
            configurations and hyperparameters. Its 'RUN_CONFIG' section is
            read here, and the whole object is handed to the model as params.
        model_dir (string): Directory where all outputs (checkpoints, event
            files, etc.) are written. If model_dir is not set, a temporary
            directory is used.
    """
    run_settings = config['RUN_CONFIG']

    def _training_input():
        # Training data is repeated and shuffled.
        return input_fn(train_dir, repeat=True, shuffle=True)

    def _evaluation_input():
        # Evaluation data is read once, in order.
        return input_fn(eval_dir, repeat=False, shuffle=False)

    # Checkpointing policy: how often to save and how many checkpoints to keep.
    checkpoint_period = int(run_settings['save_checkpoints_secs'])
    checkpoints_kept = int(run_settings['keep_checkpoint_max'])
    run_config = tf.estimator.RunConfig(
        save_checkpoints_secs=checkpoint_period,
        keep_checkpoint_max=checkpoints_kept,
    )

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=model_dir,
        config=run_config,
        params=config,
    )

    training_spec = tf.estimator.TrainSpec(
        input_fn=_training_input,
        max_steps=int(run_settings['train_steps']),
    )
    evaluation_spec = tf.estimator.EvalSpec(
        input_fn=_evaluation_input,
        throttle_secs=int(run_settings['throttle_secs']),
    )

    tf.estimator.train_and_evaluate(estimator, training_spec, evaluation_spec)
def predict(test_dir, model_dir, config):
    """Compute the model's predictions for a test dataset.

    Args:
        test_dir (string): Path of the test directory.
        model_dir (string): Directory with the trained model.
        config (configparser): Config file containing the different
            configurations and hyperparameters; passed to the model as params.

    Returns:
        list: The 'predictions' entry of every result yielded by the
            estimator, in the order they were produced.
    """
    def _test_input():
        # Single ordered pass over the test data.
        return input_fn(test_dir, repeat=False, shuffle=False)

    trained_estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=model_dir,
        params=config,
    )

    predicted_values = []
    for output in trained_estimator.predict(input_fn=_test_input):
        predicted_values.append(output['predictions'])
    return predicted_values
def _dump_delay_chunk(frames, tmp_dir, it):
    """Concatenate *frames*, write them to a parquet file in *tmp_dir* and return its path."""
    chunk_path = os.path.join(tmp_dir, "tmp_df_" + str(it) + ".parquet")
    pd.concat(frames).to_parquet(chunk_path)
    return chunk_path


def predict_and_save(test_dir, model_dir, save_dir, filename, config):
    """Save a CSV comparing the real delays with the predicted delays.

    The targets of the test dataset are accumulated in batches that are
    spilled to temporary parquet files, read back and joined into a single
    dataframe. The model's predictions are then added, together with the
    absolute error and the absolute percentage error of every row, and the
    result is written to ``save_dir/filename`` in CSV format.

    Args:
        test_dir (string): Path of the test directory.
        model_dir (string): Directory with the trained model.
        save_dir (string): Directory where the generated dataframe will be
            saved (in csv). It is created if it does not exist.
        filename (string): The filename of the dataframe.
        config (configparser): Config file containing the different
            configurations and hyperparameters.

    Returns:
        pandas.DataFrame: The dataframe that was written to disk, with the
            columns "Delay", "Predicted_Delay", "Absolute_Error" and
            "Absolute_Percentage_Error". No aggregate (mean) error is
            computed here; callers can derive it from the last column.
    """
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(save_dir, exist_ok=True)

    ds = input_fn(test_dir, repeat=False, shuffle=False)

    # The context manager removes the temporary directory and its parquet
    # chunks once they have been read back, even if an error occurs.
    with tempfile.TemporaryDirectory() as tmp_dir:
        chunk_files = []
        pending_frames = []
        delays = np.array([])
        it = 0

        for _, target in ds:
            it += 1
            delays = np.append(delays, target)

            # Every 1000 elements, wrap the accumulated delays in a dataframe.
            if it % 1000 == 0:
                pending_frames.append(pd.DataFrame({"Delay": delays}))
                delays = np.array([])

            # Every 3000 elements, spill the pending dataframes to disk.
            if it % 3000 == 0:
                chunk_files.append(_dump_delay_chunk(pending_frames, tmp_dir, it))
                pending_frames = []

        # Flush whatever is left after the last full group of 3000 elements.
        if it % 3000 != 0:
            if it % 1000 != 0:
                pending_frames.append(pd.DataFrame({"Delay": delays}))
            chunk_files.append(_dump_delay_chunk(pending_frames, tmp_dir, it))

        # Must happen before leaving the block: the chunk files are deleted on exit.
        df = pd.concat([pd.read_parquet(path) for path in chunk_files])

    csv_path = os.path.join(save_dir, filename)

    df["Predicted_Delay"] = predict(test_dir, model_dir, config)
    df['Absolute_Error'] = np.abs(df["Delay"] - df["Predicted_Delay"])
    df['Absolute_Percentage_Error'] = (df['Absolute_Error'] / np.abs(df["Delay"])) * 100

    df.to_csv(csv_path)
    return df
def generate_upload_csv(test_dir, model_dir, filename, config):
    """Generates, compresses (in ZIP) and saves a CSV file with the predicted delays.

    The predictions are arranged as one line per sample with 342 values each,
    and written to ``<filename>.zip`` using ";" as separator, without header
    and without index column. Warnings are printed (the file is still
    written) when the filename contains a path or when the resulting table is
    not 50,000 x 342.

    Args:
        test_dir (string): Path of the test dataset root directory.
        model_dir (string): Directory of the trained model.
        filename (string): The filename of the compressed CSV file, as a
            simple name without path or extension (i.e., "submission_file",
            not "./home/dataset/submission_file.zip"); ".zip" is appended.
        config (configparser): Config file containing the different
            configurations and hyperparameters.

    Raises:
        ValueError: If the predictions cannot be reshaped into lines of 342
            values (raised by numpy's reshape).
    """
    # The network of the test dataset has in total 342 src-dst paths
    # (19 sources x 18 destinations = 342 src-dst pairs).
    paths_per_sample = 342
    expected_shape = (50000, paths_per_sample)

    if '/' in filename:
        print("---WARNING---")
        print(
            '---Filename must be a simple filename, it should not include a path--- '
            'Use "submission_file" instead of "./home/dataset/submission_file.zip"'
        )

    print("GENERATING DELAY LABELS WITH THE TRAINED MODEL...")

    ########################
    # Generate predictions #
    ########################

    # Create the estimator loading the model
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=model_dir,
        params=config,
    )

    # Generate the dataset and make the predictions
    pred_results = estimator.predict(
        input_fn=lambda: input_fn(test_dir, repeat=False, shuffle=False))

    # Collect the predictions
    pred = np.array([output['predictions'] for output in pred_results])

    ###################
    # Denormalization #
    ###################

    # If you have applied any normalization, please denormalize the predicted values here

    ####################
    # Prepare the data #
    ####################

    print("RESHAPING THE DATA...")
    # Each line of the CSV file contains the 342 src-dst delays of a sample.
    # Integer division keeps the row count an exact int.
    pred = pred.reshape(pred.shape[0] // paths_per_sample, paths_per_sample)

    print("CHECKING CSV format...")
    if pred.shape != expected_shape:
        print("--- WARNING ---")
        print(
            "--- The format of the CSV file is not correct. It must have 50,000 lines with 342 values each one---"
        )
        print("It has currently the following lines and and elements: " + str(pred.shape))

    print("SAVING CSV FILE COMPRESSED IN ZIP...")
    df = pd.DataFrame(pred)

    # The CSV file uses ";" as separator between values and is directly
    # compressed in ZIP.
    # NOTE(review): with the plain 'zip' mode pandas picks the name of the
    # file stored inside the archive; a dict such as
    # dict(method='zip', archive_name=...) would pin it — confirm what the
    # submission checker expects before changing this.
    df.to_csv(filename + '.zip', header=False, index=False, sep=";", compression='zip')