def lr_schedule(ep):
    """Step-decay learning rate schedule: halve the rate every 10 epochs."""
    i_lr = 0.01
    drop = 0.5
    ep_drop = 10.0
    lr = i_lr * math.pow(drop, math.floor((1 + ep) / ep_drop))
    logger.info('New learning rate: %01.10f', lr)
    return lr

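# Worked example (illustrative values, not part of the pipeline): with
# i_lr=0.01, drop=0.5 and ep_drop=10.0 the schedule above yields
#   epochs 0-8   -> 0.01 * 0.5**0 = 0.01
#   epochs 9-18  -> 0.01 * 0.5**1 = 0.005
#   epochs 19-28 -> 0.01 * 0.5**2 = 0.0025
# i.e. the rate drops at epochs 9, 19, 29, ... because of the (1 + ep) offset.
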
def execute_to_numpy(spark):
    logger.info("Saving data as numpy arrays ...")
    overall = time()
    transform_input_to_numpy(spark)
    transform_output_to_numpy(spark)
    logger.info("Saving data as numpy arrays done in %s.",
                time_it(overall, time()))

def main():
    overall = time()
    logger.info("Main script started ...")
    proper = False
    transpose = False
    to_numpy = False
    trim_and_split = False
    spark = None
    sc = None
    for arg in sys.argv[1:]:
        if arg == 'proper':
            proper = True
        elif arg == 'transpose':
            transpose = True
        elif arg == 'to_numpy':
            to_numpy = True
        elif arg == 'trim_and_split':
            trim_and_split = True
    # No arguments: run the whole pipeline.
    if not proper and not transpose and not to_numpy and not trim_and_split:
        proper = True
        transpose = True
        to_numpy = True
        trim_and_split = True
    # Spark is only needed for the transpose and to_numpy stages.
    if transpose or to_numpy:
        from pyspark import SparkContext
        from pyspark.sql import SparkSession
        sc = SparkContext("local[3]", "test")
        spark = SparkSession(sc)
    if proper:
        execute_proper()
    if transpose:
        execute_transpose(spark)
    if to_numpy:
        execute_to_numpy(spark)
    if trim_and_split:
        execute_trim_and_split()
    if sc is not None:
        sc.stop()
    logger.info("Main script done in %s.", time_it(overall, time()))

def load_previous_model_if_available(model_dir, model_file_name):
    fpp = None
    model = None
    if previous_keras_model_file_exists(model_dir, model_file_name):
        logger.info("Loading model ...")
        fpp = load_feature_prep_pipeline(model_dir, model_file_name)
        model = load_keras_model(model_dir, model_file_name)
        logger.info("Loading model done.")
    return fpp, model

def execute_trim_and_split():
    logger.info("Starting initial data preparation ...")
    train_x, train_y, _, _, _, _, _, _, _ = load_all_data(train_set=True,
                                                          val_set=False,
                                                          test_set=False,
                                                          init=True)
    print(train_x.shape, train_y.shape)
    logger.info("Trimming training data ...")
    train_x, train_y = trim_data(train_x, train_y, years=MLC.YEARS)
    print(train_x.shape, train_y.shape)
    logger.info("Trimming training data done.")
    logger.info("Splitting prepared training data ...")
    # TEST_SIZE == 0 is a convention meaning "use the full pre-split training
    # set as the test set as well".
    train_x_as_test = None
    train_y_as_test = None
    if MLC.TEST_SIZE == 0:
        train_x_as_test = train_x
        train_y_as_test = train_y
    train_x, train_y, val_x, val_y, test_x, test_y, train_i, val_i, test_i = split_data(
        x=train_x,
        y=train_y,
        train_size=MLC.TRAIN_SIZE,
        val_size=MLC.VAL_SIZE,
        test_size=MLC.TEST_SIZE)
    if MLC.TEST_SIZE == 0:
        test_x = train_x_as_test
        test_y = train_y_as_test
        # np.arange instead of range so downstream arithmetic such as
        # test_i + 1 in execute_test keeps working.
        test_i = np.arange(len(test_y))
    logger.info("Splitting prepared training data done.")
    save_all_data(train_x=train_x, train_y=train_y, train_i=train_i,
                  val_x=val_x, val_y=val_y, val_i=val_i,
                  test_x=test_x, test_y=test_y, test_i=test_i)

def execute_train(model_dir, model_file_name, fpp, model, start_epoch,
                  end_epoch, train_x, train_y, train_i, val_x, val_y, val_i):
    model_creator = MLC.get_model_creator()
    if model is None:
        # No previous model: build pipeline and model from scratch and fit the
        # feature preparation pipeline on the training data.
        fpp = model_creator.build_feature_prep_pipeline()
        model = model_creator.build_model(MLC.INPUT_LEN, MLC.INPUT_DIM,
                                          MLC.OUTPUT_DIM)
        fit = True
    else:
        fit = False
    x_t = apply_feature_prep_pipeline(x=train_x, fpp=fpp, fit=fit)
    y_t = train_y
    x_v = apply_feature_prep_pipeline(x=val_x, fpp=fpp, fit=False)
    y_v = val_y
    logger.info('Building/compiling model ...')
    if MLC.GPUS > 1:
        model = multi_gpu_model(model, gpus=MLC.GPUS)
    model = model_creator.compile_model(model)
    model_tracker = mt.Model_Tracker(model_dir, model_file_name, model,
                                     x_v, y_v, val_i)
    callbacks = [
        LearningRateScheduler(model_creator.get_learning_rate_schedule()),
        TensorBoard(log_dir=get_tb_log_dir(), histogram_freq=0,
                    write_graph=True, write_images=False),
        model_tracker
    ]
    if fit:
        model_tracker.save_feature_prep_pipeline(fpp)
    logger.info('Building/compiling model done.')
    logger.info('Fitting model ...')
    model.fit(x=[x_t],
              y=y_t,
              batch_size=MLC.BATCH_SIZE,
              epochs=end_epoch,
              verbose=1,
              callbacks=callbacks,
              shuffle=True,
              sample_weight=None,
              initial_epoch=start_epoch,
              steps_per_epoch=None,
              validation_data=([x_v], y_v))
    logger.info('Fitting model done.')
    return fpp, model

def execute_test(fpp, model, test_x, test_y, test_i, model_dir,
                 model_file_name):
    logger.info("Testing model ...")
    x = apply_feature_prep_pipeline(x=test_x, fpp=fpp, fit=False)
    y = test_y
    y_p = model.predict(x, verbose=1)
    y = np.reshape(a=y, newshape=(len(y), ))
    y_p = np.reshape(a=y_p, newshape=(len(y_p), ))
    test_result = pd.DataFrame({
        RMC.SCEN_ID_COL: test_i + 1,
        'y': y,
        'y_pred': y_p,
        'Difference': y - y_p,
        'Deviation': (y - y_p) * 100 / y
    })
    test_result.set_index(RMC.SCEN_ID_COL, inplace=True)
    test_result.sort_index(inplace=True)
    skl_mse = mean_squared_error(y, y_p)
    skl_rmse = sqrt(skl_mse)
    print(" - test_skl_mse ({:.6f}), test_skl_rmse ({:.6f})".format(
        skl_mse, skl_rmse))
    print('\n')
    if model_dir is not None:
        with open(
                os.path.join(model_dir,
                             model_file_name + '_test_results.csv'),
                "w") as file:
            file.write("Test MSE: {0}, Test RMSE: {1}\n".format(
                skl_mse, skl_rmse))
            file.write("\n")
            test_result.to_csv(
                path_or_buf=file,
                columns=['y', 'y_pred', 'Difference', 'Deviation'])
            # Final row: mean absolute percentage deviation over the test set.
            file.write(",,,, {0}\n".format(
                np.mean(np.absolute(y - y_p) * 100 / y)))

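# Hedged aside: the mean absolute deviation written in the final CSV row above
# is a mean absolute percentage error. Assuming positive targets and
# scikit-learn >= 0.24, it can be cross-checked against the built-in metric
# (which returns a fraction, not a percentage):
#
#   from sklearn.metrics import mean_absolute_percentage_error
#   assert np.isclose(mean_absolute_percentage_error(y, y_p) * 100,
#                     np.mean(np.absolute(y - y_p) * 100 / y))
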
def execute_ada_boost(model_dir, model_file_name, start_epoch, end_epoch,
                      train_x, train_y, train_i, val_x, val_y, val_i):
    model_creator = MLC.get_model_creator()
    fpp = model_creator.build_feature_prep_pipeline()
    model_fn = lambda: model_creator.build_model(MLC.INPUT_LEN, MLC.INPUT_DIM,
                                                 MLC.OUTPUT_DIM)
    x_t = apply_feature_prep_pipeline(x=train_x, fpp=fpp, fit=True)
    # AdaBoost works on flat feature vectors, so collapse the time dimension.
    x_t = x_t.reshape(-1, MLC.INPUT_LEN * MLC.INPUT_DIM)
    y_t = train_y
    x_v = apply_feature_prep_pipeline(x=val_x, fpp=fpp, fit=False)
    y_v = val_y
    logger.info('Building/compiling model ...')
    #if MLC.GPUS > 1:
    #    model = multi_gpu_model(model, gpus=MLC.GPUS)
    #model = model_creator.compile_model(model)
    #model_tracker = mt.Model_Tracker(model_dir, model_file_name, model,
    #                                 x_v, y_v, val_i)
    # Note: these callbacks are currently built but not passed to the
    # regressor below.
    callbacks = [
        LearningRateScheduler(model_creator.get_learning_rate_schedule())
    ]
    # TensorBoard(log_dir=get_tb_log_dir(), histogram_freq=0,
    #             write_graph=True, write_images=False),
    # model_tracker]
    keras_regr = KerasRegressor(build_fn=model_fn,
                                batch_size=MLC.BATCH_SIZE,
                                epochs=end_epoch,
                                verbose=1,
                                shuffle=True,
                                initial_epoch=start_epoch)
    ada_boost = AdaBoostRegressor(keras_regr, n_estimators=2,
                                  random_state=None)
    logger.info('Building/compiling model done.')
    logger.info('Fitting model ...')
    ada_boost.fit(x_t, y_t)
    logger.info('Fitting model done.')

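# Hedged sketch (not in the original flow): the fitted ensemble above is
# neither evaluated nor persisted. One way to inspect it on the validation
# set, assuming the same flattening as for x_t:
#
#   x_v_flat = x_v.reshape(-1, MLC.INPUT_LEN * MLC.INPUT_DIM)
#   y_v_pred = ada_boost.predict(x_v_flat)
#   # staged_predict yields predictions after each boosting round
#   for i, y_stage in enumerate(ada_boost.staged_predict(x_v_flat)):
#       logger.info('Boost stage %d val MSE: %f', i,
#                   mean_squared_error(y_v, y_stage))
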
def execute_proper():
    logger.info("Creating proper data files ...")
    then = time()
    wro = os.path.join(RMC.INPUT_DIR, RMC.PROPHET_INPUT_ALL + '.fac')
    pro = os.path.join(RMC.OUTPUT_DIR, RMC.PROPHET_INPUT_ALL_PROPER + '.csv')
    with open(wro, 'r') as orig:
        print("open orig")
        with open(pro, 'w') as copy:
            print("open copy")
            i = 0
            for line in orig:
                # Skip the two header lines of the .fac file.
                if i >= 2:
                    copy.write(line)
                i += 1
                if i % 100000 == 0:
                    logger.info("Creating proper input data file ... %3.2f%%",
                                (i * 100.0 / 780081))
    wro = os.path.join(RMC.INPUT_DIR, RMC.PROPHET_OUTPUT_ALL + '.csv')
    pro = os.path.join(RMC.OUTPUT_DIR, RMC.PROPHET_OUTPUT_ALL_PROPER + '.csv')
    with open(wro, 'r') as orig:
        print("open orig")
        with open(pro, 'w') as copy:
            print("open copy")
            i = 0
            for line in orig:
                # SCENARIO 2053's output is corrupted, so echo it for
                # inspection instead of copying it.
                if i != 2053:
                    copy.write(line.replace(',', ''))
                else:
                    print(line)
                i += 1
                if i % 1000 == 0:
                    logger.info("Creating proper output data file ... %3.2f%%",
                                (i * 100.0 / 10001))
    logger.info("Creating proper data files done in %s.",
                time_it(then, time()))

def save_all_data(train_x, train_y, train_i, val_x, val_y, val_i, test_x,
                  test_y, test_i):
    if train_x is not None and train_y is not None:
        logger.info("Saving prepared training data ...")
        save_data(train_x, RMC.TRAIN_X_DATA_FILE)
        save_data(train_y, RMC.TRAIN_Y_DATA_FILE)
        save_data(train_i, RMC.TRAIN_I_DATA_FILE)
        logger.info("Saving prepared training data done.")
    if val_x is not None and val_y is not None:
        logger.info("Saving prepared validation data ...")
        save_data(val_x, RMC.VAL_X_DATA_FILE)
        save_data(val_y, RMC.VAL_Y_DATA_FILE)
        save_data(val_i, RMC.VAL_I_DATA_FILE)
        logger.info("Saving prepared validation data done.")
    if test_x is not None and test_y is not None:
        logger.info("Saving prepared test data ...")
        save_data(test_x, RMC.TEST_X_DATA_FILE)
        save_data(test_y, RMC.TEST_Y_DATA_FILE)
        save_data(test_i, RMC.TEST_I_DATA_FILE)
        logger.info("Saving prepared test data done.")

def execute_transpose(spark):
    logger.info("Transposing data ...")
    overall = time()
    logger.info("Reading proper data file ...")
    then = time()
    df = spark.read.csv(path=os.path.join(
        RMC.OUTPUT_DIR, RMC.PROPHET_INPUT_ALL_PROPER + '.csv'),
                        header=True,
                        inferSchema=True)
    logger.info("Reading proper data file done in %s.", time_it(then, time()))
    logger.info("Collecting distinct value column names ...")
    then = time()
    import pyspark.sql.functions as sf
    df = df.withColumn(
        'EC_CL_MS_OS',
        sf.concat(df.ECONOMY, sf.lit('_'), df.CLASS, sf.lit('_'), df.MEASURE,
                  sf.lit('_'), df.OS_TERM))
    df = df.drop('!6', 'ECONOMY', 'CLASS', 'MEASURE', 'OS_TERM')
    # For faster testing, restrict to two months; must stay commented out for
    # full runs.
    #df = df.select('SCENARIO', 'EC_CL_MS_OS', '201712', '201801')
    # Collecting the distinct value column names is far cheaper now than after
    # all month columns have been transposed into rows.
    val_col_nms = sorted(
        df.select('EC_CL_MS_OS').distinct().rdd.map(
            lambda row: row[0]).collect())
    logger.info("Collecting distinct value column names done in %s.",
                time_it(then, time()))
    logger.info("Transposing month columns as rows ...")
    then = time()
    keep_col_nms = ['SCENARIO', 'EC_CL_MS_OS']
    mo_col_nms = [c for c in df.columns if c not in keep_col_nms]
    # Melt: one (MONTH, VALUE) row per original month column.
    mo_val_cols = sf.explode(
        sf.array([
            sf.struct(sf.lit(c).alias("MONTH"), sf.col(c).alias("VALUE"))
            for c in mo_col_nms
        ])).alias('MONTH_VALUE')
    df = df.select(keep_col_nms + [mo_val_cols]).select(
        keep_col_nms + ['MONTH_VALUE.MONTH', 'MONTH_VALUE.VALUE'])
    logger.info("Transposing month columns as rows done in %s.",
                time_it(then, time()))
    logger.info("Selecting with value columns ...")
    then = time()
    # Pivot: one column per EC_CL_MS_OS key; each row carries a value in
    # exactly one of these columns, so max() below is a lossless aggregate.
    val_cols = [
        sf.when(sf.col('EC_CL_MS_OS') == c,
                sf.col('VALUE')).otherwise(None).alias(c)
        for c in val_col_nms
    ]
    max_agg_cols = [sf.max(sf.col(c)).alias(c) for c in val_col_nms]
    df = df.select(sf.col('SCENARIO'), sf.col('MONTH'), *val_cols)
    logger.info("Selecting with value columns done in %s.",
                time_it(then, time()))
    logger.info("Aggregating value columns ...")
    then = time()
    df = df.groupBy('SCENARIO', 'MONTH').agg(*max_agg_cols)
    logger.info("Aggregating value columns done in %s.",
                time_it(then, time()))
    logger.info("Saving reshaped data to file ...")
    then = time()
    if os.path.exists(os.path.join(RMC.OUTPUT_DIR, 'tmp.csv')):
        rmtree(os.path.join(RMC.OUTPUT_DIR, 'tmp.csv'))
    df.write.csv(path=os.path.join(RMC.OUTPUT_DIR, 'tmp.csv'),
                 mode='overwrite',
                 header=True)
    if os.path.exists(
            os.path.join(RMC.OUTPUT_DIR,
                         RMC.PROPHET_INPUT_ALL_RESHAPED + '.csv')):
        rmtree(
            os.path.join(RMC.OUTPUT_DIR,
                         RMC.PROPHET_INPUT_ALL_RESHAPED + '.csv'))
    os.rename(src=os.path.join(RMC.OUTPUT_DIR, 'tmp.csv'),
              dst=os.path.join(RMC.OUTPUT_DIR,
                               RMC.PROPHET_INPUT_ALL_RESHAPED + '.csv'))
    logger.info("Saving reshaped data to file done in %s.",
                time_it(then, time()))
    logger.info("Transposing data done in %s.", time_it(overall, time()))
    return df

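# Minimal sketch of the melt-then-pivot pattern used in execute_transpose, on
# a hypothetical toy frame (names and values are illustrative only). Spark's
# built-in pivot() does in one step what the manual when()/otherwise() columns
# plus max() aggregation do above.
def _melt_pivot_demo(spark):
    import pyspark.sql.functions as sf
    df = spark.createDataFrame(
        [(1, 'A', 10.0, 11.0), (1, 'B', 20.0, 21.0)],
        ['SCENARIO', 'KEY', '201712', '201801'])
    mo_col_nms = ['201712', '201801']
    # Melt: one (MONTH, VALUE) row per month column.
    melted = df.select(
        'SCENARIO', 'KEY',
        sf.explode(sf.array([
            sf.struct(sf.lit(c).alias('MONTH'), sf.col(c).alias('VALUE'))
            for c in mo_col_nms
        ])).alias('MV')).select('SCENARIO', 'KEY', 'MV.MONTH', 'MV.VALUE')
    # Pivot: one column per KEY; max() is lossless since each
    # (SCENARIO, MONTH, KEY) combination appears exactly once.
    return melted.groupBy('SCENARIO', 'MONTH').pivot('KEY').max('VALUE')
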
def load_all_data(train_set, val_set, test_set, init):
    train_x = None
    train_y = None
    train_i = None
    val_x = None
    val_y = None
    val_i = None
    test_x = None
    test_y = None
    test_i = None
    if train_set:
        logger.info("Loading training data ...")
        if init:
            train_x = load_data(file_name=RMC.PROPHET_INPUT_ALL_NUMPY,
                                init=True)
            train_y = load_data(file_name=RMC.PROPHET_OUTPUT_ALL_NUMPY,
                                init=True)
        else:
            train_x = load_data(file_name=RMC.TRAIN_X_DATA_FILE)
            train_y = load_data(file_name=RMC.TRAIN_Y_DATA_FILE)
            train_i = load_data(file_name=RMC.TRAIN_I_DATA_FILE)
        logger.info("Loading training data done.")
    if val_set:
        logger.info("Loading prepared validation data ...")
        val_x = load_data(file_name=RMC.VAL_X_DATA_FILE)
        val_y = load_data(file_name=RMC.VAL_Y_DATA_FILE)
        val_i = load_data(file_name=RMC.VAL_I_DATA_FILE)
        logger.info("Loading prepared validation data done.")
    if test_set:
        logger.info("Loading prepared test data ...")
        test_x = load_data(file_name=RMC.TEST_X_DATA_FILE)
        test_y = load_data(file_name=RMC.TEST_Y_DATA_FILE)
        test_i = load_data(file_name=RMC.TEST_I_DATA_FILE)
        logger.info("Loading prepared test data done.")
    return train_x, train_y, train_i, val_x, val_y, val_i, test_x, test_y, test_i

def transform_output_to_numpy(spark):
    logger.info("Transforming prophet output data to numpy array ...")
    overall = time()
    logger.info("Reading output data file ...")
    then = time()
    df = spark.read.csv(path=os.path.join(
        RMC.OUTPUT_DIR, RMC.PROPHET_OUTPUT_ALL_PROPER + '.csv'),
                        header=True,
                        inferSchema=True,
                        sep=';')
    logger.info("Reading output data file done in %s.", time_it(then, time()))
    logger.info("Collecting data ...")
    then = time()
    df = df.orderBy('SCENARIO')
    df = df.drop('SCENARIO')
    data = df.collect()
    logger.info("Collecting data done in %s.", time_it(then, time()))
    logger.info("Creating and saving numpy array ...")
    then = time()
    data = np.array(data)
    np.save(
        os.path.join(RMC.OUTPUT_DIR, RMC.PROPHET_OUTPUT_ALL_NUMPY + '.npy'),
        data)
    logger.info("Creating and saving numpy array done in %s.",
                time_it(then, time()))
    logger.info("Transforming prophet output data to numpy array done in %s.",
                time_it(overall, time()))

def transform_input_to_numpy(spark):
    logger.info("Transforming prophet input data to numpy array ...")
    overall = time()
    logger.info("Reading reshaped data file ...")
    then = time()
    df = spark.read.csv(path=os.path.join(
        RMC.OUTPUT_DIR, RMC.PROPHET_INPUT_ALL_RESHAPED + '.csv'),
                        header=True,
                        inferSchema=True)
    logger.info("Reading reshaped data file done in %s.",
                time_it(then, time()))
    logger.info("Collecting data ...")
    then = time()
    # SCENARIO 2053's output is corrupted, so drop it here as well.
    df = df.where('SCENARIO != 2053')
    df = df.orderBy('SCENARIO', 'MONTH')
    df = df.drop('SCENARIO', 'MONTH')
    col_nms = df.columns
    import pyspark.sql.functions as sf
    df = df.withColumn('FEATURES', sf.array(col_nms))
    df = df.drop(*col_nms)
    data = df.collect()
    logger.info("Collecting data done in %s.", time_it(then, time()))
    logger.info("Creating and saving numpy array ...")
    then = time()
    data = np.array(data)
    # Reshape to (scenarios, 721 months, 78 features).
    data = data.reshape(-1, 78)
    data = data.reshape(-1, 721, 78)
    np.save(os.path.join(RMC.OUTPUT_DIR,
                         RMC.PROPHET_INPUT_ALL_NUMPY + '.npy'),
            data)
    logger.info("Creating and saving numpy array done in %s.",
                time_it(then, time()))
    logger.info("Transforming prophet input data to numpy array done in %s.",
                time_it(overall, time()))

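# Hedged sanity check (illustrative, not part of the pipeline): after the
# save above, the array on disk should load back with one row per scenario
# and the (721, 78) month-by-feature layout per scenario.
#
#   arr = np.load(os.path.join(RMC.OUTPUT_DIR,
#                              RMC.PROPHET_INPUT_ALL_NUMPY + '.npy'))
#   assert arr.ndim == 3 and arr.shape[1:] == (721, 78)
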
def main():
    overall = time()
    logger.info("Main script started ...")
    train = False
    test = False
    ada_boost = False
    fpp = None
    model = None
    for arg in sys.argv[1:]:
        if arg == 'train':
            train = True
        elif arg == 'test':
            test = True
        elif arg == 'ada_boost':
            ada_boost = True
    # No arguments: default to training.
    if not train and not test and not ada_boost:
        train = True
    train_x, train_y, train_i, val_x, val_y, val_i, test_x, test_y, test_i = load_all_data(
        train_set=train or ada_boost,
        val_set=train or ada_boost,
        test_set=test,
        init=False)
    model_file_name = '{0}_{1}_{2}_{3}'.format(MLC.TRN, MLC.MV, MLC.OV,
                                               MLC.DP)
    model_dir = os.path.join(RMC.OUTPUT_DIR, model_file_name)
    if test or (train and not MLC.OVERWRITE):
        fpp, model = mt.load_previous_model_if_available(
            model_dir, model_file_name)
    if train:
        fpp, model = execute_train(model_dir,
                                   model_file_name,
                                   fpp,
                                   model,
                                   start_epoch=MLC.START_EP,
                                   end_epoch=MLC.END_EP,
                                   train_x=train_x,
                                   train_y=train_y,
                                   train_i=train_i,
                                   val_x=val_x,
                                   val_y=val_y,
                                   val_i=val_i)
    if test:
        execute_test(fpp, model, test_x, test_y, test_i, model_dir,
                     model_file_name)
    if ada_boost:
        execute_ada_boost(model_dir,
                          model_file_name,
                          start_epoch=MLC.START_EP,
                          end_epoch=MLC.END_EP,
                          train_x=train_x,
                          train_y=train_y,
                          train_i=train_i,
                          val_x=val_x,
                          val_y=val_y,
                          val_i=val_i)
    logger.info("Main script finished in %s.", time_it(overall, time()))