def get_configs():
    """Define the command-line parameters for a batch configuration run.

    Registers flags for the CSV of configs to execute, whether to run
    predictions after training, the degree of parallelism (threads / GPUs),
    and the prediction date range.

    (The previous docstring claimed "hyper parameter search" — that text
    belongs to get_search_configs; corrected here.)

    Returns:
        configurations.ConfigValues: the parsed configuration values.
    """
    configs_define_string = configurations.DEFINE_string
    configurations.DEFINE_string("configs_fname", None, "CSV containing all the configs to run")
    configurations.DEFINE_boolean("predict", True, "Run predictions after training")
    # Typo fix in user-visible help text: "NUmber" -> "Number".
    configurations.DEFINE_integer("num_threads", 4, "Number of parallel threads (Number of parallel executions)")
    configurations.DEFINE_integer("num_gpu", 1, "Number of GPU on the machine, Use 0 if there are None")
    configurations.DEFINE_integer("sleep_time", 1, "Sleep time")
    configurations.DEFINE_integer("start_date", None, "First date for prediction on as YYYYMM")
    configurations.DEFINE_integer("end_date", None, "Last date for prediction on as YYYYMM")
    c = configurations.ConfigValues()
    return c
def main(_):
    """Apply a trained model to every data point in a test file.

    The model specified by command line arg --model_dir is applied to every
    data point in --test_datafile and one line per sequence is printed to
    stdout. The unix command 'paste' can be used to stitch the input file
    and output together. e.g.,

    $ classifiy_data.py --config=train.conf --test_datafile=test.dat > output.dat
    $ paste -d ' ' test.dat output.dat > input_and_output.dat
    """
    # Flags specific to this prediction script; shared flags come from configs.
    configs.DEFINE_string('test_datafile', None, 'file with test data')
    configs.DEFINE_string('time_field', 'date', 'fields used for dates/time')
    configs.DEFINE_string('print_start', '190001', 'only print data on or after')
    configs.DEFINE_string('print_end', '999912', 'only print data on or before')
    configs.DEFINE_integer('num_batches', None, 'num_batches overrride')
    config = configs.get_configs()
    # Fall back to the training datafile when no test file is given.
    if config.test_datafile is None:
        config.test_datafile = config.datafile
    # One sequence per batch so each printed line maps to one input record.
    batch_size = 1
    data_path = model_utils.get_data_path(config.data_dir, config.test_datafile)
    # print("Loading data %s"%data_path)
    dataset = BatchGenerator(data_path, config,
                             batch_size=batch_size,
                             num_unrollings=config.num_unrollings)
    num_data_points = dataset.num_batches
    # --num_batches caps the number of data points processed when set.
    if config.num_batches is not None:
        num_data_points = config.num_batches
    #print("num_batches = ", num_data_points)
    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)
    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        #print("Loading model.")
        model = model_utils.get_trained_model(session, config, verbose=False)
        for i in range(num_data_points):
            batch = dataset.next_batch()
            preds = model.step(session, batch)
            seq_len = get_seq_length(batch)
            # Only the final step of each sequence is scored/printed.
            key, date = get_key_and_date(batch, seq_len - 1)
            # Skip records outside the [print_start, print_end] date window.
            if (date < config.print_start or date > config.print_end):
                continue
            score = get_score(config, preds, seq_len - 1)
            target = get_target(config, batch, seq_len - 1)
            print("%s %s %.6f %.6f %d" % (key, date, score, target, seq_len))
def get_search_configs():
    """Register the hyper-parameter-search command-line options and return them."""
    # (definer, option name, default, help text) — registered in this exact order.
    option_specs = (
        (configurations.DEFINE_string, "template", None,
         "Template file for hyper-param search"),
        (configurations.DEFINE_string, "search_algorithm", "genetic",
         "Algorithm for hyper-param optimization. Select from 'genetic', 'grid_search'"),
        (configurations.DEFINE_integer, "generations", 100,
         "Number of generations for genetic algorithm"),
        (configurations.DEFINE_integer, "pop_size", 20,
         "Population size for genetic algorithm"),
        (configurations.DEFINE_integer, "num_survivors", 10,
         "Number of survivors for genetic algorithm"),
        (configurations.DEFINE_integer, "num_threads", 4,
         "NUmber of parallel threads (Number of parallel executions)"),
        (configurations.DEFINE_integer, "num_gpu", 1,
         "Number of GPU on the machine, Use 0 if there are None"),
        (configurations.DEFINE_integer, "sleep_time", 1, "Sleep time"),
        (configurations.DEFINE_float, "mutate_rate", 0.2,
         "Mutation rate for genetic algorithm"),
        (configurations.DEFINE_string, "init_pop", None,
         "Specify starting population. Path to the pickle file"),
    )
    for define_option, option_name, default_value, help_text in option_specs:
        define_option(option_name, default_value, help_text)
    return configurations.ConfigValues()
def get_configs():
    """ Defines all configuration params passable to command line.

    Registers every model/data/training flag on the shared `configs`
    registry, parses the command line into a ConfigValues object, then
    post-processes two derived values: the unrolling bounds and the
    optimizer parameter dictionary.

    Returns:
        configs.ConfigValues with all flags resolved.
    """
    # --- data source and field mapping ---
    configs.DEFINE_string("datasource", 'big_datafile', "The source of the data.")
    configs.DEFINE_string("tkrlist", "big_tkrlist.csv", "The list of filters to use.")
    configs.DEFINE_string("datafile", 'big_datafile.dat', "a datafile name.")
    configs.DEFINE_string("mse_outfile", None, "A file to write mse values during predict phase.")
    configs.DEFINE_string("default_gpu", '', "The default GPU to use e.g., /gpu:0")
    configs.DEFINE_string("nn_type", 'DeepRnnModel', "Model type")
    configs.DEFINE_string("active_field", 'active', "Key column name header for active indicator")
    configs.DEFINE_string("key_field", 'gvkey', "Key column name header in datafile")
    configs.DEFINE_string("target_field", 'oiadpq_ttm', "Target column name header in datafile")
    configs.DEFINE_string("scale_field", 'mrkcap', "Feature to scale inputs by")
    configs.DEFINE_string("feature_fields", '', "shared input and target field names")
    configs.DEFINE_string("aux_input_fields", None, "non-target, input only fields")
    configs.DEFINE_string("data_dir", '', "The data directory")
    configs.DEFINE_string("model_dir", '', "Model directory")
    # --- network architecture ---
    configs.DEFINE_string("rnn_cell", 'gru', "lstm or gru")
    configs.DEFINE_integer("num_inputs", -1, "")
    configs.DEFINE_integer("num_outputs", -1, "")
    configs.DEFINE_integer("target_idx", None, "")
    configs.DEFINE_integer("min_unrollings", None, "Min number of unrolling steps")
    configs.DEFINE_integer("max_unrollings", None, "Max number of unrolling steps")
    # num_unrollings is being deprecated by max_unrollings
    configs.DEFINE_integer("num_unrollings", 4, "Number of unrolling steps")
    configs.DEFINE_integer("stride", 12, "How many steps to skip per unrolling")
    configs.DEFINE_integer("forecast_n", 12, "How many steps to forecast into the future")
    configs.DEFINE_integer("batch_size", 1, "Size of each batch")
    configs.DEFINE_integer("num_layers", 1, "Numer of RNN layers")
    configs.DEFINE_integer("num_hidden", 10, "Number of hidden layer units")
    # --- training dynamics ---
    configs.DEFINE_float("init_scale", 0.1, "Initial scale for weights")
    configs.DEFINE_float("max_grad_norm", 10.0, "Gradient clipping")
    configs.DEFINE_integer("start_date", None, "First date to train on as YYYYMM")
    configs.DEFINE_integer("end_date", None, "Last date to train on as YYYYMM")
    configs.DEFINE_float("keep_prob", 1.0, "Keep probability for dropout")
    configs.DEFINE_boolean("train", True, "Train model otherwise inference only")
    configs.DEFINE_boolean("input_dropout", False, "Do dropout on input layer")
    configs.DEFINE_boolean("hidden_dropout", False, "Do dropout on hidden layers")
    configs.DEFINE_boolean("rnn_dropout", False, "Do dropout on recurrent connections")
    configs.DEFINE_boolean(
        "skip_connections", False, "Have direct connections between input and output in MLP")
    configs.DEFINE_boolean(
        "use_cache", True, "Load data for logreg from cache (vs processing from batch generator)")
    configs.DEFINE_boolean(
        "pretty_print_preds", False, "Print predictions in tabular format with inputs, targets, and keys")
    configs.DEFINE_boolean("scale_targets", True, "")
    # --- optimization ---
    configs.DEFINE_string("data_scaler", None, 'sklearn scaling algorithm or None if no scaling')
    configs.DEFINE_string("optimizer", 'GradientDescentOptimizer', 'Any tensorflow optimizer in tf.train')
    configs.DEFINE_string("optimizer_params", None, 'Additional optimizer params such as momentum')
    configs.DEFINE_float("learning_rate", 0.6, "The initial starting learning rate")
    configs.DEFINE_float("lr_decay", 0.9, "Learning rate decay")
    configs.DEFINE_float("validation_size", 0.0, "Size of validation set as %, ie. .3 = 30% of data")
    configs.DEFINE_float("passes", 1.0, "Passes through day per epoch")
    configs.DEFINE_float("target_lambda", 0.5, "How much to weight last step vs. all steps in loss")
    configs.DEFINE_float("rnn_lambda", 0.5, "How much to weight last step vs. all steps in loss")
    configs.DEFINE_integer("max_epoch", 0, "Stop after max_epochs")
    configs.DEFINE_integer("early_stop", None, "Early stop parameter")
    configs.DEFINE_integer("seed", None, "Seed for deterministic training")
    configs.DEFINE_integer("cache_id", None, "A unique experiment key for traking a cahce")

    c = configs.ConfigValues()

    # num_unrollings is the legacy default for both unrolling bounds.
    if c.min_unrollings is None:
        c.min_unrollings = c.num_unrollings
    if c.max_unrollings is None:
        c.max_unrollings = c.num_unrollings

    # optimizer_params is a string of the form "param1=value1,param2=value2,..."
    # this maps it to dictionary { param1 : value1, param2 : value2, ...}
    if c.optimizer_params is None:
        c.optimizer_params = dict()
    else:
        args_list = [p.split('=') for p in c.optimizer_params.split(',')]
        params = dict()
        for p in args_list:
            params[p[0]] = float(p[1])
        c.optimizer_params = params
    # learning_rate has its own flag; it must not also be passed here.
    assert ('learning_rate' not in c.optimizer_params)

    return c
def get_configs():
    """Define all command-line configuration params for the 'trial1' run.

    Registers every flag (help strings intentionally left blank in this
    variant), parses them into a ConfigValues object, then derives the
    unrolling bounds and the optimizer parameter dictionary.

    Returns:
        configs.ConfigValues with all flags resolved.
    """
    # --- data files and field mapping ---
    configs.DEFINE_string("name", 'trial1', "")
    configs.DEFINE_string("datafile", 'Group2-Dataset.csv', "")
    configs.DEFINE_string("predict_datafile", None, "")
    configs.DEFINE_string("mse_outfile", None, "")
    configs.DEFINE_string("scalesfile", None, "")
    configs.DEFINE_string("default_gpu", '/gpu:0', "")
    configs.DEFINE_string("nn_type", 'DeepRnnModel', "")
    configs.DEFINE_string("active_field", 'active', "")
    configs.DEFINE_string("date_field", 'date', "")
    configs.DEFINE_string("key_field", 'gvkey', "")
    configs.DEFINE_string("target_field", 'mkvaltq_ttm', "")
    configs.DEFINE_string("scale_field", 'mrkcap', "")
    configs.DEFINE_string("financial_fields", 'saleq_ttm-ltq_mrq', "")
    configs.DEFINE_string("aux_fields", 'mom3m-mom9m', "")
    configs.DEFINE_string("dont_scale", None, "")
    configs.DEFINE_string("data_dir", 'datasets', "")
    configs.DEFINE_string("model_dir", 'chkpts-wrds-rnn', "")
    # --- network architecture ---
    configs.DEFINE_string("rnn_cell", 'lstm', "")
    configs.DEFINE_string("activation_fn", 'relu', "")
    configs.DEFINE_integer("num_inputs", -1, "")
    configs.DEFINE_integer("num_outputs", -1, "")
    configs.DEFINE_integer("target_idx", None, "")
    configs.DEFINE_integer("min_unrollings", 5, "")
    configs.DEFINE_integer("max_unrollings", 5, "")
    configs.DEFINE_integer("min_years", None, "")
    configs.DEFINE_integer("max_years", None, "")
    configs.DEFINE_integer("pls_years", None, "")
    configs.DEFINE_integer("num_unrollings", 5, "")
    configs.DEFINE_integer("stride", 12, "")
    configs.DEFINE_integer("forecast_n", 3, "")
    configs.DEFINE_integer("batch_size", 128, "")
    configs.DEFINE_integer("num_layers", 5, "")
    configs.DEFINE_integer("num_hidden", 128, "")
    # --- training dynamics ---
    configs.DEFINE_float("training_noise", None, "")
    configs.DEFINE_float("init_scale", 0.01, "")
    configs.DEFINE_float("max_grad_norm", 10.0, "")
    configs.DEFINE_integer("start_date", None, "")
    configs.DEFINE_integer("end_date", None, "")
    configs.DEFINE_integer("split_date", None, "")
    configs.DEFINE_float("keep_prob", 0.75, "")
    configs.DEFINE_boolean("train", False, "")
    configs.DEFINE_boolean("require_targets", False, "")
    configs.DEFINE_boolean("input_dropout", False, "")
    configs.DEFINE_boolean("hidden_dropout", False, "")
    configs.DEFINE_boolean("rnn_dropout", True, "")
    configs.DEFINE_boolean("skip_connections", False, "")
    configs.DEFINE_boolean("direct_connections", False, "")
    configs.DEFINE_boolean("use_cache", True, "")
    configs.DEFINE_boolean("pretty_print_preds", True, "")
    configs.DEFINE_boolean("scale_targets", True, "")
    configs.DEFINE_boolean("backfill", False, "")
    configs.DEFINE_boolean("log_squasher", True, "")
    configs.DEFINE_boolean("ts_smoother", False, "")
    # --- optimization ---
    configs.DEFINE_string("data_scaler", 'RobustScaler', '')
    configs.DEFINE_string("optimizer", 'AdadeltaOptimizer', '')
    configs.DEFINE_string("optimizer_params", None, '')
    configs.DEFINE_float("learning_rate", 0.6, "")
    configs.DEFINE_float("lr_decay", 0.95, "")
    configs.DEFINE_float("validation_size", 0.3, "")
    configs.DEFINE_float("train_until", 0.0, "")
    configs.DEFINE_float("passes", 0.2, "")
    configs.DEFINE_float("target_lambda", 0.8, "")
    configs.DEFINE_float("rnn_lambda", 0.2, "")
    configs.DEFINE_float("l2_alpha", 0.0, "")
    configs.DEFINE_integer("max_epoch", 1000, "")
    configs.DEFINE_integer("early_stop", 10, "")
    configs.DEFINE_integer("seed", 100, "")
    configs.DEFINE_integer("cache_id", 100, "")
    configs.DEFINE_string("output_file", "mkvaltq_2016.csv", "")

    c = configs.ConfigValues()

    # num_unrollings is the legacy default for both unrolling bounds.
    if c.min_unrollings is None:
        c.min_unrollings = c.num_unrollings
    if c.max_unrollings is None:
        c.max_unrollings = c.num_unrollings
    # The *_years flags override, converted to steps via 12 // stride.
    if c.min_years is not None:
        c.min_unrollings = c.min_years * (12 // c.stride)
        if c.max_years is not None:
            c.max_unrollings = (c.max_years) * (12 // c.stride)
        elif c.pls_years is None:
            c.max_unrollings = c.min_unrollings
        else:
            c.max_unrollings = (c.min_years + c.pls_years) * (12 // c.stride)

    # optimizer_params is a string of the form "param1=value1,param2=value2,..."
    # this maps it to dictionary { param1 : value1, param2 : value2, ...}
    if c.optimizer_params is None:
        c.optimizer_params = dict()
    else:
        args_list = [p.split('=') for p in c.optimizer_params.split(',')]
        params = dict()
        for p in args_list:
            params[p[0]] = float(p[1])
        c.optimizer_params = params
    # learning_rate has its own flag; it must not also be passed here.
    assert ('learning_rate' not in c.optimizer_params)

    return c
def get_configs():
    """ Defines all configuration params passable to command line.

    This variant supports hyper-parameter optimization: DEFINE_list_*
    flags accept lists of candidate values for the search. After parsing,
    the data directory is anchored to the module-level base path and the
    forecast step weights string is converted to a list of floats.

    Returns:
        configs.ConfigValues with all flags resolved.
    """
    # --- data files and field mapping ---
    configs.DEFINE_string("name", 'hpo-test', "A name for the config.")
    configs.DEFINE_string("datafile", 'source-ml-data-v8-100M.dat', "a datafile name.")
    configs.DEFINE_string(
        "predict_datafile", None,
        "If predict_datafile is not None, use it instead of datafile for predictions"
    )
    configs.DEFINE_string("mse_outfile", None, "A file to write mse values during predict phase.")
    configs.DEFINE_string("scalesfile", None, "Optional file for storing scaling params")
    configs.DEFINE_string("default_gpu", '/gpu:0', "The default GPU to use e.g., /gpu:0")
    configs.DEFINE_string("nn_type", 'RNNPointEstimate', "Model type")
    configs.DEFINE_string("active_field", 'active', "Key column name header for active indicator")
    configs.DEFINE_string("date_field", 'date', "Name of data column.")
    configs.DEFINE_string("key_field", 'gvkey', "Key column name header in datafile")
    configs.DEFINE_string("target_field", 'oiadpq_ttm', "Target column name header in datafile")
    configs.DEFINE_string("scale_field", 'mrkcap', "Feature to scale inputs by")
    configs.DEFINE_string("financial_fields", 'saleq_ttm-ltq_mrq', "Shared input and target field names")
    configs.DEFINE_string("aux_fields", 'rel_mom1m-rel_mom9m', "non-target, input only fields")
    configs.DEFINE_string("dont_scale", None, "Names of fields to not scale")
    configs.DEFINE_string("data_dir", 'datasets', "The data directory")
    configs.DEFINE_string("model_dir", 'test-model', "Model directory")
    configs.DEFINE_string("experiments_dir", './', "Experiments directory")
    # --- network architecture (list flags are searchable) ---
    configs.DEFINE_list_string("rnn_cell", 'lstm', "lstm or gru")
    configs.DEFINE_list_string("activation_fn", 'relu', "MLP activation function in tf.nn.*")
    configs.DEFINE_integer("num_inputs", -1, "")
    configs.DEFINE_integer("num_outputs", -1, "")
    configs.DEFINE_integer("target_idx", None, "")
    configs.DEFINE_list_integer("min_unrollings", 5, "Min number of unrolling steps")
    configs.DEFINE_list_integer("max_unrollings", 5, "Max number of unrolling steps")
    configs.DEFINE_list_integer("min_years", None, "Alt to min_unrollings")
    configs.DEFINE_list_integer("max_years", None, "Alt to max_unrollings")
    configs.DEFINE_integer("pls_years", None, "Alt to max_years. max_years = min_year+pls_years")
    configs.DEFINE_list_integer("stride", 12, "How many steps to skip per unrolling")
    configs.DEFINE_list_integer("batch_size", 256, "Size of each batch")
    configs.DEFINE_list_integer("num_layers", 2, "Numer of RNN layers")
    configs.DEFINE_integer("forecast_n", 12, "How many steps to forecast into the future")
    configs.DEFINE_list_integer("num_hidden", 64, "Number of hidden layer units")
    # --- training dynamics ---
    configs.DEFINE_list_float("init_scale", 1.0, "Initial scale for weights")
    configs.DEFINE_list_float("max_grad_norm", 50.0, "Gradient clipping")
    configs.DEFINE_integer("start_date", 197501, "First date to train on as YYYYMM")
    configs.DEFINE_integer("end_date", 199812, "Last date to train on as YYYYMM")
    configs.DEFINE_integer("split_date", None, "Date to split train/test on.")
    configs.DEFINE_boolean("train", True, "Train model otherwise inference only")
    configs.DEFINE_list_float("dropout", 0.0, "Dropout rate for hidden layers")
    configs.DEFINE_list_float("recurrent_dropout", 0.3, "Dropout rate for recurrent connections")
    configs.DEFINE_boolean(
        "log_squasher", True, "Squash large normalized inputs with natural log function")
    configs.DEFINE_list_string(
        "data_scaler", 'RobustScaler', 'sklearn scaling algorithm or None if no scaling')
    configs.DEFINE_list_string("optimizer", 'Adadelta', 'Any tensorflow optimizer in tf.train')
    configs.DEFINE_list_float("learning_rate", 0.6, "The initial starting learning rate")
    configs.DEFINE_list_float("lr_decay", 0.96, "Learning rate decay")
    configs.DEFINE_float("validation_size", 0.3, "Size of validation set as %, ie. 0.3 = 30% of data")
    configs.DEFINE_list_float(
        "target_lambda", 0.5, "How much to weight last step vs. all steps in loss")
    configs.DEFINE_list_float(
        "rnn_lambda", 0.7, "How much to weight last step vs. all steps in loss")
    configs.DEFINE_integer("max_epoch", 35, "Stop after max_epochs")
    configs.DEFINE_integer("early_stop", 15, "Early stop parameter")
    configs.DEFINE_integer("seed", 521, "Seed for deterministic training")
    configs.DEFINE_boolean("UQ", False, "Uncertainty Quantification Mode")
    configs.DEFINE_list_float("l2_alpha", 0.0, "L2 regularization for weight parameters.")
    configs.DEFINE_float("recurrent_l2_alpha", 0.0, "L2 regularization for recurrent weight parameters.")
    configs.DEFINE_list_boolean("huber_loss", False, "Use huber loss instead of mse")
    configs.DEFINE_list_float("huber_delta", 1.0, "delta for huber loss")
    configs.DEFINE_integer("forecast_steps", 1, "How many future predictions need to me made")
    configs.DEFINE_string('forecast_steps_weights', '1.0', 'weights for the forecast steps')
    configs.DEFINE_integer(
        "logging_interval", 100, "Number of batches for logging interval during training")
    configs.DEFINE_boolean("write_inp_to_out_file", True, "Write input sequence to the output files")
    configs.DEFINE_string(
        "training_type", 'fixed_dates', 'Choose between "fixed_dates" and "iterative" training')
    configs.DEFINE_integer("member_id", 1, "Id of member in a population")
    configs.DEFINE_boolean('load_saved_weights', False, 'Load weights saved in the checkpoint directory')
    configs.DEFINE_integer(
        "epoch_logging_interval", 1, "Number of batches for logging interval during training")
    configs.DEFINE_string('preds_fname', 'preds.dat', 'Name of the prediction file')
    configs.DEFINE_integer("num_procs", 1, "Total number of training/prediction processes")
    # HPO related params
    configs.DEFINE_integer("NPE", 1, "Number of Parallel Executions")
    configs.DEFINE_string(
        "search_algorithm", "genetic",
        "Algorithm for hyper-param optimization. Select from 'genetic', 'grid_search', 'doe' "
    )
    configs.DEFINE_integer("generations", 5, "Number of generations for genetic algorithm")
    configs.DEFINE_integer("pop_size", 16, "Population size for genetic algorithm")
    configs.DEFINE_integer(
        "num_gpu", 1, "Number of GPU on the machine, Use 0 if there are None")
    configs.DEFINE_float("mutate_rate", 0.2, "Mutation rate for genetic algorithm")
    configs.DEFINE_string("objective", 'mse', "Select between mse or uq_loss")
    configs.DEFINE_string("init_pop", None, "Initial population to begin hyper param search")
    configs.DEFINE_boolean("save_latest_pop", False, "Save the latest population")
    configs.DEFINE_string('doe_file', None, 'Design of experiments csv file')
    # --- learning rate schedule / initialization ---
    configs.DEFINE_integer("decay_steps", 100000, "Number of training steps between decay steps")
    configs.DEFINE_string("initializer", 'GlorotUniform', 'variable initializers available in Keras')
    configs.DEFINE_boolean(
        "use_custom_init", True, 'Use RandomUniform initializer with init_scale values')
    configs.DEFINE_boolean(
        "aux_masking", False, 'Mask aux features of all time steps except the last one with 0')
    configs.DEFINE_integer("max_norm", None, "Max Norm for kernel constraint")
    configs.DEFINE_float("sgd_momentum", 0.0, "momentum for SGD optimizer")
    configs.DEFINE_float("end_learning_rate", 0.01, "end lr for polynomial decay")
    configs.DEFINE_float(
        'decay_power', 0.5, 'power to decay the learning rate with for polynomial decay')
    configs.DEFINE_string('piecewise_lr_boundaries', None, 'boundaries for piecewise constant lr')
    configs.DEFINE_string('piecewise_lr_values', None, 'values for piecewise constant lr')
    configs.DEFINE_string('lr_schedule', 'ExponentialDecay', 'Learning rate scheduler')

    c = configs.ConfigValues()

    # Anchor the data directory to the module-level base path.
    c.data_dir = os.path.join(_data_dir_path, c.data_dir)
    # Parse the comma-separated weight string into a list of floats.
    c.forecast_steps_weights = [
        float(x) for x in c.forecast_steps_weights.split(',')
    ]
    return c
import model_utils
from model_utils import get_tabular_data
import configs
from tensorflow.python.platform import gfile
from batch_generator import BatchGenerator
"""
Entry point and main loop for train_net.py. Uses command line arguments to
get model and training specification (see config.py).
"""
# Training-specific flags; model/data flags come from configs.get_configs().
configs.DEFINE_string("train_datafile", None, "Training file")
configs.DEFINE_float("lr_decay", 0.9, "Learning rate decay")
configs.DEFINE_float("initial_learning_rate", 1.0, "Initial learning rate")
configs.DEFINE_float("validation_size", 0.0, "Size of validation set as %")
configs.DEFINE_integer("passes", 1, "Passes through day per epoch")
configs.DEFINE_integer("max_epoch", 0, "Stop after max_epochs")
configs.DEFINE_integer("early_stop", None, "Early stop parameter")
configs.DEFINE_integer("seed", None, "Seed for deterministic training")

config = configs.get_configs()
# Prefer the dedicated training file; fall back to the generic datafile flag.
datafile = config.train_datafile if config.train_datafile else config.datafile
train_path = model_utils.get_data_path(config.data_dir, datafile)
# NOTE(review): `os` is used below but no `import os` is visible in this
# chunk — presumably imported elsewhere in the file; verify.
cache_path = os.path.splitext(train_path)[0] + '.cache'
print("Loading training data ...")
end_date = config.end_date
def get_configs():
    """ Defines all configuration params passable to command line.

    Registers every training/prediction flag (including the uncertainty
    quantification options), parses the command line into a ConfigValues
    object, then derives the unrolling bounds and the optimizer parameter
    dictionary.

    Returns:
        configs.ConfigValues with all flags resolved.
    """
    # --- data files and field mapping ---
    configs.DEFINE_string("name", 'none', "A name for the config.")
    configs.DEFINE_string("datafile", 'open_dataset.dat', "a datafile name.")
    configs.DEFINE_string(
        "predict_datafile", None,
        "If predict_datafile is not None, use it instead of datafile for predictions"
    )
    configs.DEFINE_string("mse_outfile", None, "A file to write mse values during predict phase.")
    configs.DEFINE_string("scalesfile", None, "Optional file for storing scaling params")
    configs.DEFINE_string(
        "mse_var_outfile", None, "A file to write mse_var values during predict phase.")
    configs.DEFINE_string("default_gpu", '', "The default GPU to use e.g., /gpu:0")
    configs.DEFINE_string("nn_type", 'DeepRnnModel', "Model type")
    configs.DEFINE_string("active_field", 'active', "Key column name header for active indicator")
    configs.DEFINE_string("date_field", 'date', "Name of data column.")
    configs.DEFINE_string("key_field", 'gvkey', "Key column name header in datafile")
    configs.DEFINE_string("target_field", 'oiadpq_ttm', "Target column name header in datafile")
    configs.DEFINE_string("scale_field", 'mrkcap', "Feature to scale inputs by")
    configs.DEFINE_string("financial_fields", '', "Shared input and target field names")
    configs.DEFINE_string("aux_fields", None, "non-target, input only fields")
    configs.DEFINE_string("dont_scale", None, "Names of fields to not scale")
    configs.DEFINE_string("data_dir", '', "The data directory")
    configs.DEFINE_string("model_dir", 'chkpts', "Model (checkpoint) directory")
    # --- network architecture ---
    configs.DEFINE_string("rnn_cell", 'gru', "lstm or gru")
    configs.DEFINE_string("activation_fn", 'relu', "MLP activation function in tf.nn.*")
    configs.DEFINE_integer("num_inputs", -1, "")
    configs.DEFINE_integer("num_outputs", -1, "")
    configs.DEFINE_integer("target_idx", None, "")
    configs.DEFINE_integer("min_unrollings", None, "Min number of unrolling steps")
    configs.DEFINE_integer("max_unrollings", None, "Max number of unrolling steps")
    configs.DEFINE_integer("min_years", None, "Alt to min_unrollings")
    configs.DEFINE_integer("max_years", None, "Alt to max_unrollings")
    configs.DEFINE_integer("pls_years", None, "Alt to max_years. max_years = min_year+pls_years")
    # num_unrollings is being deprecated, replaced with max_unrollings
    configs.DEFINE_integer("num_unrollings", 4, "Number of unrolling steps")
    configs.DEFINE_integer("stride", 12, "How many steps to skip per unrolling")
    configs.DEFINE_integer("forecast_n", 12, "How many steps to forecast into the future")
    configs.DEFINE_integer("batch_size", 1, "Size of each batch")
    configs.DEFINE_integer("num_layers", 1, "Numer of RNN layers")
    configs.DEFINE_integer("num_hidden", 10, "Number of hidden layer units")
    # --- training dynamics ---
    configs.DEFINE_float("training_noise", None, "Level of training noise as multiple of 1-stdev")
    configs.DEFINE_float("init_scale", 0.1, "Initial scale for weights")
    configs.DEFINE_float("max_grad_norm", 10.0, "Gradient clipping")
    configs.DEFINE_integer("start_date", None, "First date to train on as YYYYMM")
    configs.DEFINE_integer("end_date", None, "Last date to train on as YYYYMM")
    configs.DEFINE_integer("split_date", None, "Date to split train/test on.")
    configs.DEFINE_float("keep_prob", 1.0, "Keep probability for dropout")
    configs.DEFINE_boolean("train", True, "Train model otherwise inference only")
    configs.DEFINE_boolean("require_targets", False, "Require target values for test predictions")
    configs.DEFINE_boolean("input_dropout", False, "Do dropout on input layer")
    configs.DEFINE_boolean("hidden_dropout", False, "Do dropout on hidden layers")
    configs.DEFINE_boolean("rnn_dropout", False, "Do dropout on recurrent connections")
    configs.DEFINE_boolean(
        "skip_connections", False, "Have a linear fully connected weight skip hidden units in MLP")
    configs.DEFINE_boolean(
        "direct_connections", False, "Have direct connections between input and output in MLP")
    configs.DEFINE_boolean(
        "use_cache", True, "Load data for logreg from cache (vs processing from batch generator)")
    configs.DEFINE_boolean(
        "pretty_print_preds", False, "Print predictions in tabular format with inputs, targets, and keys")
    configs.DEFINE_boolean(
        "print_preds", False, "Print predictions with just date, gvkey and output values")
    configs.DEFINE_string(
        "df_dirname", None,
        "Saves dataframes for target, output, variance/variance, mse and mse_var in df_dirname"
    )
    configs.DEFINE_boolean("scale_targets", True, "")
    configs.DEFINE_boolean(
        "backfill", False, "Backfill seq history to max_unrollings with data in first time step")
    configs.DEFINE_boolean(
        "log_squasher", True, "Squash large normalized inputs with natural log function")
    configs.DEFINE_boolean("ts_smoother", False, "Use smoother on data time series during training")
    # --- optimization ---
    configs.DEFINE_string("data_scaler", None, 'sklearn scaling algorithm or None if no scaling')
    configs.DEFINE_string("optimizer", 'GradientDescentOptimizer', 'Any tensorflow optimizer in tf.train')
    configs.DEFINE_string("optimizer_params", None, 'Additional optimizer params such as momentum')
    configs.DEFINE_float("learning_rate", 0.6, "The initial starting learning rate")
    configs.DEFINE_float("lr_decay", 0.9, "Learning rate decay")
    configs.DEFINE_float("validation_size", 0.0, "Size of validation set as %, ie. .3 = 30% of data")
    configs.DEFINE_float("train_until", 0.0, "Train until validation MSE is less than this value")
    configs.DEFINE_float("passes", 1.0, "Passes through day per epoch")
    configs.DEFINE_float("target_lambda", 0.5, "How much to weight last step vs. all steps in loss")
    configs.DEFINE_float("rnn_lambda", 0.5, "How much to weight last step vs. all steps in loss")
    configs.DEFINE_integer("max_epoch", 0, "Stop after max_epochs")
    configs.DEFINE_integer("early_stop", None, "Early stop parameter")
    configs.DEFINE_integer("seed", None, "Seed for deterministic training")
    configs.DEFINE_integer("cache_id", None, "A unique experiment key for traking a cahce")
    # --- uncertainty quantification ---
    configs.DEFINE_float("keep_prob_pred", 1.0, "Keep Prob for dropout during prediction")
    configs.DEFINE_boolean(
        "print_normalized_outputs", False, "Print normalized outputs. Doesn't apply to pretty print")
    configs.DEFINE_boolean("UQ", False, "Uncertainty Quantification Mode")
    configs.DEFINE_string("UQ_model_type", 'MVE', "Select between MVE or PIE")
    configs.DEFINE_float(
        "noise_lambda", 1.0,
        "Weight decay for noise in the loss function. Refer to DeepBayesUQ Model"
    )
    configs.DEFINE_float("l2_alpha", 0.0, "L2 regularization for weight parameters.")
    configs.DEFINE_float("picp_lambda", 1.0, "Contribution of PICP loss term for HQPI UQ model")
    configs.DEFINE_float(
        "smoothing_pi_check", 100, "Smoothing parameter for calculation of PI check in HQPI UQ model")
    configs.DEFINE_float(
        "confidence_alpha", 0.1, "Alpha used for calculating confidence level (= 1 - alpha)")
    configs.DEFINE_boolean("huber_loss", False, "Use huber loss instead of mse")
    configs.DEFINE_float("huber_delta", 1.0, "delta for huber loss")

    c = configs.ConfigValues()

    # num_unrollings is the legacy default for both unrolling bounds.
    if c.min_unrollings is None:
        c.min_unrollings = c.num_unrollings
    if c.max_unrollings is None:
        c.max_unrollings = c.num_unrollings
    # The *_years flags override, converted to steps via 12 // stride.
    if c.min_years is not None:
        c.min_unrollings = c.min_years * (12 // c.stride)
        if c.max_years is not None:
            c.max_unrollings = (c.max_years) * (12 // c.stride)
        elif c.pls_years is None:
            c.max_unrollings = c.min_unrollings
        else:
            c.max_unrollings = (c.min_years + c.pls_years) * (12 // c.stride)

    # optimizer_params is a string of the form "param1=value1,param2=value2,..."
    # this maps it to dictionary { param1 : value1, param2 : value2, ...}
    if c.optimizer_params is None:
        c.optimizer_params = dict()
    else:
        args_list = [p.split('=') for p in c.optimizer_params.split(',')]
        params = dict()
        for p in args_list:
            params[p[0]] = float(p[1])
        c.optimizer_params = params
    # learning_rate has its own flag; it must not also be passed here.
    assert ('learning_rate' not in c.optimizer_params)

    return c
def main(_):
    """Apply a trained classifier to every data point in a test file.

    The model specified by the command line arg --model_dir is applied to
    every data point in --test_datafile and the model output is sent to
    --output. The unix command 'paste' can be used to stitch the input file
    and output together. e.g.,

    $ classifiy_data.py --config=train.conf --test_datafile=test.dat --output=output.dat
    $ paste -d ' ' test.dat output.dat > input_and_output.dat
    """
    configs.DEFINE_string('test_datafile', None, 'file with test data')
    configs.DEFINE_string('output', 'preds.dat', 'file for predictions')
    configs.DEFINE_string('time_field', 'date', 'fields used for dates/time')
    configs.DEFINE_string('print_start', '190001', 'only print data on or after')
    configs.DEFINE_string('print_end', '210012', 'only print data on or before')
    configs.DEFINE_integer('min_test_k', 1, 'minimum seq length classified')
    configs.DEFINE_integer('num_batches', None, 'num_batches overrride')
    config = configs.get_configs()
    # Fall back to the training datafile when no test file is given.
    if config.test_datafile is None:
        config.test_datafile = config.datafile
    batch_size = 1
    data_path = model_utils.get_data_path(config.data_dir, config.test_datafile)
    print("Loading data %s" % data_path)
    dataset = BatchGenerator(data_path, config,
                             batch_size=batch_size,
                             num_unrollings=config.num_unrollings)
    num_data_points = dataset.num_batches
    if config.num_batches is not None:
        num_data_points = config.num_batches
    print("num_batches = ", num_data_points)
    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)
    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        print("Loading model.")
        model = model_utils.get_trained_model(session, config)
        # Per-date confusion-matrix stats; the 'ALL' key aggregates every date.
        stats = dict()
        key = 'ALL'
        stats[key] = list()
        with open(config.output, "w") as outfile:
            for i in range(num_data_points):
                batch = dataset.next_batch()
                preds = model.step(session, batch)
                seq_len = get_seq_length(batch)
                start = seq_len - 1
                if seq_len < config.num_unrollings:
                    continue
                #if config.nn_type != 'rnn' and seq_len < config.num_unrollings:
                #    continue
                #elif config.nn_type == 'rnn' and classify_entire_seq(batch):
                #    start = config.min_test_k - 1
                # BUGFIX: the inner loop previously reused `i`, shadowing the
                # outer batch index; renamed to `step` to avoid the shadowing.
                for step in range(start, seq_len):
                    key, date = get_key_and_date(batch, step)
                    # Skip records outside the [print_start, print_end] window.
                    if (date < config.print_start or date > config.print_end):
                        continue
                    prob = get_pos_prob(config, preds, step)
                    target = get_target(batch, step)
                    outfile.write("%s %s "
                                  "%.4f %.4f %d %d\n" %
                                  (key, date, 1.0 - prob, prob, target, step + 1))
                    # Threshold the positive-class probability at 0.5.
                    pred = +1.0 if prob >= 0.5 else 0.0
                    error = 0.0 if (pred == target) else 1.0
                    tpos = 1.0 if (pred == 1 and target == 1) else 0.0
                    tneg = 1.0 if (pred == 0 and target == 0) else 0.0
                    fpos = 1.0 if (pred == 1 and target == 0) else 0.0
                    fneg = 1.0 if (pred == 0 and target == 1) else 0.0
                    # print("pred=%.2f target=%.2f tp=%d tn=%d fp=%d fn=%d"%(pred,target,tp,tn,fp,fn))
                    curstat = {
                        'error': error,
                        'tpos': tpos,
                        'tneg': tneg,
                        'fpos': fpos,
                        'fneg': fneg
                    }
                    if date not in stats:
                        stats[date] = list()
                    stats[date].append(curstat)
                    stats['ALL'].append(curstat)
    print_summary_stats(stats)
filename: the data file name Returns: If DNN_QUANT_ROOT is defined, the fully qualified data path is returned Otherwise a path relative to the working directory is returned """ path = os.path.join(data_dir, filename) # path = data_dir + '/' + filename if data_dir != '.' and 'DNN_QUANT_ROOT' in os.environ: # path = os.environ['DNN_QUANT_ROOT'] + '/' + path path = os.path.join(os.environ['DNN_QUANT_ROOT'], path) return path configs.DEFINE_string("train_datafile", None, "Training file") configs.DEFINE_float("validation_size", 0.0, "Size of validation set as %") configs.DEFINE_integer("seed", None, "Seed for deterministic training") configs.DEFINE_float("rnn_loss_weight", None, "How much moret to weight kth example") config = configs.get_configs() if config.train_datafile is None: config.train_datafile = config.datafile train_path = get_data_path(config.data_dir, config.train_datafile) print("Loading batched data ...") batches = BatchGenerator(train_path, config, config.batch_size, config.num_unrollings,
def main(_):
    """Score a pre-computed factor column from --test_datafile as if it
    were a model prediction, writing per-row output to --output and a
    confusion-stat summary to stdout.
    """
    configs.DEFINE_string('test_datafile', None, 'file with test data')
    configs.DEFINE_string('output', 'preds.dat', 'file for predictions')
    configs.DEFINE_string('time_field', 'date', 'fields used for dates/time')
    configs.DEFINE_string('print_start', '190001', 'only print data on or after')
    configs.DEFINE_string('print_end', '210012', 'only print data on or before')
    configs.DEFINE_string('factor_name', None, 'Name of factor if nn_type=factor')
    configs.DEFINE_integer('min_test_k', 1, 'minimum seq length classified')
    configs.DEFINE_integer('num_batches', None, 'num_batches overrride')

    config = configs.get_configs()

    factor_name = config.factor_name
    # BUG FIX: was `assert (factor_name is not None)` — asserts are
    # stripped under `python -O`, so validate explicitly.
    if factor_name is None:
        raise ValueError("--factor_name must be specified")

    if config.test_datafile is None:
        config.test_datafile = config.datafile

    # Unused locals removed: batch_size, num_unrollings, params, and the
    # redundant `filename = data_path` alias.
    data_path = model_utils.get_data_path(config.data_dir, config.test_datafile)
    print("Loading data %s" % data_path)
    if not os.path.isfile(data_path):
        raise RuntimeError("The data file %s does not exist" % data_path)

    # Keys and dates are read as strings so lexicographic YYYYMM
    # comparisons below behave as date comparisons.
    data = pd.read_csv(data_path, sep=' ',
                       dtype={config.key_field: str, 'date': str})
    if config.end_date is not None:
        data = data.drop(data[data['date'] > str(config.end_date)].index)

    num_data_points = len(data)
    print("num data points = ", num_data_points)

    # Per-date lists of confusion-stat records; 'ALL' aggregates all dates.
    stats = dict()
    stats['ALL'] = list()

    with open(config.output, "w") as outfile:
        last_key = ''
        seq_len = 0
        for i in range(num_data_points):
            key = get_value(data, config.key_field, i)
            date = get_value(data, 'date', i)
            # Length of the current key's contiguous run of rows.
            seq_len = seq_len + 1 if key == last_key else 1
            last_key = key
            if (str(date) < config.print_start or str(date) > config.print_end):
                continue
            if seq_len < config.min_test_k:
                continue
            prob = get_value(data, factor_name, i)
            out = get_value(data, config.target_field, i)
            # NOTE(review): maps a -1/+1 outcome to a {0,1} target —
            # confirm the target column is encoded as -1/+1 upstream.
            target = (out + 1.0) / 2.0
            k = min(seq_len, config.num_unrollings)
            outfile.write("%s %s "
                          "%.4f %.4f %d %d\n" %
                          (key, date, 1.0 - prob, prob, target, k))
            # Binarize at 0.5 and accumulate a one-hot confusion record.
            pred = +1.0 if prob >= 0.5 else 0.0
            error = 0.0 if (pred == target) else 1.0
            tpos = 1.0 if (pred == 1 and target == 1) else 0.0
            tneg = 1.0 if (pred == 0 and target == 0) else 0.0
            fpos = 1.0 if (pred == 1 and target == 0) else 0.0
            fneg = 1.0 if (pred == 0 and target == 1) else 0.0
            rec = {
                'error': error,
                'tpos': tpos,
                'tneg': tneg,
                'fpos': fpos,
                'fneg': fneg
            }
            if date not in stats:
                stats[date] = list()
            stats[date].append(rec)
            stats['ALL'].append(rec)

    print_summary_stats(stats)
def get_configs():
    """ Defines all configuration params passable to command line. """
    # --- Names, files, and directories ---
    configs.DEFINE_string("name", 'test', "A name for the config.")
    configs.DEFINE_string("datafile", None, "a datafile name.")
    configs.DEFINE_string("scalesfile", None, "Optional file for storing scaling params")
    configs.DEFINE_string("default_gpu", '/gpu:0', "The default GPU to use e.g., /gpu:0")
    configs.DEFINE_string("nn_type", 'RNNPointEstimate', "Model type")
    # --- Data column / field names ---
    configs.DEFINE_string("active_field", 'active', "Key column name header for active indicator")
    configs.DEFINE_string("date_field", 'date', "Name of data column.")
    configs.DEFINE_string("key_field", 'gvkey', "Key column name header in datafile")
    configs.DEFINE_string("target_field", 'oiadpq_ttm', "Target column name header in datafile")
    configs.DEFINE_string("scale_field", 'mrkcap', "Feature to scale inputs by")
    configs.DEFINE_string("financial_fields", 'saleq_ttm-ltq_mrq', "Shared input and target field names")
    configs.DEFINE_string("aux_fields", 'rel_mom1m-rel_mom9m', "non-target, input only fields")
    configs.DEFINE_string("dont_scale_fields", None, "Names of fields to not scale")
    configs.DEFINE_string("data_dir", 'datasets', "The data directory")
    configs.DEFINE_string("model_dir", 'test-model', "Model directory")
    configs.DEFINE_string("experiments_dir", './', "Experiments directory")
    # --- Network architecture ---
    configs.DEFINE_string("rnn_cell", 'lstm', "lstm or gru")
    configs.DEFINE_string("activation_fn", 'relu', "MLP activation function in tf.nn.*")
    configs.DEFINE_integer("num_inputs", -1, "")
    configs.DEFINE_integer("num_outputs", -1, "")
    configs.DEFINE_integer("target_idx", None, "")
    # --- Sequence unrolling: either set *_unrollings directly, or derive
    # them from *_years (converted below at 12 // stride steps per year).
    configs.DEFINE_integer("min_unrollings", 5, "Min number of unrolling steps")
    configs.DEFINE_integer("max_unrollings", 5, "Max number of unrolling steps")
    configs.DEFINE_integer("min_years", None, "Alt to min_unrollings")
    configs.DEFINE_integer("max_years", None, "Alt to max_unrollings")
    configs.DEFINE_integer("pls_years", None, "Alt to max_years. max_years = min_year+pls_years")
    configs.DEFINE_integer("stride", 12, "How many steps to skip per unrolling")
    configs.DEFINE_integer("batch_size", 256, "Size of each batch")
    configs.DEFINE_integer("num_layers", 2, "Numer of RNN layers")
    configs.DEFINE_integer("forecast_n", 12, "How many steps to forecast into the future")
    configs.DEFINE_integer("num_hidden", 64, "Number of hidden layer units")
    configs.DEFINE_float("init_scale", 1.0, "Initial scale for weights")
    configs.DEFINE_float("max_grad_norm", 50.0, "Gradient clipping")
    # --- Date range and train/test split ---
    configs.DEFINE_integer("start_date", 197501, "First date to train on as YYYYMM")
    configs.DEFINE_integer("end_date", 199912, "Last date to train on as YYYYMM")
    configs.DEFINE_integer("split_date", None, "Date to split train/test on.")
    configs.DEFINE_boolean("train", True, "Train model otherwise inference only")
    # --- Regularization and scaling ---
    configs.DEFINE_float("dropout", 0.0, "Dropout rate for hidden layers")
    configs.DEFINE_float("recurrent_dropout", 0.0, "Dropout rate for recurrent connections")
    configs.DEFINE_boolean(
        "log_squasher", True,
        "Squash large normalized inputs with natural log function")
    configs.DEFINE_string("data_scaler", 'RobustScaler',
                          'sklearn scaling algorithm or None if no scaling')
    # --- Optimization ---
    configs.DEFINE_string("optimizer", 'Adadelta', 'Any tensorflow optimizer in tf.train')
    configs.DEFINE_float("learning_rate", 0.6, "The initial starting learning rate")
    configs.DEFINE_float("lr_decay", 1.0, "Learning rate decay for exponential decay")
    configs.DEFINE_float("validation_size", 0.3,
                         "Size of validation set as %, ie. 0.3 = 30% of data")
    configs.DEFINE_float("target_lambda", 0.5,
                         "How much to weight last step vs. all steps in loss")
    configs.DEFINE_float("rnn_lambda", 0.7,
                         "How much to weight last step vs. all steps in loss")
    configs.DEFINE_integer("max_epoch", 1, "Stop after max_epochs")
    configs.DEFINE_integer("early_stop", 1, "Early stop parameter")
    configs.DEFINE_integer("seed", 521, "Seed for deterministic training")
    configs.DEFINE_boolean("UQ", False, "Uncertainty Quantification Mode")
    configs.DEFINE_float("l2_alpha", 0.0, "L2 regularization for weight parameters.")
    configs.DEFINE_float("recurrent_l2_alpha", 0.0,
                         "L2 regularization for recurrent weight parameters.")
    configs.DEFINE_boolean("huber_loss", False, "Use huber loss instead of mse")
    configs.DEFINE_float("huber_delta", 1.0, "delta for huber loss")
    configs.DEFINE_integer("forecast_steps", 1, "How many future predictions need to me made")
    configs.DEFINE_string('forecast_steps_weights', '1.0', 'weights for the forecast steps')
    # --- Logging / execution ---
    configs.DEFINE_integer(
        "logging_interval", 100,
        "Number of batches for logging interval during training")
    configs.DEFINE_boolean("write_inp_to_out_file", True,
                           "Write input sequence to the output files")
    configs.DEFINE_string(
        "training_type", 'fixed_dates',
        'Choose between "fixed_dates" and "iterative" training')
    configs.DEFINE_integer("NPE", 1, "Number of Parallel Executions")
    configs.DEFINE_integer("num_procs", 1, "Total number of training/prediction processes")
    configs.DEFINE_integer("num_gpu", 1, "NUmber of GPUs")
    configs.DEFINE_boolean('load_saved_weights', False,
                           'Load weights saved in the checkpoint directory')
    configs.DEFINE_integer(
        "epoch_logging_interval", 1,
        "Number of batches for logging interval during training")
    # --- Learning-rate schedules and initializers ---
    configs.DEFINE_integer("decay_steps", 1500, "Number of training steps between decay steps")
    configs.DEFINE_string("initializer", 'GlorotUniform',
                          'variable initializers available in Keras')
    configs.DEFINE_boolean(
        "use_custom_init", True,
        'Use RandomUniform initializer with init_scale values')
    configs.DEFINE_boolean(
        "aux_masking", False,
        'Mask aux features of all time steps except the last one with 0')
    configs.DEFINE_integer("max_norm", 3, "Max Norm for kernel constraint")
    configs.DEFINE_float("sgd_momentum", 0.0, "momentum for SGD optimizer")
    configs.DEFINE_float("end_learning_rate", 0.01, "end lr for polynomial decay")
    configs.DEFINE_float(
        'decay_power', 0.5,
        'power to decay the learning rate with for polynomial decay')
    configs.DEFINE_string('piecewise_lr_boundaries', '4000-5500-5500',
                          'boundaries for piecewise constant lr')
    configs.DEFINE_string('piecewise_lr_values', '0.5-0.1-0.05-0.1',
                          'values for piecewise constant lr')
    configs.DEFINE_string('lr_schedule', 'ExponentialDecay', 'Learning rate scheduler')
    # --- Prediction / CDRS inference outputs ---
    configs.DEFINE_string('preds_fname', 'preds.dat', 'Name of the prediction file')
    configs.DEFINE_integer("member_id", 1, "Id of member in a population")
    configs.DEFINE_boolean("cdrs_inference", False,
                           'If the execution is for inference on CDRS data')
    configs.DEFINE_string('cdrs_src_fname', 'cdrs-src.dat',
                          'Filename of the CDRS source file')
    configs.DEFINE_string('cdrs_ml_fname', 'cdrs-ml-data.dat',
                          'Filename of the CDRS ML data file')
    configs.DEFINE_string('model_ranking_fname', './model-ranking.dat',
                          'Model Ranking File Name')
    configs.DEFINE_string('model_ranking_factor', 'pred_var_entval',
                          'Model ranking factor')
    configs.DEFINE_string("cdrs_inference_date", None,
                          "CDRS Inference date. Format: '%Y-%m-%d' ")

    c = configs.ConfigValues()

    # NOTE(review): c.num_unrollings is not defined in this function;
    # presumably it is registered elsewhere or is a legacy attribute —
    # confirm. With the integer defaults of 5 above, these two guards
    # are normally never taken.
    if c.min_unrollings is None:
        c.min_unrollings = c.num_unrollings
    if c.max_unrollings is None:
        c.max_unrollings = c.num_unrollings

    # Year-based overrides: 12 // stride unrolling steps per year.
    if c.min_years is not None:
        c.min_unrollings = c.min_years * (12 // c.stride)
    if c.max_years is not None:
        c.max_unrollings = (c.max_years) * (12 // c.stride)
    elif c.pls_years is None:
        c.max_unrollings = c.min_unrollings
    else:
        # NOTE(review): this branch assumes min_years is set whenever
        # pls_years is (otherwise None + int raises TypeError) — confirm
        # with callers.
        c.max_unrollings = (c.min_years + c.pls_years) * (12 // c.stride)

    # Parse '-'-separated CLI strings into float lists.
    c.forecast_steps_weights = [
        float(x) for x in c.forecast_steps_weights.split('-')
    ]
    c.piecewise_lr_boundaries = [
        float(x) for x in c.piecewise_lr_boundaries.split('-')
    ]
    c.piecewise_lr_values = [
        float(x) for x in c.piecewise_lr_values.split('-')
    ]

    return c
def main(_):
    """ Entry point and main loop for train_net.py. Uses command line
    arguments to get model and training specification (see config.py).
    """
    configs.DEFINE_string("train_datafile", None, "Training file")
    configs.DEFINE_string("optimizer", 'gd', 'Optimizer to use gd, adam, adagrad, momentum')
    configs.DEFINE_float("lr_decay", 0.9, "Learning rate decay")
    configs.DEFINE_float("initial_learning_rate", 1.0, "Initial learning rate")
    configs.DEFINE_float("validation_size", 0.0, "Size of validation set as %")
    configs.DEFINE_float("passes", 1.0, "Passes through day per epoch")
    configs.DEFINE_float("rnn_loss_weight", None, "How much moret to weight kth example")
    configs.DEFINE_integer("max_epoch", 0, "Stop after max_epochs")
    configs.DEFINE_integer("early_stop", None, "Early stop parameter")
    configs.DEFINE_integer("seed", None, "Seed for deterministic training")

    config = configs.get_configs()

    if config.train_datafile is None:
        config.train_datafile = config.datafile

    train_path = model_utils.get_data_path(config.data_dir, config.train_datafile)

    print("Loading training data ...")
    train_data = BatchGenerator(train_path, config,
                                config.batch_size, config.num_unrollings,
                                validation_size=config.validation_size,
                                randomly_sample=True)

    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)

    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        if config.seed is not None:
            tf.set_random_seed(config.seed)

        print("Constructing model ...")
        model = model_utils.get_training_model(session, config, verbose=True)

        if config.early_stop is not None:
            print("Training will early stop without "
                  "improvement after %d epochs." % config.early_stop)

        # BUG FIX: tf.train.Saver() was constructed inside the epoch loop,
        # which adds a fresh set of save ops to the graph every epoch and
        # bloats it. Build it once, after the model's variables exist.
        saver = tf.train.Saver()

        # Hoisted out of the epoch loop: the model dir only needs to be
        # created once.
        if not os.path.exists(config.model_dir):
            print("Creating directory %s" % config.model_dir)
            os.mkdir(config.model_dir)

        chkpt_file_prefix = "training.ckpt"

        train_history = list()
        valid_history = list()

        # This sets the initial learning rate tensor
        lr = model.assign_lr(session, config.initial_learning_rate)

        for i in range(config.max_epoch):
            trc, tre, vdc, vde = run_epoch(session, model, train_data,
                                           keep_prob=config.keep_prob,
                                           passes=config.passes,
                                           verbose=True)
            # Clamp runaway losses so the epoch log stays readable.
            trc = 999.0 if trc > 999.0 else trc
            vdc = 999.0 if vdc > 999.0 else vdc

            print(('Epoch: %d loss: %.6f %.6f'
                   ' error: %.6f %.6f Learning rate: %.4f') %
                  (i + 1, trc, vdc, tre, vde, lr))
            sys.stdout.flush()

            train_history.append(trc)
            valid_history.append(vdc)

            # update learning rate (only the schedules that decay)
            if config.optimizer == 'gd' or config.optimizer == 'momentum':
                lr = model_utils.adjust_learning_rate(session, model, lr,
                                                      config.lr_decay,
                                                      train_history)

            if model_utils.stop_training(config, valid_history, chkpt_file_prefix):
                print("Training stopped.")
                # BUG FIX: was quit(), which is a site-module convenience
                # for the interactive interpreter; sys.exit() is the
                # correct way to terminate a script.
                sys.exit()
            else:
                checkpoint_path = os.path.join(config.model_dir, chkpt_file_prefix)
                saver.save(session, checkpoint_path, global_step=i)