def main(_):
    """Apply the trained model to each test data point and print the results.

    The test file defaults to ``config.datafile`` when ``--test_datafile`` is
    not given.  One line is written to stdout per data point whose date lies
    within ``[print_start, print_end]``::

        key date score target seq_len

    Because the input and output are line-aligned, the unix command 'paste'
    can be used to stitch them together, e.g.::

        $ classify_data.py --config=train.conf --test_datafile=test.dat > output.dat
        $ paste -d ' ' test.dat output.dat > input_and_output.dat
    """
    configs.DEFINE_string('test_datafile', None, 'file with test data')
    configs.DEFINE_string('time_field', 'date', 'fields used for dates/time')
    configs.DEFINE_string('print_start', '190001', 'only print data on or after')
    configs.DEFINE_string('print_end', '999912', 'only print data on or before')
    configs.DEFINE_integer('num_batches', None, 'num_batches overrride')

    config = configs.get_configs()

    # Without an explicit test file, evaluate on the default data file.
    if config.test_datafile is None:
        config.test_datafile = config.datafile

    data_path = model_utils.get_data_path(config.data_dir,
                                          config.test_datafile)

    # One example per batch so that each step yields exactly one prediction.
    dataset = BatchGenerator(data_path, config,
                             batch_size=1,
                             num_unrollings=config.num_unrollings)

    # --num_batches, when supplied, overrides the size of the data set.
    total = dataset.num_batches
    if config.num_batches is not None:
        total = config.num_batches

    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)

    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        model = model_utils.get_trained_model(session, config, verbose=False)

        for _idx in range(total):
            batch = dataset.next_batch()
            preds = model.step(session, batch)
            seq_len = get_seq_length(batch)
            last = seq_len - 1
            key, date = get_key_and_date(batch, last)

            outside_window = (date < config.print_start
                              or date > config.print_end)
            if not outside_window:
                score = get_score(config, preds, last)
                target = get_target(config, batch, last)
                print("%s %s %.6f %.6f %d"
                      % (key, date, score, target, seq_len))
def main():
    """Train the model built by ``create_graph``, optionally resuming a run.

    Hyper-parameters are read from the module-level ``args`` object
    (``restore``, ``seq_len``, ``batch_size``, ``epochs``, ``units``,
    ``lstm_layers``, ``window_mixtures``, ``output_mixtures``).

    When ``args.restore`` is set it is used as the experiment directory: the
    latest checkpoint under ``<restore>/models`` is loaded and training
    continues from the epoch after the one encoded in the checkpoint name.
    Otherwise variables are freshly initialised and a new experiment directory
    is obtained from ``next_experiment_path()``.

    A checkpoint is written after every epoch and summaries after every batch.

    Raises:
        IOError: if ``args.restore`` is set but no checkpoint can be found
            in its ``models`` sub-directory.
    """
    restore_model = args.restore
    print(restore_model)
    seq_len = args.seq_len
    batch_size = args.batch_size
    num_epoch = args.epochs
    batches_per_epoch = 1000

    batch_generator = BatchGenerator(batch_size, seq_len)
    g, vs = create_graph(batch_generator.num_letters, batch_size,
                         num_units=args.units,
                         lstm_layers=args.lstm_layers,
                         window_mixtures=args.window_mixtures,
                         output_mixtures=args.output_mixtures)

    with tf.Session(graph=g) as sess:
        model_saver = tf.train.Saver(max_to_keep=2)

        if restore_model:
            checkpoint_dir = os.path.join(restore_model, 'models')
            model_file = tf.train.latest_checkpoint(checkpoint_dir)
            # latest_checkpoint returns None when nothing has been saved yet;
            # fail with a clear message instead of an AttributeError below.
            if model_file is None:
                raise IOError(
                    'No checkpoint found in {}'.format(checkpoint_dir))
            experiment_path = restore_model
            # Checkpoints are saved with global_step=<epoch>, so the suffix
            # after the last '-' is the last completed epoch.
            epoch = int(model_file.split('-')[-1]) + 1
            model_saver.restore(sess, model_file)
        else:
            sess.run(tf.global_variables_initializer())
            experiment_path = next_experiment_path()
            epoch = 0

        summary_writer = tf.summary.FileWriter(experiment_path, graph=g,
                                               flush_secs=10)
        try:
            summary_writer.add_session_log(
                tf.SessionLog(status=tf.SessionLog.START),
                global_step=epoch * batches_per_epoch)

            for e in range(epoch, num_epoch):
                print('\nEpoch {}'.format(e))
                for b in range(1, batches_per_epoch + 1):
                    coords, seq, reset, needed = batch_generator.next_batch()
                    if needed:
                        sess.run(vs.reset_states,
                                 feed_dict={vs.reset: reset})
                    l, s, _ = sess.run(
                        [vs.loss, vs.summary, vs.train_step],
                        feed_dict={vs.coordinates: coords,
                                   vs.sequence: seq})
                    summary_writer.add_summary(
                        s, global_step=e * batches_per_epoch + b)
                    print('\r[{:5d}/{:5d}] loss = {}'.format(
                        b, batches_per_epoch, l), end='')

                model_saver.save(
                    sess,
                    os.path.join(experiment_path, 'models', 'model'),
                    global_step=e)
        finally:
            # Flush pending events and release the event file even when
            # training is interrupted.
            summary_writer.close()
def predict(config):
    """Run the model over a data file, print predictions and report MSE.

    Each batch (the batch size is forced to 1) is passed through
    ``model.step``.  Non-NaN MSE values are grouped by the date returned by
    ``batch_to_date``; predictions are printed for every batch.  When
    ``config.mse_outfile`` is set, the per-date mean MSE, the number of
    samples per date and the overall mean are written to that file.

    Args:
        config: configuration object.  Reads ``datafile``,
            ``predict_datafile``, ``data_dir``, ``require_targets``,
            ``pretty_print_preds`` and ``mse_outfile``.
            ``config.batch_size`` is overwritten with 1.

    Raises:
        SystemExit: when ``config.mse_outfile`` is None, once all batches
            have been processed (existing behaviour, preserved).
    """
    # Local import: sys.exit() replaces the site-provided exit() helper,
    # which is meant for the interactive shell and may be absent.
    import sys

    datafile = config.datafile
    if config.predict_datafile is not None:
        datafile = config.predict_datafile

    print("Loading data from %s ..." % datafile)
    path = utils.data_utils.get_data_path(config.data_dir, datafile)

    config.batch_size = 1
    batches = BatchGenerator(path, config,
                             require_targets=config.require_targets,
                             verbose=True)
    batches.cache(verbose=True)

    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)

    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        model = model_utils.get_model(session, config, verbose=True)

        # date -> list of MSE values observed for that date
        perfs = dict()

        for _ in range(batches.num_batches):
            batch = batches.next_batch()
            (mse, preds) = model.step(session, batch)

            # NaN errors are left out of the per-date statistics.
            if not math.isnan(mse):
                perfs.setdefault(batch_to_date(batch), []).append(mse)

            if config.pretty_print_preds is True:
                pretty_print_predictions(batches, batch, preds, mse)
            else:
                print_predictions(batches, batch, preds)

        if config.mse_outfile is not None:
            with open(config.mse_outfile, "w") as f:
                for date in sorted(perfs):
                    mean = np.mean(perfs[date])
                    print("%s %.6f %d" % (date, mean, len(perfs[date])),
                          file=f)
                total_mean = np.mean(
                    [x for v in perfs.values() for x in v])
                print("Total %.6f" % (total_mean), file=f)
        else:
            sys.exit()
def main():
    """Train the model built by ``create_graph`` with fixed hyper-parameters.

    Runs 30 epochs of 1000 batches (batch size 64, sequence length 256).
    Summaries are written after every batch to the directory returned by
    ``get_model_path()`` and a checkpoint is saved under its ``models``
    sub-directory after every epoch.
    """
    seq_len = 256
    batch_size = 64
    epochs = 30
    batches_per_epoch = 1000

    batch_generator = BatchGenerator(batch_size, seq_len)
    g, vs = create_graph(batch_generator.num_letters, batch_size)

    with tf.Session(graph=g) as sess:
        model_saver = tf.train.Saver(max_to_keep=2)
        sess.run(tf.global_variables_initializer())
        model_path = get_model_path()

        summary_writer = tf.summary.FileWriter(model_path, graph=g,
                                               flush_secs=10)
        try:
            summary_writer.add_session_log(
                tf.SessionLog(status=tf.SessionLog.START), global_step=0)

            for e in range(epochs):
                print('\n{} : Epoch {}'.format(
                    datetime.datetime.now().time(), e))
                for b in range(1, batches_per_epoch + 1):
                    coordinates, labels, reset, to_reset = \
                        batch_generator.next_batch()
                    # The generator signals when recurrent state has to be
                    # reset before the next training step.
                    if to_reset:
                        sess.run(vs.reset_states,
                                 feed_dict={vs.reset: reset})
                    loss, s, _ = sess.run(
                        [vs.loss, vs.summary, vs.train_step],
                        feed_dict={vs.coordinates: coordinates,
                                   vs.sequence: labels})
                    summary_writer.add_summary(
                        s, global_step=e * batches_per_epoch + b)
                    print('\r[{:5d}/{:5d}] loss = {}'.format(
                        b, batches_per_epoch, loss), end='')

                model_saver.save(sess,
                                 os.path.join(model_path, 'models', 'model'),
                                 global_step=e)
        finally:
            # Flush pending events and release the event file even when
            # training is interrupted.
            summary_writer.close()
configs.DEFINE_float("rnn_loss_weight", None,
                     "How much moret to weight kth example")
config = configs.get_configs()

# Without an explicit training file, use the default data file.
if config.train_datafile is None:
    config.train_datafile = config.datafile

train_path = get_data_path(config.data_dir, config.train_datafile)

print("Loading batched data ...")
batches = BatchGenerator(train_path,
                         config,
                         config.batch_size,
                         config.num_unrollings,
                         validation_size=config.validation_size,
                         randomly_sample=True)

# Dump the main fields of the first ten batches for visual inspection.
for i in range(10):
    b = batches.next_batch()
    print("-----------------------------------------------------")
    for title, attr in (("----Atributes: ", "attribs"),
                        ("----Sequence Lengths: ", "seq_lengths"),
                        ("----Train Weights: ", "train_mask"),
                        ("----Valid Weights: ", "valid_mask"),
                        ("----Targets: ", "targets")):
        print(title)
        print(getattr(b, attr))
def main(_):
    """Classify every test data point, write predictions and print statistics.

    The trained model is applied to each data point of ``--test_datafile``
    (defaulting to ``config.datafile``).  For each classified step inside the
    ``[print_start, print_end]`` date window one line is written to
    ``--output``::

        key date P(neg) P(pos) target step

    Error / true-positive / true-negative / false-positive / false-negative
    indicators are collected per date and under the key 'ALL', and finally
    passed to ``print_summary_stats``.

    Because the input and output are line-aligned, the unix command 'paste'
    can be used to stitch them together, e.g.::

        $ classify_data.py --config=train.conf --test_datafile=test.dat --output=output.dat
        $ paste -d ' ' test.dat output.dat > input_and_output.dat
    """
    configs.DEFINE_string('test_datafile', None, 'file with test data')
    configs.DEFINE_string('output', 'preds.dat', 'file for predictions')
    configs.DEFINE_string('time_field', 'date', 'fields used for dates/time')
    configs.DEFINE_string('print_start', '190001',
                          'only print data on or after')
    configs.DEFINE_string('print_end', '210012',
                          'only print data on or before')
    configs.DEFINE_integer('min_test_k', 1, 'minimum seq length classified')
    configs.DEFINE_integer('num_batches', None, 'num_batches overrride')

    config = configs.get_configs()

    # Without an explicit test file, evaluate on the default data file.
    if config.test_datafile is None:
        config.test_datafile = config.datafile

    data_path = model_utils.get_data_path(config.data_dir,
                                          config.test_datafile)
    print("Loading data %s" % data_path)

    dataset = BatchGenerator(data_path, config,
                             batch_size=1,
                             num_unrollings=config.num_unrollings)

    # --num_batches, when supplied, overrides the size of the data set.
    num_data_points = dataset.num_batches
    if config.num_batches is not None:
        num_data_points = config.num_batches
    print("num_batches = ", num_data_points)

    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)

    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        print("Loading model.")
        model = model_utils.get_trained_model(session, config)

        # date (or 'ALL') -> list of per-prediction indicator dicts
        stats = {'ALL': []}

        with open(config.output, "w") as outfile:
            for _batch_idx in range(num_data_points):
                batch = dataset.next_batch()
                preds = model.step(session, batch)
                seq_len = get_seq_length(batch)
                start = seq_len - 1

                # Sequences shorter than the unrolling length are skipped.
                if seq_len < config.num_unrollings:
                    continue

                for step in range(start, seq_len):
                    key, date = get_key_and_date(batch, step)
                    if (date < config.print_start
                            or date > config.print_end):
                        continue

                    prob = get_pos_prob(config, preds, step)
                    target = get_target(batch, step)
                    outfile.write(
                        "%s %s %.4f %.4f %d %d\n"
                        % (key, date, 1.0 - prob, prob, target, step + 1))

                    # Threshold the positive-class probability at 0.5.
                    pred = +1.0 if prob >= 0.5 else 0.0

                    curstat = {
                        'error': 0.0 if (pred == target) else 1.0,
                        'tpos': 1.0 if (pred == 1 and target == 1) else 0.0,
                        'tneg': 1.0 if (pred == 0 and target == 0) else 0.0,
                        'fpos': 1.0 if (pred == 1 and target == 0) else 0.0,
                        'fneg': 1.0 if (pred == 0 and target == 1) else 0.0,
                    }

                    stats.setdefault(date, []).append(curstat)
                    stats['ALL'].append(curstat)

        print_summary_stats(stats)
def main():
    """Dump scaled input features to a file and print summary statistics.

    Every input step of every batch (batch size forced to 1) is centred and
    scaled with the 'StandardScaler' parameters returned by the batch
    generator.  The scaled rows, together with gvkey, date and step index,
    are written space-separated to ``config.mse_outfile``.  Afterwards the
    mean and standard deviation of each feature are printed, followed by its
    (up to) five smallest and five largest rows.
    """
    config = deep_quant.get_configs()
    train_path = utils.data_utils.get_data_path(config.data_dir,
                                                config.datafile)

    print("Loading training data ...")
    config.batch_size = 1
    batches = BatchGenerator(train_path, config)

    params = batches.get_scaling_params('StandardScaler')
    print(params['scale'])
    print(params['center'])

    col_names = batches.feature_names

    # Rows are accumulated in plain lists and turned into a DataFrame once;
    # appending to a DataFrame row by row copies it on every insert.
    rows = list()
    gvkeys = list()
    dates = list()
    steps = list()

    print("Num batches sampled: %d" % batches.num_batches)
    for j in range(batches.num_batches):
        b = batches.next_batch()
        seq_len = b.seq_lengths[0]
        # Key and date are taken from the last step of the sequence.
        idx = seq_len - 1
        for i in range(seq_len):
            gvkeys.append(b.attribs[idx][0][0])
            dates.append(b.attribs[idx][0][1])
            steps.append(i)
            rows.append((b.inputs[i][0] - params['center'])
                        / params['scale'])
        # Progress indicator.
        if (j % 1000) == 0:
            print(".", end='')
            sys.stdout.flush()
    print()

    features = pd.DataFrame(rows, columns=col_names)
    df = pd.concat(
        [pd.DataFrame({'gvkey': gvkeys, 'date': dates, 'step': steps}),
         features],
        axis=1)

    # write to outfile
    df.to_csv(config.mse_outfile, sep=' ', float_format="%.4f")

    # print feature characteristics
    for feature in col_names:
        mean = np.mean(df[feature])
        std = np.std(df[feature])
        print("%s %.4f %.4f" % (feature, mean, std))
    print('--------------------------------')

    # print min and max values; fewer than five rows must not raise
    n_extremes = min(5, len(df.index))
    for feature in col_names:
        print("%s:" % feature)
        st = df.sort_values(feature)
        rt = df.sort_values(feature, ascending=False)
        for i in range(n_extremes):
            min_el = st.iloc[i, :]
            max_el = rt.iloc[i, :]
            print("%s %s %s %s" % (min_el['gvkey'], min_el['date'],
                                   min_el['step'], min_el[feature]),
                  end=' ')
            print("%s %s %s %s" % (max_el['gvkey'], max_el['date'],
                                   max_el['step'], max_el[feature]))
        print('--------------------------------')
"How much moret to weight kth example") config = configs.get_configs() if config.train_datafile is None: config.train_datafile = config.datafile train_path = get_data_path(config.data_dir, config.train_datafile) print("Loading batched data ...") batches = BatchGenerator(train_path, config, config.batch_size, config.num_unrollings, validation_size=config.validation_size, randomly_sample=True) for i in range(10): b = batches.next_batch() print("-----------------------------------------------------") print("----Atributes: ") print(b.attribs) print("----Sequence Lengths: ") print(b.seq_lengths) print("----Train Weights: ") print(b.train_mask) print("----Valid Weights: ") print(b.valid_mask) print("----Targets: ") print(b.targets)
def predict(config):
    """Run the model over a data file; print, collect and save the results.

    Each batch (the batch size is forced to 1) is passed through
    ``model.step`` with ``keep_prob=config.keep_prob_pred``.  Non-NaN MSE
    values are grouped by date.  Predictions are printed according to
    ``config.pretty_print_preds`` / ``config.print_preds``.  When
    ``config.df_dirname`` is set, target, output and MSE values of non-NaN
    batches are accumulated via ``update_df`` and pickled into that directory
    as 'target-df.pkl', 'output-df.pkl' and 'mse-df.pkl'.  When
    ``config.mse_outfile`` is set, per-date mean MSE and the overall mean are
    written to it.

    Args:
        config: configuration object.  Reads ``datafile``,
            ``predict_datafile``, ``data_dir``, ``require_targets``,
            ``keep_prob_pred``, ``pretty_print_preds``, ``print_preds``,
            ``df_dirname`` and ``mse_outfile``.  ``config.batch_size`` is
            overwritten with 1.

    Raises:
        SystemExit: when ``config.mse_outfile`` is None, once all batches
            have been processed (existing behaviour, preserved).
    """
    # Local import: sys.exit() replaces the site-provided exit() helper,
    # which is meant for the interactive shell and may be absent.
    import sys

    datafile = config.datafile
    if config.predict_datafile is not None:
        datafile = config.predict_datafile

    print("Loading data from %s ..." % datafile)
    path = utils.data_utils.get_data_path(config.data_dir, datafile)

    config.batch_size = 1
    batches = BatchGenerator(path, config,
                             require_targets=config.require_targets,
                             verbose=True)
    batches.cache(verbose=True)

    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)

    # One DataFrame per saved quantity, in the same order as save_names.
    df_list = [pd.DataFrame(), pd.DataFrame(), pd.DataFrame()]
    save_names = ['target-df.pkl', 'output-df.pkl', 'mse-df.pkl']

    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        model = model_utils.get_model(session, config, verbose=True)

        # date -> list of MSE values observed for that date
        perfs = dict()

        for _ in range(batches.num_batches):
            batch = batches.next_batch()
            (mse, preds) = model.step(session, batch,
                                      keep_prob=config.keep_prob_pred)
            valid = not math.isnan(mse)

            # NaN errors are left out of the per-date statistics.
            if valid:
                date = batch_to_date(batch)
                key = batch_to_key(batch)
                perfs.setdefault(date, []).append(mse)

            if config.pretty_print_preds:
                pretty_print_predictions(batches, batch, preds, mse)
            elif config.print_preds:
                print_predictions(config, batches, batch, preds, mse)

            # Collect values only when a DataFrame directory is configured
            # and the batch produced a usable error.
            if config.df_dirname is not None and valid:
                values_list = [
                    get_value(batches, batch, 'target'),
                    get_value(batches, batch, 'output', preds),
                    mse,
                ]
                df_list = [update_df(df, date, key, value)
                           for df, value in zip(df_list, values_list)]

        # Save the DataFrames
        if config.df_dirname:
            if not os.path.isdir(config.df_dirname):
                os.makedirs(config.df_dirname)
            for df, save_name in zip(df_list, save_names):
                df.to_pickle(os.path.join(config.df_dirname, save_name))

        # MSE Outfile
        if config.mse_outfile is not None:
            with open(config.mse_outfile, "w") as f:
                for date in sorted(perfs):
                    mean = np.mean(perfs[date])
                    print("%s %.6f %d" % (date, mean, len(perfs[date])),
                          file=f)
                total_mean = np.mean(
                    [x for v in perfs.values() for x in v])
                print("Total %.6f" % (total_mean), file=f)
        else:
            sys.exit()
def predict_pie(config):
    """Run the model in 'PIE' mode and save targets and output bounds.

    Doesn't use print options; the only persistent outputs are DataFrames.
    Each batch (the batch size is forced to 1) is passed through
    ``model.step`` with ``UQ_model_type='PIE'``, which yields lower- and
    upper-bound predictions.  Every 10000th batch is pretty-printed for
    progress monitoring.  When ``config.df_dirname`` is set, target and
    lower/upper output values are accumulated via ``update_df`` and pickled
    into that directory as 'target-df.pkl', 'output-lb-df.pkl' and
    'output-ub-df.pkl'; otherwise nothing is saved.

    Args:
        config: configuration object.  Reads ``datafile``,
            ``predict_datafile``, ``data_dir``, ``require_targets``,
            ``keep_prob_pred``, ``UQ`` and ``df_dirname``.
            ``config.batch_size`` is overwritten with 1.
    """
    datafile = config.datafile
    if config.predict_datafile is not None:
        datafile = config.predict_datafile

    print("Loading data from %s ..." % datafile)
    path = utils.data_utils.get_data_path(config.data_dir, datafile)

    config.batch_size = 1
    batches = BatchGenerator(path, config,
                             require_targets=config.require_targets,
                             verbose=True)
    batches.cache(verbose=True)

    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)

    # One DataFrame per saved quantity, in the same order as save_names.
    df_list = [pd.DataFrame(), pd.DataFrame(), pd.DataFrame()]
    save_names = ['target-df.pkl', 'output-lb-df.pkl', 'output-ub-df.pkl']

    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        model = model_utils.get_model(session, config, verbose=True)

        for i in range(batches.num_batches):
            batch = batches.next_batch()

            (mpiw, _, _, preds_lb, preds_ub) = model.step(
                session, batch,
                keep_prob=config.keep_prob_pred,
                uq=config.UQ,
                UQ_model_type='PIE')

            date = batch_to_date(batch)
            key = batch_to_key(batch)

            # Dummy inputs to be consistent with the other prediction
            # printing options.  MSE is not evaluated in the PIE case.
            mse_dummy = mse_var_dummy = 0.0

            # Print every 10000 iterations to monitor progress.
            if i % 10000 == 0:
                pretty_print_predictions(batches, batch, preds_lb, preds_ub,
                                         mse_dummy, mse_var_dummy)

            # Collect values only when a DataFrame directory is configured.
            if config.df_dirname is not None:
                values_list = [
                    get_value(batches, batch, 'target'),
                    get_value(batches, batch, 'output_lb', preds_lb),
                    get_value(batches, batch, 'output_ub', preds_ub),
                ]
                df_list = [update_df(df, date, key, value)
                           for df, value in zip(df_list, values_list)]

    # Without a target directory there is nothing to save; passing None to
    # os.path.isdir would raise a TypeError after the whole run.
    if not config.df_dirname:
        return

    # Save the DataFrames
    if not os.path.isdir(config.df_dirname):
        os.makedirs(config.df_dirname)
    for df, save_name in zip(df_list, save_names):
        df.to_pickle(os.path.join(config.df_dirname, save_name))
    return
# Session used for training and evaluation.
sess = tf.Session(config=config)

# Initialise global and local variables in a single op; the local
# variables are needed by the accuracy op.
init_op = tf.group(tf.global_variables_initializer(),
                   tf.local_variables_initializer())
sess.run(init_op)

saver = tf.train.Saver()  # saver for storing and restoring the model
writer = tf.summary.FileWriter('./action_and_id_log', sess.graph)
merge_op = tf.summary.merge_all()  # merges all registered summaries

bg = BatchGenerator()
print("Beginning training session")
for step in range(250):
    # One training step on a fresh batch.
    m, s, u, a = bg.next_batch(get_action_id_only=True)
    feed = {minimap: m, screen: s, info: u, action_output: a}

    _, loss_, result = sess.run([train_op, loss, merge_op], feed)
    writer.add_summary(result, step)

    # Every 50 steps, evaluate accuracy on the batch just trained on.
    if step % 50 == 0:
        accuracy_ = sess.run([accuracy], feed)
def learn_bin_embeddings(self, dummy_coded_data, var_dict, embedding_dim, lr,
                         n_epoch, weight_decay, batch_size, verbose):
    """Train a ``BinEmbedding`` model and store one vector per dummy column.

    Training instances come from ``self._generate_instances``.  The model is
    optimised on the GPU with Adagrad and cross-entropy loss, and after every
    step each embedding vector is rescaled to unit L2 norm.  On completion the
    trained model is kept in ``self.be`` and ``self.embedding_by_column``
    maps each column name of ``dummy_coded_data`` to its embedding vector.

    Args:
        dummy_coded_data: dummy-coded table; ``len()``, ``.shape`` and
            ``.columns`` are used (presumably a pandas DataFrame).
        var_dict: dict with key 'numerical_vars' and, optionally,
            'categorical_vars'; the combined length is the variable count.
        embedding_dim: size of each embedding vector.
        lr: Adagrad learning rate.
        n_epoch: number of training epochs.
        weight_decay: accepted for interface compatibility; not used here.
        batch_size: upper bound for the batch size; the effective value is
            additionally capped at a tenth of the number of instances.
        verbose: when true, print epoch number and loss after each epoch.
    """
    n_variables = len(var_dict['numerical_vars'])
    if 'categorical_vars' in var_dict:
        n_variables += len(var_dict['categorical_vars'])

    inputs, targets = self._generate_instances(dummy_coded_data, n_variables)

    n_instances = len(dummy_coded_data)
    n_dummy_cols = dummy_coded_data.shape[1]

    # Cap the batch size at a tenth of the data, but never let it reach
    # zero: that would divide by zero just below for tiny data sets.
    batch_size = max(1, min(int(n_instances / 10), batch_size))
    n_iter_per_epoch = int(
        np.ceil(n_instances * (n_variables - 1) / batch_size))

    batch_gen = BatchGenerator(inputs, targets, batch_size)

    # Fixed seeds for reproducible initialisation.
    torch.cuda.random.manual_seed_all(42)
    torch.manual_seed(42)

    self.be = BinEmbedding(n_dummy_cols, embedding_dim).cuda()
    loss_ftn = nn.CrossEntropyLoss()
    opt = torch.optim.Adagrad(self.be.parameters(), lr=lr, lr_decay=0.001)

    for it in range(n_iter_per_epoch * n_epoch):
        input_batch, target_batch = batch_gen.next_batch()

        opt.zero_grad()
        input_batch = Variable(torch.LongTensor(input_batch)).cuda()
        target_batch = Variable(torch.LongTensor(target_batch)).cuda()

        out = self.be(input_batch)
        loss = loss_ftn(out, target_batch)
        loss.backward()
        opt.step()

        # Normalize embedding vectors to unit L2 norm (row-wise).
        embedding_norm = torch.norm(self.be.embedding.weight,
                                    p=2, dim=1).data
        embedding_norm = embedding_norm.view(-1, 1).expand_as(
            self.be.embedding.weight)
        self.be.embedding.weight.data = self.be.embedding.weight.data.div(
            embedding_norm)

        if (it + 1) % n_iter_per_epoch == 0 and verbose:
            # Indexing a 0-dim loss raises on PyTorch >= 0.4; prefer
            # .item() and fall back to the legacy form on older versions.
            if hasattr(loss, 'item'):
                loss_value = loss.item()
            else:
                loss_value = loss.data[0]
            print('>>> Epoch = {}'.format(
                int((it + 1) / n_iter_per_epoch)))
            print('Loss = {}'.format(loss_value))

    embedding_weights = self.be.state_dict()['embedding.weight'].cpu(
    ).numpy()
    self.embedding_by_column = dict(
        zip(list(dummy_coded_data.columns), embedding_weights))
def predict(config):
    """Run the model over a data file and export one target field to CSV.

    Each batch (the batch size is forced to 1) is passed through
    ``model.step``.  Non-NaN MSE values are grouped by date.  With
    ``config.pretty_print_preds`` set to True, the raw output and raw target
    of ``config.target_field`` are recorded together with date, gvkey, MSE and
    normalizer; otherwise predictions are printed with
    ``print_predictions``.  The collected rows are always written to
    ``'datasets/' + config.output_file``.  When ``config.mse_outfile`` is
    set, per-date mean MSE and the overall mean are written to it.

    Args:
        config: configuration object.  Reads ``target_field``, ``datafile``,
            ``predict_datafile``, ``data_dir``, ``require_targets``,
            ``pretty_print_preds``, ``mse_outfile`` and ``output_file``.
            ``config.batch_size`` is overwritten with 1.

    Raises:
        ValueError: if ``config.target_field`` is not a known target.
        SystemExit: when ``config.mse_outfile`` is None, once the CSV has
            been written (existing behaviour, preserved).
    """
    # Local import: sys.exit() replaces the site-provided exit() helper,
    # which is meant for the interactive shell and may be absent.
    import sys

    target_list = [
        'saleq_ttm', 'cogsq_ttm', 'xsgaq_ttm', 'oiadpq_ttm', 'mkvaltq_ttm',
        'niq_ttm', 'ibq_ttm', 'cheq_mrq', 'rectq_mrq', 'invtq_mrq',
        'acoq_mrq', 'ppentq_mrq', 'aoq_mrq', 'dlcq_mrq', 'apq_mrq',
        'txpq_mrq', 'lcoq_mrq', 'ltq_mrq'
    ]

    df = pd.DataFrame(columns=[
        'date', 'gvkey', 'mse', 'normalizer',
        config.target_field + "_output",
        config.target_field + "_target"
    ])

    datafile = config.datafile
    if config.predict_datafile is not None:
        datafile = config.predict_datafile

    print("Loading data from %s ..." % datafile)
    path = os.path.join(config.data_dir, datafile)

    ind = 0  # next row label in df
    config.batch_size = 1
    batches = BatchGenerator(path, config,
                             require_targets=config.require_targets,
                             verbose=True)
    batches.cache(verbose=True)

    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=False)

    # Position of the requested field among the targets; an unknown field
    # raises ValueError naming the offending value.
    index = target_list.index(config.target_field)

    with tf.Graph().as_default(), tf.Session(config=tf_config) as session:
        model = model_utils.get_model(session, config, verbose=True)

        # date -> list of MSE values observed for that date
        perfs = dict()

        for _ in range(batches.num_batches):
            batch = batches.next_batch()
            (mse, preds) = model.step(session, batch)

            # NaN errors are left out of the per-date statistics.
            if not math.isnan(mse):
                perfs.setdefault(batch_to_date(batch), []).append(mse)

            if config.pretty_print_preds is True:
                key = batch_to_key(batch)
                date = batch_to_date(batch)
                # NOTE(review): nesting reconstructed from flattened source;
                # confirm that the row append (not only the print) belongs
                # under this quarter-end month filter.
                if int(date % 100) in [3, 6, 9, 12]:
                    print("GVKEY: " + str(key) + ", Date: " + str(date))
                    L = batch.seq_lengths[0]
                    targets = batch.targets[L - 1][0]
                    outputs = preds[0]
                    normalizer = batch.normalizers[0]
                    np.set_printoptions(suppress=True)
                    np.set_printoptions(precision=3)
                    df.loc[ind] = [
                        date, key, mse, normalizer,
                        batches.get_raw_outputs(batch, 0, outputs)[index],
                        batches.get_raw_outputs(batch, 0, targets)[index]
                    ]
                    ind += 1
            else:
                print_predictions(batches, batch, preds)

        if config.mse_outfile is not None:
            with open(config.mse_outfile, "w") as f:
                for date in sorted(perfs):
                    mean = np.mean(perfs[date])
                    print("%s %.6f %d" % (date, mean, len(perfs[date])),
                          file=f)
                total_mean = np.mean(
                    [x for v in perfs.values() for x in v])
                print("Total %.6f" % (total_mean), file=f)

        # The CSV is written in every case.
        df.to_csv('datasets/' + config.output_file, index=False)

        if config.mse_outfile is None:
            sys.exit()