def creat_model(self, D):

    ''' Initialize input placeholders '''
    x = tf.placeholder("float", shape=[None, D.dim], name='x')  # Features
    t = tf.placeholder("float", shape=[None, 1], name='t')      # Treatment
    y_ = tf.placeholder("float", shape=[None, 1], name='y_')    # Outcome

    ''' Parameter placeholders '''
    r_alpha = tf.placeholder("float", name='r_alpha')
    r_lambda = tf.placeholder("float", name='r_lambda')
    do_in = tf.placeholder("float", name='dropout_in')
    do_out = tf.placeholder("float", name='dropout_out')
    p = tf.placeholder("float", name='p_treated')

    ''' Define model graph '''
    log(self.logfile, 'Defining graph...\n')
    dims = [D.dim, FLAGS.dim_in, FLAGS.dim_out]
    CFR = cfr.cfr_net(x, t, y_, p, FLAGS, r_alpha, r_lambda, do_in, do_out, dims)

    return CFR
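# Usage sketch (illustration only, not part of the original code): creat_model expects an
# instance with a `logfile` attribute and a data object D exposing `dim`; the wrapper class
# name below is hypothetical.
#
#   exp = CfrExperiment(logfile=outdir + 'log.txt')  # hypothetical class owning creat_model
#   CFR = exp.creat_model(D)                         # builds placeholders and the cfr_net graph
#   # CFR.tot_loss is the objective that is later handed to the optimizer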
def run(outdir):
    """ Runs an experiment and stores result in outdir """

    ''' Set up paths and start log '''
    npzfile = outdir+'result'
    npzfile_test = outdir+'result.test'
    repfile = outdir+'reps'
    repfile_test = outdir+'reps.test'
    outform = outdir+'y_pred'
    outform_test = outdir+'y_pred.test'
    lossform = outdir+'loss'
    logfile = outdir+'log.txt'
    f = open(logfile,'w')
    f.close()
    dataform = FLAGS.datadir + FLAGS.dataform

    has_test = False
    if not FLAGS.data_test == '':  # if test set supplied
        has_test = True
        dataform_test = FLAGS.datadir + FLAGS.data_test

    ''' Set random seeds '''
    random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)

    ''' Save parameters '''
    save_config(outdir+'config.txt')

    log(logfile, 'Training with hyperparameters: alpha=%.2g, lambda=%.2g' % (FLAGS.p_alpha, FLAGS.p_lambda))

    ''' Load Data '''
    npz_input = False
    if dataform[-3:] == 'npz':
        npz_input = True
    if npz_input:
        datapath = dataform
        if has_test:
            datapath_test = dataform_test
    else:
        datapath = dataform % 1
        if has_test:
            datapath_test = dataform_test % 1

    log(logfile, 'Training data: ' + datapath)
    if has_test:
        log(logfile, 'Test data: ' + datapath_test)

    D = load_data(datapath)
    D_test = None
    if has_test:
        D_test = load_data(datapath_test)

    log(logfile, 'Loaded data with shape [%d,%d]' % (D['n'], D['dim']))

    ''' Start Session '''
    sess = tf.Session()

    ''' Initialize input placeholders '''
    x = tf.placeholder("float", shape=[None, D['dim']], name='x')  # Features
    t = tf.placeholder("float", shape=[None, 1], name='t')         # Treatment
    y_ = tf.placeholder("float", shape=[None, 1], name='y_')       # Outcome

    ''' Parameter placeholders '''
    r_alpha = tf.placeholder("float", name='r_alpha')
    r_lambda = tf.placeholder("float", name='r_lambda')
    do_in = tf.placeholder("float", name='dropout_in')
    do_out = tf.placeholder("float", name='dropout_out')
    p = tf.placeholder("float", name='p_treated')

    ''' Define model graph '''
    log(logfile, 'Defining graph...\n')
    dims = [D['dim'], FLAGS.dim_in, FLAGS.dim_out]
    CFR = cfr.cfr_net(x, t, y_, p, FLAGS, r_alpha, r_lambda, do_in, do_out, dims)

    ''' Set up optimizer '''
    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(FLAGS.lrate, global_step, \
        NUM_ITERATIONS_PER_DECAY, FLAGS.lrate_decay, staircase=True)

    opt = None
    if FLAGS.optimizer == 'Adagrad':
        opt = tf.train.AdagradOptimizer(lr)
    elif FLAGS.optimizer == 'GradientDescent':
        opt = tf.train.GradientDescentOptimizer(lr)
    elif FLAGS.optimizer == 'Adam':
        opt = tf.train.AdamOptimizer(lr)
    else:
        opt = tf.train.RMSPropOptimizer(lr, FLAGS.decay)

    ''' Unused gradient clipping '''
    #gvs = opt.compute_gradients(CFR.tot_loss)
    #capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs]
    #train_step = opt.apply_gradients(capped_gvs, global_step=global_step)

    train_step = opt.minimize(CFR.tot_loss, global_step=global_step)

    ''' Set up for saving variables '''
    all_losses = []
    all_preds_train = []
    all_preds_test = []
    all_valid = []

    if FLAGS.varsel:
        all_weights = None
        all_beta = None

    all_preds_test = []

    ''' Handle repetitions '''
    n_experiments = FLAGS.experiments
    if FLAGS.repetitions > 1:
        if FLAGS.experiments > 1:
            log(logfile, 'ERROR: Use of both repetitions and multiple experiments is currently not supported.')
            sys.exit(1)
        n_experiments = FLAGS.repetitions

    ''' Run for all repeated experiments '''
    for i_exp in range(1, n_experiments+1):
        if FLAGS.repetitions > 1:
            log(logfile, 'Training on repeated initialization %d/%d...' % (i_exp, FLAGS.repetitions))
        else:
            log(logfile, 'Training on experiment %d/%d...' % (i_exp, n_experiments))

        ''' Load Data (if multiple repetitions, reuse first set) '''
        if i_exp == 1 or FLAGS.experiments > 1:
            D_exp_test = None
            if npz_input:
                D_exp = {}
                D_exp['x'] = D['x'][:,:,i_exp-1]
                D_exp['t'] = D['t'][:,i_exp-1:i_exp]
                D_exp['yf'] = D['yf'][:,i_exp-1:i_exp]
                if D['HAVE_TRUTH']:
                    D_exp['ycf'] = D['ycf'][:,i_exp-1:i_exp]
                else:
                    D_exp['ycf'] = None

                if has_test:
                    D_exp_test = {}
                    D_exp_test['x'] = D_test['x'][:,:,i_exp-1]
                    D_exp_test['t'] = D_test['t'][:,i_exp-1:i_exp]
                    D_exp_test['yf'] = D_test['yf'][:,i_exp-1:i_exp]
                    if D_test['HAVE_TRUTH']:
                        D_exp_test['ycf'] = D_test['ycf'][:,i_exp-1:i_exp]
                    else:
                        D_exp_test['ycf'] = None
            else:
                datapath = dataform % i_exp
                D_exp = load_data(datapath)
                if has_test:
                    datapath_test = dataform_test % i_exp
                    D_exp_test = load_data(datapath_test)

            D_exp['HAVE_TRUTH'] = D['HAVE_TRUTH']
            if has_test:
                D_exp_test['HAVE_TRUTH'] = D_test['HAVE_TRUTH']

        ''' Split into training and validation sets '''
        I_train, I_valid = validation_split(D_exp, FLAGS.val_part)

        ''' Run training loop '''
        losses, preds_train, preds_test, reps, reps_test = \
            train(CFR, sess, train_step, D_exp, I_valid, \
                D_exp_test, logfile, i_exp)

        ''' Collect all reps '''
        all_preds_train.append(preds_train)
        all_preds_test.append(preds_test)
        all_losses.append(losses)

        ''' Fix shape for output (n_units, dim, n_reps, n_outputs) '''
        out_preds_train = np.swapaxes(np.swapaxes(all_preds_train,1,3),0,2)
        if has_test:
            out_preds_test = np.swapaxes(np.swapaxes(all_preds_test,1,3),0,2)
        out_losses = np.swapaxes(np.swapaxes(all_losses,0,2),0,1)

        ''' Store predictions '''
        log(logfile, 'Saving result to %s...\n' % outdir)
        if FLAGS.output_csv:
            np.savetxt('%s_%d.csv' % (outform,i_exp), preds_train[-1], delimiter=',')
            np.savetxt('%s_%d.csv' % (outform_test,i_exp), preds_test[-1], delimiter=',')
            np.savetxt('%s_%d.csv' % (lossform,i_exp), losses, delimiter=',')

        ''' Compute weights if doing variable selection '''
        if FLAGS.varsel:
            if i_exp == 1:
                all_weights = sess.run(CFR.weights_in[0])
                all_beta = sess.run(CFR.weights_pred)
            else:
                all_weights = np.dstack((all_weights, sess.run(CFR.weights_in[0])))
                all_beta = np.dstack((all_beta, sess.run(CFR.weights_pred)))

        ''' Save results and predictions '''
        all_valid.append(I_valid)
        if FLAGS.varsel:
            np.savez(npzfile, pred=out_preds_train, loss=out_losses, w=all_weights,
                beta=all_beta, val=np.array(all_valid))
        else:
            np.savez(npzfile, pred=out_preds_train, loss=out_losses, val=np.array(all_valid))

        if has_test:
            np.savez(npzfile_test, pred=out_preds_test)

        ''' Save representations '''
        if FLAGS.save_rep and i_exp == 1:
            np.savez(repfile, rep=reps)
            if has_test:
                np.savez(repfile_test, rep=reps_test)
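# Data layout note (inferred from the slicing above, stated as an assumption): when the
# training file is a single .npz, D['x'] is indexed as [:, :, i_exp-1] and D['t']/D['yf']
# as [:, i_exp-1:i_exp], i.e. the arrays are expected to stack all repetitions/experiments
# along the last axis:
#
#   x  : (n_units, dim, n_experiments)
#   t  : (n_units, n_experiments)
#   yf : (n_units, n_experiments)
#   ycf: (n_units, n_experiments)   # only present when counterfactual outcomes are available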
def run(outdir):
    """ Runs an experiment and stores result in outdir """

    ''' Set up paths and start log '''
    npzfile = outdir+'result'
    npzfile_test = outdir+'result.test'
    repfile = outdir+'reps'
    repfile_test = outdir+'reps.test'
    outform = outdir+'y_pred'
    outform_test = outdir+'y_pred.test'
    lossform = outdir+'loss'
    logfile = outdir+'log.txt'
    f = open(logfile,'w')
    f.close()
    dataform = FLAGS.datadir + FLAGS.dataform

    has_test = False
    if not FLAGS.data_test == '':  # if test set supplied
        has_test = True
        dataform_test = FLAGS.datadir + FLAGS.data_test

    ''' Set random seeds '''
    random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)

    ''' Save parameters '''
    save_config(outdir+'config.txt')

    log(logfile, 'Training with hyperparameters: alpha=%.2g, lambda=%.2g' % (FLAGS.p_alpha, FLAGS.p_lambda))

    ''' Load Data '''
    npz_input = False
    if dataform[-3:] == 'npz':
        npz_input = True
    if npz_input:
        datapath = dataform
        if has_test:
            datapath_test = dataform_test
    else:
        datapath = dataform
        if has_test:
            datapath_test = dataform_test

    log(logfile, 'Training data: ' + datapath)
    if has_test:
        log(logfile, 'Test data: ' + datapath_test)

    #D = load_data(datapath)
    D = load_assistments_data(datapath, rname=FLAGS.rname, embeddings=FLAGS.embeddings)
    D_test = None
    if has_test:
        D_test = load_assistments_data(datapath_test, rname=FLAGS.rname, embeddings=FLAGS.embeddings)

    log(logfile, 'Loaded data with shape [%d,%d]' % (D['n'], D['dim']))

    ''' Start Session '''
    sess = tf.Session()

    ''' Parameter placeholders '''
    r_alpha = tf.placeholder("float", name='r_alpha')
    r_lambda = tf.placeholder("float", name='r_lambda')
    do_in = tf.placeholder("float", name='dropout_in')
    do_out = tf.placeholder("float", name='dropout_out')
    p = tf.placeholder("float", name='p_treated')

    ''' Initialize input placeholders '''
    if FLAGS.rnn:
        problem_set = FLAGS.ps
        trainable_embed = FLAGS.trainable_embed
        if trainable_embed:
            file_path = '../lstm-autoencoder/'+str(problem_set)+'_sq_train_data.csv'
        else:
            file_path = '../lstm-autoencoder/'+str(problem_set)+'_pl_train_data.csv'
        hidden_num = FLAGS.hidden_num
        pl_df = pd.read_csv(file_path)

        # the number of features
        d_num = 3 if trainable_embed else 2
        elem_num = len(pl_df.columns)-d_num

        # group by students
        pl_df.set_index('id', inplace=True)
        pl_g = pl_df.groupby('user_id')
        cnt_list = []
        for name, group in pl_g:
            cnt = len(group)
            cnt_list.append(cnt)
        max_len = max(cnt_list)
        avg_len = sum(cnt_list)/len(cnt_list)
        max_max_len = int(np.percentile(cnt_list, 70))
        print('max len {}'.format(max_len))
        print('avg len {}'.format(avg_len))
        print('max max len {}'.format(max_max_len))
        # cap the sequence length at the 70th percentile of per-student lengths
        max_len = min(max_len, max_max_len)

        if trainable_embed:
            # load ps list
            if FLAGS.rnn:
                ps_file = '../lstm-autoencoder/'+str(problem_set)+'_ps_index'
            else:
                ps_file = '../lstm-autoencoder/2016_ps_index'
            ps_list = []
            with open(ps_file) as f:
                for line in f:
                    ps_list.append(int(line))
            sq_embed_idx = tf.placeholder(tf.int32, [None, max_len])

        #max_len = 1000
        for i in range(len(cnt_list)):
            if cnt_list[i] > max_len:
                cnt_list[i] = max_len

        # get user id list
        user_list = pl_df['user_id'].unique().tolist()
        x_dict = {}
        len_dict = {}
        if trainable_embed:
            ps_dict = {}
        for ite in user_list:
            m = pl_g.get_group(ite).iloc[:, :-1*(d_num-1)].as_matrix()
            if trainable_embed:
                seq_ids = pl_g.get_group(ite)['sequence_id'].tolist()
                embed_ids = []
                for seq_id in seq_ids:
                    if seq_id in ps_list:
                        tmp_idx = ps_list.index(seq_id)
                        embed_ids.append(tmp_idx)
                    else:
                        embed_ids.append(len(ps_list))
            if max_len >= m.shape[0]:
                # pad short sequences with zeros up to max_len
                len_dict[ite] = m.shape[0]
                diff = max_len - m.shape[0]
                x_dict[ite] = np.pad(m, ((0,diff), (0,0)), mode='constant', constant_values=0)
                if trainable_embed:
                    ps_dict[ite] = embed_ids + [0]*diff
            else:
                # keep only the most recent max_len steps of long sequences
                len_dict[ite] = max_len
                x_dict[ite] = m[-1*max_len:, :]
                if trainable_embed:
                    ps_dict[ite] = embed_ids[-1*max_len:]

        # load user ids from exp data
        data = np.loadtxt(open(dataform,"rb"), delimiter=",")
        user_ids = data[:, 1]
        test_data = np.loadtxt(open(dataform_test,"rb"), delimiter=",")
        test_user_ids = test_data[:, 1]

        p_input = tf.placeholder(tf.float32, [None, max_len, elem_num])
        if FLAGS.trainable_embed:
            embedding_size = 10
            # look up embeddings
            W = tf.get_variable('W', shape=[len(ps_list)+1, embedding_size],
                initializer=tf.contrib.layers.xavier_initializer())
            sq_embed = tf.nn.embedding_lookup(W, sq_embed_idx)
            cell_input = tf.reshape(tf.expand_dims(sq_embed, -2) * tf.expand_dims(p_input, -1),
                [-1, max_len, embedding_size*elem_num])
        else:
            cell_input = p_input
        cell = tf.nn.rnn_cell.GRUCell(hidden_num)
        cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=do_in)
        seq_len = tf.placeholder(tf.int32, [None])
        z_codes, enc_state = tf.nn.dynamic_rnn(cell, cell_input, seq_len, dtype=tf.float32)
        # use the GRU end state as the input representation
        x = enc_state
        dims = [hidden_num, FLAGS.dim_in, FLAGS.dim_out]
    else:
        x = tf.placeholder("float", shape=[None, D['dim']], name='x')  # Features
        dims = [D['dim'], FLAGS.dim_in, FLAGS.dim_out]

    t = tf.placeholder("float", shape=[None, 1], name='t')    # Treatment
    y_ = tf.placeholder("float", shape=[None, 1], name='y_')  # Outcome

    ''' Define model graph '''
    log(logfile, 'Defining graph...\n')
    CFR = cfr.cfr_net(x, t, y_, p, FLAGS, r_alpha, r_lambda, do_in, do_out, dims)

    ''' Set up optimizer '''
    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(FLAGS.lrate, global_step, \
        NUM_ITERATIONS_PER_DECAY, FLAGS.lrate_decay, staircase=True)

    opt = None
    if FLAGS.optimizer == 'Adagrad':
        opt = tf.train.AdagradOptimizer(lr)
    elif FLAGS.optimizer == 'GradientDescent':
        opt = tf.train.GradientDescentOptimizer(lr)
    elif FLAGS.optimizer == 'Adam':
        opt = tf.train.AdamOptimizer(lr)
    else:
        opt = tf.train.RMSPropOptimizer(lr, FLAGS.decay)

    ''' Unused gradient clipping '''
    #gvs = opt.compute_gradients(CFR.tot_loss)
    #capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs]
    #train_step = opt.apply_gradients(capped_gvs, global_step=global_step)

    train_step = opt.minimize(CFR.tot_loss, global_step=global_step)

    ''' Set up for saving variables '''
    all_losses = []
    all_preds_train = []
    all_preds_test = []
    all_valid = []

    if FLAGS.varsel:
        all_weights = None
        all_beta = None

    all_preds_test = []

    ''' Handle repetitions '''
    n_experiments = FLAGS.experiments
    if FLAGS.repetitions > 1:
        if FLAGS.experiments > 1:
            log(logfile, 'ERROR: Use of both repetitions and multiple experiments is currently not supported.')
            sys.exit(1)
        n_experiments = FLAGS.repetitions

    ''' Run for all repeated experiments '''
    for i_exp in range(1, n_experiments+1):
        if FLAGS.repetitions > 1:
            log(logfile, 'Training on repeated initialization %d/%d...' % (i_exp, FLAGS.repetitions))
        else:
            log(logfile, 'Training on experiment %d/%d...' % (i_exp, n_experiments))

        ''' Load Data (if multiple repetitions, reuse first set) '''
        if i_exp == 1 or FLAGS.experiments > 1:
            D_exp_test = None
            if npz_input:
                D_exp = {}
                D_exp['x'] = D['x'][:,:,i_exp-1]
                D_exp['t'] = D['t'][:,i_exp-1:i_exp]
                D_exp['yf'] = D['yf'][:,i_exp-1:i_exp]
                if D['HAVE_TRUTH']:
                    D_exp['ycf'] = D['ycf'][:,i_exp-1:i_exp]
                else:
                    D_exp['ycf'] = None

                if has_test:
                    D_exp_test = {}
                    D_exp_test['x'] = D_test['x'][:,:,i_exp-1]
                    D_exp_test['t'] = D_test['t'][:,i_exp-1:i_exp]
                    D_exp_test['yf'] = D_test['yf'][:,i_exp-1:i_exp]
                    if D_test['HAVE_TRUTH']:
                        D_exp_test['ycf'] = D_test['ycf'][:,i_exp-1:i_exp]
                    else:
                        D_exp_test['ycf'] = None
            else:
                datapath = dataform
                D_exp = load_assistments_data(datapath, rname=FLAGS.rname, embeddings=FLAGS.embeddings)
                if has_test:
                    datapath_test = dataform_test
                    D_exp_test = load_assistments_data(datapath_test, rname=FLAGS.rname, embeddings=FLAGS.embeddings)

            D_exp['HAVE_TRUTH'] = D['HAVE_TRUTH']
            if has_test:
                D_exp_test['HAVE_TRUTH'] = D_test['HAVE_TRUTH']

        ''' Split into training and validation sets '''
        I_train, I_valid = validation_split(D_exp, FLAGS.val_part)

        ''' Run training loop '''
        # pass more parameters: p_input, seq_len, rnn
        if FLAGS.rnn:
            if FLAGS.trainable_embed:
                losses, preds_train, preds_test, reps, reps_test = \
                    train(CFR, sess, train_step, D_exp, I_valid, D_exp_test, logfile, i_exp,
                          user_ids, test_user_ids, x_dict, len_dict, p_input, seq_len,
                          ps_dict, sq_embed_idx)
            else:
                losses, preds_train, preds_test, reps, reps_test = \
                    train(CFR, sess, train_step, D_exp, I_valid, D_exp_test, logfile, i_exp,
                          user_ids, test_user_ids, x_dict, len_dict, p_input, seq_len)
        else:
            losses, preds_train, preds_test, reps, reps_test = \
                train(CFR, sess, train_step, D_exp, I_valid, D_exp_test, logfile, i_exp)

        ''' Collect all reps '''
        all_preds_train.append(preds_train)
        all_preds_test.append(preds_test)
        all_losses.append(losses)

        ''' Fix shape for output (n_units, dim, n_reps, n_outputs) '''
        out_preds_train = np.swapaxes(np.swapaxes(all_preds_train,1,3),0,2)
        if has_test:
            out_preds_test = np.swapaxes(np.swapaxes(all_preds_test,1,3),0,2)
        out_losses = np.swapaxes(np.swapaxes(all_losses,0,2),0,1)

        ''' Store predictions '''
        log(logfile, 'Saving result to %s...\n' % outdir)
        if FLAGS.output_csv:
            np.savetxt('%s_%d.csv' % (outform,i_exp), preds_train[-1], delimiter=',')
            np.savetxt('%s_%d.csv' % (outform_test,i_exp), preds_test[-1], delimiter=',')
            np.savetxt('%s_%d.csv' % (lossform,i_exp), losses, delimiter=',')

        ''' Compute weights if doing variable selection '''
        if FLAGS.varsel:
            if i_exp == 1:
                all_weights = sess.run(CFR.weights_in[0])
                all_beta = sess.run(CFR.weights_pred)
            else:
                all_weights = np.dstack((all_weights, sess.run(CFR.weights_in[0])))
                all_beta = np.dstack((all_beta, sess.run(CFR.weights_pred)))

        ''' Save results and predictions '''
        all_valid.append(I_valid)
        if FLAGS.varsel:
            np.savez(npzfile, pred=out_preds_train, loss=out_losses, w=all_weights,
                beta=all_beta, val=np.array(all_valid))
        else:
            np.savez(npzfile, pred=out_preds_train, loss=out_losses, val=np.array(all_valid))

        if has_test:
            np.savez(npzfile_test, pred=out_preds_test)

        ''' Save representations '''
        if FLAGS.save_rep and i_exp == 1:
            np.savez(repfile, rep=reps)
            if has_test:
                np.savez(repfile_test, rep=reps_test)
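# Feeding note for the RNN branch (a hedged sketch of an assumption, not code from this file):
# train() receives `user_ids`, `x_dict`, `len_dict`, `p_input` and `seq_len`, so a batch feed
# dict is presumably assembled along these lines (batch variable names are hypothetical):
#
#   users = user_ids[I]                                    # I: indices of the current batch
#   x_batch = np.stack([x_dict[int(u)] for u in users])    # (batch, max_len, elem_num)
#   l_batch = np.array([len_dict[int(u)] for u in users])  # true (unpadded) sequence lengths
#   feed = {p_input: x_batch, seq_len: l_batch, t: t_batch, y_: y_batch, ...}
#
# so that tf.nn.dynamic_rnn only unrolls over each student's actual sequence length and the
# GRU end state (x = enc_state above) becomes the representation fed to cfr_net.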