def train(filename_train,
          filename_model,
          regression=False,
          simple=False,
          n_features=14,
          n_hidden=40,
          n_epochs=5,
          batch_size=64,
          step_size=0.0005,
          decay=0.9,
          random_state=42,
          verbose=False,
          statlimit=-1):
    # Initialization
    gated = not simple

    if verbose:
        logging.info("Calling with...")
        logging.info("\tfilename_train = %s" % filename_train)
        logging.info("\tfilename_model = %s" % filename_model)
        logging.info("\tgated = %s" % gated)
        logging.info("\tn_features = %d" % n_features)
        logging.info("\tn_hidden = %d" % n_hidden)
        logging.info("\tn_epochs = %d" % n_epochs)
        logging.info("\tbatch_size = %d" % batch_size)
        logging.info("\tstep_size = %f" % step_size)
        logging.info("\tdecay = %f" % decay)
        logging.info("\trandom_state = %d" % random_state)

    rng = check_random_state(random_state)

    # Make data
    if verbose:
        logging.info("Loading data...")

    if filename_train[-1] == "e":  # ".pickle" file
        fd = open(filename_train, "rb")
        X, y = pickle.load(fd)
        fd.close()
    else:  # numpy archive
        X, y = np.load(filename_train)

    X = np.array(X).astype(dict)
    y = np.array(y).astype(float)

    # Shuffle the events and optionally truncate the sample
    flush = rng.permutation(len(X))
    X, y = X[flush], y[flush]
    if statlimit > 0:
        X, y = X[:statlimit], y[:statlimit]

    # Delete single-particle jets
    i = 0
    while i < len(X):
        if len(X[i]["content"]) == 1:
            X = np.delete(X, i)
            y = np.delete(y, i)
        else:
            i += 1

    if regression:
        # Baseline loss obtained by predicting the raw jet pT
        zerovalue = square_error(y, [x["pt"] for x in X]).mean()

    X = list(X)

    if verbose:
        logging.info("\tfilename = %s" % filename_train)
        logging.info("\tX size = %d" % len(X))
        logging.info("\ty size = %d" % len(y))

    # Preprocessing
    if verbose:
        logging.info("Preprocessing...")

    tf = create_tf_transform(X)
    X = apply_tf_transform(X, tf)

    # Split into train+validation
    logging.info("Splitting into train and validation...")

    X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=0.1,
                                                          random_state=rng)
    del X
    del y

    # Training
    if verbose:
        logging.info("Training...")

    if gated:
        predict = grnn_predict_gated
        init = grnn_init_gated
    else:
        predict = grnn_predict_simple
        init = grnn_init_simple

    trained_params = init(n_features, n_hidden, random_state=rng)
    n_batches = int(np.ceil(len(X_train) / batch_size))

    best_score = [np.inf]  # yuck, but works
    best_params = [trained_params]

    def loss(X, y, params):
        y_pred = predict(params, X, regression=regression)

        if regression:
            l = square_error(y, y_pred).mean()
        else:
            l = log_loss(y, y_pred).mean()

        return l

    def objective(params, iteration):
        rng = check_random_state(iteration % n_batches)
        start = rng.randint(len(X_train) - batch_size)
        idx = slice(start, start + batch_size)
        return loss(X_train[idx], y_train[idx], params)

    def callback(params, iteration, gradient):
        if iteration % 100 == 0:
            the_loss = loss(X_valid, y_valid, params)

            if the_loss < best_score[0]:
                best_score[0] = the_loss
                best_params[0] = copy.deepcopy(params)

                fd = open(filename_model, "wb")
                pickle.dump(best_params[0], fd)
                fd.close()

            if verbose:
                if regression:
                    logging.info(
                        "%5d\t~loss(train) = %.4f\tloss(valid) = %.4f"
                        "\tbest_loss(valid) = %.4f" % (
                            iteration,
                            loss(X_train[:5000], y_train[:5000], params),
                            loss(X_valid, y_valid, params),
                            best_score[0]))
                else:
                    roc_auc = roc_auc_score(
                        y_valid,
                        predict(params, X_valid, regression=regression))
                    logging.info(
                        "%5d\t~loss(train) = %.4f\tloss(valid) = %.4f"
                        "\troc_auc(valid) = %.4f\tbest_loss(valid) = %.4f" % (
                            iteration,
                            loss(X_train[:5000], y_train[:5000], params),
                            loss(X_valid, y_valid, params),
                            roc_auc,
                            best_score[0]))

    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)
        if regression:
            logging.info("zerovalue = %.4f" % zerovalue)

        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)

        step_size = step_size * decay
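# Minimal sketch of the optimizer interface the training loops in this module
# rely on: autograd's `adam` calls `grad_fn(params, iteration)` to obtain a
# gradient and `callback(params, iteration, gradient)` after every step, which
# is why the objectives and callbacks above use those signatures. The import
# path below is an assumption (older autograd releases expose it as
# `autograd.optimizers`); the quadratic objective and the numbers are
# illustrative only, not values from this repository.
import autograd as ag
import autograd.numpy as anp
from autograd.misc.optimizers import adam


def _toy_objective(params, iteration):
    # A fixed quadratic standing in for the mini-batch loss used above.
    return anp.sum((params - 3.0) ** 2)


def _toy_callback(params, iteration, gradient):
    if iteration % 10 == 0:
        print(iteration, _toy_objective(params, iteration))


_params = anp.zeros(2)
_params = adam(ag.grad(_toy_objective), _params,
               step_size=0.1, num_iters=50, callback=_toy_callback)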
def train(filename_train,
          filename_model,
          n_events_train=-1,
          simple=False,
          n_features=7,
          n_hidden=30,
          n_epochs=5,
          batch_size=64,
          step_size=0.01,
          decay=0.7,
          random_state=1):
    # Initialization
    gated = not simple

    logging.info("Calling with...")
    logging.info("\tfilename_train = %s" % filename_train)
    logging.info("\tfilename_model = %s" % filename_model)
    logging.info("\tn_events_train = %d" % n_events_train)
    logging.info("\tgated = %s" % gated)
    logging.info("\tn_features = %d" % n_features)
    logging.info("\tn_hidden = %d" % n_hidden)
    logging.info("\tn_epochs = %d" % n_epochs)
    logging.info("\tbatch_size = %d" % batch_size)
    logging.info("\tstep_size = %f" % step_size)
    logging.info("\tdecay = %f" % decay)
    logging.info("\trandom_state = %d" % random_state)

    rng = check_random_state(random_state)

    # Make data
    logging.info("Loading data...")

    fd = open(filename_train, "rb")
    X, y = pickle.load(fd)
    fd.close()
    y = np.array(y)

    if n_events_train > 0:
        indices = check_random_state(123).permutation(len(X))[:n_events_train]
        X = [X[i] for i in indices]
        y = y[indices]

    logging.info("\tfilename = %s" % filename_train)
    logging.info("\tX size = %d" % len(X))
    logging.info("\ty size = %d" % len(y))

    # Preprocessing
    logging.info("Preprocessing...")
    X = [extract(permute_by_pt(rewrite_content(jet))) for jet in X]
    tf = RobustScaler().fit(np.vstack([jet["content"] for jet in X]))

    for jet in X:
        jet["content"] = tf.transform(jet["content"])

    # Split into train+validation
    logging.info("Splitting into train and validation...")

    X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=5000,
                                                          random_state=rng)

    # Training
    logging.info("Training...")

    if gated:
        predict = grnn_predict_gated
        init = grnn_init_gated
    else:
        predict = grnn_predict_simple
        init = grnn_init_simple

    trained_params = init(n_features, n_hidden, random_state=rng)
    n_batches = int(np.ceil(len(X_train) / batch_size))

    best_score = [-np.inf]  # yuck, but works
    best_params = [trained_params]

    def loss(X, y, params):
        y_pred = predict(params, X)
        l = log_loss(y, y_pred).mean()
        return l

    def objective(params, iteration):
        rng = check_random_state(iteration % n_batches)
        start = rng.randint(len(X_train) - batch_size)
        idx = slice(start, start + batch_size)
        return loss(X_train[idx], y_train[idx], params)

    def callback(params, iteration, gradient):
        if iteration % 25 == 0:
            roc_auc = roc_auc_score(y_valid, predict(params, X_valid))

            if roc_auc > best_score[0]:
                best_score[0] = roc_auc
                best_params[0] = copy.deepcopy(params)

                fd = open(filename_model, "wb")
                pickle.dump(best_params[0], fd)
                fd.close()

            logging.info(
                "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f"
                "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" % (
                    iteration,
                    loss(X_train[:5000], y_train[:5000], params),
                    loss(X_valid, y_valid, params),
                    roc_auc,
                    best_score[0]))

    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)

        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)

        step_size = step_size * decay
def train(filename_train,
          filename_model,
          n_events,
          n_features_rnn=4,
          n_hidden_rnn=40,
          n_epochs=5,
          batch_size=64,
          step_size=0.01,
          decay=0.7,
          n_particles_per_event=10,
          random_state=1):
    # Initialization
    n_events = int(n_events)

    logging.info("Calling with...")
    logging.info("\tfilename_train = %s" % filename_train)
    logging.info("\tfilename_model = %s" % filename_model)
    logging.info("\tn_events = %d" % n_events)
    logging.info("\tn_features_rnn = %d" % n_features_rnn)
    logging.info("\tn_hidden_rnn = %d" % n_hidden_rnn)
    logging.info("\tn_epochs = %d" % n_epochs)
    logging.info("\tbatch_size = %d" % batch_size)
    logging.info("\tstep_size = %f" % step_size)
    logging.info("\tdecay = %f" % decay)
    logging.info("\tn_particles_per_event = %d" % n_particles_per_event)
    logging.info("\trandom_state = %d" % random_state)

    rng = check_random_state(random_state)

    # Make data
    logging.info("Loading data + preprocessing...")

    fd = open(filename_train, "rb")
    X = []
    y = []

    for i in range(n_events):
        v_i, y_i = pickle.load(fd)
        v_i = v_i[:n_particles_per_event]
        X.append(v_i)
        y.append(y_i)

    y = np.array(y)
    fd.close()

    logging.info("\tfilename = %s" % filename_train)
    logging.info("\tX size = %d" % len(X))
    logging.info("\ty size = %d" % len(y))

    # Preprocessing
    logging.info("Preprocessing...")
    tf_features = RobustScaler().fit(np.vstack([features for features in X]))

    for i in range(len(X)):
        X[i] = tf_features.transform(X[i])

        if len(X[i]) < n_particles_per_event:
            X[i] = np.vstack(
                [X[i], np.zeros((n_particles_per_event - len(X[i]), 4))])

    # Split into train+test
    logging.info("Splitting into train and validation...")

    X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=1000,
                                                          stratify=y,
                                                          random_state=rng)

    # Training
    logging.info("Training...")

    predict = event_baseline_predict
    init = event_baseline_init

    trained_params = init(n_features_rnn, n_hidden_rnn, random_state=rng)
    n_batches = int(np.ceil(len(X_train) / batch_size))

    best_score = [-np.inf]  # yuck, but works
    best_params = [trained_params]

    def loss(X, y, params):
        y_pred = predict(params, X,
                         n_particles_per_event=n_particles_per_event)
        l = log_loss(y, y_pred).mean()
        return l

    def objective(params, iteration):
        rng = check_random_state(iteration)
        start = rng.randint(len(X_train) - batch_size)
        idx = slice(start, start + batch_size)
        return loss(X_train[idx], y_train[idx], params)

    def callback(params, iteration, gradient):
        if iteration % 25 == 0:
            roc_auc = roc_auc_score(
                y_valid,
                predict(params, X_valid,
                        n_particles_per_event=n_particles_per_event))

            if roc_auc > best_score[0]:
                best_score[0] = roc_auc
                best_params[0] = copy.deepcopy(params)

                fd = open(filename_model, "wb")
                pickle.dump(best_params[0], fd)
                fd.close()

            logging.info(
                "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f"
                "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" % (
                    iteration,
                    loss(X_train[:5000], y_train[:5000], params),
                    loss(X_valid, y_valid, params),
                    roc_auc,
                    best_score[0]))

    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)

        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)

        step_size = step_size * decay
def train(filename_train,
          filename_model,
          n_events,
          pflow=False,
          n_features_embedding=7,
          n_hidden_embedding=40,
          n_features_rnn=40 + 4,
          n_hidden_rnn=10,
          n_epochs=5,
          batch_size=64,
          step_size=0.01,
          decay=0.7,
          n_jets_per_event=10,
          random_state=1):
    # Initialization
    n_events = int(n_events)

    if pflow:
        n_features_embedding += 4

    logging.info("Calling with...")
    logging.info("\tfilename_train = %s" % filename_train)
    logging.info("\tfilename_model = %s" % filename_model)
    logging.info("\tn_events = %d" % n_events)
    logging.info("\tpflow = %s" % pflow)
    logging.info("\tn_features_embedding = %d" % n_features_embedding)
    logging.info("\tn_hidden_embedding = %d" % n_hidden_embedding)
    logging.info("\tn_features_rnn = %d" % n_features_rnn)
    logging.info("\tn_hidden_rnn = %d" % n_hidden_rnn)
    logging.info("\tn_epochs = %d" % n_epochs)
    logging.info("\tbatch_size = %d" % batch_size)
    logging.info("\tstep_size = %f" % step_size)
    logging.info("\tdecay = %f" % decay)
    logging.info("\tn_jets_per_event = %d" % n_jets_per_event)
    logging.info("\trandom_state = %d" % random_state)

    rng = check_random_state(random_state)

    # Make data
    logging.info("Loading data + preprocessing...")

    fd = open(filename_train, "rb")

    # The training file is assumed to be formatted as a sequence of pickled
    # pairs (e_i, y_i), where e_i is a list of (phi, eta, pt, mass, jet)
    # tuples.
    X = []
    y = []

    for i in range(n_events):
        e_i, y_i = pickle.load(fd)

        original_features = []
        jets = []

        for j, (phi, eta, pt, mass, jet) in enumerate(e_i[:n_jets_per_event]):
            if len(jet["tree"]) > 1:
                original_features.append((phi, eta, pt, mass))
                jet = extract(permute_by_pt(rewrite_content(jet)),
                              pflow=pflow)
                jets.append(jet)

        if len(jets) == n_jets_per_event:
            X.append([np.array(original_features), jets])
            y.append(y_i)

    y = np.array(y)
    fd.close()

    logging.info("\tfilename = %s" % filename_train)
    logging.info("\tX size = %d" % len(X))
    logging.info("\ty size = %d" % len(y))

    # Preprocessing
    logging.info("Preprocessing...")
    tf_features = RobustScaler().fit(
        np.vstack([features for features, _ in X]))
    tf_content = RobustScaler().fit(
        np.vstack([j["content"] for _, jets in X for j in jets]))

    for i in range(len(X)):
        X[i][0] = tf_features.transform(X[i][0])

        for j in X[i][1]:
            j["content"] = tf_content.transform(j["content"])

    # Split into train+validation
    logging.info("Splitting into train and validation...")

    X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=1000,
                                                          stratify=y,
                                                          random_state=rng)

    # Training
    logging.info("Training...")

    predict = event_predict
    init = event_init

    trained_params = init(n_features_embedding, n_hidden_embedding,
                          n_features_rnn, n_hidden_rnn,
                          n_jets_per_event,
                          random_state=rng)

    n_batches = int(np.ceil(len(X_train) / batch_size))

    best_score = [-np.inf]  # yuck, but works
    best_params = [trained_params]

    def loss(X, y, params):
        y_pred = predict(params, X, n_jets_per_event=n_jets_per_event)
        l = log_loss(y, y_pred).mean()
        return l

    def objective(params, iteration):
        rng = check_random_state(iteration)
        start = rng.randint(len(X_train) - batch_size)
        idx = slice(start, start + batch_size)
        return loss(X_train[idx], y_train[idx], params)

    def callback(params, iteration, gradient):
        if iteration % 25 == 0:
            roc_auc = roc_auc_score(
                y_valid,
                predict(params, X_valid, n_jets_per_event=n_jets_per_event))

            if roc_auc > best_score[0]:
                best_score[0] = roc_auc
                best_params[0] = copy.deepcopy(params)

                fd = open(filename_model, "wb")
                pickle.dump(best_params[0], fd)
                fd.close()

            logging.info(
                "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f"
                "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" % (
                    iteration,
                    loss(X_train[:5000], y_train[:5000], params),
                    loss(X_valid, y_valid, params),
                    roc_auc,
                    best_score[0]))

    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)

        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)

        step_size = step_size * decay
def train(filename_train1,
          filename_train2,
          filename_model,
          n_features=7,
          n_hidden=40,
          n_epochs=5,
          batch_size=64,
          step_size=0.0005,
          decay=0.9,
          random_state=1):
    # Initialization
    logging.info("Calling with...")
    logging.info("\tfilename_train1 = %s" % filename_train1)
    logging.info("\tfilename_train2 = %s" % filename_train2)
    logging.info("\tfilename_model = %s" % filename_model)
    logging.info("\tn_features = %d" % n_features)
    logging.info("\tn_hidden = %d" % n_hidden)
    logging.info("\tn_epochs = %d" % n_epochs)
    logging.info("\tbatch_size = %d" % batch_size)
    logging.info("\tstep_size = %f" % step_size)
    logging.info("\tdecay = %f" % decay)
    logging.info("\trandom_state = %d" % random_state)

    rng = check_random_state(random_state)

    # Make data
    logging.info("Loading data...")

    fd = open(filename_train1, "rb")
    X1, y = pickle.load(fd)
    fd.close()
    y = np.array(y)

    fd = open(filename_train2, "rb")
    X2, _ = pickle.load(fd)
    fd.close()

    logging.info("\tfilename = %s" % filename_train1)
    logging.info("\tfilename = %s" % filename_train2)
    logging.info("\tX1 size = %d" % len(X1))
    logging.info("\tX2 size = %d" % len(X2))
    logging.info("\ty size = %d" % len(y))

    # Preprocessing
    logging.info("Preprocessing...")

    X1 = [extract(permute_by_pt(jet)) for jet in X1]
    tf = RobustScaler().fit(np.vstack([jet["content"] for jet in X1]))

    for jet in X1:
        jet["content"] = tf.transform(jet["content"])

    X2 = [extract(permute_by_pt(jet)) for jet in X2]
    tf = RobustScaler().fit(np.vstack([jet["content"] for jet in X2]))

    for jet in X2:
        jet["content"] = tf.transform(jet["content"])

    # Split into train+test
    logging.info("Splitting into train and validation...")

    X1_train, X1_valid, X2_train, X2_valid, y_train, y_valid = train_test_split(
        X1, X2, y, test_size=5000, random_state=rng)

    # Training
    logging.info("Training...")

    predict = grnn_predict_simple_join
    init = grnn_init_simple_join

    trained_params = init(n_features, n_hidden, random_state=rng)
    n_batches = int(np.ceil(len(X1_train) / batch_size))

    best_score = [-np.inf]  # yuck, but works
    best_params = [trained_params]

    def loss(X1, X2, y, params):
        y_pred = predict(params, X1, X2)
        l = log_loss(y, y_pred).mean()
        return l

    def objective(params, iteration):
        rng = check_random_state(iteration % n_batches)
        start = rng.randint(len(X1_train) - batch_size)
        idx = slice(start, start + batch_size)
        return loss(X1_train[idx], X2_train[idx], y_train[idx], params)

    def callback(params, iteration, gradient):
        if iteration % 25 == 0:
            roc_auc = roc_auc_score(y_valid,
                                    predict(params, X1_valid, X2_valid))

            if roc_auc > best_score[0]:
                best_score[0] = roc_auc
                best_params[0] = copy.deepcopy(params)

                fd = open(filename_model, "wb")
                pickle.dump(best_params[0], fd)
                fd.close()

            logging.info(
                "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f"
                "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" % (
                    iteration,
                    loss(X1_train[:5000], X2_train[:5000],
                         y_train[:5000], params),
                    loss(X1_valid, X2_valid, y_valid, params),
                    roc_auc,
                    best_score[0]))

    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)

        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)

        step_size = step_size * decay
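# Hypothetical driver for the joint two-collection model defined above: it
# only illustrates how the function signature is meant to be called. The file
# paths, logging setup and hyperparameter values are placeholders chosen for
# illustration, not files or settings from this repository.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO,
                        format="[%(asctime)s %(levelname)s] %(message)s")
    train("data/train-collection1.pickle",   # hypothetical path
          "data/train-collection2.pickle",   # hypothetical path
          "models/joint-model.pickle",       # hypothetical path
          n_features=7,
          n_hidden=40,
          n_epochs=5,
          step_size=0.0005,
          decay=0.9,
          random_state=1)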