Example #1
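These snippets assume module-level imports along the following lines. The standard-library, numpy, sklearn, and autograd imports are conventional; the project-specific names (extract, permute_by_pt, rewrite_content, the grnn_*/event_* model functions, log_loss, adam) come from the surrounding codebase, and the module paths in the commented lines are guesses:

import copy
import logging
import pickle

import autograd as ag
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.utils import check_random_state

# Project-specific imports -- module paths below are an assumption:
# from recnn.preprocessing import extract, permute_by_pt, rewrite_content
# from recnn.recnn import (adam, event_init, event_predict, grnn_init_gated,
#                          grnn_init_simple, grnn_predict_gated,
#                          grnn_predict_simple, log_loss)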
def train(filename_train,
          filename_model,
          n_events,
          pflow=False,
          n_features_embedding=7,
          n_hidden_embedding=40,
          n_features_rnn=40+4,
          n_hidden_rnn=10,
          n_epochs=5,
          batch_size=64,
          step_size=0.01,
          decay=0.7,
          n_jets_per_event=10,
          random_state=1):
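    """Train the event-level classifier.

    Reads ``n_events`` pickled ``(event, label)`` pairs from
    ``filename_train``, keeps events with ``n_jets_per_event`` usable jets,
    and checkpoints the parameters with the best validation ROC AUC to
    ``filename_model``.
    """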
    # Initialization
    n_events = int(n_events)
    if pflow:
        n_features_embedding += 4
    logging.info("Calling with...")
    logging.info("\tfilename_train = %s" % filename_train)
    logging.info("\tfilename_model = %s" % filename_model)
    logging.info("\tn_events = %d" % n_events)
    logging.info("\tpflow = %s" % pflow)
    logging.info("\tn_features_embedding = %d" % n_features_embedding)
    logging.info("\tn_hidden_embedding = %d" % n_hidden_embedding)
    logging.info("\tn_features_rnn = %d" % n_features_rnn)
    logging.info("\tn_hidden_rnn = %d" % n_hidden_rnn)
    logging.info("\tn_epochs = %d" % n_epochs)
    logging.info("\tbatch_size = %d" % batch_size)
    logging.info("\tstep_size = %f" % step_size)
    logging.info("\tdecay = %f" % decay)
    logging.info("\tn_jets_per_event = %d" % n_jets_per_event)
    logging.info("\trandom_state = %d" % random_state)
    rng = check_random_state(random_state)

    # Make data
    logging.info("Loading data + preprocessing...")

    fd = open(filename_train, "rb")

    # The training file is assumed to be formatted as a sequence of pickled
    # pairs (e_i, y_i), where e_i is a list of (phi, eta, pt, mass, jet) tuples.

    X = []
    y = []

    for i in range(n_events):
        e_i, y_i = pickle.load(fd)

        original_features = []
        jets = []

        for j, (phi, eta, pt, mass, jet) in enumerate(e_i[:n_jets_per_event]):
            # keep only jets with a non-trivial clustering tree
            if len(jet["tree"]) > 1:
                original_features.append((phi, eta, pt, mass))
                jet = extract(permute_by_pt(rewrite_content(jet)), pflow=pflow)
                jets.append(jet)

        # keep only events where all requested jets survived the cut
        if len(jets) == n_jets_per_event:
            X.append([np.array(original_features), jets])
            y.append(y_i)

    y = np.array(y)

    fd.close()

    logging.info("\tfilename = %s" % filename_train)
    logging.info("\tX size = %d" % len(X))
    logging.info("\ty size = %d" % len(y))

    # Preprocessing
    logging.info("Preprocessing...")
    tf_features = RobustScaler().fit(
        np.vstack([features for features, _ in X]))

    tf_content = RobustScaler().fit(
        np.vstack([j["content"] for _, jets in X for j in jets]))

    for i in range(len(X)):
        X[i][0] = tf_features.transform(X[i][0])

        for j in X[i][1]:
            j["content"] = tf_content.transform(j["content"])

    # Split into train+validation
    logging.info("Splitting into train and validation...")

    X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=1000,
                                                          stratify=y,
                                                          random_state=rng)

    # Training
    logging.info("Training...")

    predict = event_predict
    init = event_init

    trained_params = init(n_features_embedding, n_hidden_embedding,
                          n_features_rnn, n_hidden_rnn, n_jets_per_event,
                          random_state=rng)

    n_batches = int(np.ceil(len(X_train) / batch_size))
    best_score = [-np.inf]  # single-element lists so the callback closure can mutate them (yuck, but works)
    best_params = [trained_params]

    def loss(X, y, params):
        y_pred = predict(params, X,
                         n_jets_per_event=n_jets_per_event)
        l = log_loss(y, y_pred).mean()
        return l

    def objective(params, iteration):
        # draw a random contiguous mini-batch, seeded by the iteration number
        rng = check_random_state(iteration)
        start = rng.randint(len(X_train) - batch_size)
        idx = slice(start, start+batch_size)
        return loss(X_train[idx], y_train[idx], params)

    def callback(params, iteration, gradient):
        # every 25 iterations, evaluate on the validation set and checkpoint
        # the parameters with the best ROC AUC seen so far
        if iteration % 25 == 0:
            roc_auc = roc_auc_score(y_valid,
                                    predict(params, X_valid,
                                            n_jets_per_event=n_jets_per_event))

            if roc_auc > best_score[0]:
                best_score[0] = roc_auc
                best_params[0] = copy.deepcopy(params)

                fd = open(filename_model, "wb")
                pickle.dump(best_params[0], fd)
                fd.close()

            logging.info(
                "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f"
                "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" % (
                    iteration,
                    loss(X_train[:5000], y_train[:5000], params),
                    loss(X_valid, y_valid, params),
                    roc_auc,
                    best_score[0]))

    # one pass of n_batches adam iterations per epoch, decaying the step size
    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)

        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)
        step_size = step_size * decay
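A sketch of how this entry point might be invoked; the paths and event count below are hypothetical. Since ``n_events`` is cast to int internally, the function also tolerates string arguments from a CLI wrapper:

# hypothetical paths and event count, for illustration only
train("events_train.pickle",   # stream of pickled (e_i, y_i) pairs
      "event_model.pickle",    # best parameters are checkpointed here
      n_events=100000,
      n_jets_per_event=10)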
Example #2
def train(filename_train,
          filename_model,
          n_events_train=-1,
          simple=False,
          n_features=7,
          n_hidden=30,
          n_epochs=5,
          batch_size=64,
          step_size=0.01,
          decay=0.7,
          random_state=1):
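    """Train a recursive jet classifier.

    Loads a pickled ``(X, y)`` dataset from ``filename_train``, uses the
    gated model variant unless ``simple`` is set, and checkpoints the
    parameters with the best validation ROC AUC to ``filename_model``.
    """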
    # Initialization
    gated = not simple
    logging.info("Calling with...")
    logging.info("\tfilename_train = %s" % filename_train)
    logging.info("\tfilename_model = %s" % filename_model)
    logging.info("\tn_events_train = %d" % n_events_train)
    logging.info("\tgated = %s" % gated)
    logging.info("\tn_features = %d" % n_features)
    logging.info("\tn_hidden = %d" % n_hidden)
    logging.info("\tn_epochs = %d" % n_epochs)
    logging.info("\tbatch_size = %d" % batch_size)
    logging.info("\tstep_size = %f" % step_size)
    logging.info("\tdecay = %f" % decay)
    logging.info("\trandom_state = %d" % random_state)
    rng = check_random_state(random_state)

    # Make data
    logging.info("Loading data...")

    fd = open(filename_train, "rb")
    X, y = pickle.load(fd)
    fd.close()
    y = np.array(y)

    if n_events_train > 0:
        indices = check_random_state(123).permutation(len(X))[:n_events_train]
        X = [X[i] for i in indices]
        y = y[indices]

    logging.info("\tfilename = %s" % filename_train)
    logging.info("\tX size = %d" % len(X))
    logging.info("\ty size = %d" % len(y))

    # Preprocessing
    logging.info("Preprocessing...")
    X = [extract(permute_by_pt(rewrite_content(jet))) for jet in X]
    tf = RobustScaler().fit(np.vstack([jet["content"] for jet in X]))

    for jet in X:
        jet["content"] = tf.transform(jet["content"])

    # Split into train+validation
    logging.info("Splitting into train and validation...")

    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=5000,
                                                          random_state=rng)

    # Training
    logging.info("Training...")

    if gated:
        predict = grnn_predict_gated
        init = grnn_init_gated
    else:
        predict = grnn_predict_simple
        init = grnn_init_simple

    trained_params = init(n_features, n_hidden, random_state=rng)
    n_batches = int(np.ceil(len(X_train) / batch_size))
    best_score = [-np.inf]  # single-element lists so the callback closure can mutate them (yuck, but works)
    best_params = [trained_params]

    def loss(X, y, params):
        y_pred = predict(params, X)
        l = log_loss(y, y_pred).mean()
        return l

    def objective(params, iteration):
        # draw a random contiguous mini-batch, seeded by the iteration number
        rng = check_random_state(iteration % n_batches)
        start = rng.randint(len(X_train) - batch_size)
        idx = slice(start, start + batch_size)
        return loss(X_train[idx], y_train[idx], params)

    def callback(params, iteration, gradient):
        # every 25 iterations, evaluate on the validation set and checkpoint
        # the parameters with the best ROC AUC seen so far
        if iteration % 25 == 0:
            roc_auc = roc_auc_score(y_valid, predict(params, X_valid))

            if roc_auc > best_score[0]:
                best_score[0] = roc_auc
                best_params[0] = copy.deepcopy(params)

                fd = open(filename_model, "wb")
                pickle.dump(best_params[0], fd)
                fd.close()

            logging.info(
                "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f"
                "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" %
                (iteration, loss(X_train[:5000], y_train[:5000], params),
                 loss(X_valid, y_valid, params), roc_auc, best_score[0]))

    # one pass of n_batches adam iterations per epoch, decaying the step size
    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)

        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)
        step_size = step_size * decay
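Both training loops rely on the same optimizer pattern: ag.grad(objective) differentiates the mini-batch loss with respect to the parameters, and adam invokes callback(params, iteration, gradient) once per step. A minimal, self-contained toy sketch of that pattern, assuming autograd's bundled optimizer (autograd.misc.optimizers.adam in recent releases); the adam used above may be a project-local variant with the same interface:

import autograd as ag
import autograd.numpy as np
from autograd.misc.optimizers import adam  # autograd.optimizers in older releases

# toy objective: minimize (w - 3)^2; adam expects objective(params, iteration)
def objective(w, iteration):
    return (w - 3.0) ** 2

def callback(w, iteration, gradient):
    # called once per iteration with the current parameters and gradient
    if iteration % 25 == 0:
        print("%5d\tw=%.4f\tloss=%.4f" % (iteration, w, objective(w, iteration)))

w = adam(ag.grad(objective), np.array(0.0),
         step_size=0.1, num_iters=100, callback=callback)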
Example #3
def test(filename_train,
         filename_test,
         filename_model,
         n_events_train,
         n_events_test,
         filename_output,
         pflow=False,
         n_jets_per_event=10,
         random_state=1):
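    """Evaluate a trained event-level model.

    Re-fits the feature and content scalers on the training stream,
    preprocesses the test stream the same way, predicts in chunks of 1000
    events, and pickles the stacked (y_true, y_pred) columns to
    ``filename_output``.
    """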
    # Initialization
    n_events_train = int(n_events_train)
    n_events_test = int(n_events_test)
    logging.info("Calling with...")
    logging.info("\tfilename_train = %s" % filename_train)
    logging.info("\tfilename_test = %s" % filename_test)
    logging.info("\tfilename_model = %s" % filename_model)
    logging.info("\tn_events_train = %d" % n_events_train)
    logging.info("\tn_events_test = %d" % n_events_test)
    logging.info("\tfilename_output = %s" % filename_output)
    logging.info("\tpflow = %s" % pflow)
    logging.info("\tn_jets_per_event = %d" % n_jets_per_event)
    logging.info("\trandom_state = %d" % random_state)
    rng = check_random_state(random_state)

    # Make data
    logging.info("Loading train data + preprocessing...")

    fd = open(filename_train, "rb")

    # The training file is assumed to be formatted as a sequence of pickled
    # pairs (e_i, y_i), where e_i is a list of (phi, eta, pt, mass, jet) tuples.

    X = []
    y = []

    for i in range(n_events_train):
        e_i, y_i = pickle.load(fd)

        original_features = []
        jets = []

        for j, (phi, eta, pt, mass, jet) in enumerate(e_i[:n_jets_per_event]):
            # keep only jets with a non-trivial clustering tree
            if len(jet["tree"]) > 1:
                original_features.append((phi, eta, pt, mass))
                jet = extract(permute_by_pt(rewrite_content(jet)), pflow=pflow)
                jets.append(jet)

        # keep only events where all requested jets survived the cut
        if len(jets) == n_jets_per_event:
            X.append([np.array(original_features), jets])
            y.append(y_i)

    y = np.array(y)

    fd.close()

    logging.info("\tfilename = %s" % filename_train)
    logging.info("\tX size = %d" % len(X))
    logging.info("\ty size = %d" % len(y))

    # Building scalers
    logging.info("Building scalers...")
    tf_features = RobustScaler().fit(np.vstack([features
                                                for features, _ in X]))
    tf_content = RobustScaler().fit(
        np.vstack([j["content"] for _, jets in X for j in jets]))

    # drop the training data; only the fitted scalers are needed from here on
    X = None
    y = None

    # Loading test data
    logging.info("Loading test data + preprocessing...")

    fd = open(filename_test, "rb")

    # The test file is assumed to be formatted as a sequence of pickled
    # pairs (e_i, y_i), where e_i is a list of (phi, eta, pt, mass, jet) tuples.

    X = []
    y = []

    for i in range(n_events_test):
        e_i, y_i = pickle.load(fd)

        original_features = []
        jets = []

        for j, (phi, eta, pt, mass, jet) in enumerate(e_i[:n_jets_per_event]):
            if len(jet["tree"]) > 1:
                original_features.append((phi, eta, pt, mass))
                jet = extract(permute_by_pt(rewrite_content(jet)), pflow=pflow)
                jets.append(jet)

        if len(jets) == n_jets_per_event:
            X.append([np.array(original_features), jets])
            y.append(y_i)

    y = np.array(y)

    fd.close()

    logging.info("\tfilename = %s" % filename_train)
    logging.info("\tX size = %d" % len(X))
    logging.info("\ty size = %d" % len(y))

    # Scaling
    logging.info("Scaling...")

    for i in range(len(X)):
        X[i][0] = tf_features.transform(X[i][0])

        for j in X[i][1]:
            j["content"] = tf_content.transform(j["content"])

    # Testing
    logging.info("Testing...")

    predict = event_predict

    fd = open(filename_model, "rb")
    params = pickle.load(fd)
    fd.close()

    all_y_pred = []

    # predict in chunks of 1000 events to bound memory use
    for start in range(0, len(y), 1000):
        y_pred = predict(params,
                         X[start:start + 1000],
                         n_jets_per_event=n_jets_per_event)
        all_y_pred.append(y_pred)

    y_pred = np.concatenate(all_y_pred)

    # Save
    output = np.hstack((y.reshape(-1, 1), y_pred.reshape(-1, 1)))

    fd = open(filename_output, "wb")
    pickle.dump(output, fd, protocol=2)
    fd.close()
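The saved pickle is simply a two-column array of true labels and predicted scores (protocol 2 keeps it readable from Python 2). A minimal sketch of reading it back and scoring it; the file name is hypothetical:

import pickle

import numpy as np
from sklearn.metrics import roc_auc_score

with open("predictions.pickle", "rb") as fd:  # hypothetical filename_output
    output = pickle.load(fd)

y_true, y_score = output[:, 0], output[:, 1]  # columns written by test() above
print("roc_auc(test) = %.4f" % roc_auc_score(y_true, y_score))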