Example #1
def train(filename_train,
          filename_model,
          regression=False,
          simple=False,
          n_features=14,
          n_hidden=40,
          n_epochs=5,
          batch_size=64,
          step_size=0.0005,
          decay=0.9,
          random_state=42,
          verbose=False,
          statlimit=-1):
    # Initialization
    gated = not simple
    if verbose:
        logging.info("Calling with...")
        logging.info("\tfilename_train = %s" % filename_train)
        logging.info("\tfilename_model = %s" % filename_model)
        logging.info("\tgated = %s" % gated)
        logging.info("\tn_features = %d" % n_features)
        logging.info("\tn_hidden = %d" % n_hidden)
        logging.info("\tn_epochs = %d" % n_epochs)
        logging.info("\tbatch_size = %d" % batch_size)
        logging.info("\tstep_size = %f" % step_size)
        logging.info("\tdecay = %f" % decay)
        logging.info("\trandom_state = %d" % random_state)
    rng = check_random_state(random_state)

    # Make data
    if verbose:
        logging.info("Loading data...")
    # Crude format check: a pickle filename (".pickle") ends in "e", ".npy" does not.
    if filename_train.endswith("e"):
        with open(filename_train, "rb") as fd:
            X, y = pickle.load(fd)
    else:
        X, y = np.load(filename_train)
    X = np.array(X, dtype=object)  # object array of jet dicts, so fancy indexing works
    y = np.array(y, dtype=float)
    if statlimit < 0:
        statlimit = None  # -1 means "keep all events" (a literal [:-1] would drop one)
    perm = rng.permutation(len(X))  # use the seeded rng so runs are reproducible
    X, y = X[perm][:statlimit], y[perm][:statlimit]
    # Delete single-particle jets (no tree structure to recurse over)
    mask = np.array([len(jet["content"]) > 1 for jet in X])
    X, y = X[mask], y[mask]

    if regression:
        # Baseline: mean squared error of predicting the raw jet pT directly
        zerovalue = square_error(y, [x["pt"] for x in X]).mean()

    X = list(X)
    if verbose:
        logging.info("\tfilename = %s" % filename_train)
        logging.info("\tX size = %d" % len(X))
        logging.info("\ty size = %d" % len(y))

    # Preprocessing
    if verbose:
        logging.info("Preprocessing...")
    tf = create_tf_transform(X)

    X = apply_tf_transform(X, tf)

    # Split into train+validation
    logging.info("Splitting into train and validation...")

    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.1,
                                                          random_state=rng)
    del X
    del y
    # Training
    if verbose:
        logging.info("Training...")

    if gated:
        predict = grnn_predict_gated
        init = grnn_init_gated
    else:
        predict = grnn_predict_simple
        init = grnn_init_simple

    trained_params = init(n_features, n_hidden, random_state=rng)
    n_batches = int(np.ceil(len(X_train) / batch_size))
    best_score = [np.inf]  # yuck, but works
    best_params = [trained_params]

    def loss(X, y, params):
        y_pred = predict(params, X, regression=regression)
        if regression:
            l = square_error(y, y_pred).mean()
        else:
            l = log_loss(y, y_pred).mean()
        return l

    def objective(params, iteration):
        rng = check_random_state(iteration % n_batches)
        start = rng.randint(len(X_train) - batch_size)
        idx = slice(start, start + batch_size)
        return loss(X_train[idx], y_train[idx], params)

    # adam() invokes the callback as callback(params, iteration, gradient); a
    # fourth `regression=False` parameter would shadow the outer flag and make
    # the classification branch below run even in regression mode.
    def callback(params, iteration, gradient):
        if iteration % 100 == 0:
            the_loss = loss(X_valid, y_valid, params)
            if the_loss < best_score[0]:
                best_score[0] = the_loss
                best_params[0] = copy.deepcopy(params)

                with open(filename_model, "wb") as fd:
                    pickle.dump(best_params[0], fd)

            if verbose:
                if regression:
                    logging.info(
                        "%5d\t~loss(train) = %.4f\tloss(valid) = %.4f"
                        "\tbest_loss(valid) = %.4f" %
                        (iteration, loss(X_train[:5000], y_train[:5000],
                                         params), loss(X_valid, y_valid,
                                                       params), best_score[0]))
                else:
                    roc_auc = roc_auc_score(
                        y_valid, predict(params,
                                         X_valid,
                                         regression=regression))
                    logging.info(
                        "%5d\t~loss(train) = %.4f\tloss(valid) = %.4f"
                        "\troc_auc(valid) = %.4f\tbest_loss(valid) = %.4f" %
                        (iteration, loss(X_train[:5000], y_train[:5000],
                                         params), loss(
                                             X_valid, y_valid,
                                             params), roc_auc, best_score[0]))

    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)
        if regression:
            logging.info("zerovalue = %.4f" % zerovalue)

        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)
        step_size = step_size * decay
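
A note on the batching scheme above: objective() draws a random contiguous window of the training set, seeded by iteration % n_batches, so one "epoch" of n_batches Adam iterations is not a strict pass over the data, and because the seed wraps around, every epoch revisits the same n_batches windows. A self-contained sketch of just that sampling logic (the sizes are made up for illustration):

import numpy as np
from sklearn.utils import check_random_state

n_train, batch_size = 1000, 64
n_batches = int(np.ceil(n_train / batch_size))

def batch_slice(iteration):
    # Same scheme as objective(): seed by the (wrapped) iteration count,
    # then draw a random start for a contiguous window of batch_size events.
    rng = check_random_state(iteration % n_batches)
    start = rng.randint(n_train - batch_size)
    return slice(start, start + batch_size)

print(batch_slice(0))           # some window, e.g. slice(684, 748, None)
print(batch_slice(n_batches))   # identical window: the seed has wrapped around
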
Example #2
def train(filename_train,
          filename_model,
          n_events_train=-1,
          simple=False,
          n_features=7,
          n_hidden=30,
          n_epochs=5,
          batch_size=64,
          step_size=0.01,
          decay=0.7,
          random_state=1):
    # Initialization
    gated = not simple
    logging.info("Calling with...")
    logging.info("\tfilename_train = %s" % filename_train)
    logging.info("\tfilename_model = %s" % filename_model)
    logging.info("\tn_events_train = %d" % n_events_train)
    logging.info("\tgated = %s" % gated)
    logging.info("\tn_features = %d" % n_features)
    logging.info("\tn_hidden = %d" % n_hidden)
    logging.info("\tn_epochs = %d" % n_epochs)
    logging.info("\tbatch_size = %d" % batch_size)
    logging.info("\tstep_size = %f" % step_size)
    logging.info("\tdecay = %f" % decay)
    logging.info("\trandom_state = %d" % random_state)
    rng = check_random_state(random_state)

    # Make data
    logging.info("Loading data...")

    with open(filename_train, "rb") as fd:
        X, y = pickle.load(fd)
    y = np.array(y)

    if n_events_train > 0:
        indices = check_random_state(123).permutation(len(X))[:n_events_train]
        X = [X[i] for i in indices]
        y = y[indices]

    logging.info("\tfilename = %s" % filename_train)
    logging.info("\tX size = %d" % len(X))
    logging.info("\ty size = %d" % len(y))

    # Preprocessing
    logging.info("Preprocessing...")
    X = [extract(permute_by_pt(rewrite_content(jet))) for jet in X]
    tf = RobustScaler().fit(np.vstack([jet["content"] for jet in X]))

    for jet in X:
        jet["content"] = tf.transform(jet["content"])

    # Split into train+validation
    logging.info("Splitting into train and validation...")

    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=5000,
                                                          random_state=rng)

    # Training
    logging.info("Training...")

    if gated:
        predict = grnn_predict_gated
        init = grnn_init_gated
    else:
        predict = grnn_predict_simple
        init = grnn_init_simple

    trained_params = init(n_features, n_hidden, random_state=rng)
    n_batches = int(np.ceil(len(X_train) / batch_size))
    best_score = [-np.inf]  # yuck, but works
    best_params = [trained_params]

    def loss(X, y, params):
        y_pred = predict(params, X)
        l = log_loss(y, y_pred).mean()
        return l

    def objective(params, iteration):
        rng = check_random_state(iteration % n_batches)
        start = rng.randint(len(X_train) - batch_size)
        idx = slice(start, start + batch_size)
        return loss(X_train[idx], y_train[idx], params)

    def callback(params, iteration, gradient):
        if iteration % 25 == 0:
            roc_auc = roc_auc_score(y_valid, predict(params, X_valid))

            if roc_auc > best_score[0]:
                best_score[0] = roc_auc
                best_params[0] = copy.deepcopy(params)

                with open(filename_model, "wb") as fd:
                    pickle.dump(best_params[0], fd)

            logging.info(
                "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f"
                "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" %
                (iteration, loss(X_train[:5000], y_train[:5000], params),
                 loss(X_valid, y_valid, params), roc_auc, best_score[0]))

    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)

        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)
        step_size = step_size * decay
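
The one-element lists best_score and best_params above exist so that callback() can mutate enclosing-scope state ("yuck, but works", as the comment admits). In Python 3 the same checkpointing bookkeeping can be written directly with nonlocal; a minimal self-contained sketch (the scores are made up):

import copy

def make_checkpointer():
    best_score = -float("inf")
    best_params = None

    def update(params, score):
        nonlocal best_score, best_params  # replaces the one-element-list trick
        if score > best_score:
            best_score = score
            best_params = copy.deepcopy(params)
        return best_score

    return update

update = make_checkpointer()
print(update({"w": 1.0}, 0.80))  # 0.80 becomes the new best
print(update({"w": 2.0}, 0.75))  # still 0.80; the earlier checkpoint is kept
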
Example #3
def train(filename_train,
          filename_model,
          n_events,
          n_features_rnn=4,
          n_hidden_rnn=40,
          n_epochs=5,
          batch_size=64,
          step_size=0.01,
          decay=0.7,
          n_particles_per_event=10,
          random_state=1):
    # Initialization
    n_events = int(n_events)
    logging.info("Calling with...")
    logging.info("\tfilename_train = %s" % filename_train)
    logging.info("\tfilename_model = %s" % filename_model)
    logging.info("\tn_events = %d" % n_events)
    logging.info("\tn_features_rnn = %d" % n_features_rnn)
    logging.info("\tn_hidden_rnn = %d" % n_hidden_rnn)
    logging.info("\tn_epochs = %d" % n_epochs)
    logging.info("\tbatch_size = %d" % batch_size)
    logging.info("\tstep_size = %f" % step_size)
    logging.info("\tdecay = %f" % decay)
    logging.info("\tn_particles_per_event = %d" % n_particles_per_event)
    logging.info("\trandom_state = %d" % random_state)
    rng = check_random_state(random_state)

    # Make data
    logging.info("Loading data + preprocessing...")

    X = []
    y = []

    # The training file is a stream of pickled (event, label) pairs;
    # each pickle.load() consumes one record.
    with open(filename_train, "rb") as fd:
        for i in range(n_events):
            v_i, y_i = pickle.load(fd)
            v_i = v_i[:n_particles_per_event]  # truncate to the fixed event length

            X.append(v_i)
            y.append(y_i)

    y = np.array(y)

    logging.info("\tfilename = %s" % filename_train)
    logging.info("\tX size = %d" % len(X))
    logging.info("\ty size = %d" % len(y))

    # Preprocessing
    logging.info("Preprocessing...")
    tf_features = RobustScaler().fit(np.vstack(X))

    for i in range(len(X)):
        X[i] = tf_features.transform(X[i])

        # Pad short events with zero rows so every event has exactly
        # n_particles_per_event rows of 4 particle features
        if len(X[i]) < n_particles_per_event:
            X[i] = np.vstack(
                [X[i], np.zeros((n_particles_per_event - len(X[i]), 4))])

    # Split into train+validation
    logging.info("Splitting into train and validation...")

    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=1000,
                                                          stratify=y,
                                                          random_state=rng)

    # Training
    logging.info("Training...")

    predict = event_baseline_predict
    init = event_baseline_init

    trained_params = init(n_features_rnn, n_hidden_rnn, random_state=rng)

    n_batches = int(np.ceil(len(X_train) / batch_size))
    best_score = [-np.inf]  # yuck, but works
    best_params = [trained_params]

    def loss(X, y, params):
        y_pred = predict(params,
                         X,
                         n_particles_per_event=n_particles_per_event)
        l = log_loss(y, y_pred).mean()
        return l

    def objective(params, iteration):
        rng = check_random_state(iteration)
        start = rng.randint(len(X_train) - batch_size)
        idx = slice(start, start + batch_size)
        return loss(X_train[idx], y_train[idx], params)

    def callback(params, iteration, gradient):
        if iteration % 25 == 0:
            roc_auc = roc_auc_score(
                y_valid,
                predict(params,
                        X_valid,
                        n_particles_per_event=n_particles_per_event))

            if roc_auc > best_score[0]:
                best_score[0] = roc_auc
                best_params[0] = copy.deepcopy(params)

                with open(filename_model, "wb") as fd:
                    pickle.dump(best_params[0], fd)

            logging.info(
                "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f"
                "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" %
                (iteration, loss(X_train[:5000], y_train[:5000], params),
                 loss(X_valid, y_valid, params), roc_auc, best_score[0]))

    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)

        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)
        step_size = step_size * decay
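
The preprocessing loop above zero-pads short events so that every input has exactly n_particles_per_event rows of particle features. The same padding step in isolation (toy shapes, not from the original):

import numpy as np

def pad_event(features, n_rows):
    # Truncate to n_rows, then append zero rows until the event is full length
    features = np.asarray(features, dtype=float)[:n_rows]
    if len(features) < n_rows:
        pad = np.zeros((n_rows - len(features), features.shape[1]))
        features = np.vstack([features, pad])
    return features

event = [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]  # 2 particles, 4 features
print(pad_event(event, n_rows=10).shape)  # (10, 4)
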
Example #4
def train(filename_train,
          filename_model,
          n_events,
          pflow=False,
          n_features_embedding=7,
          n_hidden_embedding=40,
          n_features_rnn=40+4,
          n_hidden_rnn=10,
          n_epochs=5,
          batch_size=64,
          step_size=0.01,
          decay=0.7,
          n_jets_per_event=10,
          random_state=1):
    # Initialization
    n_events = int(n_events)
    if pflow:
        n_features_embedding += 4
    logging.info("Calling with...")
    logging.info("\tfilename_train = %s" % filename_train)
    logging.info("\tfilename_model = %s" % filename_model)
    logging.info("\tn_events = %d" % n_events)
    logging.info("\tpflow = %s" % pflow)
    logging.info("\tn_features_embedding = %d" % n_features_embedding)
    logging.info("\tn_hidden_embedding = %d" % n_hidden_embedding)
    logging.info("\tn_features_rnn = %d" % n_features_rnn)
    logging.info("\tn_hidden_rnn = %d" % n_hidden_rnn)
    logging.info("\tn_epochs = %d" % n_epochs)
    logging.info("\tbatch_size = %d" % batch_size)
    logging.info("\tstep_size = %f" % step_size)
    logging.info("\tdecay = %f" % decay)
    logging.info("\tn_jets_per_event = %d" % n_jets_per_event)
    logging.info("\trandom_state = %d" % random_state)
    rng = check_random_state(random_state)

    # Make data
    logging.info("Loading data + preprocessing...")

    # The training file is assumed to be formatted as a sequence of pickled
    # pairs (e_i, y_i), where e_i is a list of (phi, eta, pt, mass, jet)
    # tuples; each pickle.load() consumes one pair.

    X = []
    y = []

    with open(filename_train, "rb") as fd:
        for i in range(n_events):
            e_i, y_i = pickle.load(fd)

            original_features = []
            jets = []

            for phi, eta, pt, mass, jet in e_i[:n_jets_per_event]:
                if len(jet["tree"]) > 1:  # keep only jets with a non-trivial tree
                    original_features.append((phi, eta, pt, mass))
                    jet = extract(permute_by_pt(rewrite_content(jet)), pflow=pflow)
                    jets.append(jet)

            # Keep only events where all requested jets passed the cut
            if len(jets) == n_jets_per_event:
                X.append([np.array(original_features), jets])
                y.append(y_i)

    y = np.array(y)

    logging.info("\tfilename = %s" % filename_train)
    logging.info("\tX size = %d" % len(X))
    logging.info("\ty size = %d" % len(y))

    # Preprocessing
    logging.info("Preprocessing...")
    tf_features = RobustScaler().fit(
        np.vstack([features for features, _ in X]))

    tf_content = RobustScaler().fit(
        np.vstack([j["content"] for _, jets in X for j in jets]))

    for i in range(len(X)):
        X[i][0] = tf_features.transform(X[i][0])

        for j in X[i][1]:
            j["content"] = tf_content.transform(j["content"])

    # Split into train+validation
    logging.info("Splitting into train and validation...")

    X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=1000,
                                                          stratify=y,
                                                          random_state=rng)

    # Training
    logging.info("Training...")

    predict = event_predict
    init = event_init

    trained_params = init(n_features_embedding, n_hidden_embedding,
                          n_features_rnn, n_hidden_rnn, n_jets_per_event,
                          random_state=rng)

    n_batches = int(np.ceil(len(X_train) / batch_size))
    best_score = [-np.inf]  # yuck, but works
    best_params = [trained_params]

    def loss(X, y, params):
        y_pred = predict(params, X,
                         n_jets_per_event=n_jets_per_event)
        l = log_loss(y, y_pred).mean()
        return l

    def objective(params, iteration):
        rng = check_random_state(iteration)
        start = rng.randint(len(X_train) - batch_size)
        idx = slice(start, start+batch_size)
        return loss(X_train[idx], y_train[idx], params)

    def callback(params, iteration, gradient):
        if iteration % 25 == 0:
            roc_auc = roc_auc_score(y_valid,
                                    predict(params, X_valid,
                                            n_jets_per_event=n_jets_per_event))

            if roc_auc > best_score[0]:
                best_score[0] = roc_auc
                best_params[0] = copy.deepcopy(params)

                with open(filename_model, "wb") as fd:
                    pickle.dump(best_params[0], fd)

            logging.info(
                "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f"
                "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" % (
                    iteration,
                    loss(X_train[:5000], y_train[:5000], params),
                    loss(X_valid, y_valid, params),
                    roc_auc,
                    best_score[0]))

    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)

        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)
        step_size = step_size * decay
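
The loader above relies on the training file being a stream of back-to-back pickled pairs rather than a single pickled list: each pickle.load() consumes exactly one record and leaves the file cursor at the start of the next. A self-contained round-trip of that format, in memory with toy data:

import io
import pickle

buf = io.BytesIO()
for record in [(["jet_a", "jet_b"], 0), (["jet_c", "jet_d"], 1)]:
    pickle.dump(record, buf)  # records are simply concatenated

buf.seek(0)
for _ in range(2):
    e_i, y_i = pickle.load(buf)  # reads one record per call
    print(e_i, y_i)
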
Example #5
def train(filename_train1,
          filename_train2,
          filename_model,
          n_features=7,
          n_hidden=40,
          n_epochs=5,
          batch_size=64,
          step_size=0.0005,
          decay=0.9,
          random_state=1):
    # Initialization
    logging.info("Calling with...")
    logging.info("\tfilename_train1 = %s" % filename_train1)
    logging.info("\tfilename_train2 = %s" % filename_train2)
    logging.info("\tfilename_model = %s" % filename_model)
    logging.info("\tn_features = %d" % n_features)
    logging.info("\tn_hidden = %d" % n_hidden)
    logging.info("\tn_epochs = %d" % n_epochs)
    logging.info("\tbatch_size = %d" % batch_size)
    logging.info("\tstep_size = %f" % step_size)
    logging.info("\tdecay = %f" % decay)
    logging.info("\trandom_state = %d" % random_state)
    rng = check_random_state(random_state)

    # Make data
    logging.info("Loading data...")

    with open(filename_train1, "rb") as fd:
        X1, y = pickle.load(fd)
    y = np.array(y)

    with open(filename_train2, "rb") as fd:
        X2, _ = pickle.load(fd)

    logging.info("\tfilename = %s" % filename_train1)
    logging.info("\tfilename = %s" % filename_train2)
    logging.info("\tX1 size = %d" % len(X1))
    logging.info("\tX2 size = %d" % len(X2))
    logging.info("\ty size = %d" % len(y))

    # Preprocessing
    logging.info("Preprocessing...")

    # Each input stream is scaled independently, with its own RobustScaler
    X1 = [extract(permute_by_pt(jet)) for jet in X1]
    tf1 = RobustScaler().fit(np.vstack([jet["content"] for jet in X1]))

    for jet in X1:
        jet["content"] = tf1.transform(jet["content"])

    X2 = [extract(permute_by_pt(jet)) for jet in X2]
    tf2 = RobustScaler().fit(np.vstack([jet["content"] for jet in X2]))

    for jet in X2:
        jet["content"] = tf2.transform(jet["content"])

    # Split into train+validation
    logging.info("Splitting into train and validation...")

    X1_train, X1_valid, X2_train, X2_valid, y_train, y_valid = train_test_split(
        X1, X2, y, test_size=5000, random_state=rng)

    # Training
    logging.info("Training...")

    predict = grnn_predict_simple_join
    init = grnn_init_simple_join

    trained_params = init(n_features, n_hidden, random_state=rng)
    n_batches = int(np.ceil(len(X1_train) / batch_size))
    best_score = [-np.inf]  # yuck, but works
    best_params = [trained_params]

    def loss(X1, X2, y, params):
        y_pred = predict(params, X1, X2)
        l = log_loss(y, y_pred).mean()
        return l

    def objective(params, iteration):
        rng = check_random_state(iteration % n_batches)
        start = rng.randint(len(X1_train) - batch_size)
        idx = slice(start, start + batch_size)
        return loss(X1_train[idx], X2_train[idx], y_train[idx], params)

    def callback(params, iteration, gradient):
        if iteration % 25 == 0:
            roc_auc = roc_auc_score(y_valid, predict(params, X1_valid,
                                                     X2_valid))

            if roc_auc > best_score[0]:
                best_score[0] = roc_auc
                best_params[0] = copy.deepcopy(params)

                with open(filename_model, "wb") as fd:
                    pickle.dump(best_params[0], fd)

            logging.info(
                "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f"
                "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" %
                (iteration,
                 loss(X1_train[:5000], X2_train[:5000], y_train[:5000],
                      params), loss(X1_valid, X2_valid, y_valid,
                                    params), roc_auc, best_score[0]))

    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)

        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)
        step_size = step_size * decay
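
All five variants share the same learning-rate schedule: each epoch runs n_batches Adam iterations at a fixed step_size, which is then multiplied by decay. The resulting geometric schedule in isolation, using this example's defaults:

step_size, decay, n_epochs = 0.0005, 0.9, 5

for epoch in range(n_epochs):
    print("epoch %d: step_size = %.6f" % (epoch, step_size))
    step_size *= decay

# epoch 0: step_size = 0.000500
# ...
# epoch 4: step_size = 0.000328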