Example 1
def eval_one_epoch(sess, ops, test_writer, is_full_training):
    """ ops: dict mapping from string to tf ops """
    global EPOCH_CNT
    is_training = False
    test_idxs = np.arange(0, len(TEST_FILES))
    # Test on all data: last batch might be smaller than BATCH_SIZE
    loss_sum = acc = 0
    acc_seg = 0

    for fn in range(len(TEST_FILES)):
        #log_string('----' + str(fn) + '-----')
        current_file = os.path.join(H5_DIR, TEST_FILES[test_idxs[fn]])
        if RD:
            current_data, current_cluster, current_label = provider.load_h5_data_label_seg(
                current_file)
        else:
            current_data, current_label = provider.load_h5(current_file, 'seg')
        adds = provider.load_add(current_file, ['global'])
        if NUM_GLOB < adds['global'].shape[1]:
            log_string("Using less global variables than possible")
            adds['global'] = adds['global'][:, :NUM_GLOB]

        current_label = np.squeeze(current_label)

        file_size = current_data.shape[0]
        num_batches = file_size // BATCH_SIZE
        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE
            batch_data, batch_label, batch_global = get_batch(
                current_data, current_label, adds['global'], start_idx,
                end_idx)
            cur_batch_size = end_idx - start_idx

            feed_dict = {
                ops['pointclouds_pl']: batch_data,
                ops['is_training_pl']: is_training,
                ops['global_pl']: batch_global,
                ops['labels_pl']: batch_label,
                ops['alpha']: 10 * (EPOCH_CNT - MAX_PRETRAIN + 1),
            }
            if is_full_training:
                summary, step, loss_val, pred_val, max_pool, dist = sess.run(
                    [
                        ops['merged'],
                        ops['step'],
                        ops['kmeans_loss'],
                        ops['pred'],
                        ops['max_pool'],
                        ops['stack_dist'],

                        #ops['pi']
                    ],
                    feed_dict=feed_dict)

                cluster_assign = np.zeros((cur_batch_size), dtype=int)
                for i in range(cur_batch_size):
                    index_closest_cluster = np.argmin(dist[:, i])
                    cluster_assign[i] = index_closest_cluster
                if RD:
                    batch_cluster = current_cluster[start_idx:end_idx]

                    if batch_cluster.size == cluster_assign.size:
                        acc += cluster_acc(batch_cluster, cluster_assign)

            else:
                summary, step, loss_val, pred_val, max_pool = sess.run(
                    [
                        ops['merged'],
                        ops['step'],
                        ops['classify_loss'],
                        ops['pred'],
                        ops['max_pool'],
                    ],
                    feed_dict=feed_dict)

            test_writer.add_summary(summary, step)

            loss_sum += np.mean(loss_val)

    # Note: this normalization assumes every file contributes the same number of batches.
    total_loss = loss_sum / float(num_batches)
    log_string('mean loss: %f' % (total_loss))
    log_string('testing clustering accuracy: %f' % (acc / float(num_batches)))

    EPOCH_CNT += 1
    if FLAGS.min == 'acc':
        # Model selection on accuracy: return the clustering accuracy computed above.
        return acc / float(num_batches)
    else:
        return total_loss
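The cluster_acc helper used above is not shown in these examples. A minimal sketch of a plausible implementation, assuming it is the standard unsupervised clustering accuracy that matches predicted cluster indices to integer true labels with the Hungarian algorithm (scipy's linear_sum_assignment):

import numpy as np
from scipy.optimize import linear_sum_assignment

def cluster_acc(y_true, y_pred):
    """Best-match accuracy between predicted cluster indices and true labels."""
    y_true = np.asarray(y_true, dtype=np.int64)
    y_pred = np.asarray(y_pred, dtype=np.int64)
    n = max(y_pred.max(), y_true.max()) + 1
    # Contingency matrix: w[i, j] counts samples put in cluster i with true label j.
    w = np.zeros((n, n), dtype=np.int64)
    for p, t in zip(y_pred, y_true):
        w[p, t] += 1
    # The Hungarian algorithm finds the cluster-to-label matching that maximizes the overlap.
    row_ind, col_ind = linear_sum_assignment(-w)
    return w[row_ind, col_ind].sum() / float(y_pred.size)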
Example 2
def train_one_epoch(sess, ops, train_writer, is_full_training):
    """ ops: dict mapping from string to tf ops """
    is_training = True

    train_idxs = np.arange(0, len(TRAIN_FILES))

    acc = loss_sum = 0
    y_pool = []
    y_assign = []
    for fn in range(len(TRAIN_FILES)):
        #log_string('----' + str(fn) + '-----')
        current_file = os.path.join(H5_DIR, TRAIN_FILES[train_idxs[fn]])
        if RD:
            current_data, current_cluster, current_label = provider.load_h5_data_label_seg(
                current_file)
        else:
            current_data, current_label = provider.load_h5(current_file, 'seg')

        adds = provider.load_add(current_file, ['global'])
        if NUM_GLOB < adds['global'].shape[1]:
            log_string("Using less global variables than possible")
            adds['global'] = adds['global'][:, :NUM_GLOB]

        current_label = np.squeeze(current_label)

        file_size = current_data.shape[0]
        num_batches = file_size // BATCH_SIZE
        if FLAGS.nbatches > 0:
            num_batches = FLAGS.nbatches

        log_string(str(datetime.now()))

        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE
            batch_data, batch_label, batch_global = get_batch(
                current_data, current_label, adds['global'], start_idx,
                end_idx)
            cur_batch_size = end_idx - start_idx

            #print(batch_weight)
            feed_dict = {
                ops['pointclouds_pl']: batch_data,
                ops['labels_pl']: batch_label,
                ops['global_pl']: batch_global,
                ops['is_training_pl']: is_training,
                ops['alpha']: 10 * (EPOCH_CNT - MAX_PRETRAIN + 1),
            }
            if is_full_training:
                summary, step, _, loss_val, pred_val, max_pool, dist = sess.run(
                    [
                        ops['merged'], ops['step'], ops['train_op_full'],
                        ops['kmeans_loss'], ops['pred'], ops['max_pool'],
                        ops['stack_dist']
                    ],
                    feed_dict=feed_dict)

                cluster_assign = np.zeros((cur_batch_size), dtype=int)
                for i in range(cur_batch_size):
                    index_closest_cluster = np.argmin(dist[:, i])
                    cluster_assign[i] = index_closest_cluster
                if RD:
                    batch_cluster = current_cluster[start_idx:end_idx]
                    if batch_cluster.size == cluster_assign.size:
                        acc += cluster_acc(batch_cluster, cluster_assign)

            else:
                summary, step, _, loss_val, pred_val, max_pool = sess.run(
                    [
                        ops['merged'], ops['step'], ops['train_op'],
                        ops['classify_loss'], ops['pred'], ops['max_pool']
                    ],
                    feed_dict=feed_dict)

            loss_sum += np.mean(loss_val)
            if len(y_pool) == 0:
                y_pool = np.squeeze(max_pool)

            else:
                y_pool = np.concatenate((y_pool, np.squeeze(max_pool)), axis=0)

            train_writer.add_summary(summary, step)
    # As above, the normalization assumes each file yields the same number of batches.
    log_string('mean loss: %f' % (loss_sum / float(num_batches)))
    log_string('train clustering accuracy: %f' % (acc / float(num_batches)))
    return y_pool
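get_batch is also not defined in these snippets; in the three-array form used here it presumably just slices the batch window out of each per-event array. A minimal sketch under that assumption:

def get_batch(data, label, global_feats, start_idx, end_idx):
    # Slice the [start_idx, end_idx) window from each per-event array.
    batch_data = data[start_idx:end_idx, ...]
    batch_label = label[start_idx:end_idx]
    batch_global = global_feats[start_idx:end_idx, ...]
    return batch_data, batch_label, batch_global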
Example 3
def eval_one_epoch(sess, ops):
    is_training = False

    eval_idxs = np.arange(0, len(EVALUATE_FILES))
    y_val = []
    for fn in range(len(EVALUATE_FILES)):
        current_file = os.path.join(H5_DIR, EVALUATE_FILES[eval_idxs[fn]])
        current_data, current_label, current_cluster = provider.load_h5_data_label_seg(
            current_file)
        adds = provider.load_add(current_file, ['masses'])

        current_label = np.squeeze(current_label)

        file_size = current_data.shape[0]
        num_batches = file_size // BATCH_SIZE
        num_batches = 5  # hard-coded cap on the number of evaluation batches

        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE

            batch_data, batch_label = get_batch(current_data, current_label,
                                                start_idx, end_idx)
            cur_batch_size = end_idx - start_idx

            feed_dict = {
                ops['pointclouds_pl']: batch_data,
                ops['labels_pl']: batch_label,
                ops['alpha']: 1,  #No impact on evaluation,
                ops['is_training_pl']: is_training,
            }

            loss, dist, max_pool = sess.run(
                [ops['kmeans_loss'], ops['stack_dist'], ops['max_pool']],
                feed_dict=feed_dict)
            cluster_assign = np.zeros((cur_batch_size), dtype=int)
            for i in range(cur_batch_size):
                index_closest_cluster = np.argmin(dist[:, i])
                cluster_assign[i] = index_closest_cluster

            batch_cluster = np.array([
                np.where(r == 1)[0][0]
                for r in current_cluster[start_idx:end_idx]
            ])

            if len(y_val) == 0:
                y_val = batch_cluster
                y_assign = cluster_assign
                y_pool = np.squeeze(max_pool)
                y_mass = adds['masses'][start_idx:end_idx]
            else:
                y_val = np.concatenate((y_val, batch_cluster), axis=0)
                y_assign = np.concatenate((y_assign, cluster_assign), axis=0)
                y_pool = np.concatenate((y_pool, np.squeeze(max_pool)), axis=0)
                y_mass = np.concatenate(
                    (y_mass, adds['masses'][start_idx:end_idx]), axis=0)

    with h5py.File(os.path.join(H5_OUT, '{0}.h5'.format(FLAGS.name)),
                   "w") as fh5:
        dset = fh5.create_dataset("pid", data=y_val)  #Real jet categories
        dset = fh5.create_dataset("label", data=y_assign)  #Cluster labeling
        dset = fh5.create_dataset("max_pool", data=y_pool)
        dset = fh5.create_dataset("masses", data=y_mass)
Example 4
def eval_one_epoch(sess, ops):
    is_training = False
    eval_idxs = np.arange(0, len(EVALUATE_FILES))

    y_assign = []
    y_glob = []
    acc = 0

    for fn in range(len(EVALUATE_FILES)):
        current_file = os.path.join(H5_DIR, EVALUATE_FILES[eval_idxs[fn]])
        if RD:
            current_data, current_cluster, current_label = provider.load_h5_data_label_seg(
                current_file)
        else:
            current_data, current_label = provider.load_h5(current_file, 'seg')

        adds = provider.load_add(current_file, ['global', 'masses'])

        if NUM_GLOB < adds['global'].shape[1]:
            print("Using less global variables than possible")
            current_glob = adds['global'][:, :NUM_GLOB]
        else:
            current_glob = adds['global']

        current_label = np.squeeze(current_label)

        file_size = current_data.shape[0]
        num_batches = file_size // BATCH_SIZE
        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE

            batch_data, batch_label, batch_global = get_batch(
                current_data, current_label, current_glob, start_idx, end_idx)

            cur_batch_size = end_idx - start_idx

            feed_dict = {
                ops['pointclouds_pl']: batch_data,
                ops['global_pl']: batch_global,
                ops['labels_pl']: batch_label,
                ops['alpha']: 1,  #No impact during evaluation
                ops['is_training_pl']: is_training,
            }

            dist, mu, max_pool = sess.run(
                [ops['stack_dist'], ops['mu'], ops['max_pool']],
                feed_dict=feed_dict)

            cluster_assign = np.zeros((cur_batch_size), dtype=int)
            if RD:
                batch_cluster = current_cluster[start_idx:end_idx]

            for i in range(cur_batch_size):
                index_closest_cluster = np.argmin(dist[:, i])
                cluster_assign[i] = index_closest_cluster
            if RD:
                # Accumulate the clustering accuracy once per batch, after all assignments are made.
                acc += cluster_acc(batch_cluster, cluster_assign)

            if len(y_assign) == 0:
                if RD:
                    y_val = batch_cluster
                y_assign = cluster_assign
                y_pool = np.squeeze(max_pool)
            else:
                y_assign = np.concatenate((y_assign, cluster_assign), axis=0)
                y_pool = np.concatenate((y_pool, np.squeeze(max_pool)), axis=0)

                if RD:
                    y_val = np.concatenate((y_val, batch_cluster), axis=0)

        if len(y_glob) == 0:
            y_glob = adds['global'][:num_batches * BATCH_SIZE]
            y_mass = adds['masses'][:num_batches * BATCH_SIZE]
        else:
            y_glob = np.concatenate(
                (y_glob, adds['global'][:num_batches * BATCH_SIZE]), axis=0)
            y_mass = np.concatenate(
                (y_mass, adds['masses'][:num_batches * BATCH_SIZE]), axis=0)

    with h5py.File(os.path.join(H5_OUT, '{0}.h5'.format(FLAGS.name)),
                   "w") as fh5:
        if RD:
            dset = fh5.create_dataset("label", data=y_val)
        dset = fh5.create_dataset("pid", data=y_assign)
        dset = fh5.create_dataset("max_pool", data=y_pool)
        dset = fh5.create_dataset("global", data=y_glob)
        dset = fh5.create_dataset("masses", data=y_mass)
Example 5
def eval_one_epoch(sess, ops):
    is_training = False

    total_correct = total_sig = total_correct_ones = 0
    total_seen = total_seen_ones = loss_sum = 0
    eval_idxs = np.arange(0, len(EVALUATE_FILES))
    y_pred = []
    for fn in range(len(EVALUATE_FILES)):
        current_file = os.path.join(H5_DIR, EVALUATE_FILES[eval_idxs[fn]])
        current_data, current_label = provider.load_h5(current_file, 'seg')
        full_data = current_data
        if current_data.shape[2] > NFEATURES:
            print('puppi not used')
            current_data = current_data[:, :, :NFEATURES]
        if current_data.shape[1] > NUM_POINT:
            print('Using fewer points than available')
            current_data = current_data[:, :NUM_POINT]
            current_label = current_label[:, :NUM_POINT]

        add_list = [
            'PFNoPU',
            'puppiPU',
            'chs',
            'NPU',
            'CHS_MET',
            'PUPPI_MET',
            #'puppiNoPU',
        ]
        adds = provider.load_add(current_file, add_list)
        if not FLAGS.is_data:
            current_truth = adds['PFNoPU']
            current_truth = preprocessing(current_data, current_truth)
        else:
            add_list.append('nLeptons')
            current_truth = np.zeros(current_data.shape)

        current_label = np.squeeze(current_label)

        file_size = current_data.shape[0]
        num_batches = file_size // BATCH_SIZE
        #num_batches = 1
        # if FLAGS.is_data:
        #     num_batches = 600

        for batch_idx in range(num_batches):
            scores = np.zeros(NUM_POINT)
            true = np.zeros(NUM_POINT)
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE

            batch_data, batch_label, batch_truth = get_batch(
                current_data, current_label, current_truth, start_idx, end_idx)

            cur_batch_size = end_idx - start_idx

            feed_dict = {
                ops['pointclouds_pl']: batch_data,
                ops['truth_pl']: batch_truth,
                ops['labels_pl']: batch_label,
                ops['is_training_pl']: is_training,
            }
            #,beforemax
            loss, pred = sess.run([ops['loss'], ops['pred']],
                                  feed_dict=feed_dict)
            pred_val = np.argmax(pred, 2)

            correct_ones = pred_val * batch_label
            total_sig += np.sum(batch_label == 2)
            total_correct_ones += np.sum(correct_ones == 4)

            loss_sum += np.mean(loss)
            if len(y_pred) == 0:
                y_pred = pred[:, :, 2]
                y_data = full_data[start_idx:end_idx]
                y_lab = batch_label
                y_add = {}
                for add in adds:
                    y_add[add] = adds[add][start_idx:end_idx]
            else:
                y_pred = np.concatenate((y_pred, pred[:, :, 2]), axis=0)
                y_data = np.concatenate((y_data, full_data[start_idx:end_idx]),
                                        axis=0)
                y_lab = np.concatenate((y_lab, batch_label), axis=0)
                for add in adds:
                    y_add[add] = np.concatenate(
                        (y_add[add], adds[add][start_idx:end_idx]), axis=0)

    if not FLAGS.is_data:
        print('The signal accuracy is {0}'.format(total_correct_ones /
                                                  float(total_sig)))
        flat_pred = y_pred.flatten()
        flat_lab = y_lab.flatten()
        flat_lab = flat_lab == 2
        # Unpack the roc_curve outputs: false-positive rate, true-positive rate, thresholds.
        fpr, tpr, thresholds = metrics.roc_curve(flat_lab, flat_pred)
        threshs = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.90, 0.95]
        with open(os.path.join(MODEL_PATH, 'cut_eff.txt'), 'w') as f:
            for thresh in threshs:
                idx = np.argmax(tpr > thresh)
                cut = thresholds[idx]
                f.write('eff: {}, fpr: {}, cut: {} \n'.format(
                    tpr[idx], fpr[idx], cut))
    with h5py.File(os.path.join(H5_OUT, '{0}.h5'.format(FLAGS.name)),
                   "w") as fh5:
        dset = fh5.create_dataset("DNN", data=y_pred)
        dset = fh5.create_dataset("data", data=y_data)
        dset = fh5.create_dataset("pid", data=y_lab)
        for add in adds:
            dset = fh5.create_dataset(add, data=y_add[add])
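The per-point signal accuracy in the last example is computed as np.sum(pred_val * batch_label == 4), which equals the number of points whose predicted class and true label are both 2 when the classes are 0, 1, 2. A more explicit, self-contained form of the same count with toy arrays:

import numpy as np

pred_val = np.array([[2, 0, 2], [1, 2, 0]])     # toy argmax predictions
batch_label = np.array([[2, 0, 1], [2, 2, 0]])  # toy true labels

total_sig = np.sum(batch_label == 2)
total_correct_ones = np.sum((pred_val == 2) & (batch_label == 2))
print('signal accuracy: %f' % (total_correct_ones / float(total_sig)))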