Example #1
def MakeReachabilityDataset(
        num_data_points: int) -> reachability_pb2.ReachabilityDataset:
    """Generate a training data proto of unique CFGs."""
    seqs = set()
    data = reachability_pb2.ReachabilityDataset()
    seed = 0
    while len(data.entry) < num_data_points:
        graph = control_flow_graph.ControlFlowGraph.GenerateRandom(
            FLAGS.reachability_num_nodes,
            seed=seed,
            connections_scaling_param=FLAGS.reachability_scaling_param)
        seq = graph.ToSuccessorsListString()
        if seq not in seqs:
            seqs.add(seq)
            proto = data.entry.add()
            graph.SetProto(proto)
            # Record the seed that generated this graph (the pre-increment
            # value) so the graph can be regenerated later.
            proto.seed = seed
        seed += 1
    return data
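
# A hypothetical helper (not in the original code) showing how an entry can
# be regenerated from its recorded seed, assuming the control_flow_graph
# module and the flags used above:
def RegenerateGraph(entry):
    """Rebuild the CFG for a dataset entry from its recorded seed."""
    return control_flow_graph.ControlFlowGraph.GenerateRandom(
        FLAGS.reachability_num_nodes,
        seed=entry.seed,
        connections_scaling_param=FLAGS.reachability_scaling_param)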
Example #2
def main(argv):
    """Main entry point."""
    if len(argv) > 1:
        raise app.UsageError("Unknown arguments: '{}'.".format(' '.join(
            argv[1:])))

    model_dir = pathlib.Path(FLAGS.reachability_model_dir)
    if not model_dir.is_dir():
        raise app.UsageError('--reachability_model_dir is not a directory.')

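    # Summarize training telemetry: total wall time and per-epoch losses.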
    logger = telemetry.TrainingLogger(logdir=model_dir / 'logs')
    telemetry_ = logger.EpochTelemetry()
    num_epochs = len(telemetry_)
    training_time_ms = sum(t.epoch_wall_time_ms for t in telemetry_)
    training_time_natural = humanize.naturaldelta(training_time_ms / 1000)
    time_per_epoch_natural = humanize.naturaldelta(training_time_ms /
                                                   num_epochs / 1000)
    losses = [round(t.loss, 2) for t in telemetry_]

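    # Restore the model architecture from JSON (weights are not loaded here).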
    with open(model_dir / 'model.json') as f:
        model: keras.models.Model = keras.models.model_from_json(f.read())

    model.compile(loss='binary_crossentropy',
                  metrics=['accuracy'],
                  optimizer='adam')
    model.summary()
    print(f'Total training time: {training_time_natural} '
          f'({time_per_epoch_natural} per epoch).')
    print(f'Number of epochs: {num_epochs}.')
    print(f'Training losses: {losses}.')

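    # Load the train/test splits and concatenate them for whole-dataset stats.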
    training_data = pbutil.FromFile(model_dir / 'training_data.pbtxt',
                                    reachability_pb2.ReachabilityDataset())
    testing_data = pbutil.FromFile(model_dir / 'testing_data.pbtxt',
                                   reachability_pb2.ReachabilityDataset())
    data = reachability_pb2.ReachabilityDataset()
    data.entry.extend(training_data.entry)
    data.entry.extend(testing_data.entry)

    num_nodes = len(training_data.entry[0].graph.node)
    num_nodes_natural = humanize.intcomma(num_nodes)
    num_training_graphs_natural = humanize.intcomma(len(training_data.entry))
    num_testing_graphs_natural = humanize.intcomma(len(testing_data.entry))
    print(f'Training data: {num_training_graphs_natural} graphs of '
          f'{num_nodes_natural} nodes each.')
    print(f'Testing data: {num_testing_graphs_natural} graphs of '
          f'{num_nodes_natural} nodes each.')

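    # Each node's child list holds its out-edges, so summing child counts
    # gives the number of connections per graph.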
    num_connections_training = sum(
        sum(len(n.child) for n in entry.graph.node)
        for entry in training_data.entry)
    num_connections_testing = sum(
        sum(len(n.child) for n in entry.graph.node)
        for entry in testing_data.entry)

    print(
        'Average graph connections: {:.1f} training ({:.1f} per node), '
        '{:.1f} testing ({:.1f} per node).'.format(
            num_connections_training / len(training_data.entry),
            num_connections_training / (len(training_data.entry) * num_nodes),
            num_connections_testing / len(testing_data.entry),
            num_connections_testing / (len(testing_data.entry) * num_nodes)))

    sequence_length = train_model.GetSequenceLength(
        len(training_data.entry[0].graph.node))
    print('Sequence length:', sequence_length)

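    # Restore the atomizer that encoded the training sequences.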
    with open(model_dir / 'atomizer.pkl', 'rb') as f:
        atomizer: atomizers.AtomizerBase = pickle.load(f)

    print('Vocabulary size:', atomizer.vocab_size)

    seqs = [
        train_model.ControlFlowGraphToSequence(entry.graph)
        for entry in data.entry
    ]
    num_uniq_seqs = len(set(seqs))
    print('Unique sequences: {} of {} ({:.2f}%)'.format(
        humanize.intcomma(num_uniq_seqs), humanize.intcomma(len(seqs)),
        (num_uniq_seqs / len(seqs)) * 100))
    num_uniq_labels = len(
        {''.join(str(x) for x in e.reachable) for e in data.entry})
    print('Unique labels: {} of {} ({:.2f}%)'.format(
        humanize.intcomma(num_uniq_labels), humanize.intcomma(len(seqs)),
        (num_uniq_labels / len(seqs)) * 100))

    test_x, test_y = train_model.ProtosToModelData(testing_data,
                                                   sequence_length, atomizer)

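    # Zero-R baseline: always predict the majority class. Compute the overall
    # positive-label rate, then take the larger of the two class shares.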
    zero_r_acc = sum(sum(x)
                     for x in test_y) / len(testing_data.entry) / num_nodes
    zero_r_acc = max(zero_r_acc[0], 1 - zero_r_acc[0])
    print('Zero-R accuracy: {:.2%}'.format(zero_r_acc))

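    # For a multi-output model, evaluate() returns the overall loss followed
    # by each output's loss, then each output's accuracy.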
    row = model.evaluate(test_x,
                         test_y,
                         batch_size=FLAGS.batch_size,
                         verbose=0)
    overall_loss, losses, accuracies = (row[0], row[1:1 + num_nodes],
                                        row[num_nodes + 1:])
    print('Accuracy: {:.2%}'.format(sum(accuracies) / len(accuracies)))
    print('Accuracy (excluding first class): {:.2%}'.format(
        sum(accuracies[1:]) / len(accuracies[1:])))
    print('done.')
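
# A minimal standalone sketch of the Zero-R computation above, assuming
# test_y is a list of per-node label arrays, each of shape (num_examples, 1):
import numpy as np

def ZeroRAccuracy(test_y):
    """Accuracy of always predicting the majority reachability label."""
    labels = np.concatenate(test_y)  # Flatten all (graph, node) labels.
    positive_rate = float(labels.mean())
    return max(positive_rate, 1 - positive_rate)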
Example #3
def main(argv):
    """Main entry point."""
    if len(argv) > 1:
        raise app.UsageError("Unknown arguments: '{}'.".format(' '.join(
            argv[1:])))

    model_dir = pathlib.Path(FLAGS.reachability_model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    (model_dir / 'logs').mkdir(exist_ok=True)
    (model_dir / 'checkpoints').mkdir(exist_ok=True)

    logging.info('Generating graphs dataset ...')
    data = MakeReachabilityDataset(FLAGS.reachability_num_training_graphs +
                                   FLAGS.reachability_num_testing_graphs)
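    # Split the generated graphs into training and testing sets and persist
    # each split as a text proto.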
    training_data = reachability_pb2.ReachabilityDataset()
    training_data.entry.extend(
        data.entry[:FLAGS.reachability_num_training_graphs])
    pbutil.ToFile(training_data, model_dir / 'training_data.pbtxt')
    testing_data = reachability_pb2.ReachabilityDataset()
    testing_data.entry.extend(
        data.entry[FLAGS.reachability_num_training_graphs:])
    pbutil.ToFile(testing_data, model_dir / 'testing_data.pbtxt')

    logging.info('Number of training examples: %s.',
                 humanize.intcomma(len(training_data.entry)))
    logging.info('Number of testing examples: %s.',
                 humanize.intcomma(len(testing_data.entry)))

    n = FLAGS.reachability_num_nodes
    sequence_length = GetSequenceLength(n)
    logging.info('Using sequence length %s.',
                 humanize.intcomma(sequence_length))
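    # Serialize every graph to its successor-list string and derive a
    # character-level vocabulary from the concatenated text.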
    seqs = [ControlFlowGraphToSequence(entry.graph) for entry in data.entry]
    text = '\n'.join(seqs)
    logging.info('Deriving atomizer from %s chars.',
                 humanize.intcomma(len(text)))
    atomizer = atomizers.AsciiCharacterAtomizer.FromText(text)
    logging.info('Vocabulary size: %s.',
                 humanize.intcomma(len(atomizer.vocab)))
    with open(model_dir / 'atomizer.pkl', 'wb') as f:
        pickle.dump(atomizer, f)
    logging.info('Pickled atomizer to %s.', model_dir / 'atomizer.pkl')

    x, y = ProtosToModelData(training_data, sequence_length, atomizer)
    logging.info('Training data: x %s, y[%s] %s', x.shape, len(y), y[0].shape)

    test_x, test_y = ProtosToModelData(testing_data, sequence_length, atomizer)
    logging.info('Testing data: x %s, y[%s] %s', test_x.shape, len(test_y),
                 test_y[0].shape)

    num_uniq_seqs = len(set(seqs))
    logging.info('Unique sequences: %s of %s (%.2f %%)',
                 humanize.intcomma(num_uniq_seqs),
                 humanize.intcomma(len(seqs)),
                 (num_uniq_seqs / len(seqs)) * 100)
    num_uniq_labels = len(
        {''.join(str(x) for x in e.reachable) for e in data.entry})
    logging.info('Unique labels: %s of %s (%.2f %%)',
                 humanize.intcomma(num_uniq_labels),
                 humanize.intcomma(len(seqs)),
                 (num_uniq_labels / len(seqs)) * 100)

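    # Seed the RNGs so weight initialization and shuffling are reproducible.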
    np.random.seed(FLAGS.reachability_model_seed)
    random.seed(FLAGS.reachability_model_seed)
    logging.info('Building Keras model ...')
    model = BuildKerasModel(sequence_length=sequence_length,
                            num_classes=n,
                            lstm_size=FLAGS.lstm_size,
                            num_layers=FLAGS.num_layers,
                            dnn_size=FLAGS.dnn_size,
                            atomizer=atomizer)

    model_json = model.to_json()
    with open(model_dir / 'model.json', 'w') as f:
        f.write(model_json)
    logging.info('Wrote model to %s', model_dir / 'model.json')

    logging.info('Training model ...')

    def OnEpochEnd(epoch, logs):
        """End-of-epoch model evaluate."""
        del logs
        logging.info('Evaluating model at epoch %d', epoch)
        # row is [overall loss, per-output losses..., per-output accuracies...].
        row = model.evaluate(test_x,
                             test_y,
                             batch_size=FLAGS.batch_size,
                             verbose=0)
        overall_loss, losses, accuracies = row[0], row[1:1 + n], row[n + 1:]
        logging.info('Accuracy (excluding first class): %.2f %%',
                     (sum(accuracies[1:]) / len(accuracies[1:])) * 100)

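    # Checkpoint weights after every epoch, evaluate on the test set via the
    # lambda callback, and stream telemetry to the log directory.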
    logger = telemetry.TrainingLogger(logdir=model_dir / 'logs')
    model.fit(
        x,
        y,
        epochs=FLAGS.num_epochs,
        batch_size=FLAGS.batch_size,
        verbose=True,
        shuffle=True,
        callbacks=[
            keras.callbacks.ModelCheckpoint(str(model_dir / 'checkpoints') +
                                            '/weights_{epoch:03d}.hdf5',
                                            verbose=1,
                                            mode="min",
                                            save_best_only=False),
            keras.callbacks.LambdaCallback(on_epoch_end=OnEpochEnd),
            logger.KerasCallback(keras),
        ])

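    # Spot-check predictions against ground truth on the first five training
    # examples.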
    for i in range(5):
        outs = FlattenModelOutputs(model.predict(np.array([x[i]])))
        logging.info('outs:    %s', outs)
        logging.info('clamped: %s', np.rint(outs).astype(np.int32))
        logging.info('true:    %s', FlattenModelData(y, i))
        logging.info('')
    logging.info('done')
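
# For reference, a plausible (hypothetical; not shown in the original)
# implementation of the FlattenModelOutputs helper used above: for a
# single-example batch, each of the model's n outputs is a (1, 1) array.
import numpy as np

def FlattenModelOutputs(outs):
    """Concatenate per-node model outputs into one flat vector."""
    return np.array([x[0][0] for x in outs])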