Example 1
def make_feed_dict(self, seqIn, patternIn, target=None):
    # Convert each batch of data dicts to a GraphsTuple and merge the
    # per-placeholder feed dicts into a single dictionary.
    seqIn = utils_np.data_dicts_to_graphs_tuple(seqIn)
    patternIn = utils_np.data_dicts_to_graphs_tuple(patternIn)
    feed_dict = utils_tf.get_feed_dict(self.seq_input_ph, seqIn)
    feed_dict.update(utils_tf.get_feed_dict(self.pattern_input_ph, patternIn))
    if target is not None:
        target = utils_np.data_dicts_to_graphs_tuple(target)
        feed_dict.update(utils_tf.get_feed_dict(self.target_ph, target))
    return feed_dict
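All of these examples share the same core pattern: build GraphsTuple placeholders once, convert each batch of data dicts (or networkx graphs) to a numpy GraphsTuple, and let utils_tf.get_feed_dict map one onto the other. A minimal self-contained sketch of that pattern, assuming the TF1-era graph_nets API and a toy data dict:

import numpy as np
import tensorflow as tf
from graph_nets import utils_np, utils_tf

# One toy graph: 3 nodes with 2 features, 2 edges with 1 feature.
data_dicts = [{
    "globals": np.zeros((4,), dtype=np.float32),
    "nodes": np.zeros((3, 2), dtype=np.float32),
    "edges": np.zeros((2, 1), dtype=np.float32),
    "senders": np.array([0, 1]),
    "receivers": np.array([1, 2]),
}]

tf.reset_default_graph()
# force_dynamic_num_graphs=True lets later batches differ in size.
placeholders = utils_tf.placeholders_from_data_dicts(
    data_dicts, force_dynamic_num_graphs=True)
graphs_tuple = utils_np.data_dicts_to_graphs_tuple(data_dicts)
feed_dict = utils_tf.get_feed_dict(placeholders, graphs_tuple)

with tf.Session() as sess:
    output = sess.run(placeholders, feed_dict=feed_dict)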
Example 2
 def test_get_feed_dict_raises(self, none_fields):
   networkxs = [_generate_graph(batch_index) for batch_index in range(16)]
   placeholders = utils_tf.placeholders_from_networkxs(networkxs)
   feed_values = utils_np.networkxs_to_graphs_tuple(networkxs)
   with self.assertRaisesRegexp(ValueError, ""):
     utils_tf.get_feed_dict(
         placeholders.map(lambda _: None, none_fields), feed_values)
   with self.assertRaisesRegexp(ValueError, ""):
     utils_tf.get_feed_dict(placeholders,
                            feed_values.map(lambda _: None, none_fields))
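The test relies on GraphsTuple.map to blank out individual fields; get_feed_dict raises ValueError whenever the placeholders and the fed values disagree about which fields are None. A sketch of that failure mode, reusing the names from the test above (assumed in scope):

# `placeholders` and `feed_values` as in the test above.
broken = placeholders.map(lambda _: None, ["globals"])
try:
    utils_tf.get_feed_dict(broken, feed_values)  # placeholder field is None
except ValueError:
    print("None-field mismatch between placeholders and values")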
Example 3
 def test_feed_data(self):
   networkx = [_generate_graph(batch_index) for batch_index in range(16)]
   placeholders = utils_tf.placeholders_from_networkxs(
       networkx, force_dynamic_num_graphs=True)
   # Does not need to be the same size
   networkxs = [_generate_graph(batch_index) for batch_index in range(2)]
   with self.test_session() as sess:
     output = sess.run(
         placeholders,
         utils_tf.get_feed_dict(placeholders,
                                utils_np.networkxs_to_graphs_tuple(networkxs)))
   self.assertAllEqual(
       np.array([[0, 0], [1, 0], [2, 0], [3, 0], [0, 1], [1, 1], [2, 1],
                 [3, 1]]), output.nodes)
   self.assertEqual(np.float32, output.nodes.dtype)
   self.assertAllEqual(np.array([[0], [1]]), output.globals)
   self.assertEqual(np.float32, output.globals.dtype)
   sorted_edges_content = sorted(
       [(x, y, z)
        for x, y, z in zip(output.receivers, output.senders, output.edges)])
   self.assertAllEqual([0, 0, 1, 4, 4, 5],
                       [x[0] for x in sorted_edges_content])
   self.assertAllEqual([1, 2, 3, 5, 6, 7],
                       [x[1] for x in sorted_edges_content])
   self.assertEqual(np.float64, output.edges.dtype)
   self.assertAllEqual(
       np.array([[0, 1, 0], [1, 2, 0], [2, 3, 0], [0, 1, 1], [1, 2, 1],
                 [2, 3, 1]]), [x[2] for x in sorted_edges_content])
Example 4

def make_feed_dict(val):
    # `placeholders` is captured from the enclosing scope; GraphsTuple is
    # imported from graph_nets.graphs.
    if isinstance(val, GraphsTuple):
        graphs_tuple = val
    else:
        # Merge a list of single-graph GraphsTuples into one batched tuple.
        dicts = []
        for graphs_tuple in val:
            dicts.append(
                utils_np.graphs_tuple_to_data_dicts(graphs_tuple)[0])
        graphs_tuple = utils_np.data_dicts_to_graphs_tuple(dicts)
    return utils_tf.get_feed_dict(placeholders, graphs_tuple)
Example 5

def create_feed_dict(input_ph,
                     target_ph,
                     input_graphs,
                     target_graphs,
                     batch_processing=True):
    # When not batch processing, wrap the single graphs in lists so the
    # conversion below always receives an iterable of graphs.
    if not batch_processing:
        input_graphs = [input_graphs]
        target_graphs = [target_graphs]

    input_tuple = utils_np.networkxs_to_graphs_tuple(input_graphs)
    target_tuple = utils_np.networkxs_to_graphs_tuple(target_graphs)

    input_dct = utils_tf.get_feed_dict(input_ph, input_tuple)
    target_dct = utils_tf.get_feed_dict(target_ph, target_tuple)

    input_ph_runnable, target_ph_runnable = make_all_runnable_in_session(
        input_ph, target_ph)

    return input_ph_runnable, target_ph_runnable, {**input_dct, **target_dct}
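create_feed_dict calls a make_all_runnable_in_session helper that is not shown here; in the graph_nets demo notebooks it is typically a thin wrapper over utils_tf.make_runnable_in_session, roughly:

def make_all_runnable_in_session(*args):
    """Lets GraphsTuples containing `None` fields be fetched in a tf.Session."""
    return [utils_tf.make_runnable_in_session(a) for a in args]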
Example 6
 def test_feed_data_no_nodes(self):
   networkx = [
       _generate_graph(batch_index, n_nodes=0, add_edges=False)
       for batch_index in range(16)
   ]
   placeholders = utils_tf.placeholders_from_networkxs(
       networkx, force_dynamic_num_graphs=True)
   # Does not need to be the same size
   networkxs = [
       _generate_graph(batch_index, n_nodes=0, add_edges=False)
       for batch_index in range(2)
   ]
   self.assertEqual(None, placeholders.nodes)
   self.assertEqual(None, placeholders.edges)
   with self.test_session() as sess:
     output = sess.run(
         placeholders.replace(nodes=tf.no_op(), edges=tf.no_op()),
         utils_tf.get_feed_dict(placeholders,
                                utils_np.networkxs_to_graphs_tuple(networkxs)))
   self.assertAllEqual(np.array([[0], [1]]), output.globals)
   self.assertEqual(np.float32, output.globals.dtype)
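Instead of replacing the None fields by hand with tf.no_op(), the library helper utils_tf.make_runnable_in_session performs the same substitution; a sketch of the equivalent call, reusing the names from the test above:

# Equivalent to the manual replace(nodes=tf.no_op(), edges=tf.no_op()) above.
runnable_placeholders = utils_tf.make_runnable_in_session(placeholders)
with tf.Session() as sess:
    output = sess.run(runnable_placeholders,
                      utils_tf.get_feed_dict(
                          placeholders,
                          utils_np.networkxs_to_graphs_tuple(networkxs)))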
Example 7
def main():

    # A bunch of configuration stuff to clean up...
    parser = argparse.ArgumentParser(
        description='Train nx-graph with configurations')
    add_arg = parser.add_argument
    add_arg('name', nargs='?', default='unnamed')
    args = parser.parse_args()

    results_dir = 'results/{}'.format(args.name)
    os.makedirs(results_dir, exist_ok=True)
    config = load_config('configs/nxgraph_default.yaml')
    base_dir = config['data']['input_dir']
    config_tr = config['train']
    log_every_seconds = config_tr['time_lapse']
    batch_size = config_tr['batch_size']  # need optimization
    num_training_iterations = config_tr['iterations']
    iter_per_job = config_tr['iter_per_job']
    num_processing_steps_tr = config_tr['n_iters']  ## level of message-passing
    prod_name = config['prod_name']
    learning_rate = config_tr['learning_rate']
    output_dir = os.path.join(config['output_dir'], prod_name)

    # Start to build tensorflow sessions
    tf.reset_default_graph()

    # Creates a placeholder for training examples. The placeholders define a
    # slot for training examples given in feed dict to be assigned. We create
    # graphs.GraphsTuple placeholders using the graph_nets utility functions.
    # They are automatically generated from the first graph in the first batch.
    # By assigning force_dynamic_num_graphs=True, we ensure that the
    # placeholders accept graphs of any size.
    _, _, input_graphs, truth_values = batch_iterator(base_dir,
                                                      batch_size).__next__()
    input_ph = utils_tf.placeholders_from_data_dicts(
        input_graphs[0:1], force_dynamic_num_graphs=True)
    truth_ph = tf.placeholder(tf.float64, shape=[None])

    # Here, we define our computational graphs.
    # - First, we compute the model output using the graph_nets library.
    # - Then, we compute our loss function only on edge features, using a
    #   log_loss between the truth values and the model output.
    #   'num_processing_steps_tr' is the number of message-passing steps; the
    #   model returns one output per step, and only the last one is used here.
    # - Finally, we minimize the training loss with the Adam optimizer.
    model_outputs = SegmentClassifier()(input_ph, num_processing_steps_tr)
    edge_losses = tf.losses.log_loss(truth_ph,
                                     tf.transpose(model_outputs[-1].edges)[0])
    training_loss = edge_losses
    training_optimizer = tf.train.AdamOptimizer(learning_rate).minimize(
        training_loss)

    # Allows a graph containing `None` fields to be run in a Tensorflow
    # session. This is currently not needed since we have data for all
    # elements in the graph, including useless data for the global variable.
    input_ph = utils_tf.make_runnable_in_session(input_ph)

    # According to the documentation, a tf.Session represents a connection
    # between the client program and the C++ runtime. See the following link
    # for more information: https://www.tensorflow.org/guide/graphs
    sess = tf.Session()

    # Create session saver
    saver = tf.train.Saver()

    # Our computation graph uses global variables, so we are required to
    # initialize them for the first pass. See the following link for more
    # information on Tensorflow variables
    # https://www.tensorflow.org/guide/variables
    sess.run(tf.global_variables_initializer())

    output_index = 0
    last_output = time.time()

    # We will iterate through our dataset many times to train.
    for iteration in range(0, num_training_iterations):

        # Iterate through all of the batches and retrieve batch data accordingly.
        for batch_index, batch_count, input_batch, truth_batch in batch_iterator(
                base_dir, batch_size):

            # Turn our data dictionary into a proper graphs.GraphsTuple
            # object for use with graph_nets library.
            input_graphs = utils_np.data_dicts_to_graphs_tuple(input_batch)

            # The utility function make_runnable_in_session fixes problems
            # resulting from `None` fields in the graph.
            input_graphs = utils_tf.make_runnable_in_session(input_graphs)

            # Create a feed dictionary that maps each placeholder field to the
            # corresponding value in the input graphs.
            feed_dict = utils_tf.get_feed_dict(input_ph, input_graphs)

            # We must pass both the input and target graphs into our computation
            # graph, so we update our feed dictionary with new properties using
            # the same method described above.

            feed_dict.update({truth_ph: truth_batch})

            # Run our computation graph using the feed dictionary created
            # above. We fetch three values: the optimizer step, the training
            # loss, and the per-step model outputs.
            train_values = sess.run(
                {
                    "step": training_optimizer,
                    "loss": training_loss,
                    "outputs": model_outputs
                },
                feed_dict=feed_dict)

            # Compute the time lapse from last save-evaluate-visualize action
            current_time = time.time()
            output_time_lapse = current_time - last_output

            if output_time_lapse > log_every_seconds:
                last_output = current_time

                # Create a feed dict with 10 test events. These events were
                # not used during training, so they give an unbiased check.

                _, _, input_batch, truth_batch = batch_iterator(
                    base_dir, 10, test=True).__next__()

                input_graphs = utils_np.data_dicts_to_graphs_tuple(input_batch)
                input_graphs = utils_tf.make_runnable_in_session(input_graphs)
                feed_dict = utils_tf.get_feed_dict(input_ph, input_graphs)
                feed_dict.update({truth_ph: truth_batch})

                train_values = sess.run(
                    {
                        "loss": training_loss,
                        "target": truth_ph,
                        "outputs": model_outputs
                    },
                    feed_dict=feed_dict)

                cutoff_list = []
                purity_list = []
                efficiency_list = []

                # Compute purity and efficiency for every cutoff from 0 to 1 in steps of 0.01
                for filter_cutoff in np.linspace(0, 1, 100):
                    result = np.transpose(
                        np.where(
                            train_values['outputs'][-1].edges > filter_cutoff,
                            1, 0))[0]
                    correct = np.sum(
                        np.where(
                            np.logical_and(result == truth_batch,
                                           result == np.ones(result.shape)), 1,
                            0))
                    purity = correct / np.sum(result) if np.sum(
                        result) != 0 else 1.0
                    purity_list.append(purity)
                    efficiency = correct / np.sum(truth_batch)
                    efficiency_list.append(efficiency)
                    cutoff_list.append(filter_cutoff)

                # Create purity-efficiency plot and save to folder
                plt.figure()
                plt.plot(purity_list, efficiency_list)
                plt.axis([0, 1, 0, 1])
                plt.xlabel('Purity')
                plt.ylabel('Efficiency')
                os.makedirs(os.path.join(results_dir, 'figures'),
                            exist_ok=True)
                plt.savefig(
                    os.path.join(
                        results_dir,
                        'figures/purity_vs_efficiency{:02d}.png'.format(
                            output_index)))
                plt.close()

                # Write the purity-efficiency values to a CSV file.
                csv_path = os.path.join(
                    results_dir,
                    'figures/purity_vs_efficiency{:02d}.csv'.format(
                        output_index))
                with open(csv_path, 'w') as csvfile:
                    csv_writer = csv.writer(csvfile)
                    csv_writer.writerow(['cutoff', 'purity', 'efficiency'])
                    for (cutoff, purity,
                         efficiency) in zip(cutoff_list, purity_list,
                                            efficiency_list):
                        csv_writer.writerow([cutoff, purity, efficiency])

                os.makedirs(os.path.join(results_dir, 'models'), exist_ok=True)
                saver.save(
                    sess,
                    os.path.join(results_dir,
                                 'models/model{}.ckpt'.format(output_index)))

                visualize_hitgraph(
                    os.path.join(results_dir, 'images'), output_index, {
                        'nodes': input_batch[0]['nodes'],
                        'edges': truth_batch,
                        'senders': input_batch[0]['senders'],
                        'receivers': input_batch[0]['receivers']
                    })

                print('\repoch: {} progress: {:.4f} loss: {:.4f}'.format(
                    iteration, batch_index / batch_count,
                    train_values['loss']))

                output_index += 1

    sess.close()
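For reference, the cutoff scan above computes the standard purity (precision) and efficiency (recall) of the edge classifier. Restated compactly in numpy, with hypothetical edge_scores and truth arrays standing in for the values pulled out of train_values:

import numpy as np

def purity_efficiency(edge_scores, truth, cutoff):
    # purity = TP / predicted positives, efficiency = TP / true positives.
    pred = (edge_scores > cutoff).astype(int)
    true_pos = np.sum((pred == 1) & (truth == 1))
    purity = true_pos / pred.sum() if pred.sum() else 1.0
    efficiency = true_pos / truth.sum()
    return purity, efficiency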
Example 8
def run_batches(sess, batch_generator, input_p_ph, input_l_ph, target_ph,
                input_p_op, input_l_op, target_op, output_ops, step_op,
                loss_op):
    # Prefer the default session if one is active; otherwise use the one
    # passed in.
    if tf.get_default_session() is not None:
        session = tf.get_default_session()
    else:
        session = sess
    # Init counters / stats.
    start_time = time.time()
    solved, count, loss = (0.0, 0.0, 0.0)
    # Process data in batches.
    if DEBUG:
        sys.stdout.write("    Batch x%s:"%str(PRINT_EVERY))
        sys.stdout.flush()
    for b, batch in enumerate(batch_generator()):
        items, input_dicts_p, input_dicts_l, target_dicts = batch
        # Progress print every PRINT_EVERY batches.
        if b % PRINT_EVERY == 0 and b > 0:
            if INFERENCE_ONLY:
                examples_seen = b * BATCH_SIZE_TEST
            else:
                examples_seen = b * BATCH_SIZE

            elapsed_time = time.time() - start_time
            # plpps: protein-ligand pairs per second.
            plpps = float(examples_seen) / elapsed_time
            
            if DISTRIBUTED:
                import horovod.tensorflow as hvd
                # Note: this builds a new allreduce op on every call; see the
                # sketch after this function for a build-once alternative.
                avg_plpps = tf.cast(plpps, tf.float32)
                avg_plpps_op = hvd.allreduce(avg_plpps)
                plpps = sess.run(avg_plpps_op)
                gplpps = plpps * hvd.size()
                if DEBUG:
                    print("Average Protein-Ligand Pairs Per Second =", plpps,
                          ", Global =", gplpps)
                    sys.stdout.flush()
            else:
                if DEBUG:
                    print("Protein-Ligand Pairs Per Second =", plpps)
                    sys.stdout.flush()
        # Convert data graph dicts to graphs tuple objects
        input_graphs_p = utils_np.data_dicts_to_graphs_tuple(input_dicts_p)
        input_graphs_l = utils_np.data_dicts_to_graphs_tuple(input_dicts_l)
        target_graphs = utils_np.data_dicts_to_graphs_tuple(target_dicts)


        # Build a feed dict for the data.
        input_p_feed_dict = utils_tf.get_feed_dict(input_p_ph, input_graphs_p)
        input_l_feed_dict = utils_tf.get_feed_dict(input_l_ph, input_graphs_l)
        target_feed_dict = utils_tf.get_feed_dict(target_ph, target_graphs)
        feed_dict = dict()
        feed_dict.update(input_p_feed_dict)
        feed_dict.update(input_l_feed_dict)
        feed_dict.update(target_feed_dict)
        # Run it.
        ops = { "input_p": input_p_op, "input_l": input_l_op, "target": target_op, "loss": loss_op, "outputs": output_ops }
        if step_op != None:
            ops["step"] = step_op
        run_values = session.run(ops, feed_dict=feed_dict)
        # Accumulate stats.
        if MODE == 'classification':
            s, c = compute_accuracy_class(run_values["target"],
                                          run_values["outputs"])
        elif MODE == 'regression':
            s, c = compute_accuracy_reg(run_values["target"],
                                        run_values["outputs"])
        solved, count, loss = (solved + s, count + c,
                               loss + run_values["loss"])
        # If there is an output list, save outputs.
        write_predictions(items, run_values["outputs"])
    elapsed = time.time() - start_time
    # Return stats.
    return elapsed, solved, loss, count
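One caveat in the distributed branch above: hvd.allreduce adds a new op to the TensorFlow graph on every call, so constructing it inside the batch loop grows the graph over time. A common fix, sketched here with a hypothetical plpps_ph placeholder, is to build the op once and feed the scalar each time:

import horovod.tensorflow as hvd

# Build once, outside the batch loop (assumes hvd.init() was called at
# program start).
plpps_ph = tf.placeholder(tf.float32, shape=[])
avg_plpps_op = hvd.allreduce(plpps_ph)

# Inside the loop, just feed the current value.
avg_plpps = session.run(avg_plpps_op, feed_dict={plpps_ph: plpps})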