Example #1
def interpolate():
    properties = get_properties(FLAGS)
    logdir = setup_logdir(FLAGS, properties)
    noise = tf.placeholder(dtype=tf.float32,
                           shape=[FLAGS.batch_size, FLAGS.z_dim])
    model = get_model(FLAGS, properties, logdir, noise)
    generated_seqs = get_generated_seqs(model)
    session_creator = ChiefSessionCreator(
        master='',
        checkpoint_filename_with_path=tf.train.latest_checkpoint(logdir))
    seqs = []
    with MonitoredSession(session_creator=session_creator,
                          hooks=None) as session:
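        # Two random endpoints in latent space; slerp fills the batch with
        # points along the great circle between them.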
        noise1 = np.random.uniform(-1, 1, FLAGS.z_dim)
        noise2 = np.random.uniform(-1, 1, FLAGS.z_dim)
        n = np.stack([
            slerp(ratio, noise1, noise2)
            for ratio in np.linspace(0, 1, FLAGS.batch_size)
        ])
        results, d_scores = session.run(
            [generated_seqs, model.discriminator_fake], feed_dict={noise: n})
        for i in range(FLAGS.batch_size):
            seqs.append(Sequence(id=i, seq=results[i], d_score=d_scores[i]))
        print(
            sequences_to_fasta(seqs,
                               properties['class_mapping'],
                               escape=False,
                               strip_zeros=True))
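
Examples #1 and #5 call a slerp helper that is not shown here. A minimal
sketch of spherical linear interpolation under the usual definition (the
project's own implementation may differ):

import numpy as np

def slerp(ratio, p0, p1):
    # Interpolate along the great circle between p0 and p1.
    omega = np.arccos(np.clip(
        np.dot(p0 / np.linalg.norm(p0), p1 / np.linalg.norm(p1)), -1.0, 1.0))
    so = np.sin(omega)
    if so < 1e-8:  # (nearly) parallel vectors: fall back to linear interpolation
        return (1.0 - ratio) * p0 + ratio * p1
    return (np.sin((1.0 - ratio) * omega) / so * p0 +
            np.sin(ratio * omega) / so * p1)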
Example #2
def main(_, is_test=False, debug_cli=False, debug_ui=False):
    graph = tf.Graph()
    with graph.as_default():
        properties = get_properties(FLAGS)
        # Select model to train
        model = get_model(FLAGS, properties)
        logdir = setup_logdir(FLAGS, properties)

        print_run_meta_data(FLAGS)
        # Add all metadata about the model before training starts.
        add_model_metadata(logdir,
                           os.path.join(os.path.dirname(__file__),
                                        FLAGS.model_type), FLAGS, properties)

        # allow_soft_placement=True, because the Saver ops of the DCGAN model
        # would otherwise be placed on the GPU.
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)

        hooks = get_hooks(debug_cli, debug_ui)
        model_hooks = get_specific_hooks(FLAGS, logdir, properties)
        if hasattr(FLAGS, "static_embedding") and not FLAGS.static_embedding:
            model_hooks.append(get_embedding_hook(model, FLAGS))

        train_ops = GANTrainOps(generator_train_op=model.g_optim,
                                discriminator_train_op=model.d_optim,
                                global_step_inc_op=model.increment_global_step)
        train_steps = GANTrainSteps(FLAGS.g_step, FLAGS.d_step)

        if is_test:
            return graph
        else:
            gan_train(train_ops,
                      get_hooks_fn=get_sequential_train_hooks(train_steps=train_steps),
                      hooks=([tf.train.StopAtStepHook(num_steps=FLAGS.steps)] + hooks + model_hooks),
                      logdir=logdir,
                      save_summaries_steps=FLAGS.save_summary_steps,
                      save_checkpoint_secs=FLAGS.save_checkpoint_sec,
                      config=session_config)
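
The training utilities used above (GANTrainOps, GANTrainSteps, gan_train,
get_sequential_train_hooks) match the TF-GAN API shipped with TensorFlow 1.x
contrib; assuming that is where they come from, the imports would be:

import tensorflow as tf
from tensorflow.contrib.gan import (GANTrainOps, GANTrainSteps, gan_train,
                                    get_sequential_train_hooks)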
Example #3
def main(_):
    FLAGS.properties_file = "properties_test.json"
    properties = get_properties(FLAGS)
    logdir = setup_logdir(FLAGS, properties)
    FLAGS.running_mode = "test"
    model = get_model(FLAGS, properties)

    with tf.variable_scope('model', reuse=True):
        batch = model.data_handler.get_batch(FLAGS.batch_size, FLAGS)
        noise = tf.random_uniform([FLAGS.batch_size, FLAGS.z_dim],
                                  minval=-1.0,
                                  maxval=1.0,
                                  dtype=tf.float32,
                                  name='z0')
        generated_data = model.get_generated_data(noise, batch[1:])
        model.data_handler.display_fake_data(generated_data, batch[1],
                                             FLAGS.batch_size)

    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)

    id_to_enzyme_class_dict = properties["class_mapping"]
    evaluate_repeatedly(checkpoint_dir=logdir,
                        hooks=[
                            BlastHook(id_to_enzyme_class_dict,
                                      every_n_steps=1,
                                      output_dir=logdir,
                                      n_examples=FLAGS.batch_size,
                                      running_mode=FLAGS.running_mode),
                            tf.contrib.training.SummaryAtEndHook(logdir),
                            tf.contrib.training.StopAfterNEvalsHook(1)
                        ],
                        eval_ops=generated_data,
                        config=session_config)
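
evaluate_repeatedly is presumably tf.contrib.training.evaluate_repeatedly,
from the same contrib module that provides the SummaryAtEndHook and
StopAfterNEvalsHook used in the hook list; under that assumption:

from tensorflow.contrib.training import evaluate_repeatedly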
Example #4
def main(_):
    FLAGS.properties_file = "properties_test.json"
    properties = get_properties(FLAGS)
    logdir = setup_logdir(FLAGS, properties)
    FLAGS.running_mode = "test"
    path = os.path.join(FLAGS.data_dir, FLAGS.dataset.replace("\\", os.sep),
                        "d_test.csv")
    # data_to_test = pd.read_csv(path, header=None)[:FLAGS.batch_size]
    # data_to_test = from_amino_acid_to_id(data_to_test, 0)

    model = get_model(FLAGS, properties)

    path = os.path.join(FLAGS.data_dir, FLAGS.dataset.replace("\\", os.sep),
                        "properties_test_original.json")
    with open(path) as json_data_file:
        properties = json.load(json_data_file)

    with tf.variable_scope('model', reuse=True):
        real_x, labels = model.data_handler.get_batch(FLAGS.batch_size, FLAGS)
        real_x, labels = model.data_handler.prepare_real_data(real_x, labels)
        #
        # data = model.data_handler.get_embedded_seqs(data_to_test)
        # d_scores, _ = model.get_discriminator_result(data, batch[1:], reuse=True)
        # printing_d_scores = tf.py_func(
        #     lambda vals, scores: print_protein_seq(vals, properties["class_mapping"], d_scores=scores),
        #     [tf.squeeze(data_to_test), d_scores], tf.string)

        noise = np.random.normal(loc=0.0,
                                 scale=1.0,
                                 size=[FLAGS.batch_size, FLAGS.z_dim])
        generated_data = model.get_generated_data(
            tf.convert_to_tensor(noise, dtype=tf.float32), labels)
        d_fake_scores, _ = model.get_discriminator_result(generated_data,
                                                          labels,
                                                          reuse=True)
        generated_data_ids = model.data_handler.convert_to_acid_ids(
            generated_data)
        printing_sequences = tf.py_func(
            lambda vals, labels: print_protein_seq(
                vals, properties["class_mapping"], labels=labels),
            [tf.squeeze(generated_data_ids), labels[0]], tf.string)

    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)

    evaluate_repeatedly(
        checkpoint_dir=logdir,
        hooks=[
            # BlastHook(id_to_enzyme_class_dict, every_n_steps=1, output_dir=logdir, n_examples=FLAGS.batch_size,
            #           running_mode=FLAGS.running_mode),
            tf.contrib.training.SummaryAtEndHook(logdir),
            tf.contrib.training.StopAfterNEvalsHook(1)
        ],
        eval_ops=[printing_sequences],
        max_number_of_evaluations=1,
        config=session_config)
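
Examples #4 and #5 route printing through tf.py_func, which wraps a plain
Python callable as a graph op. A self-contained TF 1.x sketch of the pattern
(the names here are illustrative, not taken from the project):

import tensorflow as tf

def _describe(vals):
    # Runs as ordinary Python when the op is evaluated; side effects such as
    # printing are allowed.
    text = "batch of {} values, mean {:.3f}".format(vals.size, vals.mean())
    print(text)
    return text

describe_op = tf.py_func(_describe, [tf.random_uniform([4])], tf.string)
with tf.Session() as sess:
    sess.run(describe_op)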
Example #5
def main(_):
    FLAGS.properties_file = "properties_test.json"
    properties = get_properties(FLAGS)
    logdir = setup_logdir(FLAGS, properties)
    FLAGS.running_mode = "test"
    path = os.path.join(FLAGS.data_dir, FLAGS.dataset.replace("\\", os.sep),
                        "d_test.csv")
    data_to_test = pd.read_csv(path, header=None)[:FLAGS.batch_size]
    data_to_test = from_amino_acid_to_id(data_to_test, 0)

    model = get_model(FLAGS, properties)

    with tf.variable_scope('model', reuse=True):
        batch = model.data_handler.get_batch(FLAGS.batch_size, FLAGS)

        data = model.data_handler.get_embedded_seqs(data_to_test)
        d_scores, _ = model.get_discriminator_result(data,
                                                     batch[1:],
                                                     reuse=True)
        printing_d_scores = tf.py_func(
            lambda vals, scores: print_protein_seq(
                vals, properties["class_mapping"], d_scores=scores),
            [tf.squeeze(data_to_test), d_scores], tf.string)

        noise1 = np.random.uniform(size=[FLAGS.z_dim], low=-1.0, high=1.0)
        noise2 = np.random.uniform(size=[FLAGS.z_dim], low=-1.0, high=1.0)
        z = np.stack([
            slerp(ratio, noise1, noise2)
            for ratio in np.linspace(0, 1, FLAGS.batch_size)
        ])
        generated_data = model.get_generated_data(
            tf.convert_to_tensor(z, dtype=tf.float32), batch[1:])
        d_fake_scores, _ = model.get_discriminator_result(generated_data,
                                                          batch[1:],
                                                          reuse=True)
        generated_data_ids = model.data_handler.convert_to_acid_ids(
            generated_data)
        printing_sequences = tf.py_func(
            lambda vals, scores: print_protein_seq(
                vals, properties["class_mapping"], d_scores=scores),
            [tf.squeeze(generated_data_ids), d_fake_scores], tf.string)

    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)

    evaluate_repeatedly(
        checkpoint_dir=logdir,
        hooks=[
            # BlastHook(id_to_enzyme_class_dict, every_n_steps=1, output_dir=logdir, n_examples=FLAGS.batch_size,
            #           running_mode=FLAGS.running_mode),
            tf.contrib.training.SummaryAtEndHook(logdir),
            tf.contrib.training.StopAfterNEvalsHook(1)
        ],
        eval_ops=[printing_d_scores, printing_sequences],
        max_number_of_evaluations=1,
        config=session_config)
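
For context, a stand-in for the from_amino_acid_to_id preprocessing used in
Examples #4 and #5. This is purely hypothetical: it assumes the helper maps
one-letter residue codes in a DataFrame column to integer ids, which the real
implementation may do differently:

AA_TO_ID = {a: i + 1 for i, a in enumerate("ACDEFGHIKLMNPQRSTVWY")}  # 0 = padding

def from_amino_acid_to_id(df, column):
    # Hypothetical sketch: convert each sequence string in a DataFrame column
    # into a list of integer ids.
    return df[column].map(lambda seq: [AA_TO_ID.get(c, 0) for c in seq])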
Example #6
def get_discriminator_results():
    properties = get_properties(FLAGS)
    logdir = setup_logdir(FLAGS, properties)
    noise = tf.placeholder(dtype=tf.float32,
                           shape=[FLAGS.batch_size, FLAGS.z_dim])
    model = get_model(FLAGS, properties, logdir, noise)
    s1 = [FLAGS.batch_size, properties[SEQ_LENGTH]]
    input_ph = tf.placeholder(dtype=tf.int32, shape=s1)
    data = tf.expand_dims(
        tf.transpose(tf.one_hot(input_ph, FLAGS.n_seqs, axis=1), [0, 2, 1]),
        axis=1)
    s2 = [FLAGS.batch_size]
    labels = tf.placeholder(dtype=tf.float32, shape=s2)
    with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
        d, d_h = model.get_discriminator_result(data, labels, reuse=True)

    fasta_seqs = fasta_to_numpy(FLAGS.fasta_path, properties[SEQ_LENGTH])
    session_creator = ChiefSessionCreator(
        master='',
        checkpoint_filename_with_path=tf.train.latest_checkpoint(logdir))
    seqs = []
    with MonitoredSession(session_creator=session_creator,
                          hooks=None) as session:
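        # Score the FASTA sequences in fixed-size batches; the last batch is
        # zero-padded up to batch_size and the padding rows are discarded.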
        for i in range(0, len(fasta_seqs), FLAGS.batch_size):
            print("Processing batch", i)
            batch = fasta_seqs[i:i + FLAGS.batch_size]
            n_real = len(batch)
            if n_real < FLAGS.batch_size:
                batch = np.vstack([
                    batch,
                    np.zeros([FLAGS.batch_size - n_real, properties[SEQ_LENGTH]])
                ])
            d_scores, step = session.run([d, tf.train.get_global_step()],
                                         feed_dict={
                                             input_ph: batch,
                                             labels: np.zeros(s2)
                                         })
            for j in range(n_real):
                seqs.append(
                    Sequence(id=j + i,
                             seq=fasta_seqs[j + i],
                             d_score=d_scores[j]))
        fasta = sequences_to_fasta(seqs,
                                   properties['class_mapping'],
                                   escape=False,
                                   strip_zeros=True)
        time_stamp = time.strftime('%H_%M_%S', time.gmtime())
        original_name = os.path.splitext(os.path.basename(FLAGS.fasta_path))[0]
        path = os.path.join(
            logdir, '{}_scores_{}_{}.fasta'.format(original_name, step,
                                                   time_stamp))
        with open(path, 'w') as f:
            print(fasta, file=f)
            tf.logging.info('{} sequences stored in {}'.format(
                len(seqs), path))
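
Several examples share the same restore pattern: a ChiefSessionCreator pointed
at tf.train.latest_checkpoint(logdir), wrapped in a MonitoredSession. Both
classes are available as tf.train.ChiefSessionCreator and
tf.train.MonitoredSession in TensorFlow 1.x; reduced to its core:

import tensorflow as tf

logdir = "/path/to/logdir"  # placeholder
session_creator = tf.train.ChiefSessionCreator(
    master='',
    checkpoint_filename_with_path=tf.train.latest_checkpoint(logdir))
with tf.train.MonitoredSession(session_creator=session_creator) as session:
    # Variables are restored from the checkpoint before the first run() call.
    pass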
Example #7
def generate_sequences():
    properties = get_properties(FLAGS)
    logdir = setup_logdir(FLAGS, properties)
    tf.logging.info('Noise will have standard deviation of {}'.format(
        FLAGS.stddev))
    noise = tf.random.truncated_normal([FLAGS.batch_size, FLAGS.z_dim],
                                       stddev=FLAGS.stddev,
                                       dtype=tf.float32)
    model = get_model(FLAGS, properties, logdir, noise)
    if FLAGS.one_hot:
        generated_seqs = tf.squeeze(tf.argmax(model.fake_x, axis=-1))
    else:
        generated_seqs = convert_to_acid_ids(model.fake_x)
    seqs = []
    session_creator = ChiefSessionCreator(
        master='',
        checkpoint_filename_with_path=tf.train.latest_checkpoint(logdir))
    with MonitoredSession(session_creator=session_creator,
                          hooks=None) as session:
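        # Sample batches until at least n_seqs sequences have been collected.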
        while True:
            results, step = session.run(
                [generated_seqs, tf.train.get_global_step()], None)
            offset = len(seqs)
            for i in range(FLAGS.batch_size):
                seqs.append(Sequence(id=offset + i, seq=results[i]))
            if len(seqs) >= FLAGS.n_seqs:
                break
    time_stamp = time.strftime('%H_%M_%S', time.gmtime())
    path = os.path.join(logdir,
                        'generated_{}_{}.fasta'.format(step, time_stamp))
    fasta = sequences_to_fasta(seqs,
                               properties['class_mapping'],
                               escape=False,
                               strip_zeros=True)
    if FLAGS.blast:
        db_path = os.path.join(
            FLAGS.data_dir, FLAGS.dataset,
            FLAGS.blast_db.replace("\\", os.sep) + "_" + FLAGS.running_mode)
        blast_results, err = get_local_blast_results(logdir, db_path, fasta)
        seqs, evalues, similarities, identity = update_sequences_with_blast_results(
            blast_results, seqs)
        print_stats([("Evalue", evalues), ("BLOMSUM45", similarities),
                     ("Identity", identity)], len(seqs))
        fasta = sequences_to_fasta(seqs,
                                   properties['class_mapping'],
                                   escape=False,
                                   strip_zeros=True)
    with open(path, 'w') as f:
        print(fasta, file=f)
        tf.logging.info('{} sequences stored in {}'.format(len(seqs), path))
    tf.logging.info('Finished evaluation at ' +
                    time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
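
The Sequence record is constructed with id, seq, and an optional d_score
(Example #7 omits it, Examples #1 and #6 pass it). A hypothetical stand-in,
assuming the project defines something equivalent:

import collections

Sequence = collections.namedtuple("Sequence", ["id", "seq", "d_score"])
Sequence.__new__.__defaults__ = (None,)  # defaults apply right-aligned: only d_score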
Example #8
def raw_results():
    properties = get_properties(FLAGS)
    logdir = setup_logdir(FLAGS, properties)
    noise = tf.random.truncated_normal([FLAGS.batch_size, FLAGS.z_dim],
                                       stddev=0.5,
                                       dtype=tf.float32)
    model = get_model(FLAGS, properties, logdir, noise)
    raw_generations = tf.squeeze(model.fake_x)
    session_creator = ChiefSessionCreator(
        master='',
        checkpoint_filename_with_path=tf.train.latest_checkpoint(logdir))
    with MonitoredSession(session_creator=session_creator,
                          hooks=None) as session:
        results, step = session.run(
            [raw_generations, tf.train.get_global_step()], None)
        time_stamp = time.strftime('%H_%M_%S', time.gmtime())
        path = os.path.join(logdir, 'raw_{}_{}.npz'.format(step, time_stamp))
        with open(path, 'wb') as f:
            np.savez(f, results)
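
np.savez is called with a positional array, so NumPy stores it under the
default key arr_0; assuming path still points at the file written above, the
dump can be reloaded with:

import numpy as np

with np.load(path) as data:
    raw = data["arr_0"]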