Example #1
def evaluate(target_list):
    """ evaluate the model 
  """
    # virtual screen log file
    log_dir = "log_files"
    logpath = os.path.join(log_dir, "pk_eval.log")
    logfile = open(logpath, "w")
    logfile.write("pk_eval starts at: %s\n" % datetime.datetime.now())

    # get input dataset
    train_dataset_dict = dict()
    test_dataset_dict = dict()
    for target in target_list:
        train_dataset_dict[target] = pk_input.get_inputs_by_cpickle(
            "data_files/pkl_files/" + target + "_train.pkl")
        test_dataset_dict[target] = pk_input.get_inputs_by_cpickle(
            "data_files/pkl_files/" + target + "_test.pkl")

    neg_dataset = pk_input.get_inputs_by_cpickle(
        "data_files/pkl_files/pubchem_neg_sample.pkl")

    with tf.Graph().as_default(), tf.device("/gpu:0"):

        # build the model
        input_placeholder = tf.placeholder(tf.float32, shape=(None, 8192))
        label_placeholder = tf.placeholder(tf.float32, shape=(None, 2))
        # build the "Tree" with a mutual "Term" and several "Branches"
        base = dnn_model.term(input_placeholder, keep_prob=1.0)
        softmax_dict = dict()
        wd_loss_dict = dict()
        x_entropy_dict = dict()
        loss_dict = dict()
        accuracy_dict = dict()
        for target in target_list:
            # compute softmax
            softmax_dict[target] = dnn_model.branch(target,
                                                    base,
                                                    keep_prob=1.0)
            # compute loss.
            wd_loss_dict[target] = tf.add_n(
                tf.get_collection("term_wd_loss") +
                tf.get_collection(target + "_wd_loss"))
            x_entropy_dict[target] = dnn_model.x_entropy(
                softmax_dict[target], label_placeholder, target)
            loss_dict[target] = tf.add(wd_loss_dict[target],
                                       x_entropy_dict[target])
            # compute accuracy
            accuracy_dict[target] = dnn_model.accuracy(softmax_dict[target],
                                                       label_placeholder,
                                                       target)

        # create a saver.
        saver = tf.train.Saver(tf.trainable_variables())

        # create session.
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.per_process_gpu_memory_fraction = 0.5
        sess = tf.Session(config=config)

        # Restores variables from checkpoint
        saver.restore(sess, "ckpt_files/model.ckpt-40000")

        # eval train dataset
        for target in target_list:
            t0 = float(time.time())
            compds = numpy.vstack(
                [train_dataset_dict[target].compds, neg_dataset.compds])
            labels = numpy.vstack(
                [train_dataset_dict[target].labels, neg_dataset.labels])
            t1 = float(time.time())
            LV, XLV, ACC, prediction, label_dense = sess.run(
                [
                    wd_loss_dict[target], x_entropy_dict[target],
                    accuracy_dict[target],
                    tf.argmax(softmax_dict[target], 1),
                    tf.argmax(labels, 1)
                ],
                feed_dict={
                    input_placeholder: compds,
                    label_placeholder: labels,
                })
            t2 = time.time()
            TP, TN, FP, FN, SEN, SPE, MCC = dnn_model.compute_performance(
                label_dense, prediction)
            format_str = "%6d %6d %6.3f %6.3f %10.3f %5d %5d %5d %5d %6.3f %6.3f %6.3f %6.3f %5.3f %5.3f %s"
            logfile.write(format_str %
                          (5000, 40000, LV, XLV, 0, TP, FN, TN, FP, SEN, SPE,
                           ACC, MCC, t1 - t0, t2 - t1, target))
            logfile.write('\n')
            print(format_str % (5000, 40000, LV, XLV, 0, TP, FN, TN, FP, SEN,
                                SPE, ACC, MCC, t1 - t0, t2 - t1, target))

        # eval test dataset
        for target in target_list:
            t0 = float(time.time())
            compds = test_dataset_dict[target].compds
            labels = test_dataset_dict[target].labels
            t1 = float(time.time())
            LV, XLV, ACC, prediction, label_dense = sess.run(
                [
                    wd_loss_dict[target], x_entropy_dict[target],
                    accuracy_dict[target],
                    tf.argmax(softmax_dict[target], 1),
                    tf.argmax(labels, 1)
                ],
                feed_dict={
                    input_placeholder: compds,
                    label_placeholder: labels,
                })
            t2 = time.time()
            TP, TN, FP, FN, SEN, SPE, MCC = dnn_model.compute_performance(
                label_dense, prediction)
            format_str = "%6d %6d %6.3f %6.3f %10.3f %5d %5d %5d %5d %6.3f %6.3f %6.3f %6.3f %5.3f %5.3f %s"
            logfile.write(format_str %
                          (5000, 40000, LV, XLV, 0, TP, FN, TN, FP, SEN, SPE,
                           ACC, MCC, t1 - t0, t2 - t1, target))
            logfile.write('\n')
            print(format_str % (5000, 40000, LV, XLV, 0, TP, FN, TN, FP, SEN,
                                SPE, ACC, MCC, t1 - t0, t2 - t1, target))

    logfile.close()
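The metrics unpacked from dnn_model.compute_performance are not shown in this example. Below is a minimal numpy sketch of a helper with the same return signature (TP, TN, FP, FN, SEN, SPE, MCC), assuming dense 0/1 label and prediction vectors; the actual implementation in dnn_model may differ.

import numpy

def compute_performance_sketch(labels_dense, predictions):
    """Confusion-matrix metrics for a binary classifier (hypothetical stand-in)."""
    labels_dense = numpy.asarray(labels_dense)
    predictions = numpy.asarray(predictions)
    tp = int(numpy.sum((labels_dense == 1) & (predictions == 1)))
    tn = int(numpy.sum((labels_dense == 0) & (predictions == 0)))
    fp = int(numpy.sum((labels_dense == 0) & (predictions == 1)))
    fn = int(numpy.sum((labels_dense == 1) & (predictions == 0)))
    sen = tp / float(tp + fn) if (tp + fn) else 0.0  # sensitivity (recall)
    spe = tn / float(tn + fp) if (tn + fp) else 0.0  # specificity
    denom = numpy.sqrt(float(tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    mcc = (tp * tn - fp * fn) / denom if denom else 0.0  # Matthews correlation coefficient
    return tp, tn, fp, fn, sen, spe, mcc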
Example #2
def virtual_screening_chemdiv(target, g_step, gpu_num=0):
    t_0 = time.time()

    # dataset
    d = ci.DatasetChemDiv(target)
    # batch size
    batch_size = 128
    # input vec_len
    input_vec_len = d.num_features
    # keep prob
    keep_prob = 0.8
    # weight decay
    wd = 0.004
    # g_step
    #g_step = 2236500

    # virtual screen pred file
    pred_dir = "pred_files/%s" % target
    if not os.path.exists(pred_dir):
        os.makedirs(pred_dir)
    pred_path = os.path.join(
        pred_dir, "vs_chemdiv_%s_%d_%4.3f_%4.3e_%d.pred" %
        (target, batch_size, keep_prob, wd, g_step))
    predfile = open(pred_path, 'w')
    print("virtual screen ChemDiv starts at: %s\n" % datetime.datetime.now())

    # checkpoint file
    ckpt_dir = "ckpt_files/%s" % target
    ckpt_path = os.path.join(
        ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd))

    # screening
    with tf.Graph().as_default(), tf.device("/gpu:%d" % gpu_num):
        # the input
        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(None, input_vec_len))
        # the term
        base = dnn_model.term(input_placeholder,
                              in_units=input_vec_len,
                              wd=wd,
                              keep_prob=1.0)
        # the branches
        softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0)
        # create a saver.
        saver = tf.train.Saver(tf.trainable_variables())
        # Start screen
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.per_process_gpu_memory_fraction = 0.35

        with tf.Session(config=config) as sess:
            # Restores variables from checkpoint
            saver.restore(sess, ckpt_path + "-%d" % g_step)

            for ids, features in d.batch_generator_chemdiv(vs_batch_size):
                sm = sess.run(softmax, feed_dict={input_placeholder: features})
                for id_, sm_v in zip(ids, sm[:, 1]):
                    predfile.write("%s\t%f\n" % (id_, sm_v))
            """
      try:     
        while True:
          ids, features = d.generate_batch(vs_batch_size)
          sm = sess.run(softmax, feed_dict = {input_placeholder: features.toarray()})
          for id_, sm_v in zip(ids, sm[:, 1]):
            predfile.write("%s\t%f\n" % (id_, sm_v))
      except StopIteration:
        pass
      """
    predfile.close()
    print("duration: %.3f" % (time.time() - t_0))
Example #3
def train(target,
          gpu_num=0,
          tpm=0,
          train_from=0,
          keep_prob=0.8,
          wd=0.004,
          batch_size=128):
    """"""
    # dataset
    d = ci.Dataset(target, train_pos_multiply=tpm)
    d.test_features_dense = d.test_features.toarray()
    # learning rate
    step_per_epoch = int(d.train_size /
                         batch_size)  # approximately equal to 7456
    start_learning_rate = 0.05
    decay_step = step_per_epoch * 10
    decay_rate = 0.9
    # max train steps
    max_step = 300 * step_per_epoch
    # input vec_len
    input_vec_len = d.num_features
    # checkpoint file
    ckpt_dir = "ckpt_files/%s" % target
    ckpt_path = os.path.join(
        ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd))
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
    # train log file
    log_dir = "log_files"
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    log_path = os.path.join(
        log_dir,
        "train_%s_%d_%4.3f_%4.3e.log" % (target, batch_size, keep_prob, wd))
    logfile = open(log_path, 'w')
    logfile.write("train starts at: %s\n" % datetime.datetime.now())

    # build dnn model and train
    with tf.Graph().as_default(), tf.device('/gpu:%d' % gpu_num):
        # placeholders
        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(None, input_vec_len))
        label_placeholder = tf.placeholder(tf.float32, shape=(None, 2))
        # global step and learning rate
        global_step = tf.Variable(train_from, trainable=False)
        learning_rate = tf.train.exponential_decay(start_learning_rate,
                                                   global_step, decay_step,
                                                   decay_rate)
        # build a Graph that computes the softmax predictions from the
        # inference model.
        base = dnn_model.term(input_placeholder,
                              in_units=input_vec_len,
                              wd=wd,
                              keep_prob=keep_prob)
        # compute softmax
        softmax = dnn_model.branch(target, base, wd=wd, keep_prob=keep_prob)
        # compute loss.
        wd_loss = tf.add_n(
            tf.get_collection("term_wd_loss") +
            tf.get_collection(target + "_wd_loss"))
        x_entropy = dnn_model.x_entropy(softmax,
                                        label_placeholder,
                                        target,
                                        neg_weight=1)
        loss = tf.add(wd_loss, x_entropy)
        # train op
        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(
            loss, global_step=global_step)
        # create a saver.
        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=None)
        # start running operations on the Graph.
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.per_process_gpu_memory_fraction = 0.3
        sess = tf.Session(config=config)
        # initialize all variables at first.
        sess.run(tf.initialize_all_variables())
        if train_from != 0:
            saver.restore(sess, ckpt_path + "-%d" % train_from)
        # print title to screen and log file
        title_str = "  step g_step wdloss   xloss learn_rate    TP    FN    TN    FP    SEN    SPE    ACC    MCC t1-t0 t2-t1 t3-t2  target"
        print(title_str)
        logfile.write(title_str + "\n")

        # format str
        format_str = "%6d %6d %6.4f %7.5f %10.8f %5d %5d %5d %5d %6.4f %6.4f %6.4f %6.4f %5.3f %5.3f %5.3f %10s "

        # train the model
        for step in xrange(max_step):
            t0 = time.time()

            # get a batch sample
            perm = d.generate_perm_for_train_batch(batch_size)
            compds_batch = d.train_features[perm].toarray()
            labels_batch_one_hot = d.train_labels_one_hot[perm]
            t1 = time.time()
            # train once
            _ = sess.run(
                [train_op],
                feed_dict={
                    input_placeholder: compds_batch,
                    label_placeholder: labels_batch_one_hot
                })
            t2 = time.time()

            # compute performance for the train batch
            if step % step_per_epoch == 0 or (step + 1) == max_step:
                g_step, wd_ls, x_ls, lr, pred = sess.run(
                    [
                        global_step, wd_loss, x_entropy, learning_rate,
                        tf.argmax(softmax, 1)
                    ],
                    feed_dict={
                        input_placeholder: compds_batch,
                        label_placeholder: labels_batch_one_hot
                    })
                tp, tn, fp, fn, sen, spe, acc, mcc = ci.compute_performance(
                    d.train_labels[perm], pred)
                t3 = float(time.time())
                logfile.write(
                    format_str %
                    (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe,
                     acc, mcc, t1 - t0, t2 - t1, t3 - t2, target) + "\n")
                print(format_str %
                      (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen, spe,
                       acc, mcc, t1 - t0, t2 - t1, t3 - t2, target))

            # save the model checkpoint periodically.
            if step % (10 * step_per_epoch) == 0 or (step + 1) == max_step:
                saver.save(sess,
                           ckpt_path,
                           global_step=global_step,
                           write_meta_graph=False)

            # compute performance for the test data
            if step % (10 * step_per_epoch) == 0 or (step + 1) == max_step:
                x_ls, pred = sess.run(
                    [x_entropy, tf.argmax(softmax, 1)],
                    feed_dict={
                        input_placeholder: d.test_features_dense,
                        label_placeholder: d.test_labels_one_hot
                    })
                tp, tn, fp, fn, sen, spe, acc, mcc = ci.compute_performance(
                    d.test_labels, pred)
                logfile.write(format_str %
                              (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp,
                               sen, spe, acc, mcc, 0, 0, 0, target) + "\n")
                print(format_str % (step, g_step, wd_ls, x_ls, lr, tp, fn, tn,
                                    fp, sen, spe, acc, mcc, 0, 0, 0, target))

    logfile.write("train ends at: %s\n" % datetime.datetime.now())
    logfile.close()
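tf.train.exponential_decay (with the default staircase=False used here) decays the learning rate continuously as start_learning_rate * decay_rate ** (global_step / decay_step). A small pure-Python sketch of this schedule with the hyperparameters from this example (start 0.05, decay by 0.9 every 10 epochs of roughly 7456 steps):

def decayed_learning_rate(global_step, start_lr=0.05, decay_step=74560, decay_rate=0.9):
    """Continuous exponential decay, same formula as tf.train.exponential_decay."""
    return start_lr * decay_rate ** (float(global_step) / decay_step)

# learning rate after 0, 10, and 100 epochs (decay_step = 10 * step_per_epoch)
for step in (0, 74560, 745600):
    print(step, decayed_learning_rate(step))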
Example #4
def evaluate(target,
             g_step_list=None,
             gpu_num=0,
             keep_prob=0.8,
             wd=0.004,
             batch_size=128):
    """ evaluate the model 
  """
    # dataset
    d = ci.Dataset(target)
    # learning rate
    step_per_epoch = int(d.train_size / batch_size)
    # input vec_len
    input_vec_len = d.num_features
    # checkpoint file
    ckpt_dir = "ckpt_files/%s" % target
    ckpt_path = os.path.join(
        ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd))

    # pred file
    pred_dir = "pred_files/%s" % target
    if not os.path.exists(pred_dir):
        os.mkdir(pred_dir)

    print("%s eval starts at: %s\n" % (target, datetime.datetime.now()))

    # g_step_list
    #g_step_list = range(1, 2235900, 10 * step_per_epoch)
    #g_step_list.append(2235900)

    with tf.Graph().as_default(), tf.device("/gpu:%d" % gpu_num):
        # build the model
        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(None, input_vec_len))
        label_placeholder = tf.placeholder(tf.float32, shape=(None, 2))
        # build the "Tree" with a mutual "Term" and several "Branches"
        base = dnn_model.term(input_placeholder,
                              in_units=input_vec_len,
                              wd=wd,
                              keep_prob=1.0)
        # compute softmax
        softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0)

        # create a saver.
        saver = tf.train.Saver(tf.trainable_variables())
        # create session.
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.per_process_gpu_memory_fraction = 0.2
        sess = tf.Session(config=config)

        for g_step in g_step_list:
            # Restores variables from checkpoint
            saver.restore(sess, ckpt_path + "-%d" % g_step)

            # the whole pns
            pns_pred_file = open(
                pred_dir + "/pns_%s_%d_%4.3f_%4.3e_%d.pred" %
                (target, batch_size, keep_prob, wd, g_step), "w")
            for ids, features, mask in d.batch_generator_pns(eval_batch_size):
                sm = sess.run(softmax, feed_dict={input_placeholder: features})
                for i, s, m in zip(ids, sm[:, 1], mask):
                    pns_pred_file.write("%s\t%f\t%d\n" % (i, s, m))
            pns_pred_file.close()

            # the whole cns
            cns_pred_file = open(
                pred_dir + "/cns_%s_%d_%4.3f_%4.3e_%d.pred" %
                (target, batch_size, keep_prob, wd, g_step), "w")
            for ids, features, mask in d.batch_generator_cns(eval_batch_size):
                sm = sess.run(softmax, feed_dict={input_placeholder: features})
                for i, s, m in zip(ids, sm[:, 1], mask):
                    cns_pred_file.write("%s\t%f\t%d\n" % (i, s, m))
            cns_pred_file.close()

            # the target's train
            train_pred_file = open(
                pred_dir + "/train_%s_%d_%4.3f_%4.3e_%d.pred" %
                (target, batch_size, keep_prob, wd, g_step), "w")
            sm = sess.run(softmax,
                          feed_dict={
                              input_placeholder:
                              d.target_features_train.toarray()
                          })
            for i, s, m in zip(d.target_ids_train, sm[:, 1],
                               d.target_labels_train):
                train_pred_file.write("%s\t%f\t%d\n" % (i, s, m))
            train_pred_file.close()

            # the target's test
            test_pred_file = open(
                pred_dir + "/test_%s_%d_%4.3f_%4.3e_%d.pred" %
                (target, batch_size, keep_prob, wd, g_step), "w")
            sm = sess.run(softmax,
                          feed_dict={
                              input_placeholder:
                              d.target_features_test.toarray()
                          })
            for i, s, m in zip(d.target_ids_test, sm[:, 1],
                               d.target_labels_test):
                test_pred_file.write("%s\t%f\t%d\n" % (i, s, m))
            test_pred_file.close()

    print("eval ends at: %s\n" % datetime.datetime.now())
Example #5
def test(target, g_step):
    # dataset
    d = ci.DatasetTarget(target)
    # batch size
    batch_size = 128
    # keep prob
    keep_prob = 0.8
    # weight decay
    wd = 0.004
    # checkpoint file
    ckpt_dir = "ckpt_files/%s" % target
    ckpt_path = os.path.join(
        ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd))
    # input vec_len
    input_vec_len = d.num_features

    with tf.Graph().as_default(), tf.device("/gpu:3"):
        # build the model
        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(None, input_vec_len))
        label_placeholder = tf.placeholder(tf.float32, shape=(None, 2))
        # build the "Tree" with a mutual "Term" and several "Branches"
        base = dnn_model.term(input_placeholder,
                              in_units=input_vec_len,
                              wd=wd,
                              keep_prob=1.0)
        # compute softmax
        softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0)
        # compute loss.
        wd_loss = tf.add_n(
            tf.get_collection("term_wd_loss") +
            tf.get_collection(target + "_wd_loss"))
        x_entropy = dnn_model.x_entropy(softmax, label_placeholder, target)
        loss = tf.add(wd_loss, x_entropy)
        # create a saver.
        saver = tf.train.Saver(tf.trainable_variables())
        # create session.
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.per_process_gpu_memory_fraction = 0.2
        sess = tf.Session(config=config)

        saver.restore(sess, ckpt_path + "-%d" % g_step)
        sm = sess.run(
            softmax,
            feed_dict={input_placeholder: d.target_features_test.toarray()})

        fpr, tpr, _ = roc_curve(d.target_labels_test, sm[:, 1])
        roc_auc = auc(fpr, tpr)
        plt.figure()
        plt.plot(fpr,
                 tpr,
                 color="r",
                 lw=2,
                 label="ROC curve (area = %.2f)" % roc_auc)
        plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("Receiver operating characteristic of DNN model on %s" %
                  target)
        plt.legend(loc="lower right")
        plt.savefig("%s.png" % target)
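A self-contained sketch of the same ROC/AUC computation on synthetic labels and scores, handy for exercising the plotting code without a trained checkpoint; the data here is random and unrelated to the model.

import numpy
from sklearn.metrics import roc_curve, auc

rng = numpy.random.RandomState(0)
labels = rng.randint(0, 2, size=200)          # synthetic 0/1 labels
scores = 0.5 * labels + 0.5 * rng.rand(200)   # scores loosely correlated with the labels
fpr, tpr, _ = roc_curve(labels, scores)
print("AUC = %.3f" % auc(fpr, tpr))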
Example #6
def virtual_screening(target_list, part_num):

    # virtual screen log file
    log_dir = "log_files"
    logpath = os.path.join(log_dir, "virtual_screen_pubchem_%d.log" % part_num)
    logfile = open(logpath, "w")
    logfile.write("virtual screen %d starts at: %s\n" %
                  (part_num, datetime.datetime.now()))

    # input and output dir
    pkl_dir = "/raid/xiaotaw/pubchem/pkl_files"
    prediction_dir = "/raid/xiaotaw/pubchem/prediction_files"
    if not os.path.exists(prediction_dir):
        os.mkdir(prediction_dir)

    # screening
    with tf.Graph().as_default(), tf.device("/gpu:%d" % (part_num // 3)):
        # the input
        input_placeholder = tf.placeholder(tf.float32, shape=(None, 8192))

        # the term
        base = dnn_model.term(input_placeholder, keep_prob=1.0)

        # the branches
        softmax_dict = dict()
        for target in target_list:
            softmax_dict[target] = dnn_model.branch(target,
                                                    base,
                                                    keep_prob=1.0)

        # create a saver.
        saver = tf.train.Saver(tf.trainable_variables())

        # Start screen
        prediction_dict = dict()
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.per_process_gpu_memory_fraction = 0.2
        with tf.Session(config=config) as sess:
            # Restores variables from checkpoint
            saver.restore(sess, "ckpt_files/model.ckpt-40000")

            #for i in xrange(1, 121225001, 25000):
            begin_num = part_num * 10000000 + 1
            if part_num == 11:
                end_num = 121225001
            else:
                end_num = (part_num + 1) * 10000000 + 1

            for i in xrange(begin_num, end_num, 25000):
                start_time = float(time.time())
                # get input compounds
                in_file = "Compound_" + "{:0>9}".format(
                    i) + "_" + "{:0>9}".format(i + 24999) + ".pkl"
                if not os.path.exists(os.path.join(pkl_dir, in_file)):
                    logfile.write("%s\t0\tnot exists" % in_file)
                    continue
                infile = open(os.path.join(pkl_dir, in_file), "rb")
                data = cPickle.load(infile)
                numpy.clip(data, 0, 1, out=data)
                compds = data.astype(numpy.float32)
                infile.close()
                for target in target_list:
                    prediction_dict[target] = sess.run(
                        tf.argmax(softmax_dict[target], 1),
                        feed_dict={input_placeholder: compds})

                # stack prediction result into a matrix with shape = (num_compds, num_targets)
                prediction = numpy.vstack(
                    [prediction_dict[k] for k in target_list]).T
                logfile.write(
                    "%s\t%s\t%d\n" %
                    (in_file, prediction.sum(axis=0), compds.shape[0]))
                # convert into sparse matrix
                if not prediction.sum() == 0:
                    sparse_prediction = sparse.csr_matrix(prediction)
                    # save result into file
                    out_file = in_file.replace("pkl", "prediction")
                    outfile = open(os.path.join(prediction_dir, out_file),
                                   "wb")
                    cPickle.dump(sparse_prediction, outfile, protocol=2)
                    outfile.close()
                    #logfile.write(str(sparse_prediction)+"\n")
                print("%s\t%s\t%d\t%.3f" %
                      (in_file, prediction.sum(axis=0), compds.shape[0],
                       time.time() - start_time))
    logfile.write("virtual screen %d ends at: %s\n" %
                  (part_num, datetime.datetime.now()))
    logfile.close()
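The prediction matrix is converted to a scipy.sparse CSR matrix before pickling because most compounds are predicted inactive for most targets, so the dense 0/1 matrix is overwhelmingly zeros. A minimal sketch of that save/load round trip on a toy matrix, using the standard-library pickle module in place of Python 2's cPickle:

import pickle
import numpy
from scipy import sparse

prediction = numpy.zeros((25000, 25), dtype=numpy.int64)  # toy (num_compds, num_targets) matrix
prediction[0, 3] = 1                                       # one predicted active
sparse_prediction = sparse.csr_matrix(prediction)

with open("toy.prediction", "wb") as outfile:
    pickle.dump(sparse_prediction, outfile, protocol=2)
with open("toy.prediction", "rb") as infile:
    restored = pickle.load(infile)
print(restored.shape, restored.nnz)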
Example #7
def virtual_screening_single(target, g_step, part_num, gpu_num):
    t_0 = time.time()

    # dataset
    d = ci.DatasetVS(target)
    # batch size
    batch_size = 128
    # input vec_len
    input_vec_len = d.num_features
    # keep prob
    keep_prob = 0.8
    # weight decay
    wd = 0.004
    # g_step
    #g_step = 2236500

    # virtual screen pred file
    pred_dir = "pred_files/%s" % target
    if not os.path.exists(pred_dir):
        os.makedirs(pred_dir)
    pred_path = os.path.join(
        pred_dir, "vs_pubchem_%s_%d_%4.3f_%4.3e_%d_%d.pred" %
        (target, batch_size, keep_prob, wd, g_step, part_num))
    predfile = open(pred_path, 'w')
    print("virtual screen %d starts at: %s\n" %
          (part_num, datetime.datetime.now()))

    # checkpoint file
    ckpt_dir = "ckpt_files/%s" % target
    ckpt_path = os.path.join(
        ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (batch_size, keep_prob, wd))

    # input and output dir
    fp_dir = "/raid/xiaotaw/pubchem/fp_files/%d" % part_num

    # screening
    with tf.Graph().as_default(), tf.device("/gpu:%d" % gpu_num):
        #with tf.Graph().as_default(), tf.device("/gpu:%d" % (part_num % 4)):
        # the input
        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(None, input_vec_len))
        # the term
        base = dnn_model.term(input_placeholder,
                              in_units=input_vec_len,
                              wd=wd,
                              keep_prob=1.0)
        # the branches
        softmax = dnn_model.branch(target, base, wd=wd, keep_prob=1.0)
        # create a saver.
        saver = tf.train.Saver(tf.trainable_variables())
        # Start screen
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.per_process_gpu_memory_fraction = 0.35
        with tf.Session(config=config) as sess:
            # Restores variables from checkpoint
            saver.restore(sess, ckpt_path + "-%d" % g_step)
            for i in xrange(part_num * 10000000 + 1, (part_num + 1) * 10000000,
                            25000):
                in_file = "Compound_" + "{:0>9}".format(
                    i) + "_" + "{:0>9}".format(i + 24999) + ".apfp"
                fp_fn = os.path.join(fp_dir, in_file)
                if not os.path.exists(fp_fn):
                    print("%s not exists" % fp_fn)
                    continue
                d.reset(fp_fn)
                compds = d.features_dense
                sm = sess.run(softmax, feed_dict={input_placeholder: compds})
                for id_, sm_v in zip(d.pubchem_id, sm[:, 1]):
                    predfile.write("%s\t%f\n" % (id_, sm_v))
                print("%s\t%d\n" % (fp_fn, len(d.pubchem_id)))

    print("duration: %.3f" % (time.time() - t_0))
Example #8
def train(target_list, train_from=0):

    # dataset
    d = pki.Datasets(target_list)

    # batch size.
    # note: the mean number of neg sample is 25.23 times as many as pos's.
    neg_batch_size = 512
    pos_batch_size_dict = {}
    pos_sum = 0
    for target in target_list:
        pos_sum += d.pos[target].size
    pos_batch_size = int(neg_batch_size * pos_sum / d.neg.size)
    for target in target_list:
        pos_batch_size_dict[target] = int(neg_batch_size * d.pos[target].size /
                                          d.neg.size)
        #pos_batch_size_dict[target] = pos_batch_size
    # learning rate
    step_per_epoch = int(d.neg.size / neg_batch_size)
    start_learning_rate = 0.05
    decay_step = step_per_epoch * 10 * 8
    decay_rate = 0.9
    # max train steps
    max_step = 50 * step_per_epoch
    # input vec_len
    input_vec_len = d.neg.features.shape[1]
    # keep prob
    keep_prob = 0.8
    # weight decay
    wd = 0.001
    # checkpoint file
    ckpt_dir = "ckpt_files_big_tree/pk"
    ckpt_path = os.path.join(
        ckpt_dir, '%d_%4.3f_%4.3e.ckpt' % (neg_batch_size, keep_prob, wd))
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
    # train log file
    log_dir = "log_files_big_tree"
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    log_path = os.path.join(
        log_dir,
        "train_pk_%d_%4.3f_%4.3e.log" % (neg_batch_size, keep_prob, wd))
    logfile = open(log_path, 'w')
    logfile.write("train starts at: %s\n" % datetime.datetime.now())

    # train the model
    with tf.Graph().as_default(), tf.device("/gpu:0"):

        # exponential decay learning rate
        global_step = tf.Variable(train_from, trainable=False)
        learning_rate = tf.train.exponential_decay(start_learning_rate,
                                                   global_step, decay_step,
                                                   decay_rate)

        # build the model
        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(None, input_vec_len))
        label_placeholder = tf.placeholder(tf.float32, shape=(None, 2))
        # build the "Tree" with a mutual "Term" and several "Branches"
        base = dnn_model.term(input_placeholder, wd=wd, keep_prob=keep_prob)
        softmax_dict = dict()
        wd_loss_dict = dict()
        x_entropy_dict = dict()
        loss_dict = dict()
        accuracy_dict = dict()
        train_op_dict = dict()
        for target in target_list:
            # compute softmax
            softmax_dict[target] = dnn_model.branch(target,
                                                    base,
                                                    wd=wd,
                                                    keep_prob=keep_prob)
            # compute loss.
            wd_loss_dict[target] = tf.add_n(
                tf.get_collection("term_wd_loss") +
                tf.get_collection(target + "_wd_loss"))
            x_entropy_dict[target] = dnn_model.x_entropy(
                softmax_dict[target], label_placeholder, target)
            loss_dict[target] = tf.add(wd_loss_dict[target],
                                       x_entropy_dict[target])
            # compute accuracy
            accuracy_dict[target] = dnn_model.accuracy(softmax_dict[target],
                                                       label_placeholder,
                                                       target)
            # train op
            train_op_dict[target] = tf.train.GradientDescentOptimizer(
                learning_rate).minimize(loss_dict[target],
                                        global_step=global_step)
        # create a saver.
        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=None)
        # start running operations on the Graph.
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.per_process_gpu_memory_fraction = 0.8
        sess = tf.Session(config=config)
        # initialize all variables at first.
        sess.run(tf.initialize_all_variables())
        if train_from != 0:
            saver.restore(sess, ckpt_path + "-%d" % train_from)
        # print title to screen and log file
        title_str = "  step g_step wdloss   xloss learn_rate    TP    FN    TN    FP    SEN    SPE    ACC    MCC t1-t0 t2-t1 t3-t2  target"
        print(title_str)
        logfile.write(title_str + "\n")

        # format str
        format_str = "%6d %6d %6.4f %7.5f %10.8f %5d %5d %5d %5d %6.4f %6.4f %6.4f %6.4f %5.3f %5.3f %5.3f %10s "

        # train with max step
        for step in xrange(max_step):
            for target in target_list:
                t0 = time.time()

                # get a batch sample
                compds_batch, labels_batch = d.next_train_batch(
                    target, pos_batch_size_dict[target], neg_batch_size)
                t1 = float(time.time())

                _ = sess.run(train_op_dict[target],
                             feed_dict={
                                 input_placeholder: compds_batch,
                                 label_placeholder: labels_batch
                             })
                t2 = float(time.time())

                # compute performance
                if step % step_per_epoch == 0 or (step + 1) == max_step:
                    g_step, wd_ls, x_ls, lr, acc, pred, label_dense = sess.run(
                        [
                            global_step, wd_loss_dict[target],
                            x_entropy_dict[target], learning_rate,
                            accuracy_dict[target],
                            tf.argmax(softmax_dict[target], 1),
                            tf.argmax(labels_batch, 1)
                        ],
                        feed_dict={
                            input_placeholder: compds_batch,
                            label_placeholder: labels_batch
                        })
                    tp, tn, fp, fn, sen, spe, mcc = dnn_model.compute_performance(
                        label_dense, pred)
                    t3 = float(time.time())
                    # print to file and screen

                    logfile.write(
                        format_str %
                        (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen,
                         spe, acc, mcc, t1 - t0, t2 - t1, t3 - t2, target))
                    logfile.write('\n')
                    print(format_str %
                          (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen,
                           spe, acc, mcc, t1 - t0, t2 - t1, t3 - t2, target))

            # save the model checkpoint periodically.
            if step % (10 * step_per_epoch) == 0 or (step + 1) == max_step:
                saver.save(sess,
                           ckpt_path,
                           global_step=global_step,
                           write_meta_graph=False)

            if (step > 3 * 10 * step_per_epoch) and (step %
                                                     (10 * step_per_epoch) == 0
                                                     or
                                                     (step + 1) == max_step):
                for target in target_list:
                    # the whole train
                    t0 = time.time()
                    compds_batch = numpy.vstack([
                        d.pos[target].features[d.pos[target].train_perm],
                        d.neg.features[d.neg.train_perm]
                    ])
                    labels_batch = numpy.vstack([
                        d.pos[target].labels[d.pos[target].train_perm],
                        d.neg.mask_dict[target][d.neg.train_perm]
                    ])
                    t1 = time.time()
                    t2 = time.time()
                    g_step, wd_ls, x_ls, lr, acc, pred, label_dense = sess.run(
                        [
                            global_step, wd_loss_dict[target],
                            x_entropy_dict[target], learning_rate,
                            accuracy_dict[target],
                            tf.argmax(softmax_dict[target], 1),
                            tf.argmax(labels_batch, 1)
                        ],
                        feed_dict={
                            input_placeholder: compds_batch,
                            label_placeholder: labels_batch
                        })
                    t3 = float(time.time())
                    tp, tn, fp, fn, sen, spe, mcc = dnn_model.compute_performance(
                        label_dense, pred)
                    # print to file and screen
                    logfile.write(
                        format_str %
                        (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen,
                         spe, acc, mcc, t1 - t0, t2 - t1, t3 - t2, target))
                    logfile.write('\n')
                    print(format_str %
                          (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen,
                           spe, acc, mcc, t1 - t0, t2 - t1, t3 - t2, target))

                    # the whole test
                    t0 = time.time()
                    compds_batch = numpy.vstack([
                        d.pos[target].features[d.pos[target].test_perm],
                        d.neg.features[d.neg.test_perm]
                    ])
                    labels_batch = numpy.vstack([
                        d.pos[target].labels[d.pos[target].test_perm],
                        d.neg.mask_dict[target][d.neg.test_perm]
                    ])
                    t1 = time.time()
                    t2 = time.time()
                    g_step, wd_ls, x_ls, lr, acc, pred, label_dense = sess.run(
                        [
                            global_step, wd_loss_dict[target],
                            x_entropy_dict[target], learning_rate,
                            accuracy_dict[target],
                            tf.argmax(softmax_dict[target], 1),
                            tf.argmax(labels_batch, 1)
                        ],
                        feed_dict={
                            input_placeholder: compds_batch,
                            label_placeholder: labels_batch
                        })
                    t3 = float(time.time())
                    tp, tn, fp, fn, sen, spe, mcc = dnn_model.compute_performance(
                        label_dense, pred)
                    # print to file and screen
                    logfile.write(
                        format_str %
                        (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen,
                         spe, acc, mcc, t1 - t0, t2 - t1, t3 - t2, target))
                    logfile.write('\n')
                    print(format_str %
                          (step, g_step, wd_ls, x_ls, lr, tp, fn, tn, fp, sen,
                           spe, acc, mcc, t1 - t0, t2 - t1, t3 - t2, target))

    logfile.write("train ends at: %s\n" % datetime.datetime.now())
    logfile.close()
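The per-target positive batch size above keeps the pos/neg ratio inside each training batch equal to that target's overall ratio in the dataset (the comment notes negatives outnumber positives by roughly 25:1 on average). A small arithmetic sketch of the sizing rule with made-up dataset sizes:

neg_batch_size = 512
neg_size = 250000                                  # hypothetical total number of negative samples
pos_sizes = {"targetA": 4000, "targetB": 12000}    # hypothetical per-target positive counts

pos_batch_size_dict = {
    target: int(neg_batch_size * size / neg_size)  # same rule as pos_batch_size_dict in train()
    for target, size in pos_sizes.items()
}
print(pos_batch_size_dict)  # {'targetA': 8, 'targetB': 24}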