import datetime
import os
import time

import numpy as np
import tensorflow as tf
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

import batchgen                    # local helper module (assumed)
from cnn_lstm import CNN_LSTM      # local model definition (assumed import path)

# Hyperparameters such as goodfile, badfile, dev_size, filter_sizes,
# num_filters, l2_reg_lambda, dropout_prob, batch_size, evaluate_every,
# checkpoint_every and num_checkpoints, as well as the helper generateData,
# are assumed to be defined at module level (e.g. via tf.flags).


def main(embdFilePath, embdDim, outFolderName, num_epochs):
    # Load data
    print("Loading data...")
    x_text, y = batchgen.get_dataset(goodfile, badfile, 5000)

    # TODO: MAX LENGTH
    max_document_length = max([len(x.split(" ")) for x in x_text])
    x, y, vocab_processor = generateData(embdFilePath, embdDim, x_text, y,
                                         max_document_length)

    # Randomly shuffle data
    np.random.seed(42)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev set
    # TODO: This is very crude; should use cross-validation
    dev_sample_index = -1 * int(dev_size * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

    # Training
    # ==================================================
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            model = CNN_LSTM(x_train.shape[1], y_train.shape[1],
                             len(vocab_processor.vocabulary_), embdDim,
                             filter_sizes, num_filters, l2_reg_lambda)

            # Define the training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(model.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, outFolderName, timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", model.loss)
            acc_summary = tf.summary.scalar("accuracy", model.accuracy)

            # Train summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir,
                                                         sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)
            # Checkpoint directory. TensorFlow assumes this directory already
            # exists, so we need to create it.
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Training step
            def train_step(x_batch, y_batch, save=False):
                feed_dict = {
                    model.input_x: x_batch,
                    model.input_y: y_batch,
                    model.dropout_keep_prob: dropout_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, model.loss,
                     model.accuracy], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                if save:
                    train_summary_writer.add_summary(summaries, step)

            # Evaluate the model on the dev set
            def dev_step(x_batch, y_batch, writer=None, save=False):
                feed_dict = {
                    model.input_x: x_batch,
                    model.input_y: y_batch,
                    # Disable dropout at evaluation time (the original fed
                    # 0.5 here, which skews dev metrics).
                    model.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, model.loss, model.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                if save and writer:
                    writer.add_summary(summaries, step)

            # Create the batch generator
            batches = batchgen.gen_batch(list(zip(x_train, y_train)),
                                         batch_size, num_epochs)

            # Train on each batch
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                if current_step % checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix,
                                      global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))

            # Final evaluation and checkpoint after training
            dev_step(x_dev, y_dev, writer=dev_summary_writer)
            path = saver.save(sess, checkpoint_prefix, global_step=current_step)
            print("Saved model checkpoint to {}\n".format(path))
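# batchgen.gen_batch is called throughout this listing but never defined in
# it. Below is a minimal sketch of what it presumably does, inferred from its
# call sites (yield mini-batches of `batch_size` items, reshuffling the data
# each epoch for `num_epochs` epochs); the name gen_batch_sketch and the
# shuffle default are assumptions, and the real implementation may differ.
def gen_batch_sketch(data, batch_size, num_epochs, shuffle=True):
    data = list(data)
    data_size = len(data)
    num_batches = (data_size - 1) // batch_size + 1
    for _ in range(num_epochs):
        # Reshuffle once per epoch so batches differ between epochs
        order = np.random.permutation(data_size) if shuffle else range(data_size)
        epoch_data = [data[i] for i in order]
        for b in range(num_batches):
            start = b * batch_size
            yield epoch_data[start:start + batch_size]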
# --- Fragment from a second training-script variant: its dev_step also
# --- returns the model output, and training evaluates on a validation set.
                if save and writer:
                    writer.add_summary(summaries, step)
                return accuracy, output

            # Testing step (kept commented out, as in the original)
            # def test_step(x_batch, y_batch, writer=None, save=True):
            #     feed_dict = {model.input_x: x_batch,
            #                  model.input_y: y_batch,
            #                  model.dropout_keep_prob: dropout_prob}
            #     step, summaries, output, accuracy = sess.run(
            #         [global_step, test_summary_op, model.scores,
            #          model.accuracy], feed_dict)
            #     # step, output, accuracy = sess.run(
            #     #     [global_step, model.scores, model.accuracy], feed_dict)
            #     time_str = datetime.datetime.now().isoformat()
            #     print("{}: step {}, acc {:g}".format(time_str, step, accuracy))
            #     if save and writer:
            #         writer.add_summary(summaries, step)
            #     return output

            # Create the batch generators
            batches = batchgen.gen_batch(list(zip(X_tr, y_tr)),
                                         batch_size, num_epochs)
            batches_val = batchgen.gen_batch(list(zip(X_val, y_val)),
                                             batch_size, num_epochs)

            # Train on each batch, evaluating on the validation set
            max_acc = 0
            if istrain:
                for batch in batches:
                    x_batch, y_batch = zip(*batch)
                    train_step(x_batch, y_batch)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % evaluate_every == 0:
                        print("\nEvaluation:")
                        accuracy, _ = dev_step(X_val, y_val,
                                               writer=dev_summary_writer)
                        print("")
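# Note that `batches_val` above is created but never consumed; evaluation
# feeds the whole validation set in one pass. A hypothetical sketch (not in
# the original) of batched validation that averages per-batch accuracy,
# assuming dev_step returns (accuracy, output) as in the fragment above:
def batched_dev_accuracy(batches_val):
    accs = []
    for batch in batches_val:
        x_batch, y_batch = zip(*batch)
        acc, _ = dev_step(x_batch, y_batch)
        accs.append(acc)
    # Simple mean; a weighted mean would be needed if the last batch is short
    return float(np.mean(accs))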
# --- Fragment from a third variant: dev_step also returns predictions and a
# --- semantic representation, and only the best-scoring model is checkpointed.
                step, predictions, summaries, loss, accuracy, semantic = sess.run(
                    [global_step, model.score, dev_summary_op, model.loss,
                     model.accuracy, model.semantic], feed_dict)
                # step, predictions, loss, accuracy = sess.run(
                #     [global_step, model.scores, model.loss, model.accuracy],
                #     feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                if save and writer:
                    writer.add_summary(summaries, step)
                return accuracy, predictions, semantic

            # Create the batch generator (labels one-hot encoded)
            batches = batchgen.gen_batch(
                list(zip(x_train, np_utils.to_categorical(y_train))),
                batch_size, num_epochs)

            # Train on each batch, checkpointing only when test accuracy improves
            max_acc = 0
            if istrain:
                for i, batch in enumerate(batches):
                    x_batch, y_batch = zip(*batch)
                    predictions, semantic = train_step(x_batch, y_batch)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % evaluate_every == 0:
                        print("\nEvaluation:")
                        accuracy, predictions, semantic = dev_step(
                            x_test, y_test, writer=dev_summary_writer)
                        if accuracy > max_acc:
                            max_acc = accuracy
                            # The original listing is cut off mid-call here;
                            # completed to match the save pattern used above.
                            path = saver.save(sess, checkpoint_prefix,
                                              global_step=current_step)
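# np_utils.to_categorical above is Keras's one-hot encoder
# (from keras.utils import np_utils). For reference, a plain-NumPy
# equivalent of the behaviour relied on here, assuming 1-D integer labels
# (the name to_categorical_np is ours, not part of the original code):
def to_categorical_np(y, num_classes=None):
    y = np.asarray(y, dtype=int)
    if num_classes is None:
        num_classes = y.max() + 1
    one_hot = np.zeros((y.size, num_classes), dtype=np.float32)
    one_hot[np.arange(y.size), y] = 1.0
    return one_hot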
# --- DeepLoc-style inference: re-encode sequences, restore a trained model,
# --- and split sequences by predicted localisation probability.
def pre(seq, value):
    global pres0, pres
    X = []
    for sequence in seq:
        sequence = sequence.tolist()
        # Map token ids back to amino-acid letters; unknown ids become '0'
        for i, t in enumerate(sequence):
            t = id2word.get(int(t))
            if t is None:
                sequence[i] = '0'
            else:
                sequence[i] = t
        sequences = []
        sequence = ''.join(sequence)
        # Collapse ambiguous amino-acid codes onto canonical letters
        sequence = sequence.replace('X', '0')
        sequence = sequence.replace('U', '0')
        sequence = sequence.replace('O', '0')
        sequence = sequence.replace('B', 'N')
        sequence = sequence.replace('Z', 'Q')
        sequence = sequence.replace('J', 'L')
        sequence = list(sequence)
        if len(sequence) >= max_sequence_size:
            # Too long: repeatedly drop the residue at position 51 until it fits
            a = int(len(sequence) - max_sequence_size)
            for i in list(range(a)):
                sequence.pop(51)
            # sequences.append(list((ord(t) - 64) for t in sequence))
            sequences.append(sequence)
        else:
            # Too short: pad the end with '0' up to max_sequence_size
            b = int(max_sequence_size - len(sequence))
            for i in list(range(b)):
                sequence.insert(int(len(sequence)), '0')
            sequences.append(sequence)
        X.append(sequences[0])

    acid_letters = ['0', 'A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M',
                    'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y']
    le = LabelEncoder()
    datas = np_utils.to_categorical(le.fit_transform(acid_letters))

    def two2three(x):
        # One-hot encode every letter of every sequence
        xx = []
        for _, m in enumerate(x):
            k = []
            for j, t in enumerate(m):
                if t not in acid_letters:
                    t = '0'
                n = acid_letters.index(t)
                k.append(datas[n])
            xx.append(k)
        return np.array(xx)

    x_train = np.array(two2three(X))
    print(x_train.shape)

    # Inference
    # ==================================================
    batches = batchgen.gen_batch(x_train, batch_size, num_epochs)
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            model = CNN_LSTM(x_train.shape[1], embedding_dim, filter_sizes,
                             num_filters, num_hidden)

            # Checkpoint directory. TensorFlow assumes this directory already
            # exists, so we need to create it.
            checkpoint_dir = os.path.abspath(
                os.path.join("runs_deeploc", "checkpoints"))
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=num_checkpoints)
            sess.run(tf.global_variables_initializer())

            # Evaluate the model (inference only, dropout disabled)
            def dev_step(x_batch, writer=None):
                feed_dict = {
                    model.input_x: x_batch,
                    model.dropout_keep_prob: 1
                }
                preds = sess.run(model.predictions, feed_dict)
                return preds

            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
            saver.restore(sess, ckpt.model_checkpoint_path)
            new_posi = []
            new_nege = []
            '''
            for i, batch in enumerate(batches):
                if i == 0:
                    pres0 = dev_step(batch)
                elif i == 1:
                    pres1 = dev_step(batch)
                    pres = np.concatenate((pres0, pres1))
                else:
                    press = dev_step(batch)
                    pres = np.concatenate((pres, press))
            '''
            pres = dev_step(x_train)
            # Class order:
            # ['Cell.membrane', 'Cytoplasm', 'Endoplasmic.reticulum',
            #  'Extracellular', 'Golgi.apparatus', 'Lysosome/Vacuole',
            #  'Mitochondrion', 'Nucleus', 'Peroxisome', 'Plastid']
            # Alternative ordering noted in the source, with index mapping
            # [0, 1, 2, 4, 5, 6, 7, 8, 9, 3]:
            # ['Cell.membrane', 'Cytoplasm', 'Endoplasmic.reticulum',
            #  'Golgi.apparatus', 'Lysosome/Vacuole', 'Mitochondrion',
            #  'Nucleus', 'Peroxisome', 'Plastid', 'Extracellular']
            print(len(pres))
            for i, t in enumerate(pres):
                # print(np.argmax(t))
                if t[7] >= value:    # Nucleus (index 7 in the first ordering)
                    new_posi.append(seq[i])
                elif t[3] >= value:  # Extracellular (index 3 in the first ordering)
                    new_nege.append(seq[i])
            print(len(new_posi))
            accuracy = len(new_posi) / len(pres)
            print(accuracy)
            with open("acc_0.65.txt", "a") as f:
                f.write(str(accuracy))
                f.write("\n")
            return new_posi, new_nege
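# A hypothetical driver for pre(), not part of the original listing. The 0.65
# threshold mirrors the "acc_0.65.txt" filename above; `encoded_seqs.npy` is
# an assumed stand-in for whatever id-encoded sequence array the caller has.
if __name__ == "__main__":
    encoded_seqs = np.load("encoded_seqs.npy")   # assumed input file
    positives, negatives = pre(encoded_seqs, value=0.65)
    print("kept {} positives, {} negatives".format(len(positives),
                                                   len(negatives)))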