def main(_):
    assert FLAGS.checkpoint_dir, "--checkpoint_dir is required."
    assert FLAGS.source_test_path, "--source_test_path is required."
    assert FLAGS.target_test_path, "--target_test_path is required."
    assert FLAGS.reference_test_path, "--reference_test_path is required."
    assert FLAGS.source_vocab_path, "--source_vocab_path is required."
    assert FLAGS.target_vocab_path, "--target_vocab_path is required."

    # Read vocabularies.
    source_vocab, _ = utils.initialize_vocabulary(FLAGS.source_vocab_path)
    target_vocab, _ = utils.initialize_vocabulary(FLAGS.target_vocab_path)

    # Read test set.
    source_sentences, target_sentences, references = utils.read_data_with_ref(
        FLAGS.source_test_path, FLAGS.target_test_path, FLAGS.reference_test_path)

    # Convert sentences to token id sequences.
    source_sentences_ids = [
        utils.sentence_to_token_ids(sent, source_vocab, FLAGS.max_seq_length)
        for sent in source_sentences
    ]
    target_sentences_ids = [
        utils.sentence_to_token_ids(sent, target_vocab, FLAGS.max_seq_length)
        for sent in target_sentences
    ]

    utils.reset_graph()
    with tf.Session() as sess:
        # Restore saved model.
        utils.restore_model(sess, FLAGS.checkpoint_dir)

        # Recover placeholders and ops for evaluation.
        x_source = sess.graph.get_tensor_by_name("x_source:0")
        source_seq_length = sess.graph.get_tensor_by_name("source_seq_length:0")
        x_target = sess.graph.get_tensor_by_name("x_target:0")
        target_seq_length = sess.graph.get_tensor_by_name("target_seq_length:0")
        labels = sess.graph.get_tensor_by_name("labels:0")
        placeholders = [x_source, source_seq_length, x_target, target_seq_length, labels]

        probs = sess.graph.get_tensor_by_name("feed_forward/output/probs:0")

        # Run evaluation.
        evaluate(sess, source_sentences, target_sentences, references,
                 source_sentences_ids, target_sentences_ids, probs, placeholders)
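# NOTE: `evaluate` is defined elsewhere in this module. As an illustration of how
# the recovered placeholders are typically fed, here is a minimal, hypothetical
# scoring helper; the padding scheme and PAD id are assumptions, not the
# project's actual implementation.
def score_batch(sess, probs, placeholders, source_ids, target_ids, pad_id=0):
    """Pad a batch of token-id sequences and return sentence-pair probabilities."""
    x_source, source_seq_length, x_target, target_seq_length, _ = placeholders
    src_len = [len(s) for s in source_ids]
    tgt_len = [len(t) for t in target_ids]
    max_src = max(src_len)
    max_tgt = max(tgt_len)
    # Right-pad every sequence to the longest sequence in the batch.
    src_batch = [s + [pad_id] * (max_src - len(s)) for s in source_ids]
    tgt_batch = [t + [pad_id] * (max_tgt - len(t)) for t in target_ids]
    return sess.run(probs, feed_dict={x_source: src_batch,
                                      source_seq_length: src_len,
                                      x_target: tgt_batch,
                                      target_seq_length: tgt_len})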
def main(_):
    assert FLAGS.checkpoint_dir, "--checkpoint_dir is required."
    assert FLAGS.extract_dir, "--extract_dir is required."
    assert FLAGS.source_vocab_path, "--source_vocab_path is required."
    assert FLAGS.target_vocab_path, "--target_vocab_path is required."
    assert FLAGS.source_output_path, "--source_output_path is required."
    assert FLAGS.target_output_path, "--target_output_path is required."
    assert FLAGS.score_output_path, "--score_output_path is required."
    assert FLAGS.source_language, "--source_language is required."
    assert FLAGS.target_language, "--target_language is required."

    # Read vocabularies.
    source_vocab, _ = utils.initialize_vocabulary(FLAGS.source_vocab_path)
    target_vocab, _ = utils.initialize_vocabulary(FLAGS.target_vocab_path)
    source_vocab_words = read_vocabulary(FLAGS.source_vocab_path)
    target_vocab_words = read_vocabulary(FLAGS.target_vocab_path)

    # Read source and target paths for sentence extraction.
    source_paths = []
    target_paths = []
    for file in os.listdir(FLAGS.extract_dir):
        if file.endswith(FLAGS.source_language):
            source_paths.append(os.path.join(FLAGS.extract_dir, file))
        elif file.endswith(FLAGS.target_language):
            target_paths.append(os.path.join(FLAGS.extract_dir, file))
    source_paths.sort()
    target_paths.sort()

    utils.reset_graph()
    with tf.Session() as sess:
        # Restore saved model.
        utils.restore_model(sess, FLAGS.checkpoint_dir)

        # Recover placeholders and ops for extraction.
        x_source = sess.graph.get_tensor_by_name("x_source:0")
        source_seq_length = sess.graph.get_tensor_by_name("source_seq_length:0")
        x_target = sess.graph.get_tensor_by_name("x_target:0")
        target_seq_length = sess.graph.get_tensor_by_name("target_seq_length:0")
        labels = sess.graph.get_tensor_by_name("labels:0")
        placeholders = [x_source, source_seq_length, x_target, target_seq_length, labels]

        probs = sess.graph.get_tensor_by_name("feed_forward/output/probs:0")

        with open(FLAGS.source_output_path, mode="w", encoding="utf-8") as source_output_file,\
             open(FLAGS.target_output_path, mode="w", encoding="utf-8") as target_output_file,\
             open(FLAGS.score_output_path, mode="w", encoding="utf-8") as score_output_file:

            # Score every source/target article combination.
            for source_path, target_path in itertools.product(source_paths, target_paths):
                # Read sentences from articles.
                source_sentences, target_sentences = read_articles(source_path, target_path)

                # Convert sentences to token id sequences.
                source_sentences_ids = [
                    utils.sentence_to_token_ids(sent, source_vocab, FLAGS.max_seq_length)
                    for sent in source_sentences
                ]
                target_sentences_ids = [
                    utils.sentence_to_token_ids(sent, target_vocab, FLAGS.max_seq_length)
                    for sent in target_sentences
                ]

                # Extract sentence pairs.
                pairs = extract_pairs(sess, source_sentences, target_sentences,
                                      source_sentences_ids, target_sentences_ids,
                                      probs, placeholders)
                if not pairs:
                    continue

                for source_sentence, target_sentence, score in pairs:
                    source_output_file.write(source_sentence)
                    target_output_file.write(target_sentence)
                    score_output_file.write(str(score) + "\n")
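# NOTE: `read_vocabulary` is not shown above. A minimal sketch of what it likely
# does (one token per line, order preserved); this is an assumption, not the
# project's actual helper.
def read_vocabulary(vocab_path):
    """Return the vocabulary file as a list of words, one per line."""
    with open(vocab_path, mode="r", encoding="utf-8") as vocab_file:
        return [line.strip() for line in vocab_file]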
def build_graph(self):
    # Reset previous graph.
    reset_graph()

    # Placeholders.
    x_source = tf.placeholder(tf.int32, shape=[None, None], name="x_source")
    source_seq_length = tf.placeholder(tf.int32, shape=[None], name="source_seq_length")
    x_target = tf.placeholder(tf.int32, shape=[None, None], name="x_target")
    target_seq_length = tf.placeholder(tf.int32, shape=[None], name="target_seq_length")
    labels = tf.placeholder(tf.float32, shape=[None], name="labels")

    input_dropout = tf.placeholder_with_default(1.0, shape=[], name="input_dropout")
    output_dropout = tf.placeholder_with_default(1.0, shape=[], name="output_dropout")
    decision_threshold = tf.placeholder_with_default(0.5, shape=[], name="decision_threshold")

    # Embedding layer.
    with tf.variable_scope("embeddings"):
        if (self.config.source_embeddings_path is not None
                and self.config.target_embeddings_path is not None):
            source_pretrained_embeddings, target_pretrained_embeddings = get_pretrained_embeddings(
                self.config.source_embeddings_path,
                self.config.target_embeddings_path,
                self.source_vocab,
                self.target_vocab)
            assert source_pretrained_embeddings.shape[1] == target_pretrained_embeddings.shape[1]
            self.config.embedding_size = source_pretrained_embeddings.shape[1]

            if self.config.fix_pretrained:
                source_embeddings = tf.get_variable(
                    name="source_embeddings_matrix",
                    shape=[self.config.source_vocab_size, self.config.embedding_size],
                    initializer=tf.constant_initializer(source_pretrained_embeddings),
                    trainable=False)
                target_embeddings = tf.get_variable(
                    name="target_embeddings_matrix",
                    shape=[self.config.target_vocab_size, self.config.embedding_size],
                    initializer=tf.constant_initializer(target_pretrained_embeddings),
                    trainable=False)
            else:
                source_embeddings = tf.get_variable(
                    name="source_embeddings_matrix",
                    shape=[self.config.source_vocab_size, self.config.embedding_size],
                    initializer=tf.constant_initializer(source_pretrained_embeddings))
                target_embeddings = tf.get_variable(
                    name="target_embeddings_matrix",
                    shape=[self.config.target_vocab_size, self.config.embedding_size],
                    initializer=tf.constant_initializer(target_pretrained_embeddings))
        else:
            source_embeddings = tf.get_variable(
                name="source_embeddings_matrix",
                shape=[self.config.source_vocab_size, self.config.embedding_size])
            target_embeddings = tf.get_variable(
                name="target_embeddings_matrix",
                shape=[self.config.target_vocab_size, self.config.embedding_size])

        source_rnn_inputs = tf.nn.embedding_lookup(source_embeddings, x_source)
        target_rnn_inputs = tf.nn.embedding_lookup(target_embeddings, x_target)
        source_rnn_inputs = tf.nn.dropout(source_rnn_inputs,
                                          keep_prob=input_dropout,
                                          name="source_seq_embeddings")
        target_rnn_inputs = tf.nn.dropout(target_rnn_inputs,
                                          keep_prob=input_dropout,
                                          name="target_seq_embeddings")

    # BiRNN encoder.
with tf.variable_scope("birnn") as scope: if self.config.use_lstm: cell_fw = tf.nn.rnn_cell.LSTMCell(self.config.state_size, use_peepholes=True) cell_bw = tf.nn.rnn_cell.LSTMCell(self.config.state_size, use_peepholes=True) else: cell_fw = tf.nn.rnn_cell.GRUCell(self.config.state_size) cell_bw = tf.nn.rnn_cell.GRUCell(self.config.state_size) cell_fw = tf.nn.rnn_cell.DropoutWrapper(cell_fw, output_keep_prob=output_dropout) cell_bw = tf.nn.rnn_cell.DropoutWrapper(cell_bw, output_keep_prob=output_dropout) if self.config.num_layers > 1: if self.config.use_lstm: cell_fw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(self.config.state_size, use_peepholes=True) for _ in range(self.config.num_layers)]) cell_bw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(self.config.state_size, use_peepholes=True) for _ in range(self.config.num_layers)]) else: cell_fw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(self.config.state_size) for _ in range(self.config.num_layers)]) cell_bw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(self.config.state_size) for _ in range(self.config.num_layers)]) with tf.variable_scope(scope): source_rnn_outputs, source_final_state = tf.nn.bidirectional_dynamic_rnn( cell_fw=cell_fw, cell_bw=cell_bw, inputs=source_rnn_inputs, sequence_length=source_seq_length, dtype=tf.float32) with tf.variable_scope(scope, reuse=True): target_rnn_outputs, target_final_state = tf.nn.bidirectional_dynamic_rnn( cell_fw=cell_fw, cell_bw=cell_bw, inputs=target_rnn_inputs, sequence_length=target_seq_length, dtype=tf.float32) self.config.state_size *= 2 # Mean and max pooling only work for 1 layer BiRNN. if self.config.use_mean_pooling: source_final_state = self.average_pooling(source_rnn_outputs, source_seq_length) target_final_state = self.average_pooling(target_rnn_outputs, target_seq_length) elif self.config.use_max_pooling: source_final_state = self.max_pooling(source_rnn_outputs) target_final_state = self.max_pooling(target_rnn_outputs) else: source_final_state_fw, source_final_state_bw = source_final_state target_final_state_fw, target_final_state_bw = target_final_state if self.config.num_layers > 1: source_final_state_fw = source_final_state_fw[-1] source_final_state_bw = source_final_state_bw[-1] target_final_state_fw = target_final_state_fw[-1] target_final_state_bw = target_final_state_bw[-1] if self.config.use_lstm: source_final_state_fw = source_final_state_fw.h source_final_state_bw = source_final_state_bw.h target_final_state_fw = target_final_state_fw.h target_final_state_bw = target_final_state_bw.h source_final_state = tf.concat([source_final_state_fw, source_final_state_bw], axis=1, name="source_final_state_ph") target_final_state = tf.concat([target_final_state_fw, target_final_state_bw], axis=1) # Feed-forward neural network. 
with tf.variable_scope("feed_forward"): h_multiply = tf.multiply(source_final_state, target_final_state) h_abs_diff = tf.abs(tf.subtract(source_final_state, target_final_state)) W_1 = tf.get_variable(name="W_1", shape=[self.config.state_size, self.config.hidden_size]) W_2 = tf.get_variable(name="W_2", shape=[self.config.state_size, self.config.hidden_size]) b_1 = tf.get_variable(name="b_1", shape=[self.config.hidden_size], initializer=tf.constant_initializer(0.0)) h_semantic = tf.tanh(tf.matmul(h_multiply, W_1) + tf.matmul(h_abs_diff, W_2) + b_1) W_3 = tf.get_variable(name="W_3", shape=[self.config.hidden_size, 1]) b_2 = tf.get_variable(name="b_2", shape=[1], initializer=tf.constant_initializer(0.0)) logits = tf.matmul(h_semantic, W_3) + b_2 logits = tf.squeeze(logits, name="logits") # Sigmoid output layer. with tf.name_scope("output"): probs = tf.sigmoid(logits, name="probs") predicted_class = tf.cast(tf.greater(probs, decision_threshold), tf.float32, name="predicted_class") # Loss. with tf.name_scope("cross_entropy"): losses = tf.nn.sigmoid_cross_entropy_with_logits( logits=logits, labels=labels, name="cross_entropy_per_sequence") mean_loss = tf.reduce_mean(losses, name="cross_entropy_loss") # Optimization. with tf.name_scope("optimization"): global_step = tf.Variable(initial_value=0, trainable=False, name="global_step") optimizer = tf.train.AdamOptimizer(self.config.learning_rate) trainable_variables = tf.trainable_variables() gradients = tf.gradients(mean_loss, trainable_variables, name="gradients") clipped_gradients, global_norm = tf.clip_by_global_norm( gradients, clip_norm=self.config.max_gradient_norm, name="clipped_gradients") train_op = optimizer.apply_gradients(zip(clipped_gradients, trainable_variables), global_step=global_step) # Evaluation metrics. accuracy = tf.metrics.accuracy(labels, predicted_class, name="accuracy") precision = tf.metrics.precision(labels, predicted_class, name="precision") recall = tf.metrics.recall(labels, predicted_class, name="recall") # Add summaries. tf.summary.scalar("loss", mean_loss) tf.summary.scalar("global_norm", global_norm) tf.summary.scalar("accuracy", accuracy[0]) tf.summary.scalar("precision", precision[0]) tf.summary.scalar("recall", recall[0]) tf.summary.scalar("logits" + "/sparsity", tf.nn.zero_fraction(logits)) tf.summary.histogram("logits" + "/activations", logits) tf.summary.histogram("probs", probs) # Add histogram for trainable variables. for var in trainable_variables: tf.summary.histogram(var.op.name, var) # Add histogram for gradients. for grad, var in zip(clipped_gradients, trainable_variables): if grad is not None: tf.summary.histogram(var.op.name + "/gradients", grad) # Assign placeholders and operations. self.x_source = x_source self.x_target = x_target self.source_seq_length = source_seq_length self.target_seq_length = target_seq_length self.labels = labels self.input_dropout = input_dropout self.output_dropout = output_dropout self.decision_threshold = decision_threshold self.train_op = train_op self.probs = probs self.predicted_class = predicted_class self.mean_loss = mean_loss self.accuracy = accuracy self.precision = precision self.recall = recall self.summaries = tf.summary.merge_all() self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
def main(_):
    assert FLAGS.checkpoint_dir, "--checkpoint_dir is required."
    assert FLAGS.extract_dir, "--extract_dir is required."
    assert FLAGS.source_vocab_path, "--source_vocab_path is required."
    assert FLAGS.target_vocab_path, "--target_vocab_path is required."
    assert FLAGS.source_output_path, "--source_output_path is required."
    assert FLAGS.target_output_path, "--target_output_path is required."
    assert FLAGS.score_output_path, "--score_output_path is required."
    assert FLAGS.source_language, "--source_language is required."
    assert FLAGS.target_language, "--target_language is required."

    # Read vocabularies.
    source_vocab, _ = utils.initialize_vocabulary(FLAGS.source_vocab_path)
    target_vocab, _ = utils.initialize_vocabulary(FLAGS.target_vocab_path)

    # Read source and target paths for sentence extraction.
    source_paths = []
    target_paths = []
    for file in os.listdir(FLAGS.extract_dir):
        if file.endswith(FLAGS.source_language):
            source_paths.append(os.path.join(FLAGS.extract_dir, file))
        elif file.endswith(FLAGS.target_language):
            target_paths.append(os.path.join(FLAGS.extract_dir, file))
    source_paths.sort()
    target_paths.sort()

    utils.reset_graph()
    with tf.Session() as sess:
        # Restore saved model.
        utils.restore_model(sess, FLAGS.checkpoint_dir)

        # Recover placeholders and ops for extraction.
        x_source = sess.graph.get_tensor_by_name("x_source:0")
        source_seq_length = sess.graph.get_tensor_by_name("source_seq_length:0")
        x_target = sess.graph.get_tensor_by_name("x_target:0")
        target_seq_length = sess.graph.get_tensor_by_name("target_seq_length:0")
        labels = sess.graph.get_tensor_by_name("labels:0")
        placeholders = [x_source, source_seq_length, x_target, target_seq_length, labels]

        probs = sess.graph.get_tensor_by_name("feed_forward/output/probs:0")
        source_final_state_ph = sess.graph.get_tensor_by_name("birnn/source_final_state_ph:0")

        with open(FLAGS.source_output_path, mode="w", encoding="utf-8") as source_output_file,\
             open(FLAGS.target_output_path, mode="w", encoding="utf-8") as target_output_file,\
             open(FLAGS.score_output_path, mode="w", encoding="utf-8") as score_output_file:

            for source_path, target_path in zip(source_paths, target_paths):
                # Read sentences from articles.
                source_sentences, target_sentences = read_articles(source_path, target_path)

                # Convert sentences to token id sequences.
                source_sentences_ids = [
                    utils.sentence_to_token_ids(sent, source_vocab, FLAGS.max_seq_length)
                    for sent in source_sentences
                ]
                target_sentences_ids = [
                    utils.sentence_to_token_ids(sent, target_vocab, FLAGS.max_seq_length)
                    for sent in target_sentences
                ]

                # Extract sentence pairs.
                pairs = extract_pairs(sess, source_sentences, target_sentences,
                                      source_sentences_ids, target_sentences_ids,
                                      probs, placeholders, source_final_state_ph)
                if not pairs:
                    continue

                for source_sentence, target_sentence, score in pairs:
                    source_output_file.write(source_sentence)
                    target_output_file.write(target_sentence)
                    score_output_file.write(str(score) + "\n")
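# NOTE: `extract_pairs` and `read_articles` are defined elsewhere. As a rough,
# hypothetical sketch of the extraction logic implied by the calls above: score
# every source/target sentence combination with the classifier and keep pairs
# whose probability clears a threshold. The per-pair feeding, the threshold
# value, and the omission of the `source_final_state_ph` caching used by the
# real function are all assumptions, not the project's actual implementation.
def extract_pairs_sketch(sess, source_sentences, target_sentences,
                         source_sentences_ids, target_sentences_ids,
                         probs, placeholders, threshold=0.99):
    """Return (source_sentence, target_sentence, score) tuples above `threshold`."""
    x_source, source_seq_length, x_target, target_seq_length, _ = placeholders
    pairs = []
    for i, src_ids in enumerate(source_sentences_ids):
        for j, tgt_ids in enumerate(target_sentences_ids):
            score = sess.run(probs, feed_dict={x_source: [src_ids],
                                               source_seq_length: [len(src_ids)],
                                               x_target: [tgt_ids],
                                               target_seq_length: [len(tgt_ids)]})
            if float(score) >= threshold:
                pairs.append((source_sentences[i], target_sentences[j], float(score)))
    return pairs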