def main(_):
    assert FLAGS.checkpoint_dir, "--checkpoint_dir is required."
    assert FLAGS.source_test_path, "--source_test_path is required."
    assert FLAGS.target_test_path, "--target_test_path is required."
    assert FLAGS.reference_test_path, "--reference_test_path is required."
    assert FLAGS.source_vocab_path, "--source_vocab_path is required."
    assert FLAGS.target_vocab_path, "--target_vocab_path is required."

    # Read vocabularies.
    source_vocab, _ = utils.initialize_vocabulary(FLAGS.source_vocab_path)
    target_vocab, _ = utils.initialize_vocabulary(FLAGS.target_vocab_path)

    # Read test set.
    source_sentences, target_sentences, references = utils.read_data_with_ref(
        FLAGS.source_test_path, FLAGS.target_test_path, FLAGS.reference_test_path)

    # Convert sentences to token id sequences.
    source_sentences_ids = [
        utils.sentence_to_token_ids(sent, source_vocab, FLAGS.max_seq_length)
        for sent in source_sentences
    ]
    target_sentences_ids = [
        utils.sentence_to_token_ids(sent, target_vocab, FLAGS.max_seq_length)
        for sent in target_sentences
    ]

    utils.reset_graph()
    with tf.Session() as sess:
        # Restore saved model.
        utils.restore_model(sess, FLAGS.checkpoint_dir)

        # Recover placeholders and ops for evaluation.
        x_source = sess.graph.get_tensor_by_name("x_source:0")
        source_seq_length = sess.graph.get_tensor_by_name("source_seq_length:0")
        x_target = sess.graph.get_tensor_by_name("x_target:0")
        target_seq_length = sess.graph.get_tensor_by_name("target_seq_length:0")
        labels = sess.graph.get_tensor_by_name("labels:0")
        placeholders = [x_source, source_seq_length, x_target, target_seq_length, labels]
        probs = sess.graph.get_tensor_by_name("feed_forward/output/probs:0")

        # Run evaluation.
        evaluate(sess, source_sentences, target_sentences, references,
                 source_sentences_ids, target_sentences_ids, probs, placeholders)
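# These snippets rely on a project-specific `utils` module that is not shown here.
# A minimal sketch of what `utils.sentence_to_token_ids` might look like for the
# call forms used in these snippets (sentence plus vocab, with an optional maximum
# length) is given below; the whitespace tokenization and the `unk_id` default are
# assumptions, not the project's actual implementation.
def sentence_to_token_ids_sketch(sentence, vocab, max_seq_length=None, unk_id=3):
    """Map a whitespace-tokenized sentence to vocabulary ids, truncating if needed."""
    tokens = sentence.strip().split()
    ids = [vocab.get(token, unk_id) for token in tokens]
    if max_seq_length is not None:
        ids = ids[:max_seq_length]
    return ids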
def query_model(sess, input_node, predictions, vocab, rev_vocab, max_seq_len,
                output_embs_for_all_vocab):
    while True:
        sys.stdout.write("Type a definition: ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        sys.stdout.write("Number of candidates: ")
        sys.stdout.flush()
        top = int(sys.stdin.readline())

        # Convert the query definition into a padded batch of one sequence.
        token_ids = utils.sentence_to_token_ids(sentence, vocab)
        padded_ids = np.asarray(utils.pad_sequence(token_ids, max_seq_len))
        input_data = np.asarray([padded_ids])

        # Run the model and rank the whole vocabulary by cosine similarity.
        model_preds = sess.run(predictions, feed_dict={input_node: input_data})
        sims = 1 - np.squeeze(
            dist.cdist(model_preds, output_embs_for_all_vocab, metric="cosine"))
        sims = np.nan_to_num(sims)
        candidate_ids = sims.argsort()[::-1][:top]
        candidates = [rev_vocab[idx] for idx in candidate_ids]

        print("\n Top %s candidates from the RNN model:" % top)
        for ii, cand in enumerate(candidates):
            print("%s: %s" % (ii + 1, cand))
        sys.stdout.flush()
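# `utils.pad_sequence` is also project-specific. A plausible sketch, assuming a
# PAD id of 0 and right-padding/truncation to a fixed length (both assumptions):
def pad_sequence_sketch(token_ids, max_seq_len, pad_id=0):
    """Right-pad (or truncate) a list of token ids to exactly max_seq_len entries."""
    ids = list(token_ids)[:max_seq_len]
    return ids + [pad_id] * (max_seq_len - len(ids))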
def query_model(sess, input_node, predictions, vocab, rev_vocab, max_seq_len,
                output_embs_for_all_vocab):
    with tf.gfile.GFile("data/definitions/concept_descriptions.tok", mode="r") as data_file:
        with tf.gfile.GFile("data/output/concept_BOW.txt", mode="w") as output_file:
            for line in data_file:
                top = 10
                token_ids = utils.sentence_to_token_ids(line, vocab)
                padded_ids = np.asarray(
                    utils.pad_sequence(token_ids[1:], max_seq_len))
                input_data = np.asarray([padded_ids])

                model_preds = sess.run(predictions, feed_dict={input_node: input_data})
                sims = 1 - np.squeeze(
                    dist.cdist(model_preds, output_embs_for_all_vocab, metric="cosine"))
                sims = np.nan_to_num(sims)
                candidate_ids = sims.argsort()[::-1][:top]
                candidates = [rev_vocab[idx] for idx in candidate_ids]

                for ii, cand in enumerate(candidates):
                    output_file.write(cand + " ")
                    print(cand + " ")
                output_file.write("\n")
                output_file.flush()
                print("\n")
def map_to_ids(sentence_tuple):
    token_ids = [
        sentence if vocab is None else utils.sentence_to_token_ids(
            sentence, vocab.vocab,
            character_level=self.character_level.get(ext))
        for ext, vocab, sentence in zip(self.extensions, self.vocabs, sentence_tuple)
    ]
    return token_ids
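# Hypothetical usage of map_to_ids: one sentence per input stream (for example the
# source and target sides of a parallel corpus), zipped against that stream's file
# extension and vocabulary. The example values below are illustrative only.
# sentence_tuple = ("ich bin ein berliner", "i am a berliner")
# token_ids = self.map_to_ids(sentence_tuple)  # -> one id list per stream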
def queryBaseline(pre_emb_for_all_vocab, vocab, rev_vocab):
    while True:
        sys.stdout.write("Type a definition: ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        sys.stdout.write("Number of candidates: ")
        sys.stdout.flush()
        top = int(sys.stdin.readline())

        token_ids = utils.sentence_to_token_ids(sentence, vocab)

        base_rep_mean = np.asarray(
            [np.mean(pre_emb_for_all_vocab[token_ids], axis=0)])
        print("Top %s baseline candidates from W2V mean/add model:" % top)
        for ii, cand in enumerate(
                get_Candidates_Answers(base_rep_mean, pre_emb_for_all_vocab, top, rev_vocab)):
            print("%s: %s" % (ii + 1, cand))

        # base_rep_add = np.asarray([np.sum(pre_emb_for_all_vocab[token_ids], axis=0)])
        # print("Top %s baseline candidates from W2V add model:" % top)
        # for ii, cand in enumerate(get_Candidates_Answers(base_rep_add, pre_emb_for_all_vocab, top, rev_vocab)):
        #     print("%s: %s" % (ii + 1, cand))

        base_rep_mult = np.asarray(
            [np.prod(pre_emb_for_all_vocab[token_ids], axis=0)])
        print("Top %s baseline candidates from W2V mult model:" % top)
        for ii, cand in enumerate(
                get_Candidates_Answers(base_rep_mult, pre_emb_for_all_vocab, top, rev_vocab)):
            print("%s: %s" % (ii + 1, cand))
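# `get_Candidates_Answers` is defined elsewhere in this project. Judging from the
# RNN query function above, it most likely ranks the whole vocabulary by cosine
# similarity to the query representation. The sketch below mirrors that cdist-based
# ranking; it is an assumption, not the project's actual implementation.
import numpy as np
import scipy.spatial.distance as dist

def get_candidates_sketch(query_rep, emb_for_all_vocab, top, rev_vocab):
    """Return the `top` vocabulary words whose embeddings are most similar to query_rep."""
    sims = 1 - np.squeeze(dist.cdist(query_rep, emb_for_all_vocab, metric="cosine"))
    sims = np.nan_to_num(sims)
    candidate_ids = sims.argsort()[::-1][:top]
    return [rev_vocab[idx] for idx in candidate_ids]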
def readCSVhelper(csvFileName, imageDir, word2id, readLabel=True):
    with open(csvFileName, 'r') as csvFile:
        CSVreader = csv.reader(csvFile, skipinitialspace=True, delimiter=',')
        fileIds = []
        fileNames = []
        sentences = []
        features = []
        labels = []
        missingFiles = 0
        print('Reading file %s' % csvFileName)
        next(CSVreader)  # skip header
        for row in tqdm(CSVreader):
            fId = row[0]
            baseName = row[2]
            fName = imageDir + baseName
            if readLabel:
                label = row[6]
            fileIds.append(fId)
            fileNames.append(fName)
            if readLabel:
                labels.append(label)
            disc = row[12]
            tokens, clean_tokens, ids = sentence_to_token_ids(disc, word2id)
            # Sentence length is restricted to 100 tokens.
            paddedIdsList = padded(ids, 100)
            sentences.append(paddedIdsList)
            sqft = float(row[9])
            elemSchool = float(row[18])
            midSchool = float(row[19])
            highSchool = float(row[20])
            walkScore = float(row[21])
            transitScore = float(row[22])
            bikeScore = float(row[23])
            tmpVec = [sqft, elemSchool, midSchool, highSchool,
                      walkScore, transitScore, bikeScore]
            features.append(tmpVec)
        print('Got %d picture ids' % (len(fileIds)))
        print('Got %d picture filenames' % (len(fileNames)))
        print('Got %d sentences' % (len(sentences)))
        print('Got %d features' % (len(features)))
        # Normalize each numeric feature column.
        norm_features = preprocessing.normalize(features, axis=0)
        return fileIds, fileNames, sentences, norm_features, labels
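# Unlike the other snippets, this project's sentence_to_token_ids returns a
# (tokens, clean_tokens, ids) triple rather than a bare id list. A rough sketch of
# such a variant, assuming lowercasing and punctuation stripping as the "cleaning"
# step and an UNK id of 0 (all assumptions):
import re

def sentence_to_token_ids_triple_sketch(sentence, word2id, unk_id=0):
    """Return raw tokens, cleaned tokens, and their vocabulary ids."""
    tokens = sentence.strip().split()
    clean_tokens = [re.sub(r"[^a-z0-9]", "", t.lower()) for t in tokens]
    clean_tokens = [t for t in clean_tokens if t]
    ids = [word2id.get(t, unk_id) for t in clean_tokens]
    return tokens, clean_tokens, ids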
def align(self, output=None, align_encoder_id=0, **kwargs):
    # if self.binary and any(self.binary):
    #     raise NotImplementedError
    if len(self.filenames.test) != len(self.extensions):
        raise Exception('wrong number of input files')

    binary = self.binary and any(self.binary)
    paths = self.filenames.test or [None]
    lines = utils.read_lines(paths, binary=self.binary)

    for line_id, lines in enumerate(lines):
        token_ids = [
            sentence if vocab is None else utils.sentence_to_token_ids(
                sentence, vocab.vocab,
                character_level=self.character_level.get(ext))
            for ext, vocab, sentence in zip(self.extensions, self.vocabs, lines)
        ]
        _, weights = self.seq2seq_model.step(data=[token_ids], align=True,
                                             update_model=False)

        trg_vocab = self.trg_vocab[0]
        trg_token_ids = token_ids[len(self.src_ext)]
        trg_tokens = [
            trg_vocab.reverse[i] if i < len(trg_vocab.reverse) else utils._UNK
            for i in trg_token_ids
        ]

        weights = weights.squeeze()
        max_len = weights.shape[1]

        if binary:
            src_tokens = None
        else:
            src_tokens = lines[align_encoder_id].split()[:max_len - 1] + [utils._EOS]
        trg_tokens = trg_tokens[:weights.shape[0] - 1] + [utils._EOS]

        output_file = '{}.{}.svg'.format(output, line_id + 1) if output is not None else None
        utils.heatmap(src_tokens, trg_tokens, weights, output_file=output_file)
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = utils.initialize_vocabulary(vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            sentence_tokens = utils.basic_tokenizer(tf.compat.as_bytes(sentence))
            token_ids = utils.sentence_to_token_ids(sentence_tokens, vocab)
            # Which bucket does it belong to?
            bucket_id = min([
                b for b in xrange(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(utils.EOS_ID)]
            # Print out French sentence corresponding to outputs.
            print(" ".join(
                [tf.compat.as_str(rev_vocab[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
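# `utils.basic_tokenizer` follows the classic TensorFlow seq2seq tutorial pattern of
# splitting a byte string on whitespace and common punctuation. A sketch along those
# lines; the exact punctuation set is an assumption about this particular utils module.
import re

_WORD_SPLIT_SKETCH = re.compile(br"([.,!?\"':;)(])")

def basic_tokenizer_sketch(sentence):
    """Split a byte string into word and punctuation tokens."""
    words = []
    for fragment in sentence.strip().split():
        words.extend(_WORD_SPLIT_SKETCH.split(fragment))
    return [w for w in words if w]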
def queryBaselineWithConecptDesc(pre_emb_for_all_vocab, vocab, rev_vocab):
    with tf.gfile.GFile("data/definitions/concept_descriptions.tok", mode="r") as data_file:
        with tf.gfile.GFile("data/output/concept_Baseline.txt", mode="w") as output_file:
            for line in data_file:
                top = 100
                token_ids = utils.sentence_to_token_ids(line, vocab)
                base_rep_mean = np.asarray(
                    [np.mean(pre_emb_for_all_vocab[token_ids[1:]], axis=0)])
                print("Top %s baseline candidates from W2V mean/add model:" % top)
                for ii, cand in enumerate(
                        get_Candidates_Answers(base_rep_mean, pre_emb_for_all_vocab,
                                               top, rev_vocab)):
                    output_file.write(cand + " ")
                    print(cand + " ")
                output_file.write("\n")
                output_file.flush()
                print("\n")
def decode():
    with tf.Session() as sess:
        model = create_model(sess, True)
        model.batch_size = 1  # Decode one sentence at a time.

        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.from" % FLAGS.form_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.to" % FLAGS.to_vocab_size)
        en_vocab, _ = utils.init_vocab(en_vocab_path)
        _, rev_fr_vocab = utils.init_vocab(fr_vocab_path)

        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            token_ids = utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)

            # Pick the smallest bucket that fits the sentence; warn if none does.
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence)

            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)

            # Greedy decoding: take the argmax at each step and cut at EOS.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            if utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(utils.EOS_ID)]
            print(" ".join(
                [tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def main(_):
    assert FLAGS.checkpoint_dir, "--checkpoint_dir is required."
    assert FLAGS.extract_dir, "--extract_dir is required."
    assert FLAGS.source_vocab_path, "--source_vocab_path is required."
    assert FLAGS.target_vocab_path, "--target_vocab_path is required."
    assert FLAGS.source_output_path, "--source_output_path is required."
    assert FLAGS.target_output_path, "--target_output_path is required."
    assert FLAGS.score_output_path, "--score_output_path is required."
    assert FLAGS.source_language, "--source_language is required."
    assert FLAGS.target_language, "--target_language is required."

    # Read vocabularies.
    source_vocab, _ = utils.initialize_vocabulary(FLAGS.source_vocab_path)
    target_vocab, _ = utils.initialize_vocabulary(FLAGS.target_vocab_path)
    source_vocab_words = read_vocabulary(FLAGS.source_vocab_path)
    target_vocab_words = read_vocabulary(FLAGS.target_vocab_path)

    # Read source and target paths for sentence extraction.
    source_paths = []
    target_paths = []
    for file in os.listdir(FLAGS.extract_dir):
        if file.endswith(FLAGS.source_language):
            source_paths.append(os.path.join(FLAGS.extract_dir, file))
        elif file.endswith(FLAGS.target_language):
            target_paths.append(os.path.join(FLAGS.extract_dir, file))
    source_paths.sort()
    target_paths.sort()

    utils.reset_graph()
    with tf.Session() as sess:
        # Restore saved model.
        utils.restore_model(sess, FLAGS.checkpoint_dir)

        # Recover placeholders and ops for extraction.
        x_source = sess.graph.get_tensor_by_name("x_source:0")
        source_seq_length = sess.graph.get_tensor_by_name("source_seq_length:0")
        x_target = sess.graph.get_tensor_by_name("x_target:0")
        target_seq_length = sess.graph.get_tensor_by_name("target_seq_length:0")
        labels = sess.graph.get_tensor_by_name("labels:0")
        placeholders = [x_source, source_seq_length, x_target, target_seq_length, labels]
        probs = sess.graph.get_tensor_by_name("feed_forward/output/probs:0")

        with open(FLAGS.source_output_path, mode="w", encoding="utf-8") as source_output_file, \
             open(FLAGS.target_output_path, mode="w", encoding="utf-8") as target_output_file, \
             open(FLAGS.score_output_path, mode="w", encoding="utf-8") as score_output_file:

            # Score every source/target article combination.
            for source_path, target_path in itertools.product(source_paths, target_paths):
                # Read sentences from articles.
                source_sentences, target_sentences = read_articles(source_path, target_path)

                # Convert sentences to token id sequences.
                source_sentences_ids = [
                    utils.sentence_to_token_ids(sent, source_vocab, FLAGS.max_seq_length)
                    for sent in source_sentences
                ]
                target_sentences_ids = [
                    utils.sentence_to_token_ids(sent, target_vocab, FLAGS.max_seq_length)
                    for sent in target_sentences
                ]

                # Extract sentence pairs.
                pairs = extract_pairs(sess, source_sentences, target_sentences,
                                      source_sentences_ids, target_sentences_ids,
                                      probs, placeholders)
                if not pairs:
                    continue
                for source_sentence, target_sentence, score in pairs:
                    source_output_file.write(source_sentence)
                    target_output_file.write(target_sentence)
                    score_output_file.write(str(score) + "\n")
def main(_):
    assert FLAGS.checkpoint_dir, "--checkpoint_dir is required."
    assert FLAGS.extract_dir, "--extract_dir is required."
    assert FLAGS.source_vocab_path, "--source_vocab_path is required."
    assert FLAGS.target_vocab_path, "--target_vocab_path is required."
    assert FLAGS.source_output_path, "--source_output_path is required."
    assert FLAGS.target_output_path, "--target_output_path is required."
    assert FLAGS.score_output_path, "--score_output_path is required."
    assert FLAGS.source_language, "--source_language is required."
    assert FLAGS.target_language, "--target_language is required."

    # Read vocabularies.
    source_vocab, _ = utils.initialize_vocabulary(FLAGS.source_vocab_path)
    target_vocab, _ = utils.initialize_vocabulary(FLAGS.target_vocab_path)

    # Read source and target paths for sentence extraction.
    source_paths = []
    target_paths = []
    for file in os.listdir(FLAGS.extract_dir):
        if file.endswith(FLAGS.source_language):
            source_paths.append(os.path.join(FLAGS.extract_dir, file))
        elif file.endswith(FLAGS.target_language):
            target_paths.append(os.path.join(FLAGS.extract_dir, file))
    source_paths.sort()
    target_paths.sort()

    utils.reset_graph()
    with tf.Session() as sess:
        # Restore saved model.
        utils.restore_model(sess, FLAGS.checkpoint_dir)

        # Recover placeholders and ops for extraction.
        x_source = sess.graph.get_tensor_by_name("x_source:0")
        source_seq_length = sess.graph.get_tensor_by_name("source_seq_length:0")
        x_target = sess.graph.get_tensor_by_name("x_target:0")
        target_seq_length = sess.graph.get_tensor_by_name("target_seq_length:0")
        labels = sess.graph.get_tensor_by_name("labels:0")
        placeholders = [x_source, source_seq_length, x_target, target_seq_length, labels]
        probs = sess.graph.get_tensor_by_name("feed_forward/output/probs:0")
        source_final_state_ph = sess.graph.get_tensor_by_name("birnn/source_final_state_ph:0")

        with open(FLAGS.source_output_path, mode="w", encoding="utf-8") as source_output_file, \
             open(FLAGS.target_output_path, mode="w", encoding="utf-8") as target_output_file, \
             open(FLAGS.score_output_path, mode="w", encoding="utf-8") as score_output_file:

            for source_path, target_path in zip(source_paths, target_paths):
                # Read sentences from articles.
                source_sentences, target_sentences = read_articles(source_path, target_path)

                # Convert sentences to token id sequences.
                source_sentences_ids = [
                    utils.sentence_to_token_ids(sent, source_vocab, FLAGS.max_seq_length)
                    for sent in source_sentences
                ]
                target_sentences_ids = [
                    utils.sentence_to_token_ids(sent, target_vocab, FLAGS.max_seq_length)
                    for sent in target_sentences
                ]

                # Extract sentence pairs.
                pairs = extract_pairs(sess, source_sentences, target_sentences,
                                      source_sentences_ids, target_sentences_ids,
                                      probs, placeholders, source_final_state_ph)
                if not pairs:
                    continue
                for source_sentence, target_sentence, score in pairs:
                    source_output_file.write(source_sentence)
                    target_output_file.write(target_sentence)
                    score_output_file.write(str(score) + "\n")
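# `read_articles` is not shown in these snippets. Given that its outputs are written
# back to the output files unchanged and tokenized sentence by sentence above, a
# minimal sketch that reads one sentence per line from each article file (the UTF-8
# encoding and line-per-sentence layout are assumptions):
def read_articles_sketch(source_path, target_path):
    """Read the source and target articles as lists of sentences, one per line."""
    with open(source_path, encoding="utf-8") as f:
        source_sentences = f.readlines()
    with open(target_path, encoding="utf-8") as f:
        target_sentences = f.readlines()
    return source_sentences, target_sentences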