Example #1
def main(_):
    assert FLAGS.checkpoint_dir, "--checkpoint_dir is required."
    assert FLAGS.source_test_path, "--source_test_path is required."
    assert FLAGS.target_test_path, "--target_test_path is required."
    assert FLAGS.reference_test_path, "--reference_test_path is required."
    assert FLAGS.source_vocab_path, "--source_vocab_path is required."
    assert FLAGS.target_vocab_path, "--target_vocab_path is required."

    # Read vocabularies.
    source_vocab, _ = utils.initialize_vocabulary(FLAGS.source_vocab_path)
    target_vocab, _ = utils.initialize_vocabulary(FLAGS.target_vocab_path)

    # Read test set.
    source_sentences, target_sentences, references = utils.read_data_with_ref(
        FLAGS.source_test_path, FLAGS.target_test_path,
        FLAGS.reference_test_path)

    # Convert sentences to token ids sequences.
    source_sentences_ids = [
        utils.sentence_to_token_ids(sent, source_vocab, FLAGS.max_seq_length)
        for sent in source_sentences
    ]
    target_sentences_ids = [
        utils.sentence_to_token_ids(sent, target_vocab, FLAGS.max_seq_length)
        for sent in target_sentences
    ]

    utils.reset_graph()
    with tf.Session() as sess:
        # Restore saved model.
        utils.restore_model(sess, FLAGS.checkpoint_dir)

        # Recover placeholders and ops for evaluation.
        x_source = sess.graph.get_tensor_by_name("x_source:0")
        source_seq_length = sess.graph.get_tensor_by_name(
            "source_seq_length:0")

        x_target = sess.graph.get_tensor_by_name("x_target:0")
        target_seq_length = sess.graph.get_tensor_by_name(
            "target_seq_length:0")

        labels = sess.graph.get_tensor_by_name("labels:0")

        placeholders = [
            x_source, source_seq_length, x_target, target_seq_length, labels
        ]

        probs = sess.graph.get_tensor_by_name("feed_forward/output/probs:0")

        # Run evaluation.
        evaluate(sess, source_sentences, target_sentences, references,
                 source_sentences_ids, target_sentences_ids, probs,
                 placeholders)
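Every example on this page funnels raw text through `utils.sentence_to_token_ids` before feeding it to a model. The helper itself is not shown here; as a rough sketch (assuming a plain whitespace tokenizer, a word-to-id dictionary, and an `UNK_ID` fallback, none of which are confirmed by these snippets), it would look roughly like this:

UNK_ID = 3  # hypothetical id reserved for out-of-vocabulary words

def sentence_to_token_ids(sentence, vocab, max_seq_length=None):
    """Map a raw sentence to vocabulary ids, optionally truncating the result."""
    tokens = sentence.strip().split()          # naive whitespace tokenization
    ids = [vocab.get(token, UNK_ID) for token in tokens]
    if max_seq_length is not None:
        ids = ids[:max_seq_length]             # enforce the maximum length
    return ids

The real helper may also lowercase, normalize digits, or work at the character level (Example #7 passes a character_level flag), so treat this only as a reading aid.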
Example #2
def query_model(sess, input_node, predictions, vocab, rev_vocab, max_seq_len,
                output_embs_for_all_vocab):
    while True:
        sys.stdout.write("Type a definition: ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        sys.stdout.write("Number of candidates: ")
        sys.stdout.flush()
        top = int(sys.stdin.readline())
        token_ids = utils.sentence_to_token_ids(sentence, vocab)
        padded_ids = np.asarray(utils.pad_sequence(token_ids, max_seq_len))

        input_data = np.asarray([padded_ids])
        model_preds = sess.run(predictions, feed_dict={input_node: input_data})
        sims = 1 - np.squeeze(
            dist.cdist(model_preds, output_embs_for_all_vocab,
                       metric="cosine"))
        sims = np.nan_to_num(sims)
        candidate_ids = sims.argsort()[::-1][:top]
        candidates = [rev_vocab[idx] for idx in candidate_ids]

        print("\n Top %s candidates from the RNN model:" % top)
        for ii, cand in enumerate(candidates):
            print("%s: %s" % (ii + 1, cand))

        sys.stdout.flush()
Example #3
def query_model(sess, input_node, predictions, vocab, rev_vocab, max_seq_len,
                output_embs_for_all_vocab):
    with tf.gfile.GFile("data/definitions/concept_descriptions.tok",
                        mode="r") as data_file:
        with tf.gfile.GFile("data/output/concept_BOW.txt",
                            mode="w") as output_file:
            for line in data_file:
                top = 10
                token_ids = utils.sentence_to_token_ids(line, vocab)
                padded_ids = np.asarray(
                    utils.pad_sequence(token_ids[1:], max_seq_len))

                input_data = np.asarray([padded_ids])
                model_preds = sess.run(predictions,
                                       feed_dict={input_node: input_data})
                sims = 1 - np.squeeze(
                    dist.cdist(model_preds,
                               output_embs_for_all_vocab,
                               metric="cosine"))
                sims = np.nan_to_num(sims)
                candidate_ids = sims.argsort()[::-1][:top]
                candidates = [rev_vocab[idx] for idx in candidate_ids]
                for ii, cand in enumerate(candidates):
                    output_file.write(cand + " ")
                    print(cand + " ")
                output_file.write("\n")
                output_file.flush()
                print("\n")
Example #4
def map_to_ids(sentence_tuple):
    token_ids = [
        sentence if vocab is None else
        utils.sentence_to_token_ids(sentence, vocab.vocab,
                                    character_level=self.character_level.get(ext))
        for ext, vocab, sentence in zip(self.extensions, self.vocabs, sentence_tuple)
    ]
    return token_ids
Example #5
def queryBaseline(pre_emb_for_all_vocab, vocab, rev_vocab):
    while True:
        sys.stdout.write("Type a definition: ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        sys.stdout.write("Number of candidates: ")
        sys.stdout.flush()
        top = int(sys.stdin.readline())
        token_ids = utils.sentence_to_token_ids(sentence, vocab)

        base_rep_mean = np.asarray(
            [np.mean(pre_emb_for_all_vocab[token_ids], axis=0)])
        print("Top %s baseline candidates from W2V mean/add model:" % top)
        for ii, cand in enumerate(
                get_Candidates_Answers(base_rep_mean, pre_emb_for_all_vocab,
                                       top, rev_vocab)):
            print("%s: %s" % (ii + 1, cand))

        # base_rep_add = np.asarray([np.sum(pre_emb_for_all_vocab[token_ids], axis=0)])
        # print("Top %s baseline candidates from W2V add model:" % top)
        # for ii, cand in enumerate(get_Candidates_Answers(base_rep_add, pre_emb_for_all_vocab, top, rev_vocab)):
        #     print("%s: %s" % (ii + 1, cand))

        base_rep_mult = np.asarray(
            [np.prod(pre_emb_for_all_vocab[token_ids], axis=0)])
        print("Top %s baseline candidates from W2V mult model:" % top)
        for ii, cand in enumerate(
                get_Candidates_Answers(base_rep_mult, pre_emb_for_all_vocab,
                                       top, rev_vocab)):
            print("%s: %s" % (ii + 1, cand))
Example #6
def readCSVhelper(csvFileName, imageDir, word2id, readLabel=True):
    with open(csvFileName, 'r') as csvFile:
        CSVreader = csv.reader(csvFile, skipinitialspace=True, delimiter=',')
        fileIds = []
        fileNames = []
        sentences = []
        features = []
        labels = []
        missingFiles = 0
        print('Reading file %s' % csvFileName)
        next(CSVreader)  # skip header
        for row in tqdm(CSVreader):
            fId = row[0]
            baseName = row[2]
            fName = imageDir + baseName
            if readLabel:
                label = row[6]
            fileIds.append(fId)
            fileNames.append(fName)
            if readLabel:
                labels.append(label)

            disc = row[12]
            tokens, clean_tokens, ids = sentence_to_token_ids(disc, word2id)
            # sentence length is restricted to 100
            paddedIdsList = padded(ids, 100)
            sentences.append(paddedIdsList)

            sqft = float(row[9])
            elemSchool = float(row[18])
            midSchool = float(row[19])
            highSchool = float(row[20])
            walkScore = float(row[21])
            transitScore = float(row[22])
            bikeScore = float(row[23])
            tmpVec = [
                sqft, elemSchool, midSchool, highSchool, walkScore,
                transitScore, bikeScore
            ]
            features.append(tmpVec)
        print('Got %d picture ids' % (len(fileIds)))
        print('Got %d picture filenames' % (len(fileNames)))
        print('Got %d sentences' % (len(sentences)))
        print('Got %d features' % (len(features)))
        #print(features[0])
        #print(features[1])
        norm_features = preprocessing.normalize(features, axis=0)
        #print(norm_features[0])
        #print(norm_features[1])

        return fileIds, fileNames, sentences, norm_features, labels
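`padded` (like `utils.pad_sequence` in Examples #2 and #3) is not shown either; the comment "sentence length is restricted to 100" suggests it truncates or right-pads the id list to a fixed length. A minimal sketch, with `PAD_ID` as a hypothetical padding id:

PAD_ID = 0  # hypothetical id of the padding token

def padded(ids, max_len):
    """Truncate or right-pad a list of token ids to exactly max_len entries."""
    ids = ids[:max_len]
    return ids + [PAD_ID] * (max_len - len(ids))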
Example #7
    def align(self, output=None, align_encoder_id=0, **kwargs):
        # if self.binary and any(self.binary):
        #     raise NotImplementedError

        if len(self.filenames.test) != len(self.extensions):
            raise Exception('wrong number of input files')

        binary = self.binary and any(self.binary)

        paths = self.filenames.test or [None]
        lines = utils.read_lines(paths, binary=self.binary)

        for line_id, lines in enumerate(lines):
            token_ids = [
                sentence if vocab is None else utils.sentence_to_token_ids(
                    sentence,
                    vocab.vocab,
                    character_level=self.character_level.get(ext)) for ext,
                vocab, sentence in zip(self.extensions, self.vocabs, lines)
            ]

            _, weights = self.seq2seq_model.step(data=[token_ids],
                                                 align=True,
                                                 update_model=False)

            trg_vocab = self.trg_vocab[0]
            trg_token_ids = token_ids[len(self.src_ext)]
            trg_tokens = [
                trg_vocab.reverse[i]
                if i < len(trg_vocab.reverse) else utils._UNK
                for i in trg_token_ids
            ]

            weights = weights.squeeze()
            max_len = weights.shape[1]

            if binary:
                src_tokens = None
            else:
                src_tokens = lines[align_encoder_id].split()[:max_len -
                                                             1] + [utils._EOS]
            trg_tokens = trg_tokens[:weights.shape[0] - 1] + [utils._EOS]

            output_file = '{}.{}.svg'.format(output, line_id +
                                             1) if output is not None else None

            utils.heatmap(src_tokens,
                          trg_tokens,
                          weights,
                          output_file=output_file)
Example #8
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = utils.initialize_vocabulary(vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            sentence_tokens = utils.basic_tokenizer(
                tf.compat.as_bytes(sentence))
            token_ids = utils.sentence_to_token_ids(sentence_tokens, vocab)
            # Which bucket does it belong to?
            bucket_id = min([
                b for b in xrange(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            # If there is an EOS symbol in outputs, cut them at that point.
            if utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(utils.EOS_ID)]
            # Print out French sentence corresponding to outputs.
            print(" ".join(
                [tf.compat.as_str(rev_vocab[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #9
def queryBaselineWithConecptDesc(pre_emb_for_all_vocab, vocab, rev_vocab):
    with tf.gfile.GFile("data/definitions/concept_descriptions.tok",
                        mode="r") as data_file:
        with tf.gfile.GFile("data/output/concept_Baseline.txt",
                            mode="w") as output_file:
            for line in data_file:
                top = 100
                token_ids = utils.sentence_to_token_ids(line, vocab)
                base_rep_mean = np.asarray(
                    [np.mean(pre_emb_for_all_vocab[token_ids[1:]], axis=0)])
                print("Top %s baseline candidates from W2V mean/add model:" %
                      top)
                for ii, cand in enumerate(
                        get_Candidates_Answers(base_rep_mean,
                                               pre_emb_for_all_vocab, top,
                                               rev_vocab)):
                    output_file.write(cand + " ")
                    print(cand + " ")
                output_file.write("\n")
                output_file.flush()
                print("\n")
Example #10
def decode():
    with tf.Session() as sess:
        model = create_model(sess, True)
        model.batch_size = 1

        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.from" % FLAGS.form_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.to" % FLAGS.to_vocab_size)

        en_vocab, _     = utils.init_vocab(en_vocab_path)
        _, rev_fr_vocab = utils.init_vocab(fr_vocab_path)

        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()

        while sentence:
            token_ids = utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence)

            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

            if utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(utils.EOS_ID)]

            print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #11
def main(_):
    assert FLAGS.checkpoint_dir, "--checkpoint_dir is required."
    assert FLAGS.extract_dir, "--extract_dir is required."
    assert FLAGS.source_vocab_path, "--source_vocab_path is required."
    assert FLAGS.target_vocab_path, "--target_vocab_path is required."
    assert FLAGS.source_output_path, "--source_output_path is required."
    assert FLAGS.target_output_path, "--target_output_path is required."
    assert FLAGS.score_output_path, "--score_output_path is required."
    assert FLAGS.source_language, "--source_language is required."
    assert FLAGS.target_language, "--target_language is required."

    # Read vocabularies.
    source_vocab, _ = utils.initialize_vocabulary(FLAGS.source_vocab_path)
    target_vocab, _ = utils.initialize_vocabulary(FLAGS.target_vocab_path)

    source_vocab_words = read_vocabulary(FLAGS.source_vocab_path)
    target_vocab_words = read_vocabulary(FLAGS.target_vocab_path)

    utils.reset_graph()
    with tf.Session() as sess:
        # Restore saved model.
        utils.restore_model(sess, FLAGS.checkpoint_dir)

        # Recover placeholders and ops for extraction.
        x_source = sess.graph.get_tensor_by_name("x_source:0")
        source_seq_length = sess.graph.get_tensor_by_name(
            "source_seq_length:0")

        x_target = sess.graph.get_tensor_by_name("x_target:0")
        target_seq_length = sess.graph.get_tensor_by_name(
            "target_seq_length:0")

        labels = sess.graph.get_tensor_by_name("labels:0")

        placeholders = [
            x_source, source_seq_length, x_target, target_seq_length, labels
        ]

        probs = sess.graph.get_tensor_by_name("feed_forward/output/probs:0")

        with open(FLAGS.source_output_path, mode="w", encoding="utf-8") as source_output_file, \
                open(FLAGS.target_output_path, mode="w", encoding="utf-8") as target_output_file, \
                open(FLAGS.score_output_path, mode="w", encoding="utf-8") as score_output_file:

            # Collect source and target article paths from the extraction directory.
            source_paths = []
            target_paths = []
            for file in os.listdir(FLAGS.extract_dir):
                if file.endswith(FLAGS.source_language):
                    source_paths.append(os.path.join(FLAGS.extract_dir, file))
                elif file.endswith(FLAGS.target_language):
                    target_paths.append(os.path.join(FLAGS.extract_dir, file))
            source_paths.sort()
            target_paths.sort()

            # Score every source/target article combination.
            for source_path, target_path in itertools.product(
                    source_paths, target_paths):
                # Read sentences from articles.
                source_sentences, target_sentences = read_articles(
                    source_path, target_path)

                # Convert sentences to token ids sequences.
                source_sentences_ids = [
                    utils.sentence_to_token_ids(sent, source_vocab,
                                                FLAGS.max_seq_length)
                    for sent in source_sentences
                ]
                target_sentences_ids = [
                    utils.sentence_to_token_ids(sent, target_vocab,
                                                FLAGS.max_seq_length)
                    for sent in target_sentences
                ]

                # Extract sentence pairs.
                pairs = extract_pairs(sess, source_sentences, target_sentences,
                                      source_sentences_ids,
                                      target_sentences_ids, probs,
                                      placeholders)
                if not pairs:
                    continue
                for source_sentence, target_sentence, score in pairs:
                    source_output_file.write(source_sentence)
                    target_output_file.write(target_sentence)
                    score_output_file.write(str(score) + "\n")
Example #12
def main(_):
    assert FLAGS.checkpoint_dir, "--checkpoint_dir is required."
    assert FLAGS.extract_dir, "--extract_dir is required."
    assert FLAGS.source_vocab_path, "--source_vocab_path is required."
    assert FLAGS.target_vocab_path, "--target_vocab_path is required."
    assert FLAGS.source_output_path, "--source_output_path is required."
    assert FLAGS.target_output_path, "--target_output_path is required."
    assert FLAGS.score_output_path, "--score_output_path is required."
    assert FLAGS.source_language, "--source_language is required."
    assert FLAGS.target_language, "--target_language is required."

    # Read vocabularies.
    source_vocab, _ = utils.initialize_vocabulary(FLAGS.source_vocab_path)
    target_vocab, _ = utils.initialize_vocabulary(FLAGS.target_vocab_path)

    # Read source and target paths for sentence extraction.
    source_paths = []
    target_paths = []
    for file in os.listdir(FLAGS.extract_dir):
        if file.endswith(FLAGS.source_language):
            source_paths.append(os.path.join(FLAGS.extract_dir, file))
        elif file.endswith(FLAGS.target_language):
            target_paths.append(os.path.join(FLAGS.extract_dir, file))
    source_paths.sort()
    target_paths.sort()

    utils.reset_graph()
    with tf.Session() as sess:
        # Restore saved model.
        utils.restore_model(sess, FLAGS.checkpoint_dir)

        # Recover placeholders and ops for extraction.
        x_source = sess.graph.get_tensor_by_name("x_source:0")
        source_seq_length = sess.graph.get_tensor_by_name("source_seq_length:0")

        x_target = sess.graph.get_tensor_by_name("x_target:0")
        target_seq_length = sess.graph.get_tensor_by_name("target_seq_length:0")

        labels = sess.graph.get_tensor_by_name("labels:0")

        placeholders = [x_source, source_seq_length, x_target, target_seq_length, labels]

        probs = sess.graph.get_tensor_by_name("feed_forward/output/probs:0")

        source_final_state_ph = sess.graph.get_tensor_by_name("birnn/source_final_state_ph:0")

        with open(FLAGS.source_output_path, mode="w", encoding="utf-8") as source_output_file,\
             open(FLAGS.target_output_path, mode="w", encoding="utf-8") as target_output_file,\
             open(FLAGS.score_output_path, mode="w", encoding="utf-8") as score_output_file:

            for source_path, target_path in zip(source_paths, target_paths):
                # Read sentences from articles.
                source_sentences, target_sentences = read_articles(source_path, target_path)

                # Convert sentences to token ids sequences.
                source_sentences_ids = [utils.sentence_to_token_ids(sent, source_vocab, FLAGS.max_seq_length)
                                        for sent in source_sentences]
                target_sentences_ids = [utils.sentence_to_token_ids(sent, target_vocab, FLAGS.max_seq_length)
                                        for sent in target_sentences]

                # Extract sentence pairs.
                pairs = extract_pairs(sess, source_sentences, target_sentences,
                                      source_sentences_ids, target_sentences_ids,
                                      probs, placeholders, source_final_state_ph)
                if not pairs:
                    continue
                for source_sentence, target_sentence, score in pairs:
                    source_output_file.write(source_sentence)
                    target_output_file.write(target_sentence)
                    score_output_file.write(str(score) + "\n")