def query_model(sess,
                input_node,
                predictions,
                vocab,
                rev_vocab,
                max_seq_len,
                dropout_keep_prob,
                saver=None,
                embs=None,
                out_form="cosine"):
    while True:
        sys.stdout.write("Type a definition: ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        sys.stdout.write("Number of candidates: ")
        sys.stdout.flush()
        top = int(sys.stdin.readline())
        # Get token-ids for the input gloss.
        token_ids = data_utils.sentence_to_token_ids(sentence, vocab)
        # Pad out (or truncate) the input gloss ids.
        padded_ids = np.asarray(data_utils.pad_sequence(
            token_ids, max_seq_len))
        input_data = np.asarray([padded_ids])
        # Single vector encoding the input gloss.
        model_preds = sess.run(predictions,
                               feed_dict={
                                   input_node: input_data,
                                   dropout_keep_prob: 1.0
                               })
        # Softmax already provides scores over the vocab.
        if out_form == "softmax":
            # Exclude padding and _UNK tokens from the top-k calculation.
            candidate_ids = np.squeeze(
                model_preds)[2:].argsort()[-top:][::-1] + 2
            # Replace top-k ids with corresponding words.
            candidates = [rev_vocab[idx] for idx in candidate_ids]
        else:
            # Cosine requires a similarity to be computed for each vocab word.
            sims = 1 - np.squeeze(
                dist.cdist(model_preds, embs, metric="cosine"))
            # Replace NaNs with 0s.
            sims = np.nan_to_num(sims)
            candidate_ids = sims.argsort()[::-1][:top]
            candidates = [rev_vocab[idx] for idx in candidate_ids]
        # Get baseline candidates from the raw embedding space: the mean of
        # the embeddings of the words in the input gloss.
        base_rep = np.asarray([np.mean(embs[token_ids], axis=0)])
        sims_base = 1 - np.squeeze(dist.cdist(base_rep, embs, metric="cosine"))
        sims_base = np.nan_to_num(sims_base)
        candidate_ids_base = sims_base.argsort()[::-1][:top]
        candidates_base = [rev_vocab[idx] for idx in candidate_ids_base]
        print("Top %s baseline candidates:" % top)
        for ii, cand in enumerate(candidates_base):
            print("%s: %s" % (ii + 1, cand))
        print("\n Top %s candidates from the model:" % top)
        for ii, cand in enumerate(candidates):
            print("%s: %s" % (ii + 1, cand))
        # Keep the previous predictions around, then read (and discard) one
        # more line of input before looping back to the next prompt.
        old_model_preds = model_preds
        sys.stdout.flush()
        sentence = sys.stdin.readline()
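For reference, a minimal standalone sketch of the cosine ranking step used above; model_preds, embs, rev_vocab, and top mirror the names in the snippet, but the arrays here are made-up toy data:

import numpy as np
from scipy.spatial import distance as dist

model_preds = np.random.rand(1, 50)    # encoded gloss, shape (1, emb_dim)
embs = np.random.rand(1000, 50)        # vocab embeddings, shape (vocab_size, emb_dim)
rev_vocab = ["word_%d" % i for i in range(1000)]
top = 5

# Cosine similarity = 1 - cosine distance, computed against every vocab row.
sims = 1 - np.squeeze(dist.cdist(model_preds, embs, metric="cosine"))
sims = np.nan_to_num(sims)             # guard against zero-norm rows
candidate_ids = sims.argsort()[::-1][:top]
print([rev_vocab[idx] for idx in candidate_ids])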
Example #2
def get_feed_dict(self, sentences, labels=None, learning_rate=None, dropout=None):
    # Pad the batch of sentences and keep the original lengths.
    sentences_ids, sequence_lengths = pad_sequence(sentences, 0)
    feed = {
        self.sentence_ids: sentences_ids,
        self.sequence_lengths: sequence_lengths
    }
    if labels is not None:
        labels, _ = pad_sequence(labels, 0)
        feed[self.labels] = labels

    if dropout is not None:
        feed[self.dropout] = dropout

    if learning_rate is not None:
        feed[self.learning_rate] = learning_rate

    return feed, sequence_lengths
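This snippet assumes a pad_sequence helper that pads a whole batch to the longest sequence and also returns the original lengths. A minimal sketch of such a helper, under that assumption (the project's real implementation may differ):

def pad_sequence(sequences, pad_token):
    # Length of each sequence before padding.
    sequence_lengths = [len(seq) for seq in sequences]
    max_length = max(sequence_lengths) if sequence_lengths else 0
    # Right-pad every sequence with pad_token up to max_length.
    padded = [list(seq) + [pad_token] * (max_length - len(seq))
              for seq in sequences]
    return padded, sequence_lengths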
def pad_and_bucket_dataset(in_dataset, in_vocabulary, in_config):
    BUCKETS = in_config['buckets']
    bucket_stats = collect_bucket_stats(in_dataset, BUCKETS)
    bucketed_encoder_inputs = [
        np.zeros((bucket_stats[bucket_id], input_length), dtype=np.uint32)
        for bucket_id, (input_length, output_length) in enumerate(BUCKETS)
    ]
    bucketed_decoder_inputs = [
        np.zeros((bucket_stats[bucket_id], output_length), dtype=np.uint32)
        for bucket_id, (input_length, output_length) in enumerate(BUCKETS)
    ]
    bucket_cursors = [0 for _ in BUCKETS]
    for row in in_dataset.itertuples():
        encoder_input_ids, decoder_input_ids = row[1:]
        bucket_id = find_bucket(
            len(encoder_input_ids),
            len(decoder_input_ids),
            BUCKETS
        )
        if bucket_id is None:
            continue
        bucket_cursor = bucket_cursors[bucket_id]
        input_length, output_length = BUCKETS[bucket_id]
        padded_encoder_input = pad_sequence(
            encoder_input_ids,
            input_length,
            padding='pre'
        )
        padded_decoder_input = pad_sequence(decoder_input_ids, output_length)

        bucketed_encoder_inputs[bucket_id][bucket_cursor] = padded_encoder_input
        bucketed_decoder_inputs[bucket_id][bucket_cursor] = padded_decoder_input
        bucket_cursors[bucket_id] += 1

    return {
        'encoder': [
            np.asarray(input_bucket)
            for input_bucket in bucketed_encoder_inputs
        ],
        'decoder': [
            np.asarray(input_bucket)
            for input_bucket in bucketed_decoder_inputs
        ],
    }
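pad_and_bucket_dataset (and live_decode below) rely on a find_bucket helper that picks the first bucket both sequences fit into and returns None when no bucket is large enough; a plausible sketch under that assumption:

def find_bucket(encoder_length, decoder_length, buckets):
    # Buckets are (input_length, output_length) pairs, assumed ordered by size.
    for bucket_id, (input_length, output_length) in enumerate(buckets):
        if encoder_length <= input_length and decoder_length <= output_length:
            return bucket_id
    # No bucket is large enough for this pair of sequences.
    return None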
def live_decode(in_vocabulary, in_embeddings, in_config):
    logging.info('Loading the trained model')

    model = create_model(in_vocabulary,
                         in_vocabulary,
                         in_embeddings,
                         in_config['buckets'][BUCKET_ID][0],
                         in_config['buckets'][BUCKET_ID][1],
                         in_config,
                         mode=Mode.TEST)
    MODEL_FILE = in_config['model_weights']
    model.load_weights(MODEL_FILE)

    vocabulary_map = {
        token: index
        for index, token in enumerate(in_vocabulary)
    }
    print('go')
    while True:
        user_input = stdin.readline().lower().strip()
        if user_input == 'q':
            break
        token_ids = [vocabulary_map[token]
                     for token in user_input.split()] + [GO_ID]
        BUCKETS = in_config['buckets']
        bucket_id = find_bucket(len(token_ids), 0, BUCKETS)
        decoder_inputs = pad_sequence(token_ids,
                                      BUCKETS[bucket_id][0],
                                      padding='pre')
        decoder_input_matrix = np.asarray(decoder_inputs)
        decoder_input_matrix = decoder_input_matrix.reshape(
            [1] + list(decoder_input_matrix.shape))
        decoder_outputs = model.predict(decoder_input_matrix)
        argmaxes = np.argmax(decoder_outputs[0], axis=1)
        decoded_ids = truncate_decoded_sequence(argmaxes)
        print(' '.join(
            [in_vocabulary[decoded_token] for decoded_token in decoded_ids]))
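truncate_decoded_sequence presumably cuts the argmax output at the first end-of-sequence (or padding) id; a minimal sketch under that assumption, where EOS_ID is a hypothetical constant (the project defines its own special ids):

EOS_ID = 2  # hypothetical; use the project's actual end-of-sequence id

def truncate_decoded_sequence(decoded_ids):
    # Keep ids up to, but not including, the first EOS_ID.
    decoded_ids = list(decoded_ids)
    if EOS_ID in decoded_ids:
        decoded_ids = decoded_ids[:decoded_ids.index(EOS_ID)]
    return decoded_ids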