import sys

import numpy as np
import scipy.spatial.distance as dist

import data_utils


def query_model(sess, input_node, predictions, vocab, rev_vocab, max_seq_len,
                dropout_keep_prob, saver=None, embs=None, out_form="cosine"):
    while True:
        sys.stdout.write("Type a definition: ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        sys.stdout.write("Number of candidates: ")
        sys.stdout.flush()
        top = int(sys.stdin.readline())
        # Get token-ids for the input gloss.
        token_ids = data_utils.sentence_to_token_ids(sentence, vocab)
        # Pad out (or truncate) the input gloss ids.
        padded_ids = np.asarray(data_utils.pad_sequence(token_ids, max_seq_len))
        input_data = np.asarray([padded_ids])
        # Single vector encoding the input gloss.
        model_preds = sess.run(predictions, feed_dict={
            input_node: input_data,
            dropout_keep_prob: 1.0,
        })
        # Softmax already provides scores over the vocab.
        if out_form == "softmax":
            # Exclude padding and _UNK tokens from the top-k calculation.
            candidate_ids = np.squeeze(model_preds)[2:].argsort()[-top:][::-1] + 2
            # Replace top-k ids with corresponding words.
            candidates = [rev_vocab[idx] for idx in candidate_ids]
        # Cosine requires sim to be calculated for each vocab word.
        else:
            sims = 1 - np.squeeze(dist.cdist(model_preds, embs, metric="cosine"))
            # Replace NaNs with 0s.
            sims = np.nan_to_num(sims)
            candidate_ids = sims.argsort()[::-1][:top]
            candidates = [rev_vocab[idx] for idx in candidate_ids]
        # Get baseline candidates from the raw embedding space.
        base_rep = np.asarray([np.mean(embs[token_ids], axis=0)])
        sims_base = 1 - np.squeeze(dist.cdist(base_rep, embs, metric="cosine"))
        sims_base = np.nan_to_num(sims_base)
        candidate_ids_base = sims_base.argsort()[::-1][:top]
        candidates_base = [rev_vocab[idx] for idx in candidate_ids_base]
        print("Top %s baseline candidates:" % top)
        for ii, cand in enumerate(candidates_base):
            print("%s: %s" % (ii + 1, cand))
        print("\n Top %s candidates from the model:" % top)
        for ii, cand in enumerate(candidates):
            print("%s: %s" % (ii + 1, cand))
        old_model_preds = model_preds
        sys.stdout.flush()
        sentence = sys.stdin.readline()
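# The cosine branch above ranks every vocabulary word against the model's
# single gloss vector. A minimal standalone sketch of that ranking step is
# below; the helper name `rank_by_cosine` is illustrative and not part of the
# project code.
def rank_by_cosine(query_vec, embs, top):
    # cdist expects 2-D inputs, so wrap the query vector in a batch of size 1.
    sims = 1 - np.squeeze(dist.cdist(np.atleast_2d(query_vec), embs, metric="cosine"))
    # Zero-norm embedding rows produce NaN similarities; treat them as 0.
    sims = np.nan_to_num(sims)
    # Highest similarity first.
    return sims.argsort()[::-1][:top]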
def get_feed_dict(self, sentences, labels=None, learning_rate=None, dropout=None):
    # Assumes a project-level pad_sequence helper (see the sketch below).
    # Pad the batch to a common length and keep the true lengths for masking.
    sentences_ids, sequence_lengths = pad_sequence(sentences, 0)
    feed = {
        self.sentence_ids: sentences_ids,
        self.sequence_lengths: sequence_lengths,
    }
    if labels is not None:
        labels, _ = pad_sequence(labels, 0)
        feed[self.labels] = labels
    if dropout is not None:
        feed[self.dropout] = dropout
    if learning_rate is not None:
        feed[self.learning_rate] = learning_rate
    return feed, sequence_lengths
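# The batch-level pad_sequence helper used by get_feed_dict is not shown here.
# Judging by how it is called (a list of id lists plus a pad token, returning
# both the padded batch and the original lengths), it likely looks roughly like
# this sketch. Name and behaviour are assumptions; note it differs from the
# single-sequence pad_sequence used in the bucketing code further down.
def pad_sequence(sequences, pad_tok):
    max_length = max(len(seq) for seq in sequences)
    padded, lengths = [], []
    for seq in sequences:
        seq = list(seq)
        padded.append(seq + [pad_tok] * (max_length - len(seq)))
        lengths.append(len(seq))
    return padded, lengths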
def pad_and_bucket_dataset(in_dataset, in_vocabulary, in_config):
    # Assumes module-level: import numpy as np, plus the project helpers
    # collect_bucket_stats, find_bucket and pad_sequence.
    BUCKETS = in_config['buckets']
    bucket_stats = collect_bucket_stats(in_dataset, BUCKETS)
    # Pre-allocate one fixed-size id matrix per bucket for encoder and decoder.
    bucketed_encoder_inputs = [
        np.zeros((bucket_stats[bucket_id], input_length), dtype=np.uint32)
        for bucket_id, (input_length, output_length) in enumerate(BUCKETS)
    ]
    bucketed_decoder_inputs = [
        np.zeros((bucket_stats[bucket_id], output_length), dtype=np.uint32)
        for bucket_id, (input_length, output_length) in enumerate(BUCKETS)
    ]
    bucket_cursors = [0 for _ in BUCKETS]
    for row in in_dataset.itertuples():
        encoder_input_ids, decoder_input_ids = row[1:]
        bucket_id = find_bucket(
            len(encoder_input_ids), len(decoder_input_ids), BUCKETS
        )
        # Skip rows that do not fit into any bucket.
        if bucket_id is None:
            continue
        bucket_cursor = bucket_cursors[bucket_id]
        input_length, output_length = BUCKETS[bucket_id]
        # Encoder inputs are padded on the left, decoder inputs on the right.
        padded_encoder_input = pad_sequence(
            encoder_input_ids, input_length, padding='pre'
        )
        padded_decoder_input = pad_sequence(decoder_input_ids, output_length)
        bucketed_encoder_inputs[bucket_id][bucket_cursor] = padded_encoder_input
        bucketed_decoder_inputs[bucket_id][bucket_cursor] = padded_decoder_input
        bucket_cursors[bucket_id] += 1
    return {
        'encoder': [
            np.asarray(input_bucket) for input_bucket in bucketed_encoder_inputs
        ],
        'decoder': [
            np.asarray(input_bucket) for input_bucket in bucketed_decoder_inputs
        ],
    }
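# find_bucket and collect_bucket_stats are project helpers that are not shown
# here. Based on how pad_and_bucket_dataset calls them, plausible sketches are
# below: pick the first bucket large enough for both sequences (None if none
# fits), and count how many rows land in each bucket. Names and details are
# assumptions, not taken from the project.
def find_bucket(input_length, output_length, buckets):
    for bucket_id, (max_input, max_output) in enumerate(buckets):
        if input_length <= max_input and output_length <= max_output:
            return bucket_id
    return None


def collect_bucket_stats(in_dataset, buckets):
    counts = [0 for _ in buckets]
    for row in in_dataset.itertuples():
        encoder_ids, decoder_ids = row[1:]
        bucket_id = find_bucket(len(encoder_ids), len(decoder_ids), buckets)
        if bucket_id is not None:
            counts[bucket_id] += 1
    return counts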
def live_decode(in_vocabulary, in_embeddings, in_config):
    # Assumes module-level: import logging, import numpy as np,
    # from sys import stdin, plus the project helpers create_model, Mode,
    # find_bucket, pad_sequence, truncate_decoded_sequence and the constants
    # BUCKET_ID and GO_ID.
    logging.info('Loading the trained model')
    model = create_model(in_vocabulary,
                         in_vocabulary,
                         in_embeddings,
                         in_config['buckets'][BUCKET_ID][0],
                         in_config['buckets'][BUCKET_ID][1],
                         in_config,
                         mode=Mode.TEST)
    MODEL_FILE = in_config['model_weights']
    model.load_weights(MODEL_FILE)
    vocabulary_map = {
        token: index for index, token in enumerate(in_vocabulary)
    }
    print('go')
    while True:
        user_input = stdin.readline().lower().strip()
        if user_input == 'q':
            break
        # Map the input tokens to ids and append the GO symbol.
        token_ids = [vocabulary_map[token] for token in user_input.split()] + [GO_ID]
        BUCKETS = in_config['buckets']
        bucket_id = find_bucket(len(token_ids), 0, BUCKETS)
        # Left-pad to the bucket's input length and add a batch dimension.
        decoder_inputs = pad_sequence(token_ids, BUCKETS[bucket_id][0], padding='pre')
        decoder_input_matrix = np.asarray(decoder_inputs)
        decoder_input_matrix = decoder_input_matrix.reshape(
            [1] + list(decoder_input_matrix.shape))
        decoder_outputs = model.predict(decoder_input_matrix)
        # Greedy decoding: take the argmax token at every time step.
        argmaxes = np.argmax(decoder_outputs[0], axis=1)
        decoded_ids = truncate_decoded_sequence(argmaxes)
        print(' '.join(
            [in_vocabulary[decoded_token] for decoded_token in decoded_ids]))
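# truncate_decoded_sequence is another project helper that is not shown. Given
# how it is used above (trimming the per-step argmax ids before printing), it
# most likely cuts the sequence at the first stop token. The default stop id
# of 0 (commonly the PAD id) is an assumption, not taken from the project.
def truncate_decoded_sequence(decoded_ids, stop_ids=(0,)):
    truncated = []
    for token_id in decoded_ids:
        if token_id in stop_ids:
            break
        truncated.append(int(token_id))
    return truncated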