def get_parse_data(sent_file, parse_file, en_vocab, parse_vocab, bsize, mode,
                   unk='<unk>', eos='<eos>'):
    """Parse data batcher. Returns the sentence plus its linearized parse tree."""
    sent_dataset = tf.data.TextLineDataset(sent_file)
    parse_dataset = tf.data.TextLineDataset(parse_file)
    # Default value for unknown tokens.
    en_vocab_table = lookup_ops.index_table_from_file(en_vocab, default_value=1)
    parse_vocab_table = lookup_ops.index_table_from_file(parse_vocab, default_value=1)
    # Append EOS and add a column with the number of words.
    sent_dataset = sent_dataset.map(
        lambda sentence: tf.string_split([sentence]).values,
        num_parallel_calls=num_threads)
    sent_dataset = sent_dataset.map(
        lambda words: tf.concat([words, [eos]], axis=0),
        num_parallel_calls=num_threads)
    sent_dataset = sent_dataset.map(lambda words: en_vocab_table.lookup(words),
                                    num_parallel_calls=num_threads)
    sent_dataset = sent_dataset.map(lambda words: (words, tf.size(words)),
                                    num_parallel_calls=num_threads)
    # Make shifted (input, output) parse pairs and add a word-count column.
    parse_dataset = parse_dataset.map(
        lambda sentence: tf.string_split([sentence]).values,
        num_parallel_calls=num_threads)
    parse_dataset_start = parse_dataset.map(lambda words: words[:-1])
    parse_dataset = parse_dataset.map(
        lambda words: parse_vocab_table.lookup(words[1:]),
        num_parallel_calls=num_threads)
    parse_dataset_start = parse_dataset_start.map(
        lambda words: parse_vocab_table.lookup(words),
        num_parallel_calls=num_threads)
    parse_dataset = parse_dataset.map(lambda words: (words, tf.size(words)),
                                      num_parallel_calls=num_threads)
    parse_dataset_start = parse_dataset_start.map(
        lambda words: (words, tf.size(words)),
        num_parallel_calls=num_threads)
    # Zip the datasets so sentences stay aligned with their parses line by line.
    sen_parse_dataset = tf.data.Dataset.zip(
        (sent_dataset, parse_dataset_start, parse_dataset))
    if mode == 'train':
        sen_parse_dataset = sen_parse_dataset.shuffle(buffer_size=1000, seed=42)
    sen_parse_dataset = sen_parse_dataset.padded_batch(
        batch_size=bsize,
        padded_shapes=(([None], []), ([None], []), ([None], [])))
    sen_parse_dataset = sen_parse_dataset.prefetch(1)
    return sen_parse_dataset
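# Hedged usage sketch for get_parse_data (TF1-style). The file paths below are
# hypothetical, and a module-level `num_threads` is assumed to exist, since the
# function above relies on it. Each batch yields three (values, length) pairs.
def _demo_get_parse_data():
    dataset = get_parse_data('data/train.sents', 'data/train.parses',
                             'data/en.vocab', 'data/parse.vocab',
                             bsize=32, mode='train')
    iterator = dataset.make_initializable_iterator()
    batch = iterator.get_next()
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())  # initialize the vocab lookup tables
        sess.run(iterator.initializer)
        (sent, sent_len), (parse_in, parse_in_len), (parse_out, parse_out_len) = sess.run(batch)
        print(sent.shape, parse_in.shape, parse_out.shape)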
def create_train_model(hparams):
    src_file = hparams.src_train_file
    tgt_file = hparams.tgt_train_file
    src_vocab_file = hparams.src_vocab_file
    tgt_vocab_file = hparams.tgt_vocab_file
    graph = tf.Graph()
    with graph.as_default(), tf.container('train'):
        src_vocab_table = lookup_ops.index_table_from_file(
            src_vocab_file, default_value=UNK_ID)
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=UNK_ID)
        src_dataset = tf.data.TextLineDataset(src_file)
        tgt_dataset = tf.data.TextLineDataset(tgt_file)
        iterator = get_iterator(src_dataset, tgt_dataset, src_vocab_table,
                                tgt_vocab_table, hparams.batch_size, SOS, EOS,
                                src_max_len=hparams.src_max_len,
                                tgt_max_len=hparams.tgt_max_len)
        model = NMTModel(hparams, 'train', iterator, src_vocab_table,
                         tgt_vocab_table)
    return TrainModel(graph=graph, model=model, iterator=iterator)
def create_train_model(hparams, model_creator):
    txt_file = "%s.%s" % (hparams.train_prefix, "txt")
    lb_file = "%s.%s" % (hparams.train_prefix, "lb")
    vocab_file = hparams.vocab_file
    index_file = hparams.index_file
    graph = tf.Graph()
    with graph.as_default(), tf.container("train"):
        vocab_table = lookup_ops.index_table_from_file(
            vocab_file, default_value=UNK_ID)
        # For the labels.
        index_table = lookup_ops.index_table_from_file(
            index_file, default_value=0)
        txt_dataset = tf.data.TextLineDataset(txt_file)
        lb_dataset = tf.data.TextLineDataset(lb_file)
        iterator = data_iterator.get_iterator(
            txt_dataset,
            lb_dataset,
            vocab_table,
            index_table,
            batch_size=hparams.batch_size,
            num_buckets=hparams.num_buckets,
            max_len=hparams.max_len)
        model = model_creator(
            hparams,
            iterator=iterator,
            mode=tf.contrib.learn.ModeKeys.TRAIN,
            vocab_table=vocab_table)
    return TrainModel(graph=graph, model=model, iterator=iterator)
def mydatasetcreator(hparams):
    # Note: this routine iterates tf.data datasets in Python (to find the
    # longest line and to count batches), so it assumes eager execution.
    srcvocabpath = "%s.%s" % (hparams.vocab_prefix, hparams.src)
    sv = lookup_ops.index_table_from_file(srcvocabpath, default_value=UNK_ID)
    tgtvocabpath = "%s.%s" % (hparams.vocab_prefix, hparams.tgt)
    tv = lookup_ops.index_table_from_file(tgtvocabpath, default_value=UNK_ID)
    hparams.src_vocab_size = sv.size()
    hparams.tgt_vocab_size = tv.size()
    srcpath = "%s.%s" % (hparams.train_prefix, hparams.src)
    tgtpath = "%s.%s" % (hparams.train_prefix, hparams.tgt)

    srcdata = tf.data.TextLineDataset(srcpath)
    srcdata = srcdata.map(lambda x: tf.strings.split([x]).values)
    srcdata = srcdata.map(lambda x: tf.dtypes.cast(sv.lookup(x), tf.int32))
    max_length = max(tf.shape(v)[0] for v in srcdata)
    hparams.max_input_length = max_length
    srcdata = srcdata.padded_batch(hparams.batch, [max_length],
                                   drop_remainder=True)

    tgtdata = tf.data.TextLineDataset(tgtpath)
    tgtdata = tgtdata.map(lambda x: tf.strings.split([x]).values)
    tgtdata = tgtdata.map(lambda x: tf.concat([x, [EOS]], -1))
    tgtdata = tgtdata.map(lambda x: tf.dtypes.cast(tv.lookup(x), tf.int32))
    max_length = max(tf.shape(v)[0] for v in tgtdata)
    hparams.max_output_length = max_length
    tgtdata = tgtdata.padded_batch(hparams.batch, [max_length],
                                   drop_remainder=True)

    d = tf.data.Dataset.zip((srcdata, tgtdata))
    # Count batches; assumes hparams.train_data_size starts at 0.
    for x in d:
        hparams.train_data_size += 1
    return d
def create_infer_model(hparams):
    src_vocab_file = hparams.src_vocab_file
    tgt_vocab_file = hparams.tgt_vocab_file
    graph = tf.Graph()
    with graph.as_default(), tf.container('infer'):
        src_vocab_table = lookup_ops.index_table_from_file(
            src_vocab_file, default_value=UNK_ID)
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=UNK_ID)
        reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_file(
            tgt_vocab_file, default_value=UNK)
        src_placeholder = tf.placeholder(shape=[None], dtype=tf.string)
        batch_size_placeholder = tf.placeholder(shape=[], dtype=tf.int64)
        src_dataset = tf.data.Dataset.from_tensor_slices(src_placeholder)
        iterator = get_infer_iterator(src_dataset, src_vocab_table,
                                      batch_size_placeholder, EOS,
                                      src_max_len=hparams.src_max_len_infer)
        model = NMTModel(hparams, 'infer', iterator, src_vocab_table,
                         tgt_vocab_table, reverse_tgt_vocab_table)
    return InferModel(graph=graph,
                      model=model,
                      src_placeholder=src_placeholder,
                      batch_size_placeholder=batch_size_placeholder,
                      iterator=iterator)
def create_eval_model(hparams):
    src_vocab_file = hparams.src_vocab_file
    tgt_vocab_file = hparams.tgt_vocab_file
    graph = tf.Graph()
    with graph.as_default(), tf.container('eval'):
        src_vocab_table = lookup_ops.index_table_from_file(
            src_vocab_file, default_value=UNK_ID)
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=UNK_ID)
        src_file_placeholder = tf.placeholder(shape=[], dtype=tf.string)
        tgt_file_placeholder = tf.placeholder(shape=[], dtype=tf.string)
        src_dataset = tf.data.TextLineDataset(src_file_placeholder)
        tgt_dataset = tf.data.TextLineDataset(tgt_file_placeholder)
        iterator = get_iterator(src_dataset, tgt_dataset, src_vocab_table,
                                tgt_vocab_table, hparams.batch_size, SOS, EOS,
                                src_max_len=hparams.src_max_len,
                                tgt_max_len=hparams.tgt_max_len)
        model = NMTModel(hparams, 'eval', iterator, src_vocab_table,
                         tgt_vocab_table)
    return EvalModel(graph=graph,
                     model=model,
                     src_file_placeholder=src_file_placeholder,
                     tgt_file_placeholder=tgt_file_placeholder,
                     iterator=iterator)
def create_vocab_tables(src_vocab_file, tgt_vocab_file, unk_id):
    """Creates the source and target vocab lookup tables."""
    src_vocab_table = lookup_ops.index_table_from_file(src_vocab_file,
                                                       default_value=unk_id)
    tgt_vocab_table = lookup_ops.index_table_from_file(tgt_vocab_file,
                                                       default_value=unk_id)
    return src_vocab_table, tgt_vocab_table
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
    """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
    src_vocab_table = lookup_ops.index_table_from_file(src_vocab_file,
                                                       default_value=UNK_ID)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(tgt_vocab_file,
                                                           default_value=UNK_ID)
    return src_vocab_table, tgt_vocab_table
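# A minimal, self-contained illustration of what these vocab tables do
# (TF1-style session; the temp-file path and UNK_ID=0 are assumptions):
# each line of the vocab file gets the id equal to its line number, and any
# token not in the file maps to default_value.
def _demo_vocab_table():
    with open('/tmp/toy_vocab.txt', 'w') as f:
        f.write('<unk>\n<s>\n</s>\nhello\nworld\n')
    table = lookup_ops.index_table_from_file('/tmp/toy_vocab.txt', default_value=0)
    ids = table.lookup(tf.constant(['hello', 'world', 'never-seen']))
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        print(sess.run(ids))  # [3 4 0] -- the unseen token falls back to default_value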
def create_input_data(source_data_file, target_data_file, source_vocab_file,
                      target_vocab_file, batch_size, sos, eos,
                      source_max_length, target_max_length):
    source_dataset = tf.data.TextLineDataset(tf.gfile.Glob(source_data_file))
    target_dataset = tf.data.TextLineDataset(tf.gfile.Glob(target_data_file))
    source_vocab = lookup_ops.index_table_from_file(
        source_vocab_file, default_value=FLAGS.unk_id)
    target_vocab = lookup_ops.index_table_from_file(
        target_vocab_file, default_value=FLAGS.unk_id)
    output_buffer_size = batch_size * 1000
    source_eos_id = tf.cast(source_vocab.lookup(tf.constant(eos)), tf.int32)
    target_sos_id = tf.cast(target_vocab.lookup(tf.constant(sos)), tf.int32)
    target_eos_id = tf.cast(target_vocab.lookup(tf.constant(eos)), tf.int32)
    dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
    dataset = dataset.map(
        lambda src, tgt: (tf.string_split([src]).values,
                          tf.string_split([tgt]).values)).prefetch(output_buffer_size)
    dataset = dataset.filter(
        lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
    dataset = dataset.map(
        lambda src, tgt: (src[:source_max_length], tgt[:target_max_length]))
    dataset = dataset.prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt: (tf.cast(source_vocab.lookup(src), tf.int32),
                          tf.cast(target_vocab.lookup(tgt), tf.int32)))
    dataset = dataset.prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt: (src,
                          tf.concat(([target_sos_id], tgt), 0),
                          tf.concat((tgt, [target_eos_id]), 0))).prefetch(output_buffer_size)
    dataset = dataset.map(
        lambda src, tgt_in, tgt_out: (src, tgt_in, tgt_out,
                                      tf.size(src), tf.size(tgt_in))).prefetch(output_buffer_size)
    dataset = dataset.shuffle(100).repeat().padded_batch(
        batch_size,
        padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([]),
                       tf.TensorShape([])),
        padding_values=(source_eos_id, target_eos_id, target_eos_id, 0, 0))
    iterator = dataset.make_initializable_iterator()
    return iterator.get_next(), iterator.initializer, source_vocab, target_vocab
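# Hedged usage sketch for create_input_data: the data and vocab paths are
# hypothetical, and FLAGS.unk_id is assumed to be defined by the surrounding
# program. The first returned element is the fetchable batch tuple.
def _demo_create_input_data():
    next_batch, init_op, source_vocab, target_vocab = create_input_data(
        'data/train.src', 'data/train.tgt',
        'data/vocab.src', 'data/vocab.tgt',
        batch_size=64, sos='<s>', eos='</s>',
        source_max_length=50, target_max_length=50)
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(init_op)
        src, tgt_in, tgt_out, src_len, tgt_in_len = sess.run(next_batch)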
def create_vocab_tables(src_vocab_file, tgt_vocab_file, src_unknown_id,
                        tgt_unknown_id, share_vocab=False):
    src_vocab_table = lookup_ops.index_table_from_file(
        src_vocab_file, default_value=src_unknown_id)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=tgt_unknown_id)
    return src_vocab_table, tgt_vocab_table
def get_nli_data(nli_premise, nli_hypothesis, nli_classes, en_vocab,
                 class_vocab, bsize, mode, unk='<unk>', eos='<eos>'):
    """NLI batcher.

    Returns the premise and hypothesis sentences plus the class label,
    along with the sentence lengths.
    """
    nli_premise = tf.data.TextLineDataset(nli_premise)
    nli_hypothesis = tf.data.TextLineDataset(nli_hypothesis)
    nli_classes = tf.data.TextLineDataset(nli_classes)
    en_vocab_table = lookup_ops.index_table_from_file(en_vocab, default_value=1)
    class_vocab_table = lookup_ops.index_table_from_file(class_vocab, default_value=0)
    # Append EOS and make columns for the number of words.
    nli_premise = nli_premise.map(
        lambda sentence: tf.string_split([sentence]).values,
        num_parallel_calls=num_threads)
    nli_premise = nli_premise.map(
        lambda words: en_vocab_table.lookup(tf.concat([words, [eos]], axis=0)),
        num_parallel_calls=num_threads)
    nli_premise_wrds = nli_premise.map(lambda words: tf.size(words),
                                       num_parallel_calls=num_threads)
    nli_hypothesis = nli_hypothesis.map(
        lambda sentence: tf.string_split([sentence]).values,
        num_parallel_calls=num_threads)
    nli_hypothesis = nli_hypothesis.map(
        lambda words: en_vocab_table.lookup(tf.concat([words, [eos]], axis=0)),
        num_parallel_calls=num_threads)
    nli_hypothesis_wrds = nli_hypothesis.map(lambda words: tf.size(words),
                                             num_parallel_calls=num_threads)
    nli_classes = nli_classes.map(
        lambda sentence: class_vocab_table.lookup(
            tf.string_split([sentence]).values)[0],
        num_parallel_calls=num_threads)
    nli_dataset = tf.data.Dataset.zip(
        (nli_premise, nli_premise_wrds, nli_hypothesis, nli_hypothesis_wrds,
         nli_classes))
    if mode == 'train':
        nli_dataset = nli_dataset.shuffle(buffer_size=1000, seed=42)
    nli_dataset = nli_dataset.padded_batch(
        batch_size=bsize,
        padded_shapes=([None], [], [None], [], []))
    nli_dataset = nli_dataset.prefetch(1)
    return nli_dataset
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
    """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
    src_vocab_table = lookup_ops.index_table_from_file(
        src_vocab_file, default_value=UNK_ID)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=UNK_ID)
    return src_vocab_table, tgt_vocab_table
def create_vocab_tables(src1_vocab_file, src2_vocab_file, tgt_vocab_file):
    src1_vocab_table = lookup_ops.index_table_from_file(
        src1_vocab_file, default_value=data_utils.UNK_ID)
    src2_vocab_table = lookup_ops.index_table_from_file(
        src2_vocab_file, default_value=data_utils.UNK_ID)
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=data_utils.UNK_ID)
    return src1_vocab_table, src2_vocab_table, tgt_vocab_table
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
    """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
    # Returns a word-to-id table: a lookup table mapping a key_dtype Tensor
    # to an int64 index Tensor.
    src_vocab_table = lookup_ops.index_table_from_file(
        src_vocab_file, default_value=UNK_ID)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=UNK_ID)
    return src_vocab_table, tgt_vocab_table
def __init__(self, config_dict):
    super(Bert2Seq, self).__init__()
    self.cfg = Bert2SeqConfig(config_dict)
    # Dictionary initialization.
    self.bert_dict = lookup_ops.index_table_from_file(
        self.cfg.vocab_path + "/" + self.cfg.bert_vocab_file, default_value=0)
    self.decoder_dict = lookup_ops.index_table_from_file(
        self.cfg.vocab_path + "/" + self.cfg.decoder_vocab_file,
        default_value=self.cfg.dict_param.unk_id)
    self.reverse_decoder_dict = lookup_ops.index_to_string_table_from_file(
        self.cfg.vocab_path + "/" + self.cfg.decoder_vocab_file,
        default_value=self.cfg.dict_param.unk)
def create_vocab_tables(src_vocab_file, tgt_vocab_file, config):
    src_vocab_table = lookup_ops.index_table_from_file(
        src_vocab_file, default_value=UNK_ID)
    if config.share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=UNK_ID)
    reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_file(
        tgt_vocab_file, default_value=config.unk)
    return src_vocab_table, tgt_vocab_table, reverse_tgt_vocab_table
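# Small sketch of the reverse (id -> string) table in isolation, using a
# throwaway vocab file (the path is an assumption). index_to_string_table_from_file
# expects int64 ids and returns the default string for out-of-range ids.
def _demo_reverse_table():
    with open('/tmp/toy_vocab.txt', 'w') as f:
        f.write('<unk>\n<s>\n</s>\nhello\nworld\n')
    reverse_table = lookup_ops.index_to_string_table_from_file(
        '/tmp/toy_vocab.txt', default_value='<unk>')
    words = reverse_table.lookup(tf.constant([3, 4, 999], dtype=tf.int64))
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        print(sess.run(words))  # [b'hello' b'world' b'<unk>']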
def create_vocab_tables(self):
    """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
    src_vocab_file, tgt_vocab_file, share_vocab = (
        self.opt.source_vocab_file, self.opt.dest_vocab_file, False)
    src_vocab_table = lookup_ops.index_table_from_file(
        src_vocab_file, default_value=UNK_ID)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=UNK_ID)
    return src_vocab_table, tgt_vocab_table
def __init__(self, hparams, training=True):
    self.training = training
    self.hparams = hparams
    self.src_max_len = self.hparams.src_max_len
    self.tgt_max_len = self.hparams.tgt_max_len
    self.vocab_size, self.vocab_list = check_vocab(VOCAB_FILE)
    self.emotion_size, self.emotion_list = check_vocab(EMOTION_FILE)
    self.vocab_table = lookup_ops.index_table_from_file(
        VOCAB_FILE, default_value=self.hparams.unk_id)
    self.reverse_vocab_table = lookup_ops.index_to_string_table_from_file(
        VOCAB_FILE, default_value=self.hparams.unk_token)
    self.emotion_table = lookup_ops.index_table_from_file(
        EMOTION_FILE, default_value=self.hparams.unk_id)
    self.reverse_emotion_table = lookup_ops.index_to_string_table_from_file(
        EMOTION_FILE, default_value=self.hparams.unk_token)
    if self.training:
        print('--------------------------------------------------')
        for index, name in enumerate(RECORD_FILE_NAME_LIST):
            print('= {} - {}'.format(index, name))
        RECORD_INDEX = int(input("# Input record file index: "))
        print('--------------------------------------------------')
        batch_lists = self.get_file_batch_lists('{}_train.json'.format(
            RECORD_FILE_NAME_LIST[RECORD_INDEX]))
        emotion_num_dict = self.get_emotion_num(batch_lists)
        self.emotion_weight_dict = self.get_emotion_weight(emotion_num_dict)
        self.case_table = prepare_case_table()
        self.dev_dataset = self.load_record(
            os.path.join(RECORD_DIR, '{}_dev.tfrecords'.format(
                RECORD_FILE_NAME_LIST[RECORD_INDEX])))
        self.test_dataset = self.load_record(
            os.path.join(RECORD_DIR, '{}_test.tfrecords'.format(
                RECORD_FILE_NAME_LIST[RECORD_INDEX])))
        self.train_dataset = self.load_record(
            os.path.join(RECORD_DIR, '{}_train.tfrecords'.format(
                RECORD_FILE_NAME_LIST[RECORD_INDEX])))
    else:
        self.case_table = None
def test_index_table_from_file_with_vocab_size_too_large(self):
    vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
    with self.test_session():
        table = lookup_ops.index_table_from_file(
            vocabulary_file=vocabulary_file, vocab_size=4)
        self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                "Invalid vocab_size", table.init.run)
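# The tests in this group depend on a _createVocabFile helper that is not shown
# here. A plausible sketch (an assumption, not the exact upstream code): it
# writes one token per line, defaulting to "brain", "salad", "surgery", which
# is why "salad" maps to 1, "surgery" to 2, and out-of-vocabulary words land in
# the bucket ids that follow the vocabulary.
def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
    vocabulary_file = os.path.join(self.get_temp_dir(), basename)
    with open(vocabulary_file, "w") as f:
        f.write("\n".join(values) + "\n")
    return vocabulary_file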
def create_speaker_tables(speaker_table_file):
    """Creates the speaker lookup table for the question file."""
    # TODO: account for speakers only present in answers.
    speaker_table = lookup_ops.index_table_from_file(speaker_table_file,
                                                     default_value=UNK_ID)
    return speaker_table
def vocabulary_lookup(self):
    """Returns a lookup table mapping string to index."""
    return lookup.index_table_from_file(
        self.vocabulary_file,
        vocab_size=self.vocabulary_size - self.num_oov_buckets,
        num_oov_buckets=self.num_oov_buckets,
        default_value=constants.UNKNOWN_ID)
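# Minimal sketch of OOV bucketing (an illustration, not this class's code):
# in-vocabulary tokens keep their file-order ids, while unseen tokens are
# hashed into one of `num_oov_buckets` extra ids appended after the vocab.
def _demo_oov_buckets():
    with open('/tmp/toy_vocab.txt', 'w') as f:
        f.write('<unk>\n<s>\n</s>\nhello\nworld\n')
    table = lookup_ops.index_table_from_file(
        '/tmp/toy_vocab.txt', vocab_size=5, num_oov_buckets=10)
    ids = table.lookup(tf.constant(['hello', 'zzz-unseen']))
    # 'hello' -> 3 (from the file); 'zzz-unseen' -> 5 + (hash of the token) % 10
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        print(sess.run(ids))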
def create_train_model(hparams):
    train_file = hparams.train
    vocab_size, vocab_file = vocab_utils.check_vocab(hparams.vocab_file,
                                                     hparams.out_dir,
                                                     sos=hparams.sos,
                                                     eos=hparams.eos,
                                                     unk=vocab_utils.UNK)
    hparams.add_hparam("vocab_size", vocab_size)
    graph = tf.Graph()
    with graph.as_default(), tf.container("train"):
        vocab_table = lookup_ops.index_table_from_file(vocab_file,
                                                       default_value=0)
        iterator = iterator_utils.get_iterator(train_file,
                                               vocab_table,
                                               batch_size=hparams.batch_size,
                                               sos=hparams.sos,
                                               eos=hparams.eos,
                                               src_max_len=hparams.src_max_len)
        model = rnn_model.Model(hparams,
                                mode=tf.contrib.learn.ModeKeys.TRAIN,
                                iterator=iterator,
                                vocab_table=vocab_table)
    return graph, model, iterator
def __init__(self, config_dict):
    super(BertQK, self).__init__()
    self.cfg = BertQKConfig(config_dict)
    # Dictionary initialization.
    self.bert_dict = lookup_ops.index_table_from_file(
        self.cfg.vocab_path + "/" + self.cfg.bert_vocab_file, default_value=0)
def __init__(self, vocab_file_path, oov_buckets, num_lines_to_ignore=0,
             num_lines_to_use=None):
    super(TextEmbeddingModel, self).__init__()
    self._vocabulary, self._pretrained_vectors = load(vocab_file_path,
                                                      parse_line,
                                                      num_lines_to_ignore,
                                                      num_lines_to_use)
    self._oov_buckets = oov_buckets
    # Make the vocabulary file a `TrackableAsset` to ensure it is saved along
    # with the model.
    self._vocabulary_file = tracking.TrackableAsset(
        write_vocabulary_file(self._vocabulary))
    self._table = lookup_ops.index_table_from_file(
        vocabulary_file=self._vocabulary_file,
        num_oov_buckets=self._oov_buckets,
        hasher_spec=lookup_ops.FastHashSpec)
    oovs = np.zeros([oov_buckets, self._pretrained_vectors.shape[1]])
    self._pretrained_vectors.resize([
        self._pretrained_vectors.shape[0] + oov_buckets,
        self._pretrained_vectors.shape[1]
    ])
    self._pretrained_vectors[self._pretrained_vectors.shape[0] - oov_buckets:, :] = oovs
    self.embeddings = tf.Variable(self._pretrained_vectors)
    self.variables = [self.embeddings]
    self.trainable_variables = self.variables
def _create_vocab_tables(self, vocab_files, share_vocab=False):
    if vocab_files[1] is None and not share_vocab:
        raise ValueError('If share_vocab is False, a target vocab must be '
                         'provided: (src_vocab_file, target_vocab_file)')
    src_vocab_table = lookup_ops.index_table_from_file(
        vocab_files[0], default_value=UNK_ID)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(
            vocab_files[1], default_value=UNK_ID)
    return src_vocab_table, tgt_vocab_table
def compute_loss(self, logits, labels, nwords, params):
    """Computes the loss.

    Args:
        logits: A tensor, output of the dense layer
        labels: A tensor, the ground truth labels
        nwords: A tensor, lengths of the inputs
        params: A dict, storing hyper params

    Returns:
        A loss tensor, the negative log likelihood loss.
    """
    tags_str2idx = lookup_ops.index_table_from_file(params['tag_vocab'],
                                                    default_value=0)
    actual_ids = tags_str2idx.lookup(labels)
    # Get the transition matrix created earlier.
    with tf.variable_scope("crf", reuse=True):
        trans_val = tf.get_variable(
            "transition",
            shape=[params['num_tags'], params['num_tags']],
            dtype=tf.float32)
    log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
        inputs=logits,
        tag_indices=actual_ids,
        sequence_lengths=nwords,
        transition_params=trans_val)
    loss = tf.reduce_mean(-log_likelihood)
    return loss
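# Tiny self-contained sketch of the CRF loss that compute_loss wraps
# (tf.contrib.crf, TF1). Shapes here are assumptions: logits [batch, time, num_tags],
# tag ids [batch, time], lengths [batch]. With transition_params left as None,
# the op creates its own transition variable instead of reusing the "crf" scope above.
def _demo_crf_loss():
    logits = tf.random_normal([2, 5, 4])
    tags = tf.constant([[0, 1, 2, 1, 0], [3, 2, 1, 0, 0]], dtype=tf.int32)
    lengths = tf.constant([5, 3], dtype=tf.int32)
    log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
        inputs=logits, tag_indices=tags, sequence_lengths=lengths)
    return tf.reduce_mean(-log_likelihood)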
def __init__(self):
    TreeHeight = lambda x: int(math.log(x - 1) / math.log(2)) + 2
    indexCnt = count_idx(FLAGS.input_previous_model_path + "/" +
                         FLAGS.tree_index_file)
    self.tree_height = TreeHeight(indexCnt + 1)
    self.tree_index = lookup_ops.index_table_from_file(
        FLAGS.input_previous_model_path + "/" + FLAGS.tree_index_file,
        default_value=indexCnt)
    self.reverse_tree_index = lookup_ops.index_to_string_table_from_file(
        FLAGS.input_previous_model_path + "/" + FLAGS.tree_index_file,
        default_value='<unk>')
    self.dims = parse_dims(FLAGS.semantic_model_dims)
    self.layer_embedding = tf.get_variable(
        name='tree_node_emb',
        shape=[pow(2, self.tree_height - 1), self.dims[-1]])
    if not FLAGS.leaf_content_emb:
        self.leaf_embedding = tf.get_variable(
            name='leaf_node_emb',
            shape=[pow(2, self.tree_height - 1), self.dims[-1]])
    if FLAGS.use_mstf_ops == 1:
        self.op_dict = mstf.dssm_dict(FLAGS.xletter_dict)
    elif FLAGS.use_mstf_ops == -1:
        self.op_dict = XletterPreprocessor(FLAGS.xletter_dict,
                                           FLAGS.xletter_win_size)
    else:
        self.op_dict = None
def do_infer(hparams, args):
    if 'len_max_sentence' in hparams:
        len_max_sentence = hparams.len_max_sentence
    else:
        len_max_sentence = -1
    infer_graph = tf.Graph()
    rev_vocab_table = index_to_word_map(hparams.vocab_output)
    all_words = np.array(
        [rev_vocab_table[index] for index in range(hparams.size_vocab_output)])
    with infer_graph.as_default():
        vocab_table_input = lookup_ops.index_table_from_file(
            hparams.vocab_input, default_value=0)
        infer_iterator = create_infer_dataset_iterator(args.infer_sentences,
                                                       vocab_table_input,
                                                       args.infer_batch_size,
                                                       len_max_sentence)
        infer_model = RNNPredictor(hparams, infer_iterator, ModeKeys.INFER)
        infer_sess = tf.Session()
        infer_sess.run(tf.tables_initializer())
        latest_train_ckpt = tf.train.latest_checkpoint(args.model_dir)
        infer_model.saver.restore(infer_sess, latest_train_ckpt)
        fw = open(args.infer_out, 'w')
        start_time = time.time()
        all_probs, num_batches = infer_model.get_all_probs(infer_sess)
        logging.info('Infer time: %ds Batches: %d datums: %d' %
                     ((time.time() - start_time), num_batches, len(all_probs)))
        for datum_prob in all_probs:
            fw.write('%s\n' % ' '.join(all_words[datum_prob > args.prob_cutoff]))
def __init__(self, hparams, tokenizer=None, training=True, mode='inference'):
    self.training = training
    self.hparams = hparams
    self.tokenizer = tokenizer if tokenizer else Tokenizer(self.hparams,
                                                           VOCAB_FILE)
    self.vocab_size, self.vocab_dict = (len(self.tokenizer.vocab),
                                        self.tokenizer.vocab)
    self.emotion_tokenizer = tokenizer if tokenizer else Tokenizer(
        self.hparams, EMOTION_FILE)
    self.emotion_size, self.emotion_list = (len(self.emotion_tokenizer.vocab),
                                            self.emotion_tokenizer.inv_vocab)
    with tf.name_scope("data_process"):
        self.vocab_table = lookup_ops.index_table_from_file(
            VOCAB_FILE, default_value=self.hparams.unk_id)
        self.reverse_vocab_table = lookup_ops.index_to_string_table_from_file(
            VOCAB_FILE, default_value=self.hparams.unk_token)
        self.emotion_table = lookup_ops.index_table_from_file(
            EMOTION_FILE, default_value=self.hparams.unk_id)
        self.reverse_emotion_table = lookup_ops.index_to_string_table_from_file(
            EMOTION_FILE, default_value=self.hparams.unk_token)
        self.dull_response_id = self.get_dull_response(DULL_RESPONSE)
    if self.training:
        with tf.name_scope("load_record"):
            if mode == 'ddpg':
                train_file = 'daily_train.tfrecords'
                test_file = 'daily_test.tfrecords'
            else:
                train_file = 'daily_mtem_train.tfrecords'
                test_file = 'daily_mtem_test.tfrecords'
                # train_file = 'friends_train.tfrecords'
                # test_file = 'friends_test.tfrecords'
            self.train_dataset_count, self.train_dataset = self.load_record(
                os.path.join(RECORD_DIR, train_file), ELEMENT_LIST)
            self.test_dataset_count, self.test_dataset = self.load_record(
                os.path.join(RECORD_DIR, test_file), ELEMENT_LIST)
def test_index_table_from_file_with_invalid_hashers(self):
    vocabulary_file = self._createVocabFile("invalid_hasher.txt")
    with self.test_session():
        with self.assertRaises(TypeError):
            lookup_ops.index_table_from_file(
                vocabulary_file=vocabulary_file,
                vocab_size=3,
                num_oov_buckets=1,
                hasher_spec=1)
        table = lookup_ops.index_table_from_file(
            vocabulary_file=vocabulary_file,
            vocab_size=3,
            num_oov_buckets=1,
            hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None))
        self.assertRaises(ValueError, table.lookup,
                          constant_op.constant(["salad", "surgery", "tarkus"]))
def build_eval_metrics(self, predict_ids, labels, nwords, params):
    tags_str2idx = lookup_ops.index_table_from_file(params['tag_vocab'],
                                                    default_value=0)
    actual_ids = tags_str2idx.lookup(labels)
    weights = tf.sequence_mask(nwords)
    metrics = {
        "accuracy": tf.metrics.accuracy(actual_ids, predict_ids, weights)
    }
    return metrics
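# For reference, tf.sequence_mask turns lengths into a boolean weight mask
# that silences padded positions in the accuracy metric, e.g.
#   tf.sequence_mask([2, 3], maxlen=4)
#   -> [[True, True, False, False],
#       [True, True, True,  False]]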
def string_to_index_table_from_file(vocabulary_file=None,
                                    num_oov_buckets=0,
                                    vocab_size=None,
                                    default_value=-1,
                                    hasher_spec=FastHashSpec,
                                    name=None):
    return index_table_from_file(
        vocabulary_file,
        num_oov_buckets,
        vocab_size,
        default_value,
        hasher_spec,
        key_dtype=dtypes.string,
        name=name)
def __init__(self, hparams, mode):
    self.mode = mode
    self.hparams = hparams
    params = tf.trainable_variables()
    # Define lookup tables and placeholders.
    self.vocab_table_word = lookup_ops.index_table_from_file(
        'pre_data/vocab_word.txt', default_value=0)
    self.vocab_table_char = lookup_ops.index_table_from_file(
        'pre_data/vocab_char.txt', default_value=0)
    self.norm_trainable = tf.placeholder(tf.bool)
    self.q1 = {}
    self.q2 = {}
    self.label = tf.placeholder(shape=(None,), dtype=tf.float32)
    for q in [self.q1, self.q2]:
        q['words'] = tf.placeholder(shape=(None, None), dtype=tf.string)
        q['words_len'] = tf.placeholder(shape=(None,), dtype=tf.int32)
        q['chars'] = tf.placeholder(shape=(None, None), dtype=tf.string)
        q['chars_len'] = tf.placeholder(shape=(None,), dtype=tf.int32)
        q['words_num'] = tf.placeholder(
            shape=(None, len(hparams.word_num_features)), dtype=tf.float32)
        q['chars_num'] = tf.placeholder(
            shape=(None, len(hparams.char_num_features)), dtype=tf.float32)
    # Build the graph.
    self.build_graph(hparams)
    # Build the optimizer.
    self.optimizer(hparams)
    params = tf.trainable_variables()
    self.saver = tf.train.Saver(tf.global_variables())
    elmo_param = []
    for param in tf.global_variables():
        if 'elmo' in param.name and 'elmo/Variable' not in param.name:
            elmo_param.append(param)
    self.pretrain_saver = tf.train.Saver(elmo_param)
    utils.print_out("# Trainable variables")
    for param in params:
        if hparams.pretrain is False and 'elmo' in param.name:
            continue
        else:
            utils.print_out("  %s, %s, %s" % (param.name,
                                              str(param.get_shape()),
                                              param.op.device))
def create_train_model(model_creator, hparams, scope=None):
    """Creates the train graph, model, and iterator."""
    train_src_file = "%s.%s" % (hparams.train_prefix, hparams.src)
    train_tgt_file = "%s.%s" % (hparams.train_prefix, hparams.tgt)
    src_vocab_file = hparams.src_vocab_file
    tgt_vocab_file = hparams.tgt_vocab_file
    train_graph = tf.Graph()
    with train_graph.as_default():
        src_vocab_table = lookup_ops.index_table_from_file(
            src_vocab_file, default_value=vocab_utils.UNK_ID)
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=vocab_utils.UNK_ID)
        train_src_dataset = tf.contrib.data.TextLineDataset(train_src_file)
        train_tgt_dataset = tf.contrib.data.TextLineDataset(train_tgt_file)
        train_skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64)
        train_iterator = iterator_utils.get_iterator(
            train_src_dataset,
            train_tgt_dataset,
            src_vocab_table,
            tgt_vocab_table,
            batch_size=hparams.batch_size,
            sos=hparams.sos,
            eos=hparams.eos,
            source_reverse=hparams.source_reverse,
            random_seed=hparams.random_seed,
            num_buckets=hparams.num_buckets,
            src_max_len=hparams.src_max_len,
            tgt_max_len=hparams.tgt_max_len,
            skip_count=train_skip_count_placeholder)
        train_model = model_creator(
            hparams,
            iterator=train_iterator,
            mode=tf.contrib.learn.ModeKeys.TRAIN,
            source_vocab_table=src_vocab_table,
            target_vocab_table=tgt_vocab_table,
            scope=scope)
    return train_graph, train_model, train_iterator, train_skip_count_placeholder
def test_string_index_table_from_file(self):
    vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
    with self.test_session():
        table = lookup_ops.index_table_from_file(
            vocabulary_file=vocabulary_file, num_oov_buckets=1)
        ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
        self.assertRaises(errors_impl.OpError, ids.eval)
        lookup_ops.tables_initializer().run()
        self.assertAllEqual((1, 2, 3), ids.eval())
def _create_vocab_tables(self, vocab_files, share_vocab=False):
    if vocab_files[1] is None and share_vocab == False:
        raise ValueError('If share_vocab is set to false must provide target '
                         'vocab. (src_vocab_file, target_vocab_file)')
    src_vocab_table = lookup_ops.index_table_from_file(
        vocab_files[0], default_value=UNK_ID)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(
            vocab_files[1], default_value=UNK_ID)
    return src_vocab_table, tgt_vocab_table
def test_index_table_from_file_with_vocab_size_too_small(self):
    vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
    with self.test_session():
        table = lookup_ops.index_table_from_file(
            vocabulary_file=vocabulary_file, vocab_size=2)
        ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
        self.assertRaises(errors_impl.OpError, ids.eval)
        lookup_ops.tables_initializer().run()
        self.assertAllEqual((1, -1, -1), ids.eval())
        self.assertEqual(2, table.size().eval())
def test_index_table_from_file_with_default_value(self):
    default_value = -42
    vocabulary_file = self._createVocabFile("f2i_vocab4.txt")
    with self.test_session():
        table = lookup_ops.index_table_from_file(
            vocabulary_file=vocabulary_file, default_value=default_value)
        ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
        self.assertRaises(errors_impl.OpError, ids.eval)
        lookup_ops.tables_initializer().run()
        self.assertAllEqual((1, 2, default_value), ids.eval())
def test_int64_index_table_from_file(self):
    vocabulary_file = self._createVocabFile(
        "f2i_vocab3.txt", values=("42", "1", "-1000"))
    with self.test_session():
        table = lookup_ops.index_table_from_file(
            vocabulary_file=vocabulary_file,
            num_oov_buckets=1,
            key_dtype=dtypes.int64)
        ids = table.lookup(
            constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
        self.assertRaises(errors_impl.OpError, ids.eval)
        lookup_ops.tables_initializer().run()
        self.assertAllEqual((1, 2, 3), ids.eval())
def __init__(self, vocabulary, emb_dim, oov_buckets):
    super(TextEmbeddingModel, self).__init__()
    self._oov_buckets = oov_buckets
    self._vocabulary_file = tracking.TrackableAsset(
        write_vocabulary_file(vocabulary))
    self._total_size = len(vocabulary) + oov_buckets
    self._table = lookup_ops.index_table_from_file(
        vocabulary_file=self._vocabulary_file,
        num_oov_buckets=self._oov_buckets,
        hasher_spec=lookup_ops.FastHashSpec)
    self.embeddings = tf.Variable(
        tf.random.uniform(shape=[self._total_size, emb_dim]))
    self.variables = [self.embeddings]
    self.trainable_variables = self.variables
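# Hedged usage sketch (TF2 eager): tokens are looked up in the table and the
# matching embedding rows are gathered. Constructing the model assumes the same
# module context as above (write_vocabulary_file, tracking). The real class
# likely exposes a __call__/serving method; this only exercises the table and
# the embedding matrix.
def _demo_text_embedding_model():
    model = TextEmbeddingModel(vocabulary=["cat", "dog", "fish"],
                               emb_dim=8, oov_buckets=2)
    ids = model._table.lookup(tf.constant(["dog", "never-seen"]))
    vectors = tf.nn.embedding_lookup(model.embeddings, ids)  # shape [2, 8]
    return vectors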
def test_index_table_from_file_with_oov_buckets(self):
    vocabulary_file = self._createVocabFile("f2i_vocab5.txt")
    with self.test_session():
        table = lookup_ops.index_table_from_file(
            vocabulary_file=vocabulary_file, num_oov_buckets=1000)
        ids = table.lookup(
            constant_op.constant(["salad", "surgery", "tarkus", "toccata"]))
        self.assertRaises(errors_impl.OpError, ids.eval)
        lookup_ops.tables_initializer().run()
        self.assertAllEqual(
            (
                1,  # From vocabulary file.
                2,  # From vocabulary file.
                867,  # 3 + fingerprint("tarkus") mod 1000.
                860),  # 3 + fingerprint("toccata") mod 1000.
            ids.eval())