Code Example #1
File: data_feed.py, Project: abhudev/DL_Project1_NLP
def get_parse_data(sent_file,
                   parse_file,
                   en_vocab,
                   parse_vocab,
                   bsize,
                   mode,
                   unk='<unk>',
                   eos='<eos>'):
    """ Parse data batcher
        Return sentence + linearized parse tree.        
    """
    sent_dataset = tf.data.TextLineDataset(sent_file)
    parse_dataset = tf.data.TextLineDataset(parse_file)

    # : Default value for unknown tokens :
    en_vocab_table = lookup_ops.index_table_from_file(en_vocab,
                                                      default_value=1)
    parse_vocab_table = lookup_ops.index_table_from_file(parse_vocab,
                                                         default_value=1)

    # : Append EOS and make column for number of words :
    sent_dataset = sent_dataset.map(
        lambda sentence: tf.string_split([sentence]).values,
        num_parallel_calls=num_threads)
    sent_dataset = sent_dataset.map(
        lambda words: tf.concat([words, [eos]], axis=0),
        num_parallel_calls=num_threads)
    sent_dataset = sent_dataset.map(lambda words: en_vocab_table.lookup(words),
                                    num_parallel_calls=num_threads)
    sent_dataset = sent_dataset.map(lambda words: (words, tf.size(words)),
                                    num_parallel_calls=num_threads)

    # : Make shifted pairs and make column for number of words :
    parse_dataset = parse_dataset.map(
        lambda sentence: tf.string_split([sentence]).values,
        num_parallel_calls=num_threads)
    parse_dataset_start = parse_dataset.map(lambda words: words[:-1])
    parse_dataset = parse_dataset.map(
        lambda words: parse_vocab_table.lookup(words[1:]),
        num_parallel_calls=num_threads)
    parse_dataset_start = parse_dataset_start.map(
        lambda words: parse_vocab_table.lookup(words),
        num_parallel_calls=num_threads)
    parse_dataset = parse_dataset.map(lambda words: (words, tf.size(words)),
                                      num_parallel_calls=num_threads)
    parse_dataset_start = parse_dataset_start.map(
        lambda words: (words, tf.size(words)), num_parallel_calls=num_threads)

    # : Zip the two datasets with line-by-line parses :
    sen_parse_dataset = tf.data.Dataset.zip(
        (sent_dataset, parse_dataset_start, parse_dataset))
    if (mode == 'train'):
        sen_parse_dataset = sen_parse_dataset.shuffle(buffer_size=1000,
                                                      seed=42)
    sen_parse_dataset = sen_parse_dataset.padded_batch(
        batch_size=bsize,
        padded_shapes=(([None], []), ([None], []), ([None], [])))
    sen_parse_dataset = sen_parse_dataset.prefetch(1)

    return (sen_parse_dataset)
Code Example #2
File: create_nmt_model.py, Project: zouning68/RNN-NMT
def create_train_model(hparams):
    src_file = hparams.src_train_file
    tgt_file = hparams.tgt_train_file
    src_vocab_file = hparams.src_vocab_file
    tgt_vocab_file = hparams.tgt_vocab_file
    graph = tf.Graph()
    with graph.as_default(), tf.container('train'):
        src_vocab_table = lookup_ops.index_table_from_file(
            src_vocab_file, default_value=UNK_ID)
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=UNK_ID)
        src_dataset = tf.data.TextLineDataset(src_file)
        tgt_dataset = tf.data.TextLineDataset(tgt_file)
        iterator = get_iterator(src_dataset,
                                tgt_dataset,
                                src_vocab_table,
                                tgt_vocab_table,
                                hparams.batch_size,
                                SOS,
                                EOS,
                                src_max_len=hparams.src_max_len,
                                tgt_max_len=hparams.tgt_max_len)
        model = NMTModel(hparams, 'train', iterator, src_vocab_table,
                         tgt_vocab_table)
        return TrainModel(graph=graph, model=model, iterator=iterator)
Code Example #3
def create_train_model(hparams, model_creator):
  txt_file = "%s.%s" % (hparams.train_prefix, "txt")
  lb_file = "%s.%s" % (hparams.train_prefix, "lb")
  vocab_file = hparams.vocab_file
  index_file = hparams.index_file

  graph = tf.Graph()

  with graph.as_default(), tf.container("train"):
    vocab_table = lookup_ops.index_table_from_file(
      vocab_file, default_value=UNK_ID)
    # for the labels
    index_table = lookup_ops.index_table_from_file(
      index_file, default_value=0)

    txt_dataset = tf.data.TextLineDataset(txt_file)
    lb_dataset = tf.data.TextLineDataset(lb_file)

    iterator = data_iterator.get_iterator(
        txt_dataset,
        lb_dataset,
        vocab_table,
        index_table,
        batch_size=hparams.batch_size,
        num_buckets=hparams.num_buckets,
        max_len=hparams.max_len)

    model = model_creator(
        hparams,
        iterator=iterator,
        mode=tf.contrib.learn.ModeKeys.TRAIN,
        vocab_table=vocab_table)

  return TrainModel(graph=graph, model=model, iterator=iterator)
Code Example #4
def mydatasetcreator(hparams):

    srcvocabpath = "%s.%s" % (hparams.vocab_prefix, hparams.src)
    sv = lookup_ops.index_table_from_file(srcvocabpath, default_value=UNK_ID)
    tgtvocabpath = "%s.%s" % (hparams.vocab_prefix, hparams.tgt)
    tv = lookup_ops.index_table_from_file(tgtvocabpath, default_value=UNK_ID)
    hparams.src_vocab_size = sv.size()
    hparams.tgt_vocab_size = tv.size()

    srcpath = "%s.%s" % (hparams.train_prefix, hparams.src)
    tgtpath = "%s.%s" % (hparams.train_prefix, hparams.tgt)
    srcdata = tf.data.TextLineDataset(srcpath)
    srcdata = srcdata.map(lambda x: tf.strings.split([x]).values)
    srcdata = srcdata.map(lambda x: tf.dtypes.cast(sv.lookup(x), tf.int32))
    max_length = max(tf.shape(v)[0] for v in srcdata)
    hparams.max_input_length = max_length
    srcdata = srcdata.padded_batch(hparams.batch, [max_length],
                                   drop_remainder=True)
    tgtdata = tf.data.TextLineDataset(tgtpath)
    tgtdata = tgtdata.map(lambda x: tf.strings.split([x]).values)
    tgtdata = tgtdata.map(lambda x: tf.concat([x, [EOS]], -1))
    tgtdata = tgtdata.map(lambda x: tf.dtypes.cast(tv.lookup(x), tf.int32))
    max_length = max(tf.shape(v)[0] for v in tgtdata)
    hparams.max_output_length = max_length
    tgtdata = tgtdata.padded_batch(hparams.batch, [max_length],
                                   drop_remainder=True)
    d = tf.data.TextLineDataset.zip((srcdata, tgtdata))

    for x in d:
        hparams.train_data_size += 1

    return d
Code Example #5
File: create_nmt_model.py, Project: zouning68/RNN-NMT
def create_infer_model(hparams):
    src_vocab_file = hparams.src_vocab_file
    tgt_vocab_file = hparams.tgt_vocab_file
    graph = tf.Graph()
    with graph.as_default(), tf.container('infer'):
        src_vocab_table = lookup_ops.index_table_from_file(
            src_vocab_file, default_value=UNK_ID)
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=UNK_ID)
        reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_file(
            tgt_vocab_file, default_value=UNK)
        src_placeholder = tf.placeholder(shape=[None], dtype=tf.string)
        batch_size_placeholder = tf.placeholder(shape=[], dtype=tf.int64)

        src_dataset = tf.data.Dataset.from_tensor_slices(src_placeholder)
        iterator = get_infer_iterator(src_dataset,
                                      src_vocab_table,
                                      batch_size_placeholder,
                                      EOS,
                                      src_max_len=hparams.src_max_len_infer)
        model = NMTModel(hparams, 'infer', iterator, src_vocab_table,
                         tgt_vocab_table, reverse_tgt_vocab_table)
        return InferModel(graph=graph,
                          model=model,
                          src_placeholder=src_placeholder,
                          batch_size_placeholder=batch_size_placeholder,
                          iterator=iterator)
Code Example #6
File: create_nmt_model.py, Project: zouning68/RNN-NMT
def create_eval_model(hparams):
    src_vocab_file = hparams.src_vocab_file
    tgt_vocab_file = hparams.tgt_vocab_file
    graph = tf.Graph()
    with graph.as_default(), tf.container('eval'):
        src_vocab_table = lookup_ops.index_table_from_file(
            src_vocab_file, default_value=UNK_ID)
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=UNK_ID)
        src_file_placeholder = tf.placeholder(shape=[], dtype=tf.string)
        tgt_file_placeholder = tf.placeholder(shape=[], dtype=tf.string)
        src_dataset = tf.data.TextLineDataset(src_file_placeholder)
        tgt_dataset = tf.data.TextLineDataset(tgt_file_placeholder)
        iterator = get_iterator(src_dataset,
                                tgt_dataset,
                                src_vocab_table,
                                tgt_vocab_table,
                                hparams.batch_size,
                                SOS,
                                EOS,
                                src_max_len=hparams.src_max_len,
                                tgt_max_len=hparams.tgt_max_len)
        model = NMTModel(hparams, 'eval', iterator, src_vocab_table,
                         tgt_vocab_table)
        return EvalModel(graph=graph,
                         model=model,
                         src_file_placeholder=src_file_placeholder,
                         tgt_file_placeholder=tgt_file_placeholder,
                         iterator=iterator)
Code Example #7
def create_vocab_tables(src_vocab_file, tgt_vocab_file, unk_id):
    """Create the vocab lookup table"""
    src_vocab_table = lookup_ops.index_table_from_file(src_vocab_file,
                                                       default_value=unk_id)
    tgt_vocab_table = lookup_ops.index_table_from_file(tgt_vocab_file,
                                                       default_value=unk_id)

    return src_vocab_table, tgt_vocab_table
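
As a minimal sketch (not taken from any of the projects listed here), the tables returned by a helper like the one above only become usable after the table initializer has run; the initialize-then-lookup flow in TF 1.x graph mode might look like the following, where the vocab file contents and UNK_ID value are illustrative assumptions.

# Sketch: initializing and querying a table created by index_table_from_file
# in TF 1.x graph mode. Vocab contents and UNK_ID are assumptions.
import tensorflow as tf
from tensorflow.python.ops import lookup_ops

UNK_ID = 0
with open("/tmp/src_vocab.txt", "w") as f:
    f.write("<unk>\n<s>\n</s>\nhello\nworld\n")

src_vocab_table = lookup_ops.index_table_from_file("/tmp/src_vocab.txt",
                                                   default_value=UNK_ID)
ids = src_vocab_table.lookup(tf.constant(["hello", "world", "never-seen"]))

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(ids))  # [3 4 0]; the OOV token falls back to default_value
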
Code Example #8
File: vocab_utils.py, Project: yyzreal/nlp-architect
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
    """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
    src_vocab_table = lookup_ops.index_table_from_file(src_vocab_file, default_value=UNK_ID)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(tgt_vocab_file, default_value=UNK_ID)
    return src_vocab_table, tgt_vocab_table
Code Example #9
def create_input_data(source_data_file, target_data_file,
                      source_vocab_file, target_vocab_file,
                      batch_size, sos, eos,
                      source_max_length, target_max_length):
  source_dataset = tf.data.TextLineDataset(tf.gfile.Glob(source_data_file))
  target_dataset = tf.data.TextLineDataset(tf.gfile.Glob(target_data_file))
  source_vocab = lookup_ops.index_table_from_file(
    source_vocab_file, default_value=FLAGS.unk_id)
  target_vocab = lookup_ops.index_table_from_file(
    target_vocab_file, default_value=FLAGS.unk_id)

  output_buffer_size = batch_size * 1000

  source_eos_id = tf.cast(source_vocab.lookup(tf.constant(eos)), tf.int32)
  target_sos_id = tf.cast(target_vocab.lookup(tf.constant(sos)), tf.int32)
  target_eos_id = tf.cast(target_vocab.lookup(tf.constant(eos)), tf.int32)

  dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
  dataset = dataset.map(
    lambda src, tgt: (tf.string_split([src]).values,
                      tf.string_split([tgt]).values)).prefetch(output_buffer_size)
  dataset = dataset.filter(
    lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
  dataset = dataset.map(
    lambda src, tgt: (src[:source_max_length], tgt[:target_max_length]))
  dataset = dataset.prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt: (tf.cast(source_vocab.lookup(src), tf.int32),
                      tf.cast(target_vocab.lookup(tgt), tf.int32)))
  dataset = dataset.prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt: (src,
                      tf.concat(([target_sos_id], tgt), 0),
                      tf.concat((tgt, [target_eos_id]), 0))).prefetch(output_buffer_size)

  dataset = dataset.map(
    lambda src, tgt_in, tgt_out: (
      src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in))).prefetch(output_buffer_size)

  dataset = dataset.shuffle(100).repeat().padded_batch(
    batch_size,
    padded_shapes=(tf.TensorShape([None]),
                   tf.TensorShape([None]),
                   tf.TensorShape([None]),
                   tf.TensorShape([]),
                   tf.TensorShape([])),
    padding_values=(source_eos_id,
                    target_eos_id,
                    target_eos_id,
                    0,
                    0))

  iterator = dataset.make_initializable_iterator()

  return iterator.get_next(), iterator.initializer, source_vocab, target_vocab
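
A hypothetical way to consume the pipeline built by create_input_data above is sketched below; the file paths, batch size, and special tokens are placeholders, and it assumes `import tensorflow as tf` plus a FLAGS.unk_id defined elsewhere in the project.

# Usage sketch for create_input_data (paths and hyper-parameters are assumptions).
next_batch, init_op, source_vocab, target_vocab = create_input_data(
    source_data_file="data/train.src",
    target_data_file="data/train.tgt",
    source_vocab_file="data/vocab.src",
    target_vocab_file="data/vocab.tgt",
    batch_size=32, sos="<s>", eos="</s>",
    source_max_length=50, target_max_length=50)

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    sess.run(init_op)
    # Each batch yields (src, tgt_in, tgt_out, src_len, tgt_len).
    src, tgt_in, tgt_out, src_len, tgt_len = sess.run(next_batch)
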
Code Example #10
File: utils.py, Project: zhenglei2015/NER
def create_vocab_tables(src_vocab_file, tgt_vocab_file, src_unknown_id, tgt_unknown_id, share_vocab=False):
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=src_unknown_id)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=tgt_unknown_id)
  return src_vocab_table, tgt_vocab_table
Code Example #11
File: utils.py, Project: luluyouyue/NER
def create_vocab_tables(src_vocab_file, tgt_vocab_file, src_unknown_id, tgt_unknown_id, share_vocab=False):
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=src_unknown_id)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=tgt_unknown_id)
  return src_vocab_table, tgt_vocab_table
Code Example #12
File: data_feed.py, Project: abhudev/DL_Project1_NLP
def get_nli_data(nli_premise,
                 nli_hypothesis,
                 nli_classes,
                 en_vocab,
                 class_vocab,
                 bsize,
                 mode,
                 unk='<unk>',
                 eos='<eos>'):
    """ NLI Batcher
        Return Premise, Hypothesis sentence + class
        Also return sentence lengths
    """
    nli_premise = tf.data.TextLineDataset(nli_premise)
    nli_hypothesis = tf.data.TextLineDataset(nli_hypothesis)
    nli_classes = tf.data.TextLineDataset(nli_classes)
    en_vocab_table = lookup_ops.index_table_from_file(en_vocab,
                                                      default_value=1)
    class_vocab_table = lookup_ops.index_table_from_file(class_vocab,
                                                         default_value=0)

    # : Append EOS and make columns for number of words :
    nli_premise = nli_premise.map(lambda sentence:
                                  (tf.string_split([sentence]).values),
                                  num_parallel_calls=num_threads)
    nli_premise = nli_premise.map(
        lambda words: en_vocab_table.lookup(tf.concat([words, [eos]], axis=0)),
        num_parallel_calls=num_threads)
    nli_premise_wrds = nli_premise.map(lambda words: tf.size(words),
                                       num_parallel_calls=num_threads)

    nli_hypothesis = nli_hypothesis.map(lambda sentence:
                                        (tf.string_split([sentence]).values),
                                        num_parallel_calls=num_threads)
    nli_hypothesis = nli_hypothesis.map(
        lambda words: en_vocab_table.lookup(tf.concat([words, [eos]], axis=0)),
        num_parallel_calls=num_threads)
    nli_hypothesis_wrds = nli_hypothesis.map(lambda words: tf.size(words),
                                             num_parallel_calls=num_threads)

    nli_classes = nli_classes.map(lambda sentence: class_vocab_table.lookup(
        tf.string_split([sentence]).values)[0],
                                  num_parallel_calls=num_threads)

    nli_dataset = tf.data.Dataset.zip(
        (nli_premise, nli_premise_wrds, nli_hypothesis, nli_hypothesis_wrds,
         nli_classes))
    if (mode == 'train'):
        nli_dataset = nli_dataset.shuffle(buffer_size=1000, seed=42)
    nli_dataset = nli_dataset.padded_batch(
        batch_size=bsize,
        padded_shapes=([None], [], [None], [], []))
    nli_dataset = nli_dataset.prefetch(1)

    return (nli_dataset)
Code Example #13
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table
Code Example #14
def create_vocab_tables(src1_vocab_file, src2_vocab_file, tgt_vocab_file):
    src1_vocab_table = lookup_ops.index_table_from_file(
        src1_vocab_file, default_value=data_utils.UNK_ID)
    src2_vocab_table = lookup_ops.index_table_from_file(
        src2_vocab_file, default_value=data_utils.UNK_ID)

    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=data_utils.UNK_ID)

    return src1_vocab_table, src2_vocab_table, tgt_vocab_table
Code Example #15
File: vocab_utils.py, Project: ChuanTianML/learn_gnmt
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
    """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
    src_vocab_table = lookup_ops.index_table_from_file(  # returns a word-to-id table
        # The lookup table to map a key_dtype Tensor to index int64 Tensor.
        src_vocab_file,
        default_value=UNK_ID)
    if share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=UNK_ID)
    return src_vocab_table, tgt_vocab_table
Code Example #16
 def __init__(self, config_dict):
     super(Bert2Seq, self).__init__()
     self.cfg = Bert2SeqConfig(config_dict)
     #Dictionary initialization
     self.bert_dict = lookup_ops.index_table_from_file(
         self.cfg.vocab_path + "/" + self.cfg.bert_vocab_file,
         default_value=0)
     self.decoder_dict = lookup_ops.index_table_from_file(
         self.cfg.vocab_path + "/" + self.cfg.decoder_vocab_file,
         default_value=self.cfg.dict_param.unk_id)
     self.reverse_decoder_dict = lookup_ops.index_to_string_table_from_file(
         self.cfg.vocab_path + "/" + self.cfg.decoder_vocab_file,
         default_value=self.cfg.dict_param.unk)
Code Example #17
File: vocab_utils.py, Project: rpryzant/code-doodles
def create_vocab_tables(src_vocab_file, tgt_vocab_file, config):
    src_vocab_table = lookup_ops.index_table_from_file(
        src_vocab_file, default_value=UNK_ID)
    if config.share_vocab:
        tgt_vocab_table = src_vocab_table
    else:
        tgt_vocab_table = lookup_ops.index_table_from_file(
            tgt_vocab_file, default_value=UNK_ID)

    reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_file(
        tgt_vocab_file, default_value=config.unk)

    return src_vocab_table, tgt_vocab_table, reverse_tgt_vocab_table
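
A reverse table like the one returned above maps ids back to token strings; a minimal sketch of the token-to-id-to-token round trip (not from the project above, with assumed vocab contents and file path) follows.

# Sketch: round-tripping ids back to tokens with index_to_string_table_from_file.
import tensorflow as tf
from tensorflow.python.ops import lookup_ops

with open("/tmp/tgt_vocab.txt", "w") as f:
    f.write("<unk>\n<s>\n</s>\ncat\ndog\n")

tgt_vocab_table = lookup_ops.index_table_from_file("/tmp/tgt_vocab.txt",
                                                   default_value=0)
reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_file(
    "/tmp/tgt_vocab.txt", default_value="<unk>")

ids = tgt_vocab_table.lookup(tf.constant(["cat", "dog", "unicorn"]))
words = reverse_tgt_vocab_table.lookup(ids)

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(words))  # [b'cat' b'dog' b'<unk>']
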
Code Example #18
 def create_vocab_tables(self):
     """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
     src_vocab_file, tgt_vocab_file, share_vocab  = self.opt.source_vocab_file, \
                     self.opt.dest_vocab_file, False
     
     src_vocab_table = lookup_ops.index_table_from_file(
                               src_vocab_file, default_value=UNK_ID)
     if share_vocab:
         tgt_vocab_table = src_vocab_table
     else:
         tgt_vocab_table = lookup_ops.index_table_from_file(
                              tgt_vocab_file, default_value=UNK_ID)
     return src_vocab_table, tgt_vocab_table
Code Example #19
    def __init__(self, hparams, training=True):

        self.training = training
        self.hparams = hparams

        self.src_max_len = self.hparams.src_max_len
        self.tgt_max_len = self.hparams.tgt_max_len

        self.vocab_size, self.vocab_list = check_vocab(VOCAB_FILE)
        self.emotion_size, self.emotion_list = check_vocab(EMOTION_FILE)

        self.vocab_table = lookup_ops.index_table_from_file(
            VOCAB_FILE, default_value=self.hparams.unk_id)
        self.reverse_vocab_table = lookup_ops.index_to_string_table_from_file(
            VOCAB_FILE, default_value=self.hparams.unk_token)

        self.emotion_table = lookup_ops.index_table_from_file(
            EMOTION_FILE, default_value=self.hparams.unk_id)
        self.reverse_emotion_table = lookup_ops.index_to_string_table_from_file(
            EMOTION_FILE, default_value=self.hparams.unk_token)

        if self.training:
            print('--------------------------------------------------')
            for index, name in enumerate(RECORD_FILE_NAME_LIST):
                print('= {} - {}'.format(index, name))
            RECORD_INDEX = int(input("# Input record file index: "))
            print('--------------------------------------------------')

            batch_lists = self.get_file_batch_lists('{}_train.json'.format(
                RECORD_FILE_NAME_LIST[RECORD_INDEX]))
            emotion_num_dict = self.get_emotion_num(batch_lists)
            self.emotion_weight_dict = self.get_emotion_weight(
                emotion_num_dict)

            self.case_table = prepare_case_table()
            self.dev_dataset = self.load_record(
                os.path.join(
                    RECORD_DIR, '{}_dev.tfrecords'.format(
                        RECORD_FILE_NAME_LIST[RECORD_INDEX])))
            self.test_dataset = self.load_record(
                os.path.join(
                    RECORD_DIR, '{}_test.tfrecords'.format(
                        RECORD_FILE_NAME_LIST[RECORD_INDEX])))
            self.train_dataset = self.load_record(
                os.path.join(
                    RECORD_DIR, '{}_train.tfrecords'.format(
                        RECORD_FILE_NAME_LIST[RECORD_INDEX])))
        else:
            self.case_table = None
Code Example #20
 def test_index_table_from_file_with_vocab_size_too_large(self):
   vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
   with self.test_session():
     table = lookup_ops.index_table_from_file(
         vocabulary_file=vocabulary_file, vocab_size=4)
     self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                             "Invalid vocab_size", table.init.run)
Code Example #21
def create_speaker_tables(speaker_table_file):
    """Creates speaker tables for question file"""
    ## TODO: account for speakers only present in answers

    speaker_table = lookup_ops.index_table_from_file(speaker_table_file,
                                                     default_value=UNK_ID)
    return speaker_table
Code Example #22
 def vocabulary_lookup(self):
     """Returns a lookup table mapping string to index."""
     return lookup.index_table_from_file(
         self.vocabulary_file,
         vocab_size=self.vocabulary_size - self.num_oov_buckets,
         num_oov_buckets=self.num_oov_buckets,
         default_value=constants.UNKNOWN_ID)
Code Example #23
def create_train_model(hparams):
    train_file = hparams.train
    vocab_size, vocab_file = vocab_utils.check_vocab(hparams.vocab_file,
                                                     hparams.out_dir,
                                                     sos=hparams.sos,
                                                     eos=hparams.eos,
                                                     unk=vocab_utils.UNK)
    hparams.add_hparam("vocab_size", vocab_size)

    graph = tf.Graph()
    with graph.as_default(), tf.container("train"):
        vocab_table = lookup_ops.index_table_from_file(vocab_file,
                                                       default_value=0)

        iterator = iterator_utils.get_iterator(train_file,
                                               vocab_table,
                                               batch_size=hparams.batch_size,
                                               sos=hparams.sos,
                                               eos=hparams.eos,
                                               src_max_len=hparams.src_max_len)

        model = rnn_model.Model(hparams,
                                mode=tf.contrib.learn.ModeKeys.TRAIN,
                                iterator=iterator,
                                vocab_table=vocab_table)
    return graph, model, iterator
Code Example #24
 def __init__(self, config_dict):
     super(BertQK, self).__init__()
     self.cfg = BertQKConfig(config_dict)
     #Dictionary initialization
     self.bert_dict = lookup_ops.index_table_from_file(
         self.cfg.vocab_path + "/" + self.cfg.bert_vocab_file,
         default_value=0)
Code Example #25
File: export_v2.py, Project: waitalone/hub-1
 def __init__(self,
              vocab_file_path,
              oov_buckets,
              num_lines_to_ignore=0,
              num_lines_to_use=None):
   super(TextEmbeddingModel, self).__init__()
   self._vocabulary, self._pretrained_vectors = load(vocab_file_path,
                                                     parse_line,
                                                     num_lines_to_ignore,
                                                     num_lines_to_use)
   self._oov_buckets = oov_buckets
   # Make the vocabulary file a `TrackableAsset` to ensure it is saved along
   # with the model.
   self._vocabulary_file = tracking.TrackableAsset(
       write_vocabulary_file(self._vocabulary))
   self._table = lookup_ops.index_table_from_file(
       vocabulary_file=self._vocabulary_file,
       num_oov_buckets=self._oov_buckets,
       hasher_spec=lookup_ops.FastHashSpec)
   oovs = np.zeros([oov_buckets, self._pretrained_vectors.shape[1]])
   self._pretrained_vectors.resize([
       self._pretrained_vectors.shape[0] + oov_buckets,
       self._pretrained_vectors.shape[1]
   ])
   self._pretrained_vectors[self._pretrained_vectors.shape[0] -
                            oov_buckets:, :] = oovs
   self.embeddings = tf.Variable(self._pretrained_vectors)
   self.variables = [self.embeddings]
   self.trainable_variables = self.variables
Code Example #26
File: driver.py, Project: piBrain/aura-ml
    def _create_vocab_tables(self, vocab_files, share_vocab=False):
        if vocab_files[1] is None and not share_vocab:
            raise ValueError(
                'If share_vocab is False, a target vocab file must be provided: '
                '(src_vocab_file, target_vocab_file)')

        src_vocab_table = lookup_ops.index_table_from_file(
            vocab_files[0], default_value=UNK_ID)

        if share_vocab:
            tgt_vocab_table = src_vocab_table
        else:
            tgt_vocab_table = lookup_ops.index_table_from_file(
                vocab_files[1], default_value=UNK_ID)

        return src_vocab_table, tgt_vocab_table
Code Example #27
File: bilstm_crf_model.py, Project: yaoqi/deepseg
    def compute_loss(self, logits, labels, nwords, params):
        """Compute loss.

        Args:
            logits: A tensor, output of dense layer
            labels: A tensor, the ground truth label
            nwords: A tensor, length of inputs
            params: A dict, storing hyper params

        Returns:
            A loss tensor, negative log likelihood loss.
        """
        tags_str2idx = lookup_ops.index_table_from_file(params['tag_vocab'],
                                                        default_value=0)
        actual_ids = tags_str2idx.lookup(labels)
        # get transition matrix created before
        with tf.variable_scope("crf", reuse=True):
            trans_val = tf.get_variable(
                "transition",
                shape=[params['num_tags'], params['num_tags']],
                dtype=tf.float32)
        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
            inputs=logits,
            tag_indices=actual_ids,
            sequence_lengths=nwords,
            transition_params=trans_val)
        loss = tf.reduce_mean(-log_likelihood)
        return loss
Code Example #28
 def __init__(self):
     TreeHeight = lambda x: int(math.log(x - 1) / math.log(2)) + 2
     indexCnt = count_idx(FLAGS.input_previous_model_path + "/" +
                          FLAGS.tree_index_file)
     self.tree_height = TreeHeight(indexCnt + 1)
     self.tree_index = lookup_ops.index_table_from_file(
         FLAGS.input_previous_model_path + "/" + FLAGS.tree_index_file,
         default_value=indexCnt)
     self.reverse_tree_index = lookup_ops.index_to_string_table_from_file(
         FLAGS.input_previous_model_path + "/" + FLAGS.tree_index_file,
         default_value='<unk>')
     self.dims = parse_dims(FLAGS.semantic_model_dims)
     self.layer_embedding = tf.get_variable(
         name='tree_node_emb',
         shape=[pow(2, self.tree_height - 1), self.dims[-1]])
     if not FLAGS.leaf_content_emb:
         self.leaf_embedding = tf.get_variable(
             name='leaf_node_emb',
             shape=[pow(2, self.tree_height - 1), self.dims[-1]])
     if FLAGS.use_mstf_ops == 1:
         self.op_dict = mstf.dssm_dict(FLAGS.xletter_dict)
     elif FLAGS.use_mstf_ops == -1:
         self.op_dict = XletterPreprocessor(FLAGS.xletter_dict,
                                            FLAGS.xletter_win_size)
     else:
         self.op_dict = None
Code Example #29
def do_infer(hparams, args):
    if 'len_max_sentence' in hparams:
        len_max_sentence = hparams.len_max_sentence
    else:
        len_max_sentence = -1

    infer_graph = tf.Graph()

    rev_vocab_table = index_to_word_map(hparams.vocab_output)
    all_words = np.array(
        [rev_vocab_table[index] for index in range(hparams.size_vocab_output)])

    with infer_graph.as_default():
        vocab_table_input = lookup_ops.index_table_from_file(
            hparams.vocab_input, default_value=0)

        infer_iterator = create_infer_dataset_iterator(args.infer_sentences,
                                                       vocab_table_input,
                                                       args.infer_batch_size,
                                                       len_max_sentence)

        infer_model = RNNPredictor(hparams, infer_iterator, ModeKeys.INFER)
        infer_sess = tf.Session()
        infer_sess.run(tf.tables_initializer())
        latest_train_ckpt = tf.train.latest_checkpoint(args.model_dir)
        infer_model.saver.restore(infer_sess, latest_train_ckpt)

        fw = open(args.infer_out, 'w')
        start_time = time.time()
        all_probs, num_batches = infer_model.get_all_probs(infer_sess)
        logging.info('Infer time: %ds Batches: %d datums: %d' %
                     ((time.time() - start_time), num_batches, len(all_probs)))

    for datum_prob in all_probs:
        fw.write('%s\n' % ' '.join(all_words[datum_prob > args.prob_cutoff]))
Code Example #30
    def __init__(self,
                 hparams,
                 tokenizer=None,
                 training=True,
                 mode='inference'):

        self.training = training
        self.hparams = hparams

        self.tokenizer = tokenizer if tokenizer else Tokenizer(
            self.hparams, VOCAB_FILE)
        self.vocab_size, self.vocab_dict = len(
            self.tokenizer.vocab), self.tokenizer.vocab

        self.emotion_tokenizer = tokenizer if tokenizer else Tokenizer(
            self.hparams, EMOTION_FILE)
        self.emotion_size, self.emotion_list = len(
            self.emotion_tokenizer.vocab), self.emotion_tokenizer.inv_vocab

        with tf.name_scope("data_process"):

            self.vocab_table = lookup_ops.index_table_from_file(
                VOCAB_FILE, default_value=self.hparams.unk_id)
            self.reverse_vocab_table = lookup_ops.index_to_string_table_from_file(
                VOCAB_FILE, default_value=self.hparams.unk_token)

            self.emotion_table = lookup_ops.index_table_from_file(
                EMOTION_FILE, default_value=self.hparams.unk_id)
            self.reverse_emotion_table = lookup_ops.index_to_string_table_from_file(
                EMOTION_FILE, default_value=self.hparams.unk_token)

            self.dull_response_id = self.get_dull_response(DULL_RESPONSE)

        if self.training:
            with tf.name_scope("load_record"):
                if mode == 'ddpg':
                    train_file = 'daily_train.tfrecords'
                    test_file = 'daily_test.tfrecords'
                else:
                    train_file = 'daily_mtem_train.tfrecords'
                    test_file = 'daily_mtem_test.tfrecords'
#                    train_file = 'friends_train.tfrecords'
#                    test_file = 'friends_test.tfrecords'
                self.train_dataset_count, self.train_dataset = self.load_record(
                    os.path.join(RECORD_DIR, train_file), ELEMENT_LIST)
                self.test_dataset_count, self.test_dataset = self.load_record(
                    os.path.join(RECORD_DIR, test_file), ELEMENT_LIST)
Code Example #31
  def test_index_table_from_file_with_invalid_hashers(self):
    vocabulary_file = self._createVocabFile("invalid_hasher.txt")
    with self.test_session():
      with self.assertRaises(TypeError):
        lookup_ops.index_table_from_file(
            vocabulary_file=vocabulary_file,
            vocab_size=3,
            num_oov_buckets=1,
            hasher_spec=1)

      table = lookup_ops.index_table_from_file(
          vocabulary_file=vocabulary_file,
          vocab_size=3,
          num_oov_buckets=1,
          hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None))

      self.assertRaises(ValueError, table.lookup,
                        constant_op.constant(["salad", "surgery", "tarkus"]))
Code Example #32
File: base_model.py, Project: yaoqi/deepseg
 def build_eval_metrics(self, predict_ids, labels, nwords, params):
     tags_str2idx = lookup_ops.index_table_from_file(
         params['tag_vocab'], default_value=0)
     actual_ids = tags_str2idx.lookup(labels)
     weights = tf.sequence_mask(nwords)
     metrics = {
         "accuracy": tf.metrics.accuracy(actual_ids, predict_ids, weights)
     }
     return metrics
Code Example #33
def string_to_index_table_from_file(vocabulary_file=None,
                                    num_oov_buckets=0,
                                    vocab_size=None,
                                    default_value=-1,
                                    hasher_spec=FastHashSpec,
                                    name=None):
  return index_table_from_file(
      vocabulary_file, num_oov_buckets, vocab_size, default_value, hasher_spec,
      key_dtype=dtypes.string, name=name)
Code Example #34
    def __init__(self, hparams, mode):
        self.mode = mode
        self.hparams = hparams
        params = tf.trainable_variables()
        #define placeholder
        self.vocab_table_word = lookup_ops.index_table_from_file(
            'pre_data/vocab_word.txt', default_value=0)
        self.vocab_table_char = lookup_ops.index_table_from_file(
            'pre_data/vocab_char.txt', default_value=0)
        self.norm_trainable = tf.placeholder(tf.bool)
        self.q1 = {}
        self.q2 = {}
        self.label = tf.placeholder(shape=(None, ), dtype=tf.float32)

        for q in [self.q1, self.q2]:
            q['words'] = tf.placeholder(shape=(None, None), dtype=tf.string)
            q['words_len'] = tf.placeholder(shape=(None, ), dtype=tf.int32)
            q['chars'] = tf.placeholder(shape=(None, None), dtype=tf.string)
            q['chars_len'] = tf.placeholder(shape=(None, ), dtype=tf.int32)
            q['words_num'] = tf.placeholder(
                shape=(None, len(hparams.word_num_features)), dtype=tf.float32)
            q['chars_num'] = tf.placeholder(
                shape=(None, len(hparams.char_num_features)), dtype=tf.float32)

        #build graph
        self.build_graph(hparams)

        #build optimizer
        self.optimizer(hparams)
        params = tf.trainable_variables()
        self.saver = tf.train.Saver(tf.global_variables())
        elmo_param = []
        for param in tf.global_variables():
            if 'elmo' in param.name and 'elmo/Variable' not in param.name:
                elmo_param.append(param)
        self.pretrain_saver = tf.train.Saver(elmo_param)
        utils.print_out("# Trainable variables")
        for param in params:
            if hparams.pretrain is False and 'elmo' in param.name:
                continue
            else:
                utils.print_out(
                    "  %s, %s, %s" %
                    (param.name, str(param.get_shape()), param.op.device))
Code Example #35
def create_train_model(model_creator,
                       hparams,
                       scope=None):
  """Create train graph, model, and iterator."""
  train_src_file = "%s.%s" % (hparams.train_prefix, hparams.src)
  train_tgt_file = "%s.%s" % (hparams.train_prefix, hparams.tgt)
  src_vocab_file = hparams.src_vocab_file
  tgt_vocab_file = hparams.tgt_vocab_file

  train_graph = tf.Graph()

  with train_graph.as_default():
    src_vocab_table = lookup_ops.index_table_from_file(
        src_vocab_file, default_value=vocab_utils.UNK_ID)
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=vocab_utils.UNK_ID)

    train_src_dataset = tf.contrib.data.TextLineDataset(train_src_file)
    train_tgt_dataset = tf.contrib.data.TextLineDataset(train_tgt_file)
    train_skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64)

    train_iterator = iterator_utils.get_iterator(
        train_src_dataset,
        train_tgt_dataset,
        src_vocab_table,
        tgt_vocab_table,
        batch_size=hparams.batch_size,
        sos=hparams.sos,
        eos=hparams.eos,
        source_reverse=hparams.source_reverse,
        random_seed=hparams.random_seed,
        num_buckets=hparams.num_buckets,
        src_max_len=hparams.src_max_len,
        tgt_max_len=hparams.tgt_max_len,
        skip_count=train_skip_count_placeholder)
    train_model = model_creator(
        hparams,
        iterator=train_iterator,
        mode=tf.contrib.learn.ModeKeys.TRAIN,
        source_vocab_table=src_vocab_table,
        target_vocab_table=tgt_vocab_table,
        scope=scope)

  return train_graph, train_model, train_iterator, train_skip_count_placeholder
Code Example #36
  def test_string_index_table_from_file(self):
    vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
    with self.test_session():
      table = lookup_ops.index_table_from_file(
          vocabulary_file=vocabulary_file, num_oov_buckets=1)
      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))

      self.assertRaises(errors_impl.OpError, ids.eval)
      lookup_ops.tables_initializer().run()
      self.assertAllEqual((1, 2, 3), ids.eval())
Code Example #37
File: driver.py, Project: piBrain/aura-ml
    def _create_vocab_tables(self, vocab_files, share_vocab=False):
        if vocab_files[1] is None and not share_vocab:
            raise ValueError(
                'If share_vocab is False, a target vocab file must be provided: '
                '(src_vocab_file, target_vocab_file)')

        src_vocab_table = lookup_ops.index_table_from_file(
            vocab_files[0],
            default_value=UNK_ID
        )

        if share_vocab:
            tgt_vocab_table = src_vocab_table
        else:
            tgt_vocab_table = lookup_ops.index_table_from_file(
                vocab_files[1],
                default_value=UNK_ID
            )

        return src_vocab_table, tgt_vocab_table
Code Example #38
  def test_index_table_from_file_with_vocab_size_too_small(self):
    vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
    with self.test_session():
      table = lookup_ops.index_table_from_file(
          vocabulary_file=vocabulary_file, vocab_size=2)
      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))

      self.assertRaises(errors_impl.OpError, ids.eval)
      lookup_ops.tables_initializer().run()
      self.assertAllEqual((1, -1, -1), ids.eval())
      self.assertEqual(2, table.size().eval())
Code Example #39
  def test_index_table_from_file_with_default_value(self):
    default_value = -42
    vocabulary_file = self._createVocabFile("f2i_vocab4.txt")
    with self.test_session():
      table = lookup_ops.index_table_from_file(
          vocabulary_file=vocabulary_file, default_value=default_value)
      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))

      self.assertRaises(errors_impl.OpError, ids.eval)
      lookup_ops.tables_initializer().run()
      self.assertAllEqual((1, 2, default_value), ids.eval())
Code Example #40
  def test_int64_index_table_from_file(self):
    vocabulary_file = self._createVocabFile(
        "f2i_vocab3.txt", values=("42", "1", "-1000"))
    with self.test_session():
      table = lookup_ops.index_table_from_file(
          vocabulary_file=vocabulary_file,
          num_oov_buckets=1,
          key_dtype=dtypes.int64)
      ids = table.lookup(
          constant_op.constant((1, -1000, 11), dtype=dtypes.int64))

      self.assertRaises(errors_impl.OpError, ids.eval)
      lookup_ops.tables_initializer().run()
      self.assertAllEqual((1, 2, 3), ids.eval())
Code Example #41
 def __init__(self, vocabulary, emb_dim, oov_buckets):
   super(TextEmbeddingModel, self).__init__()
   self._oov_buckets = oov_buckets
   self._vocabulary_file = tracking.TrackableAsset(
       write_vocabulary_file(vocabulary))
   self._total_size = len(vocabulary) + oov_buckets
   self._table = lookup_ops.index_table_from_file(
       vocabulary_file=self._vocabulary_file,
       num_oov_buckets=self._oov_buckets,
       hasher_spec=lookup_ops.FastHashSpec)
   self.embeddings = tf.Variable(
       tf.random.uniform(shape=[self._total_size, emb_dim]))
   self.variables = [self.embeddings]
   self.trainable_variables = self.variables
Code Example #42
  def test_index_table_from_file_with_oov_buckets(self):
    vocabulary_file = self._createVocabFile("f2i_vocab5.txt")
    with self.test_session():
      table = lookup_ops.index_table_from_file(
          vocabulary_file=vocabulary_file, num_oov_buckets=1000)
      ids = table.lookup(
          constant_op.constant(["salad", "surgery", "tarkus", "toccata"]))

      self.assertRaises(errors_impl.OpError, ids.eval)
      lookup_ops.tables_initializer().run()
      self.assertAllEqual(
          (
              1,  # From vocabulary file.
              2,  # From vocabulary file.
              867,  # 3 + fingerprint("tarkus") mod 1000.
              860),  # 3 + fingerprint("toccata") mod 1000.
          ids.eval())
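
The OOV-bucket behaviour exercised by the test above can also be seen outside the test harness; the sketch below is not from the test suite, the vocab contents are assumptions, and the exact bucket ids depend on the hash, so they are not predicted here.

# Sketch: with num_oov_buckets > 0, in-vocabulary tokens map to their line
# index and OOV tokens map to vocab_size + (hash mod num_oov_buckets).
import tensorflow as tf
from tensorflow.python.ops import lookup_ops

with open("/tmp/oov_vocab.txt", "w") as f:
    f.write("brain\nsalad\nsurgery\n")

table = lookup_ops.index_table_from_file("/tmp/oov_vocab.txt",
                                         num_oov_buckets=1000)
ids = table.lookup(tf.constant(["salad", "surgery", "tarkus", "toccata"]))

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(ids))  # first two are 1 and 2; OOV ids land in [3, 1003)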