def check_vocab(vocab_file,
                out_dir,
                check_special_token=True,
                sos=None,
                eos=None,
                unk=None):
    """Check if vocab_file doesn't exist, create from corpus_file."""
    if tf.gfile.Exists(vocab_file):
        utils.print_out("# Vocab file %s exists" % vocab_file)
        vocab, vocab_size = load_vocab(vocab_file)
        if check_special_token:
            # Verify if the vocab starts with unk, sos, eos
            # If not, prepend those tokens & generate a new vocab file
            if not unk: unk = UNK
            if not sos: sos = SOS
            if not eos: eos = EOS
            assert len(vocab) >= 3
            if vocab[0] != unk or vocab[1] != sos or vocab[2] != eos:
                utils.print_out("The first 3 vocab words [%s, %s, %s]"
                                " are not [%s, %s, %s]" %
                                (vocab[0], vocab[1], vocab[2], unk, sos, eos))
                vocab = [unk, sos, eos] + vocab
                vocab_size += 3
                new_vocab_file = os.path.join(out_dir,
                                              os.path.basename(vocab_file))
                with codecs.getwriter("utf-8")(tf.gfile.GFile(
                        new_vocab_file, "wb")) as f:
                    for word in vocab:
                        f.write("%s\n" % word)
                vocab_file = new_vocab_file
    else:
        raise ValueError("vocab_file '%s' does not exist." % vocab_file)

    vocab_size = len(vocab)
    return vocab_size, vocab_file
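A minimal usage sketch (the path and out_dir are hypothetical; unk/sos/eos fall back to the module-level UNK/SOS/EOS defaults when not passed):

vocab_size, vocab_file = check_vocab("data/vocab.txt", out_dir="out_dir",
                                     check_special_token=True)
utils.print_out("# vocab size %d (%s)" % (vocab_size, vocab_file))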
Example #2
def main():
    args = get_args()

    if args.train:
        train(args.model_name, args.restore)
    else:
        import_lib()
        dataset = Dataset.Dataset()
        model = PHVM.PHVM(len(dataset.vocab.id2featCate),
                          len(dataset.vocab.id2featVal),
                          len(dataset.vocab.id2word),
                          len(dataset.vocab.id2category),
                          key_wordvec=None,
                          val_wordvec=None,
                          tgt_wordvec=dataset.vocab.id2vec,
                          type_vocab_size=len(dataset.vocab.id2type))

        best_checkpoint_dir = config.checkpoint_dir + "/" + args.model_name + config.best_model_dir
        tmp_checkpoint_dir = config.checkpoint_dir + "/" + args.model_name + config.tmp_model_dir
        model_utils.restore_model(model, best_checkpoint_dir,
                                  tmp_checkpoint_dir)

        dataset.prepare_dataset()
        texts = infer(model, dataset, dataset.test)
        dump(texts, config.result_dir + "/{}.json".format(args.model_name))
        utils.print_out("finish file test")
Example #3
    def save(self):
        hparams_file = os.path.join(
            self.model_dir, "{}_config.yml".format(file_name(self.config)))
        print_out("  saving config to %s" % hparams_file)

        to_dump_dict = dict(self.__dict__)
        if to_dump_dict['train_data']:
            to_dump_dict['train_data'] = os.path.abspath(
                to_dump_dict['train_data'])
        if to_dump_dict['test_data']:
            to_dump_dict['test_data'] = os.path.abspath(
                to_dump_dict['test_data'])
        if to_dump_dict['dev_data']:
            to_dump_dict['dev_data'] = os.path.abspath(
                to_dump_dict['dev_data'])
        if to_dump_dict['pretrain_data']:
            to_dump_dict['pretrain_data'] = os.path.abspath(
                to_dump_dict['pretrain_data'])
        else:
            to_dump_dict.pop('pretrain_data')
        if to_dump_dict['vocab_file']:
            to_dump_dict['vocab_file'] = os.path.abspath(
                to_dump_dict['vocab_file'])

        with codecs.getwriter("utf-8")(open(hparams_file, "wb")) as f:
            yaml.dump(to_dump_dict, f, default_flow_style=False)
def _cell_list(unit_type,
               num_units,
               num_layers,
               forget_bias,
               dropout,
               mode,
               num_gpus,
               base_gpu=0,
               single_cell_fn=None):
    """Create a list of RNN cells."""
    if not single_cell_fn:
        single_cell_fn = _single_cell

    # Multi-GPU
    cell_list = []
    for i in range(num_layers):
        utils.print_out("  cell %d" % i, new_line=False)
        single_cell = single_cell_fn(unit_type=unit_type,
                                     num_units=num_units,
                                     forget_bias=forget_bias,
                                     dropout=dropout,
                                     mode=mode,
                                     device_str=get_device_str(
                                         i + base_gpu, num_gpus))
        utils.print_out("")
        cell_list.append(single_cell)

    return cell_list
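For context, the returned cells are usually wrapped into one multi-layer cell, much as _build_cell does further below. A sketch assuming TF 1.x and that the surrounding helpers (_single_cell, get_device_str) are defined:

cells = _cell_list(unit_type="lstm", num_units=512, num_layers=2,
                   forget_bias=1.0, dropout=0.2,
                   mode=tf.contrib.learn.ModeKeys.TRAIN, num_gpus=1)
# Single layer: use the cell directly; otherwise stack the layers.
encoder_cell = cells[0] if len(cells) == 1 else tf.nn.rnn_cell.MultiRNNCell(cells)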
Example #5
def print_step_info(prefix, global_step, info, result_summary, log_f):
    """Print all info at the current global step."""
    utils.print_out(
        "%sstep %d lr %g step-time %.2fs wps %.2fK gN %.2f %s, %s" %
        (prefix, global_step, info["learning_rate"], info["avg_step_time"],
         info["speed"], info["avg_grad_norm"], result_summary, time.ctime()),
        log_f)
Example #6
def before_train(loaded_train_model, train_model, train_sess, global_step,
                 hparams, log_f):
    """Misc tasks to do before training."""
    stats = init_stats()
    info = {
        "train_ppl": 0.0,
        "speed": 0.0,
        "avg_step_time": 0.0,
        "avg_grad_norm": 0.0,
        "avg_sequence_count": 0.0,
        "learning_rate":
        loaded_train_model.learning_rate.eval(session=train_sess)
    }
    start_train_time = time.time()
    utils.print_out(
        "# Start step %d, lr %g, %s" %
        (global_step, info["learning_rate"], time.ctime()), log_f)

    # Initialize all of the iterators
    skip_count = hparams.batch_size * hparams.epoch_step
    utils.print_out("# Init train iterator, skipping %d elements" % skip_count)
    train_sess.run(train_model.iterator.initializer,
                   feed_dict={train_model.skip_count_placeholder: skip_count})

    return stats, info, start_train_time
Example #7
def eval(eval_model, eval_sess, model_dir, hparams, summary_writer, log_f):
    with eval_model.graph.as_default():
        eval_model, global_step, epoch_num = create_or_load_model(
            eval_model, model_dir, eval_sess, "eval")
        eval_sess.run(eval_model.model.iterator.initializer)
        eval_info = {}
        total_loss = 0
        total_predict_beat_count = 0
        total_sequence_count = 0
        while True:
            try:
                step_result = eval_model.model.eval(eval_sess)
                total_loss += step_result.eval_loss * step_result.predict_beat_count
                total_predict_beat_count += step_result.predict_beat_count
                total_sequence_count += step_result.batch_size
            except tf.errors.OutOfRangeError:
                eval_info['epoch_num'] = epoch_num
                eval_info[
                    'eval_avg_beat_loss'] = total_loss / total_predict_beat_count
                eval_info['eval_predict_beat_count'] = total_predict_beat_count
                eval_info['eval_sample_num'] = total_sequence_count
                utils.print_out(
                    "\neval: global step: %d, epoch_num: %d, eval_avg beat loss: %.2f, eval_predict_beat_count: %d, eval_sample_num: %d, time;%s\n"
                    % (global_step, eval_info['epoch_num'],
                       eval_info["eval_avg_beat_loss"],
                       eval_info['eval_predict_beat_count'],
                       eval_info['eval_sample_num'], time.ctime()), log_f)
                for key in eval_info:
                    summary_writer.add_summary(
                        tf.Summary(value=[
                            tf.Summary.Value(tag=key,
                                             simple_value=eval_info[key])
                        ]), global_step)
                break
Example #8
    def build_graph(self, hparams):
        utils.print_out("# Creating %s graph ..." % self.mode)

        with tf.variable_scope("network", dtype=self.dtype, reuse=tf.AUTO_REUSE):
            self.top_scope = tf.get_variable_scope()

            # Initializer
            initializer = tf.random_uniform_initializer(-hparams.init_weight, hparams.init_weight,
                                                        seed=hparams.random_seed)
            self.top_scope.set_initializer(initializer)
            # This initializer becomes the default for variables created in this scope.
            # ---------
            # The variable scope specification is left to the _encode function.
            # Components shared by all three modes:
            if self.architecture == "deepRNN":
                # lstm (bi_lstm, then stacked with uni_lstm)
                self.src_bi_lstm, self.src_bi_lstm_condition = self._build_bi_lstm(hparams)
                self.src_uni_lstm, self.src_uni_lstm_condition = self._build_uni_lstm(hparams)
                self.tgt_bi_lstm, self.tgt_bi_lstm_condition = self._build_bi_lstm(hparams)
                self.tgt_uni_lstm, self.tgt_uni_lstm_condition = self._build_uni_lstm(hparams)

                # Projector
                self.src_projector = self._build_projector(hparams, field='src')
                self.tgt_projector = self._build_projector(hparams, field='tgt')
            else:
                raise ValueError("Unknown architecture_type %s" % hparams.lstm_type)

            "------------"

            # set mode-specific component
            self.set_mode_phase(hparams)

            # Saver
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
Example #9
 def eval(self,T,dev_data,hparams,sess):
     preds=self.infer(dev_data)
     if hparams.metric=='logloss':
         log_loss=metrics.log_loss(dev_data[1],preds)
         if self.best_score>log_loss:
             self.best_score=log_loss
             try:
                 os.makedirs('model_tmp/')
             except OSError:
                 pass
             self.saver.save(sess,'model_tmp/model')
         utils.print_out("# Epcho-time %.2fs Eval logloss %.6f. Best logloss %.6f." \
                         %(T,log_loss,self.best_score))
     elif hparams.metric=='auc':
         fpr, tpr, thresholds = metrics.roc_curve(dev_data[1]+1, preds, pos_label=2)
         auc=metrics.auc(fpr, tpr)
         if self.best_score<auc:
             self.best_score=auc
             try:
                 os.makedirs('model_tmp/')
             except OSError:
                 pass
             self.saver.save(sess,'model_tmp/model')                           
         utils.print_out("# Epcho-time %.2fs Eval AUC %.6f. Best AUC %.6f." \
                         %(T,auc,self.best_score))  
Example #10
    def __init__(self):
        self.config = Config.config
        if not os.path.exists(self.config.vocab_file):
            pickle.dump(Vocabulary.Vocabulary(),
                        open(self.config.vocab_file, "wb"))
        self.vocab = pickle.load(open(self.config.vocab_file, "rb"))
        utils.print_out("finish reading vocab : {}".format(
            len(self.vocab.id2word)))
        self.cate2FK = {
            "裙": [
                "类型", "版型", "材质", "颜色", "风格", "图案", "裙型", "裙下摆", "裙腰型", "裙长",
                "裙衣长", "裙袖长", "裙领型", "裙袖型", "裙衣门襟", "裙款式"
            ],
            "裤": [
                "类型", "版型", "材质", "颜色", "风格", "图案", "裤长", "裤型", "裤款式", "裤腰型",
                "裤口"
            ],
            "上衣": [
                "类型", "版型", "材质", "颜色", "风格", "图案", "衣样式", "衣领型", "衣长", "衣袖长",
                "衣袖型", "衣门襟", "衣款式"
            ]
        }
        for key, val in self.cate2FK.items():
            self.cate2FK[key] = dict(zip(val, range(len(val))))

        self.input_graph = tf.Graph()
        with self.input_graph.as_default():
            proto = tf.ConfigProto()
            proto.gpu_options.allow_growth = True
            self.input_sess = tf.Session(config=proto)
            self.prepare_dataset()
def print_variables_in_ckpt(ckpt_path):
    """Print a list of variables in a checkpoint together with their shapes."""
    utils.print_out("# Variables in ckpt %s" % ckpt_path)
    reader = tf.train.NewCheckpointReader(ckpt_path)
    variable_map = reader.get_variable_to_shape_map()
    for key in sorted(variable_map.keys()):
        utils.print_out("  %s: %s" % (key, variable_map[key]))
Example #12
def compute_perplexity(model, sess, name):
    """Compute perplexity of the output of the model.
    Args:
      model: model to compute perplexity for.
      sess: tensorflow session to use.
      name: name of the batch.
    Returns:
      The perplexity of the eval outputs.
    """
    total_loss = 0
    total_predict_count = 0
    start_time = time.time()
    step = 0

    while True:
        try:
            loss, predict_count, batch_size = model.eval(sess)
            total_loss += loss * batch_size
            total_predict_count += predict_count
            step += 1
            if step % 500 == 0:
                ls = total_loss / total_predict_count
                ppl = misc.safe_exp(ls)
                print_out("    ## After %d steps, loss %.2f - ppl %.3f" %
                          (step, ls, ppl))
        except tf.errors.OutOfRangeError:
            break

    perplexity = safe_exp(total_loss / total_predict_count)
    print_time("  eval %s: perplexity %.2f" % (name, perplexity), start_time)
    return perplexity
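A usage sketch, assuming model.eval(sess) returns (loss, predict_count, batch_size) and the model owns an initializable iterator (names are illustrative):

eval_sess.run(model.iterator.initializer)  # rewind the eval dataset
dev_ppl = compute_perplexity(model, eval_sess, "dev")
utils.print_out("# dev perplexity %.2f" % dev_ppl)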
Example #13
def load_model(model, ckpt, session, name):
    start_time = time.time()
    model.saver.restore(session, ckpt)
    session.run(tf.tables_initializer())
    print_out("  loaded %s model parameters from %s, time %.2fs" %
              (name, ckpt, time.time() - start_time))
    return model
Example #14
def ensure_compatible_hparams(hparams, default_hparams, hparams_path=""):
  """Make sure the loaded hparams is compatible with new changes."""
  default_hparams = utils.maybe_parse_standard_hparams(
      default_hparams, hparams_path)

  # Set num encoder/decoder layers (for old checkpoints)
  if hasattr(hparams, "num_layers"):
    if not hasattr(hparams, "num_encoder_layers"):
      hparams.add_hparam("num_encoder_layers", hparams.num_layers)
    if not hasattr(hparams, "num_decoder_layers"):
      hparams.add_hparam("num_decoder_layers", hparams.num_layers)

  # For compatibility, if there are new fields in default_hparams,
  #   we add them to the current hparams.
  default_config = default_hparams.values()
  config = hparams.values()
  for key in default_config:
    if key not in config:
      hparams.add_hparam(key, default_config[key])

  # Update all hparams' keys if override_loaded_hparams=True
  if getattr(default_hparams, "override_loaded_hparams", None):
    overwritten_keys = default_config.keys()
  else:
    # For inference
    overwritten_keys = INFERENCE_KEYS

  for key in overwritten_keys:
    if getattr(hparams, key) != default_config[key]:
      utils.print_out("# Updating hparams.%s: %s -> %s" %
                      (key, str(getattr(hparams, key)),
                       str(default_config[key])))
      setattr(hparams, key, default_config[key])
  return hparams
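A sketch of the usual call site, following the TensorFlow NMT recipe this mirrors; utils.load_hparams and the out_dir layout are assumptions here:

hparams = utils.load_hparams(out_dir)
if hparams is None:
    hparams = default_hparams
else:
    hparams = ensure_compatible_hparams(hparams, default_hparams, hparams_path)
utils.save_hparams(out_dir, hparams)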
Example #15
def write_tree(affs, filename):
    # Write the affiliation trees wrapped in an <affs> root element.
    with open(filename, 'w') as f:
        print('<affs>', file=f)
        for aff in affs:
            print_out(aff, where=f)
        print('</affs>', file=f)
Example #16
 def _build_uni_lstm(self, hparams):
     utils.print_out("# Build unidirectional lstm")
     num_uni_layers = self.num_uni_layers
     num_uni_residual_layers = self.num_uni_layers - 1
     utils.print_out("  num_layers = %d, num_residual_layers=%d" % (num_uni_layers, num_uni_residual_layers))
     cell = self._build_cell(num_uni_layers, num_uni_residual_layers)
     uni_lstm = cell
     uni_lstm_condition = ("uni", None)
     return uni_lstm, uni_lstm_condition
Example #17
 def _build_bi_lstm(self, hparams):
     utils.print_out("# Build bidirectional lstm")
     num_bi_layers = self.num_bi_layers
     num_bi_residual_layers = 0
     utils.print_out("  num_bi_layers = %d, num_bi_residual_layers=%d" % (num_bi_layers, num_bi_residual_layers))
     # Construct forward and backward cells
     fw_cell = self._build_cell(num_bi_layers, num_bi_residual_layers)
     bw_cell = self._build_cell(num_bi_layers, num_bi_residual_layers)
     bi_lstm = (fw_cell, bw_cell)
     bi_lstm_condition = ("bi", num_bi_layers)
     return bi_lstm, bi_lstm_condition
Example #18
    def gnmt_encoder(self):
        print_out("build gnmt encoder")
        with tf.variable_scope("gnmt_encoder") as scope:
            inputs = tf.transpose(self.source_embedding,[1,0,2])
            inputs_reverse = _reverse(
                inputs, seq_lengths=self.sequence_length,
                seq_dim=0, batch_dim=1)
            encoder_states = []
            outputs = [inputs]

            with tf.variable_scope("fw") as s:
                cell = tf.contrib.rnn.LSTMBlockFusedCell(self.hparams.num_units,use_peephole=False)
                fused_outputs_op, fused_state_op = cell(inputs,sequence_length=self.sequence_length,dtype=inputs.dtype)
                encoder_states.append(fused_state_op)
                outputs.append(fused_outputs_op)
            
            with tf.variable_scope('bw') as s:
                bw_cell = tf.contrib.rnn.LSTMBlockFusedCell(self.hparams.num_units,use_peephole=False)
                bw_fused_outputs_op, bw_fused_state_op = bw_cell(inputs_reverse,sequence_length=self.sequence_length,dtype=inputs.dtype)
                bw_fused_outputs_op = _reverse(
                    bw_fused_outputs_op, seq_lengths=self.sequence_length,
                    seq_dim=0, batch_dim=1)
                encoder_states.append(bw_fused_state_op)
                outputs.append(bw_fused_outputs_op)

            with tf.variable_scope("uni") as s:
                uni_inputs = tf.concat([fused_outputs_op,bw_fused_outputs_op],axis=-1)
                for i in range(self.hparams.num_layers-1):
                    with tf.variable_scope("layer_%d" % i) as scope:
                        uni_cell =  tf.contrib.rnn.LSTMBlockFusedCell(self.hparams.num_units,use_peephole=False)
                        uni_fused_outputs_op, uni_fused_state_op = uni_cell(uni_inputs,sequence_length=self.sequence_length,dtype=inputs.dtype)
                        encoder_states.append(uni_fused_state_op)
                        outputs.append(uni_fused_outputs_op)
                        if i > 0:
                            uni_fused_outputs_op = uni_fused_outputs_op + uni_inputs
                        uni_inputs = uni_fused_outputs_op

            final_output = None
            # embedding + fw + bw + uni
            n = 3 + self.hparams.num_layers - 1
            scalars = tf.get_variable('scalar',initializer=tf.constant([1/(n)]*n))
            self.scalars = scalars
            weight = tf.get_variable('weight',initializer=tf.constant(0.001))
            self.weight = weight
            
            soft_scalars = tf.nn.softmax(scalars)
            for i, output in enumerate(outputs):
                if final_output is None:
                    final_output = soft_scalars[i] * tf.transpose(output,[1,0,2])
                else:
                    final_output = final_output + soft_scalars[i] * tf.transpose(output,[1,0,2])

            self.final_outputs = weight * final_output
            self.final_state = tuple(encoder_states)
    def build_graph(self, hparams, scope=None):
        """Subclass must implement this method.

    Creates a sequence-to-sequence model with dynamic RNN decoder API.
    Args:
      hparams: Hyperparameter configurations.
      scope: VariableScope for the created subgraph; default "dynamic_seq2seq".

    Returns:
      A tuple of the form (logits, loss_tuple, final_context_state, sample_id),
      where:
        logits: float32 Tensor [batch_size x num_decoder_symbols].
        loss: loss = the total loss / batch_size.    
    """
        utils.print_out("\n# Creating %s graph ..." % self.mode)

        with tf.variable_scope(scope or "rnn", dtype=self.dtype):
            # Encoder
            self.encoder_outputs, encoder_state = self._build_encoder(hparams)
            fw_state, bw_state = encoder_state
            print('encoder_outputs: ', self.encoder_outputs.shape)
            print('fw_state.h: ', fw_state.h.shape)
            print('bw_state.h: ', bw_state.h.shape)

            # Linear layer for classification of intent
            encoder_last_state = tf.concat([fw_state.h, bw_state.h], axis=1)
            print('encoder_last_state: ', encoder_last_state.shape)
            print()

            encoder_output_size = encoder_last_state.get_shape()[1].value
            print('encoder_output_size: ', encoder_output_size)
            w = tf.get_variable('w',
                                [encoder_output_size, self.lbl_vocab_size],
                                dtype=tf.float32)
            w_t = tf.transpose(w)
            v = tf.get_variable('v', [self.lbl_vocab_size], dtype=tf.float32)

            # apply the linear layer
            label_logits = tf.nn.xw_plus_b(encoder_last_state, w, v)
            label_pred = tf.argmax(label_logits, 1)
            print('label_scores: ', label_logits.shape)
            print()

            ## Loss
            if self.mode != tf.contrib.learn.ModeKeys.INFER:
                with tf.device(
                        model_helper.get_device_str(
                            self.num_encoder_layers - 1, self.num_gpus)):
                    loss = self._compute_loss(label_logits)
            else:
                loss = tf.constant(0.0)

            return label_logits, loss, label_pred
Example #20
    def elmo_encoder(self):
        print_out("build elmo encoder")
        with tf.variable_scope("elmo_encoder") as scope:
            inputs = tf.transpose(self.source_embedding,[1,0,2])
            inputs_reverse = _reverse(
                inputs, seq_lengths=self.sequence_length,
                seq_dim=0, batch_dim=1)
            encoder_states = []
            outputs = [tf.concat([inputs,inputs],axis=-1)]
            fw_cell_inputs = inputs
            bw_cell_inputs = inputs_reverse
            for i in range(self.hparams.num_layers):
                with tf.variable_scope("fw_%d" % i) as s:
                    cell = tf.contrib.rnn.LSTMBlockFusedCell(self.hparams.num_units,use_peephole=False)
                    fused_outputs_op, fused_state_op = cell(fw_cell_inputs,sequence_length=self.sequence_length,dtype=inputs.dtype)
                    encoder_states.append(fused_state_op)
                with tf.variable_scope("bw_%d" % i) as s:
                    bw_cell = tf.contrib.rnn.LSTMBlockFusedCell(self.hparams.num_units,use_peephole=False)
                    bw_fused_outputs_op_reverse, bw_fused_state_op = bw_cell(bw_cell_inputs,sequence_length=self.sequence_length,dtype=inputs.dtype)
                    bw_fused_outputs_op = _reverse(
                        bw_fused_outputs_op_reverse, seq_lengths=self.sequence_length,
                        seq_dim=0, batch_dim=1)
                    encoder_states.append(bw_fused_state_op)
                output = tf.concat([fused_outputs_op,bw_fused_outputs_op],axis=-1)
                if i > 0:
                    fw_cell_inputs = output + fw_cell_inputs
                    bw_cell_inputs = _reverse(
                        output, seq_lengths=self.sequence_length,
                        seq_dim=0, batch_dim=1) + bw_cell_inputs
                else:
                    fw_cell_inputs = output
                    bw_cell_inputs = _reverse(
                        output, seq_lengths=self.sequence_length,
                        seq_dim=0, batch_dim=1)
                outputs.append(output)
            
            final_output = None
            # embedding + num_layers
            n = 1 + self.hparams.num_layers
            scalars = tf.get_variable('scalar',initializer=tf.constant([1/(n)]*n))
            self.scalars = scalars
            weight = tf.get_variable('weight',initializer=tf.constant(0.001))
            self.weight = weight

            soft_scalars = tf.nn.softmax(scalars)
            for i, output in enumerate(outputs):
                if final_output is None:
                    final_output = soft_scalars[i] * tf.transpose(output,[1,0,2])
                else:
                    final_output = final_output + soft_scalars[i] * tf.transpose(output,[1,0,2])

            self.final_outputs = weight * final_output
            self.final_state = tuple(encoder_states)
Example #21
    def init_embeddings(self, vocab_file, embedding_type, embedding_size, dtype=tf.float32, scope=None):
        vocab_list, vocab_size = vocab.load_vocab(vocab_file)

        with tf.variable_scope(scope or "embeddings", dtype=dtype):
            sqrt3 = math.sqrt(3)
            if embedding_type == 'random':
                print_out('# Using random embedding.')
                self.embeddings = tf.get_variable("emb_random_mat",
                                                  shape=[vocab_size, embedding_size],
                                                  initializer=tf.random_uniform_initializer(minval=-sqrt3, maxval=sqrt3, dtype=dtype))
            else:
                print_out('# Using pretrained embedding: %s.' % embedding_type)
Example #22
def create_or_load_model(model, ckpt_dir, session, name):
    latest_ckpt = tf.train.latest_checkpoint(ckpt_dir)
    if latest_ckpt:
        model = load_model(model, latest_ckpt, session, name)
    else:
        start_time = time.time()
        session.run(tf.global_variables_initializer())
        utils.print_out("created %s model with fresh parameters, time %.2fs" %
                        (name, time.time() - start_time))

    global_step = session.run(model.global_step)
    return model, global_step
Example #23
def remove_tags(root, aratio, cratio, acratio):
    for aff in root:
        tags = set([elem.tag for elem in aff])
        if not set(['country', 'addr-line', 'institution']) <= tags:
            print_out(aff) # Tag missing already

        if random.random() <= aratio:
            remove_elems('addr-line', aff)
        elif random.random() <= cratio:
            remove_elems('country', aff)
        elif random.random() <= acratio:
            remove_elems('addr-line', aff)
            remove_elems('country', aff)
Example #24
 def __init__(self, hparams):
     self.hparams = hparams
     if hparams.metric in ['logloss']:
         self.best_score = 100000
     else:
         self.best_score = 0
     self.build_graph(hparams)
     self.optimizer(hparams)
     params = tf.trainable_variables()
     utils.print_out("# Trainable variables")
     for param in params:
         utils.print_out(
             "  %s, %s, %s" %
             (param.name, str(param.get_shape()), param.op.device))
Example #25
def check_and_save_hparams(out_dir, hparams):
    """Save hparams."""
    hparams_file = os.path.join(out_dir, "hparams")
    if tf.gfile.Exists(hparams_file):
        with codecs.getreader("utf-8")(tf.gfile.GFile(hparams_file, "rb")) as f:
            origin_hparams = json.load(f)
            origin_hparams = tf.contrib.training.HParams(**origin_hparams)
        wrong_keys = []
        keys = set(list(hparams.values().keys()) + (list(origin_hparams.values().keys())))
        for key in keys:
            if (hparams.values().get(key, None) != origin_hparams.values().get(key, None) or
                    hparams.values().get(key, None) is None or
                    origin_hparams.values().get(key, None) is None):
                wrong_keys.append(key)
        if origin_hparams.values() == hparams.values():
            utils.print_out("using the same hparams as the existing %s" % hparams_file)
        else:
            utils.print_out("new hparams do not match the existing ones")
            for wrong_key in wrong_keys:
                utils.print_out(
                    " key: %s, \norigin_value: %s, \nnew_value: %s\n" %
                    (wrong_key, origin_hparams.values().get(wrong_key),
                     hparams.values().get(wrong_key)))
            raise ValueError("new hparams do not match the existing %s" % hparams_file)
    else:
        utils.print_out("  not old hparams found, create new hparams to %s" % hparams_file)
        with codecs.getwriter("utf-8")(tf.gfile.GFile(hparams_file, "wb")) as f:
            f.write(hparams.to_json(indent=4))
Example #26
 def _get_infer_maximum_iterations(self, hparams, source_sequence_length):
     """Maximum decoding steps at inference time."""
     if hparams.tgt_max_len_infer:
         maximum_iterations = hparams.tgt_max_len_infer
         utils.print_out("  decoding maximum_iterations %d" %
                         maximum_iterations)
     else:
         # TODO(thangluong): add decoding_length_factor flag
         decoding_length_factor = 2.0
         max_encoder_length = tf.reduce_max(source_sequence_length)
         maximum_iterations = tf.to_int32(
             tf.round(
                 tf.to_float(max_encoder_length) * decoding_length_factor))
     return maximum_iterations
def create_or_load_model(model, model_dir, session, name):
    """Create model and initialize or load parameters in session."""
    latest_ckpt = tf.train.latest_checkpoint(model_dir)
    if latest_ckpt:
        model = load_model(model, latest_ckpt, session, name)
    else:
        start_time = time.time()
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        utils.print_out(
            "  created %s model with fresh parameters, time %.2fs" %
            (name, time.time() - start_time))

    global_step = model.global_step.eval(session=session)
    return model, global_step
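For reference, a typical call site wraps this in the model's graph; a sketch with illustrative names matching before_train above:

with train_model.graph.as_default():
    loaded_train_model, global_step = create_or_load_model(
        train_model.model, hparams.out_dir, train_sess, "train")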
Example #28
def _external_eval(model, global_step, sess, hparams, iterator,
                   iterator_feed_dict, tgt_file, lbl_file, label,
                   summary_writer, save_on_best):
    """External evaluation such as BLEU and ROUGE scores."""
    out_dir = hparams.out_dir
    decode = global_step > 0

    if decode:
        utils.print_out("# External evaluation, global step %d" % global_step)

    sess.run(iterator.initializer, feed_dict=iterator_feed_dict)

    slot_output = os.path.join(out_dir, "slot_output_%s" % label)
    intent_output = os.path.join(out_dir, "intent_output_%s" % label)
    scores = nmt_utils.decode_and_evaluate(
        label,
        model,
        sess,
        slot_output,
        intent_output,
        ref_file=tgt_file,
        ref_lbl_file=lbl_file,
        metrics=hparams.metrics,
        subword_option=hparams.subword_option,
        beam_width=hparams.beam_width,
        tgt_eos=hparams.eos,
        task=hparams.task,
        decode=decode,
        infer_mode=hparams.infer_mode)
    # Save on best metrics
    if decode:
        for metric in hparams.metrics:
            best_metric_label = "best_" + metric

            utils.add_summary(summary_writer, global_step,
                              "%s_%s" % (label, metric), scores[metric])
            # metric: larger is better
            if save_on_best and scores[metric] > getattr(
                    hparams, best_metric_label):
                setattr(hparams, best_metric_label, scores[metric])
                model.saver.save(sess,
                                 os.path.join(
                                     getattr(hparams,
                                             best_metric_label + "_dir"),
                                     "translate.ckpt"),
                                 global_step=model.global_step)
        utils.save_hparams(out_dir, hparams)
    return scores
Example #29
    def _preprocess(self):
        print_out("# Start to preprocessing data...")
        content = _tokenize(self.data, self.w2i, self.max_len, self.reverse,
                            self.split_word)
        item_labels = []
        for label_name in self.label_names:
            labels = [""]
            labels = self.get_label(labels, self.tag_l2i)
            item_labels.append(labels)
        self._raw_data.append(
            DataItem(content=content,
                     labels=np.asarray(item_labels),
                     length=len(content),
                     id=int("0")))

        self.num_batches = 1
        self.data_size = len(self._raw_data)
Example #30
    def _get_learning_rate_decay(self, hparams):
        """Get learning rate decay."""
        start_decay_step, decay_steps, decay_factor = self._get_decay_info(
            hparams)
        utils.print_out(
            "  decay_scheme=%s, start_decay_step=%d, decay_steps %d, "
            "decay_factor %g" % (hparams.decay_scheme, start_decay_step,
                                 decay_steps, decay_factor))

        return tf.cond(
            self.global_step < start_decay_step,
            lambda: self.learning_rate,
            lambda: tf.train.exponential_decay(
                self.learning_rate,
                self.global_step - start_decay_step,
                decay_steps,
                decay_factor,
                staircase=True),
            name="learning_rate_decay_cond")
Example #31
    def _build_cell(self, num_layers, num_residual_layers):
        cell_list = []
        for i in range(num_layers):
            utils.print_out("  cell %d " % i, new_line=False)
            single_cell = self._single_cell(
                unit_type=self.unit_type,
                num_units=self.num_units,
                forget_bias=self.forget_bias,
                dropout=self.dropout,
                mode=self.mode,
                residual_connection=(i >= (num_layers - num_residual_layers)))
            utils.print_out("", new_line=True)
            cell_list.append(single_cell)

        if len(cell_list) == 1:  # Single layer.
            return cell_list[0]
        else:  # Multi layers
            return tf.nn.rnn_cell.MultiRNNCell(cell_list)
Example #32
    def __init__(self, hparams, mode):
        self.mode = mode
        self.hparams = hparams
        params = tf.trainable_variables()
        #define placeholder
        self.vocab_table_word = lookup_ops.index_table_from_file(
            'pre_data/vocab_word.txt', default_value=0)
        self.vocab_table_char = lookup_ops.index_table_from_file(
            'pre_data/vocab_char.txt', default_value=0)
        self.norm_trainable = tf.placeholder(tf.bool)
        self.q1 = {}
        self.q2 = {}
        self.label = tf.placeholder(shape=(None, ), dtype=tf.float32)

        for q in [self.q1, self.q2]:
            q['words'] = tf.placeholder(shape=(None, None), dtype=tf.string)
            q['words_len'] = tf.placeholder(shape=(None, ), dtype=tf.int32)
            q['chars'] = tf.placeholder(shape=(None, None), dtype=tf.string)
            q['chars_len'] = tf.placeholder(shape=(None, ), dtype=tf.int32)
            q['words_num'] = tf.placeholder(
                shape=(None, len(hparams.word_num_features)), dtype=tf.float32)
            q['chars_num'] = tf.placeholder(
                shape=(None, len(hparams.char_num_features)), dtype=tf.float32)

        #build graph
        self.build_graph(hparams)

        #build optimizer
        self.optimizer(hparams)
        params = tf.trainable_variables()
        self.saver = tf.train.Saver(tf.global_variables())
        elmo_param = []
        for param in tf.global_variables():
            if 'elmo' in param.name and 'elmo/Variable' not in param.name:
                elmo_param.append(param)
        self.pretrain_saver = tf.train.Saver(elmo_param)
        utils.print_out("# Trainable variables")
        for param in params:
            if hparams.pretrain is False and 'elmo' in param.name:
                continue
            else:
                utils.print_out(
                    "  %s, %s, %s" %
                    (param.name, str(param.get_shape()), param.op.device))
Example #33
def change_country_by_dict(root):
    """ <country>123234</country> --> <addr-line>123234</addr-line>
        <country>Berlin</country> --> <addr-line>Berlin</addr-line>
    """
    country_keywords = set_from_file(COUNTRY_DICT, normal=True, split=True) \
            .union(set_from_file(DEPENDENT_DICT, normal=True, split=True))
    for k in list(country_keywords):
        if len(k) == 1:
            country_keywords.discard(k)
    #print country_keywords

    for aff in root:
        for elem in aff:
            if elem.tag == 'country':
                tokens = [normalize(t) for t in tokenize(elem.text, split_alphanum=True)]
                if (not any(t in country_keywords for t in tokens)
                        and elem.text.strip()
                        and elem.text.strip() not in ['P.', 'R.', 'O.', 'C.', ')', ',']):
                    elem.tag = 'addr-line'
                    print_out(elem)