def get_dialog_examples(self, dataset):
    """Build the `InputExample`s for every dialogue of one data split.

    Args:
      dataset: str. Name of the split ("train", "dev", or "test"). It selects
        the sub-directory of `self.dstc8_data_dir` to read from and the entry
        of `self._file_ranges` that lists the dialogue file indices.

    Returns:
      examples: a list of `InputExample`s, concatenated in the order the
        dialogues were loaded.
    """
    # Collect the paths of all dialogue files belonging to this split.
    file_ids = self._file_ranges[dataset]
    dialog_paths = []
    for file_id in file_ids:
        file_name = "dialogues_{:03d}.json".format(file_id)
        dialog_paths.append(
            os.path.join(self.dstc8_data_dir, dataset, file_name))
    dialogs = load_dialogues(dialog_paths)

    # The schema file sits in the same directory as the split's dialogues.
    schemas = schema.Schema(
        os.path.join(self.dstc8_data_dir, dataset, "schema.json"))

    log = tf.compat.v1.logging
    examples = []
    for dialog_idx, dialog in enumerate(dialogs):
        # Progress message, emitted once every 1000 dialogues.
        log.log_every_n(log.INFO, "Processed %d dialogs.", 1000, dialog_idx)
        examples += self._create_examples_from_dialog(dialog, schemas, dataset)
    return examples
Example #2
0
def _create_schema_embeddings(bert_config, schema_embedding_file):
  """Embed the schema of `FLAGS.dataset_split` with BERT and save the result.

  Creates `FLAGS.schema_embedding_dir` if it does not exist yet, builds a
  TPUEstimator around the schema-embedding model function and hands it to a
  `SchemaEmbeddingGenerator`, which writes the embeddings.

  Args:
    bert_config: BERT configuration forwarded to
      `extract_schema_embedding.model_fn_builder`.
    schema_embedding_file: Destination forwarded to
      `SchemaEmbeddingGenerator.save_embeddings`.
  """
  embedding_dir = FLAGS.schema_embedding_dir
  if not tf.io.gfile.exists(embedding_dir):
    tf.io.gfile.makedirs(embedding_dir)

  tpu_config = tf.contrib.tpu.TPUConfig(
      num_shards=FLAGS.num_tpu_cores,
      per_host_input_for_training=(
          tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
  run_config = tf.contrib.tpu.RunConfig(
      master=FLAGS.master, tpu_config=tpu_config)

  service_schemas = schema.Schema(
      os.path.join(FLAGS.dstc8_data_dir, FLAGS.dataset_split, "schema.json"))

  # BERT is used to embed the natural language descriptions of the schema.
  model_fn = extract_schema_embedding.model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=os.path.join(FLAGS.bert_ckpt_dir, "bert_model.ckpt"),
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)
  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      predict_batch_size=FLAGS.predict_batch_size)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=os.path.join(FLAGS.bert_ckpt_dir, "vocab.txt"),
      do_lower_case=FLAGS.do_lower_case)
  generator = extract_schema_embedding.SchemaEmbeddingGenerator(
      tokenizer, estimator, FLAGS.max_seq_length)
  generator.save_embeddings(service_schemas, schema_embedding_file)
def main(_):
    """Convert the input dialogues into per-split DSTC-style json files.

    Reads `data.json` from `FLAGS.input_data_dir`. For every
    `(output_dir_name, file_name)` pair in `_PATH_MAPPING` it creates the
    matching sub-directory of `FLAGS.output_dir` (or of `_DIR_PATH` when the
    flag is empty), saves the schema there, converts the dialogues of that
    split and writes them in chunks of `_NUM_DIALS_PER_FILE` dialogues to
    `dialogues_001.json`, `dialogues_002.json`, ...

    Args:
      _: Unused.
    """
    schemas = schema.Schema(os.path.join(_DIR_PATH, FLAGS.schema_file_name))
    processor = Processor(schemas)
    data_path = os.path.join(FLAGS.input_data_dir, 'data.json')
    with tf.io.gfile.GFile(data_path, 'r') as f:
        data = json.load(f)
    dev_test_ids = []
    output_dir = FLAGS.output_dir or _DIR_PATH
    # Generate dev and test set according to the ids listed in the files. Ids
    # not included in the dev and test id list files belong to the training
    # set, so the entry without an id file must come after the ones with one.
    for output_dir_name, file_name in _PATH_MAPPING:
        output_sub_dir = os.path.join(output_dir, output_dir_name)
        if not tf.io.gfile.exists(output_sub_dir):
            tf.io.gfile.makedirs(output_sub_dir)
        schemas.save_to_file(os.path.join(output_sub_dir, 'schema.json'))
        if file_name:
            id_list_path = os.path.join(FLAGS.input_data_dir, file_name)
            with tf.io.gfile.GFile(id_list_path) as f:
                dial_ids = [id_name.strip() for id_name in f.readlines()]
            dev_test_ids.extend(dial_ids)
        else:
            # Generate the ids for the training set. They are sorted because
            # the iteration order of a set of strings depends on per-process
            # hash randomization; without sorting, the contents of each
            # output file would change from one run to the next.
            dial_ids = sorted(set(data) - set(dev_test_ids))
        converted_dials = processor.convert_to_dstc(dial_ids, data)
        logging.info('Unfound slot span ratio %s',
                     processor.unfound_slot_span_ratio)
        logging.info('Writing %d dialogs to %s', len(converted_dials),
                     output_sub_dir)
        for start in range(0, len(converted_dials), _NUM_DIALS_PER_FILE):
            # File names are 1-based; integer division avoids a float
            # round-trip when computing the index.
            file_index = start // _NUM_DIALS_PER_FILE + 1
            # Create a new json file and save the dialogues.
            json_file_path = os.path.join(
                output_sub_dir, 'dialogues_{:03d}.json'.format(file_index))
            dialogs_list = converted_dials[start:start + _NUM_DIALS_PER_FILE]
            with tf.io.gfile.GFile(json_file_path, 'w') as f:
                json.dump(dialogs_list,
                          f,
                          indent=2,
                          separators=(',', ': '),
                          sort_keys=True)
            logging.info('Created %s with %d dialogues.', json_file_path,
                         len(dialogs_list))
def write_predictions_to_file(predictions, input_json_files, schema_json_file,
                              output_dir):
    """Dump the predicted dialogues as json files.

    For every input file, a file with the same base name is written to
    `output_dir`; it holds the result of `get_predicted_dialog` for each
    dialogue of the input file.

    Args:
      predictions: An iterator containing model predictions. This is the
        output of the predict method in the estimator.
      input_json_files: A list of json paths containing the dialogues to run
        inference on.
      schema_json_file: Path for the json file containing the schemas.
      output_dir: The directory where output json files will be created.
    """
    log = tf.compat.v1.logging
    log.info("Writing predictions to %s.", output_dir)
    schemas = schema.Schema(schema_json_file)

    # Key every real prediction by (dialog id, turn id, service name), taken
    # from the last three "-"-separated fields of its example id.
    all_predictions = {}
    for idx, prediction in enumerate(predictions):
        if prediction["is_real_example"]:
            log.log_every_n(log.INFO, "Processed %d examples.", 500, idx)
            example_id = prediction["example_id"].decode("utf-8")
            _, dialog_id, turn_id, service_name = example_id.split("-")
            all_predictions[(dialog_id, turn_id, service_name)] = prediction

    # Read each input file and write its predictions.
    for input_file_path in input_json_files:
        with tf.io.gfile.GFile(input_file_path) as f:
            dialogs = json.load(f)
        pred_dialogs = [
            get_predicted_dialog(d, all_predictions, schemas) for d in dialogs
        ]
        output_file_path = os.path.join(output_dir,
                                        os.path.basename(input_file_path))
        with tf.io.gfile.GFile(output_file_path, "w") as f:
            json.dump(pred_dialogs,
                      f,
                      indent=2,
                      separators=(",", ": "),
                      sort_keys=True)