Example #1
def main():
    schema_path = os.path.join(_DIR_PATH, args.schema_file_name)
    schemas = schema.Schema(schema_path)
    processor = Processor(schemas)
    data_path = os.path.join(args.input_data_dir, 'data.json')
    with open(data_path, 'r') as f:
        data = json.load(f)
    dev_test_ids = []
    output_dir = args.output_dir or _DIR_PATH
    # Generate the dev and test sets according to the ids listed in the id list
    # files. Ids not included in those files belong to the training set.
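    # _PATH_MAPPING pairs each output subdirectory name with the file listing its
    # dialogue ids; an empty file name marks the training split.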
    for output_dir_name, file_name in _PATH_MAPPING:
        output_sub_dir = os.path.join(output_dir, output_dir_name)
        if not os.path.exists(output_sub_dir):
            os.makedirs(output_sub_dir)
        schema_path = os.path.join(output_sub_dir, 'schema.json')
        schemas.save_to_file(schema_path)
        dial_ids = []
        if file_name:
            id_list_path = os.path.join(args.input_data_dir, file_name)
            with open(id_list_path) as f:
                dial_ids = [id_name.strip() for id_name in f.readlines()]
            dev_test_ids.extend(dial_ids)
        else:
            # Generate the ids for the training set.
            dial_ids = list(set(data.keys()) - set(dev_test_ids))
        converted_dials = processor.convert_to_dstc(dial_ids, data)
        logging.info('Unfound slot span ratio %s',
                     processor.unfound_slot_span_ratio)
        logging.info('Writing %d dialogs to %s', len(converted_dials),
                     output_sub_dir)
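        # Write the converted dialogues in chunks of _NUM_DIALS_PER_FILE per output file.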
        for i in range(0, len(converted_dials), _NUM_DIALS_PER_FILE):
            file_index = i // _NUM_DIALS_PER_FILE + 1
            # Create a new json file and save the dialogues.
            json_file_path = os.path.join(
                output_sub_dir, 'dialogues_{:03d}.json'.format(file_index))
            dialogs_list = converted_dials[i:i + _NUM_DIALS_PER_FILE]
            dialogs_list = change_to_nemo_id(dialogs_list, file_index)
            with open(json_file_path, 'w') as f:
                json.dump(dialogs_list,
                          f,
                          indent=2,
                          separators=(',', ': '),
                          sort_keys=True)
            logging.info('Created %s with %d dialogues.', json_file_path,
                         len(dialogs_list))
    def __init__(
        self,
        data_dir,
        schema_embedding_dir,
        schema_config,
        tokenizer,
        bert_model,
        overwrite_schema_emb_files,
        bert_ckpt_dir,
        nf,
        datasets=['train', 'test', 'dev'],
        mode='baseline',
        is_trainable=False,
    ):

        # Schema configuration. It provides the embedding dimension for intents,
        # slots and categorical slot values, as well as the maximum allowed
        # number of categorical trackable slots (MAX_NUM_CAT_SLOT),
        # non-categorical trackable slots (MAX_NUM_NONCAT_SLOT), values per
        # categorical slot (MAX_NUM_VALUE_PER_CAT_SLOT) and intents
        # (MAX_NUM_INTENT) for a service.
        self.schema_config = schema_config.copy()

        self.is_trainable = is_trainable
        self.datasets = datasets

        for dataset_split in ['train', 'test', 'dev']:
            if dataset_split not in self.datasets:
                logging.warning(
                    "The %s set was not included and won't be processed. "
                    "Services from this dataset split won't be supported.",
                    dataset_split,
                )
        os.makedirs(schema_embedding_dir, exist_ok=True)

        tokenizer_type = type(tokenizer.tokenizer).__name__
        vocab_size = getattr(tokenizer, "vocab_size", 0)
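        # The cached embedding file name encodes the dataset splits, mode,
        # tokenizer type and vocab size of this configuration.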
        self.schema_embedding_file = os.path.join(
            schema_embedding_dir,
            "{}_{}_{}_{}_pretrained_schema_embedding.npy".format(
                '_'.join(self.datasets), mode, tokenizer_type, vocab_size),
        )
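        # Collect and load the schema files from all requested dataset splits.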
        all_schema_json_paths = []
        for dataset_split in self.datasets:
            all_schema_json_paths.append(
                os.path.join(data_dir, dataset_split, "schema.json"))
        self.schemas = schema.Schema(all_schema_json_paths)

        if not os.path.exists(
                self.schema_embedding_file) or overwrite_schema_emb_files:
            # Generate the schema embeddings if the cached file is missing or
            # overwriting is requested.
            logging.info("Generating the schema embeddings.")
            dataset_params = {
                "schema_config": schema_config,
                "tokenizer": tokenizer,
                "schemas": self.schemas,
            }
            emb_datalayer = BertInferDataLayer(
                dataset_type=SchemaEmbeddingDataset,
                dataset_params=dataset_params,
                batch_size=1,
                shuffle=False,
            )

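            # Feed the tokenized schema inputs through BERT and run inference
            # with the pretrained checkpoint to obtain the hidden states.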
            input_ids, input_mask, input_type_ids = emb_datalayer()

            hidden_states = bert_model(input_ids=input_ids,
                                       token_type_ids=input_type_ids,
                                       attention_mask=input_mask)
            evaluated_tensors = nf.infer(tensors=[hidden_states],
                                         checkpoint_dir=bert_ckpt_dir)

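            # Only the master process (rank 0, or a single-process run) gathers
            # the outputs and writes the schema embedding file.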
            master_device = (not torch.distributed.is_initialized()
                             or torch.distributed.get_rank() == 0)
            if master_device:
                hidden_states = [
                    concatenate(tensors) for tensors in evaluated_tensors
                ]
                emb_datalayer.dataset.save_embeddings(
                    hidden_states, self.schema_embedding_file, mode)
                logging.info("Finished generating the schema embeddings.")

        # wait until the master process writes to the schema embedding file
        if torch.distributed.is_initialized():
            torch.distributed.barrier()

        with open(self.schema_embedding_file, "rb") as f:
            self.schema_embeddings = np.load(f, allow_pickle=True)