Ejemplo n.º 1
0
    def __init__(self):
        """Set up the TRADE dialogue state tracker for inference.

        Merges the model and common JSON configs, makes sure every file
        listed in ``TradeDST.model_urls`` exists under the data directory,
        restores the vocabularies and the trained weights, puts the model
        in eval mode on the configured device and resets the dialogue state.
        """
        super(TradeDST, self).__init__()
        # load config
        common_config_path = os.path.join(get_config_path(),
                                          TradeDST.common_config_name)
        with open(common_config_path) as f:
            common_config = json.load(f)
        model_config_path = os.path.join(get_config_path(),
                                         TradeDST.model_config_name)
        with open(model_config_path) as f:
            model_config = json.load(f)
        # On key clashes the common config overrides the model config.
        model_config.update(common_config)
        self.model_config = model_config
        self.model_config['data_path'] = os.path.join(
            get_data_path(), 'crosswoz/dst_trade_data')
        # 'device' is still the raw config string here; it is converted to a
        # torch.device right after the GPU count has been derived from it.
        self.model_config['n_gpus'] = 0 if self.model_config[
            'device'] == 'cpu' else torch.cuda.device_count()
        self.model_config['device'] = torch.device(self.model_config['device'])
        if model_config['load_embedding']:
            # presumably the dimension of the pretrained embeddings -- TODO confirm
            model_config['hidden_size'] = 300

        # download data
        for model_key, url in TradeDST.model_urls.items():
            dst = os.path.join(self.model_config['data_path'], model_key)
            # Derive the config key under which the local path is registered.
            if model_key.endswith('pth'):
                file_name = 'trained_model_path'
            elif model_key.endswith('pkl'):
                file_name = model_key.rsplit('-', maxsplit=1)[0]
            else:
                file_name = model_key.split('.')[0]  # ontology
            self.model_config[file_name] = dst
            # Fetch when the file is missing or caching is disabled.
            if not os.path.exists(dst) or not self.model_config['use_cache']:
                download_from_url(url, dst)

        # load data & model
        with open(self.model_config['ontology'], 'r', encoding='utf8') as f:
            ontology = json.load(f)
        self.all_slots = get_slot_information(ontology)
        self.gate2id = {'ptr': 0, 'none': 1}
        self.id2gate = {id_: gate for gate, id_ in self.gate2id.items()}
        # NOTE(review): unpickling can execute arbitrary code. These paths are
        # presumably registered by the download loop above, so the URLs in
        # TradeDST.model_urls must point to a trusted source -- confirm.
        with open(self.model_config['lang'], 'rb') as f:
            self.lang = pickle.load(f)
        with open(self.model_config['mem-lang'], 'rb') as f:
            self.mem_lang = pickle.load(f)

        model = Trade(
            lang=self.lang,
            vocab_size=len(self.lang.index2word),
            hidden_size=self.model_config['hidden_size'],
            dropout=self.model_config['dropout'],
            num_encoder_layers=self.model_config['num_encoder_layers'],
            num_decoder_layers=self.model_config['num_decoder_layers'],
            pad_id=self.model_config['pad_id'],
            slots=self.all_slots,
            num_gates=len(self.gate2id),
            unk_mask=self.model_config['unk_mask'])

        # map_location remaps the stored tensors onto the configured device,
        # so a checkpoint saved on GPU can still be restored on a CPU host.
        model.load_state_dict(
            torch.load(self.model_config['trained_model_path'],
                       map_location=self.model_config['device']))

        self.model = model.to(self.model_config['device']).eval()
        print(f'>>> {self.model_config["trained_model_path"]} loaded ...')
        self.state = default_state()
        print('>>> State initialized ...')
Ejemplo n.º 2
0
def prepare_data_seq(config):
    """Build the train/dev/test data loaders and the two vocabularies.

    Args:
        config: Mapping of settings. Keys read directly here are
            ``eval_batch_size``, ``batch_size``, ``train_dials``,
            ``dev_dials``, ``test_dials``, ``ontology``, ``all_vocab``,
            ``debug``, ``data_path``, ``clean_cache``, ``n_gpus``,
            ``load_embedding`` and (only when embeddings are dumped)
            ``orig_pretrained_embedding``. The mapping is also forwarded
            to ``Lang``, ``read_langs`` and ``get_seq``.

    Returns:
        A 6-tuple ``(train_loader, dev_loader, test_loader, langs, slots,
        gating_dict)`` where ``langs`` is ``[lang, mem_lang]``.
    """
    # Fall back to the training batch size when no eval size is configured.
    eval_batch = config["eval_batch_size"] or config["batch_size"]
    train_file_path = config["train_dials"]
    dev_file_path = config["dev_dials"]
    test_file_path = config["test_dials"]
    ontology_file_path = config["ontology"]

    # load domain-slot pairs from ontology
    with open(ontology_file_path, "r", encoding="utf8") as f:
        ontology = json.load(f)
    slots = get_slot_information(ontology)
    gating_dict = {"ptr": 0, "none": 1}

    # Vocabulary
    vocab_scope = "all" if config["all_vocab"] else "train"
    lang_name = f"lang-{vocab_scope}.pkl"
    mem_lang_name = f"mem-lang-{vocab_scope}.pkl"
    if config["debug"]:
        lang_name = "debug-" + lang_name
        mem_lang_name = "debug-" + mem_lang_name
    lang_file_path = os.path.join(config["data_path"], lang_name)
    mem_lang_file_path = os.path.join(config["data_path"], mem_lang_name)

    # Reuse the pickled vocabularies only when both exist and the cache is
    # not being rebuilt.
    load_lang = (os.path.exists(lang_file_path)
                 and os.path.exists(mem_lang_file_path)
                 and not config["clean_cache"])
    if load_lang:
        print("Loading saved lang files...")
        with open(lang_file_path, "rb") as f:
            lang = pickle.load(f)
        with open(mem_lang_file_path, "rb") as f:
            mem_lang = pickle.load(f)
    else:
        lang, mem_lang = Lang(config), Lang(config)
        # Both vocabularies start with the domains and slots from the
        # ontology; afterwards they hold the utterance words and the
        # domain-slot values respectively.
        lang.index_words(slots, "slot")
        mem_lang.index_words(slots, "slot")

    # Build the dataloaders
    pair_train, train_max_len = read_langs(train_file_path, gating_dict, slots,
                                           "train", lang, mem_lang, load_lang,
                                           config)
    train_loader = get_seq(
        pair_train,
        lang,
        mem_lang,
        config["batch_size"],
        config["n_gpus"],
        shuffle=True,
        config=config,
    )
    # Snapshot taken before the dev/test splits are read, so it can be
    # compared with the final vocabulary size printed below.
    train_vocab_size = lang.n_words

    pair_dev, dev_max_len = read_langs(dev_file_path, gating_dict, slots,
                                       "dev", lang, mem_lang, load_lang,
                                       config)
    dev_loader = get_seq(
        pair_dev,
        lang,
        mem_lang,
        eval_batch,
        config["n_gpus"],
        shuffle=False,
        config=config,
    )

    pair_test, test_max_len = read_langs(test_file_path, gating_dict, slots,
                                         "tests", lang, mem_lang, load_lang,
                                         config)
    test_loader = get_seq(
        pair_test,
        lang,
        mem_lang,
        eval_batch,
        config["n_gpus"],
        shuffle=False,
        config=config,
    )

    # Save the intermediate data (vocabularies) for later runs
    if (not (os.path.exists(lang_file_path)
             and os.path.exists(mem_lang_file_path)) or config["clean_cache"]):
        print("Dumping lang files...")
        with open(lang_file_path, "wb") as f:
            pickle.dump(lang, f)
        with open(mem_lang_file_path, "wb") as f:
            pickle.dump(mem_lang, f)

    # The embedding dump is named after the vocabulary size.
    emb_dump_path = os.path.join(config["data_path"],
                                 f"emb{len(lang.index2word)}")
    if (not os.path.exists(emb_dump_path)
            or config["clean_cache"]) and config["load_embedding"]:
        dump_pretrained_emb(config["orig_pretrained_embedding"],
                            lang.index2word, emb_dump_path)

    # Only reported below; it is not part of the return value.
    max_dialogue_history_length = max(train_max_len, dev_max_len,
                                      test_max_len) + 1

    print(f"Read {len(pair_train)} pairs train")
    print(f"Read {len(pair_dev)} pairs dev")
    print(f"Read {len(pair_test)} pairs tests")
    print(f"Vocab_size: {lang.n_words} ")
    print(f"Vocab_size Training {train_vocab_size}")
    print(f"Vocab_size Belief {mem_lang.n_words}")
    print(f"Max. length of dialog words for RNN: {max_dialogue_history_length} ")

    langs = [lang, mem_lang]
    # train loader, dev loader, test loader, [lang, mem_lang], slots, gating dict
    return train_loader, dev_loader, test_loader, langs, slots, gating_dict
Ejemplo n.º 3
0
    def __init__(self):
        """Set up the TRADE dialogue state tracker for inference.

        Merges the model and common JSON configs, makes sure every file
        listed in ``TradeDST.model_urls`` exists under the data directory,
        restores the vocabularies and the trained weights, puts the model
        in eval mode on the configured device and resets the dialogue state.
        """
        super(TradeDST, self).__init__()
        # load config
        common_config_path = os.path.join(get_config_path(),
                                          TradeDST.common_config_name)
        with open(common_config_path) as f:
            common_config = json.load(f)
        model_config_path = os.path.join(get_config_path(),
                                         TradeDST.model_config_name)
        with open(model_config_path) as f:
            model_config = json.load(f)
        # On key clashes the common config overrides the model config.
        model_config.update(common_config)
        self.model_config = model_config
        self.model_config["data_path"] = os.path.join(
            get_data_path(), "crosswoz/dst_trade_data")
        # "device" is still the raw config string here; it is converted to a
        # torch.device right after the GPU count has been derived from it.
        self.model_config["n_gpus"] = (0 if self.model_config["device"]
                                       == "cpu" else torch.cuda.device_count())
        self.model_config["device"] = torch.device(self.model_config["device"])
        if model_config["load_embedding"]:
            # presumably the dimension of the pretrained embeddings -- TODO confirm
            model_config["hidden_size"] = 300

        # download data
        for model_key, url in TradeDST.model_urls.items():
            dst = os.path.join(self.model_config["data_path"], model_key)
            # Derive the config key under which the local path is registered.
            if model_key.endswith("pth"):
                file_name = "trained_model_path"
            elif model_key.endswith("pkl"):
                file_name = model_key.rsplit("-", maxsplit=1)[0]
            else:
                file_name = model_key.split(".")[0]  # ontology
            self.model_config[file_name] = dst
            # Fetch when the file is missing or caching is disabled.
            if not os.path.exists(dst) or not self.model_config["use_cache"]:
                download_from_url(url, dst)

        # load data & model
        with open(self.model_config["ontology"], "r", encoding="utf8") as f:
            ontology = json.load(f)
        self.all_slots = get_slot_information(ontology)
        self.gate2id = {"ptr": 0, "none": 1}
        self.id2gate = {id_: gate for gate, id_ in self.gate2id.items()}
        # NOTE(review): unpickling can execute arbitrary code. These paths are
        # presumably registered by the download loop above, so the URLs in
        # TradeDST.model_urls must point to a trusted source -- confirm.
        with open(self.model_config["lang"], "rb") as f:
            self.lang = pickle.load(f)
        with open(self.model_config["mem-lang"], "rb") as f:
            self.mem_lang = pickle.load(f)

        model = Trade(
            lang=self.lang,
            vocab_size=len(self.lang.index2word),
            hidden_size=self.model_config["hidden_size"],
            dropout=self.model_config["dropout"],
            num_encoder_layers=self.model_config["num_encoder_layers"],
            num_decoder_layers=self.model_config["num_decoder_layers"],
            pad_id=self.model_config["pad_id"],
            slots=self.all_slots,
            num_gates=len(self.gate2id),
            unk_mask=self.model_config["unk_mask"],
        )

        # map_location remaps the stored tensors onto the configured device,
        # so a checkpoint saved on GPU can still be restored on a CPU host.
        model.load_state_dict(
            torch.load(self.model_config["trained_model_path"],
                       map_location=self.model_config["device"]))

        self.model = model.to(self.model_config["device"]).eval()
        print(f'>>> {self.model_config["trained_model_path"]} loaded ...')
        self.state = default_state()
        print(">>> State initialized ...")