Example #1
def build_child2parent_list():
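    """Invert par_child_dict into a dense child -> parent list indexed by entity int id."""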
    child_par_list = "data/kb/child_par_list.json"
    par_child_dict = load_json("data/kb/par_child_dict.json")
    items_wikidata_n = load_json("data/kb/items_wikidata_n.json")

    max_entity_id = -1
    for ent in items_wikidata_n:
        max_entity_id = max(max_entity_id, get_ent_int_id(ent))
    assert max_entity_id > -1
    # build a dense list: index = entity int id, value = parent type code
    res_list = [None for _ in range(max_entity_id + 1)]
    for par in par_child_dict:
        for child in par_child_dict[par]:
            res_list[get_ent_int_id(child)] = par
    save_json(res_list, child_par_list)
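The snippet above leans on a few small helpers defined elsewhere in the repo (load_json, save_json, get_ent_int_id). A minimal sketch of what they are assumed to look like; in particular, the ID parsing assumes Wikidata-style codes such as "Q5266722":

import json

def load_json(path):
    # read a JSON file into a Python object
    with open(path, "r") as fp:
        return json.load(fp)

def save_json(obj, path):
    # dump a Python object to a JSON file
    with open(path, "w") as fp:
        json.dump(obj, fp)

def get_ent_int_id(ent_code):
    # assumption: strip the leading "Q" of a Wikidata entity code, e.g. "Q5266722" -> 5266722
    return int(ent_code[1:])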
Example #2
class BaseProcessor(object):
    # dict_e = dict((k, spacy_tokenize(v)) for k, v in load_json("data/kb/items_wikidata_n.json").items())
    dict_e = load_json("data/kb/items_wikidata_n_tokenized.json")
    dict_p = dict((k, spacy_tokenize(v)) for k, v in load_json("data/kb/filtered_property_wikidata4.json").items())
    dict_t2e = load_json("data/kb/par_child_dict.json")
    dict_e2t = dict((v, k) for k, vs in dict_t2e.items() for v in vs)

    def __init__(self):
        self._labels_dict = None  # label_name: {"type":str, "labels":list}
        self.primary_metric = None

    @staticmethod
    def post_process_dialog_turn(dialog_turn, *args, **kwargs):
        return dialog_turn

    def get_labels_dict(self):
        assert self._labels_dict is not None, "using labels before generation"
        return self._labels_dict
Example #3
class BaseProcessor(object):
    # dict_e = dict((k, spacy_tokenize(v)) for k, v in load_json("data/kb/items_wikidata_n.json").items())
    # Item and its label, e.g. "Q5266722": "development and peace"
    dict_e = load_json("data/kb/items_wikidata_n_tokenized.json")
    # Property and its label, e.g. "P86": "composer"; tokenized before storing.
    dict_p = dict((k, spacy_tokenize(v)) for k, v in load_json(
        "data/kb/filtered_property_wikidata4.json").items())
    # Parent: [list of children], e.g. "Q15726688": ["Q23872762", "Q12345822", "Q15142867"]
    dict_t2e = load_json("data/kb/par_child_dict.json")
    # Dict from child to parent, e.g. "Q23872762": "Q15726688"
    dict_e2t = dict((v, k) for k, vs in dict_t2e.items() for v in vs)
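    # note: if a child appears under several parents, the last parent seen wins in this inversion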

    def __init__(self):
        self._labels_dict = None  # label_name: {"type":str, "labels":list}
        self.primary_metric = None

    @staticmethod
    def post_process_dialog_turn(dialog_turn, *args, **kwargs):
        return dialog_turn

    def get_labels_dict(self):
        assert self._labels_dict is not None, "using labels before generation"
        return self._labels_dict
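For orientation, a minimal usage sketch of the class-level tables above (the codes are illustrative placeholders copied from the comments, not guaranteed to be present in the actual data files):

# illustrative lookups against BaseProcessor's class-level tables
entity = "Q23872762"
entity_label = BaseProcessor.dict_e.get(entity)     # tokenized item label
entity_type = BaseProcessor.dict_e2t.get(entity)    # parent type code, e.g. "Q15726688"
property_label = BaseProcessor.dict_p.get("P86")    # tokenized property label, e.g. "composer"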
Example #4
    def _pre_process_raw_data(self, path_list, debug_num=0):
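        """Load the raw dialog JSON files in path_list and flatten them into a list of formulated turns."""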
        # 1. data
        if not isinstance(path_list, list):
            path_list = get_data_path_list("all", path_list)

        # for debug:
        if debug_num is not None and debug_num > 0:
            path_list = path_list[:debug_num]

        # formulate the data
        logging.info("\tFormulating the raw data data")
        turn_list = []
        for idx_f, file_path in tqdm(enumerate(path_list),
                                     total=len(path_list)):
            raw_data = load_json(file_path)
            new_turn_list = self._get_formulated_dialog(raw_data, file_path)
            # some other processes
            turn_list.extend(new_turn_list)
        return turn_list
Example #5
def decoding(model_cfg, infer_cfg):  # this is for the dev set
    from e2e.exe import LfExecutor

    dataset_obj = load_file(infer_cfg['processed_path'] + ".light",
                            'processed_datasets',
                            mode='pickle')
    assert dataset_obj is not None
    dataset_obj._dev_feature_list = dataset_obj._dev_feature_list[:4000]

    with tf.variable_scope('model') as scope:
        # cfg, vocab, data_type, labels_dict, max_sequence_len, num_training_steps, scope
        model_obj = model_cfg['model_class'](model_cfg, dataset_obj.tokenizer,
                                             model_cfg['dataset'],
                                             dataset_obj.get_labels_dict(),
                                             model_cfg["max_sequence_len"],
                                             1000, scope.name)
    graph_handler = GraphHandler(model_obj, infer_cfg)
    evaluator = E2eEvaluator(model_obj, infer_cfg)
    sess = graph_handler.initialize()
    # data preparation
    logging.info("loading inverse_index...")
    inverse_index = load_json("data/EDL/inverse_index_spacy_token.json")
    logging.info("building lf executor")
    lf_executor = LfExecutor(kb_mode="offline")
    logging.info("Done")

    evaluator.decoding(sess,
                       dataset_obj._dev_feature_list,
                       lf_executor,
                       inverse_index,
                       BaseProcessor.dict_e2t,
                       dataset_obj.get_labels_dict()["EOs"]["labels"],
                       dataset_obj.get_labels_dict()["sketch"]["labels"],
                       dataset_obj.get_labels_dict()["predicates"]["labels"],
                       dataset_obj.get_labels_dict()["types"]["labels"],
                       batch_size=20,
                       max_seq_len=infer_cfg["max_sequence_len"])
Example #6
    def run(self):
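        """Decode one shard of dialog files on the configured GPU, caching per-file outputs as pickles."""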
        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.infer_cfg['gpu'])
        feature_list = self.dataset_obj.process_test_data(self.path_list)
        g = tf.Graph()
        with g.as_default():
            with tf.device("/device:GPU:{}".format(self.gpu_index)):
                with tf.variable_scope('model') as scope:
                    # cfg, vocab, data_type, labels_dict, max_sequence_len, num_training_steps, scope
                    model_obj = self.model_cfg['model_class'](
                        self.model_cfg, self.dataset_obj.tokenizer,
                        self.model_cfg['dataset'],
                        self.dataset_obj.get_labels_dict(),
                        self.model_cfg["max_sequence_len"], 1000, scope.name)
                graph_handler = GraphHandler(model_obj, self.infer_cfg)
                evaluator = E2eEvaluator(model_obj, self.infer_cfg)
                sess = graph_handler.initialize()
        # data preparation
        logging.info("loading inverse_index...")
        inverse_index = load_json("data/EDL/inverse_index_spacy_token.json"
                                  ) if self.alter_ner_dir is None else None
        logging.info("building lf executor")
        lf_executor = LfExecutor(
            kb_mode=self.kb_mode,
            use_op_type_constraint=self.use_op_type_constraint)
        logging.info("Done")

        # data in this process
        top1_pred = []
        dev_dict = {}
        recall = {}
        precision = {}
        _feature_ptr = 0
        for _idx_file, _file_path in tqdm(enumerate(self.path_list),
                                          total=len(self.path_list)):
            _dump_path = os.path.join(self.dump_dir,
                                      os.path.basename(_file_path))
            _raw_data = load_json(_file_path)
            assert len(_raw_data) % 2 == 0
            _num_turns = len(_raw_data) // 2
            # fetch the feature list
            _proc_features = feature_list[_feature_ptr:(_feature_ptr +
                                                        _num_turns)]
            _feature_ptr += _num_turns
            # verify that the raw data and _proc_features are aligned
            for _idx_t in range(_num_turns):
                assert _raw_data[_idx_t * 2]["utterance"] == _proc_features[
                    _idx_t]["utterances"]["cur_q"]
                assert _raw_data[_idx_t * 2 +
                                 1]["utterance"] == _proc_features[_idx_t][
                                     "utterances"]["cur_a"]

            _out_list = None
            if os.path.exists(_dump_path) and os.path.isfile(_dump_path):
                try:
                    _out_list = load_pickle(_dump_path)
                    assert len(_out_list) == _num_turns
                    for _idx_t in range(_num_turns):
                        assert _out_list[_idx_t][
                            "cur_question_type"] == _raw_data[
                                _idx_t * 2]["question-type"]
                except Exception:
                    _out_list = None
            if _out_list is None:
                _out_list = evaluator.decoding(  # how to multi process
                    sess,
                    _proc_features,
                    lf_executor,
                    inverse_index,
                    BaseProcessor.dict_e2t,
                    self.dataset_obj.get_labels_dict()["EOs"]["labels"],
                    self.dataset_obj.get_labels_dict()["sketch"]["labels"],
                    self.dataset_obj.get_labels_dict()["predicates"]["labels"],
                    self.dataset_obj.get_labels_dict()["types"]["labels"],
                    batch_size=20,
                    max_seq_len=self.max_sequence_len,
                    timeout=self.timeout,
                    use_filtered_ent=self.use_filtered_ent,
                    alter_ner_dir=self.alter_ner_dir,
                )
                assert len(_out_list) == _num_turns
                save_pickle(_out_list, _dump_path)

            if self.verbose:
                for _out in _out_list:
                    accumulative_eval(_out["gold_answer"],
                                      _out["cur_question_type"],
                                      _out["prev_question_type"], _out["top1"],
                                      _out["predicted_answer"], top1_pred,
                                      dev_dict, recall, precision)
            if self.verbose and (_idx_file + 1) % 40 == 0:
                logging.info("")
                logging.info("=" * 30)
                logging.info("From process {}".format(self.idx))
                smp_result_print_wrt_qt(top1_pred, dev_dict, recall, precision)
                logging.info("=" * 30)
Example #7
def tokenize_items_wikidata_n():
    items_wikidata_n = load_json("data/kb/items_wikidata_n.json")
    for key in items_wikidata_n:
        items_wikidata_n[key] = spacy_tokenize(items_wikidata_n[key])
    save_json(items_wikidata_n, "data/kb/items_wikidata_n_tokenized.json")
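The spacy_tokenize helper used here (and in the BaseProcessor snippets above) is not shown; a plausible minimal version, assuming it simply whitespace-joins spaCy's tokenization of the input string:

import spacy

_NLP = spacy.blank("en")  # tokenizer-only pipeline; the repo may use a full model instead

def spacy_tokenize(text):
    # assumption: return the input re-joined as space-separated spaCy tokens
    return " ".join(tok.text for tok in _NLP(text))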
Example #8
    def load_child2parent(self):  # xxx added
        if self.child_id2parent is None and self.use_op_type_constraint:
            self.child_id2parent = load_json("data/kb/child_par_list.json")
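        # child_par_list.json is the dense child -> parent list built by build_child2parent_list();
        # a later lookup presumably looks like: parent = self.child_id2parent[get_ent_int_id(child_code)]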
Example #9
    def run(self):
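        """Run BFS search over a shard of dialog files and record which turns have a covering logical form."""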
        database = self._database
        files = self._files
        cover_num_True = self._cover_num_True
        cover_num_False = self._cover_num_False
        verb = self._verb
        beam_size = self._beam_size

        parser = Parser.Parser(database)
        parser.load_child2parent()
        memory = Memory()
        for f in tqdm(files, total=len(files), position=0, leave=True):
            # xxx added: output to another dir
            f_dir = os.path.dirname(f)
            new_f_dir = f_dir + "_proc_{}_{}_".format(
                "direct", beam_size) + self._out_dir_suffix
            os.makedirs(new_f_dir, exist_ok=True)  # safe if several worker processes race on the same dir
            new_f = os.path.join(new_f_dir, os.path.basename(f))
            if os.path.isfile(new_f):
                try:
                    with open(new_f, 'r') as fp:
                        tmp_dicts = json.load(fp)
                        for i in range(0, len(tmp_dicts), 2):
                            # check whether a correct answer was already found for this turn
                            if tmp_dicts[i][
                                    "question-type"] not in cover_num_True:
                                cover_num_True[tmp_dicts[i]
                                               ["question-type"]] = 0.0
                                cover_num_False[tmp_dicts[i]
                                                ["question-type"]] = 0.0
                            True_lf_action = tmp_dicts[i + 1]["true_lf"]
                            if len(True_lf_action) != 0:
                                cover_num_True[tmp_dicts[i]
                                               ["question-type"]] += 1
                            else:
                                cover_num_False[tmp_dicts[i]
                                                ["question-type"]] += 1
                    continue
                except Exception:
                    pass

            # load dataset
            with open(f, 'r') as fp:
                dicts = json.load(fp)
            # reset memory
            memory.clear()
            # print("+++++++++++++++++{}++++++++++++++++++++".format(os.path.basename(f)))
            prev_predicates = []
            for i in range(0, len(dicts), 2):
                turn_start_time = time.time()
                # Extract entities and relations. In BFS, we use the entities and relations offered by the training dataset;
                # in D2A, only entities from entity linking and relations from a relation classifier are used.
                # In our setting, we assume that entities and relations are unseen in the test dataset.
                if 'entities_in_utterance' in dicts[i]:
                    user_entities = dicts[i]['entities_in_utterance']
                else:
                    user_entities = []
                if 'entities_in_utterance' in dicts[i + 1]:
                    system_entities = dicts[i + 1]['entities_in_utterance']
                # elif 'entities' in dicts[i + 1]:
                #     system_entities = dicts[i + 1]['entities']
                else:
                    system_entities = []
                if 'relations' in dicts[i]:  # gold relations are used
                    pres = dicts[i]['relations']
                else:
                    pres = []
                if 'type_list' in dicts[i]:
                    types = dicts[i]['type_list']  # gold types are used
                else:
                    types = []
                numbers = []
                for x in dicts[i]['utterance'].split():
                    try:
                        numbers.append(int(x))
                    except ValueError:
                        continue
                numbers = list(set(numbers))
                entities, pres = memory.current_state(user_entities, pres)

                # our method:
                # 1. for the entities
                entities = get_entities(dicts[i])
                # 2. for the numbers: i.e., exclude numbers that fall inside an entity span
                if self._dict_ent2text is None:
                    self._dict_ent2text = load_json(
                        "data/kb/items_wikidata_n_tokenized.json")
                cur_q_utterance = dicts[i]["utterance"]
                tokenized_utterance = spacy_tokenize(cur_q_utterance)
                ent_codes = entities.copy()
                ent_strs = [self._dict_ent2text[_code] for _code in ent_codes]
                if len(ent_codes) > 0:
                    # sort entities by descending mention length, keeping codes and strings aligned
                    ent_codes, ent_strs = zip(*list(
                        sorted(zip(ent_codes, ent_strs),
                               key=lambda elem: len(elem[1].split()),
                               reverse=True)))
                EO, _, _ = generate_EO_with_etype(tokenized_utterance,
                                                  ent_codes, ent_strs,
                                                  ["UNK"] * len(ent_codes),
                                                  "EMPTY")
                num2idxs = index_num_in_tokenized_utterance(
                    tokenized_utterance, [eo_label != "O" for eo_label in EO])
                numbers = list(num2idxs.keys())
                # 3. predicates
                cur_predicates = get_predicates(dicts[i])
                if len(cur_predicates) == 0:
                    pres = prev_predicates
                else:
                    pres = cur_predicates
                prev_predicates = cur_predicates

                # Extract answer
                answer = parser.parsing_answer(dicts[i + 1]['all_entities'],
                                               dicts[i + 1]['utterance'],
                                               dicts[i]['question-type'])
                try:
                    logical_forms, candidate_answers, logical_action, _ = parser.BFS(
                        entities, pres, types, numbers, beam_size)  # add set
                except timeout_decorator.TimeoutError:
                    logical_forms = []
                    candidate_answers = []
                    logical_action = []
                    # lf_entity_record = []
                # update memory and keep right logical forms and action sequences
                memory.update(user_entities + system_entities, pres)
                True_lf = []
                True_lf_action = []
                # True_lf_entity_record = []
                All_lf = []
                for item in zip(logical_forms, candidate_answers,
                                logical_action):
                    pred = item[1]
                    All_lf.append((item[0], item[2]))
                    if isinstance(pred, int):
                        pred = [pred]
                    if answer == pred:
                        True_lf.append(item[0])
                        True_lf_action.append((item[0], item[2]))
                        # True_lf_entity_record.append(item[3])

                # eval oracle
                if dicts[i]["question-type"] not in cover_num_True:
                    cover_num_True[dicts[i]["question-type"]] = 0.0
                    cover_num_False[dicts[i]["question-type"]] = 0.0
                if len(True_lf_action) != 0:
                    cover_num_True[dicts[i]["question-type"]] += 1
                else:
                    cover_num_False[dicts[i]["question-type"]] += 1
                dicts[i + 1]["true_lf"] = True_lf_action
                if self._all_lf:
                    dicts[i + 1]['all_lf'] = All_lf
                dicts[i + 1]['num_all_lf'] = len(All_lf)
                dicts[i + 1]['time'] = time.time() - turn_start_time
            with open(new_f, 'w') as fp:
                json.dump(dicts, fp)
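After all files are processed, cover_num_True / cover_num_False give the oracle coverage per question type. The aggregation step itself is not part of the snippet; a minimal sketch of how it could be summarized:

def print_oracle_coverage(cover_num_True, cover_num_False):
    # per question type: fraction of turns for which BFS found at least one logical form producing the gold answer
    for qtype in sorted(cover_num_True):
        found = cover_num_True[qtype]
        total = found + cover_num_False.get(qtype, 0.0)
        ratio = found / total if total > 0 else 0.0
        print("{:<40s} {:6.0f}/{:<6.0f} coverage={:.3f}".format(qtype, found, total, ratio))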