Example #1
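This variant keeps only candidates whose prefixed title is already in the entity vocabulary, builds fixed-width candidate lists plus filtered context token ids, and chunks long documents into batches of at most 50 mentions before preranking.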
    def get_data_items(self, dataset, predict=False):
        data = []
        cand_source = 'candidates'
        count = 0

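        # walk over documents, printing a progress counter every 1000 docs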
        for doc_name, content in dataset.items():
            count += 1
            if count % 1000 == 0:
                print(count, end='\r')

            items = []
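            # only the first mention of a document carries the 'conll_doc' field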
            conll_doc = content[0].get('conll_doc', None)

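            # each m is one mention dict with candidates, gold entity, and context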
            for m in content:
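                # c[0] is the candidate name, c[1] its p(e|m) prior; keep only
                # candidates whose prefixed title is in the entity vocabulary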
                try:
                    named_cands = [
                        c[0] for c in m[cand_source]
                        if (wiki_prefix + c[0]) in self.model.entity_voca.word2id
                    ]
                    p_e_m = [min(1., max(1e-3, c[1])) for c in m[cand_source]]
                except KeyError:
                    # fall back to the default 'candidates' field
                    named_cands = [
                        c[0] for c in m['candidates']
                        if (wiki_prefix + c[0]) in self.model.entity_voca.word2id
                    ]
                    p_e_m = [min(1., max(1e-3, c[1])) for c in m['candidates']]

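                # locate the gold entity among the candidates; m['gold'][0] is its name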
                try:
                    true_pos = named_cands.index(m['gold'][0])
                    p = p_e_m[true_pos]
                except (KeyError, ValueError):
                    # the gold entity is absent from the candidates
                    true_pos = -1

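                # shortlist to at most n_cands_before_rank candidates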
                named_cands = named_cands[:min(self.args.n_cands_before_rank,
                                               len(named_cands))]
                p_e_m = p_e_m[:min(self.args.n_cands_before_rank, len(p_e_m))]

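                # if the gold entity fell outside the shortlist, swap it in for
                # training; at prediction time mark it as not found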
                if true_pos >= len(named_cands):
                    if not predict:
                        true_pos = len(named_cands) - 1
                        p_e_m[-1] = p
                        named_cands[-1] = m['gold'][0]
                    else:
                        true_pos = -1

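                # map candidate names to entity ids; pad short lists with unk
                # entries and mask out the padding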
                cands = [
                    self.model.entity_voca.get_id(wiki_prefix + c)
                    for c in named_cands
                ]
                mask = [1.] * len(cands)
                if len(cands) == 0 and not predict:
                    continue
                elif len(cands) < self.args.n_cands_before_rank:
                    cands += [self.model.entity_voca.unk_id] * (
                        self.args.n_cands_before_rank - len(cands))
                    named_cands += [Vocabulary.unk_token] * (
                        self.args.n_cands_before_rank - len(named_cands))
                    p_e_m += [1e-8] * (self.args.n_cands_before_rank - len(p_e_m))
                    mask += [0.] * (self.args.n_cands_before_rank - len(mask))

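                # left context: important, in-vocabulary tokens within half the context window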
                lctx = m['context'][0].strip().split()
                lctx_ids = [
                    self.prerank_model.word_voca.get_id(t) for t in lctx
                    if utils.is_important_word(t)
                ]
                lctx_ids = [
                    tid for tid in lctx_ids
                    if tid != self.prerank_model.word_voca.unk_id
                ]
                lctx_ids = lctx_ids[
                    max(0,
                        len(lctx_ids) - self.args.ctx_window // 2):]

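                # right context: same filtering, truncated to half the window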
                rctx = m['context'][1].strip().split()
                rctx_ids = [
                    self.prerank_model.word_voca.get_id(t) for t in rctx
                    if utils.is_important_word(t)
                ]
                rctx_ids = [
                    tid for tid in rctx_ids
                    if tid != self.prerank_model.word_voca.unk_id
                ]
                rctx_ids = rctx_ids[:min(len(rctx_ids), self.args.ctx_window //
                                         2)]

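                # token ids of the mention surface form itself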
                ment = m['mention'].strip().split()
                ment_ids = [
                    self.prerank_model.word_voca.get_id(t) for t in ment
                    if utils.is_important_word(t)
                ]
                ment_ids = [
                    tid for tid in ment_ids
                    if tid != self.prerank_model.word_voca.unk_id
                ]

                m['sent'] = ' '.join(lctx + rctx)

                # secondary local context (for computing relation scores)
                if conll_doc is not None:
                    conll_m = m['conll_m']
                    sent = conll_doc['sentences'][conll_m['sent_id']]
                    start = conll_m['start']
                    end = conll_m['end']

                    snd_lctx = [
                        self.model.snd_word_voca.get_id(t) for t in
                        sent[max(0, start -
                                 self.args.snd_local_ctx_window // 2):start]
                    ]
                    snd_rctx = [
                        self.model.snd_word_voca.get_id(t) for t in
                        sent[end:min(len(sent), end +
                                     self.args.snd_local_ctx_window // 2)]
                    ]
                    snd_ment = [
                        self.model.snd_word_voca.get_id(t)
                        for t in sent[start:end]
                    ]

                    if len(snd_lctx) == 0:
                        snd_lctx = [self.model.snd_word_voca.unk_id]
                    if len(snd_rctx) == 0:
                        snd_rctx = [self.model.snd_word_voca.unk_id]
                    if len(snd_ment) == 0:
                        snd_ment = [self.model.snd_word_voca.unk_id]
                else:
                    snd_lctx = [self.model.snd_word_voca.unk_id]
                    snd_rctx = [self.model.snd_word_voca.unk_id]
                    snd_ment = [self.model.snd_word_voca.unk_id]

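                # assemble the item for this mention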
                items.append({
                    'context': (lctx_ids, rctx_ids),
                    'snd_ctx': (snd_lctx, snd_rctx),
                    'ment_ids': ment_ids,
                    'snd_ment': snd_ment,
                    'cands': cands,
                    'named_cands': named_cands,
                    'p_e_m': p_e_m,
                    'mask': mask,
                    'true_pos': true_pos,
                    'doc_name': doc_name,
                    'raw': m
                })

            if len(items) > 0:
                # note: chunking shouldn't affect prediction order, because predicted
                # entities are attached by doc_name and prediction data isn't shuffled
                max_len = 50
                if len(items) > max_len:
                    # print(len(items))
                    for k in range(0, len(items), max_len):
                        data.append(items[k:min(len(items), k + max_len)])
                else:
                    data.append(items)

        return self.prerank(data, predict)
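
For reference, here is a minimal sketch of the input shape this method expects. The field names ('mention', 'context', 'candidates', 'gold') are taken from the code above; the concrete values and the ranker instance are invented for illustration:

    dataset = {
        'doc_1': [  # doc_name -> list of mention dicts
            {
                'mention': 'Germany',
                # (left string, right string) surrounding the mention
                'context': ('the capital of', 'is Berlin'),
                # (candidate name, p(e|m) prior) pairs
                'candidates': [('Germany', 0.9),
                               ('Germany_national_football_team', 0.05)],
                # only gold[0], the entity name, is used above
                'gold': ('Germany', 1e-05, -1),
            },
        ],
    }
    batches = ranker.get_data_items(dataset, predict=True)  # hypothetical instance
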
Example #2
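This variant keeps every candidate regardless of entity-vocabulary membership and additionally collects a document-level aet_ctx from the CoNLL document's aet_words; documents are chunked at 100 mentions instead of 50.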
    def get_data_items(self, dataset, predict=False):
        data = []
        cand_source = 'candidates'
        # doc_name is the document id, not the dataset name ('aida-A', 'msnbc', etc.)
        for doc_name, content in dataset.items():
            items = []
            # only the first mention of a document carries the 'conll_doc' field
            conll_doc = content[0].get('conll_doc', None)
            # content is a list of mention dicts
            for m in content:
                try:
                    # c[0] is the candidate's name string, c[1] its p(e|m) prior
                    named_cands = [c[0] for c in m[cand_source]]
                    p_e_m = [min(1., max(1e-3, c[1])) for c in m[cand_source]]
                except KeyError:
                    named_cands = [c[0] for c in m['candidates']]
                    p_e_m = [min(1., max(1e-3, c[1])) for c in m['candidates']]

                try:
                    # index of the gold entity in named_cands;
                    # m['gold'] looks like ('Germany', 1e-05, -1)
                    true_pos = named_cands.index(m['gold'][0])
                    p = p_e_m[true_pos]
                except (KeyError, ValueError):
                    # the gold entity is not among the candidates
                    true_pos = -1
                # shortlist the candidates for mention m
                named_cands = named_cands[:min(self.args.n_cands_before_rank,
                                               len(named_cands))]
                p_e_m = p_e_m[:min(self.args.n_cands_before_rank, len(p_e_m))]
                # if the gold entity fell outside the shortlist, re-insert it for training
                if true_pos >= len(named_cands):
                    if not predict:
                        true_pos = len(named_cands) - 1
                        p_e_m[-1] = p
                        # replace the last candidate with the gold entity
                        named_cands[-1] = m['gold'][0]
                    else:
                        true_pos = -1
                # get the id of entity
                cands = [
                    self.model.entity_voca.get_id(wiki_prefix + c)
                    for c in named_cands
                ]
                mask = [1.] * len(cands)
                # during training, skip mentions without candidates
                if len(cands) == 0 and not predict:
                    continue
                elif len(cands) < self.args.n_cands_before_rank:
                    # pad to a fixed width of n_cands_before_rank and mask out the padding
                    cands += [self.model.entity_voca.unk_id] * (
                        self.args.n_cands_before_rank - len(cands))
                    named_cands += [Vocabulary.unk_token] * (
                        self.args.n_cands_before_rank - len(named_cands))
                    p_e_m += [1e-8] * (self.args.n_cands_before_rank - len(p_e_m))
                    mask += [0.] * (self.args.n_cands_before_rank - len(mask))
                # left-context token ids of important words
                lctx = m['context'][0].strip().split()
                lctx_ids = [
                    self.prerank_model.word_voca.get_id(t) for t in lctx
                    if utils.is_important_word(t)
                ]
                lctx_ids = [
                    tid for tid in lctx_ids
                    if tid != self.prerank_model.word_voca.unk_id
                ]
                lctx_ids = lctx_ids[
                    max(0,
                        len(lctx_ids) - self.args.ctx_window // 2):]

                rctx = m['context'][1].strip().split()
                rctx_ids = [
                    self.prerank_model.word_voca.get_id(t) for t in rctx
                    if utils.is_important_word(t)
                ]
                rctx_ids = [
                    tid for tid in rctx_ids
                    if tid != self.prerank_model.word_voca.unk_id
                ]
                rctx_ids = rctx_ids[:min(len(rctx_ids), self.args.ctx_window //
                                         2)]
                # mention itself
                ment = m['mention'].strip().split()
                ment_ids = [
                    self.prerank_model.word_voca.get_id(t) for t in ment
                    if utils.is_important_word(t)
                ]
                ment_ids = [
                    tid for tid in ment_ids
                    if tid != self.prerank_model.word_voca.unk_id
                ]

                m['sent'] = ' '.join(lctx + rctx)

                # secondary local context (for computing relation scores)
                if conll_doc is not None:
                    conll_m = m['conll_m']
                    # the sentence where the mention appears; conll_m looks like
                    # {'sent_id': 0, 'start': 2, 'end': 3, 'wikilink': 'http://en.wikipedia.org/wiki/Germany'}
                    sent = conll_doc['sentences'][conll_m['sent_id']]
                    aet_words = conll_doc['aet_words']
                    start = conll_m['start']
                    end = conll_m['end']

                    snd_lctx = [
                        self.model.snd_word_voca.get_id(t) for t in
                        sent[max(0, start -
                                 self.args.snd_local_ctx_window // 2):start]
                    ]
                    snd_rctx = [
                        self.model.snd_word_voca.get_id(t) for t in
                        sent[end:min(len(sent), end +
                                     self.args.snd_local_ctx_window // 2)]
                    ]
                    snd_ment = [
                        self.model.snd_word_voca.get_id(t)
                        for t in sent[start:end]
                    ]
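                    # ids of the document-level aet_words under their own vocabulary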
                    aet_ctx = [
                        self.model.aet_word_voca.get_id(ae) for ae in aet_words
                    ]

                    if len(snd_lctx) == 0:
                        snd_lctx = [self.model.snd_word_voca.unk_id]
                    if len(snd_rctx) == 0:
                        snd_rctx = [self.model.snd_word_voca.unk_id]
                    if len(snd_ment) == 0:
                        snd_ment = [self.model.snd_word_voca.unk_id]
                else:
                    # no CoNLL document: fall back to unk ids for every secondary context
                    snd_lctx = [self.model.snd_word_voca.unk_id]
                    snd_rctx = [self.model.snd_word_voca.unk_id]
                    snd_ment = [self.model.snd_word_voca.unk_id]
                    aet_ctx = [self.model.aet_word_voca.unk_id]

                items.append({
                    'context': (lctx_ids, rctx_ids),
                    'snd_ctx': (snd_lctx, snd_rctx),
                    'aet_ctx': aet_ctx,
                    'ment_ids': ment_ids,
                    'snd_ment': snd_ment,
                    'cands': cands,
                    'named_cands': named_cands,
                    'p_e_m': p_e_m,
                    'mask': mask,
                    'true_pos': true_pos,
                    'doc_name': doc_name,
                    'raw': m
                })

            if len(items) > 0:
                # note: chunking shouldn't affect prediction order, because predicted
                # entities are attached by doc_name and prediction data isn't shuffled
                if len(items) > 100:
                    print(len(items))
                    for k in range(0, len(items), 100):
                        data.append(items[k:min(len(items), k + 100)])
                else:
                    data.append(items)

        return self.prerank(data, predict)
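
Downstream, each chunk of items typically has to be turned into fixed-size tensors before scoring. A minimal sketch of that step, assuming PyTorch; pad_sequences, the pad id of 0, and the name batch are illustrative choices, not part of the code above:

    import torch

    def pad_sequences(seqs, pad_id=0):
        # right-pad variable-length id lists into one LongTensor
        width = max(len(s) for s in seqs)
        return torch.tensor([s + [pad_id] * (width - len(s)) for s in seqs],
                            dtype=torch.long)

    batch = data[0]  # one chunk of at most 100 items produced above
    lctx_ids = pad_sequences([item['context'][0] for item in batch])
    cands = torch.tensor([item['cands'] for item in batch])  # already fixed-width
    p_e_m = torch.tensor([item['p_e_m'] for item in batch])
    mask = torch.tensor([item['mask'] for item in batch])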