Example #1
 def __init__(self,
              path=None,
              bert_model='bert-base-cased',
              max_length=128):
     self.path = path
     self.bert_model = bert_model
     self.max_length = max_length
     # Load the pretrained BERT tokenizer plus the NER/relation label-to-id
     # vocabularies saved next to the data, then build the collate function.
     self.tokenizer = BertTokenizer.from_pretrained(self.bert_model)
     self.ner_vocabulary = load_json(os.path.join(self.path, "ner2idx.json"))
     self.rc_vocabulary = load_json(os.path.join(self.path, "rel2idx.json"))
     self.collate_fn = collater(self.ner_vocabulary, self.rc_vocabulary)
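
All of these examples rely on the project's load_json helper, whose definition is not shown here. A minimal sketch of what it presumably does, assuming it is a thin wrapper over the standard json module:

import json

def load_json(path):
    # Presumed behavior: read a JSON file from disk and parse it.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)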
Example #2
File: nyt.py Project: jinzhuoran/CogIE
 def _load(self, path):
     dataset = DataTable()
     data = load_json(path)
     for item in data:
         ner_label = []
         rc_label = []
         ner_check = []
         text = item["text"].split(" ")
         for label in item["triple_list"]:
             # NOTE: list.index returns only the first occurrence, so
             # repeated words and multi-word entities all map to a single
             # token position.
             subject_word_loc = text.index(label[0])
             relation = label[1]
             object_word_loc = text.index(label[2])
             if subject_word_loc not in ner_check:
                 ner_label.append(
                     [subject_word_loc, subject_word_loc, "None"])
                 ner_check.append(subject_word_loc)
             if object_word_loc not in ner_check:
                 ner_label.append(
                     [object_word_loc, object_word_loc, "None"])
                 ner_check.append(object_word_loc)
             rc_label.append([subject_word_loc, object_word_loc, relation])
         dataset("text", text)
         dataset("ner_label", ner_label)
         dataset("rc_label", rc_label)
     return dataset
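
Examples #2, #5, #9, and #13 build a DataTable by calling the instance like a function to append a value to a named column, and Example #14 below shows that its datas attribute is a dict keyed by the headers. The class itself is not included; a minimal sketch consistent with that usage:

from collections import defaultdict

class DataTable:
    def __init__(self):
        # Column-oriented storage: header -> list of row values.
        self.datas = defaultdict(list)
        self.headers = []

    def __call__(self, header, value):
        # Register the header on first use, then append the value
        # to that column.
        if header not in self.headers:
            self.headers.append(header)
        self.datas[header].append(value)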
Example #3
 def _load(self, path):
     dataset = load_json(path)
     for data in dataset:
         triples = data['triples']
         for triple in triples:
             self.label_set.add(triple['predicate']['uri'])
     return dataset
Example #4
File: ace2005.py Project: jinzhuoran/CogIE
 def _load(self, path):
     data = load_json(path)
     for item in data:
         for entity_mention in item['golden-entity-mentions']:
             for i in range(entity_mention['start'], entity_mention['end']):
                 entity_type = entity_mention['entity-type']
                 if i == entity_mention['start']:
                     self.label_set.add('B-{}'.format(entity_type))
                 else:
                     self.label_set.add('I-{}'.format(entity_type))
     return data
Example #5
    def _load(self, path):
        datas = load_json(path)
        if self.debug:
            datas = datas[0:100]
        dataset = DataTable()
        for data in tqdm(datas):
            text = data['text']
            entities = data['entities']
            sentences_boundaries = data['sentences_boundaries']
            words_boundaries = data["words_boundaries"]

            prev_length = 0
            sentences = []
            ners = []
            for i, sentences_boundary in enumerate(sentences_boundaries):
                # Map each character offset inside this sentence to the
                # sentence-local index of the word covering it.
                charid2wordid = {}
                sentence = []
                for j, (start, end) in enumerate(words_boundaries):
                    if start >= sentences_boundary[0] and end <= sentences_boundary[1]:
                        if start == sentences_boundary[0]:
                            assert j == prev_length
                        charid2wordid.update(
                            {key: j - prev_length for key in range(start, end + 1)})
                        sentence.append(text[start:end])
                prev_length += len(sentence)
                sentences.append(sentence)
                dataset("sentence", sentence)
                ners_one_sentence = []
                for entity in entities:
                    start, end = entity["boundaries"]
                    if start >= sentences_boundary[0] and end <= sentences_boundary[1]:
                        # Convert the entity's character span to deduplicated
                        # word indices inside the current sentence.
                        index = list(
                            set(charid2wordid[charid]
                                for charid in range(start, end)))
                        for k in index:
                            assert k < len(sentence)
                        ners_one_sentence.append({"index": index, "type": "null"})
                ners.append(ners_one_sentence)
                dataset("ner", ners_one_sentence)

        return dataset
Example #6
 def _load(self, path):
     data = load_json(path)
     for item in data:
         for event_mention in item['golden-event-mentions']:
             for i in range(event_mention['trigger']['start'],
                            event_mention['trigger']['end']):
                 trigger_type = event_mention['event_type']
                 if i == event_mention['trigger']['start']:
                     self.label_set.add('B-{}'.format(trigger_type))
                 else:
                     self.label_set.add('I-{}'.format(trigger_type))
     return data
Example #7
 def load_all(self, path):
     datasets = []
     for f in os.listdir(path):
         # Skip the vocabulary file; every other file is a JSON dataset.
         if f == 'vocabulary.txt':
             continue
         dataset = load_json(os.path.join(path, f))
         datasets.extend(dataset)
     return datasets
Example #8
 def _load(self, path):
     data = load_json(path)
     for sample in data:
         for event in sample["golden-event-mentions"]:
             event_type = event["event_type"]
             for argument in event["arguments"]:
                 role = argument["role"]
                 # set.add is idempotent, so duplicate labels are
                 # handled automatically.
                 self.label_set.add((event_type, role))
                 self.label_set.add((event_type, "trigger"))
     return data
Example #9
File: kbp37.py Project: jinzhuoran/CogIE
 def _load(self, path):
     dataset = load_json(path)
     datable = DataTable()
     for data in dataset:
         token = data['token']
         relation = data['relation']
         subj_start = data['subj_start']
         subj_end = data['subj_end']
         obj_start = data['obj_start']
         obj_end = data['obj_end']
         self.label_set.add(relation)
         datable('token', token)
         datable('relation', relation)
         datable('subj_start', subj_start)
         datable('subj_end', subj_end)
         datable('obj_start', obj_start)
         datable('obj_end', obj_end)
     return datable
Example #10
    def _load(self, path):
        data = load_json(path)
        for item in data:
            for event_mention in item['golden-event-mentions']:
                for i in range(event_mention['trigger']['start'],
                               event_mention['trigger']['end']):
                    trigger_type = event_mention['event_type']
                    if i == event_mention['trigger']['start']:
                        self.trigger_label_set.add('B-{}'.format(trigger_type))
                    else:
                        self.trigger_label_set.add('I-{}'.format(trigger_type))
                """
                28 argument roles

                There are 35 roles in ACE2005 dataset, but the time-related 8 roles were replaced by 'Time' as the previous work (Yang et al., 2016).
                ['Time-At-End','Time-Before','Time-At-Beginning','Time-Ending', 'Time-Holds', 'Time-After','Time-Starting', 'Time-Within'] --> 'Time'.
                """
                for argument in event_mention['arguments']:
                    role = argument['role']
                    if role.startswith('Time'):
                        role = role.split('-')[0]
                    self.argument_label_set.add(role)
        return data
Example #11
 def load_one(self, path):
     dataset = load_json(path)
     return dataset
Example #12
    def _load(self, path):
        dataset = []
        datas = load_json(path)
        count = 0  # number of mentions skipped because their span is empty
        for data in datas:
            text = data['text']
            entities = data['entities']
            triples = data['triples']
            sentences_boundaries = data['sentences_boundaries']
            for sentences_boundary in sentences_boundaries:
                entity_mentions = []
                relation_mentions = []
                sentence = text[sentences_boundary[0]:sentences_boundary[1]]
                words = nltk.word_tokenize(sentence)

                for entity in entities:
                    if (entity['boundaries'][0] >= sentences_boundary[0]
                            and entity['boundaries'][1] <= sentences_boundary[1]):
                        entity_mention_position = get_mention_position(
                            text, sentences_boundary, entity['boundaries'])
                        # Skip empty or inverted spans.
                        if entity_mention_position[0] >= entity_mention_position[1]:
                            count += 1
                            continue
                        entity_mentions.append({'position': entity_mention_position})
                for triple in triples:
                    sentence_id = triple['sentence_id']
                    predicate = triple['predicate']
                    subject = triple['subject']
                    obj = triple['object']  # avoid shadowing the builtin object
                    if (not subject['boundaries'] or not obj['boundaries']
                            or sentences_boundaries[sentence_id] != sentences_boundary):
                        continue
                    relation_type = predicate['uri']
                    self.label_set.add(relation_type)
                    subject_mention_position = get_mention_position(
                        text, sentences_boundary, subject['boundaries'])
                    object_mention_position = get_mention_position(
                        text, sentences_boundary, obj['boundaries'])
                    if subject_mention_position[0] >= subject_mention_position[1]:
                        count += 1
                        continue
                    if object_mention_position[0] >= object_mention_position[1]:
                        count += 1
                        continue
                    relation_mentions.append({
                        'relation_type': relation_type,
                        'arguments': [subject_mention_position,
                                      object_mention_position]
                    })
                dataset.append({
                    'sentence': sentence,
                    'words': words,
                    'entity_mentions': entity_mentions,
                    'relation_mentions': relation_mentions
                })
        return dataset
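
Example #12 relies on a get_mention_position helper that is not shown. Judging from how its result is used (a start/end pair compared with >= to discard empty spans), it presumably maps an entity's character-level boundaries to word-level positions inside the tokenized sentence. A hypothetical reconstruction:

import nltk

def get_mention_position(text, sentences_boundary, boundaries):
    # Hypothetical: the tokens before the mention give its word-level
    # start; adding the mention's own token count gives the end.
    prefix = text[sentences_boundary[0]:boundaries[0]]
    mention = text[boundaries[0]:boundaries[1]]
    start = len(nltk.word_tokenize(prefix))
    end = start + len(nltk.word_tokenize(mention))
    return [start, end]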
Example #13
    def _load(self, path):
        datas = load_json(path)
        if self.debug:
            datas = datas[0:100]
        dataset = DataTable()
        for data in tqdm(datas):
            text = data['text']
            sentences_boundaries = data['sentences_boundaries']
            words_boundaries = data["words_boundaries"]
            triples = data["triples"]
            if not triples:  # skip documents without any triples
                continue

            prev_length = 0
            for i, sentences_boundary in enumerate(sentences_boundaries):
                # Map each character offset inside this sentence to the
                # sentence-local index of the word covering it.
                charid2wordid = {}
                sentence = []
                for j, (start, end) in enumerate(words_boundaries):
                    if start >= sentences_boundary[0] and end <= sentences_boundary[1]:
                        if start == sentences_boundary[0]:
                            assert j == prev_length
                        charid2wordid.update(
                            {key: j - prev_length for key in range(start, end + 1)})
                        sentence.append(text[start:end])
                prev_length += len(sentence)
                triples_one_sentence = []
                for triple in triples:
                    if triple["sentence_id"] != i:
                        continue
                    # Keep only triples whose three arguments all exist and
                    # all carry character boundaries.
                    if any(triple[key] is None or triple[key]["boundaries"] is None
                           for key in ("subject", "predicate", "object")):
                        continue
                    # Convert each argument's character span to sorted,
                    # deduplicated word indices inside the current sentence.
                    for key in ("subject", "predicate", "object"):
                        start, end = triple[key]["boundaries"]
                        triple[key]["boundaries"] = sorted(
                            set(charid2wordid[charid]
                                for charid in range(start, end)))
                    triples_one_sentence.append({
                        "subject": triple["subject"]["boundaries"],
                        "predicate": triple["predicate"]["boundaries"],
                        "object": triple["object"]["boundaries"],
                    })
                if not triples_one_sentence:
                    continue

                dataset("sentence", sentence)
                dataset("triple", triples_one_sentence)

        return dataset
Example #14
File: datable.py Project: jinzhuoran/CogIE
 def load_table(path):
     datable = DataTable()
     # The JSON file is expected to hold a dict that maps column headers
     # to column data, so its keys double as the table headers.
     datable.datas = load_json(path)
     datable.headers = list(datable.datas.keys())
     return datable
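
Since load_table turns the JSON's top-level keys into headers, the file it reads is presumably column-oriented. A hypothetical round trip with made-up column values:

import json

# Hypothetical column-oriented payload: each key becomes a header and
# its value holds that column's rows.
table = {"token": [["John", "lives", "in", "Boston"]],
         "relation": ["per:cities_of_residence"]}
with open("table.json", "w", encoding="utf-8") as f:
    json.dump(table, f)

datable = load_table("table.json")  # datable.headers == ["token", "relation"]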
Example #15
import torch
from torch.utils.data import RandomSampler

import cogie
from cogie.core.loss import BCEloss
from cogie.io.loader.re.nyt import NYTRELoader
from cogie.io.processor.re.nyt import NYTREProcessor
from cogie.utils import load_json

torch.cuda.set_device(4)
# NOTE: set_device(4) makes GPU 4 the default CUDA device, but 'cuda:0'
# below still addresses GPU 0 explicitly.
device = torch.device('cuda:0')

loader = NYTRELoader()
train_data, dev_data, test_data = loader.load_all(
    '../../../cognlp/data/spo/nyt/data')
processor = NYTREProcessor(path='../../../cognlp/data/spo/nyt/data',
                           bert_model='bert-base-cased')

ner_vocabulary = load_json('../../../cognlp/data/spo/nyt/data/ner2idx.json')
rc_vocabulary = load_json('../../../cognlp/data/spo/nyt/data/rel2idx.json')

train_datable = processor.process(train_data)
train_dataset = cogie.DataTableSet(train_datable)
train_sampler = RandomSampler(train_dataset)

dev_datable = processor.process(dev_data)
dev_dataset = cogie.DataTableSet(dev_datable)
dev_sampler = RandomSampler(dev_dataset)

test_datable = processor.process(test_data)
test_dataset = cogie.DataTableSet(test_datable)
test_sampler = RandomSampler(test_dataset)

model = cogie.PFN(dropout=0.1,