def __init__(self, path=None, bert_model='bert-base-cased', max_length=128):
    self.path = path
    self.bert_model = bert_model
    self.max_length = max_length
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_model)
    self.ner_vocabulary = load_json(os.path.join(path, "ner2idx.json"))
    self.rc_vocabulary = load_json(os.path.join(path, "rel2idx.json"))
    self.collate_fn = collater(self.ner_vocabulary, self.rc_vocabulary)
def _load(self, path):
    dataset = DataTable()
    data = load_json(path)
    for item in data:
        ner_label = []
        rc_label = []
        ner_check = []
        rc_check = []
        text = item["text"].split(" ")
        for label in item["triple_list"]:
            subject_word_loc = text.index(label[0])
            relation = label[1]
            object_word_loc = text.index(label[2])
            # Register each entity span once; the entity type is unknown here, so "None" is used.
            if subject_word_loc not in ner_check:
                ner_label.append([subject_word_loc, subject_word_loc, "None"])
                ner_check += [subject_word_loc, subject_word_loc, "None"]
            if object_word_loc not in ner_check:
                ner_label.append([object_word_loc, object_word_loc, "None"])
                ner_check += [object_word_loc, object_word_loc, "None"]
            rc_label.append([subject_word_loc, object_word_loc, relation])
        dataset("text", text)
        dataset("ner_label", ner_label)
        dataset("rc_label", rc_label)
    return dataset
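# Hedged illustration (added, not from the source): the loader above assumes each item has a
# whitespace-tokenized "text" and a "triple_list" of [subject, relation, object] word triples.
# The hypothetical item below (invented example, including the relation string) would yield
# ner_label = [[1, 1, "None"], [5, 5, "None"]] and rc_label = [[1, 5, "/people/person/place_of_birth"]].
example_item = {
    "text": "President Obama was born in Honolulu .",
    "triple_list": [["Obama", "/people/person/place_of_birth", "Honolulu"]],
}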
def _load(self, path):
    dataset = load_json(path)
    for data in dataset:
        triples = data['triples']
        for triple in triples:
            self.label_set.add(triple['predicate']['uri'])
    return dataset
def _load(self, path):
    data = load_json(path)
    for item in data:
        for entity_mention in item['golden-entity-mentions']:
            for i in range(entity_mention['start'], entity_mention['end']):
                entity_type = entity_mention['entity-type']
                # BIO scheme: the first token of a mention gets B-<type>, the rest get I-<type>.
                if i == entity_mention['start']:
                    self.label_set.add('B-{}'.format(entity_type))
                else:
                    self.label_set.add('I-{}'.format(entity_type))
    return data
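# Hedged worked example (added for illustration; the mention below is hypothetical): a mention
# covering tokens [2, 4) with entity-type 'PER' produces the standard BIO expansion used above.
mention = {'start': 2, 'end': 4, 'entity-type': 'PER'}
bio = ['B-{}'.format(mention['entity-type']) if i == mention['start']
       else 'I-{}'.format(mention['entity-type'])
       for i in range(mention['start'], mention['end'])]
assert bio == ['B-PER', 'I-PER']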
def _load(self, path):
    datas = load_json(path)
    # datas = datas[0:int(len(datas)/20)]
    if self.debug:
        datas = datas[0:100]
    dataset = DataTable()
    for data in tqdm(datas):
        text = data['text']
        entities = data['entities']
        sentences_boundaries = data['sentences_boundaries']
        words_boundaries = data["words_boundaries"]
        prev_length = 0
        sentences = []
        ners = []
        for i, sentences_boundary in enumerate(sentences_boundaries):
            charid2wordid = {}
            sentence = []
            for j, (start, end) in enumerate(words_boundaries):
                if start >= sentences_boundary[0] and end <= sentences_boundary[1]:
                    if start == sentences_boundary[0]:
                        # print("j={} prev_length={}".format(j, prev_length))
                        assert j == prev_length
                    # Map every character offset of this word to its sentence-relative word index.
                    charid2wordid = {
                        **charid2wordid,
                        **{key: j - prev_length for key in range(start, end + 1)}
                    }
                    sentence.append(text[start:end])
            prev_length += len(sentence)
            sentences.append(sentence)
            dataset("sentence", sentence)
            ners_one_sentence = []
            for entity in entities:
                entity_boundary = entity["boundaries"]
                start, end = entity_boundary
                if start >= sentences_boundary[0] and end <= sentences_boundary[1]:
                    # Convert character-level entity boundaries to word indices within the sentence.
                    index = list(set([charid2wordid[charid] for charid in range(start, end)]))
                    for k in index:
                        assert k < len(sentence)
                    ner = {"index": index, "type": "null"}
                    ners_one_sentence.append(ner)
            ners.append(ners_one_sentence)
            dataset("ner", ners_one_sentence)
    return dataset
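# Hedged worked example (added for illustration; the spans below are hypothetical): charid2wordid
# maps every character offset inside a word to that word's index within the current sentence,
# so character-level entity boundaries can be resolved to sentence-relative word indices.
prev_length = 0
charid2wordid = {}
words_boundaries = [(10, 14), (15, 19)]  # word spans of one hypothetical sentence
for j, (start, end) in enumerate(words_boundaries):
    charid2wordid = {**charid2wordid,
                     **{key: j - prev_length for key in range(start, end + 1)}}
entity_boundaries = (15, 19)
index = sorted(set(charid2wordid[c] for c in range(*entity_boundaries)))
assert index == [1]  # the entity covers the second word of the sentence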
def _load(self, path):
    data = load_json(path)
    for item in data:
        for event_mention in item['golden-event-mentions']:
            for i in range(event_mention['trigger']['start'],
                           event_mention['trigger']['end']):
                trigger_type = event_mention['event_type']
                # BIO scheme over trigger tokens: first token gets B-<event_type>, the rest get I-<event_type>.
                if i == event_mention['trigger']['start']:
                    self.label_set.add('B-{}'.format(trigger_type))
                else:
                    self.label_set.add('I-{}'.format(trigger_type))
    return data
def load_all(self, path):
    datasets = []
    for f in os.listdir(path):
        # The vocabulary file is not a data split; skip it.
        if f == 'vocabulary.txt':
            continue
        dataset = load_json(os.path.join(path, f))
        datasets.extend(dataset)
    return datasets
def _load(self, path):
    data = load_json(path)
    for sample in data:
        if len(sample["golden-event-mentions"]) > 0:
            for event in sample["golden-event-mentions"]:
                event_type = event["event_type"]
                for argument in event["arguments"]:
                    role = argument["role"]
                    if (event_type, role) not in self.label_set:
                        self.label_set.add((event_type, role))
                if (event_type, "trigger") not in self.label_set:
                    self.label_set.add((event_type, "trigger"))
    return data
def _load(self, path):
    dataset = load_json(path)
    datable = DataTable()
    for data in dataset:
        token = data['token']
        relation = data['relation']
        subj_start = data['subj_start']
        subj_end = data['subj_end']
        obj_start = data['obj_start']
        obj_end = data['obj_end']
        self.label_set.add(relation)
        datable('token', token)
        datable('relation', relation)
        datable('subj_start', subj_start)
        datable('subj_end', subj_end)
        datable('obj_start', obj_start)
        datable('obj_end', obj_end)
    return datable
def _load(self, path):
    data = load_json(path)
    for item in data:
        for event_mention in item['golden-event-mentions']:
            for i in range(event_mention['trigger']['start'],
                           event_mention['trigger']['end']):
                trigger_type = event_mention['event_type']
                if i == event_mention['trigger']['start']:
                    self.trigger_label_set.add('B-{}'.format(trigger_type))
                else:
                    self.trigger_label_set.add('I-{}'.format(trigger_type))
            """
            28 argument roles.
            There are 35 roles in the ACE2005 dataset, but the 8 time-related roles are
            collapsed into 'Time', following previous work (Yang et al., 2016):
            ['Time-At-End', 'Time-Before', 'Time-At-Beginning', 'Time-Ending',
             'Time-Holds', 'Time-After', 'Time-Starting', 'Time-Within'] --> 'Time'.
            """
            for argument in event_mention['arguments']:
                role = argument['role']
                if role.startswith('Time'):
                    role = role.split('-')[0]
                self.argument_label_set.add(role)
    return data
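# Hedged example (added for illustration): the role normalization above keeps only the
# prefix before the first '-', which collapses all eight time-related roles into 'Time'.
assert 'Time-Within'.split('-')[0] == 'Time'
assert 'Time-At-End'.split('-')[0] == 'Time'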
def load_one(self, path):
    dataset = load_json(path)
    return dataset
def _load(self, path):
    dataset = []
    datas = load_json(path)
    count = 0
    for data in datas:
        text = data['text']
        entities = data['entities']
        triples = data['triples']
        sentences_boundaries = data['sentences_boundaries']
        for sentences_boundary in sentences_boundaries:
            entity_mentions = []
            relation_mentions = []
            sentence = text[sentences_boundary[0]:sentences_boundary[1]]
            words = nltk.word_tokenize(sentence)
            for entity in entities:
                if entity['boundaries'][0] >= sentences_boundary[0] and \
                        entity['boundaries'][1] <= sentences_boundary[1]:
                    entity_mention_position = get_mention_position(
                        text, sentences_boundary, entity['boundaries'])
                    # Skip (and count) mentions whose span collapses to an empty range.
                    if entity_mention_position[0] >= entity_mention_position[1]:
                        count += 1
                        continue
                    entity_mention = {'position': entity_mention_position}
                    entity_mentions.append(entity_mention)
            for triple in triples:
                sentence_id = triple['sentence_id']
                predicate = triple['predicate']
                subject = triple['subject']
                object = triple['object']
                if not subject['boundaries'] or not object['boundaries'] \
                        or sentences_boundaries[sentence_id] != sentences_boundary:
                    continue
                relation_type = predicate['uri']
                self.label_set.add(relation_type)
                subject_mention_position = get_mention_position(
                    text, sentences_boundary, subject['boundaries'])
                object_mention_position = get_mention_position(
                    text, sentences_boundary, object['boundaries'])
                if subject_mention_position[0] >= subject_mention_position[1]:
                    count += 1
                    continue
                if object_mention_position[0] >= object_mention_position[1]:
                    count += 1
                    continue
                arguments = [subject_mention_position, object_mention_position]
                relation_mention = {
                    'relation_type': relation_type,
                    'arguments': arguments
                }
                relation_mentions.append(relation_mention)
            dataset.append({
                'sentence': sentence,
                'words': words,
                'entity_mentions': entity_mentions,
                'relation_mentions': relation_mentions
            })
    return dataset
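# get_mention_position is called above but not defined in this snippet. A minimal sketch,
# assuming it converts a mention's character-level boundaries into word-level (start, end)
# positions relative to the sentence; its behavior is inferred from usage (an empty or
# inverted span triggers the start >= end check above), not taken from the actual helper.
import nltk

def get_mention_position(text, sentences_boundary, mention_boundaries):
    prefix = text[sentences_boundary[0]:mention_boundaries[0]]
    mention = text[mention_boundaries[0]:mention_boundaries[1]]
    start = len(nltk.word_tokenize(prefix))
    end = start + len(nltk.word_tokenize(mention))
    return [start, end]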
def _load(self, path):
    datas = load_json(path)
    if self.debug:
        datas = datas[0:100]
    dataset = DataTable()
    for data in tqdm(datas):
        text = data['text']
        sentences_boundaries = data['sentences_boundaries']
        words_boundaries = data["words_boundaries"]
        triples = data["triples"]
        if not triples:  # skip samples without any triples
            continue
        prev_length = 0
        for i, sentences_boundary in enumerate(sentences_boundaries):
            charid2wordid = {}
            sentence = []
            for j, (start, end) in enumerate(words_boundaries):
                if start >= sentences_boundary[0] and end <= sentences_boundary[1]:
                    if start == sentences_boundary[0]:
                        # print("j={} prev_length={}".format(j, prev_length))
                        assert j == prev_length
                    # Map every character offset of this word to its sentence-relative word index.
                    charid2wordid = {
                        **charid2wordid,
                        **{key: j - prev_length for key in range(start, end + 1)}
                    }
                    sentence.append(text[start:end])
            prev_length += len(sentence)
            triples_one_sentence = []
            for triple in triples:
                if triple["sentence_id"] != i:
                    continue
                if triple["subject"] is not None and triple["predicate"] is not None \
                        and triple["object"] is not None:
                    subject, predicate, object = triple["subject"], triple["predicate"], triple["object"]
                    if subject["boundaries"] is not None and predicate["boundaries"] is not None \
                            and object["boundaries"] is not None:
                        # print(triple)
                        # Convert each element's character boundaries to sorted word indices.
                        keys = ["subject", "predicate", "object"]
                        for key in keys:
                            start, end = triple[key]["boundaries"]
                            triple[key]["boundaries"] = sorted(
                                list(set([charid2wordid[charid] for charid in range(start, end)])))
                        triples_one_sentence.append({
                            "subject": triple["subject"]["boundaries"],
                            "predicate": triple["predicate"]["boundaries"],
                            "object": triple["object"]["boundaries"],
                        })
            if not triples_one_sentence:
                continue
            dataset("sentence", sentence)
            dataset("triple", triples_one_sentence)
    return dataset
def load_table(path):
    datable = DataTable()
    datable.datas = load_json(path)
    datable.headers = list(datable.datas.keys())
    return datable
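# DataTable and load_json are used throughout these loaders but not defined in this snippet.
# A minimal sketch, assuming DataTable is a simple column store: calling the instance with
# (header, value) appends value to that column, and .datas/.headers expose the underlying
# dict and its keys. This is an assumption inferred from usage here, not the actual
# cogie.utils implementation.
import json

def load_json(path):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

class DataTable:
    def __init__(self):
        self.datas = {}
        self.headers = []

    def __call__(self, header, value):
        # Append one value to the named column, creating the column on first use.
        if header not in self.datas:
            self.datas[header] = []
            self.headers.append(header)
        self.datas[header].append(value)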
import torch
from torch.utils.data import RandomSampler

import cogie
from cogie.core.loss import BCEloss
from cogie.io.loader.re.nyt import NYTRELoader
from cogie.io.processor.re.nyt import NYTREProcessor
from cogie.utils import load_json

torch.cuda.set_device(4)
device = torch.device('cuda:0')

loader = NYTRELoader()
train_data, dev_data, test_data = loader.load_all(
    '../../../cognlp/data/spo/nyt/data')
processor = NYTREProcessor(path='../../../cognlp/data/spo/nyt/data',
                           bert_model='bert-base-cased')
ner_vocabulary = load_json('../../../cognlp/data/spo/nyt/data/ner2idx.json')
rc_vocabulary = load_json('../../../cognlp/data/spo/nyt/data/rel2idx.json')

train_datable = processor.process(train_data)
train_dataset = cogie.DataTableSet(train_datable)
train_sampler = RandomSampler(train_dataset)

dev_datable = processor.process(dev_data)
dev_dataset = cogie.DataTableSet(dev_datable)
dev_sampler = RandomSampler(dev_dataset)

test_datable = processor.process(test_data)
test_dataset = cogie.DataTableSet(test_datable)
test_sampler = RandomSampler(test_dataset)

model = cogie.PFN(dropout=0.1,