def __init__(self, frame_path=None, element_path=None,
             bert_model='bert-base-cased', max_length=256):
    """Set up the tokenizer and the frame / element vocabularies.

    Args:
        frame_path: Location handed to ``Vocabulary.load`` to obtain
            ``self.frame_vocabulary``.
        element_path: Location handed to ``Vocabulary.load`` to obtain
            ``self.element_vocabulary``.
        bert_model: Name or path given to ``BertTokenizer.from_pretrained``.
        max_length: Stored unchanged as ``self.max_length``.
    """
    self.max_length = max_length

    # Both paths are forwarded to Vocabulary.load unconditionally, so the
    # None defaults are placeholders only.
    # NOTE(review): presumably callers always pass real paths -- confirm.
    frame_vocabulary = Vocabulary.load(frame_path)
    self.frame_vocabulary = frame_vocabulary
    element_vocabulary = Vocabulary.load(element_path)
    self.element_vocabulary = element_vocabulary

    tokenizer = BertTokenizer.from_pretrained(bert_model)
    self.tokenizer = tokenizer
def __init__(self, trigger_path=None, argument_path=None,
             bert_model='bert-base-cased', max_length=256):
    """Set up the tokenizer and, when available, the label vocabularies.

    Args:
        trigger_path: Location of the saved trigger vocabulary.
        argument_path: Location of the saved argument vocabulary.
        bert_model: Name or path given to ``BertTokenizer.from_pretrained``;
            also kept as ``self.bert_model``.
        max_length: Stored unchanged as ``self.max_length``.

    The vocabularies are loaded only when *both* paths are truthy. If
    either one is missing, neither ``self.trigger_vocabulary`` nor
    ``self.argument_vocabulary`` is set by this constructor.
    """
    self.max_length = max_length
    self.bert_model = bert_model

    both_paths_given = bool(trigger_path) and bool(argument_path)
    if both_paths_given:
        self.trigger_vocabulary = Vocabulary.load(trigger_path)
        self.argument_vocabulary = Vocabulary.load(argument_path)

    self.tokenizer = BertTokenizer.from_pretrained(self.bert_model)
def __init__(self, node_types_label_list=None, node_attrs_label_list=None,
             p2p_edges_label_list=None, p2r_edges_label_list=None, path=None,
             bert_model='bert-base-cased', max_span_width=15, max_length=128):
    """Set up the tokenizer, the frame ontology and four label vocabularies.

    For each of the four label lists: when the list is non-empty, a new
    vocabulary is built from it and saved under ``path``; otherwise the
    previously saved vocabulary file is loaded from ``path``.

    Args:
        node_types_label_list: Labels for ``self.node_types_vocabulary``
            (file ``node_types_vocabulary.txt``, padding ``"O"``).
        node_attrs_label_list: Labels for ``self.node_attrs_vocabulary``
            (file ``node_attrs_vocabulary.txt``, padding ``"O"``).
        p2p_edges_label_list: Labels for ``self.p2p_edges_vocabulary``
            (file ``p2p_edges_vocabulary.txt``, no padding).
        p2r_edges_label_list: Labels for ``self.p2r_edges_vocabulary``
            (file ``p2r_edges_vocabulary.txt``, no padding).
        path: Directory containing the vocabulary files; also passed to
            ``FrameOntology``. It is joined with file names in every case,
            so the ``None`` default is a placeholder only.
        bert_model: Name or path given to ``BertTokenizer.from_pretrained``.
        max_span_width: Stored unchanged as ``self.max_span_width``.
        max_length: Stored unchanged as ``self.max_length``.
    """
    self.path = path
    self.bert_model = bert_model
    self.max_length = max_length
    self.tokenizer = BertTokenizer.from_pretrained(bert_model)
    self.max_span_width = max_span_width
    self._ontology = FrameOntology(self.path)

    def build_or_load(label_list, file_name, padding):
        """Build and save a vocabulary from label_list, or load the saved one."""
        vocabulary_file = os.path.join(path, file_name)
        if not label_list:
            # No labels supplied (None or empty): reuse the saved file.
            return Vocabulary.load(vocabulary_file)
        vocabulary = Vocabulary(padding=padding, unknown=None)
        vocabulary.add_word_lst(label_list)
        vocabulary.build_vocab()
        vocabulary.save(vocabulary_file)
        return vocabulary

    self.node_types_vocabulary = build_or_load(
        node_types_label_list, 'node_types_vocabulary.txt', "O")
    self.node_attrs_vocabulary = build_or_load(
        node_attrs_label_list, 'node_attrs_vocabulary.txt', "O")
    self.p2p_edges_vocabulary = build_or_load(
        p2p_edges_label_list, 'p2p_edges_vocabulary.txt', None)
    self.p2r_edges_vocabulary = build_or_load(
        p2r_edges_label_list, 'p2r_edges_vocabulary.txt', None)
def __init__(self, bert_model=None, model_path=None, vocabulary_path=None,
             device=None, device_ids=None, max_seq_length=256):
    """Record the configuration and load the optional tokenizer / vocabulary.

    Args:
        bert_model: Name or path for ``BertTokenizer.from_pretrained``. When
            falsy, ``self.tokenizer`` is not set by this constructor.
        model_path: Stored unchanged as ``self.model_path``.
        vocabulary_path: Location for ``Vocabulary.load``. When falsy,
            ``self.vocabulary`` is not set by this constructor.
        device: Stored unchanged as ``self.device``.
        device_ids: Stored unchanged as ``self.device_ids``.
        max_seq_length: Stored unchanged as ``self.max_seq_length``.
    """
    super().__init__()

    # Plain configuration, kept exactly as passed in.
    self.bert_model = bert_model
    self.model_path = model_path
    self.vocabulary_path = vocabulary_path
    self.device = device
    self.device_ids = device_ids
    self.max_seq_length = max_seq_length

    # Optional resources: each is created only when its source was given.
    if bert_model:
        self.tokenizer = BertTokenizer.from_pretrained(bert_model)
    if vocabulary_path:
        self.vocabulary = Vocabulary.load(vocabulary_path)
def __init__(self, schema_path=None, trigger_path=None, argument_path=None,
             bert_model='bert-base-cased', max_length=128):
    """Load the event schema and derive tokenizer, vocabularies and id maps.

    The schema file is a JSON object mapping each trigger type to a list of
    its argument types. From it the constructor derives the type lists, the
    ``'<argument>_s'`` / ``'<argument>_e'`` index maps, the trigger and
    argument vocabularies (loaded from disk if the file exists, otherwise
    built and saved), and ``schema_id``, which maps a trigger index to the
    list of its argument indices.

    Args:
        schema_path: Path of the JSON schema file; it is opened
            unconditionally, so the ``None`` default is a placeholder only.
        trigger_path: File used to load or save the trigger vocabulary.
        argument_path: File used to load or save the argument vocabulary.
        bert_model: Name or path given to ``BertTokenizer.from_pretrained``.
        max_length: Stored unchanged as ``self.max_length``.
    """
    self.schema_path = schema_path
    self.trigger_path = trigger_path
    self.argument_path = argument_path
    self.bert_model = bert_model
    self.max_length = max_length
    self.tokenizer = BertTokenizer.from_pretrained(self.bert_model)

    with open(self.schema_path, 'r', encoding='utf-8') as f:
        self.schema_str = json.load(f)

    # De-duplicate in first-seen schema order. Going through a set here
    # would make the order of these lists depend on string hashing, which
    # varies between interpreter runs, so the indices in args_s_id /
    # args_e_id and in any freshly built vocabulary would not be
    # reproducible.
    self.trigger_type_list = list(self.schema_str)
    self.argument_type_list = list(dict.fromkeys(
        argument_type
        for schema_arguments in self.schema_str.values()
        for argument_type in schema_arguments
    ))

    # Position of every argument type, keyed by '<type>_s' and '<type>_e'
    # (presumably start / end markers -- confirm against the consumer).
    # NOTE(review): these positions come from argument_type_list, not from
    # argument_vocabulary; when the vocabulary is loaded from an existing
    # file the two numberings are not guaranteed to agree -- confirm.
    self.args_s_id = {}
    self.args_e_id = {}
    for index, argument_type in enumerate(self.argument_type_list):
        self.args_s_id[argument_type + '_s'] = index
        self.args_e_id[argument_type + '_e'] = index

    def load_or_build(vocabulary_path, labels):
        """Load the vocabulary file if it exists, else build and save it."""
        if os.path.exists(vocabulary_path):
            return Vocabulary.load(vocabulary_path)
        vocabulary = Vocabulary(padding=None, unknown=None)
        vocabulary.add_word_lst(labels)
        vocabulary.build_vocab()
        vocabulary.save(vocabulary_path)
        return vocabulary

    self.trigger_vocabulary = load_or_build(
        self.trigger_path, self.trigger_type_list)
    self.argument_vocabulary = load_or_build(
        self.argument_path, self.argument_type_list)

    # Schema expressed with vocabulary indices instead of label strings.
    trigger_index = self.trigger_vocabulary.word2idx
    argument_index = self.argument_vocabulary.word2idx
    self.schema_id = {}
    for trigger_type, schema_arguments in self.schema_str.items():
        self.schema_id[trigger_index[trigger_type]] = [
            argument_index[a] for a in schema_arguments
        ]

    self.trigger_type_num = len(self.trigger_vocabulary)
    self.argument_type_num = len(self.argument_vocabulary)

    # Every known label starts with a maximum span length of 1.
    self.trigger_max_span_len = {
        name: 1 for name in self.trigger_vocabulary.word2idx
    }
    self.argument_max_span_len = {
        name: 1 for name in self.argument_vocabulary.word2idx
    }
def load_vocabulary(self, path):
    """Load ``vocabulary.txt`` from directory ``path`` into ``self.vocabulary``.

    Args:
        path: Directory that contains the ``vocabulary.txt`` file.
    """
    vocabulary_file = os.path.join(path, 'vocabulary.txt')
    self.vocabulary = Vocabulary.load(vocabulary_file)