import torch

from data_processing.BERTinizer import SentenceBERTinizer
# BERTGraphRel is assumed importable from this project's model package; its
# import path is not shown in the original snippet.
class BERTGraphRelExtractor(object):
    def __init__(self, info_dict, trained_model_path=None):
        self.info_dict = info_dict
        self.bertinizer = SentenceBERTinizer()
        self.model = BERTGraphRel(
            num_ne=info_dict["entity_vsize"],
            num_rel=info_dict["rel_vsize"],
            embedding_size=self.bertinizer.embedding_size)
        if trained_model_path is not None:
            self.load_trained_model(trained_model_path)

    def load_trained_model(self, path):
        # map_location keeps GPU-trained weights loadable on CPU-only machines.
        self.model.load_state_dict(
            torch.load(path, map_location=lambda storage, loc: storage))
        print("Model", path, "loaded.")

    def get_entities(self, entity_pred_tensor):
        # Per-token argmax over the entity-class logits, mapped back to the
        # (BIO) entity labels stored in the info dict.
        d_id_to_type = self.info_dict["mod_entities_id_to_token_dict"]
        entity_index = entity_pred_tensor.squeeze().argmax(dim=1)
        entity_index_np = entity_index.detach().numpy()
        entity_type = [d_id_to_type[str(i)] for i in entity_index_np]
        return entity_type

    def get_relations(self, rel_pred_tensor):
        # Argmax over the relation classes yields a (seq_len, seq_len) matrix;
        # each non-zero cell is a predicted (head, tail, relation) triple.
        rel_list = []
        d_id_to_type = self.info_dict["mod_relations_id_to_token_dict"]
        rel_matrix_index = rel_pred_tensor.squeeze(dim=2).argmax(dim=2)
        rel_matrix_index_np = rel_matrix_index.detach().numpy()
        e1_index_array, e2_index_array = rel_matrix_index_np.nonzero()
        for e1_index, e2_index in zip(e1_index_array, e2_index_array):
            rel_index = rel_matrix_index_np[e1_index, e2_index]
            rel_list.append([e1_index, e2_index, d_id_to_type[str(rel_index)]])
        return rel_list

    def analyze_sentence(self, sentence):
        # WordPiece-tokenize, embed with BERT, average sub-word embeddings
        # back to base tokens, then run the GraphRel model on the sequence.
        tokens_wp, tokens_ids_wp_tensor, segments_ids_wp_tensors, tokens_base = self.bertinizer.tokenize(
            sentence)
        bert_embeddings = self.bertinizer.get_embeddings(
            tokens_ids_wp_tensor, segments_ids_wp_tensors)
        bert_avg_embeddings = self.bertinizer.average_wp_embeddings(
            bert_embeddings, tokens_wp)
        ne_p1, rel_p1, ne_p2, rel_p2 = self.model(
            bert_avg_embeddings.unsqueeze(dim=1))
        # Decode from the phase-2 (refined) predictions.
        entity_type = self.get_entities(ne_p2)
        rel_list = self.get_relations(rel_p2)
        return tokens_base, entity_type, rel_list
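
For reference, a minimal usage sketch of the extractor; "info.json" and "model.pt" are hypothetical placeholders, and the info dict is assumed to carry the keys read in __init__ plus the id-to-token dicts used by the decode methods.

import json

# Hypothetical file names; substitute the real info dict and trained weights.
with open("info.json") as f:
    info_dict = json.load(f)

extractor = BERTGraphRelExtractor(info_dict, trained_model_path="model.pt")
tokens, entity_types, relations = extractor.analyze_sentence(
    "Barack Obama was born in Hawaii.")
print(list(zip(tokens, entity_types)))
print(relations)  # [[head_index, tail_index, relation_label], ...]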
Example #3
if not os.path.exists(dumb_dataset_dir):
    os.makedirs(dumb_dataset_dir)

train_dumb_dataset_path = os.path.join(dumb_dataset_dir, "dumb_train.json")
test_dumb_dataset_path = os.path.join(dumb_dataset_dir, "dumb_test.json")

if __name__ == "__main__":

    ddsc = DumbDataSetConstructor(config_path=dumb_dataset_config_path)

    print("+ Preparing and saving train dataset.")
    ddsc.generate_dataset(n=6000)
    data, ne_list, rel_list = ddsc.get_dataset()
    ddsc.write_json_dataset(train_dumb_dataset_path)

    print("+ Preparing and saving descriptive json.")
    sentbertnizer = SentenceBERTinizer()
    er_aligner = tgtEntRelConstructor(tokenizer=sentbertnizer,
                                      ne_tags=ne_list,
                                      rel_tags=rel_list)

    num_ne = er_aligner.NE_vsize
    num_rel = er_aligner.REL_vsize

    info_collector = InfoCollector()
    info_collector.remember_info(
        entities=ne_list,
        relations=rel_list,
        entity_vsize=num_ne,
        rel_vsize=num_rel,
        mod_entities=er_aligner.NE_biotags,
        mod_relations=er_aligner.REL_mod_tags,
    )
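
For intuition, a BIO tag set such as er_aligner.NE_biotags is typically built by expanding each entity type into B-/I- variants plus an O tag; the sketch below is illustrative only and may differ from tgtEntRelConstructor's actual construction.

# Illustrative only; not the project's actual NE_biotags construction.
def to_bio_tags(entity_types):
    tags = ["O"]
    for t in entity_types:
        tags += ["B-" + t, "I-" + t]
    return tags

print(to_bio_tags(["PERSON", "LOCATION"]))
# ['O', 'B-PERSON', 'I-PERSON', 'B-LOCATION', 'I-LOCATION']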
Example #4

    # wiki_json_train = "./data/preproc_WikiKBP_json/train.json"
    # pubmed_json_train = "./data/preproc_PubMed_json/train.json"
    nyt_json_train = "./../data/preproc_NYT_json/train.json"
    nyt_json_test = "./../data/preproc_NYT_json/test.json"

    data_nyt_train, NE_LIST, REL_LIST = get_dataset(nyt_json_train, _bert_wp_tokenizer)
    data_nyt_test, _, _ = get_dataset(nyt_json_test, _bert_wp_tokenizer)

    obs = data_nyt_train[5]
    sentence = obs["sentText"]
    entityMentions = obs["entityMentions"]
    relationMentions = obs["relationMentions"]

    from data_processing.BERTinizer import SentenceBERTinizer
    sentbertnizer = SentenceBERTinizer()

    er_aligner = tgtEntRelConstructor(tokenizer=sentbertnizer, ne_tags=NE_LIST, rel_tags=REL_LIST)

    tokens_base = sentbertnizer.base_tokenize(sentence, clean_marking=False)
    ne_tensor, rel_tensor = er_aligner.get_ne_rel_tensors(tokens_base, entityMentions, relationMentions)

    print("+ Original sentence: \t", sentence)
    print("+ Tokenized sentence (without WordPiece): \t", tokens_base)
    print("+ Sentence of entities indices: \t", ne_tensor)
    print()
    print("+ Original NE: \t", entityMentions)
    print("+ BIO NE prepared: \t", er_aligner.NE_biotags)
    print()
    print("+ Size of NE tensor: \t", ne_tensor.size())
    print("+ Size of relation tensor: \t", rel_tensor.size())