Esempio n. 1
0
 def create_pos_tag_input(self):
     '''
     Create the part-of-speech input file for each dataset split while
     building the POS-tag-to-id dictionary as a by-product.

     Each sentence is tagged with spaCy, every tag is registered in the tag
     dictionary (id 0 reserved for padding), the tag sequence is converted to
     padded ids, and one input file per split is written along with the final
     tag dictionary.
     '''
     dataset_config = self.get_config('dataset')
     dataset_path = dataset_config.get('path')
     pos_config = self.get_config('part_of_speech')
     pos_path = pos_config.get('path')
     input_config = self.get_config('input')
     input_path = input_config.get('path')
     # 'PAD' must map to 0 so padded positions stay distinguishable
     pos_tag_dict = { 'PAD' : 0 }
     nlp = spacy.load('pt')
     for dataset_type in self.dataset_types:
         input_list = []
         dataset_file_name = 'train_json' if dataset_type == 'train' else 'test_json'
         input_file_name = 'train_pos_tagged_input' if dataset_type == 'train' else 'test_pos_tagged_input'
         dataset = file_helper.get_json_file_data(dataset_path, dataset_config.get(dataset_file_name))
         for data in dataset:
             sentence = data.get('sentence')
             tagged_sentence = [token.pos_ for token in nlp(sentence)]
             for tag in tagged_sentence:
                 self.add_tag_in_pos_tag_dict(pos_tag_dict, tag)
             input_data = self.include_padding([pos_tag_dict[tag] for tag in tagged_sentence])
             input_list.append(input_data)
         # write once per split — the original rewrote the whole JSON file
         # inside the inner loop, once for every single sentence
         file_helper.dict_to_json(input_path, input_config.get(input_file_name), input_list, 4)

     file_helper.dict_to_json(pos_path, pos_config.get('pos_tag_dict'), pos_tag_dict, 4)
Esempio n. 2
0
def predict(model, parser, config):
    '''
    Run the model's prediction, save the predicted output through the parser,
    compute evaluation metrics on it and print them.

    Parameters:
        model: trained model exposing a predict() method.
        parser: object that pairs the test dataset with predictions via
            save_predicted_output().
        config: configuration object exposing get_configuration(section).
    '''
    # do not reuse the function's own name for the local result
    predictions = model.predict()
    dataset_config = config.get_configuration('dataset')
    dataset_path = dataset_config.get('path')
    dataset_test = file_helper.get_json_file_data(
        dataset_path, dataset_config.get('test_json'))
    output = parser.save_predicted_output(dataset_test, predictions)
    number_of_relations = metrics_helper.get_number_of_relations_in_dataset(
        output)
    correct_relations = metrics_helper.get_correct_relations(output)
    number_predicted_relations = metrics_helper.get_number_of_relations_predicted(
        output)
    exact_precision = metrics_helper.get_exact_precision(output)
    exact_recall = metrics_helper.get_exact_recall(output)
    exact_f_measure = metrics_helper.get_exact_f_measure(output)
    partial_precision = metrics_helper.get_partial_precision(output)
    partial_recall = metrics_helper.get_partial_recall(output)
    partial_f_measure = metrics_helper.get_partial_f_measure(output)
    print(f'number of relations in dataset: {number_of_relations}')
    print(f'number of correct relations: {correct_relations}')
    print(f'number of predicted relations: {number_predicted_relations}')
    print(f'exact precision: {exact_precision}')
    print(f'exact recall: {exact_recall}')
    print(f'exact f-measure: {exact_f_measure}')
    print(f'partial precision: {partial_precision}')
    print(f'partial recall: {partial_recall}')
    print(f'partial f-measure: {partial_f_measure}')
Esempio n. 3
0
 def process_individual_dataset_to_word_to_id(self, path, file_name, word_to_id_dict, reverse_dict):
     '''
     Process one dataset file into word_to_id_dict and reverse_dict,
     then write the (possibly updated) dataset back to the same file.
     '''
     dataset = file_helper.get_json_file_data(path, file_name)
     self.add_dataset_to_word_to_id(dataset, word_to_id_dict, reverse_dict)
     file_helper.dict_to_json(path, file_name, dataset, 4)
Esempio n. 4
0
 def create_word_embeddings_weight(self):
     '''
     Build the word-embeddings weight matrix used by the model and save it.

     For each word in word_to_id, the row at the word's id is filled with its
     pretrained embedding vector when one exists; other rows keep the empty
     initialisation produced by create_empty_word_embeddings_weight_list.
     '''
     embeddings_config = self.get_config('word_embeddings')
     embeddings = file_helper.get_json_file_data(embeddings_config.get('path'), embeddings_config.get('word_embeddings_json'))
     dimensions = embeddings_config.get('dimensions')
     word_to_id_config = self.get_config('word_to_id')
     word_to_id = file_helper.get_json_file_data(word_to_id_config.get('path'), word_to_id_config.get('dict'))
     # one extra row beyond the vocabulary size, as in the original
     weights = self.create_empty_word_embeddings_weight_list(len(word_to_id) + 1, dimensions)
     for word, index in word_to_id.items():
         vector = embeddings.get(word)
         if vector is not None:
             weights[index] = vector

     input_config = self.get_config('input')
     file_helper.dict_to_json(input_config.get('path'), input_config.get('word_embeddings_weight'), weights, 4)
Esempio n. 5
0
 def create_output_for_model(self):
     '''
     Create the model's output file for every dataset split.
     '''
     output_config = self.get_config('output')
     dataset_config = self.get_config('dataset')
     dataset_path = dataset_config.get('path')
     output_path = output_config.get('path')
     for dataset_type in self.dataset_types:
         is_train = dataset_type == 'train'
         dataset_filename = 'train_json' if is_train else 'test_json'
         output_filename = 'train_sentence_output' if is_train else 'test_sentence_output'
         dataset = file_helper.get_json_file_data(dataset_path, dataset_config.get(dataset_filename))
         parsed_output = self.parse_output_sentence(dataset)
         file_helper.dict_to_json(output_path, output_config.get(output_filename), parsed_output, 4)
Esempio n. 6
0
    def initialize_outputs(self):
        '''
        Initialize all outputs used by the model.

        Loads the train and test sentence-output JSON files and stores them
        on the instance as numpy arrays.
        '''
        outputs_config = self.get_config('output')
        path = outputs_config.get('path')

        # training output
        self.train_sentences_output = np.asarray(
            file_helper.get_json_file_data(
                path, outputs_config.get('train_sentence_output')))

        # test output
        # (removed commented-out debug prints; the scrape also left an
        # unterminated triple-quote after this assignment, which was a
        # syntax error)
        self.test_sentences_output = np.asarray(
            file_helper.get_json_file_data(
                path, outputs_config.get('test_sentence_output')))
Esempio n. 7
0
    def parse_inputs_for_model(self):
        '''
        Convert the word-based inputs to numeric form so they can be fed
        to the model.
        '''
        word_to_id_config = self.get_config('word_to_id')
        word_to_id = file_helper.get_json_file_data(
            word_to_id_config.get('path'), word_to_id_config.get('dict'))

        self.create_sentence_input(word_to_id)
        self.create_entity_input()
        self.create_pos_tag_input()
        self.create_word_embeddings_weight()
Esempio n. 8
0
 def create_entity_input(self):
     '''
     Create the entity_input file used as one of the model's inputs,
     one file per dataset split.
     '''
     dataset_config = self.get_config('dataset')
     input_config = self.get_config('input')
     dataset_path = dataset_config.get('path')
     input_path = input_config.get('path')
     for dataset_type in self.dataset_types:
         is_train = dataset_type == 'train'
         dataset_type_filename = 'train_json' if is_train else 'test_json'
         input_type_filename = 'train_entity_input' if is_train else 'test_entity_input'
         dataset = file_helper.get_json_file_data(dataset_path, dataset_config.get(dataset_type_filename))
         parsed_entities = self.parse_entity_input(dataset)
         file_helper.dict_to_json(input_path, input_config.get(input_type_filename), parsed_entities, 4)
Esempio n. 9
0
 def save_words_in_relation(self):
     '''
     Count how often each word appears in the relation strings of every
     dataset split and save the sorted counts as one text file per split.
     '''
     dataset_config = self.get_config('dataset')
     path = dataset_config.get('path')
     for dataset_type in self.dataset_types:
         dataset_file = 'train_json' if dataset_type == 'train' else 'test_json'
         words_in_relation_file = 'train_words_in_relation' if dataset_type == 'train' else 'test_words_in_relation'
         dataset = file_helper.get_json_file_data(path, dataset_config.get(dataset_file))
         word_counts = {}
         for data in dataset:
             for word in data.get('relation').split(' '):
                 # single-lookup increment instead of get()/is-None branching
                 word_counts[word] = word_counts.get(word, 0) + 1
         local_str = self.sort_data_to_txt(word_counts)
         file_helper.save_txt_file(path, dataset_config.get(words_in_relation_file), local_str)
Esempio n. 10
0
 def save_full_relation_in_sentence(self):
     '''
     Count the occurrences of each full relation string in every dataset
     split and save the sorted counts as one text file per split.
     '''
     dataset_config = self.get_config('dataset')
     path = dataset_config.get('path')
     for dataset_type in self.dataset_types:
         dataset_file = 'train_json' if dataset_type == 'train' else 'test_json'
         full_relation_file = 'train_full_relation' if dataset_type == 'train' else 'test_full_relation'
         dataset = file_helper.get_json_file_data(path, dataset_config.get(dataset_file))
         relation_counts = {}
         for data in dataset:
             relation = data.get('relation')
             # single-lookup increment instead of get()/is-None branching
             relation_counts[relation] = relation_counts.get(relation, 0) + 1
         local_str = self.sort_data_to_txt(relation_counts)
         file_helper.save_txt_file(path, dataset_config.get(full_relation_file), local_str)
Esempio n. 11
0
 def relation_to_id_json(self):
     '''
     Assign a numeric id to every relation found in the training dataset
     and persist the resulting relation -> id dictionary as JSON.
     '''
     dataset_config = self.get_config('dataset')
     train_data = file_helper.get_json_file_data(dataset_config.get('path'), dataset_config.get('train_json'))
     relation_config = self.get_config('relation')
     # the first relation must be NA with the initial id (0)
     relation_dict = { 'NA' : self.relation_id }
     for line in train_data:
         relation = line.get('relation')
         if relation not in relation_dict:
             self.increment_relation_id()
             relation_dict[relation] = self.relation_id

     file_helper.dict_to_json(relation_config.get('path'), relation_config.get('file_name'), relation_dict, 4)
Esempio n. 12
0
    def entities_types_to_id(self):
        '''
        Build the forward and reverse dictionaries for the entity types found
        in the training dataset and persist both as JSON files.
        '''
        entities_type_dict = {}
        reverse_entities_type_dict = {}
        dataset_config = self.get_config('dataset')
        entities_config = self.get_config('entities')
        path = entities_config.get('path')
        train_dataset = file_helper.get_json_file_data(dataset_config.get('path'), dataset_config.get('train_json'))

        for sentence in train_dataset:
            # register both ends of the relation for every sentence
            self.add_data_to_entities_dict('head', sentence, entities_type_dict, reverse_entities_type_dict)
            self.add_data_to_entities_dict('tail', sentence, entities_type_dict, reverse_entities_type_dict)

        file_helper.dict_to_json(path, entities_config.get('entities_to_id'), entities_type_dict, 4)
        file_helper.dict_to_json(path, entities_config.get('reverse_entities_to_id'), reverse_entities_type_dict, 4)
Esempio n. 13
0
 def save_entities_relation(self):
     '''
     Count the occurrences of each head-category/tail-category pair in every
     dataset split and save the sorted counts as one text file per split.
     '''
     dataset_config = self.get_config('dataset')
     path = dataset_config.get('path')
     for dataset_type in self.dataset_types:
         dataset_file = 'train_json' if dataset_type == 'train' else 'test_json'
         entities_relation_file = 'train_entities_relation' if dataset_type == 'train' else 'test_entities_relation'
         dataset = file_helper.get_json_file_data(path, dataset_config.get(dataset_file))
         pair_counts = {}
         for data in dataset:
             head = data.get('head').get('category')
             tail = data.get('tail').get('category')
             relation = f'{head}-{tail}'
             # single-lookup increment instead of get()/is-None branching
             pair_counts[relation] = pair_counts.get(relation, 0) + 1
         local_str = self.sort_data_to_txt(pair_counts)
         file_helper.save_txt_file(path, dataset_config.get(entities_relation_file), local_str)
Esempio n. 14
0
    def save_number_of_entities_in_dataset(self):
        '''
        Count how many times each entity category appears (as head or tail)
        in every dataset split and save the sorted counts as one text file
        per split.
        '''
        dataset_config = self.get_config('dataset')
        path = dataset_config.get('path')
        for dataset_type in self.dataset_types:
            dataset_file = 'train_json' if dataset_type == 'train' else 'test_json'
            entities_number_file = 'train_entities_number' if dataset_type == 'train' else 'test_entities_number'
            dataset = file_helper.get_json_file_data(path, dataset_config.get(dataset_file))
            entity_counts = {}
            for data in dataset:
                for position in ['head', 'tail']:
                    entity = data.get(position).get('category')
                    # single-lookup increment instead of get()/is-None branching
                    entity_counts[entity] = entity_counts.get(entity, 0) + 1

            local_str = self.sort_data_to_txt(entity_counts)
            file_helper.save_txt_file(path, dataset_config.get(entities_number_file), local_str)
Esempio n. 15
0
    def initialize_inputs(self):
        '''
        Initialize all inputs used by the model.

        Loads the pretrained word-embeddings weight matrix and the train/test
        sentence, entity and POS-tagged input JSON files, storing each on the
        instance as a numpy array.
        '''
        inputs_config = self.get_config('input')
        path = inputs_config.get('path')

        # pretrained word-embeddings weights
        self.word_embeddings_matrix = np.asarray(
            file_helper.get_json_file_data(
                path, inputs_config.get('word_embeddings_weight')))

        # training inputs
        self.train_sentences_input = np.asarray(
            file_helper.get_json_file_data(
                path, inputs_config.get('train_sentence_input')))
        self.train_entities_input = np.asarray(
            file_helper.get_json_file_data(
                path, inputs_config.get('train_entity_input')))
        self.train_pos_tagged_input = np.asarray(
            file_helper.get_json_file_data(
                path, inputs_config.get('train_pos_tagged_input')))

        # test inputs
        # (removed commented-out debug prints; the scrape also left an
        # unterminated triple-quote after the last assignment, which was a
        # syntax error)
        self.test_sentences_input = np.asarray(
            file_helper.get_json_file_data(
                path, inputs_config.get('test_sentence_input')))
        self.test_entities_input = np.asarray(
            file_helper.get_json_file_data(
                path, inputs_config.get('test_entity_input')))
        self.test_pos_tagged_input = np.asarray(
            file_helper.get_json_file_data(
                path, inputs_config.get('test_pos_tagged_input')))
0
 def __init__(self, path, file_name):
     '''
     Load the JSON file at path/file_name and store its data in self.config.
     '''
     self.config = file_helper.get_json_file_data(path, file_name)