def create_pos_tag_input(self):
    '''
    Builds the part-of-speech input files for the train and test datasets,
    incrementally filling the POS tag dictionary as new tags appear, and
    persists both the inputs and the tag dictionary as JSON.
    '''
    dataset_config = self.get_config('dataset')
    pos_config = self.get_config('part_of_speech')
    input_config = self.get_config('input')

    dataset_path = dataset_config.get('path')
    pos_path = pos_config.get('path')
    input_path = input_config.get('path')

    # id 0 is reserved for padding
    pos_tag_dict = {'PAD': 0}
    nlp = spacy.load('pt')

    for dataset_type in self.dataset_types:
        is_train = dataset_type == 'train'
        dataset_file_name = 'train_json' if is_train else 'test_json'
        input_file_name = ('train_pos_tagged_input' if is_train
                           else 'test_pos_tagged_input')

        dataset = file_helper.get_json_file_data(
            dataset_path, dataset_config.get(dataset_file_name))

        input_list = []
        for data in dataset:
            tags = [token.pos_ for token in nlp(data.get('sentence'))]
            for tag in tags:
                self.add_tag_in_pos_tag_dict(pos_tag_dict, tag)
            input_list.append(
                self.include_padding([pos_tag_dict[tag] for tag in tags]))

        file_helper.dict_to_json(
            input_path, input_config.get(input_file_name), input_list, 4)

    file_helper.dict_to_json(
        pos_path, pos_config.get('pos_tag_dict'), pos_tag_dict, 4)
def predict(model, parser, config):
    '''
    Runs the model on the test dataset, saves the predicted output and
    prints the relation-extraction metrics (counts plus exact and partial
    precision/recall/F-measure).

    :param model: trained model exposing a ``predict()`` method
    :param parser: parser exposing ``save_predicted_output(dataset, preds)``
    :param config: configuration object exposing ``get_configuration(name)``
    '''
    # Renamed from ``predict`` — the original local shadowed this function.
    predictions = model.predict()

    dataset_config = config.get_configuration('dataset')
    dataset_path = dataset_config.get('path')
    dataset_test = file_helper.get_json_file_data(
        dataset_path, dataset_config.get('test_json'))

    output = parser.save_predicted_output(dataset_test, predictions)

    number_of_relations = metrics_helper.get_number_of_relations_in_dataset(
        output)
    correct_relations = metrics_helper.get_correct_relations(output)
    number_predicted_relations = metrics_helper.get_number_of_relations_predicted(
        output)
    exact_precision = metrics_helper.get_exact_precision(output)
    exact_recall = metrics_helper.get_exact_recall(output)
    exact_f_measure = metrics_helper.get_exact_f_measure(output)
    partial_precision = metrics_helper.get_partial_precision(output)
    partial_recall = metrics_helper.get_partial_recall(output)
    partial_f_measure = metrics_helper.get_partial_f_measure(output)

    print(f'number of relations in dataset: {number_of_relations}')
    print(f'number of correct relations: {correct_relations}')
    print(f'number of predicted relations: {number_predicted_relations}')
    print(f'exact precision: {exact_precision}')
    print(f'exact recall: {exact_recall}')
    print(f'exact f-measure: {exact_f_measure}')
    print(f'partial precision: {partial_precision}')
    print(f'partial recall: {partial_recall}')
    print(f'partial f-measure: {partial_f_measure}')
def process_individual_dataset_to_word_to_id(self, path, file_name, word_to_id_dict, reverse_dict):
    '''
    Processes one dataset file: feeds its contents into the word-to-id and
    reverse dictionaries, then rewrites the dataset file as JSON.
    '''
    dataset = file_helper.get_json_file_data(path, file_name)
    self.add_dataset_to_word_to_id(dataset, word_to_id_dict, reverse_dict)
    file_helper.dict_to_json(path, file_name, dataset, 4)
def create_word_embeddings_weight(self):
    '''
    Builds the word-embedding weight matrix used by the model and writes
    it to the configured input file. Words missing from the pre-trained
    embeddings keep the empty/default row.
    '''
    embeddings_config = self.get_config('word_embeddings')
    embeddings = file_helper.get_json_file_data(
        embeddings_config.get('path'),
        embeddings_config.get('word_embeddings_json'))
    dimensions = embeddings_config.get('dimensions')

    word_to_id_config = self.get_config('word_to_id')
    word_to_id = file_helper.get_json_file_data(
        word_to_id_config.get('path'), word_to_id_config.get('dict'))

    # one extra row beyond the vocabulary — presumably for the padding id;
    # TODO(review): confirm against the id assignment scheme
    weights = self.create_empty_word_embeddings_weight_list(
        len(word_to_id) + 1, dimensions)

    for word, index in word_to_id.items():
        vector = embeddings.get(word)
        if vector is not None:
            weights[index] = vector

    input_config = self.get_config('input')
    file_helper.dict_to_json(
        input_config.get('path'),
        input_config.get('word_embeddings_weight'),
        weights, 4)
def create_output_for_model(self):
    '''
    Creates the model output files (sentence outputs) for the train and
    test datasets.
    '''
    output_config = self.get_config('output')
    dataset_config = self.get_config('dataset')

    for dataset_type in self.dataset_types:
        if dataset_type == 'train':
            dataset_filename, output_filename = (
                'train_json', 'train_sentence_output')
        else:
            dataset_filename, output_filename = (
                'test_json', 'test_sentence_output')

        dataset = file_helper.get_json_file_data(
            dataset_config.get('path'),
            dataset_config.get(dataset_filename))
        sentences_output = self.parse_output_sentence(dataset)
        file_helper.dict_to_json(
            output_config.get('path'),
            output_config.get(output_filename),
            sentences_output, 4)
def initialize_outputs(self):
    '''
    Initializes every output array used by the model, loading the train
    and test sentence outputs from their configured JSON files into
    numpy arrays.
    '''
    # Removed the commented-out debug ``print`` blocks (kept as bare
    # triple-quoted string literals in the original, including a dangling
    # unterminated ``'''`` at the end of the function).
    outputs_config = self.get_config('output')
    path = outputs_config.get('path')

    # train output
    self.train_sentences_output = np.asarray(
        file_helper.get_json_file_data(
            path, outputs_config.get('train_sentence_output')))

    # test output
    self.test_sentences_output = np.asarray(
        file_helper.get_json_file_data(
            path, outputs_config.get('test_sentence_output')))
def parse_inputs_for_model(self):
    '''
    Converts word inputs into numeric form to better feed the model:
    builds the sentence, entity, POS-tag and embedding-weight inputs.
    '''
    word_to_id_config = self.get_config('word_to_id')
    word_to_id = file_helper.get_json_file_data(
        word_to_id_config.get('path'),
        word_to_id_config.get('dict'))

    self.create_sentence_input(word_to_id)
    self.create_entity_input()
    self.create_pos_tag_input()
    self.create_word_embeddings_weight()
def create_entity_input(self):
    '''
    Creates the entity_input files that will be used as model input for
    the train and test datasets.
    '''
    dataset_config = self.get_config('dataset')
    input_config = self.get_config('input')
    dataset_path = dataset_config.get('path')
    input_path = input_config.get('path')

    for dataset_type in self.dataset_types:
        if dataset_type == 'train':
            dataset_key, input_key = 'train_json', 'train_entity_input'
        else:
            dataset_key, input_key = 'test_json', 'test_entity_input'

        dataset = file_helper.get_json_file_data(
            dataset_path, dataset_config.get(dataset_key))
        entity_input = self.parse_entity_input(dataset)
        file_helper.dict_to_json(
            input_path, input_config.get(input_key), entity_input, 4)
def save_words_in_relation(self):
    '''
    Counts, per dataset, how often each word appears in the relation
    strings and saves the sorted counts to a txt file.
    '''
    from collections import Counter  # stdlib; replaces hand-rolled counting

    dataset_config = self.get_config('dataset')
    path = dataset_config.get('path')

    for dataset_type in self.dataset_types:
        is_train = dataset_type == 'train'
        dataset_file = 'train_json' if is_train else 'test_json'
        words_in_relation_file = ('train_words_in_relation' if is_train
                                  else 'test_words_in_relation')

        dataset = file_helper.get_json_file_data(
            path, dataset_config.get(dataset_file))

        # split(' ') kept on purpose — split() would collapse runs of spaces
        word_counts = Counter(
            word
            for data in dataset
            for word in data.get('relation').split(' '))

        file_helper.save_txt_file(
            path, dataset_config.get(words_in_relation_file),
            self.sort_data_to_txt(dict(word_counts)))
def save_full_relation_in_sentence(self):
    '''
    Counts, per dataset, how often each full relation string occurs and
    saves the sorted counts to a txt file.
    '''
    from collections import Counter  # stdlib; replaces hand-rolled counting

    dataset_config = self.get_config('dataset')
    path = dataset_config.get('path')

    for dataset_type in self.dataset_types:
        is_train = dataset_type == 'train'
        dataset_file = 'train_json' if is_train else 'test_json'
        full_relation_file = ('train_full_relation' if is_train
                              else 'test_full_relation')

        dataset = file_helper.get_json_file_data(
            path, dataset_config.get(dataset_file))

        relation_counts = Counter(
            data.get('relation') for data in dataset)

        file_helper.save_txt_file(
            path, dataset_config.get(full_relation_file),
            self.sort_data_to_txt(dict(relation_counts)))
def relation_to_id_json(self):
    '''
    Assigns an id to every relation found in the train dataset and writes
    the resulting mapping to the configured JSON file.
    '''
    dataset_config = self.get_config('dataset')
    train_dataset = file_helper.get_json_file_data(
        dataset_config.get('path'), dataset_config.get('train_json'))
    relation_config = self.get_config('relation')

    # the first relation must be NA, keeping the current id (expected 0)
    relation_dict = {'NA': self.relation_id}

    for entry in train_dataset:
        relation = entry.get('relation')
        if relation not in relation_dict:
            self.increment_relation_id()
            relation_dict[relation] = self.relation_id

    file_helper.dict_to_json(
        relation_config.get('path'),
        relation_config.get('file_name'),
        relation_dict, 4)
def entities_types_to_id(self):
    '''
    Builds the forward and reverse dictionaries for the entity types
    present in the train dataset and persists both as JSON.
    '''
    entities_type_dict = {}
    reverse_entities_type_dict = {}

    dataset_config = self.get_config('dataset')
    entities_config = self.get_config('entities')
    path = entities_config.get('path')

    train_dataset = file_helper.get_json_file_data(
        dataset_config.get('path'), dataset_config.get('train_json'))

    for sentence in train_dataset:
        # register both ends of the relation
        self.add_data_to_entities_dict(
            'head', sentence, entities_type_dict, reverse_entities_type_dict)
        self.add_data_to_entities_dict(
            'tail', sentence, entities_type_dict, reverse_entities_type_dict)

    file_helper.dict_to_json(
        path, entities_config.get('entities_to_id'),
        entities_type_dict, 4)
    file_helper.dict_to_json(
        path, entities_config.get('reverse_entities_to_id'),
        reverse_entities_type_dict, 4)
def save_entities_relation(self):
    '''
    Counts, per dataset, how often each head-category/tail-category pair
    occurs and saves the sorted counts to a txt file.
    '''
    from collections import Counter  # stdlib; replaces hand-rolled counting

    dataset_config = self.get_config('dataset')
    path = dataset_config.get('path')

    for dataset_type in self.dataset_types:
        is_train = dataset_type == 'train'
        dataset_file = 'train_json' if is_train else 'test_json'
        entities_relation_file = ('train_entities_relation' if is_train
                                  else 'test_entities_relation')

        dataset = file_helper.get_json_file_data(
            path, dataset_config.get(dataset_file))

        pair_counts = Counter()
        for data in dataset:
            head = data.get('head').get('category')
            tail = data.get('tail').get('category')
            pair_counts[f'{head}-{tail}'] += 1

        file_helper.save_txt_file(
            path, dataset_config.get(entities_relation_file),
            self.sort_data_to_txt(dict(pair_counts)))
def save_number_of_entities_in_dataset(self):
    '''
    Counts, per dataset, how often each entity category occurs (looking at
    both head and tail) and saves the sorted counts to a txt file.
    '''
    from collections import Counter  # stdlib; replaces hand-rolled counting

    dataset_config = self.get_config('dataset')
    path = dataset_config.get('path')

    for dataset_type in self.dataset_types:
        is_train = dataset_type == 'train'
        dataset_file = 'train_json' if is_train else 'test_json'
        entities_number_file = ('train_entities_number' if is_train
                                else 'test_entities_number')

        dataset = file_helper.get_json_file_data(
            path, dataset_config.get(dataset_file))

        category_counts = Counter(
            data.get(position).get('category')
            for data in dataset
            for position in ('head', 'tail'))

        file_helper.save_txt_file(
            path, dataset_config.get(entities_number_file),
            self.sort_data_to_txt(dict(category_counts)))
def initialize_inputs(self):
    '''
    Initializes every input array used by the model: the pre-trained
    word-embedding weights plus the train/test sentence, entity and
    POS-tagged inputs, each loaded from its configured JSON file into a
    numpy array.
    '''
    # Removed the commented-out debug ``print`` blocks (kept as bare
    # triple-quoted string literals in the original, including a dangling
    # unterminated ``'''`` at the end of the function).
    inputs_config = self.get_config('input')
    path = inputs_config.get('path')

    # pre-trained word-embedding weights
    self.word_embeddings_matrix = np.asarray(
        file_helper.get_json_file_data(
            path, inputs_config.get('word_embeddings_weight')))

    # train inputs
    self.train_sentences_input = np.asarray(
        file_helper.get_json_file_data(
            path, inputs_config.get('train_sentence_input')))
    self.train_entities_input = np.asarray(
        file_helper.get_json_file_data(
            path, inputs_config.get('train_entity_input')))
    self.train_pos_tagged_input = np.asarray(
        file_helper.get_json_file_data(
            path, inputs_config.get('train_pos_tagged_input')))

    # test inputs
    self.test_sentences_input = np.asarray(
        file_helper.get_json_file_data(
            path, inputs_config.get('test_sentence_input')))
    self.test_entities_input = np.asarray(
        file_helper.get_json_file_data(
            path, inputs_config.get('test_entity_input')))
    self.test_pos_tagged_input = np.asarray(
        file_helper.get_json_file_data(
            path, inputs_config.get('test_pos_tagged_input')))
def __init__(self, path, file_name):
    '''
    Loads the JSON configuration file found at ``path``/``file_name``
    into ``self.config``.
    '''
    self.config = file_helper.get_json_file_data(path, file_name)