Ejemplo n.º 1
0
 def _get_valid_dataset_filepaths(self, parameters, dataset_types=('train', 'valid', 'test', 'deploy')):
     """Resolve the CoNLL file and brat folder to use for each dataset split.

     For every entry of ``dataset_types`` this looks under
     ``parameters['dataset_text_folder']`` for a non-empty ``<type>.txt``
     CoNLL file and for a ``<type>/`` folder holding at least one ``*.txt``
     brat text file, then:

     * both present: the compatibility checks are run, using
       ``<type>_compatible_with_brat.txt`` instead of ``<type>.txt`` when
       that file already exists;
     * only the CoNLL file: brat files are generated from it with
       ``conll_to_brat.conll_to_brat`` and
       ``<type>_compatible_with_brat.txt`` is selected;
     * only the brat folder: ``<type>_<tokenizer>.txt`` is selected; it is
       checked against the brat text when it exists, otherwise it is
       generated with ``brat_to_conll.brat_to_conll``;
     * neither: the split is left out of both returned dictionaries.

     When ``parameters['tagging_format'] == 'bioes'`` the selected file is
     additionally passed to ``utils_nlp.convert_conll_from_bio_to_bioes`` and
     the resulting ``<basename>_bioes.txt`` is returned instead.

     Args:
         parameters: Mapping that must provide ``'dataset_text_folder'`` and
             ``'tagging_format'``; ``'tokenizer'`` and ``'spacylanguage'``
             are read only for splits that have brat files but no CoNLL
             file.
         dataset_types: Iterable of split names to look for.

     Returns:
         A ``(dataset_filepaths, dataset_brat_folders)`` tuple of dicts keyed
         by split name. Only the splits that could be resolved appear.
     """
     text_folder = parameters['dataset_text_folder']
     dataset_filepaths = {}
     dataset_brat_folders = {}

     for dataset_type in dataset_types:
         conll_filepath = os.path.join(text_folder, '{0}.txt'.format(dataset_type))
         brat_folder = os.path.join(text_folder, dataset_type)
         compatible_filepath = os.path.join(text_folder, '{0}_compatible_with_brat.txt'.format(dataset_type))

         has_conll = os.path.isfile(conll_filepath) and os.path.getsize(conll_filepath) > 0
         # glob returns an empty list for a missing folder, so no separate
         # existence test is needed.
         has_brat_text = bool(glob.glob(os.path.join(brat_folder, '*.txt')))

         if not has_conll and not has_brat_text:
             # Nothing usable for this split: leave it out of the results.
             continue

         if has_conll and has_brat_text:
             # Check compatibility between conll and brat files
             brat_to_conll.check_brat_annotation_and_text_compatibility(brat_folder)
             if os.path.exists(compatible_filepath):
                 conll_filepath = compatible_filepath
             conll_to_brat.check_compatibility_between_conll_and_brat_text(conll_filepath, brat_folder)
         elif has_conll:
             # Populate brat text and annotation files based on conll file
             conll_to_brat.conll_to_brat(conll_filepath, compatible_filepath, brat_folder, brat_folder)
             conll_filepath = compatible_filepath
         else:
             tokenizer_filepath = os.path.join(text_folder, '{0}_{1}.txt'.format(dataset_type, parameters['tokenizer']))
             if os.path.exists(tokenizer_filepath):
                 conll_to_brat.check_compatibility_between_conll_and_brat_text(tokenizer_filepath, brat_folder)
             else:
                 # Populate conll file based on brat files
                 brat_to_conll.brat_to_conll(brat_folder, tokenizer_filepath, parameters['tokenizer'], parameters['spacylanguage'])
             conll_filepath = tokenizer_filepath

         if parameters['tagging_format'] == 'bioes':
             # Generate conll file with BIOES format
             bioes_filepath = os.path.join(text_folder, '{0}_bioes.txt'.format(utils.get_basename_without_extension(conll_filepath)))
             utils_nlp.convert_conll_from_bio_to_bioes(conll_filepath, bioes_filepath)
             conll_filepath = bioes_filepath

         # Record the split only once it has been fully resolved.
         dataset_filepaths[dataset_type] = conll_filepath
         dataset_brat_folders[dataset_type] = brat_folder

     return dataset_filepaths, dataset_brat_folders
Ejemplo n.º 2
0
    def _get_valid_dataset_filepaths(self, parameters, dataset_types=('train', 'valid', 'test', 'deploy')):
        """Preprocess the input datasets and resolve the files to use per split.

        For every entry of ``dataset_types`` this looks under
        ``parameters['dataset_text_folder']`` for a non-empty ``<type>.txt``
        CoNLL file and for a ``<type>/`` folder holding at least one ``*.txt``
        brat text file, then:

        * both present: the compatibility checks are run, using
          ``<type>_compatible_with_brat.txt`` instead of ``<type>.txt`` when
          that file already exists;
        * only the CoNLL file: brat files are generated from it with
          ``conll_to_brat.conll_to_brat`` and
          ``<type>_compatible_with_brat.txt`` is selected;
        * only the brat folder: ``<type>_<tokenizer>.txt`` is selected; it is
          checked against the brat text when it exists, otherwise it is
          generated with ``brat_to_conll.brat_to_conll``;
        * neither: the split is left out of both returned dictionaries.

        When ``parameters['tagging_format'] == 'bioes'`` the selected file is
        additionally passed to ``utils_nlp.convert_conll_from_bio_to_bioes``
        and the resulting ``<basename>_bioes.txt`` is returned instead.

        Args:
            parameters: The program-wide parameters mapping. It must provide
                ``'dataset_text_folder'`` and ``'tagging_format'``;
                ``'tokenizer'`` and ``'spacylanguage'`` are read only for
                splits that have brat files but no CoNLL file.
            dataset_types: Iterable of split names to look for.

        Returns:
            A ``(dataset_filepaths, dataset_brat_folders)`` tuple of dicts
            keyed by split name. Every key is optional, so the dicts need not
            contain all four splits. For example::

                (
                    {"train": "<text_folder>/train[_compatible_with_brat][_bioes].txt",
                     "valid": "<text_folder>/valid_<tokenizer>[_bioes].txt"},
                    {"train": "<text_folder>/train",
                     "valid": "<text_folder>/valid"},
                )
        """
        text_folder = parameters['dataset_text_folder']
        dataset_filepaths = {}
        dataset_brat_folders = {}

        for dataset_type in dataset_types:
            conll_filepath = os.path.join(text_folder, '{0}.txt'.format(dataset_type))
            brat_folder = os.path.join(text_folder, dataset_type)
            compatible_filepath = os.path.join(text_folder, '{0}_compatible_with_brat.txt'.format(dataset_type))

            has_conll = os.path.isfile(conll_filepath) and os.path.getsize(conll_filepath) > 0
            # glob returns an empty list for a missing folder, so no separate
            # existence test is needed.
            has_brat_text = bool(glob.glob(os.path.join(brat_folder, '*.txt')))

            if not has_conll and not has_brat_text:
                # Nothing usable for this split: leave it out of the results.
                continue

            if has_conll and has_brat_text:
                # Check compatibility between conll and brat files
                brat_to_conll.check_brat_annotation_and_text_compatibility(brat_folder)
                if os.path.exists(compatible_filepath):
                    conll_filepath = compatible_filepath
                conll_to_brat.check_compatibility_between_conll_and_brat_text(conll_filepath, brat_folder)
            elif has_conll:
                # Populate brat text and annotation files based on conll file
                conll_to_brat.conll_to_brat(conll_filepath, compatible_filepath, brat_folder, brat_folder)
                conll_filepath = compatible_filepath
            else:
                tokenizer_filepath = os.path.join(text_folder, '{0}_{1}.txt'.format(dataset_type, parameters['tokenizer']))
                if os.path.exists(tokenizer_filepath):
                    conll_to_brat.check_compatibility_between_conll_and_brat_text(tokenizer_filepath, brat_folder)
                else:
                    # Populate conll file based on brat files
                    brat_to_conll.brat_to_conll(brat_folder, tokenizer_filepath, parameters['tokenizer'], parameters['spacylanguage'])
                conll_filepath = tokenizer_filepath

            if parameters['tagging_format'] == 'bioes':
                # Generate conll file with BIOES format
                bioes_filepath = os.path.join(text_folder, '{0}_bioes.txt'.format(utils.get_basename_without_extension(conll_filepath)))
                utils_nlp.convert_conll_from_bio_to_bioes(conll_filepath, bioes_filepath)
                conll_filepath = bioes_filepath

            # Record the split only once it has been fully resolved.
            dataset_filepaths[dataset_type] = conll_filepath
            dataset_brat_folders[dataset_type] = brat_folder

        return dataset_filepaths, dataset_brat_folders