Example 1
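# Orchestrates training of coreferee models for one language: reads the
# language's config.cfg, instantiates the requested training-data loader
# classes, loads the spaCy models each outstanding config entry needs, and
# prepares the models, log and temporary directories.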
class TrainingManager:
    def __init__(self, root_path: str, lang: str, loader_classes: str,
                 data_dir: str, log_dir: str):
        self.file_system_root = pkg_resources.resource_filename(root_path, '')
        relative_config_filename = os.sep.join(('lang', lang, 'config.cfg'))
        if not pkg_resources.resource_exists(root_path,
                                             relative_config_filename):
            raise LanguageNotSupportedError(lang)
        self.config = Config().from_disk(
            os.sep.join((self.file_system_root, relative_config_filename)))
        loader_classnames = loader_classes.split(',')
        self.loaders = []
        for loader_classname in loader_classnames:
            class_ = getattr(sys.modules['coreferee.training.loaders'],
                             loader_classname)
            self.loaders.append(class_())
        self.lang = lang
        self.models_dirname = os.sep.join(
            (self.file_system_root, '..', 'models', lang))
        if not os.path.isdir(self.models_dirname):
            self.set_up_models_dir()

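        # Work out which config entries still lack a trained model and load the
        # spaCy models (and any vectors models) those entries require.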
        self.relevant_config_entry_names = []
        self.nlp_dict = {}
        for config_entry_name, config_entry in self.config.items():
            this_model_dir = ''.join((self.models_dirname, os.sep, ''.join(
                (COMMON_MODELS_PACKAGE_NAMEPART, self.lang)), os.sep,
                                      config_entry_name))
            if not os.path.isdir(this_model_dir):
                self.relevant_config_entry_names.append(config_entry_name)
                model_name = '_'.join((lang, config_entry['model']))
                self.load_model(model_name, config_entry_name,
                                config_entry['from_version'],
                                config_entry['to_version'])
                if 'vectors_model' in config_entry:
                    vectors_model_name = '_'.join(
                        (lang, config_entry['vectors_model']))
                    self.load_model(vectors_model_name,
                                    config_entry_name,
                                    config_entry['vectors_from_version'],
                                    config_entry['vectors_to_version'],
                                    is_vector_model=True)
            else:
                print('Skipping config entry', config_entry_name,
                      'as model exists')

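        # Validate the log directory and create it if necessary.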
        self.log_dir = log_dir
        if '..' in log_dir:
            print('.. not permitted in log_dir')
            sys.exit(1)
        if not os.path.isdir(self.log_dir):
            os.makedirs(self.log_dir)

        if not os.path.isdir(data_dir):
            print('Data directory', data_dir, 'not found.')
            sys.exit(1)
        self.data_dir = data_dir

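        # Recreate a clean temporary directory for the per-entry training logs.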
        temp_dir = os.sep.join((self.log_dir, 'temp'))
        if os.path.isdir(temp_dir):
            shutil.rmtree(temp_dir)
        time.sleep(1)
        os.mkdir(temp_dir)

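    # Loads a spaCy model by name (reusing a cached copy where possible) and
    # checks that its version lies within the range the config entry allows.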
    def load_model(self,
                   name,
                   config_entry_name,
                   from_version,
                   to_version,
                   *,
                   is_vector_model=False):
        if name not in self.nlp_dict:
            print('Loading model', name, '...')
            try:
                nlp = spacy.load(name)
            except OSError:
                if is_vector_model:
                    print('Config entry', config_entry_name,
                          'specifies a vectors model', name,
                          'that cannot be loaded.')
                else:
                    print('Config entry', config_entry_name,
                          'specifies a model', name, 'that cannot be loaded.')
                sys.exit(1)
        else:
            nlp = self.nlp_dict[name]
        if version.parse(nlp.meta['version']) < \
                version.parse(from_version) or \
                version.parse(nlp.meta['version']) > \
                version.parse(to_version):
            if is_vector_model:
                print('Config entry', config_entry_name,
                      'specifies a version range for vectors model', name,
                      'that does not include the loaded version.')
            else:
                print('Config entry', config_entry_name,
                      'specifies a version range for model', name,
                      'that does not include the loaded version.')
            sys.exit(1)
        self.nlp_dict[name] = nlp

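    # Creates the models directory as an installable package skeleton:
    # setup.cfg, pyproject.toml and an empty package __init__.py.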
    def set_up_models_dir(self):
        os.mkdir(self.models_dirname)
        package_dirname = ''.join((COMMON_MODELS_PACKAGE_NAMEPART, self.lang))
        os.mkdir(os.sep.join((self.models_dirname, package_dirname)))
        setup_cfg_filename = os.sep.join((self.models_dirname, 'setup.cfg'))
        with open(setup_cfg_filename, 'w') as setup_cfg_file:
            self.writeln(setup_cfg_file, '[metadata]')
            self.writeln(setup_cfg_file, 'name = ',
                         package_dirname.replace('_', '-'))
            self.writeln(setup_cfg_file, 'version = 1.0.0')
            self.writeln(setup_cfg_file)
            self.writeln(setup_cfg_file, '[options]')
            self.writeln(setup_cfg_file, 'packages = find:')
            self.writeln(setup_cfg_file, 'include_package_data = True')
            self.writeln(setup_cfg_file)
            self.writeln(setup_cfg_file, '[options.package_data]')
            self.writeln(setup_cfg_file, '* = *.bin, *.h5')
        pyproject_toml_filename = os.sep.join(
            (self.models_dirname, 'pyproject.toml'))
        with open(pyproject_toml_filename, 'w') as pyproject_toml_file:
            self.writeln(pyproject_toml_file, '[build-system]')
            self.writeln(pyproject_toml_file, 'requires = [')
            self.writeln(pyproject_toml_file, '  "setuptools",')
            self.writeln(pyproject_toml_file, '  "wheel",')
            self.writeln(pyproject_toml_file, ']')
            self.writeln(pyproject_toml_file,
                         'build-backend = "setuptools.build_meta"')
        init_py_filename = os.sep.join(
            (self.models_dirname, package_dirname, '__init__.py'))
        with open(init_py_filename, 'w') as init_py_file:
            self.writeln(init_py_file)

    @staticmethod
    def writeln(file, *args):
        file.write(''.join((''.join([str(arg) for arg in args]), '\n')))

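    # Logs one wrongly annotated anaphor: the sentence span involved, the
    # referring pronoun, the mention(s) marked as correct in the training data
    # and the chains the annotator actually produced.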
    def log_incorrect_annotation(self, temp_log_file, token,
                                 correct_referred_token,
                                 incorrect_referred_token):
        doc = token.doc
        self.writeln(temp_log_file, 'Incorrect annotation:')
        start_token_index = min(correct_referred_token.i,
                                incorrect_referred_token.i)
        sentence_start_index = doc._.coref_chains.temp_sent_starts[
            doc[start_token_index]._.coref_chains.temp_sent_index]
        if token._.coref_chains.temp_sent_index + 1 == len(
                doc._.coref_chains.temp_sent_starts):
            self.writeln(temp_log_file, 'Tokens from ', sentence_start_index,
                         ' to the end:')
            self.writeln(temp_log_file, doc[sentence_start_index:])
        else:
            sentence_end_index = doc._.coref_chains.temp_sent_starts[
                token._.coref_chains.temp_sent_index + 1]
            self.writeln(temp_log_file, 'Tokens ', sentence_start_index,
                         ' to ', sentence_end_index, ':')
            self.writeln(temp_log_file,
                         doc[sentence_start_index:sentence_end_index])
        self.writeln(temp_log_file, 'Referring pronoun: ', token, ' at index ',
                     token.i)
        for potential_referred in token._.coref_chains.temp_potential_referreds:
            if hasattr(potential_referred, 'true_in_training'):
                self.writeln(temp_log_file, 'Training referred mentions: ',
                             potential_referred.pretty_representation)
        self.writeln(
            temp_log_file, 'Annotated referred mentions: ',
            [chain.pretty_representation for chain in token._.coref_chains])
        self.writeln(temp_log_file)

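    # Builds and trains the Keras ensemble, logging the model summary and the
    # final binary accuracy of each sub-network.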
    def generate_keras_ensemble(self, model_generator, temp_log_file,
                                training_docs, tendencies_analyzer):
        keras_model = model_generator.generate_keras_model(
            training_docs, tendencies_analyzer, ENSEMBLE_SIZE)
        self.writeln(temp_log_file)
        self.writeln(temp_log_file, 'Generated Keras model:')
        keras_model.summary(
            print_fn=lambda line: self.writeln(temp_log_file, line))
        self.writeln(temp_log_file, 'Training model ...')
        keras_history = model_generator.train_keras_model(
            training_docs, tendencies_analyzer, keras_model)
        for index in range(ENSEMBLE_SIZE):
            keras_accuracy = keras_history.history['_'.join(
                ('output', str(index), 'binary_accuracy'))][-1]
            self.writeln(temp_log_file, 'Sub-network ', index,
                         ' within ensemble:')
            self.writeln(temp_log_file, 'Binary accuracy after training is ',
                         keras_accuracy)
        return keras_model

    def load_documents(self, nlp, rules_analyzer):
        docs = []
        for loader in self.loaders:
            docs.extend(loader.load(self.data_dir, nlp, rules_analyzer))
        return docs

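    # Trains the model for one config entry: first on roughly 80% of the
    # documents (by word count) to measure accuracy on the remainder, then on
    # all documents before the feature table and Keras ensemble are saved.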
    def train_model(self, config_entry_name, config_entry, temp_log_file):
        self.writeln(temp_log_file, 'Config entry name: ', config_entry_name)
        nlp_name = '_'.join((self.lang, config_entry['model']))
        nlp = self.nlp_dict[nlp_name]
        self.writeln(temp_log_file, 'Spacy model: ', nlp_name, ' version ',
                     nlp.meta['version'])
        if 'vectors_model' in config_entry:
            vectors_nlp_name = '_'.join(
                (self.lang, config_entry['vectors_model']))
            vectors_nlp = self.nlp_dict[vectors_nlp_name]
            self.writeln(temp_log_file, 'Spacy vectors model: ',
                         vectors_nlp_name, ' version ',
                         vectors_nlp.meta['version'])
        else:
            vectors_nlp = nlp
            self.writeln(temp_log_file,
                         'Main model is being used as vectors model')

        rules_analyzer = RulesAnalyzerFactory().get_rules_analyzer(nlp)
        docs = self.load_documents(nlp, rules_analyzer)
        # Separate into training and test for first run
        total_words = 0
        docs_to_total_words_position = []
        for doc in docs:
            docs_to_total_words_position.append(total_words)
            total_words += len(doc)
        split_index = bisect.bisect_right(docs_to_total_words_position,
                                          total_words * 0.8)
        training_docs = docs[:split_index]
        test_docs = docs[split_index:]
        self.writeln(temp_log_file, 'Total words: ', total_words)
        self.writeln(temp_log_file, 'Training docs: ', len(training_docs),
                     '; test docs: ', len(test_docs))
        model_generator = ModelGenerator(config_entry_name, nlp, vectors_nlp)
        feature_table = model_generator.generate_feature_table(training_docs)
        self.writeln(temp_log_file, 'Feature table: ', feature_table.__dict__)
        tendencies_analyzer = TendenciesAnalyzer(rules_analyzer, vectors_nlp,
                                                 feature_table)
        keras_ensemble = self.generate_keras_ensemble(model_generator,
                                                      temp_log_file,
                                                      training_docs,
                                                      tendencies_analyzer)
        annotator = Annotator(nlp, vectors_nlp, feature_table, keras_ensemble)
        self.writeln(temp_log_file)
        correct_counter = incorrect_counter = 0
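        # Annotate each held-out document and compare the predicted chains with
        # the mentions marked as correct in the training data.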
        for test_doc in test_docs:
            annotator.annotate(test_doc, used_in_training=True)
            self.writeln(temp_log_file, 'test_doc ', test_doc[:100], '... :')
            self.writeln(temp_log_file)
            self.writeln(temp_log_file, 'Coref chains:')
            self.writeln(temp_log_file)
            for chain in test_doc._.coref_chains:
                self.writeln(temp_log_file, chain.pretty_representation)
            self.writeln(temp_log_file)
            self.writeln(temp_log_file, 'Incorrect annotations:')
            self.writeln(temp_log_file)
            for token in test_doc:
                if hasattr(token._.coref_chains, 'temp_potential_referreds'):
                    for potential_referred in token._.coref_chains.temp_potential_referreds:
                        if hasattr(potential_referred, 'true_in_training'):
                            for chain in token._.coref_chains:
                                if potential_referred in chain:
                                    correct_counter += 1
                                else:
                                    incorrect_counter += 1
                                    self.log_incorrect_annotation(
                                        temp_log_file, token, token.doc[
                                            potential_referred.root_index],
                                        token.doc[
                                            chain.mentions[0].root_index])
        if len(test_docs) > 0:
            accuracy = round(
                100 * correct_counter / (correct_counter + incorrect_counter),
                2)
            self.writeln(temp_log_file)
            self.writeln(temp_log_file, 'Correct: ', correct_counter,
                         '; Incorrect: ', incorrect_counter, ' (', accuracy,
                         '%)')
            print('Accuracy: ', ''.join((str(accuracy), '%')))
        self.writeln(temp_log_file)
        self.writeln(temp_log_file, 'Retraining with all documents')
        self.writeln(temp_log_file)
        docs = self.load_documents(nlp, rules_analyzer)
        feature_table = model_generator.generate_feature_table(docs)
        self.writeln(temp_log_file, 'Feature table: ', feature_table.__dict__)
        tendencies_analyzer = TendenciesAnalyzer(rules_analyzer, vectors_nlp,
                                                 feature_table)
        keras_ensemble = self.generate_keras_ensemble(model_generator,
                                                      temp_log_file, docs,
                                                      tendencies_analyzer)
        this_model_dir = os.sep.join((self.models_dirname, ''.join(
            (COMMON_MODELS_PACKAGE_NAMEPART, self.lang)), config_entry_name))
        os.mkdir(this_model_dir)
        init_py_filename = os.sep.join((this_model_dir, '__init__.py'))
        with open(init_py_filename, 'w') as init_py_file:
            self.writeln(init_py_file)
        feature_table_filename = os.sep.join(
            (this_model_dir, FEATURE_TABLE_FILENAME))
        with open(feature_table_filename, 'wb') as feature_table_file:
            pickle.dump(feature_table, feature_table_file)
        keras_filename = ''.join(
            (this_model_dir, os.sep, KERAS_MODEL_FILENAME))
        keras_ensemble.save(keras_filename)

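    # Trains every outstanding config entry, archives the per-entry logs into a
    # timestamped zip file and zips up the models directory.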
    def train_models(self):
        for config_entry_name in self.relevant_config_entry_names:
            config_entry = self.config[config_entry_name]
            print('Processing', config_entry_name, '...')
            temp_log_filename = ''.join((self.log_dir, os.sep, 'temp', os.sep,
                                         config_entry_name, '.log'))
            with open(temp_log_filename, 'w',
                      encoding='utf-8') as temp_log_file:
                self.train_model(config_entry_name, config_entry,
                                 temp_log_file)
        timestamp = datetime.now().isoformat(timespec='microseconds')
        sanitized_timestamp = ''.join([ch for ch in timestamp if ch.isalnum()])
        zip_filename = ''.join((self.log_dir, os.sep, 'training_log_',
                                self.lang, '_', sanitized_timestamp))
        shutil.make_archive(zip_filename, 'zip',
                            os.sep.join((self.log_dir, 'temp')))
        temp_dir = os.sep.join((self.log_dir, 'temp'))
        if os.path.isdir(temp_dir):
            shutil.rmtree(temp_dir)
        zip_filename = ''.join((self.models_dirname, os.sep, '..', os.sep,
                                COMMON_MODELS_PACKAGE_NAMEPART, self.lang))
        # shutil.make_archive will append '.zip' to the base name it is given
        if os.path.isfile(''.join((zip_filename, '.zip'))):
            os.remove(''.join((zip_filename, '.zip')))
        shutil.make_archive(zip_filename, 'zip', self.models_dirname)
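
A usage illustration: the class above would typically be driven by a short script along the following lines. This sketch is hypothetical; the import path coreferee.training, the loader class name ExampleLoader and all directory paths are assumptions, not values taken from the example.

# Hypothetical driver script for TrainingManager (module path, loader name and
# directories are placeholders).
from coreferee.training import TrainingManager  # assumed module location

manager = TrainingManager(
    root_path='coreferee',            # package containing lang/<lang>/config.cfg
    lang='en',
    loader_classes='ExampleLoader',   # comma-separated class names from coreferee.training.loaders
    data_dir='/data/coref_corpora',   # directory the loaders read training documents from
    log_dir='/var/log/coreferee')     # training logs and temporary files go here
manager.train_models()                # trains every config entry that has no saved model yet
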
Example 2
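# Returns an Annotator for an already loaded spaCy pipeline: locates the config
# entry matching the pipeline's name and version, loads any required vectors
# model and the trained coreferee model package, and raises a descriptive error
# if anything is missing or has an unsupported version.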
def get_annotator(nlp: Language) -> Annotator:
     model_name = '_'.join((nlp.meta['lang'], nlp.meta['name']))
     relative_config_filename = os.sep.join(
         ('lang', nlp.meta['lang'], 'config.cfg'))
     if not pkg_resources.resource_exists(__name__,
                                          relative_config_filename):
         raise LanguageNotSupportedError(nlp.meta['lang'])
     absolute_config_filename = pkg_resources.resource_filename(
         __name__, relative_config_filename)
     config = Config().from_disk(absolute_config_filename)
     for config_entry_name, config_entry in config.items():
         if nlp.meta['name'] == config_entry['model'] and \
                 version.parse(nlp.meta['version']) >= \
                 version.parse(config_entry['from_version']) and \
                 version.parse(nlp.meta['version']) <= \
                 version.parse(config_entry['to_version']):
             if 'vectors_model' in config_entry:
                 try:
                     vectors_nlp = spacy.load('_'.join(
                         (nlp.meta['lang'], config_entry['vectors_model'])))
                 except OSError:
                     raise VectorsModelNotInstalledError(''.join((
                         'Model ', model_name,
                         ' is only supported in conjunction with model ',
                         nlp.meta['lang'], '_',
                         config_entry['vectors_model'],
                         " which must be loaded using 'python -m spacy download ",
                         nlp.meta['lang'], '_',
                         config_entry['vectors_model'], "'.")))
                 if version.parse(vectors_nlp.meta['version']) < \
                         version.parse(config_entry['vectors_from_version']) or \
                         version.parse(vectors_nlp.meta['version']) > \
                         version.parse(config_entry['vectors_to_version']):
                     raise VectorsModelHasWrongVersionError(''.join(
                         ('Model ', model_name,
                          ' is only supported in conjunction with model ',
                          nlp.meta['lang'], '_',
                          config_entry['vectors_model'],
                          ' between versions ',
                          config_entry['vectors_from_version'], ' and ',
                          config_entry['vectors_to_version'],
                          ' inclusive.')))
             else:
                 vectors_nlp = nlp
             model_package_name = ''.join(
                 (COMMON_MODELS_PACKAGE_NAMEPART, nlp.meta['lang'], '.',
                  config_entry_name))
             try:
                 importlib.import_module(model_package_name)
             except ModuleNotFoundError:
                 print(''.join((
                     "Model could not be loaded for config entry '",
                     config_entry_name, "' If models exist for language '",
                     nlp.meta['lang'],
                     "', load them with the command 'python -m coreferee install ",
                     nlp.meta['lang'], "'.")))
                 raise ModelNotSupportedError(''.join(
                     (nlp.meta['lang'], '_', nlp.meta['name'], ' version ',
                      nlp.meta['version'])))
             this_feature_table_filename = pkg_resources.resource_filename(
                 model_package_name, FEATURE_TABLE_FILENAME)
             with open(this_feature_table_filename,
                       "rb") as feature_table_file:
                 feature_table = pickle.load(feature_table_file)
             absolute_keras_model_filename = pkg_resources.resource_filename(
                 model_package_name, KERAS_MODEL_FILENAME)
             keras_ensemble = keras.models.load_model(
                 absolute_keras_model_filename)
             return Annotator(nlp, vectors_nlp, feature_table,
                              keras_ensemble)
     raise ModelNotSupportedError(''.join(
         (nlp.meta['lang'], '_', nlp.meta['name'], ' version ',
          nlp.meta['version'])))
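
For context, get_annotator appears to be the internal entry point through which a loaded spaCy pipeline obtains its coreferee Annotator. The sketch below shows typical end-to-end usage and assumes the published coreferee package, the en_core_web_lg spaCy model and a prior 'python -m coreferee install en'; none of these appear in the example itself.

# Hypothetical end-to-end usage; adding the component is what ultimately
# triggers a call such as get_annotator(nlp).
import coreferee  # registers the 'coreferee' pipeline component factory
import spacy

nlp = spacy.load('en_core_web_lg')
nlp.add_pipe('coreferee')
doc = nlp('Although he was very busy with his work, Peter had had enough of it.')
doc._.coref_chains.print()  # pretty-prints the resolved coreference chains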