class NeuroNER(object):
    """
    NeuroNER model.

    Args:
        parameters_filepath (type): description
        pretrained_model_folder (type): description
        dataset_text_folder (type): description
        character_embedding_dimension (type): description
        character_lstm_hidden_state_dimension (type): description
        check_for_digits_replaced_with_zeros (type): description
        check_for_lowercase (type): description
        debug (type): description
        dropout_rate (type): description
        experiment_name (type): description
        freeze_token_embeddings (type): description
        gradient_clipping_value (type): description
        learning_rate (type): description
        load_only_pretrained_token_embeddings (type): description
        load_all_pretrained_token_embeddings (type): description
        main_evaluation_mode (type): description
        maximum_number_of_epochs (type): description
        number_of_cpu_threads (type): description
        number_of_gpus (type): description
        optimizer (type): description
        output_folder (type): description
        output_scores (bool): description
        patience (type): description
        plot_format (type): description
        reload_character_embeddings (type): description
        reload_character_lstm (type): description
        reload_crf (type): description
        reload_feedforward (type): description
        reload_token_embeddings (type): description
        reload_token_lstm (type): description
        remap_unknown_tokens_to_unk (type): description
        spacylanguage (type): description
        tagging_format (type): description
        token_embedding_dimension (type): description
        token_lstm_hidden_state_dimension (type): description
        token_pretrained_embedding_filepath (type): description
        tokenizer (type): description
        train_model (type): description
        use_character_lstm (type): description
        use_crf (type): description
        use_pretrained_model (type): description
        verbose (type): description
    """

    prediction_count = 0

    def __init__(self, **kwargs):

        # Set parameters
        self.parameters, self.conf_parameters = load_parameters(**kwargs)

        self.dataset_filepaths, self.dataset_brat_folders = self._get_valid_dataset_filepaths(
            self.parameters)
        self._check_param_compatibility(self.parameters,
                                        self.dataset_filepaths)

        # Load dataset
        self.modeldata = dataset.Dataset(verbose=self.parameters['verbose'],
                                         debug=self.parameters['debug'])
        token_to_vector = self.modeldata.load_dataset(self.dataset_filepaths,
                                                      self.parameters)

        # Launch session. allow_soft_placement automatically chooses an
        # available device if the specified one doesn't exist.
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
            device_count={'CPU': 1,
                          'GPU': self.parameters['number_of_gpus']},
            allow_soft_placement=True,
            log_device_placement=False)

        self.sess = tf.Session(config=session_conf)
        with self.sess.as_default():

            # Initialize or load pretrained model
            self.model = EntityLSTM(self.modeldata, self.parameters)
            self.sess.run(tf.global_variables_initializer())

            if self.parameters['use_pretrained_model']:
                self.transition_params_trained = self.model.restore_from_pretrained_model(
                    self.parameters,
                    self.modeldata,
                    self.sess,
                    token_to_vector=token_to_vector)
            else:
                self.model.load_pretrained_token_embeddings(
                    self.sess, self.modeldata, self.parameters,
                    token_to_vector)
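                # Random initialization; the two extra rows/columns presumably
                # correspond to the CRF's implicit start and end states.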
                self.transition_params_trained = np.random.rand(
                    len(self.modeldata.unique_labels) + 2,
                    len(self.modeldata.unique_labels) + 2)

    def _create_stats_graph_folder(self, parameters):
        """
        Initialize stats_graph_folder.

        Args:
            parameters (type): description.
        """
        experiment_timestamp = utils.get_current_time_in_miliseconds()
        dataset_name = utils.get_basename_without_extension(
            parameters['dataset_text_folder'])
        model_name = '{0}_{1}'.format(dataset_name, experiment_timestamp)
        utils.create_folder_if_not_exists(parameters['output_folder'])

        # Folder where to save graphs
        stats_graph_folder = os.path.join(parameters['output_folder'],
                                          model_name)
        utils.create_folder_if_not_exists(stats_graph_folder)
        return stats_graph_folder, experiment_timestamp

    def _get_valid_dataset_filepaths(self,
                                     parameters,
                                     dataset_types=('train', 'valid', 'test',
                                                    'deploy')):
        """
        Get paths for the datasets.

        Args:
            parameters (type): description.
            dataset_types (type): description.
        """
        dataset_filepaths = {}
        dataset_brat_folders = {}

        for dataset_type in dataset_types:
            dataset_filepaths[dataset_type] = os.path.join(
                parameters['dataset_text_folder'],
                '{0}.txt'.format(dataset_type))

            dataset_brat_folders[dataset_type] = os.path.join(
                parameters['dataset_text_folder'], dataset_type)

            dataset_compatible_with_brat_filepath = os.path.join(
                parameters['dataset_text_folder'],
                '{0}_compatible_with_brat.txt'.format(dataset_type))

            # CoNLL file exists and is non-empty
            if os.path.isfile(dataset_filepaths[dataset_type]) \
            and os.path.getsize(dataset_filepaths[dataset_type]) > 0:
                # Brat text files exist
                if os.path.exists(dataset_brat_folders[dataset_type]) and \
                len(glob.glob(os.path.join(dataset_brat_folders[dataset_type], '*.txt'))) > 0:

                    # Check compatibility between the CoNLL and brat files
                    brat_to_conll.check_brat_annotation_and_text_compatibility(
                        dataset_brat_folders[dataset_type])
                    if os.path.exists(dataset_compatible_with_brat_filepath):
                        dataset_filepaths[
                            dataset_type] = dataset_compatible_with_brat_filepath
                    conll_to_brat.check_compatibility_between_conll_and_brat_text(
                        dataset_filepaths[dataset_type],
                        dataset_brat_folders[dataset_type])

                # Brat text files do not exist
                else:
                    # Populate brat text and annotation files based on the CoNLL file
                    conll_to_brat.conll_to_brat(
                        dataset_filepaths[dataset_type],
                        dataset_compatible_with_brat_filepath,
                        dataset_brat_folders[dataset_type],
                        dataset_brat_folders[dataset_type])
                    dataset_filepaths[
                        dataset_type] = dataset_compatible_with_brat_filepath

            # CoNLL file does not exist or is empty
            else:
                # Brat text files exist

                if os.path.exists(dataset_brat_folders[dataset_type]) \
                and len(glob.glob(os.path.join(dataset_brat_folders[dataset_type], '*.txt'))) > 0:

                    dataset_filepath_for_tokenizer = os.path.join(
                        parameters['dataset_text_folder'],
                        '{0}_{1}.txt'.format(dataset_type,
                                             parameters['tokenizer']))
                    if os.path.exists(dataset_filepath_for_tokenizer):
                        conll_to_brat.check_compatibility_between_conll_and_brat_text(
                            dataset_filepath_for_tokenizer,
                            dataset_brat_folders[dataset_type])
                    else:
                        # Populate the CoNLL file based on the brat files
                        brat_to_conll.brat_to_conll(
                            dataset_brat_folders[dataset_type],
                            dataset_filepath_for_tokenizer,
                            parameters['tokenizer'],
                            parameters['spacylanguage'])
                    dataset_filepaths[
                        dataset_type] = dataset_filepath_for_tokenizer

                # Brat text files do not exist
                else:
                    del dataset_filepaths[dataset_type]
                    del dataset_brat_folders[dataset_type]
                    continue

            if parameters['tagging_format'] == 'bioes':
                # Generate a CoNLL file in BIOES format
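                # (BIOES extends BIO with End and Single tags: e.g. the BIO
                # sequence "B-PER I-PER" becomes "B-PER E-PER" in BIOES.)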
                bioes_filepath = os.path.join(
                    parameters['dataset_text_folder'], '{0}_bioes.txt'.format(
                        utils.get_basename_without_extension(
                            dataset_filepaths[dataset_type])))
                utils_nlp.convert_conll_from_bio_to_bioes(
                    dataset_filepaths[dataset_type], bioes_filepath)
                dataset_filepaths[dataset_type] = bioes_filepath

        return dataset_filepaths, dataset_brat_folders

    def _check_param_compatibility(self, parameters, dataset_filepaths):
        """
        Check parameters are compatible.

        Args:
            parameters (type): description.
            dataset_filepaths (type): description.
        """
        check_param_compatibility(parameters, dataset_filepaths)

    def fit(self):
        """
        Fit the model.
        """
        parameters = self.parameters
        conf_parameters = self.conf_parameters
        dataset_filepaths = self.dataset_filepaths
        modeldata = self.modeldata
        dataset_brat_folders = self.dataset_brat_folders
        sess = self.sess
        model = self.model
        transition_params_trained = self.transition_params_trained
        stats_graph_folder, experiment_timestamp = self._create_stats_graph_folder(
            parameters)

        # Initialize and save execution details
        start_time = time.time()
        results = {}
        results['epoch'] = {}
        results['execution_details'] = {}
        results['execution_details']['train_start'] = start_time
        results['execution_details']['time_stamp'] = experiment_timestamp
        results['execution_details']['early_stop'] = False
        results['execution_details']['keyboard_interrupt'] = False
        results['execution_details']['num_epochs'] = 0
        results['model_options'] = copy.copy(parameters)

        model_folder = os.path.join(stats_graph_folder, 'model')
        utils.create_folder_if_not_exists(model_folder)
        with open(os.path.join(model_folder, 'parameters.ini'),
                  'w') as parameters_file:
            conf_parameters.write(parameters_file)
        with open(os.path.join(model_folder, 'dataset.pickle'), 'wb') as f:
            pickle.dump(modeldata, f)

        tensorboard_log_folder = os.path.join(stats_graph_folder,
                                              'tensorboard_logs')
        utils.create_folder_if_not_exists(tensorboard_log_folder)
        tensorboard_log_folders = {}
        for dataset_type in dataset_filepaths.keys():
            tensorboard_log_folders[dataset_type] = os.path.join(
                stats_graph_folder, 'tensorboard_logs', dataset_type)
            utils.create_folder_if_not_exists(
                tensorboard_log_folders[dataset_type])

        # Instantiate the writers for TensorBoard
        writers = {}
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type] = tf.summary.FileWriter(
                tensorboard_log_folders[dataset_type], graph=sess.graph)

        # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
        embedding_writer = tf.summary.FileWriter(model_folder)

        embeddings_projector_config = projector.ProjectorConfig()
        tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
        )
        tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
        token_list_file_path = os.path.join(model_folder,
                                            'tensorboard_metadata_tokens.tsv')
        tensorboard_token_embeddings.metadata_path = os.path.relpath(
            token_list_file_path, '.')

        tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
        )
        tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
        character_list_file_path = os.path.join(
            model_folder, 'tensorboard_metadata_characters.tsv')
        tensorboard_character_embeddings.metadata_path = os.path.relpath(
            character_list_file_path, '.')

        projector.visualize_embeddings(embedding_writer,
                                       embeddings_projector_config)

        # Write metadata for TensorBoard embeddings
        with codecs.open(token_list_file_path, 'w', 'UTF-8') as token_list_file:
            for token_index in range(modeldata.vocabulary_size):
                token_list_file.write('{0}\n'.format(
                    modeldata.index_to_token[token_index]))

        with codecs.open(character_list_file_path, 'w',
                         'UTF-8') as character_list_file:
            for character_index in range(modeldata.alphabet_size):
                if character_index == modeldata.PADDING_CHARACTER_INDEX:
                    character_list_file.write('PADDING\n')
                else:
                    character_list_file.write('{0}\n'.format(
                        modeldata.index_to_character[character_index]))

        # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
        # bad_counter: number of epochs with no F1-score improvement on the validation set
        bad_counter = 0
        previous_best_valid_f1_score = 0
        epoch_number = -1
        try:
            while True:
                step = 0
                epoch_number += 1

                epoch_start_time = time.time()

                if epoch_number != 0:
                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(
                        range(len(modeldata.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, modeldata, sequence_number, model,
                            parameters)
                        step += 1
                        if step % 10 == 0:
                            print('Training {0:.2f}% done'.format(
                                step / len(sequence_numbers) * 100),
                                  end='\r',
                                  flush=True)

                epoch_elapsed_training_time = time.time() - epoch_start_time

                y_pred, y_true, output_filepaths = train.predict_labels(
                    sess, model, transition_params_trained, parameters,
                    modeldata, epoch_number, stats_graph_folder,
                    dataset_filepaths)

                # Evaluate model: save and plot results
                evaluate.evaluate_model(results, modeldata, y_pred, y_true,
                                        stats_graph_folder, epoch_number,
                                        epoch_start_time, output_filepaths,
                                        parameters)

                if parameters['use_pretrained_model'] and not parameters[
                        'train_model']:
                    conll_to_brat.output_brat(output_filepaths,
                                              dataset_brat_folders,
                                              stats_graph_folder)
                    break

                # Save model
                model.saver.save(
                    sess,
                    os.path.join(model_folder,
                                 'model_{0:05d}.ckpt'.format(epoch_number)))

                # Save TensorBoard logs
                summary = sess.run(model.summary_op, feed_dict=None)
                writers['train'].add_summary(summary, epoch_number)
                writers['train'].flush()
                utils.copytree(writers['train'].get_logdir(), model_folder)

                # Early stop
                valid_f1_score = results['epoch'][epoch_number][0]['valid'][
                    'f1_score']['micro']
                if valid_f1_score > previous_best_valid_f1_score:
                    bad_counter = 0
                    previous_best_valid_f1_score = valid_f1_score
                    conll_to_brat.output_brat(output_filepaths,
                                              dataset_brat_folders,
                                              stats_graph_folder,
                                              overwrite=True)
                    self.transition_params_trained = transition_params_trained
                else:
                    bad_counter += 1
                # print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))

                if bad_counter >= parameters['patience']:
                    results['execution_details']['early_stop'] = True
                    break

                if epoch_number >= parameters['maximum_number_of_epochs']:
                    break

        except KeyboardInterrupt:
            results['execution_details']['keyboard_interrupt'] = True

        print('Finishing the experiment')
        end_time = time.time()
        results['execution_details']['train_duration'] = end_time - start_time
        results['execution_details']['train_end'] = end_time
        evaluate.save_results(results, stats_graph_folder)
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type].close()

    def predict(self, text):
        """
        Predict the entities in the given text.

        Args:
            text (str): The text to annotate.

        Returns:
            The list of predicted entities.
        """
        self.prediction_count += 1

        if self.prediction_count == 1:
            self.parameters['dataset_text_folder'] = os.path.join(
                '.', 'data', 'temp')
            self.stats_graph_folder, _ = self._create_stats_graph_folder(
                self.parameters)

        # Update the deploy folder, file, and modeldata
        dataset_type = 'deploy'

        # Delete all deployment data
        for filepath in glob.glob(
                os.path.join(self.parameters['dataset_text_folder'],
                             '{0}*'.format(dataset_type))):
            if os.path.isdir(filepath):
                shutil.rmtree(filepath)
            else:
                os.remove(filepath)

        # Create brat folder and file
        dataset_brat_deploy_folder = os.path.join(
            self.parameters['dataset_text_folder'], dataset_type)
        utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
        dataset_brat_deploy_filepath = os.path.join(
            dataset_brat_deploy_folder,
            'temp_{0}.txt'.format(str(self.prediction_count).zfill(5)))
        with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
            f.write(text)

        # Update deploy filepaths
        dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(
            self.parameters, dataset_types=[dataset_type])
        self.dataset_filepaths.update(dataset_filepaths)
        self.dataset_brat_folders.update(dataset_brat_folders)

        # Update the dataset for the new deploy set
        self.modeldata.update_dataset(self.dataset_filepaths, [dataset_type])

        # Predict labels and output brat
        output_filepaths = {}
        prediction_output = train.prediction_step(
            self.sess, self.modeldata, dataset_type, self.model,
            self.transition_params_trained, self.stats_graph_folder,
            self.prediction_count, self.parameters, self.dataset_filepaths)

        _, _, output_filepaths[dataset_type] = prediction_output
        conll_to_brat.output_brat(output_filepaths,
                                  self.dataset_brat_folders,
                                  self.stats_graph_folder,
                                  overwrite=True)

        # Print and output result
        text_filepath = os.path.join(
            self.stats_graph_folder, 'brat', 'deploy',
            os.path.basename(dataset_brat_deploy_filepath))
        annotation_filepath = os.path.join(
            self.stats_graph_folder, 'brat', 'deploy', '{0}.ann'.format(
                utils.get_basename_without_extension(
                    dataset_brat_deploy_filepath)))
        text2, entities = brat_to_conll.get_entities_from_brat(
            text_filepath, annotation_filepath, verbose=True)
        assert text == text2
        return entities

    def get_params(self):
        return self.parameters

    def close(self):
        self.__del__()

    def __del__(self):
        self.sess.close()
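
A minimal usage sketch for the class above (the parameter names and values are illustrative assumptions based on the Args list, not verified defaults):

    neuroner = NeuroNER(parameters_filepath='./parameters.ini',
                        train_model=True,
                        output_folder='./output')
    neuroner.fit()            # train and evaluate, one epoch per iteration
    entities = neuroner.predict('John Smith works at Acme Corp.')
    neuroner.close()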
Example #2
    def __init__(self,
                 parameters_filepath=argument_default_value, 
                 pretrained_model_folder=argument_default_value,
                 dataset_text_folder=argument_default_value, 
                 character_embedding_dimension=argument_default_value,
                 character_lstm_hidden_state_dimension=argument_default_value,
                 check_for_digits_replaced_with_zeros=argument_default_value,
                 check_for_lowercase=argument_default_value,
                 debug=argument_default_value,
                 dropout_rate=argument_default_value,
                 experiment_name=argument_default_value,
                 freeze_token_embeddings=argument_default_value,
                 gradient_clipping_value=argument_default_value,
                 learning_rate=argument_default_value,
                 load_only_pretrained_token_embeddings=argument_default_value,
                 load_all_pretrained_token_embeddings=argument_default_value,
                 main_evaluation_mode=argument_default_value,
                 maximum_number_of_epochs=argument_default_value,
                 number_of_cpu_threads=argument_default_value,
                 number_of_gpus=argument_default_value,
                 optimizer=argument_default_value,
                 output_folder=argument_default_value,
                 patience=argument_default_value,
                 plot_format=argument_default_value,
                 reload_character_embeddings=argument_default_value,
                 reload_character_lstm=argument_default_value,
                 reload_crf=argument_default_value,
                 reload_feedforward=argument_default_value,
                 reload_token_embeddings=argument_default_value,
                 reload_token_lstm=argument_default_value,
                 remap_unknown_tokens_to_unk=argument_default_value,
                 spacylanguage=argument_default_value,
                 tagging_format=argument_default_value,
                 token_embedding_dimension=argument_default_value,
                 token_lstm_hidden_state_dimension=argument_default_value,
                 token_pretrained_embedding_filepath=argument_default_value,
                 tokenizer=argument_default_value,
                 train_model=argument_default_value,
                 use_character_lstm=argument_default_value,
                 use_crf=argument_default_value,
                 use_pretrained_model=argument_default_value,
                 verbose=argument_default_value,
                 argument_default_value=argument_default_value,
                 # new arguments
                 num_layers=argument_default_value,
                 use_deep_lstm=argument_default_value):
        
        # Collect all constructor arguments, as strings (excluding self)
        arguments = {k: str(v) for k, v in locals().items() if k != 'self'}
        
        # Initialize parameters
        parameters, conf_parameters = self._load_parameters(arguments['parameters_filepath'], arguments=arguments)
        dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(parameters)
        self._check_parameter_compatiblity(parameters, dataset_filepaths)

        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
        token_to_vector = dataset.load_dataset(dataset_filepaths, parameters)
        
        # Launch session
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
            # automatically choose an existing and supported device to run
            # the operations in case the specified one doesn't exist
            allow_soft_placement=True,
            log_device_placement=False)
        sess = tf.Session(config=session_conf)
        
        with sess.as_default():
            # Create model and initialize or load pretrained model
            ### Instantiate the model
            model = EntityLSTM(dataset, parameters)
            ### Initialize the model and restore from pretrained model if needed
            sess.run(tf.global_variables_initializer())
            if not parameters['use_pretrained_model']:
                model.load_pretrained_token_embeddings(sess, dataset, parameters, token_to_vector)
                self.transition_params_trained = np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
            else:
                self.transition_params_trained = model.restore_from_pretrained_model(parameters, dataset, sess, token_to_vector=token_to_vector)
            del token_to_vector

        self.dataset = dataset
        self.dataset_brat_folders = dataset_brat_folders
        self.dataset_filepaths = dataset_filepaths
        self.model = model
        self.parameters = parameters
        self.conf_parameters = conf_parameters
        self.sess = sess
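
The arguments = {...} line above works because locals(), called at the very top of __init__ before any other local variable is created, contains exactly the constructor arguments. A standalone sketch of the idiom (the function and its parameters are hypothetical):

    def configure(learning_rate=0.005, patience=10, verbose=False):
        # At this point locals() holds only the function's arguments.
        return {k: str(v) for k, v in locals().items()}

    print(configure(patience=5))
    # {'learning_rate': '0.005', 'patience': '5', 'verbose': 'False'}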
Example #3
    def __init__(self,
                 parameters_filepath=argument_default_value,
                 pretrained_model_folder=argument_default_value,
                 dataset_text_folder=argument_default_value,
                 character_embedding_dimension=argument_default_value,
                 character_lstm_hidden_state_dimension=argument_default_value,
                 check_for_digits_replaced_with_zeros=argument_default_value,
                 check_for_lowercase=argument_default_value,
                 debug=argument_default_value,
                 dropout_rate=argument_default_value,
                 experiment_name=argument_default_value,
                 freeze_token_embeddings=argument_default_value,
                 gradient_clipping_value=argument_default_value,
                 learning_rate=argument_default_value,
                 load_only_pretrained_token_embeddings=argument_default_value,
                 load_all_pretrained_token_embeddings=argument_default_value,
                 main_evaluation_mode=argument_default_value,
                 maximum_number_of_epochs=argument_default_value,
                 number_of_cpu_threads=argument_default_value,
                 number_of_gpus=argument_default_value,
                 optimizer=argument_default_value,
                 output_folder=argument_default_value,
                 patience=argument_default_value,
                 plot_format=argument_default_value,
                 reload_character_embeddings=argument_default_value,
                 reload_character_lstm=argument_default_value,
                 reload_crf=argument_default_value,
                 reload_feedforward=argument_default_value,
                 reload_token_embeddings=argument_default_value,
                 reload_token_lstm=argument_default_value,
                 remap_unknown_tokens_to_unk=argument_default_value,
                 spacylanguage=argument_default_value,
                 tagging_format=argument_default_value,
                 token_embedding_dimension=argument_default_value,
                 token_lstm_hidden_state_dimension=argument_default_value,
                 token_pretrained_embedding_filepath=argument_default_value,
                 tokenizer=argument_default_value,
                 train_model=argument_default_value,
                 use_character_lstm=argument_default_value,
                 use_crf=argument_default_value,
                 use_pretrained_model=argument_default_value,
                 verbose=argument_default_value,
                 argument_default_value=argument_default_value):

        # Collect all constructor arguments, as strings (excluding self)
        arguments = {k: str(v) for k, v in locals().items() if k != 'self'}

        # Initialize parameters from the given filepath and the arguments passed in (see parameters.ini)
        parameters, conf_parameters = self._load_parameters(arguments['parameters_filepath'], arguments=arguments)
        dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(parameters)

        # Check that the parameters are compatible
        self._check_parameter_compatiblity(parameters, dataset_filepaths)

        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
        token_to_vector = dataset.load_dataset(dataset_filepaths, parameters)
        '''
        token_to_vector = {
            "token": <ndarray> - the word vector from the glove.6B.100d.txt file
        }
        '''
        # Launch session

        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
            # automatically choose an existing and supported device to run
            # the operations in case the specified one doesn't exist
            allow_soft_placement=True,
            log_device_placement=False)
        sess = tf.Session(config=session_conf)

        with sess.as_default():
            # Create model and initialize or load pretrained model
            ### Instantiate the model
            model = EntityLSTM(dataset, parameters)
            ### Initialize the model and restore from pretrained model if needed
            sess.run(tf.global_variables_initializer())
            if not parameters['use_pretrained_model']:
                '''
                Initialize token_embedding_weights with token_to_vector
                '''
                model.load_pretrained_token_embeddings(sess, dataset, parameters, token_to_vector)
                '''
                unique_labels: the distinct entity labels present in the current (CoNLL) dataset
                '''
                self.transition_params_trained = np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
            else:
                self.transition_params_trained = model.restore_from_pretrained_model(parameters, dataset, sess, token_to_vector=token_to_vector)
            del token_to_vector
        # TODO: understand the meaning of transition_params_trained (see the sketch after this example)
        self.dataset = dataset                              # dataset manager <Dataset>
        self.dataset_brat_folders = dataset_brat_folders    # brat-format data folders <dict>
        self.dataset_filepaths = dataset_filepaths          # paths to the CoNLL data files <dict>
        self.model = model                                  # the model being trained <EntityLSTM>
        self.parameters = parameters                        # the parameters
        self.conf_parameters = conf_parameters              # the parsed configuration (parameters.ini)
        self.sess = sess                                    # the running TensorFlow session
        self.sess = sess                                    # session chạy của tensorflow
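
On the TODO above: transition_params_trained holds the CRF transition scores (one score per ordered pair of labels, with two extra rows and columns that presumably correspond to the implicit start and end states). A minimal sketch of how such a matrix is used at decoding time, assuming the TensorFlow 1.x contrib CRF that this code targets:

    import numpy as np
    import tensorflow as tf

    num_labels = 3                                 # e.g. O, B-PER, I-PER
    unary_scores = np.random.rand(7, num_labels)   # per-token label scores for a 7-token sentence
    transition_params = np.random.rand(num_labels, num_labels)

    # Viterbi decoding: pick the label sequence that maximizes the sum of
    # unary scores and label-to-label transition scores.
    viterbi_sequence, viterbi_score = tf.contrib.crf.viterbi_decode(
        unary_scores, transition_params)
    print(viterbi_sequence)   # e.g. [0, 2, 1, 1, 0, 2, 1]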