Esempio n. 1
0
    def train(self,
              x_train,
              y_train,
              f_train=None,
              x_valid=None,
              y_valid=None,
              f_valid=None,
              callbacks=None):
        """
        Prepare the preprocessor and the model from the given data, then train the model.

        The preprocessor (and thus the model vocabulary) is built from the training and
        the validation data taken together. The resulting preprocessor and model are
        stored in self.p and self.model.

        Args:
            x_train: training inputs
            y_train: training labels
            f_train: optional training features
            x_valid: optional validation inputs
            y_valid: optional validation labels
            f_valid: optional validation features
            callbacks: optional callbacks handed over to the trainer
        """
        # TBD: if no validation set is given, segment the training set to get one when early_stop is True

        # all the training + validation data is put together to create the model vocabulary
        x_all = x_train if x_valid is None else np.concatenate((x_train, x_valid), axis=0)
        y_all = y_train if y_valid is None else np.concatenate((y_train, y_valid), axis=0)
        features_all = concatenate_or_none((f_train, f_valid), axis=0)

        config = self.model_config
        self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=config)

        # the vocabulary sizes are only known once the preprocessor has seen the data
        config.char_vocab_size = len(self.p.vocab_char)
        config.case_vocab_size = len(self.p.vocab_case)

        self.model = get_model(config, self.p, len(self.p.vocab_tag), load_pretrained_weights=True)
        print_parameters(config, self.training_config)
        self.model.print_summary()

        # uncomment to plot graph
        #plot_model(self.model,
        #    to_file='data/models/textClassification/'+self.model_config.model_name+'_'+self.model_config.architecture+'.png')

        trainer = Trainer(self.model,
                          self.models,
                          self.embeddings,
                          config,
                          self.training_config,
                          checkpoint_path=self.log_dir,
                          preprocessor=self.p,
                          transformer_preprocessor=self.model.transformer_preprocessor)
        trainer.train(x_train, y_train, x_valid, y_valid,
                      features_train=f_train,
                      features_valid=f_valid,
                      callbacks=callbacks)

        # the ELMo cache is cleaned once the training is over
        embeddings = self.embeddings
        if embeddings and embeddings.use_ELMo:
            embeddings.clean_ELMo_cache()
Esempio n. 2
0
    def train(self, x_train, y_train, vocab_init=None, callbacks=None):
        """
        Create the classifier model and train it on the given data.

        When training_config.early_stop is set, 10% of the training data is held out
        as validation set. Otherwise the model is trained on all the data, no validation
        generator is created and y_train is passed as validation labels to train_model.

        Args:
            x_train: training inputs
            y_train: training labels
            vocab_init: not used by this method
            callbacks: optional callbacks handed over to train_model
        """
        self.model = getModel(self.model_config, self.training_config)

        print_parameters(self.model_config, self.training_config)
        self.model.print_summary()

        # bert_data is switched on as soon as a transformer is configured
        bert_data = self.transformer_name is not None
        early_stop = self.training_config.early_stop

        # arguments shared by the training and the validation generators
        generator_args = dict(
            batch_size=self.training_config.batch_size,
            maxlen=self.model_config.maxlen,
            list_classes=self.model_config.list_classes,
            embeddings=self.embeddings,
            bert_data=bert_data,
            transformer_tokenizer=self.model.transformer_tokenizer)

        if early_stop:
            # create validation set
            train_x, val_x, train_y, val_y = train_test_split(x_train, y_train, test_size=0.1)
        else:
            train_x, train_y = x_train, y_train
            val_x, val_y = None, y_train

        training_generator = DataGenerator(train_x, train_y, shuffle=True, **generator_args)

        # the validation generator gets no labels, val_y is given separately to train_model
        validation_generator = None
        if early_stop:
            validation_generator = DataGenerator(val_x, None, shuffle=False, **generator_args)

        # uncomment to plot graph
        #plot_model(self.model,
        #    to_file='data/models/textClassification/'+self.model_config.model_name+'_'+self.model_config.architecture+'.png')
        self.model.train_model(self.model_config.list_classes,
                               self.training_config.batch_size,
                               self.training_config.max_epoch,
                               self.training_config.use_roc_auc,
                               self.training_config.class_weights,
                               training_generator,
                               validation_generator,
                               val_y,
                               patience=self.training_config.patience,
                               multiprocessing=self.training_config.multiprocessing,
                               callbacks=callbacks)
Esempio n. 3
0
    def load(self, dir_path='data/models/textClassification/'):
        """
        Load a saved classifier (configuration, embeddings if any, and weights).

        The model directory is dir_path joined with the current model name. For a
        single-fold model the weights are loaded into self.model. For an n-fold model,
        self.models is filled with one loaded model per fold when no transformer is
        used, or with a single model without loaded weights when a transformer is used.

        Args:
            dir_path: directory under which the model directory is looked up
        """
        model_path = os.path.join(dir_path, self.model_config.model_name)
        self.model_config = ModelConfig.load(os.path.join(model_path, self.config_file))

        if self.model_config.transformer_name is not None:
            self.transformer_name = self.model_config.transformer_name
            self.embeddings = None
        else:
            # load embeddings
            # Do not use cache in 'production' mode
            self.embeddings = Embeddings(self.model_config.embeddings_name,
                                         resource_registry=self.registry,
                                         use_cache=False)
            self.model_config.word_embedding_size = self.embeddings.embed_size

        def new_model():
            # model instance built from the saved configuration, weights are loaded separately
            return getModel(self.model_config,
                            self.training_config,
                            load_pretrained_weights=False,
                            local_path=model_path)

        self.model = new_model()
        print_parameters(self.model_config, self.training_config)
        self.model.print_summary()

        if self.model_config.fold_number == 1:
            weight_path = os.path.join(model_path, self.weight_file)
            print("load weights from", weight_path)
            self.model.load(weight_path)
            return

        self.models = []
        if self.model_config.transformer_name is not None:
            # only init first fold one, the other will be init at prediction time,
            # all weights will be loaded at prediction time
            self.models.append(new_model())
            return

        for fold_index in range(self.model_config.fold_number):
            fold_model = new_model()
            fold_model.load(os.path.join(model_path, "model{0}_weights.hdf5".format(fold_index)))
            self.models.append(fold_model)
Esempio n. 4
0
def train_folds(X,
                y,
                model_config,
                training_config,
                embeddings,
                callbacks=None):
    """
    Train one classifier per fold, each fold being used in turn as validation data.

    The data is cut into model_config.fold_number contiguous folds of
    len(X) // fold_number instances. The last fold additionally takes the instances
    left over by the integer division, so that every instance is held out once.

    Args:
        X: training inputs (sliceable, accepted by np.concatenate)
        y: labels aligned with X
        model_config: model configuration (fold_number, maxlen, list_classes,
            transformer_name and model_name are read here)
        training_config: training configuration (max_epoch, use_roc_auc, class_weights,
            batch_size, early_stop, multiprocessing and patience are read here)
        embeddings: embeddings handed over to the data generators
        callbacks: optional callbacks handed over to train_model

    Returns:
        The list of fold models kept in memory. Without transformer, this is every
        fold model. With a transformer, every fold model is saved under
        data/models/textClassification/<model_name>/ and only the first one is returned.
    """
    fold_count = model_config.fold_number
    max_epoch = training_config.max_epoch
    use_roc_auc = training_config.use_roc_auc
    class_weights = training_config.class_weights

    fold_size = len(X) // fold_count
    models = []

    # bert_data is switched on as soon as a transformer is configured
    bert_data = model_config.transformer_name is not None

    for fold_id in range(fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        # the last fold also takes the remainder of the integer division
        # (the comparison must be made with the number of folds, not with the fold size)
        if fold_id == fold_count - 1:
            fold_end = len(X)

        train_x = np.concatenate([X[:fold_start], X[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        val_x = X[fold_start:fold_end]
        val_y = y[fold_start:fold_end]

        foldModel = getModel(model_config, training_config)

        if fold_id == 0:
            print_parameters(model_config, training_config)
            foldModel.print_summary()

        print('\n------------------------ fold ' + str(fold_id) +
              '--------------------------------------')

        training_generator = DataGenerator(
            train_x,
            train_y,
            batch_size=training_config.batch_size,
            maxlen=model_config.maxlen,
            list_classes=model_config.list_classes,
            embeddings=embeddings,
            bert_data=bert_data,
            shuffle=True,
            transformer_tokenizer=foldModel.transformer_tokenizer)

        # a validation generator is only needed for early stopping
        validation_generator = None
        if training_config.early_stop:
            validation_generator = DataGenerator(
                val_x,
                val_y,
                batch_size=training_config.batch_size,
                maxlen=model_config.maxlen,
                list_classes=model_config.list_classes,
                embeddings=embeddings,
                bert_data=bert_data,
                shuffle=False,
                transformer_tokenizer=foldModel.transformer_tokenizer)

        foldModel.train_model(model_config.list_classes,
                              training_config.batch_size,
                              max_epoch,
                              use_roc_auc,
                              class_weights,
                              training_generator,
                              validation_generator,
                              val_y,
                              multiprocessing=training_config.multiprocessing,
                              patience=training_config.patience,
                              callbacks=callbacks)

        if model_config.transformer_name is None:
            models.append(foldModel)
        else:
            # if we are using a transformer layer in the architecture, we need to save the fold model on the disk
            directory = os.path.join("data/models/textClassification/",
                                     model_config.model_name)
            os.makedirs(directory, exist_ok=True)

            if fold_id == 0:
                models.append(foldModel)
                # save transformer config and tokenizer
                if foldModel.transformer_config is not None:
                    foldModel.transformer_config.to_json_file(
                        os.path.join(directory, TRANSFORMER_CONFIG_FILE_NAME))
                if foldModel.transformer_tokenizer is not None:
                    foldModel.transformer_tokenizer.save_pretrained(
                        os.path.join(directory,
                                     DEFAULT_TRANSFORMER_TOKENIZER_DIR))

            model_path = os.path.join(directory,
                                      "model{0}_weights.hdf5".format(fold_id))
            foldModel.save(model_path)
            # only the first fold model is kept in the returned list, drop the local
            # reference to the other ones once they are saved
            if fold_id != 0:
                del foldModel

    return models
Esempio n. 5
0
    def eval_nfold(self, x_test, y_test, features=None):
        """
        Evaluate every fold model on the test data and print the results.

        The report of the worst and of the best fold (by f1) are printed, followed by
        the scores averaged over the folds. The best fold then becomes self.model and
        self.model_config.fold_number is set to 1. Nothing is done when self.models
        is None.

        Args:
            x_test: test inputs
            y_test: expected test labels
            features: optional features handed over to the data generator
        """
        if self.models is None:
            return

        fold_count = self.model_config.fold_number

        def fold_model_dir():
            # directory from which the fold models with a transformer layer are loaded
            return os.path.join('data/models/sequenceLabelling/',
                                self.model_config.model_name)

        def fold_weight_path(fold_index):
            # default weight file name with the fold index inserted before the extension
            file_name = DEFAULT_WEIGHT_FILE_NAME.replace(".hdf5", str(fold_index) + ".hdf5")
            return os.path.join(fold_model_dir(), file_name)

        f1_total = 0
        precision_total = 0
        recall_total = 0
        best_f1, best_index = 0, 0
        worst_f1, worst_index = 1, 0
        reports = []
        reports_as_map = []

        for fold_index in range(fold_count):
            if self.model_config.transformer_name is not None:
                # the architecture model uses a transformer layer, it is large and needs to be loaded from disk
                self.model = get_model(self.model_config,
                                       self.p,
                                       ntags=len(self.p.vocab_tag),
                                       load_pretrained_weights=False,
                                       local_path=fold_model_dir())
                self.model.load(filepath=fold_weight_path(fold_index))
                the_model = self.model
                bert_preprocessor = self.model.transformer_preprocessor
            else:
                the_model = self.models[fold_index]
                bert_preprocessor = None

            if fold_index == 0:
                the_model.print_summary()
                print_parameters(self.model_config, self.training_config)

            print('\n------------------------ fold ' + str(fold_index) +
                  ' --------------------------------------')

            # we can use a data generator for evaluation
            # Prepare test data(steps, generator)
            generator_class = the_model.get_generator()
            test_generator = generator_class(
                x_test,
                y_test,
                batch_size=self.model_config.batch_size,
                preprocessor=self.p,
                bert_preprocessor=bert_preprocessor,
                char_embed_size=self.model_config.char_embedding_size,
                max_sequence_length=self.model_config.max_sequence_length,
                embeddings=self.embeddings,
                shuffle=False,
                features=features,
                output_input_offsets=True,
                use_chain_crf=self.model_config.use_chain_crf)

            # Build the evaluator and evaluate the model
            scorer = Scorer(test_generator,
                            self.p,
                            evaluation=True,
                            use_crf=self.model_config.use_crf,
                            use_chain_crf=self.model_config.use_chain_crf)
            scorer.model = the_model
            scorer.on_epoch_end(epoch=-1)

            fold_f1 = scorer.f1
            reports.append(scorer.report)
            reports_as_map.append(scorer.report_as_map)

            # keep track of the best and of the worst folds
            if best_f1 < fold_f1:
                best_f1, best_index = fold_f1, fold_index
            if worst_f1 > fold_f1:
                worst_f1, worst_index = fold_f1, fold_index

            f1_total += fold_f1
            precision_total += scorer.precision
            recall_total += scorer.recall

        # overall scores averaged over the folds
        fold_average_evaluation = {
            'labels': {},
            'micro': {
                'f1': f1_total / fold_count,
                'precision': precision_total / fold_count,
                'recall': recall_total / fold_count
            },
            'macro': {}
        }

        # field-level average over the n folds
        seen_labels = set()
        for tag in sorted(self.p.vocab_tag):
            if tag in ('O', '<PAD>'):
                continue
            # remove the position prefix of the tag to get the field label
            label = tag[2:] if tag.startswith(("B-", "S-", "I-", "E-")) else tag
            if label in seen_labels:
                continue
            seen_labels.add(label)

            sum_p = 0
            sum_r = 0
            sum_f1 = 0
            sum_support = 0
            for fold_report in reports_as_map:
                # a fold without this label contributes nothing to the sums
                if label not in fold_report['labels']:
                    continue
                label_scores = fold_report['labels'][label]
                sum_p += label_scores["precision"]
                sum_r += label_scores["recall"]
                sum_f1 += label_scores["f1"]
                sum_support += label_scores["support"]

            avg_support = sum_support / fold_count
            # the averaged support is rounded down unless the string form of its
            # fractional part, without its first character, is exactly '0'
            support_decimals = str(avg_support - int(avg_support))[1:]
            if support_decimals != '0':
                avg_support = math.floor(avg_support)

            fold_average_evaluation['labels'][label] = {
                'precision': sum_p / fold_count,
                'recall': sum_r / fold_count,
                'support': avg_support,
                'f1': sum_f1 / fold_count
            }

        print(
            "----------------------------------------------------------------------"
        )
        print("\n** Worst ** model scores - run", str(worst_index))
        print(reports[worst_index])

        print("\n** Best ** model scores - run", str(best_index))
        print(reports[best_index])

        # from now on the instance is a single-fold model using the best fold
        fold_nb = self.model_config.fold_number
        self.model_config.fold_number = 1
        if self.model_config.transformer_name is not None:
            # saved config file must be updated to single fold
            self.model.load(filepath=fold_weight_path(best_index))
        else:
            self.model = self.models[best_index]

        print(
            "----------------------------------------------------------------------"
        )
        print("\nAverage over", str(int(fold_nb)), "folds")
        print(
            get_report(fold_average_evaluation,
                       digits=4,
                       include_avgs=['micro']))
Esempio n. 6
0
    def eval_single(self, x_test, y_test, features=None):
        """
        Evaluate the single model of the instance on the test data and print the scores.

        Without transformer layer, the evaluation goes through a data generator and the
        Scorer. With a transformer layer, the test data is tagged with a Tagger, the
        predicted and expected sequences are padded with "O" where their lengths differ,
        and a classification report is printed.

        Args:
            x_test: test inputs
            y_test: expected test labels (padded in place in the transformer case when a
                predicted sequence is longer than the expected one)
            features: optional features handed over to the generator or to the tagger

        Raises:
            OSError: when no model is available
        """
        if self.model is None:
            raise OSError('Could not find a model.')
        print_parameters(self.model_config, self.training_config)
        self.model.print_summary()

        if self.model_config.transformer_name is None:
            # we can use a data generator for evaluation

            # Prepare test data(steps, generator)
            generator_class = self.model.get_generator()
            test_generator = generator_class(
                x_test,
                y_test,
                batch_size=self.model_config.batch_size,
                preprocessor=self.p,
                char_embed_size=self.model_config.char_embedding_size,
                max_sequence_length=self.model_config.max_sequence_length,
                embeddings=self.embeddings,
                shuffle=False,
                features=features,
                output_input_offsets=True,
                use_chain_crf=self.model_config.use_chain_crf)

            # Build the evaluator and evaluate the model
            scorer = Scorer(test_generator,
                            self.p,
                            evaluation=True,
                            use_crf=self.model_config.use_crf,
                            use_chain_crf=self.model_config.use_chain_crf)
            scorer.model = self.model
            scorer.on_epoch_end(epoch=-1)
            return

        # the architecture model uses a transformer layer
        # note that we could also use the above test_generator, but as an alternative here we check the
        # test/prediction alignment of tokens and the validity of the maximum sequence input length
        # wrt the length of the test sequences
        tagger = Tagger(self.model,
                        self.model_config,
                        self.embeddings,
                        preprocessor=self.p,
                        transformer_preprocessor=self.model.transformer_preprocessor)
        y_pred_pairs = tagger.tag(x_test, output_format=None, features=features)

        # keep only labels (second item of each predicted pair)
        y_pred = [[pair[1] for pair in result] for result in y_pred_pairs]

        nb_alignment_issues = 0
        for index in range(len(y_test)):
            extra_predictions = len(y_pred[index]) - len(y_test[index])
            if extra_predictions == 0:
                continue

            nb_alignment_issues += 1
            # BERT tokenizer appears to introduce some additional tokens without ## prefix,
            # but we normally handled that well when predicting.
            # To be very conservative, the following ensure the number of tokens always
            # match, but it should never be used in practice.
            if extra_predictions > 0:
                y_test[index] = y_test[index] + ["O"] * extra_predictions
            else:
                y_pred[index] = y_pred[index] + ["O"] * (-extra_predictions)

        if nb_alignment_issues > 0:
            print("number of alignment issues with test set:",
                  nb_alignment_issues)
            print(
                "to solve them consider increasing the maximum sequence input length of the model and retrain"
            )

        report, _ = classification_report(y_test, y_pred, digits=4)
        print(report)
Esempio n. 7
0
    def train_nfold(self,
                    x_train,
                    y_train,
                    x_valid=None,
                    y_valid=None,
                    f_train=None,
                    f_valid=None,
                    callbacks=None):
        """
        n-fold training for the instance model

        for RNN models:
        -> the n models are stored in self.models, and self.model left unset at this stage
        fold number is available with self.model_config.fold_number

        for models with transformer layer:
        -> fold models are saved on disk (because too large) and self.models is not used, we identify the usage
        of folds with self.model_config.fold_number

        When no validation set is given, the training data is cut into fold_number
        contiguous folds, each one being used in turn as validation set; the last fold
        also takes the instances left over by the integer division. When a validation
        set is given, every fold is trained with the given segmentation.

        Args:
            x_train: training inputs
            y_train: training labels
            x_valid: optional validation inputs
            y_valid: optional validation labels
            f_train: optional training features
            f_valid: optional validation features
            callbacks: optional callbacks handed over to train_model
        """

        fold_count = self.model_config.fold_number
        fold_size = len(x_train) // fold_count

        dir_path = 'data/models/sequenceLabelling/'
        output_directory = os.path.join(dir_path, self.model_config.model_name)
        print("Output directory:", output_directory)
        os.makedirs(output_directory, exist_ok=True)

        if self.model_config.transformer_name is not None:
            # save the config, preprocessor and transformer layer config on disk
            self.model_config.save(
                os.path.join(output_directory, CONFIG_FILE_NAME))
            self.preprocessor.save(
                os.path.join(output_directory, PROCESSOR_FILE_NAME))

        for fold_id in range(fold_count):
            if x_valid is None:
                # segment train and valid
                fold_start = fold_size * fold_id
                fold_end = fold_start + fold_size

                # the last fold also takes the remainder of the integer division
                # (the comparison must be made with the number of folds, not with the fold size)
                if fold_id == fold_count - 1:
                    fold_end = len(x_train)

                train_x = np.concatenate(
                    [x_train[:fold_start], x_train[fold_end:]])
                train_y = np.concatenate(
                    [y_train[:fold_start], y_train[fold_end:]])

                val_x = x_train[fold_start:fold_end]
                val_y = y_train[fold_start:fold_end]

                # features are optional, they can only be segmented when provided
                if f_train is not None:
                    train_f = np.concatenate(
                        [f_train[:fold_start], f_train[fold_end:]])
                    val_f = f_train[fold_start:fold_end]
                else:
                    train_f = None
                    val_f = None
            else:
                # reuse given segmentation
                train_x = x_train
                train_y = y_train
                train_f = f_train

                val_x = x_valid
                val_y = y_valid
                val_f = f_valid

            foldModel = get_model(self.model_config,
                                  self.preprocessor,
                                  ntags=len(self.preprocessor.vocab_tag),
                                  load_pretrained_weights=True)

            if fold_id == 0:
                print_parameters(self.model_config, self.training_config)
                foldModel.print_summary()

            print('\n------------------------ fold ' + str(fold_id) +
                  '--------------------------------------')
            self.transformer_preprocessor = foldModel.transformer_preprocessor
            foldModel = self.compile_model(foldModel, len(train_x))
            foldModel = self.train_model(
                foldModel,
                train_x,
                train_y,
                x_valid=val_x,
                y_valid=val_y,
                f_train=train_f,
                f_valid=val_f,
                max_epoch=self.training_config.max_epoch,
                callbacks=callbacks)

            if self.model_config.transformer_name is None:
                self.models.append(foldModel)
            else:
                # save the model with transformer layer on disk
                weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(
                    ".hdf5",
                    str(fold_id) + ".hdf5")
                foldModel.save(os.path.join(output_directory, weight_file))
                if fold_id == 0:
                    # the transformer config and tokenizer are saved once, with the first fold
                    foldModel.transformer_config.to_json_file(
                        os.path.join(output_directory,
                                     TRANSFORMER_CONFIG_FILE_NAME))
                    transformer_preprocessor = foldModel.transformer_preprocessor
                    transformer_preprocessor.tokenizer.save_pretrained(
                        os.path.join(output_directory,
                                     DEFAULT_TRANSFORMER_TOKENIZER_DIR))
Esempio n. 8
0
    def eval(self, x_test, y_test, use_main_thread_only=False):
        print_parameters(self.model_config, self.training_config)

        bert_data = False
        if self.transformer_name is not None:
            bert_data = True

        if self.model_config.fold_number == 1:
            if self.model != None:
                self.model.print_summary()
                test_generator = DataGenerator(x_test, None, batch_size=self.model_config.batch_size,
                        maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes, 
                        embeddings=self.embeddings, shuffle=False, bert_data=bert_data, transformer_tokenizer=self.model.transformer_tokenizer)

                result = self.model.predict(test_generator, use_main_thread_only=use_main_thread_only)
            else:
                raise (OSError('Could not find a model.'))
        else:
            if self.models is None:
                raise (OSError('Could not find nfolds models.'))

            self.models[0].print_summary()

            # just a warning: n classifiers using BERT layer for prediction might be heavy in term of model sizes
            test_generator = DataGenerator(x_test, None, batch_size=self.model_config.batch_size,
                maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes,
                embeddings=self.embeddings, shuffle=False, bert_data=bert_data, transformer_tokenizer=self.models[0].transformer_tokenizer)
            result = predict_folds(self.models, test_generator, self.model_config, self.training_config, use_main_thread_only=use_main_thread_only)

        print("-----------------------------------------------")
        print("\nEvaluation on", x_test.shape[0], "instances:")

        total_accuracy = 0.0
        total_f1 = 0.0
        total_loss = 0.0
        total_roc_auc = 0.0

        '''
        def normer(t):
            if t < 0.5: 
                return 0 
            else: 
                return 1
        vfunc = np.vectorize(normer)
        result_binary = vfunc(result)
        '''
        result_intermediate = np.asarray([np.argmax(line) for line in result])
        
        def vectorize(index, size):
            result = np.zeros(size)
            if index < size:
                result[index] = 1
            return result
        result_binary = np.array([vectorize(xi, len(self.model_config.list_classes)) for xi in result_intermediate])

        precision, recall, fscore, support = precision_recall_fscore_support(y_test, result_binary, average=None)
        print('{:>14}  {:>12}  {:>12}  {:>12}  {:>12}'.format(" ", "precision", "recall", "f-score", "support"))
        p = 0
        for the_class in self.model_config.list_classes:
            the_class = the_class[:14]
            print('{:>14}  {:>12}  {:>12}  {:>12}  {:>12}'.format(the_class, "{:10.4f}"
                .format(precision[p]), "{:10.4f}".format(recall[p]), "{:10.4f}".format(fscore[p]), support[p]))
            p += 1

        # macro-average (average of class scores)
        # we distinguish 1-class and multiclass problems 
        if len(self.model_config.list_classes) == 1:
            total_accuracy = accuracy_score(y_test, result_binary)
            total_f1 = f1_score(y_test, result_binary)

            # sklearn will complain if log(0)
            total_loss = log_loss(y_test, result, labels=[0,1])
            if len(np.unique(y_test)) == 1:
                # roc_auc_score sklearn implementation is not working in this case, it needs more balanced batches
                # a simple fix is to return the r2_score instead in this case (which is a regression score and not a loss)
                total_roc_auc = r2_score(y_test, result)
                if total_roc_auc < 0:
                    total_roc_auc = 0 
            else:
                total_roc_auc = roc_auc_score(y_test, result)
        else:
            for j in range(0, len(self.model_config.list_classes)):
                accuracy = accuracy_score(y_test[:, j], result_binary[:, j])
                total_accuracy += accuracy
                f1 = f1_score(y_test[:, j], result_binary[:, j], average='micro')
                total_f1 += f1
                loss = log_loss(y_test[:, j], result[:, j], labels=[0,1])
                total_loss += loss
                if len(np.unique(y_test[:, j])) == 1:
                    # roc_auc_score sklearn implementation is not working in this case, it needs more balanced batches
                    # a simple fix is to return the r2_score instead in this case (which is a regression score and not a loss)
                    roc_auc = r2_score(y_test[:, j], result[:, j])
                    if roc_auc < 0:
                        roc_auc = 0 
                else:
                    roc_auc = roc_auc_score(y_test[:, j], result[:, j], labels=[0,1])
                total_roc_auc += roc_auc
                '''
                print("\nClass:", self.model_config.list_classes[j])
                print("\taccuracy at 0.5 =", accuracy)
                print("\tf-1 at 0.5 =", f1)
                print("\tlog-loss =", loss)
                print("\troc auc =", roc_auc)
                '''

        total_accuracy /= len(self.model_config.list_classes)
        total_f1 /= len(self.model_config.list_classes)
        total_loss /= len(self.model_config.list_classes)
        total_roc_auc /= len(self.model_config.list_classes)

        '''
        if len(self.model_config.list_classes) != 1:
            print("\nMacro-average:")
        print("\taverage accuracy at 0.5 =", "{:10.4f}".format(total_accuracy))
        print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
        print("\taverage log-loss =","{:10.4f}".format( total_loss))
        print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))
        '''
        
        # micro-average (average of scores for each instance)
        # make sense only if we have more than 1 class, otherwise same as 
        # macro-avergae
        if len(self.model_config.list_classes) != 1:
            total_accuracy = 0.0
            total_f1 = 0.0
            total_loss = 0.0
            total_roc_auc = 0.0

            for i in range(0, result.shape[0]):
                accuracy = accuracy_score(y_test[i,:], result_binary[i,:])
                total_accuracy += accuracy
                f1 = f1_score(y_test[i,:], result_binary[i,:], average='micro')
                total_f1 += f1
                loss = log_loss(y_test[i,:], result[i,:], labels=[0.0, 1.0])
                total_loss += loss
                if len(np.unique(y_test[i,:])) == 1:
                    # roc_auc_score sklearn implementation is not working in this case, it needs more balanced batches
                    # a simple fix is to return the r2_score instead in this case (which is a regression score and not a loss)
                    roc_auc = r2_score(y_test[i,:], result[i,:])
                    if roc_auc < 0:
                        roc_auc = 0 
                else:
                    roc_auc = roc_auc_score(y_test[i,:], result[i,:], labels=[0.0, 1.0])
                total_roc_auc += roc_auc

            total_accuracy /= result.shape[0]
            total_f1 /= result.shape[0]
            total_loss /= result.shape[0]
            total_roc_auc /= result.shape[0]
            
            '''