def _train_validate_RF_Classifier(self, trees, max_depth):
        '''Trains a Random Forest classifier with a given number of trees and maximum
        depth. The training samples are used to create/fit the classifier and the
        validation samples are used to compute the metrics.

        Args:
            trees (int)     : number of trees used to build the classifier.
            max_depth (int) : maximum allowed depth.

        Returns:
            metrics (float) : accuracy obtained by the classifier on the validation set.
        '''
        trees = int(trees)
        max_depth = int(max_depth)
        # Create the classifier
        rf_clf = RandomForestClassifier(n_estimators=trees,
                                        max_depth=max_depth,
                                        random_state=self.random_seed)
        # Fit the classifier
        rf_clf.fit(self.x_train, self.y_train)
        # Mean accuracy on the training set
        accuracy_train = rf_clf.score(self.x_train, self.y_train)
        # Metrics on the validation set
        y_pred = rf_clf.predict(self.x_val)
        metrics_val = metrics.calculate_metrics(y_pred, self.y_val)
        # Store in the results list
        self.results_train_val.append(
            (accuracy_train, metrics_val['accuracy'], trees, max_depth,
             metrics_val['TP'], metrics_val['FP'], metrics_val['FN'],
             metrics_val['TN'], metrics_val['precision'],
             metrics_val['recall'], metrics_val['f1']))
        # Optimize on validation-set accuracy
        return metrics_val['accuracy']
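The snippet above assumes a metrics.calculate_metrics(y_pred, y_true) helper that
returns a dict of binary confusion counts and derived scores. Its implementation is
not shown here; a minimal sketch, assuming binary 0/1 labels and exactly the key
names used above, might look like this:

# Hypothetical sketch of the dict-returning helper assumed above (binary labels).
def calculate_metrics(y_pred, y_true):
    tp = sum(1 for p, t in zip(y_pred, y_true) if p == 1 and t == 1)
    fp = sum(1 for p, t in zip(y_pred, y_true) if p == 1 and t == 0)
    fn = sum(1 for p, t in zip(y_pred, y_true) if p == 0 and t == 1)
    tn = sum(1 for p, t in zip(y_pred, y_true) if p == 0 and t == 0)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    accuracy = (tp + tn) / len(y_true) if len(y_true) else 0.0
    return {'accuracy': accuracy, 'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn,
            'precision': precision, 'recall': recall, 'f1': f1}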
def one_feature_pipeline_cross_val(train: Data, test: Data,
                                   params: SingleBaseParams) -> None:
    f = get_f(params.get_feature_name())
    X_train, y_train = train.get_x_y(f)
    X_test, y_test = test.get_x_y(f)

    pipeline = get_feature_pipeline(params)

    from utils.metrics import calculate_metrics
    from sklearn.model_selection import cross_val_score

    results = cross_val_score(pipeline, X_train, y_train, cv=10)
    print(params.method)
    print("Cross validated accuracy score: %.2f%% (%.2f%%)" %
          (results.mean() * 100, results.std() * 100))
    pipeline.fit(X_train, y_train)
    test_score = pipeline.score(X_test, y_test)
    preds = pipeline.predict(X_test)

    m = calculate_metrics(y_test, preds)
    print(m.to_string())
    print("test score = %0.4f" % test_score)
    print(params.get_feature_name())

    out_path = "base_features_hyperparams/" + params.get_feature_name() + '.txt'
    # use a distinct name so the feature extractor `f` isn't shadowed
    with open(out_path, 'a') as out_file:
        out_file.write("-------\n")
        out_file.write("score: %.2f%% (%.2f%%)\n" %
                       (results.mean() * 100, results.std() * 100))
        out_file.write("test score = %0.4f\n" % test_score)
        out_file.write(m.to_string() + '\n')
        out_file.write("method %s\n" % params.method)
        out_file.write("pca " + str(params.pca) + '\n')
        out_file.write("-------\n")
 def get_state_dict(self) -> Dict:
     loss = self.loss / self.batch_processed if self.batch_processed > 0 else 0.0
     state_dict = {'loss': loss}
     if self.lr != 0:
         state_dict['learning_rate'] = self.lr
     state_dict.update(calculate_metrics(self.statistics))
     return state_dict
 def test_calculating_zero_metrics(self):
     statistics = {
         'true_positive': 0,
         'false_positive': 0,
         'false_negative': 0
     }
     metrics = calculate_metrics(statistics)
     for metric, value in metrics.items():
         self.assertEqual(0.0, value)
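get_state_dict and the test above use a different calculate_metrics variant that
takes a dict of confusion counts. A minimal sketch that would satisfy
test_calculating_zero_metrics, assuming exactly the key names shown, is:

# Hypothetical sketch consistent with the zero-metrics test above: every
# ratio is guarded so all-zero statistics yield 0.0 instead of dividing by zero.
def calculate_metrics(statistics):
    tp = statistics['true_positive']
    fp = statistics['false_positive']
    fn = statistics['false_negative']
    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall > 0 else 0.0)
    return {'precision': precision, 'recall': recall, 'f1': f1}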
 def __init__(self, x: List[List[float]], y: List[str], feature: BaseParams,
              verbose, with_scaling):
     self.x = x
     from utils.mapping import map_y_array_to_int
     from utils.metrics import calculate_metrics
     self.y = map_y_array_to_int(y)
     self.feature = feature
     self.feature_name = feature.get_features_names_str()
     self.updated_ranking = dict()
     self.updated_ranking_changed: int = 0
     self.verbose = verbose
     self.accuracy = calculate_metrics(self.y, x).accuracy
     self.with_scaling = with_scaling
    def __init__(self, x: List[List[float]], y: List[str], feature: BaseParams,
                 verbose, with_scaling):
        self.x = x
        self._i = 0

        from utils.mapping import map_y_array_to_int  # local imports, as in the variant above
        from utils.metrics import calculate_metrics
        self.y = map_y_array_to_int(y)
        self.feature = feature
        self.feature_name = feature.get_features_names_str()

        self.verbose = verbose
        self.accuracy = calculate_metrics(self.y, x).accuracy
        print(self.feature_name, self.accuracy)

        self.with_scaling = with_scaling
    def train_one_batch(self, model, vocab, src, trg, src_percentages,
                        src_lengths, trg_lengths, smoothing, loss_type):
        pred, gold, hyp = model(src, src_lengths, trg, verbose=False)
        strs_golds, strs_hyps = [], []

        for j in range(len(gold)):
            ut_gold = gold[j]
            strs_golds.append("".join(
                [vocab.id2label[int(x)] for x in ut_gold]))

        for j in range(len(hyp)):
            ut_hyp = hyp[j]
            strs_hyps.append("".join([vocab.id2label[int(x)] for x in ut_hyp]))

        # handling the last batch
        seq_length = pred.size(1)
        sizes = src_percentages.mul_(int(seq_length)).int()

        loss, num_correct = calculate_metrics(pred,
                                              gold,
                                              vocab.PAD_ID,
                                              input_lengths=sizes,
                                              target_lengths=trg_lengths,
                                              smoothing=smoothing,
                                              loss_type=loss_type)

        if loss is None:
            print("loss is None")

        if loss.item() == float('Inf'):
            logging.info("Found infinity loss, masking")
            print("Found infinity loss, masking")
            loss = torch.where(loss != loss, torch.zeros_like(loss),
                               loss)  # NaN masking

        total_cer, total_wer, total_char, total_word = 0, 0, 0, 0
        for j in range(len(strs_hyps)):
            strs_hyps[j] = post_process(strs_hyps[j], vocab.special_token_list)
            strs_golds[j] = post_process(strs_golds[j],
                                         vocab.special_token_list)
            cer = calculate_cer(strs_hyps[j].replace(' ', ''),
                                strs_golds[j].replace(' ', ''))
            wer = calculate_wer(strs_hyps[j], strs_golds[j])
            total_cer += cer
            total_wer += wer
            total_char += len(strs_golds[j].replace(' ', ''))
            total_word += len(strs_golds[j].split(" "))

        return loss, total_cer, total_char
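train_one_batch relies on calculate_cer and calculate_wer, which the totals above
treat as raw edit-distance counts over characters and words (later normalized by
total_char and total_word). A minimal sketch under that assumption:

# Hypothetical sketch: plain Levenshtein distance backing both CER and WER.
def _levenshtein(ref, hyp):
    # prev[j] holds the edit distance between the processed prefix of ref
    # and the first j symbols of hyp
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, start=1):
        curr = [i]
        for j, h in enumerate(hyp, start=1):
            curr.append(min(prev[j] + 1,              # deletion
                            curr[j - 1] + 1,          # insertion
                            prev[j - 1] + (r != h)))  # substitution
        prev = curr
    return prev[-1]

def calculate_cer(hyp, gold):
    return _levenshtein(list(gold), list(hyp))

def calculate_wer(hyp, gold):
    return _levenshtein(gold.split(), hyp.split())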
 def play(self):
     predictions = []
     for i in range(len(self.agents[0].x)):
         if self.verbose:
             print("\n----------------------")
             print("round:", i)
             print("----------------------\n")
         winner = self.small_round()
         if self.verbose:
             print(winner, "TRUE:", self.agents[0].y[i])
         predictions.append(winner)
         self.upp_i()
     y = self.agents[0].y
     from utils.metrics import calculate_metrics
     return calculate_metrics(y, predictions)
Example #9
def cross_validate(data, n, metric, **kwargs):
    kf = KFold(n_splits=n, shuffle=True)
    train_metric = []
    test_metric = []

    for train_index, test_index in kf.split(data):
        train, test = data.iloc[train_index, :], data.iloc[test_index, :]
        write_to_file(train, "fasttext_data/captions/training_data.txt")
        write_to_file(test, "fasttext_data/captions/testing_data.txt")
        model = fasttext.train_supervised(
            input="fasttext_data/captions/training_data.txt",
            verbose=0,
            **kwargs)
        train_metric_result = calculate_metrics(
            "fasttext_data/captions/training_data.txt",
            model)['weighted avg'][metric]
        test_metric_result = calculate_metrics(
            "fasttext_data/captions/testing_data.txt",
            model)['weighted avg'][metric]
        train_metric.append(train_metric_result)
        test_metric.append(test_metric_result)
    print(test_metric)
    print("mean %s: %f" % (metric, np.mean(test_metric)))
    return (np.mean(train_metric), np.mean(test_metric))
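A hypothetical call, assuming data is a pandas DataFrame whose rows are already in
fastText __label__ format and that 'f1-score' is a key of the weighted-avg report
returned by calculate_metrics (the hyperparameter values are illustrative only):

train_f1, test_f1 = cross_validate(data,
                                   n=5,
                                   metric='f1-score',
                                   epoch=25,
                                   lr=0.5,
                                   wordNgrams=2)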
Example #10
def optimize_hyperparameters(data):
    training, validation = train_test_split(data, test_size=0.2)
    write_to_file(training,
                  "fasttext_data/captions/optimized_training_data.txt")
    write_to_file(validation,
                  "fasttext_data/captions/optimized_validation_data.txt")
    print("starting automatic hyperparameter optimization")
    model = fasttext.train_supervised(
        input='fasttext_data/captions/optimized_training_data.txt',
        autotuneValidationFile=
        'fasttext_data/captions/optimized_validation_data.txt',
        autotuneDuration=600,
        verbose=3)
    print("finished optimization, saving model")
    model.save_model("models/captions/optimized_model.bin")
    return calculate_metrics(
        "fasttext_data/captions/optimized_validation_data.txt", model)
    def read(feature: BaseParams) -> Tuple[Optional[BaseParams], float]:
        try:
            _, x_train, y_train = read_one_feature(feature,
                                                   train_vs_test="train",
                                                   only_ab=False,
                                                   to_binary=False)
            _, x_test, y_test = read_one_feature(feature,
                                                 train_vs_test="test",
                                                 only_ab=False,
                                                 to_binary=False)
            x = list(x_train) + list(x_test)
            y = list(y_train) + list(y_test)

            metrics = calculate_metrics(y, proba_to_letters(x))
            print(feature.get_features_names_str(), metrics.accuracy)
            return (feature, metrics.accuracy)
        except Exception as ignored:
            return (None, 0.)
 def play(self):
     predictions = []
     for i in range(len(self.agents[0].x)):
         if self.verbose:
             print("\n----------------------")
             print("round:", i)
             print("----------------------\n")
         winner: Optional[int] = None
         while len(self.agents) > len(self.removed_from_round) + 1:
             round_winner = self.small_round()  # renamed: don't shadow builtin round()
             if round_winner is not None:
                 winner = round_winner
                 break
         if winner is None:  # explicit None test: an agent index of 0 is a valid winner
             winner = self.get_winner()
         if self.verbose:
             print(winner, "TRUE:", self.agents[0].y[i])
         predictions.append(winner)
         self.upp_i()
     y = self.agents[0].y
     from utils.metrics import calculate_metrics
     return calculate_metrics(y, predictions)
Example #13
def run_grid_search(train: Data, test: Data, params: BaseParams,
                    parameters: Dict[str, Any]) -> None:
    X_train, y_train = get_Xy(train, params)
    X_test, y_test = get_Xy(test, params)

    clf = params.classifier

    pipeline = get_pipeline(params)
    grid = GridSearchCV(pipeline, param_grid=parameters, cv=10)
    grid.fit(X_train, y_train)
    y_predicted = grid.predict(X_test)

    print("best_score", grid.best_score_)

    index = grid.best_index_
    print(grid.cv_results_['mean_test_score'][index],
          grid.cv_results_['std_test_score'][index])

    from utils.metrics import calculate_metrics
    m = calculate_metrics(y_test, y_predicted)
    print(m.to_string())
    # print(feature_name)

    print("best_params", grid.best_params_)

    from utils.file_management import save_data_with_ultimate_dir_creation
    lines = ("clf: %s\n"
             "mean_test_score = %3.4f\n"
             "std_test_score %3.4f\n"
             "metrics:\n%s\n"
             "best_params: %s\n"
             "------------------\n" %
             (clf, grid.cv_results_['mean_test_score'][index] * 100,
              grid.cv_results_['std_test_score'][index] * 100, m.to_string(),
              str(grid.best_params_)))
    path = "out/" + params.dirname + "_hyperparams/" + params.get_features_names_str(
    ) + '.txt'
    save_data_with_ultimate_dir_creation(path, [lines])
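A hypothetical invocation, assuming the pipeline returned by get_pipeline exposes
its classifier under the step name 'clf' (GridSearchCV parameter keys must match
the pipeline's real step names, so both the step name and the values below are
illustrative):

parameters = {
    'clf__n_estimators': [100, 300],
    'clf__max_depth': [None, 10, 30],
}
run_grid_search(train, test, params, parameters)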
Example #14
    def play(self):
        predictions = []
        for i in range(len(self.agents[0].x)):
            if self.verbose:
                print("\n----------------------")
                print("round:", i, "SOR: ", self.agents[0].y[i])
                print("----------------------\n")

            while len(self.agents) > len(self.removed_from_round) + 1:
                self.small_round(i)

            winner = self.get_winner(i)
            if self.verbose:
                print(winner, "TRUE:", self.agents[0].y[i])

            predictions.append(winner)
            self.reset()
        y = self.agents[0].y
        from utils.metrics import calculate_metrics
        m = calculate_metrics(y, predictions)
        if self.verbose:
            print(m.accuracy, m.aed_score, self.C)

        return m
def generateIntSequence(valString):
    val = [float(x) for x in valString.split("_")]
    return val

df = pandas.read_csv('senia.csv').values
# Python 3: build lists explicitly (the original Python 2 code relied on
# map() returning lists, which no longer holds)
x = [[generateIntSequence(v) for v in row] for row in df[:, 1:-2]]
# print(x[0], x[-1])
y = numpy.array([int(v) for v in df[:, -1]])
# print(y[0], y[-1])
x_metrics = []
for i, x_i in enumerate(x):
    timestamp = int(df[i, -2])
    data = [timestamp] + x_i
    # if i % 1000 == 0:
    #     print(data)
    x_metrics.append(calculate_metrics(data)[:-1])

# print(x_metrics[0])
x_metrics = numpy.array(x_metrics)
# x_metrics = normalize(x_metrics)
# x_metrics = scale(x_metrics)


def f6(x):
    if x == 7:
        return 2
    elif x > 7:
        return x - 1
    else:
        return x
    def forward_one_batch(self,
                          model,
                          vocab,
                          src,
                          trg,
                          src_percentages,
                          src_lengths,
                          trg_lengths,
                          smoothing,
                          loss_type,
                          verbose=False,
                          discriminator=None,
                          accent_id=None,
                          multi_task=False):
        if discriminator is None:
            pred, gold, hyp = model(src, src_lengths, trg, verbose=False)
        else:
            enc_output = model.encode(src, src_lengths)
            accent_pred = discriminator(torch.sum(enc_output, dim=1))
            pred, gold, hyp = model.decode(enc_output, src_lengths, trg)
            if multi_task:
                # calculate multi
                disc_loss = calculate_multi_task(accent_pred, accent_id)
            else:
                # calculate discriminator loss and encoder loss
                disc_loss, enc_loss = calculate_adversarial(
                    accent_pred, accent_id)

        strs_golds, strs_hyps = [], []

        for j in range(len(gold)):
            ut_gold = gold[j]
            strs_golds.append("".join(
                [vocab.id2label[int(x)] for x in ut_gold]))

        for j in range(len(hyp)):
            ut_hyp = hyp[j]
            strs_hyps.append("".join([vocab.id2label[int(x)] for x in ut_hyp]))

        # handling the last batch
        seq_length = pred.size(1)
        sizes = src_percentages.mul_(int(seq_length)).int()

        loss, _ = calculate_metrics(pred,
                                    gold,
                                    vocab.PAD_ID,
                                    input_lengths=sizes,
                                    target_lengths=trg_lengths,
                                    smoothing=smoothing,
                                    loss_type=loss_type)

        if loss is None:
            print("loss is None")

        if loss.item() == float('Inf'):
            logging.info("Found infinity loss, masking")
            print("Found infinity loss, masking")
            loss = torch.where(loss != loss, torch.zeros_like(loss),
                               loss)  # NaN masking

        # if verbose:
        #     print(">PRED:", strs_hyps)
        #     print(">GOLD:", strs_golds)

        total_cer, total_wer, total_char, total_word = 0, 0, 0, 0
        for j in range(len(strs_hyps)):
            strs_hyps[j] = post_process(strs_hyps[j], vocab.special_token_list)
            strs_golds[j] = post_process(strs_golds[j],
                                         vocab.special_token_list)
            cer = calculate_cer(strs_hyps[j].replace(' ', ''),
                                strs_golds[j].replace(' ', ''))
            wer = calculate_wer(strs_hyps[j], strs_golds[j])
            total_cer += cer
            total_wer += wer
            total_char += len(strs_golds[j].replace(' ', ''))
            total_word += len(strs_golds[j].split(" "))

        if verbose:
            print('Total CER', total_cer)
            print('Total char', total_char)

            print("PRED:", strs_hyps)
            print("GOLD:", strs_golds, flush=True)

        if discriminator is None:
            return loss, total_cer, total_char
        else:
            if multi_task:
                return loss, total_cer, total_char, disc_loss
            else:
                return loss, total_cer, total_char, disc_loss, enc_loss
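The speech snippets above assume yet another calculate_metrics variant that returns
(loss, num_correct) over padded token predictions. A minimal sketch, assuming pred
holds (batch, seq, vocab) logits and gold holds (batch, seq) token ids, and leaving
out the CTC branch selected by loss_type:

import torch
import torch.nn.functional as F

# Hypothetical sketch of the (loss, num_correct) variant; only the
# cross-entropy path is shown, and extra kwargs are accepted and ignored.
def calculate_metrics(pred, gold, pad_id, smoothing=0.0, **unused):
    logits = pred.reshape(-1, pred.size(-1))   # (batch * seq, vocab)
    targets = gold.reshape(-1)                 # (batch * seq,)
    loss = F.cross_entropy(logits, targets,
                           ignore_index=pad_id,
                           label_smoothing=smoothing)  # torch >= 1.10
    non_pad = targets.ne(pad_id)
    num_correct = (logits.argmax(dim=-1).eq(targets)
                   .masked_select(non_pad).sum().item())
    return loss, num_correct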
    def run(self):
        ############################################################################################
        # Fetching train, validation, and test samples                                             #
        ############################################################################################
        # Training dataset
        train_data_file_path = os.path.join(dir_data, 'train_dataset.pickle')
        train_ds = PixForceDataset(train_data_file_path,
                                   transformations=self.transformations_train)
        # Training samples
        samples_train = train_ds[0:len(train_ds)]
        self.x_train = samples_train['images']
        self.y_train = samples_train['labels']

        # Validation dataset
        val_data_file_path = os.path.join(dir_data,
                                          'validation_dataset.pickle')
        val_ds = PixForceDataset(val_data_file_path,
                                 transformations=[Flatten()])
        # Validation samples (sliced by its own length, not the training set's)
        samples_val = val_ds[0:len(val_ds)]
        self.x_val = samples_val['images']
        self.y_val = samples_val['labels']

        # Test dataset
        test_data_file_path = os.path.join(dir_data, 'test_dataset.pickle')
        test_ds = PixForceDataset(test_data_file_path,
                                  transformations=[Flatten()])
        # Test samples
        samples_test = test_ds[0:len(test_ds)]
        self.x_test = samples_test['images']
        self.y_test = samples_test['labels']

        ############################################################################################
        # Setting up the Bayesian optimizer                                                        #
        ############################################################################################
        # Bounds of the RF parameters searched by the Bayesian optimizer
        pbounds = {'trees': (5, 200), 'max_depth': (5, 200)}
        # Bayesian optimizer
        optimizer = BayesianOptimization(
            f=self._train_validate_RF_Classifier,
            pbounds=pbounds,
            random_state=self.random_seed,
        )
        self.results_train_val = []
        # Run the Bayesian optimizer
        optimizer.maximize(init_points=15, n_iter=self.iterations_bo)

        ############################################################################################
        # Checking accuracy on the test set                                                        #
        ############################################################################################
        # Evaluate the test samples on the classifier with the best validation accuracy
        self.results_train_val = np.array(self.results_train_val)
        best_cls = self.results_train_val[np.argmax(self.results_train_val[:, 1])]
        trees = int(best_cls[2])
        max_depth = int(best_cls[3])
        # Build a classifier with the best configuration found on validation
        rf_final = RandomForestClassifier(n_estimators=trees,
                                          max_depth=max_depth,
                                          random_state=self.random_seed)
        rf_final.fit(self.x_train + self.x_val, self.y_train + self.y_val)
        y_pred_test = rf_final.predict(self.x_test)
        self.results_test = metrics.calculate_metrics(y_pred_test, self.y_test)
        # Show the results
        print(f'Test set accuracy: {self.results_test["accuracy"]}\n')

        return {
            'final_classifier': rf_final,
            'results_train_val_set': self.results_train_val,
            'metrics_test_set': self.results_test
        }
 def get_state_dict(self) -> Dict:
     loss = self.loss / self.batch_processed if self.batch_processed > 0 else 0.0
     state_dict = {'loss': loss}
     state_dict.update(calculate_metrics(self.statistics))
     return state_dict
Example #19
def train(
        model: GraphConvolutionalNetwork,
        train_data: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]],
        validation_data: Optional[List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]] = None,
        criterion: Callable = nn.CrossEntropyLoss(),
        num_epochs: int = 10,
        learning_rate: float = 1e-3,
        metrics_to_log: Optional[List[str]] = None,
        model_path: Optional[str] = None
) -> None:

    # Send model to device and initialize optimizer
    model = model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    logger.info("training model...")
    for i in range(num_epochs):

        count = 0
        total_loss = 0
        for input, adjacency, target in train_data:

            # Send data to device
            input = input.to(DEVICE)
            adjacency = adjacency.to(DEVICE)
            target = target.to(DEVICE)

            # Compute prediction and loss
            predicted = model(input=input, adjacency=adjacency).to(DEVICE)
            loss = criterion(predicted.unsqueeze(0), target.unsqueeze(0)).to(DEVICE)

            # Perform gradient step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Track progress
            count += 1
            total_loss += loss.cpu().item()

        # Log progress
        logger.info(f"epochs completed: \t {i + 1}/{num_epochs}")
        logger.info(f"mean loss: \t {'{0:.3f}'.format(total_loss / count)}")
        if metrics_to_log:
            logger.info("calculating training metrics...")
            log_metrics(
                metrics=calculate_metrics(
                    model=model,
                    data=train_data
                ),
                metrics_to_log=metrics_to_log
            )
            if validation_data:
                logger.info("calculating validation metrics...")
                log_metrics(
                    metrics=calculate_metrics(
                        model=model,
                        data=validation_data
                    ),
                    metrics_to_log=metrics_to_log
                )
        logger.info("-" * 50)

    if model_path:
        logger.info("saving model...")
        model.save(path=model_path)
Example #20
    def train(self,
              model,
              train_loader,
              train_sampler,
              valid_loader_list,
              opt,
              loss_type,
              start_epoch,
              num_epochs,
              label2id,
              id2label,
              last_metrics=None):
        """
        Training
        args:
            model: Model object
            train_loader: DataLoader object of the training set
            valid_loader_list: a list of Validation DataLoader objects
            opt: Optimizer object
            start_epoch: start epoch (> 0 if you resume the process)
            num_epochs: last epoch
            last_metrics: (if resume)
        """
        history = []
        start_time = time.time()
        best_valid_loss = 1000000000 if last_metrics is None else last_metrics[
            'valid_loss']
        smoothing = constant.args.label_smoothing

        logging.info("name " + constant.args.name)

        for epoch in range(start_epoch, num_epochs):
            sys.stdout.flush()
            total_loss, total_cer, total_wer, total_char, total_word = 0, 0, 0, 0, 0

            start_iter = 0

            logging.info("TRAIN")
            model.train()
            pbar = tqdm(iter(train_loader),
                        leave=True,
                        total=len(train_loader))
            for i, (data) in enumerate(pbar, start=start_iter):
                src, tgt, src_percentages, src_lengths, tgt_lengths = data

                if constant.USE_CUDA:
                    src = src.cuda()
                    tgt = tgt.cuda()

                opt.zero_grad()

                pred, gold, hyp_seq, gold_seq = model(src,
                                                      src_lengths,
                                                      tgt,
                                                      verbose=False)

                try:  # handle case for CTC
                    strs_gold, strs_hyps = [], []
                    for ut_gold in gold_seq:
                        str_gold = ""
                        for x in ut_gold:
                            if int(x) == constant.PAD_TOKEN:
                                break
                            str_gold = str_gold + id2label[int(x)]
                        strs_gold.append(str_gold)
                    for ut_hyp in hyp_seq:
                        str_hyp = ""
                        for x in ut_hyp:
                            if int(x) == constant.PAD_TOKEN:
                                break
                            str_hyp = str_hyp + id2label[int(x)]
                        strs_hyps.append(str_hyp)
                except Exception as e:
                    print(e)
                    logging.info("NaN predictions")
                    continue

                seq_length = pred.size(1)
                sizes = Variable(src_percentages.mul_(int(seq_length)).int(),
                                 requires_grad=False)

                loss, num_correct = calculate_metrics(
                    pred,
                    gold,
                    input_lengths=sizes,
                    target_lengths=tgt_lengths,
                    smoothing=smoothing,
                    loss_type=loss_type)

                if loss.item() == float('Inf'):
                    logging.info("Found infinity loss, masking")
                    loss = torch.where(loss != loss, torch.zeros_like(loss),
                                       loss)  # NaN masking
                    continue

                # if constant.args.verbose:
                #     logging.info("GOLD", strs_gold)
                #     logging.info("HYP", strs_hyps)

                for j in range(len(strs_hyps)):
                    strs_hyps[j] = strs_hyps[j].replace(
                        constant.SOS_CHAR, '').replace(constant.EOS_CHAR, '')
                    strs_gold[j] = strs_gold[j].replace(
                        constant.SOS_CHAR, '').replace(constant.EOS_CHAR, '')
                    cer = calculate_cer(strs_hyps[j].replace(' ', ''),
                                        strs_gold[j].replace(' ', ''))
                    wer = calculate_wer(strs_hyps[j], strs_gold[j])
                    total_cer += cer
                    total_wer += wer
                    total_char += len(strs_gold[j].replace(' ', ''))
                    total_word += len(strs_gold[j].split(" "))

                loss.backward()

                if constant.args.clip:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   constant.args.max_norm)

                opt.step()

                total_loss += loss.item()
                non_pad_mask = gold.ne(constant.PAD_TOKEN)
                num_word = non_pad_mask.sum().item()

                pbar.set_description(
                    "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% LR:{:.7f}".
                    format((epoch + 1), total_loss / (i + 1),
                           total_cer * 100 / total_char, opt._rate))
            logging.info(
                "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% LR:{:.7f}".format(
                    (epoch + 1), total_loss / (len(train_loader)),
                    total_cer * 100 / total_char, opt._rate))

            # evaluate
            print("")
            logging.info("VALID")
            model.eval()

            for ind in range(len(valid_loader_list)):
                valid_loader = valid_loader_list[ind]

                total_valid_loss, total_valid_cer, total_valid_wer, total_valid_char, total_valid_word = 0, 0, 0, 0, 0
                valid_pbar = tqdm(iter(valid_loader),
                                  leave=True,
                                  total=len(valid_loader))
                for i, (data) in enumerate(valid_pbar):
                    src, tgt, src_percentages, src_lengths, tgt_lengths = data

                    if constant.USE_CUDA:
                        src = src.cuda()
                        tgt = tgt.cuda()

                    pred, gold, hyp_seq, gold_seq = model(src,
                                                          src_lengths,
                                                          tgt,
                                                          verbose=False)

                    seq_length = pred.size(1)
                    sizes = Variable(src_percentages.mul_(
                        int(seq_length)).int(),
                                     requires_grad=False)

                    loss, num_correct = calculate_metrics(
                        pred,
                        gold,
                        input_lengths=sizes,
                        target_lengths=tgt_lengths,
                        smoothing=smoothing,
                        loss_type=loss_type)

                    if loss.item() == float('Inf'):
                        logging.info("Found infinity loss, masking")
                        loss = torch.where(loss != loss,
                                           torch.zeros_like(loss),
                                           loss)  # NaN masking
                        continue

                    try:  # handle case for CTC
                        strs_gold, strs_hyps = [], []
                        for ut_gold in gold_seq:
                            str_gold = ""
                            for x in ut_gold:
                                if int(x) == constant.PAD_TOKEN:
                                    break
                                str_gold = str_gold + id2label[int(x)]
                            strs_gold.append(str_gold)
                        for ut_hyp in hyp_seq:
                            str_hyp = ""
                            for x in ut_hyp:
                                if int(x) == constant.PAD_TOKEN:
                                    break
                                str_hyp = str_hyp + id2label[int(x)]
                            strs_hyps.append(str_hyp)
                    except Exception as e:
                        print(e)
                        logging.info("NaN predictions")
                        continue

                    for j in range(len(strs_hyps)):
                        strs_hyps[j] = strs_hyps[j].replace(
                            constant.SOS_CHAR,
                            '').replace(constant.EOS_CHAR, '')
                        strs_gold[j] = strs_gold[j].replace(
                            constant.SOS_CHAR,
                            '').replace(constant.EOS_CHAR, '')
                        cer = calculate_cer(strs_hyps[j].replace(' ', ''),
                                            strs_gold[j].replace(' ', ''))
                        wer = calculate_wer(strs_hyps[j], strs_gold[j])
                        total_valid_cer += cer
                        total_valid_wer += wer
                        total_valid_char += len(strs_gold[j].replace(' ', ''))
                        total_valid_word += len(strs_gold[j].split(" "))

                    total_valid_loss += loss.item()
                    valid_pbar.set_description(
                        "VALID SET {} LOSS:{:.4f} CER:{:.2f}%".format(
                            ind, total_valid_loss / (i + 1),
                            total_valid_cer * 100 / total_valid_char))
                logging.info("VALID SET {} LOSS:{:.4f} CER:{:.2f}%".format(
                    ind, total_valid_loss / (len(valid_loader)),
                    total_valid_cer * 100 / total_valid_char))

            metrics = {}
            metrics["train_loss"] = total_loss / len(train_loader)
            metrics["valid_loss"] = total_valid_loss / (len(valid_loader))
            metrics["train_cer"] = total_cer
            metrics["train_wer"] = total_wer
            metrics["valid_cer"] = total_valid_cer
            metrics["valid_wer"] = total_valid_wer
            metrics["history"] = history
            history.append(metrics)

            if epoch % constant.args.save_every == 0:
                save_model(model, (epoch + 1),
                           opt,
                           metrics,
                           label2id,
                           id2label,
                           best_model=False)

            # save the best model
            if best_valid_loss > total_valid_loss / len(valid_loader):
                best_valid_loss = total_valid_loss / len(valid_loader)
                save_model(model, (epoch + 1),
                           opt,
                           metrics,
                           label2id,
                           id2label,
                           best_model=True)

            if constant.args.shuffle:
                logging.info("SHUFFLE")
                print("SHUFFLE")
                train_sampler.shuffle(epoch)
Example #21
    def train(self,
              model,
              train_loader,
              train_sampler,
              valid_loaders,
              opt,
              loss_type,
              start_epoch,
              num_epochs,
              label2id,
              id2label,
              last_metrics=None,
              logger=None):
        """
        Training
        args:
            model: Model object
            train_loader: DataLoader object of the training set
            valid_loaders: list of DataLoader object of the validation set
            opt: Optimizer object
            start_epoch: start epoch (> 0 if you resume the process)
            num_epochs: last epoch
            last_metrics: (if resume)
        """
        if logger is not None:
            sys.out = logger  # note: a custom attribute on sys, not the standard sys.stdout

        start_time = time.time()
        best_valid_loss = 1000000000 if last_metrics is None else last_metrics[
            'valid_loss']
        smoothing = constant.args.label_smoothing

        history = []

        for epoch in range(start_epoch, num_epochs):
            sys.out.flush()
            total_loss, total_cer, total_wer, total_char, total_word = 0, 0, 0, 0, 0
            start_iter = 0

            print("TRAIN")
            model.train()
            pbar = tqdm(iter(train_loader),
                        leave=True,
                        total=len(train_loader))
            for i, (data) in enumerate(pbar, start=start_iter):
                src, tgt, src_percentages, src_lengths, tgt_lengths = data

                if constant.USE_CUDA:
                    src = src.cuda()
                    tgt = tgt.cuda()

                opt.optimizer.zero_grad()

                pred, gold, hyp_seq, gold_seq = model(
                    src,
                    input_lengths=src_lengths,
                    padded_target=tgt,
                    verbose=constant.args.verbose)

                strs_gold = [
                    "".join([id2label[int(x)] for x in gold])
                    for gold in gold_seq
                ]
                strs_hyps = [
                    "".join([id2label[int(x)] for x in hyp]) for hyp in hyp_seq
                ]

                loss, num_correct = calculate_metrics(
                    pred,
                    gold,
                    smoothing=smoothing,
                    loss_type=loss_type,
                    input_lengths=src_lengths,
                    target_lengths=tgt_lengths)

                if constant.args.verbose:
                    print("GOLD", strs_gold)
                    print("HYP", strs_hyps)

                for j in range(len(strs_hyps)):
                    cer = calculate_cer(strs_hyps[j], strs_gold[j])
                    wer = calculate_wer(strs_hyps[j], strs_gold[j])
                    total_cer += cer
                    total_wer += wer
                    total_char += len(strs_gold[j])
                    total_word += len(strs_gold[j].split(" "))

                loss.backward()
                opt.optimizer.step()

                total_loss += loss.detach().item()
                non_pad_mask = gold.ne(constant.PAD_TOKEN)
                num_word = non_pad_mask.sum().item()

                pbar.set_description(
                    "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% WER:{:.2f}%".
                    format((epoch + 1), total_loss / (i + 1),
                           total_cer * 100 / total_char,
                           total_wer * 100 / total_word))
            print(
                "(Epoch {}) TRAIN LOSS:{:.4f} CER:{:.2f}% WER:{:.2f}%".format(
                    (epoch + 1), total_loss / (len(train_loader)),
                    total_cer * 100 / total_char,
                    total_wer * 100 / total_word))

            print("VALID")
            all_valid_loss = []
            for valid_task_id in range(len(valid_loaders)):
                model.eval()
                sys.out.flush()

                valid_loader = valid_loaders[valid_task_id]

                total_valid_loss, total_valid_cer, total_valid_wer, total_valid_char, total_valid_word = 0, 0, 0, 0, 0
                valid_pbar = tqdm(iter(valid_loader),
                                  leave=True,
                                  total=len(valid_loader))
                for i, (data) in enumerate(valid_pbar):
                    src, tgt, src_percentages, src_lengths, tgt_lengths = data

                    if constant.USE_CUDA:
                        src = src.cuda()
                        tgt = tgt.cuda()

                    pred, gold, hyp_seq, gold_seq = model(
                        src,
                        input_lengths=src_lengths,
                        padded_target=tgt,
                        verbose=constant.args.verbose)
                    loss, num_correct = calculate_metrics(
                        pred,
                        gold,
                        smoothing=smoothing,
                        loss_type=loss_type,
                        input_lengths=src_lengths,
                        target_lengths=tgt_lengths)

                    strs_gold = [
                        "".join([id2label[int(x)] for x in gold])
                        for gold in gold_seq
                    ]
                    strs_hyps = [
                        "".join([id2label[int(x)] for x in hyp])
                        for hyp in hyp_seq
                    ]

                    for j in range(len(strs_hyps)):
                        cer = calculate_cer(strs_hyps[j], strs_gold[j])
                        wer = calculate_wer(strs_hyps[j], strs_gold[j])
                        total_valid_cer += cer
                        total_valid_wer += wer
                        total_valid_char += len(strs_gold[j])
                        total_valid_word += len(strs_gold[j].split(" "))

                    total_valid_loss += loss.detach().item()
                    valid_pbar.set_description(
                        "(Epoch {}) TASK:{} VALID LOSS:{:.4f} CER:{:.2f}% WER:{:.2f}%"
                        .format((epoch + 1), valid_task_id,
                                total_valid_loss / (i + 1),
                                total_valid_cer * 100 / total_valid_char,
                                total_valid_wer * 100 / total_valid_word))
                all_valid_loss.append(total_valid_loss / len(valid_pbar))
                print(
                    "(Epoch {}) TASK:{} VALID LOSS:{:.4f} CER:{:.2f}% WER:{:.2f}%"
                    .format((epoch + 1), valid_task_id,
                            total_valid_loss / (len(valid_loader)),
                            total_valid_cer * 100 / total_valid_char,
                            total_valid_wer * 100 / total_valid_word))

            metrics = {}
            metrics["train_loss"] = total_loss / len(train_loader)
            metrics["valid_loss"] = np.mean(np.array(all_valid_loss))
            metrics["valid_losses"] = all_valid_loss
            metrics["train_cer"] = total_cer
            metrics["train_wer"] = total_wer
            metrics["valid_cer"] = total_valid_cer
            metrics["valid_wer"] = total_valid_wer
            metrics["history"] = history
            history.append(metrics)

            if epoch % constant.args.save_every == 0:
                save_model(model, (epoch + 1),
                           opt,
                           metrics,
                           label2id,
                           id2label,
                           best_model=False)

            # save the best model
            if best_valid_loss > total_valid_loss / len(valid_loader):
                best_valid_loss = total_valid_loss / len(valid_loader)
                save_model(model, (epoch + 1),
                           opt,
                           metrics,
                           label2id,
                           id2label,
                           best_model=True)

            if constant.args.shuffle:
                print("SHUFFLE")
                train_sampler.shuffle(epoch)
Example #22
def train_and_test(training, testing, output_path, **kwargs):
    model = fasttext.train_supervised(training, **kwargs)
    results = calculate_metrics(testing, model)
    model.save_model(output_path)
    return results
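A hypothetical call, reusing the caption file paths from the snippets above and
passing illustrative fastText hyperparameters through **kwargs:

results = train_and_test("fasttext_data/captions/training_data.txt",
                         "fasttext_data/captions/testing_data.txt",
                         "models/captions/model.bin",
                         epoch=25,
                         wordNgrams=2)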
Example #23
 def count_accuracy(self):
     from utils.metrics import calculate_metrics
     return calculate_metrics(self.conv_Xs_to_label(), self.ys).accuracy
Example #24
def main():
    plt.rcParams['figure.dpi'] = 300
    plt.rcParams['font.size'] = 7

    # Classes
    classes = ["dog", "cat", "Null"]
    # classes = ["dog", "cat"]

    # DataFrames
    actual_df = pd.read_csv("example\\actual.csv")
    actual_df = preprocess_df(actual_df)

    detected_df = pd.read_csv("example\\detected.csv")
    detected_df = preprocess_df(detected_df)
    detected_df = remove_overlapping_objects(detected_df)

    # Calculating
    df = calculate_metrics(actual_df,
                           detected_df,
                           prob_thresh=0,
                           iou_thresh=0.0)

    df.to_csv("example\\result_df.csv", index=False)

    # ============ Collect data for sklearn =============
    y_true = []
    y_pred = []
    y_score = []
    for i, row in df[df['a_xmin'] != 'Null'].iterrows():

        true_class = row['a_label']
        y_true.append(true_class)
        pred_class = row['d_label']
        y_pred.append(pred_class)

        prob = row['d_prob']
        if prob == "Null":
            y_score.append(0)
        else:
            y_score.append(float(prob))

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_score = np.array(y_score)

    # for true, pred in zip(y_true, y_pred):
    #     print(true, pred)

    print("Accuracy ", 100 * (y_true == y_pred).sum() / len(y_true))

    # ========= Confusion Matrix ===========
    cm = sm.confusion_matrix(y_true, y_pred, labels=sorted(classes))
    plot_confusion_matrix(cm, classes=sorted(classes))
    plt.show()

    cm_display = sm.ConfusionMatrixDisplay(
        cm, display_labels=sorted(classes)).plot()
    plt.show()

    # ========= Classification Report ===========

    cp = sm.classification_report(y_true,
                                  y_pred,
                                  labels=sorted(classes),
                                  output_dict=False)
    print(cp)

    # ========= PR Curve ===========
    precision = {}
    recall = {}
    thresh = {}

    for i in classes:
        precision[i], recall[i], thresh[i] = sm.precision_recall_curve(
            y_true, y_score, pos_label=i)
        plt.plot(recall[i], precision[i], lw=2, label=f'{i}')

    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.legend(loc="best")
    plt.title("precision vs. recall curve")
    plt.show()

    print("PR Curve")

    # for pr, rec, thresh_ in zip(precision["full_lined"], recall["full_lined"], thresh["full_lined"]):
    #     print(pr, rec, thresh_)

    # ========= ROC Curve ===========
    fpr = {}
    tpr = {}
    thresh = {}
    roc_auc = {}

    for i in classes:
        fpr[i], tpr[i], thresh[i] = sm.roc_curve(y_true, y_score, pos_label=i)
        roc_auc[i] = sm.auc(fpr[i], tpr[i])
        plt.plot(fpr[i], tpr[i], lw=2, label=f'{i} (area = {roc_auc[i]:0.2f})')

    # No-skill baseline: pos_label must be a class actually present in y_true
    # ("nolines" was a leftover label from a different dataset)
    ns_probs = [0 for _ in range(len(y_true))]
    ns_fpr, ns_tpr, _ = sm.roc_curve(y_true, ns_probs, pos_label=classes[0])
    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')

    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    plt.legend(loc="best")
    plt.title("ROC curve")
    plt.show()

    print("ROC Curve")
Example #25
                        fc_hidden_sizes=fc_hs,
                        add_residual_connection=False)

                    model_desc = "_gc_" + str(gc_hs) + "_fc_" + str(
                        fc_hs) + "_lr_" + str(lr) + "_epochs_" + str(epochs)

                    train(model=gcn_model,
                          train_data=train_data,
                          validation_data=valid_data,
                          num_epochs=epochs,
                          learning_rate=lr,
                          metrics_to_log=metrics,
                          model_path=args.model_dir + args.model_prefix +
                          ".pt")

                    valid_metrics_result = calculate_metrics(model=gcn_model,
                                                             data=valid_data)
                    test_metrics_result = calculate_metrics(model=gcn_model,
                                                            data=test_data)

                    result = {
                        "gc_hidden_layers": str(gc_hs),
                        "fc_hidden_layers": str(fc_hs),
                        "learning_rate": lr
                    }

                    for m in metrics:
                        result["test_" + m] = test_metrics_result[m]
                        result["valid_" + m] = valid_metrics_result[m]

                    results = results.append(result, ignore_index=True)