from typing import Dict, Text
import logging

import numpy as np
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from transformers import AdamW

# Project-local helpers referenced below (init_model, preprocessing_pipeline,
# generate_dataloader, report, save_test, save_model_data, the dataset /
# network / model classes and the MOVIE_DATASET, SST_DATASET, TOKENIZER,
# MAX_WORD_SENTENCE constants) are assumed importable from the surrounding
# project.

logger = logging.getLogger(__name__)


def nn_inference_pipeline(model_name: Text,
                          model_path: Text,
                          data_params: Dict,
                          dataloader_params: Dict,
                          save_dir=None):
    # init_model resolves the model by name; load() restores trained weights
    model = init_model(model_name)
    model = model.load(model_path)

    x_test, y_test, dataset = preprocessing_pipeline(data_params)

    dataloader = generate_dataloader(x_test, y_test, dataloader_params)

    # model.predict returns (prediction, target) per batch; run inference
    # once per batch instead of predicting twice
    batch_outputs = [model.predict(batch) for batch in dataloader['valid']]
    pred = [out[0] for out in batch_outputs]
    y_true = [out[1] for out in batch_outputs]
    post_pred = dataset.postprocessing(pred, model_name)

    report_df = report(y_true, post_pred)

    logger.info(f' > Model: {model.name}')
    logger.info(f' > Dataset: {dataset.__class__.__name__}')
    logger.info(f' > Test result: \n {report_df}')

    if save_dir:
        save_test(model_name,
                  data_params,
                  report_df,
                  save_dir)
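
# Example invocation of nn_inference_pipeline. A sketch only: the model name,
# checkpoint path, save_dir and the 'train': False flag below are hypothetical
# placeholders, not values taken from the project.
def example_nn_inference():
    data_params = {
        'data_path': 'resources/preprocessed_data/cleaned_data_v1.csv',
        'dataset_type': MOVIE_DATASET,
        'preprocessed': True,
        'vectorization': TOKENIZER,
        'imbalance': None,
        'train': False  # assumption: inference-time preprocessing
    }
    dataloader_params = {'split_size': 0.7, 'shuffle': False,
                         'batch_size': 32, 'random_seed': 2021}
    nn_inference_pipeline('conv1d',                       # hypothetical name
                          'resources/models/conv1d.pt',   # hypothetical path
                          data_params,
                          dataloader_params,
                          save_dir='resources/tests/')
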
def inference_without_trained_model(model_name: Text,
                                    data_params: Dict,
                                    model_params: Dict,
                                    save_dir=None):

    x, y, dataset, _ = preprocessing_pipeline(data_params)

    model_func = init_model(model_name)
    model = model_func(model_name, model_params)

    y_pred = model.predict(x)
    y_pred = model.postprocessing(y_pred)

    report_df = report(y, y_pred)

    logger.info(f' > Model: {model.name}')
    logger.info(f' > Dataset: {dataset.__class__.__name__}')
    logger.info(f' > Test result: \n {report_df}')

    if save_dir:
        save_model_data(None,  # no trained model to persist in this pipeline
                        data_params,
                        model_params,
                        report_df,
                        save_dir=save_dir)

    return model
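
# report() is not shown in this listing; a minimal sketch of what it plausibly
# does, assuming it tabulates sklearn's classification_report into the
# DataFrame that the pipelines log. This is an assumption, not the project's
# actual implementation.
def report_sketch(y_true, y_pred):
    import pandas as pd
    from sklearn.metrics import classification_report
    # output_dict=True yields per-class precision/recall/f1 that transpose
    # neatly into DataFrame rows
    metrics = classification_report(y_true, y_pred, output_dict=True)
    return pd.DataFrame(metrics).transpose()
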
def model_training(model_name: Text,
                   data_params: Dict,
                   model_params: Dict,
                   seed: int,
                   save_dir=None):

    x, y, dataset, vectorizer = preprocessing_pipeline(data_params)

    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=data_params['test_size'],
        shuffle=True,
        random_state=seed)

    model_func = init_model(model_name)
    model = model_func(model_name, model_params)

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    y_pred = dataset.postprocessing(y_pred, model_name)

    report_df = report(y_test, y_pred)

    logger.info(f' > Model: {model.name}')
    logger.info(f' > Test result: \n {report_df}')

    if save_dir:
        save_model_data(model,
                        data_params,
                        model_params,
                        report_df,
                        save_dir=save_dir)

    return model
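
# Example call to model_training. A sketch only: the 'random_forest' name and
# model_params are hypothetical placeholders; 'test_size' is included because
# model_training forwards it to train_test_split.
def example_training():
    data_params = {
        'data_path': 'resources/preprocessed_data/cleaned_data_v1.csv',
        'dataset_type': MOVIE_DATASET,
        'preprocessed': True,
        'vectorization': TOKENIZER,
        'imbalance': None,
        'train': True,
        'test_size': 0.3
    }
    model_params = {'n_estimators': 100}  # hypothetical model config
    return model_training('random_forest', data_params, model_params,
                          seed=2021, save_dir='resources/models/')
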
def model_training_nn(model_name: Text,
                      data_params: Dict,
                      dataloader_params: Dict,
                      model_params: Dict,
                      save_dir=None):
    x, y, dataset, vectorizer = preprocessing_pipeline(data_params)

    dataloader = generate_dataloader(x, y, dataloader_params)

    # inject the dataloader into the model config so that fit() can consume
    # it directly
    model_params['dataloader'] = dataloader
    model_params['network']['n_words'] = (vectorizer.get_n_words()
                                          if vectorizer is not None else None)

    model_func = init_model(model_name)
    model = model_func(model_name, model_params)

    model.fit()
    y_pred = np.ravel([model.predict(x) for x in dataloader['valid']])
    y_pred_post = dataset.postprocessing(y_pred, model_name)
    y_true = np.ravel([x[1].numpy() for x in dataloader['valid']])

    report_df = report(y_true, y_pred_post)

    logger.info(f' > Model: {model.name}')
    logger.info(f' > Test result: \n {report_df}')

    if save_dir:
        save_model_data(model,
                        data_params,
                        model_params,
                        report_df,
                        save_dir=save_dir)

    return model
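
# generate_dataloader() is project-local; a minimal sketch of the behaviour
# its call sites imply: split x/y by 'split_size', wrap each half in a dataset
# class (passed positionally, or via a 'dataset_class' key as in
# test_train_model below), and return PyTorch loaders keyed 'train'/'valid'.
# Everything beyond those observable facts is an assumption.
def generate_dataloader_sketch(x, y, params, dataset_class=None):
    from torch.utils.data import DataLoader
    dataset_class = dataset_class or params.get('dataset_class', NN_Dataset)
    x_train, x_valid, y_train, y_valid = train_test_split(
        x, y,
        train_size=params['split_size'],
        shuffle=params['shuffle'],
        random_state=params['random_seed'])
    return {
        'train': DataLoader(dataset_class(x_train, y_train),
                            batch_size=params['batch_size'],
                            shuffle=params['shuffle']),
        'valid': DataLoader(dataset_class(x_valid, y_valid),
                            batch_size=params['batch_size'],
                            shuffle=False)
    }
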
def test_network():
    seed = 2021

    emb_dim = 100

    dataset_params = {
        'data_path': 'resources/preprocessed_data/cleaned_data_v1.csv',
        'dataset_type': MOVIE_DATASET,
        'preprocessed': True,
        # 'target_scaling': (0, 1),
        'vectorization': TOKENIZER,
        # 'vector_params': {'ngram_range': (1, 3),
        #                   'max_features': emb_dim},
        'imbalance': None,
        # 'imb_params': {'random_state': seed,
        #                'k_neighbors': 3},
        'train': True
    }

    dataloader_params = {
        'split_size': 0.7,
        'shuffle': True,
        'batch_size': 32,
        'random_seed': seed
    }

    network_params = {
        'emb_dim': emb_dim,
        'dataset_type': TOKENIZER,
        'kernel_size': [3, 5, 7],
        'out_channels': 30,
        'batch_size': 32,
        'stride': 1,
        'padding': [0, 1, 2],
        'pooling_kernel': 2,
        'dropout': 0.4
    }

    training_params = {'epochs': 10, 'lr': 0.001}

    x, y, dataset, vectorizer = preprocessing_pipeline(dataset_params)
    network_params['n_words'] = vectorizer.get_n_words()

    dataloader = generate_dataloader(x, y, dataloader_params)
    network = Conv1D_Network(network_params)
    loss = nn.BCELoss()
    optimizer = torch.optim.Adam

    model = NetworkModel(network, dataloader, loss, optimizer)
    model._init_optimizer(training_params['lr'])

    model.train(training_params['epochs'])

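
# Conv1D_Network is defined elsewhere in the project; below is a plausible
# reconstruction of the architecture its parameters suggest (a Kim-2014-style
# multi-kernel text CNN). Note how kernel_size and padding are paired above:
# with stride 1 the output length is L + 2p - k + 1, so (3, 0), (5, 1) and
# (7, 2) all yield L - 2 and the branch outputs can be concatenated. The
# class below is illustrative, not the project's actual implementation.
class Conv1DSketch(nn.Module):
    def __init__(self, p):
        super().__init__()
        self.embedding = nn.Embedding(p['n_words'], p['emb_dim'])
        self.branches = nn.ModuleList([
            nn.Conv1d(p['emb_dim'], p['out_channels'],
                      kernel_size=k, stride=p['stride'], padding=pad)
            for k, pad in zip(p['kernel_size'], p['padding'])
        ])
        self.pool = nn.MaxPool1d(p['pooling_kernel'])
        self.dropout = nn.Dropout(p['dropout'])
        # BCELoss in test_network implies a single sigmoid output; LazyLinear
        # avoids hard-coding the flattened feature size
        self.fc = nn.LazyLinear(1)

    def forward(self, tokens):
        emb = self.embedding(tokens).transpose(1, 2)  # (B, emb_dim, L)
        feats = [self.pool(torch.relu(conv(emb))) for conv in self.branches]
        flat = self.dropout(torch.cat(feats, dim=1).flatten(1))
        return torch.sigmoid(self.fc(flat)).squeeze(-1)
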
def test_bert_model():
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    seed = 2021

    dataset_params = {
        'data_path': None,
        'dataset_type': SST_DATASET,
        'preprocessed': False,
        'vectorization': None,
        'imbalance': None,
        'train': True
    }

    dataloader_params = {
        'split_size': 0.7,
        'shuffle': True,
        'batch_size': 64,
        'random_seed': seed
    }

    network_params = {'dropout': 0.2, 'device': torch.device('cpu:0')}

    training_params = {
        'epochs': 10,
        'lr': 0.001,
        'save_dir': 'resources/models/',
        'patience': 5
    }

    x, y, dataset, vectorizer = preprocessing_pipeline(dataset_params)

    dataloader = generate_dataloader(x, y, dataloader_params,
                                     Bert_NN_Dataset)
    network = Pretrained_Bert_Network(network_params).to(
        network_params['device'])
    loss = nn.BCELoss()
    optimizer = AdamW

    model = Pretrained_Bert_Model(network, dataloader, loss, optimizer,
                                  training_params['save_dir'],
                                  network_params['device'])
    model._init_optimizer(training_params['lr'])

    model.train(training_params['epochs'],
                patience=training_params['patience'])
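
# Pretrained_Bert_Network is project-local; a rough sketch of what its
# parameters suggest, assuming it wraps a Hugging Face BERT encoder with
# dropout and a sigmoid head to match the BCELoss above. Illustrative only;
# the checkpoint name and forward signature are assumptions.
class BertSketch(nn.Module):
    def __init__(self, p):
        super().__init__()
        from transformers import BertModel
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p['dropout'])
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(out.pooler_output)  # [CLS]-derived summary
        return torch.sigmoid(self.fc(pooled)).squeeze(-1)
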
def test_tokenizing():
    seed = 2021

    emb_dim = 100

    dataset_params = {'data_path': 'resources/preprocessed_data/cleaned_data_v1.csv',
                      'dataset_type': MOVIE_DATASET,
                      'preprocessed': True,
                      # 'target_scaling': (0, 1),
                      'vectorization': TOKENIZER,
                      'imbalance': None,
                      'imb_params': {'random_state': seed,
                                     'k_neighbors': 3},
                      'train': True}

    # preprocessing_pipeline returns (x, y, dataset, vectorizer) here,
    # matching the other call sites that pass 'train': True
    x, y, dataset, _ = preprocessing_pipeline(dataset_params)

    assert len(x) == len(y)
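
# The 'imb_params' keys above (random_state, k_neighbors) match the signature
# of imbalanced-learn's SMOTE, so the pipeline's imbalance step plausibly
# resamples like this. An assumption about the project's code, not a fact.
def oversample_sketch(x, y, imb_params):
    from imblearn.over_sampling import SMOTE
    return SMOTE(**imb_params).fit_resample(x, y)
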
def inference_pipeline(model_name: Text,
                       model_path: Text,
                       data_params: Dict,
                       save_dir=None):

    model = init_model(model_name)
    model = model.load(model_path)

    x_test, y_test, dataset = preprocessing_pipeline(data_params)

    pred = model.predict(x_test)
    post_pred = dataset.postprocessing(pred, model_name)

    report_df = report(y_test, post_pred)

    logger.info(f' > Model: {model.name}')
    logger.info(f' > Dataset: {dataset.__class__.__name__}')
    logger.info(f' > Test result: \n {report_df}')

    if save_dir:
        save_test(model_name, data_params, report_df, save_dir)
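
# Example load-and-evaluate call for inference_pipeline. The model name,
# checkpoint path and 'train': False flag are hypothetical placeholders.
def example_inference():
    data_params = {
        'data_path': 'resources/preprocessed_data/cleaned_data_v1.csv',
        'dataset_type': MOVIE_DATASET,
        'preprocessed': True,
        'vectorization': TOKENIZER,
        'imbalance': None,
        'train': False  # assumption: evaluation-time preprocessing
    }
    inference_pipeline('random_forest',                        # hypothetical
                       'resources/models/random_forest.pkl',   # hypothetical
                       data_params,
                       save_dir='resources/tests/')
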
def test_train_model():
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    seed = 2021

    emb_dim = 100

    dataset_params = {
        'data_path': 'resources/preprocessed_data/cleaned_data_v1.csv',
        'dataset_type': MOVIE_DATASET,
        'preprocessed': True,
        # 'target_scaling': (0, 1),
        'vectorization': TOKENIZER,
        # 'vector_params': {'ngram_range': (1, 3),
        #                   'max_features': emb_dim},
        'imbalance': None,
        # 'imb_params': {'random_state': seed,
        #                'k_neighbors': 3},
        'train': True
    }

    dataloader_params = {
        'split_size': 0.7,
        'shuffle': True,
        'batch_size': 64,
        'random_seed': seed,
        'dataset_class': NN_Dataset
    }

    network_params = {
        'emb_dim': MAX_WORD_SENTENCE,
        'dataset_type': TOKENIZER,
        'kernel_size': [5, 7, 9],
        'out_channels': 10,
        'stride': 1,
        'padding': [0, 1, 2],
        'pooling_kernel': 2,
        'dropout': 0.2,
        'device': torch.device('cpu:0')
    }

    training_params = {
        'epochs': 10,
        'lr': 0.001,
        'save_dir': 'resources/models/',
        'patience': 5
    }

    x, y, dataset, vectorizer = preprocessing_pipeline(dataset_params)
    network_params['n_words'] = vectorizer.get_n_words()

    dataloader = generate_dataloader(x, y, dataloader_params)
    network = Conv1D_Network(network_params).to(network_params['device'])
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam

    model = ConvModel(network, dataloader, loss, optimizer,
                      training_params['save_dir'])
    model._init_optimizer(training_params['lr'])

    model.train(training_params['epochs'],
                patience=training_params['patience'])
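
# model.train(epochs, patience=...) points to early stopping on validation
# loss; a generic sketch of that loop (an assumption, since the internals of
# NetworkModel/ConvModel/Pretrained_Bert_Model are not shown in this listing).
def early_stopping_sketch(train_one_epoch, validate, epochs, patience):
    best_loss, stale_epochs = float('inf'), 0
    for _ in range(epochs):
        train_one_epoch()
        val_loss = validate()
        if val_loss < best_loss:
            best_loss, stale_epochs = val_loss, 0
        else:
            stale_epochs += 1
            if stale_epochs >= patience:
                break  # stop once validation stops improving
    return best_loss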