Exemple #1
0
def main(args):

    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)

    default_path = create_default_path()
    print('\n*** Set default saving/loading path to:', default_path)

    if args.dataset == AIFB or args.dataset == MUTAG:
        module = importlib.import_module(MODULE.format('dglrgcn'))
        data = module.load_dglrgcn(args.data_path)
        data = to_cuda(data) if cuda else data
        mode = NODE_CLASSIFICATION
    elif args.dataset == MUTAGENICITY or args.dataset == PTC_MR or args.dataset == PTC_MM or args.dataset == PTC_FR or args.dataset == PTC_FM:
        module = importlib.import_module(MODULE.format('dortmund'))
        data = module.load_dortmund(args.data_path)
        data = to_cuda(data) if cuda else data
        mode = GRAPH_CLASSIFICATION
    else:
        raise ValueError('Unable to load dataset', args.dataset)

    print_graph_stats(data[GRAPH])

    config_params = read_params(args.config_fpath, verbose=True)

    # create GNN model
    model = Model(g=data[GRAPH],
                  config_params=config_params[0],
                  n_classes=data[N_CLASSES],
                  n_rels=data[N_RELS] if N_RELS in data else None,
                  n_entities=data[N_ENTITIES] if N_ENTITIES in data else None,
                  is_cuda=cuda,
                  mode=mode)

    if cuda:
        model.cuda()

    # 1. Training
    app = App()
    learning_config = {
        'lr': args.lr,
        'n_epochs': args.n_epochs,
        'weight_decay': args.weight_decay,
        'batch_size': args.batch_size,
        'cuda': cuda
    }
    print('\n*** Start training ***\n')
    app.train(data, config_params[0], learning_config, default_path, mode=mode)

    # 2. Testing
    print('\n*** Start testing ***\n')
    app.test(data, default_path, mode=mode)

    # 3. Delete model
    remove_model(default_path)
def test_log_clustering_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    kmeans = Model(model_type=ModelTypesIdsEnum.kmeans)

    _, train_predicted = kmeans.fit(data=train_data)

    assert all(np.unique(train_predicted) == [0, 1])
Exemple #3
0
def test_pca_model_removes_redunant_features_correct():
    n_informative = 5
    data = classification_dataset_with_redunant_features(
        n_samples=1000, n_features=100, n_informative=n_informative)
    train_data, test_data = train_test_data_setup(data=data)

    pca = Model(model_type='pca_data_model')
    _, train_predicted = pca.fit(data=train_data)

    assert train_predicted.shape[1] < data.features.shape[1]
def test_log_regression_fit_correct(classification_dataset):
    data = classification_dataset
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    log_reg = Model(model_type=ModelTypesIdsEnum.logit)

    _, train_predicted = log_reg.fit(data=train_data)
    roc_on_train = roc_auc(y_true=train_data.target, y_score=train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
def test_qda_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    qda = Model(model_type=ModelTypesIdsEnum.qda)

    _, train_predicted = qda.fit(data=train_data)
    roc_on_train = roc_auc(y_true=train_data.target, y_score=train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
Exemple #6
0
def test_lda_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    lda = Model(model_type='lda')

    _, train_predicted = lda.fit(data=train_data)

    roc_on_train = get_roc_auc(train_data, train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
Exemple #7
0
def test_log_regression_fit_correct(classification_dataset):
    data = classification_dataset
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    log_reg = Model(model_type='logit')

    _, train_predicted = log_reg.fit(data=train_data)

    roc_on_train = get_roc_auc(train_data, train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
Exemple #8
0
def test_scoring_logreg_tune_correct(data_fixture, request):
    train_data, test_data = request.getfixturevalue(data_fixture)

    train_data.features = Scaling().fit(train_data.features).apply(
        train_data.features)
    test_data.features = Scaling().fit(test_data.features).apply(
        test_data.features)

    logreg = Model(model_type='logit')

    model, _ = logreg.fit(train_data)
    test_predicted = logreg.predict(fitted_model=model, data=test_data)

    test_roc_auc = roc_auc(y_true=test_data.target, y_score=test_predicted)

    logreg_for_tune = Model(model_type='logit')

    model_tuned, _ = logreg_for_tune.fine_tune(
        train_data, iterations=50, max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = logreg_for_tune.predict(fitted_model=model_tuned,
                                                   data=test_data)

    test_roc_auc_tuned = roc_auc(y_true=test_data.target,
                                 y_score=test_predicted_tuned)

    roc_threshold = 0.6

    assert round(test_roc_auc_tuned, 2) >= round(test_roc_auc,
                                                 2) > roc_threshold
Exemple #9
0
def enable_gradient_clipping(model: Model,
                             grad_clipping: Optional[float]) -> None:
    if grad_clipping is not None:
        for parameter in model.parameters():
            if parameter.requires_grad:
                parameter.register_hook(lambda grad: nn_util.clamp_tensor(
                    grad, minimum=-grad_clipping, maximum=grad_clipping))
Exemple #10
0
    def log_parameter_and_gradient_statistics(self, # pylint: disable=invalid-name
                                              model: Model,
                                              batch_grad_norm: float) -> None:
        """
        Send the mean and std of all parameters and gradients to tensorboard, as well
        as logging the average gradient norm.
        """
        if self._should_log_parameter_statistics:
            # Log parameter values to Tensorboard
            for name, param in model.named_parameters():
                self.add_train_scalar("parameter_mean/" + name, param.data.mean())
                self.add_train_scalar("parameter_std/" + name, param.data.std())
                if param.grad is not None:
                    if param.grad.is_sparse:
                        # pylint: disable=protected-access
                        grad_data = param.grad.data._values()
                    else:
                        grad_data = param.grad.data

                    # skip empty gradients
                    if torch.prod(torch.tensor(grad_data.shape)).item() > 0: # pylint: disable=not-callable
                        self.add_train_scalar("gradient_mean/" + name, grad_data.mean())
                        self.add_train_scalar("gradient_std/" + name, grad_data.std())
                    else:
                        # no gradient for a parameter with sparse gradients
                        logger.info("No gradient for %s, skipping tensorboard logging.", name)
            # norm of gradients
            if batch_grad_norm is not None:
                self.add_train_scalar("gradient_norm", batch_grad_norm)
Exemple #11
0
 def log_histograms(self, model: Model, histogram_parameters: Set[str]) -> None:
     """
     Send histograms of parameters to tensorboard.
     """
     for name, param in model.named_parameters():
         if name in histogram_parameters:
             self.add_train_histogram("parameter_histogram/" + name, param)
Exemple #12
0
def test_node_factory_log_reg_correct(data_setup):
    model_type = ModelTypesIdsEnum.logit
    node = NodeGenerator().primary_node(model_type=model_type)

    expected_model = Model(model_type=model_type).__class__
    actual_model = node.model.__class__

    assert node.__class__ == PrimaryNode
    assert expected_model == actual_model
Exemple #13
0
def test_node_factory_log_reg_correct(data_setup):
    model_type = 'logit'
    node = PrimaryNode(model_type=model_type)

    expected_model = Model(model_type=model_type).__class__
    actual_model = node.model.__class__

    assert node.__class__ == PrimaryNode
    assert expected_model == actual_model
Exemple #14
0
def test_arima_tune_correct():
    data = get_synthetic_ts_data()
    train_data, test_data = train_test_data_setup(data=data)

    arima_for_tune = Model(model_type='arima')
    model, _ = arima_for_tune.fine_tune(data=train_data,
                                        iterations=5,
                                        max_lead_time=timedelta(minutes=0.1))

    test_predicted_tuned = arima_for_tune.predict(fitted_model=model,
                                                  data=test_data)

    rmse_on_test_tuned = mse(y_true=test_data.target,
                             y_pred=test_predicted_tuned,
                             squared=False)

    rmse_threshold = np.std(test_data.target)

    assert rmse_on_test_tuned < rmse_threshold
Exemple #15
0
def rescale_gradients(model: Model,
                      grad_norm: Optional[float] = None) -> Optional[float]:
    """
    Performs gradient rescaling. Is a no-op if gradient rescaling is not enabled.
    """
    if grad_norm:
        parameters_to_clip = [
            p for p in model.parameters() if p.grad is not None
        ]
        return sparse_clip_norm(parameters_to_clip, grad_norm)
    return None
Exemple #16
0
def test_classification_manual_tuning_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)

    knn_for_tune = Model(model_type='knn')
    knn_for_tune.params = {'n_neighbors': 1}
    model, _ = knn_for_tune.fit(data=train_data)

    test_predicted_tuned = knn_for_tune.predict(fitted_model=model,
                                                data=test_data)

    assert not np.array_equal(test_predicted, test_predicted_tuned)
Exemple #17
0
def test_max_lead_time_in_tune_process(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    start = datetime.now()

    knn_for_tune = Model(model_type='knn')
    model, _ = knn_for_tune.fine_tune(data=train_data,
                                      max_lead_time=timedelta(minutes=0.05),
                                      iterations=100)
    test_predicted_tuned = knn_for_tune.predict(fitted_model=model,
                                                data=test_data)

    roc_on_test_tuned = roc_auc(y_true=test_data.target,
                                y_score=test_predicted_tuned)
    roc_threshold = 0.6

    spent_time = (datetime.now() - start).seconds

    assert roc_on_test_tuned > roc_threshold
    assert spent_time == 3
Exemple #18
0
def get_metrics(model: Model,
                total_loss: float,
                num_batches: int,
                reset: bool = False) -> Dict[str, float]:
    """
    Gets the metrics but sets ``"loss"`` to
    the total loss divided by the ``num_batches`` so that
    the ``"loss"`` metric is "average loss per batch".
    """
    metrics = model.get_metrics(reset=reset)
    metrics["loss"] = float(total_loss /
                            num_batches) if num_batches > 0 else 0.0
    return metrics
Exemple #19
0
    def enable_activation_logging(self, model: Model) -> None:
        if self._histogram_interval is not None:
            # To log activation histograms to the forward pass, we register
            # a hook on forward to capture the output tensors.
            # This uses a closure to determine whether to log the activations,
            # since we don't want them on every call.
            for _, module in model.named_modules():
                if not getattr(module, 'should_log_activations', False):
                    # skip it
                    continue

                def hook(module_, inputs, outputs):
                    # pylint: disable=unused-argument,cell-var-from-loop
                    log_prefix = 'activation_histogram/{0}'.format(module_.__class__)
                    if self.should_log_histograms_this_batch():
                        self.log_activation_histogram(outputs, log_prefix)
                module.register_forward_hook(hook)
Exemple #20
0
def test_pca_manual_tuning_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    pca = Model(model_type='pca_data_model')
    model, _ = pca.fit(data=train_data)
    test_predicted = pca.predict(fitted_model=model, data=test_data)

    pca_for_tune = Model(model_type='pca_data_model')

    pca_for_tune.params = {
        'svd_solver': 'randomized',
        'iterated_power': 'auto',
        'dim_reduction_expl_thr': 0.7,
        'dim_reduction_min_expl': 0.001
    }

    model, _ = pca_for_tune.fit(data=train_data)
    test_predicted_tuned = pca_for_tune.predict(fitted_model=model,
                                                data=test_data)

    assert not np.array_equal(test_predicted, test_predicted_tuned)
Exemple #21
0
 def log_learning_rates(self,
                        model: Model,
                        optimizer: torch.optim.Optimizer):
     """
     Send current parameter specific learning rates to tensorboard
     """
     if self._should_log_learning_rate:
         # optimizer stores lr info keyed by parameter tensor
         # we want to log with parameter name
         names = {param: name for name, param in model.named_parameters()}
         for group in optimizer.param_groups:
             if 'lr' not in group:
                 continue
             rate = group['lr']
             for param in group['params']:
                 # check whether params has requires grad or not
                 effective_rate = rate * float(param.requires_grad)
                 self.add_train_scalar("learning_rate/" + names[param], effective_rate)
Exemple #22
0
def fit_template(chain_template, classes, with_gaussian=False, skip_fit=False):
    templates_by_models = []
    for model_template in itertools.chain.from_iterable(chain_template):
        model_instance = Model(model_type=model_template.model_type)
        model_template.model_instance = model_instance
        templates_by_models.append((model_template, model_instance))
    if skip_fit:
        return

    for template, instance in templates_by_models:
        samples, features_amount = template.input_shape

        if with_gaussian:
            features, target = gauss_quantiles(samples_amount=samples,
                                               features_amount=features_amount,
                                               classes_amount=classes)
        else:
            options = {
                'informative': features_amount,
                'redundant': 0,
                'repeated': 0,
                'clusters_per_class': 1
            }
            features, target = synthetic_dataset(
                samples_amount=samples,
                features_amount=features_amount,
                classes_amount=classes,
                features_options=options)
        target = np.expand_dims(target, axis=1)
        data_train = InputData(idx=np.arange(0, samples),
                               features=features,
                               target=target,
                               data_type=DataTypesEnum.table,
                               task=Task(TaskTypesEnum.classification))

        preproc_data = copy(data_train)
        preprocessor = Normalization().fit(preproc_data.features)
        preproc_data.features = preprocessor.apply(preproc_data.features)
        print(f'Fit {instance}')
        fitted_model, predictions = instance.fit(data=preproc_data)

        template.fitted_model = fitted_model
        template.data_fit = preproc_data
        template.preprocessor = preprocessor
Exemple #23
0
def test_knn_classification_tune_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)

    roc_on_test = roc_auc(y_true=test_data.target, y_score=test_predicted)

    knn_for_tune = Model(model_type='knn')
    model, _ = knn_for_tune.fine_tune(data=train_data,
                                      iterations=10,
                                      max_lead_time=timedelta(minutes=1))

    test_predicted_tuned = knn.predict(fitted_model=model, data=test_data)

    roc_on_test_tuned = roc_auc(y_true=test_data.target,
                                y_score=test_predicted_tuned)
    roc_threshold = 0.6
    assert roc_on_test_tuned > roc_on_test > roc_threshold
Exemple #24
0
def test_rf_class_tune_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    rf = Model(model_type='rf')

    model, _ = rf.fit(train_data)
    test_predicted = rf.predict(fitted_model=model, data=test_data)

    test_roc_auc = roc_auc(y_true=test_data.target, y_score=test_predicted)

    model_tuned, _ = rf.fine_tune(data=train_data,
                                  iterations=12,
                                  max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = rf.predict(fitted_model=model_tuned, data=test_data)

    test_roc_auc_tuned = roc_auc(y_true=test_data.target,
                                 y_score=test_predicted_tuned)
    roc_threshold = 0.7

    assert test_roc_auc_tuned != test_roc_auc
    assert test_roc_auc_tuned > roc_threshold
Exemple #25
0
 def __init__(self, nodes_from: Optional[List['Node']], model_type: str,
              manual_preprocessing_func: Optional[Callable] = None):
     self.nodes_from = nodes_from
     self.model = Model(model_type=model_type)
     self.cache = FittedModelCache(self)
     self.manual_preprocessing_func = manual_preprocessing_func
Exemple #26
0
class Node(ABC):

    def __init__(self, nodes_from: Optional[List['Node']], model_type: str,
                 manual_preprocessing_func: Optional[Callable] = None):
        self.nodes_from = nodes_from
        self.model = Model(model_type=model_type)
        self.cache = FittedModelCache(self)
        self.manual_preprocessing_func = manual_preprocessing_func

    @property
    def descriptive_id(self):
        return self._descriptive_id_recursive(visited_nodes=[])

    def _descriptive_id_recursive(self, visited_nodes):
        node_label = self.model.description
        if self.manual_preprocessing_func:
            node_label = f'{node_label}_custom_preprocessing={self.manual_preprocessing_func.__name__}'
        full_path = ''
        if self in visited_nodes:
            return 'ID_CYCLED'
        visited_nodes.append(self)
        if self.nodes_from:
            previous_items = []
            for parent_node in self.nodes_from:
                previous_items.append(f'{parent_node._descriptive_id_recursive(copy(visited_nodes))};')
            previous_items.sort()
            previous_items_str = ';'.join(previous_items)

            full_path += f'({previous_items_str})'
        full_path += f'/{node_label}'
        return full_path

    @property
    def model_tags(self) -> List[str]:
        return self.model.metadata.tags

    def output_from_prediction(self, input_data, prediction):
        return OutputData(idx=input_data.idx,
                          features=input_data.features,
                          predict=prediction, task=input_data.task,
                          data_type=self.model.output_datatype(input_data.data_type))

    def _transform(self, input_data: InputData):
        transformed_data = transformation_function_for_data(
            input_data_type=input_data.data_type,
            required_data_types=self.model.metadata.input_types)(input_data)
        return transformed_data

    def _preprocess(self, data: InputData):
        preprocessing_func = preprocessing_func_for_data(data, self)

        if not self.cache.actual_cached_state:
            # if fitted preprocessor not found in cache
            preprocessing_strategy = \
                preprocessing_func().fit(data.features)
        else:
            # if fitted preprocessor already exists
            preprocessing_strategy = self.cache.actual_cached_state.preprocessor

        data.features = preprocessing_strategy.apply(data.features)

        return data, preprocessing_strategy

    def fit(self, input_data: InputData, verbose=False) -> OutputData:
        transformed = self._transform(input_data)
        preprocessed_data, preproc_strategy = self._preprocess(transformed)

        if not self.cache.actual_cached_state:
            if verbose:
                print('Cache is not actual')

            cached_model, model_predict = self.model.fit(data=preprocessed_data)
            self.cache.append(CachedState(preprocessor=copy(preproc_strategy),
                                          model=cached_model))
        else:
            if verbose:
                print('Model were obtained from cache')

            model_predict = self.model.predict(fitted_model=self.cache.actual_cached_state.model,
                                               data=preprocessed_data)

        return self.output_from_prediction(input_data, model_predict)

    def predict(self, input_data: InputData, verbose=False) -> OutputData:
        transformed = self._transform(input_data)
        preprocessed_data, _ = self._preprocess(transformed)

        if not self.cache:
            raise ValueError('Model must be fitted before predict')

        model_predict = self.model.predict(fitted_model=self.cache.actual_cached_state.model,
                                           data=preprocessed_data)

        return self.output_from_prediction(input_data, model_predict)

    def fine_tune(self, input_data: InputData,
                  max_lead_time: timedelta = timedelta(minutes=5), iterations: int = 30):

        transformed = self._transform(input_data)
        preprocessed_data, preproc_strategy = self._preprocess(transformed)

        fitted_model, _ = self.model.fine_tune(preprocessed_data,
                                               max_lead_time=max_lead_time,
                                               iterations=iterations)

        self.cache.append(CachedState(preprocessor=copy(preproc_strategy),
                                      model=fitted_model))

    def __str__(self):
        model = f'{self.model}'
        return model

    @property
    def ordered_subnodes_hierarchy(self) -> List['Node']:
        nodes = [self]
        if self.nodes_from:
            for parent in self.nodes_from:
                nodes += parent.ordered_subnodes_hierarchy
        return nodes

    @property
    def custom_params(self) -> dict:
        return self.model.params

    @custom_params.setter
    def custom_params(self, params):
        self.model.params = params
Exemple #27
0
 def __init__(self, model_type: str, nodes_from: Optional[List['Node']] = None):
     model = Model(model_type=model_type)
     nodes_from = [] if nodes_from is None else nodes_from
     super().__init__(nodes_from=nodes_from, model=model)
Exemple #28
0
 def __init__(self, model_type: str):
     model = Model(model_type=model_type)
     super().__init__(nodes_from=None, model=model)
Exemple #29
0
 def __init__(self, model_type: ModelTypesIdsEnum):
     model = Model(model_type=model_type)
     super().__init__(nodes_from=None, model=model)
Exemple #30
0
 def __init__(self, nodes_from: Optional[List['Node']],
              model_type: ModelTypesIdsEnum):
     model = Model(model_type=model_type)
     nodes_from = [] if nodes_from is None else nodes_from
     super().__init__(nodes_from=nodes_from, model=model)