Example #1
def train(models, train_set, eval_set=None, silent=False):
    """Train all model for production and save them

    Args:
        models (list of str): Model names. Pass if you want to train just a
            particular set of models.
        train_set (dg.enums.Dataset): Dataset to train on
        eval_set (dg.enums.Dataset): Dataset to use for evaluation during
            training.
        silent (bool): Don't print details to standard out.
    """
    config = Config()
    model_dir = config.get_model_dir()
    if not silent:
        print('Model dir: ', model_dir)

    bar(silent=silent)
    for model_id in models:
        model = config.models[model_id].set_params(
            **config.get_params(model_id))
        datasets = config.get_datasets(model.id)
        # Resolve the Dataset enums per model without clobbering the function
        # arguments for the next iteration of the loop.
        model_train_set = (datasets[train_set.value]
                           if isinstance(train_set, Dataset) else train_set)
        model_eval_set = (datasets[eval_set.value]
                          if isinstance(eval_set, Dataset) else eval_set)
        train_model(model,
                    train_set=model_train_set,
                    eval_set=model_eval_set,
                    model_dir=model_dir,
                    save=True,
                    silent=silent)
        bar(silent=silent)
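
A minimal usage sketch (not from the source): it assumes the project configuration defines model ids such as 'baseline' and 'xgb', and that dg.enums.Dataset exposes TRAIN and EVAL members.

# Hypothetical usage of train(); 'baseline' and 'xgb' are placeholder model ids.
from dg.enums import Dataset

train(models=['baseline', 'xgb'],
      train_set=Dataset.TRAIN,   # assumed enum member
      eval_set=Dataset.EVAL,     # assumed enum member
      silent=False)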
Example #2
def evaluate_model(model, datasets, silent=False):
    """Evaluate a single model

    Args:
        model (dg.Model): Model to evaluate
        datasets (list of dg.enums.Dataset): List of datasets used for
            evaluation.
        silent (bool): Don't print details to standard out.
    Returns:
        dict: Evaluation metrics
    """
    config = Config()
    metrics = config.get('metrics.all', None)
    if not silent:
        print('Evaluating:', model.id)
    db = persistence.Database()
    old_metrics = db.get(model)
    new_metrics = deepcopy(old_metrics)
    model_datasets = config.get_datasets(model.id)
    for ds in datasets:
        if (new_metrics.get(ds.value, None) is None
                and model_datasets[ds.value] is not None):
            score = model.score_dataset(model_datasets[ds.value],
                                        metrics=metrics)
            new_metrics[ds.value] = (score if isinstance(score, dict) else {
                'score': score
            })
    if old_metrics != new_metrics:
        db.add(model, new_metrics)
    if not silent:
        print_metrics(new_metrics)
    return metrics_to_dict(model, new_metrics)
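
A usage sketch under assumptions: a 'baseline' model (placeholder id) has already been trained and persisted, and Dataset.TRAIN / Dataset.EVAL are assumed enum members.

# Hypothetical usage of evaluate_model() on the train and eval splits.
from dg.enums import Dataset

config = Config()
model = persistence.load(config.models['baseline'])  # placeholder model id
metrics = evaluate_model(model, [Dataset.TRAIN, Dataset.EVAL])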
Example #3
def columns():
    config = Config()
    # Look up the configured metrics once instead of on every iteration.
    metrics = config.get('metrics.all', None)
    cols = ['model']
    for ds in Dataset.for_eval():
        if metrics is None:
            cols.append(f'{ds.value}-score')
        else:
            for metric in metrics:
                cols.append(f'{ds.value}-{metric}')
    return cols
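
To illustrate the layout, a sketch of the header this returns under an assumed configuration where metrics.all defines accuracy and f1 and Dataset.for_eval() yields the train and eval splits (all assumptions).

# Hypothetical output of columns() for the assumed configuration:
# ['model', 'train-accuracy', 'train-f1', 'eval-accuracy', 'eval-f1']
cols = columns()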
Example #4
    @classmethod
    def load(cls, model_dir):
        """Load the production model

        Args:
            model_dir (str): Path to the model directory from which the model
                should be loaded.
        """
        config = Config()
        with io.open(os.path.join(model_dir, 'params.yaml')) as f:
            params = yaml.safe_load(f)
        model = cls(**params)
        model.model_dir = config.get_model_dir(tensorflow=True)
        model.estimator = model._create_estimator(model_dir)
        return model
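
A restore sketch, assuming MyTFModel is the (placeholder) class that defines this load classmethod and that the directory contains the params.yaml written at training time.

# Hypothetical usage: restore the production model and predict on a csv file.
model = MyTFModel.load('models/my_tf_model')           # placeholder class and path
predictions = model.predict_dataset('data/test.csv')   # placeholder dataset path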
Example #5
def main():
    # For some reason the current working directory is not on the Python path
    # when dg is installed with pip
    cwd = os.getcwd()
    if cwd not in sys.path:
        sys.path.append(cwd)

    parser = argparse.ArgumentParser(prog='dg')
    subparsers = parser.add_subparsers(dest='parser', help='commands')

    subparsers.add_parser(
        'help', help='Print usage information')

    loader = Loader()
    loader.load('dg.commands')
    # If we are inside a project, the configuration file will exist and we can
    # load the project's commands
    try:
        config = Config()
        loader.load(f'{config.project_name}.commands')
    except ConfigNotFound:
        pass

    # Log the modules that could not be loaded
    for module, error in loader.errors.items():
        logger.warning('Could not load "%s": %s', module, error)

    commands = Command.get_instances()

    for command in sorted(commands.values()):
        command.create_subparser(subparsers)

    ns, _ = parser.parse_known_args()
    try:
        if ns.parser == 'help':
            parser.print_help()
        elif ns.parser in commands:
            command = commands[ns.parser]
            command.run(ns)
        else:
            parser.print_help()
    except ConfigNotFound as e:
        print(e)
        return 1
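
The command-line entry point is presumably wired to main(); a conventional sketch (an assumption, not taken from the source):

# sys is already imported at module level (it is used for sys.path above).
if __name__ == '__main__':
    sys.exit(main())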
Example #6
def evaluate(models, datasets, silent=False):
    """Evaluate all models and print out the metrics for evaluation.

    Evaluation is using the production model.

    Args:
        models (list of str): Model names. Pass if you want to evaluate just a
            particular set of models.
        datasets (list of dg.enums.Dataset): List of datasets used for
            evaluation.
        silent (bool): Don't print details to standard out.

    Returns:
        pandas.DataFrame: Evaluation metrics, one row per evaluated model.
    """
    config = Config()
    all_metrics = []
    bar(silent=silent)
    for name in models:
        model = persistence.load(config.models[name])
        all_metrics.append(evaluate_model(model, datasets, silent=silent))
        bar(silent=silent)

    df = pd.DataFrame(all_metrics, columns=columns())
    return df
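
A usage sketch, again with placeholder model ids and the assumed Dataset.TRAIN / Dataset.EVAL members.

# Hypothetical usage: evaluate two models and inspect the metrics table.
from dg.enums import Dataset

df = evaluate(models=['baseline', 'xgb'],
              datasets=[Dataset.TRAIN, Dataset.EVAL])
print(df.to_string(index=False))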
Example #7
    def __init__(self):
        self.config = Config()
Example #8
class Model(BaseEstimator, TransformerMixin):
    """Base class for all estimators

    Attributes:
        config (Config): Configuration instance
    """

    id = None
    'Estimator id'

    _estimator_type = None

    def __init__(self):
        self.config = Config()

    def __str__(self):
        return self.id

    def input_fn(self, filename, mode):
        """Input function that transforms the dataset file into the format
        needed by the model.

        The default implementation reads the csv file with pandas and returns
        the X and y data frames.

        Args:
            filename (str): Path to the dataset file.
            mode (Mode): Running mode (train, eval, predict or transform).
        Returns:
            X: Dataset features
            y: Dataset targets
        """
        data = pd.read_csv(filename)
        X = data[self.config.features]
        if self.config.targets in (None, [], tuple()):
            y = None
        else:
            y = data[self.config.targets]
        return X, y

    @abc.abstractmethod
    def fit(self, X, y=None):
        """Implementation of a fitting function

        Args:
            X (array-like or sparse matrix of shape = [n_samples, n_features]):
                The training input samples.
            y (array-like, shape = [n_samples] or [n_samples, n_outputs]):
                The target values (class labels in classification, real numbers
                in regression).

        Returns:
            Estimator: Returns self.
        """

    def fit_dataset(self, train_set, eval_set=None):
        """Default implementation of the fit dataset function.

        This function receives paths to the train_set and, optionally, the
        eval_set files, reads them from disk and fits the model.

        Args:
            train_set (str): Path to the csv training dataset
            eval_set (str): Path to the csv evaluation set

        Returns:
            Estimator: Returns self.
        """
        X, y = self.input_fn(train_set, Mode.TRAIN)
        return self.fit(X, y)

    # Optional
    def predict(self, X):
        """Implementation of a predicting function.

        Args:
            X (array-like of shape = [n_samples, n_features]):
                The input samples.

        Returns:
            array of shape = [n_samples] or [n_samples, n_outputs]:
                Predicted values (class labels in classification, real numbers
                in regression).
        """

    def predict_dataset(self, dataset):
        """Default implementation of the predict dataset function.

        This function receives a path to the dataset, reads the file from disk
        and predicts values.

        Args:
            dataset (str): Path to the dataset csv
        """
        X, _ = self.input_fn(dataset, Mode.PREDICT)
        return self.predict(X)

    # Optional
    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Args:
            X (array-like, shape = [n_samples, n_features]):
                Input samples.

        Returns:
            C (array-like, shape = [n_samples, n_classes]):
                Returns the probability of the samples for each class in the
                model. The columns correspond to the classes in sorted order,
                as they appear in the training dataset.
        """

    def predict_proba_dataset(self, dataset):
        """Default implementation of the predict proba dataset function.

        This function receives a path to the dataset, reads the file from disk
        and predicts probabilities.

        Args:
            dataset (str): Path to the dataset csv
        """
        X, _ = self.input_fn(dataset, Mode.PREDICT)
        return self.predict_proba(X)

    # Optional
    def transform(self, X):
        """Implementation of a transform function.

        Args:
            X (array-like of shape = [n_samples, n_features]):
                The input samples.

        Returns:
            array of int of shape = [n_samples, n_features]: The transformed
                array.
        """

    def transform_dataset(self, dataset):
        """Default implementation of the tranform dataset function.

        This function receives a path to the dataset and target labels, reads
        the files from disk and transforms it.

        Args:
            dataset (str): Path to the dataset csv
        """
        X, _ = self.input_fn(dataset, Mode.TRANSFORM)
        return self.transform(X)

    def score(self, X, y, sample_weight=None, metrics=None):
        """Scores the model.

        If a scoring function is defined in the configuration file, this
        function will use it; otherwise:

        For classification:

        Returns the mean accuracy on the given test data
        and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.


        For regression:

        Returns the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as (1 - u/v), where u is the residual
        sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
        sum of squares ((y_true - y_true.mean()) ** 2).sum().
        The best possible score is 1.0 and it can be negative (because the
        model can be arbitrarily worse). A constant model that always
        predicts the expected value of y, disregarding the input features,
        would get a R^2 score of 0.0.

        Args:
            X (array-like, shape = (n_samples, n_features)): Test samples.
            y (array-like, shape = (n_samples) or (n_samples, n_outputs)):
                True labels for X.
            sample_weight (array-like, shape = [n_samples], optional):
                Sample weights.
            metrics: Optional dictionary of metrics functions to use instead of
                the selected ones for regression and classification.

        Returns:
            float: For classification: mean accuracy of self.predict(X) wrt. y.
                   For regression: R^2 of self.predict(X) wrt. y.
        """
        # If metrics dictionary is passed in use it to calculate the metrics
        if metrics is not None:
            return {
                key: get_object(value)(y, self.predict(X))
                for key, value in metrics.items()
            }

        # If metrics are None try to get the scoring function from the
        # configuration file
        proba = self.config.get('metrics.proba', False)
        score = self.config.get('metrics.score', None)
        predict_func = self.predict_proba if proba else self.predict

        if score is not None:
            return get_object(score)(y,
                                     predict_func(X),
                                     sample_weight=sample_weight)

        # Finally try the default estimators for classification and regression
        estimator_type = getattr(self, '_estimator_type', None)
        if estimator_type == 'classifier':
            from sklearn.metrics import accuracy_score
            return accuracy_score(y,
                                  predict_func(X),
                                  sample_weight=sample_weight)
        elif estimator_type == 'regressor':
            from sklearn.metrics import r2_score
            return r2_score(y,
                            predict_func(X),
                            sample_weight=sample_weight,
                            multioutput='variance_weighted')
        else:
            # Don't know how to score the model, just return 0
            return 0

    def score_dataset(self, dataset, sample_weight=None, metrics=None):
        """Default implementation of the score dataset function.

        This function receives a path to the dataset, reads the file from disk
        and calculates the score.

        Args:
            dataset (str): Path to the dataset
            sample_weight (array-like, shape = [n_samples], optional):
                Sample weights.
            metrics: Optional dictionary of metrics functions to use instead of
                the selected ones for regression and classification.
        """
        X, y = self.input_fn(dataset, Mode.EVAL)
        return self.score(X, y, sample_weight, metrics)
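
A minimal concrete subclass, sketched under the assumption that registering a model only requires setting id and implementing fit/predict; the class name, model id and the dotted metric path are placeholders.

# Hypothetical subclass: a logistic-regression classifier on top of the base Model.
from sklearn.linear_model import LogisticRegression


class LogRegModel(Model):
    id = 'logreg'                  # placeholder model id
    _estimator_type = 'classifier'

    def __init__(self):
        super().__init__()
        self.clf = LogisticRegression(max_iter=1000)

    def fit(self, X, y=None):
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        return self.clf.predict(X)

    def predict_proba(self, X):
        return self.clf.predict_proba(X)


# score_dataset() also accepts an explicit metrics mapping; assuming get_object
# resolves dotted paths, a call could look like:
# LogRegModel().score_dataset('data/eval.csv',
#                             metrics={'accuracy': 'sklearn.metrics.accuracy_score'})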