Ejemplo n.º 1
0
def main(queryfile: str,
         features: Optional[str] = None,
         halving: Optional[bool] = False,
         save: Optional[bool] = True):
    """Run a random parameter search for every model matched by a query file.

    Args:
        queryfile: Path of a JSON file containing the model query.
        features: Forwarded to ``create_parameters_search`` as ``features``.
        halving: Forwarded to ``random_search`` as ``halving``.
        save: Forwarded to ``random_search`` as ``save``.
    """
    search_service = GridSearchService()
    model_service = ModelService()
    with open(queryfile, 'r') as handle:
        query = json.load(handle)

    candidates = model_service.query_models(query)
    total = len(candidates)
    logging.info("[i] {} models to train".format(total))
    for position, model in enumerate(candidates, start=1):
        banner = "==[{}/{}]== MODEL: {} {} {} {} =====".format(
            position, total, model.symbol, model.dataset, model.target,
            model.pipeline)
        logging.info(banner)

        # Build the search request first, then run it synchronously.
        request = search_service.create_parameters_search(model,
                                                          split=0.7,
                                                          features=features)
        logging.info("[{}] Start random search".format(get_timestamp()))
        search_service.random_search(model,
                                     request,
                                     sync=True,
                                     verbose=1,
                                     n_jobs=8,
                                     halving=halving,
                                     save=save)
        logging.info("[{}] End random search".format(get_timestamp()))
Ejemplo n.º 2
0
    def feature_selection(self, mf: ModelFeatures, **kwargs) -> ModelFeatures:
        """Execute the feature selection request ``mf`` and return it updated.

        Features and target are loaded over ``mf.search_interval``, the
        selector matching ``mf.feature_selection_method`` is run (an empty
        method is treated like ``'importances'``) and the supported columns
        are written back to ``mf.features``.

        Args:
            mf: Feature selection request; mutated in place.
            **kwargs: ``save`` (default ``True``) controls whether the result
                is appended through ``self.model_repo``.

        Returns:
            The same ``mf`` instance with timestamps and results filled in.

        Raises:
            MessageException: The target holds fewer than two distinct values.
            NotFoundException: The selection method is not recognised.
        """
        span = mf.search_interval
        X = self.dataset_service.get_features(mf.dataset,
                                              mf.symbol,
                                              span.begin,
                                              span.end,
                                              columns=mf.features)
        y = self.dataset_service.get_target(mf.target, mf.symbol, span.begin,
                                            span.end)

        # A selector cannot be fit on a single-class target.
        classes = np.unique(y, return_counts=True)[0]
        if len(classes) < 2:
            logging.error(
                "[{}-{}-{}]Training data contains less than 2 classes: {}".
                format(mf.symbol, mf.dataset, mf.target, classes))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    classes))

        mf.start_at = get_timestamp()  # search begins
        method = mf.feature_selection_method
        if not method or method == 'importances':
            selector = select_from_model(X, y)
            mf.feature_importances = label_feature_importances(
                selector.estimator_, X.columns)
        elif method == 'importances_cv':
            selector = select_from_model_cv(X, y)
            mf.feature_importances = label_feature_importances(
                selector.estimator_.best_estimator_, X.columns)
        elif method == 'fscore':
            selector = select_percentile(X, y, percentile=10)
        elif method == 'relieff':
            selector = select_relieff(X, y, percentile=10)
        elif method == 'multisurf':
            selector = select_multisurf(X, y, percentile=10)
        else:
            raise NotFoundException(
                "Cannot find feature selection method by {}".format(
                    mf.feature_selection_method))
        mf.end_at = get_timestamp()  # search finished

        # Store the names of the columns the selector kept.
        mf.features = label_support(selector.get_support(), X.columns)

        if kwargs.get('save', True):
            model_key = {
                "dataset": mf.dataset,
                "symbol": mf.symbol,
                "target": mf.target
            }
            self.model_repo.append_features_query(model_key, mf)
        return mf
Ejemplo n.º 3
0
    def test_model(self, model: Model, mt: ModelTest, **kwargs):
        """Run the windowed test ``mt`` for ``model`` and store its reports.

        The model is created in the repository when it has no id. A test whose
        ``task_key`` is already recorded for the model is skipped.

        Args:
            model: Model under test.
            mt: Test request; mutated in place with timestamps and results.
            **kwargs: Accepted but not read by this method.

        Returns:
            ``mt``, unchanged if the test had already been executed.

        Raises:
            MessageException: The dataset starts after the first training
                window, or the target holds fewer than two distinct values.
        """
        if not model.id:
            model = self.model_repo.create(model)
        if self.model_repo.exist_test(model.id, mt.task_key):
            logging.info("Model {} test {} already executed!".format(
                model.id, mt.task_key))
            return mt

        dataset_service = DatasetService()
        dataset = dataset_service.get_dataset(model.dataset, model.symbol)

        # Extend the tested interval backwards by one window so the first
        # tested sample has training data, and forwards by one step.
        begin = sub_interval(timestamp=mt.test_interval.begin,
                             interval=mt.window)
        end = add_interval(timestamp=mt.test_interval.end, interval=mt.step)
        first_valid = from_timestamp(dataset.valid_index_min).timestamp()
        first_needed = from_timestamp(begin).timestamp()
        if first_valid > first_needed:
            raise MessageException(
                "Not enough data for training! [Pipeline: {} Dataset: {} Symbol: {} Window: {}]"
                .format(model.pipeline, model.dataset, model.symbol,
                        mt.window))
        X = dataset_service.get_features(model.dataset,
                                         model.symbol,
                                         begin=begin,
                                         end=end)
        y = dataset_service.get_target(model.target,
                                       model.symbol,
                                       begin=begin,
                                       end=end)

        classes = np.unique(y, return_counts=True)[0]
        if len(classes) < 2:
            logging.error(
                "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".
                format(model.symbol, model.dataset, model.target,
                       model.pipeline, classes))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    classes))

        pipeline_module = get_pipeline(model.pipeline)
        # Slice the interval into (window, step) ranges to test over.
        ranges = timestamp_windows(begin, end, mt.window, mt.step)

        mt.start_at = get_timestamp()
        outcome = test_windows(pipeline_module.estimator, mt.parameters, X, y,
                               ranges)
        mt.end_at = get_timestamp()

        mt.classification_results = outcome.to_dict()

        # Merge the ROC/AUC figures into the classification report.
        proba_columns = [c for c in outcome.columns if '_proba_' in c]
        report = flattened_classification_report_imbalanced(
            outcome.label, outcome.predicted)
        report.update(
            roc_auc_report(outcome.label, outcome.predicted,
                           outcome[proba_columns]))
        mt.classification_report = report

        self.model_repo.append_test(model.id, mt)

        return mt
Ejemplo n.º 4
0
    def random_search(self, model: Model, mp: ModelParameters,
                      **kwargs) -> ModelParameters:
        """Run a randomized hyperparameter search for ``model`` and fill ``mp``.

        Args:
            model: Model whose pipeline and data are searched.
            mp: Parameter search request; mutated in place with the results.
            **kwargs: ``param_distributions`` overrides the pipeline's
                ``PARAMETER_DISTRIBUTION``; ``n_iter`` (default 10),
                ``verbose`` and ``n_jobs`` are forwarded to the search;
                ``save`` (default ``True``) persists the results.

        Returns:
            ``mp``; returned without results when fitting raised
            ``SplitException`` or ``ValueError``.
        """
        pipeline_module, X, y = self._get_dataset_and_pipeline(model, mp)
        tag = "{}-{}-{}-{}-{}".format(model.symbol, model.dataset,
                                      model.target, model.pipeline,
                                      dict_hash(mp.parameters))

        search_options = dict(
            estimator=pipeline_module.estimator,
            param_distributions=kwargs.get(
                'param_distributions',
                pipeline_module.PARAMETER_DISTRIBUTION),
            n_iter=kwargs.get('n_iter', 10),
            cv=StratifiedKFold(n_splits=mp.cv_splits),
            scoring=get_precision_scorer(),
            verbose=kwargs.get("verbose", 0),
            n_jobs=kwargs.get("n_jobs", None),
            refit=False,
            random_state=0,
        )
        searcher = RandomizedSearchCV(**search_options)

        try:
            mp.start_at = get_timestamp()  # search begins
            searcher.fit(X, y)
            mp.end_at = get_timestamp()  # search finished
        except SplitException as split_error:
            logging.exception(
                "Model {} splitting yields single-class folds!\n{}".format(
                    tag, split_error.message))
            return mp  # nothing to persist after a failed fit
        except ValueError as value_error:
            logging.exception("Model {} raised ValueError!\n{}".format(
                tag, value_error))
            return mp  # nothing to persist after a failed fit

        results_df = pd.DataFrame(searcher.cv_results_)

        # Record the outcome on the request itself.
        mp.parameter_search_method = 'randomsearch'
        mp.parameters = searcher.best_params_
        mp.result_file = 'cv_results-{}.csv'.format(tag)

        if not kwargs.get('save', True):
            return mp

        # Persist best parameters and full CV table, then update the model.
        destination = 'random-search-results'
        storage_service.upload_json_obj(mp.parameters, destination,
                                        'parameters-{}.json'.format(tag))
        storage_service.save_df(results_df, destination, mp.result_file)
        self.model_repo.append_parameters(model.id, mp)

        return mp
Ejemplo n.º 5
0
def main(dataset: str, target: str, method: str, split: Optional[float] = 0.7, replace: Optional[bool] = False, save: Optional[bool] = True):
    """Run feature selection for every available symbol of a dataset.

    Args:
        dataset: Dataset name.
        target: Target name.
        method: Feature selection method name.
        split: Forwarded to ``feature_selection_new`` as ``split``.
        replace: Forwarded to ``feature_selection_new`` as ``replace``.
        save: Forwarded to ``feature_selection_new`` as ``save``.
    """
    service = FeatureSelectionService()

    symbols = service.get_available_symbols(dataset)
    total = len(symbols)
    # Options shared by every symbol; only ``symbol`` varies per iteration.
    shared_options = dict(dataset=dataset,
                          target=target,
                          split=split,
                          method=method,
                          replace=replace,
                          save=save)
    for position, sym in enumerate(symbols, start=1):
        logging.info("==[{}/{}]== Dataset: {} {} {} =====".format(position, total, sym, dataset, target))
        logging.info("[{}] Start feature search".format(get_timestamp()))
        service.feature_selection_new(symbol=sym, **shared_options)
        logging.info("[{}] End feature search".format(get_timestamp()))
Ejemplo n.º 6
0
    def update(self, id: str, update: BaseModel):
        """Apply ``update`` to the document with the given id and return it.

        The ``updated`` field is refreshed with the current timestamp.

        Args:
            id: ``_id`` of the document to update.
            update: Model whose ``dict()`` is written with ``$set``.

        Returns:
            The result of ``self.get(id)`` after the write.

        Raises:
            DocumentNotFoundException: No document matched ``id``.
        """
        document = update.dict()
        document["updated"] = get_timestamp()

        result = self.collection.update_one({"_id": id}, {"$set": document})
        # Existence is decided by matched_count: modified_count is also 0 when
        # a matching document already holds exactly the values being written.
        if not result.matched_count:
            raise DocumentNotFoundException(collection=self.__collection__, identifier=id)
        return self.get(id)
Ejemplo n.º 7
0
    def create(self, create: BaseModel):
        """Insert ``create`` as a new document and return the stored document.

        ``created`` and ``updated`` receive the same timestamp, and a fresh
        uuid is stored under both ``_id`` and ``id``.

        Args:
            create: Model whose ``dict()`` becomes the document body.

        Returns:
            The result of ``self.get`` for the inserted id.
        """
        document = create.dict()
        now = get_timestamp()
        identifier = get_uuid()
        document.update({
            "created": now,
            "updated": now,
            "_id": identifier,
            "id": identifier,
        })
        outcome = self.collection.insert_one(document)
        assert outcome.acknowledged

        return self.get(outcome.inserted_id)
Ejemplo n.º 8
0
 def clear_tests(self, query):
     """Empty the ``tests`` list of every document matching ``query``.

     The ``updated`` field of each affected document is refreshed.

     Args:
         query: Filter passed to ``update_many``.

     Returns:
         Number of documents modified.

     Raises:
         DocumentNotFoundException: No document was modified.
     """
     result = self.collection.update_many(
         query, {"$set": {
             "updated": get_timestamp(),
             "tests": []
         }})
     if not result.modified_count:
         # Report the query that matched nothing. There is no local ``id``
         # here, so the bare name would resolve to the builtin function.
         raise DocumentNotFoundException(collection=self.__collection__,
                                         identifier=query)
     return result.modified_count
Ejemplo n.º 9
0
def main(symbol: str, dataset: str, target: str, pipeline: str, feature_selection_method: Optional[str] = 'importances_shap', split: Optional[float] = 0.7, replace: Optional[bool] = True, save: Optional[bool] = True):
    """Run a grid search for one pipeline/dataset/symbol/target combination.

    Args:
        symbol: Symbol to search for.
        dataset: Dataset name.
        target: Target name.
        pipeline: Pipeline name; names containing ``'mlp'`` or ``'xgboost'``
            get a reduced number of parallel jobs.
        feature_selection_method: Forwarded to ``grid_search_new``.
        split: Forwarded to ``grid_search_new`` as ``split``.
        replace: Forwarded to ``grid_search_new`` as ``replace``.
        save: Forwarded to ``grid_search_new`` as ``save``.
    """
    service = GridSearchService()
    # Use half of the cores, but never fewer than one worker: on a
    # single-core host the plain division would yield an invalid 0.
    n_jobs = max(1, cpu_count() // 2)
    multithread_pipeline = ['mlp', 'xgboost']
    if any(ext in pipeline for ext in multithread_pipeline):
        # These pipelines get fewer parallel jobs.
        n_jobs = n_jobs // 2 + 1
    logging.info("[{}] {}({}.{}) -> {} Start grid search (JOBS: {})".format(get_timestamp(), pipeline, dataset, symbol, target, n_jobs))
    service.grid_search_new(
        pipeline=pipeline,
        dataset=dataset,
        target=target,
        symbol=symbol,
        split=split,
        feature_selection_method=feature_selection_method,
        verbose=1,
        n_jobs=n_jobs,
        replace=replace,
        save=save
    )
    logging.info("[{}] End grid search\n".format(get_timestamp()))
Ejemplo n.º 10
0
def main(dataset: str, target: str):
    """Clear stored feature results, then rerun selection for each symbol.

    Feature search results are cleared for all models matching the
    dataset/target pair; an ``'importances'`` search with a 0.7 split is then
    created and executed for every symbol of the dataset.

    Args:
        dataset: Dataset name.
        target: Target name.
    """
    service = FeatureSelectionService()
    models = ModelService()
    datasets = DatasetService()

    # Drop previous feature search results from the matching models.
    models.clear_features({"dataset": dataset, "target": target})

    symbols = datasets.get_dataset_symbols(dataset)
    total = len(symbols)
    for position, sym in enumerate(symbols, start=1):
        logging.info("==[{}/{}]== Dataset: {} {} {} =====".format(
            position, total, sym, dataset, target))
        request = service.create_features_search(target=target,
                                                 dataset=dataset,
                                                 symbol=sym,
                                                 split=0.7,
                                                 method='importances')
        logging.info("[{}] Start feature search".format(get_timestamp()))
        service.feature_selection(request, sync=True)
        logging.info("[{}] End feature search".format(get_timestamp()))
Ejemplo n.º 11
0
    def grid_search_new(self, symbol: str, dataset: str, target: str,
                        pipeline: str, split: float,
                        feature_selection_method: str, **kwargs):
        """Grid-search hyperparameters for one pipeline/dataset/target/symbol.

        Cross validation runs on the training part of the dataset split, using
        the features chosen by ``feature_selection_method``.

        Args:
            symbol: Symbol to search for.
            dataset: Dataset name.
            target: Target name.
            pipeline: Pipeline name, resolved with ``get_pipeline``.
            split: Passed to ``get_train_test_split_indices``.
            feature_selection_method: Feature selection method whose features
                are loaded.
            **kwargs: ``replace`` removes existing grid search parameters;
                ``save`` (default ``True`` for storage upload) persists result
                files and, when truthy, makes an existing search an error;
                ``permissive`` skips the minimum-sample check;
                ``parameter_grid`` overrides the pipeline's ``PARAMETER_GRID``;
                ``verbose`` and ``n_jobs`` are forwarded to the search;
                ``task_key`` overrides the generated key.

        Returns:
            The ``ModelParameters`` describing the search and its best
            parameters.

        Raises:
            MessageException: A grid search already exists and ``save`` is set
                without ``replace``, or the target has fewer than two classes.
            ValueError: Too few samples for 3-fold cross validation and
                ``permissive`` is not set.
        """
        # Check if a model exists and already holds grid search parameters
        existing_model = self.model_service.get_model(pipeline=pipeline,
                                                      dataset=dataset,
                                                      target=target,
                                                      symbol=symbol)
        if existing_model:
            mp_exists = ModelService.get_model_parameters(existing_model,
                                                          method='gridsearch')
            if mp_exists:
                if kwargs.get('replace'):
                    self.model_service.remove_parameters(model=existing_model,
                                                         method='gridsearch')
                elif kwargs.get('save'):
                    raise MessageException(
                        f"Grid search already performed for {pipeline}({dataset}.{symbol}) -> {target}"
                    )

        # Retrieve dataset to use
        ds = self.dataset_service.get_dataset(dataset, symbol)

        # Determine cv_splits=K for K-fold cross validation based on dataset's sample count
        # Train-test split for each fold is 80% train, the lowest training window for accurate results is 30 samples
        # so we need N samples where N is given by the proportion:
        #       30/0.8 = N/1; N = 30/0.8 = 37.5 ~ 40 samples per fold
        min_samples_per_fold = 40
        cv_splits = 5
        # If samples per fold with 5-fold CV are too low, use 3-folds
        if ds.count / cv_splits < min_samples_per_fold:
            cv_splits = 3
        # If samples are still too low, raise a value error
        if ds.count / cv_splits < min_samples_per_fold and not kwargs.get(
                "permissive"):
            raise ValueError("Not enough samples to perform cross validation!")

        # Determine split indices based on dataset
        splits = DatasetService.get_train_test_split_indices(ds, split)
        cv_interval = splits['train']

        # Load dataset features by applying a specified feature selection method
        X = self.dataset_service.get_dataset_features(
            ds=ds,
            begin=cv_interval['begin'],
            end=cv_interval['end'],
            method=feature_selection_method,
            target=target)
        y = self.dataset_service.get_target(
            name=target,
            symbol=symbol,
            begin=cv_interval['begin'],
            end=cv_interval['end'],
        )

        # A search needs at least two classes in the training target.
        unique = np.unique(y)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".
                format(symbol, dataset, target, pipeline, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))
        logging.info("Dataset loaded: X {} y {} (unique: {})".format(
            X.shape, y.shape, unique))

        # Load pipeline algorithm and parameter grid
        pipeline_module = get_pipeline(pipeline)

        # Perform search
        gscv = GridSearchCV(
            estimator=pipeline_module.estimator,
            param_grid=kwargs.get('parameter_grid',
                                  pipeline_module.PARAMETER_GRID),
            cv=StratifiedKFold(n_splits=cv_splits),
            scoring=get_precision_scorer(),
            verbose=kwargs.get("verbose", 0),
            n_jobs=kwargs.get("n_jobs", None),
            refit=False)

        mp = ModelParameters(cv_interval=splits['train'],
                             cv_splits=cv_splits,
                             task_key=kwargs.get('task_key', str(uuid4())),
                             features=[c for c in X.columns],
                             parameter_search_method='gridsearch')

        mp.start_at = get_timestamp()
        gscv.fit(X, y)
        mp.end_at = get_timestamp()

        # Collect results; the 'params' column is left out of the stored records
        results_df = pd.DataFrame(gscv.cv_results_)

        mp.parameters = gscv.best_params_
        mp.cv_results = results_df.loc[:,
                                       results_df.columns != 'params'].to_dict(
                                           'records')

        tag = "{}-{}-{}-{}-{}".format(symbol, dataset, target, pipeline,
                                      dict_hash(mp.parameters))
        mp.result_file = 'cv_results-{}.csv'.format(tag)

        if existing_model:
            # Attach the new parameters to the model found above; inserting a
            # fresh Model here would duplicate it on every repeated search.
            self.model_repo.append_parameters(existing_model.id, mp)
        else:
            model = Model(pipeline=pipeline,
                          dataset=dataset,
                          target=target,
                          symbol=symbol,
                          features=feature_selection_method)
            model.parameters.append(mp)
            self.model_repo.create(model)

        # Save grid search results on storage
        if kwargs.get('save', True):
            storage_service.upload_json_obj(mp.parameters,
                                            'grid-search-results',
                                            'parameters-{}.json'.format(tag))
            storage_service.save_df(results_df, 'grid-search-results',
                                    mp.result_file)
        return mp
Ejemplo n.º 12
0
def main(queryfile: str,
         features: Optional[str] = None,
         parameters: Optional[str] = None,
         save: Optional[bool] = True):
    """Run the T2/T3/T4 windowed tests for every model matched by a query.

    Each model is tested with 90, 180 and 240 day windows (one day step, 0.7
    split). Models whose tests raise are collected and dumped to
    ``test-failed.json``.

    Args:
        queryfile: Path of a JSON file containing the model query.
        features: Forwarded to ``create_model_test`` as ``features``.
        parameters: Forwarded to ``create_model_test`` as ``parameters``.
        save: When truthy, existing tests of the matched models are cleared
            first.
    """
    models = ModelService()
    with open(queryfile, 'r') as source:
        query = json.load(source)
    if save:
        models.clear_tests(query)
    test_models = models.query_models(query)
    total = len(test_models)
    logging.info("[i] {} models to test".format(total))
    failed = []
    for position, m in enumerate(test_models, start=1):
        logging.info("==[{}/{}]== MODEL: {} {} {} {} =====".format(
            position, total, m.symbol, m.dataset, m.target, m.pipeline))

        # All three test requests are built before any of them is executed.
        t2, t3, t4 = [
            models.create_model_test(model=m,
                                     split=0.7,
                                     step={'days': 1},
                                     window={'days': days},
                                     parameters=parameters,
                                     features=features)
            for days in (90, 180, 240)
        ]
        schedule = (
            ("[{}] {} Start T2", t2),
            ("[{}] {} Start T3", t3),
            ("[{}] {} Start T4", t4),
        )
        try:
            # The first failure aborts the remaining tests of this model.
            for banner, test in schedule:
                logging.info(banner.format(get_timestamp(), m.symbol))
                models.test_model(m, test, sync=True)
        except MessageException as known:
            logging.error("[!] " + known.message)
            failed.append((m.dict(), t2.dict(), t3.dict(), t4.dict()))
        except Exception as unexpected:
            logging.exception("[!] " + str(unexpected))
            failed.append((m.dict(), t2.dict(), t3.dict(), t4.dict()))

        logging.info("[{}] Done".format(m.symbol))
    with open('test-failed.json', 'w') as sink:
        json.dump(failed, sink)
Ejemplo n.º 13
0
 def touch(self, id):
     """Refresh the ``updated`` timestamp of the document with the given id.

     Args:
         id: ``_id`` of the document to touch.

     Raises:
         DocumentNotFoundException: No document matched ``id``.
     """
     result = self.collection.update_one({"_id": id}, {"$set": {"updated": get_timestamp()}})
     # Existence is decided by matched_count: modified_count is also 0 when
     # the stored timestamp already equals the one being written.
     if not result.matched_count:
         raise DocumentNotFoundException(collection=self.__collection__, identifier=id)
Ejemplo n.º 14
0
    def grid_search(self, model: Model, mp: ModelParameters,
                    **kwargs) -> ModelParameters:
        """Run an (optionally halving) grid search for ``model`` and fill ``mp``.

        Args:
            model: Model whose pipeline and data are searched.
            mp: Parameter search request; mutated in place with the results.
            **kwargs: ``halving`` selects ``HalvingGridSearchCV`` instead of
                ``GridSearchCV``; ``parameter_grid`` overrides the pipeline's
                ``PARAMETER_GRID``; ``verbose`` and ``n_jobs`` are forwarded
                to the search; ``save`` (default ``True``) persists the
                results.

        Returns:
            ``mp``; returned without results when fitting raised
            ``SplitException`` or ``ValueError``.
        """
        pipeline_module, X, y = self._get_dataset_and_pipeline(model, mp)
        tag = "{}-{}-{}-{}-{}" \
            .format(model.symbol, model.dataset, model.target, model.pipeline, dict_hash(mp.parameters))

        # Perform search
        if not kwargs.get('halving'):
            gscv = GridSearchCV(
                estimator=pipeline_module.estimator,
                param_grid=kwargs.get('parameter_grid',
                                      pipeline_module.PARAMETER_GRID),
                cv=StratifiedKFold(n_splits=mp.cv_splits),
                scoring=get_precision_scorer(),
                verbose=kwargs.get("verbose", 0),
                n_jobs=kwargs.get("n_jobs", None),
                refit=False)
        else:
            gscv = HalvingGridSearchCV(
                estimator=pipeline_module.estimator,
                param_grid=kwargs.get('parameter_grid',
                                      pipeline_module.PARAMETER_GRID),
                factor=2,
                cv=BlockingTimeSeriesSplit(n_splits=mp.cv_splits),
                scoring=get_precision_scorer(),
                verbose=kwargs.get("verbose", 0),
                # n_jobs must be an integer: halve the core count with floor
                # division and never go below one worker.
                n_jobs=kwargs.get("n_jobs", max(1, cpu_count() // 2)),
                refit=False,
                random_state=0)

        try:
            mp.start_at = get_timestamp()  # Log starting timestamp
            gscv.fit(X, y)
            mp.end_at = get_timestamp()  # Log ending timestamp
        except SplitException as e:
            logging.exception(
                "Model {} splitting yields single-class folds!\n{}".format(
                    tag, e.message))
            return mp  # Fit failed, don't save this.
        except ValueError as e:
            logging.exception("Model {} raised ValueError!\n{}".format(tag, e))
            return mp  # Fit failed, don't save this.

        # Collect results
        results_df = pd.DataFrame(gscv.cv_results_)

        # Update search request with results
        mp.parameter_search_method = 'halving_grid_search' if kwargs.get(
            'halving') else 'gridsearch'
        mp.parameters = gscv.best_params_
        mp.cv_results = results_df.to_dict()
        mp.result_file = 'cv_results-{}.csv'.format(tag)

        # Save grid search results on storage
        if kwargs.get('save', True):
            storage_service.upload_json_obj(mp.parameters,
                                            'grid-search-results',
                                            'parameters-{}.json'.format(tag))
            storage_service.save_df(results_df, 'grid-search-results',
                                    mp.result_file)
            # Update model with the new results
            self.model_repo.append_parameters(model.id, mp)

        return mp
Ejemplo n.º 15
0
    def feature_selection_new(self, *, symbol: str, dataset: str, target: str,
                              split: float, method: str,
                              **kwargs) -> ModelFeatures:
        """Run feature selection on the training split of a dataset.

        Args:
            symbol: Symbol of the dataset to load.
            dataset: Dataset name.
            target: Target name.
            split: Passed to ``get_train_test_split_indices``.
            method: Selection method. Names containing ``'importances'`` use a
                model-based selector (``'_cv'`` and ``'_shap'`` in the name
                enable the CV selector and SHAP values); ``'fscore'``,
                ``'relieff'`` and ``'multisurf'`` are also handled. An empty
                method is treated like ``'importances'``.
            **kwargs: ``replace`` removes an existing selection first;
                ``save`` persists the result and, when truthy, makes an
                existing selection an error; ``task_key`` overrides the
                generated key.

        Returns:
            The ``FeatureSelection`` when ``save`` is falsy, otherwise the
            result of ``append_feature_selection``.

        Raises:
            MessageException: A selection already exists and ``save`` is set
                without ``replace``, or the target has fewer than two classes.
            NotFoundException: The method is not recognised.
        """
        ds = self.dataset_service.get_dataset(dataset, symbol)
        fs_exists = DatasetService.has_feature_selection(ds=ds,
                                                         method=method,
                                                         target=target)
        if fs_exists:
            if kwargs.get('replace'):
                self.dataset_service.remove_feature_selection(ds=ds,
                                                              method=method,
                                                              target=target)
            elif kwargs.get('save'):
                raise MessageException(
                    f"Feature selection with method '{method}' already performed for '{dataset}.{symbol}' and target '{target}'"
                )

        splits = DatasetService.get_train_test_split_indices(ds, split)
        fs = FeatureSelection(target=target,
                              method=method,
                              search_interval=splits['train'],
                              task_key=kwargs.get('task_key', str(uuid4())))

        # Load dataset
        X = self.dataset_service.get_dataset_features(
            ds=ds, begin=fs.search_interval.begin, end=fs.search_interval.end)
        y = self.dataset_service.get_dataset_target(
            name=fs.target,
            ds=ds,
            begin=fs.search_interval.begin,
            end=fs.search_interval.end)

        unique = np.unique(y)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}]Training data contains less than 2 classes: {}".
                format(symbol, dataset, target, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))

        # Perform search
        fs.start_at = get_timestamp()  # Log starting timestamp
        # Normalise a missing method to '' once, so the substring checks
        # below cannot raise TypeError on None.
        method_name = fs.method or ''
        if not method_name or 'importances' in method_name:
            if '_cv' in method_name:
                selector = select_from_model_cv(X, y)
            else:
                selector = select_from_model(X, y)
            # NOTE(review): for the '_cv' selector, estimator_ may be a search
            # object rather than the fitted model - confirm this is intended.
            fs.feature_importances = label_feature_importances(
                selector.estimator_, X.columns)
            if '_shap' in method_name:
                fs.shap_values = get_shap_values(
                    model=selector.estimator_.named_steps.c, X=X, X_train=X)
                # NOTE(review): the parsed result is not used; the call is
                # kept in case parsing validates the values - confirm.
                parse_shap_values(fs.shap_values)
        elif method_name == 'fscore':
            selector = select_percentile(X, y, percentile=10)
        elif method_name == 'relieff':
            selector = select_relieff(X, y, percentile=10)
        elif method_name == 'multisurf':
            selector = select_multisurf(X, y, percentile=10)
        else:
            raise NotFoundException(
                "Cannot find feature selection method by {}".format(fs.method))
        fs.end_at = get_timestamp()  # Log ending timestamp

        # Update search request with results
        fs.features = label_support(selector.get_support(), X.columns)

        if not kwargs.get('save'):
            return fs
        return self.dataset_service.append_feature_selection(ds, fs)
Ejemplo n.º 16
0
 def store_result(self, task_id, result):
     """Attach ``result`` to a task, mark it completed and persist it.

     Args:
         task_id: Identifier handed to ``get_task`` and ``repo.update``.
         result: Value stored on the task's ``result`` attribute.

     Returns:
         The updated task object.
     """
     finished = self.get_task(task_id)
     finished.result = result
     finished.completed_at = get_timestamp()  # completion time
     self.repo.update(task_id, finished)
     return finished
Ejemplo n.º 17
0
def fit_estimator_new(model: Model, mp: ModelParameters, features: str,
                      day: str, window: dict, X, y, b, e, **kwargs):
    """Fit an estimator on one training window, or reuse a stored fitted one.

    ``X`` and ``y`` are sliced to ``[b:e]`` and the last row of the slice is
    left out of training.

    Args:
        model: Model providing pipeline, dataset, target and symbol.
        mp: Parameters applied to the estimator with ``set_params``.
        features: Feature set label, stored on the estimator.
        day: Day label, used for the cache lookup and stored on the estimator.
        window: Window definition, used for the cache lookup and stored.
        X: Feature data, sliced with ``X[b:e]``.
        y: Target data, sliced with ``y[b:e]``.
        b: Slice start.
        e: Slice end.
        **kwargs: Accepted but not read by this function.

    Returns:
        The fitted estimator with metadata attributes attached, a previously
        stored fitted estimator, or ``None`` when fitting raised.
    """
    # Check if estimator exists
    if exist_estimator(model=model,
                       parameters=mp.parameter_search_method,
                       features=features,
                       day=day,
                       window=window):
        logging.info(
            f"Estimator exists {model.pipeline}({model.dataset}.{model.symbol}) -> {model.target}"
            f" Day: {day} Window: {window}")
        existing = load_estimator(model=model,
                                  parameters=mp.parameter_search_method,
                                  features=features,
                                  day=day,
                                  window=window)
        if existing and existing.is_fit:
            return existing
    X = X[b:e]
    y = y[b:e]
    # The last row of the window is excluded from training.
    X_train = X[:-1]
    y_train = y[:-1]

    pipeline_module = get_pipeline(model.pipeline)
    y_unique, y_counts = np.unique(y_train, return_counts=True)
    if (y_counts < 3).any():
        logging.warning(
            f"fit_estimator: y_train contains less than 3 samples for some class! \nUnique: {y_unique}\nCounts: {y_counts}"
        )

    # NOTE(review): this is the pipeline module's own estimator object, not a
    # copy - confirm that successive calls are not meant to get separate ones.
    est = pipeline_module.estimator
    est.set_params(**mp.parameters)

    try:
        start_at = datetime.utcnow().timestamp()
        est = est.fit(X_train, y_train)
        dur = datetime.utcnow().timestamp() - start_at
    except Exception as exc:
        # Named `exc` so it cannot shadow (and unbind) the `e` end index.
        logging.exception(f"Exception in estimator fit for day: {day}: {exc}")
        return None

    # Save data as attributes of the fit estimator as well
    est.fit_time = dur
    est.fit_timestamp = get_timestamp()
    est.is_fit = True
    est.train_x = X_train
    est.train_y = y_train
    est.begin = b
    est.end = e
    est.skip_save = False
    # Training parameters and Model tuple
    est.day = day
    est.pipeline = model.pipeline
    est.dataset = model.dataset
    est.target = model.target
    est.symbol = model.symbol
    est.train_begin = to_timestamp(X_train.first_valid_index().to_pydatetime())
    est.train_end = to_timestamp(X_train.last_valid_index().to_pydatetime())
    est.window = window
    est.parameters = mp.parameter_search_method
    est.features = features

    return est