def main(queryfile: str, features: Optional[str] = None, halving: Optional[bool] = False, save: Optional[bool] = True):
    service = GridSearchService()
    models = ModelService()
    with open(queryfile, 'r') as f:
        query = json.load(f)
    search_models = models.query_models(query)
    logging.info("[i] {} models to train".format(len(search_models)))
    for i, m in enumerate(search_models):
        # if m.parameters:
        #     logging.info("==[{}/{}]== MODEL: {} {} {} {} ==> SKIP".format(
        #         i + 1, len(search_models), m.symbol, m.dataset, m.target, m.pipeline))
        #     continue  # Skip this as search has already been performed
        logging.info("==[{}/{}]== MODEL: {} {} {} {} =====".format(
            i + 1, len(search_models), m.symbol, m.dataset, m.target, m.pipeline))
        mp = service.create_parameters_search(m, split=0.7, features=features)
        logging.info("[{}] Start random search".format(get_timestamp()))
        mp = service.random_search(m, mp, sync=True, verbose=1, n_jobs=8, halving=halving, save=save)
        logging.info("[{}] End random search".format(get_timestamp()))
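# For reference, a minimal sketch of what the queryfile passed to main() might contain.
# query_models() receives it as a MongoDB-style filter over model documents, so any of
# the fields logged above (symbol, dataset, target, pipeline) can be used. The field
# values below are illustrative assumptions, not taken from the repository:
#
#   {"dataset": "ohlcv", "target": "class", "pipeline": "xgboost"}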
def feature_selection(self, mf: ModelFeatures, **kwargs) -> ModelFeatures:
    # Load dataset
    X = self.dataset_service.get_features(mf.dataset, mf.symbol,
                                          mf.search_interval.begin,
                                          mf.search_interval.end,
                                          columns=mf.features)
    y = self.dataset_service.get_target(mf.target, mf.symbol,
                                        mf.search_interval.begin,
                                        mf.search_interval.end)
    unique, counts = np.unique(y, return_counts=True)
    if len(unique) < 2:
        logging.error("[{}-{}-{}] Training data contains less than 2 classes: {}".format(
            mf.symbol, mf.dataset, mf.target, unique))
        raise MessageException("Training data contains less than 2 classes: {}".format(unique))
    # Perform search
    mf.start_at = get_timestamp()  # Log starting timestamp
    if not mf.feature_selection_method or mf.feature_selection_method == 'importances':
        selector = select_from_model(X, y)
        mf.feature_importances = label_feature_importances(selector.estimator_, X.columns)
    elif mf.feature_selection_method == 'importances_cv':
        selector = select_from_model_cv(X, y)
        mf.feature_importances = label_feature_importances(selector.estimator_.best_estimator_, X.columns)
    elif mf.feature_selection_method == 'fscore':
        selector = select_percentile(X, y, percentile=10)
    elif mf.feature_selection_method == 'relieff':
        selector = select_relieff(X, y, percentile=10)
    elif mf.feature_selection_method == 'multisurf':
        selector = select_multisurf(X, y, percentile=10)
    else:
        raise NotFoundException("Cannot find feature selection method by {}".format(
            mf.feature_selection_method))
    mf.end_at = get_timestamp()  # Log ending timestamp
    # Update search request with results
    mf.features = label_support(selector.get_support(), X.columns)
    # Update model with the new results
    if kwargs.get('save', True):
        self.model_repo.append_features_query(
            {"dataset": mf.dataset, "symbol": mf.symbol, "target": mf.target}, mf)
    return mf
def test_model(self, model: Model, mt: ModelTest, **kwargs):
    if not model.id:
        model = self.model_repo.create(model)
    if self.model_repo.exist_test(model.id, mt.task_key):
        logging.info("Model {} test {} already executed!".format(model.id, mt.task_key))
        return mt
    # Load dataset
    ds = DatasetService()
    d = ds.get_dataset(model.dataset, model.symbol)
    # Get training data including the first training window
    begin = sub_interval(timestamp=mt.test_interval.begin, interval=mt.window)
    end = add_interval(timestamp=mt.test_interval.end, interval=mt.step)
    if from_timestamp(d.valid_index_min).timestamp() > from_timestamp(begin).timestamp():
        raise MessageException(
            "Not enough data for training! [Pipeline: {} Dataset: {} Symbol: {} Window: {}]".format(
                model.pipeline, model.dataset, model.symbol, mt.window))
    X = ds.get_features(model.dataset, model.symbol, begin=begin, end=end)
    y = ds.get_target(model.target, model.symbol, begin=begin, end=end)
    unique, counts = np.unique(y, return_counts=True)
    if len(unique) < 2:
        logging.error("[{}-{}-{}-{}] Training data contains less than 2 classes: {}".format(
            model.symbol, model.dataset, model.target, model.pipeline, unique))
        raise MessageException("Training data contains less than 2 classes: {}".format(unique))
    # Load pipeline
    pipeline_module = get_pipeline(model.pipeline)
    # Slice testing interval in windows
    ranges = timestamp_windows(begin, end, mt.window, mt.step)
    mt.start_at = get_timestamp()
    df = test_windows(pipeline_module.estimator, mt.parameters, X, y, ranges)
    mt.end_at = get_timestamp()
    mt.classification_results = df.to_dict()
    clf_report = flattened_classification_report_imbalanced(df.label, df.predicted)
    roc_report = roc_auc_report(df.label, df.predicted,
                                df[[c for c in df.columns if '_proba_' in c]])
    clf_report.update(roc_report)
    mt.classification_report = clf_report
    self.model_repo.append_test(model.id, mt)
    return mt
def random_search(self, model: Model, mp: ModelParameters, **kwargs) -> ModelParameters:
    pipeline_module, X, y = self._get_dataset_and_pipeline(model, mp)
    tag = "{}-{}-{}-{}-{}".format(model.symbol, model.dataset, model.target,
                                  model.pipeline, dict_hash(mp.parameters))
    rscv = RandomizedSearchCV(
        estimator=pipeline_module.estimator,
        param_distributions=kwargs.get('param_distributions', pipeline_module.PARAMETER_DISTRIBUTION),
        n_iter=kwargs.get('n_iter', 10),
        cv=StratifiedKFold(n_splits=mp.cv_splits),
        scoring=get_precision_scorer(),
        verbose=kwargs.get("verbose", 0),
        n_jobs=kwargs.get("n_jobs", None),
        refit=False,
        random_state=0)
    try:
        mp.start_at = get_timestamp()  # Log starting timestamp
        rscv.fit(X, y)
        mp.end_at = get_timestamp()  # Log ending timestamp
    except SplitException as e:
        logging.exception("Model {} splitting yields single-class folds!\n{}".format(tag, e.message))
        return mp  # Fit failed, don't save this.
    except ValueError as e:
        logging.exception("Model {} raised ValueError!\n{}".format(tag, e))
        return mp  # Fit failed, don't save this.
    # Collect results
    results_df = pd.DataFrame(rscv.cv_results_)
    # Update search request with results
    mp.parameter_search_method = 'randomsearch'
    mp.parameters = rscv.best_params_
    mp.result_file = 'cv_results-{}.csv'.format(tag)
    # Save random search results on storage
    if kwargs.get('save', True):
        storage_service.upload_json_obj(mp.parameters, 'random-search-results',
                                        'parameters-{}.json'.format(tag))
        storage_service.save_df(results_df, 'random-search-results', mp.result_file)
    # Update model with the new results
    self.model_repo.append_parameters(model.id, mp)
    return mp
def main(dataset: str, target: str, method: str, split: Optional[float] = 0.7,
         replace: Optional[bool] = False, save: Optional[bool] = True):
    service = FeatureSelectionService()
    symbols = service.get_available_symbols(dataset)
    for i, sym in enumerate(symbols):
        logging.info("==[{}/{}]== Dataset: {} {} {} =====".format(
            i + 1, len(symbols), sym, dataset, target))
        logging.info("[{}] Start feature search".format(get_timestamp()))
        mf = service.feature_selection_new(symbol=sym, dataset=dataset, target=target,
                                           split=split, method=method, replace=replace, save=save)
        logging.info("[{}] End feature search".format(get_timestamp()))
def update(self, id: str, update: BaseModel):
    document = update.dict()
    document["updated"] = get_timestamp()
    result = self.collection.update_one({"_id": id}, {"$set": document})
    if not result.modified_count:
        raise DocumentNotFoundException(collection=self.__collection__, identifier=id)
    return self.get(id)
def create(self, create: BaseModel):
    document = create.dict()
    document["created"] = document["updated"] = get_timestamp()
    document["_id"] = get_uuid()
    document["id"] = document["_id"]
    result = self.collection.insert_one(document)
    assert result.acknowledged
    return self.get(result.inserted_id)
def clear_tests(self, query):
    result = self.collection.update_many(
        query, {"$set": {"updated": get_timestamp(), "tests": []}})
    if not result.modified_count:
        # update_many has no single document id: report the query that matched nothing
        # (the original referenced the undefined name `id` here)
        raise DocumentNotFoundException(collection=self.__collection__, identifier=str(query))
    return result.modified_count
def main(symbol: str, dataset: str, target: str, pipeline: str,
         feature_selection_method: Optional[str] = 'importances_shap',
         split: Optional[float] = 0.7, replace: Optional[bool] = True,
         save: Optional[bool] = True):
    service = GridSearchService()
    n_jobs = int(cpu_count() / 2)
    # Pipelines that parallelize internally get fewer search jobs
    multithread_pipeline = ['mlp', 'xgboost']
    if any(ext in pipeline for ext in multithread_pipeline):
        n_jobs = int(n_jobs / 2 + 1)
    logging.info("[{}] {}({}.{}) -> {} Start grid search (JOBS: {})".format(
        get_timestamp(), pipeline, dataset, symbol, target, n_jobs))
    mp = service.grid_search_new(pipeline=pipeline, dataset=dataset, target=target,
                                 symbol=symbol, split=split,
                                 feature_selection_method=feature_selection_method,
                                 verbose=1, n_jobs=n_jobs, replace=replace, save=save)
    logging.info("[{}] End grid search\n".format(get_timestamp()))
def main(dataset: str, target: str):
    service = FeatureSelectionService()
    models = ModelService()
    datasets = DatasetService()
    query = {"dataset": dataset, "target": target}
    # Clear feature search results from models
    models.clear_features(query)
    # search_models = models.query_models(query)
    # logging.info("[i] {} models for feature selection".format(len(search_models)))
    # for i, m in enumerate(search_models):
    symbols = datasets.get_dataset_symbols(dataset)
    for i, sym in enumerate(symbols):
        logging.info("==[{}/{}]== Dataset: {} {} {} =====".format(
            i + 1, len(symbols), sym, dataset, target))
        mf = service.create_features_search(target=target, dataset=dataset, symbol=sym,
                                            split=0.7, method='importances')
        logging.info("[{}] Start feature search".format(get_timestamp()))
        mf = service.feature_selection(mf, sync=True)
        logging.info("[{}] End feature search".format(get_timestamp()))
def grid_search_new(self, symbol: str, dataset: str, target: str, pipeline: str,
                    split: float, feature_selection_method: str, **kwargs):
    # Check if a model exists and has the same search method
    existing_model = self.model_service.get_model(pipeline=pipeline, dataset=dataset,
                                                  target=target, symbol=symbol)
    if existing_model:
        mp_exists = ModelService.get_model_parameters(existing_model, method='gridsearch')
        if mp_exists:
            if kwargs.get('replace'):
                self.model_service.remove_parameters(model=existing_model, method='gridsearch')
            else:
                if kwargs.get('save'):
                    raise MessageException(
                        f"Grid search already performed for {pipeline}({dataset}.{symbol}) -> {target}")
    # Retrieve dataset to use
    ds = self.dataset_service.get_dataset(dataset, symbol)
    # Determine cv_splits=K for K-fold cross validation based on the dataset's sample count.
    # Each fold is split 80% train / 20% test, and the smallest training window that yields
    # accurate results is 30 samples, so each fold needs at least 30 / 0.8 = 37.5,
    # rounded up to 40, samples.
    min_samples_per_fold = 40
    cv_splits = 5
    # If samples per fold with 5-fold CV are too few, fall back to 3 folds
    if ds.count / cv_splits < min_samples_per_fold:
        cv_splits = 3
    # If samples are still too few, raise a ValueError
    if ds.count / cv_splits < min_samples_per_fold and not kwargs.get("permissive"):
        raise ValueError("Not enough samples to perform cross validation!")
    # Determine split indices based on dataset
    splits = DatasetService.get_train_test_split_indices(ds, split)
    cv_interval = splits['train']
    # Load dataset features by applying the specified feature selection method
    X = self.dataset_service.get_dataset_features(
        ds=ds,
        begin=cv_interval['begin'],
        end=cv_interval['end'],
        method=feature_selection_method,
        target=target)
    y = self.dataset_service.get_target(
        name=target,
        symbol=symbol,
        begin=cv_interval['begin'],
        end=cv_interval['end'])
    # Check the number of samples for each class in the training data: if fewer than
    # 3 instances are present per class, we get a very unstable model
    # (or no model at all for k-NN based algorithms)
    unique, counts = np.unique(y, return_counts=True)
    if len(unique) < 2:
        logging.error("[{}-{}-{}-{}] Training data contains less than 2 classes: {}".format(
            symbol, dataset, target, pipeline, unique))
        raise MessageException("Training data contains less than 2 classes: {}".format(unique))
    logging.info("Dataset loaded: X {} y {} (unique: {})".format(X.shape, y.shape, unique))
    # Load pipeline algorithm and parameter grid
    pipeline_module = get_pipeline(pipeline)
    # Perform search
    gscv = GridSearchCV(
        estimator=pipeline_module.estimator,
        param_grid=kwargs.get('parameter_grid', pipeline_module.PARAMETER_GRID),
        # cv=BlockingTimeSeriesSplit(n_splits=cv_splits),
        cv=StratifiedKFold(n_splits=cv_splits),
        scoring=get_precision_scorer(),
        verbose=kwargs.get("verbose", 0),
        n_jobs=kwargs.get("n_jobs", None),
        refit=False)
    mp = ModelParameters(cv_interval=splits['train'],
                         cv_splits=cv_splits,
                         task_key=kwargs.get('task_key', str(uuid4())),
                         features=[c for c in X.columns],
                         parameter_search_method='gridsearch')
    mp.start_at = get_timestamp()
    gscv.fit(X, y)
    mp.end_at = get_timestamp()
    # Collect results
    results_df = pd.DataFrame(gscv.cv_results_)
    mp.parameters = gscv.best_params_
    mp.cv_results = results_df.loc[:, results_df.columns != 'params'].to_dict('records')
    tag = "{}-{}-{}-{}-{}".format(symbol, dataset, target, pipeline, dict_hash(mp.parameters))
    mp.result_file = 'cv_results-{}.csv'.format(tag)
    # Create a new model holding this search's results
    model = Model(pipeline=pipeline, dataset=dataset, target=target, symbol=symbol,
                  features=feature_selection_method)
    model.parameters.append(mp)
    self.model_repo.create(model)
    # Save grid search results on storage
    if kwargs.get('save', True):
        storage_service.upload_json_obj(mp.parameters, 'grid-search-results',
                                        'parameters-{}.json'.format(tag))
        storage_service.save_df(results_df, 'grid-search-results', mp.result_file)
    return mp
def main(queryfile: str, features: Optional[str] = None, parameters: Optional[str] = None,
         save: Optional[bool] = True):
    models = ModelService()
    with open(queryfile, 'r') as f:
        query = json.load(f)
    if save:
        models.clear_tests(query)
    test_models = models.query_models(query)
    logging.info("[i] {} models to test".format(len(test_models)))
    failed = []
    for i, m in enumerate(test_models):
        logging.info("==[{}/{}]== MODEL: {} {} {} {} =====".format(
            i + 1, len(test_models), m.symbol, m.dataset, m.target, m.pipeline))
        # t1 = models.create_model_test(model=m, split=0.7, step={'days': 1},
        #                               window={'days': 60}, parameters=parameters, features=features)
        t2 = models.create_model_test(model=m, split=0.7, step={'days': 1},
                                      window={'days': 90}, parameters=parameters, features=features)
        t3 = models.create_model_test(model=m, split=0.7, step={'days': 1},
                                      window={'days': 180}, parameters=parameters, features=features)
        t4 = models.create_model_test(model=m, split=0.7, step={'days': 1},
                                      window={'days': 240}, parameters=parameters, features=features)
        try:
            # Test T1
            # logging.info("[{}] {} Start T1".format(get_timestamp(), m.symbol))
            # models.test_model(m, t1, sync=True)
            # Test T2
            logging.info("[{}] {} Start T2".format(get_timestamp(), m.symbol))
            models.test_model(m, t2, sync=True)
            # Test T3
            logging.info("[{}] {} Start T3".format(get_timestamp(), m.symbol))
            models.test_model(m, t3, sync=True)
            # Test T4
            logging.info("[{}] {} Start T4".format(get_timestamp(), m.symbol))
            models.test_model(m, t4, sync=True)
        except MessageException as e:
            logging.error("[!] " + e.message)
            failed.append((m.dict(), t2.dict(), t3.dict(), t4.dict()))
        except Exception as e:
            logging.exception("[!] " + str(e))
            failed.append((m.dict(), t2.dict(), t3.dict(), t4.dict()))
        logging.info("[{}] Done".format(m.symbol))
    with open('test-failed.json', 'w') as f:
        json.dump(failed, f)
def touch(self, id):
    result = self.collection.update_one({"_id": id},
                                        {"$set": {"updated": get_timestamp()}})
    if not result.modified_count:
        raise DocumentNotFoundException(collection=self.__collection__, identifier=id)
def grid_search(self, model: Model, mp: ModelParameters, **kwargs) -> ModelParameters:
    pipeline_module, X, y = self._get_dataset_and_pipeline(model, mp)
    tag = "{}-{}-{}-{}-{}".format(model.symbol, model.dataset, model.target,
                                  model.pipeline, dict_hash(mp.parameters))
    # Perform search
    if not kwargs.get('halving'):
        gscv = GridSearchCV(
            estimator=pipeline_module.estimator,
            param_grid=kwargs.get('parameter_grid', pipeline_module.PARAMETER_GRID),
            # cv=BlockingTimeSeriesSplit(n_splits=mp.cv_splits),
            cv=StratifiedKFold(n_splits=mp.cv_splits),
            scoring=get_precision_scorer(),
            verbose=kwargs.get("verbose", 0),
            n_jobs=kwargs.get("n_jobs", None),
            refit=False)
    else:
        gscv = HalvingGridSearchCV(
            estimator=pipeline_module.estimator,
            param_grid=kwargs.get('parameter_grid', pipeline_module.PARAMETER_GRID),
            factor=2,
            cv=BlockingTimeSeriesSplit(n_splits=mp.cv_splits),
            scoring=get_precision_scorer(),
            verbose=kwargs.get("verbose", 0),
            n_jobs=kwargs.get("n_jobs", int(cpu_count() / 2)),  # n_jobs must be an int
            refit=False,
            random_state=0)
    try:
        mp.start_at = get_timestamp()  # Log starting timestamp
        gscv.fit(X, y)
        mp.end_at = get_timestamp()  # Log ending timestamp
    except SplitException as e:
        logging.exception("Model {} splitting yields single-class folds!\n{}".format(tag, e.message))
        return mp  # Fit failed, don't save this.
    except ValueError as e:
        logging.exception("Model {} raised ValueError!\n{}".format(tag, e))
        return mp  # Fit failed, don't save this.
    # Collect results
    results_df = pd.DataFrame(gscv.cv_results_)
    # Update search request with results
    mp.parameter_search_method = 'halving_grid_search' if kwargs.get('halving') else 'gridsearch'
    mp.parameters = gscv.best_params_
    mp.cv_results = results_df.to_dict()
    mp.result_file = 'cv_results-{}.csv'.format(tag)
    # Save grid search results on storage
    if kwargs.get('save', True):
        storage_service.upload_json_obj(mp.parameters, 'grid-search-results',
                                        'parameters-{}.json'.format(tag))
        storage_service.save_df(results_df, 'grid-search-results', mp.result_file)
    # Update model with the new results
    self.model_repo.append_parameters(model.id, mp)
    return mp
def feature_selection_new(self, *, symbol: str, dataset: str, target: str,
                          split: float, method: str, **kwargs) -> ModelFeatures:
    ds = self.dataset_service.get_dataset(dataset, symbol)
    fs_exists = DatasetService.has_feature_selection(ds=ds, method=method, target=target)
    if fs_exists:
        if kwargs.get('replace'):
            self.dataset_service.remove_feature_selection(ds=ds, method=method, target=target)
        else:
            if kwargs.get('save'):
                raise MessageException(
                    f"Feature selection with method '{method}' already performed"
                    f" for '{dataset}.{symbol}' and target '{target}'")
    splits = DatasetService.get_train_test_split_indices(ds, split)
    fs = FeatureSelection(target=target,
                          method=method,
                          search_interval=splits['train'],
                          task_key=kwargs.get('task_key', str(uuid4())))
    # Load dataset
    X = self.dataset_service.get_dataset_features(ds=ds,
                                                  begin=fs.search_interval.begin,
                                                  end=fs.search_interval.end)
    y = self.dataset_service.get_dataset_target(name=fs.target, ds=ds,
                                                begin=fs.search_interval.begin,
                                                end=fs.search_interval.end)
    unique, counts = np.unique(y, return_counts=True)
    if len(unique) < 2:
        logging.error("[{}-{}-{}] Training data contains less than 2 classes: {}".format(
            symbol, dataset, target, unique))
        raise MessageException("Training data contains less than 2 classes: {}".format(unique))
    # Perform search
    fs.start_at = get_timestamp()  # Log starting timestamp
    if not fs.method or 'importances' in fs.method:
        # Guard the substring checks so a None method defaults to plain importances
        if fs.method and '_cv' in fs.method:
            selector = select_from_model_cv(X, y)
            # The CV selector wraps a search object, so read its best_estimator_
            fs.feature_importances = label_feature_importances(
                selector.estimator_.best_estimator_, X.columns)
        else:
            selector = select_from_model(X, y)
            fs.feature_importances = label_feature_importances(selector.estimator_, X.columns)
        if fs.method and '_shap' in fs.method:
            fs.shap_values = get_shap_values(model=selector.estimator_.named_steps.c, X=X, X_train=X)
            shap_values = parse_shap_values(fs.shap_values)
    elif fs.method == 'fscore':
        selector = select_percentile(X, y, percentile=10)
    elif fs.method == 'relieff':
        selector = select_relieff(X, y, percentile=10)
    elif fs.method == 'multisurf':
        selector = select_multisurf(X, y, percentile=10)
    else:
        raise NotFoundException("Cannot find feature selection method by {}".format(fs.method))
    fs.end_at = get_timestamp()  # Log ending timestamp
    # Update search request with results
    fs.features = label_support(selector.get_support(), X.columns)
    if not kwargs.get('save'):
        return fs
    return self.dataset_service.append_feature_selection(ds, fs)
def store_result(self, task_id, result):
    task = self.get_task(task_id)
    task.result = result
    task.completed_at = get_timestamp()
    self.repo.update(task_id, task)
    return task
def fit_estimator_new(model: Model, mp: ModelParameters, features: str, day: str,
                      window: dict, X, y, b, e, **kwargs):
    # Check if a fit estimator already exists for this model/day/window
    if exist_estimator(model=model, parameters=mp.parameter_search_method,
                       features=features, day=day, window=window):
        logging.info(f"Estimator exists {model.pipeline}({model.dataset}.{model.symbol}) -> {model.target}"
                     f" Day: {day} Window: {window}")
        existing = load_estimator(model=model, parameters=mp.parameter_search_method,
                                  features=features, day=day, window=window)
        if existing and existing.is_fit:
            return existing
    X = X[b:e]
    y = y[b:e]
    X_train = X[:-1]
    y_train = y[:-1]
    pipeline_module = get_pipeline(model.pipeline)
    y_unique, _, y_counts = np.unique(y_train, return_index=True, return_counts=True)
    if (y_counts < 3).any():
        logging.warning(f"fit_estimator: y_train contains less than 3 samples for some class!"
                        f"\nUnique: {y_unique}\nCounts: {y_counts}")
    est = pipeline_module.estimator
    est.set_params(**mp.parameters)
    try:
        start_at = datetime.utcnow().timestamp()
        est = est.fit(X_train, y_train)
        dur = datetime.utcnow().timestamp() - start_at
    except Exception as ex:  # don't shadow the window-end parameter `e`
        logging.exception(f"Exception in estimator fit for day: {day}: {ex}")
        return None
    # Save data as attributes of the fit estimator as well
    est.fit_time = dur
    est.fit_timestamp = get_timestamp()
    est.is_fit = True
    est.train_x = X_train
    est.train_y = y_train
    est.begin = b
    est.end = e
    est.skip_save = False
    # Training parameters and Model tuple
    est.day = day
    est.pipeline = model.pipeline
    est.dataset = model.dataset
    est.target = model.target
    est.symbol = model.symbol
    est.train_begin = to_timestamp(X_train.first_valid_index().to_pydatetime())
    est.train_end = to_timestamp(X_train.last_valid_index().to_pydatetime())
    est.window = window
    est.parameters = mp.parameter_search_method
    est.features = features
    return est
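# A minimal usage sketch for fit_estimator_new(), under stated assumptions: the window
# value mirrors the test windows used above, the day string format and the 'importances'
# features label are illustrative guesses, and b/e are assumed to be slice bounds of one
# training window over the dataset's index:
#
#   est = fit_estimator_new(model=m, mp=mp, features='importances',
#                           day='2021-01-01', window={'days': 90},
#                           X=X, y=y, b=window_begin, e=window_end)
#   if est is not None and est.is_fit:
#       save_estimator(est)  # hypothetical persistence helper, counterpart of load_estimator()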