def prepare_multi_modal_data(files_path, task: Task, images_size=(128, 128), with_split=True):
    path = os.path.join(str(fedot_project_root()), files_path)

    unpack_archived_data(path)

    data = InputData.from_json_files(path, fields_to_use=['votes', 'year'],
                                     label='rating', task=task)

    class_labels = np.asarray([0 if t <= 7 else 1 for t in data.target])
    data.target = class_labels

    ratio = 0.5

    img_files_path = f'{files_path}/*.jpeg'
    img_path = os.path.join(str(fedot_project_root()), img_files_path)

    data_img = InputData.from_image(images=img_path, labels=class_labels,
                                    task=task, target_size=images_size)

    data_text = InputData.from_json_files(path, fields_to_use=['plot'],
                                          label='rating', task=task,
                                          data_type=DataTypesEnum.text)
    data_text.target = class_labels

    if with_split:
        train_num, test_num = train_test_data_setup(data, shuffle_flag=False, split_ratio=ratio)
        train_img, test_img = train_test_data_setup(data_img, shuffle_flag=False, split_ratio=ratio)
        train_text, test_text = train_test_data_setup(data_text, shuffle_flag=False, split_ratio=ratio)
    else:
        train_num, test_num = data, data
        train_img, test_img = data_img, data_img
        train_text, test_text = data_text, data_text

    return train_num, test_num, train_img, test_img, train_text, test_text
def prepare_input_data(features, target):
    """ Function creates InputData from the given features and target """
    x_data_train, x_data_test, y_data_train, y_data_test = train_test_split(
        features, target, test_size=0.2, shuffle=True, random_state=10)
    y_data_test = np.ravel(y_data_test)

    # Define regression task
    task = Task(TaskTypesEnum.regression)

    # Prepare data to train the model
    train_input = InputData(idx=np.arange(0, len(x_data_train)),
                            features=x_data_train, target=y_data_train,
                            task=task, data_type=DataTypesEnum.table)
    predict_input = InputData(idx=np.arange(0, len(x_data_test)),
                              features=x_data_test, target=y_data_test,
                              task=task, data_type=DataTypesEnum.table)

    return train_input, predict_input, task
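# A minimal usage sketch for prepare_input_data on synthetic data. The 'ridge'
# operation name and the Pipeline/PrimaryNode imports are assumptions about the
# installed FEDOT version, not part of the original example.
import numpy as np
from fedot.core.pipelines.node import PrimaryNode
from fedot.core.pipelines.pipeline import Pipeline

features = np.random.rand(100, 5)
target = np.random.rand(100) * 10

train_input, predict_input, task = prepare_input_data(features, target)

# Fit a single-node regression pipeline and predict on the holdout part
pipeline = Pipeline(PrimaryNode('ridge'))
pipeline.fit(train_input)
forecast = pipeline.predict(predict_input).predict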
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    training_features = train_data.features
    testing_features = test_data.features
    training_target = train_data.target
    testing_target = test_data.target

    # Average CV score on the training set was: 0.93755
    exported_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()),
        RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(training_features, training_target)
    results = exported_pipeline.predict_proba(testing_features)[:, 1]

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results)
    print(f'ROC AUC for TPOT: {roc_auc_value}')

    node_scaling = PrimaryNode('scaling')
    node_bernb = SecondaryNode('bernb', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_bernb, node_scaling])
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(f'ROC AUC for FEDOT: {roc_auc_value}')

    return roc_auc_value
def get_time_series():
    """ Function returns a time series for the time series forecasting task """
    len_forecast = 100
    synthetic_ts = generate_synthetic_data(length=1000)

    train_data = synthetic_ts[:-len_forecast]
    test_data = synthetic_ts[-len_forecast:]

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_data)),
                            features=train_data, target=train_data,
                            task=task, data_type=DataTypesEnum.ts)

    start_forecast = len(train_data)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=train_data, target=None,
                              task=task, data_type=DataTypesEnum.ts)

    return train_input, predict_input, test_data
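# Hedged usage sketch for get_time_series: fit a simple two-node forecasting
# pipeline. The 'lagged' and 'ridge' operation names are assumptions about the
# operations available in the installed FEDOT version.
import numpy as np
from fedot.core.pipelines.node import PrimaryNode, SecondaryNode
from fedot.core.pipelines.pipeline import Pipeline

train_input, predict_input, test_data = get_time_series()

# The lagged transformation turns the series into a table of lagged features
node_lagged = PrimaryNode('lagged')
node_ridge = SecondaryNode('ridge', nodes_from=[node_lagged])
pipeline = Pipeline(node_ridge)

pipeline.fit(train_input)
forecast = np.ravel(pipeline.predict(predict_input).predict)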
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')

    node_tpot.model.params = {'max_run_time_sec': max_run_time.seconds}

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')

    node_rf.nodes_from = [node_tpot, node_lda]
    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def run_autokeras(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    task = params.task

    config_data = get_models_hyperparameters()['autokeras']
    max_trial = config_data['MAX_TRIAL']
    epoch = config_data['EPOCH']

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    # TODO Save model to file
    if task == TaskTypesEnum.classification:
        estimator = ak.StructuredDataClassifier
    else:
        estimator = ak.StructuredDataRegressor

    model = estimator(max_trials=max_trial)
    model.fit(train_data.features, train_data.target, epochs=epoch)

    predicted = model.predict(test_data.features)

    return test_data.target, predicted
def get_small_classification_dataset():
    """ Function returns features and target for train and test of classification models """
    features_options = {'informative': 1, 'redundant': 0,
                        'repeated': 0, 'clusters_per_class': 1}
    x_train, y_train, x_test, y_test = get_classification_dataset(features_options=features_options,
                                                                  samples_amount=70,
                                                                  features_amount=4,
                                                                  classes_amount=2)
    # Define classification task
    task = Task(TaskTypesEnum.classification)
    # Prepare data to train the model
    train_input = InputData(idx=np.arange(0, len(x_train)),
                            features=x_train, target=y_train,
                            task=task, data_type=DataTypesEnum.table)
    predict_input = InputData(idx=np.arange(0, len(x_test)),
                              features=x_test, target=None,
                              task=task, data_type=DataTypesEnum.table)
    return train_input, predict_input, y_test
def get_classification_data(classes_amount: int):
    """ Function generates a synthetic dataset for the classification task

    :param classes_amount: amount of classes to predict

    :return train_input: InputData for model fit
    :return predict_input: InputData for predict stage
    """
    # Define options for dataset with 800 objects
    features_options = {'informative': 2, 'redundant': 1,
                        'repeated': 1, 'clusters_per_class': 1}
    x_train, y_train, x_test, y_test = get_classification_dataset(features_options,
                                                                  800, 4,
                                                                  classes_amount)
    y_train = y_train.reshape((-1, 1))
    y_test = y_test.reshape((-1, 1))

    # Define classification task
    task = Task(TaskTypesEnum.classification)

    # Prepare data to train and validate the model
    train_input = InputData(idx=np.arange(0, len(x_train)),
                            features=x_train, target=y_train,
                            task=task, data_type=DataTypesEnum.table)
    predict_input = InputData(idx=np.arange(0, len(x_test)),
                              features=x_test, target=y_test,
                              task=task, data_type=DataTypesEnum.table)

    return train_input, predict_input
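# Hedged usage sketch for get_classification_data: train a logistic regression
# pipeline on the synthetic dataset. The 'logit' operation name is an assumption
# about the operations available in the installed FEDOT version.
from fedot.core.pipelines.node import PrimaryNode
from fedot.core.pipelines.pipeline import Pipeline

train_input, predict_input = get_classification_data(classes_amount=2)

pipeline = Pipeline(PrimaryNode('logit'))
pipeline.fit(train_input)
predicted_probabilities = pipeline.predict(predict_input).predict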
def test_vectorize_tfidf_strategy():
    # Four separate documents, matching the four target labels below
    train_text = ['This document first', 'second This document',
                  'And one third', 'Is document first']
    test_text = ['document allow', 'spam not found', 'is are']

    train_data = InputData(idx=np.arange(len(train_text)), features=train_text,
                           target=[0, 0, 1, 0], data_type=DataTypesEnum.text,
                           task=Task(TaskTypesEnum.classification))
    test_data = InputData(idx=np.arange(len(test_text)), features=test_text,
                          target=[0, 1, 0], data_type=DataTypesEnum.text,
                          task=Task(TaskTypesEnum.classification))

    vectorizer = SkLearnTextVectorizeStrategy(operation_type='tfidf', params=None)

    vectorizer_fitted = vectorizer.fit(train_data)

    predicted = vectorizer.predict(trained_operation=vectorizer_fitted,
                                   predict_data=test_data,
                                   is_fit_chain_stage=False)
    predicted_labels = predicted.predict

    assert isinstance(vectorizer_fitted, TfidfVectorizer)
    # The training corpus contains 8 unique tokens:
    # this, document, first, second, and, one, third, is
    assert len(predicted_labels[0]) == 8
def __chain_fit_predict(self, timeseries_train: np.array, len_gap: int):
    """ The method makes a prediction as a sequence of elements based on a
    training sample. There are two main parts: fit the model and predict.

    :param timeseries_train: part of the time series for training the model
    :param len_gap: number of elements in the gap
    :return: array without gaps
    """
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_gap))

    input_data = InputData(idx=np.arange(0, len(timeseries_train)),
                           features=timeseries_train,
                           target=timeseries_train,
                           task=task,
                           data_type=DataTypesEnum.ts)

    # Making predictions for the missing part in the time series
    self.chain.fit_from_scratch(input_data)

    # "Test data" for making prediction for a specific length
    start_forecast = len(timeseries_train)
    end_forecast = start_forecast + len_gap
    idx_test = np.arange(start_forecast, end_forecast)
    test_data = InputData(idx=idx_test,
                          features=timeseries_train,
                          target=None,
                          task=task,
                          data_type=DataTypesEnum.ts)

    predicted_values = self.chain.predict(test_data)
    predicted_values = np.ravel(np.array(predicted_values.predict))
    return predicted_values
def run_pipeline_from_automl(train_file_path: str, test_file_path: str,
                             max_run_time: timedelta = timedelta(minutes=10)):
    """ Function runs a pipeline with AutoML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for the pipeline
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    node_tpot.operation.params = {'max_run_time_sec': max_run_time.seconds}

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
    OperationTypesRepository.assign_repo('model', 'automl_repository.json')
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def run_xgboost(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    task = params.task

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    if task == TaskTypesEnum.classification:
        model = xgb.XGBClassifier(max_depth=2, learning_rate=1.0,
                                  objective='binary:logistic')
        model.fit(train_data.features, train_data.target)
        predicted = model.predict_proba(test_data.features)[:, 1]
        predicted_labels = model.predict(test_data.features)
    elif task == TaskTypesEnum.regression:
        xgbr = xgb.XGBRegressor(max_depth=3, learning_rate=0.3,
                                n_estimators=300, objective='reg:squarederror')
        xgbr.fit(train_data.features, train_data.target)
        predicted = xgbr.predict(test_data.features)
        predicted_labels = None
    else:
        raise NotImplementedError()

    return test_data.target, predicted, predicted_labels
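# Hypothetical driver for run_xgboost. 'ExecutionParams' is defined elsewhere in
# the project; a namedtuple with the same fields stands in here purely for
# illustration, and the csv paths are placeholders.
from collections import namedtuple

ExecutionParams = namedtuple('ExecutionParams',
                             ['train_file', 'test_file', 'case_label', 'task'])
params = ExecutionParams(train_file='train.csv', test_file='test.csv',
                         case_label='demo', task=TaskTypesEnum.classification)
true_target, predicted, predicted_labels = run_xgboost(params)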
def data_setup():
    task = Task(TaskTypesEnum.classification)
    predictors, response = load_breast_cancer(return_X_y=True)
    np.random.seed(1)
    # Shuffle features and target with a single permutation so that the
    # feature-target correspondence is preserved
    permutation = np.random.permutation(len(response))
    predictors = predictors[permutation][:100]
    response = response[permutation][:100]
    input_data = InputData(idx=np.arange(0, len(predictors)),
                           features=predictors,
                           target=response,
                           task=task,
                           data_type=DataTypesEnum.table)
    train_data, test_data = train_test_data_setup(data=input_data)
    train_data_x = train_data.features
    test_data_x = test_data.features
    train_data_y = train_data.target
    test_data_y = test_data.target

    train_data = InputData(features=train_data_x, target=train_data_y,
                           idx=np.arange(0, len(train_data_y)),
                           task=task, data_type=DataTypesEnum.table)
    test_data = InputData(features=test_data_x, target=test_data_y,
                          idx=np.arange(0, len(test_data_y)),
                          task=task, data_type=DataTypesEnum.table)
    return train_data, test_data
def prepare_train_test_input(train_part, len_forecast):
    """ Function returns prepared data for fit and predict

    :param len_forecast: forecast length
    :param train_part: time series which can be used as predictors for train

    :return train_input: InputData for fit
    :return predict_input: InputData for predict
    :return task: time series forecasting task with parameters
    """
    # Specify the task to solve
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_part)),
                            features=train_part,
                            target=train_part,
                            task=task,
                            data_type=DataTypesEnum.ts)

    start_forecast = len(train_part)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=train_part,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input, task
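# Hedged usage sketch for prepare_train_test_input with an autoregression node.
# The 'ar' operation name and the sine-wave series are illustrative assumptions,
# not part of the original example.
import numpy as np
from fedot.core.pipelines.node import PrimaryNode
from fedot.core.pipelines.pipeline import Pipeline

series = np.sin(np.linspace(0, 20, 300))
train_input, predict_input, task = prepare_train_test_input(series, len_forecast=30)

pipeline = Pipeline(PrimaryNode('ar'))
pipeline.fit(train_input)
forecast = np.ravel(pipeline.predict(predict_input).predict)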
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """ Function runs a chain with AutoML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for the chain
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    node_tpot.operation.params = {'max_run_time_sec': max_run_time.seconds}

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def prepare_input_data(len_forecast, train_data_features, train_data_target,
                       test_data_features):
    """ Function returns prepared data for fit and predict

    :param len_forecast: forecast length
    :param train_data_features: time series which can be used as predictors for train
    :param train_data_target: time series which can be used as target for train
    :param test_data_features: time series which can be used as predictors for prediction

    :return train_input: InputData for fit
    :return predict_input: InputData for predict
    :return task: time series forecasting task with parameters
    """
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_data_features)),
                            features=train_data_features,
                            target=train_data_target,
                            task=task,
                            data_type=DataTypesEnum.ts)

    # Determine indices for the forecast
    start_forecast = len(train_data_features)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=test_data_features,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input, task
def get_case_train_test_data():
    """ Function for getting data for train and validation """
    train_file_path, test_file_path = get_scoring_case_data_paths()

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)
    return train_data, test_data
def run_h2o(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_label}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data, round(max_runtime_secs / 60))
        temp_exported_model_path = h2o.save_model(model=best_model, path=CURRENT_PATH)

        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    imported_model = h2o.load_model(exported_model_path)

    test_frame = InputData.from_csv(test_file_path)
    true_target = test_frame.target

    predicted = predict_h2o(imported_model, test_frame)

    h2o.shutdown(prompt=False)

    return true_target, predicted
def test_with_custom_target():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/simple_classification.csv'
    file_custom = '../../data/simple_classification_with_custom_target.csv'

    file_data = InputData.from_csv(os.path.join(test_file_path, file))

    expected_features = file_data.features
    expected_target = file_data.target

    # With the default target column the custom file should not match the reference
    custom_file_data = InputData.from_csv(os.path.join(test_file_path, file_custom),
                                          delimiter=';')
    actual_features = custom_file_data.features
    actual_target = custom_file_data.target

    assert not np.array_equal(expected_features, actual_features)
    assert not np.array_equal(expected_target, actual_target)

    # After dropping the redundant column and selecting the custom target,
    # the data should match the reference file
    custom_file_data = InputData.from_csv(os.path.join(test_file_path, file_custom),
                                          delimiter=';',
                                          columns_to_drop=['redundant'],
                                          target_columns='custom_target')

    actual_features = custom_file_data.features
    actual_target = custom_file_data.target

    assert np.array_equal(expected_features, actual_features)
    assert np.array_equal(expected_target, actual_target)
def tabular_cv_generator(data: InputData, folds: int) -> Iterator[Tuple[InputData, InputData]]:
    """ The function splits data into train and test samples in the InputData format
    for KFold cross validation and returns a generator of train/test pairs.

    :param data: InputData for train and test splitting
    :param folds: number of folds

    :return Iterator[Tuple[InputData, InputData]]: generator of train/test splits
    """
    kf = KFold(n_splits=folds)

    for train_idxs, test_idxs in kf.split(data.features):
        train_features, train_target = _table_data_by_index(train_idxs, data)
        test_features, test_target = _table_data_by_index(test_idxs, data)

        idx_for_train = np.arange(0, len(train_features))
        idx_for_test = np.arange(0, len(test_features))

        train_data = InputData(idx=idx_for_train,
                               features=train_features,
                               target=train_target,
                               task=data.task,
                               data_type=data.data_type,
                               supplementary_data=data.supplementary_data)
        test_data = InputData(idx=idx_for_test,
                              features=test_features,
                              target=test_target,
                              task=data.task,
                              data_type=data.data_type,
                              supplementary_data=data.supplementary_data)

        yield train_data, test_data
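# Hedged usage sketch for tabular_cv_generator: evaluate a single-node pipeline
# across folds. Reuses get_classification_data from above; the 'logit' operation
# name and the roc_auc scorer are assumptions about the surrounding project.
import numpy as np
from fedot.core.pipelines.node import PrimaryNode
from fedot.core.pipelines.pipeline import Pipeline

data, _ = get_classification_data(classes_amount=2)

fold_metrics = []
for train_fold, test_fold in tabular_cv_generator(data, folds=5):
    pipeline = Pipeline(PrimaryNode('logit'))
    pipeline.fit(train_fold)
    predicted = pipeline.predict(test_fold)
    fold_metrics.append(roc_auc(y_true=test_fold.target, y_score=predicted.predict))
print(f'Mean ROC AUC over folds: {np.mean(fold_metrics):.3f}')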
def test_target_data_from_csv_correct():
    """ Function tests two ways of processing target columns in the "from_csv" method """
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/multi_target_sample.csv'
    path = os.path.join(test_file_path, file)
    task = Task(TaskTypesEnum.regression)

    # Process one column
    target_column = '1_day'
    one_column_data = InputData.from_csv(path, target_columns=target_column,
                                         columns_to_drop=['date'], task=task)

    # Process multiple target columns
    target_columns = ['1_day', '2_day', '3_day', '4_day', '5_day', '6_day', '7_day']
    seven_columns_data = InputData.from_csv(path, target_columns=target_columns,
                                            columns_to_drop=['date'], task=task)

    assert one_column_data.target.shape == (499, 1)
    assert seven_columns_data.target.shape == (499, 7)
def test_multi_modal_data():
    num_samples = 5
    target = np.asarray([0, 0, 1, 0, 1])

    img_data = InputData(idx=range(num_samples),
                         features=None,  # in the test the real data is not passed
                         target=target,
                         data_type=DataTypesEnum.image,
                         task=Task(TaskTypesEnum.classification))
    tbl_data = InputData(idx=range(num_samples),
                         features=None,  # in the test the real data is not passed
                         target=target,
                         data_type=DataTypesEnum.table,
                         task=Task(TaskTypesEnum.classification))

    multi_modal = MultiModalData({
        'data_source_img': img_data,
        'data_source_table': tbl_data,
    })

    assert multi_modal.task.task_type == TaskTypesEnum.classification
    assert len(multi_modal.idx) == 5
    assert multi_modal.num_classes == 2
    assert np.array_equal(multi_modal.target, target)
def run_refinement_scoring_example(train_path, test_path, with_tuning=False):
    """ Function launches an example with error modeling for the classification task

    :param train_path: path to the csv file with the training sample
    :param test_path: path to the csv file with the test sample
    :param with_tuning: whether pipelines should be tuned or not
    """
    task = Task(TaskTypesEnum.classification)
    train_dataset = InputData.from_csv(train_path, task=task)
    test_dataset = InputData.from_csv(test_path, task=task)

    # Get and fit pipelines
    no_decompose_c = get_non_refinement_pipeline()
    decompose_c = get_refinement_pipeline()

    no_decompose_c.fit(train_dataset)
    decompose_c.fit(train_dataset)

    # Check metrics for both pipelines
    display_roc_auc(no_decompose_c, test_dataset, 'Non decomposition pipeline')
    display_roc_auc(decompose_c, test_dataset, 'With decomposition pipeline')

    if with_tuning:
        no_decompose_c.fine_tune_all_nodes(loss_function=roc_auc,
                                           loss_params=None,
                                           input_data=train_dataset,
                                           iterations=30)
        decompose_c.fine_tune_all_nodes(loss_function=roc_auc,
                                        loss_params=None,
                                        input_data=train_dataset,
                                        iterations=30)

        display_roc_auc(no_decompose_c, test_dataset, 'Non decomposition pipeline after tuning')
        display_roc_auc(decompose_c, test_dataset, 'With decomposition pipeline after tuning')
def prepare_input_data(forecast_length, horizon):
    ts = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
                   18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 101])

    # Forecast for the given number of elements ahead
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length))

    # To avoid a data leak, the last 'horizon' elements are excluded from training
    ts_train = ts[:-horizon]

    train_input = InputData(idx=np.arange(0, len(ts_train)),
                            features=ts_train, target=ts_train,
                            task=task, data_type=DataTypesEnum.ts)

    start_forecast = len(ts_train)
    end_forecast = start_forecast + forecast_length
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=ts, target=None,
                              task=task, data_type=DataTypesEnum.ts)

    return train_input, predict_input
def get_scoring_data() -> Tuple[InputData, InputData]:
    train_data_path = f'{fedot_project_root()}/cases/data/scoring/scoring_train.csv'
    test_data_path = f'{fedot_project_root()}/cases/data/scoring/scoring_test.csv'

    train_data = InputData.from_csv(train_data_path)
    test_data = InputData.from_csv(test_data_path)

    return train_data, test_data
def get_scoring_data():
    file_path_train = 'cases/data/scoring/scoring_train.csv'
    full_path_train = join(str(project_root()), file_path_train)

    # a dataset for a final validation of the composed model
    file_path_test = 'cases/data/scoring/scoring_test.csv'
    full_path_test = join(str(project_root()), file_path_test)

    task = Task(TaskTypesEnum.classification)
    train = InputData.from_csv(full_path_train, task=task)
    test = InputData.from_csv(full_path_test, task=task)

    return train, test
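# Hedged sketch: composing a classifier on the scoring data via the high-level
# Fedot facade. The 'timeout' argument and the get_metrics() call are assumptions
# about the installed FEDOT version.
from fedot.api.main import Fedot

train, test = get_scoring_data()

model = Fedot(problem='classification', timeout=5)
model.fit(features=train)
model.predict(features=test)
print(model.get_metrics())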
def run_text_problem_from_files():
    data_abspath = os.path.abspath(os.path.join('data', 'spamham'))

    unpack_archived_data(data_abspath)

    train_path = os.path.join(data_abspath, 'train')
    test_path = os.path.join(data_abspath, 'test')

    train_data = InputData.from_text_files(files_path=train_path)
    test_data = InputData.from_text_files(files_path=test_path)

    metric = execute_pipeline_for_text_problem(train_data, test_data)

    print(f'origin files metric: {metric}')
def apply_model_to_data(model: Chain, data_path: str):
    df, file_path = create_multi_clf_examples_from_excel(data_path, return_df=True)
    dataset_to_apply = InputData.from_csv(file_path, target_column=None)
    evo_predicted = model.predict(dataset_to_apply)
    df['forecast'] = probs_to_labels(evo_predicted.predict)
    return df
def get_model(train_file_path: str, cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # Search for the models provided by the framework
    # that can be used as nodes in a chain for the selected task
    models_repo = ModelTypesRepository()
    available_model_types, _ = models_repo.suitable_model(task_type=task.task_type,
                                                          tags=['simple'])

    metric_function = MetricsRepository(). \
        metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    composer_requirements = GPComposerRequirements(primary=available_model_types,
                                                   secondary=available_model_types,
                                                   max_lead_time=cur_lead_time)

    # Create the genetic programming-based composer that allows finding
    # the optimal structure of the composite model
    builder = GPComposerBuilder(task).with_requirements(composer_requirements).with_metrics(metric_function)
    composer = builder.build()

    # Run the search for the best suitable model
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose, is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose)

    return chain_evo_composed
def run_multi_output_case(path, vis=False):
    """ Function launches a case of river level prediction on the Lena river
    as a multi-output regression task

    :param path: path to the file with the table
    :param vis: whether to visualise the pipeline and predictions
    """
    target_columns = ['1_day', '2_day', '3_day', '4_day', '5_day', '6_day', '7_day']

    data = InputData.from_csv(path, target_columns=target_columns,
                              columns_to_drop=['date'])
    train, test = train_test_data_setup(data)

    problem = 'regression'

    automl_model = Fedot(problem=problem)
    automl_model.fit(features=train)
    predicted_array = automl_model.predict(features=test)

    # Convert the output into a one-dimensional array
    forecast = np.ravel(predicted_array)

    mae_value = mean_absolute_error(np.ravel(test.target), forecast)
    print(f'MAE - {mae_value:.2f}')

    if vis:
        plot_predictions(predicted_array, test)