Example no. 1
def prepare_multi_modal_data(files_path, task: Task, images_size=(128, 128), with_split=True):
    path = os.path.join(str(fedot_project_root()), files_path)

    unpack_archived_data(path)

    data = InputData.from_json_files(path, fields_to_use=['votes', 'year'],
                                     label='rating', task=task)

    class_labels = np.asarray([0 if t <= 7 else 1 for t in data.target])
    data.target = class_labels

    ratio = 0.5

    img_files_path = f'{files_path}/*.jpeg'
    img_path = os.path.join(str(fedot_project_root()), img_files_path)

    data_img = InputData.from_image(images=img_path, labels=class_labels, task=task, target_size=images_size)

    data_text = InputData.from_json_files(path, fields_to_use=['plot'],
                                          label='rating', task=task,
                                          data_type=DataTypesEnum.text)
    data_text.target = class_labels

    if with_split:
        train_num, test_num = train_test_data_setup(data, shuffle_flag=False, split_ratio=ratio)
        train_img, test_img = train_test_data_setup(data_img, shuffle_flag=False, split_ratio=ratio)
        train_text, test_text = train_test_data_setup(data_text, shuffle_flag=False, split_ratio=ratio)
    else:
        train_num, test_num = data, data
        train_img, test_img = data_img, data_img
        train_text, test_text = data_text, data_text

    return train_num, test_num, train_img, test_img, train_text, test_text
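
The three per-modality splits are typically combined into one container before fitting. A minimal sketch, assuming FEDOT's MultiModalData class (see Example no. 22) and that the 'data_source_*' keys match the pipeline's data-source nodes; the dataset path is hypothetical:

task = Task(TaskTypesEnum.classification)
train_num, test_num, train_img, test_img, train_text, test_text = \
    prepare_multi_modal_data('examples/data/multimodal', task)  # hypothetical path

# Wrap the three training splits into a single multimodal container
fit_data = MultiModalData({
    'data_source_table': train_num,
    'data_source_img': train_img,
    'data_source_text': train_text,
})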
Example no. 2
def prepare_input_data(features, target):
    """ Function create InputData with features """
    x_data_train, x_data_test, y_data_train, y_data_test = train_test_split(
        features,
        target,
        test_size=0.2,
        shuffle=True,
        random_state=10)
    y_data_test = np.ravel(y_data_test)

    # Define regression task
    task = Task(TaskTypesEnum.regression)

    # Prepare data to train the model
    train_input = InputData(idx=np.arange(0, len(x_data_train)),
                            features=x_data_train,
                            target=y_data_train,
                            task=task,
                            data_type=DataTypesEnum.table)

    predict_input = InputData(idx=np.arange(0, len(x_data_test)),
                              features=x_data_test,
                              target=y_data_test,
                              task=task,
                              data_type=DataTypesEnum.table)

    return train_input, predict_input, task
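
A hedged usage sketch for the helper above, using a synthetic scikit-learn dataset and the single-node Pipeline API shown in the other examples ('ridge' is assumed to be an available FEDOT operation):

from sklearn.datasets import make_regression

# Sketch only: synthetic data in place of a real feature table
features, target = make_regression(n_samples=200, n_features=5,
                                   random_state=10)
train_input, predict_input, task = prepare_input_data(features, target)

pipeline = Pipeline(PrimaryNode('ridge'))
pipeline.fit(train_input)
predicted = pipeline.predict(predict_input).predict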
Example no. 3
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    training_features = train_data.features
    testing_features = test_data.features
    training_target = train_data.target
    testing_target = test_data.target

    # Average CV score on the training set was: 0.93755
    exported_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()), RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(training_features, training_target)
    results = exported_pipeline.predict_proba(testing_features)[:, 1]

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results)

    print(f'ROC AUC for TPOT: {roc_auc_value}')

    node_scaling = PrimaryNode('scaling')
    node_bernb = SecondaryNode('bernb', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_bernb, node_scaling])
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(f'ROC AUC for FEDOT: {roc_auc_value}')

    return roc_auc_value
Example no. 4
def get_time_series():
    """ Function returns time series for time series forecasting task """
    len_forecast = 100
    synthetic_ts = generate_synthetic_data(length=1000)

    train_data = synthetic_ts[:-len_forecast]
    test_data = synthetic_ts[-len_forecast:]

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_data)),
                            features=train_data,
                            target=train_data,
                            task=task,
                            data_type=DataTypesEnum.ts)

    start_forecast = len(train_data)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=train_data,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input, test_data
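
Note the index convention: predict_input keeps the training series as features, while its idx continues for len_forecast steps past the end of training. A sketch of consuming these inputs with a lagged-transform pipeline ('lagged' and 'ridge' are assumed to be available FEDOT operations):

train_input, predict_input, actual = get_time_series()

# Lagged transformation feeding a ridge regression
node_lagged = PrimaryNode('lagged')
node_ridge = SecondaryNode('ridge', nodes_from=[node_lagged])
pipeline = Pipeline(node_ridge)

pipeline.fit(train_input)
forecast = np.ravel(pipeline.predict(predict_input).predict)
print(f'MAE - {np.mean(np.abs(forecast - actual)):.2f}')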
Example no. 5
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')

    node_tpot.model.params = {'max_run_time_sec': max_run_time.seconds}

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')

    node_rf.nodes_from = [node_tpot, node_lda]

    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target,
                            y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Example no. 6
def run_autokeras(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    task = params.task

    config_data = get_models_hyperparameters()['autokeras']
    max_trial = config_data['MAX_TRIAL']
    epoch = config_data['EPOCH']

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    # TODO Save model to file

    if task == TaskTypesEnum.classification:
        estimator = ak.StructuredDataClassifier
    else:
        estimator = ak.StructuredDataRegressor

    model = estimator(max_trials=max_trial)

    model.fit(train_data.features, train_data.target, epochs=epoch)

    predicted = model.predict(test_data.features)

    return test_data.target, predicted
Example no. 7
def get_small_classification_dataset():
    """ Function returns features and target for train and test classification models """
    features_options = {
        'informative': 1,
        'redundant': 0,
        'repeated': 0,
        'clusters_per_class': 1
    }
    x_train, y_train, x_test, y_test = get_classification_dataset(
        features_options=features_options,
        samples_amount=70,
        features_amount=4,
        classes_amount=2)
    # Define classification task
    task = Task(TaskTypesEnum.classification)

    # Prepare data to train the model
    train_input = InputData(idx=np.arange(0, len(x_train)),
                            features=x_train,
                            target=y_train,
                            task=task,
                            data_type=DataTypesEnum.table)

    predict_input = InputData(idx=np.arange(0, len(x_test)),
                              features=x_test,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.table)

    return train_input, predict_input, y_test
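
Because predict_input carries no target, y_test is returned separately for scoring. A hedged sketch ('logit' is assumed to be an available FEDOT operation; roc_auc as in the other examples):

train_input, predict_input, y_test = get_small_classification_dataset()

pipeline = Pipeline(PrimaryNode('logit'))
pipeline.fit(train_input)
predicted = pipeline.predict(predict_input).predict
print(f'ROC AUC: {roc_auc(y_true=y_test, y_score=predicted):.3f}')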
Example no. 8
def get_classification_data(classes_amount: int):
    """ Function generate synthetic dataset for classification task

    :param classes_amount: amount of classes to predict

    :return train_input: InputData for model fit
    :return predict_input: InputData for predict stage
    """

    # Define options for dataset with 800 objects
    features_options = {'informative': 2, 'redundant': 1,
                        'repeated': 1, 'clusters_per_class': 1}
    x_train, y_train, x_test, y_test = get_classification_dataset(features_options,
                                                                  800, 4,
                                                                  classes_amount)
    y_train = y_train.reshape((-1, 1))
    y_test = y_test.reshape((-1, 1))

    # Define classification task
    task = Task(TaskTypesEnum.classification)

    # Prepare data to train and validate the model
    train_input = InputData(idx=np.arange(0, len(x_train)),
                            features=x_train, target=y_train,
                            task=task, data_type=DataTypesEnum.table)
    predict_input = InputData(idx=np.arange(0, len(x_test)),
                              features=x_test, target=y_test,
                              task=task, data_type=DataTypesEnum.table)

    return train_input, predict_input
Example no. 9
def test_vectorize_tfidf_strategy():
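    # NB: the string literals below are not comma-separated, so Python
    # concatenates them into a single training document; the final assert
    # (a vocabulary of 7 tokens) relies on this concatenation.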
    train_text = [
        'This document first'
        'second This document'
        'And one third'
        'Is document first'
    ]
    test_text = ['document allow', 'spam not found', 'is are']

    train_data = InputData(idx=len(train_text),
                           features=train_text,
                           target=[0, 0, 1, 0],
                           data_type=DataTypesEnum.text,
                           task=Task(TaskTypesEnum.classification))
    test_data = InputData(idx=len(test_text),
                          features=test_text,
                          target=[0, 1, 0],
                          data_type=DataTypesEnum.text,
                          task=Task(TaskTypesEnum.classification))

    vectorizer = SkLearnTextVectorizeStrategy(operation_type='tfidf',
                                              params=None)

    vectorizer_fitted = vectorizer.fit(train_data)

    predicted = vectorizer.predict(trained_operation=vectorizer_fitted,
                                   predict_data=test_data,
                                   is_fit_chain_stage=False)
    predicted_labels = predicted.predict

    assert isinstance(vectorizer_fitted, TfidfVectorizer)
    assert len(predicted_labels[0]) == 7
Example no. 10
    def __chain_fit_predict(self, timeseries_train: np.ndarray, len_gap: int):
        """
        The method makes a prediction as a sequence of elements based on a
        training sample. There are two main parts: model fitting and prediction.

        :param timeseries_train: part of the time series for training the model
        :param len_gap: number of elements in the gap
        :return: array without gaps
        """

        task = Task(TaskTypesEnum.ts_forecasting,
                    TsForecastingParams(forecast_length=len_gap))

        input_data = InputData(idx=np.arange(0, len(timeseries_train)),
                               features=timeseries_train,
                               target=timeseries_train,
                               task=task,
                               data_type=DataTypesEnum.ts)

        # Making predictions for the missing part in the time series
        self.chain.fit_from_scratch(input_data)

        # "Test data" for making prediction for a specific length
        start_forecast = len(timeseries_train)
        end_forecast = start_forecast + len_gap
        idx_test = np.arange(start_forecast, end_forecast)
        test_data = InputData(idx=idx_test,
                              features=timeseries_train,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

        predicted_values = self.chain.predict(test_data)
        predicted_values = np.ravel(np.array(predicted_values.predict))
        return predicted_values
Example no. 11
def run_pipeline_from_automl(train_file_path: str,
                             test_file_path: str,
                             max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run pipeline with Auto ML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for pipeline
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    node_tpot.operation.params = {'max_run_time_sec': max_run_time.seconds}

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
    OperationTypesRepository.assign_repo('model', 'automl_repository.json')
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Example no. 12
def run_xgboost(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    task = params.task

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    if task == TaskTypesEnum.classification:
        model = xgb.XGBClassifier(max_depth=2,
                                  learning_rate=1.0,
                                  objective='binary:logistic')
        model.fit(train_data.features, train_data.target)
        predicted = model.predict_proba(test_data.features)[:, 1]
        predicted_labels = model.predict(test_data.features)

    elif task == TaskTypesEnum.regression:
        xgbr = xgb.XGBRegressor(max_depth=3,
                                learning_rate=0.3,
                                n_estimators=300,
                                objective='reg:squarederror')
        xgbr.fit(train_data.features, train_data.target)
        predicted = xgbr.predict(test_data.features)
        predicted_labels = None

    else:
        raise NotImplementedError()
    return test_data.target, predicted, predicted_labels
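
The caller is left to score the returned values. A hedged sketch for the classification branch, using scikit-learn metrics and a hypothetical params object of the 'ExecutionParams' type annotated above:

from sklearn.metrics import accuracy_score, roc_auc_score

true_target, predicted, predicted_labels = run_xgboost(params)  # params: 'ExecutionParams'
print(f'ROC AUC: {roc_auc_score(true_target, predicted):.3f}')
print(f'Accuracy: {accuracy_score(true_target, predicted_labels):.3f}')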
Example no. 13
def data_setup():
    task = Task(TaskTypesEnum.classification)
    predictors, response = load_breast_cancer(return_X_y=True)
    np.random.seed(1)
    # Shuffle features and target with a single permutation so that
    # the feature-target correspondence is preserved
    shuffled_indices = np.random.permutation(len(predictors))
    predictors = predictors[shuffled_indices][:100]
    response = response[shuffled_indices][:100]

    input_data = InputData(idx=np.arange(0, len(predictors)),
                           features=predictors,
                           target=response,
                           task=task,
                           data_type=DataTypesEnum.table)
    train_data, test_data = train_test_data_setup(data=input_data)
    train_data_x = train_data.features
    test_data_x = test_data.features
    train_data_y = train_data.target
    test_data_y = test_data.target

    train_data = InputData(features=train_data_x,
                           target=train_data_y,
                           idx=np.arange(0, len(train_data_y)),
                           task=task,
                           data_type=DataTypesEnum.table)
    test_data = InputData(features=test_data_x,
                          target=test_data_y,
                          idx=np.arange(0, len(test_data_y)),
                          task=task,
                          data_type=DataTypesEnum.table)
    return train_data, test_data
Example no. 14
def prepare_train_test_input(train_part, len_forecast):
    """ Function return prepared data for fit and predict

    :param len_forecast: forecast length
    :param train_part: time series which can be used as predictors for train

    :return train_input: Input Data for fit
    :return predict_input: Input Data for predict
    :return task: Time series forecasting task with parameters
    """

    # Specify the task to solve
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_part)),
                            features=train_part,
                            target=train_part,
                            task=task,
                            data_type=DataTypesEnum.ts)

    start_forecast = len(train_part)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=train_part,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input, task
Example no. 15
def run_chain_from_automl(train_file_path: str,
                          test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run chain with Auto ML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for chain
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    node_tpot.operation.params = {'max_run_time_sec': max_run_time.seconds}

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Example no. 16
def prepare_input_data(len_forecast, train_data_features, train_data_target,
                       test_data_features):
    """ Function return prepared data for fit and predict

    :param len_forecast: forecast length
    :param train_data_features: time series which can be used as predictors for train
    :param train_data_target: time series which can be used as target for train
    :param test_data_features: time series which can be used as predictors for prediction

    :return train_input: Input Data for fit
    :return predict_input: Input Data for predict
    :return task: Time series forecasting task with parameters
    """

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_data_features)),
                            features=train_data_features,
                            target=train_data_target,
                            task=task,
                            data_type=DataTypesEnum.ts)

    # Determine indices for forecast
    start_forecast = len(train_data_features)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=test_data_features,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input, task
Example no. 17
def get_case_train_test_data():
    """ Function for getting data for train and validation """
    train_file_path, test_file_path = get_scoring_case_data_paths()

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)
    return train_data, test_data
Example no. 18
def run_h2o(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_label}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data, round(max_runtime_secs / 60))
        temp_exported_model_path = h2o.save_model(model=best_model,
                                                  path=CURRENT_PATH)

        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    imported_model = h2o.load_model(exported_model_path)

    test_frame = InputData.from_csv(test_file_path)
    true_target = test_frame.target

    predicted = predict_h2o(imported_model, test_frame)

    h2o.shutdown(prompt=False)

    return true_target, predicted
Example no. 19
def test_with_custom_target():
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/simple_classification.csv'
    file_custom = '../../data/simple_classification_with_custom_target.csv'

    file_data = InputData.from_csv(os.path.join(test_file_path, file))

    expected_features = file_data.features
    expected_target = file_data.target

    custom_file_data = InputData.from_csv(os.path.join(test_file_path,
                                                       file_custom),
                                          delimiter=';')
    actual_features = custom_file_data.features
    actual_target = custom_file_data.target

    assert not np.array_equal(expected_features, actual_features)
    assert not np.array_equal(expected_target, actual_target)

    custom_file_data = InputData.from_csv(os.path.join(test_file_path,
                                                       file_custom),
                                          delimiter=';',
                                          columns_to_drop=['redundant'],
                                          target_columns='custom_target')

    actual_features = custom_file_data.features
    actual_target = custom_file_data.target

    assert np.array_equal(expected_features, actual_features)
    assert np.array_equal(expected_target, actual_target)
Example no. 20
def tabular_cv_generator(data: InputData,
                         folds: int) -> Iterator[Tuple[InputData, InputData]]:
    """ The function for splitting data into a train and test samples
        in the InputData format for KFolds cross validation. The function
        return a generator of tuples, consisting of a pair of train, test.

    :param data: InputData for train and test splitting
    :param folds: number of folds

    :return Iterator[InputData, InputData]: return split train/test data
    """
    kf = KFold(n_splits=folds)

    for train_idxs, test_idxs in kf.split(data.features):
        train_features, train_target = _table_data_by_index(train_idxs, data)
        test_features, test_target = _table_data_by_index(test_idxs, data)

        idx_for_train = np.arange(0, len(train_features))
        idx_for_test = np.arange(0, len(test_features))

        train_data = InputData(idx=idx_for_train,
                               features=train_features,
                               target=train_target,
                               task=data.task,
                               data_type=data.data_type,
                               supplementary_data=data.supplementary_data)
        test_data = InputData(idx=idx_for_test,
                              features=test_features,
                              target=test_target,
                              task=data.task,
                              data_type=data.data_type,
                              supplementary_data=data.supplementary_data)

        yield train_data, test_data
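
A sketch of consuming the generator in a cross-validation loop (pipeline construction as in the other examples; 'logit' is assumed to be an available FEDOT operation):

fold_metrics = []
for train_data, test_data in tabular_cv_generator(data, folds=5):
    pipeline = Pipeline(PrimaryNode('logit'))
    pipeline.fit(train_data)
    predicted = pipeline.predict(test_data).predict
    fold_metrics.append(roc_auc(y_true=test_data.target, y_score=predicted))
print(f'Mean ROC AUC over {len(fold_metrics)} folds: {np.mean(fold_metrics):.3f}')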
Example no. 21
def test_target_data_from_csv_correct():
    """ Function tests two ways of processing target columns in "from_csv"
    method
    """
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/multi_target_sample.csv'
    path = os.path.join(test_file_path, file)
    task = Task(TaskTypesEnum.regression)

    # Process one column
    target_column = '1_day'
    one_column_data = InputData.from_csv(path,
                                         target_columns=target_column,
                                         columns_to_drop=['date'],
                                         task=task)

    # Process multiple target columns
    target_columns = [
        '1_day', '2_day', '3_day', '4_day', '5_day', '6_day', '7_day'
    ]
    seven_columns_data = InputData.from_csv(path,
                                            target_columns=target_columns,
                                            columns_to_drop=['date'],
                                            task=task)

    assert one_column_data.target.shape == (499, 1)
    assert seven_columns_data.target.shape == (499, 7)
Example no. 22
def test_multi_modal_data():
    num_samples = 5
    target = np.asarray([0, 0, 1, 0, 1])
    img_data = InputData(
        idx=range(num_samples),
        features=None,  # in test the real data is not passed
        target=target,
        data_type=DataTypesEnum.image,
        task=Task(TaskTypesEnum.classification))
    tbl_data = InputData(
        idx=range(num_samples),
        features=None,  # in test the real data is not passed
        target=target,
        data_type=DataTypesEnum.table,
        task=Task(TaskTypesEnum.classification))

    multi_modal = MultiModalData({
        'data_source_img': img_data,
        'data_source_table': tbl_data,
    })

    assert multi_modal.task.task_type == TaskTypesEnum.classification
    assert len(multi_modal.idx) == 5
    assert multi_modal.num_classes == 2
    assert np.array_equal(multi_modal.target, target)
Example no. 23
def run_refinement_scoring_example(train_path, test_path, with_tuning=False):
    """ Function launch example with error modeling for classification task

    :param train_path: path to the csv file with training sample
    :param test_path: path to the csv file with test sample
    :param with_tuning: is it need to tune pipelines or not
    """

    task = Task(TaskTypesEnum.classification)
    train_dataset = InputData.from_csv(train_path, task=task)
    test_dataset = InputData.from_csv(test_path, task=task)

    # Get and fit pipelines
    no_decompose_c = get_non_refinement_pipeline()
    decompose_c = get_refinement_pipeline()

    no_decompose_c.fit(train_dataset)
    decompose_c.fit(train_dataset)

    # Check metrics for both pipelines
    display_roc_auc(no_decompose_c, test_dataset, 'Non decomposition pipeline')
    display_roc_auc(decompose_c, test_dataset, 'With decomposition pipeline')

    if with_tuning:
        no_decompose_c.fine_tune_all_nodes(loss_function=roc_auc, loss_params=None,
                                           input_data=train_dataset, iterations=30)

        decompose_c.fine_tune_all_nodes(loss_function=roc_auc, loss_params=None,
                                        input_data=train_dataset, iterations=30)

        display_roc_auc(no_decompose_c, test_dataset, 'Non decomposition pipeline after tuning')
        display_roc_auc(decompose_c, test_dataset, 'With decomposition pipeline after tuning')
Example no. 24
def prepare_input_data(forecast_length, horizon):
    ts = np.array([
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 101
    ])

    # Forecast for the specified number of elements ahead
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length))

    # To avoid data leakage
    ts_train = ts[:-horizon]
    train_input = InputData(idx=np.arange(0, len(ts_train)),
                            features=ts_train,
                            target=ts_train,
                            task=task,
                            data_type=DataTypesEnum.ts)

    start_forecast = len(ts_train)
    end_forecast = start_forecast + forecast_length
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=ts,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input
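
This pair is shaped for multi-step validation: training stops horizon elements before the series end, while predict_input carries the full series so actual values can be used between forecast iterations. A sketch, assuming FEDOT's out_of_sample_ts_forecast wrapper is importable (its module path varies between versions):

train_input, predict_input = prepare_input_data(forecast_length=2, horizon=6)

pipeline = Pipeline(SecondaryNode('ridge', nodes_from=[PrimaryNode('lagged')]))
pipeline.fit(train_input)
# Three iterations of a 2-step forecast to cover the 6-element horizon
forecast = out_of_sample_ts_forecast(pipeline=pipeline,
                                     input_data=predict_input,
                                     horizon=6)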
Example no. 25
def get_scoring_data() -> Tuple[InputData, InputData]:
    train_data_path = f'{fedot_project_root()}/cases/data/scoring/scoring_train.csv'
    test_data_path = f'{fedot_project_root()}/cases/data/scoring/scoring_test.csv'

    train_data = InputData.from_csv(train_data_path)
    test_data = InputData.from_csv(test_data_path)

    return train_data, test_data
Example no. 26
def get_scoring_data():
    file_path_train = 'cases/data/scoring/scoring_train.csv'
    full_path_train = join(str(project_root()), file_path_train)

    # a dataset for a final validation of the composed model
    file_path_test = 'cases/data/scoring/scoring_test.csv'
    full_path_test = join(str(project_root()), file_path_test)
    task = Task(TaskTypesEnum.classification)
    train = InputData.from_csv(full_path_train, task=task)
    test = InputData.from_csv(full_path_test, task=task)

    return train, test
Example no. 27
def run_text_problem_from_files():
    data_abspath = os.path.abspath(os.path.join('data', 'spamham'))

    unpack_archived_data(data_abspath)

    train_path = os.path.join(data_abspath, 'train')
    test_path = os.path.join(data_abspath, 'test')

    train_data = InputData.from_text_files(files_path=train_path)
    test_data = InputData.from_text_files(files_path=test_path)

    metric = execute_pipeline_for_text_problem(train_data, test_data)

    print(f'original files metric: {metric}')
Example no. 28
def apply_model_to_data(model: Chain, data_path: str):
    df, file_path = create_multi_clf_examples_from_excel(data_path,
                                                         return_df=True)
    dataset_to_apply = InputData.from_csv(file_path, target_column=None)
    evo_predicted = model.predict(dataset_to_apply)
    df['forecast'] = probs_to_labels(evo_predicted.predict)
    return df
Example no. 29
def get_model(train_file_path: str,
              cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # search for the models provided by the framework
    # that can be used as nodes in a chain for the selected task
    models_repo = ModelTypesRepository()
    available_model_types, _ = models_repo.suitable_model(
        task_type=task.task_type, tags=['simple'])

    metric_function = MetricsRepository(). \
        metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_lead_time=cur_lead_time)

    # Create the genetic programming-based composer that allows finding
    # the optimal structure of the composite model
    builder = GPComposerBuilder(task).with_requirements(
        composer_requirements).with_metrics(metric_function)
    composer = builder.build()

    # run the search for the best suitable model
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose)

    return chain_evo_composed
Example no. 30
def run_multi_output_case(path, vis=False):
    """ Function launch case for river levels prediction on Lena river as
    multi-output regression task

    :param path: path to the file with table
    :param vis: is it needed to visualise pipeline and predictions
    """
    target_columns = [
        '1_day', '2_day', '3_day', '4_day', '5_day', '6_day', '7_day'
    ]

    data = InputData.from_csv(path,
                              target_columns=target_columns,
                              columns_to_drop=['date'])
    train, test = train_test_data_setup(data)

    problem = 'regression'

    automl_model = Fedot(problem=problem)
    automl_model.fit(features=train)
    predicted_array = automl_model.predict(features=test)

    # Convert output into one dimensional array
    forecast = np.ravel(predicted_array)

    mae_value = mean_absolute_error(np.ravel(test.target), forecast)
    print(f'MAE - {mae_value:.2f}')

    if vis:
        plot_predictions(predicted_array, test)