Example #1
def impute_mean(df, attr):
    """Imputes the given attribute of the given DataFrame with the mean strategy.
    Returns a DataFrame object"""
    imp = Imputer(missing_values="NaN", strategy="mean")
    imp.fit(df[[attr]])
    df[attr] = imp.transform(df[[attr]]).ravel()
    return df
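Imputer itself was deprecated in scikit-learn 0.20 and removed in 0.22 (Example #7 below tests the related axis deprecation). A minimal sketch of the same helper against the modern SimpleImputer API, assuming scikit-learn >= 0.20:

import numpy as np
from sklearn.impute import SimpleImputer

def impute_mean_modern(df, attr):
    """Same as impute_mean above, but with the modern SimpleImputer API."""
    imp = SimpleImputer(missing_values=np.nan, strategy="mean")
    df[attr] = imp.fit_transform(df[[attr]]).ravel()
    return df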
Example #2
    def preprocessData(self, data):
        imputer = Imputer(missing_values=np.nan, strategy='mean')
        imputer.fit(data)
        imputedData = imputer.transform(data)  # nan values will take on mean
        scaledData = preprocessing.scale(imputedData).tolist()

        return scaledData
Example #3
    def to_predict_instance(self, X, partition_columns):
        values_for_preferences = []
        for column in partition_columns:
            if PreferenceProcessor.is_parameter_in_preferences(column, partition_columns):
                values_for_preferences.append(list(X[column].unique()))
        all_combinations = list(itertools.product(
            *values_for_preferences))

        instances = []
        for combination in all_combinations:
            instance = []
            for column in X.columns:
                # if the column is a parameter within the preferences
                if PreferenceProcessor.is_parameter_in_preferences(column, partition_columns):
                    instance.append(
                        combination[list(partition_columns).index(column)])
                # not in the preferences but encoded (column name contains '#')
                elif len(column.split("#")) > 1:
                    instance.append(0)
                # not in the preferences and not encoded
                else:
                    instance.append(np.nan)
            # note: the imputer is refitted on the full X for every
            # combination; fitting it once before the loop would be equivalent
            imputer = Imputer(
                missing_values=np.nan, strategy='mean', axis=0)
            imputer = imputer.fit(X)
            instance = imputer.transform([instance])[0]
            instances.append(instance)
        return instances
Example #4
def _impute(features, imputer=True):
    """
    Helper function that removes null values using the imputation method that is safest for the size of the data
    @param features: the feature values that need to be imputed
    @type features: numpy.array
    @param imputer: whether or not the scikit imputing method should be used
    @type imputer: boolean
    @return: the modified feature values
    @rtype: numpy.array
    """
    if not imputer: #run imputer only if enabled (default)
        return np.nan_to_num(features)
    else:
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=2)
        try:
            impfeatures = imp.fit_transform(features)
        except ValueError as exc:
            #catch errors with illegal values (e.g. strings)
            log.warning("Exception trying to run scikit imputation: {}".format(exc))
            impfeatures = features
        #show size for debugging purposes
        #log.debug("Featurevectors {} after imputation: {}".format(impfeatures.shape, features))i

        #we don't want the shape to change, so if it does, fall back to replacing NaNs and infinities via np.nan_to_num
        if impfeatures.shape == features.shape:
            features = impfeatures
        else:
            log.warning("Imputer failed, filtering NaN based on numpy converter")
            features = np.nan_to_num(features)
    return features
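A quick usage sketch for _impute with hypothetical values (assumes numpy imported as np and the legacy Imputer import used by this snippet):

import numpy as np

feats = np.array([[1.0, np.nan],
                  [3.0, 4.0]])
print(_impute(feats))                 # NaN -> column mean, here 4.0
print(_impute(feats, imputer=False))  # NaN -> 0.0 via np.nan_to_num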
Example #5
    def setUp(self):
        self.cwd = os.getcwd()
        tests_dir = __file__
        os.chdir(os.path.dirname(tests_dir))

        decoder = arff.ArffDecoder()
        with open(os.path.join("datasets", "dataset.arff")) as fh:
            dataset = decoder.decode(fh, encode_nominal=True)

        # -1 because the last attribute is the class
        self.attribute_types = [
            'numeric' if type(type_) != list else 'nominal'
            for name, type_ in dataset['attributes'][:-1]]
        self.categorical = [True if attribute == 'nominal' else False
                            for attribute in self.attribute_types]

        data = np.array(dataset['data'], dtype=np.float64)
        X = data[:, :-1]
        y = data[:, -1].reshape((-1,))

        ohe = OneHotEncoder(self.categorical)
        X_transformed = ohe.fit_transform(X)
        imp = Imputer(copy=False)
        X_transformed = imp.fit_transform(X_transformed)
        center = not scipy.sparse.isspmatrix(X_transformed)
        standard_scaler = StandardScaler(with_mean=center)
        X_transformed = standard_scaler.fit_transform(X_transformed)
        X_transformed = X_transformed.todense()

        # Transform the array which indicates the categorical metafeatures
        number_numerical = np.sum(~np.array(self.categorical))
        categorical_transformed = [True] * (X_transformed.shape[1] -
                                            number_numerical) + \
                                  [False] * number_numerical
        self.categorical_transformed = categorical_transformed

        self.X = X
        self.X_transformed = X_transformed
        self.y = y
        self.mf = meta_features.metafeatures
        self.helpers = meta_features.helper_functions

        # Precompute some helper functions
        self.helpers.set_value("PCA", self.helpers["PCA"]
            (self.X_transformed, self.y))
        self.helpers.set_value("MissingValues", self.helpers[
            "MissingValues"](self.X, self.y, self.categorical))
        self.helpers.set_value("NumSymbols", self.helpers["NumSymbols"](
            self.X, self.y, self.categorical))
        self.helpers.set_value("ClassOccurences",
                               self.helpers["ClassOccurences"](self.X, self.y))
        self.helpers.set_value("Skewnesses",
            self.helpers["Skewnesses"](self.X_transformed, self.y,
                                       self.categorical_transformed))
        self.helpers.set_value("Kurtosisses",
            self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                        self.categorical_transformed))
Example #6
def clean(df, strategy='median'):
    '''Cleans a DataFrame by imputing its float64 columns; object columns pass through unchanged.'''
    imputer = Imputer(strategy=strategy)
    object_df = df.select_dtypes(include=['object'])
    float_df = df.select_dtypes(include=['float64'])
    imputer.fit(float_df)
    float_df = pd.DataFrame(imputer.transform(float_df),
                            columns=float_df.columns)

    return pd.concat([object_df, float_df], axis=1)
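A usage sketch for clean with a hypothetical frame mixing object and float64 columns (assumes the legacy Imputer import):

import numpy as np
import pandas as pd

df = pd.DataFrame({"name": ["a", "b", "c"],
                   "score": [1.0, np.nan, 5.0]})
print(clean(df))  # the missing score becomes the median of [1.0, 5.0], i.e. 3.0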
Example #7
def test_deprecated_imputer_axis():
    depr_message = ("Parameter 'axis' has been deprecated in 0.20 and will "
                    "be removed in 0.22. Future (and default) behavior is "
                    "equivalent to 'axis=0' (impute along columns). Row-wise "
                    "imputation can be performed with FunctionTransformer.")
    X = sparse_random_matrix(5, 5, density=0.75, random_state=0)
    imputer = Imputer(missing_values=0, axis=0)
    assert_warns_message(DeprecationWarning, depr_message, imputer.fit, X)
    imputer = Imputer(missing_values=0, axis=1)
    assert_warns_message(DeprecationWarning, depr_message, imputer.fit, X)
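The row-wise replacement the deprecation message points to could look roughly like the following (a sketch using SimpleImputer inside FunctionTransformer; not part of the test above):

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

def impute_rows(X):
    # impute along rows by imputing the columns of the transpose
    return SimpleImputer(strategy="mean").fit_transform(X.T).T

row_imputer = FunctionTransformer(impute_rows, validate=False)
X = np.array([[1.0, np.nan, 3.0],
              [4.0, 5.0, np.nan]])
print(row_imputer.fit_transform(X))  # [[1.  2.  3. ] [4.  5.  4.5]]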
Example #8
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent']:
        imputer = Imputer(strategy=strategy)
        X_imputed = imputer.fit_transform(X)
        assert_equal(X_imputed.shape, (10, 2))
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert_equal(X_imputed.shape, (10, 2))
Example #9
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = Imputer(strategy=strategy)
        X_imputed = imputer.fit_transform(X)
        assert_equal(X_imputed.shape, (10, 2))
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert_equal(X_imputed.shape, (10, 2))
Example #10
def feature_inf(my_feature, dim_feature):

  from sklearn.preprocessing.imputation import Imputer
  dim_feature = my_feature.shape[1]  # note: the dim_feature argument is recomputed here
  imp = Imputer(missing_values=np.inf, strategy='mean')
  # Fit on an all-zeros (2, dim_feature) array so every column "mean" is 0;
  # transform then replaces each inf in my_feature with 0.
  correction_array = np.asarray([0] * 2 * dim_feature).reshape(2, dim_feature)
  imp.fit(correction_array)
  my_feature = imp.transform(my_feature)  # preprocessing to replace inf values

  return my_feature
Example #11
    def preprocessData(self, data):
        '''
        Handle missing values and scale the data (scaling necessary for SVM to function well).

        :param data: All of the original data.
        :return: Data that has been processed.
        '''
        imputer = Imputer(missing_values=np.nan, strategy='mean')
        imputer.fit(data)
        imputedData = imputer.transform(data)  #nan values will take on mean
        scaledData = preprocessing.scale(imputedData).tolist()

        return scaledData
Example #12
    def test_initialize_model_from_run(self):
        clf = sklearn.pipeline.Pipeline(
            steps=[('Imputer', Imputer(strategy='median')
                    ), ('VarianceThreshold', VarianceThreshold(
                        threshold=0.05)), ('Estimator', GaussianNB())])
        task = openml.tasks.get_task(11)
        run = openml.runs.run_model_on_task(task,
                                            clf,
                                            avoid_duplicate_runs=False)
        run_ = run.publish()
        run = openml.runs.get_run(run_.run_id)

        modelR = openml.runs.initialize_model_from_run(run.run_id)
        modelS = openml.setups.initialize_model(run.setup_id)

        flowR = openml.flows.sklearn_to_flow(modelR)
        flowS = openml.flows.sklearn_to_flow(modelS)
        flowL = openml.flows.sklearn_to_flow(clf)
        openml.flows.assert_flows_equal(flowR, flowL)
        openml.flows.assert_flows_equal(flowS, flowL)

        self.assertEquals(flowS.components['Imputer'].parameters['strategy'],
                          '"median"')
        self.assertEquals(
            flowS.components['VarianceThreshold'].parameters['threshold'],
            '0.05')
Example #13
def test_imputation_pickle():
    """Test for pickling imputers."""
    import pickle

    l = 100
    X = sparse_random_matrix(l, l, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = Imputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        assert_array_equal(imputer.transform(X.copy()),
                           imputer_pickled.transform(X.copy()),
                           "Fail to transform the data after pickling "
                           "(strategy = %s)" % (strategy))
Beispiel #14
0
def test_imputation_pickle():
    # Test for pickling imputers.
    import pickle

    l = 100
    X = sparse_random_matrix(l, l, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = Imputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        assert_array_equal(
            imputer.transform(X.copy()), imputer_pickled.transform(X.copy()),
            "Fail to transform the data after pickling "
            "(strategy = %s)" % (strategy))
Example #15
    def get_pipeline(self, classifier):
        # preprocess_pipeline = make_pipeline(
        #     ColumnSelector(columns=self.cols_feature),
        #     ,
        # )
        transformer_list = []
        if float in self.X.dtypes.values:
            transformer_list.append((
                "numeric_features",
                make_pipeline(
                    TypeSelector(np.number),
                    # SimpleImputer(strategy="median"),
                    Imputer(strategy="median"),
                    StandardScaler())))
        if "category" in self.X.dtypes.values:
            transformer_list.append((
                "categorical_features",
                make_pipeline(
                    TypeSelector("category"),
                    # SimpleImputer(strategy="most_frequent"),
                    Imputer(strategy="most_frequent"),
                    OneHotEncoder())))
        if 'bool' in self.X.dtypes.values:
            transformer_list.append((
                "boolean_features",
                make_pipeline(TypeSelector("bool"),
                              Imputer(strategy="most_frequent")
                              # SimpleImputer(strategy="most_frequent")
                              )))
        feature_union = FeatureUnion(transformer_list=transformer_list)

        pipeline = Pipeline(
            steps=[('colselector', ColumnSelector(
                columns=self.X.columns)), (
                    'featureunion', feature_union), ('classifier',
                                                     classifier)])

        # make_pipeline(
        #     preprocess_pipeline,
        #     'classfier': classifier,
        # )

        return pipeline
Example #16
def check_indicator(X, expected_imputed_features, axis):
    n_samples, n_features = X.shape
    imputer = Imputer(missing_values=-1, strategy='mean', axis=axis)
    imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
    Xt = imputer.fit_transform(X)
    Xt_with_in = imputer_with_in.fit_transform(X)
    imputed_features_mask = X[:, expected_imputed_features] == -1
    n_features_new = Xt.shape[1]
    n_imputed_features = len(imputer_with_in.imputed_features_)
    assert_array_equal(imputer.imputed_features_, expected_imputed_features)
    assert_array_equal(imputer_with_in.imputed_features_,
                       expected_imputed_features)
    assert_equal(Xt_with_in.shape,
                 (n_samples, n_features_new + n_imputed_features))
    assert_array_equal(Xt_with_in, np.hstack((Xt, imputed_features_mask)))
    imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
    assert_array_equal(Xt_with_in,
                       imputer_with_in.fit_transform(sparse.csc_matrix(X)).A)
    assert_array_equal(Xt_with_in,
                       imputer_with_in.fit_transform(sparse.csr_matrix(X)).A)
Example #17
def check_indicator(X, expected_imputed_features, axis):
    n_samples, n_features = X.shape
    imputer = Imputer(missing_values=-1, strategy='mean', axis=axis)
    imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
    Xt = imputer.fit_transform(X)
    Xt_with_in = imputer_with_in.fit_transform(X)
    imputed_features_mask = X[:, expected_imputed_features] == -1
    n_features_new = Xt.shape[1]
    n_imputed_features = len(imputer_with_in.imputed_features_)
    assert_array_equal(imputer.imputed_features_, expected_imputed_features)
    assert_array_equal(imputer_with_in.imputed_features_,
                       expected_imputed_features)
    assert_equal(Xt_with_in.shape,
                 (n_samples, n_features_new + n_imputed_features))
    assert_array_equal(Xt_with_in, np.hstack((Xt, imputed_features_mask)))
    imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
    assert_array_equal(Xt_with_in,
                       imputer_with_in.fit_transform(sparse.csc_matrix(X)).A)
    assert_array_equal(Xt_with_in,
                       imputer_with_in.fit_transform(sparse.csr_matrix(X)).A)
Example #18
def test_mice_missing_at_transform():
    n = 100
    d = 10
    Xtr = np.random.randint(low=0, high=3, size=(n, d))
    Xts = np.random.randint(low=0, high=3, size=(n, d))

    Xtr[:, 0] = 1  # definitely no missing values in 0th column
    Xts[0, 0] = 0  # definitely missing value in 0th column

    for strategy in ["mean", "median", "most_frequent"]:
        mice = MICEImputer(missing_values=0,
                           n_imputations=1,
                           n_burn_in=1,
                           initial_strategy=strategy).fit(Xtr)
        initial_imputer = Imputer(missing_values=0, strategy=strategy).fit(Xtr)

        # if there were no missing values at time of fit, then mice will
        # only use the initial imputer for that feature at transform
        assert np.all(
            mice.transform(Xts)[:, 0] == initial_imputer.transform(Xts)[:, 0])
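MICEImputer never shipped in a stable scikit-learn release under that name; the closest modern equivalent is the experimental IterativeImputer, which has to be enabled explicitly. A sketch, assuming scikit-learn >= 0.21:

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

X = np.array([[1.0, 2.0], [3.0, np.nan], [5.0, 6.0]])
print(IterativeImputer(max_iter=1).fit_transform(X))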
Example #19
    def test_local_run_metric_score(self):

        # construct sci-kit learn classifier
        clf = Pipeline(steps=[('imputer', Imputer(
            strategy='median')), ('estimator', RandomForestClassifier())])

        # download task
        task = openml.tasks.get_task(7)

        # invoke OpenML run
        run = openml.runs.run_model_on_task(task, clf)

        self._test_local_evaluations(run)
Example #20
 def test_run_and_upload_decision_tree_pipeline(self):
     pipeline2 = Pipeline(
         steps=[('Imputer', Imputer(
             strategy='median')), ('VarianceThreshold',
                                   VarianceThreshold()),
                ('Estimator',
                 RandomizedSearchCV(DecisionTreeClassifier(), {
                     'min_samples_split': [2**x for x in range(1, 7 + 1)],
                     'min_samples_leaf': [2**x for x in range(0, 6 + 1)]
                 },
                                    cv=3,
                                    n_iter=10))])
     self._run_and_upload(pipeline2, '62501')
Example #21
    def test__run_exists(self):
        # it would be better not to sentinel these clfs, so we would not
        # have to perform the actual runs and could just check their
        # status online
        clfs = [
            sklearn.pipeline.Pipeline(
                steps=[('Imputer', Imputer(strategy='mean')),
                       ('VarianceThreshold', VarianceThreshold(threshold=0.05)
                        ), ('Estimator',
                            DecisionTreeClassifier(max_depth=4))]),
            sklearn.pipeline.Pipeline(
                steps=[('Imputer', Imputer(strategy='most_frequent')),
                       ('VarianceThreshold', VarianceThreshold(threshold=0.1)
                        ), ('Estimator', DecisionTreeClassifier(max_depth=4))])
        ]

        task = openml.tasks.get_task(115)

        for clf in clfs:
            try:
                # first populate the server with this run.
                # skip run if it was already performed.
                run = openml.runs.run_model_on_task(task,
                                                    clf,
                                                    avoid_duplicate_runs=True)
                run.publish()
            except openml.exceptions.PyOpenMLError as e:
                # run already existed. Great.
                pass

            flow = openml.flows.sklearn_to_flow(clf)
            flow_exists = openml.flows.flow_exists(flow.name,
                                                   flow.external_version)
            self.assertGreater(flow_exists, 0)
            downloaded_flow = openml.flows.get_flow(flow_exists)
            setup_exists = openml.setups.setup_exists(downloaded_flow, clf)
            self.assertGreater(setup_exists, 0)
            run_ids = _run_exists(task.task_id, setup_exists)
            self.assertTrue(run_ids, msg=(run_ids, clf))
Example #22
def test_imputation_pipeline_grid_search():
    # Test imputation within a pipeline + gridsearch.
    pipeline = Pipeline([('imputer', Imputer(missing_values=0)),
                         ('tree', tree.DecisionTreeRegressor(random_state=0))])

    parameters = {
        'imputer__strategy': ["mean", "median", "most_frequent"],
        'imputer__axis': [0, 1]
    }

    l = 100
    X = sparse_random_matrix(l, l, density=0.10)
    Y = sparse_random_matrix(l, 1, density=0.10).toarray()
    gs = GridSearchCV(pipeline, parameters)
    gs.fit(X, Y)
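With axis gone from the modern API (see Example #7), a present-day version of this test can only grid over the strategy. A rough sketch with SimpleImputer, not the actual scikit-learn test:

import numpy as np
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
X[::7, 2] = np.nan
y = rng.randn(100)

pipeline = Pipeline([('imputer', SimpleImputer()),
                     ('tree', tree.DecisionTreeRegressor(random_state=0))])
parameters = {'imputer__strategy': ["mean", "median", "most_frequent"]}
GridSearchCV(pipeline, parameters, cv=3).fit(X, y)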
Example #23
def test_imputation_copy():
    """Test imputation with copy=True."""
    l = 5

    # Test default behaviour and with copy=True
    for params in [{}, {'copy': True}]:
        X = sparse_random_matrix(l, l, density=0.75, random_state=0)

        # Dense
        imputer = Imputer(missing_values=0, strategy="mean", **params)
        Xt = imputer.fit(X).transform(X)
        Xt[0, 0] = np.nan
        # Check that the objects are different and that they don't use
        # the same buffer
        assert_false(np.all(X.todense() == Xt))

        # Sparse
        imputer = Imputer(missing_values=0, strategy="mean", **params)
        X = X.todense()
        Xt = imputer.fit(X).transform(X)
        Xt[0, 0] = np.nan
        # Check that the objects are different and that they don't use
        # the same buffer
        assert_false(np.all(X == Xt))
Example #24
    def test_run_on_dataset_with_missing_labels(self):
        # Check that _run_task_get_arffcontent works when one of the class
        # labels only declared in the arff file, but is not present in the
        # actual data

        task = openml.tasks.get_task(2)
        class_labels = task.class_labels

        model = Pipeline(steps=[('Imputer', Imputer(
            strategy='median')), ('Estimator', DecisionTreeClassifier())])

        data_content, _, _, _, _ = _run_task_get_arffcontent(model, task)
        # 2 folds, 5 repeats; keep in mind that this task comes from the test
        # server, the task on the live server is different
        self.assertEqual(len(data_content), 4490)
        for row in data_content:
            # repeat, fold, row_id, 6 confidences, prediction and correct label
            self.assertEqual(len(row), 12)
Example #25
    def test_learning_curve_task(self):
        task_id = 801  # diabetes dataset
        num_test_instances = 6144  # for learning curve
        num_repeats = 1
        num_folds = 10
        num_samples = 8

        clfs = []
        random_state_fixtures = []

        #nb = GaussianNB()
        #clfs.append(nb)
        #random_state_fixtures.append('62501')

        pipeline1 = Pipeline(steps=[('scaler', StandardScaler(
            with_mean=False)), ('dummy', DummyClassifier(strategy='prior'))])
        clfs.append(pipeline1)
        random_state_fixtures.append('62501')

        pipeline2 = Pipeline(
            steps=[('Imputer', Imputer(
                strategy='median')), ('VarianceThreshold',
                                      VarianceThreshold()),
                   ('Estimator',
                    RandomizedSearchCV(DecisionTreeClassifier(), {
                        'min_samples_split': [2**x for x in range(1, 7 + 1)],
                        'min_samples_leaf': [2**x for x in range(0, 6 + 1)]
                    },
                                       cv=3,
                                       n_iter=10))])
        clfs.append(pipeline2)
        random_state_fixtures.append('62501')

        for clf, rsv in zip(clfs, random_state_fixtures):
            run = self._perform_run(task_id,
                                    num_test_instances,
                                    clf,
                                    random_state_value=rsv)

            # todo: check if runtime is present
            self._check_sample_evaluations(run.sample_evaluations, num_repeats,
                                           num_folds, num_samples)
Example #26
    def test__prediction_to_row(self):
        repeat_nr = 0
        fold_nr = 0
        clf = sklearn.pipeline.Pipeline(
            steps=[('Imputer', Imputer(strategy='mean')
                    ), ('VarianceThreshold', VarianceThreshold(
                        threshold=0.05)), ('Estimator', GaussianNB())])
        task = openml.tasks.get_task(20)
        train, test = task.get_train_test_split_indices(repeat_nr, fold_nr)
        X, y = task.get_X_and_y()
        clf.fit(X[train], y[train])

        test_X = X[test]
        test_y = y[test]

        probaY = clf.predict_proba(test_X)
        predY = clf.predict(test_X)
        sample_nr = 0  # default for this task
        for idx in range(0, len(test_X)):
            arff_line = _prediction_to_row(repeat_nr, fold_nr, sample_nr, idx,
                                           task.class_labels[test_y[idx]],
                                           predY[idx], probaY[idx],
                                           task.class_labels, clf.classes_)

            self.assertIsInstance(arff_line, list)
            self.assertEqual(len(arff_line), 6 + len(task.class_labels))
            self.assertEqual(arff_line[0], repeat_nr)
            self.assertEqual(arff_line[1], fold_nr)
            self.assertEqual(arff_line[2], sample_nr)
            self.assertEqual(arff_line[3], idx)
            sum = 0.0
            for att_idx in range(4, 4 + len(task.class_labels)):
                self.assertIsInstance(arff_line[att_idx], float)
                self.assertGreaterEqual(arff_line[att_idx], 0.0)
                self.assertLessEqual(arff_line[att_idx], 1.0)
                sum += arff_line[att_idx]
            self.assertAlmostEqual(sum, 1.0)

            self.assertIn(arff_line[-1], task.class_labels)
            self.assertIn(arff_line[-2], task.class_labels)
        pass
Example #27
 def transform_data(self, housing_data):
    data             = housing_data.drop('median_house_value', axis=1)
    self.housing_num = data.select_dtypes(include=[np.number])
    self.num_attribs = list(self.housing_num)
    self.cat_attribs = list(data.select_dtypes(include=[np.object]))
    
    self.num_pipeline = Pipeline([
          ('selector'     , DataFrameSelector      (self.num_attribs )),
          ('imputer'      , Imputer                (strategy="median")),
          ('attribs_adder', CombinedAttributesAdder(                 )),
          ('std_caller'   , StandardScaler         (                 ))
       ])
    
    self.cat_pipeline = Pipeline([
          ('selector'     , DataFrameSelector      (self.cat_attribs )),
          ('cat_encoder'  , OneHotEncoder          (sparse=False     ))
       ])
    
    self.full_pipeline = FeatureUnion(transformer_list=[
          ("num_pipeline", self.num_pipeline),
          ("cat_pipeline", self.cat_pipeline)
       ])
Example #28
def modelo_4v():
    print(request.args)
    loaded_model, graph = cargarModelo_4v()
    # dimensions of our images.

    # Show
    datatest_name = request.args.get("datacsv")
    data_path = '../samples/' + datatest_name + '.csv'

    dataset = pd.read_csv(data_path, delimiter='\t')
    # imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    sc = StandardScaler()
    # data imputation (null values)
    imp = Imputer()

    X_ID = dataset.iloc[:, 0].values
    X_testing = dataset.iloc[:, 1:5].values
    # data imputation (null values)
    imp = Imputer()
    imp.fit(X_testing)
    X_test = imp.transform(X_testing)
    X_test = sc.fit_transform(X_test)

    # prediction

    with graph.as_default():
        y_pred = loaded_model.predict(X_test)
        resultado_final = ''
        for i in range(0, len(y_pred)):

            if y_pred[i] > 0.5:
                print(X_ID[i], ' --> Genera Valor!')
                resultado = str(X_ID[i]) + ' --> Genera Valor!! '
            else:
                print(X_ID[i], ' --> No genera Valor ')
                resultado = str(X_ID[i]) + ' --> No genera Valor '
            resultado_final = resultado_final + resultado + '\n'

        #print('Prediccion:', score, ' Gato ' if score < 0.5 else ' Perro')
        return resultado_final
Example #29
def test_imputation_pickle():
    # Test for pickling imputers.
    import pickle

    n = 100
    X = sparse_random_matrix(n, n, density=0.10).todense()

    for strategy in ["mean", "median", "most_frequent", "mice"]:
        if strategy == 'mice':
            imputer = MICEImputer(missing_values=0,
                                  n_imputations=1,
                                  n_burn_in=1)
        else:
            imputer = Imputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        assert_array_almost_equal(
            imputer.transform(X.copy()),
            imputer_pickled.transform(X.copy()),
            err_msg="Fail to transform the data after pickling "
            "(strategy = %s)" % (strategy))
Example #30
    def test_learning_curve_task_2(self):
        task_id = 801  # diabetes dataset
        num_test_instances = 6144  # for learning curve
        num_repeats = 1
        num_folds = 10
        num_samples = 8

        pipeline2 = Pipeline(
            steps=[('Imputer', Imputer(
                strategy='median')), ('VarianceThreshold',
                                      VarianceThreshold()),
                   ('Estimator',
                    RandomizedSearchCV(DecisionTreeClassifier(), {
                        'min_samples_split': [2**x for x in range(1, 7 + 1)],
                        'min_samples_leaf': [2**x for x in range(0, 6 + 1)]
                    },
                                       cv=3,
                                       n_iter=10))])
        run = self._perform_run(task_id,
                                num_test_instances,
                                pipeline2,
                                random_state_value='62501')
        self._check_sample_evaluations(run.sample_evaluations, num_repeats,
                                       num_folds, num_samples)
Example #31
def test_imputation_copy():
    """Test imputation with copy=True."""
    l = 5

    # Test default behaviour and with copy=True
    for params in [{}, {'copy': True}]:
        X = sparse_random_matrix(l, l, density=0.75, random_state=0)

        # Dense
        imputer = Imputer(missing_values=0, strategy="mean", **params)
        Xt = imputer.fit(X).transform(X)
        Xt[0, 0] = np.nan
        # Check that the objects are different and that they don't use
        # the same buffer
        assert_false(np.all(X.todense() == Xt))

        # Sparse
        imputer = Imputer(missing_values=0, strategy="mean", **params)
        X = X.todense()
        Xt = imputer.fit(X).transform(X)
        Xt[0, 0] = np.nan
        # Check that the objects are different and that they don't use
        # the same buffer
        assert_false(np.all(X == Xt))
Example #32
def test_imputation_copy():
    # Test imputation with copy
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # copy=True, dense => copy
    X = X_orig.copy().toarray()
    imputer = Imputer(missing_values=0, strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_false(np.all(X == Xt))

    # copy=True, sparse csr => copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, dense => no copy
    X = X_orig.copy().toarray()
    imputer = Imputer(missing_values=0, strategy="mean", copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_true(np.all(X == Xt))

    # copy=False, sparse csr, axis=1 => no copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=1)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_true(np.all(X.data == Xt.data))

    # copy=False, sparse csc, axis=0 => no copy
    X = X_orig.copy().tocsc()
    imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=0)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_true(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=0 => copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=0)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, sparse csc, axis=1 => copy
    X = X_orig.copy().tocsc()
    imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=1)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=1, missing_values=0 => copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=0, strategy="mean", copy=False, axis=1)
    Xt = imputer.fit(X).transform(X)
    assert_false(sparse.issparse(Xt))
Example #33
def _check_statistics(X, X_true, strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Test:
        - along the two axes
        - with dense and sparse arrays

    Check that:
        - the statistics (mean, median, mode) are correct
        - the missing values are imputed correctly"""

    err_msg = "Parameters: strategy = %s, missing_values = %s, " "axis = {0}, sparse = {1}" % (strategy, missing_values)

    # Normal matrix, axis = 0
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    X_trans = imputer.fit(X).transform(X.copy())
    assert_array_equal(imputer.statistics_, statistics, err_msg.format(0, False))
    assert_array_equal(X_trans, X_true, err_msg.format(0, False))

    # Normal matrix, axis = 1
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(X.transpose())
    if np.isnan(statistics).any():
        assert_raises(ValueError, imputer.transform, X.copy().transpose())
    else:
        X_trans = imputer.transform(X.copy().transpose())
        assert_array_equal(X_trans, X_true.transpose(), err_msg.format(1, False))

    # Sparse matrix, axis = 0
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    imputer.fit(sparse.csc_matrix(X))
    X_trans = imputer.transform(sparse.csc_matrix(X.copy()))

    if sparse.issparse(X_trans):
        X_trans = X_trans.toarray()

    assert_array_equal(imputer.statistics_, statistics, err_msg.format(0, True))
    assert_array_equal(X_trans, X_true, err_msg.format(0, True))

    # Sparse matrix, axis = 1
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(sparse.csc_matrix(X.transpose()))
    if np.isnan(statistics).any():
        assert_raises(ValueError, imputer.transform, sparse.csc_matrix(X.copy().transpose()))
    else:
        X_trans = imputer.transform(sparse.csc_matrix(X.copy().transpose()))

        if sparse.issparse(X_trans):
            X_trans = X_trans.toarray()

        assert_array_equal(X_trans, X_true.transpose(), err_msg.format(1, True))
Example #34
        count += 1
        if count % 1000 == 0:
            print(count)
        val = noncat_matrix[x, y]
        if val - math.floor(val) != 0.0:
            for i in range(20):
                if abs(abs(val) * i - math.ceil(abs(val) * i)) < 0.001:
                    X[x, 2 * y] = math.ceil(abs(val) * i)
                    X[x, 2 * y + 1] = i
    return X


# categories
print("building train")
train_cat_matr = train_df.ix[:, 0:CAT_COUNT].as_matrix()
imp = Imputer(missing_values="NaN", strategy="most_frequent", axis=0)
train_cat_matr = imp.fit_transform(train_cat_matr)
# imp2 = Imputer(missing_values='NaN', strategy='median')
train_noncat_matr = train_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# train_noncat_matr = train_df.ix[:, CAT_COUNT:].as_matrix()
# train_noncat_matr = imp2.fit_transform(train_noncat_matr)
# allf = np.hstack((train_cat_matr, train_noncat_matr))


print("building test")
test_df.ix[:, 0:CAT_COUNT] = test_set_to_encode
test_cat_matr = test_df.ix[:, 0:CAT_COUNT].as_matrix()
test_cat_matr = imp.transform(test_cat_matr)
test_noncat_matr = test_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# test_noncat_matr = test_df.ix[:, CAT_COUNT:].as_matrix()
# test_noncat_matr = imp2.transform(test_noncat_matr)
Example #35
def test_imputation_copy():
    # Test imputation with copy
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # copy=True, dense => copy
    X = X_orig.copy().toarray()
    imputer = Imputer(missing_values=0, strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_false(np.all(X == Xt))

    # copy=True, sparse csr => copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, dense => no copy
    X = X_orig.copy().toarray()
    imputer = Imputer(missing_values=0, strategy="mean", copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_true(np.all(X == Xt))

    # copy=False, sparse csr, axis=1 => no copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=X.data[0],
                      strategy="mean",
                      copy=False,
                      axis=1)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_true(np.all(X.data == Xt.data))

    # copy=False, sparse csc, axis=0 => no copy
    X = X_orig.copy().tocsc()
    imputer = Imputer(missing_values=X.data[0],
                      strategy="mean",
                      copy=False,
                      axis=0)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_true(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=0 => copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=X.data[0],
                      strategy="mean",
                      copy=False,
                      axis=0)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, sparse csc, axis=1 => copy
    X = X_orig.copy().tocsc()
    imputer = Imputer(missing_values=X.data[0],
                      strategy="mean",
                      copy=False,
                      axis=1)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_false(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=1, missing_values=0 => copy
    X = X_orig.copy()
    imputer = Imputer(missing_values=0, strategy="mean", copy=False, axis=1)
    Xt = imputer.fit(X).transform(X)
    assert_false(sparse.issparse(Xt))
Example #36
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing.imputation import Imputer
from matplotlib import pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
my_age_imputer = Imputer(strategy='median')
# Any results you write to the current directory are saved as output.

# In[ ]:

#loading data into dataframe variable
path = '../input/train.csv'
test_path = '../input/test.csv'
test_data = pd.read_csv(test_path)
train_data = pd.read_csv(path)
total_data = train_data.append(test_data)
#exploring the data
print((total_data.isnull().sum()))  # finding columns that have null values
#getting rid of Cabin since most of its values are missing (687)
data = total_data.drop('Cabin',
                       axis=1)  # drop Cabin because it is mostly blank
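my_age_imputer is created above but never applied in this excerpt; presumably it is meant for the partially missing Age column, roughly like this (hypothetical continuation):

data['Age'] = my_age_imputer.fit_transform(data[['Age']]).ravel()
print(data['Age'].isnull().sum())  # 0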
Example #37
    def setUp(self):
        self.cwd = os.getcwd()
        tests_dir = __file__
        os.chdir(os.path.dirname(tests_dir))

        decoder = arff.ArffDecoder()
        with open(os.path.join("datasets", "dataset.arff")) as fh:
            dataset = decoder.decode(fh, encode_nominal=True)

        # -1 because the last attribute is the class
        self.attribute_types = [
            'numeric' if type(type_) != list else 'nominal'
            for name, type_ in dataset['attributes'][:-1]]
        self.categorical = [True if attribute == 'nominal' else False
                            for attribute in self.attribute_types]

        data = np.array(dataset['data'], dtype=np.float64)
        X = data[:, :-1]
        y = data[:, -1].reshape((-1,))

        # First, swap NaNs and zeros, because when converting an encoded
        # dense matrix to sparse, the values which are encoded to zero are lost
        X_sparse = X.copy()
        NaNs = ~np.isfinite(X_sparse)
        X_sparse[NaNs] = 0
        X_sparse = sparse.csr_matrix(X_sparse)

        ohe = OneHotEncoder(self.categorical)
        X_transformed = X_sparse.copy()
        X_transformed = ohe.fit_transform(X_transformed)
        imp = Imputer(copy=False)
        X_transformed = imp.fit_transform(X_transformed)
        standard_scaler = StandardScaler()
        X_transformed = standard_scaler.fit_transform(X_transformed)

        # Transform the array which indicates the categorical metafeatures
        number_numerical = np.sum(~np.array(self.categorical))
        categorical_transformed = [True] * (X_transformed.shape[1] -
                                            number_numerical) + \
                                  [False] * number_numerical
        self.categorical_transformed = categorical_transformed

        self.X = X_sparse
        self.X_transformed = X_transformed
        self.y = y
        self.mf = meta_features.metafeatures
        self.helpers = meta_features.helper_functions

        # Precompute some helper functions
        self.helpers.set_value("PCA", self.helpers["PCA"]
            (self.X_transformed, self.y))
        self.helpers.set_value("MissingValues", self.helpers[
            "MissingValues"](self.X, self.y, self.categorical))
        self.mf.set_value("NumberOfMissingValues",
            self.mf["NumberOfMissingValues"](self.X, self.y, self.categorical))
        self.helpers.set_value("NumSymbols", self.helpers["NumSymbols"](
            self.X, self.y, self.categorical))
        self.helpers.set_value("ClassOccurences",
            self.helpers["ClassOccurences"](self.X, self.y))
        self.helpers.set_value("Skewnesses",
            self.helpers["Skewnesses"](self.X_transformed, self.y,
                                       self.categorical_transformed))
        self.helpers.set_value("Kurtosisses",
            self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                        self.categorical_transformed))
Example #38
def _check_statistics(X, X_true, strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Test:
        - along the two axes
        - with dense and sparse arrays

    Check that:
        - the statistics (mean, median, mode) are correct
        - the missing values are imputed correctly"""

    err_msg = "Parameters: strategy = %s, missing_values = %s, " \
              "axis = {0}, sparse = {1}" % (strategy, missing_values)

    # Normal matrix, axis = 0
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    X_trans = imputer.fit(X).transform(X.copy())
    assert_array_equal(imputer.statistics_, statistics,
                       err_msg.format(0, False))
    assert_array_equal(X_trans, X_true, err_msg.format(0, False))

    # Normal matrix, axis = 1
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(X.transpose())
    if np.isnan(statistics).any():
        assert_raises(ValueError, imputer.transform, X.copy().transpose())
    else:
        X_trans = imputer.transform(X.copy().transpose())
        assert_array_equal(X_trans, X_true.transpose(),
                           err_msg.format(1, False))

    # Sparse matrix, axis = 0
    imputer = Imputer(missing_values, strategy=strategy, axis=0)
    imputer.fit(sparse.csc_matrix(X))
    X_trans = imputer.transform(sparse.csc_matrix(X.copy()))

    if sparse.issparse(X_trans):
        X_trans = X_trans.toarray()

    assert_array_equal(imputer.statistics_, statistics,
                       err_msg.format(0, True))
    assert_array_equal(X_trans, X_true, err_msg.format(0, True))

    # Sparse matrix, axis = 1
    imputer = Imputer(missing_values, strategy=strategy, axis=1)
    imputer.fit(sparse.csc_matrix(X.transpose()))
    if np.isnan(statistics).any():
        assert_raises(ValueError, imputer.transform,
                      sparse.csc_matrix(X.copy().transpose()))
    else:
        X_trans = imputer.transform(sparse.csc_matrix(X.copy().transpose()))

        if sparse.issparse(X_trans):
            X_trans = X_trans.toarray()

        assert_array_equal(X_trans, X_true.transpose(),
                           err_msg.format(1, True))
Example #39
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
X_train = pd.DataFrame(data=X_train, columns=columns)
X_validation = pd.DataFrame(data=X_validation, columns=columns)

# handling missing values (NaN, null)
# create an additional new column for each column that had missing data,
# filled with True (1) where a value was missing and False (0) where not
missing_columns = [
    col for col in X_train.columns if X_train[col].isnull().any()
]
for col in missing_columns:
    X_train[col + '_missing_data'] = X_train[col].isnull()
original_data = X_train
# fill missing values with mean values
imputer = Imputer()
X_train = pd.DataFrame(data=imputer.fit_transform(X_train))
X_train.columns = original_data.columns
# combine into one 'missing_values' count column, then drop the per-column flags
X_train['missing_values'] = numpy.zeros((len(X_train), 1))
for col in missing_columns:
    X_train['missing_values'] += X_train[col + '_missing_data']
    X_train = X_train.drop([col + '_missing_data'], axis=1)
X_train['Age'] = X_train['Age'].values.round()
X_train = X_train.values

# validation dataset
missing_columns = [
    col for col in X_validation.columns if X_validation[col].isnull().any()
]
for col in missing_columns: