def __init__(self, num_features, cat_features):
    """Assemble the imputation pipeline that feeds ``DataForCatboost``.

    Numeric columns are mean-imputed, categorical columns are filled with
    the imputer's constant default, and the combined output is passed to a
    ``DataForCatboost`` step.

    :param num_features: column selection used for the numeric branch
    :param cat_features: column selection used for the categorical branch
    """
    self.num_features = num_features
    self.cat_features = cat_features
    self.data = None
    self.fit_flag = False

    numeric_steps = [('num', impute.SimpleImputer(strategy='mean'))]
    self.num_preprocessing = pipeline.Pipeline(steps=numeric_steps)

    categorical_steps = [
        ('cat_impute', impute.SimpleImputer(strategy="constant")),
    ]
    self.cat_preprocessing_for_catboost = pipeline.Pipeline(
        steps=categorical_steps)

    # Transformer that fills missing values and transforms the numeric
    # features.
    column_branches = [
        ('num_features', self.num_preprocessing, self.num_features),
        ('cat_features', self.cat_preprocessing_for_catboost,
         self.cat_features),
    ]
    self.features_for_catboost = compose.ColumnTransformer(
        transformers=column_branches)

    # Final pipeline for data preprocessing.
    final_steps = [
        ('feature', self.features_for_catboost),
        ('data', DataForCatboost(self.num_features, self.cat_features)),
    ]
    self.all_features = pipeline.Pipeline(steps=final_steps)
def GetImputer(options):
    """Create an imputer from the optional ``options['impute']`` settings.

    Recognised keys inside ``options['impute']`` are ``'type'`` (default
    ``'simple'``) and ``'strategy'`` (default ``'mean'``).

    :param options: container supporting ``in`` and item access
    :return: a ``SimpleImputer`` that replaces ``np.nan`` using the strategy
    """
    kind = 'simple'
    strategy = 'mean'
    if 'impute' in options:
        impute_options = options['impute']
        if 'type' in impute_options:
            kind = impute_options['type']
        if 'strategy' in impute_options:
            strategy = impute_options['strategy']

    imputer_kwargs = {'missing_values': np.nan, 'strategy': strategy}
    if kind == 'simple':
        return impute.SimpleImputer(**imputer_kwargs)
    # Any other type currently falls back to the same SimpleImputer.
    return impute.SimpleImputer(**imputer_kwargs)
def preprocessor(num_feats, cat_feats):
    """Build a ColumnTransformer for numeric and categorical columns.

    Numeric columns: median imputation followed by standard scaling.
    Categorical columns: constant ``'missing'`` fill followed by ordinal
    encoding.

    :param num_feats: column selection for the numeric branch
    :param cat_feats: column selection for the categorical branch
    :return: an unfitted ``compose.ColumnTransformer``
    """
    numeric_branch = pipeline.Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='median')),
        ('encoder', preprocessing.StandardScaler()),
    ])
    categorical_branch = pipeline.Pipeline(steps=[
        ('imputer', impute.SimpleImputer(strategy='constant',
                                         fill_value='missing')),
        ('encoder', preprocessing.OrdinalEncoder()),
    ])
    branches = [
        ('num', numeric_branch, num_feats),
        ('cat', categorical_branch, cat_feats),
    ]
    return compose.ColumnTransformer(transformers=branches)
def testFunction(data):
    """Run a hard-coded OpenML flow on task 55 and print its mean accuracy.

    ``data`` is accepted but not used by the current implementation.
    The printed accuracy is the sum of the per-fold
    ``predictive_accuracy`` values divided by 10.
    """
    reference_run = oml.runs.get_run(1836360)
    print(reference_run.flow_id)

    # Flow 8900 is the active choice; flows 4834, 8426 and 7650 appear only
    # as commented-out alternatives in the original code.
    estimator = oml.flows.flow_to_sklearn(oml.flows.get_flow(8900))

    clf = pipeline.Pipeline(steps=[
        ('imputer', impute.SimpleImputer()),
        ('estimator', estimator),
    ])
    wrapped_flow = flows.sklearn_to_flow(clf)
    print(wrapped_flow.model)

    task = tasks.get_task(55)
    task_run = runs.run_flow_on_task(task, wrapped_flow,
                                     avoid_duplicate_runs=True)

    fold_scores = dict(task_run.fold_evaluations['predictive_accuracy'][0])
    # NOTE(review): the divisor 10 presumably matches the fold count - confirm.
    print(sum(fold_scores.values()) / 10)
def _simple_impute(self):
    """Impute each entry of ``self.target`` with a default ``SimpleImputer``.

    For every ``col`` in ``self.target`` the values of ``self.df[col]`` are
    imputed and written to the same column(s) of ``self.output_df``.

    :return: ``self.output_df`` after the imputed values were written
    """
    for col in self.target:
        values = self.df[col].values
        # SimpleImputer only accepts 2-D input, while a single column
        # selected by label arrives as a 1-D array.
        is_single_column = values.ndim == 1
        if is_single_column:
            values = values.reshape(-1, 1)

        imputed = impute.SimpleImputer().fit_transform(values)

        if is_single_column:
            # Back to 1-D so it can be assigned to one column.
            imputed = imputed.ravel()
        self.output_df.loc[:, col] = imputed
    return self.output_df
def handle_missing_values():
    """Print a walkthrough of pandas/scikit-learn missing-value handling.

    Builds a small CSV sample with two missing cells, prints the result of
    several ``DataFrame.dropna`` variants and finally the mean-imputed array
    produced by ``SimpleImputer``.
    """
    # prepare sample data
    csv_data = "\n".join([
        "A,B,C,D",
        "1.0,2.0,3.0,4.0",
        "5.0,6.0,,8.0",
        "10.0,11.0,12.0,",
    ])
    df = pd.read_csv(StringIO(csv_data))
    print('[CSV data]')
    print(df)

    print('[count missing values for each features]')
    print(df.isnull().sum())

    print('[drop samples which has a NaN]')
    print(df.dropna())

    print('[drop features which has a NaN]')
    print(df.dropna(axis=1))

    print('[drop samples of which all the features are NaN]')
    print(df.dropna(how='all'))

    # thresh=4 keeps only the rows that have at least 4 non-NaN values.
    print('[drop samples which has less than 4 NaN]')
    print(df.dropna(thresh=4))

    print('[drop samples which has a NaN in specified features]')
    print(df.dropna(subset=['C']))

    print('[impute(interpolate) by mean]')
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    imp = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    imp = imp.fit(df.values)
    print(imp.transform(df.values))
def PredictUnwatched(svm, feature_wtarget_df, feature_topred_df):
    """Predict recommendation levels for the movies in ``feature_topred_df``.

    The mean imputer and the standard scaler are fitted on every column of
    ``feature_wtarget_df``; the rows to predict are then imputed, scaled and
    passed to ``svm.predict``.

    :param svm: fitted model exposing ``predict``
    :param feature_wtarget_df: frame used to fit the imputer and the scaler
    :param feature_topred_df: frame with ``Movie_ID`` and ``Title`` columns
        plus the feature columns
    :return: list of dicts with keys ``Movie_FK``, ``User``, ``RecomLevel``
    """
    # Fit the preprocessing on the reference features.
    reference_np = NP.array(feature_wtarget_df)
    imputer = IM.SimpleImputer(missing_values=NP.nan, strategy='mean')
    scaler = PP.StandardScaler()
    scaler.fit(imputer.fit_transform(reference_np))

    # Apply the same preprocessing to the rows to predict.
    candidates_np = NP.array(
        feature_topred_df.drop(columns=['Movie_ID', 'Title']))
    predict_np = svm.predict(scaler.transform(imputer.transform(candidates_np)))

    # Pair each prediction with its movie; after reset_index the row
    # position matches the position in predict_np.
    reset_df = feature_topred_df.reset_index()
    predict_ls = []
    for position, raw_id in enumerate(reset_df['Movie_ID']):
        # Look the movie up by Movie_ID, since features aren't part of the
        # db yet, though Movie_ID is unique.
        movie_md = MT.MasterMovie.objects.get(Movie_ID=int(raw_id))
        predict_ls.append({
            'Movie_FK': movie_md,
            'User': '******',
            'RecomLevel': predict_np[position],
        })
    return predict_ls
def neural_net_cancer(solver):
    """Train, evaluate and plot an MLP on the 'breastcancer' data set.

    Prints F1 scores, timings, accuracy and precision, then saves a
    learning-curve plot under ``out/neural_net/``.

    :param solver: value forwarded to ``MLPClassifier(solver=...)``
    """
    dataset = load_data_set('breastcancer')
    train = dataset['train']
    test = dataset['test']

    # The imputer statistics are computed on train and test inputs combined.
    mean_imputer = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    mean_imputer.fit(
        np.array(train['inputs'] + test['inputs'], dtype=np.float32))

    clf = neural_network.MLPClassifier(solver=solver, warm_start=True,
                                       max_iter=1000)

    # Timed section includes the imputation of the training inputs.
    with Timer() as fit_timer:
        clf.fit(mean_imputer.transform(train['inputs']), train['outputs'])
    # NOTE(review): assumes Timer.interval is in seconds - confirm.
    time_to_fit = fit_timer.interval * 1000

    train_predictions = clf.predict(mean_imputer.transform(train['inputs']))
    train_f1_score = metrics.f1_score(train['outputs'], train_predictions,
                                      average='micro')

    with Timer() as predict_timer:
        test_predictions = clf.predict(
            mean_imputer.transform(test['inputs']))
    test_prediction_runtime = predict_timer.interval * 1000

    expected_test = test['outputs']
    test_f1_score = metrics.f1_score(expected_test, test_predictions,
                                     average='micro')
    accuracy = accuracy_score(expected_test, test_predictions) * 100
    precision = precision_score(expected_test, test_predictions,
                                average="weighted") * 100

    print("breastcancer.dataset (solver={})".format(solver))
    print("training f1 score:", train_f1_score)
    print("test f1 score:", test_f1_score)
    print("time to fit:", time_to_fit)
    print("test prediction runtime:", test_prediction_runtime)
    print("test accuracy", accuracy)
    print("test precision", precision)
    print()

    # The learning curve is drawn over the full (train + test) data.
    all_inputs = mean_imputer.transform(train['inputs'] + test['inputs'])
    all_outputs = train['outputs'] + test['outputs']
    plot_title = (
        "Learning Curve: Neural Net (breastcancer.dataset, solver={})"
        .format(solver))
    skplt.estimators.plot_learning_curve(clf, all_inputs, all_outputs,
                                         title=plot_title, cv=5)
    plt.savefig('out/neural_net/breastcancer-solver-{}.png'.format(solver))
def _add_preprocessing(estimator):
    """Wrap ``estimator`` in an impute -> standardise -> estimator pipeline.

    :param estimator: final step of the pipeline (registered as ``"ml"``)
    :return: an unfitted ``pipeline.Pipeline``
    """
    stages = [
        ("mvi", impute.SimpleImputer()),
        ("std", preprocessing.StandardScaler()),
        ("ml", estimator),
    ]
    return pipeline.Pipeline(steps=stages)
def preparepimadata(url):
    """Load the headerless Pima CSV and mean-impute zeros in columns 1-7.

    Columns are renamed to ``0..7`` plus ``'L'``. Zeros in the columns at
    positions 1 through 7 are treated as missing and replaced by the column
    mean; the column at position 0 and ``'L'`` are left untouched.

    :param url: path or URL accepted by ``pd.read_csv``
    :return: an imputed deep copy of the loaded frame
    """
    frame = pd.read_csv(url, header=None)
    frame.columns = list(range(8)) + ['L']

    zero_as_missing = impute.SimpleImputer(missing_values=0, strategy='mean')
    feature_block = frame.iloc[:, 1:8]
    zero_as_missing.fit(feature_block)

    result = frame.copy(deep=True)
    result.iloc[:, 1:8] = zero_as_missing.transform(feature_block)
    return result
def impute_scale(data):
    """Fill missing values with 0, then standard-scale ``data``.

    :param data: input accepted by the scikit-learn pipeline
    :return: the transformed data
    """
    steps = [
        ('imputer', impute.SimpleImputer(strategy="constant", fill_value=0)),
        ('std_scaler', preprocessing.StandardScaler()),
    ]
    return pipeline.Pipeline(steps).fit_transform(data)
def impute_data(x):
    """Replace every NaN in ``x`` with 0 using a ``SimpleImputer``.

    :param x: input array (a numpy array according to the original notes)
    :return: the imputed array produced by ``fit_transform``
    """
    zero_filler = impute.SimpleImputer(missing_values=np.nan,
                                       strategy='constant',
                                       fill_value=0)
    return zero_filler.fit_transform(x)
def as_local(self):
    """Build a plain scikit-learn ``SimpleImputer`` mirroring this object.

    The constructor parameters are copied from ``self`` and the fitted
    attributes ``statistics_``, ``feature_names_in_`` and ``n_features_in_``
    are transferred with ``copy_attrs_as_local``. ``indicator_`` is not
    transferred (it is commented out in the original code).

    :return: the populated ``sk_imp.SimpleImputer``
    """
    local_imputer = sk_imp.SimpleImputer(
        missing_values=self.missing_values,
        strategy=self.strategy,
        fill_value=self.fill_value,
        copy=self.copy,
        add_indicator=self.add_indicator,
    )
    copy_attrs_as_local(self, local_imputer,
                        'statistics_', 'feature_names_in_', 'n_features_in_')

    # When the statistics arrive wrapped in a list/tuple of arrays, keep
    # only the first array.
    stats = local_imputer.statistics_
    is_wrapped = (isinstance(stats, (list, tuple))
                  and isinstance(stats[0], np.ndarray))
    if is_wrapped:
        local_imputer.statistics_ = stats[0]
    return local_imputer
def filling_train(file_name):
    """Load a sample file and fill NaNs with each column's most frequent value.

    :param file_name: forwarded to ``read_and_replace_sample``
    :return: tuple ``(imputed DataFrame, fitted SimpleImputer)``; the
        returned frame is rebuilt from the imputed array with default
        column labels
    """
    frame = read_and_replace_sample(file_name).reindex(axis=1)

    if frame.shape[1] == 15:
        # index of last column
        del frame['0.1']

    mode_imputer = impute.SimpleImputer(missing_values=np.nan,
                                        strategy='most_frequent')
    filled = pd.DataFrame(mode_imputer.fit_transform(frame))
    return filled, mode_imputer
def constant_imputer(self):
    """Return a constant-strategy ``SimpleImputer`` built from ``missing_val``.

    :raises Exception: when ``self.missing_val`` is ``None``
    :return: the configured (unfitted) imputer
    """
    if self.missing_val is None:
        # Implicit concatenation keeps the message on one clean line; a
        # backslash continuation inside the literal would embed the source
        # indentation in the message.
        raise Exception(
            "Give 'constant' some value while initializing LeafImputer "
            "when using constant_imputer")

    # NOTE(review): fill_value is the same value as missing_values, so the
    # imputer replaces the placeholder with itself - confirm this is intended.
    return Imputer.SimpleImputer(missing_values=self.missing_val,
                                 strategy='constant',
                                 fill_value=self.missing_val)
def __init__(self):
    """Initialise the pre-processing block.

    Creates one ``StandardScaler`` for the inputs and one for the targets,
    plus a ``SimpleImputer`` that replaces NaN with the most frequent value.
    """
    print("Processing Block Constructed")

    self.X_scaler = preprocessing.StandardScaler()
    self.y_scaler = preprocessing.StandardScaler()

    most_frequent_imputer = impute.SimpleImputer(missing_values=np.nan,
                                                 strategy="most_frequent")
    self.imputer = most_frequent_imputer
def icu_preprocessing(mfunc):
    """Return a factory that wraps ``mfunc(**kwargs)`` in ICU preprocessing.

    The produced pipeline fills missing ``admitdiagnosis`` values with the
    string ``'NaN'``, one-hot encodes that column, mean-imputes, standard
    scales and finally applies the model built by ``mfunc``.

    :param mfunc: callable returning the final estimator
    :return: callable accepting keyword arguments and returning a Pipeline
    """

    def build_pipeline(**kwargs):
        fill_diagnosis = compose.ColumnTransformer(
            [('nanstring',
              impute.SimpleImputer(strategy='constant', fill_value='NaN'),
              ['admitdiagnosis'])],
            remainder='passthrough')

        # Hack: the column is addressed as position 0 in the second
        # transformer because ColumnTransformer throws out the pandas
        # column info.
        # NOTE(review): ``sparse`` was renamed ``sparse_output`` in newer
        # scikit-learn releases - confirm the pinned version.
        one_hot = compose.ColumnTransformer(
            [('onehot',
              preprocessing.OneHotEncoder(sparse=False,
                                          handle_unknown='ignore'),
              [0])],
            remainder='passthrough')

        return pipeline.Pipeline([
            ('fillna', fill_diagnosis),
            ('ohe', one_hot),
            ('impute', impute.SimpleImputer()),
            ('scale', preprocessing.StandardScaler()),
            ('model', mfunc(**kwargs)),
        ])

    return build_pipeline
def runMLAlgorithm(estimator, name, settings, RTPName=None, tooLong=False):
    """Run ``estimator`` on the configured OpenML task and record its score.

    The run is skipped when a runtime prediction exceeds
    ``settings.timeLimit``, or when no prediction is available and
    ``tooLong`` is set.

    :param estimator: scikit-learn estimator used as the final pipeline step
    :param name: label under which the score is stored in ``settings``
    :param settings: object providing ``showRuntimePrediction``, ``task``,
        ``timeLimit``, ``removeOutliers``, ``taskId`` and ``addAlgorithm``
    :param RTPName: key passed to ``getAverageRuntime``; ``None`` disables
        the runtime prediction
    :param tooLong: skip the run when no runtime prediction is available
    :return: ``None``
    """
    expectedRuntime = -1
    if settings.showRuntimePrediction and RTPName is not None:
        expectedRuntime = getAverageRuntime(RTPName, settings.task)

    # -1 means that no runtime prediction is available.
    has_prediction = expectedRuntime != -1
    within_limit = has_prediction and expectedRuntime <= settings.timeLimit
    allowed_without_prediction = not has_prediction and not tooLong
    if not (within_limit or allowed_without_prediction):
        print("Skipping run because of time limit set")
        return

    if settings.removeOutliers:
        name += "_noOutlier"
        # NOTE(review): ``behaviour='new'`` is only accepted by older
        # scikit-learn releases - confirm the pinned version.
        final_estimator = WithoutOutliersClassifier(
            IsolationForest(behaviour='new', contamination='auto'),
            estimator)
    else:
        final_estimator = estimator

    clf = pipeline.Pipeline(steps=[
        ('imputer', impute.SimpleImputer()),
        ('estimator', final_estimator),
    ])
    flow = flows.sklearn_to_flow(clf)

    try:
        run = runs.run_flow_on_task(settings.taskId, flow,
                                    avoid_duplicate_runs=True)
    except PyOpenMLError:
        print("Run already exists in OpenML, WIP")
        return
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit still propagate.
        print("An unexpected error occured")
        return

    fold_scores = dict(run.fold_evaluations['predictive_accuracy'][0])
    acc = sum(fold_scores.values())
    # NOTE(review): the divisor 10 presumably matches the fold count - confirm.
    settings.addAlgorithm(name, acc / 10)
    run.publish()
    run.push_tag("auto-jupyter-notebook")
def build_forest(data):
    """Fit an imputer + random forest and rank the feature importances.

    :param data: dataset object providing ``get_data``,
        ``default_target_attribute`` and ``name``
    :return: tuple ``(data.name, feature names, importances, indices)``
        where ``indices`` orders the importances from highest to lowest
    """
    X, y, features = data.get_data(
        target=data.default_target_attribute,
        return_attribute_names=True)

    classifier = RandomForestClassifier(n_estimators=100, random_state=0)
    forest = Pipeline([
        ('Imputer', impute.SimpleImputer()),
        ('classifiers', classifier),
    ])
    forest.fit(X, y)

    importances = forest.named_steps['classifiers'].feature_importances_
    descending = np.argsort(importances)[::-1]
    return data.name, features, importances, descending
def transform(self):
    """Cluster addresses, impute missing values and integer-encode columns.

    Works on a copy of ``self.origin_df`` and stores the results in
    ``self.labels`` (the encoded ``Type`` column), ``self.columns_name`` and
    ``self.data``. The fitted helpers are kept on ``self`` as
    ``cluster_address``, ``imputer_object`` and ``imputer_numeric``.
    """
    # --- setup ---
    path_stopwords = self.cfg['path_stopwords']
    df = self.origin_df.copy()
    # Column groups are determined before the Address column is replaced.
    col_object = df.select_dtypes(['object']).columns
    col_numeric = df.columns.difference(col_object)
    self.cluster_address = Cluster_address(path_stopwords, self.num_cluster)

    # --- execute ---
    # Address
    df['Address'] = self.cluster_address.fit(df['Address'])

    # Impute data (a NaN-count feature exists only as commented-out code in
    # the original).
    self.imputer_object = impute.SimpleImputer(
        strategy=self.cfg['impute_object'], fill_value=-1)
    self.imputer_numeric = impute.SimpleImputer(
        strategy=self.cfg['impute_numeric'], fill_value=-1)
    df[col_object] = self.imputer_object.fit_transform(df[col_object])
    df[col_numeric] = self.imputer_numeric.fit_transform(df[col_numeric])

    # Distinct values of every object column, captured before any column is
    # re-encoded.
    # NOTE(review): the codes follow set iteration order, which is not
    # guaranteed to be stable between runs - confirm this is acceptable.
    distinct_values = [list(set(df[col])) for col in col_object]
    for col, values in zip(col_object, distinct_values):
        if col == 'Type':
            # The target uses the fixed class order from self.class_name.
            df[col] = df[col].apply(lambda x: self.class_name.index(x))
        else:
            df[col] = df[col].apply(lambda x: values.index(x))

    self.labels = df.pop('Type')
    self.columns_name = set(df.columns)
    self.data = df
def make_preprocessor(self, member):
    """Build the ColumnTransformer for numeric and nominal features.

    Numeric features: median imputation followed by the scaler from
    ``self.make_scaler(member)``. Nominal features: most-frequent imputation
    followed by one-hot encoding over the categories observed in the full
    data.

    :param member: object whose simulation provides the full data set
    :return: an unfitted ``ColumnTransformer``
    """
    data = member.get_simulation().get_full_data()
    features = data.features
    x = data.x
    categorical_features = features['nominal']
    numeric_features = features['numeric']

    def observed_categories(column):
        # Sorted distinct non-NaN values found in one column of x.
        present = np.unique(x[:, column])
        present = present[~np.isnan(present)]
        return np.sort(present)

    categories = [observed_categories(feature)
                  for feature in categorical_features]

    # Preprocessing pipeline for the numeric features.
    numeric_transformer = Pipeline(steps=[
        ('imp', impute.SimpleImputer(strategy='median')),
        ('scaler', self.make_scaler(member)),
    ])

    # Preprocessing pipeline for the categorical features.
    # NOTE(review): ``sparse`` was renamed ``sparse_output`` in newer
    # scikit-learn releases - confirm the pinned version.
    categorical_transformer = Pipeline(steps=[
        ('imp', impute.SimpleImputer(strategy="most_frequent")),
        ('enc', preprocessing.OneHotEncoder(categories=categories,
                                            dtype=np.float64,
                                            handle_unknown="error",
                                            sparse=False)),
    ])

    return ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])
def getPEASFeatures(filepath, featurefile, labelencoderfile, peaks):
    """Load peak features and return the feature rows for ``peaks``.

    The tab-separated file is read, the label-encoded columns are one-hot
    encoded, the remaining columns are mean-imputed and standard-scaled, and
    the rows are looked up by a ``chr:start-end`` key.

    :param filepath: tab-separated feature file
    :param featurefile: passed to ``PEASUtil.getFeatureColumnData``
    :param labelencoderfile: passed to ``PEASUtil.getLabelEncoder``
    :param peaks: indexable sequence whose items hold chromosome, start and
        end at positions 0, 1 and 2
    :return: numpy array with one feature row per peak that was found
    """
    data = pd.read_csv(filepath, sep="\t")
    featurecolumns = PEASUtil.getFeatureColumnData(featurefile)
    labelencoder = PEASUtil.getLabelEncoder(labelencoderfile)

    chrpositions = data.values[:, :3]
    selecteddata = data.values[:, featurecolumns]

    # One-hot encode every column described by the label encoder.
    onehotindices = []
    onehotvalues = []
    for entry_idx in range(len(labelencoder)):
        entry = labelencoder[entry_idx]
        column_pos = np.argwhere(featurecolumns == entry[0])[0][0]
        onehotindices.append(column_pos)

        known_values = np.array(entry[1:])
        encoder = OneHotEncoder(categories=[known_values], sparse=False,
                                handle_unknown='ignore')

        column = selecteddata[:, column_pos].reshape(-1, 1)
        # Replace missing entries with '' before encoding.
        for row in range(len(column)):
            if str(column[row, 0]) == 'nan':
                column[row, 0] = ''
        onehotvalues.append(encoder.fit_transform(column))

    # Impute and scale the remaining (non one-hot) columns.
    is_onehot = np.in1d(np.arange(np.shape(selecteddata)[1]), onehotindices)
    testX = selecteddata[:, ~is_onehot]
    imputer = impute.SimpleImputer(missing_values=np.nan, strategy='mean')
    sscaleddata = preprocessing.StandardScaler().fit_transform(
        imputer.fit_transform(testX))

    sdata = np.concatenate([sscaleddata] + onehotvalues,
                           axis=1).astype(float)

    def region_key(chrom, start, end):
        # Key format shared by the lookup table and the requested peaks.
        return chrom + ":" + str(start) + "-" + str(end)

    # Step 1: map every region in the file to its feature row.
    peakmap = dict()
    for row in range(len(sdata)):
        key = region_key(chrpositions[row, 0], chrpositions[row, 1],
                         chrpositions[row, 2])
        peakmap[key] = sdata[row, :]

    # Step 2: use the dictionary to collect the rows of the requested peaks.
    rv = []
    for peak_idx in range(len(peaks)):
        peak = peaks[peak_idx]
        key = region_key(peak[0], peak[1], peak[2])
        if key in peakmap:
            rv.append(peakmap[key])
        else:
            print("Error: Couldn't find " + key + ".")

    return np.array(rv)
def __init__(
    self,
    featSelect="predefined_RF",
    featTypes="RF",
    random_state=1,
    yscale="lnKa",
    xscale="MinMax",
    verbose=0,
):
    """Store the configuration and create the imputer and the scalers.

    Every constructor argument is stored under an attribute of the same
    name. ``xscale`` and ``yscale`` are used as keys into ``self.xscalers``
    and ``self.yscalers``.

    :raises KeyError: presumably, when ``xscale`` or ``yscale`` is not a
        key of the corresponding scaler mapping
    """
    # Explicit assignments instead of ``self.__dict__.update(**locals())``,
    # which also stored a ``self`` entry that pointed the instance at itself.
    self.featSelect = featSelect
    self.featTypes = featTypes
    self.random_state = random_state
    self.yscale = yscale
    self.xscale = xscale
    self.verbose = verbose

    self.imputer = impute.SimpleImputer()
    self.xscaler = self.xscalers[self.xscale]
    self.yscaler = self.yscalers[self.yscale]
def imputer():
    """Demonstrate mean imputation: fit on one sample, reuse on another."""
    # Column means learnt from ``x``:
    #   4 = (1 + 7) / 2
    #   5 = (2 + 4 + 9) / 3
    x = [[1, 2],
         [np.nan, 4],
         [7, 9]]

    imp = impute.SimpleImputer(strategy='mean')
    imp.fit(x)
    print(imp.transform(x))

    # The statistics fitted on ``x`` are reused for a second sample.
    x2 = [[1, np.nan],
          [np.nan, np.nan],
          [np.nan, 9]]
    print(imp.transform(x2))

    print(imp.strategy)
def fixData(trainFileName, testFileName, features, imputer="simple",
            strategy="mean"):
    """Load the train/test CSV files, dummy-encode and impute them.

    :param trainFileName: CSV file with a ``Survived`` column
    :param testFileName: CSV file with a ``PassengerId`` column
    :param features: column names to keep before encoding
    :param imputer: ``"simple"``, ``"knn"`` or ``"iterative"``
        (case-insensitive)
    :param strategy: strategy for the simple imputer, or the initial
        strategy for the iterative imputer
    :raises ValueError: when ``imputer`` is not one of the supported names
    :return: tuple ``(imputed test array, imputed train array,
        train Survived column, test PassengerId column)``
    """
    print("Fixing Data\n")

    # Fail early with a clear error instead of reaching an unbound ``imp``.
    method = imputer.lower()
    if method not in ("simple", "knn", "iterative"):
        raise ValueError(
            "Unknown imputation method {!r}. Correct imputation methods "
            "include: \"Simple\", \"KNN\", \"Iterative\"".format(imputer))

    # Read files into pandas frames.
    training_data = pd.read_csv(trainFileName)
    testing_data = pd.read_csv(testFileName)

    featuresForDummies = ["Embarked", "Sex"]
    trainSurvived = training_data["Survived"]
    passengerID = testing_data["PassengerId"]

    # Keep only the selected features, then dummy-encode where required.
    selected = list(features)
    tr_data = pd.get_dummies(training_data[selected],
                             columns=featuresForDummies)
    te_data = pd.get_dummies(testing_data[selected],
                             columns=featuresForDummies)

    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    if method == "simple":
        imp = impute.SimpleImputer(missing_values=np.nan, strategy=strategy)
    elif method == "knn":
        imp = impute.KNNImputer(missing_values=np.nan)
    else:
        # NOTE(review): IterativeImputer presumably still needs the
        # scikit-learn experimental enable import elsewhere - confirm.
        imp = impute.IterativeImputer(missing_values=np.nan,
                                      initial_strategy=strategy)

    # NOTE(review): the imputer is fitted separately on the test and the
    # train data - confirm this is intended.
    imp.fit(te_data)
    dummied_test = imp.transform(te_data)
    imp.fit(tr_data)
    dummied_train = imp.transform(tr_data)

    return (dummied_test, dummied_train, trainSurvived, passengerID)
def __init__(self, num_features, cat_features):
    """Assemble the imputation, scaling and encoding transformer.

    Numeric columns are mean-imputed and standard-scaled; categorical
    columns are filled with the imputer's constant default and one-hot
    encoded.

    :param num_features: column selection used for the numeric branch
    :param cat_features: column selection used for the categorical branch
    """
    self.num_features = num_features
    self.cat_features = cat_features
    self.data = None
    self.fit_flag = False

    # Pipeline for the numeric features (alternative noted in the original:
    # strategy='constant', fill_value=0).
    numeric_steps = [
        ('num', impute.SimpleImputer(strategy='mean')),
        ('num_scaler', preprocessing.StandardScaler()),
    ]
    self.num_preprocessing = pipeline.Pipeline(steps=numeric_steps)

    # Pipeline for the categorical features (alternative noted in the
    # original: strategy='most_frequent').
    # NOTE(review): ``sparse`` was renamed ``sparse_output`` in newer
    # scikit-learn releases - confirm the pinned version.
    categorical_steps = [
        ('cat', impute.SimpleImputer(strategy='constant')),
        ('cat_encoder', preprocessing.OneHotEncoder(handle_unknown='ignore',
                                                    sparse=False)),
    ]
    self.cat_preprocessing = pipeline.Pipeline(steps=categorical_steps)

    # Transformer that imputes NaN and preprocesses both feature groups.
    column_branches = [
        ('num_features', self.num_preprocessing, self.num_features),
        ('cat_features', self.cat_preprocessing, self.cat_features),
    ]
    self.data_preprocessing = compose.ColumnTransformer(
        transformers=column_branches)
def create_pipeline(num_feat, cat_feat, cfg):
    """
    Build the classification pipeline: column selection, imputation and
    encoding of the features, followed by an XGBoost classifier.

    :param num_feat: list
        names of the numerical features
    :param cat_feat: list
        names of the categorical features
    :param cfg: class
        custom configuration providing ``features`` and ``num_imputer``
    :return: sklearn.pipeline.Pipeline
        the unfitted model pipeline
    """
    # Categorical branch: constant 'NaN' fill, then one-hot encoding.
    categorical_branch = make_pipeline(
        impute.SimpleImputer(strategy='constant', fill_value='NaN'),
        preprocessing.OneHotEncoder(categories='auto',
                                    handle_unknown='ignore'))

    # Numeric branch: constant fill with the configured value.
    numeric_branch = impute.SimpleImputer(strategy='constant',
                                          fill_value=cfg.num_imputer)

    column_transformer = compose.ColumnTransformer(transformers=[
        ('num_feat', numeric_branch, num_feat),
        ('cat_feat', categorical_branch, cat_feat),
    ])
    preprocessing_steps = make_pipeline(
        transformers.ColumnSelector(columns=cfg.features),
        column_transformer,
    )

    classifier = xgb.XGBClassifier(objective='binary:logistic')
    return Pipeline(steps=[('preproc', preprocessing_steps),
                           ('xgb', classifier)])
def handle(request):
    """Median-impute the float columns of a sub-dataset CSV in place.

    The request body is the JSON-encoded primary key of a ``SubDataset``.
    Its CSV is loaded, the ``float64`` feature columns are imputed with the
    column median, and the result is written back to ``sub_dataset.url``.

    :param request: request whose body holds the dataset id as JSON
    :return: ``JsonResponse`` echoing the dataset id
    """
    dataset_id = json.loads(request.body)
    imputer = impute.SimpleImputer(strategy="median")

    sub_dataset = list(SubDataset.objects.filter(pk=dataset_id))[0]
    dataframe_all = pd.read_csv(sub_dataset.url)
    print(dataframe_all.head())

    # Drop the label COLUMN; ``drop([label])`` without ``columns=`` targets
    # row labels (axis 0) instead.
    label = sub_dataset.label
    dataframe_no_label = dataframe_all.drop(columns=[label])
    dataframe_numeric = dataframe_no_label.select_dtypes(include=['float64'])
    dataframe_cat = dataframe_no_label.select_dtypes(exclude=['float64'])

    imputer.fit(dataframe_numeric)
    X = imputer.transform(dataframe_numeric)
    dataframe_numeric_tr = pd.DataFrame(X, columns=dataframe_numeric.columns)

    # NOTE(review): the label column is appended unchanged so the rewritten
    # file keeps it; confirm that the label should be preserved in the CSV.
    dataset_result = pd.concat(
        [dataframe_numeric_tr, dataframe_cat, dataframe_all[[label]]],
        axis=1, sort=False)
    dataset_result.to_csv(sub_dataset.url, index=False)

    return JsonResponse({"data": dataset_id}, safe=False,
                        content_type="application/json")
def pipe(self):
    """Build the preprocessing pipeline from the widget state and emit it.

    Without the pipeline option only an imputer is emitted. With it, the
    selected scaler is added and, when PCA is enabled, a PCA step whose
    ``n_components`` must be a float in [0, 1] or a whole number.

    :return: ``0`` when ``n_components`` is invalid (nothing is emitted),
        otherwise ``None``
    """
    if not self.l_rb_pipeline.get_state():
        imputer = impute.SimpleImputer()
        self.emit_signal_for_ml_widget(
            pipeline.Pipeline([('imputer', imputer)]))
        return

    imputer = impute.SimpleImputer()
    scaler = self.get_scalers_objects()
    print('scaler : ', scaler)
    print('imputer : ', imputer)
    steps = [('imputer', imputer), ('scaler', scaler)]

    if self.l_rb_PCA.get_state():
        n_comp = self.l_sb_components.get_value()
        print('n_comp: ', n_comp)

        is_whole = n_comp.is_integer()
        if not (0.0 <= n_comp <= 1.0 or is_whole):
            CustomDialogWidgets.CustomMessageBoxWarning(
                'n_components must be Float in range [0,1] or Int')
            return 0

        # Whole numbers are handed to PCA as int.
        if is_whole:
            n_comp = int(n_comp)
        steps.append(('PCA', PCA(n_components=n_comp)))

    self.emit_signal_for_ml_widget(pipeline.Pipeline(steps))
def imputer():
    """Demonstrate the default ``SimpleImputer`` and its fitted attributes."""
    # Column means learnt from ``x``:
    #   6 = (3 + 9) / 2
    #   4 = (7 + 4 + 1) / 3
    x = [[3, 7],
         [np.nan, 4],
         [9, 1]]

    imp = impute.SimpleImputer()
    imp.fit(x)
    print(imp.transform(x))

    # Exercise: create data that has a missing value in every column and
    # transform it, using the ``imp`` fitted above.
    all_missing = [[np.nan, np.nan]]
    print(imp.transform(all_missing))

    print(imp.strategy)
    print(imp.missing_values)
    print(imp.statistics_)