def operate(self, input_datanode, target_fields=None):
    from sklearn.feature_selection import SelectFromModel

    feature_types = input_datanode.feature_types
    X, y = input_datanode.data
    if target_fields is None:
        target_fields = collect_fields(feature_types, self.input_type)
    X_new = X[:, target_fields]

    n_fields = len(feature_types)
    irrelevant_fields = list(range(n_fields))
    for field_id in target_fields:
        irrelevant_fields.remove(field_id)

    if self.model is None:
        from sklearn.svm import LinearSVC
        self.C = float(self.C)
        self.tol = float(self.tol)
        self.dual = check_for_bool(self.dual)
        self.fit_intercept = check_for_bool(self.fit_intercept)
        self.intercept_scaling = float(self.intercept_scaling)
        if check_none(self.class_weight):
            self.class_weight = None

        estimator = LinearSVC(penalty=self.penalty,
                              loss=self.loss,
                              dual=self.dual,
                              tol=self.tol,
                              C=self.C,
                              class_weight=self.class_weight,
                              fit_intercept=self.fit_intercept,
                              intercept_scaling=self.intercept_scaling,
                              multi_class=self.multi_class,
                              random_state=self.random_state)
        estimator.fit(X_new, y)
        # Keep the features whose |coef_| exceeds the mean importance.
        self.model = SelectFromModel(estimator, prefit=True, threshold='mean')

    _X = self.model.transform(X_new)
    # get_support() is positional over the columns fed to the selector,
    # so pair it with enumerate(target_fields) rather than indexing it
    # with the original field ids.
    is_selected = self.model.get_support()
    irrelevant_types = [feature_types[idx] for idx in irrelevant_fields]
    selected_types = [feature_types[fid]
                      for i, fid in enumerate(target_fields) if is_selected[i]]
    selected_types.extend(irrelevant_types)

    new_X = np.hstack((_X, X[:, irrelevant_fields]))
    new_feature_types = selected_types
    output_datanode = DataNode((new_X, y), new_feature_types, input_datanode.task_type)
    output_datanode.trans_hist = input_datanode.trans_hist.copy()
    output_datanode.trans_hist.append(self.type)
    self.target_fields = target_fields.copy()

    return output_datanode
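# Illustrative, self-contained sketch (not from the source): the same
# LinearSVC + SelectFromModel pattern used by `operate` above, on synthetic
# data. With prefit=True the selector wraps an already-fitted estimator, and
# threshold='mean' keeps the columns whose importance (|coef_| for a linear
# model) exceeds the mean importance.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

X_demo, y_demo = make_classification(n_samples=200, n_features=20, random_state=0)
svc = LinearSVC(C=1.0, random_state=0).fit(X_demo, y_demo)
selector = SelectFromModel(svc, prefit=True, threshold='mean')
print(selector.transform(X_demo).shape, selector.get_support().sum())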
def fit(self, X, Y):
    from sklearn.svm import LinearSVR

    # In case of a nested loss: unpack the {'loss': ..., 'dual': ...} dict.
    if isinstance(self.loss, dict):
        combination = self.loss
        self.loss = combination['loss']
        self.dual = combination['dual']

    self.epsilon = float(self.epsilon)
    self.C = float(self.C)
    self.tol = float(self.tol)
    self.dual = check_for_bool(self.dual)
    self.fit_intercept = check_for_bool(self.fit_intercept)
    self.intercept_scaling = float(self.intercept_scaling)

    self.estimator = LinearSVR(epsilon=self.epsilon,
                               loss=self.loss,
                               dual=self.dual,
                               tol=self.tol,
                               C=self.C,
                               fit_intercept=self.fit_intercept,
                               intercept_scaling=self.intercept_scaling,
                               random_state=self.random_state)
    self.estimator.fit(X, Y)
    return self
def operate(self, input_datanode, target_fields):
    from sklearn.preprocessing import PolynomialFeatures

    X, y = input_datanode.data
    X_new = X[:, target_fields]
    ori_length = X_new.shape[1]

    # Skip high-dimensional inputs: the polynomial expansion would explode.
    if X_new.shape[1] > 100:
        return X_new.copy()

    if not self.model:
        self.degree = int(self.degree)
        self.interaction_only = check_for_bool(self.interaction_only)
        self.include_bias = check_for_bool(self.include_bias)

        self.model = PolynomialFeatures(degree=self.degree,
                                        interaction_only=self.interaction_only,
                                        include_bias=self.include_bias)
        self.model.fit(X_new)

    _X = self.model.transform(X_new)
    # Output columns are ordered [bias, original features, higher-order
    # terms]. For a single input column only the bias is dropped; for
    # multi-column input the original features are dropped as well, so
    # only the newly generated terms are returned.
    if ori_length == 1:
        _X = _X[:, 1:]
    else:
        _X = _X[:, ori_length + 1:]

    return _X
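# Illustrative sketch (not from the source): the column layout that explains
# the slicing in `operate` above. With include_bias=True, PolynomialFeatures
# orders its output as [bias, original features, higher-order terms], so
# skipping the first ori_length + 1 columns keeps only the new terms.
# (get_feature_names_out requires scikit-learn >= 1.0.)
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X_demo = np.array([[1.0, 2.0], [3.0, 4.0]])
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
X_poly = poly.fit_transform(X_demo)
print(poly.get_feature_names_out())     # ['1' 'x0' 'x1' 'x0^2' 'x0 x1' 'x1^2']
print(X_poly[:, X_demo.shape[1] + 1:])  # only the degree-2 columns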
def operate(self, input_datanode, target_fields=None):
    X, y = input_datanode.data
    if self.model is None:
        from sklearn.decomposition import FastICA
        self.whiten = check_for_bool(self.whiten)
        if check_none(self.n_components):
            self.n_components = None
        else:
            self.n_components = int(self.n_components)

        self.model = FastICA(n_components=self.n_components,
                             algorithm=self.algorithm,
                             fun=self.fun,
                             whiten=self.whiten,
                             random_state=self.random_state)
        # Make the RuntimeWarning an Exception!
        with warnings.catch_warnings():
            warnings.filterwarnings("error",
                                    message='array must not contain infs or NaNs')
            try:
                self.model.fit(X)
            except ValueError as e:
                if 'array must not contain infs or NaNs' in e.args[0]:
                    raise ValueError(
                        "Bug in scikit-learn: "
                        "https://github.com/scikit-learn/scikit-learn/pull/2738")
                raise  # re-raise any other ValueError instead of swallowing it

    X_new = self.model.transform(X)
    return X_new
def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
    from sklearn.ensemble import RandomForestClassifier

    if refit:
        self.estimator = None

    if self.estimator is None:
        self.n_estimators = int(self.n_estimators)
        if check_none(self.max_depth):
            self.max_depth = None
        else:
            self.max_depth = int(self.max_depth)

        self.min_samples_split = int(self.min_samples_split)
        self.min_samples_leaf = int(self.min_samples_leaf)
        self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)

        if self.max_features not in ("sqrt", "log2", "auto"):
            # A numeric max_features is interpreted as an exponent on the
            # number of columns.
            max_features = int(X.shape[1] ** float(self.max_features))
        else:
            max_features = self.max_features

        self.bootstrap = check_for_bool(self.bootstrap)

        if check_none(self.max_leaf_nodes):
            self.max_leaf_nodes = None
        else:
            self.max_leaf_nodes = int(self.max_leaf_nodes)

        self.min_impurity_decrease = float(self.min_impurity_decrease)

        # Initial fit with only the first increment of trees; warm_start
        # lets subsequent calls add trees instead of refitting from scratch.
        self.estimator = RandomForestClassifier(
            n_estimators=n_iter,
            criterion=self.criterion,
            max_features=max_features,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            bootstrap=self.bootstrap,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            random_state=self.random_state,
            n_jobs=self.n_jobs,
            class_weight=self.class_weight,
            warm_start=True)
    else:
        self.estimator.n_estimators += n_iter
        self.estimator.n_estimators = min(self.estimator.n_estimators,
                                          self.n_estimators)

    self.estimator.fit(X, y, sample_weight=sample_weight)
    return self
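# Illustrative sketch (not from the source): the warm-start mechanism that
# `iterative_fit` above relies on. Increasing n_estimators on a warm-started
# forest trains only the newly added trees on the next fit() call.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=300, random_state=0)
rf = RandomForestClassifier(n_estimators=10, warm_start=True, random_state=0)
rf.fit(X_demo, y_demo)
rf.n_estimators += 10
rf.fit(X_demo, y_demo)      # fits only the 10 new trees
print(len(rf.estimators_))  # 20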
def fit(self, X, Y):
    import sklearn.svm
    import sklearn.multiclass

    # In case of a nested penalty: unpack the
    # {'penalty': ..., 'loss': ..., 'dual': ...} dict.
    if isinstance(self.penalty, dict):
        combination = self.penalty
        self.penalty = combination['penalty']
        self.loss = combination['loss']
        self.dual = combination['dual']

    self.C = float(self.C)
    self.tol = float(self.tol)
    self.dual = check_for_bool(self.dual)
    self.fit_intercept = check_for_bool(self.fit_intercept)
    self.intercept_scaling = float(self.intercept_scaling)
    if check_none(self.class_weight):
        self.class_weight = None

    estimator = sklearn.svm.LinearSVC(penalty=self.penalty,
                                      loss=self.loss,
                                      dual=self.dual,
                                      tol=self.tol,
                                      C=self.C,
                                      class_weight=self.class_weight,
                                      fit_intercept=self.fit_intercept,
                                      intercept_scaling=self.intercept_scaling,
                                      multi_class=self.multi_class,
                                      random_state=self.random_state)

    # A 2-D label-indicator Y means multilabel: wrap in one-vs-rest.
    if len(Y.shape) == 2 and Y.shape[1] > 1:
        self.estimator = sklearn.multiclass.OneVsRestClassifier(estimator, n_jobs=1)
    else:
        self.estimator = estimator
    self.estimator.fit(X, Y)
    return self
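# Illustrative sketch (not from the source): the multilabel path taken when Y
# is a 2-D label-indicator matrix. OneVsRestClassifier fits one binary
# LinearSVC per label column and predicts an indicator matrix back.
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
X_demo = rng.randn(60, 4)
Y_demo = (rng.rand(60, 3) > 0.5).astype(int)   # shape (60, 3): 3 labels
clf = OneVsRestClassifier(LinearSVC(random_state=0), n_jobs=1).fit(X_demo, Y_demo)
print(clf.predict(X_demo[:2]))                 # 2-D indicator predictions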
def fit(self, X, y, sample_weight=None):
    from sklearn.ensemble import ExtraTreesClassifier

    self.bootstrap = check_for_bool(self.bootstrap)
    self.estimator = ExtraTreesClassifier(n_estimators=self.n_estimators,
                                          max_leaf_nodes=None,
                                          criterion=self.criterion,
                                          max_features=self.max_features,
                                          min_samples_split=self.min_samples_split,
                                          min_samples_leaf=self.min_samples_leaf,
                                          max_depth=None,
                                          bootstrap=self.bootstrap,
                                          random_state=self.random_state,
                                          n_jobs=self.n_jobs)
    self.estimator.fit(X, y, sample_weight=sample_weight)
    return self
def fit(self, X, Y):
    import sklearn.svm

    # Nested kernel: a (name, params) tuple carries the conditional
    # hyperparameters of the chosen kernel.
    if isinstance(self.kernel, tuple):
        nested_kernel = self.kernel
        self.kernel = nested_kernel[0]
        if self.kernel == 'poly':
            self.degree = nested_kernel[1]['degree']
            self.coef0 = nested_kernel[1]['coef0']
        elif self.kernel == 'sigmoid':
            self.coef0 = nested_kernel[1]['coef0']

    self.C = float(self.C)
    self.degree = 3 if self.degree is None else int(self.degree)
    self.gamma = 0.0 if self.gamma is None else float(self.gamma)
    self.coef0 = 0.0 if self.coef0 is None else float(self.coef0)
    self.tol = float(self.tol)
    self.max_iter = int(self.max_iter)  # SVC expects an integer max_iter
    self.shrinking = check_for_bool(self.shrinking)
    if check_none(self.class_weight):
        self.class_weight = None

    self.estimator = sklearn.svm.SVC(C=self.C,
                                     kernel=self.kernel,
                                     degree=self.degree,
                                     gamma=self.gamma,
                                     coef0=self.coef0,
                                     shrinking=self.shrinking,
                                     tol=self.tol,
                                     class_weight=self.class_weight,
                                     max_iter=self.max_iter,
                                     random_state=self.random_state,
                                     decision_function_shape='ovr')
    self.estimator.fit(X, Y)
    return self
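# Illustrative sketch (assumption, not from the source): the nested-kernel
# encoding this `fit` unpacks. A (name, params) tuple pairs the kernel with
# the hyperparameters that only exist for that kernel, e.g. as emitted by a
# conditional configuration space; the values here are made up.
nested_kernel = ('poly', {'degree': 3, 'coef0': 0.5})
kernel = nested_kernel[0]
if kernel == 'poly':
    degree, coef0 = nested_kernel[1]['degree'], nested_kernel[1]['coef0']
elif kernel == 'sigmoid':
    coef0 = nested_kernel[1]['coef0']
print(kernel, degree, coef0)  # poly 3 0.5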
def operate(self, input_datanode, target_fields=None):
    X, y = input_datanode.data
    if self.model is None:
        import sklearn.decomposition
        # A float in (0, 1) makes PCA keep enough components to explain
        # that fraction of the variance.
        n_components = float(self.keep_variance)
        self.whiten = check_for_bool(self.whiten)
        self.model = sklearn.decomposition.PCA(n_components=n_components,
                                               whiten=self.whiten,
                                               copy=True)
        self.model.fit(X)

        if not np.isfinite(self.model.components_).all():
            raise ValueError("PCA found non-finite components.")

    X_new = self.model.transform(X)
    return X_new
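# Illustrative sketch (not from the source): passing a float in (0, 1) as
# n_components, as `operate` does with keep_variance, makes PCA keep the
# smallest number of components whose cumulative explained variance reaches
# that fraction.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 10) @ rng.randn(10, 10)
pca = PCA(n_components=0.95).fit(X_demo)
print(pca.n_components_, pca.explained_variance_ratio_.cumsum()[-1])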
def fit(self, X, Y):
    from sklearn.svm import SVR

    # Nested kernel: a (name, params) tuple carries the conditional
    # hyperparameters of the chosen kernel.
    if isinstance(self.kernel, tuple):
        nested_kernel = self.kernel
        self.kernel = nested_kernel[0]
        if self.kernel == 'poly':
            self.degree = nested_kernel[1]['degree']
            self.coef0 = nested_kernel[1]['coef0']
        elif self.kernel == 'sigmoid':
            self.coef0 = nested_kernel[1]['coef0']

    self.epsilon = float(self.epsilon)
    self.C = float(self.C)
    self.degree = 3 if self.degree is None else int(self.degree)
    self.gamma = 0.0 if self.gamma is None else float(self.gamma)
    self.coef0 = 0.0 if self.coef0 is None else float(self.coef0)
    self.tol = float(self.tol)
    self.max_iter = int(self.max_iter)  # SVR expects an integer max_iter
    self.shrinking = check_for_bool(self.shrinking)

    self.estimator = SVR(epsilon=self.epsilon,
                         C=self.C,
                         kernel=self.kernel,
                         degree=self.degree,
                         gamma=self.gamma,
                         coef0=self.coef0,
                         shrinking=self.shrinking,
                         tol=self.tol,
                         max_iter=self.max_iter)
    self.estimator.fit(X, Y)
    return self
def operate(self, input_datanode: DataNode, target_fields=None):
    from sklearn.ensemble import RandomTreesEmbedding

    X, y = input_datanode.data
    if target_fields is None:
        target_fields = collect_fields(input_datanode.feature_types,
                                       self.input_type)
    X_new = X[:, target_fields]

    if not self.model:
        self.n_estimators = int(self.n_estimators)
        if check_none(self.max_depth):
            self.max_depth = None
        else:
            self.max_depth = int(self.max_depth)
        if X.shape[0] > 5000:
            # Cap tree depth on large datasets; an unlimited (None) depth
            # is treated as 4 here, since min(4, None) is a TypeError.
            self.max_depth = 4 if self.max_depth is None else min(4, self.max_depth)

        self.min_samples_split = int(self.min_samples_split)
        self.min_samples_leaf = int(self.min_samples_leaf)
        if check_none(self.max_leaf_nodes):
            self.max_leaf_nodes = None
        else:
            self.max_leaf_nodes = int(self.max_leaf_nodes)
        self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)
        self.bootstrap = check_for_bool(self.bootstrap)

        self.model = RandomTreesEmbedding(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            max_leaf_nodes=self.max_leaf_nodes,
            sparse_output=self.sparse_output,
            n_jobs=self.n_jobs,
            random_state=self.random_state)
        self.model.fit(X_new)

    _X = self.model.transform(X_new).toarray()
    return _X
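# Illustrative sketch (not from the source): what the transform above
# produces. RandomTreesEmbedding one-hot encodes the leaf each sample lands
# in, per tree, yielding a sparse binary matrix that `operate` densifies
# with .toarray().
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomTreesEmbedding

X_demo, _ = make_classification(n_samples=100, n_features=5, random_state=0)
embedder = RandomTreesEmbedding(n_estimators=10, max_depth=3, random_state=0)
X_emb = embedder.fit_transform(X_demo)     # scipy.sparse matrix
print(X_emb.shape, X_emb.toarray().max())  # one column per leaf; entries 0/1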
def operate(self, input_datanode, target_fields=None):
    X, y = input_datanode.data

    # Skip the heavy FastICA computation on large inputs; the decision is
    # made on the first call and remembered via skip_flag afterwards.
    if X.shape[0] > 10000 or X.shape[1] > 200:
        if not self.pre_trained:
            self.skip_flag = True
    self.pre_trained = True
    if self.skip_flag:
        return X.copy()

    if self.model is None:
        from sklearn.decomposition import FastICA
        self.whiten = check_for_bool(self.whiten)
        if check_none(self.n_components):
            self.n_components = None
        else:
            self.n_components = int(self.n_components)
        if self.n_components is not None:
            # FastICA cannot extract more components than samples.
            self.n_components = min(self.n_components, X.shape[0])

        self.model = FastICA(n_components=self.n_components,
                             algorithm=self.algorithm,
                             fun=self.fun,
                             whiten=self.whiten,
                             random_state=self.random_state)
        # Make the RuntimeWarning an Exception!
        with warnings.catch_warnings():
            warnings.filterwarnings("error",
                                    message='array must not contain infs or NaNs')
            try:
                self.model.fit(X)
            except ValueError as e:
                if 'array must not contain infs or NaNs' in e.args[0]:
                    raise ValueError(
                        "Bug in scikit-learn: "
                        "https://github.com/scikit-learn/scikit-learn/pull/2738")
                raise e

    X_new = self.model.transform(X)
    return X_new
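# Illustrative sketch (not from the source): the warnings-as-errors guard used
# in both FastICA variants above. Promoting the matching warning to an
# exception lets the caller detect a numerically degenerate fit instead of
# silently proceeding.
import warnings

with warnings.catch_warnings():
    warnings.filterwarnings("error", message='array must not contain infs or NaNs')
    try:
        warnings.warn('array must not contain infs or NaNs', RuntimeWarning)
    except RuntimeWarning as w:
        print('caught:', w)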
def operate(self, input_datanode, target_fields=None, sample_weight=None):
    from sklearn.feature_selection import SelectFromModel

    feature_types = input_datanode.feature_types
    X, y = input_datanode.data
    if target_fields is None:
        target_fields = collect_fields(feature_types, self.input_type)
    X_new = X[:, target_fields]

    n_fields = len(feature_types)
    irrelevant_fields = list(range(n_fields))
    for field_id in target_fields:
        irrelevant_fields.remove(field_id)

    if self.model is None:
        from sklearn.ensemble import ExtraTreesClassifier

        if check_none(self.max_leaf_nodes):
            self.max_leaf_nodes = None
        else:
            self.max_leaf_nodes = int(self.max_leaf_nodes)
        if check_none(self.max_depth):
            self.max_depth = None
        else:
            self.max_depth = int(self.max_depth)

        self.bootstrap = check_for_bool(self.bootstrap)
        self.n_jobs = int(self.n_jobs)
        self.min_impurity_decrease = float(self.min_impurity_decrease)
        self.min_samples_leaf = int(self.min_samples_leaf)
        self.min_samples_split = int(self.min_samples_split)
        self.verbose = int(self.verbose)
        # A numeric max_features is interpreted as an exponent on the
        # number of columns.
        max_features = int(X_new.shape[1] ** float(self.max_features))

        estimator = ExtraTreesClassifier(
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            bootstrap=self.bootstrap,
            max_features=max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state,
            class_weight=self.class_weight)
        estimator.fit(X_new, y, sample_weight=sample_weight)
        # Keep the features whose importance exceeds the mean importance.
        self.model = SelectFromModel(estimator=estimator, threshold='mean', prefit=True)

    _X = self.model.transform(X_new)
    # get_support() is positional over the columns fed to the selector,
    # so pair it with enumerate(target_fields) rather than indexing it
    # with the original field ids.
    is_selected = self.model.get_support()
    irrelevant_types = [feature_types[idx] for idx in irrelevant_fields]
    selected_types = [feature_types[fid]
                      for i, fid in enumerate(target_fields) if is_selected[i]]
    selected_types.extend(irrelevant_types)

    new_X = np.hstack((_X, X[:, irrelevant_fields]))
    new_feature_types = selected_types
    output_datanode = DataNode((new_X, y), new_feature_types, input_datanode.task_type)
    output_datanode.trans_hist = input_datanode.trans_hist.copy()
    output_datanode.trans_hist.append(self.type)
    self.target_fields = target_fields.copy()

    return output_datanode
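# Illustrative sketch (not from the source): why get_support() must be paired
# with enumerate(target_fields) in the two selectors above. The mask is
# positional over the columns fed to the selector, not over the original
# field ids.
import numpy as np

target_fields = [3, 5, 7]                    # original column ids
is_selected = np.array([True, False, True])  # one flag per selected-input column
kept = [fid for i, fid in enumerate(target_fields) if is_selected[i]]
print(kept)  # [3, 7]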