def __init__(self, columns=None, remove=None, skip_errors=False,
             single=False, fLOG=None):
    """
    Stores the constructor arguments on the instance.

    @param      columns         column selection; a value which is neither
                                a list nor None is wrapped into a
                                one-element list
    @param      remove          modalities to remove
    @param      skip_errors     skip when a new category appears (no 1)
    @param      single          use a single column per category,
                                do not multiply them for each value
    @param      fLOG            logging function

    The logging function displays a message when a new dense and big
    matrix is created when it should be sparse. A sparse matrix should
    be allocated instead.
    """
    BaseEstimator.__init__(self)
    TransformerMixin.__init__(self)
    if columns is None or isinstance(columns, list):
        selection = columns
    else:
        # A single column specification is normalised to a list.
        selection = [columns]
    self._p_columns = selection
    self._p_skip_errors = skip_errors
    self._p_remove = remove
    self._p_single = single
    self.fLOG = fLOG
def __init__(self, on=None, ixname='ix', source_suffix='source',
             target_suffix='target', scoresuffix='score', **kwargs):
    """
    Args:
        on (str): name of the column on which to do the join, may be None
        ixname (str): name of the index, default 'ix'
        source_suffix (str): suffix for the source (left) dataframe,
            default 'source'
        target_suffix (str): suffix for the target (right) dataframe,
            default 'target'
        scoresuffix (str): suffix attached to the ``on`` column name to
            build the output column name, default 'score'
        **kwargs: accepted but not used by this constructor
    """
    TransformerMixin.__init__(self)
    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    # The three derived index names are computed by the shared helper.
    derived_names = concatixnames(
        ixname=self.ixname,
        source_suffix=self.source_suffix,
        target_suffix=self.target_suffix,
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names
    self.on = on
    self.scoresuffix = scoresuffix
    # Without a join column the output column is just the score suffix.
    self.outcol = (
        self.scoresuffix if self.on is None
        else self.on + '_' + self.scoresuffix
    )
    self.fitted = False
def __init__(self, clustermixin=None, n_simple=10, n_hard=10, ixname='ix',
             source_suffix='source', target_suffix='target'):
    """
    Args:
        clustermixin (ClusterMixin): clustering model; if None, a
            KBinsCluster with 10 clusters is created
        n_simple (int): number of simple questions per cluster
        n_hard (int): number of hard questions per cluster
        ixname (str): default 'ix'
        source_suffix (str): default 'source'
        target_suffix (str): default 'target'
    """
    TransformerMixin.__init__(self)
    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    derived_names = concatixnames(
        ixname=self.ixname,
        source_suffix=self.source_suffix,
        target_suffix=self.target_suffix,
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names
    # Fall back to the default clustering when none is supplied.
    self._clustermixin = (
        KBinsCluster(n_clusters=10) if clustermixin is None else clustermixin
    )
    self._simplequestions = SimpleQuestions(n_questions=n_simple)
    self._hardquestions = HardQuestions(n_questions=n_hard)
    self._clusterclassifier = ClusterClassifier(
        ixname=self.ixname,
        source_suffix=self.source_suffix,
        target_suffix=self.target_suffix,
    )
def __init__(self, transformer=None, estimator=None, normalize=True,
             keep_tsne_outputs=False, **kwargs):
    """
    :param transformer: `TSNE` by default
    :param estimator: `KNeighborsRegressor` by default
    :param normalize: normalizes the outputs, centers and normalizes
        the output of the *t-SNE* and applies that same
        normalization to the prediction of the estimator
    :param keep_tsne_outputs: if True, keep raw outputs of
        :epkg:`TSNE` is stored in member *tsne_outputs_*
    :param kwargs: sent to :meth:`set_params
        <mlinsights.mlmodel.tsne_transformer.PredictableTSNE.set_params>`,
        see its documentation to understand how to specify parameters

    Raises AttributeError if *transformer* has no ``fit_transform``
    method or *estimator* has no ``predict`` method.
    """
    TransformerMixin.__init__(self)
    BaseEstimator.__init__(self)
    estimator = KNeighborsRegressor() if estimator is None else estimator
    transformer = TSNE() if transformer is None else transformer
    self.estimator = estimator
    self.transformer = transformer
    self.keep_tsne_outputs = keep_tsne_outputs
    # Duck-typing checks: only the methods used later are required.
    transformer_ok = hasattr(transformer, "fit_transform")
    if not transformer_ok:
        raise AttributeError(
            "Transformer {} does not have a 'fit_transform' "
            "method.".format(type(transformer)))
    estimator_ok = hasattr(estimator, "predict")
    if not estimator_ok:
        raise AttributeError(
            "Estimator {} does not have a 'predict' method.".format(
                type(estimator)))
    self.normalize = normalize
    if kwargs:
        self.set_params(**kwargs)
def __init__(self, estimator, method=None, copy_estimator=True):
    """
    @param      estimator       estimator to wrap in a transformer
    @param      method          name of the estimator method to expose;
                                if None, the first one found among
                                *transform*, *predict_proba*,
                                *decision_function*, *predict* is used
    @param      copy_estimator  copy the model instead of taking a
                                reference (only stored by this constructor)

    Raises AttributeError when no suitable method can be found or when
    the requested *method* does not exist on *estimator*.
    """
    TransformerMixin.__init__(self)
    BaseEstimator.__init__(self)
    self.estimator = estimator
    self.copy_estimator = copy_estimator
    if method is None:
        # Probe the estimator in order of preference.
        candidates = ("transform", "predict_proba",
                      "decision_function", "predict")
        for candidate in candidates:
            if hasattr(estimator, candidate):
                method = candidate
                break
        else:
            raise AttributeError(
                "Cannot find a method transform, predict_proba, decision_function, predict in object {}"
                .format(type(estimator)))
    method_exists = hasattr(estimator, method)
    if not method_exists:
        raise AttributeError("Cannot find method '{}' in object {}".format(
            method, type(estimator)))
    self.method = method
def __init__(self, transformer=None, estimator=None, normalize=True,
             keep_tsne_outputs=False, **kwargs):
    """
    :param transformer: object with a ``fit_transform`` method,
        `TSNE()` when None
    :param estimator: object with a ``predict`` method,
        `KNeighborsRegressor()` when None
    :param normalize: stored as attribute *normalize*
    :param keep_tsne_outputs: stored as attribute *keep_tsne_outputs*
    :param kwargs: when not empty, forwarded to ``set_params``

    Raises AttributeError if one of the two required methods is missing.
    """
    TransformerMixin.__init__(self)
    BaseEstimator.__init__(self)
    estimator = KNeighborsRegressor() if estimator is None else estimator
    transformer = TSNE() if transformer is None else transformer
    self.estimator = estimator
    self.transformer = transformer
    self.keep_tsne_outputs = keep_tsne_outputs
    can_fit_transform = hasattr(transformer, "fit_transform")
    if not can_fit_transform:
        raise AttributeError(
            "transformer {} does not have a 'fit_transform' "
            "method.".format(type(transformer)))
    can_predict = hasattr(estimator, "predict")
    if not can_predict:
        raise AttributeError("estimator {} does not have a 'predict' "
                             "method.".format(type(estimator)))
    self.normalize = normalize
    if kwargs:
        self.set_params(**kwargs)
def __init__(self, onnx_bytes, output_name=None, enforce_float32=True):
    """
    :param onnx_bytes: serialized model, must be an instance of *bytes*
    :param output_name: stored as attribute *output_name*
    :param enforce_float32: stored as attribute *enforce_float32*

    Raises TypeError if *onnx_bytes* is not *bytes*.
    """
    BaseEstimator.__init__(self)
    TransformerMixin.__init__(self)
    self.onnx_bytes = onnx_bytes
    self.output_name = output_name
    self.enforce_float32 = enforce_float32
    # The type check runs after the attributes are stored.
    is_bytes = isinstance(onnx_bytes, bytes)
    if not is_bytes:
        raise TypeError("onnx_bytes must be bytes to be pickled.")
def __init__(
        self,
        kind='poly',
        poly_degree=2,
        poly_interaction_only=False,
        poly_include_bias=True,
):
    """
    Stores every argument under the attribute of the same name.

    :param kind: default ``'poly'``
    :param poly_degree: default 2
    :param poly_interaction_only: default False
    :param poly_include_bias: default True
    """
    BaseEstimator.__init__(self)
    TransformerMixin.__init__(self)
    # NOTE(review): the poly_* names suggest options of a polynomial
    # feature expansion -- confirm against the code that reads them.
    self.kind = kind
    self.poly_degree = poly_degree
    self.poly_include_bias = poly_include_bias
    self.poly_interaction_only = poly_interaction_only
def __init__(self, vocab, merges, padding_length=-1, opset=None):
    """
    Stores every argument under the attribute of the same name.

    :param vocab: stored as attribute *vocab*
    :param merges: stored as attribute *merges*
    :param padding_length: default -1
    :param opset: default None

    Raises ImportError when ``get_library_path`` is None, reported as
    *onnxruntime_extensions* not being installed.
    """
    BaseEstimator.__init__(self)
    TransformerMixin.__init__(self)
    self.vocab = vocab
    self.merges = merges
    self.padding_length = padding_length
    self.opset = opset
    extensions_missing = get_library_path is None
    if extensions_missing:
        raise ImportError("onnxruntime_extensions is not installed.")
def __init__(self, n_questions=10):
    """
    Args:
        n_questions (int): number of questions to be asked for each cluster
    """
    TransformerMixin.__init__(self)
    self.n_questions = n_questions
    # Both attributes start as None; this constructor never fills them.
    self.n_clusters = None
    self.clusters = None
def __init__(self, scaler_model, clf_model, hmm_model):
    """
    Stores the three models and builds an ordinal ``KBinsDiscretizer``
    whose bin edges are set by hand instead of being fitted.

    :param scaler_model: stored as attribute *scaler_model_*
    :param clf_model: stored as attribute *clf_model_*
    :param hmm_model: stored as attribute *hmm_model_*
    """
    # Fixed edges; the names suggest they split a probability.
    edges = np.array([-np.inf, 0.1, 0.3, 0.5, 0.7, 0.9, np.inf])
    discretizer = KBinsDiscretizer(encode='ordinal')
    # NOTE(review): n_bins_ receives the number of edges (7) although
    # 7 edges delimit 6 bins -- confirm this is intended.
    discretizer.n_bins_ = np.array([edges.shape[0]])
    # One row of edges: a single feature is discretized.
    discretizer.bin_edges_ = edges.reshape(1, -1)

    TransformerMixin.__init__(self)
    BaseEstimator.__init__(self)
    self.scaler_model_ = scaler_model
    self.clf_model_ = clf_model
    self.hmm_model_ = hmm_model
    self.bins_discretizer_ = discretizer
def __init__(self, name, fct, kwargs):
    """
    @param      name        function name, stored as *name_fct*
    @param      fct         python function, stored as *_fct*
    @param      kwargs      function parameters, stored as *kwargs*
    """
    BaseEstimator.__init__(self)
    TransformerMixin.__init__(self)
    # Attribute names differ from the argument names for name and fct.
    self.name_fct = name
    self._fct = fct
    self.kwargs = kwargs
def __init__(self, onnx_bytes, output_name=None, enforce_float32=True,
             runtime='python', change_batch_size=None, reshape=False):
    """
    :param onnx_bytes: serialized model, or an object exposing
        ``SerializeToString()`` whose result is stored instead
    :param output_name: stored as attribute *output_name*
    :param enforce_float32: stored as attribute *enforce_float32*
    :param runtime: stored as attribute *runtime*, default ``'python'``
    :param change_batch_size: stored as attribute *change_batch_size*
    :param reshape: stored as attribute *reshape*
    """
    BaseEstimator.__init__(self)
    TransformerMixin.__init__(self)
    if hasattr(onnx_bytes, 'SerializeToString'):
        # Model objects are serialized so only their bytes are kept.
        serialized = onnx_bytes.SerializeToString()
    else:
        serialized = onnx_bytes
    self.onnx_bytes = serialized
    self.output_name = output_name
    self.enforce_float32 = enforce_float32
    self.runtime = runtime
    self.change_batch_size = change_batch_size
    self.reshape = reshape
def __init__(self, onnx_bytes, output_name=None):
    """
    :param onnx_bytes: bytes
    :param output_name: requested output name or None to request all
        and have method *transform* to store all of them in a dataframe

    Raises TypeError if *onnx_bytes* is not an instance of *bytes*.
    """
    BaseEstimator.__init__(self)
    TransformerMixin.__init__(self)
    self.onnx_bytes = onnx_bytes
    self.output_name = output_name
    # The type check runs after the attributes are stored.
    is_bytes = isinstance(onnx_bytes, bytes)
    if not is_bytes:
        raise TypeError("onnx_bytes must be bytes to be pickled.")
def __init__(self, species: list = None, rcut: int = 6, nmax: int = 6,
             lmax: int = 8, rbf: str = "gto", sigma: float = 0.125,
             average: str = "inner", periodic: bool = True,
             convert: bool = True):
    r"""
    Initialize class

    Parameters
    ----------
    species : List
        list of elements. ``None`` (the default) is replaced by a new
        empty list, so instances never share one default list.
    rcut : float
        A cutoff for local region in angstroms. Should be bigger than 1
        angstrom
    nmax : int
        The number of radial basis functions.
    lmax : int
        The maximum degree of spherical harmonics.
    rbf : str
        The radial basis functions to use. The available options are:

        * "gto": Spherical gaussian type orbitals defined as
          :math:`g_{nl}(r) = \sum_{n'=1}^{n_\mathrm{max}}\,\beta_{nn'l} r^l e^{-\alpha_{n'l}r^2}`
        * "polynomial": Polynomial basis defined as
          :math:`g_{n}(r) = \sum_{n'=1}^{n_\mathrm{max}}\,\beta_{nn'} (r-r_\mathrm{cut})^{n'+2}`
    sigma : float
        The standard deviation of the gaussians used to expand the
        atomic density.
    average : str
        The averaging mode over the centers of interest. Valid options
        are:

        * "off": No averaging.
        * "inner": Averaging over sites before summing up the magnetic
          quantum numbers:
          :math:`p_{nn'l}^{Z_1,Z_2} \sim \sum_m (\frac{1}{n} \sum_i c_{nlm}^{i, Z_1})^{*} (\frac{1}{n} \sum_i c_{n'lm}^{i, Z_2})`
        * "outer": Averaging over the power spectrum of different sites:
          :math:`p_{nn'l}^{Z_1,Z_2} \sim \frac{1}{n} \sum_i \sum_m (c_{nlm}^{i, Z_1})^{*} (c_{n'lm}^{i, Z_2})`
    periodic : bool
        Set to true if you want the descriptor output to respect the
        periodicity of the atomic systems (see the pbc-parameter in the
        constructor of ase.Atoms).
    convert : bool
        If true convert pymatgen structures to ase.atoms
    """
    TransformerMixin.__init__(self)
    # A list passed by the caller is stored as-is (same object); only
    # the omitted default gets a fresh list per instance.
    self.species = [] if species is None else species
    self.rcut = rcut
    self.nmax = nmax
    self.lmax = lmax
    # Starts as None; this constructor does not build the descriptor.
    self.soap: Any = None
    self.rbf = rbf
    self.sigma = sigma
    self.average = average
    self.periodic = periodic
    self.convert = convert
def __init__(self, onnx_bytes, output_name=None, enforce_float32=True,
             runtime='onnxruntime1'):
    """
    :param onnx_bytes: serialized model, or an object exposing
        ``SerializeToString()`` whose result is stored instead
    :param output_name: stored as attribute *output_name*
    :param enforce_float32: stored as attribute *enforce_float32*
    :param runtime: stored as attribute *runtime*,
        default ``'onnxruntime1'``
    """
    BaseEstimator.__init__(self)
    TransformerMixin.__init__(self)
    if hasattr(onnx_bytes, 'SerializeToString'):
        # Model objects are serialized so only their bytes are kept.
        serialized = onnx_bytes.SerializeToString()
    else:
        serialized = onnx_bytes
    self.onnx_bytes = serialized
    self.output_name = output_name
    self.enforce_float32 = enforce_float32
    self.runtime = runtime
def __init__(self, on_source='source', on_target='target', compfunc=None,
             *args, **kwargs):
    """
    Stores the two column names and the comparison function.

    Args:
        on_source (str): stored as attribute ``left``
        on_target (str): stored as attribute ``right``
        compfunc (callable): comparison function, required
        *args: accepted but not used by this constructor
        **kwargs: accepted but not used by this constructor

    Raises:
        ValueError: if ``compfunc`` is None
        AssertionError: if ``compfunc`` is not callable
    """
    TransformerMixin.__init__(self)
    self.left = on_source
    self.right = on_target
    function_missing = compfunc is None
    if function_missing:
        raise ValueError('comparison function not provided with function', compfunc)
    # NOTE(review): this assert is skipped when Python runs with -O.
    assert callable(compfunc)
    self.compfunc = compfunc
def __init__(self, ixname='ix', source_suffix='source',
             target_suffix='target', usecols=None, **kwargs):
    """
    Args:
        ixname (str): name of the index, default 'ix'
        source_suffix (str): default 'source'
        target_suffix (str): default 'target'
        usecols: stored as attribute ``usecols``, default None
        **kwargs: accepted but not used by this constructor
    """
    TransformerMixin.__init__(self)
    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    # The three derived index names are computed by the shared helper.
    derived_names = concatixnames(
        ixname=self.ixname,
        source_suffix=self.source_suffix,
        target_suffix=self.target_suffix,
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names
    self.usecols = usecols
def __init__(self, ixname='ix', source_suffix='source',
             target_suffix='target'):
    """
    Args:
        ixname (str): name of the index, default 'ix'
        source_suffix (str): default 'source'
        target_suffix (str): default 'target'
    """
    TransformerMixin.__init__(self)
    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    derived_names = concatixnames(
        ixname=self.ixname,
        source_suffix=self.source_suffix,
        target_suffix=self.target_suffix,
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names
    # NOTE(review): this binds the pd.Index class itself, not an
    # instance -- confirm whether an empty index was intended.
    self.index = pd.Index
    # Empty placeholders; this constructor does not fill them.
    self.dfnum = pd.DataFrame()
    self.dfix = pd.DataFrame()
    self.num = None
def __init__(self, model, periods=1, freq='30min'):
    """Lags a dataset.

    Lags all features. Missing data is dropped for fitting, and replaced
    with the mean for predict.

    :model: wrapped model, asserted to be a ``BaseEstimator`` instance
    :periods: Number of timesteps to lag by
    :freq: stored as attribute ``freq``, default '30min'
    """
    # NOTE(review): this assert is skipped when Python runs with -O.
    assert isinstance(model, BaseEstimator), "`model` isn't a scikit-learn model"

    BaseEstimator.__init__(self)
    TransformerMixin.__init__(self)
    self.periods = periods
    self.freq = freq
    self.model = model
def __init__(
        self,
        normalizer=None,
        transformer=None,
        estimator=None,
        normalize=True,
        keep_tsne_outputs=False,
):
    """
    @param      normalizer          None by default
    @param      transformer         :epkg:`sklearn:manifold:TSNE`
                                    by default
    @param      estimator           :epkg:`sklearn:neural_network:MLPRegressor`
                                    by default
    @param      normalize           normalizes the outputs, centers and
                                    normalizes the output of the *t-SNE*
                                    and applies that same normalization
                                    to the prediction of the estimator
    @param      keep_tsne_outputs   if True, keep raw outputs of
                                    :epkg:`TSNE` is stored in member
                                    *tsne_outputs_*

    Raises AttributeError if *normalizer* (when given) has no
    ``transform`` method, if *transformer* has no ``fit_transform``
    method or if *estimator* has no ``predict`` method.
    """
    TransformerMixin.__init__(self)
    BaseEstimator.__init__(self)
    estimator = MLPRegressor() if estimator is None else estimator
    transformer = TSNE() if transformer is None else transformer
    self.estimator = estimator
    self.transformer = transformer
    self.normalizer = normalizer
    self.keep_tsne_outputs = keep_tsne_outputs
    # Duck-typing checks, in the order: normalizer, transformer, estimator.
    if not (normalizer is None or hasattr(normalizer, "transform")):
        raise AttributeError(
            "normalizer {} does not have a 'transform' method.".format(
                type(normalizer)))
    can_fit_transform = hasattr(transformer, "fit_transform")
    if not can_fit_transform:
        raise AttributeError(
            "transformer {} does not have a 'fit_transform' method.".
            format(type(transformer)))
    can_predict = hasattr(estimator, "predict")
    if not can_predict:
        raise AttributeError(
            "estimator {} does not have a 'predict' method.".format(
                type(estimator)))
    self.normalize = normalize
def _fit_transform_with_state_restore_check(transformer: TransformerMixin,
                                            X, **kwargs):
    """
    Fit-transform ``X`` and check a restored transformer gives the same output.

    The transformer is fitted with ``fit_transform(X, **kwargs)``, then
    passed to ``_get_state_and_restore``; the object returned by that
    helper must produce the same output (compared via ``tolist()``) when
    its ``transform`` is applied to ``X``.

    Returns the output of the original ``fit_transform`` call.
    Raises AssertionError if the two outputs differ.
    """
    expected = transformer.fit_transform(X, **kwargs)
    LOGGER.debug('transformed: %s', expected)
    LOGGER.debug('transformed.shape: %s', expected.shape)

    restored = _get_state_and_restore(transformer)
    actual = restored.transform(X)
    LOGGER.debug('restored_transformed: %s', actual)

    assert actual.tolist() == expected.tolist()
    return expected
def __init__(self, ixname='ix', source_suffix='source',
             target_suffix='target'):
    """
    Args:
        ixname (str): name of the index, default 'ix'
        source_suffix (str): default 'source'
        target_suffix (str): default 'target'
    """
    TransformerMixin.__init__(self)
    self.ixname = ixname
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    # The three derived index names are computed by the shared helper.
    derived_names = concatixnames(
        ixname=self.ixname,
        source_suffix=self.source_suffix,
        target_suffix=self.target_suffix,
    )
    self.ixnamesource, self.ixnametarget, self.ixnamepairs = derived_names
def transform(dataframe: pd.DataFrame, scaler: TransformerMixin) -> pd.DataFrame:
    """
    Add a ``features`` column holding the scaled numeric fields of each row.

    The columns 'preco', 'prazo', 'frete', 'latitude' and 'longitude' are
    selected with ``DataFrame.filter``, converted to a numpy array and
    passed to ``scaler.fit_transform`` (the scaler is re-fitted on every
    call). Each row of the result becomes one entry of the new column.

    The input dataframe is modified in place and also returned.
    """
    numeric_fields = ['preco', 'prazo', 'frete', 'latitude', 'longitude']
    raw_values = dataframe.filter(numeric_fields).to_numpy()
    scaled_rows = scaler.fit_transform(raw_values)
    # list() splits the scaled output into one entry per dataframe row.
    dataframe['features'] = list(scaled_rows)
    return dataframe
def __init__(
        self,
        elements: List,
        rcut: float = 10.1,
        stepSize: float = 0.1,
        sigma: float = 0.2,
):
    """
    Parameters
    ----------
    elements : list
        list of elements symbols
    rcut : float
        exclusive upper bound of the radial grid ``binRad``
    stepSize : float
        spacing of the radial grid ``binRad``
    sigma : float
        stored as attribute ``sigma``
    """
    TransformerMixin.__init__(self)
    self.elements = elements
    self.rdf_tup = calc_rdf_tup(elements)
    self.rcut = rcut
    self.stepSize = stepSize
    self.sigma = sigma
    # The radial grid always starts at 0.1 and stops before rcut.
    radial_grid = np.arange(0.1, self.rcut, self.stepSize)
    self.binRad = radial_grid
    self.numBins = len(self.binRad)
    self.numPairs = len(self.rdf_tup)
def apply_transform(scaler: TransformerMixin, x: ArrayLike) -> ArrayLike:
    """
    Apply ``scaler.transform`` to ``x`` and return the same kind of container.

    * ``ndarray``: passed directly to ``scaler.transform``.
    * ``Series``: values are transformed, index and name are kept.
    * ``DataFrame``: values are transformed, index and columns are kept.

    Raises TypeError for any other input type.
    """
    if isinstance(x, ndarray):
        return scaler.transform(x)
    if isinstance(x, Series):
        scaled_values = apply_transform(scaler, x.to_numpy())
        return Series(scaled_values, x.index, name=x.name)
    if isinstance(x, DataFrame):
        scaled_values = apply_transform(scaler, x.to_numpy())
        return DataFrame(scaled_values, x.index, x.columns)
    raise TypeError(f"Invalid type: {type(x).__name__}")
def _fit_transformer_with_progress_logging(transformer: TransformerMixin,
                                           X,
                                           logger: logging.Logger,
                                           message_prefix: str,
                                           unit: str,
                                           message_suffx: str = ': '):
    """
    Fit ``transformer`` on ``X`` while reporting progress through ``logger``.

    When ``transformer`` is a ``Pipeline`` whose only step is a
    ``FeatureUnion``, each transformer of the union is fitted separately
    on its own progress wrapper, labelled
    ``{message_prefix}.{name}{message_suffx}``; the pipeline itself is
    not fitted in that case. Otherwise ``transformer.fit`` is called once
    on a wrapper labelled ``message_prefix + message_suffx``.

    Every wrapper is built around the original ``X``. Rebinding ``X``
    inside the loop would nest the wrappers, so later transformers would
    also drive the progress output of the earlier ones.

    Returns None.
    """
    if isinstance(transformer, Pipeline):
        steps = transformer.steps
        if len(steps) == 1 and isinstance(steps[0][1], FeatureUnion):
            feature_union = steps[0][1]
            for name, union_transformer in feature_union.transformer_list:
                # Always wrap the original X, never a previous wrapper.
                wrapped = logging_tqdm(
                    iterable=X,
                    logger=logger,
                    desc=f'{message_prefix}.{name}{message_suffx}',
                    unit=unit)
                union_transformer.fit(wrapped)
            return
    wrapped = logging_tqdm(iterable=X,
                           logger=logger,
                           desc=message_prefix + message_suffx,
                           unit=unit)
    transformer.fit(wrapped)
def __init__(self, model, nbest_size=1, alpha=0.5, reverse=False,
             add_bos=False, add_eos=False, opset=None):
    """
    :param model: *bytes*, stored unchanged as *model_b64*, or an object
        exposing ``tolist()`` that yields integers in ``range(256)``
        (presumably a numpy uint8 array -- confirm against callers),
        which is base64-encoded before being stored
    :param nbest_size: stored as attribute *nbest_size*
    :param alpha: stored as attribute *alpha*
    :param reverse: stored as attribute *reverse*
    :param add_bos: stored as attribute *add_bos*
    :param add_eos: stored as attribute *add_eos*
    :param opset: stored as attribute *opset*

    Raises ImportError when ``get_library_path`` is None, reported as
    *onnxruntime_extensions* not being installed.
    """
    BaseEstimator.__init__(self)
    TransformerMixin.__init__(self)
    if isinstance(model, bytes):
        self.model_b64 = model
    else:
        # b64encode needs a bytes-like object and raises TypeError on a
        # plain list, so the integers are packed into bytes first.
        raw = bytes(model.tolist())
        self.model_b64 = base64.b64encode(raw)
    self.nbest_size = nbest_size
    self.alpha = alpha
    self.reverse = reverse
    self.add_bos = add_bos
    self.add_eos = add_eos
    self.opset = opset
    if get_library_path is None:
        raise ImportError("onnxruntime_extensions is not installed.")
def _infer_feature_names(
        input_feature_names: Sequence[str],
        output_num_cols: int,
        trans_name: str,
        transformer: TransformerMixin
) -> Sequence[str]:
    """
    Build names for the ``output_num_cols`` columns produced by ``transformer``.

    Resolution order:

    1. ``transformer.get_feature_names(input_feature_names)`` if it works.
    2. A ``StandardScaler``/``MinMaxScaler``/``RobustScaler`` with as many
       outputs as inputs maps one-to-one: ``trans_name(input)``.
    3. A single input gives ``trans_name(input)`` for one output column
       and ``trans_name(input)[i]`` for several.
    4. Otherwise a warning is emitted and the ``repr`` of all the input
       names is used as the base name, in the same form as in 3.

    The last step is computed directly. Recursing with the ``repr``
    string in place of the name sequence never reached a terminating
    branch, because a string is itself a sequence of characters.
    """
    try:
        # polynomial features will add ^1, ^2, etc.
        # one_hot_encoder will add labels
        # TODO: this breaks the mapping between names in get/set_params and elsewhere feature-names. I think that's ok
        return transformer.get_feature_names(input_feature_names)
    except (TypeError, AttributeError, NotImplementedError):
        pass

    num_inputs = len(input_feature_names)
    if output_num_cols == num_inputs and isinstance(
            transformer, (StandardScaler, MinMaxScaler, RobustScaler)):
        return [f"{trans_name}({fname})" for fname in input_feature_names]

    if num_inputs == 1:
        base_name = input_feature_names[0]
    else:
        # can't assume 1-1 mapping. gotta wait for sklearn to support get_feature_names on everything
        # https://github.com/scikit-learn/scikit-learn/pull/12627
        # TODO: maybe somehow support passing aliases?
        warn(f"Unable to infer feature-names for {trans_name}, forced to concatenate.")
        base_name = repr(input_feature_names)

    if output_num_cols == 1:
        return [f"{trans_name}({base_name})"]
    return [f"{trans_name}({base_name})[{i}]" for i in range(output_num_cols)]
def __init__(self):
    """Run the ``BaseEstimator`` then ``TransformerMixin`` initialisers."""
    for base in (BaseEstimator, TransformerMixin):
        base.__init__(self)