def __init__(
    self,
    client=None,
    index_pattern=None,
    display_names=None,
    index_field=None,
    to_copy=None,
) -> None:
    """
    Initialise the instance from connection arguments or as a copy.

    Parameters
    ----------
    client:
        Passed to ``ensure_es_client`` to obtain the client.
        Ignored when ``to_copy`` is given.
    index_pattern:
        Stored as-is and handed to ``FieldMappings``.
        Ignored when ``to_copy`` is given.
    display_names:
        Handed to ``FieldMappings``. Ignored when ``to_copy`` is given.
    index_field:
        Handed to ``Index``. Ignored when ``to_copy`` is given.
    to_copy:
        An existing instance to copy. Its client and index pattern are
        shared by reference, a new ``Index`` is built on its
        ``es_index_field``, and its operations and mappings are deep-copied.
    """
    if to_copy is None:
        # Fresh instance: resolve the client and load the field mappings.
        self._client = ensure_es_client(client)
        self._index_pattern = index_pattern
        # Mappings are fetched once and kept so that types returned from
        # Elasticsearch can be mapped to pandas datatypes.
        self._mappings: FieldMappings = FieldMappings(
            client=self._client,
            index_pattern=self._index_pattern,
            display_names=display_names,
        )
        self._index = Index(self, index_field)
        self._operations = Operations()
        return

    # Copy path, implemented by hand because the client is not deep-copied.
    self._client = to_copy._client
    self._index_pattern = to_copy._index_pattern
    self._index = Index(self, to_copy._index.es_index_field)
    self._operations = copy.deepcopy(to_copy._operations)
    self._mappings = copy.deepcopy(to_copy._mappings)
def __init__(self, es_client, model_id: str):
    """
    Bind this object to a client and a trained model identifier.

    Parameters
    ----------
    es_client: Elasticsearch client argument(s)
        - elasticsearch-py parameters or
        - elasticsearch-py instance
    model_id: str
        The unique identifier of the trained inference model in Elasticsearch.
    """
    # Resolve the client first; whatever the caller passed is normalised
    # by ensure_es_client before anything is stored on the instance.
    resolved_client = ensure_es_client(es_client)
    self._client = resolved_client
    self._model_id = model_id
def __init__(
    self,
    es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
    model_id: str,
):
    """
    Bind this object to a client and a trained model identifier.

    Parameters
    ----------
    es_client: Elasticsearch client argument(s)
        - elasticsearch-py parameters or
        - elasticsearch-py instance
    model_id: str
        The unique identifier of the trained inference model in Elasticsearch.
    """
    # Resolve the client first; whatever the caller passed is normalised
    # by ensure_es_client before anything is stored on the instance.
    resolved_client = ensure_es_client(es_client)
    self._client = resolved_client
    self._model_id = model_id
    # Starts empty; nothing is cached at construction time.
    self._trained_model_config_cache: Optional[Dict[str, Any]] = None
def import_model(
    cls,
    es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
    model_id: str,
    model: Union[
        "DecisionTreeClassifier",
        "DecisionTreeRegressor",
        "RandomForestRegressor",
        "RandomForestClassifier",
        "XGBClassifier",
        "XGBRegressor",
        "LGBMRegressor",
        "LGBMClassifier",
    ],
    feature_names: List[str],
    classification_labels: Optional[List[str]] = None,
    classification_weights: Optional[List[float]] = None,
    es_if_exists: Optional[str] = None,
    es_compress_model_definition: bool = True,
) -> "MLModel":
    """
    Transform and serialize a trained 3rd party model into Elasticsearch.
    This model can then be used for inference in the Elastic Stack.

    Parameters
    ----------
    es_client: Elasticsearch client argument(s)
        - elasticsearch-py parameters or
        - elasticsearch-py instance

    model_id: str
        The unique identifier of the trained inference model in Elasticsearch.

    model: An instance of a supported python model. We support the following model types:
        - sklearn.tree.DecisionTreeClassifier
        - sklearn.tree.DecisionTreeRegressor
        - sklearn.ensemble.RandomForestRegressor
        - sklearn.ensemble.RandomForestClassifier
        - lightgbm.LGBMRegressor
            - Categorical fields are expected to already be processed
            - Only the following objectives are supported
                - "regression"
                - "regression_l1"
                - "huber"
                - "fair"
                - "quantile"
                - "mape"
        - lightgbm.LGBMClassifier
            - Categorical fields are expected to already be processed
            - Only the following objectives are supported
                - "binary"
                - "multiclass"
                - "multiclassova"
        - xgboost.XGBClassifier
            - only the following objectives are supported:
                - "binary:logistic"
                - "multi:softmax"
                - "multi:softprob"
        - xgboost.XGBRegressor
            - only the following objectives are supported:
                - "reg:squarederror"
                - "reg:linear"
                - "reg:squaredlogerror"
                - "reg:logistic"
                - "reg:pseudohubererror"

    feature_names: List[str]
        Names of the features (required)

    classification_labels: List[str]
        Labels of the classification targets

    classification_weights: List[float]
        Weights of the classification targets

    es_if_exists: {'fail', 'replace'} default 'fail'
        How to behave if model already exists

        - fail: Raise a Value Error
        - replace: Overwrite existing model

    es_compress_model_definition: bool
        If True will use 'compressed_definition' which uses gzipped
        JSON instead of raw JSON to reduce the amount of data sent
        over the wire in HTTP requests. Defaults to 'True'.

    Returns
    -------
    MLModel
        The model object bound to ``model_id`` that was used to upload
        the definition.

    Raises
    ------
    ValueError
        If ``es_if_exists`` is neither 'fail' nor 'replace', or if it is
        'fail' and a trained model with ``model_id`` already exists.

    Examples
    --------
    >>> from sklearn import datasets
    >>> from sklearn.tree import DecisionTreeClassifier
    >>> from eland.ml import MLModel

    >>> # Train model
    >>> training_data = datasets.make_classification(n_features=5, random_state=0)
    >>> test_data = [[-50.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
    >>> classifier = DecisionTreeClassifier()
    >>> classifier = classifier.fit(training_data[0], training_data[1])

    >>> # Get some test results
    >>> classifier.predict(test_data)
    array([0, 1])

    >>> # Serialise the model to Elasticsearch
    >>> feature_names = ["f0", "f1", "f2", "f3", "f4"]
    >>> model_id = "test_decision_tree_classifier"
    >>> es_model = MLModel.import_model(
    ...   'localhost',
    ...   model_id=model_id,
    ...   model=classifier,
    ...   feature_names=feature_names,
    ...   es_if_exists='replace'
    ... )

    >>> # Get some test results from Elasticsearch model
    >>> es_model.predict(test_data)
    array([0, 1])

    >>> # Delete model from Elasticsearch
    >>> es_model.delete_model()
    """
    # Fail fast: reject a bad 'es_if_exists' before resolving the client or
    # transforming and serializing the model.
    if es_if_exists is None:
        es_if_exists = "fail"
    if es_if_exists not in ("fail", "replace"):
        raise ValueError("'es_if_exists' must be either 'fail' or 'replace'")

    es_client = ensure_es_client(es_client)
    transformer = get_model_transformer(
        model,
        feature_names=feature_names,
        classification_labels=classification_labels,
        classification_weights=classification_weights,
    )
    serializer = transformer.transform()
    model_type = transformer.model_type

    ml_model = MLModel(
        es_client=es_client,
        model_id=model_id,
    )
    if es_if_exists == "fail":
        if ml_model.exists_model():
            raise ValueError(
                f"Trained machine learning model {model_id} already exists"
            )
    elif es_if_exists == "replace":
        ml_model.delete_model()

    body: Dict[str, Any] = {
        "input": {"field_names": feature_names},
    }
    # 'inference_config' is required in 7.8+ but isn't available in <=7.7
    if es_version(es_client) >= (7, 8):
        body["inference_config"] = {model_type: {}}

    if es_compress_model_definition:
        body["compressed_definition"] = serializer.serialize_and_compress_model()
    else:
        body["definition"] = serializer.serialize_model()

    ml_model._client.ml.put_trained_model(
        model_id=model_id,
        body=body,
    )
    return ml_model
def pandas_to_eland(
    pd_df: pd.DataFrame,
    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch],
    es_dest_index: str,
    es_if_exists: str = "fail",
    es_refresh: bool = False,
    es_dropna: bool = False,
    es_type_overrides: Optional[Mapping[str, str]] = None,
    thread_count: int = 4,
    chunksize: Optional[int] = None,
    use_pandas_index_for_es_ids: bool = True,
) -> DataFrame:
    """
    Append a pandas DataFrame to an Elasticsearch index.
    Mainly used in testing.
    Modifies the elasticsearch destination index

    Parameters
    ----------
    pd_df: pandas.DataFrame
        The data to index; each row becomes one document.
    es_client: Elasticsearch client argument(s)
        - elasticsearch-py parameters or
        - elasticsearch-py instance
    es_dest_index: str
        Name of Elasticsearch index to be appended to
    es_if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the index already exists.

        - fail: Raise a ValueError.
        - replace: Delete the index before inserting new values.
        - append: Insert new values to the existing index. Create if does not exist.
    es_refresh: bool, default 'False'
        Refresh es_dest_index after bulk index
    es_dropna: bool, default 'False'
        * True: Remove missing values (see pandas.Series.dropna)
        * False: Include missing values - may cause bulk to fail
    es_type_overrides: dict, default None
        Dict of field_name: es_data_type that overrides default es data types
    thread_count: int
        number of the threads to use for the bulk requests
    chunksize: int, default None
        Number of pandas.DataFrame rows to read before bulk index into Elasticsearch
    use_pandas_index_for_es_ids: bool, default 'True'
        * True: pandas.DataFrame.index fields will be used to populate Elasticsearch '_id' fields.
        * False: Ignore pandas.DataFrame.index when indexing into Elasticsearch

    Returns
    -------
    eland.DataFrame
        eland.DataFrame referencing data in destination_index

    Raises
    ------
    ValueError
        If ``es_if_exists`` is not one of 'fail', 'replace' or 'append', or
        if it is 'fail' and ``es_dest_index`` already exists.

    Examples
    --------

    >>> pd_df = pd.DataFrame(data={'A': 3.141,
    ...                            'B': 1,
    ...                            'C': 'foo',
    ...                            'D': pd.Timestamp('20190102'),
    ...                            'E': [1.0, 2.0, 3.0],
    ...                            'F': False,
    ...                            'G': [1, 2, 3],
    ...                            'H': 'Long text - to be indexed as es type text'},
    ...                      index=['0', '1', '2'])
    >>> type(pd_df)
    <class 'pandas.core.frame.DataFrame'>
    >>> pd_df
           A  B  ...  G                                          H
    0  3.141  1  ...  1  Long text - to be indexed as es type text
    1  3.141  1  ...  2  Long text - to be indexed as es type text
    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
    >>> pd_df.dtypes
    A           float64
    B             int64
    C            object
    D    datetime64[ns]
    E           float64
    F              bool
    G             int64
    H            object
    dtype: object

    Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
    Overwrite existing Elasticsearch index if it exists `es_if_exists="replace"`, and sync index so it is readable on return `es_refresh=True`


    >>> ed_df = ed.pandas_to_eland(pd_df,
    ...                            'localhost',
    ...                            'pandas_to_eland',
    ...                            es_if_exists="replace",
    ...                            es_refresh=True,
    ...                            es_type_overrides={'H':'text'}) # index field 'H' as text not keyword
    >>> type(ed_df)
    <class 'eland.dataframe.DataFrame'>
    >>> ed_df
           A  B  ...  G                                          H
    0  3.141  1  ...  1  Long text - to be indexed as es type text
    1  3.141  1  ...  2  Long text - to be indexed as es type text
    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
    >>> ed_df.dtypes
    A           float64
    B             int64
    C            object
    D    datetime64[ns]
    E           float64
    F              bool
    G             int64
    H            object
    dtype: object

    See Also
    --------
    eland.eland_to_pandas: Create a pandas.Dataframe from eland.DataFrame
    """
    # Reject unknown modes before touching Elasticsearch. Previously an
    # unrecognised value fell through every branch and behaved like 'append'.
    if es_if_exists not in ("fail", "replace", "append"):
        raise ValueError(
            "'es_if_exists' must be one of 'fail', 'replace' or 'append'"
        )

    if chunksize is None:
        chunksize = DEFAULT_CHUNK_SIZE

    mapping = FieldMappings._generate_es_mappings(pd_df, es_type_overrides)
    es_client = ensure_es_client(es_client)

    # If table exists, check if_exists parameter
    if es_client.indices.exists(index=es_dest_index):
        if es_if_exists == "fail":
            raise ValueError(
                f"Could not create the index [{es_dest_index}] because it "
                f"already exists. "
                f"Change the 'es_if_exists' parameter to "
                f"'append' or 'replace' data."
            )
        elif es_if_exists == "replace":
            es_client.indices.delete(index=es_dest_index)
            es_client.indices.create(index=es_dest_index, body=mapping)
        elif es_if_exists == "append":
            # Check the generated mapping against the existing index mapping
            # before any document is written.
            dest_mapping = es_client.indices.get_mapping(index=es_dest_index)[
                es_dest_index
            ]
            verify_mapping_compatibility(
                ed_mapping=mapping,
                es_mapping=dest_mapping,
                es_type_overrides=es_type_overrides,
            )
    else:
        es_client.indices.create(index=es_dest_index, body=mapping)

    def action_generator(
        pd_df: pd.DataFrame,
        es_dropna: bool,
        use_pandas_index_for_es_ids: bool,
        es_dest_index: str,
    ) -> Generator[Dict[str, Any], None, None]:
        """Yield one bulk action per DataFrame row."""
        for index_label, row in pd_df.iterrows():
            values = row.dropna().to_dict() if es_dropna else row.to_dict()
            action: Dict[str, Any] = {"_index": es_dest_index, "_source": values}
            if use_pandas_index_for_es_ids:
                # Use index as _id
                action["_id"] = str(index_label)
            yield action

    # The chunk size is split across the worker threads. Clamp it to at least
    # one so that chunksize < thread_count does not produce a chunk size of 0.
    per_thread_chunk_size = max(1, int(chunksize / thread_count))

    # parallel_bulk is lazy generator so use deque to consume them immediately
    # maxlen = 0 because don't need results of parallel_bulk
    deque(
        parallel_bulk(
            client=es_client,
            actions=action_generator(
                pd_df, es_dropna, use_pandas_index_for_es_ids, es_dest_index
            ),
            thread_count=thread_count,
            chunk_size=per_thread_chunk_size,
        ),
        maxlen=0,
    )

    if es_refresh:
        es_client.indices.refresh(index=es_dest_index)

    return DataFrame(es_client, es_dest_index)