コード例 #1
0
 def __init__(
     self,
     client=None,
     index_pattern=None,
     display_names=None,
     index_field=None,
     to_copy=None,
 ) -> None:
     # Implement copy as we don't deep copy the client
     if to_copy is not None:
         self._client = to_copy._client
         self._index_pattern = to_copy._index_pattern
         self._index = Index(self, to_copy._index.es_index_field)
         self._operations = copy.deepcopy(to_copy._operations)
         self._mappings: FieldMappings = copy.deepcopy(to_copy._mappings)
     else:
         self._client = ensure_es_client(client)
         self._index_pattern = index_pattern
         # Get and persist mappings, this allows us to correctly
         # map returned types from Elasticsearch to pandas datatypes
         self._mappings: FieldMappings = FieldMappings(
             client=self._client,
             index_pattern=self._index_pattern,
             display_names=display_names,
         )
         self._index = Index(self, index_field)
         self._operations = Operations()
コード例 #2
0
    def __init__(self, es_client, model_id: str):
        """
        Parameters
        ----------
        es_client: Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance

        model_id: str
            The unique identifier of the trained inference model in Elasticsearch.
        """
        self._client = ensure_es_client(es_client)
        self._model_id = model_id
コード例 #3
0
    def __init__(
        self,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
    ):
        """
        Parameters
        ----------
        es_client: Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance

        model_id: str
            The unique identifier of the trained inference model in Elasticsearch.
        """
        self._client = ensure_es_client(es_client)
        self._model_id = model_id
        self._trained_model_config_cache: Optional[Dict[str, Any]] = None
コード例 #4
0
    def import_model(
        cls,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
        model: Union[
            "DecisionTreeClassifier",
            "DecisionTreeRegressor",
            "RandomForestRegressor",
            "RandomForestClassifier",
            "XGBClassifier",
            "XGBRegressor",
            "LGBMRegressor",
            "LGBMClassifier",
        ],
        feature_names: List[str],
        classification_labels: Optional[List[str]] = None,
        classification_weights: Optional[List[float]] = None,
        es_if_exists: Optional[str] = None,
        es_compress_model_definition: bool = True,
    ) -> "MLModel":
        """
        Transform and serialize a trained 3rd party model into Elasticsearch.
        This model can then be used for inference in the Elastic Stack.

        Parameters
        ----------
        es_client: Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance

        model_id: str
            The unique identifier of the trained inference model in Elasticsearch.

        model: An instance of a supported python model. We support the following model types:
            - sklearn.tree.DecisionTreeClassifier
            - sklearn.tree.DecisionTreeRegressor
            - sklearn.ensemble.RandomForestRegressor
            - sklearn.ensemble.RandomForestClassifier
            - lightgbm.LGBMRegressor
                - Categorical fields are expected to already be processed
                - Only the following objectives are supported
                    - "regression"
                    - "regression_l1"
                    - "huber"
                    - "fair"
                    - "quantile"
                    - "mape"
            - lightgbm.LGBMClassifier
                - Categorical fields are expected to already be processed
                - Only the following objectives are supported
                    - "binary"
                    - "multiclass"
                    - "multiclassova"
            - xgboost.XGBClassifier
                - only the following objectives are supported:
                    - "binary:logistic"
                    - "multi:softmax"
                    - "multi:softprob"
            - xgboost.XGBRegressor
                - only the following objectives are supported:
                    - "reg:squarederror"
                    - "reg:linear"
                    - "reg:squaredlogerror"
                    - "reg:logistic"
                    - "reg:pseudohubererror"

        feature_names: List[str]
            Names of the features (required)

        classification_labels: List[str]
            Labels of the classification targets

        classification_weights: List[str]
            Weights of the classification targets

        es_if_exists: {'fail', 'replace'} default 'fail'
            How to behave if model already exists

            - fail: Raise a Value Error
            - replace: Overwrite existing model

        es_compress_model_definition: bool
            If True will use 'compressed_definition' which uses gzipped
            JSON instead of raw JSON to reduce the amount of data sent
            over the wire in HTTP requests. Defaults to 'True'.

        Examples
        --------
        >>> from sklearn import datasets
        >>> from sklearn.tree import DecisionTreeClassifier
        >>> from eland.ml import MLModel

        >>> # Train model
        >>> training_data = datasets.make_classification(n_features=5, random_state=0)
        >>> test_data = [[-50.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
        >>> classifier = DecisionTreeClassifier()
        >>> classifier = classifier.fit(training_data[0], training_data[1])

        >>> # Get some test results
        >>> classifier.predict(test_data)
        array([0, 1])

        >>> # Serialise the model to Elasticsearch
        >>> feature_names = ["f0", "f1", "f2", "f3", "f4"]
        >>> model_id = "test_decision_tree_classifier"
        >>> es_model = MLModel.import_model(
        ...   'localhost',
        ...   model_id=model_id,
        ...   model=classifier,
        ...   feature_names=feature_names,
        ...   es_if_exists='replace'
        ... )

        >>> # Get some test results from Elasticsearch model
        >>> es_model.predict(test_data)
        array([0, 1])

        >>> # Delete model from Elasticsearch
        >>> es_model.delete_model()
        """
        es_client = ensure_es_client(es_client)
        transformer = get_model_transformer(
            model,
            feature_names=feature_names,
            classification_labels=classification_labels,
            classification_weights=classification_weights,
        )
        serializer = transformer.transform()
        model_type = transformer.model_type

        if es_if_exists is None:
            es_if_exists = "fail"

        ml_model = MLModel(
            es_client=es_client,
            model_id=model_id,
        )
        if es_if_exists not in ("fail", "replace"):
            raise ValueError("'es_if_exists' must be either 'fail' or 'replace'")
        elif es_if_exists == "fail":
            if ml_model.exists_model():
                raise ValueError(
                    f"Trained machine learning model {model_id} already exists"
                )
        elif es_if_exists == "replace":
            ml_model.delete_model()

        body: Dict[str, Any] = {
            "input": {"field_names": feature_names},
        }
        # 'inference_config' is required in 7.8+ but isn't available in <=7.7
        if es_version(es_client) >= (7, 8):
            body["inference_config"] = {model_type: {}}

        if es_compress_model_definition:
            body["compressed_definition"] = serializer.serialize_and_compress_model()
        else:
            body["definition"] = serializer.serialize_model()

        ml_model._client.ml.put_trained_model(
            model_id=model_id,
            body=body,
        )
        return ml_model
コード例 #5
0
ファイル: etl.py プロジェクト: sethmlarson/eland
def pandas_to_eland(
    pd_df: pd.DataFrame,
    es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch],
    es_dest_index: str,
    es_if_exists: str = "fail",
    es_refresh: bool = False,
    es_dropna: bool = False,
    es_type_overrides: Optional[Mapping[str, str]] = None,
    thread_count: int = 4,
    chunksize: Optional[int] = None,
    use_pandas_index_for_es_ids: bool = True,
) -> DataFrame:
    """
    Append a pandas DataFrame to an Elasticsearch index.
    Mainly used in testing.
    Modifies the elasticsearch destination index

    Parameters
    ----------
    es_client: Elasticsearch client argument(s)
        - elasticsearch-py parameters or
        - elasticsearch-py instance
    es_dest_index: str
        Name of Elasticsearch index to be appended to
    es_if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the index already exists.

        - fail: Raise a ValueError.
        - replace: Delete the index before inserting new values.
        - append: Insert new values to the existing index. Create if does not exist.
    es_refresh: bool, default 'False'
        Refresh es_dest_index after bulk index
    es_dropna: bool, default 'False'
        * True: Remove missing values (see pandas.Series.dropna)
        * False: Include missing values - may cause bulk to fail
    es_type_overrides: dict, default None
        Dict of field_name: es_data_type that overrides default es data types
    thread_count: int
        number of the threads to use for the bulk requests
    chunksize: int, default None
        Number of pandas.DataFrame rows to read before bulk index into Elasticsearch
    use_pandas_index_for_es_ids: bool, default 'True'
        * True: pandas.DataFrame.index fields will be used to populate Elasticsearch '_id' fields.
        * False: Ignore pandas.DataFrame.index when indexing into Elasticsearch

    Returns
    -------
    eland.Dataframe
        eland.DataFrame referencing data in destination_index

    Examples
    --------

    >>> pd_df = pd.DataFrame(data={'A': 3.141,
    ...                            'B': 1,
    ...                            'C': 'foo',
    ...                            'D': pd.Timestamp('20190102'),
    ...                            'E': [1.0, 2.0, 3.0],
    ...                            'F': False,
    ...                            'G': [1, 2, 3],
    ...                            'H': 'Long text - to be indexed as es type text'},
    ...                      index=['0', '1', '2'])
    >>> type(pd_df)
    <class 'pandas.core.frame.DataFrame'>
    >>> pd_df
           A  B  ...  G                                          H
    0  3.141  1  ...  1  Long text - to be indexed as es type text
    1  3.141  1  ...  2  Long text - to be indexed as es type text
    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
    >>> pd_df.dtypes
    A           float64
    B             int64
    C            object
    D    datetime64[ns]
    E           float64
    F              bool
    G             int64
    H            object
    dtype: object

    Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`.
    Overwrite existing Elasticsearch index if it exists `if_exists="replace"`, and sync index so it is
    readable on return `refresh=True`


    >>> ed_df = ed.pandas_to_eland(pd_df,
    ...                            'localhost',
    ...                            'pandas_to_eland',
    ...                            es_if_exists="replace",
    ...                            es_refresh=True,
    ...                            es_type_overrides={'H':'text'}) # index field 'H' as text not keyword
    >>> type(ed_df)
    <class 'eland.dataframe.DataFrame'>
    >>> ed_df
           A  B  ...  G                                          H
    0  3.141  1  ...  1  Long text - to be indexed as es type text
    1  3.141  1  ...  2  Long text - to be indexed as es type text
    2  3.141  1  ...  3  Long text - to be indexed as es type text
    <BLANKLINE>
    [3 rows x 8 columns]
    >>> ed_df.dtypes
    A           float64
    B             int64
    C            object
    D    datetime64[ns]
    E           float64
    F              bool
    G             int64
    H            object
    dtype: object

    See Also
    --------
    eland.eland_to_pandas: Create a pandas.Dataframe from eland.DataFrame
    """
    if chunksize is None:
        chunksize = DEFAULT_CHUNK_SIZE

    mapping = FieldMappings._generate_es_mappings(pd_df, es_type_overrides)
    es_client = ensure_es_client(es_client)

    # If table exists, check if_exists parameter
    if es_client.indices.exists(index=es_dest_index):
        if es_if_exists == "fail":
            raise ValueError(
                f"Could not create the index [{es_dest_index}] because it "
                f"already exists. "
                f"Change the 'es_if_exists' parameter to "
                f"'append' or 'replace' data."
            )

        elif es_if_exists == "replace":
            es_client.indices.delete(index=es_dest_index)
            es_client.indices.create(index=es_dest_index, body=mapping)

        elif es_if_exists == "append":
            dest_mapping = es_client.indices.get_mapping(index=es_dest_index)[
                es_dest_index
            ]
            verify_mapping_compatibility(
                ed_mapping=mapping,
                es_mapping=dest_mapping,
                es_type_overrides=es_type_overrides,
            )
    else:
        es_client.indices.create(index=es_dest_index, body=mapping)

    def action_generator(
        pd_df: pd.DataFrame,
        es_dropna: bool,
        use_pandas_index_for_es_ids: bool,
        es_dest_index: str,
    ) -> Generator[Dict[str, Any], None, None]:
        for row in pd_df.iterrows():
            if es_dropna:
                values = row[1].dropna().to_dict()
            else:
                values = row[1].to_dict()

            if use_pandas_index_for_es_ids:
                # Use index as _id
                id = row[0]

                action = {"_index": es_dest_index, "_source": values, "_id": str(id)}
            else:
                action = {"_index": es_dest_index, "_source": values}

            yield action

    # parallel_bulk is lazy generator so use deque to consume them immediately
    # maxlen = 0 because don't need results of parallel_bulk
    deque(
        parallel_bulk(
            client=es_client,
            actions=action_generator(
                pd_df, es_dropna, use_pandas_index_for_es_ids, es_dest_index
            ),
            thread_count=thread_count,
            chunk_size=int(chunksize / thread_count),
        ),
        maxlen=0,
    )

    if es_refresh:
        es_client.indices.refresh(index=es_dest_index)

    return DataFrame(es_client, es_dest_index)