Exemple #1
0
def installPipelines():
    conn = get_connection()
    client = IngestClient(conn)
    client.put_pipeline(id='ingest_attachment',
                        body={
                            'description':
                            "Extract attachment information",
                            'processors': [{
                                "attachment": {
                                    "field": "data",
                                    "indexed_chars": "-1"
                                },
                                "remove": {
                                    "field": "data"
                                }
                            }]
                        })
    client.put_pipeline(id='add_timestamp',
                        body={
                            'description':
                            "Adds an index_date timestamp",
                            'processors': [
                                {
                                    "set": {
                                        "field": "index_date",
                                        "value": "{{_ingest.timestamp}}",
                                    },
                                },
                            ]
                        })
Exemple #2
0
    def add_attachment_pipeline(self):
        pipeline = {
            "description":
            "Extract attachment information encoded in Base64 with UTF-8 charset",
            "processors": [{
                "attachment": {
                    "field": "attachment"
                }
            }]
        }

        ingest_client = IngestClient(self.client)
        if self.is_new:
            ingest_client.put_pipeline('attachment', pipeline)
Exemple #3
0
 def _createPipeline(self):
     ic = IngestClient(self.conn)
     self.pipeline_id = 'monthlyprocessor'
     pipeline_body = {
                   "description": "monthly date-time index naming",
                   "processors" : [
                     {
                       "date_index_name" : {
                         "field" : "@timestamp",
                         "index_name_prefix" : "{{ _index}}-",
                         "date_rounding" : "M",
                       }
                     }
                   ]
     }
     ic.put_pipeline(id=self.pipeline_id, body=pipeline_body)
def putPipelines():
    conn = get_connection()
    client = IngestClient(conn)
    client.put_pipeline(id='rename_structure_unit_description',
                        body={
                            'description':
                            "Rename field _source.description to _source.desc",
                            'processors': [
                                {
                                    "rename": {
                                        "field": "_source.description",
                                        "target_field": "_source.desc",
                                    },
                                },
                            ]
                        })
Exemple #5
0
 def _create_ingest_pipeline(self) -> None:
     """
     Create ingest pipeline to allow extract file content and use them for search.
     """
     p = IngestClient(self.es)
     # TODO - G.M - 2019-05-31 - check if possible to set specific analyzer for
     # attachment content parameters. Goal :
     # allow ngram or lang specific indexing for "in file search"
     p.put_pipeline(
         id="attachment",
         body={
             "description": "Extract attachment information",
             "processors": [{
                 "attachment": {
                     "field": "file"
                 }
             }],
         },
     )
class Pipeline:
    """
    A pipeline is a definition of a series of processors that
    are to be executed in the same order as they are declared.
    (https://www.elastic.co/guide/en/elasticsearch/reference/current/pipeline.html)

    Parameters
    ----------

    client: elasticsearch.Elasticsearch
        a elasticsearch client.
    name: str
        name for pipeline.
    pipeline_handler: :obj:`PipelineHandler`
        object that contains json object of pipeline.

    Attributes
    ----------

    client: str
    name: str
    pipeline_handler: :obj:`PipelineHandler`
    """
    def __init__(self, client, name, pipeline_handler):
        self._pipeline_handler = pipeline_handler
        self._name = name
        self.ingest_client = IngestClient(client)

    def create_pipeline(self, params=None):
        """
        create_pipeline method create the elasticsearch pipeline
        with processors specified in pipeline_handler json.
        """
        try:
            self.ingest_client.put_pipeline(
                self._name, load_json(self._pipeline_handler._json))
        except Exception as e:
            raise (e)
Exemple #7
0
        "description":
        "Extract attachment information from arrays",
        "processors": [{
            "foreach": {
                "field": "attachments",
                "processor": {
                    "attachment": {
                        "target_field": "_ingest._value.attachment",
                        "field": "_ingest._value.data"
                    }
                }
            }
        }]
    }

    val = ex_indices.put_pipeline(id="attachment1", body=doc1)
    print(val)

    doc = {
        "attachments": [{
            "filename":
            "test.pdf",
            "data":
            "JVBERi0xLjMNJeLjz9MNCjcgMCBvYmoNPDwvTGluZWFyaXplZCAxL0wgNzk0NS9PIDkvRSAzNTI0L04gMS9UIDc2NTYvSCBbIDQ1MSAxMzddPj4NZW5kb2JqDSAgICAgICAgICAgICAgICAgICAgICAgDQoxMyAwIG9iag08PC9EZWNvZGVQYXJtczw8L0NvbHVtbnMgNC9QcmVkaWN0b3IgMTI+Pi9GaWx0ZXIvRmxhdGVEZWNvZGUvSURbPDREQzkxQTE4NzVBNkQ3MDdBRUMyMDNCQjAyMUM5M0EwPjxGNkM5MkIzNjhBOEExMzQwODQ1N0ExRDM5NUEzN0VCOT5dL0luZGV4WzcgMjFdL0luZm8gNiAwIFIvTGVuZ3RoIDUyL1ByZXYgNzY1Ny9Sb290IDggMCBSL1NpemUgMjgvVHlwZS9YUmVmL1dbMSAyIDFdPj5zdHJlYW0NCmjeYmJkEGBgYmCyARIMIIKxAUgwpwIJNkcg8eUYAxMjwzSQLAMjucR/xp1fAAIMAEykBvANCmVuZHN0cmVhbQ1lbmRvYmoNc3RhcnR4cmVmDQowDQolJUVPRg0KICAgICAgICANCjI3IDAgb2JqDTw8L0ZpbHRlci9GbGF0ZURlY29kZS9JIDY5L0xlbmd0aCA1OC9TIDM4Pj5zdHJlYW0NCmjeYmBgYGFgYPzPAATcNgyogJEBJMvRgCzGAsUMDA0M3Azc0x50JoA4zAwMWgIQLYwsAAEGAL/iBRkNCmVuZHN0cmVhbQ1lbmRvYmoNOCAwIG9iag08PC9NZXRhZGF0YSAxIDAgUi9QYWdlcyA1IDAgUi9UeXBlL0NhdGFsb2c+Pg1lbmRvYmoNOSAwIG9iag08PC9Db250ZW50cyAxMSAwIFIvQ3JvcEJveFswIDAgNTk1IDg0Ml0vTWVkaWFCb3hbMCAwIDU5NSA4NDJdL1BhcmVudCA1IDAgUi9SZXNvdXJjZXMgMTQgMCBSL1JvdGF0ZSAwL1R5cGUvUGFnZT4+DWVuZG9iag0xMCAwIG9iag08PC9GaWx0ZXIvRmxhdGVEZWNvZGUvRmlyc3QgOTQvTGVuZ3RoIDc3My9OIDEzL1R5cGUvT2JqU3RtPj5zdHJlYW0NCmjevFRtb9owEP4r/gPgl9hxIlVI0I6u0lqhJls/RPmQgguRQoISV6P/fncJLoG1K6XSiMz55e58vue545IwwhXhnibcJyKAlSaeCAgPiOeDCImUighGVMiI4CQUoCYIZ1oS4YGt5kRIsGIhEeAokLAGFcYkubigl1VR1dEmmxtcNAovY+R+NKLftvY6spnFg+uI4/XdwbQqLexNBcYAWzSOBQbQTSXe3k19vLibBnhnZz6rq3lkbEJnV1Mam61NR6OEXmbF/fUEr8rW6ywRQwE/iPRQpvQ2s3W+TdhQcnQ+FBwdDxkPPRCe0rjSXEFe2JDzUKAImEIdjZENQ8VUSh9WuTWzKi9t0m0ReOGQBSFEk0IY0Zg8ZUVjaHSLpoLG9/RmYUqb2xcav2zMPj+jEehf5U9Ppjbl3DQJp4/PRWFsulMs59UiL5et3iRrDCaQRi/rx6p4PURYMVXR86NFI7TkNK5+ljkoGMJ3ScUztG+djZs5RERCpiB/m+8mX64sYfTKdPsDwTmdFtmyAca0VpNJtU0GPtBn4GkkgQfMYDJI29O7bG3ouM6zYjCpisVtTG9sVuTzcbksDPiNrFn/Aip6+zDwqjrf2Ko+fN2BF/dG+pCX47LJX9fTvG7s5SqrXXx7d0hsfPCPbKfBub9PTv1sYpel1hBcL+yqSYRGSn7ta2nyKn3O39Dxff2hH6X81rovuxMXpZPuDi8IWy3P89I+wEHI3wPYdwDLHsDKR4CZBoCxUzCmewDH+do0d+b3fbXOyln0DsrsY4z/dnQW0IIfAa3lKUCrw2RDjWPa2tGmVu3/T4UcQe1me6iOAXXQO8hCKd/QlLCr2KHEyHCOo08ADcPt49i9A6ggeie7uBgj/+vTPku/1GV8BSQUypHQ08dd5nzqOfPzCOcdEg40Tmosny3JMOiXpNRdSXLBfMyGeL8k277ZZeYoRQOuPtOF/+n3vNypo2IV/Ixi3X+nFuipPfeDjsxccbr/rqgP+zHu9IoRCtEVo4tiV9JAiD8CDAA+0IrxDQplbmRzdHJlYW0NZW5kb2JqDTExIDAgb2JqDTw8L0ZpbHRlci9GbGF0ZURlY29kZS9MZW5ndGggMTUzMD4+c3RyZWFtDQpIibRXS2/jNhBGr/4Vc1uqiBW9H8d0tynQ02IroIduD7LEJCpk0RDppPlT/Y2dB2l7nS0KLFoEUPgacuabmW/GP3Sb267LIIXuYZMWcVJAgn8yytI8rqukgrqscZ7k0O03t+9tCYPlYwnYYXP70y8pPNrNNomTJKugGzY0qhroXja/qbsoTeJMjdG2jlNldhqibUpD3GjiWg3RNlNrtK3iCnd7Bx8/3MP9RAuNmrWNfu9+Jh0Lr2MmCmbQtHGbkXJZG+eZKMc6JK3XIaMR6zDiu3/BR7O6fjdr+GBQhyRu1XDc68XBfVTGucJFWlv3uJmjgqjLZ4Xa8ObnCCZLqieqh+MyPevV9rMsPEwzWZXhyKx7FONV9xRGh5WMb5W2en32L+sow2+4cZ7ZzAS2aZyW0H1gCJPGG9K2mRhiHqIcYYGI79dRgaDxRNbN4uzN5TxK8LvymKyKC9WzjHPTEm1b9MsjuadRN3ySRQc+IaKzOYq05S0RXkZ4lFWZH54mkbFRosDIvV5RL8GXvcpTYrLFm0XKWzEamR5JUdJUX4i6G5AXdbQtcc9r3dMs9waOorGIWQuIFWHafe+jogiRSSMCEwGE/nCYp6F3k1mgR8MOc+/IiXC0rEam9AjOwLBqCdEe3yqU0zC5OPgsi3PvspTC8BRxjJkEUCvYTh7HRWYjX1rypaWaxXMSQg8Somgc6NkfG/iYW80yDYQXQ5XhEsXwOFm3TrujmGJRPzAYpIPZawsUK1cBJqDUJ1BqUfywGsyQvQUU3Jtl5hda8h1mmQK9sFqYtua4OM2BXRNGL5N7Ik0HVs9LDcCpYZ96MgBTC4M+V9PyGNFlgt/tvWcfAbJhJFkrUkh9F3V/UPpX/lBcVJj+eAYBlZ3GE4NwV0id0htWtSXfc7e8mkXfoJNfX540elOEPaugEV6YYUm9cJ0KKDCgx8xBI7BIT9G2wUAjr2aKDYzhbiYqyBPGSZmjxPiiCR4OIZ4HAqHAE+JA/DCm/YxihoJOhfmw+oUeccMkYLy2rCu5sQjGpj6006SpROFPmrXr+TtGkk40XjE7ChVzpH3SA69NxHuNOkxyZOHjTiIVk4gEZExRdL7E8wwNEQOPBk8N3yCn9nK5aOJkYsFiVMrK5AcYcBcqL4Rxpd5FmIJVEEMPyPKlnvClBhZ2+vKiIx+yXj0yYIu1jbjoq+nwhiNGs7zDYEXw4akX7iYoiQPgzB+eGij1LDLHP1EGCZzTtqK0tVdJgPqU35gHxdfyQEJjG4ZkEhFSTYx7jVyotD6hsAUoLy4qzxeVclE/v/SvXByR+JEF4LBOSESDL6ZoiVpXzTNZc/PrVTXHRGov8i7JTvj7ggfMy1RbUUUmoca/MwkTUQXjxVE/iyPEP/U1vZDfi+K/xDb0GWndppfQpgRtjnQ3cTGqEdqe/xOZIgwvyIYp4fEaZdQKEHoogwSO1efLrWufUOvwluXkcS6NtfqzH97inF3hHDRvQ4dEFYNJh6OWbOi5QXF6pNIr7YtsEN5hex1n3yz5fobKLtYu7kOseXBkKwmtTL2jMBgKNPmZwr5MvSqkHvLt2gc3F/ysb3awNGdpiAes9Q7rlVAakfJlG0QlXQTZBmx/qFkJzQxnJ9WkSkmtXoyD2VgspkdNKRy6gbMtLIG2SNvmDbpq29LsnCo+jJ8xDZgQM/Y2Zh3G9bRgWnCiZGp/QL5CNtxN8+SIiNX/yQzbs5oUvkHLDvnpQfyPSQR3g4xWbss/6X4MLdFKvbA/1zN+5BJ2CJVGgm40L8ts+pG7KoksrKG7U+ELr2D8ZESPQfTUxiCJ7i5Z+hwqeXMR9UQOFE90QYW6YdtEs7CqsSX9dyC/mV1zgbBoGt8+vTfsSYz4gb9OflOcOsEaSfFUOHNPvumpvabxKnksG2D3sjr7kyvLYSmRZSqCPKXKGIQm/0NGjlKnzaPBX3n9tL9p9D6Tm2QR3fdVF4SI4ah9pHAFjl9EXUYghV0eY680/EukCF0CF2hl3QXtEelReBHnc6uh4Ff67sSBP3abvwcArRiH3QoNCmVuZHN0cmVhbQ1lbmRvYmoNMTIgMCBvYmoNPDwvRmlsdGVyL0ZsYXRlRGVjb2RlL0xlbmd0aCAyMDg+PnN0cmVhbQ0KSIlUkL0OwjAMhPc+hUcQQ9rOVRdYOvAjCuxp4laRiBO56dC3JykFxBBL9uXTnS32zaEhE0Bc2KkWA/SGNOPoJlYIHQ6GoChBGxXWbqnKSg8iwu08BrQN9Q6qKhPXKI6BZ9i0s+3cc5dvQZxZIxsaYHMr7o84aCfvn2iRAuRQ16Cxz8T+KP1JWozyii7zYjV0GkcvFbKkAaHKi/pdkPS/9iG6/t3+vlZlXpZ1FomPluC0yddbTcwx1rLukihlMITfi3jnk2V62UuAAQBDyGk/Cg0KZW5kc3RyZWFtDWVuZG9iag0xIDAgb2JqDTw8L0xlbmd0aCAzNjU2L1N1YnR5cGUvWE1ML1R5cGUvTWV0YWRhdGE+PnN0cmVhbQ0KPD94cGFja2V0IGJlZ2luPSLvu78iIGlkPSJXNU0wTXBDZWhpSHpyZVN6TlRjemtjOWQiPz4KPHg6eG1wbWV0YSB4bWxuczp4PSJhZG9iZTpuczptZXRhLyIgeDp4bXB0az0iQWRvYmUgWE1QIENvcmUgNC4yLjEtYzA0MyA1Mi4zNzI3MjgsIDIwMDkvMDEvMTgtMTU6MDg6MDQgICAgICAgICI+CiAgIDxyZGY6UkRGIHhtbG5zOnJkZj0iaHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIyI+CiAgICAgIDxyZGY6RGVzY3JpcHRpb24gcmRmOmFib3V0PSIiCiAgICAgICAgICAgIHhtbG5zOmRjPSJodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLyI+CiAgICAgICAgIDxkYzpmb3JtYXQ+YXBwbGljYXRpb24vcGRmPC9kYzpmb3JtYXQ+CiAgICAgICAgIDxkYzpjcmVhdG9yPgogICAgICAgICAgICA8cmRmOlNlcT4KICAgICAgICAgICAgICAgPHJkZjpsaT5jZGFpbHk8L3JkZjpsaT4KICAgICAgICAgICAgPC9yZGY6U2VxPgogICAgICAgICA8L2RjOmNyZWF0b3I+CiAgICAgICAgIDxkYzp0aXRsZT4KICAgICAgICAgICAgPHJkZjpBbHQ+CiAgICAgICAgICAgICAgIDxyZGY6bGkgeG1sOmxhbmc9IngtZGVmYXVsdCI+VGhpcyBpcyBhIHRlc3QgUERGIGZpbGU8L3JkZjpsaT4KICAgICAgICAgICAgPC9yZGY6QWx0PgogICAgICAgICA8L2RjOnRpdGxlPgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgICAgPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIKICAgICAgICAgICAgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIj4KICAgICAgICAgPHhtcDpDcmVhdGVEYXRlPjIwMDAtMDYtMjlUMTA6MjE6MDgrMTE6MDA8L3htcDpDcmVhdGVEYXRlPgogICAgICAgICA8eG1wOkNyZWF0b3JUb29sPk1pY3Jvc29mdCBXb3JkIDguMDwveG1wOkNyZWF0b3JUb29sPgogICAgICAgICA8eG1wOk1vZGlmeURhdGU+MjAxMy0xMC0yOFQxNToyNDoxMy0wNDowMDwveG1wOk1vZGlmeURhdGU+CiAgICAgICAgIDx4bXA6TWV0YWRhdGFEYXRlPjIwMTMtMTAtMjhUMTU6MjQ6MTMtMDQ6MDA8L3htcDpNZXRhZGF0YURhdGU+CiAgICAgIDwvcmRmOkRlc2NyaXB0aW9uPgogICAgICA8cmRmOkRlc2NyaXB0aW9uIHJkZjphYm91dD0iIgogICAgICAgICAgICB4bWxuczpwZGY9Imh0dHA6Ly9ucy5hZG9iZS5jb20vcGRmLzEuMy8iPgogICAgICAgICA8cGRmOlByb2R1Y2VyPkFjcm9iYXQgRGlzdGlsbGVyIDQuMCBmb3IgV2luZG93czwvcGRmOlByb2R1Y2VyPgogICAgICA8L3JkZjpEZXNjcmlwdGlvbj4KICAgICAgPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIKICAgICAgICAgICAgeG1sbnM6eG1wTU09Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9tbS8iPgogICAgICAgICA8eG1wTU06RG9jdW1lbnRJRD51dWlkOjA4MDVlMjIxLTgwYTgtNDU5ZS1hNTIyLTYzNWVkNWMxZTJlNjwveG1wTU06RG9jdW1lbnRJRD4KICAgICAgICAgPHhtcE1NOkluc3RhbmNlSUQ+dXVpZDo2MmQ2YWU2ZC00M2M0LTQ3MmQtOWIyOC03YzRhZGQ4ZjllNDY8L3htcE1NOkluc3RhbmNlSUQ+CiAgICAgIDwvcmRmOkRlc2NyaXB0aW9uPgogICA8L3JkZjpSREY+CjwveDp4bXBtZXRhPgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgCjw/eHBhY2tldCBlbmQ9InciPz4NCmVuZHN0cmVhbQ1lbmRvYmoNMiAwIG9iag08PC9GaWx0ZXIvRmxhdGVEZWNvZGUvRmlyc3QgNC9MZW5ndGggNDgvTiAxL1R5cGUvT2JqU3RtPj5zdHJlYW0NCmjeMlUwULCx0XfOL80rUTDU985MKY62BIoFxeqHVBak6gckpqcW29kBBBgA1ncLgA0KZW5kc3RyZWFtDWVuZG9iag0zIDAgb2JqDTw8L0ZpbHRlci9GbGF0ZURlY29kZS9GaXJzdCA0L0xlbmd0aCAxNjcvTiAxL1R5cGUvT2JqU3RtPj5zdHJlYW0NCmjePMvBCsIwEEXRX5mdDaKdxCpVSqFY3AkuBNexSelA6EAyRfx7A4qPu3znAAhNU3aLTByLwVkKb1Weo7dCPPdWfNGfDOYdzFGj0VivtV4hrn6vrK40RE48Cjw4Oqi3qMoruz/WuwxrvTeV3m2w+uJbZLcMPhZdxk8r0FMSCsFHqLYII0d40Oz4lVR5Jwm+uE+UIGdBfBK49RcYKXjVth8BBgBnZztkDQplbmRzdHJlYW0NZW5kb2JqDTQgMCBvYmoNPDwvRGVjb2RlUGFybXM8PC9Db2x1bW5zIDMvUHJlZGljdG9yIDEyPj4vRmlsdGVyL0ZsYXRlRGVjb2RlL0lEWzw0REM5MUExODc1QTZENzA3QUVDMjAzQkIwMjFDOTNBMD48RjZDOTJCMzY4QThBMTM0MDg0NTdBMUQzOTVBMzdFQjk+XS9JbmZvIDYgMCBSL0xlbmd0aCAzNy9Sb290IDggMCBSL1NpemUgNy9UeXBlL1hSZWYvV1sxIDIgMF0+PnN0cmVhbQ0KaN5iYmBgYGLkPcLEwD+ViYGhh4mBkYWJ8bEkkM0IEGAAKlkDFA0KZW5kc3RyZWFtDWVuZG9iag1zdGFydHhyZWYNCjExNg0KJSVFT0YNCg=="
        }]
    }
    res = es.index(index="test-index0",
                   doc_type='tweet',
                   id=1,
                   body=doc,
                   pipeline='attachment1')
Exemple #8
0
class IngestConnector:
    def __init__(
            self,
            pipeline_id: str = "pdf_content",
            field: str = "data",
            pipeline_description: str = "Extracting info from pdf content"):
        self.pipeline_id: str = pipeline_id
        self.index_name: str = pipeline_id + "_index"
        self.field: str = field
        self.pipeline_description: str = pipeline_description

        self.ingest_client = IngestClient(current_app.elasticsearch)

    def create_pipeline(self):
        self.ingest_client.put_pipeline(id=self.pipeline_id,
                                        body={
                                            'description':
                                            self.pipeline_description,
                                            'processors': [{
                                                "attachment": {
                                                    "field": self.field
                                                }
                                            }]
                                        })

    def delete_pipeline(self):
        self.ingest_client.delete_pipeline(id=self.pipeline_id)

    def get_pipeline(self):
        return self.ingest_client.get_pipeline(id=self.pipeline_id)

    def add_to_index(self, id_: int, content: str, content_page: int,
                     content_paragraph: int):
        current_app.elasticsearch.index(
            index=self.index_name,
            id=id_,
            pipeline=self.pipeline_id,
            body={
                self.field:
                base64.b64encode(content.encode("utf-8")).decode("utf-8"),
                "content_page":
                content_page,
                "content_paragraph":
                content_paragraph,
            })

    def remove_from_index(self, id_: int):
        current_app.elasticsearch.delete(index=self.index_name, id=id_)

    def api_search(self, query: str):
        return current_app.elasticsearch.search(
            index=self.index_name,
            body={"query": {
                "match": {
                    "attachment.content": query
                }
            }})

    def search(self, query: str):
        search = self.api_search(query)

        ids = [int(hit['_id']) for hit in search['hits']['hits']]

        if len(ids) == 0:
            return None

        when = []
        for i in range(len(ids)):
            when.append((ids[i], i))

        res = KnowledgePdfContent.query.filter(
            KnowledgePdfContent.id.in_(ids)).order_by(
                db.case(when, value=KnowledgePdfContent.id)).all()
        return res[0] if len(res) > 0 else None
Exemple #9
0
        hosts=[settings.ELASTICSEARCH_DSL['default']['hosts']])
    from elasticsearch import Elasticsearch

    es = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts'])
    from elasticsearch.client import IngestClient

    c = IngestClient(es)
    try:
        c.get_pipeline('geoip')
    except elasticsearch.exceptions.NotFoundError:
        c.put_pipeline('geoip',
                       body='''{
              "description" : "Add geoip info",
              "processors" : [
                {
                  "geoip" : {
                    "field" : "ip"
                  }
                }
              ]
            }''')


class GeoIp(InnerDoc):
    continent_name = Keyword()
    country_iso_code = Keyword()
    country_name = Keyword()
    location = GeoPoint()


class UserAgentBrowser(InnerDoc):
Exemple #10
0
def setup_dga_model(ctx, model_tag, repo, model_dir, overwrite):
    """Upload ML DGA model and dependencies and enrich DNS data."""
    import io
    import requests
    import shutil
    import zipfile

    es_client: Elasticsearch = ctx.obj['es']
    client_info = es_client.info()
    license_client = LicenseClient(es_client)

    if license_client.get()['license']['type'].lower() not in ('platinum',
                                                               'enterprise'):
        client_error(
            'You must have a platinum or enterprise subscription in order to use these ML features'
        )

    # download files if necessary
    if not model_dir:
        if not model_tag:
            client_error(
                'model-tag or model-dir required to download model files')

        click.echo(f'Downloading artifact: {model_tag}')

        release_url = f'https://api.github.com/repos/{repo}/releases/tags/{model_tag}'
        release = requests.get(release_url)
        release.raise_for_status()
        assets = [
            a for a in release.json()['assets']
            if a['name'].startswith('ML-DGA') and a['name'].endswith('.zip')
        ]

        if len(assets) != 1:
            client_error(
                f'Malformed release: expected 1 match ML-DGA zip, found: {len(assets)}!'
            )

        zipped_url = assets[0]['browser_download_url']
        zipped = requests.get(zipped_url)
        z = zipfile.ZipFile(io.BytesIO(zipped.content))

        dga_dir = get_path('ML-models', 'DGA')
        model_dir = os.path.join(dga_dir, model_tag)
        os.makedirs(dga_dir, exist_ok=True)
        shutil.rmtree(model_dir, ignore_errors=True)
        z.extractall(dga_dir)
        click.echo(f'files saved to {model_dir}')

        # read files as needed
        z.close()

    def get_model_filename(pattern):
        paths = list(Path(model_dir).glob(pattern))
        if not paths:
            client_error(
                f'{model_dir} missing files matching the pattern: {pattern}')
        if len(paths) > 1:
            client_error(
                f'{model_dir} contains multiple files matching the pattern: {pattern}'
            )

        return paths[0]

    @contextmanager
    def open_model_file(name):
        pattern = expected_ml_dga_patterns[name]
        with open(get_model_filename(pattern), 'r') as f:
            yield json.load(f)

    model_id, _ = os.path.basename(
        get_model_filename('dga_*_model.json')).rsplit('_', maxsplit=1)

    click.echo(
        f'Setting up DGA model: "{model_id}" on {client_info["name"]} ({client_info["version"]["number"]})'
    )

    # upload model
    ml_client = MlClient(es_client)
    ingest_client = IngestClient(es_client)

    existing_models = ml_client.get_trained_models()
    if model_id in [
            m['model_id']
            for m in existing_models.get('trained_model_configs', [])
    ]:
        if overwrite:
            ctx.invoke(remove_dga_model,
                       model_id=model_id,
                       es_client=es_client,
                       ml_client=ml_client,
                       ingest_client=ingest_client,
                       force=True)
        else:
            client_error(
                f'Model: {model_id} already exists on stack! Try --overwrite to force the upload'
            )

    click.secho('[+] Uploading model (may take a while)')

    with open_model_file('model') as model_file:
        try:
            ml_client.put_trained_model(model_id=model_id, body=model_file)
        except elasticsearch.ConnectionTimeout:
            msg = 'Connection timeout, try increasing timeout using `es --timeout <secs> experimental setup_dga_model`.'
            client_error(msg)

    # install scripts
    click.secho('[+] Uploading painless scripts')

    with open_model_file('dga_ngrams_create') as painless_install:
        es_client.put_script(id='dga_ngrams_create', body=painless_install)
        # f'{model_id}_dga_ngrams_create'

    with open_model_file('dga_ngrams_transform_delete') as painless_delete:
        es_client.put_script(id='dga_ngrams_transform_delete',
                             body=painless_delete)
        # f'{model_id}_dga_ngrams_transform_delete'

    # Install ingest pipelines
    click.secho('[+] Uploading pipelines')

    def _build_es_script_error(err, pipeline_file):
        error = err.info['error']
        cause = error['caused_by']

        error_msg = [
            f'Script error while uploading {pipeline_file}: {cause["type"]} - {cause["reason"]}',
            ' '.join(f'{k}: {v}' for k, v in error['position'].items()),
            '\n'.join(error['script_stack'])
        ]

        return click.style('\n'.join(error_msg), fg='red')

    with open_model_file('dns_enrich_pipeline') as ingest_pipeline1:
        try:
            ingest_client.put_pipeline(id='dns_enrich_pipeline',
                                       body=ingest_pipeline1)
        except elasticsearch.RequestError as e:
            if e.error == 'script_exception':
                client_error(_build_es_script_error(e, 'ingest_pipeline1'),
                             e,
                             ctx=ctx)
            else:
                raise

    with open_model_file(
            'dns_dga_inference_enrich_pipeline') as ingest_pipeline2:
        try:
            ingest_client.put_pipeline(id='dns_dga_inference_enrich_pipeline',
                                       body=ingest_pipeline2)
        except elasticsearch.RequestError as e:
            if e.error == 'script_exception':
                client_error(_build_es_script_error(e, 'ingest_pipeline2'),
                             e,
                             ctx=ctx)
            else:
                raise

    click.echo('Ensure that you have updated your packetbeat.yml config file.')
    click.echo('    - reference: ML_DGA.md #2-update-packetbeat-configuration')
    click.echo(
        'Associated rules and jobs can be found under ML-experimental-detections releases in the repo'
    )
    click.echo('To upload rules, run: kibana upload-rule <ml-rule.toml>')
    click.echo(
        'To upload ML jobs, run: es experimental upload-ml-job <ml-job.json>')
    def createElasticSearchIngestPipeline(self):
        esIngestClient = IngestClient(self.client)

        self.constructLanguagePipeline(
            esIngestClient, 'title_language_detector', 'Work title language detection',
            field='title.'
        )

        self.constructLanguagePipeline(
            esIngestClient, 'alt_title_language_detector', 'Work alt_title language detection',
            prefix='_ingest._value.'
        )

        self.constructLanguagePipeline(
            esIngestClient, 'edition_title_language_detector', 'Edition title language detection',
            prefix='_ingest._value.',
            field='title.'
        )

        self.constructLanguagePipeline(
            esIngestClient, 'edition_sub_title_language_detector', 'Edition subtitle language detection',
            prefix='_ingest._value.',
            field='sub_title.'
        )

        self.constructLanguagePipeline(
            esIngestClient, 'subject_heading_language_detector', 'Subject heading language detection',
            prefix='_ingest._value.',
            field='heading.'
        )

        esIngestClient.put_pipeline(
            id='foreach_alt_title_language_detector',
            body={
                'description': 'loop for parsing alt_titles',
                'processors': [
                    {
                        'foreach': {
                            'field': 'alt_titles',
                            'processor': {
                                'pipeline': {
                                    'name': 'alt_title_language_detector',
                                }
                            }
                        }
                    }
                ]
            }
        )

        esIngestClient.put_pipeline(
            id='edition_language_detector',
            body={
                'description': 'loop for parsing edition fields',
                'processors': [
                    {
                        'pipeline': {
                            'name': 'edition_title_language_detector',
                            'ignore_failure': True
                        }
                    },
                    {
                        'pipeline': {
                            'name': 'edition_sub_title_language_detector',
                            'ignore_failure': True
                        }
                    }
                ]
            }
        )

        esIngestClient.put_pipeline(
            id='language_detector',
            body={
                'description': 'Full language processing',
                'processors': [
                    {
                        'pipeline': {
                            'name': 'title_language_detector',
                            'ignore_failure': True
                        }
                    },
                    {
                        'pipeline': {
                            'name': 'foreach_alt_title_language_detector',
                            'ignore_failure': True
                        }
                    },
                    {
                        'foreach': {
                            'field': 'editions',
                            'processor': {
                                'pipeline': {
                                    'name': 'edition_language_detector',
                                    'ignore_failure': True
                                }
                            }
                        }
                    },
                    {
                        'foreach': {
                            'field': 'subjects',
                            'ignore_missing': True,
                            'processor': {
                                'pipeline': {
                                    'name': 'subject_heading_language_detector',
                                    'ignore_failure': True
                                }
                            }
                        }
                    }
                ]
            }
        )
Exemple #12
0
            "pl": 1.9,
            "pw": 0.4
        }
    }]
}

# simulate ingest pipeline
IngestClient.simulate(es, body)

# In[ ]:

# store the pipeline for use in prod
pipeline_name = model_id + '_ingest_pipeline'
body = {'description': 'predict flower type', 'processors': processors}

IngestClient.put_pipeline(es, id=pipeline_name, body=body)

# In[ ]:

# verify pipeline
IngestClient.get_pipeline(es, pipeline_name)

# In[ ]:

# create index template with our new pipeline as the default pipeline

settings = {
    "index_patterns": ["flower_measurements-*"],
    "settings": {
        "default_pipeline": "jeffs-rfc-flower-type_ingest_pipeline"
    }
Exemple #13
0
def make_pipelines():
    with open(os.path.join(pipelines_dir, "pipelines.json")) as file:
        pipelines = json.load(file)
        ing_client = IngestClient(client)
        for key in pipelines.keys():
            print("Creating {0} created {1}".format(key,ing_client.put_pipeline(key, pipelines[key])))