Code example #1
File: ElasticSearchDB.py  Project: TaykYoku/DIRAC
# Assumed imports for this snippet (the source page omits them). The import
# paths for the DIRAC-internal names are best-effort guesses; sLog, ifConnected
# and generateDocs are helpers defined elsewhere in the same DIRAC module.
import json
from datetime import datetime, timedelta

import certifi
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import (ConnectionError, NotFoundError,
                                      RequestError, TransportError)
from elasticsearch.helpers import BulkIndexError, bulk
from elasticsearch_dsl import A, Q, Search

from DIRAC import S_ERROR, S_OK
from DIRAC.Core.Utilities import DErrno, TimeUtilities
from DIRAC.FrameworkSystem.Client.BundleDeliveryClient import BundleDeliveryClient


class ElasticSearchDB(object):
    """
    .. class:: ElasticSearchDB

    :param str url: the url to the database for example: el.cern.ch:9200
    :param str gDebugFile: is used to save the debug information to a file
    :param int timeout: the default timeout for requests to Elasticsearch
    :param int RESULT_SIZE: The number of data points which will be returned by the query.
    """

    __url = ""
    __timeout = 120
    clusterName = ""
    RESULT_SIZE = 10000

    ########################################################################
    def __init__(
        self,
        host,
        port,
        user=None,
        password=None,
        indexPrefix="",
        useSSL=True,
        useCRT=False,
        ca_certs=None,
        client_key=None,
        client_cert=None,
    ):
        """c'tor

        :param self: self reference
        :param str host: the host name of the Elasticsearch instance, for example: el.cern.ch
        :param int port: the port on which the Elasticsearch instance is listening, for example: 9200
        :param str user: user name to access the db
        :param str password: if the db is password protected we need to provide a password
        :param str indexPrefix: the prefix used to select all indexes belonging to this setup
        :param bool useSSL: a secure connection is used by default; set to False to disable it.
        :param bool useCRT: Use certificates.
        :param str ca_certs: CA certificates bundle.
        :param str client_key: Client key.
        :param str client_cert: Client certificate.
        """

        self.__indexPrefix = indexPrefix
        self._connected = False
        if user and password:
            sLog.debug("Specified username and password")
            if port:
                self.__url = "https://%s:%s@%s:%d" % (user, password, host,
                                                      port)
            else:
                self.__url = "https://%s:%s@%s" % (user, password, host)
        else:
            sLog.debug("Username and password not specified")
            if port:
                self.__url = "http://%s:%d" % (host, port)
            else:
                self.__url = "http://%s" % host

        if port:
            sLog.verbose("Connecting to %s:%s, useSSL = %s" %
                         (host, port, useSSL))
        else:
            sLog.verbose("Connecting to %s, useSSL = %s" % (host, useSSL))

        if useSSL:
            if ca_certs:
                casFile = ca_certs
            else:
                bd = BundleDeliveryClient()
                retVal = bd.getCAs()
                casFile = None
                if not retVal["OK"]:
                    sLog.error("CAs file does not exists:", retVal["Message"])
                    casFile = certifi.where()
                else:
                    casFile = retVal["Value"]

            self.client = Elasticsearch(self.__url,
                                        timeout=self.__timeout,
                                        use_ssl=True,
                                        verify_certs=True,
                                        ca_certs=casFile)
        elif useCRT:
            self.client = Elasticsearch(
                self.__url,
                timeout=self.__timeout,
                use_ssl=True,
                verify_certs=True,
                ca_certs=ca_certs,
                client_cert=client_cert,
                client_key=client_key,
            )
        else:
            self.client = Elasticsearch(self.__url, timeout=self.__timeout)

        # Before we use the database we try to connect
        # and retrieve the cluster name

        try:
            if self.client.ping():
                # Returns True if the cluster is running, False otherwise
                result = self.client.info()
                self.clusterName = result.get("cluster_name", " ")  # pylint: disable=no-member
                sLog.info("Database info\n", json.dumps(result, indent=4))
                self._connected = True
            else:
                sLog.error("Cannot ping ElasticsearchDB!")
        except ConnectionError as e:
            sLog.error(repr(e))

    ########################################################################
    def getIndexPrefix(self):
        """
        It returns the index prefix (the DIRAC setup name used to prefix the indexes).
        """
        return self.__indexPrefix

    ########################################################################
    @ifConnected
    def query(self, index, query):
        """Executes a query and returns its result (uses ES DSL language).

        :param self: self reference
        :param str index: index name
        :param dict query: It is the query in ElasticSearch DSL language

        """
        try:
            esDSLQueryResult = self.client.search(index=index, body=query)
            return S_OK(esDSLQueryResult)
        except RequestError as re:
            return S_ERROR(re)

    @ifConnected
    def update(self, index, query=None, updateByQuery=True, id=None):
        """Executes an update of a document, and returns S_OK/S_ERROR

        :param self: self reference
        :param str index: index name
        :param dict query: It is the query in ElasticSearch DSL language
        :param bool updateByQuery: if True, update the documents matching the query (update_by_query);
                                   if False, index the given body under the given id.
        :param int id: ID for the document to be created.

        """

        sLog.debug("Updating %s with %s, updateByQuery=%s, id=%s" %
                   (index, query, updateByQuery, id))

        if not index or not query:
            return S_ERROR("Missing index or query")

        try:
            if updateByQuery:
                esDSLQueryResult = self.client.update_by_query(index=index,
                                                               body=query)
            else:
                esDSLQueryResult = self.client.index(index=index,
                                                     body=query,
                                                     id=id)
            return S_OK(esDSLQueryResult)
        except RequestError as re:
            return S_ERROR(re)

    @ifConnected
    def _Search(self, indexname):
        """
        It returns a Search object which can be used for retrieving certain values from the DB.
        """
        return Search(using=self.client, index=indexname)

    ########################################################################
    def _Q(self, name_or_query="match", **params):
        """
        It is a wrapper to ElasticDSL Query module used to create a query object.
        :param str name_or_query: the type of the query
        """
        return Q(name_or_query, **params)

    def _A(self, name_or_agg, aggsfilter=None, **params):
        """
        It is a wrapper to ElasticDSL aggregation module, used to create an aggregation
        """
        return A(name_or_agg, aggsfilter, **params)

    ########################################################################
    @ifConnected
    def getIndexes(self, indexName=None):
        """
        It returns the available indexes...
        """
        if not indexName:
            indexName = self.__indexPrefix
        sLog.debug("Getting indices alias of %s" % indexName)
        # we only return indexes which belong to a specific prefix, for example 'lhcb-production' or 'dirac-production', etc.
        return list(self.client.indices.get_alias("%s*" % indexName))

    ########################################################################
    @ifConnected
    def getDocTypes(self, indexName):
        """
        Returns mappings, by index.

        :param str indexName: is the name of the index...
        :return: S_OK or S_ERROR
        """
        result = []
        try:
            sLog.debug("Getting mappings for ", indexName)
            result = self.client.indices.get_mapping(indexName)
        except Exception as e:  # pylint: disable=broad-except
            sLog.exception()
            return S_ERROR(e)

        doctype = ""
        for indexConfig in result:
            if not result[indexConfig].get("mappings"):
                # the mapping may exist with a None value...
                # this is usually an empty or a corrupted index.
                sLog.warn("Index does not have mapping %s!" % indexConfig)
                continue
            doctype = result[indexConfig]["mappings"]
            break  # we assume the mappings of all indexes are the same...

        if not doctype:
            return S_ERROR("%s does not exists!" % indexName)

        return S_OK(doctype)

    ########################################################################
    @ifConnected
    def existingIndex(self, indexName):
        """
        Checks the existence of an index, by its name

        :param str indexName: the name of the index
        :returns: S_OK(bool) if the request is successful, S_ERROR otherwise
        """
        sLog.debug("Checking existance of index %s" % indexName)
        try:
            return S_OK(self.client.indices.exists(indexName))
        except TransportError as e:
            sLog.exception()
            return S_ERROR(e)

    ########################################################################

    @ifConnected
    def createIndex(self, indexPrefix, mapping=None, period="day"):
        """
        :param str indexPrefix: it is the index name.
        :param dict mapping: the configuration of the index.
        :param str period: the kind of index to create; 'day', 'week', 'month',
                           'year' and 'null' (non-periodic) are supported.

        """
        if period is not None:
            fullIndex = self.generateFullIndexName(
                indexPrefix,
                period)  # we have to create an index each period...
        else:
            sLog.warn(
                "The period is not provided, so using non-periodic index names"
            )
            fullIndex = indexPrefix

        res = self.existingIndex(fullIndex)
        if not res["OK"]:
            return res
        elif res["Value"]:
            return S_OK(fullIndex)

        try:
            sLog.info("Create index: ", fullIndex + str(mapping))
            self.client.indices.create(index=fullIndex,
                                       body={"mappings": mapping})  # ES7

            return S_OK(fullIndex)
        except Exception as e:  # pylint: disable=broad-except
            sLog.error("Can not create the index:", repr(e))
            return S_ERROR("Can not create the index")

    @ifConnected
    def deleteIndex(self, indexName):
        """
        :param str indexName: the name of the index to be deleted...
        """
        sLog.info("Deleting index", indexName)
        try:
            retVal = self.client.indices.delete(indexName)
        except NotFoundError:
            sLog.warn("Index does not exist", indexName)
            return S_OK("Noting to delete")
        except ValueError as e:
            return S_ERROR(DErrno.EVALUE, e)

        if retVal.get("acknowledged"):
            # if the value exists and the value is not None
            sLog.info("Deleted index", indexName)
            return S_OK(indexName)

        return S_ERROR(retVal)

    def index(self, indexName, body=None, docID=None, op_type="index"):
        """
        :param str indexName: the name of the index to be used
        :param dict body: the data which will be indexed (basically the JSON)
        :param int docID: optional document id
        :param str op_type: Explicit operation type. (options: 'index' (default) or 'create')
        :return: the index name in case of success.
        """

        sLog.debug("Indexing in %s body %s, id=%s" % (indexName, body, docID))

        if not indexName or not body:
            return S_ERROR("Missing index or body")

        try:
            res = self.client.index(index=indexName,
                                    body=body,
                                    id=docID,
                                    params={"op_type": op_type})
        except (RequestError, TransportError) as e:
            sLog.exception()
            return S_ERROR(e)

        if res.get("created") or res.get("result") in ("created", "updated"):
            # the created index exists but the value can be None.
            return S_OK(indexName)

        return S_ERROR(res)

    @ifConnected
    def bulk_index(self,
                   indexPrefix,
                   data=None,
                   mapping=None,
                   period="day",
                   withTimeStamp=True):
        """
        :param str indexPrefix: index name.
        :param list data: contains a list of dictionaries
        :param dict mapping: the mapping used by elasticsearch
        :param str period: Accepts 'day' and 'month'. We can specify which kind of indexes will be created.
        :param bool withTimeStamp: add timestamp to data, if not there already.

        :returns: S_OK/S_ERROR
        """
        sLog.verbose("Bulk indexing",
                     "%d records will be inserted" % len(data))
        if mapping is None:
            mapping = {}

        if period is not None:
            indexName = self.generateFullIndexName(indexPrefix, period)
        else:
            indexName = indexPrefix
        sLog.debug("Bulk indexing into %s of %s" % (indexName, data))

        res = self.existingIndex(indexName)
        if not res["OK"]:
            return res
        if not res["Value"]:
            retVal = self.createIndex(indexPrefix, mapping, period)
            if not retVal["OK"]:
                return retVal

        try:
            res = bulk(client=self.client,
                       index=indexName,
                       actions=generateDocs(data, withTimeStamp))
        except (BulkIndexError, RequestError) as e:
            sLog.exception()
            return S_ERROR(e)

        if res[0] == len(data):
            # we have inserted all documents...
            return S_OK(len(data))
        else:
            return S_ERROR(res)

    @ifConnected
    def getUniqueValue(self, indexName, key, orderBy=False):
        """
        :param str indexName: the name of the index which will be used for the query
        :param str key: the key for which the unique values are returned
        :param dict orderBy: a dictionary used to order the result, e.g. {key: 'desc'} or {key: 'asc'}
        :returns: a list of unique values for a certain key from the dictionary.
        """

        query = self._Search(indexName)

        endDate = datetime.utcnow()

        startDate = endDate - timedelta(days=30)

        timeFilter = self._Q(
            "range",
            timestamp={
                "lte": int(TimeUtilities.toEpoch(endDate)) * 1000,
                "gte": int(TimeUtilities.toEpoch(startDate)) * 1000,
            },
        )
        query = query.filter("bool", must=timeFilter)
        if orderBy:
            query.aggs.bucket(key,
                              "terms",
                              field=key,
                              size=self.RESULT_SIZE,
                              order=orderBy).metric(key,
                                                    "cardinality",
                                                    field=key)
        else:
            query.aggs.bucket(key, "terms", field=key,
                              size=self.RESULT_SIZE).metric(key,
                                                            "cardinality",
                                                            field=key)

        try:
            query = query.extra(
                size=self.RESULT_SIZE)  # cap the number of returned documents
            sLog.debug("Query", query.to_dict())
            result = query.execute()
        except TransportError as e:
            return S_ERROR(e)

        values = []
        for bucket in result.aggregations[key].buckets:
            values += [bucket["key"]]
        del query
        sLog.debug("Nb of unique rows retrieved", len(values))
        return S_OK(values)

    def pingDB(self):
        """
        Try to connect to the database

        :return: S_OK(True/False)
        """
        connected = False
        try:
            connected = self.client.ping()
        except ConnectionError as e:
            sLog.error("Cannot connect to the db", repr(e))
        return S_OK(connected)

    @ifConnected
    def deleteByQuery(self, indexName, query):
        """
        Delete data by query (careful!)

        :param str indexName: the name of the index
        :param dict query: the JSON-formatted query for which we want to issue the delete
        """
        try:
            self.client.delete_by_query(index=indexName, body=query)
        except Exception as inst:
            sLog.error("ERROR: Couldn't delete data")
            return S_ERROR(inst)
        return S_OK("Successfully deleted data from index %s" % indexName)

    @staticmethod
    def generateFullIndexName(indexName, period):
        """
        Given an index prefix we create the actual index name.

        :param str indexName: it is the name of the index
        :param str period: We can specify which kind of indexes will be created (day, week, month, year, null).
        :returns: string with full index name
        """

        # if the period is not correct, we use no-period indexes (same as "null").
        if period.lower() not in ["day", "week", "month", "year", "null"]:
            sLog.error("Period is not correct: ", period)
            return indexName
        elif period.lower() == "day":
            today = datetime.today().strftime("%Y-%m-%d")
            return "%s-%s" % (indexName, today)
        elif period.lower() == "week":
            week = datetime.today().isocalendar()[1]
            return "%s-%s" % (indexName, week)
        elif period.lower() == "month":
            month = datetime.today().strftime("%Y-%m")
            return "%s-%s" % (indexName, month)
        elif period.lower() == "year":
            year = datetime.today().strftime("%Y")
            return "%s-%s" % (indexName, year)
        elif period.lower() == "null":
            return indexName
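
A minimal usage sketch of the class above (the host, credentials, mapping and document are hypothetical, and a reachable Elasticsearch node is assumed; inside DIRAC the class is normally instantiated by the monitoring layer rather than directly):

db = ElasticSearchDB(host="es.example.org", port=9200,
                     user="dirac", password="secret",
                     indexPrefix="lhcb-production")

# generateFullIndexName derives the periodic index name from the prefix,
# e.g. on 2024-05-01:
#   generateFullIndexName("lhcb-production", "day")   -> "lhcb-production-2024-05-01"
#   generateFullIndexName("lhcb-production", "month") -> "lhcb-production-2024-05"
res = db.createIndex("lhcb-production",
                     mapping={"properties": {"status": {"type": "keyword"}}})
if res["OK"]:
    db.index(res["Value"], body={"status": "Done"})
    print(db.query(res["Value"], {"query": {"match_all": {}}}))
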
Code example #2
# Assumed imports for this snippet: stdlib logging and the opensearch-py client.
import logging

from opensearchpy import OpenSearch, exceptions


def test_es_basic_operations():
    """Run basic operations for testing purposes."""

    es = OpenSearch([{"host": "localhost", "port": 9200}])

    try:
        logging.debug("Deleting existing test data")
        es.delete(index="unit-test-index", doc_type="test", id=1)
    except exceptions.NotFoundError:
        pass

    logging.debug("Adding test data")
    r = es.index(
        index="unit-test-index",
        doc_type="test",
        id=1,
        body={
            "name": "Koira Koiruli Pöö",
            "height": "49",
            "mass": "10",
            "hair_color": "blond",
            "birth_year": "1999",
            "gender": "male",
        },
    )

    assert r["result"] == "created"

    es.indices.refresh(index="unit-test-index")
    r = es.get(index="unit-test-index", doc_type="test", id=1)
    assert r["_id"] == "1"

    s = es.search(index="unit-test-index",
                  body={"query": {
                      "match": {
                          "name": "cat"
                      }
                  }})
    hits = s["hits"]["total"]["value"]
    assert hits == 0

    s = es.search(index="unit-test-index", body={"query": {"match_all": {}}})
    logging.debug(s)
    hits = s["hits"]["total"]["value"]
    assert hits == 1

    s = es.search(index="unit-test-index",
                  body={"query": {
                      "match": {
                          "mass": "10"
                      }
                  }})
    logging.debug(s)
    hits = s["hits"]["total"]["value"]
    assert hits == 1

    s = es.search(index="unit-test-index",
                  body={"query": {
                      "match": {
                          "name": "Koiruli"
                      }
                  }})
    logging.debug(s)
    hits = s["hits"]["total"]["value"]
    assert hits == 1

    logging.debug("Deleting test data")
    es.delete(index="unit-test-index", doc_type="test", id=1)
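
The test assumes a single-node instance listening on localhost:9200, and a client/server version that still accepts the legacy doc_type parameter (removed in newer Elasticsearch and OpenSearch majors). A minimal runner sketch:

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    test_es_basic_operations()
    print("all assertions passed")
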
Code example #3
File: esClient.py  Project: leegggg/py_utils_linyz
# Assumed imports for this snippet; FootPrintCreator and bookMapping are
# project-internal helpers from leegggg/py_utils_linyz and are not shown here.
from datetime import datetime

from opensearchpy import OpenSearch


class SearchEngineClient:
    def __init__(self, auth, host='localhost', port=9200, index_name='book'):
        self.fingerprintCreator = FootPrintCreator()
        self.index_name = index_name
        self.client = OpenSearch(
            hosts=[{'host': host, 'port': port}],
            http_compress=True,  # enables gzip compression for request bodies
            http_auth=auth,
            # client_cert = client_cert_path,
            # client_key = client_key_path,
            use_ssl=True,
            verify_certs=False,
            ssl_assert_hostname=False,
            ssl_show_warn=False,
            # ca_certs = ca_certs_path
        )
        if not self.client.indices.exists(index=index_name):
            response = self.client.indices.create(index=index_name, body=bookMapping)
            print('\nCreating index:')
            print(response)

    def post(self, book_uid: str, content: str, title: str, path="", part_no=0, fingerprint=None, tags_text=None, tags=None, ts=None):
        """
        item = {
            "book_uid": bookUid,   # required
            "path": str(textFile),
            "part_no": 0,
            "title": title,        # required
            "content": content,    # required
            "fingerprint": fingerprint,
            "tags_text": ",".join(tags),
            "tags": tags,
            "ts": datetime.now()
        }
        """

        if not fingerprint:
            fingerprint = self.fingerprintCreator.gen(content)

        if not ts:
            ts = datetime.now()

        if not tags:
            tags = fingerprint.split(",")

        if not tags_text:
            tags_text = ",".join(tags)

        body = {
            "book_uid": book_uid,
            "path": path,
            "part_no": part_no,
            "title": title,
            "content": content,
            "fingerprint": fingerprint,
            "tags_text": tags_text,
            "tags": tags,
            "ts": ts
        }

        response = self.client.index(
            id=book_uid,
            index=self.index_name,
            body=body,
            refresh=True
        )

        return response
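
A hypothetical call could look as follows (the credentials and the document fields are placeholders):

client = SearchEngineClient(auth=("admin", "admin"))
response = client.post(
    book_uid="book-0001",
    title="Example Book",
    content="Full text of the first chapter...",
    tags=["fiction", "example"],
)
print(response["result"])  # "created" on first insert, "updated" afterwards

Note that the document id is the book_uid alone and refresh=True is passed, so re-posting the same book (even with a different part_no) overwrites the previous document and is immediately searchable.
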
Code example #4
# Assumed imports for this snippet. `settings` comes from the surrounding
# project (e.g. django.conf.settings); IndexableData is declared here as a
# plain TypeVar so the snippet is self-contained.
import logging
from abc import ABC, abstractmethod
from dataclasses import asdict, is_dataclass
from typing import Generic, List, Optional, Tuple, TypeVar

from opensearchpy import ConnectionError, NotFoundError, OpenSearch
from opensearchpy.helpers import bulk

logger = logging.getLogger(__name__)
IndexableData = TypeVar("IndexableData")


class Importer(ABC, Generic[IndexableData]):
    """Base class for importers.

    This base class provides functionality for ingesting data into Elasticsearch. A
    subclass needs to provide index_base_names (normally just one; multiple if you need
    to import multiple kinds of data with the same importer) and implement run(), which
    will be called and should carry out the actual importing / ingesting process.
    Basically, it should call apply_mapping() once at the beginning if needed, and then
    add documents using the add_data() method.

    For every index_base_name there are actually two indexes in use. Let's use the name
    "location" as an example here. Then there will be actual indexes "location_1" and
    "location_2". In addition, two index aliases are used: "location" (the same as the
    base name) and "location_wip".

    The idea is that normally the data will be in one of the indexes, and "location"
    alias will point to that index. When new data is being imported, the other free
    index is used for that, and "location_wip" alias will point to that index. Once
    the import is finished, the "location" alias is swapped to the new index, and the
    old index is removed.

    A special case is when data is being imported for the first time. In that case, the
    "location" alias will also point to the yet-to-be-finished index so that one
    doesn't need to wait for the import to finish to get some data available.
    """

    index_base_names: Tuple[str, ...]

    def __init__(self) -> None:
        if not getattr(self, "index_base_names", None):
            raise NotImplementedError(
                f"Importer {self.__class__.__name__} is missing index_base_names."
            )
        self.es = OpenSearch([settings.ES_URI])

    @abstractmethod
    def run(self) -> None:
        pass

    def base_run(self):
        self._initialize()
        self.run()
        self._finish()

    def add_data(
        self,
        data: IndexableData,
        index_base_name: Optional[str] = None,
        extra_params: Optional[dict] = None,
    ) -> None:
        index_name = self._get_wip_alias(index_base_name
                                         or self.index_base_names[0])
        body = asdict(data) if is_dataclass(data) else data
        try:
            self.es.index(index=index_name, body=body, **(extra_params or {}))
        except ConnectionError as e:
            logger.error(e)

    def add_data_bulk(
        self,
        data: List[IndexableData],
        index_base_name: Optional[str] = None,
    ) -> None:
        index_name = self._get_wip_alias(index_base_name
                                         or self.index_base_names[0])

        body = [{
            "_index": index_name,
            "_source": asdict(d) if is_dataclass(d) else d
        } for d in data]
        try:
            bulk(self.es, body)
        except ConnectionError as e:
            logger.error(e)

    def apply_mapping(self,
                      mapping: dict,
                      index_base_name: Optional[str] = None):
        index_name = self._get_wip_alias(index_base_name
                                         or self.index_base_names[0])
        logger.debug(f"Applying custom mapping to index {index_name}")
        self.es.indices.put_mapping(index=index_name, body=mapping)

    def delete_all_data(self) -> None:
        for alias in self.index_base_names:
            for index_name in (f"{alias}_1", f"{alias}_2"):
                self._delete_index(index_name)

    def _initialize(self) -> None:
        for active_alias in self.index_base_names:
            logger.debug(f"Initializing {active_alias}")

            indexes = (f"{active_alias}_1", f"{active_alias}_2")
            active_index = self._get_index_from_es(active_alias)
            wip_index = indexes[1] if active_index == indexes[0] else indexes[0]

            # Clean everything except the possibly active index and alias
            wip_alias = self._get_wip_alias(active_alias)
            for index in indexes:
                if index != active_index:
                    self._delete_index(index)
            self._delete_alias(wip_alias)

            wip_index_aliases = {wip_alias}
            if not active_index:
                # Make wip data available when there is no existing index. This is
                # most useful in development where one doesn't need to run the whole
                # import to get some data available.
                wip_index_aliases.add(active_alias)

            logger.debug(
                f"Creating wip index {wip_index} with aliases {wip_index_aliases}"
            )
            self.es.indices.create(
                index=wip_index,
                body={"aliases": {w: {}
                                  for w in wip_index_aliases}})

    def _finish(self) -> None:
        for active_alias in self.index_base_names:
            logger.debug(f"Finishing {active_alias}")

            wip_alias = self._get_wip_alias(active_alias)
            old_active_index = self._get_index_from_es(active_alias)
            wip_index = self._get_index_from_es(wip_alias)

            # Swap active alias to the wip index, delete the wip alias and old active
            # index as long as it is not the same as the wip index
            self.es.indices.update_aliases(
                body={
                    "actions": [
                        {
                            "add": {
                                "index": wip_index,
                                "alias": active_alias
                            }
                        },
                        {
                            "remove": {
                                "index": f"{active_alias}_*",
                                "alias": wip_alias,
                            }
                        },
                    ]
                })
            if old_active_index and old_active_index != wip_index:
                self._delete_index(old_active_index)

    def _delete_index(self, index) -> None:
        logger.debug(f"Deleting index {index}")
        try:
            response = self.es.indices.delete(index=index, ignore=404)
            logger.debug(response)
        except NotFoundError as e:
            if e.error == "index_not_found_exception":
                logger.debug(f"Index {index} does not exist")
            else:
                raise e

    def _delete_alias(self, alias: str) -> None:
        logger.debug(f"Deleting alias {alias}")
        try:
            self.es.indices.delete_alias(index="*", name=alias, ignore=404)
        except NotFoundError:
            pass

    def _get_index_from_es(self, alias: str) -> Optional[str]:
        try:
            return next(iter(self.es.indices.get_alias(name=alias)))
        except (NotFoundError, StopIteration):
            return None

    @staticmethod
    def _get_wip_alias(alias: str) -> str:
        return f"{alias}_wip"