Example #1
                "must_not": {
                    "exists": {
                        "field": "processedText"
                    }            
                }
            }
        }
    }, index=indexName)

if len(results['hits']['hits']) > 0:
    print("Records Found:", len(results['hits']['hits']), "Processing Now")
    from bs4 import BeautifulSoup
    for hit in results['hits']['hits']:
        print("Processing", hit['_id'])
        soup = BeautifulSoup(hit['_source']['html'], 'html.parser')
        # Drop script and style elements so only the visible text remains
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        # Collapse whitespace: strip each line, break multi-phrase lines apart, re-join
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        body = {
            "doc": {
                "processedText": text
            }
        }
        _update = es.update(index=hit['_index'], id=hit['_id'], body=body)
        print("HTML Cleaned, and Converted to Searchable Text, Updating Doc", _update)

print("Quitting Now")
Example #2
class ELmonocleDB:

    log = logging.getLogger("monocle.ELmonocleDB")

    def __init__(
        self,
        elastic_conn="localhost:9200",
        index=None,
        timeout=10,
        prefix=CHANGE_PREFIX,
        create=True,
        previous_schema=False,
        idents_config: Optional[IdentsConfig] = None,
        user=None,
        password=None,
        use_ssl=None,
        verify_certs=None,
        ssl_show_warn=None,
    ) -> None:
        host, port = elastic_conn.split(":")
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ip = socket.gethostbyname(host)
        self.log.info("ES IP is %s" % ip)
        self.log.info("ES prefix is %s" % prefix)

        elastic_conn = [
            {
                "host": host,
                "port": port,
            }
        ]

        if use_ssl:
            elastic_conn[0]["use_ssl"] = use_ssl

        if not verify_certs:
            elastic_conn[0]["verify_certs"] = verify_certs

        if not ssl_show_warn:
            elastic_conn[0]["ssl_show_warn"] = ssl_show_warn

        if user and password:
            elastic_conn[0]["http_auth"] = "%s:%s" % (user, password)

        while True:
            try:
                s.connect((ip, int(port)))
                s.shutdown(2)
                s.close()
                break
            except Exception as excpt:
                self.log.info(
                    "Unable to connect to %s: %s. Sleeping for %ds."
                    % (elastic_conn, excpt, timeout)
                )
                time.sleep(timeout)

        self.log.info("Connecting to ES server at %s" % elastic_conn)
        self.es = Elasticsearch(elastic_conn)
        self.log.info(self.es.info())

        if previous_schema:
            self.prefix = PREV_CHANGE_PREFIX
        else:
            self.prefix = prefix

        if not index:
            self.log.info("No index provided")
            return

        self.idents_config = idents_config or []

        self.index = "{}{}".format(self.prefix, index)
        self.log.info("Using ES index %s" % self.index)
        self.mapping = {
            "properties": {
                "id": {"type": "keyword"},
                "type": {"type": "keyword"},
                "number": {"type": "keyword"},
                "change_id": {"type": "keyword"},
                "title": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "text": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "url": {"type": "keyword"},
                "commit_count": {"type": "integer"},
                "additions": {"type": "integer"},
                "deletions": {"type": "integer"},
                "changed_files_count": {"type": "integer"},
                "changed_files": {
                    "properties": {
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "path": {"type": "keyword"},
                    }
                },
                "commits": {
                    "properties": {
                        "sha": {"type": "keyword"},
                        "author": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "committer": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "authored_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "committed_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "title": {"type": "text"},
                    }
                },
                "repository_prefix": {"type": "keyword"},
                "repository_fullname": {"type": "keyword"},
                "repository_shortname": {"type": "keyword"},
                "author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "on_author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "committer": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "merged_by": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "branch": {"type": "keyword"},
                "target_branch": {"type": "keyword"},
                "created_at": {"type": "date", "format": "date_time_no_millis"},
                "on_created_at": {"type": "date", "format": "date_time_no_millis"},
                "merged_at": {"type": "date", "format": "date_time_no_millis"},
                "updated_at": {"type": "date", "format": "date_time_no_millis"},
                "closed_at": {"type": "date", "format": "date_time_no_millis"},
                "state": {"type": "keyword"},
                "duration": {"type": "integer"},
                "mergeable": {"type": "keyword"},
                "labels": {"type": "keyword"},
                "assignees": {
                    "type": "nested",
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    },
                },
                "approval": {"type": "keyword"},
                "draft": {"type": "boolean"},
                "self_merged": {"type": "boolean"},
                "crawler_metadata": {
                    "properties": {
                        "last_commit_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "last_post_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "total_docs_posted": {"type": "integer"},
                        "total_changes_updated": {"type": "integer"},
                        "total_orphans_updated": {"type": "integer"},
                    }
                },
                "tasks_data": {
                    "properties": {
                        "tid": {"type": "keyword"},
                        "ttype": {"type": "keyword"},
                        "crawler_name": {"type": "keyword"},
                        "updated_at": {"type": "date", "format": "date_time_no_millis"},
                        "change_url": {"type": "keyword"},
                        "severity": {"type": "keyword"},
                        "priority": {"type": "keyword"},
                        "score": {"type": "integer"},
                        "url": {"type": "keyword"},
                        "title": {
                            "type": "text",
                            "fields": {
                                "keyword": {"type": "keyword", "ignore_above": 8191}
                            },
                        },
                        "_adopted": {"type": "boolean"},
                    }
                },
            }
        }
        settings = {"mappings": self.mapping}
        self.ic = self.es.indices
        if create:
            self.ic.create(index=self.index, ignore=400, body=settings)
        # The authors_histo query fails in some contexts when the time slice is
        # large, with this error: "Must be less than or equal to: [10000] but was
        # [10001]. This limit can be set by changing the [search.max_buckets]
        # cluster level setting". This is an attempt to mitigate the issue.
        cluster_settings = {"transient": {"search.max_buckets": 100000}}
        self.es.cluster.put_settings(body=cluster_settings)

    def update(self, source_it: List[Union[Change, Event]]) -> None:
        def gen(it):
            for _source in it:
                source = change_or_event_to_dict(_source)
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = source["id"]
                d["doc"] = source
                d["doc_as_upsert"] = True
                yield d

        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)

    def update_task_data(
        self,
        source_it: Union[
            List[TaskDataForEL],
            List[OrphanTaskDataForEL],
            List[AdoptedTaskDataForEL],
        ],
    ) -> Optional[BulkIndexError]:
        def gen(it):
            for _source in it:
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = _source._id
                d["doc"] = {}
                d["doc"].update({"id": _source._id})
                if isinstance(_source, TaskDataForEL):
                    d["doc"].update(
                        {"tasks_data": [asdict(td) for td in _source.tasks_data]}
                    )
                if isinstance(_source, OrphanTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                    d["doc"]["type"] = "OrphanTaskData"
                if isinstance(_source, AdoptedTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                d["doc_as_upsert"] = True
                yield d

        ret = None
        try:
            bulk(self.es, gen(source_it))
        except BulkIndexError as err:
            ret = err
        self.es.indices.refresh(index=self.index)
        return ret

    def compute_crawler_id_by_name(self, name, _type):
        return "crawler/%s/%s" % (_type, name)

    def get_task_crawler_metadata(self, name: str) -> Dict:
        try:
            ret = self.es.get(
                self.index, self.compute_crawler_id_by_name(name, "tasks_crawler")
            )
            return ret["_source"]["crawler_metadata"]
        except Exception:
            return {}

    def set_task_crawler_metadata(
        self, name: str, commit_date: datetime = None, push_infos: Dict = None
    ):
        metadata = {}
        if commit_date:
            metadata.update({"last_commit_at": commit_date})
        if push_infos:
            prev_metadata = self.get_task_crawler_metadata(name)
            metadata.update(
                {
                    "last_post_at": push_infos["last_post_at"],
                    "total_docs_posted": prev_metadata.get("total_docs_posted", 0)
                    + push_infos["total_docs_posted"],
                    "total_changes_updated": prev_metadata.get(
                        "total_changes_updated", 0
                    )
                    + push_infos["total_changes_updated"],
                    "total_orphans_updated": prev_metadata.get(
                        "total_orphans_updated", 0
                    )
                    + push_infos["total_orphans_updated"],
                }
            )
        body = {
            "doc": {"type": "TaskCrawlerDataCommit", "crawler_metadata": metadata},
            "doc_as_upsert": True,
        }
        ret = None
        try:
            self.es.update(
                self.index,
                self.compute_crawler_id_by_name(name, "tasks_crawler"),
                body=body,
            )
            self.es.indices.refresh(index=self.index)
        except Exception as err:
            ret = err
        return ret

    def delete_index(self):
        self.log.info("Deleting index: %s" % self.index)
        self.ic.delete(index=self.index)

    def delete_repository(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "regexp": {
                            "repository_fullname": {"value": repository_fullname}
                        }
                    }
                }
            }
        }
        params["body"] = body
        self.es.delete_by_query(**params)
        self.es.indices.refresh(index=self.index)

    def get_last_updated(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "sort": [{"updated_at": {"order": "desc"}}],
            "query": {
                "bool": {
                    "filter": [
                        {"term": {"type": "Change"}},
                        {
                            "regexp": {
                                "repository_fullname": {"value": repository_fullname}
                            }
                        },
                    ]
                }
            },
        }
        params["body"] = body
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        ret = [r["_source"] for r in res["hits"]["hits"]]
        if not ret:
            return []
        return ret[0]

    def get_changes_by_url(self, change_urls, size):
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "filter": [
                            {"term": {"type": "Change"}},
                            {"terms": {"url": change_urls}},
                        ]
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_by_change_urls(self, change_urls):
        assert len(change_urls) <= 50
        size = 5000  # Assuming no more than 100 task data items related to a change
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "must_not": {"exists": {"field": "tasks_data._adopted"}},
                        "filter": [
                            {"term": {"type": "OrphanTaskData"}},
                            {"terms": {"tasks_data.change_url": change_urls}},
                        ],
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_and_declare_adpotion(self, changes_url):
        assert len(changes_url) <= 50
        tds = self.get_orphan_tds_by_change_urls(changes_url)
        if tds:
            adopted_tds = [
                AdoptedTaskDataForEL(
                    _id=td["id"],
                    task_data=AdoptedTaskData(_adopted=True),
                )
                for td in tds
            ]
            self.update_task_data(adopted_tds)
        return tds

    def update_changes_with_orphan_tds(self, mapping: Dict[str, str]):
        change_urls = list(mapping.keys())
        while change_urls:
            change_urls_to_process = change_urls[:50]
            change_urls = change_urls[50:]
            tds = self.get_orphan_tds_and_declare_adpotion(change_urls_to_process)
            # Group tds in buckets by change_url
            _map: Dict[str, List] = dict()
            for td in tds:
                _map.setdefault(td["tasks_data"]["change_url"], []).append(
                    td["tasks_data"]
                )
            # Create update docs to attach tds to matching changes
            to_update = []
            for change_url, tds in _map.items():
                to_update.append(
                    TaskDataForEL(
                        _id=mapping[change_url],
                        tasks_data=createELTaskData(tds),
                    )
                )
            self.update_task_data(to_update)

    def run_named_query(self, name, *args, **kwargs):
        if name not in queries.public_queries:
            raise UnknownQueryException("Unknown query: %s" % name)
        return getattr(queries, name)(self.es, self.index, *args, **kwargs)

    def get_indices(self):
        return [
            ind.replace(self.prefix, "")
            for ind in self.es.indices.get(self.prefix + "*")
        ]

    def iter_index(self):
        body = {"query": {"match_all": {}}}
        return scan(self.es, query=body, index=self.index, size=5000)

    def update_idents(self) -> None:

        import json

        bulk_size = 7500

        def get_obj_hash(obj: Dict) -> int:
            obj_json = json.dumps(obj, sort_keys=True)
            return hash(obj_json)

        def update_ident(dict_ident: Dict) -> Dict:
            dict_ident["muid"] = create_muid(dict_ident["uid"], self.idents_config)
            return dict_ident

        def _update_idents(obj: Dict) -> Tuple[Optional[Union[Change, Event]], bool]:

            prev_hash = get_obj_hash(obj)

            if obj["type"] == "Change":
                obj["author"] = update_ident(obj["author"])
                if "committer" in obj:
                    obj["committer"] = update_ident(obj["committer"])
                if "merged_by" in obj:
                    obj["merged_by"] = update_ident(obj["merged_by"])
                if "assignees" in obj:
                    obj["assignees"] = list(map(update_ident, obj["assignees"]))
                if "commits" in obj:
                    for commit in obj["commits"]:
                        commit["author"] = update_ident(commit["author"])
                        commit["committer"] = update_ident(commit["committer"])
            if obj["type"] in get_events_list():
                if "author" in obj:
                    obj["author"] = update_ident(obj["author"])
                if "on_author" in obj:
                    obj["on_author"] = update_ident(obj["on_author"])
            updated = prev_hash != get_obj_hash(obj)
            if updated:
                return dict_to_change_or_event(obj), True
            else:
                return None, False

        def bulk_update(to_update: List) -> List:
            print("Updating %s objects ..." % len(to_update))
            self.update(to_update)
            return []

        to_update = []
        total_read = 0
        for _obj in self.iter_index():
            total_read += 1
            if total_read % bulk_size == 0:
                print("%s objects read from the database" % total_read)
            obj = _obj["_source"]
            obj, updated = _update_idents(obj)
            if updated:
                to_update.append(obj)
            if len(to_update) == bulk_size:
                to_update = bulk_update(to_update)

        bulk_update(to_update)
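A minimal usage sketch for ELmonocleDB, assuming an Elasticsearch server reachable on localhost:9200; the index name and repository below are illustrative assumptions:

# Hypothetical usage; "monocle" and the repository name are assumptions.
db = ELmonocleDB(elastic_conn="localhost:9200", index="monocle", create=True)
print(db.get_indices())                                 # indices under the configured prefix
last = db.get_last_updated("github.com/example/repo")   # latest Change, or [] when none
db.delete_index()                                        # drop the index when done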
Example #3
class ESConnector:
    """
    as many MS will communicate with ElasticSearch, centralize access
    with this library
    """

    def __init__(self,
                 host=None,
                 port=9200,
                 timeout=10,
                 local_env=False):
        self.host = host
        self.port = port
        self.timeout = timeout
        self.local_env = local_env
        self.es = None

    def _connect(self):
        """
        connect to a member of the ElasticSearch cluster
        """
        try:
            if self.local_env:
                self.es = Elasticsearch([{'host': self.host,
                                          'port': self.port}])
            else:
                self.es = Elasticsearch([{'host': self.host,
                                          'port': self.port}],
                                        sniff_on_start=True,
                                        sniff_on_connection_fail=True,
                                        sniffer_timeout=self.timeout)
            self.idx = IndicesClient(self.es)
            return
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host, self.port)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchError.unknown_exception(backtrace, str(e))

    def _create_index(self, index, doc_type, settings=None, mappings=None):
        """
        create a new empty index
        mandatory args:
            index = index name 
            doc_type = document type, ie. any valid string
            settings = ElasticSearch cluster configuration
            mappings = dict of document fields by type and indexing preference
        """
        if not settings:
            settings = {'index': {'number_of_shards': '1',
                                  'number_of_replicas': '0'}}
        if not mappings:
            mappings = {'properties': {'id': {'type': 'string',
                                              'index': 'not_analyzed'}}}
        try:
            response = self.es.create(index=index,
                                      doc_type=doc_type,
                                      body=dumps(settings))
            self.idx.put_mapping(index=index,
                                 doc_type=doc_type,
                                 body=dumps(mappings))
            if 'created' not in response or not response['created']:
                return ElasticSearchError.unable_to_create_index(index)
            log.info('Index: {} created'.format(index))
            log.info('ES create(): response: {}'.format(response))
            return
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host,
                                                        self.port)
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchError.unknown_exception(backtrace,
                                                        str(e))

    def drop_index(self, index):
        try:
            if index in self.es.indices.stats()['indices'].keys():
                self.es.indices.delete(index=index, ignore=[400, 404])
            log.info('Index: {} deleted'.format(index))
            return
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host,
                                                        self.port)
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchError.unknown_exception(backtrace,
                                                        str(e))

    def add_document(self,
                     index=None,
                     doc_type=None,
                     doc_id=0,
                     settings={},
                     mappings={},
                     values={}):
        """
        add a new document to an existing index
        mandatory args:
            index = index name 
            doc_type = document type, ie. any valid string
            settings = ElasticSearch cluster configuration
            mappings = dict of document fields by type and indexing preference
            values = dictionary of fields and values
        """
        try:
            err_msg = self._connect()
            if err_msg:
                return err_msg
            if index not in self.es.indices.stats()['indices'].keys():
                err_msg = self._create_index(index,
                                             doc_type,
                                             settings,
                                             mappings)
                if err_msg:
                    return err_msg
            response = self.es.create(index=index,
                                      doc_type=doc_type,
                                      id=doc_id,
                                      body=dumps(values))
            log.info('ES create(): response: {}'.format(response))
            return ElasticSearchWrite.object_created(response)
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host,
                                                        self.port)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchWriteError.unknown_exception(doc_id,
                                                             values,
                                                             backtrace,
                                                             str(e))

    def update_document(self,
                        index,
                        doc_type,
                        doc_id,
                        values):
        """
        update an existing document in an existing index
        mandatory args:
            index = index name 
            doc_type = document type, ie. any valid string
            doc_id = document_id
            values = dictionary of fields and values
        """
        try:
            err_msg = self._connect()
            if err_msg:
                return err_msg
            log.info('ES body: {}'.format(values))
            response = self.es.update(index=index,
                                      doc_type=doc_type,
                                      id=doc_id,
                                      body=dumps(values))
            log.info('ES update(): response: {}'.format(response))
            return ElasticSearchWrite.object_updated(response)
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host,
                                                        self.port)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchWriteError.unknown_exception(doc_id,
                                                             values,
                                                             backtrace,
                                                             str(e))

    def find_document(self,
                      index,
                      doc_type,
                      dsl=None,
                      fields=None):
        """
        find an existing document in an existing index
        mandatory args:
            index = index name 
            doc_type = document type, ie. any valid string
            dsl = query parameters in DSL format
            fields = list of fields to return
        """
        try:
            err_msg = self._connect()
            if err_msg:
                return err_msg
            response = self.es.search(index=index,
                                      doc_type=doc_type,
                                      body=dumps(dsl),
                                      _source=fields)
            return ElasticSearchRead.object_found(response)
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host,
                                                        self.port)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchReadError.unknown_exception(dsl,
                                                            fields,
                                                            backtrace,
                                                            str(e))

    def search_documents(self,
                         index,
                         doc_type,
                         dsl,
                         fields=None):
        """
        find an existing document in an existing index
        mandatory args:
            index = index name 
            doc_type = document type, ie. any valid string
            dsl = query parameters in DSL format
            fields = list of fields to return
        """
        try:
            err_msg = self._connect()
            if err_msg:
                return err_msg
            response = self.es.search(index=index,
                                      doc_type=doc_type,
                                      body=dumps(dsl),
                                      _source=fields)
            return ElasticSearchRead.objects_found(response)
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host,
                                                        self.port)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchReadError.unknown_exception(dsl,
                                                            fields,
                                                            backtrace,
                                                            str(e))
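A short usage sketch for ESConnector, assuming a local single-node cluster; the index, document type, and field values below are illustrative assumptions:

# Hypothetical usage; index, doc_type and values are illustrative only.
connector = ESConnector(host="localhost", port=9200, local_env=True)
result = connector.add_document(index="people",
                                doc_type="person",
                                doc_id=1,
                                values={"name": "Ada", "role": "engineer"})
# Methods return wrapper/error objects instead of raising, so callers
# inspect the return value rather than wrapping calls in try/except.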
Example #4
class BaseElasticsearchBackend(Base):
    """Base connection wrapper based on the ElasticSearch official library.

    It uses two entry points to configure the underlying connection:

    * ``transport_class``: the transport class from ``elasticsearch``. By
      default ``elasticsearch.transport.Transport``.
    * ``connection_class``: the connection class used by the transport class.
      It's undefined by default, as it is on the subclasses to provide one.

    If any of these elements is not defined, an ``ImproperlyConfigured`` error
    will be raised when the backend will try to configure the client.
    """
    #: ElasticSearch transport class used by the client class to perform
    #: requests.
    transport_class = Transport
    #: ElasticSearch connection class used by the transport class to perform
    #: requests.
    connection_class = None

    def configure_client(self):
        """Instantiate and configure the ElasticSearch client.

        It simply takes the given HOSTS list and uses PARAMS as the keyword
        arguments of the ElasticSearch class.

        The client's transport_class is given by the class attribute
        ``transport_class``, and the connection class used by the transport
        class is given by the class attribute ``connection_class``.

        An ``ImproperlyConfigured`` exception is raised if any of these
        elements is undefined.
        """
        hosts = self.server['HOSTS']
        params = self.server['PARAMS']

        if not self.transport_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no transport class provided' % self.__class__)

        if not self.connection_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no connection class provided' % self.__class__)

        #pylint: disable=star-args
        self.client = Elasticsearch(hosts,
                                    transport_class=self.transport_class,
                                    connection_class=self.connection_class,
                                    **params)

    # Server methods
    # ==============
    # The underlying client does not require index names to perform
    # server-related queries, such as "ping" or "info". The connection
    # wrapper acts as a proxy for them.

    def ping(self, **kwargs):
        return self.client.ping(**kwargs)

    def info(self, **kwargs):
        return self.client.info(**kwargs)

    def put_script(self, lang, script_id, body, **kwargs):
        return self.client.put_script(lang, script_id, body, **kwargs)

    def get_script(self, lang, script_id, **kwargs):
        return self.client.get_script(lang, script_id, **kwargs)

    def delete_script(self, lang, script_id, **kwargs):
        return self.client.delete_script(lang, script_id, **kwargs)

    def put_template(self, template_id, body, **kwargs):
        return self.client.put_template(template_id, body, **kwargs)

    def get_template(self, template_id, body=None, **kwargs):
        return self.client.get_template(template_id, body, **kwargs)

    def delete_template(self, template_id=None, **kwargs):
        return self.client.delete_template(template_id, **kwargs)

    # Bulk methods
    # ============
    # The underlying client does not require index names, but they can be
    # supplied. Since it can make sense not to target a specific index,
    # developers are free to use these methods as they see fit, as long as
    # they are careful.

    def mget(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mget(body, index, doc_type, **kwargs)

    def bulk(self, body, index=None, doc_type=None, **kwargs):
        return self.client.bulk(body, index, doc_type, **kwargs)

    def msearch(self, body, index=None, doc_type=None, **kwargs):
        return self.client.msearch(body, index, doc_type, **kwargs)

    def mpercolate(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mpercolate(body, index, doc_type, **kwargs)

    # Scroll methods
    # ==============
    # The underlying client does not require an index to perform scroll.

    def scroll(self, scroll_id, **kwargs):
        return self.client.scroll(scroll_id, **kwargs)

    def clear_scroll(self, scroll_id, body=None, **kwargs):
        return self.client.clear_scroll(scroll_id, body, **kwargs)

    # Query methods
    # =============
    # The underlying client requires index names (or alias names) to perform
    # queries. The connection wrapper overrides these client methods to
    # automatically use the configured names (indices and/or aliases).

    def create(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.create(
            self.indices, doc_type, body, doc_id, **kwargs)

    def index(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.index(
            self.indices, doc_type, body, doc_id, **kwargs)

    def exists(self, doc_id, doc_type='_all', **kwargs):
        return self.client.exists(self.indices, doc_id, doc_type, **kwargs)

    def get(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get(self.indices, doc_id, doc_type, **kwargs)

    def get_source(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get_source(self.indices, doc_id, doc_type, **kwargs)

    def update(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.update(
            self.indices, doc_type, doc_id, body, **kwargs)

    def search(self, doc_type=None, body=None, **kwargs):
        return self.client.search(self.indices, doc_type, body, **kwargs)

    def search_shards(self, doc_type=None, **kwargs):
        return self.client.search_shards(self.indices, doc_type, **kwargs)

    def search_template(self, doc_type=None, body=None, **kwargs):
        return self.client.search_template(
            self.indices, doc_type, body, **kwargs)

    def explain(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.explain(
            self.indices, doc_type, doc_id, body, **kwargs)

    def delete(self, doc_type, doc_id, **kwargs):
        return self.client.delete(self.indices, doc_type, doc_id, **kwargs)

    def count(self, doc_type=None, body=None, **kwargs):
        return self.client.count(self.indices, doc_type, body, **kwargs)

    def delete_by_query(self, doc_type=None, body=None, **kwargs):
        return self.client.delete_by_query(
            self.indices, doc_type, body, **kwargs)

    def suggest(self, body, **kwargs):
        return self.client.suggest(body, self.indices, **kwargs)

    def percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.percolate(
            self.indices, doc_type, doc_id, body, **kwargs)

    def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.count_percolate(
            self.indices, doc_type, doc_id, body, **kwargs)

    def mlt(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs)

    def termvector(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.termvector(
            self.indices, doc_type, doc_id, body, **kwargs)

    def mtermvectors(self, doc_type=None, body=None, **kwargs):
        return self.client.mtermvectors(self.indices, doc_type, body, **kwargs)

    def benchmark(self, doc_type=None, body=None, **kwargs):
        return self.client.benchmark(self.indices, doc_type, body, **kwargs)

    def abort_benchmark(self, name=None, **kwargs):
        return self.client.abort_benchmark(name, **kwargs)

    def list_benchmarks(self, doc_type=None, **kwargs):
        return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
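Because ``connection_class`` is left undefined on the base class, a concrete backend must supply one. A minimal sketch of such a subclass is shown below; the class name is an assumption, and it relies on a connection class shipped with pre-8.x elasticsearch-py releases:

# Hypothetical concrete backend; RequestsHttpConnection comes with
# elasticsearch-py releases before 8.x and fills the connection_class slot.
from elasticsearch import RequestsHttpConnection


class RequestsBackend(BaseElasticsearchBackend):
    #: Connection class used by the transport class to perform requests.
    connection_class = RequestsHttpConnection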