Example #1
class ELmonocleDB:

    log = logging.getLogger("monocle.ELmonocleDB")

    def __init__(
        self,
        elastic_conn="localhost:9200",
        index=None,
        timeout=10,
        prefix=CHANGE_PREFIX,
        create=True,
        previous_schema=False,
        idents_config: Optional[IdentsConfig] = None,
        user=None,
        password=None,
        use_ssl=None,
        verify_certs=None,
        ssl_show_warn=None,
    ) -> None:
        host, port = elastic_conn.split(":")
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ip = socket.gethostbyname(host)
        self.log.info("ES IP is %s" % ip)
        self.log.info("ES prefix is %s" % prefix)

        elastic_conn = [
            {
                "host": host,
                "port": port,
            }
        ]

        if use_ssl:
            elastic_conn[0]["use_ssl"] = use_ssl

        if not verify_certs:
            elastic_conn[0]["verify_certs"] = verify_certs

        if not ssl_show_warn:
            elastic_conn[0]["ssl_show_warn"] = ssl_show_warn

        if user and password:
            elastic_conn[0]["http_auth"] = "%s:%s" % (user, password)

        while True:
            try:
                s.connect((ip, int(port)))
                s.shutdown(2)
                s.close()
                break
            except Exception as excpt:
                self.log.info(
                    "Unable to connect to %s: %s. Sleeping for %ds."
                    % (elastic_conn, excpt, timeout)
                )
                time.sleep(timeout)

        self.log.info("Connecting to ES server at %s" % elastic_conn)
        self.es = Elasticsearch(elastic_conn)
        self.log.info(self.es.info())

        if previous_schema:
            self.prefix = PREV_CHANGE_PREFIX
        else:
            self.prefix = prefix

        if not index:
            self.log.info("No index provided")
            return

        self.idents_config = idents_config or []

        self.index = "{}{}".format(self.prefix, index)
        self.log.info("Using ES index %s" % self.index)
        self.mapping = {
            "properties": {
                "id": {"type": "keyword"},
                "type": {"type": "keyword"},
                "number": {"type": "keyword"},
                "change_id": {"type": "keyword"},
                "title": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "text": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "url": {"type": "keyword"},
                "commit_count": {"type": "integer"},
                "additions": {"type": "integer"},
                "deletions": {"type": "integer"},
                "changed_files_count": {"type": "integer"},
                "changed_files": {
                    "properties": {
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "path": {"type": "keyword"},
                    }
                },
                "commits": {
                    "properties": {
                        "sha": {"type": "keyword"},
                        "author": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "committer": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "authored_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "committed_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "title": {"type": "text"},
                    }
                },
                "repository_prefix": {"type": "keyword"},
                "repository_fullname": {"type": "keyword"},
                "repository_shortname": {"type": "keyword"},
                "author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "on_author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "committer": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "merged_by": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "branch": {"type": "keyword"},
                "target_branch": {"type": "keyword"},
                "created_at": {"type": "date", "format": "date_time_no_millis"},
                "on_created_at": {"type": "date", "format": "date_time_no_millis"},
                "merged_at": {"type": "date", "format": "date_time_no_millis"},
                "updated_at": {"type": "date", "format": "date_time_no_millis"},
                "closed_at": {"type": "date", "format": "date_time_no_millis"},
                "state": {"type": "keyword"},
                "duration": {"type": "integer"},
                "mergeable": {"type": "keyword"},
                "labels": {"type": "keyword"},
                "assignees": {
                    "type": "nested",
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    },
                },
                "approval": {"type": "keyword"},
                "draft": {"type": "boolean"},
                "self_merged": {"type": "boolean"},
                "crawler_metadata": {
                    "properties": {
                        "last_commit_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "last_post_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "total_docs_posted": {"type": "integer"},
                        "total_changes_updated": {"type": "integer"},
                        "total_orphans_updated": {"type": "integer"},
                    }
                },
                "tasks_data": {
                    "properties": {
                        "tid": {"type": "keyword"},
                        "ttype": {"type": "keyword"},
                        "crawler_name": {"type": "keyword"},
                        "updated_at": {"type": "date", "format": "date_time_no_millis"},
                        "change_url": {"type": "keyword"},
                        "severity": {"type": "keyword"},
                        "priority": {"type": "keyword"},
                        "score": {"type": "integer"},
                        "url": {"type": "keyword"},
                        "title": {
                            "type": "text",
                            "fields": {
                                "keyword": {"type": "keyword", "ignore_above": 8191}
                            },
                        },
                        "_adopted": {"type": "boolean"},
                    }
                },
            }
        }
        settings = {"mappings": self.mapping}
        self.ic = self.es.indices
        if create:
            self.ic.create(index=self.index, ignore=400, body=settings)
        # The authors_histo query fails in some contexts when the time slice is
        # large: "Must be less than or equal to: [10000] but was [10001]. This limit
        # can be set by changing the [search.max_buckets] cluster level setting."
        # Raising the limit below is an attempt to mitigate that issue.
        cluster_settings = {"transient": {"search.max_buckets": 100000}}
        self.es.cluster.put_settings(body=cluster_settings)

    def update(self, source_it: List[Union[Change, Event]]) -> None:
        def gen(it):
            for _source in it:
                source = change_or_event_to_dict(_source)
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = source["id"]
                d["doc"] = source
                d["doc_as_upsert"] = True
                yield d

        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)

    def update_task_data(
        self,
        source_it: Union[
            List[TaskDataForEL],
            List[OrphanTaskDataForEL],
            List[AdoptedTaskDataForEL],
        ],
    ) -> Optional[BulkIndexError]:
        def gen(it):
            for _source in it:
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = _source._id
                d["doc"] = {}
                d["doc"].update({"id": _source._id})
                if isinstance(_source, TaskDataForEL):
                    d["doc"].update(
                        {"tasks_data": [asdict(td) for td in _source.tasks_data]}
                    )
                if isinstance(_source, OrphanTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                    d["doc"]["type"] = "OrphanTaskData"
                if isinstance(_source, AdoptedTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                d["doc_as_upsert"] = True
                yield d

        ret = None
        try:
            bulk(self.es, gen(source_it))
        except BulkIndexError as err:
            ret = err
        self.es.indices.refresh(index=self.index)
        return ret

    def compute_crawler_id_by_name(self, name, _type):
        return "crawler/%s/%s" % (_type, name)

    def get_task_crawler_metadata(self, name: str) -> Dict:
        try:
            ret = self.es.get(
                self.index, self.compute_crawler_id_by_name(name, "tasks_crawler")
            )
            return ret["_source"]["crawler_metadata"]
        except Exception:
            return {}

    def set_task_crawler_metadata(
        self,
        name: str,
        commit_date: Optional[datetime] = None,
        push_infos: Optional[Dict] = None,
    ):
        metadata = {}
        if commit_date:
            metadata.update({"last_commit_at": commit_date})
        if push_infos:
            prev_metadata = self.get_task_crawler_metadata(name)
            metadata.update(
                {
                    "last_post_at": push_infos["last_post_at"],
                    "total_docs_posted": prev_metadata.get("total_docs_posted", 0)
                    + push_infos["total_docs_posted"],
                    "total_changes_updated": prev_metadata.get(
                        "total_changes_updated", 0
                    )
                    + push_infos["total_changes_updated"],
                    "total_orphans_updated": prev_metadata.get(
                        "total_orphans_updated", 0
                    )
                    + push_infos["total_orphans_updated"],
                }
            )
        body = {
            "doc": {"type": "TaskCrawlerDataCommit", "crawler_metadata": metadata},
            "doc_as_upsert": True,
        }
        ret = None
        try:
            self.es.update(
                self.index,
                self.compute_crawler_id_by_name(name, "tasks_crawler"),
                body=body,
            )
            self.es.indices.refresh(index=self.index)
        except Exception as err:
            ret = err
        return ret

    def delete_index(self):
        self.log.info("Deleting index: %s" % self.index)
        self.ic.delete(index=self.index)

    def delete_repository(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "regexp": {
                            "repository_fullname": {"value": repository_fullname}
                        }
                    }
                }
            }
        }
        params["body"] = body
        self.es.delete_by_query(**params)
        self.es.indices.refresh(index=self.index)

    def get_last_updated(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "sort": [{"updated_at": {"order": "desc"}}],
            "query": {
                "bool": {
                    "filter": [
                        {"term": {"type": "Change"}},
                        {
                            "regexp": {
                                "repository_fullname": {"value": repository_fullname}
                            }
                        },
                    ]
                }
            },
        }
        params["body"] = body
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        ret = [r["_source"] for r in res["hits"]["hits"]]
        if not ret:
            return []
        return ret[0]

    def get_changes_by_url(self, change_urls, size):
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "filter": [
                            {"term": {"type": "Change"}},
                            {"terms": {"url": change_urls}},
                        ]
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_by_change_urls(self, change_urls):
        assert len(change_urls) <= 50
        size = 5000  # Assuming no more than 100 task data items related to a change
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "must_not": {"exists": {"field": "tasks_data._adopted"}},
                        "filter": [
                            {"term": {"type": "OrphanTaskData"}},
                            {"terms": {"tasks_data.change_url": change_urls}},
                        ],
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_and_declare_adpotion(self, changes_url):
        assert len(changes_url) <= 50
        tds = self.get_orphan_tds_by_change_urls(changes_url)
        if tds:
            adopted_tds = [
                AdoptedTaskDataForEL(
                    _id=td["id"],
                    task_data=AdoptedTaskData(_adopted=True),
                )
                for td in tds
            ]
            self.update_task_data(adopted_tds)
        return tds

    def update_changes_with_orphan_tds(self, mapping: Dict[str, str]):
        change_urls = list(mapping.keys())
        while change_urls:
            change_urls_to_process = change_urls[:50]
            change_urls = change_urls[50:]
            tds = self.get_orphan_tds_and_declare_adpotion(change_urls_to_process)
            # Group tds in buckets by change_url
            _map: Dict[str, List] = dict()
            for td in tds:
                _map.setdefault(td["tasks_data"]["change_url"], []).append(
                    td["tasks_data"]
                )
            # Create update docs to attach tds to matching changes
            to_update = []
            for change_url, tds in _map.items():
                to_update.append(
                    TaskDataForEL(
                        _id=mapping[change_url],
                        tasks_data=createELTaskData(tds),
                    )
                )
            self.update_task_data(to_update)

    def run_named_query(self, name, *args, **kwargs):
        if name not in queries.public_queries:
            raise UnknownQueryException("Unknown query: %s" % name)
        return getattr(queries, name)(self.es, self.index, *args, **kwargs)

    def get_indices(self):
        return [
            ind.replace(self.prefix, "")
            for ind in self.es.indices.get(self.prefix + "*")
        ]

    def iter_index(self):
        body = {"query": {"match_all": {}}}
        return scan(self.es, query=body, index=self.index, size=5000)

    def update_idents(self) -> None:

        import json

        bulk_size = 7500

        def get_obj_hash(obj: Dict) -> int:
            obj_json = json.dumps(obj, sort_keys=True)
            return hash(obj_json)

        def update_ident(dict_ident: Dict) -> Dict:
            dict_ident["muid"] = create_muid(dict_ident["uid"], self.idents_config)
            return dict_ident

        def _update_idents(obj: Dict) -> Tuple[Optional[Union[Change, Event]], bool]:

            prev_hash = get_obj_hash(obj)

            if obj["type"] == "Change":
                obj["author"] = update_ident(obj["author"])
                if "committer" in obj:
                    obj["committer"] = update_ident(obj["committer"])
                if "merged_by" in obj:
                    obj["merged_by"] = update_ident(obj["merged_by"])
                if "assignees" in obj:
                    obj["assignees"] = list(map(update_ident, obj["assignees"]))
                if "commits" in obj:
                    for commit in obj["commits"]:
                        commit["author"] = update_ident(commit["author"])
                        commit["committer"] = update_ident(commit["committer"])
            if obj["type"] in get_events_list():
                if "author" in obj:
                    obj["author"] = update_ident(obj["author"])
                if "on_author" in obj:
                    obj["on_author"] = update_ident(obj["on_author"])
            updated = prev_hash != get_obj_hash(obj)
            if updated:
                return dict_to_change_or_event(obj), True
            else:
                return None, False

        def bulk_update(to_update: List) -> List:
            print("Updating %s objects ..." % len(to_update))
            self.update(to_update)
            return []

        to_update = []
        total_read = 0
        for _obj in self.iter_index():
            total_read += 1
            if total_read % bulk_size == 0:
                print("%s objects read from the database" % total_read)
            obj = _obj["_source"]
            obj, updated = _update_idents(obj)
            if updated:
                to_update.append(obj)
            if len(to_update) == bulk_size:
                to_update = bulk_update(to_update)

        bulk_update(to_update)
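
A minimal usage sketch for ELmonocleDB (not part of the original module): it assumes the monocle imports used by the class are in scope, an Elasticsearch server is reachable at localhost:9200, and the index name, repository name and URLs are hypothetical.

# Usage sketch, assumptions as noted above.
db = ELmonocleDB(elastic_conn="localhost:9200", index="demo", create=True)

# `changes` would be Change/Event objects produced by a crawler (hypothetical here):
# db.update(changes)

# Query helpers return plain `_source` dicts, or empty values on error.
last = db.get_last_updated("acme/widgets")
docs = db.get_changes_by_url(["https://example.com/acme/widgets/pull/1"], size=10)
print(last, docs)
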
Example #2
class BaseElasticsearchBackend(Base):
    """Base connection wrapper based on the ElasticSearch official library.

    It uses two entry points to configure the underlying connection:

    * ``transport_class``: the transport class from ``elasticsearch``. By
      default ``elasticsearch.transport.Transport``.
    * ``connection_class``: the connection class used by the transport class.
      It's undefined by default, as it is on the subclasses to provide one.

    If any of these elements is not defined, an ``ImproperlyConfigured`` error
    will be raised when the backend tries to configure the client.
    """
    #: ElasticSearch transport class used by the client class to perform
    #: requests.
    transport_class = Transport
    #: ElasticSearch connection class used by the transport class to perform
    #: requests.
    connection_class = None

    def configure_client(self):
        """Instantiate and configure the ElasticSearch client.

        It simply takes the given HOSTS list and uses PARAMS as the keyword
        arguments of the ElasticSearch class.

        The client's transport_class is given by the class attribute
        ``transport_class``, and the connection class used by the transport
        class is given by the class attribute ``connection_class``.

        An ``ImproperlyConfigured`` exception is raised if any of these
        elements is undefined.
        """
        hosts = self.server['HOSTS']
        params = self.server['PARAMS']

        if not self.transport_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no transport class provided' % self.__class__)

        if not self.connection_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no connection class provided' % self.__class__)

        #pylint: disable=star-args
        self.client = Elasticsearch(hosts,
                                    transport_class=self.transport_class,
                                    connection_class=self.connection_class,
                                    **params)

    # Server methods
    # ==============
    # The underlying client does not require index names to perform
    # server-related queries, such as "ping" or "info". The connection wrapper
    # acts as a proxy for them.

    def ping(self, **kwargs):
        return self.client.ping(**kwargs)

    def info(self, **kwargs):
        return self.client.info(**kwargs)

    def put_script(self, lang, script_id, body, **kwargs):
        return self.client.put_script(lang, script_id, body, **kwargs)

    def get_script(self, lang, script_id, **kwargs):
        return self.client.get_script(lang, script_id, **kwargs)

    def delete_script(self, lang, script_id, **kwargs):
        return self.client.delete_script(lang, script_id, **kwargs)

    def put_template(self, template_id, body, **kwargs):
        return self.client.put_template(template_id, body, **kwargs)

    def get_template(self, template_id, body=None, **kwargs):
        return self.client.get_template(template_id, body, **kwargs)

    def delete_template(self, template_id=None, **kwargs):
        return self.client.delete_template(template_id, **kwargs)

    # Bulk methods
    # ============
    # The underlying client does not require index names, but they can be used.
    # Since it can make sense not to give an index, developers are free to use
    # these methods as they want, as long as they are careful.

    def mget(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mget(body, index, doc_type, **kwargs)

    def bulk(self, body, index=None, doc_type=None, **kwargs):
        return self.client.bulk(body, index, doc_type, **kwargs)

    def msearch(self, body, index=None, doc_type=None, **kwargs):
        return self.client.msearch(body, index, doc_type, **kwargs)

    def mpercolate(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mpercolate(body, index, doc_type, **kwargs)

    # Scroll methods
    # ==============
    # The underlying client does not require an index to perform scroll.

    def scroll(self, scroll_id, **kwargs):
        return self.client.scroll(scroll_id, **kwargs)

    def clear_scroll(self, scroll_id, body=None, **kwargs):
        return self.client.clear_scroll(scroll_id, body, **kwargs)

    # Query methods
    # =============
    # The underlying client requires index names (or alias names) to perform
    # queries. The connection wrapper overrides these client methods to
    # automatically use the configured names (indices and/or aliases).

    def create(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.create(self.indices, doc_type, body, doc_id,
                                  **kwargs)

    def index(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.index(self.indices, doc_type, body, doc_id,
                                 **kwargs)

    def exists(self, doc_id, doc_type='_all', **kwargs):
        return self.client.exists(self.indices, doc_id, doc_type, **kwargs)

    def get(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get(self.indices, doc_id, doc_type, **kwargs)

    def get_source(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get_source(self.indices, doc_id, doc_type, **kwargs)

    def update(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.update(self.indices, doc_type, doc_id, body,
                                  **kwargs)

    def search(self, doc_type=None, body=None, **kwargs):
        return self.client.search(self.indices, doc_type, body, **kwargs)

    def search_shards(self, doc_type=None, **kwargs):
        return self.client.search_shards(self.indices, doc_type, **kwargs)

    def search_template(self, doc_type=None, body=None, **kwargs):
        return self.client.search_template(self.indices, doc_type, body,
                                           **kwargs)

    def explain(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.explain(self.indices, doc_type, doc_id, body,
                                   **kwargs)

    def delete(self, doc_type, doc_id, **kwargs):
        return self.client.delete(self.indices, doc_type, doc_id, **kwargs)

    def count(self, doc_type=None, body=None, **kwargs):
        return self.client.count(self.indices, doc_type, body, **kwargs)

    def delete_by_query(self, doc_type=None, body=None, **kwargs):
        return self.client.delete_by_query(self.indices, doc_type, body,
                                           **kwargs)

    def suggest(self, body, **kwargs):
        return self.client.suggest(body, self.indices, **kwargs)

    def percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.percolate(self.indices, doc_type, doc_id, body,
                                     **kwargs)

    def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.count_percolate(self.indices, doc_type, doc_id,
                                           body, **kwargs)

    def mlt(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs)

    def termvector(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.termvector(self.indices, doc_type, doc_id, body,
                                      **kwargs)

    def mtermvectors(self, doc_type=None, body=None, **kwargs):
        return self.client.mtermvectors(self.indices, doc_type, body, **kwargs)

    def benchmark(self, doc_type=None, body=None, **kwargs):
        return self.client.benchmark(self.indices, doc_type, body, **kwargs)

    def abort_benchmark(self, name=None, **kwargs):
        return self.client.abort_benchmark(name, **kwargs)

    def list_benchmarks(self, doc_type=None, **kwargs):
        return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
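
To make the configuration entry points from the class docstring concrete, here is a hedged sketch of a subclass that supplies a connection class. It assumes an elasticsearch-py release that still ships ``Urllib3HttpConnection`` (pre-8.x); the server settings shown in the comments are hypothetical.

from elasticsearch import Urllib3HttpConnection


class Urllib3ElasticsearchBackend(BaseElasticsearchBackend):
    # Subclasses only have to name the connection class; transport_class is
    # inherited from the base wrapper.
    connection_class = Urllib3HttpConnection


# With hypothetical settings such as
#   server = {'HOSTS': ['localhost:9200'], 'PARAMS': {'timeout': 30}}
# configure_client() would build the Elasticsearch client from HOSTS, pass
# PARAMS as keyword arguments, and wire in the transport/connection classes.
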
Example #3
class BaseElasticsearchBackend(Base):
    """Base connection wrapper based on the ElasticSearch official library.

    It uses two entry points to configure the underlying connection:

    * ``transport_class``: the transport class from ``elasticsearch``. By
      default ``elasticsearch.transport.Transport``.
    * ``connection_class``: the connection class used by the transport class.
      It's undefined by default, as it is on the subclasses to provide one.

    If any of these elements is not defined, an ``ImproperlyConfigured`` error
    will be raised when the backend tries to configure the client.
    """
    #: ElasticSearch transport class used by the client class to perform
    #: requests.
    transport_class = Transport
    #: ElasticSearch connection class used by the transport class to perform
    #: requests.
    connection_class = None

    def configure_client(self):
        """Instantiate and configure the ElasticSearch client.

        It simply takes the given HOSTS list and uses PARAMS as the keyword
        arguments of the ElasticSearch class.

        The client's transport_class is given by the class attribute
        ``transport_class``, and the connection class used by the transport
        class is given by the class attribute ``connection_class``.

        An ``ImproperlyConfigured`` exception is raised if any of these
        elements is undefined.
        """
        hosts = self.server['HOSTS']
        params = self.server['PARAMS']

        if not self.transport_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no transport class provided' % self.__class__)

        if not self.connection_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no connection class provided' % self.__class__)

        #pylint: disable=star-args
        self.client = Elasticsearch(hosts,
                                    transport_class=self.transport_class,
                                    connection_class=self.connection_class,
                                    **params)

    # Server methods
    # ==============
    # The underlying client does not require index names to perform
    # server-related queries, such as "ping" or "info". The connection wrapper
    # acts as a proxy for them.

    def ping(self, **kwargs):
        return self.client.ping(**kwargs)

    def info(self, **kwargs):
        return self.client.info(**kwargs)

    def put_script(self, lang, script_id, body, **kwargs):
        return self.client.put_script(lang, script_id, body, **kwargs)

    def get_script(self, lang, script_id, **kwargs):
        return self.client.get_script(lang, script_id, **kwargs)

    def delete_script(self, lang, script_id, **kwargs):
        return self.client.delete_script(lang, script_id, **kwargs)

    def put_template(self, template_id, body, **kwargs):
        return self.client.put_template(template_id, body, **kwargs)

    def get_template(self, template_id, body=None, **kwargs):
        return self.client.get_template(template_id, body, **kwargs)

    def delete_template(self, template_id=None, **kwargs):
        return self.client.delete_template(template_id, **kwargs)

    # Bulk methods
    # ============
    # The underlying client does not require index names, but they can be used.
    # Since it can make sense not to give an index, developers are free to use
    # these methods as they want, as long as they are careful.

    def mget(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mget(body, index, doc_type, **kwargs)

    def bulk(self, body, index=None, doc_type=None, **kwargs):
        return self.client.bulk(body, index, doc_type, **kwargs)

    def msearch(self, body, index=None, doc_type=None, **kwargs):
        return self.client.msearch(body, index, doc_type, **kwargs)

    def mpercolate(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mpercolate(body, index, doc_type, **kwargs)

    # Scroll methods
    # ==============
    # The underlying client does not require an index to perform scroll.

    def scroll(self, scroll_id, **kwargs):
        return self.client.scroll(scroll_id, **kwargs)

    def clear_scroll(self, scroll_id, body=None, **kwargs):
        return self.client.clear_scroll(scroll_id, body, **kwargs)

    # Query methods
    # =============
    # The underlying client requires index names (or alias names) to perform
    # queries. The connection wrapper overrides these client methods to
    # automatically use the configured names (indices and/or aliases).

    def create(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.create(
            self.indices, doc_type, body, doc_id, **kwargs)

    def index(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.index(
            self.indices, doc_type, body, doc_id, **kwargs)

    def exists(self, doc_id, doc_type='_all', **kwargs):
        return self.client.exists(self.indices, doc_id, doc_type, **kwargs)

    def get(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get(self.indices, doc_id, doc_type, **kwargs)

    def get_source(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get_source(self.indices, doc_id, doc_type, **kwargs)

    def update(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.update(
            self.indices, doc_type, doc_id, body, **kwargs)

    def search(self, doc_type=None, body=None, **kwargs):
        return self.client.search(self.indices, doc_type, body, **kwargs)

    def search_shards(self, doc_type=None, **kwargs):
        return self.client.search_shards(self.indices, doc_type, **kwargs)

    def search_template(self, doc_type=None, body=None, **kwargs):
        return self.client.search_template(
            self.indices, doc_type, body, **kwargs)

    def explain(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.explain(
            self.indices, doc_type, doc_id, body, **kwargs)

    def delete(self, doc_type, doc_id, **kwargs):
        return self.client.delete(self.indices, doc_type, doc_id, **kwargs)

    def count(self, doc_type=None, body=None, **kwargs):
        return self.client.count(self.indices, doc_type, body, **kwargs)

    def delete_by_query(self, doc_type=None, body=None, **kwargs):
        return self.client.delete_by_query(
            self.indices, doc_type, body, **kwargs)

    def suggest(self, body, **kwargs):
        return self.client.suggest(body, self.indices, **kwargs)

    def percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.percolate(
            self.indices, doc_type, doc_id, body, **kwargs)

    def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.count_percolate(
            self.indices, doc_type, doc_id, body, **kwargs)

    def mlt(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs)

    def termvector(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.termvector(
            self.indices, doc_type, doc_id, body, **kwargs)

    def mtermvectors(self, doc_type=None, body=None, **kwargs):
        return self.client.mtermvectors(self.indices, doc_type, body, **kwargs)

    def benchmark(self, doc_type=None, body=None, **kwargs):
        return self.client.benchmark(self.indices, doc_type, body, **kwargs)

    def abort_benchmark(self, name=None, **kwargs):
        return self.client.abort_benchmark(name, **kwargs)

    def list_benchmarks(self, doc_type=None, **kwargs):
        return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
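
The query methods above always forward the configured ``self.indices`` to the client, so call sites never name an index explicitly. A hedged usage sketch (the backend instance, index configuration, document type and id are hypothetical, and the backend is assumed to be already configured):

# `backend` is a configured subclass instance whose indices resolve to ['blog'].
doc = backend.get(doc_id='42', doc_type='post')
hits = backend.search(doc_type='post', body={'query': {'match_all': {}}})
count = backend.count(doc_type='post')
# Each call is forwarded as client.method(self.indices, ...), so changing the
# configured indices or aliases redirects every query without touching callers.
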
Example #4
class ELmonocleDB:

    log = logging.getLogger("monocle.ELmonocleDB")

    def __init__(
        self,
        elastic_conn="localhost:9200",
        index=None,
        timeout=10,
        prefix=CHANGE_PREFIX,
        create=True,
        previous_schema=False,
        idents_config: Optional[IdentsConfig] = None,
        user=None,
        password=None,
        use_ssl=None,
        verify_certs=None,
        ssl_show_warn=None,
    ) -> None:
        host, port = elastic_conn.split(":")
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ip = socket.gethostbyname(host)
        self.log.info("ES IP is %s" % ip)
        self.log.info("ES prefix is %s" % prefix)

        elastic_conn = [{
            "host": host,
            "port": port,
        }]

        if use_ssl:
            elastic_conn[0]["use_ssl"] = use_ssl

        if not verify_certs:
            elastic_conn[0]["verify_certs"] = verify_certs

        if not ssl_show_warn:
            elastic_conn[0]["ssl_show_warn"] = ssl_show_warn

        if user and password:
            elastic_conn[0]["http_auth"] = "%s:%s" % (user, password)

        while True:
            try:
                s.connect((ip, int(port)))
                s.shutdown(2)
                s.close()
                break
            except Exception as excpt:
                self.log.info(
                    "Unable to connect to %s: %s. Sleeping for %ds." %
                    (elastic_conn, excpt, timeout))
                time.sleep(timeout)

        self.log.info("Connecting to ES server at %s" % elastic_conn)
        self.es = Elasticsearch(elastic_conn)
        self.log.info(self.es.info())

        if previous_schema:
            self.prefix = PREV_CHANGE_PREFIX
        else:
            self.prefix = prefix

        if not index:
            self.log.info("No index provided")
            return

        self.idents_config = idents_config or []

        self.index = "{}{}".format(self.prefix, index)
        self.log.info("Using ES index %s" % self.index)
        self.mapping = {
            "properties": {
                "id": {
                    "type": "keyword"
                },
                "type": {
                    "type": "keyword"
                },
                "number": {
                    "type": "keyword"
                },
                "change_id": {
                    "type": "keyword"
                },
                "title": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 8191
                        }
                    },
                },
                "text": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 8191
                        }
                    },
                },
                "url": {
                    "type": "keyword"
                },
                "commit_count": {
                    "type": "integer"
                },
                "additions": {
                    "type": "integer"
                },
                "deletions": {
                    "type": "integer"
                },
                "changed_files_count": {
                    "type": "integer"
                },
                "changed_files": {
                    "properties": {
                        "additions": {
                            "type": "integer"
                        },
                        "deletions": {
                            "type": "integer"
                        },
                        "path": {
                            "type": "keyword"
                        },
                    }
                },
                "commits": {
                    "properties": {
                        "sha": {
                            "type": "keyword"
                        },
                        "author": {
                            "properties": {
                                "uid": {
                                    "type": "keyword"
                                },
                                "muid": {
                                    "type": "keyword"
                                },
                            }
                        },
                        "committer": {
                            "properties": {
                                "uid": {
                                    "type": "keyword"
                                },
                                "muid": {
                                    "type": "keyword"
                                },
                            }
                        },
                        "authored_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "committed_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "additions": {
                            "type": "integer"
                        },
                        "deletions": {
                            "type": "integer"
                        },
                        "title": {
                            "type": "text"
                        },
                    }
                },
                "repository_prefix": {
                    "type": "keyword"
                },
                "repository_fullname": {
                    "type": "keyword"
                },
                "repository_shortname": {
                    "type": "keyword"
                },
                "author": {
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    }
                },
                "on_author": {
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    }
                },
                "committer": {
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    }
                },
                "merged_by": {
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    }
                },
                "branch": {
                    "type": "keyword"
                },
                "target_branch": {
                    "type": "keyword"
                },
                "created_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "on_created_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "merged_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "updated_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "closed_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "state": {
                    "type": "keyword"
                },
                "duration": {
                    "type": "integer"
                },
                "mergeable": {
                    "type": "keyword"
                },
                "labels": {
                    "type": "keyword"
                },
                "assignees": {
                    "type": "nested",
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    },
                },
                "approval": {
                    "type": "keyword"
                },
                "draft": {
                    "type": "boolean"
                },
                "self_merged": {
                    "type": "boolean"
                },
            }
        }
        settings = {"mappings": self.mapping}
        self.ic = self.es.indices
        if create:
            self.ic.create(index=self.index, ignore=400, body=settings)
        # The authors_histo query fails in some contexts when the time slice is
        # large: "Must be less than or equal to: [10000] but was [10001]. This limit
        # can be set by changing the [search.max_buckets] cluster level setting."
        # Raising the limit below is an attempt to mitigate that issue.
        cluster_settings = {"transient": {"search.max_buckets": 100000}}
        self.es.cluster.put_settings(body=cluster_settings)

    def update(self, source_it: List[Union[Change, Event]]) -> None:
        def gen(it):
            for _source in it:
                source = change_or_event_to_dict(_source)
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = source["id"]
                d["doc"] = source
                d["doc_as_upsert"] = True
                yield d

        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)

    def delete_index(self):
        self.log.info("Deleting index: %s" % self.index)
        self.ic.delete(index=self.index)

    def delete_repository(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "regexp": {
                            "repository_fullname": {
                                "value": repository_fullname
                            }
                        }
                    }
                }
            }
        }
        params["body"] = body
        self.es.delete_by_query(**params)
        self.es.indices.refresh(index=self.index)

    def get_last_updated(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "sort": [{
                "updated_at": {
                    "order": "desc"
                }
            }],
            "query": {
                "bool": {
                    "filter": [
                        {
                            "term": {
                                "type": "Change"
                            }
                        },
                        {
                            "regexp": {
                                "repository_fullname": {
                                    "value": repository_fullname
                                }
                            }
                        },
                    ]
                }
            },
        }
        params["body"] = body
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        ret = [r["_source"] for r in res["hits"]["hits"]]
        if not ret:
            return []
        return ret[0]

    def run_named_query(self, name, *args, **kwargs):
        if name not in queries.public_queries:
            raise UnknownQueryException("Unknown query: %s" % name)
        return getattr(queries, name)(self.es, self.index, *args, **kwargs)

    def get_indices(self):
        return [
            ind.replace(self.prefix, "")
            for ind in self.es.indices.get(self.prefix + "*")
        ]

    def iter_index(self):
        body = {"query": {"match_all": {}}}
        return scan(self.es, query=body, index=self.index, size=5000)

    def update_idents(self) -> None:

        import json

        bulk_size = 7500

        def get_obj_hash(obj: Dict) -> int:
            obj_json = json.dumps(obj, sort_keys=True)
            return hash(obj_json)

        def update_ident(dict_ident: Dict) -> Dict:
            dict_ident["muid"] = create_muid(dict_ident["uid"],
                                             self.idents_config)
            return dict_ident

        def _update_idents(
                obj: Dict) -> Tuple[Optional[Union[Change, Event]], bool]:

            prev_hash = get_obj_hash(obj)

            if obj["type"] == "Change":
                obj["author"] = update_ident(obj["author"])
                if "committer" in obj:
                    obj["committer"] = update_ident(obj["committer"])
                if "merged_by" in obj:
                    obj["merged_by"] = update_ident(obj["merged_by"])
                if "assignees" in obj:
                    obj["assignees"] = list(map(update_ident,
                                                obj["assignees"]))
                if "commits" in obj:
                    for commit in obj["commits"]:
                        commit["author"] = update_ident(commit["author"])
                        commit["committer"] = update_ident(commit["committer"])
            else:
                if "author" in obj:
                    obj["author"] = update_ident(obj["author"])
                if "on_author" in obj:
                    obj["on_author"] = update_ident(obj["on_author"])
            updated = prev_hash != get_obj_hash(obj)
            if updated:
                return dict_to_change_or_event(obj), True
            else:
                return None, False

        def bulk_update(to_update: List) -> List:
            print("Updating %s objects ..." % len(to_update))
            self.update(to_update)
            return []

        to_update = []
        total_read = 0
        for _obj in self.iter_index():
            total_read += 1
            if total_read % bulk_size == 0:
                print("%s objects read from the database" % total_read)
            obj = _obj["_source"]
            obj, updated = _update_idents(obj)
            if updated:
                to_update.append(obj)
            if len(to_update) == bulk_size:
                to_update = bulk_update(to_update)

        bulk_update(to_update)
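
The batching pattern used by update_idents above (hash the serialized object before and after mutation, queue only the objects that actually changed, and flush every bulk_size items) can be illustrated on its own. A minimal standalone sketch of that pattern, independent of Elasticsearch and using hypothetical data and a hypothetical mutation:

import json
from typing import Dict, List


def obj_hash(obj: Dict) -> int:
    # Stable hash of the JSON form, used to detect whether a mutation changed anything.
    return hash(json.dumps(obj, sort_keys=True))


def flush(batch: List[Dict]) -> List[Dict]:
    print("Updating %s objects ..." % len(batch))  # stand-in for a bulk write
    return []


BULK_SIZE = 3  # small value for illustration; the class above uses 7500


def process(objs: List[Dict]) -> None:
    batch: List[Dict] = []
    for obj in objs:
        before = obj_hash(obj)
        obj["muid"] = obj.get("uid", "").lower()  # hypothetical mutation
        if obj_hash(obj) != before:  # only queue objects that really changed
            batch.append(obj)
        if len(batch) == BULK_SIZE:
            batch = flush(batch)
    flush(batch)  # flush the remainder


process([{"uid": "Alice"}, {"uid": "bob"}, {"uid": "Carol"}, {"uid": "dave"}])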