def _update_idents(obj: Dict) -> Tuple[Optional[Union[Change, Event]], bool]:
    """Refresh the identities stored in *obj* and report whether it changed.

    Every identity field found on the object is passed through
    ``update_ident`` and written back in place. The object hash is taken
    before and after the refresh to detect a change.

    Returns:
        ``(dict_to_change_or_event(obj), True)`` when the hash differs after
        the refresh, ``(None, False)`` otherwise.
    """
    hash_before = get_obj_hash(obj)

    if obj["type"] == "Change":
        # "author" is read unconditionally on a Change; the others are optional.
        obj["author"] = update_ident(obj["author"])
        for field in ("committer", "merged_by"):
            if field in obj:
                obj[field] = update_ident(obj[field])
        if "assignees" in obj:
            obj["assignees"] = [
                update_ident(assignee) for assignee in obj["assignees"]
            ]
        if "commits" in obj:
            for commit in obj["commits"]:
                commit["author"] = update_ident(commit["author"])
                commit["committer"] = update_ident(commit["committer"])

    if obj["type"] in get_events_list():
        for field in ("author", "on_author"):
            if field in obj:
                obj[field] = update_ident(obj[field])

    if hash_before != get_obj_hash(obj):
        return dict_to_change_or_event(obj), True
    return None, False
def test_get_change_events_by_url(self):
    """Looking up two change URLs yields nine objects, all of them events."""
    urls = [
        "https://tests.com/unit/repo1/pull/1",
        "https://tests.com/unit/repo2/pull/2",
    ]
    objs = self.db.get_change_events_by_url(urls)

    total_match = len(objs)
    self.assertEqual(9, total_match)

    # Every returned object must carry an event type.
    event_count = sum(1 for o in objs if o["type"] in get_events_list())
    self.assertEqual(total_match, event_count)
def dict_to_change_or_event(d: Dict) -> Union[Change, Event]:
    """Build a ``Change`` or ``Event`` dataclass instance from a DB item dict.

    The ``id`` and ``type`` keys are renamed to ``_id`` and ``_type`` before
    the dict is handed to ``from_dict``. The renaming is done on a shallow
    copy, so the caller's dict is left untouched and can be converted again
    or inspected after a failure.

    Args:
        d: the item as a dict; must contain the ``id`` and ``type`` keys.

    Returns:
        A ``Change`` when ``d["type"]`` is ``"Change"``, an ``Event`` when it
        is one of ``get_events_list()``.

    Raises:
        KeyError: when ``type`` or ``id`` is missing from *d*.
        Exception: when ``d["type"]`` is neither ``"Change"`` nor a known
            event type.
    """
    _type = d["type"]
    # Work on a copy: renaming keys on the input would corrupt the caller's
    # dict and make a second conversion of the same dict raise KeyError.
    data = dict(d)
    for old_key, new_key in (("id", "_id"), ("type", "_type")):
        data[new_key] = data.pop(old_key)
    if _type == "Change":
        return from_dict(data_class=Change, data=data)
    if _type in get_events_list():
        return from_dict(data_class=Event, data=data)
    raise Exception("Unknown DB item id: %s" % _type)
def changes_and_events(es, index, repository_fullname, params):
    """Query Changes and events together, oldest ``created_at`` first.

    *params* is deep-copied before its ``etype`` entry is overwritten with
    ``"Change"`` plus every event type, so the caller's dict is not modified.
    ``params["size"]`` and ``params["from"]`` set the page returned.

    Returns:
        A dict with ``items`` (the hit sources passed through
        ``enhance_changes``) and ``total`` (``totalc`` of the hits total).
    """
    query_params = deepcopy(params)
    etypes = ["Change"]
    etypes.extend(get_events_list())
    query_params["etype"] = etypes

    page_size = query_params["size"]
    offset = query_params["from"]
    body = {
        "sort": [{"created_at": {"order": "asc"}}],
        "size": page_size,
        "from": offset,
        "query": generate_filter(es, index, repository_fullname, query_params),
    }

    response = run_query(es, index, body)
    sources = [hit["_source"] for hit in response["hits"]["hits"]]
    items = enhance_changes(sources)
    return {"items": items, "total": totalc(response["hits"]["total"])}
def string_ident_to_ident(elastic_conn, index) -> None:
    """Rewrite the string identities of every Change and event as ident dicts.

    Objects are read through a client opened with ``previous_schema=True``
    and written back, in batches of ``bulk_size``, through a second client
    opened without that flag. Each Change or event gets:

    - its identity fields (``author``, ``committer``, ``merged_by``,
      ``assignees``, the commits' ``author``/``committer``, ``on_author``)
      replaced by ``{"uid": ..., "muid": ...}`` dicts, or ``None`` when the
      stored value is empty;
    - a stripped ``url``; an event stored without ``url`` takes the url of
      the Change sharing its ``change_id``, or ``https://undefined`` when no
      such Change is found;
    - for a ``ChangeCommitPushedEvent`` whose ``created_at`` is ``None``,
      ``created_at`` copied from ``on_created_at``;
    - a truthy ``approval`` normalized to a list without ``None`` entries.

    Objects of any other type are skipped. Progress is reported with
    ``print``.
    """
    bulk_size = 7500
    # Reads go through the previous-schema client, writes through client2.
    client = ELmonocleDB(elastic_conn, index, previous_schema=True)
    client2 = ELmonocleDB(elastic_conn, index)
    # change_id -> stripped url of the matching Change, filled on demand.
    changes_url_lookup: Dict[str, str] = {}
    to_update: List = []
    need_url_update: List[Dict] = []
    total_objects_updated = 0

    def bulk_update(to_update: List) -> None:
        client2.update(to_update)

    def update_changes_url_lookup(objs: List[Dict]) -> None:
        # Only ask for the change ids that are not in the lookup yet.
        change_ids = [
            _id
            for _id in {o["change_id"] for o in objs}
            if _id not in changes_url_lookup
        ]
        print("Updating change_url_lookup for %s changes ..." % len(change_ids))
        params = {"change_ids": change_ids, "size": 10000, "from": 0}
        result = client.run_named_query("changes", ".*", params=params)
        for change in result["items"]:
            changes_url_lookup[change["change_id"]] = utils.strip_url(
                change["url"]
            )
        print("%s entries in changes_url_lookup" % len(changes_url_lookup))

    def update_approval_type(approval):
        # A single approval stored as a string becomes a one-item list.
        ret = [approval] if isinstance(approval, str) else approval
        return [r for r in ret if r is not None]

    def create_ident_dict(url: str, uid: str) -> Dict:
        domain = urlparse(url).netloc
        uid = prefix(domain, uid)
        return {
            "uid": uid,
            "muid": create_muid_from_uid(uid),
        }

    def update_obj(obj: Dict) -> Dict:
        url = utils.strip_url(obj["url"])

        def to_ident(value: Optional[str]) -> Optional[Dict]:
            if value:
                return create_ident_dict(url, value)
            return None

        if obj["type"] == "Change":
            obj["author"] = to_ident(obj["author"])
            obj["committer"] = to_ident(obj.get("committer"))
            obj["merged_by"] = to_ident(obj.get("merged_by"))
            # "or []" also covers a key that is present but set to None.
            obj["assignees"] = [
                to_ident(assignee) for assignee in obj.get("assignees") or []
            ]
            for commit in obj.get("commits") or []:
                # A commit without author inherits the Change author
                # (already converted above).
                if "author" not in commit:
                    commit["author"] = obj["author"]
                else:
                    commit["author"] = to_ident(commit["author"])
                # A commit without committer falls back on its own author.
                if "committer" not in commit:
                    commit["committer"] = commit["author"]
                else:
                    commit["committer"] = to_ident(commit["committer"])
        else:
            obj["author"] = to_ident(obj.get("author"))
            obj["on_author"] = to_ident(obj.get("on_author"))
        # Also fix missing created_at date on ChangeCommitPushedEvent
        if (
            obj["type"] == "ChangeCommitPushedEvent"
            and obj["created_at"] is None
        ):
            obj["created_at"] = obj["on_created_at"]
        # Also fix approval format if needed
        if obj.get("approval"):
            obj["approval"] = update_approval_type(obj["approval"])
        # Ensure we have the stripped url
        obj["url"] = url
        return obj

    def proceed(batch: List[Dict], missing_url: List[Dict]) -> None:
        if missing_url:
            update_changes_url_lookup(missing_url)
        # Every object of missing_url is also part of batch (same dict
        # objects), so walking missing_url directly avoids testing each
        # batch object for membership in a list of dicts (quadratic).
        for o in missing_url:
            if o["change_id"] in changes_url_lookup:
                o["url"] = changes_url_lookup[o["change_id"]]
            else:
                print("Warning - unable to find change %s" % o["change_id"])
                o["url"] = "https://undefined"
        updated = list(map(update_obj, batch))
        print("Updating %s objects ..." % len(batch))
        bulk_update(list(map(dict_to_change_or_event, updated)))

    # Resolved once instead of twice per indexed object.
    event_types = utils.get_events_list()
    handled_types = event_types + ["Change"]

    for _obj in client.iter_index():
        obj = _obj["_source"]
        obj_type = obj["type"]
        if obj_type in event_types and "url" not in obj:
            need_url_update.append(obj)
        if obj_type in handled_types:
            to_update.append(obj)
            if len(to_update) == bulk_size:
                proceed(to_update, need_url_update)
                total_objects_updated += len(to_update)
                print("Total objects updated: %s" % total_objects_updated)
                need_url_update = []
                to_update = []

    # Flush the last, possibly partial, batch.
    proceed(to_update, need_url_update)
    total_objects_updated += len(to_update)
    print("Total objects updated: %s" % total_objects_updated)