def self_merge(elastic_conn, index) -> None:
    to_update = []
    bulk_size = 500

    def update_change(change):
        if "self_merged" not in change.keys():
            if "merged_by" not in change:
                # Here we fix the missing field that can happen with the Gerrit crawler
                change["merged_by"] = None
            if change["merged_by"]:
                change["self_merged"] = change["merged_by"] == change["author"]
            else:
                change["self_merged"] = None
            return True

    client = ELmonocleDB(elastic_conn, index)
    for _obj in client.iter_index():
        obj = _obj["_source"]
        if obj["type"] == "Change":
            updated = update_change(obj)
            if updated:
                to_update.append(dict_to_change_or_event(obj))
        if len(to_update) == bulk_size:
            print("Updating %s changes ..." % bulk_size)
            client.update(to_update)
            to_update = []
    # Flush the last, possibly partial, batch
    if to_update:
        print("Updating %s changes ..." % len(to_update))
        client.update(to_update)
def self_merge(elastic_conn, index):
    to_update = []
    bulk_size = 500

    def update_change(change):
        if 'self_merged' not in change.keys():
            if 'merged_by' not in change:
                # Here we fix the missing field that can happen with the Gerrit crawler
                change['merged_by'] = None
            if change['merged_by']:
                change['self_merged'] = change['merged_by'] == change['author']
            else:
                change['self_merged'] = None
            return True

    client = ELmonocleDB(elastic_conn, index)
    for _obj in client.iter_index():
        obj = _obj['_source']
        if obj['type'] == 'Change':
            updated = update_change(obj)
            if updated:
                to_update.append(obj)
        if len(to_update) == bulk_size:
            print("Updating %s changes ..." % bulk_size)
            client.update(to_update)
            to_update = []
    # Flush the last, possibly partial, batch
    if to_update:
        print("Updating %s changes ..." % len(to_update))
        client.update(to_update)
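# Illustration only (not part of the migrations above): the self_merged rule applied
# to a hypothetical change dict, assuming the same field layout used by update_change().
sample_change = {"author": "alice", "merged_by": "alice"}
sample_change["self_merged"] = (
    sample_change["merged_by"] == sample_change["author"]
    if sample_change["merged_by"]
    else None
)
assert sample_change["self_merged"] is True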
def __init__(self, args, elastic_conn='localhost:9200', elastic_timeout=10):
    super().__init__()
    self.updated_since = args.updated_since
    self.dump_dir = DUMP_DIR if os.path.isdir(DUMP_DIR) else None
    self.loop_delay = int(args.loop_delay)
    self.db = ELmonocleDB(
        elastic_conn=elastic_conn, index=args.index, timeout=elastic_timeout
    )
    if args.command == 'github_crawler':
        if args.repository:
            self.repository_el_re = "%s/%s" % (
                args.org.lstrip('^'),
                args.repository.lstrip('^'),
            )
        else:
            self.repository_el_re = args.org.lstrip('^') + '/.*'
        self.prf = pullrequest.PRsFetcher(
            GithubGraphQLQuery(args.token), args.base_url, args.org, args.repository
        )
    elif args.command == 'gerrit_crawler':
        self.repository_el_re = args.repository.lstrip('^')
        self.prf = review.ReviewesFetcher(args.base_url, args.repository)
class MonocleCrawler():

    log = logging.getLogger("monocle.Crawler")

    def __init__(self, args):
        self.updated_since = args.updated_since
        self.loop_delay = int(args.loop_delay)
        self.get_one = getattr(args, 'id', None)
        self.db = ELmonocleDB()
        if args.command == 'github_crawler':
            self.get_one_rep = getattr(args, 'repository', None)
            self.org = args.org
            self.repository_el_re = args.org.lstrip('^') + '.*'
            self.prf = pullrequest.PRsFetcher(
                GithubGraphQLQuery(args.token),
                args.host, args.org)
        elif args.command == 'gerrit_crawler':
            self.repository_el_re = args.repository.lstrip('^')
            self.prf = review.ReviewesFetcher(
                args.host, args.repository)

    def get_last_updated_date(self):
        change = self.db.get_last_updated(self.repository_el_re)
        if not change:
            return (
                self.updated_since or
                datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ"))
        else:
            logging.info(
                "Most recent change date in the database for %s is %s" % (
                    self.repository_el_re, change['updated_at']))
            return change['updated_at']

    def run_step(self):
        updated_since = self.get_last_updated_date()
        prs = self.prf.get(updated_since)
        objects = self.prf.extract_objects(prs)
        if objects:
            self.log.info("%s objects will be updated in the database" % len(
                objects))
            self.db.update(objects)

    def run(self):
        if self.get_one:
            if not self.get_one_rep:
                print("The --repository argument must be given")
            else:
                pprint(self.prf.get_one(
                    self.org, self.get_one_rep, self.get_one))
        else:
            while True:
                self.run_step()
                self.log.info("Waiting %s seconds before next fetch ..." % (
                    self.loop_delay))
                sleep(self.loop_delay)
def do_query(index, repository_fullname, args, name):
    params = utils.set_params(args)
    db = ELmonocleDB(
        elastic_conn=os.getenv('ELASTIC_CONN', 'localhost:9200'),
        index=index,
        prefix=CHANGE_PREFIX,
        create=False,
    )
    try:
        result = db.run_named_query(name, repository_fullname, params)
    except InvalidIndexError:
        return 'Invalid index: %s' % request.args.get('index'), 404
    return jsonify(result)
def __init__(self, args):
    self.updated_since = args.updated_since
    self.loop_delay = int(args.loop_delay)
    self.get_one = getattr(args, 'id', None)
    self.db = ELmonocleDB()
    if args.command == 'github_crawler':
        self.get_one_rep = getattr(args, 'repository', None)
        self.org = args.org
        self.repository_el_re = args.org.lstrip('^') + '.*'
        self.prf = pullrequest.PRsFetcher(
            GithubGraphQLQuery(args.token),
            args.host, args.org)
    elif args.command == 'gerrit_crawler':
        self.repository_el_re = args.repository.lstrip('^')
        self.prf = review.ReviewesFetcher(
            args.host, args.repository)
def indices():
    db = ELmonocleDB(
        elastic_conn=os.getenv('ELASTIC_CONN', 'localhost:9200'),
        create=False,
        prefix=CHANGE_PREFIX,
    )
    _indices = db.get_indices()
    indices = []
    for indice in _indices:
        if config.is_public_index(indexes_acl, indice):
            indices.append(indice)
        else:
            user = session.get('username')
            if user:
                if user in config.get_authorized_users(indexes_acl, indice):
                    indices.append(indice)
    return jsonify(indices)
def do_query(index, repository_fullname, args, name):
    params = utils.set_params(args)
    db = ELmonocleDB(
        elastic_conn=os.getenv("ELASTIC_CONN", "localhost:9200"),
        index=index,
        prefix=CHANGE_PREFIX,
        create=False,
        user=os.getenv("ELASTIC_USER", None),
        password=os.getenv("ELASTIC_PASSWORD", None),
        use_ssl=os.getenv("ELASTIC_USE_SSL", None),
        verify_certs=os.getenv("ELASTIC_INSECURE", None),
        ssl_show_warn=os.getenv("ELASTIC_SSL_SHOW_WARN", None),
    )
    try:
        result = db.run_named_query(name, repository_fullname, params)
    except InvalidIndexError:
        return "Invalid index: %s" % request.args.get("index"), 404
    return jsonify(result)
def get_db_cnx(index: str, prefix: str) -> ELmonocleDB:
    return ELmonocleDB(
        index=index,
        prefix=prefix,
        user=os.getenv("ELASTIC_USER", None),
        password=os.getenv("ELASTIC_PASSWORD", None),
        use_ssl=os.getenv("ELASTIC_USE_SSL", False),
        verify_certs=os.getenv("ELASTIC_INSECURE", None),
        ssl_show_warn=os.getenv("ELASTIC_SSL_SHOW_WARN", None),
    )
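# Minimal usage sketch for the helper above. The index name, prefix, and credentials
# are hypothetical examples; the ELASTIC_* environment variables and the
# run_named_query() call mirror how they appear elsewhere in this code.
import os

os.environ.setdefault("ELASTIC_USER", "monocle")      # hypothetical credential
os.environ.setdefault("ELASTIC_PASSWORD", "secret")   # hypothetical credential

db = get_db_cnx(index="example-index", prefix="monocle.")  # hypothetical values
result = db.run_named_query("changes", ".*", params={"size": 10, "from": 0})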
def setUpClass(cls):
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - " + "%(levelname)s - %(message)s",
    )
    log = logging.getLogger(__name__)
    # log to stderr
    log.addHandler(logging.StreamHandler())
    cls.eldb = ELmonocleDB(index=cls.index, prefix="monocle.test.")
    for dataset in cls.datasets:
        index_dataset(cls.eldb, dataset)
def create_db_connection(index: Optional[str]) -> ELmonocleDB:
    return ELmonocleDB(
        elastic_conn=os.getenv("ELASTIC_CONN", "localhost:9200"),
        index=index,
        prefix=CHANGE_PREFIX,
        create=False,
        user=os.getenv("ELASTIC_USER", None),
        password=os.getenv("ELASTIC_PASSWORD", None),
        use_ssl=os.getenv("ELASTIC_USE_SSL", None),
        verify_certs=os.getenv("ELASTIC_INSECURE", None),
        ssl_show_warn=os.getenv("ELASTIC_SSL_SHOW_WARN", None),
    )
def indices():
    db = ELmonocleDB(
        elastic_conn=os.getenv("ELASTIC_CONN", "localhost:9200"),
        create=False,
        prefix=CHANGE_PREFIX,
        user=os.getenv("ELASTIC_USER", None),
        password=os.getenv("ELASTIC_PASSWORD", None),
        use_ssl=os.getenv("ELASTIC_USE_SSL", None),
        verify_certs=os.getenv("ELASTIC_INSECURE", None),
        ssl_show_warn=os.getenv("ELASTIC_SSL_SHOW_WARN", None),
    )
    _indices = db.get_indices()
    indices = []
    for indice in _indices:
        if config.is_public_index(indexes_acl, indice):
            indices.append(indice)
        else:
            user = session.get("username")
            if user:
                if user in config.get_authorized_users(indexes_acl, indice):
                    indices.append(indice)
    return jsonify(indices)
def setUpClass(cls):
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - " + "%(levelname)s - %(message)s",
    )
    log = logging.getLogger(__name__)
    # log to stderr
    log.addHandler(logging.StreamHandler())
    cls.eldb = ELmonocleDB(
        index=cls.index,
        prefix="monocle.test.",
        user=os.getenv("ELASTIC_USER", None),
        password=os.getenv("ELASTIC_PASSWORD", None),
        use_ssl=os.getenv("ELASTIC_USE_SSL", None),
        verify_certs=os.getenv("ELASTIC_INSECURE", None),
        ssl_show_warn=os.getenv("ELASTIC_SSL_SHOW_WARN", None),
    )
    for dataset in cls.datasets:
        index_dataset(cls.eldb, dataset)
def main():
    parser = argparse.ArgumentParser(prog='monocle')
    parser.add_argument('--loglevel', help='logging level', default='INFO')
    parser.add_argument(
        '--elastic-timeout',
        help='Elasticsearch connection retry timeout',
        default=10,
        type=int,
    )
    parser.add_argument(
        '--elastic-conn', help='Elasticsearch connection info', default='localhost:9200'
    )
    subparsers = parser.add_subparsers(
        title='Subcommands', description='valid subcommands', dest="command"
    )

    parser_crawler = subparsers.add_parser('crawler', help='Threaded crawlers pool')
    parser_crawler.add_argument(
        '--config', help='Configuration file of the crawlers pool', required=True
    )

    parser_dbmanage = subparsers.add_parser('dbmanage', help='Database manager')
    parser_dbmanage.add_argument(
        '--delete-repository',
        help='Delete events related to a repository (regexp)',
    )
    parser_dbmanage.add_argument(
        '--delete-index',
        help='Delete the index',
        action='store_true',
    )
    parser_dbmanage.add_argument(
        '--index', help='The Elasticsearch index name', required=True
    )

    parser_dbquery = subparsers.add_parser(
        'dbquery', help='Run an existing query on stored events'
    )
    parser_dbquery.add_argument(
        '--index', help='The Elasticsearch index name', required=True
    )
    parser_dbquery.add_argument('--name', help='The query name', required=True)
    parser_dbquery.add_argument(
        '--repository', help='Scope to events of repositories (regexp)', required=True
    )
    parser_dbquery.add_argument(
        '--target-branch', help='Scope to events of target branches (regexp)'
    )
    parser_dbquery.add_argument('--gte', help='Scope to events created after date')
    parser_dbquery.add_argument('--lte', help='Scope to events created before date')
    parser_dbquery.add_argument(
        '--on_cc_gte', help='Scope to events related to changes created after date'
    )
    parser_dbquery.add_argument(
        '--on_cc_lte', help='Scope to events related to changes created before date'
    )
    parser_dbquery.add_argument(
        '--ec-same-date',
        help='Scope to events related to changes created during the '
        'same date boundaries defined by gte/lte arguments',
        action='store_true',
    )
    parser_dbquery.add_argument(
        '--type', help='Scope to events types list (comma separated)'
    )
    parser_dbquery.add_argument(
        '--files', help='Scope to changes containing this file regexp'
    )
    parser_dbquery.add_argument(
        '--state',
        help='Scope to changes having this state',
        choices=['OPEN', 'CLOSED', 'MERGED'],
    )
    parser_dbquery.add_argument(
        '--change-ids', help='Scope to change ids (comma separated)'
    )
    parser_dbquery.add_argument('--authors', help='Scope to authors (comma separated)')
    parser_dbquery.add_argument(
        '--approvals', help='Scope to objects with approvals (comma separated)'
    )
    parser_dbquery.add_argument(
        '--exclude-approvals', help='Approvals exclude list (comma separated)'
    )
    parser_dbquery.add_argument(
        '--size', help='Return maximum of size results', default=10
    )
    parser_dbquery.add_argument(
        '--from', help='Starting index of the elements to retrieve', default=0
    )
    parser_dbquery.add_argument(
        '--exclude-authors', help='Authors exclude list (comma separated)'
    )
    parser_dbquery.add_argument(
        '--tests-included',
        help='Scope to changes containing tests',
        action='store_true',
    )
    parser_dbquery.add_argument(
        '--has-issue-tracker-links',
        help='Scope to changes containing an issue tracker link',
        choices=['generic', 'github.com', 'altassian.net'],
    )

    args = parser.parse_args()

    logging.basicConfig(
        level=getattr(logging, args.loglevel.upper()),
        format="%(asctime)s - %(name)s - %(thread)d - %(threadName)s - "
        + "%(levelname)s - %(message)s",
    )

    log = logging.getLogger(__name__)

    if not args.command:
        parser.print_usage()
        return 1

    if args.command == "crawler":
        realpath = os.path.expanduser(args.config)
        if not os.path.isfile(realpath):
            log.error('Unable to access config: %s' % realpath)
            sys.exit(1)
        configdata = yaml.safe_load(open(realpath).read())
        validate(instance=configdata, schema=config.schema)
        tpool = []
        group = {}
        app = None
        if os.getenv('APP_ID') and os.getenv('APP_KEY_PATH'):
            app = application.get_app(os.getenv('APP_ID'), os.getenv('APP_KEY_PATH'))
        for tenant in configdata['tenants']:
            for crawler_item in tenant['crawler'].get('github_orgs', []):
                tg = pullrequest.TokenGetter(
                    crawler_item['name'], crawler_item.get('token'), app
                )
                c_args = pullrequest.GithubCrawlerArgs(
                    command='github_crawler',
                    org=crawler_item['name'],
                    updated_since=crawler_item['updated_since'],
                    loop_delay=tenant['crawler']['loop_delay'],
                    repository=crawler_item.get('repository'),
                    base_url=crawler_item['base_url'],
                    token_getter=tg,
                    db=ELmonocleDB(
                        elastic_conn=args.elastic_conn,
                        index=tenant['index'],
                        timeout=args.elastic_timeout,
                    ),
                )
                gid = crawler_item.get('token')
                if not gid:
                    if app:
                        # No token, if we have an app then get the token from the app
                        gid = app.get_token(org=crawler_item['name'])
                    else:
                        log.info('Skip crawler because no token: %s' % c_args)
                        continue
                if gid not in group:
                    group[gid] = GroupCrawler()
                    tpool.append(group[gid])
                if c_args.repository:
                    repositories = [c_args.repository]
                else:
                    log.info('Discovering repositories in %s ...' % c_args.org)
                    # No repository specified for that organization so
                    # try to discover all of them
                    rf = organization.RepositoriesFetcher(
                        graphql.GithubGraphQLQuery(token_getter=tg)
                    )
                    repos = rf.get(c_args.org)
                    repositories = [
                        repo['name'] for repo in repos if not repo['isArchived']
                    ]
                    log.info(
                        'Found %s repositories in %s ...'
                        % (len(repositories), c_args.org)
                    )
                for repository in repositories:
                    c_args.repository = repository
                    group[gid].add_crawler(Runner(c_args))
            for crawler_item in tenant['crawler'].get('gerrit_repositories', []):
                c_args = review.GerritCrawlerArgs(
                    command='gerrit_crawler',
                    repository=crawler_item['name'],
                    updated_since=crawler_item['updated_since'],
                    loop_delay=tenant['crawler']['loop_delay'],
                    base_url=crawler_item['base_url'],
                    insecure=crawler_item.get('insecure', False),
                    login=crawler_item.get('login'),
                    password=crawler_item.get('password'),
                    db=ELmonocleDB(
                        elastic_conn=args.elastic_conn,
                        index=tenant['index'],
                        timeout=args.elastic_timeout,
                    ),
                )
                tpool.append(Crawler(c_args))
        log.info('%d configured threads' % len(tpool))
        for cthread in tpool:
            cthread.start()

    if args.command == "dbmanage":
        db = ELmonocleDB(elastic_conn=args.elastic_conn, index=args.index)
        if args.delete_repository:
            db.delete_repository(args.delete_repository)
        if args.delete_index:
            db.delete_index()

    if args.command == "dbquery":
        db = ELmonocleDB(elastic_conn=args.elastic_conn, index=args.index)
        params = utils.set_params(args)
        try:
            ret = db.run_named_query(args.name, args.repository.lstrip('^'), params)
        except UnknownQueryException as err:
            log.error('Unable to run query: %s' % err)
            sys.exit(1)
        pprint(ret)
def main():
    parser = argparse.ArgumentParser(prog='monocle')
    parser.add_argument(
        '--loglevel', help='logging level', default='INFO')
    subparsers = parser.add_subparsers(title='Subcommands',
                                       description='valid subcommands',
                                       dest="command")

    for crawler_driver in (pullrequest, review):
        parser_crawler = subparsers.add_parser(
            crawler_driver.name, help=crawler_driver.help)
        parser_crawler.add_argument(
            '--loop-delay', help='Request last updated events every N secs',
            default=900)
        parser_crawler.add_argument(
            '--host', help='Base url of the code review server',
            required=True)
        crawler_driver.init_crawler_args_parser(parser_crawler)

    parser_dbmanage = subparsers.add_parser(
        'dbmanage', help='Database manager')
    parser_dbmanage.add_argument(
        '--delete-repository',
        help='Delete events related to a repository (regexp)',
        required=True)

    parser_dbquery = subparsers.add_parser(
        'dbquery', help='Run an existing query on stored events')
    parser_dbquery.add_argument(
        '--interval', help='Histogram interval', default="3h")
    parser_dbquery.add_argument(
        '--name', help='The query name', required=True)
    parser_dbquery.add_argument(
        '--repository', help='Scope to events of a repository (regexp)',
        required=True)
    parser_dbquery.add_argument(
        '--gte', help='Scope to events created after date')
    parser_dbquery.add_argument(
        '--lte', help='Scope to events created before date')
    parser_dbquery.add_argument(
        '--on_cc_gte',
        help='Scope to events related to changes created after date')
    parser_dbquery.add_argument(
        '--on_cc_lte',
        help='Scope to events related to changes created before date')
    parser_dbquery.add_argument(
        '--ec-same-date',
        help='Scope to events related to changes created during the '
             'same date boundaries defined by gte/lte arguments',
        action='store_true')
    parser_dbquery.add_argument(
        '--type', help='Scope to events types list (comma separated)')
    parser_dbquery.add_argument(
        '--authors', help='Scope to authors (comma separated)')
    parser_dbquery.add_argument(
        '--approval', help='Scope to events with approval')
    parser_dbquery.add_argument(
        '--size', help='Return maximum of size results', default=10)
    parser_dbquery.add_argument(
        '--exclude-authors', help='Authors exclude list (comma separated)')

    args = parser.parse_args()

    logging.basicConfig(
        level=getattr(logging, args.loglevel.upper()))

    if not args.command:
        parser.print_usage()
        return 1

    if args.command.endswith("_crawler"):
        crawler = MonocleCrawler(args)
        crawler.run()

    if args.command == "dbmanage":
        if args.delete_repository:
            db = ELmonocleDB()
            db.delete_repository(args.delete_repository)

    if args.command == "dbquery":
        db = ELmonocleDB()
        params = utils.set_params(args)
        ret = db.run_named_query(
            args.name, args.repository.lstrip('^'), params)
        pprint(ret)
class Runner(object):
    def __init__(self, args, elastic_conn='localhost:9200', elastic_timeout=10):
        super().__init__()
        self.updated_since = args.updated_since
        self.dump_dir = DUMP_DIR if os.path.isdir(DUMP_DIR) else None
        self.loop_delay = int(args.loop_delay)
        self.db = ELmonocleDB(
            elastic_conn=elastic_conn, index=args.index, timeout=elastic_timeout
        )
        if args.command == 'github_crawler':
            if args.repository:
                self.repository_el_re = "%s/%s" % (
                    args.org.lstrip('^'),
                    args.repository.lstrip('^'),
                )
            else:
                self.repository_el_re = args.org.lstrip('^') + '/.*'
            self.prf = pullrequest.PRsFetcher(
                GithubGraphQLQuery(args.token),
                args.base_url,
                args.org,
                args.repository,
            )
        elif args.command == 'gerrit_crawler':
            self.repository_el_re = args.repository.lstrip('^')
            self.prf = review.ReviewesFetcher(args.base_url, args.repository)

    def get_last_updated_date(self):
        change = self.db.get_last_updated(self.repository_el_re)
        if not change:
            return self.updated_since or datetime.now().strftime(
                "%Y-%m-%dT%H:%M:%SZ")
        else:
            log.info("Most recent change date in the database for %s is %s"
                     % (self.repository_el_re, change['updated_at']))
            return change['updated_at']

    def run_step(self):
        def dump_data(data, prefix=None):
            try:
                if self.dump_dir:
                    tmpfile = tempfile.NamedTemporaryFile(
                        dir=self.dump_dir,
                        prefix=prefix,
                        suffix='.json',
                        mode='w',
                        delete=False,
                    )
                    json.dump(data, tmpfile)
                    tmpfile.close()
                    log.info('Data dumped to %s' % tmpfile.name)
                    return tmpfile.name
            except Exception:
                log.exception('Unable to dump data')
            return None

        updated_since = self.get_last_updated_date()
        try:
            prs = self.prf.get(updated_since)
        except Exception:
            log.exception('Unable to get PR data')
            return
        objects = self.prf.extract_objects(prs, dump_data)
        if objects:
            log.info("%d objects will be updated in the database" % len(objects))
            self.db.update(objects)
def string_ident_to_ident(elastic_conn, index) -> None:
    bulk_size = 7500
    client = ELmonocleDB(elastic_conn, index, previous_schema=True)
    client2 = ELmonocleDB(elastic_conn, index)
    changes_url_lookup: Dict[str, str] = {}
    to_update: List = []
    need_url_update: List[Dict] = []
    total_objects_updated = 0

    def bulk_update(to_update: List) -> None:
        client2.update(to_update)

    def update_changes_url_lookup(objs: List[Dict]) -> None:
        change_ids = [o["change_id"] for o in objs]
        change_ids = list(set(change_ids))
        change_ids = [_id for _id in change_ids if _id not in changes_url_lookup]
        print("Updating change_url_lookup for %s changes ..." % len(change_ids))
        params = {"change_ids": change_ids, "size": 10000, "from": 0}
        result = client.run_named_query("changes", ".*", params=params)
        changes = result["items"]
        for change in changes:
            changes_url_lookup[change["change_id"]] = utils.strip_url(change["url"])
        print("%s entries in changes_url_lookup" % len(changes_url_lookup))

    def update_obj(obj: Dict) -> Dict:
        url = utils.strip_url(obj["url"])

        def update_approval_type(approval):
            if isinstance(approval, str):
                ret = [approval]
            else:
                ret = approval
            return [r for r in ret if r is not None]

        def create_ident_dict(url: str, uid: str) -> Dict:
            domain = urlparse(url).netloc
            uid = prefix(domain, uid)
            return {
                "uid": uid,
                "muid": create_muid_from_uid(uid),
            }

        def to_ident(value: Optional[str]) -> Optional[Dict]:
            if value:
                return create_ident_dict(url, value)
            return None

        if obj["type"] == "Change":
            obj["author"] = to_ident(obj["author"])
            obj["committer"] = to_ident(obj.get("committer"))
            obj["merged_by"] = to_ident(obj.get("merged_by"))
            obj["assignees"] = list(map(to_ident, obj.get("assignees", [])))
            for commit in obj.get("commits", []):
                # Also fix the commit's author field that might not exist
                if "author" not in commit.keys():
                    commit["author"] = obj["author"]
                else:
                    commit["author"] = to_ident(commit["author"])
                # Also fix the commit's committer field that might not exist
                if "committer" not in commit.keys():
                    commit["committer"] = commit["author"]
                else:
                    commit["committer"] = to_ident(commit["committer"])
        else:
            obj["author"] = to_ident(obj.get("author"))
            obj["on_author"] = to_ident(obj.get("on_author"))
            # Also fix a missing created_at date on ChangeCommitPushedEvent
            if obj["type"] == "ChangeCommitPushedEvent" and obj["created_at"] is None:
                obj["created_at"] = obj["on_created_at"]
        # Also fix the approval format if needed
        if obj.get("approval"):
            obj["approval"] = update_approval_type(obj["approval"])
        # Ensure we have the stripped url
        obj["url"] = url
        return obj

    def proceed():
        if need_url_update:
            update_changes_url_lookup(need_url_update)
        for o in to_update:
            if o in need_url_update:
                if o["change_id"] in changes_url_lookup:
                    o["url"] = changes_url_lookup[o["change_id"]]
                else:
                    print("Warning - unable to find change %s" % o["change_id"])
                    o["url"] = "https://undefined"
        updated = list(map(update_obj, to_update))
        print("Updating %s objects ..." % len(to_update))
        bulk_update(list(map(dict_to_change_or_event, updated)))

    for _obj in client.iter_index():
        obj = _obj["_source"]
        if obj["type"] in utils.get_events_list() and "url" not in obj.keys():
            need_url_update.append(obj)
        if obj["type"] in utils.get_events_list() + ["Change"]:
            to_update.append(obj)
        if len(to_update) == bulk_size:
            proceed()
            total_objects_updated += len(to_update)
            print("Total objects updated: %s" % total_objects_updated)
            need_url_update = []
            to_update = []

    proceed()
    total_objects_updated += len(to_update)
    print("Total objects updated: %s" % total_objects_updated)
def main():
    parser_dbquery = argparse.ArgumentParser(prog=sys.argv[0])
    parser_dbquery.add_argument(
        "--repository",
        help="Scope to events of a repository (regexp)",
        default=r".*",
    )
    parser_dbquery.add_argument("--gte", help="Scope to events created after date")
    parser_dbquery.add_argument("--lte", help="Scope to events created before date")
    parser_dbquery.add_argument(
        "--size", help="Return maximum of size results", default=1000
    )
    parser_dbquery.add_argument(
        "--exclude-authors", help="Authors exclude list (comma separated)"
    )
    args = parser_dbquery.parse_args()

    db = ELmonocleDB()
    params = utils.set_params(args)
    data = db.run_named_query(
        "last_merged_changes", args.repository.lstrip("^"), params
    )
    lte_time = (
        datetime.datetime.strptime(args.lte, "%Y-%m-%d") + datetime.timedelta(days=1)
        if args.lte
        else None
    )
    title = {}
    for entry in data:
        # example: 2020-02-24T19:05:13Z
        created_time = datetime.datetime.strptime(
            entry["created_at"], "%Y-%m-%dT%H:%M:%SZ"
        )
        merge_time = datetime.datetime.strptime(
            entry["merged_at"], "%Y-%m-%dT%H:%M:%SZ"
        )
        if lte_time and merge_time > lte_time:
            continue
        print(
            "%.0f|%s|A|/%s/%s|"
            % (
                created_time.timestamp(),
                entry["author"],
                entry["repository_fullname"],
                entry["title"],
            )
        )
        print(
            "%.0f|%s|M|/%s/%s|"
            % (
                merge_time.timestamp(),
                entry["author"],
                entry["repository_fullname"],
                entry["title"],
            )
        )
        title[entry["repository_fullname_and_number"]] = entry["title"]

    params["etype"] = ("ChangeCommentedEvent",)
    data = db.run_named_query("_scan", args.repository.lstrip("^"), params)
    for entry in data:
        # example: 2020-02-24T19:05:13Z
        created_time = datetime.datetime.strptime(
            entry["created_at"], "%Y-%m-%dT%H:%M:%SZ"
        )
        try:
            print(
                "%.0f|%s|M|/%s/%s|"
                % (
                    created_time.timestamp(),
                    entry["author"],
                    entry["repository_fullname"],
                    title[entry["repository_fullname_and_number"]],
                )
            )
        except KeyError:
            print(
                "%s not merged" % entry["repository_fullname_and_number"],
                file=sys.stderr,
            )
def main():
    parser = argparse.ArgumentParser(prog='monocle')
    parser.add_argument('--loglevel', help='logging level', default='INFO')
    parser.add_argument(
        '--elastic-timeout',
        help='Elasticsearch connection retry timeout',
        default=10,
        type=int,
    )
    parser.add_argument(
        '--elastic-conn', help='Elasticsearch connection info',
        default='localhost:9200')
    subparsers = parser.add_subparsers(
        title='Subcommands', description='valid subcommands', dest="command")

    parser_crawler = subparsers.add_parser('crawler', help='Threaded crawlers pool')
    parser_crawler.add_argument(
        '--config', help='Configuration file of the crawlers pool', required=True)

    parser_dbmanage = subparsers.add_parser('dbmanage', help='Database manager')
    parser_dbmanage.add_argument(
        '--delete-repository',
        help='Delete events related to a repository (regexp)',
        required=True,
    )
    parser_dbmanage.add_argument(
        '--index', help='The Elasticsearch index name', required=True)

    parser_dbquery = subparsers.add_parser(
        'dbquery', help='Run an existing query on stored events')
    parser_dbquery.add_argument(
        '--index', help='The Elasticsearch index name', required=True)
    parser_dbquery.add_argument('--name', help='The query name', required=True)
    parser_dbquery.add_argument(
        '--repository', help='Scope to events of repositories (regexp)',
        required=True)
    parser_dbquery.add_argument(
        '--target-branch', help='Scope to events of target branches (regexp)')
    parser_dbquery.add_argument('--gte', help='Scope to events created after date')
    parser_dbquery.add_argument('--lte', help='Scope to events created before date')
    parser_dbquery.add_argument(
        '--on_cc_gte',
        help='Scope to events related to changes created after date')
    parser_dbquery.add_argument(
        '--on_cc_lte',
        help='Scope to events related to changes created before date')
    parser_dbquery.add_argument(
        '--ec-same-date',
        help='Scope to events related to changes created during the '
        'same date boundaries defined by gte/lte arguments',
        action='store_true',
    )
    parser_dbquery.add_argument(
        '--type', help='Scope to events types list (comma separated)')
    parser_dbquery.add_argument(
        '--files', help='Scope to changes containing this file regexp')
    parser_dbquery.add_argument(
        '--state',
        help='Scope to changes having this state',
        choices=['OPEN', 'CLOSED', 'MERGED'],
    )
    parser_dbquery.add_argument(
        '--change-ids', help='Scope to change ids (comma separated)')
    parser_dbquery.add_argument(
        '--authors', help='Scope to authors (comma separated)')
    parser_dbquery.add_argument('--approval', help='Scope to events with approval')
    parser_dbquery.add_argument(
        '--size', help='Return maximum of size results', default=10)
    parser_dbquery.add_argument(
        '--from', help='Starting index of the elements to retrieve', default=0)
    parser_dbquery.add_argument(
        '--exclude-authors', help='Authors exclude list (comma separated)')
    parser_dbquery.add_argument(
        '--tests-included',
        help='Scope to changes containing tests',
        action='store_true',
    )
    parser_dbquery.add_argument(
        '--has-issue-tracker-links',
        help='Scope to changes containing an issue tracker link',
        choices=['generic', 'github.com', 'altassian.net'],
    )

    args = parser.parse_args()

    logging.basicConfig(
        level=getattr(logging, args.loglevel.upper()),
        format="%(asctime)s - %(name)s - %(threadName)s - "
        + "%(levelname)s - %(message)s",
    )

    log = logging.getLogger(__name__)

    if not args.command:
        parser.print_usage()
        return 1

    if args.command == "crawler":
        realpath = os.path.expanduser(args.config)
        if not os.path.isfile(realpath):
            log.error('Unable to access config: %s' % realpath)
            sys.exit(1)
        configdata = yaml.safe_load(open(realpath).read())
        validate(instance=configdata, schema=config.schema)
        tpool = []
        group = {}
        for tenant in configdata['tenants']:
            for crawler_item in tenant['crawler'].get('github_orgs', []):
                c_args = pullrequest.GithubCrawlerArgs(
                    command='github_crawler',
                    index=tenant['index'],
                    org=crawler_item['name'],
                    updated_since=crawler_item['updated_since'],
                    loop_delay=tenant['crawler']['loop_delay'],
                    token=crawler_item['token'],
                    repository=crawler_item.get('repository'),
                    base_url=crawler_item['base_url'],
                )
                log.info('args=%s' % c_args)
                if crawler_item['token'] not in group:
                    group[crawler_item['token']] = GroupCrawler()
                    tpool.append(group[crawler_item['token']])
                group[crawler_item['token']].add_crawler(
                    Runner(
                        c_args,
                        elastic_conn=args.elastic_conn,
                        elastic_timeout=args.elastic_timeout,
                    ))
            for crawler_item in tenant['crawler'].get('gerrit_repositories', []):
                c_args = review.GerritCrawlerArgs(
                    command='gerrit_crawler',
                    index=tenant['index'],
                    repository=crawler_item['name'],
                    updated_since=crawler_item['updated_since'],
                    loop_delay=tenant['crawler']['loop_delay'],
                    base_url=crawler_item['base_url'],
                )
                tpool.append(
                    Crawler(
                        c_args,
                        elastic_conn=args.elastic_conn,
                        elastic_timeout=args.elastic_timeout,
                    ))
        log.info('%d configured threads' % len(tpool))
        for cthread in tpool:
            cthread.start()

    if args.command == "dbmanage":
        if args.delete_repository:
            db = ELmonocleDB(elastic_conn=args.elastic_conn, index=args.index)
            db.delete_repository(args.delete_repository)

    if args.command == "dbquery":
        db = ELmonocleDB(elastic_conn=args.elastic_conn, index=args.index)
        params = utils.set_params(args)
        try:
            ret = db.run_named_query(args.name, args.repository.lstrip('^'), params)
        except UnknownQueryException as err:
            log.error('Unable to run query: %s' % err)
            sys.exit(1)
        pprint(ret)
def main() -> None:
    parser = argparse.ArgumentParser(prog="monocle")
    parser.add_argument("--loglevel", help="logging level", default="INFO")
    parser.add_argument(
        "--elastic-timeout",
        help="Elasticsearch connection retry timeout",
        default=10,
        type=int,
    )
    parser.add_argument(
        "--elastic-conn", help="Elasticsearch connection info",
        default="localhost:9200")
    parser.add_argument(
        "--use-ssl",
        help="Use https protocol for communication with Elasticsearch",
        action="store_true",
    )
    parser.add_argument(
        "--insecure",
        help="Skip SSL CA cert validation",
        action="store_false",
    )
    parser.add_argument(
        "--ssl_show_warn",
        help="Skip showing an SSL warning message if it is not signed "
        "by CA authority",
        action="store_false",
    )
    parser.add_argument(
        "--elastic-user",
        help="Username for Elasticsearch authorization",
    )
    parser.add_argument(
        "--elastic-password",
        help="Password for Elasticsearch authorization",
    )
    subparsers = parser.add_subparsers(
        title="Subcommands", description="valid subcommands", dest="command")

    parser_crawler = subparsers.add_parser("crawler", help="Threaded crawlers pool")
    parser_crawler.add_argument(
        "--config", help="Configuration file of the crawlers pool", required=True)

    parser_dbmanage = subparsers.add_parser("dbmanage", help="Database manager")
    parser_dbmanage.add_argument("--config", help="Configuration file", required=False)
    parser_dbmanage.add_argument(
        "--delete-repository",
        help="Delete events related to a repository (regexp)",
    )
    parser_dbmanage.add_argument(
        "--delete-index",
        help="Delete the index",
        action="store_true",
    )
    parser_dbmanage.add_argument(
        "--index", help="The Elasticsearch index name", required=True)
    parser_dbmanage.add_argument(
        "--run-migrate",
        help="Run the migration process",
    )
    parser_dbmanage.add_argument(
        "--update-idents",
        help="Update identities",
        action="store_true",
    )

    parser_dbquery = subparsers.add_parser(
        "dbquery", help="Run an existing query on stored events")
    parser_dbquery.add_argument(
        "--index", help="The Elasticsearch index name", required=True)
    parser_dbquery.add_argument("--name", help="The query name", required=True)
    parser_dbquery.add_argument(
        "--repository", help="Scope to events of repositories (regexp)",
        required=True)
    parser_dbquery.add_argument(
        "--target-branch", help="Scope to events of target branches (regexp)")
    parser_dbquery.add_argument("--gte", help="Scope to events created after date")
    parser_dbquery.add_argument("--lte", help="Scope to events created before date")
    parser_dbquery.add_argument(
        "--on_cc_gte",
        help="Scope to events related to changes created after date")
    parser_dbquery.add_argument(
        "--on_cc_lte",
        help="Scope to events related to changes created before date")
    parser_dbquery.add_argument(
        "--ec-same-date",
        help="Scope to events related to changes created during the "
        "same date boundaries defined by gte/lte arguments",
        action="store_true",
    )
    parser_dbquery.add_argument(
        "--type", help="Scope to events types list (comma separated)")
    parser_dbquery.add_argument(
        "--files", help="Scope to changes containing this file regexp")
    parser_dbquery.add_argument(
        "--state",
        help="Scope to changes with state (comma separated)",
    )
    parser_dbquery.add_argument(
        "--change-ids", help="Scope to change ids (comma separated)")
    parser_dbquery.add_argument("--authors", help="Scope to authors (comma separated)")
    parser_dbquery.add_argument(
        "--approvals", help="Scope to objects with approvals (comma separated)")
    parser_dbquery.add_argument(
        "--exclude-approvals", help="Approvals exclude list (comma separated)")
    parser_dbquery.add_argument(
        "--size", help="Return maximum of size results", default=10)
    parser_dbquery.add_argument(
        "--from", help="Starting index of the elements to retrieve", default=0)
    parser_dbquery.add_argument(
        "--exclude-authors", help="Authors exclude list (comma separated)")
    parser_dbquery.add_argument(
        "--tests-included",
        help="Scope to changes containing tests",
        action="store_true",
    )
    parser_dbquery.add_argument(
        "--self-merged",
        help="Scope to changes merged by their authors",
        action="store_true",
    )
    parser_dbquery.add_argument(
        "--has-issue-tracker-links",
        help="Scope to changes containing an issue tracker link",
        choices=["generic", "github.com", "altassian.net"],
    )
    parser_dbquery.add_argument(
        "--task-priority",
        help="Scope to changes related to task priorities (comma separated)",
    )
    parser_dbquery.add_argument(
        "--task-severity",
        help="Scope to changes related to task severities (comma separated)",
    )
    parser_dbquery.add_argument(
        "--task-issue-type",
        help="Scope to changes related to task type (comma separated)",
    )
    parser_dbquery.add_argument(
        "--task-score",
        help="Scope to changes related to task score '<op>: <val>'",
    )

    args = parser.parse_args()

    logging.basicConfig(
        level=getattr(logging, args.loglevel.upper()),
        format="%(asctime)s - %(name)s - %(thread)d - %(threadName)s - "
        + "%(levelname)s - %(message)s",
    )

    log = logging.getLogger(__name__)

    if not args.command:
        parser.print_usage()
        sys.exit(1)

    if args.command == "crawler":
        realpath = os.path.expanduser(args.config)
        if not os.path.isfile(realpath):
            log.error("Unable to access config: %s" % realpath)
            sys.exit(1)
        configdata = yaml.safe_load(open(realpath).read())
        config.validate(configdata, config.schema)
        tpool: List[Union[Crawler, GroupCrawler]] = []
        group = {}
        app = None
        if os.getenv("APP_ID") and os.getenv("APP_KEY_PATH"):
            app = application.get_app(os.getenv("APP_ID"), os.getenv("APP_KEY_PATH"))
        for tenant in configdata["tenants"]:
            idents_config = config.get_idents_config(configdata, tenant["index"])
            for crawler_item in tenant.get("crawler", {}).get("github_orgs", []):
                tg = pullrequest.TokenGetter(
                    crawler_item["name"], crawler_item.get("token"), app)
                github_c_args = pullrequest.GithubCrawlerArgs(
                    command="github_crawler",
                    org=crawler_item["name"],
                    updated_since=crawler_item["updated_since"],
                    loop_delay=tenant["crawler"]["loop_delay"],
                    repository=crawler_item.get("repository"),
                    base_url=utils.strip_url(crawler_item["base_url"]),
                    token_getter=tg,
                    db=ELmonocleDB(
                        elastic_conn=args.elastic_conn,
                        index=tenant["index"],
                        timeout=args.elastic_timeout,
                        user=args.elastic_user,
                        password=args.elastic_password,
                        use_ssl=args.use_ssl,
                        verify_certs=args.insecure,
                        ssl_show_warn=args.ssl_show_warn,
                    ),
                    idents_config=idents_config,
                )
                gid = crawler_item.get("token")
                if not gid:
                    if app:
                        # No token, if we have an app then get the token from the app
                        gid = app.get_token(org=crawler_item["name"])
                    else:
                        log.info("Skip crawler because no token: %s" % github_c_args)
                        continue
                if gid not in group:
                    group[gid] = GroupCrawler()
                    tpool.append(group[gid])
                if github_c_args.repository:
                    repositories = [github_c_args.repository]
                else:
                    log.info("Discovering repositories in %s ..." % github_c_args.org)
                    # No repository specified for that organization so
                    # try to discover all of them
                    rf = organization.RepositoriesFetcher(
                        graphql.GithubGraphQLQuery(token_getter=tg))
                    repos = rf.get(github_c_args.org)
                    repositories = [
                        repo["name"] for repo in repos if not repo["isArchived"]
                    ]
                    log.info("Found %s repositories in %s ..."
                             % (len(repositories), github_c_args.org))
                for repository in repositories:
                    github_c_args.repository = repository
                    group[gid].add_crawler(Runner(github_c_args))
            for crawler_item in tenant.get("crawler", {}).get(
                    "gerrit_repositories", []):
                gerrit_c_args = review.GerritCrawlerArgs(
                    command="gerrit_crawler",
                    repository=crawler_item["name"],
                    updated_since=crawler_item["updated_since"],
                    loop_delay=tenant["crawler"]["loop_delay"],
                    base_url=utils.strip_url(crawler_item["base_url"]),
                    insecure=crawler_item.get("insecure", False),
                    login=crawler_item.get("login"),
                    password=crawler_item.get("password"),
                    db=ELmonocleDB(
                        elastic_conn=args.elastic_conn,
                        index=tenant["index"],
                        timeout=args.elastic_timeout,
                        user=args.elastic_user,
                        password=args.elastic_password,
                        use_ssl=args.use_ssl,
                        verify_certs=args.insecure,
                        ssl_show_warn=args.ssl_show_warn,
                    ),
                    prefix=crawler_item.get("prefix"),
                    idents_config=idents_config,
                )
                tpool.append(Crawler(gerrit_c_args))
        log.info("%d configured threads" % len(tpool))
        for cthread in tpool:
            cthread.start()

    if args.command == "dbmanage":
        if args.update_idents and not args.config:
            log.error("Please provide the --config option")
            sys.exit(1)
        if args.update_idents:
            idents_config = config.get_idents_config(
                yaml.safe_load(open(args.config)), args.index)
        else:
            idents_config = []
        db = ELmonocleDB(
            elastic_conn=args.elastic_conn,
            index=args.index,
            idents_config=idents_config,
            user=args.elastic_user,
            password=args.elastic_password,
            use_ssl=args.use_ssl,
            verify_certs=args.insecure,
            ssl_show_warn=args.ssl_show_warn,
        )
        if args.delete_repository:
            db.delete_repository(args.delete_repository)
        if args.delete_index:
            db.delete_index()
        if args.update_idents:
            db.update_idents()
        if args.run_migrate:
            try:
                migrate.run_migrate(args.run_migrate, args.elastic_conn, args.index)
            except migrate.NotAvailableException:
                log.error("Error: %s is not a valid migration process"
                          % args.run_migrate)

    if args.command == "dbquery":
        db = ELmonocleDB(
            elastic_conn=args.elastic_conn,
            index=args.index,
            user=args.elastic_user,
            password=args.elastic_password,
            use_ssl=args.use_ssl,
            verify_certs=args.insecure,
            ssl_show_warn=args.ssl_show_warn,
        )
        params = utils.set_params(args)
        try:
            ret = db.run_named_query(args.name, args.repository.lstrip("^"), params)
        except UnknownQueryException as err:
            log.error("Unable to run query: %s" % err)
            sys.exit(1)
        pprint(ret)
def query(name):
    repository_fullname = request.args.get('repository')
    params = utils.set_params(request.args)
    db = ELmonocleDB()
    result = db.run_named_query(name, repository_fullname, params)
    return jsonify(result)