Example #1
def self_merge(elastic_conn, index) -> None:
    to_update = []
    bulk_size = 500

    def update_change(change):
        if "self_merged" not in change.keys():
            if "merged_by" not in change:
                # Fix the "merged_by" field, which the Gerrit crawler can leave missing
                change["merged_by"] = None
            if change["merged_by"]:
                change["self_merged"] = change["merged_by"] == change["author"]
            else:
                change["self_merged"] = None
            return True

    client = ELmonocleDB(elastic_conn, index)
    for _obj in client.iter_index():
        obj = _obj["_source"]
        if obj["type"] == "Change":
            updated = update_change(obj)
            if updated:
                to_update.append(dict_to_change_or_event(obj))
            if len(to_update) == bulk_size:
                print("Updating %s changes ..." % bulk_size)
                client.update(to_update)
                to_update = []
    # Flush the remaining changes that did not fill a complete bulk
    if to_update:
        print("Updating %s changes ..." % len(to_update))
        client.update(to_update)
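
A minimal sketch of how this migration helper might be invoked; both arguments below are placeholders, not values taken from the project.

# Hypothetical invocation; "localhost:9200" and "monocle.changes" are placeholder values.
self_merge("localhost:9200", "monocle.changes")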
Example #2
def self_merge(elastic_conn, index):
    to_update = []
    bulk_size = 500

    def update_change(change):
        if 'self_merged' not in change.keys():
            if 'merged_by' not in change:
                # Fix the 'merged_by' field, which the Gerrit crawler can leave missing
                change['merged_by'] = None
            if change['merged_by']:
                change['self_merged'] = change['merged_by'] == change['author']
            else:
                change['self_merged'] = None
            return True

    client = ELmonocleDB(elastic_conn, index)
    for _obj in client.iter_index():
        obj = _obj['_source']
        if obj['type'] == 'Change':
            updated = update_change(obj)
            if updated:
                to_update.append(obj)
            if len(to_update) == bulk_size:
                print("Updating %s changes ..." % bulk_size)
                client.update(to_update)
                to_update = []
    # Flush the remaining changes that did not fill a complete bulk
    if to_update:
        print("Updating %s changes ..." % len(to_update))
        client.update(to_update)
Example #3
def __init__(self,
             args,
             elastic_conn='localhost:9200',
             elastic_timeout=10):
    super().__init__()
    self.updated_since = args.updated_since
    self.dump_dir = DUMP_DIR if os.path.isdir(DUMP_DIR) else None
    self.loop_delay = int(args.loop_delay)
    self.db = ELmonocleDB(elastic_conn=elastic_conn,
                          index=args.index,
                          timeout=elastic_timeout)
    if args.command == 'github_crawler':
        if args.repository:
            self.repository_el_re = "%s/%s" % (
                args.org.lstrip('^'),
                args.repository.lstrip('^'),
            )
        else:
            self.repository_el_re = args.org.lstrip('^') + '/.*'
        self.prf = pullrequest.PRsFetcher(GithubGraphQLQuery(args.token),
                                          args.base_url, args.org,
                                          args.repository)
    elif args.command == 'gerrit_crawler':
        self.repository_el_re = args.repository.lstrip('^')
        self.prf = review.ReviewesFetcher(args.base_url, args.repository)
Example #4
class MonocleCrawler():

    log = logging.getLogger("monocle.Crawler")

    def __init__(self, args):
        self.updated_since = args.updated_since
        self.loop_delay = int(args.loop_delay)
        self.get_one = getattr(args, 'id', None)
        self.db = ELmonocleDB()
        if args.command == 'github_crawler':
            self.get_one_rep = getattr(args, 'repository', None)
            self.org = args.org
            self.repository_el_re = args.org.lstrip('^') + '.*'
            self.prf = pullrequest.PRsFetcher(
                GithubGraphQLQuery(args.token),
                args.host, args.org)
        elif args.command == 'gerrit_crawler':
            self.repository_el_re = args.repository.lstrip('^')
            self.prf = review.ReviewesFetcher(
                args.host, args.repository)

    def get_last_updated_date(self):
        change = self.db.get_last_updated(self.repository_el_re)
        if not change:
            return (
                self.updated_since or
                datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ"))
        else:
            logging.info(
                "Most recent change date in the database for %s is %s" % (
                    self.repository_el_re, change['updated_at']))
            return change['updated_at']

    def run_step(self):
        updated_since = self.get_last_updated_date()
        prs = self.prf.get(updated_since)
        objects = self.prf.extract_objects(prs)
        if objects:
            self.log.info("%s objects will be updated in the database" % len(
                objects))
            self.db.update(objects)

    def run(self):
        if self.get_one:
            if not self.get_one_rep:
                print("The --repository argument must be given")
            else:
                pprint(self.prf.get_one(
                    self.org, self.get_one_rep,
                    self.get_one))
        else:
            while True:
                self.run_step()
                self.log.info("Waiting %s seconds before next fetch ..." % (
                    self.loop_delay))
                sleep(self.loop_delay)
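
A hedged sketch of driving the class above directly with an argparse-style namespace; every value is a placeholder chosen to match the attributes __init__ reads, not configuration taken from the project.

from argparse import Namespace

# All values below are hypothetical; they only mirror the attributes read in __init__.
args = Namespace(
    command='github_crawler',
    updated_since='2020-01-01',
    loop_delay=900,
    id=None,               # no single change requested, so run() would loop forever
    repository=None,
    org='example-org',
    token='<github-token>',
    host='https://api.github.com',
)
crawler = MonocleCrawler(args)
# crawler.run()  # starts the endless fetch loop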
Example #5
def do_query(index, repository_fullname, args, name):
    params = utils.set_params(args)
    db = ELmonocleDB(
        elastic_conn=os.getenv('ELASTIC_CONN', 'localhost:9200'),
        index=index,
        prefix=CHANGE_PREFIX,
        create=False,
    )
    try:
        result = db.run_named_query(name, repository_fullname, params)
    except InvalidIndexError:
        return 'Invalid index: %s' % request.args.get('index'), 404
    return jsonify(result)
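
do_query depends on Flask's request context (request, jsonify); a hedged sketch of how a route might delegate to it follows. The app object and URL rule are assumptions for illustration, not the project's actual routing.

# Hypothetical Flask wiring; 'app' and the URL rule are illustrative only.
@app.route('/api/0/query/<name>')
def query_endpoint(name):
    index = request.args.get('index')
    repository_fullname = request.args.get('repository')
    return do_query(index, repository_fullname, request.args, name)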
Example #6
def __init__(self, args):
    self.updated_since = args.updated_since
    self.loop_delay = int(args.loop_delay)
    self.get_one = getattr(args, 'id', None)
    self.db = ELmonocleDB()
    if args.command == 'github_crawler':
        self.get_one_rep = getattr(args, 'repository', None)
        self.org = args.org
        self.repository_el_re = args.org.lstrip('^') + '.*'
        self.prf = pullrequest.PRsFetcher(
            GithubGraphQLQuery(args.token),
            args.host, args.org)
    elif args.command == 'gerrit_crawler':
        self.repository_el_re = args.repository.lstrip('^')
        self.prf = review.ReviewesFetcher(
            args.host, args.repository)
Example #7
def indices():
    db = ELmonocleDB(
        elastic_conn=os.getenv('ELASTIC_CONN', 'localhost:9200'),
        create=False,
        prefix=CHANGE_PREFIX,
    )
    _indices = db.get_indices()
    indices = []
    for indice in _indices:
        if config.is_public_index(indexes_acl, indice):
            indices.append(indice)
        else:
            user = session.get('username')
            if user:
                if user in config.get_authorized_users(indexes_acl, indice):
                    indices.append(indice)
    return jsonify(indices)
Example #8
def do_query(index, repository_fullname, args, name):
    params = utils.set_params(args)
    db = ELmonocleDB(
        elastic_conn=os.getenv("ELASTIC_CONN", "localhost:9200"),
        index=index,
        prefix=CHANGE_PREFIX,
        create=False,
        user=os.getenv("ELASTIC_USER", None),
        password=os.getenv("ELASTIC_PASSWORD", None),
        use_ssl=os.getenv("ELASTIC_USE_SSL", None),
        verify_certs=os.getenv("ELASTIC_INSECURE", None),
        ssl_show_warn=os.getenv("ELASTIC_SSL_SHOW_WARN", None),
    )
    try:
        result = db.run_named_query(name, repository_fullname, params)
    except InvalidIndexError:
        return "Invalid index: %s" % request.args.get("index"), 404
    return jsonify(result)
Example #9
def get_db_cnx(index: str, prefix: str) -> ELmonocleDB:
    return ELmonocleDB(
        index=index,
        prefix=prefix,
        user=os.getenv("ELASTIC_USER", None),
        password=os.getenv("ELASTIC_PASSWORD", None),
        use_ssl=os.getenv("ELASTIC_USE_SSL", False),
        verify_certs=os.getenv("ELASTIC_INSECURE", None),
        ssl_show_warn=os.getenv("ELASTIC_SSL_SHOW_WARN", None),
    )
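
A short usage sketch for the helper above; the index, prefix, and credential values are placeholders that only mirror the environment variables the function reads.

import os

# Hypothetical values; a real deployment would set these in its environment.
os.environ.setdefault("ELASTIC_USER", "admin")
os.environ.setdefault("ELASTIC_PASSWORD", "secret")
db = get_db_cnx("monocle.changes", "monocle.")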
Example #10
def setUpClass(cls):
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - " + "%(levelname)s - %(message)s",
    )
    log = logging.getLogger(__name__)
    # log to stderr
    log.addHandler(logging.StreamHandler())
    cls.eldb = ELmonocleDB(index=cls.index, prefix="monocle.test.")
    for dataset in cls.datasets:
        index_dataset(cls.eldb, dataset)
Example #11
def create_db_connection(index: Optional[str]) -> ELmonocleDB:
    return ELmonocleDB(
        elastic_conn=os.getenv("ELASTIC_CONN", "localhost:9200"),
        index=index,
        prefix=CHANGE_PREFIX,
        create=False,
        user=os.getenv("ELASTIC_USER", None),
        password=os.getenv("ELASTIC_PASSWORD", None),
        use_ssl=os.getenv("ELASTIC_USE_SSL", None),
        verify_certs=os.getenv("ELASTIC_INSECURE", None),
        ssl_show_warn=os.getenv("ELASTIC_SSL_SHOW_WARN", None),
    )
Example #12
def indices():
    db = ELmonocleDB(
        elastic_conn=os.getenv("ELASTIC_CONN", "localhost:9200"),
        create=False,
        prefix=CHANGE_PREFIX,
        user=os.getenv("ELASTIC_USER", None),
        password=os.getenv("ELASTIC_PASSWORD", None),
        use_ssl=os.getenv("ELASTIC_USE_SSL", None),
        verify_certs=os.getenv("ELASTIC_INSECURE", None),
        ssl_show_warn=os.getenv("ELASTIC_SSL_SHOW_WARN", None),
    )
    _indices = db.get_indices()
    indices = []
    for indice in _indices:
        if config.is_public_index(indexes_acl, indice):
            indices.append(indice)
        else:
            user = session.get("username")
            if user:
                if user in config.get_authorized_users(indexes_acl, indice):
                    indices.append(indice)
    return jsonify(indices)
Example #13
def setUpClass(cls):
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - " + "%(levelname)s - %(message)s",
    )
    log = logging.getLogger(__name__)
    # log to stderr
    log.addHandler(logging.StreamHandler())
    cls.eldb = ELmonocleDB(
        index=cls.index,
        prefix="monocle.test.",
        user=os.getenv("ELASTIC_USER", None),
        password=os.getenv("ELASTIC_PASSWORD", None),
        use_ssl=os.getenv("ELASTIC_USE_SSL", None),
        verify_certs=os.getenv("ELASTIC_INSECURE", None),
        ssl_show_warn=os.getenv("ELASTIC_SSL_SHOW_WARN", None),
    )
    for dataset in cls.datasets:
        index_dataset(cls.eldb, dataset)
Example #14
def main():
    parser = argparse.ArgumentParser(prog='monocle')
    parser.add_argument('--loglevel', help='logging level', default='INFO')
    parser.add_argument(
        '--elastic-timeout',
        help='Elasticsearch connection retry timeout',
        default=10,
        type=int,
    )
    parser.add_argument(
        '--elastic-conn', help='Elasticsearch connection info', default='localhost:9200'
    )
    subparsers = parser.add_subparsers(
        title='Subcommands', description='valid subcommands', dest="command"
    )

    parser_crawler = subparsers.add_parser('crawler', help='Threaded crawlers pool')
    parser_crawler.add_argument(
        '--config', help='Configuration file of the crawlers pool', required=True
    )

    parser_dbmanage = subparsers.add_parser('dbmanage', help='Database manager')
    parser_dbmanage.add_argument(
        '--delete-repository', help='Delete events related to a repository (regexp)',
    )
    parser_dbmanage.add_argument(
        '--delete-index', help='Delete the index', action='store_true',
    )
    parser_dbmanage.add_argument(
        '--index', help='The Elasticsearch index name', required=True
    )

    parser_dbquery = subparsers.add_parser(
        'dbquery', help='Run an existing query on stored events'
    )
    parser_dbquery.add_argument(
        '--index', help='The Elasticsearch index name', required=True
    )
    parser_dbquery.add_argument('--name', help='The query name', required=True)
    parser_dbquery.add_argument(
        '--repository', help='Scope to events of repositories (regexp)', required=True
    )
    parser_dbquery.add_argument(
        '--target-branch', help='Scope to events of target branches (regexp)'
    )
    parser_dbquery.add_argument('--gte', help='Scope to events created after date')
    parser_dbquery.add_argument('--lte', help='Scope to events created before date')
    parser_dbquery.add_argument(
        '--on_cc_gte', help='Scope to events related to changes created after date'
    )
    parser_dbquery.add_argument(
        '--on_cc_lte', help='Scope to events related to changes created before date'
    )
    parser_dbquery.add_argument(
        '--ec-same-date',
        help='Scope to events related to changes created during the '
        'same date boundaries defined by gte/lte arguments',
        action='store_true',
    )
    parser_dbquery.add_argument(
        '--type', help='Scope to event types (comma separated)'
    )
    parser_dbquery.add_argument(
        '--files', help='Scope to changes containing this file regexp'
    )
    parser_dbquery.add_argument(
        '--state',
        help='Scope to changes having this state',
        choices=['OPEN', 'CLOSED', 'MERGED'],
    )
    parser_dbquery.add_argument(
        '--change-ids', help='Scope to change ids (comma separated)'
    )
    parser_dbquery.add_argument('--authors', help='Scope to authors (comma separated)')
    parser_dbquery.add_argument(
        '--approvals', help='Scope to objects with approvals (comma separated)'
    )
    parser_dbquery.add_argument(
        '--exclude-approvals', help='Approvals exclude list (comma separated)'
    )
    parser_dbquery.add_argument(
        '--size', help='Return maximum of size results', default=10
    )
    parser_dbquery.add_argument(
        '--from', help='Starting index of the elements to retrieve', default=0
    )
    parser_dbquery.add_argument(
        '--exclude-authors', help='Authors exclude list (comma separated)'
    )
    parser_dbquery.add_argument(
        '--tests-included',
        help='Scope to changes containing tests',
        action='store_true',
    )
    parser_dbquery.add_argument(
        '--has-issue-tracker-links',
        help='Scope to changes containing an issue tracker link',
        choices=['generic', 'github.com', 'altassian.net'],
    )

    args = parser.parse_args()

    logging.basicConfig(
        level=getattr(logging, args.loglevel.upper()),
        format="%(asctime)s - %(name)s - %(thread)d - %(threadName)s - "
        + "%(levelname)s - %(message)s",
    )
    log = logging.getLogger(__name__)

    if not args.command:
        parser.print_usage()
        return 1

    if args.command == "crawler":
        realpath = os.path.expanduser(args.config)
        if not os.path.isfile(realpath):
            log.error('Unable to access config: %s' % realpath)
            sys.exit(1)
        configdata = yaml.safe_load(open(realpath).read())
        validate(instance=configdata, schema=config.schema)
        tpool = []
        group = {}
        app = None
        if os.getenv('APP_ID') and os.getenv('APP_KEY_PATH'):
            app = application.get_app(os.getenv('APP_ID'), os.getenv('APP_KEY_PATH'))
        for tenant in configdata['tenants']:
            for crawler_item in tenant['crawler'].get('github_orgs', []):
                tg = pullrequest.TokenGetter(
                    crawler_item['name'], crawler_item.get('token'), app
                )
                c_args = pullrequest.GithubCrawlerArgs(
                    command='github_crawler',
                    org=crawler_item['name'],
                    updated_since=crawler_item['updated_since'],
                    loop_delay=tenant['crawler']['loop_delay'],
                    repository=crawler_item.get('repository'),
                    base_url=crawler_item['base_url'],
                    token_getter=tg,
                    db=ELmonocleDB(
                        elastic_conn=args.elastic_conn,
                        index=tenant['index'],
                        timeout=args.elastic_timeout,
                    ),
                )
                gid = crawler_item.get('token')
                if not gid:
                    if app:
                        # No token; if we have an app, get the token from the app
                        gid = app.get_token(org=crawler_item['name'])
                    else:
                        log.info('Skip crawler because no token: %s' % c_args)
                        continue
                if gid not in group:
                    group[gid] = GroupCrawler()
                    tpool.append(group[gid])
                if c_args.repository:
                    repositories = [c_args.repository]
                else:
                    log.info('Discovering repositories in %s ...' % c_args.org)
                    # No repository specified for that organization so
                    # try to discover all of them
                    rf = organization.RepositoriesFetcher(
                        graphql.GithubGraphQLQuery(token_getter=tg)
                    )
                    repos = rf.get(c_args.org)
                    repositories = [
                        repo['name'] for repo in repos if not repo['isArchived']
                    ]
                    log.info(
                        'Found %s repositories in %s ...'
                        % (len(repositories), c_args.org)
                    )
                for repository in repositories:
                    c_args.repository = repository
                    group[gid].add_crawler(Runner(c_args))
            for crawler_item in tenant['crawler'].get('gerrit_repositories', []):
                c_args = review.GerritCrawlerArgs(
                    command='gerrit_crawler',
                    repository=crawler_item['name'],
                    updated_since=crawler_item['updated_since'],
                    loop_delay=tenant['crawler']['loop_delay'],
                    base_url=crawler_item['base_url'],
                    insecure=crawler_item.get('insecure', False),
                    login=crawler_item.get('login'),
                    password=crawler_item.get('password'),
                    db=ELmonocleDB(
                        elastic_conn=args.elastic_conn,
                        index=tenant['index'],
                        timeout=args.elastic_timeout,
                    ),
                )
                tpool.append(Crawler(c_args))
        log.info('%d configured threads' % len(tpool))
        for cthread in tpool:
            cthread.start()

    if args.command == "dbmanage":
        db = ELmonocleDB(elastic_conn=args.elastic_conn, index=args.index)
        if args.delete_repository:
            db.delete_repository(args.delete_repository)
        if args.delete_index:
            db.delete_index()

    if args.command == "dbquery":
        db = ELmonocleDB(elastic_conn=args.elastic_conn, index=args.index)
        params = utils.set_params(args)
        try:
            ret = db.run_named_query(args.name, args.repository.lstrip('^'), params)
        except UnknownQueryException as err:
            log.error('Unable to run query: %s' % err)
            sys.exit(1)
        pprint(ret)
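
The crawler branch above walks a YAML document with the shape sketched below; the structure is inferred only from the keys the code reads, and every value is a placeholder rather than the project's documented schema.

# Hypothetical parsed configuration; the keys mirror the lookups in the crawler branch above.
configdata = {
    'tenants': [
        {
            'index': 'monocle.changes',
            'crawler': {
                'loop_delay': 900,
                'github_orgs': [
                    {
                        'name': 'example-org',
                        'updated_since': '2020-01-01',
                        'base_url': 'https://github.com',
                        'token': '<github-token>',  # optional; otherwise the GitHub app provides one
                        'repository': None,         # optional; None triggers repository discovery
                    },
                ],
                'gerrit_repositories': [
                    {
                        'name': 'example/project',
                        'updated_since': '2020-01-01',
                        'base_url': 'https://gerrit.example.com',
                        'insecure': False,          # optional
                        'login': None,              # optional
                        'password': None,           # optional
                    },
                ],
            },
        },
    ],
}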
Example #15
def main():
    parser = argparse.ArgumentParser(prog='monocle')
    parser.add_argument(
        '--loglevel', help='logging level', default='INFO')
    subparsers = parser.add_subparsers(title='Subcommands',
                                       description='valid subcommands',
                                       dest="command")

    for crawler_driver in (pullrequest, review):
        parser_crawler = subparsers.add_parser(
            crawler_driver.name, help=crawler_driver.help)
        parser_crawler.add_argument(
            '--loop-delay', help='Request last updated events every N secs',
            default=900)
        parser_crawler.add_argument(
            '--host', help='Base url of the code review server',
            required=True)
        crawler_driver.init_crawler_args_parser(parser_crawler)

    parser_dbmanage = subparsers.add_parser(
        'dbmanage', help='Database manager')
    parser_dbmanage.add_argument(
        '--delete-repository',
        help='Delete events related to a repository (regexp)',
        required=True)

    parser_dbquery = subparsers.add_parser(
        'dbquery', help='Run an existing query on stored events')
    parser_dbquery.add_argument(
        '--interval', help='Histogram interval',
        default="3h")
    parser_dbquery.add_argument(
        '--name', help='The query name',
        required=True)
    parser_dbquery.add_argument(
        '--repository', help='Scope to events of a repository (regexp)',
        required=True)
    parser_dbquery.add_argument(
        '--gte', help='Scope to events created after date')
    parser_dbquery.add_argument(
        '--lte', help='Scope to events created before date')
    parser_dbquery.add_argument(
        '--on_cc_gte',
        help='Scope to events related to changes created after date')
    parser_dbquery.add_argument(
        '--on_cc_lte',
        help='Scope to events related to changes created before date')
    parser_dbquery.add_argument(
        '--ec-same-date',
        help='Scope to events related to changes created during the '
        'same date boundaries defined by gte/lte arguments',
        action='store_true')
    parser_dbquery.add_argument(
        '--type', help='Scope to event types (comma separated)')
    parser_dbquery.add_argument(
        '--authors', help='Scope to authors (comma separated)')
    parser_dbquery.add_argument(
        '--approval', help='Scope to events with approval')
    parser_dbquery.add_argument(
        '--size', help='Return maximum of size results',
        default=10)
    parser_dbquery.add_argument(
        '--exclude-authors', help='Authors exclude list (comma separated)')

    args = parser.parse_args()

    logging.basicConfig(
        level=getattr(logging, args.loglevel.upper()))

    if not args.command:
        parser.print_usage()
        return 1

    if args.command.endswith("_crawler"):
        crawler = MonocleCrawler(args)
        crawler.run()

    if args.command == "dbmanage":
        if args.delete_repository:
            db = ELmonocleDB()
            db.delete_repository(args.delete_repository)

    if args.command == "dbquery":
        db = ELmonocleDB()
        params = utils.set_params(args)
        ret = db.run_named_query(
            args.name,
            args.repository.lstrip('^'),
            params)
        pprint(ret)
Example #16
class Runner(object):
    def __init__(self,
                 args,
                 elastic_conn='localhost:9200',
                 elastic_timeout=10):
        super().__init__()
        self.updated_since = args.updated_since
        self.dump_dir = DUMP_DIR if os.path.isdir(DUMP_DIR) else None
        self.loop_delay = int(args.loop_delay)
        self.db = ELmonocleDB(elastic_conn=elastic_conn,
                              index=args.index,
                              timeout=elastic_timeout)
        if args.command == 'github_crawler':
            if args.repository:
                self.repository_el_re = "%s/%s" % (
                    args.org.lstrip('^'),
                    args.repository.lstrip('^'),
                )
            else:
                self.repository_el_re = args.org.lstrip('^') + '/.*'
            self.prf = pullrequest.PRsFetcher(GithubGraphQLQuery(args.token),
                                              args.base_url, args.org,
                                              args.repository)
        elif args.command == 'gerrit_crawler':
            self.repository_el_re = args.repository.lstrip('^')
            self.prf = review.ReviewesFetcher(args.base_url, args.repository)

    def get_last_updated_date(self):
        change = self.db.get_last_updated(self.repository_el_re)
        if not change:
            return self.updated_since or datetime.now().strftime(
                "%Y-%m-%dT%H:%M:%SZ")
        else:
            log.info("Most recent change date in the database for %s is %s" %
                     (self.repository_el_re, change['updated_at']))
            return change['updated_at']

    def run_step(self):
        def dump_data(data, prefix=None):
            try:
                if self.dump_dir:
                    tmpfile = tempfile.NamedTemporaryFile(
                        dir=self.dump_dir,
                        prefix=prefix,
                        suffix='.json',
                        mode='w',
                        delete=False,
                    )
                    json.dump(data, tmpfile)
                    tmpfile.close()
                    log.info('Data dumped to %s' % tmpfile.name)
                    return tmpfile.name
            except Exception:
                log.exception('Unable to dump data')
            return None

        updated_since = self.get_last_updated_date()
        try:
            prs = self.prf.get(updated_since)
        except Exception:
            log.exception('Unable to get PR data')
            return
        objects = self.prf.extract_objects(prs, dump_data)
        if objects:
            log.info("%d objects will be updated in the database" %
                     len(objects))
            self.db.update(objects)
Example #17
def string_ident_to_ident(elastic_conn, index) -> None:
    bulk_size = 7500
    client = ELmonocleDB(elastic_conn, index, previous_schema=True)
    client2 = ELmonocleDB(elastic_conn, index)
    changes_url_lookup: Dict[str, str] = {}
    to_update: List = []
    need_url_update: List[Dict] = []
    total_objects_updated = 0

    def bulk_update(to_update: List) -> None:
        client2.update(to_update)

    def update_changes_url_lookup(objs: List[Dict]) -> None:
        change_ids = [o["change_id"] for o in objs]
        change_ids = list(set(change_ids))
        change_ids = [
            _id for _id in change_ids if _id not in changes_url_lookup
        ]
        print("Updating change_url_lookup for %s changes ..." %
              len(change_ids))
        params = {"change_ids": change_ids, "size": 10000, "from": 0}
        result = client.run_named_query("changes", ".*", params=params)
        changes = result["items"]
        for change in changes:
            changes_url_lookup[change["change_id"]] = utils.strip_url(
                change["url"])
        print("%s entries in changes_url_lookup" % len(changes_url_lookup))

    def update_obj(obj: Dict) -> Dict:

        url = utils.strip_url(obj["url"])

        def update_approval_type(approval):
            if isinstance(approval, str):
                ret = [approval]
            else:
                ret = approval
            return [r for r in ret if r is not None]

        def create_ident_dict(url: str, uid: str) -> Dict:
            domain = urlparse(url).netloc
            uid = prefix(domain, uid)
            return {
                "uid": uid,
                "muid": create_muid_from_uid(uid),
            }

        def to_ident(value: Optional[str]) -> Optional[Dict]:
            if value:
                return create_ident_dict(url, value)
            return None

        if obj["type"] == "Change":
            obj["author"] = to_ident(obj["author"])
            obj["committer"] = to_ident(obj.get("committer"))
            obj["merged_by"] = to_ident(obj.get("merged_by"))
            obj["assignees"] = list(map(to_ident, obj.get("assignees", [])))
            for commit in obj.get("commits", []):
                # Also fix the commit's author, which might be missing
                if "author" not in commit.keys():
                    commit["author"] = obj["author"]
                else:
                    commit["author"] = to_ident(commit["author"])
                # Also fix the commit's committer, which might be missing
                if "committer" not in commit.keys():
                    commit["committer"] = commit["author"]
                else:
                    commit["committer"] = to_ident(commit["committer"])
        else:
            obj["author"] = to_ident(obj.get("author"))
            obj["on_author"] = to_ident(obj.get("on_author"))
            # Also fix missing created_at date on ChangeCommitPushedEvent
            if obj["type"] == "ChangeCommitPushedEvent" and obj[
                    "created_at"] is None:
                obj["created_at"] = obj["on_created_at"]
        # Also fix approval format if needed
        if obj.get("approval"):
            obj["approval"] = update_approval_type(obj["approval"])
        # Ensure we have the stripped url
        obj["url"] = url

        return obj

    def proceed():
        if need_url_update:
            update_changes_url_lookup(need_url_update)
        for o in to_update:
            if o in need_url_update:
                if o["change_id"] in changes_url_lookup:
                    o["url"] = changes_url_lookup[o["change_id"]]
                else:
                    print("Warning - unable to find change %s" %
                          o["change_id"])
                    o["url"] = "https://undefined"
        updated = list(map(update_obj, to_update))
        print("Updating %s objects ..." % len(to_update))
        bulk_update(list(map(dict_to_change_or_event, updated)))

    for _obj in client.iter_index():
        obj = _obj["_source"]
        if obj["type"] in utils.get_events_list() and "url" not in obj.keys():
            need_url_update.append(obj)
        if obj["type"] in utils.get_events_list() + ["Change"]:
            to_update.append(obj)

        if len(to_update) == bulk_size:
            proceed()
            total_objects_updated += len(to_update)
            print("Total objects updated: %s" % total_objects_updated)
            need_url_update = []
            to_update = []

    proceed()
    total_objects_updated += len(to_update)
    print("Total objects updated: %s" % total_objects_updated)
Example #18
def main():
    parser_dbquery = argparse.ArgumentParser(prog=sys.argv[0])
    parser_dbquery.add_argument(
        "--repository",
        help="Scope to events of a repository (regexp)",
        default=r".*")
    parser_dbquery.add_argument("--gte",
                                help="Scope to events created after date")
    parser_dbquery.add_argument("--lte",
                                help="Scope to events created before date")
    parser_dbquery.add_argument("--size",
                                help="Return maximum of size results",
                                default=1000)
    parser_dbquery.add_argument("--exclude-authors",
                                help="Authors exclude list (comma separated)")
    args = parser_dbquery.parse_args()

    db = ELmonocleDB()
    params = utils.set_params(args)
    data = db.run_named_query("last_merged_changes",
                              args.repository.lstrip("^"), params)

    lte_time = (datetime.datetime.strptime(args.lte, "%Y-%m-%d") +
                datetime.timedelta(days=1) if args.lte else None)

    title = {}

    for entry in data:
        # example: 2020-02-24T19:05:13Z
        created_time = datetime.datetime.strptime(entry["created_at"],
                                                  "%Y-%m-%dT%H:%M:%SZ")
        merge_time = datetime.datetime.strptime(entry["merged_at"],
                                                "%Y-%m-%dT%H:%M:%SZ")
        if lte_time and merge_time > lte_time:
            continue

        print("%.0f|%s|A|/%s/%s|" % (
            created_time.timestamp(),
            entry["author"],
            entry["repository_fullname"],
            entry["title"],
        ))
        print("%.0f|%s|M|/%s/%s|" % (
            merge_time.timestamp(),
            entry["author"],
            entry["repository_fullname"],
            entry["title"],
        ))
        title[entry["repository_fullname_and_number"]] = entry["title"]

    params["etype"] = ("ChangeCommentedEvent", )
    data = db.run_named_query("_scan", args.repository.lstrip("^"), params)

    for entry in data:
        # example: 2020-02-24T19:05:13Z
        created_time = datetime.datetime.strptime(entry["created_at"],
                                                  "%Y-%m-%dT%H:%M:%SZ")
        try:
            print("%.0f|%s|M|/%s/%s|" % (
                created_time.timestamp(),
                entry["author"],
                entry["repository_fullname"],
                title[entry["repository_fullname_and_number"]],
            ))
        except KeyError:
            print(
                "%s not merged" % entry["repository_fullname_and_number"],
                file=sys.stderr,
            )
Example #19
def main():
    parser = argparse.ArgumentParser(prog='monocle')
    parser.add_argument('--loglevel', help='logging level', default='INFO')
    parser.add_argument(
        '--elastic-timeout',
        help='Elasticsearch connection retry timeout',
        default=10,
        type=int,
    )
    parser.add_argument('--elastic-conn',
                        help='Elasticsearch connection info',
                        default='localhost:9200')
    subparsers = parser.add_subparsers(title='Subcommands',
                                       description='valid subcommands',
                                       dest="command")

    parser_crawler = subparsers.add_parser('crawler',
                                           help='Threaded crawlers pool')
    parser_crawler.add_argument('--config',
                                help='Configuration file of the crawlers pool',
                                required=True)

    parser_dbmanage = subparsers.add_parser('dbmanage',
                                            help='Database manager')
    parser_dbmanage.add_argument(
        '--delete-repository',
        help='Delete events related to a repository (regexp)',
        required=True,
    )
    parser_dbmanage.add_argument('--index',
                                 help='The Elasticsearch index name',
                                 required=True)

    parser_dbquery = subparsers.add_parser(
        'dbquery', help='Run an existing query on stored events')
    parser_dbquery.add_argument('--index',
                                help='The Elasticsearch index name',
                                required=True)
    parser_dbquery.add_argument('--name', help='The query name', required=True)
    parser_dbquery.add_argument(
        '--repository',
        help='Scope to events of repositories (regexp)',
        required=True)
    parser_dbquery.add_argument(
        '--target-branch',
        help='Scope to events of target branches (regexp)')
    parser_dbquery.add_argument('--gte',
                                help='Scope to events created after date')
    parser_dbquery.add_argument('--lte',
                                help='Scope to events created before date')
    parser_dbquery.add_argument(
        '--on_cc_gte',
        help='Scope to events related to changes created after date')
    parser_dbquery.add_argument(
        '--on_cc_lte',
        help='Scope to events related to changes created before date')
    parser_dbquery.add_argument(
        '--ec-same-date',
        help='Scope to events related to changes created during the '
        'same date boundaries defined by gte/lte arguments',
        action='store_true',
    )
    parser_dbquery.add_argument(
        '--type', help='Scope to event types (comma separated)')
    parser_dbquery.add_argument(
        '--files', help='Scope to changes containing this file regexp')
    parser_dbquery.add_argument(
        '--state',
        help='Scope to changes having this state',
        choices=['OPEN', 'CLOSED', 'MERGED'],
    )
    parser_dbquery.add_argument('--change-ids',
                                help='Scope to change ids (comma separated)')
    parser_dbquery.add_argument('--authors',
                                help='Scope to authors (comma separated)')
    parser_dbquery.add_argument('--approval',
                                help='Scope to events with approval')
    parser_dbquery.add_argument('--size',
                                help='Return maximum of size results',
                                default=10)
    parser_dbquery.add_argument(
        '--from', help='Starting index of the elements to retrieve', default=0)
    parser_dbquery.add_argument('--exclude-authors',
                                help='Authors exclude list (comma separated)')
    parser_dbquery.add_argument(
        '--tests-included',
        help='Scope to changes containing tests',
        action='store_true',
    )
    parser_dbquery.add_argument(
        '--has-issue-tracker-links',
        help='Scope to changes containing an issue tracker link',
        choices=['generic', 'github.com', 'altassian.net'],
    )

    args = parser.parse_args()

    logging.basicConfig(
        level=getattr(logging, args.loglevel.upper()),
        format="%(asctime)s - %(name)s - %(threadName)s - " +
        "%(levelname)s - %(message)s",
    )
    log = logging.getLogger(__name__)

    if not args.command:
        parser.print_usage()
        return 1

    if args.command == "crawler":
        realpath = os.path.expanduser(args.config)
        if not os.path.isfile(realpath):
            log.error('Unable to access config: %s' % realpath)
            sys.exit(1)
        configdata = yaml.safe_load(open(realpath).read())
        validate(instance=configdata, schema=config.schema)
        tpool = []
        group = {}
        for tenant in configdata['tenants']:
            for crawler_item in tenant['crawler'].get('github_orgs', []):
                c_args = pullrequest.GithubCrawlerArgs(
                    command='github_crawler',
                    index=tenant['index'],
                    org=crawler_item['name'],
                    updated_since=crawler_item['updated_since'],
                    loop_delay=tenant['crawler']['loop_delay'],
                    token=crawler_item['token'],
                    repository=crawler_item.get('repository'),
                    base_url=crawler_item['base_url'],
                )
                log.info('args=%s' % c_args)
                if crawler_item['token'] not in group:
                    group[crawler_item['token']] = GroupCrawler()
                    tpool.append(group[crawler_item['token']])
                group[crawler_item['token']].add_crawler(
                    Runner(
                        c_args,
                        elastic_conn=args.elastic_conn,
                        elastic_timeout=args.elastic_timeout,
                    ))
            for crawler_item in tenant['crawler'].get('gerrit_repositories',
                                                      []):
                c_args = review.GerritCrawlerArgs(
                    command='gerrit_crawler',
                    index=tenant['index'],
                    repository=crawler_item['name'],
                    updated_since=crawler_item['updated_since'],
                    loop_delay=tenant['crawler']['loop_delay'],
                    base_url=crawler_item['base_url'],
                )
                tpool.append(
                    Crawler(
                        c_args,
                        elastic_conn=args.elastic_conn,
                        elastic_timeout=args.elastic_timeout,
                    ))
        log.info('%d configured threads' % len(tpool))
        for cthread in tpool:
            cthread.start()

    if args.command == "dbmanage":
        if args.delete_repository:
            db = ELmonocleDB(elastic_conn=args.elastic_conn, index=args.index)
            db.delete_repository(args.delete_repository)

    if args.command == "dbquery":
        db = ELmonocleDB(elastic_conn=args.elastic_conn, index=args.index)
        params = utils.set_params(args)
        try:
            ret = db.run_named_query(args.name, args.repository.lstrip('^'),
                                     params)
        except UnknownQueryException as err:
            log.error('Unable to run query: %s' % err)
            sys.exit(1)
        pprint(ret)
Example #20
def main() -> None:
    parser = argparse.ArgumentParser(prog="monocle")
    parser.add_argument("--loglevel", help="logging level", default="INFO")
    parser.add_argument(
        "--elastic-timeout",
        help="Elasticsearch connection retry timeout",
        default=10,
        type=int,
    )
    parser.add_argument("--elastic-conn",
                        help="Elasticsearch connection info",
                        default="localhost:9200")
    parser.add_argument(
        "--use-ssl",
        help="Use https protocol for communication with Elasticsearch",
        action="store_true",
    )
    parser.add_argument(
        "--insecure",
        help="Skip SSL CA cert validation",
        action="store_false",
    )
    parser.add_argument(
        "--ssl_show_warn",
        help="Skip showing a SSL warning message if it is not signed "
        "by CA authority",
        action="store_false",
    )
    parser.add_argument(
        "--elastic-user",
        help="Username for Elasticsearch authorization",
    )
    parser.add_argument(
        "--elastic-password",
        help="Password for Elasticsearch authorization",
    )
    subparsers = parser.add_subparsers(title="Subcommands",
                                       description="valid subcommands",
                                       dest="command")

    parser_crawler = subparsers.add_parser("crawler",
                                           help="Threaded crawlers pool")
    parser_crawler.add_argument("--config",
                                help="Configuration file of the crawlers pool",
                                required=True)

    parser_dbmanage = subparsers.add_parser("dbmanage",
                                            help="Database manager")
    parser_dbmanage.add_argument("--config",
                                 help="Configuration file",
                                 required=False)
    parser_dbmanage.add_argument(
        "--delete-repository",
        help="Delete events related to a repository (regexp)",
    )
    parser_dbmanage.add_argument(
        "--delete-index",
        help="Delete the index",
        action="store_true",
    )
    parser_dbmanage.add_argument("--index",
                                 help="The Elastisearch index name",
                                 required=True)
    parser_dbmanage.add_argument(
        "--run-migrate",
        help="Run the migration process",
    )

    parser_dbmanage.add_argument(
        "--update-idents",
        help="Update identities",
        action="store_true",
    )

    parser_dbquery = subparsers.add_parser(
        "dbquery", help="Run an existsing query on stored events")
    parser_dbquery.add_argument("--index",
                                help="The Elastisearch index name",
                                required=True)
    parser_dbquery.add_argument("--name", help="The query name", required=True)
    parser_dbquery.add_argument(
        "--repository",
        help="Scope to events of repositories (regexp)",
        required=True)
    parser_dbquery.add_argument(
        "--target-branch",
        help="Scope to events of a target branches (regexp)")
    parser_dbquery.add_argument("--gte",
                                help="Scope to events created after date")
    parser_dbquery.add_argument("--lte",
                                help="Scope to events created before date")
    parser_dbquery.add_argument(
        "--on_cc_gte",
        help="Scope to events related to changes created after date")
    parser_dbquery.add_argument(
        "--on_cc_lte",
        help="Scope to events related to changes created before date")
    parser_dbquery.add_argument(
        "--ec-same-date",
        help="Scope to events related to changes created during the "
        "same date bondaries defined by gte/lte arguments",
        action="store_true",
    )
    parser_dbquery.add_argument(
        "--type", help="Scope to events types list (comma separated)")
    parser_dbquery.add_argument(
        "--files", help="Scope to changes containing this file regexp")
    parser_dbquery.add_argument(
        "--state",
        help="Scope to changes with state (comma separated)",
    )
    parser_dbquery.add_argument("--change-ids",
                                help="Scope to change ids (comma separated)")
    parser_dbquery.add_argument("--authors",
                                help="Scope to authors (comma separated)")
    parser_dbquery.add_argument(
        "--approvals",
        help="Scope to objects with approvals (comma separated)")
    parser_dbquery.add_argument(
        "--exclude-approvals", help="Approvals exclude list (comma separated)")
    parser_dbquery.add_argument("--size",
                                help="Return maximum of size results",
                                default=10)
    parser_dbquery.add_argument(
        "--from", help="Starting index of the elements to retrieve", default=0)
    parser_dbquery.add_argument("--exclude-authors",
                                help="Authors exclude list (comma separated)")
    parser_dbquery.add_argument(
        "--tests-included",
        help="Scope to changes containing tests",
        action="store_true",
    )
    parser_dbquery.add_argument(
        "--self-merged",
        help="Scope to changes merged by their authors",
        action="store_true",
    )
    parser_dbquery.add_argument(
        "--has-issue-tracker-links",
        help="Scope to changes containing an issue tracker link",
        choices=["generic", "github.com", "altassian.net"],
    )
    parser_dbquery.add_argument(
        "--task-priority",
        help="Scope to changes related to task priorities (comma separated)",
    )
    parser_dbquery.add_argument(
        "--task-severity",
        help="Scope to changes related to task severities (comma separated)",
    )
    parser_dbquery.add_argument(
        "--task-issue-type",
        help="Scope to changes related to task type (comma separated)",
    )

    parser_dbquery.add_argument(
        "--task-score",
        help="Scope to changes related to task score '<op>: <val>'",
    )

    args = parser.parse_args()

    logging.basicConfig(
        level=getattr(logging, args.loglevel.upper()),
        format="%(asctime)s - %(name)s - %(thread)d - %(threadName)s - " +
        "%(levelname)s - %(message)s",
    )
    log = logging.getLogger(__name__)

    if not args.command:
        parser.print_usage()
        sys.exit(1)

    if args.command == "crawler":
        realpath = os.path.expanduser(args.config)
        if not os.path.isfile(realpath):
            log.error("Unable to access config: %s" % realpath)
            sys.exit(1)
        configdata = yaml.safe_load(open(realpath).read())
        config.validate(configdata, config.schema)
        tpool: List[Union[Crawler, GroupCrawler]] = []
        group = {}
        app = None
        if os.getenv("APP_ID") and os.getenv("APP_KEY_PATH"):
            app = application.get_app(os.getenv("APP_ID"),
                                      os.getenv("APP_KEY_PATH"))
        for tenant in configdata["tenants"]:
            idents_config = config.get_idents_config(configdata,
                                                     tenant["index"])
            for crawler_item in tenant.get("crawler",
                                           {}).get("github_orgs", []):
                tg = pullrequest.TokenGetter(crawler_item["name"],
                                             crawler_item.get("token"), app)
                github_c_args = pullrequest.GithubCrawlerArgs(
                    command="github_crawler",
                    org=crawler_item["name"],
                    updated_since=crawler_item["updated_since"],
                    loop_delay=tenant["crawler"]["loop_delay"],
                    repository=crawler_item.get("repository"),
                    base_url=utils.strip_url(crawler_item["base_url"]),
                    token_getter=tg,
                    db=ELmonocleDB(
                        elastic_conn=args.elastic_conn,
                        index=tenant["index"],
                        timeout=args.elastic_timeout,
                        user=args.elastic_user,
                        password=args.elastic_password,
                        use_ssl=args.use_ssl,
                        verify_certs=args.insecure,
                        ssl_show_warn=args.ssl_show_warn,
                    ),
                    idents_config=idents_config,
                )
                gid = crawler_item.get("token")
                if not gid:
                    if app:
                        # No token; if we have an app, get the token from the app
                        gid = app.get_token(org=crawler_item["name"])
                    else:
                        log.info("Skip crawler because no token: %s" %
                                 github_c_args)
                        continue
                if gid not in group:
                    group[gid] = GroupCrawler()
                    tpool.append(group[gid])
                if github_c_args.repository:
                    repositories = [github_c_args.repository]
                else:
                    log.info("Discovering repositories in %s ..." %
                             github_c_args.org)
                    # No repository specified for that organization so
                    # try to discover all of them
                    rf = organization.RepositoriesFetcher(
                        graphql.GithubGraphQLQuery(token_getter=tg))
                    repos = rf.get(github_c_args.org)
                    repositories = [
                        repo["name"] for repo in repos
                        if not repo["isArchived"]
                    ]
                    log.info("Found %s repositories in %s ..." %
                             (len(repositories), github_c_args.org))
                for repository in repositories:
                    github_c_args.repository = repository
                    group[gid].add_crawler(Runner(github_c_args))
            for crawler_item in tenant.get("crawler",
                                           {}).get("gerrit_repositories", []):
                gerrit_c_args = review.GerritCrawlerArgs(
                    command="gerrit_crawler",
                    repository=crawler_item["name"],
                    updated_since=crawler_item["updated_since"],
                    loop_delay=tenant["crawler"]["loop_delay"],
                    base_url=utils.strip_url(crawler_item["base_url"]),
                    insecure=crawler_item.get("insecure", False),
                    login=crawler_item.get("login"),
                    password=crawler_item.get("password"),
                    db=ELmonocleDB(
                        elastic_conn=args.elastic_conn,
                        index=tenant["index"],
                        timeout=args.elastic_timeout,
                        user=args.elastic_user,
                        password=args.elastic_password,
                        use_ssl=args.use_ssl,
                        verify_certs=args.insecure,
                        ssl_show_warn=args.ssl_show_warn,
                    ),
                    prefix=crawler_item.get("prefix"),
                    idents_config=idents_config,
                )
                tpool.append(Crawler(gerrit_c_args))
        log.info("%d configured threads" % len(tpool))
        for cthread in tpool:
            cthread.start()

    if args.command == "dbmanage":

        if args.update_idents and not args.config:
            log.error("Please provide the --config option")
            sys.exit(1)
        if args.update_idents:
            idents_config = config.get_idents_config(
                yaml.safe_load(open(args.config)), args.index)
        else:
            idents_config = []
        db = ELmonocleDB(
            elastic_conn=args.elastic_conn,
            index=args.index,
            idents_config=idents_config,
            user=args.elastic_user,
            password=args.elastic_password,
            use_ssl=args.use_ssl,
            verify_certs=args.insecure,
            ssl_show_warn=args.ssl_show_warn,
        )
        if args.delete_repository:
            db.delete_repository(args.delete_repository)
        if args.delete_index:
            db.delete_index()
        if args.update_idents:
            db.update_idents()
        if args.run_migrate:
            try:
                migrate.run_migrate(args.run_migrate, args.elastic_conn,
                                    args.index)
            except migrate.NotAvailableException:
                log.error("Error: %s is not a valid migration process" %
                          args.run_migrate)

    if args.command == "dbquery":
        db = ELmonocleDB(
            elastic_conn=args.elastic_conn,
            index=args.index,
            user=args.elastic_user,
            password=args.elastic_password,
            use_ssl=args.use_ssl,
            verify_certs=args.insecure,
            ssl_show_warn=args.ssl_show_warn,
        )
        params = utils.set_params(args)
        try:
            ret = db.run_named_query(args.name, args.repository.lstrip("^"),
                                     params)
        except UnknownQueryException as err:
            log.error("Unable to run query: %s" % err)
            sys.exit(1)
        pprint(ret)
Example #21
def query(name):
    repository_fullname = request.args.get('repository')
    params = utils.set_params(request.args)
    db = ELmonocleDB()
    result = db.run_named_query(name, repository_fullname, params)
    return jsonify(result)