Exemple #1
0
class MonocleCrawler():
    """Fetch changes from a code-review system and store them in the database.

    Depending on ``args.command`` the crawler is wired to a GitHub
    pull-request fetcher or to a Gerrit review fetcher. ``run`` then either
    pretty-prints a single change (when an ``id`` argument was given) or
    loops forever, fetching and indexing updates every ``loop_delay`` seconds.
    """

    log = logging.getLogger("monocle.Crawler")

    def __init__(self, args):
        """Configure the crawler from parsed command-line arguments.

        Args:
            args: Namespace-like object. ``updated_since``, ``loop_delay``,
                ``command`` and ``host`` are always read; ``org`` and
                ``token`` are read for ``github_crawler``; ``repository`` is
                required for ``gerrit_crawler`` and optional for
                ``github_crawler``; ``id`` is optional.

        Note:
            For any other ``command`` value no fetcher is created, so
            ``prf`` and ``repository_el_re`` stay undefined.
        """
        self.updated_since = args.updated_since
        self.loop_delay = int(args.loop_delay)
        self.get_one = getattr(args, 'id', None)
        # run() reads these two attributes whatever the command is; default
        # them so a non-GitHub crawler does not fail with AttributeError.
        self.get_one_rep = None
        self.org = None
        self.db = ELmonocleDB()
        if args.command == 'github_crawler':
            self.get_one_rep = getattr(args, 'repository', None)
            self.org = args.org
            # Regexp matching every repository of the organization.
            self.repository_el_re = args.org.lstrip('^') + '.*'
            self.prf = pullrequest.PRsFetcher(
                GithubGraphQLQuery(args.token),
                args.host, args.org)
        elif args.command == 'gerrit_crawler':
            self.repository_el_re = args.repository.lstrip('^')
            self.prf = review.ReviewesFetcher(
                args.host, args.repository)

    def get_last_updated_date(self):
        """Return the date from which changes must be fetched.

        Returns:
            The ``updated_at`` value of the change returned by
            ``db.get_last_updated`` for this crawler's repository regexp.
            When the database returns nothing, ``updated_since`` is used if
            set, otherwise the current UTC time formatted as
            ``%Y-%m-%dT%H:%M:%SZ``.
        """
        # Local import: only this fallback needs ``timezone``.
        from datetime import timezone

        change = self.db.get_last_updated(self.repository_el_re)
        if change:
            self.log.info(
                "Most recent change date in the database for %s is %s",
                self.repository_el_re, change['updated_at'])
            return change['updated_at']
        if self.updated_since:
            return self.updated_since
        # The trailing "Z" designates UTC, so the clock must be read in UTC
        # rather than in the host's local timezone.
        return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    def run_step(self):
        """Fetch changes updated since the last known date and index them.

        Nothing is written to the database when the fetcher extracts no
        object.
        """
        updated_since = self.get_last_updated_date()
        prs = self.prf.get(updated_since)
        objects = self.prf.extract_objects(prs)
        if objects:
            self.log.info(
                "%s objects will be updated in the database", len(objects))
            self.db.update(objects)

    def run(self):
        """Run the crawler.

        When an ``id`` was requested, fetch that single change and
        pretty-print it (a repository name is required for that; a message
        is printed when it is missing). Otherwise call ``run_step`` in an
        endless loop, sleeping ``loop_delay`` seconds between iterations.
        """
        if self.get_one:
            if not self.get_one_rep:
                print("The --repository argument must be given")
            else:
                pprint(self.prf.get_one(
                    self.org, self.get_one_rep,
                    self.get_one))
            return
        while True:
            self.run_step()
            self.log.info(
                "Waiting %s seconds before next fetch ...", self.loop_delay)
            sleep(self.loop_delay)
Exemple #2
0
class Runner(object):
    """Fetch changes from GitHub or Gerrit and store them in the database.

    The fetcher is chosen from ``args.command``. Each ``run_step`` call
    retrieves the changes updated since the most recent one already stored
    and writes the extracted objects to the database.
    """

    def __init__(self,
                 args,
                 elastic_conn='localhost:9200',
                 elastic_timeout=10):
        """Configure the runner from parsed command-line arguments.

        Args:
            args: Namespace-like object. ``updated_since``, ``loop_delay``,
                ``index``, ``command``, ``base_url`` and ``repository`` are
                read; ``org`` and ``token`` are read for ``github_crawler``.
            elastic_conn: Connection string handed to ``ELmonocleDB``.
            elastic_timeout: Timeout handed to ``ELmonocleDB``.

        Note:
            For any other ``command`` value no fetcher is created, so
            ``prf`` and ``repository_el_re`` stay undefined.
        """
        super().__init__()
        self.updated_since = args.updated_since
        # Dumping is enabled only when DUMP_DIR is an existing directory.
        self.dump_dir = DUMP_DIR if os.path.isdir(DUMP_DIR) else None
        self.loop_delay = int(args.loop_delay)
        self.db = ELmonocleDB(elastic_conn=elastic_conn,
                              index=args.index,
                              timeout=elastic_timeout)
        if args.command == 'github_crawler':
            if args.repository:
                self.repository_el_re = "%s/%s" % (
                    args.org.lstrip('^'),
                    args.repository.lstrip('^'),
                )
            else:
                # No repository given: match every one in the organization.
                self.repository_el_re = args.org.lstrip('^') + '/.*'
            self.prf = pullrequest.PRsFetcher(GithubGraphQLQuery(args.token),
                                              args.base_url, args.org,
                                              args.repository)
        elif args.command == 'gerrit_crawler':
            self.repository_el_re = args.repository.lstrip('^')
            self.prf = review.ReviewesFetcher(args.base_url, args.repository)

    def get_last_updated_date(self):
        """Return the date from which changes must be fetched.

        Returns:
            The ``updated_at`` value of the change returned by
            ``db.get_last_updated`` for this runner's repository regexp.
            When the database returns nothing, ``updated_since`` is used if
            set, otherwise the current UTC time formatted as
            ``%Y-%m-%dT%H:%M:%SZ``.
        """
        # Local import: only this fallback needs ``timezone``.
        from datetime import timezone

        change = self.db.get_last_updated(self.repository_el_re)
        if change:
            log.info("Most recent change date in the database for %s is %s",
                     self.repository_el_re, change['updated_at'])
            return change['updated_at']
        if self.updated_since:
            return self.updated_since
        # The trailing "Z" designates UTC, so the clock must be read in UTC
        # rather than in the host's local timezone.
        return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    def run_step(self):
        """Fetch changes updated since the last known date and index them.

        A failure while fetching is logged and ends the step without
        touching the database. The fetcher's ``extract_objects`` receives a
        ``dump_data`` callback it can use to save raw data for inspection.
        """
        def dump_data(data, prefix=None):
            """Write ``data`` as JSON to a new file in the dump directory.

            Returns the path of the file, or ``None`` when dumping is
            disabled or failed. Failures are logged and never raised, so
            that dumping cannot interrupt the crawl.
            """
            if not self.dump_dir:
                return None
            try:
                # The context manager closes the handle even when
                # serialization fails; delete=False keeps the file on disk.
                with tempfile.NamedTemporaryFile(
                        dir=self.dump_dir,
                        prefix=prefix,
                        suffix='.json',
                        mode='w',
                        delete=False,
                ) as tmpfile:
                    json.dump(data, tmpfile)
            except Exception:
                log.exception('Unable to dump data')
                return None
            log.info('Data dumped to %s', tmpfile.name)
            return tmpfile.name

        updated_since = self.get_last_updated_date()
        try:
            prs = self.prf.get(updated_since)
        except Exception:
            log.exception('Unable to get PR data')
            return
        objects = self.prf.extract_objects(prs, dump_data)
        if objects:
            log.info("%d objects will be updated in the database",
                     len(objects))
            self.db.update(objects)