class MonocleCrawler():
    """Crawler that fetches changes from a code-review system into the database.

    Depending on ``args.command`` the crawler is backed by a GitHub pull
    request fetcher (``github_crawler``) or a Gerrit review fetcher
    (``gerrit_crawler``). Fetched objects are written through ``ELmonocleDB``.
    """

    log = logging.getLogger("monocle.Crawler")

    def __init__(self, args):
        """Configure the crawler from parsed command-line arguments.

        Reads ``updated_since``, ``loop_delay`` and ``command`` from *args*,
        the optional ``id``, plus ``org``/``token``/``host`` (GitHub) or
        ``repository``/``host`` (Gerrit) depending on the command.
        """
        self.updated_since = args.updated_since
        self.loop_delay = int(args.loop_delay)
        self.get_one = getattr(args, 'id', None)
        # Always defined so run() cannot hit an AttributeError when the
        # command is not 'github_crawler'.
        self.get_one_rep = None
        self.org = None
        self.db = ELmonocleDB()
        if args.command == 'github_crawler':
            self.get_one_rep = getattr(args, 'repository', None)
            self.org = args.org
            self.repository_el_re = args.org.lstrip('^') + '.*'
            self.prf = pullrequest.PRsFetcher(
                GithubGraphQLQuery(args.token),
                args.host, args.org)
        elif args.command == 'gerrit_crawler':
            self.repository_el_re = args.repository.lstrip('^')
            self.prf = review.ReviewesFetcher(
                args.host, args.repository)

    def get_last_updated_date(self):
        """Return the date from which the next fetch should start.

        Uses the ``updated_at`` of the change returned by
        ``db.get_last_updated`` when there is one; otherwise falls back to
        ``updated_since`` and finally to the current time.
        """
        change = self.db.get_last_updated(self.repository_el_re)
        if change:
            # Use the class logger like the other methods do, rather than
            # the root logger.
            self.log.info(
                "Most recent change date in the database for %s is %s",
                self.repository_el_re, change['updated_at'])
            return change['updated_at']
        if self.updated_since:
            return self.updated_since
        # The trailing 'Z' designates UTC, so the timestamp must be taken
        # in UTC and not in the host's local timezone.
        from datetime import timezone
        return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    def run_step(self):
        """Fetch changes updated since the last known date and store them."""
        updated_since = self.get_last_updated_date()
        prs = self.prf.get(updated_since)
        objects = self.prf.extract_objects(prs)
        if objects:
            self.log.info(
                "%s objects will be updated in the database", len(objects))
            self.db.update(objects)

    def run(self):
        """Print a single change when an id was given, else crawl forever.

        In single-change mode the repository must be known; otherwise a
        message is printed and nothing is fetched. In crawl mode the method
        never returns: it runs ``run_step`` then sleeps ``loop_delay``
        seconds, repeatedly.
        """
        if self.get_one:
            if not self.get_one_rep:
                print("The --repository argument must be given")
            else:
                pprint(self.prf.get_one(
                    self.org, self.get_one_rep, self.get_one))
            return
        while True:
            self.run_step()
            self.log.info(
                "Waiting %s seconds before next fetch ...", self.loop_delay)
            sleep(self.loop_delay)
class Runner(object):
    """Run one crawl step: fetch updated changes and index them.

    Depending on ``args.command`` the runner is backed by a GitHub pull
    request fetcher (``github_crawler``) or a Gerrit review fetcher
    (``gerrit_crawler``). Fetched objects are written through ``ELmonocleDB``.
    """

    def __init__(self, args, elastic_conn='localhost:9200',
                 elastic_timeout=10):
        """Configure the runner from parsed command-line arguments.

        Args:
            args: parsed arguments; ``updated_since``, ``loop_delay``,
                ``index``, ``command``, ``base_url`` and ``repository`` are
                read, plus ``org`` and ``token`` for the GitHub command.
            elastic_conn: connection string handed to ``ELmonocleDB``.
            elastic_timeout: timeout handed to ``ELmonocleDB``.
        """
        super().__init__()
        self.updated_since = args.updated_since
        # Dumping is enabled only when the DUMP_DIR directory exists.
        self.dump_dir = DUMP_DIR if os.path.isdir(DUMP_DIR) else None
        self.loop_delay = int(args.loop_delay)
        self.db = ELmonocleDB(elastic_conn=elastic_conn,
                              index=args.index,
                              timeout=elastic_timeout)
        if args.command == 'github_crawler':
            if args.repository:
                self.repository_el_re = "%s/%s" % (
                    args.org.lstrip('^'),
                    args.repository.lstrip('^'),
                )
            else:
                self.repository_el_re = args.org.lstrip('^') + '/.*'
            self.prf = pullrequest.PRsFetcher(
                GithubGraphQLQuery(args.token),
                args.base_url, args.org, args.repository)
        elif args.command == 'gerrit_crawler':
            self.repository_el_re = args.repository.lstrip('^')
            self.prf = review.ReviewesFetcher(args.base_url, args.repository)

    def get_last_updated_date(self):
        """Return the date from which the next fetch should start.

        Uses the ``updated_at`` of the change returned by
        ``db.get_last_updated`` when there is one; otherwise falls back to
        ``updated_since`` and finally to the current time.
        """
        change = self.db.get_last_updated(self.repository_el_re)
        if change:
            log.info("Most recent change date in the database for %s is %s",
                     self.repository_el_re, change['updated_at'])
            return change['updated_at']
        if self.updated_since:
            return self.updated_since
        # The trailing 'Z' designates UTC, so the timestamp must be taken
        # in UTC and not in the host's local timezone.
        from datetime import timezone
        return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    def _dump_data(self, data, prefix=None):
        """Serialize *data* as JSON into a new file under ``dump_dir``.

        Returns the path of the written file, or None when dumping is
        disabled or failed. Failures are logged and never propagated.
        """
        if not self.dump_dir:
            return None
        path = None
        try:
            # The context manager closes the handle even when json.dump
            # raises part-way through.
            with tempfile.NamedTemporaryFile(
                    dir=self.dump_dir,
                    prefix=prefix,
                    suffix='.json',
                    mode='w',
                    delete=False) as tmpfile:
                path = tmpfile.name
                json.dump(data, tmpfile)
        except Exception:
            log.exception('Unable to dump data')
            if path is not None:
                # delete=False keeps the file around: drop the truncated
                # dump so it is not mistaken for a valid one.
                try:
                    os.unlink(path)
                except OSError:
                    pass
            return None
        log.info('Data dumped to %s' % path)
        return path

    def run_step(self):
        """Fetch changes updated since the last known date and store them.

        A failure while fetching is logged and ends the step without
        touching the database. The dump helper is handed to
        ``extract_objects`` as its second argument.
        """
        updated_since = self.get_last_updated_date()
        try:
            prs = self.prf.get(updated_since)
        except Exception:
            log.exception('Unable to get PR data')
            return
        objects = self.prf.extract_objects(prs, self._dump_data)
        if objects:
            log.info("%d objects will be updated in the database",
                     len(objects))
            self.db.update(objects)