from datetime import datetime
from httplib import HTTPConnection, HTTPException
from json import dump, loads
from time import sleep
import socket

import github3

# Project-local imports. The module paths below are assumptions inferred
# from the classes and helpers used in this file.
from blob_handler import BlobHandler
from blob_parser import BlobParser
from db_handler import DBHandler
from utils import (construct_sha_dictionary, filter_files_by_ext,
                   flatten_sha_dict, load_json_lazy)


class Scraper:

    def __init__(self, github, since=0, db=None):
        # Host of the GitHub API (HTTPConnection wants a bare hostname).
        self.api_link = 'api.github.com'
        # We check here to see if the api is currently working/alive
        self.api_up = self.is_api_up()
        # Login stuff
        self.gh = github
        # Where we store our json data
        self.json_output = "./output.json"
        # Build the DBHandler per instance rather than as a shared
        # mutable default argument.
        self.db = db if db is not None else DBHandler()
        # self.count is used to keep track of which repo we're on
        self.count = since
        # self.curr is our current repo
        self.curr = None
        # shas to be processed by BlobDecoder
        self.shas = []
        # BlobParser
        self.blob = BlobParser(self.db)

    def error_handler(self, e, func, *args):
        # Rate-limit errors trigger a sleep until the window resets;
        # anything else is reported. If call arguments were supplied,
        # retry the failed function with them.
        if 'rate limit' in str(e):
            self.sleep_for_api()
        else:
            print "{} returned in {}".format(str(func), e)
        if args and func is not None:
            func(*args)

    def sleep_for_api(self):
        # The reset field is a Unix timestamp for when the core rate
        # limit window reopens.
        unix_time_left = github3.rate_limit()['resources']['core']['reset']
        api_accessible = datetime.fromtimestamp(int(unix_time_left))
        time_to_sleep = (api_accessible - datetime.now()).total_seconds() + 10
        print 'Api Limit exceeded. Sleeping for {} seconds until {}'.format(
            time_to_sleep, api_accessible.strftime('%H:%M:%S'))
        # Use the forced downtime to parse the blobs we already have.
        print 'Beginning to parse for todos'
        self.blob.run()
        # Recompute, since parsing may have eaten part of the wait.
        time_to_sleep = (api_accessible - datetime.now()).total_seconds() + 10
        sleep(max(time_to_sleep, 0))

    def is_api_up(self):
        # Actually issue a request; merely constructing an
        # HTTPConnection never touches the network.
        try:
            conn = HTTPConnection(self.api_link)
            conn.request('HEAD', '/')
            conn.getresponse()
            return True
        except (HTTPException, socket.error) as e:
            print "Github api is down:"
            self.error_handler(e, self.is_api_up)
            return False

    def _wait_for_api(self):
        # Block until the API comes back up, re-checking after each sleep.
        self.sleep_for_api()
        if self.is_api_up():
            self.api_up = True
        else:
            self._wait_for_api()

    def get_repo_langs(self):
        for lang in self.curr.iter_languages():
            yield lang

    def _iter_all_repos(self, number=-1, since=None):
        # Walk GitHub's global repository listing, resuming from
        # self.count when no explicit starting id is given.
        if since is None:
            since = self.count
        for repo in self.gh.iter_all_repos(number, since):
            self.count += 1
            yield repo

    def request_repo(self, owner, repo):
        # github3's repository() lookup takes owner and name separately.
        return self.gh.repository(owner, repo)

    def _write_repo_to_json(self, file_handle, repo_data):
        f = file_handle
        print 'Retrieved {} repos so far!'.format(self.count)
        # repo_data is either one repo's json or a list of them; write
        # one JSON object per line so load_json_lazy can stream it back.
        if isinstance(repo_data, list):
            for repo in repo_data:
                dump(repo, f)
                f.write('\n')
        else:
            dump(repo_data, f)
            f.write('\n')

    def download_all_repos(self):
        if not self.api_up:
            self._wait_for_api()
        try:
            with open(self.json_output, 'a+') as f:
                for repo in self._iter_all_repos():
                    self._write_repo_to_json(f, repo.to_json())
        except github3.models.GitHubError as e:
            print "Failed to query repos with error:"
            self.error_handler(e, self.download_all_repos)
        except IOError as e:
            print "Failed to open file:"
            self.error_handler(e, self.download_all_repos)

    def find_all_files(self, repo, repo_name, path=None):
        # repo is a repository object (rebuilt from read_json_output's
        # data) on which contents() is called. Returns a list of
        # (path, sha) tuples for the files found.
        # TODO(ian): Add more common directories if necessary.
        common_dirs = ['bin', 'lib', 'src', repo_name]
        files_found = []
        if path is None:
            path = '.'
        try:
            for file_, val in repo.contents(path[1:]).iteritems():
                if val.type == 'dir' and file_ in common_dirs:
                    # Recurse into directories that commonly hold code.
                    files_found.extend(
                        self.find_all_files(
                            repo, repo_name,
                            path='{}/{}'.format(path, file_)))
                elif val.type == 'file':
                    files_found.append((path + '/' + file_, val.sha))
        except Exception as e:
            self.error_handler(e, self.find_all_files)
        return files_found

    def parse_json_output(self, data):
        # Should be exclusively called by read_json_output
        try:
            print data['full_name']
            owner = str(data['full_name'].split('/')[0])
            repo = str(data['full_name'].split('/')[1])
            files_found = self.find_all_files(
                self.gh.repository(owner, repo), repo)
            if files_found:
                files_ = filter_files_by_ext(files_found)
                sha_dict = construct_sha_dictionary(owner, repo, files_)
                for sha in flatten_sha_dict(sha_dict):
                    blob = BlobHandler(self.gh, sha['sha'],
                                       sha['owner'], sha['repo'])
                    sha['blob'] = blob.download_blob().decoded
                    self.db.insert_blob(sha)
        except github3.models.GitHubError as e:
            self.error_handler(e, self.parse_json_output)
        except Exception as e:
            self.error_handler(e, self.parse_json_output)

    def read_json_output(self):
        try:
            with open(self.json_output) as f:
                for data in load_json_lazy(f):
                    self.parse_json_output(loads(data))
        except IOError as e:
            self.error_handler(e, self.read_json_output)
            # Basically if we catch this error, the file doesn't exist,
            # so download the repo listing and try again.
            self.download_all_repos()
            self.read_json_output()
        except ValueError as e:
            self.error_handler(e, self.read_json_output)

    def start_scraping(self):
        # Entry point: parse any repos already on disk, then hand the
        # stored blobs to the BlobParser.
        self.read_json_output()
        self.blob.run()
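
# A minimal usage sketch, not part of the original module. It assumes a
# github3.py 0.x client and a personal access token in the GITHUB_TOKEN
# environment variable; adjust the login call to your setup.
if __name__ == '__main__':
    from os import environ

    gh = github3.login(token=environ['GITHUB_TOKEN'])
    scraper = Scraper(gh, since=0)
    scraper.start_scraping()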