Example

import socket
from datetime import datetime
from httplib import HTTPException, HTTPSConnection
from json import dump, loads
from time import sleep

import github3

# Project-local helpers; the exact module paths are assumptions.
# from blob import BlobParser, BlobHandler
# from db import DBHandler
# from utils import (construct_sha_dictionary, filter_files_by_ext,
#                    flatten_sha_dict, load_json_lazy)

class Scraper:
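    """Dump metadata for every public GitHub repo to a JSON-lines file,
    then walk each repo's files, download their blobs into the database,
    and let BlobParser scan them for todos."""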
    def __init__(self, github, since=0, db=None):
        # Bare hostname: httplib's connection classes want no scheme or path.
        self.api_link = 'api.github.com'
        # We check here to see if the api is currently working/alive
        self.api_up = self.is_api_up()
        # Login stuff
        self.gh = github
        # Where we store our json data
        self.json_output = "./output.json"
        # Use a fresh DBHandler per instance unless one was injected
        # (a DBHandler() default argument would be shared across instances).
        self.db = db if db is not None else DBHandler()
        # self.count is used to keep track of which repo we're on
        self.count = since
        # self.curr is our current repo
        self.curr = None
        # shas to be processed by BlobDecoder
        self.shas = []
        # BlobParser
        self.blob = BlobParser(self.db)

    def error_handler(self, e, func, *args):
        # Rate-limit errors mean "wait it out"; anything else just gets
        # logged.
        if 'rate limit' in str(e):
            self.sleep_for_api()
        else:
            print "{} raised {}".format(str(func), e)

        # Retry the failed call, but only when its arguments were supplied.
        if args and func is not None:
            func(*args)

    def sleep_for_api(self):
        # 'reset' is the unix timestamp at which the rate-limit window
        # reopens; pad the wait by ten seconds to be safe.
        reset_time = self.gh.rate_limit()['resources']['core']['reset']
        api_accessible = datetime.fromtimestamp(int(reset_time))
        time_to_sleep = (api_accessible - datetime.now()).total_seconds() + 10

        print 'API limit exceeded. Sleeping for {} seconds until {}'.format(
                time_to_sleep, api_accessible.strftime('%H:%M:%S'))

        # Use the downtime productively: parse the blobs we already have.
        print 'Beginning to parse for todos'
        self.blob.run()

        # Recompute the remaining wait, since parsing consumed some of it;
        # if the window has already reset, don't sleep at all.
        time_to_sleep = (api_accessible - datetime.now()).total_seconds() + 10
        sleep(max(time_to_sleep, 0))

    def is_api_up(self):
        try:
            # Constructing the connection object never touches the network,
            # so issue a real request (the API is HTTPS-only).
            conn = HTTPSConnection(self.api_link)
            conn.request('HEAD', '/')
            conn.getresponse()
            return True
        except (HTTPException, socket.error) as e:
            print "Github api is down:"
            self.error_handler(e, self.is_api_up)
            return False

    def _wait_for_api(self):
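        # Sleep through the current rate-limit window, then re-probe;
        # recurses until is_api_up() succeeds.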
        self.sleep_for_api()
        if self.is_api_up():
            self.api_up = True
        else:
            self._wait_for_api()

    def get_repo_langs(self):
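        # Yields (language, byte_count) pairs from the repo's languages
        # endpoint.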
        for i in self.curr.iter_languages():
            yield i

    def _iter_all_repos(self, number=-1, since=None):
        # Resume from self.count unless an explicit starting id is given.
        if since is None:
            since = self.count
        for repo in self.gh.iter_all_repos(number, since):
            self.count += 1
            yield repo

    def request_repo(self, owner, repo):
        # github3's repository() takes the owner and repo name separately.
        return self.gh.repository(owner, repo)

    def _write_repo_to_json(self, file_handle, repo_data):
        f = file_handle
        print 'Retrieved {} repos so far!'.format(self.count)
        # One JSON object per line, so read_json_output() can stream the
        # file back lazily.
        if isinstance(repo_data, list):
            for repo in repo_data:
                dump(repo, f)
                f.write('\n')
        else:
            dump(repo_data, f)
            f.write('\n')

    def download_all_repos(self):
        if not self.api_up:
            self._wait_for_api()

        try:
            with open(self.json_output, 'a+') as f:
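                # 'a+' appends, so an interrupted run resumes onto the same
                # dump file instead of truncating it.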
                for repo in self._iter_all_repos():
                    self._write_repo_to_json(f, repo.to_json())

        except github3.models.GitHubError as e:
            print "Failed to query repos with error:"
            self.error_handler(e, self.download_all_repos)
        except IOError as e:
            print "Failed to open file:"
            self.error_handler(e, self.download_all_repos)

    def find_all_files(self, repo, repo_name, path=None):
        # `repo` is a github3 Repository object rebuilt from one entry of
        # read_json_output(); walk its contents() tree collecting file shas.

        # TODO(ian): Add more common directories if necessary.
        common_dirs = ['bin', 'lib', 'src', repo_name]
        files_found = []

        if path is None:
            path = '.'

        try:
            for file_, val in repo.contents(path[1:]).iteritems():
                # Recurse into well-known source directories; record the
                # path and sha of every regular file.
                if val.type == 'dir' and file_ in common_dirs:
                    files_found.extend(
                            self.find_all_files(repo, repo_name,
                                                path='{}/{}'.format(path,
                                                                    file_)))
                elif val.type == 'file':
                    files_found.append(((path + '/' + file_), val.sha))
        except Exception as e:
            self.error_handler(e, self.find_all_files)
        return files_found

    def parse_json_output(self, data):
        # Should be exclusively called by read_json_output
        try:
            print data['full_name']
            owner, repo = [str(s) for s in data['full_name'].split('/')]
            files_found = self.find_all_files(self.gh.repository(
                owner, repo), repo)

            if files_found:
                files_ = filter_files_by_ext(files_found)
                sha_dict = construct_sha_dictionary(owner, repo, files_)
                for sha in flatten_sha_dict(sha_dict):
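                    # Each flattened entry carries owner, repo, and sha;
                    # fetch the blob and store it with that metadata.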
                    blob = BlobHandler(self.gh, sha['sha'], sha['owner'],
                                       sha['repo'])
                    sha['blob'] = blob.download_blob().decoded
                    self.db.insert_blob(sha)
        except Exception as e:
            # GitHubError and any other failure get the same handling.
            self.error_handler(e, self.parse_json_output)

    def read_json_output(self):
        try:
            with open(self.json_output) as f:
                for data in load_json_lazy(f):
                    self.parse_json_output(loads(data))
        except IOError as e:
            self.error_handler(e, self.read_json_output)
            # An IOError here means the dump file doesn't exist yet, so
            # download the repos first and then retry the read.
            self.download_all_repos()
            self.read_json_output()
        except ValueError as e:
            self.error_handler(e, self.read_json_output)

    def start_scraping(self):
        self.read_json_output()
        self.blob.run()
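
A minimal driver for the class above. The login call and token are
assumptions for illustration; DBHandler and BlobParser come from the
project's own modules.

import github3

gh = github3.login(token='YOUR_TOKEN')  # hypothetical placeholder token
scraper = Scraper(gh)                   # pass since=<last repo id> to resume
scraper.start_scraping()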