Example #1
 def extractReposFiltered(input_file, output_file,
                          _filter=None):
     """
     Extract any repository from 'input_file' that matches 'filter',
     into 'output_file'.
     """
     DataManager.extractReposFiltered(input_file, output_file, _filter)
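A minimal usage sketch for this helper, assuming it is the @staticmethod of the Crawler class shown in Example #4; the file names and the callable filter are illustrative, and the exact filter format is whatever DataManager.extractReposFiltered expects:

# Hypothetical call: keep only PHP repositories from an earlier crawl.
# 'crawl_links' and 'php_repos' are example file names; the filter shape
# (here a callable over a repository dict) is an assumption.
Crawler.extractReposFiltered(
    "crawl_links",
    "php_repos",
    _filter=lambda repo: repo.get("language") == "PHP",
)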
Example #2
 def getKeyFromCrawlData(input_file, output_file,
                         keys=KEY_CLONE_URL):
     """
     Extract the values for 'keys' from every crawled repository in file
     'input_file'.
     Output is redirected into 'output_file'.
     """
     DataManager.getKeysFromCrawlData(input_file, output_file, keys)
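A usage sketch under the same assumption (the method is a @staticmethod of Crawler); the file names are illustrative, and KEY_CLONE_URL ("clone_url") is the default key:

# Hypothetical call: reduce a crawl result file to its clone URLs.
Crawler.getKeyFromCrawlData("crawl_links", "clone_urls")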
Example #3
 def __init__(self, file_path):
     '''
     Load the OAuth credentials from 'file_path' (creating them if
     necessary) and set up the GitHub session.
     '''
     # DataManager handles file reading/writing.
     self.datamanager  = DataManager()
     
     # Get the OAuth credentials from the authentication file at 'file_path'.
     auth_file    = file_path
     auth_manager = OAuthManager(filename=auth_file)
     # Default to an unauthenticated session if no credentials can be loaded.
     auth       = None
     oauth      = None
     user_agent = None
     try:
         auth = auth_manager.getAuthData()
         
     except (AuthFileNotFoundException, AuthException):
         # Authentication file not found or malformed. Recreate it.
         auth = self.initiateAuthCreation(auth_manager)
           
     except NoCredentialsException:
         oauth      = None
         user_agent = None
             
     if auth:
         oauth       = auth[auth_manager.KEY_OAUTH]
         user_agent  = auth[auth_manager.KEY_USER_AGENT]
 
     self.OAUTH = oauth
     self.HEADER_USER_AGENT = user_agent
     
     self.HEADERS = {
                 'User-Agent':    self.HEADER_USER_AGENT,
                 'Authorization': "token %s" % self.OAUTH,
                 }
     
     # Setup authentication and settings
     self.s = GithubSession(self.OAUTH, self.HEADER_USER_AGENT)
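A constructor sketch, assuming an 'authentication' file in the working directory holds the OAuth token and user agent (the file name is illustrative):

# Hypothetical: build a crawler from the local 'authentication' file.
crawler = Crawler("authentication")

# After __init__ the request headers are ready for the GitHub API, e.g.
# {'User-Agent': <user_agent>, 'Authorization': 'token <oauth>'}
print(crawler.HEADERS)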
Example #4
# Standard-library modules used below.
import json
import os
import re
import shutil
import signal
import sys

# Assumed: the HTTP call in search() goes through the 'requests' library
# imported as 'r'; project-internal modules (DataManager, OAuthManager,
# GithubSession, RepositoryList, visualCrawlingFeedback and the custom
# exceptions) are provided elsewhere in the project.
import requests as r


class Crawler(object):
    '''
    Crawler for the GitHub API. Collects repository metadata page by page
    and writes the results to a links file via DataManager.
    '''

    # constants
    FILE_AUTHENTICATION = "authentication"
    
    LINK_API   = "https://api.github.com"
    LINK_REPO_API   = LINK_API + "/repositories"
    LINK_SEARCH_API = LINK_API + "/search/repositories"
    LINK_RATE_LIMIT = LINK_API + "/rate_limit"
    HEADER_USER_AGENT    = None
    HEADER_XRATELIMIT_LIMIT     = "X-RateLimit-Limit"
    HEADER_XRATELIMIT_REMAINING = "X-RateLimit-Remaining"
    
    KEY_NEXT  = "next"
    KEY_SINCE = "since"
    KEY_COUNT = "count"
    KEY_START = "start"
    KEY_CLONE_URL = "clone_url"
    KEY_RL_REMAIN = "X-RateLimit-Remaining"
    KEY_STATUS_CODE   = "status_code"
    KEY_CRAWLED_LINKS = "crawled_links"
    
    # GitHub Session object
    s = None
    
    def __init__(self, file_path):
        '''
        Load the OAuth credentials from 'file_path' (creating them if
        necessary) and set up the GitHub session.
        '''
        # DataManager handles file reading/writing.
        self.datamanager  = DataManager()
        
        # Get the OAuth credentials from the authentication file at 'file_path'.
        auth_file    = file_path
        auth_manager = OAuthManager(filename=auth_file)
        # Default to an unauthenticated session if no credentials can be loaded.
        auth       = None
        oauth      = None
        user_agent = None
        try:
            auth = auth_manager.getAuthData()
            
        except (AuthFileNotFoundException, AuthException):
            # Authentication file not found or malformed. Recreate it.
            auth = self.initiateAuthCreation(auth_manager)
              
        except NoCredentialsException:
            oauth      = None
            user_agent = None
                
        if auth:
            oauth       = auth[auth_manager.KEY_OAUTH]
            user_agent  = auth[auth_manager.KEY_USER_AGENT]
    
        self.OAUTH = oauth
        self.HEADER_USER_AGENT = user_agent
        
        self.HEADERS = {
                    'User-Agent':    self.HEADER_USER_AGENT,
                    'Authorization': "token %s" % self.OAUTH,
                    }
        
        # Setup authentication and settings
        self.s = GithubSession(self.OAUTH, self.HEADER_USER_AGENT)
        
    def initiateAuthCreation(self, auth_manager):
        try:
            auth_manager.createAuth()
            auth = auth_manager.getAuthData()
            print "Authentication process done. Continuing..."
            
        except OAuthCreationException:
            # OAuth error. Maybe the OAuth token could not be created, because
            # it already exists.
            print (
                "OAuth error. Maybe authentication file could not be written "
                "because of missing write-privilege."
                )
            sys.exit()
        
        return auth
        
    def crawlReposWUpdate(self, data_filename):
        self.crawlRepos(data_filename, skip=False)
    
    def crawlRepos(self, file_links, skip=True, _filter=None):
        current_ratelimit = self.getRateLimit()["core"]["remaining"]
        if current_ratelimit == 0:
            self.endExecution()
        
        url = None
        copy_only = False
        
        file_links_backup = ""

        # Filehandle for writing.
        fw = None
        f_links = None
        
        
        TEXT_PROCESSING = "Processing contents of file: "
        # If a links file already exists from earlier crawls, then parse it.
        if os.path.isfile(file_links):
            print "File '%s' exists already. Will be appending to it." % (file_links)

            file_links_backup = file_links + "_backup"
            
            def restoreBackup(signum, frame):
                """
                Inner function: Restore original file from backup upon 
                termination in backup process.
                """
                msg = "Got exit signal. Restoring original file from backup..."
                print "\n%s\r" % (msg), 
                
                if fw:
                    fw.close()

                if f_links:
                    f_links.close()

                # Copy backup file back.
                shutil.copyfile(file_links_backup, file_links)
                
                print "%s Done." % (msg)
                
                sys.exit()
            
            # Catch process-kill signal.
            signal.signal(signal.SIGTERM, restoreBackup)
            
            # Also catch Ctrl-C/D.
            signal.signal(signal.SIGINT, restoreBackup)

            os.rename(file_links, file_links_backup)
            
            f_links     = open(file_links_backup, 'r')
            
            if skip:
                # We do not want to recrawl old data, so
                # just copy-paste it.
                shutil.copyfile(file_links_backup, file_links)
                    
            # Open fh for writing.
            fw = open(file_links, 'a')
            
            print TEXT_PROCESSING + str(file_links) + "..."
            sys.stdout.flush()

            if skip:
                # We do not want to recrawl old data.
                # Therefore, get the last next-link from the old data,
                # so that we can continue crawling from there.
                data = self.datamanager.getDataLikeTail(file_links, 
                                                            1, stepsize=65)

                url = self.datamanager.extractNextURL(data)
            else:
                old_data = f_links
 
            etag  = None
            repos = None
            next_url = None
            
            file_pos = None
            # Parse old data if skip was not specified.
            while not skip:
                try:
                    file_pos    = old_data.tell()
                    parsed_data = self.datamanager.parseNextBlock(old_data)

                    if parsed_data:
                        _repos, url, etag, next_url = parsed_data
                        
                        repos = RepositoryList(
                                    url, etag, repos=_repos,
                                    next_url=next_url
                                    )
        
                        if not skip:
                            try:
                                # Update data, by requesting Github API.
                                self.nextBackupCrawl(fw, repos, 
                                                     copy_only=copy_only,
                                                     _filter=_filter)
                                
                            except RatelimitExceededException:
                                # No ratelimit remaining, continue
                                # to only copy the old data and finish.
                                copy_only = True

                    # We finished parsing the old data.                    
                    else:
                        break
                    
                # Encountered malformatted block, probably because
                # the original data file was cut/edited.
                # Rewind the file position and skip one line.
                except IOError as err:
                    old_data.seek(file_pos, os.SEEK_SET)
                    old_data.readline()
                    print err, " Skipping this line!"

            if repos:
                url = repos.getNextURL()
                
            # Remove backup signal handlers.
            # SIG_DFL is the standard signal handle for any signal.
            signal.signal(signal.SIGTERM, signal.SIG_DFL)
            signal.signal(signal.SIGINT,  signal.SIG_DFL)
            print "Done parsing old data."
            
            if copy_only:
                self.endExecution()
        
        repos = None
        
        try:
            # Parsing finished or no backup file found. Start crawling new data.
            if not fw:
                # There was no backup file
                fw = open(file_links, 'a')
            
            if not url:
                # We do not have a URL to start from yet.
                # Start crawling from the beginning.
                repos = self.nextCrawl(fw, _filter=_filter)
                url   = repos.getNextURL()

            # Parse until ratelimit is reached.
            while url:
                # Crawl next page
                repos = self.nextCrawl(fw, url=url, _filter=_filter)
                url   = repos.getNextURL()
    
            fw.close()
            
        except RatelimitExceededException:
            self.endExecution()

    def nextBackupCrawl(self, fh, repository_list, 
                        copy_only=False, _filter=None):
        """
        Get up-to-date data for already crawled repositories.
        If 'copy_only' is specified, we only copy old data from
        the backup file to not lose any already crawled data.
        """
        result = None

        if not copy_only:
            # We do not want to simply copy the old data - 
            # check for an update.
            print "Updating from: %s" % repository_list.getURL()
            
            result = self.s.update(repository_list)
            
            if result:
                print "Found update!"
        
        if _filter:
            # Filter results
            repository_list.filter(self.s, _filter)
        
        self.datamanager.writeRepositoryList(fh, repository_list)
        
        return result

    def nextCrawl(self, fh, url=None, _filter=None):
        """
        Crawl repositories from GitHub.
        'url' is used to specify the next parse-URL.
        """
        result = None

        _format = "Crawling: %s"
        
        # Setup visual feedback thread.
        visual_feedback = visualCrawlingFeedback()
        
        if url:
            _format = _format % url
            sys.stdout.write(_format + "\r")
            sys.stdout.flush()
            
            visual_feedback.setMsg(_format)
            visual_feedback.start()
            result = self.s.getRepos(url=url)
            
        else:
            _format = _format % "From beginning."
            sys.stdout.write(_format + "\r")
            sys.stdout.flush()
            
            visual_feedback.setMsg(_format)
            visual_feedback.start()
            result = self.s.getRepos()

        if _filter:
            # Filter results
            result.filter(self.s, _filter)

        # Write new results from Github.
        self.datamanager.writeRepositoryList(fh, result)

        visual_feedback.stopFeedback()
        
        print visual_feedback.getMsg() + " Saved to file."

        return result

    @staticmethod
    def getKeyFromCrawlData(input_file, output_file,
                            keys=KEY_CLONE_URL):
        """
        Extract the values for 'keys' from every crawled repository in file
        'input_file'.
        Output is redirected into 'output_file'.
        """
        DataManager.getKeysFromCrawlData(input_file, output_file, keys)

    @staticmethod
    def extractReposFiltered(input_file, output_file,
                             _filter=None):
        """
        Extract any repository from 'input_file' that matches 'filter',
        into 'output_file'.
        """
        DataManager.extractReposFiltered(input_file, output_file, _filter)
    
    def endExecution(self):
        print "Ratelimit reached. Quitting..."
        sys.exit()
        
    def getNextURL(self, _dict, next_link=None):
        """
        Find the URL in _dict and return it.
        Empty string if it does not exist.
        'next_link' can be used to specify an alternative if there is no
        link in _dict.
        """
        if self.KEY_NEXT in _dict:
            return _dict[self.KEY_NEXT]
        else:
            if next_link:
                return next_link
            else:
                return ""
        
    def search(self, q="language:PHP", sort=None, order=None):
        """
        Search GitHub for 'q'.
        Any search is limited to 1000 results.
        """
        # Could yield problems, because no deep copy is done.
        # TODO: (maybe)
        resp = r.get(self.addOAuth(self.LINK_SEARCH_API + "?q=" + q),
                     headers=self.HEADERS)
        
        decoded = json.loads(resp.text)
        
        for _dict in decoded["items"]: 
            print _dict["clone_url"]
            
        return decoded
    
    def getRateLimit(self):
        return self.s.getRatelimit()

    def addOAuth(self, url):
        """
        Add the OAuth get-parameter to the specified 'url'.
        """
        token_query = "access_token=" + self.OAUTH
        if url.find('?') != -1:
            url += "&" + token_query
        else:
            url += "?" + token_query 
    
        return url
    
    ### LEGACY CODE
    ### ~~~~~~~~~~~
    def crawlSearchDays(self, start, end, q="langauge:PHP", sort=None, order=None):
        """
        Crawl the clone urls for the search query 'q'.
        However, the query will be modified to only show results of
        a certain day.
        This will be repeated until each day in [start, end] was queried.
        Therefore, 'start' and 'end' have to be dates of format YYYY-MM-DD.
        
        Some days may be skipped due to the differing lengths of months.
        """
        # Check start and end format first.
        date_pattern = re.compile('^[0-9]{4}-[0-9]{2}-[0-9]{2}$')
        if not date_pattern.match(start) or not date_pattern.match(end):
            # 'start' or 'end' has the wrong format.
            print (
                "'start' and 'end' are expected to be of format YYYY-MM-DD."
                "'%s' and '%s' were given." % (start, end)
                )
            return -1
        
        else:
            # Parameters are ok, continue
            pass
    
    def crawlSearching(self, q="language:PHP", sort=None, order=None):
        """
        Crawl the clone urls for the search query 'q'.
        The response is split into 10 URLs with 100 repositories each.
        """
        per_page = 100
        page     = 0
        
        for page in range(1, 11):
            resp = self.search(q + "&per_page=" + str(per_page) + 
                               "&page=" + str(page))
            
            # Check if the response was empty, so that we can reduce
            # the load on the GitHub API servers.
            if not resp["items"]:
                break
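A short end-to-end sketch of how the class above might be driven; the file names are illustrative, and a hit on the rate limit ends the run via endExecution():

# Hypothetical driver: crawl repositories, then extract their clone URLs.
crawler = Crawler(Crawler.FILE_AUTHENTICATION)

# Appends to 'crawl_links'; with skip=True (the default) an existing file is
# copied and crawling resumes from its last next-link.
crawler.crawlRepos("crawl_links")

# Reduce the crawl data to clone URLs for later processing.
Crawler.getKeyFromCrawlData("crawl_links", "clone_urls",
                            keys=Crawler.KEY_CLONE_URL)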