def getpagecount(self, collection, uriAt, sort, view, partial_match,
                 flatList, logging_me=True, debugging_me=False):
  """Return the urltracker page count for a collection/URI.

  Asks each configured urltracker_server in turn and returns the first
  non-None answer.

  Args:
    collection: collection name (stripped of surrounding whitespace).
    uriAt: URI to query; sanitized via self.SanitizeURI first.
    sort: sort key string passed through to the urltracker server.
    view: view name string passed through to the urltracker server.
    partial_match: truthy flag; converted with self.GetIntValue.
    flatList: flat-vs-tree flag; converted with self.GetIntValue.
    logging_me: if true, log the flatList value at INFO.
    debugging_me: unused here; kept for signature parity with get().

  Returns:
    A string 'response = <repr of contents>\n', or 'response = []\n' when
    no urltracker server produced an answer.
  """
  if logging_me:
    # Lazy %-args: message is only built if INFO is enabled, and this does
    # not raise if flatList is ever a non-string.
    logging.info('[diagnose_handler:getpagecount] Sitemap flatList = %s',
                 flatList)
  servers = (self.cfg.globalParams.GetServerManager().Set(
      'urltracker_server').Servers())
  uriAt = self.SanitizeURI(uriAt)
  for server in servers:
    client = urltracker_client.URLTrackerClient(server.host(),
                                                int(server.port()))
    contents = client.GetPageCount(string.strip(collection),
                                   string.strip(uriAt),
                                   string.strip(sort),
                                   string.strip(view),
                                   self.GetIntValue(partial_match),
                                   self.GetIntValue(flatList))
    # `is None` (not `== None`): None means this server had no answer;
    # fall through to the next one.
    if contents is None:
      continue
    return 'response = %s\n' % repr(contents)
  return 'response = []\n'
def get(self, collection, uriAt, sort, view, page, partial_match, flatList,
        logging_me=True, debugging_me=False):
  """Return content diagnostics for a collection/URI page.

  Queries each configured urltracker_server until one answers, then
  decorates FileContentData entries with a freshly computed pagerank.

  Args:
    collection: collection name (stripped of surrounding whitespace).
    uriAt: URI to query; sanitized via self.SanitizeURI first.
    sort: sort key string passed through to the urltracker server.
    view: view name string passed through to the urltracker server.
    page: page number; converted with self.GetIntValue.
    partial_match: truthy flag; converted with self.GetIntValue.
    flatList: flat-vs-tree flag; converted with self.GetIntValue.
    logging_me: if true, log the flatList value at INFO.
    debugging_me: if true, prefix each entry's name with its list mode.

  Returns:
    A string 'response = <repr of contents>\n', or 'response = []\n' when
    no urltracker server produced an answer.
  """
  if logging_me:
    # Lazy %-args: message is only built if INFO is enabled, and this does
    # not raise if flatList is ever a non-string.
    logging.info('[diagnose_handler:get] Sitemap flatList = %s', flatList)
  servers = (self.cfg.globalParams.GetServerManager().Set(
      'urltracker_server').Servers())
  uriAt = self.SanitizeURI(uriAt)
  for server in servers:
    client = urltracker_client.URLTrackerClient(server.host(),
                                                int(server.port()))
    contents = client.Get(string.strip(collection), string.strip(uriAt),
                          string.strip(sort), string.strip(view),
                          self.GetIntValue(page),
                          self.GetIntValue(partial_match),
                          self.GetIntValue(flatList))
    # `is None` (not `== None`): None means this server had no answer;
    # fall through to the next one.
    if contents is None:
      continue
    #
    # Note: This last-minute `pagerank' update is the sole difference btwn
    # this procedure (viz., `get()') and `export()' below.
    #
    # Note: The call to `exportDiagnostics()' in "AdminCaller.java" actually
    # calls _this_ (viz., `get()'), __not__ `export()' below.
    #
    # Note: The `execute()' code in "ExportDiagnosticsHandler.java" ignores
    # this meticulously calculated `pagerank' data altogether. Hmmm.
    #
    # NOTE(review): the final element is deliberately skipped — presumably a
    # summary/trailer record rather than a content entry; confirm against
    # the urltracker server protocol.
    for content in contents[:-1]:
      if content.get('type') == 'FileContentData':
        content['pagerank'] = self.GetPageRank(
            long(pywrapurl.URL(content['uri']).Fingerprint()))
      if debugging_me:
        # Tag each entry name with the list mode so it is visible in output.
        if self.GetIntValue(flatList) == 0:
          content['name'] = '[tree=' + flatList + '] ' + content['name']
        else:
          content['name'] = '[flat=' + flatList + '] ' + content['name']
    return 'response = %s\n' % repr(contents)
  return 'response = []\n'
def getFile(self, uriAt):
  """Return the content status for the URI uriAt.

  Asks each configured urltracker_server for the URI's tracked state, then
  appends a dictionary of per-document statistics (pagerank, cache status,
  link counts, dates) computed from the document fingerprint.

  Args:
    uriAt: URI to look up; sanitized via self.SanitizeURI first.

  Returns:
    A string 'response = <repr of response list>\n', or 'response = []\n'
    when no urltracker server produced an answer.
  """
  collection_names = ent_collection.ListCollections(self.cfg.globalParams)
  collection_fingerprint_map = {}
  uriAt = self.SanitizeURI(uriAt)
  # Servers report collections by fingerprint; build the reverse map once.
  for name in collection_names:
    collection_fingerprint_map[pywraphash.Fingerprint(name)] = name
  urltracker_servers = (self.cfg.globalParams.GetServerManager().Set(
      'urltracker_server').Servers())
  DocID = long(pywrapurl.URL(uriAt).Fingerprint())
  for urltracker_server in urltracker_servers:
    urltracker_client_ = urltracker_client.URLTrackerClient(
        urltracker_server.host(), int(urltracker_server.port()))
    (response, last_successful_crawl_timestamp,
     auth_method) = urltracker_client_.GetFile(string.strip(uriAt),
                                               collection_fingerprint_map)
    # `is None` (not `== None`): None means this server had no answer;
    # fall through to the next one.
    if response is None:
      continue
    pagerank = self.GetPageRank(DocID)
    cached = self.IsDocCached(DocID)
    forwardLinks = self.GetLinkCount(DocID, 1)
    backwardLinks = self.GetLinkCount(DocID, 0)
    date = self.GetDate(DocID)
    lastmodifieddate = self.GetLastModifiedDate(DocID)
    # Renamed from `dict' to avoid shadowing the builtin.
    doc_info = {
        'pagerank': pagerank,
        'cached': cached,
        'date': date,
        'lastmodifieddate': lastmodifieddate,
        'forwardLinks': forwardLinks,
        'backwardLinks': backwardLinks
    }
    # Note that this timestamp and the CRAWLED_NEW state may appear in
    # history list of states in @response. However, if it ages enough,
    # it will be removed from the history list. Therefore we need to
    # store away and pass around along with, if applicable, its auth_method.
    if last_successful_crawl_timestamp:
      doc_info['lastSuccessfulCrawlTimestamp'] = long(
          last_successful_crawl_timestamp)
    if auth_method:
      doc_info['authMethod'] = int(auth_method)
    response.append(doc_info)
    return 'response = %s\n' % repr(response)
  return 'response = []\n'
def getContentTypeStats(self, sortBy, collection):
  """Fetch content-type statistics from the urltracker backends.

  Queries each configured urltracker_server in turn and returns the first
  non-empty result as a 'response = ...' string; an empty-dict response
  string is returned when no backend has stats.
  """
  backend_set = self.cfg.globalParams.GetServerManager().Set(
      'urltracker_server')
  # The stripped collection name is loop-invariant; compute it once.
  stripped_collection = string.strip(collection)
  for backend in backend_set.Servers():
    tracker = urltracker_client.URLTrackerClient(backend.host(),
                                                 int(backend.port()))
    stats = tracker.GetContentTypeStats(sortBy, stripped_collection)
    if stats:
      return 'response = %s\n' % repr(stats)
  return 'response = {}\n'