def buildLinkData(self):
    """Collect all entity links into a newline-delimited text buffer.

    For every link record, appends "<mainUrl>\n" and "<urlTitle>\n\n"
    to a local buffer, skipping (and reporting) records whose attributes
    fail to read.

    Returns:
        The assembled buffer as a single string (joined once at the end
        rather than concatenated per record).
    """
    entity_handler = EntityLinkHandler()
    entity_handler.connect()
    str_buf_data = []
    try:
        link_data = entity_handler.listEntityLinks()
        for link in link_data:
            try:
                str_buf_data.append(link.mainUrl + "\n")
                str_buf_data.append(link.urlTitle + "\n\n")
            except Exception as e:
                # Include the actual failure so bad records can be diagnosed.
                print("ERROR: generic error while retrieving data - %s" % e)
    finally:
        # Always release the database connection, even on failure.
        entity_handler.closeConn()
    return "".join(str_buf_data)
class BotlistSweepLinks:
    """Maintenance job that strips stopwords from entity-link keywords."""

    def __init__(self):
        # Database access layer for entity links; connected in init().
        self.link_handler = EntityLinkHandler()

    def init(self):
        """Open the database connection."""
        self.link_handler.connect()

    def shutdown(self):
        """Close the database connection."""
        self.link_handler.closeConn()

    def filterStopwords(self, keywords):
        """Remove all stopwords from a keyword string.

        NOTE: set difference drops duplicate words and does not preserve
        the original word order (same behavior as before).
        """
        keywords_list = keywords.lower().split()
        # set.difference accepts any iterable; no need to wrap in list()/set().
        filtered = set(keywords_list).difference(BOTLIST_STOP_WORDS)
        # Return the new keyword string
        return " ".join(filtered)

    def sweep(self):
        """Clean up the entity link system: filter keywords on unprocessed
        links owned by the rover bots."""
        sql_where_clause = "where (process_count = 0) and (full_name = 'botbert99' or full_name = 'botrover99')"
        data = self.link_handler.listEntityLinks(result_limit=MAX_ENTITY_LINKS,
                                                 where_clause=sql_where_clause)
        for node in data:
            try:
                new_keywords = self.filterStopwords(node.keywords)
                self.link_handler.updateKeywords(node, new_keywords)
            except Exception as e:
                print(e)
class BotlistProcessCategory:
    """Collects category groups and their terms from the bot terms service.

    NOTE(review): this class is defined twice in this file; the duplicate
    definition should eventually be removed.
    """

    def __init__(self):
        self.link_handler = EntityLinkHandler()
        # Maps category group name -> list of terms for that group.
        self.terms_dict = {}

    def init(self):
        """Open the database connection."""
        self.link_handler.connect()

    def shutdown(self):
        """Close the database connection."""
        self.link_handler.closeConn()
        print("INFO: CLEANLY shutdown database connection")

    def reqURLData(self, requrl):
        """GET the given URL with the rover user-agent and return the body."""
        opener = urllib2.build_opener()
        req = urllib2.Request(requrl)
        req.add_header('user-agent', ROVER_USER_AGENT)
        # Close the response handle explicitly instead of leaking it.
        resp = opener.open(req)
        try:
            return resp.read()
        finally:
            resp.close()

    def findTerms(self, category_group):
        """Perform a request for the whitespace-delimited terms of one group."""
        term_url = "%s?categorytype=%s" % (BOT_TERMS_SERVICE, category_group)
        data = self.reqURLData(term_url)
        self.terms_dict[category_group] = data.split()

    def buildTermSet(self):
        """Connect to the bots category service and collect all categories
        and terms; failures on one group do not abort the rest."""
        # Get the category list
        data = self.reqURLData(BOT_TERMS_SERVICE)
        all_groups = data.split()
        for group in all_groups:
            try:
                self.findTerms(group)
            except Exception as e:
                print(e)
class BotlistProcessCategory:
    """Collects category groups and their terms from the bot terms service.

    NOTE(review): duplicate definition of BotlistProcessCategory in this
    file; this later definition shadows the earlier one at import time.
    """

    def __init__(self):
        self.link_handler = EntityLinkHandler()
        # Maps category group name -> list of terms for that group.
        self.terms_dict = {}

    def init(self):
        """Open the database connection."""
        self.link_handler.connect()

    def shutdown(self):
        """Close the database connection."""
        self.link_handler.closeConn()
        print("INFO: CLEANLY shutdown database connection")

    def reqURLData(self, requrl):
        """GET the given URL with the rover user-agent and return the body."""
        opener = urllib2.build_opener()
        req = urllib2.Request(requrl)
        req.add_header('user-agent', ROVER_USER_AGENT)
        # Close the response handle explicitly instead of leaking it.
        resp = opener.open(req)
        try:
            return resp.read()
        finally:
            resp.close()

    def findTerms(self, category_group):
        """Perform a request for the whitespace-delimited terms of one group."""
        term_url = "%s?categorytype=%s" % (BOT_TERMS_SERVICE, category_group)
        data = self.reqURLData(term_url)
        self.terms_dict[category_group] = data.split()

    def buildTermSet(self):
        """Connect to the bots category service and collect all categories
        and terms; failures on one group do not abort the rest."""
        # Get the category list
        data = self.reqURLData(BOT_TERMS_SERVICE)
        all_groups = data.split()
        for group in all_groups:
            try:
                self.findTerms(group)
            except Exception as e:
                print(e)
def buildLinkData(self, drop_result_limit=MAX_LINKS_PROCESS):
    """Dump entity links as tab-separated lines (url, title, keywords).

    Args:
        drop_result_limit: maximum number of links to fetch and dump.
    """
    start_time = time.time()
    # Open the output file before the try block: if open() itself fails
    # there is nothing to clean up, and the old code's finally clause
    # would have raised NameError on the unbound fout/entity_handler,
    # masking the real error.
    fout = open("/home/bbrown/botlist_datadump.dat", "w")
    try:
        entity_handler = EntityLinkHandler()
        entity_handler.connect()
        try:
            link_data = entity_handler.listEntityLinks(
                result_limit=drop_result_limit)
            for link in link_data:
                try:
                    fout.write("%s\t%s\t%s\n" % (link.mainUrl,
                                                 link.urlTitle,
                                                 link.keywords))
                except Exception as e:
                    # Report which record failed instead of a bare message.
                    print("ERROR: generic error while retrieving data - %s" % e)
        finally:
            entity_handler.closeConn()
    finally:
        fout.close()
    end_time = time.time()
    diff = end_time - start_time
    # Surface the elapsed time instead of computing and discarding it.
    print("INFO: buildLinkData completed in %s seconds" % diff)
class EntityLinkWebAnalysisJob:
    """Batch job that fetches each link's page and stores web analytics."""

    def __init__(self):
        # Sentinel: -1 means the job has not run yet.
        self.passed_ct = -1
        self.link_handler = EntityLinkHandler()

    def init(self):
        """Open the database connection."""
        self.link_handler.connect()

    def shutdown(self):
        """Close the database connection."""
        self.link_handler.closeConn()
        print("INFO: CLEANLY shutdown database connection")

    def processURLs(self):
        """Iterate through the list of web URLs and set web information."""
        PROCESS_CT_ID = 3
        sql_where_clause = "where (process_count < %s)" % PROCESS_CT_ID
        max_links_proc = MAX_ENTITY_LINKS_ANALYTICS
        data = self.link_handler.listEntityLinks(result_limit=max_links_proc,
                                                 where_clause=sql_where_clause)
        opener = urllib2.build_opener()
        cur_time = datetime.datetime.now()
        print("INFO [%s]: requesting URL data MAX=%s" % (cur_time,
                                                         max_links_proc))
        for node in data:
            try:
                web_model = scan_url.extractPageData(opener, node.mainUrl)
                # Update the entity link to set web data analysis;
                # skip when extraction produced nothing.
                if web_model:
                    self.link_handler.updateWebAnalytics(
                        node, web_model, PROCESS_STATUS_ANALYTICS)
            except Exception as e:
                cur_time = datetime.datetime.now()
                print("ERR [%s]:processURLs url=%s" % (cur_time, node.mainUrl))
                print(e)
        # quick summary
        print("INFO: links updated=%s" % self.link_handler.update_count)
def buildLinkData(self, drop_result_limit=MAX_LINKS_PROCESS):
    """Dump entity links as tab-separated lines (url, title, keywords).

    Args:
        drop_result_limit: maximum number of links to fetch and dump.
    """
    start_time = time.time()
    # Open the output file before the try block: the old code's finally
    # clause referenced fout/entity_handler, which are unbound when
    # open() or connect() fails, raising NameError and masking the
    # original exception.
    fout = open("/home/bbrown/botlist_datadump.dat", "w")
    try:
        entity_handler = EntityLinkHandler()
        entity_handler.connect()
        try:
            link_data = entity_handler.listEntityLinks(
                result_limit=drop_result_limit)
            for link in link_data:
                try:
                    fout.write("%s\t%s\t%s\n" % (link.mainUrl,
                                                 link.urlTitle,
                                                 link.keywords))
                except Exception as e:
                    # Report which record failed instead of a bare message.
                    print("ERROR: generic error while retrieving data - %s" % e)
        finally:
            entity_handler.closeConn()
    finally:
        fout.close()
    end_time = time.time()
    diff = end_time - start_time
    # Surface the elapsed time instead of computing and discarding it.
    print("INFO: buildLinkData completed in %s seconds" % diff)
def __init__(self):
    """Prepare the job: no links have passed yet and the database
    handler starts unconnected (call init() to connect)."""
    # -1 marks "not yet run".
    self.passed_ct = -1
    self.link_handler = EntityLinkHandler()
def __init__(self):
    """Create the database handler and an empty group->terms mapping."""
    self.link_handler = EntityLinkHandler()
    # Data structure to hold the groups and their terms
    self.terms_dict = {}
def __init__(self):
    """Create the entity-link database handler (connect via init())."""
    self.link_handler = EntityLinkHandler()
def __init__(self):
    """Create the database handler and an empty group->terms mapping."""
    self.link_handler = EntityLinkHandler()
    # Data structure to hold the groups and their terms
    self.terms_dict = {}