Esempio n. 1
0
	def buildLinkData(self):		
		"""Collect the URL and title of every entity link as text fragments.

		Opens a new EntityLinkHandler connection, lists the entity links and
		appends two strings per link to ``str_buf_data``: the URL followed by
		a newline, then the title followed by a blank line.

		NOTE(review): in the lines visible here ``start_time`` and
		``str_buf_data`` are never read and the connection is never closed,
		so the method presumably continues past this excerpt -- confirm.
		"""
		# Timestamp taken before any handler work starts.
		start_time = time.time()
		entity_handler = EntityLinkHandler()
		entity_handler.connect()
		link_data = entity_handler.listEntityLinks()
		
		# Flat list of fragments: [url, title, url, title, ...]
		str_buf_data = []		
		for link in link_data:
			try:				
				str_buf_data.append(link.mainUrl + "\n")
				# If this second append fails, the URL line above has already
				# been added and stays in the buffer without its title.
				str_buf_data.append(link.urlTitle + "\n\n")
			except Exception, e:
				# The caught exception is not reported; only a generic
				# message is printed and the loop moves on to the next link.
				print "ERROR: generic error while retrieving data"
Esempio n. 2
0
    def buildLinkData(self):
        """Collect the URL and title of every entity link as text fragments.

        Opens a new EntityLinkHandler connection, lists the entity links and
        appends two strings per link to ``str_buf_data``: the URL followed by
        a newline, then the title followed by a blank line.

        NOTE(review): in the lines visible here ``start_time`` and
        ``str_buf_data`` are never read and the connection is never closed,
        so the method presumably continues past this excerpt -- confirm.
        """
        # Timestamp taken before any handler work starts.
        start_time = time.time()
        entity_handler = EntityLinkHandler()
        entity_handler.connect()
        link_data = entity_handler.listEntityLinks()

        # Flat list of fragments: [url, title, url, title, ...]
        str_buf_data = []
        for link in link_data:
            try:
                str_buf_data.append(link.mainUrl + "\n")
                # If this second append fails, the URL line above has already
                # been added and stays in the buffer without its title.
                str_buf_data.append(link.urlTitle + "\n\n")
            except Exception, e:
                # The caught exception is not reported; only a generic
                # message is printed and the loop moves on to the next link.
                print "ERROR: generic error while retrieving data"
Esempio n. 3
0
class BotlistSweepLinks:
	"""Remove stop words from the keywords stored on selected entity links.

	The class wraps one EntityLinkHandler: ``init()`` connects it,
	``sweep()`` rewrites the keywords, ``shutdown()`` closes the connection.
	"""

	def __init__(self):
		# Only constructed here; init() performs the actual connect.
		self.link_handler = EntityLinkHandler()

	def init(self):
		"""Open the link handler's connection."""
		self.link_handler.connect()

	def shutdown(self):
		"""Close the link handler's connection."""
		self.link_handler.closeConn()

	def filterStopwords(self, keywords):
		"""Return ``keywords`` lower-cased with stop words and repeats removed.

		The input is split on whitespace. Words contained in
		``BOTLIST_STOP_WORDS`` are dropped, as are repeated words; the
		remaining words keep the order of their first appearance and are
		joined with single spaces. A plain set difference yields the same
		words but in arbitrary order, which made the stored keyword string
		unpredictable.

		Raises AttributeError when ``keywords`` has no ``lower`` method
		(for example None); sweep() catches and prints that error.
		"""
		# Build the lookup set once rather than per word.
		stop_words = set(BOTLIST_STOP_WORDS)
		seen = set()
		kept = []
		for word in keywords.lower().split():
			if word in stop_words or word in seen:
				continue
			seen.add(word)
			kept.append(word)

		# Return the new keyword string
		return " ".join(kept)

	def sweep(self):
		"""Rewrite the keywords of unprocessed botbert99/botrover99 links.

		Selects at most MAX_ENTITY_LINKS links with ``process_count = 0``
		whose ``full_name`` is 'botbert99' or 'botrover99', filters each
		link's keywords and hands the result to the handler's
		``updateKeywords``. An error on one link is printed and the sweep
		continues with the next link.
		"""
		sql_where_clause = "where (process_count = 0) and (full_name = 'botbert99' or full_name = 'botrover99')"
		data = self.link_handler.listEntityLinks(result_limit=MAX_ENTITY_LINKS, where_clause=sql_where_clause)
		for node in data:
			try:
				new_keywords = self.filterStopwords(node.keywords)
				self.link_handler.updateKeywords(node, new_keywords)
			except Exception as e:
				# Best effort: report the failure and keep sweeping.
				print(e)

# End of the file
Esempio n. 4
0
class BotlistSweepLinks:
    """Remove stop words from the keywords stored on selected entity links.

    The class wraps one EntityLinkHandler: ``init()`` connects it,
    ``sweep()`` rewrites the keywords, ``shutdown()`` closes the connection.
    """

    def __init__(self):
        # Only constructed here; init() performs the actual connect.
        self.link_handler = EntityLinkHandler()

    def init(self):
        """Open the link handler's connection."""
        self.link_handler.connect()

    def shutdown(self):
        """Close the link handler's connection."""
        self.link_handler.closeConn()

    def filterStopwords(self, keywords):
        """Return ``keywords`` lower-cased with stop words and repeats removed.

        The input is split on whitespace. Words contained in
        ``BOTLIST_STOP_WORDS`` are dropped, as are repeated words; the
        remaining words keep the order of their first appearance and are
        joined with single spaces. A plain set difference yields the same
        words but in arbitrary order, which made the stored keyword string
        unpredictable.

        Raises AttributeError when ``keywords`` has no ``lower`` method
        (for example None); sweep() catches and prints that error.
        """
        # Build the lookup set once rather than per word.
        stop_words = set(BOTLIST_STOP_WORDS)
        seen = set()
        kept = []
        for word in keywords.lower().split():
            if word in stop_words or word in seen:
                continue
            seen.add(word)
            kept.append(word)

        # Return the new keyword string
        return " ".join(kept)

    def sweep(self):
        """Rewrite the keywords of unprocessed botbert99/botrover99 links.

        Selects at most MAX_ENTITY_LINKS links with ``process_count = 0``
        whose ``full_name`` is 'botbert99' or 'botrover99', filters each
        link's keywords and hands the result to the handler's
        ``updateKeywords``. An error on one link is printed and the sweep
        continues with the next link.
        """
        sql_where_clause = "where (process_count = 0) and (full_name = 'botbert99' or full_name = 'botrover99')"
        data = self.link_handler.listEntityLinks(result_limit=MAX_ENTITY_LINKS,
                                                 where_clause=sql_where_clause)
        for node in data:
            try:
                new_keywords = self.filterStopwords(node.keywords)
                self.link_handler.updateKeywords(node, new_keywords)
            except Exception as e:
                # Best effort: report the failure and keep sweeping.
                print(e)


# End of the file
Esempio n. 5
0
class BotlistProcessCategory:
	"""Collect category groups and their terms from BOT_TERMS_SERVICE.

	``terms_dict`` maps each category group name to the list of terms the
	service returned for that group.
	"""

	def __init__(self):
		# Only constructed here; init() performs the actual connect.
		self.link_handler = EntityLinkHandler()
		# Data structure to hold the groups and their terms
		self.terms_dict = {}

	def init(self):
		"""Open the link handler's connection."""
		self.link_handler.connect()

	def shutdown(self):
		"""Close the link handler's connection and report it on stdout."""
		self.link_handler.closeConn()
		print("INFO: CLEANLY shutdown database connection")

	def reqURLData(self, requrl):
		"""Fetch ``requrl`` and return the response body.

		The request carries ROVER_USER_AGENT as its user-agent header.
		Errors raised while opening or reading propagate to the caller.
		"""
		opener = urllib2.build_opener()
		req = urllib2.Request(requrl)
		req.add_header('user-agent', ROVER_USER_AGENT)
		response = opener.open(req)
		try:
			return response.read()
		finally:
			# Close the response explicitly so the connection is released
			# even when read() fails, instead of relying on garbage collection.
			response.close()

	def findTerms(self, category_group):
		""" Perform a request for the terms of ``category_group``.

		The whitespace-separated response is stored as a list under
		``self.terms_dict[category_group]``, replacing any earlier entry.
		"""
		term_url = "%s?categorytype=%s" % (BOT_TERMS_SERVICE, category_group)
		data = self.reqURLData(term_url)
		self.terms_dict[category_group] = data.split()

	def buildTermSet(self):
		""" Connect to the bots category service and collect all categories and terms.

		The service is first queried without parameters; its
		whitespace-separated response is taken as the list of groups and
		findTerms() is called for each. A failure for one group is printed
		and skipped, while a failure of the initial request propagates.
		"""
		# Get the category list
		data = self.reqURLData(BOT_TERMS_SERVICE)
		for group in data.split():
			try:
				self.findTerms(group)
			except Exception as e:
				# Best effort: one bad group must not stop the others.
				print(e)
Esempio n. 6
0
class BotlistProcessCategory:
    """Collect category groups and their terms from BOT_TERMS_SERVICE.

    ``terms_dict`` maps each category group name to the list of terms the
    service returned for that group.
    """

    def __init__(self):
        # Only constructed here; init() performs the actual connect.
        self.link_handler = EntityLinkHandler()
        # Data structure to hold the groups and their terms
        self.terms_dict = {}

    def init(self):
        """Open the link handler's connection."""
        self.link_handler.connect()

    def shutdown(self):
        """Close the link handler's connection and report it on stdout."""
        self.link_handler.closeConn()
        print("INFO: CLEANLY shutdown database connection")

    def reqURLData(self, requrl):
        """Fetch ``requrl`` and return the response body.

        The request carries ROVER_USER_AGENT as its user-agent header.
        Errors raised while opening or reading propagate to the caller.
        """
        opener = urllib2.build_opener()
        req = urllib2.Request(requrl)
        req.add_header('user-agent', ROVER_USER_AGENT)
        response = opener.open(req)
        try:
            return response.read()
        finally:
            # Close the response explicitly so the connection is released
            # even when read() fails, instead of relying on garbage collection.
            response.close()

    def findTerms(self, category_group):
        """ Perform a request for the terms of ``category_group``.

        The whitespace-separated response is stored as a list under
        ``self.terms_dict[category_group]``, replacing any earlier entry.
        """
        term_url = "%s?categorytype=%s" % (BOT_TERMS_SERVICE, category_group)
        data = self.reqURLData(term_url)
        self.terms_dict[category_group] = data.split()

    def buildTermSet(self):
        """ Connect to the bots category service and collect all categories and terms.

        The service is first queried without parameters; its
        whitespace-separated response is taken as the list of groups and
        findTerms() is called for each. A failure for one group is printed
        and skipped, while a failure of the initial request propagates.
        """
        # Get the category list
        data = self.reqURLData(BOT_TERMS_SERVICE)
        for group in data.split():
            try:
                self.findTerms(group)
            except Exception as e:
                # Best effort: one bad group must not stop the others.
                print(e)
Esempio n. 7
0
    def buildLinkData(self, drop_result_limit=MAX_LINKS_PROCESS):
        """Dump entity links to a tab-separated data file.

        Lists entity links through a new EntityLinkHandler, passing
        ``drop_result_limit`` as the result limit, and writes one line per
        link (mainUrl, urlTitle and keywords separated by tabs) to
        /home/bbrown/botlist_datadump.dat. Opening with mode "w" truncates
        an existing file. A link that cannot be formatted or written is
        skipped after printing a generic error message.

        Errors from opening the file, connecting or listing propagate to
        the caller once the cleanup below has run.
        """
        # Bind every name the ``finally`` clause reads BEFORE entering the
        # ``try``. Otherwise a failing open() or handler constructor would
        # be masked by a NameError raised from the cleanup code.
        fout = None
        entity_handler = None
        start_time = time.time()
        try:

            # Open output data file
            fout = open("/home/bbrown/botlist_datadump.dat", "w")
            entity_handler = EntityLinkHandler()
            entity_handler.connect()
            link_data = entity_handler.listEntityLinks(
                result_limit=drop_result_limit)
            for link in link_data:
                try:
                    fout.write("%s\t%s\t%s\n" %
                               (link.mainUrl, link.urlTitle, link.keywords))
                except Exception:
                    # Best effort: skip the offending link, keep dumping.
                    print("ERROR: generic error while retrieving data")
        finally:
            try:
                if fout is not None:
                    fout.close()
            finally:
                # Runs even if closing the file failed, so the handler
                # connection is not left open.
                if entity_handler is not None:
                    entity_handler.closeConn()
            end_time = time.time()
            # NOTE(review): ``diff`` is not read in the visible lines;
            # presumably it is reported further on -- confirm.
            diff = end_time - start_time
Esempio n. 8
0
class EntityLinkWebAnalysisJob:
    """Scan entity-link URLs and store the extracted page data.

    Wraps one EntityLinkHandler: ``init()`` connects it, ``processURLs()``
    does the work and ``shutdown()`` closes the connection.
    """

    def __init__(self):
        self.passed_ct = -1
        self.link_handler = EntityLinkHandler()

    def init(self):
        """Connect the link handler."""
        self.link_handler.connect()

    def shutdown(self):
        """Close the link handler's connection and announce it on stdout."""
        self.link_handler.closeConn()
        print("INFO: CLEANLY shutdown database connection")

    def processURLs(self):
        """Extract page data for pending links and record it.

        Links with ``process_count`` below 3 are listed (at most
        MAX_ENTITY_LINKS_ANALYTICS of them). For each link the page behind
        ``mainUrl`` is scanned; a truthy result is passed to the handler's
        ``updateWebAnalytics``. Errors on a single link are printed and the
        loop continues. A summary line with the handler's ``update_count``
        is printed at the end.
        """
        PROCESS_CT_ID = 3
        limit = MAX_ENTITY_LINKS_ANALYTICS
        handler = self.link_handler
        pending = handler.listEntityLinks(
            result_limit=limit,
            where_clause="where (process_count < %s)" % PROCESS_CT_ID)
        url_opener = urllib2.build_opener()
        print("INFO [%s]: requesting URL data MAX=%s"
              % (datetime.datetime.now(), limit))

        for node in pending:
            try:
                page = scan_url.extractPageData(url_opener, node.mainUrl)
                if not page:
                    # Nothing extracted, so there is nothing to record.
                    continue
                # Update the entity link to set web data analysis
                handler.updateWebAnalytics(node, page,
                                           PROCESS_STATUS_ANALYTICS)
            except Exception as err:
                print("ERR [%s]:processURLs url=%s"
                      % (datetime.datetime.now(), node.mainUrl))
                print(err)

        # quick summary
        print("INFO: links updated=%s" % handler.update_count)
Esempio n. 9
0
	def buildLinkData(self, drop_result_limit=MAX_LINKS_PROCESS):
		"""Dump entity links to a tab-separated data file.

		Lists entity links through a new EntityLinkHandler, passing
		``drop_result_limit`` as the result limit, and writes one line per
		link (mainUrl, urlTitle and keywords separated by tabs) to
		/home/bbrown/botlist_datadump.dat. Opening with mode "w" truncates
		an existing file. A link that cannot be formatted or written is
		skipped after printing a generic error message.

		Errors from opening the file, connecting or listing propagate to
		the caller once the cleanup below has run.
		"""
		# Bind every name the ``finally`` clause reads BEFORE entering the
		# ``try``. Otherwise a failing open() or handler constructor would
		# be masked by a NameError raised from the cleanup code.
		fout = None
		entity_handler = None
		start_time = time.time()
		try:

			# Open output data file
			fout = open("/home/bbrown/botlist_datadump.dat", "w")
			entity_handler = EntityLinkHandler()
			entity_handler.connect()
			link_data = entity_handler.listEntityLinks(result_limit=drop_result_limit)
			for link in link_data:
				try:
					fout.write("%s\t%s\t%s\n" % (link.mainUrl, link.urlTitle, link.keywords))
				except Exception:
					# Best effort: skip the offending link, keep dumping.
					print("ERROR: generic error while retrieving data")
		finally:
			try:
				if fout is not None:
					fout.close()
			finally:
				# Runs even if closing the file failed, so the handler
				# connection is not left open.
				if entity_handler is not None:
					entity_handler.closeConn()
			end_time = time.time()
			# NOTE(review): ``diff`` is not read in the visible lines;
			# presumably it is reported further on -- confirm.
			diff = end_time - start_time
Esempio n. 10
0
 def __init__(self):
     """Initialise instance state; no connection is opened here."""
     # NOTE(review): -1 looks like a "not counted yet" marker, and nothing
     # in the visible code reads this attribute -- confirm its meaning.
     self.passed_ct = -1
     # The handler is only constructed; this method does not call connect().
     self.link_handler = EntityLinkHandler()
Esempio n. 11
0
	def __init__(self):
		"""Create the link handler and an empty group-to-terms mapping."""
		self.link_handler = EntityLinkHandler()
		# Filled later: maps each group to its terms
		self.terms_dict = dict()
Esempio n. 12
0
	def __init__(self):
		"""Create the EntityLinkHandler stored on ``self.link_handler``."""
		# The handler is only constructed; this method does not call connect().
		self.link_handler = EntityLinkHandler()
Esempio n. 13
0
 def __init__(self):
     """Create the link handler and an empty group-to-terms mapping."""
     self.link_handler = EntityLinkHandler()
     # Filled later: maps each group to its terms
     self.terms_dict = dict()
Esempio n. 14
0
 def __init__(self):
     """Create the EntityLinkHandler stored on ``self.link_handler``."""
     # The handler is only constructed; this method does not call connect().
     self.link_handler = EntityLinkHandler()