def Reload(self, filename):
    '''Rebuild the program and its schedule from ``filename``.

    An empty string yields an empty ``Program`` whose schedule is created
    with a single ``Processor(0, 0.9, 5)``. Any other value is handed to
    ``Program`` and ``LoadProcessors``; the resulting schedule is built
    from ``self.processors`` and reset with ``SetToDefault``.
    '''
    if filename == "":
        self.program = Program()
        default_processors = [Processor(0, 0.9, 5)]
        self.schedule = Schedule(self.program, default_processors)
    else:
        self.program = Program(filename)
        self.LoadProcessors(filename)
        # TODO: exception in case of any errors
        self.schedule = Schedule(self.program, self.processors)
        self.schedule.SetToDefault()
def __init__(self, v, k, m):
    '''Store the vertex ``v`` together with its version and processor.

    ``k`` is kept as-is when its class is named ``Version``; otherwise it
    is wrapped as ``Version(v, k)``. ``m`` is kept as-is when its class is
    named ``Processor``; otherwise it is wrapped as ``Processor(m)``.
    '''
    self.v = v
    # The checks compare class *names*, not classes.
    has_version = k.__class__.__name__ == "Version"
    self.k = k if has_version else Version(v, k)
    has_processor = m.__class__.__name__ == "Processor"
    self.m = m if has_processor else Processor(m)
def _getProcessor(self, proc=None):
    '''Move a processor into ``self.processors`` and return it.

    With ``proc`` given, the matching entry is taken out of
    ``self.emptyprocessors`` (``list.index`` raises ``ValueError`` when it
    is absent). Without ``proc``, the first empty processor is reused if
    there is one; otherwise a new ``Processor`` is created, numbered
    ``GetProcessorsWithoutDoubles() + 1`` and copying reliability and
    speed from ``self.availableProcessors[0]``.
    '''
    if proc is None:
        if len(self.emptyprocessors) > 0:
            chosen = self.emptyprocessors[0]
            # Rebind to a new list rather than mutating the old one.
            self.emptyprocessors = self.emptyprocessors[1:]
        else:
            number = self.GetProcessorsWithoutDoubles() + 1
            template = self.availableProcessors[0]
            chosen = Processor(number, template.reliability, template.speed)
        self.processors.append(chosen)
        return chosen

    # TODO: check errors
    position = self.emptyprocessors.index(proc)
    chosen = self.emptyprocessors.pop(position)
    self.processors.append(chosen)
    return chosen
def LoadProcessors(self, filename):
    '''Parse the XML file to get the specs of the processors.

    Every top-level ``<program>`` element is read: its ``tdir`` attribute
    is stored as an int in ``self.tdir`` and its ``rdir`` attribute as a
    float in ``self.rdir``. Each direct ``<processor>`` child contributes
    ``Processor(0, reliability, speed)`` to ``self.processors``, using its
    ``reliability`` (float) and ``speed`` (int) attributes.

    Raises ``ValueError`` when one of these attributes is missing or not
    numeric, plus whatever ``open`` and the XML parser raise.
    '''
    # The context manager closes the file even when parsing fails.
    with open(filename, "r") as f:
        dom = xml.dom.minidom.parse(f)
    for node in dom.childNodes:
        # Top-level comments and doctype nodes have no tagName, so check
        # the node type before looking at the tag.
        if node.nodeType != node.ELEMENT_NODE or node.tagName != "program":
            continue
        self.tdir = int(node.getAttribute("tdir"))
        self.rdir = float(node.getAttribute("rdir"))
        # Parse processors
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName == "processor":
                speed = int(child.getAttribute("speed"))
                rel = float(child.getAttribute("reliability"))
                self.processors.append(Processor(0, rel, speed))
def Randomize(self):
    '''Make a random schedule.

    A random number of processors (between 1 and the number of program
    vertices) is obtained through ``_getProcessor``. The program's vertex
    and edge lists are then rebuilt one vertex at a time in
    ``OrderedVertices`` order. Each vertex is first placed on a fictional
    processor ``-1``, then random (processor, position) pairs are drawn
    until ``TryMoveVertex`` accepts one. Finally ``_delEmptyProc`` is
    called for every processor.
    '''
    #TODO: randomize versions and processors too
    self.vertices = {}
    self.processors = []
    self.emptyprocessors = []
    count = random.randint(1, len(self.program.vertices))
    by_number = {}
    for _ in range(count):
        created = self._getProcessor()
        self.vertices[created.number] = []
        by_number[created.number] = created

    # Use fictional processor number -1 to check correctness of the schedule.
    fict = Processor(-1)
    self.vertices[-1] = []
    ordered = self.program.OrderedVertices()
    all_edges = list(self.program.edges)
    self.program.vertices = []
    self.program.edges = []
    for vertex in ordered:
        # Grow the program by one vertex and keep only the edges whose
        # endpoints are both already present.
        self.program.vertices.append(vertex)
        present = self.program.vertices
        self.program.edges = [
            edge for edge in all_edges
            if edge.source in present and edge.destination in present
        ]
        self.program._buildData()

        placed = ScheduleVertex(vertex, vertex.versions[0], fict)
        self.currentVersions[vertex.number] = [placed]
        self.vertices[-1] = [placed]
        self._succCache = {}

        # Draw random targets until one is accepted.
        accepted = False
        while not accepted:
            target = random.randint(1, count)
            position = random.randint(0, len(self.vertices[target]))
            if self.TryMoveVertex(placed, 0, by_number[target], position) == True:
                self.MoveVertex(placed, 0, by_number[target], position)
                accepted = True

    for processor in self.processors:
        self._delEmptyProc(processor)
def start_spider_core(self):
    """Connect to Redis and MySQL, start every module, then supervise them.

    Redis is tried up to three times, five seconds apart. If Redis or
    MySQL cannot be reached, or the account login fails, the method
    returns without starting anything further. Once all modules are
    running the method never returns: it loops forever, calling
    ``check_and_restart`` on the modules every 180 seconds.
    """
    if log.isEnabledFor(logging.INFO):
        log.info('Spider 开始启动')

    try:
        # Create the Redis connection.
        redis_connect_retry_times = 3
        while redis_connect_retry_times > 0:
            try:
                self.redis_connection = redis.StrictRedis(
                    host=self.redis_host, port=self.redis_port,
                    db=self.redis_db, password=self.redis_password)
                ping = self.redis_connection.ping()
            except redis.RedisError:
                # redis-py raises when the server is unreachable. Treat
                # that as a failed attempt so the retry loop really runs.
                ping = False

            if ping is True:
                if log.isEnabledFor(logging.INFO):
                    log.info('Redis 服务器连接成功')
                break

            if log.isEnabledFor(logging.INFO):
                log.info('Redis 服务器连接失败')
            redis_connect_retry_times -= 1
            # No point in waiting after the last attempt.
            if redis_connect_retry_times > 0:
                time.sleep(5)

        # Give up if the connection could not be established.
        if redis_connect_retry_times <= 0:
            raise Exception('Redis connection failed after all retries')

        # Create the MySQL connection.
        self.mysql_connection = pymysql.connect(
            host=self.mysql_host, user=self.mysql_username,
            passwd=self.mysql_password, db=self.mysql_database,
            charset=self.mysql_charset)
    except Exception as e:
        # NOTE(review): this message is also logged for MySQL failures.
        if log.isEnabledFor(logging.ERROR):
            log.error('Redis 启动失败')
            log.exception(e)
        return

    # Create the response buffer queue.
    self.response_buffer = ResponseBuffer()

    # Start the account manager and log in.
    self.account_manager = AccountManager(
        self.login_token, self.password,
        self.is_login_by_cookie, self.z_c0)
    is_login = self.account_manager.login()
    if not is_login:
        return

    # Start the Downloader.
    self.downloader = Downloader(
        self.redis_connection, self.response_buffer, self.account_manager,
        self.is_proxy_service_enable, self.session_pool_size,
        self.download_thread_num, self.network_retry_times,
        self.connect_timeout, self.download_interval)
    self.downloader.start_downloader()

    # Start the Scheduler.
    self.schedule = Scheduler(self.redis_connection, self.url_rate)
    self.schedule.start()

    # Start DataPersistent.
    self.dataPersistent = DataPersistent(
        self.persistent_cache_size,
        self.follow_relation_persistent_cache_size,
        self.mysql_connection, self.redis_connection)
    self.dataPersistent.start_data_persistent()

    # Start the Processor and feed it the initial tokens.
    self.processor = Processor(
        self.process_thread_num, self.is_parser_following_list,
        self.is_parser_follower_list, self.is_parser_follow_relation,
        self.redis_connection, self.response_buffer)
    self.processor.start_processor()
    self.processor.load_init_data(self.init_token)

    # Start the email service.
    if self.is_email_service_enable is True:
        self.email_service = EmailService(
            self.smtp_server_host, self.smtp_server_port,
            self.smtp_server_password, self.smtp_from_addr,
            self.smtp_to_addr, self.smtp_email_header,
            self.smtp_send_interval, self.dataPersistent)
        self.email_service.start_email_service()
        self.email_service.send_message('Spider 启动完毕')

    if log.isEnabledFor(logging.INFO):
        log.info('Spider 启动完毕')

    # Module health checks. The Scheduler is not checked here.
    while True:
        self.downloader.check_and_restart()
        if self.is_email_service_enable is True:
            self.email_service.check_and_restart()
        self.dataPersistent.check_and_restart()
        self.processor.check_and_restart()

        # Interval between checks.
        time.sleep(180)
        gc.collect()
class SpiderCore:
    """Entry point that wires the spider's modules together.

    The constructor sets built-in defaults and then overrides them from
    ``Core/Config/SpiderCoreConfig.conf``. ``start_spider_core`` opens the
    Redis and MySQL connections, starts every module and then supervises
    them forever.
    """

    def __init__(self):
        # Default configuration.
        # Downloader module.
        self.is_proxy_service_enable = False
        self.session_pool_size = 20
        self.download_thread_num = 10
        self.network_retry_times = 3
        self.connect_timeout = 30
        self.download_interval = 3

        # Processor module.
        self.process_thread_num = 2
        self.is_parser_following_list = True
        self.is_parser_follower_list = False
        self.is_parser_follow_relation = False

        # Scheduler module.
        self.url_rate = 8

        # DataPersistent module.
        self.persistent_cache_size = 1000
        self.follow_relation_persistent_cache_size = 1000

        # Email service.
        self.is_email_service_enable = False
        self.smtp_server_host = ''
        self.smtp_server_port = 25
        self.smtp_server_password = ''
        self.smtp_from_addr = ''
        self.smtp_to_addr = ''
        self.smtp_email_header = ''
        self.smtp_send_interval = 3600

        # Redis database.
        self.redis_host = ''
        self.redis_port = 6379
        self.redis_db = 0
        self.redis_password = ''

        # MySQL database.
        self.mysql_host = ''
        self.mysql_username = ''
        self.mysql_password = ''
        self.mysql_database = ''
        self.mysql_charset = 'utf8'

        # Zhihu account.
        self.is_login_by_cookie = True
        self.z_c0 = ''
        self.login_token = ''
        self.password = ''

        # Initial tokens.
        self.init_token = []

        # Load the user-defined configuration over the defaults.
        self.load_config()

        # Module instances, created by start_spider_core.
        self.redis_connection = None
        self.mysql_connection = None
        self.response_buffer = None
        self.account_manager = None
        self.downloader = None
        self.processor = None
        self.schedule = None
        self.dataPersistent = None
        self.email_service = None

    def _connect_redis(self, retries=3, retry_delay=5):
        """Open the Redis connection; return True once a ping succeeds.

        Makes up to ``retries`` attempts, ``retry_delay`` seconds apart.
        """
        attempts_left = retries
        while attempts_left > 0:
            try:
                self.redis_connection = redis.StrictRedis(
                    host=self.redis_host, port=self.redis_port,
                    db=self.redis_db, password=self.redis_password)
                reachable = self.redis_connection.ping() is True
            except redis.RedisError:
                # redis-py raises when the server is unreachable. Treat
                # that as a failed attempt so the retry loop really runs.
                reachable = False

            if reachable:
                if log.isEnabledFor(logging.INFO):
                    log.info('Redis 服务器连接成功')
                return True

            if log.isEnabledFor(logging.INFO):
                log.info('Redis 服务器连接失败')
            attempts_left -= 1
            # No point in waiting after the last attempt.
            if attempts_left > 0:
                time.sleep(retry_delay)
        return False

    # Start the spider.
    def start_spider_core(self):
        """Connect to Redis and MySQL, start every module, then supervise.

        Returns early, after logging, when Redis or MySQL cannot be
        reached or when the account login fails. Otherwise it never
        returns: it loops forever, calling ``check_and_restart`` on the
        modules every 180 seconds.
        """
        if log.isEnabledFor(logging.INFO):
            log.info('Spider 开始启动')

        try:
            # Create the Redis connection; give up if it fails.
            if not self._connect_redis():
                raise Exception('Redis connection failed after all retries')

            # Create the MySQL connection.
            self.mysql_connection = pymysql.connect(
                host=self.mysql_host, user=self.mysql_username,
                passwd=self.mysql_password, db=self.mysql_database,
                charset=self.mysql_charset)
        except Exception as e:
            # NOTE(review): this message is also logged for MySQL failures.
            if log.isEnabledFor(logging.ERROR):
                log.error('Redis 启动失败')
                log.exception(e)
            return

        # Create the response buffer queue.
        self.response_buffer = ResponseBuffer()

        # Start the account manager and log in.
        self.account_manager = AccountManager(
            self.login_token, self.password,
            self.is_login_by_cookie, self.z_c0)
        is_login = self.account_manager.login()
        if not is_login:
            return

        # Start the Downloader.
        self.downloader = Downloader(
            self.redis_connection, self.response_buffer,
            self.account_manager, self.is_proxy_service_enable,
            self.session_pool_size, self.download_thread_num,
            self.network_retry_times, self.connect_timeout,
            self.download_interval)
        self.downloader.start_downloader()

        # Start the Scheduler.
        self.schedule = Scheduler(self.redis_connection, self.url_rate)
        self.schedule.start()

        # Start DataPersistent.
        self.dataPersistent = DataPersistent(
            self.persistent_cache_size,
            self.follow_relation_persistent_cache_size,
            self.mysql_connection, self.redis_connection)
        self.dataPersistent.start_data_persistent()

        # Start the Processor and feed it the initial tokens.
        self.processor = Processor(
            self.process_thread_num, self.is_parser_following_list,
            self.is_parser_follower_list, self.is_parser_follow_relation,
            self.redis_connection, self.response_buffer)
        self.processor.start_processor()
        self.processor.load_init_data(self.init_token)

        # Start the email service.
        if self.is_email_service_enable is True:
            self.email_service = EmailService(
                self.smtp_server_host, self.smtp_server_port,
                self.smtp_server_password, self.smtp_from_addr,
                self.smtp_to_addr, self.smtp_email_header,
                self.smtp_send_interval, self.dataPersistent)
            self.email_service.start_email_service()
            self.email_service.send_message('Spider 启动完毕')

        if log.isEnabledFor(logging.INFO):
            log.info('Spider 启动完毕')

        # Module health checks. The Scheduler is not checked here.
        while True:
            self.downloader.check_and_restart()
            if self.is_email_service_enable is True:
                self.email_service.check_and_restart()
            self.dataPersistent.check_and_restart()
            self.processor.check_and_restart()

            # Interval between checks.
            time.sleep(180)
            gc.collect()

    @staticmethod
    def _parse_init_tokens(raw_tokens):
        """Split a comma-separated token string, dropping blank entries."""
        stripped = (str(token).strip() for token in raw_tokens.split(','))
        return [token for token in stripped if token]

    # Load the user-defined configuration.
    def load_config(self):
        """Override the defaults with the ``spider_core`` config section.

        Reads ``Core/Config/SpiderCoreConfig.conf``. Flag options are true
        only when their integer value is exactly 1. Tokens from
        ``initToken`` are appended to ``self.init_token``; blank entries
        are skipped. ``configparser`` errors for a missing section or
        option, and ``ValueError`` for non-integer values, propagate.
        """
        section = "spider_core"
        config = configparser.ConfigParser()
        config.read("Core/Config/SpiderCoreConfig.conf", encoding="utf8")

        def text(option):
            return config.get(section, option)

        def number(option):
            return int(config.get(section, option))

        def flag(option):
            return number(option) == 1

        # Downloader module.
        self.is_proxy_service_enable = flag('isProxyServiceEnable')
        self.session_pool_size = number('sessionPoolSize')
        self.download_thread_num = number('downloadThreadNum')
        self.network_retry_times = number('networkRetryTimes')
        self.connect_timeout = number('connectTimeout')
        self.download_interval = number('downloadInterval')

        # Processor module.
        self.process_thread_num = number('processThreadNum')
        self.is_parser_following_list = flag('isParserFollowingList')
        self.is_parser_follower_list = flag('isParserFollowerList')
        self.is_parser_follow_relation = flag('isParserFollowRelation')

        # Scheduler module.
        self.url_rate = number('urlRate')

        # DataPersistent module.
        self.persistent_cache_size = number('persistentCacheSize')
        self.follow_relation_persistent_cache_size = number(
            'followRelationPersistentCacheSize')

        # Email service.
        self.is_email_service_enable = flag('isEmailServiceEnable')
        self.smtp_server_host = text('smtpServerHost')
        self.smtp_server_port = number('smtpServerPort')
        self.smtp_server_password = text('smtpServerPassword')
        self.smtp_from_addr = text('smtpFromAddr')
        self.smtp_to_addr = text('smtpToAddr')
        self.smtp_email_header = text('smtpEmailHeader')
        self.smtp_send_interval = number('smtpSendInterval')

        # Redis database.
        self.redis_host = text('redisHost')
        self.redis_port = number('redisPort')
        self.redis_db = number('redisDB')
        self.redis_password = text('redisPassword')

        # MySQL database.
        self.mysql_host = text('mysqlHost')
        self.mysql_username = text('mysqlUsername')
        self.mysql_password = text('mysqlPassword')
        self.mysql_database = text('mysqlDatabase')
        self.mysql_charset = text('mysqlCharset')

        # Zhihu account.
        self.is_login_by_cookie = flag('isLoginByCookie')
        self.z_c0 = text('z_c0')
        self.login_token = text('loginToken')
        self.password = text('password')

        # Initial tokens. Blank entries are skipped, so an empty option
        # or a trailing comma does not add '' as a token.
        self.init_token.extend(self._parse_init_tokens(text('initToken')))

        if log.isEnabledFor(logging.INFO):
            log.info('配置文件读取并配置完毕')
def ReplaceProcessor(self, tasks):
    '''Rebuild the schedule so that one processor holds exactly ``tasks``.

    Used for crossover in the genetic algorithm. The schedule is reset,
    every item of ``tasks`` is placed on a fresh processor, and each
    remaining vertex of the previous schedule is re-inserted in
    ``OrderedVertices`` order. A vertex goes to the same position on the
    processor that replaces its old one when ``TryMoveVertex`` allows it,
    otherwise onto a spare processor.

    ``tasks`` is iterated as schedule vertices: the code reads ``t.v``,
    ``t.v.versions`` and ``t.v.number`` from each item.
    '''
    # NOTE(review): the block nesting below was reconstructed from
    # whitespace-collapsed text -- confirm it against the original file.

    # Remember the previous layout before wiping the schedule state.
    oldverts = self.vertices
    ordered = self.program.OrderedVertices()
    self.processors = []
    self.emptyprocessors = []
    self.vertices = {}
    self.currentVersions = {}
    # First processor: it receives the given tasks.
    p = self._getProcessor()
    self.vertices[p.number] = []
    # `backup` is only referenced by the commented-out code below.
    backup = [[v for v in self.program.vertices], [e for e in self.program.edges]]
    self.program._buildData()
    #self.program.vertices = []
    #self.program.edges = []
    # Place every task on p, using the first version of its vertex.
    for t in tasks:
        s = ScheduleVertex(t.v, t.v.versions[0], p)
        self.vertices[p.number].append(s)
        self.currentVersions[t.v.number] = [s]
    # Give every vertex that was not in tasks an empty version list.
    for v in self.program.vertices:
        if not v.number in self.currentVersions:
            self.currentVersions[v.number] = []
    #    self.program.vertices.append(t.v)
    self.Consistency()
    #for e in backup[1]:
    #    if e.source in self.program.vertices and e.destination in self.program.vertices:
    #        self.program.edges.append(e)
    # fict is a placeholder processor numbered -1; each vertex is created
    # on it before being moved. spare takes vertices that cannot keep
    # their old position.
    fict = Processor(-1)
    spare = self._getProcessor()
    self.vertices[-1] = []
    self.vertices[spare.number] = []
    # Maps an old processor (v.m) to the processor that replaces it.
    newprocs = {}
    # Flatten the previous schedule into a single list.
    allverts = []
    for m in oldverts.keys():
        allverts += oldverts[m]
    for vp in ordered:
        for v in [t for t in allverts if t.v == vp]:
            # Skip vertices that were already placed as part of tasks.
            if [t for t in tasks if t.v == v.v] == []:
                if v.m in newprocs:
                    p = newprocs[v.m]
                else:
                    p = self._getProcessor()
                    self.vertices[p.number] = []
                    newprocs[v.m] = p
                # Position the vertex had on its old processor.
                i = oldverts[v.m].index(v)
                #self.program.vertices.append(v.v)
                #self.program.edges = []
                #for e in backup[1]:
                #    if e.source in self.program.vertices and e.destination in self.program.vertices:
                #        self.program.edges.append(e)
                #self.program._buildData()
                s = ScheduleVertex(v.v, v.v.versions[0], fict)
                self.currentVersions[v.v.number] = [s]
                self.vertices[-1] = [s]
                self._succCache = {}
                if self.TryMoveVertex(s, 0, p, i) == True:
                    #print ("Applying operation 1", str(s), 0, p, i)
                    self.MoveVertex(s, 0, p, i)
                else:
                    if len(self.vertices[spare.number]) == 0:
                        #print ("Applying operation 2", str(s), 0, spare, 0)
                        self.MoveVertex(s, 0, spare, 0)
                    else:
                        # Take the first position on spare that is accepted.
                        # NOTE(review): if none is accepted the vertex is
                        # not moved here -- confirm that cannot happen.
                        for j in range(len(self.vertices[spare.number]) + 1):
                            if self.TryMoveVertex(s, 0, spare, j) == True:
                                #print ("Applying operation 3", str(s), 0, spare, j)
                                self.MoveVertex(s, 0, spare, j)
                                break
    self.emptyprocessors = []
    #print(self)
    #print("++++++++++++")
    # NOTE(review): if _delEmptyProc removes entries from self.processors,
    # this loop mutates the list it iterates -- verify.
    for m in self.processors:
        self._delEmptyProc(m)
    self.Consistency()