def __init__(self, ids_thread_num=4, content_process_num=2, content_thread_num=8, **kwargs):
    """Initialize the journal spider's cross-process shared state.

    Args:
        ids_thread_num: number of threads used by the ID-fetching process.
        content_process_num: number of content-fetching processes to spawn.
        content_thread_num: threads per content-fetching process.
        **kwargs: must contain 'create_user_id' and 'create_user_name'.

    Raises:
        Exception: if a journal spider is already registered in SPIDERS_STATUS.
    """
    manager = Manager()
    # Spider bookkeeping / registration
    self.kw_id = spiders.journal_kw_id
    if SPIDERS_STATUS.get(self.kw_id):
        raise Exception('Journal spider is running!')
    self.TYPE = 'JOURNAL_SPIDER'
    self.id_process = None  # ID-fetching process object
    self.content_process = []  # content-fetching process objects
    self.ids_thread_num = ids_thread_num  # thread count for the ID process
    self.content_process_num = content_process_num  # content process count
    self.content_thread_num = content_thread_num  # threads per content process
    self.ids_queen = ProcessQueen(maxsize=-1)  # queue of article IDs awaiting crawl
    # BUG FIX: the original assigned ids_queen twice (copy-paste). The second
    # assignment was evidently meant to be the failed-IDs queue, matching the
    # science spider's __init__ which pairs ids_queen with failed_ids_queen.
    self.failed_ids_queen = ProcessQueen(maxsize=-1)  # queue of article IDs that failed
    self.page_Num = Value('i', -1, lock=True)  # total result pages (-1 = unknown)
    self.finished_page_Num = Value('i', 0, lock=True)  # pages completed
    self.finished_page_Num_locker = Lock()
    self.failed_page_Num = Value('i', 0, lock=True)  # pages that failed
    self.failed_page_Num_locker = Lock()
    # qsize() is inaccurate and unimplemented on macOS, so the queue size
    # is tracked manually in a shared counter.
    self.ids_queen_size = Value('i', 0, lock=True)
    self.ids_queen_size_locker = Lock()
    self.finished_num = Value('i', 0, lock=True)  # articles crawled successfully
    self.finished_num_locker = Lock()
    self.failed_num = Value('i', 0, lock=True)  # articles that failed
    self.failed_num_locker = Lock()
    # Integer status codes: -1 uninitialized, 0 not running, 1 running,
    # 2 paused, 3 finished, 4 terminated, 5 mixed state.
    self.status = Value('i', -1, lock=True)
    self.idsP_status = Value('i', -1, lock=True)  # ID process status (same codes, no 5)
    self.contentP_status = Value('i', -1, lock=True)  # content process status (same codes, no 5)
    self.create_user_id = kwargs['create_user_id']  # creator's user ID
    self.create_user_name = kwargs['create_user_name']  # creator's user name
    self.create_time = getFTime()  # spider creation timestamp
    self.start_time = None  # set when the spider is started
    self.page_sessionHelper = None
    self.ajax_sessionHelper = None
    self.journal_user_name = None
    # Cookies currently in use; a None value means we are waiting for the
    # user to supply a cookie.
    self.journal_cookies = manager.dict()
    self.journal_cookies['main'] = None
    SPIDERS_STATUS[self.kw_id] = self
def resume(self, idsP=True, contentP=True):
    """Resume the paused ID and/or content processes.

    A process may only be resumed from status 2 or 6; otherwise an
    Exception describing the invalid state is collected.

    Returns:
        list: Exception objects for each process that could not be resumed
        (empty on full success).
    """
    errors = []
    if idsP:
        if self.idsP_status.value in (2, 6):
            self.idsP_status.value = 1
        else:
            errors.append(Exception('IDS process status is invalided'))
    if contentP:
        if self.contentP_status.value in (2, 6):
            self.contentP_status.value = 1
        else:
            errors.append(Exception('Content process status is invalided'))
    self.start_time = getFTime()  # record (re)start time
    self.status.value = self.getStatus()  # recompute the aggregate status
    return errors
def start_assist(self):
    """(Re)start the assist content-fetching worker processes.

    No-op when the content processes are already running (status 1).
    Any previously tracked content processes are terminated first, then a
    fresh set of workers is spawned.
    """
    if self.contentP_status.value == 1:
        return
    self.start_time = getFTime()
    # Step 1: terminate any existing content processes.
    for running in self.content_process:
        running.terminate()
    self.content_process = []
    # Step 2: spawn fresh content workers.
    for idx in range(self.content_process_num):
        worker_name = '%s PUBMED_ASSIST_CONTEND_PROCESS-%02d' % (self.common_tag, int(idx + 1))
        worker = _pubmedContendWorker(kw_id=self.kw_id,
                                      name=worker_name,
                                      project=self.project)
        worker.start()
        self.content_process.append(worker)
    self.contentP_status.value = 1
def start(self):
    """Launch the ID-fetching process and the content-fetching processes."""
    self.start_time = getFTime()
    self.status.value = 1  # mark the spider as running
    # Process that collects article IDs.
    ids_worker = _scienceIDWorker(kw_id=self.kw_id,
                                  name='%s SCIENCE_IDS_PROCESS-MAIN' % self.common_tag,
                                  thread_num=self.ids_thread_num,
                                  page_size=self.page_size)
    ids_worker.start()
    self.id_process = ids_worker
    self.idsP_status.value = 1
    # Processes that download article content.
    for idx in range(self.content_process_num):
        worker_name = '%s SCIENCE_CONTEND_PROCESS-%02d' % (self.common_tag, int(idx + 1))
        worker = _scienceContendWorker(kw_id=self.kw_id, name=worker_name)
        worker.start()
        self.content_process.append(worker)
    self.contentP_status.value = 1
def start(self):
    """Launch the journal ID-fetching process and content-fetching processes."""
    self.start_time = getFTime()
    tag = "KWID=%03d uid=%s uname=%s" % (int(self.kw_id),
                                         self.create_user_id,
                                         self.create_user_name)
    self.status.value = 1  # mark the spider as running
    # Process that collects article IDs.
    ids_worker = _journalIDWorker(kw_id=self.kw_id,
                                  name='%s JOURNAL_IDS_PROCESS-MAIN' % tag,
                                  thread_num=self.ids_thread_num)
    ids_worker.start()
    self.id_process = ids_worker
    self.idsP_status.value = 1
    # Processes that download article content.
    for idx in range(self.content_process_num):
        worker_name = '%s JOURNAL_CONTEND_PROCESS-%02d' % (tag, int(idx + 1))
        worker = _journalContendWorker(kw_id=self.kw_id,
                                       name=worker_name,
                                       thread_num=self.content_thread_num)
        worker.start()
        self.content_process.append(worker)
    self.contentP_status.value = 1
def __init__(self, ids_thread_num=4, project=None, content_process_num=2, content_thread_num=8, page_size=spiders.default_science_pagesize, **kwargs):
    """Initialize the science spider's cross-process shared state.

    Args:
        ids_thread_num: number of threads used by the ID-fetching process.
        project: optional fixed project for the crawl.
        content_process_num: number of content-fetching processes to spawn.
        content_thread_num: threads per content-fetching process.
        page_size: result page size used when paging through IDs.
        **kwargs: must contain 'kw_id', 'create_user_id' and 'create_user_name'.

    Raises:
        Exception: if the keyword is already registered (and not special),
            or if the keyword name lookup fails.
    """
    # Spider bookkeeping / registration
    self.kw_id = kwargs['kw_id']
    if SPIDERS_STATUS.get(self.kw_id) and self.kw_id not in special_kw:
        raise Exception('current kw has been existed')
    try:
        if self.kw_id not in special_kw:
            kw_json = SpiderKeyWord.objects.filter(id=self.kw_id).values()[0]
            self.kw_name = kw_json['name']
        else:
            self.kw_name = special_kw.get(self.kw_id)
    except Exception as e:
        # Chain the original exception so the root cause and traceback
        # are preserved (the original re-raise discarded them).
        raise Exception('查询关键词名字失败' + str(e)) from e
    self.TYPE = 'SCIENCE_SPIDER'
    self.id_process = None  # ID-fetching process object
    self.content_process = []  # content-fetching process objects
    self.ids_thread_num = ids_thread_num  # thread count for the ID process
    self.content_process_num = content_process_num  # content process count
    self.content_thread_num = content_thread_num  # threads per content process
    self.ids_queen = ProcessQueen(maxsize=-1)  # queue of article IDs awaiting crawl
    self.failed_ids_queen = ProcessQueen(maxsize=-1)  # queue of article IDs that failed
    self.page_Num = Value('i', -1, lock=True)  # total result pages (-1 = unknown)
    self.page_size = page_size  # result page size
    self.finished_page_Num = Value('i', 0, lock=True)  # pages completed
    self.finished_page_Num_locker = Lock()
    self.failed_page_Num = Value('i', 0, lock=True)  # pages that failed
    self.failed_page_Num_locker = Lock()
    # qsize() is inaccurate and unimplemented on macOS, so the queue size
    # is tracked manually in a shared counter.
    self.ids_queen_size = Value('i', 0, lock=True)
    self.ids_queen_size_locker = Lock()
    self.finished_num = Value('i', 0, lock=True)  # articles crawled successfully
    self.finished_num_locker = Lock()
    self.failed_num = Value('i', 0, lock=True)  # articles that failed
    self.failed_num_locker = Lock()
    # Integer status codes: -1 uninitialized, 0 not running, 1 running,
    # 2 paused, 3 finished, 4 terminated, 5 mixed state.
    self.status = Value('i', -1, lock=True)
    self.idsP_status = Value('i', -1, lock=True)  # ID process status (same codes, no 5)
    self.contentP_status = Value('i', -1, lock=True)  # content process status (same codes, no 5)
    self.create_user_id = kwargs['create_user_id']  # creator's user ID
    self.create_user_name = kwargs['create_user_name']  # creator's user name
    self.create_time = getFTime()  # spider creation timestamp
    self.start_time = None  # set when the spider is started
    self.project = project  # fixed project, if any
    self.common_tag = "KWID=%03d uid=%s uname=%s" % (int(
        self.kw_id), self.create_user_id, self.create_user_name)
    # Query-related state
    self.lastQueryKey = -1
    SPIDERS_STATUS[self.kw_id] = self