コード例 #1
0
    def __init__(self,
                 ids_thread_num=4,
                 content_process_num=2,
                 content_thread_num=8,
                 **kwargs):
        """Set up the journal spider's cross-process shared state.

        Args:
            ids_thread_num: number of threads inside the ID-fetching process.
            content_process_num: number of content-fetching processes.
            content_thread_num: number of threads per content process.
            **kwargs: must provide 'create_user_id' and 'create_user_name'.

        Raises:
            Exception: when a journal spider is already registered as running.
        """
        manager = Manager()
        # Spider status bookkeeping.
        self.kw_id = spiders.journal_kw_id
        if SPIDERS_STATUS.get(self.kw_id):
            raise Exception('Journal spider is running!')
        self.TYPE = 'JOURNAL_SPIDER'
        self.id_process = None  # ID-fetching process handle
        self.content_process = []  # content-fetching process handles
        self.ids_thread_num = ids_thread_num  # thread count for the ID process
        self.content_process_num = content_process_num  # number of content processes
        self.content_thread_num = content_thread_num  # threads per content process

        self.ids_queen = ProcessQueen(maxsize=-1)  # queue of article IDs awaiting crawl
        # BUG FIX: the original assigned self.ids_queen twice (the second
        # assignment discarded the first queue).  Mirroring the science
        # spider's __init__, the second queue is the failed-ID queue.
        self.failed_ids_queen = ProcessQueen(maxsize=-1)  # queue of article IDs that failed
        self.page_Num = Value('i', -1, lock=True)  # total page count
        self.finished_page_Num = Value('i', 0, lock=True)  # pages finished
        self.finished_page_Num_locker = Lock()
        self.failed_page_Num = Value('i', 0, lock=True)  # pages failed
        self.failed_page_Num_locker = Lock()

        # qsize() is unreliable and unimplemented on macOS, so the queue size
        # is tracked manually in this shared counter.
        self.ids_queen_size = Value('i', 0, lock=True)
        self.ids_queen_size_locker = Lock()

        self.finished_num = Value('i', 0, lock=True)  # completed crawls
        self.finished_num_locker = Lock()

        self.failed_num = Value('i', 0, lock=True)  # failed crawls
        self.failed_num_locker = Lock()

        # Status codes (int): -1 uninitialized, 0 not running, 1 running,
        # 2 paused, 3 finished, 4 terminated, 5 mixed state.
        self.status = Value('i', -1, lock=True)
        self.idsP_status = Value('i', -1, lock=True)  # ID process status (codes above)
        self.contentP_status = Value('i', -1, lock=True)  # content process status (codes above)

        self.create_user_id = kwargs['create_user_id']  # creator's user ID
        self.create_user_name = kwargs['create_user_name']  # creator's display name
        self.create_time = getFTime()  # spider creation time
        self.start_time = None  # spider launch time (set when started)
        self.page_sessionHelper = None
        self.ajax_sessionHelper = None
        self.journal_user_name = None
        # Cookies currently in use; a None value means the spider is waiting
        # for the user to supply a cookie.
        self.journal_cookies = manager.dict()
        self.journal_cookies['main'] = None

        SPIDERS_STATUS[self.kw_id] = self
コード例 #2
0
 def resume(self, idsP=True, contentP=True):
     """Resume the paused ID and/or content processes.

     Returns:
         A list of Exception objects, one per process whose status did not
         allow resuming; empty on full success.
     """
     errors = []
     # Only states 2 and 6 are resumable; anything else is reported.
     if idsP:
         if self.idsP_status.value in (2, 6):
             self.idsP_status.value = 1
         else:
             errors.append(Exception('IDS process status is invalided'))
     if contentP:
         if self.contentP_status.value in (2, 6):
             self.contentP_status.value = 1
             self.start_time = getFTime()  # record the (re)start time
         else:
             errors.append(Exception('Content process status is invalided'))
     self.status.value = self.getStatus()
     return errors
コード例 #3
0
ファイル: PubmedSpider.py プロジェクト: izhangxm/cofco_b
    def start_assist(self):
        """Restart the content-fetching processes in assist mode.

        No-op when the content processes are already running (status 1).
        """
        if self.contentP_status.value == 1:
            return
        self.start_time = getFTime()

        # Step 1: tear down any existing content processes.
        for worker in self.content_process:
            worker.terminate()
        self.content_process = []

        # Step 2: spawn a fresh batch of content-fetching processes.
        for idx in range(1, self.content_process_num + 1):
            proc_name = '%s PUBMED_ASSIST_CONTEND_PROCESS-%02d' % (
                self.common_tag, idx)
            worker = _pubmedContendWorker(kw_id=self.kw_id,
                                          name=proc_name,
                                          project=self.project)
            worker.start()
            self.content_process.append(worker)
        self.contentP_status.value = 1
コード例 #4
0
    def start(self):
        """Launch the ID-fetching process and the content-fetching processes."""
        self.start_time = getFTime()
        self.status.value = 1  # mark the spider as running

        # Spawn the single process that collects article IDs.
        id_worker = _scienceIDWorker(
            kw_id=self.kw_id,
            name='%s SCIENCE_IDS_PROCESS-MAIN' % self.common_tag,
            thread_num=self.ids_thread_num,
            page_size=self.page_size)
        id_worker.start()
        self.id_process = id_worker
        self.idsP_status.value = 1

        # Spawn the pool of content-fetching processes.
        for idx in range(1, self.content_process_num + 1):
            worker = _scienceContendWorker(
                kw_id=self.kw_id,
                name='%s SCIENCE_CONTEND_PROCESS-%02d' % (self.common_tag, idx))
            worker.start()
            self.content_process.append(worker)
        self.contentP_status.value = 1
コード例 #5
0
    def start(self):
        """Launch the journal ID-fetching and content-fetching processes."""
        self.start_time = getFTime()
        tag = "KWID=%03d uid=%s uname=%s" % (
            int(self.kw_id), self.create_user_id, self.create_user_name)
        self.status.value = 1  # mark the spider as running

        # Spawn the single process that collects article IDs.
        id_worker = _journalIDWorker(
            kw_id=self.kw_id,
            name='%s JOURNAL_IDS_PROCESS-MAIN' % tag,
            thread_num=self.ids_thread_num)
        id_worker.start()
        self.id_process = id_worker
        self.idsP_status.value = 1

        # Spawn the pool of content-fetching processes.
        for idx in range(1, self.content_process_num + 1):
            worker = _journalContendWorker(
                kw_id=self.kw_id,
                name='%s JOURNAL_CONTEND_PROCESS-%02d' % (tag, idx),
                thread_num=self.content_thread_num)
            worker.start()
            self.content_process.append(worker)
        self.contentP_status.value = 1
コード例 #6
0
    def __init__(self,
                 ids_thread_num=4,
                 project=None,
                 content_process_num=2,
                 content_thread_num=8,
                 page_size=spiders.default_science_pagesize,
                 **kwargs):
        """Set up the science spider's cross-process shared state.

        Args:
            ids_thread_num: number of threads inside the ID-fetching process.
            project: optional fixed project for the content workers.
            content_process_num: number of content-fetching processes.
            content_thread_num: number of threads per content process.
            page_size: page size used when fetching ID listings.
            **kwargs: must provide 'kw_id', 'create_user_id' and
                'create_user_name'.

        Raises:
            Exception: when a non-special keyword is already registered, or
                when the keyword name cannot be looked up.
        """
        # Spider status bookkeeping.
        self.kw_id = kwargs['kw_id']
        # Idiom fix: 'x not in y' instead of 'not x in y'.
        if SPIDERS_STATUS.get(self.kw_id) and self.kw_id not in special_kw:
            raise Exception('current kw has been existed')
        try:
            if self.kw_id not in special_kw:
                kw_json = SpiderKeyWord.objects.filter(
                    id=self.kw_id).values()[0]
                self.kw_name = kw_json['name']
            else:
                self.kw_name = special_kw.get(self.kw_id)
        except Exception as e:
            raise Exception('查询关键词名字失败' + str(e))
        self.TYPE = 'SCIENCE_SPIDER'
        self.id_process = None  # ID-fetching process handle
        self.content_process = []  # content-fetching process handles
        self.ids_thread_num = ids_thread_num  # thread count for the ID process
        self.content_process_num = content_process_num  # number of content processes
        self.content_thread_num = content_thread_num  # threads per content process

        self.ids_queen = ProcessQueen(maxsize=-1)  # queue of article IDs awaiting crawl
        self.failed_ids_queen = ProcessQueen(maxsize=-1)  # queue of article IDs that failed
        self.page_Num = Value('i', -1, lock=True)  # total page count
        self.page_size = page_size  # listing page size
        self.finished_page_Num = Value('i', 0, lock=True)  # pages finished
        self.finished_page_Num_locker = Lock()
        self.failed_page_Num = Value('i', 0, lock=True)  # pages failed
        self.failed_page_Num_locker = Lock()

        # qsize() is unreliable and unimplemented on macOS, so the queue size
        # is tracked manually in this shared counter.
        self.ids_queen_size = Value('i', 0, lock=True)
        self.ids_queen_size_locker = Lock()

        self.finished_num = Value('i', 0, lock=True)  # completed crawls
        self.finished_num_locker = Lock()

        self.failed_num = Value('i', 0, lock=True)  # failed crawls
        self.failed_num_locker = Lock()

        # Status codes (int): -1 uninitialized, 0 not running, 1 running,
        # 2 paused, 3 finished, 4 terminated, 5 mixed state.
        self.status = Value('i', -1, lock=True)
        self.idsP_status = Value('i', -1, lock=True)  # ID process status (codes above)
        self.contentP_status = Value('i', -1, lock=True)  # content process status (codes above)

        self.create_user_id = kwargs['create_user_id']  # creator's user ID
        self.create_user_name = kwargs['create_user_name']  # creator's display name
        self.create_time = getFTime()  # spider creation time
        self.start_time = None  # spider launch time (set when started)
        self.project = project  # optional fixed project

        self.common_tag = "KWID=%03d uid=%s uname=%s" % (int(
            self.kw_id), self.create_user_id, self.create_user_name)

        # Query-related bookkeeping.
        self.lastQueryKey = -1

        SPIDERS_STATUS[self.kw_id] = self