Example No. 1
    def __init__(self, tab_urls, depth, process_num = None):
        '''
        @summary:
        ---------
        @param tab_urls:
        @param depth:
        @param process_num: process number
        ---------
        @result:
        '''

        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
        self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False

        self._tab_worker_status = 'news:worker_status'
        self._worker_mark = LOCAL_HOST_IP + ('_%s' % process_num if process_num else '')
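
The constructor above reads its polling interval and batch limits (sleep_time, allowed_null_times, url_count) from config.conf through tools.get_conf_value, which is not included in these snippets. A minimal stand-in, assuming the helper simply reads an INI-style file with the standard-library configparser (the file layout and behaviour here are assumptions, not the project's actual implementation):

import configparser

def get_conf_value(config_file, section, key):
    # Hypothetical stand-in for utils.tools.get_conf_value: read one raw
    # string value from an INI-style config file (assumption only).
    parser = configparser.ConfigParser()
    parser.read(config_file, encoding='utf-8')
    return parser.get(section, key)

# e.g. interval = int(get_conf_value('config.conf', 'collector', 'sleep_time'))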
Example No. 2
File: spider.py Project: lxshen/try
    def __init__(self, tab_urls, tab_site='', tab_content='', parser_count=None, depth=None, parser_params={}, begin_callback=None, end_callback=None, content_unique_key='url', delete_tab_urls=False, process_num=None):
        '''
        @summary:
        ---------
        @param tab_urls: url table name
        @param tab_site: site table name
        @param parser_count: number of parser threads; if empty, the config file value is used
        @param parser_params: parameters passed to the parsers
        @param begin_callback:  callback invoked when the spider starts
        @param end_callback:    callback invoked when the spider ends
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_urls
        self._url_manager = UrlManager(tab_urls)

        if delete_tab_urls:
            self._url_manager.clear_url()

        # self._db = MongoDB()
        # if delete_tab_urls: self._db.delete(tab_urls)

        # self._db.set_unique_key(tab_urls, 'url')
        # if tab_site: self._db.set_unique_key(tab_site, 'site_id')
        # if tab_content: self._db.set_unique_key(tab_content, content_unique_key)

        # # set indexes to speed up queries
        # self._db.set_ensure_index(tab_urls, 'depth')
        # self._db.set_ensure_index(tab_urls, 'status')
        # if tab_site: self._db.set_ensure_index(tab_site, 'read_status')
        # if tab_content: self._db.set_ensure_index(tab_content, 'read_status')

        self._collector = Collector(tab_urls, depth, process_num)
        self._parsers = []

        self._parser_params = parser_params

        self._begin_callback = begin_callback
        # extend the end_callback method
        def _end_callback():
            # self._url_manager.stop()
            if end_callback: end_callback()

        self._end_callabck = _end_callback

        self._parser_count = int(tools.get_conf_value('config.conf', 'parser', 'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value('config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value('config.conf', "spider_site", "except_site_name").split(',')
    def __init__(self, tab_urls, depth):
        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector",
                                 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False
Example No. 4
@summary: common helper methods for database operations
---------
@author: Boris
'''
import sys
sys.path.append('..')
import init

import base.constance as Constance
from base.url_manager import UrlManager
import utils.tools as tools
# from db.mongodb import MongoDB
import random

mongodb = None  # MongoDB()
url_manager = UrlManager('news:news_urls')  # the url table name must be passed here
url_manager.start()

def get_contained_key(title, content, key1, key2, key3):
    text = title + content

    # filter: drop texts that match key3
    if tools.get_info(text, key3):
        return '', 0

    # collect the contained keywords
    contained_key = []
    contained_key_count = 0

    def get_weigth(text, keys, key_weigth):
        weigth = 0
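        # The listing is truncated here; what follows is only a sketch of a
        # plausible continuation (assumption: simple substring matching with a
        # fixed per-group weight; the project's real scoring rules are not shown).
        matched = []
        for key in keys:
            if key and key in text:
                matched.append(key)
                weigth += key_weigth
        return matched, weigth

    # hypothetical weights: key1 matches count more than key2 matches (assumption)
    keys1, weigth1 = get_weigth(text, key1, 2)
    keys2, weigth2 = get_weigth(text, key2, 1)
    contained_key = keys1 + keys2
    contained_key_count = weigth1 + weigth2
    return ','.join(contained_key), contained_key_count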
Example No. 5
File: spider.py Project: lxshen/try
class Spider(threading.Thread):
    def __init__(self, tab_urls, tab_site='', tab_content='', parser_count=None, depth=None, parser_params={}, begin_callback=None, end_callback=None, content_unique_key='url', delete_tab_urls=False, process_num=None):
        '''
        @summary:
        ---------
        @param tab_urls: url table name
        @param tab_site: site table name
        @param parser_count: number of parser threads; if empty, the config file value is used
        @param parser_params: parameters passed to the parsers
        @param begin_callback:  callback invoked when the spider starts
        @param end_callback:    callback invoked when the spider ends
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_urls
        self._url_manager = UrlManager(tab_urls)

        if delete_tab_urls:
            self._url_manager.clear_url()

        # self._db = MongoDB()
        # if delete_tab_urls: self._db.delete(tab_urls)

        # self._db.set_unique_key(tab_urls, 'url')
        # if tab_site: self._db.set_unique_key(tab_site, 'site_id')
        # if tab_content: self._db.set_unique_key(tab_content, content_unique_key)

        # # set indexes to speed up queries
        # self._db.set_ensure_index(tab_urls, 'depth')
        # self._db.set_ensure_index(tab_urls, 'status')
        # if tab_site: self._db.set_ensure_index(tab_site, 'read_status')
        # if tab_content: self._db.set_ensure_index(tab_content, 'read_status')

        self._collector = Collector(tab_urls, depth, process_num)
        self._parsers = []

        self._parser_params = parser_params

        self._begin_callback = begin_callback
        # extend the end_callback method
        def _end_callback():
            # self._url_manager.stop()
            if end_callback: end_callback()

        self._end_callabck = _end_callback

        self._parser_count = int(tools.get_conf_value('config.conf', 'parser', 'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value('config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value('config.conf', "spider_site", "except_site_name").split(',')

    def add_parser(self, parser):
        if self._spider_site_name[0] == 'all':
            for except_site_name in self._except_site_name:
                if parser.NAME != except_site_name.strip():
                    self._parsers.append(parser)
        else:
            for spider_site_name in self._spider_site_name:
                if parser.NAME == spider_site_name.strip():
                    self._parsers.append(parser)

    def run(self):
        self.__start()

    def __start(self):
        if self._begin_callback:
            self._begin_callback()

        if not self._parsers:
            if self._end_callabck:
                self._end_callabck()
            return

        # run the parsers' add site and add root steps
        # print(self._parser_params)
        for parser in self._parsers:
            # parser.add_site_info()
            parser.add_root_url(self._parser_params)

        # start the collector
        self._collector.add_finished_callback(self._end_callabck)
        self._collector.start()

        # start the parser control threads
        while self._parser_count:
            parser_control = PaserControl(self._collector, self._tab_urls)

            for parser in self._parsers:
                parser_control.add_parser(parser)

            parser_control.start()
            self._parser_count -= 1
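
The Spider above is driven in three steps: construct it, register parsers with add_parser, then start the thread. A minimal usage sketch, assuming a hypothetical parser module news_parser that exposes NAME and add_root_url (the parser_params contents below are also assumptions):

# import news_parser   # hypothetical parser module exposing NAME and add_root_url

def on_begin():
    print('spider started')

def on_end():
    print('spider finished')

spider = Spider('news:news_urls',
                depth=2,
                parser_params={'site_id': 1},   # forwarded to each parser's add_root_url
                begin_callback=on_begin,
                end_callback=on_end)
# spider.add_parser(news_parser)   # kept only if allowed by spider_site_name / except_site_name
spider.start()   # Thread.start() -> run() -> __start(); with no parsers it simply fires on_end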
Example No. 6
class Collector(threading.Thread):
    def __init__(self, tab_urls, depth, process_num = None):
        '''
        @summary:
        ---------
        @param tab_urls:
        @param depth:
        @param process_num: process number
        ---------
        @result:
        '''

        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
        self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False

        self._tab_worker_status = 'news:worker_status'
        self._worker_mark = LOCAL_HOST_IP + ('_%s' % process_num if process_num else '')

    def run(self):
        while not self._thread_stop:
            try:
                self.__input_data()
            except Exception as e:
                log.error(e)

            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        if self._urls:
            log.debug('previous urls not fully processed; skip fetching, url count = %s' % len(self._urls))
            return

        # report this node's status
        self._db.zadd(self._tab_worker_status, self._worker_mark, 0)  # idle

        url_count = self._url_count  # default value
        # allocate urls dynamically based on the number of waiting nodes
        worker_wait_count = self._db.zget_count(self._tab_worker_status, priority_min = 0, priority_max = 0)
        if worker_wait_count:
            # number of pending tasks
            task_count = self._db.zget_count(self._tab_urls)
            # allocated count = task count / number of idle nodes
            url_count = task_count // worker_wait_count

        url_count = url_count if url_count <= self._url_count else self._url_count

        urls_list = self._db.zget(self._tab_urls, count = url_count)

        if not urls_list:
            if not self._is_show_wait:
                log.info('waiting for tasks...')
                self._is_show_wait = True
        else:
            # # record the url count (for testing)
            # url_count_record = tools.read_file('url_count.txt')
            # url_count_record =  url_count_record and int(url_count_record) or 0
            # url_count_record += len(urls_list)
            # tools.write_file('url_count.txt', str(url_count_record))

            # report this node's status
            self._db.zadd(self._tab_worker_status, self._worker_mark, 1)  # busy

            # store the urls
            self.put_urls(urls_list)
            self._is_show_wait = False

        # if self.is_all_have_done():
        #     log.debug('is_all_have_done end')
        #     self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # no urls left to process
    def is_all_have_done(self):
        # log.debug('checking for unfinished urls: collector url size = %s | url_manager size = %s' % (len(self._urls), self._url_manager.get_urls_count()))
        if len(self._urls) == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times and self._url_manager.get_urls_count() == 0:
                return True
            else:
                return False
        else:
            self._null_times = 0
            return False


    # @tools.log_function_time
    def put_urls(self, urls_list):
        for url_info in urls_list:
            try:
                url_info = eval(url_info)  # queued entries are stored as stringified dicts
            except Exception as e:
                url_info = None

            if url_info:
                self._urls.append(url_info)

    # @tools.log_function_time
    def get_urls(self, count):
        urls = []
        count = count if count <= len(self._urls) else len(self._urls)
        while count:
            urls.append(self._urls.popleft())
            count -= 1

        return urls
class Collector(threading.Thread):
    def __init__(self, tab_urls, depth):
        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector",
                                 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False

    def run(self):
        while not self._thread_stop:
            self.__input_data()
            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        if self._urls:
            log.debug('previous urls not fully processed; skip fetching, url count = %s' % len(self._urls))
            return

        urls_list = self._db.zget(self._tab_urls, count=self._url_count)
        if not urls_list:
            if not self._is_show_wait:
                log.info('waiting for tasks...')
                self._is_show_wait = True
        else:
            # store the urls
            self.put_urls(urls_list)
            self._is_show_wait = False

        # if self.is_all_have_done():
        #     log.debug('is_all_have_done end')
        #     self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # no urls left to process
    def is_all_have_done(self):
        # log.debug('checking for unfinished urls: collector url size = %s | url_manager size = %s' % (len(self._urls), self._url_manager.get_urls_count()))
        if len(self._urls) == 0:
            self._null_times += 1
            if (self._null_times >= self._allowed_null_times
                    and self._url_manager.get_urls_count() == 0):
                return True
            else:
                return False
        else:
            self._null_times = 0
            return False

    # @tools.log_function_time
    def put_urls(self, urls_list):
        for url_info in urls_list:
            try:
                url_info = eval(url_info)  # queued entries are stored as stringified dicts
            except Exception as e:
                url_info = None

            if url_info:
                self._urls.append(url_info)

    # @tools.log_function_time
    def get_urls(self, count):
        urls = []
        count = count if count <= len(self._urls) else len(self._urls)
        while count:
            urls.append(self._urls.popleft())
            count -= 1

        return urls
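
For completeness, a usage sketch of the Collector on its own, assuming Redis already holds serialized url_info dicts in the 'news:news_urls' sorted set (the table name comes from Example No. 4; the 'url' key below is an assumption about the stored payload):

collector = Collector('news:news_urls', depth=3)
collector.start()               # polls Redis every sleep_time seconds

# a consumer (normally PaserControl) drains the in-memory deque in batches
batch = collector.get_urls(10)  # returns at most 10 buffered url_info dicts
for url_info in batch:
    print(url_info.get('url'))  # assumes each dict carries a 'url' field

collector.stop()                # also fires the finished callback, if one was set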