class PaserControl(threading.Thread):
    def __init__(self):
        super(PaserControl, self).__init__()
        self._collector = Collector()
        self._urlCount = int(tools.getConfValue("html_parser", "url_count"))
        self._interval = int(tools.getConfValue("html_parser", "sleep_time"))

    def run(self):
        while True:
            urls = []  # pre-initialized so the except block can log it even when getUrls fails
            try:
                urls = self._collector.getUrls(self._urlCount)
                print("Fetched %d urls" % len(urls))
                # Check whether crawling has finished
                if self._collector.isFinished():
                    log.debug("-------------- finished --------------")
                    break

                for url in urls:
                    self.parseUrl(url)

                time.sleep(self._interval)
            except Exception as e:
                log.debug(urls)
                log.debug(e)

    def parseUrl(self, urlInfo):
        website_id = urlInfo['website_id']

        try:
            # Look up the site's domain to choose the matching site-specific parser
            domain = list(db.website.find({'_id': website_id}))[0]['domain']

            if domain == Constance.IFENG:
                ifeng.parseUrl(urlInfo)

            elif domain == Constance.SOHU:
                sohu.parseUrl(urlInfo)

            elif domain == Constance.TENCENT:
                tencent.parseUrl(urlInfo)

            elif domain == Constance.SINA:
                #sina.parseUrl(urlInfo)
                pass
            elif domain == Constance.CCTV:
                cctv.parseUrl(urlInfo)

            elif domain == Constance.PEOPLE:
                people.parseUrl(urlInfo)

            elif domain == Constance.WANG_YI:
                wangyi.parseUrl(urlInfo)

            elif domain == Constance.XIN_HUA:
                xinhua.parseUrl(urlInfo)

        except Exception as e:
            log.debug(e)
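
The if/elif chain above maps each site's domain constant to its parser module. A table-driven dispatch is an equivalent alternative; the sketch below reuses the modules and constants from the example, but the dict and helper name are illustrative, not part of the original code:

# Illustrative alternative to the if/elif chain: a domain -> parser-module table.
DOMAIN_PARSERS = {
    Constance.IFENG: ifeng,
    Constance.SOHU: sohu,
    Constance.TENCENT: tencent,
    Constance.CCTV: cctv,
    Constance.PEOPLE: people,
    Constance.WANG_YI: wangyi,
    Constance.XIN_HUA: xinhua,
}

def parse_url(url_info):
    site = db.website.find_one({'_id': url_info['website_id']})
    parser_module = DOMAIN_PARSERS.get(site['domain']) if site else None
    if parser_module:
        parser_module.parseUrl(url_info)
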
Example 2
class Spider(threading.Thread):
    def __init__(self,
                 tab_urls,
                 tab_site='',
                 tab_content='',
                 parser_count=None,
                 parser_params={},
                 begin_callback=None,
                 end_callback=None,
                 content_unique_key='url',
                 delete_tab_urls=False):
        '''
        @summary:
        ---------
        @param tab_urls: name of the url table
        @param tab_site: name of the site table
        @param tab_content: name of the content table
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param parser_params: parameters passed to the parsers
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider finishes
        @param content_unique_key: unique key field for the content table (defaults to 'url')
        @param delete_tab_urls: drop the url table before starting when True
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_urls

        self._db = MongoDB()
        if delete_tab_urls: self._db.delete(tab_urls)

        self._db.set_unique_key(tab_urls, 'url')
        if tab_site: self._db.set_unique_key(tab_site, 'site_id')
        if tab_content:
            self._db.set_unique_key(tab_content, content_unique_key)

        # Set indexes to speed up queries
        self._db.set_ensure_index(tab_urls, 'depth')
        self._db.set_ensure_index(tab_urls, 'status')
        if tab_site: self._db.set_ensure_index(tab_site, 'read_status')
        if tab_content: self._db.set_ensure_index(tab_content, 'read_status')

        self._collector = Collector(tab_urls)
        self._parsers = []

        self._parser_params = parser_params

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')
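
The set_unique_key and set_ensure_index calls come from the project's own MongoDB wrapper, which is not shown in these examples. A minimal sketch of what those helpers plausibly do, assuming pymongo; connection details and the real implementation may differ:

# Assumed shape of the MongoDB helper used above; details are guesses, not the original code.
from pymongo import MongoClient

class MongoDB:
    def __init__(self, host='localhost', port=27017, db_name='spider'):
        self._db = MongoClient(host, port)[db_name]

    def set_unique_key(self, tab, key):
        # Unique index: duplicate urls / site_ids are rejected on insert
        self._db[tab].create_index(key, unique=True)

    def set_ensure_index(self, tab, key):
        # Plain index to speed up queries on frequently filtered fields
        self._db[tab].create_index(key)

    def delete(self, tab):
        # Used by delete_tab_urls=True to start from an empty url table
        self._db[tab].drop()
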
Example 3
def collect(args):
    if args.verbose:
        logging.basicConfig(filename='dbg_'+args.parser+'_'+str(time.time())+'.log', filemode='w', level=logging.DEBUG)

    parser_params = process_param_string(args.parser_params)
    collector = Collector(args.parser, args.use_proxy, parser_params)
    try:
        collector.start()
    finally:
        if collector.proxy_client is not None:
            collector.proxy_client.stop()
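
collect() only needs an object carrying verbose, parser, parser_params and use_proxy attributes. A hedged sketch of command-line wiring that would produce such an object; the flag names and the params format are assumptions, not taken from the original project:

# Illustrative CLI wiring for collect(); adjust names to the real project.
import argparse

def main():
    arg_parser = argparse.ArgumentParser(description='Run a site collector')
    arg_parser.add_argument('parser', help='name of the site parser to run')
    arg_parser.add_argument('--parser-params', dest='parser_params', default='',
                            help='raw parameter string consumed by process_param_string')
    arg_parser.add_argument('--use-proxy', dest='use_proxy', action='store_true')
    arg_parser.add_argument('-v', '--verbose', action='store_true')
    collect(arg_parser.parse_args())

if __name__ == '__main__':
    main()
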
Example 4
class Spider(threading.Thread):
    def __init__(self,
                 tab_urls,
                 tab_site,
                 tab_content,
                 parser_count=None,
                 search_keyword1=[],
                 search_keyword2=[],
                 search_keyword3=[],
                 begin_callback=None,
                 end_callback=None,
                 content_unique_key=None):
        '''
        @summary:
        ---------
        @param tab_urls: name of the url table
        @param tab_site: name of the site table
        @param tab_content: name of the content table
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param search_keyword1: search keywords (list); every one must be contained
        @param search_keyword2: search keywords (list); at least one must be contained
        @param search_keyword3: search keywords (list); none may be contained
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider finishes
        @param content_unique_key: unique key field for the content table (defaults to 'url')
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_urls

        self._db = MongoDB()
        self._db.set_unique_key(tab_urls, 'url')
        self._db.set_unique_key(tab_site, 'site_id')
        self._db.set_unique_key(
            tab_content,
            'url' if not content_unique_key else content_unique_key)

        self._collector = Collector(tab_urls)
        self._parsers = []

        self._search_keyword1 = search_keyword1
        self._search_keyword2 = search_keyword2
        self._search_keyword3 = search_keyword3

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')
Example 5
class Spider(threading.Thread):
    def __init__(self,
                 tab_list,
                 tab_unique_key_list,
                 tab_ensure_index_list,
                 parser_count=None,
                 site_parsers=None,
                 parser_params={},
                 begin_callback=None,
                 end_callback=None,
                 delete_tab_urls=False):
        '''
        @summary:
        ---------
        @param tab_list: list of table names; the first entry is the url table
        @param tab_unique_key_list: unique key for each table in tab_list (same order)
        @param tab_ensure_index_list: list of index fields for each table in tab_list (same order)
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param site_parsers: site parsers passed through to the Collector
        @param parser_params: parameters passed to the parsers
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider finishes
        @param delete_tab_urls: drop the url table before starting when True
        ---------
        @result:
        '''
        super(Spider, self).__init__()
        self._db = MongoDB()

        self._tab_urls = tab_list[0]
        if delete_tab_urls: self._db.delete(self._tab_urls)

        self._site_parsers = site_parsers

        for tab_index in range(len(tab_list)):
            self._db.set_unique_key(tab_list[tab_index],
                                    tab_unique_key_list[tab_index])
            # Set indexes to speed up queries
            for ensure_index in tab_ensure_index_list[tab_index]:
                self._db.set_ensure_index(tab_list[tab_index], ensure_index)

        self._collector = Collector(self._tab_urls, self._site_parsers)
        self._parsers = []

        self._parser_params = parser_params

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')
Example 6
class PaserControl(threading.Thread):
    def __init__(self):
        super(PaserControl, self).__init__()
        self._collector = Collector()
        self._urlCount = int(tools.getConfValue("html_parser", "url_count"))
        self._interval = int(tools.getConfValue("html_parser", "sleep_time"))

    def run(self):
        while True:
            urls = self._collector.getUrls(self._urlCount)
            print("Fetched %d urls" % len(urls))
            # Check whether crawling has finished
            if self._collector.isFinished():
                log.debug("-------------- finished --------------")
                break

            for url in urls:
                self.parseUrl(url)


            time.sleep(self._interval)

    def parseUrl(self, urlInfo):
        parser.parseUrl(urlInfo)
Example 7
class PaserControl(threading.Thread):
    def __init__(self):
        super(PaserControl, self).__init__()
        self._collector = Collector()
        self._urlCount = int(tools.getConfValue("html_parser", "url_count"))
        self._interval = int(tools.getConfValue("html_parser", "sleep_time"))

    def run(self):
        while True:
            urls = self._collector.getUrls(self._urlCount)
            print("取到的url大小 %d" % len(urls))
            for url in urls:
                self.parseUrl(url)

            time.sleep(self._interval)

    def parseUrl(self, urlInfo):
        website_id = urlInfo['website_id']

        domain = list(db.website.find({'_id': website_id}))[0]['domain']
        if domain == Constance.YOUKU:
            youku.parseUrl(urlInfo)
        elif domain == Constance.TENCENT:
            tencent.parseUrl(urlInfo)
        elif domain == Constance.WANG_YI:
            wangyi.parseUrl(urlInfo)
        elif domain == Constance.PPTV:
            pptv.parseUrl(urlInfo)
        elif domain == Constance.KAN_KAN:
            kankan.parseUrl(urlInfo)
        elif domain == Constance.CCTV:
            cctv.parseUrl(urlInfo)
        elif domain == Constance.TUDOU:
            tudou.parseUrl(urlInfo)
        elif domain == Constance.V1:
            v1.parseUrl(urlInfo)
        elif domain == Constance.KU6:
            ku6.parseUrl(urlInfo)
Example 8
class Spider(threading.Thread):
    def __init__(self,
                 tab_urls,
                 tab_site,
                 tab_content,
                 parser_count=None,
                 parser_params={},
                 begin_callback=None,
                 end_callback=None,
                 content_unique_key='url'):
        '''
        @summary:
        ---------
        @param tab_urls: name of the url table
        @param tab_site: name of the site table
        @param tab_content: name of the content table
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param parser_params: parameters passed to the parsers
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider finishes
        @param content_unique_key: unique key field for the content table (defaults to 'url')
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_urls

        self._db = MongoDB()
        self._db.set_unique_key(tab_urls, 'url')
        self._db.set_unique_key(tab_site, 'site_id')
        self._db.set_unique_key(tab_content, content_unique_key)

        # Set indexes to speed up queries
        self._db.set_ensure_index(tab_urls, 'depth')
        self._db.set_ensure_index(tab_urls, 'status')
        self._db.set_ensure_index(tab_site, 'read_status')
        self._db.set_ensure_index(tab_content, 'read_status')

        self._collector = Collector(tab_urls)
        self._parsers = []

        self._parser_params = parser_params

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')

    def add_parser(self, parser):
        if self._spider_site_name[0] == 'all':
            # Add the parser unless its NAME is in the except list (append at most once)
            if parser.NAME not in [name.strip() for name in self._except_site_name]:
                self._parsers.append(parser)
        else:
            for spider_site_name in self._spider_site_name:
                if parser.NAME == spider_site_name.strip():
                    self._parsers.append(parser)

    def run(self):
        self.__start()

    def __start(self):
        if self._begin_callback:
            self._begin_callback()

        if not self._parsers:
            if self._end_callabck:
                self._end_callabck()
            return

        # Start the collector
        self._collector.add_finished_callback(self._end_callabck)
        self._collector.start()
        # Run each parser's add_site_info and add_root_url in their own threads
        #print(self._parser_params)
        for parser in self._parsers:
            threading.Thread(target=parser.add_site_info).start()
            threading.Thread(target=parser.add_root_url,
                             args=(self._parser_params, )).start()
        # Start the parser controls
        while self._parser_count:
            parser_control = PaserControl(self._collector, self._tab_urls)

            for parser in self._parsers:
                parser_control.add_parser(parser)

            parser_control.start()
            self._parser_count -= 1
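
A hedged usage sketch for the Spider class above. The table names, callbacks and the news_parser module (assumed to expose NAME, add_site_info and add_root_url, which __start relies on) are illustrative assumptions:

# Illustrative only: every name below is an assumption, not from the original project.
def on_begin():
    print('spider started')

def on_end():
    print('spider finished')

spider = Spider(tab_urls='news_urls',
                tab_site='news_site',
                tab_content='news_content',
                parser_params={'keyword': 'example'},
                begin_callback=on_begin,
                end_callback=on_end)
spider.add_parser(news_parser)  # kept only if allowed by the spider_site config section
spider.start()                  # Thread.start() -> run() -> __start()
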
Example 9
class Spider(threading.Thread):
    def __init__(self,
                 tab_list,
                 tab_unique_key_list,
                 tab_ensure_index_list,
                 parser_count=None,
                 site_parsers=None,
                 parser_params={},
                 begin_callback=None,
                 end_callback=None):
        '''
        @summary:
        ---------
        @param tab_list: list of table names; the first entry is the url table
        @param tab_unique_key_list: unique key for each table in tab_list (same order)
        @param tab_ensure_index_list: list of index fields for each table in tab_list (same order)
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param site_parsers: site parsers passed through to the Collector
        @param parser_params: parameters passed to the parsers
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider finishes
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_list[0]
        self._site_parsers = site_parsers

        self._db = MongoDB()
        for tab_index in range(len(tab_list)):
            self._db.set_unique_key(tab_list[tab_index],
                                    tab_unique_key_list[tab_index])
            # Set indexes to speed up queries
            for ensure_index in tab_ensure_index_list[tab_index]:
                self._db.set_ensure_index(tab_list[tab_index], ensure_index)

        self._collector = Collector(self._tab_urls, self._site_parsers)
        self._parsers = []

        self._parser_params = parser_params

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')

    def add_parser(self, parser):
        if self._spider_site_name[0] == 'all':
            # Add the parser unless its NAME is in the except list (append at most once)
            if parser.NAME not in [name.strip() for name in self._except_site_name]:
                self._parsers.append(parser)
        else:
            for spider_site_name in self._spider_site_name:
                if parser.NAME == spider_site_name.strip():
                    self._parsers.append(parser)

    def run(self):
        self.__start()

    def __start(self):
        if self._begin_callback:
            self._begin_callback()

        if not self._parsers:
            if self._end_callabck:
                self._end_callabck()
            return

        # Run each parser's add_site_info and add_root_url before starting the collector
        # print(self._parser_params)
        for parser in self._parsers:
            parser.add_site_info()
            parser.add_root_url(self._parser_params)
        print('Finished adding root urls')

        # Start the collector
        self._collector.add_finished_callback(self._end_callabck)
        self._collector.start()

        # Start the parser controls
        while self._parser_count:
            parser_control = PaserControl(self._collector, self._tab_urls)

            for parser in self._parsers:
                parser_control.add_parser(parser)

            parser_control.start()
            self._parser_count -= 1
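
Because the first three arguments of this variant are parallel lists, a usage sketch helps show how they line up index by index; all names below are illustrative assumptions, not from the original project:

# Illustrative only: the three lists are parallel, index by index.
spider = Spider(tab_list=['news_urls', 'news_site', 'news_content'],
                tab_unique_key_list=['url', 'site_id', 'url'],
                tab_ensure_index_list=[['depth', 'status'],  # indexes on news_urls
                                       ['read_status'],      # indexes on news_site
                                       ['read_status']],     # indexes on news_content
                parser_params={'keyword': 'example'})
spider.add_parser(news_parser)  # news_parser is assumed to expose NAME, add_site_info, add_root_url
spider.start()
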
Example 10
sys.path.append("..")

import utils.tools as tools
from utils.log import log
from base.collector import Collector
from base.root_url import AddRootUrl
from html_parser.parser_control import PaserControl


def init():
    db = tools.getConnectedDB()
    # Set unique indexes (ensure_index is the legacy pymongo call; create_index in pymongo 3+)
    db.urls.ensure_index('url', unique=True)
    db.text_info.ensure_index('url', unique=True)


if __name__ == '__main__':
    log.info("--------begin--------")
    init()

    addRootUrl = AddRootUrl()
    addRootUrl.start()

    coll = Collector()
    coll.start()

    paserCount = int(tools.getConfValue("html_parser", "parser_count"))
    while paserCount:
        paser = PaserControl()
        paser.start()
        paserCount = paserCount - 1
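
The script relies on two helpers from utils.tools that are not shown in these examples. A plausible sketch of their shape, assuming a config.conf read with configparser and a pymongo connection; the real helpers may differ:

# Assumed shapes of the tools helpers used above; connection details are guesses.
import configparser
from pymongo import MongoClient

_CONF = configparser.ConfigParser()
_CONF.read('config.conf')

def getConfValue(section, key):
    # e.g. getConfValue("html_parser", "parser_count")
    return _CONF.get(section, key)

def getConnectedDB():
    # Returns a database handle so that db.urls, db.text_info, db.website work as above
    host = _CONF.get('db', 'host')
    port = _CONF.getint('db', 'port')
    return MongoClient(host, port)[_CONF.get('db', 'db_name')]
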
Example 11
class PaserControl(threading.Thread):
    def __init__(self):
        super(PaserControl, self).__init__()
        self._collector = Collector()
        self._urlCount = int(tools.getConfValue("html_parser", "url_count"))
        self._interval = int(tools.getConfValue("html_parser", "sleep_time"))
Example 12
class Spider(threading.Thread):
    def __init__(self,
                 tab_urls,
                 tab_site,
                 tab_content,
                 parser_count=None,
                 search_keyword1=[],
                 search_keyword2=[],
                 search_keyword3=[],
                 begin_callback=None,
                 end_callback=None,
                 content_unique_key=None):
        '''
        @summary:
        ---------
        @param tab_urls: name of the url table
        @param tab_site: name of the site table
        @param tab_content: name of the content table
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param search_keyword1: search keywords (list); every one must be contained
        @param search_keyword2: search keywords (list); at least one must be contained
        @param search_keyword3: search keywords (list); none may be contained
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider finishes
        @param content_unique_key: unique key field for the content table (defaults to 'url')
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_urls

        self._db = MongoDB()
        self._db.set_unique_key(tab_urls, 'url')
        self._db.set_unique_key(tab_site, 'site_id')
        self._db.set_unique_key(
            tab_content,
            'url' if not content_unique_key else content_unique_key)

        self._collector = Collector(tab_urls)
        self._parsers = []

        self._search_keyword1 = search_keyword1
        self._search_keyword2 = search_keyword2
        self._search_keyword3 = search_keyword3

        self._begin_callback = begin_callback
        self._end_callabck = end_callback

        self._parser_count = int(
            tools.get_conf_value(
                'config.conf', 'parser',
                'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')

    def add_parser(self, parser):
        if self._spider_site_name[0] == 'all':
            # Add the parser unless its NAME is in the except list (append at most once)
            if parser.NAME not in [name.strip() for name in self._except_site_name]:
                self._parsers.append(parser)
        else:
            for spider_site_name in self._spider_site_name:
                if parser.NAME == spider_site_name.strip():
                    self._parsers.append(parser)

    def run(self):
        self.__start()

    def __start(self):
        if self._begin_callback:
            self._begin_callback()

        if not self._parsers:
            if self._end_callabck:
                self._end_callabck()
            return

        # Start the collector
        self._collector.add_finished_callback(self._end_callabck)
        self._collector.start()
        # Run each parser's add_site_info and add_root_url in their own threads
        for parser in self._parsers:
            threading.Thread(target=parser.add_site_info).start()
            threading.Thread(target=parser.add_root_url,
                             args=(self._search_keyword1,
                                   self._search_keyword2,
                                   self._search_keyword3)).start()
        # Start the parser controls
        while self._parser_count:
            parser_control = PaserControl(self._collector, self._tab_urls)

            for parser in self._parsers:
                parser_control.add_parser(parser)

            parser_control.start()
            self._parser_count -= 1
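
A hedged usage sketch for the keyword-filtered variant above; the table names, keywords and the news_parser module are illustrative assumptions, not taken from the original project:

# Illustrative only: every name and keyword below is an assumption.
spider = Spider(tab_urls='news_urls',
                tab_site='news_site',
                tab_content='news_content',
                search_keyword1=['earthquake'],        # every keyword must appear
                search_keyword2=['rescue', 'damage'],  # at least one must appear
                search_keyword3=['advert'],            # none of these may appear
                end_callback=lambda: print('spider finished'))
spider.add_parser(news_parser)
spider.start()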