Example #1
 def __init__(self, name):
     self.db = Connection(config.db_host, config.db_db, config.db_user,
                          config.db_password)
     self.logger = fn.init_file_logger(name + '.log')
     self.urlpool = UrlPool(name)
     self.hub_hosts = None
     self.load_hubs()
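Several of these examples read database settings from a config module that is never shown. A minimal sketch of what it might contain, with every value a placeholder:

# config.py -- illustrative placeholders only; real deployments supply their own values
db_host = 'localhost'
db_db = 'crawler'
db_user = 'crawler'
db_password = 'secret'
MONGO_URI = 'mongodb://localhost:27017'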
Example #2
 def run(self):
     while self.alive:
         time.sleep(1)
         walk_url = TaskQueue.getInstance().get()
         if not walk_url:
             self.emptycount = self.emptycount - 1
             if self.emptycount == 0:
                 return
             continue
         self.emptycount = self.initempty
         _level = walk_url[1]
         _url = walk_url[0]
         if _level > self.level:
             continue
         if UrlPool.getInstance().exist(_url):
             continue
         UrlPool.getInstance().put_url(_url)
         try:
             html = self.down_load_html(_url, coding="gb2312")
          except Exception as e:
              print(e)
             continue
         proxydata = self.proxy_parser(html)
         _level = _level + 1
         if len(proxydata):
             ProxyData.getInstance().put(proxydata)
             _level = 0
         link_list = self.get_link(html, _level, _url)
         for link in link_list:
             if not UrlPool.getInstance().exist(link[0]):
                 TaskQueue.getInstance().put(link)
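This run() loop reads like the body of a worker thread: it polls a TaskQueue singleton for (url, level) pairs, deduplicates through the UrlPool singleton, and feeds newly extracted links back into the queue. A launch sketch under that assumption; the class name ProxyWalker is hypothetical, and down_load_html(), proxy_parser() and get_link() would still have to be supplied:

import threading

class ProxyWalker(threading.Thread):
    # hypothetical wrapper: the run() shown above would be pasted in here,
    # together with down_load_html(), proxy_parser() and get_link()
    def __init__(self, level=2, initempty=10):
        super().__init__(daemon=True)
        self.alive = True           # run() keeps looping while this stays True
        self.level = level          # maximum crawl depth accepted from the queue
        self.initempty = initempty  # retries allowed on an empty queue
        self.emptycount = initempty

walkers = [ProxyWalker() for _ in range(4)]
for w in walkers:
    w.start()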
Example #4
class NewsCrawlerAsync:
    def __init__(self, name):
        self._workers = 0
        self._workers_max = 5
        self.logger = fn.init_file_logger(name + '.log')
        self.urlpool = UrlPool(name)
        self.loop = asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.db = motor.motor_asyncio.AsyncIOMotorClient(config.MONGO_URI)["haodaifu"]

    async def load_hubs(self,):
        data = self.db.doctor_para.find({"status": {"$ne": True}})
        urls = []
        async for d in data:
            urls.append("{}&{}".format(d['faculty'], d["page"]))
        self.urlpool.addmany(urls)

    async def process(self, url):
        para = {"faculty": url.split("&")[0], "page": url.split("&")[1]}
        status, html = await fn.fetch(self.session, para)
        self.urlpool.set_status(url, status)
        if status != 200:
            return
        await self.db.doctors.insert_one(html)
        await self.db.doctor_para.update_one(para, {"$set": {"status": True}})
        self._workers -= 1

    async def loop_crawl(self):
        await self.load_hubs()
        last_rating_time = time.time()
        counter = 0
        while True:
            tasks = self.urlpool.db.pop_from_redis(self._workers_max)
            if not tasks:
                print('no url to crawl, sleep 10S')
                await asyncio.sleep(10)
                continue
            for url in tasks:
                self._workers += 1
                counter += 1
                print('crawl:', url, self._workers, counter)
                asyncio.ensure_future(self.process(url))

            gap = time.time() - last_rating_time
            if gap > 5:
                rate = counter / gap
                print('\tloop_crawl() rate:%s, counter: %s, workers: %s' % (round(rate, 2), counter, self._workers))
                last_rating_time = time.time()
                counter = 0
            if self._workers >= self._workers_max:
                print('====== got workers_max, sleep 1 sec to next worker =====')
                await asyncio.sleep(1)

    def run(self):
        try:
            self.loop.run_until_complete(self.loop_crawl())
        except KeyboardInterrupt:
            print('stopped by yourself!')
            del self.urlpool
            pass
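A minimal way to drive this crawler, assuming a config module that defines MONGO_URI and an fn module that provides init_file_logger() and fetch() (both referenced above but not shown):

# usage sketch: the class builds its own event loop, so one call is enough
crawler = NewsCrawlerAsync('haodaifu')
crawler.run()   # Ctrl-C lands in the KeyboardInterrupt branch above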
Example #5
    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 max_in_mem=100000,
                 worker_conf_file='xworkers.conf',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               max_in_mem=max_in_mem,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')
        self.failed_urls = {}

        # this queue holds high-priority urls to download;
        # start() will take urls from this queue first
        self.urlqueue = gevent.queue.Queue()
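The last comment marks self.urlqueue as a fast lane: start() is expected to drain it before falling back to the UrlPool. A hedged sketch of how a caller might use it, assuming this __init__ belongs to the XCrawler class shown in the later examples (the URL is illustrative):

crawler = XCrawler(max_working=10, logfile='xcrawler.log')
# this url is picked up ahead of anything queued in the UrlPool
crawler.urlqueue.put('https://example.com/urgent-page')
crawler.start()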
Example #6
 def __init__(self, name):
     self._workers = 0
     self._workers_max = 5
     self.logger = fn.init_file_logger(name + '.log')
     self.urlpool = UrlPool(name)
     self.loop = asyncio.get_event_loop()
     self.session = aiohttp.ClientSession(loop=self.loop)
     self.db = motor.motor_asyncio.AsyncIOMotorClient(config.MONGO_URI)["haodaifu"]
Example #7
    def __init__(self, name):
        self._workers = 0
        self._workers_max = 30
        self.logger = fn.init_file_logger(name + '.log')

        self.urlpool = UrlPool(name)

        self.loop = asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.db = sanicdb.SanicDB(config.db_host,
                                  config.db_db,
                                  config.db_user,
                                  config.db_password,
                                  loop=self.loop)
Example #8
 def __init__(self):
     self._workers = 0
     self._workers_max = 5
     self.urlpool = UrlPool()
     self.loop = asyncio.get_event_loop()
     self.session = aiohttp.ClientSession(loop=self.loop)
     self.db = motor.motor_asyncio.AsyncIOMotorClient(
         config.MONGO_URI)["cninfo"]
Example #9
 def __init__(self, keyword=[]):
     if not isinstance(keyword, list):
         raise TypeError("KEYWORD_MUST_BE_LIST")
     self.keyword.extend(keyword)
     for _kw in keyword:
         searchword = self.keyword.pop()
         url = self.baidu_search(searchword)
         if not UrlPool.getInstance().exist(url):
             TaskQueue.getInstance().put((url,-10))
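A hedged usage sketch; the class name BaiduSeeder is hypothetical and the keywords are placeholders. Constructing it with a keyword list pushes one search URL per keyword into the shared TaskQueue at level -10, well under the depth limit the run() loops check:

# seeds the shared TaskQueue before the crawl workers start
seeder = BaiduSeeder(keyword=['free proxy list', 'https proxy'])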
Example #10
 def __init__(self, max_working=20, common_gap=20,
              urlindex_file="", proxies_file=None,
              span_of_host=3,
              worker_conf_file='xworkers.conf',
              load_bad_url=None, logfile=''):
     self.proxypool = ProxyPool(common_gap, proxies_file)
     self.urlpool = UrlPool(urlindex_file,
                            load_bad_url=load_bad_url,
                            span_of_host=span_of_host,
                            is_good_link=self.is_good_link)
     self.max_working = max_working
     self.worker_conf_file = worker_conf_file
     self._workers = 0
     # you can customize your http header in init_urlpool()
     self.headers = None
     self._http_exception_code = 900
     if logfile:
         self.logger = init_file_logger(logfile)
     else:
         self.logger = logging.getLogger('xcrawler')
Example #11
 def __init__(self, max_working=20, common_gap=20,
              urlindex_file="", proxies_file=None,
              span_of_host=3,
              worker_conf_file='',
              load_bad_url=None, logfile=''):
     self.proxypool = ProxyPool(common_gap, proxies_file)
     self.urlpool = UrlPool(urlindex_file,
                            load_bad_url=load_bad_url,
                            span_of_host=span_of_host,
                            is_good_link=self.is_good_link)
     self.max_working = max_working
     self.worker_conf_file = worker_conf_file
     self._workers = 0
     # you can customize your http header in init_urlpool()
     self.headers = None
     self._http_exception_code = 900
     if logfile:
         self.logger = init_file_logger(logfile)
     else:
         self.logger = logging.getLogger('xcrawler')
Example #12
class XCrawler(object):
    '''index key-value: {url: state} , state:
        'task': the url is pending as a task
        'done': the url has been downloaded successfully
    '''
    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 worker_conf_file='',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')

    def _worker(self, url):
        '''
            do a task
        '''
        try:
            self.logger.info('start a worker: [%s]' % self._workers)
            proxy, status_code, html, url_real = self.downloader(url)
            if not proxy and status_code == self._http_exception_code:
                status_code, html = self.special_downloader(url)
            if status_code == 200:
                new_urls = self.processor(url_real, html)
                self.urlpool.set_url_done(url)
                self.urlpool.set_url_done(url_real)
                if new_urls:
                    self.urlpool.addmany(new_urls, self.is_good_link)
            else:
                self.logger.error('%sfailed download: %s, [%s]%s' % (
                    RED,
                    url, status_code,
                    NOR,
                ))
                if proxy:
                    self.urlpool.set_url_404(url)
                    self.urlpool.add(url)
                elif (status_code == self._http_exception_code or
                      status_code >= 400):
                    #don't try more if no proxy
                    self.urlpool.set_url_bad(url)
                else:
                    self.urlpool.add(url)
        except:
            traceback.print_exc()
        self._workers -= 1

    def dynamic_max_working(self,):
        try:
            ns = open(self.worker_conf_file).read()
            ns = int(ns)
            self.max_working = ns
        except:
            pass
        msg = '%sset max_working to [%s]. %sworkers:[%s]%s' % (
            BRO,
            self.max_working,
            GRE,
            self._workers,
            NOR)
        print msg

    def start(self):
        self.init_urlpool()
        spawn(self.main_parallel_task_loop)
        self.dynamic_max_working()
        while 1:
            print '%sworkers left: %s%s' % (
                GRE,
                self._workers,
                NOR
            )
            self.dynamic_max_working()
            #if self._workers >= self.max_working:
            #    gevent.sleep(2)
            #    continue
            for i in xrange(self.max_working):
                url = self.urlpool.pop()
                if not url:
                    break
                spawn(self._worker, url)
                self._workers += 1
                #print 'start worker: ', self._workers
                if self._workers >= self.max_working:
                    break
            # wait for workers to start
            gevent.sleep(3)

    def main_parallel_task_loop(self,):
        '''define tasks to run in parallel with the main start() loop'''
        return

    def is_ip_blocked(self, url, html):
        '''
        find ip blocked info in redirected url or html
        '''
        return False

    def is_good_link(self, url):
        '''
        filter out urls you don't want to download;
        re-implement if needed
        '''
        return True

    def init_urlpool(self, urls=None):
        '''
            init url pool with urls
            re-implement your own if needed
        '''
        pass

    def special_downloader(self, url, timeout=20):
        ''' a supplementary downloader used alongside self.downloader();
        e.g. retry with a special proxy when self.downloader() raises an exception
        '''
        return (self._http_exception_code, '')


    def downloader(self, url, timeout=20):
        '''
            download url to get html
            re-implement your own if needed
        '''
        if not self.headers:
            headers = {
                'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)',
            }
        else:
            headers = self.headers
        proxy, to_sleep = self.proxypool.get(url)
        if to_sleep > 10:
            print ('url: %s, proxy: %s ,to_sleep: %s' % (url, proxy, to_sleep))
        status_code = self._http_exception_code
        html = ''
        url_real = url
        try:
            self.logger.debug('%scrawl @[%s]-[%s]%s' % (BLU, time.ctime(), url, NOR))
            if to_sleep:
                gevent.sleep(to_sleep)
            if proxy:
                timeout = 25
                r = requests.get(url, headers=headers, timeout=timeout, proxies=proxy)
            else:
                r = requests.get(url, headers=headers, timeout=timeout)
            html = r.content
            url_real = r.url.encode('utf8') ## get the redirected url
            status_code = r.status_code
            if self.is_ip_blocked(r.url, html):
                html = ''
                status_code = 400
                self.proxypool._pool.remove(proxy)
                print '%sremove proxy: %s, pool size: %s%s' % (
                    BRO, str(proxy), len(self.proxypool._pool), NOR)
        except:
            traceback.print_exc()
            html = ''
        #if status_code == 200:
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.SUCCESS)
        #else:
        #print status_code, url, len(html)
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.FAILED)
        return (proxy, status_code, html, url_real)

    def processor(self, url, html):
        '''
            process the html from downloader
            e.g.
                extract URL, title, content and other info
                save the info extracted from html to DB
        '''
        new_urls = []
        return new_urls
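XCrawler is written as a template: is_good_link(), init_urlpool(), processor() and special_downloader() are the hooks its docstrings ask you to re-implement. A minimal subclass sketch; the site filter and the regex-based link extraction are illustrative only:

import re

class NewsXCrawler(XCrawler):
    # hypothetical subclass: only the template hooks are overridden

    def init_urlpool(self, urls=None):
        # seed the pool with a single hub page
        self.urlpool.add('https://news.example.com/')

    def is_good_link(self, url):
        # keep the crawl inside one (illustrative) site
        return 'news.example.com' in url

    def processor(self, url, html):
        # pull hrefs out of the page and hand them back for scheduling
        if isinstance(html, bytes):
            html = html.decode('utf8', 'ignore')
        return re.findall(r'href=[\'"]?(http[^\'" >]+)', html)

crawler = NewsXCrawler(max_working=10, logfile='news.log')
crawler.start()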
Example #13
class NewsCrawlerSync:
    def __init__(self, name):
        self.db = Connection(config.db_host, config.db_db, config.db_user,
                             config.db_password)
        self.logger = fn.init_file_logger(name + '.log')
        self.urlpool = UrlPool(name)
        self.hub_hosts = None
        self.load_hubs()

    def load_hubs(self, ):
        sql = 'select url from crawler_hub'
        data = self.db.query(sql)
        self.hub_hosts = set()
        hubs = []
        for d in data:
            host = urlparse.urlparse(d['url']).netloc
            self.hub_hosts.add(host)
            hubs.append(d['url'])
        self.urlpool.set_hubs(hubs, 300)

    def save_to_db(self, url, html):
        urlhash = farmhash.hash64(url)
        sql = 'select url from crawler_html where urlhash=%s'
        d = self.db.get(sql, urlhash)
        if d:
            if d['url'] != url:
                msg = 'farmhash collision: %s <=> %s' % (url, d['url'])
                self.logger.error(msg)
            return True
        if isinstance(html, str):
            html = html.encode('utf8')
        html_lzma = lzma.compress(html)
        sql = ('insert into crawler_html(urlhash, url, html_lzma) '
               'values(%s, %s, %s)')
        good = False
        try:
            self.db.execute(sql, urlhash, url, html_lzma)
            good = True
        except Exception as e:
            if e.args[0] == 1062:
                # Duplicate entry
                good = True
                pass
            else:
                traceback.print_exc()
                raise e
        return good

    def filter_good(self, urls):
        goodlinks = []
        for url in urls:
            host = urlparse.urlparse(url).netloc
            if host in self.hub_hosts:
                goodlinks.append(url)
        return goodlinks

    def process(self, url, ishub):
        status, html, redirected_url = fn.downloader(url)
        self.urlpool.set_status(url, status)
        if redirected_url != url:
            self.urlpool.set_status(redirected_url, status)
        # extract links from hub pages; news pages also contain 'related news' links, extract as needed
        if status != 200:
            return
        if ishub:
            newlinks = fn.extract_links_re(redirected_url, html)
            goodlinks = self.filter_good(newlinks)
            print("%s/%s, goodlinks/newlinks" %
                  (len(goodlinks), len(newlinks)))
            self.urlpool.addmany(goodlinks)
        else:
            self.save_to_db(redirected_url, html)

    def run(self, ):
        while 1:
            urls = self.urlpool.pop(5)
            for url, ishub in urls.items():
                self.process(url, ishub)
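process() depends on a helper fn.downloader(url) that returns (status, html, redirected_url); the fn module itself is not part of this listing. A sketch of what such a helper could look like with requests, offered purely as an assumption:

import requests

def downloader(url, timeout=20):
    # hypothetical stand-in for fn.downloader(): returns (status, html, redirected_url)
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; crawler-sketch)'}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        return r.status_code, r.text, r.url
    except requests.RequestException:
        return 0, '', url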
Example #14
#!/usr/bin/env python3
# encoding: utf8
# author: veelion
# file: bee_server.py

from sanic import Sanic
from sanic import response

from urlpool import UrlPool

urlpool = UrlPool(__file__)

# initialize the urlpool; adjust it to your needs
hub_urls = []
urlpool.set_hubs(hub_urls, 300)
urlpool.add('https://news.sina.com.cn/')

# init
app = Sanic(__name__)


@app.listener('after_server_stop')
async def cache_urlpool(app, loop):
    global urlpool
    print('caching urlpool after_server_stop')
    del urlpool
    print('bye!')


@app.route('/task')
async def task_get(request):
Example #15
#!/usr/bin/env python3
# encoding: utf8
# author: veelion
# file: bee_server.py

from sanic import Sanic
from sanic import response

from urlpool import UrlPool

urlpool = UrlPool(__file__)

# initialize the urlpool; adjust it to your needs
urlpool.add('https://news.sina.com.cn/')

# init
app = Sanic(__name__)


@app.listener('after_server_stop')
async def cache_urlpool(app, loop):
    global urlpool
    print('caching urlpool after_server_stop')
    del urlpool
    print('bye!')


@app.route('/task')
async def task_get(request):
    count = request.args.get('count', 10)
    try:
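        # --- hedged sketch of how a handler like this might continue; it assumes
        # UrlPool exposes the pop(count) used in the crawler examples and that
        # worker nodes expect a JSON payload ---
        count = int(count)
    except ValueError:
        count = 10
    urls = urlpool.pop(count)       # {url: ishub}, as in the crawler examples
    return response.json(urls)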
Example #16
class NewsCrawlerAsync:
    def __init__(self, name):
        self._workers = 0
        self._workers_max = 30
        self.logger = fn.init_file_logger(name + '.log')

        self.urlpool = UrlPool(name)

        self.loop = asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.db = sanicdb.SanicDB(config.db_host,
                                  config.db_db,
                                  config.db_user,
                                  config.db_password,
                                  loop=self.loop)

    async def load_hubs(self, ):
        sql = 'select url from crawler_hub'
        data = await self.db.query(sql)
        self.hub_hosts = set()
        hubs = []
        for d in data:
            host = urlparse.urlparse(d['url']).netloc
            self.hub_hosts.add(host)
            hubs.append(d['url'])
        self.urlpool.set_hubs(hubs, 300)

    async def save_to_db(self, url, html):
        urlhash = farmhash.hash64(url)
        sql = 'select url from crawler_html where urlhash=%s'
        d = await self.db.get(sql, urlhash)
        if d:
            if d['url'] != url:
                msg = 'farmhash collision: %s <=> %s' % (url, d['url'])
                self.logger.error(msg)
            return True
        if isinstance(html, str):
            html = html.encode('utf8')
        html_lzma = lzma.compress(html)
        sql = ('insert into crawler_html(urlhash, url, html_lzma) '
               'values(%s, %s, %s)')
        good = False
        try:
            await self.db.execute(sql, urlhash, url, html_lzma)
            good = True
        except Exception as e:
            if e.args[0] == 1062:
                # Duplicate entry
                good = True
                pass
            else:
                traceback.print_exc()
                raise e
        return good

    def filter_good(self, urls):
        goodlinks = []
        for url in urls:
            host = urlparse.urlparse(url).netloc
            if host in self.hub_hosts:
                goodlinks.append(url)
        return goodlinks

    async def process(self, url, ishub):
        status, html, redirected_url = await fn.fetch(self.session, url)
        self.urlpool.set_status(url, status)
        if redirected_url != url:
            self.urlpool.set_status(redirected_url, status)
        # extract links from hub pages; news pages also contain 'related news' links, extract as needed
        if status != 200:
            return
        if ishub:
            newlinks = fn.extract_links_re(redirected_url, html)
            goodlinks = self.filter_good(newlinks)
            print("%s/%s, goodlinks/newlinks" %
                  (len(goodlinks), len(newlinks)))
            self.urlpool.addmany(goodlinks)
        else:
            await self.save_to_db(redirected_url, html)
        self._workers -= 1

    async def loop_crawl(self, ):
        await self.load_hubs()
        last_rating_time = time.time()
        counter = 0
        while 1:
            tasks = self.urlpool.pop(self._workers_max)
            if not tasks:
                print('no url to crawl, sleep')
                await asyncio.sleep(3)
                continue
            for url, ishub in tasks.items():
                self._workers += 1
                counter += 1
                print('crawl:', url)
                asyncio.ensure_future(self.process(url, ishub))

            gap = time.time() - last_rating_time
            if gap > 5:
                rate = counter / gap
                print('\tloop_crawl() rate:%s, counter: %s, workers: %s' %
                      (round(rate, 2), counter, self._workers))
                last_rating_time = time.time()
                counter = 0
            if self._workers > self._workers_max:
                print(
                    '====== got workers_max, sleep 3 sec to next worker =====')
                await asyncio.sleep(3)

    def run(self):
        try:
            self.loop.run_until_complete(self.loop_crawl())
        except KeyboardInterrupt:
            print('stopped by yourself!')
            del self.urlpool
            pass
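Here process() awaits fn.fetch(session, url) and unpacks (status, html, redirected_url); the fn module is again outside this listing. A sketch of what such a helper could look like with aiohttp, offered only as an assumption:

import asyncio
import aiohttp

async def fetch(session, url, timeout=20):
    # hypothetical stand-in for fn.fetch(): returns (status, html, redirected_url)
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
            html = await resp.text(errors='ignore')
            return resp.status, html, str(resp.url)
    except (aiohttp.ClientError, asyncio.TimeoutError):
        return 0, '', url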
Example #17
class XCrawler(object):
    '''index key-value: {url: state} , state:
        'task': the url is pending as a task
        'done': the url has been downloaded successfully
    '''
    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 worker_conf_file='xworkers.conf',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')

    def _worker(self, url):
        '''
            do a task
        '''
        try:
            self.logger.info('start a worker: [%s]' % self._workers)
            proxy, status_code, html, url_real = self.downloader(url)
            if not proxy and status_code == self._http_exception_code:
                status_code, html = self.special_downloader(url)
            if status_code == 200:
                new_urls = self.processor(url_real, html)
                self.urlpool.set_url_done(url)
                self.urlpool.set_url_done(url_real)
                if new_urls:
                    self.urlpool.addmany(new_urls, self.is_good_link)
            else:
                self.logger.error('%sfailed download: %s, [%s]%s' % (
                    RED,
                    url, status_code,
                    NOR,
                ))
                if proxy:
                    self.urlpool.set_url_404(url)
                    self.urlpool.add(url)
                elif (status_code == self._http_exception_code or
                      status_code >= 400):
                    #don't try more if no proxy
                    self.urlpool.set_url_bad(url)
                else:
                    self.urlpool.add(url)
        except:
            traceback.print_exc()
        self._workers -= 1

    def dynamic_max_working(self,):
        try:
            ns = open(self.worker_conf_file).read()
            ns = int(ns)
            self.max_working = ns
        except:
            import os
            cmd = 'echo %s > %s' % (self.max_working, self.worker_conf_file)
            print '!!!!!! ', cmd
            os.system(cmd)
            pass
        msg = '%sset max_working to [%s]. %sworkers:[%s]%s' % (
            BRO,
            self.max_working,
            GRE,
            self._workers,
            NOR)
        print msg

    def start(self):
        self.init_urlpool()
        spawn(self.main_parallel_task_loop)
        self.dynamic_max_working()
        while 1:
            print '%sworkers left: %s%s' % (
                GRE,
                self._workers,
                NOR
            )
            self.dynamic_max_working()
            #if self._workers >= self.max_working:
            #    gevent.sleep(2)
            #    continue
            for i in xrange(self.max_working):
                if self._workers >= self.max_working:
                    break
                url = self.urlpool.pop()
                if not url:
                    break
                spawn(self._worker, url)
                self._workers += 1
                #print 'start worker: ', self._workers

            # wait for workers to start
            gevent.sleep(3)

    def main_parallel_task_loop(self,):
        '''define tasks to run in parallel with the main start() loop'''
        return

    def is_ip_blocked(self, url, html):
        '''
        find ip blocked info in redirected url or html
        '''
        return False

    def is_good_link(self, url):
        '''
        filter out urls you don't want to download;
        re-implement if needed
        '''
        return True

    def init_urlpool(self, urls=None):
        '''
            init url pool with urls
            re-implement your own if needed
        '''
        pass

    def special_downloader(self, url, timeout=20):
        ''' a supplementary downloader used alongside self.downloader();
        e.g. retry with a special proxy when self.downloader() raises an exception
        '''
        return (self._http_exception_code, '')


    def downloader(self, url, timeout=20):
        '''
            download url to get html
            re-implement your own if needed
        '''
        if not self.headers:
            headers = {
                'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)',
            }
        else:
            headers = self.headers
        proxy, to_sleep = self.proxypool.get(url)
        if to_sleep > 10:
            print ('url: %s, proxy: %s ,to_sleep: %s' % (url, proxy, to_sleep))
        status_code = self._http_exception_code
        html = ''
        url_real = url
        try:
            self.logger.debug('%scrawl @[%s]-[%s]%s' % (BLU, time.ctime(), url, NOR))
            if to_sleep:
                gevent.sleep(to_sleep)
            if proxy:
                timeout = 25
                r = requests.get(url, headers=headers, timeout=timeout, proxies=proxy)
            else:
                r = requests.get(url, headers=headers, timeout=timeout)
            html = r.content
            url_real = r.url.encode('utf8') ## get the redirected url
            status_code = r.status_code
            if self.is_ip_blocked(r.url, html):
                html = ''
                status_code = 400
                self.proxypool._pool.remove(proxy)
                print '%sremove proxy: %s, pool size: %s%s' % (
                    BRO, str(proxy), len(self.proxypool._pool), NOR)
        except:
            traceback.print_exc()
            html = ''
        #if status_code == 200:
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.SUCCESS)
        #else:
        #print status_code, url, len(html)
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.FAILED)
        return (proxy, status_code, html, url_real)

    def processor(self, url, html):
        '''
            process the html from downloader
            e.g.
                extract URL, title, content and other info
                save the info extracted from html to DB
        '''
        new_urls = []
        return new_urls
Example #18
class XCrawler(object):
    '''index key-value: {url: state} , state:
        'task': the url is pending as a task
        'done': the url has been downloaded successfully
    '''
    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 max_in_mem=100000,
                 worker_conf_file='xworkers.conf',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               max_in_mem=max_in_mem,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')
        self.failed_urls = {}

        # this queue holds high-priority urls to download;
        # start() will take urls from this queue first
        self.urlqueue = gevent.queue.Queue()

    def _worker(self, url):
        '''
            do a task
        '''
        try:
            self.logger.info('start a worker: [%s]' % self._workers)
            proxy, status_code, html, url_real = self.downloader(url)
            if not proxy and status_code == self._http_exception_code:
                status_code, html = self.special_downloader(url)
            if status_code == 200:
                new_urls = self.processor(url_real, html)
                self.urlpool.set_url_done(url)
                self.urlpool.set_url_done(url_real)
                if new_urls:
                    self.urlpool.addmany(new_urls)
            else:
                self.logger.info('%sfailed download: %s, [%s]%s' % (
                    RED,
                    url, status_code,
                    NOR,
                ))
                if proxy:
                    self.urlpool.set_url_404(url)
                    self.urlpool.add(url)
                elif (status_code == self._http_exception_code or
                      status_code >= 400):
                    # don't try more if no proxy
                    self.urlpool.set_url_bad(url)
                else:
                    t = self.failed_urls.get(url, 0)
                    if t == 0:
                        self.failed_urls[url] = 1
                        self.urlpool.add(url)
                    if t < 3:
                        self.failed_urls[url] += 1
                        self.urlpool.add(url)
                    if t >= 3:
                        self.urlpool.set_url_bad(url)
        except:
            traceback.print_exc()
        self._workers -= 1

    def dynamic_max_working(self,):
        changed = False
        try:
            ns = open(self.worker_conf_file).read()
            ns = int(ns)
            if ns != self.max_working:
                changed = True
                self.max_working = ns
            else:
                changed = False
        except:
            import os
            cmd = 'echo %s > %s' % (self.max_working, self.worker_conf_file)
            print '!!!!!! ', cmd
            os.system(cmd)
            pass
        if changed:
            msg = '%sset max_working to [%s]. %sworkers:[%s]%s' % (
                BRO,
                self.max_working,
                GRE,
                self._workers,
                NOR)
            print msg

    def start(self):
        self.init_urlpool()
        spawn(self.main_parallel_task_loop)
        self.dynamic_max_working()
        self.last_special_crawl = 0
        while 1:
            print '%sworkers left: %s%s' % (
                GRE,
                self._workers,
                NOR
            )
            self.dynamic_max_working()
            for i in xrange(self.max_working):
                if self._workers >= self.max_working:
                    gevent.sleep(10)
                    break
                try:
                    url = self.urlqueue.get_nowait()
                except:
                    url = self.urlpool.pop()
                gap = self.special_crawl_gap(url)
                skip_special = False
                if gap > 0:
                    to_sleep = gap - (time.time() - self.last_special_crawl)
                    if to_sleep > 0:
                        print '\tskip special:'
                        time.sleep(1)
                        self.urlpool.add(url, always=True)
                        skip_special = True
                    else:
                        self.last_special_crawl = time.time()
                if skip_special:
                    continue
                if not url:
                    break
                spawn(self._worker, url)
                self._workers += 1
            # wait for workers to start
            gevent.sleep(3)

    def main_parallel_task_loop(self,):
        '''define tasks to run in parallel with the main start() loop'''
        return

    def special_crawl_gap(self, url):
        ''' re-define to return a gap in seconds to keep between crawls of special urls (0 = no gap)
        '''
        return 0

    def is_ip_blocked(self, url, html):
        '''
        find ip blocked info in redirected url or html
        '''
        return False

    def is_good_link(self, url):
        '''
        filter out urls you don't want to download;
        re-implement if needed
        '''
        return True

    def init_urlpool(self, urls=None):
        '''
            init url pool with urls
            re-implement your own if needed
        '''
        pass

    def special_downloader(self, url, timeout=20):
        ''' a supplementary downloader used alongside self.downloader();
        e.g. retry with a special proxy when self.downloader() raises an exception
        '''
        return (self._http_exception_code, '')

    def downloader(self, url, timeout=20):
        '''
            download url to get html
            re-implement your own if needed
        '''
        if not self.headers:
            ua = ('Mozilla/5.0 (compatible; MSIE 9.0; '
                  'Windows NT 6.1; Win64; x64; Trident/5.0)')
            headers = {
                'User-Agent': ua,
            }
        else:
            headers = self.headers
        proxy, to_sleep = self.proxypool.get(url)
        if to_sleep > 10:
            print ('url: %s, proxy: %s ,to_sleep: %s' % (url, proxy, to_sleep))
        status_code = self._http_exception_code
        html = ''
        url_real = url
        try:
            msg = '%scrawl @[%s]-[%s]%s' % (BLU, time.ctime(), url, NOR)
            self.logger.debug(msg)
            if to_sleep:
                gevent.sleep(to_sleep)
            if proxy:
                timeout = 25
                r = requests.get(url, headers=headers,
                                 timeout=timeout, proxies=proxy)
            else:
                r = requests.get(url, headers=headers, timeout=timeout)
            html = r.content
            url_real = r.url.encode('utf8')  # get the redirected url
            status_code = r.status_code
            if self.is_ip_blocked(r.url, html):
                html = ''
                status_code = 400
                self.proxypool._pool.remove(proxy)
                print '%sremove proxy: %s, pool size: %s%s' % (
                    BRO, str(proxy), len(self.proxypool._pool), NOR)
        except:
            # traceback.print_exc()
            html = ''
        return (proxy, status_code, html, url_real)

    def processor(self, url, html):
        '''
            process the html from downloader
            e.g.
                extract URL, title, content and other info
                save the info extracted from html to DB
        '''
        new_urls = []
        return new_urls
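This variant adds two hooks the earlier XCrawler versions lack: a high-priority urlqueue and special_crawl_gap(), which start() uses to rate-limit particular urls. A hedged subclass sketch; the host name is purely illustrative:

class GentleXCrawler(XCrawler):
    # hypothetical subclass: throttle one fragile site, crawl everything else normally

    def special_crawl_gap(self, url):
        if url and 'fragile.example.com' in url:
            return 30    # keep at least 30 seconds between requests to this host
        return 0         # no extra gap for other urls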