Example #1
    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 max_in_mem=100000,
                 worker_conf_file='xworkers.conf',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               max_in_mem=max_in_mem,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')
        self.failed_urls = {}

        # queue of high-priority URLs to download;
        # start() takes URLs from this queue first
        self.urlqueue = gevent.queue.Queue()
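The constructor above only wires up the proxy pool, URL pool, and logging; the crawl behaviour comes from the overridable hooks (init_urlpool, is_good_link, processor) shown in the full XCrawler class later in this listing. A minimal sketch of a concrete subclass follows; the module name, seed URL, and host filter are assumptions for illustration, not part of the original code.

# Minimal sketch, assuming XCrawler is importable from a module named xcrawler;
# the seed URL and host filter below are made-up examples.
from xcrawler import XCrawler

class MyCrawler(XCrawler):
    def init_urlpool(self, urls=None):
        # seed the url pool; self.headers could also be customized here
        self.urlpool.add('http://example.com/')

    def is_good_link(self, url):
        # only follow links that stay on the seed host (illustrative rule)
        return 'example.com' in url

    def processor(self, url, html):
        # parse html, store what you need, and return newly found URLs
        return []

if __name__ == '__main__':
    MyCrawler(max_working=10, common_gap=5).start()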
Example #2
def shall_I_begin(in_str, is_file=False, is_hq=False, need_proxy_pool=False):
    #start terminate_watcher
    Terminate_Watcher()
    global ppool
    if need_proxy_pool:
        LOG.info(u'initializing proxy pool')
        ppool = ProxyPool()
        LOG.info(u'proxy pool:[%d] ready' % len(ppool.proxies))

    #xiami obj
    xiami_obj = xm.Xiami(config.XIAMI_LOGIN_EMAIL,
                         config.XIAMI_LOGIN_PASSWORD,
                         is_hq, proxies=ppool)
    #netease obj
    m163 = netease.Netease(is_hq, proxies=ppool)

    if is_file:
        from_file(xiami_obj, m163, in_str)
    elif re.match(pat_xm, in_str):
        from_url_xm(xiami_obj, in_str)
    elif re.match(pat_163, in_str):
        from_url_163(m163, in_str)

    print border
    if len(dl_songs):
        LOG.info(u' total download tasks: %d \n download starts in 3 seconds' % len(dl_songs))
        sleep(3)
        downloader.start_download(dl_songs)
    else:
        LOG.warning(u' no downloadable tasks, exiting.')
Example #3
    def __init__(self):
        self.retry_count = 3
        self.empty_count = 0
        self.pre_empty_flag = False

        self.enabled_programs = []
        self.unabled_programs = []
        self.collected_programs = []

        self.proxypool = ProxyPool()
        self.proxy = self.proxypool.get_proxy()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/'
            '537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
Example #4
    def post(self):
        target = self.get_argument('target', default='') or 'all'
        num = int(self.get_argument('num', default='') or 5)
        delay = int(self.get_argument('delay', default='') or 10)

        proxypool = ProxyPool()

        try:
            proxies = proxypool.get_many(target=target,
                                         num=num,
                                         maxscore=delay)
            num_ret = len(proxies)
            mtime = proxypool.get_mtime(target=target)

            proxylist = []
            for proxy in proxies:
                proxylist.append(proxy.decode('utf-8'))

            if str(target).upper() in proxypool.targets:
                status = 'success'
            else:
                status = 'success-partial'

            ret = {
                'status': status,
                'proxylist': {
                    'num': num_ret,
                    'mtime': mtime,
                    'target': target,
                    'proxies': proxylist,
                },
            }
        except Exception as e:
            ret = {
                'status': 'failure',
                'target': target,
                'err': str(e),
            }

        self.set_header('Content-Type', 'application/json')

        self.write(json.dumps(ret))
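The handler accepts three optional form fields (target, num, delay) and always responds with JSON. A client-side sketch of calling it follows; the host, port, and route are assumptions, since the application's URL mapping is not part of this example.

# Client sketch only; http://127.0.0.1:8000/proxies is an assumed address and route.
import json
import requests

resp = requests.post('http://127.0.0.1:8000/proxies',
                     data={'target': 'all', 'num': 5, 'delay': 10})
result = json.loads(resp.text)
if result['status'].startswith('success'):
    for proxy in result['proxylist']['proxies']:
        print(proxy)
else:
    print(result['err'])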
Example #5
    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 worker_conf_file='xworkers.conf',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')
Example #7
def shall_I_begin(option):
    #start terminate_watcher
    Terminate_Watcher()
    global ppool, xiami_obj
    if option.need_proxy_pool:
        LOG.info(msgTxt.init_proxypool)
        ppool = ProxyPool()
        option.proxies = ppool
        LOG.info(msgTxt.fmt_init_proxypool_done %len(ppool.proxies))

    #netease obj
    m163 = netease.Netease(option)

    if option.inFile:
        from_file(m163,option)
    elif re.match(pat_xm, option.inUrl):
        __init_xiami_obj(option)
        from_url_xm(xiami_obj, option.inUrl)
    elif re.match(pat_163, option.inUrl):
        from_url_163(m163, option.inUrl)

    print border
    #here do filtering for incremental download
    skipped_songs = [] #used by incremental_dl
    skipped_hists = [] #used by incremental_dl

    dl_songs = []
    if option.incremental_dl:
        skipped_songs, skipped_hists = hist_handler.filter_songs(total_songs)
        LOG.warning(msgTxt.fmt_skip_dl_nm % len(skipped_songs))

    dl_songs = [song for song in total_songs if song not in skipped_songs]
    dl_num = len(dl_songs)
    skip_num = len(skipped_songs)
    output_num = '%d' % dl_num if not skip_num else \
                 "%d-%d=%d" %(dl_num + skip_num, skip_num, dl_num)
    if len(dl_songs):
        LOG.info(msgTxt.fmt_total_dl_nm % output_num)
        sleep(3)
        downloader.start_download(dl_songs, skipped_hists)
        # test
        # downloader.success_list.extend(dl_songs)
        # downloader.failed_list.extend(dl_songs)
        # downloader.finish_summary(skipped_hists)
        # test
    else:
        LOG.warning(msgTxt.no_dl_task)
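shall_I_begin() only reads attributes off the option object (need_proxy_pool, proxies, inFile, inUrl, incremental_dl, plus whatever netease.Netease() expects), so any namespace-like object will do. A hedged sketch of building one with argparse; the flag spellings are invented, only the destination attribute names come from the code above.

# Hedged sketch: the command-line flag names are assumptions; only the attribute
# names are taken from shall_I_begin(option) above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--url', dest='inUrl', default='')
parser.add_argument('--file', dest='inFile', default='')
parser.add_argument('--proxy', dest='need_proxy_pool', action='store_true')
parser.add_argument('--incremental', dest='incremental_dl', action='store_true')
option = parser.parse_args()
option.proxies = None  # shall_I_begin() replaces this with a ProxyPool when --proxy is set

shall_I_begin(option)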
Example #8
    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 worker_conf_file='',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')
Example #9
def shall_I_begin(url, is_file=False, is_hq=False, need_proxy_pool=False):
    #start terminate_watcher
    # Terminate_Watcher()
    global ppool
    if need_proxy_pool:
        print 'initializing proxy pool'
        ppool = ProxyPool()
        print('proxy pool:[%d] ready' % len(ppool.proxies))

    #xiami obj
    xiami_obj = xm.Xiami(config.XIAMI_LOGIN_EMAIL,
                         config.XIAMI_LOGIN_PASSWORD,
                         is_hq, proxies=ppool)
    #netease obj
    m163 = netease.Netease(is_hq, proxies=ppool)

    # used to build the ranked artist-ID list txt; running it once is enough, refresh it whenever you remember
    # artistIDs = getTopArtists(xiami_obj, url)

    getTopSongs(xiami_obj)
Example #10
# -*- coding: utf-8 -*-
import platform
import os
from daemon import daemonize
from proxypool import ProxyPool

if __name__ == '__main__':
    # run as a daemon process
    if "Linux" in platform.system():
        daemonize(os.getcwd(), '/dev/null',
                  '/tmp/daemon_stdout.log', '/tmp/daemon_error.log')
    redis_key_https = "merge_https_proxy"
    redis_key_http = "merge_http_proxy"
    redis_distinct_set_http = "merge_set_http"
    redis_distinct_set_https = "merge_set_https"
    ProxyPool(redis_key_http=redis_key_http,
              redis_key_https=redis_key_https,
              redis_distinct_set_http=redis_distinct_set_http,
              redis_distinct_set_https=redis_distinct_set_https).run()
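This entry point daemonizes on Linux and then runs a ProxyPool that merges collected proxies into the Redis keys configured above. As a rough sanity check of the result, and assuming the *_set_* keys are Redis sets (which their names suggest) on a local Redis instance, one could inspect them with redis-py as sketched below.

# Illustrative check only; localhost:6379/db 0 and the set data type are assumptions.
import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)
print('http proxies merged: %d' % r.scard('merge_set_http'))
print('https proxies merged: %d' % r.scard('merge_set_https'))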

Example #11
class Scrapyer(object):
    """
    Function:
        1.search programs from http://www.tvmao.com and collect relative programs info
        2.crawl detail info of programs from http://www.tvmao.com to help classify programs
    """
    def __init__(self):
        self.retry_count = 3
        self.empty_count = 0
        self.pre_empty_flag = False

        self.enabled_programs = []
        self.unabled_programs = []
        self.collected_programs = []

        self.proxypool = ProxyPool()
        self.proxy = self.proxypool.get_proxy()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/'
            '537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }

    def change_proxy(self):
        """
        change current proxy
        :return:
        """

        self.proxypool.delete_proxy(self.proxy)
        self.proxy = self.proxypool.get_proxy()

    def check_empty(self, num, source_programs, lock):
        """
        check whether the current proxy is dead,
        i.e. whether the result '[]' has occurred 5 or more times in a row
        :param num: number of current columns
        :param source_programs: programs need to crawl
        :param lock: lock to access the source_programs
        :return:
        """

        if num == 0:
            if self.pre_empty_flag:
                self.empty_count += 1
                if self.empty_count >= 5:
                    for i in range(5, 0, -1):
                        program = self.unabled_programs[i]
                        if empty_times[program] < 2:
                            self.unabled_programs.pop(i)
                            with lock:
                                source_programs.put(program)
                                empty_times[program] += 1
                    self.change_proxy()
                    self.empty_count = 0
            else:
                self.pre_empty_flag = True
                self.empty_count = 1
        elif self.pre_empty_flag:
            self.pre_empty_flag = False
            self.empty_count = 0

    def collect_programs(self, page_uls, page_columns):
        """
        parse programs from the crawled result, column by column
        :param page_uls: all uls in the result
        :param page_columns: all categories in the result
        :return:
        """

        prefix = 'http://www.tvmao.com'

        programs = []
        for column, uls in zip(page_columns, page_uls):
            lis = uls.find_all('li', class_='mr10')
            if len(lis) == 0: continue
            if re.search('^(电视剧|电影)', column):
                href_names = [(prefix + li.p.a['href'], li.p.a.get_text())
                              for li in lis]
            elif re.search('^(综艺|明星|赛事)', column):
                href_names = [(prefix + li.a['href'], li.a['title'])
                              for li in lis]
            else:
                continue
            programs.append(href_names)
        return dict(zip(page_columns, programs))

    def crawl_relative_program(self, program, source_programs, lock):
        """
        crawl relative programs info from http://www.tvmao.com
        :param program:
        :param source_programs: all programs need to crawl
        :param lock: lock to access the source_programs
        :return:
        """

        url = 'http://www.tvmao.com/query.jsp?keys=%s&ed=' % quote(program) + \
              'bOWkp%2BeZveWkq%2BWmh%2BS4iua8lOazoeayq%2BS5i%2BWQu28%3D'

        # crawl the website
        bsObj = None
        self.retry_count = 3
        while self.retry_count > 0:
            try:
                content = requests.get(url,
                                       proxies={'http': self.proxy},
                                       headers=self.headers,
                                       timeout=2)
                bsObj = BeautifulSoup(content.text, 'html.parser')
                break
            except:
                self.retry_count -= 1
                if self.retry_count <= 0:
                    if DEBUG: print("waiting...")
                    self.change_proxy()
                    self.retry_count = 3

        # parse information
        try:
            page_content = bsObj.find_all('div', class_='page-content')[0]
            page_columns = [
                item.a.get_text() for item in page_content.dl.find_all('dd')
            ]
            page_columns = [
                column for column in page_columns
                if not re.search('^(播出时间|电视频道)', column)
            ]
            page_content_uls = page_content.div.find_all(
                'ul', class_=re.compile('^.+qtable$'), recursive=False)
            if len(page_columns) == 0:
                self.unabled_programs.append(program)
            else:
                self.enabled_programs.append(program)
                column_programs = self.collect_programs(
                    page_content_uls, page_columns)
                return {program: column_programs}

            # check whether the current proxy is dead
            self.check_empty(len(page_columns), source_programs, lock)
        except:
            with lock:
                source_programs.put(program)
            self.change_proxy()
            return None

    def run_crawl_relative_programs(self, source_programs, lock, limit=False):
        """
        single process
        :param source_programs: all programs need to crawl
        :param lock: lock to access the source_programs
        :param limit: when true, end the process if the size of source_programs barely changes
        :return: collected programs info, enabled programs, unabled programs
        """

        collected_programs = []
        # count, pre = 0, source_programs.qsize()
        while True:
            try:
                with lock:
                    program = source_programs.get_nowait()
                if DEBUG: print(source_programs.qsize())

                if source_programs.qsize() < 1500:
                    return collected_programs, self.enabled_programs, self.unabled_programs
                # count += 1
                # if count % 50 == 0 and limit:
                #     if pre - source_programs.qsize() < 0:
                #         return collected_programs, self.enabled_programs, self.unabled_programs
                # pre = source_programs.qsize()

                result = self.crawl_relative_program(program, source_programs,
                                                     lock)
                if result: collected_programs.append(result)
                time.sleep(randint(0, 1))
            except:
                return collected_programs, self.enabled_programs, self.unabled_programs

    def category_classify(self, category):
        """
        classify by the category from xingchen
        :param category: program intro or program category from xingchen
        :return:
        """

        if re.search('军旅', category): return '军事'
        if re.search('纪录片', category): return '纪实'
        if re.search('动漫', category): return '少儿'
        if re.search('戏剧', category): return '戏曲'
        if re.search('真人秀', category): return '综艺'
        res = re.search('|'.join(all_categories), category)
        if res: return res.group()
        return None

    def intro_classify(self, intro):
        """
        classify the category 'living' into more accurate category
        :param intro: introduction of the relative program in xingchen
        :return:
        """

        if re.search('军旅', intro): return '军事'
        if re.search('纪录片', intro): return '纪实'
        if re.search('动漫', intro): return '少儿'
        if re.search('戏剧', intro): return '戏曲'
        if re.search('真人秀', intro): return '综艺'
        res = re.search('|'.join(all_categories), intro)
        if res: return res.group()
        return "生活"

    def search_to_classify_program(self, href):
        """
        classify programs by crawling more detail info from xingchen
        :param href: link of the relative program in xingchen
        :return:
        """

        # crawling the website
        bsObj = None
        self.retry_count = 3
        while self.retry_count > 0:
            try:
                content = requests.get(href,
                                       proxies={'http': self.proxy},
                                       headers=self.headers,
                                       timeout=2)
                if content.status_code != 200:
                    self.retry_count -= 1
                    if self.retry_count <= 0:
                        if DEBUG: print("waiting...")
                        self.change_proxy()
                        self.retry_count = 3
                    continue
                bsObj = BeautifulSoup(content.text, 'html.parser')
                break
            except:
                self.retry_count -= 1
                if self.retry_count <= 0:
                    if DEBUG: print("waiting...")
                    self.change_proxy()
                    self.retry_count = 3

        # classify the program by detail info from website
        try:
            if re.search('tvcolumn', href):
                res_1 = bsObj.find_all('td', class_='gray pl15')
                if res_1:
                    category = res_1[0].findNext('td').get_text()
                    if category != "生活":
                        category = self.category_classify(category)
                        return category if category else '综艺'
                    div = bsObj.find_all('div', class_='clear more_c')[0]
                    intro = '; '.join(
                        [p.get_text() for p in div.find_all('p')])
                    return self.intro_classify(intro)
                else:
                    return '综艺'
            elif re.search('drama', href):
                mark = bsObj.find(text='类别:')
                td = mark.parent.findNext('td')
                category = ' '.join(
                    [a.get_text() for a in td.find_all('a', recursive=False)])
                category = self.category_classify(category)
                return category if category else '电视剧'
        except:
            if DEBUG: print("f**k", href)
            return choice(['综艺', '电视剧'])

    def run_search_to_classify_programs(self, source_items, lock):
        """
        single process
        :param source_items: all programs need to crawl more detail info
        :param lock: lock to access source_items
        :return:
        """

        program_category = []
        while True:
            try:
                with lock:
                    item = source_items.get_nowait()
                if DEBUG: print(source_items.qsize())
                category = self.search_to_classify_program(item[2])
                program_category.append((item[0], category))
                time.sleep(randint(0, 1))
            except:
                return program_category
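Scrapyer pulls work items from a shared queue under a lock and relies only on get_nowait()/put()/qsize() plus a context-manager lock, so a plain thread-safe queue is enough for a single-process run. A hedged usage sketch of the classification pass follows; it assumes the module-level names the class references (DEBUG, empty_times, all_categories, requests, BeautifulSoup, ProxyPool) are defined in its module, and the queued item is made up, needing only the program name at index 0 and a tvmao link at index 2.

# Usage sketch, single process; the queued item is a made-up example.
import threading
try:
    import queue            # Python 3
except ImportError:
    import Queue as queue   # Python 2

items = queue.Queue()
lock = threading.Lock()
items.put(('some program', None, 'http://www.tvmao.com/drama/example'))

scrapyer = Scrapyer()
print(scrapyer.run_search_to_classify_programs(items, lock))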
Example #12
class XCrawler(object):
    '''index key-value: {url: state}, state:
        'task': the url is pending as a task
        'done': the url has been downloaded successfully
    '''
    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 worker_conf_file='',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')

    def _worker(self, url):
        '''
            do a task
        '''
        try:
            self.logger.info('start a worker: [%s]' % self._workers)
            proxy, status_code, html, url_real = self.downloader(url)
            if not proxy and status_code == self._http_exception_code:
                status_code, html = self.special_downloader(url)
            if status_code == 200:
                new_urls = self.processor(url_real, html)
                self.urlpool.set_url_done(url)
                self.urlpool.set_url_done(url_real)
                if new_urls:
                    self.urlpool.addmany(new_urls, self.is_good_link)
            else:
                self.logger.error('%sfailed download: %s, [%s]%s' % (
                    RED,
                    url, status_code,
                    NOR,
                ))
                if proxy:
                    self.urlpool.set_url_404(url)
                    self.urlpool.add(url)
                elif (status_code == self._http_exception_code or
                      status_code >= 400):
                    #don't try more if no proxy
                    self.urlpool.set_url_bad(url)
                else:
                    self.urlpool.add(url)
        except:
            traceback.print_exc()
        self._workers -= 1

    def dynamic_max_working(self,):
        try:
            ns = open(self.worker_conf_file).read()
            ns = int(ns)
            self.max_working = ns
        except:
            pass
        msg = '%sset max_working to [%s]. %sworkers:[%s]%s' % (
            BRO,
            self.max_working,
            GRE,
            self._workers,
            NOR)
        print msg

    def start(self):
        self.init_urlpool()
        spawn(self.main_parallel_task_loop)
        self.dynamic_max_working()
        while 1:
            print '%sworkers left: %s%s' % (
                GRE,
                self._workers,
                NOR
            )
            self.dynamic_max_working()
            #if self._workers >= self.max_working:
            #    gevent.sleep(2)
            #    continue
            for i in xrange(self.max_working):
                url = self.urlpool.pop()
                if not url:
                    break
                spawn(self._worker, url)
                self._workers += 1
                #print 'start worker: ', self._workers
                if self._workers >= self.max_working:
                    break
            # wait for workers to start
            gevent.sleep(3)

    def main_parallel_task_loop(self,):
        '''define the task to do in a main-parallel loop'''
        return

    def is_ip_blocked(self, url, html):
        '''
        find ip blocked info in redirected url or html
        '''
        return False

    def is_good_link(self, url):
        '''
        filter url which you don't want to download
        re-implement if needed
        '''
        return True

    def init_urlpool(self, urls=None):
        '''
            init url pool with urls
            re-implement your own if needed
        '''
        pass

    def special_downloader(self, url, timeout=20):
        ''' define a supplement to self.downloader(),
        e.g. retry with a special proxy when self.downloader() raises an Exception
        '''
        return (self._http_exception_code, '')


    def downloader(self, url, timeout=20):
        '''
            download url to get html
            re-implement your own if needed
        '''
        if not self.headers:
            headers = {
                'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)',
            }
        else:
            headers = self.headers
        proxy, to_sleep = self.proxypool.get(url)
        if to_sleep > 10:
            print ('url: %s, proxy: %s ,to_sleep: %s' % (url, proxy, to_sleep))
        status_code = self._http_exception_code
        html = ''
        url_real = url
        try:
            self.logger.debug('%scrawl @[%s]-[%s]%s' % (BLU, time.ctime(), url, NOR))
            if to_sleep:
                gevent.sleep(to_sleep)
            if proxy:
                timeout = 25
                r = requests.get(url, headers=headers, timeout=timeout, proxies=proxy)
            else:
                r = requests.get(url, headers=headers, timeout=timeout)
            html = r.content
            url_real = r.url.encode('utf8') ## get the redirected url
            status_code = r.status_code
            if self.is_ip_blocked(r.url, html):
                html = ''
                status_code = 400
                self.proxypool._pool.remove(proxy)
                print '%sremove proxy: %s, pool size: %s%s' % (
                    BRO, str(proxy), len(self.proxypool._pool), NOR)
        except:
            traceback.print_exc()
            html = ''
        #if status_code == 200:
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.SUCCESS)
        #else:
        #print status_code, url, len(html)
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.FAILED)
        return (proxy, status_code, html, url_real)

    def processor(self, url, html):
        '''
            process the html from downloader
            e.g.
                extract URL, title, content and other info
                save the info extracted from html to DB
        '''
        new_urls = []
        return new_urls
Example #13
class XCrawler(object):
    '''index key-value: {url: state}, state:
        'task': the url is pending as a task
        'done': the url has been downloaded successfully
    '''
    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 worker_conf_file='xworkers.conf',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')

    def _worker(self, url):
        '''
            do a task
        '''
        try:
            self.logger.info('start a worker: [%s]' % self._workers)
            proxy, status_code, html, url_real = self.downloader(url)
            if not proxy and status_code == self._http_exception_code:
                status_code, html = self.special_downloader(url)
            if status_code == 200:
                new_urls = self.processor(url_real, html)
                self.urlpool.set_url_done(url)
                self.urlpool.set_url_done(url_real)
                if new_urls:
                    self.urlpool.addmany(new_urls, self.is_good_link)
            else:
                self.logger.error('%sfailed download: %s, [%s]%s' % (
                    RED,
                    url, status_code,
                    NOR,
                ))
                if proxy:
                    self.urlpool.set_url_404(url)
                    self.urlpool.add(url)
                elif (status_code == self._http_exception_code or
                      status_code >= 400):
                    #don't try more if no proxy
                    self.urlpool.set_url_bad(url)
                else:
                    self.urlpool.add(url)
        except:
            traceback.print_exc()
        self._workers -= 1

    def dynamic_max_working(self,):
        try:
            ns = open(self.worker_conf_file).read()
            ns = int(ns)
            self.max_working = ns
        except:
            import os
            cmd = 'echo %s > %s' % (self.max_working, self.worker_conf_file)
            print '!!!!!! ', cmd
            os.system(cmd)
            pass
        msg = '%sset max_working to [%s]. %sworkers:[%s]%s' % (
            BRO,
            self.max_working,
            GRE,
            self._workers,
            NOR)
        print msg

    def start(self):
        self.init_urlpool()
        spawn(self.main_parallel_task_loop)
        self.dynamic_max_working()
        while 1:
            print '%sworkers left: %s%s' % (
                GRE,
                self._workers,
                NOR
            )
            self.dynamic_max_working()
            #if self._workers >= self.max_working:
            #    gevent.sleep(2)
            #    continue
            for i in xrange(self.max_working):
                if self._workers >= self.max_working:
                    break
                url = self.urlpool.pop()
                if not url:
                    break
                spawn(self._worker, url)
                self._workers += 1
                #print 'start worker: ', self._workers

            # wait for workers to start
            gevent.sleep(3)

    def main_parallel_task_loop(self,):
        '''define the task to do in a main-parallel loop'''
        return

    def is_ip_blocked(self, url, html):
        '''
        find ip blocked info in redirected url or html
        '''
        return False

    def is_good_link(self, url):
        '''
        filter url which you don't want to download
        re-implement if needed
        '''
        return True

    def init_urlpool(self, urls=None):
        '''
            init url pool with urls
            re-implement your own if needed
        '''
        pass

    def special_downloader(self, url, timeout=20):
        ''' define a supplement to self.downloader(),
        e.g. retry with a special proxy when self.downloader() raises an Exception
        '''
        return (self._http_exception_code, '')


    def downloader(self, url, timeout=20):
        '''
            download url to get html
            re-implement your own if needed
        '''
        if not self.headers:
            headers = {
                'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)',
            }
        else:
            headers = self.headers
        proxy, to_sleep = self.proxypool.get(url)
        if to_sleep > 10:
            print ('url: %s, proxy: %s ,to_sleep: %s' % (url, proxy, to_sleep))
        status_code = self._http_exception_code
        html = ''
        url_real = url
        try:
            self.logger.debug('%scrawl @[%s]-[%s]%s' % (BLU, time.ctime(), url, NOR))
            if to_sleep:
                gevent.sleep(to_sleep)
            if proxy:
                timeout = 25
                r = requests.get(url, headers=headers, timeout=timeout, proxies=proxy)
            else:
                r = requests.get(url, headers=headers, timeout=timeout)
            html = r.content
            url_real = r.url.encode('utf8') ## get the redirected url
            status_code = r.status_code
            if self.is_ip_blocked(r.url, html):
                html = ''
                status_code = 400
                self.proxypool._pool.remove(proxy)
                print '%sremove proxy: %s, pool size: %s%s' % (
                    BRO, str(proxy), len(self.proxypool._pool), NOR)
        except:
            traceback.print_exc()
            html = ''
        #if status_code == 200:
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.SUCCESS)
        #else:
        #print status_code, url, len(html)
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.FAILED)
        return (proxy, status_code, html, url_real)

    def processor(self, url, html):
        '''
            process the html from downloader
            e.g.
                extract URL, title, content and other info
                save the info extracted from html to DB
        '''
        new_urls = []
        return new_urls
Example #14
class XCrawler(object):
    '''index key-value: {url: state}, state:
        'task': the url is pending as a task
        'done': the url has been downloaded successfully
    '''
    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 max_in_mem=100000,
                 worker_conf_file='xworkers.conf',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               max_in_mem=max_in_mem,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')
        self.failed_urls = {}

        # queue of high-priority URLs to download;
        # start() takes URLs from this queue first
        self.urlqueue = gevent.queue.Queue()

    def _worker(self, url):
        '''
            do a task
        '''
        try:
            self.logger.info('start a worker: [%s]' % self._workers)
            proxy, status_code, html, url_real = self.downloader(url)
            if not proxy and status_code == self._http_exception_code:
                status_code, html = self.special_downloader(url)
            if status_code == 200:
                new_urls = self.processor(url_real, html)
                self.urlpool.set_url_done(url)
                self.urlpool.set_url_done(url_real)
                if new_urls:
                    self.urlpool.addmany(new_urls)
            else:
                self.logger.info('%sfailed download: %s, [%s]%s' % (
                    RED,
                    url, status_code,
                    NOR,
                ))
                if proxy:
                    self.urlpool.set_url_404(url)
                    self.urlpool.add(url)
                elif (status_code == self._http_exception_code or
                      status_code >= 400):
                    # don't try more if no proxy
                    self.urlpool.set_url_bad(url)
                else:
                    t = self.failed_urls.get(url, 0)
                    if t == 0:
                        self.failed_urls[url] = 1
                        self.urlpool.add(url)
                    elif t < 3:
                        self.failed_urls[url] += 1
                        self.urlpool.add(url)
                    else:
                        self.urlpool.set_url_bad(url)
        except:
            traceback.print_exc()
        self._workers -= 1

    def dynamic_max_working(self,):
        changed = False
        try:
            ns = open(self.worker_conf_file).read()
            ns = int(ns)
            if ns != self.max_working:
                changed = True
                self.max_working = ns
            else:
                changed = False
        except:
            import os
            cmd = 'echo %s > %s' % (self.max_working, self.worker_conf_file)
            print '!!!!!! ', cmd
            os.system(cmd)
            pass
        if changed:
            msg = '%sset max_working to [%s]. %sworkers:[%s]%s' % (
                BRO,
                self.max_working,
                GRE,
                self._workers,
                NOR)
            print msg

    def start(self):
        self.init_urlpool()
        spawn(self.main_parallel_task_loop)
        self.dynamic_max_working()
        self.last_special_crawl = 0
        while 1:
            print '%sworkers left: %s%s' % (
                GRE,
                self._workers,
                NOR
            )
            self.dynamic_max_working()
            for i in xrange(self.max_working):
                if self._workers >= self.max_working:
                    gevent.sleep(10)
                    break
                try:
                    url = self.urlqueue.get_nowait()
                except:
                    url = self.urlpool.pop()
                gap = self.special_crawl_gap(url)
                skip_special = False
                if gap > 0:
                    to_sleep = gap - (time.time() - self.last_special_crawl)
                    if to_sleep > 0:
                        print '\tskip special:'
                        time.sleep(1)
                        self.urlpool.add(url, always=True)
                        skip_special = True
                    else:
                        self.last_special_crawl = time.time()
                if skip_special:
                    continue
                if not url:
                    break
                spawn(self._worker, url)
                self._workers += 1
            # wait for workers to start
            gevent.sleep(3)

    def main_parallel_task_loop(self,):
        '''define the task to do in a main-parallel loop'''
        return

    def special_crawl_gap(self, url):
        ''' re-define to return a crawl gap in seconds for urls that need throttling
        '''
        return 0

    def is_ip_blocked(self, url, html):
        '''
        find ip blocked info in redirected url or html
        '''
        return False

    def is_good_link(self, url):
        '''
        filter url which you don't want to download
        re-implement if needed
        '''
        return True

    def init_urlpool(self, urls=None):
        '''
            init url pool with urls
            re-implement your own if needed
        '''
        pass

    def special_downloader(self, url, timeout=20):
        ''' define a supplement to self.downloader(),
        e.g. retry with a special proxy when self.downloader() raises an Exception
        '''
        return (self._http_exception_code, '')

    def downloader(self, url, timeout=20):
        '''
            download url to get html
            re-implement your own if needed
        '''
        if not self.headers:
            ua = ('Mozilla/5.0 (compatible; MSIE 9.0; '
                  'Windows NT 6.1; Win64; x64; Trident/5.0)')
            headers = {
                'User-Agent': ua,
            }
        else:
            headers = self.headers
        proxy, to_sleep = self.proxypool.get(url)
        if to_sleep > 10:
            print ('url: %s, proxy: %s ,to_sleep: %s' % (url, proxy, to_sleep))
        status_code = self._http_exception_code
        html = ''
        url_real = url
        try:
            msg = '%scrawl @[%s]-[%s]%s' % (BLU, time.ctime(), url, NOR)
            self.logger.debug(msg)
            if to_sleep:
                gevent.sleep(to_sleep)
            if proxy:
                timeout = 25
                r = requests.get(url, headers=headers,
                                 timeout=timeout, proxies=proxy)
            else:
                r = requests.get(url, headers=headers, timeout=timeout)
            html = r.content
            url_real = r.url.encode('utf8')  # get the redirected url
            status_code = r.status_code
            if self.is_ip_blocked(r.url, html):
                html = ''
                status_code = 400
                self.proxypool._pool.remove(proxy)
                print '%sremove proxy: %s, pool size: %s%s' % (
                    BRO, str(proxy), len(self.proxypool._pool), NOR)
        except:
            # traceback.print_exc()
            html = ''
        return (proxy, status_code, html, url_real)

    def processor(self, url, html):
        '''
            process the html from downloader
            e.g.
                extract URL, title, content and other info
                save the info extracted from html to DB
        '''
        new_urls = []
        return new_urls
Example #15
    def __init__(self):
        self.proxyPool = ProxyPool()
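Across these examples the smallest ProxyPool surface is the get_proxy()/delete_proxy() pair used by Scrapyer. A closing sketch of that fetch-and-retry pattern follows; the import path matches the `from proxypool import ProxyPool` line seen earlier, while the target URL and retry count are arbitrary illustrative choices, not taken from any one example.

# Hedged sketch of the get_proxy()/delete_proxy() pattern; URL and retry count are illustrative.
import requests
from proxypool import ProxyPool

pool = ProxyPool()
proxy = pool.get_proxy()
for _ in range(3):
    try:
        resp = requests.get('http://httpbin.org/ip',
                            proxies={'http': proxy}, timeout=5)
        print(resp.text)
        break
    except requests.RequestException:
        pool.delete_proxy(proxy)   # drop the dead proxy and try another
        proxy = pool.get_proxy()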