def __init__(self, max_working=20, common_gap=20,
             urlindex_file="", proxies_file=None,
             span_of_host=3,
             max_in_mem=100000,
             worker_conf_file='xworkers.conf',
             load_bad_url=None, logfile=''):
    self.proxypool = ProxyPool(common_gap, proxies_file)
    self.urlpool = UrlPool(urlindex_file,
                           load_bad_url=load_bad_url,
                           span_of_host=span_of_host,
                           max_in_mem=max_in_mem,
                           is_good_link=self.is_good_link)
    self.max_working = max_working
    self.worker_conf_file = worker_conf_file
    self._workers = 0
    # you can customize your http header in init_urlpool()
    self.headers = None
    self._http_exception_code = 900
    if logfile:
        self.logger = init_file_logger(logfile)
    else:
        self.logger = logging.getLogger('xcrawler')
    self.failed_urls = {}
    # urls put here have high priority;
    # start() will take urls from this queue first
    self.urlqueue = gevent.queue.Queue()
def shall_I_begin(in_str, is_file=False, is_hq=False, need_proxy_pool=False):
    # start terminate_watcher
    Terminate_Watcher()
    global ppool
    if need_proxy_pool:
        LOG.info(u'初始化proxy pool')
        ppool = ProxyPool()
        LOG.info(u'proxy pool:[%d] 初始完毕' % len(ppool.proxies))

    # xiami obj
    xiami_obj = xm.Xiami(config.XIAMI_LOGIN_EMAIL,
                         config.XIAMI_LOGIN_PASSWORD,
                         is_hq, proxies=ppool)
    # netease obj
    m163 = netease.Netease(is_hq, proxies=ppool)

    if is_file:
        from_file(xiami_obj, m163, in_str)
    elif re.match(pat_xm, in_str):
        from_url_xm(xiami_obj, in_str)
    elif re.match(pat_163, in_str):
        from_url_163(m163, in_str)

    print border
    if len(dl_songs):
        LOG.info(u' 下载任务总数: %d \n 3秒后开始下载' % len(dl_songs))
        sleep(3)
        downloader.start_download(dl_songs)
    else:
        LOG.warning(u' 没有可下载任务,自动退出.')
def __init__(self):
    self.retry_count = 3
    self.empty_count = 0
    self.pre_empty_flag = False
    self.enabled_programs = []
    self.unabled_programs = []
    self.collected_programs = []
    self.proxypool = ProxyPool()
    self.proxy = self.proxypool.get_proxy()
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/'
                      '537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
def post(self):
    target = self.get_argument('target', default='') or 'all'
    num = int(self.get_argument('num', default='') or 5)
    delay = int(self.get_argument('delay', default='') or 10)
    proxypool = ProxyPool()
    try:
        proxies = proxypool.get_many(target=target, num=num, maxscore=delay)
        num_ret = len(proxies)
        mtime = proxypool.get_mtime(target=target)
        proxylist = []
        for proxy in proxies:
            proxylist.append(proxy.decode('utf-8'))
        if str(target).upper() in proxypool.targets:
            status = 'success'
        else:
            status = 'success-partial'
        ret = {
            'status': status,
            'proxylist': {
                'num': num_ret,
                'mtime': mtime,
                'target': target,
                'proxies': proxylist,
            },
        }
    except Exception as e:
        ret = {
            'status': 'failure',
            'target': target,
            'err': str(e),
        }
    self.set_header('Content-Type', 'application/json')
    self.write(json.dumps(ret))
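# Usage sketch: the post() above reads like a Tornado RequestHandler method
# (get_argument / set_header / write). The ProxyHandler name, the /proxy route
# and the port below are assumptions made only to show how such a handler
# could be mounted, not part of the snippet itself.
import tornado.ioloop
import tornado.web

class ProxyHandler(tornado.web.RequestHandler):
    # post() as defined above would live here
    pass

if __name__ == '__main__':
    app = tornado.web.Application([(r'/proxy', ProxyHandler)])
    app.listen(8888)
    tornado.ioloop.IOLoop.current().start()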
def __init__(self, max_working=20, common_gap=20,
             urlindex_file="", proxies_file=None,
             span_of_host=3,
             worker_conf_file='xworkers.conf',
             load_bad_url=None, logfile=''):
    self.proxypool = ProxyPool(common_gap, proxies_file)
    self.urlpool = UrlPool(urlindex_file,
                           load_bad_url=load_bad_url,
                           span_of_host=span_of_host,
                           is_good_link=self.is_good_link)
    self.max_working = max_working
    self.worker_conf_file = worker_conf_file
    self._workers = 0
    # you can customize your http header in init_urlpool()
    self.headers = None
    self._http_exception_code = 900
    if logfile:
        self.logger = init_file_logger(logfile)
    else:
        self.logger = logging.getLogger('xcrawler')
def shall_I_begin(option):
    # start terminate_watcher
    Terminate_Watcher()
    global ppool, xiami_obj
    if option.need_proxy_pool:
        LOG.info(msgTxt.init_proxypool)
        ppool = ProxyPool()
        option.proxies = ppool
        LOG.info(msgTxt.fmt_init_proxypool_done % len(ppool.proxies))

    # netease obj
    m163 = netease.Netease(option)

    if option.inFile:
        from_file(m163, option)
    elif re.match(pat_xm, option.inUrl):
        __init_xiami_obj(option)
        from_url_xm(xiami_obj, option.inUrl)
    elif re.match(pat_163, option.inUrl):
        from_url_163(m163, option.inUrl)

    print border

    # here do filtering for incremental download
    skipped_songs = []   # used by incremental_dl
    skipped_hists = []   # used by incremental_dl
    dl_songs = []
    if option.incremental_dl:
        skipped_songs, skipped_hists = hist_handler.filter_songs(total_songs)
        LOG.warning(msgTxt.fmt_skip_dl_nm % len(skipped_songs))

    dl_songs = [song for song in total_songs if song not in skipped_songs]
    dl_num = len(dl_songs)
    skip_num = len(skipped_songs)
    output_num = '%d' % dl_num if not skip_num else \
        "%d-%d=%d" % (dl_num + skip_num, skip_num, dl_num)

    if len(dl_songs):
        LOG.info(msgTxt.fmt_total_dl_nm % output_num)
        sleep(3)
        downloader.start_download(dl_songs, skipped_hists)
        # test
        # downloader.success_list.extend(dl_songs)
        # downloader.failed_list.extend(dl_songs)
        # downloader.finish_summary(skipped_hists)
        # test
    else:
        LOG.warning(msgTxt.no_dl_task)
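# Sketch of the option object that shall_I_begin(option) expects, inferred from
# the attribute accesses above (need_proxy_pool, proxies, inFile, inUrl,
# incremental_dl). The real object presumably comes from the project's own
# command-line parser; this stand-in class is only illustrative.
class Option(object):
    def __init__(self):
        self.need_proxy_pool = False   # build a ProxyPool before downloading
        self.proxies = None            # set by shall_I_begin() when the pool is built
        self.inFile = ''               # path to a file listing songs/albums, if any
        self.inUrl = ''                # a single xiami or 163 URL to parse
        self.incremental_dl = False    # skip songs already recorded by hist_handler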
def __init__(self, max_working=20, common_gap=20,
             urlindex_file="", proxies_file=None,
             span_of_host=3,
             worker_conf_file='',
             load_bad_url=None, logfile=''):
    self.proxypool = ProxyPool(common_gap, proxies_file)
    self.urlpool = UrlPool(urlindex_file,
                           load_bad_url=load_bad_url,
                           span_of_host=span_of_host,
                           is_good_link=self.is_good_link)
    self.max_working = max_working
    self.worker_conf_file = worker_conf_file
    self._workers = 0
    # you can customize your http header in init_urlpool()
    self.headers = None
    self._http_exception_code = 900
    if logfile:
        self.logger = init_file_logger(logfile)
    else:
        self.logger = logging.getLogger('xcrawler')
def shall_I_begin(url, is_file=False, is_hq=False, need_proxy_pool=False):
    # start terminate_watcher
    # Terminate_Watcher()
    global ppool
    if need_proxy_pool:
        print '初始化proxy pool'
        ppool = ProxyPool()
        print('proxy pool:[%d] 初始完毕' % len(ppool.proxies))

    # xiami obj
    xiami_obj = xm.Xiami(config.XIAMI_LOGIN_EMAIL,
                         config.XIAMI_LOGIN_PASSWORD,
                         is_hq, proxies=ppool)
    # netease obj
    m163 = netease.Netease(is_hq, proxies=ppool)

    # Used to produce the ranked artist-ID list (txt); running it once is
    # enough -- refresh it again whenever you remember to.
    # artistIDs = getTopArtists(xiami_obj, url)
    getTopSongs(xiami_obj)
# -*- coding: utf-8 -*-
import platform
import os

from daemon import daemonize
from proxypool import ProxyPool

if __name__ == '__main__':
    # run as a daemon process
    if "Linux" in platform.system():
        daemonize(os.getcwd(), '/dev/null',
                  '/tmp/daemon_stdout.log', '/tmp/daemon_error.log')

    redis_key_https = "merge_https_proxy"
    redis_key_http = "merge_http_proxy"
    redis_distinct_set_http = "merge_set_http"
    redis_distinct_set_https = "merge_set_https"

    ProxyPool(redis_key_http=redis_key_http,
              redis_key_https=redis_key_https,
              redis_distinct_set_http=redis_distinct_set_http,
              redis_distinct_set_https=redis_distinct_set_https).run()
class Scrapyer(object):
    """
    Function:
        1. search programs from http://www.tvmao.com and collect relative programs info
        2. crawl detail info of programs from http://www.tvmao.com to help classify programs
    """

    def __init__(self):
        self.retry_count = 3
        self.empty_count = 0
        self.pre_empty_flag = False
        self.enabled_programs = []
        self.unabled_programs = []
        self.collected_programs = []
        self.proxypool = ProxyPool()
        self.proxy = self.proxypool.get_proxy()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/'
                          '537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }

    def change_proxy(self):
        """
        change the current proxy
        :return:
        """
        self.proxypool.delete_proxy(self.proxy)
        self.proxy = self.proxypool.get_proxy()

    def check_empty(self, num, source_programs, lock):
        """
        check whether the current proxy is dead, i.e. the result '[]'
        occurs 5 or more times consecutively
        :param num: number of current columns
        :param source_programs: programs need to crawl
        :param lock: lock to access the source_programs
        :return:
        """
        if num == 0:
            if self.pre_empty_flag:
                self.empty_count += 1
                if self.empty_count >= 5:
                    for i in range(5, 0, -1):
                        program = self.unabled_programs[i]
                        if empty_times[program] < 2:
                            self.unabled_programs.pop(i)
                            with lock:
                                source_programs.put(program)
                            empty_times[program] += 1
                    self.change_proxy()
                    self.empty_count = 0
            else:
                self.pre_empty_flag = True
                self.empty_count = 1
        elif self.pre_empty_flag:
            self.pre_empty_flag = False
            self.empty_count = 0

    def collect_programs(self, page_uls, page_columns):
        """
        parse programs from the crawled result by columns
        :param page_uls: all uls in the result
        :param page_columns: all categories in the result
        :return:
        """
        prefix = 'http://www.tvmao.com'
        programs = []
        for column, uls in zip(page_columns, page_uls):
            lis = uls.find_all('li', class_='mr10')
            if len(lis) == 0:
                continue
            if re.search('^(电视剧|电影)', column):
                href_names = [(prefix + li.p.a['href'], li.p.a.get_text()) for li in lis]
            elif re.search('^(综艺|明星|赛事)', column):
                href_names = [(prefix + li.a['href'], li.a['title']) for li in lis]
            else:
                continue
            programs.append(href_names)
        return dict(zip(page_columns, programs))

    def crawl_relative_program(self, program, source_programs, lock):
        """
        crawl relative programs info from http://www.tvmao.com
        :param program:
        :param source_programs: all programs need to crawl
        :param lock: lock to access the source_programs
        :return:
        """
        url = 'http://www.tvmao.com/query.jsp?keys=%s&ed=' % quote(program) + \
              'bOWkp%2BeZveWkq%2BWmh%2BS4iua8lOazoeayq%2BS5i%2BWQu28%3D'

        # crawl the website
        bsObj = None
        self.retry_count = 3
        while self.retry_count > 0:
            try:
                content = requests.get(url, proxies={'http': self.proxy},
                                       headers=self.headers, timeout=2)
                bsObj = BeautifulSoup(content.text, 'html.parser')
                break
            except:
                self.retry_count -= 1
                if self.retry_count <= 0:
                    if DEBUG:
                        print("waiting...")
                    self.change_proxy()
                    self.retry_count = 3

        # parse information
        try:
            page_content = bsObj.find_all('div', class_='page-content')[0]
            page_columns = [
                item.a.get_text() for item in page_content.dl.find_all('dd')
            ]
            page_columns = [
                column for column in page_columns
                if not re.search('^(播出时间|电视频道)', column)
            ]
            page_content_uls = page_content.div.find_all(
                'ul', class_=re.compile('^.+qtable$'), recursive=False)
            if len(page_columns) == 0:
                self.unabled_programs.append(program)
            else:
                self.enabled_programs.append(program)
                column_programs = self.collect_programs(page_content_uls,
                                                        page_columns)
                return {program: column_programs}
            # check whether the current proxy is dead
            self.check_empty(len(page_columns), source_programs, lock)
        except:
            with lock:
                source_programs.put(program)
            self.change_proxy()
        return None

    def run_crawl_relative_programs(self, source_programs, lock, limit=False):
        """
        single process
        :param source_programs: all programs need to crawl
        :param lock: lock to access the source_programs
        :param limit: if size of source_programs has little change, end process when limit is true
        :return: collected programs info, enabled programs, unabled programs
        """
        collected_programs = []
        # count, pre = 0, source_programs.qsize()
        while True:
            try:
                with lock:
                    program = source_programs.get_nowait()
                if DEBUG:
                    print(source_programs.qsize())
                if source_programs.qsize() < 1500:
                    return collected_programs, self.enabled_programs, self.unabled_programs
                # count += 1
                # if count % 50 == 0 and limit:
                #     if pre - source_programs.qsize() < 0:
                #         return collected_programs, self.enabled_programs, self.unabled_programs
                #     pre = source_programs.qsize()
                result = self.crawl_relative_program(program, source_programs, lock)
                if result:
                    collected_programs.append(result)
                time.sleep(randint(0, 1))
            except:
                return collected_programs, self.enabled_programs, self.unabled_programs

    def category_classify(self, category):
        """
        classify by the category from xingchen
        :param category: program intro or program category from xingchen
        :return:
        """
        if re.search('军旅', category):
            return '军事'
        if re.search('纪录片', category):
            return '纪实'
        if re.search('动漫', category):
            return '少儿'
        if re.search('戏剧', category):
            return '戏曲'
        if re.search('真人秀', category):
            return '综艺'
        res = re.search('|'.join(all_categories), category)
        if res:
            return res.group()
        return None

    def intro_classify(self, intro):
        """
        classify the category 'living' into a more accurate category
        :param intro: introduction of the relative program in xingchen
        :return:
        """
        if re.search('军旅', intro):
            return '军事'
        if re.search('纪录片', intro):
            return '纪实'
        if re.search('动漫', intro):
            return '少儿'
        if re.search('戏剧', intro):
            return '戏曲'
        if re.search('真人秀', intro):
            return '综艺'
        res = re.search('|'.join(all_categories), intro)
        if res:
            return res.group()
        return "生活"

    def search_to_classify_program(self, href):
        """
        classify programs by crawling more detail info from xingchen
        :param href: link of the relative program in xingchen
        :return:
        """
        # crawl the website
        bsObj = None
        self.retry_count = 3
        while self.retry_count > 0:
            try:
                content = requests.get(href, proxies={'http': self.proxy},
                                       headers=self.headers, timeout=2)
                if content.status_code != 200:
                    self.retry_count -= 1
                    if self.retry_count <= 0:
                        if DEBUG:
                            print("waiting...")
                        self.change_proxy()
                        self.retry_count = 3
                    continue
                bsObj = BeautifulSoup(content.text, 'html.parser')
                break
            except:
                self.retry_count -= 1
                if self.retry_count <= 0:
                    if DEBUG:
                        print("waiting...")
                    self.change_proxy()
                    self.retry_count = 3

        # classify the program by detail info from the website
        try:
            if re.search('tvcolumn', href):
                res_1 = bsObj.find_all('td', class_='gray pl15')
                if res_1:
                    category = res_1[0].findNext('td').get_text()
                    if category != "生活":
                        category = self.category_classify(category)
                        return category if category else '综艺'
                    div = bsObj.find_all('div', class_='clear more_c')[0]
                    intro = '; '.join([p.get_text() for p in div.find_all('p')])
                    return self.intro_classify(intro)
                else:
                    return '综艺'
            elif re.search('drama', href):
                mark = bsObj.find(text='类别:')
                td = mark.parent.findNext('td')
                category = ' '.join(
                    [a.get_text() for a in td.find_all('a', recursive=False)])
                category = self.category_classify(category)
                return category if category else '电视剧'
        except:
            if DEBUG:
                print("f**k", href)
            return choice(['综艺', '电视剧'])

    def run_search_to_classify_programs(self, source_items, lock):
        """
        single process
        :param source_items: all programs need to crawl more detail info
        :param lock: lock to access source_items
        :return:
        """
        program_cateogry = []
        while True:
            try:
                with lock:
                    item = source_items.get_nowait()
                if DEBUG:
                    print(source_items.qsize())
                category = self.search_to_classify_program(item[2])
                program_cateogry.append((item[0], category))
                time.sleep(randint(0, 1))
            except:
                return program_cateogry
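# Usage sketch: run_crawl_relative_programs() is written against a shared queue
# plus a lock (get_nowait/qsize under "with lock"), so a multiprocessing Manager
# queue fits. DEBUG, empty_times and all_categories are module-level globals in
# the original project, and the program names below are placeholders.
from multiprocessing import Manager, Lock

if __name__ == '__main__':
    manager = Manager()
    source_programs = manager.Queue()
    for name in [u'新闻联播', u'快乐大本营']:
        source_programs.put(name)
    lock = Lock()
    scrapyer = Scrapyer()
    # with so few items this returns almost immediately (qsize() < 1500);
    # shown only to illustrate the call signature and return values
    collected, enabled, unabled = scrapyer.run_crawl_relative_programs(source_programs, lock)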
class XCrawler(object):
    '''index key-value: {url: state}, state:
        'task': the url is pending as a task
        'done': the url has been downloaded successfully
    '''

    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 worker_conf_file='',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')

    def _worker(self, url):
        ''' do a task
        '''
        try:
            self.logger.info('start a worker: [%s]' % self._workers)
            proxy, status_code, html, url_real = self.downloader(url)
            if not proxy and status_code == self._http_exception_code:
                status_code, html = self.special_downloader(url)
            if status_code == 200:
                new_urls = self.processor(url_real, html)
                self.urlpool.set_url_done(url)
                self.urlpool.set_url_done(url_real)
                if new_urls:
                    self.urlpool.addmany(new_urls, self.is_good_link)
            else:
                self.logger.error('%sfailed download: %s, [%s]%s' % (
                    RED, url, status_code, NOR,
                ))
                if proxy:
                    self.urlpool.set_url_404(url)
                    self.urlpool.add(url)
                elif (status_code == self._http_exception_code or
                      status_code >= 400):
                    # don't try more if no proxy
                    self.urlpool.set_url_bad(url)
                else:
                    self.urlpool.add(url)
        except:
            traceback.print_exc()
        self._workers -= 1

    def dynamic_max_working(self):
        try:
            ns = open(self.worker_conf_file).read()
            ns = int(ns)
            self.max_working = ns
        except:
            pass
        msg = '%sset max_working to [%s]. %sworkers:[%s]%s' % (
            BRO, self.max_working,
            GRE, self._workers, NOR)
        print msg

    def start(self):
        self.init_urlpool()
        spawn(self.main_parallel_task_loop)
        self.dynamic_max_working()
        while 1:
            print '%sworkers left: %s%s' % (
                GRE, self._workers, NOR
            )
            self.dynamic_max_working()
            #if self._workers >= self.max_working:
            #    gevent.sleep(2)
            #    continue
            for i in xrange(self.max_working):
                url = self.urlpool.pop()
                if not url:
                    break
                spawn(self._worker, url)
                self._workers += 1
                #print 'start worker: ', self._workers
                if self._workers >= self.max_working:
                    break
            # wait for workers to start
            gevent.sleep(3)

    def main_parallel_task_loop(self):
        '''define the task to do in a main-parallel loop'''
        return

    def is_ip_blocked(self, url, html):
        ''' find ip-blocked info in the redirected url or html
        '''
        return False

    def is_good_link(self, url):
        ''' filter urls which you don't want to download;
        re-implement if needed
        '''
        return True

    def init_urlpool(self, urls=None):
        ''' init the url pool with urls;
        re-implement your own if needed
        '''
        pass

    def special_downloader(self, url, timeout=20):
        ''' define a supplement to self.downloader(), e.g.
        use a special proxy to retry on exceptions in self.downloader()
        '''
        return (self._http_exception_code, '')

    def downloader(self, url, timeout=20):
        ''' download url to get html;
        re-implement your own if needed
        '''
        if not self.headers:
            headers = {
                'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)',
            }
        else:
            headers = self.headers
        proxy, to_sleep = self.proxypool.get(url)
        if to_sleep > 10:
            print ('url: %s, proxy: %s ,to_sleep: %s' % (url, proxy, to_sleep))
        status_code = self._http_exception_code
        html = ''
        url_real = url
        try:
            self.logger.debug('%scrawl @[%s]-[%s]%s' % (BLU, time.ctime(), url, NOR))
            if to_sleep:
                gevent.sleep(to_sleep)
            if proxy:
                timeout = 25
                r = requests.get(url, headers=headers,
                                 timeout=timeout, proxies=proxy)
            else:
                r = requests.get(url, headers=headers, timeout=timeout)
            html = r.content
            url_real = r.url.encode('utf8')  # get the redirected url
            status_code = r.status_code
            if self.is_ip_blocked(r.url, html):
                html = ''
                status_code = 400
                self.proxypool._pool.remove(proxy)
                print '%sremove proxy: %s, pool size: %s%s' % (
                    BRO, str(proxy), len(self.proxypool._pool), NOR)
        except:
            traceback.print_exc()
            html = ''
        #if status_code == 200:
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.SUCCESS)
        #else:
        #    #print status_code, url, len(html)
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.FAILED)
        return (proxy, status_code, html, url_real)

    def processor(self, url, html):
        ''' process the html from downloader, e.g.
            extract URL, title, content and other info;
            save the info extracted from html to DB
        '''
        new_urls = []
        return new_urls
class XCrawler(object):
    '''index key-value: {url: state}, state:
        'task': the url is pending as a task
        'done': the url has been downloaded successfully
    '''

    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 worker_conf_file='xworkers.conf',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')

    def _worker(self, url):
        ''' do a task
        '''
        try:
            self.logger.info('start a worker: [%s]' % self._workers)
            proxy, status_code, html, url_real = self.downloader(url)
            if not proxy and status_code == self._http_exception_code:
                status_code, html = self.special_downloader(url)
            if status_code == 200:
                new_urls = self.processor(url_real, html)
                self.urlpool.set_url_done(url)
                self.urlpool.set_url_done(url_real)
                if new_urls:
                    self.urlpool.addmany(new_urls, self.is_good_link)
            else:
                self.logger.error('%sfailed download: %s, [%s]%s' % (
                    RED, url, status_code, NOR,
                ))
                if proxy:
                    self.urlpool.set_url_404(url)
                    self.urlpool.add(url)
                elif (status_code == self._http_exception_code or
                      status_code >= 400):
                    # don't try more if no proxy
                    self.urlpool.set_url_bad(url)
                else:
                    self.urlpool.add(url)
        except:
            traceback.print_exc()
        self._workers -= 1

    def dynamic_max_working(self):
        try:
            ns = open(self.worker_conf_file).read()
            ns = int(ns)
            self.max_working = ns
        except:
            import os
            cmd = 'echo %s > %s' % (self.max_working, self.worker_conf_file)
            print '!!!!!! ', cmd
            os.system(cmd)
            pass
        msg = '%sset max_working to [%s]. %sworkers:[%s]%s' % (
            BRO, self.max_working,
            GRE, self._workers, NOR)
        print msg

    def start(self):
        self.init_urlpool()
        spawn(self.main_parallel_task_loop)
        self.dynamic_max_working()
        while 1:
            print '%sworkers left: %s%s' % (
                GRE, self._workers, NOR
            )
            self.dynamic_max_working()
            #if self._workers >= self.max_working:
            #    gevent.sleep(2)
            #    continue
            for i in xrange(self.max_working):
                if self._workers >= self.max_working:
                    break
                url = self.urlpool.pop()
                if not url:
                    break
                spawn(self._worker, url)
                self._workers += 1
                #print 'start worker: ', self._workers
            # wait for workers to start
            gevent.sleep(3)

    def main_parallel_task_loop(self):
        '''define the task to do in a main-parallel loop'''
        return

    def is_ip_blocked(self, url, html):
        ''' find ip-blocked info in the redirected url or html
        '''
        return False

    def is_good_link(self, url):
        ''' filter urls which you don't want to download;
        re-implement if needed
        '''
        return True

    def init_urlpool(self, urls=None):
        ''' init the url pool with urls;
        re-implement your own if needed
        '''
        pass

    def special_downloader(self, url, timeout=20):
        ''' define a supplement to self.downloader(), e.g.
        use a special proxy to retry on exceptions in self.downloader()
        '''
        return (self._http_exception_code, '')

    def downloader(self, url, timeout=20):
        ''' download url to get html;
        re-implement your own if needed
        '''
        if not self.headers:
            headers = {
                'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)',
            }
        else:
            headers = self.headers
        proxy, to_sleep = self.proxypool.get(url)
        if to_sleep > 10:
            print ('url: %s, proxy: %s ,to_sleep: %s' % (url, proxy, to_sleep))
        status_code = self._http_exception_code
        html = ''
        url_real = url
        try:
            self.logger.debug('%scrawl @[%s]-[%s]%s' % (BLU, time.ctime(), url, NOR))
            if to_sleep:
                gevent.sleep(to_sleep)
            if proxy:
                timeout = 25
                r = requests.get(url, headers=headers,
                                 timeout=timeout, proxies=proxy)
            else:
                r = requests.get(url, headers=headers, timeout=timeout)
            html = r.content
            url_real = r.url.encode('utf8')  # get the redirected url
            status_code = r.status_code
            if self.is_ip_blocked(r.url, html):
                html = ''
                status_code = 400
                self.proxypool._pool.remove(proxy)
                print '%sremove proxy: %s, pool size: %s%s' % (
                    BRO, str(proxy), len(self.proxypool._pool), NOR)
        except:
            traceback.print_exc()
            html = ''
        #if status_code == 200:
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.SUCCESS)
        #else:
        #    #print status_code, url, len(html)
        #    self.proxypool.record_proxy_state(proxy, self.proxypool.FAILED)
        return (proxy, status_code, html, url_real)

    def processor(self, url, html):
        ''' process the html from downloader, e.g.
            extract URL, title, content and other info;
            save the info extracted from html to DB
        '''
        new_urls = []
        return new_urls
class XCrawler(object):
    '''index key-value: {url: state}, state:
        'task': the url is pending as a task
        'done': the url has been downloaded successfully
    '''

    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 max_in_mem=100000,
                 worker_conf_file='xworkers.conf',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               max_in_mem=max_in_mem,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # you can customize your http header in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')
        self.failed_urls = {}
        # urls put here have high priority;
        # start() will take urls from this queue first
        self.urlqueue = gevent.queue.Queue()

    def _worker(self, url):
        ''' do a task
        '''
        try:
            self.logger.info('start a worker: [%s]' % self._workers)
            proxy, status_code, html, url_real = self.downloader(url)
            if not proxy and status_code == self._http_exception_code:
                status_code, html = self.special_downloader(url)
            if status_code == 200:
                new_urls = self.processor(url_real, html)
                self.urlpool.set_url_done(url)
                self.urlpool.set_url_done(url_real)
                if new_urls:
                    self.urlpool.addmany(new_urls)
            else:
                self.logger.info('%sfailed download: %s, [%s]%s' % (
                    RED, url, status_code, NOR,
                ))
                if proxy:
                    self.urlpool.set_url_404(url)
                    self.urlpool.add(url)
                elif (status_code == self._http_exception_code or
                      status_code >= 400):
                    # don't try more if no proxy
                    self.urlpool.set_url_bad(url)
                else:
                    t = self.failed_urls.get(url, 0)
                    if t == 0:
                        self.failed_urls[url] = 1
                        self.urlpool.add(url)
                    elif t < 3:
                        self.failed_urls[url] += 1
                        self.urlpool.add(url)
                    else:
                        self.urlpool.set_url_bad(url)
        except:
            traceback.print_exc()
        self._workers -= 1

    def dynamic_max_working(self):
        changed = False
        try:
            ns = open(self.worker_conf_file).read()
            ns = int(ns)
            if ns != self.max_working:
                changed = True
                self.max_working = ns
            else:
                changed = False
        except:
            import os
            cmd = 'echo %s > %s' % (self.max_working, self.worker_conf_file)
            print '!!!!!! ', cmd
            os.system(cmd)
            pass
        if changed:
            msg = '%sset max_working to [%s]. %sworkers:[%s]%s' % (
                BRO, self.max_working,
                GRE, self._workers, NOR)
            print msg

    def start(self):
        self.init_urlpool()
        spawn(self.main_parallel_task_loop)
        self.dynamic_max_working()
        self.last_special_crawl = 0
        while 1:
            print '%sworkers left: %s%s' % (
                GRE, self._workers, NOR
            )
            self.dynamic_max_working()
            for i in xrange(self.max_working):
                if self._workers >= self.max_working:
                    gevent.sleep(10)
                    break
                try:
                    url = self.urlqueue.get_nowait()
                except:
                    url = self.urlpool.pop()
                gap = self.special_crawl_gap(url)
                skip_special = False
                if gap > 0:
                    to_sleep = gap - (time.time() - self.last_special_crawl)
                    if to_sleep > 0:
                        print '\tskip special:'
                        time.sleep(1)
                        self.urlpool.add(url, always=True)
                        skip_special = True
                    else:
                        self.last_special_crawl = time.time()
                if skip_special:
                    continue
                if not url:
                    break
                spawn(self._worker, url)
                self._workers += 1
            # wait for workers to start
            gevent.sleep(3)

    def main_parallel_task_loop(self):
        '''define the task to do in a main-parallel loop'''
        return

    def special_crawl_gap(self, url):
        ''' re-define if some sleep time is needed for this url
        '''
        return 0

    def is_ip_blocked(self, url, html):
        ''' find ip-blocked info in the redirected url or html
        '''
        return False

    def is_good_link(self, url):
        ''' filter urls which you don't want to download;
        re-implement if needed
        '''
        return True

    def init_urlpool(self, urls=None):
        ''' init the url pool with urls;
        re-implement your own if needed
        '''
        pass

    def special_downloader(self, url, timeout=20):
        ''' define a supplement to self.downloader(), e.g.
        use a special proxy to retry on exceptions in self.downloader()
        '''
        return (self._http_exception_code, '')

    def downloader(self, url, timeout=20):
        ''' download url to get html;
        re-implement your own if needed
        '''
        if not self.headers:
            ua = ('Mozilla/5.0 (compatible; MSIE 9.0; '
                  'Windows NT 6.1; Win64; x64; Trident/5.0)')
            headers = {
                'User-Agent': ua,
            }
        else:
            headers = self.headers
        proxy, to_sleep = self.proxypool.get(url)
        if to_sleep > 10:
            print ('url: %s, proxy: %s ,to_sleep: %s' % (url, proxy, to_sleep))
        status_code = self._http_exception_code
        html = ''
        url_real = url
        try:
            msg = '%scrawl @[%s]-[%s]%s' % (BLU, time.ctime(), url, NOR)
            self.logger.debug(msg)
            if to_sleep:
                gevent.sleep(to_sleep)
            if proxy:
                timeout = 25
                r = requests.get(url, headers=headers,
                                 timeout=timeout, proxies=proxy)
            else:
                r = requests.get(url, headers=headers, timeout=timeout)
            html = r.content
            url_real = r.url.encode('utf8')  # get the redirected url
            status_code = r.status_code
            if self.is_ip_blocked(r.url, html):
                html = ''
                status_code = 400
                self.proxypool._pool.remove(proxy)
                print '%sremove proxy: %s, pool size: %s%s' % (
                    BRO, str(proxy), len(self.proxypool._pool), NOR)
        except:
            # traceback.print_exc()
            html = ''
        return (proxy, status_code, html, url_real)

    def processor(self, url, html):
        ''' process the html from downloader, e.g.
            extract URL, title, content and other info;
            save the info extracted from html to DB
        '''
        new_urls = []
        return new_urls
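# A minimal subclass sketch showing how the hook methods above are meant to be
# overridden. The MyCrawler name, the seed URL and the host filter are
# illustrative assumptions; only methods defined by XCrawler itself are called.
class MyCrawler(XCrawler):
    def init_urlpool(self, urls=None):
        # seed the pool with a starting page
        self.urlpool.add('http://example.com/')

    def is_good_link(self, url):
        # only follow links that stay on the seed host
        return 'example.com' in url

    def processor(self, url, html):
        # parse html here, store what you need, and return newly found links
        new_urls = []
        return new_urls

if __name__ == '__main__':
    MyCrawler(max_working=10).start()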
def __init__(self):
    self.proxyPool = ProxyPool()