def __init__(self, max_working=20, common_gap=20, urlindex_file="", proxies_file=None,
             span_of_host=3, max_in_mem=100000, worker_conf_file='xworkers.conf',
             load_bad_url=None, logfile=''):
    """Wire up the crawler: proxy pool, URL pool, worker limits, logging and
    the high-priority download queue."""
    self.proxypool = ProxyPool(common_gap, proxies_file)
    self.urlpool = UrlPool(urlindex_file,
                           load_bad_url=load_bad_url,
                           span_of_host=span_of_host,
                           max_in_mem=max_in_mem,
                           is_good_link=self.is_good_link)
    self.max_working = max_working
    self.worker_conf_file = worker_conf_file
    self._workers = 0
    # you can customize your http header in init_urlpool()
    self.headers = None
    # NOTE: 900 presumably tags fetches that raised instead of responding -- confirm in fetch code
    self._http_exception_code = 900
    # log to a dedicated file when one is given, else to the shared logger
    self.logger = init_file_logger(logfile) if logfile else logging.getLogger('xcrawler')
    # per-URL failure bookkeeping
    self.failed_urls = {}
    # high-priority URLs to download; start() drains this queue first
    self.urlqueue = gevent.queue.Queue()
def shall_I_begin(in_str, is_file=False, is_hq=False, need_proxy_pool=False):
    """Entry point: build xiami/netease clients, dispatch on the input kind,
    then start downloading whatever the from_* helpers collected.

    in_str -- a song/playlist URL, or (when is_file) a path to a file of URLs.
    is_hq  -- request high-quality audio from both clients.

    NOTE(review): ppool is only assigned when need_proxy_pool is True, yet it
    is passed to both clients unconditionally -- assumes the module defines a
    default ppool (e.g. None); verify at module level.
    """
    #start terminate_watcher
    Terminate_Watcher()
    global ppool
    if need_proxy_pool:
        LOG.info(u'初始化proxy pool')
        ppool = ProxyPool()
        LOG.info(u'proxy pool:[%d] 初始完毕' % len(ppool.proxies))
    #xiami obj
    xiami_obj = xm.Xiami(config.XIAMI_LOGIN_EMAIL,\
            config.XIAMI_LOGIN_PASSWORD, \
            is_hq,proxies=ppool)
    #netease obj
    m163 = netease.Netease(is_hq, proxies=ppool)
    # dispatch: a file of URLs, a xiami URL, or a 163 URL
    if is_file:
        from_file(xiami_obj, m163, in_str)
    elif re.match(pat_xm, in_str):
        from_url_xm(xiami_obj, in_str)
    elif re.match(pat_163, in_str):
        from_url_163(m163, in_str)
    print border
    # dl_songs is presumably a module-level list filled by the from_* helpers -- TODO confirm
    if len(dl_songs):
        LOG.info(u' 下载任务总数: %d \n 3秒后开始下载' % len(dl_songs))
        sleep(3)
        downloader.start_download(dl_songs)
    else:
        LOG.warning(u' 没有可下载任务,自动退出.')
def __init__(self):
    """Initialize scraper state: retry bookkeeping, program buckets, one proxy
    from the pool, and a browser-like request header."""
    # retry / empty-page bookkeeping
    self.retry_count = 3
    self.empty_count = 0
    self.pre_empty_flag = False
    # buckets for program records gathered during a crawl
    self.enabled_programs = []
    self.unabled_programs = []
    self.collected_programs = []
    # take one proxy up front
    self.proxypool = ProxyPool()
    self.proxy = self.proxypool.get_proxy()
    # present ourselves as desktop Chrome on Linux
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
def shall_I_begin(option):
    """Top-level driver: build clients from *option*, collect songs, filter
    out already-downloaded ones (incremental mode), then start the download.

    option -- parsed command-line object; fields read here: need_proxy_pool,
              proxies, inFile, inUrl, incremental_dl.
    """
    #start terminate_watcher
    Terminate_Watcher()
    global ppool, xiami_obj
    if option.need_proxy_pool:
        LOG.info(msgTxt.init_proxypool)
        ppool = ProxyPool()
        # hand the pool to the clients via the shared option object
        option.proxies = ppool
        LOG.info(msgTxt.fmt_init_proxypool_done %len(ppool.proxies))
    #netease obj
    m163 = netease.Netease(option)
    # dispatch: a file of URLs, a xiami URL, or a 163 URL
    if option.inFile:
        from_file(m163,option)
    elif re.match(pat_xm, option.inUrl):
        # the xiami client is created lazily, only when a xiami URL shows up
        __init_xiami_obj(option)
        from_url_xm(xiami_obj, option.inUrl)
    elif re.match(pat_163, option.inUrl):
        from_url_163(m163, option.inUrl)
    print border
    #here do filtering for incremental download
    skipped_songs = [] #used by incremental_dl
    skipped_hists = [] #used by incremental_dl
    dl_songs = []
    if option.incremental_dl:
        # NOTE(review): total_songs is not defined in this function -- presumably
        # a module-level list filled by the from_* helpers; verify.
        skipped_songs, skipped_hists = hist_handler.filter_songs(total_songs)
        LOG.warning(msgTxt.fmt_skip_dl_nm % len(skipped_songs))
    dl_songs = [song for song in total_songs if song not in skipped_songs]
    dl_num = len(dl_songs)
    skip_num = len(skipped_songs)
    # show "total-skipped=remaining" only when something was skipped
    output_num = '%d' % dl_num if not skip_num else \
        "%d-%d=%d" %(dl_num + skip_num, skip_num, dl_num)
    if len(dl_songs):
        LOG.info(msgTxt.fmt_total_dl_nm % output_num)
        sleep(3)
        downloader.start_download(dl_songs, skipped_hists)
        # test
        # downloader.success_list.extend(dl_songs)
        # downloader.failed_list.extend(dl_songs)
        # downloader.finish_summary(skipped_hists)
        # test
    else:
        LOG.warning(msgTxt.no_dl_task)
def post(self):
    """Handle POST: return up to *num* proxies for *target* as a JSON payload.

    Form arguments (all optional):
      target -- proxy group name; defaults to 'all'
      num    -- max number of proxies to return; defaults to 5
      delay  -- max acceptable score passed as maxscore; defaults to 10

    Always responds 200 with a JSON body; failures are reported through the
    'status'/'err' fields rather than as HTTP error codes.
    """
    target = self.get_argument('target', default='') or 'all'
    num = int(self.get_argument('num', default='') or 5)
    delay = int(self.get_argument('delay', default='') or 10)
    proxypool = ProxyPool()
    try:
        proxies = proxypool.get_many(target=target, num=num, maxscore=delay)
        num_ret = len(proxies)
        mtime = proxypool.get_mtime(target=target)
        # entries are decoded here, so get_many presumably yields bytes
        # (e.g. from redis) -- TODO confirm; was a manual append loop
        proxylist = [proxy.decode('utf-8') for proxy in proxies]
        # 'success-partial' flags a target the pool does not track explicitly
        if str(target).upper() in proxypool.targets:
            status = 'success'
        else:
            status = 'success-partial'
        ret = {
            'status': status,
            'proxylist': {
                'num': num_ret,
                'mtime': mtime,
                'target': target,
                'proxies': proxylist,
            },
        }
    except Exception as e:
        # boundary handler: surface the failure in the payload, not a 500
        ret = {
            'status': 'failure',
            'target': target,
            'err': str(e),
        }
    self.set_header('Content-Type', 'application/json')
    self.write(json.dumps(ret))
def __init__(self, max_working=20, common_gap=20, urlindex_file="", proxies_file=None,
             span_of_host=3, worker_conf_file='xworkers.conf', load_bad_url=None,
             logfile=''):
    """Wire up crawler collaborators: proxy pool, URL pool, worker limits and
    logging."""
    self.proxypool = ProxyPool(common_gap, proxies_file)
    self.urlpool = UrlPool(urlindex_file,
                           load_bad_url=load_bad_url,
                           span_of_host=span_of_host,
                           is_good_link=self.is_good_link)
    self.max_working = max_working
    self.worker_conf_file = worker_conf_file
    self._workers = 0
    # you can customize your http header in init_urlpool()
    self.headers = None
    # NOTE: 900 presumably tags fetches that raised instead of responding -- confirm in fetch code
    self._http_exception_code = 900
    # log to a dedicated file when one is given, else to the shared logger
    self.logger = init_file_logger(logfile) if logfile else logging.getLogger('xcrawler')
def shall_I_begin(url, is_file=False, is_hq=False, need_proxy_pool=False):
    """Build xiami/netease clients and fetch the top-songs listing.

    NOTE(review): despite its signature, this variant ignores is_file and url
    for dispatch -- it only calls getTopSongs().  ppool is only assigned when
    need_proxy_pool is True, yet it is passed to both clients unconditionally;
    assumes a module-level default ppool (e.g. None) -- verify.
    """
    #start terminate_watcher
    # Terminate_Watcher()
    global ppool
    if need_proxy_pool:
        print '初始化proxy pool'
        ppool = ProxyPool()
        print('proxy pool:[%d] 初始完毕' % len(ppool.proxies))
    #xiami obj
    xiami_obj = xm.Xiami(config.XIAMI_LOGIN_EMAIL,\
            config.XIAMI_LOGIN_PASSWORD, \
            is_hq,proxies=ppool)
    #netease obj
    m163 = netease.Netease(is_hq, proxies=ppool)
    # One-off helper that dumps the ranked artist-ID list to txt; only needs to
    # run once -- re-enable whenever the ranking should be refreshed.
    # artistIDs = getTopArtists(xiami_obj, url)
    getTopSongs(xiami_obj)
# -*- coding: utf-8 -*-
import platform
import os

from daemon import daemonize
from proxypool import ProxyPool

if __name__ == '__main__':
    # On Linux, detach and run as a daemon: stdin from /dev/null, stdout and
    # stderr redirected to log files under /tmp.
    if "Linux" in platform.system():
        daemonize(os.getcwd(),
                  '/dev/null',
                  '/tmp/daemon_stdout.log',
                  '/tmp/daemon_error.log')
    # Redis keys for the merged http/https proxy lists and their dedup sets.
    pool_config = {
        'redis_key_http': "merge_http_proxy",
        'redis_key_https': "merge_https_proxy",
        'redis_distinct_set_http': "merge_set_http",
        'redis_distinct_set_https': "merge_set_https",
    }
    ProxyPool(**pool_config).run()
def __init__(self):
    """Create this instance's own ProxyPool with default settings."""
    self.proxyPool = ProxyPool()