Example #1
    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 max_in_mem=100000,
                 worker_conf_file='xworkers.conf',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               max_in_mem=max_in_mem,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # HTTP headers can be customized in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')
        self.failed_urls = {}

        # queue for high-priority URLs to download;
        # start() takes URLs from this queue first
        self.urlqueue = gevent.queue.Queue()
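A hedged usage sketch for this initializer, assuming the enclosing class is named XCrawler (the class statement is not shown; the name is inferred from the 'xcrawler' logger):

# Hypothetical usage; the XCrawler name and proxies.txt path are assumptions.
crawler = XCrawler(max_working=10, common_gap=30,
                   proxies_file='proxies.txt', logfile='crawler.log')
# start() consumes this queue first, so high-priority URLs can be
# pushed directly:
crawler.urlqueue.put('https://example.com/priority-page')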
Example #2
def shall_I_begin(in_str, is_file=False, is_hq=False, need_proxy_pool=False):
    #start terminate_watcher
    Terminate_Watcher()
    global ppool
    if need_proxy_pool:
        LOG.info(u'initializing proxy pool')
        ppool = ProxyPool()
        LOG.info(u'proxy pool:[%d] initialized' % len(ppool.proxies))

    #xiami obj
    xiami_obj = xm.Xiami(config.XIAMI_LOGIN_EMAIL,
                         config.XIAMI_LOGIN_PASSWORD,
                         is_hq, proxies=ppool)
    #netease obj
    m163 = netease.Netease(is_hq, proxies=ppool)

    if is_file:
        from_file(xiami_obj, m163, in_str)
    elif re.match(pat_xm, in_str):
        from_url_xm(xiami_obj, in_str)
    elif re.match(pat_163, in_str):
        from_url_163(m163, in_str)

    print border
    if len(dl_songs):
        LOG.info(u' total download tasks: %d \n download starts in 3 seconds' % len(dl_songs))
        sleep(3)
        downloader.start_download(dl_songs)
    else:
        LOG.warning(u' no downloadable tasks, exiting automatically.')
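A hedged invocation sketch for this entry point; the URL is a placeholder for something matched by pat_xm, not a value taken from the example:

# Illustrative call; the song URL is hypothetical.
shall_I_begin('http://www.xiami.com/song/1234567',
              is_file=False, is_hq=True, need_proxy_pool=True)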
Example #3
    def __init__(self):
        self.retry_count = 3
        self.empty_count = 0
        self.pre_empty_flag = False

        self.enabled_programs = []
        self.unabled_programs = []
        self.collected_programs = []

        self.proxypool = ProxyPool()
        self.proxy = self.proxypool.get_proxy()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/'
            '537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
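A minimal sketch of how the proxy and headers prepared here might be used, assuming the requests library and that get_proxy() returns an "ip:port" string (neither is confirmed by the snippet):

import requests

def fetch(self, url):
    # Hypothetical helper, not part of the example above; assumes
    # self.proxy is an "ip:port" string.
    proxies = {'http': 'http://%s' % self.proxy,
               'https': 'http://%s' % self.proxy}
    return requests.get(url, headers=self.headers,
                        proxies=proxies, timeout=10)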
Example #4
def shall_I_begin(option):
    #start terminate_watcher
    Terminate_Watcher()
    global ppool, xiami_obj
    if option.need_proxy_pool:
        LOG.info(msgTxt.init_proxypool)
        ppool = ProxyPool()
        option.proxies = ppool
        LOG.info(msgTxt.fmt_init_proxypool_done % len(ppool.proxies))

    #netease obj
    m163 = netease.Netease(option)

    if option.inFile:
        from_file(m163, option)
    elif re.match(pat_xm, option.inUrl):
        __init_xiami_obj(option)
        from_url_xm(xiami_obj, option.inUrl)
    elif re.match(pat_163, option.inUrl):
        from_url_163(m163, option.inUrl)

    print border
    #here do filtering for incremental download
    skipped_songs = [] #used by incremental_dl
    skipped_hists = [] #used by incremental_dl

    dl_songs = []
    if option.incremental_dl:
        skipped_songs, skipped_hists = hist_handler.filter_songs(total_songs)
        LOG.warning(msgTxt.fmt_skip_dl_nm % len(skipped_songs))

    dl_songs = [song for song in total_songs if song not in skipped_songs]
    dl_num = len(dl_songs)
    skip_num = len(skipped_songs)
    output_num = '%d' % dl_num if not skip_num else \
                 '%d-%d=%d' % (dl_num + skip_num, skip_num, dl_num)
    if len(dl_songs):
        LOG.info(msgTxt.fmt_total_dl_nm % output_num)
        sleep(3)
        downloader.start_download(dl_songs, skipped_hists)
        # test
        # downloader.success_list.extend(dl_songs)
        # downloader.failed_list.extend(dl_songs)
        # downloader.finish_summary(skipped_hists)
        # test
    else:
        LOG.warning(msgTxt.no_dl_task)
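For reference, a hedged sketch of the contract hist_handler.filter_songs appears to satisfy, splitting previously downloaded songs from the rest; load_history() and the song_id attribute are assumptions, not part of the example:

def filter_songs(total_songs):
    # Hypothetical sketch of the contract only; the real hist_handler
    # is not shown.
    history = load_history()  # assumed: maps song id -> history record
    skipped_songs = [s for s in total_songs if s.song_id in history]
    skipped_hists = [history[s.song_id] for s in skipped_songs]
    return skipped_songs, skipped_hists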
Example #5
    def post(self):
        target = self.get_argument('target', default='') or 'all'
        num = int(self.get_argument('num', default='') or 5)
        delay = int(self.get_argument('delay', default='') or 10)

        proxypool = ProxyPool()

        try:
            proxies = proxypool.get_many(target=target,
                                         num=num,
                                         maxscore=delay)
            num_ret = len(proxies)
            mtime = proxypool.get_mtime(target=target)

            proxylist = [proxy.decode('utf-8') for proxy in proxies]

            if str(target).upper() in proxypool.targets:
                status = 'success'
            else:
                status = 'success-partial'

            ret = {
                'status': status,
                'proxylist': {
                    'num': num_ret,
                    'mtime': mtime,
                    'target': target,
                    'proxies': proxylist,
                },
            }
        except Exception as e:
            ret = {
                'status': 'failure',
                'target': target,
                'err': str(e),
            }

        self.set_header('Content-Type', 'application/json')

        self.write(json.dumps(ret))
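A hedged client-side sketch for this handler, assuming it is mounted at /proxies on Tornado's conventional port 8888 (the routing table is not shown):

import requests

# Hypothetical request; the route and port are assumptions.
resp = requests.post('http://localhost:8888/proxies',
                     data={'target': 'all', 'num': 5, 'delay': 10})
print(resp.json()['proxylist']['proxies'])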
Example #6
    def __init__(self, max_working=20, common_gap=20,
                 urlindex_file="", proxies_file=None,
                 span_of_host=3,
                 worker_conf_file='xworkers.conf',
                 load_bad_url=None, logfile=''):
        self.proxypool = ProxyPool(common_gap, proxies_file)
        self.urlpool = UrlPool(urlindex_file,
                               load_bad_url=load_bad_url,
                               span_of_host=span_of_host,
                               is_good_link=self.is_good_link)
        self.max_working = max_working
        self.worker_conf_file = worker_conf_file
        self._workers = 0
        # HTTP headers can be customized in init_urlpool()
        self.headers = None
        self._http_exception_code = 900
        if logfile:
            self.logger = init_file_logger(logfile)
        else:
            self.logger = logging.getLogger('xcrawler')
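Examples #1 and #6 both rely on an init_file_logger helper that is not shown; a minimal sketch of what such a helper might look like, built on the standard logging module:

import logging

def init_file_logger(logfile):
    # Hypothetical helper; the project's real init_file_logger may differ.
    logger = logging.getLogger('xcrawler')
    handler = logging.FileHandler(logfile)
    handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    return logger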
Example #7
def shall_I_begin(url, is_file=False, is_hq=False, need_proxy_pool=False):
    #start terminate_watcher
    # Terminate_Watcher()
    global ppool
    if need_proxy_pool:
        print('initializing proxy pool')
        ppool = ProxyPool()
        print('proxy pool:[%d] initialized' % len(ppool.proxies))

    #xiami obj
    xiami_obj = xm.Xiami(config.XIAMI_LOGIN_EMAIL,
                         config.XIAMI_LOGIN_PASSWORD,
                         is_hq, proxies=ppool)
    #netease obj
    m163 = netease.Netease(is_hq, proxies=ppool)

    # used to generate the ranked artist-ID list txt; running it once
    # is enough, update it again whenever you remember to
    # artistIDs = getTopArtists(xiami_obj, url)

    getTopSongs(xiami_obj)
Example #8
# -*- coding: utf-8 -*-
import platform
import os
from daemon import daemonize
from proxypool import ProxyPool

if __name__ == '__main__':
    # run as a daemon process on Linux
    if "Linux" in platform.system():
        daemonize(os.getcwd(), '/dev/null',
                  '/tmp/daemon_stdout.log', '/tmp/daemon_error.log')
    redis_key_https = "merge_https_proxy"
    redis_key_http = "merge_http_proxy"
    redis_distinct_set_http = "merge_set_http"
    redis_distinct_set_https = "merge_set_https"
    ProxyPool(redis_key_http=redis_key_http,
              redis_key_https=redis_key_https,
              redis_distinct_set_http=redis_distinct_set_http,
              redis_distinct_set_https=redis_distinct_set_https).run()
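The daemonize import above comes from a project-local daemon module that is not shown; a classic double-fork sketch matching that four-argument call (workdir, stdin, stdout, stderr), offered as an assumption about its shape:

import os
import sys

def daemonize(workdir, stdin, stdout, stderr):
    # Hypothetical double-fork daemonizer; the real daemon.daemonize
    # may differ.
    if os.fork() > 0:
        sys.exit(0)        # exit first parent
    os.chdir(workdir)
    os.setsid()
    os.umask(0)
    if os.fork() > 0:
        sys.exit(0)        # exit second parent
    sys.stdout.flush()
    sys.stderr.flush()
    with open(stdin, 'rb') as si, open(stdout, 'ab') as so, \
         open(stderr, 'ab') as se:
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())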

Example #9
    def __init__(self):
        self.proxyPool = ProxyPool()