Exemple #1
0
    #         'spec.recover': True, 'sch.recover': False, 'score': 750,
    #         'detail.parser': False, 'thcnt': 4}
    #
    # ]
    proxies = []
    am = AccountManager()
    with open('proxy_r') as f:
        for l in f:
            proxies.append(l.strip())
    proxies.append(None)
    idx = [22]
    count = len(idx)
    name = '青海'
    ac = {
        'accounts': []
        , 'name': name
        , 'prefix': provinces[name]
    }
    acsa = am.get(name, 8)
    i = 0
    acs = []
    for a in acsa:
        acs.insert(0, a)
    for a in acs:
        a.proxy = None
        runner = FullJobRunner(
            gen_jobs_params([a.gen_run_param()], name, provinces[name], spec=False, detail=True, school=False,
                            spec_parser=False, detail_parser=False, spec_r=False,
                            sch_parser=False))
        runner.run()
Exemple #2
0
    #     {
    #         'accounts': [
    #             {'username': '******', 'proxy': '101.200.178.46:3128', 'password': '******', 'ua': ua[2]},
    #         ],
    #         'prefix': 'tj', 'sleep': 1.0, 'school': False, 'sch.parser': True, 'kldms': ['5', '1'], 'bkccs': ['2', '1'],
    #         'spec.parser': True, 'spec': True, 'detail': False, 'name': 'tianjin', 'detail.recover': True,
    #         'spec.recover': True, 'sch.recover': False, 'score': 750,
    #         'detail.parser': False, 'thcnt': 4}
    #
    # ]

    am = AccountManager()

    name = '宁夏'
    ac = {'accounts': [], 'name': name, 'prefix': provinces[name]}
    acs = am.get(name, 1)
    i = 0
    for a in acs:
        a.proxy = '192.168.1.39:3428'
        ac['accounts'].append(a.gen_run_param())
        i += 1
    run_fun(ac)
    name = '青海'
    ac = {'accounts': [], 'name': name, 'prefix': provinces[name]}
    acs = am.get(name, 1)
    i = 0
    for a in acs:
        a.proxy = '192.168.1.39:3428'
        ac['accounts'].append(a.gen_run_param())
        i += 1
    run_fun(ac)
Exemple #3
0
    #
    # ]
    # jobs = [
    #     {
    #         'accounts': [
    #             {'username': '******', 'proxy': '101.200.178.46:3128', 'password': '******', 'ua': ua[2]},
    #         ],
    #         'prefix': 'tj', 'sleep': 1.0, 'school': False, 'sch.parser': True, 'kldms': ['5', '1'], 'bkccs': ['2', '1'],
    #         'spec.parser': True, 'spec': True, 'detail': False, 'name': 'tianjin', 'detail.recover': True,
    #         'spec.recover': True, 'sch.recover': False, 'score': 750,
    #         'detail.parser': False, 'thcnt': 4}
    #
    # ]
    proxies = []
    am = AccountManager()
    with open('proxy_r') as f:
        for l in f:
            proxies.append(l.strip())
    idx = [1, 2, 3, 4, 5, 6, 7]

    count = len(idx)
    name = '广东'
    ac = {'accounts': [], 'name': name, 'prefix': provinces[name]}
    acs = am.get(name, count)
    i = 0
    for a in acs:
        a.proxy = proxies[idx[i]]
        ac['accounts'].append(a.gen_run_param())
        i += 1
    run_fun(ac)
Exemple #4
0
    #     a.proxy = proxies[idx[i]]
    #     ac['accounts'].append(a.gen_run_param())
    #     i += 1
    # run_fun3(ac)

    proxies = []
    am = AccountManager()
    with open('proxy_r') as f:
        for l in f:
            proxies.append(l.strip())
    proxies.append(None)
    idx = [5, 6]
    count = len(idx)
    name = '贵州'
    ac = {'accounts': [], 'name': name, 'prefix': provinces[name]}
    acsa = am.get(name, count)
    i = 0
    acs = []
    for a in acsa:
        acs.insert(0, a)
    for a in acs:
        a.proxy = proxies[idx[0]]
        ac['accounts'].append(a.gen_run_param())
    runner = FullJobRunner(
        gen_jobs_params(ac['accounts'],
                        name,
                        provinces[name],
                        spec=False,
                        detail=True,
                        school=False,
                        spec_parser=False,
Exemple #5
0
    #         ],
    #         'prefix': 'tj', 'sleep': 1.0, 'school': False, 'sch.parser': True, 'kldms': ['5', '1'], 'bkccs': ['2', '1'],
    #         'spec.parser': True, 'spec': True, 'detail': False, 'name': 'tianjin', 'detail.recover': True,
    #         'spec.recover': True, 'sch.recover': False, 'score': 750,
    #         'detail.parser': False, 'thcnt': 4}
    #
    # ]
    proxies = []
    am = AccountManager()
    with open('proxy_r') as f:
        for l in f:
            proxies.append(l.strip())
    idx = [16]
    count = len(idx)

    acsa = am.get('天津', 8)
    acs = []
    for a in acsa:
        acs.insert(0, a)
    name = '天津'
    for a in acs:
        a.proxy = '192.168.1.39:3428'
        runner = FullJobRunner(
            gen_jobs_params([a.gen_run_param()],
                            name,
                            provinces[name],
                            spec=False,
                            detail=True,
                            school=False,
                            spec_parser=False,
                            detail_parser=False,
Exemple #6
0
    #     a.proxy = proxies[idx[i]]
    #     ac['accounts'].append(a.gen_run_param())
    #     i += 1
    # run_fun3(ac)

    proxies = []
    am = AccountManager()
    with open('proxy_r') as f:
        for l in f:
            proxies.append(l.strip())
    proxies.append(None)
    idx = [13]
    count = len(idx)
    name = '甘肃'
    ac = {'accounts': [], 'name': name, 'prefix': provinces[name]}
    acs = am.get(name, 8)
    i = 0

    for a in acs:
        a.proxy = proxies[idx[0]]
        runner = FullJobRunner(
            gen_jobs_params([a.gen_run_param()],
                            name,
                            provinces[name],
                            spec=False,
                            detail=True,
                            school=False,
                            spec_parser=False,
                            detail_parser=False,
                            spec_r=False,
                            detail_r=False,
Exemple #7
0
class JobManager():
    """Run one crawl thread per job and manage proxies for the job accounts.

    ``jobs`` is a list of dicts.  The code reads ``job['name']`` (used as a
    key into the module-level ``provinces`` mapping) and ``job['count']``
    (passed as the second argument to ``AccountManager.get``).
    """

    def __init__(self, jobs, proxy='proxy_r'):
        """Create the account/proxy helpers and load ``proxy`` into both.

        ``proxy`` is handed to ``ProxyQueue.load`` and ``ProxyManager.load``
        (presumably a proxy list file name -- confirm against those classes).
        """
        self.ac = AccountManager()
        self.pm = ProxyManager()
        self.pq = ProxyQueue()
        self.pq.load(proxy)
        self.pm.load(proxy)
        self.jobs = jobs
        self.threads = []
        self.running = False

    def init(self):
        """Assign a proxy and a random user agent to the accounts of each job.

        First pass: the first two accounts of every job get
        ``get_good_proxy(7)``.  Second pass: any of the ``job['count']``
        accounts still without a proxy get ``get_good_proxy(1)``.
        """
        random.seed(int(time.time()))
        for job in self.jobs:
            for ac in self.ac.get(job['name'], 2):
                ac.proxy = self.pm.get_good_proxy(7)
                # random.choice is uniform; the former
                # ``randint(0, len(ua)) % len(ua)`` picked index 0 twice as
                # often because randint includes its upper bound.
                ac.user_agent = random.choice(ua)
        for job in self.jobs:
            for ac in self.ac.get(job['name'], job['count']):
                if ac.proxy is None:
                    ac.proxy = self.pm.get_good_proxy(1)
                    ac.user_agent = random.choice(ua)

    def run(self):
        """Initialise accounts, run ``runner`` for every job, then save.

        Does nothing when a run is already in progress.
        """
        if self.running:
            return
        self.running = True
        try:
            self.init()
            # Start from a fresh list: Thread objects cannot be started twice.
            self.threads = [
                threading.Thread(target=self.runner, args=(tid, ))
                for tid in range(len(self.jobs))
            ]
            for t in self.threads:
                t.start()
            time.sleep(2)
            for t in self.threads:
                t.join()
            self.ac.save()
        finally:
            self.running = False

    def random_run(self):
        """Run ``rand_runner`` for every job (started 1s apart), then save.

        Does nothing when a run is already in progress.
        """
        if self.running:
            return
        self.running = True
        try:
            self.threads = []
            for tid in range(len(self.jobs)):
                t = threading.Thread(target=self.rand_runner, args=(tid, ))
                # The daemon flag must be set before start(); setting it on
                # a live thread raises RuntimeError.
                t.daemon = True
                t.start()
                time.sleep(1)
                self.threads.append(t)
            time.sleep(2)
            for t in self.threads:
                t.join()
            self.ac.save()
        finally:
            self.running = False

    def _job_params(self, job):
        """Build the parameter dict (empty account list) for ``job``."""
        return {
            'accounts': [],
            'name': job['name'],
            'prefix': provinces[job['name']]
        }

    def _crawl(self, ac):
        """Call ``smart_full_job`` repeatedly until it returns a level >= 3."""
        print('%s start ' % ac['name'])
        level = 0
        while level < 3:
            if ac['name'] != '海南':
                level = smart_full_job(ac['accounts'],
                                       ac['name'],
                                       ac['prefix'],
                                       level=level)
            else:
                # NOTE(review): 900 is passed as the 4th positional argument
                # only for this province; presumably a score limit -- confirm
                # against smart_full_job's signature.
                level = smart_full_job(ac['accounts'],
                                       ac['name'],
                                       ac['prefix'],
                                       900,
                                       level=level)

    def rand_runner(self, tid):
        """Crawl job ``tid`` with proxies leased from the proxy queue.

        Nothing happens when the account manager returns no accounts.  Every
        leased proxy is released again, even if the crawl raises.
        """
        job = self.jobs[tid]
        ac = self._job_params(job)
        acs = self.ac.get(job['name'], job['count'])
        if len(acs) == 0:
            return
        leased = []
        try:
            for a in acs:
                a.proxy = self.pq.get_good_proxy()
                leased.append(a)
                ac['accounts'].append(a.gen_run_param())
            self._crawl(ac)
        finally:
            # Only release what was actually leased from the queue above.
            for a in leased:
                self.pq.release(a.proxy)
                a.proxy = None

    def runner(self, tid):
        """Crawl job ``tid`` using the proxies already set on its accounts."""
        job = self.jobs[tid]
        ac = self._job_params(job)
        for a in self.ac.get(job['name'], job['count']):
            ac['accounts'].append(a.gen_run_param())
        self._crawl(ac)
Exemple #8
0
    #         'prefix': 'tj', 'sleep': 1.0, 'school': False, 'sch.parser': True, 'kldms': ['5', '1'], 'bkccs': ['2', '1'],
    #         'spec.parser': True, 'spec': True, 'detail': False, 'name': 'tianjin', 'detail.recover': True,
    #         'spec.recover': True, 'sch.recover': False, 'score': 750,
    #         'detail.parser': False, 'thcnt': 4}
    #
    # ]
    # proxies = []
    am = AccountManager()
    # with open('proxy_r') as f:
    #     for l in f:
    #         proxies.append(l.strip())
    # idx = [28]
    # count = len(idx)

    name = '江苏'
    acsa = am.get(name, 3)
    acs = []
    for a in acsa:
        acs.insert(0, a)
    for a in acs:
        a.proxy = '101.226.249.237:80'
        runner = FullJobRunner(
            gen_jobs_params([a.gen_run_param()],
                            name,
                            provinces[name],
                            spec=False,
                            detail=True,
                            school=False,
                            spec_parser=False,
                            detail_parser=False,
                            spec_r=False,
Exemple #9
0
class QueueJobManager():
    """Process crawl jobs from a shared queue with a fixed set of workers.

    Each worker thread is bound to one proxy.  A job is run through
    ``recruit_jobs`` ``times`` times (being re-queued after each pass) and is
    then finished off with ``recheck``.
    """

    # Per-province value stored under ``ac['score']``; others use 750.
    _SCORES = {'海南': 900, '上海': 600, '江苏': 500}
    _DEFAULT_SCORE = 750

    def __init__(self, jobs, thcnt=2, proxy='proxy_s', times=3):
        """Set up helpers and bookkeeping.

        jobs   -- list of dicts; ``job['name']`` and ``job['count']`` are read
        thcnt  -- maximum number of worker threads
        proxy  -- handed to ``ProxyManager.load``
        times  -- number of ``recruit_jobs`` passes per job before rechecking
        """
        self.ac = AccountManager()
        self.pq = ProxyManager()
        self.pq.load(proxy)
        self.jobs = jobs
        self.threads = []
        self.running = False
        self.thread_cnt = thcnt
        self.times = times
        self.job_queue = Queue.Queue()
        self.done_job = 0
        # Number of jobs actually put on the queue by distpatch(); may be
        # smaller than len(jobs) because jobs without accounts are skipped.
        self.queued_jobs = 0
        self.job_lock = threading.RLock()

    def distpatch(self):
        """Queue a parameter dict for every job that has at least one account."""
        for job in self.jobs:
            name = job['name']
            ac = {
                'accounts': [],
                'name': name,
                'prefix': provinces[name],
                'score': self._SCORES.get(name, self._DEFAULT_SCORE),
                'times': 0,
            }
            acs = self.ac.get(name, job['count'])
            if len(acs) > 0:
                for a in acs:
                    ac['accounts'].append(a.gen_run_param())
                self.job_queue.put(ac)
                with self.job_lock:
                    self.queued_jobs += 1

    def run(self):
        """Dispatch jobs, run the workers to completion and save accounts.

        Up to ``thread_cnt`` workers are created; fewer if the proxy manager
        runs out of proxies (``get_good_proxy`` returning None).  Does nothing
        when a run is already in progress.
        """
        if self.running:
            return
        self.running = True
        try:
            self.distpatch()
            for tid in range(self.thread_cnt):
                proxy = self.pq.get_good_proxy()
                if proxy is None:
                    break
                t = threading.Thread(target=self.runner, args=(proxy,))
                self.threads.append(t)
            for t in self.threads:
                t.start()
            time.sleep(2)
            for t in self.threads:
                t.join()
            self.ac.save()
        finally:
            self.threads = []
            # Clear the flag so the manager can be run again.
            self.running = False

    def _work(self, ac):
        """Run one pass over ``ac``; return True once the job is finished."""
        times = ac.get('times', 0)
        if times < self.times:
            print('%s start %d' % (ac['name'], times))
            recruit_jobs(ac)
            ac['times'] = times + 1
            return False
        print('%s start check crawling' % ac['name'])
        level = 0
        while level < 3:
            level = recheck(ac, level=level)
        return True

    def runner(self, proxy):
        """Worker loop: process queued jobs through ``proxy`` until all are done.

        Every worker exits once ``done_job`` reaches the number of queued
        jobs.  Previously only the thread finishing the last job left the
        loop and the others blocked forever on the queue.
        """
        while True:
            with self.job_lock:
                if self.done_job >= self.queued_jobs:
                    return
            try:
                # Time out so the completion check above is re-evaluated
                # while other workers still hold the remaining jobs.
                ac = self.job_queue.get(timeout=1)
            except Queue.Empty:
                continue
            ac['proxy'] = proxy
            try:
                finished = self._work(ac)
            except BaseException:
                # Count the failed job as done so the other workers do not
                # wait for it forever, then let the error surface.
                with self.job_lock:
                    self.done_job += 1
                raise
            if finished:
                with self.job_lock:
                    self.done_job += 1
            else:
                ac['proxy'] = None
                self.job_queue.put(ac)
Exemple #10
0
class JobManager():
    """Run ``recruit_jobs`` in one thread per job, each job using one proxy.

    ``jobs`` is a list of dicts.  The code reads ``job['name']`` (a key of the
    module-level ``provinces`` mapping) and ``job['count']`` (passed as the
    second argument to ``AccountManager.get``); ``init`` adds ``job['proxy']``.
    """

    # Per-province value stored under ``ac['score']``; others use 750.
    _SCORES = {'海南': 900, '上海': 600, '江苏': 500}
    _DEFAULT_SCORE = 750

    def __init__(self, jobs, proxy='proxy_s'):
        """Create the account/proxy helpers and load ``proxy`` into both."""
        self.ac = AccountManager()
        self.pm = ProxyManager()
        self.pq = ProxyQueue()
        self.pq.load(proxy)
        self.pm.load(proxy)
        self.jobs = jobs
        self.threads = []
        self.running = False
        self.job_queue = Queue.Queue()

    def init(self):
        """Pick a proxy per job and a random user agent per account."""
        random.seed(int(time.time()))
        for job in self.jobs:
            job['proxy'] = self.pm.get_good_proxy(1)
            for ac in self.ac.get(job['name'], job['count']):
                # random.choice is uniform; the former
                # ``randint(0, len(ua)) % len(ua)`` picked index 0 twice as
                # often because randint includes its upper bound.
                ac.user_agent = random.choice(ua)

    def run(self):
        """Initialise jobs, run ``runner`` for every job, then save accounts.

        Does nothing when a run is already in progress.
        """
        if self.running:
            return
        self.running = True
        try:
            self.init()
            # Start from a fresh list: Thread objects cannot be started twice.
            self.threads = [
                threading.Thread(target=self.runner, args=(tid,))
                for tid in range(len(self.jobs))
            ]
            for t in self.threads:
                t.start()
            time.sleep(2)
            for t in self.threads:
                t.join()
            self.ac.save()
        finally:
            self.running = False

    def random_run(self):
        """Run ``rand_runner`` for every job (started 1s apart), then save.

        Does nothing when a run is already in progress.
        """
        if self.running:
            return
        self.running = True
        try:
            self.threads = []
            for tid in range(len(self.jobs)):
                t = threading.Thread(target=self.rand_runner, args=(tid,))
                t.start()
                time.sleep(1)
                self.threads.append(t)
            time.sleep(2)
            for t in self.threads:
                t.join()
            self.ac.save()
        finally:
            self.running = False

    def _job_params(self, job, proxy):
        """Build the ``recruit_jobs`` parameter dict for ``job``."""
        name = job['name']
        return {
            'accounts': [],
            'name': name,
            'prefix': provinces[name],
            'proxy': proxy,
            'score': self._SCORES.get(name, self._DEFAULT_SCORE),
        }

    def rand_runner(self, tid):
        """Run job ``tid`` through a proxy leased from the proxy queue.

        Nothing happens when the account manager returns no accounts.  The
        leased proxy is stored in ``ac['proxy']`` (it used to be acquired but
        never handed to the job) and is always released afterwards.
        """
        job = self.jobs[tid]
        ac = self._job_params(job, None)
        acs = self.ac.get(job['name'], job['count'])
        if len(acs) == 0:
            return
        for a in acs:
            ac['accounts'].append(a.gen_run_param())
        proxy = self.pq.get_good_proxy()
        try:
            ac['proxy'] = proxy
            print('%s start ' % ac['name'])
            recruit_jobs(ac)
        finally:
            self.pq.release(proxy)

    def runner(self, tid):
        """Run job ``tid`` through the proxy chosen for it by ``init``."""
        job = self.jobs[tid]
        ac = self._job_params(job, job['proxy'])
        for a in self.ac.get(job['name'], job['count']):
            ac['accounts'].append(a.gen_run_param())
        print('%s start ' % ac['name'])
        recruit_jobs(ac)