Ejemplo n.º 1
0
    def _web_validator(self, proxies):
        """Probe every proxy against a random website and record the outcome.

        One request per proxy is built against a random entry of ``WEBSITES``
        and the batch is handed to ``grequests.map``. Every proxy gets its
        ``detect_times`` counter bumped in ``COLLECTION`` (the document is
        created on demand via ``upsert=True``). Proxies whose response is
        truthy with status code 200 are also pushed onto
        ``self.proxy_validated_q``, counted in ``self.count_website_ok`` and
        tagged ``'normal'`` with ``alive_times`` incremented.

        ``proxies`` is iterated twice (once to build the requests, once to
        pair them with the responses), so it has to be re-iterable.
        """
        pending = (request(choice(WEBSITES), proxy=candidate, is_map=True)
                   for candidate in proxies)
        responses = grequests.map(pending,
                                  gtimeout=TIMEOUT,
                                  exception_handler=eh)

        for response, candidate in zip(responses, proxies):
            # Each probe counts as one detection attempt, whatever its result.
            attempt = {
                '$set': {'proxy': candidate},
                '$inc': {'detect_times': 1},
            }
            COLLECTION.update_one({'proxy': candidate}, attempt, upsert=True)

            if response and response.status_code == 200:
                self.proxy_validated_q.put(candidate)

                self.count_website_ok += 1
                self.logger.info('websites available: %s %s %s',
                                 self.count_website_ok, candidate,
                                 response.url)

                success = {
                    '$addToSet': {'type': 'normal'},
                    '$inc': {'alive_times': 1},
                }
                COLLECTION.update_one({'proxy': candidate}, success)
Ejemplo n.º 2
0
    def real_time(self):
        """Keep re-testing proxies taken from ``self.real_time_q``, forever.

        Every queue item is unpacked into ``(proxy, protocol)``. The proxy is
        used for one request to a fixed URL; a falsy response drops the proxy,
        otherwise the pair goes back on the queue for another round and the
        ``HIGH_ANONYMOUS`` document for that proxy is upserted with the
        protocol, an incremented ``alive_times`` and the current timestamp
        appended to ``alive_time_base``.

        This method never returns.
        """
        while True:
            proxy, protocol = self.real_time_q.get()

            self.logger.info('real time testing: %s', proxy)
            # TODO: websites http https
            resp = request('https://www.baidu.com/', proxy=proxy, timeout=5)

            if not resp:
                self.logger.info('dead: %s', proxy)
                continue

            self.logger.info('still alive: %s', proxy)
            # Put back the whole pair: the unpacking at the top of the loop
            # needs two values, so re-queueing only ``proxy`` would break the
            # next round for this item.
            self.real_time_q.put((proxy, protocol))
            HIGH_ANONYMOUS.update_one(
                {'proxy': proxy},
                {
                    '$set': {
                        'proxy': proxy,
                        'protocol': protocol
                    },
                    '$inc': {
                        'alive_times': 1
                    },
                    '$push': {
                        'alive_time_base': datetime.now()
                    }
                },
                upsert=True)
Ejemplo n.º 3
0
 def collect(self, concurrent_num=3):
     """Drain ``self.q`` in batches, fetching each batch via ``grequests.map``.

     Each queue element is indexed as ``element[0]`` (passed to ``request``)
     and ``element[1]`` (called with the response). Elements whose response
     is falsy are put back on the queue and logged; the loop ends once the
     queue reports empty.
     """
     while True:
         if self.q.empty():
             return
         batch = cut_queue(self.q, concurrent_num)
         pending = (request(item[0], is_map=True) for item in batch)
         responses = grequests.map(pending,
                                   gtimeout=10,
                                   exception_handler=eh)
         for item, response in zip(batch, responses):
             if response:
                 self.logger.info('fetched %s', response.url)
                 # Second slot of the element is the handler for the response.
                 item[1](response)
             else:
                 # Failed fetch: recycle the element for a later batch.
                 self.q.put(item)
                 self.logger.error('recycle %s', item[0])
Ejemplo n.º 4
0
 def collect(self, concurrent_num=10):
     """Fetch every URL in ``self.q`` through proxies from ``self.proxy_q``.

     Each round takes up to ``concurrent_num`` proxies and URLs, pairs them
     one-to-one and sends the batch with ``grequests.map``. A URL is put back
     on ``self.q`` when its response is falsy or the body yields fewer than
     five ``RE_HOST`` matches; in that case the proxy is not returned to
     ``self.proxy_q``. On success the proxy is returned to ``self.proxy_q``
     and the response is handed to ``self.parse_regex``.

     ``self.get_proxies()`` is called whenever the proxy queue holds fewer
     than ``concurrent_num`` entries.
     """
     while not self.q.empty():
         if self.proxy_q.qsize() < concurrent_num:
             self.get_proxies()
         proxies = list(cut_queue(self.proxy_q, concurrent_num))
         _urls = list(cut_queue(self.q, concurrent_num))

         # zip() stops at the shorter sequence, so anything beyond the common
         # length would be removed from its queue and never used. Hand those
         # leftovers back instead of dropping them.
         # NOTE(review): if no proxy ever becomes available the URLs keep
         # being re-queued and this loop keeps calling get_proxies().
         paired = min(len(_urls), len(proxies))
         for url in _urls[paired:]:
             self.q.put(url)
         for proxy in proxies[paired:]:
             self.proxy_q.put(proxy)
         _urls, proxies = _urls[:paired], proxies[:paired]

         reqs = (request(u, proxy=p, is_map=True)
                 for u, p in zip(_urls, proxies))
         resps = grequests.map(reqs, gtimeout=10)
         for url, resp, proxy in zip(_urls, resps, proxies):
             if not resp or len(RE_HOST.findall(resp.text)) < 5:
                 self.logger.error('recycle: %s', url)
                 self.q.put(url)
                 # requests.post(api_detect_url, data=json.dumps(proxy))
                 continue
             self.logger.info('fetched: %s', url)
             self.proxy_q.put(proxy)
             # requests.post(api_alive_url, data=json.dumps(proxy))
             self.parse_regex(resp)
Ejemplo n.º 5
0
    def _test_validator(self, proxy):
        """Probe ``proxy`` over HTTP and HTTPS and score it when it hides our IP.

        ``detect_times`` is incremented in ``COLLECTION`` first. Two requests
        are then sent, one to a random ``TESTSITES['http']`` entry through the
        proxy's ``http`` address and one to a random ``TESTSITES['https']``
        entry through its ``https`` address. A response is ignored when it is
        falsy, when ``RE_HOST`` finds nothing in its text, or when
        ``self.my_ip`` is among the matches.

        For each remaining response the ``(proxy mapping, scheme)`` pair is
        queued on ``self.real_time_q`` and the stored document gets a new
        ``score``, a timestamp, the ``'high'`` type, the scheme and an
        ``alive_times`` increment.
        """
        COLLECTION.update_one({'proxy': proxy}, {'$inc': {'detect_times': 1}})
        passed = 0
        http_only = {'http': proxy.get('http')}
        https_only = {'https': proxy.get('https')}
        # Position 0 is the HTTP probe; any later position is the HTTPS probe.
        probes = (
            request(choice(TESTSITES.get('http')),
                    proxy=http_only,
                    is_map=True),
            request(choice(TESTSITES.get('https')),
                    proxy=https_only,
                    is_map=True),
        )
        responses = grequests.map(probes, gtimeout=10, exception_handler=eh)
        for position, resp in enumerate(responses):
            if not resp:
                continue

            matches = RE_HOST.findall(resp.text)
            if not matches or self.my_ip in matches:
                continue

            if position > 0:
                p = (https_only, 'https')
            else:
                p = (http_only, 'http')

            # The single-scheme proxy mapping itself is passed as request()'s
            # first argument; a truthy result rejects the proxy.
            # NOTE(review): the intent of this check is not visible here.
            r = request(p[0], timeout=10)
            if r:
                continue

            self.real_time_q.put(p)

            self.logger.info('%s, %s', matches, proxy)

            passed += 1

            # When both probes pass, detect_times is bumped a second time.
            if passed == 2:
                COLLECTION.update_one({'proxy': proxy},
                                      {'$inc': {'detect_times': 1}})

            self.count_testsite_ok += 1

            record = COLLECTION.find_one({'proxy': proxy})
            this_id = record.get('_id')
            alive_times = record.get('alive_times') + 1
            detect_times = record.get('detect_times')
            # NOTE(review): the original note gives the range as [0, 1]; that
            # holds only while alive_times (after the +1) <= detect_times.
            score = alive_times * 2 / (alive_times + detect_times)
            self.logger.info('high anonymity: %s score: %s %s %s',
                             self.count_testsite_ok, score, p[0], resp.url)
            changes = {
                '$set': {'score': score},
                '$push': {'alive_time_base': datetime.now()},
                '$addToSet': {'type': 'high', 'protocol': p[1]},
                '$inc': {'alive_times': 1},
            }
            COLLECTION.update_one({'_id': ObjectId(this_id)}, changes)
Ejemplo n.º 6
0
 def get_my_ip(self):
     """Return the ``origin`` value reported by ``http://httpbin.org/ip``.

     Raises:
         ValueError: if the request yields a falsy response, or the decoded
             JSON body has no non-empty ``origin`` entry.
     """
     resp = request('http://httpbin.org/ip')
     if not resp:
         raise ValueError('cannot get my ip!')
     # Decode the body once and reuse it for both the log line and the
     # return value.
     my_ip = resp.json().get('origin')
     if not my_ip:
         # A reply without an address is as useless as no reply at all;
         # fail loudly instead of handing None to the caller.
         raise ValueError('cannot get my ip!')
     self.logger.info('my ip: %s', my_ip)
     return my_ip
Ejemplo n.º 7
0
 def _get_speciality(self):
     """Worker loop: fetch and parse every URL taken from ``self.speciality_q``.

     Runs forever. Each URL is fetched with ``utils.request`` and the result
     is passed, together with the URL, to ``self._parse_speciality``.
     """
     while True:
         url = self.speciality_q.get()
         try:
             resp = utils.request(url)
             self._parse_speciality(resp, url)
         finally:
             # Always pair get() with task_done(), even when fetching or
             # parsing raises; otherwise a join() on this queue would wait
             # forever for the unfinished item.
             self.speciality_q.task_done()
Ejemplo n.º 8
0
 def _get_homepage(self):
     """Fetch and parse every URL in ``self.homepage_q``, then wait on
     ``self.speciality_q``.

     Each URL is fetched with ``utils.request`` and handed, together with the
     URL, to ``self._parse_homepage``. Once the homepage queue reports empty,
     the method blocks in ``self.speciality_q.join()``.
     """
     while True:
         # NOTE(review): empty() followed by a blocking get() assumes no
         # other consumer drains this queue in between -- confirm.
         if self.homepage_q.empty():
             break
         page_url = self.homepage_q.get()
         page = utils.request(page_url)
         self._parse_homepage(page, page_url)
     self.speciality_q.join()