Example #1
    def save(self):
        ## create diff
        Proxy.objects.filter(delegating=self.user).delete()
        newproxy = Proxy(delegating=self.user, isdefault=True)
        newproxy.save()
        if 'main_proxy' in self.data.keys():
            for user in self.data.getlist('main_proxy'):
                user_object = CustomUser.objects.get(pk=user)
                newproxy.delegates.add(user_object)
        newproxy.save()

        for count in xrange(int(self.data["tagfieldcount"])):
            if "side_proxy%d" % count in self.data.keys() and "side_proxy_tags%d" % count in self.data.keys():
                newproxy = Proxy(delegating=self.user)
                newproxy.save()
                for user in self.data.getlist("side_proxy%d" % count):
                    user_object = CustomUser.objects.get(pk=user)
                    newproxy.delegates.add(user_object)

                for tag in self.data.getlist("side_proxy_tags%d" % count):
                    tag_object = Tag.objects.get(pk=tag)
                    newproxy.tags.add(tag_object)

                newproxy.save()
        return
Example #2
    def save(self):
        ### first disable all existing proxies (suboptimal, but easiest) ###
        for proxy in Proxy.objects.filter(delegating=self.user):
            proxy.disable()

        ### add new proxy for each form element ###
        newproxy = Proxy(delegating=self.user, isdefault=True)
        newproxy.save()
        if 'main_proxy' in self.data.keys():
            for user in self.data.getlist('main_proxy'):
                user_object = CustomUser.objects.get(pk=user)
                newproxy.delegates.add(user_object)
        newproxy.save()
        
        for count in xrange(int(self.data["tagfieldcount"])):
            if "side_proxy%d" % count in self.data.keys() and "side_proxy_tags%d" % count in self.data.keys():
                newproxy = Proxy(delegating=self.user)
                newproxy.save()
                for user in self.data.getlist("side_proxy%d"%count):
                    user_object = CustomUser.objects.get(pk=user)
                    newproxy.delegates.add(user_object)
            
                for tag in self.data.getlist("side_proxy_tags%d"%count):
                    tag_object = Tag.objects.get(pk=tag)
                    newproxy.tags.add(tag_object)
                
                newproxy.save()
        return
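Both save() variants above rebuild the user's proxies from the submitted form data in self.data, typically a Django QueryDict whose getlist() returns every value posted under one field name. Purely as an illustration (only the field names main_proxy, side_proxyN, side_proxy_tagsN and tagfieldcount come from the code; the primary-key values are invented), the expected multi-valued shape looks roughly like this:

from urllib.parse import parse_qs

# one default proxy with two delegates, plus one tag-scoped proxy (index 0)
data = parse_qs('main_proxy=3&main_proxy=7'
                '&tagfieldcount=1'
                '&side_proxy0=5&side_proxy_tags0=2')
print(data['main_proxy'])     # ['3', '7'] -- QueryDict.getlist behaves the same way
print(data['tagfieldcount'])  # ['1']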
Example #3
        async def wrap(self, *args, **kwargs):
            good_proxies_count = await db.count(
                Proxy.select().where(Proxy.number_of_bad_checks == 0))

            bad_proxies_count = await db.count(Proxy.select().where(
                Proxy.number_of_bad_checks > 0,
                Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD,
            ))

            dead_proxies_count = await db.count(Proxy.select().where(
                Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD,
                Proxy.number_of_bad_checks <
                settings.DO_NOT_CHECK_ON_N_BAD_CHECKS,
            ))

            not_checked_proxies_count = await db.count(Proxy.select().where(
                Proxy.number_of_bad_checks >=
                settings.DO_NOT_CHECK_ON_N_BAD_CHECKS, ))

            response = {
                "bad_proxies_count": bad_proxies_count,
                "good_proxies_count": good_proxies_count,
                "dead_proxies_count": dead_proxies_count,
                "not_checked_proxies_count": not_checked_proxies_count,
            }

            response.update(await func(self, *args, **kwargs))

            return response
Example #4
async def number_of_proxies_to_process(timestamp):
    good_proxies_count = await db.count(
        Proxy.select().where(
            Proxy.number_of_bad_checks == 0,
            Proxy.next_check_time < timestamp,
        )
    )

    bad_proxies_count = await db.count(
        Proxy.select().where(
            Proxy.number_of_bad_checks > 0,
            Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD,
            Proxy.next_check_time < timestamp,
        )
    )

    dead_proxies_count = await db.count(
        Proxy.select().where(
            Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD,
            Proxy.number_of_bad_checks < settings.DO_NOT_CHECK_ON_N_BAD_CHECKS,
            Proxy.next_check_time < timestamp,
        )
    )

    await db.create(
        NumberOfProxiesToProcess,
        timestamp=timestamp,
        good_proxies=good_proxies_count,
        bad_proxies=bad_proxies_count,
        dead_proxies=dead_proxies_count,
    )
Example #5
    async def process_proxies(self):
        while True:
            await asyncio.sleep(0.01)
            try:
                # check good proxies
                proxies = await db.execute(
                    Proxy.select().where(
                        Proxy.number_of_bad_checks == 0,
                        Proxy.next_check_time < time.time(),
                    ).order_by(Proxy.next_check_time).limit(settings.NUMBER_OF_CONCURRENT_TASKS)
                )
                if proxies:
                    self.good_proxies_are_processed = False

                await self.add_proxies_to_queue(proxies)

                if proxies:
                    continue

                self.good_proxies_are_processed = True

                # check bad proxies
                proxies = await db.execute(
                    Proxy.select().where(
                        Proxy.number_of_bad_checks > 0,
                        Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD,
                        Proxy.next_check_time < time.time(),
                    ).order_by(Proxy.next_check_time).limit(settings.NUMBER_OF_CONCURRENT_TASKS)
                )

                await self.add_proxies_to_queue(proxies)

                if proxies:
                    continue

                # check dead proxies
                proxies = await db.execute(
                    Proxy.select().where(
                        Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD,
                        Proxy.number_of_bad_checks < settings.DO_NOT_CHECK_ON_N_BAD_CHECKS,
                        Proxy.next_check_time < time.time(),
                    ).order_by(Proxy.next_check_time).limit(settings.NUMBER_OF_CONCURRENT_TASKS)
                )

                await self.add_proxies_to_queue(proxies)
            except KeyboardInterrupt as ex:
                raise ex
            except BaseException as ex:
                self.logger.exception(ex)
                if settings.DEBUG:
                    raise ex

                await asyncio.sleep(settings.SLEEP_AFTER_ERROR_PERIOD)
Example #6
def save_proxies(url):
    try:
        r = fetch(url)
    except requests.exceptions.RequestException:
        return False
    addresses = re.findall(PROXY_REGEX, r.text)
    for address in addresses:
        proxy = Proxy(address=address)
        try:
            proxy.save()
        except NotUniqueError:
            pass
Example #7
def save_proxies(url):
    try:
        r = fetch(url)
    except requests.exceptions.RequestException:
        return False
    addresses = re.findall(PROXY_REGEX, r.text)
    for address in addresses:
        proxy = Proxy(address=address)
        try:
            proxy.save()
        except NotUniqueError:
            pass
Example #8
def add_proxy(request):
    urls = request.body
    print urls
    urlList = urls.split(',')
    rst = []
    for url in urlList:
        try:
            p = Proxy(url=url, rate=0)
            p.save()
        except Exception as e:
            rst.append(e.message)
            continue
    return HttpResponse(reduce(lambda x, y: x + ',' + y, rst))
Example #9
def add_proxy(request):
    urls = request.body
    print urls
    urlList = urls.split(',')
    rst = []
    for url in urlList:
        try:
            p = Proxy(url=url, rate=0)
            p.save()
        except Exception as e:
            rst.append(e.message)
            continue
    return HttpResponse(reduce(lambda x, y: x + ',' + y, rst))
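The add_proxy view in Examples #8 and #9 is Python 2 code: request.body is a str, e.message is Python-2-only, and the reduce() call raises on an empty list. A rough Python 3 sketch under those assumptions (the Proxy model and URL routing come from the example and are not defined here):

from django.http import HttpResponse

def add_proxy(request):
    urls = request.body.decode('utf-8')    # request.body is bytes in Python 3
    errors = []
    for url in urls.split(','):
        try:
            Proxy(url=url, rate=0).save()  # Proxy model as in the example above
        except Exception as e:
            errors.append(str(e))
    return HttpResponse(','.join(errors))  # join handles the empty-list case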
Example #10
    def load(self) -> list:
        ls = []
        if self._num is None:
            return ls

        if self._context and self._context.logger:
            self._context.logger.info('SixSixIPProxySpider: loading proxy list.')

        url = SixSixIPProxySpider._POOL_URL.format(self._num)
        reg = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)(?=<br />)')
        try:
            res = requests.get(url, proxies=self._sys_proxy, timeout=self._timeout)
            for match in reg.finditer(res.text):
                try:
                    for protocol in ('http', 'https'):
                        proxy = Proxy()
                        proxy.ip = match.group(1)
                        proxy.port = match.group(2)
                        proxy.protocol = protocol
                        proxy.proxy_url = self.proxy_url(proxy.ip, proxy.port, proxy.protocol)
                        proxy.collect_time = Datetime.now()
                        proxy.local = Config.local
                        ls.append(proxy)
                except:
                    pass
            return ls
        except:
            if self._context and self._context.logger:
                self._context.logger.exception('SixSixIPProxySpider: Failed to load proxy list.')
            raise
Example #11
    def load(self) -> list:
        ls = []

        if self._context and self._context.logger:
            self._context.logger.info('FatezeroProxySpider: loading proxy list.')
        try:
            res = requests.get(FatezeroProxySpider._POOL_URL, proxies=self._sys_proxy, timeout=self._timeout)
            for text in res.text.split('\n'):
                try:
                    p = json.loads(text, encoding='utf-8')
                    proxy = Proxy()
                    proxy.ip = p['host']
                    proxy.port = p['port']
                    proxy.protocol = p['type']
                    proxy.proxy_url = self.proxy_url(proxy.ip, proxy.port, proxy.protocol)
                    proxy.collect_time = Datetime.now()
                    proxy.local = Config.local
                    ls.append(proxy)
                except:
                    pass
            if self._num is None:
                return ls
            else:
                return ls[:self._num]
        except:
            if self._context and self._context.logger:
                self._context.logger.exception('FatezeroProxySpider: Failed to load proxy list.')
            raise
Example #12
def add_proxy(request):
    response = []  # avoid UnboundLocalError on non-POST requests
    if request.method == 'POST':
        port = request.POST.get('port', '0')
        destination = request.POST.get('destination', 'https://empty')
        delay = request.POST.get('delay', '0')
        p = Proxy(delay=delay,
                  port=port,
                  destination=destination,
                  started=False,
                  pid=0)
        p.save()
        new_id = p.id
        proxy = Proxy.objects.filter(id=new_id).values()
        response = [{"message": "proxy was started", "proxy": list(proxy)[0]}]
    return JsonResponse(response, safe=False)
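For reference, the view above expects an ordinary form-encoded POST. A client call might look roughly like the sketch below; the host, route and field values are placeholders, not part of the example:

import requests

resp = requests.post(
    'http://127.0.0.1:8000/add_proxy/',   # placeholder URL for this view
    data={'port': '8081', 'destination': 'https://example.org', 'delay': '0'},
)
print(resp.json())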
Example #13
def save_search_result(p, queue, retry=0):
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, p)

    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError):
        sleep(0.1)
        retry += 1
        if retry > 5:
            queue.put(url)
            raise GreenletExit()
        try:
            # use a different name so the page number argument p is not clobbered
            bad_proxy = Proxy.objects.get(address=proxy)
            if bad_proxy:
                bad_proxy.delete()
        except DoesNotExist:
            pass

        return save_search_result(p, queue, retry)
    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # this proxy has been blocked; switch to another one
        sleep(0.1)
        retry += 1
        if retry > 5:
            queue.put(url)
            raise GreenletExit()
        return save_search_result(p, queue, retry)
    articles = results.find_all(
        'div', lambda x: 'wx-rb' in x)
    for article in articles:
        save_article(article)
Example #14
def addProxies(s):
    s = s.splitlines()
    for i in s:
        try:
            Proxy(ip_and_port=i, proxy_type='http').save()
        except:
            pass
Example #15
    async def get_proxies_html(self, request):
        proxies = await db.execute(Proxy.select().where(
            Proxy.number_of_bad_checks == 0).order_by(Proxy.response_time))
        proxies = list(proxies)
        current_timestamp = time.time()

        return {
            "proxies": [{
                "address": proxy.address,
                "response_time": proxy.response_time / 1000
                    if proxy.response_time is not None else None,
                "uptime": datetime.timedelta(
                    seconds=int(current_timestamp - proxy.uptime))
                    if proxy.uptime is not None else None,
                "bad_uptime": datetime.timedelta(
                    seconds=int(current_timestamp - proxy.bad_uptime))
                    if proxy.bad_uptime is not None else None,
                "last_check_time": proxy.last_check_time,
                "checking_period": proxy.checking_period,
                "number_of_bad_checks": proxy.number_of_bad_checks,
                "bad_proxy": proxy.bad_proxy,
                "white_ipv4": proxy.white_ipv4,
                "location": proxy.location,
            } for proxy in proxies]
        }
Example #16
    async def process_proxy(self, raw_protocol: int, auth_data: str,
                            domain: str, port: int, collector_id):
        async with self.proxies_semaphore:
            self.logger.debug(
                "start processing proxy {}://{}@{}:{} with collector id {}".
                format(raw_protocol, auth_data, domain, port, collector_id))

            if auth_data is None:
                auth_data = ""

            proxy_url = "{}://".format(Proxy.PROTOCOLS[raw_protocol])
            if auth_data:
                proxy_url += auth_data + "@"

            proxy_url += domain + ":" + str(port)

            start_checking_time = time.time()
            check_result, checker_additional_info = await proxy_utils.check_proxy(
                proxy_url)
            end_checking_time = time.time()

            if check_result:
                self.logger.debug("proxy {0} works".format(proxy_url))
                await self.create_or_update_proxy(
                    raw_protocol,
                    auth_data,
                    domain,
                    port,
                    start_checking_time,
                    end_checking_time,
                    checker_additional_info,
                )
            else:
                self.logger.debug("proxy {0} doesn't work".format(proxy_url))
                try:
                    proxy = await db.get(Proxy.select().where(
                        Proxy.raw_protocol == raw_protocol,
                        Proxy.auth_data == auth_data,
                        Proxy.domain == domain,
                        Proxy.port == port,
                    ))

                    proxy.last_check_time = int(time.time())
                    proxy.next_check_time = (proxy.last_check_time +
                                             proxy.checking_period)
                    proxy.number_of_bad_checks += 1
                    proxy.uptime = int(time.time())

                    if proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD:
                        proxy.bad_uptime = int(time.time())

                    if (proxy.number_of_bad_checks ==
                            settings.DO_NOT_CHECK_ON_N_BAD_CHECKS):
                        self.logger.debug(
                            "proxy {} isn't checked anymore".format(
                                proxy.to_url()))

                    await db.update(proxy)
                except Proxy.DoesNotExist:
                    pass
Example #17
def save_search_result(p, queue, retry=0):
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, p)

    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError):
        sleep(0.1)
        retry += 1
        if retry > 5:
            queue.put(url)
            raise GreenletExit()
        try:
            # use a different name so the page number argument p is not clobbered
            bad_proxy = Proxy.objects.get(address=proxy)
            if bad_proxy:
                bad_proxy.delete()
        except DoesNotExist:
            pass

        return save_search_result(p, queue, retry)
    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # this proxy has been blocked; switch to another one
        sleep(0.1)
        retry += 1
        if retry > 5:
            queue.put(url)
            raise GreenletExit()
        return save_search_result(p, queue, retry)
    articles = results.find_all('div', lambda x: 'wx-rb' in x)
    for article in articles:
        save_article(article)
Example #18
    async def get_best_http_proxy(self, request):
        proxy_address = (await db.get(Proxy.select().where(
            Proxy.number_of_bad_checks == 0,
            Proxy.raw_protocol == Proxy.PROTOCOLS.index("http"),
        ).order_by(Proxy.response_time))).address

        return web.Response(text=proxy_address)
Example #19
    async def get_proxies_for_id(self, data: dict) -> dict:
        validate_dict_must_have_key(data, 'id')
        validate_dict_must_have_key(data, 'number')
        number = int(data['number'])
        validate_uint(number)

        # TODO: validate id
        results = []

        for item in await db.execute(
                Proxy.raw(
                    f'SELECT * FROM working_proxies TABLESAMPLE SYSTEM_ROWS({number});'
                )):
            obj = {}

            for field_name in settings.PROXY_PROVIDER_SERVER_API_CONFIG_FETCH_CONFIG[
                    'fields']:
                obj[field_name] = getattr(item, field_name)

            results.append(obj)

        return {
            "number_of_results": len(results),
            "results": results,
        }
Example #20
    def load_proxylist(self):
        if not self.scrappers:
            return

        proxylist = set()

        for scrapper in self.scrappers:
            try:
                proxylist.update(scrapper.scrap())
            except Exception as e:
                log.exception('%s proxy scrapper failed: %s',
                              type(scrapper).__name__, e)

        log.info('%s scrapped a total of %d proxies.',
                 type(self).__name__, len(proxylist))
        proxylist = self.__parse_proxylist(proxylist).values()
        Proxy.insert_new(proxylist)
Example #21
async def create_proxy_count_item(timestamp):
    good_proxies_count = await db.count(
        Proxy.select().where(Proxy.number_of_bad_checks == 0))
    bad_proxies_count = await db.count(Proxy.select().where(
        Proxy.number_of_bad_checks > 0,
        Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD,
    ))
    dead_proxies_count = await db.count(Proxy.select().where(
        Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD))

    await db.create(
        ProxyCountItem,
        timestamp=timestamp,
        good_proxies_count=good_proxies_count,
        bad_proxies_count=bad_proxies_count,
        dead_proxies_count=dead_proxies_count,
    )
Example #22
def filte(content):
    soup = BeautifulSoup(content)
    proxy_list_info = soup.findAll('tr')
    proxy_list = []
    for proxy in proxy_list_info:
        td_index = 0
        proxy_tds = proxy.findAll('td')
        has_get = False
        proxy = Proxy()
        for proxy_td in proxy_tds:
            td_index += 1
            if td_index == 2:
                has_get = True
                proxy.ip = proxy_td.text
            elif td_index == 3:
                proxy.port = proxy_td.text
            elif td_index == 4:
                if proxy_td.a is not None:
                    proxy.location = proxy_td.a.text
            elif td_index == 5:
                proxy.anonymous_type = proxy_td.text
            elif td_index == 6:
                proxy.proxy_type = proxy_td.text.lower()

        if has_get:
            proxy_list.append(proxy)
    return proxy_list
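filte() above relies purely on column position (td 2 = IP, td 3 = port, td 4 = location, td 5 = anonymity, td 6 = protocol). The standalone snippet below reproduces that positional pattern on an invented HTML fragment so the mapping is easier to see; it is not taken from the scraped site.

from bs4 import BeautifulSoup

html = '''
<table>
  <tr><td>1</td><td>1.2.3.4</td><td>8080</td>
      <td><a>Somewhere</a></td><td>anonymous</td><td>HTTP</td></tr>
</table>
'''

soup = BeautifulSoup(html, 'html.parser')
for row in soup.findAll('tr'):
    cells = row.findAll('td')
    if len(cells) >= 6:
        ip, port = cells[1].text, cells[2].text
        location = cells[3].a.text if cells[3].a is not None else ''
        print(ip, port, location, cells[4].text, cells[5].text.lower())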
Example #23
    async def add_proxy_to_queue(self, proxy: Proxy, collector_id=None):
        async with self.proxies_semaphore:
            asyncio.ensure_future(self.process_proxy(
                proxy.get_raw_protocol(),
                proxy.auth_data,
                proxy.domain,
                proxy.port,
                collector_id,
            ))
Example #24
    def create_or_update_proxy(raw_protocol: Proxy.PROTOCOLS, auth_data,
                               domain, port, start_checking_time,
                               end_checking_time):
        if raw_protocol is None or domain is None or port is None or auth_data is None or start_checking_time is None\
                or end_checking_time is None:
            raise Exception("Bad arguments")

        if raw_protocol < 0 or raw_protocol >= len(Proxy.PROTOCOLS):
            raise Exception("Bad protocol")

        response_time = int(
            round((end_checking_time - start_checking_time) * 1000000))

        proxy = session.query(Proxy).filter(
            sqlalchemy.and_(Proxy.raw_protocol == raw_protocol,
                            Proxy.auth_data == auth_data,
                            Proxy.domain == domain,
                            Proxy.port == port)).first()

        if proxy:
            # exists, so update
            pass
        else:
            # doesn't exist, so create
            proxy = Proxy(number_of_bad_checks=0,
                          raw_protocol=raw_protocol,
                          auth_data=auth_data,
                          domain=domain,
                          port=port)
            session.add(proxy)

        if proxy.bad_proxy or proxy.uptime is None or proxy.uptime == 0:
            proxy.uptime = int(time.time())

        if proxy.bad_uptime is None or proxy.bad_uptime == 0 or \
                proxy.number_of_bad_checks > settings.DEAD_PROXY_THRESHOLD:
            proxy.bad_uptime = int(time.time())

        proxy.response_time = response_time
        proxy.number_of_bad_checks = 0
        proxy.last_check_time = int(time.time())

        checking_time = int(end_checking_time - start_checking_time)
        if checking_time > settings.PROXY_CHECKING_TIMEOUT:
            checking_time = settings.PROXY_CHECKING_TIMEOUT

        proxy.checking_period = \
            settings.MIN_PROXY_CHECKING_PERIOD \
            + (checking_time / settings.PROXY_CHECKING_TIMEOUT) \
            * (settings.MAX_PROXY_CHECKING_PERIOD - settings.MIN_PROXY_CHECKING_PERIOD)

        session.commit()
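The checking_period assignment at the end of create_or_update_proxy is a plain linear interpolation between the minimum and maximum checking periods, driven by how long the check took. The sketch below restates it with invented settings values purely to show the effect; the real values come from the project's settings module.

MIN_PROXY_CHECKING_PERIOD = 10 * 60   # assumed: 10 minutes
MAX_PROXY_CHECKING_PERIOD = 60 * 60   # assumed: 1 hour
PROXY_CHECKING_TIMEOUT = 30           # assumed: 30 seconds

def checking_period(checking_time):
    checking_time = min(checking_time, PROXY_CHECKING_TIMEOUT)
    return MIN_PROXY_CHECKING_PERIOD + (
        checking_time / PROXY_CHECKING_TIMEOUT
    ) * (MAX_PROXY_CHECKING_PERIOD - MIN_PROXY_CHECKING_PERIOD)

print(checking_period(3))    # fast check  -> 900.0, checked often
print(checking_period(30))   # slow check  -> 3600.0, checked rarely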
Example #25
def use_thread_with_queue2():
    cleanup()
    # in_queue = Queue.Queue()
    # out_queue = Queue.Queue()
    #
    # for i in range(5):
    #     t = threading.Thread(target=save_proxies_with_queue2,
    #                          args=(in_queue, out_queue))
    #     t.setDaemon(True)
    #     t.start()
    #
    # for url in PROXY_SITES:
    #     in_queue.put(url)
    #
    # result = []
    #
    # for i in range(5):
    #     t = threading.Thread(target=append_result,
    #                          args=(out_queue, result))
    #     t.setDaemon(True)
    #     t.start()
    #
    # in_queue.join()
    # out_queue.join()

    addresses = []

    # mogu_key = ""
    # res = requests.get(mogu_key)
    # addresses = res.json()['msg']

    for address in addresses:
        proxy = Proxy(address=address['ip'] + ':' + address['port'])
        try:
            proxy.save()
        except NotUniqueError:
            pass

    pool = Pool(10)
    pool.map(check_proxy, Proxy.objects.all())
    print(len(addresses))
    print(Proxy.objects.count())
Example #26
def save_proxies(url):
    proxies = []
    try:
        # if url == 'http://www.kuaidaili.com/free':
        #     import pdb;
        #     pdb.set_trace()
        res = requests.get(url)

    except requests.exceptions.RequestException:
        return False
    addresses = re.findall(PROXY_REGEX, res.text)
    for address in addresses:
        proxy = Proxy(address=address)
        try:
            proxy.save()
        except NotUniqueError:
            pass
        else:
            proxies.append(address)
    return proxies
Example #27
def filte(content):
	soup = BeautifulSoup(content)
	proxy_list_info = soup.findAll('tr')
	proxy_list = []
	for proxy in proxy_list_info:
		td_index = 0
		proxy_tds = proxy.findAll('td')
		has_get = False 
		proxy = Proxy();
		for proxy_td in proxy_tds:
			td_index += 1
			if td_index == 2:
				has_get = True
				proxy.ip = proxy_td.text
			elif td_index == 3:
				proxy.port = proxy_td.text
			elif td_index == 4:
				if not proxy_td.a == None:
					proxy.location = proxy_td.a.text
			elif td_index == 5:
				proxy.anonymous_type = proxy_td.text
			elif td_index == 6:
				proxy.proxy_type = proxy_td.text.lower()  

		if has_get:
			proxy_list.append(proxy)
	return proxy_list
Example #28
def fetch(url):
    s = requests.Session()
    s.headers.update({'user-agent': get_user_agent()})
    proxies = {
        'http': Proxy.get_random()['address'],
    }
    html_text = s.get(url, timeout=TIMEOUT, proxies=proxies).text
    js_url = gen_js_url(url)
    try:
        js_data = s.get(js_url, timeout=TIMEOUT, proxies=proxies).json()
    except JSONDecodeError:
        raise RequestException()
    return html_text, js_data
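Note that the proxies mapping passed to requests is keyed by URL scheme, and fetch() above only sets the 'http' key, so HTTPS URLs would bypass the proxy. A minimal standalone illustration with a placeholder proxy address:

import requests

proxies = {
    'http': 'http://203.0.113.10:8080',   # placeholder proxy address
    'https': 'http://203.0.113.10:8080',
}
resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
print(resp.json())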
Example #29
def fetch(url):
    s = requests.Session()
    s.headers.update({'user-agent': get_user_agent()})
    proxies = {
        'http': Proxy.get_random()['address'],
    }
    html_text = s.get(url, timeout=TIMEOUT, proxies=proxies).text
    js_url = gen_js_url(url)
    try:
        js_data = s.get(js_url, timeout=TIMEOUT, proxies=proxies).json()
    except JSONDecodeError:
        raise RequestException()
    return html_text, js_data
Example #30
    def __execute_one_spider_task(self, spider):
        """
        一次执行具体爬虫的任务
        :param spider:
        :return:
        """
        try:
            for proxy in spider.get_proxies():
                proxy = check_proxy(proxy)
                if proxy.speed != -1:
                    session = Session()
                    exist = session.query(Proxy) \
                        .filter(Proxy.ip == str(proxy.ip), Proxy.port == str(proxy.port)) \
                        .first()

                    if not exist:
                        obj = Proxy(
                            ip=str(proxy.ip),
                            port=str(proxy.port),
                            protocol=proxy.protocol,
                            nick_type=proxy.nick_type,
                            speed=proxy.speed,
                            area=str(proxy.area),
                            score=proxy.score,
                            disable_domain=proxy.disable_domain,
                            origin=str(proxy.origin),
                            create_time=datetime.now()
                        )
                        session.add(obj)
                        session.commit()
                        session.close()
                        logger.info(f'insert: {proxy.ip}:{proxy.port} from {proxy.origin}!')
                    else:
                        exist.score['score'] = settings.MAX_SCORE
                        exist.score['power'] = 0
                        exist.port = proxy.port
                        exist.protocol = proxy.protocol
                        exist.nick_type = proxy.nick_type
                        exist.speed = proxy.speed
                        exist.area = proxy.area
                        exist.disable_domain = proxy.disable_domain
                        exist.origin = proxy.origin
                        session.commit()
                        session.close()
                        logger.info(f'update: {proxy.ip}:{proxy.port}, to max score successfully!')
                else:
                    logger.info(f'invalid: {proxy.ip}:{proxy.port} from {proxy.origin}!')

        except Exception as e:
            logger.error(f'spider error: {e}')
Example #31
    def __update_proxy(self, proxy, valid=False):
        proxy['scan_date'] = datetime.utcnow()
        if valid:
            proxy['fail_count'] = 0
            self.stats['valid'] += 1
            self.stats['total_valid'] += 1
        else:
            proxy['fail_count'] += 1
            self.stats['fail'] += 1
            self.stats['total_fail'] += 1

        proxy = Proxy.db_format(proxy)
        with self.proxy_updates_lock:
            self.test_hashes.remove(proxy['hash'])
            self.proxy_updates[proxy['hash']] = proxy
Example #32
    async def process_raw_proxy(self, proxy, collector_id):
        self.logger.debug("processing raw proxy \"{}\"".format(proxy))

        try:
            _, auth_data, domain, port = proxy_validator.retrieve(proxy)
        except proxy_validator.ValidationError as ex:
            self.collectors_logger.error(
                "Collector with id \"{}\" returned bad raw proxy \"{}\". "
                "Message: {}".format(collector_id, proxy, ex)
            )
            return


        # don't care about protocol
        try:
            proxy = await db.get(
                Proxy.select().where(
                    Proxy.auth_data == auth_data,
                    Proxy.domain == domain,
                    Proxy.port == port,
                )
            )

            if proxy.last_check_time + settings.PROXY_NOT_CHECKING_PERIOD >= time.time():
                proxy_short_address = ""
                if auth_data:
                    proxy_short_address += auth_data + "@"

                proxy_short_address += "{}:{}".format(domain, port)

                self.logger.debug(
                    "skipping proxy \"{}\" from collector \"{}\"".format(
                        proxy_short_address, collector_id)
                )
                return
        except Proxy.DoesNotExist:
            pass

        for raw_protocol in range(len(Proxy.PROTOCOLS)):
            while not self.good_proxies_are_processed:
                # TODO: find a better way
                await asyncio.sleep(0.1)

            new_proxy = Proxy()
            new_proxy.raw_protocol = raw_protocol
            new_proxy.auth_data = auth_data
            new_proxy.domain = domain
            new_proxy.port = port

            await self.add_proxy_to_queue(new_proxy, collector_id)
Example #33
def save_search_result(page, queue, retry=0):
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, page)

    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError, IOError):
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            raise GreenletExit()
        try:
            p = Proxy.objects.get(address=proxy)
            if p:
                p.delete()
        except DoesNotExist:
            pass

        return save_search_result(page, queue, retry)
    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # this proxy has been blocked; switch to another one
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            print 'retry too much!'
            raise GreenletExit()
        return save_search_result(page, queue, retry)
    articles = results.find_all(
        'div', lambda x: 'wx-rb' in x)
    for article in articles:
        save_article(article)

    page_container = soup.find(id='pagebar_container')
    if page_container and u'下一页' in page_container.text:
        last_page = int(page_container.find_all('a')[-2].text)
        current_page = int(page_container.find('span').text)
        for page in range(current_page + 1, last_page + 1):
            put_new_page(page, queue)
Example #34
def save_search_result(page, queue, retry=0):
    proxy = Proxy.get_random()['address']
    url = SEARCH_URL.format(SEARCH_TEXT, page)

    try:
        r = fetch(url, proxy=proxy)
    except (Timeout, ConnectionError, IOError):
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            raise GreenletExit()
        try:
            p = Proxy.objects.get(address=proxy)
            if p:
                p.delete()
        except DoesNotExist:
            pass

        return save_search_result(page, queue, retry)
    soup = BeautifulSoup(r.text, 'lxml')
    results = soup.find(class_='results')
    if results is None:
        # this proxy has been blocked; switch to another one
        sleep(0.1)
        retry += 1
        if retry > 5:
            put_new_page(page, queue)
            print 'retry too much!'
            raise GreenletExit()
        return save_search_result(page, queue, retry)
    articles = results.find_all('div', lambda x: 'wx-rb' in x)
    for article in articles:
        save_article(article)

    page_container = soup.find(id='pagebar_container')
    if page_container and u'下一页' in page_container.text:
        last_page = int(page_container.find_all('a')[-2].text)
        current_page = int(page_container.find('span').text)
        for page in range(current_page + 1, last_page + 1):
            put_new_page(page, queue)
Example #35
def filte(content):
    soup = BeautifulSoup(content)
    proxy_list_tables = soup.findAll('table')
    table_index = 0
    pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    proxy_list = []
    for table in proxy_list_tables:
        table_index += 1
        if table_index == 3:
            proxy_list_info = table.findAll('tr')
            for proxy in proxy_list_info:
                td_index = 0
                proxy_tds = proxy.findAll('td')
                proxy = Proxy()
                is_proxy = False
                for proxy_td in proxy_tds:
                    td_index += 1
                    if td_index == 2:
                        rel_ip_info = re.search(pattern, proxy_td.text)
                        if rel_ip_info:
                            proxy.ip = rel_ip_info.group(0)
                            is_proxy = True
                    elif td_index == 3:
                        if is_proxy:
                            proxy.port = int(proxy_td.text)
                    elif td_index == 4:
                        if is_proxy:
                            if '匿名代理' == proxy_td.text or '高度匿名' == proxy_td.text:
                                proxy.anonymous_type = '高匿'
                            else:
                                proxy.anonymous_type = '透明'
                    elif td_index == 5:
                        if is_proxy:
                            proxy.location = proxy_td.text
                            proxy.proxy_type = 'http'
                if is_proxy:
                    proxy_list.append(proxy)
    return proxy_list
Example #36
async def fetch(url, retry=0):
    proxy = 'http://{}'.format(Proxy.get_random()['address'])
    headers = {'user-agent': get_user_agent()}
    conn = aiohttp.ProxyConnector(proxy=proxy)

    js_url = gen_js_url(url)

    try:
        with aiohttp.ClientSession(connector=conn) as session:
            with aiohttp.Timeout(TIMEOUT):
                async with session.get(url, headers=headers) as resp:
                    html_text = await resp.text()

                async with session.get(js_url, headers=headers) as resp:
                    js_data = await resp.json()
    except:
        retry += 1
        if retry > 5:
            raise CrawlerError()
        await asyncio.sleep(1)
        return await fetch(url, retry=retry)
    return html_text, js_data
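aiohttp.ProxyConnector and the aiohttp.Timeout context manager used above come from old aiohttp releases; current aiohttp passes the proxy per request and the timeout via ClientTimeout. A rough equivalent sketch, with the proxy address and URL as placeholders:

import asyncio
import aiohttp

async def fetch_via_proxy(url, proxy='http://203.0.113.10:8080'):
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url, proxy=proxy) as resp:
            return await resp.json()

# asyncio.run(fetch_via_proxy('http://httpbin.org/ip'))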
Example #37
async def fetch(url, retry=0):
    proxy = 'http://{}'.format(Proxy.get_random()['address'])
    headers = {'user-agent': get_user_agent()}
    conn = aiohttp.ProxyConnector(proxy=proxy)

    js_url = gen_js_url(url)

    try:
        with aiohttp.ClientSession(connector=conn) as session:
            with aiohttp.Timeout(TIMEOUT):
                async with session.get(url, headers=headers) as resp:
                    html_text = await resp.text()

                async with session.get(js_url, headers=headers) as resp:
                    js_data = await resp.json()
    except:
        retry += 1
        if retry > 5:
            raise CrawlerError()
        await asyncio.sleep(1)
        return await fetch(url, retry=retry)
    return html_text, js_data
Example #38
def set_proxy(bot, update, args):
    if len(args) == 1:
        proxy_data = args[0].split(':')
        if len(proxy_data) != 4:
            update.message.reply_text(
                "Please, include the proxy data to the "
                "command, like in the example:\n"
                "<code>/set_proxy mydomain.com:8080:usErnAme:s3cret</code>",
                parse_mode=ParseMode.HTML)
            return
        proxy_ip, proxy_port, proxy_username, proxy_password = proxy_data
        current_proxy = session.query(Proxy).first()
        if current_proxy:
            session.delete(current_proxy)
        new_proxy = Proxy(proxy_ip, proxy_port, proxy_username, proxy_password)
        session.add(new_proxy)
        session.commit()
        update.message.reply_text("Proxy settings updated.")
    else:
        update.message.reply_text(
            "Please, include the proxy data to the "
            "command, like in the example:\n"
            "<code>/set_proxy mydomain.com:8080:usErnAme:s3cret</code>",
            parse_mode=ParseMode.HTML)
Example #39
async def fetch(retry=0):
    proxy = 'http://{}'.format(Proxy.get_random()['address'])
    headers = {'user-agent': get_user_agent()}
    conn = aiohttp.ProxyConnector(proxy=proxy)

    url = 'http://httpbin.org/ip'

    try:
        with aiohttp.ClientSession(connector=conn) as session:
            with aiohttp.Timeout(TIMEOUT):
                async with session.get(url, headers=headers) as resp:
                    return await resp.json()
    except (ProxyConnectionError, TimeoutError):
        try:
            p = Proxy.objects.get(address=proxy)
            if p:
                p.delete()
        except DoesNotExist:
            pass
        retry += 1
        if retry > 5:
            raise TimeoutError()
        await asyncio.sleep(1)
        return await fetch(retry=retry)
Example #40
def filte(content):
	soup = BeautifulSoup(content)
	proxy_list_tables = soup.findAll('table')
	table_index = 0
	pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
	proxy_list = []
	for table in proxy_list_tables:
		table_index += 1
		if table_index == 3:
			proxy_list_info = table.findAll('tr')
			for proxy in proxy_list_info:
				td_index = 0
				proxy_tds = proxy.findAll('td')
				proxy = Proxy();
				is_proxy = False
				for proxy_td in proxy_tds:
					td_index += 1
					if td_index == 2:
						rel_ip_info = re.search(pattern, proxy_td.text)
						if rel_ip_info:
							proxy.ip = rel_ip_info.group(0)
							is_proxy = True
					elif td_index == 3:
						if is_proxy:
							proxy.port = int(proxy_td.text)
					elif td_index == 4:
						if is_proxy:
							if '匿名代理' == proxy_td.text or '高度匿名' == proxy_td.text:
								proxy.anonymous_type = '高匿'
							else:
								proxy.anonymous_type = '透明'
					elif td_index == 5:
						if is_proxy:
							proxy.location = proxy_td.text
							proxy.proxy_type = 'http'
				if is_proxy:
					proxy_list.append(proxy)
	return proxy_list
Example #41
		return (False, 0)
	except:
		return (False, 0)

def check_google(proxy_info):
	proxy_content = proxy_info.ip + ':' + str(proxy_info.port)
	proxy = urllib2.ProxyHandler({proxy_info.proxy_type : proxy_content})
	opener = urllib2.build_opener(proxy)
	urllib2.install_opener(opener)
	try:
		time1 = time.time()
		response = urllib2.urlopen(GOOGLE_CHECK_URL, timeout=3)
		title = BeautifulSoup(response.read()).title.text
		if 'Google' == str(title):
			proxy_info.check_time = str(datetime.now()).split('.')[0]
			return (True, (time.time() - time1) * 1000)
		else:
			return (False, 0)
	except:
		return (False, 0)

if __name__ == '__main__':
	proxy = Proxy()
	proxy.ip = '222.74.6.48'
	proxy.port = '8000'
	proxy.proxy_type = 'http'
	default_ip = get_default_ip()
	print check_anonymous(proxy, default_ip)
	
	
Example #42
def cleanup():
    Proxy.drop_collection()
        return (False, 0)
    except:
        return (False, 0)


def check_google(proxy_info):
    proxy_content = proxy_info.ip + ":" + str(proxy_info.port)
    proxy = urllib2.ProxyHandler({proxy_info.proxy_type: proxy_content})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
    try:
        time1 = time.time()
        response = urllib2.urlopen(GOOGLE_CHECK_URL, timeout=3)
        title = BeautifulSoup(response.read()).title.text
        if "Google" == str(title):
            proxy_info.check_time = str(datetime.now()).split(".")[0]
            return (True, (time.time() - time1) * 1000)
        else:
            return (False, 0)
    except:
        return (False, 0)


if __name__ == "__main__":
    proxy = Proxy()
    proxy.ip = "222.74.6.48"
    proxy.port = "8000"
    proxy.proxy_type = "http"
    default_ip = get_default_ip()
    print check_anonymous(proxy, default_ip)