Example 1
    def load(self) -> list:
        ls = []
        if self._num is None:
            return ls

        if self._context and self._context.logger:
            self._context.logger.info('SixSixIPProxySpider: loading proxy list.')

        url = SixSixIPProxySpider._POOL_URL.format(self._num)
        reg = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)(?=<br />)')
        try:
            res = requests.get(url, proxies=self._sys_proxy, timeout=self._timeout)
            for match in reg.finditer(res.text):
                try:
                    for protocol in ('http', 'https'):
                        proxy = Proxy()
                        proxy.ip = match.group(1)
                        proxy.port = match.group(2)
                        proxy.protocol = protocol
                        proxy.proxy_url = self.proxy_url(proxy.ip, proxy.port, proxy.protocol)
                        proxy.collect_time = Datetime.now()
                        proxy.local = Config.local
                        ls.append(proxy)
                except Exception:
                    # Skip matches that cannot be turned into a Proxy entry.
                    pass
            return ls
        except Exception:
            if self._context and self._context.logger:
                self._context.logger.exception('SixSixIPProxySpider: Failed to load proxy list.')
            raise
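
Every example on this page fills in a plain Proxy container. The real class differs per project (some are ORM models, and Example 1 also relies on a project-specific Datetime and Config); a minimal, hypothetical sketch of the fields assigned in these snippets could look like this:

from dataclasses import dataclass
from datetime import datetime
from typing import Optional

# Hypothetical stand-in for the Proxy container used by the examples; field
# names are taken from the attribute assignments in the snippets, everything
# else (types, defaults) is assumed.
@dataclass
class Proxy:
    ip: str = ''
    port: str = ''
    protocol: str = ''            # e.g. 'http' or 'https'
    proxy_url: str = ''           # e.g. 'http://1.2.3.4:8080'
    location: str = ''
    anonymous_type: str = ''
    proxy_type: str = ''
    collect_time: Optional[datetime] = None
    local: Optional[str] = None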
Example 2
def filte(content):
    # Each <tr> on the page describes one proxy; its <td> cells hold, in order,
    # the ip, port, location, anonymity level and protocol.
    soup = BeautifulSoup(content)
    proxy_list_info = soup.findAll('tr')
    proxy_list = []
    for row in proxy_list_info:
        td_index = 0
        proxy_tds = row.findAll('td')
        has_get = False
        proxy = Proxy()
        for proxy_td in proxy_tds:
            td_index += 1
            if td_index == 2:
                has_get = True
                proxy.ip = proxy_td.text
            elif td_index == 3:
                proxy.port = proxy_td.text
            elif td_index == 4:
                if proxy_td.a is not None:
                    proxy.location = proxy_td.a.text
            elif td_index == 5:
                proxy.anonymous_type = proxy_td.text
            elif td_index == 6:
                proxy.proxy_type = proxy_td.text.lower()

        if has_get:
            proxy_list.append(proxy)
    return proxy_list
Example 3
    def load(self) -> list:
        ls = []

        if self._context and self._context.logger:
            self._context.logger.info('FatezeroProxySpider: loading proxy list.')
        try:
            res = requests.get(FatezeroProxySpider._POOL_URL, proxies=self._sys_proxy, timeout=self._timeout)
            for text in res.text.split('\n'):
                try:
                    p = json.loads(text)  # the 'encoding' kwarg was removed in Python 3.9
                    proxy = Proxy()
                    proxy.ip = p['host']
                    proxy.port = p['port']
                    proxy.protocol = p['type']
                    proxy.proxy_url = self.proxy_url(proxy.ip, proxy.port, proxy.protocol)
                    proxy.collect_time = Datetime.now()
                    proxy.local = Config.local
                    ls.append(proxy)
                except Exception:
                    # Skip lines that are not valid proxy records.
                    pass
            if self._num is None:
                return ls
            else:
                return ls[:self._num]
        except Exception:
            if self._context and self._context.logger:
                self._context.logger.exception('FatezeroProxySpider: Failed to load proxy list.')
            raise
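
The response parsed above is expected to contain one JSON object per line with host, port and type keys. A self-contained sketch of that per-line parsing, using a made-up sample line:

import json

# Made-up sample in the one-JSON-object-per-line format the loader expects.
sample_line = '{"host": "1.2.3.4", "port": 8080, "type": "http"}'

record = json.loads(sample_line)
print(record['host'], record['port'], record['type'])  # 1.2.3.4 8080 http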
Example 4
def filte(content):
    soup = BeautifulSoup(content)
    proxy_list_info = soup.findAll('tr')
    proxy_list = []
    for row in proxy_list_info:
        td_index = 0
        proxy_tds = row.findAll('td')
        has_get = False
        proxy = Proxy()
        for proxy_td in proxy_tds:
            td_index += 1
            if td_index == 2:
                has_get = True
                proxy.ip = proxy_td.text
            elif td_index == 3:
                proxy.port = proxy_td.text
            elif td_index == 4:
                if proxy_td.a is not None:
                    proxy.location = proxy_td.a.text
            elif td_index == 5:
                proxy.anonymous_type = proxy_td.text
            elif td_index == 6:
                proxy.proxy_type = proxy_td.text.lower()

        if has_get:
            proxy_list.append(proxy)
    return proxy_list
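
Both filte variants take the raw HTML of a proxy-listing page as a string and return Proxy objects. A hedged usage sketch (the URL is a placeholder, not taken from the original sources):

import requests

# Placeholder URL: substitute the real listing page the project scrapes.
url = 'http://example.com/proxy-list'
html = requests.get(url, timeout=10).text
for p in filte(html)[:5]:
    print(p.ip, p.port, p.proxy_type)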
Example 5
    async def process_raw_proxy(self, proxy, collector_id):
        self.logger.debug("processing raw proxy \"{}\"".format(proxy))

        try:
            _, auth_data, domain, port = proxy_validator.retrieve(proxy)
        except proxy_validator.ValidationError as ex:
            self.collectors_logger.error(
                "Collector with id \"{}\" returned bad raw proxy \"{}\". "
                "Message: {}".format(collector_id, proxy, ex)
            )
            return

        # Look the proxy up regardless of protocol; a recently checked proxy is skipped.
        try:
            proxy = await db.get(
                Proxy.select().where(
                    Proxy.auth_data == auth_data,
                    Proxy.domain == domain,
                    Proxy.port == port,
                )
            )

            if proxy.last_check_time + settings.PROXY_NOT_CHECKING_PERIOD >= time.time():
                proxy_short_address = ""
                if auth_data:
                    proxy_short_address += auth_data + "@"

                proxy_short_address += "{}:{}".format(domain, port)

                self.logger.debug(
                    "skipping proxy \"{}\" from collector \"{}\"".format(
                        proxy_short_address, collector_id)
                )
                return
        except Proxy.DoesNotExist:
            pass

        for raw_protocol in range(len(Proxy.PROTOCOLS)):
            while not self.good_proxies_are_processed:
                # TODO: find a better way
                await asyncio.sleep(0.1)

            new_proxy = Proxy()
            new_proxy.raw_protocol = raw_protocol
            new_proxy.auth_data = auth_data
            new_proxy.domain = domain
            new_proxy.port = port

            await self.add_proxy_to_queue(new_proxy, collector_id)
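
proxy_validator.retrieve() above returns a (protocol, auth_data, domain, port) tuple for a raw proxy string. The sketch below is not the proxy_py implementation, only a hypothetical illustration of that contract using a regular expression:

import re

# Hypothetical parser with the same return shape as proxy_validator.retrieve():
# (protocol, auth_data, domain, port) for strings such as
# "http://user:pass@1.2.3.4:8080" or "1.2.3.4:8080".
_PROXY_RE = re.compile(
    r'^(?:(?P<protocol>[a-z0-9]+)://)?'
    r'(?:(?P<auth>[^@]+)@)?'
    r'(?P<domain>[^:@/]+):(?P<port>\d{1,5})$'
)

def retrieve(raw):
    match = _PROXY_RE.match(raw.strip())
    if match is None:
        raise ValueError('bad raw proxy: {!r}'.format(raw))
    return (match.group('protocol'), match.group('auth') or '',
            match.group('domain'), int(match.group('port')))

print(retrieve('http://user:pw@1.2.3.4:8080'))  # ('http', 'user:pw', '1.2.3.4', 8080)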
Example 6
    def get_ips(self):
        for pat in pats:
            objs = []
            headers = {
                'content-type': 'application/json',
                'User-Agent': random.choice(agents)
            }
            for i in range(100):
                # Route each request through a random proxy already stored in the database.
                ip_obj = self.session.query(Proxy).order_by(
                    func.random()).first()
                proxies = {
                    '{type}'.format(type=ip_obj.type):
                    '{type}://{ip}:{port}'.format(type=ip_obj.type,
                                                  ip=ip_obj.ip,
                                                  port=ip_obj.port)
                }
                url = '{base_url}{pat}{page}'.format(base_url=base_url,
                                                     pat=pat,
                                                     page=i)
                logger.info('Scraping {url}'.format(url=url))
                try:
                    response = requests.get(url,
                                            headers=headers,
                                            proxies=proxies)
                    if response.status_code == 200:
                        selector = etree.HTML(response.text)
                        for line in selector.xpath(
                                '//table[@id="ip_list"]//tr[@class="odd"]'):
                            proxy_obj = Proxy()
                            proxy_obj.id = str(uuid.uuid1())
                            proxy_obj.ip = line.xpath('td')[1].xpath(
                                'text()')[0]
                            proxy_obj.port = line.xpath('td')[2].xpath(
                                'text()')[0]
                            proxy_obj.type = str(
                                line.xpath('td')[5].xpath('text()')
                                [0]).lower().replace('https', 'http')
                            objs.append(proxy_obj)
                except Exception:
                    # Network and parsing failures for this page are ignored.
                    pass
            self._threads_check(objs)
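
The proxies mapping built above follows the format the requests library expects: scheme keys mapped to full proxy URLs. A standalone sketch (the address is a placeholder; httpbin.org is used only as a convenient echo service):

import requests

# requests routes traffic per scheme; replace the address with a live proxy.
proxies = {
    'http': 'http://1.2.3.4:8080',
    'https': 'http://1.2.3.4:8080',
}

try:
    # httpbin.org/ip echoes the caller's IP, which shows whether the proxy is used.
    resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    print(resp.json())
except requests.RequestException as exc:
    print('proxy failed:', exc)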
Example 7
def filte(content):
    # The proxy table is the third <table> on the page; each row's cells hold,
    # in order, the ip, port, anonymity level and location.
    soup = BeautifulSoup(content)
    proxy_list_tables = soup.findAll('table')
    table_index = 0
    pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    proxy_list = []
    for table in proxy_list_tables:
        table_index += 1
        if table_index == 3:
            proxy_list_info = table.findAll('tr')
            for row in proxy_list_info:
                td_index = 0
                proxy_tds = row.findAll('td')
                proxy = Proxy()
                is_proxy = False
                for proxy_td in proxy_tds:
                    td_index += 1
                    if td_index == 2:
                        rel_ip_info = re.search(pattern, proxy_td.text)
                        if rel_ip_info:
                            proxy.ip = rel_ip_info.group(0)
                            is_proxy = True
                    elif td_index == 3:
                        if is_proxy:
                            proxy.port = int(proxy_td.text)
                    elif td_index == 4:
                        if is_proxy:
                            # '匿名代理' / '高度匿名' mean anonymous / highly anonymous;
                            # '高匿' marks an elite proxy, '透明' a transparent one.
                            if '匿名代理' == proxy_td.text or '高度匿名' == proxy_td.text:
                                proxy.anonymous_type = '高匿'
                            else:
                                proxy.anonymous_type = '透明'
                    elif td_index == 5:
                        if is_proxy:
                            proxy.location = proxy_td.text
                            proxy.proxy_type = 'http'
                if is_proxy:
                    proxy_list.append(proxy)
    return proxy_list
Example 8
def filte(content):
    soup = BeautifulSoup(content)
    proxy_list_tables = soup.findAll('table')
    table_index = 0
    pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    proxy_list = []
    for table in proxy_list_tables:
        table_index += 1
        if table_index == 3:
            proxy_list_info = table.findAll('tr')
            for row in proxy_list_info:
                td_index = 0
                proxy_tds = row.findAll('td')
                proxy = Proxy()
                is_proxy = False
                for proxy_td in proxy_tds:
                    td_index += 1
                    if td_index == 2:
                        rel_ip_info = re.search(pattern, proxy_td.text)
                        if rel_ip_info:
                            proxy.ip = rel_ip_info.group(0)
                            is_proxy = True
                    elif td_index == 3:
                        if is_proxy:
                            proxy.port = int(proxy_td.text)
                    elif td_index == 4:
                        if is_proxy:
                            if '匿名代理' == proxy_td.text or '高度匿名' == proxy_td.text:
                                proxy.anonymous_type = '高匿'
                            else:
                                proxy.anonymous_type = '透明'
                    elif td_index == 5:
                        if is_proxy:
                            proxy.location = proxy_td.text
                            proxy.proxy_type = 'http'
                if is_proxy:
                    proxy_list.append(proxy)
    return proxy_list
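
The manual td_index counter used by these parsers can also be written with enumerate(start=1) without changing the column numbers; a minimal, self-contained illustration with made-up cell values:

# Made-up row cells standing in for the text of the <td> elements.
cells = ['rank', '1.2.3.4', '8080', 'CN', 'elite', 'HTTP']

ip = port = None
for td_index, cell in enumerate(cells, start=1):
    if td_index == 2:      # same column positions as in the parsers above
        ip = cell
    elif td_index == 3:
        port = cell

print(ip, port)  # 1.2.3.4 8080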
Example 9
        return (False, 0)
    except Exception:
        return (False, 0)


def check_google(proxy_info):
    proxy_content = proxy_info.ip + ":" + str(proxy_info.port)
    proxy = urllib2.ProxyHandler({proxy_info.proxy_type: proxy_content})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
    try:
        time1 = time.time()
        response = urllib2.urlopen(GOOGLE_CHECK_URL, timeout=3)
        title = BeautifulSoup(response.read()).title.text
        if "Google" == str(title):
            proxy_info.check_time = str(datetime.now()).split(".")[0]
            return (True, (time.time() - time1) * 1000)
        else:
            return (False, 0)
    except Exception:
        return (False, 0)


if __name__ == "__main__":
    proxy = Proxy()
    proxy.ip = "222.74.6.48"
    proxy.port = "8000"
    proxy.proxy_type = "http"
    default_ip = get_default_ip()
    print check_anonymous(proxy, default_ip)
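
check_google above is Python 2 code (urllib2 and the print statement). A hedged Python 3 counterpart using urllib.request; GOOGLE_CHECK_URL is assumed to point at the Google homepage, since its value is not shown in the snippet:

import time
import urllib.request
from datetime import datetime

from bs4 import BeautifulSoup

GOOGLE_CHECK_URL = 'http://www.google.com'  # assumed value; defined elsewhere in the original


def check_google_py3(proxy_info):
    # Same logic as check_google, ported to Python 3: urllib2 became urllib.request,
    # and the opener is used directly instead of being installed globally.
    proxy_content = '{}:{}'.format(proxy_info.ip, proxy_info.port)
    handler = urllib.request.ProxyHandler({proxy_info.proxy_type: proxy_content})
    opener = urllib.request.build_opener(handler)
    try:
        start = time.time()
        response = opener.open(GOOGLE_CHECK_URL, timeout=3)
        title = BeautifulSoup(response.read(), 'html.parser').title.text
        if title == 'Google':
            proxy_info.check_time = str(datetime.now()).split('.')[0]
            return (True, (time.time() - start) * 1000)
        return (False, 0)
    except Exception:
        return (False, 0)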
Example 10
                return (True, (time.time() - time1) * 1000)
        return (False, 0)
    except Exception:
        return (False, 0)

def check_google(proxy_info):
    proxy_content = proxy_info.ip + ':' + str(proxy_info.port)
    proxy = urllib2.ProxyHandler({proxy_info.proxy_type: proxy_content})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
    try:
        time1 = time.time()
        response = urllib2.urlopen(GOOGLE_CHECK_URL, timeout=3)
        title = BeautifulSoup(response.read()).title.text
        if 'Google' == str(title):
            proxy_info.check_time = str(datetime.now()).split('.')[0]
            return (True, (time.time() - time1) * 1000)
        else:
            return (False, 0)
    except Exception:
        return (False, 0)

if __name__ == '__main__':
    proxy = Proxy()
    proxy.ip = '222.74.6.48'
    proxy.port = '8000'
    proxy.proxy_type = 'http'
    default_ip = get_default_ip()
    print check_anonymous(proxy, default_ip)
Example 11
    if len(sys.argv) < 2:
        print("Usage: python3 {} RESULT_FILE".format(
            sys.argv[0] if len(sys.argv) == 1 else "program"))
        sys.exit(1)

    last_check_time = int(time.time())
    i = 0
    with open(sys.argv[1], 'r') as file:
        for line in file:
            try:
                print("line {}".format(i))
                json_proxy = json.loads(
                    base64.b64decode(line.encode()).decode())
                # print(json_proxy)
                proxy = Proxy()
                proxy.raw_protocol = Proxy.PROTOCOLS.index(
                    json_proxy['protocol'])
                proxy.auth_data = ""
                proxy.domain = json_proxy['domain']
                proxy.port = json_proxy['port']
                proxy.last_check_time = last_check_time
                last_check_time += 1
                proxy.number_of_bad_checks = settings.REMOVE_ON_N_BAD_CHECKS - 5
                session.add(proxy)
                session.commit()
            except sqlalchemy.exc.IntegrityError:
                session.rollback()
                print("proxy {} exists".format(proxy))

            i += 1
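
Each input line consumed above is a base64-encoded JSON object with protocol, domain and port keys. A short sketch that produces and round-trips one such line (sample values are made up):

import base64
import json

# Build one line in the format the import script reads.
record = {'protocol': 'http', 'domain': '1.2.3.4', 'port': 8080}
line = base64.b64encode(json.dumps(record).encode()).decode()

# Decode it the same way the script does.
decoded = json.loads(base64.b64decode(line.encode()).decode())
print(decoded['protocol'], decoded['domain'], decoded['port'])  # http 1.2.3.4 8080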
Example 12
        return (False, 0)
    except Exception:
        return (False, 0)

def check_google(proxy_info):
    proxy_content = proxy_info.ip + ':' + str(proxy_info.port)
    proxy = urllib2.ProxyHandler({proxy_info.proxy_type: proxy_content})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
    try:
        time1 = time.time()
        response = urllib2.urlopen(GOOGLE_CHECK_URL, timeout=3)
        title = BeautifulSoup(response.read()).title.text
        if 'Google' == str(title):
            proxy_info.check_time = str(datetime.now()).split('.')[0]
            return (True, (time.time() - time1) * 1000)
        else:
            return (False, 0)
    except Exception:
        return (False, 0)

if __name__ == '__main__':
    proxy = Proxy()
    proxy.ip = '222.74.6.48'
    proxy.port = '8000'
    proxy.proxy_type = 'http'
    default_ip = get_default_ip()
    print check_anonymous(proxy, default_ip)