Example #1
def get_proxy():
    pool = ProxyPool.get_proxy_pool()
    while pool:
        proxy = random.choice(pool)
        if ProxyPool.is_valid_proxy(proxy):
            return proxy
        ProxyPool.expire(proxy)
        pool = ProxyPool.get_proxy_pool()
    logging.error("@get_proxy Error: no available proxy.")
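get_proxy() assumes three class-level operations on ProxyPool. Below is a minimal sketch of that interface, under the assumption that the pool is just an in-memory list of host:port strings; everything here is illustrative, not the project's real implementation.

import random
import logging

class ProxyPool(object):
    _pool = ["1.2.3.4:8080", "5.6.7.8:3128"]  # illustrative proxy addresses

    @classmethod
    def get_proxy_pool(cls):
        # Snapshot of the proxies currently considered usable.
        return list(cls._pool)

    @classmethod
    def is_valid_proxy(cls, proxy):
        # A real pool would probe the proxy here; this stub accepts anything non-empty.
        return bool(proxy)

    @classmethod
    def expire(cls, proxy):
        # Drop a proxy that failed validation so it is not picked again.
        if proxy in cls._pool:
            cls._pool.remove(proxy)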
Example #2
def run_proxy_pool_service(**kwargs):
    proxy_pool = ProxyPool(kwargs)
    proxy_pool.load()
    logger.debug("proxy_pool load ok, count = %d", proxy_pool.count())
    from thrift.transport import TSocket, TTransport
    from thrift.server import TServer
    from thrift.protocol import TBinaryProtocol
    handler = ProxyPoolHandler(proxy_pool)
    import proxymaid_rpc.rpc.ProxyPool
    processor = proxymaid_rpc.rpc.ProxyPool.Processor(handler)
    import proxymaid_rpc.settings
    transport = TSocket.TServerSocket(port=proxymaid_rpc.settings.PROXY_POOL_LISTEN_PORT)
    tfactory = TTransport.TBufferedTransportFactory()
    pfactory = TBinaryProtocol.TBinaryProtocolFactory()
    server = TServer.TThreadedServer(processor, transport, tfactory, pfactory)
    start_proxy_web(proxy_pool, 9100)
    logger.debug_fun("start proxy pool service ok")
    server.serve()
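A client for this Thrift service is not shown. The sketch below uses the standard Thrift Python API and assumes the generated proxymaid_rpc.rpc.ProxyPool module also exposes a Client class and a get_proxy RPC; both the method name and the helper are assumptions, not confirmed by the snippet.

from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
import proxymaid_rpc.rpc.ProxyPool
import proxymaid_rpc.settings

def fetch_proxy_from_service(host='127.0.0.1'):
    # Open a buffered binary-protocol connection to the server started above.
    socket = TSocket.TSocket(host, proxymaid_rpc.settings.PROXY_POOL_LISTEN_PORT)
    transport = TTransport.TBufferedTransport(socket)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = proxymaid_rpc.rpc.ProxyPool.Client(protocol)
    transport.open()
    try:
        # RPC method name is assumed; it depends on the .thrift definition.
        return client.get_proxy()
    finally:
        transport.close()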
Example #3
def start_spider(sort, numThreads, numProxyThreads):
    pname = multiprocessing.current_process().name
    # task queue
    taskQueue = Queue.Queue()

    for _ in range(numThreads):
        SpiderThread(taskQueue).start()
    print('{}: {} threads created.'.format(pname, numThreads))

    # add tasks (producers)
    while True:
        idList = get_id_list(sort)
        num = len(idList)
        if num == 0:
            break
        print('{}: {} rows need to update.'.format(pname, num))
        genProxy = ProxyPool(numProxyThreads).gen_proxy()
        for i in idList:
            taskQueue.put((i, sort, genProxy.next()))
        print('{}: {} tasks added!'.format(pname, num))
        # wait
        taskQueue.join()
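SpiderThread itself is not shown; for taskQueue.join() to return, each worker must call task_done() after handling an item. A hypothetical consumer matching the producer above (crawl_one is a made-up placeholder):

import threading

class SpiderThread(threading.Thread):
    def __init__(self, task_queue):
        threading.Thread.__init__(self)
        self.task_queue = task_queue
        self.daemon = True  # let the process exit even if workers are blocked on get()

    def run(self):
        while True:
            row_id, sort, proxy = self.task_queue.get()
            try:
                crawl_one(row_id, sort, proxy)  # placeholder for the real crawl step
            finally:
                # Mark the item done so taskQueue.join() can unblock.
                self.task_queue.task_done()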
Example #4
def get_ip_from_proxiesPool():
    proxypool = ProxyPool(redis_host, redis_port)
    is_ip = 0
    while not is_ip:
        try:
            t = proxypool.pop_ip()[0]
            ip = t[0].decode('utf-8')
            timestamp = t[1]

            if time.time() - timestamp > 60:
                print('ip too old')
                time.sleep(1)
            else:
                is_ip = 1
        except Exception as e:
            print('ip proxy is empty, wait...', e)
            time.sleep(3)

    print(ip)
    return ip, timestamp
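pop_ip() is unpacked as a list of (ip, timestamp) pairs, which is the shape redis-py's zpopmin returns for a sorted set scored by fetch time. A minimal sketch of a pool with that contract follows; the class layout, key name, and the push_ip counterpart used in Example #11 are assumptions.

import redis

class ProxyPool(object):
    def __init__(self, host, port, key='proxies'):
        self.db = redis.StrictRedis(host=host, port=port)
        self.key = key

    def push_ip(self, ip, timestamp):
        # Score each proxy by the time it was fetched.
        self.db.zadd(self.key, {ip: timestamp})

    def pop_ip(self):
        # Returns [(ip_bytes, score)], the shape get_ip_from_proxiesPool() unpacks.
        return self.db.zpopmin(self.key, count=1)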
Example #5
from proxy import ProxyPool
from setting import api, redis_host, redis_port, min_ip



proxypool = ProxyPool(redis_host, redis_port)
proxypool.mother(api, min_ip)

Example #6
def thread_task(task_name, user_id, json_config):
    queue = '.'.join([md5(PROJECT_NAME), str(user_id), md5(task_name)])
    redis_db = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)
    try:
        if task_name not in TASK_LIST:
            return
        logger.info('Start listening for task %s, queue key %s' % (task_name, queue))
        while True:
            task = redis_db.blpop(queue + '.TASK', timeout=2)
            if not task:
                if task_name in TASK_LIST:
                    logger.info('task: %s, queue: %s timed out' % (task_name, queue))
                    continue
                else:
                    break
            url = task[1]
            try:
                # Decide whether to route requests through a proxy
                if json_config.get('USE_PROXY', 0) == 0:
                    from proxy import ProxyPool
                    requests = ProxyPool(key='927200326076035', debug=True)
                else:
                    import requests
                # Decide whether to use a random request header
                if json_config.get('RANDOM_AU', 0) == 0:
                    # print 'RANDOM AU'
                    pass
                resp = requests.get(url)
                logger.info('GET %s' % url)
                task_res = {}
                # Parse the result content
                for res in json_config.get('RESULT', []):
                    key = res[0]
                    rex = res[1]
                    if rex == 'ALL':
                        value = resp.content
                    else:
                        value = re.findall(rex, resp.content)[0]
                    task_res[key] = value
                # Extract the next urls to crawl
                for task_rex in json_config.get('TASK_REX', []):
                    next_urls = re.findall(task_rex, resp.content)
                    for next_url in next_urls:
                        # Check whether this url has already been queued
                        if redis_db.sismember(queue + '.ALL', next_url):
                            continue
                        redis_db.sadd(queue + '.ALL', next_url)  # '.ALL' is checked with sismember, so it must be a set
                        redis_db.lpush(queue + '.TASK', next_url)
                # Push the result into the result queue
                logger.info('OK %s' % url)
                redis_db.lpush(queue + '.OK', url)
                redis_db.lpush(queue + '.RESULT', json.dumps(task_res))
            except Exception, e:
                logger.info('ERR %s %s' % (url, str(e)))
                # Push the error into the error queue
                error_msg = {
                    'task_name': task_name,
                    'url': url,
                    'simple_msg': str(e),
                    'more_msg': traceback.format_exc()
                }
                redis_db.lpush(queue + '.ERROR', json.dumps(error_msg))
    except Exception, e:
        logger.info('Task error %s %s' % (task_name, str(e)))
        # Push the error into the error queue
        error_msg = {
            'task_name': task_name,
            'simple_msg': str(e),
            'more_msg': traceback.format_exc()
        }
        redis_db.lpush(queue + '.ERROR', json.dumps(error_msg))
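The worker above only reads four keys from json_config. An illustrative configuration that exercises each branch (all values are made up):

json_config = {
    'USE_PROXY': 0,   # 0 routes requests through the ProxyPool wrapper instead of the requests module
    'RANDOM_AU': 1,   # 0 would take the (currently empty) random user-agent branch
    'RESULT': [
        ['title', r'<title>(.*?)</title>'],  # (key, regex) pairs; use 'ALL' to keep the raw body
    ],
    'TASK_REX': [
        r'href="(http[^"]+)"',  # regexes whose matches are queued as follow-up urls
    ],
}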
Example #7
def __init__(self, loop):
    self.loop = loop
    self.proxy_pool = ProxyPool(loop)
Example #8
class Crawler:

    headers = {
        'charset': "utf-8",
        'platform': "4",
        "referer": "https://servicewechat.com/wx40f112341ae33edb/1/",
        'content-type': "application/x-www-form-urlencoded",
        'user-agent': "MicroMessenger/6.5.4.1000 NetType/WIFI Language/zh_CN",
        'host': "mwx.mobike.com",
        'connection': "Keep-Alive",
        'accept-encoding': "gzip",
        'cache-control': "no-cache"
    }

    mobike_url = "https://mwx.mobike.com/mobike-api/rent/nearbyBikesInfo.do"
    data_format = "latitude={}&longitude={}&errMsg=getMapCenterLocation"

    # Scan bounding box: left/right hold latitudes, top/bottom hold longitudes.
    left = 30.7828453209
    top = 103.9213455517
    right = 30.4781772402
    bottom = 104.2178123382
    offset = 0.002

    def __init__(self, loop):
        self.loop = loop
        self.proxy_pool = ProxyPool(loop)

    async def get_bike(self, lat, lon):
        proxy = self.proxy_pool.pick()
        async with sema:
            with async_timeout.timeout(5, loop=self.loop):
                try:
                    async with aiohttp.request(
                            method='POST',
                            url=self.mobike_url,
                            headers=self.headers,
                            data=self.data_format.format(lat, lon),
                            proxy=proxy.url,
                    ) as resp:
                        ret = await resp.json()
                        if ret:
                            logger.info("success proxy: %s", proxy.url)
                            self.save(ret)
                except Exception as e:
                    proxy.error()
                    logger.error("get bike error: %s, lat: %s, lon: %s",
                                 str(e), str(lat), str(lon))
                finally:
                    self.total += 1
                    logger.info("success: %s", str(self.total))
            #  sema.release()

    def save(self, ret):
        for item in ret['object']:
            BikeLocation.new_location(item)

    async def run(self):
        logger.info("start")
        self.total = 1
        start = int(time.time())
        lat_range = np.arange(self.left, self.right, -self.offset)
        future_list = []
        for lat in lat_range:
            lon_range = np.arange(self.top, self.bottom, self.offset)
            for lon in lon_range:
                future_list.append(asyncio.Task(self.get_bike(lat, lon)))
        for task in future_list:
            await task
        logger.info("get %s", str(self.total))
        logger.info("done cast: %s", str(int(time.time()) - start))
        logger.info("proxys: %s", str(self.proxy_pool.proxies))
Example #9
import queue
from Writer import Writer
import threading
import traceback
from bs4 import BeautifulSoup
from proxy import KDLProxyFinder, ProxyPool
from pybloom import BloomFilter

basePageURL = 'https://www.proginn.com/'
baseUserURL = 'https://www.proginn.com/wo/'
userQueue = queue.Queue()
writer = Writer()
working = 1
myFilter = BloomFilter(capacity=150000, error_rate=0.0000001)
finder = KDLProxyFinder("http://www.kuaidaili.com/free/inha/")
# finder = XiciProxyFinder("http://www.xicidaili.com/wn/")
ppool_instance = ProxyPool(finder)
ppool_instance.get_proxies()


# headers = {
#     'Cookie': 'UM_distinctid=15f80e3cfdd43-08fecc7b7f450d-8383667-100200-15f80e3cfde45c; client_id=5a3bb7e3dc6b2; x_access_token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1aWQiOiI4ODI2OCIsImN0aW1lIjoxNTEzODYzMTkwLCJleHAiOjE1MTY0NTUxOTB9.AkbrWfEjN82MoOZKZC1QJ_Dif4y0ySPxKaZDWGPiDcw; 48421=48421; XSRF-TOKEN=eyJpdiI6ImpJSXRlU3VMSnNzT3puZHlVdWdCWGc9PSIsInZhbHVlIjoiU1lJV0dWdEVqcHduMjlFMkZIOVBXSXc0VjlHNmcwMjF3QjRRZ1dKRVNrbkNOXC9mMWZwblQyTExvZzlPXC9kWkM4cXBZRjFpTVJjaU9YaTJJa1QzNlV1QT09IiwibWFjIjoiZGI5MGY3YWE1MmJhZmJlZmRmMjdiYTA5YjM2MDI3NzliY2M4ZmY3MDE3MjQyODQ0NzU0ODEyZGZlZGMxMGNiNSJ9; laravel_session=eyJpdiI6IjlvY0Iya3dnSDZ2ZDJoUktORlhZM1E9PSIsInZhbHVlIjoia1BoYlQwejJ4QVFNTXgxaE5mS05qRjR5TzIzZUVkQURkNDd5dU5GcjVvVTFvYWdLXC8wR1dLNUxCNXBwMUJqeE80ZFExRURqUDU4K05BOTVickJsOHZBPT0iLCJtYWMiOiIyNjNhMjhkYmQzMzdkNTU4NGRjMjE5YmU1NGQ0ZWZkNWE2YTNlNTEyN2E2YmM4M2ZhZjJjYTdhNmQ2ZWFiOTllIn0%3D; fed34112b737c4245348174f9344d1d3b8814ad4=eyJpdiI6IkU1R2piV1lVeWJFaWZhTXlsdTk4UEE9PSIsInZhbHVlIjoicEQyMnd0UnQ4TkpMZlEwYTNDbVlNWENteWVyajVBYzI3dnFtME5ud3pBK1JyVGxpVUxSTUtlcGJ6SzJQVGlWQnlpczB4VXZCTW45VFZpd1F1ZU5MNHZBYUFzdUFGeGE0VExGc3ExQmRKcHBicGU0Vlh5bXIram5XaGdVMUMrOWxxaWkzRFVldEFWdmlMK01LSlZJa1VqYU82MkplRndcL2creEdjaEZoY1ltbVlLTzFwQloyVGFFZjdFWFN6M3VISGNUKzdwXC9mbzJzVGxQQk44dzBDcTNkSis2T2FJZnhcL25mUlBPVHZhRHBXeFpOaFwvSTFiSlZiUEhIVDdSTjZ5bGwxSUpwZmpweklEekZQZkdGSzNZdVFwaUtMV2RYOEdzMlZHMHNpNUZvXC94R1pGNFRqMHozYkdNaFlMMWJkT2ZaVG9waGF2ZjNwUnpBbDFLc3BFMDIwVWVxWmRNSGZCM2ZnZWhRTnlcL3hueUJBV3FWWkVVa3g0SkNzRXRoTnFvYVpkNWtpQnR6RGI1Q1RJUVR0T0dJbTFSUCtXTVNlTjFTUzJrWElyQlBZeE1KRW94R1dybGJzSTl1WVwvZlNydE9lenpERjUxNGJsM0pCaE5wSDNpVXNoNlNtSkJaSmtcL1NwanVFdFdrQmtxd0diUT0iLCJtYWMiOiJhOWIxZjEzOGE4YmI0MjM1Mzc3ODYzYzkxYzViMTIyYjUxZmVkZWExMjkwYzQxN2VjODg1ZmUyY2U0ZmM0MWI3In0%3D; CNZZDATA1261469621=1194602006-1509690624-%7C1513924663; Hm_lvt_c92adf6182a39eb23c24cf43abc3f439=1512022434,1513863024; Hm_lpvt_c92adf6182a39eb23c24cf43abc3f439=1513925804',
#     'Referer': 'https://www.proginn.com/wo/60153'
# }
def handleHTML(html, user):
    try:
        soup = BeautifulSoup(html, "html.parser")
        tag = soup.find(class_='nickname')
        user.nickname = tag.a.string
        # tag = soup.find(class_='introduction')
        # if(len(tag.contents)==3):
        #     user.city = "未知"
Example #10
#-*-coding:utf-8-*-

import threading
import traceback
from bs4 import BeautifulSoup
from proxy import KDLProxyFinder, ProxyPool

import requests

baseurl = "http://www.plantphoto.cn/class"
r = requests.get(baseurl)
html = r.text
# soup = BeautifulSoup(html,"html.parser")
finder = KDLProxyFinder("http://www.kuaidaili.com/free/inha/")

ppool_instance = ProxyPool(finder)
ppool_instance.get_proxies()
r = requests.get(baseurl, proxies=ppool_instance.get_one_proxy())
html = r.text
print html
Example #11
def muti_control(id_list):

    pool = ThreadPool(7)
    proxypool = ProxyPool(redis_host, redis_port)
    # Simulate login
    s = requests.Session()
    is_login = 0
    while not is_login:
        ip, timestamp = get_ip_from_proxiesPool()
        is_login = login(s, ip)
        #time.sleep(10)

    print('login success')
    proxies = {
        'http': 'http://{}'.format(ip),
        'https': 'https://{}'.format(ip)
    }
    requests.post(
        url=
        'http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/pageIsUesd-pageUsed.shtml',
        proxies=proxies,
        cookies=s.cookies)
    print('tou ming zhuang ')

    while True:
        # Used to record how many times the ip has been used
        #datetime_info = get_data_from_datetime(threads, sub_duty[0], sub_duty[-1])
        datetime_info = get_data_from_datetime(10, id_list)
        if len(datetime_info) == 0:
            print(time.asctime())
            break
        else:
            task = [list(t) for t in datetime_info]
            for i in task:
                i.append(s.cookies)
                i.append(ip)
            result = pool.map(spider, task)

            # Decide from the results whether the ip or cookies need to be replaced
            needIp = 0
            needCookies = 0

            for r in result:
                #id = r[0]
                #status = r[1]
                #content = r[2]
                if r[1] == 1:
                    # Normal data, write it to the database
                    write_db(r[0], r[2])
                elif r[1] == 2:
                    # cookies invalid, re-login needed
                    print(r[0], 'cookies wrong')
                    needCookies = 1
                    needIp = 1
                elif r[1] == 3:
                    # ip expired
                    print(r[0], 'ip wrong')
                    needIp = 1

            # Update the session state based on the results
            if needCookies:
                is_login = 0
                while not is_login:
                    print('login again...')
                    ip, timestamp = get_ip_from_proxiesPool()
                    is_login = login(s, ip)
                    requests.post(
                        url=
                        'http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/pageIsUesd-pageUsed.shtml',
                        proxies=proxies,
                        cookies=s.cookies)
                print('tou ming zhuang ')
                proxypool.push_ip(ip, timestamp)

            elif needIp:
                while True:
                    ip, timestamp = get_ip_from_proxiesPool()
                    print('renew ip ...')
                    proxies = {
                        'http': 'http://{}'.format(ip),
                        'https': 'https://{}'.format(ip)
                    }

                    try:
                        rsp_ipTest = requests.get(
                            'http://www.pss-system.gov.cn/sipopublicsearch/portal/uiIndex.shtml',
                            proxies=proxies,
                            timeout=10)
                        # '访问受限' means "access restricted" on the target site
                        if rsp_ipTest.status_code == 200 and '访问受限' not in rsp_ipTest.text:
                            try:
                                requests.post(
                                    url=
                                    'http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/pageIsUesd-pageUsed.shtml',
                                    proxies=proxies,
                                    cookies=s.cookies,
                                    timeout=7)
                                print('tou ming zhuang ')
                                break

                            except:
                                pass

                    except:
                        pass