Example #1
@trip.coroutine
def get_proxies():
    """
    this function is to get proxies from publishing websites. Used python module trip, which is a combination of requests and tornado,
    can handle requests with multi-coroutine. We have to be careful of the max TCP packages that the machine can send, cause this function
    consume all TCP packages sending quantity. 
    """
    global ITL#used to iterate URL_SET
    this_web_has = 0#the total number of proxies this web page has.
    ITL += 1 
    #re for IP and Port infomation, eg"127.0.0.1:8080".
    pi = r'(?:((?:\d|[1-9]\d|1\d{2}|2[0-5][0-5])\.(?:\d|[1-9]\d|1\d{2}|2[0-5][0-5])\.(?:\d|[1-9]\d|1\d{2}|2[0-5][0-5])\.(?:\d|[1-9]\d|1\d{2}|2[0-5][0-5]))\D+?(6[0-5]{2}[0-3][0-5]|[1-5]\d{4}|[1-9]\d{1,3}|[0-9]))'
    ipandportlist = []
    print(URL_SET[ITL])
    try:
        r = yield trip.get(URL_SET[ITL], timeout=30, headers=header)  # non-blocking fetch via trip
        p = re.findall(pi, r.text)
        for each in p:
            str1 = str(each)
            str1 = str1.replace(',', ':')
            str1 = str1.replace('(', '')
            str1 = str1.replace(')', '')
            str1 = str1.replace('\'', '')
            str1 = str1.replace(' ', '')
            ipandportlist.append(str1)
            this_web_has += 1
    except Exception as detail:
        print('This website has a problem.', detail)
        # fall back to the next URL; the recursive coroutine must be yielded to actually run
        ipandportlist = yield get_proxies()
    print(URL_SET[ITL]+' has '+str(this_web_has)+' proxies.')
    return ipandportlist
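A minimal driver sketch for this coroutine, assuming the module-level URL_SET, ITL, and header it relies on (the URLs below are placeholders) and that trip.run drives a coroutine to completion the way Tornado's run_sync does:

import re    # needed by get_proxies
import trip

URL_SET = ['http://example-proxy-list.test/page1',  # hypothetical sources
           'http://example-proxy-list.test/page2']
ITL = -1                                 # get_proxies() increments before each fetch
header = {'User-Agent': 'Mozilla/5.0'}   # placeholder request headers

@trip.coroutine
def crawl_all():
    found = []
    for _ in URL_SET:
        page = yield get_proxies()       # one site per call; ITL advances inside
        found.extend(page)
    print('collected %d proxies in total' % len(found))

trip.run(crawl_all)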
Example #2
@trip.coroutine
def get_proxies():
    """
    爬代理网站的函数
    """
    global ALL
    global ITL  #协助遍历URL_SET的常数,因为需要在这个函数里改动,所以需要在这里global声明一下。
    this_web_has = 0  #用来记录这个网站有多少个代理的信息,记录所有的代理数量,包括不好用的。
    ITL += 1  #用来遍历URL_SET。
    #下面这句是正则表达式,能找到IP和端口信息。
    pi = r'(?:((?:\d|[1-9]\d|1\d{2}|2[0-5][0-5])\.(?:\d|[1-9]\d|1\d{2}|2[0-5][0-5])\.(?:\d|[1-9]\d|1\d{2}|2[0-5][0-5])\.(?:\d|[1-9]\d|1\d{2}|2[0-5][0-5]))\D+?(6[0-5]{2}[0-3][0-5]|[1-5]\d{4}|[1-9]\d{1,3}|[0-9]))'
    ipandportlist = []  #装找到的代理,回头会返回。
    print(URL_SET[ITL])
    try:
        r = yield trip.get(URL_SET[ITL], timeout=200, headers=header)
        #print(r.text)
        p = re.findall(pi, r.text)
        for each in p:
            str1 = str(each)
            str1 = str1.replace(',', ':')
            str1 = str1.replace('(', '')
            str1 = str1.replace(')', '')
            str1 = str1.replace('\'', '')
            str1 = str1.replace(' ', '')
            ALL.append(str1)
            this_web_has += 1
    except Exception as detail:
        print('This website has a problem.', detail)
        yield get_proxies()  # move on to the next URL; yielded so the coroutine actually runs
    print(URL_SET[ITL] + ' has ' + str(this_web_has) + ' proxies.')
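Because trip lets a coroutine yield a list of futures (Example #10 below does this with trip.get), the per-site crawls can also be fanned out concurrently instead of one by one. A sketch under the same assumptions about URL_SET, ITL, and ALL:

@trip.coroutine
def crawl_concurrently():
    # each get_proxies() call claims the next URL_SET entry via ITL before
    # its first yield, so the yielded list fetches every site at once
    yield [get_proxies() for _ in URL_SET]
    print('ALL now holds %d proxies' % len(ALL))

trip.run(crawl_concurrently)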
Example #3
@trip.coroutine
def test_proxy(proxy):
    try:
        r = yield trip.get('http://httpbin.org/get', timeout=5,
            proxies={ 'http': proxy, 'https': proxy })
        if 'httpbin.org' not in r.text:
            raise Exception('Invalid reply')
    except Exception:
        pass  # any failure (timeout or invalid reply) disqualifies this proxy
    else:
        raise trip.Return(proxy)
@trip.coroutine
def requests_a(url, square_, type_):
    global result
    try:
        result = yield trip.get(url)
    except Exception as e:
        return

    print('-----------------{}'.format(result.text))
    if 'DAILY_QUERY_OVER_LIMIT' in result.text:
        sys.exit()
    try:
        status = result.json()['status']
    except Exception as e:
        print(e)
        print(result)
        return
    if status == '1':  # '==' for value comparison; 'is' tested identity
        count = int(result.json()['count'])
        if count != 0:
            if count < 50:
                print('count < 50')

                connection_result = r_result.connection
                channel = connection_result.channel()
                channel.queue_declare(queue='amap_result_json')
                channel.basic_publish(exchange='',
                                      routing_key='amap_result_json',
                                      body=json.dumps(result.json()))
                channel.close()
                print(result.json())

                connection_result.close()
            else:
                print('count > 50')

                connection_page = r_page.connection
                channel = connection_page.channel()
                channel.queue_declare(queue='amap_page_url')
                for i in range(1, int(count / 50 + 0.5)):
                    channel.basic_publish(
                        exchange='',
                        routing_key='amap_page_url',
                        body='http://restapi.amap.com/v3/place/polygon?polygon='
                        + square_ + ';&types=' + type_ +
                        '&output=JSON&offset=50' + '&page=' + str(i + 1),
                    )
                    print('enqueued the paginated URL')
                channel.close()

                # connection_page.close()
        else:
            print('count = 0')
            return
    else:
        return
@trip.coroutine
def asyn_message(body, api_key):
    body = body.decode('utf8')
    url_list = []
    for i in json.loads(body):
        s, t = i['square_list'], i['type']
        square_ = str(s[0]) + ',' + str(s[1]) + ';' + str(s[2]) + ',' + str(s[3])
        url_list.append('http://restapi.amap.com/v3/place/polygon?polygon=' + square_ + ';&types=' + t + '&output=JSON&key=' + \
          api_key + '&offset=50')
    r = yield [trip.get(url) for url in url_list]
    for result in r:
        yield requests_a(result)  # yielded so the coroutine runs; note requests_a above expects (url, square_, type_)
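requests_a publishes into RabbitMQ through two pre-existing handles, r_result and r_page, that this excerpt never defines. Judging by the .connection attribute and .channel() calls, each looks like an open pika channel; a hypothetical setup sketch (host and names are assumptions):

import pika

# hypothetical: one blocking connection per handle, each exposing .connection
r_result = pika.BlockingConnection(pika.ConnectionParameters('localhost')).channel()
r_page = pika.BlockingConnection(pika.ConnectionParameters('localhost')).channel()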
Example #6
@trip.coroutine
def test_proxy(proxy):
    """
    This function is used to test if the proxy server is still in service. We send every proxy candidate a request to get web
    page 'http://httpbin.org/get' for us. If the correct web page is returned then this proxy pass 
    test.This function also used trip to max its speed. 
    """
    global valid_proxy
    try:
        r = yield trip.get('http://httpbin.org/get', timeout=40,
            proxies={'http': proxy, 'https': proxy })
        if 'httpbin' in r.text:
            valid_proxy.append(proxy)
            print('currently, we have '+str(len(valid_proxy))+' valid proxies')
    except Exception as detail:
        print ("ERROR:", detail)
    else:
        raise trip.Return(proxy)
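Putting the two halves together, a sketch of one refresh pass that feeds the candidates from get_proxies into this test_proxy, assuming valid_proxy is the module-level list the docstring refers to:

valid_proxy = []  # survivors accumulate here

@trip.coroutine
def refresh_pool():
    candidates = yield get_proxies()
    # one test per candidate, all in flight at once; test_proxy appends
    # each proxy that answers correctly to valid_proxy
    yield [test_proxy(p) for p in candidates]

trip.run(refresh_pool)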
Example #7
@trip.coroutine
def test_proxy(proxy):
    """
    Used global var valid_proxy,this function tests proxy using trip
    """
    global ALL
    global valid_proxy
    try:
        print(proxy)
        r = yield trip.get('http://httpbin.org/get',
                           timeout=200,
                           proxies={
                               'http': proxy,
                               'https': proxy
                           })
        if 'httpbin' in r.text:
            valid_proxy.append(proxy)
            valid_proxy = list(set(valid_proxy))
            print('currently, we have ' + str(len(valid_proxy)) +
                  ' valid proxies')
    except Exception as detail:
        print("ERROR:", detail)
    else:
        raise trip.Return(proxy)
Example #8
@trip.coroutine
def main():
    r = yield trip.get('https://httpbin.org/get')
    print(r.content)
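To execute the coroutine, trip provides run(), which drives it on Tornado's IOLoop much like run_sync:

trip.run(main)  # prints the response body once the request completes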
Example #9
@trip.coroutine
def get_proxies(number=10):
    r = yield trip.get('http://www.89ip.cn/apijk/' +
        '?&tqsl=%s&sxa=&sxb=&tta=&ports=&ktip=&cf=1' % number)
    p = re.findall(r'((?:\d{1,3}\.){3}\d{1,3}:\d+)', r.text)  # dot escaped so the separator is a literal '.'
    raise trip.Return(p)
Example #10
@trip.coroutine
def async_fetch():
    # assumes module-level `url` and `times`; yielding a list of futures runs them concurrently
    r = yield [trip.get(url) for _ in range(times)]
    raise trip.Return(r)
@trip.coroutine
def asyn_message(body):
    r = yield [trip.get(url) for url in json.loads(body.decode())]
    for result in r:
        yield requests_a(result)  # yielded so the coroutine runs; assumes a requests_a variant that takes a response
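The body this variant expects is a JSON-encoded list of URLs, the kind of payload a queue consumer would hand over. A small illustration with placeholder URLs, assuming trip.run accepts a callable the way Tornado's run_sync does:

import json

body = json.dumps(['https://httpbin.org/get'] * 3).encode()
trip.run(lambda: asyn_message(body))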