Code example #1
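The four examples below appear to come from a single module; here is a sketch of the module-level imports they rely on (pipeline and useragent are project-local helpers, so their import paths are assumptions, and validate_proxy referenced in example #1 is presumably defined elsewhere in the same module):

import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

import pipeline    # project-local database helper (assumed import path)
import useragent   # project-local User-Agent helper (assumed import path)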
def get_proxy(url):
    """ Get suitable proxy from database, if proxy is restricted proxy delete from database. Also call
	    eliteproxy.py when the proxy is not available in ETL_Config database """
    c_url = url.split('/')[0] + "//" + url.split('/')[2]
    database = "ETL_Config"
    table = "EliteProxy"
    proxy_id = pipeline.select(database="ETL_Config",
                               table="EliteProxy",
                               column="ProxyID")
    i_d = ""
    if not proxy_id or int(len(proxy_id)) <= 3:
        scrap_proxy()
        get_proxy(url=c_url)
    else:
        i_d = proxy_id[random.randrange(int(len(proxy_id)))][0]
    headers = useragent.get_agent()
    proxy = validate_proxy(database=database,
                           table=table,
                           url=c_url,
                           i_d=i_d,
                           header=headers)
    return proxy, headers
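A minimal sketch of how a caller might use the (proxy, headers) pair returned above, assuming validate_proxy hands back a requests-style proxies dict; the target URL is only a placeholder:

target = "https://example.com/some/listing"   # placeholder target URL
proxy, headers = get_proxy(url=target)
# Fetch the page through the validated proxy with the matching User-Agent.
page = requests.get(target, proxies=proxy, headers=headers, timeout=(5, 10))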
Code example #2
def check_proxy(proxy, url, ip):
    """ Check proxy working status for current scraping URL """

    try:
        req = requests.get(url,
                           proxies=proxy,
                           headers=useragent.get_agent(),
                           timeout=(5, 10))

        response_code = req.status_code
        print(
            "[{}] Got response from <{}> while using Proxy: {} and returned response code [{}]"
            .format(time.strftime("%I:%M:%S %p", time.localtime()), url, proxy,
                    response_code))

        if response_code == 200:
            print("[{}] Proxy {} is validated and proceed for anonymity check".
                  format(time.strftime("%I:%M:%S %p", time.localtime()),
                         proxy))

            result = elite_proxy(proxy=proxy, ip=ip)

            if result is True:
                return True
            else:
                return False
        else:
            print(
                "[{}] Bad response from <{}> while using proxy {} and returning to Scrap Proxy"
                .format(time.strftime("%I:%M:%S %p", time.localtime()), url,
                        proxy))
            return False
    except Exception as e:
        print("[{}] Exception Occurs at Check_Proxy Method. Error: {}".format(
            time.strftime("%I:%M:%S %p", time.localtime()), e))
        return False
    except KeyboardInterrupt as e:
        print(
            "[{}] Someone Forced Program to EXIT - KeyboardInterrupt at Check_Proxy Method. Error: {}"
            .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        exit()
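A quick sketch of exercising check_proxy on its own; the proxy address below comes from a documentation range (RFC 5737) and is only a placeholder, not a working proxy:

sample = 'http://203.0.113.10:8080'            # placeholder proxy address
proxy = {'http': sample, 'https': sample}
if check_proxy(proxy=proxy, url='https://www.google.com/', ip='203.0.113.10'):
    print("proxy passed the reachability and anonymity checks")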
Code example #3
def elite_proxy(proxy, ip):
    """ Check proxy for high anonymity level (Elite Proxy) """
    try:
        url = 'https://httpbin.org/ip'
        req = requests.get(url,
                           proxies=proxy,
                           headers=useragent.get_agent(),
                           timeout=(5, 10))
        soup = BeautifulSoup(req.content, 'html5lib')
        # Assumes httpbin's /ip body looks like {"origin": "ip, ip"}; the
        # fixed-offset slice below extracts the first address from that text.
        origin_ip = soup.text[15:int((((len(soup.text[15:-4])) / 2) - 1) + 15)]
        if origin_ip == ip:
            print("[{}] Request originated(while using proxy) from {}".format(
                time.strftime("%I:%M:%S %p", time.localtime()), soup.text))
            return True
        else:
            return False
    except Exception as e:
        print("[{}] Exception Occurs at Elite_Proxy Method. Error: {}".format(
            time.strftime("%I:%M:%S %p", time.localtime()), e))
        return False
    except KeyboardInterrupt as e:
        print(
            "[{}] Someone Forced Program to EXIT - KeyboardInterrupt at Elite_Proxy Method. Error: {}"
            .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        exit()
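The fixed-offset slicing above is fragile; here is a sketch of a more defensive alternative that reads httpbin's JSON body directly (origin_matches is a hypothetical helper, not part of the original module):

def origin_matches(response, proxy_ip):
    # httpbin's /ip endpoint returns JSON such as {"origin": "203.0.113.10"}
    # (occasionally "ip, ip"); take the first address and compare it with the
    # proxy's IP to confirm the real client address is hidden.
    origin = response.json().get('origin', '').split(',')[0].strip()
    return origin == proxy_ip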
Code example #4
def scrap_proxy():
    """ Scrap proxy from desired websites and filter high anonymity level proxies """
    checker = [
        'https://www.google.com/', 'https://in.yahoo.com/',
        'https://www.bing.com/', 'https://duckduckgo.com/',
        'https://www.dogpile.com/', 'https://scholar.google.com/'
    ]
    try:
        pipeline.truncate(database="ETL_Config", table="EliteProxy")
        url = pipeline.select(database="ETL_Config",
                              table="NavigationUrl",
                              column="NextPageUrl",
                              condition={"UrlCategory": "Proxy"},
                              operator="AND")
        req = requests.get(url[0][0],
                           headers=useragent.get_agent(),
                           timeout=(5, 10))
        soup = BeautifulSoup(req.text, 'html5lib')
        # The source lists proxies in an 8-column table; every 8th <td> starting
        # at offsets 0, 1 and 4 holds the IP, port and anonymity level.
        ip = list(map(lambda x: x.text, soup.findAll('td')[::8]))
        port = list(map(lambda x: x.text, soup.findAll('td')[1::8]))
        anonymity = list(map(lambda x: x.text, soup.findAll('td')[4::8]))
        data_dictionary = {'IP': ip, 'PORT': port, 'ANONYMITY': anonymity}
        data_frame = pd.DataFrame(data_dictionary)
        data_filter = data_frame['ANONYMITY'] == 'elite proxy'
        elite_data = data_frame[data_filter]
        print("[{}] [{}] items scraped from <{}> successfully.".format(
            time.strftime("%I:%M:%S %p", time.localtime()),
            len(elite_data.index), url[0][0]))
        process = []
        for i in range(len(elite_data.index)):
            ip = elite_data.iloc[i]['IP']
            port = elite_data.iloc[i]['PORT']
            proxies = ('http://' + ip + ':' + port)
            proxy = {
                'http': proxies,
                'https': proxies,
            }
            print("[{}] Evaluating Proxy <{}> that scraped from [{}]".format(
                time.strftime("%I:%M:%S %p", time.localtime()), proxies,
                url[0][0]))
            result = check_proxy(proxy=proxy,
                                 url=random.choice(checker),
                                 ip=ip)
            if result is True:
                p_count = pipeline.select(database="ETL_Config",
                                          table="EliteProxy",
                                          column="COUNT(*)")
                # Stop once 10 validated proxies have been stored.
                if int(p_count[0][0]) >= 10:
                    pipeline.call(database="ETL_Config",
                                  procedure="SP_UpdateProxy")
                    db_result = pipeline.call(
                        database="ETL_Config",
                        procedure="SP_NavigationUrl_Sync",
                        parameter={"category": "Proxy"})
                    if db_result is True:
                        print(
                            "[{}] Elite Proxy Scraper successfully completed and Synchronized."
                            .format(
                                time.strftime("%I:%M:%S %p",
                                              time.localtime())))
                        return True
                    else:
                        print(
                            "[{}] Elite Proxy Scraper successfully completed and 'NOT' Synchronized."
                            .format(
                                time.strftime("%I:%M:%S %p",
                                              time.localtime())))
                        return False
                else:
                    pipeline.insert(database="ETL_Config",
                                    table="EliteProxy",
                                    values={
                                        "IP": ip,
                                        "Port": port,
                                        "Anonymity": "High",
                                        "IsAlive": "Y",
                                        "LastUpdate": time.strftime(
                                            "%Y-%m-%d %H:%M:%S",
                                            time.localtime())
                                    })
        pipeline.call(database="ETL_Config", procedure="SP_UpdateProxy")
        pipeline.call(database="ETL_Config",
                      procedure="SP_NavigationUrl_Sync",
                      parameter={"category": "Proxy"})
    except Exception as e:
        print(
            "[{}] Exception Occurs and retries Scrap_Proxy Method. Error: {}".
            format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        # Any failure restarts the whole scrape from the beginning.
        scrap_proxy()
    except KeyboardInterrupt as e:
        print(
            "[{}] Someone Forced Program to EXIT - KeyboardInterrupt at Scrap_Proxy Method. Error: {}"
            .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        exit()
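A minimal sketch of how these pieces might be wired together as a script entry point; the target URL is a placeholder:

if __name__ == '__main__':
    # get_proxy falls back to scrap_proxy() automatically when the EliteProxy
    # table holds too few entries, so a single call is enough here.
    proxy, headers = get_proxy(url="https://example.com/catalog/page1")
    print("Using proxy:", proxy)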