import random
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# NOTE: pipeline, temp_agent, check_proxy, useragent and eliteproxy are
# project-local modules/helpers that are referenced below but defined elsewhere
# in this repository.


def get_agent():
    """Pick a random User-Agent from the database and return it as request headers.

    Falls back to the UserAgent_BKP table, and finally to temp_agent(), when the
    primary UserAgent table holds too few rows.
    """
    try:
        ua_count = pipeline.select(database="ETL_Config",
                                   table="UserAgent",
                                   column="COUNT(*)")
        headers = requests.utils.default_headers()
        if int(ua_count[0][0]) <= 10:
            ua_count_bkp = pipeline.select(database="ETL_Config",
                                           table="UserAgent_BKP",
                                           column="COUNT(*)")
            if int(ua_count_bkp[0][0]) <= 10:
                print("[{}] Advise scraping and syncing UserAgent: only [{}] rows exist in "
                      "table [UserAgent_BKP] of database [ETL_Config]."
                      .format(time.strftime("%I:%M:%S %p", time.localtime()),
                              ua_count_bkp[0][0]))
                # Last resort: use the hard-coded fallback agent.
                headers.update({"User-Agent": temp_agent()})
                return headers
            else:
                ua = random.randrange(1, int(ua_count_bkp[0][0]))
                user_agent = pipeline.select(database="ETL_Config",
                                             table="UserAgent_BKP",
                                             column="UserAgent",
                                             condition={"UserAgentID": ua})
                print("[{}] Backup UserAgent [ID {}] chosen because the primary UserAgent "
                      "table is not sufficiently populated."
                      .format(time.strftime("%I:%M:%S %p", time.localtime()), ua))
                headers.update({"User-Agent": user_agent[0][0]})
                return headers
        else:
            ua = random.randrange(1, int(ua_count[0][0]))
            user_agent = pipeline.select(database="ETL_Config",
                                         table="UserAgent",
                                         column="UserAgent",
                                         condition={"UserAgentID": ua})
            headers.update({"User-Agent": user_agent[0][0]})
            return headers
    except requests.Timeout as e:
        print("[{}] Exception Occurs - requests.Timeout at get_agent Method. Error: {}"
              .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
    except requests.ConnectionError as e:
        print("[{}] Exception Occurs - requests.ConnectionError at get_agent Method. Error: {}"
              .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
    except requests.RequestException as e:
        print("[{}] Exception Occurs - requests.RequestException (general) at get_agent Method. Error: {}"
              .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
    except KeyboardInterrupt as e:
        print("[{}] Someone forced the program to exit - KeyboardInterrupt at get_agent Method. Error: {}"
              .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        exit()
    # On any handled request error, fall back to the hard-coded agent so callers
    # still receive usable headers instead of None.
    headers = requests.utils.default_headers()
    headers.update({"User-Agent": temp_agent()})
    return headers
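# Usage sketch (illustrative only): how the headers returned by get_agent() are
# typically consumed by a requests call. The target URL below is a placeholder,
# not part of this project.
def _example_get_agent_usage():
    headers = get_agent()
    response = requests.get("https://httpbin.org/headers",
                            headers=headers, timeout=(5, 10))
    return response.status_code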
def validate_proxy(database, table, url, i_d, header):
    """Validate the working status of a stored proxy against the given domain."""
    try:
        ip_port = pipeline.select(database=database,
                                  table=table,
                                  column="IP, Port",
                                  condition={"ProxyID": i_d})
        proxies = 'http://' + ip_port[0][0] + ":" + ip_port[0][1]
        proxy = {
            'http': proxies,
            'https': proxies,
        }
        req = requests.get(url, proxies=proxy, headers=header, timeout=(5, 10))
        if req.status_code == 200:
            print("[{}] Working proxy and User-Agent chosen for scraping: {}."
                  .format(time.strftime("%I:%M:%S %p", time.localtime()), proxy))
            return proxy
        else:
            print("[{}] Bad response from {}."
                  .format(time.strftime("%I:%M:%S %p", time.localtime()), proxy))
            pipeline.delete(database=database, table=table, condition={"ProxyID": i_d})
            # Retry with a fresh proxy; get_proxy() returns (proxy, headers),
            # so pass only the proxy part back to the caller.
            return get_proxy(url=url)[0]
    except requests.Timeout as e:
        print("[{}] Exception Occurs - requests.Timeout at validate_proxy Method. Error: {}"
              .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        pipeline.delete(database="ETL_Config", table="EliteProxy", condition={"ProxyID": i_d})
        return get_proxy(url=url)[0]
    except requests.ConnectionError as e:
        print("[{}] Exception Occurs - requests.ConnectionError at validate_proxy Method. Error: {}"
              .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        pipeline.delete(database="ETL_Config", table="EliteProxy", condition={"ProxyID": i_d})
        return get_proxy(url=url)[0]
    except requests.RequestException as e:
        print("[{}] Exception Occurs - requests.RequestException (general) at validate_proxy Method. Error: {}"
              .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        pipeline.delete(database="ETL_Config", table="EliteProxy", condition={"ProxyID": i_d})
        return get_proxy(url=url)[0]
    except KeyboardInterrupt as e:
        print("[{}] Someone forced the program to exit - KeyboardInterrupt at validate_proxy Method. Error: {}"
              .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        exit()
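# Usage sketch (illustrative only): the dict returned by validate_proxy() plugs
# straight into requests' `proxies` argument. ProxyID 1 and the probe URL are
# placeholders.
def _example_validate_proxy():
    header = requests.utils.default_headers()
    proxy = validate_proxy(database="ETL_Config", table="EliteProxy",
                           url="https://www.bing.com/", i_d=1, header=header)
    return requests.get("https://www.bing.com/", proxies=proxy,
                        headers=header, timeout=(5, 10)).status_code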
def get_proxy(url):
    """Get a suitable proxy from the database; restricted proxies are deleted.

    Calls scrap_proxy() and retries when too few proxies are available in the
    ETL_Config database.
    """
    c_url = url.split('/')[0] + "//" + url.split('/')[2]
    database = "ETL_Config"
    table = "EliteProxy"
    proxy_id = pipeline.select(database=database, table=table, column="ProxyID")
    if not proxy_id or len(proxy_id) <= 3:
        scrap_proxy()
        return get_proxy(url=c_url)
    else:
        i_d = proxy_id[random.randrange(len(proxy_id))][0]
        headers = useragent.get_agent()
        proxy = validate_proxy(database=database, table=table, url=c_url,
                               i_d=i_d, header=headers)
        return proxy, headers
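# Usage sketch (illustrative only): get_proxy() returns a (proxy, headers) pair
# that downstream scrapers pass to requests. The target URL is a placeholder.
def _example_get_proxy_usage():
    proxy, headers = get_proxy(url="https://duckduckgo.com/")
    response = requests.get("https://duckduckgo.com/", proxies=proxy,
                            headers=headers, timeout=(5, 10))
    return response.status_code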
def scrap_proxy():
    """Scrape proxies from the configured website and keep only high-anonymity (elite) proxies."""
    checker = [
        'https://www.google.com/', 'https://in.yahoo.com/',
        'https://www.bing.com/', 'https://duckduckgo.com/',
        'https://www.dogpile.com/', 'https://scholar.google.com/'
    ]
    try:
        pipeline.truncate(database="ETL_Config", table="EliteProxy")
        url = pipeline.select(database="ETL_Config",
                              table="NavigationUrl",
                              column="NextPageUrl",
                              condition={"UrlCategory": "Proxy"},
                              operator="AND")
        req = requests.get(url[0][0], headers=useragent.get_agent(), timeout=(5, 10))
        soup = BeautifulSoup(req.text, 'html5lib')
        # The proxy list page lays each row out as 8 <td> cells; slice them by column.
        ip = list(map(lambda x: x.text, soup.findAll('td')[::8]))
        port = list(map(lambda x: x.text, soup.findAll('td')[1::8]))
        anonymity = list(map(lambda x: x.text, soup.findAll('td')[4::8]))
        data_dictionary = {'IP': ip, 'PORT': port, 'ANONYMITY': anonymity}
        data_frame = pd.DataFrame(data_dictionary)
        data_filter = data_frame['ANONYMITY'] == 'elite proxy'
        elite_data = data_frame[data_filter]
        print("[{}] [{}] items scraped from <{}> successfully."
              .format(time.strftime("%I:%M:%S %p", time.localtime()),
                      len(elite_data.index), url[0][0]))
        for i in range(len(elite_data.index)):
            ip = elite_data.iloc[i]['IP']
            port = elite_data.iloc[i]['PORT']
            proxies = 'http://' + ip + ':' + port
            proxy = {
                'http': proxies,
                'https': proxies,
            }
            print("[{}] Evaluating proxy <{}> scraped from [{}]."
                  .format(time.strftime("%I:%M:%S %p", time.localtime()),
                          proxies, url[0][0]))
            result = check_proxy(proxy=proxy, url=random.choice(checker), ip=ip)
            if result is True:
                p_count = pipeline.select(database="ETL_Config",
                                          table="EliteProxy",
                                          column="COUNT(*)")
                if int(p_count[0][0]) >= 10:
                    # Enough working proxies collected; finalize and synchronize.
                    pipeline.call(database="ETL_Config", procedure="SP_UpdateProxy")
                    db_result = pipeline.call(database="ETL_Config",
                                              procedure="SP_NavigationUrl_Sync",
                                              parameter={"category": "Proxy"})
                    if db_result is True:
                        print("[{}] Elite Proxy Scraper completed and synchronized successfully."
                              .format(time.strftime("%I:%M:%S %p", time.localtime())))
                        return True
                    else:
                        print("[{}] Elite Proxy Scraper completed but was NOT synchronized."
                              .format(time.strftime("%I:%M:%S %p", time.localtime())))
                        return False
                else:
                    pipeline.insert(database="ETL_Config",
                                    table="EliteProxy",
                                    values={
                                        "IP": ip,
                                        "Port": port,
                                        "Anonymity": "High",
                                        "IsAlive": "Y",
                                        "LastUpdate": time.strftime("%Y-%m-%d %H:%M:%S",
                                                                    time.localtime())
                                    })
        # Fewer than 10 working proxies were found; still update and synchronize.
        pipeline.call(database="ETL_Config", procedure="SP_UpdateProxy")
        pipeline.call(database="ETL_Config",
                      procedure="SP_NavigationUrl_Sync",
                      parameter={"category": "Proxy"})
    except KeyboardInterrupt as e:
        print("[{}] Someone forced the program to exit - KeyboardInterrupt at scrap_proxy Method. Error: {}"
              .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        exit()
    except Exception as e:
        print("[{}] Exception Occurs; retrying scrap_proxy Method. Error: {}"
              .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        scrap_proxy()
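# check_proxy() is called above but defined elsewhere in this module. A minimal
# sketch of such a liveness check could look like the following; this is an
# assumption about its behavior, not the project's actual implementation.
def _check_proxy_sketch(proxy, url, ip):
    # `ip` could be used to confirm the egress address seen by the target;
    # it is unused in this sketch.
    try:
        response = requests.get(url, proxies=proxy, timeout=(5, 10))
        # Treat the proxy as working when the target answers with HTTP 200.
        return response.status_code == 200
    except requests.RequestException:
        return False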
def scrap_agent():
    """Scrape user agents from 'https://developers.whatismybrowser.com/' through a high-anonymity proxy."""
    try:
        i_url = pipeline.select(database="ETL_Config",
                                column="NextPageUrl",
                                table="NavigationUrl",
                                condition={"UrlCategory": "User Agent"},
                                operator="AND")
        # The page number is the trailing integer of NextPageUrl (not just its
        # last character, so page 10 is parsed correctly).
        i = int(re.search(r'(\d+)$', i_url[0][0]).group(1))
        if i == 1:
            pipeline.truncate(database="ETL_Config", table="UserAgent")
        while i < 11:
            url = pipeline.select(database="ETL_Config",
                                  column="NextPageUrl",
                                  table="NavigationUrl",
                                  condition={"UrlCategory": "User Agent"},
                                  operator="AND")
            proxy_header = eliteproxy.get_proxy(url=url[0][0])
            req = requests.get(url[0][0],
                               headers=proxy_header[1],
                               proxies=proxy_header[0],
                               timeout=(5, 10))
            soup = BeautifulSoup(req.content, 'html5lib')
            # Each table row on the page holds 5 <td> cells; slice them by column.
            user_agents = list(map(lambda x: x.text, soup.findAll('td')[::5]))
            software = list(map(lambda x: x.text, soup.findAll('td')[1::5]))
            software_type = list(map(lambda x: x.text, soup.findAll('td')[2::5]))
            operating_system = list(map(lambda x: x.text, soup.findAll('td')[3::5]))
            popularity = list(map(lambda x: x.text, soup.findAll('td')[4::5]))
            data_dictionary = {
                'User Agent': user_agents,
                'Software': software,
                'Software Type': software_type,
                'OS': operating_system,
                'Popularity': popularity
            }
            data_frame = pd.DataFrame(data_dictionary)
            data_filter = data_frame['Popularity'] == 'Very common'
            df_agents = data_frame[data_filter]
            if len(df_agents.index) == 0:
                print("[{}] Scraper crawled [0] items from <{}>; retrying the same URL."
                      .format(time.strftime("%I:%M:%S %p", time.localtime()), url[0][0]))
                scrap_agent()
            else:
                print("[{}] Scraper crawled [{}] items from <{}> and is ready to insert them."
                      .format(time.strftime("%I:%M:%S %p", time.localtime()),
                              len(df_agents.index), url[0][0]))
                for row in range(len(df_agents.index)):
                    pipeline.insert(database="ETL_Config",
                                    table="UserAgent",
                                    values={
                                        "UserAgent": df_agents.iloc[row]['User Agent'],
                                        "Software": df_agents.iloc[row]['Software'],
                                        "SoftwareType": df_agents.iloc[row]['Software Type'],
                                        "OS": df_agents.iloc[row]['OS'],
                                        "Popularity": df_agents.iloc[row]['Popularity']
                                    })
                # Move the navigation pointer forward; after page 10 the NextPageUrl
                # wraps back to page 1 so the next run starts over.
                pipeline.update(database="ETL_Config",
                                table="NavigationUrl",
                                values={
                                    "PreviousPageUrl": (url[0][0].replace(url[0][0][-2:], str(i - 1))
                                                        if i >= 10
                                                        else url[0][0].replace(url[0][0][-1], str(i - 1))),
                                    "CurrentPageUrl": url[0][0],
                                    "NextPageUrl": (url[0][0].replace(url[0][0][-2:], str(i - 9))
                                                    if i >= 10
                                                    else url[0][0].replace(url[0][0][-1], str(i + 1)))
                                },
                                condition={"UrlCategory": "User Agent"},
                                operator="AND")
                i = i + 1
        pipeline.call(database="ETL_Config", procedure="SP_UpdateUserAgent")
        print("[{}] User agents scraped and loaded successfully."
              .format(time.strftime("%I:%M:%S %p", time.localtime())))
    except KeyboardInterrupt as e:
        print("[{}] Someone forced the program to exit - KeyboardInterrupt at scrap_agent Method. Error: {}"
              .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        exit()
    except Exception as e:
        print("[{}] Exception Occurs; retrying scrap_agent Method. Error: {}"
              .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        scrap_agent()
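# Illustrative entry point (not part of the original module): kick off a full
# user-agent refresh from the command line; assumes the pipeline database
# connection is already configured.
if __name__ == "__main__":
    scrap_agent()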