Example 1
import random
import time

import requests

# `pipeline` (database helper) and `temp_agent` (fallback User-Agent
# generator) are project-local names defined elsewhere in this project.

def get_agent():
	headers = requests.utils.default_headers()
	try:
		ua_count = pipeline.select(database="ETL_Config", table="UserAgent", column="COUNT(*)")
		if int(ua_count[0][0]) <= 10:
			ua_count_bkp = pipeline.select(database="ETL_Config", table="UserAgent_BKP", column="COUNT(*)")
			if int(ua_count_bkp[0][0]) <= 10:
				print(
					"[{}] Advise scraping and syncing UserAgent: only [{}] items exist in table [UserAgent_BKP] of database [ETL_Config]."
						.format(time.strftime("%I:%M:%S %p", time.localtime()), ua_count_bkp[0][0]))
				user_agent = temp_agent()
				headers.update({"User-Agent": user_agent})
				return headers
			else:
				ua = random.randrange(1, int(ua_count_bkp[0][0]))
				user_agent = pipeline.select(database="ETL_Config", table="UserAgent_BKP", column="UserAgent",
				                             condition={"UserAgentID": ua}
				                             )
				print("[{}] Backup UserAgent is Choosed. Which mean UserAgent is not available in Database [{}]."
				      .format(time.strftime("%I:%M:%S %p", time.localtime()), ua))
				headers.update({"User-Agent": user_agent[0][0]})
				return headers
		else:
			ua = random.randrange(1, int(ua_count[0][0]))
			user_agent = pipeline.select(database="ETL_Config", table="UserAgent", column="UserAgent",
			                             condition={"UserAgentID": ua}
			                             )
			headers.update({"User-Agent": user_agent[0][0]})
			return headers
	except requests.Timeout as e:
		print("[{}] Exception Occurs - requests.Timeout at get_agent Method. Error: {}"
		      .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
		# Fall back to a temporary User-Agent instead of returning None.
		headers.update({"User-Agent": temp_agent()})
		return headers
	except requests.ConnectionError as e:
		print("[{}] Exception Occurs - requests.ConnectionError at get_agent Method. Error: {}"
		      .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
		headers.update({"User-Agent": temp_agent()})
		return headers
	except requests.RequestException as e:
		print("[{}] Exception Occurs - requests.RequestException (general) at get_agent Method. Error: {}"
		      .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
		headers.update({"User-Agent": temp_agent()})
		return headers
	except KeyboardInterrupt as e:
		print("[{}] Someone Forced Program to EXIT - KeyboardInterrupt at get_agent Method. Error: {}"
		      .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
		exit()
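
A minimal usage sketch of the function above, assuming it lives in a `useragent` module (as the later examples import it) alongside the project's `pipeline` helper:

import requests

import useragent  # project-local module containing get_agent() above

# Fetch a page with a randomized User-Agent header.
headers = useragent.get_agent()
response = requests.get("https://example.com/", headers=headers, timeout=(5, 10))
print(response.status_code, "using", headers.get("User-Agent"))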
Example 2
import time

import requests

# `pipeline` and `get_proxy` are project-local helpers.

def validate_proxy(database, table, url, i_d, header):
    """ Validate the working status of a proxy for the given domain. """
    try:
        ip_port = pipeline.select(database=database,
                                  table=table,
                                  column="IP, Port",
                                  condition={"ProxyID": i_d})
        proxies = "http://{}:{}".format(ip_port[0][0], ip_port[0][1])
        proxy = {
            'http': proxies,
            'https': proxies,
        }
        req = requests.get(url, proxies=proxy, headers=header, timeout=(5, 10))
        status = req.status_code
        if status == 200:
            print("[{}] Right Proxy and User Agent choosed for scrapping {}.".
                  format(time.strftime("%I:%M:%S %p", time.localtime()),
                         proxy))
            return proxy
        else:
            print("[{}] Bad response from {}.".format(
                time.strftime("%I:%M:%S %p", time.localtime()), proxy))
            pipeline.delete(database=database,
                            table=table,
                            condition={"ProxyID": i_d})
            # Retry with a fresh proxy; get_proxy() returns a (proxy, headers) pair.
            return get_proxy(url=url)[0]
    except requests.Timeout as e:
        print(
            "[{}] Exception Occurs - requests.Timeout at validate_proxy Method. Error: {}"
            .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        pipeline.delete(database="ETL_Config",
                        table="EliteProxy",
                        condition={"ProxyID": i_d})
        return get_proxy(url=url)[0]
    except requests.ConnectionError as e:
        print(
            "[{}] Exception Occurs - request.ConnectionError at validate_proxy Method. Error: {}"
            .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        pipeline.delete(database="ETL_Config",
                        table="EliteProxy",
                        condition={"ProxyID": i_d})
        return get_proxy(url=url)[0]
    except requests.RequestException as e:
        print(
            "[{}] Exception Occurs - request.RequestException(GeneralException) at validate_proxy Method. Error: {}"
            .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        pipeline.delete(database="ETL_Config",
                        table="EliteProxy",
                        condition={"ProxyID": i_d})
        return get_proxy(url=url)[0]
    except KeyboardInterrupt as e:
        print(
            "[{}] Someone Forced Program to EXIT - KeyboardInterrupt at validate_proxy Method. Error: {}"
            .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        exit()
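
A hedged call sketch for the validator; the ProxyID value and target URL here are illustrative placeholders, not values from the project:

import useragent  # assumed project-local module providing get_agent()

# Hypothetical direct call; assumes ETL_Config.EliteProxy holds a row with ProxyID 7.
proxy = validate_proxy(database="ETL_Config",
                       table="EliteProxy",
                       url="https://example.com",
                       i_d=7,
                       header=useragent.get_agent())
if proxy:
    print("Working proxy mapping:", proxy)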
Example 3
import random

# `pipeline`, `useragent`, `scrap_proxy`, and `validate_proxy` are project-local helpers.

def get_proxy(url):
    """ Get a suitable proxy from the database; restricted proxies are deleted.
        Scrapes fresh proxies when too few remain in the ETL_Config database. """
    c_url = url.split('/')[0] + "//" + url.split('/')[2]
    database = "ETL_Config"
    table = "EliteProxy"
    proxy_id = pipeline.select(database="ETL_Config",
                               table="EliteProxy",
                               column="ProxyID")
    i_d = ""
    if not proxy_id or int(len(proxy_id)) <= 3:
        scrap_proxy()
        get_proxy(url=c_url)
    else:
        i_d = proxy_id[random.randrange(int(len(proxy_id)))][0]
    headers = useragent.get_agent()
    proxy = validate_proxy(database=database,
                           table=table,
                           url=c_url,
                           i_d=i_d,
                           header=headers)
    return proxy, headers
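
Typical call site, sketched under the assumption that the NavigationUrl table is already seeded; the target URL is a placeholder:

import requests

# get_proxy() returns a (proxy, headers) pair ready for requests.get().
proxy, headers = get_proxy(url="https://developers.whatismybrowser.com/useragents/explore/")
response = requests.get("https://developers.whatismybrowser.com/",
                        proxies=proxy, headers=headers, timeout=(5, 10))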
Example 4
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# `pipeline`, `useragent`, and `check_proxy` are project-local helpers.

def scrap_proxy():
    """ Scrape proxies from the configured website and keep only high-anonymity ('elite') proxies. """
    checker = [
        'https://www.google.com/', 'https://in.yahoo.com/',
        'https://www.bing.com/', 'https://duckduckgo.com/',
        'https://www.dogpile.com/', 'https://scholar.google.com/'
    ]
    try:
        pipeline.truncate(database="ETL_Config", table="EliteProxy")
        url = pipeline.select(database="ETL_Config",
                              table="NavigationUrl",
                              column="NextPageUrl",
                              condition={"UrlCategory": "Proxy"},
                              operator="AND")
        req = requests.get(url[0][0],
                           headers=useragent.get_agent(),
                           timeout=(5, 10))
        soup = BeautifulSoup(req.text, 'html5lib')
        cells = soup.findAll('td')
        ip = [cell.text for cell in cells[::8]]
        port = [cell.text for cell in cells[1::8]]
        anonymity = [cell.text for cell in cells[4::8]]
        data_dictionary = {'IP': ip, 'PORT': port, 'ANONYMITY': anonymity}
        data_frame = pd.DataFrame(data_dictionary)
        data_filter = data_frame['ANONYMITY'] == 'elite proxy'
        elite_data = data_frame[data_filter]
        print("[{}] [{}] items scraped from <{}> successfully.".format(
            time.strftime("%I:%M:%S %p", time.localtime()),
            len(elite_data.index), url[0][0]))
        process = []
        for i in range(len(elite_data.index)):
            ip = elite_data.iloc[i]['IP']
            port = elite_data.iloc[i]['PORT']
            proxies = "http://{}:{}".format(ip, port)
            proxy = {
                'http': proxies,
                'https': proxies,
            }
            print("[{}] Evaluating Proxy <{}> that scraped from [{}]".format(
                time.strftime("%I:%M:%S %p", time.localtime()), proxies,
                url[0][0]))
            result = check_proxy(proxy=proxy,
                                 url=random.choice(checker),
                                 ip=ip)
            if result is True:
                p_count = pipeline.select(database="ETL_Config",
                                          table="EliteProxy",
                                          column="COUNT(*)")
                if int(p_count[0][0]) >= 10:
                    pipeline.call(database="ETL_Config",
                                  procedure="SP_UpdateProxy")
                    db_result = pipeline.call(
                        database="ETL_Config",
                        procedure="SP_NavigationUrl_Sync",
                        parameter={"category": "Proxy"})
                    if db_result is True:
                        print(
                            "[{}] Elite Proxy Scraper successfully completed and Synchronized."
                            .format(
                                time.strftime("%I:%M:%S %p",
                                              time.localtime())))
                        return True
                    else:
                        print(
                            "[{}] Elite Proxy Scraper successfully completed and 'NOT' Synchronized."
                            .format(
                                time.strftime("%I:%M:%S %p",
                                              time.localtime())))
                        return False
                else:
                    pipeline.insert(database="ETL_Config",
                                    table="EliteProxy",
                                    values={
                                        "IP": ip,
                                        "Port": port,
                                        "Anonymity": "High",
                                        "IsAlive": "Y",
                                        "LastUpdate": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                                    })
        pipeline.call(database="ETL_Config", procedure="SP_UpdateProxy")
        pipeline.call(database="ETL_Config",
                      procedure="SP_NavigationUrl_Sync",
                      parameter={"category": "Proxy"})
    except Exception as e:
        print(
            "[{}] Exception Occurs - retrying scrap_proxy Method. Error: {}".
            format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        scrap_proxy()
    except KeyboardInterrupt as e:
        print(
            "[{}] Someone Forced Program to EXIT - KeyboardInterrupt at Scrap_Proxy Method. Error: {}"
            .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
        exit()
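
A minimal invocation sketch; it assumes the ETL_Config tables and stored procedures referenced above already exist:

# Refresh the EliteProxy pool, then draw a proxy/header pair for a target page.
# scrap_proxy() returns True once the pool is filled and synchronized.
if scrap_proxy():
    proxy, headers = get_proxy(url="https://www.google.com/")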
Example 5
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# `pipeline` and `eliteproxy` are project-local modules.

def scrap_agent():
	""" Scrape user agents from 'https://developers.whatismybrowser.com/' using a high-anonymity proxy. """
	try:
		i_url = pipeline.select(database="ETL_Config", column="NextPageUrl", table="NavigationUrl",
		                        condition={"UrlCategory": "User Agent"}, operator="AND"
		                        )
		# Parse the trailing page number so multi-digit pages (e.g. 10) work too.
		i = int(re.search(r"(\d+)$", i_url[0][0]).group(1))
		if i == 1:
			pipeline.truncate(database="ETL_Config", table="UserAgent")
		while i < 11:

			url = pipeline.select(database="ETL_Config", column="NextPageUrl", table="NavigationUrl",
			                      condition={"UrlCategory": "User Agent"}, operator="AND"
			                      )
			proxy, headers = eliteproxy.get_proxy(url=url[0][0])
			req = requests.get(url[0][0], headers=headers, proxies=proxy, timeout=(5, 10))
			soup = BeautifulSoup(req.content, 'html5lib')
			cells = soup.findAll('td')
			user_agents = [cell.text for cell in cells[::5]]
			software = [cell.text for cell in cells[1::5]]
			software_type = [cell.text for cell in cells[2::5]]
			os_name = [cell.text for cell in cells[3::5]]  # avoid shadowing the os module
			popularity = [cell.text for cell in cells[4::5]]
			data_dictionary = {
				'User Agent': user_agents,
				'Software': software,
				'Software Type': software_type,
				'OS': os_name,
				'Popularity': popularity
			}
			data_frame = pd.DataFrame(data_dictionary)
			data_filter = data_frame['Popularity'] == 'Very common'
			df_agents = data_frame[data_filter]
			if len(df_agents.index) == 0:
				print("[{}] Scraper crawled [{}] items from <{}>; retrying the same URL."
				      .format(time.strftime("%I:%M:%S %p", time.localtime()), len(df_agents.index), url[0][0]))
				# Restart rather than fall through with an empty page.
				return scrap_agent()
			else:
				print("[{}] Scraper crawled [{}] items from <{}> and ready to insert scraped items."
				      .format(time.strftime("%I:%M:%S %p", time.localtime()), len(df_agents.index), url[0][0]))
				for row in range(len(df_agents.index)):
					pipeline.insert(database="ETL_Config", table="UserAgent",
					                values={"UserAgent": df_agents.iloc[row]['User Agent'],
					                        "Software": df_agents.iloc[row]['Software'],
					                        "SoftwareType": df_agents.iloc[row]['Software Type'],
					                        "OS": df_agents.iloc[row]['OS'],
					                        "Popularity": df_agents.iloc[row]['Popularity']
					                        }
					                )
				prev_page = (url[0][0].replace(url[0][0][-2:], str(i - 1)) if i >= 10
				             else url[0][0].replace(url[0][0][-1], str(i - 1)))
				next_page = (url[0][0].replace(url[0][0][-2:], str(i - 9)) if i >= 10
				             else url[0][0].replace(url[0][0][-1], str(i + 1)))
				pipeline.update(database="ETL_Config", table="NavigationUrl",
				                values={
					                "PreviousPageUrl": prev_page,
					                "CurrentPageUrl": url[0][0],
					                "NextPageUrl": next_page
				                },
				                condition={"UrlCategory": "User Agent"},
				                operator="AND"
				                )
			i = i + 1
		pipeline.call(database="ETL_Config", procedure="SP_UpdateUserAgent")
		print("[{}] User Agent Scraped and loaded Successfully..."
		      .format(time.strftime("%I:%M:%S %p", time.localtime())))
	except Exception as e:
		print("[{}] Exception Occurs - retrying scrap_agent Method. Error: {}"
		      .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
		scrap_agent()
	except KeyboardInterrupt as e:
		print("[{}] Someone Forced Program to EXIT - KeyboardInterrupt at scrap_agent Method. Error: {}"
		      .format(time.strftime("%I:%M:%S %p", time.localtime()), e))
		exit()
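
A sketch of how the scrapers chain together end to end; scheduling and error policy are left to the caller:

# Refresh the UserAgent table, then hand out randomized headers on demand.
scrap_agent()
headers = get_agent()
print(headers["User-Agent"])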