def testGetBlogs(self):
    """Crawl Google blog-search result links for the Society/Law niche.

    Reads previously-saved keyword suggestions from the niche's log file,
    merges them with a hard-coded seed list, then drives
    testGetLinksBatch_single_t once per country code.
    """
    niche = "Society/Law"
    proxy_site = BuyProxyOrg(buy_proxy_org_account)
    proxies = proxy_site.get_proxies(timeout=5)
    keyword_log_path = ("/Users/superCat/Desktop/PycharmProjectPortable/Seeds/KeywordSuggestions/"
                        + niche.replace('/', '-') + ".txt")
    # countries = GoogleUtility.CountryCodeEnglish
    countries = ["uk", ]
    min_delay = 2
    max_delay = 5
    max_page = 2
    days_ago = 4 * 365  # look back roughly four years
    target_keywords_init = [
        "legal case", "Labour law", "human rights law", "crime law",
        "Immigration law", "Family law", "Transactional law", "Company law",
        "Commercial law", "Admiralty law", "Intellectual property law",
        "international law", "tax law", "banking law", "competition law",
        "consumer law", "environmental law",
    ]
    suggested_keywords = []
    for country in countries:
        # temp_keywords = self.testGetSuggestionBatch(target_keywords_init, proxies=proxies,
        #                                             country_code=country,
        #                                             min_delay=min_delay, max_delay=max_delay)
        temp_keywords = list(set(FileHandler.read_lines_from_file(keyword_log_path)))
        # FileHandler.append_lines_to_file(keyword_log_path, temp_keywords, option="at")
        # suggested_keywords += temp_keywords
        crawl_keywords = list(set(target_keywords_init + temp_keywords))
        self.testGetLinksBatch_single_t(niche, keywords=crawl_keywords, page_count=max_page,
                                        index=0, length=100, country_code=country,
                                        source_type=GoogleConst.SourceTypeBlog,
                                        min_delay=min_delay, max_delay=max_delay,
                                        days_ago=days_ago, proxies=proxies,
                                        use_browser=False)
def testOllipldsfapenChrome(self):
    """Open one Chrome window per fetched proxy, each with proxy + UA flags set.

    NOTE(review): a byte-near-identical method with this exact name appears
    again later in this file; inside a class body the later definition
    shadows this one — one of the two should be removed.

    todo: http://stackoverflow.com/questions/29983106/how-can-i-set-proxy-with-authentication-in-selenium-chrome-web-driver-using-pyth
    :return:
    """
    # request_url = "https://www.google.com/search?q=crimial%20law&num=100&start=0&site=webhp&tbm=blg&source=lnt&as_qdr=y5"
    request_url = "https://www.google.com/search?q=bbs&num=100&start=0&gl=us&gws_rd=cr&as_qdr=d10"
    # request_url = "https://www.whatismyip.com/"
    # request_url = "http://whatsmyuseragent.com/"
    proxy = BuyProxyOrg(buy_proxy_org_account)
    proxy_list = proxy.get_proxies(5)
    browsers = []
    for proxy_item in proxy_list:
        options = webdriver.ChromeOptions()
        # note: --proxy-server cannot carry credentials, hence str_no_auth()
        options.add_argument('--proxy-server=http://{0:s}'.format(proxy_item.str_no_auth()))
        options.add_argument('--user-agent={0:s}'.format(WebRequestCommonHeader.webpage_agent))
        browser = webdriver.Chrome(chrome_options=options)
        browser.get(request_url)
        browsers.append(browser)
    time.sleep(60)  # keep all windows open long enough for manual inspection
    for browser in browsers:
        browser.close()
def testOllipldsfapenChrome(self):
    """Launch a proxied Chrome instance per proxy and hold them open a minute.

    NOTE(review): this duplicates an earlier method of the same name in this
    file; this later definition is the one that wins in a class body —
    consolidate the two.

    todo: http://stackoverflow.com/questions/29983106/how-can-i-set-proxy-with-authentication-in-selenium-chrome-web-driver-using-pyth
    :return:
    """
    # request_url = "https://www.google.com/search?q=crimial%20law&num=100&start=0&site=webhp&tbm=blg&source=lnt&as_qdr=y5"
    request_url = "https://www.google.com/search?q=bbs&num=100&start=0&gl=us&gws_rd=cr&as_qdr=d10"
    # request_url = "https://www.whatismyip.com/"
    # request_url = "http://whatsmyuseragent.com/"
    proxies = BuyProxyOrg(buy_proxy_org_account).get_proxies(5)
    open_drivers = list()
    for entry in proxies:
        proxy_addr = entry.str_no_auth()
        agent = WebRequestCommonHeader.webpage_agent
        opts = webdriver.ChromeOptions()
        opts.add_argument('--proxy-server=http://{0:s}'.format(proxy_addr))
        opts.add_argument('--user-agent={0:s}'.format(agent))
        driver = webdriver.Chrome(chrome_options=opts)
        driver.get(request_url)
        open_drivers.append(driver)
    # leave every window up for manual inspection before tearing down
    time.sleep(60)
    for driver in open_drivers:
        driver.close()
def testProxyGet(self):
    """Fetch the proxy list and print each entry with a short pause between."""
    proxies = BuyProxyOrg(buy_proxy_org_account).get_proxies(5)
    for proxy_item in proxies:
        print("try proxy:", proxy_item)
        # sites = self.testGetLlinks(proxy=item)
        # for site in sites:
        #     print(site)
        time.sleep(1)
def testBingResult(self):
    """Run a Bing search for 'law blog' through the first proxy and print hits.

    :return: the list of sites BingCom.get_sites yields.
    """
    keyword = "law blog"
    proxies = BuyProxyOrg(buy_proxy_org_account).get_proxies(timeout=5)
    sites = BingCom.get_sites(keyword, page_number=1, index=0, length=100,
                              filter_list=filter_list, country_code="us",
                              source_type="", days_ago=10,
                              return_domain_home_only=False,
                              proxy=proxies[0], timeout=30)
    for site in sites:
        print(site)
    return sites
def testProxyGetOpen(self):
    """Open a Chrome window per proxy against a Google blog-search URL."""
    request_url = ("https://www.google.com/search?q=crimial%20law&num=100&start=0"
                   "&site=webhp&tbm=blg&source=lnt&as_qdr=y5")
    # request_url = "https://www.whatismyip.com/"
    proxy_list = BuyProxyOrg(buy_proxy_org_account).get_proxies(5)
    for proxy_item in proxy_list:
        proxy_addr = proxy_item.str_no_auth()
        print("try proxy:", str(proxy_item))
        opts = webdriver.ChromeOptions()
        opts.add_argument('--proxy-server=http://%s' % proxy_addr)
        browser = webdriver.Chrome(chrome_options=opts)
        # driver = WebDriver.get_chrome(additional_options=chrome_options)
        browser.get(request_url)
        # sites = self.testGetLlinks(proxy=item)
        # for site in sites:
        #     print(site)
        time.sleep(5)
def testGetkeywordsRecursive(self, niche="Society/Law", level=1, keyword_init=None,
                             proxies=None, country_code="us", min_delay=2, max_delay=5,
                             offset=120):
    """Recursively expand a seed keyword list via suggestion batches, then
    rewrite the niche's keyword log file with the full pool.

    :param niche: niche name; '/' is mapped to '-' to build the log filename.
    :param level: number of suggestion-expansion rounds to run.
    :param keyword_init: optional seed keywords; when None/empty, seeds are
        loaded from the log file starting at ``offset``. The caller's list is
        never mutated.
    :param proxies: optional pre-fetched proxy list; fetched on demand if None.
    :param country_code: Google country code passed to the suggestion batch.
    :param min_delay: minimum delay between suggestion requests (seconds).
    :param max_delay: maximum delay between suggestion requests (seconds).
    :param offset: number of leading log-file keywords to skip when seeding.
    """
    keyword_log_path = ("/Users/superCat/Desktop/PycharmProjectPortable/Seeds/KeywordSuggestions/"
                        + niche.replace('/', '-') + ".txt")

    def save_callback(keywords: list):
        # Persist each batch as soon as it arrives so progress survives a crash.
        FileHandler.append_lines_to_file(keyword_log_path, keywords, option="at")

    # BUG FIX: the original signature used a mutable default (keyword_init=[])
    # and then aliased it into keywords_pool, so `keywords_pool += ...` mutated
    # the caller's argument (or the shared default) in place. Use the None
    # sentinel and defensive copies instead; behavior is otherwise unchanged.
    if keyword_init is None or len(keyword_init) == 0:
        keyword_init = list(set(FileHandler.read_lines_from_file(keyword_log_path)))[offset:]
    else:
        keyword_init = list(keyword_init)
    for item in keyword_init:
        print(item)
    print("total keywords:", len(keyword_init))
    if proxies is None:
        proxy_site = BuyProxyOrg(buy_proxy_org_account)
        proxies = proxy_site.get_proxies(timeout=5)
    current_level = 0
    keywords_pool = list(keyword_init)
    while current_level < level:
        keyword_init = self.testGetSuggestionBatch(keyword_init, proxies=proxies,
                                                   country_code=country_code,
                                                   min_delay=min_delay, max_delay=max_delay,
                                                   callback=save_callback)
        keywords_pool += keyword_init
        current_level += 1
    # Replace the log file with the fully expanded, batch-appended pool.
    FileHandler.remove_file_if_exist(keyword_log_path)
    FileHandler.append_lines_to_file(keyword_log_path, keywords_pool, option="t")