def create_webdriver(proxy_url=None):
    """
    Creates the webdriver for Selenium.

    @param proxy_url: Proxy URL in string format "host:port". None if no proxy is used.
    @return: the Selenium webdriver
    """
    chrome_options = webdriver.ChromeOptions()
    user_agent = user_agents.random_user_agent()
    print("Setting User Agent: %s" % user_agent)
    chrome_options.add_argument("--user-agent=" + user_agent)
    if proxy_url is not None:
        # The Proxy object below is only needed for the Firefox driver, which is
        # disabled: Firefox login was causing too many login errors, so Chrome is
        # used instead.
        # proxy = Proxy({
        #     'proxyType': ProxyType.MANUAL,
        #     'httpProxy': proxy_url,
        #     'ftpProxy': proxy_url,
        #     'sslProxy': proxy_url,
        #     'noProxy': ''  # set this value as desired
        # })
        # driver = webdriver.Firefox(
        #     firefox_profile=create_firefox_profile(firefox_profile_path=FIREFOX_PROFILE_PATH),
        #     proxy=proxy)
        print("Setting Proxy: %s" % proxy_url)
        chrome_options.add_argument('--proxy-server=%s' % proxy_url)
    # else:
    #     driver = webdriver.Firefox(
    #         firefox_profile=create_firefox_profile(firefox_profile_path=FIREFOX_PROFILE_PATH))
    driver = webdriver.Chrome(chrome_options=chrome_options)
    return driver
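# Hypothetical usage sketch for create_webdriver (the proxy address below is an
# illustrative assumption, not a real endpoint):
#
#     driver = create_webdriver()                             # direct connection
#     driver = create_webdriver(proxy_url="127.0.0.1:8080")   # through a local proxy
#     driver.get("https://example.com")
#     driver.quit()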
def html_to_md(url, param):
    # Earlier PhantomJS-based fetching, kept for reference:
    # dcap = dict(DesiredCapabilities.PHANTOMJS)
    # dcap["phantomjs.page.settings.userAgent"] = (random_user_agent())
    # driver = webdriver.PhantomJS(desired_capabilities=dcap)
    # driver.get(url)
    # driver.implicitly_wait(5)
    # html = driver.page_source
    # driver.close()
    # driver.quit()
    headers = {'User-Agent': random_user_agent()}
    session = HTMLSession()
    r = session.get(url, headers=headers)
    # Use the detected encoding rather than the declared one to avoid mojibake.
    r.encoding = r.apparent_encoding
    html = r.text
    md = html2text.html2text(html)
    return md
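# Hypothetical usage sketch for html_to_md; the function body does not use
# "param", so any placeholder value works:
#
#     md = html_to_md("https://example.com", None)
#     print(md[:200])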
def sm1234_search(key, pn):
    kv = {'q': key, 'p': pn}
    headers = {'User-Agent': random_user_agent()}
    # Example request URL: http://sm.sm1234.net/?q=python3&p=2
    r = requests.get("http://sm.sm1234.net/", params=kv, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    for item in soup.find_all('div', attrs={"class": "g"}):
        result = {}
        result['title'] = item.h2.get_text()
        # Result links are relative, so prefix the site root.
        result['url'] = "http://sm.sm1234.net" + item.h2.a['href']
        result['text'] = item.find("div", attrs={"class": "std"}).get_text()
        yield result
def gg_search(key, pn):
    kv = {'q': key, 'start': pn}
    headers = {'User-Agent': random_user_agent()}
    r = requests.get("https://www.google.com/search", params=kv, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    for item in soup.find_all('div', attrs={"class": "g"}):
        a = item.find('a')
        result = {}
        result['title'] = a.text.strip()
        result['url'] = a["href"]
        stext = item.find("span", attrs={"class": "st"})
        if stext:
            result['text'] = stext.text
        yield result
def bd_search(key, pn):
    print("bd_search start...")
    kv = {'wd': key, 'pn': pn}
    headers = {'User-Agent': random_user_agent()}
    r = requests.get("http://www.baidu.com/s", params=kv, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    now = int(pn)
    for item in soup.find_all('div', attrs={"class": "c-container"}):
        # Result containers carry sequential numeric ids starting at pn + 1.
        now += 1
        if item.has_attr('id') and item['id'] == str(now):
            result = {}
            # Collect the abstract/row divs into the snippet text.
            ss = ''
            for div in item.find_all('div'):
                if div.has_attr('class') and (
                        div['class'][0].find('abstract') != -1
                        or div['class'][0] == 'c-row'):
                    ss += div.get_text()
            result['text'] = ss
            if item.h3.a:
                # Baidu hrefs are redirect URLs; resolving each with an extra
                # requests.get is possible but slow, so the raw href is kept.
                result['url'] = item.h3.a['href']
                result['title'] = item.h3.get_text()
                yield result
            else:
                print("item.h3.a is None ***")
                print(item.h3)
def get_urls(baseurl):
    headers = {'User-Agent': random_user_agent()}
    session = HTMLSession()
    r = session.get(baseurl, headers=headers)
    all_urls = set(r.html.absolute_links)
    return "\n".join(all_urls)
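# Hypothetical usage sketch for get_urls:
#
#     print(get_urls("https://example.com"))  # one absolute link per line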
def sina_news():
    print("sina news")
    url = 'https://news.sina.com.cn/'
    headers = {'User-Agent': random_user_agent()}
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')
    # Top headlines, then the four <ul class="list_14" data-sudaclick="blk_news_N">
    # blocks; the per-block bodies were identical, so they are folded into a loop.
    selectors = ['.ct_t_01 p a'] + [
        '[data-sudaclick="blk_news_%d"] li a' % i for i in range(1, 5)
    ]
    for selector in selectors:
        divs = soup.select(selector)
        print(len(divs))
        for a in divs:
            # Build a fresh dict per item; reusing one dict across yields would
            # hand every consumer the same mutated object.
            yield {
                'title': a.text.strip(),
                'url': a['href'],
                'name': "-sina",
            }
def ddk_search(key, pn):
    pn = int(pn) * 3
    if pn > 10:
        kv = {'q': key, 's': pn, 'dc': pn, 'v': 'l', 'o': 'json', 'api': '/d.js'}
    else:
        kv = {'q': key, 's': pn, 'dc': pn}
    headers = {'User-Agent': random_user_agent()}
    r = requests.post("https://www.duckduckgo.com/html", params=kv, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # Locally the result container is
    # <div class="result results_links results_links_deep web-result">,
    # but on the server the class string below is what matches.
    for item in soup.find_all(
            'div', attrs={"class": "links_main links_deep result__body"}):
        result = {}
        result['url'] = item.h2.a['href']
        result['title'] = item.h2.get_text() + " " + result['url']
        gettext = item.find("a", attrs={"class": "result__snippet"})
        result['text'] = gettext.get_text() if gettext else ""
        yield result
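# Hypothetical usage sketch for the search generators above (sm1234_search,
# gg_search, bd_search, and ddk_search share the same (key, pn) interface and
# yield dicts with 'title', 'url', and usually 'text'):
#
#     for hit in ddk_search("python3", 0):
#         print(hit['title'], hit['url'])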
def sohu_news():
    print("sohu news")
    url = 'http://www.sohu.com'
    headers = {'User-Agent': random_user_agent()}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # Headline blocks: <div class="news" data-spm="top-news1"> through top-news4.
    # top-news1 wraps links in <p>, the rest in <li>; the per-block bodies were
    # identical, so they are folded into a loop.
    selectors = ['[data-spm="top-news1"] p a'] + [
        '[data-spm="top-news%d"] li a' % i for i in range(2, 5)
    ]
    for selector in selectors:
        divs = soup.select(selector)
        print(len(divs))
        for a in divs:
            # Fresh dict per item (see sina_news).
            yield {
                'title': a.text.strip(),
                'url': a['href'],
                'name': "-sohu",
            }
def sohu_news_s():
    # Note: the URL below points at sina while the selector targets sohu's
    # markup; this appears to be copied from sina_news and is kept as-is.
    url = 'https://news.sina.com.cn/'
    headers = {'User-Agent': random_user_agent()}
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')
    divs = soup.select('[data-spm="top-news2"] li a')
    for a in divs:
        # The original mutated an undefined 'result' dict (a NameError);
        # build a fresh dict per item instead.
        yield {
            'title': a.text.strip(),
            'url': a['href'],
            'name': "-sohu",
        }
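# Hypothetical usage sketch for the news generators (sina_news, sohu_news, and
# sohu_news_s all yield dicts with 'title', 'url', and a source 'name'):
#
#     for item in sina_news():
#         print(item['title'] + item['name'], item['url'])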
def youdao_fanyi(type, q, dst):
    print("youdao_fanyi")
    headers = {'User-Agent': random_user_agent()}
    url = ('http://fanyi.youdao.com/translate'
           '?smartresult=dict&smartresult=rule&sessionFrom=https://www.baidu.com/link')
    # Earlier versions branched on dst ("cn" / "en" / "fra") with per-language
    # 'from'/'to' values; AUTO/AUTO lets the service detect both sides, so the
    # 'type' and 'dst' arguments are currently unused.
    data = {'from': 'AUTO', 'to': 'AUTO', 'smartresult': 'dict',
            'client': 'fanyideskweb', 'salt': '1500092479607',
            'sign': 'c98235a85b213d482b8e65f6b1065e26', 'doctype': 'json',
            'version': '2.1', 'keyfrom': 'fanyi.web',
            'action': 'FY_BY_CL1CKBUTTON', 'typoResult': 'true', 'i': q}
    r = requests.get(url, params=data, headers=headers)
    ta = r.json()
    result = ""
    for ii in ta['translateResult']:
        for i in ii:
            result += i['tgt']
        result += "\n"
    return result
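# Hypothetical usage sketch for youdao_fanyi (the active code path ignores the
# 'type' and 'dst' arguments and lets the service auto-detect both languages;
# the fixed salt/sign values may be rejected by the live endpoint):
#
#     print(youdao_fanyi('en', 'hello world', 'cn'))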
def gg_fanyi(type, q, dst):
    print("gg_fanyi")
    # TL/RL reimplement Google's client-side token (tk) algorithm.
    ctx = execjs.compile("""
    function TL(a) {
        var k = "";
        var b = 406644;
        var b1 = 3293161072;
        var jd = ".";
        var $b = "+-a^+6";
        var Zb = "+-3^+b+-f";
        for (var e = [], f = 0, g = 0; g < a.length; g++) {
            var m = a.charCodeAt(g);
            128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023), e[f++] = m >> 18 | 240, e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224, e[f++] = m >> 6 & 63 | 128), e[f++] = m & 63 | 128)
        }
        a = b;
        for (f = 0; f < e.length; f++) a += e[f], a = RL(a, $b);
        a = RL(a, Zb);
        a ^= b1 || 0;
        0 > a && (a = (a & 2147483647) + 2147483648);
        a %= 1E6;
        return a.toString() + jd + (a ^ b)
    };
    function RL(a, b) {
        var t = "a";
        var Yb = "+";
        for (var c = 0; c < b.length - 2; c += 3) {
            var d = b.charAt(c + 2), d = d >= t ? d.charCodeAt(0) - 87 : Number(d), d = b.charAt(c + 1) == Yb ? a >>> d : a << d;
            a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
        }
        return a
    }
    """)
    tk = ctx.call("TL", q)
    headers = {'User-Agent': random_user_agent()}
    type = detect(q[:30])  # e.g. "en" | "zh-cn" | "zh-tw"
    print("detect(q) = " + type)
    # The original branched over every (type, dst) pair, but the request URL
    # only varies in the source (sl) and target (tl) languages, so the branches
    # are folded into two lookups. Unknown source languages fall back to "en";
    # an unrecognized dst falls back to "en" for Chinese sources and "zh-cn"
    # otherwise, matching the original branch structure.
    if type not in ('en', 'zh-cn', 'zh-tw'):
        type = 'en'
    tl_map = {'cn': 'zh-cn', 'en': 'en', 'fra': 'fr'}
    default_tl = 'en' if type in ('zh-cn', 'zh-tw') else 'zh-cn'
    tl = tl_map.get(dst, default_tl)
    url = ("http://translate.google.cn/translate_a/single?client=t"
           "&sl=%s&tl=%s&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca"
           "&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1"
           "&srcrom=0&ssel=0&tsel=0&kc=2&tk=%s&q=%s" % (type, tl, tk, q))
    r = requests.get(url, headers=headers)
    data = r.json()
    result = ''
    for dt in data[0]:
        if dt[0]:
            result += dt[0]
    return result