def _add_proxy_url(self):
    # Read the list of proxy-list pages, one URL per line.
    with open("proxy_url.txt", "r") as f:
        urls = [line.strip() for line in f if line.strip()]
    logger.info(f"number of proxy urls: {len(urls)}")

    for url in urls:
        logger.debug(f"update {url}")
        # Try https first, then http, then the URL exactly as written (no proxy).
        for scheme in ("https", "http", None):
            try:
                if scheme is not None and len(self.ip_q[scheme]) > 10:
                    proxies = {scheme: self._get_good_proxy(scheme, change_priority=False)}
                else:
                    proxies = None
                target = f"{scheme}://{url}" if scheme else url
                resp = requests.get(target, timeout=15, proxies=proxies)
                # Collect every ip:port pair found in the page body.
                for ip, port in re.findall(r'((?:\d{1,3}\.){3}\d{1,3}):(\d+)', resp.text):
                    for protocol in self.PROTOCOLS:
                        self.add_ip(protocol, f"{ip}:{port}")
                logger.debug(f"done url {url}")
                break
            except Exception:
                continue
        else:
            logger.info(f"bad url {url}")
def init():
    print('Starting...')
    with open('centros.json') as centers_file:
        centers = json.load(centers_file)
    with open('emails', 'a') as emails:
        for center in centers:
            label = center['label']
            # Only A Coruña centres of type IES / CPR / CPI.
            if 'A Coru' in label and ('IES ' in label or 'CPR ' in label or 'CPI ' in label):
                center_id = center['codigo']
                try:
                    r = session.get(url + center_id)
                    image_path = r.html.search('img src="{}"')[0]
                    print('Retrieving center id ' + center_id + ' email information...')
                    r = requests.get(domain + image_path)
                    with open(center_id + '.png', 'wb') as image_file:
                        image_file.write(r.content)
                    # Image recognition: the e-mail address is published as an
                    # image, so OCR it with pytesseract.
                    image = Image.open(center_id + '.png')
                    email = pytesseract.image_to_string(image, lang='eng')
                    emails.write(email)
                except Exception:
                    print('Image link not found, skipping center: ' + center_id)
    os.system('rm *.png')
    print('Finished, results at: "emails" file')
def _add_scylla(self):
    # Pull proxies from a local Scylla instance: first the https-capable ones,
    # then the http-only ones.
    try:
        for protocol, https_flag in (("https", "true"), ("http", "false")):
            proxies = requests.get(
                f"http://localhost:8899/api/v1/proxies?https={https_flag}&limit=10000",
                timeout=10).json()["proxies"]
            for p in proxies:
                self.add_ip(protocol, f'{p["ip"]}:{p["port"]}')
    except Exception:
        logger.exception("scylla failed")
def download_mp3(content: str):
    # Fetch Youdao's US (type=0) and UK (type=1) pronunciations for `content`
    # and store each one under a freshly generated token.
    usatok = mktoken()
    uktok = mktoken()

    for voice_type, token in ((0, usatok), (1, uktok)):
        url = f"http://dict.youdao.com/dictvoice?type={voice_type}&audio={content}"
        r = requests.get(url)
        with open(f"./static/music/{token}.mp3", "wb") as f:
            f.write(r.content)

    return usatok, uktok
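# A minimal usage sketch for download_mp3 above -- assuming mktoken() returns a
# unique, filename-safe string and the ./static/music/ directory already exists
# (both come from the surrounding project, not from this snippet):
if __name__ == "__main__":
    usa_token, uk_token = download_mp3("hello")
    print(f"US pronunciation saved to ./static/music/{usa_token}.mp3")
    print(f"UK pronunciation saved to ./static/music/{uk_token}.mp3")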
def _add_pubproxy(self):
    try:
        # Route the request through one of our own good https proxies when the
        # pool is large enough, otherwise go direct.
        if len(self.ip_q["https"]) > 10:
            proxies = {"https": self._get_good_proxy("https", change_priority=False)}
        else:
            proxies = None
        for protocol in self.PROTOCOLS:
            self.add_ip(
                protocol,
                requests.get(
                    "https://pubproxy.com/api/proxy?type=http&speed=15&https=true",
                    timeout=15,
                    proxies=proxies).json()["data"][0]["ipPort"])
    except Exception:
        logger.exception("pubproxy failed")
def specfic_search(word):
    # Return False if the query matches nothing; otherwise return the looked-up
    # result, its dictionary URL and the matched mode.
    try:
        re_list = [r"([a-z]|[A-Z]|\s){1,}翻译",   # mode 0: "<english text> 翻译"
                   r"([a-z]|[A-Z]|\s){1,}",       # mode 1: plain English text
                   r"(.*)的英语"]                  # mode 2: "<word>的英语"
        mode = -1
        cmpres = None
        for tmp, pattern in enumerate(re_list):
            cmpres = re.match(pattern, word)
            if cmpres is not None:
                print(cmpres)
                mode = tmp
                break
        if mode == -1:
            return False

        content = cmpres.group()
        if mode == 0:
            content = content[:len(content) - 2]   # strip trailing "翻译"
        elif mode == 2:
            content = content[:len(content) - 3]   # strip trailing "的英语"

        requests.get(
            "http://fanyi.youdao.com/translate?&doctype=json&type=AUTO&i=" + content)
        ret = get_word_mean(content, hea_ordinary)
        ret_url = ("http://dict.youdao.com/search?q=" + content
                   + "&keyfrom=new-fanyi.smartResult")
        return ret, ret_url, mode
    except Exception:
        return False
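# Hedged behaviour notes for specfic_search above; the sample queries are
# illustrative assumptions, and get_word_mean / hea_ordinary are defined
# elsewhere in the project:
#
#   "hello 翻译"   -> matches re_list[0], mode 0, content "hello "
#   "hello world"  -> matches re_list[1], mode 1, content kept as-is
#   "你好"         -> no pattern matches from the start of the string, returns False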
import time

import requests

# Hit the local save endpoint every two seconds so the crawler state is
# persisted periodically.
while True:
    time.sleep(2)
    requests.get('http://127.0.0.1:1278/save')
import demjson
import requests
from bs4 import BeautifulSoup

# hea_ordinary (the default request headers) and gethtmurl() are expected to be
# defined elsewhere in the project.


def getsearchurl(url):
    # Parse a search-result page and return every outbound result link,
    # skipping links that point back to mijisou.com itself.
    soup = BeautifulSoup(url, "html.parser")
    ret = []
    for each in soup.find_all(name="a"):
        if each.get("rel") == ["noopener", "noreferrer"]:
            if each.get('href').find('mijisou.com') == -1:
                ret.append(each.get("href"))
    return ret


def mainly():
    # Pull the baiduCDS entries straight out of CubeQL and push them into
    # cylinder's crawler queue.
    req = requests.post('/get')
    que = demjson.decode(req.text)
    while que != []:
        word = que.pop()
        urllist = []
        a = gethtmurl(requests.get('http://baidu.com/s?wd=' + word).text)
        print(a)
        requests.post('/set?url = ', que.pop())
        if que == []:
            req = requests.post('/baidu_get')
            que = demjson.decode(req.text)


if __name__ == '__main__':
    # mainly()
    word = 'Linux'
    print(gethtmurl(requests.get(
        'https://mijisou.com/?q=' + word
        + '&category_general=on&time_range=&language=zh-CN&pageno=1',
        headers=hea_ordinary).text))
    # print(gethtmurl(requests.get('https://mijisou.com/?q=' + word,
    #                              headers=hea_ordinary).text))