def _add_proxy_url(self):
    # Harvest ip:port pairs from every proxy-list page named in proxy_url.txt.
    with open("proxy_url.txt", "r") as f:
        urls = [line.strip() for line in f if line.strip()]
    logger.info(f"number of proxy urls: {len(urls)}")

    ip_port_re = re.compile(r'((?:\d{1,3}\.){3}\d{1,3}):(\d+)')

    for url in urls:
        logger.debug(f"update {url}")
        # Try HTTPS through one of our own good proxies, then HTTP the same way,
        # then a plain request to the URL as written (no proxy).
        for scheme in ("https", "http", None):
            try:
                if scheme is None:
                    resp = requests.get(url, timeout=15)
                else:
                    if len(self.ip_q[scheme]) > 10:
                        proxies = {
                            scheme: self._get_good_proxy(scheme, change_priority=False)
                        }
                    else:
                        proxies = None
                    resp = requests.get(f"{scheme}://{url}", timeout=15, proxies=proxies)
                for protocol in self.PROTOCOLS:
                    for ip, port in ip_port_re.findall(resp.text):
                        self.add_ip(protocol, f"{ip}:{port}")
                logger.debug(f"done url {url}")
                break
            except Exception:
                continue
        else:
            logger.info(f"bad url {url}")
Example #2
def init():

    print('Starting...')
    with open('centros.json') as f:
        centers = json.load(f)
    with open('emails', 'a') as emails:
        for center in centers:
            label = center['label']
            # Keep only A Coruña centers whose label marks them as IES / CPR / CPI schools.
            if 'A Coru' in label and ('IES ' in label or 'CPR ' in label or 'CPI ' in label):
                id = center['codigo']
                try:
                    r = session.get(url + id)
                    image_path = r.html.search('img src="{}"')[0]
                    print('Retrieving center id ' + id + ' email information...')
                    r = requests.get(domain + image_path)
                    with open(id + '.png', 'wb') as img:
                        img.write(r.content)
                    # Image recognition: the email address is only published as an image.
                    image = Image.open(id + '.png')
                    email = pytesseract.image_to_string(image, lang='eng')
                    emails.write(email)
                except Exception:
                    print('Image link not found, skipping center: ' + id)

    os.system('rm *.png')
    print('Finished, results at: "emails" file')
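init() relies on a handful of module-level names that are not shown here: a requests_html session, the listing url, the site domain, plus json, requests, os, PIL.Image and pytesseract. A rough sketch of that setup; the concrete URLs below are placeholders, not the original values:

import json
import os

import requests
import pytesseract
from PIL import Image
from requests_html import HTMLSession

session = HTMLSession()
domain = "https://www.example-education-portal.gal"   # placeholder, not the original site
url = domain + "/centros/detalle/"                    # placeholder path; the center id is appended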
def _add_scylla(self):
    # Pull proxies from a local Scylla instance listening on localhost:8899.
    try:
        # HTTPS-capable proxies.
        all_p = requests.get(
            "http://localhost:8899/api/v1/proxies?https=true&limit=10000",
            timeout=10).json()["proxies"]
        for p in all_p:
            self.add_ip("https", f'{p["ip"]}:{p["port"]}')
        # Plain HTTP proxies.
        all_p = requests.get(
            "http://localhost:8899/api/v1/proxies?https=false&limit=10000",
            timeout=10).json()["proxies"]
        for p in all_p:
            self.add_ip("http", f'{p["ip"]}:{p["port"]}')
    except Exception:
        logger.exception("scylla failed")
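The method only reads the proxies key and, inside each entry, ip and port, so the Scylla response is assumed to look roughly like this trimmed sketch:

# Assumed shape of the /api/v1/proxies response (only the fields used above):
sample_response = {
    "proxies": [
        {"ip": "203.0.113.7", "port": 8080},
        {"ip": "198.51.100.2", "port": 3128},
    ]
}
for p in sample_response["proxies"]:
    print(f'{p["ip"]}:{p["port"]}')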
def download_mp3(content: str):
    # Youdao dictionary voice API: type=0 and type=1 select the two pronunciation
    # variants, saved under the US and UK tokens respectively.
    url = "http://dict.youdao.com/dictvoice?type=0&audio=" + content
    usatok = mktoken()
    uktok = mktoken()
    path = "./static/music/" + usatok + ".mp3"
    r = requests.get(url)
    with open(path, "wb") as f:
        f.write(r.content)

    url = "http://dict.youdao.com/dictvoice?type=1&audio=" + content
    path = "./static/music/" + uktok + ".mp3"
    r = requests.get(url)
    with open(path, "wb") as f:
        f.write(r.content)

    return usatok, uktok
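A hedged usage sketch: mktoken() is assumed to return a unique, file-name-safe token, and ./static/music/ must already exist. URL-encoding the word before it is appended to the query string is a sensible precaution:

from urllib.parse import quote

word = "hello"
usa_token, uk_token = download_mp3(quote(word))
print(f"US audio: ./static/music/{usa_token}.mp3")
print(f"UK audio: ./static/music/{uk_token}.mp3")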
def _add_pubproxy(self):
    # Grab one proxy per protocol from the pubproxy.com API.
    try:
        # Route the request through one of our own good HTTPS proxies when we have enough of them.
        if len(self.ip_q["https"]) > 10:
            proxies = {
                "https": self._get_good_proxy("https",
                                              change_priority=False)
            }
        else:
            proxies = None
        for protocol in self.PROTOCOLS:
            self.add_ip(
                protocol,
                requests.get(
                    "https://pubproxy.com/api/proxy?type=http&speed=15&https=true",
                    timeout=15,
                    proxies=proxies).json()["data"][0]["ipPort"])
    except Exception:
        logger.exception("pubproxy failed")
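Only data[0]["ipPort"] is read from the pubproxy response, so the payload is assumed to contain at least this (sketch, other fields omitted):

# Assumed shape of the pubproxy.com response, limited to the field used above:
sample_response = {
    "data": [
        {"ipPort": "203.0.113.7:8080"}
    ]
}
print(sample_response["data"][0]["ipPort"])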
def specfic_search(word):  # returns False if nothing matches, otherwise the looked-up result
    try:
        # Patterns for "<english phrase> 翻译", a bare English phrase, or "<something>的英语".
        re_list = [r"([a-z]|[A-Z]|\s){1,}翻译", r"([a-z]|[A-Z]|\s){1,}", r"(.*)的英语"]
        mode = -1
        cmpres = None
        for tmp, pattern in enumerate(re_list):
            cmpres = re.match(pattern, word)
            if cmpres is not None:
                print(cmpres)
                mode = tmp
                break
        if mode == -1:
            return False
        content = cmpres.group()
        # Strip the trigger words so only the phrase to look up remains.
        if mode == 0:
            content = content[:len(content) - 2]   # drop trailing "翻译"
        if mode == 2:
            content = content[:len(content) - 3]   # drop trailing "的英语"
        # Hit the Youdao translate endpoint (its response is not used here).
        requests.get(
            "http://fanyi.youdao.com/translate?&doctype=json&type=AUTO&i=" +
            content)

        ret = get_word_mean(content, hea_ordinary)
        ret_url = ("http://dict.youdao.com/search?q=" + content +
                   "&keyfrom=new-fanyi.smartResult")
        return ret, ret_url, mode
    except Exception:
        return False
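A quick usage sketch, assuming get_word_mean and hea_ordinary are defined elsewhere in the same module:

result = specfic_search("hello 翻译")
if result is False:
    print("no translatable phrase found")
else:
    meaning, link, mode = result
    print(mode, link)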
Example #8
import time
from requests_html import requests

# Ping the local service's /save endpoint every two seconds.
while True:
    time.sleep(2)
    requests.get('http://127.0.0.1:1278/save')
def getsearchurl(url):
    # Despite the name, `url` is the HTML of a mijisou result page; collect the outbound result links.
    soup = BeautifulSoup(url, "html.parser")
    ret = []
    href_ = soup.find_all(name="a")
    for each in href_:
        # Result links carry rel="noopener noreferrer"; skip links pointing back to mijisou.com itself.
        if each.get("rel") == ["noopener", "noreferrer"]:
            if each.get('href').find('mijisou.com') == -1:
                ret.append(each.get("href"))
    return ret
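getsearchurl() expects the fetched page HTML, and BeautifulSoup must already be imported in the module that defines it. A minimal usage sketch; the headers dict below is a stand-in for the hea_ordinary used elsewhere in this file:

import requests

page = requests.get(
    "https://mijisou.com/?q=Linux&category_general=on&pageno=1",
    headers={"User-Agent": "Mozilla/5.0"},
    timeout=15,
)
print(getsearchurl(page.text))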
def mainly():
    # Pull the baiduCDS entries straight out of CubeQL and push them into cylinder's crawl queue.
    # Note: '/get', '/set?...' and '/baidu_get' are relative paths; requests needs the full
    # base URL of that service prepended before these calls can succeed.
    req = requests.post('/get')
    que = demjson.decode(req.text)
    while que != []:
        word = que.pop()
        urllist = []
        a = gethtmurl(requests.get('http://baidu.com/s?wd=' + word).text)
        print(a)
        requests.post('/set?url = ', que.pop())
        if que == []:
            req = requests.post('/baidu_get')
            que = demjson.decode(req.text)


if __name__ == '__main__':
    # mainly()
    word = 'Linux'
    print(gethtmurl(requests.get(
        'https://mijisou.com/?q=' + word + '&category_general=on&time_range=&language=zh-CN&pageno=1',
        headers=hea_ordinary).text))
    # print(gethtmurl(requests.get('https://mijisou.com/?q=' + word, headers=hea_ordinary).text))