Code Example #1
 def __init__(self, jid, lid):
     self.jid = jid  # JSON request parameter
     self.lid = lid  # JSON request parameter
     self.url = "https://www.zhipin.com/view/job/card.json?jid=" + str(
         self.jid) + "&lid=" + str(self.lid)
     #https://www.zhipin.com/view/job/card.json?jid=2339af182b9be5111XB70t2_GVQ~&lid=17qKeuLoGkf.search
     self.headers = {
          "Host": "www.zhipin.com",
          "Connection": "keep-alive",
          "Pragma": "no-cache",
          "Cache-Control": "no-cache",
          "Accept": "application/json, text/javascript, */*; q=0.01",
          "X-Requested-With": "XMLHttpRequest",
          "User-Agent": random.choice(UserAgents.agents),
          # "Referer": https://www.zhipin.com/c101190100/h_101190100/?query=python&page=2&ka=page-2
          "Accept-Encoding": "gzip, deflate, br",
          "Accept-Language": "zh-CN,zh;q=0.9",
         "Cookie":
         "sid=sem_pz_bdpc_dasou_title; JSESSIONID="
         "; __c=1539227402; __g=sem_pz_bdpc_dasou_title; __l=l=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&r=https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3DUTF-8%26wd%3Dboss%25E7%259B%25B4%25E8%2581%2598&g=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1539076339,1539152693,1539227402; lastCity=101190100; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fc101190100%2Fh_101190100%2F%3Fquery%3Dpython%26page%3D2%26ka%3Dpage-2; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1539249307; __a=1223038.1539076337.1539076344.1539227402.57.3.21.21"
     }
     proxy_pool = ProxyPool.Proxy_Pool()
     self.proxies = []  # list of proxy IPs
     if not proxy_pool.Is_Empty():
         ip, self.proxies = proxy_pool.pop_all()
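
A minimal sketch of how this request object might be used afterwards (an assumption, since the class's fetch method is not shown here; the class name JobCard is hypothetical and only the requests library is relied on):

import requests

card = JobCard("2339af182b9be5111XB70t2_GVQ~", "17qKeuLoGkf.search")  # hypothetical class wrapping the __init__ above
resp = requests.get(card.url, headers=card.headers, timeout=10)
resp.raise_for_status()
print(resp.json())  # the card.json endpoint returns the job card data as JSON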
Code Example #2
 def __init__(self):
     self._start = 0
     self._end = 0
     self._keyword = ''
     self._timeout = 0
     self._recon = 0
     self._filename = ''
     self._urls = []
     self._re_urls = []
     self._pool = ProxyPool.ProxyPool()
     self._searchURL = ''
     self._punc = ',.!?:;~\'\",。!?:;、~…⋯()<>「」[]【】<>〈〉《》()﹙﹚『』«»“”’{}\\[\\]'   # the '[]' needs to be the last one
     self._stop_words = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄧㄨㄩㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦ \
                         abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ \
                         ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ \
                         12345678901234567890ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!?©@#$%^&*.…⋯→‧•◎※■+・ˇˋˊ˙ \
                         ()_+=-\\[]/,.;:`~|{}<>\'\"\n\t\r\xa0,。、!?「」[]【】<>〈〉《》():;«»*˙●/_—『』×@#$%︿&-=〜~≡|│║★☆Ⓡ➠†§– \
                         ♥❤“”’ ̄▽😊😆😋😏😅😀😍😎📍👍🚫🐍💟🎉⊙◢◤˚゚・。`↑↓﹙﹚▲▼◆◈▣✥▒👉►⓪①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬〝〞▌☀ღ▶➦ⓞ☎▋♡▂▃▄▅▆▊▩⇓✽�🕘㊣╳'
     self._sw_no_punc = re.sub('([{}])'.format(self._punc), '', self._stop_words)
     self._sw_dict = {w : True for w in self._stop_words}
     self._sw_no_punc_dict = {w : True for w in self._sw_no_punc}
     self._websites = {'Unknown': -1, 'Pixnet': 0, 'Hares': 1}
     self._title_tags = ['title', 'entry-title']                       # {0: 'Pixnet', 1: 'Hares'}
     self._content_tags = ['article-content-inner', 'entry-content']   # {0: 'Pixnet', 1: 'Hares'}
     self._result = ''
Code Example #3
File: BossZP_Spider.py  Project: kellaJL/BossZP
    def __init__(self):
        self.start_url = "https://www.zhipin.com/"
        self.cities_code = {"深圳": "c101280600-p100109/h_101280600/", "上海": "c101020100-p100109/h_101020100/", "北京": "c101010100/h_101010100/",
                            "南京": "c101190100/h_101190100/", "杭州": "c101210100/h_101210100/"}  # 北京,南京,杭州
        self.headers = {
		    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            'Accept-Encoding': "gzip, deflate, br",
            'Accept-Language': "zh-CN,zh;q=0.9",
            'Cache-Control': "no-cache",
            'Connection': "keep-alive",
            'Cookie': "sid=sem_pz_bdpc_dasou_title; JSESSIONID=""; __g=sem_pz_bdpc_dasou_title; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1539076339; __c=1539076344; __l=r=https%3A%2F%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&l=%2Fwww.zhipin.com%2Fjob_detail%2F%3Fka%3Dheader-job&g=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_ti; lastCity=101190100; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fc101190100%2Fh_101190100%2F%3Fquery%3Dpython%26page%3D2; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1539130799; __a=1223038.1539076337.1539076337.1539076344.24.2.23.24",
            'Host': "www.zhipin.com",
            'Pragma': "no-cache",
            'Upgrade-Insecure-Requests': "1",
            "User-Agent": random.choice(UserAgents.agents),
            # Request forgery: you could also try cross-site request forgery (CSRF); Referer indicates where the request came from
            'Referer': 'https://www.zhipin.com/c101280600-p100109/h_101280600/?query=python&page=1&ka=page-1'
        }
        
        pool = redis.ConnectionPool(host='localhost', port=6379)
        self.conn = redis.Redis(connection_pool=pool)
        self.proxy_pool = ProxyPool.Proxy_Pool()
        self.proxies = []
        self.ip = []
        if not self.proxy_pool.Is_Empty():
            self.ip, self.proxies = self.proxy_pool.pop_all()
Code Example #4
 async def __process_fourth_html(self):
     '''
     foreign website
     :return:
     '''
     page_count = 6
     urls = [
         self.__proxy_urls[3].format(_i + 1) for _i in range(page_count)
     ]
     tasks = [self.__get_html(url) for url in urls]
     done = await asyncio.gather(*tasks)
     import base64  # this source base64-encodes its proxy entries
     for html in done:
         proxies = re.findall(r"Proxy\('(.*?)'\)", html)
         for proxy in proxies:
             temp = base64.b64decode(proxy).decode()
             ip = re.findall(r'\d+\.\d+\.\d+\.\d+', temp)[0]
             port = re.findall(r':\d+', temp)[0][1:]  # drop the leading ':'
             new = ProxyPool.proxy(ip, port)
             if new not in self.__pool:
                 self.__pool.append(new)
                 print(
                     f'process_fourth_html got {new.get_string_address()}')
Code Example #5
 async def __process_third_html(self):
     page_count = 2
     urls = [
         self.__proxy_urls[2].format(_i + 1) for _i in range(page_count)
     ]
     tasks = [self.__get_html(url) for url in urls]
     done = await asyncio.gather(*tasks)
     for html in done:
         if html:
             soup = BeautifulSoup(html, features='html5lib')
             res = soup.find_all('td')
             for _i, item in enumerate(res):
                 str_ = str(item.string)
                 temp = re.findall(r'\d+\.\d+\.\d+\.\d+', str_)
                 if temp:
                     ip = temp[0]
                     port = str(res[_i + 1].string).strip()  # the port sits in the next <td>
                     new = ProxyPool.proxy(ip, port)
                     if new not in self.__pool:
                         self.__pool.append(new)
                         print(
                             f'process_third_html got {new.get_string_address()}'
                         )
Code Example #6
File: PoolModule.py  Project: Raibows/bilibiliSpider
 async def __process_first_html(self):
     # tasks = [self.__get_html(self.__proxy_urls[0].format(_i)) for _i in range(1, 4)]
     page_count = 4
     done = []
     for _i in range(1, page_count):
         url = self.__proxy_urls[0].format(_i)
         html = await self.__get_html(url)
         done.append(html)
         await asyncio.sleep(1.148)  # non-blocking pause between requests
     # done = await asyncio.gather(*tasks)
     for html in done:
         if html:
             soup = BeautifulSoup(html, features="html5lib")
             ips = soup.find_all('td', {'data-title': 'IP'})
             ports = soup.find_all('td', {'data-title': 'PORT'})
             proxies = zip(ips, ports)
             # print(len(ips))
             # os._exit(-1)
             for item in proxies:
                 proxy = ProxyPool.proxy(item[0].string, item[1].string)
                 if proxy not in self.__pool:
                     self.__pool.append(proxy)
                     print(f'process_first_html got {proxy.get_string_address()}')
Code Example #7
    def __init__(self, exit_pipe, count_pipe, data_pipe):
        self.pool = redis.ConnectionPool(host=REDIS_HOST, port=REDIS_PORT)
        self.r_con = redis.Redis(connection_pool=self.pool)
        self.ua = UserAgent()
        self.exit_flag = False  # exit signal

        self.exit_pipe = exit_pipe
        self.count_pipe = count_pipe
        self.data_pipe = data_pipe

        self.crawl_threads = []  # crawler threads

        self.data_format = [
            "uid", "user_name", "user_sign", "gender", "level", "birthday",
            "coins", "vip", "favorite_list", "favorite_sum", "follow", 'fans',
            "live_title", "audio", "video", "album", "article", "play_count",
            "read_count", "time"
        ]

        self.pages_count = 0
        self.items_count = 0

        self.redis = ProxyPool.ProxyAPI()

        self.run()
Code Example #8
File: ProxyPoolDeamon.py  Project: hethune/ProxyPool
def makeSiteThreads():
  import pkgutil
  import sites

  # load proxy file
  ProxyPool.restore()

  sites_modules = []
  for importer, modname, ispkg in pkgutil.iter_modules(sites.__path__):
    sites_modules.append(importer.find_module(modname).load_module(modname))

  site_threads = []
  for m in sites_modules:
    t = siteThread(m.__name__, m.crawl)
    t.daemon = True
    site_threads.append(t)

  return site_threads
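
A minimal usage sketch (an assumption; the daemon script that actually starts these threads is not shown here):

site_threads = makeSiteThreads()
for t in site_threads:
    t.start()  # each site crawler module runs in its own daemon thread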
Code Example #9
def set_proxy():
    proxy = ProxyPool.get_proxy_ip()
    settings = {
        "httpProxy": proxy,
        "sslProxy": proxy
    }
    print('proxy_ip', proxy)
    proxy = Proxy(settings)
    cap = DesiredCapabilities.CHROME.copy()
    cap['platform'] = "WINDOWS"
    cap['version'] = "10"
    proxy.add_to_capabilities(cap)
    return cap
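
A minimal sketch of how the returned capabilities might be handed to a browser (assuming the Selenium 3-style desired_capabilities argument and a chromedriver on PATH; Proxy and DesiredCapabilities above come from selenium.webdriver.common.proxy and selenium.webdriver):

from selenium import webdriver

cap = set_proxy()
driver = webdriver.Chrome(desired_capabilities=cap)
driver.get("https://httpbin.org/ip")  # the page should report the proxy's IP
driver.quit()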
Code Example #10
File: PoolModule.py  Project: Raibows/bilibiliSpider
 def __universal_soup(self, flag: str, html):
     res = html
     soup = BeautifulSoup(res, features='html5lib')
     ipandport = soup.find_all('td')
     for i, ip in enumerate(ipandport):
         temp = re.findall(r'\d+\.\d+\.\d+\.\d+', str(ip.string))
         if temp:
             port = ipandport[i + 1].string
             port = port.strip()
             new = ProxyPool.proxy(temp[0], port)
             if new not in self.__pool:
                 self.__pool.append(new)
                 print(f'process_{flag}_html got {new.get_string_address()}')
Code Example #11
File: Spider.py  Project: kellaJL/E-P-SPIDER
    def getHTMLText(self, code="utf-8"):
        if not self.parseURL():
            return
        if self.cache:
            self.html = self.cache[self.url]
            if not self.html:
                p_p = ProxyPool.Proxy_Pool()
                proxy = db.proxy
                tag = True
                while tag:
                    proxies = proxy.find_one()
                    if proxies is None:
                        ProxyGetter.get_ip()
                        continue  # refill the proxy table, then query it again
                    one_p = str(proxies['类型'])
                    two_p = str(proxies['IP'])
                    three_p = str(proxies['PORT'])
                    flag = p_p.test_connection(one_p, two_p, three_p)
                    if not flag:
                        p_p.del_record(proxies['IP'])
                    else:
                        tag = False
                proxy_ip = {
                    str(proxies['类型']):
                    str(proxies['IP']) + ":" + str(proxies['PORT'])
                }
                try:

                    ua = {
                        'user-agent':
                        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'
                    }
                    r = requests.get(self.url, headers=ua, proxies=proxy_ip)
                    r.raise_for_status()
                    r.encoding = code
                    self.html = r.text
                    self.cache[self.url] = self.html
                except Exception:
                    # p_p.clean_nonworking()
                    pass
Code Example #12
File: PoolModule.py  Project: Raibows/bilibiliSpider
 async def __process_fifth_html(self):
     '''
     forbidden website
     :return:
     '''
     proxy_count = 20
     url = self.__proxy_urls[4].format(proxy_count)
     html = await self.__get_html(url)
     print(html)
     items = re.findall(r'\d+\.\d+\.\d+\.\d+:\d{1,5}', html)
     for item in items:
         ip = re.findall(r'\d+\.\d+\.\d+\.\d+', item)[0]
         port = item.replace(ip, "")[1:]  # drop the leading ':'
         new = ProxyPool.proxy(ip, port)
         if new not in self.__pool:
             self.__pool.append(new)
             print(f'process_fifth_html got {new.get_string_address()}')
Code Example #13
 async def __process_second_html(self):
     urls = self.__proxy_urls[1]
     tasks = [self.__get_html(url) for url in urls]
     done = await asyncio.gather(*tasks)
     for res in done:
         if res:
             soup = BeautifulSoup(res, features='html5lib')
             ipandport = soup.find_all('td')
             for i, ip in enumerate(ipandport):
                 temp = re.findall(r'\d+\.\d+\.\d+\.\d+', ip.string)
                 if temp:
                     port = ipandport[i + 1].string.strip()  # the port sits in the next <td>
                     new = ProxyPool.proxy(temp[0], port)
                     if new not in self.__pool:
                         self.__pool.append(new)
                         print(
                             f'process_second_html got {new.get_string_address()}'
                         )
Code Example #14
 async def __process_sixth_html(self):
     page_count = 5
     urls = [
         self.__proxy_urls[5].format(_i + 1) for _i in range(page_count)
     ]
     tasks = [self.__get_html(url) for url in urls]
     done = await asyncio.gather(*tasks)
     for html in done:
         if html:
             soup = BeautifulSoup(html, features='html5lib')
             ipandport = soup.find_all('td')
             for i, ip in enumerate(ipandport):
                 temp = re.findall(r'\d+\.\d+\.\d+\.\d+', ip.string)
                 if temp:
                     port = ipandport[i + 1].string.strip()  # the port sits in the next <td>
                     new = ProxyPool.proxy(temp[0], port)
                     if new not in self.__pool:
                         self.__pool.append(new)
                         print(
                             f'process_sixth_html got {new.get_string_address()}'
                         )
Code Example #15
import sys
sys.path.append('..')
sys.path.append('.')

import ProxyPool

if __name__ == '__main__':
    pool = ProxyPool.proxy_pool()
    pool.start_work()
Code Example #16
        while temp:
            temp = bytes.decode(temp)
            info.append(temp)
            temp = self.__redis.rpop('decrease')
        feedback['decrease'] = info

        return feedback


if __name__ == '__main__':

    test = database()
    proxy = 'http://119.254.94.114:34422'
    # test.proxy_feedback(proxy, True)
    #
    a = ProxyPool.proxy('192.141.32.2', '2367')
    b = ProxyPool.proxy('111.23.214.123', '23')
    c = ProxyPool.proxy('111.23.214.123', '231')
    test.add_proxies([a, b, c])

    # test.delete_proxies([a, b])
    # test.add_proxies([a, b, c])
    # test.add_proxies([a, b, c])
    #
    # test.proxy_feedback(a.get_string_address(), True)
    # test.proxy_feedback(a.get_dict_address(), False)
    # test.proxy_feedback(b.get_dict_address(), True)
    #
    # print(test.get_feedback())

    # temp = test.get_all_increase()
Code Example #17
File: MAIN.py  Project: kellaJL/E-P-SPIDER
sp = Spider()
sp.updateDatabase()  # update the database
# start_url = input("Enter the homepage URL of the target site, e.g. http://www.xiaohuar.com/ : ")
#if parseURL(start_url)==False:
#    exit(1)
start_url="http://www.xiaohuar.com/"

#start_url=u"http://www.baidu.com/"

sp.main(start_url, start_url)
spider_schedule = SpiderSchedule.SpiderSchedule()

downloads = downloaderware()
threads = []

p_p = ProxyPool.Proxy_Pool()
proxy_counter = 1  # proxy-pool refresh counter
threads_counter = 0  # thread counter
t0 = threading.Thread(target=spider_schedule.SpiderSchedule, args=(start_url,))
t0.start()
threads.append(t0)
while threads and threads_counter <= 50:

    # the crawl is still active
    for thread in list(threads):  # iterate over a copy so removal is safe
        if not thread.is_alive():
            # remove the stopped threads
            threads.remove(thread)

        t2 = threading.Thread(target=downloads.downloaderware, args=(start_url,))
        t2.start()
Code Example #18
File: ProxyPoolDeamon.py  Project: hethune/ProxyPool
 def run(self):
   print("starting proxy dump thread")
   while True:
     ProxyPool.dump()
     print("[Deamon] Dumped proxies")
     time.sleep(30)
Code Example #19
File: ProxyPoolDeamon.py  Project: hethune/ProxyPool
 def run(self):
   print("starting proxy pools")
   while True:
     ProxyPool.cleanNonWorking()