Code example #1
File: dx_new_product.py  Project: xtuyaowu/dx-crawl
 def run(self):
     # Worker loop: pop product URL ids from Redis until the set is empty.
     while True:
         product_url_id = rds.pop_member(NewProductUrlId)
         if product_url_id is None:
             break
         else:
             if isinstance(product_url_id, bytes):
                 product_url_id = product_url_id.decode('utf-8')
             rds.add_set(CrawlProductUrlId, product_url_id)
              # Top up the shared proxy queue whenever it runs low.
              if self.que.qsize() < 10:
                 proxy_ip_list = get_proxy_ip_main()
                 for proxy_ip in proxy_ip_list:
                     self.que.put({'ip': proxy_ip, 'num': 0})
             ip_proxy = self.que.get()
             ip = ip_proxy['ip']
             proxy = {"http": 'http://' + ip}
             header = {"User-Agent": random.choice(HEADERS)}
             try:
                 product_url = 'http://www.dx.com/%s' % product_url_id
                 rq = requests.get(product_url,
                                   headers=header,
                                   proxies=proxy,
                                   timeout=5)
                 if rq.status_code == 200:
                     rq.encoding = 'utf-8'
                      with lock:  # serialize parsing across worker threads
                          self.parse_html(rq.text, product_url_id, rq.url)
                  elif rq.status_code in (404, 500):
                      print('404/500')
                      # Dead product: drop it from the crawl set and delete
                      # its PoaId-prefixed key.
                      rds.remove_member(CrawlProductUrlId, product_url_id)
                      poa_key = PoaId + product_url_id
                      rds.delete_key(poa_key)
                 else:
                     print(rq.status_code)
                     ip_proxy['num'] += 1
                     rds.add_set(NewProductUrlId, product_url_id)
             except RequestException:
                 print('REX')
                 ip_proxy['num'] += 1
                 rds.add_set(NewProductUrlId, product_url_id)
             finally:
                 if ip_proxy['num'] <= 5:
                     self.que.put(ip_proxy)
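
All of these examples talk to Redis through the same rds helper (DxRedis from store.py, see example #4). The project's actual store.py is not shown here, so the following is only a minimal sketch of what such a wrapper could look like, assuming each helper maps onto the matching redis-py set command:

import redis

class DxRedis:
    # Hypothetical wrapper; the real store.py may differ.
    def __init__(self, db_num, host='localhost', port=6379):
        self.conn = redis.StrictRedis(host=host, port=port, db=db_num)

    def pop_member(self, key):
        # SPOP returns a random member, or None when the set is empty.
        return self.conn.spop(key)

    def add_set(self, key, member):
        return self.conn.sadd(key, member)

    def remove_member(self, key, member):
        return self.conn.srem(key, member)

    def delete_key(self, key):
        return self.conn.delete(key)

    def get_all_members(self, key):
        return self.conn.smembers(key)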
Code example #2
 def run(self):
     while True:
         mp = rds.pop_member(NewProduct)
         if mp is None:
             break
         else:
             if isinstance(mp, bytes):
                 mp = mp.decode('utf-8')
             rds.add_set(CrawlProduct, mp)
             if self.que.qsize() < 10:
                 proxy_ip_list = get_proxy_ip_main()
                 for proxy_ip in proxy_ip_list:
                     self.que.put({'ip': proxy_ip, 'num': 0})
             ip_proxy = self.que.get()
             ip = ip_proxy['ip']
             proxy = {"http": 'http://' + ip}
             header = {"User-Agent": random.choice(HEADERS)}
             try:
                  # mp is stored as a dict literal string; ast.literal_eval
                  # would be a safer parser than eval() here.
                  product_url = eval(mp)['product_url']
                 rq = requests.get(product_url,
                                   headers=header,
                                   proxies=proxy,
                                   timeout=5)
                 if rq.status_code == 200:
                     rq.encoding = 'utf-8'
                      with lock:  # serialize parsing across worker threads
                          self.parse_html(rq.text, mp, rq.url)
                  elif rq.status_code == 404:
                      # Product page is gone: remove it from the crawled set.
                      rds.remove_member(CrawlProduct, mp)
                 else:
                     print(rq.status_code)
                     ip_proxy['num'] += 1
                     rds.add_set(NewProduct, mp)
             except RequestException:
                 print('REX')
                 ip_proxy['num'] += 1
                 rds.add_set(NewProduct, mp)
             finally:
                 if ip_proxy['num'] <= 5:
                     self.que.put(ip_proxy)
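
Examples #2 and #3 parse the member pulled from Redis with eval(). Since the stored strings appear to be plain dict literals, ast.literal_eval is a drop-in, safer parser: it accepts only Python literals instead of executing arbitrary code. A minimal illustration, with a made-up sample record:

import ast

mp = "{'product_url': 'http://www.dx.com/p/123'}"  # made-up sample record
record = ast.literal_eval(mp)  # raises ValueError/SyntaxError on non-literal input
print(record['product_url'])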
Code example #3
 def run(self):
     while True:
         mp = rds.pop_member(NewPageUrl)
         if mp is None:
             break
         else:
             if isinstance(mp, bytes):
                 mp = mp.decode('utf-8')
             rds.add_set(CrawlPageUrl, mp)
              mpp = eval(mp)  # stored member is a dict literal
              # Acquire a proxy before the try block so that ip_proxy is
              # always bound inside the except/finally handlers below.
              if self.que.qsize() < 10:
                  proxy_ip_list = get_proxy_ip_main()
                  for proxy_ip in proxy_ip_list:
                      self.que.put({'ip': proxy_ip, 'num': 0})
              ip_proxy = self.que.get()
              ip = ip_proxy['ip']
              proxy = {"http": 'http://' + ip}
              header = {"User-Agent": random.choice(HEADERS)}
              page_url = mpp['page_url']
              try:
                  rq = requests.get(page_url,
                                   headers=header,
                                   proxies=proxy,
                                   timeout=5)
                 if rq.status_code == 200:
                     print(rq.url)
                     self.analyze_page_html(rq.text, mp)
                 else:
                     ip_proxy['num'] += 1
                      print('status_code: %s' % rq.status_code)
                     rds.add_set(NewPageUrl, mp)
             except RequestException:
                 ip_proxy['num'] += 1
                 print('REX')
                 #traceback.print_exc()
                 rds.add_set(NewPageUrl, mp)
             finally:
                 if ip_proxy['num'] <= 5:
                     self.que.put(ip_proxy)
                  else:
                      # Proxy failed too many times; do not requeue it.
                      print('proxy dropped:', ip_proxy)
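
The run() methods above are written as worker threads that drain a Redis set while sharing one proxy queue. The launcher code is not part of these snippets; below is only a sketch of how a *_main(que) entry point could drive such workers, with the PageWorker class name and the thread count both being assumptions:

import threading

class PageWorker(threading.Thread):
    # Hypothetical name for the class owning run() in code example #3.
    def __init__(self, que):
        super().__init__()
        self.que = que
    # run() as in code example #3

def dx_list_main(que, worker_count=8):  # worker count is an assumption
    workers = [PageWorker(que) for _ in range(worker_count)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()  # each worker exits once pop_member() returns None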
Code example #4
File: dx_popular_crawl.py  Project: xtuyaowu/dx-crawl
from dx_popular_update_track_uuid import update_track
from dx_popular_get_category2_url import get_page
from get_proxy_ip import get_proxy_ip_main
import queue
from configuration_file import DbNum, NewPageUrl
from store import DxRedis
# dx_list_main and dx_product_main are used below; their imports (from the
# project's list- and product-crawl modules) are not part of this snippet.

rds = DxRedis(DbNum)
start_url = "http://www.dx.com/"


def dx_crawl_main(que):
    # get_page(start_url)
    # Seed the page-crawl work set with every stored category URL.
    for mem in rds.get_all_members('category2_url'):
        rds.add_set(NewPageUrl, mem)
    print('get_category2_url finished')
    dx_list_main(que)
    print('list finished')
    dx_product_main(que)
    print('product finished')
    update_track()
    print('update_track finished')


if __name__ == '__main__':
    # Pre-fill the shared proxy queue before starting the crawl.
    q = queue.Queue()
    proxy_ip_lst = get_proxy_ip_main()
    for ip in proxy_ip_lst:
        q.put({'ip': ip, 'num': 0})
    dx_crawl_main(q)
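
For reference, the retry pattern shared by examples #1 through #3 reduces to one self-contained function: take a proxy off the queue, fetch through it, count a failure on any non-200 response or request exception, and requeue the proxy only while its failure count stays at or below 5. The URL and proxy address below are placeholders:

import queue
import requests
from requests.exceptions import RequestException

def fetch_with_proxy(url, que):
    ip_proxy = que.get()
    proxy = {'http': 'http://' + ip_proxy['ip']}
    try:
        rq = requests.get(url, proxies=proxy, timeout=5)
        if rq.status_code == 200:
            return rq.text
        ip_proxy['num'] += 1      # non-200: count one failure for this proxy
    except RequestException:
        ip_proxy['num'] += 1      # network error: count one failure
    finally:
        if ip_proxy['num'] <= 5:  # same threshold as the examples above
            que.put(ip_proxy)
    return None

q = queue.Queue()
q.put({'ip': '1.2.3.4:8080', 'num': 0})  # placeholder proxy address
html = fetch_with_proxy('http://www.dx.com/', q)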