def run(self):
    # Worker loop: pop product-URL ids from the Redis "new" set, fetch each
    # product page through a rotating proxy, and hand the HTML to the parser.
    # Assumes module-level rds, lock, HEADERS, the NewProductUrlId /
    # CrawlProductUrlId / PoaId key names, and imports of requests, random,
    # and requests.exceptions.RequestException.
    while True:
        product_url_id = rds.pop_member(NewProductUrlId)
        if product_url_id is None:
            break  # set drained
        if isinstance(product_url_id, bytes):
            product_url_id = product_url_id.decode('utf-8')
        rds.add_set(CrawlProductUrlId, product_url_id)
        # Refill the shared proxy pool when it runs low.
        if self.que.qsize() < 10:
            for proxy_ip in get_proxy_ip_main():
                self.que.put({'ip': proxy_ip, 'num': 0})
        ip_proxy = self.que.get()
        proxy = {"http": 'http://' + ip_proxy['ip']}
        header = {"User-Agent": random.choice(HEADERS)}
        try:
            product_url = 'http://www.dx.com/%s' % product_url_id
            rq = requests.get(product_url, headers=header, proxies=proxy, timeout=5)
            if rq.status_code == 200:
                rq.encoding = 'utf-8'
                with lock:  # context manager releases the lock even if parsing raises
                    self.parse_html(rq.text, product_url_id, rq.url)
            elif rq.status_code in (404, 500):
                # Dead product: drop it from the crawl set and delete its POA key.
                print('404/500')
                rds.remove_member(CrawlProductUrlId, product_url_id)
                rds.delete_key(PoaId + product_url_id)
            else:
                print(rq.status_code)
                ip_proxy['num'] += 1
                rds.add_set(NewProductUrlId, product_url_id)  # requeue for retry
        except RequestException:
            print('REX')
            ip_proxy['num'] += 1
            rds.add_set(NewProductUrlId, product_url_id)
        finally:
            # Return the proxy to the pool unless it has failed too often.
            if ip_proxy['num'] <= 5:
                self.que.put(ip_proxy)
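# The DxRedis helper from store.py is not shown here; the methods used above
# (pop_member, add_set, remove_member, delete_key, get_all_members) map
# naturally onto redis-py set operations. A minimal sketch, assuming a local
# Redis instance and the redis-py client -- this class is an illustrative
# reconstruction, not the project's actual implementation:
import redis

class DxRedis:
    def __init__(self, db_num, host='localhost', port=6379):
        self.conn = redis.StrictRedis(host=host, port=port, db=db_num)

    def pop_member(self, key):
        # SPOP returns a random member (bytes) or None when the set is empty.
        return self.conn.spop(key)

    def add_set(self, key, member):
        return self.conn.sadd(key, member)

    def remove_member(self, key, member):
        return self.conn.srem(key, member)

    def delete_key(self, key):
        return self.conn.delete(key)

    def get_all_members(self, key):
        return self.conn.smembers(key)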
def run(self):
    # Worker loop: pop product records (stringified dicts) from Redis, fetch
    # each product_url through a rotating proxy, and parse the page.
    while True:
        mp = rds.pop_member(NewProduct)
        if mp is None:
            break
        if isinstance(mp, bytes):
            mp = mp.decode('utf-8')
        rds.add_set(CrawlProduct, mp)
        if self.que.qsize() < 10:
            for proxy_ip in get_proxy_ip_main():
                self.que.put({'ip': proxy_ip, 'num': 0})
        ip_proxy = self.que.get()
        proxy = {"http": 'http://' + ip_proxy['ip']}
        header = {"User-Agent": random.choice(HEADERS)}
        try:
            # mp is a dict literal stored as a string; eval() assumes the data
            # is trusted (ast.literal_eval would be the safer choice).
            product_url = eval(mp)['product_url']
            rq = requests.get(product_url, headers=header, proxies=proxy, timeout=5)
            if rq.status_code == 200:
                rq.encoding = 'utf-8'
                with lock:
                    self.parse_html(rq.text, mp, rq.url)
            elif rq.status_code == 404:
                rds.remove_member(CrawlProduct, mp)  # gone for good
            else:
                print(rq.status_code)
                ip_proxy['num'] += 1
                rds.add_set(NewProduct, mp)  # requeue for another attempt
        except RequestException:
            print('REX')
            ip_proxy['num'] += 1
            rds.add_set(NewProduct, mp)
        finally:
            if ip_proxy['num'] <= 5:
                self.que.put(ip_proxy)
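# The same proxy-pool pattern appears in every worker: take a proxy from the
# shared queue, count failures against it, and return it to the pool only
# while its failure count stays at or below 5. A standalone sketch of that
# rotation logic, using hypothetical names (ProxyPool, fetch) for
# illustration:
import queue

import requests
from requests.exceptions import RequestException

class ProxyPool:
    def __init__(self, ips, max_failures=5):
        self.que = queue.Queue()
        self.max_failures = max_failures
        for ip in ips:
            self.que.put({'ip': ip, 'num': 0})

    def fetch(self, url, headers, timeout=5):
        proxy_entry = self.que.get()
        try:
            return requests.get(
                url,
                headers=headers,
                proxies={'http': 'http://' + proxy_entry['ip']},
                timeout=timeout,
            )
        except RequestException:
            proxy_entry['num'] += 1  # charge the failure to this proxy
            raise
        finally:
            # Recycle the proxy unless it has failed too many times.
            if proxy_entry['num'] <= self.max_failures:
                self.que.put(proxy_entry)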
def run(self):
    # Worker loop: pop category-page records from Redis and collect the
    # product links on each listing page.
    while True:
        mp = rds.pop_member(NewPageUrl)
        if mp is None:
            break
        if isinstance(mp, bytes):
            mp = mp.decode('utf-8')
        rds.add_set(CrawlPageUrl, mp)
        mpp = eval(mp)  # stringified dict; trusted data assumed
        # Take a proxy before entering the try block, so ip_proxy is always
        # bound by the time the except/finally clauses run.
        if self.que.qsize() < 10:
            for proxy_ip in get_proxy_ip_main():
                self.que.put({'ip': proxy_ip, 'num': 0})
        ip_proxy = self.que.get()
        proxy = {"http": 'http://' + ip_proxy['ip']}
        header = {"User-Agent": random.choice(HEADERS)}
        try:
            page_url = mpp['page_url']
            rq = requests.get(page_url, headers=header, proxies=proxy, timeout=5)
            if rq.status_code == 200:
                print(rq.url)
                self.analyze_page_html(rq.text, mp)
            else:
                ip_proxy['num'] += 1
                print('status_code: %s' % rq.status_code)
                rds.add_set(NewPageUrl, mp)  # requeue
        except RequestException:
            ip_proxy['num'] += 1
            print('REX')
            # traceback.print_exc()
            rds.add_set(NewPageUrl, mp)
        finally:
            if ip_proxy['num'] <= 5:
                self.que.put(ip_proxy)
            else:
                print('proxy dropped from pool:', ip_proxy)
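# Each run() above is written as a thread body sharing one proxy queue and
# one lock, which suggests the workers are threading.Thread subclasses. A
# minimal harness sketch, assuming a PageWorker class shaped like the ones
# above (the class name and worker count are illustrative):
import threading

lock = threading.Lock()

class PageWorker(threading.Thread):
    def __init__(self, que):
        super().__init__()
        self.que = que  # shared proxy pool

    def run(self):
        pass  # one of the crawl loops shown above goes here

def dx_list_main(que, worker_count=8):
    # Start the workers and wait until the Redis set is drained.
    workers = [PageWorker(que) for _ in range(worker_count)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()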
from dx_popular_update_track_uuid import update_track
from dx_popular_get_category2_url import get_page
from get_proxy_ip import get_proxy_ip_main
import queue

from configuration_file import DbNum, NewPageUrl
from store import DxRedis
# dx_list_main and dx_product_main are called below; they are assumed to live
# in their own crawler modules alongside this one (module paths illustrative).
from dx_popular_list import dx_list_main
from dx_popular_product import dx_product_main

rds = DxRedis(DbNum)
start_url = "http://www.dx.com/"


def dx_crawl_main(que):
    # get_page(start_url)
    # Seed the page queue with every stored category URL, then run the crawl
    # stages in order: listing pages, product pages, track update.
    for mem in rds.get_all_members('category2_url'):
        rds.add_set(NewPageUrl, mem)
    print('get_category2_url finished')
    dx_list_main(que)
    print('list finished')
    dx_product_main(que)
    print('product finished')
    update_track()
    print('update_track finished')


if __name__ == '__main__':
    # Pre-fill the shared proxy pool before starting the crawl.
    q = queue.Queue()
    for ip in get_proxy_ip_main():
        q.put({'ip': ip, 'num': 0})
    dx_crawl_main(q)
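# get_proxy_ip_main() is imported from get_proxy_ip but not shown; the callers
# only rely on it returning a list of "host:port" strings. A stub illustrating
# that expected shape (the addresses are placeholders from the documentation
# range, and the real module presumably fetches live proxies from a provider):
def get_proxy_ip_main():
    return [
        '203.0.113.10:8080',  # placeholder, not a real proxy
        '203.0.113.11:3128',
    ]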