def check_proxy(p, test_url='http://baidu.com'):
    """Probe a proxy record and drop it from the database if it is dead.

    :param p: proxy record dict with at least the keys 'ip' and 'id'
    :param test_url: url fetched through the proxy to test liveness
                     (defaults to the original hard-coded probe target)

    Side effect: when the probe request raises RequestException, the
    proxy row is deleted via CouponDB().delete_ip(p['id']).
    """
    try:
        print(p)
        fetch(test_url, proxy=p['ip'])
    except RequestException as e:
        # proxy did not answer — remove it so it is never picked again
        print(e)
        CouponDB().delete_ip(p['id'])
def _fetch_coupon_page(self, second_id, page):
    """Fetch one coupon listing page, retrying once on failure.

    :param second_id: category id substituted into self.get_url
    :param page: zero-based page number
    :return: the response object, or None when both attempts fail
    """
    request_url = self.get_url.format(id=second_id, page=page)
    try:
        return fetch(request_url)
    except RequestException as e:
        log.logging.info('[warn] ineffective:{0}'.format(e))
    # second (and last) attempt; the old code left this retry
    # unprotected, so a second failure crashed the whole crawl
    try:
        return fetch(request_url)
    except RequestException as e:
        log.logging.info('[warn] ineffective:{0}'.format(e))
        return None

def get_coupon_info(self):
    """Crawl coupon data for every url stored in links.bin.

    Each line of links.bin has the form "<url>-<first_id>".  For every
    entry the second-level category id is derived from the url, then the
    coupon API is paged until it returns an HTML error page or an empty
    coupon_list.  Every coupon is persisted through Goods.save_coupon.
    A line is deleted from links.bin once its url is fully processed.
    """
    urls = read_file_to_url(LINKS_BIN)
    for raw in urls:
        fields = raw.replace("\n", "").split("-")
        url = fields[0]
        first_id = fields[1]
        second_id = self.get_id_for_url(url)
        page = 0
        while True:
            resp = self._fetch_coupon_page(second_id, page)
            if resp is None:
                # both fetch attempts failed — give up on this category
                log.logging.info('[ERROR] resp is None')
                break
            # startswith avoids the IndexError the old `resp.text[0]`
            # raised on an empty body; "<" means an HTML error page
            if resp.text.startswith("<") or len(resp.json().get(
                    'data')['coupon_list']) == CouponList.ZERO:
                log.logging.info('[INFO] Get {0} success'.format(second_id))
                break
            try:
                if resp.json().get('data'):
                    log.logging.info('[INFO]page {0}'.format(page))
                    coupon = Coupon()
                    for info in resp.json().get('data')['coupon_list']:
                        coupon.second_id = second_id
                        coupon.first_id = first_id
                        coupon.title = info['title']
                        coupon.price = info['raw_price']
                        coupon.url = info['url']
                        coupon.thumbnail_pic = info['thumbnail_pic']
                        if Goods.save_coupon(coupon):
                            log.logging.info(
                                '[INFO] {0} save to database ok'.format(
                                    coupon.title))
                        else:
                            log.logging.info(
                                '[INFO] {0} is existed'.format(coupon.title))
                    page += 1
                else:
                    log.logging.info('[ERROR] {0}'.format(resp.text))
            except Exception as e:
                # keep crawling the next page/url on any parse/save error
                log.logging.info('[ERROR] {0}'.format(e))
        # one url fully processed — remove it from the work file
        delete_line(LINKS_BIN, url)
def get_category_second(self, url):
    """Walk the third-level category links found on *url*.

    :param url: second-level category page, also the base for resolving
                relative hrefs
    For every child link (visited in reverse document order) the
    category id is extracted and its coupons are crawled.
    """
    time.sleep(10)
    try:
        resp = fetch(url, proxy=self.get_random_ip()).text()
    except RequestException as e:
        # proxy attempt failed — fall back to a direct request
        resp = fetch(url).text()
        log.logging.info('[warn] ineffective:{0}'.format(e))
    document = etree.HTML(resp)
    anchors = document.xpath('/html/body/div[2]/div[2]/div[3]/div/a')
    for anchor in reversed(anchors):
        result_url = urllib.parse.urljoin(url, anchor.attrib['href'])
        cate_id = self.get_id_for_url(result_url)
        self.get_coupon_info(result_url, cate_id)
def operation(self, url):
    """Scrape proxy addresses from *url* and save them to the database.

    :param url: page whose body contains ip:port pairs matched by
                PROXY_REGEX
    :return: False when the page could not be fetched; None otherwise
    """
    try:
        r = fetch(url)
    except requests.exceptions.RequestException:
        return False
    addresses = re.findall(PROXY_REGEX, r.text())
    for address in addresses:
        # named `created_at` instead of `time` so the local no longer
        # shadows the `time` module
        created_at = self.timestamp_to_date_str(self.get_time_now())
        try:
            CouponDB().save_ip(address, created_at)
        except Exception:
            # deliberate best-effort insert: duplicate rows / transient
            # db errors are ignored so one bad address can't stop the scan
            pass
def get_category(self, url):
    """Collect second-level category urls under *url* into links.bin.

    :param url: first-level category page, used as the join base for
                every relative href on the page
    Each qualifying child url is written to links.bin as
    "<child_url>-<self.first_id>" for later crawling.
    """
    try:
        resp = fetch(url).text
    except RequestException as e:
        resp = fetch(url).text
        log.logging.info('[warn] ineffective:{0}'.format(e))
    html = etree.HTML(resp)
    childs = html.xpath('/html/body/div[4]/div[1]/div/a')
    for rc in childs[::-1]:
        log.logging.info('[INFO] Get url: {0} >>> {1}'.format(
            rc.attrib['href'], rc.text))
        # join against the original base url every time: the old code
        # reassigned `url`, so later hrefs were resolved against the
        # previous child's url instead of the category page
        child_url = urllib.parse.urljoin(url, rc.attrib['href'])
        url_join = child_url + "-" + self.first_id
        # persist the url so it can be crawled later
        if self.get_id_for_url(child_url):
            write_links(LINKS_BIN, url_join)
def get_category(self, url):
    """Walk the second-level category links found on *url*.

    :param url: first-level category page, the base for resolving
                relative hrefs
    For every child link the second-level id is stored on self and the
    third-level categories are crawled.
    """
    time.sleep(10)
    try:
        resp = fetch(url, proxy=self.get_random_ip()).text()
    except RequestException as e:
        # proxy attempt failed — fall back to a direct request
        resp = fetch(url).text()
        log.logging.info('[warn] ineffective:{0}'.format(e))
    html = etree.HTML(resp)
    childs = html.xpath('/html/body/div[2]/div[2]/div[2]/div/a')
    for rc in childs[::-1]:
        log.logging.info('[INFO] Get url: {0} >>> {1}'.format(
            rc.attrib['href'], rc.text))
        # resolve the child link exactly once: the old code joined the
        # href a second time against the already-joined url, and also
        # clobbered the base `url` for the remaining iterations
        child_url = urllib.parse.urljoin(url, rc.attrib['href'])
        self.second_id = self.get_id_for_url(child_url)
        self.get_category_second(child_url)
def init_category(self):
    """Fetch the site's top-level category links and crawl each one.

    :return: None
    For every first-level link its id is stored on self.first_id and
    get_category is invoked on the resolved absolute url.
    """
    log.logging.info('[INFO] Get category')
    url = self.site_url
    try:
        res = fetch(url, proxy=self.get_random_ip()).text()
    except RequestException as e:
        # proxy attempt failed — fall back to a direct request
        res = fetch(url).text()
        log.logging.info('[warn] ineffective:{0}'.format(e))
    html = etree.HTML(res)
    # anchors of the first-level categories
    root_brother = html.xpath("/html/body/div[2]/div[2]/div[1]/a")
    for rb in root_brother[::-1]:
        log.logging.info('[INFO] Get url: {0} >>> {1}'.format(
            rb.attrib['href'], rb.text))
        # resolve the link exactly once against the site url: the old
        # code joined the href a second time against the already-joined
        # url and overwrote the base for later iterations
        category_url = urllib.parse.urljoin(self.site_url, rb.attrib['href'])
        self.first_id = self.get_id_for_url(category_url)
        self.get_category(category_url)
def _fetch_listing(self, cate_id, page):
    """Fetch one listing page, first through a random proxy, then directly.

    :param cate_id: category id substituted into self.get_url
    :param page: zero-based page number
    :return: the response object, or None when both attempts fail
    """
    time.sleep(10)
    request_url = self.get_url.format(id=cate_id, page=page)
    try:
        return fetch(request_url, proxy=self.get_random_ip())
    except RequestException as e:
        log.logging.info('[warn] ineffective:{0}'.format(e))
    # direct fallback; the old code left it unprotected, so a second
    # failure crashed the crawl
    try:
        return fetch(request_url)
    except RequestException as e:
        log.logging.info('[warn] ineffective:{0}'.format(e))
        return None

def get_coupon_info(self, url, cate_id):
    """Page through the coupon API for *cate_id* and store every product.

    :param url: kept for caller compatibility; not used by this method
    :param cate_id: category id substituted into self.get_url
    The loop terminates when page N and page N+1 return identical
    bodies, which is how this API signals the end of the listing.
    """
    page = 0
    while True:
        resp = self._fetch_listing(cate_id, page)
        resp2 = self._fetch_listing(cate_id, page + 1)
        if resp is None or resp2 is None:
            # proxy and direct attempts both failed — stop this category
            log.logging.info('[ERROR] resp is None')
            break
        if resp.text() == resp2.text():
            log.logging.info('[INFO] Get {0} success'.format(cate_id))
            break
        try:
            data = json.loads(resp.text()).get('data')
            if data:
                log.logging.info('[INFO]page {0}'.format(page))
                coupon = Coupon()
                for info in data['product']:
                    coupon.category_id = cate_id
                    coupon.second_id = self.second_id
                    coupon.first_id = self.first_id
                    coupon.title = info['title']
                    coupon.price = info['price']
                    coupon.url = info['url']
                    coupon.pic = info['pic']
                    coupon.goods_desc = info['desc']
                    coupon.brand = info['brand']['name']
                    coupon.add_time = self.timestamp_to_date_str(
                        self.get_time_now())
                    print(info['title'])
                    if self.coupon_db.save_coupon(coupon):
                        log.logging.info(
                            '[INFO] {0} save to database ok'.format(coupon))
                    else:
                        log.logging.info(
                            '[INFO] {0} is existed'.format(coupon))
                page += 1
            else:
                log.logging.info('[ERROR] {0}'.format(resp.text()))
        except Exception as e:
            # keep paging on any parse/save error
            log.logging.info('[ERROR] {0}'.format(e))