def check_proxy(p):
    """Probe a single proxy record and discard it when the probe fails.

    ``p`` is indexed with ``'ip'`` (handed to ``fetch`` as the proxy) and
    ``'id'`` (handed to ``CouponDB().delete_ip`` when the request raises
    ``RequestException``). The record and any error are echoed to stdout.
    """
    print(p)
    try:
        fetch('http://baidu.com', proxy=p['ip'])
    except RequestException as err:
        # The request through this proxy failed: report it and remove the
        # proxy's row so it is not picked again.
        print(err)
        CouponDB().delete_ip(p['id'])
Beispiel #2
0
    def get_coupon_info(self):
        '''
        Fetch and store the coupons for every category link queued in LINKS_BIN.

        Each queued line is expected to have the form ``<url>-<first_id>``.
        For every line the category id is derived from the URL, the paged
        endpoint ``self.get_url`` is requested page by page, and each entry of
        ``data.coupon_list`` is passed to ``Goods.save_coupon``. Paging stops
        when the body starts with "<" or the coupon list is empty; the
        processed URL is then removed from LINKS_BIN.

        :return: None
        '''
        # Load the pending category URLs from links.bin.
        urls = read_file_to_url(LINKS_BIN)
        for line in urls:
            # Split on the LAST hyphen only: the URL part may itself contain
            # hyphens, and a plain split("-") would truncate it.
            parts = line.replace("\n", "").rsplit("-", 1)
            url = parts[0]
            first_id = parts[1]
            second_id = self.get_id_for_url(url)
            page = 0
            while True:
                page_url = self.get_url.format(id=second_id, page=page)
                try:
                    resp = fetch(page_url)
                except RequestException as e:
                    # One unguarded retry; a second failure propagates.
                    resp = fetch(page_url)
                    log.logging.info('[warn] ineffective:{0}'.format(e))

                # Parse the JSON body once per page. A body starting with "<"
                # is treated as "no more pages" and is never parsed as JSON.
                is_markup = resp.text[0] == "<"
                data = None if is_markup else resp.json().get('data')
                if is_markup or len(data['coupon_list']) == CouponList.ZERO:
                    log.logging.info(
                        '[INFO] Get {0} success'.format(second_id))
                    break

                if not resp:
                    log.logging.info('[ERROR] resp is None')
                    continue

                # NOTE(review): `page` only advances after a page was stored
                # without an exception, so a persistently failing page is
                # re-requested indefinitely -- confirm this is intended.
                try:
                    if data:
                        log.logging.info('[INFO]page {0}'.format(page))
                        coupon = Coupon()
                        for info in data['coupon_list']:
                            coupon.second_id = second_id
                            coupon.first_id = first_id
                            coupon.title = info['title']
                            coupon.price = info['raw_price']
                            coupon.url = info['url']
                            coupon.thumbnail_pic = info['thumbnail_pic']
                            if Goods.save_coupon(coupon):
                                log.logging.info(
                                    '[INFO] {0} save to database ok'.format(
                                        coupon.title))
                            else:
                                log.logging.info(
                                    '[INFO] {0} is existed'.format(
                                        coupon.title))
                        page += 1
                    else:
                        log.logging.info('[ERROR] {0}'.format(resp.text))
                except Exception as e:
                    log.logging.info('[ERROR] {0}'.format(e))
            # Once a URL has been fully processed, remove it from the file.
            delete_line(LINKS_BIN, url)
Beispiel #3
0
    def get_category_second(self, url):
        '''
        Visit the third-level category links found on a category page.

        The page is downloaded (through a random proxy first, directly if
        that raises ``RequestException``), its category anchors are read in
        reverse document order, and ``get_coupon_info`` is called for each
        resolved link together with the id extracted from it.

        :param url: URL of the page whose category links are followed
        :return: None
        '''
        time.sleep(10)
        try:
            page_source = fetch(url, proxy=self.get_random_ip()).text()
        except RequestException as err:
            # Proxy attempt failed: retry once without a proxy.
            page_source = fetch(url).text()
            log.logging.info('[warn] ineffective:{0}'.format(err))

        document = etree.HTML(page_source)
        anchors = document.xpath('/html/body/div[2]/div[2]/div[3]/div/a')
        for anchor in reversed(anchors):
            result_url = urllib.parse.urljoin(url, anchor.attrib['href'])
            self.get_coupon_info(result_url, self.get_id_for_url(result_url))
 def operation(self, url):
     """Download ``url`` and store every proxy address found in its body.

     Addresses are extracted with ``PROXY_REGEX`` and saved one by one via
     ``CouponDB().save_ip`` together with the current time string.

     Returns ``False`` when the download raises a requests exception;
     otherwise falls off the end and returns ``None``.
     """
     try:
         response = fetch(url)
     except requests.exceptions.RequestException:
         return False

     for address in re.findall(PROXY_REGEX, response.text()):
         saved_at = self.timestamp_to_date_str(self.get_time_now())
         try:
             CouponDB().save_ip(address, saved_at)
         except Exception:
             # Best effort: a failed save is ignored and the loop moves on.
             pass
Beispiel #5
0
    def get_category(self, url):
        '''
        Collect the sub-category links of a first-level category page.

        The page is downloaded (retried once if ``RequestException`` is
        raised), its category anchors are read in reverse document order, and
        every link for which ``get_id_for_url`` returns a truthy id is
        written to LINKS_BIN as ``<url>-<first_id>``.

        :param url: URL of the first-level category page
        :return: None
        '''
        try:
            resp = fetch(url).text
        except RequestException as e:
            # One unguarded retry; a second failure propagates.
            resp = fetch(url).text
            log.logging.info('[warn] ineffective:{0}'.format(e))
        html = etree.HTML(resp)
        childs = html.xpath('/html/body/div[4]/div[1]/div/a')

        for rc in childs[::-1]:
            href = rc.attrib['href']
            log.logging.info('[INFO] Get url: {0} >>> {1}'.format(
                href, rc.text))
            # Always resolve against the page URL. Rebinding `url` here would
            # make later relative hrefs resolve against the previous child.
            child_url = urllib.parse.urljoin(url, href)
            link_entry = child_url + "-" + self.first_id
            # Queue the extracted link in links.bin for the coupon crawl.
            if self.get_id_for_url(child_url):
                write_links(LINKS_BIN, link_entry)
Beispiel #6
0
    def get_category(self, url):
        '''
        Walk the second-level categories of a first-level category page.

        The page is downloaded (through a random proxy first, directly if
        that raises ``RequestException``). For each category anchor, in
        reverse document order, ``self.second_id`` is set from the resolved
        link and ``get_category_second`` is called on it.

        :param url: URL of the first-level category page
        :return: None
        '''
        time.sleep(10)
        try:
            resp = fetch(url, proxy=self.get_random_ip()).text()
        except RequestException as e:
            # Proxy attempt failed: retry once without a proxy.
            resp = fetch(url).text()
            log.logging.info('[warn] ineffective:{0}'.format(e))
        html = etree.HTML(resp)
        childs = html.xpath('/html/body/div[2]/div[2]/div[2]/div/a')

        for rc in childs[::-1]:
            href = rc.attrib['href']
            log.logging.info('[INFO] Get url: {0} >>> {1}'.format(
                href, rc.text))
            # Resolve once, against the page URL. Rebinding `url` and joining
            # the href a second time breaks relative hrefs.
            child_url = urllib.parse.urljoin(url, href)
            self.second_id = self.get_id_for_url(child_url)
            self.get_category_second(child_url)
Beispiel #7
0
 def init_category(self):
     '''
     Walk every first-level category linked from ``self.site_url``.

     The site page is downloaded (through a random proxy first, directly if
     that raises ``RequestException``). For each first-level anchor, in
     reverse document order, ``self.first_id`` is set from the resolved link
     and ``get_category`` is called on it.

     :return: None
     '''
     log.logging.info('[INFO] Get category')
     url = self.site_url
     try:
         res = fetch(url, proxy=self.get_random_ip()).text()
     except RequestException as e:
         # Proxy attempt failed: retry once without a proxy.
         res = fetch(url).text()
         log.logging.info('[warn] ineffective:{0}'.format(e))
     html = etree.HTML(res)
     # First-level category anchors.
     root_brother = html.xpath("/html/body/div[2]/div[2]/div[1]/a")
     # Visit the request URL of every first-level category.
     for rb in root_brother[::-1]:
         href = rb.attrib['href']
         log.logging.info('[INFO] Get url: {0} >>> {1}'.format(
             href, rb.text))
         # Resolve once, against the site URL. Rebinding `url` and joining
         # the href a second time breaks relative hrefs.
         category_url = urllib.parse.urljoin(url, href)
         self.first_id = self.get_id_for_url(category_url)
         self.get_category(category_url)
Beispiel #8
0
    def get_coupon_info(self, url, cate_id):
        '''
        Page through the products of one category and store each of them.

        Every round downloads page ``page`` and page ``page + 1`` of
        ``self.get_url``; identical bodies end the crawl. Otherwise each
        entry of ``data.product`` on the current page is copied into a
        ``Coupon`` and passed to ``self.coupon_db.save_coupon``.

        :param url: request URL (not read by this method's body)
        :param cate_id: category id substituted into ``self.get_url``
        :return: None
        '''

        def load_page(number):
            # Wait, then try through a random proxy; fall back to a direct
            # request if the proxied one raises RequestException.
            time.sleep(10)
            target = self.get_url.format(id=cate_id, page=number)
            try:
                return fetch(target, proxy=self.get_random_ip())
            except RequestException as err:
                direct = fetch(target)
                log.logging.info('[warn] ineffective:{0}'.format(err))
                return direct

        page = 0
        while True:
            current = load_page(page)
            following = load_page(page + 1)

            # Two consecutive pages with the same body: nothing new to read.
            if current.text() == following.text():
                log.logging.info('[INFO] Get {0} success'.format(cate_id))
                break

            if not current:
                log.logging.info('[ERROR] resp is None')
                continue

            # NOTE(review): `page` only advances after a page was stored
            # without an exception, so a failing page is requested again on
            # the next round -- confirm this is intended.
            try:
                data = json.loads(current.text()).get('data')
                if data:
                    log.logging.info('[INFO]page {0}'.format(page))
                    coupon = Coupon()
                    for info in data['product']:
                        coupon.category_id = cate_id
                        coupon.second_id = self.second_id
                        coupon.first_id = self.first_id
                        coupon.title = info['title']
                        coupon.price = info['price']
                        coupon.url = info['url']
                        coupon.pic = info['pic']
                        coupon.goods_desc = info['desc']
                        coupon.brand = info['brand']['name']
                        coupon.add_time = self.timestamp_to_date_str(
                            self.get_time_now())
                        print(info['title'])
                        if self.coupon_db.save_coupon(coupon):
                            message = '[INFO] {0} save to database ok'
                        else:
                            message = '[INFO] {0} is existed'
                        log.logging.info(message.format(coupon))
                    page += 1
                else:
                    log.logging.info('[ERROR] {0}'.format(current.text()))
            except Exception as err:
                log.logging.info('[ERROR] {0}'.format(err))