    def get_p_info_list(self, p_info_url):
        '''
        Get the detailed properties info.
        :param p_info_url:
        :return: a list
        '''
        # use requests
        p_info_body = MyRequests.get_url_body(url=p_info_url, headers=self.headers)
        if p_info_body == '':
            print('The fetched p_info_body is empty, skipping!')
            p_info_body = '{}'

        try:
            p_info_data = json.loads(p_info_body)
            tmp_p_info = p_info_data.get('perportieslist', [])
        except Exception:
            self.result_data = {}  # reset to avoid polluting later crawls when saving
            tmp_p_info = []

        if tmp_p_info != []:
            p_info = [{
                'p_name': item.get('name', ''),
                'p_value': item.get('value'),
            } for item in tmp_p_info]
        else:
            p_info = tmp_p_info

        return p_info
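
Every example on this page calls MyRequests.get_url_body, whose definition is not shown. Below is a minimal sketch of what such a wrapper plausibly looks like, based only on how the examples call it: it performs a GET with optional params, retries a few times, and returns the response text or '' on failure. The keyword arguments (had_referer, timeout, num_retries) are taken from the call sites; the body itself is an assumption, not the original implementation.

import requests


class MyRequests(object):
    @staticmethod
    def get_url_body(url, headers, params=None, had_referer=False, timeout=14, num_retries=3):
        '''Assumed behaviour: return the response text on success, '' after exhausting retries.'''
        headers = dict(headers)
        if had_referer:
            # Assumption: had_referer adds a Referer header derived from the target url.
            headers.setdefault('Referer', url)
        for _ in range(num_retries):
            try:
                resp = requests.get(url, headers=headers, params=params, timeout=timeout)
                if resp.ok:
                    return resp.text
            except requests.RequestException:
                continue
        return ''
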
def get_aweme_api_videos_info():
    headers = {
        'accept-encoding':
        'gzip, deflate, br',
        'accept-language':
        'zh-CN,zh;q=0.9',
        'upgrade-insecure-requests':
        '1',
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'cache-control':
        'max-age=0',
        'authority':
        'www.douyin.com',
        'cookie':
        '_ba=BA0.2-20180330-5199e-OeUxtvwJvy5ElpWGFLId; _ga=GA1.2.390071767.1522391891; sso_login_status=1; tt_webid=6540458660484122126; __tea_sdk__user_unique_id=10_; __tea_sdk__ssid=e88eef4a-ec1f-497d-b2c7-301239bfdc67; login_flag=d6ee54ffebe3021c3fb67ff863970736; sessionid=7bdfd0e36df78f38c25abd13f0eff3cc; uid_tt=644e532b271dae498b62c659de17afdf; sid_tt=7bdfd0e36df78f38c25abd13f0eff3cc; sid_guard="7bdfd0e36df78f38c25abd13f0eff3cc|1522819290|2591999|Fri\\054 04-May-2018 05:21:29 GMT"',
    }

    params = (
        ('user_id', '94470216810'),
        ('max_cursor', '0'),
        ('count', '20'),
    )

    url = 'https://www.douyin.com/aweme/v1/aweme/post/'
    body = MyRequests.get_url_body(url=url, headers=headers, params=params)
    # print(body)

    deal_with_data(body=body)
Example #3
    def get_all_img_url(self, goods_id, is_hk):
        '''
        Get all_img_url.
        :param goods_id:
        :param is_hk:
        :return:
        '''
        if is_hk is True:  # cross-border (global) purchase
            tmp_url_2 = 'https://www.miyabaobei.hk/item-' + str(
                goods_id) + '.html'
        else:
            tmp_url_2 = 'https://www.mia.com/item-' + str(goods_id) + '.html'

        tmp_body_2 = MyRequests.get_url_body(url=tmp_url_2,
                                             headers=self.headers,
                                             had_referer=True)
        # print(Selector(text=tmp_body_2).css('div.small').extract())

        if tmp_body_2 == '':
            print('The requested tmp_body_2 is empty, skipping for now!')
            return ''

        all_img_url = []
        for item in Selector(text=tmp_body_2).css('div.small img').extract():
            # print(item)
            tmp_img_url = Selector(
                text=item).css('img::attr("src")').extract_first()
            all_img_url.append({'img_url': tmp_img_url})

        return all_img_url
Example #4
    def _deal_with_every_article(self):
        home_articles_link_list = self._get_xiaohongshu_home_aritles_info()
        pprint(home_articles_link_list)

        for item in home_articles_link_list:  # eg: [{'id': '5b311bfc910cf67e693d273e','share_link': 'https://www.xiaohongshu.com/discovery/item/5b311bfc910cf67e693d273e'},...]
            article_id = item.get('id', '')
            article_link = item.get('article_link', '')

            if article_link != '':
                body = MyRequests.get_url_body(url=article_link,
                                               headers=self.headers)
                try:
                    article_info = re.compile(
                        'window.__INITIAL_SSR_STATE__=(.*?)</script>').findall(
                            body)[0]
                except IndexError:
                    self.my_lg.error('IndexError while extracting article_info! Please check!')
                    sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
                    continue

                article_info = self._wash_article_info(
                    self.json_2_dict(article_info))
                pprint(article_info)
                sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            else:
                pass
Example #5
    async def _get_target_url_and_content_id_and_csid(self, taobao_short_url):
        '''
        From the given Taobao share short link, get target_url, content_id, csid (plus tag_name and tag).
        :param taobao_short_url:
        :return: (target_url, content_id, csid, tag_name, tag)
        '''
        if re.compile(r'contentId').findall(taobao_short_url) != []:
            # already the target url, no redirect needed
            target_url = taobao_short_url

        else:
            body = MyRequests.get_url_body(url=taobao_short_url,
                                           headers=self.headers)
            # self.my_lg.info(str(body))
            if body == '':
                self.my_lg.error('The fetched body is empty, offending short url: {0}'.format(
                    str(taobao_short_url)))
                return '', '', '', '', ''

            try:
                # extract the target url from the short-link page
                target_url = re.compile('var url = \'(.*?)\';').findall(
                    body)[0]
                # self.my_lg.info(str(target_url))
            except IndexError:
                self.my_lg.error(
                    'IndexError while extracting target_url! Offending short url: {0}'.format(
                        str(taobao_short_url)))
                target_url = ''

        try:
            # extract contentId
            content_id = re.compile(r'contentId=(\d+)').findall(target_url)[0]
            # self.my_lg.info(content_id)
        except IndexError:
            self.my_lg.error('IndexError while extracting content_id! Offending short url: {0}'.format(
                str(taobao_short_url)))
            content_id = ''

        try:
            # extract csid
            csid = re.compile('csid%22%3A%22(.*?)%22%7D').findall(
                target_url)[0]
            # self.my_lg.info(csid)
        except IndexError:
            self.my_lg.info('This link has no csid...')
            # self.my_lg.error('IndexError while extracting csid! Offending short url: {0}'.format(str(taobao_short_url)))
            csid = ''

        try:
            tag_name = re.compile('tagName=(.*?)&').findall(target_url)[0]
        except IndexError:
            tag_name = ''

        try:
            tag = re.compile('tag=(.*?)&').findall(target_url)[0]
        except IndexError:
            tag = ''

        return target_url, content_id, csid, tag_name, tag
Example #6
    def get_jump_to_url_and_is_hk(self, body):
        '''
        Get the redirect url and is_hk.
        :param body: body of the url to be parsed
        :return: (body, sign_direct_url, is_hk) | types: str, str, bool
        '''
        if re.compile(r'_sign_direct_url = ').findall(
                body) != []:  # indicates a redirect; this usually happens for group-buy goods
            # a redirect is present
            try:
                sign_direct_url = re.compile(
                    r"_sign_direct_url = '(.*?)';").findall(body)[0]
                print('*** redirect url obtained: ', sign_direct_url)
            except IndexError:
                sign_direct_url = ''
                print('Error while extracting the redirect url!')

            body = MyRequests.get_url_body(url=sign_direct_url,
                                           headers=self.headers,
                                           had_referer=True)

            if re.compile(r'://m.miyabaobei.hk/').findall(
                    sign_direct_url) != []:
                # this is a cross-border (global purchase) item
                print('*** This item is a cross-border (global purchase) item!')
                is_hk = True
            else:
                is_hk = False

        else:
            is_hk = False
            sign_direct_url = ''

        return (body, sign_direct_url, is_hk)
Example #7
    def _get_taobao_goods_keywords_goods_id_list(self, keyword):
        '''
        Get the goods_id_list for this keyword.
        :param keyword: (id, keyword)
        :return: a list
        '''
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': HEADERS[randint(0, len(HEADERS)-1)],
            'accept': '*/*',
            # 'referer': 'https://s.taobao.com/search?q=%E8%BF%9E%E8%A1%A3%E8%A3%99%E5%A4%8F&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306',
            'authority': 's.taobao.com',
            # 'cookie': 't=70c4fb481898a67a66d437321f7b5cdf; cna=nbRZExTgqWsCAXPCa6QA5B86; l=AkFBuFEM2rj4GbU8Mjl3KsFo0YZa/7Vg; thw=cn; tracknick=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; _cc_=UIHiLt3xSw%3D%3D; tg=0; enc=OFbfiyN19GGi1GicxsjVmrZoFzlt9plbuviK5OuthXYfocqTD%2BL079G%2BIt4OMg6ZrbV4veSg5SQEpzuMUgLe0w%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; miid=763730917900964122; mt=ci%3D-1_1; linezing_session=i72FGC0gr3GTls7K7lswxen2_1527664168714VAPN_1; cookie2=1cf9585e0c6d98c72c64beac41a68107; v=0; _tb_token_=5ee03e566b165; uc1=cookie14=UoTeOZOVOtrsVw%3D%3D; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _m_h5_tk=14984d833a4647c13d4207c86d0dbd97_1528036508423; _m_h5_tk_enc=a8709d79a833625dc5c42b778ee7f1ee; JSESSIONID=F57610F0B34140EDC9F242BEA0F4800A; isg=BLm5VsJ0xr4M-pvu-R_LcQkeyCNTbqwVe7qvs9vvJODVYtj0JBZ5Sd704WaUEkWw',
        }

        # results are the Taobao keyword search ranked by sales volume
        params = (
            ('data-key', 'sort'),
            ('data-value', 'sale-desc'),
            ('ajax', 'true'),
            # ('_ksTS', '1528171408340_395'),
            ('callback', 'jsonp396'),
            ('q', keyword[1]),
            ('imgfile', ''),
            ('commend', 'all'),
            ('ssid', 's5-e'),
            ('search_type', 'item'),
            ('sourceId', 'tb.index'),
            # ('spm', 'a21bo.2017.201856-taobao-item.1'),
            ('ie', 'utf8'),
            # ('initiative_id', 'tbindexz_20170306'),
        )

        s_url = 'https://s.taobao.com/search'
        body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
        if body == '':
            return []
        else:
            try:
                data = re.compile(r'\((.*)\)').findall(body)[0]
            except IndexError:
                self.my_lg.error('Regex failed to extract the Taobao data, offending keyword: {0}'.format(keyword[1]))
                return []

            data = self.json_str_2_dict(json_str=data)
            if data == {}:
                self.my_lg.error('The fetched Taobao search data is an empty dict! Offending keyword: {0}'.format(keyword[1]))
                return []
            else:
                goods_id_list = data.get('mainInfo', {}).get('traceInfo', {}).get('traceData', {}).get('allNids', [])
                if goods_id_list is None or goods_id_list == []:
                    self.my_lg.error('The Taobao search goods_id_list is an empty list! Offending keyword: {0}'.format(keyword[1]))
                    return []
                else:
                    return goods_id_list
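
The search endpoint above returns a JSONP response (note the 'callback' parameter, 'jsonp396'), so the body has to be unwrapped with the '\((.*)\)' regex before it can be fed to the JSON parser. A tiny illustrative snippet of that unwrapping step; the sample string is made up, only its shape follows the keys the method reads:

import json
import re

sample = 'jsonp396({"mainInfo": {"traceInfo": {"traceData": {"allNids": ["1111", "2222"]}}}})'  # fabricated example body
payload = re.compile(r'\((.*)\)').findall(sample)[0]  # strip the jsonp396(...) wrapper
data = json.loads(payload)
print(data.get('mainInfo', {}).get('traceInfo', {}).get('traceData', {}).get('allNids', []))
# -> ['1111', '2222']
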
Example #8
    def get_true_sku_info(self, sku_info):
        '''
        Get the price, spec value and stock for every SKU.
        :param sku_info:
        :return: {} (empty dict) on error | (true_sku_info, i_s)
        '''
        goods_id_str = '-'.join([item.get('goods_id') for item in sku_info])
        # print(goods_id_str)
        tmp_url = 'https://p.mia.com/item/list/' + goods_id_str
        # print(tmp_url)

        tmp_body = MyRequests.get_url_body(url=tmp_url,
                                           headers=self.headers,
                                           had_referer=True)
        # print(tmp_body)

        try:
            tmp_data = json.loads(tmp_body).get('data', [])
            # pprint(tmp_data)
        except Exception as e:
            print('Error while json.loads-ing tmp_body!')
            tmp_data = []
            self.result_data = {}
            return {}

        true_sku_info = []
        i_s = {}
        for item_1 in sku_info:
            for item_2 in tmp_data:
                if item_1.get('goods_id') == str(item_2.get('id', '')):
                    i_s = item_2.get('i_s', {})
                    # print(i_s)
                    for item_3 in i_s.keys():
                        tmp = {}
                        if item_3 == 'SINGLE':
                            spec_value = item_1.get('color_name')
                        else:
                            spec_value = item_1.get(
                                'color_name') + '|' + item_3
                        normal_price = str(item_2.get('mp'))
                        detail_price = str(item_2.get('sp'))
                        img_url = item_1.get('img_url')
                        rest_number = i_s.get(item_3)
                        if rest_number == 0:
                            pass
                        else:
                            tmp['spec_value'] = spec_value
                            tmp['normal_price'] = normal_price
                            tmp['detail_price'] = detail_price
                            tmp['img_url'] = img_url
                            tmp['rest_number'] = rest_number
                            true_sku_info.append(tmp)

        return (true_sku_info, i_s)
Example #9
    def get_div_from_pc_div_url(self, url, goods_id):
        '''
        Simulate the PC description request and get the description div.
        :return: str
        '''
        t = str(time.time().__round__()) + str(randint(100, 999))  # time.time().__round__() rounds to whole seconds

        params_data_1 = {
            'id': goods_id,
            'type': '1',
        }

        tmp_url = 'https://api.m.taobao.com/h5/mtop.taobao.detail.getdesc/6.0/'
        _params = (
            ('appKey', '12574478'),
            ('t', t),
            ('api', 'mtop.taobao.detail.getdesc'),
            ('v', '6.0'),
            ('type', 'jsonp'),
            ('dataType', 'jsonp'),
            ('timeout', '20000'),
            ('callback', 'mtopjsonp1'),
            ('data', json.dumps(params_data_1)),
        )
        url = tmp_url + '?' + urlencode(_params)
        last_url = re.compile(r'\+').sub('', url)  # strip '+' to get the correct request url
        # self.my_lg.info(last_url)

        data = MyRequests.get_url_body(url=last_url, headers=self.headers, params=None, timeout=14, num_retries=3)
        if data == '':
            self.my_lg.error('The fetched div_desc is empty! Please check! Offending goods_id: {0}'.format(goods_id))
            return ''

        try:
            data = re.compile(r'mtopjsonp1\((.*)\)').findall(data)[0]  # greedy match to capture the whole payload
            # self.my_lg.info(str(data))
        except IndexError as e:
            self.my_lg.error('IndexError while extracting data! Offending goods_id: {0}'.format(goods_id))
            self.my_lg.exception(e)
            return ''

        try:
            data = json.loads(data)
            # pprint(data)
        except JSONDecodeError:
            self.my_lg.error('Error while json-decoding data, please check!')
            data = {}

        div = data.get('data', {}).get('pcDescContent', '')
        # self.my_lg.info(str(div))
        div = self.deal_with_div(div)
        # self.my_lg.info(div)

        return div
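
A small aside on the re.compile(r'\+').sub('', url) step above: json.dumps puts spaces after ':' and ',', and urlencode turns those spaces into '+', so stripping the '+' characters sends the mtop API a compact JSON blob. A short stand-alone illustration (the goods_id value is hypothetical):

import json
import re
from urllib.parse import urlencode

params_data_1 = {'id': '520557274710', 'type': '1'}  # hypothetical goods_id
qs = urlencode((('data', json.dumps(params_data_1)),))
print(qs)                             # the spaces from json.dumps show up as '+'
print(re.compile(r'\+').sub('', qs))  # compact form actually sent in the request
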
Example #10
def test_requests():
    url = 'https://superonesfazai.github.io/'
    start_time = time.time()
    for _ in range(200):
        body = MyRequests.get_url_body(url=url, headers=headers)
        if body != '':
            print('success')
        else:
            print(body)

    end_time = time.time()
    print('requests elapsed time:', end_time - start_time)
Example #11
    def _get_comment_data(self, goods_id):
        if goods_id == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| goods_id to process: %s' % str(goods_id))

        self.goods_id = goods_id
        self.headers.update({
            'referer':
            'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id),
        })

        # fetch from the JD mobile (m.jd.com) product comments endpoint
        _tmp_comment_list = []
        for current_page in range(1, 3):
            _url = 'https://item.m.jd.com/newComments/newCommentsDetail.json'

            params = self._set_params(goods_id=goods_id,
                                      current_page=current_page)
            body = MyRequests.get_url_body(url=_url,
                                           headers=self.headers,
                                           params=params)
            # self.my_lg.info(str(body))

            _data = self._json_2_dict(body).get('wareDetailComment',
                                                {}).get('commentInfoList', [])
            _tmp_comment_list += _data

            sleep(self.comment_page_switch_sleep_time)

        # pprint(_tmp_comment_list)
        try:
            _comment_list = self._get_comment_list(
                _tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.my_lg.error('Offending goods_id: {0}'.format(goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r
        # pprint(self.result_data)

        return self.result_data
def getRandomExternalLink(startingPage):
    html = MyRequests.get_url_body(url=startingPage, headers=headers)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print("没有外部链接,准备遍历整个网站")
        domain = urlparse(startingPage).scheme + "://" + urlparse(
            startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        return getRandomExternalLink(internalLinks[random.randint(
            0,
            len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]
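
getExternalLinks and getInternalLinks are used here (and in getAllExternalLinks further down) but are not part of this excerpt. A minimal sketch of helpers with matching signatures, assuming they walk the <a href> tags with BeautifulSoup and split links by whether they stay on the given domain; the real implementations may filter differently:

from urllib.parse import urlparse


def getInternalLinks(bsObj, includeUrl):
    # Links on the same site; relative links are made absolute against includeUrl.
    internalLinks = []
    for a in bsObj.find_all('a', href=True):
        href = a['href']
        if href.startswith('/'):
            href = includeUrl.rstrip('/') + href
        if urlparse(includeUrl).netloc in href and href not in internalLinks:
            internalLinks.append(href)
    return internalLinks


def getExternalLinks(bsObj, excludeUrl):
    # Absolute links pointing at a different host.
    externalLinks = []
    for a in bsObj.find_all('a', href=True):
        href = a['href']
        if href.startswith('http') and excludeUrl not in href and href not in externalLinks:
            externalLinks.append(href)
    return externalLinks
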
Example #13
    def _get_tmall_goods_keywords_goods_id_list(self, keyword):
        '''
        Get the top-selling Tmall goods for the keyword.
        :param keyword:
        :return: list eg: ['//detail.tmall.com/item.htm?id=566978017832&skuId=3606684772412', ...] (item urls, not goods_id)
        '''
        '''Approach: search on the Tmall mobile (m) site'''
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': HEADERS[randint(0, len(HEADERS)-1)],
            'accept': '*/*',
            # 'referer': 'https://list.tmall.com/search_product.htm?q=%B0%A2%B5%CF%B4%EF%CB%B9&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_suggest&sort=d',
            'authority': 'list.tmall.com',
            # 'cookie': 'cna=nbRZExTgqWsCAXPCa6QA5B86; _med=dw:1280&dh:800&pw:2560&ph:1600&ist:0; cq=ccp%3D1; hng=CN%7Czh-CN%7CCNY%7C156; lid=%E6%88%91%E6%98%AF%E5%B7%A5%E5%8F%B79527%E6%9C%AC%E4%BA%BA; enc=zIc9Cy5z0iS95tACxeX82fUsJdrekjC6%2BomP3kNKji1Z9RKwOt%2Fysyyewwf8twcytUGt2yT9AlAh5ASUlds05g%3D%3D; t=70c4fb481898a67a66d437321f7b5cdf; tracknick=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; _tb_token_=5ee03e566b165; cookie2=1cf9585e0c6d98c72c64beac41a68107; tt=tmall-main; pnm_cku822=098%23E1hvHpvUvbpvUvCkvvvvvjiPPFcvsjYnn2dvljEUPmP9sj1HPFsWtj3EP25ptj3PiQhvCvvvpZptvpvhvvCvpvhCvvOv9hCvvvmtvpvIvvCvxQvvvUgvvhVXvvvCxvvvBZZvvUhpvvChiQvv9Opvvho5vvmC3UyCvvOCvhEC0nkivpvUvvCCEppK6NOEvpCWvKXQwCzE%2BFuTRogRD76fdigqb64B9C97%2Bul1B5c6%2Bu0OVC61D70O58TJOymQD40OeutYon29V3Q7%2B3%2Busj7J%2Bu0OaokQD40OeutYLpGCvvpvvPMM; res=scroll%3A990*6982-client%3A472*680-offset%3A472*6982-screen%3A1280*800; _m_h5_tk=69794695b8eeb690d3ef037f6780d514_1529036786907; _m_h5_tk_enc=3e31314740c37d1fb14a26989cdac03c; isg=BN_f5lvy-LULYv0VwEkGMp59bjVjxpc1-mcB0nEsew7VAP6CeRTDNl2Gx5Z-nAte',
        }

        params = {
            'page_size': '20',
            'page_no': '1',
            'q': str(keyword[1]),
            'type': 'p',
            'spm': 'a220m.6910245.a2227oh.d100',
            'from': 'mallfp..m_1_suggest',
            'sort': 'd',
        }

        s_url = 'https://list.tmall.com/m/search_items.htm'
        body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
        # self.my_lg.info(str(body))
        if body == '':
            return []
        else:
            data = self.json_str_2_dict(json_str=body)
            if data == {}:
                self.my_lg.error('The fetched Tmall search data is an empty dict! Offending keyword: {0}'.format(keyword[1]))
                return []
            else:
                _ = data.get('item', [])
                if _ is None or _ == []:
                    self.my_lg.error('The Tmall search goods_id_list is an empty list! Offending keyword: {0}'.format(keyword[1]))
                    return []
                try:
                    goods_id_list = [str(item.get('url', '')) for item in _]
                except Exception as e:
                    self.my_lg.exception(e)
                    self.my_lg.error('The Tmall search goods_id_list is an empty list! Offending keyword: {0}'.format(keyword[1]))
                    return []

                return goods_id_list
    def _get_aweme_api_videos_info(self, user_id):
        self.user_id = user_id
        params = (
            ('user_id', self.user_id),
            ('max_cursor', '0'),
            ('count', '20'),
        )

        url = 'https://www.douyin.com/aweme/v1/aweme/post/'
        body = MyRequests.get_url_body(url=url,
                                       headers=self.headers,
                                       params=params)
        # print(body)

        self.deal_with_data(body=body)
Example #15
    def _get_pintuan_goods_info(self):
        '''
        Construct the data url and get all recent flash group-buy (pintuan) goods info.
        :return:
        '''
        pintuan_goods_id_list = []
        for page in range(0, 100):
            tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
                str(page)
            )
            print('Fetching page url: ', tmp_url)

            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
            if body == '': body = '{}'
            try:
                tmp_data = json.loads(body)
                tmp_data = tmp_data.get('data', {}).get('goods', [])
            except Exception:
                print('Error while json.loads-ing tmp_data!')
                tmp_data = []

            # print(tmp_data)
            sleep(.5)

            if tmp_data == []:
                print('The goods list from this tmp_url is empty, stopping here!')
                break

            tmp_pintuan_goods_id_list = [{
                'goods_id': item.get('goods_id', ''),
                'begin_time': timestamp_to_regulartime(int(item.get('start_time', ''))),
                'end_time': timestamp_to_regulartime(int(item.get('end_time', ''))),
                'all_sell_count': str(item.get('join_number_int', '')),
                'page': page,
            } for item in tmp_data]
            # print(tmp_pintuan_goods_id_list)

            for item in tmp_pintuan_goods_id_list:
                if item.get('goods_id', '') not in [item2.get('goods_id', '') for item2 in pintuan_goods_id_list]:
                    pintuan_goods_id_list.append(item)

        print('Total number of items in pintuan_goods_id_list: ', len(pintuan_goods_id_list))
        print(pintuan_goods_id_list)

        return pintuan_goods_id_list
    def get_div_desc_body(self, div_desc_url):
        '''
        Get the html of div_desc.
        :param div_desc_url:
        :return: data as a str, returns {} on error
        '''
        # use requests
        div_desc_body = MyRequests.get_url_body(url=div_desc_url,
                                                headers=self.headers)
        if div_desc_body == '':
            div_desc_body = '{}'

        # using phantomjs
        # div_desc_body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=div_desc_url)
        # # print(div_desc_body)
        # if div_desc_body == '':
        #     div_desc_body = '{}'
        # else:
        #     try:
        #         div_desc_body = re.compile(r'<body><pre .*?>(.*)</pre></body>').findall(div_desc_body)[0]
        #         div_desc_body = re.compile(r'&gt;').sub('>', div_desc_body)
        #         div_desc_body = re.compile(r'&lt;').sub('<', div_desc_body)
        #     except:
        #         div_desc_body = '{}'

        try:
            div_desc_data = json.loads(div_desc_body)
            tmp_body = div_desc_data.get('data', '')
        except Exception:
            self.result_data = {}  # reset to avoid polluting later crawls when saving
            tmp_body = ''

        # clean up the html
        tmp_body = re.compile(
            r'<div class=\"by_deliver\">.*?</div></div>').sub('', tmp_body)
        tmp_body = re.compile(r'src=.*? />').sub('/>', tmp_body)
        tmp_body = re.compile(r'data-url=').sub('src=\"', tmp_body)
        tmp_body = re.compile(r' />').sub(
            '\" style="height:auto;width:100%;"/>', tmp_body)

        if tmp_body != '':
            tmp_body = '<div>' + tmp_body + '</div>'

        return tmp_body
Example #17
    def get_one_page_goods_info(self, *params):
        '''
        Get the html of one page.
        :param params: the parameters to pass in
        :return: '{}' or str
        '''
        gender, page = params
        tmp_url = 'https://api.chuchujie.com/api/'

        client = {
            "ageGroup": "AG_0to24",
            "channel": "QD_web_webkit",
            "deviceId": "0",
            "gender": gender,  # '0' -> 女 | '1' -> 男
            "imei": "0",
            "packageName": "com.culiu.purchase",
            "platform": "wap",
            "sessionId": "0",
            "shopToken": "0",
            "userId": "0",
            "version": "1.0",
            "xingeToken": ""
        }

        query = {
            "group": 4,
            "module": "99",
            "page": page,
            "tab": "all"
        }

        # Note: Query String Parameters can simply be encoded and sent like this;
        # data that has to be POSTed would need a POST request instead
        data = {
            'client': json.dumps(client),
            'query': json.dumps(query),
            'page': page
        }

        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, params=data)
        if body == '':
            body = '{}'

        return body
Example #18
    def get_pintuan_goods_info(self):
        '''
        Construct the data url and get all recent flash group-buy (pintuan) goods info.
        :return: None
        '''
        goods_list = []
        for index in range(1, 1000):     # pages 0 and 1 return the same result, so start from 1
            tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(index) + '/0/'
            print('Fetching: ', tmp_url)

            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
            # print(body)

            if body == '':
                print('The fetched body is empty! Skipping')

            else:
                try:
                    tmp_data = json.loads(body)
                except Exception:
                    tmp_data = {}
                    print('Error while json.loads-ing body, skipping!')

                if tmp_data.get('data_list', []) == []:
                    print('The returned data_list is [], stopping here!')
                    break

                else:
                    # print(tmp_data)
                    data_list = [{
                        'goods_id': item.get('sku', ''),
                        'sub_title': item.get('intro', ''),
                        'pid': index,
                    } for item in tmp_data.get('data_list', [])]
                    # pprint(data_list)

                    for item in data_list:
                        goods_list.append(item)
                    sleep(.5)

        pprint(goods_list)
        self.deal_with_data(goods_list=goods_list)
        sleep(8)
        return None
Example #19
    def traversal_hour_timestamp(self, item):
        '''
        Iterate over each required on-the-hour timestamp.
        :param item:
        :return:
        '''
        # first handle today's required on-the-hour timestamp
        tmp_url = 'https://qiang.mogujie.com//jsonp/fastBuyListActionLet/1?eventTime={0}&bizKey=rush_main'.format(
            str(item))
        body = MyRequests.get_url_body(url=tmp_url,
                                       headers=self.headers,
                                       had_referer=True)
        # print(body)

        if body == '':
            print('item: ', item)
            print('The fetched body is empty! Skipping')

        else:
            try:
                body = re.compile(r'null\((.*)\)').findall(body)[0]
            except Exception:
                print('Regex failed to match the data in body!')
                body = '{}'

            try:
                tmp_data = json.loads(body)
            except Exception:
                print('Error while json.loads-ing body, skipping!')
                tmp_data = {}

            if tmp_data == {}:
                print('tmp_data is an empty {}!')
                pass
            else:
                # pprint(tmp_data)
                # print(tmp_data)

                event_time = item
                item_list = tmp_data.get('data', {}).get('list', [])

                self.deal_with_data(event_time, item_list)
                sleep(MOGUJIE_SLEEP_TIME)
    def get_stock_info_dict(self, stock_info_url):
        '''
        Get the real-time stock info.
        :param stock_info_url:
        :return: a dict
        '''
        stock_info_body = MyRequests.get_url_body(url=stock_info_url, headers=self.headers)
        if stock_info_body == '':
            print('The fetched stock_info_body is empty!')
            stock_info_body = '{}'

        try:
            stock_info_data = json.loads(stock_info_body)
            tmp_stock_info = stock_info_data.get('data', {})
        except Exception:
            self.result_data = {}  # reset to avoid polluting later crawls when saving
            tmp_stock_info = {}

        return tmp_stock_info
Example #21
    def get_spike_hour_goods_info(self):
        '''
        Construct the data url and get all recent flash-sale (seckill) goods info.
        :return:
        '''
        mia_base_number = MIA_BASE_NUMBER
        while mia_base_number < MIA_MAX_NUMBER:
            tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
                mia_base_number)

            body = MyRequests.get_url_body(url=tmp_url,
                                           headers=self.headers,
                                           had_referer=True)
            # print(body)

            if body == '' or body == '[]':
                print('mia_base_number: ', mia_base_number)
                print('The fetched body is empty! Skipping')

            else:
                try:
                    tmp_data = json.loads(body)
                except Exception:
                    tmp_data = {}
                    print('Error while json.loads-ing body, skipping!')
                tmp_hour = tmp_data.get('p_info', {}).get('start_time',
                                                          '')[11:13]
                if tmp_hour == '22':  # filter out flash sales that start at 22:00
                    print('--- sale time is 22:00, not crawling!')
                    pass
                else:
                    print(tmp_data)
                    print('mia_base_number: ', mia_base_number)
                    pid = mia_base_number
                    begin_time = tmp_data.get('p_info',
                                              {}).get('start_time', '')
                    end_time = tmp_data.get('p_info', {}).get('end_time', '')
                    item_list = tmp_data.get('item_list', [])

                    self.deal_with_data(pid, begin_time, end_time, item_list)

            sleep(.35)
            mia_base_number += 1
    def get_item_list(self, event_time):
        '''
        Get all goods info in event_time.
        :param event_time:
        :return: item_list, a list
        '''
        tmp_url = 'https://qiang.mogujie.com//jsonp/fastBuyListActionLet/1?eventTime={0}&bizKey=rush_main'.format(
            str(event_time))
        body = MyRequests.get_url_body(url=tmp_url,
                                       headers=self.headers,
                                       had_referer=True)
        # print(body)

        if body == '':
            print('The fetched body is empty! Skipping')
            item_list = ''

        else:
            try:
                body = re.compile(r'null\((.*)\)').findall(body)[0]
            except Exception:
                print('Regex failed to match the data in body!')
                body = '{}'

            try:
                tmp_data = json.loads(body)
            except Exception:
                tmp_data = {}
                print('Error while json.loads-ing body, skipping!')

            if tmp_data == {}:
                print('tmp_data is an empty {}!')
                item_list = []

            else:
                # pprint(tmp_data)
                # print(tmp_data)

                item_list = tmp_data.get('data', {}).get('list', [])
        sleep(.5)

        return item_list
    def get_one_page_all_goods_list(self, *params):
        '''
        Get the full goods list for one page url.
        :return: str | list
        '''
        page = params[0]
        all_goods_list = []
        tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(
            str(page))
        # print('Fetching page:', page, ', api url: ', tmp_url)
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        # print(body)

        try:
            json_body = json.loads(body)
            # print(json_body)
        except Exception:
            print('Error while json.loads-ing body! Please check')
            json_body = {}
            return 'network error!'

        this_page_item_list = json_body.get('item_list', [])
        if this_page_item_list == []:
            return []

        for item in this_page_item_list:
            if item.get('item_id', '') not in [
                    item_1.get('item_id', '') for item_1 in all_goods_list
            ]:
                item['page'] = page
                all_goods_list.append(item)

        # sleep(.5)

        all_goods_list = [{
            'goods_id': str(item.get('item_id', '')),
            'type': item.get('type', ''),
            'page': item.get('page')
        } for item in all_goods_list if item.get('item_id') is not None]

        return all_goods_list
Example #24
    def _get_pintuan_goods_info(self):
        '''
        Construct the data url and get all recent flash group-buy (pintuan) goods info.
        :return:
        '''
        zid_list = []
        for page in range(0, 100):
            tmp_url = 'https://pina.m.zhe800.com/nnc/list/deals.json?page={0}&size=500'.format(
                str(page))
            print('Fetching page url: ', tmp_url)

            tmp_body = MyRequests.get_url_body(url=tmp_url,
                                               headers=self.headers)
            if tmp_body == '':
                tmp_body = '{}'
            try:
                tmp_data = json.loads(tmp_body)
                tmp_data = tmp_data.get('objects', [])
            except Exception:
                print('Error while json.loads-ing tmp_data!')
                tmp_data = []
            # print(tmp_data)

            if tmp_data == []:
                print('The objects list from this tmp_url is empty, stopping here!')
                break

            tmp_zid_list = [(item.get('product', {}).get('zid', ''), page)
                            for item in tmp_data]
            # print(tmp_zid_list)

            for item in tmp_zid_list:
                if item != '':
                    zid_list.append(item)

        zid_list = list(set(zid_list))
        print('Total number of items in zid_list: ', len(zid_list))
        print(zid_list)

        return zid_list
Example #25
    def _get_1688_goods_keywords_goods_id_list(self, keyword):
        '''
        Get info on the top-selling 1688 goods for the keyword.
        :param keyword:
        :return: a list eg: ['11111', ...]
        '''
        '''Approach 1: scrape the m.1688.com search page, keeping only the top-selling goods on the first page'''
        headers = {
            'authority': 'm.1688.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': HEADERS[randint(0, len(HEADERS)-1)],
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            # 'cookie': 'cna=nbRZExTgqWsCAXPCa6QA5B86; ali_ab=113.215.180.118.1523857816418.4; lid=%E6%88%91%E6%98%AF%E5%B7%A5%E5%8F%B79527%E6%9C%AC%E4%BA%BA; _csrf_token=1528708263870; JSESSIONID=9L783sX92-8iXZBHLCgK4fJiFKG9-W66WeuQ-BRgo4; hng=CN%7Czh-CN%7CCNY%7C156; t=70c4fb481898a67a66d437321f7b5cdf; _tb_token_=5ee03e566b165; __cn_logon__=false; h_keys="aa#2018%u5973%u88c5t%u6064"; alicnweb=homeIdttS%3D38414563432175544705031886000168094537%7Ctouch_tb_at%3D1528767881872%7ChomeIdttSAction%3Dtrue; ctoken=YnzGSFi23yEECqVO988Gzealot; _m_h5_tk=1cdad4dba1f1502fb29f57b3f73f5610_1528770803659; _m_h5_tk_enc=64259ec4fe4c33bc4555166994ed7b4d; __cn_logon__.sig=i6UL1cVhdIpbPPA_02yGiEyKMeZR2hBfnaoYK1CcrF4; ali_apache_id=11.182.158.193.1528768195886.327406.1; XSRF-TOKEN=b84fcec8-8bdf-41a5-a5c1-f8d6bfc9f83e; _tmp_ck_0=IlQ2M6x9F5xTkEpGRay66FVl%2BBaIEY076xELE8UtaLcz%2BgR%2FJ2UZOfDeKILA7R2VgXEJ7VYCkEQjS1RcUCwfL%2Br8ZFi0vwyVwyNpQsD2QG0HaihwedkkF9Cp9Ww0Jr%2BZF4la9CTe0AY8d1E1lDF91tD7lMAKIGVSne3V95CfI8VzpiWJ415B1IA0cc9J6IpYzn0mT1xLYnXcBAkDq0gop74NaynWIxw%2BLqmnXr%2BYU2bkOyMxZOBVY9B%2Bb0FU82h3TC9HCM8dGLnK2kxlgR%2B5lyT%2BCCFhhIX%2FioEMtA0TvDpXvRSUKoDTQG%2FCeJiKfy3LxMXmcTs5TBuWkh31F8nDCpLf6%2FlYOGkqeV1WLJeYXVe3SBvZC2O2JcYBQaKHcesETe%2FwTJL1fyc%3D; ad_prefer="2018/06/12 10:18:21"; webp=1; isg=BJWVxP7WYsuzzEf8vnJ3nRJEpJdFFdP4_0ZTRxc4b4wzbrxg3ONSdf5sPHJY2WFc; ali-ss=eyJ1c2VySWQiOm51bGwsImxvZ2luSWQiOm51bGwsInNpZCI6bnVsbCwiZWNvZGUiOm51bGwsIm1lbWJlcklkIjpudWxsLCJzZWNyZXQiOiJ5V3I0UVJGelVSVGp4dWs4aUxPWGl4dDIiLCJfZXhwaXJlIjoxNTI4ODU3MDE5ODMzLCJfbWF4QWdlIjo4NjQwMDAwMH0=; ali-ss.sig=z0qrG8Cj9BhDL_CLwTzgBGcdjSOXtp6YLxgDdTQRcWE',
        }

        params = (
            ('sortType', 'booked'),
            ('filtId', ''),
            ('keywords', keyword[1]),
            ('descendOrder', 'true'),
        )

        url = 'https://m.1688.com/offer_search/-6161.html'
        body = MyRequests.get_url_body(url=url, headers=headers, params=params)
        # self.my_lg.info(str(body))
        if body == '':
            return []
        else:
            try:
                goods_id_list = Selector(text=body).css('div.list_group-item::attr("data-offer-id")').extract()
                # pprint(goods_id_list)
            except Exception as e:
                self.my_lg.exception(e)
                self.my_lg.error('The 1688 search goods_id_list is an empty list! Offending keyword: {0}'.format(keyword[1]))
                goods_id_list = []

        return goods_id_list
def getAllExternalLinks(siteUrl):
    domain = urlparse(siteUrl).scheme + "://" + urlparse(siteUrl).netloc
    html = MyRequests.get_url_body(url=siteUrl, headers=headers)
    bsObj = BeautifulSoup(html, 'lxml')
    internalLinks = getInternalLinks(bsObj, domain)
    externalLinks = getExternalLinks(bsObj, domain)

    f = open('result.txt', 'w')
    # collect external links
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            # print(link)
            f.writelines(link + '\n')
            print("即将获取的外部链接的URL是:" + link)
            # 收集内链
    for link in internalLinks:
        if link not in allIntLinks:
            print("即将获取内部链接的URL是:" + link)
            allIntLinks.add(link)
            getAllExternalLinks(link)
            f.writelines(link + '\n')
Example #27
def run_forever():
    with open('./setting.txt', 'r') as f:
        start = int(f.readline())

    for index in range(start, 99999999999999999):
        if index % 50 == 0:
            with open('./setting.txt', 'w') as f:
                f.write(str(index))
            print('*** brief sleep...')
            sleep(2)

        video_id = str(int('65' + 17*'0') + index)  # '65' followed by seventeen zeros (6.5e18), plus the running index

        url = 'https://www.iesdouyin.com/share/video/' + video_id + '/'
        body = MyRequests.get_url_body(url=url, headers=headers, params=params)
        # print(body)

        if deal_with_data(video_id=video_id, body=body) is False:
            continue
        else:
            pass

        sleep(.2)
    def get_div_desc_body(self, div_desc_url):
        '''
        Get the html of div_desc.
        :param div_desc_url:
        :return: data as a str, returns {} on error
        '''
        # use requests
        div_desc_body = MyRequests.get_url_body(url=div_desc_url, headers=self.headers)
        if div_desc_body == '':
            div_desc_body = '{}'

        # using phantomjs
        # div_desc_body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=div_desc_url)
        # # print(div_desc_body)
        # if div_desc_body == '':
        #     div_desc_body = '{}'
        # else:
        #     try:
        #         div_desc_body = re.compile(r'<body><pre .*?>(.*)</pre></body>').findall(div_desc_body)[0]
        #         div_desc_body = re.compile(r'&gt;').sub('>', div_desc_body)
        #         div_desc_body = re.compile(r'&lt;').sub('<', div_desc_body)
        #     except:
        #         div_desc_body = '{}'

        try:
            div_desc_data = json.loads(div_desc_body)
            tmp_body = div_desc_data.get('data', '')
        except Exception:
            self.result_data = {}  # reset to avoid polluting later crawls when saving
            tmp_body = ''

        tmp_body = self._wash_div_desc(tmp_body=tmp_body)

        if tmp_body != '':
            tmp_body = '<div>' + tmp_body + '</div>'

        return tmp_body
Example #29
    def get_goods_data(self, goods_id):
        '''
        Construct the data url and fetch the corresponding data.
        :param goods_id:
        :return: data, a dict
        '''
        if goods_id == []:
            self.result_data = {}
            return {}

        goods_url = 'https://h5.jumei.com/product/detail?item_id=' + str(
            goods_id[0]) + '&type=' + str(goods_id[1])
        print('------>>>| corresponding mobile url: ', goods_url)

        #** fetch the data from the ajaxStaticDetail request
        tmp_url = 'https://h5.jumei.com/product/ajaxStaticDetail?item_id=' + goods_id[
            0] + '&type=' + str(goods_id[1])
        self.headers['Referer'] = goods_url
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        # print(body)

        if body == '':
            print('The fetched body is an empty str!')
            self.result_data = {}
            return {}

        try:
            tmp_data = json.loads(body)
            # pprint(tmp_data)
        except Exception:
            print('Error while json.loads-ing body! Please check!')
            self.result_data = {}
            return {}

        tmp_data = self.wash_data(data=tmp_data)
        # pprint(tmp_data)

        #** fetch the data from the ajaxDynamicDetail request
        tmp_url_2 = 'https://h5.jumei.com/product/ajaxDynamicDetail?item_id=' + str(
            goods_id[0]) + '&type=' + str(goods_id[1])
        body_2 = MyRequests.get_url_body(url=tmp_url_2, headers=self.headers)
        # print(body)
        if body_2 == '':
            print('The fetched body_2 is an empty str!')
            self.result_data = {}
            return {}

        try:
            tmp_data_2 = json.loads(body_2)
            # pprint(tmp_data_2)
        except Exception:
            print('Error while json.loads-ing body_2! Please check!')
            self.result_data = {}
            return {}
        tmp_data_2 = self.wash_data_2(data=tmp_data_2)
        # pprint(tmp_data_2)

        tmp_data['data_2'] = tmp_data_2.get('data', {}).get('result', {})
        if tmp_data['data_2'] == {}:
            print('The data from ajaxDynamicDetail is empty! Please check!')
            self.result_data = {}
            return {}

        # pprint(tmp_data)

        data = {}
        try:
            data['title'] = tmp_data.get('data', {}).get('name', '')
            data['sub_title'] = ''
            # print(data['title'])

            if data['title'] == '':
                print('The fetched title is empty, please check!')
                raise Exception

            # shop_name
            if tmp_data.get('data_2', {}).get('shop_info') == []:
                data['shop_name'] = ''
            else:
                data['shop_name'] = tmp_data.get('data_2', {}).get(
                    'shop_info', {}).get('store_title', '')
            # print(data['shop_name'])

            # get all sample images
            all_img_url = tmp_data.get('data',
                                       {}).get('image_url_set',
                                               {}).get('single_many', [])
            if all_img_url == []:
                print('The fetched all_img_url is an empty [], please check!')
                raise Exception
            else:
                all_img_url = [{
                    'img_url': item.get('800', ''),
                } for item in all_img_url]
            # pprint(all_img_url)
            data['all_img_url'] = all_img_url

            # get p_info
            p_info = self.get_p_info(tmp_data=tmp_data)
            # pprint(p_info)
            data['p_info'] = p_info

            # get the div_desc of each item
            # note: an item's div_desc = description + description_usage + description_images
            div_desc = self.get_goods_div_desc(tmp_data=tmp_data)
            # print(div_desc)
            if div_desc == '':
                print('The fetched div_desc is empty! Please check')
                raise Exception
            data['div_desc'] = div_desc
            '''
            on/off-shelf time (note: Jumei usually lists new goods at 10:00 each day, and the sale lasts 24 hours)
            '''
            sell_time = self.get_sell_time(
                begin_time=tmp_data.get('data_2', {}).get('start_time'),
                end_time=tmp_data.get('data_2', {}).get('end_time'))
            # pprint(sell_time)
            data['sell_time'] = sell_time

            # set detail_name_list
            detail_name_list = self.get_detail_name_list(
                size_attr=tmp_data.get('data_2', {}).get('size_attr', []))
            # print(detail_name_list)
            data['detail_name_list'] = detail_name_list
            '''
            get the price, spec and stock for every SKU
            '''
            true_sku_info = self.get_true_sku_info(
                size=tmp_data.get('data_2', {}).get('size', []))
            # pprint(true_sku_info)
            if true_sku_info == []:
                print('The fetched sku_info is empty, please check!')
                raise Exception
            else:
                data['price_info_list'] = true_sku_info
            '''
            is_delete
            '''
            if int(tmp_data.get('data_2', {}).get('end_time')) < int(
                    time.time()):
                is_delete = 1
            else:
                all_stock = 0
                for item in true_sku_info:
                    all_stock += item.get('rest_number', 0)
                # print(all_stock)
                if all_stock == 0:
                    is_delete = 1
                else:
                    is_delete = 0
            # print(is_delete)
            data['is_delete'] = is_delete

            # all_sell_count
            all_sell_count = tmp_data.get('data_2',
                                          {}).get('buyer_number', '0')
            data['all_sell_count'] = all_sell_count

        except Exception as e:
            print('Encountered the following error: ', e)
            self.result_data = {}  # reset to avoid polluting later crawls when saving
            return {}

        if data != {}:
            # pprint(data)
            self.result_data = data
            return data

        else:
            print('data is empty!')
            self.result_data = {}  # reset to avoid polluting later crawls when saving
            return {}
Example #30
    def run_forever(self):
        '''
        Update the data in real time.
        :return:
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = 'select goods_id, miaosha_time, pid from dbo.mia_pintuan where site_id=21'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> below are all the matching goods_id returned by the database <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('About to start the real-time update, please wait...'.center(100, '#'))
            index = 1

            for item in result:  # update the data in real time
                pintuan_end_time = json.loads(item[1]).get('end_time')
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)

                data = {}
                mia_pintuan = MiaPintuanParse()

                if index % 50 == 0:  # reconnect every 50 iterations to avoid a stale long-lived connection erroring out
                    print('Resetting and establishing a new database connection...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('New database connection established...')

                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(pintuan_end_time) == 0:
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0]))
                        print(
                            'Expired goods_id (%s)' % item[0],
                            ', pintuan begin time (%s), deleted successfully!' %
                            json.loads(item[1]).get('begin_time'))

                    elif self.is_recent_time(pintuan_end_time) == 2:
                        # break       # exit the loop
                        pass  # must be pass, not break, because the goods_id returned by the database are not necessarily in order

                    else:  # returned 1, i.e. within the update window
                        print(
                            '------>>>| goods_id being updated: (%s) | --------->>>@ index: (%d)'
                            % (item[0], index))
                        data['goods_id'] = item[0]
                        # print('------>>>| crawled data: ', data)

                        tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(
                            item[2]) + '/0/'
                        # print(tmp_url)

                        body = MyRequests.get_url_body(url=tmp_url,
                                                       headers=self.headers,
                                                       had_referer=True)

                        if body == '':
                            print('The fetched body is empty! Skipping')

                        else:
                            try:
                                tmp_data = json.loads(body)
                            except Exception:
                                tmp_data = {}
                                print('Error while json.loads-ing body, skipping!')

                            if tmp_data.get('data_list', []) == []:
                                print('The returned data_list is []!')
                                print('This item has been removed from the flash sale, deleting it here')
                                tmp_sql_server._delete_table(
                                    sql_str=self.delete_sql_str,
                                    params=(item[0]))
                                print('Delisted goods_id (%s)' % item[0], ', deleted successfully!')
                                pass

                            else:
                                data_list = [{
                                    'goods_id': item_2.get('sku', ''),
                                    'sub_title': item_2.get('intro', ''),
                                } for item_2 in tmp_data.get('data_list', [])]
                                # pprint(data_list)

                                pintuan_goods_all_goods_id = [
                                    item_1.get('goods_id', '')
                                    for item_1 in data_list
                                ]
                                # print(pintuan_goods_all_goods_id)
                                '''
                                Mia pintuan does not treat internally delisted items specially; every non-expired item is updated
                                (updating by pid: repeated checks showed items still in a group buy were often deleted by mistake)
                                '''
                                if item[0] not in pintuan_goods_all_goods_id:  # internally delisted
                                    # print('This item has been removed from the flash sale, deleting it here')
                                    # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                                    # print('Delisted goods_id (%s)' % item[0], ', deleted successfully!')
                                    # pass

                                    # update regardless
                                    mia_pintuan.get_goods_data(goods_id=item[0])
                                    goods_data = mia_pintuan.deal_with_data()

                                    if goods_data == {}:  # skip if the returned data is empty
                                        pass
                                    else:
                                        goods_data['goods_id'] = str(item[0])
                                        if goods_data['pintuan_time'] == {}:
                                            # no pintuan_time means the group buy has been delisted
                                            # (don't update it normally; set both begin and end time to now)
                                            now_time = get_shanghai_time()
                                            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = (
                                                now_time, now_time)
                                        else:
                                            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                                                self.get_pintuan_begin_time_and_pintuan_end_time(
                                                    pintuan_time=goods_data['pintuan_time'])

                                        # pprint(goods_data)
                                        # print(goods_data)
                                        mia_pintuan.update_mia_pintuan_table(
                                            data=goods_data,
                                            pipeline=tmp_sql_server)
                                        sleep(MIA_SPIKE_SLEEP_TIME)  # slow down

                                else:  # still listed
                                    for item_2 in data_list:
                                        if item_2.get('goods_id', '') == item[0]:
                                            mia_pintuan.get_goods_data(goods_id=item[0])
                                            goods_data = mia_pintuan.deal_with_data()

                                            if goods_data == {}:  # skip if the returned data is empty
                                                pass
                                            else:
                                                goods_data['goods_id'] = str(item[0])
                                                goods_data['sub_title'] = item_2.get('sub_title', '')
                                                if goods_data['pintuan_time'] == {}:
                                                    # no pintuan_time means the group buy has been delisted
                                                    now_time = get_shanghai_time()
                                                    goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = (
                                                        now_time, now_time)
                                                else:
                                                    goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                                                        self.get_pintuan_begin_time_and_pintuan_end_time(
                                                            pintuan_time=goods_data['pintuan_time'])

                                                # pprint(goods_data)
                                                # print(goods_data)
                                                mia_pintuan.update_mia_pintuan_table(
                                                    data=goods_data,
                                                    pipeline=tmp_sql_server)
                                                sleep(MIA_SPIKE_SLEEP_TIME)  # slow down
                                        else:
                                            pass

                else:  # the returned data is empty
                    print('Database connection failed; the database may be down or under maintenance')
                    pass
                index += 1
                gc.collect()
            print('All data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # don't update after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()