def get_p_info_list(self, p_info_url):
        '''
        Fetch the product's detail/property information.

        :param p_info_url: URL whose response body is parsed as JSON
        :return: a list of {'p_name': ..., 'p_value': ...} dicts; an empty
                 list when the body is empty, cannot be parsed, or carries
                 no 'perportieslist' entries
        '''
        raw_body = MyRequests.get_url_body(url=p_info_url, headers=self.headers)
        if raw_body == '':
            print('获取到的p_info_body为空值, 此处跳过!')
            # Substitute an empty JSON object so the parsing step still runs.
            raw_body = '{}'

        try:
            properties = json.loads(raw_body).get('perportieslist', [])
        except Exception:
            # Reset the stored result so a failed parse does not affect the
            # values assigned by later crawls.
            self.result_data = {}
            properties = []

        if properties == []:
            return properties

        return [
            {'p_name': entry.get('name', ''), 'p_value': entry.get('value')}
            for entry in properties
        ]
Example #2
0
    def get_jump_to_url_and_is_hk(self, body):
        '''
        Follow the redirect embedded in a page body (if any) and flag HK items.

        :param body: body of the page to inspect
        :return: (body, sign_direct_url, is_hk) | types: str, str, bool.
                 Without a redirect marker the body is returned untouched
                 together with '' and False.
        '''
        if re.compile(r'_sign_direct_url = ').findall(body) == []:
            # No redirect marker in the page.
            return (body, '', False)

        # A redirect marker is present (per the original note this usually
        # happens for group-buy goods).
        matches = re.compile(r"_sign_direct_url = '(.*?)';").findall(body)
        if matches:
            sign_direct_url = matches[0]
            print('*** 获取到跳转地址为: ', sign_direct_url)
        else:
            sign_direct_url = ''
            print('获取跳转的地址时出错!')

        # The request is issued even when no address could be extracted.
        body = MyRequests.get_url_body(url=sign_direct_url,
                                       headers=self.headers,
                                       had_referer=True)

        # A redirect to the .hk mobile host marks a "global purchase" item.
        is_hk = re.compile(r'://m.miyabaobei.hk/').findall(sign_direct_url) != []
        if is_hk:
            print('*** 此商品为全球购商品!')

        return (body, sign_direct_url, is_hk)
Example #3
0
    async def _get_target_url_and_content_id_and_csid(self, taobao_short_url):
        '''
        根据给与的淘宝分享短链接, 得到target_url, content_id, csid
        :param taobao_short_url:
        :return:
        '''
        if re.compile(r'contentId').findall(taobao_short_url) != []:
            # 先检查是否已为目标地址
            target_url = taobao_short_url

        else:
            body = MyRequests.get_url_body(url=taobao_short_url,
                                           headers=self.headers)
            # self.my_lg.info(str(body))
            if body == '':
                self.my_lg.error('获取到的body为空值, 出错短链接地址: {0}'.format(
                    str(taobao_short_url)))
                return '', '', ''

            try:
                # 获取短连接的目标地址
                target_url = re.compile('var url = \'(.*?)\';').findall(
                    body)[0]
                # self.my_lg.info(str(target_url))
            except IndexError:
                self.my_lg.error(
                    '获取target_url的时候IndexError! 出错短链接地址: {0}'.format(
                        str(taobao_short_url)))
                target_url = ''

        try:
            # 得到contentId
            content_id = re.compile('contentId=(\d+)').findall(target_url)[0]
            # self.my_lg.info(content_id)
        except IndexError:
            self.my_lg.error('获取content_id时IndexError! 出错短链接地址: {0}'.format(
                str(taobao_short_url)))
            content_id = ''

        try:
            # 得到csid
            csid = re.compile('csid%22%3A%22(.*?)%22%7D').findall(
                target_url)[0]
            # self.my_lg.info(csid)
        except IndexError:
            self.my_lg.info('此链接为无csid情况的链接...')
            # self.my_lg.error('获取csid时IndexError! 出错短链接地址: {0}'.format(str(taobao_short_url)))
            csid = ''

        try:
            tag_name = re.compile('tagName=(.*?)&').findall(target_url)[0]
        except IndexError:
            tag_name = ''

        try:
            tag = re.compile('tag=(.*?)&').findall(target_url)[0]
        except IndexError:
            tag = ''

        return target_url, content_id, csid, tag_name, tag
Example #4
0
    def get_all_img_url(self, goods_id, is_hk):
        '''
        Collect the image URLs found under 'div.small' on the item page.

        :param goods_id: id used to build the item page address
        :param is_hk: True selects the miyabaobei.hk host (global purchase),
                      anything else selects mia.com
        :return: list of {'img_url': ...} dicts, or '' when the page body is empty
        '''
        if is_hk is True:
            url_prefix = 'https://www.miyabaobei.hk/item-'
        else:
            url_prefix = 'https://www.mia.com/item-'
        page_url = url_prefix + str(goods_id) + '.html'

        page_body = MyRequests.get_url_body(url=page_url,
                                            headers=self.headers,
                                            had_referer=True)
        if page_body == '':
            print('请求tmp_body_2为空值, 此处先跳过!')
            return ''

        # Each <img> is serialised and re-parsed to read its src attribute.
        img_tags = Selector(text=page_body).css('div.small img').extract()
        return [
            {'img_url': Selector(text=tag).css('img::attr("src")').extract_first()}
            for tag in img_tags
        ]
Example #5
0
    def _deal_with_every_article(self):
        home_articles_link_list = self._get_xiaohongshu_home_aritles_info()
        pprint(home_articles_link_list)

        for item in home_articles_link_list:  # eg: [{'id': '5b311bfc910cf67e693d273e','share_link': 'https://www.xiaohongshu.com/discovery/item/5b311bfc910cf67e693d273e'},...]
            article_id = item.get('id', '')
            article_link = item.get('article_link', '')

            if article_link != '':
                body = MyRequests.get_url_body(url=article_link,
                                               headers=self.headers)
                try:
                    article_info = re.compile(
                        'window.__INITIAL_SSR_STATE__=(.*?)</script>').findall(
                            body)[0]
                except IndexError:
                    self.my_lg.error('获取article_info时IndexError!请检查!')
                    sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
                    continue

                article_info = self._wash_article_info(
                    self.json_2_dict(article_info))
                pprint(article_info)
                sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            else:
                pass
def get_aweme_api_videos_info():
    '''
    Request the post list of one hard-coded Douyin user and pass the body on.

    The response body is handed to deal_with_data(); nothing is returned.
    NOTE(review): the cookie below embeds what look like session identifiers
    in source - confirm they are expired or move them out of the code.
    '''
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'cache-control': 'max-age=0',
        'authority': 'www.douyin.com',
        'cookie': '_ba=BA0.2-20180330-5199e-OeUxtvwJvy5ElpWGFLId; _ga=GA1.2.390071767.1522391891; sso_login_status=1; tt_webid=6540458660484122126; __tea_sdk__user_unique_id=10_; __tea_sdk__ssid=e88eef4a-ec1f-497d-b2c7-301239bfdc67; login_flag=d6ee54ffebe3021c3fb67ff863970736; sessionid=7bdfd0e36df78f38c25abd13f0eff3cc; uid_tt=644e532b271dae498b62c659de17afdf; sid_tt=7bdfd0e36df78f38c25abd13f0eff3cc; sid_guard="7bdfd0e36df78f38c25abd13f0eff3cc|1522819290|2591999|Fri\\054 04-May-2018 05:21:29 GMT"',
    }

    # Query string: first page (cursor 0), 20 entries.
    params = (
        ('user_id', '94470216810'),
        ('max_cursor', '0'),
        ('count', '20'),
    )

    body = MyRequests.get_url_body(
        url='https://www.douyin.com/aweme/v1/aweme/post/',
        headers=headers,
        params=params)

    deal_with_data(body=body)
    def needIdenCode(self):
        '''
        Make a first login attempt and report whether a captcha is requested.

        :return: the decoded response text when it asks for a captcha,
                 False when it does not, None when the HTTP status is not 200
        '''
        response = requests.post(url=self.loginURL,
                                 headers=self.loginHeaders,
                                 data=json.dumps(self.postData),
                                 proxies=MyRequests._get_proxies())
        # The body is decoded as GBK before the status is inspected.
        content = response.content.decode('gbk')

        if response.status_code != 200:
            print("获取请求失败")
            return None

        print("获取请求成功")
        # The phrase to look for is "请输入验证码" ("please enter the captcha").
        # The previous pattern was the escape sequence with its backslashes
        # stripped, i.e. the literal text 'u8bf7u8f93...', which can never
        # match the actual characters. Match the real characters, and also
        # the escaped spelling with or without backslashes so anything the
        # old pattern matched is still matched.
        captcha_text = '\u8bf7\u8f93\u5165\u9a8c\u8bc1\u7801'
        escaped_form = r'\\?u8bf7\\?u8f93\\?u5165\\?u9a8c\\?u8bc1\\?u7801'
        pattern = re.compile(captcha_text + '|' + escaped_form, re.S)

        if pattern.search(content):
            print("此次安全验证异常,您需要输入验证码")
            return content

        print("此次安全验证通过,您这次不需要输入验证码")
        return False
Example #8
0
def my_requests():
    '''
    Render the open trade requests of the user stored in the session.

    Falls back to the 'no-trades' template when there are no requests or
    when loading them fails (the failure is logged).
    '''
    bsdb = get_bsdb()
    user = session['user_num']
    my_request = MyRequests(user, bsdb)
    try:
        open_rows = my_request.get_all_open_requests()
        requests_dicts = [dict(row) for row in open_rows]
        for trade in requests_dicts:
            print(trade['tradeAge'])
    except Exception:
        app.logger.error("Couldn't fill my-requests")
        requests_dicts = []

    if requests_dicts:
        return render_template('user/my-requests.html', requests=requests_dicts)
    return render_template('user/no-trades.html', no_sent_requests=True)
Example #9
0
    def _get_taobao_goods_keywords_goods_id_list(self, keyword):
        '''
        Search Taobao for a keyword (sorted by sales) and return the goods ids.

        :param keyword: (id, keyword) pair; only keyword[1] is used here
        :return: a list of goods ids; [] when the request, the JSONP
                 unwrapping or the parsing yields nothing
        '''
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': HEADERS[randint(0, len(HEADERS)-1)],
            'accept': '*/*',
            'authority': 's.taobao.com',
        }

        # Keyword search ordered by sales, descending.
        params = (
            ('data-key', 'sort'),
            ('data-value', 'sale-desc'),
            ('ajax', 'true'),
            ('callback', 'jsonp396'),
            ('q', keyword[1]),
            ('imgfile', ''),
            ('commend', 'all'),
            ('ssid', 's5-e'),
            ('search_type', 'item'),
            ('sourceId', 'tb.index'),
            ('ie', 'utf8'),
        )

        s_url = 'https://s.taobao.com/search'
        body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
        if body == '':
            return []

        # The payload is wrapped in a callback call; the greedy group takes
        # everything between the first '(' and the last ')'. A raw string is
        # used so '\(' is a regex escape rather than an invalid string escape.
        wrapped = re.compile(r'\((.*)\)').findall(body)
        if not wrapped:
            self.my_lg.error('re获取淘宝data时出错, 出错关键字为{0}'.format(keyword[1]))
            return []

        data = self.json_str_2_dict(json_str=wrapped[0])
        if data == {}:
            self.my_lg.error('获取到的淘宝搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
            return []

        goods_id_list = data.get('mainInfo', {}).get('traceInfo', {}).get('traceData', {}).get('allNids', [])
        if goods_id_list is None or goods_id_list == []:
            self.my_lg.error('获取淘宝搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
            return []

        return goods_id_list
Example #10
0
    def get_true_sku_info(self, sku_info):
        '''
        Build the price / spec / stock entry for every in-stock specification.

        :param sku_info: iterable of dicts read via 'goods_id', 'color_name'
                         and 'img_url'
        :return: {} when the response cannot be parsed (self.result_data is
                 reset too), otherwise (true_sku_info, i_s) where i_s is the
                 stock mapping of the last matched goods
        '''
        joined_ids = '-'.join([sku.get('goods_id') for sku in sku_info])
        list_url = 'https://p.mia.com/item/list/' + joined_ids

        response_body = MyRequests.get_url_body(url=list_url,
                                                headers=self.headers,
                                                had_referer=True)

        try:
            remote_items = json.loads(response_body).get('data', [])
        except Exception:
            print('json.loads转换tmp_body时出错!')
            self.result_data = {}
            return {}

        true_sku_info = []
        i_s = {}
        for sku in sku_info:
            for remote in remote_items:
                if sku.get('goods_id') != str(remote.get('id', '')):
                    continue

                i_s = remote.get('i_s', {})
                for size_key, rest_number in i_s.items():
                    # 'SINGLE' means there is no size dimension, so the
                    # colour name alone identifies the spec.
                    if size_key == 'SINGLE':
                        spec_value = sku.get('color_name')
                    else:
                        spec_value = sku.get('color_name') + '|' + size_key

                    # Specs without remaining stock are left out.
                    if rest_number == 0:
                        continue

                    true_sku_info.append({
                        'spec_value': spec_value,
                        'normal_price': str(remote.get('mp')),
                        'detail_price': str(remote.get('sp')),
                        'img_url': sku.get('img_url'),
                        'rest_number': rest_number,
                    })

        return (true_sku_info, i_s)
Example #11
0
    def get_div_from_pc_div_url(self, url, goods_id):
        '''
        Request the PC description through the mtop getdesc API and return its div.

        :param url: kept for interface compatibility; the request address is
                    rebuilt from goods_id, so this value is not used
        :param goods_id: id of the goods whose description is requested
        :return: str; '' when the request returns nothing or the JSONP
                 wrapper is missing, otherwise the result of deal_with_div()
        '''
        # Timestamp rounded to whole seconds, followed by three random digits.
        t = str(round(time.time())) + str(randint(100, 999))

        params_data_1 = {
            'id': goods_id,
            'type': '1',
        }

        tmp_url = 'https://api.m.taobao.com/h5/mtop.taobao.detail.getdesc/6.0/'
        _params = (
            ('appKey', '12574478'),
            ('t', t),
            ('api', 'mtop.taobao.detail.getdesc'),
            ('v', '6.0'),
            ('type', 'jsonp'),
            ('dataType', 'jsonp'),
            ('timeout', '20000'),
            ('callback', 'mtopjsonp1'),
            ('data', json.dumps(params_data_1)),
        )
        # urlencode turns the spaces produced by json.dumps into '+'; they
        # are removed to obtain the request address.
        last_url = (tmp_url + '?' + urlencode(_params)).replace('+', '')

        data = MyRequests.get_url_body(url=last_url, headers=self.headers, params=None, timeout=14, num_retries=3)
        if data == '':
            self.my_lg.error('获取到的div_desc为空值!请检查! 出错goods_id: {0}'.format(goods_id))
            return ''

        try:
            # Greedy match: everything inside the callback wrapper. A raw
            # string keeps '\(' a regex escape, not an invalid string escape.
            data = re.compile(r'mtopjsonp1\((.*)\)').findall(data)[0]
        except IndexError as e:
            self.my_lg.error('获取data时, IndexError出错! 出错goods_id: {0}'.format(goods_id))
            self.my_lg.exception(e)
            return ''

        try:
            data = json.loads(data)
        except JSONDecodeError:
            self.my_lg.error('json转换data时出错, 请检查!')
            data = {}

        div = data.get('data', {}).get('pcDescContent', '')
        return self.deal_with_div(div)
Example #12
0
def test_requests():
    '''
    Issue 200 sequential requests to a fixed URL and print the elapsed time.

    Prints 'success' for every non-empty body and the (empty) body otherwise.
    '''
    url = 'https://superonesfazai.github.io/'
    started_at = time.time()

    for _ in range(200):
        body = MyRequests.get_url_body(url=url, headers=headers)
        print('success' if body != '' else body)

    elapsed = time.time() - started_at
    print('requests用时:', elapsed)
def getRandomExternalLink(startingPage):
    '''
    Return a random external link reachable from startingPage.

    When the page has no external links, a random internal link is followed
    and the search is repeated from there.
    '''
    html = MyRequests.get_url_body(url=startingPage, headers=headers)
    soup = BeautifulSoup(html, "html.parser")
    parsed = urlparse(startingPage)

    external_links = getExternalLinks(soup, parsed.netloc)
    if len(external_links) != 0:
        return external_links[random.randint(0, len(external_links) - 1)]

    print("没有外部链接,准备遍历整个网站")
    domain = parsed.scheme + "://" + parsed.netloc
    internal_links = getInternalLinks(soup, domain)
    next_page = internal_links[random.randint(0, len(internal_links) - 1)]
    return getRandomExternalLink(next_page)
Example #14
0
    def _get_comment_data(self, goods_id):
        if goods_id == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        self.goods_id = goods_id
        self.headers.update({
            'referer':
            'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id),
        })

        # 根据京东手机版商品评价获取
        _tmp_comment_list = []
        for current_page in range(1, 3):
            _url = 'https://item.m.jd.com/newComments/newCommentsDetail.json'

            params = self._set_params(goods_id=goods_id,
                                      current_page=current_page)
            body = MyRequests.get_url_body(url=_url,
                                           headers=self.headers,
                                           params=params)
            # self.my_lg.info(str(body))

            _data = self._json_2_dict(body).get('wareDetailComment',
                                                {}).get('commentInfoList', [])
            _tmp_comment_list += _data

            sleep(self.comment_page_switch_sleep_time)

        # pprint(_tmp_comment_list)
        try:
            _comment_list = self._get_comment_list(
                _tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.my_lg.error('出错goods_id:{0}'.format(goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r
        # pprint(self.result_data)

        return self.result_data
Example #15
0
    def _get_tmall_goods_keywords_goods_id_list(self, keyword):
        '''
        Query the tmall mobile-site search for a keyword and return item URLs.

        :param keyword: (id, keyword) pair; only keyword[1] is used here
        :return: list of item URL strings, e.g.
                 ['//detail.tmall.com/item.htm?id=566978017832&skuId=3606684772412', ...]
                 (URLs, not goods ids); [] on any failure
        '''
        # Approach: search through the tmall mobile site.
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': HEADERS[randint(0, len(HEADERS)-1)],
            'accept': '*/*',
            'authority': 'list.tmall.com',
        }

        params = {
            'page_size': '20',
            'page_no': '1',
            'q': str(keyword[1]),
            'type': 'p',
            'spm': 'a220m.6910245.a2227oh.d100',
            'from': 'mallfp..m_1_suggest',
            'sort': 'd',
        }

        body = MyRequests.get_url_body(
            url='https://list.tmall.com/m/search_items.htm',
            headers=headers,
            params=params)
        if body == '':
            return []

        data = self.json_str_2_dict(json_str=body)
        if data == {}:
            self.my_lg.error('获取到的天猫搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
            return []

        items = data.get('item', [])
        if items is None or items == []:
            self.my_lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
            return []

        try:
            return [str(entry.get('url', '')) for entry in items]
        except Exception as e:
            self.my_lg.exception(e)
            self.my_lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
            return []
    def _get_aweme_api_videos_info(self, user_id):
        '''
        Request the post list of a Douyin user and pass the body to deal_with_data.

        :param user_id: stored on self.user_id and sent as the 'user_id' parameter
        :return: None
        '''
        self.user_id = user_id
        query = (
            ('user_id', self.user_id),
            ('max_cursor', '0'),
            ('count', '20'),
        )

        response_body = MyRequests.get_url_body(
            url='https://www.douyin.com/aweme/v1/aweme/post/',
            headers=self.headers,
            params=query)

        self.deal_with_data(body=response_body)
Example #17
0
    def _get_pintuan_goods_info(self):
        '''
        Page through the group-buy listing and collect the goods, de-duplicated.

        Pages 0..99 are requested in order; paging stops at the first page
        whose goods list is empty or cannot be parsed.

        :return: list of dicts with keys 'goods_id', 'begin_time',
                 'end_time', 'all_sell_count' and 'page', one per distinct
                 goods_id (the first occurrence wins)
        '''
        pintuan_goods_id_list = []
        # goods_ids already collected; a set replaces rebuilding the id list
        # for every candidate item.
        seen_goods_ids = set()

        for page in range(0, 100):
            tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
                str(page)
            )
            print('正在抓取的页面地址为: ', tmp_url)

            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
            if body == '':
                body = '{}'
            try:
                tmp_data = json.loads(body).get('data', {}).get('goods', [])
            except (ValueError, AttributeError, TypeError):
                # Invalid JSON, or a payload that is not shaped as nested
                # dicts. Narrower than a bare except, so KeyboardInterrupt
                # and SystemExit still propagate.
                print('json.loads转换tmp_data时出错!')
                tmp_data = []

            sleep(.5)

            # Also covers a null 'goods' value, which previously crashed the
            # iteration below.
            if not tmp_data:
                print('该tmp_url得到的goods为空list, 此处跳过!')
                break

            for item in tmp_data:
                goods_id = item.get('goods_id', '')
                if goods_id in seen_goods_ids:
                    continue
                seen_goods_ids.add(goods_id)
                # NOTE(review): a missing or non-numeric start_time/end_time
                # still raises here, as before - confirm whether such items
                # should be skipped instead.
                pintuan_goods_id_list.append({
                    'goods_id': goods_id,
                    'begin_time': timestamp_to_regulartime(int(item.get('start_time', ''))),
                    'end_time': timestamp_to_regulartime(int(item.get('end_time', ''))),
                    'all_sell_count': str(item.get('join_number_int', '')),
                    'page': page,
                })

        print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
        print(pintuan_goods_id_list)

        return pintuan_goods_id_list
Example #18
0
    def get_pintuan_goods_info(self):
        '''
        Page through the group-buy listing and hand all goods to deal_with_data.

        Pages are requested from index 1 (per the original note, 0 and 1
        return the same data). A page with an empty body is skipped; paging
        stops at the first page without a usable 'data_list'.

        :return: None
        '''
        goods_list = []
        for index in range(1, 1000):
            tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(index) + '/0/'
            print('正在抓取: ', tmp_url)

            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
            if body == '':
                print('获取到的body为空值! 此处跳过')
                continue

            try:
                tmp_data = json.loads(body)
            except (ValueError, TypeError):
                # Narrower than a bare except, so KeyboardInterrupt and
                # SystemExit still propagate.
                tmp_data = {}
                print('json.loads转换body时出错, 此处跳过!')

            # Valid JSON that is not an object (or a null 'data_list') used
            # to raise outside the try; it now counts as "no data".
            page_items = tmp_data.get('data_list') if isinstance(tmp_data, dict) else None
            if not page_items:
                print('得到的data_list为[], 此处跳过!')
                break

            goods_list.extend({
                'goods_id': item.get('sku', ''),
                'sub_title': item.get('intro', ''),
                'pid': index,
            } for item in page_items)
            sleep(.5)

        pprint(goods_list)
        self.deal_with_data(goods_list=goods_list)
        sleep(8)
        return None
Example #19
0
    def get_one_page_goods_info(self, *params):
        '''
        Request one listing page and return its raw body.

        :param params: exactly two values, (gender, page); gender is
                       '0' for female and '1' for male
        :return: the response body as str, or '{}' when the body is empty
        '''
        gender, page = params

        client = {
            "ageGroup": "AG_0to24",
            "channel": "QD_web_webkit",
            "deviceId": "0",
            "gender": gender,
            "imei": "0",
            "packageName": "com.culiu.purchase",
            "platform": "wap",
            "sessionId": "0",
            "shopToken": "0",
            "userId": "0",
            "version": "1.0",
            "xingeToken": "",
        }
        query = {"group": 4, "module": "99", "page": page, "tab": "all"}

        # These go out as query-string parameters: the nested objects are
        # sent as JSON text.
        query_string_params = {
            'client': json.dumps(client),
            'query': json.dumps(query),
            'page': page,
        }

        body = MyRequests.get_url_body(url='https://api.chuchujie.com/api/',
                                       headers=self.headers,
                                       params=query_string_params)
        return '{}' if body == '' else body
    def get_div_desc_body(self, div_desc_url):
        '''
        Download the description payload and turn it into a cleaned HTML fragment.

        :param div_desc_url: URL whose body is parsed as JSON; its 'data'
                             field holds the markup
        :return: str; the cleaned markup wrapped in <div>...</div>, or ''
                 when nothing usable was obtained
        '''
        raw_body = MyRequests.get_url_body(url=div_desc_url,
                                           headers=self.headers)
        if raw_body == '':
            raw_body = '{}'

        try:
            markup = json.loads(raw_body).get('data', '')
        except Exception:
            # Reset the stored result so a failed parse does not affect the
            # values assigned by later crawls.
            self.result_data = {}
            markup = ''

        # Cleaning passes, applied in this order: drop the delivery block,
        # strip the existing src attributes, promote data-url to src, then
        # close the attribute and add the sizing style.
        cleaning_steps = (
            (r'<div class=\"by_deliver\">.*?</div></div>', ''),
            (r'src=.*? />', '/>'),
            (r'data-url=', 'src=\"'),
            (r' />', '\" style="height:auto;width:100%;"/>'),
        )
        for pattern, replacement in cleaning_steps:
            markup = re.compile(pattern).sub(replacement, markup)

        if markup == '':
            return markup
        return '<div>' + markup + '</div>'
Example #21
0
class TestMthod(unittest.TestCase):
    """Small unittest case built around a MyRequests client."""

    def setUp(self):
        # Each test gets its own client instance.
        self.run = MyRequests()

    @unittest.skip("test_01")
    def test_01(self):
        """POST the login form to the local server (currently skipped)."""
        login_url = "http://localhost:8000/login/"
        form = {"username": "******", "password": "******"}
        header = ''
        self.run.run_main(login_url, "POST", form, header)

    def test_02(self):
        """Print how many integers in 1..1001 are divisible by 2, 3 or 5."""
        count = sum(
            1 for i in range(1, 1002)
            if i % 2 == 0 or i % 3 == 0 or i % 5 == 0
        )

        print(count)
Example #22
0
    def get_spike_hour_goods_info(self):
        '''
        Walk the seckill promotion ids and process every promotion found.

        For each id from MIA_BASE_NUMBER up to (not including)
        MIA_MAX_NUMBER the promotion endpoint is requested; the parsed
        promotion is handed to ``self.deal_with_data`` unless its start hour
        is 22. Ids whose body is empty, is '[]', cannot be parsed as JSON,
        or does not decode to a JSON object are skipped. The loop sleeps
        0.35s after every id, skipped or not.

        :return: None
        '''
        mia_base_number = MIA_BASE_NUMBER
        while mia_base_number < MIA_MAX_NUMBER:
            tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
                mia_base_number)

            body = MyRequests.get_url_body(url=tmp_url,
                                           headers=self.headers,
                                           had_referer=True)

            tmp_data = None  # stays None whenever this id has to be skipped
            if body == '' or body == '[]':
                print('mia_base_number为: ', mia_base_number)
                print('获取到的body为空值! 此处跳过')
            else:
                try:
                    parsed = json.loads(body)
                except (TypeError, ValueError):
                    parsed = None
                if isinstance(parsed, dict):
                    tmp_data = parsed
                else:
                    # Really skip: previously processing went on with empty
                    # placeholder values after printing this message.
                    print('json.loads转换body时出错, 此处跳过!')

            if tmp_data is not None:
                # "p_info" may be missing or null; treat both as empty.
                p_info = tmp_data.get('p_info') or {}
                begin_time = p_info.get('start_time', '')
                # Characters 11-12 of start_time are compared as the hour
                # (assumes a 'YYYY-MM-DD HH:MM:SS'-like layout - TODO confirm).
                tmp_hour = begin_time[11:13]
                if tmp_hour == '22':  # promotions starting at 22h are not crawled
                    print('--- 销售时间为22点,不抓取!')
                else:
                    print(tmp_data)
                    print('mia_base_number为: ', mia_base_number)
                    self.deal_with_data(mia_base_number,
                                        begin_time,
                                        p_info.get('end_time', ''),
                                        tmp_data.get('item_list', []))

            sleep(.35)
            mia_base_number += 1
Example #23
0
    def traversal_hour_timestamp(self, item):
        '''
        Fetch the rush-buy list for one event time and process its items.

        The endpoint answers with a JSONP payload of the form
        ``null(<json>)``; the wrapper is stripped, the JSON is parsed and
        ``data.list`` is passed to ``self.deal_with_data`` together with the
        event time. Nothing is processed when the body is empty or yields no
        usable JSON object.

        :param item: event time value placed in the ``eventTime`` query parameter
        :return: None
        '''
        tmp_url = 'https://qiang.mogujie.com//jsonp/fastBuyListActionLet/1?eventTime={0}&bizKey=rush_main'.format(
            str(item))
        body = MyRequests.get_url_body(url=tmp_url,
                                       headers=self.headers,
                                       had_referer=True)

        if body == '':
            print('item为: ', item)
            print('获取到的body为空值! 此处跳过')
            return

        # Strip the JSONP wrapper. A raw string is required here: '\(' in a
        # normal string literal is an invalid escape sequence.
        try:
            body = re.compile(r'null\((.*)\)').findall(body)[0]
        except (IndexError, TypeError):
            print('re匹配body中的数据时出错!')
            body = '{}'

        try:
            tmp_data = json.loads(body)
        except (TypeError, ValueError):
            print('json.loads转换body时出错, 此处跳过!')
            tmp_data = {}

        # Anything that is not a non-empty JSON object carries no item list.
        if not isinstance(tmp_data, dict) or not tmp_data:
            print('tmp_data为空{}!')
            return

        event_time = item
        # "data" may be null in the payload; treat that like a missing key.
        item_list = (tmp_data.get('data') or {}).get('list', [])

        self.deal_with_data(event_time, item_list)
        sleep(MOGUJIE_SLEEP_TIME)
    def get_stock_info_dict(self, stock_info_url):
        '''
        Fetch the real-time stock endpoint and return its ``data`` field.

        If the body cannot be parsed, ``self.result_data`` is reset to ``{}``
        and an empty dict is returned.

        :param stock_info_url: URL of the stock-info JSON endpoint
        :return: the value of the ``data`` key ({} when missing or on error)
        '''
        raw = MyRequests.get_url_body(url=stock_info_url, headers=self.headers)
        if raw == '':
            print('获取到的stock_info_body为空值!')
            raw = '{}'  # parse an empty object so the default below applies

        try:
            stock_info = json.loads(raw).get('data', {})
        except Exception:
            # reset so a stale result is not stored by a later step
            self.result_data = {}
            stock_info = {}

        return stock_info
    def get_item_list(self, event_time):
        '''
        Return all goods listed for the given event time.

        The endpoint answers with a JSONP payload of the form
        ``null(<json>)``; the wrapper is stripped, the JSON is parsed and
        ``data.list`` is returned. The method sleeps 0.5s before returning,
        whatever the outcome.

        :param event_time: value placed in the ``eventTime`` query parameter
        :return: list of items; [] when the payload holds no usable JSON
                 object; '' (empty str) when the response body was empty,
                 which lets callers tell a failed request from an empty list
        '''
        tmp_url = 'https://qiang.mogujie.com//jsonp/fastBuyListActionLet/1?eventTime={0}&bizKey=rush_main'.format(
            str(event_time))
        body = MyRequests.get_url_body(url=tmp_url,
                                       headers=self.headers,
                                       had_referer=True)

        if body == '':
            print('获取到的body为空值! 此处跳过')
            item_list = ''

        else:
            # Strip the JSONP wrapper. A raw string is required here: '\('
            # in a normal string literal is an invalid escape sequence.
            try:
                body = re.compile(r'null\((.*)\)').findall(body)[0]
            except (IndexError, TypeError):
                print('re匹配body中的数据时出错!')
                body = '{}'

            try:
                tmp_data = json.loads(body)
            except (TypeError, ValueError):
                tmp_data = {}
                print('json.loads转换body时出错, 此处跳过!')

            # Anything that is not a non-empty JSON object has no item list.
            if not isinstance(tmp_data, dict) or not tmp_data:
                print('tmp_data为空{}!')
                item_list = []

            else:
                # "data" may be null in the payload; treat it like a missing key.
                item_list = (tmp_data.get('data') or {}).get('list', [])
        sleep(.5)

        return item_list
Example #26
0
    def _judge_is_taobao_head_img(self, url, timeout=10):
        '''
        Tell whether an image URL ends up at Taobao's default avatar.

        The URL is requested through a proxy and the final URL of the
        response (after any redirects) is compared with the known address of
        the default avatar image.

        :param url: image URL to check
        :param timeout: seconds to wait for the server before giving up;
                        without it a stalled proxy would block forever
        :return: True when the final URL is the default avatar; False
                 otherwise, including on any request error
        '''
        tmp_proxies = MyRequests._get_proxies()

        try:
            _res = requests.get(url=url,
                                headers=self.headers,
                                proxies=tmp_proxies,
                                timeout=timeout)
            self.my_lg.info(str(_res.url))
            return _res.url == 'https://gw.alicdn.com/tps/i3/TB1yeWeIFXXXXX5XFXXuAZJYXXX-210-210.png_40x40.jpg'
        except Exception:
            # Best effort: a failed check counts as "not the default avatar".
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            self.my_lg.info('检测图片地址时网络错误! 跳过!')
            return False
    def get_one_page_all_goods_list(self, *params):
        '''
        Return the de-duplicated goods of one page of the deal list.

        :param params: positional arguments; ``params[0]`` is the page number
        :return: list of ``{'goods_id', 'type', 'page'}`` dicts; [] when the
                 page has no items; the str '网络错误!' when the response is
                 not a JSON object
        '''
        page = params[0]
        tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(
            str(page))
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)

        try:
            json_body = json.loads(body)
        except (TypeError, ValueError):
            json_body = None
        if not isinstance(json_body, dict):
            print('json.loads转换body时出错!请检查')
            return '网络错误!'

        this_page_item_list = json_body.get('item_list', [])
        if this_page_item_list == []:
            return []

        # Keep only the first item per item_id. A set gives O(1) membership
        # checks instead of rebuilding the list of known ids for every item.
        seen_item_ids = set()
        all_goods_list = []
        for item in this_page_item_list:
            # Items without an item_id share the '' key, as before.
            dedupe_key = item.get('item_id', '')
            if dedupe_key in seen_item_ids:
                continue
            seen_item_ids.add(dedupe_key)

            # Items whose item_id is missing or null are not returned.
            if item.get('item_id') is None:
                continue
            all_goods_list.append({
                'goods_id': str(item.get('item_id', '')),
                'type': item.get('type', ''),
                'page': page,
            })

        return all_goods_list
Example #28
0
    def _get_pintuan_goods_info(self):
        '''
        Collect the zids of the goods listed by the deals endpoint.

        Pages 0..99 are requested in order (500 entries per page) and the
        walk stops at the first page that yields no objects. Entries without
        a zid are ignored.

        :return: list of unique ``(zid, page)`` tuples, in no particular order
        '''
        zid_list = []
        for page in range(0, 100):
            tmp_url = 'https://pina.m.zhe800.com/nnc/list/deals.json?page={0}&size=500'.format(
                str(page))
            print('正在抓取的页面地址为: ', tmp_url)

            tmp_body = MyRequests.get_url_body(url=tmp_url,
                                               headers=self.headers)
            if tmp_body == '':
                tmp_body = '{}'
            try:
                tmp_data = json.loads(tmp_body).get('objects', [])
            except (AttributeError, TypeError, ValueError):
                # invalid JSON, or JSON that is not an object
                print('json.loads转换tmp_data时出错!')
                tmp_data = []

            # Also covers "objects": null.
            if not tmp_data:
                print('该tmp_url得到的object为空list, 此处跳过!')
                break

            for item in tmp_data:
                # "product" may be missing or null.
                zid = (item.get('product') or {}).get('zid', '')
                # Test the zid itself: comparing the (zid, page) tuple with
                # '' was always true, so empty zids were never filtered out.
                if zid != '':
                    zid_list.append((zid, page))

        zid_list = list(set(zid_list))
        print('该zid_list的总个数为: ', len(zid_list))
        print(zid_list)

        return zid_list
Example #29
0
    def _get_1688_goods_keywords_goods_id_list(self, keyword):
        '''
        Return the goods ids found on the 1688 mobile search page for a keyword.

        Only the first result page is read; the query asks for results sorted
        by 'booked' in descending order.

        :param keyword: indexable value whose element at index 1 is the search text
        :return: list of goods id strings, eg: ['11111', ...]; [] on an empty
                 response or when extraction fails
        '''
        # Approach 1: scrape the m.1688.com search page, first page only.
        user_agent = HEADERS[randint(0, len(HEADERS)-1)]
        # No cookie header is sent with this request.
        request_headers = {
            'authority': 'm.1688.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': user_agent,
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
        }
        query = (
            ('sortType', 'booked'),
            ('filtId', ''),
            ('keywords', keyword[1]),
            ('descendOrder', 'true'),
        )

        search_url = 'https://m.1688.com/offer_search/-6161.html'
        body = MyRequests.get_url_body(url=search_url,
                                       headers=request_headers,
                                       params=query)
        if body == '':
            return []

        try:
            return Selector(text=body).css('div.list_group-item::attr("data-offer-id")').extract()
        except Exception as e:
            self.my_lg.exception(e)
            self.my_lg.error('获取1688搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
            return []
def getAllExternalLinks(siteUrl):
    domain = urlparse(siteUrl).scheme + "://" + urlparse(siteUrl).netloc
    html = MyRequests.get_url_body(url=siteUrl, headers=headers)
    bsObj = BeautifulSoup(html, 'lxml')
    internalLinks = getInternalLinks(bsObj, domain)
    externalLinks = getExternalLinks(bsObj, domain)

    f = open('result.txt', 'w')
    # 收集外链
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            # print(link)
            f.writelines(link + '\n')
            print("即将获取的外部链接的URL是:" + link)
            # 收集内链
    for link in internalLinks:
        if link not in allIntLinks:
            print("即将获取内部链接的URL是:" + link)
            allIntLinks.add(link)
            getAllExternalLinks(link)
            f.writelines(link + '\n')