Beispiel #1
0
    def _get_origin_comment_list(self, **kwargs) -> list:
        '''
        Fetch the raw (still-encrypted) comment payload from the
        m.1688.com offerRemark endpoint, one page at a time.
        :param kwargs: expects 'csrf', 'goods_id' and 'cookies'
        :return: accumulated list of raw comment entries
        '''
        csrf = kwargs.get('csrf', '')
        goods_id = kwargs.get('goods_id', '')
        cookies = kwargs.get('cookies', '')

        url = 'https://m.1688.com/page/offerRemark.htm'
        headers = {
            'cookie': cookies,
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': 'application/json, text/javascript, */*; q=0.01',
            'referer': 'https://m.1688.com/page/offerRemark.htm?offerId={}'.format(goods_id),
            'authority': 'm.1688.com',
            'x-requested-with': 'XMLHttpRequest',
        }

        origin_comment_list = []
        for page_num in range(1, self.max_page):
            nav_options = {
                'data': {
                    'bizType': 'trade',
                    'itemId': int(goods_id),
                    'offerId': str(goods_id),
                    'page': page_num,
                    'pageSize': 5,
                    'starLevel': 7
                }
            }
            # cache-busting token: timestamp followed by 3 random digits
            cache_buster = str(datetime_to_timestamp(get_shanghai_time())) \
                + str(get_random_int_number(start_num=100, end_num=999))
            params = (
                ('_csrf', csrf),
                ('__wing_navigate_type', 'view'),
                ('__wing_navigate_url', 'detail:modules/offerRemarkList/view'),
                ('__wing_navigate_options', dumps(nav_options)),
                ('_', cache_buster),
            )
            body = Requests.get_url_body(
                url=url,
                headers=headers,
                params=params,
                ip_pool_type=self.ip_pool_type)
            one_page = json_2_dict(body, encoding='ascii') \
                .get('data', {}) \
                .get('model', [])
            pprint(one_page)
            origin_comment_list += one_page
            sleep(.25)

        return origin_comment_list
Beispiel #2
0
    def _get_params(self, goods_id):
        '''
        Build the query params used to fetch sku_info.
        :param goods_id: goods identifier (converted via str())
        :return: tuple of (key, value) pairs
        '''
        # anti-cache token: timestamp followed by 3 random digits
        token = str(datetime_to_timestamp(get_shanghai_time())) \
            + str(get_random_int_number(start_num=100, end_num=999))

        return (
            ('t', token),
            ('goodsId', str(goods_id)),
        )
Beispiel #3
0
def _get_simulate_log_info(retries=10) -> str:
    '''
    Build a fake log.info prefix ("<time>,<3 digits> [INFO  ] ➞ ") for
    print-based pseudo logging.
    :param retries: attempts left when building the string raises ValueError
    :return: the prefix string, or '' once all retries are exhausted
    '''
    try:
        prefix = str(get_shanghai_time()) + ',' \
            + str(get_random_int_number(100, 999)) + ' [INFO  ] ➞ '
    except ValueError:
        # presumably the time helper can raise ValueError sporadically — retry
        return _get_simulate_log_info(retries - 1) if retries > 0 else ''

    return prefix
Beispiel #4
0
def get_random_sqlite_obj() -> BaseSqlite3Cli:
    """
    Pick one of the five module-level sqlite clients at random.
    :return: a BaseSqlite3Cli instance
    :raises NotImplementedError: defensive guard for an out-of-range
        random index (should be unreachable with get_random_int_number(0, 4))
    """
    global sqlite3_cli0, sqlite3_cli1, sqlite3_cli2, sqlite3_cli3, sqlite3_cli4

    random_num = get_random_int_number(0, 4)
    if random_num == 0:
        return sqlite3_cli0
    elif random_num == 1:
        return sqlite3_cli1
    elif random_num == 2:
        return sqlite3_cli2
    elif random_num == 3:
        return sqlite3_cli3
    elif random_num == 4:
        return sqlite3_cli4
    else:
        # fix: the original raised the `NotImplemented` constant, which is
        # not an exception — in Python 3 that itself raises TypeError
        # ("exceptions must derive from BaseException").
        raise NotImplementedError
    def _get_one_page_articles(self, page_num) -> list:
        '''
        Fetch one page of 36kr news articles.
        :param page_num: page index passed straight to the API's 'page' param
        :return: list of article dicts, each with its 'user_info' field
                 parsed from a JSON string into a dict; [] when the API
                 returns no items
        '''
        headers = {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': get_random_pc_ua(),
            'Accept': '*/*',
            'Referer': 'https://36kr.com/',
            'Connection': 'keep-alive',
        }

        params = (
            ('per_page', '20'),
            ('page', str(page_num)),
            # cache-busting token: timestamp + 3 random digits
            ('_', str(datetime_to_timestamp(get_shanghai_time())) +
             str(get_random_int_number(100, 999))),
        )

        url = 'https://36kr.com/api/search-column/mainsite'
        data = json_2_dict(
            Requests.get_url_body(url=url,
                                  headers=headers,
                                  params=params,
                                  cookies=None)).get('data',
                                                     {}).get('items', [])
        if data == []:
            return []

        # parse the embedded JSON string in each item's 'user_info' field
        # (fix: was a list comprehension used purely for its side effects)
        for item in data:
            item['user_info'] = json_2_dict(item.get('user_info', ''))

        return data
Beispiel #6
0
    def _parse_page_from_wx(self, **kwargs):
        '''
        Parse a single wx (WeChat endpoint) article's info.

        Reads from kwargs:
            article_link:  article url (used for error messages and gather_url)
            article_info:  response dict; only its 'data' sub-dict is used
            article_likes: like count; defaults to a random number
        :return: a populated WellRecommendArticle object, or {} on any
                 parse/assert failure
        '''
        article_link = kwargs.get('article_link', '')
        article_info = kwargs.get('article_info', {}).get('data', {})
        article_likes = kwargs.get('article_likes', get_random_int_number())

        error_msg = '出错article_url: {0}'.format(article_link)
        try:
            nick_name = article_info.get('user', {}).get('nickname', '')
            assert nick_name != '', '获取到的nick_name为空值!请检查!' + error_msg

            head_url = article_info.get('user', {}).get('images', '')
            assert head_url != '', '获取到的head_url为空值!请检查!' + error_msg

            profile = ''  # personal bio / signature (deliberately left empty)

            share_id = article_info.get('id', '')
            assert share_id != '', '获取到的share_id为空值!请检查!' + error_msg

            title = self.wash_sensitive_info(article_info.get('title',
                                                              ''))  # title may be empty by default
            comment_content = self.wash_sensitive_info(
                article_info.get('desc', ''))
            assert comment_content != '', '获取到的comment_content为空!请检查!' + error_msg

            share_img_url_list = [{  # for videos, the first image is the video's first frame
                'img_url': item.get('original', ''),
                'height': item.get('height'),  # image height / width
                'width': item.get('width'),
            } for item in article_info.get('images_list', [])]
            assert share_img_url_list != [], '获取到的share_img_url_list为空list!请检查!' + error_msg

            div_body = ''  # left empty by default
            gather_url = article_link

            # original creation date of the source article
            tmp_create_time = article_info.get('time', '')
            assert tmp_create_time != '', '获取到的create_time为空值!请检查!'
            create_time = string_to_datetime(tmp_create_time + ':00')

            site_id = 3  # 3 == xiaohongshu (Little Red Book)
            goods_url_list = []  # goods urls from this article still to be crawled
            share_goods_base_info = []

            # the wx endpoint returns no tags
            tags = self._get_tags_from_wx(article_info=article_info)

            # video playback url: strip the query string, then swap the
            # '//sa.' host prefix for '//v.'
            tmp_video_url = article_info.get('video', '')
            tmp_video_url = re.compile('\?.*').sub('', tmp_video_url)
            video_url = re.compile(r'//sa.').sub(r'//v.', tmp_video_url)

            likes = article_likes
            collects = article_info.get('fav_count', None)
            assert collects is not None, '获取到的collects为None!请检查!' + error_msg

        except Exception:
            # best-effort: any failed assert/lookup aborts this article
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            self.lg.error('遇到错误:', exc_info=True)
            return {}

        _ = WellRecommendArticle()
        _['nick_name'] = nick_name
        _['head_url'] = head_url
        _['profile'] = profile
        _['share_id'] = share_id
        _['title'] = title
        _['comment_content'] = comment_content
        _['share_img_url_list'] = share_img_url_list
        _['div_body'] = div_body
        _['gather_url'] = gather_url
        _['create_time'] = create_time
        _['site_id'] = site_id
        _['goods_url_list'] = goods_url_list
        _['tags'] = tags
        _['share_goods_base_info'] = share_goods_base_info
        _['video_url'] = video_url
        _['likes'] = likes
        _['collects'] = collects

        return _
Beispiel #7
0
'''

from requests import session
from requests_toolbelt import MultipartEncoder
from fzutils.spider.fz_requests import Requests
from fzutils.time_utils import (
    get_shanghai_time,
    datetime_to_timestamp,
)
from fzutils.common_utils import get_random_int_number

# anti-bot cookie captured from a live browser session — presumably expires; verify before reuse
cookies = {
    'yd_cookie': '2369844f-fc3f-42742d88d5deabc0ec65d866d61526e32347',
}
# cache-busting token: timestamp + 3 random digits
_t = str(datetime_to_timestamp(get_shanghai_time())) + str(
    get_random_int_number(100, 999))
# multipart form body for the request; 'sign' looks like a fixed request
# signature — NOTE(review): likely tied to UserId/params, confirm it still validates
data = MultipartEncoder(
    fields={
        'PageIndex': '1',
        'PageSize': '20',
        # 'TimesTamp': '1547813627151',
        'TimesTamp': _t,
        'UserId': '259146',
        'sign': '42531e765ce3055f25f369db3505db8f'
    })
headers = {
    'Host': 'api.yiuxiu.com',
    'accept': 'application/json',
    # 'content-type': 'multipart/form-data; boundary=Boundary+C98168C62FD125E1',
    'content-type': data.content_type,
    # 'token': '',      # jwt token