Exemple #1
0
def init():
    """Run 20 rounds against the video listing and store every clip found.

    Each round obtains a fresh pair of values from ``public.get_ASCP()`` for
    the ``as``/``cp`` query fields, downloads the listing from ``LIST_API``
    and, for every entry, queries ``DETAIL_API`` to obtain the
    base64-encoded stream address, which is decoded before the record is
    handed to ``public.save_data``.
    """
    for _ in range(20):
        ascp_pair = public.get_ASCP()
        query = {
            'ac': 'wap',
            'format': 'json_raw',
            'min_behot_time': int(time.time()),
            'as': ascp_pair[0],
            'enable_stick': 'false',
            'tag': 'video',
            'cp': ascp_pair[1],
        }
        listing = public.request_data(LIST_API,
                                      cookie=COOKIE,
                                      header=HEADER,
                                      params=query)
        for entry in listing.get('data', []):
            record = {
                '_id': public.format_id(entry.get('title')),
                'title': entry.get('title'),
                'from': '今日头条',
                'pic': entry.get('large_image_url'),
            }

            # The detail endpoint needs the r/s pair derived from the video id.
            r, s = public.get_rs(entry.get('video_id'))
            detail = public.request_data(DETAIL_API + entry.get('video_id'),
                                         params={'r': r, 's': s},
                                         header=HEADER)
            video_one = detail.get('data').get('video_list').get('video_1')
            encoded_url = video_one.get('main_url').encode('utf-8')
            record['url'] = base64.b64decode(encoded_url).decode('utf-8')

            record['timestamp'] = int(time.time())
            public.save_data(record, DB_NAME)
Exemple #2
0
def scrapy_list(type_info):
    """Fetch one listing for ``type_info`` and save articles that have a body.

    ``type_info`` must offer ``.get('tag')`` (sent as the ``tag`` query
    field) and ``.get('name')`` (stored as the record's ``tag``). An article
    is saved only when ``scrapy_detail`` returns a truthy value for it.
    """
    ascp_pair = public.get_ASCP()
    query = {
        'ac': 'wap',
        'format': 'json_raw',
        'min_behot_time': int(time.time()),
        'as': ascp_pair[0],
        'enable_stick': 'false',
        'tag': type_info.get('tag'),
        'cp': ascp_pair[1],
    }
    response = public.request_data(LIST_API,
                                   cookie=COOKIE,
                                   header=HEADER,
                                   params=query)
    for entry in response.get('data', []):
        record = {}
        record['_id'] = public.format_id(entry.get('title'))
        record['title'] = entry.get('title')
        # Keep at most the first three picture URLs of the entry.
        record['imageurls'] = [
            image.get('url') for image in entry.get('image_list')
        ][:3]
        record['tag'] = type_info.get('name')
        record['timestamp'] = int(time.time())
        record['from'] = '今日头条'
        record['detail'] = scrapy_detail(entry.get('source_url'))
        if record.get('detail'):
            public.save_data(record, DB_NAME)
Exemple #3
0
def scrapy_detail(id):
    """Download an article body and convert it into a list of content dicts.

    The ``content`` field of the ``DETAIL_API`` response is parsed as JSON.
    Items of type ``cnt_article`` become ``{'type': 'text', 'data': ...}``
    and items of type ``img_url`` become ``{'type': 'image', 'data': ...}``.
    Items of any other type contribute an empty dict, as before.

    Args:
        id: Article identifier appended to the detail URL as ``?id=<id>``.
            (The name shadows the builtin but is kept for callers.)

    Returns:
        The list of content dicts, or ``None`` when anything goes wrong
        (the error is printed together with ``id``).
    """
    try:
        data = json.loads(
            public.request_data(DETAIL_API + '?id=' + id,
                                header=HEADER).get('data').get('content', []))
        detail = []
        for item in data:
            content = {}
            item_type = item.get('type')
            if item_type == 'cnt_article':
                content['type'] = 'text'
                content['data'] = item.get('desc')
            elif item_type == 'img_url':
                content['type'] = 'image'
                img = item.get('img')
                if img:
                    # Try the sizes from largest to smallest.  A missing size
                    # entry must fall through to the next one instead of
                    # raising and discarding the whole article.
                    url = None
                    for size_key in ('imgurl1000', 'imgurl640', 'imgurl0'):
                        url = (img.get(size_key) or {}).get('imgurl')
                        if url:
                            break
                    content['data'] = url
                elif item.get('img_url'):
                    content['data'] = item.get('img_url')
            detail.append(content)
        return detail
    except Exception as e:
        print(e, id)
        return None
Exemple #4
0
def scrapy_list(cookie, category__name, timesample):
    """Fetch one news listing for a category and save the usable articles.

    Args:
        cookie: Mapping sent as the request cookie; its ``BAIDUID`` entry is
            also used as the ``mid`` form field.
        category__name: Sent as ``category_name`` and stored as the ``tag``.
        timesample: Sent as the ``display_time`` form field.

    An article is saved only when its detail is truthy and has more than one
    element.
    """
    query = {
        'from': 'news_webapp',
        'pd': 'webapp',
        'os': 'android',
        'mid': cookie['BAIDUID'],
        'ver': 6,
        'category_name': category__name,
        'action': 0,
        'display_time': timesample,
        'wf': 0,
    }
    response = public.request_data(LISTAPI, params=query, cookie=cookie)
    for entry in response.get('data').get('news', []):
        record = {}
        record['_id'] = public.format_id(entry.get('title'))
        record['title'] = entry.get('title')
        record['imageurls'] = [
            image.get('url_webp', '') for image in entry.get('imageurls', [])
        ]
        record['tag'] = category__name
        record['timestamp'] = int(time.time())
        record['from'] = '百度新闻'
        record['detail'] = scrapy_detail(entry.get('nid'), category__name)

        body = record.get('detail')
        if body and len(body) > 1:
            public.save_data(record, DB_NAME)
Exemple #5
0
def scrapy_list(nextUrl, type_info):
    """Fetch one listing page, save its articles and return the next page URL.

    Args:
        nextUrl: URL of the page to fetch; when falsy, the first page at
            ``LIST_API + type_info['tag']`` is requested instead.
        type_info: Mapping with ``tag`` (URL suffix) and ``name`` (stored as
            the record's ``tag``).

    Returns:
        The ``nextUrl`` value of the response (``None`` when absent).
    """
    target = nextUrl if nextUrl else LIST_API + type_info['tag']
    page = public.request_data(target)

    for entry in page.get('contList', []):
        record = {}
        record['_id'] = public.format_id(entry.get('name'))
        record['title'] = entry.get('name')
        record['imageurls'] = [entry.get('pic')]
        record['tag'] = type_info['name']
        record['timestamp'] = int(time.time())
        record['from'] = '澎湃新闻'
        record['detail'] = scrapy_detail(entry['contId'])
        # Articles without a body are not stored.
        if record.get('detail'):
            public.save_data(record, DB_NAME)

    return page.get('nextUrl')
Exemple #6
0
def init():
    """Walk pages 0-19 of every channel in ``CHANNELS`` and save each video.

    The id passed to ``DETAIL_API`` is the last path segment of the item's
    ``url`` with everything from the first dot removed; the ``mp4`` source of
    the detail response becomes the record's ``url``.
    """
    for channel in CHANNELS:
        for page in range(20):
            listing = public.request_data(LIST_API,
                                          params={
                                              'channel': channel,
                                              'page': page
                                          })
            for entry in listing.get('videos', []):
                record = {
                    '_id': public.format_id(entry.get('title')),
                    'title': entry.get('title'),
                    'from': '百度视频',
                    'pic': entry.get('imgh_url'),
                }

                file_name = entry.get('url').rsplit('/', 1)[-1]
                video_id = file_name.partition('.')[0]
                detail = public.request_data(DETAIL_API,
                                             params={'id': video_id})
                main_video = detail.get('data').get('main_video')
                record['url'] = main_video.get('source').get('mp4')

                record['timestamp'] = int(time.time())
                public.save_data(record, DB_NAME)
Exemple #7
0
def init():
    """Request 20 batches of the video listing and save each clip.

    The batch counter is sent as the ``up`` form field. For every listed
    item a second request to ``DETAIL_API`` (keyed by the item's ``surl``)
    supplies the playable address, and the finished record is passed to
    ``public.save_data``.
    """
    for batch in range(20):
        ad_info = {
            'originfrom': 'huawei-q',
            'imei': '868384032192527',
            'channel': 'news_ent',
            'osVersion': '9',
            'deviceModel': 'HWI-AL00',
            'platform': 'android',
            'from': 'fastapp',
        }
        query = {
            'cateid': 'uM',
            'mod': 'mpvideo',
            'action': 1,
            'up': batch,
            'down': 0,
            'did': '41ea3bde383860b6',
            'imei': '',
            'length': 13,
            'net_type': 2,
            'ad': ad_info,
            'app_type': 124,
            'cre': 'tianyi',
            'merge': 3,
            'statics': 1,
            'ldid': '',
            'uid': '',
        }
        listing = public.request_data(LIST_API, params=query)
        for entry in listing.get('data', []):
            record = {
                '_id': public.format_id(entry.get('title')),
                'title': entry.get('title'),
                'from': '新浪视频',
                'pic': entry.get('thumb'),
            }

            detail = public.request_data(
                DETAIL_API, params={'docUrl': entry.get('surl')})
            # The address sits in the first entry of the first video module.
            first_module = detail.get('data').get('videosModule')[0]
            first_video = first_module.get('data')[0]
            record['url'] = first_video.get('videoInfo').get('url')

            record['timestamp'] = int(time.time())
            public.save_data(record, DB_NAME)
Exemple #8
0
def init():
    """Fetch ``API + '0'`` through ``API + '19'`` and save every listed video.

    All record fields are copied straight from the listing entry; no detail
    request is made.
    """
    for page_no in range(20):
        payload = public.request_data(API + str(page_no))
        for entry in payload.get('data').get('videoList', []):
            record = {
                '_id': public.format_id(entry.get('title')),
                'title': entry.get('title'),
                'from': '搜狐视频',
                'pic': entry.get('tvPic'),
                'url': entry.get('playUrl'),
                'timestamp': int(time.time()),
            }
            public.save_data(record, DB_NAME)
Exemple #9
0
def init():
    """POST for pages 0-19 of the listing (10 per page) and save each video.

    For every result a second POST to ``DETAIL_API`` with the item's ``rid``
    returns ``upyunVideos``; the ``playurl`` of its first element is stored
    as the record's ``url``.
    """
    for page_no in range(20):
        query = {'rtype': 0, 'pageNum': page_no, 'pageSize': 10}
        listing = public.request_data(LIST_API,
                                      params=query,
                                      header=HEADER,
                                      method='POST')
        for entry in listing.get('page').get('result', []):
            record = {
                '_id': public.format_id(entry.get('title')),
                'title': entry.get('title'),
                'from': '看鉴视频',
                'pic': entry.get('image'),
            }

            detail = public.request_data(DETAIL_API,
                                         params={'videoId': entry['rid']},
                                         header=HEADER,
                                         method='POST')
            record['url'] = detail.get('upyunVideos')[0].get('playurl')

            record['timestamp'] = int(time.time())
            public.save_data(record, DB_NAME)
Exemple #10
0
def init():
    """Query every channel in ``CHANNEL_IDS`` 20 times and save each video.

    The request form differs between rounds only in its ``timestamp`` field.
    The ``uri`` of the last ``bitrates`` element of the detail response is
    stored as the record's ``url``.
    """
    for channel_id in CHANNEL_IDS:
        for _ in range(20):
            query = {
                'app': 'howto_a',
                'udid': '41ea3bde383860b6',
                'channel_id': channel_id,
                'timestamp': int(time.time()),
            }
            listing = public.request_data(LIST_API, params=query)
            for entry in listing.get('contents', []):
                clip = entry.get('video')
                record = {
                    '_id': public.format_id(clip.get('name')),
                    'title': clip.get('name'),
                    'from': '好兔视频',
                    'pic': clip.get('share_img'),
                }

                detail = public.request_data(DETAIL_API,
                                             params={'id': entry.get('id')})
                record['url'] = detail.get('bitrates')[-1].get('uri')

                record['timestamp'] = int(time.time())
                public.save_data(record, DB_NAME)
Exemple #11
0
def init():
    """Follow up to 20 listing pages starting at ``API`` and save each video.

    The module-level ``API`` is used as the pagination cursor: after a page
    has been processed it is replaced by that page's ``nextUrl``.

    The cursor is advanced once per page, after the item loop.  Previously
    it was assigned inside the item loop, so a page with an empty
    ``contList`` never advanced it and the same URL was requested again.
    Paging now also stops when a page carries no ``nextUrl`` instead of
    setting ``API`` to ``None`` and requesting that.
    """
    global API
    for _ in range(20):
        res = public.request_data(API)
        for item in res.get('contList', []):
            video = {}
            video['_id'] = public.format_id(item.get('name'))
            video['title'] = item.get('name')
            video['from'] = '梨视频'
            video['pic'] = item.get('pic')
            video['url'] = item.get('videos')[0].get('url')
            video['timestamp'] = int(time.time())
            public.save_data(video, DB_NAME)

        next_url = res.get('nextUrl')
        if not next_url:
            # No further page advertised; keep the last valid cursor.
            break
        API = next_url
Exemple #12
0
def scrapy_detail(nid, tag):
    """Fetch one article and return its content run through ``format_detail``.

    Args:
        nid: Article id, sent as the ``nids`` form field.
        tag: Accepted for the caller's convenience; not used here.

    Returns:
        A list with ``format_detail`` applied to every element of the first
        news item's ``content``, or ``None`` if any step raises (the error
        is printed followed by ``1``).
    """
    try:
        query = {
            'cuid': '',
            'nids': nid,
            'wf': 1,
            'remote_device_type': 1,
            'os_type': 1,
            'screen_size_width': 1080,
            'screen_size_height': 1920,
        }
        response = public.request_data(DETAILAPI, params=query)
        article = response.get('data').get('news')[0]
        return [format_detail(part) for part in article.get('content')]
    except Exception as e:
        print(e, 1)
        return None
Exemple #13
0
def scrapy_list(type_info, count):
    """Fetch page ``count`` of a channel and save articles that have a body.

    Args:
        type_info: Offers ``.get('tag')`` (sent as ``channel``) and
            ``.get('name')`` (stored as the record's ``tag``).
        count: Page number sent as the ``page`` form field.
    """
    query = {'channel': type_info.get('tag'), 'page': count}
    response = public.request_data(LIST_API, params=query, header=HEADER)
    entries = response.get('data', [])

    # A falsy ``data`` value (missing, null or empty) means nothing to do.
    for entry in entries or []:
        record = {}
        record['_id'] = public.format_id(entry.get('title'))
        record['title'] = entry.get('title')
        record['imageurls'] = entry.get('img_urls')
        record['tag'] = type_info.get('name')
        record['timestamp'] = int(time.time())
        record['from'] = '腾讯新闻'
        record['detail'] = scrapy_detail(entry.get('id'))
        if record.get('detail'):
            public.save_data(record, DB_NAME)
Exemple #14
0
def init():
    """Follow up to 20 listing pages from ``LIST_API`` and save each video.

    The module-level ``LIST_API`` is used as the pagination cursor: after a
    page has been processed it is replaced by that page's ``nextUrl``.  Each
    video's address is the first ``href`` of an element with class ``m`` in
    the HTML returned for its ``contId`` by ``DETAIL_API``.

    The cursor is advanced once per page, after the item loop.  Previously
    it was assigned inside the item loop, so a page with an empty
    ``contList`` never advanced it and the same URL was requested again.
    Paging now also stops when a page carries no ``nextUrl`` instead of
    setting ``LIST_API`` to ``None`` and requesting that.
    """
    global LIST_API
    for _ in range(20):
        res = public.request_data(LIST_API, params=FORM)
        for item in res.get('contList', []):
            video = {}
            video['_id'] = public.format_id(item.get('name'))
            video['title'] = item.get('name')
            video['from'] = '澎湃视频'
            video['pic'] = item.get('pic')
            detail_html = public.request_html(
                DETAIL_API, params={'contid': item['contId']})
            video['url'] = etree.HTML(detail_html).xpath(
                "//*[@class='m']/@href")[0]
            video['timestamp'] = int(time.time())
            public.save_data(video, DB_NAME)

        next_url = res.get('nextUrl')
        if not next_url:
            # No further page advertised; keep the last valid cursor.
            break
        LIST_API = next_url
Exemple #15
0
def scrapy_detail(aid):
    """Fetch article ``aid`` and flatten its HTML body into content dicts.

    Every element of the parsed body is visited. An element with direct
    text yields ``{'type': 'text', 'data': <first text node>}``; an element
    with a direct ``img`` child yields
    ``{'type': 'image', 'data': <first src>}``.

    Returns:
        The list of content dicts, or ``None`` if any step raises (the
        error is printed).
    """
    try:
        query = {
            'st': '',
            'df': 'androidphone',
            'loginid': '',
            'os': 'android',
            'city': '',
            'screen': '1080x1794',
            'nw': 'wifi',
            'deviceid': '',
            'publishid': '5286',
            'gv': '5.7.2',
            'uid': '868384032192527',
            'province': '',
            'av': '5.7.2',
            'proid': 'ifengnews',
            'district': '',
            'limit': 5,
            'from': 'xiaomi',
            'sn': '',
            'aid': aid,
            'vt': 5,
        }
        response = public.request_data(DETAIL_API, params=query)
        tree = etree.HTML(response.get('body').get('text'))

        detail = []
        for node in tree.xpath('//*'):
            texts = node.xpath('text()')
            if texts:
                detail.append({'type': 'text', 'data': texts[0]})
            sources = node.xpath('img/@src')
            if sources:
                detail.append({'type': 'image', 'data': sources[0]})
        return detail
    except Exception as e:
        print(e)
        return None
Exemple #16
0
def scrapy_list(type_info, count):
    """Fetch one news batch and save items with thumbnails and a body.

    Args:
        type_info: Mapping with ``cateid`` and ``mod`` (request fields) and,
            optionally, ``category__name`` (stored as the record's ``tag``).
        count: Sent as the ``up`` form field.

    Items with a falsy ``thumbs`` value are skipped without requesting their
    detail.
    """
    ad_info = {
        'originfrom': 'huawei-q',
        'imei': '868384032192527',
        'channel': 'news_ent',
        'osVersion': '9',
        'deviceModel': 'HWI-AL00',
        'platform': 'android',
        'from': 'fastapp',
    }
    query = {
        'cateid': type_info['cateid'],
        'mod': type_info['mod'],
        'action': 2,
        'up': count,
        'down': 0,
        'did': '41ea3bde383860b6',
        'imei': '',
        'length': 13,
        'net_type': 2,
        'ad': ad_info,
        'app_type': 124,
        'cre': 'tianyi',
        'merge': 3,
        'statics': 1,
        'ldid': '',
        'uid': '',
    }
    listing = public.request_data(LIST_API, params=query)
    for entry in listing.get('data', []):
        if not entry.get('thumbs'):
            continue

        record = {}
        record['_id'] = public.format_id(entry.get('title'))
        record['title'] = entry.get('title')
        record['imageurls'] = entry.get('thumbs')[:3]
        record['tag'] = type_info.get('category__name')
        record['timestamp'] = int(time.time())
        record['from'] = '新浪新闻'
        record['detail'] = scrapy_detail(entry.get('url'))
        if record.get('detail'):
            public.save_data(record, DB_NAME)
Exemple #17
0
def scrapy_list(type_info, count):
    """Fetch one listing pull and save items with a thumbnail, id and body.

    Args:
        type_info: Offers ``.get('tag')`` (sent as ``id``) and
            ``.get('name')`` (stored as the record's ``tag``).
        count: Sent as the ``pullNum`` form field.

    Only the first element of the response is used; nothing happens when it
    is falsy.
    """
    query = {
        'st': '',
        'df': '',
        'pullNum': count,
        'loginid': '',
        'os': 'android',
        'city': '',
        'screen': '',
        'nw': '',
        'deviceid': '',
        'gv': '5.7.3',
        'publishid': '',
        'uid': '868384032192527',
        'lastDoc': '',
        'province': '',
        'av': 0,
        'district': '',
        'proid': 'ifengnews',
        'action': 'default',
        'id': type_info.get('tag'),
        'sn': '',
        'vt': 5,
    }
    first_block = public.request_data(LIST_API, params=query)[0]
    if not first_block:
        return

    for entry in first_block.get('item'):
        # Both a thumbnail and an id are required before fetching the body.
        if not (entry.get('thumbnail') and entry.get('id')):
            continue

        record = {}
        record['_id'] = public.format_id(entry.get('title'))
        record['title'] = entry.get('title')
        record['imageurls'] = [entry.get('thumbnail')]
        record['tag'] = type_info.get('name')
        record['timestamp'] = int(time.time())
        record['from'] = '凤凰新闻'
        record['detail'] = scrapy_detail(entry.get('id'))
        if record.get('detail'):
            public.save_data(record, DB_NAME)
Exemple #18
0
def scrapy_detail(source_url):
    """Fetch an article's HTML content and flatten it into content dicts.

    The request goes to ``DETAIL_API + source_url + 'info/'``. Each element
    matched by ``//*/div[1]/*`` contributes a text dict when it has direct
    text and an image dict when it has a direct ``img`` child with ``src``.

    Returns:
        The list of ``{'type': ..., 'data': ...}`` dicts, or ``None`` if any
        step raises (the error is printed).
    """
    try:
        response = public.request_data(DETAIL_API + source_url + 'info/',
                                       cookie=COOKIE,
                                       header=HEADER)
        markup = response.get('data').get('content')
        tree = etree.HTML(markup)

        detail = []
        for node in tree.xpath('//*/div[1]/*'):
            texts = node.xpath('text()')
            if texts:
                detail.append({'type': 'text', 'data': texts[0]})
            sources = node.xpath('img/@src')
            if sources:
                detail.append({'type': 'image', 'data': sources[0]})
        return detail
    except Exception as e:
        print(e)
        return None
Exemple #19
0
def scrapy_detail(doc_url):
    """Fetch an article and return its text and images as content dicts.

    Two response layouts are handled:

    * ``pics`` present: ``content`` is split at its ``<!--{IMG_n}-->``
      placeholders and pictures are interleaved with the text fragments.
    * ``picsModule`` present: each entry of the first module yields a text
      dict (its ``alt``) followed by an image dict (its ``pic``).

    The placeholder pattern accepts multi-digit numbers (``\\d+``).  The
    previous single-digit pattern left ``<!--{IMG_10}-->`` and higher
    markers embedded in the text and shifted the picture alignment.

    Args:
        doc_url: Appended to ``DETAIL_API`` to form the request URL.

    Returns:
        A list of ``{'type': 'text' | 'image', 'data': ...}`` dicts, or
        ``None`` when neither layout is present or any step raises (the
        error is printed).
    """
    try:
        data = public.request_data(DETAIL_API + doc_url).get('data')
        if data.get('pics'):
            pics = [entry.get('data').get('pic') for entry in data.get('pics')]
            text = re.split(r'<!--\{IMG_\d+\}-->',
                            data.get('content').replace('<br/>', ''))
            content = []
            last_slot = len(text) - 2
            for i, fragment in enumerate(text):
                content.append({'type': 'text', 'data': fragment})
                # NOTE(review): pics[0] is never inserted (the range starts
                # at 1) -- confirm whether skipping the first picture is
                # intentional before changing it.
                if 1 <= i <= last_slot:
                    content.append({'type': 'image', 'data': pics[i]})
            return content
        elif data.get('picsModule'):
            content = []
            for item in data.get('picsModule')[0].get('data'):
                content.append({'type': 'text', 'data': item.get('alt')})
                content.append({'type': 'image', 'data': item.get('pic')})
            return content
        return None
    except Exception as e:
        print(e)
        return None