Python match1の例、app.spider_store.common.match1 Pythonの例

コード例 #1

0

ファイルを表示

def eastday_video_download(url):
    """Scrape an eastday video page and return its metadata.

    The page is fetched with ``get_content`` and the fields are pulled out of
    inline JavaScript variables with ``match1``.

    :param url: address of the eastday video page.
    :return: dict with the keys ``type``, ``title``, ``source``,
        ``thumbnail_urls``, ``image_urls``, ``video_url``, ``ext`` and
        ``size``.  ``thumbnail_urls`` and ``video_url`` are one-element lists;
        the element is ``None`` when the page does not expose that value.
    """

    def _with_scheme(link):
        # Protocol-relative links ("//host/path") get an explicit scheme.
        # Only the start of the link is inspected, so "http" occurring
        # later in the path does not suppress the prefix.  A missing link
        # (None) is passed through instead of raising TypeError.
        if link is None or link.startswith(("http:", "https:")):
            return link
        return "http:{}".format(link)

    html = get_content(url)
    title = match1(html, r'var\s*redirect_topic\s*=\s*[\'|"](.*?)[\'|"];')
    if title is None:
        # Fall back to the <meta name="description"> tag.
        title = match1(
            html,
            r'<meta\s*name=[\'|"]description[\'|"]\s*content=[\'|"](.*?)[\'|"]/>'
        )
    source = match1(html, r'var\s*d_source\s*=\s*[\'|"](.*?)[\'|"];')
    if source is None:
        source = "crawl"
    thumbnail_url = _with_scheme(
        match1(html, r'var\s*global_share_img\s*=\s*[\'|"](.*?)[\'|"];'))
    video_url = _with_scheme(
        match1(html, r'var\s*mp4\s*=\s*[\'|"](.*?)[\'|"];'))

    return {
        "type": 'video',
        "title": title,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "image_urls": None,
        "video_url": [video_url],
        "ext": None,
        "size": None,
    }

コード例 #2

0

ファイルを表示

def ku6_download(url):
    """Scrape a ku6 page and return its metadata.

    :param url: address of the ku6 page.
    :return: dict with the keys ``type``, ``title``, ``source``,
        ``thumbnail_urls``, ``image_urls``, ``video_url``, ``ext`` and
        ``size``.  ``type`` is whatever ``news_type(url)`` returns; any field
        whose pattern does not match is ``None``.
    """
    html = get_content(url)
    # Named news_kind so the builtin ``type`` is not shadowed.
    news_kind = news_type(url)
    title = match1(
        html,
        r"\$\(['\"]#video-title['\"]\)\.text\(['\"]([\s\S\w\W]+?)['\"]\);")
    if title is None:
        # Second chance: the script that sets document.title.
        title = match1(html,
                       r"document\.title\s*=\s*['\"]([\s\S\w\W]+?)['\"];")
    if title is not None:
        # Guarded: when neither pattern matches there is nothing to strip.
        title = title.strip()
    source = match1(
        html, r"\$\(['\"]#video-author['\"]\)\.text\(['\"](.*?)['\"]\);")
    img_url = match1(
        html,
        r'[\'|"]poster[\'|"]:\s*[\'|"](.*?)[\'|"],\s*[\'|"]controls[\'|"]:')
    video_url = match1(
        html,
        r'this\.src\(\{type:\s*[\'|"]video/mp4[\'|"], src: [\'|"](.*?)[\'|"]}\);'
    )

    return {
        "type": news_kind,
        "title": title,
        "source": source,
        "thumbnail_urls": [img_url],
        "image_urls": None,
        "video_url": [video_url],
        "ext": None,
        "size": None,
    }

コード例 #3

0

ファイルを表示

ファイル: ifeng.py プロジェクト: lihaoABC/trans_api

def ifeng_download(url,
                   title=None,
                   output_dir=output_dir,
                   merge=True,
                   info_only=False,
                   **kwargs):
    """Resolve an ifeng video UUID and delegate to ``ifeng_download_by_id``.

    The UUID is looked for in the URL first (old ``/uuid.shtml`` form or the
    newer ``#uuid`` form).  If it is not there, the page is fetched and
    searched for ``var vid="<uuid>"`` and then ``"vid": "<uuid>"``.

    :param url: ifeng page address.
    :param title: forwarded to ``ifeng_download_by_id`` only when the UUID
        comes from the page body; the URL path passes ``None``, as the
        original code did.
    :param output_dir: forwarded unchanged.
    :param merge: forwarded unchanged.
    :param info_only: forwarded unchanged.
    :param kwargs: accepted for signature compatibility and ignored.
    :return: whatever ``ifeng_download_by_id`` returns.
    :raises AssertionError: when no UUID can be found.
    """
    # One UUID expression shared by all three lookups.
    uuid = r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'

    # old pattern /uuid.shtml
    # now it could be #uuid
    video_id = match1(url, '(' + uuid + ')')
    if video_id:
        return ifeng_download_by_id(video_id,
                                    None,
                                    output_dir=output_dir,
                                    merge=merge,
                                    info_only=info_only)

    html = get_content(url)
    video_id = match1(html, 'var vid="(' + uuid + ')"')
    if video_id is None:
        video_id = match1(html, r'"vid"\s*:\s*' + '"(' + uuid + ')"')
    if not video_id:
        # Raised explicitly: a bare ``assert`` disappears under ``python -O``.
        raise AssertionError("can't find video app")
    return ifeng_download_by_id(video_id,
                                title=title,
                                output_dir=output_dir,
                                merge=merge,
                                info_only=info_only)

コード例 #4

0

ファイルを表示

ファイル: crawler.py プロジェクト: lihaoABC/trans_api

def check_url(url, username, mongo):
    """Validate a URL and decide whether its site can be converted.

    The host and path are split out of ``url``; the query string is dropped
    (except for ``www.ku6.com``), the site key is derived from the host and
    checked against ``SITES``.  Every outcome is recorded through ``mongo``.

    :param url: address submitted for conversion.
    :param username: user name stored with newly inserted records.
    :param mongo: storage object providing ``exists``, ``update``, ``info``,
        ``insert`` and ``block``.
    :return: tuple ``(k, url)`` - the site key found in ``SITES`` and the
        possibly rewritten URL.
    :raises AssertionError: when the URL is malformed, the site is not in
        ``SITES``, or ``mongo.block(url)`` reports a duplicate.
    """
    video_host = match1(url, r'http[s]?://([^/]+)/')
    logging.debug(video_host)
    video_url = match1(url, r'http[s]?://[^/]+(.*)')
    logging.debug(video_url)
    # ``video_url`` is None for a non-http(s) URL.  Guard the search so that
    # such input reaches the format-error branch below instead of raising
    # TypeError here.
    if video_url and re.search(r'\?', video_url):
        if video_host != "www.ku6.com":
            url = url.replace(match1(video_url, r'(\?.*)'), '')
    if not (video_host and video_url):
        if mongo.exists(url):
            mongo.update(url, COD.FORMAT)
            raise AssertionError(r"格式错误")
        info = mongo.info(url, COD.FORMAT, username)
        mongo.insert(info)
        raise AssertionError(r"格式错误：{}".format(url))
    # Drop the trailing ".cn" so "x.com.cn" / "x.ac.cn" resolve like "x.com".
    if video_host.endswith((r'.com.cn', r'.ac.cn')):
        video_host = video_host[:-3]

    domain = match1(video_host, r'(\.[^.]+\.[^.]+)$') or video_host
    k = match1(domain, r'([^.]+)')
    # Temporary workaround for acfun.
    if k == "acfun":
        url = url.replace("https", "http")
    # Temporary workaround for qq.
    if re.search(r"new\.qq\.com", url) or re.search(r"v\.qq\.com", url):
        k = "qq"
    logging.debug("site is {}".format(k))
    if k not in SITES:
        if mongo.exists(url):
            mongo.update(url, COD.URLES)
        else:
            # NOTE(review): username is now passed like the other two
            # mongo.info calls in this function - confirm that is intended.
            info = mongo.info(url, COD.URLES, username)
            mongo.insert(info)
        raise AssertionError(r'不支持的url, k={}'.format(k))

    if mongo.exists(url):
        if mongo.block(url):
            mongo.update(url, COD.URLEX)
            raise AssertionError(r'此url重复')
    else:
        info = mongo.info(url, COD.BEGIN, username)
        mongo.insert(info)

    return k, url

コード例 #5

0

ファイルを表示

def baomihua_download(url):
    """Scrape a baomihua page and hand the result to the by-id downloader.

    :param url: address of the baomihua page.
    :return: the value of ``baomihua_download_by_id`` when ``news_type(url)``
        is ``"video"``; otherwise ``None``.
    """
    page = get_content(url)
    kind = news_type(url)

    # Page-level metadata lives in inline JavaScript variables.
    page_title = match1(page, r"var\s*temptitle\s*=\s*'(.*?)';")
    app_name = match1(page, r"var\s*appName\s*=\s*\"(.*?)\";")
    picture = match1(page, r"var\s*pic\s*=\s*\"(.*?)\";")
    flv_id = match1(page, r'flvid\s*=\s*(\d+)')

    if kind != "video":
        return None
    return baomihua_download_by_id(flv_id, page_title, app_name, picture, kind)

コード例 #6

0

ファイルを表示

def baomihua_download_by_id(_id, title, source, img_url, type):
    """Query the baomihua play API for a video id and build its metadata.

    :param _id: value substituted for ``flvid`` in the play-API request.
    :param title: known title, or ``None`` to take it from the API response.
    :param source: known source; when ``None`` the function returns ``None``.
    :param img_url: known thumbnail, or ``None`` to take it from the
        response.
    :param type: value copied into the ``type`` field of the result.
    :return: ``None`` when ``source`` is ``None``; otherwise a dict with the
        keys ``type``, ``title``, ``source``, ``thumbnail_urls``,
        ``image_urls``, ``video_url``, ``ext`` and ``size``.  ``size`` is the
        reported file size divided by 1024 twice and rounded to two
        decimals, or ``None`` when the response has no usable size.
    :raises AssertionError: when the response lacks one of the fields needed
        to build the video URL.
    """
    if source is None:
        # The result is None regardless of the response, so skip the
        # request entirely.
        return None

    html = get_content(
        'http://play.baomihua.com/getvideourl.aspx?flvid={}&devicetype='
        'phone_app'.format(_id))
    host = match1(html, r'host=([^&]*)')
    _type = match1(html, r'videofiletype=([^&]*)')
    vid = match1(html, r'&stream_name=([^&]*)')
    dir_str = match1(html, r'&dir=([^&]*)')
    if any(part is None for part in (host, _type, vid, dir_str)):
        # Fail loudly rather than emit a URL containing the text "None".
        raise AssertionError(
            "incomplete baomihua response for flvid {}".format(_id))
    video_url = 'http://{}/{}/{}.{}'.format(host, dir_str.strip(), vid, _type)
    logging.debug("url is {}".format(video_url))

    if title is None:
        raw_title = match1(html, r'&title=([^&]*)')
        if raw_title is not None:
            title = urllib.parse.unquote(raw_title)
    if img_url is None:
        img_url = match1(html, r'&video_img=([^&]*)')

    try:
        size = float("{:.2f}".format(
            int(match1(html, r'&videofilesize=([^&]*)')) / 1024 / 1024))
    except (TypeError, ValueError):
        # Field absent (None) or not an integer string.
        size = None

    return {
        "type": type,
        "title": title,
        "source": source,
        "thumbnail_urls": [img_url],
        "image_urls": None,
        "video_url": [video_url],
        "ext": _type,
        "size": size,
    }

コード例 #7

0

ファイルを表示

ファイル: ifeng.py プロジェクト: lihaoABC/trans_api

def ifeng_download_by_id(id,
                         title=None,
                         output_dir=output_dir,
                         merge=True,
                         info_only=False):
    """Fetch the ifeng video-info XML for a UUID and return its metadata.

    :param id: video UUID (lower-case hex, 8-4-4-4-12).
    :param title: title handed to ``download_urls``; when falsy the title
        parsed from the XML is used instead.
    :param output_dir: forwarded to ``download_urls``.
    :param merge: forwarded to ``download_urls``.
    :param info_only: when true, nothing is downloaded.
    :return: dict with the keys ``title``, ``source``, ``thumbnail_urls``
        (a single URL string, or ``None`` when the attribute is missing) and
        ``video_url`` (a single URL string).
    :raises AssertionError: when ``id`` does not contain a UUID.
    """
    if not match1(
            id,
            r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})'):
        # Raised explicitly: a bare ``assert`` disappears under ``python -O``.
        raise AssertionError(id)
    url = 'http://vxml.ifengimg.com/video_info_new/{}/{}/{}.xml'.format(
        id[-2], id[-2:], id)
    xml = get_content(url)
    # Title
    title_real = match1(xml, r'Name="([^"]+)"')
    title_real = unescape(title_real)
    # Source
    source = match1(xml, r'ColumnName="([^"]+)"')
    source = unescape(source)
    # Thumbnail
    thumbnail_urls = match1(xml, 'SmallPosterUrl="([^"]+)"')
    # Video download link
    video_url = match1(xml, r'VideoPlayUrl="([^"]+)"')
    video_url = video_url.replace('http://wideo.ifeng.com/',
                                  'http://ips.ifeng.com/wideo.ifeng.com/')
    # The first element returned by url_info is not needed here.
    _, ext, size = url_info(video_url)
    data = {
        "title": title_real,
        "source": source,
        "thumbnail_urls": thumbnail_urls,
        "video_url": video_url,
    }
    if not info_only:
        # Callers may pass title=None; fall back to the title parsed from
        # the XML so the download is not given a None title.
        download_urls([video_url],
                      title or title_real,
                      ext,
                      size,
                      output_dir,
                      merge=merge,
                      headers=headers)

    return data

コード例 #8

0

ファイルを表示

ファイル: bilibili.py プロジェクト: lihaoABC/trans_api

def bilibili_download(url):
    """Scrape a bilibili page and return its metadata.

    :param url: address of the bilibili page.
    :return: dict with the keys ``type``, ``title``, ``source``,
        ``thumbnail_urls`` (the list returned by the first thumbnail XPath
        that yields anything), ``image_urls``, ``video_url`` (always
        ``None``), ``ext`` and ``size``.
    :raises IndexError: when the page has no ``itemprop="author"`` meta tag.
    """
    response = get_content(url)
    html = etree.HTML(response)

    # Take the first non-empty title.  Each result list is tested before it
    # is indexed, so a page without <title> falls through to the meta tags
    # instead of raising IndexError.
    title = None
    for query in ('//title/text()',
                  '//meta[@itemprop="name"]/@content',
                  '//meta[@property="og:title"]/@content'):
        found = html.xpath(query)
        if found and found[0]:
            title = found[0]
            break
    if title is not None:
        # Strip the site suffix when present; otherwise keep the raw title.
        title = match1(title, r'(.*?)_哔哩哔哩') or title

    # Same fallback chain for the thumbnail; the last query's result is
    # kept even when it is empty.
    thumbnail_url = []
    for query in ('//meta[@itemprop="thumbnailUrl"]/@content',
                  '//meta[@itemprop="image"]/@content',
                  '//meta[@property="og:image"]/@content'):
        thumbnail_url = html.xpath(query)
        if thumbnail_url:
            break

    source = html.xpath('//meta[@itemprop="author"]/@content')[0]

    return {
        "type": news_type(url),
        "title": title,
        "source": source,
        "thumbnail_urls": thumbnail_url,
        "image_urls": None,
        "video_url": None,
        "ext": None,
        "size": None,
    }

コード例 #9

0

ファイルを表示

ファイル: sohu.py プロジェクト: lihaoABC/trans_api

def sohu_video_download(url):
    """Resolve a sohu video page into its clip URLs and metadata.

    :param url: sohu page address.
    :return: for ``tv.sohu.com`` URLs a dict with the keys ``type``,
        ``title``, ``source``, ``thumbnail_urls``, ``image_urls``,
        ``video_url`` (list of clip URLs), ``ext`` and ``size``; ``None``
        for any other URL.
    :raises AssertionError: when no video id can be found, or when the clip
        lists in the play info have different lengths.
    :raises KeyError: when the play info lacks an expected field.
    """
    if re.match(r'http[s]?://share\.vrs\.sohu\.com', url):
        vid = match1(url, r'id=(\d+)')
        source = None
    else:
        html = get_content(url, charset="GBK")
        vid = match1(html, r'\Wvid\s*[\:=]\s*[\'"]?(\d+)[\'"]?;')
        # NOTE(review): the pattern in the original text was garbled to
        # '******', which does not compile ("multiple repeat") and has no
        # group for .group(1).  A lazy capture between the quotes is assumed
        # here - confirm against the upstream file.
        owner = re.search(r"var\s*wm_username='(.*?)';", html)
        source = owner.group(1) if owner else None
    if not vid:
        # Raised explicitly: a bare ``assert`` disappears under ``python -O``.
        raise AssertionError("视频vid获取失败，请检查url")

    if not re.match(r'http[s]?://tv\.sohu\.com/', url):
        return None

    info = json.loads(
        get_content(
            'http://hot.vrs.sohu.com/vrs_flash.action?vid={}'.format(vid)))
    if info.get("data"):
        # Walk the alternative video ids and switch to the first one whose
        # play info carries an 'allot' entry.
        # NOTE(review): ``vid`` is a string here, so ``hqvid != vid`` can
        # only be false if the API returns ids as strings - verify.
        for qtyp in [
                'oriVid', 'superVid', 'highVid', 'norVid', 'relativeId'
        ]:
            hqvid = info['data'][qtyp] if 'data' in info else info[qtyp]
            if hqvid != 0 and hqvid != vid:
                info = json.loads(
                    get_content(
                        'http://hot.vrs.sohu.com/vrs_flash.action?vid={}'.
                        format(hqvid)))
                if 'allot' not in info:
                    continue
                break
    else:
        info = json.loads(
            get_content(
                'http://my.tv.sohu.com/play/videonew.do?vid={}&referer='
                'http://my.tv.sohu.com'.format(vid)))

    title, source, thumbnail_url, size, urls = _sohu_collect(info, source)
    return {
        "type": 'video',
        "title": title,
        "source": source,
        "thumbnail_urls": [thumbnail_url],
        "image_urls": None,
        "video_url": urls,
        "ext": None,
        "size": size,
    }


def _sohu_collect(info, source):
    """Extract title, source, thumbnail, total size and clip URLs from info."""
    # The original read these two keys before anything else; keep failing
    # with KeyError when the play info is incomplete.
    for required in ('allot', 'tvid'):
        if required not in info:
            raise KeyError(required)
    if not source:
        wm_data = info["wm_data"] if "wm_data" in info else {}
        source = (wm_data["wm_username"]
                  if "wm_username" in wm_data else "crawl")
    data = info['data']
    title = data['tvName']
    thumbnail_url = data["coverImg"]
    # int() leaves integer sizes unchanged and also accepts numeric strings.
    size = sum(map(int, data['clipsBytes']))
    if not (len(data['clipsURL']) == len(data['clipsBytes']) == len(
            data['su'])):
        raise AssertionError()
    urls = [
        real_url(file_name, key, data['ch'])
        for file_name, key in zip(data['su'], data['ck'])
    ]
    return title, source, thumbnail_url, size, urls