def start_download(local_dic, pics_dic, path_month):
    """
    Check which albums are already downloaded,
    then download the albums that are missing or incomplete.
    """
    for pics in pics_dic:
        print("Downloading: {0}".format(pics))
        url_pics = pics_dic[pics]           # URL of the album to download
        num_start = local_dic[pics]         # image number to resume from

        soup = comunits.send_requests(url_pics, referer=url_ck, proxy=proxy, need="soup")
        tags = soup.select(".gal_list a[target]")
        url_img_list = [img['href'] for img in tags]

        n = num_start - 1                   # counter used to number the images
        ns = len(url_img_list)
        for url in url_img_list[num_start - 1:]:
            n += 1
            if n == ns:                     # mark the last image with an "L" suffix
                path_img = path_month + "\\" + pics + "_" + str(n) + "_L.jpg"
            else:
                path_img = path_month + "\\" + pics + "_" + str(n) + "_.jpg"
            response = comunits.send_requests(url, referer=url_pics, proxy=proxy, need="response")
            with open(path_img, "wb") as f:
                f.write(response.content)   # the with block closes the file
            comunits.show_bar(n, ns)        # progress bar
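start_download expects local_dic to already map each album name to the image number to resume from. A minimal sketch of how such a dict might be built from the "name_n_.jpg" files the loop writes; the helper name and the directory-scan approach are assumptions, not part of the original code:

import os

def build_local_dic(pics_dic, path_month):
    # Hypothetical helper: resume number = count of already saved images + 1
    local_dic = {}
    for pics in pics_dic:
        saved = [f for f in os.listdir(path_month)
                 if f.startswith(pics + "_") and f.endswith(".jpg")]
        local_dic[pics] = len(saved) + 1
    return local_dic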
Example #2
def get_url_m3u8(url_video):
    """
    Extract url_m3u8 from the playback page.
    """
    args = {"referer": url_home, "proxy": proxy, "need": "xpath"}
    obj = comunits.send_requests(url_video, **args)
    script = obj.xpath(
        "//div[@class='original mainPlayerDiv']/script/text()")[0]
    video_id = obj.xpath(
        "//div[@class='original mainPlayerDiv']/@data-video-id")[0]
    # print(script, video_id, sep='\n')
    js = "var playerObjList = {};" + script
    js_obj = js_compile(js)
    dic_list = js_obj.eval(
        "flashvars_{0}['mediaDefinitions']".format(video_id))
    dic = dic_list[-1]      # take the last media definition
    quality = dic["quality"]
    url_m3u8 = dic["videoUrl"]
    # print(quality, url_m3u8, sep="\n")
    # Verify that the URL really points at an HLS playlist
    r = comunits.send_requests(url_m3u8,
                               referer=url_video,
                               origin=url_home,
                               proxy=proxy,
                               need="response")
    if "#EXTM3U" in r.text:
        return url_m3u8, quality
    else:
        print("The player page markup has changed; cannot locate the m3u8 URL")
        return "", ""
Example #3
def get_pics(page):
    """
    Build the page URL and its referer; the referer for page N is page N-1.
    Return a dict of every album on the page: name -> link.
    """
    # Build the page URL and referer
    if page == 1:
        url_page = url_home
        referer_page = url_page
    elif page == 2:
        url_page = url_home + '/page/2/'
        referer_page = url_home
    else:
        url_page = url_home + '/page/' + str(page) + '/'
        referer_page = url_home + '/page/' + str(page - 1) + '/'  # previous page's url

    # Collect every album on the page: name -> link
    soup = comunits.send_requests(url_page, referer=referer_page, need="soup")
    items = soup.select('.postlist li')
    pics_dic = {}
    for item in items:
        link = item.select('a')[1]
        pics_dic[link.text] = link['href']

    return pics_dic, url_page
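A hedged usage sketch of the pager; url_home is a module global the snippet assumes, and the page range is illustrative:

# Illustrative driver: walk the first three listing pages
for page in range(1, 4):
    pics_dic, url_page = get_pics(page)
    print(url_page, "->", len(pics_dic), "albums")
    for name, href in pics_dic.items():
        print(" ", name, href)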
Example #4
def start_download(pics, url_pics, num_start, url_page, path_page):
    """
    Get the total number of images in the album,
    build each image's URL and referer,
    then download the images in a loop.
    """

    # Read the total image count from the album's front page
    soup = comunits.send_requests(url_pics, referer=url_page, need="soup")
    nums = int(soup.select('.pagenavi > a')[-2].text)
    print("Downloading: {0}".format(pics))
    # Download loop
    for num in range(num_start, nums + 1):
        # The album's front page already carries the first image's download URL
        if num == 1:
            img = soup.select('.main-image>p>a>img')
            url_img = img[0].attrs['src']
            url_img_show = url_pics
        else:
            if num == 2:
                url_img_show = url_pics + '/2'
                referer_show = url_pics
            else:
                url_img_show = url_pics + '/' + str(num)
                referer_show = url_pics + '/' + str(num - 1)

            # Fetch the viewer page and pull the image's download URL
            soup = comunits.send_requests(url_img_show,
                                          referer=referer_show,
                                          need="soup")
            img = soup.select('.main-image>p>a>img')
            url_img = img[0].attrs['src']

        response = comunits.send_requests(url_img,
                                          referer=url_img_show,
                                          need="response")
        if num == nums:     # mark the last image with an "L" suffix
            path_img = path_page + "\\" + pics + "_" + str(num) + "_L.jpg"
        else:
            path_img = path_page + "\\" + pics + "_" + str(num) + "_.jpg"

        with open(path_img, 'wb') as f:
            f.write(response.content)
        comunits.show_bar(num, nums)  # progress bar
        time.sleep(0.3)               # throttle requests
Example #5
    def get_ts_91MJW(self, url_m3u8):
        """
        There may be one m3u8 or two; the ts index may be 6 digits,
        3 digits, or any other width.

        return: the url_ts template and the total ts count,
                used to build a ts-URL generator
        """

        m3u8 = comunits.send_requests(url_m3u8,
                                      origin=self.origin,
                                      need="response")
        m3u8 = m3u8.text
        print("Excerpt of the first m3u8 file:", m3u8[:350], m3u8[-150:], sep='\n')

        # Work out how to reach the ts index file
        if ".m3u8" in m3u8:  # there is a second, nested m3u8
            part = re.findall(r"\n(.+)m3u8", m3u8)
            m3u8b = part[0] + "m3u8"
            url_m3u8b = parse.urljoin(url_m3u8, m3u8b)
            ts = comunits.send_requests(url_m3u8b,
                                        origin=self.origin,
                                        need="response")
            ts = ts.text
        elif ".ts" in m3u8:
            ts = m3u8
            url_m3u8b = url_m3u8
        else:
            return "", 0

        # Parse the ts names
        # First and last ts in the playlist
        ts_start = re.search(r"(.*)\.ts", ts).group(1)
        ts_end = re.search(r"(.*)\.ts\n#EXT-X-ENDLIST", ts).group(1)
        # Find where the numeric index begins: the first position at which
        # the first and last ts names differ
        diff = 0
        for i in range(min(len(ts_start), len(ts_end))):
            if ts_start[i] != ts_end[i]:
                diff = i
                break
        # Build the ts template
        ts_pat = ts_end[:diff] + "{0}.ts"
        url_ts_pat = parse.urljoin(url_m3u8b, ts_pat)
        # Total ts count: last index + 1, assuming indices start at 0
        ts_total = int(ts_end[diff:]) + 1

        return url_ts_pat, ts_total
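The docstring says the returned template and count feed a ts-URL generator. A minimal sketch of that generator, under the assumption that indices are plain decimals; pass width if the site zero-pads them (the helper and its width parameter are hypothetical):

def ts_urls(url_ts_pat, ts_total, width=0):
    # Hypothetical generator over segment URLs; width > 0 zero-pads the index
    for i in range(ts_total):
        yield url_ts_pat.format(str(i).zfill(width) if width else i)

# e.g. list(ts_urls("https://cdn.example.com/seg{0}.ts", 3)) ->
# [".../seg0.ts", ".../seg1.ts", ".../seg2.ts"]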
Example #6
    def get_ts_pb(self, url_m3u8):
        """
        Like get_ts_91MJW, but for playlists whose segments
        follow the "seg-N-..." naming scheme.
        """
        args_requests = {
            "referer": self.referer,
            "origin": self.origin,
            "proxy": self.proxy,
            "need": "response"
        }

        m3u8 = comunits.send_requests(url_m3u8, **args_requests)
        m3u8 = m3u8.text
        # print("Excerpt of the first m3u8 file:", m3u8[:350], m3u8[-150:], sep='\n')

        # Work out how to reach the ts index file
        if ".m3u8" in m3u8:  # there is a second, nested m3u8
            a = re.search(r"(.*)\.m3u8(.*)", m3u8).group(1, 2)
            m3u8b = a[0] + ".m3u8" + a[1]
            url_m3u8b = parse.urljoin(url_m3u8, m3u8b)
            ts = comunits.send_requests(url_m3u8b, **args_requests)
            ts = ts.text
        elif ".ts" in m3u8:
            ts = m3u8
            url_m3u8b = url_m3u8
        else:
            print("Unexpected content in the first m3u8 file")
            return "", 0

        print(url_m3u8b)
        # Build the ts template
        if "seg-" in ts:
            name = url_m3u8b.rsplit("/", 1)[-1]
            ts_pat = name.replace("index", "seg-{0}").replace(".m3u8", ".ts")
            url_ts_pat = parse.urljoin(url_m3u8b, ts_pat)
            # Total ts count, read from the last segment before #EXT-X-ENDLIST
            ts_total = int(re.search(r"seg-(.*?)-.*\n#EXT-X-ENDLIST", ts).group(1))
            return url_ts_pat, ts_total
        else:
            print("The ts naming scheme has changed")
            return "", 0
Example #7
def get_url_m3u8(i, url_play):
    """Find the first m3u8 URL (url_m3u8) on the playback page.
    There are as many m3u8s as there are playback sources.
    """

    episode_dic_list = get_source(url_play)
    for episode_dic in episode_dic_list:
        url_play_epis = episode_dic.get(i)
        # Extract vid
        obj = comunits.send_requests(url_play_epis, referer=url_info, need="xpath")
        script = obj.xpath('//section[@class="container"]/script[@type="text/javascript"]/text()')[0]
        vid = findall("vid.*?=(.*);", script)
        vid = vid[0].strip()
        vid = eval(vid).strip()     # eval strips the surrounding JS string quotes
        # URL-decode
        url_m3u8 = unquote(vid)
        # Test that the link works
        r = comunits.send_requests(url_m3u8, origin=url_origin, need="response")
        if r.status_code == 200:
            return url_m3u8
    # Every link was invalid
    return ""
Example #8
def get_source(url_play):
    # Find the per-episode playback-page URLs of the backup and exclusive sources
    obj = comunits.send_requests(url_play, referer=url_info, need="xpath")
    play_container = obj.xpath('//div[@id="playcontainer"]//section')
    episode_dic_list = []  # each element maps episode number -> url for one source
    for s in play_container[1:]:
        name_epis = s.xpath('./a/text()')
        # pull the episode number out of "第N集" ("Episode N")
        name_epis = [int(findall("第(.*)集", i)[0]) for i in name_epis]
        key_epis = s.xpath('./a/@href')
        url_epis = [urljoin(url_play, i) for i in key_epis]
        episode_dic_list.append(dict(zip(name_epis, url_epis)))

    return episode_dic_list
Example #9
def get_info():
    """Scrape the TV-series detail page.
    :return
        name_section (str): series name, with season
        episode_dic (dict): episode number -> key part of the url
        info_section (list): director, cast, rating, etc.
        introduce (str): plot synopsis
    """

    tree = comunits.send_requests(url_info, referer=url_home, need="xpath")

    # Series name, which sits between 《 》 quotation marks
    name_section = tree.xpath('//h1[@class="article-title"]//text()')[0]
    name_section = findall("《(.*)》", name_section)[0]

    # Per-episode url key and episode number
    id_epis = tree.xpath('//a[@onclick="play(this)"]/@id')
    key_epis = ["/vplay/" + i + ".html" for i in id_epis]
    name_epis = tree.xpath('//a[@onclick="play(this)"]/text()')
    name_epis = [int(findall("第(.*)集", i)[0]) for i in name_epis]
    episode_dic_main = dict(zip(name_epis, key_epis))

    # Series info
    div = tree.xpath("//div[@class='video_info']")[0]
    # Convert the lxml.etree._Element to a string
    div = etree.tostring(div, encoding="utf8")
    div = div.decode("utf8")

    # Pre-process the string, then extract with a regex
    div = div.replace("</strong>", '').replace("<br/>", '')
    info_section = findall("<strong>(.*)", div)

    # Plot synopsis
    intro = tree.xpath('//p[@class="jianjie"]//text()')
    introduce = "".join(intro)

    return name_section, episode_dic_main, info_section, introduce
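A hedged usage sketch; url_info and url_home are module globals the snippet assumes:

name_section, episode_dic_main, info_section, introduce = get_info()
print(name_section)
for line in info_section:        # director, cast, rating, ...
    print(line)
print(sorted(episode_dic_main))  # available episode numbers
print(introduce)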
Example #10
def get_pics(year, month):
    """
    Return pics_dic {album name: album url}.
    Build the monthly gallery page's URL and referer,
    then collect each album's URL and name from that page.
    """
    url_month = url_ck + "/?s={0}-{1}".format(month, year)
    href = []
    name = []
    soup = comunits.send_requests(url_month, referer=url_ck, proxy=proxy, need="soup")
    tags = soup.select(".gal_list a")
    for tag in tags:
        t = tag['href']
        name.append(t.split('/')[-3])                 # album name from the path
        href.append("https://www.kindgirls.com" + t)  # absolute album url
    pics_dic = dict(zip(name, href))
    return pics_dic
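A hypothetical driver tying this to start_download from Example #1; build_local_dic is the assumed helper sketched there, and the year, month, and path values are illustrative:

pics_dic = get_pics(2020, 6)          # albums for June 2020 (illustrative)
path_month = "D:\\pics\\2020-06"      # assumed download directory
local_dic = build_local_dic(pics_dic, path_month)
start_download(local_dic, pics_dic, path_month)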
Example #11
import re
import comunits

url = "https://mp.weixin.qq.com/s/s5ow4FoOKDS_DA6sPY1ysA"

tree = comunits.send_requests(url, url, need="xpath")
ret = tree.xpath("//div[@id='js_content']/p/text()")
ret.pop(0)   # drop the opening paragraph
ret.pop(-1)  # drop the closing paragraph

candidates = []
alive = []
pat = "http.*com|http.*cn|http.*net|http.*me"  # crude pattern for URLs ending in com/cn/net/me

# Collect every URL-looking string from the paragraphs
for i in ret:
    if "http" in i:
        candidates += re.findall(pat, i)

# Keep only the links that still respond with 200
for i in candidates:
    try:
        r = comunits.send_requests(i, i, need="response", mode="empty")
    except Exception:
        continue
    if r.status_code == 200:
        alive.append(i)

for i in alive:
    print(i)