Example #1
0
def get_work_list_page(url, page_ind):
    """Fetch one page of a work listing and extract the works it links to.

    :param url: listing URL; ``?page=<page_ind>`` is appended to it.
    :param page_ind: index of the page to request. The pager is only
        inspected when this is 0.
    :return: ``(work_list, page_last)``. ``work_list`` is a list of
        ``{"url": ..., "work": ...}`` dicts. ``page_last`` is the integer
        read from the second-to-last pager item, or -1 when the pager was
        not inspected, is too short, or its label is not a number.
    """
    res = requests_dora.try_best_2_get(
        "{}?page={}".format(url, page_ind),
        headers=requests_dora.get_default_headers(),
        timeout=30,
        get_proxies_fun=get_proxies)
    soup = BeautifulSoup(res.text, "lxml")

    page_last = -1
    if page_ind == 0:
        pager_items = soup.select("ul.pager li")
        # The second-to-last item is read, so at least two items are needed;
        # presumably the last one is a "next"/"last" link - TODO confirm.
        if len(pager_items) >= 2:
            try:
                page_last = int(pager_items[-2].get_text())
            except ValueError:
                # Non-numeric label: report "unknown" instead of crashing.
                page_last = -1

    work_list = []
    for div in soup.select("div.views-row"):
        link = div.select_one("div.views-field-tid a")
        if link is None:
            continue
        # Use a separate name so the ``url`` argument is not overwritten.
        work_url = "https://www.juzimi.com{}".format(link["href"])

        title_link = div.select_one("a.xqallarticletilelink")
        if title_link is None:
            continue
        work = title_link.text
        if work == "":
            work = "unknown"
        work_list.append({
            "url": work_url,
            "work": work,
        })

    return work_list, page_last
Example #2
0
def get_song_list(dissid):
    """Fetch the songs of one playlist from the QQ Music playlist endpoint.

    :param dissid: playlist id; sent as the ``disstid`` query parameter and
        used to build the referer header.
    :return: list of dicts with the keys ``songmid``, ``songid``,
        ``albumname``, ``songname``, ``singer`` (comma-joined names) and
        ``tags`` (comma-joined playlist tag names). Songs lacking
        ``songmid`` or ``songid`` are skipped. An empty list is returned
        when the response contains no playlist.
    """
    url = "https://c.y.qq.com/qzone/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg"
    # NOTE(review): the cookie below is a hard-coded account session; it
    # should probably come from configuration rather than source code.
    header = {
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "cookie":
        "RK=7dNm4/X + Yj; tvfe_boss_uuid=bf00ee54e9081ab4; pgv_pvi=8772238336; pac_uid=1_857193777; pgv_pvid=6457341280; o_cookie=857193777; ptcz=c761e59c8c8d6bd5198866d02a5cb7313af1af468006c455d6c2b5d26201d42e; pgv_si=s10759168; _qpsvr_localtk=0.08285763449905015; ptisp=ctc; luin=o0857193777; lskey=00010000228dd1371b945c68ecfd3b71d3071425024a7a8a2a23e3ffcb5b9904c9f7088d2ea8c01539ffed92; pt2gguin=o0857193777; uin=o0857193777; skey=@Kydi7w0EI; p_uin=o0857193777; p_skey=HjsE9sEjznJfXk*9KFEeW4VZr6i3*tlXZ2nuzEw8kCg_; pt4_token=c-p6sv3JEboA51cSQ3ABqxM8O80Jct3jYYkgy-aEQuE_; p_luin=o0857193777; p_lskey=000400008f9c296cd10c03a5173d22a184aad124d791568e90e4198beb8ad699a4d02fbfc059f71ab3d8758c; ts_last=y.qq.com/portal/playlist.html; ts_refer=ui.ptlogin2.qq.com/cgi-bin/login; ts_uid=3392060960",
        "referer": "https://y.qq.com/n/yqq/playlist/{}.html".format(dissid)
    }
    paramters = {
        "type": "1",
        "json": "1",
        "utf8": "1",
        "onlysong": "0",
        "disstid": dissid,
        "format": "jsonp",
        "g_tk": "1089387893",
        "jsonpCallback": "playlistinfoCallback",
        "loginUin": "857193777",
        "hostUin": "0",
        "inCharset": "utf8",
        "outCharset": "utf-8",
        "notice": 0,
        "platform": "yqq",
        "needNewCode": 0
    }
    html_text = requests_dora.try_best_2_get(url=url,
                                             params=paramters,
                                             headers=header).text

    # Unwrap the JSONP envelope "callback(...)". str.lstrip/rstrip take a
    # *set of characters*, not a prefix/suffix, so slice explicitly instead.
    payload = html_text.strip()
    if not payload.startswith(("{", "[")):
        start, end = payload.find("("), payload.rfind(")")
        if start != -1 and end > start:
            payload = payload[start + 1:end]

    cdlist = json.loads(payload)["cdlist"]
    if not cdlist:
        # No playlist in the response: nothing to extract.
        return []
    playlist = cdlist[0]

    tags = ", ".join([tag["name"] for tag in playlist["tags"]])
    song_list = []
    for item in playlist["songlist"]:
        # Both identifiers are required; entries lacking either are skipped.
        if "songmid" not in item or "songid" not in item:
            continue
        song_list.append({
            "songmid": item["songmid"],
            "songid": item["songid"],
            "albumname": item["albumname"],
            "songname": item["songname"],
            "singer": ", ".join([s["name"] for s in item["singer"]]),
            "tags": tags,
        })
    return song_list
Example #3
0
def get_lyric(song):
    """Fetch and parse the time-stamped lyric lines of one song.

    :param song: dict providing ``songid`` (sent as ``musicid``) and
        ``songmid`` (used in the referer header).
    :return: list of ``{"time": seconds, "line": text}`` dicts, where
        ``time`` is the offset in seconds including minutes and the
        fractional part. Lines containing a colon, empty lines and the
        "instrumental, no lyrics" placeholder are dropped, and the first
        remaining line is then discarded as well. Returns ``[]`` when the
        response carries no ``lyric`` field.
    """
    songid = song["songid"]
    songmid = song["songmid"]
    url = "https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric.fcg"
    header = {
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "referer": "https://y.qq.com/n/yqq/song/{}.html".format(songmid)
    }
    paramters = {
        "nobase64": 1,
        "musicid": songid,
        "callback": "jsonp1",
        "g_tk": "1134533366",
        "jsonpCallback": "jsonp1",
        "loginUin": "0",
        "hostUin": "0",
        "format": "jsonp",
        "inCharset": "utf8",
        "outCharset": "utf-8",
        "notice": "0",
        "platform": "yqq",
        "needNewCode": "0"
    }
    html_text = requests_dora.try_best_2_get(url=url,
                                             params=paramters,
                                             headers=header).text

    # Unwrap the JSONP envelope "jsonp1(...)". str.lstrip/rstrip take a set
    # of characters rather than a prefix/suffix, so slice explicitly.
    payload = html_text.strip()
    if not payload.startswith(("{", "[")):
        start, end = payload.find("("), payload.rfind(")")
        if start != -1 and end > start:
            payload = payload[start + 1:end]
    res = json.loads(payload)

    if "lyric" not in res:
        return []

    lyric = res["lyric"]
    # decode: HTML entities are unescaped twice, then percent-escapes.
    lyric = html.unescape(lyric)
    lyric = html.unescape(lyric)
    lyric = parse.unquote(lyric)

    timestamp_re = re.compile(r"(\d+):(\d+)(?:\.(\d+))?")
    lyric_lines = []
    for match in re.finditer(r"\[(.*?)\](.+)", lyric):
        stamp = timestamp_re.fullmatch(match.group(1))
        if stamp is None:
            # The bracket tag is not a "mm:ss[.xx]" timestamp; skip it
            # instead of crashing while parsing it.
            continue
        minutes, seconds, fraction = stamp.groups()
        offset = int(minutes) * 60 + int(seconds)
        if fraction:
            offset += float("0.{}".format(fraction))

        line = match.group(2).strip()
        if re.search("[::]",
                     line) or line == "" or line == "此歌曲为没有填词的纯音乐,请您欣赏":
            continue
        lyric_lines.append({
            "time": offset,
            "line": line,
        })

    # The first kept line is dropped; presumably it is a title/header line
    # rather than a sung lyric - TODO confirm.
    return lyric_lines[1:]
Example #4
0
def get_sentences_page(url, page_ind, max_retries=10):
    """Fetch one page of a sentence listing and extract its sentences.

    The page is re-requested (with a freshly chosen User-Agent) while it
    yields no sentences, up to ``max_retries`` additional attempts.

    :param url: listing URL; ``?page=<page_ind>`` is appended to it.
    :param page_ind: index of the page to request. The "last page" link is
        only inspected when this is 0.
    :param max_retries: how many extra attempts to make when a page yields
        no sentences (previously the function recursed without any bound).
    :return: ``(sentence_list, page_last)``. ``sentence_list`` is a list of
        ``{"sentence", "work", "author", "like"}`` dicts; it is empty when
        every attempt came back empty. ``page_last`` is the integer text of
        the ``li.pager-last`` link, or -1 when not inspected or absent.
    """
    page_url = "{}?page={}".format(url, page_ind)
    like_re = re.compile(r"喜欢\((\d+)\)")

    sentence_list = []
    page_last = -1
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        headers = requests_dora.get_default_headers()
        # random.choice picks one agent; random.shuffle returns None and
        # would both blank the header and reorder the shared list.
        headers["User-Agent"] = random.choice(default.USER_AGENTS)

        res = requests_dora.try_best_2_get(page_url,
                                           headers=headers,
                                           timeout=20,
                                           get_proxies_fun=get_proxies)
        soup = BeautifulSoup(res.text, "lxml")

        page_last = -1
        if page_ind == 0:
            last_link = soup.select_one("li.pager-last a")
            if last_link is not None:
                page_last = int(last_link.get_text())

        sentence_list = []
        for div in soup.select("div.view-content div.views-field-phpcode"):
            sentence_link = div.select_one("a.xlistju")
            if sentence_link is None:
                # No sentence anchor in this entry; nothing to record.
                continue
            sentence = sentence_link.get_text()

            author = ""
            work = ""
            from_ = div.select_one("div.xqjulistwafo")
            if from_ is not None:
                author_link = from_.select_one("a[title*='作者']")
                author = author_link.get_text() if author_link else ""
                work_link = from_.select_one("a[title*='出自']")
                work = work_link.get_text() if work_link else ""

            like_link = div.select_one("a[title*='喜欢本句']")
            like_text = like_link.get_text() if like_link else ""
            se_like = like_re.search(like_text)
            like = int(se_like.group(1)) if se_like else 0

            sentence_list.append({
                "sentence": sentence,
                "work": work,
                "author": author,
                "like": like,
            })

        if sentence_list:
            break
        if attempt < attempts - 1:
            logging.warning("the len of %s is 0, re-crawling...", url)

    return sentence_list, page_last
Example #5
0
def crawl_song_list_page(singer_mid, begin):
    """Fetch one page (30 entries requested) of a singer's track list.

    :param singer_mid: singer identifier; sent as ``singermid`` and used to
        build the referer header.
    :param begin: value of the ``begin`` query parameter (offset of the
        first track to return, as far as the parameter name suggests).
    :return: ``(song_list, total)``. ``song_list`` is a list of dicts with
        the keys ``albumid``, ``albummid``, ``albumname``, ``songid``,
        ``songmid`` and ``songname`` copied from each entry's ``musicData``;
        ``total`` is the response's ``total`` field converted to ``int``.
    """
    url = "https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg"
    # NOTE(review): the cookie below is a hard-coded account session; it
    # should probably come from configuration rather than source code.
    header = {
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "cookie":
        "RK=7dNm4/X + Yj; tvfe_boss_uuid=bf00ee54e9081ab4; pgv_pvi=8772238336; pac_uid=1_857193777; pgv_pvid=6457341280; o_cookie=857193777; ptcz=c761e59c8c8d6bd5198866d02a5cb7313af1af468006c455d6c2b5d26201d42e; pgv_si=s10759168; _qpsvr_localtk=0.08285763449905015; ptisp=ctc; luin=o0857193777; lskey=00010000228dd1371b945c68ecfd3b71d3071425024a7a8a2a23e3ffcb5b9904c9f7088d2ea8c01539ffed92; pt2gguin=o0857193777; uin=o0857193777; skey=@Kydi7w0EI; p_uin=o0857193777; p_skey=HjsE9sEjznJfXk*9KFEeW4VZr6i3*tlXZ2nuzEw8kCg_; pt4_token=c-p6sv3JEboA51cSQ3ABqxM8O80Jct3jYYkgy-aEQuE_; p_luin=o0857193777; p_lskey=000400008f9c296cd10c03a5173d22a184aad124d791568e90e4198beb8ad699a4d02fbfc059f71ab3d8758c; ts_last=y.qq.com/portal/playlist.html; ts_refer=ui.ptlogin2.qq.com/cgi-bin/login; ts_uid=3392060960",
        "referer": "https://y.qq.com/n/yqq/singer/{}.html".format(singer_mid)
    }

    paramter = {
        "g_tk": "5381",
        "jsonpCallback": "MusicJsonCallbacksinger_track",
        "loginUin": "0",
        "hostUin": "0",
        "format": "jsonp",
        "inCharset": "utf8",
        "outCharset": "utf-8",
        "notice": "0",
        "platform": "yqq",
        "needNewCode": "0",
        "singermid": singer_mid,
        "order": "listen",
        "begin": begin,
        "num": "30",
        "songstatus": "1",
    }

    html_text = requests_dora.try_best_2_get(url=url,
                                             params=paramter,
                                             headers=header).text

    # Unwrap the JSONP envelope "callback(...)". str.lstrip/rstrip take a
    # *set of characters*, not a prefix/suffix, so slice explicitly instead.
    json_str = html_text.strip()
    if not json_str.startswith(("{", "[")):
        start, end = json_str.find("("), json_str.rfind(")")
        if start != -1 and end > start:
            json_str = json_str[start + 1:end]

    data = json.loads(json_str)["data"]
    song_list = data["list"]
    total_num = data["total"]

    # Only these fields of each entry's "musicData" are kept.
    wanted = ("albumid", "albummid", "albumname", "songid", "songmid",
              "songname")
    song_list_new = []
    for entry in song_list:
        music = entry["musicData"]
        song_list_new.append({key: music[key] for key in wanted})
    return song_list_new, int(total_num)
Example #6
0
def get_album_list(sin, ein):
    """Fetch a slice of the QQ Music playlist ("diss") listing.

    :param sin: start position, sent as the ``sin`` query parameter.
    :param ein: end position, sent as the ``ein`` query parameter; used
        together with ``sin`` for paging.
    :return: list of dicts with the keys ``createtime``, ``creator_qq``,
        ``creator_name``, ``creator_isVip``, ``dissid``, ``dissname`` and
        ``listennum``. Control characters are removed from
        ``creator_name`` and ``dissname``.
    """
    url = "https://c.y.qq.com/splcloud/fcgi-bin/fcg_get_diss_by_tag.fcg"
    # NOTE(review): the cookie below is a hard-coded account session; it
    # should probably come from configuration rather than source code.
    header = {
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "cookie":
        "RK=7dNm4/X + Yj; tvfe_boss_uuid=bf00ee54e9081ab4; pgv_pvi=8772238336; pac_uid=1_857193777; pgv_pvid=6457341280; o_cookie=80; ptcz=c761e59c8c8d6bd5198866d02a5cb7313af1af468006c455d6c2b5d26201d42e; pgv_si=s10759168; _qpsvr_localtk=0.08285763449905015; ptisp=ctc; luin=o0857193777; lskey=00010000228dd1371b945c68ecfd3b71d3071425024a7a8a2a23e3ffcb5b9904c9f7088d2ea8c01539ffed92; pt2gguin=o0857193777; uin=o0857193777; skey=@Kydi7w0EI; p_uin=o0857193777; p_skey=HjsE9sEjznJfXk*9KFEeW4VZr6i3*tlXZ2nuzEw8kCg_; pt4_token=c-p6sv3JEboA51cSQ3ABqxM8O80Jct3jYYkgy-aEQuE_; p_luin=o0857193777; p_lskey=000400008f9c296cd10c03a5173d22a184aad124d791568e90e4198beb8ad699a4d02fbfc059f71ab3d8758c; ts_last=y.qq.com/portal/playlist.html; ts_refer=ui.ptlogin2.qq.com/cgi-bin/login; ts_uid=3392060960",
        "referer": "https://y.qq.com/portal/playlist.html"
    }
    paramter = {
        "g_tk": "1089387893",
        "jsonpCallback": "getPlaylist",
        "loginUin": "0",
        "hostUin": "0",
        "format": "jsonp",
        "inCharset": "utf8",
        "outCharset": "utf-8",
        "notice": "0",
        "platform": "yqq",
        "needNewCode": "0",
        "categoryId": "10000000",
        "sortId": "5",
        "sin": sin,  # start node
        "ein": ein  # end node, used for paging
    }
    html_text = requests_dora.try_best_2_get(url=url,
                                             params=paramter,
                                             headers=header).text

    # Unwrap the JSONP envelope "getPlaylist(...)". str.lstrip/rstrip take a
    # *set of characters*, not a prefix/suffix, so slice explicitly instead.
    payload = html_text.strip()
    if not payload.startswith(("{", "[")):
        start, end = payload.find("("), payload.rfind(")")
        if start != -1 and end > start:
            payload = payload[start + 1:end]
    res = json.loads(payload)["data"]["list"]

    # Matches control characters to strip from names; compiled once rather
    # than on every loop iteration.
    illegal_characters_re = re.compile(
        r"[\000-\010]|[\013-\014]|[\016-\037]")

    album_list = []
    for t_item in res:
        creator = t_item["creator"]
        album_list.append({
            "createtime": t_item["createtime"],
            "creator_qq": creator["qq"],
            "creator_name": illegal_characters_re.sub(r"", creator["name"]),
            "creator_isVip": creator["isVip"],
            # playlist id, used later to look up the playlist's songs
            "dissid": t_item["dissid"],
            # playlist name
            "dissname": illegal_characters_re.sub(r"", t_item["dissname"]),
            # play count
            "listennum": t_item["listennum"],
        })
    return album_list
Example #7
0
def get_proxies():
    """Request a proxy address from the proxy API and build a proxies dict.

    :return: ``{"http": "http://<ip:port>", "https": "https://<ip:port>"}``
        built from the API response.
    :raises Exception: when the response does not start with an
        ``ip:port`` pair (logged as an expired proxy).
    """
    res = requests_dora.try_best_2_get("http://api.ip.data5u.com/dynamic/get.html?order=53b3de376027aa3f699dc335d2bc0674&sep=3")
    proxy = res.text.strip()

    # Raw string: "\d" and "\." are invalid escapes in a normal literal.
    # re.match only anchors the start, so trailing text is still accepted.
    if not re.match(r"\d+\.\d+\.\d+\.\d+:\d+", proxy):
        logging.warning("the proxy expired...")
        raise Exception(
            "proxy API did not return an ip:port pair: {!r}".format(proxy))

    proxies = {
        "http": "http://{}".format(proxy),
        "https": "https://{}".format(proxy),
    }
    # Pause for a second before handing the proxy back; presumably to
    # throttle calls to the proxy API - TODO confirm.
    time.sleep(1)
    logging.warning("current proxy: %s", proxy)
    return proxies
Example #8
0
def get_detail(song):
    """Fetch subtitle, title, publication time and URL for one song.

    :param song: dict providing ``songid`` and ``songmid``.
    :return: dict with ``subtitle``, ``title``, ``time_public`` and ``url``
        taken from the first entry of the response's ``data`` list;
        ``url`` is ``""`` when the response has no URL for this song id.
        An empty dict is returned when ``data`` is empty.
    """
    songid = song["songid"]
    songmid = song["songmid"]

    url = "https://c.y.qq.com/v8/fcg-bin/fcg_play_single_song.fcg"
    header = {
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        # NOTE(review): the referer is built from songid, not songmid -
        # confirm this is intended.
        "referer": "https://y.qq.com/n/yqq/song/{}.html".format(songid)
    }
    paramters = {
        "songmid": songmid,
        "tpl": "yqq_song_detail",
        "format": "jsonp",
        "callback": "getOneSongInfoCallback",
        "g_tk": "1134533366",
        "jsonpCallback": "getOneSongInfoCallback",
        "loginUin": "0",
        "hostUin": "0",
        "inCharset": "utf8",
        "outCharset": "utf-8",
        "notice": 0,
        "platform": "yqq",
        "needNewCode": 0
    }
    html_text = requests_dora.try_best_2_get(url=url,
                                             params=paramters,
                                             headers=header,
                                             verify=True).text

    # Unwrap the JSONP envelope "callback(...)". str.lstrip/rstrip take a
    # *set of characters*, not a prefix/suffix, so slice explicitly instead.
    payload = html_text.strip()
    if not payload.startswith(("{", "[")):
        start, end = payload.find("("), payload.rfind(")")
        if start != -1 and end > start:
            payload = payload[start + 1:end]

    # Parse once and reuse for both the "data" and the "url" lookups.
    response = json.loads(payload)
    detail_list = response["data"]

    result = {}
    if len(detail_list) > 0:
        detail = detail_list[0]
        result["subtitle"] = detail["subtitle"]
        result["title"] = detail["title"]
        result["time_public"] = detail["time_public"]
        # Best effort: a missing or malformed "url" mapping yields "".
        url_map = response.get("url")
        if isinstance(url_map, dict):
            result["url"] = url_map.get(str(songid), "")
        else:
            result["url"] = ""
    return result
Example #9
0
def get_lyric(song):
    """Fetch one song's lyric and split it into contributors and lines.

    :param song: dict providing ``songid`` (sent as ``musicid``) and
        ``songmid`` (used in the referer header).
    :return: ``(contributors, lyric_lines)``. ``contributors`` maps the text
        before a colon to the text after it, for every line that contains a
        colon. ``lyric_lines`` is a list of ``{"time": seconds, "line":
        text}`` dicts for the remaining non-empty lines, with the first one
        discarded. Returns ``({}, [])`` when the response has no ``lyric``
        field or the lyric contains the "instrumental, no lyrics"
        placeholder.
    """
    songid = song["songid"]
    songmid = song["songmid"]
    url = "https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric.fcg"
    header = {
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "referer": "https://y.qq.com/n/yqq/song/{}.html".format(songmid)
    }
    paramters = {
        "nobase64": 1,
        "musicid": songid,
        "callback": "jsonp1",
        "g_tk": "1134533366",
        "jsonpCallback": "jsonp1",
        "loginUin": "0",
        "hostUin": "0",
        "format": "jsonp",
        "inCharset": "utf8",
        "outCharset": "utf-8",
        "notice": "0",
        "platform": "yqq",
        "needNewCode": "0"
    }
    html_text = requests_dora.try_best_2_get(url=url,
                                             params=paramters,
                                             headers=header).text

    # Unwrap the JSONP envelope "jsonp1(...)". str.lstrip/rstrip take a set
    # of characters rather than a prefix/suffix, so slice explicitly.
    payload = html_text.strip()
    if not payload.startswith(("{", "[")):
        start, end = payload.find("("), payload.rfind(")")
        if start != -1 and end > start:
            payload = payload[start + 1:end]
    res = json.loads(payload)

    if "lyric" not in res:
        return {}, []

    lyric = res["lyric"]
    if "此歌曲为没有填词的纯音乐,请您欣赏" in lyric:
        return {}, []
    # decode: HTML entities are unescaped twice, then percent-escapes.
    lyric = html.unescape(lyric)
    lyric = html.unescape(lyric)
    lyric = parse.unquote(lyric)

    # The "." between the digit groups is unescaped, so separators other
    # than "." also match; those are handled by the ValueError branch below.
    timed_line_re = re.compile(r"\[(\d+):(\d+.\d+)\](.+)")
    contributor_re = re.compile("(.*?)[::](.*)")

    lyric_lines = []
    contributors_dict = {}
    for match in timed_line_re.finditer(lyric):
        # Named "minutes" so the builtin min() is not shadowed.
        minutes = float(match.group(1))
        try:
            seconds = float(match.group(2))
        except ValueError:
            seconds = 0

        line = match.group(3).strip()
        if line == "":
            continue
        se_contributors = contributor_re.search(line)
        if se_contributors:
            # "role: name" style line - record it instead of as a lyric.
            role = se_contributors.group(1).strip()
            contributors_dict[role] = se_contributors.group(2).strip()
            continue
        lyric_lines.append({
            "time": minutes * 60 + seconds,
            "line": line,
        })

    # The first kept line is dropped; presumably it is a title/header line
    # rather than a sung lyric - TODO confirm.
    return contributors_dict, lyric_lines[1:]
Example #10
0
def get_singer_list_page(area_id, page_ind):
    """Fetch one page of the QQ Music singer list for a region.

    :param area_id: integer region id. Known values: -100 all,
        200 mainland China, 2 Hong Kong/Taiwan, 5 Europe/America, 4 Japan,
        3 Korea, 6 other.
    :param page_ind: page number sent as ``cur_page``; the ``sin`` offset
        is computed as ``(page_ind - 1) * 80``, so pages are 1-based with
        80 entries each.
    :return: ``(singerlist, singer_num_total)`` taken from the
        ``singerlist`` and ``total`` fields of the response.
    :raises ValueError: (``json.JSONDecodeError``) when the response body
        is not the expected JSON/JSONP document.
    """
    url = "https://u.y.qq.com/cgi-bin/musicu.fcg"
    # NOTE(review): the cookie below is a hard-coded account session; it
    # should probably come from configuration rather than source code.
    header = {
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "cookie":
        "RK=7dNm4/X + Yj; tvfe_boss_uuid=bf00ee54e9081ab4; pgv_pvi=8772238336; pac_uid=1_857193777; pgv_pvid=6457341280; o_cookie=857193777; ptcz=c761e59c8c8d6bd5198866d02a5cb7313af1af468006c455d6c2b5d26201d42e; pgv_si=s10759168; _qpsvr_localtk=0.08285763449905015; ptisp=ctc; luin=o0857193777; lskey=00010000228dd1371b945c68ecfd3b71d3071425024a7a8a2a23e3ffcb5b9904c9f7088d2ea8c01539ffed92; pt2gguin=o0857193777; uin=o0857193777; skey=@Kydi7w0EI; p_uin=o0857193777; p_skey=HjsE9sEjznJfXk*9KFEeW4VZr6i3*tlXZ2nuzEw8kCg_; pt4_token=c-p6sv3JEboA51cSQ3ABqxM8O80Jct3jYYkgy-aEQuE_; p_luin=o0857193777; p_lskey=000400008f9c296cd10c03a5173d22a184aad124d791568e90e4198beb8ad699a4d02fbfc059f71ab3d8758c; ts_last=y.qq.com/portal/playlist.html; ts_refer=ui.ptlogin2.qq.com/cgi-bin/login; ts_uid=3392060960",
        "referer": "https://y.qq.com/portal/singer_list.html"
    }
    paramter = {
        "g_tk": "5381",
        "callback": "getUCGI9688380858412697",
        "jsonpCallback": "getUCGI9688380858412697",
        "loginUin": "0",
        "hostUin": "0",
        "format": "jsonp",
        "inCharset": "utf8",
        "outCharset": "utf-8",
        "notice": "0",
        "platform": "yqq",
        "needNewCode": "0",
        "data":
        '{"comm":{"ct":24,"cv":10000},"singerList":{"module":"Music.SingerListServer","method":"get_singer_list","param":{"area":%d,"sex":-100,"genre":-100,"index":-100,"sin":%d,"cur_page":%d}}}'
        % (area_id, (page_ind - 1) * 80, page_ind)
    }

    html_text = requests_dora.try_best_2_get(url=url,
                                             params=paramter,
                                             headers=header).text

    # Unwrap the JSONP envelope "callback(...)". If no envelope is found the
    # text is parsed as-is, so a bad response fails with a JSON decode error
    # instead of an opaque TypeError from json.loads(None).
    json_str = html_text.strip()
    if not json_str.startswith(("{", "[")):
        start, end = json_str.find("("), json_str.rfind(")")
        if start != -1 and end > start:
            json_str = json_str[start + 1:end]

    data = json.loads(json_str)["singerList"]["data"]
    singerlist = data["singerlist"]
    singer_num_total = data["total"]
    return singerlist, singer_num_total