Example 1
def loc_proxy_ipv4(proxies):
    test_res = requests_dora.try_best_2_get("http://lumtest.com/myip.json", max_times=2, proxies=proxies, timeout=10)
    if test_res is not None and test_res.status_code == 200:
        return test_res.text
    else:
        logging.warning("failed to connect to geolocation url ...")
        return None
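For reference, a minimal usage sketch; the proxies argument is a requests-style mapping, and the address below is a placeholder:

proxies = {"http": "http://1.2.3.4:8080", "https": "http://1.2.3.4:8080"}
info = loc_proxy_ipv4(proxies)  # JSON string like '{"ip": ..., "country": ...}', or None on failure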
Example 2
def get_singer_list_page(area_id, page_ind):
    '''
    Fetch one page of QQ Music's singer list.
    :param area_id: one of {'全部' (all): -100, '内地' (mainland): 200, '港台' (HK/Taiwan): 2,
                            '欧美' (Western): 5, '日本' (Japan): 4, '韩国' (Korea): 3, '其他' (other): 6}
    :param page_ind: 1-based page index; each page holds 80 singers
    :return: (singer list, total number of singers, area id mapping or None)
    '''
    url = "https://u.y.qq.com/cgi-bin/musicu.fcg"
    header = {
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "cookie":
        "RK=7dNm4/X + Yj; tvfe_boss_uuid=bf00ee54e9081ab4; pgv_pvi=8772238336; pac_uid=1_857193777; pgv_pvid=6457341280; o_cookie=857193777; ptcz=c761e59c8c8d6bd5198866d02a5cb7313af1af468006c455d6c2b5d26201d42e; pgv_si=s10759168; _qpsvr_localtk=0.08285763449905015; ptisp=ctc; luin=o0857193777; lskey=00010000228dd1371b945c68ecfd3b71d3071425024a7a8a2a23e3ffcb5b9904c9f7088d2ea8c01539ffed92; pt2gguin=o0857193777; uin=o0857193777; skey=@Kydi7w0EI; p_uin=o0857193777; p_skey=HjsE9sEjznJfXk*9KFEeW4VZr6i3*tlXZ2nuzEw8kCg_; pt4_token=c-p6sv3JEboA51cSQ3ABqxM8O80Jct3jYYkgy-aEQuE_; p_luin=o0857193777; p_lskey=000400008f9c296cd10c03a5173d22a184aad124d791568e90e4198beb8ad699a4d02fbfc059f71ab3d8758c; ts_last=y.qq.com/portal/playlist.html; ts_refer=ui.ptlogin2.qq.com/cgi-bin/login; ts_uid=3392060960",
        "referer": "https://y.qq.com/portal/singer_list.html"
    }
    params = {
        "g_tk": "5381",
        "callback": "getUCGI9688380858412697",
        "jsonpCallback": "getUCGI9688380858412697",
        "loginUin": "0",
        "hostUin": "0",
        "format": "jsonp",
        "inCharset": "utf8",
        "outCharset": "utf-8",
        "notice": "0",
        "platform": "yqq",
        "needNewCode": "0",
        "data": '{"comm":{"ct":24,"cv":10000},"singerList":{"module":"Music.SingerListServer","method":"get_singer_list","param":{"area":%d,"sex":-100,"genre":-100,"index":-100,"sin":%d,"cur_page":%d}}}'
        % (area_id, (page_ind - 1) * 80, page_ind)
    }

    html_text = requests_dora.try_best_2_get(url=url,
                                             params=params,
                                             headers=header).text
    se = re.search(r"getUCGI9688380858412697\((.*)\)", html_text)
    if se is None:
        raise ValueError("unexpected response: JSONP wrapper not found")
    json_str = se.group(1)

    data = json.loads(json_str)["singerList"]["data"]
    singerlist = data["singerlist"]
    try:
        area_ids = data["tags"]["area"]
    except KeyError:
        area_ids = None
    singer_num_total = data["total"]
    return singerlist, singer_num_total, area_ids
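A hypothetical driver that pages through all singers of an area; each page holds 80 entries, matching the sin offset computed in the data parameter above:

def get_all_singers(area_id=-100):
    singers, page_ind = [], 1
    page, total, _ = get_singer_list_page(area_id, page_ind)
    singers.extend(page)
    while len(singers) < total:
        page_ind += 1
        page, _, _ = get_singer_list_page(area_id, page_ind)
        if not page:
            break
        singers.extend(page)
    return singers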
Example 3
def get_TKK(get_proxies_fun=None):
    '''
    Fetch the TKK value used to compute the token when building a translation request.
    :return: TKK string
    '''
    url = "https://translate.google.cn"
    logging.warning("start getting tkk...")

    res = requests_dora.try_best_2_get(url, get_proxies_fun=get_proxies_fun)

    se = re.search(r"tkk:'(.*?)'", res.text)
    if se is None:
        raise ValueError("TKK not found in the page source")
    return se.group(1)
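Typical use; the returned TKK is a dotted pair of numbers (e.g. '426151.3141811846') that a separate token routine, not shown in this listing, consumes:

tkk = get_TKK()  # direct connection; pass get_proxies_fun to route through a proxy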
Example 4
def get_data5u_proxies(api=None):
    url = "http://api.ip.data5u.com/dynamic/get.html?order=53b3de376027aa3f699dc335d2bc0674&sep=3"
    if api is not None:
        url = api

    res = requests_dora.try_best_2_get(url)
    proxy = res.text.strip()

    if not re.match(r"\d+\.\d+\.\d+\.\d+:\d+", proxy):
        logging.warning("the proxy expired...")
        raise Exception("proxy API did not return a valid host:port pair")

    return get_proxies(proxy)
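The get_proxies helper is not part of this listing; a plausible sketch, assuming it wraps a 'host:port' string into a requests-style mapping:

def get_proxies(proxy):
    # assumption: a plain HTTP proxy serving both schemes
    return {"http": "http://" + proxy, "https": "http://" + proxy}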
Example 5
def loc_proxy(proxies):
    res = requests_dora.try_best_2_get("https://proxy6.net/en/myip", max_times=2, proxies=proxies, timeout=10)
    if res is None:
        logging.warning("failed to connect to the geolocation page ...")
        return None
    soup = BeautifulSoup(res.text, "lxml")
    # ip = soup.select_one("div.block-head > h1").get_text()
    loc_div = soup.select_one("div.myip-row > div")
    line_list = loc_div.select("dl")
    loc_info = {}
    for line in line_list:
        key = line.select_one("dt").get_text().strip().lower()
        val = line.select_one("dd").get_text().strip()
        loc_info[key] = val

    return loc_info
Example 6
def get_song_list(dissid):
    url = "https://c.y.qq.com/qzone/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg"
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "cookie": "RK=7dNm4/X + Yj; tvfe_boss_uuid=bf00ee54e9081ab4; pgv_pvi=8772238336; pac_uid=1_857193777; pgv_pvid=6457341280; o_cookie=857193777; ptcz=c761e59c8c8d6bd5198866d02a5cb7313af1af468006c455d6c2b5d26201d42e; pgv_si=s10759168; _qpsvr_localtk=0.08285763449905015; ptisp=ctc; luin=o0857193777; lskey=00010000228dd1371b945c68ecfd3b71d3071425024a7a8a2a23e3ffcb5b9904c9f7088d2ea8c01539ffed92; pt2gguin=o0857193777; uin=o0857193777; skey=@Kydi7w0EI; p_uin=o0857193777; p_skey=HjsE9sEjznJfXk*9KFEeW4VZr6i3*tlXZ2nuzEw8kCg_; pt4_token=c-p6sv3JEboA51cSQ3ABqxM8O80Jct3jYYkgy-aEQuE_; p_luin=o0857193777; p_lskey=000400008f9c296cd10c03a5173d22a184aad124d791568e90e4198beb8ad699a4d02fbfc059f71ab3d8758c; ts_last=y.qq.com/portal/playlist.html; ts_refer=ui.ptlogin2.qq.com/cgi-bin/login; ts_uid=3392060960",
        "referer": "https://y.qq.com/n/yqq/playlist/{}.html".format(dissid)
    }
    params = {
        "type": "1",
        "json": "1",
        "utf8": "1",
        "onlysong": "0",
        "disstid": dissid,
        "format": "jsonp",
        "g_tk": "1089387893",
        "jsonpCallback": "playlistinfoCallback",
        "loginUin": "857193777",
        "hostUin": "0",
        "inCharset": "utf8",
        "outCharset": "utf-8",
        "notice": 0,
        "platform": "yqq",
        "needNewCode": 0
    }
    html_text = requests_dora.try_best_2_get(url=url, params=params, headers=header).text
    # str.lstrip()/str.rstrip() strip character *sets*, not prefixes, so slice the
    # JSONP wrapper off explicitly
    cdlist = json.loads(html_text.strip()[len("playlistinfoCallback("):-1])["cdlist"]
    if not cdlist:
        return []
    cdlist = cdlist[0]
    song_list = []

    tags = ", ".join([i["name"] for i in cdlist["tags"]])
    for item in cdlist["songlist"]:
        song = {}
        # if "size128" in item:
        #     song["size128"] = item["size128"]
        if "songmid" in item:
            song["songmid"] = item["songmid"]
        else:
            continue

        if "songid" in item:
            song["songid"] = item["songid"]
        else:
            continue

        song["albumname"] = item["albumname"]
        song["songname"] = item["songname"]
        song["singer"] = ", ".join([i["name"] for i in item["singer"]])
        song["tags"] = tags
        song_list.append(song)
    return song_list
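Example call; the dissid is a placeholder playlist id, in practice taken from get_album_list further down:

songs = get_song_list("1234567890")  # hypothetical dissid
for s in songs[:3]:
    print(s["songname"], "-", s["singer"])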
Example 7
def crawl_song_list_page(singer_mid, begin):
    url = "https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg"
    header = {
        "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "cookie":
        "RK=7dNm4/X + Yj; tvfe_boss_uuid=bf00ee54e9081ab4; pgv_pvi=8772238336; pac_uid=1_857193777; pgv_pvid=6457341280; o_cookie=857193777; ptcz=c761e59c8c8d6bd5198866d02a5cb7313af1af468006c455d6c2b5d26201d42e; pgv_si=s10759168; _qpsvr_localtk=0.08285763449905015; ptisp=ctc; luin=o0857193777; lskey=00010000228dd1371b945c68ecfd3b71d3071425024a7a8a2a23e3ffcb5b9904c9f7088d2ea8c01539ffed92; pt2gguin=o0857193777; uin=o0857193777; skey=@Kydi7w0EI; p_uin=o0857193777; p_skey=HjsE9sEjznJfXk*9KFEeW4VZr6i3*tlXZ2nuzEw8kCg_; pt4_token=c-p6sv3JEboA51cSQ3ABqxM8O80Jct3jYYkgy-aEQuE_; p_luin=o0857193777; p_lskey=000400008f9c296cd10c03a5173d22a184aad124d791568e90e4198beb8ad699a4d02fbfc059f71ab3d8758c; ts_last=y.qq.com/portal/playlist.html; ts_refer=ui.ptlogin2.qq.com/cgi-bin/login; ts_uid=3392060960",
        "referer": "https://y.qq.com/n/yqq/singer/{}.html".format(singer_mid)
    }

    params = {
        "g_tk": "5381",
        "jsonpCallback": "MusicJsonCallbacksinger_track",
        "loginUin": "0",
        "hostUin": "0",
        "format": "jsonp",
        "inCharset": "utf8",
        "outCharset": "utf-8",
        "notice": "0",
        "platform": "yqq",
        "needNewCode": "0",
        "singermid": singer_mid,
        "order": "listen",
        "begin": begin,
        "num": "30",
        "songstatus": "1",
    }

    html_text = requests_dora.try_best_2_get(url=url,
                                             params=params,
                                             headers=header).text
    # slice off the JSONP wrapper (lstrip/rstrip operate on character sets)
    json_str = html_text.strip()[len("MusicJsonCallbacksinger_track("):-1]
    data = json.loads(json_str)["data"]
    song_list = data["list"]
    total_num = data["total"]
    song_list_new = []
    for song in song_list:
        song = song["musicData"]
        song_new = {}
        song_new["albumid"] = song["albumid"]
        song_new["albummid"] = song["albummid"]
        song_new["albumname"] = song["albumname"]
        song_new["songid"] = song["songid"]
        song_new["songmid"] = song["songmid"]
        song_new["songname"] = song["songname"]
        song_list_new.append(song_new)
    return song_list_new, int(total_num)
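A hypothetical driver that walks a singer's full catalogue 30 songs at a time, matching the num parameter above:

def crawl_all_song_pages(singer_mid):
    songs, begin = [], 0
    page, total = crawl_song_list_page(singer_mid, begin)
    songs.extend(page)
    while len(songs) < total:
        begin += 30
        page, _ = crawl_song_list_page(singer_mid, begin)
        if not page:
            break
        songs.extend(page)
    return songs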
Example 8
def google_search(queryStr, get_proxies_fun=None, page=1):
    # the query must be URL-encoded; assumes 'from urllib import parse' as used in get_lyric
    q = parse.quote_plus(queryStr)
    url = 'https://www.google.com/search?biw=1920&safe=active&hl=en&q=%s&oq=%s&start=%d' % (
        q, q, (page - 1) * 10)

    response = requests_dora.try_best_2_get(
        url,
        headers=requests_dora.get_default_headers(),
        invoked_by="google_search",
        get_proxies_fun=get_proxies_fun,
        timeout=60)
    status = response.status_code
    if status == 200:
        html = response.text
    else:
        logging.warning("status: {}, try again....".format(status))
        random.seed(time.time())
        time.sleep(3 + 5 * random.random())
        return google_search(queryStr, get_proxies_fun=get_proxies_fun, page=page)

    return html
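Usage sketch; the proxy factory is optional, e.g. the data5u helper from Example 4, assuming requests_dora calls it with no arguments:

html = google_search("some query", page=1)                              # direct connection
html = google_search("some query", get_proxies_fun=get_data5u_proxies)  # via the proxy pool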
Example 9
def get_album_list(sin, ein):
    url = "https://c.y.qq.com/splcloud/fcgi-bin/fcg_get_diss_by_tag.fcg"
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "cookie": "RK=7dNm4/X + Yj; tvfe_boss_uuid=bf00ee54e9081ab4; pgv_pvi=8772238336; pac_uid=1_857193777; pgv_pvid=6457341280; o_cookie=80; ptcz=c761e59c8c8d6bd5198866d02a5cb7313af1af468006c455d6c2b5d26201d42e; pgv_si=s10759168; _qpsvr_localtk=0.08285763449905015; ptisp=ctc; luin=o0857193777; lskey=00010000228dd1371b945c68ecfd3b71d3071425024a7a8a2a23e3ffcb5b9904c9f7088d2ea8c01539ffed92; pt2gguin=o0857193777; uin=o0857193777; skey=@Kydi7w0EI; p_uin=o0857193777; p_skey=HjsE9sEjznJfXk*9KFEeW4VZr6i3*tlXZ2nuzEw8kCg_; pt4_token=c-p6sv3JEboA51cSQ3ABqxM8O80Jct3jYYkgy-aEQuE_; p_luin=o0857193777; p_lskey=000400008f9c296cd10c03a5173d22a184aad124d791568e90e4198beb8ad699a4d02fbfc059f71ab3d8758c; ts_last=y.qq.com/portal/playlist.html; ts_refer=ui.ptlogin2.qq.com/cgi-bin/login; ts_uid=3392060960",
        "referer": "https://y.qq.com/portal/playlist.html"
    }
    params = {
        "g_tk": "1089387893",
        "jsonpCallback": "getPlaylist",
        "loginUin": "0",
        "hostUin": "0",
        "format": "jsonp",
        "inCharset": "utf8",
        "outCharset": "utf-8",
        "notice": "0",
        "platform": "yqq",
        "needNewCode": "0",
        "categoryId": "10000000",
        "sortId": "5",
        "sin": sin, # 开始结点
        "ein": ein # 结束结点,用于翻页
    }
    html_text = requests_dora.try_best_2_get(url=url, params=params, headers=header).text
    # slice off the JSONP wrapper (lstrip/rstrip operate on character sets)
    res = json.loads(html_text.strip()[len("getPlaylist("):-1])["data"]["list"]
    album_list = []

    ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]")  # strips illegal control characters
    for t_item in res:
        album = {}
        album["createtime"] = t_item["createtime"]
        album["creator_qq"] = t_item["creator"]["qq"]
        album["creator_name"] = t_item["creator"]["name"]
        album["creator_name"] = ILLEGAL_CHARACTERS_RE.sub(r"",  album["creator_name"])
        album["creator_isVip"] = t_item["creator"]["isVip"]
        album["dissid"] = t_item["dissid"] #提取歌单id,用于后续提取歌曲id
        album["dissname"] = t_item["dissname"] #歌单名称
        album["dissname"] = ILLEGAL_CHARACTERS_RE.sub(r"",  album["dissname"])
        album["listennum"] = t_item["listennum"] #播放量
        album_list.append(album)
    return album_list
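sin and ein select a window of playlists, so paging just slides both indices; a hypothetical loop fetching the first 90 playlists in batches of 30:

album_list = []
for start in range(0, 90, 30):
    album_list.extend(get_album_list(start, start + 29))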
Example 10
def get_detail(song):
    songid = song["songid"]
    songmid = song["songmid"]

    url = "https://c.y.qq.com/v8/fcg-bin/fcg_play_single_song.fcg"
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "referer": "https://y.qq.com/n/yqq/song/{}.html".format(songid)
    }
    params = {
        "songmid": songmid,
        "tpl": "yqq_song_detail",
        "format": "jsonp",
        "callback": "getOneSongInfoCallback",
        "g_tk": "1134533366",
        "jsonpCallback": "getOneSongInfoCallback",
        "loginUin": "0",
        "hostUin": "0",
        "inCharset": "utf8",
        "outCharset": "utf-8",
        "notice": 0,
        "platform": "yqq",
        "needNewCode": 0
    }
    html_text = requests_dora.try_best_2_get(url=url, params=params, headers=header, verify=True).text
    # parse the JSONP payload once and reuse it below
    res = json.loads(html_text.strip()[len("getOneSongInfoCallback("):-1])
    detail = res["data"]
    song = {}
    if len(detail) > 0:
        detail = detail[0]
        song["subtitle"] = detail["subtitle"]
        song["title"] = detail["title"]
        song["time_public"] = detail["time_public"]
        try:
            song["url"] = res["url"][str(songid)]
        except (KeyError, TypeError):
            song["url"] = ""
    return song
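Typical use, enriching a song dict produced by get_song_list or crawl_song_list_page; the ids below are placeholders:

song = {"songid": 123456, "songmid": "000abcdef"}
detail = get_detail(song)  # {'title': ..., 'subtitle': ..., 'time_public': ..., 'url': ...}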
Example 11
def get_lyric(song):
    songid = song["songid"]
    songmid = song["songmid"]
    url = "https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric.fcg"
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "referer": "https://y.qq.com/n/yqq/song/{}.html".format(songmid)
    }
    params = {
        "nobase64": 1,
        "musicid": songid,
        "callback": "jsonp1",
        "g_tk": "1134533366",
        "jsonpCallback": "jsonp1", 
        "loginUin": "0",
        "hostUin": "0",
        "format": "jsonp",
        "inCharset": "utf8",
        "outCharset": "utf-8",
        "notice": "0",
        "platform": "yqq",
        "needNewCode": "0"
    }
    html_text = requests_dora.try_best_2_get(url=url, params=params, headers=header).text
    res = json.loads(html_text.strip()[len("jsonp1("):-1])
    # if "lyric" in res:
    #     lyric = res["lyric"]
    #     # decode
    #     lyric = html.unescape(lyric)
    #     lyric = html.unescape(lyric)
    #     lyric = parse.unquote(lyric)
    #
    #     it = re.finditer(r"\[(.*?)\](.+)", lyric)
    #     lyric_lines = []
    #     for match in it:
    #         time_pop_up = match.group(1)
    #         time_split = time_pop_up.split(".")
    #         ms = float("0.{}".format(time_split[1]))
    #         sec = time.strptime(time_split[0], "%M:%S").tm_sec
    #         line = match.group(2)
    #         line = line.strip()
    #         if re.search("[::]", line) or line == "" or line == "此歌曲为没有填词的纯音乐,请您欣赏":
    #             continue
    #         lyric_lines.append({
    #             "time": sec + ms,
    #             "line": line,
    #         })
    #
    #     return lyric_lines[1:]
    # else:
    #     return []
    if "lyric" in res:
        lyric = res["lyric"]
        if "此歌曲为没有填词的纯音乐,请您欣赏" in lyric:
            return {}, []
        # decode
        lyric = html.unescape(lyric)
        lyric = html.unescape(lyric)
        lyric = parse.unquote(lyric)

        it = re.finditer(r"\[(\d+):(\d+.\d+)\](.+)", lyric)
        lyric_lines = []
        contributors_dict = {}
        for match in it:
            minute = float(match.group(1))  # 'minute' avoids shadowing the builtin min()
            try:
                sec = float(match.group(2))
            except ValueError:
                sec = 0

            line = match.group(3)
            line = line.strip()
            if line == "":
                continue
            se_contributors = re.search("(.*?)[::](.*)", line)
            if se_contributors:
                contributors_dict[se_contributors.group(1).strip()] = se_contributors.group(2).strip()
                continue
            lyric_lines.append({
                "time": min * 60 + sec,
                "line": line,
            })

        return contributors_dict, lyric_lines[1:]
    else:
        return {}, []
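Usage sketch; the first return value maps contributor roles (composer, lyricist, ...) to names, the second holds the timed lines (ids below are placeholders):

contributors, lines = get_lyric({"songid": 123456, "songmid": "000abcdef"})
for entry in lines[:5]:
    print("%.2f  %s" % (entry["time"], entry["line"]))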
Example 12
def get_entity(query_str, get_proxies_fun, wait=1.5):

    rel_org_name_set = set()
    logging.warning("start crawling {}...".format(query_str))

    text = google_search(query_str, get_proxies_fun)
    random.seed(time.time())
    time.sleep(wait * random.random())

    soup = BeautifulSoup(text, "lxml")

    # is there an entity in google KG?
    div_kg_header = soup.select_one("div.kp-header")

    if div_kg_header is None:  # no knowledge-graph panel on the right, drop the query
        logging.warning("no entity returned for this query")
        return None

    enti_name = div_kg_header.select_one("div[role=heading] span")
    enti_name = enti_name.text if enti_name is not None else None
    if enti_name is None or "..." in enti_name:
        se = re.search(
            '\["t-dhmk9MkDbvI",.*\[\["data",null,null,null,null,\[null,"\[\\\\"(.*)\\\\",',
            text)
        if se is not None:
            enti_name = se.group(1)
        else:
            logging.warning(
                "sth went wrong when extracting the name of the entity")
            return None

    # identify the type
    span_list = div_kg_header.select("span")
    enti_type = span_list[-1].text if len(span_list) > 1 else "unknown"

    # description from wikipedia
    des = soup.find("h3", text="Description")
    des_info = ""
    if des is not None:
        des_span = des.parent.select_one("span")
        des_info = des_span.text if des_span is not None else ""

    # extract attributes
    attr_tags = soup.select("div.Z1hOCe")
    attr_dict = {}
    for attr in attr_tags:
        attr_str = attr.get_text()
        se = re.search("(.*?)[::](.*)", attr_str)
        if se is None:
            continue
        key_attr = se.group(1)
        val_attr = se.group(2)
        attr_dict[key_attr] = val_attr

    # relevant org name on current page
    a_reltype_list = soup.select("div.MRfBrb > a")
    for a in a_reltype_list:
        rel_org_name_set.add(a["title"].strip())

    # collect next urls, e.g. "more x+"
    div_list = soup.select("div.yp1CPe")
    next_urls = []  # 'next' would shadow the builtin
    host = "https://www.google.com"
    for div in div_list:
        a_list = div.select("a.EbH0bb")
        for a in a_list:
            if "http" not in a["href"]:
                next_urls.append("%s%s" % (host, a["href"]))

    # crawl parent org
    a_parent_org = soup.find("a", text="Parent organization")
    if a_parent_org is not None:
        parent_str = a_parent_org.parent.parent.text.strip()
        parent_org = parent_str.split(":")[1]
        rel_org_name_set.add(parent_org.strip())

    # crawl subsidiaries
    a_subsidiaries = soup.find("a", text="Subsidiaries")
    if a_subsidiaries is not None:
        href = a_subsidiaries["href"]
        if "http" not in href:
            subsidiaries_str = a_subsidiaries.parent.parent.text.strip()
            subs = subsidiaries_str.split(":")[1].split(",")
            for sub in subs:
                sub = sub.strip()
                if sub == "MORE":
                    continue
                rel_org_name_set.add(sub)
            next_urls.append("%s%s" % (host, href))

    # crawl the urls collected in next_urls
    for url in tqdm(next_urls, desc="crawling relevant org names..."):
        res = requests_dora.try_best_2_get(
            url,
            invoked_by="get_org_name",
            headers=requests_dora.get_default_headers(),
            get_proxies_fun=get_proxies_fun)
        soup = BeautifulSoup(res.text, "lxml")

        # crawl items at the top
        a_list = soup.select("a.klitem")
        for a in a_list:
            rel_org_name = a["title"]
            rel_org_name_set.add(rel_org_name.strip())

        # crawl headings under the map if any
        heading_list = soup.select("div.VkpGBb")
        for heading in heading_list:
            heading_str = heading.select_one("div[role='heading']")
            rel_org_name_set.add(heading_str.get_text())

        random.seed(time.time())
        time.sleep(wait * random.random())

    rel_org_name_list = [
        org_name for org_name in rel_org_name_set if len(org_name) > 1
    ]
    return {
        "query_str": query_str,
        "name": enti_name,
        "type": enti_type,
        "des": des_info,
        "attributes": attr_dict,
        "rel_org": rel_org_name_list
    }
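A minimal invocation sketch; get_proxies_fun is expected to return a requests-style proxies mapping (here a lambda returning None, assuming requests_dora then connects directly):

entity = get_entity("DeepMind", get_proxies_fun=lambda: None)
if entity:
    print(entity["name"], entity["type"], len(entity["rel_org"]))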