Beispiel #1
0
def get_account_from_index():
    """Fetch the keyakizaka46.com member diary index and list its accounts.

    Returns:
        dict: maps each account id (the text between ``&ct=`` and ``">`` in
        a member entry) to the member name with all spaces removed.

    Raises:
        crawler.CrawlerException: when the request does not succeed, the
            ``<ul class="thumb">`` member list cannot be cut out of the page,
            or an entry has no account id or no member name.
    """
    index_url = "http://www.keyakizaka46.com/mob/news/diarShw.php"
    index_response = net.http_request(index_url,
                                      method="GET",
                                      fields={"cd": "member"})
    if index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(index_response.status))
    member_list_data = tool.find_sub_string(index_response.data,
                                            '<ul class="thumb">', "</ul>")
    if not member_list_data:
        raise crawler.CrawlerException("页面截取账号列表失败\n%s" % index_response.data)
    account_list = {}
    # Raw literal so that \S and \s reach the regex engine untouched instead
    # of being parsed as (invalid) string escape sequences.
    for member_info in re.findall(r"<li ([\S|\s]*?)</li>", member_list_data):
        # Account id.
        account_id = tool.find_sub_string(member_info, "&ct=", '">')
        if not account_id:
            raise crawler.CrawlerException("账号信息截取账号id失败\n%s" % member_info)
        # Member name, with surrounding whitespace and inner spaces removed.
        account_name = tool.find_sub_string(member_info, '<p class="name">',
                                            "</p>").strip().replace(" ", "")
        if not account_name:
            raise crawler.CrawlerException("账号信息截取成员名字失败\n%s" % member_info)
        account_list[account_id] = account_name
    return account_list
Beispiel #2
0
def get_game_invalid_achievements(game_id):
    """Print a notice when astats lists non-numeric achievement info for a game.

    The game info page is requested from astats.astats.nl and the cell that
    follows the "Achievements" label is inspected. Nothing is printed when
    the game is unknown, when there is no achievement cell, or when
    ``crawler.is_integer`` accepts the cell text.

    Args:
        game_id: value sent as the ``AppID`` query parameter.

    Returns:
        None. Results are reported through ``output.print_msg`` only.
    """
    game_index_url = "http://astats.astats.nl/astats/Steam_Game_Info.php"
    game_index_response = net.http_request(game_index_url,
                                           method="GET",
                                           fields={"AppID": game_id})
    if game_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        output.print_msg("游戏 %s 访问失败" % game_id)
        tool.process_exit()
    # The game id does not exist.
    if "This game cannot be found in the database." in game_index_response.data:
        return
    achievement_text = tool.find_sub_string(
        game_index_response.data,
        '<span class="GameInfoBoxRow">Achievements</span><br>', "</td>")
    # The game has no achievements.
    if not achievement_text:
        return
    achievement_text = achievement_text.strip()
    if crawler.is_integer(achievement_text):
        return
    invalid_achievement_text = tool.find_sub_string(
        achievement_text, '<font color="#FF0000">', "</font>")
    if invalid_achievement_text:
        output.print_msg("游戏 %s, 存在无效成就,%s" %
                         (game_id, invalid_achievement_text))
    else:
        # No red-highlighted part was found, so show the whole unrecognised
        # cell text (the extracted fragment is empty in this branch).
        output.print_msg("游戏 %s, 存在未知成就文字:%s" %
                         (game_id, achievement_text))
def jkt(file_handle):
    """Write the member list from jkt48.com/member/list to *file_handle*.

    The main column of the page is cut into sections, each starting at an
    ``<a name="`` anchor. Every ``profileWrap`` entry of a section produces
    one line ``name<TAB>name<TAB>team``; the image ``alt`` text is written in
    both name columns. Nothing is written when the request does not succeed.

    Args:
        file_handle: object whose ``write`` method receives each output line.
    """
    index_url = "http://www.jkt48.com/member/list"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        return
    page = tool.find_sub_string(index_response.data, '<div id="mainCol">',
                                "<!--end #mainCol-->", 1)

    # Offsets of every section anchor. The scan starts at offset 0 so that an
    # anchor sitting at the very beginning of the page is not skipped.
    anchor = '<a name="'
    section_starts = []
    position = page.find(anchor)
    while position != -1:
        section_starts.append(position)
        position = page.find(anchor, position + 1)
    # Each section ends where the next one starts; the last runs to the end.
    section_ends = section_starts[1:] + [len(page)]

    for start, end in zip(section_starts, section_ends):
        split_page = page[start:end]
        team_name = tool.find_sub_string(split_page, "<h2>", "</h2>")
        # Headings without "Team" in them are filed under the trainee team.
        if "Team" not in team_name:
            team_name = "Team kenkyusei"
        team_name = "JKT48 " + team_name
        # Raw literal keeps \s and \S as regex escapes, not string escapes.
        member_list = re.findall(
            r'<div class="profileWrap">([\s|\S]*?)</div><!--/loop-->',
            split_page)
        for member in member_list:
            for removable in ("<br>", "\n", "\r", "\t"):
                member = member.replace(removable, "")
            japanese_name = english_name = tool.find_sub_string(
                member, 'alt="', '"')
            file_handle.write(japanese_name + "\t" + english_name + "\t" +
                              team_name + "\n")
def nmb(file_handle):
    """Write the member list from nmb48.com/member/ to *file_handle*.

    The page is split into per-team boxes; the ``<a name="...">`` anchor of a
    box is looked up in a fixed anchor -> team name table. For every
    ``member-box`` entry one line is written:
    ``japanese_name<TAB>second_word first_word<TAB>team``, where the two
    words come from the title-cased romanised name. Malformed boxes or
    entries are reported through ``output.print_msg`` and skipped. Nothing is
    written when the request does not succeed.

    Args:
        file_handle: object whose ``write`` method receives each output line.
    """
    team_list = {
        "teamn": "NMB48 Team N",
        "teamm": "NMB48 Team M",
        "teamb2": "NMB48 Team BII",
        "dkenkyusei": "NMB48 Team Kenkyusei",
        "kenkyusei": "NMB48 Team Kenkyusei",
    }
    index_url = "http://www.nmb48.com/member/"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        return
    # All patterns below are raw literals so that \s / \S are handed to the
    # regex engine instead of being parsed as invalid string escapes.
    team_page_list = re.findall(
        r"<!--▼チーム別領域ボックス▼-->([\s|\S]*?)<!--▲チーム別領域ボックス▲--> ",
        index_response.data)
    for team_page in team_page_list:
        team_find = tool.find_sub_string(team_page, '<a name="', '"></a>')
        if not team_find:
            output.print_msg("error team_find")
            continue
        if team_find not in team_list:
            output.print_msg("not found %s in team_list" % team_find)
            continue
        member_list = re.findall(
            r'<li class="member-box[^"]*">([\s|\S]*?)</li>', team_page)
        for member in member_list:
            for removable in ("<br />", "\n", "\r", "\t"):
                member = member.replace(removable, "")
            member = member.replace("&nbsp;", " ")
            japanese_name_find = re.findall(
                r'<h4><a href="[^"]*">([^<]*)</a></h4>', member)
            english_name_find = re.findall(
                r"<p[\s|\S]*?>([\s|\S]*?)</[p|a]>", member)
            if len(japanese_name_find) != 1:
                output.print_msg("error japanese_name_find")
                continue
            if len(english_name_find) != 1:
                output.print_msg("error english_name_find")
                continue

            team = team_list[team_find]
            english_name = english_name_find[0]
            if "<span>" in english_name:
                # The <span> carries extra team text; unless it mentions
                # "研究生", its last "/"-separated part is appended.
                temp = english_name.split("<span>")
                english_name = temp[0]
                extra_text = temp[1].replace("</span>", "")
                if "研究生" not in extra_text:
                    team += " / " + extra_text.split("/")[-1].strip()
            # NOTE(review): as written both replace calls target the same
            # space character, so the first one is a no-op; presumably one of
            # them was meant to be a full-width space - confirm.
            japanese_name = japanese_name_find[0].replace(" ",
                                                          " ").replace(" ", "")
            # A name without a space cannot be split into two words; report
            # it like the other malformed entries instead of raising
            # ValueError and aborting the whole run.
            name_parts = english_name.strip().title().split(" ", 1)
            if len(name_parts) != 2:
                output.print_msg("error english_name split")
                continue
            first_name, last_name = name_parts

            file_handle.write(japanese_name + "\t" + last_name + " " +
                              first_name + "\t" + team + "\n")
Beispiel #5
0
def get_bbs_forum_url_list(index_url):
    """Collect the ``forum-*`` links found on a BBS index page.

    Args:
        index_url: URL of the index page. Everything up to and including its
            last "/" is prepended to each relative link.

    Returns:
        dict mapping the absolute forum URL to the link text, or None when
        the request does not succeed.
    """
    index_response = net.http_request(index_url, method="GET")
    if index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        return None
    # Raw literal so \w and \d are regex escapes rather than invalid string
    # escapes.
    # NOTE(review): the "." before the extension is unescaped and therefore
    # matches any character; left as is because escaping it would stop
    # matching links without an extension - confirm intent.
    forum_find = re.findall(
        r'<a href="(forum-\w*-\d*.\w*)"[^>]*>([\S]*)</a>',
        index_response.data)
    host = index_url[:index_url.rfind("/") + 1]
    return {host + forum_path: forum_name
            for forum_path, forum_name in forum_find}
Beispiel #6
0
def get_one_forum_page_thread_url_list(forum_url):
    """Collect the thread links listed on one page of a forum.

    Only the part of the page between ``<div id="threadlist"`` and
    ``<div id="filter_special_menu"`` is searched.

    Args:
        forum_url: URL of the forum page. Everything up to and including its
            last "/" is prepended to each relative thread link.

    Returns:
        dict mapping the absolute thread URL to the link text, or None when
        the request does not succeed.
    """
    forum_response = net.http_request(forum_url, method="GET")
    if forum_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        return None
    forum_page = tool.find_sub_string(forum_response.data,
                                      '<div id="threadlist"',
                                      '<div id="filter_special_menu"', 1)
    # Raw literal so \d, \( and \S are regex escapes rather than invalid
    # string escapes.
    thread_find = re.findall(
        r'<a href="(thread-\d*-1-1.\w*)" onclick="atarget\(this\)" class="s xst">([\S|\s]*?)</a>',
        forum_page)
    host = forum_url[:forum_url.rfind("/") + 1]
    return {host + thread_path: thread_title
            for thread_path, thread_title in thread_find}
Beispiel #7
0
def get_thread_author_post(thread_url):
    """Return the content of the first ``postmessage_`` cell of a thread page.

    Args:
        thread_url: URL of the thread page.

    Returns:
        The text between the opening tag of the first
        ``<td class="t_f" id="postmessage_...">`` cell and the last ``</td>``
        before ``<div id="comment_``. It is decoded with the charset named in
        the Content-Type header when one is present, and returned undecoded
        otherwise. None when the request does not succeed.
    """
    thread_response = net.http_request(thread_url, method="GET")
    if thread_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        return None
    post_message = tool.find_sub_string(thread_response.data,
                                        '<td class="t_f" id="postmessage_',
                                        '<div id="comment_')
    # Drop the remainder of the opening <td ...> tag and the closing </td>.
    body_start = post_message.find('">') + 2
    body_end = post_message.rfind("</td>")
    post_message = post_message[body_start:body_end]
    content_type = thread_response.getheader("Content-Type")
    if content_type is None:
        return post_message
    charset = tool.find_sub_string(content_type, "charset=")
    # A header such as "text/html" names no charset; decoding with an empty
    # codec name would raise LookupError, so return the text as it is.
    if not charset:
        return post_message
    return post_message.decode(charset)
def ske(file_handle):
    """Write the member list from ske48.co.jp/profile/list.php to *file_handle*.

    The page is cut into team sections using fixed HTML comment markers. For
    every ``<dl>`` entry one line is written:
    ``japanese_name<TAB>second_word first_word<TAB>team``, where the two
    words come from the title-cased romanised name. When the entry's
    ``textPlus`` item contains "兼任", the last "/"-separated part of that
    text is appended to the team. Malformed entries are reported through
    ``output.print_msg`` and skipped. Nothing is written when the request
    does not succeed.

    Args:
        file_handle: object whose ``write`` method receives each output line.
    """
    split_list = {
        "SKE48 Team S": ("<!-- LIST - TEAM S -->", "<!-- //LIST - TEAM S -->"),
        "SKE48 Team KII":
        ("<!-- LIST - TEAM KII -->", "<!-- //LIST - TEAM KII -->"),
        "SKE48 Team E": ("<!-- LIST - TEAM E -->", "<!-- //LIST - TEAM E -->"),
        "SKE48 Team Kenkyusei":
        ("<!-- LIST - KENKYUSEI -->", "<!-- //LIST - KENKYUSEI -->"),
    }
    index_url = "http://www.ske48.co.jp/profile/list.php"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        return
    for team_name, (start_marker, end_marker) in split_list.items():
        team_page = tool.find_sub_string(index_response.data, start_marker,
                                         end_marker)
        # Raw literals keep \s, \S and \? as regex escapes rather than
        # invalid string escapes.
        member_list = re.findall(r"<dl>([\s|\S]*?)</dl>", team_page)
        for member in member_list:
            for removable in ("<br />", "\n", "\r", "\t"):
                member = member.replace(removable, "")
            japanese_name_find = re.findall(
                r'<h3><a href="./\?id=[^"]*">([^<]*)</a></h3>', member)
            english_name = tool.find_sub_string(member, '<h3 class="en">',
                                                "</h3>")
            plus_text = tool.find_sub_string(member, '<li class="textPlus">',
                                             "</li>")
            if len(japanese_name_find) != 1:
                output.print_msg("error japanese_name_find")
                continue
            if not english_name:
                output.print_msg("error english_name")
                continue

            japanese_name = japanese_name_find[0].replace(" ", "")
            # A name without a space cannot be split into two words; report
            # it like the other malformed entries instead of raising
            # ValueError and aborting the whole run.
            name_parts = english_name.strip().title().split(" ", 1)
            if len(name_parts) != 2:
                output.print_msg("error english_name split")
                continue
            first_name, last_name = name_parts
            # Membership test rather than find() > 0, so that a marker at
            # the very start of the text is recognised as well.
            if plus_text and "兼任" in plus_text:
                concurrent_team = plus_text.split("/")[-1].strip().replace(
                    "チーム", " Team ").replace("兼任", "")
                team = team_name + " / " + concurrent_team
            else:
                team = team_name

            file_handle.write(japanese_name + "\t" + last_name + " " +
                              first_name + "\t" + team + "\n")
def get_account_from_index():
    """Fetch blog.nogizaka46.com and list the member accounts linked there.

    Returns:
        dict: maps the relative link target of each ``<div class="unit">``
        entry to the image ``alt`` text with all spaces removed.

    Raises:
        crawler.CrawlerException: when the request does not succeed or no
            member entry is found in the page.
    """
    response = net.http_request("http://blog.nogizaka46.com/", method="GET")
    # Fail fast on a bad response so the parsing below stays un-nested.
    if response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(response.status))
    entries = re.findall(
        '<div class="unit"><a href="./([^"]*)"><img src="[^>]*alt="([^"]*)" />',
        response.data)
    if not entries:
        raise crawler.CrawlerException("页面截取成员类别失败\n%s" % response.data)
    return {account_id: display_name.replace(" ", "")
            for account_id, display_name in entries}
def akb(file_handle):
    """Write the member lists from akb48.co.jp/about/members/ to *file_handle*.

    The page is requested once for each team id in ``[1, 2, 3, 4, 12]``. For
    every ``<li>`` entry of the ``memberListUl`` list one line is written:
    ``japanese_name<TAB>second_word first_word<TAB>team``, where the two
    words come from the romanised name. Malformed pages or entries are
    reported through ``output.print_msg`` and skipped; a team whose request
    does not succeed is skipped silently.

    Args:
        file_handle: object whose ``write`` method receives each output line.
    """
    member_index_url = "http://www.akb48.co.jp/about/members/"
    for team_id in [1, 2, 3, 4, 12]:
        member_index_response = net.http_request(member_index_url,
                                                 method="GET",
                                                 fields={"team_id": team_id})
        if member_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            continue
        member_list_page = tool.find_sub_string(
            member_index_response.data, '<ul class="memberListUl">', "</ul>")
        if not member_list_page:
            output.print_msg("error member_list_page")
            continue
        # Entries for team id 12 must carry two memberListTeam headings,
        # entries for every other id exactly one.
        expected_team_count = 2 if team_id == 12 else 1
        # Raw literal keeps \s and \S as regex escapes, not string escapes.
        member_list = re.findall(r"<li>([\s|\S]*?)</li>", member_list_page)
        for member in member_list:
            for removable in ("<br />", "\n", "\r", "\t"):
                member = member.replace(removable, "")
            japanese_name = tool.find_sub_string(
                member, '<h4 class="memberListNamej">', "</h4>")
            english_name = tool.find_sub_string(
                member, '<p class="memberListNamee">', "</p>")
            team_find = re.findall(
                '<h5 class="memberListTeam">([^<]*)</h5>', member)
            if not japanese_name:
                output.print_msg("error japanese_name")
                continue
            if not english_name:
                output.print_msg("error english_name")
                continue
            if len(team_find) != expected_team_count:
                output.print_msg("error team_find")
                continue

            japanese_name = japanese_name.replace(" ", "")
            # A name without a space cannot be split into two words; report
            # it like the other malformed entries instead of raising
            # ValueError and aborting the whole run.
            name_parts = english_name.split(" ", 1)
            if len(name_parts) != 2:
                output.print_msg("error english_name split")
                continue
            first_name, last_name = name_parts
            team = team_find[0].strip().replace("  /", " / ")

            file_handle.write(japanese_name + "\t" + last_name + " " +
                              first_name + "\t" + team + "\n")
def hkt(file_handle):
    """Write the member list from hkt48.jp/profile/ to *file_handle*.

    The page is split into team boxes that start at an ``<h3>`` heading; the
    heading text is the team name. For every ``<li>`` entry one line is
    written: ``japanese_name<TAB>second_word first_word<TAB>team``, where
    the two words come from the title-cased romanised name. When the
    entry's ``team_j`` text contains "兼任", the last "/"-separated part of
    that text is appended to the team. Malformed boxes or entries are
    reported through ``output.print_msg`` and skipped. Nothing is written
    when the request does not succeed.

    Args:
        file_handle: object whose ``write`` method receives each output line.
    """
    index_url = "http://www.hkt48.jp/profile/"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        return
    # All backslash-bearing patterns are raw literals so that \s, \S and \d
    # are regex escapes rather than invalid string escapes.
    team_find = re.findall(r"(<h3>[\s|\S]*?)<!-- / .contsbox --></div>",
                           index_response.data)
    for team_page in team_find:
        team = tool.find_sub_string(team_page, "<h3>", "</h3>")
        if not team:
            output.print_msg("error team")
            continue
        team = team.strip()
        member_list = re.findall(r"<li>([\s|\S]*?)</li>", team_page)
        for member in member_list:
            for removable in ("<br />", "\n", "\r", "\t"):
                member = member.replace(removable, "")
            name_find = re.findall(
                r"""<a href="/profile/[\d]*"><img src="[^"]*" alt="[^"]*" width="120" height="150" /><span class='name_j'>([^"]*)</span><span class='name_e'>([^<]*)</span></a> """,
                member)
            if len(name_find) != 1:
                output.print_msg("error name_find")
                continue
            japanese_name, english_name = name_find[0]
            team_plus_find = re.findall(
                '<div class="team_j">([^<]*)</div>', member)
            team_name = team
            if len(team_plus_find) == 1 and "兼任" in team_plus_find[0]:
                concurrent_team = team_plus_find[0].split(
                    "/")[-1].strip().replace("兼任", "")
                team_name = team + " / " + concurrent_team
            japanese_name = japanese_name.replace(" ", "")
            # A name without a space cannot be split into two words; report
            # it like the other malformed entries instead of raising
            # ValueError and aborting the whole run.
            name_parts = english_name.strip().title().split(" ", 1)
            if len(name_parts) != 2:
                output.print_msg("error english_name split")
                continue
            first_name, last_name = name_parts

            file_handle.write(japanese_name + "\t" + last_name + " " +
                              first_name + "\t" + team_name + "\n")
Beispiel #12
0
    "flail2-2h": "双手连枷",
    "legendarygem": "傳奇宝石",
}

item_attribute_list = {}
base_host = "http://db.d.163.com"
for item_path, item_position in item_list.items():
    page_count = 1
    item_attribute_list[item_path] = []
    while True:
        if item_position == "傳奇宝石":
            item_index_url = base_host + "/tw/base/legendarygem/"
        else:
            item_index_url = base_host + "/tw/item/%s/legendary.html#page=%s" % (
                item_path, page_count)
        item_index_response = net.http_request(item_index_url, method="GET")
        if item_index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
            # item_index = item_index.decode("UTF-8")
            item_index_page = tool.find_sub_string(item_index_response.data,
                                                   '<div class="cizhui-c-m',
                                                   '<div class="data-options',
                                                   1)
            item_index_page = item_index_page.decode("GBK").encode("UTF-8")
            item_info_list = re.findall('<tr class="[\s|\S]*?</tr>',
                                        item_index_page)
            if len(item_info_list) == 0:
                continue
            for item_info in item_info_list:
                if item_info.find('<em class="transmog-s"></em>') >= 0:
                    continue
                item_url = tool.find_sub_string(item_info, '<a href="', '"')