def get_account_from_index():
    """Fetch the Keyakizaka46 member diary index and return {account_id: member_name}.

    Raises crawler.CrawlerException when the request fails or any expected
    HTML fragment cannot be located.
    """
    index_url = "http://www.keyakizaka46.com/mob/news/diarShw.php"
    query_data = {"cd": "member"}
    index_response = net.http_request(index_url, method="GET", fields=query_data)
    if index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(index_response.status))
    # The <ul class="thumb"> block holds one <li> per member
    member_list_data = tool.find_sub_string(index_response.data, '<ul class="thumb">', "</ul>")
    if not member_list_data:
        raise crawler.CrawlerException("页面截取账号列表失败\n%s" % index_response.data)
    account_list = {}
    for member_info in re.findall("<li ([\S|\s]*?)</li>", member_list_data):
        # Account id sits between "&ct=" and the closing quote of the link
        account_id = tool.find_sub_string(member_info, "&ct=", '">')
        if not account_id:
            raise crawler.CrawlerException("账号信息截取账号id失败\n%s" % member_info)
        # Member name with every space removed
        account_name = tool.find_sub_string(member_info, '<p class="name">', "</p>").strip().replace(" ", "")
        if not account_name:
            raise crawler.CrawlerException("账号信息截取成员名字失败\n%s" % member_info)
        account_list[account_id] = account_name
    return account_list
def get_game_invalid_achievements(game_id):
    """Check one game's astats.nl info page and report invalid achievements.

    Prints (via output.print_msg) a message when the game lists invalid
    achievements or carries unrecognized achievement markup; exits the
    process when the page request fails. Returns None in all cases.
    """
    game_index_url = "http://astats.astats.nl/astats/Steam_Game_Info.php"
    query_data = {"AppID": game_id}
    game_index_response = net.http_request(game_index_url, method="GET", fields=query_data)
    if game_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        output.print_msg("游戏 %s 访问失败" % game_id)
        tool.process_exit()
    # game id does not exist in the astats database
    if game_index_response.data.find("This game cannot be found in the database.") >= 0:
        return
    achievement_text = tool.find_sub_string(game_index_response.data, '<span class="GameInfoBoxRow">Achievements</span><br>', "</td>")
    # Game has no achievements at all
    if not achievement_text:
        return
    achievement_text = achievement_text.strip()
    # A bare integer means every achievement is valid; anything else carries markup
    if not crawler.is_integer(achievement_text):
        invalid_achievement_text = tool.find_sub_string(achievement_text, '<font color="#FF0000">', "</font>")
        if invalid_achievement_text:
            output.print_msg("游戏 %s, 存在无效成就,%s" % (game_id, invalid_achievement_text))
        else:
            # BUG FIX: the original printed invalid_achievement_text here, which is
            # always empty/falsy on this branch, so the message showed nothing.
            # Print the raw achievement_text so the unknown markup is visible.
            output.print_msg("游戏 %s, 存在未知成就文字:%s" % (game_id, achievement_text))
def jkt(file_handle):
    """Scrape the JKT48 member list and write one "japanese\\tenglish\\tteam" line per member.

    NOTE(review): the page exposes only one name per member (the img alt text),
    so the japanese and english columns are written with the same value.
    """
    index_url = "http://www.jkt48.com/member/list"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        # Main content column of the member list page
        page = tool.find_sub_string(index_response.data, '<div id="mainCol">', "<!--end #mainCol-->", 1)
        # Collect the offset of every '<a name="' team anchor; the loop's final
        # iteration appends -1 (find's failure value), which acts as an end sentinel.
        # NOTE(review): the first find starts at offset 1, so an anchor at
        # offset 0 would be missed — presumably the page never starts with one.
        start_index = 0
        start_index_list = []
        while start_index != -1:
            start_index = page.find('<a name="', start_index + 1)
            start_index_list.append(start_index)
        # Each consecutive pair of anchors delimits one team's section
        for i in range(0, len(start_index_list) - 1):
            start = start_index_list[i]
            end = start_index_list[i + 1]
            if end == -1:
                # Last section (end sentinel) runs to the end of the page
                end = len(page)
            split_page = page[start:end]
            team_name = tool.find_sub_string(split_page, "<h2>", "</h2>")
            # Sections whose heading lacks "Team" are the trainees (kenkyusei)
            if team_name.find("Team") == -1:
                team_name = "Team kenkyusei"
            team_name = "JKT48 " + team_name
            member_list = re.findall('<div class="profileWrap">([\s|\S]*?)</div><!--/loop-->', split_page)
            for member in member_list:
                # Strip markup line breaks and control whitespace before matching
                member = member.replace("<br>", "").replace("\n", "").replace("\r", "").replace("\t", "")
                japanese_name = english_name = tool.find_sub_string(member, 'alt="', '"')
                file_handle.write(japanese_name + "\t" + english_name + "\t" + team_name + "\n")
def nmb(file_handle):
    """Scrape the NMB48 member list and write one "japanese\\tlast first\\tteam" line per member.

    Members holding a concurrent post (兼任) get the extra team appended as
    "base team / extra team".
    """
    # Maps the page's <a name="..."> anchor ids to display team names
    team_list = {
        "teamn": "NMB48 Team N",
        "teamm": "NMB48 Team M",
        "teamb2": "NMB48 Team BII",
        "dkenkyusei": "NMB48 Team Kenkyusei",
        "kenkyusei": "NMB48 Team Kenkyusei",
    }
    index_url = "http://www.nmb48.com/member/"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        # Each "チーム別領域ボックス" (per-team area box) comment pair wraps one team's markup.
        # NOTE(review): the closing marker in this regex carries a trailing space — kept as-is.
        team_page_list = re.findall("<!--▼チーム別領域ボックス▼-->([\s|\S]*?)<!--▲チーム別領域ボックス▲--> ", index_response.data)
        for team_page in team_page_list:
            team_find = tool.find_sub_string(team_page, '<a name="', '"></a>')
            if team_find:
                if team_find not in team_list:
                    output.print_msg("not found %s in team_list" % team_find)
                    continue
                member_list = re.findall('<li class="member-box[^"]*">([\s|\S]*?)</li>', team_page)
                for member in member_list:
                    # Flatten markup line breaks / control whitespace; the final
                    # replace normalizes a non-ASCII space variant to a plain space
                    member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "").replace(" ", " ")
                    japanese_name_find = re.findall('<h4><a href="[^"]*">([^<]*)</a></h4>', member)
                    english_name_find = re.findall("<p[\s|\S]*?>([\s|\S]*?)</[p|a]>", member)
                    if len(japanese_name_find) != 1:
                        output.print_msg("error japanese_name_find")
                        continue
                    if len(english_name_find) != 1:
                        output.print_msg("error english_name_find")
                        continue
                    team = team_list[team_find]
                    # A <span> inside the english name carries concurrent-post info
                    if english_name_find[0].find("<span>") >= 0:
                        temp = english_name_find[0].split("<span>")
                        english_name_find[0] = temp[0]
                        temp[1] = temp[1].replace("</span>", "")
                        # "研究生" (trainee) notes are ignored; otherwise append the extra team
                        if temp[1].find("研究生") == -1:
                            team += " / " + temp[1].split("/")[-1].strip()
                    # Normalize the space variant, then drop all spaces from the Japanese name
                    japanese_name = japanese_name_find[0].replace(" ", " ").replace(" ", "")
                    # Page order is "First Last"; output wants "Last First"
                    first_name, last_name = english_name_find[0].strip().title().split(" ", 1)
                    file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")
            else:
                output.print_msg("error team_find")
def get_bbs_forum_url_list(index_url):
    """Return {forum_url: forum_name} scraped from a Discuz-style index page.

    Returns None when the page request fails.
    """
    index_response = net.http_request(index_url, method="GET")
    if index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        return None
    # Resolve relative forum paths against the index URL's directory
    host = index_url[0: index_url.rfind("/") + 1]
    forum_find = re.findall('<a href="(forum-\w*-\d*.\w*)"[^>]*>([\S]*)</a>', index_response.data)
    return {host + forum_path: forum_name for forum_path, forum_name in forum_find}
def get_one_forum_page_thread_url_list(forum_url):
    """Return {thread_url: thread_title} for one forum page.

    Returns None when the page request fails.
    """
    forum_response = net.http_request(forum_url, method="GET")
    if forum_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        return None
    # Restrict matching to the thread list container, before the special-filter menu
    forum_page = tool.find_sub_string(forum_response.data, '<div id="threadlist"', '<div id="filter_special_menu"', 1)
    # Resolve relative thread paths against the forum URL's directory
    host = forum_url[0: forum_url.rfind("/") + 1]
    thread_find = re.findall('<a href="(thread-\d*-1-1.\w*)" onclick="atarget\(this\)" class="s xst">([\S|\s]*?)</a>', forum_page)
    return {host + thread_path: thread_title for thread_path, thread_title in thread_find}
def get_thread_author_post(thread_url):
    """Return the first-floor post body of a thread.

    The body is decoded with the charset advertised in the Content-Type
    header when one is present; otherwise the raw text is returned.
    Returns None when the page request fails.
    """
    thread_response = net.http_request(thread_url, method="GET")
    if thread_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        return None
    post_message = tool.find_sub_string(thread_response.data, '<td class="t_f" id="postmessage_', '<div id="comment_')
    # Drop the tail of the opening tag and the trailing </td>
    body_start = post_message.find('">') + 2
    body_end = post_message.rfind("</td>")
    post_message = post_message[body_start: body_end]
    content_type = thread_response.getheader("Content-Type")
    if content_type is None:
        # No Content-Type header: hand back the undecoded body
        return post_message
    charset = tool.find_sub_string(content_type, "charset=")
    return post_message.decode(charset)
def ske(file_handle):
    """Scrape the SKE48 profile list and write one "japanese\\tlast first\\tteam" line per member.

    Members with a concurrent post (兼任) get the extra team appended as
    "base team / extra team".
    """
    # Maps each team's display name to the (start, end) comment markers that
    # delimit its section on the page
    split_list = {
        "SKE48 Team S": ("<!-- LIST - TEAM S -->", "<!-- //LIST - TEAM S -->"),
        "SKE48 Team KII": ("<!-- LIST - TEAM KII -->", "<!-- //LIST - TEAM KII -->"),
        "SKE48 Team E": ("<!-- LIST - TEAM E -->", "<!-- //LIST - TEAM E -->"),
        "SKE48 Team Kenkyusei": ("<!-- LIST - KENKYUSEI -->", "<!-- //LIST - KENKYUSEI -->")
    }
    index_url = "http://www.ske48.co.jp/profile/list.php"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        for team_name in split_list:
            # Cut out this team's section, then one <dl> per member
            team_page = tool.find_sub_string(index_response.data, split_list[team_name][0], split_list[team_name][1])
            member_list = re.findall("<dl>([\s|\S]*?)</dl>", team_page)
            for member in member_list:
                # Flatten markup line breaks and control whitespace before matching
                member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                japanese_name_find = re.findall('<h3><a href="./\?id=[^"]*">([^<]*)</a></h3>', member)
                english_name = tool.find_sub_string(member, '<h3 class="en">', "</h3>")
                # Optional extra line that carries concurrent-post (兼任) info
                plus_text = tool.find_sub_string(member, '<li class="textPlus">', "</li>")
                if len(japanese_name_find) != 1:
                    output.print_msg("error japanese_name_find")
                    continue
                if not english_name:
                    output.print_msg("error english_name")
                    continue
                japanese_name = japanese_name_find[0].replace(" ", "")
                # Page order is "First Last"; output wants "Last First"
                first_name, last_name = english_name.strip().title().split(" ", 1)
                if plus_text and plus_text.find("兼任") > 0:
                    # Take the last "/"-separated part, render チーム as " Team " and drop 兼任
                    team = team_name + " / " + plus_text.split("/")[-1].strip().replace("チーム", " Team ").replace("兼任", "")
                else:
                    team = team_name
                file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")
def get_account_from_index():
    """Fetch the Nogizaka46 blog index and return {account_id: member_name}.

    Raises crawler.CrawlerException when the request fails or no member
    entries can be extracted from the page.
    """
    index_url = "http://blog.nogizaka46.com/"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(index_response.status))
    member_list_find = re.findall('<div class="unit"><a href="./([^"]*)"><img src="[^>]*alt="([^"]*)" />', index_response.data)
    if len(member_list_find) == 0:
        raise crawler.CrawlerException("页面截取成员类别失败\n%s" % index_response.data)
    account_list = {}
    for account_id, member_name in member_list_find:
        # Names come with embedded spaces; strip them out
        account_list[account_id] = member_name.replace(" ", "")
    return account_list
def akb(file_handle):
    """Scrape the AKB48 member list per team and write one "japanese\\tlast first\\tteam" line per member.

    Iterates the site's team ids; id 12 appears to be a combined/concurrent
    listing whose members carry two team labels — TODO confirm against the site.
    """
    for team_id in [1, 2, 3, 4, 12]:
        member_index_url = "http://www.akb48.co.jp/about/members/"
        query_data = {"team_id": team_id}
        member_index_response = net.http_request(member_index_url, method="GET", fields=query_data)
        if member_index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
            # The <ul class="memberListUl"> block holds one <li> per member
            member_list_page = tool.find_sub_string(member_index_response.data, '<ul class="memberListUl">', "</ul>")
            if member_list_page:
                member_list = re.findall("<li>([\s|\S]*?)</li>", member_list_page)
                for member in member_list:
                    # Flatten markup line breaks and control whitespace before matching
                    member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                    japanese_name = tool.find_sub_string(member, '<h4 class="memberListNamej">', "</h4>")
                    english_name = tool.find_sub_string(member, '<p class="memberListNamee">', "</p>")
                    team_find = re.findall('<h5 class="memberListTeam">([^<]*)</h5>', member)
                    if not japanese_name:
                        output.print_msg("error japanese_name")
                        continue
                    if not english_name:
                        output.print_msg("error english_name")
                        continue
                    # team_id 12 entries carry two team labels; all others carry one
                    if (team_id != 12 and len(team_find) != 1) or (team_id == 12 and len(team_find) != 2):
                        output.print_msg("error team_find")
                        continue
                    japanese_name = japanese_name.replace(" ", "")
                    # Page order is "First Last"; output wants "Last First".
                    # NOTE(review): unlike the other group scrapers, no .strip().title()
                    # here — presumably this site already formats the name; verify.
                    first_name, last_name = english_name.split(" ", 1)
                    team = team_find[0].strip().replace(" /", " / ")
                    file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")
            else:
                output.print_msg("error member_list_page")
def hkt(file_handle):
    """Scrape the HKT48 profile page and write one "japanese\\tlast first\\tteam" line per member.

    Members with a concurrent post (兼任) get the extra team appended as
    "base team / extra team".
    """
    index_url = "http://www.hkt48.jp/profile/"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        # Each team section runs from its <h3> heading to the .contsbox closing comment
        team_find = re.findall("(<h3>[\s|\S]*?)<!-- / .contsbox --></div>", index_response.data)
        for team_page in team_find:
            team = tool.find_sub_string(team_page, "<h3>", "</h3>")
            if not team:
                output.print_msg("error team")
                continue
            team = team.strip()
            member_list = re.findall("<li>([\s|\S]*?)</li>", team_page)
            for member in member_list:
                # Flatten markup line breaks and control whitespace before matching
                member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                # Captures (japanese_name, english_name) from the profile link markup.
                # NOTE(review): the pattern ends with "</a> " + trailing space — kept as-is.
                name_find = re.findall(
                    """<a href="/profile/[\d]*"><img src="[^"]*" alt="[^"]*" width="120" height="150" /><span class='name_j'>([^"]*)</span><span class='name_e'>([^<]*)</span></a> """,
                    member)
                if len(name_find) != 1:
                    output.print_msg("error name_find")
                    continue
                japanese_name, english_name = name_find[0]
                # Optional per-member team note; 兼任 marks a concurrent post
                team_plus_find = re.findall('<div class="team_j">([^<]*)</div>', member)
                team_name = team
                if len(team_plus_find) == 1:
                    if team_plus_find[0].find("兼任") >= 0:
                        team_name = team + " / " + team_plus_find[0].split("/")[-1].strip().replace("兼任", "")
                japanese_name = japanese_name.replace(" ", "")
                # Page order is "First Last"; output wants "Last First"
                first_name, last_name = english_name.strip().title().split(" ", 1)
                file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team_name + "\n")
"flail2-2h": "双手连枷", "legendarygem": "傳奇宝石", } item_attribute_list = {} base_host = "http://db.d.163.com" for item_path, item_position in item_list.items(): page_count = 1 item_attribute_list[item_path] = [] while True: if item_position == "傳奇宝石": item_index_url = base_host + "/tw/base/legendarygem/" else: item_index_url = base_host + "/tw/item/%s/legendary.html#page=%s" % ( item_path, page_count) item_index_response = net.http_request(item_index_url, method="GET") if item_index_response.status == net.HTTP_RETURN_CODE_SUCCEED: # item_index = item_index.decode("UTF-8") item_index_page = tool.find_sub_string(item_index_response.data, '<div class="cizhui-c-m', '<div class="data-options', 1) item_index_page = item_index_page.decode("GBK").encode("UTF-8") item_info_list = re.findall('<tr class="[\s|\S]*?</tr>', item_index_page) if len(item_info_list) == 0: continue for item_info in item_info_list: if item_info.find('<em class="transmog-s"></em>') >= 0: continue item_url = tool.find_sub_string(item_info, '<a href="', '"')