def get_article_title(article_page, article_type):
    if article_type == "t":
        return tool.find_sub_string(article_page, '<div class="title" node-type="articleTitle">', "</div>")
    elif article_type == "p":
        return tool.find_sub_string(article_page, '<h1 class=\\"title\\">', "<\\/h1>")
    else:
        return None

def get_account_from_index():
    index_url = "http://www.keyakizaka46.com/mob/news/diarShw.php"
    query_data = {"cd": "member"}
    index_response = net.http_request(index_url, method="GET", fields=query_data)
    account_list = {}
    if index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(index_response.status))
    member_list_data = tool.find_sub_string(index_response.data, '<ul class="thumb">', "</ul>")
    if not member_list_data:
        raise crawler.CrawlerException("页面截取账号列表失败\n%s" % index_response.data)
    member_list_find = re.findall("<li ([\S|\s]*?)</li>", member_list_data)
    for member_info in member_list_find:
        # extract the account id
        account_id = tool.find_sub_string(member_info, "&ct=", '">')
        if not account_id:
            raise crawler.CrawlerException("账号信息截取账号id失败\n%s" % member_info)
        # extract the member name
        account_name = tool.find_sub_string(member_info, '<p class="name">', "</p>").strip().replace(" ", "")
        if not account_name:
            raise crawler.CrawlerException("账号信息截取成员名字失败\n%s" % member_info)
        account_list[account_id] = account_name
    return account_list

def jkt(file_handle):
    index_url = "http://www.jkt48.com/member/list"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        page = tool.find_sub_string(index_response.data, '<div id="mainCol">', "<!--end #mainCol-->", 1)
        start_index = 0
        start_index_list = []
        while start_index != -1:
            start_index = page.find('<a name="', start_index + 1)
            start_index_list.append(start_index)
        for i in range(0, len(start_index_list) - 1):
            start = start_index_list[i]
            end = start_index_list[i + 1]
            if end == -1:
                end = len(page)
            split_page = page[start:end]
            team_name = tool.find_sub_string(split_page, "<h2>", "</h2>")
            if team_name.find("Team") == -1:
                team_name = "Team kenkyusei"
            team_name = "JKT48 " + team_name
            member_list = re.findall('<div class="profileWrap">([\s|\S]*?)</div><!--/loop-->', split_page)
            for member in member_list:
                member = member.replace("<br>", "").replace("\n", "").replace("\r", "").replace("\t", "")
                japanese_name = english_name = tool.find_sub_string(member, 'alt="', '"')
                file_handle.write(japanese_name + "\t" + english_name + "\t" + team_name + "\n")

def ske(file_handle):
    split_list = {
        "SKE48 Team S": ("<!-- LIST - TEAM S -->", "<!-- //LIST - TEAM S -->"),
        "SKE48 Team KII": ("<!-- LIST - TEAM KII -->", "<!-- //LIST - TEAM KII -->"),
        "SKE48 Team E": ("<!-- LIST - TEAM E -->", "<!-- //LIST - TEAM E -->"),
        "SKE48 Team Kenkyusei": ("<!-- LIST - KENKYUSEI -->", "<!-- //LIST - KENKYUSEI -->"),
    }
    index_url = "http://www.ske48.co.jp/profile/list.php"
    return_code, page = tool.http_request(index_url)[:2]
    if return_code == 1:
        for team_name in split_list:
            team_page = tool.find_sub_string(page, split_list[team_name][0], split_list[team_name][1])
            member_list = re.findall('<dl>([\s|\S]*?)</dl>', team_page)
            for member in member_list:
                member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                japanese_name_find = re.findall('<h3><a href="./\?id=[^"]*">([^<]*)</a></h3>', member)
                english_name = tool.find_sub_string(member, '<h3 class="en">', '</h3>')
                plus_text = tool.find_sub_string(member, '<li class="textPlus">', '</li>')
                if len(japanese_name_find) != 1:
                    print "error japanese_name_find"
                    continue
                if not english_name:
                    print "error english_name"
                    continue
                japanese_name = japanese_name_find[0].replace(" ", "")
                first_name, last_name = english_name.strip().title().split(" ", 1)
                if plus_text and plus_text.find("兼任") > 0:
                    team = team_name + " / " + plus_text.split("/")[-1].strip().replace("チーム", " Team ").replace("兼任", "")
                else:
                    team = team_name
                file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")

def akb(file_handle):
    for team_id in [1, 2, 3, 4, 12]:
        index_url = "http://www.akb48.co.jp/about/members/?team_id=" + str(team_id)
        return_code, page = tool.http_request(index_url)[:2]
        if return_code == 1:
            member_list_page = tool.find_sub_string(page, '<ul class="memberListUl">', '</ul>')
            if member_list_page:
                member_list = re.findall("<li>([\s|\S]*?)</li>", member_list_page)
                for member in member_list:
                    member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                    japanese_name = tool.find_sub_string(member, '<h4 class="memberListNamej">', '</h4>')
                    english_name = tool.find_sub_string(member, '<p class="memberListNamee">', '</p>')
                    team_find = re.findall('<h5 class="memberListTeam">([^<]*)</h5>', member)
                    if not japanese_name:
                        print "error japanese_name"
                        continue
                    if not english_name:
                        print "error english_name"
                        continue
                    if (team_id != 12 and len(team_find) != 1) or (team_id == 12 and len(team_find) != 2):
                        print "error team_find"
                        continue
                    japanese_name = japanese_name.replace(" ", "")
                    first_name, last_name = english_name.split(" ", 1)
                    team = team_find[0].strip().replace(" /", " / ")
                    file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")
            else:
                print "error member_list_page"

def get_game_invalid_achievements(game_id):
    game_index_url = "http://astats.astats.nl/astats/Steam_Game_Info.php"
    query_data = {"AppID": game_id}
    game_index_response = net.http_request(game_index_url, method="GET", fields=query_data)
    if game_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        output.print_msg("游戏 %s 访问失败" % game_id)
        tool.process_exit()
    # the game id does not exist
    if game_index_response.data.find("This game cannot be found in the database.") >= 0:
        return
    achievement_text = tool.find_sub_string(game_index_response.data, '<span class="GameInfoBoxRow">Achievements</span><br>', "</td>")
    # no achievements
    if not achievement_text:
        return
    achievement_text = achievement_text.strip()
    if not crawler.is_integer(achievement_text):
        invalid_achievement_text = tool.find_sub_string(achievement_text, '<font color="#FF0000">', "</font>")
        if invalid_achievement_text:
            output.print_msg("游戏 %s, 存在无效成就,%s" % (game_id, invalid_achievement_text))
        else:
            # invalid_achievement_text is empty on this branch, so report the raw text instead
            output.print_msg("游戏 %s, 存在未知成就文字:%s" % (game_id, achievement_text))

def jkt(file_handle):
    index_url = "http://www.jkt48.com/member/list"
    return_code, page = tool.http_request(index_url)[:2]
    if return_code == 1:
        page = tool.find_sub_string(page, '<div id="mainCol">', "<!--end #mainCol-->", 1)
        start_index = 0
        start_index_list = []
        while start_index != -1:
            start_index = page.find('<a name="', start_index + 1)
            start_index_list.append(start_index)
        for i in range(0, len(start_index_list) - 1):
            start = start_index_list[i]
            end = start_index_list[i + 1]
            if end == -1:
                end = len(page)
            split_page = page[start: end]
            team_name = tool.find_sub_string(split_page, "<h2>", "</h2>")
            if team_name.find("Team") == -1:
                team_name = "Team kenkyusei"
            team_name = "JKT48 " + team_name
            member_list = re.findall('<div class="profileWrap">([\s|\S]*?)</div><!--/loop-->', split_page)
            for member in member_list:
                member = member.replace("<br>", "").replace("\n", "").replace("\r", "").replace("\t", "")
                japanese_name = english_name = tool.find_sub_string(member, 'alt="', '"')
                file_handle.write(japanese_name + "\t" + english_name + "\t" + team_name + "\n")

def get_api_info(account_name):
    photo_index_url = "https://www.flickr.com/photos/%s" % account_name
    photo_index_return_code, photo_index_page = tool.http_request(photo_index_url)[:2]
    if photo_index_return_code == 1:
        user_id = tool.find_sub_string(photo_index_page, '"nsid":"', '"')
        site_key = tool.find_sub_string(photo_index_page, '"site_key":"', '"')
        return {"user_id": user_id, "site_key": site_key}
    return None

def get_article_id(article_url):
    article_id = tool.find_sub_string(article_url, "http://weibo.com/ttarticle/p/show?id=", "&mod=zwenzhang")
    if article_id:
        return "t_%s" % article_id
    else:
        article_id = tool.find_sub_string(article_url, "http://weibo.com/p/", "?mod=zwenzhang")
        if article_id:
            return "p_%s" % article_id
    return None

def get_thread_author_post(thread_url):
    thread_return_code, thread_page, thread_response = tool.http_request(thread_url)
    if thread_return_code == 1:
        content_type = tool.get_response_info(thread_response.info(), "Content-Type")
        charset = tool.find_sub_string(content_type, "charset=")
        post_message = tool.find_sub_string(thread_page, '<td class="t_f" id="postmessage_', '<div id="comment_')
        post_message = post_message[post_message.find('">') + 2: post_message.rfind("</td>")]
        return post_message.decode(charset)
    return None

def get_blog_time(blog_page):
    blog_time_info = tool.find_sub_string(blog_page, '<span class="articleTime">', "</span>")
    if blog_time_info:
        blog_time_string = tool.find_sub_string(blog_page, 'pubdate="pubdate">', "</time>").strip()
    else:
        blog_time_string = tool.find_sub_string(blog_page, '<span class="date">', "</span>").strip()
    if blog_time_string:
        blog_timestamp = time.strptime(blog_time_string, "%Y-%m-%d %H:%M:%S")
        # timestamp of the displayed time in the server's time zone (Japan); no conversion to local time
        return int(time.mktime(blog_timestamp))
    return None

def get_image_url_list(blog_page):
    article_data = tool.find_sub_string(blog_page, '<div class="articleText">', "<!--entryBottom-->", 1)
    if not article_data:
        article_data = tool.find_sub_string(blog_page, '<div class="subContentsInner">', "<!--entryBottom-->", 1)
    image_url_list_find = re.findall('<img [\S|\s]*?src="([^"]*)" [\S|\s]*?>', article_data)
    image_url_list = []
    for image_url in image_url_list_find:
        # filter out emoticon stamps
        if image_url.find(".ameba.jp/blog/ucs/") == -1:
            image_url_list.append(image_url)
    return image_url_list

def get_article_image_url_list(article_page, article_type):
    if article_type == "t":
        article_body = tool.find_sub_string(article_page, '<div class="WB_editor_iframe', '<div class="artical_add_box')
    elif article_type == "p":
        article_body = tool.find_sub_string(article_page, '{"ns":"pl.content.longFeed.index"', "</script>")
        article_body = article_body.replace("\\", "")
    else:
        return None
    if article_body:
        return re.findall('<img[^>]* src="([^"]*)"[^>]*>', article_body)
    return None

def get_thread_author_post(thread_url):
    thread_response = net.http_request(thread_url, method="GET")
    if thread_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        post_message = tool.find_sub_string(thread_response.data, '<td class="t_f" id="postmessage_', '<div id="comment_')
        post_message = post_message[post_message.find('">') + 2: post_message.rfind("</td>")]
        content_type = thread_response.getheader("Content-Type")
        if content_type is None:
            return post_message
        charset = tool.find_sub_string(content_type, "charset=")
        return post_message.decode(charset)
    return None

def get_member_list():
    index_url = "http://www.keyakizaka46.com/mob/news/diarShw.php?cd=member"
    index_return_code, index_page = tool.http_request(index_url)[:2]
    if index_return_code:
        member_list_data = tool.find_sub_string(index_page, '<ul class="thumb">', "</ul>")
        if member_list_data:
            member_list_find = re.findall("<li ([\S|\s]*?)</li>", member_list_data)
            for member_info in member_list_find:
                ct = tool.find_sub_string(member_info, "&ct=", '">')
                name = tool.find_sub_string(member_info, '<p class="name">', "</p>").strip().replace(" ", "")
                tool.print_msg("%s\t\t\t%s" % (ct, name), False)
            if len(member_list_find) > 0:
                tool.print_msg("复制以上内容到save.data中,删除不需要的行,即可开始运行", False)
    return None

def hkt(file_handle):
    index_url = "http://www.hkt48.jp/profile/"
    return_code, page = tool.http_request(index_url)[:2]
    if return_code == 1:
        team_find = re.findall('(<h3>[\s|\S]*?)<!-- / .contsbox --></div>', page)
        for team_page in team_find:
            team = tool.find_sub_string(team_page, "<h3>", "</h3>")
            if not team:
                print "error team"
                continue
            team = team.strip()
            member_list = re.findall("<li>([\s|\S]*?)</li>", team_page)
            for member in member_list:
                member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                name_find = re.findall('''<a href="/profile/[\d]*"><img src="[^"]*" alt="[^"]*" width="120" height="150" /><span class='name_j'>([^"]*)</span><span class='name_e'>([^<]*)</span></a> ''', member)
                if len(name_find) != 1:
                    print "error name_find"
                    continue
                japanese_name, english_name = name_find[0]
                team_plus_find = re.findall('<div class="team_j">([^<]*)</div>', member)
                team_name = team
                if len(team_plus_find) == 1:
                    if team_plus_find[0].find("兼任") >= 0:
                        team_name = team + " / " + team_plus_find[0].split("/")[-1].strip().replace("兼任", "")
                japanese_name = japanese_name.replace(" ", "")
                first_name, last_name = english_name.strip().title().split(" ", 1)
                file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team_name + "\n")

def get_picasaweb_page_album_id(account_id, picasaweb_url):
    message_page_return_code, message_page = tool.http_request(picasaweb_url)[:2]
    if message_page_return_code == 1:
        # look up the album id on the picasaweb page
        album_archive_url = "https://get.google.com/albumarchive/pwa/%s/album/" % account_id
        return tool.find_sub_string(message_page, 'href="%s' % album_archive_url, '"')
    return None

def get_one_page_blog(account_id, page_count):
    # http://blog.nogizaka46.com/asuka.saito
    blog_url = "http://blog.nogizaka46.com/%s/?p=%s" % (account_id, page_count)
    blog_return_code, blog_page = tool.http_request(blog_url)[:2]
    if blog_return_code == 1:
        return tool.find_sub_string(blog_page, '<div class="paginate">', '<div class="paginate">', 1)
    return None

def get_article_url(preview_article_data):
    page_route = tool.find_sub_string(preview_article_data, '<a target=\\"_blank\\" href=\\"', '\\">')
    page_route = page_route.replace("\\/", "/").replace("&amp;", "&")
    if page_route:
        return "http://weibo.com" + page_route
    else:
        return None

def get_max_page_count(coser_id, post_page):
    max_page_count = tool.find_sub_string(post_page, '<a href="/u/%s/post/cos?&p=' % coser_id, '">')
    if max_page_count:
        max_page_count = int(max_page_count)
    else:
        max_page_count = 1
    return max_page_count

def ske(file_handle):
    split_list = {
        "SKE48 Team S": ("<!-- LIST - TEAM S -->", "<!-- //LIST - TEAM S -->"),
        "SKE48 Team KII": ("<!-- LIST - TEAM KII -->", "<!-- //LIST - TEAM KII -->"),
        "SKE48 Team E": ("<!-- LIST - TEAM E -->", "<!-- //LIST - TEAM E -->"),
        "SKE48 Team Kenkyusei": ("<!-- LIST - KENKYUSEI -->", "<!-- //LIST - KENKYUSEI -->"),
    }
    index_url = "http://www.ske48.co.jp/profile/list.php"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        for team_name in split_list:
            team_page = tool.find_sub_string(index_response.data, split_list[team_name][0], split_list[team_name][1])
            member_list = re.findall("<dl>([\s|\S]*?)</dl>", team_page)
            for member in member_list:
                member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                japanese_name_find = re.findall('<h3><a href="./\?id=[^"]*">([^<]*)</a></h3>', member)
                english_name = tool.find_sub_string(member, '<h3 class="en">', "</h3>")
                plus_text = tool.find_sub_string(member, '<li class="textPlus">', "</li>")
                if len(japanese_name_find) != 1:
                    output.print_msg("error japanese_name_find")
                    continue
                if not english_name:
                    output.print_msg("error english_name")
                    continue
                japanese_name = japanese_name_find[0].replace(" ", "")
                first_name, last_name = english_name.strip().title().split(" ", 1)
                if plus_text and plus_text.find("兼任") > 0:
                    team = team_name + " / " + plus_text.split("/")[-1].strip().replace("チーム", " Team ").replace("兼任", "")
                else:
                    team = team_name
                file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")

def get_suid(account_id):
    index_page_url = "http://www.miaopai.com/u/paike_%s" % account_id
    index_page_return_code, index_page = tool.http_request(index_page_url)[:2]
    if index_page_return_code == 1:
        suid = tool.find_sub_string(index_page, '<button class="guanzhu gz" suid="', '" heade="1" token="">+关注</button>')
        if suid:
            return suid
    return None

def get_user_id(account_id):
    index_url = "http://changba.com/u/%s" % account_id
    index_return_code, index_page = tool.http_request(index_url)[:2]
    if index_return_code == 1:
        user_id = tool.find_sub_string(index_page, "var userid = '", "'")
        if user_id:
            return user_id
    return None

def get_account_id(account_name):
    account_index_url = "https://twitter.com/%s" % account_name
    account_index_return_code, account_index_page = tool.http_request(account_index_url)[:2]
    if account_index_return_code == 1:
        account_id = tool.find_sub_string(account_index_page, '<div class="ProfileNav" role="navigation" data-user-id="', '">')
        if account_id:
            return account_id
    return None

def nmb(file_handle):
    team_list = {
        "teamn": "NMB48 Team N",
        "teamm": "NMB48 Team M",
        "teamb2": "NMB48 Team BII",
        "dkenkyusei": "NMB48 Team Kenkyusei",
        "kenkyusei": "NMB48 Team Kenkyusei",
    }
    index_url = "http://www.nmb48.com/member/"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        team_page_list = re.findall("<!--▼チーム別領域ボックス▼-->([\s|\S]*?)<!--▲チーム別領域ボックス▲--> ", index_response.data)
        for team_page in team_page_list:
            team_find = tool.find_sub_string(team_page, '<a name="', '"></a>')
            if team_find:
                if team_find not in team_list:
                    output.print_msg("not found %s in team_list" % team_find)
                    continue
                member_list = re.findall('<li class="member-box[^"]*">([\s|\S]*?)</li>', team_page)
                for member in member_list:
                    member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "").replace("&nbsp;", " ")
                    japanese_name_find = re.findall('<h4><a href="[^"]*">([^<]*)</a></h4>', member)
                    english_name_find = re.findall("<p[\s|\S]*?>([\s|\S]*?)</[p|a]>", member)
                    if len(japanese_name_find) != 1:
                        output.print_msg("error japanese_name_find")
                        continue
                    if len(english_name_find) != 1:
                        output.print_msg("error english_name_find")
                        continue
                    team = team_list[team_find]
                    if english_name_find[0].find("<span>") >= 0:
                        temp = english_name_find[0].split("<span>")
                        english_name_find[0] = temp[0]
                        temp[1] = temp[1].replace("</span>", "")
                        if temp[1].find("研究生") == -1:
                            team += " / " + temp[1].split("/")[-1].strip()
                    japanese_name = japanese_name_find[0].replace("&nbsp;", " ").replace(" ", "")
                    first_name, last_name = english_name_find[0].strip().title().split(" ", 1)
                    file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")
            else:
                output.print_msg("error team_find")

def akb(file_handle):
    for team_id in [1, 2, 3, 4, 12]:
        member_index_url = "http://www.akb48.co.jp/about/members/"
        query_data = {"team_id": team_id}
        member_index_response = net.http_request(member_index_url, method="GET", fields=query_data)
        if member_index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
            member_list_page = tool.find_sub_string(member_index_response.data, '<ul class="memberListUl">', "</ul>")
            if member_list_page:
                member_list = re.findall("<li>([\s|\S]*?)</li>", member_list_page)
                for member in member_list:
                    member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                    japanese_name = tool.find_sub_string(member, '<h4 class="memberListNamej">', "</h4>")
                    english_name = tool.find_sub_string(member, '<p class="memberListNamee">', "</p>")
                    team_find = re.findall('<h5 class="memberListTeam">([^<]*)</h5>', member)
                    if not japanese_name:
                        output.print_msg("error japanese_name")
                        continue
                    if not english_name:
                        output.print_msg("error english_name")
                        continue
                    if (team_id != 12 and len(team_find) != 1) or (team_id == 12 and len(team_find) != 2):
                        output.print_msg("error team_find")
                        continue
                    japanese_name = japanese_name.replace(" ", "")
                    first_name, last_name = english_name.split(" ", 1)
                    team = team_find[0].strip().replace(" /", " / ")
                    file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")
            else:
                output.print_msg("error member_list_page")

def check_big_image(image_url, big_2_small_list):
    if image_url in big_2_small_list:
        big_image_display_page_return_code, big_image_display_page = tool.http_request(big_2_small_list[image_url])[:2]
        if big_image_display_page_return_code == 1:
            temp_image_url = tool.find_sub_string(big_image_display_page, '<img src="', '"')
            if temp_image_url != "/img/expired.gif":
                return temp_image_url, False
            else:
                # once one expired image is found, all earlier images are expired too, no need to keep checking
                return image_url, True
    return image_url, False

def get_image_url_list(album_page):
    image_url_list_find = tool.find_sub_string(album_page, '<input type="hidden" id="imageList" value=', ' />')
    try:
        image_url_list_find = json.loads(image_url_list_find)
    except ValueError:
        return None
    image_url_list = []
    for temp_image_list in image_url_list_find:
        image_url_list += temp_image_list
    return image_url_list

def get_one_page_diary_data(account_id, page_count):
    # http://www.keyakizaka46.com/mob/news/diarKiji.php?cd=member&ct=01&page=0&rw=20
    diary_page_url = "http://www.keyakizaka46.com/mob/news/diarKiji.php"
    diary_page_url += "?cd=member&ct=%02d&page=%s&rw=%s" % (int(account_id), page_count - 1, IMAGE_COUNT_PER_PAGE)
    diary_return_code, diary_page = tool.http_request(diary_page_url)[:2]
    if diary_return_code == 1:
        diary_page = tool.find_sub_string(diary_page, '<div class="box-main">', '<div class="box-sideMember">')
        if diary_page:
            return re.findall("<article>([\s|\S]*?)</article>", diary_page)
    return None

def get_account_page_id(account_id):
    for i in range(0, 50):
        index_url = "http://weibo.com/u/%s?is_all=1" % account_id
        index_page = auto_redirect_visit(index_url)
        if index_page:
            account_page_id = tool.find_sub_string(index_page, "$CONFIG['page_id']='", "'")
            if account_page_id:
                return account_page_id
        time.sleep(5)
    return None

def get_one_forum_page_thread_url_list(forum_url):
    forum_return_code, forum_page = tool.http_request(forum_url)[:2]
    if forum_return_code == 1:
        forum_page = tool.find_sub_string(forum_page, '<div id="threadlist"', '<div id="filter_special_menu"', 1)
        thread_find = re.findall('<a href="(thread-\d*-1-1.\w*)" onclick="atarget\(this\)" class="s xst">([\S|\s]*?)</a>', forum_page)
        host = forum_url[0: forum_url.rfind("/") + 1]
        thread_url_list = {}
        for forum_path, forum_name in thread_find:
            thread_url_list[host + forum_path] = forum_name
        return thread_url_list
    return None

def get_video_url_list(tweet_id):
    video_page_url = "https://twitter.com/i/videos/tweet/%s" % tweet_id
    video_page_return_code, video_page = tool.http_request(video_page_url)[:2]
    if video_page_return_code == 1:
        m3u8_file_url = tool.find_sub_string(video_page, "&quot;video_url&quot;:&quot;", "&quot;")
        if m3u8_file_url:
            m3u8_file_url = m3u8_file_url.replace("\\/", "/")
            ts_url_list = []
            get_ts_url_list(m3u8_file_url, ts_url_list)
            return "ts", ts_url_list
        vmap_file_url = tool.find_sub_string(video_page, "&quot;vmap_url&quot;:&quot;", "&quot;")
        if vmap_file_url:
            vmap_file_url = vmap_file_url.replace("\\/", "/")
            vmap_file_return_code, vmap_file = tool.http_request(vmap_file_url)[:2]
            if vmap_file_return_code:
                media_file_url = tool.find_sub_string(vmap_file, "<![CDATA[", "]]>")
                if media_file_url:
                    file_type = media_file_url.split(".")[-1].split("?")[0]
                    return file_type, media_file_url
    return "", []

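# get_ts_url_list() is called above but not defined in this section. The sketch below is a
# hypothetical reconstruction, not the original implementation: it assumes the m3u8 playlist
# lists one segment URL per line, that nested playlists end in ".m3u8", and that relative
# paths resolve against the playlist's own URL prefix. tool.http_request is the same helper
# used elsewhere in this section.
def get_ts_url_list(m3u8_file_url, ts_url_list):
    m3u8_file_return_code, m3u8_file = tool.http_request(m3u8_file_url)[:2]
    if m3u8_file_return_code != 1:
        return
    # relative segment paths are joined to the directory part of the playlist URL
    prefix_url = m3u8_file_url[0: m3u8_file_url.rfind("/") + 1]
    for line in m3u8_file.split("\n"):
        line = line.strip()
        # skip blank lines and #EXT tags
        if not line or line[0] == "#":
            continue
        segment_url = line if line.find("http") == 0 else prefix_url + line
        if segment_url.endswith(".m3u8"):
            # nested playlist: recurse into it
            get_ts_url_list(segment_url, ts_url_list)
        else:
            ts_url_list.append(segment_url)
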
def get_one_forum_page_thread_url_list(forum_url):
    forum_response = net.http_request(forum_url, method="GET")
    if forum_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        forum_page = tool.find_sub_string(forum_response.data, '<div id="threadlist"', '<div id="filter_special_menu"', 1)
        thread_find = re.findall('<a href="(thread-\d*-1-1.\w*)" onclick="atarget\(this\)" class="s xst">([\S|\s]*?)</a>', forum_page)
        host = forum_url[0: forum_url.rfind("/") + 1]
        thread_url_list = {}
        for forum_path, forum_name in thread_find:
            thread_url_list[host + forum_path] = forum_name
        return thread_url_list
    return None

def set_csrf_token():
    global CSRF_TOKEN
    index_url = "https://www.instagram.com/instagram"
    index_page_response = tool.http_request(index_url)
    if index_page_response[0] == 1:
        set_cookie_info = tool.get_response_info(index_page_response[2].info(), "Set-Cookie")
        if set_cookie_info is not None:
            csrf_token = tool.find_sub_string(set_cookie_info, "csrftoken=", ";")
            if csrf_token:
                CSRF_TOKEN = csrf_token
                return True
    return False

def get_follow_list(account_id):
    max_page_count = 1
    page_count = 1
    follow_list = {}
    while page_count <= max_page_count:
        follow_list_url = "http://www.meipai.com/user/%s/friends?p=%s" % (account_id, page_count)
        follow_list_page_return_code, follow_list_page = tool.http_request(follow_list_url)[:2]
        if follow_list_page_return_code == 1:
            follow_list_find = re.findall('<div class="ucard-info">([\s|\S]*?)</div>', follow_list_page)
            for follow_info in follow_list_find:
                follow_account_id = tool.find_sub_string(follow_info, '<a hidefocus href="/user/', '"').strip()
                follow_account_name = tool.find_sub_string(follow_info, 'title="', '"')
                follow_list[follow_account_id] = follow_account_name
            if max_page_count == 1:
                page_info = tool.find_sub_string(follow_list_page, '<div class="paging-wrap">', '</div>')
                if page_info:
                    page_find = re.findall("friends\?p=(\d*)", page_info)
                    page_find = [int(i) for i in page_find]
                    max_page_count = max(page_find)
            page_count += 1
        else:
            return None
    return follow_list

def get_post_page_head(post_url, postfix_list):
    post_page_return_code, post_page_data = tool.http_request(post_url)[:2]
    # if the URL without a postfix is reachable, return that page directly;
    # otherwise try the postfixed URLs one by one
    if post_page_return_code != 1:
        for postfix in postfix_list:
            temp_post_url = post_url + "/" + urllib2.quote(postfix)
            post_page_return_code, post_page_data = tool.http_request(temp_post_url)[:2]
            if post_page_return_code == 1:
                break
    if post_page_data is not None:
        return tool.find_sub_string(post_page_data, "<head", "</head>", 3)
    else:
        return None

def get_video_info_list(account_id):
    # http://www.nicovideo.jp/mylist/15614906#+page=1
    video_page_url = "http://www.nicovideo.jp/mylist/%s" % account_id
    video_page_return_code, video_page = tool.http_request(video_page_url)[:2]
    if video_page_return_code == 1:
        video_data = tool.find_sub_string(video_page, "Mylist.preload(%s," % account_id, ");").strip()
        try:
            video_data = json.loads(video_data)
        except ValueError:
            pass
        else:
            # reverse the order so the most recent videos come first
            video_data.reverse()
            return video_data
    return None

def get_discount_list():
    page_count = 1
    total_page_count = 99
    discount_list = []
    app_id_list = []
    while page_count <= total_page_count:
        index_url = "http://store.steampowered.com/search/results"
        index_url += "?sort_by=Price_ASC&category1=998&os=win&specials=1&page=%s" % page_count
        index_page_return_code, index_page = tool.http_request(index_url)[:2]
        if index_page_return_code != 1:
            break
        items_page = tool.find_sub_string(index_page, "<!-- List Items -->", "<!-- End List Items -->")
        items_page = tool.find_sub_string(items_page, "<a href=", None)
        items_page = items_page.replace("\n", "").replace("\r", "").replace("<a href=", "\n<a href=")
        items = items_page.split("\n")
        for item in items:
            app_id = tool.find_sub_string(item, 'data-ds-appid="', '"')
            discount_data = tool.find_sub_string(item, '<div class="col search_discount responsive_secondrow">', "</div>")
            discount = tool.find_sub_string(discount_data, "<span>", "</span>").replace("-", "").replace("%", "")
            if not discount:
                discount = 0
            price_data = tool.find_sub_string(item, '<div class="col search_price discounted responsive_secondrow">', "</div>", 2)
            old_price = tool.find_sub_string(price_data, '<strike>', '</strike>').replace("¥", "").strip()
            if not old_price:
                old_price = 0
            new_price = tool.find_sub_string(price_data, '<br>', '</div>').replace("¥", "").strip()
            if not new_price or not new_price.isdigit():
                new_price = 0
            if app_id not in app_id_list:
                discount_list.append("%s\t%s\t%s\t%s" % (app_id, discount, old_price, new_price))
                app_id_list.append(app_id)
        if total_page_count == 99:
            pagination_page = tool.find_sub_string(index_page, '<div class="search_pagination">', None)
            page_find = re.findall('return false;">([\d]*)</a>', pagination_page)
            if len(page_find) > 0:
                total_page_count = 0
                for page_id in page_find:
                    total_page_count = max(total_page_count, int(page_id))
        page_count += 1
    return discount_list

def hkt(file_handle):
    index_url = "http://www.hkt48.jp/profile/"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        team_find = re.findall("(<h3>[\s|\S]*?)<!-- / .contsbox --></div>", index_response.data)
        for team_page in team_find:
            team = tool.find_sub_string(team_page, "<h3>", "</h3>")
            if not team:
                output.print_msg("error team")
                continue
            team = team.strip()
            member_list = re.findall("<li>([\s|\S]*?)</li>", team_page)
            for member in member_list:
                member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                name_find = re.findall("""<a href="/profile/[\d]*"><img src="[^"]*" alt="[^"]*" width="120" height="150" /><span class='name_j'>([^"]*)</span><span class='name_e'>([^<]*)</span></a> """, member)
                if len(name_find) != 1:
                    output.print_msg("error name_find")
                    continue
                japanese_name, english_name = name_find[0]
                team_plus_find = re.findall('<div class="team_j">([^<]*)</div>', member)
                team_name = team
                if len(team_plus_find) == 1:
                    if team_plus_find[0].find("兼任") >= 0:
                        team_name = team + " / " + team_plus_find[0].split("/")[-1].strip().replace("兼任", "")
                japanese_name = japanese_name.replace(" ", "")
                first_name, last_name = english_name.strip().title().split(" ", 1)
                file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team_name + "\n")

def http_request(url, method="GET", fields=None, binary_data=None, header_list=None, cookies_list=None, encode_multipart=False, json_decode=False, is_auto_proxy=True, is_auto_redirect=True, is_gzip=True, is_url_encode=True, is_auto_retry=True, is_random_ip=True, connection_timeout=NET_CONFIG["HTTP_CONNECTION_TIMEOUT"], read_timeout=NET_CONFIG["HTTP_READ_TIMEOUT"]): """Http request via urllib3 :param url: the url which you want visit, start with "http://" or "https://" :param method: request method, value in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"] :param fields: dictionary type of request data, will urlencode() them to string. like post data, query string, etc not work with binary_data :param binary_data: binary type of request data, not work with post_data :param header_list: customize header dictionary :param cookies_list: customize cookies dictionary, will replaced header_list["Cookie"] :param encode_multipart: see "encode_multipart" in urllib3.request_encode_body :param is_auto_proxy: is auto use proxy when init PROXY_HTTP_CONNECTION_POOL :param is_auto_redirect: is auto redirect, when response.status in [301, 302, 303, 307, 308] :param is_auto_retry: is auto retry, when response.status in [500, 502, 503, 504] :param connection_timeout: customize connection timeout seconds :param read_timeout: customize read timeout seconds :param is_random_ip: is counterfeit a request header with random ip, will replaced header_list["X-Forwarded-For"] and header_list["X-Real-Ip"] :param json_decode: is return a decoded json data when response status = 200 if decode failure will replace response status with HTTP_RETURN_CODE_JSON_DECODE_ERROR """ url = str(url).strip() if not (url.find("http://") == 0 or url.find("https://") == 0): return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID) method = method.upper() if method not in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]: return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID) if HTTP_CONNECTION_POOL is None: init_http_connection_pool() connection_pool = HTTP_CONNECTION_POOL if PROXY_HTTP_CONNECTION_POOL is not None and is_auto_proxy: connection_pool = PROXY_HTTP_CONNECTION_POOL if is_url_encode: url = url_encode(url) if header_list is None: header_list = {} # 设置User-Agent if "User-Agent" not in header_list: header_list["User-Agent"] = _random_user_agent() # 设置一个随机IP if is_random_ip: random_ip = _random_ip_address() header_list["X-Forwarded-For"] = random_ip header_list["X-Real-Ip"] = random_ip # 设置cookie if cookies_list: header_list["Cookie"] = build_header_cookie_string(cookies_list) # 设置压缩格式 if is_gzip: header_list["Accept-Encoding"] = "gzip" # 超时设置 timeout = urllib3.Timeout(connect=float(connection_timeout) if connection_timeout > 0 else None, read=read_timeout if read_timeout > 0 else None) retry_count = 0 while True: thread_event.wait() if EXIT_FLAG: tool.process_exit(0) try: if method in ['DELETE', 'GET', 'HEAD', 'OPTIONS']: response = connection_pool.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, fields=fields) else: if binary_data is None: response = connection_pool.request(method, url, fields=fields, encode_multipart=encode_multipart, headers=header_list, redirect=is_auto_redirect, timeout=timeout) else: response = connection_pool.request(method, url, body=binary_data, encode_multipart=encode_multipart, headers=header_list, redirect=is_auto_redirect, timeout=timeout) if response.status == HTTP_RETURN_CODE_SUCCEED and json_decode: try: response.json_data = 
json.loads(response.data.decode()) except ValueError: is_error = True content_type = response.getheader("Content-Type") if content_type is not None: charset = tool.find_sub_string(content_type, "charset=", None) if charset: if charset == "gb2312": charset = "GBK" try: response.json_data = json.loads(response.data.decode(charset)) except: pass else: is_error = False if is_error: response.status = HTTP_RETURN_CODE_JSON_DECODE_ERROR elif response.status == 429: # Too Many Requests output.print_msg(url + " Too Many Requests, sleep") time.sleep(60) continue elif response.status in [500, 502, 503, 504] and is_auto_retry: # 服务器临时性错误,重试 if retry_count < NET_CONFIG["HTTP_REQUEST_RETRY_COUNT"]: retry_count += 1 time.sleep(30) continue else: return response return response except MemoryError: return ErrorResponse(HTTP_RETURN_CODE_RESPONSE_TO_LARGE) except Exception as e: message = str(e) if isinstance(e, urllib3.exceptions.ConnectTimeoutError): # 域名无法解析 if message.find("[Errno 11004] getaddrinfo failed") >= 0: return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED) elif message.find("[Errno 11001] getaddrinfo failed") >= 0: return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED) elif isinstance(e, urllib3.exceptions.MaxRetryError): if message.find("Caused by ResponseError('too many redirects'") >= 0: return ErrorResponse(HTTP_RETURN_CODE_TOO_MANY_REDIRECTS) elif isinstance(e, urllib3.exceptions.DecodeError): if message.find("'Received response with content-encoding: gzip, but failed to decode it.'") >= 0: return http_request(url, method=method, fields=fields, binary_data=binary_data, header_list=header_list, cookies_list=cookies_list, encode_multipart=encode_multipart, json_decode=json_decode, is_auto_proxy=is_auto_proxy, is_auto_redirect=is_auto_redirect, is_gzip=False, is_url_encode=False, is_auto_retry=is_auto_retry, is_random_ip=is_random_ip, connection_timeout=connection_timeout, read_timeout=read_timeout) # import traceback # output.print_msg(message) # output.print_msg(traceback.format_exc()) output.print_msg(url + " 访问超时,重试中") time.sleep(5) retry_count += 1 if retry_count >= NET_CONFIG["HTTP_REQUEST_RETRY_COUNT"]: output.print_msg("无法访问页面:" + url) return ErrorResponse(HTTP_RETURN_CODE_RETRY)
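# A brief usage sketch of http_request(), following the docstring above. The endpoint and
# query fields here are illustrative only, not taken from the source.
response = http_request("http://example.com/api/member_list", method="GET", fields={"page": 1}, json_decode=True)
if response.status == HTTP_RETURN_CODE_SUCCEED:
    # with json_decode=True the parsed body is attached as response.json_data
    print(response.json_data)
else:
    # any other status (including the HTTP_RETURN_CODE_* pseudo codes) means failure
    print("request failed, status: %s" % response.status)
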
item_attribute_list = {}
base_host = "http://db.d.163.com"
for item_path, item_position in item_list.items():
    page_count = 1
    item_attribute_list[item_path] = []
    while True:
        if item_position == "傳奇宝石":
            item_index_url = base_host + "/tw/base/legendarygem/"
        else:
            item_index_url = base_host + "/tw/item/%s/legendary.html#page=%s" % (item_path, page_count)
        item_index_response = net.http_request(item_index_url, method="GET")
        if item_index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
            # item_index = item_index.decode("UTF-8")
            item_index_page = tool.find_sub_string(item_index_response.data, '<div class="cizhui-c-m', '<div class="data-options', 1)
            item_index_page = item_index_page.decode("GBK").encode("UTF-8")
            item_info_list = re.findall('<tr class="[\s|\S]*?</tr>', item_index_page)
            if len(item_info_list) == 0:
                continue
            for item_info in item_info_list:
                if item_info.find('<em class="transmog-s"></em>') >= 0:
                    continue
                item_url = tool.find_sub_string(item_info, '<a href="', '"')
                item_name = tool.find_sub_string(item_info, 'class="diablo3tip">', "</a>")
                item_name = item_name.replace("'", "’")
                item_url = base_host + item_url
                item_response = net.http_request(item_url, method="GET")

def http_request(url, method="GET", post_data=None, binary_data=None, header_list=None, cookies_list=None, connection_timeout=HTTP_CONNECTION_TIMEOUT, read_timeout=HTTP_CONNECTION_TIMEOUT, is_random_ip=True, json_decode=False, encode_multipart=False, redirect=True, exception_return=""): if not (url.find("http://") == 0 or url.find("https://") == 0): return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID) method = method.upper() if method not in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]: return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID) if HTTP_CONNECTION_POOL is None: init_http_connection_pool() retry_count = 0 while True: while process.PROCESS_STATUS == process.PROCESS_STATUS_PAUSE: time.sleep(10) if process.PROCESS_STATUS == process.PROCESS_STATUS_STOP: tool.process_exit(0) if header_list is None: header_list = {} # 设置User-Agent if "User-Agent" not in header_list: header_list["User-Agent"] = _random_user_agent() # 设置一个随机IP if is_random_ip: random_ip = _random_ip_address() header_list["X-Forwarded-For"] = random_ip header_list["X-Real-Ip"] = random_ip # 设置cookie if cookies_list: header_list["Cookie"] = build_header_cookie_string(cookies_list) try: if connection_timeout == 0 and read_timeout == 0: timeout = None elif connection_timeout == 0: timeout = urllib3.Timeout(read=read_timeout) elif read_timeout == 0: timeout = urllib3.Timeout(connect=connection_timeout) else: timeout = urllib3.Timeout(connect=connection_timeout, read=read_timeout) if method == "POST": if binary_data is None: response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout, fields=post_data, encode_multipart=encode_multipart) else: response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout, body=binary_data, encode_multipart=encode_multipart) else: response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout) if response.status == HTTP_RETURN_CODE_SUCCEED and json_decode: try: response.json_data = json.loads(response.data) except ValueError: is_error = True content_type = response.getheader("Content-Type") if content_type is not None: charset = tool.find_sub_string(content_type, "charset=", None) if charset: if charset == "gb2312": charset = "GBK" try: response.json_data = json.loads(response.data.decode(charset)) except: pass else: is_error = False if is_error: response.status = HTTP_RETURN_CODE_JSON_DECODE_ERROR return response except urllib3.exceptions.ProxyError: notice = "无法访问代理服务器,请检查代理设置。检查完成后输入(C)ontinue继续程序或者(S)top退出程序:" input_str = tool.console_input(notice).lower() if input_str in ["c", "continue"]: pass elif input_str in ["s", "stop"]: tool.process_exit(0) except urllib3.exceptions.ReadTimeoutError: pass except urllib3.exceptions.ConnectTimeoutError, e: # 域名无法解析 if str(e).find("[Errno 11004] getaddrinfo failed") >= 0: return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED) pass # except urllib3.exceptions.MaxRetryError, e: # print_msg(url) # print_msg(str(e)) # # 无限重定向 # # if str(e).find("Caused by ResponseError('too many redirects',)") >= 0: # # return ErrorResponse(-1) # except urllib3.exceptions.ConnectTimeoutError, e: # print_msg(str(e)) # print_msg(url + " 访问超时,稍后重试") # # 域名无法解析 # # if str(e).find("[Errno 11004] getaddrinfo failed") >= 0: # # return ErrorResponse(-2) # except urllib3.exceptions.ProtocolError, e: # print_msg(str(e)) # print_msg(url + " 访问超时,稍后重试") # # 链接被终止 # # if str(e).find("'Connection aborted.', error(10054,") >= 0: # # 
return ErrorResponse(-3) except Exception, e: if exception_return and str(e).find(exception_return) >= 0: return ErrorResponse(HTTP_RETURN_CODE_EXCEPTION_CATCH) elif str(e).find("EOF occurred in violation of protocol") >=0: time.sleep(30) tool.print_msg(str(e)) tool.print_msg(url + " 访问超时,稍后重试") traceback.print_exc()
def http_request(url, method="GET", fields=None, binary_data=None, header_list=None, cookies_list=None, encode_multipart=False, is_auto_redirect=True, is_auto_retry=True, connection_timeout=HTTP_CONNECTION_TIMEOUT, read_timeout=HTTP_READ_TIMEOUT, is_random_ip=True, json_decode=False): """Http request via urllib3 :param url: the url which you want visit, start with "http://" or "https://" :param method: request method, value in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"] :param fields: dictionary type of request data, will urlencode() them to string. like post data, query string, etc not work with binary_data :param binary_data: binary type of request data, not work with post_data :param header_list: customize header dictionary :param cookies_list: customize cookies dictionary, will replaced header_list["Cookie"] :param encode_multipart: see "encode_multipart" in urllib3.request_encode_body :param is_auto_redirect: is auto redirect, when response.status in [301, 302, 303, 307, 308] :param is_auto_retry: is auto retry, when response.status in [500, 502, 503, 504] :param connection_timeout: customize connection timeout seconds :param read_timeout: customize read timeout seconds :param is_random_ip: is counterfeit a request header with random ip, will replaced header_list["X-Forwarded-For"] and header_list["X-Real-Ip"] :param json_decode: is return a decoded json data when response status = 200 if decode failure will replace response status with HTTP_RETURN_CODE_JSON_DECODE_ERROR """ if not (url.find("http://") == 0 or url.find("https://") == 0): return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID) method = method.upper() if method not in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]: return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID) if HTTP_CONNECTION_POOL is None: init_http_connection_pool() if header_list is None: header_list = {} # 设置User-Agent if "User-Agent" not in header_list: header_list["User-Agent"] = _random_user_agent() # 设置一个随机IP if is_random_ip: random_ip = _random_ip_address() header_list["X-Forwarded-For"] = random_ip header_list["X-Real-Ip"] = random_ip # 设置cookie if cookies_list: header_list["Cookie"] = build_header_cookie_string(cookies_list) # 超时设置 if connection_timeout == 0 and read_timeout == 0: timeout = None elif connection_timeout == 0: timeout = urllib3.Timeout(read=read_timeout) elif read_timeout == 0: timeout = urllib3.Timeout(connect=connection_timeout) else: timeout = urllib3.Timeout(connect=connection_timeout, read=read_timeout) retry_count = 0 while True: if process.PROCESS_STATUS == process.PROCESS_STATUS_STOP: tool.process_exit(0) thread_event.wait() try: if method in ['DELETE', 'GET', 'HEAD', 'OPTIONS']: response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, fields=fields) else: if binary_data is None: response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, fields=fields, encode_multipart=encode_multipart) else: response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, body=binary_data, encode_multipart=encode_multipart) if response.status == HTTP_RETURN_CODE_SUCCEED and json_decode: try: response.json_data = json.loads(response.data) except ValueError: is_error = True content_type = response.getheader("Content-Type") if content_type is not None: charset = tool.find_sub_string(content_type, "charset=", None) if charset: if charset == "gb2312": 
charset = "GBK" try: response.json_data = json.loads(response.data.decode(charset)) except: pass else: is_error = False if is_error: response.status = HTTP_RETURN_CODE_JSON_DECODE_ERROR elif response.status in [500, 502, 503, 504] and is_auto_retry: # 服务器临时性错误,重试 if retry_count < HTTP_REQUEST_RETRY_COUNT: retry_count += 1 time.sleep(30) continue else: return response return response except urllib3.exceptions.ConnectTimeoutError, e: # 域名无法解析 if str(e).find("[Errno 11004] getaddrinfo failed") >= 0: return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED) pass except MemoryError: return ErrorResponse(HTTP_RETURN_CODE_RESPONSE_TO_LARGE)