def get_one_page_blog(account_id, page_count):
    # Page URL pattern: http://blog.sina.com.cn/s/articlelist_<account_id>_0_<page_count>.html
    blog_pagination_url = "http://blog.sina.com.cn/s/articlelist_%s_0_%s.html" % (account_id, page_count)
    blog_pagination_response = net.http_request(blog_pagination_url, method="GET")
    result = {
        "blog_info_list": [],  # list of parsed blog info
        "is_over": False,  # whether this is the last page
    }
    if blog_pagination_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        if page_count == 1 and blog_pagination_response.data.find("抱歉,您要访问的页面不存在或被删除!") >= 0:
            raise crawler.CrawlerException("account does not exist")
        article_list_selector = PQ(blog_pagination_response.data.decode("UTF-8")).find(".articleList .articleCell")
        if article_list_selector.size() == 0:
            raise crawler.CrawlerException("failed to extract blog list from page\n%s" % blog_pagination_response.data)
        for article_index in range(article_list_selector.size()):
            result_blog_info = {
                "blog_url": None,  # blog URL
                "blog_time": None,  # blog publish time
                "blog_title": "",  # blog title
            }
            article_selector = article_list_selector.eq(article_index)
            # get blog URL
            blog_url = article_selector.find("span.atc_title a").attr("href")
            if not blog_url:
                raise crawler.CrawlerException("failed to parse blog URL from blog list\n%s" % article_selector.html().encode("UTF-8"))
            result_blog_info["blog_url"] = str(blog_url)
            # get blog title
            blog_title = article_selector.find("span.atc_title a").text().encode("UTF-8")
            if not blog_title:
                raise crawler.CrawlerException("failed to parse blog title from blog list\n%s" % article_selector.html().encode("UTF-8"))
            result_blog_info["blog_title"] = str(blog_title)
            # get blog publish time
            blog_time = article_selector.find("span.atc_tm").text()
            if not blog_time:
                raise crawler.CrawlerException("failed to parse blog time from blog list\n%s" % article_selector.html().encode("UTF-8"))
            try:
                result_blog_info["blog_time"] = int(time.mktime(time.strptime(blog_time, "%Y-%m-%d %H:%M")))
            except ValueError:
                raise crawler.CrawlerException("invalid blog time format\n%s" % blog_time)
            result["blog_info_list"].append(result_blog_info)
        # parse pagination info; the total page count sits between "共" and "页"
        pagination_html = tool.find_sub_string(blog_pagination_response.data, '<div class="SG_page">', '</div>')
        if not pagination_html:
            result["is_over"] = True
        else:
            max_page_count = tool.find_sub_string(pagination_html, "共", "页")
            if not crawler.is_integer(max_page_count):
                raise crawler.CrawlerException("failed to extract total page count from pagination info\n%s" % pagination_html)
            result["is_over"] = page_count >= int(max_page_count)
    else:
        raise crawler.CrawlerException(crawler.request_failre(blog_pagination_response.status))
    return result
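# A minimal driver sketch for the pagination contract above: keep fetching pages until
# the parser reports is_over. The wrapper name crawl_account_blogs and the page-1
# starting point are illustrative assumptions.
def crawl_account_blogs(account_id):
    blog_info_list = []
    page_count = 1
    while True:
        pagination_result = get_one_page_blog(account_id, page_count)
        blog_info_list.extend(pagination_result["blog_info_list"])
        if pagination_result["is_over"]:
            break
        page_count += 1
    return blog_info_list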
def get_one_page_album(account_id, page_count):
    # http://bcy.net/u/50220/post/cos?&p=1
    album_pagination_url = "http://bcy.net/u/%s/post/cos" % account_id
    query_data = {"p": page_count}
    album_pagination_response = net.http_request(album_pagination_url, method="GET", fields=query_data)
    result = {
        "album_info_list": [],  # list of parsed album info
        "coser_id": None,  # coser id
        "is_over": False,  # whether this is the last page of albums
    }
    if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(album_pagination_response.status))
    if page_count == 1 and album_pagination_response.data.find("<h2>用户不存在</h2>") >= 0:
        raise crawler.CrawlerException("account does not exist")
    # get coser id
    coser_id_find = re.findall(r'<a href="/coser/detail/([\d]+)/\$\{post.rp_id\}', album_pagination_response.data)
    if len(coser_id_find) != 1:
        raise crawler.CrawlerException("failed to extract coser id from page\n%s" % album_pagination_response.data)
    if not crawler.is_integer(coser_id_find[0]):
        raise crawler.CrawlerException("extracted coser id is not an integer\n%s" % album_pagination_response.data)
    result["coser_id"] = coser_id_find[0]
    # get album info
    album_list_selector = PQ(album_pagination_response.data.decode("UTF-8")).find("ul.l-grid__inner li.l-grid__item")
    for album_index in range(album_list_selector.size()):
        album_selector = album_list_selector.eq(album_index)
        result_album_info = {
            "album_id": None,  # album id
            "album_title": None,  # album title
        }
        # get album id
        album_url = album_selector.find(".postWorkCard__img a.postWorkCard__link").attr("href")
        if not album_url:
            raise crawler.CrawlerException("failed to extract album URL from album info\n%s" % album_selector.html().encode("UTF-8"))
        album_id = str(album_url).split("/")[-1]
        if not crawler.is_integer(album_id):
            raise crawler.CrawlerException("failed to extract album id from album URL %s\n%s" % (album_url, album_selector.html().encode("UTF-8")))
        result_album_info["album_id"] = album_id
        # get album title
        album_title = album_selector.find(".postWorkCard__img img").attr("alt")
        result_album_info["album_title"] = str(album_title.encode("UTF-8"))
        result["album_info_list"].append(result_album_info)
    # check whether this is the last page
    last_pagination_selector = PQ(album_pagination_response.data).find("#js-showPagination ul.pager li:last a")
    if last_pagination_selector.size() == 1:
        max_page_count = int(last_pagination_selector.attr("href").strip().split("&p=")[-1])
        result["is_over"] = page_count >= max_page_count
    else:
        result["is_over"] = True
    return result
def get_market_game_trade_card_price(game_id, login_cookie):
    cookies_list = {"steamLogin": login_cookie}
    market_search_url = "http://steamcommunity.com/market/search/render/"
    market_search_url += "?query=&count=20&appid=753&category_753_Game[0]=tag_app_%s&category_753_cardborder[0]=tag_cardborder_0" % game_id
    market_search_response = net.http_request(market_search_url, method="GET", cookies_list=cookies_list, json_decode=True)
    if market_search_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(market_search_response.status))
    market_item_list = {}
    if not crawler.check_sub_key(("success", "results_html"), market_search_response.json_data):
        raise crawler.CrawlerException("response missing 'success' or 'results_html' field\n%s" % market_search_response.json_data)
    if market_search_response.json_data["success"] is not True:
        raise crawler.CrawlerException("response 'success' field has an unexpected value\n%s" % market_search_response.json_data)
    card_selector = PQ(market_search_response.json_data["results_html"]).find(".market_listing_row_link")
    for index in range(card_selector.size()):
        card_name = card_selector.eq(index).find(".market_listing_item_name").text()
        # lowest listed price, with the currency prefix stripped
        card_min_price = card_selector.eq(index).find("span.normal_price span.normal_price").text().encode("UTF-8").replace("¥ ", "")
        market_item_list[card_name] = card_min_price
    # Example return value:
    # {'Pamu': '1.77', 'Fumi (Trading Card)': '2.14', 'Mio (Trading Card)': '1.33', 'Bonnibel (Trading Card)': '1.49',
    #  'Groupshot': '1.87', 'Q-Piddy': '1.35', 'Elle (Trading Card)': '1.19', 'Quill': '1.50', 'Iro (Trading Card)': '1.42',
    #  'Bearverly (Trading Card)': '1.27', 'Cassie (Trading Card)': '1.35'}
    return market_item_list
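# An illustrative consumer of get_market_game_trade_card_price: find the cheapest
# listed card for a game. Prices come back as strings such as '1.33', so they are
# converted to float before comparison; the helper name get_cheapest_card is assumed.
def get_cheapest_card(game_id, login_cookie):
    card_price_list = get_market_game_trade_card_price(game_id, login_cookie)
    if not card_price_list:
        return None
    cheapest_card_name = min(card_price_list, key=lambda card_name: float(card_price_list[card_name]))
    return cheapest_card_name, card_price_list[cheapest_card_name]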
def get_self_account_badges(account_id, login_cookie):
    # first page of badges
    badges_index_url = "http://steamcommunity.com/profiles/%s/badges/" % account_id
    cookies_list = {"steamLogin": login_cookie}
    badges_index_response = net.http_request(badges_index_url, method="GET", cookies_list=cookies_list)
    if badges_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(badges_index_response.status))
    badges_detail_url_list = []
    # badge rows
    badges_selector = PQ(badges_index_response.data).find(".maincontent .badges_sheet .badge_row")
    for index in range(badges_selector.size()):
        badge_html = badges_selector.eq(index).html().encode("UTF-8")
        # only keep badges whose cards have all dropped already ("无剩余卡牌掉落" = "no card drops remaining")
        if badge_html.find("无剩余卡牌掉落") >= 0:
            # badge detail page URL
            badge_detail_url = tool.find_sub_string(badge_html, '<a class="badge_row_overlay" href="', '"/>')
            if not badge_detail_url:
                raise crawler.CrawlerException("failed to extract badge detail page URL from badge info\n%s" % badge_html)
            badges_detail_url_list.append(badge_detail_url)
    # Example return value:
    # ['http://steamcommunity.com/profiles/76561198172925593/gamecards/459820/',
    #  'http://steamcommunity.com/profiles/76561198172925593/gamecards/357200/',
    #  'http://steamcommunity.com/profiles/76561198172925593/gamecards/502740/',
    #  'http://steamcommunity.com/profiles/76561198172925593/gamecards/359600/',
    #  'http://steamcommunity.com/profiles/76561198172925593/gamecards/354380/',
    #  'http://steamcommunity.com/profiles/76561198172925593/gamecards/359670/',
    #  'http://steamcommunity.com/profiles/76561198172925593/gamecards/525300/',
    #  'http://steamcommunity.com/profiles/76561198172925593/gamecards/337980/',
    #  'http://steamcommunity.com/profiles/76561198172925593/gamecards/591420/']
    return badges_detail_url_list
def get_one_page_photo(page_count):
    photo_pagination_url = "http://jigadori.fkoji.com/"
    query_data = {"p": page_count}
    photo_pagination_response = net.http_request(photo_pagination_url, method="GET", fields=query_data)
    result = {
        "image_info_list": [],  # list of parsed image info
    }
    if photo_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(photo_pagination_response.status))
    photo_list_selector = PQ(photo_pagination_response.data.decode("UTF-8")).find("#wrapper .row .photo")
    for photo_index in range(photo_list_selector.size()):
        photo_selector = photo_list_selector.eq(photo_index)
        photo_selector_html = photo_selector.html().encode("UTF-8")
        result_photo_info = {
            "account_name": "",  # twitter account
            "image_url_list": [],  # image URLs
            "tweet_id": None,  # tweet id
            "tweet_time": None,  # tweet publish time
        }
        # get tweet id
        tweet_url = photo_selector.find(".photo-link-outer a").eq(0).attr("href")
        if not tweet_url:
            raise crawler.CrawlerException("failed to extract tweet URL from image info\n%s" % photo_selector_html)
        tweet_id = tool.find_sub_string(tweet_url.strip(), "status/")
        if not crawler.is_integer(tweet_id):
            raise crawler.CrawlerException("failed to extract tweet id from tweet URL\n%s" % tweet_url)
        result_photo_info["tweet_id"] = int(tweet_id)
        # get twitter account
        account_name = photo_selector.find(".user-info .user-name .screen-name").text()
        if not account_name:
            raise crawler.CrawlerException("failed to extract twitter account from image info\n%s" % photo_selector_html)
        result_photo_info["account_name"] = str(account_name).strip().replace("@", "")
        # get tweet publish time
        tweet_time = photo_selector.find(".tweet-text .tweet-created-at").text().strip()
        if not tweet_time:
            raise crawler.CrawlerException("failed to extract tweet publish time from image info\n%s" % photo_selector_html)
        try:
            result_photo_info["tweet_time"] = int(time.mktime(time.strptime(str(tweet_time).strip(), "%Y-%m-%d %H:%M:%S")))
        except ValueError:
            raise crawler.CrawlerException("invalid tweet publish time format\n%s" % tweet_time)
        # get image URLs
        image_list_selector = photo_selector.find(".photo-link-outer a img")
        for image_index in range(image_list_selector.size()):
            image_url = image_list_selector.eq(image_index).attr("src")
            if not image_url:
                raise crawler.CrawlerException("failed to extract image URL from image list\n%s" % image_list_selector.eq(image_index).html())
            result_photo_info["image_url_list"].append(str(image_url).strip())
        result["image_info_list"].append(result_photo_info)
    return result
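# A sketch of how the parsed image URLs might be saved to disk, reusing the same
# net.http_request helper and its .status/.data attributes seen throughout this code.
# The save_photo_page name, the .jpg extension, and naming files by tweet id plus
# index are illustrative assumptions.
import os

def save_photo_page(page_count, save_dir):
    photo_result = get_one_page_photo(page_count)
    for image_info in photo_result["image_info_list"]:
        for image_index, image_url in enumerate(image_info["image_url_list"]):
            image_response = net.http_request(image_url, method="GET")
            if image_response.status != net.HTTP_RETURN_CODE_SUCCEED:
                continue
            file_path = os.path.join(save_dir, "%s_%s.jpg" % (image_info["tweet_id"], image_index))
            with open(file_path, "wb") as file_handle:
                file_handle.write(image_response.data)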
def get_account_index_page(account_name):
    account_index_url = "http://%s.pp.163.com/" % account_name
    account_index_response = net.http_request(account_index_url, method="GET")
    result = {
        "album_url_list": [],  # all album URLs
    }
    if account_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(account_index_response.status))
    # re-encode the page from GBK to UTF-8
    account_index_html = account_index_response.data.decode("GBK").encode("UTF-8")
    if account_index_html.find("<title>该页面不存在</title>") >= 0:
        raise crawler.CrawlerException("account does not exist")
    # get all album URLs
    album_result_selector = PQ(account_index_html).find("#p_contents li")
    if album_result_selector.size() == 0:
        raise crawler.CrawlerException("failed to match album list on page\n%s" % account_index_html)
    for album_index in range(album_result_selector.size()):
        result["album_url_list"].append(str(album_result_selector.eq(album_index).find("a.detail").attr("href")))
    return result
def get_account_talks(account_id, account_name, talk_list):
    account_index = "https://7gogo.jp/users/%s" % account_id
    account_index_response = net.http_request(account_index, method="GET")
    if account_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(account_index_response.status))
    talk_list_selector = PQ(account_index_response.data.decode("UTF-8")).find(".UserTalkWrapper .UserTalk")
    for talk_index in range(talk_list_selector.size()):
        talk_selector = talk_list_selector.eq(talk_index)
        # get talk URL
        talk_url_path = talk_selector.attr("href")
        if not talk_url_path:
            raise crawler.CrawlerException("failed to extract talk URL from talk info\n%s" % talk_selector.html().encode("UTF-8"))
        talk_id = str(talk_url_path.replace("/", ""))
        if not talk_id:
            raise crawler.CrawlerException("failed to extract talk id from talk URL\n%s" % talk_url_path)
        # get talk name
        talk_name = talk_selector.find(".UserTalk__talkname").text()
        if not talk_name:
            raise crawler.CrawlerException("failed to extract talk name from talk info\n%s" % talk_selector.html().encode("UTF-8"))
        talk_name = crawler.filter_emoji(str(talk_name.encode("UTF-8")).strip())
        # get talk description
        talk_description = talk_selector.find(".UserTalk__description").text()
        if talk_description:
            talk_description = crawler.filter_emoji(str(talk_description.encode("UTF-8")).strip())
        else:
            talk_description = ""
        if talk_id in talk_list:
            talk_list[talk_id]["account_list"].append(account_name)
        else:
            talk_list[talk_id] = {
                "account_list": [account_name],
                "talk_name": talk_name,
                "talk_description": talk_description,
            }
        output.print_msg(account_id + ": " + talk_name + ", " + talk_description)
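# get_account_talks mutates the talk_list dict passed in, so a caller can share one
# dict across accounts to build a talk id -> members mapping. The collect_talks
# wrapper and the {account_id: account_name} input shape are illustrative assumptions.
def collect_talks(account_list):
    talk_list = {}
    for account_id, account_name in account_list.items():
        get_account_talks(account_id, account_name, talk_list)
    return talk_list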
def get_one_page_account(page_count):
    account_pagination_url = "http://jigadori.fkoji.com/users"
    query_data = {"p": page_count}
    account_pagination_response = net.http_request(account_pagination_url, method="GET", fields=query_data)
    pagination_account_list = {}
    if account_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(account_pagination_response.status))
    account_list_selector = PQ(account_pagination_response.data.decode("UTF-8")).find(".users-list li")
    for account_index in range(account_list_selector.size()):
        account_selector = account_list_selector.eq(account_index)
        # get member name
        account_name = account_selector.find(".profile-name").eq(0).text()
        if not account_name:
            account_name = ""
            # raise crawler.CrawlerException("failed to extract member name from member info\n%s" % account_selector.html().encode("UTF-8"))
        else:
            account_name = account_name.strip().encode("UTF-8")
        # get twitter account
        account_id = account_selector.find(".screen-name a").text()
        if not account_id:
            raise crawler.CrawlerException("failed to extract twitter account from member info\n%s" % account_selector.html().encode("UTF-8"))
        account_id = account_id.strip().replace("@", "")
        pagination_account_list[account_id] = account_name
    return pagination_account_list
def get_one_page_audio(account_id, page_count):
    # http://www.ximalaya.com/1014267/index_tracks?page=2
    audio_pagination_url = "http://www.ximalaya.com/%s/index_tracks" % account_id
    query_data = {"page": page_count}
    audio_pagination_response = net.http_request(audio_pagination_url, method="GET", fields=query_data, json_decode=True)
    result = {
        "audio_info_list": [],  # list of audio info parsed from the page
        "is_over": False,  # whether this is the last page
    }
    if audio_pagination_response.status == 404:
        raise crawler.CrawlerException("account does not exist")
    elif audio_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(audio_pagination_response.status))
    if not crawler.check_sub_key(("res", "html"), audio_pagination_response.json_data):
        raise crawler.CrawlerException("response missing 'res' or 'html' field\n%s" % audio_pagination_response.json_data)
    if audio_pagination_response.json_data["res"] is not True:
        raise crawler.CrawlerException("response 'res' field has an unexpected value\n%s" % audio_pagination_response.json_data)
    # get audio info
    audio_list_selector = PQ(audio_pagination_response.json_data["html"]).find("ul.body_list li.item")
    for audio_index in range(audio_list_selector.size()):
        audio_info = {
            "audio_id": None,  # audio id parsed from the page
            "audio_title": "",  # audio title parsed from the page
        }
        audio_selector = audio_list_selector.eq(audio_index)
        # get audio id
        audio_id = audio_selector.find(".content_wrap").attr("sound_id")
        if not crawler.is_integer(audio_id):
            raise crawler.CrawlerException("failed to match audio id from audio info\n%s" % audio_list_selector.html().encode("UTF-8"))
        audio_info["audio_id"] = str(audio_id)
        # get audio title
        audio_title = audio_selector.find(".sound_title").attr("title")
        if not audio_title:
            raise crawler.CrawlerException("failed to match audio title from audio info\n%s" % audio_list_selector.html().encode("UTF-8"))
        audio_info["audio_title"] = str(audio_title.encode("UTF-8").strip())
        result["audio_info_list"].append(audio_info)
    # check whether this is the last page
    max_page_count = 1
    pagination_list_selector = PQ(audio_pagination_response.json_data["html"]).find(".pagingBar_wrapper a.pagingBar_page")
    for pagination_index in range(pagination_list_selector.size()):
        pagination_selector = pagination_list_selector.eq(pagination_index)
        data_page = pagination_selector.attr("data-page")
        if data_page is None:
            continue
        if not crawler.is_integer(data_page):
            raise crawler.CrawlerException("failed to match pagination info\n%s" % audio_list_selector.html().encode("UTF-8"))
        max_page_count = max(max_page_count, int(data_page))
    result["is_over"] = page_count >= max_page_count
    return result
def check(expected):
    # nested test helper: `self`, `eq_` are expected from the enclosing test scope
    response = self.client.get('/', follow=True)
    account = PyQuery(response.content)('ul.account')
    tools = PyQuery(response.content)('ul.tools')
    eq_(account.size(), expected)
    eq_(tools.size(), expected)
def get_one_page_photo(page_count):
    photo_pagination_url = "http://kelagirls.com/bizhi!findForIndexMore.action"
    query_data = {"page": page_count}
    photo_pagination_response = net.http_request(photo_pagination_url, method="GET", fields=query_data)
    result = {
        "image_info_list": [],  # list of parsed image info
        "is_over": False,  # whether this is the last page of wallpapers
    }
    if photo_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(photo_pagination_response.status))
    photo_list_selector = PQ(photo_pagination_response.data.decode("UTF-8")).find(".bizhinmore .bizhi")
    if photo_list_selector.size() == 0:
        raise crawler.CrawlerException("failed to match image list on page\n%s" % photo_pagination_response.data)
    for photo_index in range(photo_list_selector.size()):
        photo_selector = photo_list_selector.eq(photo_index)
        result_image_info = {
            "image_id": None,  # image id
            "image_url": None,  # image URL
            "model_name": "",  # model name
        }
        # get image id
        image_id = photo_selector.find(".bizhibigwrap").attr("id")
        if not image_id:
            raise crawler.CrawlerException("failed to match image id from image list\n%s" % photo_selector.html().encode("UTF-8"))
        if not (image_id[0:3] == "big" and crawler.is_integer(image_id[3:])):
            raise crawler.CrawlerException("matched image id has an unexpected format\n%s" % photo_selector.html().encode("UTF-8"))
        result_image_info["image_id"] = str(image_id[3:])
        # get image URL
        image_path = photo_selector.find(".bizhibig img").eq(1).attr("src")
        if not image_path:
            raise crawler.CrawlerException("failed to match image URL from image list\n%s" % photo_selector.html().encode("UTF-8"))
        result_image_info["image_url"] = "http://kelagirls.com/" + str(image_path.encode("UTF-8"))
        # get model name
        model_name = photo_selector.find(".bzwdown span").eq(0).text().encode("UTF-8")
        if not model_name:
            raise crawler.CrawlerException("failed to match model name from image list\n%s" % photo_selector.html().encode("UTF-8"))
        result_image_info["model_name"] = str(model_name)
        result["image_info_list"].append(result_image_info)
    # check whether this is the last page
    pagination_selector = PQ(photo_pagination_response.data.decode("UTF-8")).find(".pageBottom div")
    max_page_count = page_count
    for pagination_index in range(pagination_selector.size()):
        if crawler.is_integer(pagination_selector.eq(pagination_index).text()):
            max_page_count = max(max_page_count, int(pagination_selector.eq(pagination_index).text()))
    result["is_over"] = page_count >= max_page_count
    return result