async def old_parse(self, input_text, *k, **kk):
    html2 = await get_url_service.get_url_async(input_text)
    html2 = PyQuery(html2)
    show_cnt = html2("div#first_videolist div.show_cnt > div")
    title = html2("div.top_tit > h2").text()
    total = len(show_cnt)
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": total,
        "type": "list",
        "caption": "乐视视频全集"
    }
    for i in show_cnt:
        col = PyQuery(i)
        a = col("dt > a")
        title = a.text()
        url = a.attr("href")
        subtitle = col("dd.d_cnt").text() or title
        info = {
            "name": title,
            "no": title,
            "subtitle": subtitle,
            "url": url
        }
        data["data"].append(info)
    return data
def urlHandle(self, input_text):
    html = PyQuery(common.getUrl(input_text))
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    print('urlHandle:"' + input_text + '"-->"' + url + '"')
    return url
async def parse(self, input_text, *k, **kk):
    html = await get_url_service.get_url_async(input_text)
    html = PyQuery(html)
    title = ""
    for meta in html('meta[itemprop="name"]'):
        meta = PyQuery(meta)
        title = meta.attr("content")
        break
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "QQ视频全集"
    }
    for a in html(".mod_episode a"):
        a = PyQuery(a)
        _title = ""
        for span in PyQuery(a("span")):
            span = PyQuery(span)
            if span.attr("itemprop") == "episodeNumber":
                _title = "第%s集" % span.text()
            elif span.has_class("mark_v"):
                _title += span.children("img").attr("alt")
        info = {
            "name": _title,
            "no": _title,
            "subtitle": _title,
            "url": a.attr("href")
        }
        data["data"].append(info)
    data["total"] = len(data["data"])
    return data
async def url_handle(self, input_text):
    html = await get_url_service.get_url_async(input_text)
    html = PyQuery(html)
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    return url
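# Hypothetical driver (not part of the original source): parse() and
# url_handle() above are ordinary asyncio coroutines, so a synchronous
# caller can run one to completion with asyncio.run. `parser` stands in
# for whatever object defines these methods.
import asyncio

def run_parse(parser, url):
    # Blocks until the coroutine finishes and returns its result dict.
    return asyncio.run(parser.parse(url))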
def Parse(self, input_text):
    html2 = getUrl(input_text)
    html2 = PyQuery(html2)
    w120 = html2("div.gut > div.listTab > div.listPic > div.list > dl.w120 > dt > a")
    total = len(w120)
    title = html2("div.gut > div.listTab > div.listPic > div.tab:first-child > p.p1 > i").text()
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": total,
        "type": "list",
        "caption": "乐视视频全集"
    }
    for i in w120:
        i = PyQuery(i)
        url = i.attr("href")
        title = i("a > img").attr("title")
        info = {
            "name": title,
            "no": title,
            "subtitle": title,
            "url": url
        }
        data["data"].append(info)
    return data
async def parse(self, input_text, *k, **kk):
    if not await self._check_support(input_text):
        return []
    html_text = await get_url_service.get_url_async(input_text)
    html = PyQuery(html_text)
    title = html('h1.main_title > a').text()
    if not title:
        for a in html('div.crumb-item > a'):
            a = PyQuery(a)
            if a.attr('href') in input_text:
                title = a.text()
    if not title:
        try:
            title = match1(html_text, '<title>([^<]+)').split('-')[0]
        except AttributeError:
            pass
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "271视频全集"
    }
    data["data"] = await self._get_list_info_api(html_text)
    return data
def Parse_le(self, input_text):
    html = PyQuery(get_url(input_text))
    items = html('dt.d_tit')
    title = "LETV"
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "collection"
    }
    for item in items:
        a = PyQuery(item).children('a')
        name = a.text()
        no = a.text()
        subtitle = a.text()
        url = a.attr('href')
        if url is None:
            continue
        if not re.match(r'^http://www\.le\.com/.+\.html', url):
            continue
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "caption": "首页地址列表"
        }
        data["data"].append(info)
        i = i + 1
    total = i
    data["total"] = total
    return data
def url_handle(self, input_text):
    html = get_url(input_text)
    html = PyQuery(html)
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    return url
def url_handle(self, input_text):
    html = PyQuery(get_url(input_text))
    a = html.children('a')
    a = PyQuery(a)
    url = a.attr("href")
    logging.debug('urlHandle:"' + input_text + '"-->"' + url + '"')
    return url
def Parse_v(self, input_text):
    print(input_text)
    html = PyQuery(common.getUrl(input_text))
    datainfo_navlist = PyQuery(html("#datainfo-navlist"))
    for a in datainfo_navlist.children('a'):
        a = PyQuery(a)
        url = a.attr("href")
        if re.search('www.iqiyi.com/(a_|lib/m)', url):
            return self.Parse(url)
def parse(self, input_text, *k, **kk):
    html = get_url(input_text)
    html = PyQuery(html)
    p_title = html("div.pl-title")
    title = p_title.attr("title")
    list_id = re.search(
        r'https?://list.youku.com/albumlist/show/id_(\d+)\.html',
        input_text).group(1)
    ep = 'https://list.youku.com/albumlist/items?id={}&page={}&size=20&ascending=1&callback=a'
    first_u = ep.format(list_id, 1)
    xhr_page = get_url(first_u)
    json_data = json.loads(xhr_page[14:-2])
    # print(json_data)
    # video_cnt = json_data['data']['total']
    xhr_html = json_data['html']
    # print(xhr_html)
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "collection",
        "caption": "优酷视频全集"
    }
    last_num = 1
    while True:
        new_url = ep.format(list_id, last_num)
        json_data = get_url(new_url)[14:-2]
        info = json.loads(json_data)
        if info.get("error", None) == 1 and info.get("message", None) == "success":
            new_html = info.get("html", None)
            if new_html:
                new_html = PyQuery(new_html)
                items = new_html("a[target='video'][data-from='2-1']")
                for item in items:
                    item = PyQuery(item)
                    url = "http:" + item.attr("href")
                    title = item.attr("title")
                    info = {
                        "name": title,
                        "no": title,
                        "subtitle": title,
                        "url": url
                    }
                    data["data"].append(info)
                last_num += 1
            else:
                break
        else:
            break
    data["total"] = len(data["data"])
    # print(data)
    return data
def parse(self, input_text, *k, **kk):
    html = get_url(input_text)
    m = re.findall('showid:"([0-9]+)",', html)  # showid:"307775"
    if not m:
        return []
    logging.info(m[0])
    html = PyQuery(html)
    p_title = html("li.p-row.p-title")
    p_title("li>a").remove()
    p_title("li>span").remove()
    title = p_title.text().replace(":", '')
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": 0,
        "type": "list",
        "caption": "优酷视频全集"
    }
    last_num = 0
    while True:
        new_url = ("https://list.youku.com/show/episode?id=" + m[0] +
                   "&stage=reload_" + str(last_num) + "&callback=a")
        json_data = get_url(new_url)[14:-2]
        info = json.loads(json_data)
        if info.get("error", None) == 0 and info.get("message", None) == "success":
            new_html = info.get("html", None)
            if new_html:
                new_html = PyQuery(new_html)
                items = new_html("a")
                for item in items:
                    item = PyQuery(item)
                    num = int(item.text())
                    url = "http:" + item.attr("href")
                    title = "第%02d集" % num
                    info = {
                        "name": title,
                        "no": title,
                        "subtitle": title,
                        "url": url
                    }
                    data["data"].append(info)
                    last_num = num
                last_num += 1
            else:
                # An empty payload means there are no more episodes to page
                # through, so stop instead of refetching the same URL.
                break
        else:
            break
    data["total"] = len(data["data"])
    return data
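# The two Youku parsers above peel the JSONP wrapper off API responses with a
# fixed "[14:-2]" slice, which silently breaks if the prefix length ever
# changes. A hedged alternative (hypothetical helper, not part of the original
# code) locates the JSON body by its enclosing parentheses instead:
import json
import re

def strip_jsonp(text):
    # Extract the JSON object from a JSONP response such as '/**/a({...});'.
    match = re.search(r'\(\s*({.*})\s*\)\s*;?\s*$', text, re.S)
    if not match:
        raise ValueError("no JSONP payload found")
    return json.loads(match.group(1))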
def parse(self, input_text, pool=pool_get_url, *k, **kk):
    logging.debug(input_text)
    html = PyQuery(get_url(input_text, pool=pool))
    datainfo_navlist = PyQuery(html(".progInfo_pic"))
    for a in datainfo_navlist.children('a'):
        a = PyQuery(a)
        url = a.attr("href")
        if str(url).startswith("//"):
            url = "http:" + str(url)
        logging.info("change %s to %s" % (input_text, url))
        result = get_main_parse()(input_text=url, types="list")
        if result:
            return result[0]
def Parse(self, input_text, pool=pool_getUrl):
    logging.debug(input_text)
    html = PyQuery(getUrl(input_text, pool=pool))
    datainfo_navlist = PyQuery(html("#datainfo-navlist"))
    for a in datainfo_navlist.children('a'):
        a = PyQuery(a)
        url = a.attr("href")
        logging.info("change %s to %s" % (input_text, url))
        try:
            from ..main import Parse as main_parse
        except Exception:
            from main import Parse as main_parse
        result = main_parse(input_text=url, types="list")
        if result:
            return result[0]
def set_proxy(self):
    r = requests.get("http://cn-proxy.com/")
    q = PyQuery(r.content)
    trs = q("tbody tr")
    if len(trs) == 0:
        self.ip = self.default_ip
        self.port = self.default_port
        return
    tr = trs[min(self.failed_times, len(trs) - 1)]
    trq = PyQuery(tr)
    tds = trq.children()
    ip = tds.eq(0).text()
    port = int(tds.eq(1).text())
    self.ip = ip
    self.port = port
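# Hypothetical usage sketch (the method name is an assumption, not from the
# original source): once set_proxy() has filled in self.ip and self.port,
# the pair can be handed to requests as a proxies mapping.
def fetch_via_proxy(self, url):
    proxy = 'http://%s:%s' % (self.ip, self.port)
    return requests.get(url, proxies={'http': proxy, 'https': proxy})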
def Parse(self, input_text):
    html = PyQuery(self.getUrl(input_text))
    items = html('a')
    title = html('title').text()
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "collection"
    }
    for item in items:
        a = PyQuery(item)
        name = a.attr('title')
        if name is None:
            name = a.text()
        no = name
        subtitle = name
        url = a.attr('href')
        if url is None:
            continue
        if name is None or name == "":
            continue
        if not re.match(
                r'(^(http|https)://.+\.(shtml|html))|(^(http|https)://.+/video/)',
                url):
            continue
        if re.search(
                r'(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com)',
                url):
            continue
        if re.search(r'(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))', no):
            continue
        unsure = False
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url,
            "unsure": unsure
        }
        data["data"].append(info)
        i = i + 1
    total = i
    data["total"] = total
    return data
def main():
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    html = opener.open('http://spys.ru/en/https-ssl-proxy/%s/' % page).read()
    d = PyQuery(lxml.html.fromstring(html))
    vars = None
    for script in d('script').items():
        if 'eval' in script.text():
            vars = eval_vars(script.text())
    if not vars:
        return
    cur = 0
    while True:
        ip_match = RE_IP.search(html, cur)
        if not ip_match:
            break
        port_match = RE_DOCUMENT_WRITE.search(html, ip_match.end())
        if not port_match:
            break
        cur = port_match.end()
        port_text = '(%s)' % port_match.group(1)
        port = parse_port(port_text, vars)
        print('%s:%s' % (ip_match.group(1), port))
    print('')
def extract_torrents(html):
    result = []
    pq = PyQuery(html)
    for row in pq('#torrents_table tbody tr.torrent').items():
        data = {
            'id': row.attr('id')[len('torrent-'):],
            'type': row('td:eq(0) img').attr('title'),
            'title': row('td:eq(1) span.title').text(),
            'publishers': [],
            'authors': [],
            'year': row('td:eq(1) span.torYear').text()[1:-1],
            'format': row('td:eq(1) span.torFormat').text()[1:-1],
            'retail': bool(row('td:eq(1) span.torRetail')),
            'tags': []
        }
        for dlink in row('td:eq(1) > a').items():
            href = dlink.attr('href')
            if '/creators/' in href:
                data['authors'].append({
                    'id': href[href.rfind('/') + 1:],
                    'name': dlink.text()
                })
            elif '/publishers/' in href:
                data['publishers'].append({
                    'id': href[href.rfind('/') + 1:],
                    'name': dlink.text()
                })
        for tag in row('td:eq(1) > span.taglist > a').items():
            href = tag.attr('href')
            data['tags'].append({
                'id': href[href.rfind('/') + 1:],
                'name': tag.text()
            })
        result.append(data)
    return result
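# Hypothetical call site (the session and URL are assumptions, not from the
# original code): fetch a listing page and feed its markup to
# extract_torrents() above.
def fetch_torrent_list(session, listing_url):
    response = session.get(listing_url)
    response.raise_for_status()
    return extract_torrents(response.text)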
def test_form_valid_li_present(self):
    ul = PyQuery(self.dom('ul')[0])
    li = ul.children()
    self.assertEqual(len(li), 1)
    attrib = dict(li[0].attrib.items())
    self.assertEqual(attrib.get('ng-show'), "messages_form['email'].$valid")
def setUp(self):
    # create an unbound form
    self.unbound_form = DummyForm()
    htmlsource = (self.unbound_form.as_p() + self.unbound_form.sub1.as_p()
                  + self.unbound_form.sub2.as_p())
    self.dom = PyQuery(htmlsource)
    self.elements = self.dom('input') + self.dom('select')
def parse_html_page(self):
    pq = PyQuery(self.html_page)
    main_table = pq('#mainBody > table.coltable')

    def find_row(text):
        for c in main_table.find('td:first-child').items():
            if c.text() == text:
                return next(c.nextAll().items())

    def find_row_text(text, default=''):
        row = find_row(text)
        if row:
            return row.text()
        return default

    def find_row_html(text, default=''):
        row = find_row(text)
        if row:
            return row.html()
        return default

    self.info_hash = find_row_text('Info hash')
    self.title = pq.find('#mainBody > h1').text()
    self.category, self.subcategory = find_row_text('Type').split(' - ', 1)
    self.language = find_row_text('Language')
    self.cover_url = find_row('Picture:').find('img').attr('src')
    self.small_description = find_row_html('Small Description')
    self.description = find_row_html('Description')
    self.torrent_url = find_row('Download').find('a#dlNormal').attr('href')
    size_string = find_row_text('Size')
    match = re.match(r'.* \((?P<size>\d+(,\d\d\d)*) bytes\)', size_string)
    self.torrent_size = int(match.group('size').replace(',', ''))
def Parse_lib_m(self, input_text):
    html = PyQuery(common.getUrl(input_text))
    """
    album_items = html('div.clearfix').children('li.album_item')
    title = html('h1.main_title').children('a').text()
    i = 0
    data = {
        "data": [],
        "more": False,
        "title": title,
        "total": i,
        "type": "list"
    }
    for album_item in album_items:
        no = '第' + str(i + 1) + '集'
        name = title + '(' + no + ')'
        url = PyQuery(album_item).children('a').attr('href')
        subtitle = ''
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data["data"].append(info)
        i = i + 1
    total = i
    data["total"] = total
    """
    data = {
        "data": [],
        "more": False,
        "title": '',
        "total": 0,
        "type": "list",
        "caption": "271视频全集"
    }
    data_doc_id = html('span.play_source').attr('data-doc-id')
    ejson_url = ('http://rq.video.iqiyi.com/aries/e.json?site=iqiyi&docId='
                 + data_doc_id + '&count=100000')
    ejson = json.loads(common.getUrl(ejson_url))
    ejson_datas = ejson["data"]["objs"]
    data["total"] = ejson_datas["info"]["total_video_number"]
    data["title"] = ejson_datas["info"]["album_title"]
    album_items = ejson_datas["episode"]["data"]
    for album_item in album_items:
        no = '第' + str(album_item["play_order"]) + '集'
        name = album_item["title"]
        url = album_item["play_url"]
        subtitle = album_item["desciption"]
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data["data"].append(info)
    # print(ejson)
    return data
def setUp(self):
    self.subscription_form = ClientValidatedForm()
    self.dom = PyQuery(str(self.subscription_form))
    self.form_name = b64encode(
        six.b(self.subscription_form.__class__.__name__)
    ).rstrip(six.b('=')).decode("utf-8")
    self.maxDiff = None
def parse(self, input_text, *k, **kk):
    html = get_url(input_text)
    html = PyQuery(html)
    html2_url = html("a.more").attr("href")
    result = get_main_parse()(input_text=html2_url, types="list")
    if result:
        return result
def Parse(self, input_text):
    html = getUrl(input_text)
    html = PyQuery(html)
    html2_url = html("a.more").attr("href")
    try:
        from ..main import Parse as main_parse
    except Exception:
        from main import Parse as main_parse
    result = main_parse(input_text=html2_url, types="list")
    if result:
        return result[0]
def get_list_info_html(html):
    print("get_list_info_html")
    data = []
    album_items = html('ul.site-piclist').children('li')
    for album_item in album_items:
        album_item = PyQuery(album_item)
        site_piclist_info = PyQuery(
            album_item.children('div.site-piclist_info'))
        site_piclist_info_title = PyQuery(
            site_piclist_info.children('p.site-piclist_info_title'))
        site_piclist_info_title_a = PyQuery(
            site_piclist_info_title.children('a'))
        site_piclist_info_title_fs12 = PyQuery(
            site_piclist_info.children('p.fs12'))
        site_piclist_info_title_fs12_a = PyQuery(
            site_piclist_info_title_fs12.children('a'))
        no = site_piclist_info_title_a.text()
        # skip trailer entries ("预告" means "preview"):
        # if re.search("预告", no):
        #     continue
        name = site_piclist_info_title_fs12_a.text()
        url = site_piclist_info_title_fs12_a.attr('href')
        if url is None:
            continue
        subtitle = site_piclist_info_title_fs12_a.text()
        info = {
            "name": name,
            "no": no,
            "subtitle": subtitle,
            "url": url
        }
        data.append(info)
    return data
def process_decline_view(self, htmlsource):
    dom = PyQuery(htmlsource)
    form = dom('#form3')
    self.assertTrue(form, 'No <form id="form3"> found in html output')
    elements = form.find('input')
    values = dict((elem.name, elem.value) for elem in elements)
    values.update({'cancel': 'Cancel'})
    url = form.attr('action')
    response = requests.post(url, data=values, verify=True)
    self.assertEqual(response.status_code, 200,
                     'PSP did not accept payment cancellation')
    self.save_htmlsource('decline_form', response.content)
    # in response check for string 'Cancelled'
    dom = PyQuery(response.content)
    tables = dom('table.ncoltable1')
    self.assertEqual(len(tables), 3)
    self.assertEqual(tables.eq(1).find('h3').text(), 'Cancelled')
    form = tables.eq(2).find('form')
    urlobj = urlparse.urlparse(form.attr('action'))
    data = dict(urlparse.parse_qsl(urlobj.query))
    httpresp = self.client.get(urlobj.path, data, follow=True)
    self.assertEqual(len(httpresp.redirect_chain), 2,
                     'No redirection after declining payment')
    urlobj = urlparse.urlparse(httpresp.redirect_chain[1][0])
    self.assertEqual(httpresp.status_code, 200)
    self.assertEqual(resolve(urlobj.path).url_name, 'viveum')
def index_page(self, response):
    """Collect all vulnerability URLs and pass each matching URL on to detail_page."""
    for each in response.doc('a[href^="http"]').items():
        if re.match(
                r"http://www.cnnvd.org.cn/vulnerability/show/cv_cnnvdid/CNNVD-\d+-\d+",
                each.attr.href):
            print(each.attr.href)
            self.crawl(each.attr.href,
                       priority=9,
                       retries=10,
                       callback=self.detail_page)
    # follow the pagination link; pyquery injects `this` into the filter
    # callback, and "下一页" means "next page"
    self.crawl(response.doc(".dispage >a").filter(
        lambda i: PyQuery(this).text() == u"下一页").attr.href,
        retries=10,
        callback=self.index_page)
def get_auth_key(self):
    if self.auth_key:
        return self.auth_key
    for i in xrange(3):
        try:
            response = self.session.get('https://bibliotik.me/upload/ebooks')
            response.raise_for_status()
            break
        except Exception:
            pass
    response.raise_for_status()
    pq = PyQuery(response.content)
    self.auth_key = pq('input[name="authkey"]').val()
    if not self.auth_key:
        raise Exception('Could not get the authkey')
    return self.auth_key
async def parse(self, input_text, *k, **kk):
    html2 = await get_url_service.get_url_async(input_text)
    html2 = PyQuery(html2)
    title = html2("div.top_tit > h2").text()
    try:
        pid = match1(input_text, r'http://www.le.com/tv/(\w+).html')
        api_url = ("http://d.api.m.le.com/detail/episode"
                   "?pid={}&platform=pc&page=1&pagesize=1000&type=1").format(pid)
        api_data = await get_url_service.get_url_async(api_url)
        safe_print(api_data)
        api_json = json.loads(api_data)
        assert api_json["code"] == "200"
        api_json_data = api_json["data"]
        total = api_json_data["total"]
        data = {
            "data": [],
            "more": False,
            "title": title,
            "total": total,
            "type": "list",
            "caption": "乐视视频全集"
        }
        for item in api_json_data["list"]:
            if item.get("isyugao", 0) != 0:
                continue
            item_title = item["title"]
            info = {
                "name": item_title,
                "no": item_title,
                "subtitle": item["sub_title"],
                "url": "http://www.le.com/ptv/vplay/{}.html".format(item["vid"]),
                "icon": item["pic"]
            }
            data["data"].append(info)
        return data
    except AsyncCancelled:
        raise
    except:
        logging.exception("parse error, falling back to old_parse")
        return await self.old_parse(input_text, *k, **kk)