def test_fn_with_kwargs(self):
    """fn() with keyword arguments."""
    # Register a plugin method selecting the p-th <li> (default p=1).
    pq.fn.test = lambda p=1: pq(this).eq(p)
    doc = pq(self.html)
    # Positional, defaulted, and keyword invocations.
    self.assertEqual(doc("li").test(0).text(), "Coffee")
    self.assertEqual(doc("li").test().text(), "Tea")
    self.assertEqual(doc("li").test(p=2).text(), "Milk")
def test_map(self):
    """map() passes (index, element) and drops on an empty selection."""
    def ids_minus_one(i, elem):
        # div ids end in a digit; shift to a zero-based index.
        return int(self.klass(elem).attr('id')[-1]) - 1

    assert self.klass('div', self.html).map(ids_minus_one) == [0, 1]

    doc = pq('<p>Hello <b>warming</b> world</p>')
    self.assertEqual(doc('strong').map(lambda i, el: pq(this).text()), [])  # NOQA
def test_fn_with_kwargs(self):
    """fn() with keyword arguments."""
    # Plugin method: select the p-th <li>; p defaults to 1.
    pq.fn.test = lambda p=1: pq(this).eq(p)  # NOQA
    doc = pq(self.html)
    for call_result, expected in (
            (doc('li').test(0), 'Coffee'),
            (doc('li').test(), 'Tea'),
            (doc('li').test(p=2), 'Milk')):
        self.assertEqual(call_result.text(), expected)
def test_fn(self):
    """Example from `PyQuery.Fn` docs."""
    # Register the plugin directly instead of via a named lambda.
    pq.fn.listOuterHtml = lambda: this.map(
        lambda i, el: pq(this).outerHtml())  # NOQA
    doc = pq(self.html)
    self.assertEqual(
        doc('li').listOuterHtml(),
        ['<li>Coffee</li>', '<li>Tea</li>', '<li>Milk</li>'])
def test_map(self):
    """map() yields per-element results; empty selection maps to []."""
    def ids_minus_one(i, elem):
        # Last character of the id attribute is a 1-based digit.
        return int(self.klass(elem).attr("id")[-1]) - 1

    divs = self.klass("div", self.html)
    assert divs.map(ids_minus_one) == [0, 1]
    doc = pq("<p>Hello <b>warming</b> world</p>")
    self.assertEqual(doc("strong").map(lambda i, el: pq(this).text()), [])  # NOQA
def test_replaceWith_with_function(self):
    # Expected markup after each <a> is replaced by its own inner HTML.
    # NOTE(review): this literal is kept byte-for-byte as found; the
    # original file likely had newlines inside the triple quotes that
    # were collapsed to spaces — confirm against the upstream suite.
    expected = '''<div class="portlet"> TestimageMy link text imageMy link text 2 Behind you, a three-headed HTML&dash;Entity! </div>'''
    d = pq(self.html)
    # replace_with with a callable receives (index, element).
    d('a').replace_with(lambda i, e: pq(e).html())
    val = d.__html__()
    assert val == expected, (repr(val), repr(expected))
def test_remove(self):
    """remove() drops <img> nodes but keeps the anchors' text."""
    doc = pq(self.html)
    doc('img').remove()
    first = doc('a:first').html()
    assert first == 'Test My link text', repr(first)
    last = doc('a:last').html()
    assert last == ' My link text 2', repr(last)
def test_filter(self):
    """filter() accepts selectors and index-based callables."""
    assert len(self.klass('div', self.html).filter('.node3')) == 1
    assert len(self.klass('div', self.html).filter('#node2')) == 1
    # Callable form receives the element's position.
    assert len(self.klass('div', self.html).filter(lambda i: i == 0)) == 1
    doc = pq('<p>Hello <b>warming</b> world</p>')
    self.assertEqual(doc('strong').filter(lambda el: True), [])
def test_soup_parser(self):
    """The BeautifulSoup parser repairs badly malformed markup."""
    doc = pq('<meta><head><title>Hello</head><body onload=crash()>Hi all<p>',
             parser='soup')
    expected = ('<html><meta/><head><title>Hello</title></head>'
                '<body onload="crash()">Hi all<p/></body></html>')
    self.assertEqual(str(doc), expected)
def test_selector(self):
    """Namespaced selection via the ``namespaces`` keyword argument."""
    doc = pq(b(self.xml), parser='xml')
    namespaces = {'bar': 'http://example.com/bar'}
    expected = 'What'
    val = doc('bar|blah', namespaces=namespaces).text()
    self.assertEqual(repr(val), repr(expected))
def test_get(self):
    """GET with query parameters against a live site."""
    doc = pq('http://www.theonion.com/search/',
             {'q': 'inconsistency'}, method='get')
    self.assertEqual(doc('input[name=q]:last').val(), 'inconsistency')
    self.assertEqual(doc('.news-in-brief h3').text(),
                     'Slight Inconsistency Found In Bible')
def test_remove(self):
    """Removing images keeps each link's surrounding text intact."""
    doc = pq(self.html)
    doc("img").remove()
    for selector, expected in (("a:first", "Test My link text"),
                               ("a:last", " My link text 2")):
        val = doc(selector).html()
        assert val == expected, repr(val)
def test_serialize(self):
    """serialize() URL-encodes successful controls in document order."""
    doc = pq(self.html4)
    expected = ('spam=Spam%2Fspam&order=baked%0D%0Abeans&order=tomato&'
                'multiline=multiple%0D%0Alines%0D%0Aof%20text')
    self.assertEqual(doc('form').serialize(), expected)
async def get_flag_text(self, data_url):
    """Load *data_url* in the seller-flag page and extract its "tip" text.

    Retries navigation until it succeeds, then parses the JSON-like body
    rendered in the page's <pre> element.

    :param data_url: URL of the flag endpoint to fetch.
    :return: the tip string, or ``None`` when no tip is present
             (the raw body is logged in that case).
    """
    page = self._page_seller_flag
    net_check()
    while True:
        try:
            await page.bringToFront()
            await page.goto(data_url)
        except (errors.TimeoutError, errors.PageError):
            # BUG FIX: time.sleep() here blocked the whole event loop;
            # asyncio.sleep() lets other coroutines keep running while
            # we back off before retrying.
            await asyncio.sleep(5)
        else:
            break
    await asyncio.sleep(1)
    content = await page.content()
    await asyncio.sleep(2)
    # await page.close()
    await self.page.bringToFront()
    doc = pq(content)
    res = re.search(r'"tip":"(.*?)"}', doc("pre").text())
    if res:
        return res.group(1)
    logger.info(doc("pre").text())
    return None
def test_serialize_dict(self):
    """serialize_dict() groups repeated control names into a list."""
    doc = pq(self.html4)
    expected = {
        'spam': 'Spam/spam',
        'order': ['baked\r\nbeans', 'tomato'],
        'multiline': 'multiple\r\nlines\r\nof text',
    }
    self.assertEqual(doc('form').serialize_dict(), expected)
def test_filter(self):
    """filter() works with class/id selectors and index callables."""
    assert len(self.klass("div", self.html).filter(".node3")) == 1
    assert len(self.klass("div", self.html).filter("#node2")) == 1
    assert len(self.klass("div", self.html).filter(lambda i: i == 0)) == 1
    # An empty selection filters to an empty result.
    doc = pq("<p>Hello <b>warming</b> world</p>")
    self.assertEqual(doc("strong").filter(lambda el: True), [])
def test_val_for_textarea(self):
    """val() on a <textarea> reads and writes its text content."""
    doc = pq(self.html3)
    self.assertEqual(doc('textarea').val(), 'Spam')
    self.assertEqual(doc('textarea').text(), 'Spam')
    doc('textarea').val('42')
    self.assertEqual(doc('textarea').val(), '42')
    # Note: jQuery still returns 'Spam' here.
    self.assertEqual(doc('textarea').text(), '42')
def _login(self, email, password):
    """Authenticate against fitbit.com through its HTML login form."""
    self._log("Logging in...")
    login_page = self._session.get('https://www.fitbit.com/login')
    form = pq(login_page.content).find('#loginForm')
    # Replay every existing input (hidden tokens included), then
    # override the credential fields.
    payload = {field.name: field.value for field in form.find('input')}
    payload.update({'email': email, 'password': password})
    self._session.post(form.attr('action'), payload)
def get_url():
    """Collect absolute sight-detail URLs from the current page."""
    captions = pq(driver.page_source).find('.sight_item_caption')
    return ['http://piao.qunar.com' + str(box.find('.name').attr('href'))
            for box in captions.items()]
def test_unicode(self):
    """html() returns text_type; str()/unicode() render the document."""
    xml = pq(u("<p>é</p>", 'utf-8'))
    self.assertEqual(type(xml.html()), text_type)
    if not PY3k:
        # Python 2 only: unicode() gives the decoded form.
        self.assertEqual(unicode(xml), u("<p>é</p>", 'utf-8'))
    self.assertEqual(str(xml), '<p>é</p>')
def test_selector_with_xml(self):
    """Namespaced selector passed together with self.namespaces."""
    doc = pq('bar|blah', b(self.xml), parser='xml',
             namespaces=self.namespaces)
    expected = 'What'
    self.assertEqual(repr(doc.text()), repr(expected))
def test_serialize_array(self):
    """serialize_array() emits one name/value dict per control value."""
    doc = pq(self.html4)
    expected = [
        {'name': 'spam', 'value': 'Spam/spam'},
        {'name': 'order', 'value': 'baked\r\nbeans'},
        {'name': 'order', 'value': 'tomato'},
        {'name': 'multiline', 'value': 'multiple\r\nlines\r\nof text'},
    ]
    self.assertEqual(doc('form').serialize_array(), expected)
def test_get(self):
    """GET against a live Wikipedia page; no-op without requests."""
    if not HAS_REQUEST:
        return
    doc = pq(u('http://ru.wikipedia.org/wiki/Заглавная_страница', 'utf8'),
             method='get')
    print(doc)
    self.assertEqual(doc('#n-mainpage a').text(),
                     u('Заглавная страница', 'utf8'))
def test_get(self):
    """Fetch a live Wikipedia page when the requests library exists."""
    if not HAS_REQUEST:
        return
    page_url = u('http://ru.wikipedia.org/wiki/Заглавная_страница', 'utf8')
    d = pq(page_url, method='get')
    print(d)
    expected = u('Заглавная страница', 'utf8')
    self.assertEqual(d('#n-mainpage a').text(), expected)
def test_get_root(self):
    """root and encoding are exposed on the doc and on child nodes."""
    doc = pq(b'<?xml version="1.0" encoding="UTF-8"?><root><p/></root>')
    self.assertTrue(isinstance(doc.root, etree._ElementTree))
    self.assertEqual(doc.encoding, 'UTF-8')
    child = doc.children().eq(0)
    # Children keep a link back to their parent document.
    self.assertNotEqual(child._parent, no_default)
    self.assertTrue(isinstance(child.root, etree._ElementTree))
def test_val_for_multiple_elements(self):
    """val() reads from the first element but writes to every element."""
    doc = pq(self.html5)
    # "Get" returns *first* value.
    self.assertEqual(doc('div > *').val(), 'spam')
    # "Set" updates *every* value.
    doc('div > *').val('42')
    for selector in ('#first', '#second', '#third'):
        self.assertEqual(doc(selector).val(), '42')
def test_next_all(self):
    """next_all() unfiltered, with a filter, and on an empty match."""
    doc = pq(self.html2)
    self.assertEqual(len(doc('#term-2').next_all()), 6)      # without filter
    self.assertEqual(len(doc('#term-2').next_all('dd')), 5)  # with filter
    self.assertEqual(doc('#NOTHING').next_all(), [])         # when empty
def get_comments():
    """Print user/date/text for every comment on the current page."""
    comments = pq(driver.page_source).find('.mp-comments-list')
    for item in comments.find('.mp-comments-item').items():
        print({
            'user': item.find('.mp-comments-username').text(),
            'date': item.find('.mp-comments-time').text(),
            'comment': item.find('.mp-comments-desc').text(),
        })
def test_serialize_pairs_form_values(self):
    """serialize_pairs() preserves document order and duplicate names."""
    doc = pq(self.html4)
    expected = [
        ('spam', 'Spam/spam'),
        ('order', 'baked\r\nbeans'),
        ('order', 'tomato'),
        ('multiline', 'multiple\r\nlines\r\nof text'),
    ]
    self.assertEqual(doc('form').serialize_pairs(), expected)
def test_next_until(self):
    """next_until() unfiltered, filtered, and on an empty match."""
    doc = pq(self.html2)
    # without filter
    self.assertEqual(len(doc('#term-2').next_until('dt')), 3)
    # with filter
    filtered = doc('#term-2').next_until('dt', ':not(.strange)')
    self.assertEqual(len(filtered), 2)
    # when empty
    self.assertEqual(doc('#NOTHING').next_until('*'), [])
def test_session(self):
    """Headers set on a custom requests.Session reach the server."""
    if not HAS_REQUEST:
        self.skipTest('no requests library')
    import requests
    session = requests.Session()
    session.headers.update({'X-FOO': 'bar'})
    doc = pq(url=self.application_url, data={'q': 'foo'},
             method='get', session=session)
    self.assertIn('HTTP_X_FOO: bar', doc('p').text())
def test_serialize_pairs_form_id(self):
    """Controls associated to a form via form= serialize with it."""
    doc = pq(self.html)
    self.assertEqual(doc('#div').serialize_pairs(), [])
    dispersed = [
        ('order', 'spam'),
        ('order', 'eggs'),
        ('order', 'ham'),
        ('order', 'tomato'),
        ('order', 'baked beans'),
    ]
    self.assertEqual(doc('#dispersed').serialize_pairs(), dispersed)
    self.assertEqual(doc('.no-id').serialize_pairs(), [('spam', 'Spam')])
def test_replaceWith(self):
    # Expected markup after every <img> is replaced by the text 'image'.
    # NOTE(review): literal kept byte-for-byte as found; the original
    # file likely had newlines inside the triple quotes that were
    # collapsed to spaces — confirm against the upstream suite.
    expected = '''<div class="portlet"> <a href="/toto">TestimageMy link text</a> <a href="/toto2">imageMy link text 2</a> Behind you, a three-headed HTML&dash;Entity! </div>'''
    d = pq(self.html)
    d('img').replace_with('image')
    val = d.__html__()
    assert val == expected, (repr(val), repr(expected))
def get_comment_last(url):
    """Return the total number of comment pages for *url*."""
    driver.get(url)
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '.mp-pager-next.mp-pager-item')))
    pager = pq(driver.page_source).find('#pageContainer')
    labels = [item.text() for item in pager.find('.mp-pager-item').items()]
    # The next-to-last pager item carries the last page number.
    return int(labels[-2])
def get_securityfocus_url():
    """Crawl each securityfocus.com ref's /references page for more links.

    For every stored securityfocus reference, fetch its references page;
    for each linked URL not yet recorded for that CVE, append it to the
    CVE's metadata and insert a new row into ``db.refs``.
    """
    # Hoisted out of the loops: groups are (host, path prefix).
    url_pattern = re.compile(r'[a-zA-Z]+://(.*?)/(.*?)')
    results = db.refs.find({'host': 'www.securityfocus.com'})
    print('securityfocus:', results.count())
    for result in results:
        url = result['ref'] + '/references'
        headers = {
            'host': result['host'],
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/70.0.3538.110 Safari/537.36',
        }
        print(url)
        try:
            # BUG FIX: headers were built but never sent with the request.
            response = requests.get(url, headers=headers)
            if response.status_code != 200:
                continue
            doc = pq(response.text)
            for item in doc('#vulnerability a').items():
                ref = item.attr('href')
                # Skip references already recorded for this CVE.
                if db.refs.find({'ref': ref,
                                 'cve': result['cve']}).count() != 0:
                    continue
                # Record the new URL on the CVE's metadata document.
                cve_meta = db.cve_meta.find_one({'_id': result['cve']})
                cve_meta['refs'].append({'url': ref})
                print(cve_meta)
                modified_count = db.cve_meta.update_one(
                    {'_id': result['cve']}, {'$set': cve_meta})
                print(modified_count)
                # (Removed an unused `db.cve_meta.find({})` that clobbered
                # a local name and queried the whole collection per link.)
                match = url_pattern.match(str(ref))
                if match is None:
                    continue
                ref = re.sub(r'\s', '', ref)
                # Last path segment as display text; fall back to the
                # second-to-last when the URL ends with '/'.
                text = ref.split("/")[-1] or ref.split("/")[-2]
                refs = {
                    'ref': ref,
                    'host': str(match.group(1)),
                    'text': text,
                    'cve': result['cve'],
                }
                print(refs)
                db.refs.insert_one(refs)
        except requests.ConnectionError as e:
            print('error', e.args)
def get_view():
    """Extract reviewer name/level/text per comment and persist each."""
    doc = pq(broswer.page_source)
    for item in doc('.rev-item.comment-item.clearfix').items():
        save_to_mongo({
            'name': item.find('.name').text(),
            'level': item.find('.level').text(),
            'txt': item.find('.rev-txt').text(),
        })
def test_unicode(self):
    """Unicode text survives parsing, selection and serialization."""
    xml = pq(u("<html><p>é</p></html>", "utf-8"))
    self.assertEqual(type(xml.html()), text_type)
    if PY3k:
        self.assertEqual(str(xml), "<html><p>é</p></html>")
        self.assertEqual(str(xml('p:contains("é")')), "<p>é</p>")
    else:
        self.assertEqual(unicode(xml), u("<html><p>é</p></html>", "utf-8"))
        self.assertEqual(str(xml), "<html><p>é</p></html>")
        selected = xml(u('p:contains("é")', "utf8"))
        self.assertEqual(str(selected), "<p>é</p>")
        self.assertEqual(unicode(selected), u("<p>é</p>", "utf8"))
def get_page_url(content):
    """Return pagination hrefs found in *content* (totals are printed)."""
    paging = pq(content)('.paging')
    print(paging.find('b').text())
    hrefs = []
    for item in paging.find('a').items():
        print(item.attr('href'))
        hrefs.append(item.attr('href'))
    return hrefs
def HouseUrl(url):
    """Collect house-detail links from the listing page at *url*."""
    # Random delay (0-10s) before loading the page.
    time.sleep(random.random() * 10)
    broswer.get(url)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.curr')))
    titles = pq(broswer.page_source).find('.house-title')
    return [str(box.find('.houseListTitle').attr('href'))
            for box in titles.items()]
def test_html_replacement(self):
    """html(new_content) replaces only the matched element's contents."""
    html = '<div>Not Me<span>Replace Me</span>Not Me</div>'
    replacement = 'New <em>Contents</em> New'
    expected = html.replace('Replace Me', replacement)
    doc = pq(html)
    doc.find('span').html(replacement)
    rendered = doc.outerHtml()
    self.assertEqual(rendered, expected)
    self.assertIn(replacement, rendered)
def test_unicode(self):
    """str()/text_type() render unicode content; selection matches é."""
    xml = pq(u"<html><p>é</p></html>")
    self.assertEqual(type(xml.html()), text_type)
    if PY3k:
        self.assertEqual(str(xml), '<html><p>é</p></html>')
        self.assertEqual(str(xml('p:contains("é")')), '<p>é</p>')
    else:
        self.assertEqual(text_type(xml), u"<html><p>é</p></html>")
        self.assertEqual(str(xml), '<html><p>é</p></html>')
        selection = xml(u'p:contains("é")')
        self.assertEqual(str(selection), '<p>é</p>')
        self.assertEqual(text_type(selection), u'<p>é</p>')
def test_val_for_select(self):
    """val() on <select>: None without a selection; setting selects."""
    doc = pq(self.html4)
    self.assertIsNone(doc('#first').val())
    self.assertEqual(doc('#second').val(), 'eggs')
    self.assertIsNone(doc('#third').val())
    doc('#first').val('spam')
    doc('#second').val('bacon')
    doc('#third').val('eggs')  # selecting a non-existing option
    self.assertEqual(doc('#first').val(), 'spam')
    self.assertEqual(doc('#second').val(), 'bacon')
    self.assertIsNone(doc('#third').val())
    doc('#first').val('bacon')  # selecting a non-existing option
    self.assertIsNone(doc('#first').val())
def test_val_for_inputs(self):
    """val() reads and writes text/checkbox/radio input values."""
    doc = pq(self.html2)
    selectors = ('input[name="spam"]', 'input[name="eggs"]',
                 'input:checkbox', 'input:radio')
    # Initial values, then write new ones, then read them back.
    for selector, value in zip(selectors, ('Spam', 'Eggs', 'Bacon', 'Ham')):
        self.assertEqual(doc(selector).val(), value)
    for selector, value in zip(selectors, ('42', '43', '44', '45')):
        doc(selector).val(value)
    for selector, value in zip(selectors, ('42', '43', '44', '45')):
        self.assertEqual(doc(selector).val(), value)
def test_from_url(url, timeout=1):
    """Scrape ip:port candidates from *url* and return the working ones.

    :param url: page listing proxies, e.g. 'http://www.ip84.com/pn'
    :param timeout: per-proxy test timeout in seconds (default 1)
    :return: list of usable proxies

    usage::

        proxy_list = test_from_url(url)
        proxy_list = test_from_url(url, timeout=3)
    """
    ip_port_re = re.compile(
        r'(?<![\.\d])(?:\d{1,3}\.){3}\d{1,3}(?![\.\d]):\d{1,5}')
    page_text = pq(requests.get(url, verify=True).text).text()
    # Join space-separated tokens with ':' — presumably so "ip port"
    # pairs become "ip:port" before matching; confirm against the site.
    joined = ':'.join(page_text.split(' '))
    candidates = list(set(ip_port_re.findall(joined)))
    return test_from_list(proxy_list=candidates, timeout=timeout)
def getbyurl_zh(url):
    # Scrape a GBK-encoded Chinese library-catalogue page and map its
    # labelled <td> fields into a normalized dict; also derives a cover
    # image URL from the ISBN.
    # NOTE(review): Python 2 code (print statement), function-local imports.
    map_data={}
    import requests
    a=requests.get(url).content.decode('gbk')
    from pyquery.pyquery import PyQuery as pq
    # First three characters of a cell's label -> output key, e.g.
    # u"题 名" (title) -> 'title', "ISB"(N) -> u'标准号' (standard no.).
    rr={u"题 名":'title', u"页 码":u'载体形态', u"作 者":u'作者', u"出版项":u'出版社', "ISB":u'标准号', u"索取号":'position', u"附注信":u'载体形态'}
    for i in pq(a)("tr td"):
        print pq(i).text()[:3]
        if pq(i).text()[:3] in rr:
            # The value is everything after the first ':' in the cell text.
            map_data[ rr[ pq(i).text()[:3] ] ]= pq(i).text()[pq(i).text().index(":")+1:]
    # Cover image keyed by ISBN (u'标准号'); raises KeyError if missing.
    map_data['img']= "http://book.bookday.cn/book/cover?isbn=%s&w=100&h=150" %map_data[u'标准号']
    return map_data
def test_soup_parser(self):
    """The soup parser closes unclosed tags and quotes attributes."""
    markup = '<meta><head><title>Hello</head><body onload=crash()>Hi all<p>'
    doc = pq(markup, parser='soup')
    expected = ('<html><meta/><head><title>Hello</title></head>'
                '<body onload="crash()">Hi all<p/></body></html>')
    self.assertEqual(str(doc), expected)
def handle(self, *args, **kwargs):
    # Django management-command entry point: scrape the "OuPu steel net"
    # resource listings with PhantomJS and export them to per-provider
    # .xls workbooks, registering each file as a CrawlExcel row.
    # NOTE(review): Python 2 code (print statements, dict.iteritems).
    # NOTE(review): the source was collapsed onto a few physical lines;
    # the nesting below is a best-effort reconstruction — verify.
    print '开始下载欧普钢网资源单...'
    driver = webdriver.PhantomJS()
    # Ensure the crawl output directory tree exists.
    if not os.path.exists(settings.CRAWL_ROOT):
        os.mkdir(settings.CRAWL_ROOT)
        print '新建目录: %s' % settings.CRAWL_ROOT
    today = datetime.datetime.now().strftime('%Y_%m_%d')
    # yesday = datetime.datetime.now() + datetime.timedelta(days=-1)
    # yesday_str = yesday.strftime('%m-%d')
    yesday_str = datetime.datetime.now().strftime('%m-%d')
    # Matches any run of CJK (Chinese) characters.
    zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
    print yesday_str
    today_dir = os.path.join(settings.CRAWL_ROOT, today)
    if not os.path.exists(today_dir):
        os.mkdir(today_dir)
        print '新建目录: %s' % today_dir
    gangyin_dir = os.path.join(today_dir, 'oupu')
    if not os.path.exists(gangyin_dir):
        os.mkdir(gangyin_dir)
        print '新建目录: %s' % gangyin_dir
    # Get-or-create the system user that owns the crawled files.
    try:
        profile = PhoneUserProfile.objects.get(nickname=u'欧普钢网资源单', status=2)
    except PhoneUserProfile.DoesNotExist:
        user = User.objects.create_user('__oupu', '__oupu')
        profile = PhoneUserProfile.objects.create(
            user=user,
            phone='-',
            qq='-',
            nickname=u'欧普钢网资源单',
            status=2
        )
        print '系统用户已生成'
    driver.get(url)
    time.sleep(2)
    q = pq(driver.page_source)
    # pages = int(pq(q('.z-end')).attr('id'))
    # Hard-coded page cap; the loop breaks early once rows get stale.
    pages = 500
    print '一共%d页' % pages
    all_results = {}
    for page in range(1, pages+1):
        driver.get(url2+'_%d.html' % page)
        print '第%d页' % page
        time.sleep(2)
        q = pq(driver.page_source)
        q = q('table tr')
        # import pdb
        # pdb.set_trace()
        break_out = False
        # Table rows 1..23 (row 0 presumably the header — confirm).
        for _ in range(1, 24):
            try:
                product_name = pq(pq(q('tr')[_])('td')[1]).text()
                shop_sign = pq(pq(q('tr')[_])('td')[3]).text()
                spec = pq(pq(q('tr')[_])('td')[2]).text()
                weight = pq(pq(pq(q('tr')[_])('td')[6]).find('p')[0]).text()
                price = pq(pq(q('tr')[_])('td')[5]).text()
                provider_name = pq(pq(pq(q('tr')[_])('td')[9]).find('a')[0]).text()
                release_time = pq(pq(q('tr')[_])('td')[8]).text()
                print yesday_str
                print release_time
                print provider_name
                if provider_name == '':
                    # No provider link: fall back to the mall's own contact.
                    provider_name = '欧浦商城'
                    contacts = '欧浦商城华东站热线'
                    phone = '021-60717078-8'
                else:
                    match = zhPattern.search(release_time)
                    if match == None:
                        # Numeric release date: stop once past today's rows.
                        if release_time != yesday_str:
                            break_out = True
                            break
                    contacts = pq(pq(pq(q('tr')[_])('td')[9]).find('em')[0]).text()
                    phone = pq(pq(pq(q('tr')[_])('td')[9]).find('em')[1]).text()
                warehouse_name = pq(pq(q('tr')[_])('td')[7]).text()
                manufacturer = pq(pq(q('tr')[_])('td')[4]).text()
            except (IndexError, Exception):
                # Skip malformed rows.  NOTE(review): Exception already
                # covers IndexError, so this silently swallows all errors.
                continue
            # Group rows by provider, then by product, de-duplicating.
            if provider_name not in all_results:
                all_results[provider_name] = {}
            if product_name not in all_results[provider_name]:
                all_results[provider_name][product_name] = []
            res = {}
            res['product_name'] = product_name
            res['shop_sign'] = shop_sign
            res['spec'] = spec
            res['weight'] = weight
            res['price'] = price
            res['provider_name'] = provider_name
            res['warehouse_name'] = warehouse_name
            res['manufacturer'] = manufacturer
            res['phone'] = phone
            res['contacts'] = contacts
            if res not in all_results[provider_name][product_name]:
                all_results[provider_name][product_name].append(res)
        if break_out:
            break
    # One workbook per provider, one sheet per product.
    for provider_name, data in all_results.iteritems():
        file_name = u'%s-欧普钢网资源单-%s.xls' % (provider_name, today)
        file_path = os.path.join(gangyin_dir, file_name)
        wb = xlwt.Workbook()
        for product_name, rows in data.iteritems():
            # Strip punctuation from the sheet name — presumably because
            # xlwt rejects these characters in sheet names; confirm.
            r = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'
            product_name = re.sub(r, '', product_name)
            ws = wb.add_sheet(product_name)
            c = 2
            # Contact header rows, then column titles at row 2.
            ws.write(0, 0, u'联系人:%s' % rows[0]['contacts'])
            ws.write(1, 0, u'电话:%s' % rows[0]['phone'])
            ws.write(c, 0, u'品名')
            ws.write(c, 1, u'牌号')
            ws.write(c, 2, u'规格')
            ws.write(c, 3, u'产地')
            ws.write(c, 4, u'仓库')
            ws.write(c, 5, u'重量')
            ws.write(c, 6, u'价格')
            # ws.write(c, 7, u'说明1')
            for row in rows:
                c += 1
                ws.write(c, 0, row['product_name'])
                ws.write(c, 1, row['shop_sign'])
                ws.write(c, 2, row['spec'])
                ws.write(c, 3, row['manufacturer'])
                ws.write(c, 4, row['warehouse_name'])
                ws.write(c, 5, row['weight'])
                ws.write(c, 6, row['price'])
                # ws.write(c, 7, row['special'])
        wb.save(file_path)
        CrawlExcel.objects.create(
            create_time=time.time(),
            crawl_user=profile.user,
            source=7,
            # source_id=excel_id,
            filepath=file_path,
            provider=provider_name,
            imported=False
        )
        print provider_name, file_path
def test_selector_with_xml(self):
    """Namespaced selector with an inline namespaces mapping."""
    ns = {'bar': 'http://example.com/bar'}
    doc = pq('bar|blah', b(self.xml), parser='xml', namespaces=ns)
    expected = 'What'
    self.assertEqual(repr(doc.text()), repr(expected))
def test_html_upper_element_name(self):
    """HTML parsing lowercases tag names, so 'X' and 'x' both match."""
    doc = pq('<X>foo</X>', parser='html')
    for selector in ('X', 'x'):
        self.assertEqual(len(doc(selector)), 1)
def test_post(self):
    """POST requests carry the data in the request body."""
    doc = pq(self.application_url, {'q': 'foo'}, method='post')
    body = doc('p').text()
    self.assertIn('REQUEST_METHOD: POST', body)
    self.assertIn('q=foo', body)
def test_get(self):
    """GET requests put the data in the query string."""
    doc = pq(self.application_url, {'q': 'foo'}, method='get')
    print(doc)
    body = doc('p').text()
    self.assertIn('REQUEST_METHOD: GET', body)
    self.assertIn('q=foo', body)
def test_remove_namespaces(self):
    """remove_namespaces() lets plain selectors match namespaced XML."""
    doc = pq(b(self.xml), parser='xml').remove_namespaces()
    expected = 'What'
    self.assertEqual(repr(doc('blah').text()), repr(expected))
def test_xhtml_namespace_html_parser(self):
    """xhtml_to_html() strips the XHTML namespace under the html parser."""
    doc = pq(self.xhtml, parser='html')
    doc.xhtml_to_html()
    expected = 'What'
    self.assertEqual(repr(doc('div').text()), repr(expected))
def test_selector_html(self):
    """Selector plus raw XML body (XML declaration removed) as html."""
    markup = self.xml.split('?>', 1)[1]
    doc = pq('blah', markup, parser='html')
    expected = 'What'
    self.assertEqual(repr(doc.text()), repr(expected))