def test_new_key(self):
    data = {"description": "This is meant for a test app"}
    url = reverse('authkeys.new', locale='en-US')

    # Check out the creation page, look for the form.
    resp = self.client.get(url)
    eq_(200, resp.status_code)
    page = pq(resp.content)
    eq_(1, page.find('form.key').length)

    # We don't have this key yet, right?
    keys = Key.objects.filter(description=data['description'])
    eq_(0, keys.count())

    # Okay, create it.
    resp = self.client.post(url, data, follow=False)
    eq_(200, resp.status_code)

    # We have the key now, right?
    keys = Key.objects.filter(description=data['description'])
    eq_(1, keys.count())

    # Okay, and it should belong to the logged-in user
    key = keys[0]
    eq_(key.user, self.user)

    # Take a look at the description and key shown on the result page.
    page = pq(resp.content)
    ok_(data['description'], page.find('.key .description').text())
    ok_(key.key, page.find('.key .key').text())

    # Ensure the secret on the page checks out.
    secret = page.find('.key .secret').text()
    ok_(key.check_secret(secret))
def getUserAnswers(self, all):
    # Get the qID and aID from the most recent answer file
    latestFile = self.getLatestAnswerFileName()
    latestQID = 0
    latestAID = 0
    if latestFile is None:
        # No file matches the expected format, so fetch everything
        all = True
    else:
        # Work out the latest questionID and answerID
        pattern = re.compile('^\[\d{4}-\d{2}-\d{2}\].*-q(\d{1,50})-a(\d{1,50}).html$')
        match = pattern.findall(latestFile)
        for pp in match:
            latestQID = pp[0]
            latestAID = pp[1]

    # The first page is always fetched; use it to work out the total number of answer pages
    pageContent = urllib2.urlopen("{}?page={}".format(self.answerURL, self.startPage)).read()
    d = pq(pageContent)
    pageMax = self.getMaxPageNumber(d)
    currentPage = self.startPage
    ret = False
    while True:
        self.logging("parsing page {} of {}".format(currentPage, pageMax), True)
        # If this is not a full fetch, check whether enough has been fetched:
        # after hitting an old answer, look at 10 more old answers and update them
        ret = self.parseAnswerAndSave(d, latestQID, latestAID, all)
        if not all and ret:
            # Not a full fetch, and a duplicate was found
            return
        if currentPage >= pageMax:
            # Already at the last page
            break
        # Build the pq object for the next page
        currentPage += 1
        pageContent = urllib2.urlopen("{}?page={}".format(self.answerURL, currentPage)).read()
        d = pq(pageContent)
def parse_current_docket(docket_record):
    # grab the file with the URL mangled slightly to grab 100k records
    docket_file = urllib2.urlopen(docket_record['url'] + "&ctl00_ctl00_cphContentMain_MainContent_gvCommentListChangePage=1_100000").read()
    page = pq(etree.fromstring(docket_file, parser))

    docket = dict(docket_record)

    docket['title'] = page('.dyn_wrap h1').text().strip()
    assert docket['title'], 'no title found'

    headers = [item.text().strip() for item in page('.rgMasterTable thead th').items()]

    docket['comments'] = []

    # check if there's a no-records message
    if len(page('.rgMasterTable .rgNoRecords')):
        return docket

    for row in page('.rgMasterTable tbody tr').items():
        tds = row.find('td')
        cell_text = [item.text().strip() for item in tds.items()]
        cdata = dict(zip(headers, cell_text))

        link = pq(tds[-1]).find('a')

        doc = {
            'url': urlparse.urljoin(docket['url'], link.attr('href')),
            'details': {},
            'release': [fix_spaces(cdata['Release'])],
            'date': cdata['Date Received'],
            'doctype': 'public_submission',
        }

        vc_matches = re.findall(r"ViewComment\.aspx\?id=(\d+)", doc['url'])
        if vc_matches:
            doc['id'] = vc_matches[0]
            doc['subtype'] = 'comment'
            detail_columns = ['Organization', 'First Name', 'Last Name']
        else:
            ep_matches = re.findall(r"ViewExParte\.aspx\?id=(\d+)", doc['url'])
            if ep_matches:
                doc['id'] = "EP-%s" % ep_matches[0]
                doc['subtype'] = 'exparte'
                detail_columns = ['Organization']
            else:
                assert False, "expected either comment or exparte link: %s" % doc['url']

        for rdg_label, cftc_label in (('Organization Name', 'Organization'),
                                      ('First Name', 'First Name'),
                                      ('Last Name', 'Last Name')):
            if cftc_label in detail_columns and cdata[cftc_label]:
                doc['details'][rdg_label] = cdata[cftc_label]

        docket['comments'].append(doc)

    assert len(docket['comments']) < 100000, "we probably exceeded one page"

    # then strip out all the ones that aren't about this document
    release = fix_spaces(page('a[id*=rptReleases_hlReleaseLink]').text().strip())
    docket['comments'] = [comment for comment in docket['comments'] if comment['release'][0] == release]

    return docket
def test__render(self):
    w = MultiEmailWidget()
    output = w.render('test', ['*****@*****.**', '*****@*****.**'])
    self.assertEqual(1, len(pq('textarea', output)))
    self.assertEqual(
        pq('textarea', output).text(),
        '[email protected],[email protected]')
def scrape_press_releases():
    releases_page = pq(scraperwiki.scrape(BASE_URL + 'news-releases'))
    for row in releases_page.find('.recordListTitle'):
        sleep(1)
        title = ''
        date = None
        content = ''
        attachments = []
        links = pq(row).find('a')
        page = pq(scraperwiki.scrape(links.eq(0).attr('href')))
        title = _extract_title_from(page)
        content = _readable(page.find('.content').html())
        date = _extract_date_from(page)
        for attachment in page.find('.file_link a'):
            att = pq(attachment)
            attachments.append({att.text(): att.attr('html')})
        args = [title, date, content]
        kwargs = {}
        if len(attachments):
            kwargs.update(attachments=attachments)
        gasp.add_press_release(*args, **kwargs)
def getAnimeURL(searchText):
    try:
        searchText = sanitiseSearchText(searchText)

        html = requests.get(BASE_URL + "/anime/all?name=" + searchText.replace(" ", "%20"))
        ap = pq(html.text)

        animeList = []

        # If it's taken us to the search page
        if ap.find('.cardDeck.pure-g.cd-narrow[data-type="anime"]'):
            for entry in ap.find('.card.pure-1-6'):
                entryTitle = pq(entry).find('a').text()
                entryURL = pq(entry).find('a').attr('href')

                anime = {}
                anime['title'] = entryTitle
                anime['url'] = BASE_URL + entryURL
                animeList.append(anime)

            closestName = difflib.get_close_matches(searchText.lower(),
                                                    [x['title'].lower() for x in animeList],
                                                    1, 0.85)[0]
            closestURL = ''

            for anime in animeList:
                if anime['title'].lower() == closestName:
                    return anime['url']

        # Else if it's taken us right to the series page, get the url from the meta tag
        else:
            return ap.find("meta[property='og:url']").attr('content')

        return None

    except:
        # traceback.print_exc()
        return None
def test_delete(self):
    """Can delete badge"""
    user = self._get_user()
    badge = Badge(creator=user, title="Test III", description="Another test")
    badge.save()
    slug = badge.slug

    badge.award_to(user)

    self.client.login(username="******", password="******")

    r = self.client.get(reverse('badger.views.detail', args=(badge.slug,)), follow=True)
    doc = pq(r.content)
    eq_('badge_detail', doc.find('body').attr('id'))
    delete_url = doc.find('a.delete_badge').attr('href')
    ok_(delete_url is not None)

    r = self.client.get(delete_url)
    doc = pq(r.content)
    eq_('badge_delete', doc.find('body').attr('id'))
    eq_("1", doc.find('.awards_count').text())

    r = self.client.post(delete_url, {}, follow=True)
    doc = pq(r.content)

    try:
        badge = Badge.objects.get(slug=slug)
        ok_(False)
    except Badge.DoesNotExist:
        ok_(True)
def get_content(self, total_pq):
    '''Get the content of the user's posted weibo entries'''
    data = total_pq("div[node-type=feed_list_content]")
    i = 0
    for d in data:
        d = pq(d)
        if i == 0 and str(d("span")) != "":
            # Skip pinned/hot posts (span.W_icon_feedpin / feedhot)
            self.containsFirstTagWeibo = True
        else:
            if '//' in d.text():
                # The post contains a repost ("//...")
                # Keep only the user's own content, excluding the reposted part after //
                p1 = re.compile('(.*?)\s?//\s?<a', re.S)
                match = p1.search(d.outerHtml())
                if match:
                    if match.group(1).strip() == '':
                        # The user's own content is empty
                        self.content_list.append('')
                    else:
                        data_pq = pq(match.group(1))
                        # print '~~~~~~~~~~~~', data_pq.outerHtml()
                        content = self.get_content_src(data_pq)
                        # print '1111111111', content
                        self.content_list.append(content)
                else:
                    # The post itself simply contains "//" and is not a repost
                    self.content_list.append(d.text())
            else:
                # An original post with no repost
                content = self.get_content_src(d)
                self.content_list.append(content)
        i = i + 1
    return self.content_list
def run(self):
    opener = build_opener(HTTPCookieProcessor())
    d = pq(opener.open(self.url).read())
    rates = d(".count")
    self.descrip = pq(rates[0]).html()
    self.service = pq(rates[1]).html()
    self.speed = pq(rates[2]).html()
def test_map(self):
    def ids_minus_one(i, elem):
        return int(self.klass(elem).attr('id')[-1]) - 1

    assert self.klass('div', self.html).map(ids_minus_one) == [0, 1]

    d = pq('<p>Hello <b>warming</b> world</p>')
    self.assertEqual(d('strong').map(lambda i, el: pq(this).text()), [])
def test_bug869301_revisions_feed_locale(self):
    """Links to documents in revisions feed with ?all_locales should
    reflect proper document locale, regardless of requestor's locale"""
    d = document(title='HTML9', locale="fr")
    d.save()
    now = datetime.datetime.now()
    for i in xrange(1, 6):
        created = now + datetime.timedelta(seconds=5 * i)
        revision(save=True, document=d,
                 title='HTML9', comment='Revision %s' % i,
                 content="Some Content %s" % i,
                 is_approved=True,
                 created=created)

    resp = self.client.get('%s?all_locales' %
                           reverse('wiki.feeds.recent_revisions',
                                   args=(), kwargs={'format': 'rss'},
                                   locale='en-US'))
    self.assertEqual(200, resp.status_code)
    feed = pq(resp.content)
    self.assertEqual(5, len(feed.find('item')))
    for i, item in enumerate(feed.find('item')):
        href = pq(item).find('link').text()
        self.assertTrue('/fr/' in href)
def get_autolab_grades():
    # Autolab has their SSL certificates misconfigured, so we won't verify them
    s = authenticate('https://autolab.cs.cmu.edu/auth/users/auth/shibboleth', {"verify": False})
    main = s.get('https://autolab.cs.cmu.edu').content
    d = pq(main)
    current_courses = d('#content > .rolodex > .course > h1 > a')
    grades = {}
    for course in current_courses:
        page_1 = s.get('https://autolab.cs.cmu.edu%s/assessments' % d(course).attr('href')).content
        gradebook = pq(pq(page_1)('.action-links > li > a')[1]).attr('href')
        course_page = s.get('https://autolab.cs.cmu.edu%s' % gradebook).content
        course_name = d(course).text()
        cd = pq(course_page)
        grades[course_name] = {}
        assignments = cd('.grades tr')
        for assgn in assignments:
            if d(assgn).attr('class') == 'header':
                continue
            name = cd(assgn).find("td > span > a").text()
            score = cd(assgn).find("td > a").text()
            total = cd(assgn).find("span.max_score").text()
            if name is not None and score is not None and total is not None:
                grades[course_name][name] = [float(score), float(total)]
    return grades
def ods2csv(content, admins=''):
    file_like_object = StringIO(content)
    xml = zipfile.ZipFile(file_like_object).read('content.xml')

    def rep_repl(match):
        return '<table:table-cell>%s' % match.group(2) * int(match.group(1))

    def repl_empt(match):
        n = int(match.group(1))
        pat = '<table:table-cell/>'
        return pat * n if (n < 100) else pat

    p_repl = re.compile(r'<table:table-cell [^>]*?repeated="(\d+)[^/>]*>(.+?table-cell>)')
    p_empt = re.compile(r'<table:table-cell [^>]*?repeated="(\d+)[^>]*>')
    xml = re.sub(p_repl, rep_repl, xml)
    xml = re.sub(p_empt, repl_empt, xml)

    d = pq(xml, parser='xml')
    ns = {'table': 'urn:oasis:names:tc:opendocument:xmlns:table:1.0'}
    selr = CSSSelector('table|table-row', namespaces=ns)
    selc = CSSSelector('table|table-cell', namespaces=ns)
    rowxs = pq(selr(d[0]))
    data = []
    for ir, rowx in enumerate(rowxs):
        cells = pq(selc(rowx))
        if cells.text():
            data.append([cells.eq(ic).text().encode('utf-8') for ic in range(len(cells))])

    if data:
        return data
    else:
        logger_script = logging.getLogger("Script Error")
        logger_script.warning("Google returned empty file for table <b>%s</b>" % IP_table_name)
        sending_log("DHCP_ERROR: script got an error",
                    "<b>Google returned empty file</b>, script ended without any changes made!",
                    error_log_file_name=error_log_file_name, admins=admins)
        sys.exit(exit_code_dict['google_returned_empty_file'])
def test_answer_creator_can_edit(self):
    """The creator of an answer can edit his/her answer."""
    self.client.login(username='******', password='******')

    # Initially there should be no edit links
    response = get(self.client, 'questions.answers', args=[self.question.id])
    doc = pq(response.content)
    eq_(0, len(doc('ol.answers a.edit')))

    # Add an answer and verify the edit link shows up
    content = 'lorem ipsum dolor sit amet'
    response = post(self.client, 'questions.reply', {'content': content},
                    args=[self.question.id])
    doc = pq(response.content)
    eq_(1, len(doc('ol.answers a.edit')))
    new_answer = self.question.answers.order_by('-created')[0]
    eq_(1, len(doc('#answer-%s a.edit' % new_answer.id)))

    # Make sure it can be edited
    content = 'New content for answer'
    response = post(self.client, 'questions.edit_answer', {'content': content},
                    args=[self.question.id, new_answer.id])
    eq_(200, response.status_code)

    # Now lock it and make sure it can't be edited
    self.question.is_locked = True
    self.question.save()
    response = post(self.client, 'questions.edit_answer', {'content': content},
                    args=[self.question.id, new_answer.id])
    eq_(403, response.status_code)
def test_top_contributors(self):
    # There should be no top contributors since there are no solutions.
    cache_top_contributors()
    response = get(self.client, 'questions.questions')
    doc = pq(response.content)
    eq_(0, len(doc('#top-contributors ol li')))

    # Solve a question and verify we now have a top contributor.
    answer = Answer.objects.all()[0]
    answer.created = datetime.now()
    answer.save()
    answer.question.solution = answer
    answer.question.save()
    cache_top_contributors()
    response = get(self.client, 'questions.questions')
    doc = pq(response.content)
    lis = doc('#top-contributors ol li')
    eq_(1, len(lis))
    eq_('pcraciunoiu', lis[0].text)

    # Make the answer 8 days old. There should be no top contributors.
    answer.created = datetime.now() - timedelta(days=8)
    answer.save()
    cache_top_contributors()
    response = get(self.client, 'questions.questions')
    doc = pq(response.content)
    eq_(0, len(doc('#top-contributors ol li')))
def common_vote(self):
    """Helper method for question vote tests."""
    # Check that there are no votes and vote form renders
    response = get(self.client, 'questions.answers', args=[self.question.id])
    doc = pq(response.content)
    eq_('0 people', doc('div.have-problem mark')[0].text)
    eq_(1, len(doc('div.me-too form')))

    # Vote
    post(self.client, 'questions.vote', args=[self.question.id])

    # Check that there is 1 vote and vote form doesn't render
    response = get(self.client, 'questions.answers', args=[self.question.id])
    doc = pq(response.content)
    eq_('1 person', doc('div.have-problem mark')[0].text)
    eq_(0, len(doc('div.me-too form')))

    # Voting again (same user) should not increment vote count
    post(self.client, 'questions.vote', args=[self.question.id])
    response = get(self.client, 'questions.answers', args=[self.question.id])
    doc = pq(response.content)
    eq_('1 person', doc('div.have-problem mark')[0].text)
def common_answer_vote(self):
    """Helper method for answer vote tests."""
    # Check that there are no votes and vote form renders
    response = get(self.client, 'questions.answers', args=[self.question.id])
    doc = pq(response.content)
    eq_(1, len(doc('form.helpful input[name="helpful"]')))

    # Vote
    post(self.client, 'questions.answer_vote', {'helpful': 'y'},
         args=[self.question.id, self.answer.id])

    # Check that there is 1 vote and vote form doesn't render
    response = get(self.client, 'questions.answers', args=[self.question.id])
    doc = pq(response.content)
    eq_('1 out of 1 person', doc('#answer-1 div.helpful mark')[0].text)
    eq_(0, len(doc('form.helpful input[name="helpful"]')))

    # Voting again (same user) should not increment vote count
    post(self.client, 'questions.answer_vote', {'helpful': 'y'},
         args=[self.question.id, self.answer.id])
    doc = pq(response.content)
    eq_('1 out of 1 person', doc('#answer-1 div.helpful mark')[0].text)
def _delete_flow(self, user):
    """Private method used to walk through account deletion flow."""
    self.client.login(email=user.email)
    user_id = User.objects.get(email=user.email).id

    r = self.client.get(reverse('profile.edit'))
    doc = pq(r.content)

    # Make sure there's a link to a confirm deletion page, and nothing
    # pointing directly to the delete URL.
    eq_(reverse('profile.delete_confirm'), doc('a.btn-danger').attr('href'),
        'We see a link to a confirmation page.')
    self.assertFalse(any((reverse('profile.delete') in el.action)
                         for el in doc('#main form')),
                     "We don't see a form posting to the account delete URL.")

    # Follow the link to the deletion confirmation page.
    r = self.client.get(doc('a.btn-danger').attr('href'))

    # Test that we can go back (i.e. cancel account deletion).
    doc = pq(r.content)
    eq_(reverse('profile.edit'), doc('#cancel-action').attr('href'))

    # Test that account deletion works.
    delete_url = doc('#delete-action').closest('form').attr('action')
    r = self.client.post(delete_url, follow=True)
    eq_(200, r.status_code)
    self.assertFalse(_logged_in_html(r))

    # Make sure the user data isn't there anymore
    assert not User.objects.get(id=user_id).first_name
    assert not User.objects.get(id=user_id).email
    assert not User.objects.get(id=user_id).is_active
def movie_search(self, **kw):
    """
    Fallback for movie.search
    """
    keyword = kw['q']
    url = 'http://movie.douban.com/subject_search?search_text=%s&cat=1002' % \
        keyword.encode('utf-8')
    html = pq(url=url, parser='html',
              opener=lambda url, **kw: urllib2.urlopen(
                  urllib2.Request(url, headers={'User-Agent': self.recbysns.UA,
                                                'Cookie': self.cookie}),
                  timeout=10).read())
    movies = []
    for movie in html('#content table .item'):
        movie = pq(movie)
        id = int(re.match('http://movie.douban.com/subject/(\d+)/',
                          movie('.nbg').attr('href')).group(1))
        image = movie('.nbg img').attr('src')
        pub = movie('.pl2>.pl').text()
        rating = pq(movie('.pl2 .star'))
        if rating and rating('.rating_nums').text():
            numRaters = int(re.match(u'\((\d+)', rating('.pl').text()).group(1))
            average = rating('.rating_nums').text()
            rating = {"numRaters": numRaters, "average": average}
        else:
            rating = {"numRaters": 0, "average": 0}
        titles = [title.strip() for title in movie('.pl2>a').text().split('/')]
        movies.append({'id': id,
                       'titles': titles,
                       'image': image,
                       'pub': pub,
                       'rating': rating})
    return {'movies': movies, 'total': len(movies)}
def scrape_inspection(inspection_url, facility):
    try:
        inspection = {}
        inspection['facility'] = facility['_id']
        if 'id' in facility:
            inspection['facility_id'] = facility['id']
        inspection['_id'] = inspection_url
        inspection['url'] = inspection_url

        inspection_resp = requests.get(inspection['url'])
        doc = pq(inspection_resp.content)

        info = doc.find('div#inspectionInfo tr td')
        for (counter, pair) in enumerate(grouper(info, 2)):
            value = pq(pair[1]).text()
            if counter == 0:
                date = dateutil.parser.parse(value)
                inspection['date'] = date.date()
            elif counter == 2:
                inspection['priority'] = value
            elif counter == 3:
                inspection['purpose'] = value
            elif counter == 4:
                inspection['result'] = value
            elif counter == 5:
                inspection['actions'] = value

        print "inspection: %s" % inspection
        save_inspection(inspection)
        return inspection, inspection_resp
    except:
        logger.exception("Could not scrape inspection %s" % inspection.get('url', ''))
def test_mozillian_can_vouch(self):
    """
    Tests the vouching system's happy path.

    Kind of a big test because we want to:
    a. Test registration's happy path
    b. Test vouching
    c. Test account deletion
    """
    moz_client = self.mozillian_client

    r = moz_client.get(reverse('profile', args=[self.pending.username]))
    eq_(200, r.status_code)
    doc = pq(r.content)
    self.assertTrue(doc('form#vouch-form'))

    vouch_url = reverse('vouch')
    data = dict(vouchee=self.pending.get_profile().id)
    vouched_profile = moz_client.post(vouch_url, data, follow=True)
    self.pending = User.objects.get(pk=self.pending.pk)
    eq_(200, vouched_profile.status_code)

    r = moz_client.get(reverse('profile', args=[self.pending.username]))
    eq_(200, r.status_code)
    doc = pq(r.content)
    self.assertTrue(not doc('form#vouch-form'))

    eq_(self.pending.get_profile().vouched_by.user, self.mozillian, 'Credit given')
def get_results(self, response, sort=True):
    """Return pks of add-ons shown on search results page."""
    addons = pq(response.content)('#pjax-results div[data-addon]')
    pks = [int(pq(a).attr('data-addon')) for a in addons]
    if sort:
        return sorted(pks)
    return pks
def test_support_link(self):
    # Test no link if no support url or contribution.
    self.enable_waffle()
    r = self.client.get(self.add)
    eq_(pq(r.content)('.support-link').length, 0)

    # Test support email if no support url.
    self.webapp.support_email = {'en-US': '*****@*****.**'}
    self.webapp.save()
    r = self.client.get(self.add)
    doc = pq(r.content)('.support-link')
    eq_(doc.length, 1)

    # Test link to support url if support url.
    self.webapp.support_url = {'en-US': 'test'}
    self.webapp.save()
    r = self.client.get(self.add)
    doc = pq(r.content)('.support-link a')
    eq_(doc.length, 1)
    eq_(doc.attr('href'), 'test')

    # Test link to support flow if contribution.
    c = Contribution.objects.create(addon=self.webapp, user=self.user,
                                    type=amo.CONTRIB_PURCHASE)
    r = self.client.get(self.add)
    doc = pq(r.content)('.support-link a')
    eq_(doc.length, 1)
    eq_(doc.attr('href'), reverse('support', args=[c.id]))
def test_known_authors_filter(self):
    # There are a total of 11 revisions
    url = urlparams(reverse('dashboards.revisions', locale='en-US'),
                    authors=RevisionDashboardForm.ALL_AUTHORS)
    response = self.client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
    eq_(response.status_code, 200)
    page = pq(response.content)
    revisions = page.find('.dashboard-row')
    eq_(11, revisions.length)

    # Only testuser01 is in the Known Authors group, and has 2 revisions
    url = urlparams(reverse('dashboards.revisions', locale='en-US'),
                    authors=RevisionDashboardForm.KNOWN_AUTHORS)
    response = self.client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
    eq_(response.status_code, 200)
    page = pq(response.content)
    revisions = page.find('.dashboard-row')
    eq_(2, revisions.length)

    # Of the 11 revisions, 9 are by users not in the Known Authors group
    url = urlparams(reverse('dashboards.revisions', locale='en-US'),
                    authors=RevisionDashboardForm.UNKNOWN_AUTHORS)
    response = self.client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
    eq_(response.status_code, 200)
    page = pq(response.content)
    revisions = page.find('.dashboard-row')
    eq_(9, revisions.length)
def test_bug_709938_interests(self):
    testuser = self.user_model.objects.get(username='******')
    self.client.login(username=testuser.username, password=TESTUSER_PASSWORD)
    url = reverse('users.user_edit', args=(testuser.username,))
    response = self.client.get(url, follow=True)
    doc = pq(response.content)

    test_tags = [u'science,Technology,paradox,knowledge,modeling,big data,'
                 u'vector,meme,heuristics,harmony,mathesis universalis,'
                 u'symmetry,mathematics,computer graphics,field,chemistry,'
                 u'religion,astronomy,physics,biology,literature,'
                 u'spirituality,Art,Philosophy,Psychology,Business,Music,'
                 u'Computer Science']

    form = self._get_current_form_field_values(doc)
    form['user-interests'] = test_tags

    response = self.client.post(url, form, follow=True)
    eq_(200, response.status_code)

    doc = pq(response.content)
    eq_(1, doc.find('ul.errorlist li').length)
    assert ('Ensure this value has at most 255 characters'
            in doc.find('ul.errorlist li').text())
def get_perm(perm):
    jq = pq(base_url.format(perm, name))
    over = len(jq(".ws-ds-text"))
    if not (over):
        return [
            {
                "Name": pq(x)("td:eq(0)").text().encode("utf-8"),
                "Email": "{}@illinois.edu".format(
                    pq(x)("script")
                    .text()
                    .replace('displayIllinois("', "")
                    .replace("'", "")
                    .replace(")", "")
                    .replace('"', "")
                    .encode("utf-8")
                ),
            }
            for x in jq(".ws-ds-dept-details table tr:gt(0):odd")
        ]
    else:
        return concr(
            get_perm,
            map(lambda x: "{}{}".format(perm, x),
                get_character_permutations(num_characters=1)),
            max_workers=10,
        )
def get_perm(perm):
    jq = pq(
        "http://www1.appstate.edu/cgi-bin/cgiwrap/jmm/newcso4.pl",
        data={"last": name, "first": "{}*".format(perm), "type": "student"},
        method="post",
    )
    over = len(jq("p:contains('too many results')"))
    if not (over):
        return [
            {
                "Name": re.findall(r"name\: .*", pq(x).text())[0]
                .replace("name:", "")
                .strip()
                .encode("utf-8"),
                "Email": pq(x)("a[href^=mailto]").text(),
            }
            for x in jq("#maintext table tr:gt(0) td pre")
            if re.findall(r"name\: .*", pq(x).text())
        ]
    else:
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as thread:
            return list(
                itertools.chain(
                    *list(
                        thread.map(
                            get_perm,
                            ["{}{}".format(perm, x)
                             for x in get_character_permutations(num_characters=1)],
                        )
                    )
                )
            )
def test_post_bad_site(self):
    self.client.login(username='******', password='******')
    url = '/datasheet/bulk_import/'
    with open(self.fpath_bad_site) as f:
        response = self.client.post(url, {
            'datasheet_id': self.ds.pk,
            'organization': 'Coast Savers',
            'project_id': 1,
            'csvfile': f
        })
    d = pq(response.content)
    el = d("ul.errorlist li")
    self.assertEqual(response.status_code, 400, response.content)
    self.assertEqual(len(el), 1)
    self.assertTrue("TestSite3" in el[0].text_content(), el[0].text_content())
    self.assertTrue("is not in the database" in el[0].text_content(), el[0].text_content())

    # Now add the site
    ca = State.objects.get(name="California")
    testsite3 = Site(sitename="TestSite3", state=ca, county="Santa Cruz")
    testsite3.save()

    with open(self.fpath_bad_site) as f:
        response = self.client.post(url, {
            'datasheet_id': self.ds.pk,
            'organization': 'Coast Savers',
            'project_id': 1,
            'csvfile': f
        })
    d = pq(response.content)
    el = d("ul.errorlist li")
    self.assertEqual(response.status_code, 200, response.content)
    self.assertEqual(len(el), 0)
def parse_count_by_use(html, city):
    #dir_name = get_result_path()
    file_name1 = os.path.join(dir_name, 'shenzhen_gov_ershoufang_count_by_use_day_region.txt')
    file_name2 = os.path.join(dir_name, 'shenzhen_gov_ershoufang_count_by_use_month_region.txt')
    table1 = pq(html)("#ctl00_ContentPlaceHolder1_clientList1 tr")
    table2 = pq(html)("#ctl00_ContentPlaceHolder1_clientList2 tr")
    date1 = pq(html)('#ctl00_ContentPlaceHolder1_lblCurTime1').text()
    date2 = pq(html)('#ctl00_ContentPlaceHolder1_lblCurTime2').text()
    infos1 = infos2 = ''
    if os.path.exists(file_name1):
        fr = open(file_name1, 'r')
        infos1 = fr.read().decode('utf8')
        fr.close()
    if os.path.exists(file_name2):
        fr = open(file_name2, 'r')
        infos2 = fr.read().decode('utf8')
        fr.close()
    #print len(table1)
    #print len(table2)
    info1 = get_info(table1, date1, city)
    #print 'table2'
    info2 = get_info(table2, date2, city)
    #print 'end table2'
    if date1 + ',' + city not in infos1:
        fw1 = open(file_name1, 'a')
        fw1.write(','.join(info1).encode('utf8') + '\n')
        fw1.close()
        #print 'end date1'
    if date2 + ',' + city not in infos2:
        fw2 = open(file_name2, 'a')
        fw2.write(','.join(info2).encode('utf8') + '\n')
        fw2.close()
def test_perf_warning(self):
    eq_(self.addon.ts_slowness, None)
    doc = pq(self.client.get(self.detail_url).content)
    eq_(doc('.performance-note').length, 0)
    self.addon.update(ts_slowness=100)
    doc = pq(self.client.get(self.detail_url).content)
    eq_(doc('.performance-note').length, 1)
def cam_get_name(course_page):
    doc = pq(course_page)
    name = doc('title')[0].text[:doc('title')[0].text.find('|') - 1]
    return name
def getVideo_title(url):
    html = bilibili().getHtml(url)
    doc = pq(html)
    video_title = doc('#viewbox_report > h1 > span').text()
    return video_title
import requests
from pyquery import PyQuery as pq

# Zhihu "explore" (trending topics) page
url = 'https://www.zhihu.com/explore'
# Fake the User-Agent to avoid anti-scraping blocks
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
# Fetch the html with requests
html = requests.get(url, headers=headers).text
# Parse the html with pyquery
doc = pq(html)
items = doc('.explore-tab .feed-item').items()
for item in items:
    # Title
    question = item.find('h2').text()
    # Author
    author = item.find('.author-link-line').text()
    # Content
    answer = pq(item.find('.content').html()).text()
    # 'a' opens the text file in append mode; the encoding is utf-8
    file = open('explore.txt', 'a', encoding='utf-8')
    file.write('\n'.join([question, author, answer]))
    # Separate each trending topic with a line of 50 equals signs
    file.write('\n' + '=' * 50 + '\n')
    file.close()
def parse_base(item): if item is None: return None logger.info("*** base ***") company_key = item["key"] html = item["content"] #logger.info(html) d = pq(html) company_short_name = "" product_name = d('div.line-title> span> h1').clone().children().remove().end().text().strip() if product_name is None or product_name.strip() == "": product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip() temps = product_name.split("/",1) if len(temps) == 2: product_name = temps[0].strip() company_short_name = temps[1].strip() if company_short_name == "": company_short_name = product_name logger.info("product name: %s" % product_name) logger.info("company short name: %s" % company_short_name) company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:","") if company_name == "暂无" or company_name == "暂未收录": company_name = "" if company_name is None or company_name.strip() == "": try: company_name = d('div.des-more> h2').text().strip() except: pass if company_name == "暂无" or company_name == "暂未收录": company_name = "" company_name = name_helper.company_name_normalize(company_name) logger.info("company name: %s" % company_name) if company_short_name == "" and company_name == "": return establish_date = None str = d('div.des-more> div').eq(1).text().strip().replace("成立时间:","") result = util.re_get_result('(\d*)\.(\d*)',str) if result != None: (year, month) = result try: if int(month) > 12: month = "1" except: month = "1" establish_date = datetime.datetime.strptime("%s-%s-1" % (year,month), '%Y-%m-%d') logger.info("establish date: %s" % establish_date) locationId=0 str = d('span.loca').text().strip() #logger.info(str) result = util.re_get_result(u'(.*?)·(.*?)$',str) if result != None: (province, city) = result province = province.strip() city = city.strip() logger.info("location: %s-%s" % (province, city)) locationId = 0 result = parser_db_util.get_location(city) if result != None: locationId = result["locationId"] else: result = parser_db_util.get_location(province) if result != None: locationId = result["locationId"] if locationId == 0: loc1,loc2 = name_helper.get_location_from_company_name(company_name) if loc1 is not None: result = parser_db_util.get_location(loc1) if result != None: locationId = result["locationId"] logger.info("locationId: %d" % locationId) company_status = 2010 str = d('div.des-more> div').eq(2).text().strip() if str == "已关闭": company_status = 2020 logger.info("company_status: %d" % company_status) funding_type = 0 str = d("span.tag.bg-c").text().strip() logger.info("融资需求: %s" % str) if str == "融资需求 · 需要融资": funding_type = 8020 elif str == "融资需求 · 寻求收购": funding_type = 8020 logger.info("funding_type=%d" % funding_type) try: brief = d("h2.seo-slogan").text().strip() except: brief = "" logger.info("brief: %s" % brief) if brief.find("暂未收录"): brief = "" field = d("span.scope.c-gray-aset> a").eq(0).text().strip() logger.info("field: %s" % field) sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip() logger.info("sub field: %s" % sub_field) tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ",",") logger.info("tags: %s" % tags) desc = d("div.des,div.desc,div.introduction,div.abstract,div.summary").text().\ replace("购买数据请联系","").replace('*****@*****.**',"").replace("itjuzi是一家数据服务公司","").strip() logger.info("********desc: %s" % desc) #logo logo = d("div.pic >img").attr("src") #if logo: # logo = logo.replace("http://", "https://") logger.info("logo: %s", logo) # website = d('div.link-line> a').text().strip() # if website 
is None or website == "": # website = d('div.link-line> a.webTink').text().strip() # if website is None or website == "": # try: # logger.info("here") # website = d('div.link-line> span.weblink> a').eq(1).text().strip() # logger.info(website) # except: # pass artifacts = [] for ty in [1,2,3]: if ty == 1: was = d('div.link-line> a') else: was = d('div.link-line> span.weblink,span.webTink> a') for wa in was: webs =[] try: website = pq(wa).attr("href").strip() if website=="http://%e6%9a%82%e6%97%a0" or website == "http://tt": website = "" website = url_helper.url_normalize(website) logger.info("website: %s" % website) webs.append(website) # else: # website = pq(wa).text().strip() except: pass try: website = pq(wa).text().strip() if website == "http://%e6%9a%82%e6%97%a0" or website == "http://tt": website = "" website = url_helper.url_normalize(website) logger.info("website: %s" % website) webs.append(website) # else: # website = pq(wa).text().strip() except: pass # # if website=="http://%e6%9a%82%e6%97%a0": # website = "" # website = url_helper.url_normalize(website) # logger.info("website: %s" % website) # artifacts = [] for website in webs: type, app_market, app_id = url_helper.get_market(website) if type == 4010: flag, domain = url_helper.get_domain(website) if flag is not None: if flag is False: domain = None artifacts.append({ "type":4010, "name":product_name, "desc":desc, "link":website, "domain": domain }) elif type == 4020: domain = app_id if domain is not None: artifacts.append({ "type": 4020, "name": product_name, "desc": None, "link": website, "domain": website }) elif type == 4030: domain = app_id if domain is not None: artifacts.append({ "type": 4030, "name": product_name, "desc": None, "link": website, "domain": None }) elif type == 4040: domain = app_id if domain is not None: artifacts.append({ "type":4040, "name":product_name, "desc":desc, "link":website, "domain": domain }) elif type == 4050: domain = None if app_market == 16010 or app_market == 16020: android_app = parser_db_util.find_android_market(app_market, app_id) if android_app: domain = android_app["apkname"] else: domain = app_id if domain is not None: artifacts.append({ "type":4050, "name":product_name, "desc":desc, "link":website, "domain": domain }) #获投状态 roundStr = d('span.t-small.c-green').text().replace("(","").replace(")","").replace("获投状态:","").strip() fundingRound,roundStr = itjuzi_helper.getFundingRound(roundStr) logger.info("获投状态: %d, %s", fundingRound, roundStr) logger.info("") return { "shortName": company_short_name, "fullName": company_name if company_name is not None and company_name.strip() != "" else None, "productName": product_name, "description": desc, "brief": brief, "round": fundingRound, "roundDesc": roundStr, "companyStatus": company_status, "fundingType": funding_type, "locationId": locationId, "establishDate": establish_date, "logo": logo, "sourceId": company_key, "field": field, "subField": sub_field, "tags": tags, "type":41010, "artifacts":artifacts }
def parse_artifact(item): if item is None: return None artifacts = [] company_key = item["key"] html = item["content"] #logger.info(html) d = pq(html) #artifact logger.info("*** artifact ***") lis = d('ul.list-prod> li> div.on-edit-hide') for li in lis: l = pq(li) strtype = l('h4> span.tag').text().strip() #logger.info(strtype) if strtype != u"网站" and strtype != "app": continue link = l('h4> b> a').attr("href").strip() if link == "": continue domain = None type = None if strtype == u"网站": type, app_market, app_id = url_helper.get_market(link) if type == 4010: link = url_helper.url_normalize(link) flag, domain = url_helper.get_domain(link) if flag is None: continue if flag is False: domain = None if type != 4010: type, app_market, app_id = url_helper.get_market(link) if type == 4040: domain = app_id elif type == 4050: if app_market == 16010 or app_market == 16020: android_app = parser_db_util.find_android_market(app_market, app_id) if android_app: domain = android_app["apkname"] else: domain = app_id if domain is None and type !=4030 and type != 4020: continue name = l('h4> b').text().strip() desc = l('p').text().strip() logger.info("type: %s, name: %s, link: %s, desc: %s" % (type, name,link,desc)) artifact = { "type":type, "name":name, "desc":desc, "link":link, "domain": domain } artifacts.append(artifact) logger.info("") return artifacts
def cam_get_requirements(course_page):
    doc = pq(course_page)
    temptext = doc('fieldset.collapsible.collapsed.group-entry-requirements').find('p').eq(0).text()
    alevel = temptext[:temptext.find('\n')]
    ib = temptext[(temptext.find('\n') + 2):]
    return (alevel + '; ' + ib)
def cam_get_description(course_page):
    doc = pq(course_page)
    return "placeholder description"
from pyquery import PyQuery as pq

## Initialization
# From a string
# doc = pq(html)
# print(doc('a'))

# From a URL
# doc = pq('https://github.com')
# print(doc('title'))

# From a file
# doc = pq(filename='test.html')
# print(doc('p'))

## Basic CSS selectors
with open('test.html', encoding='utf-8') as f:
    html = f.read()
doc = pq(html)
# print(doc('#container .list li'))
# print(type(doc('#container .list li')))

## Finding nodes
# Descendant nodes: find
# item = doc('.list')
# print(item)
# # print(type(item))
# lis = item.find('li')
# # print(type(lis))
# print(lis)

# Direct children: children
# lst = item.children()
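# NOTE: The walkthrough above stops at children(). The following is only an
# illustrative sketch of the find()/children() distinction; the inline HTML
# fragment is an assumption standing in for the test.html file the tutorial uses.
from pyquery import PyQuery as pq

snippet = '''
<div id="container">
  <ul class="list">
    <li class="item-0">first</li>
    <li class="item-1"><a href="link2.html">second</a></li>
  </ul>
</div>
'''
sketch_doc = pq(snippet)
item = sketch_doc('.list')
# find() searches all descendants, so it also reaches the nested <a>.
print(item.find('a'))
# children() returns only direct children; a selector can filter them.
print(item.children('.item-0'))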
from pyquery import PyQuery as pq

doc = pq(url='https://www.toutiao.com/')
print(doc('title').text())

# This does the same as the above
# import requests
#
# doc = pq(requests.get('https://www.toutiao.com/').text)
# print(doc('title'))
def ox_get_requirements(course_page):
    doc = pq(course_page)
    alvl = doc('div#content-tab--2').children().filter('ul').children().filter('li').eq(0).text()
    hghr = doc('div#content-tab--2').children().filter('ul').children().filter('li').eq(1).text()
    ib = doc('div#content-tab--2').children().filter('ul').children().filter('li').eq(2).text()
    return (alvl + "; " + hghr + "; " + ib)
def get_page(self, url):
    return pq(self.s.get(url).text)
import requests
from pyquery import PyQuery as pq
from pathlib import Path
import ff14_tool_fish.fishinfo as fishinfo

url = 'https://cn.ff14angler.com/index.php'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
f = requests.get(url, headers=headers)
html = f.text
html.encode('utf-8')
ff = pq(html)
sel = ff('select[name=fish]').find('option').items()
path = "F:/htmls/fish/"
for i in sel:
    id = i.attr('value')
    if id == '0':
        continue
    elif int(id) <= 3157:
        id_file = Path(path + id + ".html")
        if id_file.exists():
            continue
    print(id)
    url1 = 'https://cn.ff14angler.com/fish/' + id
    print(url1)
    html1 = requests.get(url1).text
    html1.encode('utf-8')
    my_file = Path(path)
    if my_file.exists():
        fishinfo.write_to_file(path + id + ".html", html1)
def _get_total_cases_by_region(self, href, html): lga_norm_map = { 'Locally Acquired—close contact with confirmed case': DataTypes.SOURCE_CONFIRMED, 'Locally acquired—no known contact': DataTypes.SOURCE_COMMUNITY, 'Locally acquired—contact known': DataTypes.SOURCE_CONFIRMED, 'Interstate acquired': DataTypes.SOURCE_INTERSTATE, 'Overseas acquired': DataTypes.SOURCE_OVERSEAS, 'Under investigation': DataTypes.SOURCE_UNDER_INVESTIGATION, 'Total': DataTypes.TOTAL, } hhs_norm_map = { 'Total cases': DataTypes.TOTAL, 'Active cases': DataTypes.STATUS_ACTIVE, 'Total recovered': DataTypes.STATUS_RECOVERED, 'Total deaths': DataTypes.STATUS_DEATHS } du = self._get_date(href, html) if href == self.STATS_BY_REGION_URL_2: regions = [] # Add by HHS table # Total cases | Active cases | Total recovered | Total deaths table = pq(html)('#QLD_Cases_By_HHS')[0] headers = [ hhs_norm_map[pq(i).text().strip().strip('[12345]').strip()] for i in table[0][0][1:] ] for tr in table[3]: hhs = pq(tr[0]).text().strip().strip('*') for xx, td in enumerate(tr[1:]): value = int(pq(td).text().strip().replace(',', '')) regions.append( DataPoint(region_schema=Schemas.HHS, region_parent='AU-QLD', region_child=hhs.title(), datatype=headers[xx], value=value, date_updated=du, source_url=href)) # Add by LGA table # Overseas acquired | Locally acquired—contact known | # Locally acquired—no known contact | Interstate acquired | # Under investigation | Total table = pq(html)('table#LGA')[0] headers = [ lga_norm_map[pq(i).text().strip()] for i in table[0][0][1:] ] for tr in table[1][1:]: lga = pq(tr[0]).text().split('(')[0].strip() for xx, td in enumerate(tr[1:]): value = int(pq(td).text().strip().replace(',', '')) regions.append( DataPoint(region_schema=Schemas.LGA, region_parent='AU-QLD', region_child=lga.title(), datatype=headers[xx], value=value, date_updated=du, source_url=href)) return regions else: table = pq(pq(html)('table.table.table-bordered.header-basic')) if not table: return None if not 'Total confirmed' in pq(table[0]).text().replace( '\n', ' ').replace(' ', ' '): #print("NOT TOTAL:", table.text()) return None regions = [] for tr in table('tr'): if 'total' in pq(tr).text().lower(): continue tds = pq(tr)('td') for x, td in enumerate(tds): if x == 0: # HACK: one day had "271" prefixed to "North West" hhs_region = pq(td).text().strip().lstrip( '271*').strip().strip('*') elif x >= 1: if len(tds) > 2: # New format: # HHS* # Active cases # Recovered cases # Deaths # Total confirmed cases to date datatype = [ DataTypes.STATUS_ACTIVE, DataTypes.STATUS_RECOVERED, DataTypes.STATUS_DEATHS, DataTypes.TOTAL ][x - 1] else: datatype = DataTypes.TOTAL try: value = int(pq(td).text().strip()) regions.append( DataPoint(region_schema=Schemas.HHS, region_parent='AU-QLD', region_child=hhs_region.title(), datatype=datatype, value=value, date_updated=du, source_url=href)) except ValueError: # WARNING!!! pass return regions
def fetchMiaopaiData(): uname = '/app/yxtk/script/useragent.txt' f1 = open("/app/yxtk/script/data/onlylady.sql",'w',buffering=-1) with open(uname) as f: useragents = f.readlines() userAgent = random.choice(useragents) headers = { 'Accept':'application/json, text/javascript, */*; q=0.01', 'Accept-Encoding':'gzip, deflate, sdch', 'Accept-Language':'zh-CN,zh;q=0.8', 'Cache-Control':'max-age=0', 'Connection':'keep-alive', 'Host':'http://streetstyle.onlylady.com/', 'Referer':'http://streetstyle.onlylady.com/', 'Upgrade-Insecure-Requests':'1', 'X-Requested-With':'XMLHttpRequest', 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36' } while True: for j in range(1,12): time.sleep(1) if j == 1: url = 'http://streetstyle.onlylady.com/' if j == 2: url = 'http://fashion.onlylady.com/' if j == 3: url = 'http://show.onlylady.com/' if j == 4: url = 'http://luxury.onlylady.com/' if j == 5: url = 'http://accessories.onlylady.com/' if j == 6: url = 'http://jewelry.onlylady.com/' if j == 7: url = 'http://watch.onlylady.com/' if j == 8: url = 'http://hufu.onlylady.com/' if j == 9: url = 'http://zhuangban.onlylady.com/' if j == 10: url = 'http://hair.onlylady.com/' if j == 11: url = 'http://body.onlylady.com/' print url; try: encoding_support = ContentEncodingProcessor req = urllib2.Request(url) res = urllib2.urlopen(req) html = res.read() res.close() doc = pq(html) divs = doc('div.c3_r_item') for div in divs.items(): clo_url = div('div.bt').children('a').attr('href') m = re.findall(r'(\w*[0-9]+)\w*',str(clo_url)) clo_id = str(m[2]) clo_pic = div('div.img').children('a').children('img').attr('src') print clo_pic clo_title = div('div.bt').children('a').text() clo_title = "\" "+clo_title.replace('\"','\'')+" \"" clo_title = clo_title.replace("\n",'') clo_title = clo_title.replace(",",',') print clo_title clo_date = div('div.date').text() imageUrl=qiniuUpdate(clo_pic.strip()) req = urllib2.Request(clo_url) res = urllib2.urlopen(req) html1 = unicode(res.read(),'GBK') html1 = re.sub(r'<script>(.*?)</script>','',html1) res.close() doc1 = pq(html1) con = doc1('div.detail_content') con('img').removeAttr("style") con('img').removeAttr("width") con('img').removeAttr("height") con('img').attr("style","width:100%") p = con('div.detail_content').html() if p is None or p =='': continue p = re.sub(r' ','',p) p = re.sub(r'<style.*>([\S\s\t]*?)</style>','',p) p = re.sub(r'<script.*>([\S\s\t]*?)</script>','',p) p = re.sub(r'<p[^>]*>','<p>',p) p = re.sub(r'<(?!img|br|p|/p).*?>','',p) p = re.sub(r'\r','',p) p = re.sub(r'\n','',p) p = re.sub(r'\s','',p) p = re.sub(r'src=',' src=',p) #newqiniu = pq(p) #imgs = newqiniu('img') #for image in imgs.items(): #imgurl = image('img').attr('src') #newimgurl = qiniuUpdate(imgurl.strip()) #p = p.replace(str(imgurl),str(newimgurl)) sql = "INSERT INTO 3rd_clothes(id,creator,modifier,create_time,modify_time,is_deleted,clothes_id,title,clothes_date,img_url,sort,user_id,thumbnail_url,source,tag,content,push_flag,recommend_flag,view_status)VALUES(NULL,'sys','sys',now(),now(),'n','"+clo_id+"'," +clo_title.strip() + ",'" + clo_date.strip() +"','"+clo_pic.strip()+"','0','','"+imageUrl+"','onlylady','','"+p.strip()+"',0,NULL,0);"+'\n' print sql f1.writelines(sql) file_name = urllib2.unquote(clo_pic.strip()).decode('utf8').split('/')[-1] os.remove('/app/yxtk/script/'+file_name) except Exception as e: print e break f1.close()
from pyquery import PyQuery as pq

a = '''
<body>
<h><a href='www.biaoti.com'>tomtao626</a></h>
<p class='vota'>段落1</p>
<p>段落2</p>
</body>
'''

# Extracting content
"""
Extract a tag's text with .text()
Extract a tag's attribute value with .attr()
.text() also returns the text of descendant nodes; to work with only the child
nodes use .html(), which may return the child nodes' full tags
"""
doc = pq(a)

# Extract tag content
print(doc('h').text())  # 'tomtao626'
print(doc('h').html())  # '<a href="www.biaoti.com">tomtao626</a>'
print(doc('body').html())  # '\n <h><a href="www.biaoti.com">tomtao626</a></h>\n <p>段落1</p>\n <p>段落2</p>\n'
print(doc('p').text())  # '段落1 段落2'
print(doc('p').text().split(' '))  # ['段落1', '段落2']
print(doc('p:nth-of-type(1)').text())  # '段落1'
print(doc('body').text())  # 'tomtao626 段落1 段落2'

# Extract tag attributes
print(doc('h a').attr('href'))  # 'www.biaoti.com'
print(doc('p').attr('class'))

# Identifying tags
# Select by the tag name alone
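# NOTE: An illustrative continuation of the snippet above (it reuses the `doc`
# object defined there), not part of the original: selecting nodes by tag name
# alone and iterating over them with .items(), which yields PyQuery objects.
for p in doc('p').items():
    # .text() and .attr() work per matched node.
    print(p.text(), p.attr('class'))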
def _get_total_dhr(self, href, html): # Technically, this is a bad idea adding other irrelevant values to # DHR here..but separating it everywhere thru this class would be # far worse! sbr2_map = { 'Number of confirmed cases': DataTypes.TOTAL, 'Last 24 hours': DataTypes.NEW, 'Active cases': DataTypes.STATUS_ACTIVE, 'Recovered': DataTypes.STATUS_RECOVERED, 'Current hospitalisations': DataTypes.STATUS_HOSPITALIZED, 'Deaths': DataTypes.STATUS_DEATHS } du = self._get_date(href, html) if href in (self.STATS_BY_REGION_URL, self.STATS_BY_REGION_URL_2): # New format as of 22 April r = [] table = pq(html)('#QLD_Cases') if table: for tr in table[1:]: datatype = sbr2_map[pq( tr[0]).text().strip().strip('[12345]').strip()] if datatype is None: continue r.append( DataPoint(region_schema=Schemas.ADMIN_1, region_parent='AU', region_child='AU-QLD', datatype=datatype, value=int(pq(tr[1]).text().strip()), date_updated=du, source_url=href)) deaths = pq(html)('.qh-fact-wrapper .lost span') if deaths: r.append( DataPoint(region_schema=Schemas.ADMIN_1, region_parent='AU', region_child='AU-QLD', datatype=DataTypes.STATUS_DEATHS, value=int(pq(deaths[0]).text().strip()), date_updated=du, source_url=href)) return r else: # As of 9th April, the format has added recovered/deaths etc info totals_dict = self.__get_totals_from_table(html) if not totals_dict: return [] r = [] r.append( DataPoint(region_schema=Schemas.ADMIN_1, region_parent='AU', region_child='AU-QLD', datatype=DataTypes.STATUS_RECOVERED, value=totals_dict['recovered'], date_updated=du, source_url=href)) r.append( DataPoint(region_schema=Schemas.ADMIN_1, region_parent='AU', region_child='AU-QLD', datatype=DataTypes.STATUS_DEATHS, value=totals_dict['deaths'], date_updated=du, source_url=href)) r.append( DataPoint(region_schema=Schemas.ADMIN_1, region_parent='AU', region_child='AU-QLD', datatype=DataTypes.STATUS_ACTIVE, value=totals_dict['active'], date_updated=du, source_url=href)) return r
import requests, json, re
from pyquery import PyQuery as pq

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
}
r = requests.get(
    'https://zh.wikipedia.org/wiki/%E8%87%BA%E7%81%A3%E8%A1%8C%E6%94%BF%E5%8D%80%E4%BA%BA%E5%8F%A3%E5%88%97%E8%A1%A8',
    headers=headers)
doc = pq(r.text)
cities = []
for i in doc('.wikitable').eq(0).find('tbody tr'):
    tds = doc(doc(i).find('td'))
    if len(tds.eq(1).text()):
        cities.append({
            'COUNTYNAME': tds.eq(1).text(),
            'COUNTYENG': tds.eq(2).text(),
            'COUNTYTYPE': tds.eq(3).text(),
            'COUNTYFLAG': 'https:%s' % (tds.eq(4).find('img').eq(0).attr('src')),
            'COUNTYAREA': tds.eq(5).text(),
        })
def _get_total_cases_tested(self, href, html): #if href in (self.STATS_BY_REGION_URL, self.STATS_BY_REGION_URL_2): # New format as of 22 April tested = pq(html)('.qh-fact-wrapper .tested span') if tested: return self._extract_number_using_regex( compile('([0-9,]+)'), pq(tested[0]).text().strip(), region_schema=Schemas.ADMIN_1, region_parent='AU', region_child='AU-QLD', date_updated=self._get_date(href, html), datatype=DataTypes.TESTS_TOTAL, source_url=href) # NOTE: This is actually a different page to the press releases! # I needed to get some of these from web.archive.org. # Some of the stats may be a day or more old, # so will need to add the date of the stat as well(!) value = self._extract_number_using_regex( compile('Total samples tested: <strong>([0-9,]+)'), html, region_schema=Schemas.ADMIN_1, region_parent='AU', region_child='AU-QLD', date_updated=self._get_date(href, html), datatype=DataTypes.TESTS_TOTAL, source_url=href) if value: return value # Find the start of the # samples tested table th_regex = compile( '<th id="table[^"]+">[^<]*?As at ([^<]+)[^<]*?</th>[^<]*' '<th id="table[^"]+">[^<]*?(?:Samples|Patients) tested[^<]*?</th>', DOTALL | MULTILINE) match = th_regex.search(html) if not match: #print("NOT INITIAL MATCH!") return None # WARNING!!! # Get the date - it's in format "30 March 2020" date_updated = self._extract_date_using_format(match.group(1).strip()) slice_from = match.end(1) # CHECK ME! html = html[slice_from:] # Get the # samples total value = self._extract_number_using_regex( compile( # Total number changed from being enclosed in a <strong> # tag to a <b> tag, so changed to be as broad as NSW # <strong>Total</strong></td> # <td headers="table59454r1c2"><b>37,334</b></td> r'<td[^>]*?>(?:<[^</>]+>)?Total(?:</[^<>]+>)?</td>' r'[^<]*?<td[^>]*?>.*?([0-9,]+).*?</td>', MULTILINE | DOTALL), html, region_schema=Schemas.ADMIN_1, region_parent='AU', region_child='AU-QLD', date_updated=date_updated, datatype=DataTypes.TESTS_TOTAL, source_url=href) if not value: #print("NOT SECOND MATCH!") return None # WARNING! return value
def test_send_message_to_prefilled(self):
    url = urlparams(reverse('messages.new'), to=self.user2.username)
    response = self.client.get(url, follow=True)
    eq_(200, response.status_code)
    eq_(self.user2.username,
        pq(response.content)('#id_to')[0].attrib['value'])
def test_public_stats_stats_notes(self):
    addon = amo.tests.addon_factory(public_stats=True)
    response = self.client.get(self.get_public_url(addon))
    assert pq(response.content)('#stats-note h2').length == 1
# Using PyQuery
from pyquery import PyQuery as pq

html = '''
<ul>
    <li id = "nya">item1</li>
    <li>item2</li>
    <li>item3</li>
    <li>item4</li>
</ul>
'''

# Create an instance
dom = pq(html)

# Use a CSS selector
print("①", dom("#hoge"))

# Manipulate the DOM
dom("li").each(lambda i, node: pq(node).attr(class_="fuga"))
print("②", dom)

# Scraping
dom = pq("http://www.python.org/about/")
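# NOTE: Illustrative follow-up only, not part of the original snippet: once the
# page above has been fetched, the same CSS-selector API applies to the remote
# DOM. The 'h1' and 'a' selectors here are examples, not something the original asserts.
print(dom('h1').text())
for a in dom('a').items():
    print(a.attr('href'))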
def test_no_markup_in_message_list(self):
    response = self._test_send_message_to(self.user2.username)
    eq_(
        pq(response.content)('read').text(),
        pq(response.content)('read').html())
def test_page(self):
    self._step()
    r = self.client.get(self.url)
    eq_(r.status_code, 200)
    eq_(pq(r.content)('#submit-details').length, 1)
def test_send_message_page(self):
    # Make sure page loads.
    response = self.client.get(reverse('messages.new'), follow=True)
    eq_(200, response.status_code)
    assert len(pq(response.content)('#id_message'))
def test_hotness_ignore(self):
    # Defaults to ignore compat mode for Fx v10, both are compatible.
    res = self.client.get(self._url(version='10.0'))
    assert res.status_code == 200
    assert pq(res.content)('.featured-addons').length == 2
def _get_review_detail_page_source(self, url):
    r = requests.get(url, headers=self.headers, proxies=PROXY)
    r.encoding = 'utf-8'
    page_source = r.text
    return pq(fromstring(page_source))
def test_hotness_strict(self):
    # Defaults to strict compat mode, both are within range.
    res = self.client.get(self._url())
    assert res.status_code == 200
    assert pq(res.content)('.featured-addons').length == 2
def test_page(self):
    self._step()
    r = self.client.get(self.url)
    eq_(r.status_code, 200)
    eq_(pq(r.content)('#upload-file').length, 1)
def test_no_version(self):
    """Don't display a version number for themes."""
    r = self.client.get(self.url)
    assert pq(r.content)('h1 .version') == []
def test_hotness_strict_filtered(self):
    # Defaults to strict compat mode, one is within range.
    res = self.client.get(self._url(version='6.0'))
    assert res.status_code == 200
    assert pq(res.content)('.featured-addons').length == 1
    self.assertContains(res, self.addon2.name)