def OilPrice(request):
    driver = webdriver.PhantomJS()
    driver.get('http://www.bitauto.com/youjia/')
    html_source = driver.page_source
    # print html_source
    soup = BeautifulSoup(html_source)
    sichuan = soup.select("body > div.bt_page > div.bt_page > div > div.oilTableOut > table > tbody > tr:nth-of-type(11) > th:nth-of-type(2) > a")
    # No. 90 gasoline
    type1 = soup.select("body > div.bt_page > div.bt_page > div > div.oilTableOut > table > tbody > tr:nth-of-type(11) > td:nth-of-type(5)")
    # No. 93 gasoline
    type2 = soup.select("body > div.bt_page > div.bt_page > div > div.oilTableOut > table > tbody > tr:nth-of-type(11) > td:nth-of-type(6)")
    # No. 97 gasoline
    type3 = soup.select("body > div.bt_page > div.bt_page > div > div.oilTableOut > table > tbody > tr:nth-of-type(11) > td:nth-of-type(7)")
    # No. 0 diesel (yuan/litre)
    type4 = soup.select("body > div.bt_page > div.bt_page > div > div.oilTableOut > table > tbody > tr:nth-of-type(11) > td:nth-of-type(8)")
    sichuan = str(sichuan[0].get_text())
    type1 = str(type1[0].get_text())
    type2 = str(type2[0].get_text())
    type3 = str(type3[0].get_text())
    type4 = str(type4[0].get_text())
    print sichuan.encode('utf8')
    oilprice = Oil(city_name=sichuan.encode('utf8'), typeone_price=type1,
                   typetwo_price=type2, typethree_price=type3,
                   typefour_price=type4, date=datetime.datetime.now().date())
    oilprice.save()
    print driver.current_url
    driver.quit()  # was `driver.quit`: without the call the browser process is never closed
    responseHtml = '<html><body>Scraping Oil Price Successfully!</body></html>'
    return HttpResponse(responseHtml)
def getWeibos(self, keyword, page=1, count=None):
    url = 'http://t.hexun.com/k/topic.html?type=1&value=%s&pg=%d' % (
        json.dumps(keyword).replace('\\', '%').replace('"', ''), page)
    result = WeiboCrawler.request(self, url, self.headers)
    if 'result' in result and result['result']:
        infos = result['info'].decode('gb2312')
        soup = BeautifulSoup(infos)
        total_soup = soup.select('.headerR1')[0]
        total_num = total_soup.get_text().split('共')[-1].split('条')[0].strip()
        return_val = {'total_count': int(total_num), 'msgs': []}
        allmsgs = []
        msgs_soup = soup.select('.nr_con')
        for msg_soup in msgs_soup:
            avatar = 'http://t.hexun.com%s' % msg_soup.select('.nr_conLa > a')[0].get('href')
            nickandtext = msg_soup.select('.nr_shuo')[0].get_text().split(':')
            nickname = nickandtext[0]
            text = nickandtext[1]
            ts = msg_soup.select('.nr_tan > h3 > a')[0].get_text()
            allmsgs.append({
                'avatar': avatar,
                'nickname': nickname,
                'text': text,
                'datetime': ts,
            })
        return_val['msgs'] = allmsgs
        return return_val
def get_links_from(channel, pages):
    # http://bj.ganji.com/jiaju/a3o11/
    # http://bj.ganji.com/wupinjiaohuan/o3/  -- two different URL patterns
    if channel in ['http://bj.ganji.com/xuniwupin/', 'http://bj.ganji.com/qitawupin/',
                   'http://bj.ganji.com/ershoufree/', 'http://bj.ganji.com/wupinjiaohuan/']:
        list_view = '{}o{}/'.format(channel, str(pages))
        wb_data = requests.get(list_view, headers=headers)
        # time.sleep(1)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        if soup.find('ul', 'pageLink clearfix'):
            for link in soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dt > div > a'):
                item_link = link.get('href')
                url_list.insert_one({'url': item_link})
                print(item_link)
        else:
            # pass
            print('duplicate page')
    else:
        list_view = '{}a3o{}/'.format(channel, str(pages))
        wb_data = requests.get(list_view, headers=headers)
        # time.sleep(1)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        if soup.find('ul', 'pageLink clearfix'):
            for link in soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a'):
                item_link = link.get('href')
                url_list.insert_one({'url': item_link})
                print(item_link)
        else:
            # pass
            print('duplicate page')
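# A minimal sketch of the MongoDB setup the snippet above assumes: `url_list`
# (and `headers`) are not defined in the excerpt, so every name below is an
# assumption, not the original definition.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)  # assumed local MongoDB instance
ganji = client['ganji']                   # hypothetical database name
url_list = ganji['url_list']              # collection receiving {'url': ...} documents
headers = {'User-Agent': 'Mozilla/5.0'}   # placeholder request headers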
def get_page_info_from(url, data=None):
    web_data = requests.get(url)
    if web_data.status_code == 404:
        pass
    else:
        web_data.encoding = "utf-8"
        soup_page = BeautifulSoup(web_data.text, 'lxml')
        page_tips = soup_page.select('div.newstop span')
        page_title = soup_page.select('div.newstop h2')[0].text
        page_contents = soup_page.select('div.lhnewcon p')
        page_imgs = soup_page.select('div.lhnewcon img')
        # content = ''
        tip = ''
        content_list = []
        imgs = []
        for page_content in page_contents:
            # content += page_content.get_text() + ' '
            content_list.append(page_content.get_text())
        for page_tip in page_tips:
            tip += page_tip.get_text() + ' '
        for page_img in page_imgs:
            templink = page_img.get('src')
            templink = templink.replace('../../..', 'http://ilonghua.sznews.com')
            imgs.append(templink)
        data = {
            'title': page_title.strip(),
            # 'content': content,
            'tip': tip,
            'contents': content_list,
            'imgs': imgs,
            # 'url': url
        }
        return data
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        if year < 2006:  # The oldest year for audit reports
            continue
        url = AUDIT_REPORTS_URL.format(year=year)
        doc = BeautifulSoup(utils.download(url))
        results = doc.select("div#content li")
        for result in results:
            report = audit_report_from(result, url, year, year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports
    doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    results = doc.select("div#content li")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the Peer Review
    doc = BeautifulSoup(utils.download(PEER_REVIEWS_URL))
    result = doc.find("div", id='content').find("a", text=True)
    report = peer_review_from(result, year_range)
    inspector.save_report(report)
def get_items_info(sellerType):
    item_urls = get_link_list(sellerType)
    counter = 1  # item counter
    for item_url in item_urls:
        wb_data = requests.get(item_url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        title = soup.title.text
        price = soup.select('#content span.price')
        area = soup.select('span.c_25d')
        date = soup.select('li.time')
        totalView = get_views_num(item_url)
        # print(title, price, area, date, totalView, sellerType, sep='\n------------\n')
        data = {
            '序号': counter,
            '标题': title,
            '价格': price[0].text,
            '地区': None if area == [] else list(area[0].stripped_strings),  # guard against items with no area info
            '日期': date[0].text,
            '浏览量': totalView,
            '卖家类型': '个人' if sellerType == 0 else '商家',
        }
        counter += 1
        print(data)
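# `get_link_list` and `get_views_num` are called above but not shown; a sketch
# of what they might look like, assuming a 58.com-style listing page -- the URL
# and selectors here are illustrative guesses, not the original implementation.
import requests
from bs4 import BeautifulSoup

def get_link_list(sellerType):
    # hypothetical list URL; sellerType 0 = personal seller, 1 = merchant
    url = 'http://bj.58.com/pbdn/{}/'.format(sellerType)
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    return [a.get('href') for a in soup.select('td.t a.t') if a.get('href')]

def get_views_num(item_url):
    # placeholder: the real function likely queries a separate view-counter
    # endpoint; this only looks for an on-page counter element
    soup = BeautifulSoup(requests.get(item_url).text, 'lxml')
    tag = soup.select_one('#totalcount')
    return tag.text if tag else '0'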
def getPageItems():
    for href in content_list.find({}, {'url': 1}):
        if href in url_list.find({}, {'url': 1}):
            print 'already crawled'
        else:
            url = href.get('url')
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            pageCode = response.read().decode('utf-8')
            soup = BeautifulSoup(pageCode, 'lxml')
            if not pageCode:
                print "page failed to load...."
                return None
            title = soup.select('#main > div.col.detailPrimary.mb15 > div.col_sub.mainTitle > h1')
            date = soup.select('li.time')
            price = soup.select('span.price.c_f50')
            pattern = re.compile(r'\d+')
            result = re.findall(pattern, str(title))
            data = {
                'title': result[1],
                'date': date[0].text,
                'price': price[0].text,
                'url': url,
            }
            content_list.insert_one(data)
            print data
def scrape():
    hold = []
    hold.append(['playername', 'points'])
    for page in build_fp_pages():
        r = requests.get(page)
        soup = BS(r.text, 'html.parser')
        if 'espn' in page:
            for row in soup.select('.playerTableTable tr'):
                try:
                    p_check = row.findAll(class_="playertablePlayerName")
                    if len(p_check) == 0:
                        continue
                    defense_name = p_check[0].text
                    defense_points = row.find_all('td')[-1].text
                    defense = unicode_normalize(defense_name, defense_points)
                    hold.append(defense)
                except Exception, e:
                    print 'Error scraping ESPN data: ' + str(e)
        else:
            for row in soup.select('tr.mpb-available'):
                try:
                    hold.append([str(row.find_all('td')[0].text),
                                 str(row.find_all('td')[-1].text)])
                except Exception, e:
                    print 'Error scraping FanPros data: ' + str(e)
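# The helpers `build_fp_pages` and `unicode_normalize` are referenced above
# but not defined in the excerpt; a plausible Python 2 sketch under those
# assumptions (the URLs are illustrative, not the originals):
import unicodedata

def build_fp_pages():
    # hypothetical list of ranking pages to scrape
    return [
        'http://games.espn.com/ffl/freeagency',
        'https://www.fantasypros.com/nfl/rankings/dst.php',
    ]

def unicode_normalize(name, points):
    # fold accented characters to plain ASCII so rows compare cleanly
    clean = lambda s: unicodedata.normalize('NFKD', unicode(s)).encode('ascii', 'ignore')
    return [clean(name), clean(points)]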
def _find_image(self, detail_url):
    """ Find URL of image from detail page. """
    detail_page = urllib2.urlopen(detail_url)
    soup = BeautifulSoup(detail_page.read())
    file = open(path, "a")
    file.write(detail_url)
    file.write(",")
    for img in soup.find_all("img"):
        if img.get("alt") == word.decode("shift-jis"):
            print "extract image url : " + img.get("src")
            file.write(img.get("src"))
            file.write(",")
    if len(soup.select("td.t-left a")) == 0:
        file.write("none.")
        file.write("\n")
    else:
        origin = str(soup.select("td.t-left a")[0].get("href"))
        file.write(origin)
        file.write("\n")
    file.close()
def get_question_info(question_link):
    print('processing: {}'.format(question_link.text))
    res = requests.get(requests.compat.urljoin('http://www.mypythonquiz.com/',
                                               question_link.attrs['href']))
    soup = BeautifulSoup(res.text, 'lxml')
    title = question_link.text
    question_id = question_link.attrs['href'].split('qid=')
    question = soup.select('.myspan')[0]
    question = question.getText().split(':')[1].strip()
    try:
        code = soup.select('.codesample code')[0]
        code = code.getText()
    except IndexError:
        code = None
    answer_values = [i.attrs['value'] for i in soup.select('input[name="answer"]')]
    answer_list = [i.getText() for i in soup.select('.content .myspan')[1:]]
    answers = dict(zip(answer_values, answer_list))
    choices, description = get_correct_answer(question_id, answers)
    print('done')
    return {'title': title, 'question': question, 'code': code,
            'choices': choices, 'description': description}
def get_tweet_details(tweet_id, user='', retry=0):
    print tweet_id
    try:
        headers_custom = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        text_selector = ".TweetDetail-text.u-textLarge"
        stat_selector = '.TweetDetail-statCount, .TweetAction-count'
        tweet_template = Template('https://mobile.twitter.com/$user/status/$tweet_id')
        uri = tweet_template.substitute({'user': user, 'tweet_id': tweet_id})
        response = requests.get(uri, headers=headers_custom)
        if response.status_code != 200:
            if retry % 10 == 0:
                print 'Too many requests in tweet download!'
            time.sleep(1)
            return get_tweet_details(tweet_id, user=user, retry=retry + 1) if retry < max_retry else {}
        soup = BeautifulSoup(response.content, 'html.parser')
        text = safe_list_get(map(lambda x: x.get_text(), soup.select(text_selector)), 0, '')
        stats = soup.select(stat_selector)
        pases = get_stat(stats, 0)
        likes = get_stat(stats, 1)
        date = soup.select('.TweetDetail-timeAndGeo')
        date = map(lambda d: d.get_text(), date)
        responses = soup.select('.Timeline-base .Tweet-body')

        def response_to_data(r):
            user_name = r.select('.UserNames-screenName')[0].get_text()
            text = r.select('.Tweet-text')[0].get_text()
            date = r.select('.Tweet-timestamp time')[0].attrs['datetime']
            return {'text': text, 'user': user_name, 'date': date}

        responses = map(response_to_data, responses)
        return {'text': text, 'pases': pases, 'likes': likes,
                'date': safe_list_get(date, 0, ''), 'responses': responses}
    except:
        traceback.print_exc()
        return {}
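# `safe_list_get` and `get_stat` are assumed helpers (not shown above); a
# minimal sketch consistent with how they are called:
def safe_list_get(lst, idx, default):
    # return lst[idx], or `default` when the index is out of range
    try:
        return lst[idx]
    except IndexError:
        return default

def get_stat(stats, idx):
    # pull the text of the idx-th stat node, defaulting to '0'
    return safe_list_get([s.get_text().strip() for s in stats], idx, '0')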
def parser_video_info_html(self, html_doc):
    video_info = {}
    soup = BeautifulSoup(html_doc)
    # Get Title
    data = soup.select('#content > h1')
    if len(data) > 0:
        video_info['Title'] = unicode(data[0].contents[0])
    # Get Date
    data = soup.select('#content > .head-list > li')
    if len(data) == 2:
        date_str = str(unicode(data[0].contents[1])).strip()
        video_info['Date'] = datetime.strptime(date_str, '%b %d, %Y').date()
    # Get Models
    data = soup.select('#content > .head-list > li > a')
    if len(data):
        video_info['Models'] = []
        for model in data:
            video_info['Models'].append(unicode(model.contents[0]))
    # Get Rating and VoteCount
    data = soup.select('.star-holder > p')
    if len(data):
        votes_list = data[0].contents[0]
        votes_list = re.split('/| |\(|\)', votes_list)
        votes_list.remove('')
        video_info['Rating'] = float(unicode(votes_list[0]))
        video_info['VoteCount'] = int(unicode(votes_list[2]))
    return video_info
def get_pages_info(url):
    if "zhuanzhuan" in url:
        ganji_url.delete_one({"url": url})
        return
    try:
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        titles = soup.select('h1.title-name')
        times = soup.select('i.pr-5')
        types = soup.select('ul.det-infor > li:nth-of-type(1) > span')
        prices = soup.select('i.f22.fc-orange.f-type')
        places = soup.select("ul.det-infor > li > a")
        # singular loop names avoid shadowing the result lists above
        for title, time_, type_, price, place in zip(titles, times, types, prices, places):
            data = {
                '标题': title.get_text(),
                '发布时间': time_.get_text().strip().split(' ')[0] if len(time_) > 0 else "",
                '类型': type_.get_text(),
                '价格': price.get_text(),
                '交易地点': place.get_text(),
                'url': url
            }
            ganji_data2.insert_one(data)
    except Exception as e:
        print(e)
        time.sleep(3)
def test_admin_add(self):
    '''Admin can be added via add member page'''
    app = self._get_test_app()
    owner = factories.User(fullname='My Owner')
    factories.User(fullname="My Fullname", name='my-user')
    group = self._create_group(owner['name'])

    env, response = self._get_group_add_member_page(app, owner, group['name'])
    add_form = response.forms['add-member-form']
    add_form['username'] = '******'
    add_form['role'] = 'admin'
    add_response = submit_and_follow(app, add_form, env, 'save')

    assert_true('2 members' in add_response)

    add_response_html = BeautifulSoup(add_response.body)
    user_names = [u.string for u in
                  add_response_html.select('#member-table td.media a')]
    roles = [r.next_sibling.next_sibling.string
             for r in add_response_html.select('#member-table td.media')]
    user_roles = dict(zip(user_names, roles))

    assert_equal(user_roles['My Owner'], 'Admin')
    assert_equal(user_roles['My Fullname'], 'Admin')
def test_remove_member(self):
    '''Member can be removed from group'''
    app = self._get_test_app()
    user_one = factories.User(fullname='User One', name='user-one')
    user_two = factories.User(fullname='User Two')

    other_users = [
        {'name': user_two['id'], 'capacity': 'member'}
    ]
    group = self._create_group(user_one['name'], other_users)

    remove_url = url_for(controller='group', action='member_delete',
                         user=user_two['id'], id=group['id'])
    env = {'REMOTE_USER': user_one['name'].encode('ascii')}
    remove_response = app.post(remove_url, extra_environ=env, status=302)
    # redirected to member list after removal
    remove_response = remove_response.follow(extra_environ=env)

    assert_true('Group member has been deleted.' in remove_response)
    assert_true('1 members' in remove_response)

    remove_response_html = BeautifulSoup(remove_response.body)
    user_names = [u.string for u in
                  remove_response_html.select('#member-table td.media a')]
    roles = [r.next_sibling.next_sibling.string
             for r in remove_response_html.select('#member-table td.media')]
    user_roles = dict(zip(user_names, roles))

    assert_equal(len(user_roles.keys()), 1)
    assert_equal(user_roles['User One'], 'Admin')
def test_attendee_name_required(self):
    self.event.settings.set('attendee_names_asked', True)
    self.event.settings.set('attendee_names_required', True)
    cr1 = CartPosition.objects.create(
        event=self.event, session=self.session_key, item=self.ticket,
        price=23, expires=now() + timedelta(minutes=10)
    )
    response = self.client.get('/%s/%s/checkout/questions/' % (self.orga.slug, self.event.slug),
                               follow=True)
    doc = BeautifulSoup(response.rendered_content)
    self.assertEqual(len(doc.select('input[name=%s-attendee_name]' % cr1.identity)), 1)

    # Not all required fields filled out, expect failure
    response = self.client.post('/%s/%s/checkout/questions/' % (self.orga.slug, self.event.slug), {
        '%s-attendee_name' % cr1.identity: '',
        'email': 'admin@localhost'
    }, follow=True)
    doc = BeautifulSoup(response.rendered_content)
    self.assertGreaterEqual(len(doc.select('.has-error')), 1)

    # Corrected request
    response = self.client.post('/%s/%s/checkout/questions/' % (self.orga.slug, self.event.slug), {
        '%s-attendee_name' % cr1.identity: 'Peter',
        'email': 'admin@localhost'
    }, follow=True)
    self.assertRedirects(response, '/%s/%s/checkout/payment/' % (self.orga.slug, self.event.slug),
                         target_status_code=200)
    cr1 = CartPosition.objects.current.get(identity=cr1.identity)
    self.assertEqual(cr1.attendee_name, 'Peter')
def test_membership_list(self):
    '''List group admins and members'''
    app = self._get_test_app()
    user_one = factories.User(fullname='User One', name='user-one')
    user_two = factories.User(fullname='User Two')

    other_users = [
        {'name': user_two['id'], 'capacity': 'member'}
    ]
    group = self._create_group(user_one['name'], other_users)

    member_list_url = url_for(controller='group', action='members', id=group['id'])
    env = {'REMOTE_USER': user_one['name'].encode('ascii')}
    member_list_response = app.get(member_list_url, extra_environ=env)

    assert_true('2 members' in member_list_response)

    member_response_html = BeautifulSoup(member_list_response.body)
    user_names = [u.string for u in
                  member_response_html.select('#member-table td.media a')]
    roles = [r.next_sibling.next_sibling.string
             for r in member_response_html.select('#member-table td.media')]
    user_roles = dict(zip(user_names, roles))

    assert_equal(user_roles['User One'], 'Admin')
    assert_equal(user_roles['User Two'], 'Member')
def generate_with_kw(self, id3, kw, update):
    url = self.url.replace('KEYWORD', urllib.quote_plus(kw.encode('utf8')))
    logger.debug('Crawling %(url)s' % locals())
    soup = BeautifulSoup(requests.get(url).content, 'lxml')
    results = soup.select('table.mp3Tracks tr td.songTitle a')
    if not results:
        logger.info('Amazon, no results for %(kw)s' % locals())
        return False
    result = results[0]
    url = result.get("href")
    logger.debug("Found specific url: %(url)s" % locals())
    soup = BeautifulSoup(requests.get(url).content, 'lxml')
    id3.add(WXXX(encoding=3, desc=u"Amazon url", url=url))
    album = soup.select("#fromAlbum a")
    if album and (update or "TALB" not in id3):
        id3.add(TALB(encoding=3, text=album[0].find(text=True).strip(" \n")))
    images = soup.select("div#coverArt_feature_div img") + soup.select('#prodImageContainer img')
    if images and (update or 'APIC:Cover' not in id3):
        data = requests.get(images[0].get("src")).content
        id3.add(APIC(encoding=3, mime="image/jpeg", type=3, desc=u"Cover", data=data))
    details = [filter(lambda x: x not in ("\n", "", " "), detail.find_all(text=True))
               for detail in soup.select("div.content li") if detail.find("strong")]
    for detail in details:
        if detail and detail[0] == "Genres:" and (update or "TCON" not in id3) and len(detail) >= 2:
            id3.add(TCON(encoding=3, text=detail[1]))
    return True
def extract_reports_for_subtopic(subtopic_url, year_range, topic, subtopic=None):
    if subtopic_url.startswith("http://httphttp://"):
        # See notes to IG's web team
        subtopic_url = subtopic_url.replace("http://http", "")
    body = utils.download(subtopic_url)
    doc = BeautifulSoup(body)
    results = doc.select("#body-row02-col02andcol03 a")
    if not results:
        results = doc.select("#body-row02-col01andcol02andcol03 a")
    if not results and "There are currently no reports in this category" not in doc.text:
        raise AssertionError("No report links found for %s" % subtopic_url)

    topic_name = TOPIC_NAMES[topic]
    # Broadcasting Board of Governors is a fully independent agency
    if topic == 'BBG' or subtopic == 'Broadcasting Board of Governors':
        agency = 'bbg'
    else:
        agency = 'state'

    for result in results:
        report = report_from(result, year_range, agency, topic_name, subtopic)
        if report:
            inspector.save_report(report)
def expand_links(self):
    """Expand any links referenced in the message."""
    if '<blockquote>' in self.html:
        # links have been already expanded
        return False
    changed = False
    for link in BeautifulSoup(self.html, 'html5lib').select('a'):
        url = link.get('href', '')
        try:
            rv = requests.get(url)
        except requests.exceptions.ConnectionError:
            continue
        if rv.status_code == 200:
            soup = BeautifulSoup(rv.text, 'html5lib')
            title_tags = soup.select('title')
            if len(title_tags) > 0:
                title = title_tags[0].string.strip()
            else:
                title = url
            description = 'No description found.'
            for meta in soup.select('meta'):
                if meta.get('name', '').lower() == 'description':
                    description = meta.get('content', description).strip()
                    break
            # add the detail of the link to the rendered message
            tpl = ('<blockquote><p><a href="{url}">{title}</a></p>'
                   '<p>{desc}</p></blockquote>')
            self.html += tpl.format(url=url, title=title, desc=description)
            changed = True
    return changed
def selectDomain(self, html):
    soup = BeautifulSoup(html)
    subdomainList = []
    for i in xrange(len(soup.select('[class=domain]'))):
        subDomain = soup.select('[name=domain' + str(i + 1) + ']')[0].attrs['value']
        subdomainList.append(subDomain)
    return subdomainList
def ZhilianFirmPage(firmUrl=''):
    '''
    # Function: fetch the company profile and job listings from a Zhilian
    #           recruitment company-detail page.
    # Params : firmUrl = page URL
    # Steps  : check the sub-domain first; a "standard page" is parsed directly,
    #          while a "special page" is only parsed after its standard page is found.
    # Notes  : company pages are complicated -- normal and VIP pages have
    #          different URLs and different markup.
    '''
    # === Parse the URL ===
    # Both special and standard pages have to be parsed.
    webTarget = webPageSourceCode(firmUrl)
    soup = BeautifulSoup(webTarget['html'], 'html5lib')
    # === Decide from the sub-domain whether this is a standard or a special page ===
    subDomain = urlAnalyse(firmUrl)['subloc'][0]
    if subDomain == 'special':
        # For a special page, find the URL of its standard page and re-enter this function.
        # Content hidden inside `<!-- -->` comments can only be pulled out with a regex.
        finder = re.findall(re.compile(r' href="(.+?)"'), str(soup.select('td[align=right]')))
        standardUrl = finder[0] if finder else ''
        if len(standardUrl):
            print 'Redirecting from a special company page to a standard page...'
            ZhilianFirmPage(standardUrl)  # re-enter with the standard page
        return ''
    # === Collect the links to every job this company is advertising on the standard page ===
    # ===>>> Caveat: the page only shows jobs for one city; the other cities are
    # loaded via Javascript, so searching by company name on the main search page
    # actually works better.
    resu = soup.select('[class=positionListContent1] [class*=jobName] a[href]')
    data = [t['href'] for t in resu]
    print 'Done of retrieving %d job links of this company.' % len(resu)
    return data  # return the links of all open positions
def get_item_info(link):
    time.sleep(random.uniform(0, 3))  # random pause between requests
    try:
        webdata = requests.get(link, headers=headers)
        if webdata.status_code == 200:
            soup = BeautifulSoup(webdata.text, 'html.parser')
            title = soup.title.text
            post_time = soup.select('i.pr-5')
            views = get_view(link)
            type = soup.select('ul.det-infor > li > span > a ')
            price = soup.select(' i.f22.fc-orange.f-type ')
            address = soup.select('ul.det-infor > li')
            address_str = '' if address[2] == 0 else address[2].get_text().replace(' ', '').replace('\r', '').replace('\n', '').replace('\xa0', '')[5:]
            use = soup.select('div.second-dt-bewrite > ul > li ')
            data = {
                'link': link,
                'title': title,
                'post_time': get_text(post_time)[:-3],
                'views': views,
                'price': get_text(price),
                'type': get_text(type),
                'address': address_str,
                'use': get_use(use)[:get_use(use).find('新') + 1]
            }
            item_info_lb.insert_one(data)
        else:
            print('{} has some problems,please try again later'.format(link))
            pass
    except Exception as e:
        print(Exception, ':', e)
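# `get_text`, `get_view` and `get_use` are not part of the excerpt; sketches of
# what they plausibly do, based only on how they are called above:
def get_text(tags):
    # flatten a select() result into one stripped string ('' when empty)
    return tags[0].get_text().strip() if tags else ''

def get_use(tags):
    # join the "condition" bullet list into a single string
    return ''.join(tag.get_text().strip() for tag in tags)

def get_view(link):
    # placeholder: the real view count is usually served by a separate
    # counter endpoint; returning '0' keeps the sketch self-contained
    return '0'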
def urls_for(self):
    only = self.options.get('topics')
    if only:  # if only...
        only = set(only.split(','))
        only = [(o, TOPIC_TO_REPORT_TYPE[o]) if o in TOPIC_TO_REPORT_TYPE else o
                for o in only]
        yield from self.urls_for_topics(only)
        # If there are topics selected, ONLY yield URLs for those.
        return

    # First yield the URLs for the topics that are tangential to the main
    # Calendar Year reports.
    yield from self.urls_for_topics(ADDITIONAL_TOPICS)

    # Not getting reports from specific topics, iterate over all Calendar Year
    # reports.
    page = BeautifulSoup(utils.download(BASE_URL))

    # Iterate over each "Calendar Year XXXX" link
    for li in page.select('.field-items li'):
        md = RE_CALENDAR_YEAR.search(li.text)
        if md:
            cur_year = int(md.group(1))
            if cur_year >= self.year_range[0] and cur_year <= self.year_range[-1]:
                href = li.select('a')[0]['href']
                next_url = urljoin(BASE_URL, href)
                # The first page of reports is yielded.
                yield next_url

                # Next, read all the pagination links for the page and yield those. So
                # far, I haven't seen a page that doesn't have all of the following
                # pages enumerated.
                next_page = BeautifulSoup(utils.download(next_url))
                for link in next_page.select('li.pager-item a'):
                    yield urljoin(BASE_URL, link['href'])
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        url = AUDITS_REPORTS_URL.format(str(year)[2:4])
        doc = BeautifulSoup(utils.download(url))
        results = doc.select("tr")
        if not results:
            raise inspector.NoReportsFoundError("NASA (%d)" % year)
        for index, result in enumerate(results):
            if not index or not result.text.strip():
                # Skip the header row and any empty rows
                continue
            report = audit_report_from(result, url, year_range)
            if report:
                inspector.save_report(report)

    # Pull the other reports
    doc = BeautifulSoup(utils.download(OTHER_REPORT_URL))
    results = doc.select("#subContainer ul li")
    if not results:
        raise inspector.NoReportsFoundError("NASA (other)")
    for result in results:
        report = other_report_from(result, year_range)
        if report:
            inspector.save_report(report)
def send_to_server(api_base, stub_base, path):
    """ Send the file at the given `path` to the given `api_base`. Path
        components will be appended to the `api_base` and are presumed to
        match. """
    relative_path = os.path.relpath(path, stub_base)
    url = urlparse.urljoin(api_base, relative_path)
    logger.info('sending {} to {}'.format(path, url))
    data = json.dumps(json.load(open(path, 'r')))
    r = requests.post(url, data=data, headers={'content-type': 'application/json'})
    # regulations-core returns 204 on a successful POST
    if r.status_code != 204:
        try:
            soup = BeautifulSoup(r.text, 'html.parser')
            exception = soup.select("#summary h1")[0].text
            exception_value = soup.select("#summary .exception_value")[0].text
            logger.error("error sending {}: {}, {}".format(
                r.status_code, exception, exception_value))
        except:
            logger.error("error sending {}: {}".format(r.status_code, r.reason))
def fetch_from_landing_page(self, landing_url):
    """Returns a tuple of (pdf_link, summary_text, is_unreleased)."""
    unreleased = False
    page = BeautifulSoup(utils.download(landing_url))

    summary = None
    field_items = page.select('.field-items')
    if field_items:
        text = [node.strip() for node in field_items[0].findAll(text=True)]
        summary = '\n\n'.join(text).strip()
    if not summary:
        logging.info('\tno summary text found')

    if (summary and (RE_NOT_AVAILABLE.search(summary)
                     or RE_NOT_AVAILABLE_2.search(summary)
                     or RE_NOT_AVAILABLE_3.search(summary)
                     or RE_NOT_AVAILABLE_4.search(summary)
                     or RE_CLASSIFIED.search(summary))):
        unreleased = True

    report_url = None
    pdf_link = page.select('.file a')
    if not pdf_link:
        logging.warn('No pdf link found on page: {0}'.format(landing_url))
    else:
        report_url = pdf_link[0]['href']

    return report_url, summary, unreleased
def run(options):
    year_range = inspector.year_range(options, archive)
    doc = BeautifulSoup(utils.download(REPORTS_URL))

    # Pull the semiannual reports
    semiannual_results = doc.select("#AnnualManagementReports select")[0]
    for result in semiannual_results.select("option"):
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the special reports
    special_report_table = doc.find("table", attrs={"bordercolor": "#808080"})
    for index, result in enumerate(special_report_table.select("tr")):
        if not index:
            # Skip the header row
            continue
        report = report_from(result, REPORTS_URL, report_type='other', year_range=year_range)
        if report:
            inspector.save_report(report)

    # Pull the audit reports
    for year in year_range:
        if year < 2001:  # The oldest fiscal year page available
            continue
        year_url = AUDIT_REPORTS_URL.format(year=year)
        doc = BeautifulSoup(utils.download(year_url))
        for index, result in enumerate(doc.select("#main table tr")):
            if not index:
                # Skip the header row
                continue
            report = report_from(result, year_url, report_type='audit', year_range=year_range)
            if report:
                inspector.save_report(report)
def test_custom_css(self):
    '''Add some custom css to the head element'''
    app = self._get_test_app()

    # no custom css yet
    intro_response_html = BeautifulSoup(app.get('/').body)
    style_tag = intro_response_html.select('head style')
    assert_equal(len(style_tag), 0)

    # set new custom css
    env, config_response = _get_admin_config_page(app)
    config_form = config_response.forms['admin-config-form']
    config_form['ckan.site_custom_css'] = 'body {background-color:red}'
    webtest_submit(config_form, 'save', status=302, extra_environ=env)

    # new css now appears in the head element
    new_intro_response_html = BeautifulSoup(app.get('/').body)
    style_tag = new_intro_response_html.select('head style')
    assert_equal(len(style_tag), 1)
    assert_equal(style_tag[0].text.strip(), 'body {background-color:red}')

    # reset config value
    _reset_config(app)
    reset_intro_response_html = BeautifulSoup(app.get('/').body)
    style_tag = reset_intro_response_html.select('head style')
    assert_equal(len(style_tag), 0)
def test_voucher_double(self):
    self.quota_tickets.size = 2
    self.quota_tickets.save()
    v = Voucher.objects.create(item=self.ticket, event=self.event,
                               valid_until=now() + timedelta(days=2),
                               block_quota=True)
    CartPosition.objects.create(
        event=self.event, cart_id=self.session_key, item=self.ticket,
        price=23, expires=now() + timedelta(minutes=10), voucher=v
    )
    CartPosition.objects.create(
        event=self.event, cart_id=self.session_key, item=self.ticket,
        price=23, expires=now() + timedelta(minutes=10), voucher=v
    )
    self._set_session('payment', 'banktransfer')

    response = self.client.post('/%s/%s/checkout/confirm/' % (self.orga.slug, self.event.slug),
                                follow=True)
    doc = BeautifulSoup(response.rendered_content, "lxml")
    self.assertEqual(CartPosition.objects.filter(cart_id=self.session_key, voucher=v).count(), 1)
    self.assertEqual(len(doc.select(".alert-danger")), 1)
    self.assertFalse(Order.objects.exists())

    response = self.client.post('/%s/%s/checkout/confirm/' % (self.orga.slug, self.event.slug),
                                follow=True)
    doc = BeautifulSoup(response.rendered_content, "lxml")
    self.assertFalse(CartPosition.objects.filter(cart_id=self.session_key, voucher=v).exists())
    self.assertEqual(len(doc.select(".thank-you")), 1)
    self.assertEqual(Order.objects.count(), 1)
    self.assertEqual(OrderPosition.objects.count(), 1)
def crawler(query):
    # set up state before crawling
    current_searching_page = 1
    have_more_page_to_search = True
    today_yy_mm_dd = datetime.datetime.now().strftime("%Y.%m.%d")
    # today_yy_mm_dd = '2020.01.22'  # test value
    print('initial values before crawling\ncurrent_searching_page: ', current_searching_page,
          '\nhave_more_page_to_search: ', have_more_page_to_search,
          '\ntoday_yy_mm_dd: ', today_yy_mm_dd)

    # title of the most recent article already stored for this keyword
    latest_news_title_in_database = db.select_latest_news(query)

    # start crawling
    while have_more_page_to_search:
        url = ("https://search.naver.com/search.naver?&where=news&query=" + query +
               "&sm=tab_pge&sort=1&photo=0&field=0&reporter_article=&pd=3&ds=" + today_yy_mm_dd +
               "&de=" + today_yy_mm_dd + "&mynews=0&start=" + str(current_searching_page) +
               "&refresh_start=0")
        print('crawling started! url: ', url)
        req = requests.get(url)
        cont = req.content
        soup = BeautifulSoup(cont, 'html.parser')

        # handle empty search results (no articles posted yet today)
        noresult = soup.select('.noresult_tab')
        if noresult:
            print('no result')
            break

        # extract title and link from the <a> tags
        atags = soup.select('._sp_each_title')

        # remember the title of the first article
        if current_searching_page == 1:
            print('first article title: ', atags[0].text.replace("'", ""))
            first_searched_title = atags[0].text.replace("'", "")

        for atag in atags:
            # no new articles -> stop crawling
            if atag.text.replace("'", "") == latest_news_title_in_database:
                have_more_page_to_search = False
                print('no new articles -> stopping the crawl')
                break
            else:
                subKeywords = db.select_sub_keyword(query)
                print('sub key word: ', subKeywords)
                # store only the articles whose title matches a registered sub-keyword
                for sub in subKeywords:
                    if sub in atag.text:
                        db.insert_scrapped_news(atag.text, atag['href'], query)

        # articles after one matching the stored first title are treated as duplicates
        if db.is_latest_news(first_searched_title) == 0:
            db.insert_latest_news(query, first_searched_title)

        # article summaries
        contents_lists = soup.select('ul.type01 dl')
        for contents_list in contents_lists:
            contents_cleansing(contents_list)  # clean up the summary text

        # paging: decide whether to keep crawling
        for page in soup.select(".paging"):
            if "다음페이지" in page.text:
                current_searching_page = current_searching_page + 10
            else:
                have_more_page_to_search = False

    print('finish')
# if the problem the user entered doesn't exist at the level of the contest
# the user entered, stop here
print("Problem '{prob}' doesn't exist in {level}".format(level=level, prob=prob))
exit()

# log in
login.login()

# ------------------------------test part start----------------------------------
# build the URL of the "Tasks" page
tasks_url = "https://atcoder.jp/contests/{level}{round}/tasks".format(level=level, round=round)
# fetch the "Tasks" page
html = config.SESSION.get(tasks_url)
soup = BeautifulSoup(html.text, 'lxml')
a = soup.select('a')
plob_path_map = {}
for ai in a:
    try:
        # read the text of the <a> tag
        text = ai.get_text()
        if text in prob_list:
            # the text is in the problem list, so take the tag's href attribute
            link = ai.attrs['href']
            # map the problem to its link
            plob_path_map[text] = link
    except:
        pass

if prob not in plob_path_map:
    # no link could be found for the problem the user entered
import requests
from bs4 import BeautifulSoup

req = requests.get('https://www.naver.com')
html = req.text
# print(html)
soup = BeautifulSoup(html, 'html.parser')
issues = soup.select(
    '#PM_ID_ct > div.header > div.section_navbar > div.area_hotkeyword.PM_CL_realtimeKeyword_base > div.ah_roll.PM_CL_realtimeKeyword_rolling_base > div > ul > li > a'
)
# print(issues)
for issue in issues:
    print("[" + issue.select_one('span[class="ah_r"]').text + "] "
          + issue.select_one('span[class="ah_k"]').text)
def check_jiekou(self):
    req = requests.get("http://jiekou.xiaomil.com/", headers=headers)
    soup = BeautifulSoup(req.text, 'lxml')
    url_list = soup.select("xiaomil_ul form div lib_3 a")
    for url in url_list:
        print(url.get('href'))
def w_url(href):
    driver = webdriver.Firefox()  # launch Firefox
    driver.set_page_load_timeout(30)
    driver.get(href)  # open the page
    time.sleep(3)
    driver.add_cookie({'name': 'gldjc_sessionid', 'value': '39e0c310-83f6-4fd4-a10e-983392b87cc6',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': None, 'secure': False, 'httpOnly': True})
    driver.add_cookie({'name': 'location_name', 'value': '%25E5%25B1%25B1%25E4%25B8%259C',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1535531192, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'location_code', 'value': '370000',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1535531192, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': '_gat_gtag_UA_110560299_1', 'value': '1',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1532939793, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'loginUuid', 'value': '39e0c310-83f6-4fd4-a10e-983392b87cc6',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': None, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'nTalk_CACHE_DATA', 'value': '{uid:kf_9318_ISME9754_6349427345656906564,tid:1532939192431315}',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': None, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'NTKF_T2D_CLIENTID', 'value': 'guest6DE8EBA3-F3AF-F1D9-ECD9-EA4BC870E82E',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1596011199, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': '_ga', 'value': 'GA1.2.1004482802.1532939194',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1596011199, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': '_gid', 'value': 'GA1.2.1482668967.1532939200',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1533025599, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'INFO_PRICE_LOCATION', 'value': '1_1',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1540715203, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'Hm_lvt_727d5904b141f326c9cb1ede703d1162', 'value': '1532939192',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1564475203, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'Hm_lpvt_727d5904b141f326c9cb1ede703d1162', 'value': '1532939203',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': None, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'Hm_lvt_82698a74ed862e6a03fc9e4cbac594a6', 'value': '1532939192',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': 1564475203, 'secure': False, 'httpOnly': False})
    driver.add_cookie({'name': 'Hm_lpvt_82698a74ed862e6a03fc9e4cbac594a6', 'value': '1532939203',
                       'path': '/', 'domain': '.gldjc.com', 'expiry': None, 'secure': False, 'httpOnly': False})
    time.sleep(3)
    driver.refresh()
    # scroll to the bottom step by step so lazy-loaded content renders
    driver.execute_script("""
    (function () {
        var y = document.body.scrollTop;
        var step = 100;
        window.scroll(0, y);
        function f() {
            if (y < document.body.scrollHeight) {
                y += step;
                window.scroll(0, y);
                setTimeout(f, 50);
            } else {
                window.scroll(0, y);
                document.title += "scroll-done";
            }
        }
        setTimeout(f, 1000);
    })();
    """)
    time.sleep(2)
    pageSource = driver.page_source
    soup = BeautifulSoup(pageSource, 'lxml')
    title = soup.find(attrs={'class': 'highcharts-title'}).text
    print(title)
    # com_names = soup.find_all(class_='data_table')
    wb = workbook.Workbook()  # create the Excel workbook
    ws = wb.active  # the sheet currently being written
    # write the header row into the sheet, as a list
    ws.append(['序号', '名称', '规格型号', '单位', '税率', '除税价(元)', '含税价(元)', '日期', '备注'])
    trs = soup.select("#infoprice_table tr")
    ulist = []
    for tr in range(1, len(trs)):
        ui = []
        for td in trs[tr]:
            ui.append(td)
        ulist.append(ui)
    for i in range(len(ulist)):
        xh = ulist[i][0].text
        mc = ulist[i][1].text
        ggxh = ulist[i][2].text
        dw = ulist[i][3].text
        sl = ulist[i][4].text
        csj = ulist[i][5].text
        result = ulist[i][6].img['src']
        urllib.request.urlretrieve(result, 'D:/YZM/1.png')
        image = Image.open('D:/YZM/1.png')
        hsj = tesserocr.image_to_text(image)  # OCR the tax-inclusive price image
        print(hsj)
        rq = ulist[i][7].text
        bz = ulist[i][8].text
        ws.append([xh, mc, ggxh, dw, sl, csj, hsj, rq, bz])
    print(title)
    wb.save('1.xlsx')
    driver.close()
'''
body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-child(65) > tbody > tr:nth-child(2) > td:nth-child(1) > a
body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-child(65) > tbody > tr:nth-child(4) > td:nth-child(1) > a
body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-child(65) > tbody > tr:nth-child(6) > td:nth-child(1) > a
body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-child(65) > tbody > tr:nth-child(8) > td:nth-child(1) > a
body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-child(65) > tbody > tr:nth-child(42) > td:nth-child(1) > a
/html/body/div[4]/div[2]/div/div[2]/table[3]/tbody/tr[6]/td[1]/a
'''
# href = soup.select('body > div.body-wrapper > div.content-wrapper > div > div.main-content > table:nth-of-type(65) > tbody > tr:nth-child(2) > td:nth-of-type(1) > a')
href = soup.select('body > div.body-wrapper > div.content-wrapper > div > div.main-content > table')

railways_condition = '''
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%811%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁1号线</a>
<a target="_blank" href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%812%E5%8F%B7%E7%BA%BF">北京地铁2号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%814%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁4号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%815%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁5号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%816%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁6号线</a>
<a target="_blank" href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%817%E5%8F%B7%E7%BA%BF">北京地铁7号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%818%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁8号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%819%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁9号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%8110%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁10号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%8113%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁13号线</a>
<a href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%8115%E5%8F%B7%E7%BA%BF" target="_blank">北京地铁15号线</a>
<a target="_blank" href="/item/%E5%8C%97%E4%BA%AC%E5%9C%B0%E9%93%8116%E5%8F%B7%E7%BA%BF">北京地铁16号线</a>
from bs4 import BeautifulSoup
import requests
import csv

url = 'http://www2.copasa.com.br/servicos/qualidadeagua/pesqtel.asp?letra=D&cidade=443&periodoInicial=01%2F2019&periodoFinal=04%2F2020'
html = requests.get(url)
soup = BeautifulSoup(html.text)

tables = soup.select("#mesames table")
for table in tables:
    headers = [th.text.encode("utf-8") for th in table.select("tr th")]
    with open("out.csv", "a") as f:
        wr = csv.writer(f)
        wr.writerow(headers)
        wr.writerows(
            [[td.text.encode("utf-8") for td in row.find_all("td")]
             for row in table.select("tr + tr")]
        )
def D_parse_Set(cls, html):
    soup = BeautifulSoup(html, 'html.parser')
    temp = soup.select(".cy_cosList li div")
    kv_info = [cls.clean1(item) for item in temp]
    return kv_info
# import chardet  # speed up

# Initialize
index_url = 'http://coolshell.cn/page/68'
count = 0
data = pd.DataFrame(columns=('title', 'link', 'reads'))

# Get links
while index_url != 'End':
    index_res = requests.get(index_url)
    index_res.encoding = 'utf-8'
    index_soup = BeautifulSoup(index_res.text, 'html.parser')
    # Get url from index page
    index = index_soup.select('header h2 a')
    for i in index:
        count = count + 1
        data.loc[count] = [i.text, i['href'], '']
    # Go to next link page
    try:
        index_url = index_soup.select('nav .wp-pagenavi .nextpostslink')[0]['href']
    except:
        index_url = 'End'

# Get contents
for data_id in list(range(1, count + 1)):
    page_url = data['link'].loc[data_id]
    page_res = requests.get(page_url)
def parse_Img(cls, html):
    soup = BeautifulSoup(html, 'html.parser')
    temp = soup.select(".tc p")
    info = [cls.clean2(item) for item in temp if cls.clean2(item)]
    return info
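# `clean1` (used in D_parse_Set above) and `clean2` (used in parse_Img) are
# assumed class helpers; a minimal sketch of text-cleaning methods consistent
# with how they are called:
class _CleanHelpers(object):
    @classmethod
    def clean1(cls, item):
        # collapse a node's text into a single whitespace-normalized string
        return ' '.join(item.get_text().split())

    @classmethod
    def clean2(cls, item):
        # like clean1, but return None for empty nodes so the caller's
        # `if cls.clean2(item)` filter can drop them
        text = ' '.join(item.get_text().split())
        return text or None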
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup

url = "https://www.humblebundle.com/books/linux-unix-oreilly-books"
tierDict = {}

resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'html.parser')

# Bundle Tiers
tiers = soup.select(".dd-game-row")
for tier in tiers:
    # only for headline
    if tier.select(".dd-header-headline"):
        # grab tier name and price
        tiername = tier.select(".dd-header-headline")[0].text.strip()
        # grab tier product names
        productNames = tier.select(".dd-image-box-caption")
        productNames = [prodName.text.strip() for prodName in productNames]
        # add one product tier to our datastructure
        tierDict[tiername] = {"products": productNames}

# old tiers
tierHeadlines = soup.select(".dd-header-headline")
strippedTiernames = [tier.text.strip() for tier in tierHeadlines]

# product Names
""" Scrape the TOC Extract its links and put them in the Pile O' Links """ expand_toc_js = config['toc_js'] print(f"Scraping table of contents: {config['toc_url']}") toc_scrape_result = scraper.scrape( config['toc_url'], wait_for_selector=config['toc_selector'], js=expand_toc_js) # Record the scrape results in included_scraped_urls and redirects mark_url_included(toc_scrape_result['final_url']) redirects[config['toc_url']] = toc_scrape_result['final_url'] soup = BeautifulSoup(toc_scrape_result['html'], 'html.parser') toc_element = soup.select(config['toc_selector'])[0] remove_blacklisted_selectors(toc_element) if config['rewrite_toc']: toc_element = config['rewrite_toc'](toc_element) def is_post_link(tag, post_url_pattern=None): if tag.attrs['href'] is None: return False if tag.attrs['href'].startswith('javascript:'): return False if post_url_pattern is None: # Not filtering TOC links at all return True return re.match(post_url_pattern, tag.attrs['href']) is not None
for i in range(19):
    if i == 2 or i == 8:
        continue
    # open the ranking page
    driver.get(url + str(i + 1))
    for j in range(10):  # repeat for ranks 1-10 in each genre
        # use an xpath to click the movie at rank j+1 (clicking is the only way
        # to reach the page that holds the movie details)
        driver.find_element_by_xpath("//*[@id='old_content']/table/tbody/tr[" + str(j + 2) + "]/td[2]/div/a").click()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # outermost selector for the movie details that must be extracted
        Long_movie_infos = soup.select('#content > div.article')
        # set the output format (does this for-loop actually matter?)
        for Long_movie_info in Long_movie_infos:
            title = '<영화제목>' + '\n' + str(
                Long_movie_info.select_one('div.mv_info_area > div.mv_info > h3 > a:nth-child(1)').text) + ' (' + str(
                Long_movie_info.select_one('div.mv_info_area > div.mv_info > strong').text) + ')'
            print(title + '\n')
            poster = '<영화포스터>' + '\n' + str(Long_movie_info.select_one('img').attrs['src']).replace('//', '')
            print(poster + '\n')
            director = '<감독>' + '\n' + str(
                Long_movie_info.select_one('div.mv_info_area > div.mv_info > dl > dd:nth-child(4) > p > a').text)
            print(director + '\n')
def get_item_info(item_url_queue, item_info_queue, header):
    while True:
        while item_url_queue.empty():
            time.sleep(0.01)
        tmp_links = item_url_queue.get()
        if tmp_links == "#END#":
            # hit the end marker, leave the worker
            print("get_item_info Quit {}".format(item_url_queue.qsize()))
            print("items left in queue: " + str(item_info_queue.qsize()))
            break
        else:
            print("start fetching " + str(tmp_links))
            r = requests.get(tmp_links, headers=header)
            while r.status_code != 200:
                time.sleep(10)
                print(r.status_code)
                print("refetching " + str(tmp_links))
                r = requests.get(tmp_links, headers=header)
            r.encoding = 'utf-8'
            html = r.text
            soup = BeautifulSoup(html, "lxml")
            try:
                item_list = soup.select("li.ws-g.DetailVariant")
                title = soup.find('h1').string.strip()
                times = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
                for i in item_list:
                    cigar_name = i.find('div', attrs={'class': "ws-u-1 DetailVariant-variantName"}).find(text=True).strip()
                    pricelist = i.select('div.ws-u-1-3.ws-u-lg-1-4.DetailVariant-formPrice > span.preis')
                    numslist = i.find_all('span', attrs={'class': re.compile(r'einheitlabel')})
                    tmp_itemurl = i.find('a', attrs={'class': 'ws-u-1 ws-u-lg-4-24 DetailVariant-col DetailVariant-image'})['href']
                    itemurl = 'https://www.cigarworld.de' + tmp_itemurl
                    if len(pricelist) == len(numslist):
                        for j in range(len(pricelist)):  # renamed from `i` to stop shadowing the outer loop variable
                            tmp_name = str(cigar_name)
                            price = pricelist[j].text.replace("€", "").strip()
                            tmp_nums = numslist[j].text
                            tmp_stock = numslist[j].get('title').strip()
                            if tmp_stock:
                                stock = tmp_stock
                            else:
                                stock = "in stock"
                            # nums = re.sub(r'\D', "", tmp_nums)
                            nums = tmp_nums
                            name = title + " " + tmp_name + ' ' + str(nums)
                            details = '0'
                            detailed = price
                            cigarinfo = {
                                'title': title,
                                'cigar_name': name,
                                'detailed': detailed,
                                'stock': stock,
                                'details': details,
                                'cigar_price': price,
                                'itemurl': itemurl,
                                'times': times}
                            item_info_queue.put(cigarinfo)
                    else:
                        print("price/unit count mismatch " + tmp_links)
            except Exception as err:
                print(str(tmp_links) + " item fetch error")
                print(err)
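# The worker above expects producer/consumer queues; a sketch of how it might
# be wired up with multiprocessing (queue names, URLs, and the single-worker
# setup are assumptions, not the original driver code):
from multiprocessing import Process, Queue

if __name__ == '__main__':
    item_url_queue = Queue()
    item_info_queue = Queue()
    header = {'User-Agent': 'Mozilla/5.0'}  # placeholder headers
    for link in ['https://www.cigarworld.de/example-item']:  # hypothetical URLs
        item_url_queue.put(link)
    item_url_queue.put("#END#")  # sentinel the worker quits on
    worker = Process(target=get_item_info,
                     args=(item_url_queue, item_info_queue, header))
    worker.start()
    worker.join()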
<td>技术类</td>
<td>1</td>
<td>深圳</td>
<td>2017-11-24</td>
</tr>
</tbody>
</table>
"""

soup = BeautifulSoup(html, 'lxml')

# 1. get all tr tags
# trs = soup.select('tr')
# print(trs)

# 2. get the 2nd tr tag
# tr = soup.select('tr')[1]
# print(tr)

# 3. get all tr tags whose class equals "even"
# tr = soup.select('.even')
# print(tr)
# tr = soup.select('tr[class="even"]')
# print(tr)

# 4. get the href attribute of every a tag
# alist = soup.select('a')
# for a in alist:
#     href = a['href']
#     print(href)

# 5. get all the job information (plain text)
trs = soup.select('tr')
for tr in trs:
    info = list(tr.stripped_strings)
    print(info)
def get_artical_detail(self, artical_href, dict_artcical):
    # time.sleep(0.1)
    # the URL of the article detail page depends on three parameters in the href
    parameters = {
        'DbCode': '',
        'DbName': '',
        'FileName': '',
    }
    pattern_DbCode = re.compile(r'.*?[dD]b[cC]ode=\s?(.*?)&')
    pattern_DbName = re.compile(r'.*?[dD]b[nN]ame=\s?(.*?)&')
    pattern_FileName = re.compile(r'.*?[fF]ile[nN]ame=\s?(.*?)&')
    parameters['DbCode'] = re.search(pattern_DbCode, artical_href).group(1)
    parameters['DbName'] = re.search(pattern_DbName, artical_href).group(1)
    parameters['FileName'] = re.search(pattern_FileName, artical_href).group(1)
    print('FileName=' + parameters['FileName'])
    req = requests.get(GET_ARTICAL_DETAIL_URL, params=parameters, headers=my_parameters.headers_kns)
    # once the detail page loads, collect the article keywords
    soup = BeautifulSoup(req.text, 'lxml')
    keyword = []
    try:
        keyword_list = soup.find('label', attrs={'id': 'catalog_KEYWORD'}).parent.find_all('a')
        for item in keyword_list:
            keyword.append(item.text.strip(';\r\n\t '))
    except:
        pass
    # store the keywords as a list in the per-article record
    dict_artcical['关键词'] = keyword
    # abstract
    try:
        summary = soup.find('span', attrs={'id': 'ChDivSummary'}).text
    except:
        summary = "kong"
    dict_artcical['摘要'] = summary
    # similar literature
    parameters.update({
        'curdbcode': 'CJFQ',
        'reftype': '604',
        'catalogId': 'lcatalog_func604',
        'catalogName': '相似文献',
    })
    ajax_url = 'https://kns.cnki.net/kcms/detail/frame/asynlist.aspx?'
    dict_artcical['相似文献'] = self.find_ajax(ajax_url, parameters)
    # reader recommendations
    parameters.update({
        'curdbcode': 'CJFQ',
        'reftype': '605',
        'catalogId': 'lcatalog_func605',
        'catalogName': '读者推荐',
    })
    dict_artcical['读者推荐'] = self.find_ajax(ajax_url, parameters)
    # composite and comprehensive impact factors
    parameters_fators = {
        'pcode': '',
        'pykm': '',
    }
    infomation = soup.select('.sourinfo .title a')
    pattern = re.compile(r'.*?\(\'(.*?)\',\'(.*?)\',\'(.*?)\',\'(.*?)\'\);')
    parameters_fators['pcode'] = pattern.search(str(infomation)).group(2)
    parameters_fators['pykm'] = pattern.search(str(infomation)).group(4)
    if parameters_fators['pykm'] in Journal_Point.keys():
        dict_artcical['复合影响因子'], dict_artcical['综合影响因子'] = Journal_Point[parameters_fators['pykm']][0:2]
    else:
        try:
            dict_artcical['复合影响因子'], dict_artcical['综合影响因子'] = self.get_Impact_Factor2(parameters_fators)
        except:
            dict_artcical['复合影响因子'], dict_artcical['综合影响因子'] = 0, 0
            print("not found")
    # cache the journal's impact factors under its code; the original condition
    # built a bare tuple, which is always truthy
    if (dict_artcical['复合影响因子'], dict_artcical['综合影响因子']) != (0, 0):
        Journal_Point[parameters_fators['pykm']] = [
            dict_artcical['复合影响因子'], dict_artcical['综合影响因子']
        ]
set desktop picture to POSIX file "%s"
end tell
END"""

dt = datetime.datetime.now()
cd = str(dt.year) + '0' + str(dt.month) + str(dt.day)
while True:
    dt = datetime.datetime.now()
    if (dt.hour == 0 and dt.minute == 2 and dt.second == 0) or (dt.hour == 15 and dt.minute == 0 and dt.second == 0):
        os.makedirs('Bing', exist_ok=True)
        url = "http://bingwallpaper.com/"
        sc = requests.get(url)
        soup = BeautifulSoup(sc.text, 'lxml')  # check lxml?
        print(sc.text)
        image = soup.select('.cursor_zoom img')
        image_url = image[0].get('src')
        response = requests.get(image_url)
        with open(os.path.join('Bing', cd + '.jpg'), 'wb') as file:
            file.write(response.content)
        # change desktop background
        # os.system('gsettings set org.gnome.desktop.background picture-uri file:///home/radioactive/Bing/' + cd + '.jpg')
        file_path = '/Users/asmita.mitra/PythonScripts/crawlers/Bing/' + cd + '.jpg'
        subprocess.Popen(SCRIPT % file_path, shell=True)
        print('Wallpaper set to ' + file_path)
        break
sys.exit()
def _get_confirmation_trade_offer_id(confirmation_details_page: str) -> str:
    soup = BeautifulSoup(confirmation_details_page, 'html.parser')
    full_offer_id = soup.select('.tradeoffer')[0]['id']
    return full_offer_id.split('_')[1]
# url = "http://www.ebay.com/sch/" + word.rstrip('\r\n')
url = "http://www.ebay.com/sch/" + "Actnovate"

# Check the response for URL connectivity and read the content of the URL
page = requests.get(url)
response = page.status_code
content = page.content
# print response  # 200 = success response
# print content

soup = BeautifulSoup(content, 'html.parser')
# print soup
# soup1 = soup.find_all('a', class_='vip')[0].get_text()
# soup2 = soup.find_all('a', class_='vip')
hrefs = [d["href"] for d in soup.select(".lvtitle a")]
# print hrefs
for link in hrefs:
    # Check the response for URL connectivity and read the content of the URL
    sub_page = requests.get(link)
    sub_response = sub_page.status_code
    sub_content = sub_page.content
    # print sub_content
    sub_soup = BeautifulSoup(sub_content, 'html.parser')
    # print sub_soup
    title = sub_soup.find_all('span', id='vi-lkhdr-itmTitl')[0].get_text()
    print title
    price = sub_soup.find_all('span', id='prcIsum')[0].get_text()
def start(company_code, start_date, end_date):
    print(f"company_code: {company_code} news crawling started")
    mkdir(company_code)
    unique_news_titles = set()
    page = 1
    processing_date = end_date
    while True:
        url = ('https://finance.naver.com/item/news_news.nhn?code=' + str(company_code) +
               '&page=' + str(page))
        source_code = requests.get(url).text
        html = BeautifulSoup(source_code, "lxml")
        dates = [datetime.datetime.strptime(date.get_text(), ' %Y.%m.%d %H:%M').date()
                 for date in html.select('.date')]
        titles = [re.sub('\n', '', str(title.get_text())) for title in html.select('.title')]
        links = ['https://finance.naver.com' + link.find('a')['href']
                 for link in html.select('.title')]
        flag = True
        result_date = []
        result_title = []
        result_contents = []
        for row in list(zip(dates, titles, links)):
            date = row[0]
            title = row[1]
            link = row[2]
            if date > end_date:
                continue
            if title in unique_news_titles:
                continue
            unique_news_titles.add(title)
            source_code = requests.get(link).text
            html = BeautifulSoup(source_code, "lxml")
            contents = str(html.select("div#news_read"))
            contents.find("<span")
            a = contents.find("<a")
            contents = remove_filename(contents[0:a])
            if processing_date != date:
                # the date changed while reading articles row by row
                result = {"날짜": result_date, "기사제목": result_title, "본문내용": result_contents}
                df_result = pd.DataFrame(result)
                df_result.to_csv(
                    f"./{NEWS_DIR}/{company_code}/{company_code}_{str(processing_date)[:10]}.csv",
                    mode='w', encoding='utf-8-sig')
                processing_date = date
                result_date.clear()
                result_title.clear()
                result_contents.clear()
            if start_date > date:
                # the article being read is older than the requested date range
                flag = False
                break
            result_date.append(date)
            result_title.append(title)
            result_contents.append(contents)
        if not flag:
            break
        # print(f"company_code: {company_code}, processing_date: {processing_date}, finished page: {page}")
        page += 1
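# `mkdir` and `remove_filename` are called above but defined elsewhere; a
# sketch of plausible implementations (NEWS_DIR is assumed to be a
# module-level constant):
import os
import re

def mkdir(company_code):
    # create the per-company output directory if it is missing
    os.makedirs(f"./{NEWS_DIR}/{company_code}", exist_ok=True)

def remove_filename(contents):
    # strip markup noise from the raw article HTML; the original cleanup
    # rules are unknown, so this sketch only drops tags
    return re.sub('<[^>]+>', '', contents).strip()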
def GetEachContents(driver, EachUrl):
    EmptyFolder = 0
    url = baseUrl + str(EachUrl)
    driver.get(url)
    driver.find_element(By.CSS_SELECTOR, "._97aPb > div:nth-child(1)").click()
    sleep(2)
    # folder path to save into
    f_url = ProjectFolder + '/MobileTest_img/' + EachUrl[3:]
    # skip if the folder already exists
    if os.path.isdir(f_url):
        print('this post has already been downloaded.')
        return EmptyFolder
    # otherwise create the folder
    else:
        os.mkdir(f_url)
    print(f_url + ' is being processed.')
    image = list()
    # FirstOne = True
    if (driver.find_elements_by_css_selector(".coreSpriteRightChevron")
            or driver.find_elements(By.CLASS_NAME, "vi798")):
        while True:
            pageString = driver.page_source
            soup = BeautifulSoup(pageString, "lxml")
            LiTagList = soup.select(".FFVAD")
            LiTagList += soup.select(".tWeCl")
            if len(LiTagList) == 0:
                driver.get(url)
                continue
            try:
                for LiTag in LiTagList:
                    image.append(LiTag.attrs['src'])
            except KeyError as keyerr:
                print(keyerr)
                print(LiTag)
                print("!!!!!!!KEYERROR!!!!!!!!!")
                driver.get(url)
                continue
            if driver.find_elements_by_css_selector(".coreSpriteRightChevron"):
                driver.find_element_by_css_selector(".coreSpriteRightChevron").click()
                sleep(1)
                print("click")
            else:
                break
    else:
        while True:
            pageString = driver.page_source
            soup = BeautifulSoup(pageString, "lxml")
            EachContent = soup.select(".FFVAD") + soup.select(".tWeCl")
            if len(EachContent) == 0:
                driver.get(url)
                continue
            else:
                image.append(EachContent[0].attrs['src'])
                break
    cnt = 0
    image = list(set(image))
    for img in image:
        cnt += 1
        if len(image) > 1:
            if ".mp4" in img:
                urllib.request.urlretrieve(img, f_url + str(cnt) + ".mp4")
            else:
                urllib.request.urlretrieve(img, f_url + str(cnt) + ".jpg")
        else:
            if ".mp4" in img:
                urllib.request.urlretrieve(img, f_url + str(cnt) + ".mp4")
            else:
                urllib.request.urlretrieve(img, f_url + str(cnt) + ".jpg")
    print(str(cnt) + " content files were saved to the folder.")
    if image == []:
        EmptyFolder += 1
    print("------------------------------")
    return EmptyFolder
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.dbsparta

naver_movie = 'https://movie.naver.com/movie/running/current.nhn'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
data = requests.get(naver_movie, headers=headers)

# hand the HTML to BeautifulSoup so it is easy to query
soup = BeautifulSoup(data.text, 'html.parser')

movies = soup.select('.lst_detail_t1 > li')
# print(movies)

for movie in movies:
    if movie.select('dt.tit'):  # select() returns a list, never None, so test truthiness
        for title_info in movie.select('dt.tit'):
            title = title_info.a.text
            link = naver_movie + title_info.a.attrs['href']
            # print(title)
        director = ', '.join(
            [d.text for d in movie.select('span.link_txt')[1].select('a')])
        if len(movie.select('span.num')) > 1:
            rate = movie.select('span.num')[1].text
            # print(rate)
        img = movie.select('div.thumb > a > img')[0].attrs['src'].split('?')[0]
from bs4 import BeautifulSoup
import requests

driver = webdriver.Chrome("<webdriver>")
driver.implicitly_wait(3)
driver.get("<url>")
driver.find_element_by_name('authUser').send_keys('<id>')
driver.find_element_by_name('authPass').send_keys('<password>')
driver.find_element_by_css_selector('.uxd-btn').click()  # click the button
driver.get("<review url>")
driver.find_element_by_css_selector('#regNothanksLink').click()  # click the button

text_file = open("Output.csv", "w")
for i in range(0, 100000):  # posts
    k = "<review url>" + str(i)
    driver.get(k)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    # notices = soup.select('p.vproductlist')
    notices = soup.select('span.vproductListItem')
    if "reviews/review/view" in requests.request("GET", k).url:
        for j in range(len(notices)):
            text_file.write(notices[j].text.strip())
            text_file.write("{0}\n".format(i))
    else:
        pass
text_file.close()
# assumes module-level imports: sys, os, re, json, pprint, requests,
# BeautifulSoup (bs4), and datetime (from datetime import datetime)
def crawlData(self, MAX_PAGE=int(sys.argv[1])):
    # initialization (note: the sys.argv default is evaluated once, at definition time)
    END_NUMBER = MAX_PAGE * 20 + 1
    QUERY_SET = {
        "title": "div.goods_info > div.goods_name > a",
    }
    PP = pprint.PrettyPrinter(indent=4, sort_dicts=False)
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    result_data = {}
    for info_number in range(1, END_NUMBER):
        bookinfo_key = "bookinfo" + str(info_number)
        result_data[bookinfo_key] = {}

    # crawling
    for page in range(1, MAX_PAGE + 1):
        req = requests.get(
            "http://www.yes24.com/24/Category/More/001001044?ElemNo=104&ElemSeq=1&PageNumber="
            + str(page)
        )
        html = req.text
        soup = BeautifulSoup(html, "lxml")
        suffix_info_number = (page - 1) * 20 + 1
        result = soup.select(QUERY_SET["title"])

        # title, URL
        for item in result:
            if len(item.text) > 0:
                bookinfo_key = "bookinfo" + str(suffix_info_number)
                result_data[bookinfo_key]["title"] = item.text
                result_data[bookinfo_key]["url"] = (
                    "https://www.yes24.com" + item.attrs["href"]
                )
                result_data[bookinfo_key]["rank"] = suffix_info_number
                suffix_info_number = suffix_info_number + 1
        suffix_info_number = (page - 1) * 20 + 1

        # author, publisher, publish_date, right_price, sales_price, isbn, page
        for number in range(suffix_info_number, suffix_info_number + 20):
            bookinfo_key = "bookinfo" + str(number)
            try:
                print("Processing " + result_data[bookinfo_key]["title"] + " ...")
                req = requests.get(result_data[bookinfo_key]["url"])
            except KeyError:  # no title collected for this slot: the last page ended early
                break
            html = req.text
            soup = BeautifulSoup(html, "lxml")
            result_data[bookinfo_key]["publisher"] = soup.select(
                "#yDetailTopWrap > div.topColRgt > div.gd_infoTop > span.gd_pubArea > span.gd_pub > a"
            )[0].text
            result_data[bookinfo_key]["publish_date"] = soup.select(
                "#yDetailTopWrap > div.topColRgt > div.gd_infoTop > span.gd_pubArea > span.gd_date"
            )[0].text
            result_data[bookinfo_key]["right_price"] = make_integer_from_string(
                soup.select(
                    "#yDetailTopWrap > div.topColRgt > div.gd_infoBot > div.gd_infoTbArea > div > table > tbody > tr > td > span > em"
                )[0].text.replace(",", "")
            )
            result_data[bookinfo_key]["sales_price"] = int(
                result_data[bookinfo_key]["right_price"] * 0.9
            )
            result_data[bookinfo_key]["isbn"] = make_integer_from_string(
                soup.select(
                    "#infoset_specific > div.infoSetCont_wrap > div > table > tbody > tr:nth-of-type(3) > td"
                )[0].text
                # //*[@id="infoset_specific"]/div[2]/div/table/tbody/tr[3]/td
            )
            # try...except handles books whose page count still shows as "being verified"
            try:
                result_data[bookinfo_key]["page"] = make_integer_from_string(
                    re.findall(
                        r"\d+쪽",  # "<n> pages"
                        soup.select("#infoset_specific > div.infoSetCont_wrap")[0].text,
                    )[0]
                )
            except IndexError:
                result_data[bookinfo_key]["page"] = -1
            result_data[bookinfo_key]["sales_point"] = (
                make_integer_from_string(
                    soup.select("span.gd_ratingArea > span.gd_sellNum")[0].text.replace(",", "")
                )
                or "none"
            )

            # author: fall back to <span> for names rendered without links
            # (select() returns an empty list rather than raising, so an if/else
            # is the right guard here, not try/except)
            result_data[bookinfo_key]["author"] = []
            authors = soup.select(
                "#yDetailTopWrap > div.topColRgt > div.gd_infoTop > span.gd_pubArea > span.gd_auth > a"
            )
            if not authors:
                authors = soup.select(
                    "#yDetailTopWrap > div.topColRgt > div.gd_infoTop > span.gd_pubArea > span.gd_auth > span"
                )
            for author in authors:
                result_data[bookinfo_key]["author"].append(author.text)

            # tags (an empty select() result is simply a no-op to iterate)
            result_data[bookinfo_key]["tags"] = []
            tags = soup.select(
                "#infoset_goodsCate > div.infoSetCont_wrap > dl > dd > ul > li > a"
            )
            tags2 = soup.select("span.tag > a")
            for tag in tags:
                result_data[bookinfo_key]["tags"].append(tag.text)
            for tag in tags2:
                result_data[bookinfo_key]["tags"].append(tag.text)

    with open(
        "yes24_" + datetime.today().strftime("%Y_%m%d_%H%M_%S") + ".json",
        "w",
        encoding="UTF-8",
    ) as outfile:
        json.dump(result_data, outfile, ensure_ascii=False)
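The method calls make_integer_from_string throughout, but the helper is not defined in this excerpt. A plausible sketch, assuming it just strips non-digit characters (the '쪽' suffix, stray commas) before converting, with -1 as the original's fallback sentinel:

import re

def make_integer_from_string(s):
    # keep only the digits, e.g. "456쪽" -> 456, "9,788" -> 9788 (assumed behavior)
    digits = re.sub(r"\D", "", s)
    return int(digits) if digits else -1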
import requests
from bs4 import BeautifulSoup

target_url = 'https://www.google.com/search?q=english&hl=en&lr=lang_en'
r = requests.get(target_url)
soup = BeautifulSoup(r.content, 'html.parser')

# .vvjwJb / .UPmit are Google's obfuscated result classes; they rotate over time
titles = [i.string for i in soup.select('.vvjwJb')]
# the visible URL is rendered as "example.com › path", so rebuild it with slashes
urls = [
    ''.join(i.string.split(' ')).replace('›', '/')
    for i in soup.select('.UPmit')
]
print(titles)
print(urls)
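Google frequently serves different, stripped-down markup to clients without a browser-like User-Agent, in which case the selectors above match nothing. A hedged variant that sends one (the header value is illustrative, and the class names may still have rotated since this was written):

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
r = requests.get(target_url, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
print([el.get_text() for el in soup.select('.vvjwJb')])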
# (fragment: the head of fma_Crawling is missing from this excerpt; the tail
# below builds one numbered entry per track and returns the page's dict)
        genre = div.find('div', {'class': 'playtxt'}).find('span', {'class': 'ptxt-genre'}).text
        temp_dict[str(i + 1)] = {'artist': str(artist),
                                 'track': str(track),
                                 'album': str(album),
                                 'genre': str(genre)}
    return temp_dict

def toJson(fma_dict):
    with open('{}_chart.json'.format(genre), 'w', encoding='utf-8') as file:
        json.dump(fma_dict, file, ensure_ascii=False, indent='\t')

fma_dict = {}
req1 = requests.get('https://freemusicarchive.org/genre/{}/?sort=track_date_published&d=1&page=1&per_page=200/'.format(genre))
source1 = req1.text
html2 = BeautifulSoup(source1, 'lxml')
# the 7th pagination link holds the last page number
final_page2 = html2.select('a[href^="https://freemusicarchive.org/genre/{}/?sort=track_date_published&d=1&page="]'.format(genre))
final_page = int(final_page2[6].text)
# the total track count sits in the third <b> of the pagination summary
final_song2 = html2.find('div', {'class': 'pagination-full'}).find_all("b")
final_song = int(final_song2[2].text)

for page in range(1, final_page + 1):
    req = requests.get('https://freemusicarchive.org/genre/{}/?sort=track_date_published&d=1&page={}&per_page=200'.format(genre, page))
    source = req.text
    html = BeautifulSoup(source, 'lxml')
    fma_dict = dict(fma_dict, **fma_Crawling(html, page))

toJson(fma_dict)
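A hypothetical reconstruction of the missing head of fma_Crawling, inferred from the 'playtxt' / 'ptxt-genre' pattern in the surviving tail; every class name other than those two is a guess and would need checking against the actual page:

def fma_Crawling(html, page):
    temp_dict = {}
    # 'play-item', 'ptxt-artist', 'ptxt-track', 'ptxt-album' are guessed classes
    for i, div in enumerate(html.find_all('div', {'class': 'play-item'})):
        artist = div.find('span', {'class': 'ptxt-artist'}).text.strip()
        track = div.find('span', {'class': 'ptxt-track'}).text.strip()
        album = div.find('span', {'class': 'ptxt-album'}).text.strip()
        genre = div.find('div', {'class': 'playtxt'}).find('span', {'class': 'ptxt-genre'}).text
        # offsetting the key by page avoids later pages overwriting earlier ones
        # in the dict(fma_dict, **...) merge (the original keys by str(i + 1)
        # alone, which repeats on every page)
        temp_dict[str((page - 1) * 200 + i + 1)] = {'artist': artist,
                                                    'track': track,
                                                    'album': album,
                                                    'genre': genre}
    return temp_dict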
import requests as rq
from bs4 import BeautifulSoup

base_url = 'https://pjt3591oo.github.io'
page_path = '/page%d'
page = 2

# first page lives at the bare base URL
res = rq.get(base_url)
soup = BeautifulSoup(res.content, 'lxml')
posts = soup.select('body main.page-content div.wrapper div.home div.p')
for post in posts:
    title = post.find('h3').text.strip()
    descript = post.find('h4').text.strip()
    author = post.find('span').text.strip()
    print(title, descript, author)

# subsequent pages live at /page2, /page3, ... until a non-200 response
while True:
    sub_path = page_path % (page)
    page += 1
    res = rq.get(base_url + sub_path)
    if (res.status_code != 200):
        break
    soup = BeautifulSoup(res.content, 'lxml')
    posts = soup.select('body main.page-content div.wrapper div.home div.p')
    for post in posts:
        title = post.find('h3').text.strip()
        descript = post.find('h4').text.strip()
        author = post.find('span').text.strip()
        print(title, descript, author)
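The two identical parse-and-print blocks could be folded into one helper. A minimal sketch of the same crawl as a generator, using the same selectors and the same non-200 stop condition:

def iter_posts():
    page = 1
    while True:
        path = '' if page == 1 else page_path % page
        res = rq.get(base_url + path)
        if res.status_code != 200:
            break
        soup = BeautifulSoup(res.content, 'lxml')
        for post in soup.select('body main.page-content div.wrapper div.home div.p'):
            yield (post.find('h3').text.strip(),
                   post.find('h4').text.strip(),
                   post.find('span').text.strip())
        page += 1

for title, descript, author in iter_posts():
    print(title, descript, author)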
def _11_shopping(lst):
    for i in gender:
        url = ("http://www.11st.co.kr/browsing/BestSeller.tmall?method=getBestSellerMain&cornerNo=2&dispCtgrNo="
               + switch_site(i))
        custom_header = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
        }
        req = requests.get(url, headers=custom_header)
        html = BeautifulSoup(req.text, "html.parser")
        items = html.select("p")

        allword = [item.text.strip() for item in items]
        del allword[:10]  # the first 10 <p> texts are page chrome, not product names

        # split product names into words, treating brackets/slashes as separators
        keyword = []
        for j in allword:
            r_j = (j.replace('[', ' ').replace(']', ' ').replace('/', ' ')
                    .replace('(', ' ').replace(')', ' '))
            keyword.extend(r_j.split(" "))
        keyword = ' '.join(keyword).split()  # drops the empty strings

        word_count = counter(keyword)  # presumably collections.Counter, imported elsewhere
        word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True)

        # drop the gender words themselves ('male', 'man', 'female', 'woman');
        # filtering into a new list avoids the index drift that deleting
        # while iterating caused in the original
        banlist = ['남성', '남자', '여성', '여자']
        word_count = [j for j in word_count if j[0] not in banlist]

        # keep only the top 20
        keyword2 = word_count[:20]

        k = 1
        for j in keyword2:
            values1 = (str(k), j[0], str(swithch_gender(i)), str(j[1]))
            query1 = ("insert into _11_shopping (rank,keyword,date_,gender,score) "
                      "values(%s,%s,cast(now() as char),%s,%s)")
            curs.execute(query1, values1)
            k = k + 1
    return 0
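switch_site and swithch_gender (spelled as the caller spells it) are not defined in this excerpt. A plausible sketch, assuming the global gender iterable holds keys like 'man'/'woman'; the dispCtgrNo values below are placeholders, not the real 11st category numbers:

def switch_site(g):
    # hypothetical mapping: gender key -> 11st dispCtgrNo (placeholder values)
    return {'man': '1001295', 'woman': '1001296'}.get(g, '')

def swithch_gender(g):
    # hypothetical mapping: gender key -> label stored in the DB
    return {'man': 'male', 'woman': 'female'}.get(g, 'unknown')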
import re

import requests
from bs4 import BeautifulSoup

from akshare.obor.get_countries_from_invest_web import get_countries_url

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'
}

web_site = get_countries_url()
for item_1 in web_site.iloc[:, 0]:
    # item_1 = web_site.iloc[:, 0][0]
    url = 'https://cn.investing.com' + item_1
    res = requests.post(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    title = soup.select('title')[0].get_text().split('-')[0].strip().split('_')[0]
    if title == "科威特股市指数":  # skip the Kuwait stock index
        continue
    # the first row of table #cr1 links to the country's main index
    index_link = soup.find_all(attrs={'id': 'cr1'})[0].find_all(
        attrs={'class': 'bold left noWrap elp plusIconTd'})[0].select('a')[0]
    useful_web = index_link['href']
    useful_title = index_link['title']
    url = 'https://cn.investing.com' + useful_web + '-historical-data'
    res = requests.post(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    # the page embeds the numeric ids needed for historical-data requests
    data = soup.find_all(text=re.compile('window.histDataExcessInfo'))[0].strip()
    para_data = re.findall(r'\d+', data)
    start_date = '2000/01/01'
    end_date = '2019/10/17'
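A short note on what para_data holds: the embedded script is typically of the form window.histDataExcessInfo = {pairId: ..., smlId: ...}, so the digit scan yields those two ids in order. A sketch of unpacking them into named variables (the ordering assumption follows akshare's usage of this page and is worth verifying against the raw script text):

# assumed: para_data[0] is the pair id, para_data[1] the sml id
pair_id, sml_id = para_data[0], para_data[1]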
from __future__ import print_function

import sys
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys

browser = webdriver.PhantomJS(executable_path='/usr/bin/phantomjs')
# browser = webdriver.Firefox()
browser.get(
    "https://www.agoda.com/pages/agoda/default/DestinationSearchResult.aspx?asq=zWuVSTFwAmUZtJhrjzSYy5ufa9Vwpz6XltTHq4n%2B9gMYSfr7u1CU1i2lx00TDWH67lxWsQ6v%2FrbtGwzAUB%2FtOU%2FdDeCkxleINu%2BSBVhHZM%2BIpGI3GSP9dWr%2F8u9MCc9T2OGPRUf%2FnqWVFuWaH2y7CrS7mFrDxsW1r6%2BWtQtj5qO6pb0fC98X0j%2F7ua2%2FHygyWaTGybgLZnzu83SuX64zYXSk%2FM8eVuQYqDHVLhv%2F6oNjjoTmpFlSkVcSfnu9ryzz4KE%2FoYnM%2Fefy83sE%2FJDBPA%3D%3D&city=4951&cid=1732641&tag=41460a09-3e65-d173-1233-629e2428d88e&gclid=Cj0KEQjwxbDIBRCL99Wls-nLicoBEiQAWroh6uLlQWnHWRlc9Euu6Pg_XC1NRtBzj5Yb8HkVs-MjQLMaAigh8P8HAQ&tick=636295974842&txtuuid=c48ab805-f9ed-45d4-bb4a-2377625889d9&pagetypeid=103&origin=TW&aid=81837&userId=5fcd3f05-8c16-4426-acdf-5ee6bb07f69f&languageId=20&sessionId=xhjywu5gunsz0c5oexhquovf&storefrontId=3&currencyCode=TWD&htmlLanguage=zh-tw&trafficType=User&cultureInfoName=zh-TW&textToSearch=%E5%8F%B0%E5%8C%97%E5%B8%82&guid=c48ab805-f9ed-45d4-bb4a-2377625889d9&isHotelLandSearch=true&checkIn=2017-05-14&checkOut=2017-05-15&los=1&rooms=1&adults=2&children=0&ckuid=5fcd3f05-8c16-4426-acdf-5ee6bb07f69f&priceCur=TWD&hotelReviewScore=5"
)
soup = BeautifulSoup(browser.page_source, "html.parser")
# keep paging while a "next" button is present on the page
while len(soup.select('.btn.btn-right')) > 0:
    for ele in soup.select('.hotel-info h3'):
        print(ele.text)
        # print(ele.text.encode(sys.stdin.encoding, "replace").decode(sys.stdin.encoding))
    browser.find_element_by_id("paginationNext").click()
    time.sleep(3)
    soup = BeautifulSoup(browser.page_source, "html.parser")
browser.close()
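Rather than the fixed time.sleep(3) after each click, Selenium's explicit waits poll until the next page's content is actually present. A minimal sketch of the same click-and-reparse step (reusing the .hotel-info selector from the loop above; 10 seconds is an arbitrary timeout):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

browser.find_element_by_id("paginationNext").click()
# wait up to 10s for hotel cards to appear instead of sleeping blindly
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, ".hotel-info h3"))
)
soup = BeautifulSoup(browser.page_source, "html.parser")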