def parse_pagination(html) -> Tuple[List, str]:
    """find all job links and next page link in pagination html"""
    print("Parse Pagination!")
    sel = Selector(text=html)
    jobs = sel.css('div.item h3 a::attr(href)').extract()
    next_page = sel.css('a[aria-label=Next]::attr(href)').extract_first()
    return jobs, next_page
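# Added sketch, not part of the original snippet: a crawl loop that parse_pagination
# above could plug into. `download(url)` is a hypothetical helper that returns the
# page HTML, and the start URL is a placeholder.
def crawl_all_jobs(start_url="https://example.com/jobs?page=1"):
    all_jobs = []
    next_page = start_url
    while next_page:
        jobs, next_page = parse_pagination(download(next_page))
        all_jobs.extend(jobs)
    return all_jobs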
def parse_gallery(html):
    title = html.css("a::text").get()
    url = html.css("a::attr(href)").get()
    slides = []
    gallery_html = download(ROOT + url)
    gallery_page_document = Selector(text=gallery_html)
    for slide_element in gallery_page_document.css(".rsSlideContent"):
        slide = slide_element.css("img::attr(src)").get()
        slides = slides + [slide]
    next_gallery_page_url = gallery_page_document.css(
        ".gall_next_page > a::attr(href)"
    ).get()
    while next_gallery_page_url is not None:
        gallery_html = download(ROOT + url + next_gallery_page_url)
        gallery_page_document = Selector(text=gallery_html)
        for slide_element in gallery_page_document.css(".rsSlideContent"):
            slide = slide_element.css("img::attr(src)").get()
            slides = slides + [slide]
        next_gallery_page_url = gallery_page_document.css(
            ".gall_next_page > a::attr(href)"
        ).get()
    slides = [slide for slide in slides if slide is not None]
    return (title, url, GalleryContent(slides), None)
def test_check_server():
    agent = make_connected_agent()
    agent.start(auto_register=False)

    port = random.randint(5000, 9999)
    agent.web.start(hostname="0.0.0.0", port=port)

    # wait for web server to be up
    counter = 0
    while counter < 4:
        if agent.web.server is not None:
            break
        counter += 1
        time.sleep(0.1)
    assert agent.web.server is not None

    response = requests.get(f"http://localhost:{port}/spade")
    sel = Selector(text=response.text)

    assert sel.css("title::text").get() == "fake agent"
    assert sel.css("img::attr(src)").get() == agent.avatar
    assert sel.css("ul.products-list > li").getall() == []

    agent.stop()
def weibo_spider(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    # build a selector from the response text
    css_text = Selector(text=response.text)
    wrap_list = css_text.css('.card')
    # first result of the search
    weibo_detail_url = 'https:' + wrap_list[0].css(
        '.content .from a[target=_blank]::attr(href)').extract_first()
    print('Top-ranked detail page for this search --> %s ' % weibo_detail_url)
    # id used for the comment link
    comment_id = css_text.css('.card-wrap')[0].css(
        '::attr(mid)').extract_first()
    print('comment_id of this post --> %s ' % comment_id)
    # user name
    user_name = wrap_list[0].css('.info div a[class=name]::attr(nick-name)'
                                 ).extract_first()  # a[class=name]
    print('User who published this post --> %s ' % user_name)
    # uid
    user_uid = wrap_list[0].css(
        '.info div a[class=s-btn-c]::attr(uid)').extract_first()
    print('UID of this user --> %s ' % user_uid)
    # personal page
    personal_page = wrap_list[0].css(
        '.info div a[class=name]::attr(href)').extract_first()
    print('Personal page of this user --> %s ' % personal_page)
    return comment_id
def crawl_detail_page(url, title, news_type):
    resp_text = requests.get(url).text
    sel = Selector(resp_text)
    author = sel.css('#articlewrap > div.article-info > div > a::text').extract_first() or \
        sel.css('#articlewrap > div.article-info > div::text').extract_first() or ''
    author = author.replace("\n", "").strip()
    print(f'Saving data {news_type} {title} {author} {url} to the database')  # how to persist is left to the user
def job():
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [
        ("user-agent",
         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"),
        ('authority', 'www.skybet.com'),
        ('method', 'GET'),
        ('path', '/football/specials/transfer-specials'),
        ('scheme', 'https'),
        ('accept',
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
        ('accept-encoding', 'gzip, deflate, br'),
        ('accept-language', 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2'),
        ('cache-control', 'max-age=0'),
        ('cookie',
         'sbgCFcustrandno01=82.2; sbgCAcid01=3EBFE797C814A6A4B6B182391BA5B43B; _ga=GA1.2.200552123.1496285371; betCEsessid01=3q9ad11ench0v48vqd6o2cvnf3; sbgCEsitevisitor=11; sbgCEsiteactiveBet=7b674d7ff7c810a5bad7b5f11999f7fd; sbgCAtrial01=betCOB,BetNGU,betTrial7,bingoappNativeNavBar,vegasappNativeNavBar; beta_site_stick=YmV0YQ==; sbgCEsiteactiveSSO=cc89f90083ecb66e2cfc2c50a43dc22d; config=%7B%22attemptedSSOLogin%22%3Atrue%7D; SSOSESSID=828d661bfc4a76371081e3fe508ef81f; __SBA_POLLTIME=1499223165446|loggedOut|true; SSO_SESS_ID=828d661bfc4a76371081e3fe508ef81f; betSsoAutoLoginAttempted=1499223668675; s_pers=%20s_vnum%3D1501814810661%2526vn%253D1%7C1501814810661%3B%20s_invisit%3Dtrue%7C1499225410331%3B%20s_getNewRepeat%3D1499223610336-Repeat%7C1501815610336%3B; s_sess=%20s_ctq%3D0%3B%20s_cc%3Dtrue%3B%20c%3DundefinedDirect%2520LoadDirect%2520Load%3B%20s.prop69%3Dvisitnumber_1%3B%20s_sq%3D%3B'),
        ('upgrade-insecure-requests', '1')
    ]
    br.open(url)
    print(br.response)

    s = requests.session()
    text = s.get(url, headers=header).text
    print(text)

    selector = Selector(text=text)
    print(len(selector.css('div.mktgrp > * >table')))
    print(len(selector.css('div.mktgrp > * >h3')))

    player_odds_list = []
    club_odds = []
def get_authors_list(self, seek_type=1):
    """
    Get all authors from http://qsc.zww.cn/. The seek_type is always 1.
    """
    # To-do: the list pages are hardcoded; there are currently only 90 pages,
    # but there should be a more intelligent way to get the total page count.
    for i in range(1, 5):
        req = self.poetry_request(1, i)
        content = req.content.decode('gb2312', 'ignore')  # Decode as Mandarin
        selector = Selector(text=content)
        content_in_script = selector.css('script').get()
        for line in content_in_script.splitlines():  # Iterate each author list page
            if not line.startswith('parent.QTS.filllist'):
                # This html text line doesn't contain the values we evaluate.
                continue
            inner_selector = Selector(text=line)
            anchor_lines = inner_selector.css('a').getall()
            for anchor in anchor_lines:  # Iterate each anchor tag
                if re.match(r'.*doseek2\((10,.*,.*)\);.*', anchor):
                    # doseek2(10, .*, .*) stands for an author who has one or more poems on this website.
                    author = re.search(r'(?<=\);">)[^\…]+', anchor).group(0)
                    seek_type, value, pageno = re.search(
                        r'(?<=onclick="doseek2\()[^\)]+', anchor).group(0).split(',')
                    desc = self.get_author_info(seek_type, value, 1)  # Pageno should always be 1
                    with scopedsession() as session:
                        # Add every author to DB because adding a batch of keys may have some primary keys exist in the table,
                        # and it will trigger rollback w/o adding the non-existing new keys.
                        session.add(
                            CiAuthor(
                                name=author,
                                desc=desc
                            )
                        )
def get_winelab_price(url, result_list=None, need_print=False):
    try:
        cookies = dict(currentRegion='RU-PRI')
        site_page = requests.get(url, cookies=cookies, timeout=3)
        # write_to_file(site_page, '/root/pricepinger_v2/winelab.txt')
        sel = Selector(site_page.text)
    except Exception:
        if need_print:
            print("Error fetching url " + url)
        return None

    # print(site_page.status_code)
    if site_page.status_code != requests.codes.ok:
        return None

    # title = sel.css('title::text').extract_first()
    # title = sel.css('div.description::text').extract_first()  # .capitalize()
    title = sel.css('h1::text').extract_first()  # .capitalize()
    price_purple = format_price(sel.css('span.purple::text').extract_first())
    price_yellow = format_price(sel.css('span.yellow::text').extract_first())
    price_green = format_price(sel.css('span.green::text').extract_first())
    price = max(price_purple, price_yellow, price_green)
    date_ts = datetime.datetime.now()
    result = [title, float(price), url, 'winelab', date_ts]
    if result_list is not None:
        result_list.append(result)
    if need_print is True:
        echo_list_to_sting(result)
    return result
def parse_job(html) -> dict:
    """find job details in job listing page"""
    sel = Selector(text=html)
    # setup some processing helpers
    join = lambda css, sep='': sep.join(sel.css(css).extract()).strip()
    first = lambda css: sel.css(css).extract_first(' ').strip()

    item = {}
    item['title'] = sel.css('h2.title::text').extract_first()
    item['location'] = join('.job-meta a::text', ', ')
    item['job_type'] = join('ul.list-unstyled a::text')
    item['posted_date'] = join(
        'div#affix-box p:contains("Posted:")::text').split(': ')[1]
    item['saved_times'] = join(
        'div#affix-box div:contains("Saved ")>strong::text')
    item['description'] = join('div.box-item-details p ::text')
    item['views'] = first(
        'div#affix-box li:contains("unique views")>strong::text')
    item['unique_views'] = first(
        'div#affix-box li:contains("views")>strong::text')

    bullets = lambda css: [
        ''.join(bullet.css('::text').extract()) for bullet in sel.css(css)
    ]
    h4_bullet = 'div.box-item-details h4:contains("{}")+ul>li'.format
    h3_bullet = 'div.box-item-details h3:contains("{}")+ul>li'.format
    item['about_you'] = bullets(h4_bullet('About You'))
    item['your_role'] = bullets(h4_bullet('Your role'))
    item['requirements'] = bullets(h4_bullet('Requirements'))
    item['nice_to_have'] = bullets(h4_bullet('Nice to have'))
    item['why_work_with_us'] = bullets(h4_bullet('Why work with us'))
    item['desired_skills'] = bullets(h3_bullet('Desired Skills'))
    item['contact'] = bullets(h3_bullet('Contact Info'))
    return item
def scrape_noticia(html_content):
    selector = Selector(text=html_content)
    url = selector.css("head link[rel=canonical]::attr(href)").get()
    title = selector.css("h1#js-article-title::text").get()
    timestamp = selector.css("#js-article-date::attr(datetime)").get()
    writer = selector.css(".z--font-bold::text").get()
    shares_count = selector.css(".tec--toolbar__item::text").get()
    comments_count = selector.css(".tec--btn::attr(data-count)").get()
    summary = selector.css(
        ".tec--article__body > p:nth-child(1) ::text").getall()
    sources = selector.css(".z--mb-16 .tec--badge::text").getall()
    categories = selector.css("#js-categories .tec--badge::text").getall()
    if not writer:
        writer = selector.css(".z--font-bold a::text").get()
    if not shares_count:
        shares_count = 0
    else:
        shares_count = str(shares_count).split()[0]
    sources = [str(source).strip() for source in sources]
    categories = [str(category).strip() for category in categories]
    return {
        "url": url,
        "title": title,
        "timestamp": timestamp,
        "writer": str(writer).strip(),
        "shares_count": int(shares_count),
        "comments_count": int(comments_count),
        "summary": "".join(summary),
        "sources": sources,
        "categories": categories,
    }
def get_dilan_price(url, result_list=None, need_print=False):
    try:
        site_page = requests.get(url, timeout=20)
        sel = Selector(site_page.text)
    except Exception:
        if need_print:
            print("Error fetching url " + url)
        return None

    if site_page.status_code != requests.codes.ok:
        return None

    # write_to_file(site_page, '/root/pricepinger_v2/dilan.txt')
    # title = sel.css('title::text').extract_first().split(' -')[0]
    title = sel.css('h1::text').extract_first()  # .capitalize()
    # price = sel.css('div.__price::text').extract_first().split()[0]
    price = sel.css('div.price::text').extract_first().split()[0]
    date_ts = datetime.datetime.now()
    result = [title, float(price), url, 'Dilan', date_ts]
    if result_list is not None:
        result_list.append(result)
    if need_print is True:
        echo_list_to_sting(result)
    return result
def detailData(self, response, **kwargs):
    titleName = response.xpath('/html/body/div[2]/div[2]/div/p[2]/text()'
                               ).extract_first()  # get the title
    select = Selector(response.text)
    publishTime = select.css('.p2 span::text').get()  # get the publish time
    excelData = select.css('.Section1').getall()  # table data
    if excelData:  # if the content is a table, write it to its own TXT file; otherwise store it normally
        print('This is a table')
        content = excelData[0]
        try:
            with open('./' + titleName + '.txt', 'a+', encoding='utf-8') as f:
                f.write(titleName)
                f.write('\n')
                f.write('\n')
                f.write(publishTime)
                f.write('\n')
                f.write('\n')
                f.write(content)
                f.write('\n')
        except IOError as ex:
            print('Error writing to target file, reason:', ex)
    else:
        content = select.css('.p3 P span::text').getall()
        self.writeData(titleName=titleName, publishTime=publishTime, content=content)
def parse_song(html: Selector):
    lyrics = ''.join(
        html.css(
            'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div:nth-child(8)::text'
        ).getall()).strip()
    title = html.css(
        'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > b ::text'
    ).get()
    band = html.css(
        'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div.lyricsh > h2 > b ::text'
    ).get()[:-len(' lyrics')]
    written_by = html.css(
        'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div:nth-child(17) > small ::text'
    ).get()
    if written_by:
        written_by = [
            author.strip()
            for author in written_by[len('Writer(s): '):].split(',')
            if author
        ]
    return {
        'lyrics': lyrics,
        'title': title,
        'band': band,
        'written_by': written_by,
        'album': None
    }
def parse_detail(self, detail_page):
    """
    :param detail_page:
    :return:
    """
    selector = Selector(detail_page)
    title = selector.css('h1 span ::text').get()
    sorce = selector.xpath('//strong/text()').get()
    for detail in selector.css('.indent #info'):
        author = detail.xpath(
            '//span[@class="pl"][text()=" 作者"]/following-sibling::a/text()'
        ).get()
        publish_house = detail.xpath(
            '//*[text()="出版社:"]/following-sibling::text()').get()
        Producer = detail.xpath(
            '//*[text()="出品方:"]/following-sibling::a/text()').get()
        Producer_year = detail.xpath(
            '//*[text()="出版年:"]/following-sibling::text()').get()
        page_num = detail.xpath(
            '//*[text()="页数:"]/following-sibling::text()').get()
        price = detail.xpath(
            '//*[text()="定价:"]/following-sibling::text()').get()
        Binding = detail.xpath(
            '//*[text()="装帧:"]/following-sibling::text()').get()
        ISBN = detail.xpath(
            '//*[text()="ISBN:"]/following-sibling::text()').get()
def song_info(html: Selector):
    author_css = 'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div.lyricsh > h2 > b ::text'
    lyrics = 'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div:nth-child(8) ::text'
    author = html.css(author_css).get()
    text = html.css(lyrics).getall()
    return {'author': author, 'lyrics': ''.join(text)}
def extract_technique_synergy(sel: parsel.Selector):
    if sel.css('a::attr(title)').get() == 'Temtem Types':
        return None
    try:
        return sel.css('a::attr(title)').get().split()[0]
    except AttributeError:
        return None
def song_info(html: Selector):
    author_css = 'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div.lyricsh > h2 > b ::text'
    lyrics = 'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div:nth-child(8) ::text'
    album_css = 'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div.panel.songlist-panel.noprint > div.songinalbum_title > b ::text'
    author = html.css(author_css).get()
    text = html.css(lyrics).getall()
    album = html.css(album_css).get()
def crawl_detail_page(url, title, news_type):
    resp_text = requests.get(url).text
    sel = Selector(resp_text)
    author = sel.css('#articlewrap > div.article-info > div > a::text').extract_first() or \
        sel.css('#articlewrap > div.article-info > div::text').extract_first() or ''
    author = author.replace("\n", "").strip()
    print(f'Simulating a database save with print: {news_type} {title} {author} {url}')
    # In practice this would call a database insert; there is no need to return an item and save it in another file.
def tv(canal):
    response = get(f'https://meuguia.tv/programacao/canal/{canal}').text
    s = Selector(response)
    return {
        'nome': s.css('h2::text').get(),
        'inicio': s.css('div.time::text').get(),
        'tipo': s.css('h3::text').get(),
    }
def requestAllNames(url):
    response = requests.get(defaultUrl + url)
    selector = Selector(text=response.text)
    titles = selector.css(".product_pod > h3 > a::text").getall()
    for title in titles:
        print(title)
    next = selector.css(".next > a::attr(href)").get()
    if next:
        return requestAllNames(next)
def get_html(title_url):
    result = requests.get(title_url)
    se = Selector(result.text)
    title = se.css('div.content-title p::text').get()
    content = se.css('div.content-title ul span::text').getall()
    view_num = se.css('#views::text').get()
    print(title)
    print(content)
    print(view_num)
def parse_trun(self, response):  # parse the match rounds and teams
    logging.debug(response.url)
    sch_trun = response.css("table.linkblock a.OddsLink").extract()
    # schInfo["schedule_teams"] = schInfo["id"]
    if len(sch_trun) == 0:  # no rounds
        scheduleInfo = copy.deepcopy(response.meta["scheduleInfoObj"])
        logging.debug(scheduleInfo)
        schInfo = ScheduleInfo()
        schInfo["id"] = scheduleInfo["id"]
        schInfo["area"] = scheduleInfo["area"]
        schInfo["country"] = scheduleInfo["country"]
        schInfo["match_name"] = scheduleInfo["match_name"]
        schInfo["sch_idx"] = scheduleInfo["sch_idx"]
        schInfo["sch_name"] = scheduleInfo["sch_name"]
        schInfo["sch_type"] = scheduleInfo["sch_type"]
        schInfo["sch_group"] = scheduleInfo["sch_group"]
        schInfo["sch_trun"] = "无"
        schInfo["id"] = response.meta["scheduleInfoObj"]["id"] + "_0"
        schInfo["sch_url"] = response.url.replace(self.base_url, "")
        yield schInfo
    else:  # has rounds
        for t in sch_trun:
            odds_sel = Selector(text=t)
            trun_name = odds_sel.css("a::text").extract_first()
            if trun_name == None or trun_name.strip() == "":
                trun_name = odds_sel.css("a b::text").extract_first()
            if trun_name == None or trun_name.strip() == "":
                continue
            trun_name = trun_name.strip()
            tmp_name = trun_name.encode("utf-8")
            if tmp_name == None or tmp_name == "" or tmp_name == "全部":
                continue
            scheduleInfo = copy.deepcopy(response.meta["scheduleInfoObj"])
            logging.debug(scheduleInfo)
            schInfo = ScheduleInfo()
            schInfo["id"] = scheduleInfo["id"]
            schInfo["area"] = scheduleInfo["area"]
            schInfo["country"] = scheduleInfo["country"]
            schInfo["match_name"] = scheduleInfo["match_name"]
            schInfo["sch_idx"] = scheduleInfo["sch_idx"]
            schInfo["sch_name"] = scheduleInfo["sch_name"]
            schInfo["sch_type"] = scheduleInfo["sch_type"]
            schInfo["sch_group"] = scheduleInfo["sch_group"]
            schInfo["sch_trun"] = trun_name
            schInfo["id"] = response.meta["scheduleInfoObj"][
                "id"] + "_" + trun_name
            schInfo["sch_url"] = odds_sel.xpath(
                "//@href").extract_first().strip()
            yield schInfo
def parse_article(self, url, html) -> dict:
    """Parse html for data"""
    sel = Selector(text=html)
    data = {
        'url': url,
        'date': sel.css('time::attr(datetime)').extract_first(),
        'title': sel.css('h1 ::text').extract_first(),
    }
    return data
def _get_item(self, entry: Selector):
    video_id = entry.css('videoId::text').get()
    return PodcastItem(
        item_id=video_id,
        title=entry.css('title::text').get(),
        description=entry.css('description::text').get(),
        date=datetime.fromisoformat(entry.css('published::text').get()),
        image=entry.css('group > thumbnail::attr(url)').get(),
        content_type="video/mp4",
    )
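# Added, hedged sketch (not from the original source): one way the `entry` selectors
# passed to _get_item above could be produced. It assumes a YouTube channel Atom feed
# and strips XML namespaces so the bare css() names ("videoId", "group > thumbnail") match.
import requests
from parsel import Selector

def iter_feed_entries(channel_id: str):
    feed_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"
    feed = Selector(text=requests.get(feed_url).text, type="xml")
    feed.remove_namespaces()  # lets namespaced elements be selected by local name
    yield from feed.css("entry")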
def scrape_noticia(html_content):
    selector = Selector(text=html_content)
    url = selector.css("meta[property='og:url']::attr(content)").get()
    title = selector.css("h1#js-article-title::text").get()
    timestamp = selector.css("#js-article-date::attr(datetime)").get()
    writer = selector.css("a.tec--author__info__link::text").get()
    shares_count = selector.css("div.tec--toolbar__item::text").re_first(
        r"\d+")
    comments_count = selector.css("#js-comments-btn::text").re_first(r"\d+")
    summary = "".join(
        selector.css(".tec--article__body > p:first-child *::text").getall())
    sources = selector.css("div.z--mb-16 .tec--badge::text").getall()
    categories = selector.css("#js-categories a.tec--badge::text").getall()
    return {
        "url": url,
        "title": title,
        "timestamp": timestamp,
        "writer": writer.strip() if writer else writer,
        "shares_count": (int(shares_count) if shares_count else 0),
        "comments_count": (int(comments_count) if comments_count else 0),
        "summary": summary,
        "sources": [source.strip() for source in sources] if sources else sources,
        "categories": [categorie.strip() for categorie in categories] if categories else categories,
    }
def parse(html):
    document = Selector(text=html)
    memes = [
        catch_errors(parse_meme, element)
        for element in document.css("main .media-element")
    ]
    memes = [meme for meme in memes if meme is not None]
    title = document.css("title::text").get()
    next_page_url = "/kwejk/page/" + get_last_part_url(
        document.css(".btn-next::attr(href)").get())
    return Page(title, memes, next_page_url)
def download_one(url):
    response = requests.get(url, timeout=40)
    response.raise_for_status()
    response.encoding = "gbk"
    sel = Selector(response.text)
    title = sel.css('em::text').get()
    f = open(title + '.txt', mode='w', encoding='gbk')
    f.write(title)
    for line in sel.css('#content::text').getall():
        print(line.strip(), file=f)
    f.close()
def parse(html):
    document = Selector(text=html)
    memes = [
        catch_errors(parse_meme, element)
        for element in document.css("article.story")
    ]
    memes = [meme for meme in memes if meme is not None]
    title = document.css("title::text").get()
    next_page_url = "/anonimowe/page/" + find_id_in_url(
        document.css("nav.pagination > div.next > a::attr(href)").get()
    )
    return Page(title, memes, next_page_url)
def parse(self, response):
    sel = Selector(text=response.body.decode('utf-8'))
    products = sel.css('.product__title::attr(href)').extract()
    yield merge({'products': products},
                pick(['status', '_url'], vars(response)))
    url = sel.css('.js-next::attr(href)').extract_first()
    if url:
        req = sub(r'\?.*', '', response.request.url) + url
        yield Request(req, callback=self.parse)
def parse(html):
    document = Selector(text=html)
    memes = [
        catch_errors(parse_meme, element)
        for element in document.css(".demotivator")
    ]
    memes = [meme for meme in memes if meme is not None]
    title = document.css("title::text").get()
    next_page_url = "/demotywatory/page/" + get_last_part_url(
        document.css("a.next-page::attr(href)").get())
    return Page(title, memes, next_page_url)
async def test_stop(test_client):
    agent = Agent("jid@server", "password")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/spade/stop")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("div.alert-warning > span::text").get().strip() == "Agent is stopping now."

    with LogCapture() as log:
        try:
            await client.get("/spade/stop/now/", timeout=0.0005)
        except requests.exceptions.ReadTimeout:
            pass
        log.check_present(('spade.Web', 'WARNING', "Stopping agent from web interface."))

    counter = 5
    while agent.is_alive() and counter > 0:
        counter -= 0.5
        time.sleep(0.5)
    assert not agent.is_alive()
async def test_request_home(test_client):
    agent = make_connected_agent("jid@server", "password")
    future = agent.start(auto_register=False)
    future.result()
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/spade")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("title::text").get() == "jid agent"
    assert sel.css("img::attr(src)").get() == agent.avatar
    assert sel.css("ul.products-list > li").getall() == []

    agent.stop()
def getheaders():
    """Parse the uuid and Xsrftoken out of the page source."""
    z1 = s.get('https://www.zhihu.com/')
    sel = Selector(z1.text)
    jsdata = sel.css('div#data::attr(data-state)').extract_first()
    xudid = json.loads(jsdata)['token']['xUDID']
    xsrf = json.loads(jsdata)['token']['xsrf']
    headers = headers_raw_to_dict(post_headers_raw)
    headers['X-UDID'] = xudid
    headers['X-Xsrftoken'] = xsrf
    return headers
def load_chapters(url):
    """
    Loads all chapters from a manga comic and returns a list of dictionaries
    with related data.

    :return: chapter list in asc order
    """
    text = requests.get(url).text
    sel = Selector(text)
    hel_gen = sel.css(".chlist h3, .chlist h4")
    chapter_gen = map(hel_to_chapter, hel_gen)
    available_chapter_gen = filter(lambda v: v['title'], chapter_gen)
    return reversed(list(available_chapter_gen))
async def test_add_get(test_client):
    agent = Agent("jid@server", "password")
    agent.web.add_get("/test", lambda request: {"number": 42}, "examples/hello.html")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/test")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("h1::text").get().strip() == "42"

    agent.stop()
def main(argv=None, progname=None):
    parser = argparse.ArgumentParser(prog=progname, description=__doc__)
    parser.add_argument('expr', metavar='EXPRESSION',
                        help="A CSS expression, or an XPath expression if --xpath is given.")
    parser.add_argument('file', metavar='FILE', nargs='?',
                        help="If missing, it reads the HTML content from the standard input.")
    parser.add_argument('--xpath', action='store_true',
                        help="Given expression is an XPath expression.")
    parser.add_argument('--re', metavar='PATTERN',
                        help="Apply given regular expression.")
    parser.add_argument('--encoding', metavar='ENCODING', default='utf-8',
                        help="Input encoding. Default: utf-8.")
    parser.add_argument('--repr', action='store_true',
                        help="Output result object representation instead of as text.")
    # TODO: Output this and parsel version.
    args = parser.parse_args(argv)

    if args.file:
        text = open(args.file).read()
    else:
        text = sys.stdin.read()

    if isinstance(text, six.binary_type):
        try:
            text = text.decode(args.encoding)
        except UnicodeDecodeError:
            parser.error("Failed to decode input using encoding: %s" % args.encoding)

    sel = Selector(text=text)

    if args.xpath:
        result = sel.xpath(args.expr)
    else:
        result = sel.css(args.expr)

    if args.re:
        regex = args.re.encode(args.encoding)
        regex = regex.decode('string_escape' if six.PY2 else 'unicode_escape')
        out = result.re(re.compile(regex, re.IGNORECASE | re.UNICODE))
    else:
        out = result.extract()

    if args.repr:
        pprint.pprint(out)
    else:
        print("\n".join(out))

    return 0
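# Added illustration with made-up sample data (not from the original source): the
# library calls the CLI above boils down to, i.e. a CSS selection followed by an
# optional case-insensitive regex over the matched text.
import re
from parsel import Selector

sample_html = "<ul><li>Price: 10 EUR</li><li>Price: 12 EUR</li></ul>"
sel = Selector(text=sample_html)
result = sel.css("li::text")
print(result.re(re.compile(r"\d+", re.IGNORECASE | re.UNICODE)))  # ['10', '12']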
def get_alexa_demographics(url, db_session=False):
    if db_session is not False:
        result = list(db_session.query(WebsitesCache).filter_by(link=url))
        if len(result) > 0 and result[0].male_ratio_alexa >= 0:
            return float(result[0].male_ratio_alexa), float(result[0].female_ratio_alexa)
    else:
        return 0.0, 0.0

    orig_url = url
    url = "http://www.alexa.com/siteinfo/" + url
    response = requests.get(url)
    # We need the decode part because Selector expects unicode.
    selector = Selector(response.content.decode('utf-8'))
    bars = selector.css("#demographics-content .demo-col1 .pybar-bg")

    values = []
    for bar in bars:
        value = bar.css("span::attr(style)").extract()[0]
        value = int(re.search(r'\d+', value).group())
        values.append(value)

    male_ratio = 0.0
    female_ratio = 0.0
    if sum(values) == 0:
        print("No alexa rating for " + url)
    else:
        male_ratio = float(values[0] + values[1]) / sum(values)
        female_ratio = float(values[2] + values[3]) / sum(values)

    print(url)
    print(values)
    print(male_ratio, female_ratio)

    # Do we want to cache the result?
    if db_session is not False:
        try:
            db_session.query(WebsitesCache).filter(WebsitesCache.link == orig_url) \
                .update({
                    'male_ratio_alexa': male_ratio,
                    'female_ratio_alexa': female_ratio
                })
            db_session.commit()
        except:
            print("Could not update " + url)

    return male_ratio, female_ratio
async def test_get_messages(test_client):
    agent = Agent("jid@server", "password")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    # add messages to trace
    for i in range(5):
        msg = Message(body=str(i), sender="{}@server".format(i), to="receiver@server")
        agent.traces.append(msg)

    response = await client.get("/spade/messages/")
    response = await response.text()

    sel = Selector(text=response)
    assert len(sel.css("ul.timeline > li").getall()) == 6  # num messages + end clock

    agent.stop()
def download_chapter(chapter, folder_name):
    """
    Grabs all images from a chapter and writes them down to filesystem.
    """
    folder_name = werkzeug.utils.secure_filename(folder_name)

    # if the folder does not exist ...
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)

    text = requests.get(chapter['href']).text
    sel = Selector(text)
    for value in sel.css("select[class='m'] > option::attr(value)").extract():
        value = int(value)
        url = re.sub(r'\d+\.html', '%d.html' % value, chapter['href'])
        download_page(url, folder_name)
async def test_get_behaviour(test_client):
    class EmptyOneShotBehaviour(OneShotBehaviour):
        async def run(self):
            self.kill()

    agent = Agent("jid@server", "password")
    behaviour = EmptyOneShotBehaviour()
    agent.add_behaviour(behaviour)
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/spade/behaviour/OneShotBehaviour/EmptyOneShotBehaviour/")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("section.content-header > h1::text").get().strip() == "OneShotBehaviour/EmptyOneShotBehaviour"

    agent.stop()
async def test_add_post(test_client):
    agent = Agent("jid@server", "password")

    async def handle_post(request):
        form = await request.post()
        number = form["number"]
        return {"number": number}

    agent.web.add_post("/test", handle_post, "examples/hello.html")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.post("/test", data={"number": 1024})
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("h1::text").get() == "1024"

    agent.stop()
def download_page(url, folder_name):
    text = requests.get(url).text
    sel = Selector(text)
    for src in sel.css("img[id='image']::attr(src)").extract():
        basename = os.path.basename(src)
        safe_basename = werkzeug.utils.secure_filename(basename)
        filename = os.path.join(folder_name, safe_basename)
        filename = os.path.abspath(filename)

        # file is not there or has an invalid size ...
        if not os.path.exists(filename) or os.path.getsize(filename) == 0:
            data = requests.get(src).content
            with open(filename, 'wb') as file:
                file.write(data)
            print('{0} written.'.format(filename))
        else:
            print('{0} exists. Skipping.'.format(filename))
def _find_match(self, sel: Selector) -> Match:
    xpath = lambda x: sel.xpath(x).extract_first(default='').strip()
    item = Match()
    item['url'] = urljoin(self.url_base, xpath(".//a/@href"))
    item['id'] = (re.findall(r'matches/(\d+)', item['url']) or [None])[0]
    item['game'] = next((g for g in self.games if g in item['url'].lower()))
    item['time'] = xpath("td[@class='status']/span/text()")
    item['time_secs'] = time_to_seconds(item['time'])
    item['timestamp'] = int((datetime.now() + timedelta(item['time_secs'])).timestamp())
    item['t1'] = xpath(".//span[contains(@class,'opp1')]/span/text()")
    item['t1_country'] = xpath(".//span[contains(@class,'opp1')]/span[contains(@class,'flag')]/@title")
    item['t1_country_short'] = xpath(".//span[contains(@class,'opp1')]"
                                     "/span[contains(@class,'flag')]/@class").split()[-1]
    item['t2'] = xpath(".//span[contains(@class,'opp2')]/span/text()")
    item['t2_country'] = xpath(".//span[contains(@class,'opp2')]/span[contains(@class,'flag')]/@title")
    item['t2_country_short'] = xpath(".//span[contains(@class,'opp2')]"
                                     "/span[contains(@class,'flag')]/@class").split()[-1]
    scores = sel.css('.score::text').extract()
    item['t1_score'] = scores[0] if scores else None
    item['t2_score'] = scores[1] if len(scores) > 1 else None
    return item
async def test_get_agent(test_client):
    agent = make_presence_connected_agent("jid@server", "password")
    future = agent.start(auto_register=False)
    future.result()
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    jid = "friend@server"
    item = Item(jid=JID.fromstr(jid))
    agent.presence.roster._update_entry(item)

    response = await client.get(f"/spade/agent/{jid}/")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("section.content-header > h1::text").get().strip() == jid

    agent.stop()
def search_command(value):
    url = "{domain}/search.php?".format(domain=domain)
    quote = urllib.parse.quote(value)
    query = {
        'name': quote,
        'name_method': 'cw',
        'author': '',
        'author_method': 'cw',
        'artist': '',
        'artist_method': 'cw',
        'is_complete': '',
        'type': '',
        'advopts': '1',
        'rating': '',
        'rating_method': 'eq',
        'released': '',
        'released_method': 'eq',
        'genres[Sci-fi]': '0',
        'genres[Horror]': '0',
        'genres[Sports]': '0',
        'genres[Action]': '0',
        'genres[Shoujo Ai]#': '0',
        'genres[Drama]': '0',
        'genres[Fantasy]': '0',
        'genres[Mystery]': '0',
        'genres[Gender Bender]': '0',
        'genres[One Shot]': '0',
        'genres[Psychological]': '0',
        'genres[Tragedy]': '0',
        'genres[Historical]': '0',
        'genres[Mecha]': '0',
        'genres[Yuri]': '0',
        'genres[Seinen]': '0',
        'genres[Adult]': '0',
        'genres[Slice of Life]': '0',
        'genres[Doujinshi]': '0',
        'genres[Romance]': '0',
        'genres[School Life]': '0',
        'genres[Comedy]': '0',
        'genres[Shoujo]': '0',
        'genres[Ecchi]': '0',
        '#genres[Harem]': '0',
        'genres[Smut]': '0',
        'genres[Yaoi]': '0',
        'genres[Shounen Ai]': '0',
        'genres[Martial Arts]': '0',
        'genres[Josei]': '0',
        'genres[Shounen]': '0',
        'genres[Mature]': '0',
        'genres[Webtoons]': '0',
        'genres[Supernatural]': '0',
        'genres[Adventure]': '0',
    }
    url += urllib.parse.urlencode(query)
    try:
        data = requests.get(url).text
    except urllib.error.URLError:
        # mangafox requires a 5-second delay
        # between searches
        import time
        time.sleep(5)
        data = requests.get(url).text
    sel = Selector(data)
    results = list()
    for link in sel.css('td:first-child > a:first-child'):
        manga_url = link.css('::attr(href)').extract_first()
        name = manga_url[7:].split('/')[2]
        results.append(odict([
            ('title', link.css('::text').extract_first()),
            ('name', "%s (use for download)" % name),
            ('url', manga_url),
        ]))
    if len(results):
        print("")
        for manga in results:
            for key, value in manga.items():
                print("%s: %s" % (key, value))
            print("")
    else:
        print('No results found')
with open('lamberti.json') as jsonfile:
    jsondata = json.load(jsonfile)

with open('lamberti.geojson') as geojsonfile:
    geojsondata = json.load(geojsonfile)

for booth in jsondata:
    booth_no = int(booth['nr'][-2:])
    r = requests.get(booth['url'])
    if r.status_code == 200:
        text = r.text
        selector = Selector(text=text)
        booth_name = selector.css('.booth-title::text').get()
        booth_descr = selector.css('.booth-body > p::text').getall()
        if isinstance(booth_descr, list):
            booth_descr = " ".join(booth_descr)
        booth_owner_company = selector.css(
            '.contactParticle--company::text').get()
        booth_owner_name = selector.css(
            '.contactParticle--name\:firstname\,lastname::text').get()
        booth_owner_street = selector.css(
            '.contactParticle--street::text').get()
        booth_owner_city = selector.css(
            '.contactParticle--city\:postal_code\,locality::text').get()
        booth_owner_phone = selector.css('.contactParticle--phone::text').get()
        booth_owner_email = selector.css(