def vacancy_parse(self, response: HtmlResponse):
    # host without subdomains
    parsed_uri = tldextract.extract(response.url)
    host = "{domain}.{suffix}".format(domain=parsed_uri.domain,
                                      suffix=parsed_uri.suffix).lower()
    # url
    url = urljoin(response.url, urlparse(response.url).path)
    # name
    name = response.css('h1.header *::text').extract_first()
    # company
    company = response.css(
        '*.vacancy-company-name-wrapper > span[itemprop="identifier"] > meta[itemprop="name"]::attr(content)'
    ).extract_first().strip()
    # salary
    salary_data = response.css('div.vacancy-title > span[itemprop="baseSalary"]')
    salary_min = salary_data.css('meta[itemprop="minValue"]::attr(content)').extract_first()
    salary_max = salary_data.css('meta[itemprop="maxValue"]::attr(content)').extract_first()
    salary_currency = salary_data.css('meta[itemprop="currency"]::attr(content)').extract_first()
    # return result
    yield JobparserItem(
        host=host,
        url=url,
        name=name,
        company=company,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_currency=salary_currency,
    )
def parse(self, response: HtmlResponse):
    # Called by the engine once the page is downloaded; extracts the data
    # and yields one dict per entry.
    print(type(response))
    print('--->', response.url)
    # response.css() returns List[<Selector>, ...]
    articles = response.css('.article')
    for article in articles:
        # Build a fresh dict per article so previously yielded items are not mutated.
        item = {}
        item['info_url'] = article.css('.author').xpath('./a/@href').get()
        item['name'] = article.css('.author').xpath('./a/img/@alt').get()
        item['img'] = article.css('.author').xpath('./a/img/@src').get()
        content = ''.join(article.css('.content').xpath('./span/text()').extract())
        item['content'] = base64.b16encode(content.encode(encoding='utf-8')).decode()
        yield item
    # Find the url of the next page
    next_url = response.css('.pagination').xpath('.//li[last()]/a/@href').get()
    # Request the next page (guard against None on the last page)
    if next_url:
        yield Request('http://www.qiushibaike.com' + next_url,
                      callback=self.parse,
                      priority=10,
                      dont_filter=True)
def book_parse(self, response: HtmlResponse):
    # Grab the url and parse the book's details
    book_url = response.url
    book_title = response.xpath('//div[@id="product-title"]/h1/text()').extract_first()
    book_authors = response.xpath(
        '//div[@class="authors"]/a[@data-event-label="author"]/text()').extract()
    book_rating = response.xpath('//div[@id="rate"]/text()').extract_first()
    # Price parsing: three different tags.
    price_normal = response.css('span.buying-price-val-number::text').extract_first()
    price_new = response.css('span.buying-pricenew-val-number::text').extract_first()
    price_old = response.css('span.buying-priceold-val-number::text').extract_first()
    # Yield a single item
    yield BookparserItem(url=book_url,
                         title=book_title,
                         authors=book_authors,
                         price_normal=price_normal,
                         price_new=price_new,
                         price_old=price_old,
                         rating=book_rating)
def test_working_selector_after_reload(self):
    # A str body needs an explicit encoding on Python 3 Scrapy.
    response = HtmlResponse(url='http://127.0.0.1/aaa',
                            body='<html><body><h1>test</h1></body></html>',
                            encoding='utf-8')
    response.css('h1')
    filename = self.storage.save(response)
    response2 = self.storage.load(filename)
    response2.css('h1')
def parse(self, response):
    marker_txt = re.findall(re.compile(r"markerData.*\}", re.MULTILINE),
                            response.body_as_unicode())
    if not len(marker_txt):
        return
    markers_json = "{\"" + marker_txt[0]
    markers = list(json.loads(markers_json).values())[0]
    if not len(markers):
        return
    for marker in markers:
        marker_response = HtmlResponse(url="", body=marker["info"].encode("utf-8"))
        hours = re.findall(r"\{\"label.*\}", marker["info"])
        hours = hours[0]
        parsed_hours = json.loads(hours)
        addr_parts = marker_response.css(".address span:not(.phone)::text").extract()
        url = marker_response.css("header a").xpath("@href").extract_first()
        city, state = addr_parts[-1].split(",")
        yield GeojsonPointItem(lat=marker.get("lat"),
                               lon=marker.get("lng"),
                               name=marker_response.css("header a::text").extract_first(default=None),
                               addr_full=", ".join(addr_parts),
                               city=city.strip(),
                               state=state.strip(),
                               country="United States",
                               phone=marker_response.css(".phone::text").extract_first(),
                               website=url,
                               opening_hours=get_hours(parsed_hours["days"]),
                               ref=url.split("/")[-1].split(".")[0])
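# The parse() above relies on a get_hours() helper that is not defined in this
# snippet. A minimal sketch of what it might look like, assuming each entry in
# parsed_hours["days"] is a dict with "label" and "hours" keys (names guessed
# from the {"label...} regex; not confirmed by the source):
def get_hours(days):
    # Hypothetical helper: join per-day entries into one opening-hours string.
    parts = []
    for day in days:
        label = day.get("label", "")
        hours = day.get("hours", "")
        if label and hours:
            parts.append("{} {}".format(label, hours))
    return "; ".join(parts)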
def post_parse(self, response: HtmlResponse):
    # title = response.css('div.title-info-main h1.title-info-title').extract_first()
    title = response.css(
        'div.title-info-main span.title-info-title-text::text').extract_first()
    price = response.css(
        'div.item-price-value-wrapper span.js-item-price::attr(content)').extract_first()
    param_labels = response.css(
        'ul.item-params-list span.item-params-label::text').extract()
    param_values = response.css(
        'ul.item-params-list li.item-params-list-item::text').extract()
    # Drop the whitespace-only entries left over from the markup.
    param_values = [value for value in param_values if value != ' ']
    params = {}
    for i, label in enumerate(param_labels):
        params[label] = param_values[i]
    yield {
        'title': title,
        'price': price,
        'list': params,
    }
def parse_my_follow_real_link(self, response: HtmlResponse):
    self.logger.warn('Start resolving the real video url for {0}'.format(response.url))
    title = response.css('#viewvideo-title::text').extract_first().strip()
    author = response.css('a[href*="uprofile.php"]').css('span::text').extract_first().strip()
    # Some videos share the same title and author; only the viewkey in the url differs.
    view_key = response.url.split('viewkey=')[1]
    # A '/' in the title would create a directory when saving, so strip it out.
    if '/' in title:
        title = title.replace('/', '')
    encrypted_url = response.css('video').extract_first().split('strencode("')[1].split('"))')[0]
    first_encrypted = encrypted_url.split('"')[0]
    second_encrypted = encrypted_url.split('"')[2]
    video_link = ParseRealUrl.get_url(first_encrypted, second_encrypted)
    if video_link:
        # Fix urls like http://185.38.13.130//mp43/2998... that contain a double slash.
        video_link_list = video_link.split('//')
        real_video_link = video_link_list[0] + '//' + video_link_list[1] + '/' + video_link_list[2]
        self.logger.warn('Video {0} resolved; handing off to the download pipelines'.format(title))
        down_file_name = title + '-' + author + '-' + view_key
        yield DownloadVideoItem(file_urls=real_video_link, file_name=down_file_name)
    else:
        self.logger.warn('Failed to get the video download url, page: {0}'.format(response.url))
def parse(self, response: HtmlResponse):
    items = response.css('#main-ad-list').css('.item:not(.list_native):not(.yap-loaded)')
    new_item_count = 0
    for item in items:
        attrib = item.attrib
        olx_id = attrib['data-list_id']
        product = Product.get_by_olx_id(int(olx_id))
        if product is not None:
            continue
        new_item_count += 1
        url: str = item.css('.OLXad-list-link::attr(href)').get()
        if url is None:
            url = item.css('.OLXad-list-link-featured::attr(href)').get()
        yield response.follow(url, callback=self.parse_item, meta=response.meta.copy())
    if new_item_count > 0:
        for next_link in response.css('a.link[rel="next"]'):
            yield response.follow(next_link, callback=self.parse, meta=response.meta)
def start_requests(self):
    self.driver.get(self.start_urls[0])
    res = HtmlResponse(url='index html', body=self.driver.page_source, encoding="utf-8")
    title_text = res.css('#main section h1 a::text')[0].root.strip()
    self.driver.find_element_by_link_text(title_text).click()  # click through to the article
    time.sleep(2)
    if title_text in self.driver.page_source:
        self.detail_parse(self.driver.page_source, title_text)
    while True:
        try:
            key_word = self.driver.find_elements_by_class_name("next")[0].text  # move on to the next article
        except (TypeError, IndexError):
            self.driver.quit()
            key_word = None
        if not key_word:
            break
        self.driver.find_element_by_link_text(key_word).click()
        time.sleep(2)
        res = HtmlResponse(url='next html', body=self.driver.page_source, encoding="utf-8")
        title_text = res.css('.article-info h1 a::text').extract_first()
        self.detail_parse(self.driver.page_source, title_text)
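# detail_parse() is called above but not shown in this snippet. A hypothetical
# sketch, assuming it only wraps the Selenium page source in an HtmlResponse so
# Scrapy selectors can be reused (the '.article-content' selector is an
# assumption, not the author's actual code):
def detail_parse(self, page_source, title_text):
    res = HtmlResponse(url='detail html', body=page_source, encoding='utf-8')
    content = ''.join(res.css('.article-content ::text').extract())
    self.logger.info('parsed article %s (%d chars)', title_text, len(content))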
def getAudio(author, sing):
    reqStr = author + ' ' + sing
    reqStr = reqStr.replace('& ', '')
    # print(reqStr)
    reqStr = reqStr.replace(' ', '+')
    # print(reqStr)
    googurl = 'https://www.google.ru/search?newwindow=1&ei=O41KWrz4J-nX6QTFgbWYDQ&q=%s' % reqStr
    req = requests.get(googurl, headers=headers)
    response = HR(url=googurl, body=req.text, encoding='utf-8')
    try:
        links = response.css('.kv ._Rm::text').extract()
        mp3party = ''
        for link in links:
            if 'mp3party.net' in link:
                mp3party = 'http://' + link
                break
        print(mp3party)
        req = requests.get(mp3party, headers=headers)
        response = HR(url=mp3party, body=req.text, encoding='utf-8')
        audio = response.css('.jp-play::attr(href)').extract_first()
        print(audio)
    except:
        audio = ''
    return audio
def vacancy_parse(self, response: HtmlResponse):
    name = response.css('div.vacancy-title h1.header ::text').extract()
    salary = [
        response.css(
            'span[itemprop="baseSalary"] meta[itemprop="minValue"] ::attr(content)'
        ).extract_first(),
        response.css(
            'span[itemprop="baseSalary"] meta[itemprop="maxValue"] ::attr(content)'
        ).extract_first(),
        response.css(
            'span[itemprop="baseSalary"] meta[itemprop="currency"] ::attr(content)'
        ).extract_first(),
    ]
    vacancy_link = response.url
    site_scraping = self.allowed_domains[0]
    yield JobParserItem(
        name=name,
        salary=salary,
        vacancy_link=vacancy_link,
        site_scraping=site_scraping,
    )
def spider():
    driver = webdriver.Chrome()  # start the Chrome driver and open the browser
    url = 'https://www.shiyanlou.com/courses/427'
    driver.get(url)  # open the page to scrape
    page = 1
    result = []
    while True:
        WebDriverWait(driver, 10).until(  # explicit wait: 10 s timeout, 0.5 s default polling
            EC.text_to_be_present_in_element(  # the condition:
                (By.XPATH, '//ul[@class="pagination"]/li[@class="active"]'),
                str(page)  # the active pagination item shows the current page number
            ))
        html = driver.page_source
        response = HtmlResponse(url=url, body=html.encode('utf-8'))
        for i in response.css('div.comment-item-wrapper'):
            d = {
                'username': i.css('a.username::text').extract_first().strip(),
                'content': reduce(lambda a, b: a + b, (i.css('p::text').extract()))
            }
            result.append(d)
        if response.css('div.comment-box li.disabled.next-page'):
            break  # an <li> with class "disabled" means there is no next page
        page += 1
        # chromedriver cannot interact with elements outside the visible area;
        # the next two lines scroll the next-page button into view.
        ac = driver.find_element_by_css_selector('div.comment-box li.next-page a')
        ActionChains(driver).move_to_element(ac).perform()
        driver.find_element_by_css_selector('div.comment-box li.next-page').click()
    driver.quit()
    with open('comments.json', 'w') as f:
        json.dump(result, f)
def vacansy_parse(self, response: HtmlResponse):
    name_vac = ''
    # Vacancy name. HH added a span inside the title header, so new handlers
    # and the corresponding checks have to be added.
    name1 = response.css('div.vacancy-title h1.header span.highlighted::text').getall()
    name2 = response.css('div.vacancy-title h1.header::text').extract_first()
    if len(name1) == 0:
        if name2 is not None:
            name_vac = name2
    else:
        if name2 is None:
            name_vac = name1[0]
        else:
            name_vac = name1[0] + name2
    salary = response.css('div.vacancy-title p.vacancy-salary::text').extract_first()  # salary
    min_sal, max_sal = self.parse_salara(salary)
    url = response.url
    # A primitive debug print showing the result:
    # print('name = {}, salary = {}, min_sal = {}, max_sal = {}, url = {}'.format(name_vac, salary, min_sal, max_sal, url))
    yield JobparserItem(name=name_vac,
                        min_salary=min_sal,
                        max_salary=max_sal,
                        link=url,
                        source='HH')  # hand the assembled item to the pipeline
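# parse_salara() is referenced above but not shown. A rough sketch of a salary
# splitter for hh.ru-style strings such as 'от 100 000 до 150 000 руб.'
# (requires `import re`; the parsing rules here are assumptions, not the
# author's actual code):
def parse_salara(self, salary):
    if not salary:
        return None, None
    # Pull out runs of digits, tolerating regular and non-breaking spaces.
    numbers = [int(n.replace('\xa0', '').replace(' ', ''))
               for n in re.findall(r'\d[\d\s\xa0]*\d|\d', salary)]
    if not numbers:
        return None, None
    if 'от' in salary and 'до' not in salary:
        return numbers[0], None  # only a lower bound given
    if 'до' in salary and 'от' not in salary:
        return None, numbers[0]  # only an upper bound given
    if len(numbers) >= 2:
        return numbers[0], numbers[1]
    return numbers[0], numbers[0]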
def parse(self, response: HtmlResponse):
    next_page = response.css('a[rel=next]::attr(href)').extract_first()
    if next_page:
        yield response.follow(next_page, callback=self.parse)
    vacancy = response.css('a[target=_blank]::attr(href)').extract()
    print(vacancy)
    for link in vacancy[0:19]:
        yield response.follow(link, self.vacancy_parse)
def parse_info(self, response: HtmlResponse):
    # Callback that parses the detail page
    item = XHItem()
    # Read the uid passed through the request's meta
    item['uid'] = response.meta.get('uid')
    item['name'] = response.meta.get('name')
    item['image_urls'] = [
        'http://www.521609.com' + response.css('#bigimg').xpath('./@src').get()
    ]
    item['images'] = []
    yield item
    # URL of the next page
    next_url = response.css('.pagelist').xpath('./li[last()]/a/@href').get()
    if next_url != '#':
        next_url = 'http://www.521609.com/daxuexiaohua/' + next_url
        yield Request(next_url,
                      callback=self.parse_info,
                      meta={'uid': item['uid'], 'name': item['name']},
                      priority=10)
def find_features(self, response):
    room_features = response.css('div.room__features').getall()
    features_room = []
    features_apartment = []
    # The first room__features block describes the room itself; any later
    # blocks describe the shared apartment.
    isroom = 1
    for room_features_obj in room_features:
        # Re-wrap the HTML fragment in an HtmlResponse so CSS selectors can be reused.
        room_features_response = HtmlResponse(url=room_features_obj,
                                              body=room_features_obj,
                                              encoding='utf-8')
        room__feature = room_features_response.css('div.room__feature').getall()
        for room_feature_obj in room__feature:
            room_feature_response = HtmlResponse(url=room_feature_obj,
                                                 body=room_feature_obj,
                                                 encoding='utf-8')
            strong = room_feature_response.css('p strong::text').get()
            paragraph = room_feature_response.css('p::text').get()
            line = ""
            if strong:
                line = strong
            if paragraph:
                line = line + paragraph
            if isroom and line:
                features_room.append(line)
            elif line:
                features_apartment.append(line)
        isroom = 0
    features = {
        'features_room': features_room,
        'features_shared_apartment': features_apartment
    }
    return features
def book_parse(self, response: HtmlResponse):
    # Grab the url and parse the book's details
    book_url = response.url
    book_title = response.css('h1.item-detail__title::text').extract_first()
    book_authors = response.xpath(
        '//div[@class="item-tab__chars-list"]/div[1]/span[2]/a/text()').extract()
    book_rating = response.css('span.rating__rate-value::text').extract_first()
    # Price parsing: two different tags
    price_normal = response.css('div.item-actions__price b::text').extract_first()
    price_old = response.css('div.item-actions__price-old::text').extract_first()
    # Yield a single item
    yield BookparserItem(url=book_url,
                         title=book_title,
                         authors=book_authors,
                         price_normal=price_normal,
                         price_new=None,
                         price_old=price_old,
                         rating=book_rating)
def vacancy_parse(self, response: HtmlResponse):
    vaca = response.xpath(
        '//div[contains(@class,"vacancy-title")]//h1[@class="header"]//text()'
    ).extract()
    salary = response.css('div.vacancy-title p.vacancy-salary::text').extract()
    try:
        sal_min = response.css(
            'div.vacancy-title meta[itemprop="minValue"]::attr(content)').extract()
    except:
        sal_min = 'NaN'
    try:
        sal_max = response.css(
            'div.vacancy-title meta[itemprop="maxValue"]::attr(content)').extract()
    except:
        sal_max = 'NaN'
    link = response.css(
        'div.bloko-column_xs-4 div[itemscope="itemscope"] meta[itemprop="url"]::attr(content)'
    ).extract()
    yield JobparserItem(name=vaca[0],
                        link=link[0],
                        salary=salary,
                        min_salary=sal_min,
                        max_salary=sal_max,
                        source='hh.ru')
def vacansy_parse(self, response: HtmlResponse):
    name = response.css('h1.rFbjy::text').extract_first()  # vacancy name
    salary = response.css('span._2Wp8I').extract_first()  # salary
    url = response.request.url
    source = 'superjob.ru'
    yield JobparserItem(name=name,
                        min_salary=salary,
                        max_salary=salary,
                        url=url,
                        source=source)  # hand the assembled item to the pipeline
def parse_ads(self, response: HtmlResponse):
    loader = ItemLoader(item=AvitoRealEstate(), response=response)
    loader.add_xpath('photos', '//div[contains(@class, "gallery-img-wrapper")]//div[contains(@class, "gallery-img-frame")]/@data-url')
    loader.add_css('title', 'h1.title-info-title span.title-info-title-text::text')
    par_names = response.css('li.item-params-list-item span.item-params-label::text').extract()
    for i in range(len(par_names)):
        par_names[i] = par_names[i].replace(' ', '')
    par_data = response.css('li.item-params-list-item::text').extract()
    # Map the (space-stripped) Russian parameter labels to item field names.
    my_dict = {'Этаж:': 'floor',
               'Этажейвдоме:': 'house_floors',
               'Типдома:': 'house_type',
               'Количествокомнат:': 'rooms',
               'Общаяплощадь:': 'total_s',
               'Жилаяплощадь:': 'living_s',
               'Площадькухни:': 'kitchen_s',
               'Отделка:': 'otdelka'}
    result_dict = {}
    for i in range(len(par_names)):
        result_dict[my_dict[par_names[i]]] = par_data[i * 2 + 1]
    for key in result_dict:
        loader.add_value(key, result_dict[key])
    yield loader.load_item()
def parse_detail(self, response):
    json_response = json.loads(response.body_as_unicode())
    # print(json_response)
    if json_response['data'] is None or json_response['data']['navigationBar'] is None:
        print("API call failed, url: {}, response: {}".format(response.url, json_response))
        return
    html_response = HtmlResponse(url="detail HTML string",
                                 body=json_response['data']['navigationBar'],
                                 encoding='utf-8')
    categoryfacetsecond = html_response.css(
        'div.sub-title-nav ul li:last-child a::attr(categoryfacetsecond)').get()
    catalogids = html_response.css(
        'div.sub-title-nav ul li:last-child a::attr(catalogids)').get().split(',')
    classificationid = html_response.css(
        'div.sub-title-nav ul li:last-child a::attr(classificationid)').get()
    detail_payload = {
        'categoryFacetSecond': categoryfacetsecond,
        'catalogIds': catalogids,
        'classificationId': classificationid,
        'zzbh': self.zzbh
    }
    print(detail_payload)
    yield response.follow(self.nav_detail_url,
                          callback=self.parse_nav_detail,
                          method='POST',
                          headers={'Content-Type': 'application/json'},
                          body=json.dumps(detail_payload))
def vacansy_parse(self, response: HtmlResponse):
    link_vac = response.url
    name = response.css('div.vacancy-title h1.header::text').extract_first()
    salary = response.css('div.vacancy-title p.vacancy-salary::text').extract()
    link = 'hh.ru'
    # print(name, salary)
    yield JobparserItem(name=name,
                        salary_from=''.join(salary),
                        salary_to='',
                        link_vac=link_vac,
                        link_site=link)
def vacancy_parse(self, response: HtmlResponse):
    vac = response.css('div._3MVeX h1._3mfro::text').extract_first()
    salary = response.css('div._3MVeX span._2Wp8I span::text').extract()
    yield JobparserItem(name=vac,
                        salary=salary,
                        link=response.url,
                        source='superjob.ru')
def post_parse(self, response: HtmlResponse):
    title = response.css('article h1::text').extract_first()
    date = response.css('article time::attr(datetime)').extract_first()
    yield {
        'title': title,
        'date': date,
    }
def parse(self, response: HtmlResponse):
    vacancies = response.css("a.icMQ_._6AfZ9::attr(href)").extract()
    for vacancy in vacancies:
        yield response.follow(vacancy, callback=self.vacancy_parse)
    next_page = response.css(
        "a.icMQ_._1_Cht._3ze9n.f-test-button-dalshe.f-test-link-Dalshe::attr(href)"
    ).extract_first()
    if next_page:
        yield response.follow(next_page, callback=self.parse)
def parse(self, response: HtmlResponse):
    detail_urls = response.css('.excerpt h2 a::attr(href)').extract()
    for detail_url in detail_urls:
        yield response.follow(detail_url, callback=self.parse_detail)
    next_page = response.css('li.next-page a::attr(href)').extract_first(None)
    if next_page:
        yield response.follow(next_page, callback=self.parse)
def vacancy_parse(self, response: HtmlResponse):
    name = response.css("div.vacancy-title h1.header::text").extract_first()
    salary = response.css("div.vacancy-title p.vacancy-salary::text").extract_first()
    company = ''.join(response.css('a.vacancy-company-name span::text').extract())
    yield JobparserItem(name=name, salary=salary, company=company)
def parse(self, response: HtmlResponse):
    next_page = response.css('a.HH-Pager-Controls-Next::attr(href)').extract_first()
    if next_page:
        yield response.follow(next_page, callback=self.parse)
    vacancies = response.css(
        'div.vacancy-serp-item__info a.bloko-link::attr(href)').extract()
    for link in vacancies:
        yield response.follow(link, callback=self.vacancy_parse)
def parse(self, response: HtmlResponse):
    next_page = response.css('a.f-test-button-dalshe::attr(href)').extract_first()
    if next_page:
        yield response.follow(next_page, callback=self.parse)
    vacancy = response.css('div.f-test-vacancy-item a::attr(href)').extract()
    for link in vacancy:
        yield response.follow(link, self.vacancy_parse)
def parse(self, response: HtmlResponse):
    next_page = response.css('a.pagination-next__text::attr(href)').extract_first()
    books_links = response.css('a.product-title-link::attr(href)').extract()
    for link in books_links:
        yield response.follow(link, callback=self.book_parse)
    if next_page:
        yield response.follow(next_page, callback=self.parse)
def parse_field(self, html, fn):
    response = HtmlResponse('http://localhost/test.html',
                            body='<table><tr>%s</tr></table>' % html,
                            encoding='utf-8')
    row = response.css('tr')[0]
    node = response.css('td')[0]
    lobbyist = Loader(self.spider, response, Lobbyist(), row)
    lobbyist.add_value(None, fn(node))
    item = lobbyist.load_item()
    actual = dict(item)
    return actual
def parse_json(self, response):
    data = response.body[1:-1]
    js = json.loads(data)
    response = HtmlResponse(url=response.url, body=js['data'].encode('utf8'))
    for href in response.css(settings["el_nacional"]['links']):
        full_url = response.urljoin(href.extract())
        yield scrapy.Request(full_url, callback=self.parse_links)
def parse_row(self, html, fn):
    response = HtmlResponse('http://localhost/test.html',
                            body='<table>%s</table>' % html,
                            encoding='utf-8')
    row = response.css('tr')[0]
    item = fn(response, row)
    actual = dict(item)
    return actual
def parse_lobbyist(self, html, fn):
    response = HtmlResponse('http://localhost/test.html',
                            body='<book>%s</book>' % html,
                            encoding='utf-8')
    rows = response.css('row')
    item = fn(response, rows)
    actual = dict(item)
    return actual
def parse_links(self, response):
    # Rebuild the response as an HtmlResponse if .css() is not available.
    try:
        response.css
    except:
        response = HtmlResponse(url=response.url, body=response.body)
    fecha = limpiar_autor_tc(response.css(settings[self.name]['fecha']).extract()[0].split('|')[1])
    current_date = True
    if len(fecha) > 10:
        current_date = obtener_fecha_tipo6(fecha.split(" ")[0])
    if current_date:
        titulo = limpiar_autor_tc(response.css(settings[self.name]['titulo']).extract()[0])
        body = limpiar_ult_n(response.css(settings[self.name]['body']).extract())
        yield {
            'titulo': titulo,
            'autor': response.css(settings[self.name]['autor']).extract()[0],
            'fecha': fecha,
            'body': [body],
            'link': response.url,
        }
def parse(self, response):
    # Wiener Linien returns HTML with an XML content type which creates an
    # XmlResponse.
    response = HtmlResponse(url=response.url, body=response.body)
    for item in response.css(".block-news-item"):
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            ignoretz=True,
            base_url="https://www.{}".format(self.name),
        )
        link = response.urljoin(item.css("a::attr(href)").extract_first())
        il.add_value("link", link)
        il.add_value("title", item.css("h3::text").extract_first())
        il.add_value("updated", item.css(".date::text").extract_first())
        yield scrapy.Request(link, self.parse_item, meta={"il": il})