def parse_topic(self, response: scrapy.http.response.html.HtmlResponse):
    """Parse one forum-topic page.

    Collects metadata for every message row, schedules a raw-email download
    for each message not already on disk, and finally yields a summary dict
    for the whole topic.
    """
    topic_id = self.last_part(response.url)
    collected = []
    for index, row in enumerate(response.css("tr")):
        href = row.css("td[class=subject] > a::attr(href)").extract_first()
        if href is None:
            # Not a message row (header/spacer) - skip it.
            continue
        message_id = self.last_part(href)
        collected.append({
            "id": message_id,
            "author": row.css("td[class=author] ::text").extract_first(),
            "date": row.css("td[class=lastPostDate] ::text").extract_first(),
            "file": self.locate_email_file(topic_id, index, message_id, False),
        })
        target = self.locate_email_file(topic_id, index, message_id, True)
        if os.path.exists(target):
            self.log("Skipped %s/%s - already fetched" % (topic_id, message_id))
            continue
        raw_url = "%s/%s/message/raw?msg=%s/%s/%s" % (
            self.root, self.prefix, self.name, topic_id, message_id)
        yield response.follow(
            raw_url, functools.partial(self.save_email, file_name=target))
    yield {
        "topic": response.css("h2 ::text").extract_first(),
        "id": topic_id,
        "messages": collected
    }
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Walk a topic-listing page.

    Queues every topic link for :meth:`parse_topic` (via the escaped-fragment
    URL form) and follows pagination anchors back into :meth:`parse`.
    """
    for link in response.css("tr a::attr(href)"):
        escaped = "%s/forum/?_escaped_fragment_=topic/%s/%s" % (
            self.root, self.name, self.last_part(link.extract()))
        yield response.follow(escaped, self.parse_topic)
    for page_link in response.css("body > a"):
        current = self.last_part(response.url)
        target = self.last_part(page_link.css("::attr(href)").extract_first())
        self.log("Page: %s -> %s" % (current, target))
        yield response.follow(page_link, self.parse)
def parse_keyword(self, response: scrapy.http.response.html.HtmlResponse):
    """Yield an INCARKeywordItem built from the page's self-link text, if any."""
    self.logger.info('Found new keyword page: {}'.format(response.url))
    found = response.css('a.mw-selflink.selflink::text').get()
    if not found:
        return
    yield INCARKeywordItem(keyword=found)
def parse_keyword(self, response: scrapy.http.response.html.HtmlResponse):
    """Yield an INCARKeywordItem from the first bold term in the article body."""
    self.logger.info('Found new keyword page: {}'.format(response.url))
    found = response.css('div[id=mw-content-text] p strong::text').extract_first()
    if not found:
        return
    yield INCARKeywordItem(keyword=found)
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Scrape the popular-heroes overview table, yielding one dict per hero.

    Each yielded dict is keyed by the hero's row index and carries name,
    match count, winrate, KDA, role and line (role/line fall back to
    'Undefined' when the column is absent).
    """
    rows = response.css('div.heroes-overview').css('div.r-row')
    for index, row in enumerate(rows):
        hero_name = (row.css('div.r-icon-text').css('div.r-body')
                        .css('div.r-none-mobile').css('a::text').get())
        stats = row.css('div.r-10')
        match_count = stats[0].css('div.r-body::text').get()
        win_rate = stats[1].css('div.r-body::text').get()
        kda_ratio = stats[2].css('div.r-body::text').get()
        extras = row.css('div.r-175')
        try:
            hero_role = (extras[0].css('div.r-body').css('div.group')
                                  .css('span::text').get())
        except IndexError:
            hero_role = 'Undefined'
        try:
            hero_line = (extras[1].css('div.r-body').css('div.group')
                                  .css('span::text').get())
        except IndexError:
            hero_line = 'Undefined'
        yield {
            index: {
                'hero': hero_name,
                'matches': match_count,
                'winrate': win_rate,
                'KDA': kda_ratio,
                'role': hero_role,
                'line': hero_line
            }
        }
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Scrape the film listing into per-entry dicts keyed by row index."""
    for index, entry in enumerate(response.css('div.element')):
        title_block = entry.css('p.name')
        gray_spans = entry.css('div.info').css('span.gray')
        yield {
            index: {
                'name': title_block.css('a::text').get(),
                'year': title_block.css('span.year::text').get(),
                'duration': gray_spans[0].css('::text').get(),
                # split('<') trims any markup remnant left in the text node
                'country': gray_spans[1].css('::text').get().split('<')[0],
                'author': gray_spans[1].css('a::text').get()
            }
        }
def parse_form(self, response: scrapy.http.response.html.HtmlResponse):
    """Read the form's <input> values and submit a generated form.

    Yields a FormRequest to the first start URL whose formdata is built by
    ``self._create_formdata`` from the page's input values; the response is
    handled by ``parse_results``.
    """
    # NOTE: a previous version wrapped this in ``try/except KeyError``, but
    # ``response.css(...).extract()`` never raises KeyError, so the handler
    # (and its ``return None``) was dead code and has been removed.
    input_element_list = response.css('form input::attr(value)').extract()
    # Set up form with generative keys
    formdata = self._create_formdata(input_element_list)
    yield scrapy.FormRequest(url=self.start_urls[0],
                             formdata=formdata,
                             callback=self.parse_results)
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Scrape shelf items into lists of title, description, channel, meta, URL."""
    items = response.css("li.expanded-shelf-content-item-wrapper").css(
        "div.yt-lockup-content")
    for index, item in enumerate(items):
        title = item.css('a.yt-uix-tile-link::text').get()
        # Drop the fixed 3-character prefix of the accessible description.
        description = item.css('span.accessible-description::text').get()[3:]
        channel = item.css('div.yt-lockup-byline').css('a.spf-link::text').get()
        meta = item.css('ul.yt-lockup-meta-info').css('li::text').getall()
        video_url = ('https://youtube.com' +
                     item.css('a.yt-uix-tile-link::attr(href)').get())
        yield {index: [title, description, channel, *meta, video_url]}
def parse(self, response: scrapy.http.response.html.HtmlResponse):
    """Scrape freelance listing posts out of their inline <script> payloads.

    Price, body text and post type are carved out of three script blocks at
    fixed character offsets, so this parser is tightly coupled to the exact
    page markup.
    """
    for post in response.css("div.b-post_padbot_15"):
        scripts = post.css('script::text').getall()
        is_secure = 'Безопасная сделка' in scripts[0]
        # Secure-deal posts carry the price at a different offset.
        raw_price = scripts[0][497:-10] if is_secure else scripts[0][183:-10]
        price_value = raw_price.replace(' ', '').replace(';', '')
        body_text = scripts[1][142:-78]
        markup = scripts[2][17:-3]
        if 'Проект' in markup:
            kind = 'Проект'
        elif 'Вакансия' in markup:
            kind = 'Вакансия'
        else:
            kind = 'Конкурс'
        yield {
            bool(post.css('img.b-pic_margtop_1')): {
                'title': post.css('a.b-post__link::text').get(),
                'secure': is_secure,
                'price': price_value,
                'text': body_text,
                'type': kind
            }
        }
def get_crawl_list(
        self, response: scrapy.http.response.html.HtmlResponse) -> List:
    """Collect the company-detail URLs linked from the listing page.

    Args:
        response (scrapy.http.response.html.HtmlResponse): listing-page
            response object.

    Returns:
        List: one ``{"url": <absolute company URL>}`` dict per listed company.
    """
    anchors = response.css(".entryList01").css("li")
    return [
        {"url": response.urljoin(a.css("a::attr(href)").extract_first())}
        for a in anchors
    ]
def get_name(self, response: scrapy.http.response.html.HtmlResponse) -> str:
    """Return the page title taken from the top-level <h1> element."""
    title = response.css('h1.top::text')
    return title.get()
def get_titles(
        self, response: scrapy.http.response.html.HtmlResponse) -> List[str]:
    """Return the text of every accordion-trigger button on the page."""
    triggers = response.css('button.accordion-trigger::text')
    return triggers.getall()