Example #1
    def parse_topic(self, response: scrapy.http.response.html.HtmlResponse):
        messages = []
        topic_id = self.last_part(response.url)
        for i, message in enumerate(response.css("tr")):
            topic_url = message.css(
                "td[class=subject] > a::attr(href)").extract_first()
            if topic_url is None:
                continue
            message_id = self.last_part(topic_url)
            messages.append({
                "id": message_id,
                "author": message.css("td[class=author] ::text").extract_first(),
                "date": message.css("td[class=lastPostDate] ::text").extract_first(),
                "file": self.locate_email_file(topic_id, i, message_id, False),
            })
            file_name = self.locate_email_file(topic_id, i, message_id, True)
            if os.path.exists(file_name):
                self.log("Skipped %s/%s - already fetched" %
                         (topic_id, message_id))
                continue
            yield response.follow(
                "%s/%s/message/raw?msg=%s/%s/%s" %
                (self.root, self.prefix, self.name, topic_id, message_id),
                functools.partial(self.save_email, file_name=file_name))
        yield {
            "topic": response.css("h2 ::text").extract_first(),
            "id": topic_id,
            "messages": messages,
        }
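
This snippet leans on spider attributes and helpers the excerpt does not show (`last_part`, `locate_email_file`, `save_email`, `root`, `prefix`). A minimal sketch of what they might look like, assuming a Google-Groups-style layout; only the names come from the example, the bodies and attribute values are guesses:

import os

import scrapy


class ForumSpider(scrapy.Spider):
    name = "mygroup"  # assumed: doubles as the group name in message URLs
    root = "https://groups.google.com"  # assumed from the URL shapes
    prefix = "forum"  # assumed path segment before /message/raw

    @staticmethod
    def last_part(url: str) -> str:
        # Final path segment: ".../topic/mygroup/XyZ" -> "XyZ".
        return url.rstrip("/").rsplit("/", 1)[-1]

    def locate_email_file(self, topic_id: str, index: int,
                          message_id: str, absolute: bool) -> str:
        # One file per message; `absolute` switches to a filesystem path.
        rel = os.path.join(topic_id, "%03d_%s.eml" % (index, message_id))
        return os.path.abspath(rel) if absolute else rel

    def save_email(self, response, file_name: str):
        # Callback for the raw-message request: write the body to disk.
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, "wb") as fout:
            fout.write(response.body)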
Example #2
    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        for topic in response.css("tr a::attr(href)"):
            topic_url = "%s/forum/?_escaped_fragment_=topic/%s/%s" % (
                self.root, self.name, self.last_part(topic.extract()))
            yield response.follow(topic_url, self.parse_topic)

        for next_page in response.css("body > a"):
            self.log("Page: %s -> %s" %
                     (self.last_part(response.url),
                      self.last_part(
                          next_page.css("::attr(href)").extract_first())))
            yield response.follow(next_page, self.parse)
Example #3
    def parse_keyword(self, response: scrapy.http.response.html.HtmlResponse):
        self.logger.info('Found new keyword page: {}'.format(response.url))

        keyword = response.css('a.mw-selflink.selflink::text').get()

        if keyword:
            yield INCARKeywordItem(keyword=keyword)
Example #4
    def parse_keyword(self, response: scrapy.http.response.html.HtmlResponse):
        self.logger.info('Found new keyword page: {}'.format(response.url))

        keyword = response.css('div[id=mw-content-text] p strong::text').extract_first()

        if keyword:
            yield INCARKeywordItem(keyword=keyword)
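
Examples #3 and #4 both yield an `INCARKeywordItem` that neither excerpt defines. A minimal sketch, assuming it is an ordinary Scrapy item with a single field:

import scrapy


class INCARKeywordItem(scrapy.Item):
    # Assumed shape: one field for the keyword text scraped from the wiki page.
    keyword = scrapy.Field()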
Example #5
    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        # Popular heroes parser:
        results = response.css('div.heroes-overview').css('div.r-row')
        for i, result in enumerate(results):
            hero = result.css('div.r-icon-text').css('div.r-body').css(
                'div.r-none-mobile').css('a::text').get()
            temp = result.css('div.r-10')
            matches = temp[0].css('div.r-body::text').get()
            winrate = temp[1].css('div.r-body::text').get()
            kda = temp[2].css('div.r-body::text').get()
            temp = result.css('div.r-175')
            try:
                role = temp[0].css('div.r-body').css('div.group').css(
                    'span::text').get()
            except IndexError:
                role = 'Undefined'

            try:
                line = temp[1].css('div.r-body').css('div.group').css(
                    'span::text').get()
            except IndexError:
                line = 'Undefined'

            yield {
                i: {
                    'hero': hero,
                    'matches': matches,
                    'winrate': winrate,
                    'KDA': kda,
                    'role': role,
                    'line': line
                }
            }
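
Keying each yielded dict by the loop index gives every item a different top-level key, which exporters and pipelines handle poorly. A flatter shape with stable field names (a suggested alternative, not the author's code) would replace the yield above:

            yield {
                'rank': i,  # position in the overview table
                'hero': hero,
                'matches': matches,
                'winrate': winrate,
                'kda': kda,
                'role': role,
                'line': line,
            }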
Example #6
    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        results = response.css('div.element')
        for i, result in enumerate(results):
            name = result.css('p.name').css('a::text').get()
            year = result.css('p.name').css('span.year::text').get()
            duration = result.css('div.info').css('span.gray')[0].css('::text').get()
            # ::text yields no markup, so the original .split('<')[0] was a
            # no-op; guard against a missing node instead, since .get() may
            # return None.
            country = result.css('div.info').css('span.gray')[1].css('::text').get(default='')
            author = result.css('div.info').css('span.gray')[1].css('a::text').get()
            yield {i: {'name': name, 'year': year, 'duration': duration,
                       'country': country, 'author': author}}
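
Indexing the `span.gray` SelectorList (`[0]`, `[1]`) still raises IndexError on rows missing those nodes. A guarded variant of the info extraction (a sketch, not the author's code):

            grays = result.css('div.info span.gray')
            duration = grays[0].css('::text').get() if len(grays) > 0 else None
            country = grays[1].css('::text').get(default='') if len(grays) > 1 else ''
            author = grays[1].css('a::text').get() if len(grays) > 1 else None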
Example #7
    def parse_form(self, response: scrapy.http.response.html.HtmlResponse):
        try:
            input_element_list = response.css(
                'form input::attr(value)').extract()
        except KeyError as e:
            return None

        # Set up form with generative keys
        formdata = self._create_formdata(input_element_list)

        yield scrapy.FormRequest(url=self.start_urls[0],
                                 formdata=formdata,
                                 callback=self.parse_results)
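
`_create_formdata` is not shown. A plausible reconstruction, assuming it pairs each harvested `value` attribute with a generated key (the real key scheme is unknown):

    def _create_formdata(self, input_element_list):
        # Hypothetical: FormRequest wants str -> str formdata, so map each
        # extracted <input> value to a generated field name.
        return {'field_%d' % i: value
                for i, value in enumerate(input_element_list)}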
Example #8
    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        results = response.css("li.expanded-shelf-content-item-wrapper").css(
            "div.yt-lockup-content")
        for i, result in enumerate(results):
            yield {
                i: [
                    result.css('a.yt-uix-tile-link::text').get(),
                    # Default to '' so the [3:] slice cannot fail on a
                    # missing description node.
                    result.css('span.accessible-description::text').get(default='')[3:],
                    result.css('div.yt-lockup-byline').css('a.spf-link::text').get(),
                    *result.css('ul.yt-lockup-meta-info').css('li::text').getall(),
                    'https://youtube.com' +
                    result.css('a.yt-uix-tile-link::attr(href)').get(default=''),
                ]
            }
Example #9
    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        results = response.css("div.b-post_padbot_15")
        for result in results:
            in_script = result.css('script::text').getall()
            # 'Безопасная сделка' ("secure deal") changes the layout of the
            # inline script, which shifts the price's character offsets.
            secure = 'Безопасная сделка' in in_script[0]
            price = (in_script[0][497:-10] if secure else
                     in_script[0][183:-10]).replace('&nbsp', '').replace(';', '')
            text = in_script[1][142:-78]
            html_string = in_script[2][17:-3]
            # Post type: 'Проект' (project), 'Вакансия' (vacancy),
            # else 'Конкурс' (contest).
            type_ = ('Проект' if 'Проект' in html_string else
                     'Вакансия' if 'Вакансия' in html_string else 'Конкурс')

            yield {
                # Key is True when the row carries the b-pic_margtop_1 image.
                bool(result.css('img.b-pic_margtop_1')): {
                    'title': result.css('a.b-post__link::text').get(),
                    'secure': secure,
                    'price': price,
                    'text': text,
                    'type': type_
                }
            }
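
The hard-coded slice offsets ([497:-10], [183:-10], [142:-78]) break as soon as the markup shifts by a character. A regex search over the script text is sturdier; this fragment assumes `import re` at module level, and the pattern is only a guess at how the price appears in the embedded script:

            # Sketch: find a price-like figure in the script text instead of
            # fixed offsets; the pattern is a guess at the embedded format.
            match = re.search(r'(\d[\d\s]*\d|\d)\s*(?:руб|₽)', in_script[0])
            price = match.group(1).replace(' ', '') if match else None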
Example #10
    def get_crawl_list(
            self, response: scrapy.http.response.html.HtmlResponse) -> List:
        """
        Collect the URLs of the pages carrying company information from the DOM.

        Args:
            response (scrapy.http.response.html.HtmlResponse): the response object

        Returns:
            List: a list of dicts, one per company URL.
        """
        company_list = []

        company_list_box = response.css(".entryList01")
        company_list_tag = company_list_box.css("li")

        for company in company_list_tag:
            company_path = company.css("a::attr(href)").extract_first()
            company_url = response.urljoin(company_path)

            company_list.append({"url": company_url})

        return company_list
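
Unlike the generator-style examples above, this helper returns a plain list, so a caller has to turn the entries into requests. A minimal consuming callback (the `parse_company` name is an assumption):

    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        # Hypothetical caller: schedule one request per collected URL.
        for entry in self.get_crawl_list(response):
            yield response.follow(entry["url"], callback=self.parse_company)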
Example #11
    def get_name(self,
                 response: scrapy.http.response.html.HtmlResponse) -> str:
        return response.css('h1.top::text').get()
Example #12
    def get_titles(
            self,
            response: scrapy.http.response.html.HtmlResponse) -> List[str]:
        return response.css('button.accordion-trigger::text').getall()
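
The two accessors compose naturally in a parse callback that assembles an item from both (a sketch; the item shape is an assumption):

    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        # Hypothetical composition of get_name and get_titles.
        yield {
            'name': self.get_name(response),
            'titles': self.get_titles(response),
        }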