Example #1
    def parse(self, response: Response):

        pictures = response.xpath("//img/@src[starts-with(., 'http')]")
        strings = response.xpath(
            "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
        )

        yield {
            'url': response.url,
            'payload':
                [{'type': 'text', 'data': text.get().strip()} for text in strings] +
                [{'type': 'image', 'data': image.get()} for image in pictures]
        }

        if response.url == self.start_urls[0]:

            refs = response.xpath("//a/@href")
            # Collect the first fifteen hrefs from the page.
            hrefs = [r.get() for r in refs][:15]
            for href in hrefs:
                yield scrapy.Request('http://www.posolstva.org.ua' + href,
                                     self.parse)
Example #2
 def parse_movie(self, response: Response):
     item = {}
     item['entity'] = 'movie'
     item['movie'] = response.xpath(
         '//h1/span[@property="v:itemreviewed"]/text()').get().split()[0]
     item['year'] = response.xpath(
         '//h1/span[@class="year"]/text()').get()[1:-1]
     item['score'] = response.xpath('//strong/text()').get()
     item['director'] = response.xpath(
         '//a[@rel="v:directedBy"]/text()').getall()
     item['actor'] = response.xpath(
         '//a[@rel="v:starring"]/text()').getall()
     item['genre'] = response.xpath(
         '//span[@property="v:genre"]/text()').getall()
     info = ''.join(response.xpath('//div[@id="info"]/text()').getall())
     item['country'] = info.replace('/', '').split()[0]
     item['length'] = re.search(
         r'\d+',
         response.xpath(
             '//span[@property="v:runtime"]/text()').get()).group()
     item['rank'] = re.search(
         r'\d+',
         response.xpath('//span[@class="top250-no"]/text()').get()).group()
     item['img_url'] = response.xpath(
         '//div[@id="mainpic"]//img/@src').get()
     # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
     # item['name'] = response.xpath('//div[@id="name"]').get()
     # item['description'] = response.xpath('//div[@id="description"]').get()
     return item
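
Several lookups above chain .get() into .split(), slicing, or re.search(...).group(), each of which raises when the node is missing or the pattern does not match. A minimal guard sketch; the helper name first_int is hypothetical, not part of the example:

 import re
 from typing import Optional

 def first_int(text: Optional[str]) -> Optional[str]:
     # Hypothetical helper: return the first run of digits in text,
     # or None when text is missing or contains no digits.
     if not text:
         return None
     match = re.search(r'\d+', text)
     return match.group() if match else None

With it, item['length'] becomes first_int(response.xpath('//span[@property="v:runtime"]/text()').get()), and a missing runtime yields None instead of an AttributeError.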
Example #3
 def parse(self, response: Response):
     img_elems = response.xpath("//img/@data-src[starts-with(., 'http')]")
     text_elems = response.xpath(
         "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 20]/text()"
     )
     yield {
         'url': response.url,
         'payload':
             [
                 {
                     'type': 'text',
                     'data': text.get().strip()
                 } for text in text_elems
             ] +
             [
                 {
                     'type': 'image',
                     'data': image.get()
                 } for image in img_elems
             ]
     }
     if response.url == self.start_urls[0]:
         link_elems = response.xpath(
             "//a/@href[starts-with(., 'https://isport.ua/') or starts-with(., '/')]"
         )
         links = [
             link.get() for link in link_elems if link.get() != "/"
         ]
         for link in links[:19]:
             if link.startswith("/"):
                 link = "https://isport.ua" + link
             yield scrapy.Request(link, self.parse)
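
The response.url == self.start_urls[0] guard that limits crawling to links found on the start page recurs throughout these examples. Scrapy's built-in DEPTH_LIMIT setting is one alternative; a minimal sketch, with a hypothetical spider name and the isport.ua start URL from the example above:

 import scrapy

 class OneHopSpider(scrapy.Spider):
     # Sketch only: DEPTH_LIMIT = 1 tells Scrapy to drop requests more
     # than one hop from the start page, replacing the manual URL check.
     name = 'one_hop'  # hypothetical name
     start_urls = ['https://isport.ua/']
     custom_settings = {'DEPTH_LIMIT': 1}

     def parse(self, response):
         yield {'url': response.url}
         for href in response.xpath('//a/@href').getall()[:19]:
             # response.follow resolves relative hrefs against the page URL.
             yield response.follow(href, self.parse)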
Example #4
    def parse(self, response: Response):
        '''
        Transform the fetched result.
        :param response: the fetched response
        :return: yields the items directly
        '''
        # First, extract all the images.
        image_lists = response.xpath('.//div[@id = "list_img"]//img')
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        for image in image_lists:
            description = image.xpath('.//@alt').extract_first()
            src = image.xpath('.//@src').extract_first()
            item = SecretSpiderItem(image_description=description)
            if src.startswith('http'):  # 'https' URLs also match
                item['image_urls'] = [src]
            else:
                full_url = SITE_BASE_URL + src
                item['image_urls'] = [full_url]
            yield item

        # Check whether there is a next page and, if so, build the matching
        # request (the response itself does not tell us whether one exists).
        pages = response.xpath('//div[@class="page_num"]//a')
        next_page_url = ''
        for page in pages:
            page_text = page.xpath('./text()').extract_first()
            page_url = page.xpath('./@href').extract_first()
            if page_text == '下一页':
                next_page_url = page_url
        if next_page_url != '':
            yield Request(url=next_page_url, callback=self.parse)
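
Recent Scrapy versions can resolve the relative next-page href themselves via response.follow, so the loop above can collapse to a single lookup. A minimal sketch of that variant, assuming the same page_num container and '下一页' ("next page") link text:

        next_page = response.xpath(
            '//div[@class="page_num"]//a[text()="下一页"]/@href').get()
        if next_page is not None:
            # response.follow joins a relative href onto response.url.
            yield response.follow(next_page, callback=self.parse)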
Example #5
    def parse(self, response: Response):
        image_elements = response.xpath("//img/@src")
        text_elements = response.xpath(
            "//*[not(self::script)][not(self::style)][not(self::title)][string-length(normalize-space(text())) > 0]/text()"
        )
        yield {
            'url': response.url,
            # Lists rather than lazy map objects, so the item serializes.
            'text_elements': [text.get().strip() for text in text_elements],
            'image_elements': [
                'https://kpi.ua' + image.get() if image.get().startswith('/')
                else image.get()
                for image in image_elements
            ]
        }

        if response.url == self.start_urls[0]:
            link_elems = response.xpath(
                "//a/@href[starts-with(., 'https://kpi.ua/') or starts-with(., '/')]"
            )
            links = [link.get() for link in link_elems if link.get() != "/"]
            for link in links[:20]:
                if link.startswith("/"):
                    link = "https://kpi.ua" + link
                yield scrapy.Request(link, self.parse)
Example #6
 def parse(self, response: Response):
     all_images = response.xpath("//img/@src[starts-with(., 'http')]")
     all_text = response.xpath(
         "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
     )
     yield {
         'url': response.url,
         'payload': [{'type': 'text', 'data': text.get().strip()} for text in all_text] +
                    [{'type': 'image', 'data': image.get()} for image in all_images]
     }
     if response.url == self.start_urls[0]:
         all_links = response.xpath(
             "//a/@href[starts-with(., '//www.ukr.net/')][substring(., string-length() - 4) = '.html']"
         )
         selected_links = [link.get() for link in all_links][:19]
         for link in selected_links:
             # The matched hrefs are protocol-relative, so add the scheme.
             yield scrapy.Request('https:' + link, self.parse)
Example #7
 def parse(self, response: Response):
     images = response.xpath("//img/@src")
     texts = response.xpath(
         "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > "
         "30]/text()")
     hyperlinks = response.xpath("//a/@href")
     yield {
         'url': response.url,
         'payload': [{'type': 'text', 'data': text.get()} for text in texts] +
                    [{'type': 'image', 'data': image.get()} for image in images] +
                    [{'type': 'hyperlink', 'data': hyperlink.get()} for hyperlink in hyperlinks]
     }
     if response.url == self.start_urls[0]:
         links = response.xpath("//a/@href")
         selected_links = list(set(link.get() for link in links))[:19]
         for link in selected_links:
             yield scrapy.Request('http://basketball365.ru' + link,
                                  self.parse)
Example #8
    def parse(self, response: Response):
        all_images = response.xpath("//img/@src[starts-with(., 'http')]")
        all_text = response.xpath(
            "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
        )

        yield {
            'url': response.url,
            'payload': [{'type': 'text', 'data': text.get().strip()} for text in all_text] +
                       [{'type': 'image', 'data': image.get()} for image in all_images]
        }

        if response.url == self.start_urls[0]:
            link_elems = response.xpath(
                "//a/@href[starts-with(., 'https://kpi.ua/') or starts-with(., '/')]"
            )
            links = [
                link.get() for link in link_elems
                if link.get() != "https://kpi.ua/"
            ][:19]
            for link in links:
                # Prefix only site-relative hrefs; absolute links pass through.
                if link.startswith("/"):
                    link = 'https://kpi.ua' + link
                yield scrapy.Request(link, self.parse)
Example #9
 def parse(self, response: Response):
     all_images = response.xpath("//div[@class='foto']/@style[starts-with(., 'background-image: url(/')]")
     all_text = response.xpath("//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()")
     # Each @style value starts with the 22-character prefix
     # "background-image: url(" and ends with two closing characters,
     # so the [22:-2] slice leaves the bare image path.
     yield {
         'url': response.url,
         'payload': [{'type': 'text', 'data': text.get().strip()} for text in all_text] +
                    [{'type': 'image', 'data': 'https://stejka.com' + image.get()[22:-2]} for image in all_images]
     }
     if response.url == self.start_urls[0]:
         all_links = response.xpath(
             "//a/@href[starts-with(., '/rus/')]")
         selected_links = ['https://stejka.com' + link.get() for link in all_links][:20]
         for link in selected_links:
             yield scrapy.Request(link, self.parse)
Example #10
 def parse(self, response: Response):
     all_images = response.xpath("//img/@data-src[starts-with(., 'http')]")
     all_text = response.xpath("//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > "
                               "30]/text()")
     yield {
         'url': response.url,
         'payload': [{'type': 'text', 'data': text.get().strip()} for text in all_text] +
                    [{'type': 'image', 'data': image.get()} for image in all_images]
     }
     if response.url == self.start_urls[0]:
         all_links = response.xpath(
             "//a/@href[starts-with(., '/')]")
         selected_links = ['https://isport.ua' + link.get() for link in all_links][:20]
         for link in selected_links:
             yield scrapy.Request(link, self.parse)
Example #11
 def extract_market(response: Response) -> dict:
     data = {}
     for field in response.xpath("//div[contains(@class, 'group-ema-referral-overview')]/dl/dl"):
         key = '\n'.join(field.xpath("dt[@role='heading']/button/text()").getall())
         value = '\n'.join(field.xpath("dd[@role='region']/div/p/text()").getall())
         data[key] = value
     return data
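
For context, a sketch of how a callback might consume extract_market; the spider class and start URL are hypothetical, only the helper itself comes from the example above:

 class MarketSpider(scrapy.Spider):
     name = 'market'  # hypothetical spider wrapping extract_market
     start_urls = ['https://example.com/referral-overview']  # placeholder

     def parse(self, response: Response):
         # extract_market returns a {heading: body} dict built from the
         # dt/dd accordion, so it can be yielded as the item directly.
         yield extract_market(response)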
Example #12
 def parse(self, response: Response):
     all_images = response.xpath("//img/@src[starts-with(., 'https')]")
     yield {
         'url': response.url,
         'payload': [{'type': 'image', 'data': image.get()} for image in all_images]
     }
     if response.url == self.start_urls[0]:
         all_links = response.xpath(
             "//a/@href[starts-with(., 'https://uahotels.info/')]")
         selected_links = [link.get() for link in all_links][:19]
         for link in selected_links:
             yield scrapy.Request(link, self.parse)
Example #13
    def parse(self, response: Response) -> Iterator[Union[Request, Dict]]:
        """Handler for the HTTP response from the site.

        Parses the data tables and issues requests for the next pages.

        :param response: the response received from Scrapy
        :return: iterator over dicts with parsed results and over requests for the next pages
        """
        symbol = response.meta['symbol']
        link_extractor = LinkExtractor(
            allow=rf'https://www\.nasdaq\.com/symbol/{symbol.lower()}/insider-trades\?page=\d+'
        )
        link: Link
        for link in link_extractor.extract_links(response):
            match_page_number: Optional[Match] = re.search(
                r'page=(\d+)', link.url)
            if match_page_number is not None:
                page_number: int = int(match_page_number.group(1))
                if page_number <= MAX_PAGE:
                    yield Request(link.url, meta={'symbol': symbol})

        for row in response.xpath(
                '//div[@id="content_main"]//div[@class="genTable"]/table[@class="certain-width"]/tr'
        ):
            raw_row = RawRow.from_selector(row, symbol)
            try:
                yield ParsedRow.from_raw_row(raw_row).as_dict()
            except ValueError:
                logging.exception(
                    'Error while parsing a row of the insider-trades table.'
                )
Example #14
def _load_model(response: Response) -> Dict:
    script = response.xpath(
        "/html/body/script[text()[contains(.,'window.jsonModel = ')]]/text()"
    ).extract_first()
    jsmodel = script[len("window.jsonModel = ") :]
    model = json.loads(jsmodel)
    return model
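
A minimal sketch of calling _load_model from a callback, for context; the 'properties' key and the yielded fields are assumptions for illustration, not part of the example:

 def parse_search(self, response: Response):
     model = _load_model(response)  # the embedded window.jsonModel state
     for entry in model.get('properties', []):  # 'properties' is assumed
         yield {'id': entry.get('id'), 'price': entry.get('price')}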
Example #15
 def parse(self, response: Response):
     products = response.xpath("//div[contains(@class, 'cell item')]")[:20]
     for product in products:
         yield {
             'description': product.xpath("./h3/a[@class='b1c-name-uk']/text()").get(),
             'price': product.xpath("substring-before(./p[contains(@class, 'b1c-withoutprice')]/text(),' грн.')").get(),
             'img': product.xpath("./div/a/img[@id='product']/@src[starts-with(., 'https')]").get()
         }
Example #16
 def parse(self, response: Response):
     products = response.xpath("//div[contains(@class, 'row table-row')]")[:20]
     for product in products:
         # Relative './/' paths keep each lookup inside one product block
         # instead of indexing into page-wide extract() lists.
         yield {
             'description': product.xpath(".//a[contains(@class, 'pnameh')]/text()").get(),
             'price': product.xpath(".//div[contains(@class, 'pprice')]/text()").get(),
             'img': 'https://odissey.kiev.ua/' + product.xpath(".//img[contains(@class, 'thumbnail')]/@src").get()
         }
Example #17
 def _get_floorplan_images(self, response: Response) -> List[str]:
     xpath = "//div[@id = 'floorplan-1']//div[contains(@class, 'ui-modal-gallery__asset')]/@style"
     style = response.xpath(xpath).extract_first()
     if style:
         match = re.match(r".*url\('(.*)'\).*", style)
         if match:
             return [match.group(1)]
     return []
Example #18
 def parse(self, response: Response):
     products = response.xpath(
         "//section[contains(@class, 'product-tile_product')]")[:20]
     for product in products:
         yield {
             'description': product.xpath("./@data-name").get(),
             'price': product.xpath("./@data-price").get(),
             'img': product.xpath("./@data-img").get()
         }
Example #19
    def parse(self, response: Response, **kwargs):
        articles = response.xpath('//div[@class="pad5 english_article persian_article small_font"]')
        for article in articles:
            download_url = article.css('.article_links').xpath('(.//a)[3]/@href').get()
            download_url = response.urljoin(download_url)

            info_url = article.css('.article_links').xpath('(.//a)[4]/@href').get()
            info_url = response.urljoin(info_url)

            yield Request(info_url, cb_kwargs={'download_url': download_url}, callback=self.parse_info)
Example #20
def _load_property_page_model(response: Response) -> Dict:
    script = (
        response.xpath(
            "/html/body/script[text()[contains(.,'window.PAGE_MODEL = ')]]/text()"
        )
        .extract_first()
        .strip()
    )
    jsmodel = script[len("window.PAGE_MODEL = ") :]
    model = json.loads(jsmodel)
    return model
Example #21
 def parse(self, response: Response):
     products = response.xpath(
         "//div[contains(@class, 'product-block')]")[:19]
     for product in products:
         yield {
             'description': product.xpath(".//img[@class='img-responsive']/@title").get(),
             'price': product.xpath(".//span[@class='special-price']/text()").get(),
             'img': product.xpath(".//img[@class='img-responsive']/@src").get()
         }
Example #22
 def parse(self, response: Response):
     products = response.xpath(
         "//div[contains(@class, 'ypi-grid-list__item_body')]")[:20]
     for product in products:
         yield {
             'description': product.xpath(".//a[@class='product-title']/@title").get(),
             'price': product.xpath(".//span[@class='ty-price-num']/text()").get(),
             'img': product.xpath(".//img[@class='ty-pict cm-image']/@src").get()
         }
Example #23
 def parse(self, response: Response):
     products = response.xpath("//ul[@id=\"product_list\"]/li")[:20]
     for product in products:
         yield {
             'description': product.xpath(".//a[@class='b1c-name-uk']/@title").get(),
             'price': product.xpath(".//div[@class='content_price']/span/text()").get(),
             'img': product.xpath(".//img[@class='b1c-img']/@src").get()
         }
Example #24
 def parse_main_page(self, response: Response):
     book_urls = response.xpath(BOOK_URL)
     genre_urls = response.xpath(GENRE_URL)
     for url in book_urls:
         short_name = get_book_name_from_url(url.get())
         yield Request(
             url=BASE_URL.format(short_name),
             callback=self.parse_book_info,
             cb_kwargs=dict(short_name=short_name)
         )
     genre_urls = [x.get() for x in genre_urls]
     genres = list(set([x.replace('/', '') for x in genre_urls]))
     for k in range(2):
         yield Request(
             url=response.urljoin(genre_urls[k]),
             callback=self.parse_books_in_page
         )
         yield Request(
             url=GENRE_LIST_URL.format(genres[k]),
             callback=self.parse_genre_list,
             cb_kwargs=dict(genre=genres[k])
         )
Example #25
 def parse(self, response: Response):
     products = response.xpath(
         "//li[contains(@class, 'product-item')]")[:20]
     for product in products:
         yield {
             'description': product.xpath(".//div[@class='item-info']/p[@class='h4']/a/text()").get(),
             'price': product.xpath(".//span[@class='value']/text()").get(),
             'img': product.xpath(".//img[@class='img-product']/@src").get()
         }
Example #26
 def parse(self, response: Response):
     furnitures = response.xpath(
         "//div[contains(@class, 'product-block')]")[:20]
     for furniture in furnitures:
         yield {
             'price': furniture.xpath(
                 "./div[@class='product-meta']//span[@class='special-price']/text()"
             ).get(),
             'description': furniture.xpath(".//a[@class='img']/@title").get(),
             'image': furniture.xpath(".//img/@src").get()
         }
Example #27
 def parse(self, response: Response):
     products = response.xpath("//div[contains(@class, 'port-i')]")[:20]
     for product in products:
         yield {
             'description': product.xpath(".//img[@class='UI-CATALOG-PRODUCT-IMAGE']/@title").get(),
             'price': product.xpath(".//span[@class='price-value UAH']/@content").get(),
             'img': product.xpath(".//img[@class='UI-CATALOG-PRODUCT-IMAGE']/@src").get()
         }
Example #28
 def parse(self, response: Response):
     all_images = response.xpath(
         "//div/@style[starts-with(., 'background-image')]")
     all_text = response.xpath(
         "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
     )
     yield {
         'url': response.url,
         'payload': [{'type': 'text', 'data': text.get().strip()} for text in all_text] +
                    [{'type': 'image', 'data': image.get()} for image in all_images]
     }
     if response.url == self.start_urls[0]:
         all_links = response.xpath("//a/@href[starts-with(., '/rus')]")
         selected_links = [link.get() for link in all_links][:19]
         for link in selected_links:
             # The hrefs are site-relative and scrapy.Request needs an
             # absolute URL, so resolve them against the current page.
             yield scrapy.Request(response.urljoin(link), self.parse)
Example #29
 def parse(self, response: Response):
     products = response.xpath("//div[contains(@class, 'item_div')]")[:20]
     for product in products:
         # Relative './/' paths keep each lookup inside one product block
         # instead of indexing into page-wide extract() lists.
         yield {
             'description': product.xpath(".//div[contains(@class, 'item_nazvanie')]/a/text()").get(),
             'price': product.xpath(".//div[contains(@class, 'price fl')]/text()").get(),
             'img': product.xpath(".//img/@src").get(),
         }
Example #30
 def parse(self, response: Response):
     products = response.xpath(
         "//div[contains(@class, 'product-container')]")[:20]
     for product in products:
         yield {
             'description': product.xpath(".//img[@class='replace-2x img-responsive']/@title").get(),
             'price': product.xpath(".//span[@class='price product-price']/text()").get(),
             'img': product.xpath(".//img[@class='replace-2x img-responsive']/@src").get()
         }