Example #1
    def parse(self, response: Response):
        image_elements = response.xpath("//img/@src")
        text_elements = response.xpath(
            "//*[not(self::script)][not(self::style)][not(self::title)][string-length(normalize-space(text())) > 0]/text()"
        )
        yield {
            'url': response.url,
            # list comprehensions give concrete, serializable lists instead of lazy map objects
            'text_elements': [text.get().strip() for text in text_elements],
            'image_elements': [
                'https://kpi.ua' + image.get() if image.get().startswith('/')
                else image.get()
                for image in image_elements
            ],
        }

        if response.url == self.start_urls[0]:
            link_elems = response.xpath(
                "//a/@href[starts-with(., 'https://kpi.ua/') or starts-with(., '/')]"
            )
            links = [link.get() for link in link_elems if link.get() != "/"]
            for link in links[:20]:
                if link.startswith("/"):
                    link = "https://kpi.ua" + link
                yield scrapy.Request(link, self.parse)
Example #2
 def parse(self, response: Response):
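     # Collect absolute data-src image URLs and text nodes longer than 20 characters.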
     img_elems = response.xpath("//img/@data-src[starts-with(., 'http')]")
     text_elems = response.xpath(
         "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 20]/text()"
     )
     yield {
         'url': response.url,
         'payload':
             [{'type': 'text', 'data': text.get().strip()} for text in text_elems]
             + [{'type': 'image', 'data': image.get()} for image in img_elems],
     }
     if response.url == self.start_urls[0]:
         link_elems = response.xpath(
             "//a/@href[starts-with(., 'https://isport.ua/') or starts-with(., '/')]"
         )
         links = [
             link.get() for link in link_elems if link.get() != "/"
         ]
         for link in links[:19]:
             if link.startswith("/"):
                 link = "https://isport.ua" + link
             yield scrapy.Request(link, self.parse)
Example #3
    def parse(self, response: Response):
        '''
        Transform the fetched results.
        :param response: the response for the fetched url
        :return: yields the items directly
        '''
        # First, extract all of the images
        image_lists = response.xpath('.//div[@id = "list_img"]//img')
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        for image in image_lists:
            description = image.xpath('.//@alt').extract()[0]
            src = image.xpath('.//@src').extract_first()
            item = SecretSpiderItem(image_description=description)
            if src.startswith('http'):  # covers both http:// and https://
                item['image_urls'] = [src]
            else:
                full_url = SITE_BASE_URL + src
                item['image_urls'] = [full_url]
            yield item

        # Check whether a "next page" link exists and, if so, build the corresponding url
        # (a valid response alone does not tell us whether there is a next page)
        pages = response.xpath('//div[@class="page_num"]//a')
        next_page_url = ''
        for page in pages:
            page_text = page.xpath('./text()').extract_first()
            page_url = page.xpath('./@href').extract_first()
            if page_text == '下一页':  # link labelled "next page"
                next_page_url = page_url
        if next_page_url:
            yield Request(url=next_page_url, callback=self.parse)
Example #4
 def parse(self, response: Response):
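     # Gather image sources, text nodes longer than 30 characters, and every hyperlink on the page.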
     images = response.xpath("//img/@src")
     texts = response.xpath(
         "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > "
         "30]/text()")
     hyperlinks = response.xpath("//a/@href")
     yield {
         'url': response.url,
         'payload':
             [{'type': 'text', 'data': text.get()} for text in texts]
             + [{'type': 'image', 'data': image.get()} for image in images]
             + [{'type': 'hyperlink', 'data': hyperlink.get()} for hyperlink in hyperlinks],
     }
     if response.url == self.start_urls[0]:
         links = response.xpath("//a/@href")
         selected_links = list(set(link.get() for link in links))[:19]
         for link in selected_links:
             yield scrapy.Request('http://basketball365.ru' + link,
                                  self.parse)
Example #5
    def parse(self, response: Response):

        pictures = response.xpath("//img/@src[starts-with(., 'http')]")
        strings = response.xpath(
            "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
        )

        yield {
            'url': response.url,
            'payload':
                [{'type': 'text', 'data': text.get().strip()} for text in strings]
                + [{'type': 'image', 'data': image.get()} for image in pictures],
        }

        if response.url == self.start_urls[0]:

            refs = response.xpath("//a/@href")
            ref = [r.get() for r in refs][:15]
            for r in ref:
                yield scrapy.Request('http://www.posolstva.org.ua' + r,
                                     self.parse)
Example #6
 def parse(self, response: Response):
     all_images = response.xpath("//img/@src[starts-with(., 'http')]")
     all_text = response.xpath(
         "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
     )
     yield {
         'url': response.url,
         'payload':
             [{'type': 'text', 'data': text.get().strip()} for text in all_text]
             + [{'type': 'image', 'data': image.get()} for image in all_images],
     }
     if response.url == self.start_urls[0]:
         all_links = response.xpath(
             "//a/@href[starts-with(., '//www.ukr.net/')][substring(., string-length() - 4) = '.html']"
         )
         selected_links = [link.get() for link in all_links][:19]
         for link in selected_links:
             link = 'https:' + link
             yield scrapy.Request(link, self.parse)
Example #7
    def parse(self, response: Response):
        all_images = response.xpath("//img/@src[starts-with(., 'http')]")
        all_text = response.xpath(
            "//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()"
        )

        yield {
            'url': response.url,
            'payload':
                [{'type': 'text', 'data': text.get().strip()} for text in all_text]
                + [{'type': 'image', 'data': image.get()} for image in all_images],
        }

        if response.url == self.start_urls[0]:
            link_elems = response.xpath(
                "//a/@href[starts-with(., 'https://kpi.ua/') or starts-with(., '/')]"
            )
            links = [
                link.get() for link in link_elems
                if link.get() != "https://kpi.ua/"
            ][:19]
            for l in links:
                # hrefs may be absolute or site-relative; urljoin resolves both correctly
                link = response.urljoin(l)
                yield scrapy.Request(link, self.parse)
Example #8
 def parse_movie(self, response: Response):
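     # Build a flat item from a single movie page: title, year, score, credits, genre, country, runtime, rank and poster URL.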
     item = {}
     item['entity'] = 'movie'
     item['movie'] = response.xpath(
         '//h1/span[@property="v:itemreviewed"]/text()').get().split()[0]
     item['year'] = response.xpath(
         '//h1/span[@class="year"]/text()').get()[1:-1]
     item['score'] = response.xpath('//strong/text()').get()
     item['director'] = response.xpath(
         '//a[@rel="v:directedBy"]/text()').getall()
     item['actor'] = response.xpath(
         '//a[@rel="v:starring"]/text()').getall()
     item['genre'] = response.xpath(
         '//span[@property="v:genre"]/text()').getall()
     info = ''.join(response.xpath('//div[@id="info"]/text()').getall())
     item['country'] = info.replace('/', '').split()[0]
     item['length'] = re.search(
         r'\d+',
         response.xpath(
             '//span[@property="v:runtime"]/text()').get()).group()
     item['rank'] = re.search(
         r'\d+',
         response.xpath('//span[@class="top250-no"]/text()').get()).group()
     item['img_url'] = response.xpath(
         '//div[@id="mainpic"]//img/@src').get()
     # item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
     # item['name'] = response.xpath('//div[@id="name"]').get()
     # item['description'] = response.xpath('//div[@id="description"]').get()
     return item
Example #9
    def parse_sv_links(self, response: Response) -> FoundLink:
        """
        Yields FoundLink items for broken links not caught by an error or exception and moves them through the
        broken_link_detector pipeline.

        :param response: A response produced by a Rule
        :return: A FoundLink item to be passed to the pipeline
        """
        title = response.css('title::text').get()

        if self.css:
            links = response.css(self.css)
        else:
            links = response

        links = links.xpath('./descendant::*[@href]')

        for link in links:
            if 'vufind' in link.attrib['href'] or 'sfx' in link.attrib['href']:
                link_obj = FoundLink()
                link_obj['a_origin'] = response.url
                link_obj['b_title'] = title
                link_obj['c_url'] = assemble_absolute_link(
                    response.url, link.attrib['href'])
                link_obj['d_text'] = link.xpath('./text()').get()
                yield link_obj
Example #10
 def _parse_sections(self, response: Response, folder_root: pathlib.Path):
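     # Follow every article teaser link in this weekly-edition section, carrying the target folder in the request meta.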
     section_name = response.css(".ds-section-headline::text").get()
     meta = {'folder_root': folder_root}
     yield from response.follow_all(
         css=".layout-weekly-edition-section .teaser a.headline-link",
         callback=self._parse_article,
         meta=meta)
Example #11
    def parse(self, response: Response, **kwargs):
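        # For each article block, resolve the absolute download and info URLs, then request the info page
        # with the download URL passed along via cb_kwargs.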
        articles = response.xpath('//div[@class="pad5 english_article persian_article small_font"]')
        for article in articles:
            download_url = article.css('.article_links').xpath('(.//a)[3]/@href').get()
            download_url = response.urljoin(download_url)

            info_url = article.css('.article_links').xpath('(.//a)[4]/@href').get()
            info_url = response.urljoin(info_url)

            yield Request(info_url, cb_kwargs={'download_url': download_url}, callback=self.parse_info)
Example #12
 def get_api_hastag_posts(self, response: Response):
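     # Paginate the hashtag media API: queue the next page (if any) and emit an item per post,
     # following posts with many comments or likes for full parsing.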
     hashtag = response.json()['data']['hashtag']
     url = self.get_url_to_query_next_posts_for_api(hashtag)
     if url:
         yield response.follow(url, callback=self.get_api_hastag_posts)
     # posts are emitted even when there is no next page to queue
     posts: list = hashtag['edge_hashtag_to_media']['edges']
     for post in posts:
         yield InstaPostItem(data=post['node'])
         if post['node']['edge_media_to_comment']['count'] > 30 or post[
                 'node']['edge_liked_by']['count'] > 100:
             yield response.follow(f'/p/{post["node"]["shortcode"]}/',
                                   callback=self.post_page_parse)
Example #13
def test_crawl(tmpdir):
    settings = {'CRAWL_ONCE_PATH': str(tmpdir)}
    crawler = get_crawler(settings_dict=settings)
    req1 = scrapy.Request('http://example.com/1', meta={'crawl_once': True})
    req2 = scrapy.Request('http://example.com/2')
    req3 = scrapy.Request('http://example.com/3', meta={'crawl_once': True})

    resp1 = Response(req1.url, request=req1)
    resp2 = Response(req2.url, request=req2)

    with opened_middleware(crawler) as mw:

        # 1. check spider middleware interface
        assert len(mw.db) == 0
        assert crawler.stats.get_value('crawl_once/initial') == 0
        output = [{}, scrapy.Request('http://example.com')]

        # crawl_once is False
        res = list(mw.process_spider_output(resp2, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 0

        # crawl_once is True
        res = list(mw.process_spider_output(resp1, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0
        assert crawler.stats.get_value('crawl_once/stored') == 1

        # 2. check downloader middleware interface
        assert mw.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0

        with pytest.raises(IgnoreRequest):
            mw.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1

        assert mw.process_request(req3, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0

    crawler = get_crawler(settings_dict=settings)
    with opened_middleware(crawler) as mw2:
        # it reuses the same file, so there are records
        assert len(mw2.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 1
        assert mw2.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0
        with pytest.raises(IgnoreRequest):
            mw2.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert mw2.process_request(req3, crawler.spider) is None
Example #14
 def parse(self, response: Response):
     all_images = response.xpath("//div[@class='foto']/@style[starts-with(., 'background-image: url(/')]")
     all_text = response.xpath("//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > 30]/text()")
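     # Each matched @style holds something like "background-image: url(/path...)";
     # the slice below drops that wrapper to keep just the site-relative image path.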
     yield {
         'url': response.url,
         'payload': [{'type': 'text', 'data': text.get().strip()} for text in all_text] +
                    [{'type': 'image', 'data': 'https://stejka.com' + image.get()[22:-2]} for image in all_images]
     }
     if response.url == self.start_urls[0]:
         all_links = response.xpath(
             "//a/@href[starts-with(., '/rus/')]")
         selected_links = ['https://stejka.com' + link.get() for link in all_links][:20]
         for link in selected_links:
             yield scrapy.Request(link, self.parse)
Example #15
 async def parse_book(self, response: Response) -> dict:
     url_sha256 = hashlib.sha256(response.url.encode("utf-8")).hexdigest()
     page = response.meta["playwright_page"]
     await page.screenshot(
         path=Path(__file__).parent / "books" / f"{url_sha256}.png", full_page=True
     )
     await page.close()
     return {
         "url": response.url,
         "title": response.css("h1::text").get(),
         "price": response.css("p.price_color::text").get(),
         "breadcrumbs": response.css(".breadcrumb a::text").getall(),
         "image": f"books/{url_sha256}.png",
     }
Example #16
    def parse_forum_page(self,
                         response: Response,
                         forum_url: str = None) -> None:
        """
        Forum page callback. Parses TopicItem.
        Follows next forum page and threads.
        :param forum_url: forum url, from first page. Will be extracted from response meta if not provided.
        :param response: scrapy crawl response
        """
        if forum_url is None:
            forum_url = response.meta['forum_url']

        # threads = response.css('a.topictitle')
        threads = response.css(
            'div.topic_read,div.topic_read_hot,div.topic_read_locked,div.topic_moved,div.sticky_read,'
            'div.sticky_read_locked,div.announce_read,div.announce_read_locked'
        )
        # if len(threads) != len(threads2):
        #     print(response.url)
        too_old_thread_found = False
        for thread_container in threads:
            thread = thread_container.css('a.topictitle')
            topic_loader = ItemLoader(item=TopicItem(), response=response)
            thread_href_selector = thread.css('a::attr(href)')
            thread_link = response.urljoin(thread_href_selector.get())
            topic_loader.add_value('id',
                                   thread_href_selector.re(r'-(t[0-9]*).html'))
            topic_loader.add_value('thread_link', thread_link)
            topic_loader.add_value('forum_link', forum_url)
            topic_loader.add_value('name', thread.css('a::text').get())
            yield topic_loader.load_item()

            if not self.full_crawl:
                last_post_date_candidates = thread_container.css(
                    'span.post-date::text').getall()
                last_post_date = max(
                    map(lambda x: parse_date(x), last_post_date_candidates))
                if last_post_date < self.start_date:
                    too_old_thread_found = True
                    continue

            yield scrapy.Request(thread_link + "?sd=d",
                                 callback=self.parse_thread)

        next_page = response.css('a[rel=next]::attr(href)').get()
        if next_page and not too_old_thread_found:
            next_request = response.urljoin(next_page)
            yield scrapy.Request(next_request,
                                 callback=self.parse_forum_page,
                                 meta={'forum_url': forum_url})
Example #17
    def parse_word(self, response: Response) -> dict:
        """
        Parses the word and extracts its type (f, m, adj, v or v*), the url and the message to send
        :param response: scrapy.http.response.Response
        :return: dict
        """

        # extract type, one of: (f, m, adj, v or v*)
        l_items = response.css(
            r"tr>td[colspan='2'][valign='TOP'][width='650']>font>i::text"
        ).extract()
        l_items = list(map(lambda item: item.strip(), l_items))

        type_possibilities = ["m", "f", "adj", "adv", "v", "v*", "pl", 'símb']

        l_type = list(filter(lambda item: item in type_possibilities, l_items))

        # there should be at least one type; if not, raise because this is a case we do not handle
        l_type = [item.strip() for item in l_type]
        try:
            s_type = l_type[0]
        except IndexError:
            str_err = "Something wrong with this l_items: '{}' in url: '{}'".format(
                l_items, response.url)
            logger.error(str_err)
            raise IndexError(str_err)

        # if the word is plural, add an 's' to the type
        if len(l_type) > 1:
            if "pl" == l_type[1]:
                s_type += "s"

        # get the word from the title
        word = response.css(r"span[class='enc']::text").extract()[0].strip()

        data = {
            'word': word,  # it's only 1 element
            'type': s_type,
            'url': response.url,
            'used': False,
            'next_dict_id': self.start_id
        }

        # creates the message to send to twitter depending on the type of the word
        data["msg"] = return_twitter_msg(data)
        print(data)

        yield data
Example #18
 def extract_market(response: Response) -> dict:
     data = {}
     for field in response.xpath("//div[contains(@class, 'group-ema-referral-overview')]/dl/dl"):
         key = '\n'.join(field.xpath("dt[@role='heading']/button/text()").getall())
         value = '\n'.join(field.xpath("dd[@role='region']/div/p/text()").getall())
         data[key] = value
     return data
Example #19
    def errback_broken_link(self, failure: Failure) -> FoundLink:  # Failure may not be the right typehint
        """
        Handles behavior for links which cause Twisted failures - which is most of the broken links this spider
        hopes to find

        :param failure: A Twisted failure raised by the Retry middleware
        :return: yields FoundLink items via parse_broken_link for DNS/TCP failures
        """
        # Structure of this function heavily inspired by:
        # https://docs.scrapy.org/en/latest/topics/request-response.html#topics-request-response-ref-errbacks

        # If it's a TCP or DNS error, short-circuit to the pipeline
        if failure.check(DNSLookupError, TCPTimedOutError):
            self.logger.info(f'Handled DNS/TCP related error. {failure.request.url}')
            request = failure.request
            dummy_response = Response(
                url=request.url,
                status=404,  # Kind of a lie
                request=request
            )
            yield from self.parse_broken_link(dummy_response)

        # If the client timed out, report that
        elif failure.check(TimeoutError):
            self.logger.info(f'Client timeout. {failure.request.url}')
            self.logger.error(repr(failure))
Example #20
 def parse(self, response: Response, **kwargs):
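     # First call: read the CSRF token from the shared JS data and submit the login form;
     # once the login JSON comes back (no JS data, hence AttributeError), follow the tag page.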
     try:
         js_data = self.get_js_shared_data(response)
         yield scrapy.FormRequest(
             self.__login_url,
             method='POST',
             callback=self.parse,
             formdata={
                 'username': self.__login,
                 'enc_password': self.__password
             },
             headers={'X-CSRFToken': js_data['config']['csrf_token']})
     except AttributeError as e:
         if response.json().get('authenticated'):
             yield response.follow(self.__tag_url,
                                   callback=self.first_tag_page_parse)
Example #21
 def first_tag_page_parse(self, response: Response):
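     # First hashtag page (server-rendered): queue the API request for the next batch,
     # emit the hashtag item, then an item per post, following busy posts for full parsing.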
     js_data = self.get_js_shared_data(response)
     hashtag: dict = js_data['entry_data']['TagPage'][0]['graphql'][
         'hashtag']
     url = self.get_url_to_query_next_posts_for_api(hashtag)
     if url:
         yield response.follow(url, callback=self.get_api_hastag_posts)
     hashtag['posts_count'] = hashtag['edge_hashtag_to_media']['count']
     posts = hashtag.pop('edge_hashtag_to_media')['edges']
     yield InstaHashTagItem(data=hashtag)
     for post in posts:
         yield InstaPostItem(data=post['node'])
         if post['node']['edge_media_to_comment']['count'] > 30 or post[
                 'node']['edge_liked_by']['count'] > 100:
             yield response.follow(f'/p/{post["node"]["shortcode"]}/',
                                   callback=self.post_page_parse)
Example #22
 def parse(self, response: Response):
     all_images = response.xpath("//img/@src[starts-with(., 'https')]")
     yield {
         'url': response.url,
         'payload': [{'type': 'image', 'data': image.get()} for image in all_images],
     }
     if response.url == self.start_urls[0]:
         all_links = response.xpath(
             "//a/@href[starts-with(., 'https://uahotels.info/')]")
         selected_links = [link.get() for link in all_links][:19]
         for link in selected_links:
             yield scrapy.Request(link, self.parse)
Example #23
 def _parse_world_this_week(self, response: Response,
                            folder_root: pathlib.Path):
     meta = {'folder_root': folder_root}
     yield from response.follow_all(
         css=".layout-weekly-edition-wtw .weekly-edition-wtw__item a",
         callback=self._parse_article,
         meta=meta)
Example #24
def _load_model(response: Response) -> Dict:
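    # Locate the inline <script> that assigns window.jsonModel, strip the assignment prefix and parse the remaining JSON.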
    script = response.xpath(
        "/html/body/script[text()[contains(.,'window.jsonModel = ')]]/text()"
    ).extract_first()
    jsmodel = script[len("window.jsonModel = ") :]
    model = json.loads(jsmodel)
    return model
Example #25
def test_log_formatter_scrapy_1():
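    # The formatter should report the processed request under its original URL,
    # so the "Crawled" log line matches the pre-middleware request.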
    middleware = get_test_middleware()
    logformatter = CrawleraFetchLogFormatter()
    formatter = Formatter()

    for case in get_test_requests():
        original = case["original"]
        response = Response(original.url)
        processed = middleware.process_request(original, foo_spider)

        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
            continue

        # crawled
        result = logformatter.crawled(processed, response, foo_spider)
        assert result["args"]["request"] == str(original)
        record = LogRecord(name="logger",
                           pathname="n/a",
                           lineno=1,
                           exc_info=None,
                           **result)
        logstr = formatter.format(record)
        expected = "Crawled (200) {request} ['original url: {url}'] (referer: None)".format(
            request=original, url=original.url)
        assert logstr == expected
Example #26
 def parse(self, response: Response):
     all_images = response.xpath("//img/@data-src[starts-with(., 'http')]")
     all_text = response.xpath("//*[not(self::script)][not(self::style)][string-length(normalize-space(text())) > "
                               "30]/text()")
     yield {
         'url': response.url,
         'payload': [{'type': 'text', 'data': text.get().strip()} for text in all_text] +
                    [{'type': 'image', 'data': image.get()} for image in all_images]
     }
     if response.url == self.start_urls[0]:
         all_links = response.xpath(
             "//a/@href[starts-with(., '/')]")
         selected_links = ['https://isport.ua' + link.get() for link in all_links][:20]
         for link in selected_links:
             yield scrapy.Request(link, self.parse)
Example #27
    def process_response(self, request: Request, response: Response,
                         spider: Spider) -> Response:
        if request.meta.get('dont_cache', False):
            return response

        # Skip cached responses and uncacheable requests
        if 'cached' in response.flags or '_dont_cache' in request.meta:
            request.meta.pop('_dont_cache', None)
            return response

        # RFC2616 requires origin server to set Date header,
        # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
        if 'Date' not in response.headers:
            response.headers['Date'] = formatdate(usegmt=True)

        # Do not validate first-hand responses
        cachedresponse = request.meta.pop('cached_response', None)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/firsthand', spider=spider)
            self._cache_response(spider, response, request, cachedresponse)
            return response

        if self.policy.is_cached_response_valid(cachedresponse, response,
                                                request):
            self.stats.inc_value('httpcache/revalidate', spider=spider)
            return cachedresponse

        self.stats.inc_value('httpcache/invalidate', spider=spider)
        self._cache_response(spider, response, request, cachedresponse)
        return response
Example #28
    def parse(self, response: Response) -> Iterator[Union[Request, Dict]]:
        """HTTP response handler for the site.

        Parses the data tables and issues requests for the next pages.

        :param response: the response received from Scrapy
        :return: an iterator over dicts with parsed results and over requests for the next pages
        """
        symbol = response.meta['symbol']
        link_extractor = LinkExtractor(
            allow=
            rf'https://www\.nasdaq\.com/symbol/{symbol.lower()}/insider-trades\?page=\d+'
        )
        link: Link
        for link in link_extractor.extract_links(response):
            match_page_number: Optional[Match] = re.search(
                r'page=(\d+)', link.url)
            if match_page_number is not None:
                page_number: int = int(match_page_number.group(1))
                if page_number <= MAX_PAGE:
                    yield Request(link.url, meta={'symbol': symbol})

        for row in response.xpath(
                '//div[@id="content_main"]//div[@class="genTable"]/table[@class="certain-width"]/tr'
        ):
            raw_row = RawRow.from_selector(row, symbol)
            try:
                yield ParsedRow.from_raw_row(raw_row).as_dict()
            except ValueError:
                logging.exception(
                    'Error while parsing a row of the insider trades table.'
                )
Example #29
    def parse_forum(self, response: Response) -> None:
        """
        Forum callback. Parses ForumItem.
        Follows subforum links and thread links (through self.parse_forum_page() method).
        :param response: scrapy crawl response
        """
        forum_loader = ItemLoader(item=ForumItem(), response=response)
        forum_loader.add_value('link', response.request.url)
        forum_loader.add_css('name', 'h2 > a::text')
        yield forum_loader.load_item()

        subforums = response.css('a.forumtitle::attr(href)').getall()
        for forum in subforums:
            next_request = response.urljoin(forum)
            yield scrapy.Request(next_request, callback=self.parse_forum)

        yield from self.parse_forum_page(response, response.url)
Example #30
 def _get_floorplan_images(self, response: Response) -> List[str]:
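     # The floor plan image URL is embedded in an inline style as url('...'); extract it with a regex,
     # or return an empty list if nothing matches.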
     xpath = "//div[@id = 'floorplan-1']//div[contains(@class, 'ui-modal-gallery__asset')]/@style"
     style = response.xpath(xpath).extract_first()
     if style:
         match = re.match(r".*url\('(.*)'\).*", style)
         if match:
             return [(match.group(1))]
     return []
Example #31
 def start_requests(self):
     cookies = build_cookies(self)
     file_dir = os.getcwd()
     
     sub_dir = os.sep.join(['ship', 'shipping'])
     
     x = 0
     for fn_item in os.walk(sub_dir):
         for fn in fn_item[2]:
             self.source = fn.split(u' - ')[0]
             file_path = os.sep.join([file_dir, sub_dir, fn])
             response = Response(file_path, body=''.join(open(file_path, u'r').readlines()))
             response.body_as_unicode = lambda :response.body
             hxs = HtmlXPathSelector(response)
             a_tags = hxs.select('//table[@class="text2"]//a[@class="links2"]')
             for a_tag in a_tags:
                 detail_url = a_tag.select('@href').extract()[0]
                 try:
                     title = a_tag.select('text()').extract()[0]
                 except Exception as e:
                     continue
                 yield Request(self.home_page + detail_url, self.parse, cookies=cookies)
     
     print(x)
Example #32
 def replace(self, *args, **kwargs):
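     # Keep the current encoding unless the caller overrides it when building the replacement Response.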
     kwargs.setdefault('encoding', self.encoding)
     return Response.replace(self, *args, **kwargs)
Example #33
 def replace(self, *args, **kwargs):
     kwargs.setdefault("encoding", getattr(self, "_encoding", None))
     return Response.replace(self, *args, **kwargs)