    def parse_sv_links(self, response: Response) -> FoundLink:
        """
        Yields FoundLink items for potentially broken links not caught by an error or exception and moves them
        through the broken_link_detector pipeline.

        :param response: A response produced by a Rule
        :return: A FoundLink item to be passed to the pipeline
        """
        title = response.css('title::text').get()

        if self.css:
            links = response.css(self.css)
        else:
            links = response

        links = links.xpath('./descendant::*[@href]')

        for link in links:
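            # only links that point at the vufind or sfx services are of interest here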
            if 'vufind' in link.attrib['href'] or 'sfx' in link.attrib['href']:
                link_obj = FoundLink()
                link_obj['a_origin'] = response.url
                link_obj['b_title'] = title
                link_obj['c_url'] = assemble_absolute_link(
                    response.url, link.attrib['href'])
                link_obj['d_text'] = link.xpath('./text()').get()
                yield link_obj
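
assemble_absolute_link is defined elsewhere in this project; a minimal sketch of what such a helper could look like, assuming it only needs to resolve a possibly-relative href against the page URL (an illustration, not the project's actual implementation):

from urllib.parse import urljoin


def assemble_absolute_link(origin_url: str, href: str) -> str:
    # Resolve a relative or absolute href against the URL of the page it was found on.
    return urljoin(origin_url, href)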
Example #2
    def parse_forum_page(self,
                         response: Response,
                         forum_url: str = None) -> None:
        """
        Forum page callback. Parses TopicItem.
        Follows next forum page and threads.
        :param forum_url: forum url, from first page. Will be extracted from response meta if not provided.
        :param response: scrapy crawl response
        """
        if forum_url is None:
            forum_url = response.meta['forum_url']

        # threads = response.css('a.topictitle')
        threads = response.css(
            'div.topic_read,div.topic_read_hot,div.topic_read_locked,div.topic_moved,div.sticky_read,'
            'div.sticky_read_locked,div.announce_read,div.announce_read_locked'
        )
        too_old_thread_found = False
        for thread_container in threads:
            thread = thread_container.css('a.topictitle')
            topic_loader = ItemLoader(item=TopicItem(), response=response)
            thread_href_selector = thread.css('a::attr(href)')
            thread_link = response.urljoin(thread_href_selector.get())
            topic_loader.add_value('id',
                                   thread_href_selector.re(r'-(t[0-9]*).html'))
            topic_loader.add_value('thread_link', thread_link)
            topic_loader.add_value('forum_link', forum_url)
            topic_loader.add_value('name', thread.css('a::text').get())
            yield topic_loader.load_item()

            if not self.full_crawl:
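                # on incremental crawls, skip threads whose newest post is older than start_date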
                last_post_date_candidates = thread_container.css(
                    'span.post-date::text').getall()
                last_post_date = max(
                    map(lambda x: parse_date(x), last_post_date_candidates))
                if last_post_date < self.start_date:
                    too_old_thread_found = True
                    continue

            yield scrapy.Request(thread_link + "?sd=d",
                                 callback=self.parse_thread)

        next_page = response.css('a[rel=next]::attr(href)').get()
        if next_page and not too_old_thread_found:
            next_request = response.urljoin(next_page)
            yield scrapy.Request(next_request,
                                 callback=self.parse_forum_page,
                                 meta={'forum_url': forum_url})
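
parse_date is a project helper that is not shown in these snippets; a rough sketch of such a helper, assuming the forum prints dates in a format dateutil can handle (an assumption, not the project's actual implementation):

from datetime import datetime

from dateutil import parser


def parse_date(raw: str) -> datetime:
    # Hypothetical sketch: delegate the format handling to dateutil.
    return parser.parse(raw, dayfirst=True)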
Example #3
 async def parse_book(self, response: Response) -> dict:
     url_sha256 = hashlib.sha256(response.url.encode("utf-8")).hexdigest()
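     # scrapy-playwright exposes the Playwright page object here because the
     # originating request set "playwright_include_page" (see the parse() example below)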
     page = response.meta["playwright_page"]
     await page.screenshot(
         path=Path(__file__).parent / "books" / f"{url_sha256}.png", full_page=True
     )
     await page.close()
     return {
         "url": response.url,
         "title": response.css("h1::text").get(),
         "price": response.css("p.price_color::text").get(),
         "breadcrumbs": response.css(".breadcrumb a::text").getall(),
         "image": f"books/{url_sha256}.png",
     }
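
For this coroutine callback to receive a "playwright_page", the spider has to run with scrapy-playwright enabled; following the scrapy-playwright README, the project settings usually look roughly like this:

DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"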
Example #4
    def parse_word(self, response: Response) -> dict:
        """
        Parses the word and subtracts the type(f, m, adj, v or v*), the url and the message to send
        :param response: scrapy.http.response.Response
        :return: dict
        """

        # extract type, one of: (f, m, adj, v or v*)
        l_items = response.css(
            r"tr>td[colspan='2'][valign='TOP'][width='650']>font>i::text"
        ).extract()
        l_items = list(map(lambda item: item.strip(), l_items))

        type_possibilities = ["m", "f", "adj", "adv", "v", "v*", "pl", 'símb']

        l_type = list(filter(lambda item: item in type_possibilities, l_items))

        # there should be at least one type; if not, raise because this is a case we do not handle
        l_type = [item.strip() for item in l_type]
        try:
            s_type = l_type[0]
        except IndexError:
            str_err = "Something wrong with this l_items: '{}' in url: '{}'".format(
                l_items, response.url)
            logger.error(str_err)
            raise IndexError(str_err)

        # if the word is also marked plural, append an 's' to the type
        if len(l_type) > 1:
            if "pl" == l_type[1]:
                s_type += "s"

        # get the word from the title
        word = response.css(r"span[class='enc']::text").extract()[0].strip()

        data = {
            'word': word,  # it's only 1 element
            'type': s_type,
            'url': response.url,
            'used': False,
            'next_dict_id': self.start_id
        }

        # creates the message to send to twitter depending on the type of the word
        data["msg"] = return_twitter_msg(data)
        print(data)

        yield data
Example #5
 def _parse_sections(self, response: Response, folder_root: pathlib.Path):
     section_name = response.css(".ds-section-headline::text").get()
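     # pass the destination folder through request meta so _parse_article knows where to write the PDF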
     meta = {'folder_root': folder_root}
     yield from response.follow_all(
         css=".layout-weekly-edition-section .teaser a.headline-link",
         callback=self._parse_article,
         meta=meta)
Example #6
 def parse(self, response: Response, **kwargs):
     for product_container in response.css('div.product'):
         self.data_read_callback({
             'name':
             product_container.css(
                 'div.productTitleContent a ::text').get().strip(),
             'price':
             self._get_price_from_string(
                 product_container.css(
                     'span.product_price_text ::text').get().strip()),
             'link':
             response.request.url + product_container.css(
                 'div.productTitleContent a ::attr(href)').get().strip()
         })
     next_page = response.css('div.pagination a.next ::attr(href)').get()
     if next_page:
         yield response.follow(next_page, self.parse)
Example #7
    def parse(self, response: Response, current_page: Optional[int] = None) -> Generator:
        page_count = response.css(".pager .current::text").re_first(r"Page \d+ of (\d+)")
        page_count = int(page_count)
        for page in range(2, page_count + 1):
            yield response.follow(f"/catalogue/page-{page}.html", cb_kwargs={"current_page": page})

        current_page = current_page or 1
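        # render each book page with Playwright and keep the page object so parse_book can screenshot it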
        for book in response.css("article.product_pod a"):
            yield response.follow(
                book,
                callback=self.parse_book,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": f"page-{current_page}",
                },
            )
Example #8
 def parse(self, response: Response, **kwargs):
     for product_container in response.css('div.product-container'):
         self.data_read_callback({
             'name':
             product_container.css('a.product-name ::text').get().strip(),
             'price':
             self._get_price_from_string(
                 product_container.css(
                     'span.product-price ::text').get().strip()),
             'link':
             product_container.css(
                 'a.product-name ::attr(href)').get().strip()
         })
     next_page = response.css('li.pagination_next a::attr(href)').get()
     if next_page:
         request_url_split = urlsplit(response.request.url)
         yield response.follow(
             f"{request_url_split.scheme}://{request_url_split.netloc}{next_page}",
             self.parse)
Example #9
    def parse_word(self, response: Response) -> dict:

        # extract type, one of: (f, m, adj, v or v*)
        l_items = response.css(
            r"tr>td[colspan='2'][valign='TOP'][align='left'][width='650']>font::text"
        ).extract()
        l_items = list(map(lambda item: item.strip(), l_items))
        l_items = list(filter(lambda item: item != "", l_items))
        first_def = l_items[0]

        # get the word from the title
        word = response.css(r"span[class='enc']::text").extract()[0].strip()

        data = {
            'word': word,  # it's only 1 element
            'definition': first_def,
            'url': response.url
        }

        yield data
Example #10
    def parse(self, response: Response) -> dict:
        """
        Parses the search page from the dictionary and gets the URL's of the words and follows the first one
        :param response: response from the scrapy request
        :type response: scrapy.http.response.Response
        :return: yields a dictionary with word, type, url and msg for the dictionary word
        """

        # get the href of every link that points to a word page (GECART entries)
        tag_urls = response.css(r"a[href*='GECART']::attr(href)")

        # get the next page GECART ID and save it
        js_next = response.css("a[class='SEGUENTS']::attr(href)")[0].get()
        start_id_next = js_next.split("(")[-1].split(")")[0]
        self.start_id = start_id_next

        for tag in tag_urls:
            url = tag.get()
            yield scrapy.Request(url, callback=self.parse_word)

            time.sleep(1)  # note: blocking sleep; it pauses the whole Scrapy reactor
Example #11
 def _parse_article(self, response: Response):
     folder_root = response.meta["folder_root"]
     title = response.css(".article__headline::text").get() + ".pdf"
     soup = BeautifulSoup(response.text, 'lxml')
     self._remove_html_node(soup.find("header", class_="ds-masthead"))
     self._remove_html_node(soup.find("div", class_="article__section"))
     self._remove_html_node(soup.find("div", class_="layout-article-links"))
     self._remove_html_node(soup.find("div", class_="newsletter-signup"))
     self._remove_html_node(soup.find("aside", class_="article__aside"))
     self._remove_html_node(
         soup.find("div", class_="layout-related-articles"))
     self._remove_html_node(soup.find("footer"))
     with open(folder_root / title, mode='w+b') as dest:
         pisa_status = pisa.CreatePDF(str(soup), dest=dest)
Example #12
    def parse_thread(self, response: Response) -> None:
        """
        Thread page callback. Parses PostItem.
        Follows next thread page.
        :param response: scrapy crawl response
        """

        posts = response.css('div.post.panel-body')
        post_number = 1
        too_old_post_found = False
        for post in posts:
            post_loader = ItemLoader(item=PostItem(), selector=post)
            post_loader.add_value(
                'username',
                post.css('a.username-coloured::text,a.username::text').get())
            post_date_string = post.css('div.post-date::text')[1].get()
            if post_date_string is None:
                continue
            post_date = parse_date(post_date_string[3:-1])
            post_loader.add_value('date', str(post_date))
            post_loader.add_value(
                'post_id',
                post.css('div.post-date > a::attr(href)').re(r'.html#(.*)'))
            post_loader.add_value('thread_url', response.request.url)
            post_loader.add_value('post_number', post_number)
            post_number += 1
            post_loader.add_value('content', post.css('div.content').get())
            if not self.full_crawl:
                if post_date < self.start_date:
                    too_old_post_found = True
                    continue
            yield post_loader.load_item()

        next_page = response.css('a[rel=next]::attr(href)').get()
        if next_page and not too_old_post_found:
            next_request = response.urljoin(next_page)
            yield scrapy.Request(next_request, callback=self.parse_thread)
Example #13
 def parse(self, response: Response, **kwargs):
     for product_container in response.css('div.product'):
         availability = product_container.css(
             "span.p-cat-availability ::text").get().strip()
         if not availability.startswith("Skladem"):
             continue
         self.data_read_callback({
             'name':
             product_container.css('a.p-name span ::text').get().strip(),
             'price':
             self._get_price_from_string(
                 product_container.css(
                     'span.p-det-main-price ::text').get().strip()),
             'link':
             response.request.url + product_container.css(
                 'a.p-name ::attr(href)').get().strip()[1:]
         })
     next_page = response.css(
         'div.pagination a.s-page.pagination-page ::attr(href)').get()
     if next_page:
         request_url_split = urlsplit(response.request.url)
         yield response.follow(
             f"{request_url_split.scheme}://{request_url_split.netloc}{next_page}",
             self.parse)
Example #14
    def request_associator(self, request: Request, response: Response):
        """
        Persists the url of the page in which a link was found past the point that information would usually be
        destroyed by hiding it in the resulting request's meta dictionary

        :param request: The Request object currently passed to the Rule's LinkExtractor
        :param response: The Response object which produced request
        :return: the same request as input
        """
        title = response.css('title::text').get()

        request.meta['origin'] = response.url
        request.meta['origin_title'] = title

        return request
Example #15
 def parse_top_books(self, response: Response):
     for selector in response.css(BOOK_PATH):
         short_name = selector.xpath(SHORT_NAME_PATH)
         last_chapter = selector.xpath(LAST_CHAPTER_PATH)
         is_vip = selector.xpath(IS_VIP_PATH)
         try:
             short_name = short_name.get().split('/')[2]
             last_chapter = int(last_chapter.get().replace(',', ''))
             is_free = not is_vip
             yield {
                 SHORT_NAME: short_name,
                 LAST_CHAPTER: last_chapter,
                 IS_FREE: is_free
             }
         except (AttributeError, ValueError, IndexError):
             # the selectors did not match or a value could not be parsed; skip this entry
             pass
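
The BOOK_PATH, SHORT_NAME_PATH, LAST_CHAPTER_PATH and IS_VIP_PATH selectors (and the SHORT_NAME, LAST_CHAPTER, IS_FREE keys) are constants defined elsewhere in the spider; purely to illustrate their shape, they might look something like this (hypothetical values, not the project's actual selectors):

BOOK_PATH = "div.book-item"                                   # CSS selector for one book row
SHORT_NAME_PATH = ".//a/@href"                                # href like "/books/<short_name>/..."
LAST_CHAPTER_PATH = ".//span[@class='chapter-count']/text()"  # e.g. "1,234"
IS_VIP_PATH = ".//span[contains(@class, 'vip')]"              # present only for paid books
SHORT_NAME, LAST_CHAPTER, IS_FREE = "short_name", "last_chapter", "is_free"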
Example #16
    def parse_forum(self, response: Response) -> None:
        """
        Forum callback. Parses ForumItem.
        Follows subforum links and thread links (through self.parse_forum_page() method).
        :param response: scrapy crawl response
        """
        forum_loader = ItemLoader(item=ForumItem(), response=response)
        forum_loader.add_value('link', response.request.url)
        forum_loader.add_css('name', 'h2 > a::text')
        yield forum_loader.load_item()

        subforums = response.css('a.forumtitle::attr(href)').getall()
        for forum in subforums:
            next_request = response.urljoin(forum)
            yield scrapy.Request(next_request, callback=self.parse_forum)

        # the current response is also the forum's first page, so parse its threads directly
        yield from self.parse_forum_page(response, response.url)
Example #17
    def parse(self, response: Response) -> None:
        """
        Default scrapy callback. To be used on forum main page.
        Follows subforum links.

        :param response: scrapy crawl response
        :returns :class:`hyperreal.crawler.hypercrawler.items.PostItem`,
        :class:`hyperreal.crawler.hypercrawler.items.ForumItem`, :class:`hypercrawler.items.TopicItem`
        """
        date = self.settings.get('START_DATE')
        self.full_crawl = date is None
        if not self.full_crawl:
            self.start_date = date

        subforums = response.css('a.forumtitle::attr(href)').getall()
        for forum in subforums:
            next_request = response.urljoin(forum)
            yield scrapy.Request(next_request, callback=self.parse_forum)
Example #18
 def parse(self, response: Response):
     current_code1 = None
     for code in response.css(
             "#main > #content > #content-inner > span.text"):
         if code.css('b'):
             current_code1 = code.css('b::text').extract_first().strip()
             continue
         current_item = IarcItem()
         current_item['code1'] = current_code1
         current_item['code2'] = code.css('::text').extract_first().strip()
         current_item['code2_name'] = code.css('a::text').extract_first().strip()
         yield Request(response.urljoin(
             code.css('a').xpath('@href').extract_first()),
                       self.parse_code2,
                       meta={'item': current_item})
Example #19
 def parse(self, response: Response, **kwargs):
     is_empty_page = True
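     # keep following "next page" links until a page without products comes back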
     for product_container in response.css('div.ramecekshop'):
         is_empty_page = False
         self.data_read_callback({
             'name':
             product_container.css('a.nadpisramecek ::text').get().strip(),
             'price':
             self._get_price_from_string(
                 product_container.css(
                     'a.objednejkosobr ::text').get().strip()),
             'link':
             self._get_product_link(
                 response,
                 product_container.css(
                     'a.nadpisramecek ::attr(href)').get().strip())
         })
     if not is_empty_page:
         yield response.follow(self._get_next_page_url(response),
                               self.parse)
Example #20
    def parse(self, response: Response):
        """Parse URLs.

        :param Response response: HTTP response returned by URL requested
        """
        arts = Selector(response).css("article")
        logging.info("La page {} contient {} articles".format(
            response.url, len(arts)))
        for art in arts:
            # title
            art_title_section = art.css("div.title-and-meta")

            # url
            art_rel_url = art_title_section.css(
                "h2.node__title a::attr(href)").get()

            if art_rel_url is not None:
                yield response.follow(art_rel_url, callback=self.parse_article)

        # get next page from bottom pagination to iterate over pages
        next_page = response.css("li.pager-next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
Example #21
    def parse(self, response: Response):

        # list of jokes in main page
        l_jokes = response.css('article[class="chiste mb10"]')

        # this will tell if we arrived at the last joke page or not
        if l_jokes:
            # get all the jokes in string
            for joke in l_jokes:
                l_strings = [x.get() for x in joke.css("p[itemprop='articleBody']::text")]
                s_joke = "".join(l_strings)
                url_id = joke.css("a[class='compartir']::attr('href')")[0].get()

                d_joke = {"hash_id": url_id, "user_str_id": "1000Chistes", "user_name": "1000Chistes", "joke": s_joke}

                yield d_joke

            time.sleep(5)  # crude throttling; this blocks the whole crawler for 5 seconds

            # follow onto the next page
            new_page_number = int(response.url.split(r"/")[-1]) + 1
            new_url = "{url}/{page_num}".format(url=r"/".join(response.url.split(r"/")[:-1]), page_num=new_page_number)
            print(new_url)
            yield response.follow(new_url, self.parse)
Example #22
    def parse_article(self, response: Response):
        """Specific parsing logic for Geotribu articles

        :param Response response: HTTP response returned by URL requested
        """
        logging.info("Start parsing ARTICLE: {}".format(
            response.css("title::text").getall()[0]))
        item = ArticleItem()

        # article content
        art = response.css("article")[0]

        # title
        art_title_section = art.css("div.title-and-meta")
        art_title = art_title_section.css("h2.node__title a::text").get()
        item["title"] = art_title

        # article kind - until 2013, press reviews were articles
        # like any other and were not as structured
        if "revue de presse" in art_title.lower():
            item["kind"] = "rdp"
        else:
            item["kind"] = "art"

        # url
        art_rel_url = art_title_section.css(
            "h2.node__title a::attr(href)").get()
        item["url_full"] = art_rel_url

        # publication date
        art_date = art.css("div.date")
        art_date_day = art_date.css("span.day::text").get()
        art_date_month = art_date.css("span.month::text").get()
        art_date_year = art_date.css("span.year::text").get()
        item["published_date"] = (art_date_day, art_date_month, art_date_year)

        # tags
        item["tags"] = art_title_section.css(
            "span.taxonomy-tag a::text").getall()

        # grab the introduction
        try:
            item["intro"] = art.css(
                "div.field-name-field-introduction").getall()[0]
        except IndexError:
            logging.debug("Article doesn't have introduction.")
            item["intro"] = None

        # body
        art_raw_body = art.css("div.field-name-body")
        art_out_body = []
        for el in art_raw_body:
            art_out_body.append(el.get())

        item["body"] = art_out_body

        # images URLS (converted into absolute)
        item["image_urls"] = [
            response.urljoin(i) for i in art.css("img").xpath("@src").getall()
        ]

        # author
        author_block = art.css("div.view.view-about-author")
        if author_block:
            # author thumbnail
            thumbnail = author_block.css("img").xpath("@src").getall()
            thumbnail = thumbnail[0] if thumbnail else "?"

            # author name
            name = author_block.css(
                "div.views-field.views-field-field-nom-complet").css(
                    "div.field-content::text").getall()
            name = name[0] if name else "?"

            item["author"] = {
                "thumbnail": thumbnail,
                "name": name,
                "description": author_block.css(
                    "div.views-field.views-field-field-description p").getall(),
            }
        else:
            item["author"] = {
                "thumbnail": "?",
                "name": art_title_section.css("span.username a::text").get(),
                "description": "",
            }

        yield item