Example no. 1
    def next_page(self, response: scrapy.http.Response) -> scrapy.Request:
        """
        Goes to next page.

        :param response: response object
        :return: request for next page
        """
        # go to next page
        next_url = response.xpath("//a[@title='下一页']/@href").extract_first()
        if next_url is not None:
            self.log('Next page {}'.format(next_url), level=logging.INFO)
            time.sleep(random.random())
            return response.follow(
                url=next_url,
                callback=self.parse,
                # reuse the current proxy
                meta={'proxy': response.request.meta['proxy']},
                errback=self.handle_failure)
        else:
            # try to build the page by ourself
            arguments = self.decode_url(response.request.url)
            arguments['page'] += 1
            url = self.format_url(arguments)
            self.log('Next page (manually) {}'.format(url), level=logging.INFO)
            return response.follow(
                url=url,
                callback=self.parse,
                # reuse the current proxy
                meta={'proxy': response.request.meta['proxy']},
                errback=self.handle_failure)
Example no. 2
    def parseCity(self, response: scrapy.http.Response):
        #example https://www.tripadvisor.in/Attractions-g186338-Activities-London_England.html#FILTERED_LIST

        attractionBoxs = response.css(
            'div.attraction_list.attraction_list_short > div.attraction_element > div > div > div > div > div.listing_title'
        )

        tourSetRegex = ".+([0-9]+).*"
        tourSetRegChecker = re.compile(tourSetRegex)

        for attraction in attractionBoxs:
            pointName = attraction.css('a::text').extract_first()
            if not tourSetRegChecker.match(pointName):
                attractionUrl = response.urljoin(
                    attraction.css('a::attr(href)').extract_first())
                response.meta['rank'] += 1
                yield response.follow(url=attractionUrl,
                                      callback=self.parseAttractionsPage,
                                      meta=response.meta)

        nextPageLink = response.css(
            'div.al_border.deckTools.btm > div > div.unified.pagination > a.nav.next.rndBtn.ui_button.primary.taLnk::attr(href)'
        )
        if nextPageLink:
            nextPageLink = response.urljoin(nextPageLink.extract_first())
            self.log("nextpage: " + nextPageLink)
            if response.meta['rank'] < 100:
                yield response.follow(nextPageLink,
                                      callback=self.parseCity,
                                      meta=response.meta)
Example no. 3
    def parseCityAttractionsListPage(self, response: scrapy.http.Response):
        # example page:  https://www.viator.com/Mumbai/d953

        print(
            'PARSING ATTRACTION LIST ####################################################################################'
        )
        print(response.url)

        self.incrementRequestCount()
        hrefs = response.css('div.ptm *> h2 > a')
        for href in hrefs:
            pointURL = href.css('::attr(href)').extract_first().strip()
            pointName = href.css('::text').extract_first().strip()
            yield response.follow(pointURL,
                                  callback=self.parseAttractionsPage,
                                  meta={
                                      'countryName':
                                      response.meta['countryName'],
                                      'cityName': response.meta['cityName'],
                                      'pointName': pointName
                                  })

        nextPageLink = response.css(
            'div.ptm > div:nth-child(1) > div:nth-child(2) > p > a:last-child::attr(href)'
        ).extract_first()
        if nextPageLink:
            yield response.follow(nextPageLink,
                                  callback=self.parseCityAttractionsListPage,
                                  meta=response.meta)
Example no. 4
    def parse(self, response: scrapy.http.Response):

        # Extract every link to a landing page:
        for title in response.css('.document-row > h3 > a'):
            yield response.follow(title, self.parse_landing_page)

        # Extract the link to the next page of results:
        for next_page in response.css('.next > a'):
            yield response.follow(next_page, self.parse)
Example no. 5
    def parse_posts_list(self, response: scrapy.http.Response):
        # Fetch the posts
        for href in response.css("#posts a::attr(href)"):
            if href.get().startswith("/p"):
                yield response.follow(href, self.parse_thread)

        # Fetch all pages
        for href in response.css(".pagination a::attr(href)"):
            yield response.follow(href, self.parse_posts_list)
Example no. 6
    def parseCountryAttractionsListPage(self, response: scrapy.http.Response):
        # example page:  https://www.viator.com/Netherlands/d60

        self.incrementRequestCount()
        hrefs = response.css('div.ptm *> h2 > a::attr(href)').extract()
        for href in hrefs:
            yield response.follow(href, callback=self.parseAttractionsPage)

        nextPageLink = response.css(
            'div.ptm > div:nth-child(1) > div:nth-child(2) > p > a:last-child::attr(href)'
        ).extract_first()
        if nextPageLink:
            yield response.follow(
                nextPageLink, callback=self.parseCountryAttractionsListPage)
Example no. 7
    def parse(self, response: scrapy.http.Response, **kwargs):
        titles = response.xpath("//div[@class='r-ent']")
        for title in titles:
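            # rows without a title link give url=None; response.follow() then raises and the row is skipped by the except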
            try:
                url = title.xpath("div[@class='title']/a/@href").get()
                yield response.follow(url, callback=self.parse_content)
            except Exception:
                pass

        next_page = response.xpath(
            "//div[@class='btn-group btn-group-paging']/a[@class='btn wide'][2]/@href"
        ).get()
        if next_page and self.i < self.max_pages:
            self.logger.info(f'follow {next_page}')
            self.i += 1
            yield response.follow(next_page, callback=self.parse)
Example no. 8
    def parseCountryPage(self, response: scrapy.http.Response):
        # example page:  https://www.viator.com/India/d723-ttd

        self.incrementRequestCount()

        breadcrumbs = response.css('div.crumbler *> span::text').extract()
        countryName = breadcrumbs[1].strip()

        countryListing = CountryListing(crawler=self.name,
                                        sourceURL=response.url,
                                        crawlTimestamp=getCurrentTime(),
                                        countryName=countryName)
        yield countryListing.jsonify()

        if skipNonRequired:
            if processName(countryName) not in processedRequiredCountries:
                # do not process this country's cities
                print('Skipping country: ', countryName)
                return
        countryId = response.url.split('/')[-1].split('-')[0][1:]
        cityListingURL = 'https://www.viator.com/pascities.jspa?country={}'.format(
            countryId)
        yield response.follow(cityListingURL,
                              callback=self.parseCountryCities,
                              meta={'countryName': countryName})
Example no. 9
 def parse(self, response: scrapy.http.Response):
     # example page:  https://www.viator.com/Amsterdam/d525-ttd
     countryMenuBox = response.css(
         '#countryMenuBox > div.menu-dropdown-box.small > div > div:nth-child(1)'
     )
     hrefs = countryMenuBox.css('a::attr(durl)').extract()
     for href in hrefs:
         yield response.follow(href, callback=self.parseCountryPage)
Example no. 10
 def parse_landing_page(self, response: scrapy.http.Response):
     # On a landing page, we can extract all the documents, or infer the JSON link and use that.
     #    yield {'title': pub.css('h1 ::text').extract_first().strip()}
     for pub in response.css('.publication'):
         # This is a publication, so let's infer the API link:
         lp_url = list(urlsplit(response.url))
         lp_url[2] = "/api/content%s" % lp_url[2]
         api_json_url = urlunsplit(lp_url)
         yield response.follow(api_json_url, self.parse_content_api_json)
Example no. 11
 def get_next_vimeo_overview_page(self, response: scrapy.http.Response):
     """
     if there is a "next"-button at the bottom of the vimeo-user's overview page:
     grabs the url from it and yields it
     """
     # next_vimeo_overview_page = response.xpath('//*[@id="pagination"]/ol/li[9]').get()
     next_vimeo_overview_page = response.css(
         '#pagination > ol > li.pagination_next a::attr(href)').get()
     if next_vimeo_overview_page is not None:
         yield response.follow(next_vimeo_overview_page, self.parse)
Example no. 12
    def parseX(self, response: scrapy.http.Response):
        #"https://www.trip.skyscanner.com/bangkok/things-to-do

        hrefs = response.css('div.items_list *> h2 > a::attr(href)').extract()
        for href in hrefs:
            self.log("visiting: " + href)
            response.meta['rank'] += 1
            yield response.follow(href,
                                  callback=self.parseAttractionsPage,
                                  meta=response.meta)

        nextPageLink = response.css(
            'div.items_list > div:nth-child(2) > ul > li.next.next_page > a::attr(href)'
        ).extract_first()
        if nextPageLink:
            self.log("nextpage: " + nextPageLink)
            if response.meta['rank'] < 100:
                yield response.follow(nextPageLink,
                                      callback=self.parseX,
                                      meta=response.meta)
Example no. 13
    def parseCountryCities(self, response: scrapy.http.Response):
        # example page: https://www.viator.com/pascities.jspa?country=723

        self.incrementRequestCount()

        hrefs = response.css(
            'div.unit.size-pas-cities *> a::attr(durl)').extract()
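        # city links keep their target in a custom 'durl' attribute; the country name travels along in meta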
        for href in hrefs:
            yield response.follow(href,
                                  callback=self.parseCityPage,
                                  meta=response.meta)
Example no. 14
 def parse(self, response: scrapy.http.Response):
     hrefs = response.css('div.tours > a::attr(href)').extract()
     attractionNumber = 1
     for href in hrefs:
         href = response.urljoin(href)
         self.log("visiting: " + href)
         meta = urlToCityAndCountryMapping[response.url]
         meta['rank'] = attractionNumber
         yield response.follow(href,
                               callback=self.parseAttractionsPage,
                               meta=meta)
         attractionNumber += 1
Example no. 15
 def parse_delivery_method(self, response: scrapy.http.Response):
     data = json.loads(response.body)
     data = data['result']['forceGet']['shipping_methods']['data']
     method_names = {elem['delivery_block_label'] for elem in data}
     il = response.meta.get('item loader')
     if len(method_names) > 1 and Names.PAGE_PICK_UP_LABEL.value in method_names:
         il.add_value('delivery_method', Names.DELIVER_ALL.value)
     elif Names.PAGE_PICK_UP_LABEL.value in method_names:
         il.add_value('delivery_method', Names.DELIVER_TO_STORE.value)
     else:
         il.add_value('delivery_method', Names.DELIVER_TO_HOME.value)
     yield response.follow(url=response.meta[Names.ACTIONS_URL_KEY], callback=self.parse_actions, meta=response.meta)
Example no. 16
 def parse_main(
     self, response: scrapy.http.Response
 ) -> Union[Iterator[Issue], scrapy.http.Request]:
     links = (
         response.css("font.hdr b")[-1].xpath("../../../../../../*")[-1].
         xpath('.//td[@valign="top"]').xpath(".//a[not(@hidden)][@href]"))
     for link in links:
         href = link.attrib["href"]
         if href.endswith(".pdf") or href.endswith(".djvu"):
             yield Issue(file=response.urljoin(href),
                         text=link.css("::text").get())
         else:
             yield response.follow(url=href, callback=self.parse_page)
Example no. 17
    def parse(self, response: scrapy.http.Response):
        # must always be fired

        venuesQueryURL = 'https://api.tripexpert.com/v1/venues?destination_id={}&api_key={}&limit={}'
        for city in availableCities:
            if processName(city['name']) not in processedRequiredCities:
                if skipNonRequired:
                    print('Skipping', city['name'])
                    continue
            queryURL = venuesQueryURL.format(city['id'], apiKey, limit)
            yield response.follow(queryURL, callback=self.parseCityVenues, meta={
                'city_id': city['id']
            })
Example no. 18
    def parse_thread(self, response: scrapy.http.Response):
        page = response.url.split("/")[3]  # e.g. http://blbla.com/<page> -> index 3 after splitting on "/"
        folder = os.path.join(os.getcwd(), "downloaded_data", self.userid, "posts")
        os.makedirs(folder, exist_ok=True)
        filename = f"{folder}/{page}.html"
        with open(filename, "wb") as f:
            f.write(response.body)
        self.log("Saved file %s" % filename)

        # Fetch other pages of the same thread
        if self.fetch_full_thread:
            for href in response.css(".pagination a::attr(href)"):
                yield response.follow(href, self.parse_thread)
Example no. 19
 def parse_m3u8(self, response: scrapy.http.Response):
     current_url = response.url  # URL of the m3u8, used for the directory name and to resolve later request URLs
     another_m3u8 = re.search(r'\S+\.m3u8', response.text)
     # if the playlist points to another m3u8 file, follow that one instead
     if another_m3u8:
         yield response.follow(another_m3u8.group(0),
                               callback=self.parse_m3u8)
         return
     # check whether the stream is encrypted
     match = re.search(r'#EXT-X-KEY:METHOD=AES-128(\S+)', response.text)
     if match:
         # fetch the AES encryption key
         info = match.group(1)
         key = urljoin(current_url,
                       re.search(r'URI="([^"]+)"', info).group(1))
         self.key = requests.get(url=key,
                                 headers={
                                     'User-Agent': USER_AGENT
                                 }).content
         # check whether an IV was provided
         match = re.search(r'IV=0x([0-9A-Fa-f]{32})', info)
         self.iv = bytes.fromhex(match.group(1)) if match else None
     else:
         self.key = None
     self.file_names = re.findall(r'\S+\.ts\S*',
                                  response.text)  # names of the ts segments listed in the m3u8
     self.directory = hashlib.md5(
         current_url.encode('utf-8')).hexdigest()  # directory where the ts segments are stored
     if not os.path.exists(self.directory):
         os.mkdir(self.directory)
     for i, file_name in enumerate(self.file_names):
         file_path = os.path.join(self.directory, f'{i}.ts')  # target path, segments named in order
         # skip segments that have already been downloaded
         if os.path.exists(file_path):
             self.logger.info(f'{i}.ts already crawled')
             continue
         yield response.follow(file_name,
                               callback=self.parse_ts,
                               meta={'file_path': file_path})
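The callback above stores the AES key and IV on the spider and passes file_path through meta to a parse_ts callback that is not included in the snippet. A minimal sketch of such a callback, assuming pycryptodome for AES-128-CBC decryption; the zero-IV fallback and leaving the PKCS7 padding in place are assumptions, not the original implementation.
 def parse_ts(self, response: scrapy.http.Response):
     # write the (optionally decrypted) ts segment to the path prepared in parse_m3u8
     data = response.body
     if self.key:
         # AES-128-CBC as signalled by #EXT-X-KEY; assume a zero IV when the playlist gave none
         from Crypto.Cipher import AES
         cipher = AES.new(self.key, AES.MODE_CBC, self.iv or bytes(16))
         data = cipher.decrypt(data)  # note: PKCS7 padding of the last block is not stripped here
     with open(response.meta['file_path'], 'wb') as f:
         f.write(data)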
Example no. 20
    def parse_main(self, response: scrapy.http.Response):
        def fake_request_to_set_default_sorting():
            return response.follow(url='https://allo.ua/ru/products/mobile/dir-asc/klass-kommunikator_smartfon/order-price/0', callback=self.fake_parser)

        yield fake_request_to_set_default_sorting()
        categories = response.xpath('//a[@class="level-top"]')
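        # sanity check: the top-level category menu is expected to hold 14 to 16 entries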
        assert len(categories) in range(14, 17)
        for category in categories:
            cat_name = category.xpath('./span/text()').extract_first()
            if cat_name in self.cat_1_exceptions:
                continue
            url = category.xpath('./@href').extract_first().replace('allo.ua/ua/', 'allo.ua/ru/')
            yield response.follow(url, meta={Names.CAT_TREE_KEY: [cat_name]})
Example no. 21
    def apply_filter(self, response: scrapy.http.Response) -> scrapy.Request:
        """
        Applies the filter to the request.

        :param response: response object
        """
        url = self.format_url(response.request.meta['extra'])
        self.log('Process page {}'.format(url), level=logging.INFO)
        yield response.follow(
            url=url,
            dont_filter=True,
            callback=self.parse,
            meta={'proxy': response.request.meta['proxy']},
            errback=self.handle_failure)
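This example, like Example no. 1, registers errback=self.handle_failure, which is not shown in the snippets. A minimal sketch of such an errback; only the method name comes from the snippets, the body is an assumption.
    def handle_failure(self, failure):
        # Scrapy attaches the originating request to download failures; fall back to the failure itself
        request = getattr(failure, 'request', None)
        target = request.url if request is not None else repr(failure)
        self.log('Request failed: {}'.format(target), level=logging.ERROR)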
Example no. 22
    def parseCityPage(self, response: scrapy.http.Response):
        # example page:  https://www.viator.com/Lucknow/d23770-ttd

        self.incrementRequestCount()
        breadcrumbs = response.css('div.crumbler *> span::text').extract()
        countryName = breadcrumbs[1]
        if countryName != response.meta['countryName']:
            if countryName is None:
                countryName = response.meta['countryName'].strip()
            else:
                self.log(
                    'Country name mismatch.\nExpected: {}\nFound: {}'.format(
                        response.meta['countryName'], countryName))
        if len(breadcrumbs) == 4:
            regionName, cityName = breadcrumbs[2:4]
            cityName = cityName.strip()
            regionName = regionName.strip()
        else:
            # example page: https://www.viator.com/Mumbai/d953-ttd
            regionName, cityName = None, breadcrumbs[2]
            cityName = cityName.strip()
        countryName = countryName.strip()

        cityListing = CityListing(crawler=self.name,
                                  sourceURL=response.url,
                                  crawlTimestamp=getCurrentTime(),
                                  countryName=countryName,
                                  cityName=cityName,
                                  regionName=regionName)
        yield cityListing.jsonify()

        if skipNonRequired:
            if processName(cityName) not in processedRequiredCities:
                # do not process this country's cities
                print('Skipping city: ', countryName, cityName)
                return

        attractionsPageURL = response.url[:-4]
        yield response.follow(attractionsPageURL,
                              callback=self.parseCityAttractionsListPage,
                              meta={
                                  'countryName': countryName,
                                  'cityName': cityName,
                              })
Example no. 23
 def parse(self, response: scrapy.http.Response):
     news_divs = response.xpath("//div[contains(@class, 'news-info')]")
     if not news_divs:  # no news any more in this page
         return
     has_new_news = False
     for news_div in news_divs:
         item = self._parse_news_info(news_div)
         uri = item.get('url').replace(self.base_url, '')
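         # self.state remembers article URIs already seen (persisted between runs when a job directory is configured), so only new items are yielded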
         if not self.state.get(uri):
             self.state.setdefault(uri, True)
             has_new_news = True
             yield item
     # go to next page
     if has_new_news:
         cur_page_no = int(response.url.split('/')[-1])
         next_url = f"{self.page_url}{cur_page_no+1}"
         yield response.follow(url=next_url,
                               dont_filter=False,
                               callback=self.parse)
Example no. 24
    def parse(self, response: scrapy.http.Response):
        rows = response.xpath('//table[@class="brd_list_n"]/tbody/tr')

        for row in rows:
            result = {
                '산학연계여부': row.xpath('td[1]/text()').get(),
                '지방청': row.xpath('td[2]/text()').get(),
                '채용유무': row.xpath('td[3]/text()').get(),
            }
            link = row.xpath('th/a/@href').get()
            yield response.follow(link, self.parse_content, meta=result)

        page_info = response.xpath('//div[@class="topics"]').get()

        current_page = int(page_info.split('(')[1].split('/')[0])
        total_page = int(page_info.split('/')[1].split(' ')[0])

        if current_page < total_page:
            yield self.scrap_page(current_page + 1)
Example no. 25
    def parse(self, response:scrapy.http.Response):
        postmatch = re.compile(r'\s*ID[：:\s]+(.+)\s+帖.*\s+楼.*\s+天[^：:]*[：:\s]+(.+)\s+原[^：:]*[：:\s]+(.*)')  # regex for the ban-notice format: group(1)=user ID, group(2)=duration in days, group(3)=reason
        posts = response.xpath('//div[@class="dfsj_post mbm"]')  # select every post (floor) on the page
        for post in posts:
            bannerusername = post.xpath('descendant::a[@class="xw1"]/text()').extract_first()  # user who posted the ban notice
            posttext = ''.join(post.xpath('descendant::td[@class="t_f"]/descendant-or-self::text()').extract())  # full text of the post
            postmatched = postmatch.match(posttext)  # regex match
            if postmatched:
                userid = postmatched.group(1).strip('\r')
                duration = postmatched.group(2).strip('\r')
                reason = postmatched.group(3).strip('\r')
                extraction = {
                    'bannerusername': bannerusername,
                    'userid': userid,
                    'duration': duration,
                    'reason': reason
                    }
                yield extraction

        next_page = response.xpath('//a[@class="bm_h"]/@href').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback = self.parse)
Example no. 26
    def parseCityVenues(self, response: scrapy.http.Response):
        # example page: https://api.tripexpert.com/v1/venues?destination_id=3&api_key=6cb54d22babb25cc64ae730f17455338&limit=100

        self.incrementRequestCount()

        venues = json.loads(response.text)['response']['venues']

        venueIdURL = 'https://api.tripexpert.com/v1/venues/{}?api_key={}'
        for index, venue in enumerate(venues):
            venueType = int(venue['venue_type_id'])
            if 'rank_in_destination' not in venue:
                venueRank = 1 + (index // 3)
            else:
                venueRank = int(venue['rank_in_destination'])
            if venueRank > numForType[venueType]:
                # This venue is too poor for our interest
                continue

            queryURL = venueIdURL.format(venue['id'], apiKey)
            yield response.follow(queryURL, callback=self.parseVenueDetails, meta={
                'city_id': response.meta['city_id'],
                'venue_id': venue['id']
            })
Example no. 27
    def parseAttractionsPage(self, response: scrapy.http.Response):
        # example page: https://www.viator.com/Amsterdam-attractions/Albert-Cuyp-Market/d525-a8126

        print(
            'PARSING ATTRACTION PAGE @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'
        )
        print(response.url)

        self.incrementRequestCount()
        breadcrumbs = response.css('div.crumbler *> span::text').extract()
        if breadcrumbs:
            countryName = breadcrumbs[1].strip()
            cityName = breadcrumbs[-3].strip()
            # -2 is the word 'attractions'
            pointName = breadcrumbs[-1].strip()
            # we don't really care about the region once we have the city?
        else:
            countryName = response.meta['countryName']
            cityName = response.meta['cityName']
            pointName = response.meta['pointName']

        data = response.css('div.cms-content')
        description, notes = None, None
        if len(data) > 0:
            description = data[0]
            description = '\n'.join(
                description.css('div::text').extract()).strip()
        if len(data) > 1:
            notes = data[1].css('::text').extract_first()
            if notes:
                notes = notes.strip()

        sideBox = response.css(
            'body > div.page.mtl > div.body > div.main-wide.unitRight > div.page-bg.line.light-border-b > div.unitRight.aside > div > div.mtmm.mhmm > div.line > div'
        )
        address = sideBox.css(
            'meta[itemprop="streetAddress"]::attr(content)').extract_first()
        if address:
            address = address.strip()

        ratingBox = sideBox.css('p[itemprop="aggregateRating"]')
        avgRating, ratingCount = None, None
        if ratingBox:
            bestRating = int(
                ratingBox.css('meta[itemprop="bestRating"]::attr(content)').
                extract_first())
            worstRating = int(
                ratingBox.css('meta[itemprop="worstRating"]::attr(content)').
                extract_first())
            givenRating = float(
                ratingBox.css('meta[itemprop="ratingValue"]::attr(content)').
                extract_first())
            ratingCount = int(
                ratingBox.css(
                    'span[itemprop="reviewCount"]::text').extract_first())
            avgRating = scaleRating(givenRating=givenRating,
                                    worstRating=worstRating,
                                    bestRating=bestRating)

        pointListing = PointListing(crawler=self.name,
                                    sourceURL=response.url,
                                    crawlTimestamp=getCurrentTime(),
                                    countryName=countryName,
                                    cityName=cityName,
                                    pointName=pointName,
                                    description=description,
                                    notes=notes,
                                    address=address,
                                    avgRating=avgRating,
                                    ratingCount=ratingCount)

        yield pointListing.jsonify()

        pointImage = response.css(
            'div.img-product > img::attr(src)').extract_first()
        if pointImage:
            pointImage = pointImage.strip()
        yield ImageResource(crawler=self.name,
                            sourceURL=response.url,
                            crawlTimestamp=getCurrentTime(),
                            countryName=countryName,
                            cityName=cityName,
                            pointName=pointName,
                            imageURL=pointImage).jsonify()

        yield response.follow('?subPageType=reviews',
                              callback=self.parseReviewsPage,
                              meta={
                                  'countryName': countryName,
                                  'cityName': cityName,
                                  'pointName': pointName
                              })
Example no. 28
    def parse(self, response: scrapy.http.Response):
        # We get our soup.
        soup = BeautifulSoup(response.text, 'html.parser')
        # We create an empty dictionary to store all data about this page.
        data = {}
        data['title'] = soup.h1.string
        data['uri'] = response.url
        log.info(f"Processing {data['uri']}")
        try:
            href = soup.select_one(
                "#ctl00_placeHolderMain_linkEmailArticle")["href"]
            data['id'] = re.search(r"id=([0-9]+)", href).group(1)
        except Exception as e:
            log.info(f"Error ({data['uri']}): {e}")
            error.info(f"Error ({data['uri']}): {e}")
            return None
        # Mark as visited
        self.visited.add(data['uri'][data['uri'].find("article.aspx"):])
        self.visited.add(f"article.aspx?id={data['id']}")

        # Basic stuff
        data['abstract'] = soup.select_one(".articleblockconteiner p").text

        # Images
        data['images'] = []
        for img in soup.select("img.mbimg"):
            image = {}
            image["thumbUrl"] = f"http://www.yivoencyclopedia.org{img['src']}"
            href = img.parent["href"]
            image["viewerUrl"] = re.search(r"(http.*)&article", href).group(0)
            caption = img.parent.find_next_sibling("div")
            if caption:
                image["imgDesc"] = caption.text.replace(
                    "SEE MEDIA RELATED TO THIS ARTICLE", "").strip()
            data['images'].append(image)

        # Links
        data['links'] = []
        for a in soup.select(
                f"#ctl00_placeHolderMain_panelArticleText a[href^='article.aspx/']"
        ):
            link = {}
            link["href"] = f"http://www.yivoencyclopedia.org/{a['href']}"
            link["text"] = a.text.strip()
            if len(link["text"]
                   ) > 0:  # Strangely, there are sometimes empty links
                data['links'].append(link)
                # With yield, we can either return a new URL to be crawled
                # or the final data.
                if self.check_queue(link['href']):
                    yield response.follow(link["href"])

        # Glossary terms
        data['glossary'] = []
        for span in soup.select(".term"):
            term = span.text.strip()
            if len(term) > 0:  # Strangely, there are sometimes empty spans
                data['glossary'].append(term)

        # Subrecords, i.e., multi-page articles (like Poland)
        data['subrecords'] = []
        isMain = True
        for index, a in enumerate(
                soup.select("#ctl00_placeHolderMain_panelPager a")):
            sr = {}
            sr["href"] = f"http://www.yivoencyclopedia.org" + a["href"]
            sr["page"] = a.text.strip()
            if index == 0 and sr["href"] != data['uri']:
                isMain = False
            if not isMain and index == 0:
                data['parent'] = sr["href"]
                if self.check_queue(sr['href']):
                    yield response.follow(sr["href"])
            if isMain and index != 0:
                data['subrecords'].append(sr)
                if self.check_queue(sr['href']):
                    yield response.follow(sr["href"])

        # Subconcepts, i.e., H2 headings on the same page (not really a concept, but maybe useful)
        data['subconcepts'] = []
        for index, h2 in enumerate(soup.select("h2.entry")):
            sc = h2.text.strip()
            if index == 0 and not isMain:
                data['title'] = f"{sc} ({data['title']})"
                break
            # The following H2 headings are NOT stored as concepts:
            stops = [
                "About this Article", "Suggested Reading",
                "YIVO Archival Resources", "Author", "Translation"
            ]
            if sc in stops:
                break
            data['subconcepts'].append(sc)

        # Next record in alphabet
        next_article = soup.select_one(
            '#ctl00_placeHolderMain_linkNextArticle')
        if next_article:
            data['next_article'] = next_article['href']

        # Here we yield the data of this page.
        yield data
        if next_article and self.check_queue(data['next_article']):
            yield response.follow(data['next_article'])
Example no. 29
 def parse(self, response: scrapy.http.Response):
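     # follow every link found in the <ul> lists after the first one under div.centent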
     selector_xpath = "//div[@class='mainbody']/div[@class='centent']/ul[position()>1]/li/a/@href"
     for i in response.xpath(selector_xpath).extract():
         yield response.follow(i, callback=self.parse_page)
Example no. 30
    def parse(self, response: scrapy.http.Response):
        if "Служебная:Вход" in unquote(response.url):
            log.info("Login page, skipping")
            return None
        log.info(f"Processing {response.url}")
        # We get our soup.
        soup = BeautifulSoup(response.text, 'html.parser')
        # We create an empty dictionary to store all data about this page.
        data = {}

        if "AllPages" in response.url or "Все_страницы" in unquote(
                response.url):
            log.info("Processing index page")
            for a in soup.select("div.mw-allpages-nav a"):
                h = a["href"]
                if self.check_queue(h):
                    log.info(f"New index page: {a['href']}")
                    yield response.follow(h)
            for a in soup.select(".mw-allpages-chunk li a"):
                h = self.lower_case(a["href"])
                if self.check_queue(h):
                    log.info(f"New page: {a['href']}")
                    yield response.follow(h)
            return None

        data['uri'] = self.lower_case(response.url)
        log.info(f"Processing {data['uri']}")
        if response.status == 404:
            log.info(f"Page not found: {response.url}")
            return data
        try:  # "wgArticleId":9907
            data['id'] = re.search(r'"wgArticleId":([0-9]+),',
                                   soup.head.get_text()).group(1)
        except Exception as e:
            log.info(f"Error getting ID ({data['uri']}): {e}")
            error.info(f"Error getting ID ({data['uri']}): {e}")
            # return {}

        # Basic stuff
        data['title'] = soup.select_one("h1.firstHeading").text.strip()
        abstract = soup.select_one("#mw-content-text p")
        if abstract:
            data['abstract'] = abstract.get_text().replace("\n", "").strip()
        else:
            error.info(f"No abstract: {response.url}")

        # Links
        data["links"] = []
        for a in soup.select("#mw-content-text p a"):
            link = {}
            h = self.lower_case(a['href'].replace('&action=edit&redlink=1',
                                                  '').replace('?title=', '/'))
            link["href"] = h
            link["text"] = a.text.strip()
            if len(link["text"]
                   ) > 0:  # Strangely, there are sometimes empty links
                data["links"].append(link)
                if self.check_queue(h):
                    yield response.follow(h)

        # Category
        data["categories"] = []
        for a in soup.select("#mw-normal-catlinks ul li a"):
            data["categories"].append(a.text.strip())

        # Here we yield the data of this page.
        yield data