Beispiel #1
0
 def parse(self, response):
     """Scrape one index page and schedule the article requests found on it.

     Every ``article.list-content__item`` node counts towards the page size.
     Links accepted by ``isBerita`` are requested with ``?single=1`` appended
     and handed to ``self.parse_artikel``. When the page holds more than 19
     entries the next index page is requested; otherwise the scraped/dropped
     ratio is evaluated and ``kirim_notif`` is called when it is below 2.

     Args:
         response: Page response exposing a ``css()`` selector API.

     Yields:
         ``scrapy.Request`` objects for the articles and the next index page.
     """
     # Local import: the file's import block is not visible from this method.
     import logging

     konten_selektor = 'article.list-content__item'
     link_selector = 'h3.media__title a ::attr(href)'
     jumlah_berita = 0
     for konten in response.css(konten_selektor):
         url = konten.css(link_selector).extract_first()
         self.total_scraped += 1
         # Every entry counts towards the page size, including skipped ones:
         # the count only decides whether another index page is requested.
         jumlah_berita += 1
         # extract_first() yields None when the entry has no link; skip it
         # instead of failing on the string concatenation below.
         if url is None or not isBerita(url):
             continue
         yield scrapy.Request(url + '?single=1', callback=self.parse_artikel)

     if jumlah_berita > 19:
         self.hal = self.hal + 1
         next_page = ('https://news.detik.com/indeks/' + str(self.hal) +
                      '?date=' + self.tanggal)
         # No callback is passed, so Scrapy's default (the spider's parse)
         # applies.
         yield scrapy.Request(url=next_page)
     else:
         try:
             rasio = self.total_scraped // self.dropped_count
             if rasio < 2:
                 kirim_notif(self.name)
         except ZeroDivisionError:
             # dropped_count is 0, so there is no ratio to evaluate.
             pass
         except Exception:
             # The notification stays best-effort, but failures are no
             # longer silent.
             logging.getLogger(__name__).exception(
                 "end-of-crawl notification check failed for %s", self.name)
    def parse(self, response):
        konten_selektor = 'div.article__list.clearfix'
        #menghitung jumlah berita di halaman
        jumlah_berita = 0
        for konten in response.css(konten_selektor):
            #crawl on each url
            link_selector = 'a.article__link ::attr(href)'
            link = konten.css(link_selector).extract_first() + "?page=all"
            self.total_scraped += 1
            jumlah_berita = jumlah_berita + 1

            req = scrapy.Request(link, callback=self.parse_artikel)
            yield req

        print("jumlah berita  =", jumlah_berita, "----halaman =", self.hal)
        #find next page if any.
        if jumlah_berita > 14:
            self.hal = self.hal + 1
            next_page = 'https://indeks.kompas.com/?site=news&date=' + self.tanggal + '&page=' + str(
                self.hal)
            req = scrapy.Request(next_page, callback=self.parse)
            yield req
        else:
            try:
                rasio = self.total_scraped // self.dropped_count
                if rasio < 2:
                    kirim_notif(self.name)
            except:
                pass
            print("scraping ---- Selesai Total halaman = ", self.hal)
            print("jumlah berita  =", jumlah_berita, "----halaman =", self.hal)
 def parse(self, response):
     """Scrape one index page and schedule the article requests found on it.

     Every ``.row.mb-30`` node counts towards the page size and its first
     link is requested with ``self.parse_artikel`` as callback. When the page
     holds more than 14 entries the next index page is requested; otherwise
     the scraped/dropped ratio is evaluated, ``kirim_notif`` is called when
     it is below 2, and a summary is printed.

     Args:
         response: Page response exposing a ``css()`` selector API.

     Yields:
         ``scrapy.Request`` objects for the articles and the next index page.
     """
     # Local import: the file's import block is not visible from this method.
     import logging

     konten_selektor = '.row.mb-30'
     link_selector = 'a ::attr(href)'
     # Count the news entries on this page.
     jumlah_berita = 0
     for konten in response.css(konten_selektor):
         link = konten.css(link_selector).extract_first()
         jumlah_berita += 1
         self.total_scraped += 1
         # extract_first() yields None when the row contains no link; a
         # request cannot be built from it, so skip the row.
         if link is None:
             continue
         yield scrapy.Request(link, callback=self.parse_artikel)

     # Follow the next page if this one looked full.
     if jumlah_berita > 14:
         self.hal = self.hal + 1
         # NOTE(review): the query string carries the `d` parameter twice;
         # kept exactly as it was - confirm whether that is intended.
         next_page = ('https://www.bisnis.com/index/page/?c=0&d=' +
                      self.tanggal + '&d=' + self.tanggal + '&per_page=' +
                      str(self.hal))
         yield scrapy.Request(next_page, callback=self.parse)
     else:
         try:
             rasio = self.total_scraped // self.dropped_count
             if rasio < 2:
                 kirim_notif(self.name)
         except ZeroDivisionError:
             # dropped_count is 0, so there is no ratio to evaluate.
             pass
         except Exception:
             # The notification stays best-effort, but failures are no
             # longer silent.
             logging.getLogger(__name__).exception(
                 "end-of-crawl notification check failed for %s", self.name)
         print("scraping ---- Selesai Total halaman = ", self.hal)
         print("jumlah berita  =", jumlah_berita, "----halaman =", self.hal)
  def parse(self, response):
      """Scrape one index page and schedule the article requests found on it.

      Every link matched by the headline selector counts towards the page
      size; links accepted by ``isBerita`` are requested with
      ``self.parse_artikel`` as callback. When at least 39 links were found
      and the page exposes pagination links, the last pagination link is
      followed. Otherwise the scraped/dropped ratio is evaluated,
      ``kirim_notif`` is called when it is below 2, and ``sys.exit`` is
      invoked with a completion message.

      Args:
          response: Page response exposing a ``css()`` selector API.

      Yields:
          ``scrapy.Request`` objects for the articles and the next index page.

      Raises:
          SystemExit: Once no further page is followed.
      """
      # Local import: the file's import block is not visible from this method.
      import logging

      berita_selector = "div.txt_subkanal.txt_index h2 a::attr(href)"
      # News count, used to decide whether another page can still be scraped.
      jumlah_berita = 0

      for baris in response.css(berita_selector):
          # Crawl each URL found on this page.
          url = baris.getall()[0]
          self.total_scrapped += 1
          jumlah_berita += 1
          if not isBerita(url):
              continue
          yield scrapy.Request(url, callback=self.parse_artikel)

      # Work out the next page, if any.
      next_page = None
      if jumlah_berita >= 39:
          np_sel = 'div.pagination section nav a::attr(href)'
          pagination_links = response.css(np_sel).getall()
          # A page without pagination links is treated as the last page
          # instead of failing with IndexError on the [-1] lookup.
          if pagination_links:
              next_page = pagination_links[-1]

      if next_page is not None:
          req = scrapy.Request(next_page, callback=self.parse)
          # Increment the live attribute rather than a copy read before the
          # article requests were yielded.
          self.i += 1
          yield req
      else:
          try:
              # NOTE(review): the loop above maintains `total_scrapped`, so the
              # ratio reads the same counter; the previous `total_scraped`
              # spelling was never updated by this method - confirm against
              # the class definition.
              rasio = self.total_scrapped // self.dropped_count
              if rasio < 2:
                  kirim_notif(self.name)
          except ZeroDivisionError:
              # dropped_count is 0, so there is no ratio to evaluate.
              pass
          except Exception:
              # The notification stays best-effort, but failures are no
              # longer silent.
              logging.getLogger(__name__).exception(
                  "end-of-crawl notification check failed for %s", self.name)
          sys.exit("scraping Republika - selesai")
    def parse(self, response):
        """Scrape one index page and schedule the article requests found on it.

        Every ``article.simple-post`` node counts towards the page size.
        A link already present in ``self.url_seen`` stops the run via
        ``sys.exit()``. Links accepted by ``isBerita`` are recorded in
        ``self.url_seen`` and requested with ``self.parse_artikel`` as
        callback. When the page holds more than 9 entries the next index page
        is requested; otherwise the scraped/dropped ratio is evaluated,
        ``kirim_notif`` is called when it is below 2, and a summary is
        printed.

        Args:
            response: Page response exposing a ``css()`` selector API.

        Yields:
            ``scrapy.Request`` objects for the articles and the next index
            page.

        Raises:
            SystemExit: When a link that was already seen shows up again.
        """
        # Local import: the file's import block is not visible from here.
        import logging

        konten_selektor = 'article.simple-post'
        link_selector = 'a ::attr(href)'
        # Count the news entries on this page.
        jumlah_berita = 0
        for konten in response.css(konten_selektor):
            link = konten.css(link_selector).extract_first()
            self.total_scraped += 1
            if link in self.url_seen:
                sys.exit()
            # Every entry counts towards the page size, including skipped
            # ones: the count only decides whether another page is requested.
            jumlah_berita += 1
            # extract_first() yields None when the entry has no link; such an
            # entry cannot be classified or requested, so skip it.
            if link is None or not isBerita(link):
                continue

            req = scrapy.Request(link, callback=self.parse_artikel)
            self.url_seen.append(link)
            yield req

        # Follow the next page if this one looked full.
        if jumlah_berita > 9:
            self.hal = self.hal + 1
            next_page = ('https://www.antaranews.com/indeks/' + self.tanggal +
                         '/' + str(self.hal))
            yield scrapy.Request(next_page, callback=self.parse)
        else:
            try:
                rasio = self.total_scraped // self.dropped_count
                if rasio < 2:
                    kirim_notif(self.name)
            except ZeroDivisionError:
                # dropped_count is 0, so there is no ratio to evaluate.
                pass
            except Exception:
                # The notification stays best-effort, but failures are no
                # longer silent.
                logging.getLogger(__name__).exception(
                    "end-of-crawl notification check failed for %s",
                    self.name)
            print("scraping ---- Selesai Total halaman = ", self.hal)
            print("jumlah berita  =", jumlah_berita, "----halaman =", self.hal)