Example #1
 def parse_essay(self, response):
     socket.setdefaulttimeout(30)
     soup = BeautifulSoup(response.text, 'html.parser')
     flag = True
     for i in soup.select('.widget-listing '):  # statically rendered article list
         response.meta['title'] = i.select_one('a').get('title')
         mm = i.select_one('h5').text  # e.g. '09 जनवरी 2021'
         ss = self.hindi_month[mm.split()[1]] + ' ' + mm.split()[0] + ' ' + mm.split()[2]  # ss is e.g. 'Jan 09 2021'
         response.meta['pub_time'] = Util.format_time2(ss)
         if self.time is None or Util.format_time3(Util.format_time2(ss)) >= int(self.time):
             yield Request(url=i.select_one('a').get('href'), meta=response.meta, callback=self.parse_item)
         else:
             self.logger.info('Time cutoff reached!')
     try:  # try the dynamic-load API (some subcategories are not dynamically loaded)
         page = 1  # page counter, kept separate from the loop variable `i` below
         self.params['path'] = response.url.replace('https://www.aajtak.in', '')
         while flag:  # stop probing the API once the cutoff has been hit
             self.params['id'] = str(page)
             page += 1
             api_rq = requests.get(self.api_url, params=self.params, headers=self.headers)
             if api_rq.status_code != 200:
                 break
             soup = BeautifulSoup(api_rq.text, 'html.parser')
             for i in soup.select('.widget-listing '):  # dynamically loaded subcategory entries
                 response.meta['title'] = i.select_one('a').get('title')
                 mm = i.select_one('h5').text  # e.g. '09 जनवरी 2021'
                 ss = self.hindi_month[mm.split()[1]] + ' ' + mm.split()[0] + ' ' + mm.split()[2]  # ss is e.g. 'Jan 09 2021'
                 response.meta['pub_time'] = Util.format_time2(ss)
                 if self.time is None or Util.format_time3(Util.format_time2(ss)) >= int(self.time):
                     yield Request(url=i.select_one('a').get('href'), meta=response.meta, callback=self.parse_item)
                 else:
                     flag = False
                     self.logger.info('Time cutoff reached!')
     except Exception:
         self.logger.info('No more dynamic news loading!')
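
The `self.hindi_month` lookup used above is defined elsewhere in the spider; from the inline comment ('09 जनवरी 2021' becomes 'Jan 09 2021') it maps Hindi month names to English abbreviations. A minimal sketch, assuming standard Hindi spellings (the site may use variants such as फ़रवरी or सितम्बर):

 # Hypothetical reconstruction of the spider attribute used in Example #1.
 hindi_month = {
     'जनवरी': 'Jan', 'फरवरी': 'Feb', 'मार्च': 'Mar', 'अप्रैल': 'Apr',
     'मई': 'May', 'जून': 'Jun', 'जुलाई': 'Jul', 'अगस्त': 'Aug',
     'सितंबर': 'Sep', 'अक्टूबर': 'Oct', 'नवंबर': 'Nov', 'दिसंबर': 'Dec',
 }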
Example #2
 def parse_essay(self, response):
     soup = BeautifulSoup(response.text, 'html.parser')
     flag = True
     for i in soup.find_all(class_='amp-wp-content amp-loop-list'):
         tt = i.select_one('.featured_time ').text.split()  # e.g. ['2', 'दिन', 'ago']
         try:
             pub_time = tt[0] + ' ' + self.hindi_time_ago[tt[1]] + ' ' + tt[2]  # e.g. '2 days ago'
         except (KeyError, IndexError):
             pub_time = Util.format_time(0)
         if self.time is None or Util.format_time3(
                 Util.format_time2(pub_time)) >= int(self.time):  # True while the cutoff has not been reached
             response.meta['title'] = i.select_one('h2').text
             response.meta['abstract'] = i.select_one('.large-screen-excerpt-design-3').text
             response.meta['pub_time'] = Util.format_time2(pub_time)
             response.meta['images'] = [i.select_one('amp-img').get('src')]
             yield Request(url=i.select_one('a').get('href'),
                           meta=response.meta,
                           callback=self.parse_item)
         else:
             flag = False
             self.logger.info('Time cutoff reached')
             break
     if flag:
         nextPage = soup.select_one('#pagination a')
         if nextPage:  # no pagination anchor on the last page
             yield Request(nextPage.get('href'),
                           meta=response.meta,
                           callback=self.parse_essay)
         else:
             self.logger.info('Next page no more!')
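
Example #2 relies on a second lookup, `self.hindi_time_ago` (from the comment, 'दिन' maps to 'days'), and both examples repeat the idiom found throughout this page: compare each entry's publication time against the optional epoch cutoff `self.time` and stop paginating once an entry falls behind it. A minimal restatement of that test, assuming (as the snippets suggest) that `Util.format_time2` normalises a date string and `Util.format_time3` converts it to epoch seconds:

 def should_follow(self, pub_time_str):
     # True while pub_time_str is at or after the optional cutoff.
     if self.time is None:  # no cutoff configured: crawl everything
         return True
     return Util.format_time3(Util.format_time2(pub_time_str)) >= int(self.time)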
Example #3
 def parse_page(self, response):
     meta = {}
     soup = bs(response.text, "html.parser")
     crumbs = soup.select(".breadcrumb > li")
     category1 = crumbs[1].text if len(crumbs) > 1 else None
     if category1 in ('SPORTS', 'OPINION', 'TECH TREND'):
         category2 = None
     else:
         category2 = crumbs[2].text.strip() if len(crumbs) > 2 else None
     meta["category1"] = category1
     meta["category2"] = category2
     self.logger.info(category1)
     self.logger.info(category2)
     for i in soup.find_all(class_="entry-title"):
         news_url = i.find("a").get("href")
         yield scrapy.Request(news_url, callback=self.parse_news, meta=meta)
     pub_time = soup.find_all(class_="meta-date")[-1].text.strip()
     self.logger.info(Util.format_time2(pub_time))
     if self.time is None or Util.format_time3(
             Util.format_time2(pub_time)) >= int(self.time):
         next_tag = soup.find(class_="next page-numbers")
         url = next_tag.get("href") if next_tag else None
         self.logger.info(url)
         if url:
             yield scrapy.Request(url, callback=self.parse_page)
     else:
         self.logger.info('Time cutoff reached')
Example #4
 def parse(self, response):   # news listing; each entry carries the full article
     soup = BeautifulSoup(response.text, 'html.parser')
     flag = True
     time_nodes = soup.select('#page-content-wrapper > div.container.section-1 > div > div.col-lg-9.col-md-9.col-sm-9.col-xs-12 > div > div h5')
     last_pub_time = Util.format_time2(time_nodes[-1].text)
     if self.time is None or Util.format_time3(last_pub_time) >= int(self.time):
         all_pub_time = [Util.format_time2(i.text) for i in time_nodes]
         all_title = [i.text.strip() for i in soup.select('.lk-tle')]
         all_images = ['https://www.myanmarisis.org' + i.get('src') for i in soup.select('.img-responsive.lk-img')]
         all_body = [i.text.strip() for i in soup.select('#page-content-wrapper > div.container.section-1 > div > div.col-lg-9.col-md-9.col-sm-9.col-xs-12 > div p')]
         # zip stops at the shortest list, so a short page cannot raise IndexError
         for pub_time, title, image, body in zip(all_pub_time, all_title, all_images, all_body):
             item = DemoItem()
             item['pub_time'] = pub_time
             item['images'] = [image]
             item['title'] = title
             item['body'] = body
             item['category1'] = 'event'
             item['category2'] = None
             item['abstract'] = body.split('\n')[0]
             yield item
     else:
         self.logger.info('Time cutoff reached!')
         flag = False
     if flag:
         try:
             nextPage = soup.select_one('.active ~ li a').get('href')
             yield Request(url=nextPage)
         except AttributeError:
             self.logger.info("Next page no more.")
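
None of these snippets define the `DemoItem` they populate. Judging from the fields assigned across the examples it is a plain Scrapy item; a minimal sketch under that assumption (the real definition may carry extra fields or processors):

 import scrapy

 class DemoItem(scrapy.Item):
     # fields inferred from the assignments in these snippets
     title = scrapy.Field()
     pub_time = scrapy.Field()
     images = scrapy.Field()
     abstract = scrapy.Field()
     body = scrapy.Field()
     category1 = scrapy.Field()
     category2 = scrapy.Field()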
Example #5
 def parse2(self, response):
     html = BeautifulSoup(response.text, 'html.parser')
     item = DemoItem()
     item['category1'] = response.meta['category1']
     if response.meta['category1'] != 'life':
         item['title'] = html.select('#landing-headline > h1')[0].text
         item['body'] = ''
         flag = False  # becomes True once the first non-empty paragraph is stored as the abstract
         for i in html.select('#article-content > p'):
             item['body'] += i.text
             if i.text != '' and not flag:
                 flag = True
                 item['abstract'] = i.text
         item['pub_time'] = Util.format_time2(
             html.select('#m-pd2 > span')[-1].text)
         item['images'] = []
         for i in html.select('#article-content img'):
             item['images'].append(i.attrs['src'])
         yield item
     else:
         item['title'] = html.select('#art-hgroup > h1')[0].text
         item['body'] = ''
         flag = False
         for i in html.select('#article-content > p'):
             item['body'] += (i.text + '\n')
             if i.text != '' and not flag:
                 flag = True
                 item['abstract'] = i.text
         item['pub_time'] = Util.format_time2(
             html.select('.art-byline > span')[-1].text)
         item['images'] = []
         for i in html.select('#article-content img'):
             item['images'].append(i.attrs['src'])
         yield item
Example #6
 def parse2(self, response):
     item = DemoItem()
     html = BeautifulSoup(response.text, 'html.parser')
     parts = response.url.split('/')  # avoid shadowing the builtin `list`
     item['title'] = html.select('.title')[0].text
     item['category1'] = parts[3]
     if not re.findall(r'\d+', parts[4]):  # a numeric segment is an article id, not a category
         item['category2'] = parts[4]
     item['body'] = ''
     flag = False  # becomes True once the first non-empty paragraph is stored as the abstract
     for i in html.select('#content-body-244757-498257 > p'):  # page-specific container id
         item['body'] += (i.text + '\n')
         if i.text != '' and not flag:
             flag = True
             item['abstract'] = i.text
     if html.select('.dateLine > p'):
         item['pub_time'] = Util.format_time2(
             html.select('.dateLine > p')[0].text)
     elif html.select('.dateString'):
         item['pub_time'] = Util.format_time2(
             html.select('.dateString')[0].text)
     if html.select('.margin-bottom-15 img'):
         item['images'] = [
             'https://www.cnnphilippines.com' +
             html.select('.margin-bottom-15 img')[0].attrs['src'],
         ]
     yield item
Example #7
 def parse(self, response):
     soup = BeautifulSoup(response.text, 'html.parser')
     flag = True
     for i in soup.select('.post'):
         url = i.select_one('a').get('href')
         response.meta['title'] = i.select_one('h2 > a ').text
         response.meta['abstract'] = i.select_one('div.exceprt ').text  # 'exceprt' is presumably the site's own (misspelled) class name
         response.meta['pub_time'] = Util.format_time2(
             i.select_one('.date').text)
         if self.time is None or Util.format_time3(
                 Util.format_time2(i.select_one('.date').text)) >= int(
                     self.time):
             yield Request(url,
                           meta=response.meta,
                           callback=self.parse_item)
         else:
             flag = False
             self.logger.info('Time cutoff reached')
             break
     if flag:
         next_tag = soup.find(class_='next page-numbers')
         if next_tag and next_tag.get('href'):  # check before dereferencing; there is no next link on the last page
             yield Request(url=next_tag.get('href'),
                           meta=response.meta,
                           callback=self.parse)
         else:
             self.logger.info('Next page no more!')
Example #8
 def parse_essays(self, response):  # pagination and article-URL extraction for the various subcategories
     soup = BeautifulSoup(response.text, 'html.parser')
     flag = True
     if re.match(r'.*photo-gallery.*', response.url):  # photo galleries
         for t in soup.find_all(class_='col-sm-4 col-md-4 photo-photo-h'):
             try:
                 url = 'https://zeenews.india.com' + t.select_one('a').get('href')
             except AttributeError:
                 continue
             response.meta['title'] = t.select_one('h3').text
             response.meta['images'] = [t.select_one('img').get('src')]
             response.meta['pub_time'] = t.select_one('.photo-date').text.strip()
             if self.time is None or Util.format_time3(Util.format_time2(t.select_one('.photo-date').text.strip())) >= int(self.time):
                 yield Request(url, callback=self.parse_item_photo, meta=response.meta)
             else:
                 flag = False
                 self.logger.info('Time cutoff reached')
                 break
     elif re.match(r'.*video.*', response.url):  # videos
         for i in soup.find_all(attrs={'class': 'mini-video mini-video-h margin-bt30px'}):  # the articles initially present in this listing
             url = 'https://zeenews.india.com' + i.select_one('a').get('href')
             response.meta['images'] = [i.select_one('img').get('src')]
             response.meta['title'] = i.select_one('h3').text
             response.meta['pub_time'] = i.select_one('.date').text.strip()
             if self.time is None or Util.format_time3(Util.format_time2(i.select_one('.date').text.strip())) >= int(self.time):
                 yield Request(url, callback=self.parse_item_video, meta=response.meta)
             else:
                 flag = False
                 self.logger.info('Time cutoff reached')
                 break
     else:
         for t in soup.find_all(class_='section-article margin-bt30px clearfix'):  # the articles initially present in this listing
             url = 'https://zeenews.india.com' + t.select_one('a').get('href')
             response.meta['title'] = t.select_one('h3.margin-bt10px').text
             tt = t.select_one('span.date').text.strip().split()
             try:
                 pub_time = self.hindi_month[tt[0]] + ' ' + tt[1] + ' ' + tt[2] + ' ' + tt[3] + ' ' + tt[5]
             except (KeyError, IndexError):
                 pub_time = t.select_one('span.date').text.strip()
             response.meta['pub_time'] = pub_time
             response.meta['images'] = [t.select_one('img').get('src')]
             if self.time is None or Util.format_time3(Util.format_time2(pub_time)) >= int(self.time):
                 yield Request(url=url, meta=response.meta, callback=self.parse_item)
             else:
                 flag = False
                 self.logger.info('Time cutoff reached')
                 break
     if flag:
         try:
             nextPage = 'https://zeenews.india.com/' + soup.find(class_='next last').select_one('a').get('href')
             yield Request(nextPage, callback=self.parse_essays, meta=response.meta)
         except AttributeError:
             self.logger.info('Next page no more!')
Example #9
 def parse_detail(self, response):
     item = DemoItem()
     html = BeautifulSoup(response.text, 'html.parser')
     item['category1'] = response.meta['category1']
     item['category2'] = response.meta['category2']
     if html.select_one("div.container h1") is not None:
         item['title'] = html.select_one("div.container h1").text
     item['body'] = ''
     if html.select("div.col-24 p"):
         bodies = html.select("div.col-24 p")
         b_list = [b.text for b in bodies]
         item['body'] = '\n'.join(b_list)
         item['abstract'] = bodies[0].text
     item['images'] = []
     if html.select("div.col-24 figure img"):
         images = html.select("div.col-24 figure img")
         for i in images:
             item['images'].append(i['src'])
     if html.select_one("p.byline span.date") is not None:
         ex = 'Published on (.*)'
         pub_time = html.select_one("p.byline span.date").text
         pub_time = re.findall(ex, pub_time)
         if pub_time:
             pub_time = pub_time[0]
             pub_time = Util.format_time2(pub_time)
             item['pub_time'] = pub_time
         else:
             item['pub_time'] = Util.format_time()
     else:
         item['pub_time'] = Util.format_time()
     yield item
Example #10
 def parse2(self, response):
     html = BeautifulSoup(response.text, 'html.parser')
     parts = response.url.split('/')  # avoid shadowing the builtin `list`
     category1 = parts[4]
     if len(parts) > 5 and parts[5] != 'page':
         category2 = parts[5]
     else:
         category2 = ''
     for i in html.select(
             'div[class="default-post-category-content post_box"] .default-category-image > a'
     ):
         yield Request(i.attrs['href'],
                       meta={
                           'category1': category1,
                           'category2': category2
                       },
                       callback=self.parse3)
     if html.select('.previous_posts > a') and (self.time is None or Util.format_time3(
             Util.format_time2(
                 html.select(
                     'div[class="default-post-category-content post_box"] .post_date'
                 )[-1].text)) >= int(self.time)):
         yield Request(html.select('.previous_posts > a')[0].attrs['href'],
                       callback=self.parse2)
     else:
         self.logger.info('Time cutoff reached')
Example #11
    def get_next_page(self, response):
        soup = bs(response.text, "html.parser")
        item = response.meta["item"]
        div_list = soup.find_all(
            "div", class_="data-bg-hover data-bg data-bg-categorised")
        for div in div_list:
            article_url = div.select_one("a").get("href")
            yield scrapy.Request(article_url,
                                 callback=self.get_news_detail,
                                 meta={"item": item})  # data is passed between levels via the meta parameter

        if self.time is None or Util.format_time3(
                Util.format_time2(
                    soup.find_all("article")[-1].find(
                        "span", class_="item-metadata posts-date").text.strip(
                        ))) >= int(self.time):
            url = soup.find(
                "a", class_="next page-numbers").get("href") if soup.find(
                    "a", class_="next page-numbers") else None
            if url:
                yield scrapy.Request(url,
                                     meta=response.meta,
                                     callback=self.get_next_page)
        else:
            self.logger.info('Time cutoff reached')
Example #12
    def parse_news(self, response):
        item = DemoItem()
        soup = bs(response.text, "html.parser")
        item["category1"] = response.meta["category1"]
        item["category2"] = response.meta["category2"]

        item["title"] = soup.find(class_="post-title entry-title").text.strip()
        item["pub_time"] = Util.format_time2(
            soup.find(class_="published timeago").text.strip())

        content = soup.find(class_="post-body entry-content")
        images = [img.get("src") for img in content.find_all("img")
                  ] if content.find_all("img") else []
        item["images"] = images
        body1 = ''
        for div in content.find_all(dir="ltr"):
            body1 += (div.text.strip() + '\n')
        if body1 == '':
            body1 = content.text

        body = ''
        for b in body1.split("\n"):
            if b != '':
                body += (b + '\n')
        item["body"] = body
        item["abstract"] = body.split("\n")[0]
        yield item
Example #13
    def parse_item(self, response):
        soup = BeautifulSoup(response.text, 'html.parser')
        item = DemoItem()
        category = response.url.split('/')[-3].split('_')
        if len(category) == 3:
            item['category1'] = category[1]
            item['category2'] = category[2]
        else:
            item['category1'] = category[0]
            item['category2'] = category[1]

        item['title'] = soup.select_one('h1.entry-title').text

        item['pub_time'] = Util.format_time2(
            soup.select('span.td-post-date > time')[0].text)
        item['images'] = [
            i.get('data-src') for i in soup.select('div.td-post-content img')
        ]
        item['abstract'] = soup.select('div.td-post-content > p')[0].text

        ss = ''
        for i in soup.select('div.td-post-content > p'):
            ss += i.text + '\n'  # '\n', not the raw string r'\n', which would append a literal backslash-n
        item['body'] = ss

        return item
Example #14
    def parse_news(self, response):
        item = DemoItem()
        soup = bs(response.text, "html.parser")
        item["category1"] = response.meta["category1"]
        item["category2"] = response.meta["category2"]
        pub_time = soup.find(
            "span",
            "entry-meta-date updated").find("a").text.strip() if soup.find(
                "span", "entry-meta-date updated") else "0000-00-00 00:00:00"
        item["pub_time"] = Util.format_time2(pub_time)

        div = soup.find("div", class_="entry-content clearfix")
        images = [img.get("src") for img in div.find_all("img")
                  ] if div.find_all("img") else None
        item["images"] = images
        title = soup.find("h1", class_="entry-title").text.strip()
        item["title"] = title
        if div.find_all("li"):  # bullet-point articles: join the list items
            abstract = ''.join(a.text.strip() for a in div.find_all("li"))
        else:  # otherwise fall back to the first paragraph
            abstract = div.find("p").text.strip()
        item["abstract"] = abstract
        body = [p.text.strip()
                for p in div.find_all("p")] if div.find_all("p") else []
        item["body"] = "\n".join(body)
        yield item
Example #15
 def parse3(self, response):
     html = BeautifulSoup(response.text, 'html.parser')
     item = DemoItem()
     parts = response.url.split('/')  # avoid shadowing the builtin `list`
     item['title'] = html.select('.news-title')[0].text
     item['category1'] = parts[3]
     if not re.findall(r'\d+', parts[4]):
         item['category2'] = parts[4]
     item['body'] = ''
     for i in html.select('.article-content > p'):
         item['body'] += (i.text + '\n')
     if html.select('.article-content > p'):
         item['abstract'] = html.select('.article-content > p')[0].text
     if html.select('.timestamp-entry > .date-posted'):
         self.logger.info(
             html.select('.timestamp-entry > .date-posted')[0].text)  # log only after the emptiness check
         item['pub_time'] = Util.format_time2(
             html.select('.timestamp-entry > .date-posted')[0].text)
     else:
         item['pub_time'] = Util.format_time()
     if html.select('.article-content > .embed-wrap img'):
         item['images'] = [
             html.select('.article-content > .embed-wrap img')
             [0].attrs['src'],
         ]
     yield item
Example #16
    def parse_category2(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        url_list = soup.find_all('h2', class_='entry-title grid-title')
        for url in url_list:
            news_url = url.find('a').get('href')
            yield scrapy.Request(news_url,
                                 meta=response.meta,
                                 callback=self.parse_details)

        # cutoff time
        ddl = soup.find(
            'time',
            class_='entry-date published').text.strip()  # January 10, 2021
        ddl = Util.format_time2(ddl)  # 2021-01-10 00:00:00
        ddl = Util.format_time3(ddl)  # 1610208000

        # pagination
        if soup.find('a', class_='next page-numbers'):
            next_url = soup.find('a', class_='next page-numbers').get('href')
            if self.time is None or ddl >= int(self.time):
                yield scrapy.Request(next_url,
                                     meta=response.meta,
                                     callback=self.parse_category2)
            else:
                self.logger.info('Time cutoff reached')
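
The `Util` helpers are used everywhere but shown nowhere. The inline comments in this example pin down their contract: `format_time2` normalises a human-readable date to 'YYYY-MM-DD HH:MM:SS' and `format_time3` converts that string to a Unix timestamp. A rough sketch under those assumptions, using `dateutil` for parsing; the real helpers evidently also handle relative phrases such as '2 days ago' (Example #2), which this sketch omits:

 import time
 from dateutil import parser as dateparser  # assumption: any flexible date parser will do

 class Util:
     @staticmethod
     def format_time2(s):
         # 'January 10, 2021' -> '2021-01-10 00:00:00'
         return dateparser.parse(s).strftime('%Y-%m-%d %H:%M:%S')

     @staticmethod
     def format_time3(s):
         # '2021-01-10 00:00:00' -> epoch seconds, e.g. 1610208000
         return int(time.mktime(time.strptime(s, '%Y-%m-%d %H:%M:%S')))

     @staticmethod
     def format_time(offset_days=0):
         # fallback "now" (optionally shifted back), in format_time2's output format
         return time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(time.time() - offset_days * 86400))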
Example #17
 def parse(self, response):
     soup = BeautifulSoup(response.text, 'html.parser')
     flag = True
     for i in soup.find_all(
             class_='td_module_10 td_module_wrap td-animation-stack'):
         url = i.select_one('h3 > a').get('href')
         response.meta['title'] = i.select_one('h3 > a ').text
         response.meta['abstract'] = i.select_one('div.td-excerpt').text
         response.meta['pub_time'] = i.select_one('.td-post-date').text
         if self.time is None or Util.format_time3(
                 Util.format_time2(
                     i.select_one('.td-post-date').text)) >= int(self.time):
             yield Request(url,
                           meta=response.meta,
                           callback=self.parse_item)
         else:
             flag = False
             self.logger.info('Time cutoff reached')
             break
     if flag:
         nav = soup.find(class_='page-nav td-pb-padding-side')
         nextPage = nav.select('a')[-1].get('href') if nav and nav.select('a') else None
         if nextPage:
             yield Request(url=nextPage,
                           meta=response.meta,
                           callback=self.parse)
         else:
             self.logger.info('Next page no more!')
Example #18
 def parse_item(self, response):
     soup = BeautifulSoup(response.text, 'html.parser')
     item = DemoItem()
     item['title'] = response.meta['title']
     item['category1'] = response.meta['category1']
     item['abstract'] = response.meta['abstract']
     item['images'] = response.meta['images']
     item['category2'] = response.meta['category2']
     if re.findall('headline', response.url):  # regular news
         ss = ''
         for i in soup.select('.dit > p > b'):
             ss += i.text + '\n'
         try:
             ss += soup.select_one('.dit > p > span').text
         except AttributeError:
             pass
         item['body'] = ss
         tt = soup.select_one('.colort').text.split(
         )  # e.g. ['Wednesday', '6', 'January', '2021', '02:12:12', 'PM']
         tt = tt[2] + ' ' + tt[1] + ' ' + tt[3] + ' ' + tt[4] + ' ' + tt[
             5]  # e.g. 'January 6 2021 02:12:12 PM'
         item['pub_time'] = Util.format_time2(tt)
     elif re.findall('watchvid', response.url):  # video news
         item['body'] = soup.select_one('.dit > p').text
         item['pub_time'] = soup.select_one('.colort').text
     else:  # photo news
         item['body'] = soup.select_one('.news_saa > p').text
         item['pub_time'] = Util.format_time(0)
     return item
Example #19
    def parse_details(self, response):
        item = DemoItem()
        soup = BeautifulSoup(response.text, 'lxml')
        item['category1'] = response.meta['category1']
        item['category2'] = response.meta['category2']

        item['title'] = soup.find('h1', class_='post-title entry-title').text.strip() if soup.find('h1', class_='post-title entry-title') else None

        item['body'] = ''  # don't forget to initialise
        item['abstract'] = ''
        if soup.select('.entry-content p,.entry-content h3'):
            body_list = soup.select('.entry-content p,.entry-content h3')  # a grouped selector extracts several different tags at once
            for body in body_list:
                item['body'] += body.text.strip()
                item['body'] += '\n'
            item['abstract'] = body_list[0].text.strip()

        item['images'] = []
        image_list = soup.select('.entry-content p>img,.single-featured-image>img')
        for image in image_list:
            item['images'].append(image.get('src'))

        pub = soup.find('span', class_='date meta-item tie-icon').text.strip() if soup.find('span', class_='date meta-item tie-icon') else None
        if pub:
            item['pub_time'] = Util.format_time2(pub)

        yield item
Example #20
    def parse(self, response):
        meta = {}
        soup = bs(response.text, "html.parser")

        category1 = soup.find(class_="tdb-title-text").text.strip()
        meta["category1"] = category1
        for i in soup.find_all(
                class_="td-module-container td-category-pos-above"):
            news_url = i.find(
                class_="entry-title td-module-title").find("a").get("href")
            category2 = i.find(
                class_="td-module-meta-info").select_one("a").text.strip()
            if category2 == category1:
                category2 = None
            meta["category2"] = category2
            yield scrapy.Request(news_url, callback=self.parse_news, meta=meta)

        pub = soup.find_all(
            class_="entry-date updated td-module-date")[-1].text.strip()
        if self.time is None or Util.format_time3(
                Util.format_time2(pub)) >= int(self.time):
            url = soup.find(
                "div",
                "page-nav td-pb-padding-side").select("a")[-1].get("href")
            yield scrapy.Request(url, callback=self.parse)
        else:
            self.logger.info('Time cutoff reached')
Example #21
    def get_news_detail(self, response):
        '''
        :param response: the response for a news article page
        :return: the news detail fields parsed from that page
        '''
        item = response.meta["item"]

        soup = bs(response.text, "html.parser")

        title = soup.find("h1", class_="entry-title").text
        pub_time = Util.format_time2(
            soup.find("time", class_="entry-date updated td-module-date").text)
        image_list = [
            img.get("src")
            for img in soup.find_all("img",
                                     class_="entry-thumb td-modal-image")
        ] if soup.find("img", class_="entry-thumb td-modal-image") else None
        body = ''
        for p in soup.find("div", class_="td-post-content").select("p"):
            body += (p.text + '\n')

        abstract = body.split("।")[0]  # the first sentence ('।' is the Devanagari danda)

        item["title"] = title
        item["pub_time"] = pub_time
        item["images"] = image_list
        item["abstract"] = abstract
        item["body"] = body

        yield item
Example #22
    def get_next_page(self, response):
        item = response.meta["item"]
        soup = bs(response.text, "html.parser")
        # len(crumbs.select("i")) is the number of breadcrumb levels
        crumbs = soup.find("div", class_="entry-crumbs")
        if len(crumbs.select("i")) == 1:
            item['category1'] = crumbs.find("span", class_="td-bred-no-url-last").text
            item['category2'] = " "
        else:
            item['category1'] = crumbs.select("span")[-2].text
            item['category2'] = crumbs.find("span", class_="td-bred-no-url-last").text

        for h3 in soup.find("div", class_="td-big-grid-wrapper").select("h3"):
            article_url = h3.select_one("a").get("href")
            yield scrapy.Request(article_url, meta=response.meta, callback=self.get_news_detail)

        for h3 in soup.select_one("#td-outer-wrap > div.td-main-content-wrap > div > div > div.td-pb-span8.td-main-content > div").find_all("h3", class_="entry-title td-module-title"):
            article_url = h3.select_one("a").get("href")
            yield scrapy.Request(article_url, meta=response.meta, callback=self.get_news_detail)

        temp_time = soup.find_all("div", class_="td-block-span6")[-1].select("time")[-1].text if soup.find_all("div", class_="td-block-span6") else "January 1, 1970"

        if self.time is None or Util.format_time3(Util.format_time2(temp_time)) >= int(self.time):
            next_url = None
            page_nav = soup.find("div", class_="page-nav td-pb-padding-side")  # pages without a nav bar have no next link
            if page_nav:
                # the last <a> is the "next" arrow only while it still contains an <i>; on the final page it does not
                next_url = page_nav.select("a")[-1].get("href") if page_nav.select("a")[-1].select("i") else None
            if next_url:
                yield scrapy.Request(next_url, meta=response.meta, callback=self.get_next_page)
        else:
            self.logger.info('Time cutoff reached')
Example #23
    def parse_news(self, response):
        item = DemoItem()
        soup = bs(response.text, "html.parser")

        item["category1"] = response.meta["category1"]
        item["category2"] = response.meta["category2"]

        title = soup.find(class_="tdb-title-text").text.strip()
        item["title"] = title
        pub_time = soup.find(
            class_="entry-date updated td-module-date").text.strip()
        item["pub_time"] = Util.format_time2(pub_time)
        images = [
            soup.find("div",
                      "tdb-block-inner td-fix-index").find("img").get("src")
        ] if soup.find("div", "tdb-block-inner td-fix-index") else []  # keep a list so more images can be appended below
        for img in soup.find_all("div", "wp-block-image"):
            images.append(img.find("img").get("src"))
        item["images"] = images
        abstract = soup.select_one(
            "div.wpb_wrapper > div > div > p").text.strip() if soup.select_one(
                "div.wpb_wrapper > div > div > p") else None
        item["abstract"] = abstract
        body = soup.find(
            class_="tdb-caption-text").text.strip() + '\n' if soup.find(
                class_="tdb-caption-text") else ''
        for p in soup.select("div.wpb_wrapper > div > div > p"):
            body += (p.text.strip() + '\n')
        item["body"] = body

        self.logger.info(item)

        yield item
Example #24
 def parse_essay(self, response):
     soup = BeautifulSoup(response.text, 'html.parser')
     flag = True
     for i in soup.select('.CatNewsFirst_FirstNews '):
         tt = i.select_one('h1 ~ span').text.split('|')[0].strip()
         pub_time = Util.format_time2(tt.split()[1] + ' ' + tt.split()[0] +
                                      ' ' + tt.split()[2])
         url = 'http://www.univarta.com' + i.select_one('a').get('href')
         response.meta['title'] = i.select_one('a').text
         response.meta['pub_time'] = pub_time
         try:
             response.meta['images'] = [i.select_one('img').get('src')]
         except:
             response.meta['images'] = []
         response.meta['abstract'] = i.select_one('h1 ~ p').text
         if self.time is None or Util.format_time3(pub_time) >= int(
                 self.time):
             yield Request(url=url,
                           meta=response.meta,
                           callback=self.parse_item)
         else:
             flag = False
             self.logger.info('Time cutoff reached')
     if flag:
         try:
             nextPage = response.meta['cate_url'] + soup.select_one(
                 '.jp-current ~ a').get('href')
             self.logger.info(nextPage)
             yield Request(nextPage,
                           meta=response.meta,
                           callback=self.parse_essay)
         except:
             self.logger.info('Next page no more!')
Example #25
    def get_news_detail(self,response):
        '''
        :param response: the response for a news article page
        :return: the news detail fields parsed from that page
        '''
        item = response.meta["item"]

        soup = bs(response.text, "html.parser")

        title = soup.find("header", class_="td-post-title").select_one("h1").text
        pub_time = soup.find("header", class_="td-post-title").select_one("time").text

        image_list = []
        if soup.find("div", class_="td-post-featured-image"):
            image_list = [a.select_one("img").get("src") for a in soup.find("div", class_="td-post-featured-image").select("a")]
        body = ''
        part = soup.find("div", class_="td-post-content").select("p") if soup.find("div",class_="td-post-content").select("p") else soup.find("div", class_="td-post-content").select("div")
        for p in part:
            body += (p.text + '\n')
        abstract = body.split("।", 1)[0]
        item["title"] = title
        item["pub_time"] = Util.format_time2(pub_time)
        item["images"] = image_list
        item["abstract"] = abstract
        item["body"] = body
        yield item
Example #26
 def parse(self, response):
     html = BeautifulSoup(response.text, 'html.parser')
     if response.url == 'https://www.sunstar.com.ph/Philippines':
         for i in html.select('.tablecenter > a')[0:8]:
             yield Request(i.attrs['href'])
     elif re.findall(
             r'https://www.sunstar.com.ph/article/\d+/\S+?/\S+?/\S+?',
             response.url):
         item = DemoItem()
         parts = response.url.split('/')  # avoid shadowing the builtin `list`
         item['title'] = html.select('.titleArticle > h1')[0].text
         item['category1'] = parts[5]
         if not re.findall(r'\d+', parts[6]):
             item['category2'] = parts[6]
         item['body'] = html.select('.col-sm-11 p')[0].text
         item['abstract'] = html.select('.col-sm-11 p')[0].text
         item['pub_time'] = Util.format_time2(
             html.select('.articleDate')[0].text)
         if html.select('.imgArticle > img'):
             item['images'] = [
                 html.select('.imgArticle > img')[0].attrs['src'],
             ]
         yield item
     else:
         for i in html.select('.sectionTopWidget > div > div .ratio'):
             yield Request(i.attrs['href'])
         for i in html.select(
                 '.moreSectionWidget > div > div a[class="title-C20 title blu-hover"]'
         ):
             yield Request(i.attrs['href'])
Example #27
    def get_news_detail(self, response):
        '''
        :param response: the response for a news article page
        :return: the news detail fields parsed from that page
        '''
        item = response.meta["item"]

        soup = bs(response.text, "html.parser")
        title = soup.find("h1",
                          class_="entry-title").text.strip() if soup.find(
                              "h1", class_="entry-title") else None
        pub_time = Util.format_time2(
            soup.find("span", class_="item-metadata posts-date").text.strip())
        image_list = [
            soup.find("div", class_="entry-content").find(
                "figure", class_="wp-block-image size-large").select_one(
                    "img").get("data-src")
        ] if soup.find("div", class_="entry-content").find(
            "figure", class_="wp-block-image size-large") else []
        body = ''
        for p in soup.find("div", class_="entry-content").select("p"):
            body += (p.text.strip() + '\n')
        if soup.find("pre", class_="wp-block-code"):
            body += soup.find("pre", class_="wp-block-code").text
        abstract = body.split('।')[0]  # the abstract is the article's first sentence ('।' is the Devanagari danda)
        item["title"] = title
        item["pub_time"] = pub_time
        item["images"] = image_list
        item["abstract"] = abstract
        item["body"] = body
        yield item
Example #28
    def parse_details(self, response):
        item = DemoItem()
        soup = BeautifulSoup(response.text, 'lxml')
        item['category1'] = response.meta['category1']
        item['category2'] = response.meta['category2']

        item['title'] = soup.find('h1', class_='entry-title').text.strip() if soup.find('h1', class_='entry-title') else None

        item['body'] = ''  # don't forget to initialise
        body_list = soup.find('div', class_='entry clearfix').select('p')
        for body in body_list:
            item['body'] += body.text.strip()
            item['body'] += '\n'
        item['abstract'] = body_list[0].text.strip() if body_list else None

        item['images'] = []
        image_list = soup.find('div', class_='entry clearfix').select('p>img')
        for image in image_list:
            item['images'].append(image.get('src'))

        pub = soup.find('span', class_='updated')  # check the tag itself before dereferencing .text
        if pub:
            item['pub_time'] = Util.format_time2(pub.text.strip())

        yield item
Example #29
    def get_next_page(self, response):
        item = response.meta["item"]
        soup = bs(response.text, "html.parser")
        div = soup.find("div", class_="twp-row")
        # breadcrumb trail: item 1 is category1; item 2 is category2 unless it
        # is merely a "Page N" crumb
        trail = soup.find_all("li", class_="trail-item")
        item['category1'] = trail[1].text
        if len(trail) > 3 or (len(trail) == 3
                              and trail[-1].text.split(" ")[0] != "Page"):
            item['category2'] = trail[2].text
        else:
            item['category2'] = None
        for article in div.find_all("a", class_="post-thumbnail"):
            news_url = article.get("href")
            yield scrapy.Request(news_url,
                                 meta=response.meta,
                                 callback=self.get_news_detail)

        if self.time is None or Util.format_time3(
                Util.format_time2(
                    div.find_all("span", class_="item-metadata posts-date")
                    [-1].text.strip())) >= int(self.time):
            next_url = soup.find(
                "a", class_="next page-numbers").get("href") if soup.find(
                    "a", class_="next page-numbers") else None
            if next_url:
                yield scrapy.Request(next_url,
                                     meta=response.meta,
                                     callback=self.get_next_page)
        else:
            self.logger.info('Time cutoff reached')
Example #30
 def parse_news(self, response):
     soup = bs(response.text, 'html.parser')
     item = DemoItem()
     item["pub_time"] = Util.format_time2(
         soup.select('.post-meta > span')[1].text)
     title = soup.find(
         "h1", class_="post-title entry-title").text.strip() if soup.find(
             "h1", class_="post-title entry-title") else None
     item["title"] = title
     image = [
         soup.find("div", class_="single-post-thumb").find("img").get("src")
     ] if soup.find("div", class_="single-post-thumb") else None
     item["images"] = image
     category1 = soup.select_one(
         "#main-content > div > article > div > p > span:nth-child(3) > a"
     ).text.strip()
     item["category1"] = category1
     item["category2"] = None
     entry = soup.find("div", class_="entry")
     item["abstract"] = entry.find("p").text.strip() if entry else None
     body = [p.text.strip()
             for p in entry.find_all("p")] if entry else []
     item["body"] = "\n".join(body)
     yield item