def parse(self, response):
    soup = bs(response.text, 'html.parser')
    for i in soup.find_all(class_='post-title entry-title'):
        url = i.find('a').get('href')
        self.logger.info(url)
        yield scrapy.Request(url, callback=self.parse_news)
    if soup.find(class_='blog-pager-older-link'):
        next_url = soup.find('a', class_='blog-pager-older-link').get('href')
        # date header looks like "DD MonthName YYYY"
        pub_time1 = soup.find_all(class_='date-header')[-1].find('span').text
        pub_time = time.strftime(
            "%Y-%m-%d %H:%M:%S",
            datetime.datetime(int(pub_time1.split()[-1]),
                              self.month[pub_time1.split()[1]],
                              int(pub_time1.split()[0])).timetuple())
        self.logger.info(pub_time)
        if self.time is None or Util.format_time3(pub_time) >= int(self.time):
            self.logger.info(next_url)
            yield scrapy.Request(next_url, callback=self.parse)
        else:
            self.logger.info('Time cutoff reached')
def parse(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    for i in soup.select('article'):
        # read the date and title from the article node itself, not from the
        # whole page, so each entry gets its own metadata
        pub_time = Util.format_time2(i.select_one('.updated').text)
        response.meta['title'] = i.select_one('h2.entry-title a').text
        response.meta['pub_time'] = pub_time
        if self.time is None or Util.format_time3(pub_time) >= int(self.time):
            yield Request(url=i.select_one('a').get('href'),
                          meta=response.meta,
                          callback=self.parse_item)
        else:
            flag = False
            self.logger.info('Time cutoff reached')
    if flag:
        try:
            next_page = soup.select_one('.nav-previous a').get('href')
            yield Request(next_page, meta=response.meta, callback=self.parse)
        except AttributeError:
            self.logger.info('No more pages')
def parse_essay(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    for i in soup.find_all(class_='fbt-col-lg-12 col-md-4 col-xs-6 padding-reset'):
        # read date and title from the entry node itself, not the whole page
        tt = i.select_one('.post-info').text.split()
        pub_time = Util.format_time2(tt[1] + ' ' + tt[0] + ' ' + tt[2])
        response.meta['title'] = i.select_one('.post-content').text
        response.meta['pub_time'] = pub_time
        if self.time is None or Util.format_time3(pub_time) >= int(self.time):
            yield Request(url=i.select_one('.post-content a').get('href'),
                          meta=response.meta,
                          callback=self.parse_item)
        else:
            flag = False
            self.logger.info('Time cutoff reached')
    if flag:
        try:
            buttons = soup.find_all(class_='NavigationButton')
            if buttons[-1].get('value') == 'Next':
                # strip the JS wrapper: "window.location='<url>';" -> "<url>"
                next_page = buttons[-1].get('onclick').replace("window.location='", '')[:-2]
                if not re.match('http', next_page):
                    next_page = 'https://www.pressnote.in/' + next_page
                yield Request(next_page, meta=response.meta, callback=self.parse_essay)
        except (AttributeError, IndexError):
            self.logger.info('No more pages')
def parse_essay(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    for i in soup.select('#container > div')[1:-2]:  # articles on this page
        url = i.select_one('a').get('href')
        try:
            pub_time = (i.select_one('.entryDate').text
                        if i.select_one('.entryDate').text
                        else i.select_one('.meta_date').text)
        except Exception:
            continue
        if self.time is None or Util.format_time3(Util.format_time2(pub_time)) >= int(self.time):
            yield scrapy.Request(url, callback=self.parse_item)
        else:
            flag = False
            self.logger.info('Time cutoff reached')
            break
    if flag:
        if soup.select('.pagi-next'):
            yield Request(soup.select('.pagi-next')[0].attrs['href'],
                          callback=self.parse_essay)
        else:
            for i in soup.select('.numbered-pagination a'):
                yield Request(i.attrs['href'], callback=self.parse_essay)
def parse_category1(self, response): soup = BeautifulSoup(response.text, features="lxml") news_url = [] news_content = soup.select(".o-article .k-list-sec .allBox ul li") for n in news_content: news_url.append(n.find("a").get("href")) for url in news_url: yield scrapy.Request(url, callback=self.parse_detail) next_page = soup.select_one(".o-listing .pagination a").get( "href") if soup.select_one(".o-listing .pagination a") else None LastTimeStamp = Util.format_time3( str( khulasaa_time_switch( BeautifulSoup( requests.get(news_url[-1]).text, features="lxml").select_one( ".author-disc .date .author span").text))) if next_page: if self.time is None or LastTimeStamp >= self.time: yield scrapy.Request(next_page, callback=self.parse_category1) elif LastTimeStamp < self.time: self.logger.info("$$$时间截止$$$") else: self.logger.info("$$$该页已经到底$$$")
def parse_item(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    item = DemoItem()
    try:
        pub_time = self._4matTime(soup.select_one('.timestamp').text)
    except Exception:
        pub_time = Util.format_time(0)
    if self.time is None or Util.format_time3(pub_time) > int(self.time):
        # past this point the cutoff never triggered for any crawled URL
        item['pub_time'] = pub_time
        item['title'] = soup.select('.active')[-1].text.strip()
        item['category1'] = response.url.split('/')[3]
        item['category2'] = (response.url.split('/')[5]
                             if response.url.split('/')[3] == 'myanmar' else 'news')
        item['body'] = soup.select_one('article').text.strip()
        item['abstract'] = soup.select_one('article').text.strip().split('\n')[0]
        item['images'] = [i.get('src') for i in soup.select_one('section').select('img')]
        return item
    else:
        self.logger.info('Time cutoff reached')
        self.stopCount += 1
def parse_page(self, response):
    soup = bs(response.text, "html.parser")
    if soup.find(class_="post-outer") is not None:
        for i in soup.find_all(class_="post-title entry-title"):
            news_url = i.find("a").get("href")
            yield scrapy.Request(news_url, callback=self.parse_news, meta=response.meta)
        pub = soup.find_all(class_="published timeago")[-1].text.strip()
        # URL-encode the timestamp used by Blogger's "updated-max" parameter
        time = parse.quote(soup.find_all(class_="published timeago")[-1].get("title"))
        if self.time is None or Util.format_time3(Util.format_time2(pub)) >= int(self.time):
            url = response.meta["url"] + "?updated-max={}&max-results=8#PageNo={}"
            response.meta["p"] += 1
            yield scrapy.Request(url.format(time, response.meta["p"]),
                                 callback=self.parse_page,
                                 meta=response.meta)
        else:
            self.logger.info('Time cutoff reached')
def get_next_page(self, response):
    soup = bs(response.text, "html.parser")
    item = response.meta["item"]
    a_list = soup.find_all("a", class_="td-image-wrap")
    for a in a_list:
        # pass data between callbacks via the meta parameter
        yield scrapy.Request(a.get("href"),
                             callback=self.get_news_detail,
                             meta={"item": item})
    if self.time is None or Util.format_time3(
            Util.format_time2(
                soup.find_all("time", class_="entry-date updated td-module-date")[-1].text)) >= int(self.time):
        page_nav = soup.find("div", class_="page-nav td-pb-padding-side")
        next_url = page_nav.select("a")[-1].get("href") if page_nav else None
        if next_url:
            yield scrapy.Request(next_url, meta=response.meta, callback=self.get_next_page)
    else:
        self.logger.info('Time cutoff reached')
def parse_category1(self, response): soup = BeautifulSoup(response.text, features="lxml") news_url = [] if re.match(r"\S+/page/\d+$", response.url): # https://www.tv9hindi.com/india/page/2 news_content = soup.find_all("div", class_="newsTop9")[-1].find("div", class_="col2 ComListing").select( "li h3 a") for li in news_content: news_url.append(li.get("href")) else: top_content = soup.select_one(".newsTop9 .topNewscomp ul").find_all("h3", class_="h3") for h3 in top_content: news_url.append(h3.find("a").get("href")) news_content = soup.find_all("div", class_="newsTop9")[-1].find("div", class_="col2 ComListing").select( "li h3 a") if soup.find_all("div", class_="newsTop9")[-1].find("div", class_="col2 ComListing") else None for li in news_content: news_url.append(li.get("href")) for url in news_url: yield scrapy.Request(url, callback=self.parse_detail) last_timeStamp = Util.format_time3(Util.format_time2(soup.find_all("div", class_="col2 ComListing")[-1].find_all("div", class_="catTime flex")[-1].find("span").text.strip())) next_page = soup.find("a", class_="next page-numbers").get("href") if soup.find("a", class_="next page-numbers") else None pattern_pingback = '<!--<link rel="pingback" href="https://www.tv9hindi.com/xmlrpc.php">-->' # 判断是否到达最后一页 next_soup = BeautifulSoup(requests.get(next_page).text, features="lxml") if next_page: if self.time is None: if int(next_page.rsplit("/", 1)[-1]) <= 50: yield scrapy.Request(next_page, callback=self.parse_category1) else: pass elif last_timeStamp >= int(self.time): yield scrapy.Request(next_page, callback=self.parse_category1) else: self.logger.info("时间截止") else: self.logger.info("该目录已经结束")
def parse_2(self, response):
    page_soup = BeautifulSoup(response.text, 'lxml')
    if len(page_soup.select('article.article_content div.tag-content-left > a')):
        # anchors of the individual news stories on this page
        news_list = page_soup.select('article.article_content div.tag-content-left > a')
        for i in news_list:
            news_url = 'https://hindi.mykhel.com' + i.attrs['href']
            yield Request(news_url, callback=self.parse_3)
        next_sel = 'section div.prev-next-story.clearfix.click-for-more a.next.half_width'
        next_links = page_soup.select(next_sel)
        if next_links and next_links[0].get('href'):
            next_page_url = 'https://hindi.mykhel.com/' + next_links[0].get('href')
            # url of the last story on the next page
            last_time_url = ('https://hindi.mykhel.com' +
                             BeautifulSoup(requests.get(next_page_url).text, 'lxml')
                             .select('article.article_content div.tag-content-left > a')[-1].attrs['href'])
            # publication time of that last story
            last_time = time_font(
                BeautifulSoup(requests.get(last_time_url).text, 'lxml')
                .select('div.os-breadcrumb div.os-posted-by time')[0].get('datetime'))
            if self.time is None or Util.format_time3(last_time) >= int(self.time):  # time cutoff
                if len(page_soup.select('section div.prev-next-story.clearfix.click-for-more a')):
                    yield Request(next_page_url, callback=self.parse_2)
            else:
                self.logger.info('Time cutoff reached')
    elif len(page_soup.select('div.os-sports-m-news.clearfix div.os-more.clearfix a')):
        list_url = ('https://hindi.mykhel.com' +
                    page_soup.select('div.os-sports-m-news.clearfix div.os-more.clearfix a')[0].get('href'))
        yield Request(list_url, callback=self.parse_2)
def parse_category(self, response):
    soup = BeautifulSoup(response.text, features="lxml")
    category_block = soup.find('div', class_="box-shadow-block box-shadow-1 text-center")
    category = category_block.text if category_block else None
    articles = soup.find_all('div', class_="col-md-6 col-sm-6 col-xs-12") or []
    article_hrefs = []
    for article in articles:
        if article.select_one('a').get('href') == "Nation.html":
            article_hrefs.append(article.select('a')[-1].get('href'))
        else:
            article_hrefs.append(article.select_one('a').get('href'))
    for detail_url in article_hrefs:
        # .content is needed here: the Response object itself cannot be fed
        # to BeautifulSoup, so parse the raw bytes instead
        check_soup = BeautifulSoup(requests.get(detail_url).content, features="lxml")
        date_span = check_soup.select_one('div.date_and_author_container span')
        if date_span and date_span.text.split(" ")[1]:
            temp_time = date_span.text.split(" ")[1]
        else:
            temp_time = check_soup.select_one('td.miscinfo').text.split(" ")[1]
        adjusted_time = time_adjustment(temp_time)
        self.logger.info("Current time: " + adjusted_time + "$$$$$$$$$$$$$$")
        if self.time is None or Util.format_time3(adjusted_time) >= int(self.time):
            yield Request(detail_url, callback=self.parse_detail, meta={'category': category})
        else:
            self.logger.info("Time cutoff reached")
            break
def headlinehindi_time_switch1(time_string):
    # e.g. "2020-12-23T17:50:27+05:30" -> timestamp
    time_string = time_string.rsplit("+", 1)[0]
    return Util.format_time3(str(datetime.strptime(time_string, "%Y-%m-%dT%H:%M:%S")))
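# An equivalent sketch using datetime.fromisoformat (available since Python
# 3.7), which understands the "+05:30" offset directly, so the manual rsplit
# is unnecessary; headlinehindi_time_switch1_iso is a hypothetical variant,
# not part of the original spider.
def headlinehindi_time_switch1_iso(time_string):
    dt = datetime.fromisoformat(time_string)  # keeps the +05:30 offset
    return Util.format_time3(dt.strftime("%Y-%m-%d %H:%M:%S"))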
def parse_1(self, response):
    page_soup = BeautifulSoup(response.text, 'lxml')
    # case 1: normal pagination is available
    if len(page_soup.select('article.topicBox div.pagination.border0 li.last')):
        next_page_url = page_soup.select(
            'article.topicBox div.pagination.border0 li.last > a')[0].get('href')
        this_page_url = response.url
        j = 0  # j bounds the number of pages crawled
        while j < 3 and next_page_url:
            this_page_soup = BeautifulSoup(requests.get(this_page_url).text, 'lxml')
            news_url_list = this_page_soup.select('ul.topicList > li > a')
            last_new_url = news_url_list[-1].get('href')
            last_time = time_font(
                BeautifulSoup(requests.get(last_new_url).text, 'lxml')
                .select('div.articleHd div.dateInfo span.fl')[0].text)
            if self.time is None or Util.format_time3(last_time) >= int(self.time):
                for news_url in news_url_list:
                    yield Request(news_url.get('href'), callback=self.parse_2)
            else:
                self.logger.info('Time cutoff reached')
                break
            j += 1
            this_page_url = next_page_url
    # case 2: special pagination (page number appended to the URL)
    elif len(page_soup.select(
            'article.topicBox div.newsFJagran div.pagination.border0 ul > li')):
        self.logger.info('Special pagination on this page: ' + response.url)
        j = 1
        this_page_url = response.url + '/' + str(j)
        while j < 3:
            this_page_soup = BeautifulSoup(requests.get(this_page_url).text, 'lxml')
            news_url_list = this_page_soup.select('ul.topicList > li > a')
            try:
                last_new_url = news_url_list[-1].get('href')
            except IndexError:
                # empty page: log and stop rather than reuse a stale url
                self.logger.info(response.url)
                break
            last_time = time_font(
                BeautifulSoup(requests.get(last_new_url).text, 'lxml')
                .select('div.articleHd div.dateInfo span.fl')[0].text)
            if self.time is None or Util.format_time3(last_time) >= int(self.time):
                for news_url in news_url_list:
                    # the original built this Request without yielding it,
                    # so it was silently discarded
                    yield Request(news_url.get('href'), callback=self.parse_2)
            else:
                self.logger.info('Time cutoff reached')
                break
            j += 1
            this_page_url = response.url + '/' + str(j)
    # case 3: no pagination, crawl the list directly
    else:
        news_url_list = page_soup.select('ul.topicList > li > a')
        for news_url in news_url_list:
            last_time = time_font(
                BeautifulSoup(requests.get(news_url.get('href')).text, 'lxml')
                .select('div.articleHd div.dateInfo span.fl')[0].text)
            if self.time is None or Util.format_time3(last_time) >= int(self.time):
                yield Request(news_url.get('href'), callback=self.parse_2)
def parse_news_list(self, response):
    home_url = 'https://doh.gov.ph/'
    time2 = ''
    soup = BeautifulSoup(response.text, "html.parser")
    news_list = soup.select("div.panel>div>div.view-content>div")  # news listing
    months = {"January": "01", "February": "02", "March": "03", "April": "04",
              "May": "05", "June": "06", "July": "07", "August": "08",
              "September": "09", "October": "10", "November": "11", "December": "12"}
    for news in news_list:
        # publication date, "Month DD, YYYY"; time-of-day defaults to midnight
        date = news.find("span", class_="field-content content-time").text.strip()
        dtime = " 00:00:00"
        pub_time_list = re.split(" |,", date) if date else None
        if pub_time_list:
            num = months.get(pub_time_list[0])
            if num:
                time2 = pub_time_list[-1] + "-" + num + "-" + pub_time_list[1] + dtime
        response.meta['pub_time'] = time2
        url = urljoin(home_url, news.find("a").get("href"))
        yield scrapy.Request(url, meta=response.meta, callback=self.parse_news)
    # pagination
    next_page = ("https://doh.gov.ph/" + soup.select_one("li.pager-next>a").get("href")
                 if soup.select_one("li.pager-next>a") else None)
    if self.time is None or (time2 and Util.format_time3(time2) >= int(self.time)):
        if next_page:
            yield scrapy.Request(next_page, meta=response.meta, callback=self.parse_news_list)
    else:
        self.logger.info('Time cutoff reached')
def parse_news_list(self, response):
    soup = BeautifulSoup(response.text, "html.parser")
    # first news item on each page
    top_cell = soup.find("td", {"valign": "top"})
    first_link = top_cell.select_one("td>font>a")
    web = first_link.text.strip() if first_link.text else None
    if web and web == "PhilBoxing.com":
        url = top_cell.select_one("td>a").get("href")
        blurb = top_cell.select_one("td>font.newsblurb")
        abstract = blurb.text.strip().split("\r\n\r\n") if blurb.text else None
        response.meta["abstract"] = ' '.join(abstract) if abstract else None
        if url:
            yield scrapy.Request(url, meta=response.meta, callback=self.parse_news)
    # remaining news items
    table = soup.find("table", {"width": "100%", "height": "100%"})
    p = table.select("p")[2] if table and len(table.select("p")) > 2 else None
    web_list = p.select("p>font>a") if p else None
    news_list = p.select("p>a") if p else None
    abstract_list = p.select("p>font.newsblurb") if p else None
    if web_list:
        i = 0
        for web in web_list:
            if web.text.strip() == "PhilBoxing.com":
                url = news_list[2 * i].get("href") if news_list and news_list[2 * i].get("href") else None
                abstract = (abstract_list[i].text.strip().split("\r\n\r\n")
                            if abstract_list and abstract_list[i].text else None)
                response.meta["abstract"] = ' '.join(abstract) if abstract else None
                if url:
                    yield scrapy.Request(url, meta=response.meta, callback=self.parse_news)
            i += 1
    # pagination; dates look like "... DD Mon YYYY" (the site spells September "Sept")
    months = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05", "Jun": "06",
              "Jul": "07", "Aug": "08", "Sept": "09", "Oct": "10", "Nov": "11", "Dec": "12"}
    time_list = p.find_all("font", {"size": "2"})[-1].text.split(" ")
    time = None
    if time_list:
        num = months.get(time_list[-2])
        if num:
            time = time_list[-1] + "-" + num + "-" + time_list[-3] + " 00:00:00"
    if time and (self.time is None or Util.format_time3(time) >= int(self.time)):
        font_list = soup.select("font.boxertablebody") or None
        a_list = font_list[-1].select("a") if font_list and font_list[-1].select("a") else None
        next_page = ("http://philboxing.com/news/" + a_list[0].get("href")
                     if a_list and a_list[0].get("href") else None)
        if next_page:
            yield scrapy.Request(next_page, meta=response.meta, callback=self.parse_news_list)
    else:
        self.logger.info('Time cutoff reached')
def parse(self, response, **kwargs):
    header = {
        'Accept': 'application/json,text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '11',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'PHPSESSID=h2q86fctchauhq3ngeg8cu2ld7',
        'Host': 'www.macaupostdaily.com',
        'Origin': 'https://www.macaupostdaily.com',
        'Referer': 'https://www.macaupostdaily.com/',
        'sec-ch-ua': 'Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    url = 'https://www.macaupostdaily.com/'
    url_list, time_list, title_list, img_list = [], [], [], []
    news_soup = BeautifulSoup(response.text, 'lxml')
    for i in news_soup.find('ul', class_='new_list', id='fu').find_all('li'):
        url_list.append('https://www.macaupostdaily.com' + i.find('a').get('href'))
        time_list.append(i.find('div', class_='time').text.strip('\n').strip(' ') + ":00")
        title_list.append(i.find('strong').text.strip('\n'))
        img_list.append(url + i.find('img').get('src'))
    # subsequent pages are served by a POST endpoint
    request_url = 'https://www.macaupostdaily.com/index.php/Article/news_list'
    i = 2
    while True:
        data = {'cid': '', 'page': "%d" % i}
        rep = requests.post(url=request_url, data=data, headers=header).json()
        for entry in rep['list']:  # avoid shadowing the built-in list()
            url_list.append("https://www.macaupostdaily.com/article" + entry['id'] + ".html")
            title_list.append(entry['title'])
            time_list.append(entry['time'] + ":00")
            img_list.append('https://www.macaupostdaily.com' + entry['img'])
    for new in range(len(url_list)):
            if self.time is None or Util.format_time3(time_list[new]) >= int(self.time):
                yield Request(url_list[new],
                              callback=self.parse_2,
                              meta={'time': time_list[new],
                                    'title': title_list[new],
                                    'img': img_list[new]})
        # note: with no cutoff configured this loop would paginate indefinitely
        if self.time is not None and Util.format_time3(time_list[-1]) < int(self.time):
            break
        url_list, time_list, img_list, title_list = [], [], [], []
        i += 1
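# A sketch of the same POST pagination routed through Scrapy instead of
# blocking requests.post calls: scrapy.FormRequest form-encodes the body and
# lets the downloader manage headers and cookies. request_post_page and
# parse_list_page are hypothetical names, not part of the original spider;
# a caller would do `yield self.request_post_page(2)`.
def request_post_page(self, page):
    return scrapy.FormRequest(
        url='https://www.macaupostdaily.com/index.php/Article/news_list',
        formdata={'cid': '', 'page': str(page)},
        callback=self.parse_list_page,
        meta={'page': page})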
def nayalook_time_switch1_2(time_string):
    # e.g. "3 days ago" -> timestamp
    return Util.format_time3(str(Util.format_time2(time_string)))
def nayalook_time_switch1(time_string):
    # e.g. "30/11/2020" -> timestamp
    return Util.format_time3(str(datetime.strptime(time_string, "%d/%m/%Y")))
def parse_2(self, response, **kwargs):
    page = BeautifulSoup(response.text, 'lxml')
    category1 = page.find('h1', class_='page-title').text

    def extract_last_pub_time(page):
        # the three list layouts expose the date of the last entry differently
        if page.find('div', class_='year-month') is not None:
            time = (page.find('div', class_='year-month').find('em').text.strip('-').strip(' ')
                    + ' '
                    + page.find('div', class_='mag-box-container clearfix')
                          .find_all('div', class_='day-month')[-1].text)
            return time_font_2(time)
        if page.find('div', class_='masonry-grid-wrapper masonry-with-spaces') is not None:
            return time_font(
                page.find('div', class_='masonry-grid-wrapper masonry-with-spaces')
                    .find_all('span', class_='date meta-item tie-icon')[-1].text)
        if page.find('ul', id='posts-container', class_='posts-items') is not None:
            return time_font(
                page.find('ul', id='posts-container', class_='posts-items')
                    .find_all('span', class_='date meta-item tie-icon')[-1].text)
        return None

    if page.find('ul', id='posts-container', class_='posts-items') is not None:
        for i in page.find('ul', id='posts-container',
                           class_='posts-items').find_all('a', class_='post-thumb'):
            images = i.find('img').get('data-src')
            yield Request(i.attrs['href'], callback=self.parse_3,
                          meta={'images': images, 'category1': category1})
    else:
        for i in page.find('div', class_='masonry-grid-wrapper masonry-with-spaces'
                           ).find_all('div', class_='featured-area'):
            images = i.find('img').get('data-src')
            yield Request(i.find('a').get('href'), callback=self.parse_3,
                          meta={'images': images, 'category1': category1})
    # check whether another page can be crawled (first pagination style)
    if page.find('span', class_='last-page first-last-pages') is not None:
        next_page = page.find('span', class_='last-page first-last-pages').find('a').attrs['href']
        pub_time = extract_last_pub_time(page)
        if self.time is None or Util.format_time3(pub_time) >= int(self.time):
            yield Request(next_page, callback=self.parse_2)
    # second style of second-level category pagination
    elif page.find('li', class_='the-next-page') is not None:
        next_page = page.find('li', class_='the-next-page').find('a').attrs['href']
        pub_time = extract_last_pub_time(page)
        if self.time is None or Util.format_time3(pub_time) >= int(self.time):
            yield Request(next_page, callback=self.parse_2)
def nhandan_time_switch1(time_string):
    # e.g. "2020年12月25日 星期五" -> timestamp (the trailing weekday is dropped)
    time_string = time_string.rsplit(" ", 1)[0]
    return Util.format_time3(str(datetime.strptime(time_string, "%Y年%m月%d日")))
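# A quick self-check sketch for the helper above: strptime matches the CJK
# date markers as literal characters, so no locale setup is needed. The
# expected string relies on datetime's default str() formatting; this block
# is an illustration, not part of the original spiders.
if __name__ == "__main__":
    parsed = datetime.strptime("2020年12月25日", "%Y年%m月%d日")
    assert str(parsed) == "2020-12-25 00:00:00"  # weekday already stripped by rsplit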
def parse_news(self, response):
    item = DemoItem()
    item["category1"] = response.meta["category1"]
    item["category2"] = response.meta["category2"]
    response1 = response.text.replace('<br>', ' ')
    soup = BeautifulSoup(response1, "html.parser")
    # publication time, e.g. "... DD Mon YYYY, HH:MM ..." ("Sept" for September)
    months = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05", "Jun": "06",
              "Jul": "07", "Aug": "08", "Sept": "09", "Oct": "10", "Nov": "11", "Dec": "12"}
    temp = soup.select_one("div.title_text") if soup.select_one("div.title_text") else None
    pub_time_list = re.split(" |,", temp.select_one("p").text) if temp.select_one("p").text else None
    if pub_time_list:
        num = months.get(pub_time_list[-5])
        # None if the month abbreviation is unrecognised
        time = (pub_time_list[-4] + "-" + num + "-" + pub_time_list[-6]
                + " " + pub_time_list[-2] + ":00") if num else None
        item["pub_time"] = time
        if self.time is None or Util.format_time3(time) >= int(self.time):
            # title
            item["title"] = temp.find("a").text.strip() if temp.find("a").text else None
            # abstract and body
            body = []
            temp_list = (soup.select_one("div.detail_text").find_all("p")
                         if soup.select_one("div.detail_text").find_all("p") else None)
            if temp_list:
                for t in temp_list:
                    body.append(t.text.strip())
                item["abstract"] = body[0]
                item["body"] = "\n".join(body)
            else:
                item["abstract"] = None
                item["body"] = None
            # images
            images = []
            image_list = soup.select("div.article_image") or None
            if image_list:
                for image in image_list:
                    images.append(image.find("img").get("src"))
            item["images"] = images
            yield item
        else:
            self.logger.info('Time cutoff reached')
def parse_eassys(self, response):
    # paginates each second-level category and extracts article urls
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    if re.match(r'.*photo-gallery.*', response.url):
        # photo galleries
        for t in soup.find_all(class_='col-sm-4 col-md-4 photo-photo-h'):
            try:
                url = 'https://zeenews.india.com' + t.select_one('a').get('href')
            except AttributeError:
                continue
            response.meta['title'] = t.select_one('h3').text
            response.meta['images'] = [t.select_one('img').get('src')]
            response.meta['pub_time'] = t.select_one('.photo-date').text.strip()
            if self.time is None or Util.format_time3(
                    Util.format_time2(t.select_one('.photo-date').text.strip())) >= int(self.time):
                yield Request(url, callback=self.parse_item_photo, meta=response.meta)
            else:
                flag = False
                self.logger.info('Time cutoff reached')
                break
    elif re.match(r'.*video.*', response.url):
        # videos
        for i in soup.find_all(attrs={'class': 'mini-video mini-video-h margin-bt30px'}):
            url = 'https://zeenews.india.com' + i.select_one('a').get('href')
            response.meta['images'] = [i.select_one('img').get('src')]
            response.meta['title'] = i.select_one('h3').text
            response.meta['pub_time'] = i.select_one('.date').text.strip()
            if self.time is None or Util.format_time3(
                    Util.format_time2(i.select_one('span.date').text.strip())) >= int(self.time):
                yield Request(url, callback=self.parse_item_video, meta=response.meta)
            else:
                flag = False
                self.logger.info('Time cutoff reached')
                break
    else:
        # regular articles
        for t in soup.find_all(class_='section-article margin-bt30px clearfix'):
            url = 'https://zeenews.india.com' + t.select_one('a').get('href')
            response.meta['title'] = t.select_one('h3.margin-bt10px').text
            tt = t.select_one('span.date').text.strip().split()
            try:
                # map the Hindi month name before formatting
                pub_time = (self.hindi_month[tt[0]] + ' ' + tt[1] + ' ' + tt[2]
                            + ' ' + tt[3] + ' ' + tt[5])
            except (KeyError, IndexError):
                pub_time = t.select_one('span.date').text.strip()
            response.meta['pub_time'] = pub_time
            response.meta['images'] = [t.select_one('img').get('src')]
            if self.time is None or Util.format_time3(Util.format_time2(pub_time)) >= int(self.time):
                yield Request(url=url, meta=response.meta, callback=self.parse_item)
            else:
                flag = False
                self.logger.info('Time cutoff reached')
                break
    if flag:
        try:
            next_page = ('https://zeenews.india.com/' +
                         soup.find(class_='next last').select_one('a').get('href'))
            yield Request(next_page, callback=self.parse_eassys, meta=response.meta)
        except AttributeError:
            self.logger.info('No more pages!')
def parse_news_list(self, response): soup = BeautifulSoup(response.text, "html.parser") news_list = soup.select("tbody>tr") if soup.select("tbody>tr") else [] time2 = None for news in news_list: url = "https://dfa.gov.ph" + news.select_one("a").get("href") pub_time_list = news.find( "td", class_="list-date small").text.strip().split(" ") if news.find( "td", class_="list-date small") else None if pub_time_list: if pub_time_list[1] == "January": time2 = pub_time_list[2] + "-01-" + pub_time_list[ 0] + " 00:00:00" elif pub_time_list[1] == "February": time2 = pub_time_list[2] + "-02-" + pub_time_list[ 0] + " 00:00:00" elif pub_time_list[1] == "March": time2 = pub_time_list[2] + "-03-" + pub_time_list[ 0] + " 00:00:00" elif pub_time_list[1] == "April": time2 = pub_time_list[2] + "-04-" + pub_time_list[ 0] + " 00:00:00" elif pub_time_list[1] == "May": time2 = pub_time_list[2] + "-05-" + pub_time_list[ 0] + " 00:00:00" elif pub_time_list[1] == "June": time2 = pub_time_list[2] + "-06-" + pub_time_list[ 0] + " 00:00:00" elif pub_time_list[1] == "July": time2 = pub_time_list[2] + "-07-" + pub_time_list[ 0] + " 00:00:00" elif pub_time_list[1] == "August": time2 = pub_time_list[2] + "-08-" + pub_time_list[ 0] + " 00:00:00" elif pub_time_list[1] == "September": time2 = pub_time_list[2] + "-09-" + pub_time_list[ 0] + " 00:00:00" elif pub_time_list[1] == "October": time2 = pub_time_list[2] + "-10-" + pub_time_list[ 0] + " 00:00:00" elif pub_time_list[1] == "November": time2 = pub_time_list[2] + "-11-" + pub_time_list[ 0] + " 00:00:00" elif pub_time_list[1] == "December": time2 = pub_time_list[2] + "-12-" + pub_time_list[ 0] + " 00:00:00" response.meta["pub_time"] = time2 yield scrapy.Request(url, meta=response.meta, callback=self.parse_news) next_page = "https://dfa.gov.ph" + soup.select_one( "li.pagination-next>a").get("href") if soup.select_one( "li.pagination-next>a") else None if self.time == None or (time2 and Util.format_time3(time2) >= int(self.time)): if next_page: yield scrapy.Request(next_page, meta=response.meta, callback=self.parse_news_list) else: self.logger.info('时间截止')