def _4matTime(self, time):
    timels = time.strip().split()
    time = timels[1] + ' ' + timels[0] + ' ' + timels[2]
    return Util.format_time2(time)
from selenium import webdriver
import unittest
from ddt import data, ddt
import time

from demo.util import Util

testdata = Util.read_excel("D:/python-webUI-auto/data/data.xlsx", "Sheet1")


@ddt
class Search_by_ddt(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Chrome("../tools/chromedriver.exe")
        self.driver.maximize_window()
        self.driver.get("https://www.baidu.com")
        self.driver.implicitly_wait(5)

    @data(*testdata)
    def test_search_by_ddt(self, data):
        search_string = data["content"]
        print("Search term ->: %s" % search_string)
        search_input = self.driver.find_element_by_id('kw')
        # Type the data-driven search string and submit the search.
        search_input.send_keys(search_string)
        time.sleep(3)
        search_input.submit()

    def tearDown(self):
        """Post-test cleanup; essentially just closes the browser."""
        self.driver.quit()
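# For reference, a minimal Selenium 4+ sketch of the same setup: Selenium 4
# removed the positional executable_path argument and the find_element_by_*
# helpers used above. The paths and locators mirror the test above; this is an
# assumption about how the suite would be ported, not part of the original.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(service=Service("../tools/chromedriver.exe"))
driver.get("https://www.baidu.com")
search_input = driver.find_element(By.ID, "kw")  # same locator as the test
search_input.send_keys("example query")
search_input.submit()
driver.quit()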
def parse_news(self, response):
    soup = BeautifulSoup(response.text, "html.parser")
    # Publication time: "Month day, year" pulled from the page header.
    pub_time_list = re.split(
        " |,", soup.select_one("h2.page-header>small").text) if soup.select_one(
            "h2.page-header>small") else None
    time2 = Util.format_time()
    months = {
        "January": "01", "February": "02", "March": "03", "April": "04",
        "May": "05", "June": "06", "July": "07", "August": "08",
        "September": "09", "October": "10", "November": "11", "December": "12",
    }
    if pub_time_list and pub_time_list[-4] in months:
        time2 = (pub_time_list[-1] + "-" + months[pub_time_list[-4]] + "-" +
                 pub_time_list[-3] + " 00:00:00")
    pub_time = time2
    # Title: drop the <small> timestamp from the header before reading it.
    temp = soup.select_one("h2.page-header")
    for s in temp('small'):
        s.extract()
    title = temp.text.strip()
    # Body: split on newlines and drop empty segments.
    body_list2 = []
    body_list = re.split("\r\n|\n",
                         soup.select_one("div.col-md-12>p").text.strip())
    for b in body_list:
        if b:
            body_list2.append(b)
    body = "\n".join(body_list2)
    # Abstract: first body paragraph.
    abstract = body_list2[0]
    # Images
    images = []
    temp_list = soup.select("center>img")
    for t in temp_list:
        images.append("http://www.tourism.gov.ph" + t.get("src"))

    item = DemoItem()
    item["category1"] = "News Updates"
    item["category2"] = "Featured News"
    item["pub_time"] = pub_time
    item["title"] = title
    item["abstract"] = abstract
    item["body"] = body
    item["images"] = images
    yield item
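# The month-name mapping above recurs in several parsers below. A shared helper
# along these lines could replace each per-function table; the name, signature,
# and MONTHS constant are illustrative, not part of the original code:
MONTHS = {
    "January": "01", "Jan": "01", "February": "02", "Feb": "02",
    "March": "03", "Mar": "03", "April": "04", "Apr": "04", "May": "05",
    "June": "06", "Jun": "06", "July": "07", "Jul": "07",
    "August": "08", "Aug": "08", "September": "09", "Sept": "09",
    "October": "10", "Oct": "10", "November": "11", "Nov": "11",
    "December": "12", "Dec": "12",
}


def month_day_year_to_time_string(month, day, year, clock="00:00:00"):
    # Returns "YYYY-MM-DD HH:MM:SS", or None for an unrecognized month name.
    mm = MONTHS.get(month)
    return "%s-%s-%s %s" % (year, mm, day, clock) if mm else None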
def nayalook_time_switch1_2(time_string):  # e.g. "3 days ago"
    # Returns a timestamp.
    return Util.format_time3(str(Util.format_time2(time_string)))
def nayalook_time_switch2(time_string):  # e.g. "3 days ago"
    # Returns "%Y-%m-%d %H:%M:%S".
    return Util.format_time2(time_string)
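# Util.format_time2 is the project's own normalizer for relative phrases like
# "3 days ago"; its internals are not shown in this excerpt. A minimal
# standard-library sketch of the conversion it presumably performs (the
# function name and supported units are assumptions):
from datetime import datetime, timedelta


def relative_to_absolute(time_string):
    # "3 days ago" -> "YYYY-MM-DD HH:MM:SS" for minute/hour/day/week phrases.
    count, unit = time_string.split()[:2]
    delta = {"minute": timedelta(minutes=int(count)),
             "hour": timedelta(hours=int(count)),
             "day": timedelta(days=int(count)),
             "week": timedelta(weeks=int(count))}[unit.rstrip("s")]
    return (datetime.now() - delta).strftime("%Y-%m-%d %H:%M:%S")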
def parse(self, response, **kwargs):
    # Session cookie and browser headers captured from a live session.
    # Content-Length is intentionally omitted: requests computes it per
    # request, and the captured value goes stale once the page number grows.
    header = {
        'Accept': 'application/json,text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'PHPSESSID=h2q86fctchauhq3ngeg8cu2ld7',
        'Host': 'www.macaupostdaily.com',
        'Origin': 'https://www.macaupostdaily.com',
        'Referer': 'https://www.macaupostdaily.com/',
        'sec-ch-ua': 'Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    url = 'https://www.macaupostdaily.com/'
    url_list = []
    time_list = []
    title_list = []
    img_list = []
    news_soup = BeautifulSoup(response.text, 'lxml')
    # First page: articles rendered directly in the HTML list.
    for i in news_soup.find('ul', class_='new_list', id='fu').find_all('li'):
        url_list.append('https://www.macaupostdaily.com' + i.find('a').get('href'))
        time_list.append(
            i.find('div', class_='time').text.strip('\n').strip(' ') + ":00")
        title_list.append(i.find('strong').text.strip('\n'))
        img_list.append(url + i.find('img').get('src'))
    # Later pages are fetched from this POST endpoint.
    request_url = 'https://www.macaupostdaily.com/index.php/Article/news_list'
    i = 2
    while True:
        data = {'cid': '', 'page': "%d" % i}
        rep = requests.post(url=request_url, data=data, headers=header).json()
        for entry in rep['list']:
            url_list.append("https://www.macaupostdaily.com/article" +
                            entry['id'] + ".html")
            title_list.append(entry['title'])
            time_list.append(entry['time'] + ":00")
            img_list.append('https://www.macaupostdaily.com' + entry['img'])
        for new in range(0, len(url_list)):
            if self.time is None or Util.format_time3(
                    time_list[new]) >= int(self.time):
                yield Request(url_list[new],
                              callback=self.parse_2,
                              meta={
                                  'time': time_list[new],
                                  'title': title_list[new],
                                  'img': img_list[new]
                              })
        # Stop paging once the oldest article on this page is past the cutoff.
        if self.time is not None and Util.format_time3(time_list[-1]) < int(self.time):
            break
        url_list = []
        time_list = []
        img_list = []
        title_list = []
        i = i + 1
def nayalook_time_switch1(time_string):  # e.g. "30/11/2020"
    # Returns a timestamp.
    return Util.format_time3(str(datetime.strptime(time_string, "%d/%m/%Y")))
def nhandan_time_switch1(time_string):  # e.g. "2020年12月25日 星期五"
    # Returns a timestamp; drop the trailing weekday before parsing.
    time_string = time_string.rsplit(" ", 1)[0]
    return Util.format_time3(str(datetime.strptime(time_string, "%Y年%m月%d日")))
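# Sanity check for the CJK format above: strptime treats the 年/月/日 characters
# as ordinary literals, so dropping the trailing weekday ("星期五") is the only
# preprocessing needed.
from datetime import datetime

assert str(datetime.strptime("2020年12月25日", "%Y年%m月%d日")) == "2020-12-25 00:00:00"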
def parse_2(self, response, **kwargs):
    page = BeautifulSoup(response.text, 'lxml')
    category1 = page.find('h1', class_='page-title').text
    if page.find('ul', id='posts-container', class_='posts-items') is not None:
        for i in page.find('ul', id='posts-container',
                           class_='posts-items').find_all('a', class_='post-thumb'):
            images = i.find('img').get('data-src')
            yield Request(i.attrs['href'],
                          callback=self.parse_3,
                          meta={'images': images, 'category1': category1})
    else:
        for i in page.find('div',
                           class_='masonry-grid-wrapper masonry-with-spaces'
                           ).find_all('div', class_='featured-area'):
            images = i.find('img').get('data-src')
            yield Request(i.find('a').get('href'),
                          callback=self.parse_3,
                          meta={'images': images, 'category1': category1})
    # Check whether a next page exists (first pager style).
    if page.find('span', class_='last-page first-last-pages') is not None:
        next_page = page.find(
            'span', class_='last-page first-last-pages').find('a').attrs['href']
        if page.find('div', class_='year-month') is not None:
            time = page.find('div', class_='year-month').find('em').text.strip('-').strip(' ') + ' ' + \
                page.find('div', class_='mag-box-container clearfix').find_all(
                    'div', class_='day-month')[-1].text
            pub_time = time_font_2(time)
        elif page.find('div', class_='masonry-grid-wrapper masonry-with-spaces') is not None:
            pub_time = time_font(
                page.find('div', class_='masonry-grid-wrapper masonry-with-spaces'
                          ).find_all('span', class_='date meta-item tie-icon')[-1].text)
        elif page.find('ul', id='posts-container', class_='posts-items') is not None:
            pub_time = time_font(
                page.find('ul', id='posts-container', class_='posts-items'
                          ).find_all('span', class_='date meta-item tie-icon')[-1].text)
        if self.time is None or Util.format_time3(pub_time) >= int(self.time):
            yield Request(next_page, callback=self.parse_2)
    # Second pager style for the other kind of category page; the pub_time
    # detection is the same (see the sketch after this function).
    elif page.find('li', class_='the-next-page') is not None:
        next_page = page.find('li', class_='the-next-page').find('a').attrs['href']
        if page.find('div', class_='year-month') is not None:
            time = page.find('div', class_='year-month').find('em').text.strip('-').strip(' ') + ' ' + \
                page.find('div', class_='mag-box-container clearfix').find_all(
                    'div', class_='day-month')[-1].text
            pub_time = time_font_2(time)
        elif page.find('div', class_='masonry-grid-wrapper masonry-with-spaces') is not None:
            pub_time = time_font(
                page.find('div', class_='masonry-grid-wrapper masonry-with-spaces'
                          ).find_all('span', class_='date meta-item tie-icon')[-1].text)
        elif page.find('ul', id='posts-container', class_='posts-items') is not None:
            pub_time = time_font(
                page.find('ul', id='posts-container', class_='posts-items'
                          ).find_all('span', class_='date meta-item tie-icon')[-1].text)
        if self.time is None or Util.format_time3(pub_time) >= int(self.time):
            yield Request(next_page, callback=self.parse_2)
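# The pub_time detection above is duplicated verbatim across the two pager
# branches. An illustrative helper (not in the original spider) that both
# branches could call; it mirrors the same three selectors in the same
# priority order and reuses the existing time_font/time_font_2 formatters:
def detect_latest_pub_time(page):
    # Returns the oldest visible publication time string on the page, or None.
    if page.find('div', class_='year-month') is not None:
        t = (page.find('div', class_='year-month').find('em').text.strip('-').strip(' ') + ' ' +
             page.find('div', class_='mag-box-container clearfix').find_all(
                 'div', class_='day-month')[-1].text)
        return time_font_2(t)
    grid = page.find('div', class_='masonry-grid-wrapper masonry-with-spaces')
    if grid is not None:
        return time_font(grid.find_all('span', class_='date meta-item tie-icon')[-1].text)
    posts = page.find('ul', id='posts-container', class_='posts-items')
    if posts is not None:
        return time_font(posts.find_all('span', class_='date meta-item tie-icon')[-1].text)
    return None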
def parse_news_list(self, response):
    soup = BeautifulSoup(response.text, "html.parser")
    news_list = soup.select("tbody>tr") if soup.select("tbody>tr") else []
    time2 = None
    # List dates read "day Month year"; map the month name to its number.
    months = {
        "January": "01", "February": "02", "March": "03", "April": "04",
        "May": "05", "June": "06", "July": "07", "August": "08",
        "September": "09", "October": "10", "November": "11", "December": "12",
    }
    for news in news_list:
        url = "https://dfa.gov.ph" + news.select_one("a").get("href")
        pub_time_list = news.find(
            "td", class_="list-date small").text.strip().split(" ") if news.find(
                "td", class_="list-date small") else None
        if pub_time_list and pub_time_list[1] in months:
            time2 = (pub_time_list[2] + "-" + months[pub_time_list[1]] + "-" +
                     pub_time_list[0] + " 00:00:00")
        response.meta["pub_time"] = time2
        yield scrapy.Request(url, meta=response.meta, callback=self.parse_news)
    next_page = "https://dfa.gov.ph" + soup.select_one(
        "li.pagination-next>a").get("href") if soup.select_one(
            "li.pagination-next>a") else None
    if self.time is None or (time2 and Util.format_time3(time2) >= int(self.time)):
        if next_page:
            yield scrapy.Request(next_page,
                                 meta=response.meta,
                                 callback=self.parse_news_list)
    else:
        self.logger.info('time cutoff reached')
def parse_news_list(self, response):
    soup = BeautifulSoup(response.text, "html.parser")
    # First (featured) news item on each page.
    web = soup.find("td", {"valign": "top"}).select_one(
        "td>font>a").text.strip() if soup.find("td", {"valign": "top"}).select_one(
            "td>font>a").text else None
    if web and web == "PhilBoxing.com":
        url = soup.find("td", {"valign": "top"}).select_one(
            "td>a").get("href") if soup.find("td", {"valign": "top"}).select_one(
                "td>a").get("href") else None
        abstract = soup.find("td", {"valign": "top"}).select_one(
            "td>font.newsblurb").text.strip().split("\r\n\r\n") if soup.find(
                "td", {"valign": "top"}).select_one("td>font.newsblurb").text else None
        response.meta["abstract"] = ' '.join(abstract) if abstract else None
        if url:
            yield scrapy.Request(url, meta=response.meta, callback=self.parse_news)
    # Remaining news items on the page.
    table = soup.find("table", {"width": "100%", "height": "100%"})
    p = table.select("p")[2] if table and len(table.select("p")) > 2 else None
    web_list = p.select("p>font>a") if p and p.select("p>font>a") else None
    news_list = p.select("p>a") if p and p.select("p>a") else None
    abstract_list = p.select(
        "p>font.newsblurb") if p and p.select("p>font.newsblurb") else None
    i = 0
    if web_list:
        for web in web_list:
            if web.text.strip() == "PhilBoxing.com":
                url = news_list[2 * i].get("href") if news_list and news_list[
                    2 * i].get("href") else None
                abstract = abstract_list[i].text.strip().split(
                    "\r\n\r\n") if abstract_list and abstract_list[i].text else None
                response.meta["abstract"] = ' '.join(abstract) if abstract else None
                if url:
                    yield scrapy.Request(url,
                                         meta=response.meta,
                                         callback=self.parse_news)
            i += 1
    # Pagination: the footer date reads "... day Mon year" (the site
    # abbreviates September as "Sept").
    months = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
              "Jun": "06", "Jul": "07", "Aug": "08", "Sept": "09", "Oct": "10",
              "Nov": "11", "Dec": "12"}
    time_list = p.find_all("font", {"size": "2"})[-1].text.split(" ") if p else None
    time = None
    if time_list and time_list[-2] in months:
        time = time_list[-1] + "-" + months[time_list[-2]] + "-" + time_list[-3] + " 00:00:00"
    if time and (self.time is None or Util.format_time3(time) >= int(self.time)):
        font_list = soup.select("font.boxertablebody") if soup.select(
            "font.boxertablebody") else None
        a_list = font_list[-1].select(
            "a") if font_list and font_list[-1].select("a") else None
        next_page = "http://philboxing.com/news/" + a_list[0].get(
            "href") if a_list and a_list[0].get("href") else None
        if next_page:
            yield scrapy.Request(next_page,
                                 meta=response.meta,
                                 callback=self.parse_news_list)
    else:
        self.logger.info('time cutoff reached')
def parse_news(self, response):
    item = DemoItem()
    item["category1"] = response.meta["category1"]
    item["category2"] = response.meta["category2"]
    response1 = response.text.replace('<br>', ' ')
    soup = BeautifulSoup(response1, "html.parser")
    # Publication time: "... day Mon year ... HH:MM ..." (note the site's "Sept").
    temp = soup.select_one("div.title_text") if soup.select_one(
        "div.title_text") else None
    pub_time_list = re.split(
        " |,", temp.select_one("p").text) if temp.select_one("p").text else None
    months = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
              "Jun": "06", "Jul": "07", "Aug": "08", "Sept": "09", "Oct": "10",
              "Nov": "11", "Dec": "12"}
    time = None
    if pub_time_list and pub_time_list[-5] in months:
        time = (pub_time_list[-4] + "-" + months[pub_time_list[-5]] + "-" +
                pub_time_list[-6] + " " + pub_time_list[-2] + ":00")
    item["pub_time"] = time
    if self.time is None or (time and Util.format_time3(time) >= int(self.time)):
        # Title
        item["title"] = temp.find("a").text.strip() if temp.find("a").text else None
        # Abstract and body
        body = []
        temp_list = soup.select_one("div.detail_text").find_all(
            "p") if soup.select_one("div.detail_text").find_all("p") else None
        if temp_list:
            for temp in temp_list:
                body.append(temp.text.strip())
            item["abstract"] = body[0]
            item["body"] = "\n".join(body)
        else:
            item["abstract"] = None
            item["body"] = None
        # Images
        images = []
        image_list = soup.select("div.article_image") if soup.select(
            "div.article_image") else None
        if image_list:
            for image in image_list:
                images.append(image.find("img").get("src"))
        item["images"] = images
        yield item
    else:
        self.logger.info('time cutoff reached')
def parse_news(self, response):
    item = DemoItem()
    item["category1"] = response.meta["category1"]
    item["category2"] = response.meta["category2"]
    item["pub_time"] = response.meta["pub_time"] if response.meta[
        "pub_time"] else Util.format_time()
    soup = BeautifulSoup(response.text, "html.parser")
    temp = soup.find("div", {"itemprop": "articleBody"}) if soup.find(
        "div", {"itemprop": "articleBody"}) else None
    # Title: the centered paragraph at the top of the article body.
    temp1 = temp.find("p", {"style": "text-align: center;"}) if temp and temp.find(
        "p", {"style": "text-align: center;"}) else None
    item["title"] = temp1.text.strip() if temp1 and temp1.text else None
    body = []
    temp2_list = temp.find_all(
        "p", {"style": "text-align: justify;"}) if temp and temp.find_all(
            "p", {"style": "text-align: justify;"}) else []
    for temp2 in temp2_list:
        # Drop inline <script> tags, then collapse non-breaking spaces.
        [s.extract() for s in temp2('script')]
        b = temp2.get_text().strip().split('\xa0') if temp2.text else None
        b = ' '.join(b) if b else None
        if b:
            body.append(b)
    item["abstract"] = body[0] if body else None
    item["body"] = '\n'.join(body) if body else None
    images = []
    temp3_list = temp.find_all(
        "p", {"style": "text-align: center;"}) if temp and temp.find_all(
            "p", {"style": "text-align: center;"}) else []
    for temp3 in temp3_list:
        image = "https://dfa.gov.ph" + temp3.find("img").get("src") if temp3.find(
            "img") and temp3.find("img").get("src") else None
        if image:
            images.append(image)
    item["images"] = images
    self.logger.info(response.meta["pub_time"])
    self.logger.info(item)
    # yield item
def parse_news(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, "html.parser")
    # Publication time: "Day, Mon day, year HH:MM" style.
    temp = soup.find("time", {"class": "css-1sbuyqj"}) if soup.find(
        "time", {"class": "css-1sbuyqj"}) else None
    temp_text = temp.text.strip() if temp and temp.text else None
    time_list = re.split(",| ", temp_text) if temp_text else None
    time2 = Util.format_time()
    months = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
              "Jun": "06", "Jul": "07", "Aug": "08", "Sept": "09", "Oct": "10",
              "Nov": "11", "Dec": "12"}
    if time_list and len(time_list) > 6 and time_list[3] in months:
        time2 = (time_list[5] + "-" + months[time_list[3]] + "-" +
                 time_list[1] + " " + time_list[6] + ":00")
    item["pub_time"] = time2
    # Article images
    images = []
    img = soup.select_one("picture>img").get("src") if soup.select_one(
        "picture>img") else None
    if img:
        images.append(img)
    item["images"] = images
    # Article body
    body = []
    p_list = soup.select("p.css-158dogj")
    for p in p_list:
        if p.text:
            body.append(p.text.strip())
    item['body'] = "\n".join(body) if body else None
    # Abstract: fall back to the first body paragraph when the summary is empty.
    abstract = soup.find("p", {
        "id": "article-summary"
    }).text.strip() if soup.find("p", {"id": "article-summary"}) else ''
    if abstract == '' or abstract == '.':
        abstract = body[0] if body else None
    item["abstract"] = abstract
    # Categories
    item["category1"] = response.meta["category1"]
    item["category2"] = response.meta["category2"]
    # Article title
    item["title"] = soup.find("h1", {
        "id": "link-1b44e840"
    }).text.strip() if soup.find("h1", {"id": "link-1b44e840"}) else None
    yield item
def parse_3(self, response):
    item = response.meta['item']
    new_soup = BeautifulSoup(response.text, 'lxml')
    try:
        item['title'] = new_soup.select(
            'div.sec-topic.nt_detailview.col-sm-16.wow.fadeInDown.animated '
            'div.col-sm-16.sec-info > h1')[0].text
        item['pub_time'] = time_font(
            new_soup.select('div.text-danger.sub-info-bordered div.time')[0].text
        ) if len(new_soup.select(
            'div.text-danger.sub-info-bordered div.time')) else Util.format_time()
        item['body'] = ''
        if len(new_soup.select('.col-sm-16.sec-info p')):
            for bodys in new_soup.select('.col-sm-16.sec-info p'):
                item['body'] += bodys.text
        else:
            for bodys in new_soup.select('.carousel-caption p'):
                item['body'] += bodys.text
        item['abstract'] = new_soup.select('.col-sm-16.sec-info p')[0].text if len(
            new_soup.select('.col-sm-16.sec-info p')) else new_soup.select_one(
                '.carousel-caption p').text
        item['images'] = []
        if len(new_soup.select(
                'div.sec-topic.nt_detailview.col-sm-16.wow.fadeInDown.animated '
                'div.ntdv_imgcon > img')):
            new_images_list = new_soup.select(
                'div.sec-topic.nt_detailview.col-sm-16.wow.fadeInDown.animated '
                'div.ntdv_imgcon > img')
            for new_images in new_images_list:
                item['images'].append(new_images.get('src'))
    except Exception:
        # Selector misses are swallowed; partially-filled items still yield.
        pass
    yield item
def headlinehindi_time_switch1(time_string):  # e.g. "2020-12-23T17:50:27+05:30"
    # Returns a timestamp; strip the UTC offset before parsing.
    time_string = time_string.rsplit("+", 1)[0]
    return Util.format_time3(
        str(datetime.strptime(time_string, "%Y-%m-%dT%H:%M:%S")))
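# On Python 3.7+, datetime.fromisoformat parses the "+05:30" offset directly,
# so the rsplit step is only needed because the offset is meant to be
# discarded. A sketch of the offset-aware alternative:
from datetime import datetime

dt = datetime.fromisoformat("2020-12-23T17:50:27+05:30")  # offset-aware
naive = dt.replace(tzinfo=None)  # matches the rsplit("+", 1) behavior above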
def parse_news_list(self, response):
    home_url = 'https://doh.gov.ph/'
    time2 = ''
    soup = BeautifulSoup(response.text, "html.parser")
    news_list = soup.select("div.panel>div>div.view-content>div")
    # List entries read "Month day, year" and carry no clock time.
    months = {
        "January": "01", "February": "02", "March": "03", "April": "04",
        "May": "05", "June": "06", "July": "07", "August": "08",
        "September": "09", "October": "10", "November": "11", "December": "12",
    }
    for news in news_list:
        date = news.find("span", class_="field-content content-time").text.strip()
        dtime = " 00:00:00"
        pub_time_list = re.split(" |,", date) if date else None
        if pub_time_list and pub_time_list[0] in months:
            time2 = (pub_time_list[-1] + "-" + months[pub_time_list[0]] + "-" +
                     pub_time_list[1] + dtime)
        response.meta['pub_time'] = time2
        url = urljoin(home_url, news.find("a").get("href"))
        yield scrapy.Request(url, meta=response.meta, callback=self.parse_news)
    # Pagination
    next_page = "https://doh.gov.ph/" + soup.select_one(
        "li.pager-next>a").get("href") if soup.select_one(
            "li.pager-next>a") else None
    if self.time is None or (time2 and Util.format_time3(time2) >= int(self.time)):
        if next_page:
            yield scrapy.Request(next_page,
                                 meta=response.meta,
                                 callback=self.parse_news_list)
    else:
        self.logger.info('time out')
def parse_eassys(self, response):
    # Pagination and article-URL extraction for each second-level category.
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    if re.match(r'.*photo-gallery.*', response.url):
        # Photo galleries
        for t in soup.find_all(class_='col-sm-4 col-md-4 photo-photo-h'):
            try:
                url = 'https://zeenews.india.com' + t.select_one('a').get('href')
            except Exception:
                continue
            response.meta['title'] = t.select_one('h3').text
            response.meta['images'] = [t.select_one('img').get('src')]
            response.meta['pub_time'] = t.select_one('.photo-date').text.strip()
            if self.time is None or Util.format_time3(
                    Util.format_time2(
                        t.select_one('.photo-date').text.strip())) >= int(self.time):
                yield Request(url, callback=self.parse_item_photo, meta=response.meta)
            else:
                flag = False
                self.logger.info('time cutoff reached')
    elif re.match(r'.*video.*', response.url):
        # Videos initially listed in this category.
        for i in soup.find_all(
                attrs={'class': 'mini-video mini-video-h margin-bt30px'}):
            url = 'https://zeenews.india.com' + i.select_one('a').get('href')
            response.meta['images'] = [i.select_one('img').get('src')]
            response.meta['title'] = i.select_one('h3').text
            response.meta['pub_time'] = i.select_one('.date').text.strip()
            if self.time is None or Util.format_time3(
                    Util.format_time2(
                        i.select_one('span.date').text.strip())) >= int(self.time):
                yield Request(url, callback=self.parse_item_video, meta=response.meta)
            else:
                flag = False
                self.logger.info('time cutoff reached')
    else:
        # Regular articles initially listed in this category.
        for t in soup.find_all(class_='section-article margin-bt30px clearfix'):
            url = 'https://zeenews.india.com' + t.select_one('a').get('href')
            response.meta['title'] = t.select_one('h3.margin-bt10px').text
            tt = t.select_one('span.date').text.strip().split()
            try:
                # Map the Hindi month name to English before formatting.
                pub_time = (self.hindi_month[tt[0]] + ' ' + tt[1] + ' ' +
                            tt[2] + ' ' + tt[3] + ' ' + tt[5])
            except Exception:
                pub_time = t.select_one('span.date').text.strip()
            response.meta['pub_time'] = pub_time
            response.meta['images'] = [t.select_one('img').get('src')]
            if self.time is None or Util.format_time3(
                    Util.format_time2(pub_time)) >= int(self.time):
                yield Request(url=url, meta=response.meta, callback=self.parse_item)
            else:
                flag = False
                self.logger.info('time cutoff reached')
    if flag:
        try:
            next_page = 'https://zeenews.india.com/' + soup.find(
                class_='next last').select_one('a').get('href')
            yield Request(next_page, callback=self.parse_eassys, meta=response.meta)
        except Exception:
            self.logger.info('No more pages.')