def parse_detail(self, response):
    item = DemoItem()
    html = BeautifulSoup(response.text, 'html.parser')
    item['category1'] = response.meta['category1']
    item['category2'] = response.meta['category2']
    title = html.select_one("div.container h1")
    if title is not None:
        item['title'] = title.text
    item['body'] = ''
    bodies = html.select("div.col-24 p")
    if bodies:
        item['body'] = '\n'.join(b.text for b in bodies)
        item['abstract'] = bodies[0].text
    item['images'] = [i['src'] for i in html.select("div.col-24 figure img")]
    date_span = html.select_one("p.byline span.date")
    if date_span is not None:
        pub_time = re.findall(r'Published on (.*)', date_span.text)
        if pub_time:
            item['pub_time'] = Util.format_time2(pub_time[0])
        else:
            item['pub_time'] = Util.format_time()
    else:
        item['pub_time'] = Util.format_time()
    yield item

def parse_detail(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, features='lxml')
    item['category1'] = response.meta['category']
    heading = soup.select_one('div.heading_container')
    item['title'] = heading.text if heading else soup.select_one('.heading.clsNewsTitleHeading1').text
    all_p = soup.select('div.newscontent p') or soup.select('div[align="justify"]')
    p_list = [paragraph.text for paragraph in all_p]
    item['body'] = '\n'.join(p_list)
    if p_list:
        item['abstract'] = p_list[0]
    item['images'] = [img.get('src') for img in soup.select('div[align="center"] img')]
    date_span = soup.select_one('div.date_and_author_container span')
    if date_span:
        temp_time = date_span.text.split(" ")[1]
    else:
        temp_time = soup.select_one('td.miscinfo').text.split(" ")[1]
    try:
        item['pub_time'] = time_adjustment(temp_time)
    except Exception:
        item['pub_time'] = Util.format_time()
    yield item

def parse3(self, response):
    html = BeautifulSoup(response.text, 'html.parser')
    item = DemoItem()
    url_parts = response.url.split('/')  # avoid shadowing the built-in `list`
    item['title'] = html.select('.news-title')[0].text
    item['category1'] = url_parts[3]
    if not re.findall(r'\d+', url_parts[4]):
        item['category2'] = url_parts[4]
    paragraphs = html.select('.article-content > p')
    item['body'] = ''
    for i in paragraphs:
        item['body'] += (i.text + '\n')
    if paragraphs:
        item['abstract'] = paragraphs[0].text
    date_posted = html.select('.timestamp-entry > .date-posted')
    if date_posted:
        self.logger.info(date_posted[0].text)
        item['pub_time'] = Util.format_time2(date_posted[0].text)
    else:
        item['pub_time'] = Util.format_time()
    images = html.select('.article-content > .embed-wrap img')
    if images:
        item['images'] = [images[0].attrs['src']]
    yield item

def parse_details(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, 'lxml')
    item['category1'] = response.meta['category1']
    item['category2'] = response.meta['category2']
    item['abstract'] = response.meta['abstract']
    title = soup.find('h1', class_='post-title entry-title')
    item['title'] = title.text.strip() if title else None
    item['body'] = ''  # don't forget to initialize
    content = soup.find('div', class_='post-body entry-content')
    content_divs = content.select('div') if content else []
    if len(content_divs) > 2:  # guard against IndexError on short pages
        item['body'] += content_divs[2].text
    item['images'] = []
    separator = soup.find('div', class_='separator')
    image_list = separator.select('a') if separator else None
    if image_list:
        for image in image_list:
            item['images'].append(image.get('href'))
    pub = soup.find('span', class_='updated')
    if pub:
        item['pub_time'] = Util.format_time2(pub.text.strip())
    else:
        item['pub_time'] = Util.format_time()  # was dropped in the original else branch
    yield item

def parse_news(self, response): item = DemoItem() item["category1"] = response.meta["category1"] item["category2"] = response.meta["category2"] item["pub_time"] = response.meta["pub_time"] if response.meta[ "pub_time"] != None else Util.format_time() soup = BeautifulSoup(response.text, "html.parser") temp = soup.find("div", {"itemprop": "articleBody"}) if soup.find( "div", {"itemprop": "articleBody"}) else None item["title"] = soup.select('h1.entry-title')[0].text body = [] temp2_list = temp.find_all( "p", {"style": "text-align: justify;"}) if temp.find_all( "p", {"style": "text-align: justify;"}) else [] for temp2 in temp2_list: [s.extract() for s in temp2('script')] b = temp2.get_text().strip().split('\xa0') if temp2.text else None b = ' '.join(b) if b else None if b: body.append(b) item["abstract"] = body[0] if body else None item["body"] = '\n'.join(body) if body else None images = [] temp3_list = temp.find_all( "p", {"style": "text-align: center;"}) if temp and temp.find_all( "p", {"style": "text-align: center;"}) else [] for temp3 in temp3_list: image = "https://dfa.gov.ph" + temp3.find("img").get( "src") if temp3.find("img") and temp3.find("img").get( "src") else None if image: images.append(image) item["images"] = images yield item
def parse_page(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    try:
        lpt = soup.select('.td-post-date > time')[-2].get('datetime').replace('T', ' ')[:-6]
        last_pub_time = Util.format_time3(lpt)
    except Exception:
        last_pub_time = Util.format_time3(Util.format_time())
    if self.time is None or last_pub_time >= int(self.time):
        for i in soup.select('.item-details > h3 > a'):
            url = i.get('href')
            yield Request(url, callback=self.parse_item, meta=response.meta)
    else:
        flag = False
        self.logger.info('time cutoff reached')
    if flag:
        try:  # pagination
            nextPage = soup.select_one('a.last ~ a').get('href')
            yield Request(url=nextPage, callback=self.parse_page, meta=response.meta)
        except Exception:
            self.logger.info('no next page')

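# The `Util` time helpers are project-internal and not shown in this
# section. A minimal sketch of the contract these spiders appear to
# assume (names real, bodies hypothetical -- the actual module may
# behave differently):
from datetime import datetime

class Util:
    @staticmethod
    def format_time(*_args):
        # Assumed: current time as 'YYYY-MM-DD HH:MM:SS'. Some callers
        # also pass a flag or raw timestamp; ignored in this sketch.
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def format_time2(text):
        # Assumed: parse a human-readable date such as
        # 'January 6 2021 02:12:12 PM' into the canonical form.
        return datetime.strptime(text, '%B %d %Y %I:%M:%S %p').strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def format_time3(text):
        # Assumed: canonical string -> integer timestamp, comparable
        # against int(self.time) in the cutoff checks above.
        return int(datetime.strptime(text, '%Y-%m-%d %H:%M:%S').timestamp())
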
def parse_item(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    item = DemoItem()
    item['title'] = response.meta['title']
    item['category1'] = response.meta['category1']
    item['abstract'] = response.meta['abstract']
    item['images'] = response.meta['images']
    item['category2'] = response.meta['category2']
    if re.findall('headline', response.url):  # regular news
        ss = ''
        for i in soup.select('.dit > p > b'):
            ss += i.text + '\n'
        try:
            ss += soup.select_one('.dit > p > span').text
        except Exception:
            pass
        item['body'] = ss
        # e.g. ['Wednesday', '6', 'January', '2021', '02:12:12', 'PM']
        tt = soup.select_one('.colort').text.split()
        # reordered to e.g. 'January 6 2021 02:12:12 PM'
        tt = tt[2] + ' ' + tt[1] + ' ' + tt[3] + ' ' + tt[4] + ' ' + tt[5]
        item['pub_time'] = Util.format_time2(tt)
    elif re.findall('watchvid', response.url):  # video news
        item['body'] = soup.select_one('.dit > p').text
        item['pub_time'] = soup.select_one('.colort').text
    else:  # photo news
        item['body'] = soup.select_one('.news_saa > p').text
        item['pub_time'] = Util.format_time(0)
    return item

def parse_essay(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    for i in soup.find_all(class_='amp-wp-content amp-loop-list'):
        tt = i.select_one('.featured_time ').text.split()  # e.g. ['2', 'दिन', 'ago']
        try:
            pub_time = tt[0] + ' ' + self.hindi_time_ago[tt[1]] + ' ' + tt[2]  # e.g. '2 days ago'
        except Exception:
            pub_time = Util.format_time(0)
        if self.time is None or Util.format_time3(Util.format_time2(pub_time)) >= int(self.time):
            # cutoff not reached
            response.meta['title'] = i.select_one('h2').text
            response.meta['abstract'] = i.select_one('.large-screen-excerpt-design-3').text
            response.meta['pub_time'] = Util.format_time2(pub_time)
            response.meta['images'] = [i.select_one('amp-img').get('src')]
            yield Request(url=i.select_one('a').get('href'), meta=response.meta, callback=self.parse_item)
        else:
            flag = False
            self.logger.info('time cutoff reached')
            break
    if flag:
        nextPage = soup.select_one('#pagination a')
        if nextPage:  # stop cleanly when there is no next page
            yield Request(nextPage.get('href'), meta=response.meta, callback=self.parse_essay)

def process_item(self, item, spider):
    m = hashlib.md5()  # md5 of the response URL as a dedup key
    m.update(item['response_url'].encode(encoding='utf-8'))
    item['md5'] = m.hexdigest()
    item['images'] = json.dumps(item['images'])  # serialize the images list
    item['cole_time'] = Util.format_time()  # collection time
    self.sql_serve(item, spider)  # insert into the database
    return item

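# Standalone check of the dedup key above: hashing the response URL
# gives a stable 32-character key, so a re-collected article collides
# on `md5` instead of inserting a duplicate row. URL is hypothetical.
import hashlib

url = 'https://www.example.com/news/12345'
digest = hashlib.md5(url.encode('utf-8')).hexdigest()
assert len(digest) == 32
assert digest == hashlib.md5(url.encode('utf-8')).hexdigest()  # deterministic
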
def parse3(self, response):
    html = BeautifulSoup(response.text, 'html.parser')
    item = DemoItem()
    item['title'] = response.meta['title']
    item['category1'] = response.url.split('/')[3]
    item['category2'] = response.url.split('/')[4]
    item['body'] = ''
    for i in html.select('section.content > section:nth-of-type(1) p'):
        item['body'] += (i.text + '\n')
    item['abstract'] = response.meta['abstract']
    item['pub_time'] = Util.format_time(response.meta['pub_time'])
    item['images'] = [response.meta['images']]
    yield item

def parse_item(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    item = DemoItem()
    item['category1'] = response.meta['category1']
    item['category2'] = 'News Details'
    item['title'] = soup.select_one('.read-content h5').text
    item['pub_time'] = Util.format_time()  # the article list only shows hour/minute, no date
    item['images'] = response.meta['images']
    ss = ''
    for p in soup.select('.read-content p'):
        ss += p.text + '\n'
    item['body'] = ss
    item['abstract'] = soup.select('.read-content p')[0].text
    return item

def parse_time(self, response):
    html = BeautifulSoup(response.text, 'html.parser')
    if re.findall('headline', response.url):  # regular news
        # e.g. ['Wednesday', '6', 'January', '2021', '02:12:12', 'PM']
        tt = html.select_one('.colort').text.split()
        # reordered to e.g. 'January 6 2021 02:12:12 PM'
        tt = tt[2] + ' ' + tt[1] + ' ' + tt[3] + ' ' + tt[4] + ' ' + tt[5]
        timetext = Util.format_time2(tt)
    elif re.findall('watchvid', response.url):  # video news
        timetext = html.select_one('.colort').text
    else:  # photo news
        timetext = Util.format_time(0)
    if self.time is None or Util.format_time3(timetext) >= int(self.time):
        yield Request(response.meta['nextPage'], callback=self.parse_essay, meta=response.meta)
    else:
        self.logger.info('time cutoff reached')
        yield Request(response.url, meta=response.meta, callback=self.parse_item)

def parse_article(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, "html.parser")
    images = []
    picture = soup.find("div", {"class": "artl__head"})
    pic = picture.find("img").get("src") if picture and picture.find("img") else None
    if pic:
        images.append(pic)
    meta_time = soup.find("meta", {"property": "published_time"})  # avoid shadowing `datetime`
    if meta_time:
        # Some articles carry no timestamp; when present it is only precise to the minute.
        content = meta_time.get("content")
        time = content.replace(":", ".").replace(",", ".").replace(" ", "").split(".")
        # seconds appended manually
        pub_time = "20{}-{}-{} {}:{}:00".format(time[4], time[3], time[2], time[0], time[1])
    else:
        pub_time = Util.format_time()
    title = soup.find("h1").text.strip() if soup.find("h1") else None
    body_list = []
    for temp in soup.select('.wrap__ctnt p'):
        [s.extract() for s in temp('div')]
        [s.extract() for s in temp('label')]
        if temp.text and temp.text != '\n':
            body_list.append(temp.text.strip())
    item['body'] = '\n'.join(body_list)
    abstract = body_list[0] if body_list else None
    item["images"] = images
    item["category1"] = response.meta["category1"]
    item["category2"] = response.meta["category2"]
    item['title'] = title
    item['pub_time'] = pub_time
    item['abstract'] = abstract
    yield item

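# Quick trace of the published_time normalization above, assuming the
# meta content has the shape 'HH:MM, DD.MM.YY' (sample value hypothetical):
content = '14:05, 23.01.21'
time = content.replace(':', '.').replace(',', '.').replace(' ', '').split('.')
# time == ['14', '05', '23', '01', '21']
print('20{}-{}-{} {}:{}:00'.format(time[4], time[3], time[2], time[0], time[1]))
# -> 2021-01-23 14:05:00
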
def parse_news_url(self, response):
    meta = {}
    soup = bs(response.text, "html.parser")
    results = json.loads(response.text)
    if soup.find("h1", "break-long-words exception-message") is None and results != []:
        for i in range(0, len(results)):
            news_url = 'https://www.spot.ph' + results[i]["url"]
            meta["title"] = results[i]["title"]
            meta["pub_time"] = Util.format_time(results[i]["date_published"])
            self.logger.debug(news_url)
            yield scrapy.Request(news_url, callback=self.parse_news, meta=meta)
        # keep paging while the last article is still within the cutoff
        if self.time is None or int(results[i]["date_published"]) >= int(self.time):
            yield Request(news_url, meta=response.meta, callback=self.parse_news_url)
        else:
            self.logger.info('time cutoff reached')

def parse_news(self, response): item = DemoItem() soup = BeautifulSoup(response.text, "html.parser") item["category1"] = response.meta["category1"] item["category2"] = response.meta["category2"] item["title"] = response.meta["title"] # 发布时间 news_time = soup.find( "time", class_="entry-date updated td-module-date" ).get("datetime") if soup.find( "time", class_="entry-date updated td-module-date") else None time_list = re.split("T|\+", news_time) if news_time else [] item["pub_time"] = time_list[0] + " " + time_list[ 1] if time_list else Util.format_time() # 图片 images = [] temp = soup.select_one( "div.td-post-featured-image a") if soup.select_one( "div.td-post-featured-image a") else None img = temp.get("href") if temp and temp.get("href") else None if img: images.append(img) item["images"] = images # 新闻正文 body = [] p_list = soup.select("div.td-fix-index>p") for p in p_list: if p.text: body.append(p.text.strip()) if body == []: p_list = soup.select("div.td-ss-main-content p") for p in p_list: if p.text: body.append(p.text.strip()) item['body'] = "\n".join(body) if body else None item["abstract"] = body[0] if body else None yield item
def parse_news(self, response): soup = BeautifulSoup(response.text, "html.parser") item = DemoItem() item["category1"] = response.meta["category1"] item["category2"] = response.meta["category2"] item["pub_time"] = response.meta["pub_time"] if response.meta[ "pub_time"] else Util.format_time() item['title'] = soup.find("h5", class_="page__title title").text.strip() # 图片 images = [] div_list = soup.find_all("div", class_="field-item even") if len(div_list) == 2: image = div_list[0].find("img").get("src") if div_list[0].find( "img") else None if image: images.append(image) item["images"] = images # 新闻正文 btext_list = [] temp_list = div_list[-1].findAll({"p", "li", "div", "span"}) if len(temp_list) == 1: btext = temp_list[0].text.strip() else: for temp2 in temp_list: if temp2 and temp2.text.strip() not in btext_list: btext_list.append(temp2.text.strip()) btext = '\n'.join(btext_list) body_list = [] body_list2 = btext.split("\xa0") for b in body_list2: if b: body_list.append(b) body = ' '.join(body_list) item['abstract'] = body.split('.')[0] + '...' item['body'] = body yield item
def parse_item(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    item = DemoItem()
    try:
        pub_time = self._4matTime(soup.select_one('.timestamp').text)
    except Exception:
        pub_time = Util.format_time(0)
    # cutoff check: the urls arrive unordered, so every url is visited
    # and items past the cutoff are skipped rather than stopping the crawl
    if self.time is None or Util.format_time3(pub_time) > int(self.time):
        item['pub_time'] = pub_time
        item['title'] = soup.select('.active')[-1].text.strip()
        item['category1'] = response.url.split('/')[3]
        item['category2'] = response.url.split('/')[5] if response.url.split('/')[3] == 'myanmar' else 'news'
        item['body'] = soup.select_one('article').text.strip()
        item['abstract'] = soup.select_one('article').text.strip().split('\n')[0]
        item['images'] = [i.get('src') for i in soup.select_one('section').select('img')]
        return item
    else:
        self.logger.info('time cutoff reached')
        self.stopCount += 1

def parse_3(self, response):
    item = response.meta['item']
    new_soup = BeautifulSoup(response.text, 'html.parser')
    try:
        item['title'] = new_soup.select('div.sec-topic.nt_detailview.col-sm-16.wow.fadeInDown.animated div.col-sm-16.sec-info > h1')[0].text
        time_tags = new_soup.select('div.text-danger.sub-info-bordered div.time')
        item['pub_time'] = time_font(time_tags[0].text) if time_tags else Util.format_time()
        item['body'] = ''
        paragraphs = new_soup.select('.col-sm-16.sec-info p') or new_soup.select('.carousel-caption p')
        for body_p in paragraphs:
            item['body'] += body_p.text
        sec_info_p = new_soup.select('.col-sm-16.sec-info p')
        # take the text of the fallback node, not the Tag itself
        item['abstract'] = sec_info_p[0].text if sec_info_p else new_soup.select_one('.carousel-caption p').text
        item['images'] = []
        for new_images in new_soup.select('div.sec-topic.nt_detailview.col-sm-16.wow.fadeInDown.animated div.ntdv_imgcon > img'):
            item['images'].append(new_images.get('src'))
    except Exception:
        pass
    yield item

def parse_news(self, response): item = DemoItem() soup = BeautifulSoup(response.text, "html.parser") # 文章发布时间 temp = soup.find("time", {"class": "css-1sbuyqj"}) if soup.find( "time", {"class": "css-1sbuyqj"}) else None temp_text = temp.text.strip() if temp and temp.text else None time_list = re.split(",| ", temp_text) if temp_text else None time2 = Util.format_time() if time_list: if time_list[3] == "Jan": time2 = time_list[5] + "-01-" + time_list[1] + " " + time_list[ 6] + ":00" elif time_list[3] == 'Feb': time2 = time_list[5] + "-02-" + time_list[1] + " " + time_list[ 6] + ":00" elif time_list[3] == 'Mar': time2 = time_list[5] + "-03-" + time_list[1] + " " + time_list[ 6] + ":00" elif time_list[3] == 'Apr': time2 = time_list[5] + "-04-" + time_list[1] + " " + time_list[ 6] + ":00" elif time_list[3] == 'May': time2 = time_list[5] + "-05-" + time_list[1] + " " + time_list[ 6] + ":00" elif time_list[3] == 'Jun': time2 = time_list[5] + "-06-" + time_list[1] + " " + time_list[ 6] + ":00" elif time_list[3] == 'Jul': time2 = time_list[5] + "-07-" + time_list[1] + " " + time_list[ 6] + ":00" elif time_list[3] == 'Aug': time2 = time_list[5] + "-08-" + time_list[1] + " " + time_list[ 6] + ":00" elif time_list[3] == 'Sept': time2 = time_list[5] + "-09-" + time_list[1] + " " + time_list[ 6] + ":00" elif time_list[3] == 'Oct': time2 = time_list[5] + "-10-" + time_list[1] + " " + time_list[ 6] + ":00" elif time_list[3] == 'Nov': time2 = time_list[5] + "-11-" + time_list[1] + " " + time_list[ 6] + ":00" elif time_list[3] == 'Dec': time2 = time_list[5] + "-12-" + time_list[1] + " " + time_list[ 6] + ":00" item["pub_time"] = time2 # 文章图片 images = [] img = soup.select_one("picture>img").get("src") if soup.select_one( "picture>img") else None if img: images.append(img) item["images"] = images # 文章内容 body = [] p_list = soup.select("p.css-158dogj") for p in p_list: if p.text: body.append(p.text.strip()) item['body'] = "\n".join(body) if body else None # 文章摘要 abstract = soup.find("p", { "id": "article-summary" }).text.strip() if soup.find("p", {"id": "article-summary"}) else '' if abstract == '' or abstract == '.': abstract = body[0] if body else None item["abstract"] = abstract # 一级目录 item["category1"] = response.meta["category1"] # 二级目录 item["category2"] = response.meta["category2"] # 文章标题 item["title"] = soup.find("h1", { "id": "link-1b44e840" }).text.strip() if soup.find("h1", {"id": "link-1b44e840"}) else None yield item
def parse_list(self, response):
    meta = {}
    soup = bs(response.text, "html.parser")
    s_url = 'https://api.summitmedia-digital.com/spot/v1/channel/get/'
    if soup.find("div", "nav nav-section"):
        p = 0
        while p >= 0:
            if soup.find("div", "section-header").find("h1").text == "Top 10 Lists":
                url = s_url + response.url.split("/")[-2] + '__' + response.url.split("/")[-1] + '/' + str(p) + '/20'
            else:
                url = s_url + response.url.split("/")[-1] + '/' + str(p) + '/20'
            api_response = requests.get(url)  # fetch once instead of twice
            results = json.loads(api_response.text)
            if api_response.status_code == 200 and results != []:
                p = p + 1
                pub_time1 = results[-1]["date_published"]
                if self.time is None or int(pub_time1) >= int(self.time):
                    for i in range(0, len(results)):
                        news_url = 'https://www.spot.ph' + results[i]["url"]
                        meta["title"] = results[i]["title"]
                        meta["pub_time"] = Util.format_time(results[i]["date_published"])
                        yield Request(news_url, callback=self.parse_news, meta=meta)
                else:
                    self.logger.info('time cutoff reached')
                    p = -1
            else:
                p = -1
    else:
        p = 0
        while p >= 0:
            url = s_url + response.url.split("/")[-1].split("?")[0] + '/' + str(p) + '/20'
            api_response = requests.get(url)
            results = json.loads(api_response.text)
            if api_response.status_code == 200 and results != []:
                p = p + 1
                pub_time1 = results[-1]["date_published"]
                if self.time is None or int(pub_time1) >= int(self.time):
                    for i in range(0, len(results)):
                        news_url = 'https://www.spot.ph' + results[i]["url"]
                        meta["title"] = results[i]["title"]
                        meta["pub_time"] = Util.format_time(results[i]["date_published"])
                        yield Request(news_url, callback=self.parse_news, meta=meta)
                else:
                    self.logger.info('time cutoff reached')
                    p = -1
            else:
                p = -1

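# The two while-loops above differ only in how the channel slug is
# built. A sketch of a shared pager (helper name hypothetical,
# assuming the API returns an empty JSON array past the last page):
import json
import requests

def iter_channel_pages(slug, page_size=20):
    base = 'https://api.summitmedia-digital.com/spot/v1/channel/get/'
    page = 0
    while True:
        resp = requests.get('{}{}/{}/{}'.format(base, slug, page, page_size))
        results = json.loads(resp.text) if resp.status_code == 200 else []
        if not results:
            break
        yield results  # one batch of article dicts per page
        page += 1
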
def parse_news(self, response): soup = BeautifulSoup(response.text, "html.parser") # 发布时间 pub_time_list = re.split( " |,", soup.select_one("h2.page-header>small").text) if soup.select_one( "h2.page-header>small") else None time2 = Util.format_time() if pub_time_list: if pub_time_list[-4] == "January": time2 = pub_time_list[-1] + "-01-" + pub_time_list[ -3] + " 00:00:00" elif pub_time_list[-4] == "February": time2 = pub_time_list[-1] + "-02-" + pub_time_list[ -3] + " 00:00:00" elif pub_time_list[-4] == "March": time2 = pub_time_list[-1] + "-03-" + pub_time_list[ -3] + " 00:00:00" elif pub_time_list[-4] == "April": time2 = pub_time_list[-1] + "-04-" + pub_time_list[ -3] + " 00:00:00" elif pub_time_list[-4] == "May": time2 = pub_time_list[-1] + "-05-" + pub_time_list[ -3] + " 00:00:00" elif pub_time_list[-4] == "June": time2 = pub_time_list[-1] + "-06-" + pub_time_list[ -3] + " 00:00:00" elif pub_time_list[-4] == "July": time2 = pub_time_list[-1] + "-07-" + pub_time_list[ -3] + " 00:00:00" elif pub_time_list[-4] == "August": time2 = pub_time_list[-1] + "-08-" + pub_time_list[ -3] + " 00:00:00" elif pub_time_list[-4] == "September": time2 = pub_time_list[-1] + "-09-" + pub_time_list[ -3] + " 00:00:00" elif pub_time_list[-4] == "October": time2 = pub_time_list[-1] + "-10-" + pub_time_list[ -3] + " 00:00:00" elif pub_time_list[-4] == "November": time2 = pub_time_list[-1] + "-11-" + pub_time_list[ -3] + " 00:00:00" elif pub_time_list[-4] == "December": time2 = pub_time_list[-1] + "-12-" + pub_time_list[ -3] + " 00:00:00" pub_time = time2 # 标题 temp = soup.select_one("h2.page-header") [s.extract() for s in temp('small')] title = temp.text.strip() # 正文 body_list2 = [] body_list = re.split("\r\n|\n", soup.select_one("div.col-md-12>p").text.strip()) for b in body_list: if b: body_list2.append(b) body = "\n".join(body_list2) # 摘要 abstract = body_list2[0] # 图片 images = [] temp_list = soup.select("center>img") for t in temp_list: images.append("http://www.tourism.gov.ph" + t.get("src")) item = DemoItem() item["category1"] = "News Updates" item["category2"] = "Featured News" item["pub_time"] = pub_time item["title"] = title item["abstract"] = abstract item["body"] = body item["images"] = images yield item