def run(self):
    response = requests.get(self.url, headers=self.headers)
    if response.status_code != 200:
        print('Request failed, bad URL: ' + self.url)
        return False
    print('Requesting: ' + self.url)
    response.encoding = 'utf-8'
    self.html = response.text
    soup = BeautifulSoup(self.html, 'html.parser')
    urls = soup.select('.list ul li')
    print(len(urls))
    for item in urls:
        pushtime = item.span.get_text()
        title = item.a.get_text()
        url_c = item.a['href']
        seen = self.getOne(pushtime, 'hnjyt')  # query the store once instead of twice
        print(seen)
        if seen > 0:
            print(title + ' already exists')
            continue
        self.saveOne(pushtime, 'hnjyt', title, url_c, '')
        r = requests.get(url_c)
        r.encoding = 'utf-8'
        self.html = r.text
        # Parse the article page
        soup_c = BeautifulSoup(self.html, 'html.parser')
        articetext = soup_c.select('.article')[0].get_text()  # article body
        matchFlag = True  # keyword filter disabled: re.search(u'辅导员|化学', articetext)
        if matchFlag:
            print(pushtime + '|' + title + ': matched')
            SendMail.mail(SendMail(), title, url_c + '\n\t' + articetext)
        else:
            print('Article: ' + title + ' did not match')
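# getOne/saveOne are not defined in this file; the run(self) above and the
# run(self) with self.targets later in the file both rely on them for
# de-duplication. A minimal sketch, assuming methods on the same scraper class
# backed by a local SQLite store keyed on (pushtime, site) -- the database
# file, table name, and columns here are hypothetical, not confirmed by the
# source:
import sqlite3

def getOne(self, pushtime, site):
    # Returns a count > 0 if this (pushtime, site) pair was already saved.
    conn = sqlite3.connect('news.db')
    try:
        conn.execute('CREATE TABLE IF NOT EXISTS news '
                     '(pushtime TEXT, site TEXT, title TEXT, url TEXT, extra TEXT)')
        cur = conn.execute('SELECT COUNT(*) FROM news WHERE pushtime=? AND site=?',
                           (pushtime, site))
        return cur.fetchone()[0]
    finally:
        conn.close()

def saveOne(self, pushtime, site, title, url, extra):
    # Records an article so later runs can skip it.
    conn = sqlite3.connect('news.db')
    try:
        conn.execute('CREATE TABLE IF NOT EXISTS news '
                     '(pushtime TEXT, site TEXT, title TEXT, url TEXT, extra TEXT)')
        conn.execute('INSERT INTO news VALUES (?, ?, ?, ?, ?)',
                     (pushtime, site, title, url, extra))
        conn.commit()
    finally:
        conn.close()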
def run(self):
    #pageNo = input('Enter page number: ')
    pageNo = '1'
    if not pageNo:
        pageNo = 'index_1.html'
    else:
        pageNo = 'index_' + pageNo + '.html'
    response = requests.get(self.url + pageNo, headers=self.headers)
    if response.status_code != 200:
        print('Request failed, bad URL: ' + self.url + pageNo)
        return False
    print('Requesting: ' + self.url + pageNo)
    self.download(self.url + pageNo, pageNo)
    soup = BeautifulSoup(self.html, 'html.parser')
    urls = soup.select('.list_b_info.right')
    urlcount = 0
    for item in urls:
        if urlcount > 2:
            break
        urlcount += 1
        print('Current download count: ' + str(urlcount))
        dir = item.h2.a['title']
        url_c = item.h2.a['href']
        if not os.path.exists(dir):
            os.makedirs(dir)
        fileName = dir + '/' + item.h2.a['title'] + '.html'
        self.download(url_c, fileName)
        # Parse the list page
        soup_c = BeautifulSoup(self.html, 'html.parser', from_encoding="gb18030")
        urls_c = soup_c.select('.article_body p a')
        count = 0
        for item_c in urls_c:
            if item_c.span:
                count += 1
                url_t = item_c['href']
                name = item_c.get_text()
                childFileName = dir + '/' + str(count) + self.replaceName(name) + '.html'
                if os.path.exists(childFileName):
                    print(childFileName + ' already exists')
                    continue
                self.download(url_t, childFileName)
                soup_s = BeautifulSoup(self.html, 'html.parser', from_encoding="gb18030")
                try:
                    articetextBody = soup_s.select('.article_body')
                    if not articetextBody:
                        articetextBody = soup_s.select('.detail-content')
                    articetext = articetextBody[0].get_text()
                    # get_text() already returns str on Python 3; the original
                    # articetext.decode('utf8') would raise AttributeError.
                    matchFlag = re.search(u'辅导员|化学', articetext)  # keywords: "counselor|chemistry"
                    if matchFlag:
                        SendMail.mail(SendMail(), name, url_t + '\n\t' + articetext)
                    else:
                        print('Article "' + name + '" did not match')
                except Exception:
                    print(childFileName + ' failed to parse content')
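# self.download(url, fileName) is referenced above (and in the similar
# run(self) further down) but defined elsewhere. A minimal sketch, assuming it
# fetches the page, caches the markup on self.html for the follow-up
# BeautifulSoup calls, and also saves it to disk -- the caching behavior and
# the gb18030 encoding are assumptions inferred from the call sites:
def download(self, url, fileName):
    response = requests.get(url, headers=self.headers)
    response.encoding = 'gb18030'  # assumed: these list/article pages are GB-encoded
    self.html = response.text
    with open(fileName, 'w', encoding='utf-8') as f:
        f.write(self.html)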
def getNBCNews():
    now = int(time.time())
    timeArray = time.localtime(now)
    Ymd = time.strftime('%Y-%m-%d', timeArray)
    if Ymd not in os.listdir():
        os.mkdir(Ymd)
    base_url = 'https://www.nbcnews.com'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    result = getHtml(base_url + '/tech-media', headers)
    soup = result[1]
    newslist = soup.findAll('article')
    news_url = []
    for news in newslist:
        href = news.find('a')
        if href:
            href = news.find('a').get('href')
            news_url.append(base_url + href)
    for new_url in news_url:
        print(new_url)
        try:
            result = getHtml(new_url, headers)
            soup = result[1]
            article = soup.find('div', class_='article')
            if not article:
                continue
            img_url = article.find('picture')
            imgpath = None
            if img_url:
                img_url = img_url.find('img').get('src')
                img_name = new_url[new_url.rfind('/') + 1:] + '.jpg'
                imgpath = os.path.join(Ymd, img_name)
                downloadImg(img_url, imgpath)
            text = article.get_text()
            dst = ''  # translation disabled: result = bdfy.translate(text)
            SendMail.mail(SendMail, img_url, text + '\n' + dst, imgpath)
        except Exception:
            traceback.print_exc()
            print('Failed to parse:', new_url)
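# getHtml(url, headers) and downloadImg(url, path) are used throughout this
# file but defined elsewhere. A minimal sketch, inferred from the call sites
# (getHtml's return value is indexed with [1] for the soup, so a
# (response, soup) tuple is assumed):
def getHtml(url, headers):
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    return (response, soup)

def downloadImg(url, path):
    # Fetches the image bytes and writes them to disk.
    response = requests.get(url, timeout=30)
    with open(path, 'wb') as f:
        f.write(response.content)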
def run():
    newslist = getNewList('http://ent.163.com/special/00032IAD/ent_json.js')
    for url in newslist:
        try:
            r = synonym.getByUrl(url)
            if r is not None:  # None means the URL has not been seen yet
                continue
            news = parseUrl(url)
            text = bdnlp.nplParse(news[1])
            synonym.downloadText(text, news[0] + '/dest.txt', 'utf-8')
            files = news[2]
            files.append(news[0] + '/dest.txt')
            SendMail.mail(SendMail, news[0], news[1] + '\n' + text, files)
        except Exception:
            traceback.print_exc()
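# getNewList is defined elsewhere. The 163.com feed above is a .js file rather
# than plain JSON, so a sketch has to strip the JavaScript wrapper before
# parsing -- the wrapper format, the 'docurl' field, and the gbk encoding are
# assumptions about this particular feed, not confirmed by the source:
import json

def getNewList(feed_url):
    raw = requests.get(feed_url, timeout=30)
    raw.encoding = 'gbk'  # assumed encoding for the 163.com feed
    # Keep only the {...} payload between the first '{' and the last '}'.
    body = raw.text[raw.text.find('{'):raw.text.rfind('}') + 1]
    data = json.loads(body)
    urls = []
    for items in data.values():
        if not isinstance(items, list):
            continue
        for item in items:
            if isinstance(item, dict) and 'docurl' in item:
                urls.append(item['docurl'])
    return urls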
def run():
    newslist = getNewList('http://yule.sohu.com/_scroll_newslist/%s/news.inc' % (getToday()))
    for url in newslist:
        try:
            if url.find('picture') > 0:  # skip photo galleries; or url.find('music') > 0
                continue
            r = synonym.getByUrl(url)
            if r is not None:  # None means the URL has not been seen yet
                continue
            news = parseUrl(url)
            text = bdnlp.nplParse(news[1])
            synonym.downloadText(text, news[0] + '/dest.txt', 'utf-8')
            files = news[2]
            files.append(news[0] + '/dest.txt')
            SendMail.mail(SendMail, news[0], news[1] + '\n' + text, files)
        except Exception:
            traceback.print_exc()
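# synonym.getByUrl and synonym.downloadText back the feed runners above and
# below but live in another module. A minimal sketch, assuming getByUrl is a
# seen-URL lookup (None when unseen, per the comments at the call sites) and
# downloadText writes the reworded text to a file -- both signatures are
# inferred, and the flat-file store is hypothetical:
def getByUrl(url):
    # Returns the URL if already processed, else None.
    if not os.path.exists('seen_urls.txt'):
        return None
    with open('seen_urls.txt', encoding='utf-8') as f:
        seen = f.read().splitlines()
    return url if url in seen else None

def downloadText(text, path, encoding):
    d = os.path.dirname(path)
    if d:
        os.makedirs(d, exist_ok=True)
    with open(path, 'w', encoding=encoding) as f:
        f.write(text)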
def run(self):
    #pageNo = input('Enter page number: ')
    pageNo = '1'
    if not pageNo:
        pageNo = 'index_1.html'
    else:
        pageNo = 'index_' + pageNo + '.html'
    response = requests.get(self.url + pageNo, headers=self.headers)
    if response.status_code != 200:
        print('Request failed, bad URL: ' + self.url + pageNo)
        return False
    print('Requesting: ' + self.url + pageNo)
    self.download(self.url + pageNo, pageNo)
    # Parse the index page
    soup = BeautifulSoup(self.html, 'html.parser')
    urls = soup.select('.list_b_info.right')
    for item in urls:
        dir = item.h2.a['title']
        url_c = item.h2.a['href']
        if not os.path.exists(dir):
            os.makedirs(dir)
        self.download(url_c, dir + '/' + item.h2.a['title'] + '.html')
        # Parse the list page
        soup_c = BeautifulSoup(self.html, 'html.parser', from_encoding="gb18030")
        urls_c = soup_c.select('.article_body p a')
        count = 0
        for item_c in urls_c:
            if item_c.span:
                count += 1
                url_t = item_c['href']
                name = item_c.get_text()
                self.download(url_t, dir + '/' + str(count) + self.replaceName(name) + '.html')
                soup_s = BeautifulSoup(self.html, 'html.parser', from_encoding="gb18030")
                articetext = soup_s.select('.article_left.border')[0].get_text()
                matchFlag = re.search('辅导员|化学', articetext)  # keywords: "counselor|chemistry"
                if matchFlag:
                    SendMail.mail(SendMail, name, url_t + '\n\t' + articetext)
                else:
                    print('Article "' + name + '" did not match')
def run(cat):
    try:
        newslist = getNewList('http://ent.cri.cn/roll/' + cat)
        for url in newslist:
            try:
                if url.find('picture') > 0:  # skip photo galleries
                    continue
                r = synonym.getByUrl(url)
                if r is not None:  # None means the URL has not been seen yet
                    continue
                news = parseUrl(url)
                text = bdnlp.nplParse(news[1])
                synonym.downloadText(text, news[0] + '/dest.txt', 'utf-8')
                files = news[2]
                files.append(news[0] + '/dest.txt')
                SendMail.mail(SendMail, news[0], news[1] + '\n' + text, files)
            except Exception:
                traceback.print_exc()
    except Exception:
        # Log instead of silently swallowing list-level failures.
        traceback.print_exc()
def run(self):
    for target in self.targets:
        self.url_addr = self.url + target
        response = requests.get(self.url_addr, headers=self.headers)
        if response.status_code != 200:
            print('Request failed, bad URL: ' + self.url_addr)
            return False
        response.encoding = 'utf-8'
        self.html = response.text
        soup = BeautifulSoup(self.html, 'html.parser')
        urls = soup.select('.main ul li')
        print(len(urls))
        for item in urls:
            pushtime = item.span.get_text()
            title = item.a.get_text()
            url_c = item.a['href']
            seen = self.getOne(pushtime, target)  # query the store once instead of twice
            print(seen)
            if seen > 0:
                print(title + ' already sent')
                continue
            self.saveOne(pushtime, target, title, url_c, '')
            response = requests.get(url_c, headers=self.headers)
            response.encoding = 'utf-8'
            self.html = response.text
            # Parse the article page
            soup_c = BeautifulSoup(self.html, 'html.parser')
            articetext = soup_c.select('.main')[0].get_text()  # article body
            matchFlag = re.search(u'辅导员|化学|长垣', articetext)  # keywords: "counselor|chemistry|Changyuan"
            if matchFlag:
                print(pushtime + '|' + title + ': matched')
                SendMail.mail(SendMail(), title, url_c + '\n\t' + articetext)
            else:
                print('Article: ' + title + ' did not match')
def getKorNews():
    now = int(time.time())
    timeArray = time.localtime(now)
    Ymd = time.strftime('%Y-%m-%d', timeArray)
    if Ymd not in os.listdir():
        os.mkdir(Ymd)
    base_url = 'https://entertain.naver.com'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    result = getKorHtml(base_url + '/ranking', headers)
    soup = result[1]
    newslist = soup.select('#ranking_news li')
    news_url = []
    for news in newslist:
        href = news.find('a')
        if href:
            href = news.find('a').get('href')
            news_url.append(base_url + href)
    for new_url in news_url:
        print(new_url)
        try:
            result = getKorHtml(new_url, headers)
            soup = result[1]
            title = soup.find('h2', class_='end_tit')
            if not title:
                continue
            title = title.get_text().strip().replace('\n', '')  # article title
            text = soup.find('div', id="articeBody")
            if not text:
                continue
            result = bdfy.translateOther(title, 'kor', 'zh')
            print(result)
            title_dst = result['trans_result'][0].get('dst')
            srcText = text.get_text().strip().replace('\n', '')
            dstText = ''  # body translation disabled; see the chunked sketch below
            img_url = text.find('img')
            imgpath = None
            if img_url:
                img_url = img_url.get('src')
                img_name = title_dst + '.jpg'
                imgpath = os.path.join(Ymd, img_name)
                downloadImg(img_url, imgpath)
            SendMail.mail(
                SendMail, title_dst,
                title + '|' + srcText + '\n' + title_dst + '|' + dstText,
                imgpath)
        except Exception:
            traceback.print_exc()
            print('Failed to parse:', new_url)
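# The disabled body-translation block in getKorNews chunked srcText into
# 1000-character pieces because the translation API rejects long inputs, but
# its tail-slice arithmetic (len(srcText)/1000 * 1000) breaks under Python 3
# float division. A corrected sketch of the same idea, assuming
# bdfy.translateOther keeps the signature used above:
def translateChunked(srcText, src_lang='kor', dst_lang='zh', limit=1000):
    dstText = ''
    for start in range(0, len(srcText), limit):
        chunk = srcText[start:start + limit]
        result = bdfy.translateOther(chunk, src_lang, dst_lang)
        dstText += result['trans_result'][0].get('dst')
    return dstText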
def getHollywoodNews():
    now = int(time.time())
    timeArray = time.localtime(now)
    Ymd = time.strftime('%Y-%m-%d', timeArray)
    if Ymd not in os.listdir():
        os.mkdir(Ymd)
    base_url = 'https://www.hollywoodreporter.com'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    topics = [
        '/topic/movies', '/topic/tv', '/topic/entertainment-industry',
        '/topic/technology'
    ]
    for topic in topics:
        result = getHtml(base_url + topic, headers)
        soup = result[1]
        newslist = soup.findAll('article')
        news_url = []
        for news in newslist:
            href = news.find('a')
            if href:
                href = news.find('a', class_='topic-card__link').get('href')
                # str.find, not str.index: index() raises ValueError on relative URLs.
                if href.find('http') < 0:
                    href = base_url + href
                news_url.append(href)
        for new_url in news_url:
            print(new_url)
            try:
                result = getHtml(new_url, headers)
                soup = result[1]
                title = soup.find('h1', class_='article__headline')
                if not title:
                    continue
                title = title.get_text().replace('\n', '')  # headline
                deck = soup.find('h2', class_='article__deck')
                # Guard before get_text(): the original assigned the fallback string
                # first and then called get_text() on it, raising AttributeError.
                deck = deck.get_text().replace('\n', '') if deck else 'no deck'  # subheading
                text = soup.find('div', class_='article__body')
                if not text:
                    continue
                result = bdfy.translate(title)
                title_dst = result['trans_result'][0].get('dst')
                print('source', title, 'translation', title_dst)
                result = bdfy.translate(deck)
                deck_dst = result['trans_result'][0].get('dst')
                print('source', deck, 'translation', deck_dst)
                srcText = ''
                dstText = ''  # per-paragraph body translation disabled
                for p in text.select('p'):
                    srcText += p.get_text().replace('\n', '')
                figure = soup.find('figure')
                img_url = figure.find('img') if figure else None
                imgpath = None
                if img_url:
                    img_url = img_url.get('src')
                    img_name = new_url[new_url.rfind('/') + 1:] + '.jpg'
                    imgpath = os.path.join(Ymd, img_name)
                    downloadImg(img_url, imgpath)
                SendMail.mail(
                    SendMail, title_dst,
                    title + '|' + deck + '|' + srcText + '\n' +
                    title_dst + '|' + deck_dst + '|' + dstText,
                    imgpath)
            except Exception:
                traceback.print_exc()
                print('Failed to parse:', new_url)
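# bdfy.translate / bdfy.translateOther wrap the Baidu Fanyi HTTP API in another
# module. A minimal sketch, assuming the standard general-translation endpoint
# with its MD5(appid + q + salt + key) signing scheme; BAIDU_APPID / BAIDU_KEY
# are hypothetical placeholder names. The response shape matches the
# result['trans_result'][0].get('dst') access used throughout this file:
import hashlib
import random

BAIDU_APPID = 'your-appid'   # hypothetical placeholder, configured elsewhere
BAIDU_KEY = 'your-secret'    # hypothetical placeholder, configured elsewhere

def translateOther(q, from_lang, to_lang):
    salt = str(random.randint(32768, 65536))
    sign = hashlib.md5((BAIDU_APPID + q + salt + BAIDU_KEY).encode('utf-8')).hexdigest()
    resp = requests.get('https://fanyi-api.baidu.com/api/trans/vip/translate',
                        params={'q': q, 'from': from_lang, 'to': to_lang,
                                'appid': BAIDU_APPID, 'salt': salt, 'sign': sign},
                        timeout=30)
    return resp.json()

def translate(q):
    # Auto-detect variant used for the English feeds above.
    return translateOther(q, 'auto', 'zh')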