class NewsCrawler():
    """Crawls Sina rolling-news pages, extracts keywords and summaries
    with SnowNLP (TextRank), and persists each news item to the database.
    """

    def __init__(self):
        # Database helper used to persist parsed news entities.
        self.dbUtil = DatabaseUtil()

    def getNewsContent(self, url: str):
        """Fetch a news page by URL and return its article paragraphs.

        Returns a list of non-empty paragraph strings; returns an empty
        list when the article container cannot be located on the page.
        """
        content = []
        req = urllib.request.Request(url=url)
        # Use the response as a context manager so the connection is
        # closed deterministically (the original leaked it).
        with urllib.request.urlopen(req, timeout=5) as response:
            raw = response.read()
        # BUGFIX: read the body exactly once. The original called
        # response.read() a second time inside the except handler; the
        # stream was already consumed, so the GBK fallback decoded b''
        # and silently produced an empty page. Also narrowed the bare
        # except to UnicodeDecodeError.
        try:
            response_read = raw.decode('utf-8')
        except UnicodeDecodeError:
            response_read = raw.decode('gbk')
        soup = BeautifulSoup(response_read, 'lxml')
        # Sina articles live in either div.article (newer layout) or
        # div#artibody (older layout).
        article = soup.select_one('div.article')
        if not article:
            article = soup.select_one('div#artibody')
        # BUGFIX: guard against pages with neither container; the
        # original raised AttributeError on article.select('p') here.
        if not article:
            return content
        for para in article.select('p'):
            line = para.text.strip()
            if line:
                content.append(line)
        return content

    def parseNewsDetail(self, r: dict):
        """Parse one roll-news list entry, enrich it with full text,
        keywords and a summary, then write it to the database.

        `r` is one item of the JSON list returned by the roll-news API;
        it is expected to carry 'title', 'url' and 'time' keys.
        """
        entity = {}
        entity['title'] = r['title']
        print(entity['title'])
        # The item URL is fetched below to obtain the full article text,
        # from which keywords and the summary are generated.
        entity['url'] = r['url']
        print(entity['url'])
        # Publication time arrives as a string/number; store it as int.
        entity['time'] = int(r['time'])
        content = self.getNewsContent(entity['url'])
        contentConcat = '\n'.join(content) if content else ''
        entity['content'] = contentConcat
        # ROBUSTNESS: always provide keywords/summary keys so the DB
        # insert below sees a complete entity even for empty articles.
        entity['keywords'] = ''
        entity['summary'] = ''
        # SnowNLP implements TextRank for both keyword extraction and
        # extractive summarization.
        if contentConcat:
            s = SnowNLP(contentConcat)
            # Keep only keywords of length >= 2 (single characters are
            # usually common filler words) that pass the punctuation /
            # pure-digit filter.
            keywords = [
                x for x in s.keywords(10)
                if len(x) > 1 and self.validKeywords(x)
            ]
            entity['keywords'] = '|'.join(keywords)
            # Three-sentence extractive summary, joined for storage.
            entity['summary'] = '|'.join(s.summary(3))
        # Persist the parsed item.
        self.dbUtil.insert(entity)

    def getSinaRollNews(self, startPage, endPage):
        """Crawl roll-news list pages [startPage, endPage) and process
        every item on each page via parseNewsDetail.
        """
        for i in range(startPage, endPage):
            print("Page {}".format(i))
            params = globalParams.copy()
            params['page'] = str(i)
            allHtml = requests.get(newsUrl, params=params, headers=headers)
            pageHtml = allHtml.content.decode('gbk')
            # The endpoint returns JSONP-like JavaScript; strip down to
            # the object literal between the first '{' and the trailer.
            pageHtml = pageHtml[pageHtml.index('{'):-1]
            # NOTE(review): eval() on remote content is unsafe in
            # general; it is kept because the payload is not strict JSON.
            # Dummy() is the project-supplied globals mapping — confirm
            # it sandboxes name lookups before trusting this.
            data = eval(pageHtml, Dummy())
            # Round-trip through json to normalize eval's result into
            # plain dict/list types.
            data = json.loads(json.dumps(data))
            for r in data['list']:
                # Parse and store each individual news item.
                self.parseNewsDetail(r)
                # Randomized pause to keep the crawl rate polite.
                time.sleep(random.random() * 6.0)
            time.sleep(10)

    def validKeywords(self, x: str):
        """Return False for keywords that are pure digits or contain
        punctuation (dashes, colons, quotes, commas, periods); True
        otherwise. Filters out junk tokens produced by TextRank.
        """
        if x.isdigit():
            return False
        bad_chars = ('—', ':', '"', ',', '。')
        return not any(c in x for c in bad_chars)