def GetNewsListEs(url, time=dt.datetime.min, label='', maxOverdue=5):
    """Crawl an eastmoney listing index and collect news newer than `time`.

    Walks every pagination page under `url`, parses each article, and stops
    early once `maxOverdue` articles older than `time` (by date) have been
    seen.  Per-page and per-article failures are logged and skipped.

    Returns:
        (list, datetime): collected News objects and the latest article time
        seen (dt.datetime.min when nothing was collected).
    """
    soup = News.GetSoup(url, 'lxml')
    # The last page number sits just before the first "page-btn" anchor.
    pager = soup.body.select('div[id="pagerNoDiv"]')[0]
    lastPage = int(pager.select('a[class="page-btn"]')[0].previous_sibling.text)
    collected = []
    staleCount = 0
    latest = dt.datetime.min
    base = url[:-5]  # drop the trailing '.html' so page numbers can be spliced in
    for pageNo in range(1, lastPage + 1):
        pageUrl = base + '_' + str(pageNo) + '.html'
        try:
            links = GetNewsUrlEs(pageUrl)
        except Exception as err:
            News.WriteLog(str(err) + '. url = ' + pageUrl)
            continue
        for link in links:
            try:
                item = GetNewsEs(link)
                item.label = label
                # Compare by calendar date only, matching the site's listing granularity.
                if item.time.date() < time.date():
                    staleCount += 1
                    continue
                collected.append(item)
                latest = max(latest, item.time)
                print(item.url)
                print(item.time)
                print(item.title)
            except Exception as err:
                News.WriteLog(str(err) + ', url = ' + link)
        # Enough stale articles on/after this page: assume the rest are older too.
        if staleCount >= maxOverdue:
            break
    return collected, latest
def GetNewsUrlEs(url):
    """Return the article URLs listed on one eastmoney listing page.

    Parameters:
        url: URL of a single listing page (one page of the paginated index).

    Returns:
        list[str]: the href of each news entry, in page order.
    """
    soup = News.GetSoup(url, 'lxml')
    container = soup.body.select('ul[id="newsListContent"]')[0]
    # Each <li> is one summary entry; its last <div> holds <p> tags where the
    # first <p> wraps the link to the full article.
    # (Fix: the original shadowed the builtin `sum` as its loop variable and
    # extracted title/info/time strings that were never used.)
    return [entry.select('div')[-1].select('p')[0].a['href']
            for entry in container.select('li')]
def GetNewsEs(url):
    """Fetch one eastmoney article page and parse it into a News.News object.

    Supports two known layouts: the research-report style (newsContent nested
    inside ContentBody) and the regular news style (ContentBody nested inside
    newsContent).  The body is split into titled sections: a <p> whose entire
    text is bold (<strong>) starts a new section; other attribute-less <p>
    tags with direct non-blank text contribute content lines.

    Parameters:
        url: article URL.

    Returns:
        News.News with its sectionList populated.

    Raises:
        ValueError: when the page matches neither known layout.
    """
    soup = News.GetSoup(url, 'lxml')
    # Locate the two landmark containers; their nesting order tells the layout apart.
    newsContent = soup.body.select('div[class="newsContent"]')[0]
    contentBody = soup.body.select('div[id="ContentBody"]')[0]
    if newsContent.parent == contentBody:
        # Research-report layout
        title = soup.body.select(
            'div[class="report-title"]')[0].h1.text.strip()
        newsInfo = soup.body.select('div[class="report-infos"]')[0]
        time = dt.datetime.strptime(newsInfo.contents[3].text.strip(),
                                    '%Y年%m月%d日 %H:%M')
        source = newsInfo.contents[5].text.strip(
        ) + ' ' + newsInfo.contents[7].text.strip()
        abstract = ''
        newsBody = newsContent
    elif contentBody.parent == newsContent:
        # Regular news layout
        title = newsContent.h1.text.strip()
        newsInfo = newsContent.select('div[class="Info"]')[0]
        newsBody = contentBody
        time = dt.datetime.strptime(
            newsInfo.select('div[class="time"]')[0].text.strip(),
            '%Y年%m月%d日 %H:%M')
        source = newsInfo.img['alt'] if newsInfo.img is not None else ''
        absTagList = newsBody.select('div[class="b-review"]')
        abstract = absTagList[0].text.strip() if absTagList else ''
    else:
        # BUG FIX: the original `raise '...'` raised a plain string, which is
        # itself a TypeError in Python 3; raise a proper exception instead
        # (callers catch Exception, so this stays compatible).
        raise ValueError('Unknown page style: url = ' + url)
    sectionList = []
    news = News.News(url, time, title, source, abstract, '', sectionList)
    # Walk the body and split it into (title, content) sections.
    secTitle = ''
    secContent = ''
    for c in newsBody.contents:
        # Both titles and content live in attribute-less <p> tags.
        if c.name != 'p' or len(c.attrs) != 0:
            continue
        # Title test: a <strong> with no <span> child, and nothing but
        # whitespace text or <strong> nodes directly inside the <p>.
        if c.strong is not None and c.strong.span is None:
            isTitle = True
            for cc in c.contents:
                if not (isinstance(cc, str) and cc.strip() == ''
                        or cc.name == 'strong'):
                    isTitle = False
                    break
            if c.strong.text.strip() == c.text.strip():
                isTitle = True
            if isTitle:
                # A new title means a new section: flush the previous one.
                if secTitle != '' or secContent != '':
                    sectionList.append(
                        News.Section(secTitle, secContent, news, url,
                                     len(sectionList)))
                    secContent = ''
                secTitle = c.text.strip()
                continue
        # Content test: at least one non-blank text node directly inside <p>
        # (not buried in a child tag).
        for cc in c.contents:
            if isinstance(cc, str) and cc.strip() != '':
                secContent += c.text + os.linesep
                break
    # BUG FIX: the original tested `secContent != '' or secContent != ''`
    # (a tautological duplicate), which dropped a trailing title-only
    # section; mirror the in-loop flush condition instead.
    if secTitle != '' or secContent != '':
        sectionList.append(
            News.Section(secTitle, secContent, news, url, len(sectionList)))
    return news
# 从文件获取之前的关键字,并存入关键字集合 kwSet = set() for keyword in fileinput.input(os.path.join('.', 'dict', 'keyword')): if keyword[-1] == os.linesep: keyword = keyword[0:len(keyword) - 1] kwSet.add(keyword) # 从同花顺获取概念/行业,并追加到关键字集合 thsUrl = ['http://q.10jqka.com.cn/gn/', 'http://q.10jqka.com.cn/thshy/'] for url in thsUrl: soup = bs4.BeautifulSoup('', 'lxml') retry = 0 while soup.text == '' and retry <= 20: retry += 1 try: soup = News.GetSoup(url) except Exception as e: nothingtodo = 0 cateItemList = soup.select('div[class="cate_items"]') for cateItem in cateItemList: cateList = cateItem.select('a') for cate in cateList: text = cate.text kwSet.add(text) tail = text[max(0, len(text) - 2):len(text)] if tail == '行业' or tail == '概念' or tail == '板块': kwSet.add(text[0:len(text) - 2]) # 从东方财富获取概念/行业/地域,并追加到关键字集合 esUrl = ['http://quote.eastmoney.com/center/BKList.html#notion_0_0?sortRule=0'] for url in esUrl: