Example #1
File: NewsEs.py Project: GongCQ/Text
import datetime as dt

import News  # project-local helper module (GetSoup, WriteLog, News, Section)


def GetNewsListEs(url, time=dt.datetime.min, label='', maxOverdue=5):
    soup = News.GetSoup(url, 'lxml')

    # The last page number sits just before the first 'page-btn' link in the pager.
    maxPage = int(
        soup.body.select('div[id="pagerNoDiv"]')[0].select(
            'a[class="page-btn"]')[0].previous_sibling.text)
    newsList = []
    overdueCount = 0
    maxTime = dt.datetime.min
    for p in range(1, maxPage + 1):
        pageUrl = url[:-5] + '_' + str(p) + '.html'  # replace the trailing '.html' with '_<page>.html'
        try:
            urlList = GetNewsUrlEs(pageUrl)
        except Exception as e:
            News.WriteLog(str(e) + '. url = ' + pageUrl)
            continue
        for newsUrl in urlList:
            try:
                news = GetNewsEs(newsUrl)
                news.label = label
                if news.time.date() < time.date():  # published before the cutoff date
                    overdueCount += 1
                else:
                    newsList.append(news)
                    maxTime = news.time if news.time > maxTime else maxTime
                    print(news.url)
                    print(news.time)
                    print(news.title)
            except Exception as e:
                News.WriteLog(str(e) + ', url = ' + newsUrl)
                continue
            if overdueCount >= maxOverdue:
                return newsList, maxTime
    return newsList, maxTime
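For context, a hypothetical call might look like the sketch below. The list URL is made up for illustration; it must end in '.html', since GetNewsListEs derives per-page URLs by inserting '_<n>' before that suffix.

# Hypothetical usage sketch -- the URL is illustrative, not from the project.
import datetime as dt

cutoff = dt.datetime(2018, 1, 1)
newsList, maxTime = GetNewsListEs(
    'http://finance.eastmoney.com/news/list.html',  # assumed list page URL
    time=cutoff, label='finance', maxOverdue=5)
print(len(newsList), maxTime)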
Example #2
File: NewsEs.py Project: GongCQ/Text
import News  # project-local helper module


def GetNewsUrlEs(url):
    soup = News.GetSoup(url, 'lxml')

    newsListContent = soup.body.select('ul[id="newsListContent"]')[0]
    sumList = newsListContent.select('li')
    urlList = []
    for item in sumList:  # 'item' avoids shadowing the builtin sum()
        sumContent = item.select('div')[-1].select('p')
        title = sumContent[0].text.strip()  # parsed but unused below
        info = sumContent[1].text.strip()   # parsed but unused below
        time = sumContent[2].text.strip()   # parsed but unused below
        pageUrl = sumContent[0].a['href']
        urlList.append(pageUrl)
    return urlList
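News.GetSoup is referenced throughout these examples but never shown. A minimal sketch of what it plausibly does, assuming requests is available; the real helper in GongCQ/Text may differ (headers, retries, encoding handling):

# Assumed shape of the News.GetSoup helper -- a sketch, not the project's code.
import bs4
import requests

def GetSoup(url, parser='lxml'):
    resp = requests.get(url, timeout=10)
    resp.encoding = resp.apparent_encoding  # many of these pages are not UTF-8
    return bs4.BeautifulSoup(resp.text, parser)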
Example #3
File: NewsEs.py Project: GongCQ/Text
import datetime as dt
import os

import News  # project-local helper module


def GetNewsEs(url):
    soup = News.GetSoup(url, 'lxml')

    # Extract the basic metadata of the page.
    newsContent = soup.body.select('div[class="newsContent"]')[0]
    contentBody = soup.body.select('div[id="ContentBody"]')[0]
    if newsContent.parent == contentBody:  # research-report layout
        title = soup.body.select(
            'div[class="report-title"]')[0].h1.text.strip()
        newsInfo = soup.body.select('div[class="report-infos"]')[0]
        time = dt.datetime.strptime(newsInfo.contents[3].text.strip(),
                                    '%Y年%m月%d日 %H:%M')
        source = newsInfo.contents[5].text.strip(
        ) + ' ' + newsInfo.contents[7].text.strip()
        abstract = ''
        newsBody = newsContent
    elif contentBody.parent == newsContent:  # news-article layout
        title = newsContent.h1.text.strip()
        newsInfo = newsContent.select('div[class="Info"]')[0]
        newsBody = contentBody
        time = dt.datetime.strptime(
            newsInfo.select('div[class="time"]')[0].text.strip(),
            '%Y年%m月%d日 %H:%M')
        source = newsInfo.img['alt'] if newsInfo.img is not None else ''
        absTagList = newsBody.select('div[class="b-review"]')
        if len(absTagList) == 0:
            abstract = ''
        else:
            abstract = absTagList[0].text.strip()
    else:
        raise ValueError('Unknown page style: url = ' + url)
    sectionList = []
    news = News.News(url, time, title, source, abstract, '', sectionList)

    # Split the body into sections.
    secTitle = ''
    secContent = ''
    for c in newsBody.contents:
        if c.name == 'p' and len(c.attrs) == 0:  # titles and body text both live in attribute-less <p> tags
            # Title test: the whole text of the <p> is bold, i.e. the <p> contains a
            # <strong> tag, no non-whitespace text sits outside it, and the <strong>
            # has no <span> child.
            if c.strong is not None and c.strong.span is None:
                isTitle = True
                for cc in c.contents:
                    if not (isinstance(cc, str) and cc.strip() == ''
                            or cc.name == 'strong'):
                        isTitle = False
                        break
                if c.strong.text.strip() == c.text.strip():
                    isTitle = True
                if isTitle:  # a new title starts a new section: flush the previous one into the list
                    if secTitle != '' or secContent != '':
                        sectionList.append(
                            News.Section(secTitle, secContent, news, url,
                                         len(sectionList)))
                    secTitle = ''
                    secContent = ''
                    secTitle = c.text.strip()
                    continue
            # Body test: the <p> directly contains at least one piece of non-whitespace
            # text (directly inside the <p>, not inside a child tag).
            for cc in c.contents:
                if isinstance(cc, str) and cc.strip() != '':
                    secContent += c.text + os.linesep
                    break
    if secTitle != '' or secContent != '':
        sectionList.append(
            News.Section(secTitle, secContent, news, url, len(sectionList)))
    return news
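The title heuristic above can be checked in isolation. A self-contained toy example applying the same rule (all non-whitespace content inside <strong>, no <span> child):

# Toy check of the section-title heuristic; the HTML is made up for illustration.
import bs4

html = '<p><strong>Section title</strong></p><p>Plain body text.</p>'
soup = bs4.BeautifulSoup(html, 'lxml')
for c in soup.find_all('p'):
    isTitle = (c.strong is not None and c.strong.span is None
               and c.strong.text.strip() == c.text.strip())
    print(c.text.strip(), '->', 'title' if isTitle else 'body')
# Section title -> title
# Plain body text. -> body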
Example #4
File: Dict.py Project: GongCQ/Text
import fileinput
import os

import bs4
import News  # project-local helper module

# Read previously collected keywords from file into the keyword set.
kwSet = set()
for keyword in fileinput.input(os.path.join('.', 'dict', 'keyword')):
    kwSet.add(keyword.rstrip('\n'))  # drop the trailing newline, if any

# Fetch concept/industry names from 同花顺 (10jqka) and add them to the keyword set.
thsUrl = ['http://q.10jqka.com.cn/gn/', 'http://q.10jqka.com.cn/thshy/']
for url in thsUrl:
    soup = bs4.BeautifulSoup('', 'lxml')
    retry = 0
    while soup.text == '' and retry <= 20:
        retry += 1
        try:
            soup = News.GetSoup(url)
        except Exception:
            pass  # ignore the error and retry
    cateItemList = soup.select('div[class="cate_items"]')
    for cateItem in cateItemList:
        cateList = cateItem.select('a')
        for cate in cateList:
            text = cate.text
            kwSet.add(text)
            tail = text[-2:]
            if tail in ('行业', '概念', '板块'):  # '... industry/concept/sector' suffixes
                kwSet.add(text[:-2])  # also keep the name without the suffix
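As a concrete illustration of the suffix handling: a category named '白酒概念' ('liquor concept') ends in '概念', so both the full name and the stripped name are kept.

# Worked example of the suffix stripping above; the category name is made up.
text = '白酒概念'
tail = text[-2:]  # '概念'
if tail in ('行业', '概念', '板块'):
    print(text, '->', text[:-2])  # 白酒概念 -> 白酒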

# Fetch concept/industry/region names from 东方财富 (Eastmoney) and add them to the keyword set.
esUrl = ['http://quote.eastmoney.com/center/BKList.html#notion_0_0?sortRule=0']
for url in esUrl: