Code example #1
def parsingContent(link):
    t = time.localtime()
    news_date = str(t.tm_year) + '-' + str(t.tm_mon) + '-' + str(
        t.tm_mday) + '-' + str(t.tm_hour) + '-' + str(t.tm_min)
    title = ''
    content = ''

    fullLink = 'http://www.chinapeace.gov.cn' + link
    try:
        p = requests.get(fullLink)
        s = BeautifulSoup(p.content, features='html.parser')
    except:
        lw.log_writer('中央政法委员会' + fullLink + '失败')
        return {
            'news_link': fullLink.strip(),
            'news_title': title.strip(),
            'news_source': '10',
            'news_content': content.strip(),
            'news_date': news_date
        }

    lw.log_writer('中央政法委员会开始爬取' + fullLink)

    try:
        title = s.find('div', {'class': 'title'}).text.replace('\n', '')
    except:
        lw.log_writer('中央政法委员会获取标题错误')

    try:
        contentList = s.find('div', {'class': 'content_main'}).findAll('p')
        for p in contentList:
            if len(p.text) > 5 and p.find('img') is None:
                content += '<p>' + p.text.replace('\n', '') + '</p>'
            else:
                pass
    except:
        lw.log_writer('中央政法委员会获取内容错误')

    rst = {
        'news_link': fullLink.strip(),
        'news_title': title.strip(),
        'news_source': '中央政法委',
        'news_content': content.strip(),
        'news_date': news_date
    }

    return rst
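parsingContent builds news_date by concatenating the fields of time.localtime() by hand, which yields unpadded values such as 2021-3-5-9-7; the same pattern recurs in the later examples. If zero-padded output is acceptable for the news_date column (an assumption, since the original format is unpadded), time.strftime produces the same layout in one line:

import time

# 'YYYY-MM-DD-HH-MM', zero-padded (the hand-built version above is unpadded)
news_date = time.strftime('%Y-%m-%d-%H-%M', time.localtime())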
Code example #2
def parsingContent(link):
    t = time.localtime()
    news_date = str(t.tm_year) + '-' + str(t.tm_mon) + '-' + str(t.tm_mday) + '-' + str(t.tm_hour) + '-' + str(t.tm_min)
    title = ''
    content = ''

    fullLink = 'http://fund.eastmoney.com/a/' + link
    try:
        p = requests.get(fullLink)
        s = BeautifulSoup(p.content, features = 'html.parser')
    except:
        lw.log_writer('东方财富脚本爬取' + fullLink + '失败')
        return {'news_link': fullLink.strip(), 'news_title': title.strip(), 'news_source': '东方财富基金资讯', 'news_content': content.strip(), 'news_date': news_date}

    lw.log_writer('东方财富脚本开始爬取' + fullLink)



    try:  # scrape the title
        title = s.find('h1').text.replace('\n', '')
    except:
        lw.log_writer('东方财富脚本爬取标题错误')

    try:
        contentList = s.findAll('div', {'id': 'ContentBody'})[0].findAll('p')
        for p in contentList:
            if len(p.text) > 5 and p.find('img') is None:
                content += '<p>' + p.text.replace('\n', '') + '</p>'
            else:
                pass
    except:
        lw.log_writer('东方财富脚本爬取内容错误')



    rst = {'news_link': fullLink.strip(), 'news_title': title.strip(), 'news_source': '东方财富基金资讯',
           'news_content': content.strip(), 'news_date': news_date}

    return rst
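None of the requests.get calls in these scripts pass a timeout, so a single stalled connection can block the whole loop, and 4xx/5xx responses are parsed as if they were articles. A minimal sketch of a fetch helper that addresses both; the name fetch_soup and the 10-second default are illustrative, not part of the original code:

import requests
from bs4 import BeautifulSoup

def fetch_soup(url, timeout=10):
    """Fetch a page and return its parsed soup, or None on any network/HTTP error."""
    try:
        resp = requests.get(url, timeout=timeout)  # do not hang indefinitely
        resp.raise_for_status()                    # surface 4xx/5xx responses as errors
    except requests.RequestException:
        return None
    return BeautifulSoup(resp.content, features='html.parser')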
Code example #3
def parsingContent(link, title):
    t = time.localtime()
    news_date = str(t.tm_year) + '-' + str(t.tm_mon) + '-' + str(
        t.tm_mday) + '-' + str(t.tm_hour) + '-' + str(t.tm_min)
    content = ''

    fullLink = link
    try:
        p = requests.get(fullLink)
        s = BeautifulSoup(p.content, features='html.parser')
    except:
        lw.log_writer('新华网财经' + fullLink + '失败')
        return {
            'news_link': fullLink.strip(),
            'news_title': title.strip(),
            'news_source': '11',
            'news_content': content.strip(),
            'news_date': news_date
        }

    lw.log_writer('新华网财经开始爬取' + fullLink)

    # try:
    #     title = s.find('div', {'class': 'title'}).text.replace('\n', '')
    # except:
    #     lw.log_writer('新华网财经')

    try:
        contentList = s.findAll('p')
        for p in contentList:
            if len(p.text) > 5 and p.find('img') is None:
                content += '<p>' + p.text.replace('\n', '') + '</p>'
            else:
                pass
    except:
        lw.log_writer('新华网财经获取内容错误')

    rst = {
        'news_link': fullLink.strip(),
        'news_title': title.strip(),
        'news_source': '11',
        'news_content': content.strip(),
        'news_date': news_date
    }

    return rst
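All of the parsingContent variants return the same five-key record and differ only in the news_source value, so the dictionary literal is repeated in both the failure and the success paths. A small constructor (the name build_result is hypothetical) would keep the field names in one place:

def build_result(link, title, source, content, news_date):
    """Assemble the record shape returned by every parsingContent variant."""
    return {
        'news_link': link.strip(),
        'news_title': title.strip(),
        'news_source': source,
        'news_content': content.strip(),
        'news_date': news_date,
    }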
Code example #4
def parsingContent(link):
    fullLink = 'http://www.cs.com.cn' + link
    p = requests.get(fullLink)
    s = BeautifulSoup(p.content, features='html.parser')

    lw.log_writer('中国证券报脚本开始爬取' + fullLink)

    title = ''
    content = ''

    # Scrape the title
    try:
        title = s.find('h1').text.replace('\n', '')
    except:
        lw.log_writer('中国证券报文章标题获取错误')

    try:
        contentList = s.find('section').findAll()

        for p in contentList:
            if len(p.text) > 5 and p.find(
                    'img') is None and 'TRS' not in p.text:
                content += '<p>' + p.text.replace('\n', '') + '</p>'
            else:
                pass
    except:
        lw.log_writer('中国证券报文章内容爬取错误')

    t = time.localtime()
    news_date = str(t.tm_year) + '-' + str(t.tm_mon) + '-' + str(
        t.tm_mday) + '-' + str(t.tm_hour) + '-' + str(t.tm_min)

    rst = {
        'news_link': fullLink.strip(),
        'news_title': title.strip(),
        'news_source': '8',
        'news_content': content.strip(),
        'news_date': news_date
    }

    return rst
Code example #5
def parsingContent(link):
    page = requests.get(link)
    s = BeautifulSoup(page.content, features='html.parser')

    lw.log_writer('财新网脚本开始爬取' + link)

    title = ''
    content = ''

    try:  # parsing title
        title = s.find('div', {
            'id': 'the_content'
        }).find('h1').text.replace('\n', '').replace('\r', '')
    except:
        lw.log_writer('财新网文章标题获取错误')

    try:
        contentList = s.find('div', {'id': 'Main_Content_Val'}).findAll('p')
        for p in contentList:
            if p.find('img') is None:
                content += str(p).replace('\r', '').replace('\n', '')
            else:
                pass
    except:
        lw.log_writer('财新网文章内容获取错误')

    t = time.localtime()
    news_date = str(t.tm_year) + '-' + str(t.tm_mon) + '-' + str(
        t.tm_mday) + '-' + str(t.tm_hour) + '-' + str(t.tm_min)

    rst = {
        'news_link': link.strip(),
        'news_title': title.strip(),
        'news_source': '1',
        'news_content': content.strip(),
        'news_date': news_date
    }

    return rst
Code example #6
def main():
    print('新华网财经')
    print()

    # ============= Test connection =============

    mydb = connectDB()
    mycursor = mydb.cursor()

    # ============= Test connection END =============

    try:
        r = requests.get('http://xinhuanet.com/fortunepro/')
        soup = BeautifulSoup(r.content, features='html.parser')

    except:
        return

    # ============== Scrape the main page ================

    news_list = soup.find('ul', {'class': 'silder_nav clearfix'}).findAll('li')
    news_list_item = {}
    # news_list_item_belong = {}

    for i in news_list:
        # currentLi = i.findAll('a')
        # for a in currentLi:
        # news_list_item[a.find('a').text] = a.find('a').get('href')
        try:
            news_list_item[i.find('a').get('href')] = i.find('a').text
        except:
            pass
            # news_list_item_belong[a.find('a').get('href')] = a.find('div', {'class': 'dysMiddleResultConItemRelevant clearfix'}).text

    print('共', len(news_list_item), '个结果')
    print()
    # ============== Scrape the main page END ===============

    # ============== Compare against the database =================
    confirmed_new = []
    for a in news_list_item:
        try:
            # news_list_item maps link -> title in this script, so bind title/link to the matching columns;
            # query parameters also keep titles containing quotes from breaking the statement
            sql = 'SELECT news_title, news_link FROM ttd.news WHERE news_title = %s or news_link = %s;'
            mycursor.execute(sql, (str(news_list_item[a]), str(a)))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(a)
            else:
                pass
        except:
            lw.log_writer('新华网财经添加新闻错误')
            pass

    lw.log_writer('新华网财经本轮新增新闻有' + str(len(confirmed_new)) + '条')

    # ============== Compare against the database END =================

    # ============== Main scraping code =================
    if len(confirmed_new) == 0:
        print('没有发现新增新闻,即将关闭DB链接')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = 'INSERT INTO ttd.news (news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
            rst = parsingContent(link, news_list_item[link])

            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor,
                                                str(rst['news_title']),
                                                str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor,
                                                str(rst['news_title']),
                                                str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor,
                                                      str(rst['news_title']),
                                                      str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']),
                   str(rst['news_date']), str(rst['news_content']),
                   str(rst['news_link']), gov_tag, com_tag, topic_tag)
            try:
                mycursor.execute(sql, val)
                mydb.commit()
            except:
                lw.log_writer('新华网财经在添加数据时失败')

            lw.log_writer('新华网财经新增' + str(mycursor.rowcount) + '条')
            minorRandomPause()

        lw.log_writer('新华网财经本轮结束')
        mydb.close()
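The tagging-plus-INSERT block (gov_tag/com_tag/topic_tag followed by a parameterized execute and commit) is repeated verbatim in every main(). A sketch of a shared helper, reusing the project's own module_news_*Tag modules and the ttd.news columns shown above; the helper name insert_tagged_news is illustrative:

def insert_tagged_news(mydb, mycursor, rst):
    """Tag a parsed article and insert it into ttd.news; returns the affected row count."""
    gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
    com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
    topic_tag = module_news_topicTag.tagTopic(mycursor, str(rst['news_title']), str(rst['news_content']))

    sql = ('INSERT INTO ttd.news (news_title, news_source, news_date, news_content, '
           'news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
    val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']),
           str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag, topic_tag)
    mycursor.execute(sql, val)
    mydb.commit()
    return mycursor.rowcount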
Code example #7
def minorRandomPause():
    randomTime = random.randint(600, 900)
    lw.log_writer('新华网财经脚本进入休眠' + str(randomTime) + '秒')
    time.sleep(randomTime)
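minorRandomPause hard-codes both the sleep range and the 新华网财经 label, although the same pause appears in every script. A sketch of a parameterized variant (the signature and defaults are assumptions; lw is the project's logger used throughout):

import random
import time

def random_pause(label, low=600, high=900):
    """Sleep for a random number of seconds in [low, high] and log it under the given label."""
    seconds = random.randint(low, high)
    lw.log_writer(label + '脚本进入休眠' + str(seconds) + '秒')
    time.sleep(seconds)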
Code example #8
def main():
    print('中共中央政法委员会')
    print()

    # ============= Test connection =============

    mydb = connectDB()
    mycursor = mydb.cursor()

    # ============= Test connection END =============

    try:
        r = requests.get(
            'http://www.chinapeace.gov.cn/chinapeace/c100008/list2020.shtml')
        soup = BeautifulSoup(r.content, features='html.parser')

    except:
        return

    # ============== Scrape the main page ================

    news_list = soup.find('div', {
        'class': "w1200 bgfff"
    }).find('div', {
        'class': 'list_box_left'
    }).findAll('li')
    news_list_item = {}

    for i in news_list:
        currentLi = i.findAll('a')
        for a in currentLi:
            news_list_item[a.text] = a.get('href')

    print('共', len(news_list_item), '个结果')
    print()
    # ============== Scrape the main page END ===============

    # ============== Compare against the database =================
    confirmed_new = []
    for a in news_list_item:
        try:
            # use query parameters so titles containing quotes do not break the statement
            sql = 'SELECT news_title, news_link FROM ttd.news WHERE news_title = %s or news_link = %s;'
            mycursor.execute(sql, (str(a), str(news_list_item[a])))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(news_list_item[a])
            else:
                pass
        except:
            lw.log_writer('中央政法委员会首页添加新闻错误')
            pass

    lw.log_writer('中央政法委员会本轮新增新闻有' + str(len(confirmed_new)) + '条')

    # ============== Compare against the database END =================

    # ============== Main scraping code =================
    if len(confirmed_new) == 0:
        print('没有发现新增新闻,即将关闭DB链接')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = 'INSERT INTO ttd.news (news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
            rst = parsingContent(link)

            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor,
                                                str(rst['news_title']),
                                                str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor,
                                                str(rst['news_title']),
                                                str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor,
                                                      str(rst['news_title']),
                                                      str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']),
                   str(rst['news_date']), str(rst['news_content']),
                   str(rst['news_link']), gov_tag, com_tag, topic_tag)
            try:
                mycursor.execute(sql, val)
                mydb.commit()
            except:
                lw.log_writer('中央政法委员会在添加数据时失败')

            lw.log_writer('中央政法委新增' + str(mycursor.rowcount) + '条')
            minorRandomPause()

        lw.log_writer('中央政法委本轮结束')
        mydb.close()
Code example #9
def main():
    print('财新网')
    print()

    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
    mycursor.fetchall()  # consume the test query so the cursor can be reused
    # ============= Test connection END =============

    r = requests.get('http://www.caixin.com/')
    soup = BeautifulSoup(r.content, features='html.parser')

    main_page_item = {}  # holds every item scraped from this page

    # ============= Scrape the main page =============
    main_list = soup.find('div', {'class': 'news_list'}).findAll('dl')

    for item in main_list:
        a = item.find('dd').find('p').find('a')
        main_page_item[a.text] = a.get('href')

    print('This round has ', len(main_page_item), ' items')
    print()
    # ============= Scrape the main page END =============

    # ============== Compare against the database =================
    confirmed_new = []
    for a in main_page_item:
        try:
            # parameterized query: titles containing quotes no longer break the statement
            sql = 'SELECT news_id, news_title FROM ttd.news WHERE news_title = %s;'
            mycursor.execute(sql, (str(a),))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(main_page_item[a])
            else:
                pass
        except:
            print('添加新的新闻错误')
            print()
            pass
    print('本轮新的新闻有', len(confirmed_new), '条')
    # ============== Compare against the database END =================

    if len(confirmed_new) == 0:
        print('没有发现新增新闻,即将关闭DB链接')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            try:
                sql = 'INSERT INTO ttd.news (news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
                rst = parsingContent(link)
                # ======= Tags - added 12.15 ==========
                gov_tag = module_news_govTag.tagGov(mycursor,
                                                    str(rst['news_title']),
                                                    str(rst['news_content']))
                com_tag = module_news_comTag.tagCom(mycursor,
                                                    str(rst['news_title']),
                                                    str(rst['news_content']))
                topic_tag = module_news_topicTag.tagTopic(
                    mycursor, str(rst['news_title']), str(rst['news_content']))
                # ======= Tags - added 12.15 END ==========
                val = (str(rst['news_title']), str(rst['news_source']),
                       str(rst['news_date']), str(rst['news_content']),
                       str(rst['news_link']), gov_tag, com_tag, topic_tag)
                mycursor.execute(sql, val)
                mydb.commit()
                lw.log_writer('财新网' + str(mycursor.rowcount) + '条')
                minorRandomPause()
            except:
                print('Getting info error')
                print()
                break

    lw.log_writer('财新网脚本本轮结束')
    mydb.close()
Code example #10
def main():
    print('天天基金网新闻')
    print()

    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
    print(len(mycursor.fetchall()), ' Connection works')
    print()
    # ============= Test connection END =============

    r = requests.get('http://fund.eastmoney.com/a/cjjyw.html')
    soup = BeautifulSoup(r.content, features = 'html.parser')

    # ============== Scrape the main page ==============
    main_list = soup.find('div', {'class': 'mainCont'}).findAll('ul')  # this block contains the page's 4 <ul> lists
    main_page_item = {}  # holds every item scraped from this page
    
    for i in main_list:
        currentUl = i.findAll('a')
        for a in currentUl:
            main_page_item[a.text] = a.get('href')

    print('共', len(main_page_item), '个结果')
    print()
    # ============== Scrape the main page END ==============

    # ============== Compare against the database =================
    confirmed_new = []
    for a in main_page_item:
        try:
            # parameterized query: titles containing quotes no longer break the statement
            sql = 'SELECT news_id, news_title FROM ttd.news WHERE news_title = %s;'
            mycursor.execute(sql, (str(a),))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(main_page_item[a])
            else:
                pass
        except:
            lw.log_writer('东方财富脚本首页添加新闻错误')
            pass

    lw.log_writer('东方财富脚本本轮新增新闻有' + str(len(confirmed_new)) + '条')
    # ============== Compare against the database END =================

    # ============== Main scraping code =================
    if len(confirmed_new) == 0:
        print('没有发现新增新闻,即将关闭DB链接')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = 'INSERT INTO ttd.news (news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
            rst = parsingContent(link)
            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor, str(rst['news_title']), str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']), str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag, topic_tag)
            try:
                mycursor.execute(sql, val)
                mydb.commit()
            except:
                lw.log_writer('东方财富脚本在添加数据时失败')
            lw.log_writer('东方财富脚本新增' + str(mycursor.rowcount) + '条')
            minorRandomPause()
        
        lw.log_writer('东方财富脚本本轮结束')
        mydb.close()
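The duplicate check against ttd.news is likewise repeated in every main(). A sketch of a shared, parameterized helper (the name is_new is hypothetical; it assumes the ttd.news columns used above):

def is_new(mycursor, title, link):
    """Return True if neither the title nor the link is already stored in ttd.news."""
    sql = 'SELECT news_title, news_link FROM ttd.news WHERE news_title = %s OR news_link = %s;'
    mycursor.execute(sql, (title, link))
    return len(mycursor.fetchall()) == 0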
Code example #11
def main():
    print('中国证券网')
    print()

    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
    print(len(mycursor.fetchall()), ' Connection works')
    print()
    # ============= Test connection END =============

    r = requests.get('http://www.cs.com.cn/')
    soup = BeautifulSoup(r.content, features='html.parser')

    # ============== Scrape the main page ==============
    main_page_item = {}

    top_part = soup.find('div', {
        'class': 'box410 ch_focus space_l1'
    }).findAll('li')
    for i in top_part:
        try:
            if 'http' not in i.find('a').get('href'):
                main_page_item[i.text] = i.find('a').get('href')
        except:
            pass

    mid_part = soup.find('div', {'class': 'box_l1'}).findAll('li')
    for i in mid_part:
        try:
            if 'http' not in i.find('a').get('href'):
                main_page_item[i.text] = i.find('a').get('href')
        except:
            pass

    print('共', len(main_page_item), '个结果')
    print()
    # ============== Scrape the main page END ==============

    # ============== Compare against the database =================
    confirmed_new = []
    for a in main_page_item:
        try:
            # parameterized query: titles containing quotes no longer break the statement
            sql = 'SELECT news_id, news_title FROM ttd.news WHERE news_title = %s;'
            mycursor.execute(sql, (str(a),))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(main_page_item[a])
            else:
                pass
        except:
            lw.log_writer('中国证券报首页添加新闻错误')
            pass

    print('本轮新的新闻有', len(confirmed_new), '条')
    # ============== Compare against the database END =================

    # ============== Main scraping code =================
    if len(confirmed_new) == 0:
        print('没有发现新增新闻,即将关闭DB链接')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = 'INSERT INTO ttd.news (news_title, news_source, news_date, news_content, news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)'
            rst = parsingContent(link[1:])
            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor,
                                                str(rst['news_title']),
                                                str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor,
                                                str(rst['news_title']),
                                                str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor,
                                                      str(rst['news_title']),
                                                      str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']),
                   str(rst['news_date']), str(rst['news_content']),
                   str(rst['news_link']), gov_tag, com_tag, topic_tag)
            mycursor.execute(sql, val)
            mydb.commit()
            lw.log_writer('中国证券报' + str(mycursor.rowcount) + '条')
            minorRandomPause()

        lw.log_writer('中国证券报脚本本轮结束')
        mydb.close()
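Several of the scripts juggle relative and absolute links by hand: prefixing a base URL inside parsingContent, filtering hrefs that contain 'http', or stripping the leading character with link[1:]. urllib.parse.urljoin handles both cases uniformly; a short sketch with made-up example paths, using the cs.com.cn base from the script above:

from urllib.parse import urljoin

base = 'http://www.cs.com.cn/'
# a relative href (as found on the listing page) becomes an absolute URL
print(urljoin(base, '/some/relative/article.html'))    # http://www.cs.com.cn/some/relative/article.html
# an href that is already absolute passes through unchanged
print(urljoin(base, 'http://example.com/other.html'))  # http://example.com/other.html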