# Shared imports for the scraper functions below. Project-internal modules such as
# lw (log writer), module_news_govTag, module_news_comTag, module_news_topicTag and
# the connectDB() helper are provided elsewhere in the project and are not imported here.
import random
import time

import requests
from bs4 import BeautifulSoup


def parsingContent(link):
    # Parse one article page from the Central Political and Legal Affairs Commission site.
    t = time.localtime()
    news_date = (str(t.tm_year) + '-' + str(t.tm_mon) + '-' + str(t.tm_mday) +
                 '-' + str(t.tm_hour) + '-' + str(t.tm_min))
    title = ''
    content = ''
    fullLink = 'http://www.chinapeace.gov.cn' + link
    try:
        r = requests.get(fullLink)
        s = BeautifulSoup(r.content, features='html.parser')
    except Exception:
        lw.log_writer('中央政法委员会' + fullLink + '失败')
        return {
            'news_link': fullLink.strip(),
            'news_title': title.strip(),
            'news_source': '10',
            'news_content': content.strip(),
            'news_date': news_date
        }
    lw.log_writer('中央政法委员会开始爬取' + fullLink)
    try:
        # Article title
        title = s.find('div', {'class': 'title'}).text.replace('\n', '')
    except Exception:
        lw.log_writer('中央政法委员会获取标题错误')
    try:
        # Article body: keep <p> tags longer than 5 characters that contain no image
        contentList = s.find('div', {'class': 'content_main'}).findAll('p')
        for p in contentList:
            if len(p.text) > 5 and p.find('img') is None:
                content += '<p>' + p.text.replace('\n', '') + '</p>'
    except Exception:
        lw.log_writer('中央政法委员会获取内容错误')
    rst = {
        'news_link': fullLink.strip(),
        'news_title': title.strip(),
        'news_source': '中央政法委',
        'news_content': content.strip(),
        'news_date': news_date
    }
    return rst
def parsingContent(link):
    # Parse one fund-news article from Eastmoney.
    t = time.localtime()
    news_date = (str(t.tm_year) + '-' + str(t.tm_mon) + '-' + str(t.tm_mday) +
                 '-' + str(t.tm_hour) + '-' + str(t.tm_min))
    title = ''
    content = ''
    fullLink = 'http://fund.eastmoney.com/a/' + link
    try:
        r = requests.get(fullLink)
        s = BeautifulSoup(r.content, features='html.parser')
    except Exception:
        lw.log_writer('东方财富脚本爬取' + fullLink + '失败')
        return {
            'news_link': fullLink.strip(),
            'news_title': title.strip(),
            'news_source': '东方财富基金资讯',
            'news_content': content.strip(),
            'news_date': news_date
        }
    lw.log_writer('东方财富脚本开始爬取' + fullLink)
    try:
        # Article title
        title = s.find('h1').text.replace('\n', '')
    except Exception:
        lw.log_writer('东方财富脚本爬取标题错误')
    try:
        # Article body: keep <p> tags longer than 5 characters that contain no image
        contentList = s.findAll('div', {'id': 'ContentBody'})[0].findAll('p')
        for p in contentList:
            if len(p.text) > 5 and p.find('img') is None:
                content += '<p>' + p.text.replace('\n', '') + '</p>'
    except Exception:
        lw.log_writer('东方财富脚本爬取内容错误')
    rst = {
        'news_link': fullLink.strip(),
        'news_title': title.strip(),
        'news_source': '东方财富基金资讯',
        'news_content': content.strip(),
        'news_date': news_date
    }
    return rst
def parsingContent(link, title):
    # Parse one Xinhua finance article. The title is taken from the list page and
    # passed in by the caller, so it is not scraped from the article itself.
    t = time.localtime()
    news_date = (str(t.tm_year) + '-' + str(t.tm_mon) + '-' + str(t.tm_mday) +
                 '-' + str(t.tm_hour) + '-' + str(t.tm_min))
    content = ''
    fullLink = link
    try:
        r = requests.get(fullLink)
        s = BeautifulSoup(r.content, features='html.parser')
    except Exception:
        lw.log_writer('新华网财经' + fullLink + '失败')
        return {
            'news_link': fullLink.strip(),
            'news_title': title.strip(),
            'news_source': '11',
            'news_content': content.strip(),
            'news_date': news_date
        }
    lw.log_writer('新华网财经开始爬取' + fullLink)
    try:
        # Article body: keep <p> tags longer than 5 characters that contain no image
        contentList = s.findAll('p')
        for p in contentList:
            if len(p.text) > 5 and p.find('img') is None:
                content += '<p>' + p.text.replace('\n', '') + '</p>'
    except Exception:
        lw.log_writer('新华网财经获取内容错误')
    rst = {
        'news_link': fullLink.strip(),
        'news_title': title.strip(),
        'news_source': '11',
        'news_content': content.strip(),
        'news_date': news_date
    }
    return rst
def parsingContent(link):
    # Parse one article from China Securities Journal (cs.com.cn).
    fullLink = 'http://www.cs.com.cn' + link
    r = requests.get(fullLink)
    s = BeautifulSoup(r.content, features='html.parser')
    lw.log_writer('中国证券报脚本开始爬取' + fullLink)
    title = ''
    content = ''
    try:
        # Article title
        title = s.find('h1').text.replace('\n', '')
    except Exception:
        lw.log_writer('中国证券报文章标题获取错误')
    try:
        # Article body: keep tags longer than 5 characters that contain no image
        # and no TRS editor boilerplate
        contentList = s.find('section').findAll()
        for p in contentList:
            if len(p.text) > 5 and p.find('img') is None and 'TRS' not in p.text:
                content += '<p>' + p.text.replace('\n', '') + '</p>'
    except Exception:
        lw.log_writer('中国证券报文章内容爬取错误')
    t = time.localtime()
    news_date = (str(t.tm_year) + '-' + str(t.tm_mon) + '-' + str(t.tm_mday) +
                 '-' + str(t.tm_hour) + '-' + str(t.tm_min))
    rst = {
        'news_link': fullLink.strip(),
        'news_title': title.strip(),
        'news_source': '8',
        'news_content': content.strip(),
        'news_date': news_date
    }
    return rst
def parsingContent(link):
    # Parse one article from Caixin; links on the list page are already absolute.
    page = requests.get(link)
    s = BeautifulSoup(page.content, features='html.parser')
    lw.log_writer('财新网脚本开始爬取' + link)
    title = ''
    content = ''
    try:
        # Article title
        title = s.find('div', {'id': 'the_content'}).find('h1').text.replace('\n', '').replace('\r', '')
    except Exception:
        lw.log_writer('财新网文章标题获取错误')
    try:
        # Article body: keep <p> tags that contain no image, preserving the original markup
        contentList = s.find('div', {'id': 'Main_Content_Val'}).findAll('p')
        for p in contentList:
            if p.find('img') is None:
                content += str(p).replace('\r', '').replace('\n', '')
    except Exception:
        lw.log_writer('财新网文章内容获取错误')
    t = time.localtime()
    news_date = (str(t.tm_year) + '-' + str(t.tm_mon) + '-' + str(t.tm_mday) +
                 '-' + str(t.tm_hour) + '-' + str(t.tm_min))
    rst = {
        'news_link': link.strip(),
        'news_title': title.strip(),
        'news_source': '1',
        'news_content': content.strip(),
        'news_date': news_date
    }
    return rst
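# NOTE: sketch, not part of the original scripts. Each parsingContent() above repeats the
# same body-extraction loop (keep tags longer than 5 characters that wrap no image), so the
# loop could be factored into one shared helper along these lines:
def collectParagraphs(tags):
    # Concatenate qualifying tags into the <p>-wrapped content string used above.
    content = ''
    for p in tags:
        if len(p.text) > 5 and p.find('img') is None:
            content += '<p>' + p.text.replace('\n', '') + '</p>'
    return content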
def main():
    print('新华网财经')
    print()
    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    # ============= Test connection END =============
    try:
        r = requests.get('http://xinhuanet.com/fortunepro/')
        soup = BeautifulSoup(r.content, features='html.parser')
    except Exception:
        return
    # ============== List page scraping ================
    news_list = soup.find('ul', {'class': 'silder_nav clearfix'}).findAll('li')
    news_list_item = {}  # maps article link -> article title
    for i in news_list:
        try:
            news_list_item[i.find('a').get('href')] = i.find('a').text
        except Exception:
            pass
    print('共', len(news_list_item), '个结果')
    print()
    # ============== List page scraping END ===============
    # ============== Database comparison =================
    confirmed_new = []
    for a in news_list_item:
        try:
            sql = ('SELECT news_title, news_link FROM ttd.news WHERE news_title =\'' +
                   str(news_list_item[a]) + '\' or news_link = \'' + str(a) + '\';')
            mycursor.execute(sql)
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(a)
        except Exception:
            lw.log_writer('新华网财经添加新闻错误')
    lw.log_writer('新华网财经本轮新增新闻有' + str(len(confirmed_new)) + '条')
    # ============== Database comparison END =================
    # ============== Main scraping loop =================
    if len(confirmed_new) == 0:
        print('没有发现新增新闻,即将关闭DB链接')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = ('INSERT INTO ttd.news (news_title, news_source, news_date, news_content, '
                   'news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
            rst = parsingContent(link, news_list_item[link])
            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor, str(rst['news_title']), str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']),
                   str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag, topic_tag)
            try:
                mycursor.execute(sql, val)
                mydb.commit()
            except Exception:
                lw.log_writer('新华网财经在添加数据时失败')
            lw.log_writer('新华网财经新增' + str(mycursor.rowcount) + '条')
            minorRandomPause()
        lw.log_writer('新华网财经本轮结束')
        mydb.close()
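# NOTE: sketch, not part of the original scripts. The duplicate check above builds its SELECT
# by string concatenation, which breaks on titles containing quote characters; mysql.connector
# also accepts %s placeholders, so a parameterized version of the same check could look like:
def isNewNews(mycursor, title, link):
    # Returns True when neither the title nor the link is already stored in ttd.news.
    sql = ('SELECT news_title, news_link FROM ttd.news '
           'WHERE news_title = %s OR news_link = %s;')
    mycursor.execute(sql, (title, link))
    return len(mycursor.fetchall()) == 0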
def minorRandomPause():
    # Sleep for a random 600-900 seconds between article fetches.
    randomTime = random.randint(600, 900)
    lw.log_writer('新华网财经脚本进入休眠' + str(randomTime) + '秒')
    time.sleep(randomTime)
def main():
    print('中共中央政法委员会')
    print()
    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    # ============= Test connection END =============
    try:
        r = requests.get('http://www.chinapeace.gov.cn/chinapeace/c100008/list2020.shtml')
        soup = BeautifulSoup(r.content, features='html.parser')
    except Exception:
        return
    # ============== List page scraping ================
    news_list = soup.find('div', {'class': 'w1200 bgfff'}).find(
        'div', {'class': 'list_box_left'}).findAll('li')
    news_list_item = {}  # maps article title -> article link
    for i in news_list:
        currentLi = i.findAll('a')
        for a in currentLi:
            news_list_item[a.text] = a.get('href')
    print('共', len(news_list_item), '个结果')
    print()
    # ============== List page scraping END ===============
    # ============== Database comparison =================
    confirmed_new = []
    for a in news_list_item:
        try:
            sql = ('SELECT news_title, news_link FROM ttd.news WHERE news_title =\'' + str(a) +
                   '\' or news_link = \'' + str(news_list_item[a]) + '\';')
            mycursor.execute(sql)
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(news_list_item[a])
        except Exception:
            lw.log_writer('中央政法委员会首页添加新闻错误')
    lw.log_writer('中央政法委员会本轮新增新闻有' + str(len(confirmed_new)) + '条')
    # ============== Database comparison END =================
    # ============== Main scraping loop =================
    if len(confirmed_new) == 0:
        print('没有发现新增新闻,即将关闭DB链接')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = ('INSERT INTO ttd.news (news_title, news_source, news_date, news_content, '
                   'news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
            rst = parsingContent(link)
            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor, str(rst['news_title']), str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']),
                   str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag, topic_tag)
            try:
                mycursor.execute(sql, val)
                mydb.commit()
            except Exception:
                lw.log_writer('中央政法委员会在添加数据时失败')
            lw.log_writer('中央政法委新增' + str(mycursor.rowcount) + '条')
            minorRandomPause()
        lw.log_writer('中央政法委本轮结束')
        mydb.close()
def main():
    print('财新网')
    print()
    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
    mycursor.fetchall()
    # ============= Test connection END =============
    r = requests.get('http://www.caixin.com/')
    soup = BeautifulSoup(r.content, features='html.parser')
    main_page_item = {}  # holds every item scraped from this page
    # ============= List page scraping =============
    main_list = soup.find('div', {'class': 'news_list'}).findAll('dl')
    for item in main_list:
        a = item.find('dd').find('p').find('a')
        main_page_item[a.text] = a.get('href')
    print('This round has ', len(main_page_item), ' items')
    print()
    # ============= List page scraping END =============
    # ============== Database comparison =================
    confirmed_new = []
    for a in main_page_item:
        try:
            sql = 'SELECT news_id, news_title FROM ttd.news WHERE news_title=\'' + str(a) + '\';'
            mycursor.execute(sql)
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(main_page_item[a])
        except Exception:
            print('添加新的新闻错误')
            print()
    print('本轮新的新闻有', len(confirmed_new), '条')
    # ============== Database comparison END =================
    if len(confirmed_new) == 0:
        print('没有发现新增新闻,即将关闭DB链接')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            try:
                sql = ('INSERT INTO ttd.news (news_title, news_source, news_date, news_content, '
                       'news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
                rst = parsingContent(link)
                # ======= Tags - added 12.15 ==========
                gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
                com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
                topic_tag = module_news_topicTag.tagTopic(mycursor, str(rst['news_title']), str(rst['news_content']))
                # ======= Tags - added 12.15 END ==========
                val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']),
                       str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag, topic_tag)
                mycursor.execute(sql, val)
                mydb.commit()
                lw.log_writer('财新网' + str(mycursor.rowcount) + '条')
                minorRandomPause()
            except Exception:
                print('Getting info error')
                print()
                break
        lw.log_writer('财新网脚本本轮结束')
        mydb.close()
def main():
    print('天天基金网新闻')
    print()
    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
    print(len(mycursor.fetchall()), ' Connection works')
    print()
    # ============= Test connection END =============
    r = requests.get('http://fund.eastmoney.com/a/cjjyw.html')
    soup = BeautifulSoup(r.content, features='html.parser')
    # ============== List page scraping ==============
    main_list = soup.find('div', {'class': 'mainCont'}).findAll('ul')  # the page contains 4 <ul> blocks here
    main_page_item = {}  # holds every item scraped from this page
    for i in main_list:
        currentUl = i.findAll('a')
        for a in currentUl:
            main_page_item[a.text] = a.get('href')
    print('共', len(main_page_item), '个结果')
    print()
    # ============== List page scraping END ==============
    # ============== Database comparison =================
    confirmed_new = []
    for a in main_page_item:
        try:
            sql = 'SELECT news_id, news_title FROM ttd.news WHERE news_title=\'' + str(a) + '\';'
            mycursor.execute(sql)
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(main_page_item[a])
        except Exception:
            lw.log_writer('东方财富脚本首页添加新闻错误')
    lw.log_writer('东方财富脚本本轮新增新闻有' + str(len(confirmed_new)) + '条')
    # ============== Database comparison END =================
    # ============== Main scraping loop =================
    if len(confirmed_new) == 0:
        print('没有发现新增新闻,即将关闭DB链接')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = ('INSERT INTO ttd.news (news_title, news_source, news_date, news_content, '
                   'news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
            rst = parsingContent(link)
            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor, str(rst['news_title']), str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']),
                   str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag, topic_tag)
            try:
                mycursor.execute(sql, val)
                mydb.commit()
            except Exception:
                lw.log_writer('东方财富脚本在添加数据时失败')
            lw.log_writer('东方财富脚本新增' + str(mycursor.rowcount) + '条')
            minorRandomPause()
        lw.log_writer('东方财富脚本本轮结束')
        mydb.close()
def main():
    print('中国证券网')
    print()
    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
    print(len(mycursor.fetchall()), ' Connection works')
    print()
    # ============= Test connection END =============
    r = requests.get('http://www.cs.com.cn/')
    soup = BeautifulSoup(r.content, features='html.parser')
    # ============== List page scraping ==============
    main_page_item = {}  # maps article title -> relative link
    top_part = soup.find('div', {'class': 'box410 ch_focus space_l1'}).findAll('li')
    for i in top_part:
        try:
            # keep only on-site (relative) links
            if 'http' not in i.find('a').get('href'):
                main_page_item[i.text] = i.find('a').get('href')
        except Exception:
            pass
    mid_part = soup.find('div', {'class': 'box_l1'}).findAll('li')
    for i in mid_part:
        try:
            if 'http' not in i.find('a').get('href'):
                main_page_item[i.text] = i.find('a').get('href')
        except Exception:
            pass
    print('共', len(main_page_item), '个结果')
    print()
    # ============== List page scraping END ==============
    # ============== Database comparison =================
    confirmed_new = []
    for a in main_page_item:
        try:
            sql = 'SELECT news_id, news_title FROM ttd.news WHERE news_title=\'' + str(a) + '\';'
            mycursor.execute(sql)
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(main_page_item[a])
        except Exception:
            lw.log_writer('中国证券报首页添加新闻错误')
    print('本轮新的新闻有', len(confirmed_new), '条')
    # ============== Database comparison END =================
    # ============== Main scraping loop =================
    if len(confirmed_new) == 0:
        print('没有发现新增新闻,即将关闭DB链接')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = ('INSERT INTO ttd.news (news_title, news_source, news_date, news_content, '
                   'news_link, gov_tag, com_tag, topic_tag) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
            rst = parsingContent(link[1:])  # drop the first character of the relative link before joining with the site root
            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor, str(rst['news_title']), str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']),
                   str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag, topic_tag)
            mycursor.execute(sql, val)
            mydb.commit()
            lw.log_writer('中国证券报' + str(mycursor.rowcount) + '条')
            minorRandomPause()
        lw.log_writer('中国证券报脚本本轮结束')
        mydb.close()
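# NOTE: sketch, not part of the original scripts. Every main() above relies on a connectDB()
# helper that is defined elsewhere in the project; assuming mysql.connector and placeholder
# credentials, a minimal version could look like this:
import mysql.connector


def connectDB():
    return mysql.connector.connect(
        host='localhost',      # assumed host
        user='ttd_user',       # placeholder user
        password='password',   # placeholder password
        database='ttd',        # schema name taken from the queries above
    )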