import json
import re

from bs4 import BeautifulSoup


def get_hjbk_page(url):  # Collect today's article URLs from the gold-blog (黄金博客) column
    column_url = url + '/page'
    page_num = 1
    article_list = []

    def get_url(page_num):
        # The column exposes a paged JSON API; each item's NewsID encodes the date.
        html = json.loads(post_data(column_url, {'page': page_num}))['data']
        added_url = 0
        for i in html:
            try:
                wenzhang_url = i['NewsID']
            except (KeyError, TypeError):
                continue
            search_obj = re.search(r'{0}.+?'.format(today_date), wenzhang_url)
            if not search_obj:
                continue
            wenzhang_url = url_base + '/C/' + wenzhang_url[:8] + '/' + wenzhang_url + '.html'
            article_list.append(wenzhang_url)
            added_url += 1
        # Keep paging only while every item on the page was published today.
        if added_url == len(html):
            return 1
        else:
            return 0

    while True:
        status = get_url(page_num)
        if status:
            page_num += 1
        else:
            break
    parse_log.debug(article_list)
    return len(article_list), article_list
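# The scrapers in this file rely on module-level helpers defined elsewhere in the
# project (get_data, post_data, parse_log, today_date, url_base, get_page,
# handle_article_CMDs). The following is only a minimal sketch of what those
# helpers are assumed to look like; the names are the ones the functions here
# call, but the bodies, the example url_base value and the date format are
# assumptions, not the project's actual implementation.
import datetime
import logging

import requests

url_base = 'http://example.com'  # assumed: root URL of the portal being crawled
today_date = datetime.date.today().strftime('%Y%m%d')  # assumed: YYYYMMDD, matched inside article URLs

logging.basicConfig(level=logging.DEBUG)
parse_log = logging.getLogger('parse')  # assumed: shared module logger


def get_data(url):
    # Assumed helper: GET a page and return its decoded body.
    resp = requests.get(url, timeout=10)
    resp.encoding = resp.apparent_encoding
    return resp.text


def post_data(url, payload):
    # Assumed helper: POST form data and return the raw response body.
    resp = requests.post(url, data=payload, timeout=10)
    return resp.text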
def find_data(url):  # Collect the navigation-bar URLs
    nav_url = []
    html = get_data(url)
    soup = BeautifulSoup(html, "html.parser")
    h2 = soup.find_all('h2')
    for i in h2:
        try:
            wenzhang_url = i.find('a')['href']
        except (KeyError, TypeError):
            continue
        # Keep only links that point at the licai-news section.
        search_obj = re.search(r'.+licai-news.+', wenzhang_url)
        if not search_obj:
            continue
        type_name = i.find('b').text
        nav_url.append({'type_name': type_name, 'type_url': wenzhang_url})
    parse_log.debug(nav_url)
    nav_url.append({
        'type_name': u'要闻',
        'type_url': u'https://www.rong360.com/licai-news/?typeid=151'
    })
    return nav_url
def find_data(url): #获取导航栏URL html = get_data(url) soup = BeautifulSoup(html, "html.parser") td = soup.find_all('div', class_='m-nav') nav = td[0] li = nav.find_all('a') nav_url = [] for i in li: parse_log.debug(i) nav_url.append({'type_name': i.text, 'type_url': i['href']}) parse_log.debug(nav_url) return nav_url
def find_data(url): #获取导航栏URL html = get_data(url) soup = BeautifulSoup(html, "html.parser") td = soup.find_all('div', class_='overview bg-white mb20 clearfix') nav = td[0] li = nav.find_all('li') parse_log.debug(li) a = [] nav_url = [] for i in li: a.append(i.find('a')) parse_log.debug(nav_url) filter_list = [ u'黄金答疑', u'现货黄金', u'黄金T+D', u'纸黄金', u'现货白银', u'白银T+D', u'实物黄金', u'铂-银-钯' ] for i in a: if i.text in filter_list: continue parse_log.debug('u\'' + i.text + '\'' + ':' + '\'\'' + ',') nav_url.append({'type_name': i.text, 'type_url': url_base + i['href']}) parse_log.debug(nav_url) return nav_url
def get_all_article():
    article = []
    nav_url = find_data(url_base)
    article_num = 0
    article_list = []
    # nav_url.append(url_base)
    parse_log.info('Start crawling all of today\'s article links....')
    for i in nav_url:  # Walk the navigation-bar links
        if i['type_name'] == u'黄金博客':
            page_num, art_list = get_hjbk_page(i['type_url'])
        else:
            page_num, art_list = get_page(i['type_url'])
        if not page_num:  # Skip categories with no articles today
            continue
        parse_log.debug(u'Collected {0} article links for column {1}'.format(
            page_num, i['type_name']))
        article_list.append({
            'type_name': i['type_name'],
            'art_list': art_list
        })
    total = 0
    upload_end = []
    for i in article_list:
        # upload_type = news_type[i['type_name']]
        waiting_upload = i['art_list']
        parse_log.debug('Start uploading, {0} articles pending'.format(
            len(waiting_upload)))
        total = total + len(waiting_upload)
        for a in waiting_upload:
            if a not in upload_end:
                article_title, artibody = handle_article_CMDs[i['type_name']](a)
                if not artibody or not article_title:
                    continue
                article.append({
                    'title': article_title,
                    'body': artibody,
                    'type': i['type_name'],
                    'url': a,
                    'source_url': url_base
                })
                parse_log.debug(u'Uploading column {0}.....{1}/{2}.........{3}'.format(
                    i['type_name'], waiting_upload.index(a), len(waiting_upload), a))
                upload_end.append(a)
            else:
                parse_log.debug('Already uploaded')
                continue
    parse_log.info('Collected {0} articles'.format(len(article)))
    return article
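# A minimal, assumed driver showing how the functions above fit together:
# find_data() builds the navigation list from url_base and get_all_article()
# walks it, returning one dict per article. The actual upload step is not part
# of this file; `upload_articles` below is a hypothetical name used only to mark
# where the results would be handed off.
if __name__ == '__main__':
    articles = get_all_article()
    for art in articles:
        parse_log.info(u'{0} | {1} | {2}'.format(art['type'], art['title'], art['url']))
    # upload_articles(articles)  # hypothetical hand-off to the uploader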
def get_url_rules_3(url):  # Collect today's article URLs from a paged column
    column_url = url
    page_num = 1
    article_list = []

    def get_rules_more(page_num):
        # The first page is the column index itself; later pages append a numeric
        # suffix, e.g. http://money.163.com/special/00252G50/macro_02.html
        if page_num == 1:
            page_url = column_url
        else:
            page_url = column_url[:-len('.html')] + '_' + '%02d' % page_num + '.html'
        html = get_data(page_url)
        added_url = 0
        soup = BeautifulSoup(html, "html.parser")
        a_list_div = soup.find('div', class_='col_l')
        if a_list_div is None:
            return 0
        a = a_list_div.find_all('a')
        for i in a:
            try:
                wenzhang_url = i['href']
            except (KeyError, TypeError):
                continue
            search_obj = re.search(r'http://.+{0}.+?html'.format(today_date),
                                   wenzhang_url)
            if not search_obj:
                continue
            article_list.append(wenzhang_url)
            added_url += 1
        # Fetch the next page only if this one still contained today's articles.
        if added_url:
            return 1
        else:
            return 0

    while True:
        status = get_rules_more(page_num)
        if status:
            page_num += 1
        else:
            break
    parse_log.debug(article_list)
    return len(article_list), article_list
def get_page(url):  # Collect today's article URLs from a column page
    column_url = url
    article_list = []

    def get_rules_page_1():
        html = get_data(column_url)
        soup = BeautifulSoup(html, "html.parser")
        a = soup.find_all('a')
        for i in a:
            try:
                wenzhang_url = i['href']
            except (KeyError, TypeError):
                continue
            search_obj = re.search(r'http://.+{0}.+?html'.format(today_date),
                                   wenzhang_url)
            if not search_obj:
                continue
            article_list.append(wenzhang_url)

    def get_rules_more():
        data = get_data(
            'http://money.163.com/special/002557S5/newsdata_idx_index.js?callback=data_callback'
        )
        # Strip the JSONP wrapper "data_callback(...)" before parsing the payload.
        data = str(data).strip()
        if data.startswith('data_callback'):
            data = data[len('data_callback'):]
        html = json.loads(data.strip('()'))
        for i in html:
            wenzhang_url = i['docurl']
            search_obj = re.search(r'http://.+{0}.+?html'.format(today_date),
                                   wenzhang_url)
            if not search_obj:
                continue
            article_list.append(wenzhang_url)

    get_rules_page_1()
    # get_rules_more()
    # article_list = list(set(article_list))
    parse_log.debug(article_list)
    return len(article_list), article_list
def get_url_rules_2(url):  # Collect today's article URLs from a column page
    column_url = url
    article_list = []
    html = get_data(column_url)
    soup = BeautifulSoup(html, "html.parser")
    a_list_div = soup.find('div', class_='col_l')
    a = a_list_div.find_all('a')
    for i in a:
        try:
            wenzhang_url = i['href']
        except (KeyError, TypeError):
            continue
        search_obj = re.search(r'http://.+{0}.+?html'.format(today_date),
                               wenzhang_url)
        if not search_obj:
            continue
        article_list.append(wenzhang_url)
    parse_log.debug(article_list)
    return len(article_list), article_list
def find_data(url): #获取导航栏URL html = get_data(url) soup = BeautifulSoup(html, "html.parser") try: nav = soup.find('div', class_='nav common_wrap') except: return [] li = nav.find_all('a') parse_log.debug(li) nav_url = [] parse_log.debug(nav_url) filter_list = [ u'行情', u'大盘', u'净值', u'评论', u'百科', u'博客', u'专题', u'滚动', u'港股' ] for i in li: if i.text in filter_list: continue parse_log.debug('u\'' + i.text + '\'' + ':' + '\'\'' + ',') nav_url.append({'type_name': i.text, 'type_url': i['href']}) parse_log.debug(nav_url) return nav_url