Example #1
def get_hjbk_page(url):
    # Fetch article URL links; collect today's articles.
    column_url = url + '/page'
    page_num = 1
    article_list = []

    def get_url(page_num):
        html = json.loads(post_data(column_url, {'page': page_num}))['data']
        added_url = 0
        for i in html:
            try:
                wenzhang_url = i['NewsID']
            except (KeyError, TypeError):
                continue
            # Keep only news IDs that contain today's date.
            search_obj = re.search(r'{0}.+?'.format(today_date), wenzhang_url)
            if not search_obj:
                continue
            # Build the article URL; the first 8 characters of the ID form the date folder.
            wenzhang_url = (url_base + '/C/' + wenzhang_url[:8] + '/' +
                            wenzhang_url + '.html')
            article_list.append(wenzhang_url)
            added_url += 1

        # If every entry on this page matched today's date, the next page may
        # hold more; otherwise stop paging.
        if added_url == len(html):
            return 1
        else:
            return 0

    while True:
        status = get_url(page_num)
        if status:
            page_num += 1
        else:
            break
    parse_log.debug(article_list)
    return len(article_list), article_list
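
All of these examples assume shared helpers and globals defined elsewhere in the module: get_data, post_data, parse_log, today_date, url_base, plus the news_type and handle_article_CMDs mappings used by get_all_article. A minimal sketch of that assumed context is shown below; the implementations and the url_base value are hypothetical, inferred from how the snippets use them.

import json
import logging
import re
import time

import requests
from bs4 import BeautifulSoup

parse_log = logging.getLogger('parse')

# The [:8] slice and the date regexes suggest an 8-digit YYYYMMDD string.
today_date = time.strftime('%Y%m%d')

# Placeholder; each example targets a different site's base URL.
url_base = 'http://example.com'


def get_data(url):
    # Plain GET; the snippets treat the result as an HTML/JS text body.
    return requests.get(url).text


def post_data(url, payload):
    # Form POST; Example #1 parses the response body as JSON.
    return requests.post(url, data=payload).text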
Example #2
def find_data(url):
    # Fetch the navigation bar URLs.
    nav_url = []
    html = get_data(url)
    soup = BeautifulSoup(html, "html.parser")
    h2 = soup.find_all('h2')
    for i in h2:
        try:
            wenzhang_url = i.find('a')['href']
        except (KeyError, TypeError):
            continue
        search_obj = re.search(r'.+licai-news.+', wenzhang_url)
        if not search_obj:
            continue
        type_name = i.find('b').text
        nav_url.append({'type_name': type_name, 'type_url': wenzhang_url})
    parse_log.debug(nav_url)
    nav_url.append({
        'type_name': u'要闻',
        'type_url': u'https://www.rong360.com/licai-news/?typeid=151'
    })

    return nav_url
Example #3
def find_data(url):
    # Fetch the navigation bar URLs.
    html = get_data(url)
    soup = BeautifulSoup(html, "html.parser")
    td = soup.find_all('div', class_='m-nav')
    nav = td[0]
    li = nav.find_all('a')
    nav_url = []
    for i in li:
        parse_log.debug(i)
        nav_url.append({'type_name': i.text, 'type_url': i['href']})
    parse_log.debug(nav_url)
    return nav_url
def find_data(url):
    # Fetch the navigation bar URLs.
    html = get_data(url)
    soup = BeautifulSoup(html, "html.parser")
    td = soup.find_all('div', class_='overview bg-white mb20 clearfix')
    nav = td[0]
    li = nav.find_all('li')
    parse_log.debug(li)
    a = []
    nav_url = []
    for i in li:
        a.append(i.find('a'))
    parse_log.debug(nav_url)
    filter_list = [
        u'黄金答疑', u'现货黄金', u'黄金T+D', u'纸黄金', u'现货白银', u'白银T+D', u'实物黄金',
        u'铂-银-钯'
    ]
    for i in a:
        # Skip missing anchors and the columns we filter out.
        if i is None or i.text in filter_list:
            continue
        parse_log.debug('u\'' + i.text + '\'' + ':' + '\'\'' + ',')
        nav_url.append({'type_name': i.text, 'type_url': url_base + i['href']})

    parse_log.debug(nav_url)
    return nav_url
def get_all_article():

    article = []
    nav_url = find_data(url_base)
    article_num = 0
    article_list = []
    # nav_url.append(url_base)
    parse_log.info("Start crawling all of today's article links....")
    for i in nav_url:
        # Walk the navigation bar links.
        if i['type_name'] == u'黄金博客':
            page_num, art_list = get_hjbk_page(i['type_url'])
        else:
            page_num, art_list = get_page(i['type_url'])
        if not page_num:
            # Skip categories with no articles.
            continue
        parse_log.debug(u'Fetched {0} article links for {1}'.format(
            page_num, i['type_name']))
        article_list.append({
            'type_name': i['type_name'],
            'art_list': art_list
        })

    total = 0
    upload_end = []

    for i in article_list:
        #upload_type = news_type[i['type_name']]
        waiting_upload = i['art_list']
        parse_log.debug('Starting upload, {0} items pending'.format(
            len(waiting_upload)))
        total = total + len(waiting_upload)

        for a in waiting_upload:
            if a not in upload_end:
                handler = handle_article_CMDs[i['type_name']]
                article_title, artibody = handler(a)
                if not artibody or not article_title:
                    continue
                article.append({
                    'title': article_title,
                    'body': artibody,
                    'type': i['type_name'],
                    'url': a,
                    'source_url': url_base
                })
                parse_log.debug(u'Uploading column {0}.....{1}/{2}.........{3}'.format(
                    i['type_name'], waiting_upload.index(a),
                    len(waiting_upload), a))
                upload_end.append(a)
            else:
                parse_log.debug('Already exists')
                continue

    parse_log.info('Fetched {0} articles'.format(len(article)))
    return article
def get_url_rules_3(url):
    # Fetch article URL links; collect today's articles.
    column_url = url
    page_num = 2
    article_list = []

    def get_rules_more(page_url):
        # Paged column URLs look like
        # 'http://money.163.com/special/00252G50/macro_02.html'.
        html = get_data(page_url)
        added_url = 0
        soup = BeautifulSoup(html, "html.parser")
        a_list_div = soup.find('div', class_='col_l')
        a = a_list_div.find_all('a')
        for i in a:
            try:
                wenzhang_url = i['href']
            except (KeyError, TypeError):
                continue
            search_obj = re.search(r'http://.+{0}.+?html'.format(today_date),
                                   wenzhang_url)
            if not search_obj:
                continue
            article_list.append(wenzhang_url)
            added_url += 1

        # Report whether this page contributed any of today's links.
        return 1 if added_url else 0

    while True:
        status = get_rules_more(column_url)
        if status:
            # Advance to the next page: <column>.html -> <column>_02.html, ...
            column_url = url[:-len('.html')] + '_%02d.html' % page_num
            page_num += 1
        else:
            break

    parse_log.debug(article_list)
    return len(article_list), article_list
def get_page(url):
    # Fetch article URL links; collect today's articles.
    column_url = url
    article_list = []

    def get_rules_page_1():
        html = get_data(column_url)
        soup = BeautifulSoup(html, "html.parser")
        a = soup.find_all('a')
        for i in a:
            try:
                wenzhang_url = i['href']
            except (KeyError, TypeError):
                continue
            search_obj = re.search(r'http://.+{0}.+?html'.format(today_date),
                                   wenzhang_url)
            if not search_obj:
                continue
            article_list.append(wenzhang_url)

    def get_rules_more():

        data = get_data(
            'http://money.163.com/special/002557S5/newsdata_idx_index.js?callback=data_callback'
        )
        data = str(data)
        # Strip the JSONP wrapper: data_callback([...]) -> [...]
        data = data[len('data_callback'):].strip('()')
        html = json.loads(data)
        for i in html:
            wenzhang_url = i['docurl']
            search_obj = re.search(r'http://{0}.+?html'.format(today_date),
                                   wenzhang_url)
            if not search_obj:
                continue
            article_list.append(wenzhang_url)

    get_rules_page_1()
    #get_rules_more()
    #article_list = list(set(article_list))
    parse_log.debug(article_list)
    return len(article_list), article_list
def get_url_rules_2(url):
    # Fetch article URL links; collect today's articles.
    column_url = url
    article_list = []

    html = get_data(column_url)
    soup = BeautifulSoup(html, "html.parser")
    a_list_div = soup.find('div', class_='col_l')
    a = a_list_div.find_all('a')
    for i in a:
        try:
            wenzhang_url = i['href']
        except (KeyError, TypeError):
            continue
        search_obj = re.search(r'http://.+{0}.+?html'.format(today_date),
                               wenzhang_url)
        if not search_obj:
            continue
        article_list.append(wenzhang_url)

    parse_log.debug(article_list)
    return len(article_list), article_list
def find_data(url):
    # Fetch the navigation bar URLs.
    html = get_data(url)
    soup = BeautifulSoup(html, "html.parser")
    nav = soup.find('div', class_='nav common_wrap')
    if nav is None:
        # Navigation container not found; nothing to parse.
        return []
    li = nav.find_all('a')
    parse_log.debug(li)
    nav_url = []
    parse_log.debug(nav_url)
    filter_list = [
        u'行情', u'大盘', u'净值', u'评论', u'百科', u'博客', u'专题', u'滚动', u'港股'
    ]
    for i in li:
        if i.text in filter_list:
            continue
        parse_log.debug('u\'' + i.text + '\'' + ':' + '\'\'' + ',')
        nav_url.append({'type_name': i.text, 'type_url': i['href']})

    parse_log.debug(nav_url)
    return nav_url
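
A minimal, hypothetical driver tying the pieces together; the actual upload target is not shown in these snippets, so the result is only logged here.

if __name__ == '__main__':
    articles = get_all_article()
    for item in articles:
        # Each entry carries the title, body, column type, article URL and source URL.
        parse_log.info(u'{0} | {1} | {2}'.format(item['type'], item['title'],
                                                 item['url']))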