# Beispiel #1
# 0
def traveldict():
    """Scrape travel news from cntour2.com and append the results to
    ``globallist.list_travel``.

    For every article linked from the listing page, the result is the
    article title concatenated with its body text (newlines stripped).
    Pages whose layout does not match are reported and skipped.
    """
    # Travel-news listing pages to crawl.
    urlcol = [r'http://www.cntour2.com/#']
    print('正在获取traveldict...')
    for url in urlcol:
        list_dic = []
        soup = download_soup_waitting(url, 'utf8', 1)
        try:
            item = soup.find('div', {'class': 'main_l'})
            news_obj = item.find('ul', {'class': 'news'})
            news = news_obj.find_all('li')
            for new in news:
                title = new.find('a').get_text()
                href = r'http://www.cntour2.com/' + new.find('a').get('href')
                content_page = download_soup_waitting(href, 'utf8', 1)
                content_obj = content_page.find('div', {'id': 'content'})
                if content_obj is None:
                    print("格式不相符")
                else:
                    # FIX: the original called .encode('utf-8') and then
                    # bytes.replace('\n', '') with str arguments, which
                    # raises TypeError on Python 3 (and str + bytes would
                    # fail next). Work on str directly instead.
                    companybrief = content_obj.get_text().strip()
                    uu = companybrief.replace('\n', '')
                    result = title + uu
                    list_dic.append(result)
        except Exception:
            # Layout change or network failure on this listing page:
            # report and move on to the next URL.
            print("got exception")
            continue

        globallist.list_travel.extend(list_dic)
def moneydict():
    """Scrape finance news from the feeds in ``url_col`` and append the
    results to ``globallist.list_money``.

    Each result is the article title concatenated with its body text
    (newlines and embedded div/style/pre/script tags removed). The CCTV
    feed is plain JSON; every other feed is NetEase-style JSONP wrapped
    in ``data_callback(...)``.
    """
    for url in url_col:
        list_dic = []
        print('正在获取moneydict...')
        raw = download_json_waitting(url, 1)
        if url == 'http://jingji.cctv.com/data/index.json':
            # CCTV feed: plain JSON with a 'rollData' article list.
            data = json.loads(raw, strict=False)
            for item in data['rollData']:
                title = item['title']
                article_url = item['url']
                soup = download_soup_waitting(article_url, 'utf-8', 1)
                try:
                    content = soup.find('div', {'class': 'cnt_bd'})
                    # Strip unrelated embedded tags before extracting text
                    # (plain loop instead of a side-effect comprehension).
                    for tag in content(['div', 'style', 'pre', 'script']):
                        tag.extract()
                    text = title + content.get_text().strip().replace('\n', '')
                    list_dic.append(text)
                except Exception:
                    print('格式不相符')
        else:
            try:
                # NetEase feed: JSONP — rewrite data_callback(...) into
                # a valid JSON object before parsing.
                raw = raw.replace("data_callback(", '{"data_callback":',
                                  1)[:-1] + "}"
                data = json.loads(raw, strict=False)
                for item in data["data_callback"]:
                    title = item['title']
                    article_url = item['docurl']
                    soup = download_soup_waitting(article_url, 'gbk', 1)
                    try:
                        content = soup.find('div', {'id': 'endText'})
                        for tag in content(['div', 'style', 'pre', 'script']):
                            tag.extract()
                        text = title + content.get_text().strip().replace(
                            '\n', '')
                        list_dic.append(text)
                    except Exception:
                        print('格式不相符')
            except Exception:
                # Best-effort: a malformed feed skips the whole page.
                pass

        globallist.list_money.extend(list_dic)
def newsdict():
    """Scrape domestic, society and world news from CCTV and append the
    results to ``globallist.list_news``.

    Each result is the article title concatenated with its body text
    (newlines and embedded div/script tags removed).
    """
    # Feed tags: domestic / society / international news.
    tags = ['china', 'society', 'world']
    for tag in tags:
        list_dic = []
        print('正在获newsdict...')
        url = r'http://news.cctv.com/' + tag + r'/data/index.json'
        raw = download_json_waitting(url, 1)
        data = json.loads(raw, strict=False)
        for item in data['rollData']:
            title = item["title"]
            article_url = item['url']
            try:
                soup = download_soup_waitting(article_url, 'utf-8', 1)
                content = soup.find('div', {'class': 'cnt_bd'})
                # Strip unrelated embedded tags before extracting text
                # (plain loop instead of a side-effect comprehension).
                for node in content(['div', 'script']):
                    node.extract()
                result = title + content.get_text().strip().replace('\n', '')
                list_dic.append(result)
            except Exception:
                print("格式不相符")
        globallist.list_news.extend(list_dic)
def sportsdict():
    """Scrape sports news (NBA, CBA, football, misc.) from the NetEase
    JSONP feeds in ``urlcol`` and append the results to
    ``globallist.list_sport``.

    Note: unlike the sibling scrapers, only the article body text is
    stored (the title is not prepended) — behavior kept as-is.
    """
    for url in urlcol:
        print('正在获取sportsdict...')
        list_dic = []
        raw = download_json_waitting(url, 1)
        # NetEase JSONP feed: rewrite data_callback(...) into valid JSON.
        raw = raw.replace("data_callback(", '{"data_callback":', 1)[:-1] + "}"
        data = json.loads(raw, strict=False)
        for item in data['data_callback']:
            title = item['title']
            docurl = item['docurl']
            soup = download_soup_waitting(docurl, 'gbk', 1)
            try:
                post = soup.find('div', id="endText")
                if post is None:
                    print("格式不相符")
                else:
                    companybrief = post.get_text().strip()
                    list_dic.append(companybrief.replace('\n', ''))
            except Exception:
                print("链接异常,跳往下一链接")
        globallist.list_sport.extend(list_dic)
def artdict():
    """Scrape art news from the NetEase JSONP feeds in ``urlcol`` and
    append the results to ``globallist.list_art``.

    Only the article body text is stored (the title is not prepended)
    — behavior kept as-is.
    """
    # Page-number suffixes; '' is the first page.
    nums = ['']
    for url in urlcol:
        for num in nums:
            print('正在获取artdict....')
            list_dic = []
            page_url = url + num + '.js'
            raw = download_json_waitting(page_url, 1)
            # NetEase JSONP feed: rewrite data_callback(...) into valid JSON.
            raw = raw.replace("data_callback(", '{"data_callback":',
                              1)[:-1] + "}"
            data = json.loads(raw, strict=False)
            for item in data['data_callback']:
                title = item['title']
                docurl = item['docurl']
                soup = download_soup_waitting(docurl, 'gbk', 1)
                try:
                    post = soup.find('div', id="endText")
                    if post is None:
                        print("格式不相符")
                    else:
                        # FIX: the original called .encode('utf-8') and then
                        # bytes.replace('\n', '') with str arguments, which
                        # raises TypeError on Python 3. Work on str directly.
                        companybrief = post.get_text().strip()
                        list_dic.append(companybrief.replace('\n', ''))
                except Exception:
                    print("链接异常,跳往下一链接")

            globallist.list_art.extend(list_dic)
# Beispiel #6
# 0
def mildict():
    """Scrape military news from Sina's news-list API (pages 1-2) and
    append the results to ``globallist.list_mil``.

    Each result is the article title concatenated with its body text
    (newlines and embedded div/style/pre/script tags removed).
    """
    for i in range(1, 3):
        list_dic = []
        print('正在获mildict...')
        url = 'http://platform.sina.com.cn/news/news_list?app_key=2872801998&channel=mil&cat_1=jssd&show_all=0&show_cat=1&show_ext=1&tag=1&format=json&page='+ str(i) +\
              '&show_num=10'
        raw = download_json_waitting(url, 1)
        data = json.loads(raw, strict=False)
        for item in data["result"]["data"]:
            title = item['title']
            article_url = item['url']
            try:
                soup = download_soup_waitting(article_url, 'utf-8', 1)
                content = soup.find('div', {'id': 'article'})
                # Strip unrelated embedded tags before extracting text
                # (plain loop instead of a side-effect comprehension).
                for tag in content(['div', 'style', 'pre', 'script']):
                    tag.extract()
                text = content.get_text().strip().replace('\n', '')
                list_dic.append(title + text)
            except Exception:
                print(title + ":" + article_url + "格式不相符")

        globallist.list_mil.extend(list_dic)
def techdict():
    """Scrape technology news from NetEase and Sina feeds and append the
    results to ``globallist.list_tech``.

    NetEase pages yield title + body text (tags and newlines stripped);
    the Sina roll feed yields title + summary + keywords directly.
    """
    # NetEase tech feed: one JSONP document per page in netease_pages.
    for page in netease_pages:
        list_dic = []
        print("正在获取techdict...")
        url = r'http://tech.163.com/special/00097UHL/tech_datalist' + page + r'.js?callback=data_callback'
        raw = download_json_waitting(url, 1)
        # Rewrite data_callback(...) into a valid JSON object.
        raw = raw.replace("data_callback(", '{"data_callback":',
                          1)[:-1] + "}"
        data = json.loads(raw, strict=False)
        for item in data["data_callback"]:
            title = item['title']
            article_url = item['docurl']
            try:
                soup = download_soup_waitting(article_url, 'gbk', 1)
                content = soup.find('div', {'id': 'endText'})
                # Strip unrelated embedded tags before extracting text
                # (plain loop instead of a side-effect comprehension).
                for tag in content(['div', 'style', 'pre', 'script']):
                    tag.extract()
                text = title + content.get_text().strip().replace('\n', '')
                list_dic.append(text)
            except Exception:
                print(title + ":" + article_url + "格式不相符")
        globallist.list_tech.extend(list_dic)

    # Sina roll feed: JSONP wrapped in try{feedCardJsonpCallback(...)}.
    for page in sina_pages:
        list_dic = []
        print("正在获取科技最新资讯...")
        url = r'http://feed.mix.sina.com.cn/api/roll/get?pageid=402&lid=2559&num=20&versionNumber=1.2.8&page=' + \
              page + '&encode=utf-8&callback=feedCardJsonpCallback&_=' + str(t)
        raw = download_json_waitting(url, 1)
        # Drop the JSONP wrapper (prefix and 14 trailing characters).
        raw = raw.replace("try{feedCardJsonpCallback(", '', 1)[:-14]
        data = json.loads(raw, strict=False)
        for item in data["result"]['data']:
            list_dic.append(item['title'] + item['summary'] + item['keywords'])
        globallist.list_tech.extend(list_dic)