def sportsdict():
    """Scrape NetEase sports articles (NBA / CBA / 国足 / 国际足球 / 综合).

    Iterates the module-level ``urlcol`` feed URLs, parses each JSONP
    listing, downloads every linked article and appends its one-line
    body text to ``globallist.list_sport``.
    """
    for url in urlcol:
        print('正在获取sportsdict...')
        list_dic = []
        result = download_json_waitting(url, 1)
        # The endpoint returns JSONP: data_callback([...]) — rewrite it
        # into a valid JSON object so json.loads can parse it.
        result = result.replace("data_callback(", '{"data_callback":', 1)[:-1] + "}"
        result = json.loads(result, strict=False)
        for item in result['data_callback']:
            docurl = item['docurl']
            soup = download_soup_waitting(docurl, 'gbk', 1)
            try:
                post = soup.find('div', id="endText")
                if post is None:
                    print("格式不相符")
                else:
                    # Collapse the article body into a single line.
                    list_dic.append(post.get_text().strip().replace('\n', ''))
            except Exception:
                # Best-effort scraping: a broken page must not abort the run,
                # but never use a bare except (it would also swallow
                # KeyboardInterrupt/SystemExit).
                print("链接异常,跳往下一链接")
        globallist.list_sport.extend(list_dic)
def newsdict():
    """Scrape CCTV news (domestic / society / world).

    Fetches each tag's rolling-news JSON index, downloads every article
    and appends ``title + body`` (newlines stripped) to
    ``globallist.list_news``.
    """
    tags = ['china', 'society', 'world']
    for tag in tags:
        list_dic = []
        print('正在获newsdict...')
        index_url = r'http://news.cctv.com/' + tag + r'/data/index.json'
        data = download_json_waitting(index_url, 1)
        data = json.loads(data, strict=False)
        for item in data['rollData']:
            title = item["title"]
            url = item['url']
            try:
                soup = download_soup_waitting(url, 'utf-8', 1)
                content = soup.find('div', {'class': 'cnt_bd'})
                # 剔除无关标签 — strip nested markup before extracting text.
                # (Was a list comprehension used only for side effects.)
                for tag_node in content(['div', 'script']):
                    tag_node.extract()
                list_dic.append(title + content.get_text().strip().replace('\n', ''))
            except Exception:
                # content may be None (no cnt_bd div) or the page may not
                # download; skip it rather than abort the whole tag.
                print("格式不相符")
        globallist.list_news.extend(list_dic)
def artdict():
    """Scrape NetEase art/culture articles.

    Builds a JSONP listing URL per entry of the module-level ``urlcol``,
    downloads each linked article and appends its one-line body text to
    ``globallist.list_art``.
    """
    nums = ['']  # page suffixes; currently only the first page
    for url in urlcol:
        for num in nums:
            print('正在获取artdict....')
            list_dic = []
            page_url = url + num + '.js'
            result = download_json_waitting(page_url, 1)
            # JSONP data_callback([...]) -> valid JSON object.
            result = result.replace("data_callback(", '{"data_callback":',
                                    1)[:-1] + "}"
            result = json.loads(result, strict=False)
            for item in result['data_callback']:
                docurl = item['docurl']
                soup = download_soup_waitting(docurl, 'gbk', 1)
                try:
                    post = soup.find('div', id="endText")
                    if post is None:
                        print("格式不相符")
                    else:
                        # BUG FIX: the old code did
                        #   ss = companybrief.encode('utf-8'); ss.replace('\n', '')
                        # Under Python 3, bytes.replace with str arguments
                        # raises TypeError, which the bare except silently
                        # swallowed — so no article was ever collected.
                        # Stay in str and strip newlines directly.
                        companybrief = post.get_text().strip()
                        list_dic.append(companybrief.replace('\n', ''))
                except Exception:
                    print("链接异常,跳往下一链接")

            globallist.list_art.extend(list_dic)
# Beispiel #4 (extraction artifact from the code-listing site; kept as a comment so the file parses)
def mildict():
    """Scrape Sina military (jssd channel) articles, pages 1-2.

    Queries the Sina platform news-list API, downloads each article and
    appends ``title + body`` (newlines stripped) to ``globallist.list_mil``.
    """
    for i in range(1, 3):
        list_dic = []
        print('正在获mildict...')
        url = 'http://platform.sina.com.cn/news/news_list?app_key=2872801998&channel=mil&cat_1=jssd&show_all=0&show_cat=1&show_ext=1&tag=1&format=json&page='+ str(i) +\
              '&show_num=10'
        result = download_json_waitting(url, 1)
        result = json.loads(result, strict=False)
        for item in result["result"]["data"]:
            title = item['title']
            url = item['url']
            try:
                soup = download_soup_waitting(url, 'utf-8', 1)
                content = soup.find('div', {'id': 'article'})
                # Strip nested markup before extracting plain text.
                # (Was a list comprehension used only for side effects.)
                for tag_node in content(['div', 'style', 'pre', 'script']):
                    tag_node.extract()
                body = content.get_text().strip().replace('\n', '')
                list_dic.append(title + body)
            except Exception:
                # content may be None or the download may fail — skip item.
                print(title + ":" + url + "格式不相符")

        globallist.list_mil.extend(list_dic)
def techdict():
    """Scrape tech news from NetEase specials and a Sina rolling feed.

    First walks the module-level ``netease_pages`` (JSONP listings, full
    article bodies scraped), then ``sina_pages`` (a JSONP roll feed whose
    title/summary/keywords are used directly; ``t`` is a module-level
    cache-busting timestamp). Results go to ``globallist.list_tech``.
    """
    # --- NetEase tech specials ---
    for page in netease_pages:
        list_dic = []
        print("正在获取techdict...")
        url = r'http://tech.163.com/special/00097UHL/tech_datalist' + page + r'.js?callback=data_callback'
        raw = download_json_waitting(url, 1)
        # JSONP data_callback([...]) -> valid JSON object.
        raw = raw.replace("data_callback(", '{"data_callback":',
                          1)[:-1] + "}"
        data = json.loads(raw, strict=False)
        for item in data["data_callback"]:
            title = item['title']
            url = item['docurl']
            try:
                soup = download_soup_waitting(url, 'gbk', 1)
                content = soup.find('div', {'id': 'endText'})
                # Strip nested markup before extracting plain text.
                for tag_node in content(['div', 'style', 'pre', 'script']):
                    tag_node.extract()
                list_dic.append(title + content.get_text().strip().replace('\n', ''))
            except Exception:
                print(title + ":" + url + "格式不相符")
        globallist.list_tech.extend(list_dic)

    # --- Sina rolling tech feed ---
    for page in sina_pages:
        list_dic = []
        print("正在获取科技最新资讯...")
        url = r'http://feed.mix.sina.com.cn/api/roll/get?pageid=402&lid=2559&num=20&versionNumber=1.2.8&page=' + \
              page + '&encode=utf-8&callback=feedCardJsonpCallback&_=' + str(t)
        raw = download_json_waitting(url, 1)
        # Strip the "try{feedCardJsonpCallback(" prefix and the trailing
        # 14 characters of the JSONP try/catch wrapper.
        raw = raw.replace("try{feedCardJsonpCallback(", '', 1)[:-14]
        data = json.loads(raw, strict=False)
        for item in data["result"]['data']:
            list_dic.append(item['title'] + item['summary'] + item['keywords'])
        globallist.list_tech.extend(list_dic)
def moneydict():
    """Scrape finance news from the module-level ``url_col``.

    The CCTV feed is plain JSON (``rollData``); every other feed is a
    NetEase-style JSONP listing. Appends ``title + body`` strings to
    ``globallist.list_money``.
    """
    for url in url_col:
        list_dic = []
        print('正在获取moneydict...')
        raw = download_json_waitting(url, 1)
        if url == 'http://jingji.cctv.com/data/index.json':
            data = json.loads(raw, strict=False)
            for item in data['rollData']:
                title = item['title']
                # Renamed from `url`: the old code rebound the loop
                # variable, which was confusing (though harmless here).
                article_url = item['url']
                soup = download_soup_waitting(article_url, 'utf-8', 1)
                try:
                    content = soup.find('div', {'class': 'cnt_bd'})
                    # 剔除无关标签 — strip nested markup before text extraction.
                    for tag_node in content(['div', 'style', 'pre', 'script']):
                        tag_node.extract()
                    list_dic.append(title + content.get_text().strip().replace('\n', ''))
                except Exception:
                    print('格式不相符')
        else:
            try:
                # NetEase JSONP data_callback([...]) -> valid JSON object.
                raw = raw.replace("data_callback(", '{"data_callback":',
                                  1)[:-1] + "}"
                data = json.loads(raw, strict=False)
                for item in data["data_callback"]:
                    title = item['title']
                    article_url = item['docurl']
                    soup = download_soup_waitting(article_url, 'gbk', 1)
                    try:
                        content = soup.find('div', {'id': 'endText'})
                        for tag_node in content(['div', 'style', 'pre', 'script']):
                            tag_node.extract()
                        list_dic.append(title + content.get_text().strip().replace('\n', ''))
                    except Exception:
                        print('格式不相符')
            except Exception:
                # Feed did not match the expected JSONP shape — skip it
                # (deliberate best-effort, matching the original behavior).
                pass

        globallist.list_money.extend(list_dic)
def entdict():
    """Scrape entertainment titles from NetEase and Weibo guest feeds.

    Collects NetEase entertainment headlines per tag, then Weibo
    hot/star guest-feed card texts, into ``list_dic``.

    NOTE(review): unlike the sibling ``*dict()`` functions this one never
    extends a ``globallist`` list, so ``list_dic`` is discarded on return
    — confirm whether a ``globallist.list_ent.extend(list_dic)`` is
    missing. Behavior preserved here; only the Python-2-only syntax
    (print statement, urllib2, ``except KeyError, e``) was modernized so
    this otherwise-Python-3 file can actually run.
    """
    import urllib.request  # local import: replaces the Python-2-only urllib2

    tags = ['index', 'star', 'movie', 'music']
    # 热门信息流 (hot feed)
    url_hotfeed = "https://api.weibo.cn/2/guest/cardlist?gsid=_2AkMu5Br-f8NhqwJRmPAcz2PmZYl_yQ3EieKYuOslJRM3HRl-3T9kqnwvtRWwLB-1C2SEmptvAP1Bfy0s7kgEgw..&uid=1008938494835&wm=3333_2001&i=8bb4ee5&b=1&from=1073193010&checktoken=807ca79ae3fa897b262e3b63c3882698&c=iphone&networktype=wifi&v_p=45&skin=default&s=ee9f63c1&v_f=1&did=eb4621d547f0e7cb9eef4a41403ee866&lang=zh_CN&sflag=1&ua=iPhone9,2__weibo__7.3.1__iphone__os10.3.1&aid=01AhjayctpFPjOzJEmy46JLMop9TgsXKgsxZQYIpcPoBa-nn8.&lon=116.2697240292689&count=20&fid=230584&containerid=230584&uicode=10000011&lat=40.04127809492162&offset=1&max_id=4151604225452173&page=1&moduleID=pagecard"
    # 明星信息流 (star feed)
    url_starfeed = "https://api.weibo.cn/2/guest/cardlist?gsid=_2AkMu5WfMf8NhqwJRmPAcz2PmZYl_yQ3EieKYuZYXJRM3HRl-3T9kqnZftRVqWDRdwTGKDWtA7iBOAX-N3elOcA..&uid=1008938494835&wm=3333_2001&i=8bb4ee5&b=1&from=1073193010&checktoken=807ca79ae3fa897b262e3b63c3882698&c=iphone&networktype=wifi&v_p=45&skin=default&s=ee9f63c1&v_f=1&did=eb4621d547f0e7cb9eef4a41403ee866&lang=zh_CN&sflag=1&ua=iPhone9,2__weibo__7.3.1__iphone__os10.3.1&aid=01AhjayctpFPjOzJEmy46JLMop9TgsXKgsxZQYIpcPoBa-nn8.&lon=116.2697240292689&count=20&fid=230781&containerid=230781&uicode=10000011&lat=40.04127809492162&offset=1&max_id=4140648884038081&page=1&moduleID=pagecard"
    urlcol = [url_hotfeed, url_starfeed]

    list_dic = []

    # 网易娱乐 — NetEase entertainment headlines (titles only).
    for tag in tags:
        print("正在获取entdict...")  # was a Python-2 print statement
        url = r'http://ent.163.com/special/000380VU/newsdata_' + tag + r'.js'
        result = download_json_waitting(url, 1)
        # JSONP data_callback([...]) -> valid JSON object.
        result = result.replace("data_callback(", '{"data_callback":', 1)[:-1] + "}"
        result = json.loads(result)
        for item in result["data_callback"]:
            try:
                list_dic.append(item['title'])
            except KeyError:
                # Entries without a title are skipped (original behavior).
                pass

    # 微博信息流 — Weibo guest feeds: collect card texts.
    for url in urlcol:
        print("正在获取微博信息流...")
        response = urllib.request.urlopen(url)  # was urllib2.urlopen
        res = json.loads(response.read())
        try:
            for cards in res["cards"]:
                if cards["card_type"] == 9:
                    if "text" in cards["mblog"]:
                        list_dic.append(cards["mblog"]["text"])
        except KeyError as e:  # was the Python-2-only "except KeyError, e"
            print("no key: " + str(e))
# Beispiel #8 (extraction artifact from the code-listing site; kept as a comment so the file parses)
def esportsdict():
    """Scrape esports headlines from dadianjing.cn, pages 1-9.

    Appends ``title + summary`` strings to ``globallist.list_esports``;
    a page with bad JSON or an unexpected schema is skipped.
    """
    for i in range(1, 10):
        print('正在获取esportsdict....')
        list_dic = []
        url = 'http://www.dadianjing.cn/index.php?m=Index&a=xhrList&cid=1&page=' + str(
            i)
        result = download_json_waitting(url, 1)
        try:
            result = json.loads(result, strict=False)
            for item in result["data"]["list"]:
                list_dic.append(item['title'] + item['summary'])

            globallist.list_esports.extend(list_dic)

        except Exception:
            # Narrowed from a bare except: a bad page must not abort the
            # loop, but don't swallow KeyboardInterrupt/SystemExit.
            print("链接异常")