Exemple #1
0
def getRssInfo():
    """Fetch the Huxiu RSS feed and normalize each entry into a news dict.

    Returns a list of dicts with the keys: url, newsid, title, description,
    ctime (epoch seconds), author, source, keywords, thumb and summary.
    """
    feed = feedparser.parse("http://www.huxiu.com/rss/0.xml")
    results = []
    for entry in feed.entries:
        # Parse the HTML body once so both the thumbnail and the plain-text
        # summary can be pulled from the same soup.
        body = BeautifulSoup(entry.description, "html.parser", from_encoding="utf-8")
        first_img = body.find("img")
        item = {
            "url": entry.link,
            "title": entry.title,
            "description": entry.description,
            "ctime": long(time.mktime(entry.published_parsed)),
            "author": entry.source.title,
            "source": ctable,
            "keywords": "",
            "thumb": first_img.get("src") if first_img else "",
            "summary": body.getText(),
        }
        item["newsid"] = getMd5(item["url"])
        results.append(item)
    return results
Exemple #2
0
def getHtmlInfo():
    """Scrape the ZDNet China homepage (the 'tab1' block) into news dicts.

    Each dict carries url, newsid, title, summary, keywords, thumb,
    ctime (epoch seconds), source, author and description.
    """
    page = getHtml('http://www.zdnet.com.cn/')
    results = []
    if not page:
        return results
    doc = BeautifulSoup(page, 'html.parser', from_encoding='gbk')
    container = doc.find('div', {'id': 'tab1'})
    if not container:
        # Layout changed or page truncated -- nothing to harvest.
        return results
    for block in container.find_all('div', {'class': 'qu_loop'}):
        news = {}
        header = block.find('div', {'class': 'qu_tix'})
        link = header.find('b').find('a')
        news['url'] = link.get('href')
        news['newsid'] = getMd5(news['url'])
        news['title'] = header.find('b').getText()
        blurb = header.find('p')
        news['summary'] = blurb.getText()
        tag_block = header.find('p', {'class': 'meta'})
        news['keywords'] = ','.join(a.getText() for a in tag_block.find_all('a')) if tag_block else ''
        news['thumb'] = block.find('div', {'class': 'qu_ims'}).find('img').get('src')
        stamp = block.find('div', {'class': 'qu_times'}).getText()
        news['ctime'] = long(time.mktime(time.strptime(stamp, u'%Y-%m-%d %H:%M:%S')))
        news['source'] = ctable
        news['author'] = ''
        news['description'] = str(blurb)
        results.append(news)
    return results
Exemple #3
0
def getHtmlInfo():
    """Scrape the NetEase Tech (tech.163.com) hot board into news dicts.

    Returns a list of dicts with keys url, newsid, title, summary, thumb,
    keywords, ctime (epoch seconds), source, author and description.
    Items without an image box are skipped.
    """
    url = r'http://tech.163.com/'
    content = getHtml(url)
    newsList = []
    if content:
        soup = BeautifulSoup(content, "html.parser", from_encoding='gbk')
        itemList = soup.find_all('div', {'class': "hot_board clearfix"})
        for item in itemList:
            nInfo = {}
            nInfo['url'] = item.find('a').get('href')
            head = item.find('div', {'class': 'hb_detail'})
            nInfo['title'] = head.find('h3').getText()  # getText() is a safer way than .string to get text
            nInfo['newsid'] = getMd5(nInfo['url'])
            # BUGFIX: .string returns None whenever the <p> contains nested
            # tags; getText() always yields the full text, so summary and
            # description can no longer silently become None.
            desc = head.find('p').getText()
            nInfo['summary'] = desc
            img = item.find('div', {'class': 'img_box'})
            if img:
                img = img.find('img')
                # Lazy-loaded images keep the real URL in data-src.
                nInfo['thumb'] = img.get('src') if img.get('src') else img.get('data-src')
            else:
                continue  # image-less entries are deliberately dropped
            nInfo['keywords'] = head.find('span').getText()
            timeStr = head.find('em').get('time')
            nInfo['ctime'] = long(time.mktime(time.strptime(timeStr, '%m/%d/%Y %H:%M:%S')))
            nInfo['source'] = ctable
            nInfo['author'] = ''
            nInfo['description'] = desc
            newsList.append(nInfo)
    return newsList
Exemple #4
0
def getHtmlInfo():
    """Scrape the DoNews originals listing into a list of news dicts."""
    content = getHtml("http://www.donews.com/original/")
    collected = []
    if not content:
        return collected
    soup = BeautifulSoup(content, "html.parser", from_encoding="utf-8")
    block = soup.find("ul", {"class": "art_list mt11"})
    if not block:
        return collected
    for li in block.find_all("li"):
        entry = {}
        anchor = li.find("h5", {"class": "title"}).find("a")
        entry["url"] = anchor.get("href")
        # getText() yields unicode; the regex strips an optional leading
        # bracketed prefix from the headline.
        entry["title"] = r1(u"(?:^[【,「].*?[】,」])?(.*)", anchor.getText())
        entry["newsid"] = getMd5(entry["url"])
        blurb = li.find("p", {"class": "info"})
        entry["summary"] = blurb.getText().strip()
        pic = li.find("img")
        entry["thumb"] = pic.get("src") if pic else ""
        entry["keywords"] = ""
        # The page omits the year, so prepend the current one before parsing.
        stamp = time.strftime("%Y年").decode("utf-8") + li.find("span", {"class": "time"}).getText()
        entry["ctime"] = long(time.mktime(time.strptime(stamp, u"%Y年%m月%d日 %H:%M")))
        entry["source"] = ctable
        entry["author"] = li.find("span", {"class": "place"}).getText()
        entry["description"] = str(blurb)
        collected.append(entry)
    return collected
Exemple #5
0
def getHtmlInfo(url):
    """Scrape an HiAPK news listing page into a list of news dicts.

    url -- the listing page to fetch, e.g. 'http://news.hiapk.com/internet/'.
    Returns dicts with keys url, newsid, title, summary, thumb, keywords,
    ctime (epoch seconds), source, author and description.
    """
    content = getHtml(url)
    newsList = []
    if content:
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        itemList = soup.find_all('div', {'class': "box"})
        for item in itemList:
            nInfo = {}
            head = item.find('strong').find('a')
            nInfo['url'] = head.get('href')
            title = head.getText()
            # Strip an optional leading bracketed prefix from the headline.
            nInfo['title'] = r1(u'(?:^[【,「].*?[】,」])?(.*)', title)
            nInfo['newsid'] = getMd5(nInfo['url'])
            desc = item.find('p', {'class': 'intro'})
            nInfo['summary'] = desc.getText()
            # ROBUSTNESS FIX: guard the image and tag lookups instead of
            # crashing with AttributeError on items that lack them, matching
            # the null-guards used by the other scrapers in this file.
            img = item.find('img')
            nInfo['thumb'] = img.get('src') if img else ''
            tags = item.find('p', {'class': 'clearfix tag'})
            nInfo['keywords'] = ','.join(i.getText() for i in tags.find_all('a')) if tags else ''
            timeStr = time.strftime('%Y') + '-' + item.find('span', {'class': 'right time'}).getText()
            timeLong = long(time.mktime(time.strptime(timeStr, '%Y-%m-%d')))
            threshold = time.time() + 1000
            # The page omits the year; around New Year a December item gets
            # stamped with the new year and lands in the future -- back it up.
            if timeLong > threshold:      # if today is in the new year
                timeLong = timeLong - 3600 * 24 * 365
            nInfo['ctime'] = timeLong
            nInfo['source'] = ctable
            nInfo['author'] = ''
            nInfo['description'] = str(desc)
            newsList.append(nInfo)
    return newsList
Exemple #6
0
def getRssInfo():
    """Fetch the TMTPost RSS feed and normalize entries into news dicts."""
    feed = feedparser.parse('http://www.tmtpost.com/rss.xml')
    collected = []
    for entry in feed.entries:
        record = {}
        record['url'] = entry.link
        record['newsid'] = getMd5(record['url'])
        record['title'] = entry.title
        record['ctime'] = long(time.mktime(entry.published_parsed))
        record['author'] = entry.author
        record['source'] = ctable
        labels = entry.tags if 'tags' in entry else None
        record['keywords'] = ','.join(t.term for t in labels) if labels else ''
        # The full article HTML lives in content[0]; mine it for the
        # first image and a paragraph-joined plain-text summary.
        record['description'] = entry.content[0].value
        body = BeautifulSoup(record['description'], "html.parser", from_encoding='utf-8')
        pic = body.find('img')
        record['thumb'] = pic.get('src') if pic else ''
        record['summary'] = ' '.join(p.getText().strip() for p in body.find_all('p'))
        collected.append(record)
    return collected
Exemple #7
0
def getHtmlInfo(url):
    """Scrape a Sohu IT listing page and return {newsid: news dict}.

    url -- listing page, e.g. 'http://it.sohu.com/internet_2014.shtml'.
    """
    content = getHtml(url)
    newsDict = {}
    if not content:
        return newsDict
    soup = BeautifulSoup(content, "html.parser", from_encoding='gbk')
    for item in soup.find_all('div', {'class': "item clear"}):
        record = {}
        txt = item.find('div', {'class': "item-txt"})
        link = txt.find('h3').find('a')
        record['title'] = link.getText()
        record['url'] = link.get('href')
        record['newsid'] = getMd5(record['url'])
        meta = txt.find('div', {'class': "news-info"})
        stamp = meta.find('span', {'class': "time"}).getText()
        record['ctime'] = long(time.mktime(time.strptime(stamp, u'%Y年%m月%d日%H:%M')))
        blurb = txt.find('p')
        record['summary'] = blurb.getText()
        pic_box = item.find('div', {'class': "item-pic"})
        record['thumb'] = pic_box.find('img').get('src') if pic_box else ''
        record['keywords'] = ''
        record['source'] = ctable
        record['author'] = meta.find('span', {'class': "edit-info"}).getText().strip()
        record['description'] = str(blurb)
        # Keyed by newsid so re-scrapes naturally deduplicate.
        newsDict[record['newsid']] = record
    return newsDict
Exemple #8
0
def getJsInfo():
    url=r'http://it.sohu.com/tag/0270/000021270_86_utf8.inc?_='
    url+=str(long(time.time()*1000))
#     print url
    content=getHtml(url)
#     print content   
    newsDict={}
    if content:
        infoList=json.loads(r1('.*?(\[.*\])',content),encoding='utf-8')                  
        for info in infoList:
            try:
                nInfo={}
                nInfo['newsid']=getMd5(info['url'])
                nInfo['title']= info['title']                
                nInfo['url']=info['url']                
                nInfo['summary']=info['text']
                nInfo['thumb']=info['image']
                nInfo['keywords']=','.join(tag['name'] for tag in info['tags']) if len(info['tags'])>0 else ''                    
                nInfo['source']=ctable
                nInfo['ctime']=long(time.mktime(time.strptime(info['time'],u'%Y年%m月%d日%H:%M')))
                nInfo['author']= info['source']            
                nInfo['description']=''             
                newsDict[nInfo['newsid']]=nInfo
            except:
                print 'Error:',info['url']
                logging.error(info['url'])
    return newsDict
Exemple #9
0
def getRssInfo():
    """Fetch the 36Kr RSS feed and normalize entries into news dicts."""
    feed = feedparser.parse('http://36kr.com/feed')
    collected = []
    for entry in feed.entries:
        # One parse of the HTML body serves both thumb and summary.
        body = BeautifulSoup(entry.description, "html.parser", from_encoding='utf-8')
        pic = body.find('img')
        record = {
            'url': entry.link,
            'title': entry.title,
            'description': entry.description,
            'ctime': long(time.mktime(entry.published_parsed)),
            'author': entry.author,
            'source': ctable,
            'keywords': '',
            'thumb': pic.get('src') if pic else '',
            'summary': body.getText(),
        }
        record['newsid'] = getMd5(record['url'])
        collected.append(record)
    return collected
Exemple #10
0
def getHtmlInfo():
    """Scrape a BNET China list page into a list of news dicts."""
    page = getHtml('http://www.bnet.com.cn/files/list_more_2013.php?class_id=129&page=1')
    results = []
    if not page:
        return results
    soup = BeautifulSoup(page, 'html.parser', from_encoding='gbk')
    for item in soup.find_all('div', {'class': "item"}):
        record = {}
        link = item.find('h3').find('a')
        record['url'] = link.get('href')
        # Strip an optional leading bracketed prefix from the headline.
        record['title'] = r1(u'(?:^[【,「].*?[】,」])?(.*)', link.getText())
        record['newsid'] = getMd5(record['url'])
        blurb = item.find('p', {'class': 'summary'})
        record['summary'] = blurb.getText().strip()
        record['thumb'] = item.find('a', {'class': 'thumb'}).find('img').get('src')
        record['keywords'] = ','.join(s.getText() for s in item.find_all('span')[1].find_all('a'))
        # The article date is embedded in the URL path as /YYYY/MMDD/.
        stamp = r1('/(\d{4}/\d{4})/', record['url'])
        record['ctime'] = long(time.mktime(time.strptime(stamp, '%Y/%m%d')))
        record['source'] = ctable
        record['author'] = ''
        record['description'] = str(blurb)
        results.append(record)
    return results
Exemple #11
0
def getRssInfo():
    """Fetch the TechCrunch China RSS feed into a list of news dicts."""
    feed = feedparser.parse("http://techcrunch.cn/feed/")
    collected = []
    for entry in feed.entries:
        record = {}
        record["url"] = entry.link
        record["newsid"] = getMd5(record["url"])
        record["title"] = entry.title
        record["ctime"] = long(time.mktime(entry.published_parsed))
        record["author"] = entry.author
        record["source"] = ctable
        labels = entry.tags if "tags" in entry else None
        record["keywords"] = ",".join(t.term for t in labels) if labels else ""
        record["description"] = entry.description
        # Mine the HTML description for the first image and a
        # paragraph-joined plain-text summary.
        body = BeautifulSoup(record["description"], "html.parser", from_encoding="utf-8")
        pic = body.find("img")
        record["thumb"] = pic.get("src") if pic else ""
        record["summary"] = " ".join(p.getText().strip() for p in body.find_all("p"))
        collected.append(record)
    return collected
Exemple #12
0
def getHtmlInfo():
    """Scrape the Ciweek list page into a list of news dicts."""
    domain = "http://www.ciweek.com"
    page = getHtml("http://www.ciweek.com/v7/list.jsp")
    results = []
    if not page:
        return results
    soup = BeautifulSoup(page, "html.parser", from_encoding="gbk")
    for item in soup.find_all("dl", {"class": "clearfix"}):
        record = {}
        link = item.find("h2").find("a")
        record["url"] = domain + link.get("href")  # hrefs are site-relative
        record["title"] = link.getText()
        record["newsid"] = getMd5(record["url"])
        blurb = item.find("p").find("span")
        record["summary"] = blurb.getText()
        pic = item.find("img")
        record["thumb"] = pic.get("src") if pic else ""
        record["keywords"] = ""
        stamp = item.find("span", {"class": "date hidden-xs"}).getText()
        record["ctime"] = long(time.mktime(time.strptime(stamp, "%Y-%m-%d")))
        record["source"] = ctable
        record["author"] = ""
        record["description"] = str(blurb)
        results.append(record)
    return results
Exemple #13
0
def getHtmlInfo(url):
    """Scrape a CSDN news listing page into a list of news dicts.

    url -- listing page, e.g. 'http://news.csdn.net/news/1'.
    Timestamps come either as an absolute 'YYYY-MM-DD HH:MM' or as a
    relative 'N小时前'; both are converted to epoch seconds.
    """
    content = getHtml(url)
    newsList = []
    if content:
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        itemList = soup.find_all('div', {'class': "unit"})
        for item in itemList:
            nInfo = {}
            head = item.find('h1').find('a')
            nInfo['url'] = head.get('href')
            title = head.getText()
            # Strip an optional leading bracketed prefix from the headline.
            nInfo['title'] = r1(u'(?:^[【,「].*?[】,」])?(.*)', title)
            nInfo['newsid'] = getMd5(nInfo['url'])
            desc = item.find('dd')
            nInfo['summary'] = desc.getText()
            img = item.find('img')
            nInfo['thumb'] = img.get('src') if img else ''
            nInfo['keywords'] = ','.join(tag.getText() for tag in item.find('div', {'class': 'tag'}).find_all('a'))
            timeStr = item.find('span', {'class': 'ago'}).getText()
            timeStamp = r1('(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', timeStr)
            if not timeStamp:
                # Relative form: subtract "N hours ago", else fall back to now.
                hourago = r1(u'(\d{1,2})小时前', timeStr)
                ctime = time.time() - int(hourago) * 3600 if hourago else time.time()
            else:
                # BUGFIX: parse the extracted timeStamp, not the raw timeStr --
                # strptime rejects any text surrounding the date, which is
                # exactly why the regex extraction above exists.
                ctime = time.mktime(time.strptime(timeStamp, '%Y-%m-%d %H:%M'))
            nInfo['ctime'] = long(ctime)
            nInfo['source'] = ctable
            nInfo['author'] = ''
            nInfo['description'] = str(desc)
            newsList.append(nInfo)
    return newsList
Exemple #14
0
def getHtmlInfo():
    """Scrape the ITTime news list into a list of news dicts."""
    domain = 'http://news.ittime.com.cn'
    page = getHtml('http://news.ittime.com.cn/newslist.shtml')
    results = []
    if not page:
        return results
    soup = BeautifulSoup(page, "html.parser", from_encoding='utf-8')
    for item in soup.find_all('div', {'class': "left-list"}):
        record = {}
        link = item.find('h3').find('a')
        record['url'] = domain + link.get('href')  # hrefs are site-relative
        record['title'] = link.getText()
        record['newsid'] = getMd5(record['url'])
        blurb = item.find('p')
        record['summary'] = blurb.getText()
        pic = item.find('a', {'class': 'img_212'}).find('img')
        record['thumb'] = domain + pic.get('src')
        tag_span = item.find('span', {'style': 'float:left;'})
        record['keywords'] = ','.join([a.getText() if a.getText() else '' for a in tag_span.find_all('a')])
        stamp = r1('(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', item.find('div', {'class': 'box-other1'}).get_text())
        record['ctime'] = long(time.mktime(time.strptime(stamp, '%Y-%m-%d %H:%M:%S')))
        record['source'] = ctable
        byline = item.find('div', {'class': 'box-other3'}).find('a')
        record['author'] = byline.getText() if byline else ''
        record['description'] = str(blurb)
        results.append(record)
    return results
Exemple #15
0
def getHtmlInfo():
    """Scrape the Huxiu homepage article modules into a list of news dicts.

    Article times are relative ('N分钟前' / 'N小时前' / 'N天前') and are
    converted to approximate epoch seconds relative to now.
    """
    url = r"http://www.huxiu.com"
    content = getHtml(url)
    newsList = []
    if content:
        soup = BeautifulSoup(content, "html.parser", from_encoding="utf-8")
        # Both the plain and the "push" article modules are harvested.
        itemList = soup.find_all("div", {"class": "mod-b mod-art "})
        itemList += soup.find_all("div", {"class": "mod-b mod-art mod-b-push"})
        for item in itemList:
            nInfo = {}
            head = item.find("", {"class": "mob-ctt"})
            if not head:
                continue
            title = head.find("h3")
            if not title:
                continue
            title = title.find("a")
            nInfo["url"] = url + title.get("href")  # hrefs are site-relative
            nInfo["title"] = title.getText()
            nInfo["newsid"] = getMd5(nInfo["url"])
            nInfo["summary"] = item.find("div", {"class": "mob-sub"}).getText()
            nInfo["description"] = nInfo["summary"]
            # BUGFIX: guard the lazy-image lookup -- articles without one used
            # to raise AttributeError and abort the whole scrape. Matches the
            # null-guards used by the other scrapers in this file.
            img = item.find("img", {"class": "lazy"})
            nInfo["thumb"] = img.get("data-original") if img else ""
            nInfo["keywords"] = ""
            timeStr = head.find("span", {"class": "time"}).getText()
            timeSec = time.time()
            min_num = r1(u"(\d{1,2})分钟前", timeStr)
            if min_num:
                timeSec -= 60 * long(min_num)
            else:
                hour_num = r1(u"(\d{1,2})小时前", timeStr)
                if hour_num:
                    timeSec -= 3600 * long(hour_num)
                else:
                    day_num = r1(u"(\d{1,2})天前", timeStr)
                    timeSec = timeSec - long(day_num) * 24 * 3600 if day_num else timeSec
            # CONSISTENCY: store ctime as long like every other scraper here,
            # instead of leaking a float into the record.
            nInfo["ctime"] = long(timeSec)
            author_div = item.find("div", {"class": "mob-author"})
            nInfo["author"] = ""
            if author_div:
                author_span = author_div.find("span", {"class": "author-name "})
                nInfo["author"] = author_span.getText() if author_span else ""
            nInfo["source"] = ctable
            newsList.append(nInfo)
    return newsList
Exemple #16
0
def getHtmlInfo():
    """Scrape page 1 of the Leiphone article list into news dicts.

    Returns a list of dicts with keys url, newsid, title, summary, thumb,
    keywords, ctime (epoch seconds), author, source and description.
    Entries without a time block are skipped.
    """
    url='http://www.leiphone.com/page/1'  
    content=getHtml(url)
#     print content
    newsList=[]
    if content:
        soup = BeautifulSoup(content, "html.parser",from_encoding='utf-8')
        soup=soup.find('div',{'class':"lph-pageList index-pageList"})
        if not soup:
            # List container missing: layout changed or fetch was partial.
            return newsList
        itemList=soup.find_all('li',{'class':'pbox clr'})
        for item in itemList:
            nInfo={}
            word=item.find('div',{'class':'word'})
            head=word.find('a')
            nInfo['url']=head.get('href')
            title=head.find('div',{'class':'tit'}).getText()
            # Strip an optional leading bracketed prefix from the headline.
            nInfo['title']=r1(u'(?:^[【,「].*?[】,」])?(.*)',title) 
            nInfo['newsid']=getMd5(nInfo['url'])             
            desc=word.find('div',{'class':'des'})         
            nInfo['summary']=desc.getText().strip()             
            # Lazy-loaded images keep the real URL in data-original;
            # fall back to src when that attribute is absent.
            img=item.find('img',{'class':'lazy'}).get('data-original')
            if not img:
                img=item.find('img',{'class':'lazy'}).get('src')
            nInfo['thumb']=img               
            nInfo['keywords']=''         
            time_block= item.find('div',{'class':'time'})
            if not time_block:
                continue
            # Join the time <span>s and normalize ' / ' separators so the
            # result matches the '%Y-%m-%d %H:%M' format parsed below.
            timeStr=' '.join(i.getText().replace(' / ','-') for i in time_block.find_all('span'))
            nInfo['ctime']= long(time.mktime(time.strptime(timeStr,'%Y-%m-%d %H:%M')))   
            author=word.find('div',{'class':'aut'})                        
            nInfo['author']=author.getText().strip() if author else ''
            nInfo['source']=ctable
            nInfo['description']=str(desc)
#             print nInfo['ctime'],nInfo['title']
#             print nInfo['newsid'],nInfo['url']
#             print nInfo['author'],nInfo['thumb']
#             print nInfo['keywords'],nInfo['summary']
            newsList.append(nInfo)
    return newsList
Exemple #17
0
def getJsInfo():
    url=r'http://feed.mix.sina.com.cn/api/roll/get?pageid=1&lid=21&num=30&versionNumber=1.2.6&page=1&encode=utf-8&callback=feedCardJsonpCallback&_='
    url+=str(long(time.time()*1000))
#     print url
    content=getHtml(url)
#     print content
   
    newsList=[]
    if content:
        info=json.loads(r1('.*?\((\{.*\})\)',content),encoding='utf-8')
        if info.has_key('result'):
            tResult=info['result']  
            infoList=[]
            if 'data' in tResult:
                infoList+=tResult['data']
#             if 'cre' in tResult:
#                 infoList+=tResult['cre']
#             if 'pdps' in tResult:
#                 infoList+=tResult['pdps']
#             if 'top' in tResult:
#                 infoList+=tResult['top']
            for info in infoList:
                try:
                    nInfo={}
                    nInfo['newsid']=getMd5(info['url'])
                    nInfo['title']= info['title']                
                    nInfo['url']=info['url']                
                    nInfo['summary']=info['summary']
                    nInfo['thumb']=info['img']['u']                    
                    nInfo['keywords']=info['keywords'] if 'keywords' in info else ''
                    nInfo['source']=ctable
                    nInfo['ctime']=long(info['ctime'])   
                    nInfo['author']=info['author']
                    nInfo['description']=''
                    newsList.append(nInfo)
                except:
                    print 'Error:',info['url']
                    logging.error(info['url'])
    return newsList