def getRssInfo(): url = "http://www.huxiu.com/rss/0.xml" d = feedparser.parse(url) # print d.feed.title # print d.feed.link # print d.feed.description # print d.etag # print d.modifed infoList = [] for entry in d.entries: info = {} info["url"] = entry.link info["newsid"] = getMd5(info["url"]) info["title"] = entry.title info["description"] = entry.description info["ctime"] = (long)(time.mktime(entry.published_parsed)) info["author"] = entry.source.title info["source"] = ctable info["keywords"] = "" # print entry # print info['url'] # print info['newsid'] # print info['title'],info['ctime'] # print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(info['ctime'])),info['author'] # print entry.published_parsed # print info['description'] soup = BeautifulSoup(entry.description, "html.parser", from_encoding="utf-8") img = soup.find("img") info["thumb"] = img.get("src") if img else "" info["summary"] = soup.getText() # print info['thumb'] # print info['summary'] infoList.append(info) return infoList
def getHtmlInfo():
    url = 'http://www.zdnet.com.cn/'
    content = getHtml(url)
    newsList = []
    if content:
        soup = BeautifulSoup(content, 'html.parser', from_encoding='gbk')
        soup = soup.find('div', {'id': 'tab1'})
        if not soup:
            return newsList
        itemList = soup.find_all('div', {'class': 'qu_loop'})
        for item in itemList:
            nInfo = {}
            head = item.find('div', {'class': 'qu_tix'})
            nInfo['url'] = head.find('b').find('a').get('href')
            nInfo['newsid'] = getMd5(nInfo['url'])
            nInfo['title'] = head.find('b').getText()
            desc = head.find('p')
            nInfo['summary'] = desc.getText()
            metas = head.find('p', {'class': 'meta'})
            nInfo['keywords'] = ','.join(a.getText() for a in metas.find_all('a')) if metas else ''
            nInfo['thumb'] = item.find('div', {'class': 'qu_ims'}).find('img').get('src')
            timeStr = item.find('div', {'class': 'qu_times'}).getText()
            nInfo['ctime'] = long(time.mktime(time.strptime(timeStr, u'%Y-%m-%d %H:%M:%S')))
            nInfo['source'] = ctable
            nInfo['author'] = ''
            nInfo['description'] = str(desc)
            newsList.append(nInfo)
    return newsList
def getHtmlInfo():
    url = r'http://tech.163.com/'
    content = getHtml(url)
    newsList = []
    if content:
        soup = BeautifulSoup(content, 'html.parser', from_encoding='gbk')
        itemList = soup.find_all('div', {'class': 'hot_board clearfix'})
        for item in itemList:
            nInfo = {}
            nInfo['url'] = item.find('a').get('href')
            head = item.find('div', {'class': 'hb_detail'})
            # getText() is safer than .string, which returns None when the
            # tag holds more than one child node.
            nInfo['title'] = head.find('h3').getText()
            nInfo['newsid'] = getMd5(nInfo['url'])
            desc = head.find('p').getText()
            nInfo['summary'] = desc
            img = item.find('div', {'class': 'img_box'})
            if img:
                img = img.find('img')
                # Lazy-loaded images keep the real URL in data-src.
                nInfo['thumb'] = img.get('src') if img.get('src') else img.get('data-src')
            else:
                continue
            nInfo['keywords'] = head.find('span').getText()
            timeStr = head.find('em').get('time')
            nInfo['ctime'] = long(time.mktime(time.strptime(timeStr, '%m/%d/%Y %H:%M:%S')))
            nInfo['source'] = ctable
            nInfo['author'] = ''
            nInfo['description'] = desc
            newsList.append(nInfo)
    return newsList
def getHtmlInfo(): url = "http://www.donews.com/original/" content = getHtml(url) # print content newsList = [] if content: soup = BeautifulSoup(content, "html.parser", from_encoding="utf-8") block = soup.find("ul", {"class": "art_list mt11"}) if not block: return newsList itemList = block.find_all("li") for item in itemList: nInfo = {} head = item.find("h5", {"class": "title"}).find("a") nInfo["url"] = head.get("href") title = head.getText() # the result returned by getText() is unicode nInfo["title"] = r1(u"(?:^[【,「].*?[】,」])?(.*)", title) nInfo["newsid"] = getMd5(nInfo["url"]) desc = item.find("p", {"class": "info"}) nInfo["summary"] = desc.getText().strip() img = item.find("img") nInfo["thumb"] = img.get("src") if img else "" nInfo["keywords"] = "" timeStr = item.find("span", {"class": "time"}).getText() timeStr = time.strftime("%Y年").decode("utf-8") + timeStr nInfo["ctime"] = long(time.mktime(time.strptime(timeStr, u"%Y年%m月%d日 %H:%M"))) nInfo["source"] = ctable nInfo["author"] = item.find("span", {"class": "place"}).getText() nInfo["description"] = str(desc) # print nInfo['newsid'],nInfo['url'] # print nInfo['author'],nInfo['thumb'] # print nInfo['summary'] newsList.append(nInfo) return newsList
def getHtmlInfo(url):
    # e.g. url='http://news.hiapk.com/internet/'
    content = getHtml(url)
    newsList = []
    if content:
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        itemList = soup.find_all('div', {'class': 'box'})
        for item in itemList:
            nInfo = {}
            head = item.find('strong').find('a')
            nInfo['url'] = head.get('href')
            title = head.getText()
            nInfo['title'] = r1(u'(?:^[【,「].*?[】,」])?(.*)', title)
            nInfo['newsid'] = getMd5(nInfo['url'])
            desc = item.find('p', {'class': 'intro'})
            nInfo['summary'] = desc.getText()
            nInfo['thumb'] = item.find('img').get('src')
            nInfo['keywords'] = ','.join(i.getText() for i in item.find('p', {'class': 'clearfix tag'}).find_all('a'))
            # The page omits the year, so prepend the current one; around New
            # Year this can push December dates into the future, in which
            # case roll them back one year.
            timeStr = time.strftime('%Y') + '-' + item.find('span', {'class': 'right time'}).getText()
            timeLong = long(time.mktime(time.strptime(timeStr, '%Y-%m-%d')))
            threshold = time.time() + 1000
            if timeLong > threshold:
                timeLong -= 3600 * 24 * 365
            nInfo['ctime'] = timeLong
            nInfo['source'] = ctable
            nInfo['author'] = ''
            nInfo['description'] = str(desc)
            newsList.append(nInfo)
    return newsList
def getRssInfo():
    url = 'http://www.tmtpost.com/rss.xml'
    d = feedparser.parse(url)
    infoList = []
    for entry in d.entries:
        info = {}
        info['url'] = entry.link
        info['newsid'] = getMd5(info['url'])
        info['title'] = entry.title
        info['ctime'] = long(time.mktime(entry.published_parsed))
        info['author'] = entry.author
        info['source'] = ctable
        tags = entry.tags if 'tags' in entry else None
        info['keywords'] = ','.join(tag.term for tag in tags) if tags else ''
        info['description'] = entry.content[0].value
        soup = BeautifulSoup(info['description'], 'html.parser', from_encoding='utf-8')
        img = soup.find('img')
        info['thumb'] = img.get('src') if img else ''
        info['summary'] = ' '.join(p.getText().strip() for p in soup.find_all('p'))
        infoList.append(info)
    return infoList
def getHtmlInfo(url):
    # e.g. url='http://it.sohu.com/internet_2014.shtml'
    content = getHtml(url)
    newsDict = {}
    if content:
        soup = BeautifulSoup(content, 'html.parser', from_encoding='gbk')
        itemList = soup.find_all('div', {'class': 'item clear'})
        for item in itemList:
            nInfo = {}
            item_txt = item.find('div', {'class': 'item-txt'})
            head = item_txt.find('h3').find('a')
            nInfo['title'] = head.getText()
            nInfo['url'] = head.get('href')
            nInfo['newsid'] = getMd5(nInfo['url'])
            item_info = item_txt.find('div', {'class': 'news-info'})
            timeStr = item_info.find('span', {'class': 'time'}).getText()
            nInfo['ctime'] = long(time.mktime(time.strptime(timeStr, u'%Y年%m月%d日%H:%M')))
            desc = item_txt.find('p')
            nInfo['summary'] = desc.getText()
            item_pic = item.find('div', {'class': 'item-pic'})
            nInfo['thumb'] = item_pic.find('img').get('src') if item_pic else ''
            nInfo['keywords'] = ''
            nInfo['source'] = ctable
            nInfo['author'] = item_info.find('span', {'class': 'edit-info'}).getText().strip()
            nInfo['description'] = str(desc)
            # Keyed by newsid so duplicate URLs collapse naturally.
            newsDict[nInfo['newsid']] = nInfo
    return newsDict
def getJsInfo():
    # Cache-busting query parameter: current time in milliseconds.
    url = r'http://it.sohu.com/tag/0270/000021270_86_utf8.inc?_='
    url += str(long(time.time() * 1000))
    content = getHtml(url)
    newsDict = {}
    if content:
        # The .inc endpoint wraps a JSON array; strip everything around it.
        infoList = json.loads(r1(r'.*?(\[.*\])', content), encoding='utf-8')
        for info in infoList:
            try:
                nInfo = {}
                nInfo['newsid'] = getMd5(info['url'])
                nInfo['title'] = info['title']
                nInfo['url'] = info['url']
                nInfo['summary'] = info['text']
                nInfo['thumb'] = info['image']
                nInfo['keywords'] = ','.join(tag['name'] for tag in info['tags']) if len(info['tags']) > 0 else ''
                nInfo['source'] = ctable
                nInfo['ctime'] = long(time.mktime(time.strptime(info['time'], u'%Y年%m月%d日%H:%M')))
                nInfo['author'] = info['source']
                nInfo['description'] = ''
                newsDict[nInfo['newsid']] = nInfo
            except Exception:
                print 'Error:', info['url']
                logging.error(info['url'])
    return newsDict
def getRssInfo():
    url = 'http://36kr.com/feed'
    d = feedparser.parse(url)
    infoList = []
    for entry in d.entries:
        info = {}
        info['url'] = entry.link
        info['newsid'] = getMd5(info['url'])
        info['title'] = entry.title
        info['description'] = entry.description
        info['ctime'] = long(time.mktime(entry.published_parsed))
        info['author'] = entry.author
        info['source'] = ctable
        info['keywords'] = ''
        soup = BeautifulSoup(entry.description, 'html.parser', from_encoding='utf-8')
        img = soup.find('img')
        info['thumb'] = img.get('src') if img else ''
        info['summary'] = soup.getText()
        infoList.append(info)
    return infoList
def getHtmlInfo():
    url = 'http://www.bnet.com.cn/files/list_more_2013.php?class_id=129&page=1'
    content = getHtml(url)
    newsList = []
    if content:
        soup = BeautifulSoup(content, 'html.parser', from_encoding='gbk')
        itemList = soup.find_all('div', {'class': 'item'})
        for item in itemList:
            nInfo = {}
            head = item.find('h3').find('a')
            nInfo['url'] = head.get('href')
            title = head.getText()
            nInfo['title'] = r1(u'(?:^[【,「].*?[】,」])?(.*)', title)
            nInfo['newsid'] = getMd5(nInfo['url'])
            desc = item.find('p', {'class': 'summary'})
            nInfo['summary'] = desc.getText().strip()
            img = item.find('a', {'class': 'thumb'}).find('img')
            nInfo['thumb'] = img.get('src')
            nInfo['keywords'] = ','.join(i.getText() for i in item.find_all('span')[1].find_all('a'))
            # The date is only available in the article URL, e.g. /2014/0102/.
            timeStr = r1(r'/(\d{4}/\d{4})/', nInfo['url'])
            nInfo['ctime'] = long(time.mktime(time.strptime(timeStr, '%Y/%m%d')))
            nInfo['source'] = ctable
            nInfo['author'] = ''
            nInfo['description'] = str(desc)
            newsList.append(nInfo)
    return newsList
def getRssInfo(): url = "http://techcrunch.cn/feed/" d = feedparser.parse(url) # print d.feed.title # print d.feed.link # print d.feed.description # print d.etag # print d.modifed infoList = [] for entry in d.entries: info = {} # print entry info["url"] = entry.link info["newsid"] = getMd5(info["url"]) # print info['newsid'],info['url'] info["title"] = entry.title info["ctime"] = (long)(time.mktime(entry.published_parsed)) info["author"] = entry.author # print timeFormat.getTimeStamp(info['ctime']),info['title'] # print info['author'] info["source"] = ctable tags = entry.tags if "tags" in entry else None info["keywords"] = ",".join(tag.term for tag in tags) if tags else "" info["description"] = entry.description soup = BeautifulSoup(info["description"], "html.parser", from_encoding="utf-8") img = soup.find("img") info["thumb"] = img.get("src") if img else "" info["summary"] = " ".join(p.getText().strip() for p in soup.find_all("p")) # print info['keywords'],info['thumb'] # print soup.getText() # print info['description'] infoList.append(info) return infoList
def getHtmlInfo(): url = "http://www.ciweek.com/v7/list.jsp" domain = "http://www.ciweek.com" content = getHtml(url) # print content newsList = [] if content: soup = BeautifulSoup(content, "html.parser", from_encoding="gbk") itemList = soup.find_all("dl", {"class": "clearfix"}) for item in itemList: nInfo = {} head = item.find("h2").find("a") nInfo["url"] = domain + head.get("href") nInfo["title"] = head.getText() nInfo["newsid"] = getMd5(nInfo["url"]) desc = item.find("p").find("span") nInfo["summary"] = desc.getText() img = item.find("img") nInfo["thumb"] = img.get("src") if img else "" nInfo["keywords"] = "" timeStr = item.find("span", {"class": "date hidden-xs"}).getText() nInfo["ctime"] = long(time.mktime(time.strptime(timeStr, "%Y-%m-%d"))) nInfo["source"] = ctable nInfo["author"] = "" nInfo["description"] = str(desc) newsList.append(nInfo) return newsList
def getHtmlInfo(url):
    # e.g. url='http://news.csdn.net/news/1'
    content = getHtml(url)
    newsList = []
    if content:
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        itemList = soup.find_all('div', {'class': 'unit'})
        for item in itemList:
            nInfo = {}
            head = item.find('h1').find('a')
            nInfo['url'] = head.get('href')
            title = head.getText()
            nInfo['title'] = r1(u'(?:^[【,「].*?[】,」])?(.*)', title)
            nInfo['newsid'] = getMd5(nInfo['url'])
            desc = item.find('dd')
            nInfo['summary'] = desc.getText()
            img = item.find('img')
            nInfo['thumb'] = img.get('src') if img else ''
            nInfo['keywords'] = ','.join(tag.getText() for tag in item.find('div', {'class': 'tag'}).find_all('a'))
            # Timestamps are either absolute ('2014-01-02 12:30') or relative
            # ('N小时前'); fall back to the crawl time if neither matches.
            # Note: parse the extracted timeStamp, not the raw timeStr, which
            # may carry surrounding text.
            timeStr = item.find('span', {'class': 'ago'}).getText()
            timeStamp = r1(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2})', timeStr)
            if not timeStamp:
                hourago = r1(u'(\d{1,2})小时前', timeStr)
                ctime = time.time() - int(hourago) * 3600 if hourago else time.time()
            else:
                ctime = time.mktime(time.strptime(timeStamp, '%Y-%m-%d %H:%M'))
            nInfo['ctime'] = long(ctime)
            nInfo['source'] = ctable
            nInfo['author'] = ''
            nInfo['description'] = str(desc)
            newsList.append(nInfo)
    return newsList
def getHtmlInfo():
    domain = 'http://news.ittime.com.cn'
    url = 'http://news.ittime.com.cn/newslist.shtml'
    content = getHtml(url)
    newsList = []
    if content:
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        itemList = soup.find_all('div', {'class': 'left-list'})
        for item in itemList:
            nInfo = {}
            head = item.find('h3').find('a')
            # Links and image paths are relative, so prefix the site domain.
            nInfo['url'] = domain + head.get('href')
            nInfo['title'] = head.getText()
            nInfo['newsid'] = getMd5(nInfo['url'])
            desc = item.find('p')
            nInfo['summary'] = desc.getText()
            img = item.find('a', {'class': 'img_212'}).find('img')
            nInfo['thumb'] = domain + img.get('src')
            nInfo['keywords'] = ','.join(i.getText() for i in item.find('span', {'style': 'float:left;'}).find_all('a'))
            timeStr = r1(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', item.find('div', {'class': 'box-other1'}).get_text())
            nInfo['ctime'] = long(time.mktime(time.strptime(timeStr, '%Y-%m-%d %H:%M:%S')))
            nInfo['source'] = ctable
            author = item.find('div', {'class': 'box-other3'}).find('a')
            nInfo['author'] = author.getText() if author else ''
            nInfo['description'] = str(desc)
            newsList.append(nInfo)
    return newsList
def getHtmlInfo(): url = r"http://www.huxiu.com" wap_url = "http://m.huxiu.com" content = getHtml(url) # print content newsList = [] if content: soup = BeautifulSoup(content, "html.parser", from_encoding="utf-8") itemList = soup.find_all("div", {"class": "mod-b mod-art "}) itemList += soup.find_all("div", {"class": "mod-b mod-art mod-b-push"}) for item in itemList: nInfo = {} head = item.find("", {"class": "mob-ctt"}) if not head: continue title = head.find("h3") if not title: continue title = title.find("a") nInfo["url"] = url + title.get("href") nInfo["title"] = title.getText() nInfo["newsid"] = getMd5(nInfo["url"]) nInfo["summary"] = item.find("div", {"class": "mob-sub"}).getText() nInfo["description"] = nInfo["summary"] nInfo["thumb"] = item.find("img", {"class": "lazy"}).get("data-original") nInfo["keywords"] = "" timeStr = head.find("span", {"class": "time"}).getText() timeSec = time.time() min_num = r1(u"(\d{1,2})分钟前", timeStr) if min_num: timeSec -= 60 * long(min_num) else: hour_num = r1(u"(\d{1,2})小时前", timeStr) if hour_num: timeSec -= 3600 * long(hour_num) else: day_num = r1(u"(\d{1,2})天前", timeStr) timeSec = timeSec - long(day_num) * 24 * 3600 if day_num else timeSec nInfo["ctime"] = timeSec author_div = item.find("div", {"class": "mob-author"}) nInfo["author"] = "" if author_div: author_span = author_div.find("span", {"class": "author-name "}) nInfo["author"] = author_span.getText() if author_span else "" nInfo["source"] = ctable newsList.append(nInfo) return newsList
def getHtmlInfo():
    url = 'http://www.leiphone.com/page/1'
    content = getHtml(url)
    newsList = []
    if content:
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        soup = soup.find('div', {'class': 'lph-pageList index-pageList'})
        if not soup:
            return newsList
        itemList = soup.find_all('li', {'class': 'pbox clr'})
        for item in itemList:
            nInfo = {}
            word = item.find('div', {'class': 'word'})
            head = word.find('a')
            nInfo['url'] = head.get('href')
            title = head.find('div', {'class': 'tit'}).getText()
            nInfo['title'] = r1(u'(?:^[【,「].*?[】,」])?(.*)', title)
            nInfo['newsid'] = getMd5(nInfo['url'])
            desc = word.find('div', {'class': 'des'})
            nInfo['summary'] = desc.getText().strip()
            # Lazy-loaded images keep the real URL in data-original.
            img = item.find('img', {'class': 'lazy'}).get('data-original')
            if not img:
                img = item.find('img', {'class': 'lazy'}).get('src')
            nInfo['thumb'] = img
            nInfo['keywords'] = ''
            time_block = item.find('div', {'class': 'time'})
            if not time_block:
                continue
            timeStr = ' '.join(i.getText().replace(' / ', '-') for i in time_block.find_all('span'))
            nInfo['ctime'] = long(time.mktime(time.strptime(timeStr, '%Y-%m-%d %H:%M')))
            author = word.find('div', {'class': 'aut'})
            nInfo['author'] = author.getText().strip() if author else ''
            nInfo['source'] = ctable
            nInfo['description'] = str(desc)
            newsList.append(nInfo)
    return newsList
def getJsInfo():
    # Cache-busting query parameter: current time in milliseconds.
    url = r'http://feed.mix.sina.com.cn/api/roll/get?pageid=1&lid=21&num=30&versionNumber=1.2.6&page=1&encode=utf-8&callback=feedCardJsonpCallback&_='
    url += str(long(time.time() * 1000))
    content = getHtml(url)
    newsList = []
    if content:
        # Unwrap the JSONP payload: feedCardJsonpCallback({...}).
        info = json.loads(r1(r'.*?\((\{.*\})\)', content), encoding='utf-8')
        if 'result' in info:
            tResult = info['result']
            infoList = []
            # The result may also carry 'cre', 'pdps' and 'top' lists; only
            # 'data' is consumed here.
            if 'data' in tResult:
                infoList += tResult['data']
            for info in infoList:
                try:
                    nInfo = {}
                    nInfo['newsid'] = getMd5(info['url'])
                    nInfo['title'] = info['title']
                    nInfo['url'] = info['url']
                    nInfo['summary'] = info['summary']
                    nInfo['thumb'] = info['img']['u']
                    nInfo['keywords'] = info['keywords'] if 'keywords' in info else ''
                    nInfo['source'] = ctable
                    nInfo['ctime'] = long(info['ctime'])
                    nInfo['author'] = info['author']
                    nInfo['description'] = ''
                    newsList.append(nInfo)
                except Exception:
                    print 'Error:', info['url']
                    logging.error(info['url'])
    return newsList
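# How a caller might combine these scrapers is not shown in the original; a
# sketch, assuming only what the snippets guarantee: every record carries a
# 'newsid' (MD5 of the URL), and each function returns either a list of
# records or a dict keyed by newsid.
def collect(fetchers):
    merged = {}
    for fetch in fetchers:
        result = fetch()
        records = result.values() if isinstance(result, dict) else result
        for record in records:
            merged[record['newsid']] = record  # newsid dedupes across sources
    return merged.values()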