def setRdInfo(newsitem, rd):
    try:
        print("Setting up Requestdata ... NewsTitle is " + newsitem["title"])
        # detailUrl() is expected to return the parsed detail payload
        # (shareUrl, images, html) for this item
        detail = detailUrl(newsitem)
        print detail
        rd.setClipUrl(detail['shareUrl'])
        rd.setClipTitle(newsitem["title"])
        pubtime = newsitem['releaseDate']
        rd.setPublishTime(pubtime)
        imagesrc = []
        rd.setCategory('image')
        imglist = detail['images']
        for img in imglist:
            imagesrc.append(img['imageUrl'])
        rd.setSourceUrl(imagesrc)
        content = ''
        text = BeautifulSoup(detail['html']).find_all('p')
        for t in text:
            if t.text:
                content += t.text
        content = content.replace('\r', '').replace('\n', '').replace('\t', '').replace('"', '“')
        rd.setContent(content)
        if len(content) > 0:
            f = file('./newstext/' + getMd5(rd._clip_title) + '.txt', 'w+')
            f.write(content)
            f.close()
        print '====*' + rd._clip_title + '*===='
        print json.dumps(rd.hostparseToJson())
        return rd, True
    except:
        appLogger.error(traceback.format_exc())
        print('crawl page failed')
        return rd, False
def setRdInfo(newsitem, rd):
    try:
        print("Setting up Requestdata ... NewsTitle is " + newsitem["title"])
        detailurl = detailUrl(newsitem)
        print detailurl
        rd.setClipUrl(detailurl)
        rd.setClipTitle(newsitem["title"])
        pubtime = newsitem['date']
        rd.setPublishTime(pubtime)
        imagesrc = []
        rd.setCategory('image')
        soup = BeautifulSoup(getList(detailurl))
        text = soup.find_all("p")
        imglist = soup.find_all("img")
        for img in imglist:
            if img.has_attr('src') and img['src'] != '':
                imagesrc.append(img['src'].replace('\\"', ''))
        rd.setSourceUrl(imagesrc)
        content = ''
        for t in text:
            if t.string:
                content += t.string
        content = content.replace('\r', '').replace('\n', '').replace('\t', '').replace('"', '“')
        rd.setContent(content)
        print '====*' + rd._clip_title + '*===='
        print json.dumps(rd.hostparseToJson())
        return rd, True
    except KeyError:
        appLogger.warning(traceback.format_exc())
        return rd, False
    except:
        appLogger.error(traceback.format_exc())
        return rd, False
def setRdInfo(newsurl, rd):
    try:
        detailurl = newsurl
        print detailurl
        time.sleep(5)
        detail = getList(detailurl)
        soup = BeautifulSoup(detail, 'lxml')
        rd.setClipUrl(detailurl)
        title = soup.find('title')
        appLogger.info("Setting up Requestdata ... NewsTitle is " + title.text)
        rd.setClipTitle(title.text)
        # NOTE: the publish time is recovered by splitting the info text on the
        # literal year '2017', which only works for articles from that year
        info = soup.find('div', attrs={'class': 'xinhua-info'}).text.split('2017')
        print info
        rd.setPublishTime('2017' + info[1])
        rd.setClipSource(info[0])
        videosrc = []
        rd.setCategory('video')
        vlist = soup.find_all('video')
        for i in vlist:
            videosrc.append(i["src"])
        rd.setSourceUrl(videosrc)
        print json.dumps(rd.hostparseToJson())
        return rd, True
    except:
        appLogger.error(traceback.format_exc())
        appLogger.info('crawl page failed')
        return rd, False
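# Hedged sketch, not part of the original source: the detail-page functions in
# this section all call a getList(url) helper that is not shown here. Assuming
# it simply performs an HTTP GET and returns the response body, a minimal
# Python 2 version could look like the one below (the User-Agent string is an
# arbitrary choice). Note that getChannelNewsList() further down indexes the
# result of getList() like a dict, so the real helper likely differs per crawler.
def getList(url):
    import urllib2
    request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = urllib2.urlopen(request, timeout=30)
    return response.read()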
def getNewsList(channel):
    lasttime = CurrentTime
    print("Crawling Channels ... channelid is " + channel['name'])
    pagenum = 0
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    while cmp(lasttime, TenDaysAgoTime) == 1:
        time.sleep(1)
        print channel['name'], pagenum
        try:
            newsList = id2Url(channel, pagenum)
            if newsList is None or len(newsList) < 1:
                break
        except:
            appLogger.error(traceback.format_exc())
            print("fail to crawl page")
            break
        for newsitem in newsList:
            time.sleep(0.5)
            #print(newsitem["title"])
            rd, isSuccess = setRdInfo(newsitem, rd)
            if not isSuccess:
                continue
            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            if len(rd._source_url) != 0 or len(rd._content) != 0:
                sendToMQ(rd)
            lasttime = rd._publish_time
        # pagenum is never advanced; the trailing break means only the first
        # page is fetched per call
        break
def main():
    global wechatConf, wechatCrawlerQueue, Channels, CurrentTime, TenDaysAgoTime
    if len(sys.argv) != 3:
        usage()
    config_dir = sys.argv[2]
    configFile = os.path.join(config_dir, MODULENAME + ".conf")
    #1. load system config
    appConf = appSystemVars.appSystemConf
    appConf.loadConfigBuffer(MAINCONFIGBUFFER)
    crawlerDB = appConf.getCrawlerDB()
    resultManager = appConf.getResultManager()
    DBPC = appConf.getDBPC()
    logConfigger = appConf.getLogger()
    ampqer = appConf.getMQ()
    timeperiod = appConf.getTimePeriod()
    tracksource = appConf.getTrackSource()
    tracksourceHost = tracksource.getHost()
    tracksourcePort = tracksource.getPort()
    #2. load wechat config, falling back to the local config file on failure
    try:
        Channels = getWechatChannels(tracksourceHost, tracksourcePort)
    except:
        appLogger.error(traceback.format_exc())
        wechatConf = loadConfig(configFile)
        Channels = wechatConf['channels']
    wechatCrawlerQueue = appCrawlerQueue(ampqer.getURL(), ampqer.getExchange(),
                                         ampqer.getRoutingKey(), ampqer.getHostQueue())
    CurrentTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    TenDaysAgo = datetime.datetime.now() - datetime.timedelta(int(timeperiod))
    TenDaysAgoTime = TenDaysAgo.strftime("%Y-%m-%d %H:%M:%S")
    #start crawler
    appLogger.info("Start wechatCrawler ...")
    print len(Channels)
    start = time.time()
    # substring check: a channel is crawled if its name appears in this string
    channel_list = '太行日报 新华视点 淮北日报 新华视界 新华国际 微黔江 天津日报 人民日报 上海观察 深圳特区报'
    for channel in Channels:
        if channel['name'] in channel_list:
            #print channel['name']
            getNewsList(channel)
            time.sleep(300)
    end = time.time()
    #crawl timeline
    print end - start
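# Hedged sketch, assumption only: loadConfig() is not shown in this section.
# main() reads wechatConf['channels'] and the crawl functions use channel['id'],
# channel['name'] and channel['sign'], so the config is presumably a JSON file
# of roughly this shape:
#   {"channels": [{"id": "...", "name": "...", "sign": "..."}, ...]}
# A minimal reader under that assumption:
def loadConfig(config_file):
    with open(config_file, 'r') as f:
        return json.load(f)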
def wechatNewsList(channel):
    url = ('http://weixin.sogou.com/weixin?type=1&query=' + channel["sign"]
           + '&ie=utf8&_sug_=y&_sug_type_=')
    phantomjs_cmd = 'python bin/load_phantomjs.py wechat load_page "%s"' % (url, )
    p = subprocess.Popen(phantomjs_cmd, shell=True)
    p.communicate()
    # img_news/video_news must be pre-initialised, otherwise the return below
    # raises NameError when the phantomjs run reports an error
    success, img_news, video_news = False, None, None
    res = json.load(open('wechat.out', 'r+'))
    #print res
    if not res['status']:
        success = True
        img_news = res['data']['img_news']
        video_news = res['data']['video_news']
    else:
        appLogger.error(res['data']['error'])
    return success, img_news, video_news
def weiboNewsList(channel):
    url = 'http://www.weibo.com/' + channel['sign'] + '?profile_ftype=1&is_all=1#_0'
    success, news = False, None
    phantomjs_cmd = 'python bin/load_phantomjs.py weibo load_page "%s"' % (url, )
    p = subprocess.Popen(phantomjs_cmd, shell=True)
    p.communicate()
    #time.sleep(300)
    res = json.load(open('weibo.out', 'r+'))
    #print res
    if not res['status']:
        success = True
        news = res['data']['news']
    else:
        appLogger.error(res['data']['error'])
    return success, news
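# Note on the phantomjs handoff (inferred from the two functions above, not from
# the wrapper script itself): bin/load_phantomjs.py is expected to write its
# result to wechat.out / weibo.out as JSON with a falsy "status" on success,
# roughly {"status": 0, "data": {"img_news": [...], "video_news": [...]}} for
# wechat and {"status": 0, "data": {"news": [...]}} for weibo; on failure the
# error text is read from data['error'].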
def getNewsList(channel):
    appLogger.info("Crawling Channels ... channelid is " + channel['name'])
    time.sleep(10)
    print channel['sign'], channel['name'].encode('utf-8')
    try:
        isSuccess, newsList = weiboNewsList(channel)
        if not isSuccess or (newsList is None or len(newsList) < 1):
            return
    except:
        appLogger.error(traceback.format_exc())
        appLogger.warn("fail to crawl " + channel['name'] + " main page")
        return
    for rd in newsList:
        rd['trackSourceId'] = int(channel['id'])
        appLogger.info(rd['clipTitle'] + " is successfully crawled, sending to MQ...")
        if len(rd['sourceUrl']) != 0 or len(rd['content']) != 0:
            sendToMQ(rd)
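# Hedged sketch, assumption only: sendToMQ() is referenced throughout this
# section but never defined here. Based on the base64/JSON handoff used by
# getChannelNewsList() further below, it presumably serialises the item and
# publishes it on the crawler queue created in main(). Both the queue global
# (taken from the wechat main()) and the sendTask() method name are
# hypothetical, not a confirmed appCrawlerQueue API.
def sendToMQ(rd):
    item = rd if isinstance(rd, dict) else rd.hostparseToJson()
    payload = base64.encodestring(json.dumps(item))
    wechatCrawlerQueue.sendTask(payload)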
def getChannelNewsList(channelid):
    print("Crawling Channels ... channelid is " + channelid)
    pagenum = 1
    lasttime = CurrentTime
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    #print channelid
    while cmp(lasttime, TenDaysAgoTime) == 1:
        #print pagenum
        listurl = id2Url(ChannelIds[channelid], pagenum)
        print listurl
        # the video channel ('T1457068979049') keys its list by the literal
        # string u'视频' rather than by the channel id
        if ChannelIds[channelid] == 'T1457068979049':
            newsLists = getList(listurl)[u'\u89c6\u9891']
        else:
            newsLists = getList(listurl)[ChannelIds[channelid]]
        if len(newsLists) == 0:
            break
        for newsitem in newsLists:
            #print json.dumps(newsitem)
            try:
                if newsitem.has_key('mp4_url') or newsitem.has_key('m3u8_url'):
                    rd = setVinfo(newsitem, rd)
                else:
                    rd = setRdInfo(newsitem, rd)
                lasttime = rd._publish_time
                if not rd:
                    continue
                #print rd._publish_time + ":::::" + rd._clip_title
                print("Newsitem is successfully crawled, sending to MQ...")
                #print rd.hostparseToStr()
                rd_json = rd.hostparseToJson()
                rd_base64 = base64.encodestring(json.dumps(rd_json))
                setTask(rd_base64)
            except:
                appLogger.error(traceback.format_exc())
                print("Newsitem crawling failed")
                continue
        pagenum += 1
def getNewsList(channel):
    print channel
    pagenum = 1
    lasttime = CurrentTime
    print("Crawling Channels ... channelid is " + channel['name'])
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    while cmp(lasttime, TenDaysAgoTime) == 1:
        time.sleep(1)
        print channel['id'], pagenum
        clipurl = id2Url(channel, pagenum)
        try:
            newsLists = json.loads(getList(clipurl))
            newsList = []
            for i in newsLists['dataList']:
                for j in i['list']:
                    newsList.append(j)
            if newsList is None or len(newsList) < 1:
                break
        except:
            appLogger.error(traceback.format_exc())
            break
        for newsitem in newsList:
            rd, isSuccess = setRdInfo(newsitem, rd)
            if not isSuccess:
                continue
            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            if len(rd._source_url) != 0 or len(rd._content) != 0:
                sendToMQ(rd)
            lasttime = rd._publish_time
        pagenum += 1
def setRdInfo(newsitem, rd):
    try:
        #print json.dumps(newsitem)
        print("Setting up Requestdata ... NewsTitle is " + newsitem["title"])
        clip_url = detail_url(newsitem['id'])
        detail = json.loads(getList(clip_url))
        rd.setClipTitle(newsitem["title"])
        pubtime = newsitem['publish_time']
        rd.setPublishTime(pubtime)
        imagesrc = []
        rd.setCategory('image')
        if newsitem.has_key("childs_data"):
            imglist = newsitem["childs_data"]
            for img in imglist:
                imagesrc.append(img['host'] + img['dir'] + img['filepath'] + img['filename'])
        rd.setSourceUrl(imagesrc)
        rd.setClipUrl(newsitem['content_url'])
        soup = BeautifulSoup(detail['content'])
        text = soup.find_all("p")
        content = ''
        for t in text:
            if t.string:
                content += t.string
        content = content.replace('\r', '').replace('\n', '').replace('\t', '').replace('"', '“')
        rd.setContent(content)
        # a copy of the request data carries the video variant of the same item
        rdv = rd.copyrd()
        rdv.setCategory('video')
        vsrc = []
        if detail.has_key('content_video_url'):
            vsrc.append(detail['content_video_url'])
        rdv.setSourceUrl(vsrc)
        rd.hostparseToJson()
        rdv.hostparseToJson()
        return rd, rdv, True
    except:
        appLogger.error(traceback.format_exc())
        print('crawl page failed')
        return rd, rd, False
def setRdInfo(newsitem, rd):
    try:
        print("Setting up Requestdata ... NewsTitle is " + newsitem["title"])
        detailurl = detailUrl(newsitem)
        print detailurl
        rd.setClipUrl(detailurl)
        rd.setClipTitle(newsitem["title"])
        pubtime = DateFormat(int(newsitem['ts']) / 1000)
        rd.setPublishTime(pubtime)
        rd.setClipSource(newsitem['site'])
        imagesrc = []
        rd.setCategory('image')
        soup = BeautifulSoup(getList(detailurl))
        text = soup.find_all("p")
        imglist = soup.find_all("img")
        for img in imglist:
            if img.has_attr('src') and img['src']:
                imagesrc.append(img['src'].replace('\\"', ''))
            if img.has_attr('data-src') and img['data-src']:
                imagesrc.append(img['data-src'].replace('\\"', ''))
        rd.setSourceUrl(imagesrc)
        content = ''
        for t in text:
            if t.string:
                content += t.string
        content = content.replace('\r', '').replace('\n', '').replace('\t', '').replace('"', '“')
        rd.setContent(content)
        if content:
            f = file('./newstext/' + getMd5(rd._clip_title) + '.txt', 'w+')
            f.write(content)
            f.close()
        print '====*' + rd._clip_title + '*===='
        print json.dumps(rd.hostparseToJson())
        return rd, True
    except:
        appLogger.error(traceback.format_exc())
        print('crawl page failed')
        return rd, False
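# Hedged sketches, assumptions only: getMd5() and DateFormat() are used above
# but not defined in this section. getMd5() is presumably an MD5 hex digest of
# the title (used as a filename), and DateFormat() presumably turns a unix
# timestamp in seconds into the "%Y-%m-%d %H:%M:%S" strings used elsewhere for
# publish times. Minimal Python 2 versions under those assumptions:
def getMd5(text):
    import hashlib
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    return hashlib.md5(text).hexdigest()

def DateFormat(ts):
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(ts))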
def setRdInfo(newsitem, rd):
    try:
        detailurl = newsitem
        print detailurl
        soup = BeautifulSoup(getList(detailurl))
        info = soup.find('section', attrs={'class': 'dbt'})
        contentsoup = soup.find('div', attrs={'class': 'wen'})
        title = info.h1.text
        pubtime = info.h2.span.i.text
        print("Setting up Requestdata ... NewsTitle is " + title)
        rd.setClipUrl(detailurl)
        rd.setClipTitle(title)
        rd.setPublishTime(pubtime)
        imagesrc = []
        rd.setCategory('image')
        text = contentsoup.find_all("p")
        imglist = contentsoup.find_all("img")
        for img in imglist:
            # guard against <img> tags without a src attribute
            if img.has_attr('src') and img['src'] != '':
                imagesrc.append('http://news.2500sz.com/' + img['src'].replace('\\"', ''))
        rd.setSourceUrl(imagesrc)
        content = ''
        for t in text:
            if t.text:
                content += t.text
        content = content.replace('\r', '').replace('\n', '').replace('\t', '').replace('"', '“')
        rd.setContent(content)
        if len(content) > 0:
            f = file('./newstext/' + getMd5(rd._clip_title) + '.txt', 'w+')
            f.write(content)
            f.close()
        print json.dumps(rd.hostparseToJson())
        return rd, True
    except:
        appLogger.error(traceback.format_exc())
        print('crawl page failed')
        return rd, False
logConfigger = appConf.getLogger()
ampqer = appConf.getMQ()
timeperiod = appConf.getTimePeriod()
#2. load wenhui config
wenhuiConf = loadConfig(configFile)
Channels = getChannels()
#wenhuiCrawlerQueue = appCrawlerQueue(wenhuiConf["amqpurl"], wenhuiConf["request_queue"], wenhuiConf["request_queue"], wenhuiConf["request_queue"])
wenhuiCrawlerQueue = appCrawlerQueue(ampqer.getURL(), ampqer.getExchange(),
                                     ampqer.getRoutingKey(), ampqer.getHostQueue())
CurrentTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
TenDaysAgo = datetime.datetime.now() - datetime.timedelta(int(timeperiod))
TenDaysAgoTime = TenDaysAgo.strftime("%Y-%m-%d %H:%M:%S")
#start crawler
#print("Start wenhuiCrawler ...")
print Channels
for channel in Channels:
    getNewsList(channel)
#crawl timeline

if __name__ == '__main__':
    try:
        main()
    except:
        appLogger.error(traceback.format_exc())