def addKu6Url(self): websiteId = tools.getWebsiteId(Constance.KU6) urlDict = { # 视频 最多播放 今日 'http://top.ku6.com/v_t1d1c126000p%d.html': 15, # 视频 最多播放 本周 'http://top.ku6.com/v_t1d2c126000p%d.html': 15, # 视频 最多播放 本月 'http://top.ku6.com/v_t1d3c126000p%d.html': 15, # 视频 最新推荐 'http://top.ku6.com/v_t7d4c126000p%d.html': 30, # 专辑 最多播放 今日 'http://top.ku6.com/p_t1d1c126000p%d.html': 1, # 专辑 最多播放 本周 'http://top.ku6.com/p_t1d2c126000p%d.html': 1, # 专辑 最多播放 本月 'http://top.ku6.com/p_t1d3c126000p%d.html': 15, # 专辑 最新推荐 'http://top.ku6.com/p_t2d4c126000p%d.html': 30 } for baseUrl, pageCount in urlDict.items(): for i in range(1, pageCount + 1): url = baseUrl % i log.debug("ku6 base url = %s" % url) self.addUrl(url, websiteId)
def addPPTVUrl(self): baseUrl = 'http://list.pptv.com/channel_list.html?page=%d&type=210548&sort=1' pageCount = 17 websiteId = tools.getWebsiteId(Constance.PPTV) for i in range(1, pageCount + 1): url = baseUrl % i log.debug("pptv base url = %s" % url) self.addUrl(url, websiteId)
def addWangYiUrl(self): baseUrl = 'http://so.v.163.com/search/000-0-0000-1-%d-0-纪录片/' pageCount = 614 websiteId = tools.getWebsiteId(Constance.WANG_YI) for i in range(1, pageCount + 1): url = baseUrl % i log.debug("wangyi base url = %s" % url) self.addUrl(url, websiteId)
def addTouDouUrl(self): # 全部 剧集 # 添加首页 后续页面在tudou里添加 baseUrl = 'http://www.tudou.com/s3portal/service/pianku/data.action?pageSize=90&app=mainsitepc&deviceType=1&tags=&tagType=3&firstTagId=8&areaCode=&initials=&hotSingerId=&pageNo=1&sortDesc=quality' websiteId = tools.getWebsiteId(Constance.TUDOU) self.addUrl(baseUrl, websiteId, Constance.EPISODE) # 视频 # 添加首页 后续页面在tudou里添加 baseUrl = 'http://www.tudou.com/list/itemData.action?tagType=1&firstTagId=8&areaCode=&tags=&initials=&hotSingerId=&page=1&sort=2&key=' websiteId = tools.getWebsiteId(Constance.TUDOU) self.addUrl(baseUrl, websiteId, Constance.VIDEO) # 栏目 # 添加首页 后续页面在tudou里添加 baseUrl = 'http://www.tudou.com/list/playlistData.action?tagType=2&firstTagId=8&areaCode=&tags=&initials=&hotSingerId=&page=1&sort=2&key=' websiteId = tools.getWebsiteId(Constance.TUDOU) self.addUrl(baseUrl, websiteId, Constance.ITERM)
def addTencentUrl(self): baseUrl = 'http://v.qq.com/x/documentarylist/?itype=-1&offset=%d&sort=4' pageCount = 121 # 后续改成动态获取网页上的尾页 正则:'data-total="(.+)"></div)' websiteId = tools.getWebsiteId(Constance.TENCENT) #self.addUrl('http://v.qq.com/x/documentarylist/?itype=-1&offset=0&sort=4', websiteId) for i in range(0, pageCount * 20, 20): url = 'http://v.qq.com/x/documentarylist/?itype=-1&offset=%d&sort=4' % i log.debug("tencent base url = %s" % url) self.addUrl(url, websiteId)
def addCCTVUrl(self): urlsList = [] filepath = '..\\urls\\cctv.conf' for line in fileinput.input(filepath): url = tools.getInfo(line, '= (.+?)\n') if url: urlsList.extend(url) websiteId = tools.getWebsiteId(Constance.CCTV) for url in urlsList: log.debug("Add url %s to DB" % url) self.addUrl(url, websiteId)
def myTest(): websiteId = tools.getWebsiteId('cctv.com') urls = [] f = open('D:\html.txt', 'r') while True: line = f.readline() if not line: break url = tools.getInfo(line, '"url".+"(.+?)"') if url != []: urls.extend(url) for url in urls: parseLeafUrl(url, websiteId) #myTest()
def addKanKanUrl(self): def getPageNum(url): html = tools.getHtml(url) regex = 'list-pager-v2.*>(.*?)</a><a id="pagenav_next"' pageNum = tools.getInfo(html, regex) pageNum = len(pageNum) == 0 and '1' or pageNum[0] # log.debug(pageNum) return int(pageNum) # 全部视频 websiteId = tools.getWebsiteId(Constance.KAN_KAN) baseUrl = 'http://movie.kankan.com/type/documentary/' log.debug("kankan base url = %s" % baseUrl) self.addUrl(baseUrl, websiteId) pageCount = getPageNum(baseUrl) log.debug("kankan 页数 = %d" % pageCount) for i in range(2, pageCount + 1): url = baseUrl + 'page%d/' % i log.debug("kankan base url = %s" % url) self.addUrl(url, websiteId) # 按类型 log.debug('----------------------按类型分类-----------------------') regex = '"div_genre">(.*?)</dd>' html = tools.getHtml(baseUrl) typeBlockUrl = tools.getInfo(html, regex) regex = ' href="(.*?)"' typeUrls = tools.getInfo(typeBlockUrl, regex) for typeBaseUrl in typeUrls: log.debug("kankan type base url = %s" % typeBaseUrl) pageCount = getPageNum(typeBaseUrl) log.debug("kankan 页数 = %d" % pageCount) self.addUrl(typeBaseUrl, websiteId) for i in range(2, pageCount + 1): url = typeBaseUrl + 'page%d/' % i log.debug("kankan type base url = %s" % url) self.addUrl(url, websiteId)
def addSoHuUrl(self): baseUrl = "http://www.sohu.com/" websiteId = tools.getWebsiteId(Constance.SOHU) self.addUrl(baseUrl, websiteId)
def addTencentUrl(self): baseUrl = "http://www.qq.com" websiteId = tools.getWebsiteId(Constance.TENCENT) self.addUrl(baseUrl, websiteId)
def addYoukuUrl(self): # 取页数 def getPageNum(pageUrl): html = tools.getHtml(pageUrl) pageNumRegexs = [ '<ul class="yk-pages">.*>(.*?)</a></li><li class="next"', '<ul class="yk-pages">.*>(.*?)</span></li><li class="next"' ] pageNum = tools.getInfo(html, pageNumRegexs) pageNum = len(pageNum) == 0 and '0' or pageNum[0] # log.debug(pageNum) return int(pageNum) showUrl = 'http://list.youku.com/category/show/c_84_s_1_d_1_p_1.html' videoUrl = 'http://list.youku.com/category/video/c_84_d_1_s_1_p_1.html' showPageCount = getPageNum(showUrl) videoPageCount = getPageNum(videoUrl) websiteId = tools.getWebsiteId(Constance.YOUKU) ## 全部节目类 for i in range(1, showPageCount + 1): url = showUrl.replace('_p_1.html', '_p_%d.html' % i) # log.debug("youku base url = %s"%url) self.addUrl(url, websiteId, 'show') ## 全部视频类 for i in range(1, videoPageCount + 1): url = videoUrl.replace('_p_1.html', '_p_%d.html' % i) # log.debug("youku base url = %s"%url) self.addUrl(url, websiteId, 'video') log.debug('----------------------按节目分类-----------------------') ## 节目 按分类 html = tools.getHtml(showUrl) # 取地区url regex = '<label>地区:</label>(.*?)</ul>' regionUrlBlock = tools.getInfo(html, regex) regex = 'href="(.*?)">' regionUrls = tools.getInfo(regionUrlBlock, regex) for regionUrl in regionUrls: log.debug("地区url = " + regionUrl) # 取地区下类型url html = tools.getHtml(regionUrl) regex = "<label>类型:</label>(.*?)</ul>" typeUrlBlock = tools.getInfo(html, regex) regex = 'href="(.*?)">' typeUrls = tools.getInfo(typeUrlBlock, regex) # 取每个类型的页数,拼出每页的地址,存到数据库 for typeUrl in typeUrls: log.debug("typeUrl = " + typeUrl) pageNum = getPageNum(typeUrl) for i in range(1, pageNum + 1): url = typeUrl.replace('.html', '_p_%d.html' % i) self.addUrl(url, websiteId, 'show') log.debug('----------------------按视频分类-----------------------') ## 视频 按分类 regex = "<label>类型:</label>(.*?)</ul>" html = tools.getHtml(videoUrl) typeUrlBlock = tools.getInfo(html, regex) regex = 'href="(.*?)">' typeUrls = tools.getInfo(typeUrlBlock, regex) # 取每个类型的页数,拼出每页的地址,存到数据库 for typeUrl in typeUrls: log.debug("typeUrl = " + typeUrl) pageNum = getPageNum(typeUrl) for i in range(1, pageNum + 1): url = typeUrl.replace('.html', '_p_%d.html' % i) self.addUrl(url, websiteId, 'video')
def addIFengUrl(self): baseUrl = "http://www.ifeng.com/" websiteId = tools.getWebsiteId(Constance.IFENG) self.addUrl(baseUrl, websiteId)
def addV1Url(self): # 添加首页 后续页面在v1里添加 baseUrl = 'http://api.v1.cn/v1Enhanced/interfaceForJsonP?callback=jQuery18308286485691806487_1477619118750&obj=cms.getArticle&cid=1147&page=1&nums=24&_=1477619416282' websiteId = tools.getWebsiteId(Constance.V1) self.addUrl(baseUrl, websiteId)
def addWangYiUrl(self): baseUrl = "http://www.163.com/" websiteId = tools.getWebsiteId(Constance.WANG_YI) self.addUrl(baseUrl, websiteId)
def addPeopleUrl(self): baseUrl = "http://www.people.com.cn/" websiteId = tools.getWebsiteId(Constance.PEOPLE) self.addUrl(baseUrl, websiteId)
def addCCTVUrl(self): baseUrl = "http://www.cctv.com/" websiteId = tools.getWebsiteId(Constance.CCTV) self.addUrl(baseUrl, websiteId)
def addSinaUrl(self): baseUrl = "http://www.sina.com.cn/" websiteId = tools.getWebsiteId(Constance.SINA) self.addUrl(baseUrl, websiteId)
def __inputData(self): if len(Collector._urls) > int( tools.getConfValue("collector", "max_size")): log.debug("collector 已满 size = %d" % len(Collector._urls)) return mylock.acquire() #加锁 website = tools.getConfValue("collector", "website") depth = int(tools.getConfValue("collector", "depth")) urlCount = int(tools.getConfValue("collector", "url_count")) beginTime = time.time() if DEBUG: urlsList = Collector._db.urls.find( { "status": Constance.TODO, "depth": DEPTH }, { "url": 1, "_id": 0, "depth": 1, "description": 1, "website_id": 1 }).sort([("depth", 1)]).limit(urlCount) elif website == 'all': urlsList = Collector._db.urls.find( { "status": Constance.TODO, "depth": { "$lte": depth } }, { "url": 1, "_id": 0, "depth": 1, "description": 1, "website_id": 1 }).sort([("depth", 1)]).limit(urlCount) #sort -1 降序 1 升序 else: websiteId = tools.getWebsiteId(website) urlsList = Collector._db.urls.find( { "status": Constance.TODO, "website_id": websiteId, "depth": { "$lte": depth } }, { "url": 1, "_id": 0, "depth": 1, "description": 1, "website_id": 1 }).sort([("depth", 1)]).limit(urlCount) endTime = time.time() urlsList = list(urlsList) log.debug('get url time ' + str(endTime - beginTime) + " size " + str(len(urlsList))) beginTime = time.time() Collector._urls.extend(urlsList) log.debug('put get url time ' + str(time.time() - beginTime) + " size " + str(len(urlsList))) #更新已取到的url状态为doing beginTime = time.time() for url in urlsList: Collector._db.urls.update(url, {'$set': { 'status': Constance.DOING }}) endTime = time.time() log.debug('update url time ' + str(endTime - beginTime)) if self.isAllHaveDone(): self.stop() exportData.export() mylock.release()
def __inputData(self): if len(Collector._urls) > int( tools.getConfValue("collector", "max_size")): return mylock.acquire() #加锁 website = tools.getConfValue("collector", "website") depth = int(tools.getConfValue("collector", "depth")) urlCount = int(tools.getConfValue("collector", "url_count")) if DEBUG: urlsList = Collector._db.urls.find( { "status": Constance.TODO, "depth": DEPTH }, { "url": 1, "_id": 0, "depth": 1, "description": 1, "website_id": 1 }).sort([("depth", 1)]).limit(urlCount) elif website == 'all': urlsList = Collector._db.urls.find( { "status": Constance.TODO, "depth": { "$lte": depth } }, { "url": 1, "_id": 0, "depth": 1, "description": 1, "website_id": 1 }).sort([("depth", 1)]).limit(urlCount) #sort -1 降序 1 升序 else: websiteId = tools.getWebsiteId(website) urlsList = Collector._db.urls.find( { "status": Constance.TODO, "website_id": websiteId, "depth": { "$lte": depth } }, { "url": 1, "_id": 0, "depth": 1, "description": 1, "website_id": 1 }).sort([("depth", 1)]).limit(urlCount) urlsList = list(urlsList) Collector._urls.extend(urlsList) #更新已取到的url状态为doing for url in urlsList: Collector._db.urls.update(url, {'$set': { 'status': Constance.DOING }}) mylock.release()
def __inputData(self): log.debug('buffer size %d' % self.getMaxReadSize()) log.debug('buffer can write size = %d' % self.getMaxWriteSize()) if self.getMaxWriteSize() == 0: log.debug("collector 已满 size = %d" % self.getMaxReadSize()) return beginTime = time.time() urlCount = Collector._urlCount if Collector._urlCount <= self.getMaxWriteSize( ) else self.getMaxWriteSize() if DEBUG: urlsList = Collector._db.urls.find( { "status": Constance.TODO, "depth": DEPTH }, { "url": 1, "_id": 0, "depth": 1, "description": 1, "website_id": 1 }).sort([("depth", 1)]).limit(urlCount) elif Collector._website == 'all': urlsList = Collector._db.urls.find( { "status": Constance.TODO, "depth": { "$lte": Collector._depth } }, { "url": 1, "_id": 0, "depth": 1, "description": 1, "website_id": 1 }).sort([("depth", 1)]).limit(urlCount) #sort -1 降序 1 升序 else: websiteId = tools.getWebsiteId(Collector._website) urlsList = Collector._db.urls.find( { "status": Constance.TODO, "website_id": websiteId, "depth": { "$lte": Collector._depth } }, { "url": 1, "_id": 0, "depth": 1, "description": 1, "website_id": 1 }).sort([("depth", 1)]).limit(urlCount) endTime = time.time() urlsList = list(urlsList) log.debug('get url time ' + str(endTime - beginTime) + " size " + str(len(urlsList))) # 存url self.putUrls(urlsList) #更新已取到的url状态为doing beginTime = time.time() for url in urlsList: Collector._db.urls.update(url, {'$set': { 'status': Constance.DOING }}) endTime = time.time() log.debug('update url time ' + str(endTime - beginTime)) if self.isAllHaveDone(): self.stop() exportData.export()
def addXinHuaUrl(self): baseUrl = "http://www.xinhuanet.com/" websiteId = tools.getWebsiteId(Constance.XIN_HUA) self.addUrl(baseUrl, websiteId)