def getVideoList():
    """Crawl the video channel feed and send each clip's play urls to MQ.

    Already-crawled items are tracked by their link in the local
    sohunews sqlite table, so reruns skip them.
    """
    pagenum = 1
    lasttime = CurrentTime
    print("Crawling Channels ... channelid is VideoChannel")
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    print("Crawling VideoChannel ....")
    domaindb = sqlite3.connect("news.db")
    cursor = domaindb.cursor()
    cursor.execute(
        "create table if not exists sohunews (id integer primary key,pid text)")
    # get requestdata, paging until we reach items older than ten days
    while (cmp(lasttime, TenDaysAgoTime) == 1):
        #print pagenum
        listurl = id2Url(36, pagenum)
        time.sleep(1)
        newsLists = json.loads(getList(listurl))
        if not newsLists.has_key("articles") or len(newsLists["articles"]) == 0:
            break
        for newsitem in newsLists["articles"]:
            if not newsitem.has_key("link") or not newsitem.has_key("time") \
                    or newsitem["link"] == "":
                continue
            # parameterized query avoids quoting problems in the link
            cursor.execute("select * from sohunews where pid=?",
                           (newsitem["link"],))
            if len(cursor.fetchall()) > 0:
                # Newsitem has been crawled before, skip it
                # (was `pass`, which did not actually skip the item)
                continue
            mid = getVideoMid(newsitem["link"])
            clipurl = Videodetail(mid)
            #print clipurl
            newsdetail = getList(clipurl)
            if newsdetail is None:
                continue
            rd.setClipUrl(clipurl)
            rd.setClipTitle(newsitem["title"])
            rd.setCategory('video')
            rd.setPublishTime(DateFormat(int(newsitem["time"]) / 1000))
            rd.setViewCount(int(newsitem["commentNum"]))
            if newsitem.has_key("media"):
                rd.setClipSource(newsitem["media"])
            newsdetail = json.loads(newsdetail)
            srcurl = []
            for item in newsdetail["message"]["playurl"]:
                playurl = newsdetail["message"]["playurl"][item]
                # keep only non-empty play urls
                if playurl != "" and playurl != 0 and playurl != []:
                    srcurl.append(playurl)
            rd.setSourceUrl(srcurl)
            if len(srcurl) == 0:
                continue
            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            sendToMQ(rd)
            print("successfully sent to MQ!")
            domaindb.execute("insert into sohunews(pid) values(?)",
                             (newsitem["link"],))
            domaindb.commit()
            lasttime = rd._publish_time
            #print type(rd._view_count)
        pagenum += 1
    domaindb.close()
def getAlbumList(channelid):
    """Crawl a photo-album channel feed and send each gallery's image urls to MQ."""
    pagenum = 1
    lasttime = CurrentTime
    print("Crawling Channels ... channelid is " + channelid)
    rd = RequestData()
    rd.setTrackSourceID(TRACESOURCEID)
    rd.setSouceType('app')
    rd.setMetaID('')
    domaindb = sqlite3.connect("news.db")
    cursor = domaindb.cursor()
    cursor.execute(
        "create table if not exists sohunews (id integer primary key,pid text)")
    # get requestdata, paging until we reach items older than ten days
    while (cmp(lasttime, TenDaysAgoTime) == 1):
        #print pagenum
        listurl = AlbumUrl(ChannelIds[channelid], pagenum)
        time.sleep(1)
        newsLists = json.loads(getList(listurl))
        if not newsLists.has_key("news") or len(newsLists["news"]) == 0:
            break
        for newsitem in newsLists["news"]:
            if not newsitem.has_key("gid"):
                continue
            cursor.execute("select * from sohunews where pid=?",
                           (str(newsitem["gid"]),))
            if len(cursor.fetchall()) > 0:
                # Newsitem has been crawled before, skip it
                continue
            clipurl = Albumdetail(str(newsitem["gid"]))
            newsdetail = getList(clipurl)
            if newsdetail is None:
                continue
            rd.setClipUrl(clipurl)
            rd.setClipTitle(newsitem["title"])
            rd.setPublishTime(DateFormat(int(newsitem["time"]) / 1000))
            rd.setViewCount(int(newsitem["commentNum"]))
            rd.setCategory('image')
            newsdetail = xmltodict.parse(newsdetail)
            srcurl = []
            new_srcurl = []
            if newsdetail["root"].has_key("gallery"):
                # "photo" is a list when the gallery has several images,
                # otherwise a single dict
                if type(newsdetail["root"]["gallery"]["photo"]) is types.ListType:
                    for img in newsdetail["root"]["gallery"]["photo"]:
                        srcurl.append(img["pic"])
                else:
                    srcurl.append(newsdetail["root"]["gallery"]["photo"]["pic"])
            if len(srcurl) == 0:
                continue
            # FIX https://seals.vobile.cn/trac/ProjectManagement/ticket/743
            # some "pic" fields contain several urls joined by commas;
            # keep only the first one
            for url in srcurl:
                if url.find(',http') > 0:
                    new_srcurl.append(url[:url.find(',http')])
                else:
                    new_srcurl.append(url)
            rd.setSourceUrl(new_srcurl)
            print(rd._clip_title + " is successfully crawled, sending to MQ...")
            sendToMQ(rd)
            print("successfully sent to MQ!")
            domaindb.execute("insert into sohunews(pid) values(?)",
                             (str(newsitem["gid"]),))
            domaindb.commit()
            lasttime = rd._publish_time
        pagenum += 1
    domaindb.close()
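
# --- Usage sketch (not part of the original source) ---
# A minimal, assumed driver showing how the two crawlers above might be
# invoked together. The iteration over ChannelIds keys and the __main__
# entry point are assumptions, not the author's actual scheduling code.
if __name__ == "__main__":
    getVideoList()                    # video channel (fixed channel id 36)
    for channelname in ChannelIds:    # each configured photo-album channel
        getAlbumList(channelname)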