def acqMainItems():
    """Collect game-page links from the listing pages and queue the new ones.

    Repeatedly takes the next listing URL from ``mySpiderCfgMain.iter``,
    downloads it with ``acqHtml`` and reads the ``href`` of every
    ``.nr3 dd a`` anchor on the page.  A link is put on ``mainItems`` only
    when ``wrapper.goalClass.selectBy(gameUrl=...)`` reports zero rows.

    The loop ends (after printing ``done``) as soon as taking or downloading
    the next listing URL raises, which includes the iterator's
    ``StopIteration``.
    """
    while True:
        try:
            page = tz.decodeForThisSys(acqHtml(mySpiderCfgMain.iter.next()))
        except Exception:
            # Deliberately not a bare ``except:``: KeyboardInterrupt and
            # SystemExit must propagate instead of being reported as a
            # normal end of the crawl.  Stopping on any other failure is
            # kept as-is, since a failed fetch may be the only end signal.
            print('done')
            return

        # NOTE(review): ``page`` was already passed through
        # decodeForThisSys above and is decoded a second time here; kept
        # unchanged -- confirm the helper is safe to apply twice.
        anchors = pq(tz.decodeForThisSys(page)).find('.nr3 dd a')
        for anchor in anchors:
            href = pq(anchor).attr('href')
            # Only queue links that have no stored row yet.
            if wrapper.goalClass.selectBy(gameUrl=href).count() == 0:
                mainItems.put(href)
def dealCertainItem():
    """Worker loop: turn queued game-page URLs into game records.

    Takes a URL from ``mainItems``, downloads and parses the page, builds a
    dict describing the game and puts it on ``gameItems``.  Pages whose
    ``h1[itemprop=name]`` text is empty are skipped.  After each stored
    record a progress line is printed and ``mySpiderCfgMain.countT`` is
    incremented.

    The function never returns on its own: ``mainItems.get()`` is called
    without a timeout.  (A disabled sketch in an earlier revision polled
    ``mainItems.empty()`` with a 5 second sleep and returned when the queue
    stayed empty.)
    """
    while True:
        urlT = mainItems.get()
        q = pq(tz.decodeForThisSys(acqHtml(urlT)))

        name = q.find('h1[itemprop=name]').text().strip()
        if not name:
            continue

        # Network calls happen in this order: score, image, comments.
        score, commentCount = _acqScore(q)
        gameObj = {
            'name': name,
            'softwareVersion': q.find('span[itemprop=softwareVersion]').text().strip(),
            'ename': q.find('span.ename').text().strip(),
            'img': _acqImg(q),
            'gameType': q.find('div.dl>dl>dt>span:eq(0)>b:eq(0)>a').text(),
            'inLanguage': q.find('div.dl>dl>dt>span:eq(0)>b:eq(1)>em').text(),
            'fileSize': q.find('div.dl>dl>dt>span:eq(1)>b:eq(0)>em').text(),
            # Key spelling kept exactly; presumably read elsewhere under
            # this name -- verify before renaming.
            'fileComany': q.find('div.dl>dl>dt>span:eq(1)>b:eq(1)>em').text(),
            'startingTime': _acqDate(q.find('div.dl>dl>dt>span:eq(2)>b:eq(0)>em').text()),
            'dateModified': _acqDate(q.find('div.dl>dl>dt>span:eq(2)>b:eq(1)>em').text()),
            'operatingSystem': q.find('div.dl>dl>dt>span:eq(3) a').text(),
            'tag': q.find('div.dl>dl>dt>span:eq(4)>em').text(),
            'zt_text': q.find('div.dl>dl>dt>span:eq(5)>em').text(),
            'pf_score': score,
            'commentCount': commentCount,
            'pf_score_des': q.find('div.pinja_box').text().strip(),
            'game_des': mySpiderTools.myDecodeHtml(q.find('li.yx1>span').text().strip()),
            'gameUrl': urlT,
            'commentAll': _acqCommentAll(q),
        }

        # ``name`` is known to be non-empty here (see the guard above), so
        # the record is always stored.
        gameItems.put(gameObj)
        print('now index is %d,put %s' % (mySpiderCfgMain.countT, name))
        # NOTE: this increment is not synchronised; a lock around it was
        # sketched in an earlier revision but left disabled.
        mySpiderCfgMain.countT += 1


def _acqScore(q):
    """Return ``(score, comment_count)`` for the parsed page *q*.

    The id is read from the ``#softid`` element and the vote data is
    requested as JSON.  ``(-1.0, -1)`` is returned when the page has no id
    or when the request/JSON parsing fails (the URL is printed in that
    case).  The comment count is the sum of the ``Normal``, ``DOWN`` and
    ``UP`` fields.
    """
    softId = q.find('#softid').val()
    if softId is None:
        return -1.0, -1

    url = 'http://dy.www.yxdown.com/open/op.ashx?action=/soft/votes/data.json&sid=%s' % softId
    try:
        votes = json.loads(acqHtml(url))
    except Exception:
        print(url)
        return -1.0, -1
    return votes['Score'], votes['Normal'] + votes['DOWN'] + votes['UP']


def _acqImg(q):
    """Return what ``acqHtml`` yields for the page's cover image, or ``''``.

    The image address is the ``src`` of ``div.dl>dl>dd>img``; nothing is
    downloaded when ``tz.emptyOrNoneAll`` rejects it.
    """
    src = q.find('div.dl>dl>dd>img').attr('src')
    if tz.emptyOrNoneAll(src):
        return ''
    return acqHtml(src)


def _acqDate(text):
    """Parse a ``YYYY/MM/DD`` string into a ``datetime.date``.

    Returns ``datetime.date(1949, 10, 1)`` as the placeholder when *text* is
    rejected by ``tz.emptyOrNoneAll`` or does not match the format.
    """
    placeholder = datetime.date(1949, 10, 1)
    if tz.emptyOrNoneAll(text):
        return placeholder
    try:
        return datetime.datetime.strptime(text, '%Y/%m/%d').date()
    except (ValueError, TypeError):
        return placeholder


def _acqCommentAll(q):
    """Return the hot comments of the parsed page *q* as one string.

    Each comment becomes a ``city,ip:content @datetime`` line.  ``''`` is
    returned when the page has no ``#softid`` value or when fetching or
    parsing the comment feed fails; on failure the requested URL and
    whatever response text was obtained so far are printed.
    """
    softId = q.find('#softid').val()
    if softId is None:
        return ''

    url = ('http://pl.yxdown.com/ping.ashx/hot.js?key=soft&vote=6&sid=%s&count=10'
           '&callback=window.Pinglun.GetHotCommentsCallback()&encoding=gb2312' % softId)
    # Bound before the ``try`` so the failure report below cannot itself
    # raise when the download is what failed.
    body = ''
    try:
        body = acqHtml(url)
        # The JSON object sits between '= ' and the trailing callback call.
        body = body[body.index('= {') + 2:body.index(';window.Pinglun.GetHot')]
        comments = json.loads(body)['comments']
        return ''.join(
            '%s,%s:%s @%s\n' % (c['city'], c['ip'], c['content'], c['datetime'])
            for c in comments
        )
    except Exception:
        print(url)
        print(body)
        return ''