def acqCertainItemsSingle(self, itemMain, acqCount):
    """Fetch the detail page for one main-list item and enrich it.

    itemMain -- dict carrying at least 'url' and 'title'.
    acqCount -- unused here; kept for the uniform crawler call signature.

    Returns (True, obj) on success, or the 1-tuple (False,) when the
    page cannot be fetched, the title is empty, or an item with the
    same title is already persisted.
    """
    try:
        page = acqHtml(itemMain['url'])
    except Exception:
        # Fetch failed (network/parse); tell the caller to skip this item.
        return False,
    q = pq(page)
    objT = itemMain.copy()
    objT['viewCount'] = int(q.find('.checknum strong').text())
    objT['content'] = mySpiderTools.myDecodeHtml(q.find('.share_info').html())
    if objT['title'] == '':
        return False,
    # Deduplicate against records already stored under the same title.
    peeps = self.wrapper.goalClass.selectBy(title=objT['title'])
    if peeps.count() != 0:
        return False,
    self.certainItems.put(objT)
    return True, objT
def acqCertainItemsSingle(self, itemMain, acqCount):
    """Fetch an article page (itemMain is the URL itself), parse
    title/date/author/content, dedupe by title and queue the record.

    acqCount -- unused here; kept for the uniform crawler call signature.

    Returns (True, obj) on success, or the 1-tuple (False,) on fetch
    failure, empty title, or duplicate.
    """
    try:
        # Pages are served as GB2312; undecodable bytes are dropped.
        page = acqHtml(itemMain).decode('gb2312', 'ignore')
    except Exception:
        return False,
    q = pq(page)

    def getDate():
        # Sentinel date used when the page carries no parsable date.
        t = datetime.datetime(1999, 9, 9).date()
        try:
            # Skip a 3-character label prefix before the ISO-style date.
            t = datetime.datetime.strptime(
                q.find('.col-01:eq(0)').text().strip()[3:], '%Y-%m-%d').date()
        except Exception:
            pass
        return t

    # Drop the trailing boilerplate paragraph before extracting the body.
    q.find('.content:eq(1) p:last').remove()
    objT = {
        'url': itemMain,
        'title': q.find('.article h1').text().strip(),
        'date': getDate(),
        'author': q.find('.info .col-02').text().strip(),
        'content': mySpiderTools.myDecodeHtml(q.find('.content:eq(1)').text().strip()),
    }
    if objT['title'] == '':
        return False,
    # Deduplicate against records already stored under the same title.
    peeps = self.wrapper.goalClass.selectBy(title=objT['title'])
    if peeps.count() != 0:
        return False,
    self.certainItems.put(objT)
    return True, objT
def acqCertainItemsSingle(self, itemMain, acqCount):
    """Fetch the detail page for itemMain (dict with 'url') and attach
    its decoded body text as 'content'.

    acqCount -- unused here; kept for the uniform crawler call signature.

    Returns (True, obj) on success, or the 1-tuple (False,) when the
    page cannot be fetched.
    """
    try:
        page = acqHtml(itemMain['url']).decode('gb2312', 'ignore')
    except Exception:
        return False,
    q = pq(page)
    objT = itemMain.copy()
    # NOTE(review): .html() returns None when the selector matches nothing,
    # which would make .strip() raise here — assumes every page has
    # .stream_left_content; confirm against the target site.
    objT['content'] = mySpiderTools.myDecodeHtml(q.find('.stream_left_content').html().strip())
    return True, objT
def acqCertainItemsSingle(self, itemMain, acqCount):
    """Fetch a complaint-form page (itemMain is the URL), parse the
    table (name/email/title/type/content/reply), dedupe by name and
    queue the record.

    acqCount -- unused here; kept for the uniform crawler call signature.

    Returns (True, obj) on success, or the 1-tuple (False,) on fetch
    failure, empty name, or duplicate.
    """
    try:
        page = acqHtml(itemMain).decode('gb2312', 'ignore')
    except Exception:
        return False,
    q = pq(page)

    def getDate():
        # Sentinel datetime when the reply timestamp is missing/unparsable.
        t = datetime.datetime(1999, 9, 9)
        try:
            t = datetime.datetime.strptime(
                q.find('table.tsjb_nr tr:eq(3) td:eq(0)').text().strip(),
                '%Y-%m-%d %H:%M:%S')
        except Exception:
            pass
        return t

    objT = {
        'name': q.find('table.tsjb_nr tr:eq(0) td:eq(0)').text().strip(),
        'email': q.find('table.tsjb_nr tr:eq(0) td:eq(1)').text(),
        'title': q.find('table.tsjb_nr tr:eq(1) td:eq(0)').text(),
        'type': q.find('table.tsjb_nr tr:eq(1) td:eq(1)').text(),
        'content': q.find('table.tsjb_nr tr:eq(2) td:eq(0)').text(),
        'replyTime': getDate(),
        'replyContent': mySpiderTools.myDecodeHtml(
            q.find('table.tsjb_nr tr:eq(4) td:eq(0)').text().strip()),
    }
    if objT['name'] == '':
        return False,
    # Deduplicate against records already stored under the same name.
    peeps = self.wrapper.goalClass.selectBy(name=objT['name'])
    if peeps.count() != 0:
        return False,
    self.certainItems.put(objT)
    return True, objT
def dealCertainItem(): while True: # if mainItems.empty(): # time.sleep(5) # if mainItems.empty(): # return urlT = mainItems.get() page = tz.decodeForThisSys(acqHtml(urlT)) q = pq(page) def acqScore(): id = q.find('#softid').val() if id == None: return -1.0, -1 try: objT = json.loads( acqHtml('http://dy.www.yxdown.com/open/op.ashx?action=/soft/votes/data.json&sid=%s' % id)) except: print 'http://dy.www.yxdown.com/open/op.ashx?action=/soft/votes/data.json&sid=%s' % id return -1.0, -1 r = objT['Score'] commentCount = objT['Normal'] + objT['DOWN'] + objT['UP'] return r, commentCount def acqImg(): r = '' url = q.find('div.dl>dl>dd>img').attr('src') if not tz.emptyOrNoneAll(url): r = acqHtml(url) return r def acqDate(str): r = datetime.date(1949, 10, 1) if not tz.emptyOrNoneAll(str): try: r = datetime.datetime.strptime(str, '%Y/%m/%d').date() except: return r return r def acqCommentAll(): id = q.find('#softid').val() if id == None: return '' try: strT = acqHtml( 'http://pl.yxdown.com/ping.ashx/hot.js?key=soft&vote=6&sid=%s&count=10&callback=window.Pinglun.GetHotCommentsCallback()&encoding=gb2312' % id) strT = strT[strT.index('= {') + 2:strT.index(';window.Pinglun.GetHot')] objT = json.loads(strT) strRs = [] for item in objT['comments']: strRs.append('%s,%s:%s @%s\n'%(item['city'],item['ip'],item['content'],item['datetime'])) return ''.join(strRs) except: print 'http://pl.yxdown.com/ping.ashx/hot.js?key=soft&vote=6&sid=%s&count=10&callback=window.Pinglun.GetHotCommentsCallback($data)&encoding=gb2312' % id print strT return '' if q.find('h1[itemprop=name]').text().strip() == '': continue rt = acqScore() gameObj = { 'name': q.find('h1[itemprop=name]').text().strip(), 'softwareVersion': q.find('span[itemprop=softwareVersion]').text().strip(), 'ename': q.find('span.ename').text().strip(), 'img': acqImg(), 'gameType': q.find('div.dl>dl>dt>span:eq(0)>b:eq(0)>a').text(), 'inLanguage': q.find('div.dl>dl>dt>span:eq(0)>b:eq(1)>em').text(), 'fileSize': 
q.find('div.dl>dl>dt>span:eq(1)>b:eq(0)>em').text(), 'fileComany': q.find('div.dl>dl>dt>span:eq(1)>b:eq(1)>em').text(), 'startingTime': acqDate(q.find('div.dl>dl>dt>span:eq(2)>b:eq(0)>em').text()), 'dateModified': acqDate(q.find('div.dl>dl>dt>span:eq(2)>b:eq(1)>em').text()), 'operatingSystem': q.find('div.dl>dl>dt>span:eq(3) a').text(), 'tag': q.find('div.dl>dl>dt>span:eq(4)>em').text(), 'zt_text': q.find('div.dl>dl>dt>span:eq(5)>em').text(), 'pf_score': rt[0], 'commentCount': rt[1], 'pf_score_des': q.find('div.pinja_box').text().strip(), 'game_des': mySpiderTools.myDecodeHtml(q.find('li.yx1>span').text().strip()), 'gameUrl': urlT, 'commentAll':acqCommentAll(), } if len(gameObj['name']) != 0: gameItems.put(gameObj) print 'now index is %d,put %s' % (mySpiderCfgMain.countT, gameObj['name']) # lockT = thread.allocate_lock() # lockT.acquire() mySpiderCfgMain.countT += 1