コード例 #1
0
    def acqCertainItemsSingle(self, itemMain, acqCount):
        """Fetch the detail page of one listing item and queue it if it is new.

        Args:
            itemMain: Mapping describing the item. ``itemMain['url']`` is
                fetched and ``itemMain['title']`` drives the duplicate check.
                The mapping is copied, not mutated.
            acqCount: Not used by this method.

        Returns:
            ``(True, item)`` when the enriched copy was put on
            ``self.certainItems``. The one-element tuple ``(False,)`` when the
            fetch failed, the title is empty, or
            ``self.wrapper.goalClass.selectBy`` already reports an entry with
            the same title.
        """
        try:
            page = acqHtml(itemMain['url'])
        except Exception:
            # Best-effort fetch: any ordinary failure just skips the item.
            # ``Exception`` (instead of a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            return False,

        q = pq(page)

        objT = itemMain.copy()

        # NOTE: int() raises ValueError if the counter text is not a number.
        objT['viewCount'] = int(q.find('.checknum strong').text())
        objT['content'] = mySpiderTools.myDecodeHtml(q.find('.share_info').html())

        if objT['title'] == '':
            return False,

        # Skip items whose title is already known to the goal class.
        peeps = self.wrapper.goalClass.selectBy(title=objT['title'])
        if peeps.count() != 0:
            return False,

        self.certainItems.put(objT)

        return True, objT
コード例 #2
0
ファイル: mainAskTao.py プロジェクト: ZJZZuse/myMadeLib
    def acqCertainItemsSingle(self, itemMain, acqCount):
        """Fetch one article page, extract its fields and queue it if it is new.

        Args:
            itemMain: The value passed to ``acqHtml``; it is also stored as
                the ``'url'`` field of the result.
            acqCount: Not used by this method.

        Returns:
            ``(False,)`` when the fetch/decode failed or an entry with the
            same title already exists according to
            ``self.wrapper.goalClass.selectBy``. Otherwise ``(True, item)``.
            Note that an item with an empty title is still returned with
            ``True`` but is not put on ``self.certainItems``.
        """
        try:
            page = acqHtml(itemMain).decode('gb2312', 'ignore')
        except Exception:
            # Best-effort fetch: skip the item on any ordinary failure, but
            # let KeyboardInterrupt/SystemExit through (a bare ``except``
            # swallowed those too).
            return False,

        q = pq(page)

        def getDate():
            # The first three characters of the cell are dropped before
            # parsing; 1999-09-09 is the fallback when parsing fails.
            try:
                raw = q.find('.col-01:eq(0)').text().strip()[3:]
                return datetime.datetime.strptime(raw, '%Y-%m-%d').date()
            except Exception:
                return datetime.datetime(1999, 9, 9).date()

        # Drop the last paragraph of the second ``.content`` node before its
        # text is read below.
        q.find('.content:eq(1) p:last').remove()

        objT = {
            'url': itemMain,
            'title': q.find('.article h1').text().strip(),
            'date': getDate(),
            'author': q.find('.info .col-02').text().strip(),
            'content': mySpiderTools.myDecodeHtml(q.find('.content:eq(1)').text().strip()),
        }

        if objT['title'] != '':

            # Skip articles whose title is already known to the goal class.
            peeps = self.wrapper.goalClass.selectBy(title=objT['title'])

            if peeps.count() != 0:
                return False,

            self.certainItems.put(objT)

        return True, objT
コード例 #3
0
ファイル: mainSikeSpider.py プロジェクト: ZJZZuse/myMadeLib
    def acqCertainItemsSingle(self, itemMain, acqCount):
        """Fetch the detail page of one item and attach its content.

        Args:
            itemMain: Mapping describing the item; ``itemMain['url']`` is
                fetched. The mapping is copied, not mutated.
            acqCount: Not used by this method.

        Returns:
            ``(True, item)`` where ``item`` is a copy of ``itemMain`` with a
            ``'content'`` field added. ``(False,)`` when the fetch/decode
            failed or the page has no ``.stream_left_content`` node.
        """
        try:
            page = acqHtml(itemMain['url']).decode('gb2312', 'ignore')
        except Exception:
            # Best-effort fetch: skip on ordinary failures, but do not swallow
            # KeyboardInterrupt/SystemExit as a bare ``except`` would.
            return False,

        q = pq(page)

        html = q.find('.stream_left_content').html()
        if html is None:
            # pyquery's html() returns None when nothing matched; calling
            # .strip() on it used to raise AttributeError. Report the item as
            # skipped instead, as is done for a failed fetch.
            return False,

        objT = itemMain.copy()
        objT['content'] = mySpiderTools.myDecodeHtml(html.strip())

        return True, objT
コード例 #4
0
    def acqCertainItemsSingle(self, itemMain, acqCount):
        """Fetch one page, read the ``table.tsjb_nr`` cells and queue the record.

        Args:
            itemMain: The value passed to ``acqHtml``.
            acqCount: Not used by this method.

        Returns:
            ``(False,)`` when the fetch/decode failed or an entry with the
            same name already exists according to
            ``self.wrapper.goalClass.selectBy``. Otherwise ``(True, record)``.
            Note that a record with an empty name is still returned with
            ``True`` but is not put on ``self.certainItems``.
        """
        try:
            page = acqHtml(itemMain).decode('gb2312', 'ignore')
        except Exception:
            # Best-effort fetch: skip on ordinary failures, but let
            # KeyboardInterrupt/SystemExit propagate (a bare ``except`` did not).
            return False,

        q = pq(page)

        def cell(row, col):
            # Text of the cell at (row, col) of the ``tsjb_nr`` table.
            return q.find('table.tsjb_nr tr:eq(%d) td:eq(%d)' % (row, col)).text()

        def getDate():
            # 1999-09-09 00:00 is the fallback when the cell cannot be parsed.
            try:
                return datetime.datetime.strptime(cell(3, 0).strip(),
                                                  '%Y-%m-%d %H:%M:%S')
            except Exception:
                return datetime.datetime(1999, 9, 9)

        objT = {'name': cell(0, 0).strip(),
                'email': cell(0, 1),
                'title': cell(1, 0),
                'type': cell(1, 1),
                'content': cell(2, 0),
                'replyTime': getDate(),
                'replyContent': mySpiderTools.myDecodeHtml(cell(4, 0).strip())
                }

        if objT['name'] != '':

            # Skip records whose name is already known to the goal class.
            peeps = self.wrapper.goalClass.selectBy(name=objT['name'])

            if peeps.count() != 0:
                return False,

            self.certainItems.put(objT)

        return True, objT
コード例 #5
0
ファイル: mainFcSpider.py プロジェクト: ZJZZuse/myMadeLib
def dealCertainItem():
    """Worker loop: turn page URLs from ``mainItems`` into game records.

    Each iteration takes one URL from ``mainItems``, fetches and parses the
    page, and (when the page has a non-empty ``h1[itemprop=name]``) builds a
    dict of scraped fields that is put on ``gameItems``.
    ``mySpiderCfgMain.countT`` is incremented for every record queued.

    The loop has no exit condition of its own; ``mainItems.get()`` is called
    without arguments (presumably a blocking queue read -- confirm against
    the type of ``mainItems``).
    """
    while True:

        urlT = mainItems.get()

        page = tz.decodeForThisSys(acqHtml(urlT))

        q = pq(page)

        def acqScore():
            # Returns (score, comment count); (-1.0, -1) when the page has no
            # '#softid' value or the vote data cannot be fetched/parsed.
            soft_id = q.find('#softid').val()

            if soft_id is None:
                return -1.0, -1

            score_url = 'http://dy.www.yxdown.com/open/op.ashx?action=/soft/votes/data.json&sid=%s' % soft_id

            try:
                objT = json.loads(acqHtml(score_url))
            except Exception:
                print(score_url)
                return -1.0, -1

            commentCount = objT['Normal'] + objT['DOWN'] + objT['UP']

            return objT['Score'], commentCount

        def acqImg():
            # Returns whatever acqHtml yields for the image URL, or '' when
            # the page has no usable image 'src'.
            url = q.find('div.dl>dl>dd>img').attr('src')

            if tz.emptyOrNoneAll(url):
                return ''

            return acqHtml(url)

        def acqDate(text):
            # Parses 'YYYY/MM/DD'; 1949-10-01 is the fallback for empty or
            # unparsable input.
            default = datetime.date(1949, 10, 1)

            if tz.emptyOrNoneAll(text):
                return default

            try:
                return datetime.datetime.strptime(text, '%Y/%m/%d').date()
            except (ValueError, TypeError):
                return default

        def acqCommentAll():
            # Returns the hot comments as one newline-terminated line each,
            # or '' when there is no '#softid' or anything goes wrong.
            soft_id = q.find('#softid').val()

            if soft_id is None:
                return ''

            # Bound before the try so the handler below can always print it;
            # previously a failing acqHtml() left strT undefined and the
            # handler itself raised UnboundLocalError.
            strT = ''

            try:
                strT = acqHtml(
                    'http://pl.yxdown.com/ping.ashx/hot.js?key=soft&vote=6&sid=%s&count=10&callback=window.Pinglun.GetHotCommentsCallback()&encoding=gb2312' % soft_id)

                # Cut the JSON object out of the JavaScript wrapper.
                strT = strT[strT.index('= {') + 2:strT.index(';window.Pinglun.GetHot')]

                objT = json.loads(strT)

                return ''.join(
                    '%s,%s:%s @%s\n' % (item['city'], item['ip'], item['content'], item['datetime'])
                    for item in objT['comments'])

            except Exception:
                # NOTE(review): this logged URL differs from the requested one
                # ('($data)' vs '()'); kept as-is -- confirm which is intended.
                print('http://pl.yxdown.com/ping.ashx/hot.js?key=soft&vote=6&sid=%s&count=10&callback=window.Pinglun.GetHotCommentsCallback($data)&encoding=gb2312' % soft_id)
                print(strT)
                return ''

        name = q.find('h1[itemprop=name]').text().strip()

        # Pages without a game name are skipped before any further request.
        if name == '':
            continue

        rt = acqScore()

        gameObj = {
            'name': name,
            'softwareVersion': q.find('span[itemprop=softwareVersion]').text().strip(),
            'ename': q.find('span.ename').text().strip(),
            'img': acqImg(),
            'gameType': q.find('div.dl>dl>dt>span:eq(0)>b:eq(0)>a').text(),
            'inLanguage': q.find('div.dl>dl>dt>span:eq(0)>b:eq(1)>em').text(),
            'fileSize': q.find('div.dl>dl>dt>span:eq(1)>b:eq(0)>em').text(),
            'fileComany': q.find('div.dl>dl>dt>span:eq(1)>b:eq(1)>em').text(),
            'startingTime': acqDate(q.find('div.dl>dl>dt>span:eq(2)>b:eq(0)>em').text()),
            'dateModified': acqDate(q.find('div.dl>dl>dt>span:eq(2)>b:eq(1)>em').text()),
            'operatingSystem': q.find('div.dl>dl>dt>span:eq(3) a').text(),
            'tag': q.find('div.dl>dl>dt>span:eq(4)>em').text(),
            'zt_text': q.find('div.dl>dl>dt>span:eq(5)>em').text(),
            'pf_score': rt[0],
            'commentCount': rt[1],
            'pf_score_des': q.find('div.pinja_box').text().strip(),
            'game_des': mySpiderTools.myDecodeHtml(q.find('li.yx1>span').text().strip()),
            'gameUrl': urlT,
            'commentAll': acqCommentAll(),
        }

        if gameObj['name']:
            gameItems.put(gameObj)
            print('now index is %d,put %s' % (mySpiderCfgMain.countT, gameObj['name']))

            # NOTE: the counter is updated without a lock (an earlier
            # thread-lock around this line was commented out).
            mySpiderCfgMain.countT += 1