コード例 #1
0
ファイル: collector.py プロジェクト: bighomework/BigV
    def test_parseWeiboLst(self, uid):
        """Parse a locally cached profile page into a ``FigureItem``.

        Reads the file ``../BigVs/<uid>``, decodes it as UTF-8 and fills a
        new ``FigureItem`` with the follow/fans/weibo counters, the name, the
        verification text, the intro and the tag list, all extracted with
        PyQuery selectors.

        Args:
            uid: Value used (via ``str``) as the cached file name.

        Returns:
            The populated ``FigureItem``, or ``None`` after printing
            ``'file not exists'`` when the cached file is missing.
        """
        path = '../BigVs/' + str(uid)
        if not os.path.exists(path):
            print('file not exists')
            return None

        # Read bytes so the explicit UTF-8 decode below is the only decoding
        # step, regardless of interpreter version or platform.
        with open(path, 'rb') as f:
            rawdoc = f.read()

        d = PyQuery(rawdoc.decode('utf-8'))
        fg = FigureItem()

        def counter_text(node_type):
            # Text of the <strong> element(s) whose node-type attribute
            # equals node_type.
            return d('strong').filter(
                lambda i, this: PyQuery(this).attr('node-type') == node_type
            ).text()

        fg.follow = counter_text('follow')
        fg.fans = counter_text('fans')
        fg.weibo = counter_text('weibo')

        fg.name = d('span').filter('.name').text()
        # Pages without a .pf_verified_info node yield an empty selection;
        # fall back to '' as _parseHeadinfo does instead of raising.
        try:
            fg.verify = d('.pf_verified_info').contents()[0]
        except IndexError:
            fg.verify = ''
        fg.intro = d('.pf_intro').text()

        for anchor in d('.layer_menulist_tags').items('a'):
            fg.tags.append(anchor.text())

        return fg
コード例 #2
0
 def _parseHeadinfo(self, doc):
     """Build a ``FigureItem`` from the raw HTML of a weibo host page.

     The argument of every ``<script>FM.view(...)</script>`` call in *doc*
     is parsed as JSON; the ``html`` value of the first payload whose
     ``ns`` is ``'pl.header.head.index'`` is then mined for counters
     (``self.followmask`` / ``self.fansmask`` / ``self.weibomask``), name,
     verification text, intro and tags. The registration date is looked up
     in a second document fetched through ``self.remoteReader``.

     Args:
         doc: Raw page source as a string.

     Returns:
         The populated ``FigureItem``, or ``None`` (after printing a
         notice) when ``fg.isValid()`` is false.

     Raises:
         Exception: ``'_parseHeadinfo error'`` when no payload with the
             expected ``ns`` is present.
         AttributeError: when one of the counter masks does not match the
             header HTML (``re.search`` returns ``None``).
     """
     fg = FigureItem()
     strimdata = ''

     scripts = re.findall(r'<script>FM\.view\((.*)\);?</script>', doc)
     if not scripts:
         print('_fetch_manload: raw doc parse error')
     jdiclst = [json.loads(script) for script in scripts]

     for jdic in jdiclst:
         if 'ns' in jdic and jdic['ns'] == 'pl.header.head.index':
             strimdata = jdic['html']
             d = PyQuery(strimdata)
             break
     else:
         # Loop finished without a break: the header payload is missing.
         raise Exception('_parseHeadinfo error')

     info = self.remoteReader.getDoc(self.remoteReader.makeUrl_hostinfo(self.uid))
     m = re.search(r'注册时间[.\s\S]+(\d{4})-(\d{2})-(\d{2})', info)
     if m:
         # Year, month and day groups re-joined as YYYY-MM-DD, then
         # converted to a local-time epoch timestamp.
         t = time.mktime(time.strptime('-'.join(m.groups()), '%Y-%m-%d'))
     else:
         t = 0  # no registration date found (original note: 2012-07-06)

     fg.uid = self.uid
     fg.domainid = self.remoteReader.domain
     fg.establish = t
     fg.follow = re.search(self.followmask, strimdata).group(1)
     fg.fans = re.search(self.fansmask, strimdata).group(1)
     fg.weibo = re.search(self.weibomask, strimdata).group(1)

     # Prefer the span.name text; fall back to the strong.W_f20.W_Yahei text.
     text1 = d('span').filter('.name').text()
     text2 = d('strong').filter('.W_f20.W_Yahei').text()
     fg.name = text1 or text2

     # An empty selection has no first child; treat that as "not verified".
     try:
         fg.verify = d('.pf_verified_info').contents()[0]
     except IndexError:
         fg.verify = ''

     fg.intro = d('.pf_intro').text()

     for anchor in d('.layer_menulist_tags').items('a'):
         fg.tags.append(anchor.text())

     if not fg.isValid():
         print('    - Thread {0} weibo figure info not enough'.format(self.no))
         return None

     return fg
コード例 #3
0
 def run(self):
     """Process queued uids until the queue reports empty.

     For each queue entry, the first element is converted to ``int`` and
     used to build the host-weibo URL; the fetched document is parsed with
     ``self._parseHeadinfo`` and the resulting item is handed to
     ``self.localReader.record``. ``self.remoteReader.finishFetching()``
     is called once per entry, whether or not an item was produced.
     """
     while not self.q.empty():
         uid = int(self.q.get()[0])
         doc = self.remoteReader.getDoc(self.remoteReader.makeUrl_hostweibo(uid))
         # Keep the parsed item: previously the result was discarded and an
         # empty FigureItem created before the loop was recorded instead.
         fg = self._parseHeadinfo(doc)
         self.remoteReader.finishFetching()
         # The parser yields None when the figure is not valid; skip those.
         if fg is not None:
             self.localReader.record(fg)
コード例 #4
0
 def itemCast(self, row):
     """Convert a positional record into a ``FigureItem``.

     ``row[0]`` through ``row[6]`` are copied, in order, onto the
     attributes ``uid``, ``domainid``, ``name``, ``follow``, ``fans``,
     ``weibo`` and ``establish`` of a new ``FigureItem``.

     Args:
         row: Indexable record with at least seven positions.

     Returns:
         The newly created ``FigureItem``.
     """
     columns = ('uid', 'domainid', 'name', 'follow', 'fans', 'weibo', 'establish')
     item = FigureItem()
     for position, attribute in enumerate(columns):
         setattr(item, attribute, row[position])
     return item
コード例 #5
0
   def __init__(self, queue=None, no=0, skip=False):
       """Initialise the fetcher.

       Calls the parent initialiser and ``self.initRemask()``, creates a
       ``Page`` as ``remoteReader`` and an empty ``FigureItem`` as
       ``figure``, then stores the constructor arguments.

       Args:
           queue: Stored as ``self.q``.
           no: Stored as ``self.no``.
           skip: Stored as ``self.skip``.
       """
       super(WeiboFetcher, self).__init__()

       self.initRemask()
       self.remoteReader = Page()
       self.figure = FigureItem()

       # Plain state; targets are assigned left to right, and the repeat
       # counter starts at zero.
       self.q, self.no, self.repeat, self.skip = queue, no, 0, skip
コード例 #6
0
 def __init__(self, queue):
     """Initialise the fetcher.

     Calls the parent initialiser, creates a ``FigureDatabase`` as
     ``localReader`` and an empty ``FigureItem`` as ``figure``, and keeps
     *queue* as ``self.q``.

     Args:
         queue: Stored as ``self.q``.
     """
     super(FigureFetcher, self).__init__()

     database = FigureDatabase()
     self.localReader = database

     blank_figure = FigureItem()
     self.figure = blank_figure

     self.q = queue