def _parseHeadinfo(self, doc):
    """Build a FigureItem from a profile page plus the host-info page.

    The header HTML is taken from the ``FM.view(...)`` script payload in
    *doc* whose ``ns`` equals ``'pl.header.head.index'``.  The follow, fans
    and weibo values are pulled out of that HTML with ``self.followmask``,
    ``self.fansmask`` and ``self.weibomask``; name, verification text,
    intro and tags are read from it through PyQuery selectors.  The
    registration date comes from a second document fetched through
    ``self.remoteReader``.

    Args:
        doc: page text to scan for ``<script>FM.view(...)</script>`` blocks.

    Returns:
        The populated FigureItem when ``fg.isValid()`` is true; otherwise
        None, after printing a diagnostic that includes ``self.no``.

    Raises:
        Exception: if no payload with the header namespace is found
            (including the case where *doc* contains no payload at all).
        AttributeError: if one of the three count masks does not match the
            header HTML (``re.search`` returns None).
    """
    fg = FigureItem()

    # Raw string: the pattern itself is unchanged, the escapes are just no
    # longer dependent on Python passing unknown escapes through.
    payloads = re.findall(r'<script>FM\.view\((.*)\);?</script>', doc)
    if not payloads:
        print('_fetch_manload: raw doc parse error')

    # Every payload is decoded up front, so a malformed one still raises
    # before the search below starts.
    views = [json.loads(payload) for payload in payloads]
    for view in views:
        if 'ns' in view and view['ns'] == 'pl.header.head.index':
            strimdata = view['html']
            d = PyQuery(strimdata)
            break
    else:
        # Loop finished without finding the header payload.
        raise Exception('_parseHeadinfo error')

    info = self.remoteReader.getDoc(
        self.remoteReader.makeUrl_hostinfo(self.uid))
    # The label means "registration time".  The gap is matched lazily so the
    # date closest to the label is captured; a greedy gap would instead
    # return the last date-like string anywhere after the label.
    m = re.search(r'注册时间[.\s\S]+?(\d{4})-(\d{2})-(\d{2})', info)
    if m:
        # Dates look like 2012-07-06; stored as a local-time epoch value.
        established = time.mktime(
            time.strptime('%s-%s-%s' % m.groups(), '%Y-%m-%d'))
    else:
        established = 0

    fg.uid = self.uid
    fg.domainid = self.remoteReader.domain
    fg.establish = established
    fg.follow = re.search(self.followmask, strimdata).group(1)
    fg.fans = re.search(self.fansmask, strimdata).group(1)
    fg.weibo = re.search(self.weibomask, strimdata).group(1)

    # Prefer the span.name text; fall back to the strong.W_f20.W_Yahei text
    # when the former is empty.
    fg.name = (d('span').filter('.name').text()
               or d('strong').filter('.W_f20.W_Yahei').text())

    # Best effort: the verified-info node may be absent.  Catching Exception
    # (not a bare except) keeps the fallback without also swallowing
    # KeyboardInterrupt / SystemExit.
    try:
        fg.verify = d('.pf_verified_info').contents()[0]
    except Exception:
        fg.verify = ''

    fg.intro = d('.pf_intro').text()
    for anchor in d('.layer_menulist_tags').items('a'):
        fg.tags.append(anchor.text())

    if not fg.isValid():
        print(' - Thread {0} weibo figure info not enough'.format(self.no))
        return None
    return fg
def test_parseWeiboLst(self, uid):
    """Parse a saved page at ``../BigVs/<uid>`` into a FigureItem.

    NOTE(review): an identically named definition follows this one in the
    file; if both live in the same class, the later one replaces this one at
    class-creation time.  Consider deleting one of them.

    Args:
        uid: identifier used (via ``str(uid)``) as the file name under
            ``../BigVs/``.

    Returns:
        A FigureItem with follow, fans, weibo, name, verify, intro and tags
        filled from the file, or None (after printing ``'file not exists'``)
        when the file is absent.
    """
    fd = '../BigVs/' + str(uid)
    if not os.path.exists(fd):
        print('file not exists')
        return None

    # Reuse the path computed above instead of rebuilding it.
    with open(fd, 'r') as f:
        rawdoc = f.read()
    d = PyQuery(rawdoc.decode('utf-8'))

    def strong_text(node_type):
        # Text of the <strong> element(s) whose node-type attribute matches.
        return d('strong').filter(
            lambda i, this: PyQuery(this).attr('node-type') == node_type
        ).text()

    fg = FigureItem()
    fg.follow = strong_text('follow')
    fg.fans = strong_text('fans')
    fg.weibo = strong_text('weibo')
    fg.name = d('span').filter('.name').text()

    # The verified-info node may be absent; fall back to '' as
    # _parseHeadinfo does instead of raising IndexError.
    try:
        fg.verify = d('.pf_verified_info').contents()[0]
    except Exception:
        fg.verify = ''

    fg.intro = d('.pf_intro').text()
    for anchor in d('.layer_menulist_tags').items('a'):
        fg.tags.append(anchor.text())
    return fg
def test_parseWeiboLst(self, uid):
    """Load the page stored at ``../BigVs/<uid>`` and extract a FigureItem.

    NOTE(review): an identically named definition precedes this one in the
    file; if both live in the same class, this later one is the one that
    takes effect.  Consider deleting the duplicate.

    Args:
        uid: identifier used (via ``str(uid)``) as the file name under
            ``../BigVs/``.

    Returns:
        A FigureItem with follow, fans, weibo, name, verify, intro and tags
        filled from the file, or None (after printing ``'file not exists'``)
        when the file is absent.
    """
    fd = '../BigVs/' + str(uid)
    if not os.path.exists(fd):
        print('file not exists')
        return None

    # The already-built path is reused rather than concatenated a second time.
    with open(fd, 'r') as f:
        rawdoc = f.read()
    d = PyQuery(rawdoc.decode('utf-8'))

    def by_node_type(wanted):
        # Predicate for PyQuery.filter: keep elements whose node-type
        # attribute equals *wanted*.
        return lambda i, this: PyQuery(this).attr('node-type') == wanted

    fg = FigureItem()
    fg.follow = d('strong').filter(by_node_type('follow')).text()
    fg.fans = d('strong').filter(by_node_type('fans')).text()
    fg.weibo = d('strong').filter(by_node_type('weibo')).text()
    fg.name = d('span').filter('.name').text()

    # Pages without a verified-info node would make [0] raise IndexError;
    # use the same '' fallback that _parseHeadinfo applies.
    try:
        fg.verify = d('.pf_verified_info').contents()[0]
    except Exception:
        fg.verify = ''

    fg.intro = d('.pf_intro').text()
    for anchor in d('.layer_menulist_tags').items('a'):
        fg.tags.append(anchor.text())
    return fg