def parse_artist(d):
    """Extract artist metadata from a parsed AllMusic artist page.

    :param d: object exposing a ``pq`` attribute (a PyQuery document of
        the artist page).
    :return: dict with picture, review, genres, styles, active, formed,
        moods, themes, discography and photo_gallery entries.
    """
    pq = d.pq

    def _name_url_list(selector):
        # genres/styles/moods/themes all repeat the same "<a>name</a>"
        # markup; collect them uniformly instead of duplicating the
        # comprehension four times.
        return [
            {
                'name': _(e).text(),
                'url': _(e).attr('href'),
            } for e in pq(selector)
        ]

    data = dict()
    data['picture'] = pq('#sidebar .artist-image img').attr('src')
    data['review'] = pq('.review-body .editorial-text').html()
    data['genres'] = _name_url_list('.details .genres a')
    data['styles'] = _name_url_list('.details .styles a')
    data['active'] = pq('#sidebar dd.active').text()
    data['formed'] = pq('#sidebar dd.birth').text()
    data['moods'] = _name_url_list('.sidebar-module.moods a')
    data['themes'] = _name_url_list('.sidebar-module.themes a')
    data['discography'] = [
        {
            'year': e('.year').text(),
            'thumbnail': e('.thumbnail-img img').attr('src'),
            'title': e('.title a:first-child').text(),
            'url': e('.title a:first-child').attr('href'),
            'label': e('td.label .full-title').text(),
            # data-stars may be absent/non-numeric; float_or handles that.
            'rating': float_or(
                e('td.ed-rating .allmusic.rating').attr('data-stars')),
        } for e in pq_iter(pq('#discography .album-table tbody tr'))
    ]
    # Each gallery thumbnail carries a JSON payload in data-large.
    data['photo_gallery'] = [
        json.loads(e.attr('data-large'))
        for e in pq_iter(
            pq('#sidebar .media-gallery div.media-gallery-image.thumbnail'))
    ]
    return data
def get_url_set(city): cates = ('banjia', 'baomu', 'baojie', 'weixiu', 'jiadianweixiu', 'shumashoujiweixiu', 'kongtiaoyiji', 'jiazheng', 'zhongdiangong', 'yuesao', 'guandao', 'bianminfuwu') for cate in cates: print 'Get list of %s' % cate n = 0 while 1: url = r"http://%s.ganji.com/%s/f%d" % (city, cate, n) content = request(url) if content: doc = _(content) nodes = doc('.list .ft-14') if not nodes: break for node in nodes: url_ = _(node).attr('href') text = _(node).text() if not Entry.query.filter_by(title=text).first(): print url_ yield url_ n += 32
def parse_search(text):
    """Parse an AllMusic search-results page.

    :param text: raw HTML of the search page.
    :return: dict with a single ``'results'`` key: one entry per result
        row carrying thumbnail, title, artist name/url and url.
    """
    doc = _(text)
    results = []
    for row in pq_iter(doc('table.search-results tr')):
        title_link = row('div.title a')
        results.append({
            'thumbnail': row('div.image .thumbnail img').attr('src'),
            'title': title_link.text(),
            'artist': {
                'name': row('div.artist').text(),
                'url': row('div.artist a').attr('href'),
            },
            'url': title_link.attr('href'),
        })
    return {'results': results}
__author__ = 'tianchi.ltc' import urllib2 from pyquery import PyQuery as _ Q_HOST = 'baidu.com' QUERY = 'ip=' + Q_HOST HOST_N_METHOD = 'http://ip.cn/index.php' req = urllib2.Request(HOST_N_METHOD) res = urllib2.urlopen(req, QUERY) str = res.read() # print str # doc=pq(url=HOST_N_METHOD) # encoding issue doc = _(str) # encoding issue # print doc print doc('.well')
def pq_iter(pq):
    """Yield each element of *pq* wrapped in its own PyQuery object."""
    for element in pq:
        yield _(element)
def parse_album(d):
    """Extract album metadata from a parsed AllMusic album page.

    :param d: object exposing a ``pq`` attribute (a PyQuery document of
        the album page).
    :return: dict with artist, title, review, rating, release_date,
        duration, album_art, similar_albums, genres, styles, moods,
        themes and a nested ``medias`` -> ``tracks`` structure.
    """
    data = dict()
    pq = d.pq
    data['artist'] = {
        'name': pq('.album-artist a').text(),
        'url': pq('.album-artist a').attr('href')
    }
    data['title'] = pq('.album-title').text()
    data['review'] = pq('.review-body .editorial-text').html()
    # data-stars may be absent/non-numeric; float_or handles that.
    data['rating'] = float_or(pq('.allmusic.rating').attr('data-stars'))
    data['release_date'] = pq('.details .release-date').text()
    data['duration'] = pq('.details .duration').text()
    # The album-art container carries a JSON payload in data-large.
    data['album_art'] = json.loads(
        pq('div.album-art .image-container').attr('data-large'))
    data['similar_albums'] = parse_album_similar_albums(pq)
    data['genres'] = [
        {
            'name': _(e).text(),
            'url': _(e).attr('href')
        } for e in pq('.details .genres a')
    ]
    data['styles'] = [
        {
            'name': _(e).text(),
            'url': _(e).attr('href')
        } for e in pq('.details .styles a')
    ]
    data['moods'] = [
        {
            'name': _(e).text(),
            'url': _(e).attr('href')
        } for e in pq('.sidebar-module.moods a')
    ]
    data['themes'] = [
        {
            'name': _(e).text(),
            'url': _(e).attr('href')
        } for e in pq('.sidebar-module.themes a')
    ]
    data['medias'] = list()
    # Each <h2> under #tracks names one disc; the sibling element that
    # follows it (media_title.next()) holds that disc's track table.
    for media_title in pq_iter(pq('#tracks h2')):
        media = dict()
        media['name'] = media_title('.disc-num').text()
        data['medias'].append(media)
        media['tracks'] = list()
        for track_row in pq_iter(media_title.next()('tbody tr')):
            track = dict()
            track['position'] = track_row('td.tracknum').text()
            track['title'] = track_row('td.title div.title a').text()
            track['url'] = track_row('td.title div.title a').attr('href')
            # Appended before the remaining fields are set; the dict is
            # then mutated in place, so the list entry stays current.
            media['tracks'].append(track)
            track['composers'] = list()
            track['duration'] = track_row('td.time').text()
            for composer in pq_iter(track_row('td.title div.artist a')):
                track['composers'].append({
                    'name': composer.text(),
                    'url': composer.attr('href'),
                })
            track['performers'] = list()
            for performer in pq_iter(track_row('td.performer div.primary a')):
                track['performers'].append({
                    'name': performer.text(),
                    'url': performer.attr('href')
                })
    return data
def get_detail(url): data = {} content = request(url) if not content: return doc = _(content) # check check_list = doc('.bd-box .rz-icon span') is_ok = False for e in check_list: if _(e).text() in (u'手机已认证', u'个人实名已认证', u'企业已认证'): is_ok = True break; if not is_ok: print 'no auth one' return None # desc desc_list = doc('.pr-cont .nbd') data['desc'] = _(desc_list[0]).text() # brief brief_list = doc('.box-cont p') data['brief'] = _(brief_list[1]).text() # title brief_list = doc('.box-cont h1') data['title'] = _(brief_list[0]).text() # c1_list, c2_list = doc('.contList')[:2] # address data['address'] = _(c1_list[0]).find('.wt2').text() # serviceitems item_list = _(c1_list[1]).find('.wt2 a') items = [] for e in item_list: items.append(_(e).text()) data['serviceitems'] = items # worktime if len(c1_list) >= 3: data['worktime'] = _(c1_list[2]).find('.wt2').text() else: data['worktime'] = u" " # serviceareas if len(c1_list) >= 4: data['serviceareas'] = _(c1_list[3]).find('.wt2').text() else: data['serviceareas'] = set() # linkman if len(c2_list) >= 1: data['linkman'] = _(c2_list[0]).find('.wt2 strong').text() else: data['linkman'] = u" " # ontracts tel_list = doc('.tel-box span') tels = [] for e in tel_list: maybe_tel = _(e).text() if is_phone.match(maybe_tel): tels.append(maybe_tel) data['contracts'] = tels return data