def getLocation(userName):
    """Fetch a FlightAware ADS-B stats page and extract the site coordinates.

    Args:
        userName: user name appended to the
            ``https://flightaware.com/adsb/stats/user/`` URL.

    Returns:
        A ``(latitude, longitude)`` tuple holding the matched text of each
        coordinate (not converted to numbers).

    Raises:
        AttributeError: if the page contains no
            ``"latitude":<num>,"longitude":<num>,`` pair, because
            ``re.search`` then returns ``None``.
    """
    req = urllib2.Request('https://flightaware.com/adsb/stats/user/' + userName)
    req.add_header('user-agent', cons.headers())
    html = urllib2.urlopen(req).read()
    # The decimal point is escaped and optional: an unescaped "." would accept
    # any character between the digit runs, while integer-valued coordinates
    # must still match.
    loc = re.search(
        r'"latitude":(-?[0-9]+(?:\.[0-9]*)?),'
        r'"longitude":(-?[0-9]+(?:\.[0-9]*)?),',
        html)
    return (loc.group(1), loc.group(2))
def read_url(url, encoding="gb18030"):
    """Download a gzip-compressed page and parse it with BeautifulSoup.

    The first attempt uses a 5 second timeout. If anything goes wrong, the
    error and the caller's function name are printed, the function sleeps
    5 seconds and tries once more; a failure of the second attempt
    propagates to the caller.

    Args:
        url: address of the page to fetch.
        encoding: text encoding used to decode the decompressed body.
            The default "gb18030" is the one to use for pages declared as
            gb2312.

    Returns:
        A ``BeautifulSoup`` document built with the "lxml" parser.
    """

    def _fetch(**urlopen_kwargs):
        # Single request/decompress/decode path shared by both attempts so the
        # retry honours the caller's ``encoding`` as well.
        req = urllib.request.Request(url)
        req.add_header('user-agent', headers())
        with urllib.request.urlopen(req, **urlopen_kwargs) as response:
            raw = response.read()
        return gzip.decompress(raw).decode(encoding)

    try:
        content = _fetch(timeout=5)
    except Exception as e:
        # Best-effort retry: report which caller hit the problem, back off,
        # then try once more without the explicit timeout.
        print(e)
        print(inspect.stack()[1][3] + ' occused error')
        sleep(5)
        content = _fetch()
    return BeautifulSoup(content, "lxml")
def getInfo(userId):
    """Fetch the ``aiShow.htm`` page for a user from mm.taobao.com.

    Args:
        userId: value substituted into the ``userId`` query parameter.

    Returns:
        The page body, decoded from GBK and re-encoded as UTF-8.
    """
    # The request class is ``urllib2.Request``; ``urllib2.request`` is not a
    # callable provided by the module.
    req = urllib2.Request(
        'https://mm.taobao.com/self/aiShow.htm?&userId=%s' % userId)
    req.add_header('user-agent', headers())
    html = urllib2.urlopen(req).read().decode('gbk').encode('utf-8')
    return html
def getAlbumList(userId):
    """Fetch a user's album page from mm.taobao.com and extract album links.

    Args:
        userId: value substituted into the ``user_id`` query parameter.

    Returns:
        Every second ``href`` target (without the leading ``//``) of the
        anchors carrying ``class="mm-first"``.
    """
    # ``urllib2.Request`` is the request class, and the URL needs a ``%s``
    # placeholder: formatting a string without one raises TypeError.
    req = urllib2.Request(
        'https://mm.taobao.com/self/model_album.htm?&user_id=%s' % userId)
    req.add_header('user-agent', headers())
    html = urllib2.urlopen(req).read().decode('gbk').encode('utf-8')
    reg = r'class="mm-first" href="//(.*?)"'
    # Keep every other match -- presumably each album link appears twice in
    # the markup; verify against the live page.
    return re.findall(reg, html)[::2]
def getPicture(userId, album_id):
    """Print the ``picUrl`` of every entry in an album's photo list.

    Args:
        userId: value substituted into the ``user_id`` query parameter.
        album_id: value substituted into the ``album_id`` query parameter.

    Returns:
        None. The URLs are written to standard output, one per line.
    """
    url = ('https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id=%s&album_id=%s'
           % (userId, album_id))
    request = urllib.request.Request(url)
    request.add_header('user-agent', headers())
    response = urllib.request.urlopen(request)
    # The body is GBK-encoded JSON.
    payload = json.loads(response.read().decode('gbk'))
    for photo in payload['picList']:
        print(photo['picUrl'])
def getUserName():
    """Return the ``user_username`` of every site in the FlightAware ADS-B stats table.

    The request asks the ``adsb_stats.rvt`` endpoint for the "sites" table
    with ``start=0`` and ``length=1000``.

    Returns:
        A list with the ``user_username`` field of each item under the
        response's ``data`` key, in response order.
    """
    request = urllib2.Request(
        'https://flightaware.com/ajax/ignoreuser/adsb/adsb_stats.rvt?table=sites&start=0&length=1000')
    request.add_header('user-agent', cons.headers())
    payload = json.loads(urllib2.urlopen(request).read())
    return [site['user_username'] for site in payload['data']]
def getUrlList():
    """Query the mm.taobao.com ``tstar_model.do`` search endpoint.

    The search parameters are sent as a request body; supplying ``data`` to
    ``urlopen`` makes the request a POST.

    Returns:
        The ``searchDOList`` value found under ``data`` in the decoded JSON
        response.
    """
    req = urllib.request.Request(
        'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8')
    req.add_header('user-agent', headers())
    # "&currentPage" had been corrupted into a non-ASCII currency sign
    # followed by "tPage", which is not even legal inside a bytes literal.
    form = (b'q&viewFlag=A&sortType=default&searchStyle=&searchRegion=city%3A'
            b'&searchFansNum=&currentPage=1&pageSize=100')
    html = urllib.request.urlopen(req, data=form).read().decode('gbk')
    result = json.loads(html)
    return result['data']['searchDOList']
def getAlbumUrl(userId):
    """Fetch a user's album list from mm.taobao.com and return the album ids.

    Args:
        userId: value substituted into the URL's user id query parameter.

    Returns:
        A list with the ``album_id`` query value of every second
        ``mm-first`` anchor on the page.

    Raises:
        IndexError: if a selected link does not contain ``album_id=``.
    """
    req = urllib.request.Request(
        'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%%20=%s'
        % userId)
    req.add_header('user-agent', headers())
    html = urllib.request.urlopen(req).read().decode('gbk')
    reg = r'<a class="mm-first" href="//(.*?)" target="_blank">'
    # Keep every other match -- presumably each album link appears twice in
    # the markup; verify against the live page.
    albumList = re.findall(reg, html)[::2]
    # The id is the text between "album_id=" and the next "&".
    return [link.split('album_id=')[1].split('&')[0] for link in albumList]
def getUrlList():
    """Query the mm.taobao.com ``tstar_model.do`` search endpoint.

    Returns:
        The ``searchDOList`` value found under ``data`` in the decoded JSON
        response.
    """
    req = urllib2.Request(
        'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8')
    req.add_header('user-agent', headers())
    # This is a POST rather than a GET, so the parameters travel in ``data``.
    # "&currentPage" had been corrupted into a currency sign followed by
    # "tPage"; it is restored here.
    response = urllib2.urlopen(
        req,
        data='q&viewFlag=A&sortType=default&searchStyle=&searchRegion=city%3A'
             '&searchFansNum=&currentPage=1&pageSize=100')
    # Keep the transcoded body: the text, not the response object, is what
    # has to be handed to the JSON parser.
    html = response.read().decode('gbk').encode('utf-8')
    # The reply is JSON; convert it to a dict so the wanted slice can be
    # read by key instead of being picked out with a regex.
    result = loads(html)
    return result['data']['searchDOList']
def getInfo(userId):
    """Fetch the ``aiShow.htm`` page for a user from mm.taobao.com and print it.

    Args:
        userId: value substituted into the ``userId`` query parameter.

    Returns:
        None. The GBK-decoded page body is written to standard output.
    """
    url = 'https://mm.taobao.com/self/aiShow.htm?userId=%s' % userId
    request = urllib.request.Request(url)
    request.add_header('user-agent', headers())
    response = urllib.request.urlopen(request)
    print(response.read().decode('gbk'))