def down_media_file(url, mediaInfo=None):
    baseDir = "song"
    requestInfo = request_variable_init(url)
    requestInfo.requestName = 'down_media_file'
    if mediaInfo:
        headers['Referer'] = url
        # check whether the target directory / file already exists
        addr = '{}/{}'.format(baseDir, mediaInfo.author)
        if not os.path.exists(addr):
            os.mkdir(addr)
        addr = os.path.join(addr, '{}.{}'.format(mediaInfo.mediaName, mediaInfo.mediaFormat))
        logger.debug('media addr {}'.format(addr))
    else:
        md5 = hashlib.md5()
        md5.update(url.encode('utf-8'))
        addr = os.path.join(baseDir, md5.hexdigest())
        logger.debug('media addr {}'.format(addr))
    if os.path.exists(addr):
        logger.debug('media addr {} already exists'.format(addr))
        requestInfo.status = 0
        return requestInfo, None
    response = None
    try:
        logger.debug('request url {}'.format(url))
        response = requests.get(url, headers=headers)
    except requests.exceptions.ConnectTimeout as e:
        # connect timeout: the server did not answer within the allotted time
        requestInfo.status = -3
        logger.exception(sys.exc_info())
    except requests.exceptions.ReadTimeout as e:
        # read timeout: no data arrived before the client gave up waiting
        requestInfo.status = -3
        logger.exception(sys.exc_info())
    except requests.exceptions.ConnectionError as e:
        # network problem or server failure
        requestInfo.status = -2
        logger.exception(sys.exc_info())
    except requests.exceptions.RequestException as e:
        requestInfo.status = -1
        logger.exception(sys.exc_info())
    except Exception as e:
        logger.exception(sys.exc_info())
        requestInfo.status = 0
    if response and response.status_code == 200:
        requestInfo.status = response.status_code
        try:
            with open(addr, 'wb') as fp:
                fp.write(response.content)
        except Exception as e:
            requestInfo.status = 0
            logger.exception(sys.exc_info())
    else:
        logger.error('requests error {}'.format(response if response is None else response.status_code))
        if response:
            requestInfo.status = response.status_code
    return requestInfo, None

def weibo_http_get_home_uniqueid(ticket, ssosavestate, session=None):
    url = ('https://passport.weibo.com/wbsso/login?ticket={}&ssosavestate={}'
           '&callback=sinaSSOController.doCrossDomainCallBack&scriptId=ssoscript0'
           '&client=ssologin.js(v1.4.19)&_={}').format(ticket, ssosavestate, str(int(time.time() * 1000)))
    logger.debug(url)
    try:
        # headers['Referer'] = 'https://weibo.com/'
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)
        if response and response.status_code == 200:
            txt = response.content.decode('gbk')
            logger.debug(txt)
            # strip the jsonp wrapper and turn the payload into a python literal
            txt = txt.replace('sinaSSOController.doCrossDomainCallBack', '')
            txt = txt.replace(';', '')
            txt = txt.replace('true', '1')
            resultDict = eval(txt)
            return resultDict
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            return None
    except Exception as e:
        logger.exception(sys.exc_info())
    return None

def weibo_http_get_raskey(user, session=None):
    '''
    Returns the prelogin result dict on success or None on failure
    :param user: weibo login user name (base64 encoded)
    :return:
    '''
    url = ("https://login.sina.com.cn/sso/prelogin.php?entry=weibo"
           "&callback=sinaSSOController.preloginCallBack&su={}&rsakt=mod"
           "&client=ssologin.js(v1.4.19)&_={}").format(user, str(int(time.time() * 1000)))
    logger.debug(url)
    try:
        headers['Referer'] = 'https://weibo.com/'
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)
        if response and response.status_code == 200:
            logger.debug(response.content.decode('utf-8'))
            text = response.content.decode('utf-8')
            result = eval(text.replace('sinaSSOController.preloginCallBack', ''))
            return result
            # nonce = re.findall(r'"nonce":"(.*?)"', text)[0]
            # pubkey = re.findall(r'"pubkey":"(.*?)"', text)[0]
            # rsakv = re.findall(r'"rsakv":"(.*?)"', text)[0]
            # servertime = re.findall(r'"servertime":(.*?),', text)[0]
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            return None
    except Exception as e:
        logger.exception(sys.exc_info())
    return None

def weibo_http_get_verify_pic(pcid, session=None):
    '''
    download img files and return img file path
    :param pcid:
    :return: return img file path
    '''
    url = 'https://login.sina.com.cn/cgi/pin.php?r={}&s=0&p={}'.format(
        math.floor(random.random() * 100000000), pcid)
    logger.debug(url)
    try:
        headers['Referer'] = 'https://weibo.com/'
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)
        if response and response.status_code == 200:
            filename = 'img/{}.png'.format(pcid)
            with open(filename, 'wb') as fp:
                fp.write(response.content)
            return filename
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            return None
    except Exception as e:
        logger.exception(sys.exc_info())
    return None

def weibo_http_get_tophot_list(session=None):
    '''
    Return the list of hot searches
    :param session:
    :return: a list like [(url, topname), (url, topname)] on success, None on failure
    '''
    url = 'https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6'
    logger.debug(url)
    try:
        headers['Referer'] = 'https://weibo.com/'
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)
        if response and response.status_code == 200:
            txt = response.content.decode('utf8')
            logger.debug(txt)
            # pattern = re.compile(r'<td class="td-02">\s*<a href="(.*)" target="_blank">(.*)</a>.*</td>', re.S)
            pattern = re.compile(
                r'<td class="td-02">\s*?<a href="(.*?)" target="_blank">(\S*?)</a>.*?</td>', re.S)
            reGroups = pattern.findall(txt)
            if not reGroups:
                return None
            return reGroups
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            return None
    except Exception as e:
        logger.exception(sys.exc_info())
    return None

def weibo_http_get_navigation_page_url(url, session=None):
    '''
    Collect the urls of every data page under the current category tab
    :param url:
    :param session:
    :return:
    '''
    try:
        headers['Referer'] = 'https://weibo.com/'
        logger.debug('request url {}'.format(url))
        requestInfo = request_variable_init(url)
        requestInfo.requestName = 'weibo_http_get_navigation_page_url'
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)
        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            txt = response.content.decode('utf-8')
            # logger.debug(txt)
            # findall is greedy by default; the trailing ? makes the match non-greedy
            pattern = re.compile(r'<div class=\\"W_pages\\">(.*?)<\\/div>', re.S)
            reGroups = pattern.findall(txt)
            if not reGroups:
                logger.error('re find W_pages error')
                yield requestInfo, None
            if reGroups:
                allPagesTxt = reGroups[0]
                pattern = re.compile(r'href=\\"\\(.*?)\\">', re.S)
                reGroups = pattern.findall(allPagesTxt)
                maxPageTxt = reGroups[-2]
                logger.debug(maxPageTxt)
                maxPageReGroups = re.search(r'&page=(\d*)', maxPageTxt)
                maxPage = int(maxPageReGroups.group(1))
                logger.debug(maxPageReGroups.group(1))
                strFormat = maxPageTxt.replace('page={}'.format(maxPage), 'page={}')
                logger.debug(strFormat)
                for i in range(1, maxPage + 1):
                    url = strFormat.format(i)
                    url = 'https://d.weibo.com{}'.format(url)
                    logger.debug(url)
                    resultInfo = request_variable_init(url)
                    resultInfo.requestName = 'weibo_http_get_navigation'
                    yield resultInfo, None
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            requestInfo.status = response.status_code if response else 0
    except Exception as e:
        logger.exception(sys.exc_info())
        requestInfo.status = 0
    yield requestInfo, None

def weibo_http_post_login_location(su, sp, nonce, rsakv, servertime, pcid=None, verify=None, session=None):
    url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)'
    logger.debug(url)
    try:
        # headers['Referer'] = 'https://weibo.com/'
        # headers.pop('Referer')
        datas = {
            'entry': 'weibo',
            'gateway': '1',
            'from': '',
            'savestate': '7',
            'qrcode_flag': 'false',
            'useticket': '1',
            'pagerefer': 'https://login.sina.com.cn/crossdomain2.php?action=logout&r=https%3A%2F%2Fpassport.weibo.com%2Fwbsso%2Flogout%3Fr%3Dhttps%253A%252F%252Fweibo.com%26returntype%3D1',
            'vsnf': '1',
            'su': su,
            'service': 'miniblog',
            'servertime': servertime,
            'nonce': nonce,
            'pwencode': 'rsa2',
            'rsakv': rsakv,
            'sp': sp,
            'sr': '1366*768',
            'encoding': 'UTF-8',
            'prelt': '115',
            'url': 'https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype': 'META'
        }
        if verify:
            datas['pcid'] = pcid
            datas['door'] = verify
        logger.debug(datas)
        if session:
            response = session.post(url, headers=headers, data=datas, verify=False)
        else:
            response = requests.post(url, headers=headers, data=datas, verify=False)
        if response and response.status_code == 200:
            txt = response.content.decode('gbk')
            logger.debug(txt)
            reGroups = re.match(r'.*location\.replace\("(.*)"\);.*', txt, re.S)
            locationDict = None
            if reGroups:
                locationUrl = reGroups.group(1)
                locationUrl = unquote_plus(locationUrl)
                logger.debug('unquote url {}'.format(locationUrl))
                locationDict = parse_qs(urlparse(locationUrl).query)
            return locationDict
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            return None
    except Exception as e:
        logger.exception(sys.exc_info())
    return None

def get_artist_music_list_xhr(url):
    headers['Referer'] = url
    try:
        logger.debug('request url {}'.format(url))
        requestInfo = request_variable_init(url)
        requestInfo.requestName = 'get_artist_music_list_xhr'
        response = requests.get(url, headers=headers)
        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            jsonObj = json.loads(response.content.decode('utf8'))
            htmlText = jsonObj['data']['html']
            # logger.debug(htmlText)
            # htmlText = html.unescape(htmlText)
            # logger.debug(htmlText)
            # htmlText = htmlText.encode('utf8').decode('unicode-escape')
            # logger.debug(htmlText)
            # text = response.content.decode('unicode-escape')
            # import html
            # text = html.unescape(text)
            # r'<a href="/song/(\d+)" target="_blank" class="namelink" title="(.*)" a-tj'
            pattern = re.compile(
                r'<a href="/song/(\d+)" target="_blank" class="namelink \w*" title="(\S*)" a-tj', re.S)
            reGroups = pattern.findall(htmlText)
            if reGroups:
                # hard-coded jsonp callback name and timestamp copied from the web player
                jQuery = '17204780742719340729_1586053549318'
                item = '1586053553445'
                for songid, k in reGroups:
                    # print(i,k)
                    url = ('http://musicapi.taihe.com/v1/restserver/ting?method=baidu.ting.song.playAAC'
                           '&format=jsonp&callback=jQuery{}&songid={}&from=web&_={}').format(jQuery, songid, item)
                    resultInfo = request_variable_init(url)
                    resultInfo.requestName = 'get_media_info_js_request'
                    yield resultInfo, None
            else:
                requestInfo.status = 0
                logger.error('parser music list xhr error')
        else:
            requestInfo.status = response.status_code if response else 0
            logger.error('requests error {}'.format(response if response is None else response.status_code))
    except Exception as e:
        logger.exception(sys.exc_info())
    yield requestInfo, None

def get_artist_info(url):
    # http://music.taihe.com/data/tingapi/v1/restserver/ting?method=baidu.ting.artist.getInfo&from=web&tinguid=1097
    headers['Referer'] = url
    try:
        logger.debug('request url {}'.format(url))
        response = requests.get(url, headers=headers)
        if response and response.status_code == 200:
            pass
        else:
            logger.error('requests error {}'.format(response if response is None else response.status_code))
    except Exception as e:
        logger.exception(sys.exc_info())

def get_artist_music_list(url):
    headers['Referer'] = url
    try:
        logger.debug('request url {}'.format(url))
        requestInfo = request_variable_init(url)
        requestInfo.requestName = 'get_artist_music_list'
        response = requests.get(url, headers=headers)
        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            # soup = BeautifulSoup(response.content.decode('utf8'), 'html5lib')
            # tag = soup.find('div', class_='page_navigator-box').find('div', class_='page-navigator-hook')
            # if tag:
            #     attr = tag['class']
            #     print(attr)
            #     total = attr[5].strip(',').split(':')[1]
            #     size = attr[6].strip(',').split(':')[1]
            #     print(total, size)
            # pattern = re.compile(".*'total':(\d+),[ \t]*'size':(\d+).*", re.S)
            pattern = re.compile(r"'total':(\d+),[ \t]*'size':(\d+)", re.S)
            totalGroup = pattern.findall(response.content.decode('utf8'))
            if totalGroup:
                # group 0: songs, 1: albums, 2: videos
                total = int(totalGroup[0][0].strip("'"))
                size = int(totalGroup[0][1].strip("'"))
                ting_uid = url[url.rindex('/') + 1:]
                for i in range(0, total, size):
                    # page through the full song list, one xhr request per page of `size` songs
                    xhrUrl = ('http://music.taihe.com/data/user/getsongs?start={}&size={}&ting_uid={}'
                              '&r=0.196355769444312541586235172159').format(i, size, ting_uid)
                    # get_artist_music_list_xhr(xhrUrl)
                    resultInfo = request_variable_init(xhrUrl)
                    resultInfo.requestName = 'get_artist_music_list_xhr'
                    yield resultInfo, None
            else:
                requestInfo.status = 0
                logger.error('parser music list none')
        else:
            requestInfo.status = response.status_code if response else 0
            logger.error('requests error {}'.format(response if response is None else response.status_code))
    except Exception as e:
        requestInfo.status = 0
        logger.exception(sys.exc_info())
    yield requestInfo, None

def get_artist_list(url, filename=None):
    headers['Referer'] = url
    artistDict = {}
    # load the artist list from a cached json file if one is available
    if filename and os.path.exists(filename) and os.path.getsize(filename) > 0:
        # print(os.stat(filename).st_ctime, os.stat(filename).st_mtime)
        try:
            with open(filename, 'r', encoding='utf8') as fp:
                artistDict = json.load(fp)
            return artistDict
        except Exception as e:
            logger.exception(sys.exc_info())
            raise
    try:
        logger.debug('request url {}'.format(url))
        response = requests.get(url, headers=headers)
        if response and response.status_code == 200:
            soup = BeautifulSoup(response.content.decode('utf8'), 'html5lib')
            musicBodyTag = soup.find('div', class_='music-body clearfix').find(
                'div', class_='main-body').find('ul', class_='container')
            if musicBodyTag:
                musicTagList = musicBodyTag.find_all(
                    'a', {'href': re.compile("^.*/[0-9]*$"), 'title': re.compile("^.*$")})
                for tag in musicTagList:
                    artist = tag['title']
                    id = tag['href'][8:]
                    artistDict[id] = artist
            else:
                logger.error('parser artist list none')
        else:
            logger.error('requests error {}'.format(response if response is None else response.status_code))
    except Exception as e:
        logger.exception(sys.exc_info())
        return None
    if filename:
        try:
            with open(filename, 'w', encoding='utf8') as fp:
                json.dump(artistDict, fp, ensure_ascii=False, indent=4)
        except Exception as e:
            logger.exception(sys.exc_info())
            raise
    return artistDict

def request_workflow_thread():
    while True:
        try:
            priority, requestInfo, param = requestQueue.get(block=True, timeout=10)
            requestQueue.task_done()
            logger.debug('PriorityQueue size {}'.format(requestQueue.qsize()))
        except Exception as e:
            logger.exception(sys.exc_info())
            logger.error('request_workflow_thread queue empty')
            break
        if requestInfo and requestInfo.requestName:
            logger.debug('run {} params {}'.format(requestInfo.requestName, requestInfo.requestUrl))
            # resolve the handler function by its name
            if requestInfo.requestName == 'down_media_file':
                results = eval(requestInfo.requestName)(requestInfo.requestUrl, param)
            else:
                results = eval(requestInfo.requestName)(requestInfo.requestUrl)
            if isgeneratorfunction(eval(requestInfo.requestName)):
                logger.debug('isgeneratorfunction {} true'.format(requestInfo.requestName))
                for resultInfo, result in results:
                    if resultInfo.status == 999:
                        logger.debug('PriorityQueue put {},{} '.format(resultInfo.requestName, resultInfo.requestUrl))
                        requestQueue.put((PRIORITYDEFINE[resultInfo.requestName], resultInfo, result))
                    else:
                        if isinstance(result, (MediaInfo, ArtistInfo)):
                            dbsession.add(result)
                        request_info_update_insert(requestInfo)
                try:
                    dbsession.commit()
                except Exception as e:
                    logger.exception(sys.exc_info())
                    logger.error('dbsession error')
                    dbsession.rollback()
            else:
                logger.debug('isgeneratorfunction {} false'.format(requestInfo.requestName))
                resultInfo, result = results
                if isinstance(result, (MediaInfo, ArtistInfo)):
                    dbsession.add(result)
                request_info_update_insert(requestInfo)
                if resultInfo.status != 999:
                    logger.debug('dbsession add resultInfo {}'.format(resultInfo.urlId))
                else:
                    logger.debug('PriorityQueue put {},{} '.format(resultInfo.requestName, resultInfo.requestUrl))
                    requestQueue.put((PRIORITYDEFINE[resultInfo.requestName], resultInfo, result))
                try:
                    dbsession.commit()
                except Exception as e:
                    logger.exception(sys.exc_info())
                    logger.error('dbsession error')
                    dbsession.rollback()
        else:
            logger.error('request_workflow_thread requestInfo none')
            break

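# Illustrative sketch (not part of the original module): one way the music crawl
# could be seeded before starting worker threads. It assumes request_variable_init()
# returns a pending requestInfo (status 999) and that PRIORITYDEFINE has an entry
# for 'get_artist_music_list'; the artist url below is a placeholder.
#
#   seed = request_variable_init('http://music.taihe.com/artist/1097')
#   seed.requestName = 'get_artist_music_list'
#   requestQueue.put((PRIORITYDEFINE[seed.requestName], seed, None))
#   workers = [threading.Thread(target=request_workflow_thread) for _ in range(4)]
#   for w in workers:
#       w.start()
#   requestQueue.join()
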
def weibo_http_get_home(uniqueid, session=None):
    url = 'https://weibo.com/u/{}/home?wvr=5&lf=reg'.format(uniqueid)
    logger.debug(url)
    try:
        headers['Referer'] = 'https://weibo.com/'
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)
        if response and response.status_code == 200:
            # txt = response.content.decode('utf8')
            # logger.debug(txt)
            return True
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            return None
    except Exception as e:
        logger.exception(sys.exc_info())
    return None

def weibo_login(user, password, session=None):
    su = user_base64(user)
    preLoginDict = weibo_http_get_raskey(su, session=session)
    logger.debug('dict:{}'.format(preLoginDict))
    nonce = preLoginDict['nonce']
    pubkey = preLoginDict['pubkey']
    rsakv = preLoginDict['rsakv']
    servertime = preLoginDict['servertime']
    sp = password_rsa(password, servertime, nonce, pubkey)
    verify = None
    pcid = None
    if 'showpin' in preLoginDict.keys():
        pcid = preLoginDict['pcid']
        verify_file = weibo_http_get_verify_pic(pcid, session=session)
        # the captcha image has to be recognized manually
        logger.error('verify file {}'.format(verify_file))
        verify = input("input verify code:")
    locationParams = weibo_http_post_login_location(su, sp, nonce, rsakv, servertime, pcid, verify, session=session)
    if locationParams:
        retcode = locationParams['retcode'][0]
        ticket = locationParams['ticket'][0]
        rParams = parse_qs(urlparse(locationParams['r'][0]).query)
        ssosavestate = rParams['ssosavestate'][0]
    else:
        logger.error('weibo login error')
        return None
    resultDict = weibo_http_get_home_uniqueid(ticket, ssosavestate, session)
    if resultDict is None:
        logger.error('weibo login error')
        return None
    logger.debug("weibo login uniqueid {}".format(resultDict))
    uniqueid = resultDict['userinfo']['uniqueid']
    return weibo_http_get_home(uniqueid, session=session)

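# Illustrative sketch (not part of the original module): weibo_login relies on the
# module-level headers dict and the weibo_http_* helpers above; the credentials
# below are placeholders. Passing a shared requests.Session keeps the SSO cookies
# across the login redirects so the other helpers can reuse them afterwards.
#
#   session = requests.session()
#   if weibo_login('user@example.com', 'password', session=session):
#       tophot = weibo_http_get_tophot_list(session=session)
#       logger.debug(tophot)
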
def request_workflow_thread():
    while True:
        try:
            session = sessionQueue.get(block=True, timeout=10)
            sessionQueue.task_done()
            sessionQueue.put(session)
            priority, requestInfo, param = requestQueue.get(block=True, timeout=10)
            requestQueue.task_done()
            logger.debug('PriorityQueue size {}'.format(requestQueue.qsize()))
        except Exception as e:
            logger.exception(sys.exc_info())
            logger.error('request_workflow_thread queue empty')
            break
        # slow down between requests
        time.sleep(20)
        if requestInfo and requestInfo.requestName:
            logger.debug('run {} params {}'.format(requestInfo.requestName, requestInfo.requestUrl))
            results = eval(requestInfo.requestName)(requestInfo.requestUrl, session)
            # check whether the handler function is a generator
            if isgeneratorfunction(eval(requestInfo.requestName)):
                logger.debug('isgeneratorfunction {} true'.format(requestInfo.requestName))
                for resultInfo, result in results:
                    if resultInfo.status == 999:
                        logger.debug('PriorityQueue put {},{} '.format(resultInfo.requestName, resultInfo.requestUrl))
                        requestQueue.put((PRIORITYDEFINE[resultInfo.requestName], resultInfo, result))
                    else:
                        if isinstance(result, list):
                            for item in result:
                                if isinstance(item, WeiboUser):
                                    dbsession.add(item)
                        elif isinstance(result, WeiboUser):
                            dbsession.add(result)
                        request_info_update_insert(requestInfo)
                try:
                    dbsession.commit()
                except Exception as e:
                    logger.exception(sys.exc_info())
                    logger.error('dbsession error')
                    dbsession.rollback()
            else:
                logger.debug('isgeneratorfunction {} false'.format(requestInfo.requestName))
                resultInfo, result = results
                if isinstance(result, list):
                    for item in result:
                        if isinstance(item, WeiboUser):
                            dbsession.add(item)
                elif isinstance(result, WeiboUser):
                    dbsession.add(result)
                request_info_update_insert(requestInfo)
                if resultInfo.status != 999:
                    logger.debug('dbsession add resultInfo {}'.format(resultInfo.urlId))
                else:
                    logger.debug('PriorityQueue put {},{} '.format(resultInfo.requestName, resultInfo.requestUrl))
                    requestQueue.put((PRIORITYDEFINE[resultInfo.requestName], resultInfo, result))
                try:
                    dbsession.commit()
                except Exception as e:
                    logger.exception(sys.exc_info())
                    logger.error('dbsession error')
                    dbsession.rollback()
        else:
            logger.error('request_workflow_thread requestInfo none')
            break

def weibo_http_get_navigation_page_list(url, session=None):
    '''
    Collect the category tab urls from the navigation page
    :param session:
    :return:
    '''
    logger.debug(url)
    try:
        headers['Referer'] = 'https://weibo.com/'
        requestInfo = request_variable_init(url)
        requestInfo.requestName = 'weibo_http_get_navigation_page_list'
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)
        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            txt = response.content.decode('utf-8')
            # logger.debug(txt)
            # findall is greedy by default; the trailing ? makes the match non-greedy
            pattern = re.compile(r'<li class=\\"li_1 clearfix\\">(.*?)<\\/li>', re.S)
            reGroups = pattern.findall(txt)
            if not reGroups:
                logger.error('re find li_1 clearfix error')
                yield requestInfo, None
            else:
                navigationDict = {}
                for item in reGroups:
                    pattern = re.compile(r'<span class=\\"pt_title S_txt2\\">(.*?)<\\/span>', re.S)
                    reTags = pattern.findall(item)
                    key = reTags[0].replace(':', '')
                    pattern = re.compile(
                        r'<a target=\\"_blank\\" href=\\"(.*?)\\".*?<span.*?<\\/span>(.*?)<\\/a>', re.S)
                    reTags = pattern.findall(item)
                    value = [(v[0], v[1].replace('\\t', '').strip()) for v in reTags]
                    id, name = value[0]
                    # the '_0' suffix is the "all" sub-tab of the category ('全部' means "all")
                    id = id[:id.rindex('_')] + '_0'
                    name = '全部'
                    value.insert(0, (id, name))
                    navigationDict[key] = value
                    url = 'https://d.weibo.com/{}#'.format(id)
                    resultInfo = request_variable_init(url)
                    resultInfo.requestName = 'weibo_http_get_navigation_page_url'
                    yield resultInfo, None
                # logger.debug(navigationDict)
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            requestInfo.status = response.status_code if response else 0
    except Exception as e:
        logger.exception(sys.exc_info())
        requestInfo.status = 0
    yield requestInfo, None

def weibo_http_get_navigation(url, session=None):
    '''
    Parse the user cards on one category data page into WeiboUser records
    :param url:
    :param session:
    :return:
    '''
    try:
        headers['Referer'] = 'https://weibo.com/'
        logger.debug('request url {}'.format(url))
        requestInfo = request_variable_init(url)
        requestInfo.requestName = 'weibo_http_get_navigation'
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)
        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            txt = response.content.decode('utf-8')
            pattern = re.compile(r'<li class=\\"follow_item S_line2\\">(.*?)<\\/li>', re.S)
            reGroups = pattern.findall(txt)
            if not reGroups:
                logger.error('re find follow_item error')
                return requestInfo, None
            else:
                logger.debug(reGroups)
                userList = []
                for item in reGroups:
                    user = WeiboUser()
                    # avatar
                    pattern = re.compile(r'<dt class=\\"mod_pic\\">.*?src=\\"(.*?)\\".*<\\/dt>', re.S)
                    picReGroups = pattern.findall(item)
                    # print(picReGroups)
                    # user name and id
                    # '<strong.*?usercard=\\"(.*?)\\"\s*>(.*?)<\\/strong>.*?<i.*?class=\\"(.*?)\\".*?><\\/i>.*?'
                    pattern = re.compile(r'<div class=\\"info_name W_fb W_f14\\">(.*?)<\\/div>', re.S)
                    infoNameGroups = pattern.findall(item)
                    if infoNameGroups:
                        txt = infoNameGroups[0]
                        tags = re.findall(r'.*<strong.*?usercard=\\"(.*?)\\"\s*>(.*?)<\\/strong>.*', txt, re.S)
                        if tags:
                            user.username = tags[0][1]
                            user.userid = tags[0][0]
                        tags = re.findall(r'<i.*?class=\\"(.*?)\\".*?><\\/i>', txt, re.S)
                        if tags:
                            for tag in tags:
                                if 'icon_approve' in tag:
                                    # personal verification badge
                                    user.verify = '1'
                                elif 'icon_female' in tag:
                                    user.gender = 'female'
                                elif 'icon_male' in tag:
                                    user.gender = 'male'
                                elif 'icon_member' in tag:
                                    # weibo paid membership
                                    user.member = '1'
                    # follow / fans / post counts
                    pattern = re.compile(
                        r'<div class=\\"info_connect\\">.*?<em class=\\"count\\">(.*?)<\\/em>.*?<em class=\\"count\\">(.*?)<\\/em>.*?<em class=\\"count\\">(.*?)<\\/em>.*?<\\/div>', re.S)
                    infoConnectGroups = pattern.findall(item)
                    user.focusnumber, user.fansnumber, user.weibonumber = infoConnectGroups[0]
                    # user.focusnumber = int(infoNameGroups[0][0])
                    # user.fansnumber = int(infoNameGroups[0][1])
                    # user.weibonumber = int(infoNameGroups[0][2])
                    # location
                    pattern = re.compile(r'<div class=\\"info_add\\">.*?<span>(.*?)<\\/span>.*?<\\/div>', re.S)
                    infoAddGroups = pattern.findall(item)
                    adds = infoAddGroups[0].split(' ')
                    # print(infoAddGroups, adds)
                    if len(adds) == 2:
                        user.province = adds[0]
                        user.city = adds[1]
                    else:
                        user.province = adds[0]
                        user.city = adds[0]
                    # profile intro
                    pattern = re.compile(r'<div class=\\"info_intro\\">.*?<span>(.*?)<\\/span>.*?<\\/div>', re.S)
                    infoIntroGroups = pattern.findall(item)
                    user.intro = infoIntroGroups[0]
                    # print(user.intro)
                    userList.append(user)
                return requestInfo, userList
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            requestInfo.status = response.status_code if response else 0
    except Exception as e:
        logger.exception(sys.exc_info())
        requestInfo.status = 0
    return requestInfo, None

def get_media_info_js_request(url):
    logger.debug('request url {}'.format(url))
    requestInfo = request_variable_init(url)
    requestInfo.requestName = 'get_media_info_js_request'
    headers['Referer'] = url
    httpSession = requests.session()
    s_time = time.time()
    mediaInfo = None
    try:
        response = httpSession.get(url, headers=headers)
        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            # jsonStr = response.text.strip('jQuery{}('.format(jQuery))
            # jsonStr = jsonStr.strip(');')
            # strip the jsonp wrapper: jQueryXXX({...});
            pattern = re.compile(r'^jQuery\S*\(({.*})\);$')
            reGroups = pattern.match(response.text)
            if reGroups:
                jsonStr = reGroups.group(1)
            else:
                requestInfo.status = 0
                logger.error('parser js none')
                return requestInfo, None
            jsonObj = json.loads(jsonStr)
            mediaInfo = MediaInfo()
            mediaInfo.mediaUrl = jsonObj['bitrate']['show_link']
            md5 = hashlib.md5()
            md5.update(mediaInfo.mediaUrl.encode('utf-8'))
            mediaInfo.mediaId = md5.hexdigest()
            ### ERROR: required fields may be empty and need special handling
            mediaInfo.mediaName = jsonObj['songinfo']['title']
            mediaInfo.mediaLang = jsonObj['songinfo']['language']
            mediaInfo.country = jsonObj['songinfo']['country']
            mediaInfo.proxycompany = jsonObj['songinfo']['si_proxycompany']
            mediaInfo.compose = jsonObj['songinfo']['compose']
            mediaInfo.writer = jsonObj['songinfo']['songwriting']
            mediaInfo.author = jsonObj['songinfo']['author']
            mediaInfo.publishTime = jsonObj['songinfo']['publishtime']
            mediaInfo.albumName = jsonObj['songinfo']['album_title']
            mediaInfo.lrcUrl = jsonObj['songinfo']['lrclink']
            mediaInfo.mediaSize = jsonObj['bitrate']['file_size']
            mediaInfo.mediaFormat = jsonObj['bitrate']['file_format']
            mediaInfo.albumId = jsonObj['songinfo']['album_id']
            # del_status: 1 removed for copyright reasons, 0 playable
            mediaInfo.useStatus = jsonObj['songinfo']['del_status']
            mediaInfo.source = urllib.parse.urlparse(mediaInfo.mediaUrl).netloc
            # mediaInfo.sourceDate = time.strftime('%Y%m%d', time.localtime(time.time()))
            e_time = time.time()
            mediaInfo.useTime = e_time - s_time
            logger.debug('media url {}'.format(mediaInfo.mediaUrl))
            # hand the resolved media url back to the workflow as a download request
            requestInfo = request_variable_init(mediaInfo.mediaUrl)
            requestInfo.requestName = 'down_media_file'
        else:
            logger.error('requests error {}'.format(response if response is None else response.status_code))
            requestInfo.status = response.status_code if response else 0
    except requests.exceptions.ConnectTimeout as e:
        # connect timeout: the server did not answer within the allotted time
        requestInfo.status = -3
        logger.exception(sys.exc_info())
    except requests.exceptions.ReadTimeout as e:
        # read timeout: no data arrived before the client gave up waiting
        requestInfo.status = -3
        logger.exception(sys.exc_info())
    except requests.exceptions.ConnectionError as e:
        # network problem or server failure
        requestInfo.status = -2
        logger.exception(sys.exc_info())
    except requests.exceptions.RequestException as e:
        requestInfo.status = -1
        logger.exception(sys.exc_info())
    except Exception as e:
        logger.exception(sys.exc_info())
        requestInfo.status = 0
        return requestInfo, None
    return requestInfo, mediaInfo

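# Illustrative note (an assumption about the intended flow, not code from the
# original module): on success get_media_info_js_request() returns a new pending
# requestInfo pointing at the resolved media url with requestName 'down_media_file',
# together with the parsed MediaInfo. The workflow thread re-queues that pair, so
# the next worker iteration effectively runs:
#
#   requestInfo, mediaInfo = get_media_info_js_request(playAacUrl)  # playAacUrl is a placeholder
#   if requestInfo.requestName == 'down_media_file':
#       # writes the file under song/<author>/<title>.<format>
#       down_media_file(requestInfo.requestUrl, mediaInfo)
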