Esempio n. 1
0
def down_media_file(url, mediaInfo=None):
    """Download a media file referenced by *url* into the local ``song`` tree.

    :param url: direct URL of the media resource.
    :param mediaInfo: optional metadata; when given the file is stored under
        ``song/<author>/<mediaName>.<mediaFormat>``, otherwise under an MD5
        hash of the URL.
    :return: ``(requestInfo, None)``; ``requestInfo.status`` carries the HTTP
        status code on success, or 0/-1/-2/-3 on the various failure modes.
    """
    baseDir = "song"

    requestInfo = request_variable_init(url)
    requestInfo.requestName = 'down_media_file'

    if mediaInfo:
        headers['Referer'] = url

        # Build the target path and make sure the per-author directory exists.
        addr = '{}/{}'.format(baseDir, mediaInfo.author)
        if not os.path.exists(addr):
            # FIX: makedirs also creates a missing base directory; os.mkdir
            # raised FileNotFoundError when "song" did not exist yet.
            os.makedirs(addr)
        addr = os.path.join(addr, '{}.{}'.format(mediaInfo.mediaName, mediaInfo.mediaFormat))
        logger.debug('media addr {}'.format(addr))
    else:
        # No metadata: derive a stable file name from the URL's MD5.
        md5 = hashlib.md5()
        md5.update(url.encode('utf-8'))
        addr = os.path.join(baseDir, md5.hexdigest())
        logger.debug('media addr {}'.format(addr))

    if os.path.exists(addr):
        # Already downloaded earlier; skip the request entirely.
        logger.debug('media addr {} is exists'.format(addr))
        requestInfo.status = 0
        return requestInfo,None

    # FIX: bind response before the try block; previously a raised request
    # left `response` unbound and the check below crashed with NameError.
    response = None
    try:
        logger.debug('request url {}'.format(url))
        response = requests.get(url, headers=headers)
    except requests.exceptions.ConnectTimeout:
        # Connect timeout: the server did not answer within the limit.
        requestInfo.status = -3
        logger.exception(sys.exc_info())
    except requests.exceptions.ReadTimeout:
        # Read timeout: no first byte arrived before the client gave up.
        requestInfo.status = -3
        logger.exception(sys.exc_info())
    except requests.exceptions.ConnectionError:
        # Network environment failure or server-side failure.
        requestInfo.status = -2
        logger.exception(sys.exc_info())
    except requests.exceptions.RequestException:
        requestInfo.status = -1
        logger.exception(sys.exc_info())
    except Exception:
        logger.exception(sys.exc_info())
        requestInfo.status = 0

    if response and response.status_code==200:
        requestInfo.status = response.status_code
        try:
            with open(addr, 'wb') as fp:
                fp.write(response.content)
        except Exception:
            requestInfo.status = 0
            logger.exception(sys.exc_info())
    else:
        logger.error('requests error {}'.format(response if response is None else response.status_code))
        requestInfo.status = response.status_code if response else 0
    return requestInfo,None
Esempio n. 2
0
def weibo_http_get_home_uniqueid(ticket, ssosavestate, session=None):
    """Exchange the SSO *ticket* for the cross-domain login result.

    :param ticket: ticket obtained from the login POST redirect.
    :param ssosavestate: savestate value from the same redirect.
    :param session: optional requests session carrying the login cookies.
    :return: parsed result dict (contains ``userinfo.uniqueid``) or None.
    """
    # FIX: the timestamp was passed to format() but the URL had no third
    # placeholder, so it was silently dropped and `_` stayed hard-coded.
    url = 'https://passport.weibo.com/wbsso/login?ticket={}&ssosavestate={}&callback=sinaSSOController.doCrossDomainCallBack&scriptId=ssoscript0&client=ssologin.js(v1.4.19)&_={}'.format(ticket, ssosavestate, str(int(time.time()*1000)))
    logger.debug(url)

    try:
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)
        if response and response.status_code==200:
            txt = response.content.decode('gbk')
            logger.debug(txt)
            # Strip the JSONP wrapper and massage JS literals into Python ones.
            txt = txt.replace('sinaSSOController.doCrossDomainCallBack', '')
            txt = txt.replace(';', '')
            txt = txt.replace('true', '1')
            # SECURITY: eval() on server-controlled text can execute arbitrary
            # code; consider ast.literal_eval or json.loads instead.
            resultDict = eval(txt)
            return resultDict
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            return None
    except Exception:
        logger.exception(sys.exc_info())
        return None
Esempio n. 3
0
def weibo_http_get_raskey(user, session=None):
    '''
        Fetch the RSA prelogin parameters (nonce / pubkey / rsakv / servertime).
    :param user: base64-encoded weibo login user name (su).
    :param session: optional requests session carrying cookies.
    :return: the parsed prelogin dict on success or None on failure.
    '''
    url = "https://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su={}&rsakt=mod&client=ssologin.js(v1.4.19)&_={}".format( user, str(int(time.time()*1000)) )
    logger.debug(url)

    try:
        headers['Referer'] = 'https://weibo.com/'
        if session:
            # FIX: this branch called requests.get(), silently ignoring the
            # session (and its cookies); every sibling helper uses session.get.
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)
        if response and response.status_code==200:
            text = response.content.decode('utf-8')
            logger.debug(text)
            # SECURITY: eval() on server-controlled text can execute arbitrary
            # code; consider json.loads after stripping the JSONP wrapper.
            result = eval(text.replace('sinaSSOController.preloginCallBack',''))
            return result
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            return None
    except Exception:
        logger.exception(sys.exc_info())
        return None
Esempio n. 4
0
def weibo_http_get_verify_pic(pcid, session=None):
    '''
    Download the login captcha image and return its file path.
    :param pcid: captcha id returned by the prelogin call.
    :param session: optional requests session carrying cookies.
    :return: image file path on success, None on failure.
    '''
    url = 'https://login.sina.com.cn/cgi/pin.php?r={}&s=0&p={}'.format( math.floor(random.random()*100000000), pcid )
    logger.debug(url)
    try:
        headers['Referer'] = 'https://weibo.com/'
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)
        if response and response.status_code==200:
            # ROBUSTNESS: ensure the target directory exists — previously a
            # missing img/ dir made open() raise and the captcha was lost.
            os.makedirs('img', exist_ok=True)
            filename = 'img/{}.png'.format(pcid)
            with open(filename, 'wb') as fp:
                fp.write(response.content)
            return filename
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            return None
    except Exception:
        logger.exception(sys.exc_info())
        return None
Esempio n. 5
0
def weibo_http_get_tophot_list(session=None):
    '''
        Return the list of weibo hot searches.
    :param session: optional requests session carrying login cookies.
    :return: list like [(url, topname), (url, topname), ...] on success,
        None on failure (HTTP error, exception, or nothing matched).
    '''
    url = 'https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6'
    logger.debug(url)

    try:
        headers['Referer'] = 'https://weibo.com/'
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)

        if response and response.status_code == 200:
            txt = response.content.decode('utf8')
            logger.debug(txt)
            # Non-greedy match: capture (href, topic-name) pairs from the table.
            pattern = re.compile(
                r'<td class="td-02">\s*?<a href="(.*?)" target="_blank">(\S*?)</a>.*?</td>',
                re.S)
            reGroups = pattern.findall(txt)
            # BUG FIX: the check was inverted — it returned None exactly when
            # matches WERE found, and an empty list when none were.
            if not reGroups:
                return None
            return reGroups
        else:
            logger.error('request error http code:{}'.format(
                response.status_code))
            return None
    except Exception:
        logger.exception(sys.exc_info())
        return None
Esempio n. 6
0
def weibo_http_get_navigation_page_url(url, session=None):
    '''
        From a category tab page, collect every pagination-page URL under the
        current category tab and yield one request-info per page.
    :param url: category tab page URL (d.weibo.com).
    :param session: optional requests session carrying login cookies.
    :return: generator of (requestInfo, None) tuples; pagination pages are
        yielded first, the tab page's own requestInfo is yielded last with its
        final status.
    '''
    try:
        headers['Referer'] = 'https://weibo.com/'
        logger.debug('request url {}'.format(url))
        requestInfo = request_variable_init(url)
        requestInfo.requestName = 'weibo_http_get_navigation_page_url'

        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)

        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            txt = response.content.decode('utf-8')
            # logger.debug(txt)
            # re matching is greedy by default; the ? suffix makes it
            # non-greedy. The HTML is embedded in JSON, hence the \\" quoting.
            pattern = re.compile(r'<div class=\\"W_pages\\">(.*?)<\\/div>',
                                 re.S)
            reGroups = pattern.findall(txt)
            if not reGroups:
                # NOTE(review): this yields requestInfo here AND falls through
                # to the final yield, so consumers see it twice on this path.
                logger.error('re find li_1 clearfix error')
                yield requestInfo, None
            if reGroups:
                allPagesTxt = reGroups[0]
                # NOTE(review): the \\ before the capture group appears to
                # consume the backslash of a JSON-escaped '\/' starting the
                # href — confirm against a live payload.
                pattern = re.compile(r'href=\\"\\(.*?)\\">', re.S)
                reGroups = pattern.findall(allPagesTxt)

                # The second-to-last pager link carries the highest page number.
                maxPageTxt = reGroups[-2]
                logger.debug(maxPageTxt)
                maxPageReGroups = re.search(r'&page=(\d*)', maxPageTxt)
                maxPage = int(maxPageReGroups.group(1))
                logger.debug(maxPageReGroups.group(1))
                # Turn "...&page=<max>" into a template with a {} placeholder.
                strFormat = maxPageTxt.replace('page={}'.format(maxPage),
                                               'page={}')
                logger.debug(strFormat)

                # Emit one follow-up request per pagination page 1..maxPage.
                for i in range(1, maxPage + 1):
                    url = strFormat.format(i)
                    url = 'https://d.weibo.com{}'.format(url)
                    logger.debug(url)
                    resultInfo = request_variable_init(url)
                    resultInfo.requestName = 'weibo_http_get_navigation'
                    yield resultInfo, None
        else:
            logger.error('request error http code:{}'.format(
                response.status_code))
            requestInfo.status = response.status_code if response else 0
    except Exception as e:
        # NOTE(review): if request_variable_init itself raised, requestInfo is
        # unbound here and the next line raises NameError inside the generator.
        logger.exception(sys.exc_info())
        requestInfo.status = 0

    yield requestInfo, None
Esempio n. 7
0
def weibo_http_post_login_location(su, sp, nonce, rsakv, servertime, pcid=None, verify=None, session=None):
    '''
        POST the SSO login form and parse the redirect out of the
        "location.replace(...)" snippet in the response body.
    :param su: base64-encoded user name.
    :param sp: RSA-encrypted password.
    :param nonce: prelogin nonce.
    :param rsakv: prelogin RSA key version.
    :param servertime: prelogin server timestamp.
    :param pcid: captcha id (only when a captcha was requested).
    :param verify: captcha answer entered by the operator.
    :param session: optional requests session carrying cookies.
    :return: dict of query parameters parsed from the redirect URL (retcode,
        ticket, r, ...) or None on failure.
    '''
    url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)'
    logger.debug(url)

    try:
        datas={
            'entry':'weibo',
            'gateway':'1',
            'from':'',
            'savestate': '7',
            'qrcode_flag': 'false',
            'useticket': '1',
            'pagerefer':'https://login.sina.com.cn/crossdomain2.php?action=logout&r=https%3A%2F%2Fpassport.weibo.com%2Fwbsso%2Flogout%3Fr%3Dhttps%253A%252F%252Fweibo.com%26returntype%3D1',
            'vsnf': '1',
            'su': su,
            'service': 'miniblog',
            'servertime': servertime,
            'nonce': nonce,
            'pwencode':'rsa2',
            'rsakv': rsakv,
            'sp':sp,
            'sr':'1366*768',
            'encoding':'UTF-8',
            'prelt':'115',
            'url':'https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype':'META'
        }
        if verify:
            # Captcha fields are only sent when a captcha was shown.
            datas['pcid'] = pcid
            datas['door'] = verify
        logger.debug(datas)
        if session:
            response = session.post(url, headers=headers, data=datas, verify=False)
        else:
            response = requests.post(url, headers=headers, data=datas, verify=False)
        if response and response.status_code==200:
            txt = response.content.decode('gbk')
            logger.debug(txt)
            # FIX: raw string — the old non-raw literal relied on Python
            # passing unknown escapes through (SyntaxWarning on modern Python).
            reGroups = re.match(r'.*location.replace\("(.*)"\);.*', txt, re.S)
            locationDict = None
            if reGroups:
                locationUrl = reGroups.group(1)
                locationUrl = unquote_plus(locationUrl)
                logger.debug('unquote url {}'.format(locationUrl))

                # The login results ride in the redirect URL's query string.
                locationDict = parse_qs(urlparse(locationUrl).query)

            return locationDict
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            return None
    except Exception:
        logger.exception(sys.exc_info())
        return None
Esempio n. 8
0
def get_artist_music_list_xhr(url):
    """Generator: parse one XHR page of an artist's song list.

    Yields one (requestInfo, None) per song, pointing at the playAAC JSONP
    endpoint; the page's own requestInfo is always yielded last with its
    final status.
    """
    headers['Referer'] = url

    # FIX: bind requestInfo before the try block; previously an exception
    # raised before the assignment left it unbound at the final yield.
    requestInfo = request_variable_init(url)
    requestInfo.requestName = 'get_artist_music_list_xhr'

    try:
        logger.debug('request url {}'.format(url))

        response = requests.get(url, headers=headers)

        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            jsonObj = json.loads(response.content.decode('utf8'))
            htmlText = jsonObj['data']['html']

            # FIX: raw string for the regex — the old literal relied on Python
            # passing \d/\w/\S through unchanged (SyntaxWarning on 3.12+).
            pattern = re.compile(r'<a href="/song/(\d+)" target="_blank" class="namelink \w*" title="(\S*)" a-tj', re.S)
            reGroups = pattern.findall(htmlText)

            if reGroups:
                # Fixed callback/cache-bust values copied from a browser trace.
                jQuery = '17204780742719340729_1586053549318'
                item = '1586053553445'
                for songid, k in reGroups:
                    url = 'http://musicapi.taihe.com/v1/restserver/ting?method=baidu.ting.song.playAAC&format=jsonp&callback=jQuery{}&songid={}&from=web&_={}'.format(jQuery, songid, item)
                    resultInfo = request_variable_init(url)
                    resultInfo.requestName = 'get_media_info_js_request'
                    yield resultInfo, None
            else:
                requestInfo.status = 0
                logger.error('parser music list xr error')
        else:
            requestInfo.status = response.status_code if response else 0
            logger.error('requests error {}'.format(response if response is None else response.status_code))

    except Exception:
        logger.exception(sys.exc_info())
    yield requestInfo, None
Esempio n. 9
0
def get_artist_info(url):
    """Fetch the artist-info endpoint; the payload is currently not parsed.

    Logs an error on any non-200 answer; swallows and logs exceptions.
    """
    # e.g. http://music.taihe.com/data/tingapi/v1/restserver/ting?method=baidu.ting.artist.getInfo&from=web&tinguid=1097
    headers['Referer'] = url

    try:
        logger.debug('request url {}'.format(url))
        response = requests.get(url, headers=headers)

        ok = bool(response) and response.status_code == 200
        if not ok:
            logger.error('requests error {}'.format(response if response is None else response.status_code))
    except Exception:
        logger.exception(sys.exc_info())
Esempio n. 10
0
def get_artist_music_list(url):
    """Generator: read an artist page, extract the song total and page size,
    and yield one (requestInfo, None) per XHR page request; the artist page's
    own requestInfo is always yielded last with its final status.
    """
    headers['Referer'] = url

    # FIX: bind requestInfo before the try block; previously an exception
    # raised before the assignment made the except handler (and the final
    # yield) crash with NameError.
    requestInfo = request_variable_init(url)
    requestInfo.requestName = 'get_artist_music_list'
    try:
        logger.debug('request url {}'.format(url))
        response = requests.get(url, headers=headers)

        if response and response.status_code==200:
            requestInfo.status = response.status_code

            # FIX: raw string for the regex (\d and \t escapes); the page
            # embeds "'total':N, 'size':M" pairs in an inline script.
            pattern = re.compile(r"'total':(\d+),[ \t]*'size':(\d+)", re.S)
            totalGroup = pattern.findall(response.content.decode('utf8'))

            if totalGroup:
                # Per the original note the pairs are: 0 songs, 1 albums,
                # 2 videos — the first pair is the song list.
                total = int(totalGroup[0][0].strip("'"))
                size = int(totalGroup[0][1].strip("'"))

                ting_uid = url[url.rindex('/')+1:]
                for i in range(0, total, size):
                    # One XHR request per page of `size` songs.
                    xhrUrl = 'http://music.taihe.com/data/user/getsongs?start={}&size={}&ting_uid={}&r=0.196355769444312541586235172159'.format(i,size,ting_uid)
                    resultInfo = request_variable_init(xhrUrl)
                    resultInfo.requestName = 'get_artist_music_list_xhr'
                    yield resultInfo, None
            else:
                requestInfo.status = 0
                logger.error('parser music list none')
        else:
            requestInfo.status = response.status_code if response else 0
            logger.error('requests error {}'.format( response if response is None else response.status_code))
    except Exception:
        requestInfo.status = 0
        logger.exception(sys.exc_info())

    yield requestInfo,None
Esempio n. 11
0
def get_artist_list(url, filename=None):
    """Return a mapping of artist id -> artist name scraped from *url*.

    When *filename* points at a non-empty JSON file, the cached mapping is
    loaded from disk instead of hitting the network; after a fresh scrape the
    result is written back to *filename* (when given). Returns None when the
    scrape itself raises.
    """
    headers['Referer'] = url

    artists = {}

    # Serve from the on-disk JSON cache when one is available.
    if filename and os.path.exists(filename) and os.path.getsize(filename) > 0:
        try:
            with open(filename, 'r', encoding='utf8') as fp:
                return json.load(fp)
        except Exception:
            logger.exception(sys.exc_info())
            raise

    try:
        logger.debug('request url {}'.format(url))
        response = requests.get(url, headers=headers)

        if response and response.status_code == 200:
            soup = BeautifulSoup(response.content.decode('utf8'), 'html5lib')

            container = (soup.find('div', class_='music-body clearfix')
                             .find('div', class_='main-body')
                             .find('ul', class_='container'))
            if container:
                anchors = container.find_all(
                    'a',
                    {'href': re.compile("^.*/[0-9]*$"),
                     'title': re.compile("^.*$")})
                for anchor in anchors:
                    # The artist id is taken from offset 8 of the href —
                    # presumably skipping a fixed "/artist/"-style prefix.
                    artists[anchor['href'][8:]] = anchor['title']
            else:
                logger.error('parser artist list none')
        else:
            logger.error('requests error {}'.format(response if response is None else response.status_code))

    except Exception:
        logger.exception(sys.exc_info())
        return None

    # Persist the freshly scraped mapping for next time.
    if filename:
        try:
            with open(filename, 'w', encoding='utf8') as fp:
                json.dump(artists, fp, ensure_ascii=False, indent=4)
        except Exception:
            logger.exception(sys.exc_info())
            raise
    return artists
Esempio n. 12
0
def request_workflow_thread():
    """Worker loop: pull (priority, requestInfo, param) items off requestQueue,
    dispatch them to the handler named by requestInfo.requestName, persist any
    media/artist rows, and re-queue follow-up requests (status == 999).

    Exits when the queue stays empty for the 10s timeout or a None item
    (shutdown signal) arrives.
    """
    while True:
        try:
            priority,requestInfo,param = requestQueue.get(block=True, timeout=10)
            requestQueue.task_done()
            logger.debug('PriorityQueue size {}'.format(requestQueue.qsize()))
        except Exception:
            # Queue stayed empty for the timeout window: treat as "done".
            logger.exception(sys.exc_info())
            logger.error('request_workflow_thread queue empty')
            break

        if requestInfo and requestInfo.requestName:
            logger.debug('run {} params {}'.format(requestInfo.requestName,requestInfo.requestUrl))
            # FIX: resolve the handler once via globals() instead of eval().
            # eval() of queue-supplied text could execute arbitrary code; a
            # dict lookup only resolves module-level names.
            handler = globals()[requestInfo.requestName]
            if requestInfo.requestName == 'down_media_file':
                results = handler(requestInfo.requestUrl, param)
            else:
                results = handler(requestInfo.requestUrl)
            if isgeneratorfunction(handler):
                logger.debug('isgeneratorfunction {} true'.format(requestInfo.requestName))
                for resultInfo,result in results:
                    if resultInfo.status == 999:
                        # status 999 marks a follow-up request: re-queue it.
                        logger.debug('PriorityQueue put {},{} '.format(resultInfo.requestName,resultInfo.requestUrl))
                        requestQueue.put((PRIORITYDEFINE[resultInfo.requestName],resultInfo,result))
                    else:
                        # FIX: the original tested isinstance(result, MediaInfo)
                        # twice; collapsed to one tuple check.
                        if isinstance(result, (MediaInfo, ArtistInfo)):
                            dbsession.add(result)
                    request_info_update_insert(requestInfo)
                    try:
                        dbsession.commit()
                    except Exception:
                        logger.exception(sys.exc_info())
                        logger.error('dbsession error')
                        dbsession.rollback()
            else:
                logger.debug('isgeneratorfunction {} false'.format(requestInfo.requestName))
                resultInfo, result = results

                if isinstance(result, (MediaInfo, ArtistInfo)):
                    dbsession.add(result)

                request_info_update_insert(requestInfo)
                if resultInfo.status != 999:
                    logger.debug('dbsession add resultInfo {}'.format(resultInfo.urlId))
                else:
                    logger.debug('PriorityQueue put {},{} '.format(resultInfo.requestName, resultInfo.requestUrl))
                    requestQueue.put((PRIORITYDEFINE[resultInfo.requestName],resultInfo,result))

                try:
                    dbsession.commit()
                except Exception:
                    logger.exception(sys.exc_info())
                    logger.error('dbsession error')
                    dbsession.rollback()

        else:
            # A None item is the shutdown signal.
            logger.error('request_workflow_thread requestInfo none')
            break
Esempio n. 13
0
def weibo_http_get_home(uniqueid, session=None):
    """Request the logged-in user's weibo home page.

    :param uniqueid: numeric account id embedded in the home-page URL.
    :param session: optional requests session carrying the login cookies.
    :return: True when the page answers HTTP 200, otherwise None.
    """
    url = 'https://weibo.com/u/{}/home?wvr=5&lf=reg'.format(uniqueid)
    logger.debug(url)

    try:
        headers['Referer'] = 'https://weibo.com/'
        client = session if session else requests
        response = client.get(url, headers=headers, verify=False)
        if not (response and response.status_code == 200):
            logger.error('request error http code:{}'.format(response.status_code))
            return None
        return True
    except Exception:
        logger.exception(sys.exc_info())
        return None
Esempio n. 14
0
def weibo_login(user, password, session=None):
    """Run the full weibo SSO login flow.

    Steps: prelogin (RSA params) -> optional captcha -> login POST ->
    cross-domain ticket exchange -> home-page request.

    :param user: weibo login user name.
    :param password: plaintext password (RSA-encrypted before sending).
    :param session: optional requests session that accumulates the cookies.
    :return: True on success, None on any failure.
    """
    su = user_base64(user)
    preLoginDict = weibo_http_get_raskey(su, session=session)
    logger.debug('dict:{}'.format(preLoginDict))

    # FIX: prelogin can fail and return None; previously the subscripts
    # below crashed with TypeError instead of reporting a login failure.
    if not preLoginDict:
        logger.error('weibo login error')
        return None

    nonce = preLoginDict['nonce']
    pubkey = preLoginDict['pubkey']
    rsakv = preLoginDict['rsakv']
    servertime = preLoginDict['servertime']

    sp = password_rsa(password, servertime, nonce, pubkey)
    verify = None
    pcid = None
    if 'showpin' in preLoginDict.keys():
        pcid = preLoginDict['pcid']
        verify_file = weibo_http_get_verify_pic(pcid, session=session)
        # The captcha must be solved manually by the operator.
        logger.error('verify file {}'.format(verify_file))
        verify = input("input verfy code:")

    locationParams = weibo_http_post_login_location(su, sp, nonce, rsakv, servertime, pcid, verify, session=session)
    if locationParams:
        retcode = locationParams['retcode'][0]  # presence-checked; value unused
        ticket = locationParams['ticket'][0]
        rParams = parse_qs(urlparse(locationParams['r'][0]).query)
        ssosavestate = rParams['ssosavestate'][0]
    else:
        logger.error('weibo login error')
        return None

    resultDict = weibo_http_get_home_uniqueid(ticket, ssosavestate, session)

    if resultDict is None:
        logger.error('weibo login error')
        return None

    logger.debug("weibo login uniqueid {}".format(resultDict))
    uniqueid = resultDict['userinfo']['uniqueid']
    # FIX: propagate the logged-in session; previously the home request went
    # out without the session cookies, defeating the whole login flow.
    return weibo_http_get_home(uniqueid, session=session)
Esempio n. 15
0
def request_workflow_thread():
    """Worker loop (weibo variant): round-robins a login session from
    sessionQueue, pulls (priority, requestInfo, param) items off requestQueue,
    dispatches them to the handler named by requestInfo.requestName, persists
    WeiboUser rows and re-queues follow-up requests (status == 999).

    NOTE(review): this function shares its name with an earlier
    request_workflow_thread definition in this module — the later definition
    wins at import time; confirm which one is intended.
    """
    while True:
        try:
            # Take a session and immediately put it back: a simple round-robin
            # that shares login sessions among worker threads.
            session = sessionQueue.get(block=True, timeout=10)
            sessionQueue.task_done()
            sessionQueue.put(session)

            priority, requestInfo, param = requestQueue.get(block=True,
                                                            timeout=10)
            requestQueue.task_done()
            logger.debug('PriorityQueue size {}'.format(requestQueue.qsize()))
        except Exception as e:
            # Queue stayed empty for the timeout window: treat as "done".
            logger.exception(sys.exc_info())
            logger.error('request_workflow_thread queue empty')
            break

        # Throttle between requests — presumably to stay under rate limits.
        time.sleep(20)
        if requestInfo and requestInfo.requestName:
            logger.debug('run {} params {}'.format(requestInfo.requestName,
                                                   requestInfo.requestUrl))
            # NOTE(review): eval() of a queue-supplied name can execute
            # arbitrary text; a globals()[name] lookup would be safer.
            results = eval(requestInfo.requestName)(requestInfo.requestUrl,
                                                    session)

            # Check whether the handler is a generator function.
            if isgeneratorfunction(eval(requestInfo.requestName)):
                logger.debug('isgeneratorfunction {} true'.format(
                    requestInfo.requestName))
                for resultInfo, result in results:
                    if resultInfo.status == 999:
                        # status 999 marks a follow-up request: re-queue it.
                        logger.debug('PriorityQueue put {},{} '.format(
                            resultInfo.requestName, resultInfo.requestUrl))
                        requestQueue.put(
                            (PRIORITYDEFINE[resultInfo.requestName],
                             resultInfo, result))
                    else:
                        # Persist scraped user rows (single object or list).
                        if isinstance(result, list):
                            for item in result:
                                if isinstance(item, WeiboUser):
                                    dbsession.add(item)
                        elif isinstance(result, WeiboUser):
                            dbsession.add(result)
                    request_info_update_insert(requestInfo)
                    try:
                        dbsession.commit()
                    except Exception as e:
                        logger.exception(sys.exc_info())
                        logger.error('dbsession error')
                        dbsession.rollback()
            else:
                logger.debug('isgeneratorfunction {} false'.format(
                    requestInfo.requestName))
                resultInfo, result = results

                if isinstance(result, list):
                    for item in result:
                        if isinstance(item, WeiboUser):
                            dbsession.add(item)
                elif isinstance(result, WeiboUser):
                    dbsession.add(result)

                request_info_update_insert(requestInfo)
                if resultInfo.status != 999:
                    logger.debug('dbsession add resultInfo {}'.format(
                        resultInfo.urlId))
                else:
                    logger.debug('PriorityQueue put {},{} '.format(
                        resultInfo.requestName, resultInfo.requestUrl))
                    requestQueue.put((PRIORITYDEFINE[resultInfo.requestName],
                                      resultInfo, result))

                try:
                    dbsession.commit()
                except Exception as e:
                    logger.exception(sys.exc_info())
                    logger.error('dbsession error')
                    dbsession.rollback()

        else:
            # A None item acts as the shutdown signal.
            logger.error('request_workflow_thread requestInfo none')
            break
Esempio n. 16
0
def weibo_http_get_navigation_page_list(url, session=None):
    '''
        Fetch the navigation page and, for each category tab found, yield a
        (requestInfo, None) pointing at that tab's page-URL collector.
    :param url: navigation page URL.
    :param session: optional requests session carrying login cookies.
    :return: generator of (requestInfo, None) tuples; the navigation page's
        own requestInfo is yielded last with its final status.
    '''
    logger.debug(url)
    try:
        headers['Referer'] = 'https://weibo.com/'
        requestInfo = request_variable_init(url)
        requestInfo.requestName = 'weibo_http_get_navigation_page_list'

        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)

        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            txt = response.content.decode('utf-8')
            # logger.debug(txt)
            # re matching is greedy by default; the ? suffix makes it
            # non-greedy. The HTML is embedded in JSON, hence the \\" quoting.
            pattern = re.compile(r'<li class=\\"li_1 clearfix\\">(.*?)<\\/li>',
                                 re.S)
            reGroups = pattern.findall(txt)
            if not reGroups:
                # NOTE(review): this yields requestInfo here AND falls through
                # to the final yield, so consumers see it twice on this path.
                logger.error('re find li_1 clearfix error')
                yield requestInfo, None
            else:
                navigationDict = {}
                for item in reGroups:
                    # Category title span; trailing ':' is stripped to form
                    # the dict key.
                    pattern = re.compile(
                        r'<span class=\\"pt_title S_txt2\\">(.*?)<\\/span>',
                        re.S)
                    reTags = pattern.findall(item)
                    key = reTags[0].replace(':', '')
                    # (href, label) pairs for each sub-tab of the category.
                    pattern = re.compile(
                        r'<a target=\\"_blank\\" href=\\"(.*?)\\".*?<span.*?<\\/span>(.*?)<\\/a>',
                        re.S)
                    reTags = pattern.findall(item)
                    value = [(v[0], v[1].replace('\\t', '').strip())
                             for v in reTags]
                    # Synthesize an "all" entry from the first sub-tab: the
                    # id suffix "_0" selects the whole category; '全部' = "all".
                    id, name = value[0]
                    id = id[:id.rindex('_')] + '_0'
                    name = '全部'
                    value.insert(0, (id, name))
                    navigationDict[key] = value

                    # Queue a follow-up request for the category's "all" page.
                    url = 'https://d.weibo.com/{}#'.format(id)
                    resultInfo = request_variable_init(url)
                    resultInfo.requestName = 'weibo_http_get_navigation_page_url'
                    yield resultInfo, None
            # logger.debug(navigationDict)
        else:
            logger.error('request error http code:{}'.format(
                response.status_code))
            requestInfo.status = response.status_code if response else 0
    except Exception as e:
        # NOTE(review): if request_variable_init itself raised, requestInfo is
        # unbound here and the next line raises NameError inside the generator.
        logger.exception(sys.exc_info())
        requestInfo.status = 0

    yield requestInfo, None
Esempio n. 17
0
def weibo_http_get_navigation(url, session=None):
    '''
    Fetch a weibo.com follow/navigation page and parse the followed accounts.

    :param url: page URL to request
    :param session: optional requests session to reuse (cookies/keep-alive);
                    a plain requests.get is issued when None
    :return: (requestInfo, userList) -- userList is a list of WeiboUser on
             success, None on any request or parse failure
    '''
    # Build the result holder before the try so the except branch and the
    # final return can always reference it (previously, an exception raised
    # before this assignment caused an UnboundLocalError in the handler).
    requestInfo = request_variable_init(url)
    requestInfo.requestName = 'weibo_http_get_navigation'
    try:
        headers['Referer'] = 'https://weibo.com/'
        logger.debug('request url {}'.format(url))

        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)

        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            txt = response.content.decode('utf-8')
            # Each followed account is one <li class="follow_item S_line2"> block
            # (the HTML arrives with escaped quotes, hence the \\" in the regex).
            pattern = re.compile(
                r'<li class=\\"follow_item S_line2\\">(.*?)<\\/li>', re.S)
            reGroups = pattern.findall(txt)
            if not reGroups:
                # The original message said 'li_1 clearfix' (copy-paste from a
                # different parser); report the pattern actually searched for.
                logger.error('re find follow_item S_line2 error')
                return requestInfo, None
            else:
                logger.debug(reGroups)
                userList = []
                for item in reGroups:
                    user = WeiboUser()
                    # Avatar image URL (extracted but currently unused).
                    pattern = re.compile(
                        r'<dt class=\\"mod_pic\\">.*?src=\\"(.*?)\\".*<\\/dt>',
                        re.S)
                    picReGroups = pattern.findall(item)

                    # Display name and user id live inside the info_name div.
                    pattern = re.compile(
                        r'<div class=\\"info_name W_fb W_f14\\">(.*?)<\\/div>',
                        re.S)
                    infoNameGroups = pattern.findall(item)
                    if infoNameGroups:
                        txt = infoNameGroups[0]
                        tags = re.findall(
                            r'.*<strong.*?usercard=\\"(.*?)\\"\s*>(.*?)<\\/strong>.*',
                            txt, re.S)
                        if tags:
                            user.username = tags[0][1]
                            user.userid = tags[0][0]
                        # <i> icon classes encode verified/gender/member flags.
                        tags = re.findall(r'<i.*?class=\\"(.*?)\\".*?><\\/i>',
                                          txt, re.S)
                        if tags:
                            for tag in tags:
                                if 'icon_approve' in tag:
                                    # Personally verified weibo account
                                    user.verify = '1'
                                elif 'icon_female' in tag:
                                    user.gender = 'female'
                                elif 'icon_male' in tag:
                                    user.gender = 'male'
                                elif 'icon_member' in tag:
                                    # Paying weibo member
                                    user.member = '1'

                    # Follow / fans / post counters: three <em class="count">
                    # values in document order.
                    pattern = re.compile(
                        r'<div class=\\"info_connect\\">.*?<em class=\\"count\\">(.*?)<\\/em>.*?<em class=\\"count\\">(.*?)<\\/em>.*?<em class=\\"count\\">(.*?)<\\/em>.*?<\\/div>',
                        re.S)
                    infoConnectGroups = pattern.findall(item)
                    user.focusnumber, user.fansnumber, user.weibonumber = infoConnectGroups[
                        0]

                    # Location: "province city", or a single token used for both.
                    pattern = re.compile(
                        r'<div class=\\"info_add\\">.*?<span>(.*?)<\\/span>.*?<\\/div>',
                        re.S)
                    infoAddGroups = pattern.findall(item)
                    adds = infoAddGroups[0].split(' ')
                    if len(adds) == 2:
                        user.province = adds[0]
                        user.city = adds[1]
                    else:
                        user.province = adds[0]
                        user.city = adds[0]

                    # Short bio / self-introduction text.
                    pattern = re.compile(
                        r'<div class=\\"info_intro\\">.*?<span>(.*?)<\\/span>.*?<\\/div>',
                        re.S)
                    infoIntroGroups = pattern.findall(item)
                    user.intro = infoIntroGroups[0]
                    userList.append(user)

                return requestInfo, userList
        else:
            # response may be None/falsy here; never dereference it blindly
            # (the original log line raised AttributeError on None, which the
            # outer except then swallowed, masking the real HTTP status).
            code = response.status_code if response else 0
            logger.error('request error http code:{}'.format(code))
            requestInfo.status = code
    except Exception as e:
        logger.exception(sys.exc_info())
        requestInfo.status = 0

    return requestInfo, None
Esempio n. 18
0
def get_media_info_js_request(url):
    '''
    Fetch a JSONP song-info endpoint and build a MediaInfo from its payload.

    :param url: JSONP endpoint URL (response body looks like ``jQuery...({...});``)
    :return: (requestInfo, mediaInfo) -- on success mediaInfo is populated and
             requestInfo is re-targeted at the media URL for the follow-up
             'down_media_file' step; on failure mediaInfo is None and
             requestInfo.status carries the error code (-3/-2/-1/0).
    '''
    logger.debug('request url {}'.format(url))

    requestInfo = request_variable_init(url)
    requestInfo.requestName = 'get_media_info_js_request'

    headers['Referer'] = url
    s_time = time.time()
    mediaInfo = None
    try:
        # Context manager guarantees the session is closed (the original
        # created a session and leaked it).
        with requests.session() as httpSession:
            response = httpSession.get(url, headers=headers)

        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            # Strip the JSONP wrapper: jQuery<anything>({...});
            # Raw string silences the invalid-escape DeprecationWarning the
            # non-raw original produced; the compiled pattern is unchanged.
            pattern = re.compile(r'^jQuery\S*\(({.*})\);$')
            reGroups = pattern.match(response.text)
            if reGroups:
                jsonStr = reGroups.group(1)
            else:
                requestInfo.status = 0
                logger.error('parser js none')
                return requestInfo, None

            jsonObj = json.loads(jsonStr)

            mediaInfo = MediaInfo()
            mediaInfo.mediaUrl = jsonObj['bitrate']['show_link']
            # Media id is the md5 of the media URL -- a stable de-dup key.
            md5 = hashlib.md5()
            md5.update(mediaInfo.mediaUrl.encode('utf-8'))
            mediaInfo.mediaId = md5.hexdigest()

            # TODO(review): required fields may be missing/empty in the
            # payload; a KeyError here falls through to the generic handler.
            mediaInfo.mediaName = jsonObj['songinfo']['title']
            mediaInfo.mediaLang = jsonObj['songinfo']['language']
            mediaInfo.country = jsonObj['songinfo']['country']
            mediaInfo.proxycompany = jsonObj['songinfo']['si_proxycompany']
            mediaInfo.compose = jsonObj['songinfo']['compose']
            mediaInfo.writer = jsonObj['songinfo']['songwriting']
            mediaInfo.author = jsonObj['songinfo']['author']
            mediaInfo.publishTime = jsonObj['songinfo']['publishtime']
            mediaInfo.albumName = jsonObj['songinfo']['album_title']
            mediaInfo.lrcUrl = jsonObj['songinfo']['lrclink']
            mediaInfo.mediaSize = jsonObj['bitrate']['file_size']
            mediaInfo.mediaFormat = jsonObj['bitrate']['file_format']
            mediaInfo.albumId = jsonObj['songinfo']['album_id']
            # del_status: 1 = removed for copyright reasons, 0 = playable
            mediaInfo.useStatus = jsonObj['songinfo']['del_status']

            mediaInfo.source = urllib.parse.urlparse(mediaInfo.mediaUrl).netloc
            e_time = time.time()
            mediaInfo.useTime = e_time - s_time
            logger.debug('media url {}'.format(mediaInfo.mediaUrl))

            # Re-target the request info at the resolved media URL so the
            # caller can chain straight into the download step.
            requestInfo = request_variable_init(mediaInfo.mediaUrl)
            requestInfo.requestName = 'down_media_file'

        else:
            logger.error('requests error {}'.format(response if response is None else response.status_code))
            requestInfo.status = response.status_code if response else 0

    except (requests.exceptions.ConnectTimeout,
            requests.exceptions.ReadTimeout) as e:
        # Timed out connecting, or waiting for the first response byte
        # (the original had two identical handlers for these).
        requestInfo.status = -3
        logger.exception(sys.exc_info())
    except requests.exceptions.ConnectionError as e:
        # Network environment failure or server failure.
        requestInfo.status = -2
        logger.exception(sys.exc_info())
    except requests.exceptions.RequestException as e:
        requestInfo.status = -1
        logger.exception(sys.exc_info())
    except Exception as e:
        # Parse/JSON errors: report status 0 with no media info.
        logger.exception(sys.exc_info())
        requestInfo.status = 0
        return requestInfo, None

    return requestInfo, mediaInfo