Beispiel #1
0
def weibo_http_get_tophot_list(session=None):
    '''
        Fetch the Weibo "top hot search" summary page and extract the entries.
    :param session: optional requests.Session carrying login cookies
    :return: list of (url, topname) tuples on success, None on failure
    '''
    url = 'https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6'
    logger.debug(url)

    try:
        headers['Referer'] = 'https://weibo.com/'
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)

        if response and response.status_code == 200:
            txt = response.content.decode('utf8')
            logger.debug(txt)
            pattern = re.compile(
                r'<td class="td-02">\s*?<a href="(.*?)" target="_blank">(\S*?)</a>.*?</td>',
                re.S)
            reGroups = pattern.findall(txt)
            # BUG FIX: the original condition was inverted -- it returned None
            # when matches WERE found and the empty list otherwise.
            if not reGroups:
                return None
            return reGroups
        else:
            logger.error('request error http code:{}'.format(
                response.status_code))
            return None
    except Exception as e:
        logger.exception(sys.exc_info())
        return None
Beispiel #2
0
def weibo_http_get_verify_pic(pcid, session=None):
    '''
    Download the login captcha image for *pcid* and store it under img/.
    :param pcid: captcha id returned by the prelogin call
    :param session: optional requests.Session to reuse login cookies
    :return: path of the saved image file, or None on any failure
    '''
    seed = math.floor(random.random() * 100000000)
    url = 'https://login.sina.com.cn/cgi/pin.php?r={}&s=0&p={}'.format(seed, pcid)
    logger.debug(url)
    try:
        headers['Referer'] = 'https://weibo.com/'
        client = session if session else requests
        response = client.get(url, headers=headers, verify=False)
        if not (response and response.status_code == 200):
            logger.error('request error http code:{}'.format(response.status_code))
            return None
        filename = 'img/{}.png'.format(pcid)
        with open(filename, 'wb') as fp:
            fp.write(response.content)
        return filename
    except Exception as e:
        logger.exception(sys.exc_info())
        return None
Beispiel #3
0
def weibo_login_cookie(user, password):
    '''
    Return a logged-in requests.Session for *user*, reusing the cached cookie
    file when it was written today; otherwise perform a fresh login and
    persist the new cookies.

    :param user: weibo account name
    :param password: weibo account password
    :return: requests.Session on success, False when login fails
    '''
    session = requests.session()
    cookfile = '{}{}'.format(user, COOKFILENAME)
    if os.path.exists(cookfile):
        # os.stat: st_atime (access), st_mtime (modify), st_ctime (create)
        filestat = os.stat(cookfile)
        # BUG FIX: the original compared the UTC date of st_mtime with the
        # LOCAL current date, so cookies written earlier today looked stale
        # in any timezone ahead of UTC.  Compare local date with local date.
        modifyDt = datetime.datetime.fromtimestamp(filestat.st_mtime).date()
        nowDt = datetime.datetime.now().date()
        # Leftover print() replaced with a debug log entry.
        logger.debug('cookie dates: modify={} now={}'.format(modifyDt, nowDt))
        if modifyDt < nowDt:
            logger.debug('cookie file dated')
        else:
            with open(cookfile, 'r') as f:
                cookieDict = json.load(f)
            session.cookies = requests.utils.cookiejar_from_dict(cookieDict)
            return session

    logining = weibo_login(user, password, session)

    if logining:
        # Persist the fresh cookies for reuse on the next run today.
        cookies = requests.utils.dict_from_cookiejar(session.cookies)
        with open(cookfile, 'w') as f:
            f.write(json.dumps(cookies))
        return session
    else:
        return False
Beispiel #4
0
def scrapy_work():
    '''Start worker threads that drain requestQueue, then wait for them all.'''
    max_workers = 5
    pool = ThreadPoolExecutor(max_workers=max_workers)
    futures = [pool.submit(request_workflow_thread) for _ in range(max_workers)]

    # Wait until every queued item has been handed out, then until every
    # worker thread has actually finished.
    requestQueue.join()
    for _ in as_completed(futures):
        logger.debug('one thread over')

    logger.debug('main threads over')
Beispiel #5
0
def weibo_http_get_home_uniqueid(ticket, ssosavestate, session=None):
    '''
    Exchange the SSO *ticket* for the account's uniqueid via the
    cross-domain callback endpoint.

    :param ticket: ticket value from the login-location redirect
    :param ssosavestate: ssosavestate value from the login-location redirect
    :param session: optional requests.Session carrying login cookies
    :return: dict parsed from the callback payload, or None on failure
    '''
    # BUG FIX: the URL hard-coded `_=1533119634900` while still passing
    # str(int(time.time()*1000)) as an unused third format() argument; the
    # cache-buster now actually carries the current timestamp.
    url = 'https://passport.weibo.com/wbsso/login?ticket={}&ssosavestate={}&callback=sinaSSOController.doCrossDomainCallBack&scriptId=ssoscript0&client=ssologin.js(v1.4.19)&_={}'.format(
        ticket, ssosavestate, str(int(time.time() * 1000)))
    logger.debug(url)

    try:
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)
        if response and response.status_code == 200:
            txt = response.content.decode('gbk')
            logger.debug(txt)
            # Strip the JSONP wrapper and massage JS literals into Python.
            txt = txt.replace('sinaSSOController.doCrossDomainCallBack', '')
            txt = txt.replace(';', '')
            txt = txt.replace('true', '1')
            # NOTE(security): eval() on server-supplied text is dangerous --
            # a compromised endpoint could execute arbitrary code here.
            resultDict = eval(txt)
            return resultDict
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            return None
    except Exception as e:
        logger.exception(sys.exc_info())
        return None
Beispiel #6
0
def weibo_http_get_raskey(user, session=None):
    '''
        Call the prelogin endpoint to obtain the RSA material (nonce, pubkey,
        rsakv, servertime) needed to encrypt the login password.
    :param user: base64-encoded weibo login user name (su)
    :param session: optional requests.Session carrying cookies
    :return: dict of prelogin values on success, None on failure
    '''
    url = "https://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su={}&rsakt=mod&client=ssologin.js(v1.4.19)&_={}".format(user, str(int(time.time() * 1000)))
    logger.debug(url)

    try:
        headers['Referer'] = 'https://weibo.com/'
        # BUG FIX: the session branch called requests.get(), silently
        # discarding the session and its cookies; it now uses session.get().
        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)
        if response and response.status_code == 200:
            text = response.content.decode('utf-8')
            logger.debug(text)
            # NOTE(security): eval() on server-supplied text; json.loads on
            # the stripped JSONP payload would be safer.
            result = eval(text.replace('sinaSSOController.preloginCallBack', ''))
            return result
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            return None
    except Exception as e:
        logger.exception(sys.exc_info())
        return None
Beispiel #7
0
def weibo_http_get_navigation_page_url(url, session=None):
    '''
        Generator: scrape the pager of a category-tab listing page and yield
        one follow-up request per page of the listing.
        (Translated from: "get all data urls under the current category tab
        via the tab page".)
    :param url: category tab page url
    :param session: optional requests.Session carrying login cookies
    :return: yields (requestInfo, None) tuples; the terminal yield reports
             the status of this request itself
    '''
    try:
        headers['Referer'] = 'https://weibo.com/'
        logger.debug('request url {}'.format(url))
        requestInfo = request_variable_init(url)
        requestInfo.requestName = 'weibo_http_get_navigation_page_url'

        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)

        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            txt = response.content.decode('utf-8')
            # Regex matching is greedy by default; the trailing ? makes the
            # quantifiers non-greedy.  Quotes are backslash-escaped in the
            # JSON-embedded HTML, hence the \\" in the patterns.
            pattern = re.compile(r'<div class=\\"W_pages\\">(.*?)<\\/div>',
                                 re.S)
            reGroups = pattern.findall(txt)
            if not reGroups:
                logger.error('re find li_1 clearfix error')
                # NOTE(review): no return after this yield, so the terminal
                # yield at the bottom fires as well -- confirm intended.
                yield requestInfo, None
            if reGroups:
                allPagesTxt = reGroups[0]
                pattern = re.compile(r'href=\\"\\(.*?)\\">', re.S)
                reGroups = pattern.findall(allPagesTxt)

                # The second-to-last pager link carries the highest page no.
                maxPageTxt = reGroups[-2]
                logger.debug(maxPageTxt)
                maxPageReGroups = re.search(r'&page=(\d*)', maxPageTxt)
                maxPage = int(maxPageReGroups.group(1))
                logger.debug(maxPageReGroups.group(1))
                # Turn "...&page=N" into a template with a page={} slot.
                strFormat = maxPageTxt.replace('page={}'.format(maxPage),
                                               'page={}')
                logger.debug(strFormat)

                # Emit one request per page, 1..maxPage inclusive.
                for i in range(1, maxPage + 1):
                    url = strFormat.format(i)
                    url = 'https://d.weibo.com{}'.format(url)
                    logger.debug(url)
                    resultInfo = request_variable_init(url)
                    resultInfo.requestName = 'weibo_http_get_navigation'
                    yield resultInfo, None
        else:
            logger.error('request error http code:{}'.format(
                response.status_code))
            requestInfo.status = response.status_code if response else 0
    except Exception as e:
        logger.exception(sys.exc_info())
        # NOTE(review): if request_variable_init() itself raised, requestInfo
        # is unbound here and this line / the final yield raise NameError.
        requestInfo.status = 0

    yield requestInfo, None
Beispiel #8
0
def scrapy_work():
    '''Launch a single queue worker thread and wait for it to finish.'''
    worker_count = 1
    executor = ThreadPoolExecutor(max_workers=worker_count)
    futures = []
    for _ in range(worker_count):
        futures.append(executor.submit(request_workflow_thread))

    # First drain the queue, then wait for the worker to exit.
    requestQueue.join()
    for _ in as_completed(futures):
        logger.debug('one thread over')

    logger.debug('main threads over')
Beispiel #9
0
def get_artist_info(url):
    '''
    Fetch the artist getInfo API endpoint; currently only logs failures --
    the successful response body is not yet consumed.
    :param url: API url like
        http://music.taihe.com/data/tingapi/v1/restserver/ting?method=baidu.ting.artist.getInfo&from=web&tinguid=1097
    '''
    headers['Referer'] = url

    try:
        logger.debug('request url {}'.format(url))
        response = requests.get(url, headers=headers)

        succeeded = bool(response and response.status_code == 200)
        if not succeeded:
            logger.error('requests error {}'.format(response if response is None else response.status_code))
    except Exception as e:
        logger.exception(sys.exc_info())
Beispiel #10
0
def get_artist_music_list_xhr(url):
    '''
    Generator: parse one XHR page of an artist's song list and yield one
    media-info request per song.

    :param url: getsongs XHR url
    :return: yields (requestInfo, None) items, one per song's playAAC API
             call; the terminal yield reports this request's own status
    '''
    headers['Referer'] = url

    # BUG FIX: requestInfo is created before the try block so the terminal
    # `yield requestInfo, None` cannot raise NameError when an exception
    # fires before it was assigned inside the try.
    requestInfo = request_variable_init(url)
    requestInfo.requestName = 'get_artist_music_list_xhr'

    try:
        logger.debug('request url {}'.format(url))
        response = requests.get(url, headers=headers)

        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            jsonObj = json.loads(response.content.decode('utf8'))
            htmlText = jsonObj['data']['html']

            # Raw string (the original plain string used \d/\w/\S escapes,
            # a DeprecationWarning on modern Python); pattern unchanged.
            pattern = re.compile(
                r"<a href=\"/song/(\d+)\" target=\"_blank\" class=\"namelink \w*\" title=\"(\S*)\" a-tj",
                re.S)
            reGroups = pattern.findall(htmlText)

            if reGroups:
                # Fixed JSONP callback / cache-buster values captured from a
                # browser session.
                jQuery = '17204780742719340729_1586053549318'
                item = '1586053553445'
                for songid, k in reGroups:
                    url = 'http://musicapi.taihe.com/v1/restserver/ting?method=baidu.ting.song.playAAC&format=jsonp&callback=jQuery{}&songid={}&from=web&_={}'.format(jQuery, songid, item)
                    resultInfo = request_variable_init(url)
                    resultInfo.requestName = 'get_media_info_js_request'
                    yield resultInfo, None
            else:
                requestInfo.status = 0
                logger.error('parser music list xr error')
        else:
            requestInfo.status = response.status_code if response else 0
            logger.error('requests error {}'.format(response if response is None else response.status_code))

    except Exception as e:
        logger.exception(sys.exc_info())
    yield requestInfo, None
Beispiel #11
0
def down_media_file(url, mediaInfo=None):
    '''
    Download a media file to song/<author>/<name>.<format>, or to an
    md5-of-url named file when *mediaInfo* is absent.

    :param url: media file url
    :param mediaInfo: optional object with author/mediaName/mediaFormat
    :return: (requestInfo, None); requestInfo.status holds the HTTP code on
             success, 0 for local/terminal errors, and negative codes for
             retryable network errors (-3 timeout, -2 connection, -1 other)
    '''
    baseDir = "song"

    requestInfo = request_variable_init(url)
    requestInfo.requestName = 'down_media_file'

    if mediaInfo:
        headers['Referer'] = url

        # Build song/<author>/<name>.<format>, creating the directory chain.
        addr = '{}/{}'.format(baseDir, mediaInfo.author)
        # BUG FIX: os.makedirs(..., exist_ok=True) instead of exists+mkdir,
        # which failed when baseDir was missing and raced under concurrency.
        os.makedirs(addr, exist_ok=True)
        addr = os.path.join(addr, '{}.{}'.format(mediaInfo.mediaName, mediaInfo.mediaFormat))
        logger.debug('media addr {}'.format(addr))
    else:
        # No metadata: name the file after the url's md5 digest.
        md5 = hashlib.md5()
        md5.update(url.encode('utf-8'))
        addr = os.path.join(baseDir, md5.hexdigest())
        logger.debug('media addr {}'.format(addr))

    if os.path.exists(addr):
        logger.debug('media addr {} is exists'.format(addr))
        requestInfo.status = 0
        return requestInfo, None

    # BUG FIX: response is pre-initialised; previously any request exception
    # left it unbound and the status check below raised NameError, clobbering
    # the carefully-set negative retry status.
    response = None
    try:
        logger.debug('request url {}'.format(url))
        response = requests.get(url, headers=headers)
    except requests.exceptions.ConnectTimeout as e:
        # Connect timeout: server did not answer within the allotted time.
        requestInfo.status = -3
        logger.exception(sys.exc_info())
    except requests.exceptions.ReadTimeout as e:
        # Read timeout: connected, but no data before the deadline.
        requestInfo.status = -3
        logger.exception(sys.exc_info())
    except requests.exceptions.ConnectionError as e:
        # Network environment or server failure.
        requestInfo.status = -2
        logger.exception(sys.exc_info())
    except requests.exceptions.RequestException as e:
        requestInfo.status = -1
        logger.exception(sys.exc_info())
    except Exception as e:
        logger.exception(sys.exc_info())
        requestInfo.status = 0

    if response is not None:
        if response.status_code == 200:
            requestInfo.status = response.status_code
            try:
                with open(addr, 'wb') as fp:
                    fp.write(response.content)
            except Exception as e:
                requestInfo.status = 0
                logger.exception(sys.exc_info())
        else:
            logger.error('requests error {}'.format(response.status_code))
            # requests' truthiness: non-ok responses are falsy, so HTTP
            # errors record 0 (terminal) rather than re-queueing forever.
            requestInfo.status = response.status_code if response else 0
    return requestInfo, None
Beispiel #12
0
def get_artist_music_list(url):
    '''
    Generator: read an artist's page to discover the total song count and
    page size, then yield one getsongs XHR request per page.

    :param url: artist page url ending in /<ting_uid>
    :return: yields (requestInfo, None) items; the terminal yield reports
             this request's own status
    '''
    headers['Referer'] = url

    # BUG FIX: requestInfo is created before the try block; previously an
    # early exception made the except handler (and the terminal yield) fail
    # with NameError on the unbound name.
    requestInfo = request_variable_init(url)
    requestInfo.requestName = 'get_artist_music_list'
    try:
        logger.debug('request url {}'.format(url))
        response = requests.get(url, headers=headers)

        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            # Raw string for the regex (the original used plain-string \d).
            pattern = re.compile(r"'total':(\d+),[ \t]*'size':(\d+)", re.S)
            totalGroup = pattern.findall(response.content.decode('utf8'))

            if totalGroup:
                # First match is the songs tab (0 songs, 1 albums, 2 videos).
                total = int(totalGroup[0][0].strip("'"))
                size = int(totalGroup[0][1].strip("'"))

                ting_uid = url[url.rindex('/') + 1:]
                for i in range(0, total, size):
                    # One XHR call per page of `size` songs.
                    xhrUrl = 'http://music.taihe.com/data/user/getsongs?start={}&size={}&ting_uid={}&r=0.196355769444312541586235172159'.format(i, size, ting_uid)
                    resultInfo = request_variable_init(xhrUrl)
                    resultInfo.requestName = 'get_artist_music_list_xhr'
                    yield resultInfo, None
            else:
                requestInfo.status = 0
                logger.error('parser music list none')
        else:
            requestInfo.status = response.status_code if response else 0
            logger.error('requests error {}'.format(response if response is None else response.status_code))
    except Exception as e:
        requestInfo.status = 0
        logger.exception(sys.exc_info())

    yield requestInfo, None
Beispiel #13
0
def get_media_workflow(songid):
    '''
    Full pipeline for one song: fetch its media info via the playAAC JSONP
    API, download the file, and persist request/media records; failed
    network requests are re-queued for retry.
    :param songid: taihe song id
    '''
    # Normal flow: build the playAAC JSONP url.  The callback/cache-buster
    # values are fixed captures from a browser session.
    jQuery = '17204780742719340729_1586053549318'
    item = '1586053553445'
    url = 'http://musicapi.taihe.com/v1/restserver/ting?method=baidu.ting.song.playAAC&format=jsonp&callback=jQuery{}&songid={}&from=web&_={}'.format(jQuery,songid,item)

    requestInfo,info = get_media_info_js_request(url)

    # Media info fetched successfully
    if info:
        # Persist the request record
        dbsession.add(requestInfo)
        # Persist the media record
        dbsession.add(info)

        s_time = time.time()
        requestInfo,result = down_media_file(info.mediaUrl, info)
        e_time = time.time()

        # Record download status and elapsed time.
        # NOTE(review): downStatus is set to '00' BEFORE the success check
        # below, so failed downloads are also marked '00' -- confirm intent.
        info.downStatus = '00'
        info.downTime = e_time - s_time

        if requestInfo and requestInfo.status == 200:
            # Persist the request record
            dbsession.add(requestInfo)
        else:
            # Decide which requests are persisted and which are re-queued.
            # NOTE(review): when status == 0 the record is added AND, because
            # the next `if` is not `elif`, also re-queued via its else branch
            # -- the second `if` looks like it was meant to be `elif`.
            if requestInfo.status == 0:
                dbsession.add(requestInfo)
            if requestInfo.status<0 and (requestInfo.runCnt + requestInfo.status)>0:
                dbsession.add(requestInfo)
            else:
                requestQueue.put([requestInfo.status,requestInfo])
    else:
        # No media info: persist terminal failures, re-queue retryable ones.
        # NOTE(review): same if/elif concern as above.
        if requestInfo and requestInfo.status == 0:
            dbsession.add(requestInfo)
        if requestInfo and requestInfo.status < 0 and (requestInfo.runCnt + requestInfo.status) > 0:
            dbsession.add(requestInfo)
        else:
            requestQueue.put([requestInfo.status, requestInfo])
            logger.debug('requestQueue size {}'.format(requestQueue.qsize()))

    try:
        dbsession.commit()
    except Exception as e:
        dbsession.rollback()
        logger.exception(sys.exc_info())
Beispiel #14
0
def get_artist_list(url, filename=None):
    '''
    Return a {artist_id: artist_name} dict, loaded from the JSON cache file
    when present and non-empty, otherwise scraped from the artist index page
    (and written back to the cache when *filename* is given).
    :param url: artist index page url
    :param filename: optional JSON cache file path
    :return: dict of artists; None when the HTTP request raised
    '''
    headers['Referer'] = url

    artists = {}

    # Fast path: load the artist list from the cached JSON file.
    if filename and os.path.exists(filename) and os.path.getsize(filename) > 0:
        try:
            with open(filename, 'r', encoding='utf8') as fp:
                artists = json.load(fp)
            return artists
        except Exception as e:
            logger.exception(sys.exc_info())
            raise

    try:
        logger.debug('request url {}'.format(url))
        response = requests.get(url, headers=headers)

        if response and response.status_code == 200:
            soup = BeautifulSoup(response.content.decode('utf8'), 'html5lib')

            container = soup.find('div', class_='music-body clearfix').find('div', class_='main-body').find('ul', class_='container')
            if container:
                anchors = container.find_all('a', {'href': re.compile("^.*/[0-9]*$"), 'title': re.compile("^.*$")})
                for anchor in anchors:
                    # href is "/artist/<id>"; slice off the 8-char prefix.
                    artists[anchor['href'][8:]] = anchor['title']
            else:
                logger.error('parser artist list none')
        else:
            logger.error('requests error {}'.format(response if response is None else response.status_code))

    except Exception as e:
        logger.exception(sys.exc_info())
        return None

    # Write-through cache for subsequent runs.
    if filename:
        try:
            with open(filename, 'w', encoding='utf8') as fp:
                json.dump(artists, fp, ensure_ascii=False, indent=4)
        except Exception as e:
            logger.exception(sys.exc_info())
            raise
    return artists
Beispiel #15
0
def init_scrapy_work():
    '''Seed the request queue: start from the top navigation url on a fresh
    database, otherwise re-queue unfinished (status 999) requests.'''
    init_session_pool()

    if dbsession.query(RequestInfo).first() is None:
        seed_url = 'https://d.weibo.com/1087030002_2986_top'
        requestInfo = request_variable_init(seed_url)
        requestInfo.requestName = 'weibo_http_get_navigation_page_list'
        requestQueue.put(
            (PRIORITYDEFINE[requestInfo.requestName], requestInfo, None))
        return

    logger.debug('scrapy reboot from db')
    for pending in dbsession.query(RequestInfo).filter_by(status='999').all():
        logger.debug('scrapy reboot from db {}'.format(pending))
        requestQueue.put((1, pending, None))
Beispiel #16
0
def weibo_http_post_login_location(su, sp, nonce, rsakv, servertime, pcid=None, verify=None, session=None):
    '''
    POST the SSO login form and parse the cross-domain redirect parameters.

    :param su: base64-encoded user name
    :param sp: RSA-encrypted password
    :param nonce: server nonce from the prelogin call
    :param rsakv: RSA key version from the prelogin call
    :param servertime: server timestamp from the prelogin call
    :param pcid: captcha id (only when the server demanded a captcha)
    :param verify: captcha answer typed by the operator
    :param session: optional requests.Session carrying cookies
    :return: dict of query params (ticket, retcode, r, ...) parsed from the
             location.replace() redirect url, or None on failure
    '''
    url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)'
    logger.debug(url)

    try:
        # Fixed form fields captured from the browser login flow.
        datas={
            'entry':'weibo',
            'gateway':'1',
            'from':'',
            'savestate': '7',
            'qrcode_flag': 'false',
            'useticket': '1',
            'pagerefer':'https://login.sina.com.cn/crossdomain2.php?action=logout&r=https%3A%2F%2Fpassport.weibo.com%2Fwbsso%2Flogout%3Fr%3Dhttps%253A%252F%252Fweibo.com%26returntype%3D1',
            'vsnf': '1',
            'su': su,
            'service': 'miniblog',
            'servertime': servertime,
            'nonce': nonce,
            'pwencode':'rsa2',
            'rsakv': rsakv,
            'sp':sp,
            'sr':'1366*768',
            'encoding':'UTF-8',
            'prelt':'115',
            'url':'https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype':'META'
        }
        # Captcha fields are only sent when the server requested one.
        if verify:
            datas['pcid'] = pcid
            datas['door'] = verify
        logger.debug(datas)
        if session:
            response = session.post(url, headers=headers, data=datas, verify=False)
        else:
            response = requests.post(url, headers=headers, data=datas, verify=False)
        if response and response.status_code==200:
            txt = response.content.decode('gbk')
            logger.debug(txt)
            # The response body embeds location.replace("...") -- pull out
            # the redirect url and decode its query string.
            reGroups = re.match('.*location.replace\(\"(.*)\"\);.*', txt, re.S)
            locationDict = None
            if reGroups:
                locationUrl = reGroups.group(1)
                locationUrl = unquote_plus(locationUrl)
                logger.debug('unquote url {}'.format(locationUrl))

                locationDict = parse_qs(urlparse(locationUrl).query)

            return locationDict
        else:
            logger.error('request error http code:{}'.format(response.status_code))
            return None
    except Exception as e:
        logger.exception(sys.exc_info())
        return None
Beispiel #17
0
def init_work():
    '''Seed the queue: build per-artist requests on a fresh database,
    otherwise re-queue unfinished (status 999) requests from the db.'''
    if dbsession.query(RequestInfo).first() is None:
        artistDict = get_artist_list('http://music.taihe.com/artist', r'D:\project\python\pylib\artistjson.txt')

        for artist_id in artistDict:
            page_url = 'http://music.taihe.com/artist/{}'.format(artist_id)
            requestInfo = request_variable_init(page_url)
            requestInfo.requestName = 'get_artist_music_list'
            requestQueue.put((PRIORITYDEFINE[requestInfo.requestName], requestInfo, None))

        print(requestQueue.qsize())
    else:
        logger.debug('scrapy reboot from db')
        for pending in dbsession.query(RequestInfo).filter_by(status='999').all():
            logger.debug('scrapy reboot from db {}'.format(pending))
            requestQueue.put((1, pending, None))
Beispiel #18
0
def weibo_http_get_home(uniqueid, session=None):
    '''
    Visit the user's home page to finalise the login flow.
    :param uniqueid: numeric account id from the SSO callback
    :param session: optional requests.Session carrying login cookies
    :return: True on HTTP 200, None otherwise
    '''
    url = 'https://weibo.com/u/{}/home?wvr=5&lf=reg'.format(uniqueid)
    logger.debug(url)

    try:
        headers['Referer'] = 'https://weibo.com/'
        client = session if session else requests
        response = client.get(url, headers=headers, verify=False)
        if not (response and response.status_code == 200):
            logger.error('request error http code:{}'.format(response.status_code))
            return None
        return True
    except Exception as e:
        logger.exception(sys.exc_info())
        return None
Beispiel #19
0
def weibo_login(user, password, session=None):
    '''
    Perform the full weibo SSO login: prelogin RSA key fetch, optional
    captcha round-trip, login POST, cross-domain ticket exchange, and a
    final home-page visit.

    :param user: weibo account name
    :param password: weibo account password
    :param session: optional requests.Session to accumulate login cookies
    :return: True on success, None on failure
    '''
    su = user_base64(user)
    preLoginDict = weibo_http_get_raskey(su, session=session)
    logger.debug('dict:{}'.format(preLoginDict))

    nonce = preLoginDict['nonce']
    pubkey = preLoginDict['pubkey']
    rsakv = preLoginDict['rsakv']
    servertime = preLoginDict['servertime']

    sp = password_rsa(password, servertime, nonce, pubkey)
    verify = None
    pcid = None
    if 'showpin' in preLoginDict:
        # A captcha is required: fetch it and ask the operator to type it in.
        pcid = preLoginDict['pcid']
        verify_file = weibo_http_get_verify_pic(pcid, session=session)
        logger.error('verify file {}'.format(verify_file))
        verify = input("input verfy code:")

    locationParams = weibo_http_post_login_location(su, sp, nonce, rsakv, servertime, pcid, verify, session=session)
    if locationParams:
        retcode = locationParams['retcode'][0]
        ticket = locationParams['ticket'][0]
        rParams = parse_qs(urlparse(locationParams['r'][0]).query)
        ssosavestate = rParams['ssosavestate'][0]
    else:
        logger.error('weibo login error')
        return None

    resultDict = weibo_http_get_home_uniqueid(ticket, ssosavestate, session)

    if resultDict is None:
        logger.error('weibo login error')
        return None

    logger.debug("weibo login uniqueid {}".format(resultDict))
    uniqueid = resultDict['userinfo']['uniqueid']
    # BUG FIX: the final home-page request now reuses the login session so
    # the authenticated cookies are actually sent (it was previously called
    # without the session argument).
    return weibo_http_get_home(uniqueid, session=session)
Beispiel #20
0
def request_workflow_thread():
    '''
    Worker loop: pull (priority, requestInfo, param) items off requestQueue,
    dispatch to the handler named by requestInfo.requestName, persist results
    and re-queue follow-up requests (status 999).  Exits when the queue stays
    empty for 10s or a None item arrives.
    '''
    while True:
        try:
            priority, requestInfo, param = requestQueue.get(block=True, timeout=10)
            # NOTE(review): task_done() fires before the item is processed,
            # so requestQueue.join() can return while work is still in
            # flight -- confirm this is intended.
            requestQueue.task_done()
            logger.debug('PriorityQueue size {}'.format(requestQueue.qsize()))
        except Exception as e:
            logger.exception(sys.exc_info())
            logger.error('request_workflow_thread queue empty')
            break

        if requestInfo and requestInfo.requestName:
            logger.debug('run {} params {}'.format(requestInfo.requestName, requestInfo.requestUrl))
            # NOTE(security): eval() resolves a stored name into a callable;
            # a whitelist dict of handler functions would be safer.
            handler = eval(requestInfo.requestName)
            if requestInfo.requestName == 'down_media_file':
                results = handler(requestInfo.requestUrl, param)
            else:
                results = handler(requestInfo.requestUrl)
            if isgeneratorfunction(handler):
                logger.debug('isgeneratorfunction {} true'.format(requestInfo.requestName))
                for resultInfo, result in results:
                    if resultInfo.status == 999:
                        logger.debug('PriorityQueue put {},{} '.format(resultInfo.requestName, resultInfo.requestUrl))
                        requestQueue.put((PRIORITYDEFINE[resultInfo.requestName], resultInfo, result))
                    else:
                        # BUG FIX: the original tested MediaInfo twice
                        # (`isinstance(result, MediaInfo) or isinstance(
                        # result, MediaInfo)`); collapsed to one tuple check.
                        if isinstance(result, (MediaInfo, ArtistInfo)):
                            dbsession.add(result)
                    request_info_update_insert(requestInfo)
                    try:
                        dbsession.commit()
                    except Exception as e:
                        logger.exception(sys.exc_info())
                        logger.error('dbsession error')
                        dbsession.rollback()
            else:
                logger.debug('isgeneratorfunction {} false'.format(requestInfo.requestName))
                resultInfo, result = results

                if isinstance(result, (MediaInfo, ArtistInfo)):
                    dbsession.add(result)

                request_info_update_insert(requestInfo)
                if resultInfo.status != 999:
                    logger.debug('dbsession add resultInfo {}'.format(resultInfo.urlId))
                else:
                    logger.debug('PriorityQueue put {},{} '.format(resultInfo.requestName, resultInfo.requestUrl))
                    requestQueue.put((PRIORITYDEFINE[resultInfo.requestName], resultInfo, result))

                try:
                    dbsession.commit()
                except Exception as e:
                    logger.exception(sys.exc_info())
                    logger.error('dbsession error')
                    dbsession.rollback()

        else:
            logger.error('request_workflow_thread requestInfo none')
            break
Beispiel #21
0
def user_base64(user):
    '''
    Encode the weibo account name as base64 (the `su` login field); the
    url-quoted form is also logged for debugging.
    :param user: account name
    :return: base64-encoded user name as str
    '''
    quoted = urllib.parse.quote_plus(user)
    logger.debug('weibo user quote {}'.format(quoted))
    encoded = base64.b64encode(user.encode('utf-8')).decode('utf-8')
    logger.debug('weibo user base64 {}'.format(encoded))
    return encoded
Beispiel #22
0
def weibo_http_get_navigation(url, session=None):
    '''
        Fetch one listing page and scrape the account entries from it
        (translated from: "get account information").
    :param url: listing page url
    :param session: optional requests.Session carrying login cookies
    :return: (requestInfo, list-of-WeiboUser) on success,
             (requestInfo, None) on failure
    '''
    try:
        headers['Referer'] = 'https://weibo.com/'
        logger.debug('request url {}'.format(url))
        requestInfo = request_variable_init(url)
        requestInfo.requestName = 'weibo_http_get_navigation'

        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)

        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            txt = response.content.decode('utf-8')
            # Quotes/slashes are backslash-escaped in the JSON-embedded
            # HTML, hence the \\" and <\\/...> in every pattern below.
            pattern = re.compile(
                r'<li class=\\"follow_item S_line2\\">(.*?)<\\/li>', re.S)
            reGroups = pattern.findall(txt)
            if not reGroups:
                logger.error('re find li_1 clearfix error')
                return requestInfo, None
            else:
                logger.debug(reGroups)
                userList = []
                for item in reGroups:
                    user = WeiboUser()
                    # Avatar image url (currently extracted but not stored).
                    pattern = re.compile(
                        r'<dt class=\\"mod_pic\\">.*?src=\\"(.*?)\\".*<\\/dt>',
                        re.S)
                    picReGroups = pattern.findall(item)

                    # Display name block: pull userid/username from the
                    # <strong usercard=...> tag, then gender/verified/member
                    # flags from the icon <i class=...> tags.
                    pattern = re.compile(
                        r'<div class=\\"info_name W_fb W_f14\\">(.*?)<\\/div>',
                        re.S)
                    infoNameGroups = pattern.findall(item)
                    if infoNameGroups:
                        txt = infoNameGroups[0]
                        tags = re.findall(
                            r'.*<strong.*?usercard=\\"(.*?)\\"\s*>(.*?)<\\/strong>.*',
                            txt, re.S)
                        if tags:
                            user.username = tags[0][1]
                            user.userid = tags[0][0]
                        tags = re.findall(r'<i.*?class=\\"(.*?)\\".*?><\\/i>',
                                          txt, re.S)
                        if tags:
                            for tag in tags:
                                if 'icon_approve' in tag:
                                    # Weibo personal verification badge
                                    user.verify = '1'
                                elif 'icon_female' in tag:
                                    user.gender = 'female'
                                elif 'icon_male' in tag:
                                    user.gender = 'male'
                                elif 'icon_member' in tag:
                                    # Weibo paid-member badge
                                    user.member = '1'

                    # Follow counts: following, fans, weibo post count.
                    pattern = re.compile(
                        r'<div class=\\"info_connect\\">.*?<em class=\\"count\\">(.*?)<\\/em>.*?<em class=\\"count\\">(.*?)<\\/em>.*?<em class=\\"count\\">(.*?)<\\/em>.*?<\\/div>',
                        re.S)
                    infoConnectGroups = pattern.findall(item)
                    user.focusnumber, user.fansnumber, user.weibonumber = infoConnectGroups[
                        0]

                    # Location: "province city" when two tokens, otherwise a
                    # single region used for both fields.
                    pattern = re.compile(
                        r'<div class=\\"info_add\\">.*?<span>(.*?)<\\/span>.*?<\\/div>',
                        re.S)
                    infoAddGroups = pattern.findall(item)
                    adds = infoAddGroups[0].split(' ')
                    if len(adds) == 2:
                        user.province = adds[0]
                        user.city = adds[1]
                    else:
                        user.province = adds[0]
                        user.city = adds[0]

                    # Bio / intro text.
                    pattern = re.compile(
                        r'<div class=\\"info_intro\\">.*?<span>(.*?)<\\/span>.*?<\\/div>',
                        re.S)
                    infoIntroGroups = pattern.findall(item)
                    user.intro = infoIntroGroups[0]
                    userList.append(user)

                return requestInfo, userList
        else:
            logger.error('request error http code:{}'.format(
                response.status_code))
            requestInfo.status = response.status_code if response else 0
    except Exception as e:
        logger.exception(sys.exc_info())
        # NOTE(review): if request_variable_init() itself raised, requestInfo
        # is unbound here and this assignment raises NameError.
        requestInfo.status = 0

    return requestInfo, None
Beispiel #23
0
def weibo_http_get_navigation_page_list(url, session=None):
    '''
    Generator: fetch a weibo navigation page and yield one request
    descriptor per category section found on it.

    :param url: navigation page URL to fetch
    :param session: optional requests session; falls back to a plain
                    requests.get when omitted
    :yield: (requestInfo, None) tuples — one per discovered category
            "all" page, then a final tuple describing this request itself
    '''
    logger.debug(url)
    try:
        headers['Referer'] = 'https://weibo.com/'
        requestInfo = request_variable_init(url)
        requestInfo.requestName = 'weibo_http_get_navigation_page_list'

        if session:
            response = session.get(url, headers=headers, verify=False)
        else:
            response = requests.get(url, headers=headers, verify=False)

        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            txt = response.content.decode('utf-8')
            # Non-greedy (.*?) so each <li> block is matched separately.
            liPattern = re.compile(
                r'<li class=\\"li_1 clearfix\\">(.*?)<\\/li>', re.S)
            liBlocks = liPattern.findall(txt)
            if not liBlocks:
                logger.error('re find li_1 clearfix error')
                yield requestInfo, None
            else:
                # Hoisted out of the loop: same pattern for every section.
                anchorPattern = re.compile(
                    r'<a target=\\"_blank\\" href=\\"(.*?)\\".*?<span.*?<\\/span>(.*?)<\\/a>',
                    re.S)
                for item in liBlocks:
                    anchors = anchorPattern.findall(item)
                    if not anchors:
                        # Malformed section: skip it instead of letting an
                        # IndexError abort all remaining sections.
                        logger.error('re find category anchor error')
                        continue
                    # The first anchor's href encodes the category id;
                    # rewriting its trailing segment to _0 selects the
                    # category's "all" (全部) tab.
                    catId = anchors[0][0]
                    catId = catId[:catId.rindex('_')] + '_0'

                    pageUrl = 'https://d.weibo.com/{}#'.format(catId)
                    resultInfo = request_variable_init(pageUrl)
                    resultInfo.requestName = 'weibo_http_get_navigation_page_url'
                    yield resultInfo, None
        else:
            # Guard the log argument: response may be None here, and
            # response.status_code would raise before status was recorded.
            logger.error('request error http code:{}'.format(
                response.status_code if response else None))
            requestInfo.status = response.status_code if response else 0
    except Exception as e:
        logger.exception(sys.exc_info())
        requestInfo.status = 0

    yield requestInfo, None
Beispiel #24
0
def get_media_info_js_request(url):
    '''
    Fetch a media-info JSONP endpoint, strip the jQuery callback wrapper
    and map the JSON payload into a MediaInfo record.

    :param url: JSONP media-info URL
    :return: (requestInfo, mediaInfo); mediaInfo is None on any failure.
             On success requestInfo is re-initialized to point at the
             media file URL so the workflow can schedule the download.
    '''
    logger.debug('request url {}'.format(url))

    requestInfo = request_variable_init(url)
    requestInfo.requestName = 'get_media_info_js_request'

    headers['Referer'] = url
    httpSession = requests.session()
    s_time = time.time()
    mediaInfo = None
    try:
        response = httpSession.get(url, headers=headers)

        if response and response.status_code == 200:
            requestInfo.status = response.status_code
            # Body looks like: jQueryXXXX({...}); — capture the JSON object
            # inside the callback. Raw string avoids the invalid escape
            # sequence warning the original non-raw pattern produced.
            pattern = re.compile(r'^jQuery\S*\(({.*})\);$')
            reGroups = pattern.match(response.text)
            if reGroups:
                jsonStr = reGroups.group(1)
            else:
                requestInfo.status = 0
                logger.error('parser js none')
                return requestInfo, None

            jsonObj = json.loads(jsonStr)

            mediaInfo = MediaInfo()
            mediaInfo.mediaUrl = jsonObj['bitrate']['show_link']
            # Media id is the md5 of its URL so repeated crawls dedupe.
            md5 = hashlib.md5()
            md5.update(mediaInfo.mediaUrl.encode('utf-8'))
            mediaInfo.mediaId = md5.hexdigest()

            # TODO: the required fields below may be absent from the JSON;
            # a missing key raises KeyError and falls into the generic
            # Exception handler (status 0, mediaInfo dropped).
            songinfo = jsonObj['songinfo']
            mediaInfo.mediaName = songinfo['title']
            mediaInfo.mediaLang = songinfo['language']
            mediaInfo.country = songinfo['country']
            mediaInfo.proxycompany = songinfo['si_proxycompany']
            mediaInfo.compose = songinfo['compose']
            mediaInfo.writer = songinfo['songwriting']
            mediaInfo.author = songinfo['author']
            mediaInfo.publishTime = songinfo['publishtime']
            mediaInfo.albumName = songinfo['album_title']
            mediaInfo.lrcUrl = songinfo['lrclink']
            mediaInfo.mediaSize = jsonObj['bitrate']['file_size']
            mediaInfo.mediaFormat = jsonObj['bitrate']['file_format']
            mediaInfo.albumId = songinfo['album_id']
            # del_status: 1 = removed for copyright reasons, 0 = playable.
            mediaInfo.useStatus = songinfo['del_status']

            mediaInfo.source = urllib.parse.urlparse(mediaInfo.mediaUrl).netloc
            e_time = time.time()
            mediaInfo.useTime = e_time - s_time
            logger.debug('media url {}'.format(mediaInfo.mediaUrl))

            # Chain the next workflow step: download the media file itself.
            requestInfo = request_variable_init(mediaInfo.mediaUrl)
            requestInfo.requestName = 'down_media_file'

        else:
            logger.error('requests error {}'.format(
                response if response is None else response.status_code))
            requestInfo.status = response.status_code if response else 0

    except requests.exceptions.ConnectTimeout as e:
        # Server did not answer the connection attempt in time.
        requestInfo.status = -3
        logger.exception(sys.exc_info())
    except requests.exceptions.ReadTimeout as e:
        # Server accepted the connection but sent no data in time.
        requestInfo.status = -3
        logger.exception(sys.exc_info())
    except requests.exceptions.ConnectionError as e:
        # Network problem or the server is unreachable.
        requestInfo.status = -2
        logger.exception(sys.exc_info())
    except requests.exceptions.RequestException as e:
        requestInfo.status = -1
        logger.exception(sys.exc_info())
    except Exception as e:
        # JSON decode error / missing key / anything unexpected.
        logger.exception(sys.exc_info())
        requestInfo.status = 0
        return requestInfo, None

    return requestInfo, mediaInfo
Beispiel #25
0
def request_workflow_thread():
    """Worker loop: pull (priority, requestInfo, param) jobs off the global
    requestQueue, call the named request function with a pooled HTTP session,
    and persist results through the global dbsession.

    Exits when either queue times out (10s with no work) or a dequeued job
    carries no requestName.
    """
    while True:
        try:
            # Borrow a session and immediately put it back so sessions
            # rotate round-robin between worker threads.
            session = sessionQueue.get(block=True, timeout=10)
            sessionQueue.task_done()
            sessionQueue.put(session)

            priority, requestInfo, param = requestQueue.get(block=True,
                                                            timeout=10)
            requestQueue.task_done()
            logger.debug('PriorityQueue size {}'.format(requestQueue.qsize()))
        except Exception as e:
            # Both get() calls raise queue.Empty on timeout; treated here
            # as "no more work", ending the thread.
            logger.exception(sys.exc_info())
            logger.error('request_workflow_thread queue empty')
            break

        # Fixed delay between requests — presumably rate limiting to avoid
        # weibo anti-crawl measures; TODO confirm the 20s choice.
        time.sleep(20)
        if requestInfo and requestInfo.requestName:
            logger.debug('run {} params {}'.format(requestInfo.requestName,
                                                   requestInfo.requestUrl))
            # HACK: eval() resolves the handler function by name. The name
            # comes from internal producers, but a dict of callables would
            # be safer and faster than eval.
            results = eval(requestInfo.requestName)(requestInfo.requestUrl,
                                                    session)

            # Generator handlers yield many (resultInfo, result) pairs;
            # plain handlers return exactly one.
            if isgeneratorfunction(eval(requestInfo.requestName)):
                logger.debug('isgeneratorfunction {} true'.format(
                    requestInfo.requestName))
                for resultInfo, result in results:
                    # NOTE(review): status 999 appears to mark a follow-up
                    # request to re-enqueue rather than data to store —
                    # confirm against request_variable_init.
                    if resultInfo.status == 999:
                        logger.debug('PriorityQueue put {},{} '.format(
                            resultInfo.requestName, resultInfo.requestUrl))
                        requestQueue.put(
                            (PRIORITYDEFINE[resultInfo.requestName],
                             resultInfo, result))
                    else:
                        # Persist scraped users; single object or list.
                        if isinstance(result, list):
                            for item in result:
                                if isinstance(item, WeiboUser):
                                    dbsession.add(item)
                        elif isinstance(result, WeiboUser):
                            dbsession.add(result)
                    request_info_update_insert(requestInfo)
                    try:
                        dbsession.commit()
                    except Exception as e:
                        logger.exception(sys.exc_info())
                        logger.error('dbsession error')
                        dbsession.rollback()
            else:
                logger.debug('isgeneratorfunction {} false'.format(
                    requestInfo.requestName))
                resultInfo, result = results

                # Persist scraped users; single object or list.
                if isinstance(result, list):
                    for item in result:
                        if isinstance(item, WeiboUser):
                            dbsession.add(item)
                elif isinstance(result, WeiboUser):
                    dbsession.add(result)

                request_info_update_insert(requestInfo)
                if resultInfo.status != 999:
                    logger.debug('dbsession add resultInfo {}'.format(
                        resultInfo.urlId))
                else:
                    # Follow-up request: push back onto the priority queue.
                    logger.debug('PriorityQueue put {},{} '.format(
                        resultInfo.requestName, resultInfo.requestUrl))
                    requestQueue.put((PRIORITYDEFINE[resultInfo.requestName],
                                      resultInfo, result))

                try:
                    dbsession.commit()
                except Exception as e:
                    logger.exception(sys.exc_info())
                    logger.error('dbsession error')
                    dbsession.rollback()

        else:
            logger.error('request_workflow_thread requestInfo none')
            break