Example #1
0
def getPartUrl(partUrl, partCid, basePlayInfoUrl, sessCookie):
    """Resolve the downloadable media URL(s) for one video part.

    First tries the embedded window.__playinfo__ JSON on the part page;
    falls back to the play-info API (basePlayInfoUrl + '&cid=...') with the
    session cookie when the page does not embed it.

    Returns a single string: for DASH streams, '<audioUrl>|<videoUrl>' using
    the highest-quality track of each kind; for segmented streams, the
    segment URLs joined with '|'.

    Raises ValueError when the play info contains neither 'dash' nor 'durl'
    (previously this path crashed with an UnboundLocalError).
    """
    def sortBandWidth(item):
        # Rank primarily by quality id, then by bandwidth within the same id.
        return item['id'] * (10**10) + item['bandwidth']

    headers = getHeaders(partUrl)
    headers['Cookie'] = "CURRENT_FNVAL=16"
    content = tools.getText(partUrl, headers)

    match = re.search(r'<script>window\.__playinfo__=(.+?)</script>', content)

    if match:
        data = json.loads(match.group(1))['data']
    else:
        playInfoUrl = basePlayInfoUrl + '&cid=' + str(partCid)
        headers = { 'Cookie': sessCookie }
        data = json.loads(tools.getText(playInfoUrl, headers))
        # Some endpoints nest the payload under 'data', others under 'result'.
        data = data.get('data', None) or data.get('result', None)

    if data and 'dash' in data:
        # 音视频分离 — audio and video are separate DASH streams.
        data = data['dash']
        data['audio'].sort(key=sortBandWidth, reverse=True)
        data['video'].sort(key=sortBandWidth, reverse=True)
        combineVideoUrl = data['audio'][0]['baseUrl'] + '|' + data['video'][0]['baseUrl']
    elif data and 'durl' in data:
        # 视频分段 — one muxed stream split into sequential segments.
        urls = [each['url'] for each in data['durl']]
        combineVideoUrl = '|'.join(urls)
    else:
        # Fail loudly instead of returning an unbound local.
        raise ValueError('unrecognized play info format for cid %s' % partCid)

    return combineVideoUrl
Example #2
0
def parseIqiyiUrl(url, headers=None):
    """Parse an iQiyi play-info API response into downloadable URLs.

    Fetches *url*, picks the first video entry carrying an 'm3u8' payload
    (HLS playlist or DASH MPD), falling back to the segmented 'fs' list.

    Returns (videoType, audioUrls, videoUrls, subtitles) where videoType is
    'hls', 'dash' or 'partial' and subtitles is a list of (name, url) pairs
    with the default track first.
    """
    # Avoid a shared mutable default argument.
    headers = {} if headers is None else headers
    data = json.loads(tools.getText(url, headers))
    program = data['data']['program']
    if isinstance(program, list):
        # A list here means the API rejected the request (region lock).
        print('服务器返回错误,可能原因:愛奇藝台灣站需要使用代理下载(http_proxy/https_proxy)')
        exit()

    subtitles = []
    filterVideos = list(filter(lambda each: each.get('m3u8'), program['video']))

    if len(filterVideos):
        content = filterVideos[0]['m3u8']

        if content.startswith('#EXTM3U'):
            videoType = 'hls'
            audioUrls, videoUrls = [], tools.filterHlsUrls(content)
        else:
            videoType = 'dash'
            audioUrls, videoUrls = parseIqiyiMpd(content, headers)
    else:
        # No m3u8 payload: resolve each segment's info URL instead.
        filterVideos = list(filter(lambda each: each.get('fs'), program['video']))
        fsList = filterVideos[0]['fs']
        basePath = data['data']['dd']
        infoUrls = list(map(lambda each: basePath + each['l'], fsList))
        videoType = 'partial'
        audioUrls, videoUrls = [], parseIqiyiInfoUrls(infoUrls, headers)

    if 'stl' in program:
        # Put the pre-selected subtitle track first, then the rest.
        defaultSrts = list(filter(lambda x: x.get('_selected'), program['stl']))
        srts = defaultSrts + list(filter(lambda x: not x.get('_selected'), program['stl']))
        basePath = data['data']['dstl']
        subtitles = [ (srt.get('_name', 'default'), basePath + srt['srt']) for srt in srts ]
    return videoType, audioUrls, videoUrls, subtitles
Example #3
0
def parseIqiyiUrl(url, headers=None):
    """Parse an iQiyi play-info API response into downloadable URLs.

    Fetches *url*, picks the first video entry carrying an 'm3u8' payload
    (HLS playlist or DASH MPD), falling back to the segmented 'fs' list.

    Returns (videoType, audioUrls, videoUrls) where videoType is 'hls',
    'dash' or 'partial'.
    """
    # Avoid a shared mutable default argument.
    headers = {} if headers is None else headers
    data = json.loads(tools.getText(url, headers))
    program = data['data']['program']
    if isinstance(program, list):
        # A list here means the API rejected the request (region lock).
        print('服务器返回错误,可能原因:愛奇藝台灣站需要使用代理下载(http_proxy/https_proxy)')
        exit()

    videos = program['video']
    filterVideos = list(filter(lambda each: each.get('m3u8'), videos))

    if len(filterVideos):
        content = filterVideos[0]['m3u8']

        if content.startswith('#EXTM3U'):
            videoType = 'hls'
            audioUrls, videoUrls = [], tools.filterHlsUrls(content)
        else:
            videoType = 'dash'
            audioUrls, videoUrls = parseIqiyiMpd(content, headers)
    else:
        # No m3u8 payload: resolve each segment's info URL instead.
        filterVideos = list(filter(lambda each: each.get('fs'), videos))
        fsList = filterVideos[0]['fs']
        basePath = data['data']['dd']
        infoUrls = list(map(lambda each: basePath + each['l'], fsList))
        videoType = 'partial'
        audioUrls, videoUrls = [], parseIqiyiInfoUrls(infoUrls, headers)
    return videoType, audioUrls, videoUrls
Example #4
0
def parseIqiyiInfoUrls(urls, headers=None):
    """Resolve each segment info URL to its real media URL.

    Each info endpoint returns JSON whose 'l' field is the direct link.
    Returns the resolved links in the same order as *urls*.
    """
    # Avoid a shared mutable default argument.
    headers = {} if headers is None else headers
    print('共%d段视频,正在获取各段视频的真实链接' % len(urls))

    videoUrls = []
    for url in urls:
        data = json.loads(tools.getText(url, headers, timeout=10))
        videoUrls.append(data['l'])
    return videoUrls
Example #5
0
def parseIqiyiUrl(url, headers=None):
    """Parse an iQiyi play-info API response into downloadable URLs.

    Uses the first video entry that carries an 'm3u8' payload, which is
    either an HLS playlist (starts with '#EXTM3U') or a DASH MPD document.

    Returns (videoType, audioUrls, videoUrls) with videoType 'hls' or 'dash'.
    """
    # Avoid a shared mutable default argument.
    headers = {} if headers is None else headers
    data = json.loads(tools.getText(url, headers))
    videos = data['data']['program']['video']
    videos = list(filter(lambda each: each.get('m3u8'), videos))
    content = videos[0]['m3u8']

    if content.startswith('#EXTM3U'):
        videoType = 'hls'
        audioUrls, videoUrls = [], tools.filterHlsUrls(content)
    else:
        videoType = 'dash'
        audioUrls, videoUrls = parseIqiyiMpd(content, headers)
    return videoType, audioUrls, videoUrls
Example #6
0
def parsePostNER(fin):
    """Count NNP-tagged word/label co-occurrences from a tab-separated NER file.

    Reads *fin* via tools.getText and expects one token per non-empty line
    with exactly 5 tab-separated columns: column 0 is the word, column 1 the
    POS tag, column 4 the NER label. Malformed lines are reported and skipped.

    Returns {word: {label: count}} restricted to words tagged 'NNP'.
    """
    statistics = {}
    text = tools.getText(fin)
    for line in text.splitlines():
        if line == '':
            continue
        cols = line.split('\t')  # the original copying comprehension was redundant
        if len(cols) != 5:
            # Report and skip lines that do not match the 5-column format.
            print(line)
            print("not well formatted line")
            continue
        if cols[1] == 'NNP':
            # dict.has_key was removed in Python 3; setdefault replaces
            # both existence checks in one call.
            labelCounts = statistics.setdefault(cols[0], {})
            labelCounts[cols[4]] = labelCounts.get(cols[4], 0) + 1
    return statistics
Example #7
0
def parseIqiyiMpd(content, headers = {}):
    mediaUrls = {
        'audio': [],
        'video': [],
    }
    root = XMLUtils.parse(content)
    items = XMLUtils.findall(root, 'Period/AdaptationSet/Representation')

    for item in items:
        mType = item.attrib['mimeType'].split('/')[0]
        segName = XMLUtils.findtext(item, 'BaseURL')
        clipItems = XMLUtils.findall(root, "clip_list/clip[BaseURL='%s']" % segName)

        for clip in clipItems:
            infoUrl = XMLUtils.findtext(clip, 'remote_path').replace('&amp;', '&')
            mediaInfo = json.loads(tools.getText(infoUrl, headers))
            mediaUrls[mType].append(mediaInfo['l'])

    return mediaUrls['audio'], mediaUrls['video']
Example #8
0
def compile (object, stream=False):
	"""
		compile all the native data formats from this project into native format
		data files in the specified cache directory. Object is the project DataFetch object.

		uses the compile method if present within each identified stream.
		OR it can run only on a specified stream.
	"""
	print "Compiling your project to the cache directory"
	if object.proj.c_version < 3:
		print "You need to be using version 3+ of the XML config to use the cache tag."
		return False
	count = 0
	cache = object.proj.cache
	streams = object.proj.findsource('',find='all') 
	# now we have an XML object of all the streams... whoopie.
	# now loop through them, import, check for compile options, and run if necessary
	for s in streams:
                        name = getText(s.getElementsByTagName('name')[0].childNodes)
			if stream and not stream == name:
				# then continue
				continue
			# now import the module, and check for a compile method
			pkg = object.proj.findsource(name, find='package')
			#package = getText(s.getElementsByTagName('type')[0].childNodes)
                        #exec "import uudewey.readers."+package+" as c"
			__import__(pkg) # ooh! all newfangled!
			c = sys.modules[pkg]
			object.nowPkg = name # needed for the find_files method to work!
	
			if 'compile' in dir(c):
				# then run compile! it's that simple!
				print "Compiling:",name
				count += 1
				object.nowPkg = name
				pkg_id = name.lower().replace(' ','')
				c.compile(object,pkg_id)
			else:
				print name,"is not a compilable datatype currently. (",pkg,")"
	# well, if we got to this point, then the project has no streams capable of compiling
	print "Project Compiled: streams successfully compiled: ",count
Example #9
0
def getAllPartInfo(url):
    """Fetch the part (分p) list for a video page.

    Parses window.__INITIAL_STATE__ embedded in the page HTML. Pages that
    carry 'epList' (bangumi/opera) build each part URL by replacing the
    trailing episode id in *url*; regular multi-part videos use url + '?p=N'.

    Returns a list of {'cid', 'name', 'url'} dicts, one per part.
    Raises ValueError when __INITIAL_STATE__ is not found (previously an
    opaque AttributeError on the failed regex match).
    """
    content = tools.getText(url, getHeaders(url))

    # 获取分p名称和cid — locate the embedded initial-state JSON.
    match = re.search(r'<script>window\.__INITIAL_STATE__=(.+?});.+?</script>', content)
    if match is None:
        raise ValueError('window.__INITIAL_STATE__ not found in page: %s' % url)
    data = json.loads(match.group(1))
    isOpera = 'epList' in data
    pages = data['epList'] if isOpera else data['videoData']['pages']

    allPartInfo = []
    for page in pages:
        if isOpera:
            name, partUrl = page['longTitle'], re.sub(r'\d+$', str(page['id']), url)
        else:
            name, partUrl = page['part'], url + '?p=' + str(page['page'])
        allPartInfo.append({
            'cid': page['cid'],
            'name': name,
            'url': partUrl,
        })

    return allPartInfo
Example #10
0
def parseHls(url, headers=None):
    """Download an HLS playlist and return the media URLs it references."""
    # Avoid a shared mutable default argument.
    headers = {} if headers is None else headers
    content = tools.getText(url, headers)
    return tools.filterHlsUrls(content, url)