Beispiel #1
0
def getPartUrl(partUrl, partCid, basePlayInfoUrl, sessCookie):
    """Resolve the real media URL(s) for one video part.

    First tries the inline window.__playinfo__ JSON embedded in the part
    page; falls back to the play-info API (basePlayInfoUrl + cid) using the
    session cookie. Returns a single '|'-joined string: for DASH payloads
    it is 'bestAudioUrl|bestVideoUrl', for segmented ('durl') payloads it
    is every segment URL joined.

    Raises ValueError when no play info is returned or its format is not
    recognized (the original code left combineVideoUrl unbound and crashed
    with NameError/TypeError in those cases).
    """
    def sortBandWidth(item):
        # Primary key: quality id; bandwidth only breaks ties.
        return item['id'] * (10**10) + item['bandwidth']

    headers = getHeaders(partUrl)
    # CURRENT_FNVAL=16 requests the DASH (separate audio/video) format.
    headers['Cookie'] = "CURRENT_FNVAL=16"
    content = tools.getText(partUrl, headers)

    match = re.search(r'<script>window\.__playinfo__=(.+?)</script>', content)

    if match:
        data = json.loads(match.group(1))['data']
    else:
        # Page had no inline playinfo — query the API for this cid instead.
        playInfoUrl = basePlayInfoUrl + '&cid=' + str(partCid)
        headers = { 'Cookie': sessCookie }
        data = json.loads(tools.getText(playInfoUrl, headers))
        data = data.get('data', None) or data.get('result', None)

    # Fix: the API fallback can yield None; fail loudly before membership
    # tests instead of raising TypeError on `'dash' in None`.
    if not data:
        raise ValueError('no play info returned for cid %s' % partCid)

    if 'dash' in data:
        # Audio and video are served as separate DASH streams.
        data = data['dash']
        data['audio'].sort(key=sortBandWidth, reverse=True)
        data['video'].sort(key=sortBandWidth, reverse=True)
        combineVideoUrl = data['audio'][0]['baseUrl'] + '|' + data['video'][0]['baseUrl']
    elif 'durl' in data:
        # One muxed stream split into sequential segments.
        urls = [each['url'] for each in data['durl']]
        combineVideoUrl = '|'.join(urls)
    else:
        # Fix: previously fell through with combineVideoUrl unbound.
        raise ValueError('unrecognized play info format for cid %s' % partCid)

    return combineVideoUrl
Beispiel #2
0
def parseIqiyiUrl(url, headers=None):
    """Parse an iQiyi play-info API response into downloadable URLs.

    Returns (videoType, audioUrls, videoUrls, subtitles) where videoType is
    'hls', 'dash' or 'partial', and subtitles is a list of (name, url)
    pairs with any pre-selected ('_selected') subtitle first.
    """
    # Fix: avoid the shared mutable default argument ({}).
    if headers is None:
        headers = {}
    data = json.loads(tools.getText(url, headers))
    program = data['data']['program']
    # A list here (instead of a dict) signals a server-side error payload.
    if isinstance(program, list):
        print('服务器返回错误,可能原因:愛奇藝台灣站需要使用代理下载(http_proxy/https_proxy)')
        exit()

    subtitles = []
    filterVideos = [each for each in program['video'] if each.get('m3u8')]

    if filterVideos:
        content = filterVideos[0]['m3u8']

        if content.startswith('#EXTM3U'):
            videoType = 'hls'
            audioUrls, videoUrls = [], tools.filterHlsUrls(content)
        else:
            videoType = 'dash'
            audioUrls, videoUrls = parseIqiyiMpd(content, headers)
    else:
        # No m3u8 entries — fall back to the multi-part ('fs') listing.
        filterVideos = [each for each in program['video'] if each.get('fs')]
        fsList = filterVideos[0]['fs']
        basePath = data['data']['dd']
        infoUrls = [basePath + each['l'] for each in fsList]
        videoType = 'partial'
        audioUrls, videoUrls = [], parseIqiyiInfoUrls(infoUrls, headers)

    if 'stl' in program:
        # Put the pre-selected subtitle first so callers can default to it.
        defaultSrts = [x for x in program['stl'] if x.get('_selected')]
        srts = defaultSrts + [x for x in program['stl'] if not x.get('_selected')]
        basePath = data['data']['dstl']
        subtitles = [ (srt.get('_name', 'default'), basePath + srt['srt']) for srt in srts ]
    return videoType, audioUrls, videoUrls, subtitles
Beispiel #3
0
def parseIqiyiUrl(url, headers=None):
    """Parse an iQiyi play-info API response into downloadable URLs.

    Returns (videoType, audioUrls, videoUrls) where videoType is 'hls',
    'dash' or 'partial'.
    """
    # Fix: avoid the shared mutable default argument ({}).
    if headers is None:
        headers = {}
    data = json.loads(tools.getText(url, headers))
    program = data['data']['program']
    # A list here (instead of a dict) signals a server-side error payload.
    if isinstance(program, list):
        print('服务器返回错误,可能原因:愛奇藝台灣站需要使用代理下载(http_proxy/https_proxy)')
        exit()

    videos = program['video']
    filterVideos = [each for each in videos if each.get('m3u8')]

    if filterVideos:
        content = filterVideos[0]['m3u8']

        if content.startswith('#EXTM3U'):
            videoType = 'hls'
            audioUrls, videoUrls = [], tools.filterHlsUrls(content)
        else:
            videoType = 'dash'
            audioUrls, videoUrls = parseIqiyiMpd(content, headers)
    else:
        # No m3u8 entries — fall back to the multi-part ('fs') listing.
        filterVideos = [each for each in videos if each.get('fs')]
        fsList = filterVideos[0]['fs']
        basePath = data['data']['dd']
        infoUrls = [basePath + each['l'] for each in fsList]
        videoType = 'partial'
        audioUrls, videoUrls = [], parseIqiyiInfoUrls(infoUrls, headers)
    return videoType, audioUrls, videoUrls
Beispiel #4
0
def parseIqiyiInfoUrls(urls, headers=None):
    """Resolve each per-segment info URL to the segment's real media URL.

    Each info endpoint returns JSON whose 'l' field holds the URL.
    Returns the list of URLs in the same order as `urls`.
    """
    # Fix: avoid the shared mutable default argument ({}).
    if headers is None:
        headers = {}
    print('共%d段视频,正在获取各段视频的真实链接' % len(urls))

    videoUrls = []
    for url in urls:
        data = json.loads(tools.getText(url, headers, timeout=10))
        videoUrls.append(data['l'])
    return videoUrls
Beispiel #5
0
def parseIqiyiUrl(url, headers=None):
    """Parse an iQiyi play-info API response into downloadable URLs.

    Returns (videoType, audioUrls, videoUrls) where videoType is 'hls'
    or 'dash'.

    Raises ValueError when no entry in the program carries an 'm3u8'
    manifest (previously an opaque IndexError).
    """
    # Fix: avoid the shared mutable default argument ({}).
    if headers is None:
        headers = {}
    data = json.loads(tools.getText(url, headers))
    videos = data['data']['program']['video']
    videos = [each for each in videos if each.get('m3u8')]
    # Fix: videos[0] raised IndexError on an empty list.
    if not videos:
        raise ValueError('no m3u8 manifest found in play info: ' + url)
    content = videos[0]['m3u8']

    if content.startswith('#EXTM3U'):
        videoType = 'hls'
        audioUrls, videoUrls = [], tools.filterHlsUrls(content)
    else:
        videoType = 'dash'
        audioUrls, videoUrls = parseIqiyiMpd(content, headers)
    return videoType, audioUrls, videoUrls
Beispiel #6
0
def parsePostNER(fin):
    """Count, per NNP token, how often each tag in column 5 appears.

    Expects tab-separated lines with exactly 5 columns (token, POS tag,
    then three more fields; column 5 is the tag being counted). Malformed
    lines are reported and skipped. Returns {token: {tag: count}} for
    tokens whose POS tag (column 2) is 'NNP'.

    Fix: ported from Python-2-only syntax (`print` statements and
    `dict.has_key`) so it runs under Python 3 like the rest of the file.
    """
    statistics = {}
    text = tools.getText(fin)
    for line in text.splitlines():
        if line == '':
            continue
        cols = line.split('\t')
        if len(cols) != 5:
            print(line)
            print("not well formatted line")
            continue
        if cols[1] == 'NNP':
            # setdefault replaces the nested has_key/init dance.
            tagCounts = statistics.setdefault(cols[0], {})
            tagCounts[cols[4]] = tagCounts.get(cols[4], 0) + 1
    return statistics
Beispiel #7
0
def parseIqiyiMpd(content, headers=None):
    """Parse an iQiyi MPD (DASH manifest) into (audioUrls, videoUrls).

    For every Representation, finds the clip entries sharing its BaseURL,
    fetches each clip's remote_path info JSON, and collects its 'l' URL.
    """
    # Fix: avoid the shared mutable default argument ({}).
    if headers is None:
        headers = {}
    mediaUrls = {
        'audio': [],
        'video': [],
    }
    root = XMLUtils.parse(content)
    items = XMLUtils.findall(root, 'Period/AdaptationSet/Representation')

    for item in items:
        # mimeType is e.g. 'audio/mp4' or 'video/mp4' — keep the major type.
        mType = item.attrib['mimeType'].split('/')[0]
        segName = XMLUtils.findtext(item, 'BaseURL')
        clipItems = XMLUtils.findall(root, "clip_list/clip[BaseURL='%s']" % segName)

        for clip in clipItems:
            # remote_path is HTML-escaped inside the XML; restore '&'.
            infoUrl = XMLUtils.findtext(clip, 'remote_path').replace('&amp;', '&')
            mediaInfo = json.loads(tools.getText(infoUrl, headers))
            mediaUrls[mType].append(mediaInfo['l'])

    return mediaUrls['audio'], mediaUrls['video']
Beispiel #8
0
def compile(object, stream=False):
    """Compile all the native data formats from this project into native
    format data files in the specified cache directory.

    object -- the project DataFetch object.
    stream -- when truthy, only the stream with that name is compiled.

    Uses the compile method if present within each identified stream.
    Returns False when the project config is older than version 3,
    otherwise None.

    Fixes: the original body mixed tabs and spaces (a TabError under
    Python 3), used Python-2 print statements, kept an unused local
    (`cache`), and carried commented-out dead code.
    """
    print("Compiling your project to the cache directory")
    if object.proj.c_version < 3:
        print("You need to be using version 3+ of the XML config to use the cache tag.")
        return False
    count = 0
    streams = object.proj.findsource('', find='all')
    # Loop through every stream, import its reader package, and run its
    # compile() hook if it defines one.
    for s in streams:
        name = getText(s.getElementsByTagName('name')[0].childNodes)
        if stream and not stream == name:
            # A specific stream was requested and this is not it.
            continue
        # Import the module and check for a compile method.
        pkg = object.proj.findsource(name, find='package')
        __import__(pkg)
        c = sys.modules[pkg]
        object.nowPkg = name  # needed for the find_files method to work!

        if 'compile' in dir(c):
            print("Compiling:", name)
            count += 1
            object.nowPkg = name
            pkg_id = name.lower().replace(' ', '')
            c.compile(object, pkg_id)
        else:
            print(name, "is not a compilable datatype currently. (", pkg, ")")
    print("Project Compiled: streams successfully compiled: ", count)
Beispiel #9
0
def getAllPartInfo(url):
    """Return [{'cid', 'name', 'url'}, ...] for every part of a video page.

    Reads the page's embedded window.__INITIAL_STATE__ JSON. Handles both
    episodic pages ('epList') and regular multi-part videos ('pages').

    Raises ValueError when the initial-state blob cannot be located
    (previously an opaque AttributeError on match.group).
    """
    content = tools.getText(url, getHeaders(url))

    # Extract part names and cids from the initial-state JSON blob.
    match = re.search(r'<script>window\.__INITIAL_STATE__=(.+?});.+?</script>', content)
    # Fix: re.search returns None when the page layout changes.
    if match is None:
        raise ValueError('window.__INITIAL_STATE__ not found in page: ' + url)
    data = json.loads(match.group(1))
    isOpera = 'epList' in data
    pages = data['epList'] if isOpera else data['videoData']['pages']

    allPartInfo = []
    for page in pages:
        if isOpera:
            # Episodic URLs end in the episode id; swap it per episode.
            name, partUrl = page['longTitle'], re.sub(r'\d+$', str(page['id']), url)
        else:
            name, partUrl = page['part'], url + '?p=' + str(page['page'])
        allPartInfo.append({
            'cid': page['cid'],
            'name': name,
            'url': partUrl,
        })

    return allPartInfo
Beispiel #10
0
def parseHls(url, headers=None):
    """Fetch an HLS playlist and return its media URLs resolved against url."""
    # Fix: avoid the shared mutable default argument ({}).
    if headers is None:
        headers = {}
    content = tools.getText(url, headers)
    return tools.filterHlsUrls(content, url)