def download_caption(vidmeta, folder):
    """Download the caption track listed in vidmeta['select_map'] and save it
    as an SRT file in *folder*.

    Scans the selected stream maps for the first entry whose media type is
    "caption", fetches its XML transcript, and converts each <text> node into
    a SubRip cue.  Non-caption entries are ignored.

    :param vidmeta: dict with keys 'vid', 'title' and 'select_map'
                    (each smap having 'media' and 'url' keys).
    :param folder:  destination directory (trailing '/' optional).
    """
    uid = vidmeta['vid']
    # BUG FIX: original called logging.getLogger(vid) — `vid` is undefined in
    # this scope (NameError); the video id lives in vidmeta['vid'].
    logr = logging.getLogger(uid)
    title = clean_up_title(vidmeta['title'])
    select_map = vidmeta['select_map']
    path = folder.rstrip('/') + "/" + str(title) + "_-_" + str(uid) + "." + "srt"
    for smap in select_map:
        if smap['media'] != "caption":
            continue
        capDom = minidom.parse(urllib.urlopen(smap['url']))
        texts = capDom.getElementsByTagName('text')
        hp = HTMLParser()
        # BUG FIX: use a context manager so the file is closed even if a cue
        # fails to parse (the original leaked the handle).
        with open(path, 'w') as f:
            # BUG FIX: SRT cue numbering is 1-based; the original wrote
            # indices starting at 0.
            for i, text in enumerate(texts, start=1):
                fstart = float(text.getAttribute('start'))
                start = convert_time_format(fstart)
                fdur = float(text.getAttribute('dur'))
                dur = convert_time_format(fstart + fdur)
                t = text.childNodes[0].data
                f.write('%d\n' % i)
                f.write('%s --> %s\n' % (start, dur))
                # Unescape HTML entities; encode for the local filesystem
                # encoding as the rest of the tool does.
                f.write(hp.unescape(t).encode(sys.getfilesystemencoding()))
                f.write('\n\n')
        logr.info("\t%s\n\tSaved in: => %s", smap_to_str(smap), path)
        break
def extract_playlist(plid,pl_page): tree = html.fromstring(pl_page['contents']) t = tree.xpath('//h1[@class="pl-header-title"]/text()') title = clean_up_title(t[0]) if (len(t) > 0) else "unknown title" owner = tree.xpath('//h1[@class="branded-page-header-title"]/span/span/span/a/text()')[0] playlist = { 'plid': plid, 'title': title, 'owner': owner } plist = parse_playlist(tree) lmurl = parse_lmwidget(tree) count = 1 while (len(lmurl)>0): #print "Loading next ... "+lmurl ajax_resp = load_more_ajax(lmurl) if(ajax_resp['error'] <0 ): print "Error extracting load more... returning the list" break pl = parse_playlist(ajax_resp['list_content'],len(plist)) plist.extend(pl) lmurl = parse_lmwidget(ajax_resp['lm_widget']) count += 1 playlist['list'] = plist prune_playlist(playlist) print_playlist_header(playlist) if(load_sequential) : playlist['list'] = load_meta_info(plist) else : playlist['list'] = load_meta_info_parallel(plist) return playlist
def parse_playlist(list_content, last=0):
    """Extract video entries from a playlist table fragment.

    :param list_content: lxml element containing a table with
                         id="pl-video-table".
    :param last: index of the last entry already collected; new entries are
                 numbered starting at last + 1 so pages concatenate cleanly.
    :returns: list of dicts with keys 'index', 'vid', 'title', 'duration'.
    """
    rows = list_content.xpath('//table[@id="pl-video-table"]/tbody/tr')
    plist = []
    # PERF FIX: the original re-ran an absolute, positionally-indexed XPath
    # from the document root for every field of every row (O(n^2) query
    # work); query each already-matched row element directly instead.
    for offset, row in enumerate(rows, start=1):
        vid = row.xpath('@data-video-id')[0]
        title = clean_up_title(row.xpath('@data-title')[0])
        t = row.xpath("td[@class='pl-video-time']/div/div[@class='timestamp']/span/text()")
        # Missing timestamp cell (e.g. deleted video) falls back to 00:00,
        # as in the original.
        duration = t[0] if t else "00:00"
        plist.append({
            'index': last + offset,
            'vid': vid,
            'title': title,
            'duration': str(duration),
        })
    return plist
def download_streams(vidmeta, folder):
    """Download the selected media streams for one video into *folder*.

    Caption entries are skipped (handled by download_caption).  A muxed
    "audio-video" stream is saved directly as the final output file; separate
    audio/video streams are saved to temp files and merged with
    combine_streams() into an mp4.

    :param vidmeta: dict with keys 'vid', 'title' and 'select_map'
                    (each smap having 'media', 'url' and 'fmt' keys).
    :param folder:  destination directory (trailing '/' optional).
    """
    uid = vidmeta['vid']
    # BUG FIX: original called logging.getLogger(vid) — `vid` is undefined in
    # this scope (NameError); the video id lives in vidmeta['vid'].
    logr = logging.getLogger(uid)
    title = clean_up_title(vidmeta['title'])
    select_map = vidmeta['select_map']
    out_fmt = "mp4"
    # Assume separated audio/video by default; a muxed stream clears this and
    # skips the merge step.
    separated = True
    temp_files = {}
    outfile = None
    for smap in select_map:
        media = smap['media']
        if media == "caption":
            continue
        url = smap['url']
        if media == "audio-video":
            # Muxed stream: this download IS the final output file.
            filename = folder.rstrip('/') + "/" + str(title) + "_-_" + str(uid) + "." + str(smap['fmt'])
            outfile = filename
            separated = False
        else:
            filename = folder.rstrip('/') + "/" + str(uid) + "." + str(media) + "." + str(smap['fmt'])
            temp_files[media] = filename
        logr.info("\t%s", smap_to_str(smap))
        logr.debug("\tSaving URL: %s\n\tto %s", smap['url'], filename)
        t0 = datetime.datetime.now()
        # Guard against a stalled connection hanging the download forever.
        socket.setdefaulttimeout(120)
        fname, msg = urllib.urlretrieve(url, filename, reporthook=dlProgress)
        t1 = datetime.datetime.now()
        # Erase the progress-hook line before logging the summary.
        sys.stdout.write("\r")
        sys.stdout.flush()
        logr.debug("%sTime taken %s\n---------------------------------", msg, str(t1 - t0))
    if separated:
        outfile = folder.rstrip('/') + "/" + str(title) + "_-_" + str(uid) + "." + out_fmt
        combine_streams(temp_files, outfile, 1)
    logr.info("\t[Outfile] '%s'", outfile)