def download_syllabus_icourse163(session, leclist, path='', overwrite=False):
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Host': 'v.stu.126.net',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'X-Requested-With': 'ShockwaveFlash/15.0.0.239',
    }
    session.headers.update(headers)

    retry_list = []
    for week in leclist:
        cur_week = week[0]
        lessons = week[1]
        for lesson in lessons:
            cur_lesson = lesson[0]
            lectures = lesson[1]
            cur_week = clean_filename(cur_week)
            cur_lesson = clean_filename(cur_lesson)
            dir = os.path.join(path, cur_week, cur_lesson)
            if not os.path.exists(dir):
                mkdir_p(dir)
            for (lecnum, (lecture_url, lecture_name)) in enumerate(lectures):
                lecture_name = clean_filename(lecture_name)
                filename = os.path.join(
                    dir, '%02d_%s.%s' % (lecnum + 1, lecture_name, lecture_url[-3:]))
                print(filename)
                print(lecture_url)
                try:
                    resume_download_file(session, lecture_url, filename, overwrite)
                except Exception as e:
                    print(e)
                    print('Error, adding it to the retry list')
                    retry_list.append((lecture_url, filename))

    retry_times = 0
    while len(retry_list) != 0 and retry_times < 3:
        print('%d items should be retried, retrying...' % len(retry_list))
        tmp_list = [item for item in retry_list]
        retry_times += 1
        for (url, filename) in tmp_list:
            try:
                print(url)
                print(filename)
                resume_download_file(session, url, filename, overwrite)
            except Exception as e:
                print(e)
                print('Error, keeping it in the retry list')
                continue
            retry_list.remove((url, filename))

    if len(retry_list) != 0:
        print('%d items failed, please check them' % len(retry_list))
    else:
        print('All done.')
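# Every snippet in this section leans on small helpers such as `clean_filename`
# and `mkdir_p` that are imported from each project's utils module rather than
# defined here. A minimal sketch of what they are assumed to do follows; the
# exact character set each project strips may differ.
import errno
import os
import re


def clean_filename(s, minimal_change=False):
    """Strip characters that are unsafe in file names.

    `minimal_change` mirrors the `unrestricted_filenames`/`intact_fnames`
    flag seen in the Coursera snippets: when True, only characters that are
    illegal on common filesystems are removed.
    """
    s = s.strip()
    if minimal_change:
        return re.sub(r'[\\/:*?"<>|]', '', s)
    # Replace whitespace and drop anything outside a conservative set.
    s = s.replace(' ', '_')
    return re.sub(r'[^\w.\-]', '', s)


def mkdir_p(path):
    """Create `path` like `mkdir -p`, ignoring 'already exists' errors."""
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST or not os.path.isdir(path):
            raise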
def _extract_links_from_asset_tags_in_text(self, text):
    """
    Scan the text and extract asset tags and links to corresponding files.

    @param text: Page text.
    @type text: str

    @return: @see CourseraOnDemand._extract_links_from_text
    """
    # Extract asset tags from instructions text
    asset_tags_map = self._extract_asset_tags(text)
    ids = list(iterkeys(asset_tags_map))
    if not ids:
        return {}

    # Asset tags contain asset names and ids. We need to make another
    # HTTP request to get each asset's URL.
    asset_urls = self._extract_asset_urls(ids)

    supplement_links = {}

    # Build supplement links, providing nice titles along the way
    for asset in asset_urls:
        title = clean_filename(asset_tags_map[asset['id']]['name'],
                               self._unrestricted_filenames)
        extension = clean_filename(
            asset_tags_map[asset['id']]['extension'].strip(),
            self._unrestricted_filenames)
        url = asset['url'].strip()
        if extension not in supplement_links:
            supplement_links[extension] = []
        supplement_links[extension].append((url, title))

    return supplement_links
def addUniversity(self, data):
    cursor = self.conn.cursor()
    sql = ("INSERT INTO university.university "
           "(u_name, kind, descri, pdf1_path, pdf2_path, url_path, reward, "
           "medal1, medal2, medal3, medal4, medal5) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    val = (data.get('u_name'), data.get('kind'), data.get('descri'),
           utils.clean_filename(data.get('pdf1_path')),
           utils.clean_filename(data.get('pdf2_path')),
           data.get('url_path'), data.get('reward'),
           data.get('medal1'), data.get('medal2'), data.get('medal3'),
           data.get('medal4'), data.get('medal5'))
    log.info(val)
    try:
        cursor.execute(sql, val)
        self.conn.commit()
        return True
    except Exception as e:
        log.info("query '{}' with params {} failed with {}".format(sql, val, e))
        log.info(cursor._executed)
        self.conn.rollback()
        raise e
    finally:
        cursor.close()
        self.conn.close()
def _extract_links_from_a_tags_in_text(self, text):
    """
    Extract supplement links from the html text that contains <a> tags
    with href attribute.

    @param text: HTML text.
    @type text: str

    @return: Dictionary with supplement links grouped by extension.
    @rtype: {
        '<extension1>': [
            ('<link1>', '<title1>'),
            ('<link2>', '<title2>')
        ],
        '<extension2>': [
            ('<link3>', '<title3>'),
            ('<link4>', '<title4>')
        ]
    }
    """
    soup = BeautifulSoup(text)
    links = [item['href'].strip()
             for item in soup.find_all('a') if 'href' in item.attrs]
    links = sorted(set(links))
    supplement_links = {}

    for link in links:
        filename, extension = os.path.splitext(clean_url(link))
        # Some courses put links to sites in supplement section, e.g.:
        # http://pandas.pydata.org/
        if extension == '':
            continue

        # Make lowercase and cut the leading/trailing dot
        extension = clean_filename(extension.lower().strip('.').strip(),
                                   self._unrestricted_filenames)
        basename = clean_filename(os.path.basename(filename),
                                  self._unrestricted_filenames)
        if extension not in supplement_links:
            supplement_links[extension] = []
        # Putting basename into the second slot of the tuple is important
        # because that will allow to download many supplements within a
        # single lecture, e.g.:
        # 01_slides-presented-in-this-module.pdf
        # 01_slides-presented-in-this-module_Dalal-cvpr05.pdf
        # 01_slides-presented-in-this-module_LM-3dtexton.pdf
        supplement_links[extension].append((link, basename))

    return supplement_links
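# A quick input/output illustration of the grouping the extractor above
# produces. The HTML and filenames here are made up for the example and are
# not from the original source.
html = '''
<a href="https://example.org/slides/module1.pdf">Slides</a>
<a href="https://example.org/data/texton.zip">Data</a>
<a href="http://pandas.pydata.org/">Docs site</a>
'''
# A page like this would yield a mapping grouped by extension, with the bare
# site link skipped because it has no extension:
# {
#     'pdf': [('https://example.org/slides/module1.pdf', 'module1')],
#     'zip': [('https://example.org/data/texton.zip', 'texton')],
# }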
def parse_course_detail(content, doc_only):
    """Parse course video and doc detail from the response body or an xxx.json file."""
    # json_file_path = os.path.join(output_folder, '{}.json'.format(tid))
    # if os.path.exists(json_file_path):
    #     return json.load(open(json_file_path, 'r', encoding='utf-8'))
    term = dict()
    last_week_name = ''
    last_lesson_name = ''
    for line in content.splitlines():
        line = line.decode('unicode_escape')

        week_match = week_ptn.findall(line)
        if week_match:
            last_week_name = clean_filename(week_match[0])
            term[last_week_name] = dict()
            logger.info(last_week_name)
            continue

        lesson_match = lesson_ptn.findall(line)
        if lesson_match and last_week_name in term:
            last_lesson_name = clean_filename(lesson_match[0])
            term[last_week_name][last_lesson_name] = list()
            logger.info('  %s', last_lesson_name)
            continue

        if not doc_only:
            video_match = video_ptn.findall(line)
            if video_match and last_lesson_name in term[last_week_name]:
                content_id, _id, lecture_name, term_id = video_match[0]
                file_url = get_file_url(content_id, _id)
                postfix = 'mp4' if 'mp4' in file_url else 'flv'
                term[last_week_name][last_lesson_name].append(
                    ('{}.{}'.format(lecture_name, postfix), file_url))
                logger.info('    %s', '{}.{}'.format(lecture_name, postfix))

        doc_match = doc_ptn.findall(line)
        if doc_match and last_lesson_name in term[last_week_name]:
            content_id, _id, lecture_name, term_id = doc_match[0]
            file_url = get_file_url(content_id, _id, file_type='doc')
            postfix = 'doc' if '.doc' in file_url else 'pdf'
            term[last_week_name][last_lesson_name].append(
                ('{}.{}'.format(lecture_name, postfix), file_url))
            logger.info('    %s', '{}.{}'.format(lecture_name, postfix))

    if last_week_name == '':
        raise ParseException('no video information in response body, %s'
                             % content.decode('unicode_escape'))
    # dump_course_detail(term, json_file_path)
    return term
def _add_asset(name, url, destination):
    filename, extension = os.path.splitext(clean_url(name))
    if extension == '':
        return

    extension = clean_filename(extension.lower().strip('.').strip(),
                               self._unrestricted_filenames)
    basename = clean_filename(os.path.basename(filename),
                              self._unrestricted_filenames)
    url = url.strip()
    if extension not in destination:
        destination[extension] = []
    destination[extension].append((url, basename))
def validate_link(url):
    """Parse the course argument passed on the command line.

    Prefer the tid of the first run of the course: if the argument points at
    the latest run, only part of the videos may have been released.
    """
    course_page_url = 'http://www.icourse163.org/course/{}'
    part_param_ptn = re.compile(r'([A-Za-z0-9-]+)\?tid=(\d+)')
    url_param_ptn = re.compile(r'course/([A-Za-z0-9-]+)')
    course_name_ptn = re.compile(r'keywords" content="(.+?)"/>')
    tid_ptn = re.compile(r'id : "(\d+)",\ncourseId :')

    part_match = part_param_ptn.findall(url)
    url_match = url_param_ptn.findall(url)
    if part_match:
        course_id = part_match[0][0]
    elif url_match:
        course_id = url_match[0]
    else:
        raise ParamsException('course url or parameters error, %s' % url)

    resp = retry_request(course_page_url.format(course_id), method='GET')
    tid_match = tid_ptn.findall(resp.text)
    if tid_match:
        tid = tid_match[0]
    elif part_match:
        tid = part_match[0][1]
    else:
        raise ParamsException('course url or parameters error, %s' % url)

    course_name_match = course_name_ptn.findall(resp.text)
    course_name = course_name_match[0] if course_name_match else course_id
    course_name = clean_filename(course_name.replace(',中国大学MOOC(慕课)', ''))
    logger.info('parse link success, name:%s, tid:%s', course_name, tid)
    return course_name, tid
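# A quick usage sketch for validate_link. The course id and tid below are
# placeholders, not real values.
name, tid = validate_link('SOMEUNIV-001?tid=123456')
# -> requests http://www.icourse163.org/course/SOMEUNIV-001, scrapes the
#    first-run tid from the course page, and falls back to 123456 only if
#    the page does not expose one.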
def video_appxRankPooling(source, dest, n_jobs, buffer_size, img_ext):
    print(". Executing appx_rank_pool on video...")
    safe_mkdir(dest)

    # Run appx rank pool for each video in every class folder.
    for class_folder in os.listdir(source):
        video_files = search_files_recursively(
            os.path.join(source, class_folder))
        outfolder = os.path.join(dest, class_folder)
        safe_mkdir(outfolder)

        # Take only the basename of each video url, clean the name of dots
        # and whitespace, and use that basename for the output image name.
        outdir = [
            os.path.join(outfolder, clean_filename(get_basename(video_file)))
            for video_file in video_files
        ]
        # TODO: optimise this extension duplication given every element is constant
        img_exts = [img_ext] * len(outdir)
        buffer_sizes = [buffer_size] * len(outdir)

        print(". Current class folder: %s, total:%d"
              % (class_folder, len(video_files)))
        run_args = list(zip(video_files, outdir, img_exts, buffer_sizes))
        results = Pool(n_jobs).starmap(run_video_appx_rank_pooling, run_args)
        print(". Finished %s." % class_folder)
def post(self):
    file = request.files['file']
    log.info(file.content_length)
    if file and allowed_file(file.filename):
        filename_ok = utils.clean_filename(file.filename)
        log.info('file name: ' + filename_ok)
        file.save(os.path.join('./univer/upload', filename_ok))
        return {"data": filename_ok, "status": 200, "message": "success"}, 200
    else:
        log.info('invalid filename')
        # "錯誤檔案格式" = "wrong file format"
        return {"status": 400, "message": "錯誤檔案格式"}, 200
def download_images_from_link_list(img_links, img_path, record_file_name):
    print_log('Downloading images from the link list')
    start_time = time.time()
    total = len(img_links)
    record_file = open(record_file_name, 'a')
    for index, link in enumerate(img_links):
        print_log('Image %s / %s' % (str(index + 1), total))
        urllib.urlretrieve(
            link,
            filename=os.path.join(img_path,
                                  utils.clean_filename(link[link.rfind('/') + 1:])),
            reporthook=schedule)
        record_file.write(link + '\n')
    end_time = time.time()
    record_file.close()
    print_log('Downloading all images took %s seconds'
              % str(round(end_time - start_time, 2)))
def editUniversity(self, data):
    cursor = self.conn.cursor()
    sql = ("UPDATE university.university SET u_name = %s, kind = %s, "
           "descri = %s, pdf1_path = %s, pdf2_path = %s, url_path = %s, "
           "reward = %s, medal1 = %s, medal2 = %s, medal3 = %s, "
           "medal4 = %s, medal5 = %s WHERE u_id = %s")
    val = (data.get('u_name'), data.get('kind'), data.get('descri'),
           utils.clean_filename(data.get('pdf1_path')),
           utils.clean_filename(data.get('pdf2_path')),
           data.get('url_path'), data.get('reward'),
           data.get('medal1'), data.get('medal2'), data.get('medal3'),
           data.get('medal4'), data.get('medal5'),
           int(data.get("u_id")))
    log.info(val)
    try:
        cursor.execute(sql, val)
        self.conn.commit()
        return True
    except Exception as e:
        log.info("query '{}' with params {} failed with {}".format(sql, val, e))
        log.info(cursor._executed)
        self.conn.rollback()
        raise e
    finally:
        cursor.close()
        self.conn.close()
def post(self):
    token = request.cookies.get('access_token')
    log.info(token)
    if utils.JWTdecode(token) == False:
        return redirect("/login", code=302)
    file = request.files['file']
    log.info(file.content_length)
    if file and allowed_file(file.filename):
        filename_ok = utils.clean_filename(file.filename)
        log.info('file name: ' + filename_ok)
        file.save(os.path.join('./university/upload', filename_ok))
        return {
            "data": filename_ok,
            "status": 200,
            "message": "success"
        }, 200
    else:
        log.info('invalid filename')
        # "錯誤檔案格式" = "wrong file format"
        # return Response({"status": 400, "message": "錯誤檔案格式"}, status=500)
        return {"status": 400, "message": "錯誤檔案格式"}, 200
def print_vedio_name(session, leclist, path=''):
    # Python 2 snippet: writes every lesson name and lecture URL to a file.
    print "path\n"
    print path
    video_file = open(path + 'vedio.txt', 'wb')
    video_file.truncate()
    for week in leclist:
        cur_week = week[0]
        lessons = week[1]
        for lesson in lessons:
            cur_lesson = lesson[0].encode("utf-8")
            lectures = lesson[1]
            cur_week = clean_filename(cur_week)
            video_file.write(cur_lesson)
            video_file.write("\n")
            for (lecnum, (lecture_url, lecture_name)) in enumerate(lectures):
                video_file.write(lecture_url.encode("utf-8"))
                video_file.write("\n")
    video_file.close()
def main():
    args = parse_args()

    if args.username is None:
        print('No username specified.')
        sys.exit(1)
    if args.password is None:
        print('No password specified.')
        sys.exit(1)

    user_email = args.username
    user_pswd = args.password
    course_link = args.course_url[0]
    path = args.path
    overwrite = args.overwrite

    regex = r'(?:https?://)(?P<site>[^/]+)/(?P<baseurl>[^/]+)/(?P<coursename>[^/]+)/?'
    m = re.match(regex, args.course_url[0])
    if m is None:
        print('The URL provided is not valid for icourse163.')
        sys.exit(0)

    md = md5.new()
    md.update(user_pswd)
    encryptedpswd = md.hexdigest()

    if m.group('site') in ['www.icourse163.org']:
        login_data = {
            'product': 'imooc',
            'url': 'http://www.icourse163.org/mooc.htm?#/index',
            'savelogin': 1,
            'domains': 'icourse163.org',
            'type': 0,
            'append': 1,
            'username': user_email,
            'password': encryptedpswd
        }
        login_success_flag = '正在登录,请稍等...'  # "Logging in, please wait..."
        web_host = 'www.icourse163.org'
        regex_loc = r'window.location.replace\(\"(http:\/\/reg\.icourse163\.org\/next\.jsp.+)\"\)'
    elif m.group('site') in ['mooc.study.163.com']:
        login_data = {
            'product': 'study',
            'url': 'http://study.163.com?from=study',
            'savelogin': 1,
            'domains': '163.com',
            'type': 0,
            'append': 1,
            'username': user_email,
            'password': encryptedpswd
        }
        login_success_flag = '登录成功,正在跳转'  # "Login successful, redirecting"
        web_host = 'mooc.study.163.com'
        regex_loc = r'window.location.replace\(\"(http:\/\/study\.163\.com\?from=study)\"\)'
    else:
        print('The URL provided is not valid for icourse163.')
        sys.exit(0)

    path = os.path.join(path, clean_filename(m.group('coursename')))

    login_url = 'https://reg.163.com/logins.jsp'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
    }
    session = requests.Session()
    session.headers.update(headers)

    r1 = session.post(login_url, data=login_data)
    success = re.search(login_success_flag, r1.content)
    if not success:
        print('Failed to login.')
        exit(2)
    else:
        print('Login done...')

    se = re.search(regex_loc, r1.content)
    r = session.get(se.group(1), allow_redirects=True,
                    cookies={'NTES_PASSPORT': session.cookies['NTES_PASSPORT']})

    # Get the course id; it ends up in cid.group(1).
    r2 = session.get(course_link)
    cid = re.search(r'window\.termDto = { id:([0-9]+),', r2.content)
    if cid is None:
        cid = re.search(r'termId : \"([0-9]+)\",', r2.content)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'text/plain',
        'Cookie': 'STUDY_SESS=%s; ' % session.cookies['STUDY_SESS'],
        'Host': web_host,
    }
    session.headers.update(headers)
    params = {
        'callCount': 1,
        'scriptSessionId': '${scriptSessionId}190',
        'httpSessionId': 'e8890caec7fe435d944c0f318b932719',
        'c0-scriptName': 'CourseBean',
        'c0-id': 0,
        'c0-methodName': 'getLastLearnedMocTermDto',
        'c0-param0': 'number:' + cid.group(1),
        'batchId': 434820,  # arbitrary
    }
    getcourse_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
    r3 = session.post(getcourse_url, data=params)

    print('Parsing...', end="")
    syllabus = parse_syllabus_icourse163(session, r3.content)
    if syllabus:
        print('Done.')
    else:
        print('Failed. No course content on the page.')
        sys.exit(0)

    print('Save files to %s' % path)
    download_syllabus_icourse163(session, syllabus, path)
def main(username, password, params):
    print('username:%s, password:%s, params:%s' % (username, password, params))
    # if sys.argv[1] is None:
    #     print('Missing username argument, e.g. python icourse163.py username password param')
    #     sys.exit(1)
    # if sys.argv[2] is None:
    #     print('Missing password argument, e.g. python icourse163.py username password param')
    #     sys.exit(1)
    # if sys.argv[3] is None:
    #     print('Missing course link argument, e.g. python icourse163.py username password param')
    #     sys.exit(1)

    # NUDT-42003 is the school course id; tid is the course id on icourse163 (MOOC)
    # course_link = sys.argv[3]
    course_link = params
    path = './'
    course_link_pattern = r'(?P<s_course_id>[^/]+)\?tid=(?P<mooc_tid>[^/]+)'
    m = re.match(course_link_pattern, course_link)
    if m is None:
        print('The URL provided is not recognized!')
        sys.exit(0)
    s_course_id = m.group('s_course_id')
    mooc_tid = m.group('mooc_tid')
    path = os.path.join(path, clean_filename(s_course_id))

    # 1. Log in
    login_url = 'http://login.icourse163.org/reg/icourseLogin.do'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Referer': 'http://www.icourse163.org/member/login.htm',
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    login_data = {
        'returnUrl': 'aHR0cDovL3d3dy5pY291cnNlMTYzLm9yZy9pbmRleC5odG0=',
        'failUrl': 'aHR0cDovL3d3dy5pY291cnNlMTYzLm9yZy9tZW1iZXIvbG9naW4uaHRtP2VtYWlsRW5jb2RlZD1Nek16TXpNeU1qTTE=',
        'savelogin': '******',
        'oauthType': '',
        'username': username,
        'passwd': password
    }
    web_host = 'www.icourse163.org'
    session = requests.Session()
    session.headers.update(headers)
    session.post(login_url, data=login_data)
    print('Login done...')

    # 2. Fetch course information
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'text/plain',
        'Cookie': 'STUDY_SESS=%s; ' % session.cookies['STUDY_SESS'],
        'Host': web_host,
    }
    params = {
        'callCount': 1,
        'scriptSessionId': '${scriptSessionId}190',
        'httpSessionId': 'e8890caec7fe435d944c0f318b932719',
        'c0-scriptName': 'CourseBean',
        'c0-methodName': 'getLastLearnedMocTermDto',
        'c0-id': 0,
        'c0-param0': 'number:' + mooc_tid,
        'batchId': 434820,  # arbitrary
    }
    session.headers.update(headers)
    getcourse_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
    r3 = session.post(getcourse_url, data=params)

    print('Parsing...', end="")
    # Parse the main page
    syllabus = parse_syllabus_icourse163(session, r3.content)
    if syllabus:
        print('Done.')
    else:
        print('Failed. No course content on the page.')
        sys.exit(0)

    print('Save files to %s' % path)
    # Download the data
    download_syllabus_icourse163(session, syllabus, path)
import time
import os
import logging
import numpy as np
from camera import Camera
from utils import clean_filename
import game
import simulation
import simfileplayer
import cPickle as pickle
import copy
import zipfile

# NOTE: the `or` always picks the literal path; the join is a fallback that
# is currently never used.
logs_directory = '/tmp/gravipy_log' or os.path.join(
    os.path.dirname(os.path.realpath(__file__)), 'log')
if os.path.exists(logs_directory) and not os.path.isdir(logs_directory):
    raise IOError("Log directory choice is not a real directory!")
current_log = os.path.join(logs_directory, clean_filename(time.asctime()))
os.makedirs(current_log, mode=0744)

formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh = logging.FileHandler(os.path.join(current_log, 'run.log'))
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
ch = logging.StreamHandler()
ch.setLevel(logging.WARNING)
ch.setFormatter(formatter)
log = logging.getLogger(__name__)
log.setLevel(logging.WARNING)
log.addHandler(ch)
def main():
    args = parse_args()

    if args.username is None:
        print('No username specified.')
        sys.exit(1)
    if args.password is None:
        print('No password specified.')
        sys.exit(1)

    user_email = args.username
    user_pswd = args.password
    course_link = args.course_url[0]
    path = args.path
    overwrite = args.overwrite

    regex = r'(?:https?://)(?P<site>[^/]+)/(?P<baseurl>[^/]+)/(?P<coursename>[^/]+)/?'
    m = re.match(regex, args.course_url[0])
    if m is None:
        print('The URL provided is not valid for icourse163.')
        sys.exit(0)

    md = md5.new()
    md.update(user_pswd)
    encryptedpswd = md.hexdigest()

    if m.group('site') in ['www.icourse163.org']:
        login_data = {
            'product': 'imooc',
            'url': 'http://www.icourse163.org/mooc.htm?#/index',
            'savelogin': 1,
            'domains': 'icourse163.org',
            'type': 0,
            'append': 1,
            'username': user_email,
            'password': encryptedpswd
        }
        login_success_flag = '正在登录,请稍等...'  # "Logging in, please wait..."
        web_host = 'www.icourse163.org'
    elif m.group('site') in ['mooc.study.163.com']:
        login_data = {
            'product': 'study',
            'url': 'http://study.163.com?from=study',
            'savelogin': 1,
            'domains': '163.com',
            'type': 0,
            'append': 1,
            'username': user_email,
            'password': encryptedpswd
        }
        login_success_flag = '登录成功,正在跳转'  # "Login successful, redirecting"
        web_host = 'mooc.study.163.com'
    else:
        print('The URL provided is not valid for icourse163.')
        sys.exit(0)

    path = os.path.join(path, clean_filename(m.group('coursename')))

    login_url = 'https://reg.163.com/logins.jsp'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
    }
    session = requests.Session()
    session.headers.update(headers)

    r1 = session.post(login_url, data=login_data)
    success = re.search(login_success_flag, r1.content)
    if not success:
        print('Failed to login.')
        exit(2)
    else:
        print('Login successful...')

    se = re.search(r'window.location.replace\(\"(.+)\"\)', r1.content)
    r = session.get(se.group(1), allow_redirects=True,
                    cookies={'NTES_PASSPORT': session.cookies['NTES_PASSPORT']})

    # Get the course id; it ends up in cid.group(1).
    r2 = session.get(course_link)
    cid = re.search(r'window\.termDto = { id:([0-9]+),', r2.content)
    if cid is None:
        cid = re.search(r'termId : \"([0-9]+)\",', r2.content)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'text/plain',
        'Cookie': 'STUDY_SESS=%s; ' % session.cookies['STUDY_SESS'],
        'Host': web_host,
    }
    session.headers.update(headers)
    params = {
        'callCount': 1,
        'scriptSessionId': '${scriptSessionId}190',
        'httpSessionId': 'e8890caec7fe435d944c0f318b932719',
        'c0-scriptName': 'CourseBean',
        'c0-id': 0,
        'c0-methodName': 'getLastLearnedMocTermDto',
        'c0-param0': 'number:' + cid.group(1),
        'batchId': 434820,  # arbitrary
    }
    getcourse_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
    r3 = session.post(getcourse_url, data=params)

    print('Parsing...', end="")
    syllabus = parse_syllabus_icourse163(session, r3.content)
    if syllabus:
        print('Successful.')
    else:
        print('Failed.')

    print('Save files to %s' % path)
    download_syllabus_icourse163(session, syllabus, path)
def get_download_urls(tid, doc_only=False):
    """Get the download links.

    Args:
        tid: id of the course run.
        doc_only: whether to download only the courseware (documents).
    """
    data = {
        'callCount': '1',
        'scriptSessionId': '${scriptSessionId}190',
        'httpSessionId': sess.cookies.get('NTESSTUDYSI', 'b427803d95384cf496d3240af2526a60'),
        'c0-scriptName': 'CourseBean',
        'c0-methodName': 'getLastLearnedMocTermDto',
        'c0-id': '0',
        'c0-param0': 'number:{}'.format(tid),
        'batchId': '1506485521617'
    }
    custom_header = {
        'Accept': '*/*',
        'Content-Type': 'text/plain',
    }
    try:
        response = retry_request(COURSE_DETAIL_URL, data=data,
                                 headers=custom_header, timeout=20)
        if not response.ok:
            raise RequestExcetpion(
                'bad status code while fetching video links: {}'.format(
                    response.status_code))
    except Exception as e:
        raise RequestExcetpion('HTTP request for video links failed, {}'.format(e))

    # Parse the response data. Each line may represent one of the node types
    # below; each lecture may be a video or a document.
    # |--week1
    #    |--lesson1.1
    #       |--lecture1.1.1
    #       |--lecture1.1.2
    #    |--lesson1.2
    term = OrderedDict()
    last_week_name = ''
    last_lesson_name = ''
    if response.ok:
        for line in response.content.splitlines():
            line = line.decode('unicode_escape')

            # Parse a week
            week_match = week_ptn.findall(line)
            if week_match:
                last_week_name = clean_filename(week_match[0])
                term[last_week_name] = OrderedDict()
                logger.info(last_week_name)
                continue

            # Parse a lesson
            lesson_match = lesson_ptn.findall(line)
            if lesson_match and last_week_name in term:
                last_lesson_name = clean_filename(lesson_match[0])
                term[last_week_name][last_lesson_name] = OrderedDict()
                logger.info('  %s', last_lesson_name)
                continue

            # Parse a video
            if not doc_only:
                video_match = video_ptn.findall(line)
                if video_match and last_lesson_name in term[last_week_name]:
                    content_id, _id, lecture_name, term_id = video_match[0]
                    lecture_name = clean_filename(lecture_name)
                    file_url = get_video_doc_url(content_id, _id)
                    postfix = 'mp4' if 'mp4' in file_url else 'flv'
                    term[last_week_name][last_lesson_name][
                        '{}.{}'.format(lecture_name, postfix)] = file_url
                    logger.info('    %s', '{}.{}'.format(lecture_name, postfix))

            # Parse a document
            doc_match = doc_ptn.findall(line)
            if doc_match and last_lesson_name in term[last_week_name]:
                content_id, _id, lecture_name, term_id = doc_match[0]
                lecture_name = clean_filename(lecture_name)
                file_url = get_video_doc_url(content_id, _id, file_type='doc')
                postfix = 'doc' if '.doc' in file_url else 'pdf'
                term[last_week_name][last_lesson_name][
                    '{}.{}'.format(lecture_name, postfix)] = file_url
                logger.info('    %s', '{}.{}'.format(lecture_name, postfix))

    if last_week_name == '':
        raise ParseException('no weekly course name list found')

    term = reindex_file_name(term)
    return term
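# `reindex_file_name` is project-internal and not shown in this section. A
# plausible sketch follows; it is an assumption, not the project's code. The
# idea is to prefix week/lesson/file names with a 2-digit index so downloads
# sort in course order.
from collections import OrderedDict


def reindex_file_name(term):
    """Hypothetical sketch: prefix each week, lesson, and file name with its
    position. The real implementation may differ."""
    reindexed = OrderedDict()
    for w_i, (week, lessons) in enumerate(term.items(), 1):
        new_lessons = OrderedDict()
        for l_i, (lesson, files) in enumerate(lessons.items(), 1):
            new_files = OrderedDict(
                ('{:02d}_{}'.format(f_i, name), url)
                for f_i, (name, url) in enumerate(files.items(), 1))
            new_lessons['{:02d}_{}'.format(l_i, lesson)] = new_files
        reindexed['{:02d}_{}'.format(w_i, week)] = new_lessons
    return reindexed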
def parse_old_style_syllabus(session, page, reverse=False,
                             unrestricted_filenames=False,
                             subtitle_language='en'):
    """
    Parse an old style Coursera course listing/syllabus page.

    Each section is a week of classes.
    """
    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    stags = soup.findAll(attrs={'class': re.compile('^course-item-list-header')})
    for stag in stags:
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, unrestricted_filenames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, unrestricted_filenames)
            logging.info('  %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                untouched_fname = a.get('title', '')
                title = clean_filename(untouched_fname, unrestricted_filenames)
                fmt = get_anchor_format(href)
                if fmt in ('srt', 'txt') and subtitle_language != 'en':
                    title = title.replace('_en&format',
                                          '_' + subtitle_language + '&format')
                    href = href.replace('_en&format',
                                        '_' + subtitle_language + '&format')

                logging.debug('    %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_old_style_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        logging.warning('Could not get resource: %s',
                                        lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(session,
                                                     a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug('    %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error('The cookies file may be invalid, '
                      'please re-run with the `--clear-cache` option.')

    return sections
def upload_path_handler(instance, filename): return "images/news/{title}/{file}".format(title=slugify(instance.title), file=clean_filename(filename))
def download_syllabus_study163(session, syllabus, path='', overwrite=False):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'study.163.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    }
    get_token_url = 'http://study.163.com/video/getVideoAuthorityToken.htm'
    session.headers.update(headers)

    course_id = syllabus[0]
    print('-----')
    print(course_id)
    course = syllabus[1]
    retry_list = []
    for (chapter_num, (chapter, lessons)) in enumerate(course):
        chapter_name = clean_filename(chapter)
        dir = os.path.join(path, ('%02d %s' % (chapter_num + 1, chapter_name)))
        print(dir)
        if not os.path.exists(dir):
            mkdir_p(dir)
        for (lesson_num, (lesson_url, lesson_name)) in enumerate(lessons):
            print('lesson_num: ', end="")
            print(lesson_num)
            print('lesson_name: ' + lesson_name.decode('raw_unicode_escape'))
            lesson_name = clean_filename(lesson_name.decode('raw_unicode_escape'))
            filename = os.path.join(dir, '%02d_%s.mp4' % (lesson_num + 1, lesson_name))
            print(filename)
            if overwrite or not os.path.exists(filename):
                try:
                    r = session.get(get_token_url)
                    video_url = lesson_url
                    download_file(video_url, filename)
                except Exception as e:
                    print(e)
                    print('Error (initial pass), adding it to the retry list')
                    retry_list.append((lesson_url, filename))
            else:
                print('Already downloaded')

    retry_times = 0
    while len(retry_list) != 0 and retry_times < 3:
        print('%d items should be retried, retrying...' % len(retry_list))
        tmp_list = [item for item in retry_list]
        retry_times += 1
        for (url, filename) in tmp_list:
            try:
                print(url)
                print(filename)
                r = session.get(get_token_url)
                video_url = url
                download_file(video_url, filename)
            except Exception as e:
                print(e)
                print('Error (retry), keeping it in the retry list')
                print('lesson_url:' + url)
                continue
            retry_list.remove((url, filename))

    if len(retry_list) != 0:
        print('%d items failed, please check them' % len(retry_list))
    else:
        print('All done.')
def parse_old_style_syllabus(session, page, reverse=False, intact_fnames=False,
                             subtitle_language="en"):
    """
    Parse an old style Coursera course listing/syllabus page.

    Each section is a week of classes.
    """
    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    stags = soup.findAll(attrs={"class": re.compile("^course-item-list-header")})
    for stag in stags:
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, intact_fnames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll("li"):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, intact_fnames)
            logging.info("  %s", vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll("a"):
                href = fix_url(a["href"])
                untouched_fname = a.get("title", "")
                title = clean_filename(untouched_fname, intact_fnames)
                fmt = get_anchor_format(href)
                if fmt in ("srt", "txt") and subtitle_language != "en":
                    title = title.replace("_en&format",
                                          "_" + subtitle_language + "&format")
                    href = href.replace("_en&format",
                                        "_" + subtitle_language + "&format")

                logging.debug("    %s %s", fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_old_style_video(session, lecture_page)
                        lecture["mp4"] = lecture.get("mp4", [])
                        lecture["mp4"].append((fix_url(href), ""))
                    except TypeError:
                        logging.warning("Could not get resource: %s",
                                        lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if "mp4" not in lecture:
                for a in vtag.findAll("a"):
                    if a.get("data-modal-iframe"):
                        href = grab_hidden_video_url(session,
                                                     a["data-modal-iframe"])
                        href = fix_url(href)
                        fmt = "mp4"
                        logging.debug("    %s %s", fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ""))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], "")
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], "{0:d}_{1}".format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info("Found %d sections and %d lectures on this page",
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error("The cookies file may be invalid, "
                      "please re-run with the `--clear-cache` option.")

    return sections
def download_syllabus_study163(session, syllabus, path='', overwrite=False):
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    }
    get_token_url = 'http://study.163.com/video/getVideoAuthorityToken.htm'
    session.headers.update(headers)

    course_id = syllabus[0]
    course = syllabus[1]
    retry_list = []
    for (chapter_num, (chapter, lessons)) in enumerate(course):
        chapter_name = clean_filename(chapter)
        dir = os.path.join(path, ('%02d %s' % (chapter_num + 1, chapter_name)))
        if not os.path.exists(dir):
            mkdir_p(dir)
        for (lesson_num, (lesson_url, lesson_name)) in enumerate(lessons):
            fmt = lesson_url.split('.')[-1]
            lesson_name = clean_filename(lesson_name.decode('raw_unicode_escape'))
            filename = os.path.join(
                dir, '%02d_%s.%s' % (lesson_num + 1, lesson_name, fmt))
            print(filename)
            if overwrite or not os.path.exists(filename):
                try:
                    r = session.get(get_token_url)
                    video_url_suffix = (
                        '88C752A6C3513A0A5EEFA4CD7091A96E365D0185B8133CC883910200B043BC0F57E3024A35D1C582757D905A6B9289E9f4eej632de59'
                        + r.content)
                    video_url = lesson_url + '?key=' + video_url_suffix
                    download_file(session, video_url, filename)
                except Exception as e:
                    print(e)
                    print('Error, adding it to the retry list')
                    retry_list.append((lesson_url, filename))
            else:
                print('Already downloaded')

    retry_times = 0
    while len(retry_list) != 0 and retry_times < 3:
        print('%d items should be retried, retrying...' % len(retry_list))
        tmp_list = [item for item in retry_list]
        retry_times += 1
        for (url, filename) in tmp_list:
            try:
                print(url)
                print(filename)
                r = session.get(get_token_url)
                video_url_suffix = (
                    '88C752A6C3513A0A5EEFA4CD7091A96E365D0185B8133CC883910200B043BC0F57E3024A35D1C582757D905A6B9289E9f4eej632de59'
                    + r.content)
                video_url = url + '?key=' + video_url_suffix
                download_file(session, video_url, filename)
            except Exception as e:
                print(e)
                print('Error, keeping it in the retry list')
                continue
            retry_list.remove((url, filename))

    if len(retry_list) != 0:
        print('%d items failed, please check them' % len(retry_list))
    else:
        print('All done.')
def parse_syllabus(session, page, reverse=False):
    """
    Parses a Coursera course listing/syllabus page.

    Each section is a week of classes.
    """
    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    for stag in soup.findAll(attrs={'class':
                                    re.compile('^course-item-list-header')}):
        assert stag.contents[0] is not None, "couldn't find section"
        section_name = clean_filename(stag.contents[0].contents[1])
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            vname = clean_filename(vtag.a.contents[0])
            logging.info('  %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                title = clean_filename(a.get('title', ''))
                fmt = get_anchor_format(href)
                logging.debug('    %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        logging.warning('Could not get resource: %s',
                                        lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(session,
                                                     a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug('    %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error('Probably bad cookies file (or wrong class name)')

    return sections
def parse_syllabus(session, page, reverse=False, intact_fnames=False):
    """
    Parses a Coursera course listing/syllabus page.

    Each section is a week of classes.
    """
    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    for stag in soup.findAll(
            attrs={'class': re.compile('^course-item-list-header')}):
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, intact_fnames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, intact_fnames)
            logging.info('  %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                untouched_fname = a.get('title', '')
                title = clean_filename(untouched_fname, intact_fnames)
                fmt = get_anchor_format(href)
                logging.debug('    %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        logging.warning('Could not get resource: %s',
                                        lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(session,
                                                     a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug('    %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error('The cookies file may be invalid, '
                      'please re-run with the `--clear-cache` option.')

    return sections
import game
import sys
import logging
import time
import os
import numpy as np
from camera import Camera
from utils import clean_filename
import simulation

logs_directory = '/tmp/gravipy_log' or os.path.join(
    os.path.dirname(os.path.realpath(__file__)), 'log')
if os.path.exists(logs_directory) and not os.path.isdir(logs_directory):
    raise IOError("Log directory choice is not a real directory!")
current_log = os.path.join(logs_directory, clean_filename(time.asctime()))
os.makedirs(current_log, mode=0744)

formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh = logging.FileHandler(os.path.join(current_log, 'run.log'))
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
ch = logging.StreamHandler()
ch.setLevel(logging.WARNING)
ch.setFormatter(formatter)
log = logging.getLogger(__name__)
log.setLevel(logging.WARNING)
log.addHandler(ch)
def upload_path_handler(instance, filename): return "images/events/{family}/{title}/{file}".format( family=slugify(instance.family), title=slugify(instance.title), file=utils.clean_filename(filename))