Example #1
def download_syllabus_icourse163(session, leclist, path = '', overwrite = False):

    headers = {
                'Accept':'*/*',
                'Accept-Encoding':'gzip, deflate, sdch',
                'Accept-Language':'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
                'Connection':'keep-alive',
                'Host':'v.stu.126.net', #*
                'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
                'X-Requested-With':'ShockwaveFlash/15.0.0.239',
               }

    session.headers.update(headers)

    retry_list = []
    for week in leclist:
        cur_week = week[0]
        lessons = week[1]
        for lesson in lessons:
            cur_lesson = lesson[0]
            lectures = lesson[1]
            cur_week = clean_filename(cur_week)
            cur_lesson = clean_filename(cur_lesson)
            dir = os.path.join(path, cur_week, cur_lesson)
            if not os.path.exists(dir):
                mkdir_p(dir)

            for (lecnum, (lecture_url, lecture_name)) in enumerate(lectures):
                lecture_name = clean_filename(lecture_name)
                filename = os.path.join(dir,"%02d_%s.%s"%(lecnum+1, lecture_name, lecture_url[-3:]))
                print (filename)
                print (lecture_url)
                try:
                    resume_download_file(session, lecture_url, filename, overwrite )
                except Exception as e:
                    print(e)
                    print('Error, add it to retry list')
                    retry_list.append((lecture_url, filename))

    retry_times = 0
    while len(retry_list) != 0 and retry_times < 3:
        print('%d items should be retried, retrying...' % len(retry_list))
        tmp_list = [item for item in retry_list]
        retry_times += 1
        for (url, filename) in tmp_list:
            try:
                print(url)
                print(filename)
                resume_download_file(session, url, filename, overwrite)
            except Exception as e:
                print(e)
                print('Error, add it to retry list')
                continue

            retry_list.remove((url, filename)) 
    
    if len(retry_list) != 0:
        print('%d items failed, please check it' % len(retry_list))
    else:
        print('All done.')
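
Every example on this page calls a clean_filename helper defined in the host project's own utils module, which is why none of the snippets import it explicitly. Implementations differ between repositories; the following is only a minimal sketch of such a helper, in which the regexes, the meaning of the second argument, and the 'untitled' fallback are assumptions:

import re

def clean_filename(s, minimal_change=False):
    """Strip characters that are unsafe in file names.

    A sketch only; each repository above ships its own variant.
    `minimal_change` mirrors the call sites that pass an
    "unrestricted filenames" flag to keep the name mostly intact.
    """
    s = s.strip()
    if not minimal_change:
        # collapse runs of whitespace into single underscores
        s = re.sub(r'\s+', '_', s)
    # drop characters that are invalid on common filesystems
    return re.sub(r'[\\/:*?"<>|]', '', s) or 'untitled'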
Example #3
    def _extract_links_from_asset_tags_in_text(self, text):
        """
        Scan the text and extract asset tags and links to corresponding
        files.

        @param text: Page text.
        @type text: str

        @return: @see CourseraOnDemand._extract_links_from_text
        """
        # Extract asset tags from instructions text
        asset_tags_map = self._extract_asset_tags(text)
        ids = list(iterkeys(asset_tags_map))
        if not ids:
            return {}

        # asset tags contain asset names and ids. We need to make another
        # HTTP request to get asset URL.
        asset_urls = self._extract_asset_urls(ids)

        supplement_links = {}

        # Build supplement links, providing nice titles along the way
        for asset in asset_urls:
            title = clean_filename(asset_tags_map[asset['id']]['name'],
                                   self._unrestricted_filenames)
            extension = clean_filename(
                asset_tags_map[asset['id']]['extension'].strip(),
                self._unrestricted_filenames)
            url = asset['url'].strip()
            if extension not in supplement_links:
                supplement_links[extension] = []
            supplement_links[extension].append((url, title))

        return supplement_links
Example #4
    def addUniversity(self, data):
        cursor = self.conn.cursor()
        sql = "INSERT INTO university.university (u_name, kind, descri, pdf1_path, pdf2_path, url_path, reward, medal1, medal2, medal3, medal4, medal5) \
                VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"

        val = (data.get('u_name'), data.get('kind'), data.get('descri'),
               utils.clean_filename(data.get('pdf1_path')),
               utils.clean_filename(data.get('pdf2_path')),
               data.get('url_path'), data.get('reward'), data.get('medal1'),
               data.get('medal2'), data.get('medal3'), data.get('medal4'),
               data.get('medal5'))
        log.info(val)
        try:
            cursor.execute(sql, val)
            self.conn.commit()
            return True
        except Exception as e:
            log.info("query '{}' with params {} failed with {}".format(
                sql, val, e))
            log.info(cursor._executed)
            self.conn.rollback()
            raise e
        finally:
            cursor.close()
            self.conn.close()
Example #5
    def _extract_links_from_a_tags_in_text(self, text):
        """
        Extract supplement links from the html text that contains <a> tags
        with href attribute.

        @param text: HTML text.
        @type text: str

        @return: Dictionary with supplement links grouped by extension.
        @rtype: {
            '<extension1>': [
                ('<link1>', '<title1>'),
                ('<link2>', '<title2>')
            ],
            '<extension2>': [
                ('<link3>', '<title3>'),
                ('<link4>', '<title4>')
            ]
        }
        """
        soup = BeautifulSoup(text)
        links = [
            item['href'].strip() for item in soup.find_all('a')
            if 'href' in item.attrs
        ]
        links = sorted(list(set(links)))
        supplement_links = {}

        for link in links:
            filename, extension = os.path.splitext(clean_url(link))
            # Some courses put links to sites in supplement section, e.g.:
            # http://pandas.pydata.org/
            if extension == '':
                continue

            # Make lowercase and cut the leading/trailing dot
            extension = clean_filename(extension.lower().strip('.').strip(),
                                       self._unrestricted_filenames)
            basename = clean_filename(os.path.basename(filename),
                                      self._unrestricted_filenames)
            if extension not in supplement_links:
                supplement_links[extension] = []
            # Putting basename into the second slot of the tuple is important
            # because that will allow to download many supplements within a
            # single lecture, e.g.:
            # 01_slides-presented-in-this-module.pdf
            # 01_slides-presented-in-this-module_Dalal-cvpr05.pdf
            # 01_slides-presented-in-this-module_LM-3dtexton.pdf
            supplement_links[extension].append((link, basename))

        return supplement_links
Example #6
    def _extract_links_from_a_tags_in_text(self, text):
        """
        Extract supplement links from the html text that contains <a> tags
        with href attribute.

        @param text: HTML text.
        @type text: str

        @return: Dictionary with supplement links grouped by extension.
        @rtype: {
            '<extension1>': [
                ('<link1>', '<title1>'),
                ('<link2>', '<title2>')
            ],
            '<extension2>': [
                ('<link3>', '<title3>'),
                ('<link4>', '<title4>')
            ]
        }
        """
        soup = BeautifulSoup(text)
        links = [item['href'].strip()
                 for item in soup.find_all('a') if 'href' in item.attrs]
        links = sorted(list(set(links)))
        supplement_links = {}

        for link in links:
            filename, extension = os.path.splitext(clean_url(link))
            # Some courses put links to sites in supplement section, e.g.:
            # http://pandas.pydata.org/
            if extension == '':
                continue

            # Make lowercase and cut the leading/trailing dot
            extension = clean_filename(
                extension.lower().strip('.').strip(),
                self._unrestricted_filenames)
            basename = clean_filename(
                os.path.basename(filename),
                self._unrestricted_filenames)
            if extension not in supplement_links:
                supplement_links[extension] = []
            # Putting basename into the second slot of the tuple is important
            # because that will allow to download many supplements within a
            # single lecture, e.g.:
            # 01_slides-presented-in-this-module.pdf
            # 01_slides-presented-in-this-module_Dalal-cvpr05.pdf
            # 01_slides-presented-in-this-module_LM-3dtexton.pdf
            supplement_links[extension].append((link, basename))

        return supplement_links
Example #7
def parse_course_detail(content, doc_only):
    """parse course video and doc detail from response body or xxx.json file"""
    # json_file_path = os.path.join(output_folder, '{}.json'.format(tid))
    # if os.path.exists(json_file_path):
    #     return json.load(open(json_file_path, 'r', encoding='utf-8'))

    term = dict()
    last_week_name = ''
    last_lesson_name = ''

    for line in content.splitlines():
        line = line.decode('unicode_escape')
        week_match = week_ptn.findall(line)
        if week_match:
            last_week_name = clean_filename(week_match[0])
            term[last_week_name] = dict()
            logger.info(last_week_name)
            continue

        lesson_match = lesson_ptn.findall(line)
        if lesson_match and last_week_name in term:
            last_lesson_name = clean_filename(lesson_match[0])
            term[last_week_name][last_lesson_name] = list()
            logger.info('    %s', last_lesson_name)
            continue

        if not doc_only:
            video_match = video_ptn.findall(line)
            if video_match and last_lesson_name in term[last_week_name]:
                content_id, _id, lecture_name, term_id = video_match[0]
                file_url = get_file_url(content_id, _id)
                postfix = 'mp4' if 'mp4' in file_url else 'flv'
                term[last_week_name][last_lesson_name].append(
                    ('{}.{}'.format(lecture_name, postfix), file_url))
                logger.info('        %s',
                            '{}.{}'.format(lecture_name, postfix))

        doc_match = doc_ptn.findall(line)
        if doc_match and last_lesson_name in term[last_week_name]:
            content_id, _id, lecture_name, term_id = doc_match[0]
            file_url = get_file_url(content_id, _id, file_type='doc')
            postfix = 'doc' if '.doc' in file_url else 'pdf'
            term[last_week_name][last_lesson_name].append(
                ('{}.{}'.format(lecture_name, postfix), file_url))
            logger.info('        %s', '{}.{}'.format(lecture_name, postfix))
    if last_week_name == '':
        raise ParseException('no video information in response body, %s' %
                             content.decode('unicode_escape'))
    # dump_course_detail(term, json_file_path)
    return term
Example #8
        def _add_asset(name, url, destination):
            filename, extension = os.path.splitext(clean_url(name))
            if extension == '':
                return

            extension = clean_filename(extension.lower().strip('.').strip(),
                                       self._unrestricted_filenames)
            basename = clean_filename(os.path.basename(filename),
                                      self._unrestricted_filenames)
            url = url.strip()

            if extension not in destination:
                destination[extension] = []
            destination[extension].append((url, basename))
Example #9
def validate_link(url):
    """解析命令行传过来的课程参数
    优先使用`第一次开课`的tid,如果传过来的参数是最后一次开课,可能视频只放出来一部分
    """
    course_page_url = 'http://www.icourse163.org/course/{}'
    part_param_ptn = re.compile(r'([A-Za-z0-9-]+)\?tid=(\d+)')
    url_param_ptn = re.compile(r'course/([A-Za-z0-9-]+)')
    course_name_ptn = re.compile(r'keywords" content="(.+?)"/>')
    tid_ptn = re.compile(r'id : "(\d+)",\ncourseId :')

    part_match = part_param_ptn.findall(url)
    url_match = url_param_ptn.findall(url)
    if part_match:
        course_id = part_match[0][0]
    elif url_match:
        course_id = url_match[0]
    else:
        raise ParamsException('course url or parameters error, %s', url)
    resp = retry_request(course_page_url.format(course_id), method='GET')
    tid_match = tid_ptn.findall(resp.text)
    if tid_match:
        tid = tid_match[0]
    elif part_match:
        tid = part_match[0][1]
    else:
        raise ParamsException('course url or parameters error, %s', url)
    course_name_match = course_name_ptn.findall(resp.text)
    course_name = course_name_match[0] if course_name_match else course_id
    course_name = clean_filename(course_name.replace(',中国大学MOOC(慕课)', ''))
    logger.info('parse link success, name:%s, tid:%s', course_name, tid)
    return course_name, tid
Example #10
def video_appxRankPooling(source, dest, n_jobs, buffer_size, img_ext):
    print(". Executing appx_rank_pool on video...")
    safe_mkdir(dest)

    # run appx rank pool for each video in every class folder
    for class_folder in os.listdir(source):
        video_files = search_files_recursively(
            os.path.join(source, class_folder))
        outfolder = os.path.join(dest, class_folder)

        safe_mkdir(outfolder)

        # take only the basename of each video url, clean name from dot and whitespace
        # and use this basename for output image name
        outdir = [
            os.path.join(outfolder, clean_filename(get_basename(video_file)))
            for video_file in video_files
        ]
        # TODO: optimise this extension duplicating given every element is constant
        img_exts = [img_ext] * len(outdir)
        buffer_sizes = [buffer_size] * len(outdir)

        print(". Current class folder: %s, total:%d" %
              (class_folder, len(video_files)))

        run_args = list(zip(video_files, outdir, img_exts, buffer_sizes))
        results = Pool(n_jobs).starmap(run_video_appx_rank_pooling, run_args)

        print(". Finished %s." % class_folder)
Example #11
        def _add_asset(name, url, destination):
            filename, extension = os.path.splitext(clean_url(name))
            if extension == '':
                return

            extension = clean_filename(
                extension.lower().strip('.').strip(),
                self._unrestricted_filenames)
            basename = clean_filename(
                os.path.basename(filename),
                self._unrestricted_filenames)
            url = url.strip()

            if extension not in destination:
                destination[extension] = []
            destination[extension].append((url, basename))
Example #12
    def post(self):
        file = request.files['file']
        log.info(file.content_length)
        if file and allowed_file(file.filename):
            filename_ok = utils.clean_filename(file.filename)
            log.info('file name: ' + filename_ok)
            file.save(os.path.join('./univer/upload', filename_ok))
            return {"data": filename_ok, "status": 200, "message": "success"}, 200
        else:
            log.info('invalid sub filename')
            return {"status": 400, "message": "incorrect file format"}, 200
Example #13
def download_images_from_link_list(img_links, img_path, record_file_name):
    print_log('Downloading images...')
    start_time = time.time()
    total = len(img_links)
    record_file = open(record_file_name,'a')
    for index, link in enumerate(img_links):
        print_log('Image %s of %s' % (str(index + 1), total))
        urllib.urlretrieve(link, filename=img_path + '\\' + utils.clean_filename(link[link.rfind('/') + 1:]),
                           reporthook=schedule)
        record_file.write(link + '\n')
    end_time = time.time()
    record_file.close()
    print_log('Downloading all images took %s seconds' % str(round(end_time - start_time, 2)))
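
Note that this example is Python 2: urllib.urlretrieve moved to urllib.request in Python 3, and the hard-coded '\\' separator only works on Windows. A minimal Python 3 sketch of the same download step, with the helper passed in so the snippet stands alone:

import os
from urllib.request import urlretrieve

def download_image(link, img_path, clean_filename):
    # Python 3 equivalent of the urllib.urlretrieve call above,
    # using os.path.join instead of a hard-coded Windows separator.
    target = os.path.join(img_path, clean_filename(link[link.rfind('/') + 1:]))
    urlretrieve(link, filename=target)
    return target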
Example #14
    def editUniversity(self, data):
        cursor = self.conn.cursor()
        sql = "UPDATE university.university SET u_name = %s, kind = %s, descri = %s, pdf1_path = %s, pdf2_path = %s, url_path = %s, reward = %s, medal1 = %s, medal2 = %s, medal3 = %s, medal4 = %s, medal5 = %s WHERE u_id = %s"
        val = (data.get('u_name'), data.get('kind'), data.get('descri'),
               utils.clean_filename(data.get('pdf1_path')),
               utils.clean_filename(data.get('pdf2_path')),
               data.get('url_path'), data.get('reward'), data.get('medal1'),
               data.get('medal2'), data.get('medal3'), data.get('medal4'),
               data.get('medal5'), int(data.get("u_id")))
        log.info(val)
        try:
            cursor.execute(sql, val)
            self.conn.commit()
            return True
        except Exception as e:
            log.info("query '{}' with params {} failed with {}".format(
                sql, val, e))
            log.info(cursor._executed)
            self.conn.rollback()
            raise e
        finally:
            cursor.close()
            self.conn.close()
Example #15
    def _extract_links_from_asset_tags_in_text(self, text):
        """
        Scan the text and extract asset tags and links to corresponding
        files.

        @param text: Page text.
        @type text: str

        @return: @see CourseraOnDemand._extract_links_from_text
        """
        # Extract asset tags from instructions text
        asset_tags_map = self._extract_asset_tags(text)
        ids = list(iterkeys(asset_tags_map))
        if not ids:
            return {}

        # asset tags contain asset names and ids. We need to make another
        # HTTP request to get asset URL.
        asset_urls = self._extract_asset_urls(ids)

        supplement_links = {}

        # Build supplement links, providing nice titles along the way
        for asset in asset_urls:
            title = clean_filename(
                asset_tags_map[asset['id']]['name'],
                self._unrestricted_filenames)
            extension = clean_filename(
                asset_tags_map[asset['id']]['extension'].strip(),
                self._unrestricted_filenames)
            url = asset['url'].strip()
            if extension not in supplement_links:
                supplement_links[extension] = []
            supplement_links[extension].append((url, title))

        return supplement_links
Example #16
    def post(self):
        token = request.cookies.get('access_token')
        log.info(token)
        if utils.JWTdecode(token) == False:
            return redirect("/login", code=302)
        file = request.files['file']
        log.info(file.content_length)
        if file and allowed_file(file.filename):
            filename_ok = utils.clean_filename(file.filename)
            log.info('file name: ' + filename_ok)
            file.save(os.path.join('./university/upload', filename_ok))
            return {
                "data": filename_ok,
                "status": 200,
                "message": "success"
            }, 200
        else:
            log.info('invalid sub filename')
            # return Response({"status": 400, "message": "incorrect file format"}, status=500)
            return {"status": 400, "message": "incorrect file format"}, 200
Example #17
def print_vedio_name(session, leclist, path=''):
    print "path\n"
    print path
    video_file = open(path + 'vedio.txt', 'wb')
    video_file.truncate()
    for week in leclist:
        cur_week = week[0]
        lessons = week[1]
        for lesson in lessons:
            cur_lesson = lesson[0].encode("utf-8")
            lectures = lesson[1]
            #(link,undef) = lectures
            # print(repr(lessons))
            cur_week = clean_filename(cur_week)
            #print "cur_lesson:%s" %cur_lesson
            video_file.write(cur_lesson)
            video_file.write("\n")
            #print "lectures:"
            #print lectures[0]
            for (lecnum, (lecture_url, lecture_name)) in enumerate(lectures):
                video_file.write(lecture_url.encode("utf-8"))
                video_file.write("\n")
    video_file.close()
Example #18
def main():
    args = parse_args()

    if args.username is None:
        print ('No username specified.')
        sys.exit(1)
    if args.password is None:
        print ('No password specified.')
        sys.exit(1)

    user_email = args.username
    user_pswd = args.password
    course_link = args.course_url[0]
    path = args.path
    overwrite = args.overwrite

    regex = r'(?:https?://)(?P<site>[^/]+)/(?P<baseurl>[^/]+)/(?P<coursename>[^/]+)/?'
    m = re.match(regex, args.course_url[0]) 
    if m is None:
        print ('The URL provided is not valid for icourse163.')
        sys.exit(0)

    md = md5.new()
    md.update(user_pswd)
    encryptedpswd = md.hexdigest()

    if m.group('site') in ['www.icourse163.org']:
        login_data = {
                'product': 'imooc',
                'url': 'http://www.icourse163.org/mooc.htm?#/index',
                'savelogin': 1,
                'domains': 'icourse163.org',
                'type': 0,
                'append': 1,
                'username': user_email,
                'password': encryptedpswd
                }
        login_success_flag = '正在登录,请稍等...'
        web_host = 'www.icourse163.org'
        regex_loc = 'window.location.replace\(\"(http:\/\/reg\.icourse163\.org\/next\.jsp.+)\"\)'
    elif m.group('site') in [ 'mooc.study.163.com']:
        login_data = {
                'product': 'study',
                'url': 'http://study.163.com?from=study',
                'savelogin': 1,
                'domains': '163.com',
                'type': 0,
                'append': 1,
                'username': user_email,
                'password': encryptedpswd
                }        
        login_success_flag = '登录成功,正在跳转'
        web_host = 'mooc.study.163.com'
        regex_loc = 'window.location.replace\(\"(http:\/\/study\.163\.com\?from=study)\"\)'
    else:
        print ('The URL provided is not valid for icourse163.')
        sys.exit(0)
    path = os.path.join(path, clean_filename(m.group('coursename')))

    login_url = 'https://reg.163.com/logins.jsp'

    headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
                'Connection': 'keep-alive',
               }


    session = requests.Session()
    session.headers.update(headers)
    r1 = session.post(login_url, data=login_data)

    
    success = re.search(login_success_flag, r1.content)
    if not success:
        print ('Fail to login.')
        exit(2)
    else:
        print ('Login done...')
    
    se = re.search(regex_loc, r1.content)
        
    r = session.get(se.group(1), allow_redirects=True, cookies = {'NTES_PASSPORT':session.cookies['NTES_PASSPORT']})

    # get course id, it's in cid.group(1)
    r2 = session.get(course_link)
    cid = re.search(r'window\.termDto = {             id:([0-9]+),', r2.content)
    if cid is None:
        cid = re.search(r'termId : \"([0-9]+)\",', r2.content)


    headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
                'Accept': '*/*' ,
                'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
                'Connection': 'keep-alive',
                'Content-Type': 'text/plain',
                'Cookie': 'STUDY_SESS=%s; '% session.cookies['STUDY_SESS'],
                'Host': web_host,
               }

    session.headers.update(headers)

    params =  {
                'callCount':1,
                'scriptSessionId':'${scriptSessionId}190',
                'httpSessionId':'e8890caec7fe435d944c0f318b932719',
                'c0-scriptName':'CourseBean',
                'c0-id': 0,
                'c0-methodName':'getLastLearnedMocTermDto',
                'c0-param0':'number:' + cid.group(1),
                'batchId':434820, #arbitrarily
                }

    getcourse_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'

    r3 = session.post(getcourse_url,data = params)

    print ('Parsing...', end="")

    syllabus = parse_syllabus_icourse163(session, r3.content)

    if syllabus:
        print ('Done.')
    else:
        print ('Failed. No course content on the page.')
        sys.exit(0)

    print ('Save files to %s' % path)

    download_syllabus_icourse163(session, syllabus, path)
Example #19
def main(username, password, params):
    print('username:%s, password:%s, params:%s' % (username, password, params))
    # if sys.argv[1] is None:
    #     print('missing username argument, e.g. python icourse163.py username password param')
    #     sys.exit(1)
    # if sys.argv[2] is None:
    #     print('missing password argument, e.g. python icourse163.py username password param')
    #     sys.exit(1)
    # if sys.argv[3] is None:
    #     print('missing course link argument, e.g. python icourse163.py username password param')
    #     sys.exit(1)
    # NUDT-42003 is the school course id; tid is the course id on the MOOC site
    # course_link = sys.argv[3]
    course_link = params
    path = './'

    course_link_pattern = r'(?P<s_course_id>[^/]+)\?tid=(?P<mooc_tid>[^/]+)'
    m = re.match(course_link_pattern, course_link)
    if m is None:
        print('The URL provided is not recognized!')
        sys.exit(0)
    s_course_id = m.group('s_course_id')
    mooc_tid = m.group('mooc_tid')

    path = os.path.join(path, clean_filename(s_course_id))
    # 1. Log in
    login_url = 'http://login.icourse163.org/reg/icourseLogin.do'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Referer': 'http://www.icourse163.org/member/login.htm',
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    login_data = {
        'returnUrl': 'aHR0cDovL3d3dy5pY291cnNlMTYzLm9yZy9pbmRleC5odG0=',
        'failUrl':
        'aHR0cDovL3d3dy5pY291cnNlMTYzLm9yZy9tZW1iZXIvbG9naW4uaHRtP2VtYWlsRW5jb2RlZD1Nek16TXpNeU1qTTE=',
        'savelogin': '******',
        'oauthType': '',
        'username': username,
        'passwd': password
    }
    web_host = 'www.icourse163.org'

    session = requests.Session()
    session.headers.update(headers)
    session.post(login_url, data=login_data)
    print('Login done...')

    # 2. Fetch course information
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'text/plain',
        'Cookie': 'STUDY_SESS=%s; ' % session.cookies['STUDY_SESS'],
        'Host': web_host,
    }
    params = {
        'callCount': 1,
        'scriptSessionId': '${scriptSessionId}190',
        'httpSessionId': 'e8890caec7fe435d944c0f318b932719',
        'c0-scriptName': 'CourseBean',
        'c0-methodName': 'getLastLearnedMocTermDto',
        'c0-id': 0,
        'c0-param0': 'number:' + mooc_tid,
        'batchId': 434820,  # arbitrarily
    }
    session.headers.update(headers)
    getcourse_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
    r3 = session.post(getcourse_url, data=params)
    print('Parsing...', end="")

    # Parse Main Page
    syllabus = parse_syllabus_icourse163(session, r3.content)
    # If syllabus exists
    if syllabus:
        print('Done.')
    else:
        print('Failed. No course content on the page.')
        sys.exit(0)

    print('Save files to %s' % path)
    # Download Data
    download_syllabus_icourse163(session, syllabus, path)
Example #20
import time
import os
import logging
import numpy as np
from camera import Camera
import game
import simulation
import simfileplayer
import cPickle as pickle
import copy
import zipfile

logs_directory = '/tmp/gravipy_log' or os.path.join(os.path.dirname(os.path.realpath(__file__)), 'log')
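# note: the string literal above is truthy, so the os.path.join fallback after `or` is never evaluated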

if os.path.exists(logs_directory) and not os.path.isdir(logs_directory):
    raise IOError("Log directory choice is not a real directory!")
current_log = os.path.join(logs_directory, clean_filename(time.asctime()))
os.makedirs(current_log, mode=0744)


formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh = logging.FileHandler(os.path.join(current_log, 'run.log'))
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

ch = logging.StreamHandler()
ch.setLevel(logging.WARNING)
ch.setFormatter(formatter)

log = logging.getLogger(__name__)
log.setLevel(logging.WARNING)
log.addHandler(ch)
Example #21
def main():
    args = parse_args()

    if args.username is None:
        print('No username specified.')
        sys.exit(1)
    if args.password is None:
        print('No password specified.')
        sys.exit(1)

    user_email = args.username
    user_pswd = args.password
    course_link = args.course_url[0]
    path = args.path
    overwrite = args.overwrite

    regex = r'(?:https?://)(?P<site>[^/]+)/(?P<baseurl>[^/]+)/(?P<coursename>[^/]+)/?'
    m = re.match(regex, args.course_url[0])
    if m is None:
        print('The URL provided is not valid for icourse163.')
        sys.exit(0)

    md = md5.new()
    md.update(user_pswd)
    encryptedpswd = md.hexdigest()

    if m.group('site') in ['www.icourse163.org']:
        login_data = {
            'product': 'imooc',
            'url': 'http://www.icourse163.org/mooc.htm?#/index',
            'savelogin': 1,
            'domains': 'icourse163.org',
            'type': 0,
            'append': 1,
            'username': user_email,
            'password': encryptedpswd
        }
        login_success_flag = '正在登录,请稍等...'
        web_host = 'www.icourse163.org'
    elif m.group('site') in ['mooc.study.163.com']:
        login_data = {
            'product': 'study',
            'url': 'http://study.163.com?from=study',
            'savelogin': 1,
            'domains': '163.com',
            'type': 0,
            'append': 1,
            'username': user_email,
            'password': encryptedpswd
        }
        login_success_flag = '登录成功,正在跳转'
        web_host = 'mooc.study.163.com'
    else:
        print('The URL provided is not valid for icourse163.')
        sys.exit(0)
    path = os.path.join(path, clean_filename(m.group('coursename')))

    login_url = 'https://reg.163.com/logins.jsp'

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
    }

    session = requests.Session()
    session.headers.update(headers)
    r1 = session.post(login_url, data=login_data)

    success = re.search(login_success_flag, r1.content)
    if not success:
        print('Fail to login.')
        exit(2)
    else:
        print('Login successful...')

    se = re.search('window.location.replace\(\"(.+)\"\)', r1.content)

    r = session.get(
        se.group(1),
        allow_redirects=True,
        cookies={'NTES_PASSPORT': session.cookies['NTES_PASSPORT']})

    # get course id, it's in cid.group(1)
    r2 = session.get(course_link)
    cid = re.search(r'window\.termDto = {             id:([0-9]+),',
                    r2.content)
    if cid is None:
        cid = re.search(r'termId : \"([0-9]+)\",', r2.content)

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'text/plain',
        'Cookie': 'STUDY_SESS=%s; ' % session.cookies['STUDY_SESS'],
        'Host': web_host,
    }

    session.headers.update(headers)

    params = {
        'callCount': 1,
        'scriptSessionId': '${scriptSessionId}190',
        'httpSessionId': 'e8890caec7fe435d944c0f318b932719',
        'c0-scriptName': 'CourseBean',
        'c0-id': 0,
        'c0-methodName': 'getLastLearnedMocTermDto',
        'c0-param0': 'number:' + cid.group(1),
        'batchId': 434820,  #arbitrarily
    }

    getcourse_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'

    r3 = session.post(getcourse_url, data=params)

    print('Parsing...', end="")

    syllabus = parse_syllabus_icourse163(session, r3.content)

    if syllabus:
        print('Successful.')
    else:
        print('Failed.')

    print('Save files to %s' % path)

    download_syllabus_icourse163(session, syllabus, path)
Example #22
def get_download_urls(tid, doc_only=False):
    """获取下载链接

    Args:
        tid: 开课id
        doc_only: 是否之下载课件
    """
    data = {
        'callCount': '1',
        'scriptSessionId': '${scriptSessionId}190',
        'httpSessionId': sess.cookies.get('NTESSTUDYSI', 'b427803d95384cf496d3240af2526a60'),
        'c0-scriptName': 'CourseBean',
        'c0-methodName': 'getLastLearnedMocTermDto',
        'c0-id': '0',
        'c0-param0': 'number:{}'.format(tid),
        'batchId': '1506485521617'
    }
    custom_header = {
        'Accept': '*/*',
        'Content-Type': 'text/plain',
    }
    try:
        response = retry_request(COURSE_DETAIL_URL, data=data, headers=custom_header, timeout=20)
        if not response.ok:
            raise RequestExcetpion('unexpected status code when fetching video links: {}'.format(response.status_code))
    except Exception as e:
        raise RequestExcetpion('HTTP request for video links failed: {}'.format(e))

    # Parse the response body.
    # Each line may describe one of the node types below; a lecture can be
    # either a video or a document.
    # |--week1
    #     |--lesson1.1
    #           |--lecture1.1.1
    #           |--lecture1.1.2
    #     |--lesson1.2

    term = OrderedDict()
    last_week_name = ''
    last_lesson_name = ''

    if response.ok:
        for line in response.content.splitlines():
            line = line.decode('unicode_escape')

            # parse week
            week_match = week_ptn.findall(line)
            if week_match:
                last_week_name = clean_filename(week_match[0])
                term[last_week_name] = OrderedDict()
                logger.info(last_week_name)
                continue

            # parse lesson
            lesson_match = lesson_ptn.findall(line)
            if lesson_match and last_week_name in term:
                last_lesson_name = clean_filename(lesson_match[0])
                term[last_week_name][last_lesson_name] = OrderedDict()
                logger.info('    %s', last_lesson_name)
                continue

            # parse videos
            if not doc_only:
                # get the video link
                video_match = video_ptn.findall(line)
                if video_match and last_lesson_name in term[last_week_name]:
                    content_id, _id, lecture_name, term_id = video_match[0]
                    lecture_name = clean_filename(lecture_name)
                    file_url = get_video_doc_url(content_id, _id)
                    postfix = 'mp4' if 'mp4' in file_url else 'flv'
                    term[last_week_name][last_lesson_name]['{}.{}'.format(lecture_name, postfix)] = file_url
                    logger.info('        %s', '{}.{}'.format(lecture_name, postfix))

            # parse documents
            doc_match = doc_ptn.findall(line)
            if doc_match and last_lesson_name in term[last_week_name]:
                content_id, _id, lecture_name, term_id = doc_match[0]
                lecture_name = clean_filename(lecture_name)
                file_url = get_video_doc_url(content_id, _id, file_type='doc')
                postfix = 'doc' if '.doc' in file_url else 'pdf'
                term[last_week_name][last_lesson_name]['{}.{}'.format(lecture_name, postfix)] = file_url
                logger.info('        %s', '{}.{}'.format(lecture_name, postfix))

        if last_week_name == '':
            raise ParseException('weekly course name list not found')
        term = reindex_file_name(term)
        return term
Example #23
def parse_old_style_syllabus(session, page, reverse=False, unrestricted_filenames=False,
                             subtitle_language='en'):
    """
    Parse an old style Coursera course listing/syllabus page.

    Each section is a week of classes.
    """

    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    stags = soup.findAll(attrs={'class': re.compile('^course-item-list-header')})
    for stag in stags:
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, unrestricted_filenames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, unrestricted_filenames)
            logging.info('  %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                untouched_fname = a.get('title', '')
                title = clean_filename(untouched_fname, unrestricted_filenames)
                fmt = get_anchor_format(href)
                if fmt in ('srt', 'txt') and subtitle_language != 'en':
                    title = title.replace('_en&format', '_' + subtitle_language + '&format')
                    href = href.replace('_en&format', '_' + subtitle_language + '&format')

                logging.debug('    %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_old_style_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        logging.warning(
                            'Could not get resource: %s', lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(
                            session, a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug('    %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error('The cookies file may be invalid, '
                      'please re-run with the `--clear-cache` option.')

    return sections
Example #24
def upload_path_handler(instance, filename):
    return "images/news/{title}/{file}".format(title=slugify(instance.title),
                                               file=clean_filename(filename))
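
For context, a handler like this is normally passed as the upload_to callable of a Django file field, which Django invokes as upload_path_handler(instance, filename) at save time. A minimal sketch of the wiring; the News model and its fields are assumptions:

from django.db import models

class News(models.Model):
    title = models.CharField(max_length=200)
    # Django computes the storage path under MEDIA_ROOT by calling
    # upload_path_handler(instance, filename) when the file is saved.
    image = models.ImageField(upload_to=upload_path_handler)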
Example #25
def download_syllabus_study163(session, syllabus, path='', overwrite=False):

    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':
        'gzip, deflate',
        'Accept-Language':
        'zh-CN,zh;q=0.9',
        'Connection':
        'keep-alive',
        'Host':
        'study.163.com',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    }
    get_token_url = 'http://study.163.com/video/getVideoAuthorityToken.htm'

    session.headers.update(headers)

    course_id = syllabus[0]
    print('-----')
    print(course_id)
    course = syllabus[1]
    retry_list = []

    for (chapter_num, (chapter, lessons)) in enumerate(course):
        chapter_name = clean_filename(chapter)
        dir = os.path.join(path, ('%02d %s' % (chapter_num + 1, chapter_name)))
        print(dir)
        if not os.path.exists(dir):
            mkdir_p(dir)
        for (lesson_num, (lesson_url, lesson_name)) in enumerate(lessons):

            print('lesson_num:   ', end="")
            print(lesson_num)
            print('lesson_name:   ' + lesson_name.decode('raw_unicode_escape'))

            lesson_name = clean_filename(
                lesson_name.decode('raw_unicode_escape'))
            filename = os.path.join(
                dir, '%02d_%s.mp4' % (lesson_num + 1, lesson_name))
            print(filename)

            if overwrite or not os.path.exists(filename):
                try:
                    r = session.get(get_token_url)
                    video_url = lesson_url
                    download_file(video_url, filename)

                except Exception as e:
                    print(e)
                    print('1Error, add it to retry list')
                    retry_list.append((lesson_url, filename))
            else:
                print('Already downloaded')

    retry_times = 0
    while len(retry_list) != 0 and retry_times < 3:
        print('%d items should be retried, retrying...' % len(retry_list))
        tmp_list = [item for item in retry_list]
        retry_times += 1
        for (url, filename) in tmp_list:
            try:
                print(url)
                print(filename)
                r = session.get(get_token_url)
                video_url = url
                download_file(video_url, filename)

            except Exception as e:
                print(e)
                print('2Error, add it to retry list')
                print('url: ' + url)
                continue

            retry_list.remove((url, filename))

    if len(retry_list) != 0:
        print('%d items failed, please check it' % len(retry_list))
    else:
        print('All done.')
Example #26
def parse_old_style_syllabus(session, page, reverse=False, intact_fnames=False, subtitle_language="en"):
    """
    Parse an old style Coursera course listing/syllabus page.

    Each section is a week of classes.
    """

    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    stags = soup.findAll(attrs={"class": re.compile("^course-item-list-header")})
    for stag in stags:
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, intact_fnames)
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll("li"):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, intact_fnames)
            logging.info("  %s", vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll("a"):
                href = fix_url(a["href"])
                untouched_fname = a.get("title", "")
                title = clean_filename(untouched_fname, intact_fnames)
                fmt = get_anchor_format(href)
                if fmt in ("srt", "txt") and subtitle_language != "en":
                    title = title.replace("_en&format", "_" + subtitle_language + "&format")
                    href = href.replace("_en&format", "_" + subtitle_language + "&format")

                logging.debug("    %s %s", fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_old_style_video(session, lecture_page)
                        lecture["mp4"] = lecture.get("mp4", [])
                        lecture["mp4"].append((fix_url(href), ""))
                    except TypeError:
                        logging.warn("Could not get resource: %s", lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if "mp4" not in lecture:
                for a in vtag.findAll("a"):
                    if a.get("data-modal-iframe"):
                        href = grab_hidden_video_url(session, a["data-modal-iframe"])
                        href = fix_url(href)
                        fmt = "mp4"
                        logging.debug("    %s %s", fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ""))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], "")
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], "{0:d}_{1}".format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info("Found %d sections and %d lectures on this page", len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error("The cookies file may be invalid, " "please re-run with the `--clear-cache` option.")

    return sections
Example #27
def download_syllabus_study163(session, syllabus, path='', overwrite=False):

    headers = {
        'Accept':
        '*/*',
        'Accept-Encoding':
        'gzip, deflate, sdch',
        'Accept-Language':
        'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection':
        'keep-alive',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    }
    get_token_url = 'http://study.163.com/video/getVideoAuthorityToken.htm'

    session.headers.update(headers)

    course_id = syllabus[0]
    course = syllabus[1]
    retry_list = []
    for (chapter_num, (chapter, lessons)) in enumerate(course):
        chapter_name = clean_filename(chapter)
        dir = os.path.join(path, ('%02d %s' % (chapter_num + 1, chapter_name)))
        if not os.path.exists(dir):
            mkdir_p(dir)
        for (lesson_num, (lesson_url, lesson_name)) in enumerate(lessons):
            fmt = lesson_url.split('.')[-1]
            lesson_name = clean_filename(
                lesson_name.decode('raw_unicode_escape'))
            filename = os.path.join(
                dir, '%02d_%s.%s' % (lesson_num + 1, lesson_name, fmt))
            print(filename)

            if overwrite or not os.path.exists(filename):
                try:
                    r = session.get(get_token_url)
                    video_url_suffix = '88C752A6C3513A0A5EEFA4CD7091A96E365D0185B8133CC883910200B043BC0F57E3024A35D1C582757D905A6B9289E9f4eej632de59'\
                                         + r.content
                    video_url = lesson_url + '?key=' + video_url_suffix
                    download_file(session, video_url, filename)
                except Exception as e:
                    print(e)
                    print('Error, add it to retry list')
                    retry_list.append((lesson_url, filename))
            else:
                print('Already downloaded')

    retry_times = 0
    while len(retry_list) != 0 and retry_times < 3:
        print('%d items should be retried, retrying...' % len(retry_list))
        tmp_list = [item for item in retry_list]
        retry_times += 1
        for (url, filename) in tmp_list:
            try:
                print(url)
                print(filename)
                r = session.get(get_token_url)
                video_url_suffix = '88C752A6C3513A0A5EEFA4CD7091A96E365D0185B8133CC883910200B043BC0F57E3024A35D1C582757D905A6B9289E9f4eej632de59'\
                                        + r.content
                video_url = url + '?key=' + video_url_suffix
                download_file(session, video_url, filename)
            except Exception as e:
                print(e)
                print('Error, add it to retry list')
                continue

            retry_list.remove((url, filename))

    if len(retry_list) != 0:
        print('%d items failed, please check it' % len(retry_list))
    else:
        print('All done.')
Example #28
def parse_syllabus(session, page, reverse=False):
    """
    Parses a Coursera course listing/syllabus page.  Each section is a week
    of classes.
    """

    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    for stag in soup.findAll(attrs={'class':
                                    re.compile('^course-item-list-header')}):
        assert stag.contents[0] is not None, "couldn't find section"
        section_name = clean_filename(stag.contents[0].contents[1])
        logging.info(section_name)
        lectures = []  # resources for 1 lecture

        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            vname = clean_filename(vtag.a.contents[0])
            logging.info('  %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                title = clean_filename(a.get('title', ''))
                fmt = get_anchor_format(href)
                logging.debug('    %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        logging.warning(
                            'Could not get resource: %s', lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(
                            session, a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug('    %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))


            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if (count == i + 1):
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not len(sections):
        logging.error('Probably bad cookies file (or wrong class name)')

    return sections
Example #29
def parse_syllabus(session, page, reverse=False, intact_fnames=False):
    """
    Parses a Coursera course listing/syllabus page.  Each section is a week
    of classes.
    """

    sections = []
    soup = BeautifulSoup(page)

    # traverse sections
    for stag in soup.findAll(
            attrs={'class': re.compile('^course-item-list-header')}):
        assert stag.contents[0] is not None, "couldn't find section"
        untouched_fname = stag.contents[0].contents[1]
        section_name = clean_filename(untouched_fname, intact_fnames)
        logging.info(section_name)
        lectures = []  # (name, resources) pairs for each lecture in this section

        # traverse resources (e.g., video, ppt, ...)
        for vtag in stag.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            untouched_fname = vtag.a.contents[0]
            vname = clean_filename(untouched_fname, intact_fnames)
            logging.info('  %s', vname)
            lecture = {}
            lecture_page = None

            for a in vtag.findAll('a'):
                href = fix_url(a['href'])
                untouched_fname = a.get('title', '')
                title = clean_filename(untouched_fname, intact_fnames)
                fmt = get_anchor_format(href)
                logging.debug('    %s %s', fmt, href)
                if fmt:
                    lecture[fmt] = lecture.get(fmt, [])
                    lecture[fmt].append((href, title))
                    continue

                # Special case: find preview URLs
                lecture_page = transform_preview_url(href)
                if lecture_page:
                    try:
                        href = get_video(session, lecture_page)
                        lecture['mp4'] = lecture.get('mp4', [])
                        lecture['mp4'].append((fix_url(href), ''))
                    except TypeError:
                        logging.warning('Could not get resource: %s',
                                        lecture_page)

            # Special case: we possibly have hidden video links---thanks to
            # the University of Washington for that.
            if 'mp4' not in lecture:
                for a in vtag.findAll('a'):
                    if a.get('data-modal-iframe'):
                        href = grab_hidden_video_url(session,
                                                     a['data-modal-iframe'])
                        href = fix_url(href)
                        fmt = 'mp4'
                        logging.debug('    %s %s', fmt, href)
                        if href is not None:
                            lecture[fmt] = lecture.get(fmt, [])
                            lecture[fmt].append((href, ''))

            for fmt in lecture:
                count = len(lecture[fmt])
                for i, r in enumerate(lecture[fmt]):
                    if count == i + 1:
                        # for backward compatibility, we do not add the title
                        # to the filename (format_combine_number_resource and
                        # format_resource)
                        lecture[fmt][i] = (r[0], '')
                    else:
                        # make sure the title is unique
                        lecture[fmt][i] = (r[0], '{0:d}_{1}'.format(i, r[1]))

            lectures.append((vname, lecture))

        sections.append((section_name, lectures))

    logging.info('Found %d sections and %d lectures on this page',
                 len(sections), sum(len(s[1]) for s in sections))

    if sections and reverse:
        sections.reverse()

    if not sections:
        logging.error('The cookies file may be invalid, '
                      'please re-run with the `--clear-cache` option.')

    return sections
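
Example #29 differs from the variant above mainly in threading an intact_fnames flag through every clean_filename call. The real implementation is project-specific; a plausible minimal sketch, assuming the flag simply bypasses sanitization:

import re

# A guessed sketch only; each project in this listing ships its own
# clean_filename. With intact_fnames=True the original name is kept verbatim.
def clean_filename(s, intact_fnames=False):
    if intact_fnames:
        return s
    # replace path separators and other characters that are unsafe on
    # common filesystems, then trim surrounding whitespace
    return re.sub(r'[\\/:*?"<>|]', '_', s).strip()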
Example #30
import game
import sys
import logging
import time
import os
import numpy as np
from camera import Camera
from utils import clean_filename
import simulation

# note: a non-empty string literal is always truthy, so the os.path.join
# fallback below can never actually be selected
logs_directory = '/tmp/gravipy_log' or os.path.join(
    os.path.dirname(os.path.realpath(__file__)), 'log')

if os.path.exists(logs_directory) and not os.path.isdir(logs_directory):
    raise IOError("Log directory choice is not a real directory!")
current_log = os.path.join(logs_directory, clean_filename(time.asctime()))
os.makedirs(current_log, mode=0o744)

formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh = logging.FileHandler(os.path.join(current_log, 'run.log'))
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

ch = logging.StreamHandler()
ch.setLevel(logging.WARNING)
ch.setFormatter(formatter)

log = logging.getLogger(__name__)
log.setLevel(logging.WARNING)
log.addHandler(ch)
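
The clean_filename(time.asctime()) call above is what makes the per-run directory name filesystem-safe: asctime() output such as 'Sat Jun  6 16:26:11 1998' contains spaces and colons, and colons are illegal in Windows paths. For comparison, a timestamp format that needs no sanitization at all (a sketch, not from the original project):

import time

# strftime limited to digits, dashes and underscores is already
# filesystem-safe, so no clean_filename() pass is needed
run_name = time.strftime('%Y-%m-%d_%H-%M-%S')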
Example #31
def upload_path_handler(instance, filename):
    return "images/events/{family}/{title}/{file}".format(
        family=slugify(instance.family),
        title=slugify(instance.title),
        file=utils.clean_filename(filename))
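
This is the usual shape of a Django upload_to callable, which receives the model instance and the original filename. A minimal sketch of how it might be wired to a model (the EventImage model and its fields are hypothetical, not from the original project):

from django.db import models

# Hypothetical model for illustration; Django calls the upload_to callable
# with (instance, filename) to build the storage path for each upload.
class EventImage(models.Model):
    family = models.CharField(max_length=100)
    title = models.CharField(max_length=200)
    image = models.ImageField(upload_to=upload_path_handler)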
Example #32
def download_syllabus_study163(session, syllabus, path="", overwrite=False):

    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    }
    get_token_url = "http://study.163.com/video/getVideoAuthorityToken.htm"

    session.headers.update(headers)

    course_id = syllabus[0]
    course = syllabus[1]
    retry_list = []
    for (chapter_num, (chapter, lessons)) in enumerate(course):
        chapter_name = clean_filename(chapter)
        dir = os.path.join(path, ("%02d %s" % (chapter_num + 1, chapter_name)))
        if not os.path.exists(dir):
            mkdir_p(dir)
        for (lesson_num, (lesson_url, lesson_name)) in enumerate(lessons):
            fmt = lesson_url.split(".")[-1]
            # decode raw \uXXXX escapes in the name (Python 2 str.decode)
            lesson_name = clean_filename(lesson_name.decode("raw_unicode_escape"))
            filename = os.path.join(dir, "%02d_%s.%s" % (lesson_num + 1, lesson_name, fmt))
            print(filename)

            if overwrite or not os.path.exists(filename):
                try:
                    r = session.get(get_token_url)
                    # r.text keeps the token a text string (str + bytes
                    # would raise a TypeError on Python 3)
                    video_url_suffix = (
                        "88C752A6C3513A0A5EEFA4CD7091A96E365D0185B8133CC883910200B043BC0F57E3024A35D1C582757D905A6B9289E9f4eej632de59"
                        + r.text
                    )
                    video_url = lesson_url + "?key=" + video_url_suffix
                    download_file(session, video_url, filename)
                except Exception as e:
                    print(e)
                    print("Error, add it to retry list")
                    retry_list.append((lesson_url, filename))
            else:
                print("Already downloaded")

    retry_times = 0
    while len(retry_list) != 0 and retry_times < 3:
        print("%d items should be retried, retrying..." % len(retry_list))
        tmp_list = [item for item in retry_list]
        retry_times += 1
        for (url, filename) in tmp_list:
            try:
                print(url)
                print(filename)
                r = session.get(get_token_url)
                # r.text keeps the token a text string (str + bytes
                # would raise a TypeError on Python 3)
                video_url_suffix = (
                    "88C752A6C3513A0A5EEFA4CD7091A96E365D0185B8133CC883910200B043BC0F57E3024A35D1C582757D905A6B9289E9f4eej632de59"
                    + r.text
                )
                video_url = url + "?key=" + video_url_suffix
                download_file(session, video_url, filename)
            except Exception as e:
                print(e)
                print("Error, add it to retry list")
                continue

            retry_list.remove((url, filename))

    if len(retry_list) != 0:
        print("%d items failed, please check it" % len(retry_list))
    else:
        print("All done.")