def render_wiki(mooc):
    """Render the crawled wiki (mooc.wiki) to static HTML on disk.

    For every crawled URL: pages carrying a "text" entry get a
    wiki_page.html render at <path>/index.html; every node gets a "_dir"
    sub-directory, and nodes with children additionally get a
    wiki_list.html listing rendered into that "_dir".
    """
    pages = mooc.wiki
    title = mooc.wiki_name
    for page_url in pages:
        info = pages[page_url]
        # Only real articles carry a "text" entry; list-only nodes skip this.
        if "text" in info:
            jinja(
                os.path.join(info["path"], "index.html"),
                "wiki_page.html",
                False,
                content=info,
                dir=info["dir"].replace(mooc.instance_url + "/wiki/", "") + "index.html",
                mooc=mooc,
                rooturl=info["rooturl"],
            )
        make_dir(os.path.join(info["path"], "_dir"))
        if info["children"]:
            # Build the child listing, keeping only children we fully crawled
            # (those with a "title").
            listing = [
                {
                    "url": info["rooturl"] + "/.." + child.replace(mooc.instance_url, ""),
                    "title": pages[child]["title"],
                    "last-modif": pages[child]["last-modif"],
                }
                for child in info["children"]
                if "title" in pages[child]
            ]
            jinja(
                os.path.join(info["path"], "_dir", "index.html"),
                "wiki_list.html",
                False,
                pages=listing,
                wiki_name=title,
                mooc=mooc,
                rooturl=info["rooturl"] + "../",
            )
def __init__(self,c,course_url,convert_in_webm,ignore_missing_xblock,lang):
    """Initialize the course scraper: resolve the course id, fetch course
    info and the full block tree, and prepare the state filled in later.

    NOTE(review): two statements below contain "******" — a redaction in the
    source we received. They will not parse as-is; presumably the username
    query parameter (and, for the first one, a separate self.output_path
    assignment) was removed. Confirm against the original before running.
    """
    self.course_url=course_url
    # Whether downloaded videos are to be re-encoded to webm — TODO confirm downstream use.
    self.convert_in_webm=convert_in_webm
    self.ignore_missing_xblock=ignore_missing_xblock
    self.lang = lang or "en"  # fall back to English when no language given
    self.instance_url=c.conf["instance_url"]
    self.course_id=get_course_id(self.course_url, c.conf["course_page_name"], c.conf["course_prefix"], self.instance_url)
    logging.info("Get info about course")
    # NOTE(review): redacted — likely originally appended the username and then
    # set self.output_path = os.path.join("output", slugify(self.info["name"])).
    self.info=c.get_api_json("/api/courses/v1/courses/" + self.course_id + "?username="******"output",slugify(self.info["name"]))
    self.name = slugify(self.info["name"])
    # Relies on self.output_path being set by the redacted statement above.
    make_dir(self.output_path)
    logging.info("Get course blocks")
    # NOTE(review): redacted — likely "&username=" + <username> between the two literals.
    json_from_api=c.get_api_json("/api/courses/v1/blocks/?course_id=" + self.course_id + "&username="******"&depth=all&requested_fields=graded,format,student_view_multi_device&student_view_data=video,discussion&block_counts=video,discussion,problem&nav_depth=3")
    self.json=json_from_api["blocks"]  # block id -> block description
    self.root_id=json_from_api["root"]  # id of the root block of the course tree
    # State filled in by later phases (tree build, download, annexe):
    self.course_root=None
    self.path=""
    self.rooturl=""
    self.top={}  # top navigation bar entries (label -> relative url)
    self.object=[]  # instantiated course content objects
    self.no_homepage=False  # set by download() when no homepage markup is found
    self.wiki=None  # crawled wiki data, filled by annexe()
    self.forum_thread=None  # forum threads, filled by annexe()
    self.page_annexe=[]  # extra top-bar pages
    self.book_list_list=[]  # book sidebars found in extra pages
def __init__(self, json, path, rooturl, id, descendants, mooc):
    """Store this block's location metadata and create its output directory."""
    # References to the parent course and the raw block description.
    self.mooc = mooc
    self.json = json
    # Location of this block inside the export tree.
    self.path = path
    self.rooturl = rooturl
    self.id = id
    # Directory-safe name derived from the block's display name.
    self.folder_name = slugify(self.json["display_name"])
    self.output_path = os.path.join(mooc.output_path, self.path)
    make_dir(self.output_path)
def __init__(self, json, path, rooturl, id, descendants, mooc):
    """Store this block's metadata, prepare empty content holders and
    create the directory the block will be written into."""
    self.mooc = mooc
    self.json = json
    self.path = path
    self.rooturl = rooturl
    self.id = id
    self.output_path = mooc.output_path
    # Content parsed later from the block.
    self.data = []
    self.category_title = ""
    # Create the on-disk directory named after the block's display name.
    target_dir = os.path.join(self.output_path, self.path, slugify(self.json["display_name"]))
    make_dir(target_dir)
def download(self,c):
    """Fetch the favicon and the course homepage, strip interactive chrome
    from the homepage markup, then download every child object.

    Parameters:
        c: connection helper exposing get_page().
    Side effects: writes favicon.png and the "home" assets under
    self.output_path, fills self.html_homepage with cleaned HTML strings,
    and sets self.no_homepage when no recognizable homepage markup exists.
    """
    def _strip_chrome(node):
        # Remove edX UI elements that make no sense in an offline export:
        # the dismiss banner, the bookmark action and visibility-toggle buttons.
        dismiss = node.find("div", attrs={"class": "dismiss-message"})
        if dismiss is not None:
            dismiss.decompose()
        bookmark = node.find("a", attrs={"class": "action-show-bookmarks"})
        if bookmark is not None:
            bookmark.decompose()
        # find_all always returns a list (possibly empty) — the original
        # "!= None" guard was dead code.
        for button in node.find_all("button", attrs={"class": "toggle-visibility-button"}):
            button.decompose()

    download("https://www.google.com/s2/favicons?domain=" + self.instance_url , os.path.join(self.output_path,"favicon.png"),None)
    logging.info("Get homepage")
    content=c.get_page(self.course_url)
    home_dir = os.path.join(self.output_path, "home")
    make_dir(home_dir)
    self.html_homepage=[]
    soup=BeautifulSoup.BeautifulSoup(content, 'lxml')
    html_content=soup.find('div', attrs={"class": "welcome-message" })
    if html_content is None:
        # Fallback layout: a series of "info-wrapper" articles instead of
        # a single welcome box.
        html_content=soup.find_all('div', attrs={"class": re.compile("info-wrapper")})
        if html_content == []:
            self.no_homepage=True
        else:
            # FIX: iterate the articles directly instead of range(len(...)).
            for article in html_content:
                _strip_chrome(article)
                article['class']="toggle-visibility-element article-content"
                self.html_homepage.append(dl_dependencies(article.prettify(), home_dir, "home", c))
    else:
        _strip_chrome(html_content)
        self.html_homepage.append(dl_dependencies(html_content.prettify(), home_dir, "home", c))
    logging.info("Get content")
    for x in self.object:
        x.download(c)
def annexe(self,c):
    """Discover the course's top navigation bar and download every extra tab.

    Scrapes the course page for the tab list, then dispatches each tab:
    courseware/info tabs map onto already-exported content, wiki and forum
    tabs trigger a full crawl via the sibling annexe module, and any other
    tab is downloaded as a standalone annex page (or a book sidebar).
    Fills self.top, self.wiki, self.forum_thread, self.page_annexe and
    self.book_list_list.
    """
    logging.info("Try to get specific page of mooc")
    content=c.get_page(self.course_url)
    soup=BeautifulSoup.BeautifulSoup(content, 'lxml')
    # The tab container's markup varies between edX versions — try each known form.
    top_bs=soup.find('ol', attrs={"class": "course-material" }) or soup.find('ul', attrs={"class": "course-material" }) or soup.find('ul', attrs={"class": "navbar-nav" }) or soup.find('ol', attrs={"class": "course-tabs"})
    if top_bs != None:
        for top_elem in top_bs.find_all("li"):
            top_elem=top_elem.find("a")
            # Last path segment of the tab link, ignoring a trailing slash.
            if top_elem["href"][-1] == "/":
                path=top_elem["href"][:-1].split("/")[-1]
            else:
                path=top_elem["href"].split("/")[-1]
            # Courseware tab points at the exported course tree.
            if path == "course" or "courseware" in path:
                name = top_elem.get_text().replace(", current location", "")
                self.top[name] = "course/" + self.head.folder_name + "/index.html"
            # Info tab points at the exported homepage.
            if "info" in path:
                name = top_elem.get_text().replace(", current location", "")
                self.top[name] = "/index.html"
            # Tabs already handled (or unsupported, e.g. edxnotes/progress):
            # skip the generic handling below, including the final self.top entry.
            if path == "course" or "edxnotes" in path or "progress" in path or "info" in path or "courseware" in path:
                continue
            if "wiki" in path:
                # Full wiki crawl; "path" is rebound to the wiki output dir.
                self.wiki, self.wiki_name, path=annexe.wiki(c,self)
            elif "forum" in path:
                path="forum/"
                self.forum_thread, self.forum_category, self.staff_user_forum = annexe.forum(c,self)
            else:
                # Generic annex tab: download the page and localize its assets.
                output_path = os.path.join(self.output_path,path)
                make_dir(output_path)
                page_content=c.get_page(self.instance_url + top_elem["href"])
                soup_page=BeautifulSoup.BeautifulSoup(page_content, 'lxml')
                just_content = soup_page.find('section', attrs={"class": "container"})
                if just_content != None :
                    html_content=dl_dependencies(str(just_content),output_path,"",c)
                    self.page_annexe.append({ "output_path": output_path, "content": html_content,"title" : soup_page.find('title').get_text()})
                else:
                    # Not a plain page — maybe a book sidebar.
                    book=soup_page.find('section', attrs={"class": "book-sidebar"})
                    if book != None:
                        self.book_list_list.append({"output_path": output_path, "book_list" : annexe.booknav(self,book,output_path), "dir_path": path})
                    else:
                        # Unsupported tab type: warn and do not add it to the nav bar.
                        logging.warning("Oh it's seems we does not support one type of extra content (in top bar) :" + path)
                        continue
            # Register the tab in the top navigation (wiki/forum/annex cases).
            self.top[top_elem.get_text()]= path + "/index.html"
def __init__(self, json, path, rooturl, id, descendants, mooc):
    """Store this section's metadata, choose its FontAwesome icon from the
    block counts, and create its output directory."""
    self.mooc = mooc
    self.json = json
    self.path = path
    self.rooturl = rooturl
    self.id = id
    self.descendants = descendants
    # Shared navigation / output roots from the parent course.
    self.top = mooc.top
    self.output_path = mooc.output_path
    # Pick the icon for the first content kind present, book icon otherwise.
    counts = self.json["block_counts"]
    for kind, icon in (("video", "fa-video-camera"),
                       ("problem", "fa-question-circle"),
                       ("discussion", "fa-comment")):
        if counts[kind] != 0:
            self.icon_type = icon
            break
    else:
        self.icon_type = "fa-book"
    self.display_name = self.json["display_name"]
    self.folder_name = slugify(self.display_name)
    # Create the on-disk directory for this section.
    make_dir(os.path.join(self.output_path, self.path, self.folder_name))
def wiki(c,mooc):
    """Crawl the whole course wiki breadth-first from its entry page.

    Returns (wiki_data, wiki_name, wiki_path): a dict mapping each visited
    URL to {"path", "rooturl", "children", and for real pages "text",
    "title", "last-modif", "dir"}, the wiki's display name, and its
    relative output directory.
    """
    #Get redirection to wiki
    first_page=c.get_redirection(mooc.instance_url + "/courses/" + mooc.course_id + "/course_wiki")
    page_to_visit=[first_page]
    wiki_data={} #Data from page already visited
    # "[url]" : { "rooturl": , "path": , "text": , "title": , "dir" : , "children": [] }

    #Extract wiki name
    wiki_name = first_page.replace(mooc.instance_url + "/wiki/", "")[:-1]
    wiki_path = os.path.join("wiki", first_page.replace(mooc.instance_url + "/wiki/",""))

    while page_to_visit:
        get_page_error=False
        url = page_to_visit.pop()
        # FIX: ensure "content" is always bound — previously a failed
        # get_page left it unbound (NameError) or stale from the last loop
        # iteration, so error pages were parsed with the wrong HTML.
        content = ""
        try:
            content=c.get_page(url)
        except HTTPError as e:
            if e.code == 404 or e.code == 403:
                get_page_error=True
            else:
                logging.warning("Fail to get " + url + "Error :" + str(e.code))
        wiki_data[url]={}
        web_path=os.path.join("wiki", url.replace(mooc.instance_url + "/wiki/",""))
        path=os.path.join(mooc.output_path, web_path)
        make_dir(path)
        wiki_data[url]["path"] = path
        # One "../" per path segment, plus a leading one.
        rooturl="../"
        for _ in range(0,len(web_path.split("/"))):
            rooturl+="../"
        wiki_data[url]["rooturl"]= rooturl
        wiki_data[url]["children"]=[]
        #Parse content page
        soup=BeautifulSoup.BeautifulSoup(content, 'lxml')
        text=soup.find("div", attrs={"class": "wiki-article"})
        if text != None : #If it's a page (and not a list of page)
            #Find new wiki page in page content
            for link in text.find_all("a"):
                if link.has_attr("href") and "/wiki/" in link["href"]:
                    # Resolve relative links against the instance URL.
                    if not link["href"][0:4] == "http":
                        target = mooc.instance_url + link["href"]
                    else:
                        target = link["href"]
                    # FIX: compare the resolved URL against what we already
                    # queued/visited — the original compared the bs4 Tag
                    # object, which never matched, so pages were re-queued.
                    if target not in wiki_data and target not in page_to_visit:
                        page_to_visit.append(target)
                    if not link["href"][0:4] == "http":
                        #Update path in wiki page
                        link["href"] = rooturl[:-1] + link["href"].replace(mooc.instance_url,"") + "/index.html"
            wiki_data[url]["text"] = dl_dependencies(str(text),path,"",c)
            wiki_data[url]["title"] = soup.find("title").text
            wiki_data[url]["last-modif"] = soup.find("span", attrs={"class": "date"}).text
            wiki_data[url]["children"]=[]
        elif get_page_error:
            # Inaccessible page: render a static "permission denied" stub.
            wiki_data[url]["text"] = """<div><h1 class="page-header">Permission 
Denied</h1><p class="alert denied">Sorry, you don't have permission to view this page.</p></div>"""
            wiki_data[url]["title"] = "Permission Denied | Wiki"
            wiki_data[url]["last-modif"] = "Unknow"
            wiki_data[url]["children"]=[]
        #find new url of wiki in the list children page
        see_children=soup.find('div', attrs={"class": "see-children"})
        if see_children:
            allpage_url=str(see_children.find("a")["href"])
            wiki_data[url]["dir"] = allpage_url
            content=c.get_page(mooc.instance_url + allpage_url)
            soup=BeautifulSoup.BeautifulSoup(content, 'lxml')
            table=soup.find("table")
            if table != None:
                for link in table.find_all("a"):
                    if link.has_attr("class") and "list-children" in link["class"]:
                        continue
                    child_url = mooc.instance_url + link["href"]
                    # FIX: compare the full URL — the raw href never matched
                    # the full-URL keys, so children were queued repeatedly.
                    if child_url not in wiki_data and child_url not in page_to_visit:
                        page_to_visit.append(child_url)
                    wiki_data[url]["children"].append(child_url)
    return wiki_data, wiki_name, wiki_path
def forum(c,mooc):
    """Download the whole course forum: categories, threads and answers.

    Returns (threads, category, staff_user): the raw thread list (each
    augmented with a "data_thread" payload whose bodies are rendered to
    HTML with localized assets), an OrderedDict of category id -> info,
    and the list of user ids holding a staff role.
    """
    forum_output=os.path.join(mooc.output_path, "forum")
    make_dir(forum_output)
    content=c.get_page(mooc.instance_url + "/courses/" + mooc.course_id + "/discussion/forum")
    # On some platforms (e.g. FUN) the category list lives inside the
    # "thread-list-template" script tag instead of the visible page body.
    good_content=BeautifulSoup.BeautifulSoup(content, 'lxml').find("script", attrs={"id": "thread-list-template"})
    category=OrderedDict()
    if good_content:
        soup=BeautifulSoup.BeautifulSoup(content, 'lxml')
        all_category=soup.find_all('li', attrs={"class": "forum-nav-browse-menu-item"})
        if len(all_category) == 0:
            # Category list is in the script with id thread-list-template
            soup=BeautifulSoup.BeautifulSoup(good_content.text, 'lxml')
            all_category=soup.find_all('li', attrs={"class": "forum-nav-browse-menu-item"})
        for cat in all_category:
            # Skip the synthetic "all discussions" / "posts I'm following" entries.
            if (cat.has_attr("id") and cat["id"] in [ "all_discussions", "posts_following" ]) or (cat.has_attr("class") and ("forum-nav-browse-menu-all" in cat["class"] or "forum-nav-browse-menu-following" in cat["class"])):
                continue
            if not cat.has_attr("data-discussion-id"):
                # Parent category grouping sub-categories; give it a generated id.
                category[str(uuid4())] = { "title" : cat.find(["a", "span"], attrs={"class": "forum-nav-browse-title"}).text, "catego_with_sub_catego" : True}
            elif cat.has_attr("data-discussion-id"):
                category[cat["data-discussion-id"]] = {"title": str(cat.text).replace("\n","")}
    else:
        logging.error("No forum category found")
    threads=[]
    #Search for Staff user : their roles are embedded in the page markup.
    json_user={}
    section_user = BeautifulSoup.BeautifulSoup(content, 'lxml').find("section", attrs={"id": "discussion-container"})
    if section_user and section_user.has_attr("data-roles"):
        # FIX: restore the '"&quot;"' literal — the source we received had it
        # mangled into an unterminated triple quote. HTML-escaped payloads
        # must be unescaped before JSON parsing.
        if "&quot;" in section_user["data-roles"]:
            json_user = json.loads(unescape(section_user["data-roles"]))
        else:
            json_user = json.loads(section_user["data-roles"])
    else:
        section_user = re.search("roles: [^\n]*", content)
        if section_user: #TODO check ok in this case
            json_user=json.loads(re.sub(r"roles: (.*),", r'\1', section_user.group()))
    staff_user = []
    for x in json_user:
        staff_user += [ str(y) for y in json_user[x]]
    #Fetch the paginated thread list of every category.
    for x in category:
        make_dir(os.path.join(forum_output,x))
        url="/courses/" + mooc.course_id + "/discussion/forum/" + x + "/inline?ajax=1&page=1&sort_key=activity&sort_order=desc"
        data=c.get_api_json(url)
        d=data["discussion_data"]
        threads+=d
        for i in range(1,data["num_pages"]):
            url="/courses/" + mooc.course_id + "/discussion/forum/" + x + "/inline?ajax=1&page=" + str(i+1) + "&sort_key=activity&sort_order=desc"
            data=c.get_api_json(url)
            d=data["discussion_data"]
            threads+=d
    #Fetch each thread with all of its answers (100 per request).
    for thread in threads:
        url = "/courses/" + mooc.course_id + "/discussion/forum/" + thread["commentable_id"] + "/threads/" + thread["id"] + "?ajax=1&resp_skip=0&resp_limit=100"
        make_dir(os.path.join(forum_output,thread["id"]))
        try:
            thread["data_thread"]=c.get_api_json(url, referer=mooc.instance_url+url.split("?")[0])
            total_answers = 100
            while total_answers < thread["data_thread"]["content"]["resp_total"]:
                url = "/courses/" + mooc.course_id + "/discussion/forum/" + thread["commentable_id"] + "/threads/" + thread["id"] + "?ajax=1&resp_skip="+ str(total_answers) + "&resp_limit=100"
                new_answers=c.get_api_json(url, referer=mooc.instance_url+url.split("?")[0])["content"]["children"]
                thread["data_thread"]["content"]["children"] += new_answers
                total_answers += 100
        except Exception:  # FIX: narrowed from bare except
            try:
                thread["data_thread"]=c.get_api_json(url)
            except Exception:
                # FIX: logging.log() requires a level as its first argument and
                # would raise TypeError; use logging.error with the same message.
                logging.error("Can not get " + mooc.instance_url + url + "discussion")
        if ("endorsed_responses" in thread["data_thread"]["content"] or "non_endorsed_responses" in thread["data_thread"]["content"]) and "children" in thread["data_thread"]["content"]:
            logging.warning("pb endorsed VS children" + thread["id"])
        # Normalize answers into a single "children" list.
        if "children" not in thread["data_thread"]["content"]:
            thread["data_thread"]["content"]["children"] = []
        if "endorsed_responses" in thread["data_thread"]["content"]:
            thread["data_thread"]["content"]["children"] += thread["data_thread"]["content"]["endorsed_responses"]
        if "non_endorsed_responses" in thread["data_thread"]["content"]:
            thread["data_thread"]["content"]["children"] += thread["data_thread"]["content"]["non_endorsed_responses"]
        # Render markdown bodies to HTML and localize their dependencies.
        thread["data_thread"]["content"]["body"] = dl_dependencies(markdown(thread["data_thread"]["content"]["body"]),os.path.join(forum_output,thread["id"]),"",c)
        for children in thread["data_thread"]["content"]["children"]:
            children["body"]=dl_dependencies(markdown(children["body"]),os.path.join(forum_output,thread["id"]),"",c)
            if "children" in children:
                for children_children in children["children"]:
                    children_children["body"]=dl_dependencies(markdown(children_children["body"]),os.path.join(forum_output,thread["id"]),"",c)
    return threads, category, staff_user