Example #1
def download(self, c):
    content = c.get_page(self.json["student_view_url"])
    soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
    # Prefer the notes wrapper; fall back to the full course wrapper.
    html_content = soup.find('div', attrs={"class": "edx-notes-wrapper"})
    if html_content is None:
        html_content = soup.find('div', attrs={"class": "course-wrapper"})
    self.html = dl_dependencies(str(html_content), self.output_path,
                                self.folder_name, c)
Example #2
    def download(self, c):
        # Fetch the site favicon via Google's favicon service.
        download("https://www.google.com/s2/favicons?domain=" + self.instance_url,
                 os.path.join(self.output_path, "favicon.png"), None)

        logging.info("Get homepage")
        content = c.get_page(self.course_url)
        make_dir(os.path.join(self.output_path, "home"))
        self.html_homepage = []
        soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
        html_content = soup.find('div', attrs={"class": "welcome-message"})
        if html_content is None:
            # No welcome message: collect the "info-wrapper" articles instead.
            html_content = soup.find_all('div', attrs={"class": re.compile("info-wrapper")})
            if not html_content:
                self.no_homepage = True
            else:
                for article in html_content:
                    dismiss = article.find("div", attrs={"class": "dismiss-message"})
                    if dismiss is not None:
                        dismiss.decompose()
                    bookmark = article.find("a", attrs={"class": "action-show-bookmarks"})
                    if bookmark is not None:
                        bookmark.decompose()
                    # find_all() returns a list (possibly empty), never None.
                    for button in article.find_all("button", attrs={"class": "toggle-visibility-button"}):
                        button.decompose()
                    article['class'] = "toggle-visibility-element article-content"
                    self.html_homepage.append(
                        dl_dependencies(article.prettify(),
                                        os.path.join(self.output_path, "home"),
                                        "home", c))
        else:
            dismiss = html_content.find("div", attrs={"class": "dismiss-message"})
            if dismiss is not None:
                dismiss.decompose()
            bookmark = html_content.find("a", attrs={"class": "action-show-bookmarks"})
            if bookmark is not None:
                bookmark.decompose()
            for button in html_content.find_all("button", attrs={"class": "toggle-visibility-button"}):
                button.decompose()
            self.html_homepage.append(
                dl_dependencies(html_content.prettify(),
                                os.path.join(self.output_path, "home"),
                                "home", c))
        logging.info("Get content")
        for x in self.object:
            x.download(c)
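
Every optional element in the two branches above goes through the same find-then-decompose dance. A small sketch of how that pattern could be factored out (decompose_all is a hypothetical helper, not part of this codebase):

def decompose_all(root, name, css_class):
    # find_all() returns an empty list when nothing matches,
    # so no None check is needed before decomposing.
    for element in root.find_all(name, attrs={"class": css_class}):
        element.decompose()

# Hypothetical usage: decompose_all(article, "div", "dismiss-message")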
Example #3
def download(self, c):
    content = c.get_page(self.json["student_view_url"])
    soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
    html_content = soup.find('div', attrs={"class": "edx-notes-wrapper"})
    if html_content is None:
        html_content = soup.find('div', attrs={"class": "course-wrapper"})
    soup = BeautifulSoup.BeautifulSoup(str(html_content), "lxml")
    text_area = soup.find("textarea", attrs={"class": "student_answer"})
    # The "check" button is removed: answers cannot be verified offline.
    check = soup.find("button", attrs={"class": "check"})
    if check is not None:
        check.decompose()
    save = soup.find("button", attrs={"class": "save"})
    text_area["id"] = self.id
    #check["onclick"] = 'check_freetext("{}")'.format(self.id)
    save["onclick"] = 'save_freetext("{}")'.format(self.id)
    html_no_answers = ('<div class="noanswers"><p data-l10n-id="no_answers_for_freetext">'
                       '<b>Warning:</b> There is no correction for the Freetext block.</p></div>')
    self.html = html_no_answers + dl_dependencies(
        str(soup), self.output_path, self.folder_name, c)
Example #4
def annexe(self, c):
    logging.info("Trying to get the specific pages of the MOOC")
    content = c.get_page(self.course_url)
    soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
    # The top-bar markup varies between Open edX versions.
    top_bs = (soup.find('ol', attrs={"class": "course-material"})
              or soup.find('ul', attrs={"class": "course-material"})
              or soup.find('ul', attrs={"class": "navbar-nav"})
              or soup.find('ol', attrs={"class": "course-tabs"}))
    if top_bs is not None:
        for top_elem in top_bs.find_all("li"):
            top_elem = top_elem.find("a")
            # Take the last segment of the link as the page name.
            path = top_elem["href"].rstrip("/").split("/")[-1]
            if path == "course" or "courseware" in path:
                name = top_elem.get_text().replace(", current location", "")
                self.top[name] = "course/" + self.head.folder_name + "/index.html"
            if "info" in path:
                name = top_elem.get_text().replace(", current location", "")
                self.top[name] = "/index.html"
            if path == "course" or "edxnotes" in path or "progress" in path or "info" in path or "courseware" in path:
                continue
            if "wiki" in path:
                self.wiki, self.wiki_name, path = annexe.wiki(c, self)
            elif "forum" in path:
                path = "forum/"
                self.forum_thread, self.forum_category, self.staff_user_forum = annexe.forum(c, self)
            else:
                output_path = os.path.join(self.output_path, path)
                make_dir(output_path)
                page_content = c.get_page(self.instance_url + top_elem["href"])
                soup_page = BeautifulSoup.BeautifulSoup(page_content, 'lxml')
                just_content = soup_page.find('section', attrs={"class": "container"})
                if just_content is not None:
                    html_content = dl_dependencies(str(just_content), output_path, "", c)
                    self.page_annexe.append({
                        "output_path": output_path,
                        "content": html_content,
                        "title": soup_page.find('title').get_text()
                    })
                else:
                    book = soup_page.find('section', attrs={"class": "book-sidebar"})
                    if book is not None:
                        self.book_list_list.append({
                            "output_path": output_path,
                            "book_list": annexe.booknav(self, book, output_path),
                            "dir_path": path
                        })
                    else:
                        logging.warning("It seems we do not support this type of extra content (in the top bar): " + path)
                        continue
            # Strip any trailing slash so the link does not get a double slash.
            self.top[top_elem.get_text()] = path.rstrip("/") + "/index.html"
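
Extracting the last path segment of an href by hand breaks as soon as the link carries a query string or fragment. A sketch using only the standard library (illustrative; last_segment is not a helper from this project):

from urllib.parse import urlsplit

def last_segment(href):
    # Drop any query string or fragment, then take the last
    # non-empty path component.
    path = urlsplit(href).path.rstrip("/")
    return path.rsplit("/", 1)[-1]

# last_segment("/courses/x/wiki/?page=2") -> "wiki"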
Example #5
    def download(self, c):
        content = c.get_page(self.json["student_view_url"])
        soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
        try:
            html_content_from_div = str(
                soup.find('div', attrs={"class":
                                        "problems-wrapper"})['data-content'])
        except (TypeError, KeyError):
            # Some instances expose the problem through an API endpoint
            # instead of an inline data-content attribute.
            problem_json_url = str(
                soup.find('div', attrs={"class":
                                        "problems-wrapper"})['data-url'])
            html_content_from_div = str(
                c.get_api_json(problem_json_url + "/problem_get")["html"])
        soup = BeautifulSoup.BeautifulSoup(html_content_from_div, 'lxml')
        #self.has_hint=soup.find("button", attrs={"class": "hint-button"}) #Remove comment when hint ok
        for div in soup.find_all('div', attrs={"class": "notification"}):
            div.decompose()
        # Reset any previously entered answers.
        for input_tag in soup.find_all('input'):
            if input_tag.has_attr("value"):
                input_tag["value"] = ""
            if input_tag.has_attr("checked"):
                del input_tag.attrs['checked']
        action = soup.find('div', attrs={"class": "action"})
        if action is not None:
            action.decompose()
        for span in soup.find_all('span', attrs={"class": "unanswered"}):
            span.decompose()
        for span in soup.find_all('span', attrs={"class": "sr"}):
            span.decompose()
        html_content = dl_dependencies(str(soup), self.output_path,
                                       self.folder_name, c)
        self.html_content = str(html_content)

        #Save the JSON answers
        path_answers = os.path.join(self.output_path, "problem_show")
        answers_content = {"success": None}
        retry = 0
        # problem_show only returns the answers once problem_check has been
        # called, so keep checking until the answers appear (up to 6 tries).
        while "success" in answers_content and retry < 6:
            answers_content = c.get_api_json(
                "/courses/" + self.mooc.course_id + "/xblock/" +
                self.json["id"] + "/handler/xmodule_handler/problem_show")
            if "success" in answers_content:
                """
                #IMPROVEMENT: log the event on the connection, same as for hints?
                post_data=urlencode({'event_type': "problem_show", "event": { "problem": self.json["id"] }, "page" : self.json["lms_web_url"]}).encode('utf-8')
                c.get_api_json("/event", post_data)
                """
                c.get_api_json("/courses/" + self.mooc.course_id + "/xblock/" +
                               self.json["id"] +
                               "/handler/xmodule_handler/problem_check")
                retry += 1
        if "success" in answers_content:
            logging.warning("Failed to get the answers to this problem: " +
                            self.json["id"] + " (" + self.json["lms_web_url"] +
                            ")")
            self.answers = None
        else:
            with open(path_answers, "w") as f:
                json.dump(answers_content, f)
            self.answers = []
            self.explanation = []
            for qid in answers_content["answers"]:
                if "solution" not in qid:
                    for response in answers_content["answers"][qid]:
                        self.answers.append("input_" + qid + "_" + response)
                else:
                    self.explanation.append({
                        "name": "solution_" + qid,
                        "value": json.dumps(answers_content["answers"][qid])
                    })
            self.problem_id = str(uuid4())
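
The answers loop above is a bounded poll: hit an endpoint until the response satisfies a condition or a retry budget runs out, triggering a side effect (here problem_check) between attempts. A minimal sketch of that pattern under hypothetical names (fetch, done, and on_miss are placeholders, not part of this codebase):

def poll_until(fetch, done, on_miss, max_tries=6):
    # Retry fetch() until done(result) holds or the budget is spent;
    # on_miss() runs between attempts, like problem_check above.
    result = fetch()
    tries = 0
    while not done(result) and tries < max_tries:
        on_miss()
        result = fetch()
        tries += 1
    return result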
        """
Example #6
def wiki(c, mooc):
    # Follow the redirection to the wiki root.
    first_page = c.get_redirection(mooc.instance_url + "/courses/" + mooc.course_id + "/course_wiki")
    page_to_visit = [first_page]
    wiki_data = {}  # Pages already visited, keyed by URL:
    # "[url]": { "rooturl":, "path":, "text":, "title":, "dir":, "children": [] }
    # Extract the wiki name
    wiki_name = first_page.replace(mooc.instance_url + "/wiki/", "")[:-1]
    wiki_path = os.path.join("wiki", first_page.replace(mooc.instance_url + "/wiki/", ""))

    while page_to_visit:
        get_page_error = False
        url = page_to_visit.pop()
        try:
            content = c.get_page(url)
        except HTTPError as e:
            if e.code in (404, 403):
                get_page_error = True
                content = ""
            else:
                logging.warning("Failed to get " + url + "; error: " + str(e.code))
                continue

        wiki_data[url] = {}
        web_path = os.path.join("wiki", url.replace(mooc.instance_url + "/wiki/", ""))
        path = os.path.join(mooc.output_path, web_path)
        make_dir(path)
        wiki_data[url]["path"] = path
        # One "../" per path segment, plus one to leave the wiki directory.
        rooturl = "../" * (len(web_path.split("/")) + 1)
        wiki_data[url]["rooturl"] = rooturl
        wiki_data[url]["children"] = []

        # Parse the page content
        soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
        text = soup.find("div", attrs={"class": "wiki-article"})
        if text is not None:  # It is an article page (not a listing of pages)
            # Queue any wiki page linked from the article body.
            for link in text.find_all("a"):
                if link.has_attr("href") and "/wiki/" in link["href"]:
                    if link["href"].startswith("http"):
                        target = link["href"]
                    else:
                        target = mooc.instance_url + link["href"]
                    if target not in wiki_data and target not in page_to_visit:
                        page_to_visit.append(target)

                    if not link["href"].startswith("http"):
                        # Rewrite the internal link so it resolves in the static export.
                        link["href"] = rooturl[:-1] + link["href"].replace(mooc.instance_url, "") + "/index.html"

            wiki_data[url]["text"] = dl_dependencies(str(text), path, "", c)
            wiki_data[url]["title"] = soup.find("title").text
            wiki_data[url]["last-modif"] = soup.find("span", attrs={"class": "date"}).text
        elif get_page_error:
            wiki_data[url]["text"] = """<div><h1 class="page-header">Permission Denied</h1><p class="alert denied">Sorry, you don't have permission to view this page.</p></div>"""
            wiki_data[url]["title"] = "Permission Denied | Wiki"
            wiki_data[url]["last-modif"] = "Unknown"

        # Queue the child pages listed in the "see children" view.
        see_children = soup.find('div', attrs={"class": "see-children"})
        if see_children:
            allpage_url = str(see_children.find("a")["href"])
            wiki_data[url]["dir"] = allpage_url
            content = c.get_page(mooc.instance_url + allpage_url)
            soup = BeautifulSoup.BeautifulSoup(content, 'lxml')
            table = soup.find("table")
            if table is not None:
                for link in table.find_all("a"):
                    # Skip the "list children" toggles, keep real page links.
                    if link.has_attr("class") and "list-children" in link["class"]:
                        continue
                    child_url = mooc.instance_url + link["href"]
                    if child_url not in wiki_data and child_url not in page_to_visit:
                        page_to_visit.append(child_url)
                    wiki_data[url]["children"].append(child_url)
    return wiki_data, wiki_name, wiki_path
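
wiki() is at heart a worklist crawl: pop a URL, record it as visited, queue every unvisited link the page yields. Stripped of the scraping details, the traversal looks like this (crawl and extract_links are hypothetical names for illustration):

def crawl(start_url, extract_links):
    # The visited dict doubles as the result container,
    # exactly like wiki_data in the function above.
    to_visit = [start_url]
    visited = {}
    while to_visit:
        url = to_visit.pop()
        if url in visited:
            continue
        visited[url] = extract_links(url)
        for link in visited[url]:
            if link not in visited and link not in to_visit:
                to_visit.append(link)
    return visited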
Example #7
def forum(c, mooc):
    forum_output = os.path.join(mooc.output_path, "forum")
    make_dir(forum_output)
    content = c.get_page(mooc.instance_url + "/courses/" + mooc.course_id + "/discussion/forum")
    good_content = BeautifulSoup.BeautifulSoup(content, 'lxml').find("script", attrs={"id": "thread-list-template"})
    category = OrderedDict()
    if good_content:
        soup = BeautifulSoup.BeautifulSoup(content, 'lxml')

        all_category = soup.find_all('li', attrs={"class": "forum-nav-browse-menu-item"})
        if len(all_category) == 0:
            # On the FUN platform, the category list is inside the script tag
            # with id "thread-list-template".
            soup = BeautifulSoup.BeautifulSoup(good_content.text, 'lxml')
            all_category = soup.find_all('li', attrs={"class": "forum-nav-browse-menu-item"})
        for cat in all_category:
            # Skip the "all discussions" and "following" pseudo-categories.
            if (cat.has_attr("id") and cat["id"] in ["all_discussions", "posts_following"]) or (cat.has_attr("class") and ("forum-nav-browse-menu-all" in cat["class"] or "forum-nav-browse-menu-following" in cat["class"])):
                continue
            if not cat.has_attr("data-discussion-id"):
                category[str(uuid4())] = {
                    "title": cat.find(["a", "span"], attrs={"class": "forum-nav-browse-title"}).text,
                    "catego_with_sub_catego": True
                }
            elif cat.has_attr("data-discussion-id"):
                category[cat["data-discussion-id"]] = {"title": str(cat.text).replace("\n", "")}

    else:
        logging.error("No forum category found")
    threads = []

    # Look for staff users:
    json_user = {}
    section_user = BeautifulSoup.BeautifulSoup(content, 'lxml').find("section", attrs={"id": "discussion-container"})
    if section_user and section_user.has_attr("data-roles"):
        if "&#34;" in section_user["data-roles"]:
            json_user = json.loads(unescape(section_user["data-roles"]))
        else:
            json_user = json.loads(section_user["data-roles"])
    else:
        section_user = re.search("roles: [^\n]*", content)
        if section_user:  #TODO check ok in this case
            json_user = json.loads(re.sub(r"roles: (.*),", r'\1', section_user.group()))
    staff_user = []
    for x in json_user:
        staff_user += [str(y) for y in json_user[x]]

    # Fetch the threads of every category, page by page.
    for x in category:
        make_dir(os.path.join(forum_output, x))
        url = "/courses/" + mooc.course_id + "/discussion/forum/" + x + "/inline?ajax=1&page=1&sort_key=activity&sort_order=desc"
        data = c.get_api_json(url)
        threads += data["discussion_data"]
        for i in range(1, data["num_pages"]):
            url = "/courses/" + mooc.course_id + "/discussion/forum/" + x + "/inline?ajax=1&page=" + str(i + 1) + "&sort_key=activity&sort_order=desc"
            data = c.get_api_json(url)
            threads += data["discussion_data"]

    for thread in threads:
        url = "/courses/" + mooc.course_id + "/discussion/forum/" + thread["commentable_id"] + "/threads/" + thread["id"] + "?ajax=1&resp_skip=0&resp_limit=100"
        make_dir(os.path.join(forum_output, thread["id"]))
        try:
            thread["data_thread"] = c.get_api_json(url, referer=mooc.instance_url + url.split("?")[0])
            # Responses come in pages of 100; fetch them all.
            total_answers = 100
            while total_answers < thread["data_thread"]["content"]["resp_total"]:
                url = "/courses/" + mooc.course_id + "/discussion/forum/" + thread["commentable_id"] + "/threads/" + thread["id"] + "?ajax=1&resp_skip=" + str(total_answers) + "&resp_limit=100"
                new_answers = c.get_api_json(url, referer=mooc.instance_url + url.split("?")[0])["content"]["children"]
                thread["data_thread"]["content"]["children"] += new_answers
                total_answers += 100
        except Exception:
            try:
                thread["data_thread"] = c.get_api_json(url)
            except Exception:
                logging.error("Cannot get the discussion " + mooc.instance_url + url)
        if ("endorsed_responses" in thread["data_thread"]["content"] or "non_endorsed_responses" in thread["data_thread"]["content"]) and "children" in thread["data_thread"]["content"]:
            logging.warning("Endorsed responses and children are both present for thread " + thread["id"])
        if "children" not in thread["data_thread"]["content"]:
            thread["data_thread"]["content"]["children"] = []
        if "endorsed_responses" in thread["data_thread"]["content"]:
            thread["data_thread"]["content"]["children"] += thread["data_thread"]["content"]["endorsed_responses"]
        if "non_endorsed_responses" in thread["data_thread"]["content"]:
            thread["data_thread"]["content"]["children"] += thread["data_thread"]["content"]["non_endorsed_responses"]
        # Render the Markdown bodies and localize their external resources.
        thread["data_thread"]["content"]["body"] = dl_dependencies(markdown(thread["data_thread"]["content"]["body"]), os.path.join(forum_output, thread["id"]), "", c)
        for children in thread["data_thread"]["content"]["children"]:
            children["body"] = dl_dependencies(markdown(children["body"]), os.path.join(forum_output, thread["id"]), "", c)
            if "children" in children:
                for children_children in children["children"]:
                    children_children["body"] = dl_dependencies(markdown(children_children["body"]), os.path.join(forum_output, thread["id"]), "", c)

    return threads, category, staff_user
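
The thread-response loop above is offset-based pagination: ask for a fixed-size slice, compare the running offset against the reported total, and stop when everything has been fetched. A generic sketch of that pattern (fetch_page is a hypothetical stand-in for c.get_api_json):

def fetch_all(fetch_page, page_size=100):
    # Keep requesting the next slice until the total reported by the
    # responses is reached, as in the resp_skip loop above.
    items = []
    offset = 0
    total = None
    while total is None or offset < total:
        page = fetch_page(skip=offset, limit=page_size)
        items += page["children"]
        total = page["resp_total"]
        offset += page_size
    return items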