def download_file_or_doc(self, unit_name, file_url, unit, path, file_dict, text_file):
    """Download file or document ``unit_name`` into ``path`` in the local directory.

    The target location is
    ``<self.default_folder>/<path>/<check_folder_name(unit_name)>``.
    ``file_dict`` is updated in place with:

    * ``"format"`` - the response's ``content-type`` header (``None`` when the
      server sends none),
    * ``"path"``   - absolute target path with apostrophes removed,
    * ``"body"``   - only when a download happens: the result of
      ``convert_pdf`` for PDFs, the raw response content for "txt" types,
      otherwise an empty string.

    Args:
        unit_name: Link text of the item; used for the file name and messages.
        file_url: URL the item is fetched from.
        unit: Not used by this method; kept so existing callers keep working.
        path: Folder path relative to ``self.default_folder``.
        file_dict: Metadata dict for the item. Must already contain the keys
            ``"heading"`` and ``"description"``.
        text_file: Accumulated heading/description text for the current folder.

    Returns:
        A ``(text_file, delete_file_dict)`` tuple. ``text_file`` has the item's
        heading and description appended when the item was downloaded and its
        heading was not already present. ``delete_file_dict`` is ``True`` when
        ``self.need_to_update`` reported that no download was needed, i.e. the
        caller should discard ``file_dict``.
    """
    credentials = (self.username, self.password)

    # The session is a context manager so its pooled connections are released
    # even if the download or conversion below raises.
    with requests.session() as s:
        # Reaccess site with authentication since Chalk always returns an
        # error page; access is obtained only after the second attempt.
        s.get(file_url, auth=credentials)
        r = s.get(file_url, stream=True, auth=credentials)
        try:
            file_dict["format"] = r.headers.get("content-type")
            destination = "{:}/{:}/{:}".format(
                self.default_folder, path, check_folder_name(unit_name)
            )
            # Deleting apostrophes to prevent unterminated quote strings
            file_dict["path"] = os.path.abspath(destination).replace("'", "")

            # If the file already exists, the file_dict is to be deleted by
            # the caller and no downloading occurs.
            if not self.need_to_update(r, file_dict):
                print("{:} already up to date".format(unit_name))
                return text_file, True

            print("downloading {:}".format(unit_name))
            make_dirs(self.course_material_dict, self.default_folder)

            # Downloading process
            with open(file_dict["path"], "wb") as f:
                r.raw.decode_content = True
                f.write(r.content)

            # The header may be absent; fall back to "" so the substring
            # tests below cannot raise TypeError on None.
            content_type = file_dict["format"] or ""

            # Obtain body of file depending on file format
            if "pdf" in content_type:
                try:
                    file_dict["body"] = convert_pdf(file_dict["path"])
                except Exception:
                    # Best effort: an unreadable PDF is stored with no body.
                    file_dict["body"] = ""
            elif "txt" in content_type:
                # NOTE(review): the usual plain-text content type is
                # "text/plain", which does not contain "txt" - confirm this
                # branch is reachable. The body here is raw bytes.
                file_dict["body"] = r.content
            else:
                file_dict["body"] = ""

            if file_dict["heading"] not in text_file:
                # Adding heading and description of each file
                text_file = (
                    text_file + file_dict["heading"] + "\n" + file_dict["description"] + "\n\n"
                )
            return text_file, False
        finally:
            # stream=True keeps the connection checked out until the body is
            # consumed or the response is closed; close it on every path.
            r.close()
def gen_folder(self, unit, path, folder_dict, course):
    """Generate a folder and perform operations within that folder.

    Clicks the first link inside ``unit`` to open the folder page. If that page
    has a ``content_listContainer`` element, every ``li`` in it that carries an
    ``item_icon`` image is handled according to the image's ``src``:

    * ``folder_on``   - an empty dict is registered in ``folder_dict`` under the
      cleaned folder name and the sub-folder is crawled recursively,
    * ``file_on``     - the item's link is downloaded,
    * ``document_on`` - every link inside the item is downloaded, with the
      item's ``p`` texts as description.

    Downloaded items are appended to ``self.file_list`` unless
    ``download_file_or_doc`` reports them as already present. Collected
    headings/descriptions are written via ``self.download_text`` when
    non-empty. Finally the browser is sent one page back in history.

    Args:
        unit: Element whose first ``a`` descendant opens the folder.
        path: Local folder path (relative) corresponding to this folder.
        folder_dict: Dict that receives one empty sub-dict per sub-folder.
        course: Course name stored in each file's metadata dict.

    Returns:
        None.
    """
    unit.find_element_by_tag_name("a").click()

    if self.check_id_exists("content_listContainer"):
        num_of_items = len(
            self.browser.find_element_by_id("content_listContainer").find_elements_by_tag_name("li")
        )
        text_file = ""
        for unit_index in range(num_of_items):
            # Items are looked up again by index on every pass because the
            # recursive call below navigates away from this page and back.
            inner_unit = self.browser.find_element_by_id(
                "content_listContainer"
            ).find_elements_by_tag_name("li")[unit_index]

            if not self.check_tag_exists_in_web_element(inner_unit, "img"):
                continue
            img = inner_unit.find_element_by_tag_name("img")
            if img.get_attribute("class") != "item_icon":
                continue

            # Fetch the attribute once; it is None when the image has no
            # src, so fall back to "" to keep the substring tests safe.
            icon_src = img.get_attribute("src") or ""

            # if icon is a folder
            if "folder_on" in icon_src:
                folder_name = check_folder_name(inner_unit.find_element_by_tag_name("a").text)
                folder_dict[folder_name] = {}
                make_dirs(self.course_material_dict, self.default_folder)
                # Recursively generate folders within folders
                self.gen_folder(
                    inner_unit, path + "/{:}".format(folder_name), folder_dict[folder_name], course
                )

            # if icon is a file
            elif "file_on" in icon_src:
                link = inner_unit.find_element_by_tag_name("a")
                unit_name = link.text
                file_url = link.get_attribute("href")
                heading = inner_unit.find_element_by_tag_name("h3").text
                file_dict = {"course": course, "heading": heading, "description": ""}
                text_file, delete_file_dict = self.download_file_or_doc(
                    unit_name, file_url, inner_unit, path, file_dict, text_file
                )
                # Only files that were actually downloaded are recorded.
                if not delete_file_dict:
                    self.file_list.append(file_dict)

            # if icon is a document
            elif "document_on" in icon_src:
                if self.check_tag_exists_in_web_element(inner_unit, "a"):
                    # Heading and description belong to the item, not to the
                    # individual link, so they are read once per item.
                    heading = inner_unit.find_element_by_tag_name("h3").text
                    description = "".join(
                        paragraph.text + "\n"
                        for paragraph in inner_unit.find_elements_by_tag_name("p")
                    )
                    for download_file in inner_unit.find_elements_by_tag_name("a"):
                        unit_name = download_file.text
                        file_url = download_file.get_attribute("href")
                        file_dict = {
                            "course": course,
                            "heading": heading,
                            "description": description,
                        }
                        text_file, delete_file_dict = self.download_file_or_doc(
                            unit_name, file_url, download_file, path, file_dict, text_file
                        )
                        # Only files that were actually downloaded are
                        # recorded.
                        if not delete_file_dict:
                            self.file_list.append(file_dict)

        # download descriptions text describing all headers and descriptions
        # of each file in a folder if the descriptions text is not empty
        if text_file != "":
            self.download_text("Chalk context for files", text_file, path)

    # Return to the page this folder was opened from.
    self.browser.execute_script("window.history.go(-1)")
    return None
def build_course_dict(self, course_info, material_dict, prof_list, course, course_list):
    """Crawl a course in Chalk, download its materials and collect its roster.

    Opens the course via the link whose text contains ``course`` and walks the
    ``li`` entries of the ``courseMenuPalette_contents`` menu:

    * "Announcements" - registers an empty dict in ``material_dict`` and saves
      the announcement text through ``self.download_text``.
    * "Send Email"    - reads the teaching assistants and the students from the
      e-mail tool, appends both lists to ``course_list`` and appends
      ``course_list`` to ``self.course_info``.
    * Any entry not in the skip list - treated as a content area: folders are
      crawled with ``self.gen_folder``; files and documents are downloaded
      with ``self.download_file_or_doc`` and recorded in ``self.file_list``.

    Finally the "My Chalk" link is clicked to return to the home page.

    Args:
        course_info: Not used by this method (``self.course_info`` is what
            gets appended to); kept so existing callers keep working.
        material_dict: Dict that receives one entry per crawled menu item.
        prof_list: Professor names as "First Last" strings; used to exclude
            professors from the student list.
        course: Course name; used to find the course link and as the root of
            the local folder path.
        course_list: List that receives the TA list and the student list.

    Returns:
        ``material_dict``, updated in place.
    """
    # Menu entries that are either handled by their own branch or are not
    # crawled for materials.
    skipped_items = [
        "Home",
        "Announcements",
        "Send Email",
        "My Grades",
        "Discussion Board",
        "Discussions",
        "Library Course Reserves",
        "Tools",
        "Groups",
        "Calendar",
    ]

    # Click course link on Chalk home page
    self.browser.find_element_by_partial_link_text(course).click()

    num_of_menu_items = len(
        self.browser.find_element_by_id("courseMenuPalette_contents").find_elements_by_tag_name("li")
    )
    for item_index in range(num_of_menu_items):
        # For each item on the left panel (i.e. Announcements, Syllabus...).
        # Looked up again by index on every pass because each branch below
        # navigates to other pages.
        item = self.browser.find_element_by_id(
            "courseMenuPalette_contents"
        ).find_elements_by_tag_name("li")[item_index]
        item_name = item.text

        if item_name == "Announcements":
            material_dict[item_name] = {}
            # Generate item_name folder
            make_dirs(self.course_material_dict, self.default_folder)
            item.find_element_by_tag_name("a").click()

            if self.check_id_exists("content_listContainer"):
                content_list_container = self.browser.find_element_by_id("content_listContainer")
                # Adds text of each icon on to announcement_text
                announcement_text = "".join(
                    unit.text + "\n\n"
                    for unit in content_list_container.find_elements_by_tag_name("li")
                )
            else:
                # if no container exists
                content = self.browser.find_element_by_id("content")
                # if announcements is a list of text
                if self.check_id_exists("announcementList"):
                    announcement_text = content.find_element_by_id("announcementList").text
                else:
                    announcement_text = ""  # No announcements

            if announcement_text != "":
                self.download_text(
                    "Announcements",
                    announcement_text,
                    "{:}/Announcements".format(str(check_folder_name(course))),
                )

        elif item_name == "Send Email":
            list_of_tas = []
            list_of_students = []
            item.find_element_by_tag_name("a").click()
            self.browser.find_element_by_link_text("All Teaching Assistant Users").click()

            # If TA's present and available
            if not self.check_id_exists("inlineReceipt_bad"):
                # NOTE(review): the [3:] slice presumably strips a short
                # label prefix from the recipient line - confirm against
                # the page.
                list_of_tas = (
                    self.browser.find_element_by_id("stepcontent1")
                    .find_elements_by_tag_name("li")[0]
                    .text[3:]
                    .split("; ")
                )
            course_list.append(list_of_tas)

            # Navigate browser back one page
            self.browser.execute_script("window.history.go(-1)")

            if self.check_link_text_exists("Select Users"):
                self.browser.find_element_by_link_text("Select Users").click()
                list_of_students_web_elements = (
                    self.browser.find_element_by_id("stepcontent1")
                    .find_element_by_name("USERS_AVAIL")
                    .find_elements_by_tag_name("option")
                )

                # Professors are compared as "<second word>, <first word>".
                # A single-word name has nothing to reorder and is compared
                # as-is instead of raising IndexError.
                excluded_names = set(list_of_tas)
                for professor in prof_list:
                    name_parts = professor.split(" ")
                    if len(name_parts) > 1:
                        excluded_names.add(name_parts[1] + ", " + name_parts[0])
                    else:
                        excluded_names.add(professor)

                for student_web_element in list_of_students_web_elements:
                    # Read the text once per option instead of once per test.
                    student_name = student_web_element.text
                    # excluding profs and TA's from list of students
                    if student_name not in excluded_names and "PreviewUser" not in student_name:
                        list_of_students.append(student_name)

                # Navigate browser back one page
                self.browser.execute_script("window.history.go(-1)")

            course_list.append(list_of_students)
            self.course_info.append(course_list)

        elif item_name not in skipped_items:
            component = check_folder_name(item_name)
            material_dict[component] = {}
            folder_empty = True
            # Generates item_name folder in folder path
            make_dirs(self.course_material_dict, self.default_folder)
            item.find_element_by_tag_name("a").click()

            # NOTE(review): this XPath looks malformed - "//*div" is not a
            # valid step, and the two adjacent literals join without a space
            # ("noItemscontainer-empty"). Left unchanged because correcting it
            # would start skipping pages via the `continue`; confirm the
            # intended selector against the page markup.
            if self.check_xpath_exists('//*div[@class = "noItems' 'container-empty"]'):
                continue
            elif self.check_id_exists("content_listContainer"):
                num_of_items = len(
                    self.browser.find_element_by_id("content_listContainer").find_elements_by_tag_name(
                        "li"
                    )
                )
                text_file = ""
                for unit_index in range(num_of_items):
                    time.sleep(1)  # Wait for element to be found
                    # each unit on the content panel
                    unit = self.browser.find_element_by_id(
                        "content_listContainer"
                    ).find_elements_by_tag_name("li")[unit_index]

                    if not self.check_tag_exists_in_web_element(unit, "img"):
                        continue
                    img = unit.find_element_by_tag_name("img")
                    if img.get_attribute("class") != "item_icon":
                        continue

                    # Fetch the attribute once; it is None when the image
                    # has no src, so fall back to "" for the tests below.
                    icon_src = img.get_attribute("src") or ""

                    # if icon is a folder
                    if "folder_on" in icon_src:
                        folder_empty = False
                        folder_name = check_folder_name(unit.find_element_by_tag_name("a").text)
                        material_dict[component][folder_name] = {}
                        # Generate new folder
                        make_dirs(self.course_material_dict, self.default_folder)
                        self.gen_folder(
                            unit,
                            "{:}/{:}/{:}".format(check_folder_name(course), component, folder_name),
                            material_dict[component][folder_name],
                            course,
                        )

                    # if icon is a file
                    elif "file_on" in icon_src:
                        folder_empty = False
                        link = unit.find_element_by_tag_name("a")
                        unit_name = link.text
                        file_url = link.get_attribute("href")
                        heading = unit.find_element_by_tag_name("h3").text
                        file_dict = {"course": course, "heading": heading, "description": ""}
                        text_file, delete_file_dict = self.download_file_or_doc(
                            unit_name,
                            file_url,
                            unit,
                            check_folder_name(course) + "/" + component,
                            file_dict,
                            text_file,
                        )
                        # Only files that were actually downloaded are
                        # recorded.
                        if not delete_file_dict:
                            self.file_list.append(file_dict)

                    # if icon is a document with download links
                    elif "document_on" in icon_src:
                        folder_empty = False
                        # if download links present
                        if self.check_tag_exists_in_web_element(unit, "a"):
                            # Heading and description belong to the item,
                            # so they are read once rather than per link.
                            heading = unit.find_element_by_tag_name("h3").text
                            description = "".join(
                                paragraph.text + "\n"
                                for paragraph in unit.find_elements_by_tag_name("p")
                            )
                            for download_file in unit.find_elements_by_tag_name("a"):
                                unit_name = download_file.text
                                file_url = download_file.get_attribute("href")
                                file_dict = {
                                    "course": course,
                                    "heading": heading,
                                    "description": description,
                                }
                                text_file, delete_file_dict = self.download_file_or_doc(
                                    unit_name,
                                    file_url,
                                    download_file,
                                    check_folder_name(course) + "/" + component,
                                    file_dict,
                                    text_file,
                                )
                                # Only files that were actually downloaded
                                # are recorded.
                                if not delete_file_dict:
                                    self.file_list.append(file_dict)

                # downloads text for describing each icon
                if text_file != "":
                    self.download_text(
                        "Chalk context for files",
                        text_file,
                        "{:}/{:}/".format(
                            str(check_folder_name(course)), str(check_folder_name(item_name))
                        ),
                    )

            # deletes folder if empty
            if folder_empty:
                del material_dict[component]
                make_dirs(self.course_material_dict, self.default_folder)

    # Go back to Chalk Home Page
    self.browser.find_element_by_id("My Chalk").find_element_by_tag_name("a").click()
    return material_dict