def get_subjects(queue):
    """Build the list of subjects found in the user page.

    Downloads ``connection.user_url``, takes the fourth ``li.contentnode``
    element of the page and treats every ``<li>`` nested inside it as a
    potential subject.

    Args:
        queue (Queue): queue handed over to every ``Subject`` created.

    Returns:
        list: the ``Subject`` instances created, sorted by name.
    """
    log = logging.getLogger(__name__)

    conn = Connection()
    user_page = conn.get(conn.user_url)
    parsed = BeautifulSoup(user_page.text, "html.parser")

    candidates = parsed.find_all("li", class_="contentnode")[3].find_all("li")
    log.debug("Found %d potential subjects", len(candidates))

    found = []
    for item in candidates:
        # The course id is taken from the ``course=<digits>`` part of the link.
        id_match = re.search(r"course=(\d+)", item.a["href"])
        course_id = int(id_match.group(1))

        # The name is the text that precedes the first opening parenthesis.
        name_match = re.search(r"^([\w\/\sáéíóúÁÉÍÓÚ]+?)\s?\(", item.text)
        name = name_match.group(1)

        if course_id in settings.exclude_subjects_ids:
            log.info("Excluding subject %s (%d)", name, course_id)
        elif "grado" not in name.lower():
            # Entries with 'grado' in the name are skipped: that entry is the
            # degree itself, not a subject.
            log.debug("Assembling subject %r", name)
            course_url = (
                "https://campusvirtual.uva.es/course/view.php?id=%d" % course_id
            )
            found.append(Subject(name, course_url, queue))

    return sorted(found, key=lambda subject: subject.name)
class Subject:
    """Representation of a subject.

    A subject owns the folder where its files are stored and the list of
    links found in its main page. Every link found is also pushed into the
    queue received at construction time.
    """

    def __init__(self, name, url, queue):
        """
        Args:
            name (str): name of the subject.
            url (str): url of the subject.
            queue (Queue): queue to control threads.
        """

        # Path separators are removed because the name is used as a folder name.
        name = name.capitalize().replace("\\", "").replace("/", "").strip()

        # The final name is whatever Alias returns for the sha1 of the url.
        self.name = Alias.id_to_alias(
            sha1(url.encode()).hexdigest(), settings.root_folder / name
        ).name
        self.url = url
        self.connection = Connection()
        self.queue = queue

        self.enable_section_indexing = self.url in settings.section_indexing_urls

        self.response: Response = None
        self.soup: BeautifulSoup = None
        self.notes_links = []
        self.folder_lock = Lock()
        self.hasfolder = False
        self.folder = settings.root_folder / self.name
        self.logger = logging.getLogger(__name__)

        self.logger.debug(
            "Created %s(name=%r, url=%r)", type(self).__name__, self.name, self.url
        )

    def __repr__(self):
        return (
            f"{type(self).__name__}(name={self.name!r}, url={self.url!r}, "
            f"{len(self.notes_links)} notes links)"
        )

    def __str__(self):
        return f"{self.name}"

    def make_request(self):
        """Makes the primary request and parses it with BeautifulSoup."""
        self.logger.debug("Making subject request")
        self.response = self.connection.get(self.url)
        self.soup = BeautifulSoup(self.response.text, "html.parser")
        self.logger.debug("Response obtained [%d]", self.response.status_code)
        self.logger.debug("Response parsed")

    def create_folder(self):
        """Creates the folder named as self, if it was not created before."""
        if self.hasfolder is False:
            self.logger.debug("Creating folder %r", self.name)
            # The lock keeps the exists-then-create sequence atomic among the
            # users of this subject.
            with self.folder_lock:
                if not self.folder.exists():
                    os.makedirs(self.folder.as_posix())
            self.hasfolder = True
        else:
            self.logger.debug("Folder already exists: %r", self.name)

    def add_link(self, link: BaseLink):
        """Adds a note link to the list and puts it in the queue.

        If section indexing is not enabled for this subject, the section of
        the link is discarded first.

        Args:
            link (BaseLink): link to register.
        """
        self.logger.debug("Adding link: %s", link.name)
        if not self.enable_section_indexing:
            link.section = None

        self.notes_links.append(link)
        self.queue.put(link)

    @staticmethod
    def find_section_by_child(child):
        """Returns the Section that contains a given element of the page.

        Args:
            child: BeautifulSoup element located inside a section.

        Returns:
            Section: built from the text and the link of the section title.
        """
        try:
            section_h3 = child.find_parent("li", class_="section main clearfix").find(
                "h3", class_="sectionname"
            )
        except AttributeError:
            # find_parent returned None: retry with the class string that also
            # includes 'current'.
            section_h3 = child.find_parent(
                "li", class_="section main clearfix current"
            ).find("h3", class_="sectionname")

        return Section(section_h3.text, section_h3.a["href"])

    @staticmethod
    def url_to_query_args(url: str):
        """Returns the query arguments of an url as a dict of lists.

        Args:
            url (str): url to parse.

        Returns:
            dict: maps each query argument to the list of its values.
        """
        return parse_qs(urlparse(url).query)

    def find_and_download_links(self):
        """Finds the links downloading the primary page.

        Every link found is registered with add_link(). The kind of link is
        decided by the first keyword found in its url; urls that match no
        keyword are ignored.
        """
        self.logger.debug("Finding links of %s", self.name)
        self.make_request()

        # Remove these elements from the tree before reading names from it.
        for hidden in self.soup.find_all("span", {"class": "accesshide"}):
            hidden.extract()
        for indent in self.soup.find_all("div", {"class": "mod-indent"}):
            indent.extract()

        for folder in self.soup.find_all("div", class_="singlebutton"):
            folder_name = folder.parent.parent.div.find(
                "span", class_="fp-filename"
            ).text
            section = self.find_section_by_child(folder)
            folder_url = folder.form["action"]
            folder_icon_url = folder.find_parent(
                "div", class_="contentwithoutlink"
            ).find("img", class_="icon")["src"]
            id_ = folder.form.find("input", {"name": "id"})["value"]

            self.logger.debug(
                "Created Folder (subject search): %r, %s", folder_name, folder_url
            )
            self.add_link(
                Folder(folder_name, section, folder_url, folder_icon_url, self, id_)
            )

        for resource in self.soup.find_all("div", class_="activityinstance"):
            if not resource.a:
                continue

            section = self.find_section_by_child(resource)
            name = resource.a.span.text
            url = resource.a["href"]
            icon_url = resource.a.img["src"]

            # The order of the checks matters: the first keyword found wins.
            if "resource" in url:
                self.logger.debug(
                    "Created Resource (subject search): %r, %s", name, url
                )
                self.add_link(Resource(name, section, url, icon_url, self))
            elif "folder" in url:
                # Folders use a fixed download url plus the id found in the
                # query arguments of the link.
                real_url = "https://campusvirtual.uva.es/mod/folder/download_folder.php"
                id_ = self.url_to_query_args(url)["id"][0]
                self.logger.debug(
                    "Created Folder (subject search): %r, id=%r", name, id_
                )
                self.add_link(Folder(name, section, real_url, icon_url, self, id_))
            elif "forum" in url:
                self.logger.debug("Created Forum (subject search): %r, %s", name, url)
                self.add_link(ForumList(name, section, url, icon_url, self))
            elif "chat" in url:
                self.logger.debug("Created Chat (subject search): %r, %s", name, url)
                self.add_link(Chat(name, section, url, icon_url, self))
            elif "page" in url:
                self.logger.debug("Created Page (subject search): %r, %s", name, url)
                self.add_link(Page(name, section, url, icon_url, self))
            elif "url" in url:
                self.logger.debug("Created Url (subject search): %r, %s", name, url)
                self.add_link(Url(name, section, url, icon_url, self))
            elif "assign" in url:
                self.logger.debug(
                    "Created Delivery (subject search): %r, %s", name, url
                )
                self.add_link(Delivery(name, section, url, icon_url, self))
            elif "kalvidres" in url:
                self.logger.debug(
                    "Created Kalvidres (subject search): %r, %s", name, url
                )
                self.add_link(Kalvidres(name, section, url, icon_url, self))
            elif "quiz" in url:
                self.logger.debug("Created Quiz (subject search): %r, %s", name, url)
                self.add_link(Quiz(name, section, url, icon_url, self))
            elif "collaborate" in url:
                self.logger.debug(
                    "Created Blackboard (subject search): %r, %s", name, url
                )
                self.add_link(BlackBoard(name, section, url, icon_url, self))

        self.logger.debug("Downloading files for subject %r", self.name)
def test_get(self):
    """Connection.get(url) must end up as exactly one get(url) call on
    ``self.downloader_m.return_value``."""
    connection = Connection()
    connection.get(self.url)

    downloader_get = self.downloader_m.return_value.get
    downloader_get.assert_called_once_with(self.url)
class BaseLink(_Notify):
    """Base class for Links."""

    def __init__(self, name, section, url, icon_url, subject, parent=None):
        """
        Args:
            name (str): name of the url.
            section (Section or None): section the link belongs to. Its name
                is used as a folder when building the filepath.
            url (str): URL of the url.
            icon_url (str or None): URL of the icon.
            subject (vcm.subject.Subject): subject of the url.
            parent (BaseLink): object that created self.
        """

        self.name = name.strip()
        self.section = section
        self.url = url
        self.icon_url = icon_url
        self.subject = subject
        self.connection = Connection()
        self.parent = parent

        self.response: Response = None
        self.soup: BeautifulSoup = None
        self.filepath: Path = None
        self.redirect_url = None
        self.response_name = None
        self.subfolders = []

        self.logger = logging.getLogger(__name__)
        self.logger.debug(
            "Created %s(name=%r, url=%r, subject=%r)",
            self.__class__.__name__,
            self.name,
            self.url,
            self.subject.name,
        )

    @property
    def content_disposition(self):
        """Content-Disposition header of the response, passed through unidecode.

        Raises:
            RuntimeError: if the request has not been made yet.
            KeyError: if the response has no Content-Disposition header.
        """
        if self.response is None:
            raise RuntimeError("Response not made yet")

        # unidecode.unidecode is used to remove accents.
        return unidecode.unidecode(self.response.headers["Content-Disposition"])

    def append_subfolder(self, dirname):
        """Appends a subfolder, after sanitizing it with secure_filename."""
        dirname = secure_filename(dirname)
        return self.subfolders.append(dirname)

    def insert_subfolder(self, index, dirname):
        """Inserts a subfolder at index, after sanitizing it with secure_filename."""
        dirname = secure_filename(dirname)
        return self.subfolders.insert(index, dirname)

    def create_subfolder(self):
        """Creates the subfolder, if it is configured."""
        self.create_subject_folder()

        if not self.filepath:
            self.autoset_filepath()

        folder: Path = self.filepath.parent

        if not folder.exists():
            os.makedirs(folder.as_posix(), exist_ok=True)
            self.logger.debug("Created subfolder %r", folder.as_posix())
        else:
            self.logger.debug("Subfolder already exists %r", folder.as_posix())

    @staticmethod
    def _process_filename(filepath: str):
        """Replaces the characters '<' and '>', which can not be part of a
        filepath, with a textual equivalent.

        Args:
            filepath (str): filepath to process.

        Returns:
            str: filepath processed.
        """
        filepath = filepath.replace(">", " mayor que ")
        filepath = filepath.replace("<", " menor que ")

        return filepath

    @staticmethod
    def _filename_to_ext(filename):
        """Returns the extension given a filename (without the leading dot)."""
        return Path(filename).suffix[1:]

    def _get_ext_from_response(self):
        """Returns the extension of the filename of the response.

        The sources are tried in order: the name already stored in
        self.response_name, the Content-Disposition HTTP header, the last
        component of the url and, finally, the subtype of the content type.

        Returns:
            str: the extension.
        """
        if self.response_name is not None:
            return self._filename_to_ext(self.response_name)

        try:
            disposition = self.content_disposition
        except KeyError:
            # The response has no Content-Disposition header.
            disposition = None

        # A header that does not match the pattern is handled like a missing
        # header instead of failing with AttributeError on None.
        match = None
        if disposition is not None:
            match = Patterns.FILENAME.search(disposition)

        if match is not None:
            self.response_name = match.group(1)
            extension = self._filename_to_ext(self.response_name)
            if extension:
                return extension

        self.response_name = Path(self.url).name
        extension = self._filename_to_ext(self.response_name)
        if extension:
            return extension

        # NOTE(review): content_type is None when the response has no
        # Content-Type header, which makes this line fail -- confirm whether
        # that can happen in practice.
        return self.content_type.split("/")[-1]

    def create_subject_folder(self):
        """Creates the subject's principal folder."""
        return self.subject.create_folder()

    def make_request(self):
        """Makes the request for the Link.

        The redirect url is used if it is set; otherwise, the url of the link.

        Raises:
            MoodleError: if the server replies with a 5xx status code.
            ResponseError: if the response is not ok for any other reason.
        """
        self.logger.debug("Making request")

        self.response = self.connection.get(self.redirect_url or self.url)

        self.logger.debug(
            "Response obtained [%d | %s]", self.response.status_code, self.content_type
        )

        if 500 <= self.response.status_code <= 599:
            raise MoodleError(
                f"Moodle server replied with {self.response.status_code}"
            )

        if self.response.status_code == 408:
            # NOTE(review): the number of retries is not limited.
            self.logger.warning("Received response with code 408, retrying")
            return self.make_request()

        if not self.response.ok:
            raise ResponseError(f"Got HTTP {self.response.status_code}")

    def close_connection(self):
        """Closes the response. Deprecated: emits a DeprecationWarning."""
        warnings.warn(
            "Since streams are not used, this method should not be called",
            DeprecationWarning,
        )
        self.logger.debug("Closing connection")
        self.response.close()

    def process_request_bs4(self):
        """Parses the response with BeautifulSoup with the html parser."""
        self.logger.debug("Parsing response (bs4)")
        self.soup = BeautifulSoup(self.response.text, "html.parser")
        self.logger.debug("Response parsed (bs4)")

    def autoset_filepath(self):
        """Determines the filepath of the Link.

        The path is built as subject folder / subfolders / section name /
        filename and then passed to Alias.id_to_alias, whose result becomes
        the final filepath. Does nothing if the filepath is already set.

        Raises:
            RuntimeError: if the request has not been made yet.
        """
        if self.filepath is not None:
            self.logger.debug("Filepath is setted, skipping (%s)", self.filepath)
            return

        if self.response is None:
            raise RuntimeError("Request not launched")

        filename = secure_filename(
            self._process_filename(self.name) + "." + self._get_ext_from_response()
        )

        self.logger.debug("Initial filename: %s", filename)

        temp_filepath = self.subject.folder

        if self.subfolders:
            # Paths are immutable: joinpath returns a new path, so the result
            # has to be kept or the subfolders are silently dropped.
            temp_filepath = temp_filepath.joinpath(*self.subfolders)

        if self.section:
            temp_filepath /= self.section.name

        temp_filepath /= filename

        # Not every link has an id attribute.
        folder_id = getattr(self, "id", None)

        self.filepath = Path(
            Alias.id_to_alias(
                sha1(self.url.encode()).hexdigest(),
                temp_filepath.as_posix(),
                folder_id,
            )
        )

        self.logger.debug("Set filepath: %r", self.filepath.as_posix())

    def download(self):
        """Wrapper for self.do_download().

        The response and the soup are always released afterwards, even if
        the download fails.
        """
        try:
            self.do_download()
        finally:
            self.response = None
            self.soup = None

    def do_download(self):
        """Abstract method to download the Link. Must be overridden by subclasses."""
        self.logger.debug("Called do_download() but it was not implemented")
        raise NotImplementedError

    def get_header_length(self):
        """Returns the Content-Length header as an int, or the length of the
        content if the header is missing."""
        try:
            return int(self.response.headers["Content-Length"])
        except KeyError:
            return len(self.response.content)

    @property
    def content_type(self):
        """Content-Type header of the response, or None if it is missing."""
        if "Content-Type" in self.response.headers:
            return self.response.headers["Content-Type"]

        return None

    def save_response_content(self):
        """Saves the response content to the disk.

        Nothing is written when the current module is Modules.notify, or when
        the size stored in REAL_FILE_CACHE for the filepath equals the length
        reported by get_header_length().
        """
        if self.filepath is None:
            self.autoset_filepath()

        if Modules.current() == Modules.notify:
            return

        self.create_subfolder()

        self.logger.debug(
            "filepath in REAL_FILE_CACHE: %s", self.filepath in REAL_FILE_CACHE
        )

        if self.filepath in REAL_FILE_CACHE:
            if REAL_FILE_CACHE[self.filepath] == self.get_header_length():
                self.logger.debug(
                    "File found in cache: Same content (%d)", len(self.response.content)
                )
                return

            self.logger.debug(
                "File found in cache: Different content (%d --> %d)",
                REAL_FILE_CACHE[self.filepath],
                len(self.response.content),
            )
            Results.print_updated(self.filepath)
        else:
            self.logger.debug(
                "File added to cache: %s [%d]",
                self.filepath,
                len(self.response.content),
            )
            REAL_FILE_CACHE[self.filepath] = len(self.response.content)
            Results.print_new(self.filepath)

        try:
            with self.filepath.open("wb") as file_handler:
                file_handler.write(self.response.content)
            self.logger.debug("File downloaded and saved: %s", self.filepath)
        except PermissionError:
            # Best effort: the failure is logged and the download continues.
            self.logger.warning(
                "File couldn't be downloaded due to permission error: %s",
                self.filepath.name,
            )
            self.logger.warning(
                "Permission error %s -- %s", self.subject.name, self.filepath.name
            )

    @staticmethod
    def ensure_origin(url: str) -> bool:
        """Returns True if the origin is the virtual campus.

        NOTE(review): this is a substring check, so any url that contains
        'uva.es' anywhere is accepted.
        """
        return "uva.es" in url