Esempio n. 1
0
def get_subjects(queue):
    """Builds the list of subjects found in the user page.

    The user page is requested and parsed, and the nested ``li`` items of the
    fourth ``li.contentnode`` element are treated as candidate subjects.
    Candidates whose course id is listed in ``settings.exclude_subjects_ids``
    or whose name contains ``grado`` are skipped.

    Args:
        queue: queue handed over to every ``Subject`` created.

    Returns:
        list: the subjects created, sorted by name.
    """
    log = logging.getLogger(__name__)

    conn = Connection()
    user_page = conn.get(conn.user_url)
    parsed = BeautifulSoup(user_page.text, "html.parser")
    container = parsed.find_all("li", class_="contentnode")[3]

    candidates = container.find_all("li")
    log.debug("Found %d potential subjects", len(candidates))
    found = []

    for candidate in candidates:
        id_match = re.search(r"course=(\d+)", candidate.a["href"])
        course_id = int(id_match.group(1))
        subject_url = "https://campusvirtual.uva.es/course/view.php?id=%d" % course_id
        name_match = re.search(r"^([\w\/\sáéíóúÁÉÍÓÚ]+?)\s?\(", candidate.text)
        name = name_match.group(1)

        if course_id in settings.exclude_subjects_ids:
            log.info("Excluding subject %s (%d)", name, course_id)
            continue

        # An entry with 'grado' in its name is the degree itself, not a subject.
        if "grado" in name.lower():
            continue

        log.debug("Assembling subject %r", name)
        found.append(Subject(name, subject_url, queue))

    return sorted(found, key=lambda subject: subject.name)
Esempio n. 2
0
class Subject:
    """Representation of a subject."""

    def __init__(self, name, url, queue):
        """Stores the subject data and derives its name and folder.

        The name is capitalized and stripped of slashes and backslashes, then
        resolved through ``Alias.id_to_alias`` using the SHA-1 of the url as
        id. The folder is ``settings.root_folder / self.name``.

        Args:
            name (str): name of the subject.
            url (str): url of the subject.
            queue (Queue): queue where every link added to the subject is put.
        """

        name = name.capitalize().replace("\\", "").replace("/", "").strip()

        self.name = Alias.id_to_alias(
            sha1(url.encode()).hexdigest(), settings.root_folder / name
        ).name
        self.url = url
        self.connection = Connection()
        self.queue = queue

        self.enable_section_indexing = self.url in settings.section_indexing_urls

        self.response: Response = None
        self.soup: BeautifulSoup = None
        self.notes_links = []
        self.folder_lock = Lock()
        self.hasfolder = False
        self.folder = settings.root_folder / self.name
        self.logger = logging.getLogger(__name__)

        self.logger.debug(
            "Created %s(name=%r, url=%r)", type(self).__name__, self.name, self.url
        )

    def __repr__(self):
        return (
            f"{type(self).__name__}(name={self.name!r}, url={self.url!r}, "
            f"{len(self.notes_links)} notes links)"
        )

    def __str__(self):
        return f"{self.name}"

    def make_request(self):
        """Requests the subject page and parses it into ``self.soup``."""
        self.logger.debug("Making subject request")
        self.response = self.connection.get(self.url)
        self.soup = BeautifulSoup(self.response.text, "html.parser")

        self.logger.debug("Response obtained [%d]", self.response.status_code)
        self.logger.debug("Response parsed")

    def create_folder(self):
        """Creates the subject folder the first time it is called."""
        if self.hasfolder is not False:
            self.logger.debug("Folder already exists: %r", self.name)
            return

        self.logger.debug("Creating folder %r", self.name)
        # The existence check and the creation happen under the lock so two
        # callers can not both try to create the folder.
        with self.folder_lock:
            if not self.folder.exists():
                os.makedirs(self.folder.as_posix())
        self.hasfolder = True

    def add_link(self, link: "BaseLink"):
        """Registers a link in the subject and puts it in the queue.

        The section of the link is cleared when section indexing is not
        enabled for this subject.

        Args:
            link (BaseLink): link to register.
        """
        self.logger.debug("Adding link: %s", link.name)
        if not self.enable_section_indexing:
            link.section = None

        self.notes_links.append(link)
        self.queue.put(link)

    @staticmethod
    def find_section_by_child(child):
        """Returns the ``Section`` that contains ``child``.

        The section is the closest ``li`` ancestor whose class is
        ``section main clearfix`` or, failing that,
        ``section main clearfix current``.

        Args:
            child: parsed element located inside a section.

        Returns:
            Section: built from the text and the link of the section title.

        Raises:
            AttributeError: if neither ancestor is found.
        """
        section_li = child.find_parent("li", class_="section main clearfix")
        if section_li is None:
            section_li = child.find_parent(
                "li", class_="section main clearfix current"
            )

        section_h3 = section_li.find("h3", class_="sectionname")
        return Section(section_h3.text, section_h3.a["href"])

    @staticmethod
    def url_to_query_args(url: str):
        """Returns the query arguments of ``url`` as a dict of lists."""
        return parse_qs(urlparse(url).query)

    def _build_activity_link(self, name, section, url, icon_url):
        """Builds the link whose module keyword appears in ``url``.

        Returns:
            The link created, or None if ``url`` matches no known keyword.
        """
        # Keywords are matched as substrings of the url and the first hit
        # wins, so the order of this table is part of the behaviour.
        kinds = (
            ("resource", "Resource", Resource),
            ("folder", "Folder", Folder),
            ("forum", "Forum", ForumList),
            ("chat", "Chat", Chat),
            ("page", "Page", Page),
            ("url", "Url", Url),
            ("assign", "Delivery", Delivery),
            ("kalvidres", "Kalvidres", Kalvidres),
            ("quiz", "Quiz", Quiz),
            ("collaborate", "Blackboard", BlackBoard),
        )

        for keyword, label, link_class in kinds:
            if keyword not in url:
                continue

            if link_class is Folder:
                # Folders are not fetched from their own url: they use the
                # fixed download endpoint plus the id taken from the query.
                real_url = "https://campusvirtual.uva.es/mod/folder/download_folder.php"
                id_ = self.url_to_query_args(url)["id"][0]
                self.logger.debug(
                    "Created Folder (subject search): %r, id=%r", name, id_
                )
                return Folder(name, section, real_url, icon_url, self, id_)

            self.logger.debug(
                "Created %s (subject search): %r, %s", label, name, url
            )
            return link_class(name, section, url, icon_url, self)

        return None

    def find_and_download_links(self):
        """Requests the subject page and registers every link found in it.

        Links are registered through ``add_link``, which also puts them in
        the queue; nothing is downloaded directly by this method.
        """
        self.logger.debug("Finding links of %s", self.name)
        self.make_request()

        # Remove these nodes from the tree before names are read.
        # NOTE(review): presumably they carry text that is not part of the
        # visible name -- confirm.
        for hidden in self.soup.find_all("span", {"class": "accesshide"}):
            hidden.extract()
        for indent in self.soup.find_all("div", {"class": "mod-indent"}):
            indent.extract()

        for folder in self.soup.find_all("div", class_="singlebutton"):
            folder_name = folder.parent.parent.div.find(
                "span", class_="fp-filename"
            ).text

            section = self.find_section_by_child(folder)

            folder_url = folder.form["action"]
            folder_icon_url = folder.find_parent(
                "div", class_="contentwithoutlink"
            ).find("img", class_="icon")["src"]
            id_ = folder.form.find("input", {"name": "id"})["value"]

            self.logger.debug(
                "Created Folder (subject search): %r, %s", folder_name, folder_url
            )
            self.add_link(
                Folder(folder_name, section, folder_url, folder_icon_url, self, id_)
            )

        for resource in self.soup.find_all("div", class_="activityinstance"):
            if not resource.a:
                continue

            section = self.find_section_by_child(resource)

            name = resource.a.span.text
            url = resource.a["href"]
            icon_url = resource.a.img["src"]

            link = self._build_activity_link(name, section, url, icon_url)
            if link is not None:
                self.add_link(link)

        self.logger.debug("Downloading files for subject %r", self.name)
Esempio n. 3
0
 def test_get(self):
     """Connection.get must call get on the downloader once with the url."""
     Connection().get(self.url)
     downloader_get = self.downloader_m.return_value.get
     downloader_get.assert_called_once_with(self.url)
Esempio n. 4
0
File: link.py Progetto: sralloza/vcm
class BaseLink(_Notify):
    """Base class for Links."""

    def __init__(self, name, section, url, icon_url, subject, parent=None):
        """Stores the link data and resets the request/download state.

        Args:
            name (str): name of the url.
            section: section the link belongs to; when truthy, its ``name``
                is used as a folder in the filepath.
            url (str): URL of the url.
            icon_url (str or None): URL of the icon.
            subject (vcm.subject.Subject): subject of the url.
            parent (BaseLink): object that created self.
        """

        self.name = name.strip()
        self.section = section
        self.url = url
        self.icon_url = icon_url
        self.subject = subject
        self.connection = Connection()
        self.parent = parent

        self.response: Response = None
        self.soup: BeautifulSoup = None
        self.filepath: Path = None
        self.redirect_url = None
        self.response_name = None
        self.subfolders = []

        self.logger = logging.getLogger(__name__)
        self.logger.debug(
            "Created %s(name=%r, url=%r, subject=%r)",
            self.__class__.__name__,
            self.name,
            self.url,
            self.subject.name,
        )

    @property
    def content_disposition(self):
        """Content-Disposition header of the response, passed through unidecode.

        Raises:
            RuntimeError: if the request has not been made yet.
        """
        if self.response is None:
            raise RuntimeError("Response not made yet")

        return unidecode.unidecode(self.response.headers["Content-Disposition"])

    def append_subfolder(self, dirname):
        """Appends ``dirname``, sanitized with secure_filename, to the subfolders."""
        dirname = secure_filename(dirname)
        return self.subfolders.append(dirname)

    def insert_subfolder(self, index, dirname):
        """Inserts ``dirname``, sanitized with secure_filename, at ``index``."""
        dirname = secure_filename(dirname)
        return self.subfolders.insert(index, dirname)

    def create_subfolder(self):
        """Creates the folder that will contain the file of the link.

        The subject folder is created first, and the filepath is computed
        if it was not set yet.
        """
        self.create_subject_folder()

        if not self.filepath:
            self.autoset_filepath()

        folder: Path = self.filepath.parent

        if not folder.exists():
            os.makedirs(folder.as_posix(), exist_ok=True)
            self.logger.debug("Created subfolder %r", folder.as_posix())
        else:
            self.logger.debug("Subfolder already exists %r", folder.as_posix())

    @staticmethod
    def _process_filename(filepath: str):
        """Replaces the characters ``>`` and ``<`` with a textual equivalent.

        Args:
            filepath (str): filepath to process.

        Returns:
            str: filepath processed.

        """

        filepath = filepath.replace(">", " mayor que ")
        filepath = filepath.replace("<", " menor que ")

        return filepath

    @staticmethod
    def _filename_to_ext(filename):
        """Returns the extension given a filename, without the leading dot."""
        return Path(filename).suffix[1:]

    def _get_ext_from_response(self):
        """Returns the extension of the file behind the response.

        Sources are tried in order: the filename already stored in
        ``self.response_name``, the filename of the Content-Disposition HTTP
        header, the last component of the url and, finally, the last part of
        the content type.

        Returns:
            str: the extension.

        """

        if self.response_name is not None:
            return self._filename_to_ext(self.response_name)

        try:
            disposition = self.content_disposition
        except KeyError:
            # The response has no Content-Disposition header.
            disposition = None

        if disposition is not None:
            match = Patterns.FILENAME.search(disposition)
            # A header that does not match the pattern falls back to the url
            # instead of failing on ``None.group``.
            if match is not None:
                self.response_name = match.group(1)
                extension = self._filename_to_ext(self.response_name)
                if extension:
                    return extension

        self.response_name = Path(self.url).name
        extension = self._filename_to_ext(self.response_name)
        if extension:
            return extension
        return self.content_type.split("/")[-1]

    def create_subject_folder(self):
        """Creates the subject's principal folder."""
        return self.subject.create_folder()

    def make_request(self):
        """Makes the request for the Link.

        The redirect url is requested when it is set, the url otherwise.
        A response with code 408 triggers a new request.

        Raises:
            MoodleError: if the response code is between 500 and 599.
            ResponseError: if the response is not ok for any other reason.
        """

        self.logger.debug("Making request")

        self.response = self.connection.get(self.redirect_url or self.url)

        self.logger.debug(
            "Response obtained [%d | %s]",
            self.response.status_code,
            self.content_type,
        )

        if 500 <= self.response.status_code <= 599:
            raise MoodleError(
                f"Moodle server replied with {self.response.status_code}"
            )

        if self.response.status_code == 408:
            # NOTE(review): the retry is recursive and has no limit.
            self.logger.warning("Received response with code 408, retrying")
            return self.make_request()

        if not self.response.ok:
            raise ResponseError(f"Got HTTP {self.response.status_code}")

    def close_connection(self):
        """Closes the response. Deprecated: emits a DeprecationWarning."""
        warnings.warn(
            "Since streams are not used, this method should not be called",
            DeprecationWarning,
        )
        self.logger.debug("Closing connection")
        self.response.close()

    def process_request_bs4(self):
        """Parses the response with BeautifulSoup with the html parser."""

        self.logger.debug("Parsing response (bs4)")
        self.soup = BeautifulSoup(self.response.text, "html.parser")
        self.logger.debug("Response parsed (bs4)")

    def autoset_filepath(self):
        """Determines the filepath of the Link.

        The path is built from the subject folder, the subfolders, the
        section name (if any) and the sanitized filename, and then resolved
        through ``Alias.id_to_alias``. Nothing is done if the filepath is
        already set.

        Raises:
            RuntimeError: if the request has not been made yet.
        """

        if self.filepath is not None:
            self.logger.debug("Filepath is setted, skipping (%s)", self.filepath)
            return

        if self.response is None:
            raise RuntimeError("Request not launched")

        filename = secure_filename(
            self._process_filename(self.name) + "." + self._get_ext_from_response()
        )
        self.logger.debug("Initial filename: %s", filename)

        temp_filepath = self.subject.folder

        if self.subfolders:
            # joinpath returns a new path; the result has to be kept or the
            # subfolders are silently ignored.
            temp_filepath = temp_filepath.joinpath(*self.subfolders)

        if self.section:
            temp_filepath /= self.section.name

        temp_filepath /= filename

        # Only some links define an id.
        folder_id = getattr(self, "id", None)

        self.filepath = Path(
            Alias.id_to_alias(
                sha1(self.url.encode()).hexdigest(),
                temp_filepath.as_posix(),
                folder_id,
            )
        )

        self.logger.debug("Set filepath: %r", self.filepath.as_posix())

    def download(self):
        """Wrapper for self.do_download() that always drops response and soup."""
        try:
            self.do_download()
        finally:
            self.response = None
            self.soup = None

    def do_download(self):
        """Abstract method to download the Link. Must be overridden by subclasses."""
        self.logger.debug("Called do_download() but it was not implemented")
        raise NotImplementedError

    def get_header_length(self):
        """Returns the Content-Length header, or the content size if it is missing."""
        try:
            return int(self.response.headers["Content-Length"])
        except KeyError:
            return len(self.response.content)

    @property
    def content_type(self):
        """Content-Type header of the response, or None if it is missing."""
        if "Content-Type" in self.response.headers:
            return self.response.headers["Content-Type"]

        return None

    def save_response_content(self):
        """Saves the response content to the disk.

        Nothing is written when the current module is ``Modules.notify`` or
        when the file is in ``REAL_FILE_CACHE`` with the same length as the
        response. A permission error while writing is logged, not raised.
        """
        if self.filepath is None:
            self.autoset_filepath()

        if Modules.current() == Modules.notify:
            return

        self.create_subfolder()

        self.logger.debug(
            "filepath in REAL_FILE_CACHE: %s", self.filepath in REAL_FILE_CACHE
        )

        if self.filepath in REAL_FILE_CACHE:
            if REAL_FILE_CACHE[self.filepath] == self.get_header_length():
                self.logger.debug(
                    "File found in cache: Same content (%d)",
                    len(self.response.content),
                )
                return

            self.logger.debug(
                "File found in cache: Different content (%d --> %d)",
                REAL_FILE_CACHE[self.filepath],
                len(self.response.content),
            )
            Results.print_updated(self.filepath)
        else:
            self.logger.debug(
                "File added to cache: %s [%d]",
                self.filepath,
                len(self.response.content),
            )
            REAL_FILE_CACHE[self.filepath] = len(self.response.content)
            Results.print_new(self.filepath)

        try:
            with self.filepath.open("wb") as file_handler:
                file_handler.write(self.response.content)
            self.logger.debug("File downloaded and saved: %s", self.filepath)
        except PermissionError:
            self.logger.warning(
                "File couldn't be downloaded due to permission error: %s",
                self.filepath.name,
            )
            self.logger.warning(
                "Permission error %s -- %s", self.subject.name, self.filepath.name
            )

    @staticmethod
    def ensure_origin(url: str) -> bool:
        """Returns True if the origin is the virtual campus.

        NOTE(review): this is a substring check on the whole url, not a check
        of the host.
        """
        return "uva.es" in url