Ejemplo n.º 1
0
class SummitLearning(WebUIDataSource, LoggingMixin):
    """ Class for interacting with the web ui of Summit Learning
    """
    def __init__(self, username, password, wait_time, hostname='summitlearning.org', temp_folder_path=None,
                 headless=False, login_provider='google'):
        """ Stores the connection settings and builds the base URL.

        Args:
            username, password, wait_time, hostname, temp_folder_path,
                headless: forwarded unchanged to the parent initializer.
            login_provider (string): The sign-in flow to use. Only 'google'
                triggers credential entry in _login.
        """
        super().__init__(username, password, wait_time, hostname, temp_folder_path, headless)
        self.login_provider = login_provider
        self.uri_scheme = 'https://'
        self.base_url = self.uri_scheme + 'www.' + self.hostname

    def _login(self):
        """ Logs into Summit Learning with the already-created self.driver.

        When login_provider is 'google', the username and password are typed
        into the Google sign-in form. For any other provider no credentials
        are entered; the method only waits for the 'app-teacher' element.
        """
        if self.login_provider == 'google':
            login_url = self.base_url + '/auth/google_oauth2'
            self.driver.get(login_url)

            elem = WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located((By.ID, 'identifierId')))
            elem.clear()
            elem.send_keys(self.username)
            elem.send_keys(Keys.RETURN)

            # the password field is only usable once it is visible
            elem = WebDriverWait(self.driver, self.wait_time).until(
                EC.visibility_of_element_located((By.NAME, 'password')))
            elem.send_keys(self.password)
            elem.send_keys(Keys.RETURN)

        # treat the presence of the 'app-teacher' element as a finished login
        WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'app-teacher')))

    def download_url_report(self, report_url, write_to_disk=None, **kwargs):
        """ Downloads a Summit Learning report at a URL that triggers a CSV download

        Args:
            report_url (string): Information pertaining to the path and query
                string for the report whose access is desired. Any filtering
                that can be done with a stateful URL should be included.
            write_to_disk (string): The path for a directory to store the
                downloaded file. If nothing is provided, the file will be
                stored in a temporary directory and deleted at the end of
                this function.
            **kwargs: additional arguments to pass to Pandas read_csv

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            NoDataError: If the downloaded report has no rows.
        """

        report_download_url = interpret_report_url(self.base_url, report_url)

        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = mkdtemp()

        # The nested try/finally blocks guarantee that the browser is closed
        # and the temporary folder is removed even when login, download or
        # parsing fails.
        try:
            self.driver = DriverBuilder().get_driver(csv_download_folder_path, self.headless)
            try:
                self._login()

                self.log.debug('Getting report page at: {}'.format(report_download_url))
                self.driver.get(report_download_url)

                self.log.debug('Starting download of: {}'.format(report_download_url))

                wait_for_any_file_in_folder(csv_download_folder_path, "csv")
                self.log.debug('Download Finished.')

                df_report = pd.read_csv(get_most_recent_file_in_dir(csv_download_folder_path),
                                        **kwargs)
            finally:
                self.driver.close()
        finally:
            if not write_to_disk:
                shutil.rmtree(csv_download_folder_path)

        # if the dataframe is empty (the report had no data), raise an error
        if df_report.shape[0] == 0:
            raise NoDataError('No data in report for user {} at url: {}'.format(
                self.username, report_download_url))

        return df_report
Ejemplo n.º 2
0
class Lexia(WebUIDataSource, LoggingMixin):
    """ Class for interacting with the web ui of Lexia
    """
    def __init__(self, username, password, wait_time, hostname,
                 temp_folder_path=None, headless=False,
                 lexia_school_year_start_date=None,
                 district_export_email_address=None,
                 district_export_email_password=None,
                 district_export_email_imap_uri=None,
                 district_export_email_folder='Lexia District Exports',
                 district_export_email_wait_time=600,
                 district_export_email_retry_frequency=30,
                 district_id=None):
        """ Stores the Lexia login settings and the district export settings.

        Args:
            username, password, wait_time, hostname, temp_folder_path,
                headless: forwarded unchanged to the parent initializer.
            lexia_school_year_start_date: Used as the start date of a
                district export when no explicit start date is given.
            district_export_email_address: Address placed in the export
                request and used to log into the IMAP server.
            district_export_email_password: Password for the IMAP login.
            district_export_email_imap_uri: Host passed to imaplib.IMAP4_SSL.
            district_export_email_folder: Mailbox folder that is searched for
                export emails.
            district_export_email_wait_time: Together with the retry
                frequency, determines how many polling attempts are made.
            district_export_email_retry_frequency: Seconds to sleep between
                polling attempts.
            district_id: Sent as 'districtID' in the export request.
        """
        super().__init__(username, password, wait_time, hostname,
                         temp_folder_path, headless)

        # URL building blocks
        self.uri_scheme = 'https://'
        self.base_url = self.uri_scheme + 'www.' + self.hostname

        # district export request settings
        self.district_id = district_id
        self.lexia_school_year_start_date = lexia_school_year_start_date

        # mailbox that receives the export notification
        self.district_export_email_address = district_export_email_address
        self.district_export_email_password = district_export_email_password
        self.district_export_email_imap_uri = district_export_email_imap_uri
        self.district_export_email_folder = district_export_email_folder

        # polling cadence while waiting for the notification
        self.district_export_email_wait_time = district_export_email_wait_time
        self.district_export_email_retry_frequency = district_export_email_retry_frequency

    def _login(self):
        """ Signs into myLexia using the already-created self.driver.

        The username and password are submitted on two consecutive forms.
        Afterwards the base URL is loaded and the page title is inspected.

        Raises:
            InvalidLoginCredentials: If the title of the page loaded after
                signing in contains 'Welcome'. The driver is closed before
                the exception is raised.
        """
        login_url = self.uri_scheme + 'auth.mylexia.com/mylexiaLogin'
        self.log.debug('Logging into Lexia at: {}'.format(login_url))
        self.driver.get(login_url)

        # step 1: username form
        username_locator = (By.ID, 'username')
        username_field = WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located(username_locator))
        username_field.clear()
        username_field.send_keys(self.username)
        username_field.send_keys(Keys.RETURN)

        # step 2: password form
        password_locator = (By.ID, 'login-password')
        password_field = WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located(password_locator))
        password_field.send_keys(self.password)
        password_field.send_keys(Keys.RETURN)

        # ensure that login is successful
        self.driver.get(self.base_url)
        if 'Welcome' not in self.driver.title:
            return

        self.driver.close()
        raise InvalidLoginCredentials

    def download_url_report(self, report_url, write_to_disk=None, **kwargs):
        """ Downloads a Lexia report at a URL for a page with an 'export' button.

        URLs that contain '/mylexiaweb/app/index.html#/groups/' are handed to
        download_manage_tab_report instead.

        Args:
            report_url (string): Information pertaining to the path and query
                string for the report whose access is desired. Any filtering
                that can be done with a stateful URL should be included.
            write_to_disk (string): The path for a directory to store the
                downloaded file. If nothing is provided, the file will be
                stored in a temporary directory and deleted at the end of
                this function.
            **kwargs: additional arguments to pass to Pandas read_excel or
                read_csv (depending on the report_url)

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            ValueError: If the downloaded report has no rows.
        """

        report_download_url = interpret_report_url(self.base_url, report_url)

        # if user is trying to download a manage tab report (for convenience)
        if '/mylexiaweb/app/index.html#/groups/' in report_download_url:
            return self.download_manage_tab_report(report_url, write_to_disk,
                                                   **kwargs)

        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = mkdtemp()

        try:
            self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                     self.headless)
            # _login() closes the driver itself before raising on a failed
            # login, so it stays outside the block that closes the driver.
            self._login()

            try:
                self.log.debug(
                    'Getting report page at: {}'.format(report_download_url))
                self.driver.get(report_download_url)

                # find and click the download button
                elem = WebDriverWait(self.driver, self.wait_time).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//button[contains(text(), 'Export')]")))

                self.log.debug(
                    'Starting download of: {}'.format(report_download_url))
                elem.click()

                wait_for_any_file_in_folder(csv_download_folder_path, "xlsx")
                self.log.debug('Download Finished.')

                df_report = pd.read_excel(
                    get_most_recent_file_in_dir(csv_download_folder_path),
                    **kwargs)
            finally:
                # always release the browser, even when the download fails
                self.driver.close()
        finally:
            # the temporary folder is only ours to delete when we created it
            if not write_to_disk:
                shutil.rmtree(csv_download_folder_path)

        # if the dataframe is empty (the report had no data), raise an error
        if df_report.shape[0] == 0:
            raise ValueError('No data in report for user {} at url: {}'.format(
                self.username, report_download_url))

        return df_report

    def download_manage_tab_report(self,
                                   report_url,
                                   write_to_disk=None,
                                   **kwargs):
        """ Downloads a Lexia report from the 'Manage' tab.

        Args:
            report_url (string): Information pertaining to the path and query
                string for the report whose access is desired. Any filtering
                that can be done with a stateful URL should be included.
            write_to_disk (string): The path for a directory to store the
                downloaded file. If nothing is provided, the file will be
                stored in a temporary directory and deleted at the end of
                this function.
            **kwargs: additional arguments to pass to Pandas read_csv

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            ValueError: If the downloaded report has no rows.
        """
        report_download_url = interpret_report_url(self.base_url, report_url)

        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = mkdtemp()

        # select all users and find the download button
        def check_for_export_button_enabled(driver, elem_select_all_locator,
                                            elem_export_locator):
            elem_select_all = driver.find_element(*elem_select_all_locator)
            if not elem_select_all.is_enabled():
                return False
            elem_select_all.click()
            if not elem_select_all.is_selected():
                return False
            elem_export = driver.find_element(*elem_export_locator)
            if elem_export.is_enabled() and elem_export.is_displayed():
                return elem_export
            else:
                return False

        try:
            self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                     self.headless)
            # _login() closes the driver itself before raising on a failed
            # login, so it stays outside the block that closes the driver.
            self._login()

            try:
                self.log.debug(
                    'Getting report page at: {}'.format(report_download_url))
                self.driver.get(report_download_url)

                # have to use a lambda because until expects a callable
                elem_export = WebDriverWait(
                    self.driver, self.wait_time
                ).until(lambda x: check_for_export_button_enabled(
                    self.driver, (By.NAME, "lexia-select-all"),
                    (By.XPATH, "//button[contains(text(), 'Export')]")))
                self.log.debug(
                    'Starting download of: {}'.format(report_download_url))
                elem_export.click()

                # the file has an 'xls' extension but is read as
                # tab-separated text
                wait_for_any_file_in_folder(csv_download_folder_path, "xls")
                self.log.debug('Download Finished.')

                df_report = pd.read_csv(
                    get_most_recent_file_in_dir(csv_download_folder_path),
                    sep='\t',
                    **kwargs)
            finally:
                # always release the browser, even when the download fails
                self.driver.close()
        finally:
            # the temporary folder is only ours to delete when we created it
            if not write_to_disk:
                shutil.rmtree(csv_download_folder_path)

        # if the dataframe is empty (the report had no data), raise an error
        if df_report.shape[0] == 0:
            raise ValueError('No data in report for user {} at url: {}'.format(
                self.username, report_download_url))

        return df_report

    def download_district_export_core5_monthly(
        self,
        write_to_disk=None,
        pandas_read_csv_kwargs=None,
        period_end_date=None):
        """ Downloads the district export with report type 'export'.

        Args:
            write_to_disk (string): Path to which the downloaded report is
                written as CSV. Nothing is written when omitted.
            pandas_read_csv_kwargs (dict): kwargs to pass to Pandas read_csv.
                Defaults to no extra arguments.
            period_end_date (date): End date for the export request. Defaults
                to today's date, evaluated at call time.

        Returns: A Pandas DataFrame of the report contents.
        """
        # Resolve defaults at call time: a default written in the signature
        # is evaluated only once, when the class body is executed.
        if period_end_date is None:
            period_end_date = dt.datetime.now().date()
        if pandas_read_csv_kwargs is None:
            pandas_read_csv_kwargs = {}
        return self._download_district_export(
            report_type='export',
            period_end_date=period_end_date,
            write_to_disk=write_to_disk,
            pandas_read_csv_kwargs=pandas_read_csv_kwargs)

    def download_district_export_core5_year_to_date(
        self,
        write_to_disk=None,
        pandas_read_csv_kwargs=None,
        period_end_date=None):
        """ Downloads the district export with report type 'expytd'.

        Args:
            write_to_disk (string): Path to which the downloaded report is
                written as CSV. Nothing is written when omitted.
            pandas_read_csv_kwargs (dict): kwargs to pass to Pandas read_csv.
                Defaults to no extra arguments.
            period_end_date (date): End date for the export request. Defaults
                to today's date, evaluated at call time.

        Returns: A Pandas DataFrame of the report contents.
        """
        # Resolve defaults at call time: a default written in the signature
        # is evaluated only once, when the class body is executed.
        if period_end_date is None:
            period_end_date = dt.datetime.now().date()
        if pandas_read_csv_kwargs is None:
            pandas_read_csv_kwargs = {}
        return self._download_district_export(
            report_type='expytd',
            period_end_date=period_end_date,
            write_to_disk=write_to_disk,
            pandas_read_csv_kwargs=pandas_read_csv_kwargs)

    def download_district_export_powerup_year_to_date(
        self,
        write_to_disk=None,
        pandas_read_csv_kwargs=None,
        period_end_date=None):
        """ Downloads the district export with report type 'pupytd'.

        Args:
            write_to_disk (string): Path to which the downloaded report is
                written as CSV. Nothing is written when omitted.
            pandas_read_csv_kwargs (dict): kwargs to pass to Pandas read_csv.
                Defaults to no extra arguments.
            period_end_date (date): End date for the export request. Defaults
                to today's date, evaluated at call time.

        Returns: A Pandas DataFrame of the report contents.
        """
        # Resolve defaults at call time: a default written in the signature
        # is evaluated only once, when the class body is executed.
        if period_end_date is None:
            period_end_date = dt.datetime.now().date()
        if pandas_read_csv_kwargs is None:
            pandas_read_csv_kwargs = {}
        return self._download_district_export(
            report_type='pupytd',
            period_end_date=period_end_date,
            write_to_disk=write_to_disk,
            pandas_read_csv_kwargs=pandas_read_csv_kwargs)

    def _download_district_export(self,
                                  report_type,
                                  period_end_date,
                                  period_start_date=None,
                                  write_to_disk=None,
                                  pandas_read_csv_kwargs=None):
        """ Requests a district export, polls email for its id and downloads it.

        Args:
            report_type (string): Export type code sent in the request.
            period_end_date (date): End date sent in the request.
            period_start_date (date): Start date sent in the request. Falls
                back to self.lexia_school_year_start_date when not provided.
            write_to_disk (string): Path to which the downloaded report is
                written as CSV. Nothing is written when omitted.
            pandas_read_csv_kwargs (dict): kwargs to pass to Pandas read_csv.
                Defaults to no extra arguments.

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            ReportNotFound: If no report could be downloaded within the
                configured number of attempts.
        """
        if pandas_read_csv_kwargs is None:
            pandas_read_csv_kwargs = {}
        if not period_start_date:
            period_start_date = self.lexia_school_year_start_date
        # NOTE(review): the boolean result of the request is not inspected,
        # so a failed request still falls through to the email polling below.
        self.__request_district_export(report_type, period_start_date,
                                       period_end_date)

        df_report = None
        number_retries = int(self.district_export_email_wait_time /
                             self.district_export_email_retry_frequency)
        for retry_count in range(0, number_retries):
            # single place where we wait between attempts, so every failure
            # mode waits exactly one retry interval
            if retry_count > 0:
                time.sleep(self.district_export_email_retry_frequency)
            self.log.info(
                str(self.district_id) + ': get export_id from email, try: ' +
                str(retry_count))
            try:
                export_id = self.__get_exportid_from_email()
            except ValueError as err:
                self.log.debug(err)
                self.log.warning(
                    '{}: No export_id found in email, retrying in {} seconds.'.
                    format(self.district_id,
                           self.district_export_email_retry_frequency))
                continue

            try:
                df_report = self.__download_export_for_exportid(
                    export_id, write_to_disk, pandas_read_csv_kwargs)
                break
            except NoDataError as e:
                self.log.warning('{}: {} Retrying in {} seconds.'.format(
                    self.district_id, e,
                    self.district_export_email_retry_frequency))

        if df_report is None:
            # The browser opened by the export request is only closed by a
            # successful download, so release it here before giving up.
            self.driver.close()
            raise ReportNotFound(
                'No email was received with report id. Make sure the emails are not going to spam.'
            )
        return df_report

    def __request_district_export(self,
                                  report_type,
                                  period_start_date=None,
                                  period_end_date=None,
                                  write_to_disk=None):
        """ Logs into Lexia and submits the request to generate a district export.

        The browser session's cookies are copied into a requests session,
        which then sends the export request as an HTTP PUT. The driver is
        left open when this method returns.

        Args:
            report_type (string): The text from one of 'Report type' options
                listed in the myLexia 'District Exports' modal.
            period_start_date (date): The start date for the report request
                (unsure if this actually affects the data returned if it is
                different from the school year start date set for your Lexia
                instance).
            period_end_date (date): The end date for the report request
                (unsure if this actually affects the data returned if it is
                different from the day on which the request is made).
            write_to_disk (string): Download folder handed to the browser.
                Falls back to self.temp_folder_path when not provided.

        Returns: Boolean. Whether or not the export request was successful.
        """
        download_folder = write_to_disk if write_to_disk else self.temp_folder_path
        self.driver = DriverBuilder().get_driver(download_folder,
                                                 self.headless)
        self._login()

        # use requests to post the download request
        with requests.Session() as session:
            # reuse the authenticated browser session
            for browser_cookie in self.driver.get_cookies():
                session.cookies.set(browser_cookie['name'],
                                    browser_cookie['value'])

            payload = {
                "districtID": self.district_id,
                "type": report_type,
                "email": self.district_export_email_address,
                "startDate": period_start_date.strftime("%Y-%m-%d"),
                "endDate": period_end_date.strftime("%Y-%m-%d")
            }
            self.log.info('{}: Export request payload: {}'.format(
                self.district_id, payload))

            request_url = self.base_url + '/exportData/progress'
            response = session.put(request_url, data=payload)

            if not response.ok:
                self.log.info(
                    '{}: Export request for {} FAILED  for user: {}'.format(
                        self.district_id, report_type, self.username))
                self.log.info(response.content)
                return False

            self.log.info(
                '{}: Export request for {} succeeded for user: {}'.format(
                    self.district_id, report_type, self.username))
            response_json = json.loads(response.content.decode())
            self.log.info(response_json)
            return True

    def __get_exportid_from_email(self):
        """Log into an IMAP email server and get messages in a specific folder.
        Checks for a new Lexia export_id in those messages.

        The mailbox is closed and the connection is logged out on every exit
        path.

        Returns:
            int: the export_id

        Raises:
            ValueError: If no export_id is found in the folder.
            InvalidIMAPParameters: If the configured folder cannot be
                selected.
            SystemExit: If the IMAP login is rejected.
        """
        self.log.info('Checking email for latest report ID for district_id: ' +
                      str(self.district_id))
        imap_conn = imaplib.IMAP4_SSL(self.district_export_email_imap_uri)

        mailbox_selected = False
        try:
            try:
                imap_conn.login(self.district_export_email_address,
                                self.district_export_email_password)
            except imaplib.IMAP4.error:
                self.log.error('Email login failed for: ' +
                               self.district_export_email_address)
                # NOTE(review): exiting the interpreter from library code is
                # kept only to preserve the behavior callers currently see.
                sys.exit(1)

            rv, data = imap_conn.select('"{}"'.format(
                self.district_export_email_folder))
            if rv != 'OK':
                raise InvalidIMAPParameters(
                    "ERROR: Unable to open mailbox. Check your parameters and email folder. Message: ",
                    rv)
            mailbox_selected = True

            self.log.info('Processing mailbox for ' +
                          self.district_export_email_address + ' in folder "' +
                          self.district_export_email_folder + '"')
            export_id = self.__extract_lexia_export_id_from_email(imap_conn)
            if export_id == -1:
                raise ValueError('No new export_id found on ' +
                                 self.district_export_email_address)
            return export_id
        finally:
            # Best-effort cleanup: a failure while closing must not hide the
            # result or the exception that is already on its way out.
            try:
                if mailbox_selected:
                    imap_conn.close()
                imap_conn.logout()
            except (imaplib.IMAP4.error, OSError) as err:
                self.log.debug('IMAP cleanup failed: {}'.format(err))

    def __extract_lexia_export_id_from_email(self, imap_conn):
        """ Extract the export_id that is sent by Lexia that is needed to
        download the prepared report export.

        Email messages in Gmail aren't sorted and can't be sorted using
        regular IMAP functions (Gmail does not support them). Therefore
        we will search within the folder for messages in the last day and
        keep the highest export_id found in any of them.

        Args:
            imap_conn (imaplib.IMAP4_SSL): A current connection to an IMAP
                email account, with the folder to search already selected.

        Returns:
            int: The highest export_id found, or -1 when the search or a
                fetch fails or when no message contains an export_id.
        """
        # get all messages received in the last day
        since_date = (dt.datetime.now() -
                      dt.timedelta(1)).strftime("%d-%b-%Y")
        rv, data = imap_conn.search(None, '(SINCE ' + since_date + ')')
        if rv != 'OK':
            self.log.warning("No email messages found!")
            # TODO change this to raise an error
            return -1

        highest_export_id = -1
        for num in data[0].split():
            rv, msg_data = imap_conn.fetch(num, '(RFC822)')
            if rv != 'OK':
                # TODO change this to raise an error
                self.log.error("ERROR getting email message %s", num)
                return -1

            msg = email.message_from_bytes(msg_data[0][1])
            self.log.debug('Processing Message %s, Raw Date: %s' %
                           (num, msg['Date']))
            for part in msg.walk():
                # each part is a either non-multipart, or another multipart message
                # that contains further parts... Message is organized like a tree
                if part.get_content_type() != 'text/plain':
                    continue
                # get the raw text and extract the report id; at least one
                # digit is required so int() never sees an empty string
                match = re.search(r'(?<=id=)(\d+)(?=\s)', part.get_payload())
                if not match:
                    # a part without an id must not discard ids found
                    # elsewhere
                    continue
                export_id = int(match.group(0))
                self.log.debug('export_id found: ' + str(export_id))
                if export_id > highest_export_id:
                    highest_export_id = export_id

        return highest_export_id

    def __download_export_for_exportid(self,
                                       export_id,
                                       write_to_disk=None,
                                       pandas_read_csv_kwargs=None):
        """Downloads the report associated with a specific export_id, reusing
        the cookies of the already logged-in self.driver.

        The driver is closed after a successful download and after a failed
        HTTP response. It is left open when the report is empty.

        Args:
            export_id (int): The Lexia export id to download.
            write_to_disk (str): A path where the CSV that has been downloaded should be written to disk.
            pandas_read_csv_kwargs (dict): kwargs to pass to the Pandas read_csv function as necessary
        Returns:
            A Pandas dataframe with the report contents
        Raises:
            NoDataError: If the downloaded report has no rows.
            ValueError: If the download request does not succeed.
        """
        if pandas_read_csv_kwargs is None:
            pandas_read_csv_kwargs = {}

        self.log.info(
            str(self.district_id) + ': downloading report with export_id=' +
            str(export_id))
        with requests.Session() as s:
            for cookie in self.driver.get_cookies():
                s.cookies.set(cookie['name'], cookie['value'])

            export_url = self.base_url + '/reports/get_export.php' + '?id=' + str(
                export_id)
            download_response = s.get(export_url)

            # Summarize at INFO and keep the raw body at DEBUG: the body is
            # the entire report.
            self.log.info(
                'Report download request response for export_id {}: HTTP {} ({} bytes)'.format(
                    export_id, download_response.status_code,
                    len(download_response.content)))
            self.log.debug(download_response.content)

            if not download_response.ok:
                # this error is not retried by the caller, so release the
                # browser before propagating it
                self.driver.close()
                raise ValueError('Report download request failed')

            df_report = pd.read_csv(
                io.StringIO(
                    download_response.content.decode(LEXIA_CSV_ENCODING)),
                **pandas_read_csv_kwargs)

            # if the dataframe is empty (the report had no data), raise an error
            # (the driver stays open so that the caller can retry)
            if df_report.shape[0] == 0:
                raise NoDataError(
                    'No data in report for user {} at url: {}'.format(
                        self.username, export_url))

        self.driver.close()

        if write_to_disk:
            df_report.to_csv(write_to_disk)

        return df_report
Ejemplo n.º 3
0
class SchoolMint(WebUIDataSource, LoggingMixin):
    """ Class for interacting with SchoolMint
    """
    def __init__(self, username, password, wait_time, hostname,
                 temp_folder_path, headless=False):
        """ Stores the connection settings and builds the base URL.

        All arguments are forwarded unchanged to the parent initializer.
        """
        super().__init__(username, password, wait_time, hostname,
                         temp_folder_path, headless)
        scheme = 'https://'
        self.uri_scheme = scheme
        self.base_url = scheme + self.hostname

    def _login(self):
        """ Logs into the provided SchoolMint instance.

        After submitting the credentials, the method waits for the landing
        page to load and removes the Walk-Me announcement overlays if present.

        Raises:
            InvalidLoginCredentials: If the 'student-lookup' element does not
                appear within self.wait_time. The driver is closed first.
        """
        # script that detaches the element passed as its first argument
        remove_element_js = """var elem=arguments[0];elem.parentNode.removeChild(elem);"""

        # 2019-01-16 SchoolMint seems to be having some issues with loading the login screen recently,
        # so we'll add a retry here
        count = 0
        while count < NUMBER_OF_RETRIES:
            self.log.debug('Logging into SchoolMint at, try {}: {}'.format(
                count, self.base_url))
            self.driver.get(self.base_url + "/signin")
            # wait until login form available
            try:
                elem = WebDriverWait(self.driver, self.wait_time).until(
                    EC.presence_of_element_located((By.ID, 'login')))
                elem.clear()
                elem.send_keys(self.username)
                elem = self.driver.find_element(By.ID, "password")
                elem.send_keys(self.password)
                elem.send_keys(Keys.RETURN)
                break
            except ElementNotVisibleException:
                count += 1

        # check that login succeeded by looking for the 'Student search' box
        try:
            WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located((By.ID, 'student-lookup')))
        except TimeoutException:
            self.driver.close()
            raise InvalidLoginCredentials

        # wait for the page to fully load - the walk-me player is the last thing, but since it's a third
        # party add-on we'll wait for the filters on the application index first
        WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'report-filters')))
        # now we'll wait for the walk "Walk Me Through" overlay in the bottom right
        try:
            WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located((By.ID, 'walkme-player')))
        except TimeoutException:
            pass

        # deal with the walk_me announcement pop-ups overlays
        try:
            elem = WebDriverWait(self.driver,
                                 WALKME_AND_SUPPORT_TIMEOUT).until(
                                     EC.presence_of_element_located(
                                         (By.CLASS_NAME, 'wm-shoutout')))
            self.driver.execute_script(remove_element_js, elem)
            elem = self.driver.find_element(By.ID, 'walkme-overlay-all')
            self.driver.execute_script(remove_element_js, elem)
        except TimeoutException:
            self.log.debug('No wm-shoutout found')
        except NoSuchElementException:
            # overlay removal is best effort; a missing backdrop must not
            # fail a login that already succeeded
            self.log.debug('No walkme-overlay-all found')

    def __remove_walk_me_and_support(self):
        """Removes two third party overlays that can block buttons that selenium needs to click.

        Every removal is best effort: an overlay that does not show up within
        its timeout is logged and skipped.
        """
        # script that detaches the element passed as its first argument
        remove_element_js = """var elem=arguments[0];elem.parentNode.removeChild(elem);"""

        self.log.info('Removing "Walk-Me" and "Support" overlays.')
        walkme = True
        # wait for walk-me to load
        try:
            WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located((By.ID, 'walkme-player')))
        except TimeoutException:
            self.log.info('Probably no Walk-Me found')
            walkme = False

        if walkme:
            self.log.debug('Removing "Walk Me" overlay.')
            try:
                for overlay_id in ['walkme-player', 'walkme-overlay-all']:
                    elem = WebDriverWait(self.driver,
                                         WALKME_AND_SUPPORT_TIMEOUT).until(
                                             EC.presence_of_element_located(
                                                 (By.ID, overlay_id)))
                    self.driver.execute_script(remove_element_js, elem)
                self.log.debug('Success')
                self.log.debug('Removing "Walk Me" bouncing overlay.')
                try:
                    elem = self.driver.find_element(By.ID, 'walkme-attengrab')
                    self.driver.execute_script(remove_element_js, elem)
                    self.log.debug('Success')
                except NoSuchElementException:
                    self.log.debug('No "Walk Me" bouncing overlay found.')
            except TimeoutException:
                self.log.debug('No "Walk Me" overlay found.')

            # remove "Homeroom" announcement
            self.log.debug('Removing "Homeroom" and other wm-shoutout modals.')
            try:
                elem = WebDriverWait(
                    self.driver, WALKME_AND_SUPPORT_TIMEOUT
                ).until(
                    # it turns out that the id can have numbers at the end (e.g. wm-shoutout-141590), so we need XPATH
                    EC.presence_of_element_located(
                        (By.XPATH, "//*[starts-with(@id, 'wm-shoutout')]")))
                self.driver.execute_script(remove_element_js, elem)
                self.log.debug('Success')
            except TimeoutException:
                self.log.debug(
                    'No "Homeroom" or other wm-shoutout modals found.')

        # remove 'Support' button
        self.log.debug('Trying to remove "Support" overlay.')
        try:
            elem = WebDriverWait(self.driver,
                                 WALKME_AND_SUPPORT_TIMEOUT).until(
                                     EC.presence_of_element_located(
                                         (By.ID, 'launcher')))
            self.driver.execute_script(remove_element_js, elem)
            self.log.debug('Success')
        except TimeoutException:
            self.log.debug('No "Support" overlay found.')

    def _set_year(self, school_year, driver=None):
        """Sets the year for the SchoolMint interface.

        Args:
            school_year (string): The school year that should be selected. Use the format shown in the
                SchoolMint interface. Example: '2016-2017'
            driver: When not provided, a new driver is created with
                configure_selenium_chrome(), logged in, and closed again
                before returning. When provided, the method works on
                self.driver and leaves it open.

        Returns: True if function succeeds

        Raises:
            NoSuchElementException: If the requested school year cannot be
                found in the year selector. A screenshot is saved to
                'cannot_find_year.png' first.
        """
        self.log.debug('Changing school year to: {}'.format(school_year))
        owns_driver = not driver
        if owns_driver:
            self.driver = configure_selenium_chrome()
            # _login() closes the driver itself before raising on a failed
            # login, so it stays outside the block that closes the driver.
            self._login()

        try:
            # open the year selector menu
            elem = self.driver.find_element(
                By.XPATH,
                "//a[contains(@class,'dropdown-toggle enrollment')]")
            elem.click()

            # select the appropriate year
            try:
                year_xpath = "//*[@id='enrollment-selector']//a[contains(text(),'{}')]".format(
                    school_year)
                elem = self.driver.find_element(By.XPATH, year_xpath)
                elem.click()
            except NoSuchElementException as e:
                self.driver.save_screenshot('cannot_find_year.png')
                message = (' Check that the school_year variable is valid. '
                           'Passed value for school_year: {}').format(school_year)

                raise_with_traceback(type(e)(str(e) + message))

            # wait for the page to be ready again
            self.driver.get(self.base_url)
            WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located((By.ID, 'student-lookup')))
        finally:
            # a driver created here is closed on failure as well
            if owns_driver:
                self.driver.close()

        return True

    def check_school_year(self, school_year):
        """Return True when the UI's current-year label contains ``school_year``.

        The label is the ``span`` with a ``current`` class inside the
        enrollment dropdown toggle of the page ``self.driver`` is showing.
        """
        current_year_label = self.driver.find_element_by_xpath(
            "//a[contains(@class,'dropdown-toggle enrollment')]/span[contains(@class,'current')]"
        )
        return school_year in current_year_label.text

    def download_url_report(self,
                            report_url,
                            school_year,
                            temp_folder_name=None,
                            pandas_read_csv_kwargs=None):
        """ Downloads a SchoolMint data-stream-table report.

        Args:
            report_url (string): Information pertaining to the path and query
                string for the report whose access is desired. Any filtering
                that can be done with a stateful URL should be included.
            school_year (string): The SchoolMint school year to download from (e.g. '2018-2019')
            temp_folder_name (string): The name for a sub-directory of
                ``self.temp_folder_path`` in which the files from the browser
                will be temporarily stored. If omitted, a fresh temporary
                directory is created under ``self.temp_folder_path``.
                NOTE: This directory is deleted, with its contents, once the
                downloaded file has been read.
            pandas_read_csv_kwargs (dict): additional arguments to pass to Pandas read_csv

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            ReportNotFound: if the UI is not showing ``school_year`` once the
                report page has loaded.
            TimeoutError: if the report summary text has not settled after
                roughly ``self.wait_time`` seconds.
            ValueError: if the downloaded report has no rows.
        """
        if pandas_read_csv_kwargs is None:
            pandas_read_csv_kwargs = {}

        if temp_folder_name:
            csv_download_folder_path = self.temp_folder_path + '/' + temp_folder_name
        else:
            csv_download_folder_path = mkdtemp(dir=self.temp_folder_path)

        # set up the driver for execution
        self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                 self.headless)
        try:
            self._login()
            self._set_year(school_year, self.driver)

            # get the report url
            self.driver.get(interpret_report_url(self.base_url, report_url))
            self.__remove_walk_me_and_support()

            # wait until we have rows in the stream data table before
            # starting to look for results
            WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//*[@id='stream-table']/tbody/tr[1]/td[1]")))

            if not self.check_school_year(school_year):
                raise ReportNotFound(
                    "Wrong school year detected prior to downloading the report.")

            self.log.debug('Waiting for report-data-summary to load')
            # wait until the stream table is fully loaded before downloading:
            # the summary text must stay unchanged across two consecutive reads
            prev_data_summary = self.driver.find_element_by_id(
                'report-data-summary').text
            time.sleep(1)
            # we use the following count as a proxy for time elapsed, so we
            # can use the class's wait_time as the number of retries
            count = 0
            while True:
                report_data_summary = self.driver.find_element_by_id(
                    'report-data-summary').text

                # if it matches, wait a little longer and double check that
                # it hasn't changed
                if prev_data_summary == report_data_summary:
                    time.sleep(3)
                    count += 3
                    report_data_summary = self.driver.find_element_by_id(
                        'report-data-summary').text
                    if prev_data_summary == report_data_summary:
                        break
                prev_data_summary = report_data_summary
                time.sleep(1)

                count += 1
                if count >= self.wait_time:
                    raise TimeoutError(
                        'SchoolMint Report Data did not fully load within %d seconds'
                        % self.wait_time)

            # click the button to download the report
            self.log.debug('Starting download...')
            self.driver.find_element_by_class_name("export-table").click()

            # wait until file has downloaded to close the browser. We can do
            # this because the download folder is removed before returning,
            # so it should not hold files from an earlier run
            wait_for_any_file_in_folder(csv_download_folder_path, "csv")

            self.log.debug('Download finished.')
            report_df = pd.read_csv(
                get_most_recent_file_in_dir(csv_download_folder_path),
                encoding=SCHOOLMINT_DEFAULT_EXPORT_ENCODING,
                **pandas_read_csv_kwargs)

            # TODO: move this out of this function. It should happen as
            # cleanup once the whole DAG has completed
            shutil.rmtree(csv_download_folder_path)
        finally:
            # close the driver for this task, also when a step above failed
            self.driver.close()

        # if the dataframe is empty (the report had no data), raise an error.
        # The download folder is already gone at this point, so it must not
        # be removed a second time.
        if report_df.shape[0] == 0:
            raise ValueError('No data in report for user {} at url: {}'.format(
                self.username, interpret_report_url(self.base_url,
                                                    report_url)))

        return report_df

    def __get_number_of_pages(self):
        """Return the page count of the SchoolMint pagination control.

        Waits up to ``self.wait_time`` for the last pagination ``li`` that
        carries a ``data-page`` attribute and returns that value plus one
        (the attribute is presumably zero-based - confirm against the UI).
        """
        last_page_locator = (
            By.XPATH,
            '//*[@id="content"]//*[@class="pagination "]/li[@data-page][last()]')

        last_page_item = WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located(last_page_locator))

        return int(last_page_item.get_attribute("data-page")) + 1

    def __navigate_to_custom_report(self,
                                    report_name,
                                    school_year,
                                    download_folder_path=None):
        """Navigate to the page of the custom report tool that has the custom report on it.

        Builds a new driver (replacing ``self.driver``), logs in, selects
        ``school_year``, opens the Custom Reports page and clicks through its
        pagination until a table row containing ``report_name`` is present.
        The driver is left open on that page for the caller.

        :param report_name: Text to look for in the report table rows.
        :param school_year: The year in SchoolMint. Should be formatted as shown in the UI
            (e.g. '2018-2019')
        :param download_folder_path: Download location handed to the driver.
            Defaults to ``self.temp_folder_path``.
        :return: The pagination index (starting at 0) of the page showing the report.
        :raises ReportNotFound: if no page contains a matching row.
        """
        if not download_folder_path:
            download_folder_path = self.temp_folder_path
        self.driver = DriverBuilder().get_driver(
            download_location=download_folder_path, headless=self.headless)
        self._login()
        self._set_year(school_year, self.driver)

        # get the custom reports page
        custom_reports_url = 'report/customReports'
        self.driver.get(interpret_report_url(self.base_url,
                                             custom_reports_url))
        self.__remove_walk_me_and_support()

        # wait for the page to load and get the maximum number of pages
        num_pages = self.__get_number_of_pages()

        # the row locator does not change between pages, so build it once
        report_name_xpath = "//tr[td//text()[contains(., '{}')]]".format(
            report_name)

        current_page = 0
        while current_page < num_pages:
            try:
                self.driver.find_element_by_xpath(report_name_xpath)
                return current_page
            except NoSuchElementException:
                current_page += 1
                if current_page < num_pages:
                    next_page_xpath = '//*[@id="content"]//*[@class="pagination "]/li[@data-page={}]/a'.format(
                        current_page)
                    self.driver.find_element_by_xpath(next_page_xpath).click()

                    # scroll back to the top of the page, prevents selenium clicking errors
                    self.driver.execute_script("window.scrollTo(0, 0);")

        raise ReportNotFound(
            "Custom report '{}' was not found on any of the {} page(s).".format(
                report_name, num_pages))

    def generate_custom_report(self, report_name, school_year):
        """
        Clicks the generate button on a SchoolMint custom report.

        The browser opened for this task is closed before returning or
        raising.

        :param report_name: The name of the report exactly as it is shown in the SchoolMint UI
        :param school_year: The year in SchoolMint. Should be formatted as shown in the UI
            (e.g. '2018-2019')
        :return: True if the button was clicked. False if the button was not clicked because
            the report is generating.
        :raises ReportNotFound: if the UI shows a different school year, or the
            report's generate button does not appear within ``self.wait_time``.
        :raises ValueError: if the button carries an unexpected label.
        """
        self.__navigate_to_custom_report(report_name, school_year)

        try:
            if not self.check_school_year(school_year):
                raise ReportNotFound(
                    "Wrong school year detected prior to clicking generate.")

            generate_report_button_xpath = GENERATE_REPORT_BUTTON_XPATH.format(
                report_name=report_name)
            try:
                generate_report_button = WebDriverWait(
                    self.driver, self.wait_time).until(
                        EC.presence_of_element_located(
                            (By.XPATH, generate_report_button_xpath)))
            # WebDriverWait.until signals a missing element with
            # TimeoutException; NoSuchElementException is kept for
            # backward compatibility
            except (NoSuchElementException, TimeoutException):
                raise ReportNotFound(
                    "No generate button found for custom report '{}'.".format(
                        report_name))

            # read the label once so both comparisons see the same value
            button_text = generate_report_button.text
            if button_text == 'Generate Report':
                generate_report_button.click()
                return True
            if button_text == 'Report in Progress':
                return False
            raise ValueError("Unknown 'Generate Report' button text found")
        finally:
            # close the driver on every path, including the error paths
            self.driver.close()

    def is_custom_report_generating(self, report_name, school_year):
        """Checks if a SchoolMint Custom Report is generating or not.

        The browser opened for this check is closed before returning or
        raising.

        :param report_name: The name of the report exactly as it is shown in the SchoolMint UI
        :param school_year: The year in SchoolMint. Should be formatted as shown in the UI
            (e.g. '2018-2019')
        :return: True if the report's button reads 'Report in Progress', False if it
            reads 'Generate Report'.
        :raises ReportNotFound: if the report's generate button does not appear
            within ``self.wait_time``.
        :raises ValueError: if the button carries an unexpected label.
        """
        self.__navigate_to_custom_report(report_name, school_year)

        try:
            generate_report_button_xpath = GENERATE_REPORT_BUTTON_XPATH.format(
                report_name=report_name)
            try:
                generate_report_button = WebDriverWait(
                    self.driver, self.wait_time).until(
                        EC.presence_of_element_located(
                            (By.XPATH, generate_report_button_xpath)))
            # WebDriverWait.until signals a missing element with
            # TimeoutException; NoSuchElementException is kept for
            # backward compatibility
            except (NoSuchElementException, TimeoutException):
                raise ReportNotFound(
                    "No generate button found for custom report '{}'.".format(
                        report_name))

            # read the label once so both comparisons see the same value
            button_text = generate_report_button.text
            if button_text == 'Report in Progress':
                return True
            if button_text == 'Generate Report':
                return False
            raise ValueError("Unknown 'Generate Report' button text found")
        finally:
            # the navigation helper opened a browser for this check only
            self.driver.close()

    def get_last_custom_report_generation_datetime(self, report_name,
                                                   school_year):
        """Gets a report's generation timestamp in raw text.

        Tries the locator for the old custom reports interface first and the
        one for the new interface second, waiting up to ``self.wait_time`` for
        each. The browser opened for this lookup is closed before returning
        or raising.

        :param report_name: The name of the report exactly as it is shown in the SchoolMint UI
        :param school_year: The year in SchoolMint. Should be formatted as shown in the UI
            (e.g. '2018-2019')
        :return: The text of the report's "generated on" table cell.
        :raises ReportNotFound: if neither locator matches in time.
        """
        self.__navigate_to_custom_report(report_name, school_year)

        generated_on_xpath_templates = (
            # old custom reports interface
            "//tr[td[./text()='{}']]/td[4]",
            # new custom reports interface
            "//tr[td[text()=' {} ']]/td[contains(@class,'last_generated_date-td')]",
        )

        try:
            for xpath_template in generated_on_xpath_templates:
                report_generated_on_xpath = xpath_template.format(report_name)
                try:
                    return WebDriverWait(
                        self.driver, self.wait_time).until(
                            EC.presence_of_element_located(
                                (By.XPATH, report_generated_on_xpath))).text
                except TimeoutException:
                    # not this interface version; try the next locator
                    continue

            raise ReportNotFound(
                "No generation timestamp found for custom report '{}'.".format(
                    report_name))
        finally:
            # the navigation helper opened a browser for this lookup only
            self.driver.close()

    def _download_custom_report(self,
                                report_name,
                                school_year,
                                download_folder_path,
                                download_if_generating=False):
        """Protected helper that clicks a report's download link on the Custom Reports page.

        The link is clicked when the report's button reads 'Generate Report',
        or reads 'Report in Progress' while ``download_if_generating`` is
        truthy; otherwise ``ReportNotReady`` is raised.

        :param report_name: Report name used to build the row locators.
        :param school_year: The year in SchoolMint, formatted as shown in the UI.
        :param download_folder_path: Download location for the browser; falls back to
            ``self.temp_folder_path`` when falsy.
        :param download_if_generating: Whether to download while the report is generating.
        :return: The still-open driver; the caller is expected to close it.
        """
        target_folder = download_folder_path or self.temp_folder_path
        self.__navigate_to_custom_report(report_name, school_year,
                                         target_folder)

        # read the label of the report's generate button first
        status_locator = (By.XPATH,
                          GENERATE_REPORT_BUTTON_XPATH.format(
                              report_name=report_name))
        status_text = WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located(status_locator)).text

        # then locate the download link in the same table row
        download_locator = (By.XPATH, (
            "//tr[td[text() = '{report_name}' or text() = ' {report_name} ']]//a[contains(text(), 'Download')]"
        ).format(report_name=report_name))
        download_link = WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located(download_locator))

        may_download = (status_text == 'Generate Report'
                        or (status_text == 'Report in Progress'
                            and download_if_generating))
        if not may_download:
            raise ReportNotReady

        download_link.click()
        return self.driver

    def download_csv_custom_report(self,
                                   report_name,
                                   school_year,
                                   download_if_generating=False,
                                   pandas_read_csv_kwargs=None):
        """Download a SchoolMint Custom Report that downloads as a single CSV file.

        The file is downloaded into a sub-folder of ``self.temp_folder_path``
        named after the report (spaces replaced by underscores, lower-cased).

        :param report_name: The name of the report exactly as it is shown in the SchoolMint UI
        :param school_year: The year in SchoolMint. Should be formatted as shown in the UI
            (e.g. '2018-2019')
        :param download_if_generating: Whether or not to download a custom report if the
            report is currently generating.
        :param pandas_read_csv_kwargs: Additional keyword arguments to pass to pandas read_csv.
        :return: A Pandas DataFrame of the report contents.
        :raises NoDataError: if the downloaded report has no rows.
        """
        if pandas_read_csv_kwargs is None:
            pandas_read_csv_kwargs = {}

        temp_folder_name = report_name.replace(" ", "_").lower()
        csv_download_folder_path = self.temp_folder_path + '/' + temp_folder_name
        driver = self._download_custom_report(report_name, school_year,
                                              csv_download_folder_path,
                                              download_if_generating)

        try:
            # wait until file has downloaded to close the browser. We can do
            # this because we delete the file before we return it, so the
            # temp dir should always be empty when this command is run
            wait_for_any_file_in_folder(csv_download_folder_path, "csv")

            report_df = pd.read_csv(
                get_most_recent_file_in_dir(csv_download_folder_path),
                encoding=SCHOOLMINT_DEFAULT_EXPORT_ENCODING,
                **pandas_read_csv_kwargs)

            # delete any files in the report's temp folder; we don't need
            # them now
            # TODO: move this out of this function. It should happen as
            # cleanup once the whole DAG has completed
            delete_folder_contents(csv_download_folder_path)
        finally:
            # close the driver for this task, also when a step above failed
            driver.close()

        # if the dataframe is empty (the report had no data), raise an error
        if report_df.shape[0] == 0:
            raise NoDataError(
                'No data for user {} in Custom Report: {}'.format(
                    self.username, report_name))

        return report_df

    def download_zip_custom_report(self,
                                   report_name,
                                   school_year,
                                   download_folder_path=None,
                                   download_if_generating=False,
                                   unzip=True,
                                   pandas_read_csv_kwargs=None):
        """
        Downloads a SchoolMint Custom Report that downloads as a zipped set of CSVs
        :param report_name: The name of the report exactly as it is shown in the SchoolMint UI
        :param school_year: The year in SchoolMint. Should be formatted as shown in the UI
            (e.g. '2018-2019')
        :param download_folder_path: The path to where you want to store the zip file.
            Defaults to ``self.temp_folder_path``. The file lands in a per-run
            sub-folder named '<report_name>-<YYYYMMDD>-<HHMMSS>' (UTC).
        :param download_if_generating: Whether or not to download a custom report if the
            report is currently generating.
        :param unzip: Boolean. If True, not only downloads the file, but also unzips it and
            returns each csv in a Pandas Dataframe in a dictionary.
        :param pandas_read_csv_kwargs: Additional keyword arguments to pass to pandas' read_csv function.
        :return: None or a dictionary of Pandas DataFrames representing each of the CSVs in
            the zipped file, keyed by CSV file name.
        """
        if pandas_read_csv_kwargs is None:
            pandas_read_csv_kwargs = {}

        # create a folder for this specific run
        run_time = datetime.datetime.utcnow()
        if not download_folder_path:
            download_folder_path = self.temp_folder_path
        download_dir_final = "{}/{}-{}-{}".format(download_folder_path,
                                                  report_name,
                                                  run_time.strftime('%Y%m%d'),
                                                  run_time.strftime('%H%M%S'))
        driver = self._download_custom_report(report_name, school_year,
                                              download_dir_final,
                                              download_if_generating)

        # wait until the file has downloaded before closing the browser; the
        # per-run folder is new, so any zip in it belongs to this download
        try:
            wait_for_any_file_in_folder(download_dir_final, "zip")
        finally:
            # close the driver even when the wait fails
            driver.close()

        if not unzip:
            return None

        # unzip the most recently created zip file into the run folder
        file_path = max(glob.iglob(download_dir_final + '/*.zip'),
                        key=os.path.getctime)
        ZipfileLongPaths(file_path).extractall(download_dir_final)

        dfs = dict()
        # iterate through the unzipped files and load them into dataframes
        for csv_filepath in glob.iglob(download_dir_final + '/*.csv'):
            csv_filename = os.path.basename(csv_filepath)
            # files that start with a number are the custom forms files
            if re.match(r"^(\d+)", csv_filename):
                # the first and third lines of these files are skipped
                # (presumably non-data header rows - confirm against an export)
                dfs[csv_filename] = pd.read_csv(
                    csv_filepath,
                    encoding=SCHOOLMINT_DEFAULT_EXPORT_ENCODING,
                    skiprows=[0, 2],
                    **pandas_read_csv_kwargs)
            # otherwise it is the info file that comes along with the zip
            # export (application-data-export, etc.)
            else:
                dfs[csv_filename] = pd.read_csv(
                    csv_filepath,
                    encoding=SCHOOLMINT_DEFAULT_EXPORT_ENCODING,
                    **pandas_read_csv_kwargs)

        return dfs
Ejemplo n.º 4
0
class Mealtime(WebUIDataSource):
    """Web UI data source for Mealtime: signs in and downloads reports."""

    def __init__(self, username, password, wait_time, hostname, temp_folder_path, headless=False):
        super().__init__(username, password, wait_time, hostname, temp_folder_path, headless)
        self.uri_scheme = 'https://'
        self.base_url = self.uri_scheme + self.hostname

    def _login(self):
        """Sign in to the configured Mealtime instance with ``self.driver``."""
        self.driver.get(self.base_url + '/Base/SignIn.aspx')

        username_field = self.driver.find_element_by_id("username")
        username_field.clear()
        username_field.send_keys(self.username)

        password_field = self.driver.find_element_by_id("password")
        password_field.send_keys(self.password)
        # pressing RETURN in the password field submits the form
        password_field.send_keys(Keys.RETURN)

    def download_url_report(self, report_url, temp_folder_name):
        """Download a MealTime report into a Pandas DataFrame.

        Args:
            report_url (string): Path and query string of the report to
                fetch. Any filtering that can be expressed in a stateful URL
                should be included.
            temp_folder_name (string): Name of the folder, below
                ``self.temp_folder_path``, that receives this report's
                downloaded files.

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            ValueError: if the downloaded report has no rows.
        """
        download_dir = self.temp_folder_path + '/' + temp_folder_name

        # start a browser that saves downloads into download_dir and sign in
        self.driver = DriverBuilder().get_driver(download_dir, self.headless)
        self._login()

        # open the report page
        self.driver.get(interpret_report_url(self.base_url, report_url))

        # choose the export format: CSV when offered, otherwise header-less Excel
        format_dropdown = Select(self.driver.find_element_by_id('ctl00_ctl00_MainContent_reportViewer_ctl01_ctl05_ctl00'))
        try:
            format_dropdown.select_by_value('CSV')
        except NoSuchElementException:
            format_dropdown.select_by_value('EXCELNoHeader')
            dl_type = 'xls'
        else:
            dl_type = 'csv'
        # trigger the export
        self.driver.find_element_by_id('ctl00_ctl00_MainContent_reportViewer_ctl01_ctl05_ctl01').click()

        # Block until the file shows up. This works because the folder is
        # emptied before returning, so it should hold nothing from earlier runs.
        # TODO add a try/except block here
        wait_for_any_file_in_folder(download_dir, dl_type)

        # header= skips the leading rows above the column names
        newest_file = get_most_recent_file_in_dir(download_dir)
        if dl_type == 'csv':
            report_df = pd.read_csv(newest_file, header=2)
        else:
            report_df = pd.read_excel(newest_file, header=3)

        # The downloaded files are no longer needed once parsed.
        # TODO: move this out of this function. It should happen as cleanup once
        # the whole DAG has completed
        delete_folder_contents(download_dir)

        self.driver.close()

        # an empty report is treated as an error
        if not report_df.shape[0]:
            raise ValueError('No data in report for user {} at url: {}'.format(self.username, interpret_report_url(self.base_url, report_url)))

        return report_df
Ejemplo n.º 5
0
class SummitLearning(WebUIDataSource, LoggingMixin):
    """Class for interacting with the Summit Learning web UI.

    Logs in (through Google OAuth when ``login_provider`` is 'google') and
    downloads CSV reports into Pandas DataFrames.
    """

    def __init__(self,
                 username,
                 password,
                 wait_time,
                 hostname='summitlearning.org',
                 temp_folder_path=None,
                 headless=False,
                 login_provider='google'):
        super().__init__(username, password, wait_time, hostname,
                         temp_folder_path, headless)
        self.login_provider = login_provider
        self.uri_scheme = 'https://'
        self.base_url = self.uri_scheme + 'www.' + self.hostname

    def _login(self):
        """Logs into Summit Learning and waits for the teacher app to load.

        Only the 'google' login provider performs any navigation; for any
        other value this method just waits for the 'app-teacher' element.
        """
        if self.login_provider == 'google':
            login_url = self.base_url + '/auth/google_oauth2'
            self.driver.get(login_url)

            # the Google login screen has multiple versions - the 'Email' one
            # seems to be used when headless
            try:
                elem = self.driver.find_element_by_id('Email')
            except NoSuchElementException:
                elem = self.driver.find_element_by_id('identifierId')
            elem.clear()
            elem.send_keys(self.username)
            elem.send_keys(Keys.RETURN)

            # headless version of Google Login
            elem = WebDriverWait(self.driver, self.wait_time).until(
                EC.visibility_of_element_located((By.ID, 'password')))
            # regular version of Google login: the element with that id is a
            # wrapping div, so look up the actual input by name instead
            if elem.tag_name == 'div':
                elem = WebDriverWait(self.driver, self.wait_time).until(
                    EC.element_to_be_clickable((By.NAME, 'password')))
            elem.send_keys(self.password)
            elem.send_keys(Keys.RETURN)

        # wait for the destination page to fully load
        WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'app-teacher')))

    def _close_driver_and_cleanup(self, download_folder_path, keep_folder):
        """Closes the driver and removes the download folder unless it must be kept."""
        try:
            self.driver.close()
        finally:
            if not keep_folder:
                shutil.rmtree(download_folder_path)

    def download_url_report(self, report_url, write_to_disk=None, **kwargs):
        """ Downloads a Summit Learning report at a URL that triggers a CSV download

        Args:
            report_url (string): Information pertaining to the path and query
                string for the report whose access is desired. Any filtering
                that can be done with a stateful URL should be included.
            write_to_disk (string): The path for a directory to store the
                downloaded file. If nothing is provided, the file will be
                stored in a temporary directory and deleted at the end of
                this function.
            **kwargs: additional arguments to pass to Pandas read_csv

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            NoDataError: if the downloaded report has no rows.
        """

        report_download_url = interpret_report_url(self.base_url, report_url)

        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = mkdtemp()
        self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                 self.headless)
        try:
            self._login()

            self.log.debug(
                'Getting report page at: {}'.format(report_download_url))
            self.driver.get(report_download_url)

            self.log.debug(
                'Starting download of: {}'.format(report_download_url))

            wait_for_any_file_in_folder(csv_download_folder_path, "csv")
            self.log.debug('Download Finished.')

            df_report = pd.read_csv(
                get_most_recent_file_in_dir(csv_download_folder_path),
                **kwargs)
        finally:
            # release the browser and the temporary folder on every path,
            # including failures
            self._close_driver_and_cleanup(csv_download_folder_path,
                                           keep_folder=write_to_disk)

        # if the dataframe is empty (the report had no data), raise an error
        if df_report.shape[0] == 0:
            raise NoDataError(
                'No data in report for user {} at url: {}'.format(
                    self.username,
                    interpret_report_url(self.base_url, report_url)))

        return df_report

    def _set_dl_academic_year(self, academic_year):
        """Sets the academic year to download the reports from

        Args:
            academic_year (string): The academic year that should be selected. Use the format shown in the
                Summit Learning interface. Example: '2016-2017'

        :return: True if function succeeds
        """
        # The UI uses a '–' instead of a '-'. We'll make a convenience replacement
        academic_year = academic_year.replace('-', '–')

        self.log.debug('Changing academic year to: {}'.format(academic_year))

        # open the menu to select the academic year
        elem = WebDriverWait(self.driver, self.wait_time).until(
            EC.element_to_be_clickable((By.ID, 'academic-year-selector')))
        elem.click()

        # select the appropriate year
        try:
            year_xpath = "//*[@id='academic-year-selector']/parent::div//a[contains(text(),'{}')]".format(
                academic_year)
            elem = self.driver.find_element_by_xpath(year_xpath)
            elem.click()
        except NoSuchElementException as e:
            # keep a screenshot for debugging, then re-raise the same
            # exception type with a hint about the offending value
            self.driver.save_screenshot('cannot_find_year.png')
            message = (
                ' Check that the academic_year variable is valid. '
                'Passed value for academic_year: {}').format(academic_year)

            raise_with_traceback(type(e)(str(e) + message))

        return True

    def check_dl_academic_year(self, academic_year):
        """Checks that the academic year is set as expected in the UI."""
        # The UI uses a '–' instead of a '-'. We'll make a convenience replacement
        academic_year = academic_year.replace('-', '–')

        elem = self.driver.find_element_by_xpath(
            "//*[@id='academic-year-selector']/parent::div//button")

        return academic_year in elem.text

    def download_site_data_download(
            self,
            dl_heading,
            site_id,
            academic_year,
            report_generation_wait=REPORT_GENERATION_WAIT,
            write_to_disk=None,
            **kwargs):
        """ Downloads a CSV from a site's Summit Learning data downloads page

        Args:
            dl_heading (string): Text contained in the heading of the download
                section whose CSV should be fetched.
            site_id: The site id used to build the data downloads page URL.
            academic_year (string): The academic year to select before
                downloading. Example: '2016-2017'
            report_generation_wait: Seconds to wait for the download link to
                appear (waited once, then once more after a page refresh).
            write_to_disk (string): The path for a directory to store the
                downloaded file. If nothing is provided, the file will be
                stored in a temporary directory and deleted at the end of
                this function.
            **kwargs: additional arguments to pass to Pandas read_csv

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            ValueError: if the academic year could not be set in the UI.
            NoDataError: if the downloaded report has no rows.
        """
        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = mkdtemp()
        self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                 self.headless)
        try:
            self._login()

            dl_page_url = "{base_url}/sites/{site_id}/data_downloads/".format(
                base_url=self.base_url, site_id=site_id)

            self.driver.get(dl_page_url)

            self._set_dl_academic_year(academic_year)

            if not self.check_dl_academic_year(academic_year):
                raise ValueError("Academic Year not correctly set")

            # start the CSV generation process
            download_button_xpath = "//h3[contains(text(), '{dl_heading}')]/parent::div/parent::div//a[contains(text(), '{button_text}')]"

            # try to find the "Download CSV" button - old version of the interface
            old_interface = False
            try:
                elem = self.driver.find_element_by_xpath(
                    download_button_xpath.format(dl_heading=dl_heading,
                                                 button_text='Download CSV'))
                old_interface = True
                self.log.info("'Download CSV' interface detected.")
                elem.click()
            # if it's not there, fall through to the new interface below
            except NoSuchElementException:
                pass

            # try to find the "Generate CSV" button - new version of the interface
            if not old_interface:
                gen_button_xpath = "//h3[contains(text(), '{dl_heading}')]/parent::div/parent::div//button[contains(text(), '{button_text}')]"
                try:
                    elem = self.driver.find_element_by_xpath(
                        gen_button_xpath.format(dl_heading=dl_heading,
                                                button_text='Generate CSV'))
                    self.log.info("'Generate CSV' interface detected.")
                    elem.click()
                # if it's not there, it may have changed to a "Download" or
                # "Refresh" button
                except NoSuchElementException:
                    try:
                        # a "Download" button is only probed for, not
                        # clicked; the download itself is triggered below
                        elem = self.driver.find_element_by_xpath(
                            gen_button_xpath.format(dl_heading=dl_heading,
                                                    button_text='Download'))
                    except NoSuchElementException:
                        elem = self.driver.find_element_by_xpath(
                            gen_button_xpath.format(dl_heading=dl_heading,
                                                    button_text='Refresh'))
                        elem.click()

            # wait for the refresh command to be issued
            time.sleep(1)

            # wait for the report to be available and download it
            self.log.info(
                'Starting download of report "{}" for site_id "{}"'.format(
                    dl_heading, site_id))

            dl_button_xpath = "//h3[contains(text(), '{dl_heading}')]/parent::div/parent::div//a[contains(text(), 'Download')]"
            try:
                elem = WebDriverWait(self.driver, report_generation_wait).until(
                    EC.presence_of_element_located(
                        (By.XPATH,
                         dl_button_xpath.format(dl_heading=dl_heading))))
                elem.click()
            # if the download is not ready, refresh the page and try one more time
            except TimeoutException:
                self.driver.refresh()
                elem = WebDriverWait(self.driver, report_generation_wait).until(
                    EC.presence_of_element_located(
                        (By.XPATH,
                         dl_button_xpath.format(dl_heading=dl_heading))))
                elem.click()

            wait_for_any_file_in_folder(csv_download_folder_path, "csv")
            self.log.debug('Download Finished.')

            df_report = pd.read_csv(
                get_most_recent_file_in_dir(csv_download_folder_path),
                **kwargs)
        finally:
            # release the browser and the temporary folder on every path,
            # including failures
            self._close_driver_and_cleanup(csv_download_folder_path,
                                           keep_folder=write_to_disk)

        # if the dataframe is empty (the report had no data), raise an error
        if df_report.shape[0] == 0:
            raise NoDataError('No data in report "{}" for site_id "{}"'.format(
                dl_heading, site_id))

        return df_report
Ejemplo n.º 6
0
class Clever(WebUIDataSource, LoggingMixin):
    """ Class for interacting with the Clever Web UI
    """

    def __init__(self, username, password, wait_time, hostname='schools.clever.com',
                 temp_folder_path=None, headless=False):
        """ Sets up a Clever data source.

        :param username: Username typed into the Clever login form.
        :param password: Password typed into the Clever login form.
        :param wait_time: Forwarded to the parent initializer; this class passes
            self.wait_time to WebDriverWait as the timeout when waiting for elements.
        :param hostname: Host name that is appended to 'https://' to form base_url.
        :param temp_folder_path: Forwarded unchanged to the parent initializer.
        :param headless: Stored on the instance and handed to DriverBuilder when a
            download driver is created.
        """
        super().__init__(username, password, wait_time, hostname, temp_folder_path)
        # the parent initializer is not given the headless flag, so keep it here
        self.headless = headless
        self.uri_scheme = 'https://'
        self.base_url = self.uri_scheme + self.hostname
        self.log.debug('creating instance of Clever')

    def _login(self):
        """ Logs into the provided Clever instance.

        Loads the base URL, fills in and submits the username/password form, then
        reloads the base URL and inspects the page title to confirm that the
        session is authenticated.

        :raises InvalidLoginCredentials: If the page title after submitting the form
            does not contain 'Clever | Home'. The driver window is closed before
            the exception is raised.
        :raises TimeoutException: From WebDriverWait if the username field does not
            appear within self.wait_time.
        """
        self.log.info('Logging into Clever instance: hostname, username: {}, {}'.format(
            self.hostname, self.username
        ))
        self.driver.get(self.base_url)
        # wait until login form available
        username_field = WebDriverWait(self.driver, self.wait_time).until(
            EC.presence_of_element_located((By.NAME, 'username')))

        username_field.clear()
        username_field.send_keys(self.username)

        # find_element(By.NAME, ...) is used instead of find_element_by_name: the
        # find_element_by_* helpers were deprecated and later removed in Selenium 4,
        # while this form works on both Selenium 3 and 4
        password_field = self.driver.find_element(By.NAME, 'password')
        password_field.send_keys(self.password)
        password_field.send_keys(Keys.RETURN)

        # ensure that login is successful
        self.driver.get(self.base_url)

        if 'Clever | Home' not in self.driver.title:
            self.driver.close()
            raise InvalidLoginCredentials

    def download_url_report(self, report_url, collection, write_to_disk=None, **kwargs):
        """ Convenience alias for download_data_shared_with_application.

        :param report_url: Forwarded as the application page URL.
        :param collection: Forwarded unchanged; names the shared collection to download.
        :param write_to_disk: Forwarded unchanged; directory in which to keep the CSV.
        :param kwargs: Forwarded unchanged as additional keyword arguments.
        :return: Whatever download_data_shared_with_application returns.
        """
        df_report = self.download_data_shared_with_application(
            report_url, collection, write_to_disk, **kwargs)
        return df_report

    def download_data_shared_with_application(self, application_page_url, collection,
                                              write_to_disk=None, **kwargs):
        """
        Downloads the data shared with a particular application through Clever.

        The browser driver is always closed and, when no write_to_disk folder was
        given, the temporary download folder is always removed, even if logging in,
        locating the download link or reading the CSV fails.

        :param application_page_url: The url for the main Clever management page for a
            particular application. For example, for My Lexia, this would be
            https://schools.clever.com/applications/lexia-mylexia
        :param collection: A string of 'schools', 'students', 'sections', 'teachers', 'schooladmins'
            that indicates which shared data to download. Case and spaces are ignored.
        :param write_to_disk: A path to a directory where the downloaded CSV should be saved.
            If nothing is passed, the CSV is downloaded into a temporary directory that is
            deleted before returning, and only a Pandas DataFrame is returned.
        :param kwargs: Additional keyword arguments to be passed to the Pandas read_csv function.
        :return: A Pandas DataFrame of the indicated collection download.
        :raises ReportNotFound: If collection is not one of the supported names.
        :raises ValueError: If the downloaded report has no rows, for every collection
            except 'schooladmins' (which only triggers a warning).
        """
        # selenium is already used throughout this module; imported locally so this
        # method does not depend on the name being present in the top-of-file imports
        from selenium.common.exceptions import WebDriverException

        collection = collection.lower().replace(' ', '')
        if collection not in ('schools', 'students', 'sections', 'teachers', 'schooladmins'):
            raise ReportNotFound(
                (
                    "Argument for collection '{collection}' is not valid. Please choose from: "
                    "'schools', 'students', 'sections', 'teachers', 'schooladmins'."
                ).format(collection=collection)
            )
        report_access_page_url = interpret_report_url(self.base_url, application_page_url)

        csv_download_folder_path = write_to_disk if write_to_disk else mkdtemp()
        try:
            self.driver = DriverBuilder().get_driver(csv_download_folder_path, self.headless)
            try:
                self._login()

                self.log.debug('Getting report access page at: {}'.format(report_access_page_url))
                self.driver.get(report_access_page_url)

                # find and click the download button based on the collection desired
                download_link = WebDriverWait(self.driver, self.wait_time).until(
                    EC.presence_of_element_located(
                        (By.XPATH,
                         "//a[contains(@href, '{collection}.csv')]".format(collection=collection))
                    )
                )
                self.log.info(
                    'Starting download of: {} - {}'.format(report_access_page_url, collection))
                download_link.click()

                wait_for_any_file_in_folder(csv_download_folder_path, "csv")
                self.log.info('Download Finished.')

                df_report = pd.read_csv(
                    get_most_recent_file_in_dir(csv_download_folder_path), **kwargs)
            finally:
                # _login closes the driver itself before raising on bad credentials;
                # a second close must not mask the error that is already propagating
                try:
                    self.driver.close()
                except WebDriverException:
                    self.log.debug('Driver could not be closed; it may already be closed.')
        finally:
            # only remove the folder when it is the temporary one created above
            if not write_to_disk:
                shutil.rmtree(csv_download_folder_path)

        # if the dataframe is empty (the report had no data), raise an error;
        # an empty 'schooladmins' export is tolerated and only produces a warning
        if df_report.shape[0] == 0:
            if collection != 'schooladmins':
                raise ValueError('No data in report for user {} at url: {}'.format(
                    self.username, report_access_page_url))
            warnings.warn("The 'schooladmins' collection has no data. Ensure that no school admins are shared.")

        return df_report

    def download_google_accounts_manager_student_export(self):
        """ Downloads the Google Accounts Manager Student Export that includes student emails.

        Logs in with a browser driver, visits the pages whose cookies are needed,
        copies those cookies into a requests session and downloads the CSV export
        with that session, so no file has to be written to disk. The download is
        attempted up to 10 times, pausing 1 to 1.5 seconds between attempts. The
        driver is closed whether or not the download succeeds.

        :return: A Pandas DataFrame parsed from the downloaded CSV export.
        :raises ValueError: If every download attempt returns an unsuccessful response.
        """
        # local imports: `random` supplies the retry jitter without depending on which
        # `randint` the module imports (random.randint requires two bounds, so the
        # previous one-argument call only works with numpy's); selenium is already
        # used throughout this module
        import random
        from selenium.common.exceptions import WebDriverException

        report_url = 'https://gaprov.ops.clever.com/reporting/student'
        max_attempts = 10

        self.log.info('Starting student email download.')
        # set up the driver for execution
        self.driver = configure_selenium_chrome()
        try:
            self._login()

            # grab some cookies (need to do this here for _mkto_trk cookie)
            cookies_orig = self.driver.get_cookies()

            # open the Google Accounts Manager application page
            # note - clever applications like Google Accounts Manager have unique ids that
            #  are a part of their URL
            # note - we have to get the settings page of the Google Accounts Manager to get
            #  the cookie that we need in order to download the file
            self.driver.get(
                'https://schools.clever.com/school/applications/50ca15a93bc2733956000007/settings')
            cookies_schools = self.driver.get_cookies()

            # we may need to get the gaprov.ops.clever.com to get a cookie in new versions
            # of chromedriver
            self.driver.get('https://gaprov.ops.clever.com/')
            cookies_gaprov = self.driver.get_cookies()

            # create requests session to download report without need for file storage
            with requests.Session() as session:

                # transfer the cookies to the requests session; later groups overwrite
                # same-named cookies from earlier groups
                for browser_cookies in (cookies_orig, cookies_schools, cookies_gaprov):
                    for cookie in browser_cookies:
                        session.cookies.set(cookie['name'], cookie['value'])

                session.cookies.set('_gat', "1")
                session.cookies.set('_gat_globalTracker', "1")

                for attempt in range(1, max_attempts + 1):
                    download_response = session.get(report_url, stream=True)

                    if download_response.ok:
                        df_report = pd.read_csv(
                            io.StringIO(download_response.content.decode('utf-8')))
                        break

                    self.log.info('Download failed for report url: {}'.format(report_url))
                    self.log.info(
                        'Download status_code: {}'.format(download_response.status_code))
                    if attempt < max_attempts:
                        self.log.info('Retrying... Retry#: {}'.format(attempt))
                        # add some jitter to the requests
                        time.sleep(random.uniform(1.0, 1.5))
                else:
                    # the loop finished without a successful response
                    raise ValueError('Unable to download report after multiple retries.')
        finally:
            # _login closes the driver itself before raising on bad credentials;
            # a second close must not mask the error that is already propagating
            try:
                self.driver.close()
            except WebDriverException:
                self.log.debug('Driver could not be closed; it may already be closed.')

        self.log.info('Student email download complete.')

        return df_report