Example #1
0
    def download_url_report(self, report_url, temp_folder_name):
        """ Downloads a MealTime report.

        Args:
            report_url (string): Information pertaining to the path and query
                string for the report whose access is desired. Any filtering
                that can be done with a stateful URL should be included.
            temp_folder_name (string): The name of the folder in which this
                specific report's download files should be stored.

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            ValueError: If the downloaded report contains no rows.
        """
        csv_download_folder_path = self.temp_folder_path + '/' + temp_folder_name
        full_report_url = interpret_report_url(self.base_url, report_url)

        # set up the driver for execution
        self.driver = DriverBuilder().get_driver(csv_download_folder_path, self.headless)
        try:
            self._login()

            # get the report url
            self.driver.get(full_report_url)

            # select the download format and execute; prefer CSV and fall back
            # to header-less Excel when the CSV option is not offered
            export_format_select = Select(self.driver.find_element_by_id(
                'ctl00_ctl00_MainContent_reportViewer_ctl01_ctl05_ctl00'))
            try:
                export_format_select.select_by_value('CSV')
                dl_type = 'csv'
            except NoSuchElementException:
                export_format_select.select_by_value('EXCELNoHeader')
                dl_type = 'xls'
            self.driver.find_element_by_id(
                'ctl00_ctl00_MainContent_reportViewer_ctl01_ctl05_ctl01').click()

            # wait until file has downloaded to close the browser. We can do
            # this because we delete the file before we return it, so the temp
            # dir should always be empty when this command is run
            wait_for_any_file_in_folder(csv_download_folder_path, dl_type)

            try:
                downloaded_file = get_most_recent_file_in_dir(
                    csv_download_folder_path)
                # the `header` offsets skip the leading rows of the export
                if dl_type == 'csv':
                    report_df = pd.read_csv(downloaded_file, header=2)
                else:
                    report_df = pd.read_excel(downloaded_file, header=3)
            finally:
                # delete the downloaded files even when parsing fails, so a
                # stale file is never picked up by the next run
                # TODO: move this out of this function. It should happen as
                # cleanup once the whole DAG has completed
                delete_folder_contents(csv_download_folder_path)
        finally:
            # always release the browser, including on failure
            self.driver.close()

        # if the dataframe is empty (the report had no data), raise an error
        if report_df.shape[0] == 0:
            raise ValueError('No data in report for user {} at url: {}'.format(
                self.username, full_report_url))

        return report_df
Example #2
0
    def download_url_report(self, report_url, write_to_disk=None, **kwargs):
        """ Downloads a Summit Learning report at a URL that triggers a CSV download

        Args:
            report_url (string): Information pertaining to the path and query
                string for the report whose access is desired. Any filtering
                that can be done with a stateful URL should be included.
            write_to_disk (string): The path for a directory to store the
                downloaded file. If nothing is provided, the file will be
                stored in a temporary directory and deleted at the end of
                this function.
            **kwargs: additional arguments to pass to Pandas read_csv

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            NoDataError: If the downloaded report contains no rows.
        """

        report_download_url = interpret_report_url(self.base_url, report_url)

        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = mkdtemp()

        try:
            self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                     self.headless)
            try:
                self._login()

                self.log.debug(
                    'Getting report page at: {}'.format(report_download_url))
                # navigating to the URL is what triggers the download
                self.driver.get(report_download_url)

                self.log.debug(
                    'Starting download of: {}'.format(report_download_url))

                wait_for_any_file_in_folder(csv_download_folder_path, "csv")
                self.log.debug('Download Finished.')

                df_report = pd.read_csv(
                    get_most_recent_file_in_dir(csv_download_folder_path),
                    **kwargs)
            finally:
                # always release the browser, including on failure
                self.driver.close()
        finally:
            # only remove the folder when we created it ourselves
            if not write_to_disk:
                shutil.rmtree(csv_download_folder_path)

        # if the dataframe is empty (the report had no data), raise an error
        if df_report.shape[0] == 0:
            raise NoDataError(
                'No data in report for user {} at url: {}'.format(
                    self.username, report_download_url))

        return df_report
Example #3
0
    def download_csv_custom_report(self,
                                   report_name,
                                   school_year,
                                   download_if_generating=False,
                                   pandas_read_csv_kwargs={}):
        """Download a SchoolMint Custom Report that downloads as a single CSV file

        Args:
            report_name (string): The name of the Custom Report. It is also
                used (lower-cased, spaces replaced by underscores) as the name
                of the download sub-folder.
            school_year (string): Passed through to _download_custom_report.
            download_if_generating (bool): Passed through to
                _download_custom_report.
            pandas_read_csv_kwargs (dict): additional arguments to pass to
                Pandas read_csv. The dict is only read, never modified.

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            NoDataError: If the downloaded report contains no rows.
        """
        temp_folder_name = report_name.replace(" ", "_").lower()
        csv_download_folder_path = self.temp_folder_path + '/' + temp_folder_name
        driver = self._download_custom_report(report_name, school_year,
                                              csv_download_folder_path,
                                              download_if_generating)
        try:
            # wait until file has downloaded to close the browser. We can do
            # this because we delete the file before we return it, so the temp
            # dir should always be empty when this command is run
            wait_for_any_file_in_folder(csv_download_folder_path, "csv")

            try:
                report_df = pd.read_csv(
                    get_most_recent_file_in_dir(csv_download_folder_path),
                    encoding=SCHOOLMINT_DEFAULT_EXPORT_ENCODING,
                    **pandas_read_csv_kwargs)
            finally:
                # delete the downloaded files even when parsing fails, so a
                # stale file is never picked up by the next run
                # TODO: move this out of this function. It should happen as
                # cleanup once the whole DAG has completed
                delete_folder_contents(csv_download_folder_path)
        finally:
            # close the driver for this task, including on failure
            driver.close()

        # if the dataframe is empty (the report had no data), raise an error
        if report_df.shape[0] == 0:
            raise NoDataError(
                'No data for user {} in Custom Report: {}'.format(
                    self.username, report_name))

        return report_df
Example #4
0
    def download_url_report(self, report_url, write_to_disk=None, **kwargs):
        """ Downloads a Lexia report at a URL for a page with an 'export' button.

        Args:
            report_url (string): Information pertaining to the path and query
                string for the report whose access is desired. Any filtering
                that can be done with a stateful URL should be included.
            write_to_disk (string): The path for a directory to store the
                downloaded file. If nothing is provided, the file will be
                stored in a temporary directory and deleted at the end of
                this function.
            **kwargs: additional arguments to pass to Pandas read_excel (or
                on to download_manage_tab_report when the URL points at a
                manage tab report)

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            ValueError: If the downloaded report contains no rows.
        """

        report_download_url = interpret_report_url(self.base_url, report_url)

        # if user is trying to download a manage tab report (for convenience)
        if '/mylexiaweb/app/index.html#/groups/' in report_download_url:
            return self.download_manage_tab_report(report_url, write_to_disk,
                                                   **kwargs)

        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = mkdtemp()

        try:
            self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                     self.headless)
            try:
                self._login()

                self.log.debug(
                    'Getting report page at: {}'.format(report_download_url))
                self.driver.get(report_download_url)

                # find and click the download button
                export_button = WebDriverWait(
                    self.driver, self.wait_time).until(
                        EC.presence_of_element_located(
                            (By.XPATH,
                             "//button[contains(text(), 'Export')]")))

                self.log.debug(
                    'Starting download of: {}'.format(report_download_url))
                export_button.click()

                wait_for_any_file_in_folder(csv_download_folder_path, "xlsx")
                self.log.debug('Download Finished.')

                df_report = pd.read_excel(
                    get_most_recent_file_in_dir(csv_download_folder_path),
                    **kwargs)
            finally:
                # always release the browser, including on failure
                self.driver.close()
        finally:
            # only remove the folder when we created it ourselves
            if not write_to_disk:
                shutil.rmtree(csv_download_folder_path)

        # if the dataframe is empty (the report had no data), raise an error
        if df_report.shape[0] == 0:
            raise ValueError('No data in report for user {} at url: {}'.format(
                self.username, report_download_url))

        return df_report
Example #5
0
    def download_manage_tab_report(self,
                                   report_url,
                                   write_to_disk=None,
                                   **kwargs):
        """ Downloads a Lexia report from the 'Manage' tab.

        Args:
            report_url (string): Information pertaining to the path and query
                string for the report whose access is desired. Any filtering
                that can be done with a stateful URL should be included.
            write_to_disk (string): The path for a directory to store the
                downloaded file. If nothing is provided, the file will be
                stored in a temporary directory and deleted at the end of
                this function.
            **kwargs: additional arguments to pass to Pandas read_csv

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            ValueError: If the downloaded report contains no rows.
        """
        report_download_url = interpret_report_url(self.base_url, report_url)

        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = mkdtemp()

        # select all users and find the download button
        def check_for_export_button_enabled(driver, elem_select_all_locator,
                                            elem_export_locator):
            """Tick 'select all' and return the export button once usable.

            Returns False while either element is not ready, so it can be
            polled by WebDriverWait.until.
            """
            elem_select_all = driver.find_element(*elem_select_all_locator)
            if not elem_select_all.is_enabled():
                return False
            elem_select_all.click()
            if not elem_select_all.is_selected():
                return False
            elem_export = driver.find_element(*elem_export_locator)
            if elem_export.is_enabled() and elem_export.is_displayed():
                return elem_export
            return False

        try:
            self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                     self.headless)
            try:
                self._login()

                self.log.debug(
                    'Getting report page at: {}'.format(report_download_url))
                self.driver.get(report_download_url)

                # have to use a lambda because until expects a callable
                elem_export = WebDriverWait(
                    self.driver, self.wait_time
                ).until(lambda x: check_for_export_button_enabled(
                    self.driver, (By.NAME, "lexia-select-all"),
                    (By.XPATH, "//button[contains(text(), 'Export')]")))
                self.log.debug(
                    'Starting download of: {}'.format(report_download_url))
                elem_export.click()

                wait_for_any_file_in_folder(csv_download_folder_path, "xls")
                self.log.debug('Download Finished.')

                # NOTE(review): the file is awaited as "xls" but parsed as
                # tab-separated text - presumably the export is TSV with an
                # .xls extension; confirm against a real download
                df_report = pd.read_csv(
                    get_most_recent_file_in_dir(csv_download_folder_path),
                    sep='\t',
                    **kwargs)
            finally:
                # always release the browser, including on failure
                self.driver.close()
        finally:
            # only remove the folder when we created it ourselves
            if not write_to_disk:
                shutil.rmtree(csv_download_folder_path)

        # if the dataframe is empty (the report had no data), raise an error
        if df_report.shape[0] == 0:
            raise ValueError('No data in report for user {} at url: {}'.format(
                self.username, report_download_url))

        return df_report
Example #6
0
    def download_url_report(self,
                            report_url,
                            school_year,
                            temp_folder_name=None,
                            pandas_read_csv_kwargs={}):
        """ Downloads a SchoolMint data-stream-table report.

        Args:
            report_url (string): Information pertaining to the path and query
                string for the report whose access is desired. Any filtering
                that can be done with a stateful URL should be included.
            school_year (string): The SchoolMint school year to download from (e.g. '2018-2019')
            temp_folder_name (string): The name for a sub-directory in which the files from the
                browser will be temporarily stored. If omitted, a fresh temporary directory is
                created under self.temp_folder_path. NOTE: This sub-directory is deleted,
                together with its contents, before this function returns or raises.
            pandas_read_csv_kwargs: additional arguments to pass to Pandas read_csv. The dict
                is only read, never modified.

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            ReportNotFound: If check_school_year fails once the report page has loaded.
            TimeoutError: If the report-data-summary text has not stabilised within
                roughly self.wait_time seconds.
            ValueError: If the downloaded report contains no rows.
        """
        if temp_folder_name:
            csv_download_folder_path = self.temp_folder_path + '/' + temp_folder_name
        else:
            csv_download_folder_path = mkdtemp(dir=self.temp_folder_path)

        report_page_url = interpret_report_url(self.base_url, report_url)

        try:
            # set up the driver for execution
            self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                     self.headless)
            try:
                self._login()
                self._set_year(school_year, self.driver)

                # get the report url
                self.driver.get(report_page_url)
                self.__remove_walk_me_and_support()

                # wait until we have rows in the stream data table before
                # starting to look for results
                WebDriverWait(self.driver, self.wait_time).until(
                    EC.presence_of_element_located(
                        (By.XPATH,
                         "//*[@id='stream-table']/tbody/tr[1]/td[1]")))

                if not self.check_school_year(school_year):
                    raise ReportNotFound(
                        "Wrong school detected prior to clicking generate.")

                self.log.debug('Waiting for report-data-summary to load')
                # wait until the stream table is fully loaded before
                # downloading: poll the summary text until it stops changing
                prev_data_summary_elem = self.driver.find_element_by_id(
                    'report-data-summary').text
                time.sleep(1)
                # we use the following count as a proxy for time elapsed, so
                # we can use the class's wait_time as the number of retries
                count = 0
                while True:
                    # check id=report-data-summary
                    report_data_summary_elem = self.driver.find_element_by_id(
                        'report-data-summary').text

                    # if it matches, wait a little longer and double check
                    # that it hasn't changed
                    if prev_data_summary_elem == report_data_summary_elem:
                        time.sleep(3)
                        count += 3
                        report_data_summary_elem = (
                            self.driver.find_element_by_id(
                                'report-data-summary').text)
                        if prev_data_summary_elem == report_data_summary_elem:
                            break
                    prev_data_summary_elem = report_data_summary_elem
                    time.sleep(1)

                    count += 1
                    if count >= self.wait_time:
                        raise TimeoutError(
                            'SchoolMint Report Data did not fully load '
                            'within %d seconds' % self.wait_time)

                # click the button to download the report
                self.log.debug('Starting download...')
                self.driver.find_element_by_class_name("export-table").click()

                # wait until file has downloaded to close the browser
                wait_for_any_file_in_folder(csv_download_folder_path, "csv")

                self.log.debug('Download finished.')
                report_df = pd.read_csv(
                    get_most_recent_file_in_dir(csv_download_folder_path),
                    encoding=SCHOOLMINT_DEFAULT_EXPORT_ENCODING,
                    **pandas_read_csv_kwargs)
            finally:
                # close the driver for this task, including on failure
                self.driver.close()
        finally:
            # Remove the download folder exactly once. The previous code
            # removed it a second time on the empty-report path, which raised
            # FileNotFoundError instead of the intended ValueError.
            # ignore_errors keeps cleanup from masking an earlier exception
            # (e.g. when the folder was never created).
            # TODO: move this out of this function. It should happen as
            # cleanup once the whole DAG has completed
            shutil.rmtree(csv_download_folder_path, ignore_errors=True)

        # if the dataframe is empty (the report had no data), raise an error
        if report_df.shape[0] == 0:
            raise ValueError('No data in report for user {} at url: {}'.format(
                self.username, report_page_url))

        return report_df
Example #7
0
    def download_site_data_download(
            self,
            dl_heading,
            site_id,
            academic_year,
            report_generation_wait=REPORT_GENERATION_WAIT,
            write_to_disk=None,
            **kwargs):
        """ Downloads a CSV from a site's 'data_downloads' page.

        Args:
            dl_heading (string): Text contained in the <h3> heading of the
                download section whose CSV is desired.
            site_id: The site identifier used to build the page URL
                ({base_url}/sites/{site_id}/data_downloads/).
            academic_year: The academic year to select on the page before
                downloading; passed to _set_dl_academic_year.
            report_generation_wait: Seconds to wait for the 'Download' link to
                appear. If it does not appear, the page is refreshed and the
                wait is repeated once.
            write_to_disk (string): The path for a directory to store the
                downloaded file. If nothing is provided, the file will be
                stored in a temporary directory and deleted at the end of
                this function.
            **kwargs: additional arguments to pass to Pandas read_csv

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            ValueError: If check_dl_academic_year reports that the academic
                year was not set.
            NoDataError: If the downloaded report contains no rows.
        """
        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = mkdtemp()

        section_xpath = ("//h3[contains(text(), '{dl_heading}')]"
                         "/parent::div/parent::div")
        # the old interface uses links (<a>), the new one uses <button>s
        download_button_xpath = (
            section_xpath + "//a[contains(text(), '{button_text}')]")
        gen_button_xpath = (
            section_xpath + "//button[contains(text(), '{button_text}')]")
        dl_button_xpath = download_button_xpath.format(
            dl_heading=dl_heading, button_text='Download')

        try:
            self.driver = DriverBuilder().get_driver(csv_download_folder_path,
                                                     self.headless)
            try:
                self._login()

                dl_page_url = (
                    "{base_url}/sites/{site_id}/data_downloads/".format(
                        base_url=self.base_url, site_id=site_id))
                self.driver.get(dl_page_url)

                self._set_dl_academic_year(academic_year)
                if not self.check_dl_academic_year(academic_year):
                    raise ValueError("Academic Year not correctly set")

                # start the CSV generation process
                # try to find the "Download CSV" button - old version of the
                # interface
                old_interface = False
                try:
                    elem = self.driver.find_element_by_xpath(
                        download_button_xpath.format(
                            dl_heading=dl_heading,
                            button_text='Download CSV'))
                    old_interface = True
                    self.log.info("'Download CSV' interface detected.")
                    elem.click()
                except NoSuchElementException:
                    pass

                # try to find the "Generate CSV" button - new version of the
                # interface
                if not old_interface:
                    try:
                        elem = self.driver.find_element_by_xpath(
                            gen_button_xpath.format(
                                dl_heading=dl_heading,
                                button_text='Generate CSV'))
                        self.log.info("'Generate CSV' interface detected.")
                        elem.click()
                    # if it's not there, it may have changed to a "Refresh"
                    # button
                    except NoSuchElementException:
                        try:
                            # NOTE(review): a 'Download' button is only looked
                            # up, not clicked - presumably the report is
                            # already generated and the wait below handles the
                            # click; confirm
                            self.driver.find_element_by_xpath(
                                gen_button_xpath.format(
                                    dl_heading=dl_heading,
                                    button_text='Download'))
                        except NoSuchElementException:
                            elem = self.driver.find_element_by_xpath(
                                gen_button_xpath.format(
                                    dl_heading=dl_heading,
                                    button_text='Refresh'))
                            elem.click()

                # wait for the refresh command to be issued
                time.sleep(1)

                # wait for the report to be available and download it
                self.log.info(
                    'Starting download of report "{}" for site_id "{}"'.format(
                        dl_heading, site_id))

                try:
                    elem = WebDriverWait(
                        self.driver, report_generation_wait).until(
                            EC.presence_of_element_located(
                                (By.XPATH, dl_button_xpath)))
                    elem.click()
                # if the download is not ready, refresh the page and try one
                # more time
                except TimeoutException:
                    self.driver.refresh()
                    elem = WebDriverWait(
                        self.driver, report_generation_wait).until(
                            EC.presence_of_element_located(
                                (By.XPATH, dl_button_xpath)))
                    elem.click()

                wait_for_any_file_in_folder(csv_download_folder_path, "csv")
                self.log.debug('Download Finished.')

                df_report = pd.read_csv(
                    get_most_recent_file_in_dir(csv_download_folder_path),
                    **kwargs)
            finally:
                # always release the browser, including on failure
                self.driver.close()
        finally:
            # only remove the folder when we created it ourselves
            if not write_to_disk:
                shutil.rmtree(csv_download_folder_path)

        # if the dataframe is empty (the report had no data), raise an error
        if df_report.shape[0] == 0:
            raise NoDataError('No data in report "{}" for site_id "{}"'.format(
                dl_heading, site_id))

        return df_report
Example #8
0
    def download_url_report(self, report_url, temp_folder_name):
        """ Downloads an Informed K12 report.

        The whole browser session is retried (up to 10 attempts in total)
        whenever Selenium raises a WebDriverException.

        Args:
            report_url (string): Information pertaining to the path and query
                string for the report whose access is desired. Any filtering
                that can be done with a stateful URL should be included.
            temp_folder_name (string): The name of the folder in which this
                specific report's download files should be stored.

        Returns: A Pandas DataFrame of the report contents.

        Raises:
            ValueError: If the report page shows 'No submissions'.
            WebDriverException: If the final attempt still fails.
        """
        csv_download_folder_path = self.temp_folder_path + '/' + temp_folder_name
        full_report_url = interpret_report_url(self.base_url, report_url)

        max_attempts = 10
        for attempt in range(1, max_attempts + 1):
            try:
                return self._download_url_report_attempt(
                    full_report_url, csv_download_folder_path)
            except WebDriverException:
                if attempt == max_attempts:
                    raise

    def _download_url_report_attempt(self, full_report_url,
                                     csv_download_folder_path):
        """Run one browser session that downloads and parses the report.

        The driver is always closed before this method returns or raises.
        """
        # set up the driver for execution
        self.driver = configure_selenium_chrome(csv_download_folder_path)
        try:
            self._login()
            time.sleep(2)

            # get the report url
            self.driver.get(full_report_url)

            # check to see if there are no submissions. If so, abort by
            # exception
            try:
                self.driver.find_element_by_xpath(
                    "//h2[contains(text(), 'No submissions')]")
            except NoSuchElementException:
                # We actually don't want to find this.
                pass
            else:
                raise ValueError(
                    'No data in report for user {} at url: {}'.format(
                        self.username, full_report_url))

            # wait until we have rows in the responses data table before
            # starting to look for results, then tick 'select all'
            elem = WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    "//*[@class='responses-table']/table/thead/tr[1]/*[@class='checkboxes']/input"
                )))
            elem.click()

            # check to see if a new link populates to 'select all filtered
            # submissions' (happens if more than 50 submissions)
            try:
                elem = self.driver.find_element_by_xpath(
                    "//*[@class='responses-bulk-actions']/*[@class='select-link']"
                )
                elem.click()
            # NOTE: this must name the exception class; the previous
            # `except NoSuchElementException():` raised TypeError whenever
            # the link was absent
            except NoSuchElementException:
                pass

            # click download
            elem = self.driver.find_element_by_xpath(
                "//*[contains(text(), 'Download') and @class='hidden-xs']")
            elem.click()

            # click 'As a spreadsheet'
            elem = self.driver.find_element_by_xpath(
                "//*[@class='dropdown-menu dropdown-menu-right']//*[contains(text(), 'As a spreadsheet')]"
            )
            elem.click()

            # activate the menu that allows 'select all'. The element moves
            # while the dialog renders, so we sleep to let it settle first
            time.sleep(0.5)
            elem = WebDriverWait(self.driver, self.wait_time).until(
                EC.visibility_of_element_located((
                    By.XPATH,
                    "//*[@class='dropdown-toggle']/*[contains(text(), 'columns')]/i"
                )))
            elem.click()

            # click on 'select all'
            elem = self.driver.find_element_by_xpath(
                "//*[@class='dropdown-menu dropdown-menu-right']//*[contains(text(), 'Select all')]"
            )
            elem.click()

            # wait a moment for the info to populate
            time.sleep(2)

            # click the final download button, retrying once per second (10
            # tries in total) until it appears
            max_button_tries = 10
            for button_try in range(1, max_button_tries + 1):
                try:
                    elem = self.driver.find_element_by_xpath(
                        "//*[@class='btn btn-primary' and contains(text(), 'Download')]"
                    )
                    elem.click()
                    break
                except NoSuchElementException:
                    if button_try == max_button_tries:
                        raise
                    time.sleep(1)

            # wait until file has downloaded to close the browser. We can do
            # this because we delete the file before we return it, so the
            # temp dir should always be empty when this command is run
            wait_for_any_file_in_folder(csv_download_folder_path, 'csv')

            try:
                report_df = pd.read_csv(
                    get_most_recent_file_in_dir(csv_download_folder_path))
            finally:
                # delete the downloaded files even when parsing fails, so a
                # stale file is never picked up by the next run
                # TODO: move this out of this function. It should happen as
                # cleanup once the whole DAG has completed
                delete_folder_contents(csv_download_folder_path)
        finally:
            # always release the browser, including on the last failed attempt
            self.driver.close()

        return report_df
Example #9
0
    def download_data_shared_with_application(self, application_page_url, collection,
                                              write_to_disk=None, **kwargs):
        """
        Downloads the data shared with a particular application through Clever.
        :param application_page_url: The url for the main Clever management page for a
            particular application. For example, for My Lexia, this would be
            https://schools.clever.com/applications/lexia-mylexia
        :param collection: A string of 'schools', 'students', 'sections', 'teachers', 'schooladmins'
            that indicates which shared data to download. Case and spaces are ignored.
        :param write_to_disk: A path to a directory where the downloaded CSV should be saved.
            If nothing is passed, it will not be saved and only a Pandas DataFrame will be returned.
        :param kwargs: Additional keyword arguments to be passed to the Pandas read_csv function.
        :return: A Pandas DataFrame of the indicated collection download.
        :raises ReportNotFound: If collection is not one of the supported names.
        :raises ValueError: If the download has no rows and collection is not 'schooladmins'
            (an empty 'schooladmins' download only emits a warning).
        """
        collection = collection.lower().replace(' ', '')
        if collection not in ['schools', 'students', 'sections', 'teachers', 'schooladmins']:
            raise ReportNotFound(
                (
                    "Argument for collection '{collection}' is not a valid. Please choose from: "
                    "'schools', 'students', 'sections', 'teachers', 'schooladmins'."
                ).format(collection=collection)
            )
        report_access_page_url = interpret_report_url(self.base_url, application_page_url)

        if write_to_disk:
            csv_download_folder_path = write_to_disk
        else:
            csv_download_folder_path = mkdtemp()

        try:
            self.driver = DriverBuilder().get_driver(csv_download_folder_path, self.headless)
            try:
                self._login()

                self.log.debug('Getting report access page at: {}'.format(report_access_page_url))
                self.driver.get(report_access_page_url)

                # find and click the download button based on the collection desired
                download_link = WebDriverWait(self.driver, self.wait_time).until(
                    EC.presence_of_element_located(
                        (By.XPATH,
                         "//a[contains(@href, '{collection}.csv')]".format(collection=collection))
                    )
                )
                self.log.info(
                    'Starting download of: {} - {}'.format(report_access_page_url, collection))
                download_link.click()

                wait_for_any_file_in_folder(csv_download_folder_path, "csv")
                self.log.info('Download Finished.')

                df_report = pd.read_csv(get_most_recent_file_in_dir(csv_download_folder_path),
                                        **kwargs)
            finally:
                # always release the browser, including on failure
                self.driver.close()
        finally:
            # only remove the folder when we created it ourselves
            if not write_to_disk:
                shutil.rmtree(csv_download_folder_path)

        # if the dataframe is empty (the report had no data), raise an error;
        # an empty 'schooladmins' collection is tolerated with a warning
        if df_report.shape[0] == 0:
            if collection != 'schooladmins':
                raise ValueError('No data in report for user {} at url: {}'.format(
                    self.username, report_access_page_url))
            warnings.warn("The 'schooladmins' collection has no data. Ensure that no school admins are shared.")

        return df_report