def open_browser(self):
        """
        Launch a Chrome webdriver and store it on ``self.browser``.

        Chrome is started headless (with GPU disabled) unless
        ``self.show_browser_ui`` is truthy. A status line is printed first
        when ``self.verbose`` is truthy.
        """
        if self.verbose:
            print('Opening browser...')

        # Hide the browser UI unless the caller asked to see it
        chrome_opts = _Options()
        run_hidden = not self.show_browser_ui
        if run_hidden:
            for flag in ('--headless', '--disable-gpu'):
                chrome_opts.add_argument(flag)

        # Start Chrome and keep the driver for later use
        self.browser = _webdriver.Chrome(chrome_options=chrome_opts)
    def _get_archive_dates(self):
        """
        Populate ``self.start_date`` and ``self.end_date`` from the archive
        calendar.

        Opens ``self.archive_url`` in a temporary Chrome session, builds an
        ``ArchiveCalendar`` with ``get_dates=True``, and copies its
        ``start_date`` / ``end_date`` onto this object. The calendar reference
        is cleared once the browser session has ended, then this object is
        printed.
        """
        print(f'Initializing calendar navigation for {self.feed_name}...')

        # Hide the browser UI unless the caller asked to see it
        chrome_opts = _Options()
        if not self.show_browser_ui:
            for flag in ('--headless', '--disable-gpu'):
                chrome_opts.add_argument(flag)

        # Drive Chrome only for as long as it takes to read the date range
        with _webdriver.Chrome(executable_path=self.webdriver_path,
                               chrome_options=chrome_opts) as session:
            session.get(self.archive_url)

            calendar = ArchiveCalendar(self, session, get_dates=True)
            self.archive_calendar = calendar
            self.start_date = calendar.start_date
            self.end_date = calendar.end_date

        # The calendar was built around the now-finished browser session
        self.archive_calendar = None

        print('Initialization complete.\n')
        print(self)
    def build(self,
              start=None,
              end=None,
              days_back=None,
              chronological=False,
              rebuild=False):
        """
        Build archive entry data for the BroadcastifyArchive's feed and
        store it as a list of dictionaries in the .entries attribute.

        Each dictionary has the keys 'uri', 'start_time' and 'end_time'.
        After a build, .earliest_entry and .latest_entry hold the dates of
        the smallest and largest 'end_time' values found.

        Parameters
        ----------
            start : datetime.date
                The earliest date for which to populate the archive. If None,
                go from the earliest date on the calendar (inclusive).
            end : datetime.date
                The latest date for which to populate the archive. If None,
                go to the latest date on the calendar (inclusive).
            days_back : int
                The number of days before the archive's end date to retrieve
                information for. A value of `0` retrieves only archive
                entries corresponding to that end date. Pass either
                days_back OR a valid combination of start/end dates.
            chronological : bool
                By default, start with the latest date and work backward in
                time. If True, reverse that.
            rebuild : bool
                Specifies that existing data in the `entries` list should be
                overwritten with data newly fetched from Broadcastify.

        Raises
        ------
            ValueError
                If entries already exist and `rebuild` is False, or if
                `days_back` was passed together with `start` and/or `end`.
            TypeError
                If `days_back` is negative or cannot be compared to 0.
            AttributeError
                If `start` or `end` fall outside the archive's date range,
                or if `start` is after `end`.
        """
        # Prevent the user from unintentionally erasing existing archive info
        if self.entries and not rebuild:
            raise ValueError(
                'Archive already built: Entries already exist for'
                ' this BroadcastifyArchive. To erase and rebuild,'
                ' specify `rebuild=True` when calling .build()')

        # Make sure valid arguments were passed
        ## Either start/end or days_back; not both. Compare against None so
        ## that a legitimate `days_back=0` still counts as "passed" instead
        ## of silently discarding the caller's start/end.
        if (start or end) and days_back is not None:
            raise ValueError('Expected either `days_back` OR a `start`/`end` '
                             'combination. Both were passed.')

        if days_back is not None:
            ## `days_back` must be a non-negative integer
            try:
                bad_days_back = bool(days_back < 0)
            except (TypeError, ValueError):
                # Not comparable to an int (e.g. a string) or has no single
                # truth value
                bad_days_back = True

            if bad_days_back:
                raise TypeError('`days_back` must be a non-negative integer.')

            # Capture the archive end date to count back from
            end = self.end_date

            # Make sure days_back is no larger than the archive date range size
            archive_size = (end - self.start_date).days
            if days_back > archive_size:
                _warnings.warn(
                    f"The number of days_back passed ({days_back}) "
                    f"exceeds the size of the archive's date range ("
                    f"{archive_size}). Only valid dates will be "
                    f"built.")
                days_back = archive_size

        else:
            ## Check that `start` and `end` within archive's start/end dates
            ## If they weren't passed, set them to the archive's start/end dates
            out_of_range = ''

            if start:
                if start < self.start_date:
                    out_of_range = (f'start date out of archive range: '
                                    f'{start} < {self.start_date}\n')
                elif start > self.end_date:
                    out_of_range = (f'start date out of archive range: '
                                    f'{start} > {self.end_date}\n')
            else:
                start = self.start_date

            if end:
                if end > self.end_date:
                    out_of_range += (f'end date out of archive range: '
                                     f'{end} > {self.end_date}')
                elif end < self.start_date:
                    out_of_range += (f'end date out of archive range: '
                                     f'{end} < {self.start_date}')
            else:
                end = self.end_date

            if out_of_range:
                raise AttributeError(out_of_range)

            ## `start` cannot be > `end`
            if start > end:
                raise AttributeError(f'`start` date ({start}) cannot be after '
                                     f'`end` date ({end}).')

            # Get size of the date range
            days_back = (end - start).days

        # Adjust for exclusive end of range()
        days_back += 1

        # Build the list of dates to scrape, newest first unless the caller
        # asked for chronological order
        date_list = sorted(
            [end - _dt.timedelta(days=x) for x in range(days_back)],
            reverse=not chronological)

        archive_entries = []

        # Spin up a browser and an ArchiveCalendar
        # Set whether to show browser UI while fetching
        print('Launching webdriver...')
        options = _Options()
        if not self.show_browser_ui:
            options.add_argument('--headless')
            options.add_argument('--disable-gpu')

        with _webdriver.Chrome(executable_path=self.webdriver_path,
                               chrome_options=options) as browser:
            browser.get(self.archive_url)
            self.arch_cal = ArchiveCalendar(self, browser)

            # Get archive entries for each date in list
            t = _tqdm(date_list,
                      desc='Building dates',
                      leave=True,
                      dynamic_ncols=True)
            for date in t:
                t.set_description(f'Building {date}', refresh=True)
                self.arch_cal.go_to_date(date)

                if self.arch_cal.entries_for_date:
                    archive_entries.extend(self.arch_cal.entries_for_date)

        # Replace the current archive entries with the URIs and start/end
        # times just fetched
        self.entries = [
            {
                'uri': entry[0],
                'start_time': entry[1],
                'end_time': entry[2]
            }
            for entry in archive_entries
        ]

        if self.entries:
            end_times = [entry['end_time'] for entry in self.entries]
            self.earliest_entry = min(end_times).date()
            self.latest_entry = max(end_times).date()
        else:
            # Nothing was found for the requested dates; min()/max() would
            # raise on an empty list, and stale values from a previous build
            # would be misleading.
            # NOTE(review): assumes None is the "unset" value for these
            # attributes -- confirm against the class initializer.
            self.earliest_entry = None
            self.latest_entry = None

        print(self)
# Example no. 4
# 0
    def oauth_authenticate(self, client_id, expiration):
        """
        Authenticate with RC View single-sign-on and return an access token.

        If ``self._existing_tokens`` is truthy, its 'refresh_token' and
        'token' values are stored on this object and the token is returned
        without opening a browser. Otherwise a headless Chrome session walks
        through the Red Cross sign-in form using ``self._username`` and
        ``self._password``, reads the authorization code from the resulting
        page, and exchanges it via ``self.post('oauth2/token', ...)``.

        Parameters
        ----------
            client_id : str
                Client id sent with both the authorize and token requests.
            expiration
                Currently unused: the authorize request always sends
                ``'expiration': -1``.
                NOTE(review): confirm whether this argument was meant to be
                forwarded instead of the hard-coded value.

        Returns
        -------
            The access token, which is also stored in ``self._token`` (the
            refresh token is stored in ``self._refresh_token``).

        Raises
        ------
            _TimeoutException
                If any expected page element fails to appear within the wait
                delay. Previously this path fell through and failed with an
                UnboundLocalError on the missing element.
        """
        if _print_messages:
            self._spinner.text = 'Authenticating user'

        # Reuse tokens that were supplied up front
        if self._existing_tokens:
            self._refresh_token = self._existing_tokens['refresh_token']
            self._token = self._existing_tokens['token']
            return self._token

        parameters = {
            'client_id': client_id,
            'response_type': 'code',
            'expiration': -1,
            'redirect_uri': 'urn:ietf:wg:oauth:2.0:oob'
        }

        url = self.baseurl + 'oauth2/authorize'
        paramstring = _urlencode(parameters)
        codeurl = "{}?{}".format(url, paramstring)

        options = _Options()
        options.set_headless(True)
        options.add_argument('--log-level=3')
        driver = _webdriver.Chrome(chrome_options=options)

        delay = 10
        sso_timeout = 'Accessing Red Cross single-sign-on took too much time.'
        code_timeout = 'Receiving an authentication code took too much time.'

        # Absolute XPath of the sign-in form container; the username,
        # password and submit controls are its 4th, 5th and 6th child divs.
        form_root = ('/html/body/main/div[4]/div/div/div/div/div/div/div'
                     '/div[1]/div/div/div')

        def wait_for(locator, failure_message):
            # Wait for one element; report and re-raise on timeout so the
            # caller never continues with a missing element.
            try:
                return _WebDriverWait(driver, delay).until(
                    _EC.presence_of_element_located(locator))
            except _TimeoutException:
                if _print_messages:
                    self._spinner.fail(failure_message)
                raise

        # The finally clause guarantees the browser is closed on every exit
        # path, including timeouts and unexpected errors.
        try:
            driver.get(codeurl)

            using_redcross_element = wait_for((_By.ID, 'idp_Name'),
                                              sso_timeout)
            using_redcross_element.click()

            username_element = wait_for(
                (_By.XPATH, form_root + '/div[4]/input'), sso_timeout)
            password_element = wait_for(
                (_By.XPATH, form_root + '/div[5]/input'), sso_timeout)
            signin_element = wait_for(
                (_By.XPATH, form_root + '/div[6]/button'), sso_timeout)

            username_element.send_keys(self._username)
            password_element.send_keys(self._password)
            signin_element.click()

            code_element = wait_for((_By.ID, 'code'), code_timeout)
            code = code_element.get_attribute('value')
        finally:
            driver.quit()

        # Exchange the authorization code for access and refresh tokens
        parameters = {
            'client_id': client_id,
            'grant_type': 'authorization_code',
            'code': code,
            'redirect_uri': 'urn:ietf:wg:oauth:2.0:oob'
        }

        token_info = self.post('oauth2/token',
                               parameters,
                               ssl=True,
                               add_token=False)
        self._refresh_token = token_info['refresh_token']
        self._token = token_info['access_token']
        return self._token