def authenticate(self, provider):
     """
     Step 1:: Authenticate and login to provider's portal.

     Primes a session with the login page's cookies, posts the credentials,
     and extracts the CSRF key that BONY requires on every later request.

     :param provider: provider key into auth/access configuration
     :return: tuple of (requests session with login cookies,
              dict with 'for_next_params' marker and the extracted 'csrfKey')
     :raises DownloadException: 2000_AUTHENTICATION_FAILED on any login error
     """
     logging.debug('BonyDownloader:authenticate')
     auth_config = self.configs.auth_config[provider]
     access_config = self.configs.access_config[provider]
     session = requests.Session()
     logging.debug(
         f':::1 Connect to {access_config["login-url"]} and get cookies')
     # Initial GET primes the session with the site's cookies.
     session.get(access_config['login-url'])
     logging.debug(f':::2 Call {access_config["auth-url"]} page')
     # requests will use the available cookies from session
     try:
         res1 = session.post(access_config["auth-url"], data=auth_config)
         if self._login_failed(provider, res1):
             raise DownloadException(
                 '2000_AUTHENTICATION_FAILED',
                 custom_message=f"Authentication failed for {provider}")
         logging.debug(f'Login status :: {res1.status_code}')
         # BONY request need certificate key for each request
         f_html = self.utils.format_html(res1.text)
         tree = lxml.html.fromstring(f_html)
         csrf_key = tree.xpath(
             '//form[@name="NavForm"]/input[@name="csrfKey"]/@value')[0]
     except DownloadException:
         # Fix: don't re-wrap the login-failure exception below — that
         # would discard its provider-specific custom message.
         raise
     except Exception as e:
         raise DownloadException('2000_AUTHENTICATION_FAILED', e) from None
     return session, {'for_next_params': True, 'csrfKey': csrf_key}
# Example #2
 def authenticate(self, provider):
     """
     Step 1:: Authenticate and login to provider's portal.

     Primes a session with the login page's cookies, then posts the
     provider's credentials to the auth URL.

     :param provider: provider key into auth/access configuration
     :return: tuple of (requests session with login cookies, None — no
              extra state is needed for subsequent requests)
     :raises DownloadException: 2000_AUTHENTICATION_FAILED on any login error
     """
     logging.debug('FileDownloader:authenticate')
     auth_config = self.configs.auth_config[provider]
     access_config = self.configs.access_config[provider]
     session = requests.Session()
     logging.debug(
         f':::1 Connect to {access_config["login-url"]} and get cookies')
     # Initial GET primes the session with the site's cookies.
     session.get(access_config['login-url'])
     # logging.info(f'Session cookies {s.cookies.get_dict()}')
     logging.debug(f':::2 Call {access_config["auth-url"]} page')
     # requests will use the available cookies from session
     try:
         res1 = session.post(access_config["auth-url"], data=auth_config)
         if self._login_failed(provider, res1):
             raise DownloadException(
                 '2000_AUTHENTICATION_FAILED',
                 custom_message=f"Authentication failed for {provider}")
         logging.debug(f'Login status :: {res1.status_code}')
         # logging.info(f'Page Details:::: {res1.status_code}, {res1.cookies.get_dict()}, res1.headers, {res1.text}')
     except DownloadException:
         # Fix: don't re-wrap the login-failure exception below — that
         # would discard its provider-specific custom message.
         raise
     except Exception as e:
         raise DownloadException('2000_AUTHENTICATION_FAILED', e) from None
     return session, None
# Example #3
 def download_files(self, session, **opts):
     """
     Step 5:: Download every collected URL into the output directory.

     Walks opts['access_urls'] and, for each entry carrying a
     'download_urls' list, fetches each file (reusing the authenticated
     session when one is supplied) and writes it to its output path.

     :param session: session object site cookies (may be falsy for plain GETs)
     :param opts: user/commandline inputs + a_url['deal_info_dict_list'] + a_url['download_urls']
     :return: None
     :raises DownloadException: 6001_FILE_DOWNLOAD_NO_FILE when nothing was downloaded
     """
     logging.debug('FileDownloader:Download files')
     downloaded = 0
     for access_url in opts['access_urls']:
         if 'download_urls' not in access_url:
             continue
         for dl in access_url['download_urls']:
             file_url = dl.file_url
             target = self.utils.out_file(file_url, dl.out_file)
             if not session:
                 # No authenticated session: plain streamed GET.
                 response = requests.get(dl.file_url,
                                         stream=True)
             elif dl.method and 'POST' in dl.method:
                 response = session.post(file_url, data=dl.params)
             else:
                 response = session.get(file_url)
             self._download(target, response)
             logging.info(
                 f"[Report: {dl.report_group}] [{target}] downloaded"
             )
             downloaded += 1
     if downloaded == 0:
         raise DownloadException('6001_FILE_DOWNLOAD_NO_FILE')
# Example #4
def validate_n_format(configs, **opts):
    """
    Validate and format the command-line parameters provided by the user.

    Lowercases the provider string (to match configuration keys) and
    defaults the time span to 'latest' before validating both.

    NOTE(review): parameters arrive via **opts, so the lowercased/defaulted
    values are local to this function and never reach the caller — confirm
    callers normalize 'provider'/'tspan' themselves before lookups.

    :param configs: reference to Config class object, which has all config details
    :param opts: commandline input parameters
    :return: None
    :raises DownloadException: 1001_INVALID_PROVIDER or
        1002_INVALID_TIME_SPAN_FORMAT on invalid input
    """
    opts['provider'] = opts['provider'].lower()
    opts['tspan'] = opts['tspan'].lower() if opts['tspan'] else 'latest'
    # Validations for provider, tspan and other input variables
    utils = Utils()
    # Idiom fix: membership test directly on the dict (no .keys()); the
    # second check needs no elif since the first raises.
    if opts['provider'] not in configs.auth_config:
        raise DownloadException('1001_INVALID_PROVIDER')
    if not utils.is_valid_time_span(opts['tspan']):
        raise DownloadException('1002_INVALID_TIME_SPAN_FORMAT')
# Example #5
 def _download(self, o_file, response):
     """
     Download file to output directory, raise exception of login session has expired
     :param o_file: output file path
     :param response: response
     :return: None
     """
     try:
         if response.status_code == 200:
             with open(o_file, 'wb') as output:
                 output.write(response.content)
         else:
             raise DownloadException(
                 '6000_FILE_DOWNLOAD_FAILED',
                 custom_message=f'File not available {response.status_code}'
             )
     except Exception as e:
         raise DownloadException('6000_FILE_DOWNLOAD_FAILED', e) from None
 def _dict_for_next_url(self, input_dict, tree):
     # print(f'table.text :: {etree.tostring(tree)}')
     result_dict = dict()
     for k, xp in input_dict.items():
         try:
             xp_result = tree.xpath(xp)
             result_dict[k] = ''.join(xp_result).strip()
         except Exception as e:
             raise DownloadException('3000_ACCESS_FAILED', e)
     return result_dict
# Example #7
 def parse(self, **opts):
     """
     method parses the 'ubn' specific page source using xpath from access-configs, after method execution
         a_url['download_urls'] appended to opts dictionary
     :param opts: user/commandline inputs + a_url['deal_info_dict_list']
     :return: None (results stored in a_url['download_urls'])
     :raises DownloadException: 4000_PARSING_FAILED if xpath evaluation fails
     """
     logging.debug('UbnDownloader:parse')
     provider = opts['provider']
     out_dir = opts['output']
     access_config = self.configs.access_config[provider]
     site_url = access_config['site-url']
     for a_url in opts['access_urls']:
         download_urls = list()
         for xpath in a_url['xpath']:
             for deal_info_dict in a_url['deal_info_dict_list']:
                 # Only entries that captured page source during the access
                 # step are parsed here.
                 if 'f_html' in deal_info_dict:
                     tree = lxml.html.fromstring(deal_info_dict['f_html'])
                     try:
                         tr = tree.xpath(xpath)
                     except Exception as e:
                         raise DownloadException('4000_PARSING_FAILED', e)
                     # search_data accumulates '||'-separated cell text for
                     # the current report row; yyyy_mon holds its period.
                     search_data = ''
                     yyyy_mon = ''
                     for td in tr:
                         td_txt = td.text.strip()
                         if td_txt:
                             if 'Investor Report' in td_txt:
                                 # A new report row starts: reset accumulator.
                                 search_data = ''
                                 search_data += td_txt + '||'
                             else:
                                 dt = self.utils.validate_date(td_txt)
                                 if dt[0]:
                                     # Cell parsed as a date: remember
                                     # 'YYYY-M' for the output path and add
                                     # 'Mon YYYY' to the search data.
                                     yyyy_mon = str(dt[1].year) + '-' + str(
                                         dt[1].month)
                                     search_data += dt[1].strftime(
                                         "%b") + ' ' + str(
                                             dt[1].year) + '||'
                                 else:
                                     search_data += td_txt + '||'
                         # Pair each anchor's href with its link text
                         # (pattern zip(*[iter(x)]*2) groups in twos).
                         w_td = td.xpath('a/@href | a/text()')
                         for href, a_text in zip(*[iter(w_td)] * 2):
                             logging.debug(
                                 f'href:: {href} a_text:: {a_text}')
                             f_url = site_url + href.strip()
                             # Output path:
                             # <out>/<YYYY-M>/<provider>/<deal>/<deal>-Investor-Report-<YYYY-M>.<ext>
                             # NOTE(review): yyyy_mon may still be '' if no
                             # date cell preceded this link — the path would
                             # then contain '//'. Confirm row layout.
                             o_file = out_dir + '/' + yyyy_mon + '/' + provider + '/' + deal_info_dict[
                                 'dealName']
                             o_file += '/' + deal_info_dict[
                                 'dealName'] + '-Investor-Report-' + yyyy_mon
                             o_file += '.' + a_text.strip()
                             download_urls.append(
                                 DownloadUrl(f_url, o_file, search_data,
                                             deal_info_dict['dealName']))
             # del a_url['f_html']
             a_url['download_urls'] = download_urls
# Example #8
 def access(self, session, **opts):
     """
     Step 2:: Pull access URL/s from configs file and use it to pull page source which has URLs for file download
         after method execution a_url['deal_info_dict_list'] appended to opts dictionary
     TODO Use namedtuple DealInfo to make current dictionary generic to all providers
     :param session: session with site cookies
     :param opts: user/commandline inputs
     :return: None
     :raises DownloadException: 3000_ACCESS_FAILED when a request fails
     """
     logging.debug('FileDownloader:access')
     provider = opts['provider']
     # Values scraped from one page that the next page's URL needs.
     previous_url_results = list()
     for a_url in opts['access_urls']:
         logging.debug(f':::3 Send request to {a_url} page')
         deal_info_list = list()
         # Pull input parameters to append as a query string
         if len(a_url['input-param']) > 0:
             # Prefer a per-run user_input_config in opts over the static
             # provider configuration.
             user_config = opts[
                 'user_input_config'] if 'user_input_config' in opts else None
             user_inputs = user_config[
                 'input'] if user_config else self.configs.user_input_config[
                     provider]['input']
             deal_info_list = self._append_query_str_to_url(
                 a_url, user_inputs)
         else:
             # NOTE(review): a bare URL string is appended here while the
             # loop below indexes deal_info['link'] — presumably
             # _urls_with_previous_result normalizes entries to dicts;
             # confirm.
             deal_info_list.append(a_url['url'])
         # Update URL with value pulled from previous page
         deal_info_list = self._urls_with_previous_result(
             a_url, deal_info_list, previous_url_results)
         for deal_info in deal_info_list:
             link = deal_info['link']
             params = deal_info['params'] if 'params' in deal_info else {}
             try:
                 if a_url['method'] == 'POST':
                     res = session.post(link, data=params)
                 elif a_url['method'] == 'GET':
                     res = session.get(link, params=params)
             except Exception as e:
                 raise DownloadException(
                     '3000_ACCESS_FAILED', e,
                     f'Access failed for {a_url["method"]} - {link}')
             # NOTE(review): if method is neither POST nor GET, 'res' is
             # unbound and the next statement raises NameError — confirm
             # configs only use POST/GET.
             logging.debug(
                 f'status code :: {res.status_code} history :: {res.history} response URL :: {res.url}'
             )
             if len(a_url['xpath']) > 0:
                 f_html = self.utils.format_html(res.text)
                 # Either stash values for the next request or keep the
                 # page source for the parse step.
                 if 'for_next_url' in a_url[
                         'result-url-dict'] or 'for_next_params' in a_url[
                             'result-url-dict']:
                     previous_url_results.append(
                         self._values_for_next_url(a_url, f_html))
                 else:
                     deal_info['f_html'] = f_html
         a_url['deal_info_dict_list'] = deal_info_list
# Example #9
 def parse(self, **opts):
     """
     method parses the 'WF' specific page source using xpath from access-configs, after method execution
         a_url['download_urls'] appended to opts dictionary
     :param opts: user/commandline inputs + a_url['deal_info_dict_list']
     :return: None (results stored in a_url['download_urls'])
     :raises DownloadException: 4000_PARSING_FAILED if xpath evaluation fails
     """
     logging.debug('WellsFargoDownloader:parse')
     provider = opts['provider']
     out_dir = opts['output']
     access_config = self.configs.access_config[provider]
     site_url = access_config['site-url']
     for a_url in opts['access_urls']:
         download_urls = list()
         for xpath in a_url['xpath']:
             for deal_info_dict in a_url['deal_info_dict_list']:
                 # Only entries that captured page source during the access
                 # step are parsed here.
                 if 'f_html' in deal_info_dict:
                     tree = lxml.html.fromstring(deal_info_dict['f_html'])
                     try:
                         trs = tree.xpath(xpath)
                     except Exception as e:
                         raise DownloadException('4000_PARSING_FAILED', e)
                     # to avoid the dates confusion, using first 3 items and 5th item from tr
                     # search_data will have only 'Current Cycle' date
                     for tr in trs:
                         # print(f'table.text :: {etree.tostring(tr)}')
                         td = tr.xpath('td')
                         if len(td) > 5:
                             # Pairs of (document href, image alt text) from
                             # the second column.
                             w_td = td[1].xpath('a/@href | a/img/@alt')
                             for href, doc in zip(*[iter(w_td)] * 2):
                                 f_url = site_url + href.strip()
                                 search_data = doc.strip() + ' || '
                             # NOTE(review): f_url/search_data keep the LAST
                             # pair from the loop above and are unbound
                             # (NameError) if the first such cell has no
                             # anchors — confirm rows always carry a link.
                             if f_url:
                                 search_data += td[0].text.strip() + ' || '
                                 dt = self.utils.validate_date(td[2].text.strip(), '%m/%d/%Y')
                                 if dt[0]:
                                     yyyy_mon = str(dt[1].year) + '-' + str(dt[1].month)
                                     search_data += dt[1].strftime("%b") + ' ' + str(dt[1].year) + '||'
                                 # NOTE(review): yyyy_mon (and deal_name /
                                 # file_ext below) may be unbound if the date
                                 # fails to parse or the deal dict has neither
                                 # series_name nor shelf_name — verify inputs.
                                 if 'series_name' in deal_info_dict:
                                     deal_name = deal_info_dict['series_name']
                                     file_ext = '.csv'
                                 elif 'shelf_name' in deal_info_dict:
                                     deal_name = deal_info_dict['shelf_name']
                                     file_ext = '.zip'
                                 # Column 6 holds history links; the 'doc='
                                 # query value becomes the output file name.
                                 for hist_ele in td[5].xpath('a/@href'):
                                     hist_ele = hist_ele.strip()
                                     file_name = hist_ele[hist_ele.index('doc=') + 4:] + file_ext
                                     o_file = out_dir + '/' + yyyy_mon + '/' + provider + '/'
                                     o_file += deal_name + '/' + file_name
                                     download_urls.append(
                                         DownloadUrl(f_url, o_file, search_data, deal_name))
                     # del a_url['f_html']
                     a_url['download_urls'] = download_urls
# Example #10
 def _values_for_next_url(self, a_url, f_html, input_dict=None):
     """
     Extract from the page source the values needed by the next request.

     Pairs every key of a_url['result-url-dict'] except the first marker
     key (e.g. 'for_next_url') with the corresponding xpath from
     a_url['xpath'] and stores the first stripped match under that key.

     :param a_url: access-url config with 'result-url-dict' and 'xpath'
     :param f_html: formatted HTML page source to search
     :param input_dict: optional dict to fill; defaults to a copy of
         a_url['result-url-dict']
     :return: dict of extracted values
     :raises DownloadException: 3000_ACCESS_FAILED if any xpath fails
     """
     if input_dict:
         result_dict = input_dict
     else:
         result_dict = dict(a_url['result-url-dict'])
     tree = lxml.html.fromstring(f_html)
     # Skip the first key — it is only a marker such as 'for_next_url'.
     keys = list(a_url['result-url-dict'])[1:]
     for key, xp in zip(keys, a_url['xpath']):
         try:
             result_dict[key] = tree.xpath(xp)[0].strip()
         except Exception as e:
             raise DownloadException(
                 '3000_ACCESS_FAILED', e,
                 f'Access failed for xpath: {xp} and source {f_html}')
     return result_dict
# Example #11
 def parse(self, **opts):
     """
     method parses the 'ct' specific page source using xpath from access-configs, after method execution
         a_url['download_urls'] appended to opts dictionary
     :param opts: user/commandline inputs + a_url['deal_info_dict_list']
     :return: None (results stored in a_url['download_urls'])
     :raises DownloadException: 4000_PARSING_FAILED if xpath evaluation fails
     """
     logging.debug('CTDownloader:parse')
     provider = opts['provider']
     out_dir = opts['output']
     access_config = self.configs.access_config[provider]
     site_url = access_config['site-url']
     for a_url in opts['access_urls']:
         download_urls = list()
         for xpath in a_url['xpath']:
             for deal_info_dict in a_url['deal_info_dict_list']:
                 # Only entries that captured page source during the access
                 # step are parsed here.
                 if 'f_html' in deal_info_dict:
                     tree = lxml.html.fromstring(deal_info_dict['f_html'])
                     # xpath = "body/div/div/div/table/tbody/tr/td/form/table[3]/tbody/tr/td"
                     # table[3] or table[@class='tableBorder'] gives same result
                     try:
                         tr = tree.xpath(xpath)
                     except Exception as e:
                         raise DownloadException('4000_PARSING_FAILED', e)
                     for td in tr:
                         # A cell with bare text names the statement group
                         # applied to the link cells that follow it.
                         # NOTE(review): stmt_grp stays unbound if the very
                         # first cell has no text — the reference below
                         # would raise NameError; confirm page layout.
                         if td.text.strip():
                             stmt_grp = td.text.strip()
                         # Triples of (href, link text, payment-date text);
                         # zip(*[iter(x)]*3) groups the flat list in threes.
                         mbs_tds = td.xpath(
                             "a/@href | a/text() | span/text()")
                         for href, a_text, s_text in zip(*[iter(mbs_tds)] *
                                                         3):
                             logging.debug(
                                 f'href:{href}, report_name:{a_text}, payment_date:{s_text}'
                             )
                             stm_date = self._get_statement_date(
                                 s_text.strip())
                             f_url = site_url + href.strip()
                             # Output dir: <out>/<YYYY-M>/<provider>/<group>
                             o_dir = out_dir + '/' + str(
                                 stm_date.year) + '-' + str(stm_date.month)
                             o_dir += '/' + provider + '/' + stmt_grp
                             s_data = a_text.strip(
                             ) + ' || ' + s_text.strip() + ' || ' + stmt_grp
                             download_urls.append(
                                 DownloadUrl(f_url, o_dir, s_data,
                                             stmt_grp))
         # del a_url['f_html']
         a_url['download_urls'] = download_urls
 def access(self, session, **opts):
     """
     Step 2:: Pull access URL/s from configs file and use it to pull page source which has URLs for file download
         after method execution a_url['deal_info_dict_list'] appended to opts dictionary
     TODO Use namedtuple DealInfo to make current dictionary generic to all providers
     :param session: session with site cookies
     :param opts: user/commandline inputs
     :return: None
     :raises DownloadException: 3000_ACCESS_FAILED when a request fails
     """
     logging.debug('FileDownloader:access')
     provider = opts['provider']
     # Values scraped from one page response for use in the next request.
     previous_url_results = list()
     for a_url in opts['access_urls']:
         logging.debug(f':::3 Send request to {a_url} page')
         # Pull input parameters to append as a query string
         # (per-run user_input_config in opts overrides the static config).
         user_config = opts[
             'user_input_config'] if 'user_input_config' in opts else None
         user_inputs = user_config[
             'input'] if user_config else self.configs.user_input_config[
                 provider]['input']
         deal_info_list = self._prepare_params(a_url, user_inputs)
         # Update URL with values pulled from previous page response
         deal_info_list = self._use_previous_url_result(
             deal_info_list, previous_url_results)
         # After use clean the previous_url_results
         previous_url_results = []
         for deal_info in deal_info_list:
             params = deal_info['params']
             # Merge response values captured earlier (e.g. csrfKey) into
             # this request's params, then clear the side channel.
             from_opts = opts[
                 'response_dict'] if 'response_dict' in opts else {}
             params = {**params, **from_opts}
             opts['response_dict'] = {}
             try:
                 if a_url['method'] == 'POST':
                     res = session.post(deal_info['link'], data=params)
                 elif a_url['method'] == 'GET':
                     res = session.get(deal_info['link'], params=params)
             except Exception as e:
                 raise DownloadException('3000_ACCESS_FAILED', e)
             # NOTE(review): if method is neither POST nor GET, 'res' is
             # unbound and the next statement raises NameError — confirm
             # configs only use POST/GET.
             logging.debug(
                 f'status code :: {res.status_code} history :: {res.history} response URL :: {res.url}'
             )
             f_html = self.utils.format_html(res.text)
             tree = lxml.html.fromstring(f_html)
             # Dispatch on the configured result keys: values for the next
             # request/URL, per-deal info, or subtrees kept for parsing.
             for ele_name, ele_value in a_url['result-dict'].items():
                 if 'for_next_params' in ele_name:
                     _result = self._dict_for_next_url(ele_value, tree)
                     _result['for_next_params'] = True
                     previous_url_results.append(_result)
                     deal_info['for_next_params'] = _result
                     # csrfKey must accompany the next request's params.
                     opts['response_dict'] = {'csrfKey': _result['csrfKey']}
                 elif 'for_next_url' in ele_name:
                     _result = self._dict_for_next_url(ele_value, tree)
                     _result['for_next_url'] = True
                     previous_url_results.append(_result)
                 elif 'deal_info' in ele_name:
                     deal_info['deal_info'] = self._dict_for_next_url(
                         ele_value, tree)
                 elif 'for_parsing' in ele_name:
                     # Keep the xpath-selected subtrees for the parse step.
                     f_html_trees = list()
                     for xp in ele_value:
                         f_html_trees.append(tree.xpath(xp))
                     deal_info['f_html'] = f_html_trees
         a_url['deal_info_dict_list'] = deal_info_list