import logging

import lxml.html
import requests

# DownloadException, DownloadUrl, Utils and the downloader base classes are
# assumed to be provided elsewhere in this package.


def authenticate(self, provider):
    """
    Step 1:: Authenticate and login to provider's portal
    :param provider: provider
    :return: requests session and a dict carrying the csrfKey for subsequent requests
    """
    logging.debug('BonyDownloader:authenticate')
    auth_config = self.configs.auth_config[provider]
    access_config = self.configs.access_config[provider]
    session = requests.Session()
    logging.debug(f':::1 Connect to {access_config["login-url"]} and get cookies')
    session.get(access_config['login-url'])
    logging.debug(f':::2 Call {access_config["auth-url"]} page')
    # requests reuses the cookies already stored on the session
    try:
        res1 = session.post(access_config['auth-url'], data=auth_config)
        if self._login_failed(provider, res1):
            raise DownloadException(
                '2000_AUTHENTICATION_FAILED',
                custom_message=f'Authentication failed for {provider}')
        logging.debug(f'Login status :: {res1.status_code}')
        # BONY requires the csrfKey to be sent with every subsequent request
        f_html = self.utils.format_html(res1.text)
        tree = lxml.html.fromstring(f_html)
        csrf_key = tree.xpath(
            '//form[@name="NavForm"]/input[@name="csrfKey"]/@value')[0]
    except Exception as e:
        raise DownloadException('2000_AUTHENTICATION_FAILED', e) from None
    return session, {'for_next_params': True, 'csrfKey': csrf_key}
def authenticate(self, provider):
    """
    Step 1:: Authenticate and login to provider's portal
    :param provider: provider
    :return: requests session and None (no extra params are needed later)
    """
    logging.debug('FileDownloader:authenticate')
    auth_config = self.configs.auth_config[provider]
    access_config = self.configs.access_config[provider]
    session = requests.Session()
    logging.debug(f':::1 Connect to {access_config["login-url"]} and get cookies')
    session.get(access_config['login-url'])
    # logging.info(f'Session cookies {session.cookies.get_dict()}')
    logging.debug(f':::2 Call {access_config["auth-url"]} page')
    # requests reuses the cookies already stored on the session
    try:
        res1 = session.post(access_config['auth-url'], data=auth_config)
        if self._login_failed(provider, res1):
            raise DownloadException(
                '2000_AUTHENTICATION_FAILED',
                custom_message=f'Authentication failed for {provider}')
        logging.debug(f'Login status :: {res1.status_code}')
        # logging.info(f'Page Details:::: {res1.status_code}, {res1.cookies.get_dict()}, {res1.headers}, {res1.text}')
    except Exception as e:
        raise DownloadException('2000_AUTHENTICATION_FAILED', e) from None
    return session, None
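# A minimal, commented sketch of how the pipeline steps chain together, following
# the "Step" markers in the docstrings (authenticate -> access -> parse ->
# download_files). The Configs() loader, the downloader constructor, and the
# opts keys shown with literal values are assumptions for illustration, not the
# package's confirmed API:
#
#     configs = Configs()                                  # hypothetical loader
#     downloader = UbnDownloader(configs)                  # hypothetical constructor
#     opts = {'provider': 'ubn', 'output': '/tmp/out', 'tspan': 'latest',
#             'access_urls': configs.access_config['ubn']['access-urls']}
#     session, extra = downloader.authenticate(opts['provider'])
#     if extra:                                            # e.g. BONY returns a csrfKey dict
#         opts['response_dict'] = {'csrfKey': extra['csrfKey']}
#     downloader.access(session, **opts)
#     downloader.parse(**opts)
#     downloader.download_files(session, **opts)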
def download_files(self, session, **opts):
    """
    Step 5:: Download files to the output directory, using the URL and output
    path carried by each entry in 'download_urls'
    :param session: session object with site cookies
    :param opts: user/commandline inputs + a_url['deal_info_dict_list'] + a_url['download_urls']
    :return: None
    """
    logging.debug('FileDownloader:Download files')
    download_count = 0
    for a_url in opts['access_urls']:
        if 'download_urls' in a_url:
            download_urls = a_url['download_urls']
            for download_url in download_urls:
                d_url = download_url.file_url
                o_file = self.utils.out_file(d_url, download_url.out_file)
                if session:
                    if download_url.method and 'POST' in download_url.method:
                        response = session.post(d_url, data=download_url.params)
                    else:
                        response = session.get(d_url)
                else:
                    response = requests.get(d_url, stream=True)
                self._download(o_file, response)
                logging.info(
                    f'[Report: {download_url.report_group}] [{o_file}] downloaded')
                download_count += 1
    if download_count == 0:
        raise DownloadException('6001_FILE_DOWNLOAD_NO_FILE')
def validate_n_format(configs, **opts):
    """
    Validates and formats the commandline parameters provided by the user.
    The provider string is converted to lowercase to match the configuration keys.
    :param configs: reference to Config class object, which has all config details
    :param opts: commandline input parameters
    :return: None
    """
    opts['provider'] = opts['provider'].lower()
    opts['tspan'] = opts['tspan'].lower() if opts['tspan'] else 'latest'
    # Validations for provider, tspan and other input variables
    utils = Utils()
    if opts['provider'] not in configs.auth_config:
        raise DownloadException('1001_INVALID_PROVIDER')
    elif not utils.is_valid_time_span(opts['tspan']):
        raise DownloadException('1002_INVALID_TIME_SPAN_FORMAT')
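# A sketch of invoking validate_n_format (the argument values are hypothetical).
# Because the parameters arrive via **opts, the function normalizes a local copy
# of the keyword arguments; a caller that needs the lowercased values back has to
# normalize its own dict as well:
#
#     opts = {'provider': 'UBN', 'tspan': 'Latest'}
#     validate_n_format(configs, **opts)            # raises DownloadException on bad input
#     opts['provider'] = opts['provider'].lower()   # re-apply locally, since **opts copied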
def _download(self, o_file, response):
    """
    Downloads a file to the output directory; raises an exception if the file is
    unavailable, e.g. when the login session has expired
    :param o_file: output file path
    :param response: response
    :return: None
    """
    try:
        if response.status_code == 200:
            with open(o_file, 'wb') as output:
                output.write(response.content)
        else:
            raise DownloadException(
                '6000_FILE_DOWNLOAD_FAILED',
                custom_message=f'File not available {response.status_code}')
    except Exception as e:
        raise DownloadException('6000_FILE_DOWNLOAD_FAILED', e) from None
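# A chunked variant of _download, sketched here because download_files passes
# stream=True for session-less requests while response.content still buffers the
# whole body in memory; response.iter_content() is the standard requests API for
# honouring a streamed response. This is an illustrative alternative, not the
# method used above:
#
#     def _download_streamed(self, o_file, response, chunk_size=8192):
#         if response.status_code == 200:
#             with open(o_file, 'wb') as output:
#                 for chunk in response.iter_content(chunk_size=chunk_size):
#                     output.write(chunk)
#         else:
#             raise DownloadException(
#                 '6000_FILE_DOWNLOAD_FAILED',
#                 custom_message=f'File not available {response.status_code}')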
def _dict_for_next_url(self, input_dict, tree):
    """Evaluates each XPath in input_dict against the parsed tree and returns
    the stripped text results keyed by the same names."""
    # print(f'table.text :: {etree.tostring(tree)}')
    result_dict = dict()
    for k, xp in input_dict.items():
        try:
            xp_result = tree.xpath(xp)
            result_dict[k] = ''.join(xp_result).strip()
        except Exception as e:
            raise DownloadException('3000_ACCESS_FAILED', e)
    return result_dict
def parse(self, **opts):
    """
    Parses the 'ubn'-specific page source using XPaths from the access configs;
    after execution a_url['download_urls'] is appended to the opts dictionary
    :param opts: user/commandline inputs + a_url['deal_info_dict_list']
    :return: None
    """
    logging.debug('UbnDownloader:parse')
    provider = opts['provider']
    out_dir = opts['output']
    access_config = self.configs.access_config[provider]
    site_url = access_config['site-url']
    for a_url in opts['access_urls']:
        download_urls = list()
        for xpath in a_url['xpath']:
            for deal_info_dict in a_url['deal_info_dict_list']:
                if 'f_html' in deal_info_dict:
                    tree = lxml.html.fromstring(deal_info_dict['f_html'])
                    try:
                        tr = tree.xpath(xpath)
                    except Exception as e:
                        raise DownloadException('4000_PARSING_FAILED', e)
                    search_data = ''
                    yyyy_mon = ''
                    for td in tr:
                        td_txt = td.text.strip()
                        if td_txt:
                            if 'Investor Report' in td_txt:
                                search_data = ''
                                search_data += td_txt + '||'
                            else:
                                dt = self.utils.validate_date(td_txt)
                                if dt[0]:
                                    yyyy_mon = str(dt[1].year) + '-' + str(dt[1].month)
                                    search_data += (dt[1].strftime('%b') + ' '
                                                    + str(dt[1].year) + '||')
                                else:
                                    search_data += td_txt + '||'
                        # Pair each href with its anchor text
                        w_td = td.xpath('a/@href | a/text()')
                        for href, a_text in zip(*[iter(w_td)] * 2):
                            logging.debug(f'href:: {href} a_text:: {a_text}')
                            f_url = site_url + href.strip()
                            o_file = (out_dir + '/' + yyyy_mon + '/' + provider
                                      + '/' + deal_info_dict['dealName'])
                            o_file += ('/' + deal_info_dict['dealName']
                                       + '-Investor-Report-' + yyyy_mon)
                            o_file += '.' + a_text.strip()
                            download_urls.append(
                                DownloadUrl(f_url, o_file, search_data,
                                            deal_info_dict['dealName']))
        # del a_url['f_html']
        a_url['download_urls'] = download_urls
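# The zip(*[iter(seq)] * n) idiom used above (and in the WF and CT parsers below)
# groups a flat XPath result into fixed-size tuples: the same iterator object is
# consumed n times per zip step. A standalone illustration with made-up values:
#
#     w_td = ['/doc/1.pdf', 'PDF', '/doc/2.xls', 'XLS']   # alternating href, text
#     list(zip(*[iter(w_td)] * 2))
#     # -> [('/doc/1.pdf', 'PDF'), ('/doc/2.xls', 'XLS')]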
def access(self, session, **opts):
    """
    Step 2:: Pulls the access URL/s from the configs file and uses them to pull
    the page source which has the URLs for file download; after execution
    a_url['deal_info_dict_list'] is appended to the opts dictionary
    TODO Use namedtuple DealInfo to make the current dictionary generic to all providers
    :param session: session with site cookies
    :param opts: user/commandline inputs
    :return: None
    """
    logging.debug('FileDownloader:access')
    provider = opts['provider']
    previous_url_results = list()
    for a_url in opts['access_urls']:
        logging.debug(f':::3 Send request to {a_url} page')
        deal_info_list = list()
        # Pull input parameters to append as a query string
        if len(a_url['input-param']) > 0:
            user_config = opts.get('user_input_config')
            user_inputs = (user_config['input'] if user_config
                           else self.configs.user_input_config[provider]['input'])
            deal_info_list = self._append_query_str_to_url(a_url, user_inputs)
        else:
            deal_info_list.append(a_url['url'])
        # Update URLs with values pulled from the previous page
        deal_info_list = self._urls_with_previous_result(
            a_url, deal_info_list, previous_url_results)
        for deal_info in deal_info_list:
            link = deal_info['link']
            params = deal_info['params'] if 'params' in deal_info else {}
            try:
                if a_url['method'] == 'POST':
                    res = session.post(link, data=params)
                elif a_url['method'] == 'GET':
                    res = session.get(link, params=params)
            except Exception as e:
                raise DownloadException(
                    '3000_ACCESS_FAILED', e,
                    f'Access failed for {a_url["method"]} - {link}')
            logging.debug(
                f'status code :: {res.status_code} history :: {res.history} '
                f'response URL :: {res.url}')
            if len(a_url['xpath']) > 0:
                f_html = self.utils.format_html(res.text)
                if ('for_next_url' in a_url['result-url-dict']
                        or 'for_next_params' in a_url['result-url-dict']):
                    previous_url_results.append(
                        self._values_for_next_url(a_url, f_html))
                else:
                    deal_info['f_html'] = f_html
        a_url['deal_info_dict_list'] = deal_info_list
def parse(self, **opts):
    """
    Parses the 'WF'-specific page source using XPaths from the access configs;
    after execution a_url['download_urls'] is appended to the opts dictionary
    :param opts: user/commandline inputs + a_url['deal_info_dict_list']
    :return: None
    """
    logging.debug('WellsFargoDownloader:parse')
    provider = opts['provider']
    out_dir = opts['output']
    access_config = self.configs.access_config[provider]
    site_url = access_config['site-url']
    for a_url in opts['access_urls']:
        download_urls = list()
        for xpath in a_url['xpath']:
            for deal_info_dict in a_url['deal_info_dict_list']:
                if 'f_html' in deal_info_dict:
                    tree = lxml.html.fromstring(deal_info_dict['f_html'])
                    try:
                        trs = tree.xpath(xpath)
                    except Exception as e:
                        raise DownloadException('4000_PARSING_FAILED', e)
                    # To avoid date confusion, use the first 3 items and the 5th
                    # item from each tr; search_data carries only the
                    # 'Current Cycle' date
                    for tr in trs:
                        # print(f'table.text :: {etree.tostring(tr)}')
                        td = tr.xpath('td')
                        if len(td) > 5:
                            w_td = td[1].xpath('a/@href | a/img/@alt')
                            for href, doc in zip(*[iter(w_td)] * 2):
                                f_url = site_url + href.strip()
                                search_data = doc.strip() + ' || '
                                if f_url:
                                    search_data += td[0].text.strip() + ' || '
                                    dt = self.utils.validate_date(
                                        td[2].text.strip(), '%m/%d/%Y')
                                    if dt[0]:
                                        yyyy_mon = str(dt[1].year) + '-' + str(dt[1].month)
                                        search_data += (dt[1].strftime('%b') + ' '
                                                        + str(dt[1].year) + '||')
                                        if 'series_name' in deal_info_dict:
                                            deal_name = deal_info_dict['series_name']
                                            file_ext = '.csv'
                                        elif 'shelf_name' in deal_info_dict:
                                            deal_name = deal_info_dict['shelf_name']
                                            file_ext = '.zip'
                                        for hist_ele in td[5].xpath('a/@href'):
                                            hist_ele = hist_ele.strip()
                                            # File name is the value of the 'doc=' query parameter
                                            file_name = hist_ele[hist_ele.index('doc=') + 4:] + file_ext
                                            o_file = (out_dir + '/' + yyyy_mon
                                                      + '/' + provider + '/')
                                            o_file += deal_name + '/' + file_name
                                            download_urls.append(
                                                DownloadUrl(f_url, o_file,
                                                            search_data, deal_name))
        # del a_url['f_html']
        a_url['download_urls'] = download_urls
def _values_for_next_url(self, a_url, f_html, input_dict=None):
    result_dict = input_dict if input_dict else dict(a_url['result-url-dict'])
    tree = lxml.html.fromstring(f_html)
    # Exclude the first key (e.g. 'for_next_url'), which is a marker, not an XPath target
    key_list = list(a_url['result-url-dict'].keys())[1:]
    for k, xp in zip(key_list, a_url['xpath']):
        try:
            result_dict[k] = tree.xpath(xp)[0].strip()
        except Exception as e:
            raise DownloadException(
                '3000_ACCESS_FAILED', e,
                f'Access failed for xpath: {xp} and source {f_html}')
    return result_dict
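# A sketch of the config shape _values_for_next_url expects (the keys and XPaths
# here are hypothetical, for illustration only). The first key in
# 'result-url-dict' is a marker consumed by access(); the remaining keys are
# filled from the matching 'xpath' entries in order:
#
#     a_url = {
#         'result-url-dict': {'for_next_url': True, 'deal_id': ''},
#         'xpath': ['//input[@name="dealId"]/@value'],
#     }
#     # _values_for_next_url(a_url, f_html)
#     # -> {'for_next_url': True, 'deal_id': '12345'}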
def parse(self, **opts):
    """
    Parses the 'ct'-specific page source using XPaths from the access configs;
    after execution a_url['download_urls'] is appended to the opts dictionary
    :param opts: user/commandline inputs + a_url['deal_info_dict_list']
    :return: None
    """
    logging.debug('CTDownloader:parse')
    provider = opts['provider']
    out_dir = opts['output']
    access_config = self.configs.access_config[provider]
    site_url = access_config['site-url']
    for a_url in opts['access_urls']:
        download_urls = list()
        for xpath in a_url['xpath']:
            for deal_info_dict in a_url['deal_info_dict_list']:
                if 'f_html' in deal_info_dict:
                    tree = lxml.html.fromstring(deal_info_dict['f_html'])
                    # xpath = "body/div/div/div/table/tbody/tr/td/form/table[3]/tbody/tr/td"
                    # table[3] or table[@class='tableBorder'] gives the same result
                    try:
                        tr = tree.xpath(xpath)
                    except Exception as e:
                        raise DownloadException('4000_PARSING_FAILED', e)
                    for td in tr:
                        if td.text.strip():
                            stmt_grp = td.text.strip()
                            mbs_tds = td.xpath('a/@href | a/text() | span/text()')
                            for href, a_text, s_text in zip(*[iter(mbs_tds)] * 3):
                                logging.debug(
                                    f'href:{href}, report_name:{a_text}, '
                                    f'payment_date:{s_text}')
                                stm_date = self._get_statement_date(s_text.strip())
                                f_url = site_url + href.strip()
                                o_dir = (out_dir + '/' + str(stm_date.year)
                                         + '-' + str(stm_date.month))
                                o_dir += '/' + provider + '/' + stmt_grp
                                s_data = (a_text.strip() + ' || ' + s_text.strip()
                                          + ' || ' + stmt_grp)
                                download_urls.append(
                                    DownloadUrl(f_url, o_dir, s_data, stmt_grp))
        # del a_url['f_html']
        a_url['download_urls'] = download_urls
def access(self, session, **opts):
    """
    Step 2:: Pulls the access URL/s from the configs file and uses them to pull
    the page source which has the URLs for file download; after execution
    a_url['deal_info_dict_list'] is appended to the opts dictionary
    TODO Use namedtuple DealInfo to make the current dictionary generic to all providers
    :param session: session with site cookies
    :param opts: user/commandline inputs
    :return: None
    """
    logging.debug('BonyDownloader:access')
    provider = opts['provider']
    previous_url_results = list()
    for a_url in opts['access_urls']:
        logging.debug(f':::3 Send request to {a_url} page')
        # Pull input parameters to append as a query string
        user_config = opts.get('user_input_config')
        user_inputs = (user_config['input'] if user_config
                       else self.configs.user_input_config[provider]['input'])
        deal_info_list = self._prepare_params(a_url, user_inputs)
        # Update URLs with values pulled from the previous page response
        deal_info_list = self._use_previous_url_result(
            deal_info_list, previous_url_results)
        # After use, clear previous_url_results
        previous_url_results = []
        for deal_info in deal_info_list:
            params = deal_info['params']
            from_opts = opts['response_dict'] if 'response_dict' in opts else {}
            params = {**params, **from_opts}
            opts['response_dict'] = {}
            try:
                if a_url['method'] == 'POST':
                    res = session.post(deal_info['link'], data=params)
                elif a_url['method'] == 'GET':
                    res = session.get(deal_info['link'], params=params)
            except Exception as e:
                raise DownloadException('3000_ACCESS_FAILED', e)
            logging.debug(
                f'status code :: {res.status_code} history :: {res.history} '
                f'response URL :: {res.url}')
            f_html = self.utils.format_html(res.text)
            tree = lxml.html.fromstring(f_html)
            for ele_name, ele_value in a_url['result-dict'].items():
                if 'for_next_params' in ele_name:
                    _result = self._dict_for_next_url(ele_value, tree)
                    _result['for_next_params'] = True
                    previous_url_results.append(_result)
                    deal_info['for_next_params'] = _result
                    opts['response_dict'] = {'csrfKey': _result['csrfKey']}
                elif 'for_next_url' in ele_name:
                    _result = self._dict_for_next_url(ele_value, tree)
                    _result['for_next_url'] = True
                    previous_url_results.append(_result)
                elif 'deal_info' in ele_name:
                    deal_info['deal_info'] = self._dict_for_next_url(ele_value, tree)
                elif 'for_parsing' in ele_name:
                    deal_info['f_html'] = [tree.xpath(xp) for xp in ele_value]
        a_url['deal_info_dict_list'] = deal_info_list
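# A sketch of the 'result-dict' shape this access() dispatches on (the keys and
# XPaths are hypothetical). Each entry name selects a branch above:
# 'for_next_params' values feed the next request's form data (and carry the
# csrfKey), 'for_next_url' values rewrite the next URL, 'deal_info' captures
# metadata, and 'for_parsing' stores XPath result trees for the parse step:
#
#     a_url['result-dict'] = {
#         'for_next_params': {'csrfKey': '//input[@name="csrfKey"]/@value'},
#         'deal_info': {'dealName': '//span[@id="dealName"]/text()'},
#         'for_parsing': ['//table[@class="reports"]'],
#     }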