def get_all_funds_ca_data_and_ingest_in_db(model_db):
    """Scrape every configured Funds CA page and persist the extracted values.

    Parameters
    ----------
    model_db
        Database model/handle forwarded unchanged to ``ingest_data_in_db``.
    """
    logging.info('Funds CA - Scrap all pages and get data')

    def _scrape_and_ingest(page_url):
        # Fetch, parse, and persist a single fund page.
        logging.info(f'Funds CA - Scrap one page: {page_url}')
        parsed_page = get_code(page_url)
        fund_values = extract_all_values_of_fund_from_one_page(parsed_page)
        ingest_data_in_db(model_db=model_db, dict_values=fund_values, url=page_url)

    for page_url in tqdm(URLS_FUNDS_CA):
        _scrape_and_ingest(page_url)
def get_all_assets_ca_data_and_ingest_in_db(model_db):
    """Discover every asset page URL, scrape each one, and persist the values.

    Parameters
    ----------
    model_db
        Database model/handle forwarded unchanged to ``ingest_data_in_db``.
    """
    logging.info('Assets - Scrap all pages and get data')

    def _scrape_and_ingest(page_url):
        # Fetch, parse, and persist a single asset page.
        logging.info(f'Assets - Scrap one page: {page_url}')
        parsed_page = get_code(page_url)
        asset_values = extract_all_values_of_asset_from_one_page(parsed_page)
        ingest_data_in_db(model_db=model_db, dict_values=asset_values, url=page_url)

    for page_url in tqdm(get_urls_for_each_asset()):
        _scrape_and_ingest(page_url)
def get_id_asset_info(url):
    """Return the asset identifier (first token of the faceplate ISIN text).

    Fetches *url*, locates the ``div.c-faceplate__body`` element and extracts
    the first whitespace-separated token from the ``h2.c-faceplate__isin``
    heading inside it — presumably the ISIN code (TODO confirm against pages).

    Parameters
    ----------
    url : str
        Page URL passed to ``get_code``.

    Returns
    -------
    str or None
        The extracted identifier, or ``None`` when the page layout does not
        match expectations.
    """
    html_code = get_code(url)
    id_asset_div = html_code('div', 'c-faceplate__body')
    try:
        # First token of the <h2 class="c-faceplate__isin"> text, e.g. the ISIN.
        return id_asset_div[0](
            'h2', 'c-faceplate__isin')[0].get_text().strip().split()[0]
    except (IndexError, AttributeError, TypeError) as e:
        # Narrowed from a bare `except Exception`: only the lookup/parse errors
        # the chained calls can realistically raise.  Message corrected — the
        # original said "convert text to int" but no int conversion happens.
        logging.warning(
            f'Recommended assets - Scrap one page: cannot extract identifier: {e}')
        return None
def get_urls_for_each_asset():
    """Collect the absolute URL of every asset listed across ``ALL_PAGES``.

    Walks each listing page, reads the second ``<tbody>`` table, and builds the
    asset URL from the anchor in the last ``<div>`` of each row.

    Returns
    -------
    list[str]
        Deduplicated asset URLs in sorted order.  Sorting replaces the
        original ``list(set(...))`` so the result is deterministic (callers
        index into it, e.g. ``[0]`` in the testing helper).
    """
    urls = []
    for page in tqdm(ALL_PAGES):
        all_code = get_code(page)
        # The asset rows live in the second <tbody> of the listing page.
        all_infos = all_code.findAll('tbody')[1].findAll('tr')
        for elt in all_infos:
            info = elt.findAll('div')[-1]
            logging.info(f'Assets - Get {info.get_text()} url')
            # Relative hrefs; prefix the site root to get an absolute URL.
            urls.append('https://www.boursorama.com' + info.a['href'])
    # Leftover debug `print(urls)` removed; logging above already traces progress.
    return sorted(set(urls))
def get_all_assets_data_for_testing():
    """Scrape a single asset page (the first discovered URL) and return its values.

    Convenience helper for tests: runs the same extraction pipeline as the
    full ingestion, but on one page only and without touching the database.
    """
    logging.info('Assets - Scrap all pages and get data')
    first_url = get_urls_for_each_asset()[0]
    return extract_all_values_of_asset_from_one_page(get_code(first_url))
def get_code_and_insert_in_db():
    """Scrape each recommended-assets page and persist its first table.

    For every URL in ``URLS_RECOMMENDED_ASSETS``, fetches the page, takes the
    first ``<tbody>`` element, and hands it to ``insert_table_in_db``.
    """
    for page_url in tqdm(URLS_RECOMMENDED_ASSETS):
        parsed_page = get_code(url=page_url)
        first_table = parsed_page('tbody')[0]
        insert_table_in_db(table=first_table)
def get_all_funds_ca_data_for_testing():
    """Scrape a single Funds CA page (the first configured URL) and return its values.

    Convenience helper for tests: same extraction pipeline as the full
    ingestion, limited to one page, no database writes.
    """
    first_url = URLS_FUNDS_CA[0]
    return extract_all_values_of_fund_from_one_page(get_code(first_url))