def main(): logger.info("Combining all the data from external sources together") dt = DataTransformation() try: dt.us() dt.jpx() dt.cn() dt.euronext() dt.aastocks() dt.lse() dt.ca() dt.frankfurt() dt.krx() dt.asx() dt.twse() dt.bme() dt.sgx() dt.idx() dt.bm() dt.nasdaqnordic() dt.spotlight() dt.italy() except Exception as e: logger.error(e, exc_info=sys.exc_info()) error_email(str(e)) finally: dt.formatting_all() dt.save_all()
def main():
    wd = WebDriver()
    wd.random_wait()
    logger.info("Gathering data from sources")
    for k, v in wd.sources_dict.items():
        try:
            wd.load_url(v.get('url'), sleep_after=True)
            df = wd.parse_table(**v)
            if df is not None:
                s_file = os.path.join(wd.source_data_folder, v.get('file') + '.csv')
                if os.path.exists(s_file):
                    df = wd.update_existing_data(pd.read_csv(s_file), df, exclude_col='time_checked')
                df.sort_values(by='time_checked', ascending=False, inplace=True)
                df.to_csv(s_file, index=False, encoding='utf-8-sig')
                wd.webscraping_results.append([wd.time_checked_str, k, 1])
        except Exception as e:
            logger.error(f"ERROR for {k}")
            logger.error(e, exc_info=sys.exc_info())
            logger.info('-' * 100)
            error_screenshot_file = f"{k} Error {wd.time_checked.strftime('%Y-%m-%d %H%M')}.png"
            wd.driver.save_screenshot(os.path.join(log_folder, 'Screenshots', error_screenshot_file))
            wd.webscraping_results.append([wd.time_checked_str, k, 0])
    wd.asx()
    wd.tkipo()
    wd.close_driver()
    wd.av_api()
    wd.save_webscraping_results()
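# Hypothetical illustration of what one entry of wd.sources_dict might look like, based only on
# the keys the loop above actually reads: 'url' (passed to load_url) and 'file' (the CSV name).
# Any remaining keys in each entry are forwarded to parse_table via **v. The source name, URL and
# comments here are placeholders, not the real configuration.
_example_sources_dict = {
    'ExampleExchange': {
        'url': 'https://www.example.com/ipo-calendar',  # page to load
        'file': 'ExampleExchange',                      # saved as ExampleExchange.csv in source_data_folder
        # ...any extra keyword arguments expected by parse_table would also live here
    },
}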
def update_withdrawn_ipos(self):
    """
    If an IPO is withdrawn, the RPD will be updated with a comment showing that the status is withdrawn
    and in the main data frame the RPD Status will be set to Resolved (so that I no longer update the RPD).
    :return: None
    """
    df_wd = pd.merge(self.df_wd, self.df_rpd, how='inner', on='formatted company name', suffixes=('', '_'))
    if len(df_wd) > 0:
        df_wd['IPO Date'] = df_wd['IPO Date'].dt.strftime('%Y-%m-%d')
        logger.info(
            f"{len(df_wd)} RPDs to update for withdrawn IPOs: "
            f"{', '.join([str(int(num)) for num in df_wd['RPD Number'].to_list()])}"
        )
        df_wd.replace(np.nan, '', inplace=True)
        for idx, row in df_wd.iterrows():
            rpd = int(row['RPD Number'])
            ipo_html = row[self.rpd_cols].to_frame().to_html(header=False, na_rep='', justify='left')
            comment_endpoint = self.base_url + f'rpd/{rpd}/comments'
            rpd_comment = {'Content': ipo_html}
            res_c = self.session.post(comment_endpoint, data=json.dumps(rpd_comment), headers=self.headers)
            self.df.loc[self.df['RPD Number'] == rpd, 'Status'] = 'Withdrawn'
            self.df.loc[self.df['RPD Number'] == rpd, 'RPD Status'] = 'Resolved'
def create_csv(self, recheck_all: bool = False):
    # create a dataframe of all company names without iconums, including new names found
    df_e = pd.read_excel(self.entity_mapping_file,
                         usecols=['Company Name', 'iconum', 'entity_id', 'mapStatus'])
    df_s = pd.read_excel(os.path.join(os.getcwd(), 'Results', 'All IPOs.xlsx'),
                         usecols=['Company Name', 'Symbol', 'Market'])
    df = pd.merge(df_s, df_e, how='outer', on='Company Name')
    if recheck_all:
        # checking 1. company names that aren't null and 2. don't have an iconum
        df = df.loc[~df['Company Name'].isna() & df['iconum'].isna()]
    else:
        # only checking 1. company names that aren't null, 2. don't have an iconum and 3. haven't been checked yet
        df = df.loc[~df['Company Name'].isna() & df['iconum'].isna() & df['mapStatus'].isna()]
    df = df.drop_duplicates()
    logger.info(f"{len(df)} unmapped entities")
    # making a unique client_id by concatenating company name, symbol and market separated by underscores
    df['client_id'] = df['Company Name'].fillna('') + '_' + df['Symbol'].fillna('').astype(str) + '_' + df['Market'].fillna('')
    df.set_index('client_id', inplace=True)
    # save that dataframe to a csv encoded as utf-8
    if len(df) > 1:
        df.to_csv(self.file, index_label='client_id', encoding='utf-8-sig')
def get_task_status(self, eid, recheck_count: int = 0, max_recheck: int = 12, wait_time: int = 10):
    # get the status of the request, rechecking periodically while the task is still running
    entity_task_status_endpoint = 'https://api.factset.com/content/factset-concordance/v1/entity-task-status'
    status_parameters = {'taskId': str(eid)}
    entity_task_status_response = requests.get(url=entity_task_status_endpoint,
                                               params=status_parameters,
                                               auth=self.authorization,
                                               headers=self.headers,
                                               verify=False)
    entity_task_status_data = json.loads(entity_task_status_response.text)
    task_status = entity_task_status_data['data'][0]['status']
    if task_status in ['PENDING', 'IN-PROGRESS'] and recheck_count < max_recheck:
        recheck_count += 1
        sleep(wait_time)
        # pass max_recheck and wait_time through so non-default values survive the recursion
        return self.get_task_status(eid, recheck_count, max_recheck, wait_time)
    else:
        logger.info(f"Duration for Concordance API {entity_task_status_data['data'][0]['processDuration']}")
        logger.info(f"Decision Rate for Concordance API {entity_task_status_data['data'][0]['decisionRate']}")
        return task_status
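# Illustrative sketch (not part of the pipeline): the same polling pattern written as a loop
# instead of recursion. check_fn and the toy statuses are assumptions used only to show the
# control flow; the real method above polls the Concordance entity-task-status endpoint.
def _poll_until_done(check_fn, max_recheck: int = 12, wait_time: int = 10) -> str:
    """Call check_fn() until it returns a terminal status or max_recheck rechecks are used."""
    from time import sleep
    status = check_fn()
    rechecks = 0
    while status in ('PENDING', 'IN-PROGRESS') and rechecks < max_recheck:
        rechecks += 1
        sleep(wait_time)
        status = check_fn()
    return status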
def main(): logger.info("Comparing external data with data collected internally") dc = DataComparison() try: dc.concatenate_ticker_exchange() dc.file_for_rpds() return dc.compare() except Exception as e: logger.error(e, exc_info=sys.exc_info()) error_email(str(e))
def main(): logger.info("Checking Cordance API for entity IDs") em = EntityMatchBulk() try: em.create_csv() em.entity_mapping_api() except Exception as e: logger.error(e, exc_info=sys.exc_info()) logger.info('-' * 100) error_email(str(e))
def delete_old_files(folder: str, num_days: int = 30) -> list:
    """
    Deletes files older than the number of days given as a parameter. Defaults to deleting files
    more than 30 days old.

    :param folder: folder location that files will be deleted from
    :param num_days: int specifying the number of days before a file is deleted
    :return: list of files that were deleted
    """
    old_date = datetime.utcnow() - timedelta(days=num_days)
    files_deleted = []
    for root, dirs, files in os.walk(folder):
        for file in files:
            f_abs = os.path.join(root, file)
            f_modified = datetime.fromtimestamp(os.path.getmtime(f_abs))
            if f_modified <= old_date:
                os.unlink(f_abs)
                files_deleted.append(file)
    if len(files_deleted) > 0:
        logger.info(f"Deleted {', '.join(files_deleted)}")
    return files_deleted
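# Hypothetical usage sketch: pruning an old-screenshots folder on each run. The folder path and
# retention period below are placeholders for illustration; the real caller decides both.
# If the folder does not exist, os.walk simply yields nothing and no files are removed.
if __name__ == '__main__':
    removed = delete_old_files(os.path.join('Logs', 'Screenshots'), num_days=30)
    print(f"{len(removed)} old file(s) removed")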
def av_api(self):
    try:
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
        # note: both 'function' and 'apikey' are currently read from the AV section's 'funct' option
        parameters = {'function': self.config.get('AV', 'funct'),
                      'apikey': self.config.get('AV', 'funct')}
        r = requests.get(self.config.get('AV', 'base_url'), params=parameters, verify=False)
        # the response comes back as CSV text; split it into rows and cells
        cal = [[cell.replace('\r', '') for cell in row.split(',')] for row in r.text.split('\n')]
        df = pd.DataFrame(cal)
        df.columns = df.loc[0]
        df = df.drop(0).reset_index(drop=True)
        df = df.dropna()
        df.loc[df['name'].str.contains(r' Warrant'), 'assetType'] = 'Warrants'
        df.loc[df['name'].str.contains(r' Right'), 'assetType'] = 'Rights'
        df.loc[df['name'].str.contains(r' Unit'), 'assetType'] = 'Units'
        df['assetType'].fillna('Shares', inplace=True)
        for c in ['priceRangeLow', 'priceRangeHigh']:
            df[c] = pd.to_numeric(df[c], errors='coerce')
        df['time_checked'] = self.time_checked_str
        df.sort_values(by=['ipoDate', 'name'], inplace=True)
        s_file = os.path.join(self.source_data_folder, self.config.get('AV', 'file_name') + '.csv')
        if os.path.exists(s_file):
            df = self.update_existing_data(pd.read_csv(s_file), df, exclude_col='time_checked')
        df.sort_values(by='time_checked', ascending=False, inplace=True)
        df.to_csv(s_file, index=False, encoding='utf-8-sig')
        self.webscraping_results.append([self.time_checked_str, self.config.get('AV', 'file_name'), 1])
    except Exception as e:
        logger.error("ERROR for AV")
        logger.error(e, exc_info=sys.exc_info())
        logger.info('-' * 100)
        self.webscraping_results.append([self.time_checked_str, 'AV', 0])
def asx(self):
    try:
        self.driver.get('https://www2.asx.com.au/listings/upcoming-floats-and-listings')
        soup = self.return_soup()
        listing_info = [co.text.strip() for co in soup.find_all('span', attrs={'class': 'gtm-accordion'})]
        df = pd.DataFrame(listing_info)
        df.columns = ['listing_info']
        df['Company Name'] = df['listing_info'].str.extract(r'^([a-zA-Z0-9\s,\.&]*)\s\-')
        df['IPO Date'] = df['listing_info'].str.extract(r'\s*-\s*(\d{1,2}\s\w*\s\d{2,4})')
        df['IPO Date'] = pd.to_datetime(df['IPO Date'], errors='coerce').dt.date
        df['Market'] = 'Australian Stock Exchange'
        df['time_checked'] = self.time_checked_str
        if df is not None:
            s_file = os.path.join(self.source_data_folder, 'ASX.csv')
            if os.path.exists(s_file):
                df = self.update_existing_data(pd.read_csv(s_file), df, exclude_col='time_checked')
            df.sort_values(by='time_checked', ascending=False, inplace=True)
            df.to_csv(s_file, index=False, encoding='utf-8-sig')
            self.webscraping_results.append([self.time_checked_str, 'ASX', 1])
    except Exception as e:
        logger.error("ERROR for ASX")
        logger.error(e, exc_info=sys.exc_info())
        logger.info('-' * 100)
        error_screenshot_file = f"ASX Error {self.time_checked.strftime('%Y-%m-%d %H%M')}.png"
        self.driver.save_screenshot(os.path.join(log_folder, 'Screenshots', error_screenshot_file))
        self.webscraping_results.append([self.time_checked_str, 'ASX', 0])
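# Illustrative sketch: the two regexes above split strings like "Acme Corp Ltd - 15 June 2024"
# into a company name and a listing date. The sample string is made up for demonstration only.
def _demo_asx_listing_regexes():
    demo = pd.DataFrame({'listing_info': ['Acme Corp Ltd - 15 June 2024']})
    name = demo['listing_info'].str.extract(r'^([a-zA-Z0-9\s,\.&]*)\s\-')       # -> "Acme Corp Ltd"
    date = demo['listing_info'].str.extract(r'\s*-\s*(\d{1,2}\s\w*\s\d{2,4})')  # -> "15 June 2024"
    return name, date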
def email_report(attach_file=None, addtl_message: str = ''):
    """
    Emails the report as an attachment. Email details like the sender and recipients are provided
    in an .ini file which is read by configparser.

    :param attach_file: path of a file or list of files which will be attached to the email
    :param addtl_message: optional string that can be added to the body of the email
    :return: None
    """
    outlook = win32.Dispatch('outlook.application')
    mail = outlook.CreateItem(0)
    mail.To = config.get('Email', 'To')
    mail.Sender = config.get('Email', 'Sender')
    mail.Subject = f"{config.get('Email', 'Subject')} {today_date}"
    mail.HTMLBody = config.get('Email', 'Body') + addtl_message + config.get('Email', 'Signature')
    if isinstance(attach_file, str) and os.path.exists(attach_file):
        mail.Attachments.Add(attach_file)
    elif isinstance(attach_file, list):
        for f in attach_file:
            mail.Attachments.Add(f)
    mail.Send()
    logger.info('Email sent')
def update_existing_data(old_df: pd.DataFrame, new_df: pd.DataFrame, exclude_col=None) -> pd.DataFrame:
    """
    If there is already existing data, this function can be called to remove any duplicates.

    :param old_df: DataFrame with existing data
    :param new_df: DataFrame with new data
    :param exclude_col: Column(s) that will be excluded when removing duplicate values in the DataFrames.
        Can be given either as a list of columns or a string with the column name.
    :return: DataFrame
    """
    try:
        df = pd.concat([old_df, new_df.astype(old_df.dtypes)], ignore_index=True, sort=False)
    except KeyError as ke:
        logger.error(ke)
        logger.info(f"Existing df columns: {', '.join(old_df.columns)}")
        logger.info(f"New df columns: {', '.join(new_df.columns)}")
        # fall back to concatenating without casting so the new data is still kept
        df = pd.concat([old_df, new_df], ignore_index=True, sort=False)
    except ValueError as ve:
        logger.error(ve)
        logger.info(f"Existing df data types: \n{old_df.dtypes.to_string(na_rep='')}")
        logger.info(f"New df data types: \n{new_df.dtypes.to_string(na_rep='')}")
        df = pd.concat([old_df, new_df], ignore_index=True, sort=False)
    if exclude_col and isinstance(exclude_col, str):
        ss = [col for col in df.columns.to_list() if col != exclude_col]
    elif exclude_col and isinstance(exclude_col, list):
        ss = [col for col in df.columns.to_list() if col not in exclude_col]
    else:
        ss = df.columns.to_list()
    # I want to preserve when an item was first added to the website and have the most recent updates at
    # the top, so sorting by most recent time_checked, dropping duplicates for the subset of columns and
    # keeping the last (earliest) occurrence
    if 'time_checked' in df.columns:
        df.sort_values(by='time_checked', ascending=False, inplace=True)
    df.drop_duplicates(subset=ss, keep='last', inplace=True)
    return df
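# Illustrative sketch: a tiny, self-contained demo of the de-duplication behaviour above. The sample
# rows are made up, and it assumes update_existing_data is callable as shown here (in the source it
# may instead live on the scraper class). A re-scraped row keeps its original time_checked while a
# genuinely new row is appended with the new timestamp.
if __name__ == '__main__':
    _old = pd.DataFrame({'Company Name': ['Acme Corp'],
                         'Symbol': ['ACME'],
                         'time_checked': ['2023-01-01 09:00']})
    _new = pd.DataFrame({'Company Name': ['Acme Corp', 'Newco Ltd'],
                         'Symbol': ['ACME', 'NEW'],
                         'time_checked': ['2023-01-02 09:00', '2023-01-02 09:00']})
    # Acme Corp appears once with the 2023-01-01 timestamp; Newco Ltd is added with the new timestamp
    print(update_existing_data(_old, _new, exclude_col='time_checked'))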
def entity_mapping_api(self):
    if os.path.exists(self.file):
        # create the request for the Concordance API
        entity_task_endpoint = 'https://api.factset.com/content/factset-concordance/v1/entity-task'
        entity_task_request = {
            'taskName': self.file_name,
            'clientIdColumn': 'client_id',
            'nameColumn': 'Company Name',
            'includeEntityType': ['PUB', 'PVT', 'HOL', 'SUB'],
            'uniqueMatch': True
        }
        with open(self.file, 'rb') as f:
            file_data = {'inputFile': (self.file_name + '.csv', f, 'text/csv')}
            entity_task_response = requests.post(url=entity_task_endpoint,
                                                 data=entity_task_request,
                                                 auth=self.authorization,
                                                 files=file_data,
                                                 headers=self.headers)
        assert entity_task_response.ok, f"{entity_task_response.status_code} - {entity_task_response.text}"
        # temporarily saving the entity task response to look into errors
        # (getting "Bad Request - Number of elements in the header doesn't match the total number of columns")
        with open(os.path.join(log_folder, 'Concordance API Responses', f"API response for {self.file_name}.txt"),
                  'w', encoding='utf8') as f:
            json.dump(entity_task_response.text, f, ensure_ascii=False)
        if entity_task_response.text is not None and entity_task_response.text != '':
            entity_task_data = json.loads(entity_task_response.text)
            eid = entity_task_data['data']['taskId']
            task_name = entity_task_data['data']['taskName']  # will be the file_name provided in the entity task request
            logger.info(f"Entity mapping request submitted - task ID {eid} - task name {task_name}")
            task_status = self.get_task_status(eid)
            logger.info(f"Task {eid} status - {task_status}")
            if task_status == 'SUCCESS':
                df_result = self.get_entity_decisions(eid)
                self.formatting_and_saving(df_result)
    else:
        logger.info(f"File not found - {self.file}")
def tkipo(self):
    try:
        self.driver.get('http://www.tokyoipo.com/top/iposche/index.php?j_e=E')
        soup = self.return_soup()
        table = soup.find('table', attrs={'class': 'iposchedulelist'})
        table_data = []
        row = []
        for r in table.find_all('tr'):
            for cell in r.find_all('td'):
                cell_text = cell.text.strip()
                # the "Stock/Chart" cell marks the start of a new company row
                if '\n\n▶\xa0Stock/Chart' in cell_text:
                    table_data.append(row)
                    row = [cell_text.replace('\n\n▶\xa0Stock/Chart', '')]
                else:
                    row.append(cell_text)
        table_data.append(row)
        df = pd.DataFrame(table_data)
        df.columns = ['Company Name', 'IPO Date', 'Symbol', 'Listed Shares', 'Blank_0', 'Price Range', 'Price',
                      'Book Building Period', 'Opening Price', 'Change', 'Lead Underwriter',
                      'Business Description', 'Blank_1']
        df.replace('', np.nan, inplace=True)
        df.dropna(how='all', inplace=True)
        df.drop(columns=['Blank_0', 'Business Description', 'Blank_1'], inplace=True, errors='ignore')
        df['Company Name'] = df['Company Name'].str.strip()
        df['Price Range Expected Date'] = df['Price Range'].str.extract(r'^(\d{0,2}\/\d{0,2})$')
        df['Price Expected Date'] = df['Price'].str.extract(r'^(\d{0,2}\/\d{0,2})$')
        df['Price'] = pd.to_numeric(df['Price'].str.replace(',', ''), errors='coerce')
        # the date is provided as mm/dd, adding the current year to make the date formatted as mm/dd/yyyy
        df['IPO Date'] = df['IPO Date'] + f"/{datetime.now().year}"
        df['IPO Date'] = pd.to_datetime(df['IPO Date'], errors='coerce').dt.date
        # at the beginning of the year, the calendar will still show IPOs from last year, and adding the
        # current year to that previous date would be incorrect. Those incorrect dates will be 6+ months
        # away, and we shouldn't see legitimate IPO dates that far in advance, so if the IPO date is more
        # than 6 months away, subtract 1 year from the IPO date.
        df.loc[df['IPO Date'] > (pd.to_datetime('today') + pd.offsets.DateOffset(months=6)),
               'IPO Date'] = df['IPO Date'] - pd.offsets.DateOffset(years=1)
        df['Market'] = 'Japan Stock Exchange' + ' - ' + df['Symbol'].str.extract(r'\((\w*)\)')
        df['Symbol'] = df['Symbol'].str.replace(r'(\(\w*\))', '', regex=True)
        df['time_checked'] = self.time_checked_str
        if df is not None:
            s_file = os.path.join(self.source_data_folder, 'TokyoIPO.csv')
            if os.path.exists(s_file):
                df = self.update_existing_data(pd.read_csv(s_file), df, exclude_col='time_checked')
            df.sort_values(by='time_checked', ascending=False, inplace=True)
            df.to_csv(s_file, index=False, encoding='utf-8-sig')
            self.webscraping_results.append([self.time_checked_str, 'TokyoIPO', 1])
    except Exception as e:
        logger.error("ERROR for TokyoIPO")
        logger.error(e, exc_info=sys.exc_info())
        logger.info('-' * 100)
        error_screenshot_file = f"TokyoIPO Error {self.time_checked.strftime('%Y-%m-%d %H%M')}.png"
        self.driver.save_screenshot(os.path.join(log_folder, 'Screenshots', error_screenshot_file))
        self.webscraping_results.append([self.time_checked_str, 'TokyoIPO', 0])
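# Illustrative sketch of the year-rollover rule above: the calendar only gives mm/dd, so the current
# year is appended, and any resulting date more than six months in the future is assumed to belong to
# the previous year. This standalone helper is hypothetical; the pipeline applies the same idea with
# vectorised pandas operations on the 'IPO Date' column.
def _resolve_mm_dd(mm_dd: str, today=None):
    import pandas as pd
    today = pd.Timestamp('today') if today is None else today
    candidate = pd.to_datetime(f"{mm_dd}/{today.year}", format='%m/%d/%Y', errors='coerce')
    if pd.notna(candidate) and candidate > today + pd.offsets.DateOffset(months=6):
        candidate -= pd.offsets.DateOffset(years=1)
    return candidate


# e.g. with today in early January, _resolve_mm_dd('12/20') falls back to December of the previous
# year, while _resolve_mm_dd('02/15') stays in the current year.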
def create_new_rpds(self) -> dict:
    """
    Creates new RPDs for all the IPOs that currently do not have an RPD.

    :return: Dictionary with data about the new RPDs created
    """
    endpoint = self.base_url + 'rpd'
    rpd_dict = defaultdict(list)
    df_rpd = self.df.copy()
    # filtering for only IPOs that do not have an RPD Number
    df_rpd = df_rpd.loc[df_rpd['RPD Number'].isna()]
    df_rpd['IPO Date'] = df_rpd['IPO Date'].dt.strftime('%Y-%m-%d')
    df_rpd.replace(np.nan, '', inplace=True)
    for idx, row in df_rpd.iterrows():
        company_name = str(row['Company Name'])
        exchange = str(row['Market'])
        fds_cusip = str(row['CUSIP'])
        ipo_date = str(row['IPO Date'])
        ticker = str(row['Symbol'])
        ipo_html = row[self.rpd_cols].to_frame().to_html(header=False, na_rep='', justify='left')
        rpd_request = {
            'Title': f"{company_name} - {exchange}",
            'Products': [{'Id': '106317'}],
            'Content': ipo_html,
            'Type': 'EnhancementRequest',
            'Priority': 'Medium',
            'Severity': 'Medium',
            'Questions': [
                {'Id': 31407, 'Answers': [{'AnswerValue': fds_cusip}]},
                {'Id': 31405, 'Answers': [{'AnswerValue': ipo_date}]},
                {'Id': 31406, 'Answers': [{'AnswerValue': exchange}]},
                {'Id': 31408, 'Answers': [{'AnswerValue': ticker}]}
            ]
        }
        res = self.session.post(url=endpoint, data=json.dumps(rpd_request), headers=self.headers)
        if res.ok:
            rpd_num = res.headers['X-IS-ID']
            rpd_date = res.headers['Date']
            # rpd_api_link = res.headers['Location']
            rpd_dict['Company Name'].append(company_name)
            rpd_dict['RPD Number'].append(rpd_num)
            rpd_dict['RPD Link'].append('https://is.factset.com/rpd/summary.aspx?messageId=' + str(rpd_num))
            rpd_dict['RPD Creation Date'].append(rpd_date)
            rpd_dict['RPD Status'].append('Pending')
        sleep(1)
    logger.info(
        f"Created {len(rpd_dict['RPD Number'])} new RPDs: "
        f"{', '.join([str(num) for num in rpd_dict['RPD Number']])}"
    )
    return rpd_dict
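# Illustrative sketch: how one IPO row is rendered into the HTML table used as the RPD content and
# comments in the methods above. The column list and row values are hypothetical stand-ins for
# self.rpd_cols and a real row from the data frame.
def _demo_rpd_html():
    cols = ['Company Name', 'Symbol', 'Market', 'IPO Date']  # hypothetical subset of rpd_cols
    row = pd.Series({'Company Name': 'Acme Corp', 'Symbol': 'ACME',
                     'Market': 'New York Stock Exchange', 'IPO Date': '2023-06-01'})
    return row[cols].to_frame().to_html(header=False, na_rep='', justify='left')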
def update_rpds(self):
    """
    Updating existing RPDs when either the IPO Date, CUSIP or Symbol has changed.
    The data is merged so that I can see what data has changed (if any).
    :return: None
    """
    df_rpd = pd.merge(self.df_ipo, self.df_rpd, how='left', on='formatted company name', suffixes=('', '_old'))
    df_rpd = df_rpd.loc[(df_rpd['RPD Number'].notna()) & (df_rpd['RPD Status'] != 'Resolved')]
    # only make one update per RPD using the latest information
    df_rpd.sort_values(by=['Last Checked'], ascending=False, inplace=True)
    df_rpd.drop_duplicates(subset='RPD Number', inplace=True, ignore_index=True)
    # compare the data
    df_rpd['IPO Date Comparison'] = df_rpd['IPO Date'] == df_rpd['IPO Date_old']
    df_rpd['Market Comparison'] = df_rpd['Market'] == df_rpd['Market_old']
    df_rpd['CUSIP Comparison'] = df_rpd['CUSIP'] == df_rpd['CUSIP_old']
    df_rpd.loc[df_rpd['CUSIP'].isna(), 'CUSIP Comparison'] = True
    df_rpd['Symbol Comparison'] = df_rpd['Symbol'] == df_rpd['Symbol_old']
    df_rpd.loc[df_rpd['Symbol'].isna(), 'Symbol Comparison'] = True
    # filter for only updated data (i.e. where a comparison is False)
    df_rpd = df_rpd.loc[(~df_rpd['IPO Date Comparison'])
                        | (~df_rpd['CUSIP Comparison'])
                        | (~df_rpd['Symbol Comparison'])
                        # | (~df_rpd['Market Comparison'])
                        ]
    # update the main data frame with the updated data to prevent making the same comment multiple times
    # note this will only update when IPO Date, CUSIP or Symbol have changed. Other changes won't be added.
    df_rpd = df_rpd[['iconum', 'CUSIP', 'Company Name', 'Symbol', 'Market', 'IPO Date', 'Price', 'Price Range',
                     'Status', 'Notes', 'Last Checked', 'IPO Deal ID', 'formatted company name', 'RPD Number',
                     'RPD Link', 'RPD Creation Date', 'RPD Status']]
    self.df = pd.concat([self.df, df_rpd], ignore_index=True).drop_duplicates(
        subset=['RPD Number', 'formatted company name'], keep='last')
    logger.info(
        f"{len(df_rpd)} updates to make on existing RPDs: "
        f"{', '.join([str(int(num)) for num in df_rpd['RPD Number'].to_list()])}"
    )
    df_rpd['IPO Date'] = df_rpd['IPO Date'].dt.strftime('%Y-%m-%d')
    df_rpd.replace(np.nan, '', inplace=True)
    for idx, row in df_rpd.iterrows():
        rpd = int(row['RPD Number'])
        rpd_status = self.get_rpd_status(rpd)
        # update the main data frame with the status
        self.df.loc[self.df['RPD Number'] == rpd, 'RPD Status'] = rpd_status
        if rpd_status == 'Resolved':
            rpd_resolution = self.get_rpd_resolution(rpd)
            # if the RPD is resolved and not a duplicate, dupe_rpd will be None
            dupe_rpd = rpd_resolution.get('DuplicateRPD')
            if dupe_rpd:
                # if this RPD was resolved as a duplicate RPD, update the main data frame with the new RPD number
                # not updating comments of the dupe RPD since that should already be done in the row for that RPD
                # blank the status and set the link before overwriting the RPD Number, so the updates apply to this row
                self.df.loc[self.df['RPD Number'] == rpd, 'RPD Link'] = \
                    'https://is.factset.com/rpd/summary.aspx?messageId=' + str(dupe_rpd)
                self.df.loc[self.df['RPD Number'] == rpd, 'RPD Status'] = ''
                self.df.loc[self.df['RPD Number'] == rpd, 'RPD Number'] = dupe_rpd
        else:
            # only adding comments to RPDs that have not been resolved (will still add comments to completed RPDs)
            fds_cusip = str(row['CUSIP'])
            ipo_date = str(row['IPO Date'])
            ticker = str(row['Symbol'])
            exchange = str(row['Market'])
            ipo_html = row[self.rpd_cols].to_frame().to_html(header=False, na_rep='', justify='left')
            comment_endpoint = self.base_url + f'rpd/{rpd}/comments'
            rpd_comment = {'Content': ipo_html}
            res_c = self.session.post(comment_endpoint, data=json.dumps(rpd_comment), headers=self.headers)
            question_endpoint = self.base_url + f'rpd/{rpd}/questions'
            questions = [
                {'Id': 31407, 'Answers': [{'AnswerValue': fds_cusip}]},
                {'Id': 31405, 'Answers': [{'AnswerValue': ipo_date}]},
                {'Id': 31406, 'Answers': [{'AnswerValue': exchange}]},
                {'Id': 31408, 'Answers': [{'AnswerValue': ticker}]}
            ]
            res_q = self.session.post(question_endpoint, data=json.dumps(questions), headers=self.headers)
from logging_ipo_dates import logger, consolidate_webscraping_results
import source_reference
import website_scraping
import data_transformation
import entity_mapping
import data_comparison
import file_management
import rpd_creation

logger.info('-' * 100)
source_reference.main()
website_scraping.main()
data_transformation.main()
entity_mapping.main()
df_summary = data_comparison.main()
# Note: email_report is called separately with another batch file because the run schedule is different
rpd_creation.main()
consolidate_webscraping_results()
file_management.main()
logger.info('-' * 100)