Code Example #1
def main():
    wd = WebDriver()
    wd.random_wait()
    logger.info("Gathering data from sources")
    for k, v in wd.sources_dict.items():
        try:
            wd.load_url(v.get('url'), sleep_after=True)
            df = wd.parse_table(**v)
            if df is not None:
                s_file = os.path.join(wd.source_data_folder,
                                      v.get('file') + '.csv')
                if os.path.exists(s_file):
                    df = wd.update_existing_data(pd.read_csv(s_file),
                                                 df,
                                                 exclude_col='time_checked')
                df.sort_values(by='time_checked',
                               ascending=False,
                               inplace=True)
                df.to_csv(s_file, index=False, encoding='utf-8-sig')
                wd.webscraping_results.append([wd.time_checked_str, k, 1])
        except Exception as e:
            logger.error(f"ERROR for {k}")
            logger.error(e, exc_info=sys.exc_info())
            logger.info('-' * 100)
            error_screenshot_file = f"{k} Error {wd.time_checked.strftime('%Y-%m-%d %H%M')}.png"
            wd.driver.save_screenshot(
                os.path.join(log_folder, 'Screenshots', error_screenshot_file))
            wd.webscraping_results.append([wd.time_checked_str, k, 0])
    wd.asx()
    wd.tkipo()
    wd.close_driver()
    wd.av_api()
    wd.save_webscraping_results()
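
The loop in `main` is driven by `wd.sources_dict`, which is not shown. Judging from the `v.get('url')` / `v.get('file')` lookups and the `wd.parse_table(**v)` call, it presumably maps each source name to the keyword arguments for `parse_table`; a hypothetical entry might look like:

# hypothetical shape of sources_dict, inferred from the loop above
sources_dict = {
    'NASDAQ': {
        'url': 'https://www.nasdaq.com/market-activity/ipos',  # hypothetical URL
        'file': 'NASDAQ',  # name of the CSV written under source_data_folder
        # ...plus whatever other keyword arguments parse_table expects
    },
}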
Code Example #2
def main():
    logger.info("Combining all the data from external sources together")
    dt = DataTransformation()
    try:
        dt.us()
        dt.jpx()
        dt.cn()
        dt.euronext()
        dt.aastocks()
        dt.lse()
        dt.ca()
        dt.frankfurt()
        dt.krx()
        dt.asx()
        dt.twse()
        dt.bme()
        dt.sgx()
        dt.idx()
        dt.bm()
        dt.nasdaqnordic()
        dt.spotlight()
        dt.italy()
    except Exception as e:
        logger.error(e, exc_info=sys.exc_info())
        error_email(str(e))
    finally:
        dt.formatting_all()
        dt.save_all()
Code Example #3
def file_for_rpds(self):
    """
    Creates a file that will be used to create RPDs for the Symbology and Fundamentals teams.
    This will have all IPOs, both from external sources and collected internally.
    :return: None
    """
    pp_cols = [
        'iconum', 'CUSIP', 'Company Name', 'client_deal_id', 'ticker',
        'exchange', 'Price', 'trading_date', 'last_updated_date_utc'
    ]
    df_outer = pd.merge(self.merge_entity_data(),
                        self.df_pp[pp_cols],
                        how='outer',
                        on='iconum',
                        suffixes=('_external', '_fds'))
    for c in ['IPO Date', 'trading_date']:
        try:
            df_outer[c] = pd.to_datetime(df_outer[c].fillna(pd.NaT),
                                         errors='coerce')
        except Exception as e:
            logger.error(f"{c} - {e}")
    df_ipo = df_outer.loc[
        (df_outer['IPO Date'].dt.date >= date.today())
        | (df_outer['trading_date'].dt.date >= date.today())]
    # to_excel no longer accepts an encoding argument (removed in pandas 2.0)
    df_ipo.to_excel(os.path.join(self.ref_folder, 'IPO Monitoring Data.xlsx'),
                    index=False)
    df_wd = df_outer.loc[df_outer['Status'] == 'Withdrawn']
    df_wd.to_excel(os.path.join(self.ref_folder, 'Withdrawn IPOs.xlsx'),
                   index=False)
Code Example #4
def main():
    try:
        for folder in [os.path.join(os.getcwd(), 'Reference', 'Entity Mapping Requests'),
                       os.path.join(os.getcwd(), 'Logs', 'Screenshots'),
                       os.path.join(os.getcwd(), 'Logs', 'Concordance API Responses')]:
            delete_old_files(folder)
        # archive the log file once, after the per-folder cleanup; calling this
        # inside the loop would fail on later iterations once the log has moved
        archive_logs()
    except Exception as e:
        logger.error(e, exc_info=sys.exc_info())
Code Example #5
def main():
    try:
        rpd = RPDCreation()
        rpd.update_withdrawn_ipos()
        rpd.update_rpds()
        rpd.add_new_rpds()
        rpd.save_results()
    except Exception as e:
        logger.error(e, exc_info=sys.exc_info())
        error_email(str(e))
Code Example #6
def main():
    logger.info("Comparing external data with data collected internally")
    dc = DataComparison()
    try:
        dc.concatenate_ticker_exchange()
        dc.file_for_rpds()
        return dc.compare()
    except Exception as e:
        logger.error(e, exc_info=sys.exc_info())
        error_email(str(e))
Code Example #7
def main():
    logger.info("Checking Cordance API for entity IDs")
    em = EntityMatchBulk()
    try:
        em.create_csv()
        em.entity_mapping_api()
    except Exception as e:
        logger.error(e, exc_info=sys.exc_info())
        logger.info('-' * 100)
        error_email(str(e))
Code Example #8
def archive_logs(num_days: int = 30):
    current_log_file = os.path.join(log_folder, log_file)
    with open(current_log_file, 'r') as f:
        all_lines = f.readlines()
    if not all_lines:  # nothing to archive in an empty log file
        return
    key_dates = {'first_log_date': return_date_str(all_lines[0]),
                 'last_log_date': return_date_str(all_lines[-1])}
    old_date = datetime.utcnow() - timedelta(days=num_days)
    if key_dates.get('first_log_date') is not None:
        first_log_date_datetime = datetime.strptime(key_dates.get('first_log_date'), '%Y-%m-%d')
        if first_log_date_datetime < old_date:
            log_file_name, log_file_ext = os.path.splitext(log_file)
            archived_log = f"{log_file_name} {key_dates.get('first_log_date')} - {key_dates.get('last_log_date', datetime.today().strftime('%Y-%m-%d'))}{log_file_ext}"
            # TODO: this will fail with PermissionError
            #  The process cannot access the file because it is being used by another process
            # os.rename(current_log_file, os.path.join(log_folder, 'Previous Logs', archived_log))
            try:
                shutil.move(current_log_file, os.path.join(log_folder, 'Previous Logs', archived_log))
            except PermissionError as e:
                logger.error(e)
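
The helper `return_date_str` used above is not shown. A minimal sketch of what it might look like, assuming each log record begins with an ISO `%Y-%m-%d` timestamp (the exact log format is an assumption):

import re
from typing import Optional

DATE_PATTERN = re.compile(r'^(\d{4}-\d{2}-\d{2})')

def return_date_str(line: str) -> Optional[str]:
    """Extract a YYYY-MM-DD date string from the start of a log line.
    Returns None for lines that don't start with a date, such as
    traceback continuation lines."""
    match = DATE_PATTERN.match(line)
    return match.group(1) if match else None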
Code Example #9
# NOTE: call sites invoke this via an instance but pass both DataFrames
# explicitly, so inside the class this is presumably a @staticmethod
def update_existing_data(old_df: pd.DataFrame,
                         new_df: pd.DataFrame,
                         exclude_col=None) -> pd.DataFrame:
    """
    If there is already existing data, this function can be called to remove any duplicates.
    :param old_df: DataFrame with existing data
    :param new_df: DataFrame with new data
    :param exclude_col: Column(s) that will be excluded when removing duplicate values in DataFrames.
                        Can be given either as a list of columns or a string with the column name.
    :return: DataFrame
    """
    try:
        df = pd.concat([old_df, new_df.astype(old_df.dtypes)],
                       ignore_index=True,
                       sort=False)
    except KeyError as ke:
        logger.error(ke)
        logger.info(f"Existing df columns: {', '.join(old_df.columns)}")
        logger.info(f"New df columns: {', '.join(new_df.columns)}")
        # fall back to a plain concat so df is always bound; the astype above
        # is what raises KeyError when the column sets don't match
        df = pd.concat([old_df, new_df], ignore_index=True, sort=False)
    except ValueError as ve:
        logger.error(ve)
        logger.info(
            f"Existing df data types: \n{old_df.dtypes.to_string(na_rep='')}")
        logger.info(
            f"New df data types: \n{new_df.dtypes.to_string(na_rep='')}")
        df = pd.concat([old_df, new_df], ignore_index=True, sort=False)
    if exclude_col and isinstance(exclude_col, str):
        ss = [col for col in df.columns.to_list() if col != exclude_col]
    elif exclude_col and isinstance(exclude_col, list):
        ss = [col for col in df.columns.to_list() if col not in exclude_col]
    else:
        ss = df.columns.to_list()
    # Preserve when an item was first added to the website while keeping the most
    # recent updates at the top: sort by time_checked descending, then drop
    # duplicates over the subset of columns, keeping the last (earliest) occurrence.
    if 'time_checked' in df.columns:
        df.sort_values(by='time_checked', ascending=False, inplace=True)
    df.drop_duplicates(subset=ss, keep='last', inplace=True)
    return df
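
A quick illustration of the dedup behavior with hypothetical data (ABC appears in both frames with identical values apart from `time_checked`):

old = pd.DataFrame({'Symbol': ['ABC'], 'Price': [10.0],
                    'time_checked': ['2023-01-01 09:00']})
new = pd.DataFrame({'Symbol': ['ABC', 'XYZ'], 'Price': [10.0, 5.0],
                    'time_checked': ['2023-01-02 09:00', '2023-01-02 09:00']})
combined = update_existing_data(old, new, exclude_col='time_checked')
# ABC keeps only its original 2023-01-01 row (preserving when it was first
# seen); XYZ is appended as a genuinely new row.
print(combined)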
Code Example #10
def av_api(self):
    try:
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
        parameters = {
            'function': self.config.get('AV', 'funct'),
            # assumption: the API key lives under its own config key; the
            # original read 'funct' here as well, which looks like a copy-paste slip
            'apikey': self.config.get('AV', 'apikey')
        }
        r = requests.get(self.config.get('AV', 'base_url'),
                         params=parameters,
                         verify=False)
        cal = [[cell.replace('\r', '') for cell in row.split(',')]
               for row in r.text.split('\n')]
        df = pd.DataFrame(cal)
        df.columns = df.loc[0]
        df = df.drop(0).reset_index(drop=True)
        df = df.dropna()
        df.loc[df['name'].str.contains(r' Warrant'), 'assetType'] = 'Warrants'
        df.loc[df['name'].str.contains(r' Right'), 'assetType'] = 'Rights'
        df.loc[df['name'].str.contains(r' Unit'), 'assetType'] = 'Units'
        df['assetType'] = df['assetType'].fillna('Shares')
        for c in ['priceRangeLow', 'priceRangeHigh']:
            df[c] = pd.to_numeric(df[c], errors='coerce')
        df['time_checked'] = self.time_checked_str
        df.sort_values(by=['ipoDate', 'name'], inplace=True)
        s_file = os.path.join(self.source_data_folder,
                              self.config.get('AV', 'file_name') + '.csv')
        if os.path.exists(s_file):
            df = self.update_existing_data(pd.read_csv(s_file),
                                           df,
                                           exclude_col='time_checked')
        df.sort_values(by='time_checked', ascending=False, inplace=True)
        df.to_csv(s_file, index=False, encoding='utf-8-sig')
        self.webscraping_results.append(
            [self.time_checked_str,
             self.config.get('AV', 'file_name'), 1])
    except Exception as e:
        logger.error("ERROR for AV")
        logger.error(e, exc_info=sys.exc_info())
        logger.info('-' * 100)
        self.webscraping_results.append([self.time_checked_str, 'AV', 0])
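
Since the endpoint returns CSV text, the manual split/replace parsing above could also be delegated to pandas' CSV reader; a minimal sketch using the same `r` response object:

import io

# pandas handles the header row, \r\n line endings, and empty trailing
# lines directly from the response body
df = pd.read_csv(io.StringIO(r.text))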
Code Example #11
def asx(self):
    try:
        self.driver.get(
            'https://www2.asx.com.au/listings/upcoming-floats-and-listings')
        soup = self.return_soup()
        listing_info = [
            co.text.strip()
            for co in soup.find_all('span', attrs={'class': 'gtm-accordion'})
        ]
        df = pd.DataFrame(listing_info)
        df.columns = ['listing_info']
        df['Company Name'] = df['listing_info'].str.extract(
            r'^([a-zA-Z0-9\s,\.&]*)\s\-')
        df['IPO Date'] = df['listing_info'].str.extract(
            r'\s*-\s*(\d{1,2}\s\w*\s\d{2,4})')
        df['IPO Date'] = pd.to_datetime(df['IPO Date'],
                                        errors='coerce').dt.date
        df['Market'] = 'Australian Stock Exchange'
        df['time_checked'] = self.time_checked_str
        # df is always a DataFrame here, so check for content rather than None
        if not df.empty:
            s_file = os.path.join(self.source_data_folder, 'ASX.csv')
            if os.path.exists(s_file):
                df = self.update_existing_data(pd.read_csv(s_file),
                                               df,
                                               exclude_col='time_checked')
            df.sort_values(by='time_checked', ascending=False, inplace=True)
            df.to_csv(s_file, index=False, encoding='utf-8-sig')
            self.webscraping_results.append([self.time_checked_str, 'ASX', 1])
    except Exception as e:
        logger.error("ERROR for ASX")
        logger.error(e, exc_info=sys.exc_info())
        logger.info('-' * 100)
        error_screenshot_file = f"ASX Error {self.time_checked.strftime('%Y-%m-%d %H%M')}.png"
        self.driver.save_screenshot(
            os.path.join(log_folder, 'Screenshots', error_screenshot_file))
        self.webscraping_results.append([self.time_checked_str, 'ASX', 0])
Code Example #12
def tkipo(self):
    try:
        self.driver.get('http://www.tokyoipo.com/top/iposche/index.php?j_e=E')
        soup = self.return_soup()
        table = soup.find('table', attrs={'class': 'iposchedulelist'})
        table_data = []
        row = []
        for r in table.find_all('tr'):
            for cell in r.find_all('td'):
                cell_text = cell.text.strip()
                # each company's block starts with a "Stock/Chart" link cell
                if '\n\n▶\xa0Stock/Chart' in cell_text:
                    table_data.append(row)
                    row = [cell_text.replace('\n\n▶\xa0Stock/Chart', '')]
                else:
                    row.append(cell_text)
        table_data.append(row)
        df = pd.DataFrame(table_data)
        df.columns = [
            'Company Name', 'IPO Date', 'Symbol', 'Listed Shares',
            'Blank_0', 'Price Range', 'Price', 'Book Building Period',
            'Opening Price', 'Change', 'Lead Underwriter',
            'Business Description', 'Blank_1'
        ]
        df.replace('', np.nan, inplace=True)
        df.dropna(how='all', inplace=True)
        df.drop(columns=['Blank_0', 'Business Description', 'Blank_1'],
                inplace=True,
                errors='ignore')
        df['Company Name'] = df['Company Name'].str.strip()
        df['Price Range Expected Date'] = df['Price Range'].str.extract(
            r'^(\d{0,2}\/\d{0,2})$')
        df['Price Expected Date'] = df['Price'].str.extract(
            r'^(\d{0,2}\/\d{0,2})$')
        df['Price'] = pd.to_numeric(df['Price'].str.replace(',', ''),
                                    errors='coerce')
        # dates are provided as mm/dd; append the current year to get mm/dd/yyyy
        df['IPO Date'] = df['IPO Date'] + f"/{datetime.now().year}"
        df['IPO Date'] = pd.to_datetime(df['IPO Date'], errors='coerce')
        # At the beginning of the year the calendar still shows IPOs from last
        # year, and appending the current year to those dates is incorrect.
        # Those bad dates land 6+ months in the future, further out than
        # legitimate IPO dates are announced, so subtract one year from any
        # IPO date more than 6 months away.
        df.loc[df['IPO Date'] >
               (pd.to_datetime('today') + pd.offsets.DateOffset(months=6)),
               'IPO Date'] = df['IPO Date'] - pd.offsets.DateOffset(years=1)
        # convert to date only after the comparison/offset arithmetic above
        df['IPO Date'] = df['IPO Date'].dt.date
        df['Market'] = 'Japan Stock Exchange' + ' - ' + df['Symbol'].str.extract(
            r'\((\w*)\)', expand=False)
        df['Symbol'] = df['Symbol'].str.replace(r'(\(\w*\))', '', regex=True)
        df['time_checked'] = self.time_checked_str
        if not df.empty:
            s_file = os.path.join(self.source_data_folder, 'TokyoIPO.csv')
            if os.path.exists(s_file):
                df = self.update_existing_data(pd.read_csv(s_file),
                                               df,
                                               exclude_col='time_checked')
            df.sort_values(by='time_checked', ascending=False, inplace=True)
            df.to_csv(s_file, index=False, encoding='utf-8-sig')
            self.webscraping_results.append(
                [self.time_checked_str, 'TokyoIPO', 1])
    except Exception as e:
        logger.error("ERROR for TokyoIPO")
        logger.error(e, exc_info=sys.exc_info())
        logger.info('-' * 100)
        error_screenshot_file = f"TokyoIPO Error {self.time_checked.strftime('%Y-%m-%d %H%M')}.png"
        self.driver.save_screenshot(
            os.path.join(log_folder, 'Screenshots', error_screenshot_file))
        self.webscraping_results.append([self.time_checked_str, 'TokyoIPO', 0])
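
The year-rollback rule is easiest to see on concrete dates; a standalone sketch with hypothetical values:

# scraped "12/28" in early January becomes 2024-12-28, which is more than
# 6 months away, so it's rolled back a year; "02/15" stays as-is
dates = pd.to_datetime(pd.Series(['12/28/2024', '02/15/2024']))
today = pd.Timestamp('2024-01-05')
mask = dates > today + pd.offsets.DateOffset(months=6)
dates.loc[mask] = dates - pd.offsets.DateOffset(years=1)
print(dates.dt.date.tolist())  # -> 2023-12-28 and 2024-02-15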
Code Example #13
def compare(self):
    df_m = pd.merge(self.merge_entity_data(),
                    self.df_pp,
                    how='left',
                    on='iconum',
                    suffixes=('_external', '_fds'))
    df_m.drop_duplicates(inplace=True)
    for c in [col for col in df_m.columns if 'date' in col.lower()]:
        # intermittently getting InvalidIndexError: Reindexing only valid
        # with uniquely valued Index objects
        try:
            df_m[c] = pd.to_datetime(df_m[c].fillna(pd.NaT),
                                     errors='coerce').dt.date
        except Exception as e:
            logger.error(f"{c} - {e}")
    df_m['IPO Dates Match'] = df_m['IPO Date'] == df_m['trading_date']
    df_m['IPO Prices Match'] = df_m['Price_external'] == df_m['Price_fds']
    df_m.loc[df_m['Price_external'].isna(), 'IPO Prices Match'] = True
    df_m = df_m[[
        'IPO Dates Match', 'IPO Prices Match', 'iconum',
        'Company Name_external', 'Symbol', 'Market', 'IPO Date',
        'Price_external', 'Price Range', 'Status', 'Notes', 'time_checked',
        'Company Name_fds', 'master_deal', 'client_deal_id', 'CUSIP',
        'ticker', 'exchange', 'Price_fds', 'min_offering_price',
        'max_offering_price', 'announcement_date', 'pricing_date',
        'trading_date', 'closing_date', 'deal_status', 'last_updated_date_utc'
    ]]
    df_m.drop_duplicates(inplace=True)
    # take a copy so the renames below don't trigger SettingWithCopyWarning
    df_summary = df_m[[
        'Company Name_external', 'iconum', 'master_deal', 'IPO Date',
        'Symbol', 'Market', 'Price_external', 'IPO Dates Match',
        'IPO Prices Match'
    ]].copy()
    df_summary.rename(columns={
        'Company Name_external': 'Company Name',
        'Price_external': 'Price'
    },
                      inplace=True)
    df_summary.drop_duplicates(inplace=True)
    df_summary = df_summary.loc[df_summary['IPO Date'] >= date.today()]
    df_summary.sort_values('IPO Date', inplace=True)
    # to_excel no longer accepts an encoding argument (removed in pandas 2.0)
    with pd.ExcelWriter(os.path.join(self.results_folder,
                                     'IPO Monitoring.xlsx')) as writer:
        df_m.to_excel(writer,
                      sheet_name='Comparison',
                      index=False,
                      freeze_panes=(1, 0))
        self.df_s.to_excel(writer,
                           sheet_name='Upcoming IPOs - External',
                           index=False,
                           freeze_panes=(1, 0))
        self.df_pp.to_excel(writer,
                            sheet_name='PEO-PIPE IPO Data',
                            index=False,
                            freeze_panes=(1, 0))
        df_summary.to_excel(writer,
                            sheet_name='Summary',
                            index=False,
                            freeze_panes=(1, 0))
    return df_summary
Code Example #14
def main(file_attachment: str, addtl_message: str = ''):
    try:
        email_report(attach_file=file_attachment, addtl_message=addtl_message)
    except Exception as e:
        logger.error(e, exc_info=sys.exc_info())
        error_email(str(e))