def main():
    logger.info('Starting historical transaction executive compensation backfill.')

    parser = argparse.ArgumentParser(description='Backfill historical executive compensation data for transactions.')
    parser.add_argument('-k', '--ssh_key', help='private key used to access the API remotely', required=True)
    args = parser.parse_args()

    root_dir = CONFIG_BROKER['d_file_storage_path']
    client = get_client(ssh_key=args.ssh_key)
    sftp = client.open_sftp()
    # dirlist on remote host
    dirlist = sftp.listdir(REMOTE_SAM_EXEC_COMP_DIR)

    sorted_monthly_file_names = sorted([monthly_file for monthly_file in dirlist
                                        if re.match(r'.*MONTHLY_\d+', monthly_file)])

    sess = GlobalDB.db().session

    for monthly_file in sorted_monthly_file_names:
        file_date = re.match(r'.*(\d{8}).*', monthly_file).group(1)

        logger.info('Starting {} monthly file'.format(file_date))
        exec_comp_data = parse_exec_comp_file(monthly_file, root_dir, sftp=sftp, ssh_key=args.ssh_key)

        update_transactions(sess, exec_comp_data, file_date)
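
# get_client() is defined elsewhere in the broker codebase; it only needs to return an SSH
# connection to the SAM host so the loaders can open an SFTP session. A minimal sketch of what
# such a helper could look like, assuming paramiko and SAM connection settings under
# CONFIG_BROKER (the 'sam' config keys below are illustrative, not the broker's actual ones):
import paramiko


def get_client(ssh_key=None):
    """Open an SSH connection to the SAM host, optionally authenticating with a private key file."""
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(hostname=CONFIG_BROKER['sam']['host'],
                   username=CONFIG_BROKER['sam']['username'],
                   password=CONFIG_BROKER['sam'].get('password'),
                   key_filename=ssh_key)
    return client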
def process_from_dir(root_dir, file_name, sess, sftp=None, ssh_key=None, metrics=None, monthly=False):
    """ Process the SAM file found locally or remotely

        Args:
            root_dir: the folder containing the SAM file
            file_name: the name of the SAM file
            sess: the database connection
            sftp: the sftp client to pull the CSV from
            ssh_key: the ssh key used to reconnect if the connection drops
            metrics: dictionary representing metrics data for the load
            monthly: whether it's a monthly file
    """
    if not metrics:
        metrics = {}

    file_path = os.path.join(root_dir, file_name)
    if sftp:
        logger.info("Pulling {}".format(file_name))
        with open(file_path, 'wb') as zip_file:
            try:
                sftp.getfo(''.join([REMOTE_SAM_EXEC_COMP_DIR, '/', file_name]), zip_file)
            except Exception:
                logger.debug("Socket closed. Reconnecting...")
                ssh_client = get_client(ssh_key=ssh_key)
                sftp = ssh_client.open_sftp()
                sftp.getfo(''.join([REMOTE_SAM_EXEC_COMP_DIR, '/', file_name]), zip_file)
    exec_comp_data = parse_exec_comp_file(file_name, root_dir, sftp=sftp, ssh_key=ssh_key, metrics=metrics,
                                          monthly=monthly)

    update_exec_comp_duns(sess, exec_comp_data, metrics=metrics)
    if sftp:
        os.remove(file_path)
def process_from_dir(root_dir, file_name, sess, local, sftp=None, monthly=False, benchmarks=False, table=DUNS,
                     year=None, metrics=None):
    """ Process the SAM file found locally or remotely

        Args:
            root_dir: the folder containing the SAM file
            file_name: the name of the SAM file
            sess: the database connection
            local: whether it's local or not
            sftp: the sftp client to pull the CSV from
            monthly: whether it's a monthly file
            benchmarks: whether to log times
            table: the table to work from (could be DUNS/HistoricParentDuns)
            year: the year associated with the data (primarily for HistoricParentDUNS loads)
            metrics: dictionary representing metrics data for the load
    """
    if not metrics:
        metrics = {}

    file_path = os.path.join(root_dir, file_name)
    if not local:
        if sftp.sock.closed:
            # Reconnect if channel is closed
            ssh_client = get_client()
            sftp = ssh_client.open_sftp()
        logger.info("Pulling {}".format(file_name))
        with open(file_path, "wb") as zip_file:
            sftp.getfo(''.join([REMOTE_SAM_DUNS_DIR, '/', file_name]), zip_file)
    parse_duns_file(file_path, sess, monthly=monthly, benchmarks=benchmarks, table=table, year=year, metrics=metrics)
    if not local:
        os.remove(file_path)
def process_from_dir(root_dir, file_name, sess, sftp=None, monthly=False, benchmarks=False, metrics=None):
    """ Process the SAM file found locally or remotely

        Args:
            root_dir: the folder containing the SAM file
            file_name: the name of the SAM file
            sess: the database connection
            sftp: the sftp client to pull the CSV from
            monthly: whether it's a monthly file
            benchmarks: whether to log times
            metrics: dictionary representing metrics data for the load
    """
    if not metrics:
        metrics = {}

    file_path = os.path.join(root_dir, file_name)
    if sftp:
        logger.info("Pulling {}".format(file_name))
        with open(file_path, "wb") as zip_file:
            try:
                sftp.getfo(''.join([REMOTE_SAM_DUNS_DIR, '/', file_name]), zip_file)
            except Exception:
                logger.debug("Socket closed. Reconnecting...")
                ssh_client = get_client()
                sftp = ssh_client.open_sftp()
                sftp.getfo(''.join([REMOTE_SAM_DUNS_DIR, '/', file_name]), zip_file)
    add_update_data, delete_data = parse_duns_file(file_path, sess, monthly=monthly, benchmarks=benchmarks,
                                                   metrics=metrics)
    if add_update_data is not None:
        update_duns(sess, add_update_data, metrics=metrics)
    if delete_data is not None:
        update_duns(sess, delete_data, metrics=metrics, deletes=True)
    if sftp:
        os.remove(file_path)
def main():
    logger.info('Starting historical transaction executive compensation backfill.')

    parser = argparse.ArgumentParser(description='Backfill historical executive compensation data for transactions.')
    algorithm = parser.add_mutually_exclusive_group(required=True)
    algorithm.add_argument('-k', '--ssh_key', help='private key used to access the API remotely')
    algorithm.add_argument('-p', '--pulled_since', help='min created_at/updated_at date when directly using the '
                                                        'historic duns table')
    args = parser.parse_args()

    sess = GlobalDB.db().session

    if args.ssh_key:
        root_dir = CONFIG_BROKER['d_file_storage_path']

        # dirlist on remote host
        client = get_client(ssh_key=args.ssh_key)
        sftp = client.open_sftp()
        dirlist = sftp.listdir(REMOTE_SAM_EXEC_COMP_DIR)

        sorted_monthly_file_names = sorted([monthly_file for monthly_file in dirlist
                                            if re.match(r'.*MONTHLY_\d+', monthly_file)])

        for monthly_file in sorted_monthly_file_names:
            file_date = re.match(r'.*(\d{8}).*', monthly_file).group(1)

            logger.info('Starting {} monthly file'.format(file_date))
            exec_comp_data = parse_exec_comp_file(monthly_file, root_dir, sftp=sftp, ssh_key=args.ssh_key)

            temp_table_name = 'temp_exec_comp_update'
            # Only create a table out of the data we might actually need
            pop_exec = exec_comp_data[exec_comp_data.high_comp_officer1_full_na.notnull()]
            create_temp_exec_comp_table(sess, temp_table_name, pop_exec)

            update_transactions(sess, exec_comp_data, file_date, date_type='action_date')

            logger.info('Dropping {}'.format(temp_table_name))
            sess.execute('DROP TABLE {};'.format(temp_table_name))
            sess.commit()
    else:
        update_transactions(sess, 'historic_duns', args.pulled_since, date_type='created_at', source='fabs')
        update_transactions(sess, 'historic_duns', args.pulled_since, date_type='updated_at', source='fpds')
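
# create_temp_exec_comp_table() is a helper defined elsewhere in the backfill script. A minimal
# sketch of one way it could be implemented (an assumption, not necessarily the broker's actual
# approach), loading the populated exec comp dataframe into a scratch table that
# update_transactions can join against before it is dropped above:
def create_temp_exec_comp_table(sess, table_name, exec_comp_df):
    """Write the populated exec comp records to a temporary work table."""
    logger.info('Creating {}'.format(table_name))
    # pandas handles table creation and inserts; replace any leftover table from a prior run
    exec_comp_df.to_sql(table_name, sess.get_bind(), index=False, if_exists='replace')
    sess.commit()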
def parse_exec_comp_file(filename, root_dir, sftp=None, ssh_key=None, metrics=None):
    """ Parses the executive compensation file to update corresponding DUNS records

        Args:
            filename: name of file to import
            root_dir: working directory
            sftp: connection to remote server
            ssh_key: ssh_key for reconnecting
            metrics: dictionary representing metrics of the script

        Raises:
            Exception: couldn't extract the last exec comp modification date, this generally means the filename
                provided doesn't match the expected format.
    """
    if not metrics:
        metrics = {
            'files_processed': [],
            'records_received': 0,
            'records_processed': 0
        }

    file_path = os.path.join(root_dir, filename)
    if sftp:
        if sftp.sock.closed:
            # Reconnect if channel is closed
            ssh_client = get_client(ssh_key=ssh_key)
            sftp = ssh_client.open_sftp()
        with open(os.path.join(root_dir, filename), 'wb') as file:
            sftp.getfo(''.join([REMOTE_SAM_EXEC_COMP_DIR, '/', filename]), file)

    logger.info('starting file ' + file_path)
    metrics['files_processed'].append(filename)

    csv_file = os.path.splitext(filename)[0] + '.dat'
    zfile = zipfile.ZipFile(file_path)

    # can't use skipfooter, pandas' c engine doesn't work with skipfooter and the python engine doesn't work
    # with dtype
    nrows = 0
    with zfile.open(csv_file) as zip_file:
        nrows = len(zip_file.readlines()) - 2  # subtract the header and footer

    column_header_mapping = {
        'awardee_or_recipient_uniqu': 0,
        'sam_extract': 4,
        'exec_comp_str': 89
    }
    column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))
    with zfile.open(csv_file) as zip_file:
        csv_data = pd.read_csv(zip_file, dtype=str, header=None, skiprows=1, nrows=nrows, sep='|',
                               usecols=column_header_mapping_ordered.values(),
                               names=column_header_mapping_ordered.keys())
    total_data = csv_data.copy()
    metrics['records_received'] += len(total_data.index)
    total_data = total_data[total_data.awardee_or_recipient_uniqu.notnull()
                            & total_data.sam_extract.isin(['2', '3', 'A', 'E'])]
    metrics['records_processed'] += len(total_data.index)
    del total_data['sam_extract']

    # Note: we're splitting these up cause it vastly saves memory parsing only the records that are populated
    blank_exec = total_data[total_data.exec_comp_str.isnull()]
    pop_exec = total_data[total_data.exec_comp_str.notnull()]

    # parse out executive compensation from row 90 for populated records
    lambda_func = (lambda ecs: pd.Series(list(parse_exec_comp(ecs).values())))
    parsed_data = pop_exec['exec_comp_str'].apply(lambda_func)
    parsed_data.columns = list(parse_exec_comp().keys())
    del pop_exec['exec_comp_str']
    pop_exec = pop_exec.join(parsed_data)

    # leave blanks
    del blank_exec['exec_comp_str']
    blank_exec = blank_exec.assign(**parse_exec_comp())

    # setup the final dataframe
    total_data = pd.concat([pop_exec, blank_exec])
    total_data.replace('', np.nan, inplace=True)
    last_exec_comp_mod_date_str = re.findall('[0-9]{8}', filename)
    if not last_exec_comp_mod_date_str:
        raise Exception('Last Executive Compensation Mod Date not found in filename.')
    last_exec_comp_mod_date = datetime.datetime.strptime(last_exec_comp_mod_date_str[0], '%Y%m%d').date()
    total_data = total_data.assign(last_exec_comp_mod_date=last_exec_comp_mod_date)

    if sftp:
        os.remove(os.path.join(root_dir, filename))

    return total_data
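
# parse_exec_comp() is defined elsewhere in the loader; the code above only relies on its
# interface: called with no argument it returns an OrderedDict of default (NaN) officer fields,
# and called with the raw exec_comp_str it returns values in that same key order. A minimal
# sketch of that interface, assuming each officer entry is '~'-separated with '^'-delimited
# fields and that the name/amount field positions shown here are correct (the delimiters and
# field order are assumptions, not confirmed by the snippet above). It reuses the numpy (np)
# and OrderedDict imports already present in this module.
def parse_exec_comp(exec_comp_str=None):
    """Return an OrderedDict of up to five officer name/amount pairs parsed from the SAM extract."""
    exec_comp_data = OrderedDict()
    for index in range(1, 6):
        exec_comp_data['high_comp_officer{}_full_na'.format(index)] = np.nan
        exec_comp_data['high_comp_officer{}_amount'.format(index)] = np.nan

    if isinstance(exec_comp_str, str):
        # hypothetical layout: officers separated by '~', fields within an officer separated by '^'
        for index, officer in enumerate(exec_comp_str.split('~')[:5], start=1):
            fields = officer.split('^')
            if len(fields) == 3:
                exec_comp_data['high_comp_officer{}_full_na'.format(index)] = fields[0]
                exec_comp_data['high_comp_officer{}_amount'.format(index)] = fields[2]
    return exec_comp_data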
metrics = {
    'script_name': 'load_exec_duns.py',
    'start_time': str(now),
    'files_processed': [],
    'records_received': 0,
    'records_processed': 0,
    'updated_duns': [],
    'records_updated': 0
}

with create_app().app_context():
    sess = GlobalDB.db().session
    sftp = None

    if ssh_key:
        root_dir = CONFIG_BROKER['d_file_storage_path']
        client = get_client(ssh_key=ssh_key)
        sftp = client.open_sftp()
        # dirlist on remote host
        dirlist = sftp.listdir(REMOTE_SAM_EXEC_COMP_DIR)
    elif local:
        root_dir = local
        dirlist = os.listdir(local)

    # generate chronological lists of monthly and daily files
    sorted_monthly_file_names = sorted([monthly_file for monthly_file in dirlist
                                        if re.match(r'.*MONTHLY_\d+', monthly_file)])
    sorted_daily_file_names = sorted([daily_file for daily_file in dirlist
                                      if re.match(r'.*DAILY_\d+', daily_file)])
def process_exec_comp_dir(sess, historic, local, ssh_key, benchmarks=None, metrics=None):
    """ Process the script arguments to figure out which files to process in which order

        Args:
            sess: the database connection
            historic: whether to load in monthly file and daily files after, or just the latest daily files
            local: path to local directory to process, if None, it will go through the remote SAM service
            ssh_key: URI to ssh key used to pull exec comp files from SAM
            benchmarks: whether to log times
            metrics: dictionary representing metrics data for the load
    """
    if not metrics:
        metrics = {}

    sftp = None

    # dealing with a local or remote directory
    if not (local or ssh_key) or (local and ssh_key):
        raise Exception('Please provide the local param or the ssh key.')
    if ssh_key:
        root_dir = CONFIG_BROKER['d_file_storage_path']
        client = get_client(ssh_key=ssh_key)
        sftp = client.open_sftp()
        # dirlist on remote host
        dirlist = sftp.listdir(REMOTE_SAM_EXEC_COMP_DIR)
    elif local:
        root_dir = local
        dirlist = os.listdir(local)

    # generate chronological list of daily and monthly files
    sorted_monthly_file_names = sorted([monthly_file for monthly_file in dirlist
                                        if re.match(r'.*MONTHLY_\d+\.ZIP', monthly_file.upper())])
    sorted_daily_file_names = sorted([daily_file for daily_file in dirlist
                                      if re.match(r'.*DAILY_\d+\.ZIP', daily_file.upper())])

    # load in earliest monthly file for historic
    if historic and sorted_monthly_file_names:
        process_from_dir(root_dir, sorted_monthly_file_names[0], sess, sftp=sftp, ssh_key=ssh_key, metrics=metrics,
                         monthly=True)

    # load in daily files after depending on params
    if sorted_daily_file_names:
        # if update, make sure it's been done once before
        last_update = sess.query(DUNS.last_exec_comp_mod_date). \
            order_by(DUNS.last_exec_comp_mod_date.desc()). \
            filter(DUNS.last_exec_comp_mod_date.isnot(None)). \
            first()
        if not historic and not last_update:
            raise Exception('No last executive compensation mod date found in database. '
                            'Please run historic loader first.')

        # determine which daily files to load
        earliest_daily_file = None
        if historic and sorted_monthly_file_names:
            earliest_daily_file = sorted_monthly_file_names[0].replace("MONTHLY", "DAILY")
        elif not historic:
            last_update = last_update[0].strftime("%Y%m%d")
            earliest_daily_file = re.sub(r"_DAILY_[0-9]{8}\.ZIP", "_DAILY_" + last_update + ".ZIP",
                                         sorted_daily_file_names[0])
        daily_files_after = sorted_daily_file_names
        if earliest_daily_file:
            sorted_full_list = sorted(sorted_daily_file_names + [earliest_daily_file])
            daily_files_after = sorted_full_list[sorted_full_list.index(earliest_daily_file) + 1:]

        # load daily files
        for daily_file in daily_files_after:
            process_from_dir(root_dir, daily_file, sess, sftp=sftp, ssh_key=ssh_key, metrics=metrics)
def process_duns_dir(sess, historic, local, benchmarks=None, metrics=None):
    """ Process the script arguments to figure out which files to process in which order

        Args:
            sess: the database connection
            historic: whether to load in monthly file and daily files after, or just the latest daily files
            local: path to local directory to process, if None, it will go through the remote SAM service
            benchmarks: whether to log times
            metrics: dictionary representing metrics data for the load
    """
    if not metrics:
        metrics = {}

    updated_date = datetime.date.today()

    # dealing with a local or remote directory
    sftp = None
    if not local:
        root_dir = CONFIG_BROKER["d_file_storage_path"]
        client = get_client()
        sftp = client.open_sftp()
        # dirlist on remote host
        dirlist = sftp.listdir(REMOTE_SAM_DUNS_DIR)
    elif local:
        root_dir = local
        dirlist = os.listdir(local)

    # generate chronological list of daily and monthly files
    sorted_monthly_file_names = sorted([monthly_file for monthly_file in dirlist
                                        if re.match(r".*MONTHLY_\d+\.ZIP", monthly_file.upper())])
    sorted_daily_file_names = sorted([daily_file for daily_file in dirlist
                                      if re.match(r".*DAILY_\d+\.ZIP", daily_file.upper())])

    # load in earliest monthly file for historic
    if historic and sorted_monthly_file_names:
        process_from_dir(root_dir, sorted_monthly_file_names[0], sess, sftp, monthly=True, benchmarks=benchmarks,
                         metrics=metrics)

    # load in daily files after depending on params
    if sorted_daily_file_names:
        # if update, make sure it's been done once before
        last_update = sess.query(DUNS.last_sam_mod_date). \
            order_by(DUNS.last_sam_mod_date.desc()). \
            filter(DUNS.last_sam_mod_date.isnot(None)). \
            first()
        if not historic and not last_update:
            raise Exception('No last sam mod date found in DUNS table. Please run historic loader first.')

        # determine which daily files to load
        earliest_daily_file = None
        if historic and sorted_monthly_file_names:
            earliest_daily_file = sorted_monthly_file_names[0].replace("MONTHLY", "DAILY")
        elif not historic:
            last_update = last_update[0].strftime("%Y%m%d")
            earliest_daily_file = re.sub(r"_DAILY_[0-9]{8}\.ZIP", "_DAILY_" + last_update + ".ZIP",
                                         sorted_daily_file_names[0])
        daily_files_after = sorted_daily_file_names
        if earliest_daily_file:
            sorted_full_list = sorted(sorted_daily_file_names + [earliest_daily_file])
            daily_files_after = sorted_full_list[sorted_full_list.index(earliest_daily_file) + 1:]

        # load daily files
        for daily_file in daily_files_after:
            process_from_dir(root_dir, daily_file, sess, sftp, benchmarks=benchmarks, metrics=metrics)
        if daily_files_after:
            metrics['parent_rows_updated'] = update_missing_parent_names(sess, updated_date=updated_date)
            metrics['parent_update_date'] = str(updated_date)
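
# The daily-file selection in process_exec_comp_dir/process_duns_dir relies on plain lexicographic
# sorting of the SAM file names: a synthetic "earliest" name (derived from the monthly file or the
# last mod date) is inserted into the sorted list and only the names after it are loaded. A small
# self-contained illustration with made-up file names:
daily_files = ['SAM_FOUO_UTF-8_DAILY_20200314.ZIP',
               'SAM_FOUO_UTF-8_DAILY_20200318.ZIP',
               'SAM_FOUO_UTF-8_DAILY_20200321.ZIP']
earliest = 'SAM_FOUO_UTF-8_DAILY_20200316.ZIP'  # e.g. the monthly file name with MONTHLY -> DAILY
full_list = sorted(daily_files + [earliest])
to_load = full_list[full_list.index(earliest) + 1:]
# to_load == ['SAM_FOUO_UTF-8_DAILY_20200318.ZIP', 'SAM_FOUO_UTF-8_DAILY_20200321.ZIP']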
        parse_duns_file(monthly, sess=sess, monthly=True, benchmarks=benchmarks, metrics=metrics)
    elif daily:
        parse_duns_file(daily, sess=sess, benchmarks=benchmarks, metrics=metrics)
    else:
        # dealing with a local or remote directory
        if not local:
            root_dir = CONFIG_BROKER["d_file_storage_path"]
            ssh_client = get_client()
            sftp = ssh_client.open_sftp()
            # dirlist on remote host
            dirlist = sftp.listdir(REMOTE_SAM_DUNS_DIR)
        else:
            root_dir = local
            dirlist = os.listdir(local)

        # generate chronological list of daily and monthly files
        sorted_monthly_file_names = sorted([monthly_file for monthly_file in dirlist
                                            if re.match(r".*MONTHLY_\d+\.ZIP", monthly_file.upper())])
        sorted_daily_file_names = sorted([daily_file for daily_file in dirlist
                                          if re.match(r".*DAILY_\d+\.ZIP", daily_file.upper())])