def main():
    logger.info(
        'Starting historical transaction executive compensation backfill.')

    parser = argparse.ArgumentParser(
        description='Backfill historical executive compensation data for transactions.')
    parser.add_argument('-k',
                        '--ssh_key',
                        help='private key used to access the API remotely',
                        required=True)
    args = parser.parse_args()

    root_dir = CONFIG_BROKER['d_file_storage_path']
    client = get_client(ssh_key=args.ssh_key)
    sftp = client.open_sftp()
    # dirlist on remote host
    dirlist = sftp.listdir(REMOTE_SAM_EXEC_COMP_DIR)

    sorted_monthly_file_names = sorted([
        monthly_file for monthly_file in dirlist
        if re.match(r'.*MONTHLY_\d+', monthly_file)
    ])

    sess = GlobalDB.db().session

    for monthly_file in sorted_monthly_file_names:
        file_date = re.match(r'.*(\d{8}).*', monthly_file).group(1)

        logger.info('Starting {} monthly file'.format(file_date))
        exec_comp_data = parse_exec_comp_file(monthly_file,
                                              root_dir,
                                              sftp=sftp,
                                              ssh_key=args.ssh_key)
        update_transactions(sess, exec_comp_data, file_date)
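# Example-only entry point sketch (an assumption, not part of the original script): broker-style
# loaders generally run main() inside a database application context; create_app is the same
# helper used later in this listing.
if __name__ == '__main__':
    with create_app().app_context():
        main()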
def process_from_dir(root_dir, file_name, sess, sftp=None, ssh_key=None, metrics=None, monthly=False):
    """ Process the SAM file found locally or remotely

        Args:
            root_dir: the folder containing the SAM file
            file_name: the name of the SAM file
            sess: the database connection
            sftp: the sftp client to pull the CSV from
            ssh_key: the ssh key used to reconnect to the remote server if the connection drops
            metrics: dictionary representing metrics data for the load
            monthly: whether it's a monthly file
    """
    if not metrics:
        metrics = {}

    file_path = os.path.join(root_dir, file_name)
    if sftp:
        logger.info("Pulling {}".format(file_name))
        with open(file_path, 'wb') as zip_file:
            try:
                sftp.getfo(''.join([REMOTE_SAM_EXEC_COMP_DIR, '/', file_name]), zip_file)
            except Exception:
                logger.debug("Socket closed. Reconnecting...")
                ssh_client = get_client(ssh_key=ssh_key)
                sftp = ssh_client.open_sftp()
                sftp.getfo(''.join([REMOTE_SAM_EXEC_COMP_DIR, '/', file_name]), zip_file)
    exec_comp_data = parse_exec_comp_file(file_name, root_dir, sftp=sftp, ssh_key=ssh_key, metrics=metrics,
                                          monthly=monthly)
    update_exec_comp_duns(sess, exec_comp_data, metrics=metrics)
    if sftp:
        os.remove(file_path)
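# Example-only usage sketch (an assumption, not from the original source): load the earliest
# remote monthly exec comp file through the process_from_dir variant above. The key path is a
# placeholder; get_client, GlobalDB, CONFIG_BROKER and REMOTE_SAM_EXEC_COMP_DIR are the
# module-level names used throughout these examples.
def _example_load_earliest_monthly(ssh_key_path):
    sess = GlobalDB.db().session
    sftp = get_client(ssh_key=ssh_key_path).open_sftp()
    # pick the oldest monthly file available on the remote host
    monthly_files = sorted(f for f in sftp.listdir(REMOTE_SAM_EXEC_COMP_DIR)
                           if re.match(r'.*MONTHLY_\d+', f))
    if monthly_files:
        process_from_dir(CONFIG_BROKER['d_file_storage_path'], monthly_files[0], sess,
                         sftp=sftp, ssh_key=ssh_key_path, metrics={}, monthly=True)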
def process_from_dir(root_dir,
                     file_name,
                     sess,
                     local,
                     sftp=None,
                     monthly=False,
                     benchmarks=False,
                     table=DUNS,
                     year=None,
                     metrics=None):
    """ Process the SAM file found locally or remotely

        Args:
            root_dir: the folder containing the SAM file
            file_name: the name of the SAM file
            sess: the database connection
            local: whether it's local or not
            sftp: the sftp client to pull the CSV from
            monthly: whether it's a monthly file
            benchmarks: whether to log times
            table: the table to work from (could be DUNS/HistoricParentDuns)
            year: the year associated with the data (primarily for HistoricParentDuns loads)
            metrics: dictionary representing metrics data for the load
    """
    if not metrics:
        metrics = {}

    file_path = os.path.join(root_dir, file_name)
    if not local:
        if sftp.sock.closed:
            # Reconnect if channel is closed
            ssh_client = get_client()
            sftp = ssh_client.open_sftp()
        logger.info("Pulling {}".format(file_name))
        with open(file_path, "wb") as zip_file:
            sftp.getfo(''.join([REMOTE_SAM_DUNS_DIR, '/', file_name]),
                       zip_file)
    parse_duns_file(file_path,
                    sess,
                    monthly=monthly,
                    benchmarks=benchmarks,
                    table=table,
                    year=year,
                    metrics=metrics)
    if not local:
        os.remove(file_path)
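# Example-only usage sketch (an assumption, not from the original source): process a monthly
# file that already sits in a local directory, loading it into the HistoricParentDuns table
# mentioned in the docstring above. The directory, file name, and year are placeholders, and
# HistoricParentDuns is assumed to be imported alongside DUNS.
def _example_load_local_parent_duns(local_dir, file_name, year):
    sess = GlobalDB.db().session
    process_from_dir(local_dir, file_name, sess, local=True, monthly=True,
                     benchmarks=True, table=HistoricParentDuns, year=year, metrics={})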
def process_from_dir(root_dir,
                     file_name,
                     sess,
                     sftp=None,
                     monthly=False,
                     benchmarks=False,
                     metrics=None):
    """ Process the SAM file found locally or remotely

        Args:
            root_dir: the folder containing the SAM file
            file_name: the name of the SAM file
            sess: the database connection
            sftp: the sftp client to pull the CSV from
            monthly: whether it's a monthly file
            benchmarks: whether to log times
            metrics: dictionary representing metrics data for the load
    """
    if not metrics:
        metrics = {}

    file_path = os.path.join(root_dir, file_name)
    if sftp:
        logger.info("Pulling {}".format(file_name))
        with open(file_path, "wb") as zip_file:
            try:
                sftp.getfo(''.join([REMOTE_SAM_DUNS_DIR, '/', file_name]),
                           zip_file)
            except Exception:
                logger.debug("Socket closed. Reconnecting...")
                ssh_client = get_client()
                sftp = ssh_client.open_sftp()
                sftp.getfo(''.join([REMOTE_SAM_DUNS_DIR, '/', file_name]),
                           zip_file)
    add_update_data, delete_data = parse_duns_file(file_path,
                                                   sess,
                                                   monthly=monthly,
                                                   benchmarks=benchmarks,
                                                   metrics=metrics)
    if add_update_data is not None:
        update_duns(sess, add_update_data, metrics=metrics)
    if delete_data is not None:
        update_duns(sess, delete_data, metrics=metrics, deletes=True)
    if sftp:
        os.remove(file_path)
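# Example-only usage sketch (an assumption, not from the original source): process a single
# local daily DUNS file with the variant above; with no sftp client the file is read from
# root_dir and left in place. The directory and file name parameters are placeholders.
def _example_process_local_daily(root_dir, file_name):
    sess = GlobalDB.db().session
    metrics = {}
    process_from_dir(root_dir, file_name, sess, benchmarks=True, metrics=metrics)
    return metrics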
def main():
    logger.info('Starting historical transaction executive compensation backfill.')

    parser = argparse.ArgumentParser(description='Backfill historical executive compensation data for transactions.')
    algorithm = parser.add_mutually_exclusive_group(required=True)
    algorithm.add_argument('-k', '--ssh_key', help='private key used to access the API remotely')
    algorithm.add_argument('-p', '--pulled_since', help='min created_at/updated_at date when directly using the '
                                                        'historic duns table')
    args = parser.parse_args()

    sess = GlobalDB.db().session

    if args.ssh_key:
        root_dir = CONFIG_BROKER['d_file_storage_path']
        # dirlist on remote host
        client = get_client(ssh_key=args.ssh_key)
        sftp = client.open_sftp()
        dirlist = sftp.listdir(REMOTE_SAM_EXEC_COMP_DIR)
        sorted_monthly_file_names = sorted([monthly_file for monthly_file in dirlist if re.match(r'.*MONTHLY_\d+',
                                                                                                 monthly_file)])
        for monthly_file in sorted_monthly_file_names:
            file_date = re.match(r'.*(\d{8}).*', monthly_file).group(1)

            logger.info('Starting {} monthly file'.format(file_date))
            exec_comp_data = parse_exec_comp_file(monthly_file, root_dir, sftp=sftp, ssh_key=args.ssh_key)

            temp_table_name = 'temp_exec_comp_update'
            # Only create a table out of the data we might actually need
            pop_exec = exec_comp_data[exec_comp_data.high_comp_officer1_full_na.notnull()]
            create_temp_exec_comp_table(sess, temp_table_name, pop_exec)

            update_transactions(sess, exec_comp_data, file_date, date_type='action_date')

            logger.info('Dropping {}'.format(temp_table_name))
            sess.execute('DROP TABLE {};'.format(temp_table_name))
            sess.commit()
    else:
        update_transactions(sess, 'historic_duns', args.pulled_since, date_type='created_at', source='fabs')
        update_transactions(sess, 'historic_duns', args.pulled_since, date_type='updated_at', source='fpds')
def parse_exec_comp_file(filename,
                         root_dir,
                         sftp=None,
                         ssh_key=None,
                         metrics=None):
    """ Parses the executive compensation file to update corresponding DUNS records

        Args:
            filename: name of file to import
            root_dir: working directory
            sftp: connection to remote server
            ssh_key: ssh_key for reconnecting
            metrics: dictionary representing metrics of the script

        Raises:
            Exception: couldn't extract the last exec comp modification date; this generally means
                the filename provided doesn't match the expected format.
    """
    if not metrics:
        metrics = {
            'files_processed': [],
            'records_received': 0,
            'records_processed': 0
        }

    file_path = os.path.join(root_dir, filename)
    if sftp:
        if sftp.sock.closed:
            # Reconnect if channel is closed
            ssh_client = get_client(ssh_key=ssh_key)
            sftp = ssh_client.open_sftp()
        with open(file_path, 'wb') as file:
            sftp.getfo(''.join([REMOTE_SAM_EXEC_COMP_DIR, '/', filename]),
                       file)

    logger.info('starting file ' + file_path)
    metrics['files_processed'].append(filename)

    csv_file = os.path.splitext(filename)[0] + '.dat'
    zfile = zipfile.ZipFile(file_path)

    # can't use skipfooter: pandas' C engine doesn't support skipfooter and the Python engine doesn't support dtype
    nrows = 0
    with zfile.open(csv_file) as zip_file:
        nrows = len(zip_file.readlines()) - 2  # subtract the header and footer
    column_header_mapping = {
        'awardee_or_recipient_uniqu': 0,
        'sam_extract': 4,
        'exec_comp_str': 89
    }
    column_header_mapping_ordered = OrderedDict(
        sorted(column_header_mapping.items(), key=lambda c: c[1]))
    with zfile.open(csv_file) as zip_file:
        csv_data = pd.read_csv(zip_file,
                               dtype=str,
                               header=None,
                               skiprows=1,
                               nrows=nrows,
                               sep='|',
                               usecols=column_header_mapping_ordered.values(),
                               names=column_header_mapping_ordered.keys())
    total_data = csv_data.copy()
    metrics['records_received'] += len(total_data.index)
    total_data = total_data[
        total_data.awardee_or_recipient_uniqu.notnull()
        & total_data.sam_extract.isin(['2', '3', 'A', 'E'])]
    metrics['records_processed'] += len(total_data.index)
    del total_data['sam_extract']
    # Note: we split these up because parsing only the populated records saves a lot of memory
    blank_exec = total_data[total_data.exec_comp_str.isnull()]
    pop_exec = total_data[total_data.exec_comp_str.notnull()]

    # parse out executive compensation from column 90 for populated records
    lambda_func = (lambda ecs: pd.Series(list(parse_exec_comp(ecs).values())))
    parsed_data = pop_exec['exec_comp_str'].apply(lambda_func)
    parsed_data.columns = list(parse_exec_comp().keys())
    del pop_exec['exec_comp_str']
    pop_exec = pop_exec.join(parsed_data)

    # leave blanks
    del blank_exec['exec_comp_str']
    blank_exec = blank_exec.assign(**parse_exec_comp())

    # setup the final dataframe
    total_data = pd.concat([pop_exec, blank_exec])
    total_data.replace('', np.nan, inplace=True)
    last_exec_comp_mod_date_str = re.findall('[0-9]{8}', filename)
    if not last_exec_comp_mod_date_str:
        raise Exception(
            'Last Executive Compensation Mod Date not found in filename.')
    last_exec_comp_mod_date = datetime.datetime.strptime(
        last_exec_comp_mod_date_str[0], '%Y%m%d').date()
    total_data = total_data.assign(
        last_exec_comp_mod_date=last_exec_comp_mod_date)

    if sftp:
        os.remove(os.path.join(root_dir, filename))

    return total_data
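# Example-only usage sketch (an assumption, not from the original source): parse a monthly exec
# comp extract that already exists locally, without an SFTP connection. The file name is a
# placeholder matching the *_MONTHLY_YYYYMMDD.ZIP pattern the date extraction expects.
def _example_parse_local_monthly(root_dir):
    metrics = {'files_processed': [], 'records_received': 0, 'records_processed': 0}
    exec_comp_df = parse_exec_comp_file('SAM_Exec_Comp_MONTHLY_20200101.ZIP', root_dir,
                                        metrics=metrics)
    return exec_comp_df, metrics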
        'script_name': 'load_exec_duns.py',
        'start_time': str(now),
        'files_processed': [],
        'records_received': 0,
        'records_processed': 0,
        'updated_duns': [],
        'records_updated': 0
    }

    with create_app().app_context():
        sess = GlobalDB.db().session
        sftp = None

        if ssh_key:
            root_dir = CONFIG_BROKER['d_file_storage_path']
            client = get_client(ssh_key=ssh_key)
            sftp = client.open_sftp()
            # dirlist on remote host
            dirlist = sftp.listdir(REMOTE_SAM_EXEC_COMP_DIR)
        elif local:
            root_dir = local
            dirlist = os.listdir(local)

        # generate chronological list of daily and monthly files
        sorted_monthly_file_names = sorted([
            monthly_file for monthly_file in dirlist
            if re.match(r'.*MONTHLY_\d+', monthly_file)
        ])
        sorted_daily_file_names = sorted([
            daily_file for daily_file in dirlist
            if re.match(r'.*DAILY_\d+', daily_file)
        ])
def process_exec_comp_dir(sess, historic, local, ssh_key, benchmarks=None, metrics=None):
    """ Process the script arguments to figure out which files to process in which order

        Args:
            sess: the database connection
            historic: whether to load in monthly file and daily files after, or just the latest daily files
            local: path to local directory to process; if None, it will go through the remote SAM service
            ssh_key: URI to ssh key used to pull exec comp files from SAM
            benchmarks: whether to log times
            metrics: dictionary representing metrics data for the load
    """
    if not metrics:
        metrics = {}

    sftp = None

    # dealing with a local or remote directory
    if not (local or ssh_key) or (local and ssh_key):
        raise Exception('Please provide either the local param or the ssh key, but not both.')
    if ssh_key:
        root_dir = CONFIG_BROKER['d_file_storage_path']
        client = get_client(ssh_key=ssh_key)
        sftp = client.open_sftp()
        # dirlist on remote host
        dirlist = sftp.listdir(REMOTE_SAM_EXEC_COMP_DIR)
    elif local:
        root_dir = local
        dirlist = os.listdir(local)

    # generate chronological list of daily and monthly files
    sorted_monthly_file_names = sorted([monthly_file for monthly_file in dirlist if re.match(r'.*MONTHLY_\d+\.ZIP',
                                                                                             monthly_file.upper())])
    sorted_daily_file_names = sorted([daily_file for daily_file in dirlist if re.match(r'.*DAILY_\d+\.ZIP',
                                                                                       daily_file.upper())])

    # load in earliest monthly file for historic
    if historic and sorted_monthly_file_names:
        process_from_dir(root_dir, sorted_monthly_file_names[0], sess, sftp=sftp, ssh_key=ssh_key, metrics=metrics,
                         monthly=True)

    # load in daily files after depending on params
    if sorted_daily_file_names:
        # if update, make sure it's been done once before
        last_update = sess.query(DUNS.last_exec_comp_mod_date). \
            order_by(DUNS.last_exec_comp_mod_date.desc()). \
            filter(DUNS.last_exec_comp_mod_date.isnot(None)). \
            first()
        if not historic and not last_update:
            raise Exception('No last executive compensation mod date found in database. '
                            'Please run historic loader first.')

        # determine which daily files to load
        earliest_daily_file = None
        if historic and sorted_monthly_file_names:
            earliest_daily_file = sorted_monthly_file_names[0].replace("MONTHLY", "DAILY")
        elif not historic:
            last_update = last_update[0].strftime("%Y%m%d")
            earliest_daily_file = re.sub("_DAILY_[0-9]{8}\.ZIP", "_DAILY_" +
                                         last_update + ".ZIP", sorted_daily_file_names[0])
        daily_files_after = sorted_daily_file_names
        if earliest_daily_file:
            sorted_full_list = sorted(sorted_daily_file_names + [earliest_daily_file])
            daily_files_after = sorted_full_list[sorted_full_list.index(earliest_daily_file) + 1:]

        # load daily files
        for daily_file in daily_files_after:
            process_from_dir(root_dir, daily_file, sess, sftp=sftp, ssh_key=ssh_key, metrics=metrics)
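# Example-only usage sketch (an assumption, not from the original source): run a historic exec
# comp load over a local directory of SAM files, so no ssh key is needed. The directory path is
# a placeholder; metrics is returned so callers can inspect what was processed.
def _example_historic_local_exec_comp_load(local_dir):
    sess = GlobalDB.db().session
    metrics = {}
    process_exec_comp_dir(sess, historic=True, local=local_dir, ssh_key=None, metrics=metrics)
    return metrics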
def process_duns_dir(sess, historic, local, benchmarks=None, metrics=None):
    """ Process the script arguments to figure out which files to process in which order

        Args:
            sess: the database connection
            historic: whether to load in monthly file and daily files after, or just the latest daily files
            local: path to local directory to process; if None, it will go through the remote SAM service
            benchmarks: whether to log times
            metrics: dictionary representing metrics data for the load
    """
    if not metrics:
        metrics = {}

    updated_date = datetime.date.today()

    # dealing with a local or remote directory
    sftp = None
    if not local:
        root_dir = CONFIG_BROKER["d_file_storage_path"]
        client = get_client()
        sftp = client.open_sftp()
        # dirlist on remote host
        dirlist = sftp.listdir(REMOTE_SAM_DUNS_DIR)
    else:
        root_dir = local
        dirlist = os.listdir(local)

    # generate chronological list of daily and monthly files
    sorted_monthly_file_names = sorted([
        monthly_file for monthly_file in dirlist
        if re.match(".*MONTHLY_\d+\.ZIP", monthly_file.upper())
    ])
    sorted_daily_file_names = sorted([
        daily_file for daily_file in dirlist
        if re.match(".*DAILY_\d+\.ZIP", daily_file.upper())
    ])

    # load in earliest monthly file for historic
    if historic and sorted_monthly_file_names:
        process_from_dir(root_dir,
                         sorted_monthly_file_names[0],
                         sess,
                         sftp,
                         monthly=True,
                         benchmarks=benchmarks,
                         metrics=metrics)

    # load in daily files after depending on params
    if sorted_daily_file_names:
        # if update, make sure it's been done once before
        last_update = sess.query(DUNS.last_sam_mod_date). \
            order_by(DUNS.last_sam_mod_date.desc()). \
            filter(DUNS.last_sam_mod_date.isnot(None)). \
            first()
        if not historic and not last_update:
            raise Exception(
                'No last sam mod date found in DUNS table. Please run historic loader first.'
            )

        # determine which daily files to load
        earliest_daily_file = None
        if historic and sorted_monthly_file_names:
            earliest_daily_file = sorted_monthly_file_names[0].replace(
                "MONTHLY", "DAILY")
        elif not historic:
            last_update = last_update[0].strftime("%Y%m%d")
            earliest_daily_file = re.sub("_DAILY_[0-9]{8}\.ZIP",
                                         "_DAILY_" + last_update + ".ZIP",
                                         sorted_daily_file_names[0])
        daily_files_after = sorted_daily_file_names
        if earliest_daily_file:
            sorted_full_list = sorted(sorted_daily_file_names +
                                      [earliest_daily_file])
            daily_files_after = sorted_full_list[sorted_full_list.index(earliest_daily_file) + 1:]

        # load daily files
        for daily_file in daily_files_after:
            process_from_dir(root_dir,
                             daily_file,
                             sess,
                             sftp,
                             benchmarks=benchmarks,
                             metrics=metrics)
        if daily_files_after:
            metrics['parent_rows_updated'] = update_missing_parent_names(
                sess, updated_date=updated_date)
            metrics['parent_update_date'] = str(updated_date)
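# Example-only usage sketch (an assumption, not from the original source): a daily-only DUNS
# refresh pulled from the remote SAM directory (local=None), which is how the loader above is
# typically driven after an initial historic run has populated last_sam_mod_date.
def _example_daily_duns_refresh():
    sess = GlobalDB.db().session
    metrics = {}
    process_duns_dir(sess, historic=False, local=None, metrics=metrics)
    return metrics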
            parse_duns_file(monthly,
                            sess=sess,
                            monthly=True,
                            benchmarks=benchmarks,
                            metrics=metrics)
        elif daily:
            parse_duns_file(daily,
                            sess=sess,
                            benchmarks=benchmarks,
                            metrics=metrics)
        else:
            # dealing with a local or remote directory
            if not local:
                root_dir = CONFIG_BROKER["d_file_storage_path"]

                ssh_client = get_client()
                sftp = ssh_client.open_sftp()
                # dirlist on remote host
                dirlist = sftp.listdir(REMOTE_SAM_DUNS_DIR)
            else:
                root_dir = local
                dirlist = os.listdir(local)

            # generate chronological list of daily and monthly files
            sorted_monthly_file_names = sorted([
                monthly_file for monthly_file in dirlist
                if re.match(".*MONTHLY_\d+\.ZIP", monthly_file.upper())
            ])
            sorted_daily_file_names = sorted([
                daily_file for daily_file in dirlist
                if re.match(r".*DAILY_\d+\.ZIP", daily_file.upper())
            ])