Example 1
def load(config):

    # If the local cache file exists and is less than a day old, just use it.
    cache_file = '/tmp/cached-account-names.yaml'
    caching = utils.get_config_value(config, 'DEFAULTS', 'CACHING', False)
    mtime = 0
    if caching:
        try:
            mtime = os.stat(cache_file).st_mtime
        except FileNotFoundError:
            pass

    if mtime > time.time() - 86400:
        LOGGER.info("Using existing cache file: " + cache_file)
    else:
        account, role = utils.get_master(config)
        retries = utils.get_config_value(config, 'ACCOUNT_NAMES', 'RETRIES', 5)
        file = utils.get_config_value(config, 'ACCOUNT_NAMES', 'FILE', '')

        account_names = {}
        if role != '':
            # Organizations should be queried, load that first
            session = utils.assume_role(boto3.Session(), role)
            org = session.client('organizations', region_name='us-east-1')

            rsp = org.list_accounts()
            while True:
                for account in rsp['Accounts']:
                    account_names[account['Id']] = account['Name']

                if 'NextToken' in rsp:
                    for i in range(retries):
                        try:
                            rsp = org.list_accounts(NextToken=rsp['NextToken'])
                            break
                        except ClientError as e:
                            if i == retries - 1:
                                raise
                            sleep(0.5 + 0.1 * i)
                    continue
                break

        if file != '':
            # Update account names with file contents
            with utils.get_read_handle(file) as f:
                account_names.update(yaml.load(f, Loader=yaml.FullLoader))

        with open(cache_file, 'w') as outfile:
            yaml.dump(account_names, outfile, default_flow_style=False)

        return account_names

    with utils.get_read_handle(cache_file) as input:
        account_names = yaml.load(input, Loader=yaml.FullLoader)
        return account_names
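
A note on the retry loop above: range(retries) yields 0 through retries - 1, so the
final-attempt check must compare against retries - 1 (as fixed above) or the last
failure is silently swallowed. A minimal standalone sketch of the same
retry-with-linear-backoff pattern, assuming a generic fetch callable in place of the
Organizations client:

from time import sleep

def call_with_retries(fetch, retries=5):
    """Call fetch(), retrying with a linearly growing delay.

    Re-raises the last exception once all attempts are exhausted.
    """
    for attempt in range(retries):
        try:
            return fetch()
        except Exception:  # the loader narrows this to botocore's ClientError
            if attempt == retries - 1:
                raise
            sleep(0.5 + 0.1 * attempt)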
Example 2
def generate(config, instances, ris, pricing):
    def get_units(instancetype):
        try:
            return pricing['units'][instancetype]
        except KeyError as e:
            if '.' in instancetype:
                raise e
            for key in pricing['units']:
                if key.endswith('.' + instancetype):
                    return pricing['units'][key]
            raise e

    # Make sure we have a reasonable amount of data
    if instances['usagestartdate'].max() - instances['usagestartdate'].min() \
            < timedelta(days=14):
        raise ValueError('Insufficient Data')

    # Preaggregate some data
    timerange = instances['usagestartdate'].unique()

    # Add some additional data to instances
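    # hourofweek = dayofweek * 24 + hour, i.e. 0-167, giving each row a slot in a weekly usage profile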
    hourofweek_column = instances.columns.get_loc('usagestartdate') + 1
    hourofweek_value = instances[
        'usagestartdate'].dt.dayofweek * 24 + instances[
            'usagestartdate'].dt.hour
    instances.insert(hourofweek_column, 'hourofweek', hourofweek_value)

    region_column = instances.columns.get_loc('availabilityzone')
    region_value = instances['availabilityzone'].str[:-1]
    instances.insert(region_column, 'region', region_value)

    family_column = instances.columns.get_loc('instancetype')

    # meckstmd:07/29/2019 - Metal RIs are no different than regular RIs - they are a family with a normalization factor
    #  for example, i3.metal is equivalent to i3.16xlarge.  See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/apply_ri.html
    #family_value = instances['instancetype'].apply(lambda x: x if x.endswith('.metal') else x.split('.')[0])
    family_value = instances['instancetype'].apply(lambda x: x.split('.')[0])
    instances.insert(family_column, 'instancetypefamily', family_value)

    instance_units_column = instances.columns.get_loc('instances') + 2
    units_value = instances['instancetype'].apply(
        get_units) * instances['instances']
    instances.insert(instance_units_column, 'instance_units', units_value)

    reserved_units_column = instances.columns.get_loc('reserved') + 2
    units_value = instances['instancetype'].apply(
        get_units) * instances['reserved']
    instances.insert(reserved_units_column, 'reserved_units', units_value)

    # Add some additional data to ris
    family_column = ris.columns.get_loc('instancetype') + 1

    # meckstmd:07/29/2019 - Metal RIs are no different than regular RIs - they are a family with a normalization factor
    #  for example, i3.metal is equivalent to i3.16xlarge.  See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/apply_ri.html
    #family_value = ris['instancetype'].apply(lambda x: x if x.endswith('.metal') else x.split('.')[0])
    family_value = ris['instancetype'].apply(lambda x: x.split('.')[0])
    ris.insert(family_column, 'instancetypefamily', family_value)

    units_column = ris.columns.get_loc('quantity') + 1
    units_value = ris['instancetype'].apply(get_units) * ris['quantity']
    ris.insert(units_column, 'units', units_value)

    # Create aggregates for faster processing
    az_instance_groups = instances.groupby(
        ['availabilityzone', 'instancetype', 'tenancy', 'operatingsystem'])
    az_account_instance_groups = instances.groupby(az_instance_groups.keys +
                                                   ['usageaccountid'])
    region_instance_groups = instances.groupby(
        ['region', 'instancetypefamily', 'tenancy', 'operatingsystem'])
    region_account_instance_groups = instances.groupby(
        region_instance_groups.keys + ['usageaccountid'])
    ri_groups = ris.groupby(region_instance_groups.keys + ['scope'])

    # Reference Lookup
    all_sizes = instances['instancetype'].apply(
        lambda x: x.split('.')[1]).unique()
    reference_sizes = {}
    for family in ris['instancetypefamily'].unique():
        for size in all_sizes:
            if "{}.{}".format(family, size) in pricing['us-east-1']:
                reference_sizes[family] = size
                break

    # Reports
    unused_az_ris = pd.DataFrame(
        columns=az_instance_groups.keys +
        ['min_unused_qty', 'avg_unused_qty', 'max_unused_qty'])
    ri_hourly_usage_report = pd.DataFrame(
        columns=region_instance_groups.keys + ['hourofweek'] + [
            'total_ri_units', 'total_instance_units', 'floating_ri_units',
            'floating_instance_units', 'unused_ri_units', 'coverage_chance'
        ])
    ri_purchases = pd.DataFrame(columns=[
        'Account ID', 'Scope', 'Region', 'External AZ', 'Offering Class',
        'Quantity', 'Term', 'Instance Type', 'Product Description', 'RI Type',
        'Tenancy', 'accountid', 'family', 'units', 'ri upfront cost',
        'ri total cost', 'ri savings', 'ondemand value', 'algorithm'
    ])

    # NOTE: For usage values, AZ usage is booked by quantity (instances), Region usage is booked by units.

    # Iterate by Union of (Region Instance Groups and RI Groups)
    for group in sorted(
            list(
                set(region_instance_groups.groups.keys())
                | set(ris.groupby(region_instance_groups.keys).groups.keys()))
    ):
        # for group in [('ap-northeast-1', 'c4', 'Shared', 'Linux')]:
        region, family, tenancy, operatingsystem = group
        LOGGER.debug("Evaluting {:>14}:{:3} ({}, {})".format(
            region, family, tenancy, operatingsystem))

        # Account for In-Account AZ RI usage
        # In-Account RI usage only needs to be counted against Regional Usage for accuracy
        try:
            az_ris = ri_groups.get_group(group + tuple(['Availability Zone']))
        except KeyError:
            az_ris = pd.DataFrame(columns=ris.columns)
        az_account_hour_ri_usage = pd.DataFrame(
            columns=az_account_instance_groups.keys +
            ['hourofweek', 'instances'])
        region_account_hour_ri_usage = pd.DataFrame(
            columns=region_account_instance_groups.keys +
            ['hourofweek', 'instance_units'])

        for index, az_ri in az_ris.iterrows():
            LOGGER.debug("Evaluating In-Account AZ RI: {}:{} {} x{}".format(
                az_ri['accountid'], az_ri['availabilityzone'],
                az_ri['instancetype'], az_ri['quantity']))

            try:
                group_key = (az_ri['availabilityzone'], az_ri['instancetype'],
                             tenancy, operatingsystem, az_ri['accountid'])
                az_account_instance_group = az_account_instance_groups.get_group(
                    group_key)
            except KeyError:
                continue

            # Straight to hourofweek average, since there should not be multiple rows per usagestartdate
            in_account_usage = az_account_instance_group.groupby(
                ['hourofweek'])['instances'].mean()

            # Account for already assigned usage from previously evaluated AZ RIs
            in_account_assigned = az_account_hour_ri_usage[
                (az_account_hour_ri_usage['availabilityzone'] ==
                 az_ri['availabilityzone'])
                & (az_account_hour_ri_usage['instancetype'] ==
                   az_ri['instancetype']) &
                (az_account_hour_ri_usage['usageaccountid'] ==
                 az_ri['accountid'])].groupby('hourofweek')['instances'].sum()
            if len(in_account_assigned) > 0:
                in_account_usage -= in_account_assigned

            in_account_used = np.minimum(in_account_usage, az_ri['quantity'])

            # Build assignment usage rows
            usage_keys = pd.DataFrame([group_key],
                                      columns=az_account_instance_groups.keys)
            usage_data = pd.DataFrame({
                'key': 1,
                'hourofweek': in_account_used.index,
                'instances': in_account_used.values
            })
            usage = usage_keys.assign(key=1).merge(usage_data,
                                                   on='key').drop('key', 1)
            LOGGER.debug("In-Account Assigned AZ Usage:\n" + str(usage.head()))
            az_account_hour_ri_usage = az_account_hour_ri_usage.append(
                usage, ignore_index=True)

            # Build regional usage rows
            usage_keys = pd.DataFrame(
                [group + tuple([az_ri['accountid']])],
                columns=region_account_instance_groups.keys)
            usage_data = pd.DataFrame({
                'key': 1,
                'hourofweek': in_account_used.index,
                'instance_units': (in_account_used.values
                                   * get_units(az_ri['instancetype']))
            })
            usage = usage_keys.assign(key=1).merge(usage_data,
                                                   on='key').drop('key', 1)
            LOGGER.debug("In-Account Regional Assigned AZ Usage:\n" +
                         str(usage.head()))
            region_account_hour_ri_usage = region_account_hour_ri_usage.append(
                usage, ignore_index=True)

        # Account for Cross-Account AZ RI Usage
        # To simplify analysis, treat in-account and cross-account identically since we only report unused AZ RIs
        az_ris = az_ris.groupby(['availabilityzone', 'instancetype'])

        # for index, az_ri in az_ris.iterrows():
        for az_group in az_ris.groups.keys():
            availabilityzone, instancetype = az_group
            quantity = az_ris.get_group(az_group)['quantity'].sum()
            LOGGER.debug("Evaluating Cross-Account AZ RI: {} {} x{}".format(
                availabilityzone, instancetype, quantity))
            try:
                group_key = (availabilityzone, instancetype, tenancy,
                             operatingsystem)
                az_instance_group = az_instance_groups.get_group(group_key)
            except KeyError:
                continue

            # Aggregate by hour before hourofweek
            total_usage = az_instance_group.groupby(['usagestartdate', 'hourofweek'])['instances'].sum(). \
                groupby(['hourofweek']).mean()

            # No pre-assigned usage since individual RI subscriptions are getting bundled
            total_used = np.minimum(total_usage, quantity)

            # Add to regional usage for purchase recommendations
            usage_keys = pd.DataFrame(
                [group + tuple(['000000000000'])],
                columns=region_account_instance_groups.keys)
            usage_data = pd.DataFrame({
                'key': 1,
                'hourofweek': total_used.index,
                'instance_units': (total_used.values
                                   * get_units(instancetype))
            })
            usage = usage_keys.assign(key=1).merge(usage_data,
                                                   on='key').drop('key', 1)
            LOGGER.debug("Cross-Account Regional Assigned AZ Usage:\n" +
                         str(usage.head()))
            region_account_hour_ri_usage = region_account_hour_ri_usage.append(
                usage, ignore_index=True)

            unused = quantity - total_used
            if unused.max() > 0:
                unused_az_ri_row = {
                    'availabilityzone': availabilityzone,
                    'instancetype': instancetype,
                    'tenancy': tenancy,
                    'operatingsystem': operatingsystem,
                    'min_unused_qty': unused.min(),
                    'avg_unused_qty': unused.mean(),
                    'max_unused_qty': unused.max(),
                }
                unused_az_ris = unused_az_ris.append(unused_az_ri_row,
                                                     ignore_index=True)
                LOGGER.debug("Unused AZ RIs:\n" + str(unused_az_ri_row))

        # Account for In-Account Region RI Usage
        # In-Account Region RI usage only needed to calculate RI Float
        try:
            region_ris = ri_groups.get_group(group + tuple(['Region']))
        except KeyError:
            region_ris = pd.DataFrame(columns=ris.columns)
        region_hour_ri_usage = pd.DataFrame(
            columns=region_instance_groups.keys + ['hourofweek', 'units'])

        account_region_ris = region_ris.groupby(['accountid'])
        for accountid in account_region_ris.groups.keys():
            ri_units = account_region_ris.get_group(accountid)['units'].sum()
            LOGGER.debug(
                "Evaluating In-Account Region RI: {}:{} {} x{}".format(
                    accountid, region, family, ri_units))

            try:
                group_key = (region, family, tenancy, operatingsystem,
                             accountid)
                region_account_instance_group = region_account_instance_groups.get_group(
                    group_key)
            except KeyError:
                continue

            # Aggregate by hour before hourofweek
            in_account_usage = region_account_instance_group.groupby(['usagestartdate', 'hourofweek']) \
                    ['instance_units'].sum().groupby(['hourofweek']).mean()

            # Account for already assigned usage from AZ RIs
            in_account_assigned = region_account_hour_ri_usage[(
                region_account_hour_ri_usage['usageaccountid'] == accountid
            )].groupby('hourofweek')['instance_units'].sum()
            if len(in_account_assigned) > 0:
                in_account_usage -= in_account_assigned

            in_account_used = np.minimum(in_account_usage, ri_units)

            # Fix partial indexes
            in_account_used = in_account_used.reindex(range(168),
                                                      copy=False,
                                                      fill_value=0.0)

            # Build usage rows
            usage_keys = pd.DataFrame([group],
                                      columns=region_instance_groups.keys)
            usage_data = pd.DataFrame({
                'key': 1,
                'hourofweek': in_account_used.index,
                'units': in_account_used.values
            })
            usage = usage_keys.assign(key=1).merge(usage_data,
                                                   on='key').drop('key', 1)
            LOGGER.debug("In-Account Assigned Region Usage:\n" +
                         str(usage.head()))
            region_hour_ri_usage = region_hour_ri_usage.append(
                usage, ignore_index=True)

        try:
            region_instance_group = region_instance_groups.get_group(group)
        except KeyError:
            # This is a bit heavy, but it shouldn't be called frequently.
            # Create a new DataFrame that has the right structure
            region_instance_group = region_instance_groups.get_group(
                list(region_instance_groups.groups.keys())[0])
            region_instance_group = region_instance_group.assign(instances=0)
            region_instance_group = region_instance_group.assign(
                instance_units=0)
            region_instance_group = region_instance_group.assign(reserved=0)

        # Account for Cross-Account Region RI Usage
        if len(region_ris) == 0:
            ri_units = 0
        else:
            ri_units = region_ris['units'].sum()
            LOGGER.debug(
                "Evaluating Cross-Account Region RI: {} {} x{}".format(
                    region, family, ri_units))

            # Aggregate by hour before hourofweek
            total_usage = region_instance_group.groupby(['usagestartdate', 'hourofweek']) \
                ['instance_units'].sum().groupby(['hourofweek']).mean()

            # In-Account usage to calculate float
            in_account_usage = region_hour_ri_usage.groupby(
                ['hourofweek'])['units'].sum()
            if len(in_account_usage) == 0:
                in_account_usage = pd.Series(0, index=total_usage.index)

            # Floating RIs
            floating_ri_units = ri_units - in_account_usage

            # Instances eligible for float
            floating_instance_units = total_usage - in_account_usage

            # Unused RIs
            unused_ri_units = np.maximum(ri_units - total_usage, 0)

            # % chance a new instance will be covered
            coverage_chance = np.minimum(
                floating_ri_units / floating_instance_units * 100, 100)

            # Build report rows
            usage_keys = pd.DataFrame([group],
                                      columns=region_instance_groups.keys)
            usage_data = pd.DataFrame({
                'key': 1,
                'hourofweek': total_usage.index,
                'total_ri_units': ri_units,
                'total_instance_units': total_usage.values,
                'floating_ri_units': floating_ri_units.values,
                'floating_instance_units': floating_instance_units.values,
                'unused_ri_units': unused_ri_units.values,
                'coverage_chance': coverage_chance.values,
            })
            usage = usage_keys.assign(key=1).merge(usage_data,
                                                   on='key').drop('key', 1)
            LOGGER.debug("Cross-Account Region Usage Report:\n" +
                         str(usage.head()))
            ri_hourly_usage_report = ri_hourly_usage_report.append(
                usage, ignore_index=True)

        # RI Utilization Evaluation complete.  Evaluate Purchase recommendations
        if region_instance_group['instance_units'].sum() > 0:

            # Calculate usage slope to determine purchase aggressiveness

            # First filter data to reduce noise.
            region_hourly_usage = region_instance_group.groupby(
                ['usagestartdate', 'hourofweek'])['instance_units'].sum()
            threshold = int(
                utils.get_config_value(config, 'RI_PURCHASES',
                                       'FILTER_THRESHOLD', 3))
            signal = region_hourly_usage.values.copy()
            delta = np.abs(signal - np.mean(signal))
            median_delta = np.median(delta)
            if median_delta > 0:
                mask = (delta / float(median_delta)) > threshold
                signal[mask] = np.median(signal)

            # Least squares fit
            ts = region_hourly_usage.reset_index()['usagestartdate'].apply(
                lambda x: x.timestamp())
            A = np.vstack([ts, np.ones(len(ts))]).T
            y = signal
            m, c = np.linalg.lstsq(A, y, rcond=None)[0]
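            # m is in units per second; multiplying by 86400 yields units per day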
            slope = m * 86400

            # Determine RI Algorithm
            algorithm = 'DEFAULT'
            aggressive_threshold = utils.get_config_value(
                config, 'RI_PURCHASES', 'AGGRESSIVE_THRESHOLD', 'NONE')
            try:
                if slope >= float(aggressive_threshold):
                    algorithm = 'AGGRESSIVE'
            except ValueError:
                pass

            conservative_threshold = utils.get_config_value(
                config, 'RI_PURCHASES', 'CONSERVATIVE_THRESHOLD', 'NONE')
            try:
                if slope <= float(conservative_threshold):
                    algorithm = 'CONSERVATIVE'
            except ValueError:
                pass

            # Subtract AZ RI usage from instance usage since, for the most part, it can be ignored completely.
            az_assigned = region_account_hour_ri_usage.groupby(
                'hourofweek')['instance_units'].sum()
            if len(az_assigned) > 0:
                region_hourly_usage -= az_assigned

            # Determine our purchase size for this family
            types = [
                key for key in pricing[region].keys()
                if key.startswith(family + '.')
            ]
            type_units = {key: get_units(key) for key in types}
            desired_size = utils.get_config_value(config, 'RI_PURCHASES',
                                                  'RI_SIZE', 'largest')
            if desired_size == 'largest':
                purchase_size, purchase_size_units = max(
                    type_units.items(), key=operator.itemgetter(1))
            elif desired_size == 'smallest':
                purchase_size, purchase_size_units = min(
                    type_units.items(), key=operator.itemgetter(1))
            else:
                desired_size_units = get_units(desired_size)
                filtered_units = {
                    k: v
                    for k, v in type_units.items() if v <= desired_size_units
                }
                if len(filtered_units) > 0:
                    purchase_size, purchase_size_units = max(
                        filtered_units.items(), key=operator.itemgetter(1))
                else:
                    purchase_size, purchase_size_units = min(
                        type_units.items(), key=operator.itemgetter(1))

            # Get RI Details
            term = utils.get_config_value(config, 'RI_PURCHASES', 'RI_TERM')
            term_h = int(term) * 730
            term_y = '3yr' if int(term) == 36 else '1yr'
            if region not in pricing or purchase_size not in pricing[region] or \
                    tenancy not in pricing[region][purchase_size] or \
                    operatingsystem not in pricing[region][purchase_size][tenancy]:
                LOGGER.error('Missing RI Pricing data for {}:{}:{}:{}'.format(
                    region, purchase_size, tenancy, operatingsystem))
                continue

            for offering in ['standard', 'convertible']:
                # Get RI Pricing Data
                offering2 = 'classic' if offering == 'standard' else 'convertible'
                rates = pricing[region][purchase_size][tenancy][
                    operatingsystem]
                od_rate = rates['onDemandRate']
                ri_rate = None
                option = utils.get_config_value(config, 'RI_PURCHASES',
                                                'RI_OPTION')
                for o in (option, 'No Upfront', 'Partial Upfront',
                          'All Upfront'):
                    ri_key = '{}-{}-{}'.format(term_y, offering, o)
                    if ri_key in rates['reserved']:
                        ri_rate = rates['reserved'][ri_key]
                        option = o
                        break
                if ri_rate is None:
                    LOGGER.error('Missing RI Pricing data(2) for {}:{}'.format(
                        region, purchase_size))
                    continue

                for slush in [False, True]:
                    utilization_key = "{}_{}_{}UTIL_TARGET".format(
                        offering.upper(), algorithm, 'SLUSH_' if slush else '')
                    target_utilization = utils.get_config_value(
                        config, 'RI_PURCHASES', utilization_key, 'NONE')

                    if target_utilization == 'BREAK_EVEN':
                        target_utilization = (ri_rate['upfront'] +
                                              ri_rate['hourly'] * term_h) / (
                                                  od_rate * term_h) * 100
                        # RI Total Cost / OnDemand Cost = Break Even Utilization
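                        # e.g. with hypothetical rates upfront=$200, hourly=$0.05,
                        # od_rate=$0.10, term_h=8760:
                        #   (200 + 0.05 * 8760) / (0.10 * 8760) * 100 ~= 72.8%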

                    LOGGER.debug(
                        "Purchase: {:>14}:{:3} {:11} {:5}: slope={} algo={} target={}"
                        .format(region, family, offering,
                                'slush' if slush else 'acct', slope, algorithm,
                                target_utilization))

                    if target_utilization == 'NONE':
                        continue

                    # Subtract existing RIs from usage to determine demand
                    demand_hourly_usage = region_hourly_usage - ri_units

                    # Edge case fix here...
                    # If usage only exists for part of the timerange, its percentile will be incorrect
                    # unless we fill it with zeros.
                    demand_hourly_usage = demand_hourly_usage.reset_index(
                        level=1, drop=True).reindex(timerange, fill_value=0.0)

                    # Subtract previously recommended RIs from usage
                    prior_ri_units = ri_purchases[
                        (ri_purchases['Region'] == region)
                        & (ri_purchases['family'] == family) &
                        (ri_purchases['Product Description']
                         == operatingsystem) &
                        (ri_purchases['Tenancy'] == tenancy)]['units'].sum()
                    demand_hourly_usage -= prior_ri_units

                    # Evaluate demand at the target utilization percentile:
                    # indexing sorted usage at (100 - target)% picks the level
                    # that would be in use at least target% of hours.
                    demand_units = sorted(demand_hourly_usage.values)[int(
                        len(demand_hourly_usage.values) *
                        (100 - target_utilization) / 100)]
                    demand_units -= demand_units % purchase_size_units
                    if demand_units < purchase_size_units:
                        LOGGER.debug(
                            "Purchase: {:>14}:{:3} : No additional RIs required"
                            .format(region, family))
                    else:
                        # Recommend purchases in accounts with the most uncovered demand

                        # Calculate per-account demand (single number at percentile)
                        if slush:
                            account_demand = pd.DataFrame(
                                {
                                    'accountid': utils.get_config_value(
                                        config, 'RI_PURCHASES',
                                        'SLUSH_ACCOUNT'),
                                    'units': demand_units,
                                },
                                index=[0])
                        else:
                            # Edge case fix here...
                            # If an account only has usage for part of the window, its percentile will be incorrect
                            # unless we fill the timerange with zeros.
                            idx = pd.merge(
                                pd.DataFrame({
                                    'key': 1,
                                    'usageaccountid':
                                    region_instance_group['usageaccountid'].unique()
                                }),
                                pd.DataFrame({
                                    'key': 1,
                                    'usagestartdate': timerange
                                }),
                                on='key')[['usageaccountid', 'usagestartdate']]
                            account_demand = region_instance_group.groupby(['usageaccountid', 'usagestartdate']) \
                                ['instance_units'].sum().reindex(idx, fill_value=0.0).groupby('usageaccountid'). \
                                agg(lambda x: np.percentile(x, q=100 - target_utilization))

                            # subtract in-account RIs
                            account_ris = region_ris.groupby(
                                ['accountid'])['units'].sum()
                            account_demand = account_demand.subtract(
                                account_ris, fill_value=0)
                            account_demand = pd.DataFrame({
                                'accountid': account_demand.index,
                                'units': account_demand.values
                            })

                            # Normalize to purchase units
                            account_demand['units'] -= account_demand[
                                'units'] % purchase_size_units

                            # Filter for positive demand
                            account_demand = account_demand[
                                account_demand['units'] > 0]

                            # subtract from bottom to allow equal float opportunity
                            while account_demand['units'].sum(
                            ) > demand_units + len(
                                    account_demand) * purchase_size_units:
                                excess_qty_per_account = int(
                                    (account_demand['units'].sum() -
                                     demand_units) / purchase_size_units /
                                    len(account_demand))
                                account_demand[
                                    'units'] -= excess_qty_per_account * purchase_size_units
                                account_demand = account_demand[
                                    account_demand['units'] > 0]

                            # Consistently distribute stragglers
                            if account_demand['units'].sum() > demand_units:
                                excess_qty = int(
                                    (account_demand['units'].sum() -
                                     demand_units) / purchase_size_units)
                                sorted_accounts = account_demand.sort_values(
                                    ['units', 'accountid'])
                                delta = pd.Series(
                                    [purchase_size_units] * excess_qty + [0] *
                                    (len(account_demand) - excess_qty),
                                    index=sorted_accounts.index)
                                account_demand['units'] -= delta
                                account_demand = account_demand[
                                    account_demand['units'] > 0]

                        # Build report rows
                        quantity = (account_demand['units'] /
                                    purchase_size_units).astype(int)
                        purchases = pd.DataFrame({
                            'Account ID': account_demand['accountid'].apply(
                                lambda x: '="{0:012}"'.format(x)),
                            'Scope': 'Region',
                            'Region': region,
                            'External AZ': '',
                            'Offering Class': offering2,
                            'Quantity': quantity,
                            'Term': term,
                            'Instance Type': purchase_size,
                            'Product Description': operatingsystem,
                            'RI Type': option,
                            'Tenancy': tenancy,
                            'accountid': account_demand['accountid'].apply(
                                lambda x: '{0:012}'.format(x)),
                            'family': family,
                            'units': account_demand['units'].astype(int),
                            'ri upfront cost': quantity * ri_rate['upfront'],
                            'ri total cost': quantity *
                            (ri_rate['upfront'] + ri_rate['hourly'] * term_h),
                            'ri savings': quantity *
                            ((od_rate - ri_rate['hourly']) * term_h -
                             ri_rate['upfront']),
                            'ondemand value': quantity * od_rate * term_h,
                            'algorithm': algorithm
                        })
                        LOGGER.debug("Purchases:\n" + str(purchases.head()))
                        ri_purchases = ri_purchases.append(purchases,
                                                           ignore_index=True)

                        # Assign to top until fully assigned
                        LOGGER.debug(
                            "Purchase: {:>14}:{:3} : type={} demand={}, recommend={} in {} accounts"
                            .format(region, family, purchase_size,
                                    demand_units,
                                    account_demand['units'].sum(),
                                    len(account_demand)))

    # GroupBy to assign appropriate index columns
    unused_az_ris = unused_az_ris.groupby(az_instance_groups.keys).sum()
    ri_hourly_usage_report = ri_hourly_usage_report.groupby(
        region_instance_groups.keys + ['hourofweek']).sum()
    instances = instances.drop('hourofweek', 1)

    # https://github.com/yahoo/ariel/issues/8: this is necessary if the accounts have not purchased any RIs
    if (len(ri_hourly_usage_report) == 0):
        ri_hourly_usage_report = pd.DataFrame(columns=[
            'region', 'instancetypefamily', 'tenancy', 'operatingsystem',
            'hourofweek', 'total_ri_units', 'total_instance_units',
            'floating_ri_units', 'floating_instance_units', 'unused_ri_units',
            'coverage_chance'
        ])
        ri_usage_report = pd.DataFrame(columns=[
            'region', 'instancetypefamily', 'tenancy', 'operatingsystem',
            'total_ri_units', 'total_instance_units', 'floating_ri_units',
            'floating_instance_units', 'unused_ri_units', 'coverage_chance',
            'xl_effective_rate', 'monthly_ri_cost', 'monthly_od_cost',
            'monthly_ri_savings'
        ])
    else:
        # Build RI Usage report with Actual cost benefit
        ri_usage_report = ri_hourly_usage_report.groupby(
            region_instance_groups.keys).mean()
        ri_cost = ris.groupby(
            region_instance_groups.keys)['amortizedupfrontprice'].sum(
            ) + ris.groupby(
                region_instance_groups.keys)['amortizedrecurringfee'].sum()
        def _od_cost(x):
            region, family, tenancy, operatingsystem = x.name
            reference_type = '{}.{}'.format(family, reference_sizes[family])
            rate = pricing[region][reference_type][tenancy][operatingsystem][
                'onDemandRate']
            used = min(x['total_ri_units'], x['total_instance_units'])
            return 720 * rate * used / get_units(reference_type)

        od_cost = ri_usage_report.apply(_od_cost, axis=1)
        xl_effective_rate = (
            (od_cost - ri_cost) *
            (100 - ri_usage_report['coverage_chance']) / 100 +
            ri_cost) / 720 / ri_usage_report['total_ri_units'] * 8
        ri_usage_report.insert(len(ri_usage_report.columns),
                               'xl_effective_rate', xl_effective_rate)
        ri_usage_report.insert(len(ri_usage_report.columns), 'monthly_ri_cost',
                               ri_cost)
        ri_usage_report.insert(len(ri_usage_report.columns), 'monthly_od_cost',
                               od_cost)
        ri_usage_report.insert(len(ri_usage_report.columns),
                               'monthly_ri_savings', od_cost - ri_cost)

        # Apply some column formats
        ri_hourly_usage_report['total_ri_units'] = ri_hourly_usage_report[
            'total_ri_units'].map('{:.0f}'.format)
        ri_hourly_usage_report[
            'total_instance_units'] = ri_hourly_usage_report[
                'total_instance_units'].map('{:.0f}'.format)
        ri_hourly_usage_report['floating_ri_units'] = ri_hourly_usage_report[
            'floating_ri_units'].map('{:.0f}'.format)
        ri_hourly_usage_report[
            'floating_instance_units'] = ri_hourly_usage_report[
                'floating_instance_units'].map('{:.0f}'.format)
        ri_hourly_usage_report['unused_ri_units'] = ri_hourly_usage_report[
            'unused_ri_units'].map('{:.0f}'.format)
        ri_hourly_usage_report['coverage_chance'] = ri_hourly_usage_report[
            'coverage_chance'].map('{:.2f}'.format)
        ri_usage_report['total_ri_units'] = ri_usage_report[
            'total_ri_units'].map('{:.0f}'.format)
        ri_usage_report['total_instance_units'] = ri_usage_report[
            'total_instance_units'].map('{:.0f}'.format)
        ri_usage_report['floating_ri_units'] = ri_usage_report[
            'floating_ri_units'].map('{:.0f}'.format)
        ri_usage_report['floating_instance_units'] = ri_usage_report[
            'floating_instance_units'].map('{:.0f}'.format)
        ri_usage_report['unused_ri_units'] = ri_usage_report[
            'unused_ri_units'].map('{:.0f}'.format)
        ri_usage_report['coverage_chance'] = ri_usage_report[
            'coverage_chance'].map('{:.2f}'.format)
        ri_usage_report['xl_effective_rate'] = ri_usage_report[
            'xl_effective_rate'].map('${:,.4f}'.format)
        ri_usage_report['monthly_ri_cost'] = ri_usage_report[
            'monthly_ri_cost'].map('${:,.2f}'.format)
        ri_usage_report['monthly_od_cost'] = ri_usage_report[
            'monthly_od_cost'].map('${:,.2f}'.format)
        ri_usage_report['monthly_ri_savings'] = ri_usage_report[
            'monthly_ri_savings'].map('${:,.2f}'.format)
        ri_purchases['ri upfront cost'] = ri_purchases['ri upfront cost'].map(
            '${:,.2f}'.format)
        ri_purchases['ri total cost'] = ri_purchases['ri total cost'].map(
            '${:,.2f}'.format)
        ri_purchases['ri savings'] = ri_purchases['ri savings'].map(
            '${:,.2f}'.format)
        ri_purchases['ondemand value'] = ri_purchases['ondemand value'].map(
            '${:,.2f}'.format)

    reports = {
        "ACCOUNT_INSTANCE_SUMMARY": instances,
        "RI_PURCHASES": ri_purchases,
        "RI_USAGE": ri_usage_report,
        "RI_HOURLY_USAGE": ri_hourly_usage_report,
        "UNUSED_AZ_RIS": unused_az_ris,
    }

    return reports
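
The purchase-aggressiveness logic above hinges on the least-squares slope fit. A
minimal sketch of that computation in isolation, on hypothetical synthetic usage (a
pandas Series of hourly instance units):

import numpy as np
import pandas as pd

# Hypothetical usage: 48 hours trending up by roughly 5 units/day.
idx = pd.date_range('2019-07-01', periods=48, freq='H')
usage = pd.Series(np.linspace(100.0, 110.0, 48), index=idx)

# Same fit as generate(): x is epoch seconds, y is usage units.
ts = usage.index.map(lambda t: t.timestamp())
A = np.vstack([ts, np.ones(len(ts))]).T
m, c = np.linalg.lstsq(A, usage.values, rcond=None)[0]
slope = m * 86400  # units per day
print(round(slope, 2))  # ~5.11 for this series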
Example 3
def load(config):

    # If the local cache file exists and is less than a day old, just use it.
    cache_file = '/tmp/cached-unlimited-summary.csv'
    caching = utils.get_config_value(config, 'DEFAULTS', 'CACHING', False)
    mtime = 0
    if caching:
        try:
            mtime = os.stat(cache_file).st_mtime
        except FileNotFoundError:
            mtime = 0

    if mtime > time.time() - 86400:
        LOGGER.info("Using existing cache file: " + cache_file)
    else:
        account, role = utils.get_master(config)
        region = utils.get_config_value(
            config, 'ATHENA', 'AWS_REGION',
            utils.get_config_value(config, 'DEFAULTS', 'AWS_REGION',
                                   os.environ.get('AWS_DEFAULT_REGION')))
        database = utils.get_config_value(config, 'ATHENA', 'CUR_DATABASE')
        staging = utils.get_config_value(
            config, 'ATHENA', 'STAGING',
            's3://aws-athena-query-results-{0}-{1}/ariel-cur-output/'.format(
                account, region))
        days = utils.get_config_value(config, 'ATHENA', 'DAYS', 28)
        offset = utils.get_config_value(config, 'ATHENA', 'OFFSET', 1)

        session = boto3.Session()
        proto, empty, staging_bucket, staging_prefix = staging.split('/', 3)

        # Assume role if needed
        if role is not None:
            session = utils.assume_role(session, role)

        # Connect to Athena
        athena = session.client('athena', region_name=region)

        # Validate database is usable
        status_id = utils.execute_athena_query(
            athena, staging,
            'SELECT status FROM ' + database + '.cost_and_usage_data_status')

        # Row 0 is header
        status = athena.get_query_results(
            QueryExecutionId=status_id
        )['ResultSet']['Rows'][1]['Data'][0]['VarCharValue']
        if status != 'READY':
            raise Exception('Athena database not in READY status')

        # Identify start to end range query
        today = datetime.datetime.combine(datetime.datetime.today(),
                                          datetime.time.min)
        endtime = today - datetime.timedelta(days=offset)
        starttime = endtime - datetime.timedelta(days=days)

        # Download Instance and RI usage
        query = ' '.join((
            "SELECT line_item_usage_account_id AS accountid, "
            "       product_region AS region, "
            "       lower(product_instance) AS instancetypefamily, "
            "       sum(line_item_usage_amount) AS unlimitedusageamount, "
            "       sum(line_item_unblended_cost) AS unlimitedusagecost "
            "  FROM " + database + ".cur "
            " WHERE line_item_usage_type like '%CPUCredits:%' " +
            "   AND line_item_usage_start_date >= cast('{}' as timestamp) ".format(
                starttime.isoformat(' ')) +
            "   AND line_item_usage_start_date < cast('{}' as timestamp) ".format(
                endtime.isoformat(' ')) +
            " GROUP BY line_item_usage_account_id, product_region, lower(product_instance) "
            " ORDER BY line_item_usage_account_id, product_region, lower(product_instance) "
        ).split())
        query_id = utils.execute_athena_query(athena, staging, query)
        session.client('s3').download_file(
            staging_bucket, '{0}{1}.csv'.format(staging_prefix, query_id),
            cache_file)

    result = pd.read_csv(cache_file)
    if len(result) == 0:
        result = pd.DataFrame(columns=[
            'accountid', 'region', 'instancetypefamily',
            'unlimitedusageamount', 'unlimitedusagecost'
        ])

    result['accountid'] = result['accountid'].map('{:012}'.format)
    result['unlimitedusageamount'] = result['unlimitedusageamount'].map(
        '{:.2f}'.format)
    result['unlimitedusagecost'] = result['unlimitedusagecost'].map(
        '${:,.2f}'.format)
    LOGGER.info("Loaded {} unlimited rows".format(len(result)))
    return result
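
All of the Athena-backed loaders in this set parse the staging URL the same way, with
a bounded split; a quick illustration with a hypothetical account id:

staging = 's3://aws-athena-query-results-123456789012-us-east-1/ariel-cur-output/'
proto, empty, staging_bucket, staging_prefix = staging.split('/', 3)
# proto          -> 's3:'
# empty          -> ''
# staging_bucket -> 'aws-athena-query-results-123456789012-us-east-1'
# staging_prefix -> 'ariel-cur-output/'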
Example 4
def load(config):

    # If the local cache file exists and is less than a day old, just use it.
    cache_file = '/tmp/cached-reserved-instances.csv'
    caching = utils.get_config_value(config, 'DEFAULTS', 'CACHING', False)
    mtime = 0
    if caching:
        try:
            mtime = os.stat(cache_file).st_mtime
        except FileNotFoundError:
            pass

    if mtime > time.time() - 86400:
        LOGGER.info("Using existing cache file: " + cache_file)
        ris = pd.read_csv(cache_file)
    else:
        account, role = utils.get_master(config)
        region = utils.get_config_value(
            config, 'RESERVED_INSTANCES', 'AWS_REGION',
            utils.get_config_value(config, 'DEFAULTS', 'AWS_REGION',
                                   os.environ.get('AWS_DEFAULT_REGION')))
        # GetReservationUtilization requires the start date to be no later than two days ago
        monthend = datetime.date.today()
        monthstart = (datetime.date.today() - datetime.timedelta(days=31))

        ris = []
        if role != '':
            session = utils.assume_role(boto3.Session(), role)
            ce = session.client('ce', region_name=region)

            rsp = ce.get_reservation_utilization(
                TimePeriod={
                    "Start": str(monthstart),
                    "End": str(monthend)
                },
                GroupBy=[{
                    "Type": "DIMENSION",
                    "Key": "SUBSCRIPTION_ID"
                }])

            while True:
                groups = rsp['UtilizationsByTime'][0]['Groups']
                for row in groups:
                    # Make sure to only capture active RIs
                    endDate = datetime.datetime.strptime(
                        row['Attributes']['endDateTime'],
                        "%Y-%m-%dT%H:%M:%S.000Z")
                    if endDate.date() > datetime.date.today():
                        # Normalize the platform name for CUR compatibility
                        platform = row['Attributes']['platform']
                        operatingSystem = ('Linux' if platform == 'Linux/UNIX'
                                           else platform)
                        ri = {
                            'accountid': int(row['Attributes']['accountId']),
                            'accountname': row['Attributes']['accountName'],
                            'reservationid': row['Attributes']['leaseId'],
                            'subscriptionid': row['Attributes']['subscriptionId'],
                            'startdate': row['Attributes']['startDateTime'],
                            'enddate': row['Attributes']['endDateTime'],
                            'state': row['Attributes']['subscriptionStatus'],
                            'quantity': int(row['Attributes']['numberOfInstances']),
                            'availabilityzone': row['Attributes']['availabilityZone'],
                            'region': row['Attributes']['region'],
                            'instancetype': row['Attributes']['instanceType'],
                            'paymentoption': row['Attributes']['subscriptionType'],
                            'tenancy': row['Attributes']['tenancy'],
                            'operatingsystem': operatingSystem,
                            'amortizedhours': int(row['Utilization']['PurchasedHours']),
                            'amortizedupfrontprice': float(row['Utilization']['AmortizedUpfrontFee']),
                            'amortizedrecurringfee': float(row['Utilization']['AmortizedRecurringFee']),
                            'offeringclass': row['Attributes']['offeringType'],
                            'scope': row['Attributes']['scope'],
                        }
                        ris.append(ri)

                if 'NextPageToken' in rsp:
                    # Cost Explorer paginates with NextPageToken; the required
                    # request parameters must be re-sent along with the token.
                    rsp = ce.get_reservation_utilization(
                        TimePeriod={"Start": str(monthstart), "End": str(monthend)},
                        GroupBy=[{"Type": "DIMENSION", "Key": "SUBSCRIPTION_ID"}],
                        NextPageToken=rsp['NextPageToken'])
                    continue
                break

        ris = pd.DataFrame.from_records(ris)
        ris.to_csv(cache_file, index=False)

    LOGGER.info("Loaded {} reserved instances".format(len(ris)))
    return ris
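
Cost Explorer pagination is easy to get wrong: the response token is NextPageToken
(not NextToken), and the required request parameters must be re-sent with every page,
as in the loop fixed above. A generic generator sketch of the same pattern, assuming a
boto3 Cost Explorer client:

def iter_reservation_utilization(ce, time_period, group_by):
    """Yield utilization groups across all Cost Explorer result pages."""
    kwargs = {'TimePeriod': time_period, 'GroupBy': group_by}
    while True:
        rsp = ce.get_reservation_utilization(**kwargs)
        for group in rsp['UtilizationsByTime'][0]['Groups']:
            yield group
        token = rsp.get('NextPageToken')
        if not token:
            return
        kwargs['NextPageToken'] = token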
Example 5
def load(config, locations=LOCATIONS):

    # If the local cache file exists and is less than a day old, just use it.
    cache_file = '/tmp/cached-ec2-pricing.json'
    caching = utils.get_config_value(config, 'DEFAULTS', 'CACHING', False)
    mtime = 0
    if caching:
        try:
            mtime = os.stat(cache_file).st_mtime
        except FileNotFoundError:
            pass

    if mtime > time.time() - 86400:
        LOGGER.info("Using existing cache file: " + cache_file)
    else:
        pricing_url = utils.get_config_value(
            config, 'PRICING', 'URL',
            'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/index.csv')

        csv_reader = csv.reader(
            codecs.iterdecode(urlopen(pricing_url), 'utf-8'),
            utils.CsvDialect())

        # Find header row
        header_map = None
        while header_map is None:
            header = next(csv_reader)
            if header[0] == "SKU":
                header_map = utils.parse_header(header)

        rowcount = 0
        prices = {}
        units = {}
        for row in csv_reader:
            if not check_row(header_map, row):
                continue

            sku = row[header_map['SKU']]
            location = row[header_map['Location']]
            instanceType = row[header_map['Instance Type']]
            tenancy = row[header_map['Tenancy']]
            operatingsystem = row[header_map['Operating System']]

            # Resolve the AWS Region from location information.
            if location not in locations:
                locations[location] = utils.get_config_value(config, 'LOCATIONS', location, '')
                if locations[location] == '':
                    LOGGER.info('Skipping unknown location: {}'.format(location))
            if locations[location] == '':
                continue
            region = locations[location]

            # Populate result set
            if region not in prices:
                prices[region] = {}
            if instanceType not in prices[region]:
                prices[region][instanceType] = {}
            if tenancy not in prices[region][instanceType]:
                prices[region][instanceType][tenancy] = {}
            if operatingsystem not in prices[region][instanceType][tenancy]:
                prices[region][instanceType][tenancy][operatingsystem] = {
                    "sku": sku,
                    "reserved": {}
                }
                units[instanceType] = float(row[header_map['Normalization Size Factor']])

            price = prices[region][instanceType][tenancy][operatingsystem]
            if price['sku'] != sku:
                LOGGER.warning('Duplicate sku: {}:{} -> {} != {}'.format(
                    region, instanceType, sku, price['sku']))
                continue

            # Add pricing data
            if row[header_map['TermType']] == 'OnDemand':
                if row[header_map['Unit']] in ('Hrs', 'Hours'):
                    price['onDemandRate'] = float(row[header_map['PricePerUnit']])
            elif row[header_map['TermType']] == 'Reserved':
                id = '{}-{}-{}'.format(row[header_map['LeaseContractLength']], row[header_map['OfferingClass']],
                                       row[header_map['PurchaseOption']])
                if id not in price['reserved']:
                    price['reserved'][id] = { 'upfront': 0.0, 'hourly': 0.0 }
                if row[header_map['Unit']] in ('Hrs', 'Hours'):
                    price['reserved'][id]['hourly'] = float(row[header_map['PricePerUnit']])
                elif row[header_map['Unit']] == 'Quantity':
                    price['reserved'][id]['upfront'] = float(row[header_map['PricePerUnit']])

            rowcount += 1
        LOGGER.info("Loaded {} pricing rows".format(rowcount))

        # Trim useless data
        rowcount = 0
        remove = []
        for region in prices:
            for instanceType in prices[region]:
                for tenancy in prices[region][instanceType]:
                    for operatingsystem in prices[region][instanceType][tenancy]:
                        if 'onDemandRate' not in prices[region][instanceType][tenancy][operatingsystem] or \
                                len(prices[region][instanceType][tenancy][operatingsystem]['reserved']) == 0:
                            remove.append([region, instanceType, tenancy, operatingsystem])
                        else:
                            rowcount += 1
        for keys in remove:
            del prices[keys[0]][keys[1]][keys[2]][keys[3]]
        prices['units'] = units
        LOGGER.info("Loaded prices for {} instance types".format(rowcount))

        with open(cache_file, 'w') as outfile:
            json.dump(prices, outfile, indent=4)
        return prices

    with utils.get_read_handle(cache_file) as input:
        prices = json.load(input)
        return prices
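
The returned structure nests region -> instance type -> tenancy -> operating system,
with normalization factors under the synthetic top-level 'units' key. A sketch of how
a consumer such as generate() indexes it, assuming these keys appear in the current
price index:

prices = load(config)
rate = prices['us-east-1']['m5.large']['Shared']['Linux']
print(rate['onDemandRate'])                         # hourly on-demand rate
print(rate['reserved']['1yr-standard-No Upfront'])  # {'upfront': ..., 'hourly': ...}
print(prices['units']['m5.large'])                  # normalization factor (4.0)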
Example 6
def load(config):

    # If the local cache file exists and is less than a day old, just use it.
    cache_file = '/tmp/cached-locations.yaml'
    caching = utils.get_config_value(config, 'DEFAULTS', 'CACHING', False)
    mtime = 0
    if caching:
        try:
            mtime = os.stat(cache_file).st_mtime
        except FileNotFoundError:
            pass

    if mtime > time.time() - 86400:
        LOGGER.info("Using existing cache file: " + cache_file)
    else:
        account, role = utils.get_master(config)
        region = utils.get_config_value(
            config, 'ATHENA', 'AWS_REGION',
            utils.get_config_value(config, 'DEFAULTS', 'AWS_REGION',
                                   os.environ.get('AWS_DEFAULT_REGION')))
        database = utils.get_config_value(config, 'ATHENA', 'CUR_DATABASE')
        staging = utils.get_config_value(
            config, 'ATHENA', 'STAGING',
            's3://aws-athena-query-results-{0}-{1}/ariel-cur-output/'.format(
                account, region))
        days = utils.get_config_value(config, 'ATHENA', 'DAYS', 28)
        offset = utils.get_config_value(config, 'ATHENA', 'OFFSET', 1)

        session = boto3.Session()
        proto, empty, staging_bucket, staging_prefix = staging.split('/', 3)

        # Assume role if needed
        if role is not None:
            session = utils.assume_role(session, role)

        # Connect to Athena
        athena = session.client('athena', region_name=region)

        # Validate database is usable
        status_id = utils.execute_athena_query(
            athena, staging,
            'SELECT status FROM ' + database + '.cost_and_usage_data_status')

        # Row 0 is header
        status = athena.get_query_results(
            QueryExecutionId=status_id
        )['ResultSet']['Rows'][1]['Data'][0]['VarCharValue']
        if status != 'READY':
            raise Exception('Athena database not in READY status')

        # Identify start to end range query
        today = datetime.datetime.combine(datetime.datetime.today(),
                                          datetime.time.min)
        endtime = today - datetime.timedelta(days=offset)
        starttime = endtime - datetime.timedelta(days=days)

        # Retrieve location to region mapping for use with ec2 pricing data
        query = ' '.join((
            "SELECT DISTINCT product_location, product_region "
            "  FROM " + database + ".cur " +
            " WHERE line_item_usage_start_date >= cast('{}' as timestamp) ".format(
                starttime.isoformat(' ')) +
            "   AND line_item_usage_start_date < cast('{}' as timestamp) ".format(
                endtime.isoformat(' ')) +
            "   AND product_operation = 'RunInstances' ").split())
        map_id = utils.execute_athena_query(athena, staging, query)
        map_result = athena.get_query_results(
            QueryExecutionId=map_id)['ResultSet']['Rows']
        locations = {}
        # Row 0 is header
        for i in range(1, len(map_result)):
            row = map_result[i]['Data']
            location = row[0]['VarCharValue']
            region_code = row[1]['VarCharValue']
            locations[location] = region_code
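        # The resulting map translates CUR location names into region codes,
        # e.g. {'US East (N. Virginia)': 'us-east-1'} (example value assumed).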

        with open(cache_file, 'w') as outfile:
            yaml.dump(locations, outfile, default_flow_style=False)
        return locations

    with utils.get_read_handle(cache_file) as input:
        locations = yaml.load(input, Loader=yaml.FullLoader)
        return locations
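
Each of these loaders parses the staging URI with staging.split('/', 3). A quick illustration of why that yields the bucket and key prefix (the URI below is hypothetical):

staging = 's3://aws-athena-query-results-123456789012-us-east-1/ariel-cur-output/'
proto, empty, staging_bucket, staging_prefix = staging.split('/', 3)
# proto          == 's3:'
# empty          == ''   (from the '//' separator)
# staging_bucket == 'aws-athena-query-results-123456789012-us-east-1'
# staging_prefix == 'ariel-cur-output/'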
Example 7
def load(config):

    # If a local cache file exists and is less than a day old, just use it.
    cache_file = '/tmp/cached-account-instance-summary.csv'
    caching = utils.get_config_value(config, 'DEFAULTS', 'CACHING', False)
    mtime = 0
    if caching:
        try:
            mtime = os.stat(cache_file).st_mtime
        except FileNotFoundError:
            mtime = 0

    if mtime > time.time() - 86400:
        LOGGER.info("Using existing cache file: " + cache_file)
    else:
        account, role = utils.get_master(config)
        region = utils.get_config_value(
            config, 'ATHENA', 'AWS_REGION',
            utils.get_config_value(config, 'DEFAULTS', 'AWS_REGION',
                                   os.environ.get('AWS_DEFAULT_REGION')))
        database = utils.get_config_value(config, 'ATHENA', 'CUR_DATABASE')
        table_name = utils.get_config_value(config, 'ATHENA', 'CUR_TABLE_NAME',
                                            'cur')
        staging = utils.get_config_value(
            config, 'ATHENA', 'STAGING',
            's3://aws-athena-query-results-{0}-{1}/ariel-cur-output/'.format(
                account, region))
        days = utils.get_config_value(config, 'ATHENA', 'DAYS', 28)
        offset = utils.get_config_value(config, 'ATHENA', 'OFFSET', 1)

        session = boto3.Session()
        proto, empty, staging_bucket, staging_prefix = staging.split('/', 3)

        # Assume role if needed
        if role is not None:
            session = utils.assume_role(session, role)

        # Connect to Athena
        athena = session.client('athena', region_name=region)

        # Validate database is usable
        status_id = utils.execute_athena_query(
            athena, staging,
            'SELECT status FROM ' + database + '.cost_and_usage_data_status')

        # Row 0 is header
        status = athena.get_query_results(
            QueryExecutionId=status_id
        )['ResultSet']['Rows'][1]['Data'][0]['VarCharValue']
        if status != 'READY':
            raise Exception('Athena database not in READY status')

        # Identify the start/end range for the query
        today = datetime.datetime.combine(datetime.datetime.today(),
                                          datetime.time.min)
        endtime = today - datetime.timedelta(days=offset)
        starttime = endtime - datetime.timedelta(days=days)

        # Download Instance and RI usage.
        # Availability zones containing '-wlz-' are Wavelength zones and are
        # filtered out below, because Wavelength instances are not available
        # for RIs.
        query = ' '.join("""
              WITH preprocess AS (
                   SELECT line_item_usage_start_date AS usagestartdate,
                          line_item_usage_account_id AS usageaccountid,
                          line_item_availability_zone AS availabilityzone,
                          CASE WHEN line_item_usage_type LIKE '%:%' THEN SPLIT(line_item_usage_type, ':')[2]
                               WHEN line_item_line_item_description LIKE '%m1.small%' THEN 'm1.small'
                               WHEN line_item_line_item_description LIKE '%m1.medium%' THEN 'm1.medium'
                               WHEN line_item_line_item_description LIKE '%m1.large%' THEN 'm1.large'
                               WHEN line_item_line_item_description LIKE '%m1.xlarge%' THEN 'm1.xlarge'
                               ELSE 'm1.error'
                          END AS instancetype,
                          product_tenancy AS tenancy,
                          product_operating_system AS operatingsystem,
                          CAST(line_item_usage_amount AS double) AS usageamount,
                          CASE WHEN line_item_line_item_type = 'DiscountedUsage'
                               THEN CAST(line_item_usage_amount AS double) ELSE 0
                          END AS reservedamount
                     FROM {database}.{table}
                    WHERE product_operation = 'RunInstances'
                      AND line_item_availability_zone != ''
                      AND line_item_availability_zone NOT LIKE '%-wlz-%'
                      AND product_tenancy = 'Shared'
              )
            SELECT usagestartdate, usageaccountid, availabilityzone,
                   instancetype, tenancy, operatingsystem,
                   SUM(usageamount) AS instances,
                   SUM(reservedamount) AS reserved
              FROM preprocess
             WHERE usagestartdate >= cast('{start}' as timestamp)
               AND usagestartdate < cast('{end}' as timestamp)
             GROUP BY usagestartdate, usageaccountid, availabilityzone,
                      instancetype, tenancy, operatingsystem
             ORDER BY usagestartdate, usageaccountid, availabilityzone,
                      instancetype, tenancy, operatingsystem
        """.format(database=database, table=table_name,
                   start=starttime.isoformat(' '),
                   end=endtime.isoformat(' ')).split())
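        # In the CUR, line_item_line_item_type = 'DiscountedUsage' marks hours
        # covered by a Reserved Instance, so `reserved` sums RI-covered usage
        # while `instances` sums all usage.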
        query_id = utils.execute_athena_query(athena, staging, query)
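        # Athena writes the result set to the staging location as
        # <staging_prefix><query_id>.csv, so the full result can be pulled
        # straight from S3 instead of paging through get_query_results.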
        session.client('s3').download_file(
            staging_bucket, '{0}{1}.csv'.format(staging_prefix, query_id),
            cache_file)

    result = pd.read_csv(cache_file, parse_dates=['usagestartdate'])

    LOGGER.info("Loaded {} instance summary rows".format(len(result)))
    return result
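
The CASE expression in the query above mirrors this extraction: usage types such as 'USE2-BoxUsage:m5.large' carry the instance type after the colon, while very old m1 line items only mention it in the description. A small Python sketch of the same logic (the function name and sample strings are illustrative, not part of Ariel):

def extract_instance_type(usage_type, description):
    # Mirrors the SQL CASE: prefer the suffix after ':' in the usage type
    # (Presto's SPLIT is 1-indexed, hence [2] in SQL vs [1] here), then
    # fall back to scanning the description for legacy m1 sizes.
    if ':' in usage_type:
        return usage_type.split(':')[1]
    for size in ('m1.small', 'm1.medium', 'm1.large', 'm1.xlarge'):
        if size in description:
            return size
    return 'm1.error'

extract_instance_type('USE2-BoxUsage:m5.large', '')    # -> 'm5.large'
extract_instance_type('BoxUsage', '... m1.small ...')  # -> 'm1.small'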
Example 8
def load(config):

    # If a local cache file exists and is less than a day old, just use it.
    cache_file = '/tmp/cached-unlimited-summary.csv'
    caching = utils.get_config_value(config, 'DEFAULTS', 'CACHING', False)
    mtime = 0
    if caching:
        try:
            mtime = os.stat(cache_file).st_mtime
        except FileNotFoundError:
            mtime = 0

    if mtime > time.time() - 86400:
        LOGGER.info("Using existing cache file: " + cache_file)
    else:
        account, role = utils.get_master(config)
        region = utils.get_config_value(
            config, 'ATHENA', 'AWS_REGION',
            utils.get_config_value(config, 'DEFAULTS', 'AWS_REGION',
                                   os.environ.get('AWS_DEFAULT_REGION')))
        database = utils.get_config_value(config, 'ATHENA', 'CUR_DATABASE')
        table_name = utils.get_config_value(config, 'ATHENA', 'CUR_TABLE_NAME',
                                            'cur')
        staging = utils.get_config_value(
            config, 'ATHENA', 'STAGING',
            's3://aws-athena-query-results-{0}-{1}/ariel-cur-output/'.format(
                account, region))
        days = utils.get_config_value(config, 'ATHENA', 'DAYS', 28)
        offset = utils.get_config_value(config, 'ATHENA', 'OFFSET', 1)

        session = boto3.Session()
        proto, empty, staging_bucket, staging_prefix = staging.split('/', 3)

        # Assume role if needed
        if role is not None:
            session = utils.assume_role(session, role)

        # Connect to Athena
        athena = session.client('athena', region_name=region)

        # Validate database is usable
        status_id = utils.execute_athena_query(
            athena, staging,
            'SELECT status FROM ' + database + '.cost_and_usage_data_status')

        # Row 0 is header
        status = athena.get_query_results(
            QueryExecutionId=status_id
        )['ResultSet']['Rows'][1]['Data'][0]['VarCharValue']
        if status != 'READY':
            raise Exception('Athena database not in READY status')

        # Identify the start/end range for the query
        today = datetime.datetime.combine(datetime.datetime.today(),
                                          datetime.time.min)
        endtime = today - datetime.timedelta(days=offset)
        starttime = endtime - datetime.timedelta(days=days)

        # Download Instance and RI usage
        # meckstmd: 07/30/2019 - Something must have changed in the way AWS exposes CPU Credits.
        #  Each account with CPU Credits now has rows whose line_item_line_item_type is Tax and
        #  which carry no product_region or product_instance. Because those fields are empty,
        #  Ariel fails when inserting this report data into the unlimited_usage DB table, which
        #  does not allow nulls. The rows whose line_item_line_item_type is Usage hold the
        #  per-instance CPU credits for unlimited mode and do have product_region and
        #  product_instance, so the query below filters on non-empty values. The Tax rows were
        #  probably just added to this report, which is what broke Ariel.
        #  See https://github.com/yahoo/ariel/issues/5
        query = ' '.join("""
            SELECT line_item_usage_account_id AS accountid,
                   product_region AS region,
                   lower(product_instance) AS instancetypefamily,
                   sum(line_item_usage_amount) AS unlimitedusageamount,
                   sum(line_item_unblended_cost) AS unlimitedusagecost
              FROM {database}.{table}
             WHERE line_item_usage_type like '%CPUCredits:%'
               AND line_item_usage_start_date >= cast('{start}' as timestamp)
               AND line_item_usage_start_date < cast('{end}' as timestamp)
               AND product_region <> ''
               AND product_instance <> ''
             GROUP BY line_item_usage_account_id, product_region, lower(product_instance)
             ORDER BY line_item_usage_account_id, product_region, lower(product_instance)
        """.format(database=database, table=table_name,
                   start=starttime.isoformat(' '),
                   end=endtime.isoformat(' ')).split())
        query_id = utils.execute_athena_query(athena, staging, query)
        session.client('s3').download_file(
            staging_bucket, '{0}{1}.csv'.format(staging_prefix, query_id),
            cache_file)

    result = pd.read_csv(cache_file)
    if len(result) == 0:
        result = pd.DataFrame(columns=[
            'accountid', 'region', 'instancetypefamily',
            'unlimitedusageamount', 'unlimitedusagecost'
        ])

    result['accountid'] = result['accountid'].map('{:012}'.format)
    result['unlimitedusageamount'] = result['unlimitedusageamount'].map(
        '{:.2f}'.format)
    result['unlimitedusagecost'] = result['unlimitedusagecost'].map(
        '${:,.2f}'.format)
    LOGGER.info("Loaded {} unlimited rows".format(len(result)))
    return result
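
For reference, the final .map calls above convert the numeric columns into display strings. A small standalone example with made-up values:

import pandas as pd

df = pd.DataFrame({'accountid': [123456789],
                   'unlimitedusageamount': [1234.5],
                   'unlimitedusagecost': [56.789]})
# Zero-pad the account id to the 12 digits AWS uses.
df['accountid'] = df['accountid'].map('{:012}'.format)                         # '000123456789'
df['unlimitedusageamount'] = df['unlimitedusageamount'].map('{:.2f}'.format)   # '1234.50'
df['unlimitedusagecost'] = df['unlimitedusagecost'].map('${:,.2f}'.format)     # '$56.79'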