def main():
    """
    Automation script for running scaling tests for Toil Recompute
    """
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--config', required=True, help='Configuration file for run. Must be in shared_dir')
    parser.add_argument('-c', '--cluster_size', required=True, help='Number of workers desired in the cluster.')
    parser.add_argument('-s', '--sample_size', required=True, type=float, help='Size of the sample desired in TB.')
    parser.add_argument('-t', '--instance_type', default='c3.8xlarge', help='e.g. m4.large or c3.8xlarge.')
    parser.add_argument('-n', '--cluster_name', required=True, help='Name of cluster.')
    parser.add_argument('--namespace', default='jtvivian', help='CGCloud NameSpace')
    parser.add_argument('--spot_price', default=0.60, type=float, help='Spot price to bid for instances.')
    parser.add_argument('-b', '--bucket', default='tcga-data-cgl-recompute', help='Bucket where data is.')
    parser.add_argument('-d', '--shared_dir', required=True,
                        help='Full path to directory with: pipeline script, launch script, config, and master key.')
    params = parser.parse_args()

    # Run sequence
    start = time.time()
    # Get number of samples from config
    with open(params.config, 'r') as f:
        num_samples = len(f.readlines())
    # Launch cluster and pipeline
    uuid = fix_launch(params)
    launch_cluster(params)
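    # Record worker instance IDs so metrics and idle-kill alarms can target them after the run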
    ids = get_instance_ids(filter_cluster=params.cluster_name, filter_name=params.namespace + '_toil-worker')
    # launch_pipeline blocks until all workers are idle
    launch_pipeline(params)
    stop = time.time()
    # Collect metrics from cluster
    list_of_metrics = ['AWS/EC2/CPUUtilization',
                       'CGCloud/MemUsage',
                       'CGCloud/DiskUsage_mnt_ephemeral',
                       'CGCloud/DiskUsage_root',
                       'AWS/EC2/NetworkIn',
                       'AWS/EC2/NetworkOut',
                       'AWS/EC2/DiskWriteOps',
                       'AWS/EC2/DiskReadOps']
    collect_metrics(ids, list_of_metrics, start, stop, uuid=uuid)
    # Apply "Insta-kill" alarm to every worker
    map(apply_alarm_to_instance, ids)
    # Kill leader
    logging.info('Killing Leader')
    leader_id = get_instance_ids(filter_cluster=params.cluster_name, filter_name=params.namespace + '_toil-leader')[0]
    apply_alarm_to_instance(leader_id, threshold=5)
    # Generate Run Report
    avail_zone = get_avail_zone(filter_cluster=params.cluster_name, filter_name=params.namespace + '_toil-worker')[0]
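    # calculate_cost returns per-instance figures; the totals below scale them by cluster size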
    total_cost, avg_hourly_cost = calculate_cost(params.instance_type, ids[0], avail_zone)
    # Report values
    output = ['UUID: {}'.format(uuid),
              'Number of Samples: {}'.format(num_samples),
              'Number of Nodes: {}'.format(params.cluster_size),
              'Cluster Name: {}'.format(params.cluster_name),
              'Source Bucket: {}'.format(params.bucket),
              'Average Hourly Cost: ${}'.format(avg_hourly_cost),
              'Cost per Instance: ${}'.format(total_cost),
              'Availability Zone: {}'.format(avail_zone),
              'Start Time: {}'.format(datetime.isoformat(datetime.utcfromtimestamp(start))),
              'Stop Time: {}'.format(datetime.isoformat(datetime.utcfromtimestamp(stop))),
              'Total Cost of Cluster: ${}'.format(float(total_cost) * int(params.cluster_size)),
              'Cost Per Sample: ${}'.format((float(total_cost) * int(params.cluster_size) / int(num_samples)))]
    with open(os.path.join(str(uuid) + '_{}'.format(str(datetime.utcnow()).split()[0]), 'run_report.txt'), 'w') as f:
        f.write('\n'.join(output))
    # You're done!
    logging.info('\n\nScaling Test Complete.')
def collect_metrics(instance_ids, list_of_metrics, start, stop, uuid=str(uuid4())):
    """
    Collect metrics from AWS instances.  AWS limits data collection to 1,440 points or 5 days if
    collected in intervals of 5 minutes.  This metric collection will "page" the results in intervals
    of 4 days (to be safe) in order to collect all the desired metrics.

    instance_ids: list          List of instance IDs
    list_of_metrics: list       List of metric names
    start: float                time.time() of start point
    stop: float                 time.time() of stop point
    uuid: str                   UUID of metric collection
    """
    # One list of per-window averages per instance, keyed by metric name
    metrics = {metric: [] for metric in list_of_metrics}

    if instance_ids:
        # Iterate over a copy so unreachable instances can be dropped from instance_ids below
        for instance_id in instance_ids[:]:
            for metric in list_of_metrics:
                averages = []
                try:
                    s = start
                    while s < stop:
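                        # Page requests in 4-day windows (4 * 24 * 3600 seconds) to stay under the CloudWatch datapoint limit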
                        e = s + (4 * 24 * 3600)
                        aws_start = datetime.utcfromtimestamp(s)
                        aws_stop = datetime.utcfromtimestamp(e)
                        met_object = get_metric(metric, instance_id, aws_start, aws_stop)
                        averages.extend([x['Average'] for x in get_datapoints(met_object)])
                        s = e
                    if averages:
                        metrics[metric].append(averages)
                        logging.info('# of Datapoints for metric {} is {}'.format(metric, len(averages)))
                except RuntimeError:
                    if instance_id in instance_ids:
                        instance_ids.remove(instance_id)
        # Remove metrics if no datapoints were collected
        metrics = dict((k, v) for k, v in metrics.iteritems() if v)
        # Save CSV of data
        mkdir_p('{}_{}'.format(uuid, str(datetime.utcnow()).split()[0]))
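        # One CSV per metric, named after the metric's final path component (e.g. CPUUtilization.csv)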
        for metric in metrics:
            with open('{}_{}/{}.csv'.format(uuid, str(datetime.utcnow()).split()[0], metric.rsplit('/', 1)[1]), 'wb') as f:
                writer = csv.writer(f)
                writer.writerows(metrics[metric])
def main():
    """
    Script to collect aggregate metrics from a collection of instances.
    """
    # parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    # parser.add_argument('-c', '--cluster_name', default=None, help='Name of cluster to filter by.')
    # parser.add_argument('-n', '--instance_name', default=None, help='Name of instance to filter by.')
    # params = parser.parse_args()
    #
    # ids = get_instance_ids(filter_cluster=params.cluster_name, filter_name=params.instance_name)
    ids = get_instance_ids(filter_cluster='scaling-gtex-400', filter_name='jtvivian_toil-worker')
    logging.info("IDs being collected: {}".format(ids))
    list_of_metrics = ['AWS/EC2/CPUUtilization',
                       'CGCloud/MemUsage',
                       'CGCloud/DiskUsage_mnt_ephemeral',
                       'CGCloud/DiskUsage_root',
                       'AWS/EC2/NetworkIn',
                       'AWS/EC2/NetworkOut',
                       'AWS/EC2/DiskWriteOps',
                       'AWS/EC2/DiskReadOps']
    collect_metrics(ids, list_of_metrics, start=1454355507.550286, stop=1454405909.397642)
def collect_realtime_metrics(params, threshold=0.5):
    """
    Collect metrics from AWS instances in 1 hour intervals. Instances that have gone idle (below threshold CPU value)
    are terminated.

    :type params: argparse.Namespace
    :param threshold: CPU utilization (percent) below which a worker is considered idle
    """
    list_of_metrics = ['AWS/EC2/CPUUtilization',
                       'CGCloud/MemUsage',
                       'CGCloud/DiskUsage_mnt_ephemeral',
                       'CGCloud/DiskUsage_root',
                       'AWS/EC2/NetworkIn',
                       'AWS/EC2/NetworkOut',
                       'AWS/EC2/DiskWriteOps',
                       'AWS/EC2/DiskReadOps']

    # Create output directory
    uuid = str(uuid4())
    date = str(datetime.utcnow().date())
    dir_path = '{}_{}_{}'.format(params.cluster_name, uuid, date)
    mkdir_p(dir_path)

    start = time.time() - metric_start_time_margin

    # Create connections to EC2, CloudWatch, and SimpleDB (the jobstore's files domain)
    region = region_of_zone(params.zone)
    conn = boto.ec2.connect_to_region(region)
    cw = boto.ec2.cloudwatch.connect_to_region(region)
    sdbconn = boto.sdb.connect_to_region(region)
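    # The jobstore's SimpleDB files domain also stores the desired cluster size read and updated below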
    domain = sdbconn.get_domain('{0}--files'.format(params.jobstore))

    # Create initial variables
    start = datetime.utcfromtimestamp(start)
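    # Each CloudWatch datapoint is kept as (instance_id, value, timestamp) and later written out as a row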
    DataPoint = namedtuple('datapoint', ['instance_id', 'value', 'timestamp'])
    timestamps = {}
    # Begin loop
    log.info('Metric collection has started. '
             'Waiting {} seconds before initial collection.'.format(metric_initial_wait_period_in_seconds))
    time.sleep(metric_initial_wait_period_in_seconds)

    while True:
        # FIXME: why doesn't filter_cluster=params.cluster_name work?
        ids = get_instance_ids(filter_name=params.namespace.strip('/') + '_toil-worker')
        if not ids:
            break
        metric_collection_time = time.time()
        try:
            for instance_id in tqdm(ids):
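                # Assume the worker is busy until the CPU-utilization check below flags it as idle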
                idle = False
                for metric in list_of_metrics:
                    datapoints = []
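                    # Query from the last timestamp seen for this instance (or the overall start) up to now plus metric_endtime_margin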
                    aws_start = timestamps.get(instance_id, start)
                    aws_stop = datetime.utcnow() + metric_endtime_margin
                    metric_object = get_metric(cw, metric, instance_id, aws_start, aws_stop)
                    for datum in metric_object:
                        d = DataPoint(instance_id=instance_id, value=datum['Average'], timestamp=datum['Timestamp'])
                        datapoints.append(d)
                    # Save data in local directory
                    if datapoints:
                        datapoints = sorted(datapoints, key=lambda x: x.timestamp)
                        with open(os.path.join(dir_path, '{}.csv'.format(os.path.basename(metric))), 'a') as f:
                            writer = csv.writer(f, delimiter='\t')
                            writer.writerows(datapoints)
                    # Check if instance's CPU has been idle the last 30 minutes.
                    if metric == 'AWS/EC2/CPUUtilization':
                        averages = [x.value for x in sorted(datapoints, key=lambda x: x.timestamp)][-6:]
                        # If there is at least 30 minutes of data points and max is below threshold, flag to be killed.
                        if len(averages) == 6:
                            if max(averages) < threshold:
                                idle = True
                                log.info('Flagging {} to be killed. '
                                         'Max CPU {} for last 30 minutes.'.format(instance_id, max(averages)))
                            else:
                                log.info('Max CPU for {} was {} for last 30 minutes.'.format(instance_id, max(averages)))

                # Kill instance if idle and cluster is too large
                if idle:
                    try:
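                        # cluster_size_lock (defined elsewhere) serializes the size check and any termination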
                        with cluster_size_lock:
                            cluster_size = get_cluster_size(params.cluster_name)
                            desired_cluster_size = get_desired_cluster_size(domain)
                            if cluster_size > desired_cluster_size:
                                log.info('Cluster size (%d) is larger than requested (%d). '
                                         'Terminating idle instance %s.',
                                         cluster_size,
                                         desired_cluster_size,
                                         instance_id)
                                
                                cmd = ['cgcloud',
                                       'terminate',
                                       '--instance-id', instance_id,
                                       '--cluster-name', params.cluster_name,
                                       'toil']

                                try:
                                    check_call(cmd)
                                    log.info("Successfully terminated instance via %s.",
                                             " ".join(cmd))
                                except Exception:
                                    log.error("Terminating instance with %s failed.",
                                              " ".join(cmd))
                                    raise

                                update_cluster_size(domain, cluster_size - 1)
                    except (EC2ResponseError, BotoServerError) as e:
                        log.info('Error terminating instance: {}\n{}'.format(instance_id, e))
                # Set start point to be last collected timestamp
                timestamps[instance_id] = max(x.timestamp for x in datapoints) if datapoints else start
        except BotoServerError:
            log.error('Giving up trying to fetch metric for this interval')

        # Sleep
        collection_time = time.time() - metric_collection_time
        log.info('Metric collection took: {} seconds. '
                 'Waiting {} seconds.'.format(collection_time, metric_collection_interval_in_seconds))
        wait_time = metric_collection_interval_in_seconds - collection_time
        if wait_time < 0:
            log.warning('Collection time exceeded metric collection interval by: %i', -wait_time)
        else:
            time.sleep(wait_time)

    log.info('Metric collection has finished.')
def collect_realtime_metrics(params, threshold=0.5, region='us-west-2'):
    """
    Collect metrics from AWS instances in 1 hour intervals.
    Instances that have gone idle (below threshold CPU value) are terminated.

    params: argparse.Namespace      Input arguments
    threshold: float                CPU utilization (percent) below which an instance is considered idle
    region: str                     AWS region metrics are being collected from
    """
    list_of_metrics = ['AWS/EC2/CPUUtilization',
                       'CGCloud/MemUsage',
                       'CGCloud/DiskUsage_mnt_ephemeral',
                       'CGCloud/DiskUsage_root',
                       'AWS/EC2/NetworkIn',
                       'AWS/EC2/NetworkOut',
                       'AWS/EC2/DiskWriteOps',
                       'AWS/EC2/DiskReadOps']

    # Create output directory
    uuid = str(uuid4())
    date = str(datetime.utcnow().date())
    dir_path = '{}_{}_{}'.format(params.cluster_name, uuid, date)
    mkdir_p(dir_path)

    start = time.time() - metric_start_time_margin

    # Create connections to ec2 and cloudwatch
    conn = boto.ec2.connect_to_region(region)
    cw = boto.ec2.cloudwatch.connect_to_region(region)
    # Create initial variables
    start = datetime.utcfromtimestamp(start)
    DataPoint = namedtuple('datapoint', ['instance_id', 'value', 'timestamp'])
    timestamps = {}
    # Begin loop
    log.info('Metric collection has started. '
             'Waiting {} seconds before initial collection.'.format(metric_initial_wait_period_in_seconds))
    time.sleep(metric_initial_wait_period_in_seconds)
    while True:
        ids = get_instance_ids(filter_cluster=params.cluster_name, filter_name=params.namespace + '_toil-worker')
        if not ids:
            break
        metric_collection_time = time.time()
        try:
            for instance_id in tqdm(ids):
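                # kill_instance is set by the CPU-utilization check below and acted on after all metrics are collected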
                kill_instance = False
                for metric in list_of_metrics:
                    datapoints = []
                    aws_start = timestamps.get(instance_id, start)
                    aws_stop = datetime.utcnow() + metric_endtime_margin
                    metric_object = get_metric(cw, metric, instance_id, aws_start, aws_stop)
                    for datum in metric_object:
                        d = DataPoint(instance_id=instance_id, value=datum['Average'], timestamp=datum['Timestamp'])
                        datapoints.append(d)
                    # Save data in local directory
                    if datapoints:
                        datapoints = sorted(datapoints, key=lambda x: x.timestamp)
                        with open(os.path.join(dir_path, '{}.tsv'.format(os.path.basename(metric))), 'a') as f:
                            writer = csv.writer(f, delimiter='\t')
                            writer.writerows(datapoints)
                    # Check if instance's CPU has been idle the last 20 minutes.
                    if metric == 'AWS/EC2/CPUUtilization':
                        averages = [x.value for x in sorted(datapoints, key=lambda x: x.timestamp)][-4:]
                        # If there is at least 20 minutes of data points and max is below threshold, flag to be killed.
                        if len(averages) == 4:
                            if max(averages) < threshold:
                                kill_instance = True
                                log.info('Flagging {} to be killed. '
                                         'Max CPU {} for last 20 minutes.'.format(instance_id, max(averages)))
                # Kill instance if idle
                if kill_instance:
                    try:
                        log.info('Terminating Instance: {}'.format(instance_id))
                        conn.terminate_instances(instance_ids=[instance_id])
                    except (EC2ResponseError, BotoServerError) as e:
                        log.info('Error terminating instance: {}\n{}'.format(instance_id, e))
                # Set start point to be last collected timestamp
                timestamps[instance_id] = max(x.timestamp for x in datapoints) if datapoints else start
        except BotoServerError:
            log.error('Giving up trying to fetch metric for this interval')
        # Sleep
        collection_time = time.time() - metric_collection_time
        log.info('Metric collection took: {} seconds. '
                 'Waiting {} seconds.'.format(collection_time, metric_collection_interval_in_seconds))
        wait_time = metric_collection_interval_in_seconds - collection_time
        if wait_time < 0:
            log.warning('Collection time exceeded metric collection interval by: %i', -wait_time)
        else:
            time.sleep(wait_time)
    log.info('Metric collection has finished.')