Example #1
def FetchItemData(task_id, benchmark, index, temp_dir):
    """Fetches the performance values (AVG & CI ranges) of tasks.

  Args:
    task_id: The list of item Ids to fetch dat for.
    benchmark: The benchmark these task are on (desktop/mobile).
    index: The index field of the data_frame
    temp_dir: The temp directory to store task data in.

  Returns:
    A data_frame containing the averages and confidence interval ranges.
  """
    output_directory = os.path.abspath(os.path.join(temp_dir, task_id))
    query = [
        SWARMING_PATH, 'collect', '-S', 'chromium-swarm.appspot.com',
        '--task-output-dir', output_directory, task_id
    ]
    try:
        subprocess.check_output(query)
    except Exception as e:
        print(e)

    result_file_path = os.path.join(output_directory, '0',
                                    'rendering.' + benchmark,
                                    'perf_results.csv')

    try:
        df = pandas.read_csv(result_file_path)
        df = df.loc[df['name'] == 'frame_times']
        df = df[['stories', 'avg', 'ci_095']]
        df['index'] = index
        return df
    except Exception:
        print('CSV results were not produced!')
Example #2
    def testAggregateAndUploadResults(self, time_ago, get_revision_results):
        state = [
            StateItem('a100', timestamp='2019-03-15', job1='completed'),
            StateItem('a200', timestamp='2019-03-16', job2='completed'),
            StateItem('a300', timestamp='2019-03-17', job3='failed'),
            StateItem('a400', timestamp='2019-03-18', job4='completed'),
            StateItem('a500', timestamp='2019-03-19', job5='completed'),
        ]

        def GetFakeResults(item):
            df = pd.DataFrame(index=[0])
            df['revision'] = item['revision']
            df['label'] = 'with_patch'
            df['benchmark'] = 'loading'
            df['name'] = 'Total:duration'
            df['timestamp'] = pd.Timestamp(item['timestamp'])
            df['count'] = 1 if item['revision'] != 'a400' else 0
            return df

        get_revision_results.side_effect = GetFakeResults
        time_ago.return_value = pd.Timestamp('2018-10-20')

        # Only process first few revisions.
        new_items, cached_df = pinboard.GetItemsToUpdate(state[:3])
        pinboard.AggregateAndUploadResults(new_items, cached_df)
        dataset_file = pinboard.CachedFilePath(pinboard.DATASET_CSV_FILE)
        df = pd.read_csv(dataset_file)
        self.assertEqual(set(df['revision']), set(['a100', 'a200']))
        self.assertTrue((df[df['reference']]['revision'] == 'a200').all())

        # Incrementally process the rest.
        new_items, cached_df = pinboard.GetItemsToUpdate(state)
        pinboard.AggregateAndUploadResults(new_items, cached_df)
        dataset_file = pinboard.CachedFilePath(pinboard.DATASET_CSV_FILE)
        df = pd.read_csv(dataset_file)
        self.assertEqual(set(df['revision']), set(['a100', 'a200', 'a500']))
        self.assertTrue((df[df['reference']]['revision'] == 'a500').all())

        # No new revisions. This should be a no-op.
        new_items, cached_df = pinboard.GetItemsToUpdate(state)
        pinboard.AggregateAndUploadResults(new_items, cached_df)

        self.assertEqual(get_revision_results.call_count, 4)
        # Uploads twice (the pkl and csv) on each call to aggregate results.
        self.assertEqual(self.upload_to_cloud.call_count, 2 * 2)
Example #3
def GetRevisionResults(item):
    """Aggregate the results from jobs that ran on a particular revision."""
    # First load pinpoint csv results into a DataFrame. The dtype arg is needed
    # to ensure that job_ids are always read as strings (even if some of them
    # look like large numbers).
    df = pd.read_csv(RevisionResultsFile(item), dtype={'job_id': str})
    assert df['change'].str.contains(item['revision']).all(), (
        'Not all results match the expected git revision')

    # Filter out and keep only the measurements and stories that we want.
    df = df[df['name'].isin(MEASUREMENTS)]
    df = df[df['story'].isin(ACTIVE_STORIES)]

    if not df.empty:
        # Aggregate over the results of individual stories.
        df = df.groupby(['change', 'name', 'benchmark',
                         'unit'])['mean'].agg(['mean', 'count']).reset_index()
    else:
        # Otherwise build a single row with an "empty" aggregate for this revision.
        # This is needed so we can remember in the cache that this revision has
        # been processed.
        df = pd.DataFrame(index=[0])
        df['change'] = item['revision']
        df['name'] = '(missing)'
        df['benchmark'] = '(missing)'
        df['unit'] = ''
        df['mean'] = np.nan
        df['count'] = 0

    # Convert time units from milliseconds to seconds. This is what Data Studio
    # dashboards expect.
    is_ms_unit = df['unit'].str.startswith('ms_')
    df.loc[is_ms_unit, 'mean'] = df['mean'] / 1000

    # Distinguish jobs that ran with/without the tested patch.
    df['label'] = df['change'].str.contains(r'\+').map({
        False: 'without_patch',
        True: 'with_patch'
    })

    # Add timestamp and revision information. We snap the date to noon and make
    # it naive (i.e. no timezone), so the dashboard doesn't get confused with
    # dates close to the end of day.
    date = item['timestamp'].split('T')[0] + 'T12:00:00'
    df['timestamp'] = pd.Timestamp(date)
    df['revision'] = item['revision']

    # Fake the timestamp of jobs without the patch to appear as if they ran a
    # year ago; this makes it easier to visualize and compare timeseries from
    # runs with/without the patch in Data Studio dashboards.
    df.loc[df['label'] == 'without_patch',
           'timestamp'] = (df['timestamp'] - pd.DateOffset(years=1))

    return df[[
        'revision', 'timestamp', 'label', 'benchmark', 'name', 'mean', 'count'
    ]]
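
# A minimal usage sketch (not from the original module; 'items' and the helper
# name are hypothetical): it assumes each item is a dict with 'revision' and
# 'timestamp' keys, as expected by GetRevisionResults, and concatenates the
# per-revision results into a single data frame.
def AggregateRevisionResults(items):
    """Hypothetical helper that combines results from several revisions."""
    return pd.concat([GetRevisionResults(item) for item in items],
                     ignore_index=True)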
Example #4
def Main(argv):
    parser = argparse.ArgumentParser(
        description=('Gathers the values of each metric and platform pair into'
                     ' a CSV file to be used for clustering stories.'))
    parser.add_argument('benchmark', type=str, help='Benchmark to be used.')
    parser.add_argument('--metrics',
                        type=str,
                        nargs='*',
                        help='List of metrics to use')
    parser.add_argument('--platforms',
                        type=str,
                        nargs='*',
                        help='List of platforms to use')
    parser.add_argument(
        '--testcases-path',
        type=str,
        help=('Path to the file containing a list of all test_cases in the '
              'benchmark that need to be clustered.'))
    parser.add_argument('--days',
                        default='180',
                        help='Number of days to gather data about.')
    parser.add_argument('--output-path',
                        type=str,
                        help='Output file',
                        default='//tmp/story_clustering/clusters.json')
    parser.add_argument('--max-cluster-count',
                        default='10',
                        help='Maximum number of clusters to produce.')
    parser.add_argument('--min-cluster-size',
                        default='2',
                        help=('Minimum number of members in a cluster for it'
                              ' to be considered valid.'))
    parser.add_argument(
        '--rolling-window',
        default='1',
        help=('Number of samples to average over when computing the moving'
              ' average.'))
    parser.add_argument('--normalize',
                        default=False,
                        help='Normalize timeseries to calculate similarity',
                        action='store_true')
    args = parser.parse_args(argv[1:])

    temp_dir = tempfile.mkdtemp('telemetry')
    startup_timeseries = os.path.join(temp_dir, 'startup_timeseries.json')
    soundwave_output_path = os.path.join(temp_dir, 'data.csv')
    soundwave_path = os.path.join(TOOLS_PERF_PATH, 'soundwave')

    try:
        output_dir = os.path.dirname(args.output_path)
        clusters_json = {}

        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        # Create the JSON input file needed by soundwave.
        create_soundwave_input.CreateInput(test_suite=args.benchmark,
                                           platforms=args.platforms,
                                           metrics=args.metrics,
                                           test_cases_path=args.testcases_path,
                                           output_dir=startup_timeseries)

        subprocess.call([
            soundwave_path, '-d', args.days, 'timeseries', '-i',
            startup_timeseries, '--output-csv', soundwave_output_path
        ])

        # Processing the data.
        dataframe = pandas.read_csv(soundwave_output_path)
        dataframe_per_metric = dataframe.groupby(dataframe['measurement'])
        for metric_name, all_bots in list(dataframe_per_metric):
            clusters_json[metric_name] = []

            distance_matrix = CalculateDistances(all_bots_dataframe=all_bots,
                                                 bots=dataframe['bot'],
                                                 rolling_window=int(
                                                     args.rolling_window),
                                                 metric_name=metric_name,
                                                 normalize=args.normalize)

            clusters, coverage = cluster_stories.RunHierarchicalClustering(
                distance_matrix,
                max_cluster_count=int(args.max_cluster_count),
                min_cluster_size=int(args.min_cluster_size),
            )
            print()
            print(metric_name, ':')
            print(format(coverage * 100.0, '.1f'), 'percent coverage.')
            print('Stories are grouped into', len(clusters), 'clusters.')
            print('representatives:')
            for cluster in clusters:
                print(cluster.GetRepresentative())
            print()

            for cluster in clusters:
                clusters_json[metric_name].append(cluster.AsDict())

        with open(args.output_path, 'w') as outfile:
            json.dump(clusters_json,
                      outfile,
                      separators=(',', ': '),
                      indent=4,
                      sort_keys=True)

    except Exception:
        logging.exception('The following exception may have prevented the code'
                          ' from clustering stories.')
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
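
# A conventional entry-point sketch (an assumption; the original script may
# already define one elsewhere): it simply forwards sys.argv to Main so the
# tool can be run directly from the command line.
if __name__ == '__main__':
    import sys
    sys.exit(Main(sys.argv))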