def testGetLastCommitOfDate_failed(self, find_commit):
  commit_before = ('2a66bac4', '2019-03-17T23:50:16-07:00')
  find_commit.side_effect = [None, commit_before]
  date = pd.Timestamp('2019-03-17 04:01:01', tz=pinboard.TZ)
  with self.assertRaises(ValueError):
    pinboard.GetLastCommitOfDate(date)
  cutoff_date = pd.Timestamp('2019-03-18 00:00:00', tz=pinboard.TZ)
  find_commit.assert_has_calls([mock.call(after_date=cutoff_date)])
def testGetLastCommitOfDate_simple(self, find_commit):
  commit_before = ('2a66bac4', '2019-03-17T23:50:16-07:00')
  commit_after = ('5aefdb31', '2019-03-18T02:41:58-07:00')
  find_commit.side_effect = [commit_after, commit_before]
  date = pd.Timestamp('2019-03-17 04:01:01', tz=pinboard.TZ)
  return_value = pinboard.GetLastCommitOfDate(date)
  cutoff_date = pd.Timestamp('2019-03-18 00:00:00', tz=pinboard.TZ)
  find_commit.assert_has_calls([
      mock.call(after_date=cutoff_date),
      mock.call(before_date=cutoff_date)])
  self.assertEqual(return_value, commit_before)
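# A minimal sketch of the function the two tests above exercise. This is
# reconstructed from the test expectations, not copied from the actual
# implementation: GetLastCommitOfDate appears to first confirm that at least
# one commit landed *after* the end of the requested day (otherwise the day
# may not be over yet and it raises ValueError), then return the last commit
# landed *before* that cutoff.
def GetLastCommitOfDate(date):
  # Snap to midnight of the *following* day in the pinboard timezone.
  cutoff_date = date.normalize() + pd.DateOffset(days=1)
  if FindCommit(after_date=cutoff_date) is None:
    raise ValueError('Date %s may not be over yet' % date.date())
  return FindCommit(before_date=cutoff_date)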
def testGetRevisionResults_simple(self):
  item = StateItem('2a66ba', timestamp='2019-03-17T23:50:16-07:00')
  csv = [
      'change,benchmark,story,name,unit,mean\n',
      '2a66ba,loading,story1,Total:duration,ms_smallerIsBetter,300.0\n',
      '2a66ba,loading,story2,Total:duration,ms_smallerIsBetter,400.0\n',
      '2a66ba+patch,loading,story1,Total:duration,ms_smallerIsBetter,100.0\n',
      '2a66ba+patch,loading,story2,Total:duration,ms_smallerIsBetter,200.0\n',
      '2a66ba,loading,story1,Other:metric,count_smallerIsBetter,1.0\n'
  ]
  expected_results = [
      ('without_patch', 0.35, '2018-03-17T12:00:00'),
      ('with_patch', 0.15, '2019-03-17T12:00:00'),
  ]
  filename = pinboard.RevisionResultsFile(item)
  with open(filename, 'w') as f:
    f.writelines(csv)
  with mock.patch('cli_tools.pinboard.pinboard.ACTIVE_STORIES',
                  new=['story1', 'story2']):
    df = pinboard.GetRevisionResults(item)
  self.assertEqual(len(df.index), 2)  # Only two rows of output.
  self.assertTrue((df['revision'] == '2a66ba').all())
  self.assertTrue((df['benchmark'] == 'loading').all())
  self.assertTrue((df['name'] == 'Total:duration').all())
  self.assertTrue((df['count'] == 2).all())
  df = df.set_index('label', verify_integrity=True)
  for label, value, timestamp in expected_results:
    self.assertEqual(df.loc[label, 'mean'], value)
    self.assertEqual(df.loc[label, 'timestamp'], pd.Timestamp(timestamp))
def Main():
  SetUpLogging(level=logging.INFO)
  actions = ('start', 'collect', 'upload')
  parser = argparse.ArgumentParser()
  parser.add_argument(
      'actions', metavar='ACTION', nargs='+', choices=actions + ('auto',),
      help=("select action to perform: 'start' pinpoint jobs, 'collect' job "
            "results, 'upload' aggregated data, or 'auto' to do all in "
            "sequence."))
  parser.add_argument(
      '--date', type=lambda s: pd.Timestamp(s, tz=TZ), default=Yesterday(),
      help=('Run jobs for the last commit landed on the given date (assuming '
            'MTV time). Defaults to the last commit landed yesterday.'))
  args = parser.parse_args()
  if 'auto' in args.actions:
    logging.info('=== auto run for %s ===', args.date)
    args.actions = actions
  state = LoadJobsState()
  try:
    if 'start' in args.actions:
      StartPinpointJobs(state, args.date)
    if 'collect' in args.actions:
      CollectPinpointResults(state)
  finally:
    UpdateJobsState(state)
  if 'upload' in args.actions:
    AggregateAndUploadResults(state)
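# Example command lines for the parser above (illustrative; the exact entry
# point path depends on how the tool is invoked in your checkout):
#
#   pinboard.py auto                      # start, collect, upload for yesterday
#   pinboard.py start --date 2019-03-17   # only start jobs for a given date
#   pinboard.py collect upload            # collect pending results, then upload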
def testFindCommit_notFound(self):
  self.subprocess.check_output.return_value = ''
  date = pd.Timestamp('2019-03-18T00:00:00', tz=pinboard.TZ)
  return_value = pinboard.FindCommit(after_date=date)
  self.subprocess.check_output.assert_called_once_with(
      ['git', 'log', '--max-count', '1', '--format=format:%H:%ct',
       '--after', '2019-03-18T00:00:00-07:00', 'origin/master'],
      cwd=pinboard.TOOLS_PERF_DIR)
  self.assertIsNone(return_value)
def GetRevisionResults(item):
  """Aggregate the results from jobs that ran on a particular revision."""
  # First load pinpoint csv results into a DataFrame. The dtype arg is needed
  # to ensure that job_id's are always read as strings (even if some of them
  # look like large numbers).
  df = pd.read_csv(RevisionResultsFile(item), dtype={'job_id': str})
  assert df['change'].str.contains(item['revision']).all(), (
      'Not all results match the expected git revision')

  # Keep only the measurements and stories that we want.
  df = df[df['name'].isin(MEASUREMENTS)]
  df = df[df['story'].isin(ACTIVE_STORIES)]

  if not df.empty:
    # Aggregate over the results of individual stories.
    df = df.groupby(['change', 'name', 'benchmark',
                     'unit'])['mean'].agg(['mean', 'count']).reset_index()
  else:
    # Otherwise build a single row with an "empty" aggregate for this revision.
    # This is needed so we can remember in the cache that this revision has
    # been processed.
    df = pd.DataFrame(index=[0])
    df['change'] = item['revision']
    df['name'] = '(missing)'
    df['benchmark'] = '(missing)'
    df['unit'] = ''
    df['mean'] = np.nan
    df['count'] = 0

  # Convert time units from milliseconds to seconds. This is what Data Studio
  # dashboards expect.
  is_ms_unit = df['unit'].str.startswith('ms_')
  df.loc[is_ms_unit, 'mean'] = df['mean'] / 1000

  # Distinguish jobs that ran with/without the tested patch.
  df['label'] = df['change'].str.contains(r'\+').map(
      {False: 'without_patch', True: 'with_patch'})

  # Add timestamp and revision information. We snap the date to noon and make
  # it naive (i.e. no timezone), so the dashboard doesn't get confused with
  # dates close to the end of day.
  date = item['timestamp'].split('T')[0] + 'T12:00:00'
  df['timestamp'] = pd.Timestamp(date)
  df['revision'] = item['revision']

  # Fake the timestamp of jobs without the patch to appear as if they ran a
  # year ago; this makes it easier to visualize and compare timeseries from
  # runs with/without the patch in Data Studio dashboards.
  df.loc[df['label'] == 'without_patch',
         'timestamp'] = (df['timestamp'] - pd.DateOffset(years=1))

  return df[['revision', 'timestamp', 'label', 'benchmark', 'name',
             'mean', 'count']]
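# A small self-contained illustration (toy data, not part of the original
# module) of two pandas idioms used above: index-aligned assignment for the
# ms -> s conversion, and str.contains(r'\+') to split changes like '2a66ba'
# vs '2a66ba+patch' into with/without patch labels.
def _IllustrateAggregationIdioms():
  df = pd.DataFrame({
      'change': ['2a66ba', '2a66ba+patch'],
      'unit': ['ms_smallerIsBetter', 'count_smallerIsBetter'],
      'mean': [350.0, 2.0],
  })
  # Only rows with a ms_ unit are rescaled; the count row keeps its value.
  is_ms_unit = df['unit'].str.startswith('ms_')
  df.loc[is_ms_unit, 'mean'] = df['mean'] / 1000
  # Changes containing '+' ran with the patch applied.
  df['label'] = df['change'].str.contains(r'\+').map(
      {False: 'without_patch', True: 'with_patch'})
  return df  # mean == [0.35, 2.0]; label == ['without_patch', 'with_patch']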
def testFindCommit_simple(self):
  self.subprocess.check_output.return_value = '2a66bac4:1552891816\n'
  date = pd.Timestamp('2019-03-18T00:00:00', tz=pinboard.TZ)
  revision, timestamp = pinboard.FindCommit(before_date=date)
  self.subprocess.check_output.assert_called_once_with(
      ['git', 'log', '--max-count', '1', '--format=format:%H:%ct',
       '--before', '2019-03-18T00:00:00-07:00', 'origin/master'],
      cwd=pinboard.TOOLS_PERF_DIR)
  self.assertEqual(revision, '2a66bac4')
  self.assertEqual(timestamp, '2019-03-17T23:50:16-07:00')
def FindCommit(before_date=None, after_date=None):
  """Find the latest commit, with optional before/after date constraints."""
  cmd = ['git', 'log', '--max-count', '1', '--format=format:%H:%ct']
  if before_date is not None:
    cmd.extend(['--before', before_date.isoformat()])
  if after_date is not None:
    cmd.extend(['--after', after_date.isoformat()])
  cmd.append('origin/master')
  line = subprocess.check_output(cmd, cwd=TOOLS_PERF_DIR).strip()
  if line:
    revision, commit_time = line.split(':')
    commit_time = pd.Timestamp(int(commit_time), unit='s', tz=TZ).isoformat()
    return revision, commit_time
  else:
    return None
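# Hypothetical helper, not part of the original module: shows how a raw
# 'git log --format=format:%H:%ct' line such as '2a66bac4:1552891816' maps to
# the (revision, isoformat timestamp) pair returned by FindCommit above. The
# tz default is an assumption; the tests only show -07:00 (MTV time) offsets.
def _ParseCommitLine(line, tz='America/Los_Angeles'):
  revision, commit_time = line.strip().split(':')
  return revision, pd.Timestamp(int(commit_time), unit='s', tz=tz).isoformat()

# _ParseCommitLine('2a66bac4:1552891816\n')
# => ('2a66bac4', '2019-03-17T23:50:16-07:00')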
def testAggregateAndUploadResults(self, time_ago, get_revision_results):
  state = [
      StateItem('a100', timestamp='2019-03-15', job1='completed'),
      StateItem('a200', timestamp='2019-03-16', job2='completed'),
      StateItem('a300', timestamp='2019-03-17', job3='failed'),
      StateItem('a400', timestamp='2019-03-18', job4='completed'),
      StateItem('a500', timestamp='2019-03-19', job5='completed'),
  ]

  def GetFakeResults(item):
    df = pd.DataFrame(index=[0])
    df['revision'] = item['revision']
    df['label'] = 'with_patch'
    df['benchmark'] = 'loading'
    df['name'] = 'Total:duration'
    df['timestamp'] = pd.Timestamp(item['timestamp'])
    df['count'] = 1 if item['revision'] != 'a400' else 0
    return df

  get_revision_results.side_effect = GetFakeResults
  time_ago.return_value = pd.Timestamp('2018-10-20')

  # Only process first few revisions.
  new_items, cached_df = pinboard.GetItemsToUpdate(state[:3])
  pinboard.AggregateAndUploadResults(new_items, cached_df)
  dataset_file = pinboard.CachedFilePath(pinboard.DATASET_CSV_FILE)
  df = pd.read_csv(dataset_file)
  self.assertEqual(set(df['revision']), set(['a100', 'a200']))
  self.assertTrue((df[df['reference']]['revision'] == 'a200').all())

  # Incrementally process the rest.
  new_items, cached_df = pinboard.GetItemsToUpdate(state)
  pinboard.AggregateAndUploadResults(new_items, cached_df)
  dataset_file = pinboard.CachedFilePath(pinboard.DATASET_CSV_FILE)
  df = pd.read_csv(dataset_file)
  self.assertEqual(set(df['revision']), set(['a100', 'a200', 'a500']))
  self.assertTrue((df[df['reference']]['revision'] == 'a500').all())

  # No new revisions. This should be a no-op.
  new_items, cached_df = pinboard.GetItemsToUpdate(state)
  pinboard.AggregateAndUploadResults(new_items, cached_df)
  self.assertEqual(get_revision_results.call_count, 4)

  # Uploads twice (the pkl and csv) on each call to aggregate results.
  self.assertEqual(self.upload_to_cloud.call_count, 2 * 2)