Train Stats

 ----------------------------------------------------------------------------------------
/                                                                                        \
| /$$$$$$$$                 /$$            /$$$$$$   /$$                 /$$             |
||__  $$__/                |__/           /$$__  $$ | $$                | $$             |
|   | $$  /$$$$$$  /$$$$$$  /$$ /$$$$$$$ | $$  \__//$$$$$$    /$$$$$$  /$$$$$$   /$$$$$$$|
|   | $$ /$$__  $$|____  $$| $$| $$__  $$|  $$$$$$|_  $$_/   |____  $$|_  $$_/  /$$_____/|
|   | $$| $$  \__/ /$$$$$$$| $$| $$  \ $$ \____  $$ | $$      /$$$$$$$  | $$   |  $$$$$$ |
|   | $$| $$      /$$__  $$| $$| $$  | $$ /$$  \ $$ | $$ /$$ /$$__  $$  | $$ /$$\____  $$|
|   | $$| $$     |  $$$$$$$| $$| $$  | $$|  $$$$$$/ |  $$$$/|  $$$$$$$  |  $$$$//$$$$$$$/|
|   |__/|__/      \_______/|__/|__/  |__/ \______/   \___/   \_______/   \___/ |_______/ |
|                                                                                        |
|                                                                                        |
|                                                                                        |
|                                      🚂 ¯\_(ツ)_/¯?                                     |
\                                                                                        /
 ----------------------------------------------------------------------------------------
    \     
     \     
      \      
           ___ ____
         ⎛   ⎛ ,----
          \  //==--'
     _//|,.·//==--'    ____________________________
    _OO≣=-  ︶ ᴹw ⎞_§ ______  ___\ ___\ ,\__ \/ __ \
   (∞)_, )  (     |  ______/__  \/ /__ / /_/ / /_/ /
     ¨--¨|| |- (  / ______\____/ \___/ \__^_/  .__/
         ««_/  «_/ jgs/bd808                /_/

In which I look at data from the past several hundred trains and pretend that I know how to do exploratory data analysis.

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sqlalchemy import create_engine

with open('data/TRAINS') as f:
    TRAINS = [x.strip() for x in f.readlines()]

engine = create_engine('sqlite:///data/train.db')
df = pd.read_sql('''
SELECT
    version,
    rollbacks,
    rollbacks_time,
    group2_delay_days,
    (group0_delay_days +
     group1_delay_days +
     group2_delay_days) as total_delay,
    total_time as train_total_time,
    (select count(*) from blocker b where b.train_id = t.id) as blockers,
    (select count(*) from blocker b where b.train_id = t.id and resolved = 1) as resolved_blockers,
    patches,
    (select max(time_in_review) from patch p where p.train_id = t.id) as max_time_in_review,
    (select max(comments) from patch where patch.train_id = t.id) as max_comments_per_patch,
    (select max(start_time - created) from patch p where p.train_id = t.id) as max_cycle_time
FROM train t
''', engine)

# Makes your data 538% better...I think
plt.style.use('fivethirtyeight')
df.head()
version rollbacks rollbacks_time group2_delay_days total_delay train_total_time blockers resolved_blockers patches max_time_in_review max_comments_per_patch max_cycle_time
0 1.37.0-wmf.1 0 0 0 0 178349 5 3 450 36809044.0 27.0 36952873.0
1 1.37.0-wmf.3 3 94493 0 1 219880 7 6 366 56122286.0 30.0 56562620.0
2 1.37.0-wmf.4 1 66812 1 3 263742 9 4 422 38820872.0 29.0 38982601.0
3 1.36.0-wmf.1 0 0 4 4 519622 1 1 566 47181045.0 31.0 47755190.0
4 1.36.0-wmf.2 4 389769 4 5 554704 7 1 273 110996452.0 33.0 111569626.0
df_corr = df.copy(deep=True)
df_corr.drop(['version'], axis=1, inplace=True)
fig, ax = plt.subplots(figsize=(10,10))         # Sample figsize in inches
sns.heatmap(df_corr.corr(), annot=True, cmap="YlGnBu", linewidths=0.3, annot_kws={"size": 8}, ax=ax)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.title('Correlation of train variables')
plt.show()

[figure: heatmap of correlation of train variables]

df.set_index('version')['blockers'].hist(figsize=(12, 10))
plt.xlabel("Blockers", labelpad=15)
plt.title("Blockers per Train", y=1.02, fontsize=22)

[figure: histogram of blockers per train]

df.patches.plot(linewidth=1)
plt.xticks(rotation=90)

[figure: line plot of patch count per train]

df[df['blockers'] > 10].sort_values(by='blockers', ascending=False)
version rollbacks rollbacks_time group2_delay_days total_delay train_total_time blockers resolved_blockers patches max_time_in_review max_comments_per_patch max_cycle_time
82 1.34.0-wmf.20 1 16897 5 5 600096 20 11 413 60583935.0 37.0 60715205.0
242 1.38.0-wmf.19 1 70250 0 1 176801 18 10 319 85391889.0 24.0 85778365.0
103 1.33.0-wmf.22 0 0 0 1 63844 17 11 391 107411197.0 45.0 107659715.0
77 1.34.0-wmf.14 2 412678 4 5 524983 16 7 646 73502579.0 34.0 73539481.0
136 1.31.0-wmf.20 2 134534 1 5 255075 14 12 822 66099647.0 75.0 67210868.0
210 1.30.0-wmf.2 2 595079 0 0 782668 14 12 462 67958577.0 56.0 68943029.0
241 1.38.0-wmf.18 0 0 0 0 171956 13 7 287 186407187.0 43.0 186862129.0
124 1.32.0-wmf.22 0 0 0 0 173055 12 7 824 64869761.0 44.0 65392036.0
259 1.39.0-wmf.10 5 57256 0 1 172544 12 5 206 89212513.0 22.0 89427805.0
57 1.35.0-wmf.31 3 365118 4 15 516489 11 7 427 72215548.0 70.0 72206585.0
76 1.34.0-wmf.13 2 14912 0 1 183853 11 8 471 53208155.0 23.0 53820019.0
187 1.28.0-wmf.21 1 88476 0 1 176082 11 8 514 60386455.0 93.0 61360331.0
block_df = pd.read_sql('''
SELECT
    version,
    group_blocked
FROM train t
JOIN blocker b ON t.id = b.train_id
''', engine)
block_df.head()
version group_blocked
0 1.37.0-wmf.7 -1
1 1.37.0-wmf.7 -1
2 1.37.0-wmf.12 2
3 1.37.0-wmf.12 1
4 1.37.0-wmf.12 1
block_df.group_blocked.unique()
array([-1,  2,  1,  0])
group_name_map = {
    -1: "Earlier",
    0: "Group0",
    1: "Group1",
    2: "Group2",
}
block_df['blocker_added'] = block_df.group_blocked.map(group_name_map)
block_df.head()
version group_blocked blocker_added
0 1.37.0-wmf.7 -1 Earlier
1 1.37.0-wmf.7 -1 Earlier
2 1.37.0-wmf.12 2 Group2
3 1.37.0-wmf.12 1 Group1
4 1.37.0-wmf.12 1 Group1
block_df.group_blocked.value_counts()
group_blocked
-1    557
 1    422
 0    349
 2    143
Name: count, dtype: int64
block_df.version
0        1.37.0-wmf.7
1        1.37.0-wmf.7
2       1.37.0-wmf.12
3       1.37.0-wmf.12
4       1.37.0-wmf.12
            ...      
1466     1.43.0-wmf.6
1467     1.43.0-wmf.6
1468     1.43.0-wmf.6
1469     1.43.0-wmf.7
1470     1.43.0-wmf.7
Name: version, Length: 1471, dtype: object
block_df.set_index('version')
block_df.sort_values('group_blocked', inplace=True)


fig = plt.figure(figsize=(16,6))
plt.grid(color='white', lw=0.5, axis='x')
n, bins, patches = plt.hist(block_df.blocker_added, bins=4, rwidth=0.95)

xticks = [(bins[idx+1] + value)/2 for idx, value in enumerate(bins[:-1])]
xticks_labels = [ "{:.2f}\nto\n{:.2f}".format(value, bins[idx+1]) for idx, value in enumerate(bins[:-1])]
plt.xticks(xticks, labels=["Before group0", "Group0", "Group1", "Group2"])

# remove y ticks
plt.yticks([])

# plot values on top of bars
for idx, value in enumerate(n):
    if value > 0:
        plt.text(xticks[idx], value+5, int(value), ha='center')

plt.title('Train Blockers by Group Where They Were Discovered', loc='left', pad=30)
plt.show()

[figure: train blockers by group where they were discovered]

patches = pd.read_sql('''
SELECT
    link,
    version,
    submitted,
    insertions as ins,
    (deletions*-1) as del
FROM patch p JOIN train t ON t.id = p.train_id
''', engine)
patches.describe()
submitted ins del
count 1.191340e+05 1.191340e+05 1.191340e+05
mean 1.592731e+09 2.941666e+02 -2.334408e+02
std 7.264151e+07 5.028140e+04 4.821069e+04
min 1.431572e+09 0.000000e+00 -1.661412e+07
25% 1.535077e+09 2.000000e+00 -1.800000e+01
50% 1.589385e+09 7.000000e+00 -4.000000e+00
75% 1.655997e+09 3.400000e+01 -1.000000e+00
max 1.716881e+09 1.728860e+07 0.000000e+00
patches['loc'] = patches['ins'] + patches['del']
patches.head()
link version submitted ins del loc
0 https://gerrit.wikimedia.org/r/#/q/ccbfcf28,n,z 1.37.0-wmf.1 1618945759 5 -1 4
1 https://gerrit.wikimedia.org/r/#/q/3302274f,n,z 1.37.0-wmf.1 1618878371 1156 -660 496
2 https://gerrit.wikimedia.org/r/#/q/8b5471b5,n,z 1.37.0-wmf.1 1618343309 976 -3 973
3 https://gerrit.wikimedia.org/r/#/q/a6abbb67,n,z 1.37.0-wmf.1 1618341075 8 -29 -21
4 https://gerrit.wikimedia.org/r/#/q/af916aad,n,z 1.37.0-wmf.1 1618300868 7 -5 2
patches['submitted'] = pd.to_datetime(patches['submitted'], unit='s')
patches.set_index('submitted', inplace=True)
out = patches.groupby(pd.Grouper(freq='M')).apply(lambda x: x)
out = out[out['link'] != 'https://gerrit.wikimedia.org/r/#/q/9a08dbab,n,z'] # The one patch that inserts 17.2M lines of code
out['ok'] = out['loc'].cumsum()

Cycle time/Lead time

Cycle time is the time from when a patch enters code review until it's in production. Lead time is the time from commit to production.
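
To make those definitions concrete, here's a minimal sketch (not a cell from the original notebook) of how the two metrics map onto the patch timestamps used in the query below: created is when the patch entered review, submitted is when it was merged, and start_time is when its train rolled out.

# Minimal sketch: cycle/lead time from patch timestamps (unix epoch seconds),
# mirroring the (start_time - created) and (start_time - submitted)
# expressions in the SQL query that follows.
def cycle_and_lead_time(created, submitted, start_time):
    cycle_time = start_time - created     # entered review -> in production
    lead_time = start_time - submitted    # merged -> in production
    return cycle_time, lead_time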

# GOAL
#         train     lead_time    cycle_time   Id
# 0    1.37.0-wmf.6    200   2000   u1234
# 1    1.37.0-wmf.6    123   2800   u1235

cycle = pd.read_sql('''
SELECT
    substr(version, 8) as version,
    datetime(start_time, 'unixepoch'),
    (start_time - created) as cycle_time,
    (start_time - submitted) as lead_time,
    datetime(created, 'unixepoch'),
    datetime(submitted, 'unixepoch'),
    link
FROM patch p JOIN train t ON t.id = p.train_id
WHERE (lead_time > 0 AND cycle_time > 0)
    AND (
        version = "%(version_one)s" OR
        version = "%(version_two)s" OR
        version = "%(version_three)s"
    )
''' % {
    'version_one': TRAINS[0],
    'version_two': TRAINS[1],
    'version_three': TRAINS[2],
}, engine)
cycle.head()
version datetime(start_time, 'unixepoch') cycle_time lead_time datetime(created, 'unixepoch') datetime(submitted, 'unixepoch') link
0 wmf.5 2024-05-14 03:01:41 8195933 7497 2024-02-09 06:22:48 2024-05-14 00:56:44 https://gerrit.wikimedia.org/r/999129
1 wmf.5 2024-05-14 03:01:41 20163 7409 2024-05-13 21:25:38 2024-05-14 00:58:12 https://gerrit.wikimedia.org/r/1030971
2 wmf.5 2024-05-14 03:01:41 21159 19285 2024-05-13 21:09:02 2024-05-13 21:40:16 https://gerrit.wikimedia.org/r/1031000
3 wmf.5 2024-05-14 03:01:41 28359 26929 2024-05-13 19:09:02 2024-05-13 19:32:52 https://gerrit.wikimedia.org/r/1030949
4 wmf.5 2024-05-14 03:01:41 182643 30033 2024-05-12 00:17:38 2024-05-13 18:41:08 https://gerrit.wikimedia.org/r/1030546
cycle.sort_values(by='lead_time', ascending=False).head()
version datetime(start_time, 'unixepoch') cycle_time lead_time datetime(created, 'unixepoch') datetime(submitted, 'unixepoch') link
422 wmf.6 2024-05-21 03:01:39 3040057 589118 2024-04-15 22:34:02 2024-05-14 07:23:01 https://gerrit.wikimedia.org/r/1019841
421 wmf.6 2024-05-21 03:01:39 3036305 587585 2024-04-15 23:36:34 2024-05-14 07:48:34 https://gerrit.wikimedia.org/r/1019853
244 wmf.5 2024-05-14 03:01:41 1307246 586672 2024-04-28 23:54:15 2024-05-07 08:03:49 https://gerrit.wikimedia.org/r/1024975
618 wmf.6 2024-05-21 03:01:39 170491763 586440 2018-12-25 20:12:16 2024-05-14 08:07:39 https://gerrit.wikimedia.org/r/481318
420 wmf.6 2024-05-21 03:01:39 2817108 586156 2024-04-18 12:29:51 2024-05-14 08:12:23 https://gerrit.wikimedia.org/r/1021274

Lead time

The time from commit to deploy (in seconds)

cycle['lead_time_days'] = cycle['lead_time'] / (60*60*24)
cycle['cycle_time_days'] = cycle['cycle_time'] / (60*60*24)
cycle.sort_values(by='lead_time_days').head()
version datetime(start_time, 'unixepoch') cycle_time lead_time datetime(created, 'unixepoch') datetime(submitted, 'unixepoch') link lead_time_days cycle_time_days
249 wmf.5 2024-05-14 03:01:41 9529 1085 2024-05-14 00:22:52 2024-05-14 02:43:36 https://gerrit.wikimedia.org/r/1031040 0.012558 0.110289
156 wmf.5 2024-05-14 03:01:41 479204 1089 2024-05-08 13:54:57 2024-05-14 02:43:32 https://gerrit.wikimedia.org/r/1029111 0.012604 5.546343
518 wmf.6 2024-05-21 03:01:39 731263 2609 2024-05-12 15:53:56 2024-05-21 02:18:10 https://gerrit.wikimedia.org/r/1030588 0.030197 8.463692
638 wmf.6 2024-05-21 03:01:39 94900 2609 2024-05-20 00:39:59 2024-05-21 02:18:10 https://gerrit.wikimedia.org/r/1033700 0.030197 1.098380
115 wmf.5 2024-05-14 03:01:41 886376 5049 2024-05-03 20:48:45 2024-05-14 01:37:32 https://gerrit.wikimedia.org/r/1026948 0.058438 10.258981
from matplotlib import ticker as mticker
import numpy as np

# Adapted from <https://stackoverflow.com/a/60132262>
fig, ax = plt.subplots(1, 3, sharey=True, figsize=(20,10), constrained_layout=True)
plt.tight_layout(pad=5)
sns.violinplot(data=cycle,x='version', y='lead_time_days', ax=ax[0])
sns.swarmplot(data=cycle,x='version', y='lead_time_days', ax=ax[1])
sns.stripplot(data=cycle,x='version', y='lead_time_days', ax=ax[2])
ax[0].set_ylabel('Time from merge to deploy (days)', labelpad=20.0)
ax[1].set_ylabel('')
ax[2].set_ylabel('')
ax[0].yaxis.set_major_formatter(mticker.StrMethodFormatter("{x}"))

plt.suptitle('Lead time of changes per version', x=0.02, y=.92, ha='left', fontsize=25)
plt.show()

(matplotlib UserWarning: the figure layout has changed to tight)

[figure: violin, swarm, and strip plots of lead time of changes per version]

fig, ax = plt.subplots(1, 1, sharey=True, figsize=(20,10))
ax.set_yscale('log')
sns.violinplot(data=cycle,x='version', y='lead_time_days', ax=ax)
ax.set_ylabel('Time from merge to deploy (days)', labelpad=20.0)

ax.yaxis.set_major_formatter(mticker.StrMethodFormatter("{x}"))
plt.suptitle('Lead time of changes per version (log scale)', x=0.02, y=.92, ha='left', fontsize=25)
plt.show()

[figure: violin plot of lead time of changes per version (log scale)]

Cycle time

The time from patchset submission for code review to deploy

from matplotlib import ticker as mticker
import numpy as np

# Adapted from <https://stackoverflow.com/a/60132262>
fig, ax = plt.subplots(1, 3, sharey=True, figsize=(20,10))
sns.violinplot(data=cycle,x='version', y='cycle_time_days', ax=ax[0])
sns.swarmplot(data=cycle,x='version', y='cycle_time_days', ax=ax[1])
sns.stripplot(data=cycle,x='version', y='cycle_time_days', ax=ax[2])
ax[0].set_ylabel('Time from patch creation to deploy (days)', labelpad=20.0)
ax[1].set_ylabel('')
ax[2].set_ylabel('')
ax[0].yaxis.set_major_formatter(mticker.StrMethodFormatter("{x}"))
plt.suptitle('Cycle time of changes per version', x=0.02, y=.92, ha='left', fontsize=25)
plt.show()

(seaborn UserWarning, repeated for each panel: 74.6%–82.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.)

[figure: violin, swarm, and strip plots of cycle time of changes per version]

Cycle time log scale

The outliers make it hard to see the cycle time for the majority of our patches. Here's the log-scale version.

fig, ax = plt.subplots(1, 1, sharey=True, figsize=(20,10))
ax.set_yscale('log')
sns.violinplot(data=cycle,x='version', y='cycle_time_days', ax=ax)
ax.set_ylabel('Time from patch creation to deploy (days)', labelpad=20.0)
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter("{x}"))
plt.suptitle('Cycle time of changes per version (log scale)', x=0.02, y=.92, ha='left', fontsize=25)
plt.show()

[figure: violin plot of cycle time of changes per version (log scale)]

Train bugfixes

Backport

  • v. To retroactively supply a fix, or a new feature, to a previous version of a software product at the same time as (or after) supplying it to the current version.
  • n. A commit that is backported

Each train has many backports. Each backport may be supplied to many trains. Backports add features, change feature flags, and fix bugs.

Some backports have tasks associated with them. Some tasks are "bugs" or "errors".

When a backport is associated with a task that is a "bug" or an "error," it's a bugfix. The number of bugfixes per train is a good signal of the number of bugs present in that train.

Bugs may persist for many trains; however, if a developer makes a backport, they felt the bug was severe enough to warrant fixing immediately rather than waiting a week—that's a signal about train quality, too.
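
For anyone who prefers pandas to SQL, here's a rough, illustrative equivalent of the query below (it assumes the same train, bug, and bug_train tables used throughout this notebook):

# Sketch only: bugfix count per train via pandas merges instead of SQL joins.
trains = pd.read_sql('select id, version, start_time from train', engine)
links = pd.read_sql('select train_id, bug_id from bug_train', engine)
bugs = pd.read_sql('select id, link from bug', engine)

bugfixes_per_train = (
    links
    .merge(trains, left_on='train_id', right_on='id')
    .merge(bugs, left_on='bug_id', right_on='id', suffixes=('_train', '_bug'))
    .groupby('version', as_index=False)
    .agg(bug_count=('link', 'count'))
)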

train_bugs = pd.read_sql('''
select
  version,
  count(b.link) as bug_count
from
  train t
  join bug_train bt on bt.train_id = t.id
  join bug b on bt.bug_id = b.id
group by
  version
order by
  start_time
''', engine)
train_bugs.head()
version bug_count
0 1.27.0-wmf.16 3
1 1.27.0-wmf.19 1
2 1.27.0-wmf.21 1
3 1.27.0-wmf.22 2
4 1.27.0-wmf.23 3

Bug count histogram

Seems to follow a power law (there's a quick log-log check after the histogram below).

train_bugs.hist()
plt.title('Bug count per train')

[figure: histogram of bug count per train]
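
A quick, hand-wavy way to eyeball the power-law claim (a sketch, not a cell from the original notebook): plot how often each bug count occurs on log-log axes and check whether it looks roughly linear.

# Sketch: frequency of each bug_count on log-log axes; an approximately
# straight line is consistent with a power law.
bug_count_freq = train_bugs['bug_count'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(bug_count_freq.index, bug_count_freq.values)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Bugs per train')
ax.set_ylabel('Number of trains')
ax.set_title('Bug count frequency (log-log)')
plt.show()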

train_bugs.sort_values(by="bug_count", ascending=False).head()
version bug_count
120 1.34.0-wmf.13 13
90 1.32.0-wmf.24 11
206 1.37.0-wmf.20 11
194 1.37.0-wmf.5 11
119 1.34.0-wmf.11 11
train_bugs = pd.read_sql('''
select
  version,
  count(b.link) as bug_count,
  rollbacks,
  (select count(*) from blocker b where b.train_id = t.id and resolved = 1) as resolved_blockers,
      (select max(time_in_review) from patch p where p.train_id = t.id) as max_time_in_review,
    (select max(comments) from patch where patch.train_id = t.id) as max_comments_per_patch,
    (select max(start_time - created) from patch p where p.train_id = t.id) as max_cycle_time,
  patches
from
  train t
  join bug_train bt on bt.train_id = t.id
  join bug b on bt.bug_id = b.id
group by
  version
order by
  start_time
''', engine)
train_bugs.head()
version bug_count rollbacks resolved_blockers max_time_in_review max_comments_per_patch max_cycle_time patches
0 1.27.0-wmf.16 3 1 0 17557313.0 95.0 18128243.0 322
1 1.27.0-wmf.19 1 1 1 58248774.0 45.0 58798558.0 230
2 1.27.0-wmf.21 1 0 0 120956985.0 73.0 120937246.0 180
3 1.27.0-wmf.22 2 0 0 50045244.0 58.0 50160003.0 416
4 1.27.0-wmf.23 3 2 3 20153065.0 40.0 20228209.0 168
train_bugs_corr = train_bugs.copy(deep=True)
train_bugs_corr.drop(['version'], axis=1, inplace=True)
fig, ax = plt.subplots(figsize=(10,10))         # Sample figsize in inches
sns.heatmap(train_bugs_corr.corr(), annot=True, cmap="YlGnBu", linewidths=0.3, annot_kws={"size": 8}, ax=ax)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

[figure: heatmap of correlations between bug count and other train variables]

train_bugs[train_bugs['version'] == TRAINS[-1]]
version bug_count rollbacks resolved_blockers max_time_in_review max_comments_per_patch max_cycle_time patches
326 1.43.0-wmf.7 2 0 0 47093205.0 54.0 47545250.0 283

A look at comments per patch

DCaro made an interesting comment on the fame blog about this repo. This is my ham-fisted investigation.

comm_dist = pd.read_sql('select version, sum(comments) as comm from patch p join train t on p.train_id = t.id group by t.version', engine)
comm_dist.head()
version comm
0 1.27.0-wmf.16 1840
1 1.27.0-wmf.17 1372
2 1.27.0-wmf.18 1204
3 1.27.0-wmf.19 1440
4 1.27.0-wmf.20 1573
comm_dist.set_index('version')['comm'].hist(figsize=(12, 10))
plt.xlabel("Patch Comments", labelpad=15)
plt.title("Comments per Train", y=1.02, fontsize=22)

[figure: histogram of comments per train]

pcommdf = pd.read_sql('select link, comments from patch', engine)
pcommdf.head()
link comments
0 https://gerrit.wikimedia.org/r/#/q/ccbfcf28,n,z 1
1 https://gerrit.wikimedia.org/r/#/q/3302274f,n,z 1
2 https://gerrit.wikimedia.org/r/#/q/8b5471b5,n,z 0
3 https://gerrit.wikimedia.org/r/#/q/a6abbb67,n,z 15
4 https://gerrit.wikimedia.org/r/#/q/af916aad,n,z 3
pcommdf.set_index('link')['comments'].hist(figsize=(12, 10),bins=100)

[figure: histogram of comments per patch]

pcommdf.describe()
comments
count 119134.000000
mean 3.524561
std 5.192373
min 0.000000
25% 1.000000
50% 2.000000
75% 4.000000
max 354.000000
# Let's try to remove huge outliers
pcommdf[np.abs(pcommdf.comments - pcommdf.comments.mean()) <= (5 * pcommdf.comments.std())]
link comments
0 https://gerrit.wikimedia.org/r/#/q/ccbfcf28,n,z 1
1 https://gerrit.wikimedia.org/r/#/q/3302274f,n,z 1
2 https://gerrit.wikimedia.org/r/#/q/8b5471b5,n,z 0
3 https://gerrit.wikimedia.org/r/#/q/a6abbb67,n,z 15
4 https://gerrit.wikimedia.org/r/#/q/af916aad,n,z 3
... ... ...
119129 https://gerrit.wikimedia.org/r/1035369 1
119130 https://gerrit.wikimedia.org/r/1035368 1
119131 https://gerrit.wikimedia.org/r/1035367 1
119132 https://gerrit.wikimedia.org/r/1035366 1
119133 https://gerrit.wikimedia.org/r/1032418 2

118440 rows × 2 columns

pcommdf[np.abs(pcommdf.comments - pcommdf.comments.mean()) <= (5 * pcommdf.comments.std())].hist(bins=50,figsize=(10, 10))
plt.xlabel("Distribution of comments on patches", labelpad=15)
plt.title("Comments per Patch", y=1.02, fontsize=22)

[figure: histogram of comments per patch, outliers removed]

Patch authors per train

patch_authors = pd.read_sql('''
SELECT train.version,
    COUNT(DISTINCT patch.owner) as unique_owners
    FROM train
    JOIN patch ON train.id = patch.train_id
    GROUP BY train.id
    ORDER BY start_time;
''', engine)
patch_authors.head()
version unique_owners
0 1.27.0-wmf.16 65
1 1.27.0-wmf.17 60
2 1.27.0-wmf.18 61
3 1.27.0-wmf.19 67
4 1.27.0-wmf.20 71
patch_authors.set_index('version').unique_owners.plot(linewidth=1)
plt.xticks(rotation=90)

[figure: line plot of unique patch authors per train, 1.27.0-wmf.16 through 1.43.0-wmf.6]

patch_authors.describe()
unique_owners
count 352.000000
mean 68.238636
std 11.973850
min 11.000000
25% 60.000000
50% 68.000000
75% 75.000000
max 108.000000
sns.jointplot(data=train_bugs, x='patches', y='resolved_blockers', kind='reg')

[figure: joint plot of patches vs. resolved_blockers with regression line]

Bugs escaping into production

Every time we backport a fix, it counts as a bug, and it's a bug we missed before the code went to production. This is known as an "escape."

escapes = pd.read_sql('''
select
  version,
  b.link
from
  bug_train bt
  join train t on t.id = bt.train_id
  join bug b on bt.bug_id = b.id
  join bug_bug_patch bbp on bbp.bug_id = b.id
  join bug_patch bp on bp.id = bbp.bug_patch_id
  join bug_file bf on bp.id = bf.bug_patch_id
group by
  version, b.link;
''', engine)
escapes.head()
version link
0 1.27.0-wmf.16 https://phabricator.wikimedia.org/T129641
1 1.27.0-wmf.16 https://phabricator.wikimedia.org/T129704
2 1.27.0-wmf.16 https://phabricator.wikimedia.org/T129715
3 1.27.0-wmf.19 https://phabricator.wikimedia.org/T131283
4 1.27.0-wmf.21 https://phabricator.wikimedia.org/T132645
escapes.version.value_counts()
version
1.34.0-wmf.13    13
1.37.0-wmf.20    11
1.32.0-wmf.24    11
1.34.0-wmf.11    11
1.37.0-wmf.5     11
                 ..
1.31.0-wmf.11     1
1.31.0-wmf.1      1
1.30.0-wmf.9      1
1.35.0-wmf.40     1
1.31.0-wmf.4      1
Name: count, Length: 327, dtype: int64
df_escapes = escapes.version.value_counts(dropna=True, sort=False).rename_axis('version').to_frame('counts')
# sns.lineplot(df_escapes, x='version')
df_escapes.head()
counts
version
1.27.0-wmf.16 3
1.27.0-wmf.19 1
1.27.0-wmf.21 1
1.27.0-wmf.22 2
1.27.0-wmf.23 3
df_escapes = df_escapes.reset_index()
df_escapes.head()
version counts
0 1.27.0-wmf.16 3
1 1.27.0-wmf.19 1
2 1.27.0-wmf.21 1
3 1.27.0-wmf.22 2
4 1.27.0-wmf.23 3
df_escapes['counts'] = pd.to_numeric(df_escapes['counts'], errors='coerce')
df_escapes['version'] = df_escapes['version'].astype(str)
df_escapes.head()
version counts
0 1.27.0-wmf.16 3
1 1.27.0-wmf.19 1
2 1.27.0-wmf.21 1
3 1.27.0-wmf.22 2
4 1.27.0-wmf.23 3
import numpy as np

# Select a subset of versions to display
subset_versions = df_escapes['version'].unique()[::1]  # Every version...just make a copy

# Filter the DataFrame to include only the subset of versions
df_subset = df_escapes[df_escapes['version'].isin(subset_versions)]

# Extract x and y values from the DataFrame
x = df_subset['version']
y = df_subset['counts']

# Create a figure and axis
fig, ax = plt.subplots(figsize=(12, 8))

# Plot the line plot
ax.plot(x, y, marker='o', linestyle='-', label='Data')

# Calculate the trend line using polynomial regression (degree=1)
coefficients = np.polyfit(np.arange(len(x)), y, deg=1)
trendline = np.poly1d(coefficients)
ax.plot(x, trendline(np.arange(len(x))), linestyle='--', label='Trend Line')

# Calculate the slope of the trend line
slope = coefficients[0]

# Set labels and title
ax.set_xlabel('Version')
ax.set_ylabel('Counts')
ax.set_title(f'Trend Plot of Escapes by Version\nSlope: {slope:.2f} escapes per version')

# Reduce the number of x-axis tick labels
tick_positions = np.arange(0, len(x), step=20)  # Adjust the step value to show desired number of labels
tick_labels = x[tick_positions]
plt.xticks(tick_positions, tick_labels, rotation=45)

# Display legend
ax.legend()

# Show the plot
plt.show()

[figure: trend plot of escapes by version]