----------------------------------------------------------------------------------------
/ \
| /$$$$$$$$ /$$ /$$$$$$ /$$ /$$ |
||__ $$__/ |__/ /$$__ $$ | $$ | $$ |
| | $$ /$$$$$$ /$$$$$$ /$$ /$$$$$$$ | $$ \__//$$$$$$ /$$$$$$ /$$$$$$ /$$$$$$$|
| | $$ /$$__ $$|____ $$| $$| $$__ $$| $$$$$$|_ $$_/ |____ $$|_ $$_/ /$$_____/|
| | $$| $$ \__/ /$$$$$$$| $$| $$ \ $$ \____ $$ | $$ /$$$$$$$ | $$ | $$$$$$ |
| | $$| $$ /$$__ $$| $$| $$ | $$ /$$ \ $$ | $$ /$$ /$$__ $$ | $$ /$$\____ $$|
| | $$| $$ | $$$$$$$| $$| $$ | $$| $$$$$$/ | $$$$/| $$$$$$$ | $$$$//$$$$$$$/|
| |__/|__/ \_______/|__/|__/ |__/ \______/ \___/ \_______/ \___/ |_______/ |
| |
| |
| |
| 🚂 ¯\_(ツ)_/¯? |
\ /
----------------------------------------------------------------------------------------
\
\
\
___ ____
⎛ ⎛ ,----
\ //==--'
_//|,.·//==--' ____________________________
_OO≣=- ︶ ᴹw ⎞_§ ______ ___\ ___\ ,\__ \/ __ \
(∞)_, ) ( | ______/__ \/ /__ / /_/ / /_/ /
¨--¨|| |- ( / ______\____/ \___/ \__^_/ .__/
««_/ «_/ jgs/bd808 /_/
In which I look at data from the past several hundred trains and pretend that I know how to do exploratory data analysis.
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine

# Train versions under analysis, one per line in data/TRAINS.
with open('data/TRAINS') as f:
    TRAINS = [line.strip() for line in f]

# SQLite database holding the train/patch/blocker/bug tables.
engine = create_engine('sqlite:///data/train.db')
# One row per train: rollback, delay, blocker, and patch-review aggregates.
# The correlated subqueries compute per-train blocker counts and maxima
# over that train's patches.
df = pd.read_sql('''
SELECT
version,
rollbacks,
rollbacks_time,
group2_delay_days,
(group0_delay_days +
group1_delay_days +
group2_delay_days) as total_delay,
total_time as train_total_time,
(select count(*) from blocker b where b.train_id = t.id) as blockers,
(select count(*) from blocker b where b.train_id = t.id and resolved = 1) as resolved_blockers,
patches,
(select max(time_in_review) from patch p where p.train_id = t.id) as max_time_in_review,
(select max(comments) from patch where patch.train_id = t.id) as max_comments_per_patch,
(select max(start_time - created) from patch p where p.train_id = t.id) as max_cycle_time
FROM train t
''', engine)
# Makes your data 538% better...I think
plt.style.use('fivethirtyeight')
df.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | rollbacks | rollbacks_time | group2_delay_days | total_delay | train_total_time | blockers | resolved_blockers | patches | max_time_in_review | max_comments_per_patch | max_cycle_time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.37.0-wmf.1 | 0 | 0 | 0 | 0 | 178349 | 5 | 3 | 450 | 36809044.0 | 27.0 | 36952873.0 |
1 | 1.37.0-wmf.3 | 3 | 94493 | 0 | 1 | 219880 | 7 | 6 | 366 | 56122286.0 | 30.0 | 56562620.0 |
2 | 1.37.0-wmf.4 | 1 | 66812 | 1 | 3 | 263742 | 9 | 4 | 422 | 38820872.0 | 29.0 | 38982601.0 |
3 | 1.36.0-wmf.1 | 0 | 0 | 4 | 4 | 519622 | 1 | 1 | 566 | 47181045.0 | 31.0 | 47755190.0 |
4 | 1.36.0-wmf.2 | 4 | 389769 | 4 | 5 | 554704 | 7 | 1 | 273 | 110996452.0 | 33.0 | 111569626.0 |
# Correlation heatmap across all numeric per-train metrics
# (version is a label, not a number, so it is dropped first).
numeric = df.drop(columns=['version'])
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(numeric.corr(), annot=True, cmap="YlGnBu",
            linewidths=0.3, annot_kws={"size": 8}, ax=ax)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.title('Correlation of train variables')
plt.show()
# Distribution of blocker counts per train.
blockers_per_train = df.set_index('version')['blockers']
blockers_per_train.hist(figsize=(12, 10))
plt.xlabel("Blockers", labelpad=15)
plt.title("Blockers per Train", y=1.02, fontsize=22)
Text(0.5, 1.02, 'Blockers per Train')
# Patch count per train, plotted in row order.
df['patches'].plot(linewidth=1)
plt.xticks(rotation=90)
(array([-50., 0., 50., 100., 150., 200., 250., 300., 350., 400.]),
[Text(-50.0, 0, '−50'),
Text(0.0, 0, '0'),
Text(50.0, 0, '50'),
Text(100.0, 0, '100'),
Text(150.0, 0, '150'),
Text(200.0, 0, '200'),
Text(250.0, 0, '250'),
Text(300.0, 0, '300'),
Text(350.0, 0, '350'),
Text(400.0, 0, '400')])
df[df['blockers'] > 10].sort_values(by='blockers', ascending=False)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | rollbacks | rollbacks_time | group2_delay_days | total_delay | train_total_time | blockers | resolved_blockers | patches | max_time_in_review | max_comments_per_patch | max_cycle_time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
82 | 1.34.0-wmf.20 | 1 | 16897 | 5 | 5 | 600096 | 20 | 11 | 413 | 60583935.0 | 37.0 | 60715205.0 |
242 | 1.38.0-wmf.19 | 1 | 70250 | 0 | 1 | 176801 | 18 | 10 | 319 | 85391889.0 | 24.0 | 85778365.0 |
103 | 1.33.0-wmf.22 | 0 | 0 | 0 | 1 | 63844 | 17 | 11 | 391 | 107411197.0 | 45.0 | 107659715.0 |
77 | 1.34.0-wmf.14 | 2 | 412678 | 4 | 5 | 524983 | 16 | 7 | 646 | 73502579.0 | 34.0 | 73539481.0 |
136 | 1.31.0-wmf.20 | 2 | 134534 | 1 | 5 | 255075 | 14 | 12 | 822 | 66099647.0 | 75.0 | 67210868.0 |
210 | 1.30.0-wmf.2 | 2 | 595079 | 0 | 0 | 782668 | 14 | 12 | 462 | 67958577.0 | 56.0 | 68943029.0 |
241 | 1.38.0-wmf.18 | 0 | 0 | 0 | 0 | 171956 | 13 | 7 | 287 | 186407187.0 | 43.0 | 186862129.0 |
124 | 1.32.0-wmf.22 | 0 | 0 | 0 | 0 | 173055 | 12 | 7 | 824 | 64869761.0 | 44.0 | 65392036.0 |
259 | 1.39.0-wmf.10 | 5 | 57256 | 0 | 1 | 172544 | 12 | 5 | 206 | 89212513.0 | 22.0 | 89427805.0 |
57 | 1.35.0-wmf.31 | 3 | 365118 | 4 | 15 | 516489 | 11 | 7 | 427 | 72215548.0 | 70.0 | 72206585.0 |
76 | 1.34.0-wmf.13 | 2 | 14912 | 0 | 1 | 183853 | 11 | 8 | 471 | 53208155.0 | 23.0 | 53820019.0 |
187 | 1.28.0-wmf.21 | 1 | 88476 | 0 | 1 | 176082 | 11 | 8 | 514 | 60386455.0 | 93.0 | 61360331.0 |
# One row per (train, blocker): which deployment group each blocker
# was discovered in.
block_df = pd.read_sql('''
SELECT
version,
group_blocked
FROM train t
JOIN blocker b ON t.id = b.train_id
''', engine)
block_df.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | group_blocked | |
---|---|---|
0 | 1.37.0-wmf.7 | -1 |
1 | 1.37.0-wmf.7 | -1 |
2 | 1.37.0-wmf.12 | 2 |
3 | 1.37.0-wmf.12 | 1 |
4 | 1.37.0-wmf.12 | 1 |
block_df.group_blocked.unique()
array([-1, 2, 1, 0])
# Human-readable labels for the group_blocked codes
# (-1 is labeled "Earlier" — presumably found before group0; verify upstream).
group_labels = {
    -1: "Earlier",
    0: "Group0",
    1: "Group1",
    2: "Group2",
}
block_df['blocker_added'] = block_df['group_blocked'].map(group_labels)
block_df.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | group_blocked | blocker_added | |
---|---|---|---|
0 | 1.37.0-wmf.7 | -1 | Earlier |
1 | 1.37.0-wmf.7 | -1 | Earlier |
2 | 1.37.0-wmf.12 | 2 | Group2 |
3 | 1.37.0-wmf.12 | 1 | Group1 |
4 | 1.37.0-wmf.12 | 1 | Group1 |
block_df.group_blocked.value_counts()
group_blocked
-1 557
1 422
0 349
2 143
Name: count, dtype: int64
block_df.version
0 1.37.0-wmf.7
1 1.37.0-wmf.7
2 1.37.0-wmf.12
3 1.37.0-wmf.12
4 1.37.0-wmf.12
...
1466 1.43.0-wmf.6
1467 1.43.0-wmf.6
1468 1.43.0-wmf.6
1469 1.43.0-wmf.7
1470 1.43.0-wmf.7
Name: version, Length: 1471, dtype: object
# Bar-style histogram of blockers by the group where they were discovered.
# Sorting by the numeric group code puts the four bins in -1, 0, 1, 2 order.
# (The original had a discarded `block_df.set_index('version')` no-op and an
# unused `xticks_labels` computation; both removed.)
block_df.sort_values('group_blocked', inplace=True)
fig = plt.figure(figsize=(16, 6))
plt.grid(color='white', lw=0.5, axis='x')
n, bins, patches = plt.hist(block_df.blocker_added, bins=4, rwidth=0.95)
# Center one tick under each of the four bars.
xticks = [(bins[idx + 1] + value) / 2 for idx, value in enumerate(bins[:-1])]
plt.xticks(xticks, labels=["Before group0", "Group0", "Group1", "Group2"])
# remove y ticks — the counts are drawn on top of the bars instead
plt.yticks([])
# plot values on top of bars
for idx, value in enumerate(n):
    if value > 0:
        plt.text(xticks[idx], value + 5, int(value), ha='center')
plt.title('Train Blockers by Group Where They Were Discovered', loc='left', pad=30)
plt.show()
# Per-patch insertion/deletion counts joined to the train each patch rode.
# Deletions are negated so they plot below the zero line.
patches = pd.read_sql('''
SELECT
link,
version,
submitted,
insertions as ins,
(deletions*-1) as del
FROM patch p JOIN train t ON t.id = p.train_id
''', engine)
patches.describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
submitted | ins | del | |
---|---|---|---|
count | 1.191340e+05 | 1.191340e+05 | 1.191340e+05 |
mean | 1.592731e+09 | 2.941666e+02 | -2.334408e+02 |
std | 7.264151e+07 | 5.028140e+04 | 4.821069e+04 |
min | 1.431572e+09 | 0.000000e+00 | -1.661412e+07 |
25% | 1.535077e+09 | 2.000000e+00 | -1.800000e+01 |
50% | 1.589385e+09 | 7.000000e+00 | -4.000000e+00 |
75% | 1.655997e+09 | 3.400000e+01 | -1.000000e+00 |
max | 1.716881e+09 | 1.728860e+07 | 0.000000e+00 |
# Net lines of code per patch: insertions plus (already-negative) deletions.
patches['loc'] = patches['ins'].add(patches['del'])
patches.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
link | version | submitted | ins | del | loc | |
---|---|---|---|---|---|---|
0 | https://gerrit.wikimedia.org/r/#/q/ccbfcf28,n,z | 1.37.0-wmf.1 | 1618945759 | 5 | -1 | 4 |
1 | https://gerrit.wikimedia.org/r/#/q/3302274f,n,z | 1.37.0-wmf.1 | 1618878371 | 1156 | -660 | 496 |
2 | https://gerrit.wikimedia.org/r/#/q/8b5471b5,n,z | 1.37.0-wmf.1 | 1618343309 | 976 | -3 | 973 |
3 | https://gerrit.wikimedia.org/r/#/q/a6abbb67,n,z | 1.37.0-wmf.1 | 1618341075 | 8 | -29 | -21 |
4 | https://gerrit.wikimedia.org/r/#/q/af916aad,n,z | 1.37.0-wmf.1 | 1618300868 | 7 | -5 | 2 |
# Convert the epoch-seconds 'submitted' column to timestamps and index on it.
patches['submitted'] = pd.to_datetime(patches['submitted'], unit='s')
patches.set_index('submitted', inplace=True)
# Group by calendar month; the identity apply keeps every row while
# attaching the month grouping as an index level.
out = patches.groupby(pd.Grouper(freq='M')).apply(lambda x: x)
out = out[out['link'] != 'https://gerrit.wikimedia.org/r/#/q/9a08dbab,n,z'] # The one patch that inserts 17.2M lines of code
# Running total of net lines of code over time.
out['ok'] = out['loc'].cumsum()
Cycle time is the time from when a patch enters code review to the time that it's in production. Lead time is the time it takes from commit to production.
# GOAL
# train lead_time cycle_time Id
# 0 1.37.0-wmf.6 200 2000 u1234
# 1 1.37.0-wmf.6 123 2800 u1235
# Per-patch cycle time (patch creation -> deploy) and lead time
# (merge -> deploy) for the three trains listed first in TRAINS.
# The original %-interpolated the version strings straight into the SQL
# (and used non-standard double-quoted SQL string literals); bind them as
# qmark parameters instead.
cycle = pd.read_sql('''
SELECT
substr(version, 8) as version,
datetime(start_time, 'unixepoch'),
(start_time - created) as cycle_time,
(start_time - submitted) as lead_time,
datetime(created, 'unixepoch'),
datetime(submitted, 'unixepoch'),
link
FROM patch p JOIN train t ON t.id = p.train_id
WHERE (lead_time > 0 AND cycle_time > 0)
AND version IN (?, ?, ?)
''', engine, params=tuple(TRAINS[:3]))
cycle.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | datetime(start_time, 'unixepoch') | cycle_time | lead_time | datetime(created, 'unixepoch') | datetime(submitted, 'unixepoch') | link | |
---|---|---|---|---|---|---|---|
0 | wmf.5 | 2024-05-14 03:01:41 | 8195933 | 7497 | 2024-02-09 06:22:48 | 2024-05-14 00:56:44 | https://gerrit.wikimedia.org/r/999129 |
1 | wmf.5 | 2024-05-14 03:01:41 | 20163 | 7409 | 2024-05-13 21:25:38 | 2024-05-14 00:58:12 | https://gerrit.wikimedia.org/r/1030971 |
2 | wmf.5 | 2024-05-14 03:01:41 | 21159 | 19285 | 2024-05-13 21:09:02 | 2024-05-13 21:40:16 | https://gerrit.wikimedia.org/r/1031000 |
3 | wmf.5 | 2024-05-14 03:01:41 | 28359 | 26929 | 2024-05-13 19:09:02 | 2024-05-13 19:32:52 | https://gerrit.wikimedia.org/r/1030949 |
4 | wmf.5 | 2024-05-14 03:01:41 | 182643 | 30033 | 2024-05-12 00:17:38 | 2024-05-13 18:41:08 | https://gerrit.wikimedia.org/r/1030546 |
cycle.sort_values(by='lead_time', ascending=False).head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | datetime(start_time, 'unixepoch') | cycle_time | lead_time | datetime(created, 'unixepoch') | datetime(submitted, 'unixepoch') | link | |
---|---|---|---|---|---|---|---|
422 | wmf.6 | 2024-05-21 03:01:39 | 3040057 | 589118 | 2024-04-15 22:34:02 | 2024-05-14 07:23:01 | https://gerrit.wikimedia.org/r/1019841 |
421 | wmf.6 | 2024-05-21 03:01:39 | 3036305 | 587585 | 2024-04-15 23:36:34 | 2024-05-14 07:48:34 | https://gerrit.wikimedia.org/r/1019853 |
244 | wmf.5 | 2024-05-14 03:01:41 | 1307246 | 586672 | 2024-04-28 23:54:15 | 2024-05-07 08:03:49 | https://gerrit.wikimedia.org/r/1024975 |
618 | wmf.6 | 2024-05-21 03:01:39 | 170491763 | 586440 | 2018-12-25 20:12:16 | 2024-05-14 08:07:39 | https://gerrit.wikimedia.org/r/481318 |
420 | wmf.6 | 2024-05-21 03:01:39 | 2817108 | 586156 | 2024-04-18 12:29:51 | 2024-05-14 08:12:23 | https://gerrit.wikimedia.org/r/1021274 |
The time from commit to deploy (in seconds)
# Convert second-resolution durations to days for readable plots.
seconds_per_day = 60 * 60 * 24
cycle['lead_time_days'] = cycle['lead_time'] / seconds_per_day
cycle['cycle_time_days'] = cycle['cycle_time'] / seconds_per_day
cycle.sort_values(by='lead_time_days').head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | datetime(start_time, 'unixepoch') | cycle_time | lead_time | datetime(created, 'unixepoch') | datetime(submitted, 'unixepoch') | link | lead_time_days | cycle_time_days | |
---|---|---|---|---|---|---|---|---|---|
249 | wmf.5 | 2024-05-14 03:01:41 | 9529 | 1085 | 2024-05-14 00:22:52 | 2024-05-14 02:43:36 | https://gerrit.wikimedia.org/r/1031040 | 0.012558 | 0.110289 |
156 | wmf.5 | 2024-05-14 03:01:41 | 479204 | 1089 | 2024-05-08 13:54:57 | 2024-05-14 02:43:32 | https://gerrit.wikimedia.org/r/1029111 | 0.012604 | 5.546343 |
518 | wmf.6 | 2024-05-21 03:01:39 | 731263 | 2609 | 2024-05-12 15:53:56 | 2024-05-21 02:18:10 | https://gerrit.wikimedia.org/r/1030588 | 0.030197 | 8.463692 |
638 | wmf.6 | 2024-05-21 03:01:39 | 94900 | 2609 | 2024-05-20 00:39:59 | 2024-05-21 02:18:10 | https://gerrit.wikimedia.org/r/1033700 | 0.030197 | 1.098380 |
115 | wmf.5 | 2024-05-14 03:01:41 | 886376 | 5049 | 2024-05-03 20:48:45 | 2024-05-14 01:37:32 | https://gerrit.wikimedia.org/r/1026948 | 0.058438 | 10.258981 |
from matplotlib import ticker as mticker
import numpy as np

# Adapted from <https://stackoverflow.com/a/60132262>
# Three views of the same lead-time distribution: violin, swarm, strip.
fig, ax = plt.subplots(1, 3, sharey=True, figsize=(20, 10), constrained_layout=True)
plt.tight_layout(pad=5)
for axis, plot in zip(ax, (sns.violinplot, sns.swarmplot, sns.stripplot)):
    plot(data=cycle, x='version', y='lead_time_days', ax=axis)
ax[0].set_ylabel('Time from merge to deploy (days)', labelpad=20.0)
ax[1].set_ylabel('')
ax[2].set_ylabel('')
ax[0].yaxis.set_major_formatter(mticker.StrMethodFormatter("{x}"))
plt.suptitle('Lead time of changes per version', x=0.02, y=.92, ha='left', fontsize=25)
plt.show()
/tmp/ipykernel_1609226/2853478908.py:6: UserWarning: The figure layout has changed to tight
plt.tight_layout(pad=5)
# Same lead-time distribution on a log axis so the bulk of patches is visible.
fig, ax = plt.subplots(1, 1, sharey=True, figsize=(20, 10))
ax.set_yscale('log')
sns.violinplot(data=cycle, x='version', y='lead_time_days', ax=ax)
ylabel = 'Time from merge to deploy (days)'
ax.set_ylabel(ylabel, labelpad=20.0)
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter("{x}"))
plt.suptitle('Lead time of changes per version (log scale)', x=0.02, y=.92, ha='left', fontsize=25)
plt.show()
The time from patchset submission for code review to deploy
from matplotlib import ticker as mticker
import numpy as np

# Adapted from <https://stackoverflow.com/a/60132262>
# Violin, swarm, and strip views of cycle time per version.
fig, ax = plt.subplots(1, 3, sharey=True, figsize=(20, 10))
for axis, plot in zip(ax, (sns.violinplot, sns.swarmplot, sns.stripplot)):
    plot(data=cycle, x='version', y='cycle_time_days', ax=axis)
ax[0].set_ylabel('Time from patch creation to deploy (days)', labelpad=20.0)
ax[1].set_ylabel('')
ax[2].set_ylabel('')
ax[0].yaxis.set_major_formatter(mticker.StrMethodFormatter("{x}"))
plt.suptitle('Cycle time of changes per version', x=0.02, y=.92, ha='left', fontsize=25)
plt.show()
/home/thcipriani/Projects/Wikimedia/train-stats/venv/lib/python3.11/site-packages/seaborn/categorical.py:3544: UserWarning: 78.8% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
warnings.warn(msg, UserWarning)
/home/thcipriani/Projects/Wikimedia/train-stats/venv/lib/python3.11/site-packages/seaborn/categorical.py:3544: UserWarning: 78.3% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
warnings.warn(msg, UserWarning)
/home/thcipriani/Projects/Wikimedia/train-stats/venv/lib/python3.11/site-packages/seaborn/categorical.py:3544: UserWarning: 74.6% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
warnings.warn(msg, UserWarning)
/home/thcipriani/Projects/Wikimedia/train-stats/venv/lib/python3.11/site-packages/seaborn/categorical.py:3544: UserWarning: 80.9% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
warnings.warn(msg, UserWarning)
/home/thcipriani/Projects/Wikimedia/train-stats/venv/lib/python3.11/site-packages/seaborn/categorical.py:3544: UserWarning: 82.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
warnings.warn(msg, UserWarning)
/home/thcipriani/Projects/Wikimedia/train-stats/venv/lib/python3.11/site-packages/seaborn/categorical.py:3544: UserWarning: 79.3% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
warnings.warn(msg, UserWarning)
It's hard to see the majority of our patches' cycle time with the outliers. Here's the log-scale version.
# Log-scale version so the typical cycle time is visible despite outliers.
fig, ax = plt.subplots(1, 1, sharey=True, figsize=(20, 10))
ax.set_yscale('log')
sns.violinplot(data=cycle, x='version', y='cycle_time_days', ax=ax)
ylabel = 'Time from patch creation to deploy (days)'
ax.set_ylabel(ylabel, labelpad=20.0)
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter("{x}"))
plt.suptitle('Cycle time of changes per version (log scale)', x=0.02, y=.92, ha='left', fontsize=25)
plt.show()
Backport
- v. To retroactively supply a fix, or a new feature, to a previous version of a software product at the same time (or after) supplying it to the current version.
- n. A commit that is backported
Each train has many backports. Each backport may be supplied to many trains. Backports add features, change feature flags, and fix bugs.
Some backports have tasks associated with them. Some tasks are "bugs" or "errors".
When a backport is associated with a task that is a "bug" or an "error" it's a bugfix. The number of bugfixes per train is a good signal of the number of bugs that were present in that train.
Bugs may persist for many trains; however, if a developer makes a backport, they felt that the bug was severe enough to warrant fixing immediately rather than waiting a week—that's signal about train quality, too.
# Bugfix backports per train, ordered by train start time.
train_bugs = pd.read_sql('''
select
version,
count(b.link) as bug_count
from
train t
join bug_train bt on bt.train_id = t.id
join bug b on bt.bug_id = b.id
group by
version
order by
start_time
''', engine)
train_bugs.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | bug_count | |
---|---|---|
0 | 1.27.0-wmf.16 | 3 |
1 | 1.27.0-wmf.19 | 1 |
2 | 1.27.0-wmf.21 | 1 |
3 | 1.27.0-wmf.22 | 2 |
4 | 1.27.0-wmf.23 | 3 |
Seems to follow the power law
# Histogram of bug counts per train; heavily skewed toward small counts.
train_bugs.hist()
plt.title('Bug count per train')
Text(0.5, 1.0, 'Bug count per train')
train_bugs.sort_values(by="bug_count", ascending=False).head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | bug_count | |
---|---|---|
120 | 1.34.0-wmf.13 | 13 |
90 | 1.32.0-wmf.24 | 11 |
206 | 1.37.0-wmf.20 | 11 |
194 | 1.37.0-wmf.5 | 11 |
119 | 1.34.0-wmf.11 | 11 |
# Re-query bug counts, this time alongside the other per-train metrics so
# their correlation with bug_count can be examined.
train_bugs = pd.read_sql('''
select
version,
count(b.link) as bug_count,
rollbacks,
(select count(*) from blocker b where b.train_id = t.id and resolved = 1) as resolved_blockers,
(select max(time_in_review) from patch p where p.train_id = t.id) as max_time_in_review,
(select max(comments) from patch where patch.train_id = t.id) as max_comments_per_patch,
(select max(start_time - created) from patch p where p.train_id = t.id) as max_cycle_time,
patches
from
train t
join bug_train bt on bt.train_id = t.id
join bug b on bt.bug_id = b.id
group by
version
order by
start_time
''', engine)
train_bugs.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | bug_count | rollbacks | resolved_blockers | max_time_in_review | max_comments_per_patch | max_cycle_time | patches | |
---|---|---|---|---|---|---|---|---|
0 | 1.27.0-wmf.16 | 3 | 1 | 0 | 17557313.0 | 95.0 | 18128243.0 | 322 |
1 | 1.27.0-wmf.19 | 1 | 1 | 1 | 58248774.0 | 45.0 | 58798558.0 | 230 |
2 | 1.27.0-wmf.21 | 1 | 0 | 0 | 120956985.0 | 73.0 | 120937246.0 | 180 |
3 | 1.27.0-wmf.22 | 2 | 0 | 0 | 50045244.0 | 58.0 | 50160003.0 | 416 |
4 | 1.27.0-wmf.23 | 3 | 2 | 3 | 20153065.0 | 40.0 | 20228209.0 | 168 |
# Correlation heatmap of bug counts against the other train metrics
# (version is a label, not a number, so it is dropped first).
numeric_bugs = train_bugs.drop(columns=['version'])
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(numeric_bugs.corr(), annot=True, cmap="YlGnBu",
            linewidths=0.3, annot_kws={"size": 8}, ax=ax)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()
train_bugs[train_bugs['version'] == TRAINS[-1]]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | bug_count | rollbacks | resolved_blockers | max_time_in_review | max_comments_per_patch | max_cycle_time | patches | |
---|---|---|---|---|---|---|---|---|
326 | 1.43.0-wmf.7 | 2 | 0 | 0 | 47093205.0 | 54.0 | 47545250.0 | 283 |
DCaro made an interesting comment on the fame blog about this repo. This is my ham-fisted investigation.
# Total review comments per train.
comm_dist = pd.read_sql('select version, sum(comments) as comm from patch p join train t on p.train_id = t.id group by t.version', engine)
comm_dist.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | comm | |
---|---|---|
0 | 1.27.0-wmf.16 | 1840 |
1 | 1.27.0-wmf.17 | 1372 |
2 | 1.27.0-wmf.18 | 1204 |
3 | 1.27.0-wmf.19 | 1440 |
4 | 1.27.0-wmf.20 | 1573 |
# Distribution of total patch comments per train.
comments_per_train = comm_dist.set_index('version')['comm']
comments_per_train.hist(figsize=(12, 10))
plt.xlabel("Patch Comments", labelpad=15)
plt.title("Comments per Train", y=1.02, fontsize=22)
Text(0.5, 1.02, 'Comments per Train')
# Per-patch comment counts.
pcommdf = pd.read_sql('select link, comments from patch', engine)
pcommdf.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
pcommdf.set_index('link')['comments'].hist(figsize=(12, 10),bins=100)
<Axes: >
pcommdf.describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
comments | |
---|---|
count | 119134.000000 |
mean | 3.524561 |
std | 5.192373 |
min | 0.000000 |
25% | 1.000000 |
50% | 2.000000 |
75% | 4.000000 |
max | 354.000000 |
# Let's try to remove huge outliers: keep patches whose comment count is
# within five standard deviations of the mean.
threshold = 5 * pcommdf.comments.std()
pcommdf[np.abs(pcommdf.comments - pcommdf.comments.mean()) <= threshold]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
118440 rows × 2 columns
# Histogram of comment counts with >5-sigma outliers removed.
within_5_sigma = np.abs(pcommdf.comments - pcommdf.comments.mean()) <= (5 * pcommdf.comments.std())
pcommdf[within_5_sigma].hist(bins=50, figsize=(10, 10))
plt.xlabel("Distribution of comments on patches", labelpad=15)
plt.title("Comments per Patch", y=1.02, fontsize=22)
Text(0.5, 1.02, 'Comments per Patch')
# Count of distinct patch owners (authors) per train, in train start order.
patch_authors = pd.read_sql('''
SELECT train.version,
COUNT(DISTINCT patch.owner) as unique_owners
FROM train
JOIN patch ON train.id = patch.train_id
GROUP BY train.id
ORDER BY start_time;
''', engine)
patch_authors.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | unique_owners | |
---|---|---|
0 | 1.27.0-wmf.16 | 65 |
1 | 1.27.0-wmf.17 | 60 |
2 | 1.27.0-wmf.18 | 61 |
3 | 1.27.0-wmf.19 | 67 |
4 | 1.27.0-wmf.20 | 71 |
# Unique patch authors per train, plotted in train order.
patch_authors.set_index('version')['unique_owners'].plot(linewidth=1)
plt.xticks(rotation=90)
(array([-50., 0., 50., 100., 150., 200., 250., 300., 350., 400.]),
[Text(-50.0, 0, '1.41.0-wmf.9'),
Text(0.0, 0, '1.27.0-wmf.16'),
Text(50.0, 0, '1.30.0-wmf.6'),
Text(100.0, 0, '1.32.0-wmf.23'),
Text(150.0, 0, '1.35.0-wmf.11'),
Text(200.0, 0, '1.36.0-wmf.32'),
Text(250.0, 0, '1.39.0-wmf.5'),
Text(300.0, 0, '1.41.0-wmf.7'),
Text(350.0, 0, '1.43.0-wmf.6'),
Text(400.0, 0, '')])
patch_authors.describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
unique_owners | |
---|---|
count | 352.000000 |
mean | 68.238636 |
std | 11.973850 |
min | 11.000000 |
25% | 60.000000 |
50% | 68.000000 |
75% | 75.000000 |
max | 108.000000 |
sns.jointplot(data=train_bugs, x='patches', y='resolved_blockers', kind='reg')
<seaborn.axisgrid.JointGrid at 0x7f570804ef50>
Every time we backport a fix, it counts as a bug. And it's a bug we missed before the code went to production. This is known as an "escape."
# One row per (train, bug) where the bug had an associated backport patch —
# i.e. bugs that "escaped" to production and were fixed via backport.
escapes = pd.read_sql('''
select
version,
b.link
from
bug_train bt
join train t on t.id = bt.train_id
join bug b on bt.bug_id = b.id
join bug_bug_patch bbp on bbp.bug_id = b.id
join bug_patch bp on bp.id = bbp.bug_patch_id
join bug_file bf on bp.id = bf.bug_patch_id
group by
version, b.link;
''', engine)
escapes.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | link | |
---|---|---|
0 | 1.27.0-wmf.16 | https://phabricator.wikimedia.org/T129641 |
1 | 1.27.0-wmf.16 | https://phabricator.wikimedia.org/T129704 |
2 | 1.27.0-wmf.16 | https://phabricator.wikimedia.org/T129715 |
3 | 1.27.0-wmf.19 | https://phabricator.wikimedia.org/T131283 |
4 | 1.27.0-wmf.21 | https://phabricator.wikimedia.org/T132645 |
escapes.version.value_counts()
version
1.34.0-wmf.13 13
1.37.0-wmf.20 11
1.32.0-wmf.24 11
1.34.0-wmf.11 11
1.37.0-wmf.5 11
..
1.31.0-wmf.11 1
1.31.0-wmf.1 1
1.30.0-wmf.9 1
1.35.0-wmf.40 1
1.31.0-wmf.4 1
Name: count, Length: 327, dtype: int64
# Per-version escape counts as a DataFrame, preserving encounter order
# (sort=False) so versions stay roughly chronological.
per_version_counts = escapes.version.value_counts(dropna=True, sort=False)
df_escapes = per_version_counts.rename_axis('version').to_frame('counts')
# sns.lineplot(df_escapes, x='version')
df_escapes.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
counts | |
---|---|
version | |
1.27.0-wmf.16 | 3 |
1.27.0-wmf.19 | 1 |
1.27.0-wmf.21 | 1 |
1.27.0-wmf.22 | 2 |
1.27.0-wmf.23 | 3 |
# Move the version index back into a regular column for plotting.
df_escapes = df_escapes.reset_index()
df_escapes.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | counts | |
---|---|---|
0 | 1.27.0-wmf.16 | 3 |
1 | 1.27.0-wmf.19 | 1 |
2 | 1.27.0-wmf.21 | 1 |
3 | 1.27.0-wmf.22 | 2 |
4 | 1.27.0-wmf.23 | 3 |
# Ensure plotting-friendly dtypes: numeric counts, string version labels.
df_escapes['counts'] = pd.to_numeric(df_escapes['counts'], errors='coerce')
df_escapes['version'] = df_escapes['version'].astype(str)
df_escapes.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
version | counts | |
---|---|---|
0 | 1.27.0-wmf.16 | 3 |
1 | 1.27.0-wmf.19 | 1 |
2 | 1.27.0-wmf.21 | 1 |
3 | 1.27.0-wmf.22 | 2 |
4 | 1.27.0-wmf.23 | 3 |
import numpy as np

# Trend of escapes per train version with a degree-1 (linear) fit.
# (The original built a "subset" via `unique()[::1]` and an `isin` filter
# that kept every row — no-op machinery, removed here.)
x = df_escapes['version']
y = df_escapes['counts']

# Create a figure and axis
fig, ax = plt.subplots(figsize=(12, 8))

# Plot the raw per-version counts
ax.plot(x, y, marker='o', linestyle='-', label='Data')

# Fit a straight line against the ordinal position of each version
coefficients = np.polyfit(np.arange(len(x)), y, deg=1)
trendline = np.poly1d(coefficients)
ax.plot(x, trendline(np.arange(len(x))), linestyle='--', label='Trend Line')

# Slope of the trend line: escapes gained (or lost) per successive version
slope = coefficients[0]

# Set labels and title
ax.set_xlabel('Version')
ax.set_ylabel('Counts')
ax.set_title(f'Trend Plot of Escapes by Version\nSlope: {slope:.2f} escapes per version')

# Reduce the number of x-axis tick labels so they stay readable
tick_positions = np.arange(0, len(x), step=20)  # Adjust step to taste
tick_labels = x[tick_positions]
plt.xticks(tick_positions, tick_labels, rotation=45)

# Display legend
ax.legend()
# Show the plot
plt.show()