Train Stats

 ----------------------------------------------------------------------------------------
/                                                                                        \
| /$$$$$$$$                 /$$            /$$$$$$   /$$                 /$$             |
||__  $$__/                |__/           /$$__  $$ | $$                | $$             |
|   | $$  /$$$$$$  /$$$$$$  /$$ /$$$$$$$ | $$  \__//$$$$$$    /$$$$$$  /$$$$$$   /$$$$$$$|
|   | $$ /$$__  $$|____  $$| $$| $$__  $$|  $$$$$$|_  $$_/   |____  $$|_  $$_/  /$$_____/|
|   | $$| $$  \__/ /$$$$$$$| $$| $$  \ $$ \____  $$ | $$      /$$$$$$$  | $$   |  $$$$$$ |
|   | $$| $$      /$$__  $$| $$| $$  | $$ /$$  \ $$ | $$ /$$ /$$__  $$  | $$ /$$\____  $$|
|   | $$| $$     |  $$$$$$$| $$| $$  | $$|  $$$$$$/ |  $$$$/|  $$$$$$$  |  $$$$//$$$$$$$/|
|   |__/|__/      \_______/|__/|__/  |__/ \______/   \___/   \_______/   \___/ |_______/ |
|                                                                                        |
|                                                                                        |
|                                                                                        |
|                                      🚂 ¯\_(ツ)_/¯?                                     |
\                                                                                        /
 ----------------------------------------------------------------------------------------
    \     
     \     
      \      
           ___ ____
         ⎛   ⎛ ,----
          \  //==--'
     _//|,.·//==--'    ____________________________
    _OO≣=-  ︶ ᴹw ⎞_§ ______  ___\ ___\ ,\__ \/ __ \
   (∞)_, )  (     |  ______/__  \/ /__ / /_/ / /_/ /
     ¨--¨|| |- (  / ______\____/ \___/ \__^_/  .__/
         ««_/  «_/ jgs/bd808                /_/

In which I look at data from the past several hundred trains and pretend that I know how to do exploratory data analysis.

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sqlalchemy import create_engine

with open('data/TRAINS') as f:
    TRAINS = [x.strip() for x in f.readlines()]

engine = create_engine('sqlite:///data/train.db')
df = pd.read_sql('''
SELECT
    version,
    rollbacks,
    rollbacks_time,
    group2_delay_days,
    (group0_delay_days +
     group1_delay_days +
     group2_delay_days) as total_delay,
    total_time as train_total_time,
    (select count(*) from blocker b where b.train_id = t.id) as blockers,
    (select count(*) from blocker b where b.train_id = t.id and resolved = 1) as resolved_blockers,
    patches,
    (select max(time_in_review) from patch p where p.train_id = t.id) as max_time_in_review,
    (select max(comments) from patch where patch.train_id = t.id) as max_comments_per_patch,
    (select max(start_time - created) from patch p where p.train_id = t.id) as max_cycle_time
FROM train t
''', engine)

# Makes your data 538% better...I think
plt.style.use('fivethirtyeight')
df.head()
version rollbacks rollbacks_time group2_delay_days total_delay train_total_time blockers resolved_blockers patches max_time_in_review max_comments_per_patch max_cycle_time
0 1.37.0-wmf.1 0 0 0 0 178349 5 3 450 36809044.0 27.0 36952873.0
1 1.37.0-wmf.3 3 94493 0 1 219880 7 6 366 56122286.0 30.0 56562620.0
2 1.37.0-wmf.4 1 66812 1 3 263742 9 4 422 38820872.0 29.0 38982601.0
3 1.36.0-wmf.1 0 0 4 4 519622 1 1 566 47181045.0 31.0 47755190.0
4 1.36.0-wmf.2 4 389769 4 5 554704 7 1 273 110996452.0 33.0 111569626.0
df_corr = df.copy(deep=True)
df_corr.drop(['version'], axis=1, inplace=True)
fig, ax = plt.subplots(figsize=(10,10))         # Sample figsize in inches
sns.heatmap(df_corr.corr(), annot=True, cmap="YlGnBu", linewidths=0.3, annot_kws={"size": 8}, ax=ax)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.title('Correlation of train variables')
plt.show()

[figure: heatmap of correlation of train variables]

df.set_index('version')['blockers'].hist(figsize=(12, 10))
plt.xlabel("Blockers", labelpad=15)
plt.title("Blockers per Train", y=1.02, fontsize=22)

[figure: histogram of blockers per train]

df.patches.plot(linewidth=1)
plt.xticks(rotation=90)

[figure: line plot of patch count per train]

df[df['blockers'] > 10].sort_values(by='blockers', ascending=False)
version rollbacks rollbacks_time group2_delay_days total_delay train_total_time blockers resolved_blockers patches max_time_in_review max_comments_per_patch max_cycle_time
82 1.34.0-wmf.20 1 16897 5 5 600096 20 11 413 60583935.0 37.0 60715205.0
242 1.38.0-wmf.19 1 70250 0 1 176801 18 10 319 85391889.0 24.0 85778365.0
103 1.33.0-wmf.22 0 0 0 1 63844 17 11 391 107411197.0 45.0 107659715.0
77 1.34.0-wmf.14 2 412678 4 5 524983 16 7 646 73502579.0 34.0 73539481.0
136 1.31.0-wmf.20 2 134534 1 5 255075 14 12 822 66099647.0 75.0 67210868.0
210 1.30.0-wmf.2 2 595079 0 0 782668 14 12 462 67958577.0 56.0 68943029.0
241 1.38.0-wmf.18 0 0 0 0 171956 13 7 287 186407187.0 43.0 186862129.0
124 1.32.0-wmf.22 0 0 0 0 173055 12 7 824 64869761.0 44.0 65392036.0
259 1.39.0-wmf.10 5 57256 0 1 172544 12 5 206 89212513.0 22.0 89427805.0
57 1.35.0-wmf.31 3 365118 4 15 516489 11 7 427 72215548.0 70.0 72206585.0
76 1.34.0-wmf.13 2 14912 0 1 183853 11 8 471 53208155.0 23.0 53820019.0
187 1.28.0-wmf.21 1 88476 0 1 176082 11 8 514 60386455.0 93.0 61360331.0
block_df = pd.read_sql('''
SELECT
    version,
    group_blocked
FROM train t
JOIN blocker b ON t.id = b.train_id
''', engine)
block_df.head()
version group_blocked
0 1.37.0-wmf.7 -1
1 1.37.0-wmf.7 -1
2 1.37.0-wmf.12 2
3 1.37.0-wmf.12 1
4 1.37.0-wmf.12 1
block_df.group_blocked.unique()
array([-1,  2,  1,  0])
group_name_map = {
    -1: "Earlier",
    0: "Group0",
    1: "Group1",
    2: "Group2",
}
block_df['blocker_added'] = block_df.group_blocked.map(group_name_map)
block_df.head()
version group_blocked blocker_added
0 1.37.0-wmf.7 -1 Earlier
1 1.37.0-wmf.7 -1 Earlier
2 1.37.0-wmf.12 2 Group2
3 1.37.0-wmf.12 1 Group1
4 1.37.0-wmf.12 1 Group1
block_df.group_blocked.value_counts()
group_blocked
-1    557
 1    422
 0    349
 2    143
Name: count, dtype: int64
block_df.version
0        1.37.0-wmf.7
1        1.37.0-wmf.7
2       1.37.0-wmf.12
3       1.37.0-wmf.12
4       1.37.0-wmf.12
            ...      
1466     1.43.0-wmf.6
1467     1.43.0-wmf.6
1468     1.43.0-wmf.6
1469     1.43.0-wmf.7
1470     1.43.0-wmf.7
Name: version, Length: 1471, dtype: object
block_df.set_index('version')
block_df.sort_values('group_blocked', inplace=True)


fig = plt.figure(figsize=(16,6))
plt.grid(color='white', lw=0.5, axis='x')
n, bins, patches = plt.hist(block_df.blocker_added, bins=4, rwidth=0.95)

xticks = [(bins[idx+1] + value)/2 for idx, value in enumerate(bins[:-1])]
xticks_labels = [ "{:.2f}\nto\n{:.2f}".format(value, bins[idx+1]) for idx, value in enumerate(bins[:-1])]
plt.xticks(xticks, labels=["Before group0", "Group0", "Group1", "Group2"])

# remove y ticks
plt.yticks([])

# plot values on top of bars
for idx, value in enumerate(n):
    if value > 0:
        plt.text(xticks[idx], value+5, int(value), ha='center')

plt.title('Train Blockers by Group Where They Were Discovered', loc='left', pad=30)
plt.show()

[figure: train blockers by group where they were discovered]

patches = pd.read_sql('''
SELECT
    link,
    version,
    submitted,
    insertions as ins,
    (deletions*-1) as del
FROM patch p JOIN train t ON t.id = p.train_id
''', engine)
patches.describe()
submitted ins del
count 1.191340e+05 1.191340e+05 1.191340e+05
mean 1.592731e+09 2.941666e+02 -2.334408e+02
std 7.264151e+07 5.028140e+04 4.821069e+04
min 1.431572e+09 0.000000e+00 -1.661412e+07
25% 1.535077e+09 2.000000e+00 -1.800000e+01
50% 1.589385e+09 7.000000e+00 -4.000000e+00
75% 1.655997e+09 3.400000e+01 -1.000000e+00
max 1.716881e+09 1.728860e+07 0.000000e+00
patches['loc'] = patches['ins'] + patches['del']
patches.head()
link version submitted ins del loc
0 https://gerrit.wikimedia.org/r/#/q/ccbfcf28,n,z 1.37.0-wmf.1 1618945759 5 -1 4
1 https://gerrit.wikimedia.org/r/#/q/3302274f,n,z 1.37.0-wmf.1 1618878371 1156 -660 496
2 https://gerrit.wikimedia.org/r/#/q/8b5471b5,n,z 1.37.0-wmf.1 1618343309 976 -3 973
3 https://gerrit.wikimedia.org/r/#/q/a6abbb67,n,z 1.37.0-wmf.1 1618341075 8 -29 -21
4 https://gerrit.wikimedia.org/r/#/q/af916aad,n,z 1.37.0-wmf.1 1618300868 7 -5 2
patches['submitted'] = pd.to_datetime(patches['submitted'], unit='s')
patches.set_index('submitted', inplace=True)
out = patches.groupby(pd.Grouper(freq='M')).apply(lambda x: x)
out = out[out['link'] != 'https://gerrit.wikimedia.org/r/#/q/9a08dbab,n,z'] # The one patch that inserts 17.2M lines of code
out['ok'] = out['loc'].cumsum()

Cycle time/Lead time

Cycle time is the time from when a patch enters code review until it's in production. Lead time is the time from commit to production.
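
To make those definitions concrete, here's a minimal sketch (not a cell from the original notebook) of how the two metrics map onto the patch timestamps used in the query below: created is when the patch entered review, submitted is when it was merged, and start_time is when its train rolled out.

# Minimal sketch: cycle/lead time from patch timestamps (unix epoch seconds),
# mirroring the (start_time - created) and (start_time - submitted)
# expressions in the SQL query that follows.
def cycle_and_lead_time(created, submitted, start_time):
    cycle_time = start_time - created     # entered review -> in production
    lead_time = start_time - submitted    # merged -> in production
    return cycle_time, lead_time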

# GOAL
#         train     lead_time    cycle_time   Id
# 0    1.37.0-wmf.6    200   2000   u1234
# 1    1.37.0-wmf.6    123   2800   u1235

cycle = pd.read_sql('''
SELECT
    substr(version, 8) as version,
    datetime(start_time, 'unixepoch'),
    (start_time - created) as cycle_time,
    (start_time - submitted) as lead_time,
    datetime(created, 'unixepoch'),
    datetime(submitted, 'unixepoch'),
    link
FROM patch p JOIN train t ON t.id = p.train_id
WHERE (lead_time > 0 AND cycle_time > 0)
    AND (
        version = "%(version_one)s" OR
        version = "%(version_two)s" OR
        version = "%(version_three)s"
    )
''' % {
    'version_one': TRAINS[0],
    'version_two': TRAINS[1],
    'version_three': TRAINS[2],
}, engine)
cycle.head()
version datetime(start_time, 'unixepoch') cycle_time lead_time datetime(created, 'unixepoch') datetime(submitted, 'unixepoch') link
0 wmf.5 2024-05-14 03:01:41 8195933 7497 2024-02-09 06:22:48 2024-05-14 00:56:44 https://gerrit.wikimedia.org/r/999129
1 wmf.5 2024-05-14 03:01:41 20163 7409 2024-05-13 21:25:38 2024-05-14 00:58:12 https://gerrit.wikimedia.org/r/1030971
2 wmf.5 2024-05-14 03:01:41 21159 19285 2024-05-13 21:09:02 2024-05-13 21:40:16 https://gerrit.wikimedia.org/r/1031000
3 wmf.5 2024-05-14 03:01:41 28359 26929 2024-05-13 19:09:02 2024-05-13 19:32:52 https://gerrit.wikimedia.org/r/1030949
4 wmf.5 2024-05-14 03:01:41 182643 30033 2024-05-12 00:17:38 2024-05-13 18:41:08 https://gerrit.wikimedia.org/r/1030546
cycle.sort_values(by='lead_time', ascending=False).head()
version datetime(start_time, 'unixepoch') cycle_time lead_time datetime(created, 'unixepoch') datetime(submitted, 'unixepoch') link
422 wmf.6 2024-05-21 03:01:39 3040057 589118 2024-04-15 22:34:02 2024-05-14 07:23:01 https://gerrit.wikimedia.org/r/1019841
421 wmf.6 2024-05-21 03:01:39 3036305 587585 2024-04-15 23:36:34 2024-05-14 07:48:34 https://gerrit.wikimedia.org/r/1019853
244 wmf.5 2024-05-14 03:01:41 1307246 586672 2024-04-28 23:54:15 2024-05-07 08:03:49 https://gerrit.wikimedia.org/r/1024975
618 wmf.6 2024-05-21 03:01:39 170491763 586440 2018-12-25 20:12:16 2024-05-14 08:07:39 https://gerrit.wikimedia.org/r/481318
420 wmf.6 2024-05-21 03:01:39 2817108 586156 2024-04-18 12:29:51 2024-05-14 08:12:23 https://gerrit.wikimedia.org/r/1021274

Lead time

The time from commit to deploy (in seconds)

cycle['lead_time_days'] = cycle['lead_time'] / (60*60*24)
cycle['cycle_time_days'] = cycle['cycle_time'] / (60*60*24)
cycle.sort_values(by='lead_time_days').head()
version datetime(start_time, 'unixepoch') cycle_time lead_time datetime(created, 'unixepoch') datetime(submitted, 'unixepoch') link lead_time_days cycle_time_days
249 wmf.5 2024-05-14 03:01:41 9529 1085 2024-05-14 00:22:52 2024-05-14 02:43:36 https://gerrit.wikimedia.org/r/1031040 0.012558 0.110289
156 wmf.5 2024-05-14 03:01:41 479204 1089 2024-05-08 13:54:57 2024-05-14 02:43:32 https://gerrit.wikimedia.org/r/1029111 0.012604 5.546343
518 wmf.6 2024-05-21 03:01:39 731263 2609 2024-05-12 15:53:56 2024-05-21 02:18:10 https://gerrit.wikimedia.org/r/1030588 0.030197 8.463692
638 wmf.6 2024-05-21 03:01:39 94900 2609 2024-05-20 00:39:59 2024-05-21 02:18:10 https://gerrit.wikimedia.org/r/1033700 0.030197 1.098380
115 wmf.5 2024-05-14 03:01:41 886376 5049 2024-05-03 20:48:45 2024-05-14 01:37:32 https://gerrit.wikimedia.org/r/1026948 0.058438 10.258981
from matplotlib import ticker as mticker
import numpy as np

# Adapted from <https://stackoverflow.com/a/60132262>
fig, ax = plt.subplots(1, 3, sharey=True, figsize=(20,10), constrained_layout=True)
plt.tight_layout(pad=5)
sns.violinplot(data=cycle,x='version', y='lead_time_days', ax=ax[0])
sns.swarmplot(data=cycle,x='version', y='lead_time_days', ax=ax[1])
sns.stripplot(data=cycle,x='version', y='lead_time_days', ax=ax[2])
ax[0].set_ylabel('Time from merge to deploy (days)', labelpad=20.0)
ax[1].set_ylabel('')
ax[2].set_ylabel('')
ax[0].yaxis.set_major_formatter(mticker.StrMethodFormatter("{x}"))

plt.suptitle('Lead time of changes per version', x=0.02, y=.92, ha='left', fontsize=25)
plt.show()

(matplotlib UserWarning: the figure layout has changed to tight)

[figure: violin, swarm, and strip plots of lead time of changes per version]

fig, ax = plt.subplots(1, 1, sharey=True, figsize=(20,10))
ax.set_yscale('log')
sns.violinplot(data=cycle,x='version', y='lead_time_days', ax=ax)
ax.set_ylabel('Time from merge to deploy (days)', labelpad=20.0)

ax.yaxis.set_major_formatter(mticker.StrMethodFormatter("{x}"))
plt.suptitle('Lead time of changes per version (log scale)', x=0.02, y=.92, ha='left', fontsize=25)
plt.show()

[figure: violin plot of lead time of changes per version (log scale)]

Cycle time

The time from patchset submission for code review to deploy

from matplotlib import ticker as mticker
import numpy as np

# Adapted from <https://stackoverflow.com/a/60132262>
fig, ax = plt.subplots(1, 3, sharey=True, figsize=(20,10))
sns.violinplot(data=cycle,x='version', y='cycle_time_days', ax=ax[0])
sns.swarmplot(data=cycle,x='version', y='cycle_time_days', ax=ax[1])
sns.stripplot(data=cycle,x='version', y='cycle_time_days', ax=ax[2])
ax[0].set_ylabel('Time from patch creation to deploy (days)', labelpad=20.0)
ax[1].set_ylabel('')
ax[2].set_ylabel('')
ax[0].yaxis.set_major_formatter(mticker.StrMethodFormatter("{x}"))
plt.suptitle('Cycle time of changes per version', x=0.02, y=.92, ha='left', fontsize=25)
plt.show()

(seaborn UserWarning, repeated for each panel: 74.6%–82.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.)

[figure: violin, swarm, and strip plots of cycle time of changes per version]

Cycle time log scale

The outliers make it hard to see the cycle time for the majority of our patches. Here's the log-scale version.

fig, ax = plt.subplots(1, 1, sharey=True, figsize=(20,10))
ax.set_yscale('log')
sns.violinplot(data=cycle,x='version', y='cycle_time_days', ax=ax)
ax.set_ylabel('Time from patch creation to deploy (days)', labelpad=20.0)
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter("{x}"))
plt.suptitle('Cycle time of changes per version (log scale)', x=0.02, y=.92, ha='left', fontsize=25)
plt.show()

[figure: violin plot of cycle time of changes per version (log scale)]

Train bugfixes

Backport

  • v. To retroactively supply a fix, or a new feature, to a previous version of a software product at the same time as (or after) supplying it to the current version.
  • n. A commit that is backported

Each train has many backports. Each backport may be supplied to many trains. Backports add features, change feature flags, and fix bugs.

Some backports have tasks associated with them. Some tasks are "bugs" or "errors".

When a backport is associated with a task that is a "bug" or an "error," it's a bugfix. The number of bugfixes per train is a good signal of the number of bugs present in that train.

Bugs may persist for many trains; however, if a developer makes a backport, they felt the bug was severe enough to warrant fixing immediately rather than waiting a week—that's a signal about train quality, too.
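
For anyone who prefers pandas to SQL, here's a rough, illustrative equivalent of the query below (it assumes the same train, bug, and bug_train tables used throughout this notebook):

# Sketch only: bugfix count per train via pandas merges instead of SQL joins.
trains = pd.read_sql('select id, version, start_time from train', engine)
links = pd.read_sql('select train_id, bug_id from bug_train', engine)
bugs = pd.read_sql('select id, link from bug', engine)

bugfixes_per_train = (
    links
    .merge(trains, left_on='train_id', right_on='id')
    .merge(bugs, left_on='bug_id', right_on='id', suffixes=('_train', '_bug'))
    .groupby('version', as_index=False)
    .agg(bug_count=('link', 'count'))
)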

train_bugs = pd.read_sql('''
select
  version,
  count(b.link) as bug_count
from
  train t
  join bug_train bt on bt.train_id = t.id
  join bug b on bt.bug_id = b.id
group by
  version
order by
  start_time
''', engine)
train_bugs.head()
version bug_count
0 1.27.0-wmf.16 3
1 1.27.0-wmf.19 1
2 1.27.0-wmf.21 1
3 1.27.0-wmf.22 2
4 1.27.0-wmf.23 3

Bug count histogram

Seems to follow a power law (there's a quick log-log check after the histogram below).

train_bugs.hist()
plt.title('Bug count per train')

[figure: histogram of bug count per train]
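
A quick, hand-wavy way to eyeball the power-law claim (a sketch, not a cell from the original notebook): plot how often each bug count occurs on log-log axes and check whether it looks roughly linear.

# Sketch: frequency of each bug_count on log-log axes; an approximately
# straight line is consistent with a power law.
bug_count_freq = train_bugs['bug_count'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(bug_count_freq.index, bug_count_freq.values)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Bugs per train')
ax.set_ylabel('Number of trains')
ax.set_title('Bug count frequency (log-log)')
plt.show()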

train_bugs.sort_values(by="bug_count", ascending=False).head()
version bug_count
120 1.34.0-wmf.13 13
90 1.32.0-wmf.24 11
206 1.37.0-wmf.20 11
194 1.37.0-wmf.5 11
119 1.34.0-wmf.11 11
train_bugs = pd.read_sql('''
select
  version,
  count(b.link) as bug_count,
  rollbacks,
  (select count(*) from blocker b where b.train_id = t.id and resolved = 1) as resolved_blockers,
      (select max(time_in_review) from patch p where p.train_id = t.id) as max_time_in_review,
    (select max(comments) from patch where patch.train_id = t.id) as max_comments_per_patch,
    (select max(start_time - created) from patch p where p.train_id = t.id) as max_cycle_time,
  patches
from
  train t
  join bug_train bt on bt.train_id = t.id
  join bug b on bt.bug_id = b.id
group by
  version
order by
  start_time
''', engine)
train_bugs.head()
version bug_count rollbacks resolved_blockers max_time_in_review max_comments_per_patch max_cycle_time patches
0 1.27.0-wmf.16 3 1 0 17557313.0 95.0 18128243.0 322
1 1.27.0-wmf.19 1 1 1 58248774.0 45.0 58798558.0 230
2 1.27.0-wmf.21 1 0 0 120956985.0 73.0 120937246.0 180
3 1.27.0-wmf.22 2 0 0 50045244.0 58.0 50160003.0 416
4 1.27.0-wmf.23 3 2 3 20153065.0 40.0 20228209.0 168
train_bugs_corr = train_bugs.copy(deep=True)
train_bugs_corr.drop(['version'], axis=1, inplace=True)
fig, ax = plt.subplots(figsize=(10,10))         # Sample figsize in inches
sns.heatmap(train_bugs_corr.corr(), annot=True, cmap="YlGnBu", linewidths=0.3, annot_kws={"size": 8}, ax=ax)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

[figure: heatmap of correlations between bug count and other train variables]

train_bugs[train_bugs['version'] == TRAINS[-1]]
version bug_count rollbacks resolved_blockers max_time_in_review max_comments_per_patch max_cycle_time patches
326 1.43.0-wmf.7 2 0 0 47093205.0 54.0 47545250.0 283

A look at comments per patch

DCaro made an interesting comment on the fame blog about this repo. This is my ham-fisted investigation.

comm_dist = pd.read_sql('select version, sum(comments) as comm from patch p join train t on p.train_id = t.id group by t.version', engine)
comm_dist.head()
version comm
0 1.27.0-wmf.16 1840
1 1.27.0-wmf.17 1372
2 1.27.0-wmf.18 1204
3 1.27.0-wmf.19 1440
4 1.27.0-wmf.20 1573
comm_dist.set_index('version')['comm'].hist(figsize=(12, 10))
plt.xlabel("Patch Comments", labelpad=15)
plt.title("Comments per Train", y=1.02, fontsize=22)

[figure: histogram of comments per train]

pcommdf = pd.read_sql('select link, comments from patch', engine)
pcommdf.head()
link comments
0 https://gerrit.wikimedia.org/r/#/q/ccbfcf28,n,z 1
1 https://gerrit.wikimedia.org/r/#/q/3302274f,n,z 1
2 https://gerrit.wikimedia.org/r/#/q/8b5471b5,n,z 0
3 https://gerrit.wikimedia.org/r/#/q/a6abbb67,n,z 15
4 https://gerrit.wikimedia.org/r/#/q/af916aad,n,z 3
pcommdf.set_index('link')['comments'].hist(figsize=(12, 10),bins=100)

[figure: histogram of comments per patch]

pcommdf.describe()
comments
count 119134.000000
mean 3.524561
std 5.192373
min 0.000000
25% 1.000000
50% 2.000000
75% 4.000000
max 354.000000
# Let's try to remove huge outliers
pcommdf[np.abs(pcommdf.comments - pcommdf.comments.mean()) <= (5 * pcommdf.comments.std())]
link comments
0 https://gerrit.wikimedia.org/r/#/q/ccbfcf28,n,z 1
1 https://gerrit.wikimedia.org/r/#/q/3302274f,n,z 1
2 https://gerrit.wikimedia.org/r/#/q/8b5471b5,n,z 0
3 https://gerrit.wikimedia.org/r/#/q/a6abbb67,n,z 15
4 https://gerrit.wikimedia.org/r/#/q/af916aad,n,z 3
... ... ...
119129 https://gerrit.wikimedia.org/r/1035369 1
119130 https://gerrit.wikimedia.org/r/1035368 1
119131 https://gerrit.wikimedia.org/r/1035367 1
119132 https://gerrit.wikimedia.org/r/1035366 1
119133 https://gerrit.wikimedia.org/r/1032418 2

118440 rows × 2 columns

pcommdf[np.abs(pcommdf.comments - pcommdf.comments.mean()) <= (5 * pcommdf.comments.std())].hist(bins=50,figsize=(10, 10))
plt.xlabel("Distribution of comments on patches", labelpad=15)
plt.title("Comments per Patch", y=1.02, fontsize=22)

[figure: histogram of comments per patch, outliers removed]

Patch authors per train

patch_authors = pd.read_sql('''
SELECT train.version,
    COUNT(DISTINCT patch.owner) as unique_owners
    FROM train
    JOIN patch ON train.id = patch.train_id
    GROUP BY train.id
    ORDER BY start_time;
''', engine)
patch_authors.head()
version unique_owners
0 1.27.0-wmf.16 65
1 1.27.0-wmf.17 60
2 1.27.0-wmf.18 61
3 1.27.0-wmf.19 67
4 1.27.0-wmf.20 71
patch_authors.set_index('version').unique_owners.plot(linewidth=1)
plt.xticks(rotation=90)

[figure: line plot of unique patch authors per train, 1.27.0-wmf.16 through 1.43.0-wmf.6]

patch_authors.describe()
unique_owners
count 352.000000
mean 68.238636
std 11.973850
min 11.000000
25% 60.000000
50% 68.000000
75% 75.000000
max 108.000000
sns.jointplot(data=train_bugs, x='patches', y='resolved_blockers', kind='reg')

[figure: joint plot of patches vs. resolved_blockers with regression line]

Bugs escaping into production

Every time we backport a fix, it counts as a bug, and it's a bug we missed before the code went to production. This is known as an "escape."

escapes = pd.read_sql('''
select
  version,
  b.link
from
  bug_train bt
  join train t on t.id = bt.train_id
  join bug b on bt.bug_id = b.id
  join bug_bug_patch bbp on bbp.bug_id = b.id
  join bug_patch bp on bp.id = bbp.bug_patch_id
  join bug_file bf on bp.id = bf.bug_patch_id
group by
  version, b.link;
''', engine)
escapes.head()
version link
0 1.27.0-wmf.16 https://phabricator.wikimedia.org/T129641
1 1.27.0-wmf.16 https://phabricator.wikimedia.org/T129704
2 1.27.0-wmf.16 https://phabricator.wikimedia.org/T129715
3 1.27.0-wmf.19 https://phabricator.wikimedia.org/T131283
4 1.27.0-wmf.21 https://phabricator.wikimedia.org/T132645
escapes.version.value_counts()
version
1.34.0-wmf.13    13
1.37.0-wmf.20    11
1.32.0-wmf.24    11
1.34.0-wmf.11    11
1.37.0-wmf.5     11
                 ..
1.31.0-wmf.11     1
1.31.0-wmf.1      1
1.30.0-wmf.9      1
1.35.0-wmf.40     1
1.31.0-wmf.4      1
Name: count, Length: 327, dtype: int64
df_escapes = escapes.version.value_counts(dropna=True, sort=False).rename_axis('version').to_frame('counts')
# sns.lineplot(df_escapes, x='version')
df_escapes.head()
counts
version
1.27.0-wmf.16 3
1.27.0-wmf.19 1
1.27.0-wmf.21 1
1.27.0-wmf.22 2
1.27.0-wmf.23 3
df_escapes = df_escapes.reset_index()
df_escapes.head()
version counts
0 1.27.0-wmf.16 3
1 1.27.0-wmf.19 1
2 1.27.0-wmf.21 1
3 1.27.0-wmf.22 2
4 1.27.0-wmf.23 3
df_escapes['counts'] = pd.to_numeric(df_escapes['counts'], errors='coerce')
df_escapes['version'] = df_escapes['version'].astype(str)
df_escapes.head()
version counts
0 1.27.0-wmf.16 3
1 1.27.0-wmf.19 1
2 1.27.0-wmf.21 1
3 1.27.0-wmf.22 2
4 1.27.0-wmf.23 3
import numpy as np

# Select a subset of versions to display
subset_versions = df_escapes['version'].unique()[::1]  # Every version...just make a copy

# Filter the DataFrame to include only the subset of versions
df_subset = df_escapes[df_escapes['version'].isin(subset_versions)]

# Extract x and y values from the DataFrame
x = df_subset['version']
y = df_subset['counts']

# Create a figure and axis
fig, ax = plt.subplots(figsize=(12, 8))

# Plot the line plot
ax.plot(x, y, marker='o', linestyle='-', label='Data')

# Calculate the trend line using polynomial regression (degree=1)
coefficients = np.polyfit(np.arange(len(x)), y, deg=1)
trendline = np.poly1d(coefficients)
ax.plot(x, trendline(np.arange(len(x))), linestyle='--', label='Trend Line')

# Calculate the slope of the trend line
slope = coefficients[0]

# Set labels and title
ax.set_xlabel('Version')
ax.set_ylabel('Counts')
ax.set_title(f'Trend Plot of Escapes by Version\nSlope: {slope:.2f} escapes per version')

# Reduce the number of x-axis tick labels
tick_positions = np.arange(0, len(x), step=20)  # Adjust the step value to show desired number of labels
tick_labels = x[tick_positions]
plt.xticks(tick_positions, tick_labels, rotation=45)

# Display legend
ax.legend()

# Show the plot
plt.show()

[figure: trend plot of escapes by version]