Ejemplo n.º 1
0
def test_empty_breaks():
    """Each formatter, built with default arguments, maps [] to []."""
    no_breaks = []
    # Order mirrors the sequence in which the formatters are exercised.
    formatter_factories = (
        custom_format,
        comma_format,
        currency_format,
        percent_format,
        scientific_format,
        date_format,
        mpl_format,
        log_format,
        timedelta_format,
    )
    for make_formatter in formatter_factories:
        assert make_formatter()(no_breaks) == []
Ejemplo n.º 2
0
def test_timedelta_format():
    """Check timedelta_format labels with no units, fixed units and usetex."""
    # No units given: stdlib timedeltas spaced one week apart.
    weekly = [timedelta(days=7 * i) for i in range(5)]
    week_labels = timedelta_format()(weekly)
    assert week_labels == ['0', '1 week', '2 weeks', '3 weeks', '4 weeks']

    # No units given: pandas Timedeltas spaced ten minutes apart.
    ten_minute_steps = [pd.Timedelta(seconds=600 * i) for i in range(5)]
    minute_labels = timedelta_format()(ten_minute_steps)
    assert minute_labels == [
        '0', '10 minutes', '20 minutes', '30 minutes', '40 minutes',
    ]

    # Explicit units: the same ten-minute steps expressed in hours.
    hour_labels = timedelta_format(units='h')(ten_minute_steps)
    assert hour_labels == [
        '0', '0.1667 hours', '0.3333 hours', '0.5000 hours', '0.6667 hours',
    ]

    # usetex: microsecond values carry a TeX-markup unit suffix.
    micro_steps = [timedelta(microseconds=7 * i) for i in range(5)]
    tex_labels = timedelta_format(units='us', usetex=True)(micro_steps)
    assert tex_labels == [
        '0', '7$\\mu s$', '14$\\mu s$', '21$\\mu s$', '28$\\mu s$',
    ]
Ejemplo n.º 3
0
def test_empty_breaks():
    """An empty list of breaks must produce an empty list of labels."""
    empty = []
    # Formatter factories under test, each called with default arguments.
    factories = [
        custom_format,
        comma_format,
        currency_format,
        percent_format,
        scientific_format,
        date_format,
        mpl_format,
        log_format,
        timedelta_format,
    ]
    for factory in factories:
        formatter = factory()
        assert formatter(empty) == []
Ejemplo n.º 4
0
def test_timedelta_format():
    """Exercise timedelta_format without units, with units='h', and with usetex."""
    # Without units: weekly stdlib timedeltas.
    week_breaks = [timedelta(days=7 * i) for i in range(5)]
    expected_weeks = ['0', '1 week', '2 weeks', '3 weeks', '4 weeks']
    assert timedelta_format()(week_breaks) == expected_weeks

    # Without units: pandas Timedeltas in 600-second increments.
    minute_breaks = [pd.Timedelta(seconds=600 * i) for i in range(5)]
    expected_minutes = [
        '0', '10 minutes', '20 minutes', '30 minutes', '40 minutes',
    ]
    assert timedelta_format()(minute_breaks) == expected_minutes

    # specific units
    expected_hours = [
        '0', '0.1667 hours', '0.3333 hours', '0.5000 hours', '0.6667 hours',
    ]
    assert timedelta_format(units='h')(minute_breaks) == expected_hours

    # usetex
    micro_breaks = [timedelta(microseconds=7 * i) for i in range(5)]
    expected_tex = [
        '0', '7$\\mu s$', '14$\\mu s$', '21$\\mu s$', '28$\\mu s$',
    ]
    formatter = timedelta_format(units='us', usetex=True)
    assert formatter(micro_breaks) == expected_tex
Ejemplo n.º 5
0
])
y_line = x_line * results_2.slope + results_2.intercept

# Box plot of time_to_published for each version_count, overlaid with a
# dashed smoothed line (method="lm") and a text annotation showing the
# slope/intercept taken from results_2.
g = (
    p9.ggplot(
        published_date_distances,
        p9.aes(x="factor(version_count)", y="time_to_published"),
    )
    + p9.geom_boxplot(fill="#a6cee3")
    # The line layer supplies its own aesthetics (inherit_aes=False) and
    # maps the raw version_count rather than the factor used by the boxes.
    + p9.geom_line(
        mapping=p9.aes(x="version_count", y="time_to_published"),
        stat="smooth",
        method="lm",
        linetype="dashed",
        se=False,
        alpha=1,
        size=0.7,
        inherit_aes=False,
    )
    # The y values are timedeltas; label the axis using "d" units.
    + p9.scale_y_timedelta(labels=timedelta_format("d"))
    + p9.annotate(
        "text",
        x=9,
        y=timedelta(days=1470),
        label=f"Y={results_2.slope:.2f}*X+{results_2.intercept:.2f}",
    )
    + p9.labs(
        x="# of Preprint Versions",
        y="Time Elapsed Until Preprint is Published",
    )
    + p9.theme_seaborn(
        context="paper", style="ticks", font="Arial", font_scale=1.3
    )
)
# g.save("output/version_count_vs_publication_time.svg", dpi=500)
# g.save("output/version_count_vs_publication_time.png", dpi=500)
print(g)

plt.figure(figsize=(8, 5))
g = sns.boxenplot(
    x="version_count",
    y="days_to_published",
median_ci_l, median_ci_u = median_ci.values.flatten()
median_ci_l, median_ci_u

# In[9]:

# Flatten the fitted survival function into a regular frame and tag every
# row with one constant label so the plot below can map colour to it.
overall_preprint_survival = (
    kmf.survival_function_
    .reset_index()
    .assign(label="all_papers")
)
overall_preprint_survival.head()

# In[10]:

# Plot KM_estimate against the timeline, with the y axis fixed to [0, 1].
g = (
    p9.ggplot(
        # timeline is converted to timedeltas (unit "D") so that the
        # timedelta x scale can label it.
        overall_preprint_survival.assign(
            timeline=lambda x: pd.to_timedelta(x.timeline, "D")
        ),
        p9.aes(x="timeline", y="KM_estimate", color="label"),
    )
    + p9.scale_x_timedelta(labels=timedelta_format("d"))
    + p9.geom_line()
    + p9.ylim(0, 1)
)
print(g)

# # Calculate Category Survival Function

# This section measures how long it takes for certain categories to get preprints published.

# In[11]:

entire_preprint_df = pd.DataFrame(
    [], columns=["timeline", "KM_estimate", "category"])
half_life = []
for cat, grouped_df in preprints_w_published_dates.groupby("category"):
    temp_df = preprints_w_published_dates.query(f"category=='{cat}'")
    kmf.fit(
Ejemplo n.º 7
0
# Unpack the flattened values of median_ci into two names.
# NOTE(review): the names suggest lower/upper confidence bounds of a median;
# median_ci is defined outside this fragment -- confirm.
median_ci_l, median_ci_u = median_ci.values.flatten()
# Bare expression: only displays the pair when run as a notebook cell.
median_ci_l, median_ci_u

# Flatten the fitted survival function into a regular frame and tag every
# row with one constant label so the plot below can map colour to it.
overall_preprint_survival = kmf.survival_function_.reset_index().assign(
    label="all_papers"
)
overall_preprint_survival.head()

# Plot KM_estimate against the timeline, with the y axis fixed to [0, 1].
g = (
    p9.ggplot(
        # timeline is converted to timedeltas (unit "D") so that the
        # timedelta x scale can label it.
        overall_preprint_survival.assign(
            timeline=lambda x: pd.to_timedelta(x.timeline, "D")
        ),
        p9.aes(x="timeline", y="KM_estimate", color="label"),
    )
    + p9.scale_x_timedelta(labels=timedelta_format("d"))
    + p9.geom_line()
    + p9.ylim(0, 1)
)
print(g)

# # Calculate Category Survival Function

# This section measures how long it takes for certain categories to get preprints published.

entire_preprint_df = pd.DataFrame([], columns=["timeline", "KM_estimate", "category"])
half_life = []
for cat, grouped_df in preprints_w_published_dates.groupby("category"):
    temp_df = preprints_w_published_dates.query(f"category=='{cat}'")
    kmf.fit(
        temp_df["time_to_published"].dt.total_seconds() / 60 / 60 / 24,