def validate_assembled_dataframe(df, endeavor, stint):
    """Sanity-check an assembled dataframe against the expected ids.

    Raises AssertionError if any (Series, Stint) pair occurs more than
    once, or if the Series/Stint columns disagree with `endeavor`/`stint`.
    """
    # each (Series, Stint) pair must appear exactly once
    assert df.groupby(['Series', 'Stint']).ngroups == len(df)

    # the thousands place of every Series value encodes the endeavor id
    assert ip.pophomogeneous(df['Series'] // 1000) == endeavor
    # every row must belong to the requested stint
    assert ip.pophomogeneous(df['Stint']) == stint
Example #2
0
def draw_plots(measurement, df):
    """Render and save faceted plots of `measurement` versus thread count.

    Produces one barplot and two boxplots (with and without outlier
    fliers), each hued by Implementation and saved as a kn-packed .png.

    Parameters
    ----------
    measurement : str
        Name of the dataframe column to plot on the y axis.
    df : pd.DataFrame
        Data with 'Threads', 'Implementation', 'time_type', and
        `measurement` columns; 'time_type' must be homogeneous.
    """
    # hoist values shared by every plot out of the per-plot helper
    hue_order = sorted(df['Implementation'].unique())
    time_type = ip.pophomogeneous(df['time_type'])

    def _render(plotter, extra_keys, **plot_kwargs):
        # draw one faceted plot, save it under a kn-packed filename,
        # then clear the figure so the next plot starts fresh
        facet(df).map(
            plotter,
            'Threads',
            measurement,
            'Implementation',
            hue_order=hue_order,
            palette=sns.color_palette(),
            **plot_kwargs,
        ).add_legend()

        plt.savefig(
            kn.pack({
                **extra_keys,
                'measurement': slugify(measurement),
                'time_type': time_type,
                'ext': '.png',
            }),
            transparent=True,
            dpi=300,
        )

        plt.clf()

    _render(sns.barplot, {})

    for showfliers in True, False:
        _render(
            sns.boxplot,
            {'fliers': showfliers},
            showfliers=showfliers,
        )
Example #3
0
def make_output_dir():
    """Return the path of the 'concat' output subdirectory.

    All source files passed on the command line (sys.argv[1:]) must
    reside in one and the same directory; output goes into a 'concat'
    subdirectory of that location.
    """
    # resolve each source to its containing directory;
    # pophomogeneous enforces that all sources agree on it
    source_dir = ip.pophomogeneous(
        os.path.dirname(os.path.abspath(source))
        for source in sys.argv[1:]
    )

    # put output files into concat subdirectory
    return os.path.join(source_dir, 'concat')
Example #4
0
    return res


# stitch all per-source dataframes together via successive pairwise merges
df_stitched = reduce(
    merge_pair,
    dataframes,
)

print(f'merged dataframe has {len(df_stitched.index)} rows')

# there should only be one entry for each series/stint
assert len(df_stitched.groupby([
    'Series',
    'Stint',
])) == len(df_stitched)
# every row must belong to the expected stint...
assert ip.pophomogeneous(df_stitched['Stint']) == stint
# ...and the thousands place of Series must encode the expected endeavor
assert ip.pophomogeneous(df_stitched['Series'] // 1000) == endeavor

################################################################################
print()
print('calculating upload path')
print('---------------------------------------------------------------------')
################################################################################

# common_keys = set.intersection(*[
#     set( kn.unpack(source).keys() )
#     for source in sources
# ])

# NOTE(review): this span appears garbled — it reads like two unrelated
# fragments fused together: the opening of a `kn.pack({...})` call assigned
# to `out_filename`, and the tail of a `for target in [...]:` loop that
# copies monoculture-mean columns into their base-named counterparts.
# As written it is not valid Python; recover the original sources
# before relying on this section.
out_filename = kn.pack({
    # **{
    'Number Unique Module Expression Profiles',
    'Number Unique Module Regulation Profiles',
    'Num Instructions Executed per Live Cardinal-update',
    'Mean Resource Received Per Cell',
    'Resource Receiving Cell Fraction',
    'Fraction Deaths apoptosis',
    'Fraction Deaths elimination',
    'Nulliparous Fraction',
    'Mean Kin Group Size Level 0',
    'Mean Kin Group Size Level 1',
]:
    df[target] = df[target + ' (monoculture mean)']

################################################################################

# MUTATION_RATE is stored as a single bracketed, whitespace-separated
# string (the asserts below require a leading '[' and trailing ']' token)
mutation_str = ip.pophomogeneous( df['MUTATION_RATE'].dropna() )
mutation_items = mutation_str.split()
assert mutation_items[0] == '[' and mutation_items[-1] == ']'

# strip the bracket tokens and parse one float rate per hierarchy level
mutation_rates = list(map( float, mutation_items[1:-1] ))
nlev = len(mutation_rates) - 1  # presumably index of deepest level — verify downstream use

# expected mutation count: per-level elapsed generations weighted by
# that level's mutation rate, summed across levels
df['Expected Mutations'] = sum(
    df[f'Elapsed Generations Level {lev}'] * rate
    for lev, rate in enumerate(mutation_rates)
)
# observed-to-expected mutation ratio
df['Mutation Accumulation'] = df['Elapsed Mutations'] / df['Expected Mutations']

################################################################################

df['Elapsed Generations'] = sum(
def tabulate_fitness_complexity(variant_df, control_fits_df):
    """Tabulate per-series counts of sites flagged as fitness-affecting.

    For each competition series, compares wild-type-vs-variant fitness
    differentials against the series' fitted control (null) t-distribution
    and flags variants falling in either 1% tail.

    Parameters
    ----------
    variant_df : pd.DataFrame
        Competition results with 'Fitness Differential',
        'Population Extinct', 'Competition Series', 'Competition Stint',
        and 'genome variation' columns. The 'Fitness Differential'
        column is filled in place (NaN -> 0).
    control_fits_df : pd.DataFrame
        One row per series with 'Series', 'Fit Degrees of Freedom',
        'Fit Loc', and 'Fit Scale' columns.

    Returns
    -------
    pd.DataFrame
        One row per competition series with flagged site counts and
        false-positive-adjusted estimates.
    """
    # count competitions where both strains went extinct simultaneously
    # as 0 Fitness Differential
    na_rows = variant_df['Fitness Differential'].isna()
    assert all(variant_df[na_rows]['Population Extinct'])
    # assign back rather than inplace fillna on a column selection: the
    # inplace form is deprecated and ineffective under pandas copy-on-write
    variant_df['Fitness Differential'] = (
        variant_df['Fitness Differential'].fillna(0)
    )

    res = []
    for series in variant_df['Competition Series'].unique():

        series_df = variant_df[variant_df['Competition Series'] == series]

        # exclude rows whose variation is 'master' — presumably
        # unvaried wild-type self-competitions; confirm against caller
        wt_vs_variant_df = series_df[
            series_df['genome variation'] != 'master'].reset_index()

        # fitted null-distribution parameters for this series
        h0_fit = ip.popsingleton(
            control_fits_df[control_fits_df['Series'] == series].to_dict(
                orient='records', ))

        # calculate the probability of observing fitness differential result
        # under control data distribution
        if len(wt_vs_variant_df):
            wt_vs_variant_df['p'] = wt_vs_variant_df.apply(
                lambda row: stats.t.cdf(
                    row['Fitness Differential'],
                    h0_fit['Fit Degrees of Freedom'],
                    loc=h0_fit['Fit Loc'],
                    scale=h0_fit['Fit Scale'],
                ),
                axis=1,
            )
        else:
            # special case for an empty dataframe
            # to prevent an exception
            wt_vs_variant_df['p'] = []

        # two-tailed flagging at the 1% level
        p_thresh = 1.0 / 100
        num_more_fit_variants = (wt_vs_variant_df['p'] > 1 - p_thresh).sum()
        num_less_fit_variants = (wt_vs_variant_df['p'] < p_thresh).sum()

        # expected number of chance flags per tail under the null
        expected_false_positives = len(wt_vs_variant_df) * p_thresh

        res.append({
            'Stint':
            ip.pophomogeneous(variant_df['Competition Stint']),
            'Series':
            series,
            'Flagged Advantageous Sites':
            num_less_fit_variants,
            'Flagged Deleterious Sites':
            num_more_fit_variants,
            'H_0 Advantageous Site Flags':
            expected_false_positives,
            'H_0 Deleterious Site Flags':
            expected_false_positives,
            'Estimated True Advantageous Sites':
            num_less_fit_variants - expected_false_positives,
            'Estimated True Deleterious Sites':
            num_more_fit_variants - expected_false_positives,
        })

    return pd.DataFrame(res)
Example #7
0
# plot regressions used to estimate overhead and latency
for threads, chunk in df.groupby('Threads'):
    plot_regression(
        chunk,
        'Work',
        'Time',
        extra_names={'threads': threads},
    )

# extract estimates of overhead and latency
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
res = pd.concat([
    res,
    pd.DataFrame([{
        'Parameter':
        '{} @ {} Threads'.format(name, threads),
        'Lower Bound':
        ip.pophomogeneous(chunk['Lower Bound ' + name]),
        'Upper Bound':
        ip.pophomogeneous(chunk['Upper Bound ' + name]),
        'Estimate':
        ip.pophomogeneous(chunk['Estimated ' + name]),
    } for threads, chunk in df.groupby('Threads')
                  for name in ('Overhead', 'Latency')]),
])

# consolidate and save computed estimates and bounds
res.sort_values([
    'Parameter',
]).to_csv(
    kn.pack({
        'title': 'parameter_estimates',
        'synchronous': str(synchronous),
        'ext': '.csv',