def validate_assembled_dataframe(df, endeavor, stint):
    """Sanity-check an assembled dataframe against expected metadata.

    Asserts that each series/stint combination appears exactly once and
    that every row belongs to the expected endeavor and stint.
    """
    # one row per series/stint pair: grouping must not collapse any rows
    num_series_stint_groups = len(df.groupby(['Series', 'Stint']))
    assert num_series_stint_groups == len(df)

    # series numbers encode the endeavor in their thousands place
    assert ip.pophomogeneous(df['Series'] // 1000) == endeavor
    assert ip.pophomogeneous(df['Stint']) == stint
def draw_plots(measurement, df):
    """Plot *measurement* by thread count and implementation.

    Saves one bar plot plus two box plots (with and without outlier
    fliers) as png files named via ``kn.pack``.
    """
    # shared across all plots for consistent hue assignment and filenames
    implementations = sorted(df['Implementation'].unique())
    time_type = ip.pophomogeneous(df['time_type'])

    # bar plot
    grid = facet(df).map(
        sns.barplot,
        'Threads',
        measurement,
        'Implementation',
        hue_order=implementations,
        palette=sns.color_palette(),
    )
    grid.add_legend()
    plt.savefig(
        kn.pack({
            'measurement': slugify(measurement),
            'time_type': time_type,
            'ext': '.png',
        }),
        transparent=True,
        dpi=300,
    )
    plt.clf()

    # box plots, once including outlier fliers and once without
    for showfliers in (True, False):
        grid = facet(df).map(
            sns.boxplot,
            'Threads',
            measurement,
            'Implementation',
            hue_order=implementations,
            palette=sns.color_palette(),
            showfliers=showfliers,
        )
        grid.add_legend()
        plt.savefig(
            kn.pack({
                'fliers': showfliers,
                'measurement': slugify(measurement),
                'time_type': time_type,
                'ext': '.png',
            }),
            transparent=True,
            dpi=300,
        )
        plt.clf()
def make_output_dir():
    """Return the path of the ``concat`` output subdirectory.

    The directory containing the source files (``sys.argv[1:]``) must be
    identical across all sources.
    """
    parent_dirs = [
        os.path.dirname(os.path.abspath(source))
        for source in sys.argv[1:]
    ]
    # directory all sources were located in; must be identical across sources
    source_dir = ip.pophomogeneous(parent_dirs)

    # put output files into concat subdirectory
    return os.path.join(source_dir, 'concat')
return res df_stitched = reduce( merge_pair, dataframes, ) print(f'merged dataframe has {len(df_stitched.index)} rows') # there should only be one entry for each series/stint assert len(df_stitched.groupby([ 'Series', 'Stint', ])) == len(df_stitched) assert ip.pophomogeneous(df_stitched['Stint']) == stint assert ip.pophomogeneous(df_stitched['Series'] // 1000) == endeavor ################################################################################ print() print('calculating upload path') print('---------------------------------------------------------------------') ################################################################################ # common_keys = set.intersection(*[ # set( kn.unpack(source).keys() ) # for source in sources # ]) out_filename = kn.pack({ # **{
'Number Unique Module Expression Profiles', 'Number Unique Module Regulation Profiles', 'Num Instructions Executed per Live Cardinal-update', 'Mean Resource Received Per Cell', 'Resource Receiving Cell Fraction', 'Fraction Deaths apoptosis', 'Fraction Deaths elimination', 'Nulliparous Fraction', 'Mean Kin Group Size Level 0', 'Mean Kin Group Size Level 1', ]: df[target] = df[target + ' (monoculture mean)'] ################################################################################ mutation_str = ip.pophomogeneous( df['MUTATION_RATE'].dropna() ) mutation_items = mutation_str.split() assert mutation_items[0] == '[' and mutation_items[-1] == ']' mutation_rates = list(map( float, mutation_items[1:-1] )) nlev = len(mutation_rates) - 1 df['Expected Mutations'] = sum( df[f'Elapsed Generations Level {lev}'] * rate for lev, rate in enumerate(mutation_rates) ) df['Mutation Accumulation'] = df['Elapsed Mutations'] / df['Expected Mutations'] ################################################################################ df['Elapsed Generations'] = sum(
def tabulate_fitness_complexity(variant_df, control_fits_df):
    """Tally variant sites flagged as advantageous or deleterious.

    For each competition series, compares each wildtype-vs-variant
    competition's Fitness Differential against the fitted control (null)
    distribution from `control_fits_df` and counts flags at the
    p < 0.01 threshold in each tail.

    Returns a DataFrame with one row per competition series, including
    the expected number of false-positive flags and the resulting
    estimated true site counts.
    """
    # count competitions where both strains went extinct simultaneously
    # as 0 Fitness Differential
    na_rows = variant_df['Fitness Differential'].isna()
    assert all(variant_df[na_rows]['Population Extinct'])

    # BUGFIX: the original chained inplace fillna mutated the caller's
    # dataframe and is deprecated/ineffective under pandas copy-on-write;
    # fill on a local copy instead
    variant_df = variant_df.copy()
    variant_df['Fitness Differential'] = (
        variant_df['Fitness Differential'].fillna(0)
    )

    res = []
    for series in variant_df['Competition Series'].unique():

        series_df = variant_df[variant_df['Competition Series'] == series]
        wt_vs_variant_df = series_df[
            series_df['genome variation'] != 'master'
        ].reset_index()

        # null-hypothesis fit parameters for this series
        h0_fit = ip.popsingleton(
            control_fits_df[control_fits_df['Series'] == series].to_dict(
                orient='records',
            )
        )

        # calculate the probability of observing fitness differential result
        # under control data distribution
        if len(wt_vs_variant_df):
            wt_vs_variant_df['p'] = wt_vs_variant_df.apply(
                lambda row: stats.t.cdf(
                    row['Fitness Differential'],
                    h0_fit['Fit Degrees of Freedom'],
                    loc=h0_fit['Fit Loc'],
                    scale=h0_fit['Fit Scale'],
                ),
                axis=1,
            )
        else:
            # special case for an empty dataframe
            # to prevent an exception
            wt_vs_variant_df['p'] = []

        p_thresh = 1.0 / 100
        num_more_fit_variants = (wt_vs_variant_df['p'] > 1 - p_thresh).sum()
        num_less_fit_variants = (wt_vs_variant_df['p'] < p_thresh).sum()
        # how many flags chance alone would produce at this threshold
        expected_false_positives = len(wt_vs_variant_df) * p_thresh

        res.append({
            'Stint': ip.pophomogeneous(variant_df['Competition Stint']),
            'Series': series,
            'Flagged Advantageous Sites': num_less_fit_variants,
            'Flagged Deleterious Sites': num_more_fit_variants,
            'H_0 Advantageous Site Flags': expected_false_positives,
            'H_0 Deleterious Site Flags': expected_false_positives,
            'Estimated True Advantageous Sites':
                num_less_fit_variants - expected_false_positives,
            'Estimated True Deleterious Sites':
                num_more_fit_variants - expected_false_positives,
        })

    return pd.DataFrame(res)
# plot regressions used to estimate overhead and latency for threads, chunk in df.groupby('Threads', ): plot_regression( chunk, 'Work', 'Time', extra_names={'threads': threads}, ) # extract estimates of overhead and latency res = res.append( pd.DataFrame([{ 'Parameter': '{} @ {} Threads'.format(name, threads), 'Lower Bound': ip.pophomogeneous(chunk['Lower Bound ' + name]), 'Upper Bound': ip.pophomogeneous(chunk['Upper Bound ' + name]), 'Estimate': ip.pophomogeneous(chunk['Estimated ' + name]), } for threads, chunk in df.groupby('Threads') for name in ('Overhead', 'Latency')])) # consolidate and save computed estimates and bounds res.sort_values([ 'Parameter', ]).to_csv( kn.pack({ 'title': 'parameter_estimates', 'synchronous': str(synchronous), 'ext': '.csv',