def avgPrefs(prefsfiles):
    """Average site-specific preferences across several CSV files.

    Args:
        `prefsfiles` (list)
            List of CSV files containing preferences, must all be
            for same sites and characters.

    Returns:
        A `pandas.DataFrame` holding the mean of the preferences in
        `prefsfiles`. `site` is an ordinary column of the returned
        frame and the rows are in natural sort order of `site`; the
        row labels are not renumbered after sorting.

    >>> tf1 = tempfile.NamedTemporaryFile
    >>> tf2 = tempfile.NamedTemporaryFile
    >>> with tf1(mode='w') as file1, tf2(mode='w') as file2:
    ...     x = file1.write('site,A,C,G,T\\n'
    ...                     '10,0.2,0.2,0.5,0.1\\n'
    ...                     '2a,0.3,0.3,0.3,0.1')
    ...     file1.flush()
    ...     x = file2.write('site,A,C,G,T\\n'
    ...                     '10,0.4,0.1,0.1,0.4\\n'
    ...                     '2a,0.3,0.4,0.1,0.2')
    ...     file2.flush()
    ...     avg = avgPrefs([file1.name, file2.name])
    >>> (avg['site'] == ['2a', '10']).all()
    True
    >>> numpy.allclose(avg['A'], [0.3, 0.3])
    True
    >>> numpy.allclose(avg['C'], [0.35, 0.15])
    True
    >>> numpy.allclose(avg['G'], [0.2, 0.3])
    True
    >>> numpy.allclose(avg['T'], [0.15, 0.25])
    True
    """
    assert len(prefsfiles) >= 1

    # Read every file first, indexed (and sorted) by site.
    frames = [
        pandas.read_csv(fname, index_col='site').sort_index()
        for fname in prefsfiles
    ]

    # Every frame must carry the same set of columns; re-order each one
    # to match the first frame so that concatenation lines them up.
    reference_cols = frames[0].columns
    aligned = []
    for frame in frames:
        assert set(reference_cols) == set(frame.columns)
        aligned.append(frame[reference_cols])

    # Stack all frames, then average the rows that share a site.
    stacked = pandas.concat(aligned)
    avgprefs = stacked.groupby('site').mean().reset_index()

    # natural sort by site: https://stackoverflow.com/a/29582718
    natural_positions = natsort.index_realsorted(avgprefs.site)
    row_order = natsort.order_by_index(avgprefs.index, natural_positions)
    return avgprefs.reindex(index=row_order)
def test_index_realsorted_returns_results_identical_to_index_natsorted_with_REAL():
    """index_realsorted must agree with index_natsorted(alg=ns.REAL)."""
    samples = ['a50', 'a51.', 'a50.31', 'a-50', 'a50.4', 'a5.034e1', 'a50.300']
    observed = index_realsorted(samples)
    expected = index_natsorted(samples, alg=ns.REAL)
    assert observed == expected
def tidyToWide(tidy_df, valuecol):
    """Converts tidy `diffsel` data frame to wide form.

    The `diffsel` data frames returned by ``dms2_diffsel`` (and
    also other dataframes, such as the `fracsurvive` ones
    from ``dms_fracsurvive``) are in tidy form. This function
    converts them to wide form.

    The input data frame is not modified.

    Args:
        `tidy_df` (pandas DataFrame)
            Data frame in tidy form. Should have columns named
            `site`, `wildtype`, `mutation`, and something
            with the name matching `valuecol`.
        `valuecol` (string)
            Name of value column in `tidy_df`, such as `diffsel` or
            `fracsurvive`.

    Returns:
        Wide form dataframe. Will have columns `site` (as string),
        `wildtype`, and all characters (e.g., amino acids)
        for which values are given. Missing values are filled
        with 0. Natural sorted by `site`, with a fresh 0-based
        integer index.

    Raises:
        AssertionError: if `tidy_df` is not a `pandas.DataFrame` or
            its columns are not exactly `site`, `wildtype`,
            `mutation` and `valuecol`.

    >>> tidy_df = pandas.read_csv(io.StringIO(
    ...     '''site wildtype mutation diffsel
    ...        3    A        D        -1.5
    ...        3    A        C        10.1
    ...        2    A        C        10.1
    ...        1    C        D         9.5
    ...        1    C        A         0.2
    ...        2    A        D        -1.5'''),
    ...     sep=r'\\s+', index_col=False)
    >>> wide_df = tidyToWide(tidy_df, valuecol='diffsel')
    >>> print(wide_df.to_string(float_format=lambda x: '{0:.1f}'.format(x)))
      site   A    C    D wildtype
    0    1 0.2  0.0  9.5        C
    1    2 0.0 10.1 -1.5        A
    2    3 0.0 10.1 -1.5        A
    """
    assert isinstance(tidy_df, pandas.DataFrame)
    cols = ['site', 'wildtype', 'mutation', valuecol]
    assert set(cols) == set(
        tidy_df.columns), ('expected columns '
                           '{0}\nactual columns {1}'.format(
                               cols, tidy_df.columns))

    # Make site a string on a *copy* (`assign` returns a new frame) so
    # the caller's data frame is left untouched.
    tidy_df = tidy_df.assign(site=tidy_df['site'].astype(str))

    # convert to wide form, keeping wildtype identities
    tidy_df = tidy_df.set_index('site', drop=True)
    wt = tidy_df['wildtype']
    wide_df = (tidy_df.pivot(columns='mutation', values=valuecol)
               .fillna(0.0)
               .join(wt)
               .reset_index())

    # `wt` has one entry per tidy row, so the join repeats each site once
    # per mutation; collapse those repeats back to one row per site.
    wide_df = wide_df.drop_duplicates().reset_index(drop=True)

    # Natural sort on site as here: https://stackoverflow.com/a/29582718
    # This has to happen on the wide frame: `pivot` rebuilds the index in
    # sorted (lexicographic for strings) order, so sorting the tidy frame
    # beforehand would be undone (e.g. '1', '10', '2').
    natural_positions = natsort.index_realsorted(wide_df['site'])
    row_order = natsort.order_by_index(wide_df.index, natural_positions)
    wide_df = wide_df.reindex(index=row_order).reset_index(drop=True)
    return wide_df
def test_index_realsorted_returns_results_identical_to_index_natsorted():
    """index_realsorted must agree with index_natsorted called without `alg`.

    NOTE(review): this equality can only hold if the default algorithm of
    index_natsorted treats these strings as real numbers -- confirm against
    the natsort version this test targets.
    """
    samples = ['a50', 'a51.', 'a50.31', 'a50.4', 'a5.034e1', 'a50.300']
    observed = index_realsorted(samples)
    expected = index_natsorted(samples)
    assert observed == expected
def test_index_realsorted_returns_results_identical_to_index_natsorted_with_REAL():
    """index_realsorted must agree with index_natsorted(alg=ns.REAL)."""
    inputs = ["a50", "a51.", "a50.31", "a-50", "a50.4", "a5.034e1", "a50.300"]
    via_realsorted = index_realsorted(inputs)
    via_natsorted = index_natsorted(inputs, alg=ns.REAL)
    assert via_realsorted == via_natsorted
def test_index_realsorted_is_identical_to_index_natsorted_with_real_alg(float_list):
    """index_realsorted must agree with index_natsorted(alg=ns.REAL).

    `float_list` is supplied by the caller (presumably a pytest fixture --
    confirm against the test module's fixtures).
    """
    observed = index_realsorted(float_list)
    expected = index_natsorted(float_list, alg=ns.REAL)
    assert observed == expected
def test_index_realsorted_is_identical_to_index_natsorted_with_real_alg(
        float_list):
    """Check index_realsorted against index_natsorted with alg=ns.REAL.

    `float_list` is supplied by the caller (presumably a pytest fixture --
    confirm against the test module's fixtures).
    """
    via_realsorted = index_realsorted(float_list)
    via_natsorted = index_natsorted(float_list, alg=ns.REAL)
    assert via_realsorted == via_natsorted