Esempio n. 1
0
def avgPrefs(prefsfiles):
    """Gets average of site-specific preferences.

    Args:
        `prefsfiles` (list)
            List of CSV files containing preferences, must all be
            for same sites and characters.

    Returns:
        A `pandas.DataFrame` containing the average of the
        preferences in `prefsfiles`. In this returned
        data frame, `site` is a regular column (not the index),
        and the rows are natural sorted by `site`.

    Raises:
        `AssertionError` if `prefsfiles` is empty, or if the files
        do not all have the same characters (columns) and sites.

    >>> tf1 = tempfile.NamedTemporaryFile
    >>> tf2 = tempfile.NamedTemporaryFile
    >>> with tf1(mode='w') as file1, tf2(mode='w') as file2:
    ...     x = file1.write('site,A,C,G,T\\n'
    ...                 '10,0.2,0.2,0.5,0.1\\n'
    ...                 '2a,0.3,0.3,0.3,0.1')
    ...     file1.flush()
    ...     x = file2.write('site,A,C,G,T\\n'
    ...                 '10,0.4,0.1,0.1,0.4\\n'
    ...                 '2a,0.3,0.4,0.1,0.2')
    ...     file2.flush()
    ...     avg = avgPrefs([file1.name, file2.name])
    >>> (avg['site'] == ['2a', '10']).all()
    True
    >>> numpy.allclose(avg['A'], [0.3, 0.3])
    True
    >>> numpy.allclose(avg['C'], [0.35, 0.15])
    True
    >>> numpy.allclose(avg['G'], [0.2, 0.3])
    True
    >>> numpy.allclose(avg['T'], [0.15, 0.25])
    True
    """
    assert len(prefsfiles) >= 1, 'no files specified in `prefsfiles`'
    prefs = [
        pandas.read_csv(f, index_col='site').sort_index() for f in prefsfiles
    ]

    # Make sure all files describe the same characters and the same sites.
    # Without the site check, a site missing from some file would silently
    # be averaged over only the files that contain it.
    cols = prefs[0].columns
    sites = set(prefs[0].index)
    for fname, p in zip(prefsfiles, prefs):
        assert set(cols) == set(p.columns), (
            'characters in {0} differ from those in {1}'.format(
                fname, prefsfiles[0]))
        assert sites == set(p.index), (
            'sites in {0} differ from those in {1}'.format(
                fname, prefsfiles[0]))

    # put the columns of every data frame in the same order
    prefs = [p[cols] for p in prefs]

    # `reset_index` turns `site` back into an ordinary column
    avgprefs = pandas.concat(prefs).groupby('site').mean().reset_index()

    # natural sort by site: https://stackoverflow.com/a/29582718
    avgprefs = avgprefs.reindex(index=natsort.order_by_index(
        avgprefs.index, natsort.index_realsorted(avgprefs.site)))

    return avgprefs
Esempio n. 2
0
def test_index_realsorted_returns_results_identical_to_index_natsorted_with_REAL():
    """`index_realsorted` must agree with `index_natsorted(..., alg=ns.REAL)`."""
    # inputs mix trailing dots, a minus sign, decimals and exponent notation
    values = ['a50', 'a51.', 'a50.31', 'a-50', 'a50.4', 'a5.034e1', 'a50.300']
    actual = index_realsorted(values)
    expected = index_natsorted(values, alg=ns.REAL)
    assert actual == expected
Esempio n. 3
0
def tidyToWide(tidy_df, valuecol):
    """Converts tidy `diffsel` data frame to wide form.

    The `diffsel` data frames returned by ``dms2_diffsel`` (and
    also other dataframes, such as the `fracsurvive` ones
    from ``dms_fracsurvive``) are in tidy form. This function
    converts them to wide form.

    Args:
        `tidy_df` (pandas DataFrame)
            Data frame in tidy form. Should have columns named
            `site`, `wildtype`, `mutation`, and something
            with the name matching `valuecol`. It is not
            modified by this function.
        `valuecol` (string)
            Name of value column in `tidy_df`, such as `diffsel` or
            `fracsurvive`.

    Returns:
        Wide form dataframe. Will have columns `site` (as string),
        `wildtype`, and all characters (e.g., amino acids)
        for which values are given. Natural sorted by `site`.
        Site / character combinations absent from `tidy_df`
        are filled with 0.

    Raises:
        `AssertionError` if `tidy_df` is not a `pandas.DataFrame`
        or does not have exactly the expected columns.

    >>> tidy_df = pandas.read_csv(io.StringIO(
    ...     '''site wildtype mutation diffsel
    ...           3        A        D    -1.5
    ...           3        A        C    10.1
    ...           2        A        C    10.1
    ...           1        C        D     9.5
    ...           1        C        A     0.2
    ...           2        A        D    -1.5'''),
    ...     sep=r'\\s+', index_col=False)
    >>> wide_df = tidyToWide(tidy_df, valuecol='diffsel')
    >>> print(wide_df.to_string(float_format=lambda x: '{0:.1f}'.format(x)))
      site   A    C    D wildtype
    0    1 0.2  0.0  9.5        C
    1    2 0.0 10.1 -1.5        A
    2    3 0.0 10.1 -1.5        A
    """
    assert isinstance(tidy_df, pandas.DataFrame)
    cols = ['site', 'wildtype', 'mutation', valuecol]
    assert set(cols) == set(
        tidy_df.columns), ('expected columns '
                           '{0}\nactual columns {1}'.format(
                               cols, tidy_df.columns))

    # Make site a string. `assign` returns a new data frame, so the
    # caller's `tidy_df` is left untouched rather than overwritten in place.
    tidy_df = tidy_df.assign(site=tidy_df['site'].astype(str))

    # sort on site as here: https://stackoverflow.com/a/29582718
    tidy_df = tidy_df.reindex(index=natsort.order_by_index(
        tidy_df.index, natsort.index_realsorted(tidy_df.site)))

    # convert to wide form, keeping wildtype identities
    tidy_df = tidy_df.set_index('site', drop=True)
    wt = tidy_df['wildtype']
    wide_df = (tidy_df.pivot(
        columns='mutation',
        values=valuecol).fillna(0.0).join(wt).reset_index())
    # `wt` has one entry per tidy row, so the join repeats each site once
    # per mutation; collapse those repeats back to one row per site
    wide_df = wide_df.drop_duplicates().reset_index(drop=True)

    return wide_df
Esempio n. 4
0
def test_index_realsorted_returns_results_identical_to_index_natsorted():
    """`index_realsorted` must agree with `index_natsorted` called with defaults."""
    # NOTE(review): this relies on the default algorithm of `index_natsorted`
    # handling these float-like strings the same way as `index_realsorted`
    # -- confirm against the natsort version in use.
    values = ['a50', 'a51.', 'a50.31', 'a50.4', 'a5.034e1', 'a50.300']
    actual = index_realsorted(values)
    expected = index_natsorted(values)
    assert actual == expected
Esempio n. 5
0
def test_index_realsorted_returns_results_identical_to_index_natsorted_with_REAL():
    """Check `index_realsorted` against `index_natsorted` using `ns.REAL`."""
    samples = [
        "a50",
        "a51.",
        "a50.31",
        "a-50",
        "a50.4",
        "a5.034e1",
        "a50.300",
    ]
    real_order = index_realsorted(samples)
    reference_order = index_natsorted(samples, alg=ns.REAL)
    assert real_order == reference_order
def test_index_realsorted_is_identical_to_index_natsorted_with_real_alg(float_list):
    """`index_realsorted` must match `index_natsorted` with `alg=ns.REAL`.

    `float_list` is supplied from outside this function (presumably a
    pytest fixture -- its definition is not visible here).
    """
    actual = index_realsorted(float_list)
    expected = index_natsorted(float_list, alg=ns.REAL)
    assert actual == expected
def test_index_realsorted_is_identical_to_index_natsorted_with_real_alg(float_list):
    """Verify that real sorting equals natural sorting under `ns.REAL`.

    `float_list` is provided by the caller (presumably a pytest fixture;
    its definition is not shown here).
    """
    real_order = index_realsorted(float_list)
    reference_order = index_natsorted(float_list, alg=ns.REAL)
    assert real_order == reference_order