# Example 1
def colnames_to_sample_info_array(dataframe, main_dir='./'):
    """
    Attach per-sample metadata to each sample ID found in the dataframe.

    Builds a one-column frame of sample IDs (taken from the dataframe's
    index/columns after a reset) and left-joins the sample info table onto
    it, so every ID row is kept even if it has no matching sample info.

    :param dataframe: DataFrame whose (reset) 'ID' column names the samples
    :param main_dir: directory where the sample info is stored
    :return: DataFrame of IDs merged with their sample info rows
    """
    ids_only = pd.DataFrame({'ID': dataframe.reset_index().ID})
    info = read_sample_info(main_dir=main_dir)
    return pd.merge(ids_only, info, how='left')
def aggregate_mixed_taxonomy(dataframe, taxa_dict, main_dir='./',
                             summarise_other=True, check_totals_sum_to_1=True):
    """
    Summarise abundances based on cherry-picked taxonomic abundances,
    perhaps mixed at different levels.

    Loop over the different taxonomic levels specified in a dictionary of
    taxonomic level keys and name pairs.  Reduce using sum_on_taxonomy()
    and store that result in a list.  Concatenate the lists into one DataFrame
    for return.

    In order to keep track of what has not been cherry picked, we cherry pick
    out of a copy of the dataframe, deleting those rows after they are used.
    Then what is left can be lumped into "other".  This also helps (though
    does not completely solve) the issue of an invalid taxonomy dict being
    passed as an argument.

    To get an "other" for a given taxonomic level, pass in a DataFrame that
    is already restricted to the taxonomic level you are looking for
    (e.g. Methylococcaceae).  Then the taxa you *aren't* picking out will be
    represented by "other", instead of all other taxa at all other taxonomic
    levels.

    :param dataframe: dataframe containing all the data to pick through
    :param taxa_dict: a dictionary with taxonomic levels as keys and
    names as values.  E.g. {'Phylum':['Bacteroidetes'],
    'Order':['Burkholderiales','Methylophilales', 'Methylococcales']}
    :param main_dir: directory where the data is stored.  This argument was
    added so jupyter notebooks could be run in a sub-directory.
    :param summarise_other: include an "other" row per sample?  (or omit)
    :param check_totals_sum_to_1: when summarise_other is True, assert that
    each sample's abundances (keepers + "other") sum to ~1 (within 0.001).
    :return: DataFrame with columns 'ID', 'abundance sum',
    'taxonomic level', 'taxonomic name', merged with per-sample info.
    :raises AssertionError: if a taxa_dict key/value is absent from the
    dataframe, if a selection matches no rows, or if the sum-to-1 check fails.
    """

    # Make a copy of the dataframe so we can sum the leftovers into an
    # "other" category.
    df = dataframe.copy()

    # First make sure all of the taxa name, value pairs are valid:
    # There is a unit test for this in:
    # unit_tests.test_abundance_plot_utils.
    # testDeleteRowsForTaxa#test_invalid_taxa_dict
    for key in taxa_dict.keys():
        for name in taxa_dict[key]:
            assert (key in df.columns), "column {} doesn't exist!".format(key)
            # Note: to check for a value in a series, use set(Series)
            assert (name in set(df[key])), \
                'Value "{}" in column "{}" does not exist. \n' \
                'Check spelling and conflict with other taxa in the ' \
                'taxa dict.'.format(name, key)

    # TODO: need to start at most general taxonomic level if you want to
    # check for errors in the taxa dict.
    # Currently no error will be thrown if you pick a Genera out, then
    # subsequently select a Family that would have included that Genera.
    # Also note that if you pass the same key twice in a dict, Python keeps
    # only the second key:value pair.  E.g.:
    # print({'Genus': ['Orcinus'], 'Genus': ['ABCD']})

    # Accumulate one small DataFrame per (level, name) pair, then concat once.
    reduced_data = []
    for key in taxa_dict.keys():
        for name in taxa_dict[key]:
            # Get one row per week/oxygen condition:
            reduced_rows = sum_on_taxonomy(dataframe=df,
                                           taxa_level=key,
                                           name=name)
            # check that you got some rows.  Might be a typo if not!
            assert(reduced_rows.shape[0] > 0), \
                'found no rows for {} = "{}"'.format(key, name)

            # Remove the picked rows from the working copy so whatever
            # remains at the end can be summed into "other".
            df = delete_rows_for_taxa(dataframe=df,
                                      taxa_level=key, taxa_name=name)

            # the index needs to be dropped but it is stored below as
            # 'taxonomic level' and 'taxonomic name'
            # I haven't been able to reset_index on this series to drop the
            # index so I'm doing it this way:
            reduced_rows = reduced_rows.reset_index()
            del reduced_rows[key]
            # make a new dataframe out of it.
            # NOTE(review): assumes sum_on_taxonomy()'s result exposes
            # 'fraction of reads' and 'ID' after reset_index — confirm there.
            reduced_data.append(
                pd.DataFrame(
                    {'taxonomic level': key,
                     'taxonomic name': name,
                     'abundance sum': reduced_rows['fraction of reads'],
                     'ID': reduced_rows['ID']}))
    # Concatenate data included in the taxa_dict
    # Has form like:
    #        ID  abundance sum taxonomic level   taxonomic name
    # 100_LOW12       0.084171           Order  Burkholderiales
    dataframe_of_keepers = pd.concat(reduced_data)
    print(dataframe_of_keepers.head())

    # Aggregate the leftovers into an "other" column, with headers to match
    # dataframe_of_keepers
    # TODO: aggregate into "other"
    dataframe_of_leftovers = collapse_unused_taxa_into_other(df)

    if summarise_other:
        # Merge the keepers and the leftovers.
        result_df = pd.concat([dataframe_of_keepers, dataframe_of_leftovers])
        print("merged result_df:")
        print(result_df.head())
        # merge on the sample info.
        result_df = pd.merge(left=result_df,
                             right=elviz_utils.read_sample_info(main_dir))
        # Check that the sum of abundances for each sample is really close
        # to 1:
        # (only meaningful when "other" is included; without it the
        # cherry-picked rows alone need not sum to 1)
        if check_totals_sum_to_1:
            sample_sums = result_df.groupby('ID')['abundance sum'].sum()
            assert (sample_sums > 0.999).all()
            assert (sample_sums < 1.001).all()
            print(sample_sums.head())
    else:
        # Omit the "other" rows entirely; just attach sample info to keepers.
        result_df = pd.merge(left=dataframe_of_keepers,
                             right=elviz_utils.read_sample_info(main_dir))

    return result_df