Ejemplo n.º 1
0
# Script fragment: read the cleaned datasets and, per dataset, collapse the
# OTU table to genus level and pick out sample groups for univariate tests.
# NOTE(review): this view is truncated inside the inner loop, so the code
# that consumes the values set up here is not visible.
args = parser.parse_args()

# Settings that are not referenced in the visible portion of the script;
# presumably the q-value threshold and the name of the statistical test
# used further down -- TODO confirm.
qthresh = 0.05
stats_method = 'kruskal-wallis'

# dfdict is indexed below as dfdict[dataset]['df'] and
# dfdict[dataset]['meta']
dfdict = fio.read_dfdict_data(args.clean_data_dir, subset=args.subset)

print('Doing univariate tests...')
# Not filled in within the visible portion of the script
resultsdict = {}

for dataset in dfdict:
    df = dfdict[dataset]['df']
    meta = dfdict[dataset]['meta']

    # Collapse to genus level
    df = util.collapse_taxonomic_contents_df(df, 'genus')

    if args.split_cases:
        # Get samples in each class. Note that the two fio.get_classes
        # functions are basically the same, just with different diseases
        # hard-coded in.
        classes_list = fio.get_classes(meta, dataset)

        # Go through each case group one by one
        # (classes_list[1] is iterated as the case labels; classes_list[0]
        # is presumably the control labels -- TODO confirm)
        for dis_label in classes_list[1]:
            # Build a per-disease dataset ID from the lower-cased label and
            # the second '_'-separated field of the dataset ID,
            # e.g. old dataset = ibd_alm, new dataset = uc_alm
            newdataset = dis_label.lower() + '_' + dataset.split('_')[1]

            # Get samples: pair classes_list[0] with just this one case label
            sub_list = [classes_list[0], [dis_label]]
            H_smpls, dis_smpls = fio.get_samples(meta, sub_list)
Ejemplo n.º 2
0
               help='log function to use (default: %(default)s)',
               choices=['log2', 'log10'],
               default='log2')
p.add_argument('--method',
               help='measure of central tendency to use in ' +
               'calculating effect direction (default: %(default)s)',
               choices=['mean', 'median'],
               default='mean')
args = p.parse_args()

# Read in dfdict
dfdict = read_dfdict_data(args.datadir)
# Collapse every dataset's table to genus level
for dataset in dfdict:
    dfdict[dataset]['df'] = \
        collapse_taxonomic_contents_df(dfdict[dataset]['df'], 'genus')

# Read in qvalues. Tab-delimited, genera in index and datasets in columns
qvals = pd.read_csv(args.qvalues, sep='\t', index_col=0)

# Calculate logfold change, one qvalues column (i.e. one dataset) at a time.
# The --method option is passed through here; it used to be parsed but then
# ignored in favour of a hard-coded 'mean' (which is still the default).
# NOTE(review): logfun is still hard-coded to np.log2 even though a log
# function option ('log2'/'log10') is declared above; the name of that
# option is not visible from this part of the file, so it is left as is.
allres = qvals.apply(lambda col: convert_dataset_to_logfold(
    col, dfdict, logfun=np.log2, method=args.method))

# Replace +/- infinity with max/min value in entire matrix
# From get_log_change(): +inf is returned when controls = 0, disease > 0;
# -inf is returned when controls > 0, disease = 0; 0 is returned when both
# controls and disease = 0
# NaNs are filled with 0 and infinities masked out before taking max/min.
# Both extremes are taken once, up front: substituting +inf with the finite
# max cannot change the finite min, so this matches computing them in turn.
finite_vals = np.ma.masked_invalid(allres.fillna(0))
allres = allres.replace(np.inf, finite_vals.max())
allres = allres.replace(-np.inf, finite_vals.min())
# Read the OTU table and the metadata (both tab-delimited, first column
# used as the index)
df = pd.read_csv(args.fnotu, sep='\t', index_col=0)
meta = pd.read_csv(args.fnmeta, sep='\t', index_col=0)

meta_col = args.metacol
case_lbl = args.caselabel
ctrl_lbl = args.ctrllabel

## For each site, compare aspirators and non-aspirators
# Keep only samples with a value in the metadata column of interest. Take an
# explicit copy so that the column assignment below writes to an independent
# frame instead of a potential view of `meta`.
aspmeta = meta.dropna(subset=[meta_col]).copy()
# Convert numeric labels into strings (e.g. 1.0 -> '1'). The dtype helpers
# are used instead of comparing against 'int'/'float64' because the meaning
# of 'int' is platform-dependent and other integer/float widths would
# otherwise be skipped.
if (pd.api.types.is_integer_dtype(aspmeta[meta_col])
        or pd.api.types.is_float_dtype(aspmeta[meta_col])):
    aspmeta[meta_col] = aspmeta[meta_col].astype(int).astype(str)
qvals_otu = univariate_by_site(aspmeta, df, args.method, meta_col, case_lbl,
                               ctrl_lbl)
qvals_otu['level'] = 'otu'

## What if I do univariate at genus level?
# Collapse to genus level and keep only the samples retained in aspmeta
genusdf = util.collapse_taxonomic_contents_df(df, 'genus').loc[aspmeta.index]
## Note: there are like 10 samples with fewer than 50% of reads annotated to genus level
# genusdf.loc[aspmeta.index].sum(axis=1).sort_values()
## 10 of these seem dominated by k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__;g__;s__;d__denovo34,
## which BLASTS to Bacillus thermoamylovorans
qvals_genus = univariate_by_site(aspmeta, genusdf, args.method, meta_col,
                                 case_lbl, ctrl_lbl)
qvals_genus['level'] = 'genus'

# Stack the OTU-level and genus-level results (distinguished by the 'level'
# column) and write them out as one tab-delimited file
allqdf = pd.concat((qvals_otu, qvals_genus), ignore_index=True)
allqdf.to_csv(args.fout, sep='\t', index=False)
Ejemplo n.º 4
0
def _count_present(abun):
    """
    Count, for each column of `abun`, the rows that hold a truthy value.

    Vectorized replacement for applying ``lambda x: 1 if x else 0`` to every
    cell and summing down the rows. As with that lambda, zero counts as
    absent and NaN counts as present.
    """
    return abun.astype(bool).sum()


def read_all_and_return_abun_ubiquity(datadir, fnpvals):
    """
    Read all clean datasets in datadir and return a tidy dataframe
    with the various ubiquity/abundance calculations for each genus.

    Parameters
    ----------
    datadir : str
        path to directory with *.otu_table.clean.feather and
        *.metadata.clean.feather files
    fnpvals : str
        path to file with 'overall' significant bugs (should have column labeled
        'overall' and OTUs in rows)

    Returns
    -------
    tidy : pandas tidy dataframe
       has the following columns:
          otu: str, 'k__Bacteria;...g__Akkermansia'
          variable: str, 'abundance_from_pooled_mean_total',
                    'ubiquity_mean_of_datasets_h', etc...
          value: float
          metric: str, {'abundance', 'ubiquity'}
          calculation: str, {'from_pooled_mean', 'mean_of_datasets'}
          patient: {'total', 'dis', 'h'}
          overall: float or str {'nan', 1.0, -1.0, 0.0}
          color: float or str, {RGBA values or 'k'}
          overall_significance: str, {'not_sig', 'disease', 'health', 'mixed'}

    Raises
    ------
    KeyError
        if the 'overall' column of fnpvals holds a value other than
        1, -1, 0 or NaN
    """
    toconcat = []
    print('Reading datasets...')
    datasetids = fio.get_dataset_ids(datadir)
    for dataset in datasetids:
        print(dataset)
        ## Read dataset
        df, meta = fio.read_dataset_files(dataset, datadir)
        df = util.raw2abun(df)

        ## Collapse to genus level
        df = util.collapse_taxonomic_contents_df(df, 'genus')
        classes_list = fio.get_classes(meta)
        ctrl_smpls, dis_smpls = fio.get_samples(meta, classes_list)

        ## Do calculations for all patients
        # Note: dis_smpls and ctrl_smpls sometimes just grabs a subset of
        # patients. e.g. in CRC studies, this discards adenoma patients.
        # That is why every sample in df is used here rather than
        # ctrl_smpls + dis_smpls.
        all_smpls = list(df.index)
        all_abun = df.loc[all_smpls]
        # tmp is a DataFrame with the calculations for different patient
        # groups in the columns, genera in rows
        tmp = all_abun.sum().to_frame('total_total_abun')
        tmp['total_present'] = _count_present(all_abun)
        tmp['total_samples'] = len(all_smpls)

        ## Do calculations across healthy patients only
        H_smpls = meta.query('DiseaseState == "H"').index
        if len(H_smpls) > 0:
            h_abun = df.loc[H_smpls]
            tmp['total_h_abun'] = h_abun.sum()
            tmp['h_present'] = _count_present(h_abun)
            tmp['h_samples'] = len(H_smpls)
        else:
            # add the non-healthy controls to our disease patients.
            # Build a new list instead of extending in place, so the object
            # returned by fio.get_samples is never mutated and index-like
            # containers are concatenated rather than added element-wise.
            dis_smpls = list(dis_smpls) + list(ctrl_smpls)

        ## Do calculations across non-healthy patients
        dis_abun = df.loc[dis_smpls]
        tmp['total_dis_abun'] = dis_abun.sum()
        tmp['dis_present'] = _count_present(dis_abun)
        tmp['dis_samples'] = len(dis_smpls)

        tmp['dataset'] = dataset
        # Keep the genus labels in a column: the index is discarded by the
        # concat below
        tmp['otu'] = tmp.index

        toconcat.append(tmp)

    alldf = pd.concat(toconcat, ignore_index=True)

    # Calculate ubiquity and abundance metrics for genera
    # alldf now has many columns, see the calculate_ubiquity_and_abun
    # docstring for these columns
    alldf = calculate_ubiquity_and_abun(alldf, 'h')
    alldf = calculate_ubiquity_and_abun(alldf, 'dis')
    alldf = calculate_ubiquity_and_abun(alldf, 'total')

    # Turn into tidy df with 'metric', 'calculation', and 'patient' columns
    tidy = tidyfy_df(alldf)

    # Add the core significance value for each genus
    overallsig = pd.read_csv(fnpvals, sep='\t', index_col=0)

    # Left join: genera missing from fnpvals get NaN in 'overall'
    tidy = tidy.merge(overallsig, right_index=True, left_on='otu', how='left')
    # replace NaN with 'nan' to allow dict hashing
    tidy['overall'] = tidy['overall'].replace(np.nan, 'nan')
    # color each genus by its overall status (not used in final plots)
    pal = sns.diverging_palette(220, 20, n=7)
    colordict = {1: tuple(pal[-1]),
                 -1: tuple(pal[0]),
                 0: 'k',
                 'nan': tuple(pal[3])}
    # Explicit dict lookups so that an unexpected 'overall' value raises
    # KeyError instead of silently becoming NaN
    tidy['color'] = tidy['overall'].map(lambda x: colordict[x])
    # add human-readable 'overall' labels
    rename_overall = {1: 'disease', -1: 'health', 'nan': 'not_sig', 0: 'mixed'}
    tidy['overall_significance'] = \
        tidy['overall'].map(lambda x: rename_overall[x])

    return tidy