def fastspar_correlation(table: Table,
                         verbose: bool = False,
                         calc_pvalues: bool = False,
                         bootstraps: int = 1000,
                         nprocs: int = 1,
                         p_adjust_method: str = 'fdr_bh') -> pd.DataFrame:
    """Compute SparCC correlations by shelling out to the fastspar binaries.

    Parameters
    ----------
    table : biom Table
        Feature table; written to disk as TSV for the fastspar executables.
    verbose : bool
        When False, fastspar output is discarded (DEVNULL).
    calc_pvalues : bool
        When True, run fastspar_bootstrap + fastspar_pvalues to attach
        bootstrap p-values and adjusted p-values.
    bootstraps : int
        Number of bootstrap tables used for p-value estimation.
    nprocs : int
        Thread count passed to the fastspar binaries.
    p_adjust_method : str
        Method name handed to p_adjust for multiple-testing correction.

    Returns
    -------
    pd.DataFrame
        Indexed by a sorted (feature_i, feature_j) MultiIndex, with column
        'r' and, when calc_pvalues is True, 'p' and 'p_adjusted'.
    """
    with tempfile.TemporaryDirectory(prefix='fastspar') as temp:
        # Hoist the paths that are reused several times below.
        otu_table = path.join(temp, 'otu_table.tsv')
        correl_table = path.join(temp, 'correl_table.tsv')
        # .sparse.to_dense() fixes AttributeError: 'DataFrame' object has no
        # attribute 'to_dense' on modern pandas. See:
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sparse.to_dense.html
        table.to_dataframe().sparse.to_dense().to_csv(otu_table,
                                                      sep='\t',
                                                      index_label='#OTU ID')
        stdout = None if verbose else subprocess.DEVNULL
        # Bug fix: the correlation-table argument used to be
        # path.join(temp, path.join(temp, 'correl_table.tsv')) — a doubled
        # join that only worked because joining onto an absolute path is a
        # no-op.
        run_fastspar(otu_table, correl_table,
                     path.join(temp, 'covar_table.tsv'), stdout, nprocs)
        cor = pd.read_csv(correl_table, sep='\t', index_col=0)
        correls = df_to_correls(cor)
        if calc_pvalues:
            # generate bootstrapped count tables
            subprocess.run([
                'fastspar_bootstrap', '-c', otu_table,
                '-n', str(bootstraps),
                '-p', path.join(temp, 'boot'),
                '-t', str(nprocs)
            ],
                           stdout=stdout)
            # infer correlations for each bootstrap table using all available processes
            with ThreadPoolExecutor(max_workers=nprocs) as executor:
                for boot in glob(path.join(temp, 'boot*')):
                    # NOTE(review): stdout/nprocs are not forwarded here —
                    # presumably run_fastspar has suitable defaults; confirm.
                    executor.submit(run_fastspar, boot,
                                    boot.replace('boot', 'cor_boot'),
                                    boot.replace('boot', 'cov_boot'))
            # calculate p_values for correlation table
            subprocess.run([
                'fastspar_pvalues', '-c', otu_table,
                '-r', correl_table,
                '-p', path.join(temp, 'cor_boot'),
                '-t', str(nprocs),
                '-n', str(bootstraps),
                '-o', path.join(temp, 'pvalues.tsv')
            ],
                           stdout=stdout)
            pvals = pd.read_csv(path.join(temp, 'pvalues.tsv'),
                                sep='\t',
                                index_col=0)
            pvals = df_to_correls(pvals, col_label='p')
            correls = pd.concat([correls, pvals], axis=1, join='inner')
            correls['p_adjusted'] = p_adjust(correls.p, p_adjust_method)
        # sort each feature pair so the index is orientation-independent
        correls.index = pd.MultiIndex.from_tuples(
            [sorted(i) for i in correls.index])
        return correls
# Esempio n. 2
# 0
def calculate_correlations(
        table: Table,
        corr_method=spearmanr,
        p_adjustment_method: str = 'fdr_bh') -> pd.DataFrame:
    """Correlate every pair of observations in a biom table, serially.

    Returns a DataFrame with a (feature_i, feature_j) MultiIndex and
    'r'/'p' columns; 'p_adjusted' is added unless p_adjustment_method
    is None.
    """
    # TODO: multiprocess this
    pairs = list()
    stats = list()
    for (vals_a, id_a, _), (vals_b, id_b,
                            _) in table.iter_pairwise(axis='observation'):
        coeff, pval = corr_method(vals_a, vals_b)
        pairs.append((id_a, id_b))
        stats.append((coeff, pval))
    # Build the frame directly on a MultiIndex of feature-id pairs.
    correls = pd.DataFrame(stats,
                           index=pd.MultiIndex.from_tuples(pairs),
                           columns=['r', 'p'])
    if p_adjustment_method is not None:
        correls['p_adjusted'] = p_adjust(correls.p, method=p_adjustment_method)
    return correls
# Esempio n. 3
# 0
def calculate_correlations(table: Table, corr_method=spearmanr, p_adjust_method: str = 'fdr_bh', nprocs=1) -> \
        pd.DataFrame:
    """Correlate all observation pairs of a biom table across nprocs workers."""
    # Clamp the worker count to the machine's CPU count.
    available = multiprocessing.cpu_count()
    if nprocs > available:
        warnings.warn(
            "nprocs greater than CPU count, using all avaliable CPUs")
        nprocs = available

    pool = multiprocessing.Pool(nprocs)
    worker = partial(calculate_correlation, corr_method=corr_method)
    results = pool.map(
        worker,
        pairwise_iter_wo_metadata(table.iter_pairwise(axis='observation')))
    pool.close()
    pool.join()
    # Each result is ((id_i, id_j), (r, p)); split into index and data.
    correls = pd.DataFrame([stats for _, stats in results],
                           index=[pair for pair, _ in results],
                           columns=['r', 'p'])
    # Turn tuple index into actual multiindex, now guaranteeing that correls index is sorted
    correls.index = pd.MultiIndex.from_tuples(
        [sorted(pair) for pair in correls.index])
    if p_adjust_method is not None:
        correls['p_adjusted'] = p_adjust(correls.p, method=p_adjust_method)
    return correls
# Esempio n. 4
# 0
def test_bh_adjust(unadj_ps):
    """Benjamini-Hochberg adjustment yields an ndarray with expected values."""
    observed = p_adjust(unadj_ps, 'fdr_bh')
    expected = np.array([.03, .075, .5])
    assert isinstance(observed, np.ndarray)
    assert_allclose(expected, observed)
# Esempio n. 5
# 0
def test_bonferroni_adjust(unadj_ps):
    """Bonferroni adjustment yields an ndarray with expected values."""
    observed = p_adjust(unadj_ps, method='b')
    expected = np.array([.03, .15, 1])
    assert isinstance(observed, np.ndarray)
    assert_allclose(expected, observed)
# Esempio n. 6
# 0
def between_correls(args):
    """CLI entry point: correlate features BETWEEN two biom tables and write
    a correlation table plus a GML cross-table network.

    TABLES MUST SORT SO THAT SAMPLES ARE IN THE SAME ORDER

    Side effects: may delete/recreate args.output, changes the working
    directory to it, and writes SCNIC_log.txt, correls.txt and crossnet.gml.

    Raises
    ------
    ValueError
        If the two tables do not contain the same set of sample ids.
    """
    logger = general.Logger("SCNIC_log.txt")
    logger["SCNIC analysis type"] = "between"

    # correlation and p-value adjustment methods
    correl_methods = {'spearman': spearmanr, 'pearson': pearsonr}
    correl_method = correl_methods[args.correl_method]

    # load tables
    table1 = load_table(args.table1)
    table2 = load_table(args.table2)
    logger["input table 1"] = args.table1
    # Bug fix: this line previously reused the key "input table 1",
    # clobbering the first entry instead of logging table 2.
    logger["input table 2"] = args.table2

    # sort both tables so samples line up positionally
    table1 = table1.sort()
    table2 = table2.sort()

    # make new output directory and change to it
    if args.force and args.output is not None:
        shutil.rmtree(args.output, ignore_errors=True)
    if args.output is not None:
        os.makedirs(args.output)
        os.chdir(args.output)
        logger["output directory"] = args.output

    # filter tables
    if args.sparcc_filter is True:
        table1 = general.sparcc_paper_filter(table1)
        table2 = general.sparcc_paper_filter(table2)
        print("Table 1 filtered: %s observations" % str(table1.shape[0]))
        print("Table 2 filtered: %s observations" % str(table2.shape[0]))
        logger["sparcc paper filter"] = True
        logger["number of observations present in table 1 after filter"] = table1.shape[0]
        logger["number of observations present in table 2 after filter"] = table2.shape[0]
    if args.min_sample is not None:
        table1 = general.filter_table(table1, args.min_sample)
        table2 = general.filter_table(table2, args.min_sample)

    # both tables must describe the exact same samples
    if not np.array_equal(table1.ids(), table2.ids()):
        raise ValueError("Tables have different sets of samples present")

    metadata = general.get_metadata_from_table(table1)
    metadata.update(general.get_metadata_from_table(table2))

    # make correlations
    logger["correlation metric"] = args.correl_method
    logger["p adjustment method"] = args.p_adjust
    correls = ca.between_correls_from_tables(table1, table2, correl_method, nprocs=args.procs)
    correls.sort_values(correls.columns[-1], inplace=True)
    # NOTE(review): args.p_adjust is logged above but not passed here —
    # presumably general.p_adjust falls back to a default method; confirm.
    correls['p_adj'] = general.p_adjust(correls['p'])
    correls.to_csv(open('correls.txt', 'w'), sep='\t', index=True)

    # make network
    correls_filt = general.filter_correls(correls, min_p=args.min_p, min_r=args.min_r)
    net = general.correls_to_net(correls_filt, metadata=metadata)
    logger["number of nodes"] = net.number_of_nodes()
    logger["number of edges"] = net.number_of_edges()
    nx.write_gml(net, 'crossnet.gml')

    logger.output_log()
# Esempio n. 7
# 0
def within_correls(args):
    """CLI entry point: correlate features WITHIN one biom table and write a
    correlation table plus a GML correlation network.

    Side effects: may create and chdir into args.output, and writes
    SCNIC_within_log.txt, correls.txt and correlation_network.gml into the
    (possibly changed) working directory.

    Raises
    ------
    NotImplementedError
        If sparcc p-values are requested (args.sparcc_p) — not yet
        reimplemented with fastspar.
    """
    logger = general.Logger("SCNIC_within_log.txt")
    logger["SCNIC analysis type"] = "within"

    # correlation and p-value adjustment methods
    # 'sparcc' maps to a sentinel string rather than a callable; dispatched below
    correl_methods = {'spearman': spearmanr, 'pearson': pearsonr, 'kendall': kendalltau, 'sparcc': 'sparcc'}
    correl_method = correl_methods[args.correl_method.lower()]

    # get features to be correlated
    table = load_table(args.input)
    logger["input table"] = args.input
    if args.verbose:
        print("Table loaded: " + str(table.shape[0]) + " observations")
        print("")
    logger["number of samples in input table"] = table.shape[1]
    logger["number of observations in input table"] = table.shape[0]

    # make new output directory and change to it
    # NOTE: all outputs below land in the current working directory
    if args.output is not None:
        if not os.path.isdir(args.output):
            os.makedirs(args.output)
        os.chdir(args.output)
    logger["output directory"] = os.getcwd()

    # filter
    # sparcc_filter takes precedence over min_sample; otherwise pass through unfiltered
    if args.sparcc_filter is True:
        table_filt = general.sparcc_paper_filter(table)
        if args.verbose:
            print("Table filtered: %s observations" % str(table_filt.shape[0]))
            print("")
        logger["sparcc paper filter"] = True
        logger["number of observations present after filter"] = table_filt.shape[0]
    elif args.min_sample is not None:
        table_filt = general.filter_table(table, args.min_sample)
        if args.verbose:
            print("Table filtered: %s observations" % str(table_filt.shape[0]))
            print("")
        logger["min samples present"] = args.min_sample
        logger["number of observations present after filter"] = table_filt.shape[0]
    else:
        table_filt = table

    logger["number of processors used"] = args.procs

    # correlate features
    if correl_method in [spearmanr, pearsonr, kendalltau]:
        # calculate correlations
        if args.verbose:
            print("Correlating with %s" % args.correl_method)
        # correlate feature
        correls = ca.calculate_correlations(table_filt, correl_method)
    elif correl_method == 'sparcc':
        correls = ca.fastspar_correlation(table_filt, verbose=args.verbose)
        if args.sparcc_p is not None:
            raise NotImplementedError()  # TODO: reimplement with fastspar
    else:
        # unreachable given correl_methods above; kept as a guard
        raise ValueError("How did this even happen?")
    logger["distance metric used"] = args.correl_method
    if args.verbose:
        print("Features Correlated")
        print("")

    # sparcc without p-values yields no 'p' column, so adjust only when present
    if 'p' in correls.columns:
        correls['p_adj'] = general.p_adjust(correls['p'])
    correls.to_csv('correls.txt', sep='\t', index_label=('feature1', 'feature2'))
    if args.verbose:
        print("Correls.txt written")

    # make correlation network
    metadata = general.get_metadata_from_table(table_filt)
    net = general.correls_to_net(correls, metadata=metadata)
    nx.write_gml(net, 'correlation_network.gml')
    if args.verbose:
        print("Network made")
        print("")

    logger.output_log()