# This snippet uses numpy plus helper functions defined elsewhere in the
# SparCC package (to_fractions, basis_corr, correlation); the numpy imports
# below are added so the example stands on its own.
import numpy as np
from numpy import nanmedian


def main(counts, method='SparCC', **kwargs):
    '''
    Compute correlations between all components of the counts matrix.
    Several iterations are run, the fractions are re-estimated in each,
    and the median over all iterations is returned.
    Running several iterations is only helpful with the 'dirichlet'
    normalization method; all other methods give identical results in
    every iteration, so set the 'iter' parameter to 1 when using them.
     
    Parameters
    ----------
    counts : DataFrame
        2D array of counts. Columns are components, rows are samples.
        If using 'dirichlet' or 'pseudo' normalization, 
        counts (positive integers) are required to produce meaningful results, 
        though this is not explicitly checked by the code.  
    method : str, optional (default 'SparCC')
        The algorithm to use for computing correlation.
        Supported values: SparCC, clr, pearson, spearman, kendall
        Note that the pearson, spearman, kendall methods are not
        altered to account for the fact that the data is compositional,
        and are provided to facilitate comparisons to 
        the clr and sparcc methods.

    Returns
    -------
    cor_med: array
        Estimated correlation values.
    cov_med: array
        Estimated covariance matrix if method in {SparCC, clr},
        None otherwise.
              
    =========   ============   ===========   ==================================================================
    kwarg       Accepts        Default       Description
    =========   ============   ===========   ==================================================================
    iter        int            20            number of estimation iterations to average over.
    oprint      bool           True          print iteration progress?
    th          0 < th < 1     0.1           exclusion threshold for SparCC.
    xiter       int            10            number of exclusion iterations for SparCC.
    norm        str            dirichlet     method used to normalize the counts to fractions.
    log         bool           True          log-transform fractions? Only used when method is not SparCC/clr.
    =========   ============   ===========   ==================================================================
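
    Examples
    --------
    A minimal usage sketch; the counts file is the package's own example data
    and ``read_txt`` is the loader from io_methods:

    >>> counts = read_txt('example/fake_data.txt')
    >>> cor, cov = main(counts, method='SparCC', iter=20)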
    '''
    method = method.lower()
    cor_list = []  # list of cor matrices from different random fractions
    var_list = []  # list of cov matrices from different random fractions
    oprint = kwargs.pop('oprint', True)
    n_iter = kwargs.pop('iter', 20)  # number of iterations
    norm = kwargs.pop('norm', 'dirichlet')
    log = kwargs.pop('log', True)
    th = kwargs.setdefault(
        'th', 0.1)  # exclusion threshold for iterative sparse algo
    if method in ['sparcc', 'clr']:
        for i in range(n_iter):
            if oprint: print('\tRunning iteration ' + str(i))
            fracs = to_fractions(counts, method=norm)
            v_sparse, cor_sparse, cov_sparse = basis_corr(fracs,
                                                          method=method,
                                                          **kwargs)
            var_list.append(np.diag(cov_sparse))
            cor_list.append(cor_sparse)
        cor_array = np.array(cor_list)
        var_med = nanmedian(var_list, axis=0)  #median variances
        cor_med = nanmedian(cor_array, axis=0)  #median correlations
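        # Rebuild the covariance matrix from the medians:
        # cov_med[i, j] = cor_med[i, j] * sqrt(var_med[i] * var_med[j])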
        x, y = np.meshgrid(var_med, var_med)
        cov_med = cor_med * x**0.5 * y**0.5
    elif method in ['pearson', 'kendall', 'spearman']:
        n = counts.shape[1]
        cor_array = np.zeros((n_iter, n, n))
        for i in range(n_iter):
            if oprint: print('\tRunning iteration ' + str(i))
            fracs = to_fractions(counts, method=norm)
            if log:
                x = np.log(fracs)
            else:
                x = fracs
            cor_mat, pval = correlation(x, method, axis=0)
            cor_array[i, :, :] = cor_mat
        cor_med = np.median(cor_array, axis=0)  #median correlation
        cov_med = None
    return cor_med, cov_med
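# The re-estimation step above relies on to_fractions(), which is defined
# elsewhere in the SparCC package. A rough sketch of what its 'dirichlet' mode
# does (an assumption based on the docstring, not the package's actual code):
# draw each sample's fractions from a Dirichlet whose parameters are the
# observed counts plus a pseudocount of 1.
import numpy as np

def dirichlet_fractions_sketch(counts):
    # One Dirichlet draw per sample (row); rows of the result sum to 1.
    counts = np.asarray(counts, dtype=float)
    return np.vstack([np.random.dirichlet(row + 1) for row in counts])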
Example #2
def driver():
    #if __name__ == '__main__':
    #print "RUNNING CODE MAIN"
    ## parse input arguments
    from optparse import OptionParser
    kwargs = {}
    usage = (
        'Compute the correlation between components (e.g. OTUs).\n'
        'By default uses the SparCC algorithm to account for compositional effects.\n'
        'Correlation and covariance (when applicable) matrices are written out as txt files.\n'
        'The counts file needs to be a tab-delimited text file where columns are samples and rows are components (e.g. OTUs).\n'
        ' See example/fake_data.txt for an example file.\n'
        '\n'
        'Usage:   python SparCC.py counts_file [options]\n'
        'Example: python SparCC.py example/fake_data.txt -i 20 --cor_file=example/basis_corr/cor_mat_sparcc.out'
    )
    parser = OptionParser(usage)
    parser.add_option("-c",
                      "--cor_file",
                      dest="cor_file",
                      type='str',
                      help="File to which correlation matrix will be written.")
    parser.add_option("-v",
                      "--cov_file",
                      dest="cov_file",
                      type='str',
                      help="File to which covariance matrix will be written.")
    parser.add_option(
        "-a",
        "--algo",
        dest="algo",
        default='SparCC',
        help=
        "Name of algorithm used to compute correlations (SparCC (default) | pearson | spearman | kendall)"
    )
    parser.add_option(
        "-i",
        "--iter",
        dest='iter',
        type='int',
        default=20,
        help="Number of inference iterations to average over (20 default).")
    parser.add_option(
        "-x",
        "--xiter",
        dest='xiter',
        type='int',
        default=10,
        help=
        "Number of exclusion iterations to remove strongly correlated pairs (10 default)."
    )
    parser.add_option(
        "-t",
        "--thershold",
        dest='th',
        type='float',
        default=0.1,
        help="Correlation strength exclusion threshold (0.1 default).")
    parser.add_option("-p",
                      "--pval_file",
                      dest="pval_file",
                      type='str',
                      help="File to which pvalues will be written.")
    (options, args) = parser.parse_args()
    #print "OPTIONS: ", options
    #print "ARGS: ", args
    counts_file = args[0]

    from analysis_methods import basis_corr
    from io_methods import read_txt, write_txt

    kwargs = options.__dict__
    algo = kwargs.pop('algo')
    cor_file = kwargs.pop('cor_file')
    cov_file = kwargs.pop('cov_file')
    pval_file = kwargs.pop('pval_file')
    if cor_file is None: cor_file = 'cor_mat_' + algo + '.out'
    if cov_file is None: cov_file = 'cov_mat_' + algo + '.out'
    if pval_file is None: pval_file = 'pval_mat_' + algo + '.out'

    print('reading data')
    counts = read_txt(counts_file)

    ## Calculate correlations between components using SparCC
    print('computing correlations')
    cor, cov, pval = basis_corr(counts, method=algo, **kwargs)
    print(counts)
    ## write out results
    print('writing results')
    write_txt(cor, cor_file)
    print('wrote ' + cor_file)
    if cov is not None:
        write_txt(cov, cov_file)
        print('wrote ' + cov_file)

    if pval is not None:
        write_txt(pval, pval_file)
        help="Correlation strength exclusion threshold (0.1 default).")
    (options, args) = parser.parse_args()
    counts_file = args[0]

    from analysis_methods import basis_corr
    from io_methods import read_txt, write_txt

    kwargs = options.__dict__
    algo = kwargs.pop('algo')
    cor_file = kwargs.pop('cor_file')
    cov_file = kwargs.pop('cov_file')
    if cor_file is None: cor_file = 'cor_mat_' + algo + '.out'
    if cov_file is None: cov_file = 'cov_mat_' + algo + '.out'

    print 'reading data'
    counts = read_txt(counts_file)

    ## Calculate correlations between components using SparCC
    print 'computing correlations'
    cor, cov = basis_corr(counts, method=algo, **kwargs)

    ## write out results
    print 'writing results'
    write_txt(cor, cor_file)
    print 'wrote ' + cor_file
    if cov is not None:
        write_txt(cov, cov_file)
        print 'wrote ' + cov_file

    print 'Done!'
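# The commented-out guard near the top of driver() hints at how this script is
# meant to be run; a conventional entry point (not shown in the snippet, added
# here as an assumption) would be:
if __name__ == '__main__':
    driver()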
Example #4
def main(counts, method='sparcc', **kwargs):
    '''
    Compute correlations between all components of the counts matrix.
    Several iterations are run, the fractions are re-estimated in each,
    and the median over all iterations is returned.
    Running several iterations is only helpful with the 'dirichlet'
    normalization method; all other methods give identical results in
    every iteration, so set the 'iter' parameter to 1 when using them.
     
    Parameters
    ----------
    counts : DataFrame
        2D array of counts. Columns are components, rows are samples.
        If using 'dirichlet' or 'pseudo' normalization, 
        counts (positive integers) are required to produce meaningful results, 
        though this is not explicitly checked by the code.  
    method : str, optional (default 'SparCC')
        The algorithm to use for computing correlation.
        Supported values: SparCC, clr, pearson, spearman, kendall
        Note that the pearson, spearman, kendall methods are not
        altered to account for the fact that the data is compositional,
        and are provided to facilitate comparisons to 
        the clr and sparcc methods.

    Returns
    -------
    cor_med: array
        Estimated correlation values.
    cov_med: array
        Estimated covariance matrix if method in {SparCC, clr},
        None otherwise.
    pval: array
        P-values (from the last iteration) if method in
        {pearson, spearman, kendall}, None otherwise.
              
    =========   ============   ===========   ==================================================================
    kwarg       Accepts        Default       Description
    =========   ============   ===========   ==================================================================
    iter        int            20            number of estimation iterations to average over.
    oprint      bool           True          print iteration progress?
    th          0 < th < 1     0.1           exclusion threshold for SparCC.
    xiter       int            10            number of exclusion iterations for SparCC.
    norm        str            dirichlet     method used to normalize the counts to fractions.
    log         bool           True          log-transform fractions? Only used when method is not SparCC/clr.
    =========   ============   ===========   ==================================================================
    '''
    #print "RUNNING FUNCTION MAIN"
    method = method.lower()
    cor_list = []  # list of cor matrices from different random fractions
    var_list = []  # list of cov matrices from different random fractions
    oprint = kwargs.pop('oprint', True)
    n_iter = kwargs.pop('iter', 20)  # number of iterations
    norm = kwargs.pop('norm', 'normalize')  # unused here: the to_fractions() call below is commented out
    log = kwargs.pop('log', True)  # likewise unused: the log-transform branch is commented out
    th = kwargs.setdefault(
        'th', 0.1)  # exclusion threshold for iterative sparse algo
    if method in ['sparcc', 'clr']:
        for i in range(n_iter):
            if oprint: print('\tRunning iteration ' + str(i))
            #fracs = to_fractions(counts, method=norm)
            fracs = counts
            # Replace zero counts with a tiny positive value, presumably to
            # keep downstream log/ratio computations in basis_corr() finite.
            for col in fracs.keys():
                for otu in fracs[col].keys():
                    if fracs[col][otu] == 0:
                        fracs[col][otu] = 1e-14
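            # Note: with a pandas DataFrame the nested loop above is roughly
            # equivalent to the one-liner (an assumed alternative, not in the
            # original):  fracs = counts.replace(0, 1e-14)
            # (replace() returns a copy, whereas the loop modifies counts in place).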
            #print "FRACS: ", fracs
            v_sparse, cor_sparse, cov_sparse = basis_corr(fracs,
                                                          method=method,
                                                          **kwargs)
            #print "V SPARSE: ", v_sparse
            #print "COR SPARSE: ", cor_sparse
            #print "COV SPARSE: ", cov_sparse
            var_list.append(np.diag(cov_sparse))
            cor_list.append(cor_sparse)
        cor_array = np.array(cor_list)
        #print "VAR MED"
        #print "VARLIST: ", var_list
        var_med = nanmedian(var_list, axis=0)  #median variances
        #print "COR MED"
        cor_med = nanmedian(cor_array, axis=0)  #median correlations
        #print "MESH GRID"
        x, y = np.meshgrid(var_med, var_med)
        #print "COV MED"
        cov_med = cor_med * x**0.5 * y**0.5
        pval = None
    elif method in ['pearson', 'kendall', 'spearman']:
        n = counts.shape[1]
        cor_array = np.zeros((n_iter, n, n))
        for i in range(n_iter):
            if oprint: print('\tRunning iteration ' + str(i))
            fracs = counts
            # Attempting this scaling in Pearson/Spearman as well
            #for col in fracs.keys():
            #   for otu in fracs[col].keys():
            #      if (fracs[col][otu] == 0):
            #         fracs[col][otu] = random.randint(1,9)*math.pow(10,-14)
            #print "SET TO: ", fracs[col][otu]
            #fracs = to_fractions(counts, method=norm)
            #if log:
            #    x = np.log(fracs)
            #else:
            #    x = fracs
            x = fracs
            cor_mat, pval = correlation(x, method, axis=0)
            #print "COR_MAT: ", cor_mat
            #print "PVAL: ", pval
            cor_array[i, :, :] = cor_mat
        cor_med = np.median(cor_array, axis=0)  #median correlation
        #print "COR_MED: ", cor_med
        cov_med = None
    return cor_med, cov_med, pval
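# Unlike the first version of main() above, this variant also returns a
# p-value matrix, so call sites unpack three values. A hedged usage sketch
# (counts is assumed to be a samples-by-OTUs DataFrame loaded elsewhere):
#
#     cor, cov, pval = main(counts, method='spearman', iter=1, oprint=False)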