Example No. 1
def _spearman_r(a, b, weights, axis, skipna):
    """ndarray implementation of scipy.stats.spearmanr.

    Parameters
    ----------
    a : ndarray
        Input array.
    b : ndarray
        Input array.
    weights : ndarray
        Weights to apply along the correlation axis.
    axis : int
        The axis to apply the correlation along.
    skipna : bool
        If True, skip NaNs when computing function.

    Returns
    -------
    res : ndarray
        Spearman's rank correlation coefficient.

    See Also
    --------
    scipy.stats.spearmanr
    """
    if skipna:
        a, b, weights = _match_nans(a, b, weights)
    _a = rankdata(a, axis=axis)
    _b = rankdata(b, axis=axis)
    return _pearson_r(_a, _b, weights, axis, skipna)
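As a point of reference, here is a minimal self-contained sketch of the same rank-then-Pearson idea using plain NumPy and Bottleneck. The helper below is an illustrative stand-in, not the project's _pearson_r, and it ignores the weights and NaN handling that _pearson_r and _match_nans provide.

import numpy as np
import bottleneck as bn

def _pearson_r_sketch(a, b, axis):
    # unweighted Pearson correlation along `axis`
    a = a - a.mean(axis=axis, keepdims=True)
    b = b - b.mean(axis=axis, keepdims=True)
    num = (a * b).sum(axis=axis)
    den = np.sqrt((a ** 2).sum(axis=axis) * (b ** 2).sum(axis=axis))
    return num / den

x = np.random.rand(3, 100)
y = x ** 2 + 0.1 * np.random.rand(3, 100)
# Spearman = Pearson computed on the ranks
rho = _pearson_r_sketch(bn.rankdata(x, axis=1), bn.rankdata(y, axis=1), axis=1)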
Example No. 2
def spearmanr(array2d):
    """
    Spearman correlation coefficient between the columns of a matrix, handling missing values and ties.
    Should give the same result as cor(x, method="spearman", use="pairwise") in R.
    """

    ra = np.ma.masked_invalid(bn.rankdata(array2d, axis=0))

    ncols = ra.shape[1]
    cor_mat = np.empty((ncols, ncols), dtype=float)
    for j in range(ncols):
        x = ra[:, j]
        xm = ra.mask[:, j]

        for k in range(ncols):
            if j == k:
                cor_mat[j, k] = 1
                continue
            if k > j:
                continue
            y = ra[:, k]
            ym = ra.mask[:, k]
            both_valid = np.logical_not(np.logical_or(xm, ym))
            r = np.corrcoef(x[both_valid], y[both_valid])[1, 0]
            cor_mat[j, k] = r
            cor_mat[k, j] = r

    return (cor_mat)
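For fully observed data, the same column-wise matrix can be obtained with SciPy as a sanity check; the function above additionally handles pairwise missing values. The small random matrix below is purely illustrative.

import numpy as np
import scipy.stats

data = np.random.rand(20, 3)
rho = scipy.stats.spearmanr(data).correlation   # 3 x 3 Spearman matrix over columns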
Example No. 3
def normalize_cells(X, ranked=True):
    """

    Scale matrix so that all cells (rows) are mean-centered and have an L2 norm of 1

    Arguments:
        X {array} -- Cell x gene matrix (sparse or dense)

    Keyword Arguments:
        ranked {bool} -- Indicator whether to rank cells (default: {True})

    Returns:
        np.ndarray -- Cells x genes matrix of normalized cells
    """
    if sparse.issparse(X):
        res = X.toarray()
    else:
        res = X
    if ranked:
        res = bottleneck.rankdata(res, axis=1)

    avg = np.mean(res, axis=1)
    res -= avg[:, None]

    norm = np.sqrt(bottleneck.nansum(res**2, axis=1))[:, None]
    res /= norm
    return res
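A quick usage sketch, assuming the function above and its imports (numpy, bottleneck, scipy.sparse) are in scope; the input values are made up.

import numpy as np

X = np.array([[1.0, 5.0, 2.0],
              [3.0, 3.0, 9.0]])
out = normalize_cells(X)
print(np.linalg.norm(out, axis=1))   # each row now has unit L2 norm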
Example No. 4
def combine_fcast_and_mcli(fcast, mcli):
    big_ds = xr.concat([
        mcli['Pressure'].drop('timestr'), fcast['Pressure'].expand_dims('time')
    ],
                       dim='time')
    percentile = bottleneck.rankdata(big_ds, axis=0) / len(big_ds['time'])
    return percentile
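The rank-to-percentile step on its own, with plain NumPy/Bottleneck and made-up pressure values:

import numpy as np
import bottleneck as bn

pressure = np.array([1012.0, 1008.5, 1015.2, 1009.9])      # one value per time step
percentile = bn.rankdata(pressure, axis=0) / len(pressure)
# -> array([0.75, 0.25, 1.  , 0.5 ])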
Example No. 5
def roc_auc_score_bottleneck(actual, predicted, approx=False):
    if approx: r = np.argsort(predicted)
    else: r = rankdata(predicted)
    n_pos = np.sum(actual)
    n_neg = len(actual) - n_pos
    sum1 = (np.sum(r[actual == 1]) - n_pos * (n_pos + 1) / 2)
    print(f"bottleneck nPos {n_pos}  nNeg {n_neg}, sum {sum1}")
    return sum1 / (n_pos * n_neg)
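A usage sketch with made-up labels and scores, assuming the function above is in scope together with numpy and bottleneck's rankdata:

import numpy as np
from bottleneck import rankdata

actual = np.array([0, 0, 1, 1, 0, 1])
predicted = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.7])
auc = roc_auc_score_bottleneck(actual, predicted)
# 8 of the 9 positive/negative pairs are ordered correctly, so auc == 8/9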
Example No. 6
def rank(data, nan_val):
    """Rank-normalize data

    Rank standardize in place.

    Does not return a value.
    Arguments:
        data {np.array} -- Array of data
        nan_val {float} -- Value assigned to non-finite entries
    """
    finite = np.isfinite(data)
    ranks = bottleneck.rankdata(data[finite]).astype(data.dtype)

    ranks -= 1
    top = np.max(ranks)
    ranks /= top
    data[...] = nan_val
    data[np.where(finite)] = ranks
    del ranks, finite
    gc.collect()
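A usage sketch with made-up values, assuming numpy, bottleneck, and gc are imported as in the source module; note that the array is modified in place.

import numpy as np

arr = np.array([3.0, np.nan, 1.0, 2.0])
rank(arr, nan_val=-1.0)
print(arr)   # -> [ 1.  -1.   0.   0.5]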
Example No. 7
def lilliefors_Dcrit_gev(n_obs, significance_levels, shape, n_sample=10000):
    """Estimate the critical values of the KS test using statistical simulation.
    See also:
    Wilks, D. S. (2011).
    Frequentist Statistical Inference.
    International Geophysics, 100, 133–186.
    https://doi.org/10.1016/B978-0-12-385022-5.00005-1
    """
    q_levels = [1-i for i in significance_levels]
    D_list = []
    for i in range(n_sample):
        ams_sim = scipy.stats.genextreme.rvs(c=shape, size=n_obs)
        rank = bottleneck.rankdata(ams_sim)
        ecdf = rank / n_obs
        loc, scale, shape = ev_fit.gev_pwm(ams_sim, ecdf, n_obs,
                                           ax_year=0, shape=np.full((1), shape))
        cdf = ev_fit.gev_cdf(ams_sim, loc, scale, shape)
        D = np.abs(ecdf-cdf).max()
        D_list.append(D)
    return np.quantile(D_list, q_levels)
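A rough, self-contained variant of the same simulation using only SciPy, with scipy.stats.genextreme.fit standing in for the project's ev_fit PWM estimator; the resulting critical values will therefore differ somewhat from the original.

import numpy as np
import scipy.stats

def lilliefors_Dcrit_gev_sketch(n_obs, significance_levels, shape, n_sample=1000):
    q_levels = [1 - s for s in significance_levels]
    D_list = []
    for _ in range(n_sample):
        sim = scipy.stats.genextreme.rvs(c=shape, size=n_obs)
        ecdf = scipy.stats.rankdata(sim) / n_obs
        c_hat, loc_hat, scale_hat = scipy.stats.genextreme.fit(sim)
        cdf = scipy.stats.genextreme.cdf(sim, c=c_hat, loc=loc_hat, scale=scale_hat)
        D_list.append(np.abs(ecdf - cdf).max())
    return np.quantile(D_list, q_levels)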
Example No. 8
def create_nw_spearman(data):
    """Create co-expression network

    Computes the co-expression network using Spearman correlation and then ranks the network

    Arguments:
        data {array} -- Cells x Genes array of floats, either dense or sparse

    Returns:
        np.ndarray -- co-expression network (2-D dense array)
    """
    if sparse.issparse(data):
        data = data.toarray()
    data = bottleneck.rankdata(data, axis=0)

    nw = np.corrcoef(data, rowvar=False)
    np.fill_diagonal(nw, 1)
    rank(nw, nan_val=0)
    np.fill_diagonal(nw, 1)
    return nw
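Usage sketch with random data, assuming the rank() helper from Example No. 6 and the imports (numpy, scipy.sparse, bottleneck) are available:

import numpy as np

expr = np.random.rand(50, 4)          # 50 cells x 4 genes
nw = create_nw_spearman(expr)
print(nw.shape)                       # (4, 4); off-diagonal entries rank-standardized to [0, 1]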
Example No. 9
def compute_aurocs(votes, positives=None, compute_p=False):
    """Compute AUROCs based on neighbor voting and candidate identities


    Arguments:
        votes {pd.DataFrame} -- DataFrame with votes for cell types
        

    Keyword Arguments:
        positives {Vector} -- Vector of assignments for positive values. If left empty,
        cells are assumed to be the row names of the votes matrix (default: {None})
        compute_p {bool} -- Whether to compute the p-value for the AUROC (default: {False})
    Returns:
        pd.DataFrame -- DataFrame of testing cell types x training cell types
    """
    res_col = votes.columns
    if positives is None:
        positives = design_matrix(votes.index)
    res_idx = positives.columns
    positives = positives.values

    n_pos = bottleneck.nansum(positives, axis=0)
    n_neg = positives.shape[0] - n_pos

    sum_pos_ranks = positives.T @ bottleneck.rankdata(votes.values, axis=0)
    roc = sum_pos_ranks / n_pos[:, None]
    roc -= (n_pos[:, None] + 1) / 2
    roc /= n_neg[:, None]

    if compute_p:
        n_pos = n_pos[:, None]
        n_neg = n_neg[:, None]

        U = roc * n_pos * n_neg
        Z = (np.abs(U -
                    (n_pos * n_neg / 2))) / np.sqrt(n_pos * n_neg *
                                                    (n_pos + n_neg + 1) / 12)
        p = stats.norm.sf(Z)
        p = pd.DataFrame(p, index=res_idx, columns=res_col)
        return pd.DataFrame(roc, index=res_idx, columns=res_col), p
    return pd.DataFrame(roc, index=res_idx, columns=res_col)
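A small usage sketch with made-up votes, passing positives explicitly so that the design_matrix helper (not shown here) is not needed; assumes pandas, numpy, bottleneck, and scipy.stats are imported as in the source module.

import numpy as np
import pandas as pd

votes = pd.DataFrame({'typeA': [0.9, 0.8, 0.1, 0.2],
                      'typeB': [0.1, 0.2, 0.7, 0.9]})
positives = pd.DataFrame({'typeA': [1, 1, 0, 0],
                          'typeB': [0, 0, 1, 1]})
aurocs = compute_aurocs(votes, positives=positives)
# perfect separation here, so the diagonal is 1.0 and the off-diagonal 0.0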
Example No. 10
def spearman_correlation(x, y):
    """Spearman's rho, or rank correlation coefficient.
    From http://xarray.pydata.org/en/stable/dask.html#automatic-parallelization. """
    x_ranks = bn.rankdata(x, axis=-1)
    y_ranks = bn.rankdata(y, axis=-1)
    return pearson_correlation(x_ranks, y_ranks)
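The pearson_correlation helper is not shown in this example. Below is a sketch in the spirit of the xarray documentation that the docstring cites; the helper, the wrapper, and the 'time' dimension name are assumptions rather than the project's actual code, and spearman_correlation (with its bn import) from above is assumed to be in scope.

import numpy as np
import xarray as xr

def pearson_correlation(x, y):
    # Pearson correlation along the last axis of two equally shaped arrays
    x = x - x.mean(axis=-1, keepdims=True)
    y = y - y.mean(axis=-1, keepdims=True)
    return (x * y).sum(axis=-1) / np.sqrt((x ** 2).sum(axis=-1) * (y ** 2).sum(axis=-1))

def spearman_over_dim(da_a, da_b, dim='time'):
    # apply spearman_correlation lazily over (possibly dask-backed) DataArrays
    return xr.apply_ufunc(
        spearman_correlation, da_a, da_b,
        input_core_dims=[[dim], [dim]],
        dask='parallelized', output_dtypes=[float])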
Example No. 11
def run_cm_mixomics_test(sample, feature, random_states, root):
    """
    Compare corrmapper with mixomics and marginal corr networks
    """
    
    import os
    import numpy as np
    import copy
    import pandas as pd
    import scipy as sp
    import bottleneck as bn
    from sklearn.datasets import make_sparse_spd_matrix
    from sklearn.preprocessing import StandardScaler
    from rpy2.robjects import numpy2ri
    from rpy2.robjects.packages import importr
    
    from cmTest import test_utils as tu
    from cmTest import hugeR

    for random_state in random_states:

        # ----------------------------------------------------------------------
        # SIMULATE DATASET
        # ----------------------------------------------------------------------
        
        informative = int(feature * .05)
        
        n_features = feature
        n_informative = informative
        n_redundant = 0
        n_relevant = n_informative + n_redundant
        s = sample
        f = feature
        i = informative
        i_half = n_informative // 2
        r = n_redundant
        prec_real = make_sparse_spd_matrix(n_informative, smallest_coef=.4,
                                           alpha=.98, largest_coef=.8,
                                           random_state=random_state)

        cov_real = sp.linalg.inv(prec_real)
        d = np.sqrt(np.diag(cov_real))
        # divide through cols
        cov_real /= d
        # divide through rows
        cov_real /= d[:, np.newaxis]
        prec_real *= d
        prec_real *= d[:, np.newaxis]

        covs = [cov_real, cov_real]
        X, y = tu.make_classification(n_samples=s, n_features=f,
                                      n_clusters_per_class=1, n_informative=i,
                                      n_redundant=r, shuffle=False,
                                      class_sep=.25, cov=covs,
                                      random_state=random_state)

        # ----------------------------------------------------------------------
        # SPLIT DATASET INTO TWO
        # ----------------------------------------------------------------------

        # split relevant features equally and randomly
        X1_rel_feats = set(np.random.choice(n_relevant, n_relevant // 2, replace=False))
        X2_rel_feats = set(range(n_relevant)) - X1_rel_feats

        # now split all other features randomly and equally
        other_n = n_features - n_relevant
        X1_other_feats = set(np.random.choice(range(n_relevant, n_features), other_n // 2, replace=False))
        X2_other_feats = set(range(n_relevant, n_features)) - X1_other_feats

        # merge relevant and irrelevant features
        X1_feats = np.array(sorted(list(X1_other_feats.union(X1_rel_feats))))
        X2_feats = np.array(sorted(list(X2_other_feats.union(X2_rel_feats))))

        # check we have each feature only once
        assert sorted(set(X1_feats).union(X2_feats)) == list(range(n_features))

        # define X1 and X2
        X1 = X[:, X1_feats]
        X2 = X[:, X2_feats]
        datasets_original = [X1, X2]

        # these two lists will keep track of the features we are left with from
        # the original data matrix. feature nums < n_informative are informative.
        X12_feats_original = [X1_feats, X2_feats]

        # ----------------------------------------------------------------------
        # START PIPELINE
        # ----------------------------------------------------------------------
        
        file_id = ("samp_%d_feat_%d_rand_%d" % (s, f, random_state))
        results_folder = root + "cmTest/results2/"

        # this ensures that we only run analyses that aren't already finished
        result_file = results_folder + file_id + '.txt'
        file_exist = os.path.isfile(result_file)
        fs_methods = ["fdr", "l1svc", "boruta", "jmi"]
        ss = StandardScaler()
        if not file_exist or (file_exist and os.stat(result_file).st_size == 0):
            o = open(result_file, 'w')
            
            # -----------------------------------------------------------------            
            # CORRMAPPER
            o.write('Method, Prec, Recall\n')
            for fs_method in fs_methods:
                no_feat = False
                X12_feats = [copy.deepcopy(X12_feats_original[0]),
                             copy.deepcopy(X12_feats_original[1])]
                datasets = [copy.deepcopy(datasets_original[0]),
                            copy.deepcopy(datasets_original[1])]
                for i, dataset in enumerate(datasets):
                    # variance filtering
                    dataset = pd.DataFrame(dataset)
                    two_n = int(2 * dataset.shape[0])
                    top_var_ix = np.array(sorted(np.argsort(dataset.var())[-two_n:]))
                    dataset = dataset[top_var_ix].values
                    # update features of the datasets
                    X12_feats[i] = X12_feats[i][top_var_ix]

                    # FS
                    try:
                        sel = tu.do_fs(dataset, y, fs_method)
                        if len(sel) == 0:
                            no_feat = True
                        else:
                            # update features of the datasets
                            X12_feats[i] = X12_feats[i][sel]
                            datasets[i] = dataset[:, sel]
                    except:
                        no_feat = True

                if not no_feat:
                    # concatenate datasets
                    dataset1 = pd.DataFrame(datasets[0])
                    dataset2 = pd.DataFrame(datasets[1])
                    merged_datasets_df = dataset1.join(dataset2, how='inner',
                                                       lsuffix='_data1',
                                                       rsuffix='_data2')
                    X_fs = merged_datasets_df.values
                    # standardise
                    X_fs = ss.fit_transform(X_fs)
                    
                    # run hugeR's glasso and StARS
                    cov, prec = hugeR.hugeR(X_fs, 0.05)

                    # match features to original informative ones, check docstring
                    # of translate_estimated_matrix_into_original for explanation
                    if prec.shape[0] > 1:
                        prec = tu.translate_estimated_m_into_original(prec, X12_feats,
                                                                      informative)
                        # we only compare the N12 network to make it fair for mixomics
                                                                      
                        p, r = tu.quality_of_graph(prec_real[i_half:,:i_half], 
                                                   prec[i_half:,:i_half], sym=False)
                    else:
                        p, r = np.nan, np.nan
                    o.write(','.join(map(str, [fs_method, p, r])) + '\n')
                    
            # reorder real precision for other two methods
            all_rel_feats = list(X1_rel_feats) + list(X2_rel_feats)
            prec_real2 = prec_real
            prec_real2 = prec_real2[all_rel_feats,:]
            prec_real2 = prec_real2[:,all_rel_feats]
            prec_real2 = prec_real2[i_half:,:i_half]

            # -----------------------------------------------------------------
            # GRAPH LASSO
            try:
                dataset = pd.DataFrame(X)
                two_n = int(2 * dataset.shape[0])
                top_var_ix = np.array(sorted(np.argsort(dataset.var())[-two_n:]))
                X_gl = dataset[top_var_ix].values                                
                X_gl = X[:, top_var_ix]
                X_gl = ss.fit_transform(X_gl)
                cov, prec = hugeR.hugeR(X_gl, 0.05)
                p, r = tu.quality_of_graph(prec_real[i_half:,:i_half], prec[X1.shape[1]:X1.shape[1]+i_half,:i_half], sym=False)
                o.write(','.join(map(str, ["glasso", p, r])) + '\n')
            except:
                o.write(','.join(map(str, ["glasso", np.nan, np.nan])) + '\n')
            
            # -----------------------------------------------------------------
            # MARGINAL CORR NETWORK
            cov_thresholds = [.05, .1, .2, .3, .5, .7, .8]
            
            rX = bn.rankdata(X, axis=0)
            marg_cov = np.corrcoef(rX, rowvar=0)
            marg_cov = np.abs(marg_cov[X1.shape[1]:X1.shape[1]+i_half,:i_half])
            for thresh in cov_thresholds:
                tmp_cov = (marg_cov > thresh).astype(int)
                p, r = tu.quality_of_graph(prec_real2, tmp_cov, sym=False)
                method = "marginal %f" % thresh
                o.write(','.join(map(str, [method, p, r])) + '\n')                                
            
            # -----------------------------------------------------------------            
            # MIXOMICS
            base = importr('base')
            # this allows us to send numpy to R directly, neat
            numpy2ri.activate()
            mo = importr('mixOmics')
            
            mo_spls_model = mo.spls(X1, X2, ncomp = 3, keepX=i_half, keepY=i_half)
            mo_network = mo.network(mo_spls_model)
            mo_cov = np.array(base.as_data_frame(mo_network.rx('M')))
            numpy2ri.deactivate()
            mo_cov = np.abs(mo_cov[:i_half,:i_half])
            for thresh in cov_thresholds:
                tmp_cov = (mo_cov > thresh).astype(int)
                p, r = tu.quality_of_graph(prec_real2, tmp_cov, sym=False)
                method = "mixomics %f" % thresh
                o.write(','.join(map(str, [method, p, r])) + '\n')                                
            
            o.close()
Example No. 12
def spearman_correlation_gufunc(x, y):
    x_ranks = bottleneck.rankdata(x, axis=-1)
    y_ranks = bottleneck.rankdata(y, axis=-1)
    return pearson_correlation_gufunc(x_ranks, y_ranks)
Example No. 13
def ev_centrality_dist(net, core, show_plot = False, save_plot = True, outfile = None):
    '''
    Calculate eigenvector centrality, an empirical CDF, and ranking for each vertex.  
    Plot both centrality x empirical CDF and centrality x ranking, highlighting core vertices.
    Note that the plot is saved as a file only if *both* `save_plot` is true and
    output filename are given.  
    
    :param net: The network whose degree distribution we'd like to plot
    :param core: The property map of core vertices
    :param show_plot: Show the plot on the screen?
    :param save_plot: Save the plot as a file?
    :param outfile: Filename to use to save the plot
    
    :return: The CDF and ranking plots. 
    '''
    # Calculate eigenvector centrality and write it into the graph
    print('Calculating eigenvector centrality')
    net.vp['evc'] = gt.eigenvector(net, epsilon=1e-03)[1]
    print('Done')
    # Extract them into a useful format
    eigen_central = net.vp['evc'].get_array().tolist()
    # x values: centralities
    centralities = list(set(eigen_central))
    # Use the ecdf to build the y values
    eigen_central_ecdf = ecdf(eigen_central)
    # Use 1-ecdf for legibility when most nodes have centrality near 0
    centrality_distribution = \
        [1 - eigen_central_ecdf(centrality) for centrality in centralities]
    # Write 1-ecdf into the graph
    net.vp['evc ecdf'] = \
        net.new_vertex_property('float',
            vals = [1 - eigen_central_ecdf(net.vp['evc'][vertex])
                        for vertex in net.vertices()])

    # Rank the vertices by eigenvector centrality
    vertex_ranking = len(eigen_central) - bn.rankdata(eigen_central) + 1
    # Write them into the graph
    net.vp['evc rank'] = net.new_vertex_property('int', vals = vertex_ranking)
    #print(vertex_ranking)
    print('Mapping rankings to centralities')
    # Map these against `centralities`:  
    #  for each degree, get the index of its first occurrence in the 
    #  vertex-level list `eigen_central`; that index corresponds to the 
    #  index in `vertex_ranking`
    ranking = [vertex_ranking[eigen_central.index(centrality)] 
                for centrality in centralities]
    
     # Combine into a single data frame
    centrality_dist = pd.DataFrame({'centrality': centralities,
                                    'density': centrality_distribution,
                                    'rank': ranking})
    #print(centrality_dist.head())

    # Grab centralities and rankings for the core vertices
    centralities_core = [net.vp['evc'][vertex] for vertex in core]
    centrality_distribution_core = [net.vp['evc ecdf'][vertex] for vertex in core]
    ranking_core = [net.vp['evc rank'][vertex] for vertex in core]
    centrality_dist_core = \
        pd.DataFrame({'centrality': centralities_core,
                        'density': centrality_distribution_core,
                        'rank': ranking_core})
    #print(centrality_dist_core)
    print('Summary statistics for core vertex centralities:')
    print(pd.DataFrame({k: summary(centrality_dist_core[k]) for k in centrality_dist_core}))
    
    # Build the plot
    density_plot = ggplot(aes(x = 'centrality'), data = centrality_dist) +\
            geom_area(aes(ymin = 0, ymax = 'density', fill = 'blue'), alpha = .3) +\
            geom_line(aes(y = 'density'), color = 'blue', alpha = .8) +\
            xlab('Eigenvector centrality') +\
            ylab('1 - Cumulative probability density') +\
            scale_x_log10() + scale_y_log10() +\
            theme_bw()
    #Add a rug for the core vertices
    density_plot = density_plot + \
        geom_point(aes(x = 'centrality', y = 'density'),
                shape = '+', size = 250, alpha = .8, color = 'red',
                data = centrality_dist_core)
    
    # If requested, show the plot
    if show_plot:
        print(density_plot)
    
    # Save to disk
    if outfile is not None and save_plot:
        ggsave(filename = outfile + '.evc_density' + '.pdf', plot = density_plot)
    
    # Same thing for degree x ranking
    ranking_plot = ggplot(aes(x = 'centrality'), data = centrality_dist) +\
            geom_area(aes(ymin = 0, ymax = 'rank', fill = 'blue'), alpha = .3) +\
            geom_line(aes(y = 'rank'), color = 'blue', alpha = .8) +\
            xlab('Eigenvector centrality') +\
            ylab('Rank') +\
            scale_x_log10() + scale_y_log10() +\
            theme_bw()
    ranking_plot = ranking_plot +\
        geom_point(aes(x = 'centrality', y = 'rank'),
                shape = '+', size = 250, alpha = .8, color = 'red',
                data = centrality_dist_core)
    if show_plot:
        print(ranking_plot)
    if outfile is not None and save_plot:
        ggsave(filename = outfile + '.evc_rank' + '.pdf', plot = ranking_plot)
    
    return(density_plot, ranking_plot)
Example No. 14
def degree_dist(net, core, show_plot = False, save_plot = True, outfile = None):
    '''
    Calculate out degree, an empirical CDF, and ranking for each vertex.  
    Plot both degree x empirical CDF and degree x ranking, highlighting core vertices.
    Note that the plot is saved as a file only if *both* `save_plot` is true and
    output filename are given.  
    
    :param net: The network whose degree distribution we'd like to plot
    :param core: The property map of core vertices
    :param show_plot: Show the plot on the screen?
    :param save_plot: Save the plot as a file?
    :param outfile: Filename to use to save the plot
    
    :return: The CDF and ranking plots. 
    '''
    # Build degree distribution
    # Out degree for every vertex
    out_degrees = [vertex.out_degree() for vertex in net.vertices()]
    # Write them into the graph
    net.vp['out-degree'] = net.new_vertex_property('int', vals = out_degrees)
    #  x values: degrees
    degrees = list(set(out_degrees))
     #  Use the ecdf to build the y values
    out_degree_ecdf = ecdf(out_degrees)
    #  Use 1-ecdf for legibility when most nodes have degree near 0
    out_degree_dist = [1 - out_degree_ecdf(degree) for degree in degrees]
    # Write 1-ecdf into the graph
    net.vp['out-degree ecdf'] = \
        net.new_vertex_property('float', 
            vals = [1 - out_degree_ecdf(net.vp['out-degree'][vertex]) 
                        for vertex in net.vertices()])
    
    # Rank the vertices by out-degree
    vertex_ranking = len(out_degrees) - bn.rankdata(out_degrees) + 1
    # Write them into the graph
    net.vp['out-degree rank'] = net.new_vertex_property('int', vals = vertex_ranking)
    # Map these against `degree`:  
    #  for each degree, get the index of its first occurrence in the 
    #  vertex-level list `out_degrees`; that index corresponds to the 
    #  index in `vertex_ranking`
    ranking = [vertex_ranking[out_degrees.index(degree)] 
                for degree in degrees]
    
     # Combine into a single data frame
    degree_dist = pd.DataFrame({'degree': degrees, 
                                'density': out_degree_dist, 
                                'rank': ranking})
    
    # Grab the degrees and rankings for the core vertices
    out_degrees_core = [net.vp['out-degree'][vertex] for vertex in core]
    out_degree_dist_core = [net.vp['out-degree ecdf'][vertex] for vertex in core]
    ranking_core = [net.vp['out-degree rank'][vertex] for vertex in core]
    degree_dist_core = \
        pd.DataFrame({'degree': out_degrees_core, 
                        'density': out_degree_dist_core, 
                        'rank': ranking_core})
    #print(degree_dist_core)
    print('Summary statistics for core vertex out-degrees:')
    print(pd.DataFrame({k: summary(degree_dist_core[k]) for k in degree_dist_core}))

    # Build the degree x density plot
    density_plot = ggplot(aes(x = 'degree'),
                        data = degree_dist) +\
            geom_area(aes(ymin = 0, ymax = 'density', fill = 'blue'), alpha = .3) +\
            geom_line(aes(y = 'density', color = 'blue'), alpha = .8) +\
            xlab('Out-degree') +\
            ylab('1 - Cumulative probability density') +\
            scale_x_log10() + scale_y_log10() +\
            theme_bw()
    # Add a rug for the core vertices
    density_plot = density_plot + \
        geom_point(aes(x = 'degree', y = 'density'),
                shape = '+', size = 250, alpha = .8, color = 'red',
                data = degree_dist_core)

    # If requested, show the plot
    if show_plot:
        print(density_plot)
    
    # Save to disk
    if outfile is not None and save_plot:
        ggsave(filename = outfile + '.degree_density' + '.pdf', plot = density_plot)
        
    # Same thing for degree x ranking
    ranking_plot = ggplot(aes(x = 'degree'), data = degree_dist) +\
            geom_area(aes(ymin = 0, ymax = 'rank', fill = 'blue'), alpha = .3) +\
            geom_line(aes(y = 'rank', color = 'blue'), alpha = .8) +\
            xlab('Out-degree') +\
            ylab('Rank') +\
            scale_x_log10() + scale_y_log10() +\
            theme_bw()
    ranking_plot = ranking_plot +\
        geom_point(aes(x = 'degree', y = 'rank'),
                shape = '+', size = 250, alpha = .8, color = 'red',
                data = degree_dist_core)
    if show_plot:
        print(ranking_plot)
    if outfile is not None and save_plot:
        ggsave(filename = outfile + '.degree_rank' + '.pdf', plot = ranking_plot)
    
    return(density_plot, ranking_plot)
Example No. 15
def allPairs_sharpeWeightedRank_2D(datearray,symbols,adjClose,signal2D,LongPeriod,rankthreshold,riskDownside_min,riskDownside_max,rankThresholdPct):

    # adjClose      --     # 2D array with adjusted closing prices (axes are stock number, date)
    # rankthreshold --     # select this many funds with best recent performance

    import numpy as np
    from numpy import isnan
    import nose
    import os
    import sys
    try:
        import bottleneck as bn
        from bottleneck import rankdata as rd
    except ImportError:
        import scipy.stats.mstats as bn

    gainloss = np.ones((adjClose.shape[0],adjClose.shape[1]),dtype=float)
    gainloss[:,1:] = adjClose[:,1:] / adjClose[:,:-1]
    gainloss[isnan(gainloss)]=1.

    # convert signal2D to contain either 1 or 0 for weights
    signal2D -= signal2D.min()
    signal2D *= signal2D.max()

    # apply signal to daily gainloss
    gainloss = gainloss * signal2D
    gainloss[gainloss == 0] = 1.0

    value = 10000. * np.cumprod(gainloss,axis=1)

    # calculate gainloss over period of "LongPeriod" days
    monthgainloss = np.ones((adjClose.shape[0],adjClose.shape[1]),dtype=float)
    monthgainloss[:,LongPeriod:] = adjClose[:,LongPeriod:] / adjClose[:,:-LongPeriod]
    monthgainloss[isnan(monthgainloss)]=1.

    monthgainlossweight = np.zeros((adjClose.shape[0],adjClose.shape[1]),dtype=float)

    rankweight = 1./rankthreshold

    ########################################################################
    ## Calculate change in rank of active stocks each day (without duplicates as ties)
    ########################################################################
    monthgainlossRank = np.zeros((adjClose.shape[0],adjClose.shape[1]),dtype=int)
    monthgainlossPrevious = np.zeros((adjClose.shape[0],adjClose.shape[1]),dtype=float)
    monthgainlossPreviousRank = np.zeros((adjClose.shape[0],adjClose.shape[1]),dtype=int)

    
    ###
    ###
    ###
    
    monthgainlossRank = allPairsRanking( adjClose, symbols, datearray, span=LongPeriod )
    
    ###
    ###
    ###
    
    
    ########monthgainlossRank = bn.rankdata(monthgainloss,axis=0)
    # reverse the ranks (low ranks are biggest gainers)
    maxrank = np.max(monthgainlossRank)
    monthgainlossRank -= maxrank-1
    monthgainlossRank *= -1
    monthgainlossRank += 2

    monthgainlossPrevious[:,LongPeriod:] = monthgainloss[:,:-LongPeriod]
    monthgainlossPreviousRank = bn.rankdata(monthgainlossPrevious,axis=0)
    # reverse the ranks (low ranks are biggest gainers)
    maxrank = np.max(monthgainlossPreviousRank)
    monthgainlossPreviousRank -= maxrank-1
    monthgainlossPreviousRank *= -1
    monthgainlossPreviousRank += 2

    # weight deltaRank for best and worst performers differently
    rankoffsetchoice = rankthreshold
    delta = -( monthgainlossRank.astype('float') - monthgainlossPreviousRank.astype('float') ) / ( monthgainlossRank.astype('float') + float(rankoffsetchoice) )

    # if rank is outside acceptable threshold, set deltarank to zero so stock will not be chosen
    #  - remember that low ranks are biggest gainers
    rankThreshold = (1. - rankThresholdPct) * ( monthgainlossRank.max() - monthgainlossRank.min() )
    for ii in range(monthgainloss.shape[0]):
        for jj in range(monthgainloss.shape[1]):
            if monthgainloss[ii,jj] > rankThreshold :
                delta[ii,jj] = -monthgainloss.shape[0]/2
                if jj == monthgainloss.shape[1]:
                    print("*******setting delta (Rank) low... Stock has rank outside acceptable range... ", ii, symbols[ii], monthgainloss[ii,jj])

    # if adjClose is nan, set deltarank to zero so stock will not be chosen
    #  - remember that low ranks are biggest gainers
    rankThreshold = (1. - rankThresholdPct) * ( monthgainlossRank.max() - monthgainlossRank.min() )
    for ii in range(monthgainloss.shape[0]):
        if isnan( adjClose[ii,-1] )  :
            delta[ii,:] = -monthgainloss.shape[0]/2
            numisnans = adjClose[ii,:]
            # NaN in last value usually means the stock is removed from the index so is not updated, but history is still in HDF file
            print("*******setting delta (Rank) low... Stock has NaN for last value... ", ii, symbols[ii], numisnans[np.isnan(numisnans)].shape)

    deltaRank = bn.rankdata( delta, axis=0 )

    # reverse the ranks (low deltaRank have the fastest improving rank)
    maxrank = np.max(deltaRank)
    deltaRank -= maxrank-1
    deltaRank *= -1
    deltaRank += 2

    for ii in range(monthgainloss.shape[1]):
        if deltaRank[:,ii].min() == deltaRank[:,ii].max():
            deltaRank[:,ii] = 0.


    ########################################################################
    ## Hold values constant for calendar month (gains, ranks, deltaRanks)
    ########################################################################

    for ii in range(1,monthgainloss.shape[1]):
        if datearray[ii].month == datearray[ii-1].month:
            monthgainloss[:,ii] = monthgainloss[:,ii-1]
            delta[:,ii] = delta[:,ii-1]
            deltaRank[:,ii] = deltaRank[:,ii-1]


    ########################################################################
    ## Calculate number of active stocks each day
    ########################################################################

    # TODO: activeCount can be computed before loop to save CPU cycles
    # count number of unique values
    activeCount = np.zeros(adjClose.shape[1],dtype=float)
    for ii in np.arange(0,monthgainloss.shape[0]):
        firsttradedate = np.argmax( np.clip( np.abs( gainloss[ii,:]-1. ), 0., .00001 ) )
        activeCount[firsttradedate:] += 1

    minrank = np.min(deltaRank,axis=0)
    maxrank = np.max(deltaRank,axis=0)
    # convert rank threshold to equivalent percent of rank range

    rankthresholdpercentequiv = np.round(float(rankthreshold)*(activeCount-minrank+1)/adjClose.shape[0])
    ranktest = deltaRank <= rankthresholdpercentequiv

    ########################################################################
    ### Calculate downside risk measure for weighting stocks.
    ### Use 1./ movingwindow_sharpe_ratio for risk measure.
    ### Modify weights with 1./riskDownside and scale so they sum to 1.0
    ########################################################################

    riskDownside = 1. / move_sharpe_2D(adjClose,gainloss,LongPeriod)
    riskDownside = np.clip( riskDownside, riskDownside_min, riskDownside_max)

    riskDownside[isnan(riskDownside)] = np.max(riskDownside[~isnan(riskDownside)])
    for ii in range(riskDownside.shape[0]) :
        riskDownside[ii] = riskDownside[ii] / np.sum(riskDownside,axis=0)

    ########################################################################
    ### calculate equal weights for ranks below threshold
    ########################################################################

    elsecount = 0
    elsedate  = 0
    for ii in np.arange(1,monthgainloss.shape[1]) :
        if activeCount[ii] > minrank[ii] and rankthresholdpercentequiv[ii] > 0:
            for jj in range(value.shape[0]):
                test = deltaRank[jj,ii] <= rankthresholdpercentequiv[ii]
                if test == True :
                    monthgainlossweight[jj,ii]  = 1./rankthresholdpercentequiv[ii]
                    monthgainlossweight[jj,ii]  = monthgainlossweight[jj,ii] / riskDownside[jj,ii]
                else:
                    monthgainlossweight[jj,ii]  = 0.
        elif activeCount[ii] == 0 :
            monthgainlossweight[:,ii]  *= 0.
            monthgainlossweight[:,ii]  += 1./adjClose.shape[0]
        else :
            elsedate = datearray[ii]
            elsecount += 1
            monthgainlossweight[:,ii]  = 1./activeCount[ii]

    aaa = np.sum(monthgainlossweight,axis=0)


    allzerotest = np.sum(monthgainlossweight,axis=0)
    sumallzerotest = allzerotest[allzerotest == 0].shape[0]
    if sumallzerotest > 0:
        print("")
        print(" invoking correction to monthgainlossweight.....")
        print("")
        for ii in np.arange(1,monthgainloss.shape[1]) :
            if np.sum(monthgainlossweight[:,ii]) == 0:
                monthgainlossweight[:,ii]  = 1./activeCount[ii]

    print(" weights calculation else clause encountered :", elsecount, " times. last date encountered is ", elsedate)
    rankweightsum = np.sum(monthgainlossweight,axis=0)

    monthgainlossweight[isnan(monthgainlossweight)] = 0.  # changed result from 1 to 0

    monthgainlossweight = monthgainlossweight / np.sum(monthgainlossweight,axis=0)
    monthgainlossweight[isnan(monthgainlossweight)] = 0.  # changed result from 1 to 0

    # input symbols and company names from text file
    companyName_file = os.path.join( os.getcwd(), "symbols",  "companyNames.txt" )
    with open( companyName_file, "r" ) as f:
        companyNames = f.read()

    print("\n\n\n")
    companyNames = companyNames.split("\n")
    ii = companyNames.index("")
    del companyNames[ii]
    companySymbolList  = []
    companyNameList = []
    for iname,name in enumerate(companyNames):
        name = name.replace("amp;", "")
        testsymbol, testcompanyName = name.split(";")
        companySymbolList.append(testsymbol)
        companyNameList.append(testcompanyName)

    # print list showing current rankings and weights
    # - symbol
    # - rank
    # - weight from sharpe ratio
    # - price
    import os
    rank_text = "<div id='rank_table_container'><h3>"+"<p>Current stocks, with ranks, weights, and prices are :</p></h3><font face='courier new' size=3><table border='1'> \
               <tr><td>Rank \
               </td><td>Symbol \
               </td><td>Company \
               </td><td>Weight \
               </td><td>Price  \
               </td><td>Trend  \
               </td></tr>\n"
    for i, isymbol in enumerate(symbols):
        for j in range(len(symbols)):
            if int( deltaRank[j,-1] ) == i :
                if signal2D[j,-1] == 1.:
                    trend = 'up'
                else:
                    trend = 'down'

                # search for company name
                try:
                    symbolIndex = companySymbolList.index(symbols[j])
                    companyName = companyNameList[symbolIndex]
                except:
                    companyName = ""

                rank_text = rank_text + \
                       "<tr><td>" + format(deltaRank[j,-1],'6.0f')  + \
                       "<td>" + format(symbols[j],'5s')  + \
                       "<td>" + format(companyName,'15s')  + \
                       "<td>" + format(monthgainlossweight[j,-1],'5.03f') + \
                       "<td>" + format(adjClose[j,-1],'6.2f')  + \
                       "<td>" + trend  + \
                       "</td></tr>  \n"
    rank_text = rank_text + "</table></div>\n"

    filepath = os.path.join( os.getcwd(), "pyTAAA_web", "pyTAAAweb_RankList.txt" )
    with open( filepath, "w" ) as f:
        f.write(rank_text)

    print("leaving function sharpeWeightedRank_2D...")

    return monthgainlossweight
Example No. 16
    def rollingRankArgsort(array):
        try:
            return  bd.rankdata(array)[-1]

        except:
            return np.nan
for ii in np.arange(1,monthgainloss.shape[1]):
    if date[ii].month == date[ii-1].month:
        monthgainloss[:,ii] = monthgainloss[:,ii-1]
for ii in range(monthgainloss.shape[0]):
    plt.plot(datearray,monthgainloss[ii,:])

print(" monthgainloss check: ", monthgainloss[isnan(monthgainloss)].shape)

monthgainlossrange = np.ones(x.shape[0],dtype=float)
monthgainlossweight = np.zeros((x.shape[0],x.shape[2]),dtype=float)

rankthreshold = 9     # select this many funds with best recent performance
rankthreshold = 4     # select this many funds with best recent performance
rankthreshold = 4     # select this many funds with best recent performance
rankweight = 1./rankthreshold
monthgainlossrank = bn.rankdata(monthgainloss,axis=0)
rankmin = np.min(monthgainlossrank,axis=0)
rankmax = np.max(monthgainlossrank,axis=0)
rankcutoff = float(x.shape[0]-rankthreshold)/(x.shape[0]-1)*(rankmax-rankmin)*rankmin
ranktest = monthgainlossrank > rankcutoff
monthgainlossweight[ranktest == True]  = rankweight
rankweightsum = np.sum(monthgainlossweight,axis=0)

print(" 2a - rankweightsum check isnan: ", rankweightsum[isnan(rankweightsum)].shape[0])
print(" 2b - rankweightsum check isinf: ", rankweightsum[isinf(rankweightsum)].shape[0])
print(" 2c - rankweightsum check: zero  ", rankweightsum[where(rankweightsum==0)].shape[0])
print(" 2d - shape of rankweightsum :   ", rankweightsum.shape[0])
plt.figure(22)
plt.grid()
plt.title('rankweightsum')
plt.plot(datearray,rankweightsum)
Example No. 18
def spearman_correlation_gufunc(x, y):
    x_ranks = bottleneck.rankdata(x, axis=-1)
    y_ranks = bottleneck.rankdata(y, axis=-1)
    return pearson_correlation_gufunc(x_ranks, y_ranks)
Example No. 19
    def time_rankdata(self, dtype, shape):
        bn.rankdata(self.arr)
Example No. 20
def allPairs_sharpeWeightedRank_2D(datearray, symbols, adjClose, signal2D,
                                   LongPeriod, rankthreshold, riskDownside_min,
                                   riskDownside_max, rankThresholdPct):

    # adjClose      --     # 2D array with adjusted closing prices (axes are stock number, date)
    # rankthreshold --     # select this many funds with best recent performance

    import numpy as np
    from numpy import isnan
    import nose
    import os
    import sys
    try:
        import bottleneck as bn
        from bottleneck import rankdata as rd
    except ImportError:
        import scipy.stats.mstats as bn

    gainloss = np.ones((adjClose.shape[0], adjClose.shape[1]), dtype=float)
    gainloss[:, 1:] = adjClose[:, 1:] / adjClose[:, :-1]
    gainloss[isnan(gainloss)] = 1.

    # convert signal2D to contain either 1 or 0 for weights
    signal2D -= signal2D.min()
    signal2D *= signal2D.max()

    # apply signal to daily gainloss
    gainloss = gainloss * signal2D
    gainloss[gainloss == 0] = 1.0

    value = 10000. * np.cumprod(gainloss, axis=1)

    # calculate gainloss over period of "LongPeriod" days
    monthgainloss = np.ones((adjClose.shape[0], adjClose.shape[1]),
                            dtype=float)
    monthgainloss[:,
                  LongPeriod:] = adjClose[:,
                                          LongPeriod:] / adjClose[:, :
                                                                  -LongPeriod]
    monthgainloss[isnan(monthgainloss)] = 1.

    monthgainlossweight = np.zeros((adjClose.shape[0], adjClose.shape[1]),
                                   dtype=float)

    rankweight = 1. / rankthreshold

    ########################################################################
    ## Calculate change in rank of active stocks each day (without duplicates as ties)
    ########################################################################
    monthgainlossRank = np.zeros((adjClose.shape[0], adjClose.shape[1]),
                                 dtype=int)
    monthgainlossPrevious = np.zeros((adjClose.shape[0], adjClose.shape[1]),
                                     dtype=float)
    monthgainlossPreviousRank = np.zeros(
        (adjClose.shape[0], adjClose.shape[1]), dtype=int)

    ###
    ###
    ###

    monthgainlossRank = allPairsRanking(adjClose,
                                        symbols,
                                        datearray,
                                        span=LongPeriod)

    ###
    ###
    ###

    ########monthgainlossRank = bn.rankdata(monthgainloss,axis=0)
    # reverse the ranks (low ranks are biggest gainers)
    maxrank = np.max(monthgainlossRank)
    monthgainlossRank -= maxrank - 1
    monthgainlossRank *= -1
    monthgainlossRank += 2

    monthgainlossPrevious[:, LongPeriod:] = monthgainloss[:, :-LongPeriod]
    monthgainlossPreviousRank = bn.rankdata(monthgainlossPrevious, axis=0)
    # reverse the ranks (low ranks are biggest gainers)
    maxrank = np.max(monthgainlossPreviousRank)
    monthgainlossPreviousRank -= maxrank - 1
    monthgainlossPreviousRank *= -1
    monthgainlossPreviousRank += 2

    # weight deltaRank for best and worst performers differently
    rankoffsetchoice = rankthreshold
    delta = -(monthgainlossRank.astype('float') -
              monthgainlossPreviousRank.astype('float')) / (
                  monthgainlossRank.astype('float') + float(rankoffsetchoice))

    # if rank is outside acceptable threshold, set deltarank to zero so stock will not be chosen
    #  - remember that low ranks are biggest gainers
    rankThreshold = (1. - rankThresholdPct) * (monthgainlossRank.max() -
                                               monthgainlossRank.min())
    for ii in range(monthgainloss.shape[0]):
        for jj in range(monthgainloss.shape[1]):
            if monthgainloss[ii, jj] > rankThreshold:
                delta[ii, jj] = -monthgainloss.shape[0] / 2
                if jj == monthgainloss.shape[1]:
                    print("*******setting delta (Rank) low... Stock has rank outside acceptable range... ",
                          ii, symbols[ii], monthgainloss[ii, jj])

    # if adjClose is nan, set deltarank to zero so stock will not be chosen
    #  - remember that low ranks are biggest gainers
    rankThreshold = (1. - rankThresholdPct) * (monthgainlossRank.max() -
                                               monthgainlossRank.min())
    for ii in range(monthgainloss.shape[0]):
        if isnan(adjClose[ii, -1]):
            delta[ii, :] = -monthgainloss.shape[0] / 2
            numisnans = adjClose[ii, :]
            # NaN in last value usually means the stock is removed from the index so is not updated, but history is still in HDF file
            print("*******setting delta (Rank) low... Stock has NaN for last value... ",
                  ii, symbols[ii], numisnans[np.isnan(numisnans)].shape)

    deltaRank = bn.rankdata(delta, axis=0)

    # reverse the ranks (low deltaRank have the fastest improving rank)
    maxrank = np.max(deltaRank)
    deltaRank -= maxrank - 1
    deltaRank *= -1
    deltaRank += 2

    for ii in range(monthgainloss.shape[1]):
        if deltaRank[:, ii].min() == deltaRank[:, ii].max():
            deltaRank[:, ii] = 0.

    ########################################################################
    ## Hold values constant for calendar month (gains, ranks, deltaRanks)
    ########################################################################

    for ii in range(1, monthgainloss.shape[1]):
        if datearray[ii].month == datearray[ii - 1].month:
            monthgainloss[:, ii] = monthgainloss[:, ii - 1]
            delta[:, ii] = delta[:, ii - 1]
            deltaRank[:, ii] = deltaRank[:, ii - 1]

    ########################################################################
    ## Calculate number of active stocks each day
    ########################################################################

    # TODO: activeCount can be computed before loop to save CPU cycles
    # count number of unique values
    activeCount = np.zeros(adjClose.shape[1], dtype=float)
    for ii in np.arange(0, monthgainloss.shape[0]):
        firsttradedate = np.argmax(
            np.clip(np.abs(gainloss[ii, :] - 1.), 0., .00001))
        activeCount[firsttradedate:] += 1

    minrank = np.min(deltaRank, axis=0)
    maxrank = np.max(deltaRank, axis=0)
    # convert rank threshold to equivalent percent of rank range

    rankthresholdpercentequiv = np.round(
        float(rankthreshold) * (activeCount - minrank + 1) / adjClose.shape[0])
    ranktest = deltaRank <= rankthresholdpercentequiv

    ########################################################################
    ### Calculate downside risk measure for weighting stocks.
    ### Use 1./ movingwindow_sharpe_ratio for risk measure.
    ### Modify weights with 1./riskDownside and scale so they sum to 1.0
    ########################################################################

    riskDownside = 1. / move_sharpe_2D(adjClose, gainloss, LongPeriod)
    riskDownside = np.clip(riskDownside, riskDownside_min, riskDownside_max)

    riskDownside[isnan(riskDownside)] = np.max(
        riskDownside[~isnan(riskDownside)])
    for ii in range(riskDownside.shape[0]):
        riskDownside[ii] = riskDownside[ii] / np.sum(riskDownside, axis=0)

    ########################################################################
    ### calculate equal weights for ranks below threshold
    ########################################################################

    elsecount = 0
    elsedate = 0
    for ii in np.arange(1, monthgainloss.shape[1]):
        if activeCount[ii] > minrank[ii] and rankthresholdpercentequiv[ii] > 0:
            for jj in range(value.shape[0]):
                test = deltaRank[jj, ii] <= rankthresholdpercentequiv[ii]
                if test == True:
                    monthgainlossweight[
                        jj, ii] = 1. / rankthresholdpercentequiv[ii]
                    monthgainlossweight[jj, ii] = monthgainlossweight[
                        jj, ii] / riskDownside[jj, ii]
                else:
                    monthgainlossweight[jj, ii] = 0.
        elif activeCount[ii] == 0:
            monthgainlossweight[:, ii] *= 0.
            monthgainlossweight[:, ii] += 1. / adjClose.shape[0]
        else:
            elsedate = datearray[ii]
            elsecount += 1
            monthgainlossweight[:, ii] = 1. / activeCount[ii]

    aaa = np.sum(monthgainlossweight, axis=0)

    allzerotest = np.sum(monthgainlossweight, axis=0)
    sumallzerotest = allzerotest[allzerotest == 0].shape[0]
    if sumallzerotest > 0:
        print("")
        print(" invoking correction to monthgainlossweight.....")
        print("")
        for ii in np.arange(1, monthgainloss.shape[1]):
            if np.sum(monthgainlossweight[:, ii]) == 0:
                monthgainlossweight[:, ii] = 1. / activeCount[ii]

    print(" weights calculation else clause encountered :", elsecount, " times. last date encountered is ", elsedate)
    rankweightsum = np.sum(monthgainlossweight, axis=0)

    monthgainlossweight[isnan(
        monthgainlossweight)] = 0.  # changed result from 1 to 0

    monthgainlossweight = monthgainlossweight / np.sum(monthgainlossweight,
                                                       axis=0)
    monthgainlossweight[isnan(
        monthgainlossweight)] = 0.  # changed result from 1 to 0

    # input symbols and company names from text file
    companyName_file = os.path.join(os.getcwd(), "symbols", "companyNames.txt")
    with open(companyName_file, "r") as f:
        companyNames = f.read()

    print("\n\n\n")
    companyNames = companyNames.split("\n")
    ii = companyNames.index("")
    del companyNames[ii]
    companySymbolList = []
    companyNameList = []
    for iname, name in enumerate(companyNames):
        name = name.replace("amp;", "")
        testsymbol, testcompanyName = name.split(";")
        companySymbolList.append(testsymbol)
        companyNameList.append(testcompanyName)

    # print list showing current rankings and weights
    # - symbol
    # - rank
    # - weight from sharpe ratio
    # - price
    import os
    rank_text = "<div id='rank_table_container'><h3>" + "<p>Current stocks, with ranks, weights, and prices are :</p></h3><font face='courier new' size=3><table border='1'> \
               <tr><td>Rank \
               </td><td>Symbol \
               </td><td>Company \
               </td><td>Weight \
               </td><td>Price  \
               </td><td>Trend  \
               </td></tr>\n"

    for i, isymbol in enumerate(symbols):
        for j in range(len(symbols)):
            if int(deltaRank[j, -1]) == i:
                if signal2D[j, -1] == 1.:
                    trend = 'up'
                else:
                    trend = 'down'

                # search for company name
                try:
                    symbolIndex = companySymbolList.index(symbols[j])
                    companyName = companyNameList[symbolIndex]
                except:
                    companyName = ""

                rank_text = rank_text + \
                       "<tr><td>" + format(deltaRank[j,-1],'6.0f')  + \
                       "<td>" + format(symbols[j],'5s')  + \
                       "<td>" + format(companyName,'15s')  + \
                       "<td>" + format(monthgainlossweight[j,-1],'5.03f') + \
                       "<td>" + format(adjClose[j,-1],'6.2f')  + \
                       "<td>" + trend  + \
                       "</td></tr>  \n"
    rank_text = rank_text + "</table></div>\n"

    filepath = os.path.join(os.getcwd(), "pyTAAA_web",
                            "pyTAAAweb_RankList.txt")
    with open(filepath, "w") as f:
        f.write(rank_text)

    print("leaving function sharpeWeightedRank_2D...")

    return monthgainlossweight
Example No. 21
def corr_main(params):
    """
    This is the main function which performs the following steps:
    - open dataset(s), load selected features, merge datasets
    - perform GLASSO with huge R package
    - calculate permuted p-values with GPD approximation in parallel
    - correct for multiple testing
    - save r and p value matrices for users
    - save networks from r values for users
    - write variables and datasets for visualisation in JS
    """

    # --------------------------------------------------------------------------
    # CALCULATE GRAPHLASSO AND PERMUTED P-VALS
    # --------------------------------------------------------------------------

    # open first dataset
    path = os.path.join(params['output_folder'], params['dataset1'])
    dataset1, sep = open_file(path)
    n, p = dataset1.shape
    # if there's a 2nd dataset, merge them
    if not params['autocorr']:
        path2 = os.path.join(params['output_folder'], params['dataset2'])
        dataset2, sep2 = open_file(path2)
        # if two features have the same name we need prefixes
        merged_datasets_df = dataset1.join(dataset2, how='inner',
                                           lsuffix='_data1', rsuffix='_data2')
        X = merged_datasets_df.values
    else:
        merged_datasets_df = dataset1
        X = merged_datasets_df.values

    # standardise X
    ss = StandardScaler()
    X = ss.fit_transform(X)

    # perform GLASSO with huge in R
    lambda_threshold = params['lambda_val']
    cov, prec = hugeR.hugeR(X, lambda_threshold)

    # create column ranked X for corr_permutation
    rX = bn.rankdata(X, axis=0)

    # get GPD approximated p-values
    perm_num = 10000
    rs, p_vals, p_mask = cp.gpd_spearman(rX, perm_num=perm_num, prec=prec,
                                         mc_method=params['multi_corr_method'],
                                         mc_alpha=params['alpha_val'])

    # delete correlations that did not pass the multi test correction
    rs[~p_mask] = 0
    p_vals[~p_mask] = 1

    # --------------------------------------------------------------------------
    # CHECK IF GENOMIC FILTERING IS NEEDED
    # --------------------------------------------------------------------------

    # if fs, load metadata column for fold_change calculation later
    if params['fs']:
        path = os.path.join(params['study_folder'], params['metadata_file'])
        y, _ = open_file(path)
        y = y[params['fs_cols']].iloc[1:].dropna()
    else:
        y = None

    # if genomic, check if filtering overlapping and distant corrs needed
    discard_or_constrain = params['discard_overlap'] or params['constrain_corr']
    if params['annotation'] and discard_or_constrain:
        genomic = True
    else:
        genomic = False

    # --------------------------------------------------------------------------
    # GENERATE PAIRWISE PLOTS FOR DATA1, DATA2, DATA1-2
    # --------------------------------------------------------------------------

    generate_pair_plots(params, rs, p_vals, merged_datasets_df, p)

    # --------------------------------------------------------------------------
    # WRITE RESULTS FOR DATA1, DATA2, DATA1-2
    # --------------------------------------------------------------------------

    params = write_results(params, rs[:p, :p], p_vals[:p, :p], genomic,
                           (dataset1, dataset1), 'dataset1', y, True)
    if not params['autocorr']:
        params = write_results(params, rs[p:, p:], p_vals[p:, p:], genomic,
                               (dataset2, dataset2), 'dataset2', y, True)
        params = write_results(params, rs[:p, p:], p_vals[:p, p:], genomic,
                               (dataset1, dataset2), 'dataset1_2', y)

    # if corr_done in params is False one of the writing steps failed
    if 'corr_done' not in params:
        params['corr_done'] = True
    return params
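The permuted p-values above come from cp.gpd_spearman, which is not shown here. As a much simpler stand-in, for illustration only, a plain permutation p-value for the Spearman correlation of two vectors could look like this:

import numpy as np
import bottleneck as bn

def spearman_perm_pval(x, y, n_perm=1000, seed=None):
    # permutation p-value for the Spearman correlation of two 1-D arrays
    rng = np.random.default_rng(seed)
    rx, ry = bn.rankdata(x), bn.rankdata(y)
    r_obs = np.corrcoef(rx, ry)[0, 1]
    perm_r = np.array([np.corrcoef(rx, rng.permutation(ry))[0, 1]
                       for _ in range(n_perm)])
    return (np.sum(np.abs(perm_r) >= np.abs(r_obs)) + 1) / (n_perm + 1)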