def test_results_sparse():
    """Compare t-test and Wilcoxon rankings on the sparse example data with reference values."""
    seed(1234)

    adata = get_example_data(sparse=True)

    (
        true_names_t_test,
        true_names_wilcoxon,
        true_scores_t_test,
        true_scores_wilcoxon,
    ) = get_true_scores()
    group_fields = true_scores_t_test.dtype.names

    # t-test: all reported scores and names are compared with the reference.
    rank_genes_groups(adata, 'true_groups', n_genes=20, method='t-test')
    ranked = adata.uns['rank_genes_groups']
    # Cast the names to the reference dtype before comparing.
    ranked['names'] = ranked['names'].astype(true_names_t_test.dtype)

    for field in group_fields:
        assert np.allclose(true_scores_t_test[field], ranked['scores'][field])
    assert np.array_equal(true_names_t_test, ranked['names'])

    # Wilcoxon: only the first seven rows are compared.
    rank_genes_groups(adata, 'true_groups', n_genes=20, method='wilcoxon')
    ranked = adata.uns['rank_genes_groups']
    ranked['names'] = ranked['names'].astype(true_names_wilcoxon.dtype)

    for field in group_fields:
        assert np.allclose(true_scores_wilcoxon[field][:7],
                           ranked['scores'][field][:7])
    assert np.array_equal(true_names_wilcoxon[:7], ranked['names'][:7])
def test_wilcoxon_symmetry():
    """Check that swapping group and reference gives the same absolute Wilcoxon statistics."""
    pbmc = pbmc68k_reduced()

    def _rank_against(reference):
        # Same two-group comparison each time; only the reference changes.
        rank_genes_groups(
            pbmc,
            groupby="bulk_labels",
            groups=["CD14+ Monocyte", "Dendritic"],
            reference=reference,
            method='wilcoxon',
            rankby_abs=True,
        )

    def _stats_for(group):
        # All result columns except the gene names, as a plain array.
        table = rank_genes_groups_df(pbmc, group=group)
        return table.drop(columns="names").to_numpy()

    _rank_against("Dendritic")
    assert pbmc.uns["rank_genes_groups"]["params"]["use_raw"] is True
    stats_mono = _stats_for("CD14+ Monocyte")

    _rank_against("CD14+ Monocyte")
    stats_dend = _stats_for("Dendritic")

    assert np.allclose(np.abs(stats_mono), np.abs(stats_dend))
def test_emptycat():
    """Expect a ValueError mentioning the category when an added, unused category is present."""
    pbmc = pbmc68k_reduced()
    unused_category = '11'
    extended = pbmc.obs['louvain'].cat.add_categories([unused_category])
    pbmc.obs['louvain'] = extended

    expected_message = rf"Could not calculate statistics.*{unused_category}"
    with pytest.raises(ValueError, match=expected_message):
        rank_genes_groups(pbmc, groupby='louvain')
Beispiel #4
0
def get_de(adata,
           mygroup,
           demethod='wilcoxon',
           topnr=5000,
           logfc=1,
           padj=0.05):
    """Get tables of significant DE genes at certain cutoffs.

    Based on an AnnData object and an annotation category (e.g. louvain) runs
    scanpy's rank_genes_groups on ``adata.raw`` using the specified method and
    returns, per group, the genes that pass the cutoffs.

    parameters
    ----------
    adata: `AnnData`
        AnnData object; ``adata.raw`` is read to determine the number of genes
    mygroup: `str`
        group for performing de, needs be in adata.obs
    demethod: `str`
        one of 't-test', 'wilcoxon', 't-test_overestim_var', 'logreg'
        NOTE(review): the result must contain 'logfoldchanges' and
        'pvals_adj' -- confirm that every listed method provides them.
    topnr: `int`
        the number of top-ranked genes per group that are considered
    logfc: `float`
        log fold-change cutoff; genes with a value >= logfc are kept
    padj: `float`
        adjusted p-value cutoff; genes with a value <= padj are kept

    returns
    -------
    delist
        a dict mapping each group label to a pandas DataFrame with the columns
        'Name', 'Score', 'Log2FC' and 'P.adj'. ``None`` is returned (after
        printing the available columns) if `mygroup` is not in adata.obs.
    """
    try:
        group_labels = adata.obs[mygroup]
    except KeyError:
        print(
            "Oops!  The adata object does not have the specified column. Options are: "
        )
        print(list(adata.obs.columns))
        return None

    mygroups = list(sort(list(set(group_labels))))
    rank_genes_groups(adata,
                      groupby=mygroup,
                      use_raw=True,
                      n_genes=adata.raw.X.shape[1],
                      method=demethod)

    # Build each result table once instead of once per group.
    ranked = adata.uns['rank_genes_groups']
    names, scores, fold_changes, adj_pvals = (
        DataFrame(ranked[key]).head(topnr)
        for key in ('names', 'scores', 'logfoldchanges', 'pvals_adj'))

    delist = {}
    for group in mygroups:
        table = concat(
            [names[group], scores[group], fold_changes[group],
             adj_pvals[group]],
            axis=1)
        table.columns = ['Name', 'Score', 'Log2FC', 'P.adj']
        passes_cutoffs = (table['Log2FC'] >= logfc) & (table['P.adj'] <= padj)
        delist[group] = table[passes_cutoffs]
    return delist
def test_results_layers():
    """Check that passing `layer` ranks on the layer data rather than on `adata.X`."""
    seed(1234)

    adata = get_example_data(sparse=False)
    # Keep the original matrix in a layer, then multiply X by a random boolean mask.
    adata.layers["to_test"] = adata.X.copy()
    adata.X = adata.X * np.random.randint(0, 2, adata.shape, dtype=bool)

    _, _, true_scores_t_test, true_scores_wilcoxon = get_true_scores()
    group_fields = true_scores_t_test.dtype.names

    def _top_scores_match(expected, field):
        # Compare the first seven scores of one group with the reference.
        observed = adata.uns['rank_genes_groups']['scores'][field]
        return np.allclose(expected[field][:7], observed[:7])

    # Wilcoxon on the layer reproduces the reference scores ...
    rank_genes_groups(
        adata,
        'true_groups',
        method='wilcoxon',
        layer="to_test",
        n_genes=20,
    )
    assert adata.uns["rank_genes_groups"]["params"]["use_raw"] is False
    for field in group_fields:
        assert _top_scores_match(true_scores_wilcoxon, field)

    # ... while Wilcoxon on the masked X does not.
    rank_genes_groups(adata, 'true_groups', method='wilcoxon', n_genes=20)
    for field in group_fields:
        assert not _top_scores_match(true_scores_wilcoxon, field)

    # Same pair of checks for the t-test.
    rank_genes_groups(
        adata,
        'true_groups',
        method='t-test',
        layer="to_test",
        use_raw=False,
        n_genes=20,
    )
    for field in group_fields:
        assert _top_scores_match(true_scores_t_test, field)

    rank_genes_groups(adata, 'true_groups', method='t-test', n_genes=20)
    for field in group_fields:
        assert not _top_scores_match(true_scores_t_test, field)
def test_results_dense():
    """Compare t-test and Wilcoxon rankings on the dense example data with reference values."""
    seed(1234)

    adata = get_example_data()
    assert adata.raw is None  # Assumption for later checks

    (true_names_t_test, true_names_wilcoxon, true_scores_t_test,
     true_scores_wilcoxon) = get_true_scores()
    group_fields = true_scores_t_test.dtype.names

    # t-test: all reported scores and names are compared with the reference.
    rank_genes_groups(adata, 'true_groups', n_genes=20, method='t-test')
    ranked = adata.uns['rank_genes_groups']
    # Cast the names to the reference dtype before comparing.
    ranked['names'] = ranked['names'].astype(true_names_t_test.dtype)

    for field in group_fields:
        assert np.allclose(true_scores_t_test[field], ranked['scores'][field])
    assert np.array_equal(true_names_t_test, ranked['names'])
    assert ranked["params"]["use_raw"] is False

    # Wilcoxon: only the first seven rows are compared.
    rank_genes_groups(adata, 'true_groups', n_genes=20, method='wilcoxon')
    ranked = adata.uns['rank_genes_groups']
    ranked['names'] = ranked['names'].astype(true_names_wilcoxon.dtype)

    for field in group_fields:
        assert np.allclose(true_scores_wilcoxon[field][:7],
                           ranked['scores'][field][:7])
    assert np.array_equal(true_names_wilcoxon[:7], ranked['names'][:7])
    assert ranked["params"]["use_raw"] is False
Beispiel #7
0
def test_filter_rank_genes_groups():
    """Check the filtered gene names for rankings with and without a reference group."""
    adata = pbmc68k_reduced()

    # fix filter defaults
    filter_kwargs = dict(
        adata=adata,
        key_added='rank_genes_groups_filtered',
        min_in_group_fraction=0.25,
        min_fold_change=1,
        max_out_group_fraction=0.5,
    )

    def _filtered_names():
        # Run the filter on the current ranking and return the stored names.
        filter_rank_genes_groups(**filter_kwargs)
        stored = adata.uns['rank_genes_groups_filtered']['names']
        return np.array(stored.tolist())

    rank_genes_groups(adata,
                      'bulk_labels',
                      reference='Dendritic',
                      method='wilcoxon',
                      n_genes=5)
    assert np.array_equal(names_reference, _filtered_names())

    rank_genes_groups(adata, 'bulk_labels', method='wilcoxon', n_genes=5)
    assert np.array_equal(names_no_reference, _filtered_names())

    # Ranking with pts=True is expected to give the same filtered names.
    rank_genes_groups(adata,
                      'bulk_labels',
                      method='wilcoxon',
                      pts=True,
                      n_genes=5)
    assert np.array_equal(names_no_reference, _filtered_names())
Beispiel #8
0
def celltype_labeling(adata,
                      labeling_author,
                      results_folder,
                      labeling_to_use='celltype',
                      labeling_name='celltype',
                      labeling_description='manual celltype annotation',
                      cluster_method='louvain'):
    """ Standard Workflow function to export an additional labeling to FAIR format.

    Ranks marker genes per label (rank_genes_groups with method 'wilcoxon' on
    adata.raw), then exports the labeling, a labeling_info file and the rank
    file. Both steps are timed and reported through logging.

    parameters
    ----------
    adata: `AnnData`
      AnnData object from which the labeling is to be exported
    labeling_author: `str`
      author of the labeling, stored in the labeling_info file
    results_folder: `str`
      base path of the results folder; output goes below 'labelings' in it
    labeling_to_use: `str` | default = 'celltype'
      column in adata.obs holding the labeling that is exported and used for
      ranking the genes
    labeling_name: `str` | default = 'celltype'
      name under which the labeling is exported
    labeling_description: `str` | default = 'manual celltype annotation'
      description stored in the labeling_info file
    cluster_method: `str` | default = 'louvain'
      passed to labeling_info as `annotated_version_of`

    returns
    -------
    None
      writes out several files to folder results_folder/labelings/<labeling_name>
    """
    # marker genes for the labeling
    ranking_started = time()
    n_raw_genes = adata.raw.X.shape[1]
    rank_genes_groups(adata,
                      labeling_to_use,
                      method='wilcoxon',
                      use_raw=True,
                      n_genes=n_raw_genes)
    print('rank genes per label calculated using method wilcoxon.')
    logging.info(
        'Marker gene detection performed on a per-label basis using the method wilcoxon.'
    )
    ranking_seconds = round(time() - ranking_started, 3)
    logging.info(f'\tTime for marker gene detection: {ranking_seconds}s')

    # labeling, labelinfo.tsv and rank file
    outpath = os.path.join(results_folder, 'labelings', labeling_name)
    export_started = time()
    labeling(adata, column=labeling_to_use, outpath=outpath)
    labeling_info(outpath=outpath,
                  description=labeling_description,
                  public=False,
                  default=False,
                  expert=True,
                  reference=False,
                  method=labeling_author,
                  annotated_version_of=cluster_method)
    export_rank(adata,
                basepath=results_folder,
                type='wilcox',
                labeling_name=labeling_name)

    logging.info('Label level analysis and marker genes exported to file.')
    export_seconds = round(time() - export_started, 3)
    logging.info(
        f'\tTime for export of cluster level analysis: {export_seconds}s')

    return None
Beispiel #9
0
def clustering(adata, results_folder, myres=1, method='leiden'):
    """ Perform adata clustering and write the corresponding results

    Runs the chosen clustering with a fixed random state of 0, saves a UMAP
    plot coloured by cluster, ranks marker genes per cluster (wilcoxon, on
    adata.raw) and exports the clustering and the rank file.

    parameters
    ----------
    adata: `AnnData`
        AnnData object that is to be clustered and exported
    results_folder: `str`
        path to the results folder
    myres: int
        resolution for the algorithm
    method: `str`
        clustering algorithm. Implemented: louvain/leiden

    returns
    -------
    adata
        the AnnData object that was passed in; results are also written to file

    raises
    ------
    ValueError
        if `method` is neither 'leiden' nor 'louvain'
    """
    if method not in ('leiden', 'louvain'):
        raise ValueError("method argument should be leiden or louvain")
    random_state = 0
    start = time()
    if method == 'louvain':
        sc_louvain(adata, resolution=myres, random_state=random_state)
    else:
        sc_leiden(adata, resolution=myres, random_state=random_state)
    print(method + ' clustering performed with a resolution of ' + str(myres))
    # The cluster labels are read from the obs column named after the method.
    cluster_count = len(set(adata.obs[method]))

    sc_pl_umap(adata,
               color=[method],
               legend_loc='on data',
               save='.' + method + '.png')

    # A space after the method name keeps the message readable
    # ("leiden clustering done", not "leidenclustering done").
    logging.info(method + ' clustering done. Found ' + str(cluster_count) +
                 ' clusters.')
    logging.info('\tTime for ' + method + ' clustering: ' +
                 str(round(time() - start, 3)) + 's')

    # detect marker genes
    start = time()
    rank_genes_groups(adata,
                      method,
                      method='wilcoxon',
                      use_raw=True,
                      n_genes=adata.raw.X.shape[1])
    print('rank genes per cluster calculated using method wilcoxon.')

    logging.info(
        'Marker gene detection performed on a per-cluster basis using the method wilcoxon.'
    )
    logging.info('\tTime for marker gene detection: ' +
                 str(round(time() - start, 3)) + 's')

    # export clustering to file
    start = time()
    export_clustering(adata, basepath=results_folder, method=method)
    export_rank(adata,
                basepath=results_folder,
                type='wilcox',
                labeling_name=method)
    logging.info('Cluster level analysis and marker genes exported to file.')
    logging.info('\tTime for export of cluster level analysis: ' +
                 str(round(time() - start, 3)) + 's')

    return adata
Beispiel #10
0
def perform_dge(adata,
                design_matrix,
                differentiating_criteria,
                constant_criteria,
                basepath,
                min_cells_per_group=30,
                method='wilcoxon'):
    """Perform differential gene expression between two conditions over many adata subsets.

    This function automatically generates top_tables and rank_files for a list of comparisons
    in a dataset. The comparison you wish to perform need to be identified in a so called design
    matrix (see below).

    This function is capable of handling comparisons where you wish to compare
    two conditions in a subset of the dataset, e.g. treatment vs control in the celltype CD4 T-cell.
    The conditions must be annotated in a column adata.obs, this represents the differentiating_criteria.
    This column may only have two different labels! The subsets in which this comparison should be made
    must be annotated in another column represented by 'constant_criteria'. This column may have as
    many labels as you wish.

    Design Matrix:
    the design matrix consists of a pandas.DataFrame with two columns. Each row
    represents one comparison that is to be made. The first column labeled 'Group1',
    contains a tuple identifying the first group for that comparison and the
    second column labeled 'Group2' contains a tuple identifying the second group for
    the comparison. The tuple has the form (differentiating_criteria, constant_criteria).

    >>> #example of a Design Matrix
    >>> celltypes = ['CD4 T-cell', 'CD8 T-cell', 'B-cell', 'myeloid cell']
    >>> design_matrix = pd.DataFrame({'Group1':[('PBMC', celltype) for celltype in celltypes], 'Group2':[('Skin', celltype) for celltype in celltypes]})

    For every comparison with enough cells two files are written to `basepath`:
    a rank file (`<prefix>Rank_<contrast>.rnk`, SYMBOL and
    |log10(P-value)| * sign(logFC)) and a top table
    (`<prefix>TopTable_<contrast>.txt`). Comparisons that are skipped are
    printed and displayed at the end.

    parameters
    ----------
    adata: `AnnData`
        AnnData object containing the data; its raw data is obtained via get_raw
    design_matrix: `pandas.DataFrame`
        pandas.DataFrame containing all the comparisons that are to be made.
    differentiating_criteria: `str`
        column in adata.obs holding the two conditions that are compared
    constant_criteria: `str`
        column in adata.obs used to select the subset for each comparison
    basepath: `str`
        output folder; created if it does not exist
    min_cells_per_group: `int`
        both groups need strictly more cells than this for a comparison to run
    method: `str`
        one of 't-test', 'wilcoxon', 't-test_overestim_var', 'logreg'

    returns
    -------
    None
        writes rank files and top tables to `basepath`
    """
    # File-name prefix for each supported method.
    file_prefixes = {
        'wilcoxon': 'Wilx',
        't-test_overestim_var': 'Overest',
        't-test': 'tTest',
        'logreg': 'logreg',
    }
    # Exact zeros are replaced by this value before log10 is taken.
    smallest_num = 1e-308

    #get raw data
    adata_raw = get_raw(adata=adata)

    too_few_cells = []
    for i, comparison in enumerate(design_matrix.values):
        #get group1 and group2
        group1 = comparison[0]
        group2 = comparison[1]

        contrast = str(group1[1] + '_' + group1[0] + '_vs_' + group2[1] + '_' +
                       group2[0])
        contrast = contrast.replace(' ', '_')
        contrast = contrast.replace('-', '_')
        contrast = contrast.replace('+', '')

        #perform sanitychecks (best effort: only warns, the comparison still runs)
        if group1[1] != group2[1]:
            print('please ensure design matrix is correct!')

        #get adata_subset
        adata_subset = subset_adata(
            adata_raw,
            filter_criteria=adata.obs.get(constant_criteria) == group1[1],
            raw=False)
        adata_subset.var = adata_raw.var

        #ensure that you have enough cells in each group
        conditions = adata_subset.obs.get(differentiating_criteria)
        counts = conditions.value_counts()
        n_group1 = counts.get(group1[0])
        n_group2 = counts.get(group2[0])
        enough_cells = (n_group1 is not None and n_group2 is not None
                        and n_group1 > min_cells_per_group
                        and n_group2 > min_cells_per_group)
        if not enough_cells:
            print('----------------------------------------')
            print('not enough cells for comparison', contrast,
                  ' in one of the groups')
            too_few_cells.append(i)
            continue

        #calculate rank_genes
        print('----------------------------------------')
        print('performing comparison', contrast)
        rank_genes_groups(adata=adata_subset,
                          groupby=differentiating_criteria,
                          reference=group2[0],
                          groups=[group1[0]],
                          method=method,
                          n_genes=adata_subset.n_vars)

        scores, pvalues, logFC, FDRs = extract_info_rank_genes_groups(
            adata_subset)

        #calculate mean expression of each gene in the data
        # NOTE(review): .todense() assumes X is a sparse matrix -- confirm.
        mean_all = adata_subset.X.todense().mean(axis=0).tolist()[0]
        mean_group1 = adata_subset[conditions == group1[0]].X.todense().mean(
            axis=0).tolist()[0]
        mean_group2 = adata_subset[conditions == group2[0]].X.todense().mean(
            axis=0).tolist()[0]

        #get mean expression
        log2cp10k = DataFrame(data={
            'log2cp10k': mean_all,
            'log2cp10k_' + group1[0]: mean_group1,
            'log2cp10k_' + group2[0]: mean_group2
        },
                              index=adata_subset.var.ENSEMBL)

        #generate top_table for group1
        top_table = DataFrame(data={
            'Contrast': contrast,
            'SYMBOL': scores.Description,
            'ENSEMBL': scores.index.tolist(),
            'Score': scores.get(group1[0])
        },
                              index=scores.index)

        #add p-value, logFC, FDRs, mean_expression
        for stats, column in ((pvalues, 'P-value'), (logFC, 'logFC'),
                              (FDRs, 'FDR')):
            renamed = stats.get(group1[0]).to_frame().rename(
                columns={group1[0]: column})
            top_table = top_table.merge(renamed,
                                        how='left',
                                        right_index=True,
                                        left_index=True)
        top_table = top_table.merge(log2cp10k,
                                    how='left',
                                    right_index=True,
                                    left_index=True)

        #order according to score
        top_table.sort_values(ascending=False,
                              axis=0,
                              by='Score',
                              inplace=True)

        #replace all 0 values with a very small positive number
        top_table.replace(0, smallest_num, inplace=True)

        #generate rank files
        # Explicit copy so that adding/dropping columns never touches top_table
        # and does not raise SettingWithCopyWarning.
        rank_file = top_table.get(['SYMBOL', 'P-value', 'logFC']).copy()
        rank_file['value'] = abs(log10(rank_file['P-value'])) * sign(
            rank_file['logFC'])
        rank_file.drop(columns=['P-value', 'logFC'], inplace=True)
        rank_file.sort_values(ascending=False,
                              axis=0,
                              by='value',
                              inplace=True)

        #replace all inf values with very small or very large values
        rank_file.replace(inf, 1e100, inplace=True)
        rank_file.replace(-inf, -1e100, inplace=True)

        outpath = os.path.join(basepath)
        os.makedirs(outpath, exist_ok=True)

        prefix = file_prefixes.get(method)
        if prefix is None:
            sys.exit(
                "need to specify type as one of 'wilcoxon' or 't-test_overestim_var'  or 't-test' or 'logreg'"
            )
        rank_File = os.path.join(outpath, prefix + "Rank_" + contrast + ".rnk")
        TopTable_File = os.path.join(
            outpath, prefix + "TopTable_" + contrast + ".txt")

        #write out rankfile
        rank_file.to_csv(rank_File,
                         sep='\t',
                         index=False,
                         header=False,
                         float_format='%.2f')
        print(rank_File, 'written out')

        #write out TopTable
        top_table.to_csv(TopTable_File,
                         sep='\t',
                         index=True,
                         header=True,
                         float_format='%1.3e')
        print(TopTable_File, 'written out')

    #report the comparisons that could not be performed
    if too_few_cells:
        print('')
        print('')
        print('----------------------------------------------')
        print(
            'the following comparisons from the design_matrix could not per performed due to too few cells'
        )
        display(design_matrix.iloc[too_few_cells, :])

    return None