Example #1
    def test__overwride_sample_representation(self, atac_analysis):

        prev = atac_analysis.samples[0].__repr__
        Analysis._overwride_sample_representation()
        new = atac_analysis.samples[0].__repr__

        assert prev != new
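This test captures a sample's __repr__ before and after calling Analysis._overwride_sample_representation() and asserts that the representation method was replaced.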
Example #2
def test_project_with_subprojects(subproject_config):
    from ngs_toolkit import Analysis

    a = Analysis(from_pep=subproject_config)
    assert len(a.samples) == 0

    a = Analysis(from_pep=subproject_config, amendments=["test_subproject"])
    assert len(a.samples) > 0
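Without amendments the subproject's samples are not activated, so the Analysis starts empty; passing amendments=["test_subproject"] activates them.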
Example #3
    def test_with_object_as(self):
        name = "test_analysis"

        an = Analysis(name=name)
        with an as _an:
            assert an is _an
            assert an == _an
            assert _an.__repr__() == "Analysis '{}'.".format(name)
            assert "samples" not in _an.__repr__()
Example #4
    def test_analysis_serialization(self, tmp_path):

        tmp_path = str(tmp_path)

        pickle_file = os.path.join(tmp_path, "analysis.pickle")
        a = Analysis(pickle_file=pickle_file)
        assert not file_exists(pickle_file)
        a.to_pickle()
        assert file_exists(pickle_file)
        assert file_not_empty(pickle_file)

        previous_size = os.stat(
            get_this_file_or_timestamped(pickle_file)).st_size
        a.random = np.random.random((100, 100))
        a.to_pickle()
        new_size = os.stat(get_this_file_or_timestamped(pickle_file)).st_size
        assert new_size > previous_size

        previous_size = os.stat(
            get_this_file_or_timestamped(pickle_file)).st_size
        a.random = np.random.random((100, 100))
        a.to_pickle(timestamp=True)
        assert len(glob.glob(os.path.join(tmp_path, "*.pickle"))) == 2
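to_pickle() writes the analysis pickle in place, so adding attributes and re-pickling grows the file; to_pickle(timestamp=True) writes a separate timestamped copy, which is why two pickle files exist at the end.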
Example #5
    def test__format_string_with_attributes_simple(self):
        t = Analysis()
        t.a = 1
        t.b = ""
        assert "1" == Analysis._format_string_with_attributes(t, "{a}{b}")
Example #6
    def test__format_string_with_environment_variables(self, env_var, string):
        assert string == Analysis._format_string_with_environment_variables(
            env_var)
Example #7
    def test__check_data_type_is_supported(self):
        assert Analysis._check_data_type_is_supported("ATAC-seq")
        assert Analysis._check_data_type_is_supported("ChIP-seq")
        assert Analysis._check_data_type_is_supported("RNA-seq")
        assert Analysis._check_data_type_is_supported("CNV")
        assert not Analysis._check_data_type_is_supported("Microarray")
Example #8
    def test_analysis_representation(self):
        name = "test_analysis"

        an = Analysis(name=name)
        assert an.__repr__() == "Analysis '{}'.".format(name)
        assert "samples" not in an.__repr__()
Example #9
    def test_analysis_loading(self, tmp_path):
        tmp_path = str(tmp_path)
        pickle_file = os.path.join(tmp_path, "pickle")
        secret = "I've existed before"

        a = Analysis()
        a.pickle_file = pickle_file
        a.secret = secret
        a.to_pickle()

        a2 = Analysis(from_pickle=pickle_file)
        assert a2.secret == secret

        a3 = Analysis()
        a3.update(pickle_file)
        assert a3.secret == secret

        a4 = Analysis()
        a4.pickle_file = pickle_file
        a4 = a4.from_pickle()
        assert a4.secret == secret

        shutil.rmtree(tmp_path)
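Three equivalent ways of restoring state from a pickle are exercised: the from_pickle constructor argument, update(), and the from_pickle() method.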
Example #10
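This script excerpt is not self-contained: parse_args() is defined elsewhere in the same script, and the module-level imports are not shown. A minimal sketch of what it assumes (query_biomart is presumably the helper from ngs_toolkit.general):

import os
import time

import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns

from ngs_toolkit.general import query_biomart  # assumed import path
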
def main():
    global args
    args = parse_args()

    # barcode annotations
    annotation_file = os.path.join(
        "metadata", "sciRNA-seq.PD190_sixlines.oligos_2019-09-05.csv")
    annotation = pd.read_csv(annotation_file)

    # convenience
    gene_set_libraries = [
        'Human_Gene_Atlas', 'ARCHS4_Tissues', 'WikiPathways_2019_Human',
        'NCI-Nature_2016', 'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
        'GO_Biological_Process_2018'
    ]

    # read h5ad file
    sc.settings.n_jobs = -1
    sc.settings.figdir = os.path.dirname(args.output_prefix)
    sc.settings.set_figure_params(dpi=300, dpi_save=300, format='svg')
    print(f"# {time.asctime()} - Reading input data.")
    adata = sc.read(args.input_h5ad, cache=True)

    # Annotate with gene names instead of Ensembl IDs
    print(f"# {time.asctime()} - Annotating genes.")
    if args.species_mixture:
        adata.var.loc[:, "species"] = pd.Series(
            adata.var_names.str.startswith("ENSMUS"),
            index=adata.var.index).replace(True,
                                           "mouse").replace(False, "human")
    human_m = query_biomart(
        attributes=["ensembl_gene_id", "external_gene_name"],
        species="hsapiens",
        ensembl_version='grch38')
    v = adata.var.join(human_m.set_index("ensembl_gene_id"))

    if args.species_mixture:
        mouse_m = query_biomart(
            attributes=["ensembl_gene_id", "external_gene_name"],
            species="mmusculus",
            ensembl_version='grcm38')
        v.update(mouse_m.set_index("ensembl_gene_id"))
    adata.var.index = v['external_gene_name'].fillna(
        v.index.to_series()).values
    adata.var_names_make_unique()

    # QC
    # sc.pl.highest_expr_genes(a, n_top=20)
    adata.var.loc[:, 'mito'] = adata.var_names.str.contains(r'^MT-',
                                                            case=False)
    adata.obs.loc[:, 'percent_mito'] = np.sum(
        adata[:, adata.var['mito']].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
    adata.var.loc[:, 'ribo'] = adata.var_names.str.contains(r'^RP', case=False)
    adata.obs.loc[:, 'percent_ribo'] = np.sum(
        adata[:, adata.var['ribo']].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
    adata.obs.loc[:, 'n_counts'] = adata.X.sum(axis=1).A1
    adata.obs.loc[:, 'log_counts'] = np.log10(adata.obs.loc[:, 'n_counts'])
    adata.obs.loc[:, 'n_genes'] = (adata.X != 0).sum(1).A1
    adata.obs.loc[:, 'log_genes'] = np.log10(adata.obs.loc[:, 'n_genes'])

    # Filter
    print(f"# {time.asctime()} - Filtering.")
    sc.pp.filter_cells(adata, min_counts=50)
    grid = sns.FacetGrid(data=adata.obs[[
        'log_counts', 'log_genes', 'percent_mito', 'percent_ribo'
    ]].melt(),
                         col="variable",
                         sharex=False,
                         sharey=False)
    grid.map(sns.distplot, "value", kde=False)
    for ax in grid.axes.flat:
        ax.set_yscale("log")

    sc.pp.filter_cells(adata, min_counts=100)
    sc.pp.filter_cells(adata, max_counts=8000)
    sc.pp.filter_genes(adata, min_counts=20)
    # sc.pp.filter_genes(adata, max_counts=5000)
    print(f"Kept {adata.shape[0]} cells and {adata.shape[1]} genes.")

    # # remove cells with extreme mitochondrial/ribosomal expression

    # # visualize
    # sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito', 'percent_ribo'], jitter=0.4, multi_panel=True)

    # Add experiment-specific variables
    print(f"# {time.asctime()} - Adding experiment-specific variables.")
    # info = adata.obs.index.to_series().str.split("-").apply(pd.Series)
    # info.columns = ['plate', 'well', 'droplet']
    info = adata.obs.index.str.slice(0, 3)
    adata.obs = adata.obs.assign(plate_well=info)
    # remove cells not matching annotation
    if adata.obs['plate_well'].isnull().sum() > 0:
        msg = "Warning: not all cells matched plate_well annotation."
        print(f"# {time.asctime()} - {msg}")
        adata = adata[~adata.obs['plate_well'].isnull(), :]
    adata.obs = adata.obs.merge(annotation[args.r1_attributes],
                                on=["plate_well"],
                                validate='many_to_one').set_index(
                                    adata.obs.index)

    if args.r1_attributes == ['plate_well', 'cell_line']:
        adata.obs = adata.obs.assign(
            species=(adata.obs['cell_line'] == "3T3"
                     ).replace(True, "mouse").replace(False, "human"))

    adata.X = adata.X.astype(float)
    adata.raw = adata
    sc.write(args.input_h5ad.replace(".h5ad", ".filtered.h5ad"), adata)
    adata = sc.read(args.input_h5ad.replace(".h5ad", ".filtered.h5ad"),
                    cache=True)

    a = adata.copy()
    # gene_count = pd.Series(a.X.sum(0).A1, index=a.var.index).sort_values()

    # Normalize
    sc.pp.normalize_per_cell(a)
    sc.pp.log1p(a)

    sc.tl.rank_genes_groups(a,
                            'cell_line',
                            method='t-test_overestim_var',
                            n_genes=50,
                            use_raw=False)
    result = a.uns['rank_genes_groups']
    groups = result['names'].dtype.names
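    # Collect per-group results into a wide table; key[:1] yields one-letter
    # suffixes, so columns are <group>_n (names), _p (pvals),
    # _l (logfoldchanges) and _s (scores).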
    diff = pd.DataFrame({
        group + '_' + key[:1]: result[key][group]
        for group in groups
        for key in ['names', 'pvals', 'logfoldchanges', 'scores']
    })
    diff.to_csv(args.output_prefix +
                "cell_line.cluster_comparison.top_values.csv",
                index=False)

    from ngs_toolkit.general import enrichr

    res = list()
    for cell_line in diff.columns[diff.columns.str.endswith("_n")]:
        res.append(
            enrichr(diff.rename(columns={cell_line: "gene_name"}),
                    gene_set_libraries=['ARCHS4_Cell-lines'
                                        ]).assign(cell_line=cell_line))
    res = pd.concat(res)
    g = res.set_index("description").groupby(['cell_line'
                                              ])['combined_score'].nlargest(5)
    print(g)

    # Reduce variables
    sc.pp.highly_variable_genes(a, flavor="seurat", min_disp=1,
                                max_mean=1)  # , n_top_genes=100
    # sc.pp.highly_variable_genes(a, flavor="cell_ranger", n_top_genes=100, batch_key="species")  # , n_top_genes=100
    sc.pl.highly_variable_genes(a)

    # sc.pp.scale(a)
    # sc.pp.highly_variable_genes(a, flavor="seurat", min_disp=1.5, max_disp=1e5, min_mean=1, max_mean=1e5, n_bins=20)
    # sc.pl.highly_variable_genes(a)

    print(f"Found {a.var.highly_variable.sum()} highly variable genes.")

    sc.pp.scale(a)
    sc.pp.pca(a,
              svd_solver='arpack',
              zero_center=None,
              use_highly_variable=True)

    sc.pl.pca_variance_ratio(a, log=True)

    # Manifold
    # sc.pp.neighbors(a, use_rep="X_pca", n_neighbors=20, metric="correlation")
    sc.pp.neighbors(a, use_rep="X_pca", n_neighbors=20)
    sc.tl.umap(a)
    sc.tl.diffmap(a)

    # Cluster
    sc.tl.leiden(a, resolution=0.3)
    sc.pl.umap(a,
               color=['leiden', 'log_counts'],
               palette='tab20c',
               save=args.name + "leiden.cells_per_cluster.svg")
    sc.pl.diffmap(a,
                  color=['leiden', 'log_counts'],
                  palette='tab20c',
                  save=args.name + "leiden.cells_per_cluster.svg")

    a.uns['iroot'] = np.argmin(a.obsm['X_diffmap'][0])
    sc.tl.dpt(a, n_branchings=2)
    sc.pl.dpt_groups_pseudotime(a)
    sc.pl.diffmap(a, color=['leiden', 'dpt_pseudotime'])

    # Differential genes
    sc.tl.rank_genes_groups(a,
                            'leiden',
                            method='t-test',
                            n_genes=1e6,
                            use_raw=False)
    sc.pl.rank_genes_groups(a,
                            n_genes=25,
                            sharey=False,
                            save=args.name + "leiden.svg")

    result = a.uns['rank_genes_groups']
    groups = result['names'].dtype.names
    diff = pd.DataFrame({
        group + '_' + key[:1]: result[key][group]
        for group in groups
        for key in ['names', 'pvals', 'logfoldchanges', 'scores']
    })
    diff.to_csv(args.output_prefix +
                "leiden.cluster_comparison.top_values.csv",
                index=False)

    # Enrichment analysis of clusters
    import gseapy as gp
    from ngs_toolkit.analysis import Analysis

    enrichr = list()
    for i in sorted(set(a.obs['leiden'].astype(int))):
        print(i)
        enrichr.append(
            gp.enrichr(gene_list=diff[f"{i}_n"].head(200),
                       gene_sets=gene_set_libraries,
                       cutoff=0.5).results.assign(comparison_name=i))
    enrichr = pd.concat(enrichr)
    enrichr = enrichr.rename(
        columns={
            "P-value": "p_value",
            "Term": 'description',
            "Gene_set": 'gene_set_library',
            'Combined Score': 'combined_score',
            'Z-score': 'z_score'
        })
    enrichr.to_csv(args.output_prefix + "leiden.cluster_enrichments.csv",
                   index=False)
    n = Analysis(genome='hg38')
    n.enrichment_results = {"enrichr": enrichr}
    n.plot_differential_enrichment(steps=['enrichr'],
                                   plot_types=['heatmap'],
                                   output_dir=".",
                                   output_prefix=args.output_prefix + "leiden",
                                   top_n=30)
    a.uns['enrichr'] = enrichr

    # Plot some top genes
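    # Top 5 marker gene names per cluster (the "_n" columns), flattened for plotting.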
    genes = diff.loc[:,
                     diff.columns.str.endswith("_n")].head().T.stack().values
    sc.pl.dotplot(a,
                  var_names=genes,
                  groupby='leiden',
                  use_raw=False,
                  save=args.name + ".leiden.svg")
    sc.pl.stacked_violin(a,
                         var_names=genes,
                         groupby='leiden',
                         use_raw=False,
                         log=True,
                         save=args.name + ".leiden.svg")
    genes = diff.loc[:,
                     diff.columns.str.endswith("_n")].head(20).T.stack().values
    sc.pl.matrixplot(a,
                     var_names=genes,
                     groupby='leiden',
                     use_raw=False,
                     save=args.name + ".leiden.svg")

    # Save processed data
    sc.write(args.name + "processed.h5ad", a)
    a = sc.read(args.name + "processed.h5ad", cache=True)

    # # Donor-specific analysis
    sc.tl.rank_genes_groups(a,
                            groupby='sex',
                            method='t-test',
                            n_genes=1e6,
                            use_raw=False)
    sc.pl.rank_genes_groups(a,
                            n_genes=25,
                            sharey=False,
                            save=args.name + "sex.svg")

    sc.tl.rank_genes_groups(a,
                            groupby='donor_id',
                            method='t-test',
                            n_genes=1e6,
                            use_raw=False)
    sc.pl.rank_genes_groups(a,
                            n_genes=25,
                            sharey=False,
                            save=args.name + "donor_id.svg")
Example #11
def create_project(
    project_name,
    genome_assemblies,
    overwrite=False,
    root_projects_dir=None,
    username=None,
    email=None,
    url=None,
    git=True,
):
    """
    Main function: Create project.
    """
    import subprocess

    from ngs_toolkit.analysis import Analysis

    # Get defaults from config
    if root_projects_dir is None:
        root_projects_dir = _CONFIG["preferences"]["root_projects_dir"]

    root_projects_dir = Analysis._format_string_with_environment_variables(root_projects_dir)
    project_dir = os.path.join(root_projects_dir, project_name)

    if os.path.exists(project_dir):
        if not overwrite:
            _LOGGER.error("Detected existing project directory, skipping.")
            return 1

    # Get defaults from config
    if username is None:
        username = _CONFIG["username"]
    if username is None:
        username = os.getenv("USER")
    if email is None:
        email = _CONFIG["email"]
    if url is None:
        url = _CONFIG["website_root"]
    if url is not None:
        if "{project_name}" in url:
            url = url.format(project_name=project_name)

    metadata_dir = os.path.join(project_dir, "metadata")
    project_config = os.path.join(metadata_dir, "project_config.yaml")
    annotation_table = os.path.join(metadata_dir, "annotation.csv")
    sample_subannotation = os.path.join(metadata_dir, "sample_subannotation.csv")
    comparison_table = os.path.join(metadata_dir, "comparison_table.csv")
    src_dir = os.path.join(project_dir, "src")

    genome_assemblies = "\n            ".join(
        [
            "- if:{n12}    organism: '{org}'{n12}  then:{n12}    genome: '{gen}'".format(
                org=s, gen=g, n12="\n" + "".join([" "] * 12)
            )
            for s, g in genome_assemblies.items()
        ]
    )

    # make dirs
    for d in [project_dir, metadata_dir, src_dir]:
        if not os.path.exists(d):
            os.makedirs(d)

    project_config_template = """    pep_version: "2.0.0"
    project_name: {project_name}
    description: {project_name}
    username: {username}
    email: {email}
    root_dir: {project_dir}
    results_subdir: data
    submission_subdir: submission
    pipeline_interfaces: /home/{username}/workspace/open_pipelines/pipeline_interface.yaml
    sample_table: {annotation_table}
    subsample_table: {sample_subannotation}
    comparison_table: {comparison_table}
    sample_attributes:
        - sample_name
    group_attributes:
        - sample_name
    sample_modifiers:
        imply:
            {genome_assemblies}
        derive:
            attributes: [data_source]
            sources:
                bsf: /scratch/lab_bsf/samples/{{flowcell}}/{{flowcell}}_{{lane}}_samples/{{flowcell}}_{{lane}}#{{BSF_name}}.bam
                local: /tmp/tmptd4zmpiw/test_project/data/{{sample_name}}.bam
    trackhubs:
        trackhub_dir: {project_dir}/trackhubs
        url: {url}""".format(
        project_name=project_name,
        username=username,
        email=email,
        project_dir=project_dir,
        annotation_table=annotation_table,
        sample_subannotation=sample_subannotation,
        comparison_table=comparison_table,
        genome_assemblies=genome_assemblies,
        url=url,
    )

    merge_table_template = ",".join(["sample_name", "flowcell", "lane", "BSF_name", "data_source"])
    annotation_table_template = ",".join(
        [
            "sample_name",
            "toggle",
            "pass_qc",
            "protocol",
            "library",
            "cell_line",
            "cell_type",
            "condition",
            "experimental_batch",
            "experiment_name",
            "replicate",
            "organism",
            "flowcell",
            "lane",
            "BSF_name",
            "data_source",
        ]
    )
    comparison_table_template = ",".join(
        [
            "comparison_type",
            "data_type",
            "comparison_name",
            "comparison_side",
            "sample_name",
            "sample_group",
            "comparison_genome",
            "toggle",
        ]
    )

    # write config and tables
    with open(project_config, "w", 1) as handle:
        handle.write(textwrap.dedent(project_config_template + "\n"))
    with open(sample_subannotation, "w", 1) as handle:
        handle.write(merge_table_template)
    with open(annotation_table, "w", 1) as handle:
        handle.write(annotation_table_template)
    with open(comparison_table, "w", 1) as handle:
        handle.write(comparison_table_template)

    # Initialize git repository
    if git:
        p = subprocess.Popen(
            "git init {}".format(project_dir).split(" "),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        p.communicate()
        return p.returncode
    return 0
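
A minimal usage sketch, assuming the function above; the project name, assembly mapping, and paths are illustrative only:

# Hypothetical call: creates <root_projects_dir>/example_project with metadata
# stubs and a PEP config. Returns 0 on success, or 1 if the directory already
# exists and overwrite is False.
ret = create_project(
    "example_project",
    {"human": "hg38", "mouse": "mm10"},
    root_projects_dir="/tmp/projects",
    username="jdoe",
    email="jdoe@example.com",
    url="http://example.com/{project_name}",
    git=False,
)
assert ret == 0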