Esempio n. 1
0
def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load gene signatures from disk.

    Supported file formats are GMT, DAT (pickled), YAML or CSV (enriched motifs).

    :param fname: The name of the file that contains the signatures.
    :return: A list of gene signatures.
    """
    suffixes = PurePath(fname).suffixes
    if is_valid_suffix(suffixes, 'ctx'):
        # csv/tsv file holding an enriched-motifs table.
        motifs = load_motifs(fname, sep=suffixes_to_separator(suffixes))
        return df2regulons(motifs)
    if is_valid_suffix(suffixes, 'ctx_yaml'):
        return load_from_yaml(fname)
    if '.gmt' in suffixes:
        separator = guess_separator(fname)
        return GeneSignature.from_gmt(fname,
                                      field_separator=separator,
                                      gene_separator=separator)
    if '.dat' in suffixes:
        with openfile(fname, 'rb') as fh:
            return pickle.load(fh)
    raise ValueError("Unknown file format \"{}\".".format(fname))
Esempio n. 2
0
def save_enriched_motifs(df, fname: str) -> None:
    """
    Save enriched motifs.

    Supported file formats are CSV, TSV, GMT, DAT (pickle), JSON or YAML.

    :param df: The dataframe with enriched motifs.
    :param fname: The name of the output file; its suffix selects the format.
    :return:
    """
    suffixes = PurePath(fname).suffixes
    if is_valid_suffix(suffixes, 'ctx'):
        # Tabular output keeps the full dataframe.
        df.to_csv(fname, sep=suffixes_to_separator(suffixes))
        return
    # Every other format serializes the derived regulons instead.
    regulons = df2regulons(df)
    if '.json' in suffixes:
        name2targets = {r.name: list(r.gene2weight.keys()) for r in regulons}
        with openfile(fname, 'w') as fh:
            fh.write(json.dumps(name2targets))
    elif '.dat' in suffixes:
        with openfile(fname, 'wb') as fh:
            pickle.dump(regulons, fh)
    elif '.gmt' in suffixes:
        GeneSignature.to_gmt(fname, regulons)
    elif is_valid_suffix(suffixes, 'ctx_yaml'):
        save_to_yaml(regulons, fname)
    else:
        raise ValueError("Unknown file format \"{}\".".format(fname))
Esempio n. 3
0
def save_enriched_motifs(df, fname: str) -> None:
    """
    Save enriched motifs.

    Supported file formats are CSV, TSV, GMT, DAT (pickle), JSON or YAML.

    :param df: Dataframe with the enriched motifs.
    :param fname: Name of the output file; its extension selects the format.
    :return:
    """
    extension = os.path.splitext(fname)[1].lower()
    # Membership test directly on the dict — `.keys()` was redundant.
    if extension in FILE_EXTENSION2SEPARATOR:
        # Tabular formats (csv/tsv) keep the full dataframe.
        df.to_csv(fname, sep=FILE_EXTENSION2SEPARATOR[extension])
        return
    # Every other format serializes the derived regulons instead.
    regulons = df2regulons(df)
    if extension == '.json':
        # JSON output: map each regulon name to its list of target genes.
        name2targets = {
            r.name: list(r.gene2weight.keys())
            for r in regulons
        }
        with open(fname, 'w') as f:
            f.write(json.dumps(name2targets))
    elif extension == '.dat':
        with open(fname, 'wb') as f:
            pickle.dump(regulons, f)
    elif extension == '.gmt':
        GeneSignature.to_gmt(fname, regulons)
    elif extension in {'.yaml', '.yml'}:
        save_to_yaml(regulons, fname)
    else:
        raise ValueError("Unknown file format \"{}\".".format(fname))
Esempio n. 4
0
def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load genes signatures from disk.

    Supported file formats are GMT, DAT (pickled), YAML or CSV (enriched motifs).

    :param fname: The name of the file that contains the signatures.
    :return: A list of gene signatures.
    """
    extension = os.path.splitext(fname)[1].lower()
    # Membership test directly on the dict — `.keys()` was redundant.
    if extension in FILE_EXTENSION2SEPARATOR:
        # csv/tsv holds an enriched-motifs table; convert it to regulons.
        return df2regulons(
            load_motifs(fname, sep=FILE_EXTENSION2SEPARATOR[extension]))
    elif extension in {'.yaml', '.yml'}:
        return load_from_yaml(fname)
    elif extension == '.gmt':
        # os.path.splitext only ever returns the final suffix, so this
        # equality test is equivalent to the previous endswith() check and
        # consistent with the other branches.
        sep = guess_separator(fname)
        return GeneSignature.from_gmt(fname,
                                      field_separator=sep,
                                      gene_separator=sep)
    elif extension == '.dat':
        # NOTE(review): unpickling is only safe for trusted files.
        with open(fname, 'rb') as f:
            return pickle.load(f)
    else:
        raise ValueError("Unknown file format \"{}\".".format(fname))
Esempio n. 5
0
def prune_targets_command(args):
    """
    Prune targets/find enriched features.

    Loads or derives co-expression modules, loads the ranking databases and
    motif annotations, runs the pruning/enrichment calculation and writes the
    result as CSV or JSON depending on ``args.output_type``.

    :param args: Parsed command-line arguments namespace.
    """
    # Loading from YAML is extremely slow. Therefore this is a potential performance improvement.
    # Potential improvements are switching to JSON or to use a CLoader:
    # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
    # The alternative for which was opted in the end is binary pickling.
    if any(
            args.module_fname.name.endswith(ext)
            for ext in FILE_EXTENSION2SEPARATOR.keys()):
        # A csv/tsv input holds a dataframe: derive the modules from it.
        LOGGER.info("Creating modules.")
        modules = _df2modules(args)
    else:
        # Otherwise the file already contains serialized modules.
        LOGGER.info("Loading modules.")
        modules = _load_modules(args.module_fname.name)

    if len(modules) == 0:
        LOGGER.error("Not a single module loaded")
        sys.exit(1)

    LOGGER.info("Loading databases.")
    dbs = _load_dbs(args.database_fname)

    LOGGER.info("Calculating regulons.")
    motif_annotations_fname = args.annotations_fname.name
    # "no_pruning" == "yes" selects find_features (skips the pruning step);
    # otherwise the full prune2df computation runs.
    calc_func = find_features if args.no_pruning == "yes" else prune2df
    # Show a dask progress bar only in multiprocessing mode; NoProgressBar is
    # a no-op context manager used for all other modes.
    with ProgressBar(
    ) if args.mode == "dask_multiprocessing" else NoProgressBar():
        out = calc_func(dbs,
                        modules,
                        motif_annotations_fname,
                        rank_threshold=args.rank_threshold,
                        auc_threshold=args.auc_threshold,
                        nes_threshold=args.nes_threshold,
                        client_or_address=args.mode,
                        module_chunksize=args.chunk_size,
                        num_workers=args.num_workers)

    LOGGER.info("Writing results to file.")
    if args.output_type == 'csv':
        out.to_csv(args.output)
    else:
        # JSON output: map each regulon name to its list of target genes.
        name2targets = {
            r.name: list(r.gene2weight.keys())
            for r in df2regulons(out)
        }
        args.output.write(json.dumps(name2targets))
        args.output.close()
    motif_enrichment_table = pd.read_pickle(path=f)
elif f.endswith('.csv') or f.endswith('.csv.gz'):
    motif_enrichment_table = utils.read_feature_enrichment_table(
        fname=args.motif_enrichment_table_fname.name, sep=",")
else:
    raise Exception(
        "VSN ERROR: The aggregated feature enrichment table is in the wrong format. Expecting .pickle or .csv formats."
    )
print(f"... took {time.time() - start} seconds to run.", flush=True)

print(f"Making the regulons...", flush=True)
start = time.time()
# Convert the motif enrichment table into regulons, keeping the extra columns
# (NES, orthology/similarity scores, annotation) for downstream inspection.
regulons = transform.df2regulons(df=motif_enrichment_table,
                                 save_columns=[
                                     COLUMN_NAME_NES,
                                     COLUMN_NAME_ORTHOLOGOUS_IDENTITY,
                                     COLUMN_NAME_MOTIF_SIMILARITY_QVALUE,
                                     COLUMN_NAME_ANNOTATION
                                 ])
print(f"{len(regulons)} regulons from df2regulons.")

# Read the signatures saved in out/multi_runs_regulons_[mtf|trk]
# Keep all regulons and targets (so that all can be visualized in SCope):
# no weight threshold and no minimum gene count are applied here.
signatures = utils.read_signatures_from_tsv_dir(dpath=args.signatures_fname,
                                                noweights=False,
                                                weight_threshold=0,
                                                min_genes=0)
print(
    f"{len(signatures)} all regulons from out/multi_runs_regulons_[mtf|trk].")

# Filter regulons (regulons from motifs enrichment table) by the filtered signatures
Esempio n. 7
0
# Scanpy/matplotlib configuration for the demo run.
sc.settings.verbosity = 3
plt.rcParams["axes.grid"] = False

# Run time for demo data: several minutes.

DATA_FOLDER = './Data/'
exp_matrix = pd.read_csv(os.path.join(DATA_FOLDER, 'DC_exp.csv'), index_col=0)
exp_meta = pd.read_csv(os.path.join(DATA_FOLDER, 'DC_meta.txt'),
                       sep='\t',
                       index_col=0)

## Generate the regulons
Motifs_NAME = 'exp_matrix'
motifs = load_motifs(
    os.path.join(DATA_FOLDER, '{}.motifs.csv'.format(Motifs_NAME)))
regulons = df2regulons(motifs)

## Make a meta matrix for the regulon
# One entry per regulon: its TF, its target genes and the target count.
reg_tf = [reg.transcription_factor for reg in regulons]
reg_target = [list(reg.gene2weight.keys()) for reg in regulons]
reg_num = [len(targets) for targets in reg_target]

reg_meta = pd.DataFrame([reg_num, reg_target]).T
reg_meta.index = reg_tf
reg_meta.columns = ['n_targets', 'targets']

## Calculate the AUCell scores
Esempio n. 8
0
                ##df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME, client_or_address="dask_multiprocessing") #originally "local"
                #df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME, client_or_address=client) #originally "local"
                #print("DEFINED df, type:")
                #print(type(df))
                #regulons = df2regulons(df, NOMENCLATURE)

                from dask.diagnostics import ProgressBar
                with ProgressBar():
                    df = prune2df(
                        dbs,
                        modules,
                        MOTIF_ANNOTATIONS_FNAME,
                        client_or_address=client)  #originally "local"
                    print("DEFINED df, type:")
                    print(type(df))
                    regulons = df2regulons(df, NOMENCLATURE)

            else:
                from dask.diagnostics import ProgressBar
                with ProgressBar():
                    #regulons = prune(dbs, modules, MOTIF_ANNOTATIONS_FNAME, client_or_address="dask_multiprocessing") #originally "local"
                    regulons = prune(
                        dbs,
                        modules,
                        MOTIF_ANNOTATIONS_FNAME,
                        client_or_address=client)  #originally "local"
                    print("DEFINED regulons")
                    print(type(regulons))
        else:
            if calcRegulonsWithIntermediateDf:
                df = prune2df(dbs,
Esempio n. 9
0
    DATABASES_GLOB = os.path.join("resources/network_analysis",
                                  "mm10_*.mc9nr.feather")
    MOTIF_ANNOTATIONS_FNAME = os.path.join("resources/network_analysis",
                                           "motifs-v9-nr.mgi-m0.001-o0.0.tbl")

    # Collect all ranking-database files matching the glob.
    db_fnames = glob.glob(DATABASES_GLOB)

    def name(fname):
        # Display name for a database: file name without directory/extension.
        return os.path.splitext(os.path.basename(fname))[0]

    dbs = [
        RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
    ]

    print(dbs)

    # Derive co-expression modules from the adjacencies and expression matrix.
    modules = list(modules_from_adjacencies(adjacencies, ex_matrix))

    # Prune the modules against the ranking databases / motif annotations.
    df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME, num_workers=4)

    print("prune2df done, now saving")
    with open(snakemake.output[0], "wb") as f:
        pickle.dump(df, f)

    print("df2regulons carrying out")
    regulons = df2regulons(df)

    print("prunedone, now saving")
    with open(snakemake.output[1], "wb") as f:
        pickle.dump(regulons, f)