Example #1
def run(args):
    # Set logging level.
    logging_debug_opt = False
    LOGGER.addHandler(create_logging_handler(logging_debug_opt))
    LOGGER.setLevel(logging.DEBUG)

    LOGGER.info("Using configuration {}.".format(args.config_filename))
    cfg = ConfigParser()
    cfg.read(args.config_filename)

    in_fname = cfg['data']['modules'] if not args.input else args.input
    LOGGER.info("Loading modules from {}.".format(in_fname))
    # Loading from YAML is extremely slow, so this is a potential performance bottleneck.
    # Possible improvements are switching to JSON or using a CLoader:
    # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
    if in_fname.endswith('.yaml'):
        modules = load_from_yaml(in_fname)
    else:
        with open(in_fname, 'rb') as f:
            modules = pickle.load(f)
    # Filter out modules with too few genes.
    min_genes = int(cfg['parameters']['min_genes'])
    modules = list(filter(lambda m: len(m) >= min_genes, modules))

    LOGGER.info("Loading databases.")

    def name(fname):
        return os.path.splitext(os.path.basename(fname))[0]

    db_fnames = list(mapcat(glob.glob, cfg['data']['databases'].split(";")))
    dbs = [
        RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
    ]

    LOGGER.info("Calculating regulons.")
    motif_annotations_fname = cfg['data']['motif_annotations']
    mode = cfg['parameters']['mode']
    with ProgressBar() if mode == "dask_multiprocessing" else NoProgressBar():
        df = prune2df(dbs,
                      modules,
                      motif_annotations_fname,
                      rank_threshold=int(cfg['parameters']['rank_threshold']),
                      auc_threshold=float(cfg['parameters']['auc_threshold']),
                      nes_threshold=float(cfg['parameters']['nes_threshold']),
                      client_or_address=mode,
                      module_chunksize=int(cfg['parameters']['chunk_size']),
                      num_workers=args.num_workers)

    LOGGER.info("Writing results to file.")
    df.to_csv(cfg['parameters']['output'] if not args.output else args.output)
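A minimal sketch of the CLoader speedup mentioned in the comment above. It shows the generic PyYAML pattern, assuming PyYAML was built with LibYAML support; pySCENIC's own load_from_yaml would need the same Loader swap applied internally.

import yaml
try:
    from yaml import CLoader as Loader  # C implementation, much faster
except ImportError:
    from yaml import Loader  # pure-Python fallback

with open(in_fname, 'r') as f:
    raw = yaml.load(f, Loader=Loader)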
Example #2
    def run_regression(self):

        data_df = self.data.to_df()

        utils.Debug.vprint(
            "Calculating {m} adjacencies".format(m=self.adjacency_method),
            level=0)

        # Get adjacencies
        adj_method = ADJ_METHODS[self.adjacency_method]

        if MPControl.is_dask:
            client_or_address = MPControl.client.client
            MPControl.client.check_cluster_state()
        else:
            client_or_address = 'local'

        adjacencies = adj_method(data_df,
                                 tf_names=self.tf_names,
                                 verbose=True,
                                 client_or_address=client_or_address,
                                 seed=self.random_seed)

        if self.do_scenic:

            # Convert adjacencies to modules
            modules = list(modules_from_adjacencies(adjacencies, data_df))

            # Load feather (rank) databases
            dbs = [
                RankingDatabase(fname=self._feather_rank_file,
                                name="RANKING_PRIOR")
            ]

            utils.Debug.vprint("Pruning adjacencies with SCENIC", level=0)

            # Prune to df
            df = prune2df(dbs,
                          modules,
                          self._motif_link_table_file,
                          client_or_address=client_or_address)

            return self.reprocess_scenic_output_to_inferelator_results(
                df, self.priors_data)

        else:

            return self.reprocess_adj_to_inferelator_results(adjacencies)
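The ADJ_METHODS registry used above is not part of this excerpt; a plausible minimal reconstruction, assuming the adjacency methods come from arboreto:

# hypothetical reconstruction of the ADJ_METHODS lookup (an assumption, not
# the project's actual definition); grnboost2 and genie3 are real arboreto functions
from arboreto.algo import grnboost2, genie3

ADJ_METHODS = {"grnboost2": grnboost2, "genie3": genie3}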
Example #3
def calcTFs(
        expr,
        tf_names,
        db,
        prefix,
        motif_path='../data/pySCENIC/ref/motifs-v9-nr.hgnc-m0.001-o0.0.tbl',
        out_path='../data/pySCENIC',
        ppn=8):
    """Computes motifs, regulons and trancriptional factor activation using pySCENIC.

    Arguments
    ---------
    expr: `pandas DataFrame`
        cell x gene matrix of raw counts or FPKM (not TPM, since co-expression
        will be computed from these values)
    tf_names: `list` (`str`)
        curated human transcription factors, downloaded from GitHub:
        pySCENIC/ref/hs_hgnc_curated_tfs.txt
    db: `list` (`FeatherRankingDatabase()`)
        feather files with genome-wide rankings, e.g.
        [FeatherRankingDatabase(name="hg38__refseq-r80__10kb_up_and_down_tss")]
    prefix: `str`
        name used for the output files (e.g. a cell line name)

    Returns
    -------
    Nothing is returned; results are written to files instead (the calculation takes too long to rerun).
    """

    # Inference of co-expression modules
    adjacencies = grnboost2(expr, tf_names=tf_names, verbose=True)
    modules = list(modules_from_adjacencies(adjacencies, expr))

    # Calculate a list of enriched motifs and the corresponding target genes for all modules.
    with ProgressBar():
        df = prune2df(db, modules, motif_path, num_workers=ppn)

    # Create regulons from this table of enriched motifs.
    regulons = df2regulons(df)

    # Save the enriched motifs and the discovered regulons to disk.
    df.to_csv('{}/{}_motifs.csv'.format(out_path, prefix))
    with open('{}/{}_regulons.pkl'.format(out_path, prefix), "wb") as f:
        pickle.dump(regulons, f)

    auc_mtx = aucell(expr, regulons, num_workers=ppn)
    tfs = [tf.replace('(+)', '') for tf in auc_mtx.columns]  # drop the '(+)' suffix from regulon names
    auc_mtx.to_csv('{}/{}_auc_mtx.csv'.format(out_path, prefix))

    print('finished calculation for %s' % (prefix))
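A hypothetical invocation sketch for calcTFs, following the docstring; the file names and the RankingDatabase import path (which varies across pySCENIC versions) are assumptions.

import pandas as pd
from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase  # path differs in newer releases

expr = pd.read_csv('counts.csv', index_col=0)  # hypothetical cell x gene matrix
tf_names = [line.strip() for line in open('hs_hgnc_curated_tfs.txt')]
db = [RankingDatabase(fname='hg38__refseq-r80__10kb_up_and_down_tss.feather',
                      name='hg38__refseq-r80__10kb_up_and_down_tss')]
calcTFs(expr, tf_names, db, prefix='my_cell_line')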
Example #4
		## Derive potential regulons from co-expression modules
		if not os.path.isfile(modules_fname):
			modules = list(modules_from_adjacencies(adjacencies, data_train, keep_only_activating=False))
			pickle.dump(modules, open(modules_fname, 'wb'))
		else:
			modules = pickle.load(open(modules_fname, 'rb'))
		
		del adjacencies

		## Prune modules for targets with cis regulatory footprints (aka RcisTarget)

		### Calculate a list of enriched motifs and the corresponding target genes for all modules.
		if not os.path.isfile(motifs_fname):
			df = prune2df(dbs, modules, motif_annotations, num_workers=n_cores)
			df.to_csv(motifs_fname)
		else:
			df = load_motifs(motifs_fname)  # load_motifs preserves the multi-index that a plain pd.read_csv would drop
		
		del modules

		### Create regulons from this table of enriched motifs.
		if not os.path.isfile(regulons_fname):
			regulons = df2regulons(df)
			pickle.dump(regulons, open(regulons_fname, 'wb'))
		else:
			regulons = pickle.load(open(regulons_fname, 'rb'))
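The same checkpoint pattern extends naturally to the final AUCell step; a sketch, where auc_fname is an assumed path and aucell comes from pyscenic.aucell:

		### Score cells with AUCell, cached like the stages above (sketch)
		if not os.path.isfile(auc_fname):
			auc_mtx = aucell(data_train, regulons, num_workers=n_cores)
			auc_mtx.to_csv(auc_fname)
		else:
			auc_mtx = pd.read_csv(auc_fname, index_col=0)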
Example #5
    adjacencies = grnboost2(
        ex_matrix, tf_names=tf_names,
        verbose=True)  # GRNBoost2, an improved implementation of GENIE3
    modules = list(modules_from_adjacencies(
        adjacencies, ex_matrix))  # derive co-expression modules from the adjacencies

    # save the GRNBoost2 output so this step does not have to be repeated
    adjacencies.to_csv("grnboost_output.csv")

    # reload the saved output in case something goes wrong downstream
    adjacencies = pd.read_csv("grnboost_output.csv", index_col=0)

    # cisTarget process: identifies cis-regulatory footprints from motifs around the TSS
    # calculate a list of enriched motifs and the corresponding target genes for all modules
    with ProgressBar():
        df = prune2df(dbs, modules, "motifs-v9-nr-mgi.txt")
    regulons = df2regulons(df)  # create regulons from this table of enriched motifs

    # save the discovered motifs and regulons
    df.to_csv(motifs_filename)
    with open(regulons_filename, "wb") as f:
        pickle.dump(regulons, f)

    # load the discovered motifs and regulons if saved previously
    df = load_motifs(motifs_filename)
    with open(regulons_filename, "rb") as f:
        regulons = pickle.load(f)

    # AUCell process: finds enrichment of each discovered regulon
    auc_matrix = aucell(ex_matrix, regulons, num_workers=4)
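A common way to inspect the resulting AUC matrix is a clustered heatmap; a short sketch, assuming seaborn is available (it is not a pySCENIC dependency):

    import seaborn as sns
    sns.clustermap(auc_matrix, figsize=(12, 12))  # regulon activity per cell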
Example #6
    adjacencies.to_csv(out_file, sep='\t', index=False, header=False)
    print("grnboost done")
    modules = list(
        modules_from_adjacencies(adjacencies,
                                 ex_matrix,
                                 rho_mask_dropouts=True))

    #print("writing modules")
    #with open(MODULES_FNAME, 'wb') as f:
    #	pickle.dump(modules, f)

    print("Finding Enriched modules")
    # Calculate a list of enriched motifs and the corresponding target genes for all modules.

    with ProgressBar():
        df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME)
        print(df.head())  # preview the enriched motif table

    # Create regulons from this table of enriched motifs.
    print("creating regulons")
    regulons = df2regulons(df)

    print("writing regulons")
    # Save the enriched motifs and the discovered regulons to disk.
    #df.to_csv(MOTIFS_FNAME)
    #with open(REGULONS_FNAME, "wb") as f:
    #	pickle.dump(regulons, f)

    print("Finding AUC of cells")
    auc_mtx = aucell(ex_matrix, regulons, num_workers=1)
    auc_file = os.path.join(RESULT_FOLDER, "AUC_" + inputFilename + ".csv")
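The snippet assembles auc_file but the write itself falls outside the excerpt; the missing step would presumably be a plain CSV dump:

    auc_mtx.to_csv(auc_file)  # assumed continuation: persist the AUCell matrix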
Example #7
    print("STARTING PHASE II")

    if runOnCluster:
        if RegulonsViaDask:
            if calcRegulonsWithIntermediateDf:
                ##df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME, client_or_address="dask_multiprocessing") #originally "local"
                #df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME, client_or_address=client) #originally "local"
                #print("DEFINED df, type:")
                #print(type(df))
                #regulons = df2regulons(df, NOMENCLATURE)

                from dask.diagnostics import ProgressBar
                with ProgressBar():
                    df = prune2df(
                        dbs,
                        modules,
                        MOTIF_ANNOTATIONS_FNAME,
                        client_or_address=client)  #originally "local"
                    print("DEFINED df, type:")
                    print(type(df))
                    regulons = df2regulons(df, NOMENCLATURE)

            else:
                from dask.diagnostics import ProgressBar
                with ProgressBar():
                    #regulons = prune(dbs, modules, MOTIF_ANNOTATIONS_FNAME, client_or_address="dask_multiprocessing") #originally "local"
                    regulons = prune(
                        dbs,
                        modules,
                        MOTIF_ANNOTATIONS_FNAME,
                        client_or_address=client)  #originally "local"
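The client handle passed as client_or_address is created elsewhere in the script; a minimal sketch of one way to set it up, assuming dask.distributed with a local cluster (the actual script may target a remote scheduler):

    # hypothetical cluster setup for the client used above
    from dask.distributed import Client, LocalCluster
    cluster = LocalCluster(n_workers=8, threads_per_worker=1)
    client = Client(cluster)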
Example #8
l_db=[RankingDatabase(fname=fname, name=name(fname)) for fname in l_fname]

#3. run
if __name__ == '__main__':
#	#1. Inference of co-expression modules
#	print('Inference...')
#	df_adj=grnboost2(df_cnt, tf_names=tf_name, verbose=True)
#	df_adj.to_csv(f'{fd_out}/adj.csv', index=False)
	
	#2. prune
	df_adj=pd.read_csv(f'{fd_out}/adj.csv')  # reload from disk; without this reload the run always gets stuck at 98%
	print('Prune...')
	l_mod=list(modules_from_adjacencies(df_adj, df_cnt))

	with ProgressBar():
		df_prune = prune2df(l_db, l_mod, f_motif)
	df_prune.to_csv(f'{fd_out}/prune.csv')
	
	#3. create regulon
	print('Regulon...')
	regulon=df2regulons(df_prune)

	#4. Save the enriched motifs and the discovered regulons
	with open(f'{fd_out}/regulon.pkl', "wb") as f:
		pickle.dump(regulon, f)
	
	#5. auc
	print('AUC...')
	with open(f'{fd_out}/regulon.pkl', "rb") as f:   # reload from disk; without this the run always gets stuck
		regulon=pickle.load(f)
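Step 5 is truncated in this excerpt; the AUCell call it leads into would presumably look like this (the output file name is an assumption):

	df_auc = aucell(df_cnt, regulon, num_workers=8)  # assumed continuation
	df_auc.to_csv(f'{fd_out}/auc.csv')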
		
Example #9
    DATABASES_GLOB = os.path.join("resources/network_analysis",
                                  "mm10_*.mc9nr.feather")
    MOTIF_ANNOTATIONS_FNAME = os.path.join("resources/network_analysis",
                                           "motifs-v9-nr.mgi-m0.001-o0.0.tbl")

    db_fnames = glob.glob(DATABASES_GLOB)

    def name(fname):
        return os.path.splitext(os.path.basename(fname))[0]

    dbs = [
        RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
    ]

    print(dbs)

    modules = list(modules_from_adjacencies(adjacencies, ex_matrix))

    df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME, num_workers=4)

    print("prune2df done, now saving")
    with open(snakemake.output[0], "wb") as f:
        pickle.dump(df, f)

    print("df2regulons carrying out")
    regulons = df2regulons(df)

    print("prunedone, now saving")
    with open(snakemake.output[1], "wb") as f:
        pickle.dump(regulons, f)
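A downstream step can reload these pickles directly; a sketch, where ex_matrix and the literal file name stand in for the corresponding snakemake inputs:

    # hypothetical follow-up: reload the regulons and score cells with AUCell
    import pickle
    from pyscenic.aucell import aucell

    with open("regulons.pkl", "rb") as f:  # i.e. snakemake.output[1] above
        regulons = pickle.load(f)
    auc_mtx = aucell(ex_matrix, regulons, num_workers=4)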