Example #1
def find_adjacencies_command(args):
    """
    Infer co-expression modules.
    """
    LOGGER.info("Loading expression matrix.")
    ex_mtx = _load_expression_matrix(args)
    tf_names = load_tf_names(args.tfs_fname.name)

    n_total_genes = len(ex_mtx.columns)
    n_matching_tfs = ex_mtx.columns.isin(tf_names).sum()
    if n_total_genes == 0:
        LOGGER.error("The expression matrix supplied does not contain any genes. Make sure the extension of the file matches the format (tab separation for TSV and comma separation for CSV).")
        sys.exit(1)
    if float(n_matching_tfs) / len(tf_names) < 0.80:
        LOGGER.warning("Expression data is available for less than 80% of the supplied transcription factors.")

    LOGGER.info("Inferring regulatory networks.")
    client, shutdown_callback = _prepare_client(args.client_or_address, num_workers=args.num_workers)
    try:
        network = grnboost2(expression_data=ex_mtx, tf_names=tf_names, verbose=True, client_or_address=client)
    finally:
        shutdown_callback(False)

    LOGGER.info("Writing results to file.")
    network.to_csv(args.output, index=False, sep='\t')
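# Note: _load_expression_matrix is not shown in this example. A minimal
# sketch of such a helper (an assumption, not the original implementation):
# it treats args.expression_mtx_fname as an open file handle and infers the
# separator from the file extension.
import pandas as pd
from pathlib import PurePath

def _load_expression_matrix(args):
    fname = args.expression_mtx_fname.name
    # .tsv -> tab-separated; anything else is read as comma-separated
    sep = '\t' if PurePath(fname).suffix.lower() == '.tsv' else ','
    # cells as rows, genes as columns
    return pd.read_csv(fname, sep=sep, header=0, index_col=0)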
Example #2
def find_adjacencies_command(args):
    """
    Infer co-expression modules.
    """
    LOGGER.info("Loading expression matrix.")
    try:
        ex_mtx = load_exp_matrix(
            args.expression_mtx_fname.name,
            (args.transpose == 'yes'),
            args.sparse,
            args.cell_id_attribute,
            args.gene_attribute,
        )
    except ValueError as e:
        LOGGER.error(e)
        sys.exit(1)

    tf_names = load_tf_names(args.tfs_fname.name)

    if args.sparse:
        n_total_genes = len(ex_mtx[1])
        n_matching_tfs = ex_mtx[1].isin(tf_names).sum()
    else:
        n_total_genes = len(ex_mtx.columns)
        n_matching_tfs = ex_mtx.columns.isin(tf_names).sum()
    if n_total_genes == 0:
        LOGGER.error(
            "The expression matrix supplied does not contain any genes. "
            "Make sure the extension of the file matches the format (tab separation for TSV and "
            "comma sepatration for CSV)."
        )
        sys.exit(1)
    if float(n_matching_tfs) / len(tf_names) < 0.80:
        LOGGER.warning("Expression data is available for less than 80% of the supplied transcription factors.")

    LOGGER.info("Inferring regulatory networks.")
    client, shutdown_callback = _prepare_client(args.client_or_address, num_workers=args.num_workers)
    method = grnboost2 if args.method == 'grnboost2' else genie3
    try:
        if args.sparse:
            network = method(
                expression_data=ex_mtx[0],
                gene_names=ex_mtx[1],
                tf_names=tf_names,
                verbose=True,
                client_or_address=client,
                seed=args.seed,
            )
        else:
            network = method(
                expression_data=ex_mtx, tf_names=tf_names, verbose=True, client_or_address=client, seed=args.seed
            )
    finally:
        shutdown_callback(False)

    LOGGER.info("Writing results to file.")

    extension = PurePath(args.output.name).suffixes
    network.to_csv(args.output.name, index=False, sep=suffixes_to_separator(extension))
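# Note: suffixes_to_separator comes from the surrounding CLI module and is
# not shown here. A plausible sketch (an assumption): it maps the suffix
# list returned by PurePath(...).suffixes to a csv delimiter.
def suffixes_to_separator(extension):
    # ['.tsv'] or ['.tsv', '.gz'] -> tab; default to comma otherwise
    if '.tsv' in extension:
        return '\t'
    return ','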
Example #3
def run(cfg_fname):
    # Read configuration file.
    cfg = ConfigParser()
    cfg.read(cfg_fname)

    # Set logging level.
    logging_debug_opt = cfg["params"]["debug"].lower().strip() in {
        "yes", "true", "y"
    }
    LOGGER.addHandler(create_logging_handler(logging_debug_opt))
    LOGGER.setLevel(logging.DEBUG if logging_debug_opt else logging.INFO)

    # Derive file names.
    #mtx_fnames = list(mapcat(glob.glob, cfg['data']['mtx_fnames'].split(";")))
    mtx_fnames = glob.glob(cfg['data']['mtx_fnames'])
    tfs = load_tf_names(cfg['data']['tfs_fname'])

    # Derive cluster information.
    not_cluster_ip = 'scheduler_ip' not in cfg['params']
    if not_cluster_ip:
        local_cluster = LocalCluster(n_workers=int(cfg['params']['num_cores']),
                                     threads_per_worker=1)
        client = Client(local_cluster)
    else:

        class DummyClient:
            def close(self):
                pass

        local_cluster = DummyClient()
        client = cfg['params']['scheduler_ip']

    # Remove fnames that already have a corresponding results file.
    def add_output(fname, out_folder):
        basename = os.path.basename(fname).split('.')[0]
        return fname, os.path.join(out_folder, "{}.net.csv".format(basename))

    out_folder = cfg['data']['out_folder']
    for in_fname, out_fname in filter(
            lambda t: not os.path.exists(t[1]),
            map(partial(add_output, out_folder=out_folder), mtx_fnames)):
        LOGGER.info("Running GRNboost for {}.".format(in_fname))
        try:
            process(in_fname, tfs, out_fname, client)
        except ValueError as e:
            LOGGER.error(
                "Unable to process {} because of \"{}\". Stacktrace:".format(
                    in_fname, str(e)))
            LOGGER.error(traceback.format_exc())

    if not_cluster_ip:
        client.close()
        local_cluster.close()

    print("{} - Done.".format(datetime.datetime.now()))
Example #4
def run_algo(client, algo_name, seed_value):

    if algo_name == 'genie3':
        inf_algo = genie3
    elif algo_name == 'grnboost2':
        inf_algo = grnboost2
    else:
        raise ValueError('Unknown inference algorithm: {}'.format(algo_name))

    scaler = StandardScaler()

    for network_name, exp_path, tfs_path in datasets:
        start_time = time.time()

        print('inferring {0} with seed {1}'.format(network_name, seed_value))

        exp_matrix = pd.read_csv(exp_path, sep='\t')

        scaled_values = scaler.fit_transform(exp_matrix)

        exp_matrix_scaled = pd.DataFrame(scaled_values, columns=exp_matrix.columns)

        tf_names = load_tf_names(tfs_path)
        network_df = inf_algo(client_or_address=client,
                              expression_data=exp_matrix_scaled,
                              tf_names=tf_names,
                              seed=seed_value,
                              limit=100000)

        inf_time = time.time()
        delta_time = inf_time - start_time

        print('inferred {0} with seed {1} in {2} seconds'.format(network_name, seed_value, str(delta_time)))

        network_out_path = '{0}{1}.seed_{2}.csv'.format(out_dir, network_name, seed_value)

        network_df.to_csv(network_out_path, sep='\t', index=False, header=False)

        print('{0} with seed {1} written to {2}'.format(network_name, seed_value, network_out_path))
Example #5
    DATABASE_FOLDER = "databases"
    DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "hg19*.mc9nr.feather")
    MOTIF_ANNOTATIONS_FNAME = os.path.join(
        DATABASE_FOLDER, "motifs-v9-nr.hgnc-m0.001-o0.0.tbl")
    HS_TFS_FNAME = os.path.join(RESOURCES_FOLDER, 'hs_hgnc_curated_tfs.txt')
    SC_EXP_FNAME = os.path.join(DATA_FOLDER, (inputFilename + ".tsv"))
    MODULES_FNAME = os.path.join(RESULT_FOLDER,
                                 ("modules_" + inputFilename + ".p"))
    REGULONS_FNAME = os.path.join(RESULT_FOLDER,
                                  "regulons_" + inputFilename + ".p")
    MOTIFS_FNAME = os.path.join(RESULT_FOLDER,
                                "motifs_" + inputFilename + ".csv")

    ex_matrix = pd.read_csv(SC_EXP_FNAME, sep='\t', header=0, index_col=0).T
    print(ex_matrix.shape)
    tf_names = load_tf_names(HS_TFS_FNAME)
    print("tf names loaded")
    out_file = os.path.join(RESULT_FOLDER,
                            "grn_output_" + inputFilename + ".tsv")
    db_fnames = glob.glob(DATABASES_GLOB)

    def name(fname):
        return os.path.splitext(os.path.basename(fname))[0]

    dbs = [
        RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
    ]
    print(dbs)
    print("running grnboost")
    print("tf_names head")
    print(tf_names[:5])
Example #6
    BINARYAUC_FNAME = os.path.join(OUT_FOLDER, (ASSAYID + "_binary_AUC.csv"))
    BINARYTHR_FNAME = os.path.join(OUT_FOLDER,
                                   (ASSAYID + "_binary_thresholds.csv"))
    NOMENCLATURE = "HGNC"

    print("FINISHED DECLARING CONSTANTS")

    #-----------load data-----------------------------------------------------------------
    #Load filtered expression matrix
    ex_matrix = pd.read_csv(SC_EXP_FILT_FNAME, sep='\t', header=0, index_col=0)

    ex_matrix = ex_matrix.T
    print("LOADED ex_matrix")

    #load TF names
    tf_names = load_tf_names(HG_TFS_FNAME)
    print("FIRST 10 TF NAMES:")
    print(tf_names[0:10])

    #load ranking databases
    db_fnames = glob.glob(FEATHER_GLOB)

    #print("DATABASE FILE NAMES:")
    print(db_fnames)

    def name(fname):
        return os.path.basename(fname).split(".")[0]

    #dbs = [RankingDatabase(fname=fname, name=name(fname), nomenclature=NOMENCLATURE) for fname in db_fnames]
    dbs = [
        RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
    ]
Example #7
import pandas as pd
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2, genie3

#parameter
cell_use = '_Interneuron_classA_0.1'

input_file = cell_use + '_matrix.csv'
output_grnboost2 = cell_use + '_grnboost2.csv'
output_genie3 = cell_use + '_genie3.csv'

output_grnboost2_txt = cell_use + '_grnboost2.txt'
output_genie3_txt = cell_use + '_genie3.txt'


#load data
ex_matrix = pd.read_csv(input_file, sep=',', index_col=0)  # first column: gene names
ex_matrix = ex_matrix.T  # transpose to cells x genes
tf_names = load_tf_names('mm_mgi_tfs.txt')

#infer the gene regulatory network
network_n = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)

network_g = genie3(ex_matrix, tf_names=tf_names, verbose=True)


#for following igraph analysis
network_n.to_csv(output_grnboost2, sep='\t')
network_g.to_csv(output_genie3, sep='\t')


#txt file with no header and index for FAC calculation
network_n.to_csv(output_grnboost2_txt, sep='\t', header=False, index=False)
network_g.to_csv(output_genie3_txt, sep='\t', header=False, index=False)
Example #8
    print('reading expression matrix from "{}"'.format(args.i))

    expression_matrix = pd.read_csv(args.i, sep='\t')

    print('expression matrix shape: {}'.format(str(expression_matrix.shape)))
    em_time = time.time()
    print('expression matrix read in {} seconds\n'.format(em_time -
                                                          start_time))

    # -------------------------- #
    # READ TRANSCRIPTION FACTORS #
    # -------------------------- #

    print('reading transcription factors from "{}"'.format(args.tf))

    tf_names = load_tf_names(args.tf)

    gene_names = expression_matrix.columns
    tfs_in_matrix = set(tf_names).intersection(set(gene_names))
    print('{} transcription factors in common with expression matrix\n'.format(
        str(len(tfs_in_matrix))))

    # ------------- #
    # INFER NETWORK #
    # ------------- #

    if args.genie3:
        inf_algo = genie3
        inf_algo_name = 'GENIE3'
    else:
        inf_algo = grnboost2
Example #9
	## Load ranking databases
	db_fnames = glob.glob(db_folder)
	def name(fname):
		return os.path.basename(fname).split(".")[0]
	dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]
	print(dbs)


	## Initialize cluster
	local_cluster = LocalCluster(n_workers=n_cores, threads_per_worker=1, processes=False, memory_limit=memory_limit)
	custom_client = Client(local_cluster)


	## Load TFs
	tf_names = load_tf_names(TFs_file)


	## Collect here regulons passing correlation filter
	cortest_passed_regulons = []


	for i in range(0, iterations):

		## Split to train and test
		data[grouping_variable] = metadata[grouping_variable]
		data_sampled = data.groupby(grouping_variable).apply(lambda x: x.sample(n=min(n_cells, len(x)), random_state=i))
		data_sampled.index = data_sampled.index.get_level_values(1)
		data_train = data_sampled.groupby(grouping_variable).apply(lambda x: x.sample(n=round(len(x) * train_pct), random_state=i))
		data_train.index = data_train.index.get_level_values(1)
		data_test = data_sampled[~data_sampled.index.isin(data_train.index)]
Example #10
import os
import numpy as np
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

data_dir = '/home/brad/data2/rstudio/birds/scRNA/devin_combined/finch_cells/grn/export_to_numpy_glut'
expr_fname = os.path.join(data_dir, '1.1_exprMatrix_filtered_t.txt')
tf_fname = os.path.join(data_dir, '1.1_inputTFs.txt')

if __name__ == '__main__':
    # ex_matrix is a numpy ndarray, which has no notion of column names
    ex_matrix = np.genfromtxt(expr_fname, delimiter='\t', skip_header=1)

    # we read the gene names from the first line of the file
    with open(expr_fname) as file:
        gene_names = [gene.strip() for gene in file.readline().split('\t')]

    # sanity check to verify the ndarray's nr of columns equals the length of the gene_names list
    assert ex_matrix.shape[1] == len(gene_names)

    # tf_names is read using a utility function included in Arboreto
    tf_names = load_tf_names(tf_fname)

    network = grnboost2(expression_data=ex_matrix,
                        gene_names=gene_names,  # specify the gene_names
                        tf_names=tf_names)

    network.to_csv('output.tsv', sep='\t', index=False, header=False)
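# Note: by default grnboost2 spins up a local Dask client on its own. To
# control the number of workers, a client can be passed explicitly; a short
# sketch, assuming the distributed package is installed:
from distributed import LocalCluster, Client

local_cluster = LocalCluster(n_workers=4, threads_per_worker=1)
client = Client(local_cluster)
network = grnboost2(expression_data=ex_matrix,
                    gene_names=gene_names,
                    tf_names=tf_names,
                    client_or_address=client)
client.close()
local_cluster.close()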
Example #11
                             threads_per_worker=1,
                             memory_limit=8e9)
custom_client = Client(local_cluster)
print(custom_client)

#### get expression matrix
ex_path = indir + 'log_CPM_matrix.tsv'
print('reading expression data: ' + ex_path)
ex_matrix = pd.read_csv(ex_path,
                        index_col=0,
                        sep='\t',
                        skiprows=1,
                        header=None).T
#### get tfs
tf_path = '/ddn1/vol1/staging/leuven/stg_00002/lcb/kspan/analyses/ThreeLines10xSCENIC2/hg19_allTFs.lst'
tf_names = load_tf_names(tf_path)
tf_names = list(set(tf_names).intersection(ex_matrix.columns))
print(len(tf_names))

#run grnboost2
outfile = indir + 'grnboost2.tsv'
print('grnboost2 results will be printed to: ' + outfile)
start_time = time.time()
network = grnboost2(expression_data=ex_matrix,
                    tf_names=tf_names,
                    client_or_address=custom_client,
                    verbose=True)
print(time.time() - start_time, "seconds")
print(network.head())
network.to_csv(outfile, sep='\t', index=False, header=False)
sys.stdout = saveout
Example #12
if __name__ == "__main__":

    import os
    import glob
    import pickle
    import pandas as pd
    import numpy as np

    #from dask.diagnostics import ProgressBar

    from arboreto.utils import load_tf_names
    from arboreto.algo import grnboost2

    #from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
    #from pyscenic.utils import modules_from_adjacencies, load_motifs
    #from pyscenic.prune import prune2df, df2regulons
    #from pyscenic.aucell import aucell
    #from pyscenic.binarization import binarize

    ex_matrix = pd.read_csv(snakemake.input[0],
                            sep='\t',
                            header=0,
                            index_col=0).T
    tf_names = load_tf_names("resources/tfs.txt")

    adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)

    with open(snakemake.output[0], "wb") as f:
        pickle.dump(adjacencies, f)
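# Note: snakemake.input[0] and snakemake.output[0] only exist when this file
# is run through a Snakemake "script:" directive, e.g. from a rule like the
# following (hypothetical paths):
#
#   rule grn:
#       input: "results/expression_matrix.tsv"
#       output: "results/adjacencies.pickle"
#       script: "scripts/grn.py"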
Example #13
#get df count
adata=sc.read(f_cnt)
df_cnt=pd.DataFrame(adata.X.toarray(), index=adata.obs.index, columns=adata.var.index)

#---------------functions-----------------
def name(fname):
    return os.path.splitext(os.path.basename(fname))[0]


#################################################################
##1. load df (already loaded in setup)
#df_cnt=pd.read_csv(f_in, index_col=0)

#2. tf genes
tf_name=load_tf_names(f_tf)

#3. ranking databases (only 2 mm10 dbs)
l_fname=list(Path(fd_db).glob('*.feather'))
l_db=[RankingDatabase(fname=fname, name=name(fname)) for fname in l_fname]

#3. run
if __name__ == '__main__':
#	#1. Inference of co-expression modules
#	print('Inference...')
#	df_adj=grnboost2(df_cnt, tf_names=tf_name, verbose=True)
#	df_adj.to_csv(f'{fd_out}/adj.csv', index=False)
	
	#2. prune
	df_adj=pd.read_csv(f'{fd_out}/adj.csv')  #if missing, always stuck at 98%
	print('Prune...')
Example #14
                  ]].drop_duplicates().dropna()  # cleans raw TF annotations
    tfs["ID"] = list(map(int, tfs["Gene ID"]))  # lists the genes by ID
    conv_tfs = pd.read_csv(
        "TF_conversion.txt",
        delimiter="\t")  # imports in MGI gene names from DAVID

    def extract_symbol(name):  # extracts the abbreviated ID from full name
        s_idx = name.rfind('(')
        e_idx = name.rfind(')')
        return name[s_idx + 1:e_idx]

    conv_tfs["Gene Name"].apply(extract_symbol).to_csv(
        TF_list_filename, index=False)  # turns list into CSV for future import

    # this cell loads TF list, expression matrix, and databases for downstream SCENIC analysis
    tf_names = load_tf_names(
        TF_list_filename)  # if TF list has been made, this imports it
    ex_matrix = pd.read_csv(
        "GENIE3_import.csv", header=0,
        index_col=0).T  # loads expression matrix, make sure you transpose back
    databases_glob = os.path.join(
        "mm10__*.feather")  # loads cisTarget databases into memory
    db_fnames = glob.glob(databases_glob)

    def name(fname):
        return os.path.basename(fname).split(".")[0]

    dbs = [
        RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames
    ]

    # GENIE3 process: returns co-expression modules
Example #15
wd = '/home/pezoldt/NAS2/pezoldt/Analysis/scRNAseq/scenic/' + sample_ID + '/' + cell_type + '/int'
#Set directories for TF list and expMat data
net1_ex_path = wd + '/1.1_exprMatrix_filtered_t.txt'
net1_tf_path = wd + '/1.2_inputTFs.txt'

#Load data
ex_matrix = pd.read_csv(net1_ex_path, sep='\t')

#shape of matrix
ex_matrix.shape

#head of matrix
ex_matrix.head()

#load TF list from file
tf_names = load_tf_names(net1_tf_path)
#Quick inspection
tf_names[:5]
len(tf_names)

#Set computational local environment
# Observation: fewer assertion errors occur when fewer people are using the cluster
from distributed import LocalCluster, Client
local_cluster = LocalCluster(n_workers=6, threads_per_worker=1)
custom_client = Client(local_cluster)
custom_client

#Start Job
network = grnboost2(expression_data=ex_matrix,
                    tf_names=tf_names,
                    client_or_address=custom_client)
Example #16
sys.stderr.write("\nStarting Dusk cluster...")
local_cluster = LocalCluster(n_workers=32,
                             threads_per_worker=1,
                             memory_limit=8e10)
custom_client = Client(local_cluster)
sys.stderr.write("done.\n")

# ex_matrix is a DataFrame with gene names as column names
sys.stderr.write("\nReading count matrix...")
ex_matrix = pd.read_csv(in_file, sep='\t', index_col=0, header=None).T
sys.stderr.write("done.\n")

# tf_names is read using a utility function included in Arboreto
sys.stderr.write("\nLoading putative transcription factors...")
tf_names = load_tf_names(tf_file)
sys.stderr.write("done.\n")
sys.stderr.write("\nPredicting co-expression network in chunks...\n")
i = 0
for chunk in grouper(tf_names, 20):
    sys.stderr.write("Working on chunk %s\n" % str(i))
    network = grnboost2(expression_data=ex_matrix,
                        tf_names=chunk,
                        client_or_address=custom_client)
    network.to_csv("network_reddien_" + str(i) + ".csv",
                   sep=",",
                   header=False,
                   index=False)
    i += 1
sys.stderr.write("done.\n")
Example #17
    import pandas as pd
    import numpy as np

    #from dask.diagnostics import ProgressBar

    from arboreto.utils import load_tf_names
    from arboreto.algo import grnboost2

    #from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
    #from pyscenic.utils import modules_from_adjacencies, load_motifs
    #from pyscenic.prune import prune2df, df2regulons
    #from pyscenic.aucell import aucell
    #from pyscenic.binarization import binarize

    ex_matrix = pd.read_csv(snakemake.input[0],
                            sep='\t',
                            header=0,
                            index_col=0).T
    tf_names = load_tf_names("resources/network_analysis/mm_mgi_tfs.txt")

    print(ex_matrix.shape)

    print("finish loading, now do grn")

    adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)

    print("grn_complete, now dumping")

    with open(snakemake.output[0], "wb") as f:
        pickle.dump(adjacencies, f)
Example #18
        if args.sparse:
            ex_matrix = ds.layers[''].sparse().T.tocsc()
            gene_names = pd.Series(ds.ra[args.gene_attribute])
            #cells = ds.ca[args.cell_id_attribute]
        else:
            ex_matrix = pd.DataFrame(data=ds[:, :],
                                     index=ds.ra[args.gene_attribute],
                                     columns=ds.ca[args.cell_id_attribute]).T
            gene_names = pd.Series(ds.ra[args.gene_attribute])

    end_time = time.time()
    print(
        f'Loaded expression matrix of {ex_matrix.shape[0]} cells and {ex_matrix.shape[1]} genes in {end_time - start_time} seconds...',
        file=sys.stderr)

    tf_names = load_tf_names(args.tfs_fname.name)
    print(f'Loaded {len(tf_names)} TFs...', file=sys.stderr)

    ex_matrix, gene_names, tf_names = _prepare_input(ex_matrix, gene_names,
                                                     tf_names)
    tf_matrix, tf_matrix_gene_names = to_tf_matrix(ex_matrix, gene_names,
                                                   tf_names)

    print(f'starting {args.method} using {args.num_workers} processes...',
          file=sys.stderr)
    start_time = time.time()

    with Pool(args.num_workers) as p:
        adjs = list(
            tqdm.tqdm(p.imap(run_infer_partial_network,
                             target_gene_indices(gene_names,
Example #19
    print('finished calculation for %s' % (prefix))


if __name__ == "__main__":

    wkdir = '/bgfs/alee/chelsea/projects/10X/AL1/codes'
    os.chdir(wkdir)

    db = [
        RankingDatabase(
            fname=
            '../data/pySCENIC/ref/hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.feather',
            name='hg38__refseq-r80__10kb_up_and_down_tss.mc9nr')
    ]
    tf_names = load_tf_names('../data/pySCENIC/ref/hs_hgnc_curated_tfs.txt')

    CellTypes = [
        'MCF7', 'T47D WT', 'T47D KO', 'MM134', 'SUM44', 'BCK4', 'MCF10A',
        'HEK293'
    ]

    for cell in CellTypes:

        tmp = adata_raw[adata_raw.obs['CellType'] == cell]
        RawSplicedCts = pd.DataFrame(tmp.layers['spliced'].todense(),
                                     index=tmp.obs.index,
                                     columns=tmp.var.index)  # cell X gene
        print(cell, RawSplicedCts.shape)

        if not isfile('{}/{}_auc_mtx.csv'.format('../data/pySCENIC', cell)):