Example No. 1
    def run_discrete(df, pc=None):
        """
        Run the algorithm against a discrete dataframe to return a dot format causal graph.
        :param df: dataframe
        :return: dot graph string
        """
        single_run = False
        dot_str = None
        try:
            # start java vm and get algo runner
            if pc is None:
                pc = pycausal()
                pc.start_vm()
                single_run = True

            tetrad = s.tetradrunner()
            tetrad.run(algoId='rfci',
                       dfs=df,
                       testId='chi-square-test',
                       dataType=AlgorithmConstants.DISCRETE,
                       depth=3,
                       maxPathLength=-1,
                       completeRuleSetUsed=True,
                       verbose=AlgorithmConstants.VERBOSE)
            graph = tetrad.getTetradGraph()
            dot_str = pc.tetradGraphToDot(graph)

            # shutdown java vm
            if single_run:
                pc.stop_vm()
        except Exception as e:
            _logger.error(str(e))
            print(str(e))
        return dot_str
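A minimal calling sketch, not part of the original source, assuming run_discrete is importable from wherever these helpers live; it shows the two ways the optional pc argument is meant to be used.

import pandas as pd
from pycausal.pycausal import pycausal

df = pd.read_table("sim_discrete_data_20vars_100cases.txt", sep="\t")  # placeholder path

# One-off run: run_discrete starts and stops its own Java VM.
dot_str = run_discrete(df)

# Shared-VM run: reuse a single VM across several helper calls, then stop it once.
pc = pycausal()
pc.start_vm()
dot_str = run_discrete(df, pc)
pc.stop_vm()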
Example No. 2
    def run_continuous(df, pc=None):
        """
        Run the algorithm against a continuous dataframe to return a dot format causal graph.
        """
        single_run = False
        dot_str = None
        try:
            # start java vm and get algo runner
            if pc is None:
                pc = pycausal()
                pc.start_vm()
                single_run = True

            tetrad = s.tetradrunner()
            tetrad.run(algoId='rfci',
                       dfs=df,
                       testId='fisher-z-test',
                       depth=-1,
                       maxPathLength=-1,
                       completeRuleSetUsed=False,
                       verbose=AlgorithmConstants.VERBOSE)
            graph = tetrad.getTetradGraph()
            dot_str = pc.tetradGraphToDot(graph)

            # shutdown java vm
            if single_run:
                pc.stop_vm()
        except Exception as e:
            _logger.error(str(e))
            print(str(e))
        return dot_str
Example No. 3
    def run_continuous(df, pc=None):
        """
        Run the algorithm against a continuous dataframe to return a dot format causal graph.
        """
        single_run = False
        dot_str = None
        try:
            # start java vm and get algo runner
            if pc is None:
                pc = pycausal()
                pc.start_vm()
                single_run = True

            tetrad = s.tetradrunner()
            tetrad.run(algoId='fges',
                       dfs=df,
                       maxDegree=-1,
                       faithfulnessAssumed=True,
                       verbose=AlgorithmConstants.VERBOSE)
            graph = tetrad.getTetradGraph()
            dot_str = pc.tetradGraphToDot(graph)

            # shutdown java vm
            if single_run:
                pc.stop_vm()
        except Exception as e:
            _logger.error(str(e))
            print(str(e))
        return dot_str
Example No. 4
    def test_multiple_algo_run(self):
        dot_str_list = []

        pc = pycausal()
        pc.start_vm()
        tetrad = s.tetradrunner()

        data_dir = os.path.join(self.data_dir,
                                "sim_discrete_data_20vars_100cases.txt")
        df = pd.read_table(data_dir, sep="\t")
        dot_str_list.append(self.pc_util.algo_bayes_est(df, pc))

        data_dir = os.path.join(self.data_dir, "charity.txt")
        df = pd.read_table(data_dir, sep="\t")
        dot_str_list.append(self.pc_util.algo_fci(df, pc))

        data_dir = os.path.join(self.data_dir, "charity.txt")
        df = pd.read_table(data_dir, sep="\t")
        dot_str_list.append(self.pc_util.algo_pc(df, pc))

        data_dir = os.path.join(self.data_dir, "charity.txt")
        df = pd.read_table(data_dir, sep="\t")
        dot_str_list.append(self.pc_util.algo_fges_continuous(df, pc))

        data_dir = os.path.join(self.data_dir, "audiology.txt")
        df = pd.read_table(data_dir, sep="\t")
        dot_str_list.append(self.pc_util.algo_fges_discrete(df, pc))

        pc.stop_vm()

        self.assertEqual(len(dot_str_list), 5)
Example No. 5
    def run_discrete(df, pc=None):
        """
        Run the algorithm against a discrete dataframe to return a dot format causal graph.
        :param df: dataframe
        :return: dot graph string
        """
        single_run = False
        dot_str = None
        try:
            # start java vm and get algo runner
            if pc is None:
                pc = pycausal()
                pc.start_vm()
                single_run = True

            tetrad = s.tetradrunner()
            tetrad.run(algoId='fges',
                       dfs=df,
                       scoreId='bdeu-score',
                       dataType=AlgorithmConstants.DISCRETE,
                       maxDegree=3,
                       faithfulnessAssumed=True,
                       symmetricFirstStep=True,
                       verbose=AlgorithmConstants.VERBOSE)
            graph = tetrad.getTetradGraph()
            dot_str = pc.tetradGraphToDot(graph)

            # shutdown java vm
            if single_run:
                pc.stop_vm()
        except Exception as e:
            _logger.error(str(e))
            print(str(e))
        return dot_str
Example No. 6
    def run(df, pc=None):
        """
        Run the algorithm against the dataframe to return a dot format causal graph.
        """
        single_run = False
        dot_str = None
        try:
            # start java vm and get algo runner
            if pc is None:
                pc = pycausal()
                pc.start_vm()
                single_run = True

            tetrad = s.tetradrunner()
            tetrad.run(algoId='pc-all',
                       dfs=df,
                       testId='fisher-z-test',
                       fasRule=2,
                       depth=2,
                       conflictRule=1,
                       concurrentFAS=True,
                       useMaxPOrientationHeuristic=True,
                       verbose=AlgorithmConstants.VERBOSE)
            graph = tetrad.getTetradGraph()
            dot_str = pc.tetradGraphToDot(graph)

            # shutdown java vm
            if single_run:
                pc.stop_vm()
        except Exception as e:
            _logger.error(str(e))
            print(str(e))
        return dot_str
Example No. 7
def GetGraphData(df):
	
	global GLOBAL_data
	global GLOBAL_subgraphs_data
	global GLOBAL_corr_list
	global GLOBAL_node_data

	pc.start_vm()
	GLOBAL_tetrad = s.tetradrunner()


	GLOBAL_tetrad.run(algoId = 'fges', dfs = df, scoreId = 'sem-bic', dataType = 'continuous',
		   maxDegree = -1, faithfulnessAssumed = True, verbose = True)
	GLOBAL_corr_list = corr_data.tolist()
	col = []
	for c in GLOBAL_dataframe.columns:
		col.append(c)

	all_links = []

	for i in range(len(col)):
		for j in range(len(col)):
			all_links.append({"source": col[i], "target": col[j], "value": GLOBAL_corr_list[i][j]})

	all_nodes = []
	for i in range(len(col)):
		all_nodes.append({"id": col[i], "group": 1})



	GLOBAL_node_data = GLOBAL_tetrad.getNodes()
	edge_dict = {}
	edge_data = GLOBAL_tetrad.getEdges()
	
	for e in edge_data:
		if " --> " in e:
			pair = e.split(" --> ")
			if pair[0] in edge_dict:
				edge_dict[pair[0]].append(pair[1])
			else:
				edge_dict[pair[0]] = [pair[1]]

	graph_links = []
	for link_dict in all_links:
		if (link_dict.get("source") in edge_dict) and (link_dict.get("target") in edge_dict.get(link_dict.get("source"))):
			graph_links.append(link_dict)
	GLOBAL_data = {"nodes": all_nodes, "links": graph_links}


	subgraphs = []
	i = 0
	while(len(subgraphs) < 5):
		subgraph = []
		while(len(subgraph) < 5):
			subgraph.append(GLOBAL_data['links'][i % len(GLOBAL_data['links'])])
			i = i+1
		subgraphs.append(subgraph)
	GLOBAL_subgraphs_data = subgraphs
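For reference, an illustrative sketch, not from the original source, of the structure GetGraphData leaves in GLOBAL_data; the node names and values are made up, and the nodes/links layout matches what a D3-style force-directed graph consumes.

# Hypothetical contents of GLOBAL_data after GetGraphData(df) runs.
GLOBAL_data_example = {
    "nodes": [{"id": "X1", "group": 1}, {"id": "X2", "group": 1}],
    "links": [{"source": "X1", "target": "X2", "value": 0.42}],  # kept only because X1 --> X2 was found
}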
Example No. 8
def py_causal(
    sp_pro_deg_v, knowledge_path, bp=False, 
    dataType="discrete", algoId="fges",
    scoreId="disc-bic-score", structurePrior=1.0, samplePrior=1.0, maxDegree=20,
    faithfulnessAssumed=True, symmetricFirstStep=True, verbose=True, 
    numberResampling=100, percentResampleSize=90, resamplingEnsemble=1,
    addOriginalDataset=True, resamplingWithReplacement=True):

    """Notes for py_causal.

    Args:
        sp_sga_deg: 2-D df, index are sample IDs, columns are SGAs and DGEs names. 
                    df's value is 0 or 1.
        bp: bool, using bootstrap. Default is False.
        
    Returns:
        node_l: A list of contain nodes in network.
        edge_l: A 2-D df contains two columns, source node to target node, 
                didn't contain edge without direction.
        bic: fges output, contain nodes, edges, network score.
    """
    #avoid changing input file
    sp_pro_deg = deepcopy(sp_pro_deg_v)

    # connect to java
    from pycausal.pycausal import pycausal as pc
    pc = pc()
    pc.start_vm(java_max_heap_size="1000M")

    # generate knowledge
    from pycausal import prior as p 
    prior = p.knowledgeFromFile(knowledge_path)

    # search
    from pycausal import search as s
    tetrad = s.tetradrunner()

    if bp:
        tetrad.run(
            dfs=sp_pro_deg, priorKnowledge=prior, 
            dataType=dataType, algoId=algoId,
            scoreId=scoreId, structurePrior=structurePrior, samplePrior=samplePrior, maxDegree=maxDegree,
            faithfulnessAssumed=faithfulnessAssumed, symmetricFirstStep=symmetricFirstStep, verbose=verbose, 
            numberResampling=numberResampling, percentResampleSize=percentResampleSize, resamplingEnsemble=resamplingEnsemble,
            addOriginalDataset=addOriginalDataset, resamplingWithReplacement=resamplingWithReplacement)
    else:
        tetrad.run(
            dfs=sp_pro_deg, priorKnowledge=prior, 
            dataType=dataType, algoId=algoId,
            scoreId=scoreId, structurePrior=structurePrior, samplePrior=samplePrior, maxDegree=maxDegree,
            faithfulnessAssumed=faithfulnessAssumed, symmetricFirstStep=symmetricFirstStep, verbose=verbose)

    node_l = tetrad.getNodes()
    edge_l = tetrad.getEdges()
    bic = tetrad.getTetradGraph()

    return node_l, edge_l, bic
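A hedged invocation sketch, not from the original source; the file names and dataframe are placeholders for a binary sample-by-(SGA+DEG) matrix and a Tetrad prior-knowledge file.

import pandas as pd

sp_pro_deg = pd.read_csv("sga_deg_matrix.csv", index_col=0)  # placeholder: 0/1 matrix, samples x genes
nodes, edges, graph = py_causal(sp_pro_deg, "knowledge.prior", bp=False)

# With bootstrap resampling enabled (slower: 100 resamples with the defaults above).
nodes_bs, edges_bs, graph_bs = py_causal(sp_pro_deg, "knowledge.prior", bp=True)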
Example No. 9
def findKeyAttrs(samples, protect_attr='', result_attr='class'):
    """
    Args:
        samples(pandas DataFrame): 
        protect_attr(string || Array<string>): 
    Return:
        key_attrs(list<string>): a list of key attributes that directly influence the decision
    """
    import re  # used below to strip edge-arrow syntax from the edge strings
    from pycausal.pycausal import pycausal as pc
    pc = pc()
    pc.start_vm()
    from pycausal import search as s
    # the pycausal imports must keep the above order

    # choose a causal mining algorithm
    causal = 'fges'
    if causal == 'bayes':
        ### use bayes Est to find the key attributes
        ### somewhat slow, extracts more key attributes
        graph = s.bayesEst(samples, depth=0, alpha=0.05, verbose=True)
    else:
        ## OR use Fast Greedy Equivalence Search
        ## faster than bayesEst, yields fewer key attributes
        graph = s.tetradrunner()
        graph.getAlgorithmParameters(algoId='fges', scoreId='bdeu')
        graph.run(algoId='fges',
                  dfs=samples,
                  scoreId='bdeu',
                  priorKnowledge=None,
                  dataType='discrete',
                  structurePrior=0.5,
                  samplePrior=0.5,
                  maxDegree=5,
                  faithfulnessAssumed=True,
                  verbose=False)

    # graph.getNodes()
    key_attrs = []
    print('edges', graph.getEdges())
    for edge in graph.getEdges():
        if result_attr in edge:
            # extract attr name from the edge
            # remove --> or --o or --- and white space
            attr = re.sub(r'-+>?o?|{}|\s+'.format(result_attr), '', edge)
            key_attrs.append(attr)

    # remove protect attrs
    if type(protect_attr) is not str:
        # if protect attr is a list
        for a in protect_attr:
            if a in key_attrs:
                key_attrs.remove(a)
    elif protect_attr in key_attrs:
        # if protect attr is a string
        key_attrs.remove(protect_attr)
    print('key attributes', key_attrs)
    return key_attrs
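A hedged usage sketch, not from the original source; the dataset path and attribute names are placeholders for a discrete dataframe whose decision column is named 'class'.

import pandas as pd

samples = pd.read_table("audiology.txt", sep="\t")  # placeholder discrete dataset
key_attrs = findKeyAttrs(samples, protect_attr=["age", "sex"], result_attr="class")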
Example No. 10
def setup():
    global pc
    global tetrad
    pc = pcm()
    pc.start_vm()
    tetrad = pcs.tetradrunner()
    tetrad.listAlgorithms()
    tetrad.listIndTests()
    tetrad.listScores()
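After setup() the global runner can be asked which parameters a given algorithm/score pair accepts; a short hedged follow-up, assuming pcm and pcs alias pycausal.pycausal.pycausal and pycausal.search as in the other examples.

setup()

# Print the tunable parameters for FGES with the SEM-BIC score
# (the same getAlgorithmParameters call used in other examples on this page).
tetrad.getAlgorithmParameters(algoId='fges', scoreId='sem-bic')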
Example No. 11
 def init_causal_graph_dot_src(self, df, forbidden_edges, required_edges):
     p = pc()
     p.start_vm()
     tetrad = s.tetradrunner()
     prior = pr.knowledge(forbiddirect = forbidden_edges, requiredirect = required_edges)
     tetrad.run(algoId = 'fges', dfs = df, priorKnowledge = prior, scoreId = 'sem-bic', dataType = 'continuous', penaltyDiscount = 2, maxDegree = -1, faithfulnessAssumed = True, verbose = True)
     dot_src = p.tetradGraphToDot(tetrad.getTetradGraph())
     #p.stop_vm()
     self.edges = tetrad.getEdges()
     self.nodes = tetrad.getNodes()
     dot_src = self.trim_init_src_string(dot_src)
     self.dot_src_lines = self.dot_src_to_lines(dot_src)
     self.dot_src = self.lines_to_dot_src(self.dot_src_lines)
     self.init_dot_src = self.dot_src
     self.uncolored_dot_src = self.init_dot_src
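For reference, a hedged sketch, not from the original, of the edge-list format the forbidden_edges and required_edges arguments expect; the column names are placeholders.

# Each entry is a [source, target] pair of column names from df; forbiddirect bans
# that directed edge, requiredirect forces it into the search.
forbidden_edges = [["age", "treatment"], ["outcome", "treatment"]]
required_edges = [["treatment", "outcome"]]
# self.init_causal_graph_dot_src(df, forbidden_edges, required_edges)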
Example No. 12
    def run_mixed(df, pc=None):
        """
        Run the algorithm against a mixed dataframe to return a dot format causal graph.
        :param df: dataframe
        :return: dot graph string
        """
        single_run = False
        dot_str = None
        try:
            # start java vm and get algo runner
            if pc is None:
                pc = pycausal()
                pc.start_vm()
                single_run = True

            tetrad = s.tetradrunner()
            tetrad.run(algoId='gfci',
                       dfs=df,
                       testId='cg-lr-test',
                       scoreId='cg-bic-score',
                       dataType=AlgorithmConstants.MIXED,
                       numCategoriesToDiscretize=4,
                       maxDegree=3,
                       maxPathLength=-1,
                       completeRuleSetUsed=False,
                       faithfulnessAssumed=True,
                       verbose=AlgorithmConstants.VERBOSE)
            graph = tetrad.getTetradGraph()
            dot_str = pc.tetradGraphToDot(graph)

            # shutdown java vm
            if single_run:
                pc.stop_vm()
        except Exception as e:
            _logger.error(str(e))
            print(str(e))
        return dot_str
Example No. 13
 def learn_fci(self, df, tabu_edges):
     """This function is used to learn model using FCI"""
     from pycausal.pycausal import pycausal as pc
     from pycausal import search as s
     from pycausal import prior as p
     pc = pc()
     pc.start_vm()
     forbid = [list(i) for i in tabu_edges]
     prior = p.knowledge(forbiddirect=forbid)
     tetrad = s.tetradrunner()
     tetrad.getAlgorithmParameters(algoId='fci', testId='fisher-z-test')
     tetrad.run(algoId='fci',
                dfs=df,
                testId='fisher-z-test',
                depth=-1,
                maxPathLength=-1,
                completeRuleSetUsed=False,
                verbose=False)
     edges = tetrad.getEdges()
     dot_str = pc.tetradGraphToDot(tetrad.getTetradGraph())
     graph = pydot.graph_from_dot_data(dot_str)
     # graph[0].write_pdf(fname)
     pc.stop_vm()
     return edges
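A hedged usage sketch, not from the original source; `model` stands for an instance of the class this method belongs to, and the tabu edges are (source, target) pairs whose directed edge is forbidden as prior knowledge.

tabu_edges = [("option_a", "latency"), ("option_b", "throughput")]  # placeholder pairs
edges = model.learn_fci(df, tabu_edges)
print(edges)  # Tetrad edge strings, e.g. 'option_a --> throughput' or 'latency o-o throughput'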
Example No. 14
#!/usr/local/bin/python

import os
import pandas as pd
import pydot
from IPython.display import SVG

data_dir = os.path.join(os.getcwd(), 'data', 'audiology.txt')
df = pd.read_table(data_dir, sep="\t")

from pycausal.pycausal import pycausal as pc
pc = pc()
pc.start_vm(java_max_heap_size='100M')

from pycausal import search as s
tetrad = s.tetradrunner()
tetrad.run(algoId='fges',
           dfs=df,
           scoreId='bdeu',
           dataType='discrete',
           structurePrior=1.0,
           samplePrior=1.0,
           maxDegree=3,
           faithfulnessAssumed=True,
           verbose=True)

tetrad.getNodes()
tetrad.getEdges()

dot = tetrad.getDot()
dot.write_svg('fges-discrete.svg')
Example No. 15
#!/usr/local/bin/python


import os
import pandas as pd
import pydot
from IPython.display import SVG

data_dir = os.path.join(os.getcwd(), 'data', 'audiology.txt')
df = pd.read_table(data_dir, sep="\t")

from pycausal.pycausal import pycausal as pc
pc = pc()
pc.start_vm(java_max_heap_size = '100M')

from pycausal import search as s
tetrad = s.tetradrunner()
tetrad.run(algoId = 'fges', dfs = df, scoreId = 'bdeu', dataType = 'discrete',
           structurePrior = 1.0, samplePrior = 1.0, maxDegree = 3, faithfulnessAssumed = True, verbose = True)

tetrad.getNodes()
tetrad.getEdges()

dot_str = pc.tetradGraphToDot(tetrad.getTetradGraph())
graphs = pydot.graph_from_dot_data(dot_str)
graphs[0].write_svg('fges-discrete.svg')

pc.stop_vm()
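The IPython SVG import in these two scripts is only useful inside a notebook; a hedged sketch, not in the original, of the inline-display variant.

from IPython.display import SVG

# Render the pydot graph inline in a Jupyter notebook instead of writing an SVG file.
SVG(graphs[0].create_svg())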
Example No. 16
def fges_stem(file_path, sys_iter, SGA_l, A_D):

    BIC_l = [float(0)]

    SGA = pd.DataFrame(SGA_l)
    SGA.columns = ['cause gene name']
    A_D_i = A_D

    for i in range(sys_iter):
        print(i)
        file_l = os.listdir(file_path + '/Output/run%i' % i)
        while 'completeMatrixn.csv' not in file_l:
            df_name = file_path + '/Output/run%i/completeMatrix.csv' % i
            df = pd.read_csv(df_name, header=0, index_col=None)

            from pycausal.pycausal import pycausal as pc
            pc = pc()
            pc.start_vm(java_max_heap_size='6400M')

            from pycausal import prior as p
            # get knowledge from knowledge file
            #prior = p.knowledgeFromFile(file_path + '/Input/Knowledge')

            # get knowledge from DEG and SGA list
            DEG_l = [x for x in df.columns if x not in SGA_l]
            A_D_i = A_D_i[DEG_l]
            forbid = create_knowledge(SGA, SGA_l, A_D_i)
            temporal = [SGA_l, p.ForbiddenWithin(DEG_l)]
            prior = p.knowledge(forbiddirect=forbid, addtemporal=temporal)

            from pycausal import search as s
            tetrad = s.tetradrunner()
            tetrad.getAlgorithmParameters(algoId='fges', scoreId='bdeu-score')

            tetrad.run(
                algoId='fges',
                dfs=df,
                scoreId='bdeu-score',
                priorKnowledge=prior,
                dataType='discrete',
                structurePrior=1.0,
                samplePrior=1.0,
                maxDegree=100,
                faithfulnessAssumed=True,
                verbose=True,
                symmetricFirstStep=True
            )  # , numberResampling=10, resamplingEnsemble=1, addOriginalDataset=True)

            # save edges.csv
            node_l = tetrad.getNodes()
            edge_l = tetrad.getEdges()
            # edge_split_l = []
            # for edge in edge_l:
            #     if '---' in edge:
            #         edge_n = edge.split(' ')
            #         if np.sum(df[edge.split(' ')[0]]) > np.sum(df[edge.split(' ')[2]]):
            #             edge_n.reverse()
            #         else:
            #             edge_n = edge_n
            #         edge_split_l.append(edge_n)
            #     else:
            #         edge_split_l.append(edge.split(' '))
            #
            # edge_split_l = [edge.split(' ') for edge in edge_l if '---' not in edge]
            edge_split_l = [edge.split(' ') for edge in edge_l]

            edge_df = pd.DataFrame(edge_split_l).iloc[:, [0, 2]]
            edge_df.to_csv(file_path + '/Output/run%i/Edge.csv' % i,
                           index=False,
                           header=False)

            # save completeMatrixn.csv
            new_df = df.loc[:, node_l]
            new_df.to_csv(file_path + '/Output/run%i/completeMatrixn.csv' % i,
                          index=False,
                          header=True)

            # save BIC.txt
            print(tetrad.getTetradGraph(),
                  file=open(file_path + '/Output/run%i/BIC.txt' % i, 'a'))
            file_l = os.listdir(file_path + '/Output/run%i' % i)

        else:
            # save the BIC values used to verify convergence
            with open(file_path + '/Output/run%i/BIC.txt' % i, 'r') as BIC_txt:
                for line in BIC_txt:
                    if 'BIC: -' in line:
                        BIC_l.append(float(line[5:-1]))

            j = i + 1
            mk_dir(file_path + '/Output/run%d' % j)
            next_file_l = os.listdir(file_path + '/Output/run%i' % j)
            while 'completeMatrix.csv' not in next_file_l:
                exe_path = './MCMC/inferSGAInNetwork_TDI.exe'
                m_path = ' -m ' + file_path + '/Output/run%i/completeMatrixn.csv' % i
                i_path = ' -i ' + file_path + '/Input/S_A0.csv'
                e_path = ' -e ' + file_path + '/Output/run%i/Edge.csv' % i
                o_path = ' -o ' + file_path + '/Output/run%d/ -x 50' % j
                combine = exe_path + m_path + i_path + e_path + o_path
                os.system(combine)
                time.sleep(20)
                next_file_l = os.listdir(file_path + '/Output/run%i' % j)
            else:
                pd.DataFrame(BIC_l).to_csv(file_path + '/Output/BIC.csv',
                                           index=False,
                                           header=False)
Example No. 17
	graph_links = []
	for link_dict in all_links:
		if (link_dict.get("source") in edge_dict) and (link_dict.get("target") in edge_dict.get(link_dict.get("source"))):
			graph_links.append(link_dict)
	GLOBAL_data = {"nodes": all_nodes, "links": graph_links}


	subgraphs = []
	i = 0
	while(len(subgraphs) < 5):
		subgraph = []
		while(len(subgraph) < 5):
			subgraph.append(GLOBAL_data['links'][i % len(GLOBAL_data['links'])])
			i = i+1
		subgraphs.append(subgraph)
	GLOBAL_subgraphs_data = subgraphs

if __name__ == "__main__":

	pc = pc()
	pc.start_vm()
	GLOBAL_tetrad = s.tetradrunner()

	# graph data from file							      
	GLOBAL_dataframe = pd.read_csv(datasetName)
	GLOBAL_initial_nodes = list(GLOBAL_dataframe.columns)
	GLOBAL_dataframe = GLOBAL_dataframe.dropna()
	corr_data = GLOBAL_dataframe.corr()
	corr_data = corr_data.to_numpy()
	GetGraphData(GLOBAL_dataframe)
	app.run(debug=True)