Example #1
import rpy2.robjects as ro  # bridge to R/bnlearn

# get_adjacency_matrix_from_et is provided by the same package (see Example #3)


def export_dsc(et, fiti, custom_classes, path):
    """Exports the Bayesian network encoded by the elimination tree et, with
    the parameters stored in fiti, to a bnlearn-readable file at path"""

    # Get adjacency matrix
    adj = get_adjacency_matrix_from_et(et)

    # Structure to bnlearn
    nr, nc = adj.shape
    adjr = ro.r.matrix(adj.transpose().flatten().tolist(), nrow=nr, ncol=nc)
    ro.r.assign("adj", adjr)
    ro.r('library(bnlearn)')
    ro.r('names = paste0("V", 0:{})'.format(nc - 1))  # one name per variable
    ro.r('e = empty.graph(as.character(names))')
    ro.r('amat <- data.matrix(adj)')
    ro.r('dimnames(amat) <- list(names,names)')
    ro.r('amat(e) = amat')

    # TODO change names of variables and classes
    vnames = ["V{}".format(xi) for xi in range(adj.shape[1])]
    names_classes = custom_classes

    # Create cpts
    ro.r('dist = list()')
    for vi in range(len(vnames)):
        # Get cpt
        factor = fiti.get_factor(len(vnames) + vi)
        cpti = factor.get_prob()
        variables = factor.get_variables()
        ncat = factor.get_num_categories()
        # Export CPT
        ro.r.assign("cpti", cpti)
        ro.r("cpti = unlist(cpti)")
        ro.r.assign("variables", variables)
        ro.r("variables = unlist(variables)")
        ro.r.assign("ncat", ncat)
        ro.r("ncat = unlist(ncat)")
        ro.r('dim(cpti) = ncat')
        ro.r('dn = list()')
        for vj in variables:
            v_name_j = vnames[vj]
            classes = names_classes[vj]
            ro.r.assign("v_name_j", v_name_j)
            ro.r("v_name_j = unlist(v_name_j)")
            ro.r.assign("classes", classes)
            ro.r("classes = unlist(classes)")
            ro.r('dn[[v_name_j]] = classes')
        ro.r('dimnames(cpti) = dn')
        # Append cpt to list
        v_name_i = vnames[vi]
        ro.r.assign("v_name_i", v_name_i)
        ro.r("v_name_i = unlist(v_name_i)")
        ro.r('dist[[v_name_i]] = cpti')

    # Build the fitted network and write it to disk
    ro.r('bn = custom.fit(e, dist = dist)')
    ro.r('write.net("{}", bn)'.format(path))
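A minimal usage sketch (hypothetical: `et` is a learned elimination tree, `fiti` a fitted PyFactorTree such as `mbc_gen_cpp` in Example #4, and `var_classes` the per-variable category lists from Example #3; the output path is illustrative):

# Hypothetical usage; all inputs come from the learning steps in the later examples
export_dsc(et, fiti, var_classes, "models/asia.net")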
Example #2
def hill_climbing(data_frame, et0=None, u=5, metric='bic', metric_params=None, chc=False, tw_bound_type='b', tw_bound=5, optimization_type='size', k_complex=0.1, cores=multiprocessing.cpu_count(), forbidden_parent=None):
    """Gets the adjacency matrix of the Bayesian network encoded by the elimination tree et

    Args:
        data_frame (pandas.DataFrame): Input data
        et0 (elimination_tree.ElimTree): Initial limination tree
        u (int): maximum number of parents
        metric (str): scoring functions
        metric_params (list): Parameters for the scoring function
        chc (bool): If truth efficient learning (constrained hill-climbing) is performed
        tw_bound_type (str): 'b' bound, 'n' none
        tw_bound (float): tree-width bound
        k_complex (float): complexity penalization
        optimization_type (str): 'tw' tries to reduce tree-width, 'size' tries to reduce size. Only used if tw_bound_type!='n'
        cores (int): Number of cores
        forbidden_parent (list): balcklist with forbidden parents

    Returns:
        elimination_tree.ElimTree Learned elimination tree
        float Learning time
    """
        
    # Initialize variables
    k = k_complex
    count_i = 0
    data = data_type.data(data_frame)
    if et0 is None:
        et = ElimTree('et', data.col_names, data.classes)
    else:
        et = et0.copyInfo()
    len_nodes = data.ncols
    if forbidden_parent is None:
        forbidden_parents = [[] for _ in range(data.ncols)]
    else:
        forbidden_parents = forbidden_parent

    # Initialize score_best: metric value for each variable plus the complexity penalization
    score_best = []
    for i in range(0, len_nodes):
        parents = et.nodes[i].parents.display()
        score_best.append(score_function(data, i, parents, metric, metric_params))

    # Complexity of the network
    if tw_bound_type != 'n':
        score_best.append(-k * et.evaluate())
    else:
        score_best.append(0)

    ok_to_proceed = True
    learn_time = 0

    # While there is an improvement
    while ok_to_proceed:
        sys.stdout.write("\r Iteration %d" % count_i)
        sys.stdout.flush()
        count_i += 1

        # Input, Output and new
        ta = time()
        et_new, score_new, time_score, time_compile, time_opt, best_change, forbidden_parents = best_pred_hc(data, et, score_best, forbidden_parents, u, metric, metric_params, chc, tw_bound_type, tw_bound, optimization_type, k_complex, cores)
        
        # If 'mixed', recompute the elimination ordering when the treewidth grows
        if optimization_type == 'mixed' and et_new.tw() > et.tw():
            adj = get_adjacency_matrix_from_et(et_new)
            order, tw_greedy = greedy_tree_width(adj, method='fill')
            if tw_greedy < et_new.tw(False):
                et_new.compile_ordering(order.tolist())
                if tw_bound_type == 'b':
                    if tw_bound >= et_new.tw(False):
                        score_new[-1] = et_new.size() * k_complex
                    else:
                        score_new[-1] = float("-inf")
                elif tw_bound_type == 'p':
                    score_new[-1] = et_new.size() * k_complex
            
        tb = time()
        time_total = time_score + time_compile + time_opt
        print('total time: {}, score time: {}, compile time: {}, opt time: {}, pct: {}'.format(
            time_total, time_score, time_compile, time_opt, float(time_score) / float(time_total)))
        print('change: {}, tw: {}, tw again: {}'.format(best_change, et_new.tw(False), et_new.tw()))
        learn_time += tb - ta
        if sum(score_new) > sum(score_best):
            et = et_new
            score_best = score_new
        else:
            ok_to_proceed = False
        flag_it1 = et0 is None
        with open('learn_aux_dump', 'wb') as f:  # binary mode is required for pickling
            cloudpickle.dump([et, et_new, forbidden_parents, learn_time, flag_it1], f)
        print('return tw: {}'.format(et.tw()))

    return et, learn_time, score_best
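A minimal usage sketch (assuming `data` is a pandas.DataFrame like the one loaded in Example #3; the keyword values are illustrative):

# Hypothetical call: hill climbing with the BIC score and a treewidth bound of 3
et_learned, learn_time, score_best = hill_climbing(data, metric='bic',
                                                   tw_bound_type='b', tw_bound=3)
print('treewidth: {}, time: {:.1f}s'.format(et_learned.tw(False), learn_time))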
Example #3
import pandas
from learn_structure_cache import hill_climbing_cache
from export import get_adjacency_matrix_from_et

# Load ASIA dataframe
file_name = "data/asia.csv"
data = pandas.read_csv(file_name)
var_classes = [['yes', 'no'] for _ in range(8)]

# ----LEARNING BAYESIAN NETWORKS WITH BOUNDED TREEWIDTH---- #
# Learn elimination tree (ET) with hc-et, using a tw bound of 3 and BIC as the objective score
et = hill_climbing_cache(data,
                         metric='bic',
                         tw_bound=3,
                         custom_classes=var_classes)
# Learn ET with hc-et-poly, using a tw bound of 3 and BIC as the objective score
et2 = hill_climbing_cache(data,
                          metric='bic',
                          tw_bound=3,
                          custom_classes=var_classes,
                          add_only=True)

# Get adjacency matrix of the Bayesian network encoded by the ET et
adj_mat = get_adjacency_matrix_from_et(et)
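Since the adjacency matrix is a plain 0/1 array, the arc list of the learned network can be recovered with numpy alone (a sketch, assuming entry (i, j) == 1 encodes the arc i -> j):

import numpy as np

# Enumerate the directed arcs encoded by the adjacency matrix
rows, cols = np.nonzero(adj_mat)
arcs = [(data.columns[i], data.columns[j]) for i, j in zip(rows, cols)]
print(arcs)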
Example #4
# Build the C++ factor tree from the ET descriptor (et_descriptor is assumed from earlier)
mbc_gen_cpp = PyFactorTree(et_descriptor[0], et_descriptor[1],
                           et_descriptor[2], et_descriptor[3])

# Transform the dataframe to the internal data_type
data_p = datat(data, var_classes)
# Learn parameters: alpha is the Dirichlet hyperparameter for Bayesian estimation.
#                   If alpha=0, the maximum-likelihood parameters are obtained
mbc_gen_cpp.learn_parameters(data_p, alpha=1)

# (Optional) Optimize the conditional likelihood of the parameters with L-BFGS
# (mbc_gen and cll_query are assumed from earlier: the learned ET and the query list)
mbc_gen_cpp = l_bfgs(data, mbc_gen, cll_query, var_classes)

# ----INTERPRETING THE BAYESIAN NETWORK---- #
# Obtain the MBC adjacency matrix from the learned ET (mbc_disc, assumed from an earlier step)
num_nodes = data.shape[1]
adj_mat = get_adjacency_matrix_from_et(mbc_disc)
# Obtain the parameters of node Tub
xi = 1  # Tub is node 1
factor = mbc_gen_cpp.get_factor(num_nodes + xi)
parameters = factor.get_prob()

# ----Multidimensional classification---- #
# Obtaining most probable explanations (MPEs)
# Set evidence over the feature variables (features is assumed from earlier).
# For example, Asia = 'yes' (index 0) and Lung Cancer = 'no' (index 1)
mbc_gen_cpp.set_evidence(features, [0, 1, 0, 0, 1])
# Compute MPE
mbc_gen_cpp.max_compute_tree()
# Get factor with results
factor = mbc_gen_cpp.get_factor(-1)
mpe_idx = factor.get_mpe()[0]  # category indices of the MPE
mpe = [var_classes[i][ci] for i, ci in enumerate(mpe_idx)]
Example #5
def learn_mbc_generative(data_frame, query_cll, pruned=True, et0=None, u=5, metric='bic', tw_bound_type='b', tw_bound=5, cores=multiprocessing.cpu_count(), forbidden_parent=None, add_only=False, custom_classes=None, constraint=True, verbose=False):
    """Learning MBCs with bounded treewidth (pruned graph)
    Args:
        data_frame (pandas.DataFrame): Input data
        query_cll: list of query variables, the rest are treated as evidence
        pruned (bool): if true, bound the pruned graph. Otherwise, bound the complete graph
        et0 (elimination_tree.ElimTree): Initial elimination tree (optional)
        u (int): maximum number of parents allowed
        metric (str): scoring functions
        tw_bound_type (str): 'b' bound, 'n' none
        tw_bound (float): tree-width bound
        cores (int): Number of cores
        forbidden_parent (list): blacklist with forbidden parents
        add_only (bool): If true, allow only arc additions
        custom_classes: If not None, the classes of each variable are set to custom_classes
        constraint: If true, the additions and reversals that exceed the treewidth bound are stored in a blacklist
        verbose (bool): If True, print details of the learning process 

    Returns:
        elimination_tree.ElimTree Learned MBC
    """    
    n = data_frame.shape[1]
    features = [i for i in range(n) if i not in query_cll]
    if forbidden_parent is None:
        forbidden_parents = [[] for _ in range(n)]
    else:
        forbidden_parents = forbidden_parent
    # Learn the class and bridge subgraphs: classes and features may only have class parents
    for xi in query_cll:
        forbidden_parents[xi] = list(set(forbidden_parents[xi] + features))
    for xj in features:
        forbidden_parents[xj] = list(set(forbidden_parents[xj] + features))
    et = hill_climbing_cache(data_frame, et0=et0, u=u, metric=metric, tw_bound_type=tw_bound_type, tw_bound=tw_bound, cores=cores, forbidden_parent=forbidden_parents, add_only=add_only, custom_classes=custom_classes, constraint=constraint, verbose=verbose)
    if pruned:
        # Get a topological ordering of the ET for later compilation
        order_pruned = et.getDesc_py(-1)
        order_pruned.reverse()
        # Feature variables are positioned at the tail of the order
        for xi in features:
            order_pruned.remove(xi)
        order_pruned = order_pruned + features
    
    # Learn feature subgraph
    if forbidden_parent is None:
        forbidden_parents = [[] for _ in range(n)]
    else:
        forbidden_parents = forbidden_parent
    for xi in query_cll:
        forbidden_parents[xi] = list(set(forbidden_parents[xi] + query_cll + features))
    for xi in features:
        forbidden_parents[xi] = list(set(forbidden_parents[xi] + query_cll))
    forbidden_parents = forbidden_mbc(et, query_cll, forbidden_parent=forbidden_parents)
    if pruned:
        et = hill_climbing_cache(data_frame, et0=et, u=u, metric=metric, tw_bound_type='n', cores=cores, forbidden_parent=forbidden_parents, add_only=add_only, custom_classes=custom_classes, constraint=constraint)
        et.compile_ordering(order_pruned)
    else:
        et = hill_climbing_cache(data_frame, et0=et, u=u, metric=metric, tw_bound_type=tw_bound_type, tw_bound=tw_bound, cores=cores, forbidden_parent=forbidden_parents, add_only=add_only, custom_classes=custom_classes, constraint=constraint, verbose=verbose)
    if tw_bound_type == 'n':
        adj = get_adjacency_matrix_from_et(et)
        order, _ = greedy_tree_width(adj, method='fill')
        et.compile_ordering(order.tolist())        
    return et
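A minimal usage sketch (hypothetical values; `data` and `var_classes` as in Example #3, treating the first two columns as class variables):

# Hypothetical call: learn an MBC with classes 0 and 1 under a treewidth bound of 3
mbc_et = learn_mbc_generative(data, [0, 1], tw_bound=3, custom_classes=var_classes)
adj_mbc = get_adjacency_matrix_from_et(mbc_et)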
Example #6
# Cast the feature columns (all but the three response columns) to float
df_all.iloc[:, :-3] = df_all.drop(response, axis=1).astype(np.float32)

# ----------- Train MBC -------------#
# Forbidden parents for the MBC: the responses may not have feature parents,
# and the three responses are given a fixed order among themselves
num_nds = df_all.shape[1]
forbidden_parent = [[] for _ in range(num_nds - 3)]
forbidden_parent.append(list(range(num_nds - 3)) + [num_nds - 1, num_nds - 2])
forbidden_parent.append(list(range(num_nds - 3)) + [num_nds - 1])
forbidden_parent.append(list(range(num_nds - 3)))
# Fit classifier
estimator = MBCClassifier(response=response, custom_classes=custom_classes, repeats=20, metric_sem="aic", metric_classifier="aic", alpha=2.5, forbidden_parents=forbidden_parent)
estimator.fit(df_all)

# ----------- Predict MBC -------------#
# As an example, we predict the same instances used for training the model
y_hat = estimator.predict_proba(df_all, repeats_inf=20)

# ----------- Inspect MBC -------------#
# Get adjacency matrix of imputation model
adj_imputer = get_adjacency_matrix_from_et(estimator.et_sem)
# Get adjacency matrix of the first mbc
adj_mbc0 = get_adjacency_matrix_from_et(estimator.mbc_ets[0])
# Get adjacency matrix of the second mbc
adj_mbc1 = get_adjacency_matrix_from_et(estimator.mbc_ets[1])
# ... up to mbc_ets[19], given that repeats = 20 

# Export dsc files with the imputation model and the MBCs (requires the R package bnlearn)
path = "models/"
estimator.export_dsc(path)
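Networks exported through bnlearn (Example #1 writes them with write.net) can be read back into R for inspection; a sketch via rpy2, with a hypothetical file name:

import rpy2.robjects as ro

# Hypothetical read-back of an exported network with bnlearn
ro.r('library(bnlearn)')
ro.r('bn <- read.net("models/asia.net")')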