Ejemplo n.º 1
# Restrict the dataset to the reservoir-description columns used in the examples.
cols = [
    'Tectonic regime', 'Period', 'Lithology', 'Structural setting', 'Gross',
    'Netpay', 'Porosity', 'Permeability', 'Depth'
]
h = h[cols]

p2 = time.time()
print(f"Time elapsed for uploading data: {p2 - p1}")

# NOTE(review): the discretizer is instantiated here but never added to the
# Preprocessor pipeline below — only the label encoder is applied.
encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(
    n_bins=5, encode='ordinal', strategy='uniform'
)

p = Preprocessor([('encoder', encoder)])
discretized_data, est = p.apply(h)
info = p.info

# --------------------- VALIDATION TEST-------------------------------
nodes_type_mixed = gru.nodes_types(h)
# Keep only the columns whose inferred node type is discrete.
columns = [
    name for name in h.columns.to_list()
    if nodes_type_mixed[name] in ('disc', 'disc_num')
]
discrete_data = h[columns]

discretized_data, est = p.apply(discrete_data)  # warning
info = p.info

bn = Networks.HybridBN()
Ejemplo n.º 2
from bamt.Preprocessors import Preprocessor
import pandas as pd
from sklearn import preprocessing as pp
import bamt.Networks as Networks

# Example: learn the structure and parameters of a hybrid Bayesian network
# on a 150-row sample of the VK dataset.
# Fix: use forward slashes so the path resolves on POSIX systems as well as
# Windows (the other examples in this file already use POSIX-style paths).
vk_data = pd.read_csv("data/real data/vk_data.csv").sample(150)

encoder = pp.LabelEncoder()
# NOTE(review): the discretizer is created but not included in the
# Preprocessor pipeline below, so `discretized_data` is label-encoded only.
discretizer = pp.KBinsDiscretizer(n_bins=5,
                                  encode='ordinal',
                                  strategy='uniform')

p = Preprocessor([('encoder', encoder)])
discretized_data, est = p.apply(vk_data)
info = p.info  # per-column node-type descriptor consumed by add_nodes

bn = Networks.HybridBN(has_logit=False, use_mixture=False)
bn.add_nodes(descriptor=info)

# Prior structural knowledge for the Hill-Climbing search:
# candidate root nodes and fixed starting edges.
params = {
    "init_nodes": ["sex", "has_pets", "is_parent", "relation", "tr_per_month"],
    "init_edges": [("age", "mean_tr"), ("sex", "mean_tr"), ("sex", "has_pets"),
                   ("is_parent", "has_pets"), ("has_pets", "median_tr"),
                   ("is_driver", "tr_per_month"),
                   ("tr_per_month", "median_tr"), ("tr_per_month", "relation")]
}

bn.add_edges(data=discretized_data,
             optimizer='HC',
             scoring_function=('MI', ),
             params=params)
bn.fit_parameters(data=vk_data)
    'Tectonic regime', 'Period', 'Lithology', 'Structural setting', 'Gross',
    'Netpay', 'Porosity', 'Permeability', 'Depth'
]].dropna()

# Split each dataset into features (all but the last column) and a target
# (the last column).
cont_test_data, cont_target = (cont_data[cont_data.columns[:-1]],
                               cont_data[cont_data.columns[-1]])
disc_test_data, disc_target = (disc_data[disc_data.columns[:-1]],
                               disc_data[disc_data.columns[-1]])
hybrid_test_data, hybrid_target = (hybrid_data[hybrid_data.columns[:-1]],
                                   hybrid_data[hybrid_data.columns[-1]])

# Label-encode categorical columns, then bin continuous ones into
# 5 uniform ordinal bins.
encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(
    n_bins=5, encode='ordinal', strategy='uniform'
)
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])

# Discrete pipeline: learn structure on preprocessed data, fit parameters
# on the raw data, then predict, sample, and persist the network.
discretized_data, _ = p.apply(disc_data)
info = p.info
disc_bn = Networks.DiscreteBN()
disc_bn.add_nodes(info)
disc_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score))
disc_bn.fit_parameters(data=disc_data)
disc_bn.calculate_weights(discretized_data)
disc_predicted_values = disc_bn.predict(test=disc_test_data)
disc_predicted_values = pd.DataFrame.from_dict(disc_predicted_values,
                                               orient='columns')
synth_disc_data = disc_bn.sample(50)

disc_bn.save('./disc_bn.json')
Ejemplo n.º 4
print(f"Time elapsed for importing: {p1 - start}")

h = pd.read_csv("data/real data/hack_processed_with_rf.csv")
# Restrict to the reservoir-description columns.
cols = [
    'Tectonic regime', 'Period', 'Lithology', 'Structural setting', 'Gross',
    'Netpay', 'Porosity', 'Permeability', 'Depth'
]
h = h[cols]

print(h.describe())
print("-----")
p2 = time.time()
print(f"Time elapsed for preparing data: {p2 - p1}")

# Label-encode categorical columns, then quantile-discretize into 5 bins.
encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(
    n_bins=5, encode='ordinal', strategy='quantile'
)

p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])

# -----------
discrete_data, est = p.apply(h)
info = p.info

bn = Networks.HybridBN(has_logit=True)  # all may vary
bn.add_nodes(descriptor=info)
bn.add_edges(data=discrete_data, optimizer='HC', scoring_function=('MI',))

bn.get_info(as_df=False)
# Time the parameter-learning step separately.
t1 = time.time()
bn.fit_parameters(data=h)
t2 = time.time()
print(f'PL elapsed: {t2 - t1}')
Ejemplo n.º 5
from bamt.Preprocessors import Preprocessor
import pandas as pd
from sklearn import preprocessing as pp

vk_data = pd.read_csv("data/real data/vk_data.csv")

# Full pipeline: label encoding followed by 5-bin uniform discretization.
encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(
    n_bins=5, encode='ordinal', strategy='uniform'
)

p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])

data, en = p.apply(vk_data)

# ------------------------
# Discretizer-only pipeline, applied to the same raw data; `data` and `en`
# are overwritten with its results.
p2 = Preprocessor([('discretizer', discretizer)])

data, en = p2.apply(vk_data)

print(data)
p1 = time.time()
print(f"Time elapsed for importing: {p1 - start}")

h = pd.read_csv("data/real data/hack_processed_with_rf.csv")
# Restrict to the reservoir-description columns.
cols = [
    'Tectonic regime', 'Period', 'Lithology', 'Structural setting', 'Gross',
    'Netpay', 'Porosity', 'Permeability', 'Depth'
]
h = h[cols]

print(h.describe())

p2 = time.time()
print(f"Time elapsed for uploading data: {p2 - p1}")

discretizer = pp.KBinsDiscretizer(
    n_bins=5, encode='ordinal', strategy='quantile'
)

p = Preprocessor([('discretizer', discretizer)])

# -----------
# Keep only the CONTINUOUS columns (everything whose inferred node type
# is not discrete).
nodes_type_mixed = p.get_nodes_types(h)
cont_columns = [
    name for name in h.columns.to_list()
    if nodes_type_mixed[name] not in ('disc', 'disc_num')
]
cont_data = h[cont_columns]

discretized_data, est = p.apply(cont_data)  # info
info = p.info

bn = Networks.ContinuousBN(use_mixture=True)  # use_mixture = False as well

bn.add_nodes(descriptor=info)

bn.add_edges(data=discretized_data, optimizer='HC', scoring_function=('MI',))
bn.get_info(as_df=False)
Ejemplo n.º 7
p1 = time.time()
print(f"Time elapsed for importing: {p1 - start}")

vk_data = pd.read_csv(r"data/real data/vk_data.csv")
# Keep only the first ROWS rows to keep the example fast.
ROWS = 50
vk_data = vk_data.iloc[:ROWS, :]

p2 = time.time()
print(f"Time elapsed for uploading data: {p2 - p1}")

# Label-encode categorical columns, then bin everything into 5 uniform
# ordinal bins.
encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(
    n_bins=5, encode='ordinal', strategy='uniform'
)

p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])

discretized_data, est = p.apply(vk_data)
info = p.info

bn = Networks.DiscreteBN()
bn.add_nodes(descriptor=info)  # error

# No prior structural restrictions for the Hill-Climbing search.
params = {'init_nodes': None, 'bl_add': None}

bn.add_edges(data=discretized_data,
             optimizer='HC',
             scoring_function=('K2', K2Score),
             params=params)  # error

# # --------------------