# NOTE(review): this chunk starts mid-script — `h`, `p1`, `time`, `pp`,
# `Preprocessor`, `gru` and `Networks` are presumably bound on earlier,
# unseen lines of the file.
cols = [
    'Tectonic regime', 'Period', 'Lithology', 'Structural setting',
    'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth'
]
h = h[cols]  # restrict the frame to the columns of interest
p2 = time.time()
print(f"Time elapsed for uploading data: {p2 - p1}")

encoder = pp.LabelEncoder()
# NOTE(review): discretizer is constructed but never added to the pipeline
# below — looks intentional for this encoder-only run, but worth confirming.
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

# Pipeline with the label encoder only (no discretization step).
p = Preprocessor([('encoder', encoder)])
discretized_data, est = p.apply(h)
info = p.info

# --------------------- VALIDATION TEST-------------------------------
# Per-column type descriptor (values like 'disc', 'disc_num' seen below).
nodes_type_mixed = gru.nodes_types(h)
# Keep only the columns typed as discrete.
columns = [
    col for col in h.columns.to_list()
    if nodes_type_mixed[col] in ['disc', 'disc_num']
]  # GET ONLY DISCRETE
discrete_data = h[columns]
discretized_data, est = p.apply(discrete_data)  # warning
info = p.info

bn = Networks.HybridBN()
from bamt.Preprocessors import Preprocessor
import pandas as pd
from sklearn import preprocessing as pp
import bamt.Networks as Networks

# Fit a HybridBN on a 150-row sample of the vk dataset with a fixed
# initial structure, using MI scoring for structure learning.
#
# FIX: the path previously used a raw Windows backslash string
# (r"data\real data\vk_data.csv"); forward slashes work on both Windows
# and POSIX and match every other script in this file.
vk_data = pd.read_csv("data/real data/vk_data.csv").sample(150)

# Encoder-only pipeline (no discretization step).
encoder = pp.LabelEncoder()
p = Preprocessor([('encoder', encoder)])
discretized_data, est = p.apply(vk_data)
info = p.info  # per-column type descriptor consumed by add_nodes

bn = Networks.HybridBN(has_logit=False, use_mixture=False)
bn.add_nodes(descriptor=info)

# Hand-specified starting point for the Hill-Climbing search:
# `init_nodes` seeds the node order, `init_edges` seeds the structure.
params = {
    "init_nodes": ["sex", "has_pets", "is_parent", "relation", "tr_per_month"],
    "init_edges": [("age", "mean_tr"), ("sex", "mean_tr"), ("sex", "has_pets"),
                   ("is_parent", "has_pets"), ("has_pets", "median_tr"),
                   ("is_driver", "tr_per_month"), ("tr_per_month", "median_tr"),
                   ("tr_per_month", "relation")]
}
bn.add_edges(data=discretized_data, optimizer='HC',
             scoring_function=('MI', ), params=params)
# Parameters are learned on the raw (pre-encoding) data.
bn.fit_parameters(data=vk_data)
# NOTE(review): this chunk opens mid-statement — the assignment target and the
# opening `...[[` of this column selection are on an earlier, unseen line, as
# are `cont_data`, `disc_data`, `hybrid_data` and the imports.
'Tectonic regime', 'Period', 'Lithology', 'Structural setting',
    'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth' ]].dropna()

# Split each frame into features (all but the last column) and the
# target (last column).
cont_test_data = cont_data[cont_data.columns[:-1]]
cont_target = cont_data[cont_data.columns[-1]]
disc_test_data = disc_data[disc_data.columns[:-1]]
disc_target = disc_data[disc_data.columns[-1]]
hybrid_test_data = hybrid_data[hybrid_data.columns[:-1]]
hybrid_target = hybrid_data[hybrid_data.columns[-1]]

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])

# Discrete pipeline: structure learned on discretized data (K2 score),
# parameters fitted on the raw data, then predict / sample / save.
discretized_data, _ = p.apply(disc_data)
disc_bn = Networks.DiscreteBN()
info = p.info
disc_bn.add_nodes(info)
disc_bn.add_edges(data=discretized_data, scoring_function=('K2', K2Score))
disc_bn.fit_parameters(data=disc_data)
disc_bn.calculate_weights(discretized_data)
disc_predicted_values = disc_bn.predict(test=disc_test_data)
# predict returns a dict-like result; normalize it into a DataFrame.
disc_predicted_values = pd.DataFrame.from_dict(disc_predicted_values, orient='columns')
synth_disc_data = disc_bn.sample(50)  # draw 50 synthetic rows
disc_bn.save('./disc_bn.json')
# NOTE(review): chunk starts mid-script — `p1`, `start`, `time`, `pd`, `pp`,
# `Preprocessor` and `Networks` are presumably bound on earlier, unseen lines.
print(f"Time elapsed for importing: {p1 - start}")

h = pd.read_csv("data/real data/hack_processed_with_rf.csv")
cols = ['Tectonic regime', 'Period', 'Lithology', 'Structural setting',
        'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth']
h = h[cols]  # restrict to the columns of interest
print(h.describe())
print("-----")
p2 = time.time()
print(f"Time elapsed for preparing data: {p2 - p1}")

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])
# -----------
discrete_data, est = p.apply(h)
info = p.info  # per-column type descriptor consumed by add_nodes

bn = Networks.HybridBN(has_logit=True)  # all may vary
bn.add_nodes(descriptor=info)
# Structure learning on the discretized data via Hill Climbing + MI score.
bn.add_edges(data=discrete_data, optimizer='HC', scoring_function=('MI',))
bn.get_info(as_df=False)

# Time the parameter-learning step (fitted on the raw data).
t1 = time.time()
bn.fit_parameters(data=h)
t2 = time.time()
print(f'PL elapsed: {t2 - t1}')
from bamt.Preprocessors import Preprocessor
import pandas as pd
from sklearn import preprocessing as pp

# Smoke-test the Preprocessor on the vk dataset with two pipeline variants.
vk_data = pd.read_csv("data/real data/vk_data.csv")

label_encoder = pp.LabelEncoder()
bins_discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

# Variant 1: label-encode, then discretize.
full_pipeline = Preprocessor([('encoder', label_encoder), ('discretizer', bins_discretizer)])
data, en = full_pipeline.apply(vk_data)
# (the encoding map `en` can be dumped here for inspection if needed)
# ------------------------
# Variant 2: discretizer only — its output is what gets printed below.
discretize_only = Preprocessor([('discretizer', bins_discretizer)])
data, en = discretize_only.apply(vk_data)
print(data)
# NOTE(review): chunk starts mid-script — `start`, `time`, `pd`, `pp`,
# `Preprocessor` and `Networks` are presumably bound on earlier, unseen lines.
p1 = time.time()
print(f"Time elapsed for importing: {p1 - start}")

h = pd.read_csv("data/real data/hack_processed_with_rf.csv")
cols = ['Tectonic regime', 'Period', 'Lithology', 'Structural setting',
        'Gross', 'Netpay', 'Porosity', 'Permeability', 'Depth']
h = h[cols]  # restrict to the columns of interest
print(h.describe())
p2 = time.time()
print(f"Time elapsed for uploading data: {p2 - p1}")

discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
p = Preprocessor([('discretizer', discretizer)])
# -----------
nodes_type_mixed = p.get_nodes_types(h)
# FIX: idiomatic membership test (`x not in y` rather than `not x in y`,
# PEP 8 / E713); the selection itself is unchanged.
columns = [col for col in h.columns.to_list()
           if nodes_type_mixed[col] not in ['disc', 'disc_num']]  # GET ONLY CONT
# NOTE(review): misnomer kept for compatibility with any later code —
# `discrete_data` actually holds the *continuous* columns selected above.
discrete_data = h[columns]
discretized_data, est = p.apply(discrete_data)  # info
info = p.info  # per-column type descriptor consumed by add_nodes

bn = Networks.ContinuousBN(use_mixture=True)  # use_mixture = False as well
bn.add_nodes(descriptor=info)
# Structure learning on the discretized data via Hill Climbing + MI score.
bn.add_edges(data=discretized_data, optimizer='HC', scoring_function=('MI',))
bn.get_info(as_df=False)
# NOTE(review): chunk starts mid-script — `start`, `time`, `pd`, `pp`,
# `Preprocessor`, `Networks` and `K2Score` are presumably bound on earlier,
# unseen lines of the file.
p1 = time.time()
print(f"Time elapsed for importing: {p1 - start}")

vk_data = pd.read_csv(r"data/real data/vk_data.csv")
ROWS = 50
vk_data = vk_data.iloc[:ROWS, :]  # keep only the first 50 rows

p2 = time.time()
print(f"Time elapsed for uploading data: {p2 - p1}")

encoder = pp.LabelEncoder()
discretizer = pp.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
p = Preprocessor([('encoder', encoder), ('discretizer', discretizer)])
discretized_data, est = p.apply(vk_data)
info = p.info  # per-column type descriptor consumed by add_nodes

bn = Networks.DiscreteBN()
# The original author marked both calls below as failing ("# error");
# this looks like a deliberate reproduction case — confirm before relying on it.
bn.add_nodes(descriptor=info)  # error
params = {'init_nodes': None, 'bl_add': None}
bn.add_edges(data=discretized_data, optimizer='HC',
             scoring_function=('K2', K2Score), params=params)  # error
# # --------------------