def __init__(self):
    messages = ut.read_sql()
    self.idata = pd.read_csv('../resources/implement.csv')
    self.idata = self.idata['Summary']
    self.X_train, self.X_test, self.y_train, self.y_test, self.Score = ut.train_test_val_split(
        messages, 0.2, 42)
    self.X_train_tfidf, self.X_test_tfidf, self.idata_tfidf = Preprocessing(
    ).train_preprocess(self.X_train, self.X_test, self.idata)
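# ut.train_test_val_split and Preprocessing.train_preprocess are defined
# elsewhere in the project. As a rough sketch of the TF-IDF step, assuming a
# scikit-learn TfidfVectorizer (the real Preprocessing class may differ):
# fit the vocabulary on the training split only, then reuse it to transform
# the test split and the implementation data.
from sklearn.feature_extraction.text import TfidfVectorizer


def train_preprocess_sketch(X_train, X_test, idata):
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)  # learn vocabulary + idf weights
    X_test_tfidf = vectorizer.transform(X_test)  # reuse the training vocabulary
    idata_tfidf = vectorizer.transform(idata)
    return X_train_tfidf, X_test_tfidf, idata_tfidf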
bookmarks = config["bookmarks"]

n_reps = 5
use_saved_model = True
append_timestamp = False
save_best_model = True

if use_saved_model:
    n_reps = 1

acc_train_vect = {}
acc_test_vect = {}

output_filename = "naive_bayes_1"

prep = Preprocessing()

if config["one_hot_encoding"]:
    prep.create_encoder(
        prep.adapt_input(generator.generate_binary(config["n_valves"])))
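# generator.generate_binary is project-specific; it presumably enumerates all
# on/off combinations for config["n_valves"]. A hypothetical equivalent with
# itertools, for illustration only:
from itertools import product


def generate_binary_sketch(n_valves):
    # all 2 ** n_valves binary vectors, e.g. n=2 -> [0,0], [0,1], [1,0], [1,1]
    return [list(bits) for bits in product([0, 1], repeat=n_valves)]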

if config["run_clean"] and not use_saved_model:
    loader.clean(root_crt_model_folder)


def init_vect(vect):
    # reserve a placeholder slot for the upcoming run in every recorded
    # series, and in the run counter
    for key in vect["data"]:
        vect["data"][key].append(None)

    vect["count"].append(None)
model_filenames = filenames

# root_data_folder += "/random1"
# root_crt_model_folder = "./data/models/deep_rnn_random"
# filenames = ["exp_179"]
# model_filenames = ["exp_179"]

# set this to match the number of repetitions in the saved models folder
n_reps = 5

results_vect_train = []
results_vect_test = []

use_randomforest = True

prep = Preprocessing()

if use_randomforest:
    root_crt_model_folder = config["root_model_container"] + "/dtree_multi"
    output_filename = "dtree_2_multioutput"
else:
    root_crt_model_folder = config["root_model_container"] + "/dtree"
    output_filename = "dtree_1"

# output_filename = "eval_deep_3_rnn_random_"
# output_filename = "eval_deep_5_rnn_random_"

if config["one_hot_encoding"]:
    binv = generator.generate_binary(config["n_valves"])
    print("binv:")
    print(binv)
df1_id = 'id'
df2_id = 'id'
match_id1 = 'idAmazon'  # corresponds to df1_id
match_id2 = 'idGoogleBase'  # corresponds to df2_id
# strip alphabetic characters (e.g. currency codes) before the float cast;
# regex=True is required on pandas 2.0+, where str.replace defaults to
# literal matching
df2["price"] = df2.price.str.replace(r"[a-zA-Z]", "", regex=True).astype(float)

# save for later use to generate labels
df1_id_col = df1[df1_id]
df2_id_col = df2[df2_id]

# drop id columns because we don't need to compute id similarity
df1 = df1.drop(columns=[df1_id])
df2 = df2.drop(columns=[df2_id])

processed_data = Preprocessing().overall_preprocess(
    df1.drop(columns=['description']),
    df2.drop(columns=['description']),
    special_columns=['title', 'manufacturer'],
    word_embedding_model='none')  # can be slow when a pretrained word embedding model is loaded

num_matrix_1, num_matrix_2 = processed_data["numerical"][0], processed_data["numerical"][1]
spc_matrix_1, spc_matrix_2 = processed_data["special_fields"][0], processed_data["special_fields"][1]
num_final_data = similarities().numerical_similarity_on_matrix(num_matrix_1, num_matrix_2)
spc_final_data = similarities().text_similarity_on_matrix(spc_matrix_1, spc_matrix_2, method='jaccard')
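# similarities() is project-specific. For reference, pairwise Jaccard
# similarity between two token collections looks like this (a sketch, not
# the project's actual implementation):
def jaccard_sketch(tokens_a, tokens_b):
    a, b = set(tokens_a), set(tokens_b)
    if not a and not b:
        return 1.0  # two empty fields are treated as identical
    return len(a & b) / len(a | b)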

df1['key'] = 0
df2['key'] = 0
merged = pd.merge(df1, df2, on='key')[['description_x', 'description_y']]
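# The constant "key" column emulates a cross join, i.e. the full Cartesian
# product of df1 and df2 rows. On pandas >= 1.2 the helper column can be
# dropped in favor of the built-in cross merge:
# merged = pd.merge(df1, df2, how="cross")[["description_x", "description_y"]]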

'''
train-test split
'''
non_empty = []
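# The split itself is not shown in this excerpt. A conventional version with
# scikit-learn would look like this (assumed variable names, for illustration):
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(
#     features, labels, test_size=0.2, random_state=42)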

use_rnn = True

prep = Preprocessing()

output_filename = "eval_deep_1_"
if use_rnn:
    output_filename = "eval_deep_2_rnn_"

# output_filename = "eval_deep_3_rnn_random_"
# output_filename = "eval_deep_5_rnn_random_"

if config["one_hot_encoding"]:
    binv = generator.generate_binary(config["n_valves"])
    print("binv:")
    print(binv)
    binv = prep.adapt_input(binv)
    print("adapted:")
    print(binv)
'''
id column manipulation
'''
# save for later use to generate labels
df1_id_col = df1[df1_id]
df2_id_col = df2[df2_id]

# drop id columns because we don't need to compute id similarity
df1 = df1.drop(columns=[df1_id])
df2 = df2.drop(columns=[df2_id])
'''
preprocess both dataframes
'''
processed_data = Preprocessing().overall_preprocess(
    df1,
    df2,
    special_columns=['name', 'addressStreet'],
    zip_code="addressZip",
    embedding_weight='tfidf')
# may take a while because a pretrained word embedding model is loaded
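# overall_preprocess is project-specific; embedding_weight='tfidf' suggests a
# TF-IDF-weighted average of word vectors per field. A minimal sketch of that
# idea (hypothetical helper, assuming a dict-like word-vector model):
import numpy as np


def tfidf_weighted_embedding_sketch(tokens, word_vectors, tfidf_weights, dim=300):
    # weighted mean of the vectors of known tokens; zero vector if none known
    pairs = [(word_vectors[t], tfidf_weights.get(t, 1.0))
             for t in tokens if t in word_vectors]
    if not pairs:
        return np.zeros(dim)
    vecs, weights = zip(*pairs)
    return np.average(np.array(vecs), axis=0, weights=np.array(weights))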
'''
get numerical data
'''
# TODO: addressZip should be handled as a string, not inferred as numeric
num_matrix_1, num_matrix_2 = processed_data["numerical"][0], processed_data["numerical"][1]
embed_matrix_1, embed_matrix_2 = processed_data["word_embedding_fields"][0], processed_data["word_embedding_fields"][1]
spc_matrix_1, spc_matrix_2 = processed_data["special_fields"][0], processed_data["special_fields"][1]
'''
calculate similarities