def __init__(self):
    """Load data, split it, and build TF-IDF feature matrices.

    Reads labeled messages via ``ut.read_sql()`` and an "implementation"
    dataset from ``../resources/implement.csv`` (only its ``Summary``
    column is kept), performs a train/test/validation split (20% held
    out, random seed 42), then TF-IDF-transforms the train, test and
    implementation texts with a shared ``Preprocessing`` pipeline.

    NOTE(review): the original was collapsed onto a single (invalid)
    line; statement order was reconstructed from the visible text.
    """
    messages = ut.read_sql()
    self.idata = pd.read_csv('../resources/implement.csv')
    self.idata = self.idata['Summary']
    # ut.train_test_val_split presumably returns (X_train, X_test,
    # y_train, y_test, score) -- TODO confirm against its definition.
    self.X_train, self.X_test, self.y_train, self.y_test, self.Score = \
        ut.train_test_val_split(messages, 0.2, 42)
    self.X_train_tfidf, self.X_test_tfidf, self.idata_tfidf = \
        Preprocessing().train_preprocess(self.X_train, self.X_test, self.idata)
# --- naive-bayes experiment configuration -------------------------------
# NOTE(review): the original was collapsed onto a single (invalid) line;
# structure below was reconstructed from the visible statement order.
bookmarks = config["bookmarks"]
n_reps = 5
use_saved_model = True
append_timestamp = False
save_best_model = True

# a saved model needs no repeated training runs
if use_saved_model:
    n_reps = 1

acc_train_vect = {}
acc_test_vect = {}
output_filename = "naive_bayes_1"

prep = Preprocessing()
if config["one_hot_encoding"]:
    prep.create_encoder(
        prep.adapt_input(generator.generate_binary(config["n_valves"])))

# only wipe previous models when we are actually retraining
if config["run_clean"] and not use_saved_model:
    loader.clean(root_crt_model_folder)


def init_vect(vect):
    """Append a placeholder slot to each per-key data list and to the
    run counter of an accumulator dict shaped like
    ``{"data": {key: [...]}, "count": [...]}``."""
    for key in vect["data"]:
        # print(key)
        vect["data"][key].append(None)
    # NOTE(review): placement relative to the loop is ambiguous in the
    # mangled original; appended once per call here -- confirm intent.
    vect["count"].append(None)
# --- decision-tree / random-forest evaluation configuration -------------
# NOTE(review): the original was collapsed onto a single (invalid) line;
# structure below was reconstructed from the visible statement order.
model_filenames = filenames
# root_data_folder += "/random1"
# root_crt_model_folder = "./data/models/deep_rnn_random"
# filenames = ["exp_179"]
# model_filenames = ["exp_179"]
# set this as in saved models folder

n_reps = 5
results_vect_train = []
results_vect_test = []
use_randomforest = True

prep = Preprocessing()

# multi-output variant writes to its own model folder / results file
if use_randomforest:
    root_crt_model_folder = config["root_model_container"] + "/dtree_multi"
    output_filename = "dtree_2_multioutput"
else:
    root_crt_model_folder = config["root_model_container"] + "/dtree"
    output_filename = "dtree_1"

# output_filename = "eval_deep_3_rnn_random_"
# output_filename = "eval_deep_5_rnn_random_"

if config["one_hot_encoding"]:
    binv = generator.generate_binary(config["n_valves"])
    print("binv:")
    print(binv)
# --- Amazon / GoogleBase record-linkage preprocessing -------------------
# NOTE(review): the original was collapsed onto a single (invalid) line;
# structure below was reconstructed from the visible statement order.
df1_id = 'id'
df2_id = 'id'
match_id1 = 'idAmazon'      # corresponds to df1_id
match_id2 = 'idGoogleBase'  # corresponds to df2_id

# strip alphabetic characters (currency markers etc.) so price parses as
# float. NOTE(review): pandas >= 2.0 requires regex=True for a pattern
# here; relies on the older default -- confirm the pinned pandas version.
df2["price"] = df2.price.str.replace(r"[a-zA-Z]", '').astype(float)

# save for later use to generate labels
df1_id_col = df1[df1_id]
df2_id_col = df2[df2_id]

# drop id columns because we don't need to compute id similarity
df1 = df1.drop(columns=[df1_id])
df2 = df2.drop(columns=[df2_id])

# may take a while bc loading pretrained word embedding model
processed_data = Preprocessing().overall_preprocess(
    df1.drop(columns=['description']),
    df2.drop(columns=['description']),
    special_columns=['title', 'manufacturer'],
    word_embedding_model='none')

num_matrix_1, num_matrix_2 = (processed_data["numerical"][0],
                              processed_data["numerical"][1])
spc_matrix_1, spc_matrix_2 = (processed_data["special_fields"][0],
                              processed_data["special_fields"][1])

num_final_data = similarities().numerical_similarity_on_matrix(
    num_matrix_1, num_matrix_2)
spc_final_data = similarities().text_similarity_on_matrix(
    spc_matrix_1, spc_matrix_2, method='jaccard')

# cross join: constant key pairs every df1 row with every df2 row, then
# keep only the two description columns
df1['key'] = 0
df2['key'] = 0
merged = pd.merge(df1, df2, on='key')[['description_x', 'description_y']]

''' train-test split '''
non_empty = []
# --- deep / RNN evaluation configuration --------------------------------
# NOTE(review): the original was collapsed onto a single (invalid) line;
# structure below was reconstructed from the visible statement order.
model_filenames = filenames
# root_data_folder += "/random1"
# root_crt_model_folder = "./data/models/deep_rnn_random"
# filenames = ["exp_179"]
# model_filenames = ["exp_179"]
# set this as in saved models folder

n_reps = 5
results_vect_train = []
results_vect_test = []
use_rnn = True

prep = Preprocessing()

output_filename = "eval_deep_1_"
if use_rnn:
    output_filename = "eval_deep_2_rnn_"
# output_filename = "eval_deep_3_rnn_random_"
# output_filename = "eval_deep_5_rnn_random_"

if config["one_hot_encoding"]:
    binv = generator.generate_binary(config["n_valves"])
    print("binv:")
    print(binv)
    binv = prep.adapt_input(binv)
    print("adapted:")
    print(binv)
''' id column manipulation ''' # save for later use to generate labels df1_id_col = df1[df1_id] df2_id_col = df2[df2_id] # drop id columns because we don't need to compute id similarity df1 = df1.drop(columns=[df1_id]) df2 = df2.drop(columns=[df2_id]) ''' preprocess both dataframes ''' processed_data = Preprocessing().overall_preprocess( df1, df2, special_columns=['name', 'addressStreet'], zip_code="addressZip", embedding_weight='tfidf') # may take a while bc loading pretrained word embedding model ''' get numerical data ''' # need fix addressZip and not to see it as numeric num_matrix_1, num_matrix_2 = processed_data["numerical"][0], processed_data[ "numerical"][1] embed_matrix_1, embed_matrix_2 = processed_data["word_embedding_fields"][ 0], processed_data["word_embedding_fields"][1] spc_matrix_1, spc_matrix_2 = processed_data["special_fields"][ 0], processed_data["special_fields"][1] ''' calculate similarities