def __init__(self):
    """Load data, split it, and build TF-IDF feature matrices.

    Reads labeled messages via ``ut.read_sql()`` and an "implementation"
    dataset from ``../resources/implement.csv`` (only its ``Summary``
    column is kept), performs a train/test/validation split (20% held
    out, random seed 42), then TF-IDF-transforms the train, test and
    implementation texts with a shared ``Preprocessing`` pipeline.

    NOTE(review): the original was collapsed onto a single (invalid)
    line; statement order was reconstructed from the visible text.
    """
    messages = ut.read_sql()
    self.idata = pd.read_csv('../resources/implement.csv')
    self.idata = self.idata['Summary']
    # ut.train_test_val_split presumably returns (X_train, X_test,
    # y_train, y_test, score) -- TODO confirm against its definition.
    self.X_train, self.X_test, self.y_train, self.y_test, self.Score = \
        ut.train_test_val_split(messages, 0.2, 42)
    self.X_train_tfidf, self.X_test_tfidf, self.idata_tfidf = \
        Preprocessing().train_preprocess(self.X_train, self.X_test, self.idata)
# --- naive-bayes experiment configuration -------------------------------
# NOTE(review): the original was collapsed onto a single (invalid) line;
# structure below was reconstructed from the visible statement order.
bookmarks = config["bookmarks"]
n_reps = 5
use_saved_model = True
append_timestamp = False
save_best_model = True

# a saved model needs no repeated training runs
if use_saved_model:
    n_reps = 1

acc_train_vect = {}
acc_test_vect = {}
output_filename = "naive_bayes_1"

prep = Preprocessing()
if config["one_hot_encoding"]:
    prep.create_encoder(
        prep.adapt_input(generator.generate_binary(config["n_valves"])))

# only wipe previous models when we are actually retraining
if config["run_clean"] and not use_saved_model:
    loader.clean(root_crt_model_folder)


def init_vect(vect):
    """Append a placeholder slot to each per-key data list and to the
    run counter of an accumulator dict shaped like
    ``{"data": {key: [...]}, "count": [...]}``."""
    for key in vect["data"]:
        # print(key)
        vect["data"][key].append(None)
    # NOTE(review): placement relative to the loop is ambiguous in the
    # mangled original; appended once per call here -- confirm intent.
    vect["count"].append(None)
# --- decision-tree / random-forest evaluation configuration -------------
# NOTE(review): the original was collapsed onto a single (invalid) line;
# structure below was reconstructed from the visible statement order.
model_filenames = filenames
# root_data_folder += "/random1"
# root_crt_model_folder = "./data/models/deep_rnn_random"
# filenames = ["exp_179"]
# model_filenames = ["exp_179"]
# set this as in saved models folder

n_reps = 5
results_vect_train = []
results_vect_test = []
use_randomforest = True

prep = Preprocessing()

# multi-output variant writes to its own model folder / results file
if use_randomforest:
    root_crt_model_folder = config["root_model_container"] + "/dtree_multi"
    output_filename = "dtree_2_multioutput"
else:
    root_crt_model_folder = config["root_model_container"] + "/dtree"
    output_filename = "dtree_1"

# output_filename = "eval_deep_3_rnn_random_"
# output_filename = "eval_deep_5_rnn_random_"

if config["one_hot_encoding"]:
    binv = generator.generate_binary(config["n_valves"])
    print("binv:")
    print(binv)
# --- Amazon / GoogleBase record-linkage preprocessing -------------------
# NOTE(review): the original was collapsed onto a single (invalid) line;
# structure below was reconstructed from the visible statement order.
df1_id = 'id'
df2_id = 'id'
match_id1 = 'idAmazon'      # corresponds to df1_id
match_id2 = 'idGoogleBase'  # corresponds to df2_id

# strip alphabetic characters (currency markers etc.) so price parses as
# float. NOTE(review): pandas >= 2.0 requires regex=True for a pattern
# here; relies on the older default -- confirm the pinned pandas version.
df2["price"] = df2.price.str.replace(r"[a-zA-Z]", '').astype(float)

# save for later use to generate labels
df1_id_col = df1[df1_id]
df2_id_col = df2[df2_id]

# drop id columns because we don't need to compute id similarity
df1 = df1.drop(columns=[df1_id])
df2 = df2.drop(columns=[df2_id])

# may take a while bc loading pretrained word embedding model
processed_data = Preprocessing().overall_preprocess(
    df1.drop(columns=['description']),
    df2.drop(columns=['description']),
    special_columns=['title', 'manufacturer'],
    word_embedding_model='none')

num_matrix_1, num_matrix_2 = (processed_data["numerical"][0],
                              processed_data["numerical"][1])
spc_matrix_1, spc_matrix_2 = (processed_data["special_fields"][0],
                              processed_data["special_fields"][1])

num_final_data = similarities().numerical_similarity_on_matrix(
    num_matrix_1, num_matrix_2)
spc_final_data = similarities().text_similarity_on_matrix(
    spc_matrix_1, spc_matrix_2, method='jaccard')

# cross join: constant key pairs every df1 row with every df2 row, then
# keep only the two description columns
df1['key'] = 0
df2['key'] = 0
merged = pd.merge(df1, df2, on='key')[['description_x', 'description_y']]

''' train-test split '''
non_empty = []
# --- deep / RNN evaluation configuration --------------------------------
# NOTE(review): the original was collapsed onto a single (invalid) line;
# structure below was reconstructed from the visible statement order.
model_filenames = filenames
# root_data_folder += "/random1"
# root_crt_model_folder = "./data/models/deep_rnn_random"
# filenames = ["exp_179"]
# model_filenames = ["exp_179"]
# set this as in saved models folder

n_reps = 5
results_vect_train = []
results_vect_test = []
use_rnn = True

prep = Preprocessing()

output_filename = "eval_deep_1_"
if use_rnn:
    output_filename = "eval_deep_2_rnn_"
# output_filename = "eval_deep_3_rnn_random_"
# output_filename = "eval_deep_5_rnn_random_"

if config["one_hot_encoding"]:
    binv = generator.generate_binary(config["n_valves"])
    print("binv:")
    print(binv)
    binv = prep.adapt_input(binv)
    print("adapted:")
    print(binv)
''' id column manipulation ''' # save for later use to generate labels df1_id_col = df1[df1_id] df2_id_col = df2[df2_id] # drop id columns because we don't need to compute id similarity df1 = df1.drop(columns=[df1_id]) df2 = df2.drop(columns=[df2_id]) ''' preprocess both dataframes ''' processed_data = Preprocessing().overall_preprocess( df1, df2, special_columns=['name', 'addressStreet'], zip_code="addressZip", embedding_weight='tfidf') # may take a while bc loading pretrained word embedding model ''' get numerical data ''' # need fix addressZip and not to see it as numeric num_matrix_1, num_matrix_2 = processed_data["numerical"][0], processed_data[ "numerical"][1] embed_matrix_1, embed_matrix_2 = processed_data["word_embedding_fields"][ 0], processed_data["word_embedding_fields"][1] spc_matrix_1, spc_matrix_2 = processed_data["special_fields"][ 0], processed_data["special_fields"][1] ''' calculate similarities