# preprocess-assg3.py
import pandas as pd
import numpy as np
import preprocess as pr
import utils as util
import split as splt
def get_one_hot_encoding(data, column, value):
    """Return the one-hot vector that encodes ``value`` of ``column``.

    The rows of ``data`` whose ``column`` equals ``value`` are selected, only
    the indicator columns named ``<column>__...`` are kept, and the single
    distinct combination of indicator values is returned.

    Args:
        data: DataFrame holding the raw ``column`` together with its
            ``<column>__...`` indicator columns.
        column: Name of the categorical column.
        value: Category whose encoding is requested.

    Returns:
        A list with one entry per indicator column, in column order. When no
        row of ``data`` has ``value`` in ``column``, an all-zero list of that
        same length is returned.

    Raises:
        AssertionError: If rows sharing ``value`` carry different indicator
            vectors.
    """
    prefix = column + '__'
    related_columns = [col for col in data.columns if col.startswith(prefix)]

    # Keep only the indicator columns of the matching rows; every such row
    # should carry the same encoding, so de-duplication leaves at most one.
    encodings = data.loc[data[column] == value, related_columns].drop_duplicates()
    assert len(encodings) <= 1, 'inconsistent one-hot encoding for a single value'

    if len(encodings) == 0:
        # The raw column is not part of the indicator frame, so the zero
        # vector is sized from the indicator columns themselves.
        return [0] * len(related_columns)
    return encodings.values.tolist()[0]
if __name__ == "__main__":
    # Read 'dating-full.csv' when util.final equals True, otherwise
    # 'test_dataset.csv'. The equality test is kept on purpose so that only a
    # value equal to True selects the first file.
    read_args = ('dating-full.csv', None) if util.final == True else ('test_dataset.csv',)
    columns, data = util.readFile(*read_args)

    # Question 1.i: keep only the first 6500 rows.
    data = data[:6500]

    # Same preprocessing steps as assignment 2 (1.i, 1.ii and 1.iv).
    pr.stripQuotes(data, ['race', 'race_o', 'field'])
    pr.toLowerCase(data, ['field'])
    pr.normalizeColumns(data, util.psParticipants, util.psPartners)

    # Question 1.ii: one-hot encode the categorical columns. The prefix ends
    # with '_' and pandas adds its own '_' separator, which yields indicator
    # columns named '<column>__<value>'.
    dummy_prefixes = [name + '_' for name in util.categorical]
    encoded = pd.get_dummies(data, prefix=dummy_prefixes, columns=util.categorical)

    # Put every raw categorical column back at the position it had in `data`.
    source_order = data.columns.tolist()
    positions = [source_order.index(name) for name in util.categorical]
    for name, position in zip(util.categorical, positions):
        encoded[name] = data[name]
        names = encoded.columns.tolist()
        # The assignment above appended `name` as the last column; slicing
        # with -1 leaves that trailing copy out of the new order.
        names = names[:position] + [name] + names[position:-1]
        encoded = encoded[names]

    # Move 'decision' to the very end.
    names = encoded.columns.tolist()
    names.remove('decision')
    names.append('decision')
    data = encoded[names]

    # Drop the indicator of the value that sorts last in each categorical
    # column, so that value is encoded as an all-zero vector.
    for name in util.categorical:
        largest = data[name].sort_values(ascending=False).values[0]
        data = data.drop(columns=[name + '__' + largest])

    # Report the encoding of one sample value per categorical column.
    values = ['female', 'Black/African American', 'Other', 'economics']
    for i, name in enumerate(util.categorical):
        sample = values[i]
        vector = get_one_hot_encoding(data, name, sample)
        print('Mapped vector for {} in column {}: {}'.format(sample, name, vector))

    # The raw categorical columns are no longer needed once encoded.
    data = data.drop(columns=util.categorical)

    # Question 1.iii: split into train and test sets and save both.
    train, test = splt.split(data, random_state=25, frac=0.2)
    splt.save_train_and_test_split(train, test)