forked from subhashbylaiah/Network-Fault-Prediction
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_preprocessing.py
executable file
·155 lines (132 loc) · 8.08 KB
/
data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
'''
Vidya Sagar Kalvakunta (vkalvaku)
Subhash Bylaiah (sybylaiah)
'''
import os
import pandas as pd
import numpy as np
from scipy.stats.mstats import mode
from sklearn.preprocessing import LabelEncoder
from matplotlib.pyplot import rcParams
datadir = 'data'
result_datadir = 'data'
# Read file from the given path
def readFile(file_name):
return pd.read_csv(os.path.join(datadir, file_name))
def process_merge_event_type(df, event_type):
event_type = event_type.merge(df, on='id')
unique_event_type = pd.DataFrame(event_type['event_type'].value_counts())
# initialize:
unique_event_type['preprocess'] = unique_event_type.index.values
unique_event_type['TrainPerc'] = event_type.pivot_table(values='source', index='event_type',
aggfunc=lambda x: sum(x == 'train') / float(len(x)))
# remove the ones not present in train:
unique_event_type['preprocess'].loc[unique_event_type['TrainPerc'] == 0] = 'Delete'
unique_event_type['severity_mode'] = event_type.loc[event_type['source'] == 'train'].pivot_table(
values='fault_severity', index='event_type', aggfunc=lambda x: mode(x)[0])
# replace the lower ones with mode:
top_unchange = 33
unique_event_type['preprocess'].iloc[top_unchange:] = unique_event_type['severity_mode'].iloc[top_unchange:].apply(
lambda x: 'Delete' if pd.isnull(x) else 'event_type others_%d' % int(x))
# Merge preprocess into original and then into train:
event_type = event_type.merge(unique_event_type[['preprocess']], left_on='event_type', right_index=True)
event_type_merge = event_type.pivot_table(values='event_type', index='id', columns='preprocess',
aggfunc=lambda x: len(x), fill_value=0)
return (df.merge(event_type_merge, left_on='id', right_index=True), event_type)
def process_merge_log_feature(df, log_feature):
# log feature
log_feature = log_feature.merge(df[['id', 'fault_severity', 'source']], on='id')
uniuqe_log_feature = pd.DataFrame(log_feature['log_feature'].value_counts())
uniuqe_log_feature['TrainPerc'] = log_feature.pivot_table(values='source', index='log_feature',
aggfunc=lambda x: sum(x == 'train') / float(len(x)))
uniuqe_log_feature = pd.DataFrame(log_feature['log_feature'].value_counts())
uniuqe_log_feature['TrainPerc'] = log_feature.pivot_table(values='source', index='log_feature',
aggfunc=lambda x: sum(x == 'train') / float(len(x)))
# Determine the mode of each:
uniuqe_log_feature['severity_mode'] = log_feature.loc[log_feature['source'] == 'train'].pivot_table(
values='fault_severity', index='log_feature',
aggfunc=lambda x: mode(x)[0])
uniuqe_log_feature['preprocess'] = uniuqe_log_feature.index.values
# remove the ones all in train
uniuqe_log_feature['preprocess'].loc[uniuqe_log_feature['TrainPerc'] == 1] = np.nan
top_unchange = 128
uniuqe_log_feature['preprocess'].iloc[top_unchange:] = uniuqe_log_feature['severity_mode'].iloc[
top_unchange:].apply(
lambda x: 'Delete' if pd.isnull(x) else 'feature others_%d' % int(x))
log_feature = log_feature.merge(uniuqe_log_feature[['preprocess']], left_on='log_feature', right_index=True)
log_feature_merge = log_feature.pivot_table(values='volume', index='id', columns='preprocess',
aggfunc=np.sum, fill_value=0)
return df.merge(log_feature_merge, left_on='id', right_index=True)
def process_merge_resource_type(df, resource_type):
# resource type
resource_type = resource_type.merge(df[['id', 'fault_severity', 'source']], on='id')
unique_resource_type = pd.DataFrame(resource_type['resource_type'].value_counts())
unique_resource_type['PercTrain'] = resource_type.pivot_table(values='source', index='resource_type',
aggfunc=lambda x: sum(x == 'train') / float(len(x)))
unique_resource_type.head()
unique_resource_type['severity_mode'] = resource_type.loc[resource_type['source'] == 'train'].pivot_table(
values='fault_severity', index='resource_type', aggfunc=lambda x: mode(x)[0])
resource_type_merge = resource_type.pivot_table(values='source', index='id', columns='resource_type',
aggfunc=lambda x: len(x), fill_value=0)
return df.merge(resource_type_merge, left_on='id', right_index=True)
def process_merge_severity_type(df, severity_type, event_type):
# severity type
severity_type = severity_type.merge(df[['id', 'fault_severity', 'source']], on='id')
unique_severity_type = pd.DataFrame(severity_type['severity_type'].value_counts())
unique_severity_type['PercTrain'] = severity_type.pivot_table(values='source', index='severity_type',
aggfunc=lambda x: sum(x == 'train') / float(len(x)))
unique_severity_type['severity_mode'] = severity_type.loc[severity_type['source'] == 'train'].pivot_table(
values='fault_severity', index='severity_type', aggfunc=lambda x: mode(x)[0])
severity_type.loc[event_type['source'] == 'train'].pivot_table(values='fault_severity', index='severity_type',
aggfunc=lambda x: mode(x))
severity_type_merge = severity_type.pivot_table(values='source', index='id', columns='severity_type',
aggfunc=lambda x: len(x), fill_value=0)
return df.merge(severity_type_merge, left_on='id', right_index=True)
def main():
train = readFile('train.csv')
test = readFile('test.csv')
# print "Test Data: ",test.head(10)
event_type = readFile('event_type.csv')
# print "event_type:", event_type.head(10)
log_feature = readFile('log_feature.csv')
# print "log_feature:", log_feature.head(10)
resource_type = readFile('resource_type.csv')
# print "resource_type:", resource_type.head(10)
severity_type = readFile('severity_type.csv')
# print "severity_type:", severity_type.head(10)
train['source'] = 'train'
test['source'] = 'test'
df = pd.concat([train, test], ignore_index=True)
# print "Imbalanced Class: ",df['fault_severity'].value_counts()
# Process each file and merge df
df, event_type = process_merge_event_type(df, event_type)
df = process_merge_log_feature(df, log_feature)
df = process_merge_resource_type(df, resource_type)
df = process_merge_severity_type(df, severity_type, event_type)
# print df.head(10)
# Location Count:
location_count = df['location'].value_counts()
df['fualt_on_loc_count'] = df['location'].apply(lambda x: location_count[x])
# Feature Count:
features = [x for x in df.columns if 'feature ' in x]
df['feature_count'] = df[features].apply(np.sum, axis=1)
df['feature_count'].sum()
# Convert location to numeric:
encoder = LabelEncoder()
df['location'] = encoder.fit_transform(df['location'])
# Delete extra columns
df.drop(['Delete_x', 'Delete_y'], axis=1, inplace=True)
processed_train = df.loc[df['source'] == 'train']
processed_test = df.loc[df['source'] == 'test']
processed_train.drop('source', axis=1, inplace=True)
processed_test.drop('source', axis=1, inplace=True)
processed_train['num'] = processed_train.groupby('location')['fault_severity'].transform(
lambda x: np.arange(x.shape[0]) + 1)
processed_train['num'] = processed_train.groupby('location').cumcount() + 1
# print processed_train.head(10)
processed_test['num'] = processed_test.groupby('location')['fault_severity'].transform(
lambda x: np.arange(x.shape[0]) + 1)
processed_test['num'] = processed_test.groupby('location').cumcount() + 1
processed_train.to_csv(os.path.join(result_datadir, 'processed_train1.csv'), index=False)
processed_test.to_csv(os.path.join(result_datadir, 'processed_test1.csv'), index=False)
print "Done!"