# Ejemplo n.º 1 (Example No. 1)
# Read the raw training sample; the header row supplies column names and
# every value is loaded as a string.
data = spark.read.csv('./data/train_sample100w.csv', header=True)

print('==PREPROCESSING== \n')
# Load the pre-selected feature names (first column of the CSV) and keep
# them together with the row identifier and the label.
col = (
    pd.read_csv('/Users/jaycheng/Dropbox/python/ms_comp/feature1.csv',
                index_col=0)
    .iloc[:, 0]
    .tolist()
)
col += ['MachineIdentifier', 'HasDetections']
data = data.select(col)

# Alternative strategy: drop rows with any missing value instead of imputing.
# data = data.dropna('any')

# Impute missing values with the sentinel string '-1'.
data = data.fillna('-1')

print('==StringIndexer== \n')
# Columns that must not be indexed: the row identifier and the label.
ignore = ['MachineIdentifier', 'HasDetections']
# Build one StringIndexer per remaining feature column; each writes its
# encoded values to a new "<name>_index" column.
stringindexer = []
for feature in data.columns:
    if feature not in ignore:
        stringindexer.append(
            StringIndexer(inputCol=feature, outputCol=feature + "_index"))
pipeline = Pipeline(stages=stringindexer)
data = pipeline.fit(data).transform(data)

# Encode the label column as numeric "indexedLabel". Fitting on the whole
# dataset ensures every label value present receives an index.
labelindex = StringIndexer(inputCol="HasDetections", outputCol="indexedLabel")
data = labelindex.fit(data).transform(data)