/
Utils.py
71 lines (44 loc) · 1.97 KB
/
Utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import numpy as np
import DataReader as dataReader
import RegularFeatureExtractor as featureExtractor
import ClassifierSelector as classifierSelector
import glob as glob
import pandas as pd
import os as os
import random as random
# Number of chunks the test data is split into by splitTestDataIntoChunks.
numberOfPartitions = 80
def splitTestDataIntoChunks():
    """Split the whole test set into ``numberOfPartitions`` CSV files.

    Reads the test data through ``dataReader.getWholeTestData()``, splits it
    with ``np.array_split`` and writes chunk ``i`` to
    ``data/miniTestData/miniDataFrame<i>.csv`` without the index column.

    The output path is built with ``os.path.join`` so the files end up inside
    the ``data/miniTestData`` directory on every platform; a hard-coded
    backslash is only a path separator on Windows.  The directory itself is
    not created here.
    """
    testData = dataReader.getWholeTestData()
    miniDataFrames = np.array_split(testData, numberOfPartitions)
    outputDirectory = os.path.join('data', 'miniTestData')
    for i, miniDataFrame in enumerate(miniDataFrames):
        outputFileName = os.path.join(
            outputDirectory, 'miniDataFrame' + str(i) + '.csv')
        miniDataFrame.to_csv(outputFileName, index=False)
def trainClassifierOnTrainingDataReturnAll(numberOfTrainingExamples = -1):
    """Train a classifier and return it together with the data it was fit on.

    ``numberOfTrainingExamples`` is forwarded unchanged to
    ``dataReader.getTrainData`` (default -1).
    NOTE(review): -1 presumably means "use every row" - confirm in DataReader.

    Returns a ``(classifier, xTrain, yTrain)`` tuple.
    """
    # Load the rows and turn the target column into its numeric form.
    preparedFrame = featureExtractor.convertTargetFeatureToNumeric(
        dataReader.getTrainData(numberOfTrainingExamples))

    # Feature engineering, then model fitting.
    features, labels = featureExtractor.getRegularFeatures(preparedFrame, True)
    fittedModel = classifierSelector.trainClassifier(features, labels)

    return fittedModel, features, labels
def getDifferentTrainAndTestData(trainDataSize, testDataSize):
    """Draw a training frame and a test frame from the whole training set.

    The data comes from ``dataReader.getWholeTrainingData()``.

    * If ``trainDataSize + testDataSize`` exceeds the number of rows, the two
      frames are sampled independently with ``DataFrame.sample``, so the same
      row may appear in both.  ``sample`` is called without replacement, so a
      single size larger than the row count still raises ``ValueError``.
    * Otherwise the row positions are shuffled once and two adjacent,
      non-overlapping slices are taken, giving disjoint frames of exactly
      ``trainDataSize`` and ``testDataSize`` rows.

    Returns a ``(trainData, testData)`` tuple.
    """
    data = dataReader.getWholeTrainingData()
    numberOfRows = data.shape[0]
    # Single-argument print(...) produces the same output on Python 2 and 3.
    if trainDataSize + testDataSize > numberOfRows:
        # Not enough rows for two disjoint sets: sample each one on its own.
        print("Getting different train & test data with possible duplicates")
        trainData = data.sample(trainDataSize)
        testData = data.sample(testDataSize)
    else:
        print("Getting totally different train & test data")
        indexes = np.arange(numberOfRows)
        random.shuffle(indexes)  # works in-place
        # The shuffled values are row positions, so select with .iloc rather
        # than the label-based (and since removed) .ix indexer.
        trainData = data.iloc[indexes[:trainDataSize]]
        # The test slice starts right where the training slice ends; starting
        # one position later would drop a row and could come up one row short.
        testEnd = trainDataSize + testDataSize
        testData = data.iloc[indexes[trainDataSize:testEnd]]
    return trainData, testData
class InitialClassifierAdapter(object):
    """Wrap an estimator so ``predict`` yields one ``predict_proba`` column.

    The wrapped estimator is stored as ``est`` and must provide ``fit(X, y)``
    and ``predict_proba(X)``.
    NOTE(review): presumably meant to be passed as the initial estimator of a
    boosting model - confirm against the callers.
    """

    def __init__(self, est):
        self.est = est

    def fit(self, X, y, sample_weight=None):
        """Fit the wrapped estimator on ``(X, y)`` and return this adapter.

        ``sample_weight`` is accepted but not forwarded to the estimator.
        """
        wrapped = self.est
        wrapped.fit(X, y)
        return self

    def predict(self, X):
        """Return the second ``predict_proba`` column as an ``(n, 1)`` array.

        Assumes ``predict_proba`` returns a 2-D NumPy array.
        """
        classProbabilities = self.est.predict_proba(X)
        secondColumn = classProbabilities[:, 1]
        # Add a trailing axis so the result is a column vector.
        return secondColumn[:, np.newaxis]