def experiment_invoke(dataset='bank'):
    # Pick the reader for the requested dataset.
    if dataset == 'wine':
        reader = WineDataReader()
    else:
        reader = BankDataReader()

    # Read and split the data once; the same splitter is reused by every experiment.
    ds = DataSplitter(reader)
    ds.read_split_data()

    # Exploratory pass (run_final=False).
    run_final = False
    er = KMeansExp(reader, ds, run_final)
    er.experiment()
    er = EMExp(reader, ds, run_final)
    er.experiment()
    er = ANNExp(reader, ds)
    er.experiment_bank()
    er.experiment_clusters()

    # Final pass (run_final=True).
    run_final = True
    er = KMeansExp(reader, ds, run_final)
    er.experiment()
    er = EMExp(reader, ds, run_final)
    er.experiment()
    er = ANNExp(reader, ds)
    er.experiment_bank()
    er.experiment_clusters()
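# A minimal sketch of an entry point that calls experiment_invoke. The argparse
# wiring below is an assumption for illustration only; the original code only
# defines the dataset names 'bank' and 'wine'.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", choices=["bank", "wine"], default="bank")
    args = parser.parse_args()
    experiment_invoke(dataset=args.dataset)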
def __init__(self, convertor, splitter_method_index, splitter_method_parameter):
    DataSplitter.__init__(self, convertor, splitter_method_index,
                          splitter_method_parameter)
    self.splitter_given_n = splitter_method_parameter
    self.start_time_id = 0
    # Dispatch table: splitter_method_index selects the given-n strategy.
    self.methods = {
        0: self.get_given_n_by_user,
        1: self.get_given_n_by_item,
        2: self.get_given_n_by_user_date,
        3: self.get_given_n_by_item_date,
        4: self.get_given_n_by_date,
    }
def splitClassificationTest(self):
    train, test = DataSplitter().splitDataEqually(self.data, self.labelCol)
    for i, clf in enumerate(self.classifiers):
        self.testClassifier(clf, train, test, i)
def splitMultiClassificationTest(self):
    train, test = DataSplitter().splitDataEqually(self.data, self.labelCol)
    print("DATA IN TESTER", self.data)
    for i, clf in enumerate(self.classifiers):
        self.testMultiClassifier(clf, train, test, i)
def createFolders(self):
    # Prepare the folders (folds) for k-fold cross-validation.
    folders = [[]] * self.k
    dataTmp = self.data.copy()
    for u in range(0, self.k):
        # Split the remaining data equally per label; the fraction grows as
        # fewer rows remain, so every folder ends up with roughly 1/k of the data.
        folder, dataTmp = DataSplitter().splitDataEqually(
            dataTmp, self.labelCol, 1.0 / (self.k - u))
        folders[u] = folder
    self.folders = folders
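# Sketch of how the folders above might be consumed during cross-validation.
# It assumes each folder is a pandas DataFrame; the helper name and the
# concat-based recombination are assumptions, not part of the original code.
import pandas as pd

def iterate_folds(folders):
    # For each fold u, use folder u as the test set and the rest as training data.
    for u, test_fold in enumerate(folders):
        train_folds = pd.concat([f for v, f in enumerate(folders) if v != u])
        yield train_folds, test_fold

# Example: for train_df, test_df in iterate_folds(self.folders): ...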
data = data.iloc[:, 1:]

# One-hot encode the data using pandas get_dummies
data = pd.get_dummies(data)
# Data transformation: real values into labels to classify
data = data.apply(transRow, axis=1)

notWeek = eliminateWeekSections(data.columns)
print(notWeek)
data = data[data.columns[notWeek]]
print(data.columns)

labelName = "shares"
train, test = DataSplitter().splitDataEqually(data, labelName)

Y_train = pd.factorize(train[labelName])[0]
X_train_origin = train.iloc[:, 0:train.columns.size - 1].copy()
Y_test = pd.factorize(test[labelName])[0]
X_test_origin = test.iloc[:, 0:test.columns.size - 1].copy()

scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
scaler.fit(X_train_origin)

# Scaling of training data
X_train_origin = pd.DataFrame(scaler.transform(X_train_origin.copy()),
                              columns=X_train_origin.columns)
# Apply the same transformation to the test data
X_test_origin = pd.DataFrame(scaler.transform(X_test_origin.copy()),
                             columns=X_test_origin.columns)

trainTmp = X_train_origin.copy()
trainTmp[labelName] = Y_train
def __init__(self, convertor, splitter_method_index, splitter_method_parameter):
    DataSplitter.__init__(self, convertor, splitter_method_index,
                          splitter_method_parameter)
if __name__ == "__main__":
    np.random.seed(12345)

    # Read in the data and display its shape
    data = pd.read_csv('regression.csv', sep=",")
    print('The shape of our data is:', data.shape)

    # One-hot encoding: transform nominal values, then re-attach the target
    labelName = "G3"
    labels = data[labelName]
    data = pd.get_dummies(data)
    data[labelName] = labels

    train, test = DataSplitter().splitData(data.copy())
    print(train.copy())

    # Prepare test and training sets for the final evaluation; work on copies
    # to avoid modifying the originals
    scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    # Don't cheat: fit the scaler only on training data
    scaler.fit(train)
    trainTmp = pd.DataFrame(scaler.transform(train.copy()), columns=train.columns)
    # Apply the same transformation to the test data
    testTmp = pd.DataFrame(scaler.transform(test.copy()), columns=test.columns)

    fsSize = train.columns.size
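    # Illustrative continuation (an assumption, not part of the original script):
    # separate the scaled features from the G3 target produced above and fit a
    # simple baseline regressor on them.
    from sklearn.linear_model import LinearRegression

    X_train = trainTmp.drop(columns=[labelName])
    y_train = trainTmp[labelName]
    X_test = testTmp.drop(columns=[labelName])
    y_test = testTmp[labelName]

    baseline = LinearRegression().fit(X_train, y_train)
    print("Baseline R^2 on the test split:", baseline.score(X_test, y_test))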
data = data.iloc[:, 1:]

# One-hot encode the data using pandas get_dummies
data = pd.get_dummies(data)
# Data transformation: real values into labels to classify
data = data.apply(transRow, axis=1)

notWeek = eliminateWeekSections(data.columns)
print(notWeek)
data = data[data.columns[notWeek]]
print(data.columns)

labelName = "shares"
train, test = DataSplitter().splitDataEqually(data, labelName)

Y_train = pd.factorize(train[labelName])[0]
X_train_origin = train.iloc[:, 0:train.columns.size - 1].copy()
Y_test = pd.factorize(test[labelName])[0]
X_test_origin = test.iloc[:, 0:test.columns.size - 1].copy()

scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
scaler.fit(X_train_origin)

# Scaling of training data
X_train_origin = pd.DataFrame(scaler.transform(X_train_origin.copy()),
                              columns=X_train_origin.columns)
# Apply the same transformation to the test data
X_test_origin = pd.DataFrame(scaler.transform(X_test_origin.copy()),
                             columns=X_test_origin.columns)
data = data.iloc[:, 1:]
# print(data)

# One-hot encode the data using pandas get_dummies
data = pd.get_dummies(data)

notWeek = eliminateWeekSections(data.columns)
print(notWeek)
data = data[data.columns[notWeek]]
print(data.columns)

labelName = "shares"
train, test = DataSplitter().splitData(data.copy())
print("Split")

print("Fitting")
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
scaler.fit(train)
print("Fitted")

trainTmp = pd.DataFrame(scaler.transform(train.copy()), columns=train.columns)
# Apply the same transformation to the test data
testTmp = pd.DataFrame(scaler.transform(test.copy()), columns=test.columns)
def split_data(self, save_path, experiment_id):
    # Only recompute the split if a previously saved one cannot be loaded:
    # apply the selected given-n strategy, then delegate to the base splitter.
    if not self.load_train_test_data(save_path, experiment_id):
        self.methods[self.splitter_method_index](self.splitter_given_n)
        DataSplitter.split_data(self, save_path, experiment_id)
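# Hypothetical usage of the given-n splitter defined above. The class name
# GivenNDataSplitter, the my_convertor object, and the parameter values are
# assumptions for illustration only; only the constructor arguments and the
# split_data signature come from the code above.
splitter = GivenNDataSplitter(convertor=my_convertor,
                              splitter_method_index=0,  # 0 -> given-n by user
                              splitter_method_parameter=5)
splitter.split_data(save_path="./splits", experiment_id="exp_01")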