def transform(self, X, y=None):
    """Label-encode the configured columns of ``X`` and, optionally, ``y``.

    Each column listed in ``self.columns`` (or every column of ``X`` when
    ``self.columns`` is ``None``) is replaced by the integer codes of a fresh
    ``LabelEncoder``.  When ``self.keep_nan`` equals ``False`` the raw values
    are encoded directly.  Otherwise the values are encoded through their
    string form and the entries that were missing are restored afterwards.

    Args:
        X: pandas DataFrame to encode.  It is copied, never modified.
        y: Optional target.  In the NaN-preserving mode it must offer the
            pandas Series API (``isnull``, ``astype``, ``index``, ``name``).

    Returns:
        The encoded copy of ``X``, or the tuple ``(encoded_X, encoded_y)``
        when ``y`` is given.
    """
    # Kept as an equality test (not ``not self.keep_nan``) so that non-bool
    # flag values select the same branch they always did.
    plain_encoding = self.keep_nan == False  # noqa: E712

    def _encode(values):
        """Encode one column/target according to the NaN policy."""
        if plain_encoding:
            return LabelEncoder().fit_transform(values)
        missing = values.isnull()
        codes = LabelEncoder().fit_transform(values.astype('str')).astype('int')
        # Build the codes on the source index: a default RangeIndex would be
        # misaligned with ``missing``/``values`` for any non-default index.
        codes = pd.Series(codes, index=values.index, name=values.name)
        return codes.where(~missing, values)

    output = X.copy()
    # ``DataFrame.iteritems`` no longer exists in pandas 2.x, so the column
    # labels are listed explicitly instead.
    if self.columns is not None:
        selected = self.columns
    else:
        selected = list(output.columns)
    for col in selected:
        output[col] = _encode(output[col])

    # If y is passed, it is encoded as well.
    if y is None:
        return output
    return output, _encode(y)
class CSVDataset(Dataset):
    """Dataset read from a header-less CSV whose last column is the label."""

    def __init__(self, path):
        """Load ``path`` and store float32 inputs ``X`` and targets ``y``."""
        table = pd.read_csv(path, header=None).values
        features, labels = table[:, :-1], table[:, -1]
        self.X = features.astype('float32')
        # Targets: label-encode, cast to float32, keep as an (n, 1) column.
        encoded = LabelEncoder().fit_transform(labels)
        self.y = encoded.astype('float32').reshape(len(encoded), 1)

    def __len__(self):
        """Number of rows."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``[inputs, target]`` for row ``idx``."""
        return [self.X[idx], self.y[idx]]

    def get_splits(self, n_test=0.33):
        """Randomly split into train/test subsets; ``n_test`` is the test fraction."""
        total = len(self.X)
        held_out = round(n_test * total)
        return random_split(self, [total - held_out, held_out])
class CSVDataset(Dataset):
    """Dataset read from a header-less CSV; the last column holds the label."""

    def __init__(self, path):
        """Read ``path``, report the raw shapes, then build ``X`` and ``y``."""
        values = read_csv(path, header=None).values
        inputs, labels = values[:, :-1], values[:, -1]
        print("Input data shape:", inputs.shape)
        print("Input label shape:", labels.shape)
        # Inputs become float32; labels are encoded, cast to float32 and
        # reshaped into a single column.
        self.X = inputs.astype('float32')
        codes = LabelEncoder().fit_transform(labels)
        self.y = codes.astype('float32').reshape(-1, 1)

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``[inputs, target]`` for the row at ``idx``."""
        return [self.X[idx], self.y[idx]]

    def get_splits(self, n_train=0.7):
        """Randomly split into train/test subsets; ``n_train`` is the train fraction."""
        total = len(self.X)
        n_train_rows = round(n_train * total)
        return random_split(self, [n_train_rows, total - n_train_rows])
class CSVDataset(Dataset):
    """Rows of a header-less CSV file: all but the last column are inputs."""

    def __init__(self, path):
        """Parse ``path`` into float32 ``X`` and label-encoded float32 ``y``."""
        frame = read_csv(path, header=None)
        raw = frame.values
        self.X = raw[:, :-1].astype('float32')
        # The target is label-encoded first, then stored as an (n, 1) float array.
        target = LabelEncoder().fit_transform(raw[:, -1]).astype('float32')
        self.y = target.reshape((target.shape[0], 1))

    def __len__(self):
        """Row count."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``[inputs, target]`` pair stored at ``idx``."""
        row_inputs = self.X[idx]
        row_target = self.y[idx]
        return [row_inputs, row_target]

    def get_splits(self, n_test=0.33):
        """Randomly split into ``[train, test]``; ``n_test`` is the test fraction."""
        n_rows = len(self.X)
        n_test_rows = round(n_test * n_rows)
        sizes = [n_rows - n_test_rows, n_test_rows]
        return random_split(self, sizes)
def transform(self, X, y=None):
    """One-hot encode ``self.columns`` of ``X``, keeping missing values as NaN.

    For every configured column the indicator columns from
    ``pd.get_dummies`` are appended to a copy of ``X`` and the source column
    is dropped.  A row whose indicators are all zero had no value in the
    source column, so its indicators are set to NaN.  If any indicator name
    already exists in the output, all indicators of that column are renamed
    to ``<name>_<position>`` (1-based).

    Args:
        X: pandas DataFrame to encode.  It is not modified.
        y: Optional pandas Series target.  It is label-encoded through its
            string form, with missing entries restored afterwards.

    Returns:
        The encoded frame, or ``(encoded_frame, encoded_target)`` when ``y``
        is given.
    """
    output = X.copy()
    for col in self.columns:
        dummies = pd.get_dummies(X[col])

        # Restore NaNs in one vectorised step.  The indicators are upcast to
        # float only when needed, because NaN cannot live in bool/uint8.
        all_zero = ~dummies.any(axis=1)
        if all_zero.any():
            dummies = dummies.astype('float')
            dummies.loc[all_zero.to_numpy()] = np.nan

        # Keep labels unique when an indicator collides with an existing column.
        if set(dummies.columns).intersection(output.columns):
            dummies.columns = [
                f'{name}_{position}'
                for position, name in enumerate(dummies.columns, start=1)
            ]
        output = pd.concat([output, dummies], axis=1)

    output = output.drop(self.columns, axis=1)

    # Without a target only the frame is returned (previously nothing usable
    # came back in this case).
    if y is None:
        return output

    missing = y.isnull()
    codes = LabelEncoder().fit_transform(y.astype('str')).astype('int')
    # Share y's index so the mask and the original values line up.
    target = pd.Series(codes, index=y.index, name=y.name)
    return output, target.where(~missing, y)
class CSVDataset(Dataset):
    """Header-less CSV exposed as a dataset; the last column is the target."""

    def __init__(self, path):
        """Load ``path`` and keep float32 inputs and encoded float32 targets."""
        ## Load the csv once and slice the underlying array
        contents = read_csv(path, header=None).values
        ## Inputs: every column except the last, as float32
        self.X = contents[:, :-1].astype('float32')
        ## Targets: label-encoded, float32, shaped (n, 1)
        labels = LabelEncoder().fit_transform(contents[:, -1])
        labels = labels.astype('float32')
        self.y = labels.reshape((len(labels), 1))

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.X)

    def __getitem__(self, index):
        """Return ``[inputs, target]`` for the row at ``index``."""
        return [self.X[index], self.y[index]]

    def get_splits(self, n_test=0.33):
        """Randomly split into train/test subsets; ``n_test`` is the test fraction."""
        row_count = len(self.X)
        test_rows = round(n_test * row_count)
        train_rows = row_count - test_rows
        return random_split(self, [train_rows, test_rows])
def get_task_split(task, seed, split):
    """Fetch an OpenML task and return its data partitioned per ``split``.

    Labels of category/object dtype are label-encoded and boolean labels are
    cast to int.  Missing feature values are filled with the column mode,
    categorical features are one-hot encoded, and the feature frame is
    converted to a NumPy array before splitting.

    Args:
        task: Identifier handed to ``openml.tasks.get_task``.
        seed: Forwarded unchanged to ``split_data``.
        split: Sequence of two or three numbers.  With two entries, the
            second is passed to ``split_data`` as the split size.  With
            three (algorithm, selector, test), the last one sizes the test
            split and the remainder is divided again using
            ``selector / (algorithm + selector)``.

    Returns:
        ``{'baseline_train', 'baseline_test'}`` for a 2-way split, or
        ``{'algo_train', 'selector_train', 'test'}`` for a 3-way split.
        Values are the ``'split_1'``/``'split_2'`` entries from ``split_data``.

    Raises:
        ValueError: If ``split`` has neither two nor three entries.
    """
    if len(split) not in (2, 3):
        raise ValueError(f'Splits must be either 2 or 3 floats\n{split=}')

    task = openml.tasks.get_task(task)
    X, y, categorical_mask, _ = task.get_dataset().get_data(task.target_name)

    # Process labels
    if y is not None:
        if y.dtype == 'category' or y.dtype == object:
            y = LabelEncoder().fit_transform(y.values)
        elif y.dtype == bool:
            y = y.astype('int')

    if isinstance(y, pandas.Series):
        y = y.to_numpy()

    # Process features: assign the filled column back instead of calling
    # ``fillna(..., inplace=True)`` on ``X[col]``, which operates on a
    # temporary and is not guaranteed to update ``X``.
    for col in X.columns:
        X[col] = X[col].fillna(X[col].mode()[0])

    categorical_cols = list(X.columns[categorical_mask])
    encoding_frames = [
        pandas.get_dummies(X[col], prefix=col, prefix_sep='_')
        for col in categorical_cols
    ]
    X = X.drop(categorical_cols, axis=1)
    X = pandas.concat([X, *encoding_frames], axis=1).to_numpy()

    # Create split
    if len(split) == 2:
        splits = split_data(X, y, split[1], seed)
        return {
            'baseline_train': splits['split_1'],
            'baseline_test': splits['split_2'],
        }

    algo_split, selector_split, test_split = split

    # Split data between testing and training
    train_test_split = split_data(X, y, test_split, seed)

    # Further divide the train part between the algorithm and the selector,
    # using the selector's share relative to the two training portions.
    selector_relative_split = selector_split / (algo_split + selector_split)
    X_train, y_train = train_test_split['split_1']
    train_splits = split_data(X_train, y_train, selector_relative_split, seed)

    return {
        'algo_train': train_splits['split_1'],
        'selector_train': train_splits['split_2'],
        'test': train_test_split['split_2'],
    }
def __init__(self, fileName):
    """Load a header-less CSV into feature and target tensors.

    All columns but the last become the float32 tensor ``self.X``.  The last
    column is label-encoded and stored as a float32 column tensor ``self.y``.
    ``numInFeatures`` and ``numData`` record the width and length of ``X``.
    """
    raw = pd.read_csv(fileName, header=None).values
    feature_block = raw[:, :-1].astype('float32')
    label_codes = LabelEncoder().fit_transform(raw[:, -1]).astype('float32')

    self.X = tensor(feature_block)
    targets = tensor(label_codes)
    self.y = targets.reshape((len(targets), 1))

    self.numInFeatures = len(self.X[0])
    self.numData = len(self.X)
def load_dataset():
    """Download the wine CSV and return ``(X, y)``.

    ``X`` holds every column but the last, cast to float.  ``y`` is the last
    column label-encoded from its string form.
    """
    url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/wine.csv'
    values = read_csv(url, header=None).values
    # Minimal preparation: numeric inputs, integer-coded labels.
    features = values[:, :-1].astype('float')
    labels = LabelEncoder().fit_transform(values[:, -1].astype('str'))
    return features, labels
def get_dataset():
    """Download the sonar CSV and return ``(X, y)``.

    ``X`` is every column except the last, as float32.  ``y`` is the last
    column label-encoded from its string form.
    """
    url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv"
    table = read_csv(url, header=None).values
    # Inputs are all leading columns; the output label is the final column.
    inputs = table[:, :-1].astype('float32')
    outputs = LabelEncoder().fit_transform(table[:, -1].astype('str'))
    return inputs, outputs
def transform(self, X, y=None):
    """One-hot encode ``self.columns`` of ``X``, optionally preserving NaNs.

    For every configured column the indicator columns from
    ``pd.get_dummies`` are appended to a copy of ``X`` and the source column
    is dropped.  When ``self.keep_nan`` equals ``True``, rows whose
    indicators are all zero (no value in the source column) get NaN
    indicators instead.  If any indicator name already exists in the output,
    all indicators of that column are renamed to ``<name>_<position>``
    (1-based).

    Args:
        X: pandas DataFrame to encode.  It is not modified.
        y: Optional target.  With ``keep_nan == True`` it must be a pandas
            Series and its missing entries are preserved.  With
            ``keep_nan == False`` it is label-encoded as-is.

    Returns:
        The encoded frame, or ``(encoded_frame, encoded_target)`` when ``y``
        is given and ``keep_nan`` equals ``True`` or ``False``.  For any
        other flag value only the frame is returned, as before.
    """
    # Equality tests are kept (rather than truthiness) so non-bool flag
    # values take exactly the branches they always did.
    restore_nan = self.keep_nan == True  # noqa: E712

    output = X.copy()
    for col in self.columns:
        # Indicator columns for each category in col.
        dummies = pd.get_dummies(X[col])

        if restore_nan:
            # An all-zero row means the source value was NaN.  Restore it in
            # one vectorised step, upcasting to float only when needed
            # because NaN cannot live in bool/uint8.
            all_zero = ~dummies.any(axis=1)
            if all_zero.any():
                dummies = dummies.astype('float')
                dummies.loc[all_zero.to_numpy()] = np.nan

        # Keep labels unique when an indicator collides with an existing column.
        if set(dummies.columns).intersection(output.columns):
            dummies.columns = [
                f'{name}_{position}'
                for position, name in enumerate(dummies.columns, start=1)
            ]
        output = pd.concat([output, dummies], axis=1)

    # Now the old columns are dropped.
    output = output.drop(self.columns, axis=1)

    # If y is passed, it is encoded as well.
    if y is None:
        return output
    if restore_nan:
        missing = y.isnull()
        codes = LabelEncoder().fit_transform(y.astype('str')).astype('int')
        # Share y's index so the mask and the original values line up.
        target = pd.Series(codes, index=y.index, name=y.name)
        return output, target.where(~missing, y)
    if self.keep_nan == False:  # noqa: E712
        return output, LabelEncoder().fit_transform(y)
    return output
def __init__(self, csv_file='telescope.dat', path='data/'):
    """Load a comma-separated file and preprocess it for a torch Dataset.

    The file at ``path + csv_file`` is read without a header row and given
    fixed column names.  All columns but the last are standardised with
    ``StandardScaler`` and stored as float32 in ``self.x``.  The ``Class``
    column is label-encoded and stored as int64 in ``self.y``.

    Args:
        csv_file: File name, appended directly to ``path``.
        path: Directory prefix.  It is concatenated as-is, so it must end
            with a path separator.
    """
    self.dataset = pd.read_table(path + csv_file, header=None, delimiter=',')
    self.dataset.columns = [
        'FLength', 'FWidth', 'FSize', 'FConc', 'FConc1', 'FAsym',
        'FM3Long', 'FM3Trans', 'FAlpha', 'FDist', 'Class',
    ]

    scaler = StandardScaler()
    data = scaler.fit_transform(self.dataset.iloc[:, :-1])
    target = LabelEncoder().fit_transform(self.dataset.Class)

    self.x = data.astype(np.float32)
    # ``np.long`` no longer exists in NumPy >= 1.24.  int64 is spelled out
    # explicitly, which is also the width of torch's ``long`` dtype.
    self.y = target.astype(np.int64)
class CSVDataset(Dataset):
    """Header-less CSV wrapped as a dataset; the final column is the target."""

    def __init__(self, path):
        """Read ``path`` and store float32 inputs plus encoded float32 targets."""
        matrix = read_csv(path, header=None).values
        inputs = matrix[:, :-1]
        outputs = matrix[:, -1]
        self.X = inputs.astype('float32')  # inputs as floats
        encoded = LabelEncoder().fit_transform(outputs)  # integer class codes
        self.y = encoded.astype('float32').reshape((-1, 1))  # (n, 1) floats

    def __len__(self):
        """Number of rows."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``[inputs, target]`` for row ``idx``."""
        return [self.X[idx], self.y[idx]]

    def get_splits(self, n_test):
        """Randomly split into train/test subsets; ``n_test`` is the test fraction."""
        n_rows = len(self.X)
        n_held_out = round(n_test * n_rows)
        # Sizes are passed as a tuple (the code this derives from used a list).
        return random_split(self, (n_rows - n_held_out, n_held_out))
def process_openml_task(task: OpenMLTask) -> Tuple[ndarray, ndarray]:
    """Process an OpenML task in a generic way.

    Categorical/object labels are encoded with ``LabelEncoder`` and boolean
    labels are cast to int.  Missing feature values are replaced by the
    column mode and categorical features are one-hot encoded.

    Args:
        task: Task whose dataset and ``target_name`` are used to fetch data.

    Returns:
        ``(X, y)``.  ``y`` is converted to a NumPy array when it is still a
        pandas Series.  NOTE(review): ``X`` is returned as the pandas
        DataFrame built here, not converted to ``ndarray`` as the annotation
        suggests — confirm what callers expect.
    """
    X, y, categorical_mask, _ = task.get_dataset().get_data(task.target_name)

    # Process labels
    if y is not None:
        if y.dtype == 'category' or y.dtype == object:
            y = LabelEncoder().fit_transform(y.values)
        elif y.dtype == bool:
            y = y.astype('int')

    if isinstance(y, pandas.Series):
        y = y.to_numpy()

    # Process NA's: assign the filled column back instead of calling
    # ``fillna(..., inplace=True)`` on ``X[col]``, which operates on a
    # temporary and is not guaranteed to update ``X``.
    for col in X.columns:
        X[col] = X[col].fillna(X[col].mode()[0])

    # Process categorical features: build all indicator frames first, then
    # drop the source columns in one step.
    categorical_cols = list(X.columns[categorical_mask])
    encoding_frames = [
        pandas.get_dummies(X[col_name], prefix=col_name, prefix_sep='_')
        for col_name in categorical_cols
    ]
    X = X.drop(categorical_cols, axis=1)
    X = pandas.concat([X, *encoding_frames], axis=1)

    return X, y
from numpy import mean, std
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

# Fetch the header-less CSV and work on its underlying array.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv'
dataframe = read_csv(url, header=None)
data = dataframe.values

# Inputs are every column but the last, as floats; the last column is the
# label, encoded from its string form.
X = data[:, :-1].astype('float')
y = LabelEncoder().fit_transform(data[:, -1].astype('str'))

# Logistic regression scored with repeated stratified 10-fold CV (3 repeats).
model = LogisticRegression(solver='liblinear')
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
m_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Report mean accuracy and its standard deviation across all folds.
print('Accuracy: %.3f (%.3f)' % (mean(m_scores), std(m_scores)))
X_train = X[[ 'PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked' ]].copy() #print(X_train.shape) le = preprocessing.LabelEncoder() #print(X_train.head()) X_2 = X_train.apply(le.fit_transform) #print(X_2.head()) enc = preprocessing.OneHotEncoder() enc.fit(X_2) #Transform onehotlabels = enc.transform(X_2).toarray() print(onehotlabels.shape) #label reformatting label_2 = LabelEncoder().fit_transform(label) label_2 = label_2.astype('float32') label_2 = label_2.reshape((len(label_2), 1)) print(label_2.shape) class MLP(nn.Module): def __init__(self): super().__init__() self.fc1 = nn.Linear() self.fc2 = nn.Linear() self.fc2 = nn.Linear()
def predict(self, X):
    """Return the element-wise sign of ``self.project(X)``."""
    projection = self.project(X)
    return np.sign(projection)


if __name__ == "__main__":
    import pandas as pd
    from sklearn.metrics import confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder

    # Demo: the first 100 rows of Iris.csv, columns 2-3 as features and the
    # last column as the class label.
    df = pd.read_csv('Iris.csv')
    df_1 = df.iloc[:100, 2:4]
    X = df_1.to_numpy()

    # Integer-encode the labels, then recode class 0 as -1 so the targets
    # live on the same scale as the sign() output of predict().
    y = LabelEncoder().fit_transform(df.iloc[:100, -1])
    yi = y.astype(float)
    yi[yi == 0] = -1
    # Multiplying by a vector of ones leaves the label values unchanged.
    tmp = np.ones(len(X))
    yi = tmp * yi

    X_train, X_test, y_train, y_test = train_test_split(
        X, yi, random_state=0)

    svc = SVM()
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)

    print('Confusion matrix')
    print(confusion_matrix(y_test, y_pred))
f'14-ada-': ada1, f'15-gpc': gpc1, f'16-GBclass': GBclass1, f'17-histgclas': histgclass, f'18-bagclas': bagclass, f'19-ridge': ridge1, f'20-SVC2': SVC2, f'21-linear SVC': linear1, } for model_name, model in classifier_mapping.items(): train_test_model(model_name, model, x, y, train_size_pct) from google.colab import drive drive.mount('/content/drive') df = pd.read_csv('/content/drive/MyDrive/breast-cancer-wisconsin.data.txt') df.replace('?', -99999, inplace=True) df.drop(['id'], 1, inplace=True) X = np.array(df.drop(['class'], 1)) X = X.astype('float32') y = np.array(df['class']) y = LabelEncoder().fit_transform(y.astype(str)) # Look at the dataset again print(X.shape, y.shape) print(df.head()) print(f'[*] Beginning evaluations: All Features') evaluateIndividualClassifiers(X, y, TRAIN_PCT)