def main():
    """Demonstrate data-shift detection on the red wine quality dataset.

    Trains a random-forest pipeline on the 'volatile_acidity' feature,
    feeds the first half of the held-out data to a shift detector, then
    artificially scales part of the second half and checks the detector
    reports a shift.
    """
    data = wine.wine_quality_red_csv()
    print(data.columns)
    features = data['volatile_acidity'].values.reshape(-1, 1)
    labels = data['class']
    X_train, X_test, y_train, y_test = split(
        features, labels, test_size=0.2, random_state=0)
    # Split the held-out data into two halves for the two detector passes.
    halves = split(X_test, y_test, test_size=.5, random_state=0)
    X_first_half, X_second_half, y_first_half, y_second_half = halves
    pipeline = WineQualityPipeline()
    classifier = RandomForest()
    model = pipeline.with_estimator(classifier).fit(X_train, y_train)
    shift_detector = SklearnDataShiftDetector(model, n_bins=30)
    shift_detector.iteration(X_first_half)
    # Corrupt a copy of the second half: triple every value in (0.4, 1.0).
    new_second_half = deepcopy(X_second_half)
    mask = np.logical_and(X_second_half > .4, X_second_half < 1.)
    new_second_half[mask] *= 3.
    plt.plot(range(X_first_half.shape[0]), X_first_half, 'go')
    plt.plot(range(new_second_half.shape[0]), new_second_half, 'r^')
    plt.show()
    shift_detector.iteration(new_second_half)
    print(shift_detector.data_is_shifted())
def load_data(task, seed, val_size, test_size):
    """Load a TU graph-classification benchmark and split it.

    Task: Graph Classification - benchmark datasets consisting of various
    graphs we want to classify.

    Args:
        task: one of 'mutag', 'enzymes', 'proteins'.
        seed: random seed used for both the train/test and train/val splits.
        val_size: validation fraction of the FULL dataset (rescaled below).
        test_size: test fraction of the full dataset.

    Returns:
        - train_dataset: PyG dataset
        - val_dataset: PyG dataset
        - test_dataset: PyG dataset
        - test_idx: indices of the original dataset used for testing -
          to recover original data.

    Raises:
        NameError: if ``task`` is not a recognized benchmark name.
    """
    path = "../data/TUDataset"
    # Map the task name onto its TUDataset identifier (was three copy-pasted
    # branches building the same TUDataset call).
    dataset_names = {
        'mutag': 'Mutagenicity',
        'enzymes': 'ENZYMES',
        'proteins': 'PROTEINS',
    }
    if task not in dataset_names:
        # BUG FIX: the original constructed a NameError (and referenced the
        # global ``args.task``) but never raised it, so unknown task names
        # fell through and crashed later with an unhelpful error.
        raise NameError(f"task {task} not allowed")
    dataset = TUDataset(path, dataset_names[task],
                        transform=T.NormalizeFeatures(), cleaned=True)
    print(f'Dataset: {dataset}:')
    print('====================')
    print(f'Number of graphs: {len(dataset)}')
    print(f'Number of features: {dataset.num_features}')
    print(f'Number of classes: {dataset.num_classes}')
    indices = [i for i in range(len(dataset))]
    train_idx, test_idx = split(indices, random_state=seed,
                                test_size=test_size)
    train_dataset = dataset[train_idx]
    test_dataset = dataset[test_idx]
    # The second split is relative to the remaining train portion, so the
    # full-dataset val fraction is rescaled by 1/(1 - test_size).
    indices = [i for i in range(len(train_dataset))]
    train_idx, val_idx = split(indices, random_state=seed,
                               test_size=val_size / (1 - test_size))
    val_dataset = train_dataset[val_idx]
    train_dataset = train_dataset[train_idx]
    print(f'Number of training graphs: {len(train_dataset)}')
    print(f'Number of val graphs: {len(val_dataset)}')
    print(f'Number of test graphs: {len(test_dataset)}')
    return train_dataset, val_dataset, test_dataset, test_idx
def main():
    """Split the ELTeC XML corpus into train/dev/test plain-text files."""
    parser = argparse.ArgumentParser(
        description='Preprocess a directory of XML files')
    parser.add_argument('--indir', type=str, default='../eltec/ELTeC-eng',
                        help='location of the original Eltec XML-files')
    parser.add_argument('--outdir', type=str, default='data/raw',
                        help='location of the train, dev, and test spit')
    parser.add_argument('--seed', type=int, default=12345,
                        help='random seed')
    parser.add_argument('--train_size', type=float, default=.9,
                        help='Proportion of training data')
    args = parser.parse_args()
    print(args)
    # Recursively collect all level-1 XML files under the input directory.
    pattern = os.sep.join((f'{args.indir}/level1', '**', '*.xml'))
    filenames = glob.glob(pattern, recursive=True)
    train, rest = split(filenames, train_size=args.train_size,
                        shuffle=True, random_state=args.seed)
    # The remainder is split evenly into dev and test.
    dev, test = split(rest, train_size=0.5, shuffle=True,
                      random_state=args.seed)
    print(f'# train files: {len(train)}')
    print(f'# dev files: {len(dev)}')
    print(f'# test files: {len(test)}')
    if not os.path.exists(args.outdir):
        os.mkdir(args.outdir)
    # Recreate the per-corpus output folder from scratch.
    out_folder = f'{args.outdir}/{os.path.basename(args.indir)}'
    try:
        shutil.rmtree(out_folder)
    except FileNotFoundError:
        pass
    os.mkdir(out_folder)
    # One plain-text document per line, one file per partition.
    for part, files in (('train', train), ('dev', dev), ('test', test)):
        with open(f'{out_folder}/{part}.txt', 'w') as f:
            for fn in files:
                f.write(plain_text(fn) + '\n')
def partition(characteristics, categories, output, batch, training_percentage = 100, validation_percentage = 0, flag = False):
    """Build train / test / validation data loaders from raw arrays.

    NOTE(review): ``group`` and ``loader`` are presumably aliases for
    torch TensorDataset / DataLoader — confirm against the imports.

    Args:
        characteristics: feature matrix (array-like).
        categories: labels; reshaped to a column when output == 1 and the
            first label is not an ndarray (unless ``flag`` suppresses it).
        output: number of model outputs; >1 -> LongTensor labels
            (classification), ==1 -> FloatTensor labels (regression),
            otherwise labels are assumed to be tensors (``.float()`` used).
        batch: batch size for every loader.
        training_percentage: percent of data kept for training (default 100).
        validation_percentage: percent of the ORIGINAL data for validation.
        flag: when True, skips the 1-D label reshape.

    Returns:
        (trainer, tester, validater): loaders; tester / validater are None
        when their share works out to zero.
    """
    # Convert percentages to fractions; test share is whatever remains.
    train_percent = training_percentage / 100
    validate_percent = validation_percentage / 100
    test_percent = round(1 - (train_percent + validate_percent), 1)
    # NOTE(review): bitwise '&' on booleans works here but 'and' would be
    # the conventional operator.
    if ((output == 1) & (isinstance(categories[0], np.ndarray) == False) & (flag == False)):
        categories = categories.reshape((len(categories), -1))
    # Loader over the FULL data; replaced below whenever a split happens.
    training_x, training_y = characteristics, categories
    train_x, train_y = torch.FloatTensor(training_x), training_y
    if (output > 1):
        train_y = torch.LongTensor(training_y)
    elif (output == 1):
        train_y = torch.FloatTensor(training_y)
    train = group(train_x, train_y)
    trainer = loader(dataset = train, batch_size = batch, shuffle = True, num_workers = 2)
    tester, validater = None, None
    if (test_percent > 0):
        # Stratified split; random_state is drawn at random, so repeated
        # calls produce different partitions.
        training_x, testing_x, training_y, testing_y = split(characteristics, categories, test_size = test_percent, random_state = np.random.randint(1, 100), shuffle = True, stratify = categories)
        train_x = torch.FloatTensor(training_x)
        if (output > 1):
            train_y = torch.LongTensor(training_y)
        elif (output == 1):
            train_y = torch.FloatTensor(training_y)
        else:
            train_y = training_y.float()
        train = group(train_x, train_y)
        trainer = loader(dataset = train, batch_size = batch, shuffle = True, num_workers = 2)
        test_x = torch.FloatTensor(testing_x)
        if (output > 1):
            test_y = torch.LongTensor(testing_y)
        elif (output == 1):
            test_y = torch.FloatTensor(testing_y)
        else:
            test_y = testing_y.float()
        test = group(test_x, test_y)
        tester = loader(dataset = test, batch_size = batch, shuffle = True, num_workers = 2)
    # NOTE(review): 'duplicate' is never read again — dead store?
    duplicate = validate_percent
    # Rescale the validation share relative to the REMAINING training rows,
    # so validation_percentage keeps meaning a fraction of the original set.
    validate_percent = (validation_percentage*len(characteristics) / len(train_x)) / 100
    if (validate_percent > 0):
        training_x, validation_x, training_y, validation_y = split(training_x, training_y, test_size = validate_percent, random_state = np.random.randint(1, 100), shuffle = True, stratify = training_y)
        train_x = torch.FloatTensor(training_x)
        if (output > 1):
            train_y = torch.LongTensor(training_y)
        elif (output == 1):
            train_y = torch.FloatTensor(training_y)
        else:
            train_y = training_y.float()
        train = group(train_x, train_y)
        trainer = loader(dataset = train, batch_size = batch, shuffle = True, num_workers = 2)
        validate_x = torch.FloatTensor(validation_x)
        if (output > 1):
            validate_y = torch.LongTensor(validation_y)
        elif (output == 1):
            validate_y = torch.FloatTensor(validation_y)
        else:
            validate_y = validation_y.float()
        validate = group(validate_x, validate_y)
        validater = loader(dataset = validate, batch_size = batch, shuffle = True, num_workers = 2)
    return trainer, tester, validater
def kfold_validate(model, k, kwargs):
    """Average model performance over k train/dev/test runs.

    This function does something similar to k-fold validation: we train and
    test the model k times on random train/dev/test splits of the full data
    and return the average of the k runs.

    NOTE(review): both splits use a fixed random_state=42, so every one of
    the k runs sees the *same* partition — only model initialization varies.
    Confirm whether per-run seeds were intended.

    Args:
        model (str): What kind of model to use; either "lstm" or "cnn".
        k (int): Number of iterations over which to average.
        kwargs (dict): The parameters that define the model.

    Returns:
        dict: keys "precision", "recall" and "fscore" (averages over k).
    """
    p_1 = 0.0
    r_1 = 0.0
    f_1 = 0.0
    train_data = ATEDataProcessor(kwargs["train_file"], **kwargs)
    test_data = ATEDataProcessor(kwargs["test_file"],
                                 pos_id=get_count(train_data.annotated_sentences),
                                 **kwargs)
    sentences = train_data.annotated_sentences + test_data.annotated_sentences
    for i in range(k):
        print("Run number: {}".format(i))
        train_set, test_set = split(sentences, test_size=0.2, random_state=42)
        train_set, dev_set = split(train_set, test_size=kwargs["test_size"],
                                   random_state=42)
        train = DataIterator(train_set, **kwargs)
        dev = DataIterator(dev_set, **kwargs)
        test = DataIterator(test_set, **kwargs)
        # BUG FIX: the original rebound the ``model`` *parameter* to the
        # network instance, so from the second iteration on neither string
        # comparison matched and the previous (already-trained) network was
        # silently reused. A separate variable keeps the selector intact.
        if model == "lstm":
            net = LSTMNetwork(**kwargs)
        elif model == "cnn":
            net = CNNNetwork(max_sentence_length=train_data.max_sentence_len,
                             **kwargs)
        net.build()
        net.train(train, dev)
        results = net.evaluate(test)
        p_1 += float(results["p_1"])
        r_1 += float(results["r_1"])
        f_1 += float(results["f_1"])
        net.close_session()
    print("p_1: {}\nr_1: {}\nf_1: {}".format(p_1/k, r_1/k, f_1/k))
    return {
        "precision": p_1/k,
        "recall": r_1/k,
        "fscore": f_1/k
    }
def train_save(epochs=30, jsonfile="model.json", weightfile="model.h5"):
    """Train a small CNN on CIFAR-10 and persist architecture and weights.

    Args:
        epochs: number of training epochs.
        jsonfile: output path for the model architecture (JSON).
        weightfile: output path for the weights (HDF5, large file).

    Returns:
        The trained Keras model.
    """
    X, y, classes = load_cifar10()
    nclasses = len(classes)
    # Two conv blocks (32 then 64 filters) followed by a small dense head.
    layers = [
        ("Conv2D", {"filters": 32, "kernel_size": (3, 3), "strides": (1, 1),
                    "activation": "relu", "input_shape": X.shape[1:],
                    "padding": "same"}),
        ("BatchNormalization", {}),
        ("Conv2D", {"filters": 32, "kernel_size": (3, 3), "strides": (1, 1),
                    "activation": "relu", "padding": "same"}),
        ("MaxPooling2D", {"pool_size": (2, 2), "padding": "same"}),
        ("BatchNormalization", {}),
        ("Dropout", {"rate": 0.5}),
        ("Conv2D", {"filters": 64, "kernel_size": (3, 3), "strides": (1, 1),
                    "activation": "relu", "padding": "same"}),
        ("BatchNormalization", {}),
        ("Conv2D", {"filters": 64, "kernel_size": (3, 3), "strides": (1, 1),
                    "activation": "relu", "padding": "same"}),
        ("BatchNormalization", {}),
        ("MaxPooling2D", {"pool_size": (2, 2), "padding": "same"}),
        ("Dropout", {"rate": 0.5}),
        ("Flatten", {}),
        ("Dense", {"units": 64, "activation": "relu"}),
        ("Dropout", {"rate": 0.5}),
        ("Dense", {"units": nclasses, "activation": "softmax"}),
    ]
    model = create_model(layers)
    Xtrain, Xtest, ytrain, ytest = split(X, y, test_size=0.2)
    model.fit(Xtrain, ytrain, batch_size=256, epochs=epochs,
              validation_data=(Xtest, ytest))
    # Persist architecture as JSON; weights go to a separate (big) file.
    with open(jsonfile, "w") as json_file:
        json_file.write(model.to_json())
    model.save_weights(weightfile)
    return model
def __init__(self, data, test_size=0.3, weight=False, negative=NEGATIVE, positive=POSITIVE, oversample=False, undersample=False):
    """Split the income data into train/test, optionally rebalancing.

    Args:
        data: full dataframe including the 'Income' target column.
        test_size: fraction held out for testing (fixed random_state=0).
        weight: when True, attach per-row weights so the positive class is
            upweighted to match the negative class on the training split.
        negative / positive: the two 'Income' label values.
        oversample / undersample: optional training-set rebalancing.
    """
    data_train, data_test = split(data, test_size=test_size, random_state=0)
    if oversample:
        data_train = Dataset.oversample(data_train, 'Income')
    if undersample:
        data_train = Dataset.undersample(data_train, 'Income')
    if weight:
        # Weight positives by the negative/positive ratio on the train split;
        # the test split keeps uniform weights.
        n_pos = (data_train['Income'] == positive).sum()
        n_neg = data_train.shape[0] - n_pos
        pos_ratio = n_neg / n_pos
        data_train = Dataset.weight(data_train, 'Income',
                                    {negative: 1.0, positive: pos_ratio})
        data_test = Dataset.weight(data_test, 'Income',
                                   {negative: 1.0, positive: 1.0})
    # pop() removes the target column, leaving only features behind.
    self.y_train = data_train.pop('Income')
    self.x_train = data_train
    self.y_test = data_test.pop('Income')
    self.x_test = data_test
def traintest(x, y, test_size=None, random_state=None):
    """get train-test split of data from archive

    Inputs:
        x: 2D input data array of shape (pts, nx)
        y: 1D output data array of shape (pts,)
        test_size: float, % data to use for test [0,1]
            (default: None, which returns the full data as both splits)
        random_state: int, seed for splitting data

    Returns:
        xtrain,xtest,ytrain,ytest: arrays of training and test data

    For example:
        >>> x = [[5.1, 3.5, 1.4, 0.2], [4.9, 3.0, 1.4, 0.2],
        ...      [4.7, 3.2, 1.3, 0.2], [4.6, 3.1, 1.5, 0.2],
        ...      [5.0, 3.6, 1.4, 0.2]]
        >>> y = [0, 1, 0, 0, 2]
        >>> xx,xt,yy,yt = traintest(x, y)
        >>> len(yy) == len(yt) == len(y)
        True
    """
    features = np.array(x)
    targets = np.array(y)
    # Without a test_size, hand the whole dataset back as both partitions.
    if test_size is None:
        return features, features, targets, targets
    from sklearn.model_selection import train_test_split as split
    return split(features, targets, test_size=test_size,
                 random_state=random_state)
def train_evaluate(ga_individual_solution):
    """GA fitness: decode (window_size, num_units), train an LSTM, return RMSE.

    The genome's first 6 bits encode the window size, the rest the LSTM
    width. Returns a 1-tuple because DEAP expects tuple fitnesses.
    """
    window_size = BitArray(ga_individual_solution[0:6]).uint
    num_of_units = BitArray(ga_individual_solution[6:]).uint
    print('\nWindow Size: ', window_size, ', Num of Units: ', num_of_units)
    # Degenerate genomes get a large (bad) fitness straight away.
    if window_size == 0 or num_of_units == 0:
        return 100,
    # Re-window the series for this genome, then hold out 20% for validation.
    X_data, Y_data = prepare_dataset(train_data, window_size)
    X_train, X_val, y_train, y_val = split(X_data, Y_data, test_size=0.20,
                                           random_state=1120)
    # Single-layer LSTM with a tanh regression head.
    input_ph = Input(shape=(window_size, 1))
    hidden = LSTM(num_of_units, input_shape=(window_size, 1))(input_ph)
    predicted_values = Dense(1, activation='tanh')(hidden)
    model = Model(inputs=input_ph, outputs=predicted_values)
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train, y_train, epochs=5, batch_size=20, shuffle=True)
    y_pred = model.predict(X_val)
    # Validation RMSE is the fitness to minimize.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print('Validation RMSE: ', rmse, '\n')
    return rmse,
def main():
    """Train a composition-model ensemble, with an optional external test set."""
    dataset = CompositionData(data_path=args.data_path, fea_path=args.fea_path)
    orig_atom_fea_len = dataset.atom_fea_dim
    if args.test_path:
        # Independent test file: train on everything, test on it.
        print("using independent test set: {}".format(args.test_path))
        train_set = dataset
        test_set = CompositionData(data_path=args.test_path,
                                   fea_path=args.fea_path)
    else:
        print("using {} of training set as test set".format(args.test_size))
        indices = list(range(len(dataset)))
        train_idx, test_idx = split(indices, random_state=args.seed,
                                    test_size=args.test_size)
        # Thin the training indices by keeping every args.sample-th point.
        train_set = torch.utils.data.Subset(dataset, train_idx[0::args.sample])
        test_set = torch.utils.data.Subset(dataset, test_idx)
    # Make sure the output directories exist before training starts.
    for folder in ("models/", "runs/", "results/"):
        if not os.path.isdir(folder):
            os.makedirs(folder)
    ensemble(args.data_id, train_set, test_set, args.ensemble,
             orig_atom_fea_len)
def load_data(self, val_set_size=0.20):
    """Load MNIST, carve out a validation set, then flatten and scale inputs."""
    (X, Y), (x_test, y_test) = mnist.load_data()
    # Seeded split keeps the validation set reproducible across runs.
    x_train, x_val, y_train, y_val = split(X, Y, test_size=val_set_size,
                                           random_state=self.seed,
                                           shuffle=True)
    self.y_train = y_train
    self.y_test = y_test
    self.y_val = y_val

    def flatten(images):
        # Flatten 28x28 images into 784-long float32 vectors.
        return tf.reshape(tf.Variable(images, dtype=tf.float32),
                          [len(images), 784])

    # Scale pixel intensities from [0, 255] down to [0, 1].
    self.x_train = flatten(x_train) / 255
    self.x_val = flatten(x_val) / 255
    self.x_test = flatten(x_test) / 255
def main():
    """Load btc.csv, split it into train/test partitions, and print them."""
    # Util(debug, print): debug output off, normal printing on.
    util = Util(False, True)
    uprt = util.outPrt
    uerr = util.errPrt
    data = pd.read_csv("./btc.csv")
    uprt("Columns", data.columns.values.tolist())
    # Target is the USD price; every other column is a feature.
    y = data["PriceUSD"]
    X = data.loc[:, data.columns != "PriceUSD"]
    xtrain, xtest, ytrain, ytest = split(X, y)
    uprt("\nxtrain:", xtrain)
    uprt("\nytrain:", ytrain)
    uprt("\nxtest:", xtest)
    uprt("\nytest:", ytest)
def train_evaluate(self, ga_individual_solution):
    """GA fitness for one genome: decode hyper-parameters, train, score RMSE.

    Bits 0-5 encode the window size, the remainder the LSTM width. Returns
    a 1-tuple fitness; degenerate genomes are penalized with 100.
    """
    window_size = BitArray(ga_individual_solution[0:6]).uint
    num_units = BitArray(ga_individual_solution[6:]).uint
    print('\nWindow Size: ', window_size, ', Num of Units: ', num_units)
    # Zero window or zero units is untrainable; hand back the penalty score.
    if window_size * num_units == 0:
        return 100,
    # Re-window the series for this genome; 80/20 train/validation split.
    features, targets = prepare_dataset(self.train_data, window_size)
    X_train, X_val, y_train, y_val = split(features, targets, test_size=0.20,
                                           random_state=1120)
    layer_in = Input(shape=(window_size, 1))
    layer_lstm = LSTM(num_units, input_shape=(window_size, 1))(layer_in)
    layer_out = Dense(1, activation='linear')(layer_lstm)
    model = Model(inputs=layer_in, outputs=layer_out)
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train, y_train, epochs=5, batch_size=10, shuffle=True)
    y_pred = model.predict(X_val)
    # Validation RMSE is the fitness to minimize; also logged per generation.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    self.generations_rmse.append(rmse)
    print('Validation RMSE: ', rmse, '\n')
    return rmse,
def extract_data(data_dir, train_csv, dev_csv, class_txt):
    """Build train/dev CSVs and a class-index file from a class-per-folder tree.

    Each subdirectory of ``data_dir`` is one class; its audio files are split
    80/20 into train and dev sets.

    NOTE(review): os.listdir order is unspecified, so class indices can vary
    between runs — the mapping written to ``class_txt`` stays consistent with
    the CSVs, but cross-run comparability would need a sorted() here.

    Args:
        data_dir: root directory with one subfolder per class.
        train_csv / dev_csv: output CSV paths (written via extract_csv).
        class_txt: output path for the tab-separated class->index mapping.
    """
    class_dict = {}
    audio_train, label_train, audio_dev, label_dev = [], [], [], []
    # enumerate replaces the manual class_idx counter from the original.
    for class_idx, class_name in enumerate(os.listdir(data_dir)):
        class_dict[class_name] = class_idx
        class_path = os.path.join(data_dir, class_name)
        files = os.listdir(class_path)
        path = [os.path.abspath(os.path.join(class_path, fn)) for fn in files]
        X_train, X_dev, y_train, y_dev = split(path,
                                               [class_idx] * (len(files)),
                                               test_size=0.2)
        audio_train += X_train
        label_train += y_train
        audio_dev += X_dev
        label_dev += y_dev
    extract_csv(audio_train, label_train, train_csv)
    extract_csv(audio_dev, label_dev, dev_csv)
    # BUG FIX: the original used bare open()/close(); the context manager
    # guarantees the handle is closed even if a write fails.
    with open(class_txt, 'w') as f:
        for idx in class_dict:
            f.write(f'{idx}\t{class_dict[idx]}\n')
def main():
    """Train a wine-quality pipeline and exercise manual + automated test suites."""
    data = wine.wine_quality_red_csv()
    print(data.shape)
    print(data.columns)
    target = "class"
    X = data[[col for col in data.columns if col != target]]
    y = data[target]
    X_train, X_test, y_train, y_test = split(X, y, test_size=0.2,
                                             random_state=0)
    pipeline = WineQualityPipeline()
    classifier = RandomForest(size=40)
    model = pipeline.with_estimator(classifier).fit(X_train, y_train)
    prediction = model.predict(X_test)
    print(accuracy_score(y_test, prediction))

    def report(title, warns):
        # Shared warning dump: prints the banner, then each warning if any.
        print(title)
        if warns and (len(warns) != 0):
            print("======= WARNINGS =======")
            for warn in warns:
                print(warn)

    suite = TestSuite()
    automated_suite = AutomatedTestSuite()
    data_profile = DataFrameProfiler().on(X_train)
    pipeline_profile = SklearnPipelineProfiler().on(model)
    suite.add(Test().is_complete(
        data_profile.for_column('volatile_acidity')).is_in_range(
        data_profile.for_column('alcohol')))
    report("*** TEST_SUITE, X_TEST", suite.on(X_test))
    # Inject missing values into one column and re-run the manual suite.
    error_generator = ExplicitMissingValues()
    corrupted_X_test = error_generator.run(X_test, ['volatile_acidity'])
    report("*** TEST_SUITE, CORRUPTED_X_TEST", suite.on(corrupted_X_test))
    # Automated suite derives its own tests from the profiles.
    tests, warnings = (automated_suite.with_profiles(
        data_profile, pipeline_profile).run(corrupted_X_test))
    report("*** AUTOMATED_TEST_SUITE, CORRUPTED_X_TEST", warnings)
def transform_data(df: pd.DataFrame):
    """Split reviews into train/test and TF-IDF vectorize the text.

    The vectorizer is fit on the training reviews only, then applied to the
    test reviews, so no test vocabulary leaks into training.

    Returns:
        X_train, Y_train, X_test, Y_test.
    """
    train_df, test_df = split(df, test_size=0.35, random_state=45)
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_df['Review'])
    X_test = vectorizer.transform(test_df['Review'])
    return X_train, train_df['Class'], X_test, test_df['Class']
def train_evaluate(ga_individual_solution):
    """GA fitness: decode (window, units, dropout), train an LSTM, score RMSE.

    Genome layout: bits 0-3 window size, 4-7 LSTM units, remainder an index
    into ``dropout_dict``. The fitness is the validation RMSE plus a size
    penalty that discourages large networks.

    Returns:
        1-tuple with the adjusted RMSE (tuple fitness for DEAP).
    """
    window_size = BitArray(ga_individual_solution[0:4]).uint
    num_units = BitArray(ga_individual_solution[4:8]).uint
    dropout_key = BitArray(ga_individual_solution[8:]).uint
    dropout_rate = dropout_dict[dropout_key]
    print('\nWindow Size: ', window_size, ', Num of Units: ', num_units,
          ', Dropout Rate: ', dropout_rate)
    print(ga_individual_solution[0:4], ga_individual_solution[4:8],
          ga_individual_solution[8:])
    print(BitArray(ga_individual_solution[0:4]))
    # Clamp degenerate genomes: rewrite the genome in place so the minimum
    # window and unit count is 1.
    if window_size == 0:
        ga_individual_solution[0:4] = [0, 0, 0, 1]
        window_size = 1
    if num_units == 0:
        ga_individual_solution[4:8] = [0, 0, 0, 1]
        num_units = 1
    print('\nWindow Size: ', window_size, ', Num of Units: ', num_units,
          ', Dropout Rate: ', dropout_rate)
    print(ga_individual_solution[0:4], ga_individual_solution[4:8],
          ga_individual_solution[8:])
    # Segment the train_data based on new window_size; 80/20 train/validation.
    X, Y = prepare_dataset(train_data, window_size)
    X_train, X_val, y_train, y_val = split(X, Y, test_size=0.20,
                                           random_state=1120)
    model = Sequential()
    model.add(LSTM(num_units, input_shape=(window_size, 1)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    history = model.fit(X_train, y_train, epochs=4, batch_size=10,
                        shuffle=True)
    y_pred = model.predict(X_val)
    plt.plot(history.history['loss'], label='train')
    # FIX: mean_squared_error was computed twice; reuse mse for the sqrt.
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    # Size penalty grows with units and window to favor small networks.
    adjusted_rmse = .1 * ((num_units / 16) +
                          ((np.square(num_units) * window_size) /
                           (np.power(16, 3)))) + rmse
    print('Validation RMSE: ', rmse, '\nValidation MSE: ', mse,
          'adjusted: ', adjusted_rmse)
    # FIX: removed a stray bare ``print`` statement (a no-op expression in
    # Python 3) that sat before the return.
    return adjusted_rmse,
def impData(sPath, sCat, fP):
    """Read every review file under a directory, label it, and split train/test.

    Args:
        sPath: directory containing one review per file. NOTE(review): the
            filename is appended directly (``sPath + filename``), so sPath
            must end with a path separator — confirm callers.
        sCat: category label attached to every review.
        fP: fraction of reviews placed in the training set.

    Returns:
        (lTrain, lTest): lists of [text, category] pairs.
    """
    reviews = []
    for filename in os.listdir(sPath):
        # BUG FIX: the original never closed the files it opened, leaking a
        # handle per review; 'with' closes each one deterministically.
        with open(sPath + filename) as fh:
            reviews.append([fh.read(), sCat])
    # Create training & test sets
    lTrain, lTest = split(reviews, train_size=fP)
    return lTrain, lTest
def modelar_dados (teste):
    """Load leaf.csv and split it into train/test partitions.

    (Translated from the original Portuguese comment): models the data and
    divides it into a training base and a test base according to the
    test-size fraction given by the caller.

    Args:
        teste: fraction of rows held out for testing.

    Returns:
        (train_features, test_features, train_classes, test_classes).
    """
    dataset = pd.read_csv('leaf.csv', header = None)
    # First column holds the class labels; the rest are features.
    classes = dataset[dataset.columns[0]].values
    dataset.drop([0], axis = 1, inplace = True)
    ind_t, ind_te, cl_t, cl_te = split(dataset, classes,
                                       test_size = teste, shuffle = True)
    return ind_t, ind_te, cl_t, cl_te
def __init__(self, text, Y, train_size=.85):
    """Set up model builders, a stratified text split, and pipeline state.

    Args:
        text: raw text samples.
        Y: labels (used to stratify the split).
        train_size: fraction of samples kept for training.
    """
    self.model_builders = {'dtc': dtc, 'rfc': rfc}
    steps = ['tfidf', 'feature_engineering', 'lda', 'model']
    self.text_train, self.text_test, self.Y_train, self.Y_test = split(
        text, Y, train_size=train_size, stratify=Y)
    # These lambdas are evaluated lazily, against whatever pipeline_dic
    # holds at call time.
    self.keep_tfidf = lambda tfidf_dic: (tfidf_dic == self.pipeline_dic[
        'tfidf'])
    # NOTE(review): 'features' is not one of ``steps``, so calling
    # keep_features as written raises KeyError — was
    # 'feature_engineering' intended? Left unchanged pending confirmation.
    self.keep_features = lambda features_dic: (features_dic == self.
        pipeline_dic['features'])
    # Shannon information contribution of a single probability.
    self.prob_info = lambda prob: -prob * np.log(prob)
    # FIX: removed the earlier {step: None} initialization — it was dead
    # code, always overwritten here before any lazy lambda could read it.
    self.pipeline_dic = {step: "Default" for step in steps}
    self.train_size = train_size
def split_dataset(data, target_feature, hp):
    """Partition ``data`` into train / val / test / target subsets.

    The four ratios from ``hp`` are normalized to sum to one. Note the first
    split deliberately places the *train* share into sklearn's "test" slot,
    so X_train receives exactly the train fraction.

    Args:
        data: full dataframe including the target column.
        target_feature: name of the label column.
        hp: dict with 'train_ratio', 'val_ratio', 'test_ratio',
            'target_ratio' and 'random_state'.

    Returns:
        X_train, y_train, X_val, y_val, X_test, y_test, X_target, y_target.
    """
    ratios = [hp['train_ratio'], hp['val_ratio'],
              hp['test_ratio'], hp['target_ratio']]
    denominator = np.sum(ratios)
    train, val, test, target = (r / denominator for r in ratios)
    random_state = hp['random_state']
    feature_cols = [col for col in data.columns if col != target_feature]
    X, y = data[feature_cols], data[target_feature]
    # Peel off the train share first, then the "target" share from the rest,
    # and finally divide what remains between validation and test.
    X_rest, X_train, y_rest, y_train = split(X, y, test_size=train,
                                             random_state=random_state)
    X_rest, X_target, y_rest, y_target = split(X_rest, y_rest,
                                               test_size=target / (1. - train),
                                               random_state=random_state)
    X_val, X_test, y_val, y_test = split(X_rest, y_rest,
                                         test_size=test / (val + test),
                                         random_state=random_state)
    return X_train, y_train, X_val, y_val, X_test, y_test, X_target, y_target
def setUp(self):
    """Build the credit-g fixture: pipeline, train/test split, profiles."""
    self.resource_folder = get_resource_path()
    self.pipeline = CreditGPipeline()
    csv_path = os.path.join(self.resource_folder, 'data',
                            'credit-g/dataset_31_credit-g.csv')
    data = pd.read_csv(csv_path)
    target = 'class'
    # Every non-target column is treated as a feature.
    self.features = [col for col in data.columns if col != target]
    X = data[self.features]
    y = data[target]
    (self.X_train, self.X_test,
     self.y_train, self.y_test) = split(X, y, test_size=0.2, random_state=0)
    self.data_profile = DataFrameProfiler().on(self.X_train)
    self.automated_suite = AutomatedTestSuite()
def train_roost(args, model_name, csv_train, csv_val=None, val_frac=0.0, resume=False, transfer=None, fine_tune=None):
    """Train a Roost ensemble from CSV data, with optional warm starts.

    Exactly one of resume / transfer / fine_tune takes effect, in that
    priority order. When no validation CSV is given, ``val_frac`` of the
    training data is held out instead.
    """
    args.data_path = f'data/datasets/{csv_train}'
    args.val_size = val_frac
    dataset = CompositionData(data_path=args.data_path,
                              fea_path=args.fea_path)
    orig_atom_fea_len = dataset.atom_fea_dim
    args.fea_len = orig_atom_fea_len
    # Warm-start selection: resume wins, then transfer, then fine-tune.
    if resume:
        args.resume = resume
    elif transfer is not None:
        args.transfer = transfer
    elif fine_tune is not None:
        args.fine_tune = fine_tune
    if csv_val is None:
        # No explicit validation file: carve val_size out of the dataset.
        indices = list(range(len(dataset)))
        train_idx, val_idx = split(indices, random_state=args.seed,
                                   test_size=args.val_size)
        # Thin the training indices by keeping every args.sample-th point.
        train_set = torch.utils.data.Subset(dataset, train_idx[0::args.sample])
        val_set = torch.utils.data.Subset(dataset, val_idx)
    else:
        train_set = dataset
        val_set = CompositionData(data_path=f'data/datasets/{csv_val}',
                                  fea_path=args.fea_path)
    # Make sure the output directories exist before training starts.
    for folder in ("models/", "runs/", "results/"):
        if not os.path.isdir(folder):
            os.makedirs(folder)
    ensemble(model_name, args.fold_id, train_set, val_set, args.ensemble,
             orig_atom_fea_len, args)
def train_model(best_window_size, best_num_units):
    """Train the final LSTM classifier with GA-selected hyper-parameters.

    Args:
        best_window_size: window length chosen by the GA search.
        best_num_units: LSTM width chosen by the GA search.

    Returns:
        (model, test_acc) on success.
        NOTE(review): on zero window/units this returns ``0,`` — a 1-tuple
        that does NOT match the normal (model, accuracy) shape; callers
        must handle both, or the early exit should be fixed.
    """
    # Decode GA solution to integer for window_size and num_units
    train_data = DATA
    split_point = SPLIT_POINT  # NOTE(review): unused in this function.
    window_size = best_window_size
    num_units = best_num_units
    # window_size = 200
    # num_units = 150
    print('\nWindow Size: ', window_size, ', Num of Units: ', num_units)
    # Guard against degenerate hyper-parameters (see return-shape note above).
    if window_size == 0 or num_units == 0:
        return 0,
    print("DEBUG: The data size is : ", len(train_data[0]))
    # Build the (X, Y) windows; the preprocessor is deleted immediately
    # afterwards to release its memory.
    mp_prep = MultiProcessPreprocessor(train_data, window_size)
    X,Y = mp_prep.mp_preprocessor()
    del mp_prep
    # X,Y = prepare_dataset(train_data,window_size)
    print("DEBUG: The X size is : ", len(X))
    X_train, X_val, y_train, y_val = split(X, Y, test_size = 0.20, random_state = 1120)
    # LSTM -> two dense ReLU layers -> 6-way softmax. Assumes 3 input
    # channels per timestep and 6 classes — TODO confirm against the data.
    inputs = Input(shape=(window_size, 3))
    x = LSTM(num_units, input_shape=(window_size, 3))(inputs)
    ## x = Dense(200, activation='relu')(x)
    x = Dense(100, activation='relu')(x)
    x = Dense(50, activation='relu')(x)
    ##
    predictions = Dense(6, activation='softmax')(x)
    opt = optimizers.SGD(lr=0.01, momentum=0.9)
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=10, batch_size=20, shuffle=True)
    _, train_acc = model.evaluate(X_train, y_train, verbose=0)
    _, test_acc = model.evaluate(X_val, y_val, verbose=0)
    print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))
    return model, test_acc
def train_test_split(data, test_ratio=0.5, n_splits=10, best_split=True):
    """ This function splits the data into two using stratified sampling
    in ratios as determined by test_ratio. The strata are constructed by
    creating quartile splits on the target variable, i.e., sales_volume.
    If best_split is True, the split which yields the minimum difference
    in the means of the target is returned. Else, the last split is
    returned as it is. The number of candidate splits is n_splits."""
    # Object for stratified sampling
    split_obj = split(n_splits=n_splits, test_size=test_ratio,
                      random_state=180)
    # Discretizing the target volume to guide stratified sampling
    data['categories'] = pd.qcut(data['sales_volume'], 4,
                                 labels=["low", "low mid", 'high mid', "high"])
    # Best split is one that yields the least difference in mean
    # sales_volume of both folds.
    least_diff_in_means = None
    # BUG FIX: the original rebound the ``best_split`` *parameter* to a
    # (None, None) tuple. A non-empty tuple is always truthy, so
    # ``if best_split and ...`` always behaved as best_split=True and the
    # False setting was silently ignored. A separate variable fixes this.
    chosen_split = None, None
    # Looping over each candidate split
    for idx_train, idx_test in split_obj.split(data, data['categories']):
        train = data.iloc[idx_train]
        test = data.iloc[idx_test]
        diff_in_means = abs(train.sales_volume.mean() -
                            test.sales_volume.mean())
        # Track the split minimizing the difference in means (only when
        # best_split was requested).
        if best_split and ((least_diff_in_means is None) or
                           (least_diff_in_means > diff_in_means)):
            least_diff_in_means = diff_in_means
            chosen_split = idx_train, idx_test
    if chosen_split[0] is None:
        # best_split=False: fall back to the last split, as documented.
        chosen_split = idx_train, idx_test
    del data['categories']
    idx_train, idx_test = chosen_split
    train = data.iloc[idx_train]
    test = data.iloc[idx_test]
    return train, test
def data_split(x, y, i):
    """Split ``x`` against each target series in ``y``.

    Generalized: the original hard-coded a ``while j < 3`` counter; this
    iterates over however many targets ``y`` contains (backward compatible
    for the original 3-target callers).

    Args:
        x: feature matrix shared by every target.
        y: sequence of target arrays (originally always length 3).
        i: test-size fraction for the underlying splitter (seeded with
           random_state=123 for reproducibility).

    Returns:
        (x_train, x_test, y_train, y_test): parallel lists with one entry
        per target in ``y``.
    """
    x_train = []
    x_test = []
    y_train = []
    y_test = []
    for target in y:
        x_tr, x_te, y_tr, y_te = split(x, target, test_size=i,
                                       random_state=123)
        x_train.append(x_tr)
        x_test.append(x_te)
        y_train.append(y_tr)
        y_test.append(y_te)
    return x_train, x_test, y_train, y_test
def pred_score(X, y):
    """Fit the IEM regressor on half the data and score it on the rest.

    Returns:
        score_rgs: 1 - SSE/90**2 over circular prediction errors.
        score_cls: fraction of trials whose nearest target direction
            matches the true label.
        recon_cat: (3, 180) mean reconstruction per predicted category.
    """
    # Split the data into train and test, and train the IEM.
    X_train, X_test, y_train, y_test = split(X, y, test_size=0.5)
    rgs.fit(X_train, y_train)
    # Predict the response in the test cases and take circular errors.
    pred_iem = rgs.predict(X_test)
    err_iem = [circ_dist(pred, truth)
               for pred, truth in zip(pred_iem, y_test)]
    # Score: 1 minus normalized sum of squared errors (90 deg worst case).
    u = np.sum([err ** 2 for err in err_iem])
    v = 90 ** 2
    score_rgs = 1 - u / v
    # Compute reconstruction.
    recon = rgs._predict_direction_responses(X_test)
    # Snap each continuous prediction onto its nearest target direction.
    cat_predict = []
    for pred in pred_iem:
        targ_dist = [np.abs(circ_dist(pred, direction))
                     for direction in targ_directions]
        cat_predict.append(targ_directions[np.argmin(targ_dist)])
    # Classification score: fraction of snapped predictions matching truth.
    cls_hits = [guess == truth for guess, truth in zip(cat_predict, y_test)]
    score_cls = np.sum(cls_hits) / len(cls_hits)
    # Average the reconstructions per predicted category.
    recon_cat = np.empty((3, 180))
    for i_cat, direction in enumerate(targ_directions):
        d_cat = [recon[:, j] for j, guess in enumerate(cat_predict)
                 if guess == direction]
        recon_cat[i_cat, :] = np.mean(d_cat, 0)
    return (score_rgs, score_cls, recon_cat)
def train():
    """Train one model per label column ('Unnamed: 1' .. 'Unnamed: 5').

    The column at index 1 ('Unnamed: 1') is treated as a multiclass task
    (one-hot labels, categorical cross-entropy); every other column is a
    binary task with label '-1' mapped to 0.
    """
    columns = [f'Unnamed: {i}' for i in range(1,6)]
    for index,column in enumerate(columns):
        ''' Get Dataset '''
        features, labels = ImagePreprocessor.createDataset(column, grayscale = False, normalize = True)
        # Reshape to (N, 128, 128, 3) image tensors for the CNN input.
        features = np.reshape(features, (len(features), 128, 128, 3))
        ''' onehot for multiclass '''
        if index == 1:
            labels = onehot(labels)
        else:
            # Binary task: '-1' becomes 0, anything else is int(label).
            labels = [0 if label == '-1' else int(label) for label in labels]
        ''' split train and test sets '''
        features, testX, labels, testY = split(features, labels, test_size = 0.2)
        model = createModel(multiclass = (column == 'Unnamed: 1'))
        ''' Stop after 2 epochs if val loss doent decrease, reduce LR when val loss stops decreasing, save model on epoch end '''
        callbacks = [EarlyStopping(monitor = 'val_loss', restore_best_weights= True, patience = 2), ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience = 1)]
        # Checkpoints go to models/Task2.h5 .. models/Task6.h5 (index+1).
        callbacks.append(ModelCheckpoint(f'models/Task{index+1}.h5'))
        ''' loss function different for multi and binary '''
        loss = 'categorical_crossentropy' if index == 1 else 'binary_crossentropy'
        ''' adam optimizer validate on split class weighting for undereprestented classes '''
        history = None  # NOTE(review): history is unused after fit — confirm whether it should be returned/logged.
        model.compile(optimizer = 'adam', loss = loss, metrics = ['accuracy'])
        history = model.fit(features, labels, batch_size = 32, epochs = 20, validation_data = (testX, testY), callbacks = callbacks, class_weight = 'auto')
def __prepare_train_data(self, X, y, sample_weight):
    """Build torchtext train/dev iterators, oversampling by sample weight.

    Splits (X, y, weights) into train/dev, converts rows into torchtext
    Examples, then duplicates each training example in proportion to its
    combined (class-balance x user-supplied) weight so the iterator sees a
    reweighted stream. Vocabulary is built over both splits.

    Returns:
        (train_iter, dev_iter) from Iterator.splits; the dev iterator uses
        a single batch covering the entire dev set.
    """
    self.__text_field = Field(lower=True)
    self.__label_field = Field(sequential=False)
    self.__text_field.tokenize = self.__tokenize
    sample_weight = None if sample_weight is None else list(sample_weight)
    # Fall back to uniform weights when the caller supplies none.
    sw = [1 for yi in y] if sample_weight is None else sample_weight
    # Stratify only when the rarest class has at least 2 members;
    # otherwise pass stratify=None (presumably to satisfy the splitter's
    # minimum-class-size requirement — confirm).
    s = y if Counter(y).most_common()[-1][1] > 1 else None
    X_t, X_d, y_t, y_d, w_t, _ = split(X, y, sw, shuffle=True, stratify=s,
                                       random_state=self.random_state,
                                       train_size=self.split_ratio)
    fields = [("text", self.__text_field), ("label", self.__label_field)]
    examples = [[X_t[i], y_t[i]] for i in range(len(X_t))]
    examples = [Example.fromlist(example, fields) for example in examples]
    # Combine class-balancing weights with the caller's per-sample weights,
    # then normalize so the smallest weight maps to exactly one copy.
    weights = compute_sample_weight(self.class_weight, y_t)
    weights = [weights[i] * w_t[i] for i in range(len(y_t))]
    min_weight = min(weights)
    weights = [int(round(weight / min_weight)) for weight in weights]
    # Oversample: append (weight - 1) extra copies of each training example.
    for i in range(len(X_t)):
        Xi = [X_t[i] for j in range(weights[i] - 1)]
        examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi]
    train_data = Dataset(examples, fields)
    dev_data = [[X_d[i], y_d[i]] for i in range(len(X_d))]
    dev_data = [Example.fromlist(example, fields) for example in dev_data]
    dev_data = Dataset(dev_data, fields)
    self.__text_field.build_vocab(train_data, dev_data, vectors=self.vectors)
    self.__label_field.build_vocab(train_data, dev_data)
    # Dev gets one batch spanning the whole split.
    batch_sizes = (self.batch_size, len(dev_data))
    return Iterator.splits((train_data, dev_data), batch_sizes=batch_sizes,
                           sort_key=lambda ex: len(ex.text), repeat=False)
def decision_tree_main(cost_mode, max_depth):
    """Train a decision tree on the iris data and report test accuracy.

    Args:
        cost_mode: impurity criterion forwarded to DecisionTree.fit.
        max_depth: maximum tree depth (passed through int(), so string
            command-line input is accepted).

    Returns:
        float: accuracy on the 30% held-out test set.
    """
    iris = load_iris()
    data = iris.get("data")
    target = iris.get("target")
    train_d, test_d, train_t, test_t = split(data, target, test_size=0.3,
                                             random_state=0)
    d_tree = DecisionTree(max_depth=int(max_depth),
                          target_num=len(set(target)))
    d_tree.fit(train_d, train_t, cost_mode)
    pred_list = d_tree.predict(test_d)
    # Accuracy computation (translated from the original Japanese comment):
    # fraction of predictions matching the answers. IDIOM FIX: the manual
    # counter loop is replaced by a sum over zipped pairs.
    accuracy = sum(pred == answer
                   for pred, answer in zip(pred_list, test_t)) / len(pred_list)
    print("max depth : ", max_depth, "accuracy : ", accuracy)
    return accuracy