from collections import Counter

import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler, StandardScaler


def prepare_data(dataset_name, n_minority_samples=20, scaler='MinMax'):
    dataset = datasets.load(dataset_name)
    (X_train, y_train), (X_test, y_test) = dataset[0][0], dataset[0][1]
    X, y = np.concatenate([X_train, X_test]), np.concatenate([y_train, y_test])

    if n_minority_samples is not None:
        # Undersample the minority class down to at most n_minority_samples,
        # leaving the majority class untouched.
        majority_class, n_majority = Counter(y).most_common()[0]
        minority_class, n_minority = Counter(y).most_common()[1]

        X, y = RandomUnderSampler(
            sampling_strategy={
                minority_class: min(n_minority, n_minority_samples),
                majority_class: n_majority,
            },
            random_state=42,
        ).fit_resample(X, y)  # fit_sample was renamed to fit_resample in imbalanced-learn 0.4

    # Project to 2D for visualization, then scale.
    X = TSNE(n_components=2, random_state=42).fit_transform(X)

    if scaler == 'MinMax':
        X = MinMaxScaler().fit_transform(X)
    elif scaler == 'Standard':
        X = StandardScaler().fit_transform(X)
    else:
        raise NotImplementedError

    return X, y
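
# Hypothetical usage sketch for prepare_data: the dataset name and the
# project-local `datasets` module are assumptions, not part of the function.
import matplotlib.pyplot as plt

X, y = prepare_data('some_dataset', n_minority_samples=20)
for label in np.unique(y):
    plt.scatter(X[y == label, 0], X[y == label, 1], label=str(label))
plt.legend()
plt.show()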
def dump_ds(kind):
    ps = qu.Params(**(ds_small if kind == 'small' else ds_large))
    ss = list(qd.dump(ps, f'/tmp/q/data/{kind}'))
    ds = qd.load(ps, shards=ss).map(qd.adapter)
    # iterate once over the dataset just to count its batches
    for i, _ in enumerate(ds):
        pass
    print(f'dumped {i + 1} batches of {ps.dim_batch} samples each')
def run(model_settings, dataset_settings, _log):
    _log.info('dataset_settings: ' + str(dataset_settings))
    _log.info('model_settings: ' + str(model_settings))
    dataset = datasets.load(dataset_settings)
    model_settings.update({'dataset': dataset})
    model = models.load(model_settings)
    train(model, dataset)
    evaluate(model, dataset)
def run(model_settings, dataset_settings, num_experiments, _log):
    _log.info('dataset_settings: ' + str(dataset_settings))
    _log.info('model_settings: ' + str(model_settings))
    ex.info['evaluations'] = []
    for _ in range(num_experiments):
        dataset = datasets.load(dataset_settings)
        model_settings.update({'dataset': dataset})
        model = models.load(model_settings)
        train(model, dataset)
        ex.info['evaluations'].append(evaluate(model, dataset))
def run(model_settings, dataset_settings, num_experiments, _log):
    _log.info('dataset_settings: ' + str(dataset_settings))
    _log.info('model_settings: ' + str(model_settings))
    ex.info['evaluations'] = []
    for i in range(1, num_experiments + 1):
        print('#' * 10, 'Run', i, '#' * 10)
        # grow the training set linearly with each run
        dataset_settings['train_size'] = i / num_experiments
        dataset = datasets.load(dataset_settings)
        model_settings.update({'dataset': dataset})
        model = models.load(model_settings)
        train(model, dataset)
        ex.info['evaluations'].append(evaluate(model, dataset))
    ex.info['sota'] = dataset.sota
def run(model_settings, dataset_settings, _log):
    _log.info('dataset_settings: ' + str(dataset_settings))
    dataset = datasets.load(dataset_settings)
    model_settings.update({
        'input_shape': dataset.input_shape,
        'num_classes': dataset.num_classes,
    })
    _log.info('model_settings: ' + str(model_settings))
    model = models.load(model_settings)
    train(model, dataset)
    evaluate(model, dataset)
import glob
import os

import numpy as np


def load_experiment(path, alignment='luminance'):
    """Load an FHD experiment located at the given path.

    Parameters
    ----------
    path : str
        The path of the experiment to load.
    alignment : str, default='luminance'
        Default alignment of layers for the FHDs.

    Returns
    -------
    experiment : Bunch
        The loaded FHD experiment.

    Notes
    -----
    FHistograms are scaled to [0, 1] globally (no loss of information) and
    independently for shapes and spatial relations.
    """
    path = os.path.normpath(path)
    dataset = datasets.load(path.split(os.sep)[-2])  # os.sep, so this also works on Windows
    n_layers = int(path.split(os.sep)[-1].split('-')[0])
    if alignment not in ALIGNMENTS:
        raise ValueError("Incorrect alignment.")
    fhd_files = sorted(glob.glob(os.path.join(path, '*/fhd.txt')))
    fhds = np.array([
        from_file(fhd_file, n_layers, alignment=alignment)
        for fhd_file in fhd_files
    ])
    # Feature scaling (shapes and spatial relations independently):
    # diagonal entries hold shape descriptors, the upper triangle holds
    # spatial-relation descriptors.
    shapes = np.vstack([fhd[np.diag_indices(n_layers)] for fhd in fhds])
    spatials = np.vstack([fhd[np.triu_indices(n_layers, 1)] for fhd in fhds])
    for fhd in fhds:
        fhd[np.diag_indices(n_layers)] -= shapes.min()
        fhd[np.diag_indices(n_layers)] /= shapes.max() - shapes.min()
        fhd[np.triu_indices(n_layers, 1)] -= spatials.min()
        fhd[np.triu_indices(n_layers, 1)] /= spatials.max() - spatials.min()
    experiment = dataset
    experiment['path'] = path
    experiment['n_layers'] = n_layers
    experiment['fhds'] = fhds
    return experiment
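
# A toy sketch (not part of the loader) of the scaling above: shape entries on
# the diagonal and spatial-relation entries in the upper triangle are min-max
# scaled to [0, 1] independently, using global minima/maxima across all FHDs.
import numpy as np

fhds = np.array([[[4.0, 10.0],
                  [0.0, 8.0]],
                 [[6.0, 30.0],
                  [0.0, 5.0]]])
n_layers = 2
shapes = np.vstack([f[np.diag_indices(n_layers)] for f in fhds])       # 4, 8, 6, 5
spatials = np.vstack([f[np.triu_indices(n_layers, 1)] for f in fhds])  # 10, 30
for f in fhds:
    f[np.diag_indices(n_layers)] -= shapes.min()
    f[np.diag_indices(n_layers)] /= shapes.max() - shapes.min()
    f[np.triu_indices(n_layers, 1)] -= spatials.min()
    f[np.triu_indices(n_layers, 1)] /= spatials.max() - spatials.min()
# shape values 4..8 now span 0..1; spatial values 10 and 30 map to 0 and 1.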
def test_create_package():
    acc = datasets.run('iris.csv')
    candidate = acc.candidates[-1]
    example = autom8.create_example_input(
        pipeline=candidate.pipeline,
        dataset=datasets.load('iris.csv'),
        indices=acc.test_indices[1:3],
    )
    package_bytes = autom8.create_package(
        package_name='autom8-test',
        pipeline=candidate.pipeline,
        example_input=example,
        extra_notes='foo bar baz',
    )

    with zipfile.ZipFile(io.BytesIO(package_bytes)) as z:
        assert sorted(z.namelist()) == sorted([
            'autom8-test/.dockerignore',
            'autom8-test/Dockerfile',
            'autom8-test/LICENSE',
            'autom8-test/Makefile',
            'autom8-test/README.md',
            'autom8-test/pipeline.pickle',
            'autom8-test/requirements.txt',
            'autom8-test/service.py',
            'autom8-test/tests.py',
        ])

        def read(name):
            with z.open(f'autom8-test/{name}') as f:
                return f.read().decode('utf-8')

        assert 'requirements.txt' in read('Dockerfile')
        assert 'MIT License' in read('LICENSE')

        with z.open('autom8-test/pipeline.pickle') as f:
            pipeline = pickle.load(f)

        readme = read('README.md')
        assert 'foo bar baz' in readme
        sample_input = _extract_json(readme, "--data '")
        expected_output = _extract_json(readme, '\nThis will return:\n')
        received_output = pipeline.run(sample_input['rows'])
        assert expected_output['predictions'] == received_output.predictions
def save_dataset(dataset_name, output_dir, seed=0):
    """Save a single dataset as train/valid/test TFRecord files."""
    train_filename = os.path.join(output_dir, tfrecord_filename(dataset_name, "train"))
    valid_filename = os.path.join(output_dir, tfrecord_filename(dataset_name, "valid"))
    test_filename = os.path.join(output_dir, tfrecord_filename(dataset_name, "test"))

    # Skip if they already exist
    if (os.path.exists(train_filename)
            and os.path.exists(valid_filename)
            and os.path.exists(test_filename)):
        if FLAGS.debug:
            print("Skipping:", train_filename, valid_filename, test_filename, "already exist")
        return

    if FLAGS.debug:
        print("Saving dataset", dataset_name)

    dataset, dataset_class = datasets.load(dataset_name)

    # Skip normalization if the data is already normalized/bounded,
    # e.g. the UCI HAR datasets
    already_normalized = dataset_class.already_normalized

    # Split into training/validation sets
    valid_data, valid_labels, train_data, train_labels = \
        valid_split(dataset.train_data, dataset.train_labels, seed=seed)

    # Calculate normalization only on the training data
    if FLAGS.normalize != "none" and not already_normalized:
        normalization = datasets.calc_normalization(train_data, FLAGS.normalize)

        # Apply the normalization to the training, validation, and testing data
        train_data = datasets.apply_normalization(train_data, normalization)
        valid_data = datasets.apply_normalization(valid_data, normalization)
        test_data = datasets.apply_normalization(dataset.test_data, normalization)
    else:
        test_data = dataset.test_data

    # Saving
    write(train_filename, train_data, train_labels)
    write(valid_filename, valid_data, valid_labels)
    write(test_filename, test_data, dataset.test_labels)
def run_test():
    for current_dataset in Datasets:
        for current_fold in range(1, 11):
            df_train, df_test, ds_infos = ds.load(source_path, current_dataset, current_fold)
            X_train, y_train = df_train.iloc[:, :-1].values, df_train.iloc[:, -1].values
            X_test, y_test = df_test.iloc[:, :-1].values, df_test.iloc[:, -1].values
            for v in range(0, 3, 2):
                fileName = current_dataset + '.BA' + str(current_fold) + '.O5.T10.V' + str(v)
                results_dir = os.path.join(output_path, 'trees10', current_dataset,
                                           'Results_' + current_dataset)
                curr_dir = os.path.join(output_path, 'trees10', current_dataset, fileName)
                source_dir = Path(curr_dir)
                files = source_dir.glob('*.tree')
                resultFile = os.path.join(results_dir, fileName + '.results.txt')
                with open(resultFile, 'w') as f:
                    for file in files:
                        str_file = str(file)
                        # extract the depth encoded between '.D' and '.tree' in the filename
                        pos = str_file.find('.D')
                        pos2 = str_file.find('.tree')
                        depth = str_file[pos + 2:pos2]
                        born_againO5 = tree_io.classifier_from_file(str_file, X_train, y_train,
                                                                    pruning=False)
                        # BornAgainNew
                        banew_test_pred = born_againO5.predict(X_test)
                        report_banew = classification_report(y_test, banew_test_pred,
                                                             output_dict=True)
                        test_acc = report_banew['accuracy']
                        test_F1 = report_banew['weighted avg']['f1-score']
                        f.write(depth)
                        f.write(" " + str(test_acc))
                        f.write(" " + str(test_F1))
                        f.write("\n")
                print(fileName)
                tree_view.plotStatistics(resultFile)
def main(argv):
    # Don't bother using the GPU for this
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

    # Input data
    if FLAGS.target != "":
        source_dataset, target_dataset = datasets.load_da(FLAGS.source, FLAGS.target)
    else:
        source_dataset = datasets.load(FLAGS.source)
        target_dataset = None

    if not FLAGS.test:
        source_data = source_dataset.train_images
        source_labels = source_dataset.train_labels
        target_data = target_dataset.train_images \
            if target_dataset is not None else None
        target_labels = target_dataset.train_labels \
            if target_dataset is not None else None
    else:
        source_data = source_dataset.test_images
        # assumed: display() needs the matching test labels in this branch
        source_labels = source_dataset.test_labels
        target_data = target_dataset.test_images \
            if target_dataset is not None else None
        target_labels = target_dataset.test_labels \
            if target_dataset is not None else None

    display("Source", source_data, source_labels,
            office="office_" in FLAGS.source)

    if target_dataset is not None:
        display("Target", target_data, target_labels,
                office="office_" in FLAGS.target)

    plt.show()
import os

from flask import Flask

from datasets import load, load_buildings, load_invest

app = Flask(__name__)
app.url_map.strict_slashes = False

_dir = os.path.dirname(os.path.abspath(__file__))
app.template_folder = os.path.join(_dir, "templates")
app.static_folder = os.path.join(_dir, "static")
app.config['UPLOAD_FOLDER'] = os.path.join(_dir, "upload")

power = load()
invest = load_invest()
apart = load_buildings()
def monks(task_type, param_grid, model_assessment=False):
    # This file contains the whole dataset; we rely on it instead of the
    # provided splitting so that we can simulate a hold-out split ourselves.
    dataset = ds.load('datasets/' + task_type + '.test', 'monks')
    dataset.shuffle()  # because data are taken randomly in monks-*.train

    # Simple hold-out strategy:
    # ~123 elements for the training set, as in the original splitting (monks-1, monks-3)
    splitting = 43 / 100
    if task_type == 'monks-2':
        # monks-2 uses ~169 elements in the training set
        splitting = 59 / 100

    trainvalset, testset = dataset.split(splitting)
    # the validation set is half of the training set
    trainset, validationset = trainvalset.split(66.6 / 100)

    for params in ms.grid_search(param_grid):
        # a batch size of -1 means the batch equals the entire training set
        params['batch_size'] = params['batch_size'] if params['batch_size'] > 0 else trainset.size()
        print(params)

        epochs = params['epochs']  # value taken from the monks problem paper
        batch_size = params['batch_size']

        # Trying different runs, to be independent from random weight init
        # and to have a bias-variance estimation (ensemble learning) when
        # using inference on the test set.
        runs_number = 3  # 5 can be used as well
        for r in range(runs_number):
            # init several instances of the model for a better estimate of the metrics
            nn.from_parameters(params, 'sigmoid', 'sigmoid')
            model = nn.build()
            ms.add_model(model)

        ms.set_datasets(trainset, validationset)

        start_time = time.time()
        for e in range(epochs):
            printProgressBar(e + 1, epochs, prefix='Training:', suffix='Complete')
            # for each model we initialized above
            for model_id, model in ms.models():
                # one step of training
                model.fit(trainset, batch_size, e)
                # compute the output values for this training step
                train_outputs = model.forward_dataset(trainset)
                val_outputs = model.forward_dataset(validationset)
                # compute the metrics
                ms.compute_error(model_id, train_outputs, val_outputs)
                ms.compute_other(model_id, train_outputs, val_outputs,
                                 metrics=['acc'], threshold=0.5)
        training_time = time.time() - start_time
        print("TRAINING TIME " + str(training_time) + " seconds")

        # averages of errors and accuracy
        avg_tr_error, avg_val_error = ms.avg_mse()
        avg_tr_acc, avg_val_acc = ms.avg_acc()
        # precision and recall will be used during model assessment (see below)
        final_accuracy = avg_val_acc[-1]

        res.set_task(task_type)
        plt = res.plot_mse(epochs, avg_tr_error, avg_val_error, params, final_accuracy)
        msepath = res.save_plot(plt, 'mse')
        plt = res.plot_acc(epochs, avg_tr_acc, avg_val_acc, params)
        res.save_plot(plt, 'acc')

        # adding the result
        res.add_result(avg_tr_error[-1], avg_val_error[-1], params['batch_size'],
                       params['weights_bound'], params['learning_rate'],
                       params['momentum_alpha'], final_accuracy, msepath)

        if not model_assessment:
            # cleaning model selection for the next run
            ms.clean()

    res.add_result_header('mse_tr', 'mse_val', 'batch_s', 'weights', 'lr',
                          'm_alpha', 'acc', 'path')
    res.save_results()

    # WARNING: this code must be executed only once, and only after model
    # selection, otherwise we would invalidate the test set.
    if model_assessment:
        # use the test set to assess the model performance
        trained_models = [m for _, m in ms.models()]
        voted_outputs = []
        avg_outputs = []
        for batch in testset.batch(1):
            for pattern in batch:
                tmp_voted_outputs = []
                tmp_real_outputs = []
                for m in trained_models:
                    class_out, real_out = m.classify(pattern[1], threshold=0.5)
                    tmp_voted_outputs.append(class_out)
                    tmp_real_outputs.append(real_out)
                # take the most frequent class (majority vote)
                voted_outputs.append(mode(tmp_voted_outputs))
                # take the average output to compute the error
                avg_outputs.append([mean(tmp_real_outputs)])

        metrics = ms.get_metrics()
        target_outputs = [x[0] for x in testset.data_set[:, 2]]
        # computing accuracy, recall and precision on the test set
        acc = metrics.accuracy(voted_outputs, target_outputs)
        recall = metrics.recall(voted_outputs, target_outputs)
        precision = metrics.precision(voted_outputs, target_outputs)
        mse = metrics.mean_square_error(avg_outputs, testset.data_set[:, 2])
        print("ACCURACY " + str(acc))
        print("PRECISION " + str(precision))
        print("RECALL " + str(recall))
        print("MSE " + str(mse))
def __init__(self, username, password, download_directory=None):
    self.data_dir = self.setup_data_dir(download_directory)
    self.datasets = datasets.load()
    res = self.login(username, password)
"Russian Federation":"Russia", "Congo, Dem. Rep.":"Congo (Kinshasa)", "Venezuela, RB":"Venezuela", "St. Lucia":"Saint Lucia", "St. Vincent and the Grenadines":"Saint Vincent and the Grenadines", "Congo, Rep.":"Republic of the Congo", "Bahamas, The":"The Bahamas", "Gambia, The":"The Gambia" } for t in trans : s["Country/Region"] = s["Country/Region"].replace(t, trans[t]) return(s) if __name__ == "__main__": dsets = datasets.load() covid = datasets.combine(dsets) if (os.path.isfile(WDI_FILE)) : warnings.warn("Reading cached WDI data from disk, delete file to download updated") wdi = pd.read_pickle(WDI_FILE) else : wdi = covid.drop(columns=["Date","Province/State","Lat","Long", datasets.CONFIRMED,"deaths","recoveries"]).drop_duplicates() for id in INDICES_USED: s = wb.download(indicator=id, country="all", start=2005, end=2019).reset_index() # use most recent non missing value s = s.dropna().groupby("country").last() s = s.drop(columns="year").reset_index() # match country names to covid data s = s.rename(columns={"country":"Country/Region"})
parser.add_argument('-name')
parser.add_argument('-iteration', type=int)
args = vars(parser.parse_args())

print('Running iteration #%d for dataset %s...' % (args['iteration'], args['name']))

RESULTS_PATH = os.path.join(os.path.dirname(__file__), 'results')
TRIAL_PATH = os.path.join(RESULTS_PATH, args['name'])

for path in [RESULTS_PATH, TRIAL_PATH]:
    if not os.path.exists(path):
        os.mkdir(path)

partitions = load(args['name'])

for i in range(5):
    partition = partitions[i]
    for j in range(2):
        # swap the roles of the two halves of each partition (5x2 cross-validation)
        X_train, y_train = partition[j % 2]
        X_test, y_test = partition[(j + 1) % 2]
        mask = select(X_train, y_train, verbose=True)
        base_score = score(X_train, y_train, X_test, y_test, RandomForestClassifier())
        selection_score = score(X_train[:, mask], y_train, X_test[:, mask], y_test,
                                RandomForestClassifier())
        features = []
def evaluate_trial(resampler_name, fold):
    RESULTS_PATH = Path(__file__).parents[0] / 'results_final'
    RANDOM_STATE = 42

    resamplers = {
        'SMOTE': sv.SMOTE(random_state=RANDOM_STATE),
        'polynom-fit-SMOTE': sv.polynom_fit_SMOTE(random_state=RANDOM_STATE),
        'Lee': sv.Lee(random_state=RANDOM_STATE),
        'SMOBD': sv.SMOBD(random_state=RANDOM_STATE),
        'G-SMOTE': sv.G_SMOTE(random_state=RANDOM_STATE),
        'LVQ-SMOTE': sv.LVQ_SMOTE(random_state=RANDOM_STATE),
        'Assembled-SMOTE': sv.Assembled_SMOTE(random_state=RANDOM_STATE),
        'SMOTE-TomekLinks': sv.SMOTE_TomekLinks(random_state=RANDOM_STATE),
        'RBO': RBO(random_state=RANDOM_STATE),
        'PA': PA(random_state=RANDOM_STATE),
    }

    for dataset_name in datasets.names():
        classifiers = {
            'CART': DecisionTreeClassifier(random_state=RANDOM_STATE),
            'KNN': KNeighborsClassifier(n_neighbors=3),
            'SVM': SVC(kernel='rbf', random_state=RANDOM_STATE),
            'MLP': MLPClassifier(random_state=RANDOM_STATE),
        }

        trial_name = f'{dataset_name}_{fold}_{resampler_name}'
        trial_path = RESULTS_PATH / f'{trial_name}.csv'

        if trial_path.exists():
            continue

        logging.info(f'Evaluating {trial_name}...')

        dataset = datasets.load(dataset_name)
        (X_train, y_train), (X_test, y_test) = dataset[fold][0], dataset[fold][1]

        resampler = resamplers[resampler_name]

        assert len(np.unique(y_train)) == len(np.unique(y_test)) == 2

        X_train, y_train = resampler.sample(X_train, y_train)

        rows = []
        for classifier_name in classifiers.keys():
            classifier = classifiers[classifier_name]
            clf = classifier.fit(X_train, y_train)
            predictions = clf.predict(X_test)

            scoring_functions = {
                'Precision': metrics.precision,
                'Recall': metrics.recall,
                'AUC': metrics.auc,
                'G-mean': metrics.g_mean,
            }
            for scoring_function_name in scoring_functions.keys():
                score = scoring_functions[scoring_function_name](y_test, predictions)
                row = [dataset_name, fold, classifier_name, resampler_name,
                       scoring_function_name, score]
                rows.append(row)

        columns = ['Dataset', 'Fold', 'Classifier', 'Resampler', 'Metric', 'Score']

        RESULTS_PATH.mkdir(exist_ok=True, parents=True)
        pd.DataFrame(rows, columns=columns).to_csv(trial_path, index=False)
def extract(k=5, verbose=True):
    rows = []
    columns = ['Name', 'DI', 'IR', 'Samples', 'Features']

    for name in tqdm(datasets.names()):
        dataset = datasets.load(name)
        (X_train, y_train), (X_test, y_test) = dataset[0][0], dataset[0][1]

        X = np.concatenate([X_train, X_test])
        y = np.concatenate([y_train, y_test])

        n_samples = X.shape[0]
        n_features = X.shape[1]

        majority_class = Counter(y).most_common()[0][0]
        n_majority_samples = Counter(y).most_common()[0][1]
        n_minority_samples = Counter(y).most_common()[1][1]

        imbalance_ratio = np.round(n_majority_samples / n_minority_samples, 2)

        # k + 1 neighbors because each point is its own nearest neighbor
        knn = NearestNeighbors(n_neighbors=k + 1).fit(X)

        difficulty_coefficients = []
        for X_i, y_i in zip(X, y):
            if y_i == majority_class:
                continue
            # fraction of a minority point's k neighbors that are majority
            indices = knn.kneighbors([X_i], return_distance=False)[0, 1:]
            n_majority_neighbors = sum(y[indices] == majority_class)
            difficulty_coefficients.append(n_majority_neighbors / k)

        difficulty_index = np.round(np.mean(difficulty_coefficients), 3)

        rows.append([name, difficulty_index, imbalance_ratio, n_samples, n_features])

    df = pd.DataFrame(rows, columns=columns)
    df = df.sort_values('DI')
    df.to_csv(Path(__file__).parent / 'results' / 'dataset_info.csv', index=False)

    if verbose:
        for column in ['DI', 'IR']:
            df[column] = df[column].map(lambda x: f'{x:.2f}')

        # print the table as two side-by-side LaTeX columns of 30 rows each
        for i in range(30):
            row = [str(df.iloc[i][c]) for c in columns]
            if i + 30 < len(df):
                row += [str(df.iloc[i + 30][c]) for c in columns]
            else:
                row += ['' for _ in columns]
            print(' & '.join(row).replace('_', '\\_') + ' \\\\')

    return df
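
# Sanity check of the difficulty coefficient above (toy data, not a real
# dataset): a minority point whose k nearest neighbors are all majority
# samples gets the maximal coefficient 1.0.
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.array([[0.0], [0.1], [0.2], [0.3], [0.4], [0.5], [5.0]])
y = np.array([0, 0, 0, 0, 0, 0, 1])  # one isolated minority point
k = 5
knn = NearestNeighbors(n_neighbors=k + 1).fit(X)
indices = knn.kneighbors([X[-1]], return_distance=False)[0, 1:]
print(sum(y[indices] == 0) / k)  # 1.0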
def run():
    while True:
        trial = pull_pending()
        if trial is None:
            break

        params = eval(trial['Parameters'])
        logging.info(trial)

        dataset = load(trial['Dataset'])
        fold = int(trial['Fold']) - 1
        (X_train, y_train), (X_test, y_test) = dataset[fold][0], dataset[fold][1]

        n_minority = Counter(y_train).most_common()[1][1]
        n_majority = Counter(y_train).most_common()[0][1]

        # Convert the desired balancing ratios into the minority/majority
        # fractions expected by imbalanced-learn.
        imblearn_ratios = [
            ((n_majority - n_minority) * ratio + n_minority) / n_majority
            for ratio in [0.5, 0.75, 1.0]
        ]

        clf = {
            'NB': NB(),
            'KNN': KNN(),
            'SVM': SVM(gamma='scale'),
            'CART': CART(),
        }[params['classifier']]

        if (trial['Algorithm'] is None) or (trial['Algorithm'] == 'None'):
            algorithm = None
        else:
            algorithms = {
                'AKNN': ResamplingCV(AKNN, clf, n_neighbors=[1, 3, 5, 7]),
                'Bord': ResamplingCV(SMOTE, clf, kind=['borderline1'],
                                     k_neighbors=[1, 3, 5, 7, 9], m_neighbors=[5, 10, 15],
                                     sampling_strategy=imblearn_ratios),
                'CC': ResamplingCV(CC, clf, sampling_strategy=imblearn_ratios),
                'CNN': ResamplingCV(CNN, clf, n_neighbors=[1, 3, 5, 7]),
                'ENN': ResamplingCV(ENN, clf, n_neighbors=[1, 3, 5, 7]),
                'IHT': ResamplingCV(IHT, clf, sampling_strategy=imblearn_ratios, cv=[2]),
                'NCL': ResamplingCV(NCL, clf, n_neighbors=[1, 3, 5, 7]),
                'NM': ResamplingCV(NM, clf, n_neighbors=[1, 3, 5, 7]),
                'OSS': ResamplingCV(OSS, clf, n_neighbors=[1, 3, 5, 7]),
                'RBO': ResamplingCV(RBO, clf, gamma=[0.01, 0.1, 1.0, 10.0],
                                    ratio=[0.5, 0.75, 1.0]),
                'RBU': ResamplingCV(RBU, clf, gamma=params.get('gamma'),
                                    ratio=params.get('ratio')),
                'RENN': ResamplingCV(RENN, clf, n_neighbors=[1, 3, 5, 7]),
                'ROS': ResamplingCV(ROS, clf, sampling_strategy=imblearn_ratios),
                'RUS': ResamplingCV(RUS, clf, sampling_strategy=imblearn_ratios),
                'SMOTE': ResamplingCV(SMOTE, clf, k_neighbors=[1, 3, 5, 7, 9],
                                      sampling_strategy=imblearn_ratios),
                'SMOTE+ENN': ResamplingCV(
                    SMOTEENN, clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'SMOTE+TL': ResamplingCV(
                    SMOTETomek, clf,
                    smote=[SMOTE(k_neighbors=k) for k in [1, 3, 5, 7, 9]],
                    sampling_strategy=imblearn_ratios),
                'TL': TL(),
            }
            algorithm = algorithms.get(trial['Algorithm'])
            if algorithm is None:
                raise NotImplementedError

        if algorithm is not None:
            X_train, y_train = algorithm.fit_sample(X_train, y_train)

        clf = clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        scores = {
            'Precision': metrics.precision(y_test, predictions),
            'Recall': metrics.recall(y_test, predictions),
            'F-measure': metrics.f_measure(y_test, predictions),
            'AUC': metrics.auc(y_test, predictions),
            'G-mean': metrics.g_mean(y_test, predictions),
        }

        submit_result(trial, scores)
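
# Worked example (toy counts, not from any trial) of the ratio conversion in
# run() above: a balancing ratio r grows the minority class r of the way to
# the majority size, expressed as the minority/majority fraction that
# imbalanced-learn's sampling_strategy expects.
n_majority, n_minority = 90, 10
for ratio in [0.5, 0.75, 1.0]:
    print(((n_majority - n_minority) * ratio + n_minority) / n_majority)
# -> 0.555..., 0.777..., 1.0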
def run():
    while True:
        trial = pull_pending()
        if trial is None:
            break

        params = eval(trial['Parameters'])
        print(trial)

        clf = eval(params['classifier'])()

        if trial['Algorithm'] == 'RBO':
            algorithm = RBO(gamma=params['gamma'], n_steps=params['n_steps'],
                            step_size=params['step_size'],
                            stop_probability=params['stop_probability'],
                            criterion=params['criterion'])
        elif trial['Algorithm'] == 'RBOSelection':
            if params['measure'] == 'AUC':
                measure = metrics.roc_auc_score
            else:
                raise NotImplementedError

            algorithm = RBOSelection(classifier=clf, measure=measure,
                                     gammas=params['gammas'], n_steps=params['n_steps'],
                                     step_size=params['step_size'],
                                     stop_probability=params['stop_probability'],
                                     criterion=params['criterion'])
        elif (trial['Algorithm'] is None) or (trial['Algorithm'] == 'None'):
            algorithm = None
        else:
            algorithms = {
                'SMOTE': SMOTE(),
                'SMOTE+ENN': SMOTEENN(),
                'SMOTE+TL': SMOTETomek(),
                'Bord': SMOTE(kind='borderline1'),
                'ADASYN': ADASYN(),
                'NCL': NCL(),
            }
            algorithm = algorithms.get(trial['Algorithm'])
            if algorithm is None:
                raise NotImplementedError

        dataset = load(trial['Dataset'], noise_type=params.get('noise_type', None),
                       noise_level=params.get('noise_level', 0.0))
        fold = int(trial['Fold']) - 1
        (X_train, y_train), (X_test, y_test) = dataset[fold][0], dataset[fold][1]

        labels = np.unique(y_test)
        counts = [len(y_test[y_test == label]) for label in labels]
        minority_class = labels[np.argmin(counts)]

        if algorithm.__class__ in [SMOTE, SMOTEENN, SMOTETomek]:
            # Cap the number of SMOTE neighbors at the size of the training
            # minority class minus one.
            train_labels = np.unique(y_train)
            train_counts = [len(y_train[y_train == train_label])
                            for train_label in train_labels]
            train_minority_class = train_labels[np.argmin(train_counts)]  # was labels[...], likely a typo
            algorithm.k = algorithm.k_neighbors = \
                min(len(y_train[y_train == train_minority_class]) - 1, 5)

        if algorithm is not None:
            X_train, y_train = algorithm.fit_sample(X_train, y_train)

        clf = clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        # G-mean: geometric mean of the per-class accuracies
        g_mean = 1.0
        for label in np.unique(y_test):
            idx = (y_test == label)
            g_mean *= metrics.accuracy_score(y_test[idx], predictions[idx])
        g_mean = np.sqrt(g_mean)

        scores = {
            'Accuracy': metrics.accuracy_score(y_test, predictions),
            'Average accuracy': np.mean(metrics.recall_score(y_test, predictions,
                                                             average=None)),
            'Precision': metrics.precision_score(y_test, predictions,
                                                 pos_label=minority_class),
            'Recall': metrics.recall_score(y_test, predictions, pos_label=minority_class),
            'F-measure': metrics.f1_score(y_test, predictions, pos_label=minority_class),
            'AUC': metrics.roc_auc_score(y_test, predictions),
            'G-mean': g_mean,
        }

        submit_result(trial, scores)
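
# Sanity check (toy labels, separate from the trial runner above): for binary
# problems the G-mean loop equals the square root of the product of per-class
# recalls, which scikit-learn can compute directly.
import numpy as np
from sklearn.metrics import recall_score

y_true = np.array([0, 0, 0, 1, 1])
y_pred = np.array([0, 0, 1, 1, 1])
per_class_recall = recall_score(y_true, y_pred, average=None)  # [2/3, 1.0]
print(np.sqrt(np.prod(per_class_recall)))  # ~0.816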
def cup(param_grid):
    dataset = ds.load('datasets/ML-CUP19-TR.csv', 'CUP')

    # We train on the previous training set and validation set combined,
    # to have more data.
    trainset, testset = dataset.split(75 / 100)

    params = next(ms.grid_search(param_grid))
    print(params)
    params['batch_size'] = params['batch_size'] if params['batch_size'] > 0 else trainset.size()

    epochs = params['epochs']
    batch_size = params['batch_size']

    runs_number = 1
    for run in range(runs_number):
        nn.from_parameters(params, 'sigmoid', 'linear')
        model = nn.build()
        ms.add_model(model)

    ms.set_datasets(trainset, testset)

    start = time.time()
    for e in range(epochs):
        ppb(e + 1, epochs, prefix='Training', suffix='Completed')
        for model_id, model in ms.models():
            model.fit(trainset, batch_size, e)
            train_outputs = model.forward_dataset(trainset)
            test_outputs = model.forward_dataset(testset)
            ms.compute_error(model_id, train_outputs, test_outputs, metrics=['mse', 'mee'])
    training_time = time.time() - start
    print('TRAINING TIME: ' + str(training_time) + ' seconds')

    avg_tr_mse, avg_ts_mse = ms.avg_mse()
    avg_tr_mee, avg_ts_mee = ms.avg_mee()

    res.set_task('CUP')
    plt = res.plot_mse(epochs, avg_tr_mse, avg_ts_mse, params, label2='test')
    msepath = res.save_plot(plt, 'mse')
    plt = res.plot_mee(epochs, avg_tr_mee, avg_ts_mee, params, label2='test')
    res.save_plot(plt, 'mee')

    print("TRAINING MSE " + str(avg_tr_mse[-1]))
    print("TRAINING MEE " + str(avg_tr_mee[-1]))

    # Use the test set to assess the model performance.
    trained_models = [m for _, m in ms.models()]
    avg_outputs = []
    for batch in testset.batch(1):
        for pattern in batch:
            tmp_real_outputs_x = []
            tmp_real_outputs_y = []
            for m in trained_models:
                real_out = m.feed_forward(pattern[1])
                tmp_real_outputs_x.append(real_out[0])
                tmp_real_outputs_y.append(real_out[1])
            # average output over the ensemble to compute the error
            avg_outputs.append([mean(tmp_real_outputs_x), mean(tmp_real_outputs_y)])

    metrics = ms.get_metrics()
    mse = metrics.mean_square_error(avg_outputs, testset.data_set[:, 2])
    mee = metrics.mean_euclidian_error(avg_outputs, testset.data_set[:, 2])
    print("MSE " + str(mse))
    print("MEE " + str(mee))

    # Predictions for the blind test set.
    blindds = ds.load_blind('datasets/ML-CUP19-TS.csv', 'CUP')
    avg_outputs = []
    for batch in blindds.batch(1):
        for pattern in batch:
            tmp_real_outputs_x = []
            tmp_real_outputs_y = []
            for m in trained_models:
                real_out = m.feed_forward(pattern[1])
                tmp_real_outputs_x.append(real_out[0])
                tmp_real_outputs_y.append(real_out[1])
            # average output over the ensemble
            avg_outputs.append([mean(tmp_real_outputs_x), mean(tmp_real_outputs_y)])

    with open("report/poxebur_wikilele_ML-CUP-TS.csv", "w") as cupfile:  # "w" truncates any previous contents
        cupfile.write("# Leonardo Frioli Luigi Quarantiello \n")
        cupfile.write("# poxebur_wikilele \n")
        cupfile.write("# ML-CUP19 \n")
        cupfile.write("# 10/01/2020 \n")
        for i in range(len(avg_outputs)):
            cupfile.write(str(i + 1) + ", " + str(avg_outputs[i][0]) + ", " +
                          str(avg_outputs[i][1]) + "\n")
def cup(param_grid):
    dataset = ds.load('datasets/ML-CUP19-TR.csv', 'CUP')

    # 25% test set, 75% training set + validation set
    trainvalset, testset = dataset.split(75 / 100)
    # with hold-out, the validation set is half the size of the training set
    trainset, validationset = trainvalset.split(66.6 / 100)

    for params in ms.grid_search(param_grid):
        params['batch_size'] = params['batch_size'] if params['batch_size'] > 0 else trainset.size()
        print(params)

        epochs = params['epochs']
        batch_size = params['batch_size']

        runs_number = 1
        for run in range(runs_number):
            nn.from_parameters(params, 'sigmoid', 'linear')
            model = nn.build()
            ms.add_model(model)

        ms.set_datasets(trainset, validationset)

        start = time.time()
        for e in range(epochs):
            ppb(e + 1, epochs, prefix='Training', suffix='Completed')
            for model_id, model in ms.models():
                model.fit(trainset, batch_size, e)
                train_outputs = model.forward_dataset(trainset)
                val_outputs = model.forward_dataset(validationset)
                ms.compute_error(model_id, train_outputs, val_outputs,
                                 metrics=['mse', 'mee'])
        training_time = time.time() - start
        print('TRAINING TIME: ' + str(training_time) + ' seconds')

        avg_tr_mse, avg_val_mse = ms.avg_mse()
        avg_tr_mee, avg_val_mee = ms.avg_mee()

        res.set_task('CUP')
        plt = res.plot_mse(epochs, avg_tr_mse, avg_val_mse, params)
        msepath = res.save_plot(plt, 'mse')
        plt = res.plot_mee(epochs, avg_tr_mee, avg_val_mee, params)
        res.save_plot(plt, 'mee')

        res.add_result(avg_tr_mse[-1], avg_val_mse[-1], avg_tr_mee[-1], avg_val_mee[-1],
                       params['epochs'], params['batch_size'], params['weights_bound'],
                       params['learning_rate'], params['momentum_alpha'],
                       params['use_nesterov'], params['regularization_lambda'], msepath)
        ms.clean()

    res.add_result_header('mse_tr', 'mse_val', 'mee_tr', 'mee_val', 'batch_s', 'weights',
                          'lr', 'm_alpha', 'nesterov', 'r_lambda', 'path')
    res.save_results()
import numpy as np
import pandas

from KarfNN.layer import Dense, Dropout
from KarfNN.models import Karf
from datasets import load


def toDummies(df, Columns):
    for Column in Columns:
        new_df = pandas.get_dummies(df[Column], prefix=Column)
        df = pandas.concat([df, new_df], axis=1)
    df = df.drop(Columns, axis=1)
    return df


np.random.seed(1)

df = load("Iris")

# shuffle data
data = df.iloc[np.random.permutation(len(df))]

# split data into X and y and encode species names as numbers
X = data.drop(["Id", "Species"], axis=1).astype(float)  # np.float was removed in NumPy 1.24
y = data[["Species"]]

# one-hot encoding for the output vector
y = toDummies(y, ["Species"])

# split data into training and testing sets
train_split = int(len(X) * 0.75)
Xtrain = X[:train_split].values
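
# Quick illustration of toDummies on a toy frame (independent of the Iris
# data above); get_dummies yields one indicator column per category, with
# 0/1 or boolean values depending on the pandas version.
demo = pandas.DataFrame({'Species': ['setosa', 'virginica', 'setosa']})
print(toDummies(demo, ['Species']))
# columns: Species_setosa, Species_virginica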
def run_average():
    for v in range(0, 3, 2):
        for current_dataset in Datasets:
            avgAccRF = avgAccBA = avgAccBANew = avgF1RF = avgF1BA = avgF1BANew = 0.0
            for current_fold in range(1, 11):
                df_train, df_test, ds_infos = ds.load(source_path, current_dataset, current_fold)
                X_train, y_train = df_train.iloc[:, :-1].values, df_train.iloc[:, -1].values
                X_test, y_test = df_test.iloc[:, :-1].values, df_test.iloc[:, -1].values

                fileName = current_dataset + '.RF' + str(current_fold) + ".txt"
                random_forest_file = os.path.join(source_path, 'resources', 'forests',
                                                  current_dataset, fileName)
                random_forest = tree_io.classifier_from_file(random_forest_file, X_train,
                                                             y_train, pruning=False)

                fileName = current_dataset + '.BA' + str(current_fold) + '.O0.T10.tree'
                born_again_O0_file = os.path.join(output_path, 'trees10', current_dataset, fileName)
                born_againO0 = tree_io.classifier_from_file(born_again_O0_file, X_train,
                                                            y_train, pruning=False)

                fileName = current_dataset + '.BA' + str(current_fold) + '.O5.T10.V' + str(v) + '.tree'
                born_again_O5_file = os.path.join(output_path, 'trees10', current_dataset,
                                                  'ExactDepth', fileName)
                print(born_again_O5_file)
                born_againO5 = tree_io.classifier_from_file(born_again_O5_file, X_train,
                                                            y_train, pruning=False)

                # RandomForest
                rf_test_pred = random_forest.predict(X_test)
                rf_train_pred = random_forest.predict(X_train)
                report_rf = classification_report(y_test, rf_test_pred, output_dict=True)
                report_rf_train = classification_report(y_train, rf_train_pred, output_dict=True)

                # BornAgain
                ba_test_pred = born_againO0.predict(X_test)
                ba_train_pred = born_againO0.predict(X_train)
                report_ba = classification_report(y_test, ba_test_pred, output_dict=True)
                report_ba_train = classification_report(y_train, ba_train_pred, output_dict=True)

                # BornAgainNew
                banew_test_pred = born_againO5.predict(X_test)
                banew_train_pred = born_againO5.predict(X_train)
                report_banew = classification_report(y_test, banew_test_pred, output_dict=True)
                report_banew_train = classification_report(y_train, banew_train_pred, output_dict=True)

                add_report(df, 'RandomForest', report_rf_train, report_rf)
                add_report(df, 'BornAgain', report_ba_train, report_ba)
                add_report(df, 'BornAgainNew', report_banew_train, report_banew)

                avgAccRF += report_rf_train['accuracy']
                avgF1RF += report_rf['weighted avg']['f1-score']
                avgAccBA += report_ba_train['accuracy']
                avgF1BA += report_ba['weighted avg']['f1-score']
                avgAccBANew += report_banew_train['accuracy']
                avgF1BANew += report_banew['weighted avg']['f1-score']

            print("Average RF Accuracy and F1 in " + current_dataset + " with value "
                  + str(v) + " : " + str(avgAccRF / 10) + " " + str(avgF1RF / 10))
            print("Average BA Accuracy and F1 in " + current_dataset + " with value "
                  + str(v) + " : " + str(avgAccBA / 10) + " " + str(avgF1BA / 10))
            print("Average BANew Accuracy and F1 in " + current_dataset + " with value "
                  + str(v) + " : " + str(avgAccBANew / 10) + " " + str(avgF1BANew / 10))

        a = pd.DataFrame(data=df, index=None)
        path = output_path + '/ResultsV' + str(v) + '.xlsx'
        a.to_excel(path)
def evaluate_trial(ratio, fold):
    RESULTS_PATH = Path(__file__).parents[0] / 'results_ratio'
    RANDOM_STATE = 42

    for dataset_name in datasets.names():
        classifiers = {
            'CART': DecisionTreeClassifier(random_state=RANDOM_STATE),
            'KNN': KNeighborsClassifier(n_neighbors=3),
            'SVM': SVC(kernel='rbf', random_state=RANDOM_STATE),
            'MLP': MLPClassifier(random_state=RANDOM_STATE),
        }

        trial_name = f'{dataset_name}_{fold}_{ratio}'
        trial_path = RESULTS_PATH / f'{trial_name}.csv'

        if trial_path.exists():
            continue

        logging.info(f'Evaluating {trial_name}...')

        dataset = datasets.load(dataset_name)
        (X_train, y_train), (X_test, y_test) = dataset[fold][0], dataset[fold][1]

        resampler = PA(ratio=ratio, random_state=RANDOM_STATE)

        assert len(np.unique(y_train)) == len(np.unique(y_test)) == 2

        try:
            X_train, y_train = resampler.sample(X_train, y_train)
        except RuntimeError:
            continue

        rows = []
        for classifier_name in classifiers.keys():
            classifier = classifiers[classifier_name]
            clf = classifier.fit(X_train, y_train)
            predictions = clf.predict(X_test)

            scoring_functions = {
                'Precision': metrics.precision,
                'Recall': metrics.recall,
                'AUC': metrics.auc,
                'G-mean': metrics.g_mean,
            }
            for scoring_function_name in scoring_functions.keys():
                score = scoring_functions[scoring_function_name](y_test, predictions)
                row = [dataset_name, fold, classifier_name, ratio,
                       scoring_function_name, score]
                rows.append(row)

        columns = ['Dataset', 'Fold', 'Classifier', 'Ratio', 'Metric', 'Score']

        RESULTS_PATH.mkdir(exist_ok=True, parents=True)
        pd.DataFrame(rows, columns=columns).to_csv(trial_path, index=False)
from __future__ import division, print_function

import numpy as np

import utils
from model01 import MLPModel01
from metrics import performance_report
import datasets

n_categories = 2  # implicit in prepare_data (maybe parameterise)
lookahead = 1
window = 60
sym = 'USDJPY'

# In[21]:

X_train, Y_train, prices_train, _ = datasets.load(
    datasets.filename('DS2', lookahead, window, sym, 2009))

X_dev, Y_dev, prices_dev, _ = datasets.load(
    datasets.filename('DS2', lookahead, window, sym, 2010))

# sample 50k records from 2010 as dev set
dev_idx = np.random.choice(len(X_dev), 50000, replace=False)
X_dev, Y_dev, prices_dev = (X_dev.iloc[dev_idx], Y_dev.iloc[dev_idx],
                            prices_dev.iloc[dev_idx])  # .ix was removed in pandas 1.0

X_test, Y_test, prices_test, _ = datasets.load(
    datasets.filename('DS2', lookahead, window, sym, 2011))

# In[23]:

print("train", X_train.shape)
print("dev", X_dev.shape)
def extract(verbose=True):
    dfs = []

    for partition in ['preliminary', 'final']:
        rows = []

        for name in tqdm(datasets.names(partition)):
            dataset = datasets.load(name)
            (X_train, y_train), (X_test, y_test) = dataset[0][0], dataset[0][1]

            X = np.concatenate([X_train, X_test])
            y = np.concatenate([y_train, y_test])

            n_samples = X.shape[0]
            n_features = X.shape[1]

            majority_class = Counter(y).most_common()[0][0]
            n_majority_samples = Counter(y).most_common()[0][1]
            n_minority_samples = Counter(y).most_common()[1][1]

            imbalance_ratio = np.round(n_majority_samples / n_minority_samples, 2)

            # 6 neighbors: each point plus its 5 nearest neighbors
            knn = NearestNeighbors(n_neighbors=6).fit(X)

            n_safe = 0
            n_borderline = 0
            n_rare = 0
            n_outliers = 0

            # Categorize each minority sample by how many of its 5 nearest
            # neighbors belong to the majority class.
            for X_i, y_i in zip(X, y):
                if y_i == majority_class:
                    continue

                indices = knn.kneighbors([X_i], return_distance=False)[0, 1:]
                n_majority_neighbors = sum(y[indices] == majority_class)

                if n_majority_neighbors in [0, 1]:
                    n_safe += 1
                elif n_majority_neighbors in [2, 3]:
                    n_borderline += 1
                elif n_majority_neighbors == 4:
                    n_rare += 1
                elif n_majority_neighbors == 5:
                    n_outliers += 1
                else:
                    raise ValueError

            n_total = n_safe + n_borderline + n_rare + n_outliers

            percentage_safe = np.round(n_safe / n_total * 100, 2)
            percentage_borderline = np.round(n_borderline / n_total * 100, 2)
            percentage_rare = np.round(n_rare / n_total * 100, 2)
            percentage_outlier = np.round(n_outliers / n_total * 100, 2)

            rows.append([
                name, imbalance_ratio, n_samples, n_features, percentage_safe,
                percentage_borderline, percentage_rare, percentage_outlier
            ])

        df = pd.DataFrame(rows, columns=[
            'name', 'imbalance_ratio', 'n_samples', 'n_features', 'percentage_safe',
            'percentage_borderline', 'percentage_rare', 'percentage_outlier'
        ])
        df = df.sort_values('imbalance_ratio')
        dfs.append(df)

    df = pd.concat(dfs).reset_index(drop=True)
    df.to_csv(Path(__file__).parent / 'results' / 'dataset_info.csv', index=False)

    if verbose:
        for i, row in df.iterrows():
            row = [str(v).replace('_', '\\_') for v in row]
            print(' & '.join(row) + ' \\\\')
            if i == 19:
                print('\\midrule')

    return df
    eval_model = Evaluate(scan_model)
    results = eval_model.evaluate(np.array(test_x), np.array(test_y),
                                  task='continuous', folds=10, metric='loss')
    return np.array([inverse_transform(scaler, result) for result in results])


if __name__ == "__main__":
    base_dir = os.getcwd()
    start_time = datetime.now()
    experiment_name = start_time.strftime("%m_%d_%Y_%H_%M_%S")

    scaler = MinMaxScaler()
    dataset = 'hele_norge'
    train_x, train_y, validation_x, validation_y, test_x, test_y, scaler = \
        datasets.load(f'../input/{dataset}.csv', scaler)

    # Save the scaler for future predictions
    joblib.dump(scaler, f'../talos_training/{dataset}.scaler')

    round_lim = 30

    if len(sys.argv) == 2:
        print("10-feature training initialized")
        features = ['boligtype_Leilighet', 'boligtype_Enebolig', 'bruksareal',
                    'boligtype_Tomannsbolig', 'postnummer', 'boligtype_Rekkehus',
                    'neighborhood_environment_demographics_housingage_10-30',
                    'neighborhood_environment_demographics_housingprices_0-2000000',
                    'neighborhood_environment_demographics_housingage_30-50',
                    'eieform_Andel']
        parameters = {'activation_1': ['relu', 'elu'],
                      'activation_2': ['relu', 'elu'],
                      'activation_3': ['relu', 'elu'],
def prepare(dataset, partition, fold, mode='OVA', output_path=DEFAULT_ROOT_OUTPUT_PATH,
            energy=0.25, cleaning_strategy='translate', selection_strategy='proportional',
            p_norm=1.0, method='sampling'):
    logging.info('Processing fold %dx%d of dataset "%s"...' % (partition, fold, dataset))

    output_path = Path(output_path) / dataset
    output_path.mkdir(parents=True, exist_ok=True)

    (X_train, y_train), (X_test, y_test) = datasets.load(dataset, partition, fold)

    header = pd.read_csv(DEFAULT_DATA_PATH / 'folds' / dataset /
                         ('%s.%d.%d.train.csv' % (dataset, partition, fold))).columns

    if mode == 'OVA':
        logging.info('Training distribution before resampling: %s.' % Counter(y_train))

        X_train, y_train = algorithms.MultiClassCCR(
            energy=energy, cleaning_strategy=cleaning_strategy,
            selection_strategy=selection_strategy, p_norm=p_norm,
            method=method).fit_sample(X_train, y_train)

        logging.info('Training distribution after resampling: %s.' % Counter(y_train))

        csv_path = output_path / ('%s.%d.%d.train.oversampled.csv'
                                  % (dataset, partition, fold))
        pd.DataFrame(np.c_[X_train, y_train]).to_csv(csv_path, index=False, header=header)
    elif mode == 'OVO':
        classes = np.unique(np.concatenate([y_train, y_test]))

        # Resample each pair of classes separately (one-vs-one).
        for i in range(len(classes)):
            for j in range(i + 1, len(classes)):
                logging.info('Resampling class %s vs. class %s.' % (classes[i], classes[j]))

                indices = ((y_train == classes[i]) | (y_train == classes[j]))
                X, y = X_train[indices].copy(), y_train[indices].copy()

                logging.info('Training distribution before resampling: %s.' % Counter(y))

                X, y = algorithms.CCR(energy=energy, cleaning_strategy=cleaning_strategy,
                                      selection_strategy=selection_strategy,
                                      p_norm=p_norm).fit_sample(X, y)

                logging.info('Training distribution after resampling: %s.' % Counter(y))

                csv_path = output_path / ('%s.%d.%d.train.oversampled.%dv%d.csv'
                                          % (dataset, partition, fold, classes[i], classes[j]))
                pd.DataFrame(np.c_[X, y]).to_csv(csv_path, index=False, header=header)
    else:
        raise NotImplementedError
def get_datasets():
    return datasets.load()