def main(configuration_file):
    """Main function to hierarchically classify new camera trap images.

    Parameters
    ----------
    configuration_file : string
        file containing all the paths
    """
    # Load configuration file
    with open(configuration_file) as yaml_config:
        config = load(yaml_config)

    # Check if all folders exist and create them if needed
    for path in config:
        if not os.path.exists(config[path]):
            os.makedirs(config[path])

    # Step 1: resize images
    ########################
    # Input: images and Agouti export file (observations)
    # Output: resized images in a folder structure similar to the original images
    resize_images(config["general_folder_path"], config["resized_folder_path"])

    # Step 2: preprocess images
    ###########################
    # Input: resized images and Agouti export (observations + assets + pickupsetup)
    # Output: file containing the coordinates of the regions of interest
    preprocessing(config["general_folder_path"], config["resized_folder_path"],
                  config["preprocessing_output_path"])

    # Step 3: extract bottleneck features using the pretrained ResNet50 network
    ############################################################################
    # Input: resized images and preprocessing output containing the box coordinates
    # Output: bottleneck features of all images
    extract_bottleneck_features(config["preprocessing_output_path"],
                                config["bottleneck_features_output_path"],
                                config["resized_folder_path"])

    # Step 4: run top model to classify the new images
    ##################################################
    # Input: extracted bottleneck features
    # Output: predicted probabilities
    hierarchical_bottleneck_predict(config["bottleneck_features_output_path"],
                                    config["weight_path"],
                                    config["predictions_output_path"])

    # Step 5: convert output probabilities to hierarchical classification
    ######################################################################
    # Input: predicted probabilities
    # Output: hierarchical classification of the sequences
    hierarchical_predictions_sequences(config["predictions_output_path"],
                                       config["bottleneck_features_output_path"])
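A minimal sketch of the configuration file this entry point expects, covering only the keys read above; the path values are hypothetical placeholders, not the project's actual layout.

import yaml

# illustrative config.yml content; every value is a folder path and main()
# creates any folder that does not exist yet
example_config = """
general_folder_path: data/images
resized_folder_path: data/resized
preprocessing_output_path: output/preprocessing
bottleneck_features_output_path: output/bottleneck_features
weight_path: models/weights
predictions_output_path: output/predictions
"""

config = yaml.safe_load(example_config)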
def main():
    database.connect(config.database)

    # run one-time initialization and preprocessing, each guarded by a lock file
    if not fsys.islocked(config.resources + "/.initialization.lock"):
        initialization()
        fsys.lock(config.resources + "/.initialization.lock")
    if not fsys.islocked(config.resources + "/.preprocessing.lock"):
        preprocessing()
        fsys.lock(config.resources + "/.preprocessing.lock")

    # start the Qt application and main window
    app = QtWidgets.QApplication([])
    window = Window()
    window.show()
    sys.exit(app_exit(app))
def main(dataset, subject, model, params, exp, mode, log, ph, plot):
    printd(dataset, subject, model, params, exp, mode, log, ph, plot)

    # retrieve model's parameters
    search = locate_search(params)
    params = locate_params(params)
    model_class = locate_model(model)

    # scale variables in minutes to the benchmark sampling frequency
    ph_f = ph // cs.freq
    hist_f = params["hist"] // cs.freq
    day_len_f = cs.day_len // cs.freq

    """ PREPROCESSING """
    train, valid, test, scalers = preprocessing(dataset, subject, ph_f, hist_f, day_len_f)

    """ MODEL TRAINING & TUNING """
    if search:
        params = find_best_hyperparameters(subject, model_class, params, search, ph_f, train, valid, test)

    raw_results = make_predictions(subject, model_class, params, ph_f, train, valid, test, mode=mode)

    """ POST-PROCESSING """
    raw_results = postprocessing(raw_results, scalers, dataset)

    """ EVALUATION """
    results = ResultsSubject(model, exp, ph, dataset, subject, params=params, results=raw_results)
    printd(results.compute_results())

    if plot:
        results.plot(0)
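A quick sketch of the minutes-to-samples conversion used above, assuming the 5-minute sampling frequency that appears elsewhere in this codebase (e.g. the 30 // 5 call in compute_glucose_distribution); the numeric values are illustrative only.

freq = 5           # sampling frequency in minutes (assumed)
ph = 30            # prediction horizon in minutes
hist = 180         # history length in minutes
day_len = 1440     # length of a day in minutes

ph_f = ph // freq            # 6 samples
hist_f = hist // freq        # 36 samples
day_len_f = day_len // freq  # 288 samples per day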
def main_standard(dataset, subject, model, params, exp, eval_set, ph):
    printd(dataset, subject, model, params, exp, eval_set, ph)

    # retrieve model's parameters
    params = locate_params(params)
    model_class = locate_model(model)

    # scale variables in minutes to the benchmark sampling frequency
    ph_f = ph // cs.freq
    hist_f = params["hist"] // cs.freq
    day_len_f = cs.day_len // cs.freq

    """ PREPROCESSING """
    train, valid, test, scalers = preprocessing(dataset, subject, ph_f, hist_f, day_len_f)

    """ MODEL TRAINING """
    raw_results = make_predictions_pclstm(subject, model_class, params, ph_f, train, valid, test, scalers,
                                          mode=eval_set)

    """ POST-PROCESSING """
    raw_results = postprocessing(raw_results, scalers, dataset)

    """ EVALUATION """
    ResultsSubject(model, exp, ph, dataset, subject, params=params,
                   results=raw_results).save_raw_results()
def main_target_training(source_dataset, target_dataset, target_subject, model, params, eval_mode, exp, plot):
    # ph_f and day_len_f are assumed to be defined at module level, like freq
    hist_f = params["hist"] // freq

    train, valid, test, scalers = preprocessing(target_dataset, target_subject, ph_f, hist_f, day_len_f)

    raw_results = make_predictions_tl(target_subject, model, params, ph_f, train, valid, test,
                                      eval_mode=eval_mode, fit=True, save_model_file=None)

    return evaluation(raw_results, scalers, source_dataset, target_dataset, target_subject, model, params, exp,
                      plot, "target_training")
def _load_subject_data(self, subject):
    if subject not in list(self.train.keys()):
        train_sbj, valid_sbj, test_sbj, scalers_sbj = preprocessing(self.dataset, subject, self.ph, self.hist,
                                                                    cs.day_len_f)
        self.train[subject] = train_sbj
        self.valid[subject] = valid_sbj
        self.test[subject] = test_sbj
        self.scalers[subject] = scalers_sbj
def run(args):
    if args.mode == 'prepare':
        preprocessing(args)
    else:
        cls = main_model.get(args.model)
        if cls is None:
            return
        model = MainModel(cls=cls, config=args)
        print('--------------------------------------------------------------')
        print(' Hierarchy level for this run: %s' % args.arrangement)
        print('--------------------------------------------------------------')
        if args.mode == 'train':
            model.train()
        elif args.mode == 'predict':
            results = model.predict(load_best_model=args.load_best_model)
            save_results(results, args)
        else:
            model.evaluate(load_best_model=args.load_best_model)
def main_target_global(source_dataset, target_dataset, target_subject, model, params, weights_exp, eval_mode, exp,
                       plot):
    hist_f = params["hist"] // freq

    weights_file = compute_weights_file(model, source_dataset, target_dataset, target_subject, weights_exp)

    train, valid, test, scalers = preprocessing(target_dataset, target_subject, ph_f, hist_f, day_len_f)

    raw_results = make_predictions_tl(target_subject, model, params, ph_f, train, valid, test,
                                      weights_file=weights_file, eval_mode=eval_mode, fit=False,
                                      save_model_file=None)

    return evaluation(raw_results, scalers, source_dataset, target_dataset, target_subject, model, params, exp,
                      plot, "target_global")
def main_cgega_iterative_training(dataset, subject, model, params1, params2, exp, eval_set, ph, save_iter=False):
    printd(dataset, subject, model, params1, params2, exp, eval_set, ph)

    # retrieve model's parameters
    params1 = locate_params(params1)
    params2 = locate_params(params2)
    model_class = locate_model(model)

    # scale variables in minutes to the benchmark sampling frequency
    ph_f = ph // cs.freq
    hist_f = params1["hist"] // cs.freq
    day_len_f = cs.day_len // cs.freq
    freq_ds = misc.datasets.datasets[dataset]["glucose_freq"]

    """ PREPROCESSING """
    train, valid, test, scalers = preprocessing(dataset, subject, ph_f, hist_f, day_len_f)

    """ MODEL TRAINING """
    dir = join(cs.path, "processing", "models", "weights", "cg_ega")
    file = join(dir, exp, model_class.__name__ + "_" + dataset + subject)

    results_test, results_valid_iter = progressive_improvement_clinical_acceptability(
        subject, model_class, params1, params2, ph, freq_ds, train, valid, test, scalers, file, eval_set)

    results_test = postprocessing(results_test, scalers, dataset)
    results_valid_iter = postprocessing_all_iter(results_valid_iter, scalers, dataset)

    ResultsSubject(model, exp, ph, dataset, subject, params=[params1, params2],
                   results=results_test).save_raw_results()
    if save_iter:
        ResultsSubjectPICA(model, exp, ph, dataset, subject, params=[params1, params2],
                           results=results_valid_iter).save_raw_results()
def run(args):
    if args.mode == 'prepare':
        preprocessing('./rawData', './data', need_punct=args.need_punct,
                      char_max_len=args.char_max_len, glove_filename=args.glove_file)
    else:
        # loading preprocessed data
        with open('./data/dataset.pkl', 'rb') as fr, \
                open('./data/embedding_matrix.pkl', 'rb') as fr_embed, \
                open('./data/char2index.json', 'r') as fr_char:
            data = pkl.load(fr)
            embedding_matrix = pkl.load(fr_embed)
            char2index = json.load(fr_char)

        train_samples = [data[k + '.xml'] for k in args.train_list]
        dev_samples = [data[k + '.xml'] for k in args.dev_list]
        test_samples = [data[k + '.xml'] for k in args.test_list]

        all_data = BatchDatasets(args.max_len, args.char_max_len, need_shuffle=args.need_shuffle,
                                 batch_size=args.batch_size, k_fold=args.k_fold,
                                 categories_num=args.categories_num, train_samples=train_samples,
                                 dev_samples=dev_samples, test_samples=test_samples)

        model = QCN(embedding_matrix=embedding_matrix, args=args, char_num=len(char2index))

        if args.mode == 'train':
            model.train(all_data, args)
        elif args.mode == 'test':
            model.test(all_data, args)
def transform(data):
    count_vect = load_count_vectorizer()
    tfidf_transformer = load_tfidf_transformer()

    preprocessed_data = preprocessing([data], remove_stopwords=True, lemmatization=True, remove_accented=True)

    train_cv = count_vect.transform(preprocessed_data)
    X_train_idf = tfidf_transformer.transform(train_cv)
    return X_train_idf
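A hedged usage sketch of transform, assuming it receives a single raw text string (the function wraps it in a list before preprocessing) and returns a TF-IDF matrix with one row; the example text is made up.

X = transform("example document text to classify")
print(X.shape)  # (1, vocabulary_size) -- one row for the single input document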
def main_target_finetuning(source_dataset, target_dataset, target_subject, Model, params, weights_exp, eval_mode,
                           exp, plot):
    hist_f = params["hist"] // freq

    weights_file = compute_weights_file(Model, source_dataset, target_dataset, target_subject, weights_exp)

    train, valid, test, scalers = preprocessing(target_dataset, target_subject, ph_f, hist_f, day_len_f)

    raw_results = make_predictions_tl(target_subject, Model, params, ph_f, train, valid, test,
                                      weights_file=weights_file, tl_mode="target_finetuning",
                                      eval_mode=eval_mode)

    evaluation(raw_results, scalers, source_dataset, target_dataset, target_subject, Model, params, exp, plot,
               "target_finetuning")
def end_to_end(source_dataset, target_dataset, target_subject, model, params, weights_exp, eval_mode, exp, plot):
    hist_f = params["hist"] // freq

    save_file = compute_weights_file(model, source_dataset, target_dataset, target_subject, weights_exp)

    # train the source (multi-dataset) model and save its weights
    train_m, valid_m, test_m, scalers_m = preprocessing_source_multi(source_dataset, target_dataset,
                                                                     target_subject, ph_f, hist_f, day_len_f)
    make_predictions_tl(target_subject, model, params, ph_f, train_m, valid_m, test_m, eval_mode=eval_mode,
                        fit=True, save_model_file=save_file)

    # evaluate the saved weights on the target subject without fine-tuning ("target_global") ...
    train, valid, test, scalers = preprocessing(target_dataset, target_subject, ph_f, hist_f, day_len_f)
    raw_results = make_predictions_tl(target_subject, model, params, ph_f, train, valid, test,
                                      weights_file=save_file, eval_mode=eval_mode, fit=False,
                                      save_model_file=None)
    evaluation(raw_results, scalers, source_dataset, target_dataset, target_subject, model, params, exp, plot,
               "target_global")

    # ... then fine-tune on the target subject ("target_finetuning")
    raw_results_2 = make_predictions_tl(target_subject, model, params, ph_f, train, valid, test,
                                        weights_file=save_file, eval_mode=eval_mode, fit=True,
                                        save_model_file=None)
    return evaluation(raw_results_2, scalers, source_dataset, target_dataset, target_subject, model, params, exp,
                      plot, "target_finetuning")
# LightGBM cluster
lgb_cls_train = pd.read_csv(path_train + "lgb_cls_val.csv")
lgb_cls_test = pd.read_csv(path_test + "lgb_cls_test.csv")

# LightGBM standard (this read is assumed from the merge below; the filename follows
# the naming pattern of the other model outputs)
lgb_std_train = pd.read_csv(path_train + "lgb_std_val.csv")

# Catboost standard
cat_std_train = pd.read_csv(path_train + "catboost_val.csv")
cat_std_test = pd.read_csv(path_test + "catboost_test.csv")

# XGBoost Incremental
xgb_train = pd.read_csv(path_train + "xgb_inc_val.csv")
xgb_test = pd.read_csv(path_test + "xgb_inc_test.csv")

train = pd.read_csv("../dataset/original/train.csv")
test = pd.read_csv("../dataset/original/x_test.csv")

df = preprocessing(train, test, useTest=False)
df_scope = df[['Date', 'sku', 'scope']].copy()

# Train
prediction_train = cat_std_train.merge(
    lgb_std_train, how='left', on=['Date', 'sku', 'target', 'real_target'])
prediction_train = lgb_cls_train.merge(
    prediction_train, how='left', on=['Date', 'sku', 'target', 'real_target'])
prediction_train = prediction_train.merge(
    xgb_train, how='left', on=['Date', 'sku', 'target', 'real_target'])

prediction_train.Date = pd.to_datetime(prediction_train.Date)
prediction_train = prediction_train.merge(df[['Date', 'sku', 'scope']],
                                          how='left', on=['Date', 'sku'])
def main():
    # Read run argument for configuration
    config_path = sys.argv[1]
    cfg_dict = op_util.get_all_configs(config_path)
    cfg_dict['t_date'] = pd.to_datetime("today").strftime("%Y_%m_%d")
    cfg_dict = op_util.get_te_window_from_cfg(cfg_dict)

    # tdict is a dictionary of objects
    # tmap keeps track of the process stage
    tdict = {}
    tmap = {}

    "General preprocessing"
    tmap['general_preprocessing'] = 1
    tdict[0] = attr_gen_preprocess(cfg_dict['DATA_FILE_DICT'], {})
    tdict[1] = attr_gen_preprocess({}, cfg_dict)
    gen_preproc = general_preprocess(1, tdict, tmap)
    gen_preproc.run()
    print(gen_preproc.data_plus_meta_[1].data_)

    "Preprocessing"
    tdict = gen_preproc.data_plus_meta_
    tmap = gen_preproc.racks_map_
    tmap['preprocessing'] = 2
    cfg_dict = tdict[1].config_
    tdict[2] = attr_preprocess({}, cfg_dict)
    preprocess = preprocessing(2, tdict, tmap)
    preprocess.run()
    print(preprocess.data_plus_meta_[2].data_)

    "Imputation"
    tdict = preprocess.data_plus_meta_
    tmap = preprocess.racks_map_
    tmap['imputation'] = 3
    cfg_dict = tdict[2].config_
    tdict[3] = attr_imputation({}, cfg_dict)
    impute = imputation(3, tdict, tmap)
    impute.run()
    print(impute.data_plus_meta_[3].data_)

    "Enrichment"
    tdict = impute.data_plus_meta_
    tmap = impute.racks_map_
    tmap['enrich_data'] = 4
    cfg_dict = tdict[3].config_
    tdict[4] = attr_enrich_data({}, cfg_dict)
    enrich = enrich_data(4, tdict, tmap)
    enrich.run()
    print(enrich.data_plus_meta_[4].data_)

    "Splitting"
    tdict = enrich.data_plus_meta_
    tmap = enrich.racks_map_
    tmap['split'] = 5
    cfg_dict = tdict[4].config_
    tdict[5] = attr_split_data({}, cfg_dict)
    split = split_data(5, tdict, tmap)
    split.run()
    print(split.data_plus_meta_[5].data_.train_set_dict_, split.data_plus_meta_[5].data_.validate_set_dict_)

    "Sampling"
    tdict = split.data_plus_meta_
    tmap = split.racks_map_
    tmap['sample'] = 6
    cfg_dict = tdict[5].config_
    tdict[6] = attr_sample_data({}, cfg_dict)
    sample = sample_data(6, tdict, tmap)
    sample.run()
    print(sample.data_plus_meta_[6].data_.train_set_dict_, sample.data_plus_meta_[6].data_.validate_set_dict_,
          sample.data_plus_meta_[6].data_.predict_set_dict_)

    "FeatureSelection"
    tdict = sample.data_plus_meta_
    tmap = sample.racks_map_
    tmap['feature_select'] = 7
    cfg_dict = tdict[6].config_
    tdict[7] = attr_feature_select({}, cfg_dict)
    select_feature = feature_select(7, tdict, tmap)
    select_feature.run()
    print(select_feature.data_plus_meta_[7].data_.train_set_dict_,
          select_feature.data_plus_meta_[7].data_.validate_set_dict_,
          select_feature.data_plus_meta_[7].data_.predict_set_dict_)
def main():
    # the features which should be used.
    feature_names = [
        # Features.Face_count,
        # Features.Rot_distance,
        # Features.Face_bb,
        # Features.Face_bb_full_img,
        # Features.Face_bb_quarter_imgs,
        # Features.Face_bb_eighth_imgs,
        # Features.Tilted_edges,
        # Features.Edge_hist_v0,
        # Features.Edge_hist_v1,
        # Features.Edge_hist_v2,
        # Features.Symmetry,
        # Features.Hsv_hist,
        Features.DenseSIFT_L0,
        # Features.DenseSIFT_L1,
        # Features.DenseSIFT_L2,
        # Features.Hog_L0,
        # Features.Hog_L1,
        # Features.Hog_L2,
        # Features.Lbp_L0,
        # Features.Lbp_L1,
        # Features.Lbp_L2,
        Features.Gist,
        # Features.CNN_fc7,
        # Features.CNN_prob
    ]

    runname = 1
    do_preprocessing = False  # use this only at your first run on the dataset
    calc_features = False  # calculates the selected features
    use_second_dev_classification_method = False  # True: classifies with the second-order deviation method

    global dir_root
    # the root directory of your data (raw string to keep the Windows backslashes intact)
    dir_root = r'C:\Users\Andreas\Desktop\prvc\InterestingnessData2016'

    #######################
    ###STOP EDITING HERE###
    #######################

    # root directories for training and test data
    dir_training_data = os.path.join(dir_root, 'devset')
    dir_test_data = os.path.join(dir_root, 'testset')

    # dicts containing path to images as keys and ground truth as values
    img_dirs_training = read_img_dirs_and_gt(dir_training_data)
    img_dirs_test = read_img_dirs(dir_test_data)

    # preprocessing
    if do_preprocessing:
        prvc_preprocessing.preprocessing(img_dirs_training.keys())
        prvc_preprocessing.preprocessing(img_dirs_test)
        print('preprocessing finished.')

    # calculate features
    if calc_features:
        features_train = feature_calculation.calc_features(img_dirs_training.keys(), feature_names)
        features_test = feature_calculation.calc_features(img_dirs_test, feature_names)
        print('feature calculation finished.')
    else:
        # load features from file
        features_train = feature_files.load_features(img_dirs_training.keys(), feature_names)
        features_test = feature_files.load_features(img_dirs_test, feature_names)
        print('features loaded.')

    if Features.Face_bb in feature_names:
        # bring bounding box feature matrices to the same shape:
        # find the matrix with the most columns and reshape the other matrix before concatenating them
        features_train = make_face_bb_equal_col_size(features_train)
        features_test = make_face_bb_equal_col_size(features_test)
        features_train, features_test = make_face_bb_train_test_equal_col_size(features_train, features_test)

    X_trains = gen_feature_matrices_per_feature(features_train)
    X_tests = gen_feature_matrices_per_feature(features_test)

    # scale features (because svm is not scale invariant)
    X_trains_scaled = scale_features(X_trains)
    X_tests_scaled = scale_features(X_tests)

    # generate final feature matrix
    X_train = gen_final_feature_matrix(X_trains)
    X_test = gen_final_feature_matrix(X_tests)
    X_train_scaled = gen_final_feature_matrix(X_trains_scaled)
    X_test_scaled = gen_final_feature_matrix(X_tests_scaled)

    # DEBUG save
    # np.savetxt(r'C:\Users\Andreas\Desktop\X_train_fc7.txt.gz', X_train)
    # np.savetxt(r'C:\Users\Andreas\Desktop\X_train_fc7.txt.gz_scaled.txt.gz', X_train_scaled)
    # np.savetxt(r'C:\Users\Andreas\Desktop\X_test_fc7.txt.gz', X_test)
    # np.savetxt(r'C:\Users\Andreas\Desktop\X_test_fc7.txt.gz_scaled.txt.gz', X_test_scaled)

    # get interestingness
    y_train = get_target_vec(img_dirs_training)

    # upsampling of class 'interesting' via SMOTE
    # sm = SMOTE()
    # X_train_upsampled, y_train_upsampled = sm.fit_sample(X_train, y_train)
    # X_train = X_train_upsampled
    # y_train = y_train_upsampled

    # train and test svm: the same experiment was repeated for the SVM regularization
    # parameters C in {0.125, 0.25, 0.5, 1, 2, 4, 8, 16}, each time on the unscaled and
    # the scaled features, and every result was written out as submissions 1-16
    # for i, C in enumerate([0.125, 0.25, 0.5, 1, 2, 4, 8, 16], start=1):
    #     svc = svm.SVC(kernel='rbf', C=C, class_weight='balanced')
    #     results = predict(svc, X_train, y_train, X_test, features_test,
    #                       use_second_dev_classification_method)
    #     results_scaled = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test,
    #                              use_second_dev_classification_method)
    #     save_submission.save_submission(gen_submission_format(results), 2 * i - 1)
    #     save_submission.save_submission(gen_submission_format(results_scaled), 2 * i)

    # LAPI settings for HSVHist + GIST --- MAP should be 0.1714
    # print("svm.SVC(kernel='poly', degree=18, gamma=2, class_weight={1: 10})")
    # svc = svm.SVC(kernel='poly', degree=18, gamma=2, class_weight={1: 10})
    # results = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    # results_scaled = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test,
    #                          use_second_dev_classification_method)
    # svc = svm.SVC(kernel='poly', degree=18, gamma=2)

    # LAPI settings for DSIFT + GIST --- MAP should be 0.1398
    print("svm.SVC(kernel='poly', degree=3, gamma=32, class_weight={1: 10})")
    svc = svm.SVC(kernel='poly', degree=3, gamma=32, class_weight={1: 10})
    # svc = svm.SVC(kernel='poly', degree=3, gamma=32)
    results = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    results_scaled = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test,
                             use_second_dev_classification_method)

    print("svm.SVC(kernel='poly', degree=3, gamma=32, class_weight='balanced')")
    svc = svm.SVC(kernel='poly', degree=3, gamma=32, class_weight='balanced')
    results_2 = predict(svc, X_train, y_train, X_test, features_test, use_second_dev_classification_method)
    results_scaled_2 = predict(svc, X_train_scaled, y_train, X_test_scaled, features_test,
                               use_second_dev_classification_method)

    print("save results")
    submission_format = gen_submission_format(results)
    save_submission.save_submission(submission_format, 1)
    submission_format = gen_submission_format(results_scaled)
    save_submission.save_submission(submission_format, 2)
    submission_format = gen_submission_format(results_2)
    save_submission.save_submission(submission_format, 3)
    submission_format = gen_submission_format(results_scaled_2)
    save_submission.save_submission(submission_format, 4)

    '''
    # read ground truth of testset
    img_dirs_test = read_img_dirs_and_gt(dir_test_data)
    y_test = get_target_vec(img_dirs_test)

    print('UNSCALED')

    print('LAPI 1:10')
    svc = svm.SVC(kernel='poly', degree=18, gamma=2, class_weight={1: 10})
    scores = cross_val_score(svc, X_test, y_test, cv=3, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print('LAPI')
    svc = svm.SVC(kernel='poly', degree=18, gamma=2)
    scores = cross_val_score(svc, X_test, y_test, cv=10, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print('LAPI c=0.1')
    svc = svm.SVC(kernel='poly', degree=18, gamma=2, C=0.1)
    scores = cross_val_score(svc, X_test, y_test, cv=10, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print('LAPI balanced')
    svc = svm.SVC(kernel='poly', degree=18, gamma=2, class_weight='balanced')
    scores = cross_val_score(svc, X_test, y_test, cv=10, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print('LAPI like libsvm balanced')
    svc = svm.SVC(kernel='poly', degree=18, gamma=2, class_weight='balanced', cache_size=100)
    scores = cross_val_score(svc, X_test, y_test, cv=10, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print('LAPI like libsvm 1 to 10')
    svc = svm.SVC(kernel='poly', degree=18, gamma=2, class_weight={1: 10}, cache_size=100)
    scores = cross_val_score(svc, X_test, y_test, cv=10, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    print('LAPI like libsvm 1 to 10, C=0.25')
    svc = svm.SVC(kernel='poly', C=0.25, degree=18, gamma=2, class_weight={1: 10}, cache_size=100)
    scores = cross_val_score(svc, X_test, y_test, cv=10, scoring='average_precision')
    print("Mean Average Precision: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    '''

    print("finished.")
def main(dataset, subject, Model, params, ph, eval="valid", print=True, plot=False, save=True, excel_file=None):
    printd(dataset, subject, Model.__name__)

    file = os.path.join("data", "dynavolt", dataset, dataset + "_subject" + subject + ".csv")

    """ PREPROCESSING """
    train_sets, valid_sets, test_sets, norm_min, norm_max = preprocessing(file, misc.hist, ph, misc.freq, misc.cv)

    # TODO REMOVE - one split testing
    # split_number = 7
    # train_sets, valid_sets, test_sets = [train_sets[split_number]], [valid_sets[split_number]], [
    #     test_sets[split_number]]
    # norm_min, norm_max = [norm_min[split_number]], [norm_max[split_number]]

    """ CROSS-VALIDATION """
    results = []
    for i, [train, valid, test] in enumerate(zip(train_sets, valid_sets, test_sets)):
        train_x, train_y = train.iloc[:, :-2], train.iloc[:, -2:]
        valid_x, valid_y = valid.iloc[:, :-2], valid.iloc[:, -2:]
        test_x, test_y = test.iloc[:, :-2], test.iloc[:, -2:]

        model = Model(params)
        if Model.__name__ in misc.nn_models:
            model.fit(x_train=train_x, y_train=train_y, x_valid=valid_x, y_valid=valid_y)
        else:
            model.fit(x=train_x, y=train_y)

        if eval == "valid":
            y_true, y_pred = model.predict(x=valid_x, y=valid_y)
        elif eval == "test":
            y_true, y_pred = model.predict(x=test_x, y=test_y)

        results.append(np.c_[y_true, y_pred])

    """ POST-PROCESSING """
    results = postprocessing(results.copy(), hist=misc.hist, ph=misc.ph, freq=misc.freq, min=norm_min, max=norm_max)

    """ EVALUATION """
    res = Results(Model.__name__, misc.ph, dataset, subject, misc.freq, results=np.array(results))
    metrics = res.get_results()
    if print:
        printd(metrics)
    if save:
        res.save()
    if plot:
        res.plot()
    if excel_file is not None:
        res.to_excel(params, len(res.results), file_name=excel_file)
def compute_glucose_distribution(dataset, train_valid_or_test="train", plot=False, save=False,
                                 hypo_hyper_stats=False):
    """ load data """
    glucose = []
    for subject in misc.datasets.datasets[dataset]["subjects"]:
        glucose_sbj = []
        train, valid, test, scalers = preprocessing(dataset, subject, 30 // 5, 180 // 5, 1440 // 5)

        if train_valid_or_test == "train":
            set = train
        elif train_valid_or_test == "valid":
            set = valid
        elif train_valid_or_test == "test":
            set = test

        for set_i, scalers_i in zip(set, scalers):
            glucose_sbj.append(set_i.y.values * scalers_i.scale_[-1] + scalers_i.mean_[-1])
        glucose.append(glucose_sbj)

    """ create average subject histograms """
    nbins = 40
    n_sbj = []
    for glucose_sbj in glucose:
        n_split = []
        for glucose_sbj_split in glucose_sbj:
            (n, bins, _) = plt.hist(glucose_sbj_split, bins=nbins, range=[0, 400], density=True, stacked=True)
            plt.close()
            n_split.append(n)
        n_sbj.append(np.mean(n_split, axis=0))

    """ compute distributions """
    n_arr = np.array(n_sbj) * 400 / nbins
    middle_bins = ((bins[1:] + bins[:-1]) / 2)
    mean = np.mean(n_arr, axis=0)
    std = np.std(n_arr, axis=0)

    """ plot """
    if plot:
        plt.figure()
        plt.plot(middle_bins, mean, color='#CC4F1B')
        plt.fill_between(middle_bins, mean - std, mean + std, alpha=0.5, edgecolor='#CC4F1B', facecolor='#FF9848')
        plt.title("Distribution of glucose samples for the " + dataset + " dataset.")
        plt.xlabel("glucose [mg/dL]")
        plt.ylabel("probability")

    """ save """
    if save:
        df = pd.DataFrame(data=np.c_[middle_bins, mean, mean - std, mean + std],
                          columns=["middle_bins", "mean", "plus-std", "minus-std"])
        df.to_csv(path.join(cs.path, "tmp", "figures_data",
                            "glucose_distribution_" + dataset + "_" + train_valid_or_test + ".dat"),
                  index_label="index")

    """ hypo hyper stats """
    if hypo_hyper_stats:
        # percentage of samples in hypoglycemia (<= 70 mg/dL) and hyperglycemia (>= 180 mg/dL)
        print(np.sum(mean[np.where(bins <= 70)[0][:-1]]) * 100)
        print(np.sum(mean[np.where(bins >= 180)[0][:-1]]) * 100)

    return n_arr, bins, middle_bins, mean, std
def main(dataset, subject, model, params, exp, mode, log, ph, plot, save=False):
    printd(dataset, subject, model, params, exp, mode, log, ph, plot)

    # retrieve model's parameters
    search = locate_search(params)
    params = locate_params(params)
    model_class = locate_model(model)

    # scale variables in minutes to the benchmark sampling frequency
    ph_f = ph // cs.freq
    hist_f = params["hist"] // cs.freq
    day_len_f = cs.day_len // cs.freq

    """ PREPROCESSING """
    train, valid, test, scalers = preprocessing(dataset, subject, ph_f, hist_f, day_len_f)

    start = time.time()

    """ MODEL TRAINING & TUNING """
    if search:
        params = find_best_hyperparameters(subject, model_class, params, search, ph_f, train, valid, test)

    if save:
        dir = os.path.join(cs.path, "processing", "models", "weights", model_class.__name__, exp)
        file = os.path.join(dir, model_class.__name__ + "_" + dataset + subject)
    else:
        file = None

    raw_results = make_predictions(subject, model_class, params, ph_f, train, valid, test, mode=mode,
                                   save_model_file=file)

    """ POST-PROCESSING """
    raw_results = postprocessing(raw_results, scalers, dataset)

    """ EVALUATION """
    results = ResultsSubject(model, exp, ph, dataset, subject, params=params, results=raw_results)
    printd(results.compute_mean_std_results())

    end = time.time()
    printd("Time elapsed : " + str(end - start) + " seconds")

    if plot:
        results.plot(0)
def run_main(model_params, useTest=False, useScope=True, save=False, completeCV=False, dataAugm=True,
             drop_cols=[], cluster=None, name='', categorical_features=['sku', 'pack', 'brand']):
    abs_path = Path(__file__).absolute().parent
    train_path = os.path.join(abs_path, "dataset/original/train.csv")
    test_path = os.path.join(abs_path, "dataset/original/x_test.csv")
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    useTest = useTest
    useScope = useScope
    isEvaluation = False
    useSampleWeights, weights_type = True, 2
    save = save
    completeCV = completeCV
    dataAugm = dataAugm

    if completeCV:
        useTest = False
        useScope = False

    df = preprocessing(train, test, useTest=useTest, dataAugmentation=dataAugm)
    df, categorical_f = add_all_features(df)
    categorical_f = list(set(categorical_features + categorical_f))

    drop_cols = drop_cols
    categorical_f = [x for x in categorical_f if x not in drop_cols]

    df = df.sort_values('Date')

    # --------------- Model -----------------
    CLUSTER = cluster
    NAME = name

    if NAME == 'lgb_std' or NAME == 'lgb_cls':
        model = LightGBM(**model_params)
    elif NAME == 'catboost':
        model = CatBoost(**model_params)

    print('Start the model ' + NAME)
    model = model

    model_gen = Generator(df, model,
                          categorical_features=categorical_f,
                          drop_columns=drop_cols,
                          isScope=useScope,
                          sample_weights_type=weights_type,
                          evaluation=isEvaluation,
                          useTest=useTest,
                          cluster=CLUSTER,
                          name=NAME,
                          completeCV=completeCV,
                          dataAugmentation=dataAugm)

    model_gen.run_generator(save)
    model_gen.plot_feature_importance()
    print(model_gen.compute_MAPE())
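A hedged invocation sketch for run_main; per the branches above, name must be one of 'lgb_std', 'lgb_cls' or 'catboost', and the model_params shown here are hypothetical, untuned values passed straight through to the LightGBM wrapper.

example_params = {
    'learning_rate': 0.1,   # hypothetical value
    'num_leaves': 31,       # hypothetical value
}
run_main(example_params, useTest=False, useScope=True, save=False,
         completeCV=False, dataAugm=True, name='lgb_std')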
                       'TEICOPLANINA_.MG.', 'TIGECICLINA_.MG.', 'TOBRAMICINA_.MG.',
                       'TOBRAMICINA_NEB_.MG.', 'VANCOMICINA_.MG.']

# Load data
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test_challenge.csv')

# ------------------------------------------------
# Preprocessing
# ------------------------------------------------

# General preprocessing
df_train_cln = preprocessing(df_train)
df_test_cln = preprocessing(df_test)

# PCA
resulting_features_names = ['PC1_DIAGNOSTIC', 'PC2_DIAGNOSTIC']
pc_diagnosis = PCA_r(df_train_cln, features_diagnostic, 2, resulting_features_names)

resulting_features_names = ['PC1_ANTIBIOTIC', 'PC2_ANTIBIOTIC']
pc_antibiotics = PCA_r(df_train_cln, features_antibiotic, 2, resulting_features_names)

# Adding PCA columns to original dataset
df_train_cln = pd.concat([df_train_cln, pc_diagnosis, pc_antibiotics], axis=1)

# ------------------------------------------------
# Modelling
# ------------------------------------------------
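A hedged sketch of the dimensionality reduction this step performs, using scikit-learn's PCA directly instead of the project's PCA_r helper (whose internals are not shown here); the fillna(0) handling is an assumption.

from sklearn.decomposition import PCA
import pandas as pd

# illustrative only: reduce the antibiotic dosage columns to two principal components,
# mirroring what PCA_r appears to do above
pca = PCA(n_components=2)
components = pca.fit_transform(df_train_cln[features_antibiotic].fillna(0))
pc_antibiotics_sketch = pd.DataFrame(components,
                                     columns=['PC1_ANTIBIOTIC', 'PC2_ANTIBIOTIC'],
                                     index=df_train_cln.index)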
def run_xgboost(useTest=False, useScope=False, completeCV=False, dataAugm=False, save=True):
    train = pd.read_csv("../dataset/original/train.csv")
    test = pd.read_csv("../dataset/original/x_test.csv")

    useTest = useTest
    useScope = useScope
    isEvaluation = False
    useSampleWeights, weights_type = True, 2
    save = save
    completeCV = completeCV  # To get predictions on the train set, set this to True: it starts from the first
    # week of the train set and predicts every following week, growing the train set incrementally
    dataAugm = dataAugm  # Builds the 2016 data: recommended to set it to True when completeCV = True, so that
    # the algorithm does not train on only the first week of the original train set but on all of 2016 [52 weeks]

    if isEvaluation:
        useTest = False
        useScope = False
    if completeCV:
        useTest = False
        useScope = False

    df = preprocessing(train, test, useTest=useTest, dataAugmentation=dataAugm)
    df, categorical_f = add_all_features(df)
    categorical_f = ['sku', 'pack', 'brand'] + categorical_f

    df = df.sort_values('Date')
    df_scope = df[['Date', 'sku', 'scope']].copy()

    def wmape_train_(y_true, data):
        """
        IMPORTANT: first sort the rows of the df by ('sku', 'Date'): df.sort_values(['sku', 'Date']).
        Give less importance to previous [in time] values, exponentially.
        :param y_true:
        :param y_pred:
        :return:
        """
        # global s
        y_true = np.array(y_true)
        y_pred = data.get_label()
        N = int(y_true.shape[0] / 133)
        weight = np.arange(y_true.shape[0])
        weight = weight % N
        weight = weight / N
        grad = -100 * ((y_true - y_pred) / y_true) * (np.exp(weight))
        hess = 100 / (y_true) * (np.exp(weight))
        return grad, hess

    def ohe_categorical(df, categorical_features):
        for c in categorical_features:
            dummy = pd.get_dummies(df[c], prefix=c)
            df[dummy.columns] = dummy
        return df

    df = ohe_categorical(df, ['cluster', 'heavy_light'])

    cat_cols = ['pack', 'brand', 'scope', 'heavy_light', 'cluster', 'year']
    df = df.drop(cat_cols, axis=1)

    if useTest:
        df = df.sort_values('Date')
        test_dates = df[df.Date >= '2019-06-29']
        test_dates = test_dates.drop_duplicates('Date').Date
        gen = dfs_gen(df, test_dates)
    else:
        train = df[~df.target.isna()]
        if completeCV:
            if dataAugm:
                dates = train[train.Date >= '2016-12-10'].Date.sort_values().drop_duplicates(keep='first')
            else:
                dates = train.Date.sort_values().drop_duplicates(keep='first')
            val_dates = dates[1:]
        else:
            _, _, val_dates = train_validation_split(train)
        gen = dfs_gen(train, val_dates)

    params = {
        'obj': wmape_train_,
        'learning_rate': 0.1,
        'max_depth': 10,
        # 'min_child_weight': 3,
        # 'tree_method': 'hist'
    }

    # RUNNING MODEL
    prediction_df = pd.DataFrame()
    feature_importances = []
    prev_df_test = pd.DataFrame()
    drop_target = ['real_target', 'target', 'Date', 'sku']
    xgb_model = None

    for i, (df_train, df_test) in enumerate(gen):
        if i == 0:
            xgb_model = xgb.train(params,
                                  dtrain=xgb.DMatrix(df_train.drop(drop_target, axis=1), df_train.target),
                                  num_boost_round=700)
            feature_importances.append(xgb_model.get_fscore())
        else:
            # xgb_model.fit(prev_df_test.drop(drop_target, axis=1), prev_df_test.target,
            #               xgb_model='xgb_model_online.model')
            params.update({
                # 'learning_rate': 0.05,
                'updater': 'refresh',
                'process_type': 'update',
                'refresh_leaf': True,
                # 'reg_lambda': 3,  # L2
                # 'reg_alpha': 3,   # L1
                'silent': False,
            })
            xgb_model = xgb.train(params,
                                  dtrain=xgb.DMatrix(df_train.drop(drop_target, axis=1), df_train.target),
                                  num_boost_round=400,
                                  xgb_model=xgb_model)

        df_test['prediction'] = xgb_model.predict(xgb.DMatrix(df_test.drop(drop_target, axis=1)))
        # print(df_test[['Date', 'sku', 'target', 'prediction']])
        # xgb_model.save_model('xgb_model_online.model')

        prediction_df = pd.concat([prediction_df,
                                   df_test[['Date', 'sku', 'real_target', 'target', 'prediction']]])
        prev_df_test = df_test.drop(['prediction'], axis=1).copy()
        feature_importances.append(xgb_model.get_fscore())

    prediction_df['real_prediction'] = np.expm1(prediction_df.prediction)
    prediction_df = prediction_df.merge(df_scope, how='left', on=['Date', 'sku'])

    if not useTest:
        train = df[~df.target.isna()]
        _, _, val_dates = train_validation_split(train)
        mask_val = (prediction_df.Date.isin(val_dates)) & (prediction_df.scope == 1)
        print(f'MAPE {MAPE(prediction_df[mask_val].real_target, prediction_df[mask_val].real_prediction)}')

    if save:
        if useTest:
            prediction_df.drop('scope', axis=1).to_csv("../dataset/prediction/test/xgb_inc_test.csv", index=False)
        else:
            if completeCV:
                prediction_df.drop('scope', axis=1).to_csv("../dataset/prediction/val/xgb_inc_val.csv",
                                                           index=False)

    plt.figure(figsize=(20, 10))
    feat_imp = {k: v for k, v in sorted(feature_importances[1].items(), key=lambda item: item[1])}
    x = list(feat_imp.keys())
    y = list(feat_imp.values())
    plt.barh(x, y)
    plt.show()
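A small numeric sketch of the exponential sample weighting inside wmape_train_, assuming a toy setup of 133 SKUs with 4 dates each and rows sorted by ('sku', 'Date') as the docstring requires; the counts are illustrative only.

import numpy as np

n_rows = 4 * 133                      # 133 skus, 4 dates each
N = int(n_rows / 133)                 # rows per sku block -> 4
weight = (np.arange(n_rows) % N) / N  # restarts at 0 at the start of every sku block

print(weight[:4])          # [0.   0.25 0.5  0.75]  oldest -> newest date within a sku
print(np.exp(weight[:4]))  # [1.   1.28 1.65 2.12]  multiplier applied to grad and hess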
def main(model, useTest, useScope, save, completeCV, dataAugm,
         categorical_features=['cluster', 'sku', 'pack', 'brand'],
         drop_cols=['scope', 'Date', 'real_target', 'pack', 'size (GM)', 'cluster'],
         cluster=None, name='', useSampleWeights=True, weights_type=2, isEvaluation=False, rand_noise=False):
    train = pd.read_csv("dataset/original/train.csv")
    test = pd.read_csv("dataset/original/x_test.csv")

    useTest = useTest
    useScope = useScope
    isEvaluation = isEvaluation
    useSampleWeights, weights_type = useSampleWeights, weights_type
    save = save
    completeCV = completeCV  # To get predictions on the train set, set this to True: it starts from the first
    # week of the train set and predicts every following week, growing the train set incrementally
    dataAugm = dataAugm  # Builds the 2016 data: recommended to set it to True when completeCV = True, so that
    # the algorithm does not train on only the first week of the original train set but on all of 2016 [52 weeks]
    rand_noise = rand_noise

    if isEvaluation:
        useTest = False
        useScope = False
    if completeCV:
        useTest = False
        useScope = False

    df = preprocessing(train, test, useTest=useTest, dataAugmentation=dataAugm, rand_noise=rand_noise)
    df, categorical_f = add_all_features(df)
    # categorical_f = ['sku', 'pack', 'brand'] + categorical_f
    categorical_f = categorical_features
    drop_cols = drop_cols

    df = df.sort_values('Date')

    # --------------- Model -----------------
    # drop_cols = ['scope', 'Date', 'real_target', 'pack', 'size (GM)', 'cluster']
    categorical_f = [x for x in categorical_f if x not in drop_cols]
    # df = ohe_categorical(df, [c for c in categorical_f if c != 'sku'])  # Use this to one-hot encode the categorical features

    # CLUSTER = [1, 2, 3]
    # Set CLUSTER = None if you do NOT want to consider any cluster
    CLUSTER = cluster
    NAME = name

    print('Start the model ' + NAME)
    model = model

    model_gen = Generator(df, model,
                          categorical_features=categorical_f,
                          drop_columns=drop_cols,
                          isScope=useScope,
                          sample_weights_type=weights_type,
                          evaluation=isEvaluation,
                          useTest=useTest,
                          cluster=CLUSTER,
                          name=NAME,
                          completeCV=completeCV,
                          dataAugmentation=dataAugm)

    model_gen.run_generator(save)
    print(model_gen.compute_MAPE())
completeCV = False  # To get predictions on the train set, set this to True: it starts from the first week of the
# train set and predicts every following week, growing the train set incrementally
dataAugm = False  # Builds the 2016 data: recommended to set it to True when completeCV = True, so that the
# algorithm does not train on only the first week of the original train set but on all of 2016 [52 weeks]

if isEvaluation:
    useTest = False
    useScope = False
if completeCV:
    useTest = False
    useScope = False

df = preprocessing(train, test, useTest=useTest, dataAugmentation=dataAugm)
df, categorical_f = add_all_features(df)
categorical_f = ['sku', 'pack', 'brand'] + categorical_f

df = df.sort_values('Date')

# --------------- Model -----------------
drop_cols = ['scope', 'Date', 'real_target', 'pack', 'size (GM)', 'cluster']
categorical_f = [x for x in categorical_f if x not in drop_cols]

# CLUSTER = [1, 2, 3]
# Set CLUSTER = None if you do NOT want to consider any cluster
CLUSTER = None
NAME = 'lightgbm'

params = {
def run_gte_feature():
    abs_path = Path(__file__).absolute().parent
    train_path = os.path.join(abs_path, "../dataset/original/train.csv")
    test_path = os.path.join(abs_path, "../dataset/original/x_test.csv")
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    df = preprocessing(train, test, useTest=True, dataAugmentation=True)
    # df, categorical_f = add_all_features(df)

    df_cluster = get_cluster()
    df = df.merge(df_cluster, how='left', on='sku')

    def simple_gen(df):
        df = df.sort_values('Date')
        dates = df[df.Date >= '2016-12-10']['Date'].drop_duplicates().values
        dates = dates[1:]
        for d in dates:
            yield df[df.Date < d], df[df.Date == d]

    gen = simple_gen(df)

    group_and_priors = {
        ('pack'): None,
        ('brand'): None,
        ('cluster'): None,
        ('pack', 'brand'): ['gte_pack', 'gte_brand'],
        ('pack', 'cluster'): ['gte_pack', 'gte_cluster'],
        ('brand', 'cluster'): ['gte_brand', 'gte_cluster'],
        ('pack', 'brand', 'cluster'): ['gte_pack_brand', 'gte_pack_cluster'],
    }

    df_gte = pd.DataFrame()
    window = 8
    prior_precision = 50

    for t, v in tqdm(gen):
        date = v.Date.drop_duplicates(keep='first')
        features = []
        for group_cols, prior_cols in group_and_priors.items():
            if isinstance(group_cols, str):
                f_name = "gte_" + group_cols
            else:
                f_name = "gte_" + '_'.join(group_cols)
            features.append(f_name)

            gte = GaussianTargetEncoder(group_cols, 'target', prior_cols)

            dates = t.Date.drop_duplicates()
            if len(dates) > window:
                t = t[t.Date.isin(dates[-window:])]

            # print(f'Encoding Train: days < {date} : rows {t.shape[0]}')
            t.loc[:, features[-1]] = gte.fit_transform(t, prior_precision=prior_precision, window=window)
            # print(f'Encoding Validation = {date} \n')
            v.loc[:, features[-1]] = gte.transform(v, prior_precision=prior_precision)

        df_gte = pd.concat([df_gte, v])

    gte_cols = [x for x in df_gte.columns if 'gte' in x]
    save_path = os.path.join(abs_path, f"gte_features_w{window}_prp{prior_precision}.csv")
    df_gte[['Date', 'sku', 'target', 'real_target'] + gte_cols].to_csv(save_path, index=False)