import numpy as np
import pandas as pd

# Project-local helpers; the module path for NeuralNetwork is an assumption.
import plotting_tools
import preprocessing
from neural_network import NeuralNetwork  # assumed module name


def run():
    df = pd.read_csv('spambase_data\\spambase.data', header=None)
    df = df.sample(frac=1).reset_index(drop=True)
    print(f'No. missing values: {df.isnull().sum().sum()}')

    X = df.drop(57, axis=1)
    y = df.loc[:, 57]

    # Feature selection
    abs_corr_w_target = X.apply(
        lambda col: col.corr(y)).abs().sort_values().to_frame()
    abs_corr = X.corr().abs()

    plotting_tools.plot_heatmap(abs_corr_w_target,
                                title='Correlation with target',
                                size=(8, 16),
                                one_dim=True)
    plotting_tools.plot_heatmap(abs_corr,
                                title='Correlation before feature selection',
                                size=(10, 16))

    to_drop = set()

    # Amount of variation
    variance = X.var(axis=0, ddof=1)
    to_drop.update(variance[variance < 0.01].index.values)

    # Correlation with target
    to_drop.update(abs_corr_w_target[abs_corr_w_target[0] < 0.01].index.values)

    # Pairwise correlation
    to_drop.update(preprocessing.find_correlated(abs_corr, abs_corr_w_target))

    to_drop = list(to_drop)
    nr_dropped = len(to_drop)
    X.drop(to_drop, axis=1, inplace=True)
    abs_corr = X.corr().abs()
    plotting_tools.plot_heatmap(abs_corr,
                                title='Correlation after feature selection',
                                size=(10, 16))
    print(f'Dropped features: {to_drop}')

    # Data standardization works better; use normalization only for tests
    # X = preprocessing.normalize_data(X)
    X = preprocessing.standardize_data(X)

    X = X.values
    y = y.values

    train_inputs, cv_inputs, test_inputs = np.split(
        X, [int(0.6 * len(df)), int(0.8 * len(df))])
    train_outputs, cv_outputs, test_outputs = np.split(
        y, [int(0.6 * len(df)), int(0.8 * len(df))])

    print(f'Training set size: {train_outputs.shape[0]}\n'
          f'Cross validation set size: {cv_outputs.shape[0]}\n'
          f'Test set size: {test_outputs.shape[0]}')

    model = NeuralNetwork([57 - nr_dropped, 32, 1],
                          activation_function='sigmoid')

    # Only use this part for tuning hyperparameters; it slows the program down significantly.
    # lambdas = list(np.arange(0.5, 1.5, 0.1))
    # model.plot_learning_curves(train_inputs, train_outputs, cv_inputs, cv_outputs,
    #                            learning_rate=1.5, epochs=500, lambda_=0.6)
    # model.plot_validation_curves(train_inputs, train_outputs, cv_inputs, cv_outputs,
    #                              learning_rate=1.5, epochs=1000, lambdas=lambdas)

    model.gradient_descent(train_inputs, train_outputs, 1.5, 4000, 0.6,
                           gradient_check=False, plot_cost=False)

    train_predictions = np.where(model.predict(train_inputs) > 0.5, 1, 0)
    test_predictions = np.where(model.predict(test_inputs) > 0.5, 1, 0)

    train_columns = {
        'Train predictions': train_predictions[:, 0],
        'Train outputs': train_outputs
    }
    test_columns = {
        'Test predictions': test_predictions[:, 0],
        'Test outputs': test_outputs
    }
    train_results = pd.DataFrame(train_columns)
    test_results = pd.DataFrame(test_columns)

    train_correct = pd.value_counts(
        train_results['Train predictions'] == train_results['Train outputs'])[True]
    test_correct = pd.value_counts(
        test_results['Test predictions'] == test_results['Test outputs'])[True]

    test_positive_predictions = test_results[test_results['Test predictions'] == 1]
    test_negative_predictions = test_results[test_results['Test predictions'] == 0]
    test_is_positive_correct = pd.value_counts(
        test_positive_predictions['Test predictions'] ==
        test_positive_predictions['Test outputs'])
    test_is_negative_correct = pd.value_counts(
        test_negative_predictions['Test predictions'] ==
        test_negative_predictions['Test outputs'])

    test_true_positives = test_is_positive_correct[True]
    test_false_positives = test_is_positive_correct[False]
    test_true_negatives = test_is_negative_correct[True]
    test_false_negatives = test_is_negative_correct[False]

    test_precision = test_true_positives / (test_true_positives +
                                            test_false_positives)
    test_recall = test_true_positives / (test_true_positives +
                                         test_false_negatives)

    test_confusion_matrix = pd.DataFrame(
        [[test_true_positives, test_false_positives],
         [test_false_negatives, test_true_negatives]],
        columns=[1, 0],
        index=[1, 0])

    train_acc = train_correct / len(train_outputs)
    test_acc = test_correct / len(test_outputs)

    print(f'train_acc = {train_acc}')
    print(f'test_acc = {test_acc}')
    print(f'test_precision = {test_precision}')
    print(f'test_recall = {test_recall}')

    plotting_tools.plot_cm(test_confusion_matrix, title='Confusion matrix')
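
    # Optional sanity check (not in the original script): the hand-computed
    # precision, recall, and confusion matrix above can be cross-checked with
    # scikit-learn, assuming it is available in the environment.
    from sklearn.metrics import precision_score, recall_score, confusion_matrix
    print(f'sklearn precision = {precision_score(test_outputs, test_predictions[:, 0])}')
    print(f'sklearn recall = {recall_score(test_outputs, test_predictions[:, 0])}')
    print(confusion_matrix(test_outputs, test_predictions[:, 0]))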
# --- Unbalanced LSTM, averaging over 10 intervals ---
import numpy as np

# 'pre' is the project's preprocessing module (load_3dim, med_intervalo_3dim,
# dummy_variables, split_data, standardize_data), assumed to be imported elsewhere.

data_num = 500
num_int = 2560

# Load all the experiment data
X, y = pre.load_3dim('dataset/', data_num, num_int)

# Take the mean over 10 intervals for each component
X = pre.med_intervalo_3dim(X, 10)

# Reshape y so it can be fed to the dummy-encoding step
y = np.reshape(y, (y.shape[0], -1))

# Convert y to dummy variables
y_dummy = pre.dummy_variables(y)

# Split into training and test sets (drawn randomly, shuffling the dependent variables as well)
X_train, X_test, y_train, y_test = pre.split_data(X, y_dummy, 0.2, None)

# Standardize the data
X_train, X_test = pre.standardize_data(X_train, X_test)

# Build the LSTM
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dense

# Hidden layer size
hidden_size = 32

# Create the network object
sl_model = Sequential()

# Add an LSTM layer: it takes the number of output units, the activation function
# (usually the hyperbolic tangent), the fraction of this layer's inputs to drop,
# and the fraction of the previous recurrent state to drop.
sl_model.add(
    LSTM(units=hidden_size,
         input_shape=(X_train.shape[1], X_train.shape[2]),
         activation='tanh',
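         # The original snippet breaks off in the middle of this call. What
         # follows is a minimal completion sketch: the dropout rates, output
         # layer, optimizer, epoch count, and batch size are assumed values,
         # not the original configuration.
         dropout=0.2,
         recurrent_dropout=0.2))

# Output layer: one unit per dummy-encoded class, softmax for classification
sl_model.add(Dense(units=y_train.shape[1], activation='softmax'))

# Compile and train (assumed optimizer, loss, and training schedule)
sl_model.compile(optimizer='adam',
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
sl_model.fit(X_train, y_train,
             epochs=50,
             batch_size=32,
             validation_data=(X_test, y_test))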
# Split the data into training/testing sets
train_x = train_data.drop(["Income"], axis=1)
test_x = test_data.drop(["Income"], axis=1)

# Split the targets into training/testing sets
train_y = train_data["Income"]
test_y = test_data["Income"]

# Polynomialize the datasets
if polynomialize:
    train_x = polynomialize_data(train_x)
    test_x = polynomialize_data(test_x)

# Standardize the datasets
if standardize:
    train_x = standardize_data(train_x)
    test_x = standardize_data(test_x)

# Normalize the datasets
if normalize:
    train_x = normalize_data(train_x)
    test_x = normalize_data(test_x)

# Train the model using the training sets
# and make predictions using the testing sets
regr.fit(train_x, train_y)
pred_y = regr.predict(test_x)

# Save results to file
if not test:
    predictions = {"Instance": range(111994, 185224), "Income": pred_y}
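    # A minimal sketch of the "save results to file" step above; the output
    # file name and the use of a pandas DataFrame are assumptions, not part of
    # the original snippet.
    import pandas as pd
    pd.DataFrame(predictions).to_csv("predictions.csv", index=False)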
from configparser import ConfigParser

import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

# Project-local helpers (pklload, pkldump, load_raws_within_dir, windower,
# extract_td_features_from_epks, reshape_numpy, band_filter, svm_features,
# standardize_data, pca, ica, ident, add_events_to_raws) are assumed to be
# imported from the project's own modules.


def main(data_directory_path, init_file):
    pred = pklload('predicted.txt')
    wins = pklload('windows.txt')
    raws = pklload('r_data.txt')

    print('DATA_DIRECTORY: {}'.format(data_directory_path))
    print('CONFIGURATION_FILE: {}'.format(init_file))

    # Settings recovery via the settings.ini file
    print('Parameters recovering..')
    config = ConfigParser()
    config.read('settings.ini')

    # Parameter recovery
    # features domain
    fdom = config.get('section_b', 'fdom')
    sampling_freq = config.getfloat('section_b', 'sampling_freq')
    # epoch half size as int
    epk_half_sizei = config.getint('section_a', 'epk_half_size')
    # frequency bands
    frequency_bands = eval(config.get('section_a', 'frequency_bands'))

    # Best setting recovery
    best_setting = config.get('section_c', 'best_setting').split(',')
    if best_setting[0] == 'None':
        print('Please run the training_pipeline script before testing!')
    else:
        # features domain
        fdom = best_setting[0]
        # reduction procedure
        redux_proc = best_setting[1]
        # classifier
        clf_type = best_setting[2]

    # Raw data recovery
    print('Data loading..')
    r_data = load_raws_within_dir(data_directory_path)

    # BUILDING ARTIFICIAL EQUALLY SPACED WINDOWS OVER PSEUDO EVENTS
    windows = []
    for raw in r_data:
        windows.append(
            windower(raw, 0, -epk_half_sizei / sampling_freq,
                     epk_half_sizei / sampling_freq))

    # FEATURES COMPUTATION
    features_set = None
    if (fdom == 'time') or (fdom == 'time_frequency'):
        print('######################## Time Domain Features - computations -')
        tdf = extract_td_features_from_epks(windows)
        # data formatting (reshaping)
        rtdf = reshape_numpy(tdf)
        # standardization
        rtdf_std = []
        for data in rtdf:
            rtdf_std.append(standardize_data(data))
        features_set = rtdf_std

    if (fdom == 'frequency') or (fdom == 'time_frequency'):
        # frequency domain coefficients computation
        print('######################## Frequency domain coefficients computation..')
        print(type(frequency_bands))
        fd_coeffs = band_filter(windows, frequency_bands)

        print('######################## Frequency Domain Features - computations -')
        fdf = []
        for dec in fd_coeffs:
            fdf.append(svm_features(dec))
        # data formatting (reshaping)
        rfdf = reshape_numpy(fdf)
        # standardization
        rfdf_std = []
        for data in rfdf:
            rfdf_std.append(standardize_data(data))
        features_set = rfdf_std

    if fdom == 'time_frequency':
        # time and frequency domain features concatenation
        rtfdf = []
        for tf, ff in zip(rtdf, rfdf):
            print(tf.shape, ff.shape)
            rtfdf.append(np.concatenate((tf, ff), axis=1))
        # standardization
        rtfdf_std = []
        for features in rtfdf:
            rtfdf_std.append(standardize_data(features))
        features_set = rtfdf_std

    # DIMENSION REDUCTION
    redux_set = []
    for features in features_set:
        if redux_proc == 'pca':
            redux_set.append(pca(features, 2))
        elif redux_proc == 'ica':
            redux_set.append(ica(features, 2))
        # elif redux_proc == 'lda':
        #     redux = eest.lda(fset, 2, labset)
        else:
            # no reduction -> identity
            redux_set.append(ident(features))

    # CLASSIFICATION
    # classifier selection
    n_classes = 2
    if clf_type == 'kmeans':
        clf = KMeans(n_clusters=n_classes)
    # elif clf_type == 'svm':
    #     # SVM - support vector machine
    #     clf = svm.SVC()
    elif clf_type == 'hc':
        # hierarchical clustering
        clf = AgglomerativeClustering(n_clusters=n_classes,
                                      affinity='euclidean',
                                      linkage='ward')
    elif clf_type == 'if':
        # isolation forest
        clf = IsolationForest()
    elif clf_type == 'em':
        # n_components should be chosen via the BIC criterion
        # covariance_type: full (default) / spherical / tied / diag
        clf = GaussianMixture(n_components=n_classes, covariance_type='full')
    elif clf_type == 'ap':
        # affinity propagation; convergence issues might need tuning
        clf = AffinityPropagation(random_state=5, max_iter=1000)
    elif clf_type == 'bgm':
        # Bayesian Gaussian mixture
        clf = BayesianGaussianMixture(n_components=n_classes, max_iter=200)
    else:
        # error handling (default behaviour) - todo
        print('Unknown classifier type: {}'.format(clf_type))
        clf = None

    # PREDICTION
    predicted = []
    for features in redux_set:
        clf.fit(features[0])
        predicted.append(clf.predict(features[0]))

    # RAW OBJECT: EVENT ADDITION
    pkldump(r_data, 'r_data.txt')
    pkldump(windows, 'windows.txt')
    pkldump(predicted, 'predicted.txt')
    tagged = add_events_to_raws(predicted, windows, r_data)
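
# Minimal entry-point sketch (not in the original snippet): the data directory
# and the settings file are assumed to be passed on the command line.
if __name__ == '__main__':
    import sys
    main(sys.argv[1], sys.argv[2])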
import pickle

# Project-local modules (cu, pp, rn) and helpers (rfe, lr) are assumed to be
# imported from the project's own code.


def run_best_model(cdn):
    ft = 'raw'
    seed = 2222
    standardize_method = 'z'
    is_cz = False

    cu.checkAndCreate('%s/seed%d' % (cdn, seed))
    pp.split_nfolds('%s/alldata_readmit.csv' % cdn,
                    '%s/seed%d/alldata_readmit' % (cdn, seed),
                    shuffle=True,
                    seed=seed)
    pp.split_by_feature_type(cdn='%s/seed%d' % (cdn, seed),
                             fn_prefix='%s/seed%d/alldata_readmit' % (cdn, seed))

    cu.checkAndCreate('%s/seed%d/raw/interp' % (cdn, seed))
    cu.checkAndCreate('%s/seed%d/raw/interp/mean/dataset' % (cdn, seed))

    for i in range(5):
        pp.impute_by_interpolation_on_last12h(
            '%s/seed%d/raw/test_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/test_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/extrapolation_log_test_fold%d.txt' % (cdn, seed, i))
        pp.impute_by_interpolation_on_last12h(
            '%s/seed%d/raw/train_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/train_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/extrapolation_log_train_fold%d.txt' % (cdn, seed, i))
        pp.impute_by_mean(
            '%s/seed%d/raw/interp/train_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/test_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/mean/dataset/train_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/mean/dataset/test_fold%d.csv' % (cdn, seed, i))
        pp.standardize_data(
            '%s/seed%d/raw/interp/mean/dataset/train_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/mean/dataset/test_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/mean/dataset/train_fold%d_%s.csv' % (cdn, seed, i, standardize_method),
            '%s/seed%d/raw/interp/mean/dataset/test_fold%d_%s.csv' % (cdn, seed, i, standardize_method))

    # run temporal model
    freq_list = ['011']
    freq_to_trainFreq_map = {'011': '014'}
    nel_graph_length = 13

    cu.checkAndCreate('%s/seed%d/%s/interp/mean/%s' % (cdn, seed, ft, standardize_method))
    e = rn.Experiment(
        '%s/seed%d/%s/interp/mean/%s' % (cdn, seed, ft, standardize_method),
        '%s/seed%d/%s/interp/mean/dataset' % (cdn, seed, ft),
        seed, is_cz, standardize_method, freq_list, freq_to_trainFreq_map,
        nel_graph_length)

    isg = 0
    freq_t = '011'
    nc = 110
    c = 2
    pl = 'l1'
    cw = 'balanced'
    ntestth = 2

    cu.checkAndCreate('%s/isg%d' % (e.cdn, isg))
    cu.checkAndCreate('%s/isg%d/pt_sg_w' % (e.cdn, isg))
    cu.checkAndCreate('%s/isg%d/res' % (e.cdn, isg))
    cu.checkAndCreate('%s/isg%d/nmf_piks' % (e.cdn, isg))

    for foldi in range(5):
        train = e.ftrain % (e.dataset_folder, foldi, e.standardize_method)
        test = e.ftest % (e.dataset_folder, foldi, e.standardize_method)
        print(train)
        print(test)

        ftrnel = "%s/mimic_train_fold%d.nel" % (e.cdn, foldi)
        ftrnode = "%s/mimic_train_fold%d.node" % (e.cdn, foldi)
        fnel = "%s/mimic_fold%d.nel" % (e.cdn, foldi)
        fnode = "%s/mimic_fold%d.node" % (e.cdn, foldi)

        e.interpolation(trcsv=train,
                        tecsv=test,
                        ftrnel=ftrnel,
                        ftrnode=ftrnode,
                        fnel=fnel,
                        fnode=fnode)
        e.get_freq_to_trainFreq_map(foldi)
        for freq_t in e.moss_freq_threshold_list:
            e.subgraph_mining(tr_nel=ftrnel,
                              tr_te_nel=fnel,
                              freq_t=freq_t,
                              foldi=foldi)
            e.gen_pt_sg_files(isg, freq_t, foldi)

    cu.checkAndCreate('%s/seed%d/raw/interp/mean/last_measures/dataset' % (cdn, seed))

    # run baseline model
    for i in range(5):
        pp.get_last_measurements(
            '%s/seed%d/raw/interp/mean/dataset/train_fold%d_%s.csv' % (cdn, seed, i, standardize_method),
            '%s/seed%d/raw/interp/mean/last_measures/dataset/train_fold%d_%s.csv' % (cdn, seed, i, standardize_method))
        pp.get_last_measurements(
            '%s/seed%d/raw/interp/mean/dataset/test_fold%d_%s.csv' % (cdn, seed, i, standardize_method),
            '%s/seed%d/raw/interp/mean/last_measures/dataset/test_fold%d_%s.csv' % (cdn, seed, i, standardize_method))

    best_features = rfe('%s/seed%d/raw/interp/mean/last_measures' % (cdn, seed),
                        50, standardize_method, 5, 'l1', 'balanced')
    print(best_features)
    # best_features = ['urineByHrByWeight', 'HCT', 'INR', 'Platelets', 'RBC',
    #                  'DeliveredTidalVolume', 'PlateauPres', 'RAW', 'RSBI', 'mDBP', 'CV_HR',
    #                  'Art_BE', 'Art_CO2', 'Art_PaCO2', 'Art_pH', 'Cl', 'Mg', 'Anticoagulant',
    #                  'beta.Blocking_agent', 'Somatostatin_preparation', 'Vasodilating_agent',
    #                  'AIDS', 'MetCarcinoma']

    baseline_auc = lr('%s/seed%d/raw/interp/mean/last_measures' % (cdn, seed),
                      standardize_method, 5, 'l1', 'balanced', 50)
    print('baseline AUC: %s' % baseline_auc)

    res_list = []
    for foldi in range(5):
        fnaddtr = '../data/seed2222/raw/interp/mean/last_measures/dataset/train_fold%d_%s.csv' % (
            foldi, standardize_method)
        fnaddte = '../data/seed2222/raw/interp/mean/last_measures/dataset/test_fold%d_%s.csv' % (
            foldi, standardize_method)

        prediction_matrics = e.read_prediction_matrics(isg, freq_t)
        (res, gt_te, pt_te, res_baseline) = e.nmfClassify_ob(
            prediction_matrics['ptsg'][foldi],
            prediction_matrics['ptwd'][foldi],
            prediction_matrics['sgs'][foldi],
            prediction_matrics['pt'][foldi],
            prediction_matrics['gt'][foldi],
            '%s/isg%d/nmf_piks/nmf_%s_fold%d_%d.pik' % (e.cdn, isg, freq_t, foldi, nc),
            ntestth, foldi, nc, c, pl, cw, fnaddtr, fnaddte, best_features)
        res_list.append(res)

    (auc, tr_auc) = e.get_mean_auc(res_list)
    print(auc, tr_auc)

    for i in range(len(res_list)):
        with open('../data/seed2222/raw/interp/mean/z/isg0/res/c_pre_te_fold%d' % i, 'wb') as f:
            pickle.dump(res_list[i]['c_pre_te'], f)
        with open('../data/seed2222/raw/interp/mean/z/isg0/res/res_fold%d' % i, 'wb') as f:
            pickle.dump(res_list[i], f)
from configparser import ConfigParser

import numpy as np

# Project-local helpers (load_raws_within_dir, add_spikeless_events_to_raws,
# get_labels_from_raws, get_epochs_from_raws, extract_td_features_from_epks,
# reshape_numpy, band_filter, svm_features, standardize_data, pca, ica, lda,
# ident, train_clf, order_by_perf, recover_data_4mosaicplot, heatmap_plot) are
# assumed to be imported from the project's own modules.


def main(data_directory_path, init_file):
    print('DATA_DIRECTORY:', data_directory_path)
    print('CONFIGURATION_FILE:', init_file)

    # Settings recovery via the settings.ini file
    print('Parameters recovering..')
    config = ConfigParser()
    config.read('settings.ini')

    # Parameter recovery
    # features domain
    fdom = config.get('section_b', 'fdom')
    # reduction procedures to be applied
    redux_procedures = config.get('section_b', 'redux_procedures').split(',')
    # classifiers to be used
    classifiers = config.get('section_b', 'classifiers').split(',')
    # epoch half size as int
    epk_half_sizei = config.getint('section_a', 'epk_half_size')
    # epoch half size as float
    epk_half_sizef = epk_half_sizei / 1000
    # number of folds
    k_folds = config.getint('section_b', 'k_folds')
    # frequency bands
    frequency_bands = eval(config.get('section_a', 'frequency_bands'))

    # Raw data recovery
    print('Data loading..')
    # r_data = msc.load_raws_from_dir(data_directory_path)
    r_data = load_raws_within_dir(data_directory_path)

    print('Spikeless events tagging..')
    r_01 = add_spikeless_events_to_raws(r_data, epk_half_sizei)

    # label recovery from raw data
    labels = get_labels_from_raws(r_01)
    #'''

    # Epochs computation
    print('Epoch building..')
    epks = get_epochs_from_raws(r_01, epk_half_sizef)

    # FEATURES COMPUTATION
    if (fdom == 'time') or (fdom == 'time_frequency'):
        print('######################## Time Domain Features - computations -')
        tdf = extract_td_features_from_epks(epks)
        # msc.pkldump(tdf, 'td_features.pkl')
        # tdf = msc.pklload('td_features.pkl')

        # data formatting (reshaping)
        rtdf = reshape_numpy(tdf)
        # standardization
        rtdf_std = []
        for data in rtdf:
            rtdf_std.append(standardize_data(data))

    if (fdom == 'frequency') or (fdom == 'time_frequency'):
        # frequency domain coefficients computation
        print('######################## Frequency domain coefficients computation..')
        print(type(frequency_bands))
        fd_coeffs = band_filter(epks, frequency_bands)
        # msc.pkldump(fd_coeffs, 'fd_coeffs.pkl')
        # fd_coeff = msc.pklload('fd_coeffs.pkl')

        print('######################## Frequency Domain Features - computations -')
        fdf = []
        for dec in fd_coeffs:
            fdf.append(svm_features(dec))
        # msc.pkldump(fdf, 'fd_features.pkl')
        # fdf = msc.pklload('fd_features.pkl')

        # data formatting (reshaping)
        rfdf = reshape_numpy(fdf)
        # standardization
        rfdf_std = []
        for data in rfdf:
            rfdf_std.append(standardize_data(data))

    if fdom == 'time_frequency':
        # time and frequency domain features concatenation
        rtfdf = []
        for tf, ff in zip(rtdf, rfdf):
            print(tf.shape, ff.shape)
            rtfdf.append(np.concatenate((tf, ff), axis=1))
        # standardization
        rtfdf_std = []
        for features in rtfdf:
            rtfdf_std.append(standardize_data(features))

    # DIMENSION REDUCTION
    redux = []
    for fset, labset in zip(rtfdf_std, labels):  # loop over subjects' feature sets
        sbj_redux = []
        for rdx in redux_procedures:  # loop over the reduction procedures to apply
            if rdx == 'pca':
                sbj_redux.append(pca(fset, 2))
            elif rdx == 'ica':
                sbj_redux.append(ica(fset, 2))
            elif rdx == 'lda':
                sbj_redux.append(lda(fset, 2, labset))
            else:
                # no reduction -> identity
                sbj_redux.append(ident(fset, 2, labset))
        redux.append(sbj_redux)
    # msc.pkldump(redux, 'features_reductions.pkl')
    # redux = msc.pklload('features_reductions.pkl')

    # CLASSIFIER TRAINING
    res = []
    for subject, labels_set in zip(redux, labels):  # loop over subjects and their labels
        print('#####')
        tmp = []
        for clf in classifiers:
            print('~~~~~~~')
            for rdx in subject:
                print('-----')
                # todo: warning on y_pred / y_true labels
                clf_list = train_clf(features_set=rdx[0],
                                     clf_type=clf,
                                     n_classes=2,
                                     labels_set=labels_set,
                                     k_folds=k_folds,
                                     additional_params=[rdx[2]])
                tmp.append(clf_list)
        res.append(tmp)
    # msc.pkldump(res, 'train_pipe_res.pkl')
    #'''
    # res = msc.pklload('train_pipe_res.pkl')

    # BEST SETTING RECOVERY & RECORDING
    ordered = order_by_perf(res, 1)

    # MOSAIC PLOT
    hmd_data = recover_data_4mosaicplot(res, 1, len(classifiers), len(redux_procedures))
    heatmap_plot(hmd_data, 'Classifier vs. reduction performance',
                 xlabs=redux_procedures, ylabs=classifiers)
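
    # The testing pipeline above reads 'best_setting' from 'section_c' of
    # settings.ini, so the training pipeline presumably records it at this
    # point. This is a minimal sketch of that step; the layout of 'ordered'
    # (its first entry starting with fdom, reduction procedure, classifier)
    # is an assumption, not confirmed by the original code.
    best_fdom, best_redux, best_clf = ordered[0][:3]
    config.set('section_c', 'best_setting',
               ','.join([best_fdom, best_redux, best_clf]))
    with open('settings.ini', 'w') as cfg_file:
        config.write(cfg_file)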