def feature_precisions(A, y, feature_sels, test_size=0.2, ITER_TIMES=50,
                       classifier='xgboost', params=None):
    """Estimate test precision of each feature selection over repeated splits.

    Arguments:
        A: feature matrix (samples x features)
        y: class labels
        feature_sels: {name: feature_selection} mapping; each selection is
            passed to select_features to subset the columns of A
        test_size: float, proportion of samples held out for testing
        ITER_TIMES: int, number of random train/test splits to average over
        classifier: 'xgboost' or 'svm'
        params: optional dict of extra parameters; merged into the xgboost
            training params when classifier == 'xgboost', otherwise passed
            as keyword arguments to SVC

    Returns:
        {name: [float]} mapping each feature-selection name to the list of
        per-round accuracy scores (also printed via precision_info).
    """
    precisions = defaultdict(list)
    xgb_params = {'silent': 1, 'objective': 'multi:softmax', 'num_class': 10}
    if params is not None and classifier == 'xgboost':
        xgb_params.update(params)
    elif params is None:
        params = {}  # SVC(**params) below requires a dict
    num_round = 50  # number of xgboost boosting rounds

    for r in range(ITER_TIMES):
        A_train, A_test, y_train, y_test = train_test_split(
            A, y, test_size=test_size)
        ms = []
        for key, features_selection in feature_sels.items():
            _A_train = select_features(A_train, features_selection)
            _A_test = select_features(A_test, features_selection)
            if classifier == 'xgboost':
                _A_train = xgb.DMatrix(_A_train, label=y_train)
                _A_test = xgb.DMatrix(_A_test, label=y_test)
                clf = xgb.train(xgb_params, _A_train, num_round)
            elif classifier == 'svm':
                clf = OneVsOneClassifier(SVC(**params))
                clf.fit(_A_train, y_train)
            else:
                # fail fast instead of hitting a NameError on `clf` below
                raise ValueError('unknown classifier: {}'.format(classifier))
            h = np.array(clf.predict(_A_test)).astype(int)
            p = accuracy_score(h, y_test)
            precisions[key].append(p)
            ms.append('{:>7s} precision:{:7.2%}'.format(key, p))
        # write first, THEN flush, so the current round's progress line is
        # visible immediately (the original flushed before writing)
        sys.stdout.write('Round {:3d}/{:3d}:{}\r'.format(
            r+1, ITER_TIMES, '|'.join(ms)))
        sys.stdout.flush()
    precision_info(precisions, ITER_TIMES)
    return precisions
def get_coords(routes: list, output='dict'):
    """Fetch current train coordinates (lat/lon) for the given routes.

    Arguments:
        routes: list of route identifiers used to build the request URL
        output: 'dict' (default) returns the raw mapping; 'df' returns a
            pandas DataFrame built with orient='index'

    Returns:
        dict of train data restricted to ['lat', 'lon'], or a DataFrame
        when output == 'df'.
    """
    url = make_request_url(routes, verbose=True)
    # renamed from `json` — that name shadows the stdlib json module
    payload = utils.grab(url)
    trains = parse_train_data(payload)
    features = ['lat', 'lon']
    trains = utils.select_features(trains, features)
    if output == 'df':
        trains = pd.DataFrame.from_dict(trains, orient='index')
    return trains
def run(data_fn, prop_missing=0., max_num_feature=-1, feature_selection='random',
        k=10, data_dir='_data', out_dir='_out'):
    """Run RIDDLE classification interpretation pipeline.

    Arguments:
        data_fn: string
            data file filename
        prop_missing: float
            proportion of feature observations which should be randomly
            masked; values in [0, 1)
        max_num_feature: int
            maximum number of features to use; <= 0 disables selection
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k: int
            number of partitions for k-fold cross-validation
        data_dir: string
            directory where data files are located
        out_dir: string
            outer directory where outputs (e.g., results) should be saved
    """
    from riddle import emr, feature_importance
    from riddle.models import MLP

    start = time.time()

    base_out_dir = get_base_out_dir(out_dir, 'riddle', data_fn, prop_missing,
                                    max_num_feature, feature_selection)
    recursive_mkdir(base_out_dir)

    # get common data
    x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict, perm_indices = (
        get_preprocessed_data(data_dir, data_fn, prop_missing=prop_missing))
    num_feature = len(idx_feat_dict)
    num_class = len(idx_class_dict)

    list_sums_D, list_sums_D2, list_sums_contribs = [], [], []

    for k_idx in range(k):
        full_out_dir = '{}/k_idx={}'.format(base_out_dir, k_idx)
        print('\nPartition k = {}'.format(k_idx))
        x_train_unvec, y_train, _, _, x_test_unvec, y_test = \
            emr.get_k_fold_partition(x_unvec, y, k_idx=k_idx, k=k,
                                     perm_indices=perm_indices)

        if max_num_feature > 0:  # select features and re-encode
            # NOTE(review): idx_feat_dict and num_feature are overwritten on
            # the first partition, so later partitions select from the
            # already-reduced feature set — confirm this is intended.
            feat_encoding_dict, idx_feat_dict = select_features(
                x_train_unvec, y_train, idx_feat_dict,
                method=feature_selection, num_feature=num_feature,
                max_num_feature=max_num_feature)
            x_test_unvec = subset_reencode_features(x_test_unvec,
                                                    feat_encoding_dict)
            num_feature = max_num_feature

        # interpret the trained model for this partition; expects model.h5
        # saved by a prior training run in the same output directory
        temp_mlp = MLP(num_feature=num_feature, num_class=num_class)
        hdf5_path = full_out_dir + '/model.h5'
        sums_D, sums_D2, sums_contribs, pairs = \
            feature_importance.get_diff_sums(
                hdf5_path, x_test_unvec, process_x_func=temp_mlp.process_x,
                num_feature=num_feature, num_class=num_class)

        with open(full_out_dir + '/sums_D.pkl', 'wb') as f:
            pickle.dump(sums_D, f)
        with open(full_out_dir + '/sums_D2.pkl', 'wb') as f:
            pickle.dump(sums_D2, f)
        with open(full_out_dir + '/sums_contribs.pkl', 'wb') as f:
            pickle.dump(sums_contribs, f)

        list_sums_D.append(sums_D)
        list_sums_D2.append(sums_D2)
        list_sums_contribs.append(sums_contribs)

    def compute_total_sums(list_sums):
        # element-wise accumulation across k partitions
        total_sums = list_sums[0]
        for i in range(1, len(list_sums)):
            for j in range(len(total_sums)):
                total_sums[j] = np.add(total_sums[j], list_sums[i][j])
        return total_sums

    total_sums_D = compute_total_sums(list_sums_D)
    total_sums_D2 = compute_total_sums(list_sums_D2)
    total_sums_contribs = compute_total_sums(list_sums_contribs)

    num_sample = len(x_unvec)
    # `pairs` comes from the last partition; class pairings are presumably
    # identical across partitions — TODO confirm
    run_interpretation_summary(
        x_unvec, y, total_sums_D, total_sums_D2, total_sums_contribs,
        idx_feat_dict=idx_feat_dict, idx_class_dict=idx_class_dict,
        icd9_descript_dict=icd9_descript_dict, pairs=pairs,
        num_sample=num_sample, full_out_dir=base_out_dir)

    # `start` is no longer reset inside the loop, so this reports the full
    # pipeline runtime rather than only the last partition's interpretation
    print('Computed DeepLIFT scores and analysis in {:.4f} seconds'.format(
        time.time() - start))
    print('-' * 72)
    print()
def run(method, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
        num_class, max_num_sample, feature_selection, k_idx, k, num_search,
        perm_indices):
    """Run a parameter search for a single k-fold partition.

    Arguments:
        method: string
            name of classification method; values = {'logit', 'random_forest',
            'linear_svm', 'poly_svm', 'rbf_svm', 'gbdt', 'riddle'}
        x_unvec: [[int]]
            feature indices that have not been vectorized; each inner list
            collects the indices of features that are present (binary on)
            for a sample
        y: [int]
            list of class labels as integer indices
        idx_feat_dict: {int: string}
            dictionary mapping feature indices to features
        num_feature: int
            number of features present in the dataset
        max_num_feature: int
            maximum number of features to use
        num_class: int
            number of classes present
        max_num_sample: int or None
            maximum number of validation samples to search on; None = no cap
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k_idx: int
            index of the k-fold partition to use
        k: int
            number of partitions for k-fold cross-validation
        num_search: int
            number of searches (parameter configurations) to try
        perm_indices: np.ndarray, int
            array of indices representing a permutation of the samples with
            shape (num_sample, )

    Returns:
        best_param: {string: ?}
            dictionary mapping parameter names to the best values found
    """
    print('-' * 72)
    print('Partition k = {}'.format(k_idx))

    x_train_unvec, y_train, x_val_unvec, y_val, _, _ = (
        emr.get_k_fold_partition(x_unvec, y, k_idx=k_idx, k=k,
                                 perm_indices=perm_indices))

    if max_num_feature > 0:  # select features and re-encode
        feat_encoding_dict, _ = select_features(
            x_train_unvec, y_train, idx_feat_dict,
            method=feature_selection, num_feature=num_feature,
            max_num_feature=max_num_feature)
        x_val_unvec = subset_reencode_features(x_val_unvec, feat_encoding_dict)
        num_feature = max_num_feature

    # cap number of validation samples
    # (was `max_num_sample != None`; identity comparison is the correct idiom)
    if max_num_sample is not None and len(x_val_unvec) > max_num_sample:
        x_val_unvec = x_val_unvec[0:max_num_sample]
        y_val = y_val[0:max_num_sample]

    start = time.time()
    if method == 'riddle':
        model_class = MLP
        init_args = {'num_feature': num_feature, 'num_class': num_class}
        param_dist = {
            'num_hidden_layer': 2,  # [1, 2]
            'num_hidden_node': 512,  # [128, 256, 512]
            'activation': ['prelu', 'relu'],
            'dropout': tuning.Uniform(lo=0.2, hi=0.8),
            'learning_rate': tuning.UniformLogSpace(10, lo=-6, hi=-1),
        }
        best_param = tuning.random_search(
            model_class, init_args, param_dist, x_val_unvec, y_val,
            num_class=num_class, k=TUNING_K, num_search=num_search)
    else:  # scikit-learn methods
        x_val = vectorize_features(x_val_unvec, num_feature)

        if method == 'logit':  # logistic regression
            from sklearn.linear_model import LogisticRegression
            estimator = LogisticRegression(multi_class='multinomial',
                                           solver='lbfgs')
            param_dist = {'C': tuning.UniformLogSpace(base=10, lo=-3, hi=3)}
        elif method == 'random_forest':
            from sklearn.ensemble import RandomForestClassifier
            estimator = RandomForestClassifier()
            param_dist = {
                'max_features': ['sqrt', 'log2', None],
                'max_depth': tuning.UniformIntegerLogSpace(base=2, lo=0, hi=7),
                'n_estimators': tuning.UniformIntegerLogSpace(base=2, lo=4, hi=8)
            }
        elif method == 'linear_svm':
            from sklearn.svm import SVC
            # remark: due to a bug in scikit-learn / libsvm, the sparse 'linear'
            # kernel is much slower than the sparse 'poly' kernel, so we use
            # the 'poly' kernel with degree=1 over the 'linear' kernel
            estimator = SVC(kernel='poly', degree=1, coef0=0., gamma=1.,
                            probability=True, cache_size=1000)
            param_dist = {'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1)}
        elif method == 'poly_svm':
            from sklearn.svm import SVC
            estimator = SVC(kernel='poly', probability=True, cache_size=1000)
            param_dist = {
                'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1),
                'degree': [2, 3, 4],
                'gamma': tuning.UniformLogSpace(base=10, lo=-5, hi=1)
            }
        elif method == 'rbf_svm':
            from sklearn.svm import SVC
            estimator = SVC(kernel='rbf', probability=True, cache_size=1000)
            param_dist = {
                'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1),
                'gamma': tuning.UniformLogSpace(base=10, lo=-5, hi=1)
            }
        elif method == 'gbdt':
            from xgboost import XGBClassifier
            estimator = XGBClassifier(objective='multi:softprob')
            param_dist = {
                'max_depth': tuning.UniformIntegerLogSpace(base=2, lo=0, hi=5),
                'n_estimators': tuning.UniformIntegerLogSpace(base=2, lo=4, hi=8),
                'learning_rate': tuning.UniformLogSpace(base=10, lo=-3, hi=0)
            }
        else:
            raise ValueError('unknown method: {}'.format(method))

        param_search = RandomizedSearchCV(
            estimator, param_dist, refit=False, n_iter=num_search,
            scoring=loss_scorer)
        param_search.fit(x_val, y_val)

        best_param = param_search.best_params_

    print('Best parameters for {} for k_idx={}: {} found in {:.3f} s'.format(
        method, k_idx, best_param, time.time() - start))

    return best_param
def run(ModelClass, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
        num_class, feature_selection, k_idx, k, params, perm_indices,
        init_args, full_out_dir):
    """Run a classification pipeline for a single k-fold partition.

    Arguments:
        ModelClass: Python class
            classification model
        x_unvec: [[int]]
            feature indices that have not been vectorized; each inner list
            collects the indices of features that are present (binary on)
            for a sample
        y: [int]
            list of class labels as integer indices
        idx_feat_dict: {int: string}
            dictionary mapping feature indices to features
        num_feature: int
            number of features present in the dataset
        max_num_feature: int
            maximum number of features to use
        num_class: int
            number of classes present
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k_idx: int
            index of the k-fold partition to use
        k: int
            number of partitions for k-fold cross-validation
        params: [{string: ?}]
            list of dictionary mapping parameter names to values for each
            k-fold partition
        perm_indices: np.ndarray, int
            array of indices representing a permutation of the samples with
            shape (num_sample, )
        init_args: {string: ?}
            dictionary mapping initialization argument names to values
        full_out_dir: string
            directory where outputs (e.g., results) should be saved
    """
    print('-' * 72)
    print('Partition k = {}'.format(k_idx))
    print(params[k_idx])

    partition = emr.get_k_fold_partition(x_unvec, y, k_idx=k_idx, k=k,
                                         perm_indices=perm_indices)
    x_train_unvec, y_train, _, _, x_test_unvec, y_test = partition

    if max_num_feature > 0:
        # select features on the training split only, then re-encode both
        encoding_dict, _ = select_features(
            x_train_unvec, y_train, idx_feat_dict, method=feature_selection,
            num_feature=num_feature, max_num_feature=max_num_feature)
        x_train_unvec = subset_reencode_features(x_train_unvec, encoding_dict)
        x_test_unvec = subset_reencode_features(x_test_unvec, encoding_dict)
        num_feature = max_num_feature

    x_train = vectorize_features(x_train_unvec, num_feature)
    x_test = vectorize_features(x_test_unvec, num_feature)

    # per-partition parameters override the shared initialization arguments
    model_kwargs = dict(init_args)
    model_kwargs.update(params[k_idx])

    t0 = time.time()
    model = ModelClass(**model_kwargs)
    model.fit(x_train, y_train)
    test_probas = model.predict_proba(x_test)
    elapsed = time.time() - t0

    evaluate(y_test, test_probas, elapsed, num_class=num_class,
             out_dir=full_out_dir)
def run(x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict,
        num_feature, max_num_feature, num_class, feature_selection, k_idx, k,
        params, perm_indices, full_out_dir):
    """Run a RIDDLE classification pipeline for a single k-fold partition.

    Trains an MLP on the partition's training split, evaluates on its test
    split, saves the model, then tears down the Keras session.

    Arguments:
        x_unvec: [[int]]
            feature indices that have not been vectorized; each inner list
            collects the indices of features that are present (binary on)
            for a sample
        y: [int]
            list of class labels as integer indices
        idx_feat_dict: {int: string}
            dictionary mapping feature indices to features
        idx_class_dict: {int: string}
            dictionary mapping class indices to classes
        icd9_descript_dict: {string: string}
            dictionary mapping ICD9 codes to description text
        num_feature: int
            number of features present in the dataset
        max_num_feature: int
            maximum number of features to use
        num_class: int
            number of classes
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k_idx: int
            index of the k-fold partition to use
        k: int
            number of partitions for k-fold cross-validation
        params: [{string: ?}]
            list of dictionary mapping parameter names to values for each
            k-fold partition
        perm_indices: np.ndarray, int
            array of indices representing a permutation of the samples with
            shape (num_sample, )
        full_out_dir: string
            directory where outputs (e.g., results) should be saved
    """
    # deferred imports keep keras/riddle out of module import time
    from keras import backend as K
    from riddle import emr, feature_importance
    from riddle.models import MLP

    print('Partition k = {}'.format(k_idx))
    print()

    x_train_unvec, y_train, x_val_unvec, y_val, x_test_unvec, y_test = (
        emr.get_k_fold_partition(x_unvec, y, k_idx=k_idx, k=k,
                                 perm_indices=perm_indices))

    if max_num_feature > 0:  # select features and re-encode
        # selection uses the training split only; the same encoding is then
        # applied to all three splits
        feat_encoding_dict, idx_feat_dict = select_features(
            x_train_unvec, y_train, idx_feat_dict,
            method=feature_selection, num_feature=num_feature,
            max_num_feature=max_num_feature)
        x_train_unvec = subset_reencode_features(x_train_unvec,
                                                 feat_encoding_dict)
        x_val_unvec = subset_reencode_features(x_val_unvec, feat_encoding_dict)
        x_test_unvec = subset_reencode_features(x_test_unvec,
                                                feat_encoding_dict)
        num_feature = max_num_feature

    # set up
    # -1 is presumably MLP's "no epoch cap" sentinel — TODO confirm; debug
    # runs (output dir contains 'debug') are shortened to 3 epochs
    max_num_epoch = -1
    if 'debug' in full_out_dir:
        max_num_epoch = 3

    model = MLP(num_feature=num_feature, num_class=num_class,
                max_num_epoch=max_num_epoch, **params[k_idx])

    # train and test; runtime covers both and is reported to evaluate()
    start = time.time()
    model.train(x_train_unvec, y_train, x_val_unvec, y_val)
    y_test_probas = model.predict_proba(x_test_unvec)
    runtime = time.time() - start
    print('Completed training and testing in {:.4f} seconds'.format(runtime))
    print('-' * 72)
    print()

    # evaluate model performance
    evaluate(y_test, y_test_probas, runtime, num_class=num_class,
             out_dir=full_out_dir)

    # save before clearing the session — the model is unusable afterwards
    model.save_model(path=full_out_dir + '/model.h5')
    K.clear_session()

    print('Finished with partition k = {}'.format(k_idx))
    print('=' * 72)
    print()
def main():
    """Execute the entire package from data retrieval to model performance
    metrics.

    Returns:
        None; train/test splits are pickled to XY_PICKLE and results are
        produced by models.run_models and post_process.results.
    """
    # Importing inhibitor notation data.
    # The SMILES and InChI logs of the same material have identical indices,
    # so the SMILES and InChI dataframes join along the same index.
    utils.check_files()
    df_compounds_smiles = utils.create_dataframe(
        'data/chemical_notation_'
        'data/compounds_smiles.txt', 'smiles')
    df_compounds_smiles.rename(columns={'ID': 'CID'}, inplace=True)
    df_compounds_smiles.sort_values(by='CID', inplace=True)

    # Importing inhibitor activity data
    activity = pd.read_csv('data/activity_data/AID_743255_datatable.csv')
    activity = utils.clean_activity_dataframe(activity)

    # Merging activity data and compound notation data
    df = activity.merge(df_compounds_smiles)
    df.sort_values(by='CID', inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Drop non-descriptor columns before feature space reduction
    df_target = df.drop(['SMILES', 'CID', 'Phenotype'], axis=1)

    # Extracting molecular descriptors for all compounds
    # print("Sending data for descriptor calculation")
    # utils.extract_all_descriptors(df, 'SMILES')

    # Importing feature sets.
    # pd.DataFrame.from_csv was removed in pandas 1.0; read_csv with
    # index_col=0 is the supported equivalent (from_csv also parsed dates by
    # default, which these numeric descriptor files should not need —
    # TODO confirm against the CSV contents).
    df_charge = pd.read_csv('data/df_charge.csv', index_col=0)
    df_basak = pd.read_csv('data/df_basak.csv', index_col=0)
    df_con = pd.read_csv('data/df_con.csv', index_col=0)
    df_estate = pd.read_csv('data/df_estate.csv', index_col=0)
    df_constitution = pd.read_csv('data/df_constitution.csv', index_col=0)
    df_property = pd.read_csv('data/df_property.csv', index_col=0)
    df_kappa = pd.read_csv('data/df_kappa.csv', index_col=0)
    df_moe = pd.read_csv('data/df_moe.csv', index_col=0)

    print("Joining dataframes")
    df_descriptor = df_kappa.join(df_moe).join(df_constitution).\
        join(df_property).join(df_charge).join(df_estate).join(df_con).join(
            df_basak)
    print("Joining dataframes done")

    print("Checking dataframe for NaN, infinite or too large values")
    df_descriptor = utils.remove_nan_infinite(df_descriptor)

    # Transform all column values to mean 0 and unit variance
    print("Transforming dataframe using mean and variance")
    df_descriptor = utils.transform_dataframe(df_descriptor)
    print("Transforming dataframe using mean and variance done")

    # Feature selection and space reduction
    print("Selecting best features in dataframe")
    df_features = utils.select_features(df_descriptor, df_target)
    print("Selecting best features in dataframe done")

    df = df_features.join(df_target)

    # Data to training task
    # Type check inputs for sanity
    if df is None:
        raise ValueError('df is None')
    if not isinstance(df, pd.DataFrame):
        raise TypeError('df is not a dataframe')
    if TARGET_COLUMN is None:
        raise ValueError('target_column is None')
    # `basestring` exists only in Python 2; `str` is the Python 3 equivalent
    if not isinstance(TARGET_COLUMN, str):
        raise TypeError('target_column is not a string')
    if TARGET_COLUMN not in df.columns:
        raise ValueError('target_column (%s) is not a valid column name'
                         % TARGET_COLUMN)

    # Train, validation and test split.
    # sklearn.cross_validation was removed in scikit-learn 0.20; the split
    # helper now lives in sklearn.model_selection.
    from sklearn.model_selection import train_test_split
    df_train, df_test = train_test_split(df, test_size=0.25)

    # Remove the classification column from the dataframe
    x_train = df_train.drop(TARGET_COLUMN, axis=1)
    x_test = df_test.drop(TARGET_COLUMN, axis=1)
    y_train = pd.DataFrame(df_train[TARGET_COLUMN])
    y_test = pd.DataFrame(df_test[TARGET_COLUMN])

    with open(XY_PICKLE, 'wb') as results:
        pickle.dump(x_train, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(x_test, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_train, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_test, results, pickle.HIGHEST_PROTOCOL)

    models.run_models(x_train, y_train, x_test, y_test)

    post_process.results()