def main():
    ######################
    # Prepare filesystem #
    ######################
    mkdir("models")

    #################
    # Load networks #
    #################
    networks, dims = load_ppmi_matrices(data_path)
    A = minmax_scale(networks[0])

    #########################
    # Train the autoencoder #
    #########################
    model_name = [f'{org}', f'{model_type}-{args.layers}']
    if ofile_tags != '':
        model_name.append(ofile_tags)
    model_name = '_'.join(model_name)

    stdout("Running for architecture", model_name)

    best_model_filename = 'best_model.h5'

    autoencoder = DeepAutoencoder(
        x_train=A,
        x_val=0.1,
        layers=layers,
        epochs=epochs,
        sparse=sparse,
        dropout=dropout,
        batch_size=batch_size,
        activation=activation,
        optimizer=optimizer,
        # early_stopping=(5, 0.),
        save_best_model=best_model_filename,
        verbose=2)

    autoencoder.train()

    history = autoencoder.history.history
    with open(os.path.join(models_path,
                           f'{model_name}_training_history.pkl'), 'wb') as f:
        pickle.dump(history, f)

    plot_loss({model_name: history}, f'{models_path}/{model_name}')

    embeddings = autoencoder.encode(A)
    embeddings_path = os.path.join(models_path,
                                   f'{model_name}_embeddings.mat')
    sio.savemat(embeddings_path, {'embeddings': embeddings})

    os.remove(best_model_filename)
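# NOTE: `plot_loss` is imported from elsewhere in the package and is called
# above as plot_loss({model_name: history}, out_prefix). The sketch below is
# only an illustration of a compatible implementation; the figure styling and
# the '_loss.png' output suffix are assumptions, not the package's behaviour.
def plot_loss_sketch(histories, out_prefix):
    '''Plot training/validation loss curves for each model in `histories`.

    histories: dict mapping model name -> Keras History.history dict
    out_prefix: path prefix for the saved figure
    '''
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    for name, hist in histories.items():
        ax.plot(hist['loss'], label=f'{name} train')
        if 'val_loss' in hist:
            ax.plot(hist['val_loss'], label=f'{name} val')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend()
    fig.savefig(f'{out_prefix}_loss.png')  # assumed output filename
    plt.close(fig)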
def main():
    print(__doc__)
    stdout("Command line arguments", args)
    s = STRING()
    s.convert_ids_to_numbers()
    stdout('Writing networks to', output_path)
    s.write()
def load_ppmi_matrices(data_path):
    '''Load PPMI matrices.

    # Arguments
        data_path: str, path to .mat files

    # Returns
        Ms: List[numpy.ndarray], PPMI matrices
        dims: List[int], dimensions of matrices
    '''
    paths = sorted(glob.glob(os.path.join(data_path, "*.mat")))
    stdout('Networks', paths)
    Ms = []
    for p in paths:
        M = _load_ppmi_matrix(p)
        Ms.append(minmax_scale(M))
    dims = [i.shape[1] for i in Ms]
    stdout('Input dims', dims)
    return Ms, dims
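# NOTE: `_load_ppmi_matrix` is defined elsewhere in the package. The sketch
# below is consistent with how it is called above (one .mat file in, one
# matrix out); the MATLAB variable name 'ppmi' is an assumption and may
# differ in the real data files.
def _load_ppmi_matrix_sketch(path, key='ppmi'):
    '''Load a single PPMI matrix from a MATLAB .mat file.'''
    import numpy as np
    import scipy.io as sio
    mat = sio.loadmat(path)
    return np.asarray(mat[key], dtype=float)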
def main():
    ######################
    # Prepare filesystem #
    ######################
    mkdir(results_path)

    #################
    # Load networks #
    #################
    networks, dims = load_ppmi_matrices(data_path)

    ###############
    # Load models #
    ###############
    model_names = sorted(
        glob.glob(
            os.path.join(os.path.expandvars(models_path),
                         f'{org}_MDA_arch_*.h5')))
    stdout("Model names", model_names)

    # For now only a single model is loaded at a time, specified by
    # `--architecture`. TODO: improve this to handle multiple models at
    # one time.
    if architecture:
        for m in model_names:
            # The architecture index is the single digit preceding the
            # '.h5' extension in the filename.
            if int(m[-4]) == architecture:
                mid_model = load_model(m)
                model_name = os.path.basename(m).split(".")[0]
    else:
        raise ValueError("`--architecture` must be supplied")

    ################################
    # Calculate network embeddings #
    ################################
    stdout("Calculating embeddings for", model_name)
    embeddings = minmax_scale(mid_model.predict(networks))
    embeddings_path = os.path.join(os.path.expandvars(results_path),
                                   f'{model_name}_features.mat')
    stdout("Writing embeddings to", embeddings_path)
    sio.savemat(embeddings_path, {'embeddings': embeddings})
def cross_validation(X, y, n_trials=10, n_jobs=1, n_threads=1,
                     random_state=None, clf_type='LRC', max_depth=5):
    '''Perform model selection via cross validation.'''
    stdout('Number of samples pre-filtering', X.shape)

    # Filter out samples with no annotations
    del_rid = np.where(y.sum(axis=1) == 0)[0]
    y = np.delete(y, del_rid, axis=0)
    X = np.delete(X, del_rid, axis=0)
    stdout('Number of samples post-filtering', X.shape)

    # Scoring
    scoring = {
        'accuracy': 'accuracy',
        'f1': 'f1_micro',
        'M_AUPR': make_scorer(M_AUPR),
        'm_AUPR': make_scorer(m_AUPR)}

    # Split training data
    trials = ShuffleSplit(n_splits=n_trials, test_size=0.2,
                          random_state=random_state)

    # Performance
    performance_metrics = ("accuracy", "m_AUPR", "M_AUPR", "f1")
    perf = defaultdict(dict)
    perf['grid_search'] = {}

    # Model selection for optimum hyperparameters
    iteration = 0
    for train_idx, test_idx in trials.split(X):
        # Split data
        X_train = X[train_idx]
        X_test = X[test_idx]
        y_train = y[train_idx]
        y_test = y[test_idx]

        iteration += 1
        stdout('Cross validation trial', iteration)
        stdout('Train samples', y_train.shape[0])
        stdout('Test samples', y_test.shape[0])

        # Classifier
        if clf_type == 'SVC':
            clf = SVClassifier(n_jobs=n_jobs, random_state=random_state)
            grid_search_params = {
                'C': [1, 5, 10, 50, 100],
                'gamma': [0.001, 0.005, 0.01, 0.05, 0.1],
                'kernel': ['rbf']}
        elif clf_type == 'LinearSVC':
            clf = SVClassifier(n_jobs=n_jobs, random_state=random_state)
            grid_search_params = {
                'C': np.logspace(-1, 2, 4),
                'kernel': ['linear']}
        elif clf_type == 'LRC':
            clf = LRClassifier(n_jobs=n_jobs, random_state=random_state)
            grid_search_params = {'C': np.logspace(-2, 2, 5)}
        elif clf_type == 'RFC':
            clf = RFClassifier(n_jobs=n_jobs, random_state=random_state)
            grid_search_params = {'max_features': ['auto']}
        elif clf_type == 'XGB':
            clf = XGBClassifier(learning_rate=0.1,
                                n_estimators=1000,
                                gamma=0,
                                subsample=0.8,
                                colsample_bytree=0.8,
                                objective='binary:logistic',
                                scale_pos_weight=1,
                                n_jobs=n_jobs,
                                n_threads=n_threads,
                                random_state=random_state)
            grid_search_params = {
                'max_depth': max_depth,
                'min_child_weight': range(1, 3)}
        else:
            raise ValueError(
                '`clf_type` must name a classifier in agape.ml.classifier')

        # Perform a grid search over the hyperparameter ranges
        stdout('Grid search')
        clf.grid_search(X_train, y_train, grid_search_params,
                        scoring=scoring, refit='m_AUPR', cv=5, verbose=10)

        # Get the best hyperparameters
        clf_params = (clf.get_clf()
                         .get_params()['estimator']
                         .get_params()['estimator']
                         .get_params())
        best_params = {k: clf_params[k.replace('estimator__', '')]
                       for k in grid_search_params}

        perf['grid_search'][iteration] = {}
        perf['grid_search'][iteration]['best_estimator_'] = {}
        for k, v in best_params.items():
            k = k.split('__')[-1]
            perf['grid_search'][iteration]['best_estimator_'][k] = v
        stdout('Optimal parameters', best_params)

        perf['grid_search'][iteration]['best_score_'] = \
            clf.clf_grid_search.best_score_
        stdout('Train dataset AUPR', clf.clf_grid_search.best_score_)

        # Train a classifier with the optimal hyperparameters using the
        # full training data
        clf.fit(X_train, y_train)

        # Compute performance on the test set
        y_pred = clf.predict(X_test)
        y_score = clf.predict_proba(X_test)
        stdout('Number of positive predictions', len(y_pred.nonzero()[0]))

        perf_trial = _Performance(y_test, y_score, y_pred)
        stdout('Test dataset')
        for pm in performance_metrics:
            pm_v = getattr(perf_trial, pm)
            stdout(pm, pm_v)
            perf[pm][iteration] = pm_v

        dummy = DummyClassifier().fit(X_train, y_train).score(X_test, y_test)
        perf['dummy'][iteration] = dummy

    # Performance across K-fold cross-validation
    def calculate_mean_std(metric):
        values = list(perf[metric].values())
        perf['metrics'][metric]['mean'] = np.mean(values)
        perf['metrics'][metric]['std'] = np.std(values)

    for pm in performance_metrics:
        perf['metrics'][pm] = {}
        calculate_mean_std(pm)

    return perf
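# Example usage of `cross_validation` (illustrative only; `X` and `y` below
# are random placeholders, not real embeddings or GO annotations):
#
#     X = np.random.rand(100, 64)              # feature matrix
#     y = np.random.randint(0, 2, (100, 10))   # multi-label annotations
#     perf = cross_validation(X, y, n_trials=3, clf_type='LRC',
#                             random_state=0)
#     print(perf['metrics']['m_AUPR'])          # {'mean': ..., 'std': ...}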
def main():
    ######################
    # Prepare filesystem #
    ######################
    directory_exists(models_path)
    mkdir(results_path)

    ###################
    # Load embeddings #
    ###################
    embeddings_file = glob.glob(os.path.join(models_path, '*.mat'))[0]
    model_name = os.path.splitext(os.path.basename(embeddings_file))[0]
    print(model_name)
    stdout('Loading embeddings', embeddings_file)
    embeddings = load_embeddings(embeddings_file)
    embeddings = minmax_scale(embeddings)

    #######################
    # Load GO annotations #
    #######################
    annotation_dir = os.path.join(data_path, 'annotations')
    if validation == 'cerevisiae':
        annotation_file = os.path.join(annotation_dir,
                                       'cerevisiae_annotations.mat')
    else:
        annotation_file = os.path.join(annotation_dir,
                                       'yeast_annotations.mat')
    stdout('Loading GO annotations', annotation_file)
    GO = sio.loadmat(annotation_file)

    ####################
    # Train classifier #
    ####################
    stdout('Running cross-validation for', level)
    annotations = GO[level]

    # Silence certain warning messages during cross-validation
    for w in (sklearn.exceptions.UndefinedMetricWarning, UserWarning,
              RuntimeWarning):
        warnings.filterwarnings("ignore", category=w)

    # Only use a subset of the data for testing purposes
    embeddings = embeddings[:test]
    annotations = annotations[:test]

    # performance = cross_validation(
    #     embeddings,
    #     annotations,
    #     n_trials=n_trials,
    #     n_jobs=n_jobs,
    #     n_threads=n_threads,
    #     random_state=random_state,
    #     clf_type=clf_type,
    #     max_depth=max_depth[level])

    performance = cross_validation(embeddings, annotations,
                                   n_trials=n_trials)

    performance['my_level'] = level
    pprint(performance)

    fout = f'{model_name}_{level}_{clf_type}_performance.json'
    with open(os.path.join(results_path, fout), 'w') as f:
        json.dump(performance, f)
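# NOTE: `load_embeddings` is imported from elsewhere in the package. Since
# the training scripts above save embeddings with
# sio.savemat(path, {'embeddings': embeddings}), a compatible sketch (an
# assumption, not the package's actual code) is:
def load_embeddings_sketch(path):
    '''Load an embeddings matrix saved under the 'embeddings' key.'''
    import scipy.io as sio
    return sio.loadmat(path)['embeddings']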
                        default=-1,
                        type=int,
                        help='Set sklearn random_state. If -1, then sklearn '
                             'uses the system randomness as a seed. If int, '
                             'then this number will be used as a seed.')
    parser.add_argument('--tags', default="", type=str)
    parser.add_argument('-j', '--n_jobs', default=1, type=int)
    parser.add_argument('-t', '--n_threads', default=1, type=int)
    parser.add_argument('-c', '--clf_type', default='LRC', type=str)
    parser.add_argument('--test', default=None, type=int,
                        help='If set, only the first `test` samples are '
                             'used (for testing purposes)')
    args = parser.parse_args()
    stdout("Command line arguments", args)

    org = args.organism
    models_path = os.path.expandvars(args.models_path)
    results_path = os.path.expandvars(args.results_path)
    data_path = os.path.expandvars(args.data_path)
    n_trials = args.n_trials
    tags = args.tags
    validation = args.validation
    n_jobs = args.n_jobs
    n_threads = args.n_threads
    random_state = args.random_state
    level = args.level
    clf_type = args.clf_type
    test = args.test
def main():
    # Prepare filesystem
    directory_exists(models_path)
    mkdir(results_path)

    # Load embeddings
    embeddings_file = glob.glob(os.path.join(models_path, '*.mat'))[0]
    model_name = os.path.splitext(
        os.path.basename(embeddings_file))[0].replace('_embeddings', '')
    stdout('Loading embeddings', embeddings_file)
    embeddings = load_embeddings(embeddings_file).astype('int32')

    # Load annotations
    annotation_dir = os.path.join(data_path, 'annotations')
    if validation == 'cerevisiae':
        annotation_file = os.path.join(annotation_dir,
                                       'cerevisiae_annotations.mat')
    else:
        annotation_file = os.path.join(annotation_dir,
                                       'yeast_annotations.mat')
    stdout('Loading GO annotations', annotation_file)
    annotation_file = sio.loadmat(annotation_file)

    # Train classifier
    stdout('Running cross-validation for', level)

    if validation == 'cv':
        if level in ('P', 'F', 'C'):
            annotations = np.hstack(
                [annotation_file[f'{level}_{i}'] for i in range(1, 4)])
        else:
            annotations = annotation_file[level]
    elif validation == 'cerevisiae':
        if level == 'all':
            annotations = np.hstack(
                [annotation_file[f'level{i}'] for i in range(1, 4)])
        else:
            annotations = annotation_file[level]

    annotations = annotations.astype('int32')

    # Silence certain warning messages during cross-validation
    for w in (sklearn.exceptions.UndefinedMetricWarning, UserWarning,
              RuntimeWarning):
        warnings.filterwarnings("ignore", category=w)

    # Remove genes with no annotations
    x = embeddings
    y = annotations
    del_rid = np.where(y.sum(axis=1) == 0)[0]
    x = np.delete(x, del_rid, axis=0)
    y = np.delete(y, del_rid, axis=0)

    # Set up CV
    performance_metrics = ('accuracy', 'm_AUPR', 'M_AUPR', 'f1')
    performance_repeats = defaultdict(dict)

    for repeat in range(1, repeats + 1):
        performance_repeats[f'repeat_{repeat}'] = defaultdict(dict)
        performance = performance_repeats[f'repeat_{repeat}']

        trials = ShuffleSplit(n_splits=n_trials, test_size=0.2,
                              random_state=random_state)
        iteration = 0

        # CV folds
        for train_idx, test_idx in trials.split(x):
            iteration += 1

            x_train = x[train_idx]
            x_test = x[test_idx]
            y_train = y[train_idx]
            y_test = y[test_idx]

            # Define the MLP architecture
            model = MLP(x_train, y_train)
            model.compile('adam', 'binary_crossentropy', ['acc'])

            # Train the model
            callbacks = [EarlyStopping(min_delta=0., patience=20),
                         ModelCheckpoint('best_model.h5',
                                         save_best_only=True)]
            history = model.fit(x_train, y_train, batch_size=batch_size,
                                epochs=200, validation_split=0.2,
                                shuffle=True, callbacks=callbacks,
                                verbose=2)

            performance['history'][iteration] = {}
            for tm in history.history:
                performance['history'][iteration][tm] = history.history[tm]

            # Read the best model from file (defined as the model that
            # minimizes the validation loss)
            model = load_model('best_model.h5')

            # Predict annotations
            y_score = model.predict(x_test)
            y_pred = y_score.copy()
            positive_threshold = .5
            y_pred[y_pred < positive_threshold] = 0
            y_pred[y_pred > 0] = 1

            performance_trial = _Performance(y_test, y_score, y_pred)
            for pm in performance_metrics:
                performance[pm][iteration] = getattr(performance_trial, pm)
                calculate_mean_std(performance, pm)

            dummy = DummyClassifier().fit(x_train, y_train) \
                                     .score(x_test, y_test)
            performance['dummy'][iteration] = dummy

        performance['level'] = level
        pprint(performance)

    # Save results and training history
    fout = f'{model_name}_{level}_{clf_type}'
    with open(os.path.join(results_path, f'{fout}.json'), 'w') as f:
        json.dump(performance_repeats, f)

    # Delete the best model file
    os.remove('best_model.h5')

    return None
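# NOTE: `MLP` is imported from elsewhere in the package. It is called as
# MLP(x_train, y_train) and compiled by the caller, so a minimal sketch of a
# compatible builder is shown below; the hidden layer sizes and dropout rate
# are assumptions, not the package's actual architecture.
def MLP_sketch(x_train, y_train, hidden_units=(512,), dropout=0.5):
    '''Build an (uncompiled) multi-label MLP sized to the data.'''
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    model = Sequential()
    model.add(Dense(hidden_units[0], activation='relu',
                    input_shape=(x_train.shape[1],)))
    model.add(Dropout(dropout))
    for units in hidden_units[1:]:
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(dropout))
    # Sigmoid output for multi-label GO term prediction
    model.add(Dense(y_train.shape[1], activation='sigmoid'))
    return model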
def test_string_object(self):
    stdout(self.s, self.o, file=self.f)
    assert self.f.getvalue() == 'string:\n object\n\n'
def test_string(self):
    stdout(self.s, file=self.f)
    assert self.f.getvalue() == 'string\n\n\n'
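# NOTE: the two tests above pin down the formatting behaviour of `stdout`:
# with two arguments it writes 'title:\n value\n\n', and with one argument it
# writes 'title\n\n\n'. A minimal implementation consistent with those
# assertions (a sketch, not necessarily the package's actual code) is:
import sys


def stdout_sketch(title, value=None, file=sys.stdout):
    '''Pretty-print a titled value followed by a blank line.'''
    if value is None:
        print(f'{title}\n', file=file)          # -> 'title\n\n\n'
    else:
        print(f'{title}:\n {value}\n', file=file)  # -> 'title:\n value\n\n'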