def main():
    """Run every pipeline stage in order, checkpointing after each one.

    Stages: parse user arguments, load the input files, build the consensus
    prediction table, add the mass-tolerance and inter-spectrum comparison
    columns, then hand the table to the classifier.  The parsed arguments are
    written as JSON and every intermediate table is pickled into
    ``config.iodir[0]`` under the names used below, so a developer can resume
    from any stage via ``utils.load_json_objects`` / ``utils.load_pkl_objects``
    with the same name.
    """
    start_time = time()

    def checkpoint(**named_objects):
        # The output directory is looked up on every call, exactly as each
        # stage did when the save calls were written inline.
        utils.save_pkl_objects(config.iodir[0], **named_objects)

    args = userargs.setup()
    utils.save_json_objects(config.iodir[0], args=args)

    # Loaded input tables, keyed by algorithm name (e.g. 'novor', 'pn').
    alg_basename_dfs_dict = input.load_files()
    checkpoint(alg_basename_dfs_dict=alg_basename_dfs_dict)

    prediction_df = consensus.make_prediction_df(alg_basename_dfs_dict)
    checkpoint(consensus_prediction_df=prediction_df)

    prediction_df = masstol.update_prediction_df(prediction_df)
    checkpoint(mass_tol_prediction_df=prediction_df)

    prediction_df = interspec.update_prediction_df(prediction_df)
    checkpoint(interspec_prediction_df=prediction_df)

    classifier.classify(prediction_df=prediction_df)

    elapsed = time() - start_time
    utils.verbose_print('total time elapsed:', elapsed)
def find_target_accuracy(prediction_df):
    """Label every de novo prediction as matching or not matching a reference.

    A prediction counts as a reference match when either
      * its sequence is a substring of the database-search PSM sequence for the
        same scan, or
      * the scan has no database-search PSM, the sequence is at least
        ``config.min_ref_match_len[0]`` long, and ``match_seq_to_fasta_ref``
        reports a hit against the fasta reference.

    Args:
        prediction_df: prediction table.  NOTE: its index is reset in place and
            helper columns are added to it before the name is rebound, so the
            caller's object is modified.

    Returns:
        Tuple ``(prediction_df, ref_correspondence_df, db_search_ref)``:
        the prediction table with a ``'ref match'`` column, indexed by
        ``config.is_alg_col_names + ['scan']``; a frame holding the three
        intermediate match columns; and the loaded database-search reference.
    """
    psm_flag_col = 'scan has db search PSM'
    db_match_col = 'de novo seq matches db search seq'
    fasta_match_col = 'correct de novo seq not found in db search'

    utils.verbose_print('loading', basename(config.db_search_ref_file[0]))
    db_search_ref = load_db_search_ref_file(config.db_search_ref_file[0])
    utils.verbose_print('loading', basename(config.fasta_ref_file[0]))
    fasta_ref = load_fasta_ref_file(config.fasta_ref_file[0])

    utils.verbose_print('finding sequence matches to database search reference')
    prediction_df.reset_index(inplace=True)
    comparison_df = prediction_df.merge(db_search_ref, how='left', on='scan')
    prediction_df[psm_flag_col] = comparison_df['ref seq'].notnull().astype(int)

    # Scans without a PSM get an empty reference string so the substring test
    # below is simply False.  fillna replaces the former chained boolean-mask
    # assignment, which pandas does not guarantee writes through.
    psm_seqs = comparison_df['ref seq'].fillna('').tolist()
    denovo_seqs = comparison_df['seq'].tolist()
    prediction_df[db_match_col] = [
        int(denovo_seq in psm_seq)
        for denovo_seq, psm_seq in zip(denovo_seqs, psm_seqs)
    ]

    utils.verbose_print('finding de novo sequence matches to fasta reference for scans lacking database search PSM')
    no_db_search_psm_df = prediction_df[prediction_df[psm_flag_col] == 0]
    # Explicit copy: a column is added to this filtered frame further down.
    no_db_search_psm_df = no_db_search_psm_df[
        no_db_search_psm_df['seq'].apply(len) >= config.min_ref_match_len[0]
    ].copy()
    unique_long_denovo_seqs = list(set(no_db_search_psm_df['seq']))

    # The minimum-length search itself (find_min_seq_len) is disabled; the
    # configured config.min_ref_match_len[0] is used as is.
    utils.verbose_print('finding minimum de novo sequence length to uniquely match fasta reference')

    one_percent_number_denovo_seqs = len(unique_long_denovo_seqs) / 100 / config.cores[0]
    single_var_match_seq = partial(
        match_seq_to_fasta_ref,
        fasta_ref=fasta_ref,
        one_percent_number_denovo_seqs=one_percent_number_denovo_seqs,
        cores=config.cores[0])
    multiprocessing_pool = Pool(config.cores[0])
    try:
        fasta_matches = multiprocessing_pool.map(single_var_match_seq, unique_long_denovo_seqs)
    finally:
        # Shut the workers down even when a worker raised.
        multiprocessing_pool.close()
        multiprocessing_pool.join()

    fasta_match_dict = dict(zip(unique_long_denovo_seqs, fasta_matches))
    single_var_get_match_from_dict = partial(get_match_from_dict, match_dict=fasta_match_dict)
    no_db_search_psm_df[fasta_match_col] = \
        no_db_search_psm_df['seq'].apply(single_var_get_match_from_dict)

    # Rows that were not checked against the fasta reference come back as NaN
    # from the left merge and are counted as "no match".
    prediction_df = prediction_df.merge(
        no_db_search_psm_df[fasta_match_col].to_frame(),
        left_index=True, right_index=True, how='left')
    prediction_df[fasta_match_col] = prediction_df[fasta_match_col].fillna(0)
    prediction_df['ref match'] = prediction_df[db_match_col] + prediction_df[fasta_match_col]

    prediction_df.set_index(
        config.is_alg_col_names + ['scan'] + config.frag_mass_tols
        + ['is longest consensus', 'is top rank consensus'],
        inplace=True)
    ref_correspondence_df = pd.concat(
        [prediction_df[psm_flag_col],
         prediction_df[db_match_col],
         prediction_df[fasta_match_col]],
        axis=1)
    prediction_df.drop([psm_flag_col, db_match_col, fasta_match_col], axis=1, inplace=True)
    prediction_df = prediction_df.reset_index().set_index(config.is_alg_col_names + ['scan'])

    return prediction_df, ref_correspondence_df, db_search_ref
def load_files():
    """Load every configured input file and group the results by algorithm.

    For each of ``config.novor_files``, ``config.peaks_files`` and
    ``config.pn_files`` that is non-empty, the result gets an entry
    ('novor', 'peaks', 'pn') mapping each file's basename to the object
    returned by the matching loader.  Novor files are additionally checked
    against the fragment mass tolerance at the same position in
    ``config.frag_mass_tols``, and the first Novor file is passed to
    ``find_precursor_mass_tol``.

    Returns:
        The nested ordered mapping after ``filter_shared_scans`` has been
        applied to it.
    """
    loaded_by_alg = OrderedDict()

    if config.novor_files:
        novor_tables = OrderedDict()
        loaded_by_alg['novor'] = novor_tables
        for position, novor_path in enumerate(config.novor_files):
            novor_name = basename(novor_path)
            utils.verbose_print('loading', novor_name)
            check_file_fragment_mass_tol(novor_path, config.frag_mass_tols[position])
            if position == 0:
                find_precursor_mass_tol(novor_path)
            novor_tables[novor_name] = load_novor_file(novor_path)

    if config.peaks_files:
        peaks_tables = OrderedDict()
        loaded_by_alg['peaks'] = peaks_tables
        for peaks_path in config.peaks_files:
            peaks_name = basename(peaks_path)
            utils.verbose_print('loading', peaks_name)
            peaks_tables[peaks_name] = load_peaks_file(peaks_path)

    if config.pn_files:
        pn_tables = OrderedDict()
        loaded_by_alg['pn'] = pn_tables
        for pn_path in config.pn_files:
            pn_name = basename(pn_path)
            utils.verbose_print('loading', pn_name)
            pn_tables[pn_name] = load_pn_file(pn_path)

    utils.verbose_print('cleaning up input data')
    return filter_shared_scans(loaded_by_alg)
def plot_errors(data_train_split, data_validation_split, target_train_split, target_validation_split, alg_key):
    """Plot out-of-bag error against forest size and save it as a PDF.

    A warm-started random forest (max_depth 15, all features, fixed seed) is
    refit on the training split at 10, 110, ..., 410 trees and its OOB error
    (``1 - oob_score_``) is recorded at each size.  The figure is written to
    ``<config.iodir[0]>/<alg_key joined by '_'>_error.pdf``.

    Args:
        data_train_split: training feature matrix.
        data_validation_split: unused; kept for the validation-error curve
            that is currently disabled.
        target_train_split: training labels.
        target_validation_split: unused, see ``data_validation_split``.
        alg_key: tuple of algorithm names identifying the model.
    """
    if len(alg_key) > 1:
        utils.verbose_print('plotting errors vs tree size for', '-'.join(alg_key), 'consensus sequences')
    else:
        utils.verbose_print('plotting errors vs tree size for', alg_key[0], 'sequences')

    # Only the max_features=None configuration is evaluated; further
    # (label, classifier) pairs can be added to compare settings.
    ensemble_clfs = [
        ('max_features=None',
         RandomForestClassifier(warm_start=True, max_features=None, oob_score=True,
                                max_depth=15, n_jobs=config.cores[0], random_state=1)),
    ]
    oob_errors = OrderedDict((label, []) for label, _ in ensemble_clfs)

    min_estimators = 10
    max_estimators = 500
    for label, clf in ensemble_clfs:
        for tree_number in range(min_estimators, max_estimators + 1, 100):
            # warm_start=True: raising n_estimators adds trees to the fitted
            # forest instead of refitting from scratch.
            clf.set_params(n_estimators=tree_number)
            clf.fit(data_train_split, target_train_split)
            oob_errors[label].append((tree_number, 1 - clf.oob_score_))

    fig, ax1 = plt.subplots()
    try:
        for label, oob_error in oob_errors.items():
            xs, ys = zip(*oob_error)
            ax1.plot(xs, ys, label='oob error: ' + label)

        ax1.set_xlim(min_estimators, max_estimators)
        ax1.set_xlabel('n_estimators')
        ax1.set_ylabel('error rate')
        ax1.legend(loc='upper right')
        fig.set_tight_layout(True)

        save_path = join(config.iodir[0], '_'.join(alg_key) + '_error.pdf')
        fig.savefig(save_path, bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed; without this one
        # figure leaks per algorithm group.
        plt.close(fig)
def make_forest_dict(train_target_arr_dict, rf_params):
    """Fit one random forest per algorithm group.

    Args:
        train_target_arr_dict: maps each algorithm-group key (a tuple of
            algorithm names) to a dict with at least 'train' (features) and
            'target' (labels).
        rf_params: maps the same keys to a dict with 'max_depth' and
            'max_features'.

    Returns:
        Dict mapping each algorithm-group key to its fitted
        RandomForestClassifier (``config.rf_n_estimators`` trees, OOB scoring
        enabled, ``config.cores[0]`` jobs).
    """
    fitted_forests = {}
    for alg_key in train_target_arr_dict:
        if len(alg_key) > 1:
            description = ('-'.join(alg_key), 'consensus sequences')
        else:
            description = (alg_key[0], 'sequences')
        utils.verbose_print('making random forest for', *description)

        group_arrays = train_target_arr_dict[alg_key]
        features = group_arrays['train']
        labels = group_arrays['target']
        group_params = rf_params[alg_key]

        classifier = RandomForestClassifier(
            n_estimators=config.rf_n_estimators,
            max_depth=group_params['max_depth'],
            max_features=group_params['max_features'],
            oob_score=True,
            n_jobs=config.cores[0])
        classifier.fit(features, labels)
        fitted_forests[alg_key] = classifier

    return fitted_forests
def make_predictions(prediction_df, db_search_ref=None):
    """Score every prediction with the stored forests and keep the best per scan.

    The pickled ``forest_dict`` is loaded from ``config.training_dir``.  For
    each algorithm group in ``config.is_alg_col_multiindex_keys`` the matching
    forest assigns a probability (the second ``predict_proba`` column) to each
    row of that group.  Per scan, the row with the highest probability is kept
    (first row on ties) and reported if it reaches ``config.min_prob[0]``.

    Args:
        prediction_df: prediction table indexed by the algorithm flag columns
            and 'scan'.  A 'probability' column is added to it in place.
        db_search_ref: database-search reference; only used in 'test' runs for
            the precision-yield plot, hence optional (the 'predict' run type
            has no reference to pass).

    Returns:
        DataFrame indexed by scan, restricted to the columns of
        ``config.reported_df_cols`` that are present, in that order.
    """
    run_type = config.run_type[0]
    forest_dict = utils.load_pkl_objects(config.training_dir, 'forest_dict')
    prediction_df['probability'] = np.nan

    for multiindex_key in config.is_alg_col_multiindex_keys:
        alg_group = tuple([alg for i, alg in enumerate(config.alg_list) if multiindex_key[i]])

        # Work on derived frames rather than dropping in place on the slice
        # returned by xs().
        alg_group_data = prediction_df.xs(multiindex_key)
        if run_type == 'predict':
            alg_group_data = alg_group_data.drop(['seq', 'probability'], axis=1)
        elif run_type == 'test':
            accuracy_labels = alg_group_data['ref match'].tolist()
            alg_group_data = alg_group_data.drop(['seq', 'ref match', 'probability'], axis=1)
        # Drop feature columns that do not apply to this algorithm group.
        alg_group_data = alg_group_data.dropna(axis=1)

        forest = forest_dict[alg_group]
        forest.n_jobs = config.cores[0]
        # .values replaces as_matrix(), which pandas 1.0 removed.
        probabilities = forest.predict_proba(alg_group_data.values)[:, 1]

        if run_type == 'test':
            utils.verbose_print('making', '_'.join(alg_group), 'test plots')
            plot_precision_recall_curve(accuracy_labels, probabilities, alg_group, alg_group_data)

        prediction_df.loc[multiindex_key, 'probability'] = probabilities

    if run_type == 'test':
        plot_precision_yield(prediction_df, db_search_ref)

    prediction_df = prediction_df.reset_index().set_index('scan')
    max_probabilities = prediction_df.groupby(level='scan')['probability'].transform(max)
    best_prediction_df = prediction_df[prediction_df['probability'] == max_probabilities]
    best_prediction_df = best_prediction_df.groupby(level='scan').first()
    reported_prediction_df = best_prediction_df[best_prediction_df['probability'] >= config.min_prob[0]]

    reported_cols_in_order = [
        reported_df_col for reported_df_col in config.reported_df_cols
        if reported_df_col in reported_prediction_df.columns
    ]
    # reindex(columns=...) replaces reindex_axis(..., axis=1), removed in pandas 1.0.
    reported_prediction_df = reported_prediction_df.reindex(columns=reported_cols_in_order)

    return reported_prediction_df
def make_training_forests(training_df):
    """Build the per-algorithm-group forests for a 'train' or 'optimize' run.

    'train' fits the forests with ``config.rf_default_params`` and plots the
    binned feature importances of each one.  'optimize' first searches for
    parameters with ``optimize_model`` and fits the forests with the result.

    Args:
        training_df: training table passed to ``make_train_target_arr_dict``.

    Returns:
        Dict of fitted forests keyed by algorithm group.  For any other run
        type no forests are built and the final ``return`` raises
        UnboundLocalError.
    """
    arrays_by_alg = make_train_target_arr_dict(training_df)

    if config.run_type[0] == 'train':
        forest_dict = make_forest_dict(arrays_by_alg, config.rf_default_params)

        # Diagnostic plotting, marked "REMOVE" by the original author.
        for alg_key in forest_dict:
            alg_arrays = arrays_by_alg[alg_key]
            # The split is still computed although the error-curve plot that
            # consumed it is disabled.
            _train_data, _validation_data, _train_target, _validation_target = \
                train_test_split(alg_arrays['train'],
                                 alg_arrays['target'],
                                 stratify=alg_arrays['target'])
            plot_binned_feature_importances(
                forest_dict[alg_key], alg_key, alg_arrays['feature_names'])

    elif config.run_type[0] == 'optimize':
        utils.verbose_print('optimizing random forest parameters')
        tuned_params = optimize_model(arrays_by_alg)
        forest_dict = make_forest_dict(arrays_by_alg, tuned_params)

    return forest_dict
def plot_binned_feature_importances(forest, alg_key, feature_names):
    """Plot feature importances summed per feature group and save as a PDF.

    Each group in ``config.feature_groups`` gets the sum of the importances of
    its member features.  The error bar is the square root of the summed
    per-feature variances across the forest's trees.  Bars are sorted by
    decreasing importance and the figure is written to
    ``<config.iodir[0]>/<alg_key joined by '_'>_binned_feature_importances.pdf``.

    Args:
        forest: fitted ensemble exposing ``feature_importances_`` and
            ``estimators_``.
        alg_key: tuple of algorithm names identifying the model.
        feature_names: feature names, in the forest's feature order.
    """
    if len(alg_key) > 1:
        utils.verbose_print('plotting feature importances for', '-'.join(alg_key), 'consensus sequences')
    else:
        utils.verbose_print('plotting feature importances for', alg_key[0], 'sequences')

    feature_importances = forest.feature_importances_
    # Gather the per-tree importances once (one row per tree) instead of
    # re-reading them from every tree for every feature of every group.
    per_tree_importances = np.array([tree.feature_importances_ for tree in forest.estimators_])
    per_feature_vars = np.var(per_tree_importances, axis=0)

    feature_group_importances = []
    feature_group_stds = []
    for features in config.feature_groups.values():
        feature_group_importance = 0.0
        feature_group_var = 0.0
        for i, feature_name in enumerate(feature_names):
            if feature_name in features:
                feature_group_importance += feature_importances[i]
                feature_group_var += per_feature_vars[i]
        feature_group_importances.append(feature_group_importance)
        feature_group_stds.append(np.sqrt(feature_group_var))

    feature_group_importances = np.array(feature_group_importances)
    feature_group_stds = np.array(feature_group_stds)
    indices = np.argsort(feature_group_importances)[::-1]

    fig, ax = plt.subplots()
    try:
        ax.set_title('Binned feature importances')
        x = np.arange(len(feature_group_importances))
        # Bar positions are passed positionally: the keyword was renamed from
        # `left` to `x` and the old spelling was later removed.
        ax.bar(x, feature_group_importances[indices],
               color='r', yerr=feature_group_stds[indices], width=0.9, align='center')
        ax.set_xticks(x)
        labels = np.array(list(config.feature_groups))[indices]
        ax.set_xticklabels(labels, rotation=-45, ha='left')
        ax.set_xlim([-1, len(feature_group_importances)])
        # `bottom` is the lasting name of the removed `ymin` keyword.
        ax.set_ylim(bottom=0)
        fig.set_tight_layout(True)

        save_path = join(config.iodir[0], '_'.join(alg_key) + '_binned_feature_importances.pdf')
        fig.savefig(save_path, bbox_inches='tight')
    finally:
        # Release the figure; pyplot otherwise keeps it alive.
        plt.close(fig)
def optimize_model(train_target_arr_dict):
    """Grid-search random forest parameters for every algorithm group.

    For each group the data is split with a stratified ``train_test_split``,
    and a ``GridSearchCV`` over ``max_features`` in ('sqrt', None) and
    ``max_depth`` in 11..19 is fitted on the training part.  The best
    estimator's settings are reported, and its feature-importance, binned
    feature-importance and error plots are produced.

    Args:
        train_target_arr_dict: maps each algorithm-group key to a dict with
            'train', 'target' and 'feature_names'.

    Returns:
        Dict mapping each algorithm-group key to
        ``{'max_depth': ..., 'max_features': ...}``.
    """
    search_space = {
        'max_features': ['sqrt', None],
        'max_depth': list(range(11, 20)),
    }

    optimized_params = {}
    for alg_key, group_arrays in train_target_arr_dict.items():
        train_x, validation_x, train_y, validation_y = train_test_split(
            group_arrays['train'],
            group_arrays['target'],
            stratify=group_arrays['target'])

        grid_search = GridSearchCV(
            RandomForestClassifier(n_estimators=config.rf_n_estimators, oob_score=True),
            search_space,
            n_jobs=config.cores[0])
        grid_search.fit(train_x, train_y)
        best_forest = grid_search.best_estimator_

        best_depth = best_forest.max_depth
        utils.verbose_print(alg_key, 'optimized max depth:', best_depth)
        best_max_features = best_forest.max_features
        utils.verbose_print(alg_key, 'optimized max features:', best_max_features)
        optimized_params[alg_key] = {
            'max_depth': best_depth,
            'max_features': best_max_features,
        }

        plot_feature_importances(best_forest, alg_key, group_arrays['feature_names'])
        plot_binned_feature_importances(best_forest, alg_key, group_arrays['feature_names'])
        plot_errors(train_x, validation_x, train_y, validation_y, alg_key)

    return optimized_params
def update_prediction_df(prediction_df):
    """Add one '<tol> seq match' column per fragment mass tolerance.

    With a single configured tolerance there is nothing to compare and the
    table is returned untouched.  Otherwise the predictions of every scan are
    compared across tolerances by ``make_mass_tol_match_array`` in a worker
    pool, and the per-scan result arrays are stacked and attached as new
    columns.

    Args:
        prediction_df: prediction table.  It is re-indexed and sorted in place
            before the returned table is derived from it.

    Returns:
        Prediction table indexed by ``config.is_alg_col_names + ['scan']`` and
        sorted by scan, then by the algorithm flag columns.
    """
    utils.verbose_print()

    tols = config.frag_mass_tols
    if len(tols) == 1:
        return prediction_df

    utils.verbose_print('setting up mass tolerance comparison')
    prediction_df.reset_index(inplace=True)
    # combo level = sum of the leading algorithm flag columns
    # ('is novor seq', 'is peaks seq', 'is pn seq')
    alg_count = len(config.alg_list)
    prediction_df['combo level'] = prediction_df.iloc[:, :alg_count].sum(axis=1)

    scans = sorted(set(prediction_df['scan']))
    scans_per_percent = len(scans) / 100 / config.cores[0]

    # One one-hot key per tolerance, with the hot position counted from the
    # end: the first tolerance maps to (0, ..., 0, 1).
    tol_count = len(tols)
    tol_group_keys = [
        tuple(1 if position == tol_count - 1 - tol_number else 0
              for position in range(tol_count))
        for tol_number in range(tol_count)
    ]

    # index: scan, then the tolerance flag columns ('0.2' -> '0.7')
    index_levels = ['scan'] + tols
    prediction_df.set_index(index_levels, inplace=True)
    # tolerance flags sort backwards: 0.7 predictions precede 0.2 within a scan
    prediction_df.sort_index(level=index_levels, inplace=True)
    scan_groups = prediction_df[['seq', 'combo level']].groupby(level='scan')

    worker_pool = Pool(
        config.cores[0],
        initializer=child_initialize,
        initargs=(scan_groups, tols, tol_group_keys, config.cores[0], scans_per_percent))
    utils.verbose_print('performing mass tolerance comparison')
    match_arrays = worker_pool.map(make_mass_tol_match_array, scans)
    worker_pool.close()
    worker_pool.join()

    match_col_names = [tol + ' seq match' for tol in tols]
    # fliplr undoes the reversed tolerance order of the per-scan arrays
    match_df = pd.DataFrame(
        np.fliplr(np.concatenate(match_arrays)),
        index=prediction_df.index,
        columns=match_col_names)

    result_df = pd.concat([prediction_df, match_df], axis=1)
    result_df = result_df.drop(['combo level'], axis=1)
    result_df = result_df.reset_index()
    result_df = result_df.set_index(config.is_alg_col_names + ['scan'])
    result_df = result_df.sort_index(level=['scan'] + config.is_alg_col_names)

    return result_df
def make_prediction_df(alg_basename_dfs_dict):
    """Build the consensus prediction table across all fragment mass tolerances.

    For every tolerance in ``config.frag_mass_tols`` that has more than one
    algorithm configured (``config.tol_alg_dict``), the per-algorithm input
    tables are passed to ``make_prediction_df_for_tol``; the per-tolerance
    results are then concatenated.

    Args:
        alg_basename_dfs_dict: nested mapping algorithm -> file basename ->
            input table.

    Returns:
        DataFrame indexed by ``config.is_alg_col_names + ['scan']``, sorted by
        scan then by the algorithm flag columns.  Missing tolerance and
        algorithm flags are filled with 0 (algorithm flags cast to int), and
        rows whose scan has no retention time at all are dropped.

    Raises:
        ValueError: from ``pd.concat`` when no tolerance has more than one
            algorithm, i.e. there is nothing to concatenate.
        UnboundLocalError: when ``config.run_type[0]`` is none of 'train',
            'optimize', 'predict', 'test'.
    """
    utils.verbose_print()

    if config.run_type[0] in ['train', 'optimize']:
        consensus_min_len = config.train_consensus_len
    elif config.run_type[0] in ['predict', 'test']:
        consensus_min_len = config.min_len[0]

    tol_prediction_df_list = []
    for tol in config.frag_mass_tols:
        utils.verbose_print('setting up', tol, 'Da consensus comparison')
        alg_compar_list = config.tol_alg_dict[tol]

        # A consensus needs at least two algorithms at this tolerance.
        if len(alg_compar_list) > 1:
            df_name_compar_list = config.tol_basenames_dict[tol]
            alg_df_dict = OrderedDict(
                [(alg, alg_basename_dfs_dict[alg][df_name_compar_list[i]])
                 for i, alg in enumerate(alg_compar_list)])

            tol_prediction_df = make_prediction_df_for_tol(consensus_min_len, alg_df_dict, tol)
            tol_prediction_df_list.append(tol_prediction_df)

    prediction_df = pd.concat(tol_prediction_df_list)

    # Give every row of a scan the scan's (maximum) retention time, then drop
    # scans that have none.
    grouped_by_scan = prediction_df.groupby(['scan'])
    prediction_df['retention time'] = grouped_by_scan['retention time'].transform(max)
    # Explicit copy so the column assignments below write to an independent frame.
    prediction_df = prediction_df[~prediction_df['retention time'].isnull()].copy()

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection, which is not guaranteed to update the frame.
    for tol in config.frag_mass_tols:
        prediction_df[tol] = prediction_df[tol].fillna(0)

    for is_alg_col_name in config.is_alg_col_names:
        prediction_df[is_alg_col_name] = prediction_df[is_alg_col_name].fillna(0).astype(int)

    prediction_df.set_index(config.is_alg_col_names + ['scan'], inplace=True)
    prediction_df.sort_index(level=['scan'] + config.is_alg_col_names, inplace=True)

    return prediction_df
def plot_feature_importances(forest, alg_key, feature_names):
    """Plot the forest's per-feature importances and save them as a PDF.

    Bars are sorted by decreasing importance.  Error bars are the standard
    deviation of each feature's importance across the forest's trees.  The
    figure is written to
    ``<config.iodir[0]>/<alg_key joined by '_'>_feature_importances.pdf``.

    Args:
        forest: fitted ensemble exposing ``feature_importances_`` and
            ``estimators_``.
        alg_key: tuple of algorithm names identifying the model.
        feature_names: feature names, in the forest's feature order.
    """
    if len(alg_key) > 1:
        utils.verbose_print('plotting feature importances for', '-'.join(alg_key), 'consensus sequences')
    else:
        utils.verbose_print('plotting feature importances for', alg_key[0], 'sequences')

    importances = forest.feature_importances_
    feature_std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]

    fig, ax = plt.subplots()
    try:
        ax.set_title('Feature importances')
        x = np.arange(len(importances))
        # Bar positions are passed positionally: the keyword was renamed from
        # `left` to `x` and the old spelling was later removed.
        ax.bar(x, importances[indices],
               color='r', yerr=feature_std[indices], width=0.9, align='center')
        ax.set_xticks(x)
        labels = np.array(feature_names)[indices]
        ax.set_xticklabels(labels, rotation=-45, ha='left')
        ax.set_xlim([-1, len(importances)])
        # `bottom` is the lasting name of the removed `ymin` keyword.
        ax.set_ylim(bottom=0)
        fig.set_tight_layout(True)

        save_path = join(config.iodir[0], '_'.join(alg_key) + '_feature_importances.pdf')
        fig.savefig(save_path, bbox_inches='tight')
    finally:
        # Release the figure; pyplot otherwise keeps it alive.
        plt.close(fig)
def classify(prediction_df=None):
    """Run the classification stage for the configured run type.

    * 'predict': score the predictions and write ``best_predictions.csv``.
    * 'test': as 'predict', plus reference matching; the reference
      correspondence columns are merged into the reported table before it is
      written.
    * 'train' / 'optimize': reference matching, then update the training data,
      fit the forests and pickle them to ``config.training_dir``.

    In every case the standardized prediction table is pickled to
    ``config.iodir[0]`` as 'prediction_df'.

    Args:
        prediction_df: prediction table produced by the earlier stages.
    """
    utils.verbose_print()
    run_type = config.run_type[0]

    if run_type in ['train', 'test', 'optimize']:
        prediction_df, ref_correspondence_df, db_search_ref = find_target_accuracy(prediction_df)

    utils.verbose_print('formatting data for compatability with model')
    prediction_df = standardize_prediction_df_cols(prediction_df)
    utils.save_pkl_objects(config.iodir[0], **{'prediction_df': prediction_df})

    if run_type == 'predict':
        # There is no database-search reference in a predict run; pass None
        # explicitly, because make_predictions takes the reference as its
        # second argument and the former one-argument call raised TypeError.
        reported_prediction_df = make_predictions(prediction_df, None)
        reported_prediction_df.to_csv(os.path.join(config.iodir[0], 'best_predictions.csv'))

    elif run_type == 'test':
        reported_prediction_df = make_predictions(prediction_df, db_search_ref)
        merge_cols = (config.is_alg_col_names + ['scan'] + config.frag_mass_tols
                      + ['is longest consensus', 'is top rank consensus'])
        reported_prediction_df = reported_prediction_df.reset_index().merge(
            ref_correspondence_df.reset_index(), how='left', on=merge_cols)
        reported_prediction_df.set_index('scan', inplace=True)

        reported_cols_in_order = [
            reported_df_col for reported_df_col in config.reported_df_cols
            if reported_df_col in reported_prediction_df.columns
        ]
        # reindex(columns=...) replaces reindex_axis(..., axis=1), removed in pandas 1.0.
        reported_prediction_df = reported_prediction_df.reindex(columns=reported_cols_in_order)
        reported_prediction_df.to_csv(os.path.join(config.iodir[0], 'best_predictions.csv'))

    elif run_type in ['train', 'optimize']:
        utils.verbose_print('updating training database')
        training_df = update_training_data(prediction_df)
        forest_dict = make_training_forests(training_df)
        utils.save_pkl_objects(config.training_dir, **{'forest_dict': forest_dict})
def update_prediction_df(prediction_df):
    """Add inter-spectrum agreement columns to the prediction table.

    Within every algorithm group and fragment mass tolerance, predictions are
    sorted by measured mass and clustered into "precursors": a new cluster
    starts when a mass exceeds the cluster's reference mass by more than the
    mass error.  ``make_precursor_info_array`` is then run per cluster in a
    worker pool, and the stacked results become the columns
    'precursor seq agreement' and 'precursor seq count'.

    Args:
        prediction_df: prediction table with 'seq' and 'measured mass'
            columns.  A 'mass error' column is added and the index is changed
            in place before the returned table is derived.

    Returns:
        Prediction table without 'measured mass' / 'mass error', indexed by
        ``config.is_alg_col_names + ['scan']`` and sorted by scan, then by the
        algorithm flag columns.
    """
    utils.verbose_print()
    utils.verbose_print('setting up inter-spectrum comparison')

    # config.precursor_mass_tol[0] is scaled by 1e-6, i.e. treated as ppm.
    prediction_df['mass error'] = prediction_df[
        'measured mass'] * config.precursor_mass_tol[0] * 10**-6
    prediction_df.reset_index(inplace=True)
    prediction_df.set_index(config.is_alg_col_names, inplace=True)

    # One one-hot key per tolerance: the i-th tolerance maps to a tuple with a
    # 1 at position i.
    tol_group_key_list = []
    for i, tol in enumerate(config.frag_mass_tols):
        tol_group_key = [0] * len(config.frag_mass_tols)
        tol_group_key[i] = 1
        tol_group_key_list.append(tuple(tol_group_key))

    full_precursor_array_list = []
    for multiindex_key in config.is_alg_col_multiindex_keys:
        alg_combo = '-'.join([
            alg for i, alg in enumerate(config.alg_list) if multiindex_key[i]
        ])
        alg_group_precursor_array_list = []
        alg_combo_df = prediction_df.xs(multiindex_key)
        alg_combo_df.reset_index(inplace=True)
        alg_combo_df.set_index(config.frag_mass_tols, inplace=True)

        for tol_group_key in tol_group_key_list:
            tol = config.frag_mass_tols[tol_group_key.index(1)]
            try:
                tol_df = alg_combo_df.xs(tol_group_key)[[
                    'seq', 'measured mass', 'mass error'
                ]]
            except TypeError:
                # Presumably the single-tolerance case, where the index has
                # one level and must be keyed by a scalar -- TODO confirm.
                tol_df = alg_combo_df.xs(
                    tol_group_key[0])[['seq', 'measured mass', 'mass error']]

            # DataFrame.sort was removed from pandas; sort_values is its
            # replacement and returns the same ordering.
            tol_df = tol_df.sort_values('measured mass')
            measured_masses = tol_df['measured mass'].tolist()
            mass_errors = tol_df['mass error'].tolist()

            # precursor indices represent different spectra clustered by mass
            precursor_indices = []
            precursor_index = 0
            # assign the first seq prediction to precursor 0
            precursor_indices.append(0)
            previous_mass = tol_df.iat[0, 1]

            # NOTE(review): mass_index counts from 0 over measured_masses[1:],
            # so mass_errors[mass_index] is the error of the preceding row,
            # not of `mass` itself -- confirm this is intended.
            for mass_index, mass in enumerate(measured_masses[1:]):
                if mass - mass_errors[mass_index] > previous_mass:
                    precursor_index += 1
                    previous_mass = mass
                precursor_indices.append(precursor_index)
            tol_df['precursor index'] = precursor_indices
            precursor_groups = tol_df.groupby('precursor index')

            # multiprocessing method
            precursor_range = range(precursor_indices[-1] + 1)
            one_percent_number_precursors = len(
                precursor_range) / 100 / config.cores[0]

            multiprocessing_pool = Pool(
                config.cores[0],
                initializer=child_initialize,
                initargs=(precursor_groups, config.cores[0],
                          one_percent_number_precursors))
            utils.verbose_print('performing inter-spectrum comparison for',
                                alg_combo + ',', tol, 'Da seqs')
            tol_group_precursor_array_list = multiprocessing_pool.map(
                make_precursor_info_array, precursor_range)
            multiprocessing_pool.close()
            multiprocessing_pool.join()

            alg_group_precursor_array_list += tol_group_precursor_array_list
        full_precursor_array_list += alg_group_precursor_array_list

    # NOTE(review): the stacked rows are attached using prediction_df's
    # current index, which assumes they come out in prediction_df's row
    # order -- verify against make_precursor_info_array.
    interspec_df = pd.DataFrame(
        np.concatenate(full_precursor_array_list),
        index=prediction_df.index,
        columns=['precursor seq agreement', 'precursor seq count'])
    # concatenate full array columnwise with prediction_df
    prediction_df = pd.concat([prediction_df, interspec_df], axis=1)

    prediction_df.drop(['measured mass', 'mass error'], axis=1, inplace=True)
    prediction_df.reset_index(inplace=True)
    prediction_df.set_index(config.is_alg_col_names + ['scan'], inplace=True)
    prediction_df.sort_index(level=['scan'] + config.is_alg_col_names, inplace=True)

    return prediction_df
def make_prediction_df_for_tol(consensus_min_len, alg_df_dict, tol):
    """Find consensus sequences between algorithms for one fragment mass tolerance.

    Args:
        consensus_min_len: minimum consensus length, passed to ``encode_seqs``
            and to the worker initializer.
        alg_df_dict: ordered mapping algorithm name -> input table for this
            tolerance.
        tol: the tolerance label; becomes a column set to 1 in the result.

    Returns:
        DataFrame with one row per prediction dict produced by
        ``make_scan_prediction_dicts`` over all consensus scans.

    Shapes of the intermediate structures, per the original author's examples:
        combo_level_alg_dict = odict(
            2: [('novor', 'peaks'), ('novor', 'pn'), ('peaks', 'pn')],
            3: [('novor', 'peaks', 'pn')])
        highest_level_alg_combo = ('novor', 'peaks', 'pn')
        scan_consensus_info_dict = odict(2: odict(
            ('novor', 'pn'): {
                'longest_cs': {'seq_starts': None, 'rank_sum': None,
                               'consensus_len': None, 'alg_ranks': None},
                'top_rank_cs': {... same keys ...}},
            ...), 3: odict(...))
        scan_generator_fns_dict = odict(2: odict(
            ('novor', 'peaks'): generator fn, ...))
        scan_common_substrings_info_dict = odict(2: odict(
            ('novor', 'peaks'): list of common substrings, ...))
        first_seq_second_seq_rank_comparisons_dict = odict(
            2: odict(('novor', 'peaks'): [((0,), (0,)), ..., ((0,), (19,))],
                     ...,
                     ('peaks', 'pn'): [((0,), (0,)), ..., ((19,), (19,))]),
            3: odict(('novor', 'peaks', 'pn'):
                     [((0, 0), (0,)), ..., ((0, 19), (19,))]))
        first_seq_second_seq_max_ranks_dict = odict(
            2: odict(('novor', 'peaks'): [1, 20], ('novor', 'pn'): [1, 20],
                     ('peaks', 'pn'): [20, 20]),
            3: odict(('novor', 'peaks', 'pn'): [1, 20, 20]))
        first_seq_second_seq_alg_positions_dict = odict(
            2: odict(('novor', 'peaks'): odict('novor': (0, 0), 'peaks': (1, 0)),
                     ...),
            3: odict(('novor', 'peaks', 'pn'):
                     odict('novor': (0, 0), 'peaks': (0, 1), 'pn': (1, 0))))
    """
    for input_df in alg_df_dict.values():
        encode_seqs(input_df, consensus_min_len)

    combo_level_alg_dict = make_combo_level_alg_dict(alg_df_dict)
    top_alg_combo = config.alg_combo_list[-1]

    alg_df_dict = add_measured_mass_col(alg_df_dict)
    source_df_dict = make_alg_consensus_source_df_dict(top_alg_combo, alg_df_dict)
    consensus_scans = make_consensus_scan_list(source_df_dict, top_alg_combo)
    scans_per_percent = len(consensus_scans) / 100 / config.cores[0]

    consensus_info, generator_fns, common_substrings_info = \
        setup_scan_info_dicts(combo_level_alg_dict)
    rank_comparisons, max_ranks, alg_positions = \
        make_first_seq_second_seq_comparisons_dicts(consensus_info)

    # Everything the workers need is handed over once, at pool start-up.
    worker_state = (
        source_df_dict,
        consensus_info,
        generator_fns,
        common_substrings_info,
        consensus_min_len,
        rank_comparisons,
        max_ranks,
        alg_positions,
        tol,
        config.cores[0],
        scans_per_percent,
    )
    worker_pool = Pool(config.cores[0], initializer=child_initialize, initargs=worker_state)
    utils.verbose_print('finding', tol, 'Da consensus sequences')
    per_scan_prediction_lists = worker_pool.map(make_scan_prediction_dicts, consensus_scans)
    worker_pool.close()
    worker_pool.join()

    # Each scan yields a list of prediction dicts; flatten to one list.
    prediction_dicts = []
    for scan_predictions in per_scan_prediction_lists:
        prediction_dicts.extend(scan_predictions)

    tol_prediction_df = pd.DataFrame.from_dict(prediction_dicts)
    tol_prediction_df[tol] = 1

    return tol_prediction_df
def subsample_training_data(prediction_df_orig):
    """Subsample the training rows of each algorithm group to at most ``config.subsample_size``.

    Groups with no more than ``config.subsample_size`` rows are kept whole.
    Larger groups are clustered with Birch on a reduced feature set, each
    cluster is assigned to an accuracy bin from its mean 'ref match', and rows
    are drawn per bin according to target sizes derived from a truncated
    normal distribution over the bins.

    Args:
        prediction_df_orig: prediction table.  A 'unique index' level is
            appended to its index in place.

    Returns:
        The selected rows of ``prediction_df_orig``, indexed by its original
        index levels (the 'unique index' level is removed again).
    """
    subsample_row_indices = []
    # Tag every row with a running number so selected rows can be located in
    # the original frame later.
    prediction_df_orig['unique index'] = [i for i in range(prediction_df_orig.shape[0])]
    prediction_df_orig.set_index('unique index', append = True, inplace = True)
    prediction_df = prediction_df_orig.copy()
    prediction_df.drop(['is top rank single alg', 'seq'], axis = 1, inplace = True)

    # Accuracy bins in descending order.
    accuracy_bins = sorted([round(x / config.subsample_accuracy_divisor, 1) for x in range(config.subsample_accuracy_divisor)], reverse = True)

    # Bin weights: the share of a normal distribution (mu_location, sigma),
    # truncated to [lower, upper], that falls into each of the equal-width
    # intervals of weight_bins.
    lower = config.subsample_accuracy_distribution_lower_bound
    upper = config.subsample_accuracy_distribution_upper_bound
    weight_bins = np.arange(lower, upper + (upper - lower) / config.subsample_accuracy_divisor, (upper - lower) / config.subsample_accuracy_divisor)
    sigma = config.subsample_accuracy_distribution_sigma
    mu_location = config.subsample_accuracy_distribution_mu_location
    accuracy_weights = (norm.cdf(weight_bins[1: 1 + config.subsample_accuracy_divisor], loc = mu_location, scale = sigma)
                        - norm.cdf(weight_bins[: config.subsample_accuracy_divisor], loc = mu_location, scale = sigma))\
        / (norm.cdf(upper, loc = mu_location, scale = sigma) - norm.cdf(lower, loc = mu_location, scale = sigma))
    accuracy_subsample_weights = {acc_bin: weight for acc_bin, weight in zip(accuracy_bins, accuracy_weights)}
    accuracy_subsample_sizes = {acc_bin: int(weight * config.subsample_size) for acc_bin, weight in accuracy_subsample_weights.items()}
    # int() truncates, so top up the first (highest) bin until the sizes add
    # up to the requested subsample size.
    while sum(accuracy_subsample_sizes.values()) != config.subsample_size:
        accuracy_subsample_sizes[accuracy_bins[0]] += 1

    for multiindex_key in config.is_alg_col_multiindex_keys:
        multiindex_list = list(multiindex_key)
        alg_group_df_key = tuple([alg for i, alg in enumerate(config.alg_list) if multiindex_key[i]])
        if sum(multiindex_key) == 1:
            utils.verbose_print('subsampling', alg_group_df_key[0], 'top-ranking sequences')
        else:
            utils.verbose_print('subsampling', '-'.join(alg_group_df_key), 'consensus sequences')

        alg_group_df = prediction_df.xs(multiindex_key)
        alg_group_unique_index = alg_group_df.index.get_level_values('unique index')
        alg_group_df.reset_index(inplace = True)
        alg_group_df.set_index(['scan'], inplace = True)
        # NOTE(review): positional axis argument (1 = columns) is legacy
        # pandas usage; newer pandas requires axis=1.
        alg_group_df.dropna(1, inplace = True)
        ref_match_col = alg_group_df['ref match'].copy()

        # Number of features kept for clustering: a per-combo-level factor
        # divided by the group's row count, but never below the configured
        # minimum.
        retained_features_target = round(config.clustering_feature_retention_factor_dict[sum(multiindex_key)] / alg_group_df.shape[0], 0)
        if retained_features_target < config.clustering_min_retained_features:
            retained_features_target = config.clustering_min_retained_features
        retained_features_list = []
        retained_feature_count = 0
        for feature in config.features_ordered_by_importance:
            if feature in alg_group_df.columns:
                retained_features_list.append(feature)
                retained_feature_count += 1
                if retained_feature_count == retained_features_target:
                    break
        alg_group_df = alg_group_df[retained_features_list]

        if alg_group_df.shape[0] > config.subsample_size:
            pipe = make_pipeline(StandardScaler(), Birch(threshold = config.clustering_birch_threshold, n_clusters = None))
            # NOTE(review): as_matrix() was removed in pandas 1.0; .values is
            # the replacement.
            cluster_assignments = pipe.fit_predict(alg_group_df.as_matrix())

            # Sum of 'ref match' per cluster.
            cluster_assignment_accuracies = zip(cluster_assignments, ref_match_col)
            sum_cluster_accuracies = {}.fromkeys(cluster_assignments, 0)
            for cluster, acc in cluster_assignment_accuracies:
                sum_cluster_accuracies[cluster] += acc
            cluster_counts = Counter(cluster_assignments)

            # Assign each cluster the accuracy bin closest to its mean
            # 'ref match', truncated to one decimal.
            mean_cluster_accuracies = {}.fromkeys(sum_cluster_accuracies, 0)
            for cluster in cluster_counts:
                mean_cluster_accuracies[cluster] = min(
                    accuracy_bins,
                    key = lambda accuracy_bin: abs(accuracy_bin - int(
                        sum_cluster_accuracies[cluster] * 10 / cluster_counts[cluster]) / 10))
            ordered_clusters_accuracies = sorted(
                mean_cluster_accuracies.items(),
                key = lambda cluster_accuracy_tuple: cluster_accuracy_tuple[1],
                reverse = True)

            # Positional row indices belonging to each cluster.
            cluster_assignments_row_indices = [(cluster, index) for index, cluster in enumerate(cluster_assignments)]
            cluster_row_indices_dict = {cluster: [] for cluster in mean_cluster_accuracies}
            for cluster, index in cluster_assignments_row_indices:
                cluster_row_indices_dict[cluster].append(index)

            # NOTE(review): the zip below pairs cluster_row_indices_dict.items()
            # (dict iteration order) with accuracies ordered by cluster label;
            # this presumes both orders agree -- confirm.
            cluster_accuracies_ordered_by_cluster = [cluster_acc_tuple[1] for cluster_acc_tuple in sorted(ordered_clusters_accuracies, key = lambda cluster_acc_tuple: cluster_acc_tuple[0])]
            cluster_accuracies_row_indices = [(x[1], x[0][1]) for x in sorted(
                zip(cluster_row_indices_dict.items(), cluster_accuracies_ordered_by_cluster),
                key = lambda cluster_acc_tuple: cluster_acc_tuple[1],
                reverse = True)]

            # Pool the row indices of all clusters that share an accuracy bin.
            accuracy_row_indices_dict = {acc: [] for acc in cluster_accuracies_ordered_by_cluster}
            for acc_row_indices_tuple in cluster_accuracies_row_indices:
                accuracy_row_indices_dict[acc_row_indices_tuple[0]] += acc_row_indices_tuple[1]

            alg_group_subsample_indices = []
            remaining_subsample_size = config.subsample_size
            remaining_accuracy_bins = [acc_bin for acc_bin in accuracy_bins]
            remaining_accuracy_subsample_sizes = {acc_bin: size for acc_bin, size in accuracy_subsample_sizes.items()}
            loop_remaining_accuracy_bins = [acc_bin for acc_bin in remaining_accuracy_bins]
            while remaining_subsample_size > 0:
                for acc_bin in loop_remaining_accuracy_bins:
                    if acc_bin not in accuracy_row_indices_dict:
                        # No rows in this bin: hand its whole quota to the
                        # remaining bins.
                        residual = remaining_accuracy_subsample_sizes[acc_bin]
                        remaining_accuracy_bins.remove(acc_bin)
                        remaining_accuracy_subsample_sizes[acc_bin] = 0
                        remaining_accuracy_subsample_sizes = redistribute_residual_subsample(
                            residual, remaining_accuracy_bins, accuracy_subsample_weights, remaining_accuracy_subsample_sizes)
                    elif remaining_accuracy_subsample_sizes[acc_bin] > len(accuracy_row_indices_dict[acc_bin]):
                        # Fewer rows than the quota: take them all and hand
                        # the shortfall to the remaining bins.
                        acc_bin_subsample_size = len(accuracy_row_indices_dict[acc_bin])
                        remaining_subsample_size -= acc_bin_subsample_size
                        residual = remaining_accuracy_subsample_sizes[acc_bin] - acc_bin_subsample_size
                        remaining_accuracy_bins.remove(acc_bin)
                        remaining_accuracy_subsample_sizes[acc_bin] = 0
                        alg_group_subsample_indices += accuracy_row_indices_dict[acc_bin]
                        remaining_accuracy_subsample_sizes = redistribute_residual_subsample(
                            residual, remaining_accuracy_bins, accuracy_subsample_weights, remaining_accuracy_subsample_sizes)
                    else:
                        # Enough rows: draw the quota at random without
                        # replacement.
                        alg_group_subsample_indices += np.random.choice(
                            accuracy_row_indices_dict[acc_bin],
                            remaining_accuracy_subsample_sizes[acc_bin],
                            replace = False).tolist()
                        remaining_subsample_size -= remaining_accuracy_subsample_sizes[acc_bin]
                        remaining_accuracy_subsample_sizes[acc_bin] = 0
                # NOTE(review): this aliases the list instead of copying it, so
                # later passes iterate remaining_accuracy_bins while the
                # branches above call .remove() on it -- confirm intended.
                loop_remaining_accuracy_bins = remaining_accuracy_bins

            scan_index = alg_group_df.index
            for i in alg_group_subsample_indices:
                subsample_row_indices.append(tuple(multiindex_list + [scan_index[i], alg_group_unique_index[i]]))

        else:
            # Small group: keep every row.
            for i, scan in enumerate(alg_group_df.index):
                subsample_row_indices.append(tuple(multiindex_list + [scan, alg_group_unique_index[i]]))

    # Pull the selected rows from the untouched original and strip the helper
    # index level again.
    subsampled_df = prediction_df_orig.loc[sorted(subsample_row_indices)]
    retained_multiindices = subsampled_df.index.names[:-1]
    subsampled_df.reset_index(inplace = True)
    subsampled_df.drop('unique index', axis = 1, inplace = True)
    subsampled_df.set_index(retained_multiindices, inplace = True)

    return subsampled_df