Example #1
def main():
    start_time = time()

    args = userargs.setup()
    utils.save_json_objects(config.iodir[0], **{'args': args})
    #args = utils.load_json_objects(r'C:\Users\Samuel\Documents\Visual Studio 2015\Projects\postnovo\test', 'args')
    #userargs.set_global_vars(args)

    alg_basename_dfs_dict = input.load_files()
    utils.save_pkl_objects(config.iodir[0],
                           **{'alg_basename_dfs_dict': alg_basename_dfs_dict})
    #alg_basename_dfs_dict = utils.load_pkl_objects(config.iodir[0], 'alg_basename_dfs_dict')
    ## example:
    ## alg_basename_dfs_dict = odict('novor': novor input df, 'pn': pn input df)

    prediction_df = consensus.make_prediction_df(alg_basename_dfs_dict)
    utils.save_pkl_objects(config.iodir[0],
                           **{'consensus_prediction_df': prediction_df})
    #prediction_df = utils.load_pkl_objects(config.iodir[0], 'consensus_prediction_df')

    prediction_df = masstol.update_prediction_df(prediction_df)
    utils.save_pkl_objects(config.iodir[0],
                           **{'mass_tol_prediction_df': prediction_df})
    #prediction_df = utils.load_pkl_objects(config.iodir[0], 'mass_tol_prediction_df')

    prediction_df = interspec.update_prediction_df(prediction_df)
    utils.save_pkl_objects(config.iodir[0],
                           **{'interspec_prediction_df': prediction_df})
    #prediction_df = utils.load_pkl_objects(config.iodir[0], 'interspec_prediction_df')

    classifier.classify(prediction_df=prediction_df)
    #classifier.classify()

    utils.verbose_print('total time elapsed:', time() - start_time)
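`utils.save_pkl_objects` and `utils.load_pkl_objects` checkpoint intermediate results between the pipeline stages above. They are not reproduced on this page; a minimal sketch of what such keyword-based pickle helpers could look like, assuming each named object simply round-trips through a <name>.pkl file in the given directory:

import os
import pickle

# Hypothetical sketch of the checkpoint helpers used above; the real
# postnovo utils module may compress or handle errors differently.
def save_pkl_objects(dir_path, **named_objects):
    # each keyword argument becomes one <name>.pkl file in dir_path
    for name, obj in named_objects.items():
        with open(os.path.join(dir_path, name + '.pkl'), 'wb') as f:
            pickle.dump(obj, f)

def load_pkl_objects(dir_path, *names):
    # returns one object for a single name, else a tuple in argument order
    objects = []
    for name in names:
        with open(os.path.join(dir_path, name + '.pkl'), 'rb') as f:
            objects.append(pickle.load(f))
    return objects[0] if len(objects) == 1 else tuple(objects)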
Example #2
def find_target_accuracy(prediction_df):
    utils.verbose_print('loading', basename(config.db_search_ref_file[0]))
    db_search_ref = load_db_search_ref_file(config.db_search_ref_file[0])
    utils.verbose_print('loading', basename(config.fasta_ref_file[0]))
    fasta_ref = load_fasta_ref_file(config.fasta_ref_file[0])

    utils.verbose_print('finding sequence matches to database search reference')

    prediction_df.reset_index(inplace = True)
    comparison_df = prediction_df.merge(db_search_ref, how = 'left', on = 'scan')
    prediction_df['scan has db search PSM'] = comparison_df['ref seq'].notnull().astype(int)
    comparison_df['ref seq'] = comparison_df['ref seq'].fillna('')
    denovo_seqs = comparison_df['seq'].tolist()
    psm_seqs = comparison_df['ref seq'].tolist()
    seq_pairs = list(zip(denovo_seqs, psm_seqs))
    # a match means the de novo seq is a substring of the reference PSM seq
    matches = [int(denovo_seq in psm_seq) for denovo_seq, psm_seq in seq_pairs]
    prediction_df['de novo seq matches db search seq'] = matches

    utils.verbose_print('finding de novo sequence matches to fasta reference for scans lacking database search PSM')

    no_db_search_psm_df = prediction_df[prediction_df['scan has db search PSM'] == 0]
    no_db_search_psm_df = no_db_search_psm_df[no_db_search_psm_df['seq'].apply(len) >= config.min_ref_match_len[0]].copy()
    unique_long_denovo_seqs = list(set(no_db_search_psm_df['seq']))

    utils.verbose_print('finding minimum de novo sequence length to uniquely match fasta reference')
    #config.min_ref_match_len[0] = find_min_seq_len(fasta_ref = fasta_ref, cores = config.cores[0])
    one_percent_number_denovo_seqs = len(unique_long_denovo_seqs) / 100 / config.cores[0]

    multiprocessing_pool = Pool(config.cores[0])
    single_var_match_seq = partial(match_seq_to_fasta_ref, fasta_ref = fasta_ref,
                                   one_percent_number_denovo_seqs = one_percent_number_denovo_seqs, cores = config.cores[0])
    fasta_matches = multiprocessing_pool.map(single_var_match_seq, unique_long_denovo_seqs)
    multiprocessing_pool.close()
    multiprocessing_pool.join()

    fasta_match_dict = dict(zip(unique_long_denovo_seqs, fasta_matches))
    single_var_get_match_from_dict = partial(get_match_from_dict, match_dict = fasta_match_dict)
    no_db_search_psm_df['correct de novo seq not found in db search'] = no_db_search_psm_df['seq'].apply(single_var_get_match_from_dict)
    prediction_df = prediction_df.merge(no_db_search_psm_df['correct de novo seq not found in db search'].to_frame(),
                                        left_index = True, right_index = True, how = 'left')
    prediction_df['correct de novo seq not found in db search'] = prediction_df['correct de novo seq not found in db search'].fillna(0)

    prediction_df['ref match'] = prediction_df['de novo seq matches db search seq'] +\
        prediction_df['correct de novo seq not found in db search']
    prediction_df.set_index(config.is_alg_col_names + ['scan'] + config.frag_mass_tols + ['is longest consensus', 'is top rank consensus'],
                            inplace = True)
    ref_correspondence_df = pd.concat([prediction_df['scan has db search PSM'],
                                       prediction_df['de novo seq matches db search seq'],
                                       prediction_df['correct de novo seq not found in db search']], axis = 1)
    prediction_df.drop(['scan has db search PSM',
                        'de novo seq matches db search seq',
                        'correct de novo seq not found in db search'], axis = 1, inplace = True)
    prediction_df = prediction_df.reset_index().set_index(config.is_alg_col_names + ['scan'])

    return prediction_df, ref_correspondence_df, db_search_ref
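`match_seq_to_fasta_ref` and `get_match_from_dict` are defined elsewhere in the module. A plausible sketch under the assumption that a fasta match simply means the de novo seq occurs as a substring of at least one reference protein (the real helper also uses the `one_percent_number_denovo_seqs` and `cores` arguments for progress reporting, and may normalize residues such as Leu/Ile):

# Hedged sketch only, not postnovo's actual implementation.
def match_seq_to_fasta_ref(denovo_seq, fasta_ref, one_percent_number_denovo_seqs=None, cores=1):
    # fasta_ref is assumed to be a list of reference protein sequence strings
    for protein_seq in fasta_ref:
        if denovo_seq in protein_seq:
            return 1
    return 0

def get_match_from_dict(seq, match_dict):
    # look up the precomputed 0/1 fasta match for a de novo seq
    return match_dict[seq]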
Example #3
def load_files():

    alg_basename_dfs_dict = OrderedDict()

    if config.novor_files:
        alg_basename_dfs_dict['novor'] = OrderedDict.fromkeys(
            [basename(novor_file) for novor_file in config.novor_files])
        for i, novor_file in enumerate(config.novor_files):
            utils.verbose_print('loading', basename(novor_file))
            check_file_fragment_mass_tol(novor_file, config.frag_mass_tols[i])
            if i == 0:
                find_precursor_mass_tol(novor_file)

            alg_basename_dfs_dict['novor'][basename(novor_file)] = load_novor_file(novor_file)
    
    if config.peaks_files:
        alg_basename_dfs_dict['peaks'] = OrderedDict.fromkeys(
            [basename(peaks_file) for peaks_file in config.peaks_files])
        for peaks_file in config.peaks_files:
            utils.verbose_print('loading', basename(peaks_file))
            alg_basename_dfs_dict['peaks'][basename(peaks_file)] = load_peaks_file(peaks_file)

    if config.pn_files:
        alg_basename_dfs_dict['pn'] = OrderedDict.fromkeys(
            [basename(pn_file) for pn_file in config.pn_files])
        for pn_file in config.pn_files:
            utils.verbose_print('loading', basename(pn_file))
            alg_basename_dfs_dict['pn'][basename(pn_file)] = load_pn_file(pn_file)

    utils.verbose_print('cleaning up input data')
    alg_basename_dfs_dict = filter_shared_scans(alg_basename_dfs_dict)

    return alg_basename_dfs_dict
Example #4
def plot_errors(data_train_split, data_validation_split, target_train_split, target_validation_split, alg_key):
    if len(alg_key) > 1:
        utils.verbose_print('plotting errors vs tree size for', '-'.join(alg_key), 'consensus sequences')
    else:
        utils.verbose_print('plotting errors vs tree size for', alg_key[0], 'sequences')

    ensemble_clfs = [
        #('max_features=\'sqrt\'',
        # RandomForestClassifier(warm_start = True, max_features = 'sqrt', oob_score = True, max_depth = 15, n_jobs = config.cores[0], random_state = 1)),
        ('max_features=None',
         RandomForestClassifier(warm_start = True, max_features = None, oob_score = True, max_depth = 15, n_jobs = config.cores[0], random_state = 1))
    ]

    oob_errors = OrderedDict((label, []) for label, _ in ensemble_clfs)
    #validation_errors = OrderedDict((label, []) for label, _ in ensemble_clfs)
    min_estimators = 10
    max_estimators = 500

    for label, clf in ensemble_clfs:
        for tree_number in range(min_estimators, max_estimators + 1, 100):
            clf.set_params(n_estimators = tree_number)
            clf.fit(data_train_split, target_train_split)

            oob_error = 1 - clf.oob_score_
            oob_errors[label].append((tree_number, oob_error))

            #validation_error = 1 - clf.score(data_validation_split, target_validation_split)
            #validation_errors[label].append((tree_number, validation_error))

    fig, ax1 = plt.subplots()
    for label, oob_error in oob_errors.items():
        xs, ys = zip(*oob_error)
        ax1.plot(xs, ys, label = 'oob error: ' + label)
    #for label, validation_error in validation_errors.items():
    #    xs, ys = zip(*validation_error)
    #    ax1.plot(xs, ys, label = 'validation error: ' + label)

    ax1.set_xlim(min_estimators, max_estimators)
    ax1.set_xlabel('n_estimators')
    ax1.set_ylabel('error rate')
    ax1.legend(loc = 'upper right')
    fig.set_tight_layout(True)

    alg_key_str = '_'.join(alg_key)
    save_path = join(config.iodir[0], alg_key_str + '_error.pdf')
    fig.savefig(save_path, bbox_inches = 'tight')
Example #5
def make_forest_dict(train_target_arr_dict, rf_params):

    forest_dict = dict.fromkeys(train_target_arr_dict)
    for alg_key in forest_dict:
        if len(alg_key) > 1:
            utils.verbose_print('making random forest for', '-'.join(alg_key), 'consensus sequences')
        else:
            utils.verbose_print('making random forest for', alg_key[0], 'sequences')

        train_data = train_target_arr_dict[alg_key]['train']
        target_data = train_target_arr_dict[alg_key]['target']
        forest = RandomForestClassifier(n_estimators = config.rf_n_estimators,
                                        max_depth = rf_params[alg_key]['max_depth'],
                                        max_features = rf_params[alg_key]['max_features'],
                                        oob_score = True,
                                        n_jobs = config.cores[0])
        forest.fit(train_data, target_data)
        forest_dict[alg_key] = forest

    return forest_dict
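`rf_params` maps each algorithm key to the hyperparameters of its forest. A hypothetical example of the expected shape (actual values live in `config.rf_default_params` or come from `optimize_model`):

# Hypothetical rf_params structure, for illustration only.
rf_params = {
    ('novor',): {'max_depth': 15, 'max_features': 'sqrt'},
    ('novor', 'pn'): {'max_depth': 16, 'max_features': None},
}
forest_dict = make_forest_dict(train_target_arr_dict, rf_params)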
Example #6
def make_predictions(prediction_df, db_search_ref = None):

    forest_dict = utils.load_pkl_objects(config.training_dir, 'forest_dict')

    prediction_df['probability'] = np.nan
    for multiindex_key in config.is_alg_col_multiindex_keys:
        alg_group = tuple([alg for i, alg in enumerate(config.alg_list) if multiindex_key[i]])

        alg_group_data = prediction_df.xs(multiindex_key)
        if config.run_type[0] == 'predict':
            alg_group_data.drop(['seq', 'probability'], axis = 1, inplace = True)
        elif config.run_type[0] == 'test':
            accuracy_labels = alg_group_data['ref match'].tolist()
            alg_group_data.drop(['seq', 'ref match', 'probability'], axis = 1, inplace = True)
        alg_group_data.dropna(axis = 1, inplace = True)
        forest_dict[alg_group].n_jobs = config.cores[0]
        probabilities = forest_dict[alg_group].predict_proba(alg_group_data.values)[:, 1]

        if config.run_type[0] == 'test':
            utils.verbose_print('making', '_'.join(alg_group), 'test plots')
            #plot_roc_curve(accuracy_labels, probabilities, alg_group, alg_group_data)
            plot_precision_recall_curve(accuracy_labels, probabilities, alg_group, alg_group_data)

        prediction_df.loc[multiindex_key, 'probability'] = probabilities

    if config.run_type[0] == 'test':
        plot_precision_yield(prediction_df, db_search_ref)

    prediction_df = prediction_df.reset_index().set_index('scan')
    max_probabilities = prediction_df.groupby(level = 'scan')['probability'].transform(max)
    best_prediction_df = prediction_df[prediction_df['probability'] == max_probabilities]
    best_prediction_df = best_prediction_df.groupby(level = 'scan').first()
    reported_prediction_df = best_prediction_df[best_prediction_df['probability'] >= config.min_prob[0]]
    
    reported_cols_in_order = [reported_df_col for reported_df_col in config.reported_df_cols
                              if reported_df_col in reported_prediction_df.columns]
    reported_prediction_df = reported_prediction_df.reindex(columns = reported_cols_in_order)

    return reported_prediction_df
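`plot_precision_recall_curve` is not reproduced on this page. A minimal sketch with scikit-learn's `precision_recall_curve`, assuming binary accuracy labels and the predicted probabilities computed above (the module-level `config` import is assumed, as throughout postnovo):

from os.path import join
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

# Minimal sketch, not the actual postnovo plotting code.
def plot_precision_recall_curve(accuracy_labels, probabilities, alg_group, alg_group_data=None):
    precision, recall, _ = precision_recall_curve(accuracy_labels, probabilities)
    fig, ax = plt.subplots()
    ax.plot(recall, precision)
    ax.set_xlabel('recall')
    ax.set_ylabel('precision')
    fig.set_tight_layout(True)
    save_path = join(config.iodir[0], '_'.join(alg_group) + '_precision_recall.pdf')
    fig.savefig(save_path, bbox_inches='tight')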
Example #7
def make_training_forests(training_df):

    train_target_arr_dict = make_train_target_arr_dict(training_df)
    
    if config.run_type[0] == 'train':
        forest_dict = make_forest_dict(train_target_arr_dict, config.rf_default_params)

        ## REMOVE
        for alg_key in forest_dict:
            data_train_split, data_validation_split, target_train_split, target_validation_split =\
                train_test_split(train_target_arr_dict[alg_key]['train'], train_target_arr_dict[alg_key]['target'], stratify = train_target_arr_dict[alg_key]['target'])
        #    #plot_feature_importances(forest_dict[alg_key], alg_key, train_target_arr_dict[alg_key]['feature_names'])
            plot_binned_feature_importances(forest_dict[alg_key], alg_key, train_target_arr_dict[alg_key]['feature_names'])
        #    plot_errors(data_train_split, data_validation_split, target_train_split, target_validation_split, alg_key)

    elif config.run_type[0] == 'optimize':
        utils.verbose_print('optimizing random forest parameters')
        optimized_params = optimize_model(train_target_arr_dict)
        forest_dict = make_forest_dict(train_target_arr_dict, optimized_params)

    return forest_dict
Example #8
def plot_binned_feature_importances(forest, alg_key, feature_names):
    if len(alg_key) > 1:
        utils.verbose_print('plotting feature importances for', '-'.join(alg_key), 'consensus sequences')
    else:
        utils.verbose_print('plotting feature importances for', alg_key[0], 'sequences')

    feature_importances = forest.feature_importances_
    feature_group_importances = []
    feature_group_stds = []
    for feature_group, features in config.feature_groups.items():
        feature_group_importance = 0.0
        feature_group_var = 0.0
        for i, feature_name in enumerate(feature_names):
            if feature_name in features:
                feature_group_importance += feature_importances[i]
                feature_group_var += np.var(
                    [tree.feature_importances_[i] for tree in forest.estimators_], axis = 0)
        feature_group_importances.append(feature_group_importance)
        feature_group_stds.append(np.sqrt(feature_group_var))

    feature_group_importances = np.array(feature_group_importances)
    feature_group_stds = np.array(feature_group_stds)
    indices = np.argsort(feature_group_importances)[::-1]

    fig, ax = plt.subplots()
    ax.set_title('Binned feature importances')
    x = np.arange(len(feature_group_importances))
    ax.bar(x, height = feature_group_importances[indices], color = 'r', yerr = feature_group_stds[indices], width = 0.9, align = 'center')
    ax.set_xticks(x)
    labels = np.array(list(config.feature_groups))[indices]
    ax.set_xticklabels(labels, rotation = -45, ha = 'left')
    ax.set_xlim([-1, len(feature_group_importances)])
    ax.set_ylim(bottom = 0)
    fig.set_tight_layout(True)

    alg_key_str = '_'.join(alg_key)
    save_path = join(config.iodir[0], alg_key_str + '_binned_feature_importances.pdf')
    fig.savefig(save_path, bbox_inches = 'tight')
Example #9
def optimize_model(train_target_arr_dict):

    optimized_params = {}
    for alg_key in train_target_arr_dict:
        optimized_params[alg_key] = {}

        data_train_split, data_validation_split, target_train_split, target_validation_split =\
            train_test_split(train_target_arr_dict[alg_key]['train'], train_target_arr_dict[alg_key]['target'], stratify = train_target_arr_dict[alg_key]['target'])
        forest_grid = GridSearchCV(RandomForestClassifier(n_estimators = config.rf_n_estimators, oob_score = True),
                                   {'max_features': ['sqrt', None], 'max_depth': list(range(11, 20))},
                                   n_jobs = config.cores[0])
        forest_grid.fit(data_train_split, target_train_split)
        optimized_forest = forest_grid.best_estimator_
        optimized_params[alg_key]['max_depth'] = optimized_forest.max_depth
        utils.verbose_print(alg_key, 'optimized max depth:', optimized_forest.max_depth)
        optimized_params[alg_key]['max_features'] = optimized_forest.max_features
        utils.verbose_print(alg_key, 'optimized max features:', optimized_forest.max_features)

        plot_feature_importances(optimized_forest, alg_key, train_target_arr_dict[alg_key]['feature_names'])
        plot_binned_feature_importances(optimized_forest, alg_key, train_target_arr_dict[alg_key]['feature_names'])
        plot_errors(data_train_split, data_validation_split, target_train_split, target_validation_split, alg_key)

    return optimized_params
Example #10
def update_prediction_df(prediction_df):
    utils.verbose_print()

    if len(config.frag_mass_tols) == 1:
        return prediction_df

    utils.verbose_print('setting up mass tolerance comparison')
    prediction_df.reset_index(inplace=True)
    # combo level col = sum of 'is novor seq', 'is peaks seq', 'is pn seq' values
    prediction_df['combo level'] = prediction_df.iloc[:, :len(config.alg_list)].sum(axis=1)
    scan_list = sorted(list(set(prediction_df['scan'])))
    one_percent_number_scans = len(scan_list) / 100 / config.cores[0]
    tol_group_key_list = []
    for i, tol in enumerate(config.frag_mass_tols):
        tol_group_key = [0] * len(config.frag_mass_tols)
        tol_group_key[-(i + 1)] = 1
        tol_group_key_list.append(tuple(tol_group_key))
    # set index as scan, '0.2' -> '0.7', combo level
    prediction_df.set_index(['scan'] + config.frag_mass_tols, inplace=True)
    # tol list indices are sorted backwards: 0.7 predictions come before 0.2 in scan group
    prediction_df.sort_index(level=['scan'] + config.frag_mass_tols,
                             inplace=True)
    mass_tol_compar_df = prediction_df[['seq', 'combo level']]
    scan_groups = mass_tol_compar_df.groupby(level='scan')

    # single processor method
    #child_initialize(scan_groups, config.frag_mass_tols, tol_group_key_list)
    #tol_match_array_list = []
    #utils.verbose_print('performing mass tolerance comparison')
    #for scan in scan_list:
    #    tol_match_array_list.append(make_mass_tol_match_array(scan))

    multiprocessing_pool = Pool(config.cores[0],
                                initializer=child_initialize,
                                initargs=(scan_groups, config.frag_mass_tols,
                                          tol_group_key_list, config.cores[0],
                                          one_percent_number_scans))
    utils.verbose_print('performing mass tolerance comparison')
    tol_match_array_list = multiprocessing_pool.map(make_mass_tol_match_array,
                                                    scan_list)
    multiprocessing_pool.close()
    multiprocessing_pool.join()

    tol_match_cols = [tol + ' seq match' for tol in config.frag_mass_tols]
    tol_match_df = pd.DataFrame(np.fliplr(np.concatenate(tol_match_array_list)),
                                index=prediction_df.index,
                                columns=tol_match_cols)
    prediction_df = pd.concat([prediction_df, tol_match_df], axis=1)
    prediction_df.drop(['combo level'], axis=1, inplace=True)
    prediction_df.reset_index(inplace=True)
    prediction_df.set_index(config.is_alg_col_names + ['scan'], inplace=True)
    prediction_df.sort_index(level=['scan'] + config.is_alg_col_names,
                             inplace=True)

    return prediction_df
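`child_initialize` and `make_mass_tol_match_array` are defined elsewhere in the module. The initializer relies on the standard multiprocessing pattern of copying shared, read-only data into worker-process globals so the mapped function can reach it; a schematic sketch matching the initargs above (the real function may differ in detail):

# Schematic of the Pool initializer pattern assumed above.
def child_initialize(_scan_groups, _frag_mass_tols, _tol_group_key_list,
                     _cores=1, _one_percent_number_scans=None):
    # stash shared objects in module globals for each worker process
    global scan_groups, frag_mass_tols, tol_group_key_list
    global cores, one_percent_number_scans
    scan_groups = _scan_groups
    frag_mass_tols = _frag_mass_tols
    tol_group_key_list = _tol_group_key_list
    cores = _cores
    one_percent_number_scans = _one_percent_number_scans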
Example #11
def make_prediction_df(alg_basename_dfs_dict):
    utils.verbose_print()

    if config.run_type[0] in ['train', 'optimize']:
        consensus_min_len = config.train_consensus_len
    elif config.run_type[0] in ['predict', 'test']:
        consensus_min_len = config.min_len[0]

    tol_prediction_df_list = []
    for tol in config.frag_mass_tols:
        utils.verbose_print('setting up', tol, 'Da consensus comparison')
        alg_compar_list = config.tol_alg_dict[tol]

        if len(alg_compar_list) > 1:
            df_name_compar_list = config.tol_basenames_dict[tol]
            alg_df_dict = OrderedDict([(alg, alg_basename_dfs_dict[alg][df_name_compar_list[i]])
                                       for i, alg in enumerate(alg_compar_list)])

            tol_prediction_df = make_prediction_df_for_tol(consensus_min_len, alg_df_dict, tol)
            tol_prediction_df_list.append(tol_prediction_df)

    prediction_df = pd.concat(tol_prediction_df_list)
    grouped_by_scan = prediction_df.groupby(['scan'])
    prediction_df['retention time'] = grouped_by_scan['retention time'].transform(max)
    prediction_df = prediction_df[~prediction_df['retention time'].isnull()]

    for tol in config.frag_mass_tols:
        prediction_df[tol] = prediction_df[tol].fillna(0)

    for is_alg_col_name in config.is_alg_col_names:
        prediction_df[is_alg_col_name] = prediction_df[is_alg_col_name].fillna(0).astype(int)
    prediction_df.set_index(config.is_alg_col_names + ['scan'], inplace = True)
    prediction_df.sort_index(level = ['scan'] + config.is_alg_col_names, inplace = True)

    return prediction_df
Example #12
def plot_feature_importances(forest, alg_key, feature_names):
    if len(alg_key) > 1:
        utils.verbose_print('plotting feature importances for', '-'.join(alg_key), 'consensus sequences')
    else:
        utils.verbose_print('plotting feature importances for', alg_key[0], 'sequences')

    importances = forest.feature_importances_
    feature_std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis = 0)
    indices = np.argsort(importances)[::-1]
    
    fig, ax = plt.subplots()
    ax.set_title('Feature importances')
    x = np.arange(len(importances))
    ax.bar(x, height = importances[indices], color = 'r', yerr = feature_std[indices], width = 0.9, align = 'center')
    ax.set_xticks(x)
    labels = np.array(feature_names)[indices]
    ax.set_xticklabels(labels, rotation = -45, ha = 'left')
    ax.set_xlim([-1, len(importances)])
    ax.set_ylim(bottom = 0)
    fig.set_tight_layout(True)

    alg_key_str = '_'.join(alg_key)
    save_path = join(config.iodir[0], alg_key_str + '_feature_importances.pdf')
    fig.savefig(save_path, bbox_inches = 'tight')
Example #13
def classify(prediction_df = None):
    utils.verbose_print()

    if config.run_type[0] in ['train', 'test', 'optimize']:
        prediction_df, ref_correspondence_df, db_search_ref = find_target_accuracy(prediction_df)

    utils.verbose_print('formatting data for compatibility with model')
    prediction_df = standardize_prediction_df_cols(prediction_df)
    utils.save_pkl_objects(config.iodir[0], **{'prediction_df': prediction_df})
    #prediction_df = utils.load_pkl_objects(config.iodir[0], 'prediction_df')

    if config.run_type[0] == 'predict':
        reported_prediction_df = make_predictions(prediction_df)
        reported_prediction_df.to_csv(os.path.join(config.iodir[0], 'best_predictions.csv'))

    elif config.run_type[0] == 'test':
        reported_prediction_df = make_predictions(prediction_df, db_search_ref)
        reported_prediction_df = reported_prediction_df.reset_index().\
            merge(ref_correspondence_df.reset_index(),
                  how = 'left',
                  on = config.is_alg_col_names + ['scan'] + config.frag_mass_tols + ['is longest consensus', 'is top rank consensus'])
        reported_prediction_df.set_index('scan', inplace = True)
        reported_cols_in_order = [reported_df_col for reported_df_col in config.reported_df_cols
                                  if reported_df_col in reported_prediction_df.columns]
        reported_prediction_df = reported_prediction_df.reindex(columns = reported_cols_in_order)
        reported_prediction_df.to_csv(os.path.join(config.iodir[0], 'best_predictions.csv'))
    
    elif config.run_type[0] in ['train', 'optimize']:
        
        #subsampled_df = subsample_training_data(prediction_df)
        #utils.save_pkl_objects(config.iodir[0], **{'subsampled_df': subsampled_df})
        #subsampled_df = utils.load_pkl_objects(config.iodir[0], 'subsampled_df')

        utils.verbose_print('updating training database')
        training_df = update_training_data(prediction_df)
        #training_df = utils.load_pkl_objects(config.training_dir, 'training_df')

        forest_dict = make_training_forests(training_df)
        utils.save_pkl_objects(config.training_dir, **{'forest_dict': forest_dict})
Example #14
def update_prediction_df(prediction_df):
    utils.verbose_print()

    utils.verbose_print('setting up inter-spectrum comparison')
    prediction_df['mass error'] = prediction_df['measured mass'] * config.precursor_mass_tol[0] * 10**-6
    prediction_df.reset_index(inplace=True)
    prediction_df.set_index(config.is_alg_col_names, inplace=True)
    tol_group_key_list = []
    for i, tol in enumerate(config.frag_mass_tols):
        tol_group_key = [0] * len(config.frag_mass_tols)
        tol_group_key[i] = 1
        tol_group_key_list.append(tuple(tol_group_key))
    full_precursor_array_list = []

    for multiindex_key in config.is_alg_col_multiindex_keys:
        alg_combo = '-'.join([
            alg for i, alg in enumerate(config.alg_list) if multiindex_key[i]
        ])

        alg_group_precursor_array_list = []
        alg_combo_df = prediction_df.xs(multiindex_key)
        alg_combo_df.reset_index(inplace=True)
        alg_combo_df.set_index(config.frag_mass_tols, inplace=True)

        for tol_group_key in tol_group_key_list:
            tol = config.frag_mass_tols[tol_group_key.index(1)]

            try:
                tol_df = alg_combo_df.xs(tol_group_key)[['seq', 'measured mass', 'mass error']].copy()
            except TypeError:
                tol_df = alg_combo_df.xs(tol_group_key[0])[['seq', 'measured mass', 'mass error']].copy()
            tol_df.sort_values('measured mass', inplace=True)
            measured_masses = tol_df['measured mass'].tolist()
            mass_errors = tol_df['mass error'].tolist()

            # precursor indices represent different spectra clustered by mass
            precursor_indices = []
            precursor_index = 0
            # assign the first seq prediction to precursor 0
            precursor_indices.append(0)
            previous_mass = tol_df.iat[0, 1]

            for mass_index, mass in enumerate(measured_masses[1:]):
                if mass - mass_errors[mass_index] > previous_mass:
                    precursor_index += 1
                    previous_mass = mass
                precursor_indices.append(precursor_index)
            tol_df['precursor index'] = precursor_indices
            precursor_groups = tol_df.groupby('precursor index')

            ## single processor method
            #tol_group_precursor_array_list = []
            #utils.verbose_print('performing inter-spectrum comparison for', alg_combo + ',', tol, 'Da seqs')
            #for precursor_index in range(precursor_indices[-1] + 1):
            #    child_initialize(precursor_groups)
            #    tol_group_precursor_array_list.append(make_precursor_info_array(precursor_index))

            ## multiprocessing method
            precursor_range = range(precursor_indices[-1] + 1)
            one_percent_number_precursors = len(precursor_range) / 100 / config.cores[0]
            multiprocessing_pool = Pool(
                config.cores[0],
                initializer=child_initialize,
                initargs=(precursor_groups, config.cores[0],
                          one_percent_number_precursors))
            utils.verbose_print('performing inter-spectrum comparison for',
                                alg_combo + ',', tol, 'Da seqs')
            tol_group_precursor_array_list = multiprocessing_pool.map(
                make_precursor_info_array, precursor_range)
            multiprocessing_pool.close()
            multiprocessing_pool.join()

            alg_group_precursor_array_list += tol_group_precursor_array_list
        full_precursor_array_list += alg_group_precursor_array_list
    interspec_df = pd.DataFrame(
        np.concatenate(full_precursor_array_list),
        index=prediction_df.index,
        columns=['precursor seq agreement', 'precursor seq count'])
    # concatenate full array columnwise with prediction_df
    prediction_df = pd.concat([prediction_df, interspec_df], axis=1)

    prediction_df.drop(['measured mass', 'mass error'], axis=1, inplace=True)
    prediction_df.reset_index(inplace=True)
    prediction_df.set_index(config.is_alg_col_names + ['scan'], inplace=True)
    prediction_df.sort_index(level=['scan'] + config.is_alg_col_names,
                             inplace=True)

    return prediction_df
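`make_precursor_info_array` is not shown. Given the two output columns, a plausible sketch is that each seq in a precursor cluster is scored by its agreement with the other seqs of that cluster, alongside the cluster size; substring agreement is an assumption here, not confirmed by the source:

import numpy as np

# Hedged sketch: the real function likely uses a different agreement metric.
def make_precursor_info_array(precursor_index):
    # precursor_groups is a global set by the Pool initializer (child_initialize)
    group = precursor_groups.get_group(precursor_index)
    seqs = group['seq'].tolist()
    n = len(seqs)
    rows = []
    for seq in seqs:
        if n > 1:
            # subtract 1 to exclude the seq's trivial match with itself
            agreement = (sum((seq in other) or (other in seq) for other in seqs) - 1) / (n - 1)
        else:
            agreement = 0.0
        rows.append([agreement, n])
    return np.array(rows)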
Example #15
def make_prediction_df_for_tol(consensus_min_len, alg_df_dict, tol):

    for df in alg_df_dict.values():
        encode_seqs(df, consensus_min_len)

    combo_level_alg_dict = make_combo_level_alg_dict(alg_df_dict)
    highest_level_alg_combo = config.alg_combo_list[-1]
    ## examples
    ## combo_level_alg_dict = odict(2: [('novor', 'peaks'), ('novor', 'pn'), ('peaks', 'pn')], 3: [('novor', 'peaks', 'pn')])
    ## highest_level_alg_combo = ('novor', 'peaks', 'pn')

    alg_df_dict = add_measured_mass_col(alg_df_dict)
    alg_consensus_source_df_dict = make_alg_consensus_source_df_dict(highest_level_alg_combo, alg_df_dict)
    consensus_scan_list = make_consensus_scan_list(alg_consensus_source_df_dict, highest_level_alg_combo)
    one_percent_number_consensus_scans = len(consensus_scan_list) / 100 / config.cores[0]

    scan_consensus_info_dict, scan_generator_fns_dict, scan_common_substrings_info_dict = setup_scan_info_dicts(combo_level_alg_dict)
    ## examples
    ## scan_consensus_info_dict = odict(2: odict(
    ##    ('novor', 'pn'):
    ##    {'longest_cs': {'seq_starts': None, 'rank_sum': None, 'consensus_len': None, 'alg_ranks': None},
    ##     'top_rank_cs': {'seq_starts': None, 'rank_sum': None, 'consensus_len': None, 'alg_ranks': None}
    ##    }, ...), 3: odict(...))
    ## scan_generator_fns_dict = odict(2: odict(
    ##    ('novor', 'peaks'): generator fn,
    ##    ('novor', 'pn'): generator fn,
    ##    ('peaks', 'pn'): generator fn))
    ## scan_common_substrings_info_dict = odict(2: odict(
    ##    ('novor', 'peaks'): list of common substrings,
    ##    ('novor', 'pn'): list of common substrings,
    ##    ('peaks', 'pn'): list of common substrings)

    first_seq_second_seq_rank_comparisons_dict, first_seq_second_seq_max_ranks_dict, first_seq_second_seq_alg_positions_dict =\
        make_first_seq_second_seq_comparisons_dicts(scan_consensus_info_dict)
    ## examples
    ## first_seq_second_seq_rank_comparisons_dict = odict(2: odict(
    ##    ('novor', 'peaks'): [((0,), (0,)), ((0,), (1,)), ..., ((0,), (18,)), ((0,), (19,))],
    ##    ...,
    ##    ('peaks', 'pn'): [((0,), (0,)), ((0,), (1,)), ..., ((19,), (18,)), ((19,), (19,))]),
    ##    3: odict(
    ##    ('novor', 'peaks', 'pn'): [((0, 0), (0,)), ((0, 0), (1,)), ..., ((0, 19), (18,)), ((0, 19), (19,))]))
    ## first_seq_second_seq_max_ranks_dict = odict(2: odict(
    ##    ('novor', 'peaks'): [1, 20],
    ##    ('novor', 'pn'): [1, 20],
    ##    ('peaks', 'pn'): [20, 20]),
    ##    3: odict(
    ##    ('novor', 'peaks', 'pn'): [1, 20, 20]))
    ## first_seq_second_seq_alg_positions_dict = odict(2: odict(
    ##    ('novor', 'peaks'): odict('novor': (0, 0), 'peaks': (1, 0)),
    ##    ('novor', 'pn'): odict('novor': (0, 0), 'pn': (1, 0))
    ##    ('peaks', 'pn'): odict('peaks': (0, 0), 'pn': (1, 0))),
    ##    3: odict(
    ##    ('novor', 'peaks', 'pn'): odict('novor': (0, 0), 'peaks': (0, 1), 'pn': (1, 0))))

    # multiprocessing method
    multiprocessing_pool = Pool(config.cores[0],
                                initializer = child_initialize,
                                initargs = (alg_consensus_source_df_dict, scan_consensus_info_dict, scan_generator_fns_dict,
                                            scan_common_substrings_info_dict, consensus_min_len, first_seq_second_seq_rank_comparisons_dict,
                                            first_seq_second_seq_max_ranks_dict, first_seq_second_seq_alg_positions_dict,
                                            tol, config.cores[0], one_percent_number_consensus_scans)
                                )
    utils.verbose_print('finding', tol, 'Da consensus sequences')
    grand_scan_prediction_dict_list = multiprocessing_pool.map(make_scan_prediction_dicts, consensus_scan_list)
    multiprocessing_pool.close()
    multiprocessing_pool.join()

    ## single processor method
    #child_initialize(alg_consensus_source_df_dict,
    #                 scan_consensus_info_dict, scan_generator_fns_dict,
    #                 scan_common_substrings_info_dict, consensus_min_len,
    #                 first_seq_second_seq_rank_comparisons_dict, first_seq_second_seq_max_ranks_dict,
    #                 first_seq_second_seq_alg_positions_dict, tol)
    #grand_scan_prediction_dict_list = []
    #utils.verbose_print('finding', tol, 'Da consensus sequences')
    #for consensus_scan in consensus_scan_list:
    #    grand_scan_prediction_dict_list.append(
    #        make_scan_prediction_dicts(consensus_scan))

    scan_prediction_dict_list = [seq_prediction_dict
                                 for scan_prediction_dicts in grand_scan_prediction_dict_list
                                 for seq_prediction_dict in scan_prediction_dicts]
    tol_prediction_df = pd.DataFrame(scan_prediction_dict_list)
    tol_prediction_df[tol] = 1
    
    return tol_prediction_df
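The consensus machinery above hinges on finding common substrings between seqs produced by different algorithms for the same scan. As a self-contained illustration of that core operation (postnovo's actual implementation works on encoded arrays and tracks seq ranks), difflib can locate the longest common substring of two seqs:

from difflib import SequenceMatcher

# Illustration only: the basic operation behind "longest consensus" seqs.
def longest_common_substring(seq1, seq2):
    matcher = SequenceMatcher(None, seq1, seq2, autojunk=False)
    match = matcher.find_longest_match(0, len(seq1), 0, len(seq2))
    return seq1[match.a: match.a + match.size]

# e.g., longest_common_substring('PEPTIDER', 'XXPEPTIDEK') returns 'PEPTIDE'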
Example #16
def subsample_training_data(prediction_df_orig):

    subsample_row_indices = []
    prediction_df_orig['unique index'] = list(range(prediction_df_orig.shape[0]))
    prediction_df_orig.set_index('unique index', append = True, inplace = True)
    prediction_df = prediction_df_orig.copy()
    prediction_df.drop(['is top rank single alg', 'seq'], axis = 1, inplace = True)

    accuracy_bins = sorted([round(x / config.subsample_accuracy_divisor, 1) for x in range(config.subsample_accuracy_divisor)], reverse = True)
    
    lower = config.subsample_accuracy_distribution_lower_bound
    upper = config.subsample_accuracy_distribution_upper_bound
    weight_bins = np.arange(lower, upper + (upper - lower) / config.subsample_accuracy_divisor, (upper - lower) / config.subsample_accuracy_divisor)
    sigma = config.subsample_accuracy_distribution_sigma
    mu_location = config.subsample_accuracy_distribution_mu_location
    accuracy_weights = (norm.cdf(weight_bins[1: 1 + config.subsample_accuracy_divisor], loc = mu_location, scale = sigma)
                        - norm.cdf(weight_bins[: config.subsample_accuracy_divisor], loc = mu_location, scale = sigma))\
                            / (norm.cdf(upper, loc = mu_location, scale = sigma)
                               - norm.cdf(lower, loc = mu_location, scale = sigma))
    accuracy_subsample_weights = {acc_bin: weight for acc_bin, weight in zip(accuracy_bins, accuracy_weights)}
    accuracy_subsample_sizes = {acc_bin: int(weight * config.subsample_size) for acc_bin, weight in accuracy_subsample_weights.items()}
    while sum(accuracy_subsample_sizes.values()) != config.subsample_size:
        accuracy_subsample_sizes[accuracy_bins[0]] += 1

    for multiindex_key in config.is_alg_col_multiindex_keys:
        multiindex_list = list(multiindex_key)
        alg_group_df_key = tuple([alg for i, alg in enumerate(config.alg_list) if multiindex_key[i]])
        if sum(multiindex_key) == 1:
            utils.verbose_print('subsampling', alg_group_df_key[0], 'top-ranking sequences')
        else:
            utils.verbose_print('subsampling', '-'.join(alg_group_df_key), 'consensus sequences')
        alg_group_df = prediction_df.xs(multiindex_key)
        alg_group_unique_index = alg_group_df.index.get_level_values('unique index')
        alg_group_df.reset_index(inplace = True)
        alg_group_df.set_index(['scan'], inplace = True)
        alg_group_df.dropna(axis = 1, inplace = True)
        ref_match_col = alg_group_df['ref match'].copy()

        retained_features_target = round(config.clustering_feature_retention_factor_dict[sum(multiindex_key)] / alg_group_df.shape[0], 0)
        if retained_features_target < config.clustering_min_retained_features:
            retained_features_target = config.clustering_min_retained_features
        retained_features_list = []
        retained_feature_count = 0
        for feature in config.features_ordered_by_importance:
            if feature in alg_group_df.columns:
                retained_features_list.append(feature)
                retained_feature_count += 1
            if retained_feature_count == retained_features_target:
                break
        alg_group_df = alg_group_df[retained_features_list]

        if alg_group_df.shape[0] > config.subsample_size:

            pipe = make_pipeline(StandardScaler(),
                                 Birch(threshold = config.clustering_birch_threshold, n_clusters = None))
            cluster_assignments = pipe.fit_predict(alg_group_df.values)

            cluster_assignment_accuracies = zip(cluster_assignments, ref_match_col)
            sum_cluster_accuracies = {}.fromkeys(cluster_assignments, 0)
            for cluster, acc in cluster_assignment_accuracies:
                sum_cluster_accuracies[cluster] += acc
            cluster_counts = Counter(cluster_assignments)
            mean_cluster_accuracies = {}.fromkeys(sum_cluster_accuracies, 0)
            for cluster in cluster_counts:
                mean_cluster_accuracies[cluster] = min(
                    accuracy_bins,
                    key = lambda accuracy_bin: abs(accuracy_bin - int(
                        sum_cluster_accuracies[cluster] * 10 / cluster_counts[cluster]) / 10))
            ordered_clusters_accuracies = sorted(
                mean_cluster_accuracies.items(), key = lambda cluster_accuracy_tuple: cluster_accuracy_tuple[1], reverse = True)
            cluster_assignments_row_indices = [(cluster, index) for index, cluster in enumerate(cluster_assignments)]
            cluster_row_indices_dict = {cluster: [] for cluster in mean_cluster_accuracies}
            for cluster, index in cluster_assignments_row_indices:
                cluster_row_indices_dict[cluster].append(index)
            cluster_accuracies_ordered_by_cluster = [cluster_acc_tuple[1] for cluster_acc_tuple in
                                                     sorted(ordered_clusters_accuracies, key = lambda cluster_acc_tuple: cluster_acc_tuple[0])]
            cluster_accuracies_row_indices = [(x[1], x[0][1]) for x in sorted(
                zip(cluster_row_indices_dict.items(), cluster_accuracies_ordered_by_cluster),
                key = lambda cluster_acc_tuple: cluster_acc_tuple[1], reverse = True)]

            accuracy_row_indices_dict = {acc: [] for acc in cluster_accuracies_ordered_by_cluster}
            for acc_row_indices_tuple in cluster_accuracies_row_indices:
                accuracy_row_indices_dict[acc_row_indices_tuple[0]] += acc_row_indices_tuple[1]

            alg_group_subsample_indices = []
            remaining_subsample_size = config.subsample_size
            remaining_accuracy_bins = [acc_bin for acc_bin in accuracy_bins]
            remaining_accuracy_subsample_sizes = {acc_bin: size for acc_bin, size in accuracy_subsample_sizes.items()}
            loop_remaining_accuracy_bins = [acc_bin for acc_bin in remaining_accuracy_bins]
            while remaining_subsample_size > 0:
                for acc_bin in loop_remaining_accuracy_bins:
                    if acc_bin not in accuracy_row_indices_dict:
                        residual = remaining_accuracy_subsample_sizes[acc_bin]
                        remaining_accuracy_bins.remove(acc_bin)
                        remaining_accuracy_subsample_sizes[acc_bin] = 0
                        remaining_accuracy_subsample_sizes = redistribute_residual_subsample(
                            residual, remaining_accuracy_bins, accuracy_subsample_weights, remaining_accuracy_subsample_sizes)
                    elif remaining_accuracy_subsample_sizes[acc_bin] > len(accuracy_row_indices_dict[acc_bin]):
                        acc_bin_subsample_size = len(accuracy_row_indices_dict[acc_bin])
                        remaining_subsample_size -= acc_bin_subsample_size
                        residual = remaining_accuracy_subsample_sizes[acc_bin] - acc_bin_subsample_size
                        remaining_accuracy_bins.remove(acc_bin)
                        remaining_accuracy_subsample_sizes[acc_bin] = 0
                        alg_group_subsample_indices += accuracy_row_indices_dict[acc_bin]
                        remaining_accuracy_subsample_sizes = redistribute_residual_subsample(
                            residual, remaining_accuracy_bins, accuracy_subsample_weights, remaining_accuracy_subsample_sizes)
                    else:
                        alg_group_subsample_indices += np.random.choice(
                            accuracy_row_indices_dict[acc_bin], remaining_accuracy_subsample_sizes[acc_bin], replace = False).tolist()
                        remaining_subsample_size -= remaining_accuracy_subsample_sizes[acc_bin]
                        remaining_accuracy_subsample_sizes[acc_bin] = 0
                loop_remaining_accuracy_bins = remaining_accuracy_bins

            scan_index = alg_group_df.index
            for i in alg_group_subsample_indices:
                subsample_row_indices.append(tuple(multiindex_list + [scan_index[i], alg_group_unique_index[i]])) 

        else:
            for i, scan in enumerate(alg_group_df.index):
                subsample_row_indices.append(tuple(multiindex_list + [scan, alg_group_unique_index[i]]))

    subsampled_df = prediction_df_orig.loc[sorted(subsample_row_indices)]
    retained_multiindices = subsampled_df.index.names[:-1]
    subsampled_df.reset_index(inplace = True)
    subsampled_df.drop('unique index', axis = 1, inplace = True)
    subsampled_df.set_index(retained_multiindices, inplace = True)

    return subsampled_df
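`redistribute_residual_subsample` is not reproduced here. A plausible sketch, assuming it reallocates an unfillable residual across the remaining accuracy bins in proportion to their renormalized weights:

# Hedged sketch; the real helper may round or prioritize bins differently.
def redistribute_residual_subsample(residual, remaining_accuracy_bins,
                                    accuracy_subsample_weights,
                                    remaining_accuracy_subsample_sizes):
    total_weight = sum(accuracy_subsample_weights[acc_bin]
                       for acc_bin in remaining_accuracy_bins)
    if not remaining_accuracy_bins or total_weight == 0:
        return remaining_accuracy_subsample_sizes
    for acc_bin in remaining_accuracy_bins:
        share = accuracy_subsample_weights[acc_bin] / total_weight
        remaining_accuracy_subsample_sizes[acc_bin] += int(round(residual * share))
    return remaining_accuracy_subsample_sizes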