Exemple #1
0
 def test_Unimod_mass(self):
     db = mass.Unimod(gzip.open('unimod.xml.gz'))
     for x in db.mods:
         self.assertGreater(
             0.00001,
             abs(x['mono_mass'] - mass.calculate_mass(
                 x['composition'], mass_data=db.mass_data)))
Exemple #2
0
 def test_Unimod_methods(self):
     db = mass.Unimod(gzip.open('unimod.xml.gz'))
     rec_id = 1
     rec_name = 'Acetylation'
     rec_title = 'Acetyl'
     record = db.by_id(rec_id)
     self.assertEqual(record['title'], rec_title)
     self.assertEqual(record['full_name'], rec_name)
     self.assertEqual(record, db[rec_id])
     self.assertEqual(record, db.by_title(rec_title))
     self.assertEqual(record, db.by_name(rec_name))
def _unimod_parser_():
    '''
    :return: dictionairy with unimod IDs as keys and avg mass as value
    '''
    # create UNIMOD dictionairy (record_id->avg mass)
    unimod_dict = {}
    # connect UNIMOD database
    unimod_db = mass.Unimod(source='http://www.unimod.org/xml/unimod.xml')
    for mod in unimod_db.mods:
        unimod_dict[(mod['record_id'])] = mod['avge_mass']
    return unimod_dict
Exemple #4
0
import operator
import logging
import pandas as pd
import numpy as np
import warnings
from collections import Counter
import re
import pathlib
import itertools as it
from pyteomics import parser, pepxml, mass

logger = logging.getLogger(__name__)

MASS_FORMAT = '{:+.4f}'
UNIMOD = mass.Unimod(
    pathlib.Path(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     'unimod.xml')).as_uri())
INTERNAL = 5
DIFF_C13 = mass.calculate_mass(formula='C[13]') - mass.calculate_mass(
    formula='C')
H = mass.nist_mass['H+'][0][0]


def internal(*args, **kwargs):
    """Emit log message with level INTERNAL, which is lower than DEBUG."""
    logger.log(INTERNAL, *args, **kwargs)


def mass_format(mass):
    return MASS_FORMAT.format(mass)
Exemple #5
0
 def get_unimod_db(self):
     self.unimod_db = mass.Unimod()
Exemple #6
0
def main():
    pars = argparse.ArgumentParser()
    pars.add_argument('--params', help='CFG file with parameters.'
        'An example can be found at https://github.com/SimpleNumber/aa_stat',
        required=True)
    pars.add_argument('--dir', help='Directory to store the results. '
        'Default value is current directory.', default='.')
    pars.add_argument('-v', '--verbosity', action='count', default=1, help='Increase output verbosity')

    input_spectra = pars.add_mutually_exclusive_group()
    input_spectra.add_argument('--mgf',  nargs='+', help='MGF files to localize modifications')
    input_spectra.add_argument('--mzML',  nargs='+', help='mzML files to localize modifications')

    input_file = pars.add_mutually_exclusive_group(required=True)
    input_file.add_argument('--pepxml', nargs='+', help='List of input files in pepXML format')
    input_file.add_argument('--csv', nargs='+', help='List of input files in CSV format')
    levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    args = pars.parse_args()
    save_directory = args.dir
    level = 2 if args.verbosity >= 2 else args.verbosity
    logging.basicConfig(format='%(levelname)5s: %(asctime)s %(message)s',
                        datefmt='[%H:%M:%S]', level=levels[level])
    logger = logging.getLogger(__name__)
    logger.info("Starting...")


    params = ConfigParser(delimiters=('=', ':'),
                          comment_prefixes=('#'),
                          inline_comment_prefixes=('#'))
    params.read(args.params)
    params_dict = AA_stat.get_parameters(params)
    params_dict = AA_stat.get_additional_params(params_dict) #params_dict 'window'

    data = AA_stat.read_input(args, params_dict)

    hist, popt_pvar = AA_stat.fit_peaks(data, args, params_dict)
#    print(popt_pvar)
#    print('=======================================')
    final_mass_shifts = AA_stat.filter_mass_shifts(popt_pvar)
#    print(final_mass_shifts)
    mass_shift_data_dict = AA_stat.group_specific_filtering(data, final_mass_shifts, params_dict)
#    print('======================',mass_shift_data_dict )
    zero_mass_shift = AA_stat.get_zero_mass_shift(list(mass_shift_data_dict.keys()))

    logger.info("Systematic mass shift equals to %s", AA_stat.mass_format(zero_mass_shift) )
    mass_shift_data_dict = AA_stat.systematic_mass_shift_correction(mass_shift_data_dict, zero_mass_shift)
    if len(mass_shift_data_dict) < 2:
        logger.info('Mass shifts were not found.')
        logger.info('Filtered mass shifts:')
        for i in mass_shift_data_dict.keys():
#            print(mass_shift_data_dict.keys())Da
            logger.info(AA_stat.MASS_FORMAT.format(i))
    else:
        distributions, number_of_PSMs, ms_labels = AA_stat.calculate_statistics(mass_shift_data_dict, 0, params_dict, args)

#    print(mass_shift_data_dict)
    table = AA_stat.save_table(distributions, number_of_PSMs, ms_labels)
#    print(table['mass shift'])
    table.to_csv(os.path.join(save_directory, 'aa_statistics_table.csv'), index=False)
#    print('=======================', table)

    AA_stat.summarizing_hist(table, save_directory)
    logger.info('Summarizing hist prepared')
    AA_stat.render_html_report(table, params_dict, save_directory)
    logger.info('AA_stat results saved to %s', os.path.abspath(args.dir))

    table.index = table['mass shift'].apply(AA_stat.mass_format)
    spectra_dict = AA_stat.read_spectra(args)
    if spectra_dict.keys():
        if args.mgf:
            params_dict['mzml_files'] = False
        else:
            params_dict['mzml_files'] = True
        logger.info('Starting Localization using MS/MS spectra...')
#        print(params_dict['mzml_files'])
        ms_labels = pd.Series(ms_labels)
        locmod_df = pd.DataFrame({'mass shift': ms_labels})
        locmod_df['# peptides in bin'] = table['# peptides in bin']
        locmod_df[['is isotope', 'isotop_ind']] =  locTools.find_isotopes(locmod_df['mass shift'], tolerance=AA_stat.ISOTOPE_TOLERANCE)
        locmod_df['sum of mass shifts'] = locTools.find_modifications(locmod_df.loc[~locmod_df['is isotope'], 'mass shift'])
        locmod_df['sum of mass shifts'].fillna(False, inplace=True)
        locmod_df['aa_stat candidates'] = locTools.get_candidates_from_aastat(table,
                 labels=params_dict['labels'], threshold=AA_stat.AA_STAT_CAND_THRESH)
        u = mass.Unimod().mods
        unimod_db = np.array(u)
        unimod_df = pd.DataFrame(u)
        locmod_df['unimod candidates'] = locmod_df['mass shift'].apply(lambda x: locTools.get_candidates_from_unimod(x, AA_stat.UNIIMOD_TOLERANCE, unimod_db, unimod_df))
        locmod_df['all candidates'] = locmod_df.apply(lambda x: set(x['unimod candidates'])|(set(x['aa_stat candidates'])), axis=1)
        locmod_df.to_csv(os.path.join(save_directory, 'test1.csv'))
        for i in locmod_df.loc[locmod_df['is isotope']].index:
#            print(i)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                locmod_df['all candidates'][i] = locmod_df['all candidates'][i].union(locmod_df['all candidates'][locmod_df['isotop_ind'][i]])
#            !print(locmod_df['all candidates'][i])
        localization_dict = {}
        for ms, df in mass_shift_data_dict.items():
            if locmod_df['sum of mass shifts'][AA_stat.mass_format(ms)] == False and ms != 0.0:
#                if abs(ms - 125) < 0.3:
#                    print('here')
#                    print([x for x in df[params_dict['peptides_column']] if 'C' in x])
                locations_ms = locmod_df.loc[AA_stat.mass_format(ms), 'all candidates']
                logger.info('For %s mass shift candidates %s', AA_stat.mass_format(ms), str(locations_ms))
                tmp = pd.DataFrame(df.apply(lambda x: AA_stat.localization_of_modification([ms], x, locations_ms,
                                                                                params_dict, spectra_dict), axis=1).to_list(),
                                 index=df.index, columns=['top_isoform', 'loc_counter'])
#                print(tmp['loc_counter'].sum())
                new_localizations = set(tmp['loc_counter'].sum().keys()).difference({'non-localized'})
#                print(new_localizations)
                df[['top_isoform', 'loc_counter' ]] = pd.DataFrame(df.apply(lambda x: AA_stat.localization_of_modification([ms], x, new_localizations,
                                                                                params_dict, spectra_dict), axis=1).to_list(),
                                 index=df.index, columns=['top_isoform', 'loc_counter'])
                localization_dict[AA_stat.mass_format(ms)] = df['loc_counter'].sum()
#                print(df['loc_counter'].sum())
        localization_dict[AA_stat.mass_format(0.000000)] = Counter()
        masses_to_calc = set(locmod_df.index).difference(set(localization_dict.keys()))
        if any(locmod_df['sum of mass shifts'] != False):
            cond = True
        else:
            cond = False
        locmod_df.to_csv(os.path.join(save_directory, 'localization_statistics.csv'), index=False)
        while cond:
            for ms in masses_to_calc:
                masses = locmod_df['sum of mass shifts'][ms]
                if masses != False:
                    if len(masses) == 1:
                        mass_1 = masses[0]
                        mass_2 = masses[0]
                    else:
                        mass_1, mass_2 = masses
                    if AA_stat.mass_format(mass_1) in localization_dict and AA_stat.mass_format(mass_2) in localization_dict:

                        df = mass_shift_data_dict[locmod_df['mass shift'][ms]]
                        locations_ms = locmod_df.loc[ms, 'all candidates']
                        locations_ms1 = set([x for x in localization_dict[AA_stat.mass_format(mass_1)].keys() if len(x) == 1])
                        locations_ms2 = set([x for x in localization_dict[AA_stat.mass_format(mass_2)].keys() if len(x) == 1])
                        tmp = pd.DataFrame(df.apply(lambda x: AA_stat.localization_of_modification([locmod_df['mass shift'][ms],mass_1, mass_2],
                                                                                        x, [locations_ms, locations_ms1,locations_ms2 ], params_dict,
                                                                                        spectra_dict, sum_mod=True), axis=1).to_list(),
                                 index=df.index, columns=['top_isoform', 'loc_counter'])
#                        print(tmp['loc_counter'].sum())
                        new_localizations = set(tmp['loc_counter'].sum().keys()).difference({'non-localized'})
                        locations_ms = []
                        locations_ms1 = []
                        locations_ms2 = []
                        for i in new_localizations:
                            if i.endswith('mod1'):
                                locations_ms1.append(i.split('_')[0])
                            elif i.endswith('mod2'):
                                locations_ms2.append(i.split('_')[0])
                            else:
                                locations_ms.append(i)
#                        print(new_localizations)
#                        print(locations_ms1)
#                        print(locations_ms2)
                        df[['top_isoform', 'loc_counter']] = pd.DataFrame(df.apply(lambda x: AA_stat.localization_of_modification([locmod_df['mass shift'][ms],mass_1, mass_2],
                                                                                        x, [locations_ms, locations_ms1,locations_ms2 ], params_dict,
                                                                                        spectra_dict, sum_mod=True), axis=1).to_list(),
                                 index=df.index, columns=['top_isoform', 'loc_counter'])
                        localization_dict[ms] = df['loc_counter'].sum()
#                        print(df['loc_counter'].sum())
                        masses_to_calc = masses_to_calc.difference(set([ms]))
            if len(masses_to_calc) == 0:
                cond = False
        locmod_df['localization'] = pd.Series(localization_dict)
        print(locmod_df)
        locmod_df.to_csv(os.path.join(save_directory, 'localization_statistics.csv'), index=False)
#        logger.info('Done')
    else:
        logger.info('No spectra files. MSMS spectrum localization is not performed.')
Exemple #7
0
AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY")

XTANDEM_MOD_PATTERN = re.compile(
    r"""
                                 \.?       # 0 or 1 dot
                                 \(        # Opening parenthesis
                                   (       # Begin capture
                                    (?:        # Not capture the following
                                     [^\(\)] | # Either not parentheses or
                                     \(-?\d+\) # parentheses containing an int
                                    )+         # multiple times
                                   )       # End capture
                                 \)        # Closing parenthesis
                                 """, re.VERBOSE)

UNIMOD_MODS = mass.Unimod()

# This variable is obsoleted if an natural element is be named X.
USED_ELEMS = "CHONPSX"

# Columns of interest for export
COLUMNS_OF_INTEREST = [
    "stfi_neutral_mass", "stfi_formula", "stfi_formula_X", "stfi_M0_NC",
    "stfi_M1_NC", "stfi_M0_12C", "stfi_M1_12C"
]

# Set custom logger.
log = logging.getLogger(__name__)
log_formatter = logging.Formatter("[%(asctime)s] %(levelname)-8s: %(message)s",
                                  "%Y-%m-%d, %H:%M:%S")
log_handler = logging.StreamHandler()