Esempio n. 1
0
def check_unimod_pep_input(test_dict):
    """Check one peptide/unimod test case against ursgal.ChemicalComposition.

    A composition is built from ``test_dict['input']`` and its unimod-style
    Hill notation must equal ``test_dict['output']``. If ``test_dict`` holds
    a ``'mod_pos_info'`` and/or ``'aa_pos_info'`` list, every entry's
    ``'cc_mods'`` value is compared with what the composition stores for the
    entry's ``'pos'`` (``None`` when the position is absent).

    Side effect: when ``'aa_compositions'`` is missing from ``test_dict`` it
    is filled in with ``ursgal.chemical_composition_kb.aa_compositions``.

    Raises:
        AssertionError: if any of the comparisons fails.
    """
    if 'aa_compositions' not in test_dict:
        test_dict['aa_compositions'] = (
            ursgal.chemical_composition_kb.aa_compositions
        )

    composition = ursgal.ChemicalComposition(
        test_dict['input'],
        aa_compositions=test_dict['aa_compositions'],
    )

    # Debug output, shown by the test runner when a check fails.
    print('hill:', composition.hill_notation_unimod())
    print(composition.composition_of_aa_at_pos)
    print(composition.composition_of_mod_at_pos)

    assert composition.hill_notation_unimod() == test_dict['output']

    # Per-position modification compositions.
    for expected in test_dict.get('mod_pos_info', []):
        observed = composition.composition_of_mod_at_pos.get(expected['pos'])
        assert observed == expected['cc_mods']

    # Per-position amino acid compositions; the expected value of these
    # entries is stored under the 'cc_mods' key as well.
    for expected in test_dict.get('aa_pos_info', []):
        observed = composition.composition_of_aa_at_pos.get(expected['pos'])
        assert observed == expected['cc_mods']
Esempio n. 2
0
def main(input_file=None, output_file=None, scan_rt_lookup=None,
         peptide_regex_lookup=None, params=None, search_engine=None,
         score_colname=None):
    '''
    Rewrite the result csv of a search engine into the unified csv format.

    Every row of input_file is normalised (spectrum title/ID, retention
    time, modifications, calculated m/z, decoy flag) and written to
    output_file, which gets the additional column 'uCalc m/z'. If rows
    describing the same PSM were counted more than once, the written file
    is handed to merge_duplicate_psm_rows afterwards.

    Arguments:
        input_file (str): input filename of csv which should be unified
        output_file (str): output filename of csv after unifying
        scan_rt_lookup (dict): dictionary with entries of scanID to
            retention time under key 'scan_2_rt'; the key 'unit' tells
            whether the stored values are seconds ('second') or not
        peptide_regex_lookup (dict): not used by the active code, it is
            returned unchanged
        params (dict): params as passed by ursgal; the keys 'label',
            'aa_exception_dict', 'modifications' and 'decoy_tag' are read
        search_engine(str): the search engine the csv file stems from
        score_colname (str): the column names of the search engine's
            score (i.e. 'OMSSA:pvalue')

    Returns:
        peptide_regex_lookup, exactly as it was passed in

    Raises:
        KeyError: if a spectrum ID is not found in scan_rt_lookup
        AssertionError: if a reported modification cannot be parsed or
            does not fit the modifications given in params
        Exception: if a reported modification mass cannot be mapped or the
            spectrum columns do not match any known layout

    List of fixes

    All engines
        * Retention Time (s) is correctly set using _ursgal_lookup.pkl
          During mzML conversion to mgf the retention time for every spec
          is stored in a internal lookup and used later for setting the RT.
        * All modifications are checked if they were given in
          params['modifications'], converted to the name that was given
          there and sorted according to their position.
        * Fixed modifications are added in 'Modifications', if not reported
          by the engine.
        * Rows describing the same PSM (i.e. when two proteins share the
          same peptide) are merged to one row.

    X!Tandem
        * 'RTINSECONDS=' is stripped from Spectrum Title if present in .mgf or
          in search result.

    Myrimatch
        * Spectrum Title is corrected
        * 15N label is not formatted correctly these modifications are
          removed for further analysis.
        * When using 15N modifications on amino acids and Carbamidomethyl
          myrimatch reports sometimes Carboxymethylation on Cystein.

    MS-GF+
        * 15N label is not formatted correctly these modifications are
          removed for further analysis.
        * 'Is decoy' column is properly set to true/false
        * Carbamidomethyl is updated and set if label is 15N

    OMSSA
        * Carbamidomethyl is updated and set
        * Selenocystein is not reported with the correct unimod modification

    MS-Amanda
        * Selenocystein is not reported with the correct unimod modification
        * multiple protein ID per peptide are split in two entries.
          (is done in MS-Amanda postflight)
        * short protein IDs are mapped to the full protein ID, it is checked
          which peptides map on which protein ID (is done in MS-Amanda
          postflight)

    '''
    print(
        '''
[ unifycsv ] Converting {0} of engine {1} to unified CSV format...
        '''.format(
            os.path.basename(input_file),
            search_engine,
        )
    )

    # get the rows which define a unique PSM (i.e. sequence+spec+score...)
    psm_defining_colnames = get_psm_defining_colnames(score_colname)

    cc = ursgal.ChemicalComposition()

    use15N = params['label'] == '15N'

    aa_exception_dict = params['aa_exception_dict']
    # modifications whose position 1 may have to be rewritten to position 0
    # (N-terminus); extended below with the N-term modifications of params
    n_term_replacement = {
        'Ammonia-loss' : None,
        'Trimethyl'    : None,
        'Gly->Val'     : None,
    }
    fixed_mods = {}
    opt_mods = {}
    modname2aa = {}
    cam = False

    # splits '<name or mass>:<position>' at the last ':' followed by digits
    mod_pattern = re.compile( r''':(?P<pos>[0-9]*$)''' )

    for modification in params['modifications']:
        fields = modification.split(',')
        aa = fields[0]
        mod_type = fields[1]
        pos = fields[2]
        name = fields[3]
        modname2aa.setdefault(name, []).append(aa)
        if 'N-term' in pos:
            n_term_replacement[name] = aa
        if mod_type == 'fix':
            fixed_mods[aa] = name
        if mod_type == 'opt':
            opt_mods[aa] = name
        if 'C,fix,any,Carbamidomethyl' in modification:
            cam = True

    ursgal.GlobalUnimodMapper._reparseXML()

    # the protein block further down is only applied to these engines
    database_search_engines = ['msamanda', 'msgf', 'myrimatch', 'omssa', 'xtandem']
    engine_name = search_engine.lower()
    database_search = any(
        db_se in engine_name for db_se in database_search_engines
    )

    # if a PSM with multiple rows is found (i.e. in omssa results), the psm
    # rows are merged afterwards
    psm_counter = Counter()

    mz_buffer = {}
    csv_kwargs = {}
    # the line terminator handed to the csv writer is chosen per platform
    if sys.platform == 'win32':
        csv_kwargs['lineterminator'] = '\n'
    else:
        csv_kwargs['lineterminator'] = '\r\n'

    # both files are managed by the with statement so that the output handle
    # is closed even if a row raises
    with open(output_file, 'w') as output_file_object, \
            open(input_file, 'r') as in_file:
        csv_input  = csv.DictReader(
            in_file
        )
        csv_output = csv.DictWriter(
            output_file_object,
            list(csv_input.fieldnames) + ['uCalc m/z'],
            **csv_kwargs
        )
        csv_output.writeheader()
        for line_dict in csv_input:
            if line_dict['Spectrum Title'] != '':
                # Valid for: OMSSA, MSGF+, X!Tandem
                if 'RTINSECONDS=' in line_dict['Spectrum Title']:
                    line_2_split = line_dict['Spectrum Title'].split(' ')[0]
                else:
                    line_2_split = line_dict['Spectrum Title']
                line_dict['Spectrum Title'] = line_2_split

                input_file_basename, spectrum_id, _spectrum_id, charge = line_2_split.split('.')

            elif 'scan=' in line_dict['Spectrum ID']:
                # Valid for: myrimatch
                pure_input_file_name = os.path.basename(
                    line_dict['Raw data location']
                )
                # not using os.path.splitext because we could have multiple
                # file extensions (i.e. ".mzml.gz")
                input_file_basename = pure_input_file_name.split(".")[0]
                spectrum_id = line_dict['Spectrum ID'].split('=')[-1]
                line_dict['Spectrum Title'] = '{0}.{1}.{1}.{2}'.format(
                    input_file_basename,
                    spectrum_id,
                    line_dict['Charge']
                )
            elif line_dict['Spectrum Title'] == '':
                # Valid for: Novor
                pure_input_file_name = os.path.basename(
                    line_dict['Raw data location']
                )
                input_file_basename = pure_input_file_name.split(".")[0]
                spectrum_id = line_dict['Spectrum ID']
                line_dict['Spectrum Title'] = '{0}.{1}.{1}.{2}'.format(
                    input_file_basename,
                    spectrum_id,
                    line_dict['Charge']
                )
            else:
                # safeguard, not reachable with the conditions above
                raise Exception(
                    'New csv format present for engine {0}'.format(
                        search_engine
                    )
                )
            line_dict['Spectrum ID'] = spectrum_id
            #we should check if data has minute format or second format...
            try:
                retention_time_in_minutes = \
                    scan_rt_lookup[ input_file_basename ][ 'scan_2_rt' ]\
                        [ spectrum_id ]
            except KeyError as e:
                error_msg = ''' Could not find scan ID {0} in scan_rt_lookup[ {1} ]
                '''.format( spectrum_id, input_file_basename )
                raise KeyError( error_msg ) from e
            if scan_rt_lookup[ input_file_basename ]['unit'] == 'second':
                rt_corr_factor = 1
            else:
                rt_corr_factor = 60
            line_dict['Retention Time (s)'] = float( retention_time_in_minutes ) * rt_corr_factor

            #
            # Modification block

            # some engines do not report fixed modifications
            # include in unified csv
            for aa, name in fixed_mods.items():
                for pos, aminoacid in enumerate(line_dict['Sequence']):
                    if aminoacid != aa:
                        continue
                    tmp = '{0}:{1}'.format(name, pos + 1)
                    # compare whole entries: a substring test would take
                    # 'Name:1' as reported when only 'Name:12' is present
                    reported_mods = line_dict['Modifications'].split(';')
                    if tmp not in reported_mods:
                        reported_mods.append(tmp)
                        line_dict['Modifications'] = ';'.join( reported_mods )

            # Myrimatch and msgf+ can not handle 15N that easily
            # report all AAs moded with unknown modification
            # Note: masses are checked below to avoid any mismatch
            if use15N:
                if 'myrimatch' in engine_name or 'msgf' in engine_name:
                    line_dict['Modifications'] = re.sub(
                        'unknown modification:[0-9]*',
                        '',
                        line_dict['Modifications']
                    )
                if 'myrimatch' in engine_name:
                    if 'Carboxymethyl' in line_dict['Modifications'] and cam == True:
                        line_dict['Modifications'] = line_dict['Modifications'].replace(
                            'Carboxymethyl',
                            'Carbamidomethyl'
                        )
                    elif 'Delta:H(6)C(3)O(1)' in line_dict['Modifications']:
                        line_dict['Modifications'] = line_dict['Modifications'].replace(
                            'Delta:H(6)C(3)O(1)',
                            'Carbamidomethyl'
                        )

            # check every reported modification against params and convert
            # it to '<name given in params>:<position>'
            tmp_mods = []
            for modification in line_dict['Modifications'].split(';'):
                Nterm = False
                if modification == '':
                    continue
                match = mod_pattern.search( modification )
                assert match is not None,'''
                        The format of the modification {0}
                        is not recognized by ursgal'''.format(
                            modification
                        )
                pos = int( match.group('pos') )
                mod = modification[ :match.start() ]

                if pos == 0 or pos == 1:
                    Nterm = True
                    pos = 1
                aa = line_dict['Sequence'][pos-1]
                if mod in modname2aa.keys():
                    correct_mod = False
                    if aa in modname2aa[mod]:
                        # everything is ok
                        correct_mod = True
                    elif Nterm and '*' in modname2aa[mod]:
                        # still is ok
                        correct_mod = True
                    assert correct_mod == True,'''
                            A modification was reported for an aminoacid for which it was not defined
                            unify_csv cannot deal with this, please check your parameters and engine output
                            reported modification: {0} on {1}
                            modifications in parameters: {2}
                            '''.format(
                                mod,
                                aa,
                                params['modifications']
                            )
                elif 'unknown modification' == mod:
                    modification_known = False
                    if aa  in opt_mods.keys(): # fixed mods are corrected/added already
                        modification = '{0}:{1}'.format(opt_mods[aa],pos)
                        modification_known = True
                    assert modification_known == True,'''
                            unify csv does not work for the given unknown modification for
                            {0} {1}
                            maybe an unknown modification with terminal position was given?
                            '''.format(
                                line_dict['Sequence'], modification
                            )
                else:
                    # the engine reported a mass instead of a name
                    try:
                        name_list = ursgal.GlobalUnimodMapper.appMass2name_list( round(float(mod), 4), decimal_places = 4 )
                    except Exception as e:
                        print('''
                            A modification was reported that was not included in the search parameters
                            unify_csv cannot deal with this, please check your parameters and engine output
                            reported modification: {0}
                            modifications in parameters: {1}
                            '''.format(mod, params['modifications'])
                        )
                        raise Exception('unify_csv failed because a '\
                            'modification was reported that was not '\
                            'given in params.'
                        ) from e
                    mapped_mod = False
                    for name in name_list:
                        if name in modname2aa.keys():
                            if aa in modname2aa[name]:
                                modification = '{0}:{1}'.format(name,pos)
                                mapped_mod = True
                            elif Nterm and '*' in modname2aa[name]:
                                modification = '{0}:{1}'.format(name,0)
                                mapped_mod = True
                            else:
                                continue
                    assert mapped_mod == True, '''
                            A mass was reported that does not map on any unimod or userdefined modification
                            or the modified aminoacid is no the specified one
                            unify_csv cannot deal with this, please check your parameters and engine output
                            reported mass: {0}
                            maps on: {1}
                            reported modified aminoacid: {2}
                            modifications in parameters: {3}
                            '''.format(
                                mod,
                                name_list,
                                aa,
                                params['modifications']
                            )
                tmp_mods.append(modification)

            # move N-terminal modifications from position 1 to position 0,
            # unless the modification was defined in params for the first
            # amino acid of the sequence itself. Working on whole entries
            # keeps e.g. 'Name:12' untouched when 'Name:1' is rewritten.
            for unimod_name in n_term_replacement.keys():
                reported_at_1 = '{0}:1'.format( unimod_name )
                if reported_at_1 not in tmp_mods:
                    continue
                if unimod_name in modname2aa.keys():
                    if line_dict['Sequence'][0] in modname2aa[unimod_name]:
                        continue
                moved_to_0 = '{0}:0'.format( unimod_name )
                tmp_mods = [
                    moved_to_0 if entry == reported_at_1 else entry
                    for entry in tmp_mods
                ]
            line_dict['Modifications'] = ';'.join( tmp_mods )

            for aa_to_replace, replace_dict in aa_exception_dict.items():
                if aa_to_replace in line_dict['Sequence']:
                    #change mods only if unimod has to be changed...
                    if 'unimod_name' in replace_dict.keys():
                        for r_pos, aa in enumerate(line_dict['Sequence']):
                            if aa == aa_to_replace:
                                index_of_U = r_pos + 1
                                unimod_name = replace_dict['unimod_name']
                                if cam:
                                    unimod_name = replace_dict['unimod_name_with_cam']
                                new_mod = '{0}:{1}'.format(
                                    unimod_name,
                                    index_of_U
                                )
                                if line_dict['Modifications'] == '':
                                    line_dict['Modifications'] += new_mod
                                else:
                                    line_dict['Modifications'] += ';{0}'.format(
                                        new_mod
                                    )
                    line_dict['Sequence'] = line_dict['Sequence'].replace(
                        aa_to_replace,
                        replace_dict['original_aa']
                    )

            # drop empty and duplicated entries and sort by position
            if line_dict['Modifications'] != '':
                unique_mods = set()
                for e in line_dict['Modifications'].split(';'):
                    if e == '':
                        continue
                    for match in mod_pattern.finditer( e ):
                        mod = e[:match.start()]
                        mod_pos = e[match.start()+1:]
                        unique_mods.add( (int(mod_pos), mod) )
                line_dict['Modifications'] = ';'.join(
                    [
                        '{m}:{p}'.format( m=mod, p=pos)
                        for pos, mod in sorted( unique_mods )
                    ]
                )

            # calculate m/z; results are buffered per peptide, charge and
            # label as the same combination occurs in many rows
            upep = line_dict['Sequence'] + '#' + line_dict['Modifications']
            buffer_key = (upep, line_dict['Charge'], params['label'])
            if buffer_key not in mz_buffer.keys():
                cc.use(upep)
                if use15N:
                    number_N = dc( cc['N'] )
                    cc['15N'] = number_N
                    del cc['N']
                    if cam:
                        # one nitrogen per C is counted as 14N instead
                        c_count = line_dict['Sequence'].count('C')
                        cc['14N'] = c_count
                        cc['15N'] -= c_count
                mass = cc._mass()
                calc_mz = ursgal.ucore.calculate_mz(
                    mass,
                    line_dict['Charge']
                )
                mz_buffer[ buffer_key ] = calc_mz
            else:
                calc_mz = mz_buffer[ buffer_key ]
            line_dict['uCalc m/z'] = calc_mz
            if 'msamanda' in engine_name:
                # ms amanda does not return calculated mz values
                line_dict['Calc m/z'] = calc_mz

            # protein block, only for database search engine
            if database_search == True:
                # mzidentml-lib does not always set 'Is decoy' correctly
                # (it's always 'false' for MS-GF+ results), this is fixed here:
                tmp_decoy = set()
                for protein in line_dict['proteinacc_start_stop_pre_post_;'].split('<|>'):
                    if params['decoy_tag'] in protein:
                        tmp_decoy.add('true')
                    else:
                        tmp_decoy.add('false')
                if len(tmp_decoy) >= 2:
                    print(
                        '''
                        [ WARNING ] The following peptide occurs in a target as well as decoy protein
                        [ WARNING ] {0} 
                        [ WARNING ] 'Is decoy' has been set to 'True' '''.format(
                            line_dict['Sequence'],
                        )
                    )
                    line_dict['Is decoy'] = 'true'
                else:
                    line_dict['Is decoy'] = list(tmp_decoy)[0]

                # count each PSM occurence to check whether row-merging is needed:
                psm = tuple([line_dict[x] for x in psm_defining_colnames])
                psm_counter[psm] += 1

            # rows of all engines are written, not only database search rows
            csv_output.writerow(line_dict)

    # if there are multiple rows for a PSM, we have to merge them aka rewrite the csv...
    if psm_counter and max(psm_counter.values()) > 1:
        merge_duplicate_psm_rows(output_file, psm_counter, psm_defining_colnames)
    return peptide_regex_lookup
Esempio n. 3
0
        },
    },
    {
        "composition_1": {
            "N": 1,
            "14N": 1,
            "12C": 1
        },
        "composition_2": {
            "N": 2,
            "C": 1
        },
    },
]

# Module-level composition objects shared by all test cases: mass_checker
# fills them with the two formulas of a case and clears them again afterwards.
cc_1 = ursgal.ChemicalComposition()
cc_2 = ursgal.ChemicalComposition()


def pepitde_with_unimod_test():
    """Yield one ``(mass_checker, test_dict)`` pair per entry of TESTS."""
    yield from ((mass_checker, case) for case in TESTS)


def mass_checker(test_dict):
    '''
    Check that both compositions of a test case have the same mass.

    Arguments:
        test_dict (dict): test case holding the chemical formulas under
            the keys "composition_1" and "composition_2"

    Raises:
        AssertionError: if the masses of the two compositions differ
    '''
    try:
        cc_1.add_chemical_formula(test_dict["composition_1"])
        cc_2.add_chemical_formula(test_dict["composition_2"])
        assert cc_1._mass() == cc_2._mass()
    finally:
        # cc_1 and cc_2 are shared between all test cases, so they have to
        # be reset even if this case fails; otherwise the left-over
        # composition is carried into every following case.
        cc_1.clear()
        cc_2.clear()
Esempio n. 4
0
    def preflight(self):
        '''
        Formatting the command line and writing the param input file via
        self.params

        The translated ursgal parameters are converted into MSFragger
        parameters (self.params_to_write), which are then written by
        self.write_params_file(). While converting 'mass_offsets' and
        'Y_type_masses', self.mass_glycan_lookup and self.mass_shift_lookup
        are filled; both map round(mass * 1e5) to the set of glycans or
        chemical formulas/unimod names that have this mass.

        Side effects:
            * for label == '15N', mass and name of the matching fixed
              modifications in self.params['mods']['fix'] are changed
              in place
            * sys.exit(1) is called if an optional modification has an
              unknown position

        Returns:
                dict: self.params

        Raises:
            Exception: if the input file is neither mzML(.gz) nor MGF
        '''
        self.input_file = os.path.join(self.params['input_dir_path'],
                                       self.params['input_file'])

        # NOTE(review): self.input_file already includes the input
        # directory; if that path is absolute, os.path.join() discards
        # output_dir_path and the params file is created next to the input
        # file - confirm that this is intended.
        self.param_file_name = os.path.join(
            self.params['output_dir_path'],
            '{0}_msfragger.params'.format(self.input_file))
        self.created_tmp_files.append(self.param_file_name)

        # further prepare and translate params
        # layout: {msfragger param name: {ursgal param name: value}}
        grouped_params = self.params['translations'][
            '_grouped_by_translated_key']

        self.params_to_write = {
            'output_file_extension': 'tsv',  # tsv or pepXML we fix it...
            'output_format': 'tsv',  # pepXML or tsv
            'digest_mass_range': '{0} {1}'.format(
                grouped_params['precursor_min_mass']['precursor_min_mass'],
                grouped_params['precursor_max_mass']['precursor_max_mass']),
        }

        # these params are either combined into other params or not
        # written to the params file at all
        write_exclusion_list = [
            'precursor_min_mass', 'precursor_max_mass', 'precursor_min_charge',
            'precursor_max_charge', 'label', '-Xmx', 'header_translations',
            'validation_score_field'
        ]

        # prefix used by MSFragger for the position of variable mods
        variable_mod_prefixes = {
            'Prot-N-term': '[',
            'Prot-C-term': ']',
            'N-term': 'n',
            'C-term': 'c',
            'any': '',
        }
        # MSFragger param names for terminal fixed mods
        terminal_fixed_mod_keys = {
            'Prot-N-term': 'add_Nterm_protein',
            'Prot-C-term': 'add_Cterm_protein',
            'N-term': 'add_Nterm_peptide',
            'C-term': 'add_Cterm_peptide',
        }
        allowed_ions = (
            'a', 'b', 'c', 'y~', 'x', 'y', 'z', 'b~', 'y-18', 'b-18', 'Y',
        )

        if grouped_params['label']['label'] == '15N':
            self.print_info(
                'Search with label=15N may still be errorprone. Evaluate with care!',
                caller='WARNING')
            for aminoacid, N15_Diff in ursgal.ukb.DICT_15N_DIFF.items():
                existing = False
                for mod_dict in self.params['mods']['fix']:
                    if aminoacid == mod_dict['aa']:
                        # the 15N shift is added on top of the fixed mod
                        mod_dict['mass'] += N15_Diff
                        mod_dict['name'] += '_15N_{0}'.format(aminoacid)
                        existing = True
                if existing:
                    continue
                mod_key = 'add_{0}_{1}'.format(
                    aminoacid,
                    ursgal.chemical_composition_kb.aa_names[aminoacid])
                self.params_to_write[mod_key] = N15_Diff

        self.mass_shift_lookup = {}
        self.mass_glycan_lookup = {}
        for msfragger_param_name, ursgal_params in grouped_params.items():
            if msfragger_param_name in write_exclusion_list:
                continue
            for ursgal_param_name, param_value in ursgal_params.items():
                if msfragger_param_name == 'enzyme':
                    # search_enzyme_name = Trypsin
                    # search_enzyme_cutafter = KR
                    # search_enzyme_butnotafter = P
                    aa_site, term, inhibitor = param_value.split(';')
                    self.params_to_write['search_enzyme_name'] = self.params[
                        'enzyme']
                    self.params_to_write['search_enzyme_cutafter'] = aa_site
                    self.params_to_write[
                        'search_enzyme_butnotafter'] = inhibitor
                elif msfragger_param_name == 'num_enzyme_termini':
                    # num_enzyme_termini = 2 # 2 for enzymatic, 1 for
                    # semi-enzymatic, 0 for nonspecific digestion
                    if grouped_params['enzyme']['enzyme'] == 'nonspecific':
                        self.params_to_write[msfragger_param_name] = 0
                    else:
                        self.params_to_write[
                            msfragger_param_name] = param_value
                elif msfragger_param_name == 'clear_mz_range':
                    min_mz, max_mz = param_value
                    self.params_to_write[
                        msfragger_param_name] = '{0} {1}'.format(
                            min_mz, max_mz)
                elif msfragger_param_name == 'remove_precursor_range':
                    min_mz, max_mz = param_value
                    self.params_to_write[
                        msfragger_param_name] = '{0},{1}'.format(
                            min_mz, max_mz)
                elif msfragger_param_name == 'delta_mass_exclude_ranges':
                    min_mz, max_mz = param_value
                    self.params_to_write[
                        msfragger_param_name] = '({0},{1})'.format(
                            min_mz, max_mz)
                elif msfragger_param_name == 'precursor_mass_lower':
                    self.params_to_write[
                        msfragger_param_name] = -1 * param_value
                elif msfragger_param_name == 'modifications':
                    # maximum of 7 mods - amino acid codes, * for any amino
                    # acid, [ and ] specifies protein termini, n and c
                    # specifies peptide termini
                    # variable_mod_01 = 15.9949 M
                    # variable_mod_02 = 42.0106 [*
                    # #variable_mod_03 = 79.96633 STY
                    # #variable_mod_03 = -17.0265 nQnC
                    # #variable_mod_04 = -18.0106 nE
                    mass_to_mod_aa = ddict(list)
                    for mod_dict in self.params['mods']['opt']:
                        # example mod_dict:
                        # {'_id': 0,
                        #  'aa': '*',
                        #  'composition': {'C': 2, 'H': 2, 'O': 1},
                        #  'id': '1',
                        #  'mass': 42.010565,
                        #  'name': 'Acetyl',
                        #  'org': '*,opt,Prot-N-term,Acetyl',
                        #  'pos': 'Prot-N-term',
                        #  'unimod': True},
                        if mod_dict['pos'] not in variable_mod_prefixes:
                            print('''
                            Unknown positional argument for given modification:
                            {0}
                            MSFragger cannot deal with this, please use one of the follwing:
                            any, Prot-N-term, Prot-C-term, N-term, C-term
                            '''.format(mod_dict['org']))
                            sys.exit(1)
                        pos_modifier = variable_mod_prefixes[mod_dict['pos']]
                        aa_to_append = mod_dict['aa']
                        if pos_modifier:
                            aa_to_append = '{0}{1}'.format(
                                pos_modifier, aa_to_append)
                        # mods with identical mass share one variable_mod
                        mass_to_mod_aa[mod_dict['mass']].append(aa_to_append)
                    for pos, (mod_mass, aa_list) in enumerate(
                            mass_to_mod_aa.items()):
                        self.params_to_write['variable_mod_0{0}'.format(
                            pos + 1)] = '{0} {1}'.format(
                                mod_mass, ''.join(aa_list))
                    for mod_dict in self.params['mods']['fix']:
                        # add_C_cysteine = 57.021464
                        # added to C - avg. 103.1429, mono. 103.00918
                        mod_key = terminal_fixed_mod_keys.get(mod_dict['pos'])
                        if mod_key is None:
                            mod_key = 'add_{0}_{1}'.format(
                                mod_dict['aa'],
                                ursgal.chemical_composition_kb.aa_names[
                                    mod_dict['aa']])
                        self.params_to_write[mod_key] = mod_dict['mass']

                elif msfragger_param_name == 'override_charge':
                    self.params_to_write[msfragger_param_name] = param_value
                    if param_value == 1:
                        self.params_to_write[
                            'precursor_charge'] = '{0} {1}'.format(
                                grouped_params['precursor_min_charge']
                                ['precursor_min_charge'],
                                grouped_params['precursor_max_charge']
                                ['precursor_max_charge'])
                elif msfragger_param_name == 'fragment_ion_series':
                    ion_list = []
                    for ion in param_value:
                        if ion not in allowed_ions:
                            print('''
                                [ WARNING ] MSFragger does not allow the following ion:
                                {0}
                                This ion will be skipped, i.e. not included in the search.
                            '''.format(ion))
                            continue
                        ion_list.append(ion)
                    self.params_to_write[msfragger_param_name] = ','.join(
                        ion_list)
                elif msfragger_param_name in [
                        'mass_offsets',
                        'Y_type_masses',
                ]:
                    cc = ursgal.ChemicalComposition()
                    umama = ursgal.UnimodMapper()
                    masses = [str(m) for m in param_value['masses']]
                    for m in param_value['glycans']:
                        cc.clear()
                        cc.add_glycan(m)
                        glycan_mass = cc._mass()
                        masses.append(str(glycan_mass))
                        tm = round(glycan_mass * 1e5)
                        self.mass_glycan_lookup.setdefault(tm, set()).add(m)
                    for m in param_value['chemical_formulas']:
                        cc.clear()
                        cc.add_chemical_formula(m)
                        formula_mass = cc._mass()
                        masses.append(str(formula_mass))
                        tm = round(formula_mass * 1e5)
                        self.mass_shift_lookup.setdefault(tm, set()).add(m)
                    for m in param_value['unimods']:
                        unimod_mass = umama.name2mass(m)
                        masses.append(str(unimod_mass))
                        # the lookup key has to be derived from the unimod
                        # mass itself, not from a mass left over from a
                        # previous loop
                        tm = round(unimod_mass * 1e5)
                        self.mass_shift_lookup.setdefault(tm, set()).add(m)
                    self.params_to_write[msfragger_param_name] = '/'.join(
                        masses)
                elif msfragger_param_name == 'diagnostic_fragments':
                    cc = ursgal.ChemicalComposition()
                    umama = ursgal.UnimodMapper()
                    masses = list(param_value['masses'])
                    for m in param_value['glycans']:
                        cc.clear()
                        cc.add_glycan(m)
                        masses.append(cc._mass())
                    for m in param_value['chemical_formulas']:
                        cc.clear()
                        cc.add_chemical_formula(m)
                        masses.append(cc._mass())
                    for m in param_value['unimods']:
                        masses.append(umama.name2mass(m))
                    # diagnostic fragments are written as m/z at charge 1
                    mzs = [
                        str(ursgal.ucore.calculate_mz(fragment_mass, 1))
                        for fragment_mass in masses
                    ]
                    self.params_to_write[msfragger_param_name] = '/'.join(mzs)
                else:
                    self.params_to_write[msfragger_param_name] = param_value

        self.write_params_file()

        if self.input_file.lower().endswith(('.mzml', '.mzml.gz', '.mgf')):
            self.params['translations']['mzml_input_file'] = self.input_file
        else:
            raise Exception(
                'MSFragger input spectrum file must be in mzML or MGF format!')

        self.params['command_list'] = [
            'java',
            '-Xmx{0}'.format(grouped_params['-Xmx']['-xmx']),
            '-jar', self.exe, self.param_file_name,
            self.params['translations']['mzml_input_file']
        ]

        self.params['translations']['output_file_incl_path'] = os.path.join(
            self.params['output_dir_path'], self.params['output_file'])
        return self.params
Esempio n. 5
0
def main(input_file=None,
         output_file=None,
         scan_rt_lookup=None,
         params=None,
         search_engine=None,
         score_colname=None,
         upeptide_mapper=None):
    '''
    Arguments:
        input_file (str): input filename of csv which should be unified
        output_file (str): output filename of csv after unifying
        scan_rt_lookup (dict): dictionary with entries of scanID to
            retention time under key 'scan_2_rt'
        force (bool): force True or False
        params (dict): params as passed by ursgal
        search_engine(str): the search engine the csv file stems from
        score_colname (str): the column names of the search engine's
            score (i.e. 'OMSSA:pvalue')

    List of fixes

    All engines
        * Retention Time (s) is correctly set using _ursgal_lookup.pkl
          During mzML conversion to mgf the retention time for every spec
          is stored in a internal lookup and used later for setting the RT.
        * All modifications are checked if they were given in
          params['modifications'], converted to the name that was given
          there and sorted according to their position.
        * Fixed modifications are added in 'Modifications', if not reported
          by the engine.
        * The monoisotopic m/z for for each line is calculated (uCalc m/z),
          since not all engines report the monoisotopic m/z
        * Mass accuracy calculation (in ppm), also taking into account that
          not always the monoisotopic peak is picked
        * All peptide Sequences are remapped to their corresponding protein,
          assuring correct start, stop, pre and post aminoacid. Thereby,
          also correct enzymatic cleavage is checked.
        * Rows describing the same PSM (i.e. when two proteins share the
          same peptide) are merged to one row.

    X!Tandem
        * 'RTINSECONDS=' is stripped from Spectrum Title if present in .mgf or
          in search result.

    Myrimatch
        * Spectrum Title is corrected
        * 15N label is not formatted correctly these modifications are
          removed for further analysis.
        * When using 15N modifications on amino acids and Carbamidomethyl
          myrimatch reports sometimes Carboxymethylation on Cystein.

    MS-GF+
        * 15N label is not formatted correctly these modifications are
          removed for further analysis.
        * 'Is decoy' column is properly set to true/false
        * Carbamidomethyl is updated and set if label is 15N

    OMSSA
        * Carbamidomethyl is updated and set
        * Selenocystein is not reported with the correct unimod modification

    MS-Amanda
        * Selenocystein is not reported with the correct unimod modification
        * multiple protein ID per peptide are splitted in two entries.
          (is done in MS-Amanda postflight)
        * short protein IDs are mapped to the full protein ID, it is checked
          which peptides map on which protein ID (is done in MS-Amanda
          postflight)

    '''
    print('''
[ unifycsv ] Converting {0} of engine {1} to unified CSV format...
        '''.format(
        os.path.basename(input_file),
        search_engine,
    ))

    # get the rows which define a unique PSM (i.e. sequence+spec+score...)
    psm_defining_colnames = get_psm_defining_colnames(score_colname)
    joinchar = params['translations']['protein_delimiter']
    do_not_delete = False
    created_tmp_files = []
    use15N = False

    if 'label' in params.keys():
        if params['label'] == '15N':
            use15N = True
    else:
        params['label'] = '14N'
    # print(use15N)
    # exit()
    aa_exception_dict = params['translations']['aa_exception_dict']
    n_term_replacement = {
        'Ammonia-loss': None,
        'Trimethyl': None,
        'Gly->Val': None,
    }
    fixed_mods = {}
    opt_mods = {}
    modname2aa = {}
    cam = False

    # mod pattern
    mod_pattern = re.compile(r''':(?P<pos>[0-9]*$)''')

    for modification in params['translations']['modifications']:
        aa = modification.split(',')[0]
        mod_type = modification.split(',')[1]
        pos = modification.split(',')[2]
        name = modification.split(',')[3]
        if name not in modname2aa.keys():
            modname2aa[name] = []
        modname2aa[name].append(aa)
        if 'N-term' in pos:
            n_term_replacement[name] = aa
        if mod_type == 'fix':
            fixed_mods[aa] = name
        if mod_type == 'opt':
            opt_mods[aa] = name
        if 'C,fix,any,Carbamidomethyl' in modification:
            cam = True

    cc = ursgal.ChemicalComposition()
    ursgal.GlobalUnimodMapper._reparseXML()
    de_novo_engines = ['novor', 'pepnovo', 'uninovo', 'unknown_engine']
    database_search_engines = [
        'msamanda', 'msgf', 'myrimatch', 'omssa', 'xtandem'
    ]
    de_novo = False
    database_search = False
    for de_novo_engine in de_novo_engines:
        if de_novo_engine in search_engine.lower():
            de_novo = True
    for db_se in database_search_engines:
        if db_se in search_engine.lower():
            database_search = True

    if upeptide_mapper is None:
        upapa = ursgal.UPeptideMapper()
    else:
        upapa = upeptide_mapper

    if database_search is True:
        target_decoy_peps = set()
        non_enzymatic_peps = set()
        pep_map_lookup = {}
        fasta_lookup_name = upapa.build_lookup_from_file(
            params['translations']['database'],
            force=False,
        )
    # print('Cached!')
    # input()
    psm_counter = Counter()
    # if a PSM with multiple rows is found (i.e. in omssa results), the psm
    # rows are merged afterwards

    output_file_object = open(output_file, 'w')
    protein_id_output = open(output_file + '_full_protein_names.txt', 'w')
    mz_buffer = {}
    csv_kwargs = {'extrasaction': 'ignore'}
    if sys.platform == 'win32':
        csv_kwargs['lineterminator'] = '\n'
    else:
        csv_kwargs['lineterminator'] = '\r\n'
    total_lines = len(list(csv.reader(open(input_file, 'r'))))
    ze_only_buffer = {}

    if params['translations']['enzyme'] != 'nonspecific':
        allowed_aa, cleavage_site, inhibitor_aa = params['translations'][
            'enzyme'].split(';')
    else:
        allowed_aa = ''.join(list(ursgal.ursgal_kb.NITROGENS.keys()))
        cleavage_site = 'C'
        inhibitor_aa = ''
    allowed_aa += '-'

    with open(input_file, 'r') as in_file:
        csv_input = csv.DictReader(in_file)

        output_fieldnames = list(csv_input.fieldnames)
        for remove_fieldname in [
                'proteinacc_start_stop_pre_post_;',
                'Start',
                'Stop',
                'NIST score',
                'gi',
                'Accession',
        ]:
            if remove_fieldname not in output_fieldnames:
                continue
            output_fieldnames.remove(remove_fieldname)
        new_fieldnames = [
            'uCalc m/z',
            'Accuracy (ppm)',
            'Protein ID',
            'Sequence Start',
            'Sequence Stop',
            'Sequence Pre AA',
            'Sequence Post AA',
        ]

        for new_fieldname in new_fieldnames:
            if new_fieldname not in output_fieldnames:
                output_fieldnames.insert(-5, new_fieldname)
        csv_output = csv.DictWriter(output_file_object, output_fieldnames,
                                    **csv_kwargs)
        csv_output.writeheader()
        print('''[ unify_cs ] parsing csv''')
        import time
        for line_nr, line_dict in enumerate(csv_input):
            if line_nr % 500 == 0:
                print(
                    '[ unify_cs ] Processing line number: {0}/{1} .. '.format(
                        line_nr,
                        total_lines,
                    ),
                    end='\r')

            if line_dict['Spectrum Title'] != '':
                '''
                Valid for:
                    OMSSA
                    MSGF+
                    X!Tandem
                '''
                if 'RTINSECONDS=' in line_dict['Spectrum Title']:
                    line_2_split = line_dict['Spectrum Title'].split(
                        ' ')[0].strip()
                else:
                    line_2_split = line_dict['Spectrum Title']
                line_dict['Spectrum Title'] = line_2_split

                input_file_basename, spectrum_id, _spectrum_id, charge = line_2_split.split(
                    '.')
                pure_input_file_name = ''

            elif 'scan=' in line_dict['Spectrum ID']:
                pure_input_file_name = os.path.basename(
                    line_dict['Raw data location'])
                input_file_basename = pure_input_file_name.split(".")[0]
                # not using os.path.splitext because we could have multiple file
                # extensions (i.e. ".mzml.gz")
                '''
                Valid for:
                    myrimatch
                '''
                spectrum_id = line_dict['Spectrum ID'].split('=')[-1]
                line_dict['Spectrum Title'] = '{0}.{1}.{1}.{2}'.format(
                    input_file_basename, spectrum_id, line_dict['Charge'])

            elif line_dict['Spectrum Title'] == '':
                '''
                Valid for:
                    Novor
                '''
                pure_input_file_name = os.path.basename(
                    line_dict['Raw data location'])
                input_file_basename = pure_input_file_name.split(".")[0]
                spectrum_id = line_dict['Spectrum ID']
                line_dict['Spectrum Title'] = '{0}.{1}.{1}.{2}'.format(
                    input_file_basename, spectrum_id, line_dict['Charge'])
            else:
                raise Exception(
                    'New csv format present for engine {0}'.format(engine))

            #update spectrum ID from block above
            line_dict['Spectrum ID'] = spectrum_id

            # now check for the basename in the scan rt lookup
            # possible cases:
            #   - input_file_basename
            #   - input_file_basename + prefix
            #   - input_file_basename - prefix

            input_file_basename_for_rt_lookup = None
            if input_file_basename in scan_rt_lookup.keys():
                input_file_basename_for_rt_lookup = input_file_basename
            else:
                basename_with_prefix = '{0}_{1}'.format(
                    params['prefix'], input_file_basename)
                basename_without_prefix = input_file_basename.replace(
                    params['prefix'], '')
                if basename_with_prefix in scan_rt_lookup.keys():
                    input_file_basename_for_rt_lookup = basename_with_prefix
                elif basename_without_prefix in scan_rt_lookup.keys():
                    input_file_basename_for_rt_lookup = basename_without_prefix
                else:
                    print('''
Could not find scan ID {0} in scan_rt_lookup[ {1} ]
                        '''.format(spectrum_id, input_file_basename))

            retention_time_in_minutes = \
                scan_rt_lookup[ input_file_basename_for_rt_lookup ][ 'scan_2_rt' ]\
                    [ spectrum_id ]

            #we should check if data has minute format or second format...
            if scan_rt_lookup[input_file_basename]['unit'] == 'second':
                rt_corr_factor = 1
            else:
                rt_corr_factor = 60
            line_dict['Retention Time (s)'] = float(
                retention_time_in_minutes) * rt_corr_factor

            #
            # now lets buffer for real !! :)
            #
            _ze_ultra_buffer_key_ = '{Sequence} || {Charge} || {Modifications} || '.format(
                **line_dict) + params['label']
            if _ze_ultra_buffer_key_ not in ze_only_buffer.keys():
                line_dict_update = {}
                #
                # Modification block

                # some engines do not report fixed modifications
                # include in unified csv
                if fixed_mods != {}:
                    for pos, aminoacid in enumerate(line_dict['Sequence']):
                        if aminoacid in fixed_mods.keys():
                            name = fixed_mods[aminoacid]
                            tmp = '{0}:{1}'.format(name, pos + 1)
                            if tmp in line_dict['Modifications']:
                                # everything is ok :)
                                pass
                            else:
                                tmp_mods = line_dict['Modifications'].split(
                                    ';')
                                tmp_mods.append(tmp)
                                line_dict['Modifications'] = ';'.join(tmp_mods)

                # Myrimatch and msgf+ can not handle 15N that easily
                # report all AAs moded with unknown modification
                # Note: masses are checked below to avoid any mismatch
                if use15N:
                    if 'myrimatch' in search_engine.lower() or \
                            'msgfplus_v9979' in search_engine.lower():
                        for p in range(1, len(line_dict['Sequence']) + 1):
                            line_dict['Modifications'] = \
                                line_dict['Modifications'].replace(
                                    'unknown modification:{0}'.format(p),
                                    '',
                                    1,
                                )
                    if 'myrimatch' in search_engine.lower():
                        if 'Carboxymethyl' in line_dict[
                                'Modifications'] and cam == True:
                            line_dict['Modifications'] = line_dict[
                                'Modifications'].replace(
                                    'Carboxymethyl', 'Carbamidomethyl')
                        elif 'Delta:H(6)C(3)O(1)' in line_dict[
                                'Modifications']:
                            line_dict['Modifications'] = line_dict[
                                'Modifications'].replace(
                                    'Delta:H(6)C(3)O(1)', 'Carbamidomethyl')

                tmp_mods = []
                for modification in line_dict['Modifications'].split(';'):
                    Nterm = False
                    Cterm = False
                    skip_mod = False
                    if modification == '':
                        continue
                    pos, mod = None, None
                    match = mod_pattern.search(modification)
                    pos = int(match.group('pos'))
                    mod = modification[:match.start()]
                    assert pos is not None, '''
                            The format of the modification {0}
                            is not recognized by ursgal'''.format(modification)
                    if pos <= 1:
                        Nterm = True
                        new_pos = 1
                    elif pos > len(line_dict['Sequence']):
                        Cterm = True
                        new_pos = len(line_dict['Sequence'])
                    else:
                        new_pos = pos
                    aa = line_dict['Sequence'][new_pos - 1].upper()
                    # if aa in fixed_mods.keys():
                    #     fixed_mods[ aminoacid ]
                    #     # fixed mods are corrected/added already
                    #     continue
                    if mod in modname2aa.keys():
                        correct_mod = False
                        if aa in modname2aa[mod]:
                            # everything is ok
                            correct_mod = True
                        elif Nterm or Cterm:
                            if '*' in modname2aa[mod]:
                                correct_mod = True
                                # still is ok
                        assert correct_mod is True, '''
                                A modification was reported for an aminoacid for which it was not defined
                                unify_csv cannot deal with this, please check your parameters and engine output
                                reported modification: {0} on {1}
                                modifications in parameters: {2}
                                '''.format(
                            mod, aa, params['translations']['modifications'])
                    elif 'unknown modification' == mod:
                        modification_known = False
                        if aa in opt_mods.keys():
                            # fixed mods are corrected/added already
                            modification = '{0}:{1}'.format(
                                opt_mods[aa], new_pos)
                            modification_known = True
                        assert modification_known == True, '''
                                unify csv does not work for the given unknown modification for
                                {0} {1} aa: {2}
                                maybe an unknown modification with terminal position was given?
                                '''.format(line_dict['Sequence'], modification,
                                           aa)
                    else:
                        if aa in fixed_mods.keys() and use15N \
                            and 'msgfplus' in search_engine.lower():
                            if pos != 0:
                                mod = float(
                                    mod) - ursgal.ursgal_kb.DICT_15N_DIFF[aa]
                        try:
                            name_list = ursgal.GlobalUnimodMapper.appMass2name_list(
                                round(float(mod), 3), decimal_places=3)
                        except:
                            print('''
                                A modification was reported that was not included in the search parameters
                                unify_csv cannot deal with this, please check your parameters and engine output
                                reported modification: {0}
                                modifications in parameters: {1}
                                '''.format(
                                mod, params['translations']['modifications']))
                            raise Exception('unify_csv failed because a '\
                                'modification was reported that was not '\
                                'given in params.'
                                '{0}'.format(modification)
                            )
                        mapped_mod = False
                        for name in name_list:
                            if name in modname2aa.keys():
                                if aa in modname2aa[name]:
                                    modification = '{0}:{1}'.format(
                                        name, new_pos)
                                    mapped_mod = True
                                elif Nterm and '*' in modname2aa[name]:
                                    modification = '{0}:{1}'.format(name, 0)
                                    mapped_mod = True
                                else:
                                    continue
                            elif use15N and name in [
                                    'Label:15N(1)', 'Label:15N(2)',
                                    'Label:15N(3)', 'Label:15N(4)'
                            ]:
                                mapped_mod = True
                                skip_mod = True
                                break
                        assert mapped_mod is True, '''
                                A mass was reported that does not map on any unimod or userdefined modification
                                or the modified aminoacid is not the specified one
                                unify_csv cannot deal with this, please check your parameters and engine output
                                reported mass: {0}
                                maps on: {1}
                                reported modified aminoacid: {2}
                                modifications in parameters: {3}
                                '''.format(
                            mod, name_list, aa,
                            params['translations']['modifications'])
                    if modification in tmp_mods or skip_mod is True:
                        continue
                    tmp_mods.append(modification)
                line_dict_update['Modifications'] = ';'.join(tmp_mods)
                #
                # ^^--------- REPLACED MODIFICATIONS! ---------------^
                #
                for unimod_name in n_term_replacement.keys():
                    if '{0}:1'.format(unimod_name) in line_dict_update[
                            'Modifications'].split(';'):
                        if unimod_name in modname2aa.keys():
                            aa = modname2aa[unimod_name]
                            if aa != ['*']:
                                if line_dict['Sequence'][0] in aa:
                                    continue
                        line_dict_update['Modifications'] = line_dict_update[
                            'Modifications'].replace(
                                '{0}:1'.format(unimod_name),
                                '{0}:0'.format(unimod_name))

                for aa_to_replace, replace_dict in aa_exception_dict.items():
                    if aa_to_replace in line_dict['Sequence']:
                        #change mods only if unimod has to be changed...
                        if 'unimod_name' in replace_dict.keys():
                            for r_pos, aa in enumerate(line_dict['Sequence']):
                                if aa == aa_to_replace:
                                    index_of_U = r_pos + 1
                                    unimod_name = replace_dict['unimod_name']
                                    if cam and replace_dict[
                                            'original_aa'] == 'C':
                                        unimod_name = replace_dict[
                                            'unimod_name_with_cam']
                                    new_mod = '{0}:{1}'.format(
                                        unimod_name, index_of_U)
                                    if line_dict_update['Modifications'] == '':
                                        line_dict_update[
                                            'Modifications'] += new_mod
                                    else:
                                        line_dict_update[
                                            'Modifications'] += ';{0}'.format(
                                                new_mod)
                        line_dict['Sequence'] = line_dict['Sequence'].replace(
                            aa_to_replace, replace_dict['original_aa'])

                line_dict_update['Sequence'] = line_dict['Sequence']
                #
                # ^^--------- REPLACED SEQUENCE! ---------------^
                #
                # remove the double ';''
                if line_dict_update['Modifications'] != '':
                    tmp = []
                    for e in line_dict_update['Modifications'].split(';'):
                        if e == '':
                            # that remove the doubles ....
                            continue
                        else:
                            # other way to do it...
                            # pos_of_split_point = re.search( ':\d*\Z', e )
                            # pattern = re.compile( r''':(?P<pos>[0-9]*$)''' )
                            for occ, match in enumerate(
                                    mod_pattern.finditer(e)):
                                mod = e[:match.start()]
                                mod_pos = e[match.start() + 1:]
                                # mod, pos = e.split(':')
                                m = (int(mod_pos), mod)
                                if m not in tmp:
                                    tmp.append(m)
                    tmp.sort()
                    line_dict_update['Modifications'] = ';'.join(
                        ['{m}:{p}'.format(m=mod, p=pos) for pos, mod in tmp])

                # calculate m/z
                cc.use('{Sequence}#{Modifications}'.format(**line_dict_update))
                if use15N:
                    number_N = dc(cc['N'])
                    cc['15N'] = number_N
                    del cc['N']
                    if cam:
                        c_count = line_dict_update['Sequence'].count('C')
                        cc['14N'] = c_count
                        cc['15N'] -= c_count
                    # mass = mass + ( DIFFERENCE_14N_15N * number_N )
                mass = cc._mass()
                calc_mz = ursgal.ucore.calculate_mz(mass, line_dict['Charge'])
                # mz_buffer[ buffer_key ] = calc_mz

                line_dict_update['uCalc m/z'] = calc_mz
                # if 'msamanda' in search_engine.lower():
                # ms amanda does not return calculated mz values
                if line_dict['Calc m/z'] == '':
                    line_dict_update['Calc m/z'] = calc_mz

                line_dict_update['Accuracy (ppm)'] = \
                    (float(line_dict['Exp m/z']) - line_dict_update['uCalc m/z'])/line_dict_update['uCalc m/z'] * 1e6
                prec_m_accuracy = (
                    params['translations']['precursor_mass_tolerance_minus'] +
                    params['translations']['precursor_mass_tolerance_plus']
                ) / 2
                i = 0
                while abs(
                        line_dict_update['Accuracy (ppm)']) > prec_m_accuracy:
                    i += 1
                    if i > len(params['translations']
                               ['precursor_isotope_range'].split(',')) - 1:
                        break
                    isotope = params['translations'][
                        'precursor_isotope_range'].split(',')[i]
                    isotope = int(isotope)
                    if isotope == 0:
                        continue
                    calc_mz = ursgal.ucore.calculate_mz(
                        mass + isotope * 1.008664904, line_dict['Charge'])
                    line_dict_update['Accuracy (ppm)'] = \
                        (float(line_dict['Exp m/z']) - calc_mz)/calc_mz * 1e6

                # ------------
                # BUFFER END
                # -----------
                ze_only_buffer[_ze_ultra_buffer_key_] = line_dict_update

            line_dict_update = ze_only_buffer[_ze_ultra_buffer_key_]
            line_dict.update(line_dict_update)

            # protein block, only for database search engine
            if database_search is True:
                # remap peptides to proteins, check correct enzymatic
                # cleavage and decoy assignment
                lookup_identifier = '{0}><{1}'.format(line_dict['Sequence'],
                                                      fasta_lookup_name)
                if lookup_identifier not in pep_map_lookup.keys():
                    tmp_decoy = set()
                    # tmp_protein_id = {}

                    upeptide_maps = upapa.map_peptide(
                        peptide=line_dict['Sequence'],
                        fasta_name=fasta_lookup_name)
                    '''
                    <><><><><><><><><><><><><>
                    '''
                    # assert upeptide_maps != [],'''
                    #         The peptide {0} could not be mapped to the
                    #         given database {1}

                    #         {2}

                    #         '''.format(
                    #             line_dict['Sequence'],
                    #             fasta_lookup_name,
                    #             ''
                    #         )
                    if upeptide_maps == []:
                        print('''
[ WARNING ] The peptide {0} could not be mapped to the
[ WARNING ] given database {1}
[ WARNING ] {2}
[ WARNING ] This PSM will be skipped.
                            '''.format(line_dict['Sequence'],
                                       fasta_lookup_name, ''))
                        continue

                    sorted_upeptide_maps = [
                        protein_dict
                        for protein_dict in sorted(upeptide_maps,
                                                   key=lambda x: x['id'])
                    ]
                    # sorted(bacterial_protein_collector[race].items(),key=lambda x: x[1]['psm_count'])
                    # print()
                    # print(line_dict['Sequence'])
                    # print(sorted_upeptide_maps)
                    protein_mapping_dict = None
                    last_protein_id = None
                    for protein in sorted_upeptide_maps:
                        # print(line_dict)
                        # print(protein)
                        add_protein = False
                        nterm_correct = False
                        cterm_correct = False
                        if params['translations'][
                                'keep_asp_pro_broken_peps'] is True:
                            if line_dict['Sequence'][-1] == 'D' and\
                                    protein['post'] == 'P':
                                cterm_correct = True
                            if line_dict['Sequence'][0] == 'P' and\
                                    protein['pre'] == 'D':
                                nterm_correct = True

                        if cleavage_site == 'C':
                            if protein['pre'] in allowed_aa\
                                    or protein['start'] in [1, 2, 3]:
                                if line_dict['Sequence'][0] not in inhibitor_aa\
                                        or protein['start'] in [1, 2, 3]:
                                    nterm_correct = True
                            if protein['post'] not in inhibitor_aa:
                                if line_dict['Sequence'][-1] in allowed_aa\
                                     or protein['post'] == '-':
                                    cterm_correct = True

                        elif cleavage_site == 'N':
                            if protein['post'] in allowed_aa:
                                if line_dict['Sequence'][-1] not in inhibitor_aa\
                                        or protein['post'] == '-':
                                    cterm_correct = True
                            if protein['pre'] not in inhibitor_aa\
                                or protein['start'] in [1, 2, 3]:
                                if line_dict['Sequence'][0] in allowed_aa\
                                    or protein['start'] in [1, 2, 3]:
                                    nterm_correct = True

                        if params['translations']['semi_enzyme'] is True:
                            if cterm_correct is True or nterm_correct is True:
                                add_protein = True
                        elif cterm_correct is True and nterm_correct is True:
                            add_protein = True

                        if add_protein is True:
                            # print(add_protein)
                            # print(cterm_correct, nterm_correct)
                            if protein_mapping_dict is None:
                                protein_mapping_dict = {
                                    'Protein ID': protein['id'],
                                    'Sequence Start': str(protein['start']),
                                    'Sequence Stop': str(protein['end']),
                                    'Sequence Pre AA': protein['pre'],
                                    'Sequence Post AA': protein['post'],
                                }
                            else:
                                if protein['id'] == last_protein_id:
                                    tmp_join_char = ';'
                                else:
                                    tmp_join_char = joinchar

                                    protein_mapping_dict[
                                        'Protein ID'] += '{0}{1}'.format(
                                            tmp_join_char, protein['id'])

                                protein_mapping_dict[
                                    'Sequence Start'] += '{0}{1}'.format(
                                        tmp_join_char, str(protein['start']))
                                protein_mapping_dict[
                                    'Sequence Stop'] += '{0}{1}'.format(
                                        tmp_join_char, str(protein['end']))
                                protein_mapping_dict[
                                    'Sequence Pre AA'] += '{0}{1}'.format(
                                        tmp_join_char, protein['pre'])
                                protein_mapping_dict[
                                    'Sequence Post AA'] += '{0}{1}'.format(
                                        tmp_join_char, protein['post'])

                            # print(protein_mapping_dict['Protein ID' ])
                            last_protein_id = protein['id']

                            # mzidentml-lib does not always set 'Is decoy' correctly
                            # (it's always 'false' for MS-GF+ results), this is fixed here:
                            if params['translations']['decoy_tag'] in protein[
                                    'id']:
                                tmp_decoy.add('true')
                            else:
                                tmp_decoy.add('false')

                    if protein_mapping_dict is None:
                        non_enzymatic_peps.add(line_dict['Sequence'])
                        continue

                    if len(protein_mapping_dict['Protein ID']) >= 2000:
                        print('{0}: {1}'.format(
                            line_dict['Sequence'],
                            protein_mapping_dict['Protein ID']),
                              file=protein_id_output)
                        protein_mapping_dict[
                            'Protein ID'] = protein_mapping_dict[
                                'Protein ID'][:1990] + ' ...'
                        do_not_delete = True

                    if len(tmp_decoy) >= 2:
                        target_decoy_peps.add(line_dict['Sequence'])
                        protein_mapping_dict['Is decoy'] = 'true'
                    else:
                        protein_mapping_dict['Is decoy'] = list(tmp_decoy)[0]

                    pep_map_lookup[lookup_identifier] = protein_mapping_dict

                buffered_protein_mapping_dict = pep_map_lookup[
                    lookup_identifier]
                line_dict.update(buffered_protein_mapping_dict)
                # count each PSM occurence to check whether row-merging is needed:
                psm = tuple([line_dict[x] for x in psm_defining_colnames])
                psm_counter[psm] += 1

            csv_output.writerow(line_dict)
            '''
                to_be_written_csv_lines.append( line_dict )
            '''
    output_file_object.close()

    if database_search is True:
        # upapa.purge_fasta_info( fasta_lookup_name )
        if len(non_enzymatic_peps) != 0:
            print('''
                [ WARNING ] The following peptides could not be mapped to the
                [ WARNING ] given database {0}
                [ WARNING ] with correct enzymatic cleavage sites:
                [ WARNING ] {1}
                [ WARNING ] These PSMs were skipped.'''.format(
                params['translations']['database'], non_enzymatic_peps))
        if len(target_decoy_peps) != 0:
            print('''
                [ WARNING ] The following peptides occured in a target as well as decoy protein
                [ WARNING ] {0}
                [ WARNING ] 'Is decoy' has been set to 'True' '''.format(
                target_decoy_peps, ))

    # if there are multiple rows for a PSM, we have to merge them aka rewrite the csv...
    if psm_counter != Counter():
        if max(psm_counter.values()) > 1:
            merge_duplicate_psm_rows(
                output_file, psm_counter, psm_defining_colnames,
                params['translations']['psm_merge_delimiter'])
            '''
            to_be_written_csv_lines = merge_duplicate_psm_rows(
                to_be_written_csv_lines,
                psm_counter
            )
            '''
        '''
        do output_file magic with to_be_written_csv_lines
        '''
    if do_not_delete is False:
        created_tmp_files.append(output_file + '_full_protein_names.txt')
    return created_tmp_files
Esempio n. 6
0
def check_hill_notation(aa, chemformula):
    '''
    Check the Hill notation of ``aa`` after one water has been subtracted.

    A ``ursgal.ChemicalComposition`` is built from ``aa``, the formula
    'H2O' is subtracted from it, and the resulting Hill notation is
    compared against ``chemformula``.

    Arguments:
        aa: value handed to ``ursgal.ChemicalComposition`` (presumably an
            amino acid / peptide string -- confirm against the caller)
        chemformula: expected return value of ``hill_notation()``

    Raises:
        AssertionError: if the Hill notation differs from ``chemformula``
    '''
    composition = ursgal.ChemicalComposition(aa)
    composition.subtract_chemical_formula('H2O')
    # Echo the inputs and the composition first, so they are visible in the
    # captured output if the assertion below fails.
    print(aa, chemformula)
    print(composition)
    observed_formula = composition.hill_notation()
    assert observed_formula == chemformula
Esempio n. 7
0
    def preflight(self):
        """
        Formatting the command line and writing the param input file via
        self.params

        Returns:
                dict: self.params
        """
        self.input_file = os.path.join(self.params["input_dir_path"],
                                       self.params["input_file"])

        self.param_file_name = os.path.join(
            self.params["output_dir_path"],
            "{0}_msfragger.params".format(self.input_file),
        )
        self.created_tmp_files.append(self.param_file_name)
        # further prepare and translate params

        # pprint.pprint(self.params['translations']['_grouped_by_translated_key'])
        # pprint.pprint(self.params)
        # exit()
        self.params_to_write = {
            "output_file_extension":
            "tsv",  # tsv or pepXML we fix it...
            "output_format":
            "tsv",  # pepXML or tsv
            "digest_mass_range":
            "{0} {1}".format(
                self.params["translations"]["_grouped_by_translated_key"]
                ["precursor_min_mass"]["precursor_min_mass"],
                self.params["translations"]["_grouped_by_translated_key"]
                ["precursor_max_mass"]["precursor_max_mass"],
            ),
        }

        write_exclusion_list = [
            "precursor_min_mass",
            "precursor_max_mass",
            "precursor_min_charge",
            "precursor_max_charge",
            "label",
            "-Xmx",
            "header_translations",
            "validation_score_field",
        ]

        additional_15N_modifications = []
        if (self.params["translations"]["_grouped_by_translated_key"]["label"]
            ["label"] == "15N"):
            self.print_info(
                "Search with label=15N may still be errorprone. Evaluate with care!",
                caller="WARNING",
            )
            for aminoacid, N15_Diff in ursgal.ukb.DICT_15N_DIFF.items():
                existing = False
                for mod_dict in self.params["mods"]["fix"]:
                    if aminoacid == mod_dict["aa"]:
                        mod_dict["mass"] += N15_Diff
                        mod_dict["name"] += "_15N_{0}".format(aminoacid)
                        existing = True
                if existing == True:
                    continue
                else:
                    mod_key = "add_{0}_{1}".format(
                        aminoacid,
                        ursgal.chemical_composition_kb.aa_names[aminoacid])
                    self.params_to_write[mod_key] = N15_Diff

        self.mass_shift_lookup = {}
        self.mass_glycan_lookup = {}
        for msfragger_param_name in self.params["translations"][
                "_grouped_by_translated_key"].keys():
            for ursgal_param_name, param_value in self.params["translations"][
                    "_grouped_by_translated_key"][msfragger_param_name].items(
                    ):
                if msfragger_param_name in write_exclusion_list:
                    continue
                elif msfragger_param_name == "enzyme":
                    """
                    search_enzyme_name = Trypsin
                    search_enzyme_cutafter = KR
                    search_enzyme_butnotafter = P
                    """
                    aa_site, term, inhibitor = param_value.split(";")
                    self.params_to_write["search_enzyme_name"] = self.params[
                        "enzyme"]
                    self.params_to_write["search_enzyme_cutafter"] = aa_site
                    self.params_to_write[
                        "search_enzyme_butnotafter"] = inhibitor
                elif msfragger_param_name == "num_enzyme_termini":
                    # num_enzyme_termini = 2 # 2 for enzymatic, 1 for
                    # semi-enzymatic, 0 for nonspecific digestion

                    if (self.params["translations"]
                        ["_grouped_by_translated_key"]["enzyme"]["enzyme"] ==
                            "nonspecific"):
                        self.params_to_write[msfragger_param_name] = 0
                    else:
                        self.params_to_write[
                            msfragger_param_name] = param_value
                elif msfragger_param_name == "clear_mz_range":
                    min_mz, max_mz = param_value
                    self.params_to_write[
                        msfragger_param_name] = "{0} {1}".format(
                            min_mz, max_mz)
                elif msfragger_param_name == "remove_precursor_range":
                    min_mz, max_mz = param_value
                    self.params_to_write[
                        msfragger_param_name] = "{0},{1}".format(
                            min_mz, max_mz)
                elif msfragger_param_name == "delta_mass_exclude_ranges":
                    min_mz, max_mz = param_value
                    self.params_to_write[
                        msfragger_param_name] = "({0},{1})".format(
                            min_mz, max_mz)
                elif msfragger_param_name == "precursor_mass_lower":
                    self.params_to_write[
                        msfragger_param_name] = -1 * param_value
                elif msfragger_param_name == "modifications":
                    """
                    #maximum of 7 mods - amino acid codes, * for any amino acid, [ and ] specifies protein termini, n and c specifies peptide termini
                    variable_mod_01 = 15.9949 M
                    variable_mod_02 = 42.0106 [*
                    #variable_mod_03 = 79.96633 STY
                    #variable_mod_03 = -17.0265 nQnC
                    #variable_mod_04 = -18.0106 nE
                    """
                    # print(self.params['translations']['_grouped_by_translated_key'][msfragger_param_name])
                    # pprint.pprint(self.params[ 'mods' ])
                    # exit()
                    mass_to_mod_aa = ddict(list)
                    for mod_dict in self.params["mods"]["opt"]:
                        """
                        {'_id': 0,
                          'aa': '*',
                          'composition': {'C': 2, 'H': 2, 'O': 1},
                          'id': '1',
                          'mass': 42.010565,
                          'name': 'Acetyl',
                          'org': '*,opt,Prot-N-term,Acetyl',
                          'pos': 'Prot-N-term',
                          'unimod': True},
                        """
                        aa_to_append = mod_dict["aa"]
                        pos_modifier = None
                        if mod_dict["pos"] == "Prot-N-term":
                            pos_modifier = "["
                        elif mod_dict["pos"] == "Prot-C-term":
                            pos_modifier = "]"
                        elif mod_dict["pos"] == "N-term":
                            pos_modifier = "n"
                        elif mod_dict["pos"] == "C-term":
                            pos_modifier = "c"
                        elif mod_dict["pos"] == "any":
                            pass
                        else:
                            print("""
                            Unknown positional argument for given modification:
                            {0}
                            MSFragger cannot deal with this, please use one of the follwing:
                            any, Prot-N-term, Prot-C-term, N-term, C-term
                            """.format(mod_dict["org"]))
                            sys.exit(1)
                        if pos_modifier is not None:
                            aa_to_append = "{0}{1}".format(
                                pos_modifier, aa_to_append)
                        mass_to_mod_aa[mod_dict["mass"]].append(aa_to_append)
                    for pos, (mass,
                              aa_list) in enumerate(mass_to_mod_aa.items()):
                        self.params_to_write["variable_mod_0{0}".format(
                            pos + 1)] = "{0} {1}".format(
                                mass, "".join(aa_list))
                    for mod_dict in self.params["mods"]["fix"]:
                        """
                        add_C_cysteine = 57.021464             # added to C - avg. 103.1429, mono. 103.00918
                        """
                        if mod_dict["pos"] == "Prot-N-term":
                            mod_key = "add_Nterm_protein"
                        elif mod_dict["pos"] == "Prot-C-term":
                            mod_key = "add_Cterm_protein"
                        elif mod_dict["pos"] == "N-term":
                            mod_key = "add_Nterm_peptide"
                        elif mod_dict["pos"] == "C-term":
                            mod_key = "add_Cterm_peptide"
                        else:
                            mod_key = "add_{0}_{1}".format(
                                mod_dict["aa"],
                                ursgal.chemical_composition_kb.aa_names[
                                    mod_dict["aa"]],
                            )
                        self.params_to_write[mod_key] = mod_dict["mass"]

                elif msfragger_param_name == "override_charge":
                    self.params_to_write[msfragger_param_name] = param_value
                    if param_value == 1:
                        self.params_to_write[
                            "precursor_charge"] = "{0} {1}".format(
                                self.params["translations"]
                                ["_grouped_by_translated_key"]
                                ["precursor_min_charge"]
                                ["precursor_min_charge"],
                                self.params["translations"]
                                ["_grouped_by_translated_key"]
                                ["precursor_max_charge"]
                                ["precursor_max_charge"],
                            )
                elif msfragger_param_name == "fragment_ion_series":
                    ion_list = []
                    for ion in param_value:
                        if ion not in [
                                "a",
                                "b",
                                "c",
                                "y~",
                                "x",
                                "y",
                                "z",
                                "b~",
                                "y-18",
                                "b-18",
                                "Y",
                        ]:
                            print("""
                                [ WARNING ] MSFragger does not allow the following ion:
                                {0}
                                This ion will be skipped, i.e. not included in the search.
                            """.format(ion))
                            continue
                        ion_list.append(ion)
                    self.params_to_write[msfragger_param_name] = ",".join(
                        ion_list)
                elif msfragger_param_name in [
                        "mass_offsets",
                        "Y_type_masses",
                ]:
                    cc = ursgal.ChemicalComposition()
                    umama = ursgal.UnimodMapper()
                    masses = []
                    for m in param_value["masses"]:
                        masses.append(str(m))
                    for m in param_value["glycans"]:
                        cc.clear()
                        cc.add_glycan(m)
                        mass = cc._mass()
                        masses.append(str(mass))
                        # for tm in self.transform_mass_add_error(mass):
                        tm = round(mass * 1e5)
                        if tm not in self.mass_glycan_lookup.keys():
                            self.mass_glycan_lookup[tm] = set()
                        self.mass_glycan_lookup[tm].add(m)
                    for m in param_value["chemical_formulas"]:
                        cc.clear()
                        cc.add_chemical_formula(m)
                        mass = cc._mass()
                        masses.append(str(mass))
                        # for tm in self.transform_mass_add_error(mass):
                        tm = round(mass * 1e5)
                        if tm not in self.mass_shift_lookup.keys():
                            self.mass_shift_lookup[tm] = set()
                        self.mass_shift_lookup[tm].add(m)
                    for m in param_value["unimods"]:
                        unimod_mass = umama.name2mass(m)
                        masses.append(str(unimod_mass))
                        # for tm in self.transform_mass_add_error(unimod_mass):
                        tm = round(mass * 1e5)
                        if tm not in self.mass_shift_lookup.keys():
                            self.mass_shift_lookup[tm] = set()
                        self.mass_shift_lookup[tm].add(m)
                    self.params_to_write[msfragger_param_name] = "/".join(
                        masses)
                elif msfragger_param_name == "diagnostic_fragments":
                    cc = ursgal.ChemicalComposition()
                    umama = ursgal.UnimodMapper()
                    masses = []
                    for m in param_value["masses"]:
                        masses.append(m)
                    for m in param_value["glycans"]:
                        cc.clear()
                        cc.add_glycan(m)
                        masses.append(cc._mass())
                    for m in param_value["chemical_formulas"]:
                        cc.clear()
                        cc.add_chemical_formula(m)
                        masses.append(cc._mass())
                    for m in param_value["unimods"]:
                        unimod_mass = umama.name2mass(m)
                        masses.append(unimod_mass)
                    mzs = []
                    for mass in masses:
                        mzs.append(str(ursgal.ucore.calculate_mz(mass, 1)))
                    self.params_to_write[msfragger_param_name] = "/".join(mzs)
                else:
                    self.params_to_write[msfragger_param_name] = param_value

        self.write_params_file()

        if (self.input_file.lower().endswith(".mzml")
                or self.input_file.lower().endswith(".mzml.gz")
                or self.input_file.lower().endswith(".mgf")):
            self.params["translations"]["mzml_input_file"] = self.input_file
        # elif self.input_file.lower().endswith('.mgf'):
        #     self.params['translations']['mzml_input_file'] = \
        #         self.meta_unodes['ucontroller'].get_mzml_that_corresponds_to_mgf( self.input_file )
        #     self.print_info(
        #         'MSFragger can only read Proteowizard MGF input files,'
        #         'the corresponding mzML file {0} will be used instead.'.format(
        #             os.path.abspath(self.params['translations']['mzml_input_file'])
        #         ),
        #         caller = "INFO"
        #     )
        else:
            raise Exception(
                "MSFragger input spectrum file must be in mzML or MGF format!")

        self.params["command_list"] = [
            "java",
            "-Xmx{0}".format(self.params["translations"]
                             ["_grouped_by_translated_key"]["-Xmx"]["-xmx"]),
            "-jar",
            self.exe,
            self.param_file_name,
            self.params["translations"]["mzml_input_file"],
        ]

        self.params["translations"]["output_file_incl_path"] = os.path.join(
            self.params["output_dir_path"], self.params["output_file"])
        return self.params
Esempio n. 8
0
    def preflight(self):
        '''
        Build the pNovo parameter set and command line from self.params.

        Resolves the input/output paths, copies the translated parameters
        into self.params_to_write, converts the ursgal modifications into
        pNovo modification lines, calls self.write_params_file() and stores
        the command in self.params['command_list'].

        Side effects:
            * sets self.param_file_name, self.params_to_write and
              self.mod_lookup (pNovo letter -> (modification name, aa))
            * rewrites 'pos' of terminal variable modifications and 'mass'
              of residue specific modifications in self.params['mods']
            * calls sys.exit(1) for modifications that cannot be expressed
              in the pNovo parameter format

        Returns:
            dict: self.params
        '''
        translations = self.params['translations']
        translations['mgf_input_file'] = os.path.join(
            self.params['input_dir_path'], self.params['input_file'])
        translations['output_file_incl_path'] = os.path.join(
            self.params['output_dir_path'], self.params['output_file'])

        # Remove only a trailing '.csv'. str.strip('.csv') treats its
        # argument as a set of characters and strips '.', 'c', 's' and 'v'
        # from both ends, which mangles names such as 'scans.csv'.
        param_file_base = translations['output_file_incl_path']
        if param_file_base.endswith('.csv'):
            param_file_base = param_file_base[:-len('.csv')]
        self.param_file_name = param_file_base + '_pnovo.param'
        # self.created_tmp_files.append(self.param_file_name)

        self.params_to_write = {
            'output_dir_path': self.params['output_dir_path'],
            'input_file': translations['mgf_input_file'],
        }

        print('''
            [ WARNING ] precursor_mass_tolerance_plus and precursor_mass_tolerance_minus
            [ WARNING ] need to be combined for pNovo (use of symmetric tolerance window).
            [ WARNING ] The arithmetic mean is used.
            ''')
        grouped_params = translations['_grouped_by_translated_key']
        grouped_params['pep_tol'] = {
            'precursor_mass_tolerance': (
                float(self.params['precursor_mass_tolerance_plus'])
                + float(self.params['precursor_mass_tolerance_minus'])
            ) / 2.0
        }
        opt_mods = []
        fix_mods = []
        self.mod_lookup = {}
        for pnovo_param_name, grouped_values in grouped_params.items():
            for param_value in grouped_values.values():
                if pnovo_param_name == 'spec_path1':
                    self.params_to_write[pnovo_param_name] = translations[
                        'mgf_input_file'].replace('.mgf', '.ms2')
                    self.params_to_write['out_path'] = os.path.dirname(
                        translations['output_file_incl_path'])
                elif pnovo_param_name == 'modifications':
                    # Format of the modification lines in the pNovo
                    # parameter file (taken from its template):
                    #
                    #   variable modification, keyed by a letter a-z, the
                    #   value is mass(aa) + mass(modification):
                    #       a=147.035405 Oxidation[M]
                    #   N- or C-terminal variable modification:
                    #       c-term=0.984016
                    #   fixed modification, keyed by the amino acid:
                    #       C=160.030654 Carbamidomethyl[C]
                    import string
                    alphabet = list(string.ascii_lowercase)
                    sum_opt_mods = 0
                    for mod_dict in self.params['mods']['opt']:
                        # Example of a mod_dict:
                        # {'_id': 0,
                        #  'aa': '*',
                        #  'composition': {'C': 2, 'H': 2, 'O': 1},
                        #  'id': '1',
                        #  'mass': 42.010565,
                        #  'name': 'Acetyl',
                        #  'org': '*,opt,Prot-N-term,Acetyl',
                        #  'pos': 'Prot-N-term',
                        #  'unimod': True},
                        if 'Prot' in mod_dict['pos']:
                            print('''
                            Protein N/C-terminal modifications are not supported by pNovo
                            Please change or delete the following modification:
                            {0}
                            '''.format(mod_dict['org']))
                            sys.exit(1)
                        elif mod_dict['pos'] == 'N-term':
                            mod_dict['pos'] = 'n-term'
                        elif mod_dict['pos'] == 'C-term':
                            mod_dict['pos'] = 'c-term'
                        elif mod_dict['pos'] == 'any':
                            pass
                        else:
                            print('''
                            Unknown positional argument for given modification:
                            {0}
                            pNovo (or Ursgal) cannot deal with this, please use one of the follwing:
                            any, Prot-N-term, Prot-C-term, N-term, C-term
                            '''.format(mod_dict['org']))
                            sys.exit(1)

                        if 'term' in mod_dict['pos']:
                            # Terminal modifications are written with the
                            # plain modification mass.
                            if mod_dict['aa'] != '*':
                                print('''
                                    Specific amino acids are not supported with terminal modifications
                                    in pNovo. Please change or delete the following modification:
                                    {0}
                                    '''.format(mod_dict['org']))
                                sys.exit(1)
                            opt_mods.append('{0}={1}'.format(
                                mod_dict['pos'],
                                mod_dict['mass'],
                            ))
                            continue

                        if mod_dict['aa'] == '*':
                            print('''
                                Not sure how to handle this modification in pNovo:
                                {0}
                                '''.format(mod_dict['org']))
                            sys.exit(1)
                        if sum_opt_mods >= len(alphabet):
                            # Every residue specific variable modification
                            # needs its own letter; fail with a message
                            # instead of an IndexError.
                            print('''
                                pNovo only provides the letters a-z for variable modifications,
                                the following modification exceeds this limit:
                                {0}
                                '''.format(mod_dict['org']))
                            sys.exit(1)
                        # Mass of the modified residue (aa + modification).
                        cc = ursgal.ChemicalComposition()
                        cc.use('{0}#{1}:1'.format(mod_dict['aa'],
                                                  mod_dict['name']))
                        mod_dict['mass'] = cc._mass()
                        mod_letter = alphabet[sum_opt_mods]
                        opt_mods.append('{0}={1} {2}[{3}]'.format(
                            mod_letter,
                            mod_dict['mass'],
                            mod_dict['name'],
                            mod_dict['aa'],
                        ))
                        self.mod_lookup[mod_letter] = (
                            mod_dict['name'], mod_dict['aa'])
                        sum_opt_mods += 1

                    for mod_dict in self.params['mods']['fix']:
                        if 'term' in mod_dict['pos']:
                            print('''
                            Fixed N/C-terminal modifications are not supported by pNovo
                            Please change or delete the following modification:
                            {0}
                            '''.format(mod_dict['org']))
                            sys.exit(1)
                        cc = ursgal.ChemicalComposition()
                        cc.use('{0}#{1}:1'.format(mod_dict['aa'],
                                                  mod_dict['name']))
                        mod_dict['mass'] = cc._mass()
                        # NOTE(review): fixed modifications are collected in
                        # opt_mods, so fix_mods stays empty and 'FixMod' is
                        # always written as ''. Kept unchanged because
                        # write_params_file() is not visible here - confirm
                        # whether these lines belong in fix_mods.
                        opt_mods.append('{0}={1} {2}[{3}]'.format(
                            mod_dict['aa'],
                            mod_dict['mass'],
                            mod_dict['name'],
                            mod_dict['aa'],
                        ))
                else:
                    self.params_to_write[pnovo_param_name] = param_value
        self.params_to_write['FixMod'] = '\n'.join(fix_mods)
        self.params_to_write['VarMod'] = '\n'.join(opt_mods)

        self.write_params_file()

        self.params['command_list'] = [
            self.exe,
            self.param_file_name,
        ]
        print(' '.join(self.params['command_list']))
        return self.params
Esempio n. 9
0
    def preflight(self):
        """
        Build the pNovo parameter set and command line from self.params.

        Resolves the input/output paths, copies the translated parameters
        into self.params_to_write, converts the ursgal modifications into
        pNovo modification lines, calls self.write_params_file() and stores
        the command in self.params["command_list"].

        Side effects:
            * sets self.param_file_name, self.params_to_write and
              self.mod_lookup (pNovo letter -> (modification name, aa))
            * rewrites "pos" of terminal variable modifications and "mass"
              of residue specific modifications in self.params["mods"]
            * calls sys.exit(1) for modifications that cannot be expressed
              in the pNovo parameter format

        Returns:
            dict: self.params
        """
        translations = self.params["translations"]
        translations["mgf_input_file"] = os.path.join(
            self.params["input_dir_path"], self.params["input_file"]
        )
        translations["output_file_incl_path"] = os.path.join(
            self.params["output_dir_path"], self.params["output_file"]
        )

        # Remove only a trailing ".csv". str.strip(".csv") treats its
        # argument as a set of characters and strips ".", "c", "s" and "v"
        # from both ends, which mangles names such as "scans.csv".
        param_file_base = translations["output_file_incl_path"]
        if param_file_base.endswith(".csv"):
            param_file_base = param_file_base[: -len(".csv")]
        self.param_file_name = param_file_base + "_pnovo.param"
        # self.created_tmp_files.append(self.param_file_name)

        self.params_to_write = {
            "output_dir_path": self.params["output_dir_path"],
            "input_file": translations["mgf_input_file"],
        }

        print(
            """
            [ WARNING ] precursor_mass_tolerance_plus and precursor_mass_tolerance_minus
            [ WARNING ] need to be combined for pNovo (use of symmetric tolerance window).
            [ WARNING ] The arithmetic mean is used.
            """
        )
        grouped_params = translations["_grouped_by_translated_key"]
        grouped_params["pep_tol"] = {
            "precursor_mass_tolerance": (
                float(self.params["precursor_mass_tolerance_plus"])
                + float(self.params["precursor_mass_tolerance_minus"])
            )
            / 2.0
        }
        opt_mods = []
        fix_mods = []
        self.mod_lookup = {}
        for pnovo_param_name, grouped_values in grouped_params.items():
            for param_value in grouped_values.values():
                if pnovo_param_name == "spec_path1":
                    self.params_to_write[pnovo_param_name] = translations[
                        "mgf_input_file"
                    ].replace(".mgf", ".ms2")
                    self.params_to_write["out_path"] = os.path.dirname(
                        translations["output_file_incl_path"]
                    )
                elif pnovo_param_name == "modifications":
                    # Format of the modification lines in the pNovo
                    # parameter file (taken from its template):
                    #
                    #   variable modification, keyed by a letter a-z, the
                    #   value is mass(aa) + mass(modification):
                    #       a=147.035405 Oxidation[M]
                    #   N- or C-terminal variable modification:
                    #       c-term=0.984016
                    #   fixed modification, keyed by the amino acid:
                    #       C=160.030654 Carbamidomethyl[C]
                    import string

                    alphabet = list(string.ascii_lowercase)
                    sum_opt_mods = 0
                    for mod_dict in self.params["mods"]["opt"]:
                        # Example of a mod_dict:
                        # {'_id': 0,
                        #  'aa': '*',
                        #  'composition': {'C': 2, 'H': 2, 'O': 1},
                        #  'id': '1',
                        #  'mass': 42.010565,
                        #  'name': 'Acetyl',
                        #  'org': '*,opt,Prot-N-term,Acetyl',
                        #  'pos': 'Prot-N-term',
                        #  'unimod': True},
                        if "Prot" in mod_dict["pos"]:
                            print(
                                """
                            Protein N/C-terminal modifications are not supported by pNovo
                            Please change or delete the following modification:
                            {0}
                            """.format(
                                    mod_dict["org"]
                                )
                            )
                            sys.exit(1)
                        elif mod_dict["pos"] == "N-term":
                            mod_dict["pos"] = "n-term"
                        elif mod_dict["pos"] == "C-term":
                            mod_dict["pos"] = "c-term"
                        elif mod_dict["pos"] == "any":
                            pass
                        else:
                            print(
                                """
                            Unknown positional argument for given modification:
                            {0}
                            pNovo (or Ursgal) cannot deal with this, please use one of the follwing:
                            any, Prot-N-term, Prot-C-term, N-term, C-term
                            """.format(
                                    mod_dict["org"]
                                )
                            )
                            sys.exit(1)

                        if "term" in mod_dict["pos"]:
                            # Terminal modifications are written with the
                            # plain modification mass.
                            if mod_dict["aa"] != "*":
                                print(
                                    """
                                    Specific amino acids are not supported with terminal modifications
                                    in pNovo. Please change or delete the following modification:
                                    {0}
                                    """.format(
                                        mod_dict["org"]
                                    )
                                )
                                sys.exit(1)
                            opt_mods.append(
                                "{0}={1}".format(
                                    mod_dict["pos"],
                                    mod_dict["mass"],
                                )
                            )
                            continue

                        if mod_dict["aa"] == "*":
                            print(
                                """
                                Not sure how to handle this modification in pNovo:
                                {0}
                                """.format(
                                    mod_dict["org"]
                                )
                            )
                            sys.exit(1)
                        if sum_opt_mods >= len(alphabet):
                            # Every residue specific variable modification
                            # needs its own letter; fail with a message
                            # instead of an IndexError.
                            print(
                                """
                                pNovo only provides the letters a-z for variable modifications,
                                the following modification exceeds this limit:
                                {0}
                                """.format(
                                    mod_dict["org"]
                                )
                            )
                            sys.exit(1)
                        # Mass of the modified residue (aa + modification).
                        cc = ursgal.ChemicalComposition()
                        cc.use("{0}#{1}:1".format(mod_dict["aa"], mod_dict["name"]))
                        mod_dict["mass"] = cc._mass()
                        mod_letter = alphabet[sum_opt_mods]
                        opt_mods.append(
                            "{0}={1} {2}[{3}]".format(
                                mod_letter,
                                mod_dict["mass"],
                                mod_dict["name"],
                                mod_dict["aa"],
                            )
                        )
                        self.mod_lookup[mod_letter] = (
                            mod_dict["name"],
                            mod_dict["aa"],
                        )
                        sum_opt_mods += 1

                    for mod_dict in self.params["mods"]["fix"]:
                        if "term" in mod_dict["pos"]:
                            print(
                                """
                            Fixed N/C-terminal modifications are not supported by pNovo
                            Please change or delete the following modification:
                            {0}
                            """.format(
                                    mod_dict["org"]
                                )
                            )
                            sys.exit(1)
                        cc = ursgal.ChemicalComposition()
                        cc.use("{0}#{1}:1".format(mod_dict["aa"], mod_dict["name"]))
                        mod_dict["mass"] = cc._mass()
                        # NOTE(review): fixed modifications are collected in
                        # opt_mods, so fix_mods stays empty and "FixMod" is
                        # always written as "". Kept unchanged because
                        # write_params_file() is not visible here - confirm
                        # whether these lines belong in fix_mods.
                        opt_mods.append(
                            "{0}={1} {2}[{3}]".format(
                                mod_dict["aa"],
                                mod_dict["mass"],
                                mod_dict["name"],
                                mod_dict["aa"],
                            )
                        )
                else:
                    self.params_to_write[pnovo_param_name] = param_value
        self.params_to_write["FixMod"] = "\n".join(fix_mods)
        self.params_to_write["VarMod"] = "\n".join(opt_mods)

        self.write_params_file()

        self.params["command_list"] = [
            self.exe,
            self.param_file_name,
        ]
        print(" ".join(self.params["command_list"]))
        return self.params
Esempio n. 10
0
File: run.py Progetto: JB-MS/SugarPy
    def build_combinations(
        self,
        max_tree_length=None,
        monosaccharides=None,
        mode='replacement',
    ):
        '''
        Builds combinations of the given monosaccharides for every tree
        length from 1 up to max_tree_length.

        Keyword arguments:
            max_tree_length (int): Maximum number of monosaccharides in one
                combination (required, despite the None default)
            monosaccharides (dict): Monosaccharide name -> chemical formula,
                defaults to self.monosaccharides
            mode (str): one of
                'replacement': combinations with replacement, grouped by
                    their summed chemical composition
                'combinations': combinations without replacement, grouped
                    by the number of monosaccharides
                'sugarqb': combinations with replacement, written as
                    '<name><count>,..._N-Glycan <mass>' lines to the file
                    'sugarqb_glycan_db.txt' in the working directory

        Returns:
            dict:
                'replacement': hill notation -> set of tuples with the
                    monosaccharide names yielding this composition
                'combinations': tree length -> list of
                    {monosaccharide name: count} dicts
                'sugarqb' or an unknown mode: empty dict (for an unknown
                    mode an error message is printed)

        Raises:
            TypeError: if max_tree_length is None

        ToDo: change monosaccharides to list and get compositions from ursgal.ChemicalComposition(),
              keyword argument for calculate_formula?
        '''
        if monosaccharides is None:
            monosaccharides = self.monosaccharides
        if max_tree_length is None:
            # Fail with a clear message instead of "None + 1".
            raise TypeError(
                'build_combinations requires max_tree_length to be an int')

        if mode not in ('combinations', 'replacement', 'sugarqb'):
            # Report the unknown mode once instead of once per tree length.
            print('''
                    [ SugarPy  ] ERROR! the mode for build_combinations
                    [ SugarPy  ] is not available: {0}
                '''.format(mode))
            return {}

        glycan_combinations = {}
        if mode == 'replacement':
            print('[ SugarPy  ] Building combinations for:')
            print(
                '[ SugarPy  ]',
                len(monosaccharides),
                'given monosaccharides and a max tree length of',
                max_tree_length
            )
            for nr_repeats in range(1, max_tree_length + 1):
                for combo in combinations_with_replacement(monosaccharides, nr_repeats):
                    cc = ursgal.ChemicalComposition()
                    for monosacch in combo:
                        cc.add_chemical_formula(monosaccharides[monosacch])
                    # Different combinations can share one composition.
                    glycan_combinations.setdefault(
                        cc.hill_notation_unimod(), set()
                    ).add(combo)
            print('[ SugarPy  ] built', len(glycan_combinations), 'combinations')
        elif mode == 'combinations':
            for nr_repeats in range(1, max_tree_length + 1):
                # dict.fromkeys drops duplicates like the former set() did,
                # but keeps the order in which combinations are generated.
                unique_combos = dict.fromkeys(
                    combinations(monosaccharides, nr_repeats))
                glycan_combinations[nr_repeats] = [
                    {monosacch: combo.count(monosacch) for monosacch in combo}
                    for combo in unique_combos
                ]
        else:
            # mode == 'sugarqb': the output file used to be referenced
            # without ever being opened (NameError); it is now opened here
            # and closed when the block is left.
            with open('sugarqb_glycan_db.txt', 'w') as sugarqb_glycan_db:
                for nr_repeats in range(1, max_tree_length + 1):
                    for combo in combinations_with_replacement(monosaccharides, nr_repeats):
                        cc = ursgal.ChemicalComposition()
                        glycan_dict = {}
                        print(combo)
                        for monosacch in combo:
                            cc.add_glycan(monosacch)
                            glycan_dict[monosacch] = combo.count(monosacch)
                        sugarqb_list = [
                            '{0}{1}'.format(monosacch, count)
                            for monosacch, count in glycan_dict.items()
                        ]
                        sugarqb_str = '{0}_N-Glycan {1}'.format(
                            ','.join(sorted(sugarqb_list)),
                            round(cc._mass(), 6)
                        )
                        print(sugarqb_str, file=sugarqb_glycan_db)

        return glycan_combinations
Esempio n. 11
0
File: run.py Progetto: JB-MS/SugarPy
    def add_glycans2peptide(
        self,
        peptide_list=None,
        max_tree_length=None,
        monosaccharides=None
    ):
        '''
        Adds the chemical composition of every glycan combination to each
        peptide of a given list.
        Peptides need to be in unimod style (Peptide#Modifications).
        The chemical composition of the original peptidoform is returned as well.

        Keyword Arguments:
            peptide_list (list): List of peptides in unimod style,
                None is treated as an empty list
            max_tree_length (int): maximum number of monosaccharides in one combination
            monosaccharides (dict): dictionary containing name and chemical
                composition of monosaccharides, defaults to self.monosaccharides

        Returns:
            dict: {'Sequence#Modifications': {hill_notation: [names]}}
                The entry for the unglycosylated peptide holds the peptide
                itself, glycosylated entries hold 'peptide|Name(count)...'
                strings with the monosaccharide names sorted.
        '''
        if peptide_list is None:
            peptide_list = []
        if monosaccharides is None:
            # Used to be a comparison ('=='), which left the value at None.
            monosaccharides = self.monosaccharides

        monosacch_combinations = self.build_combinations(
            max_tree_length=max_tree_length,
            monosaccharides=monosaccharides
        )
        peptides_with_glycans = {}
        for peptide in peptide_list:
            if peptide in peptides_with_glycans:
                # Repeated peptides are only processed once.
                continue
            cc = ursgal.ChemicalComposition()
            cc.use(peptide)
            glycoforms = {cc.hill_notation_unimod(): [peptide]}
            peptides_with_glycans[peptide] = glycoforms

            for composition, combos in monosacch_combinations.items():
                # Add the glycan temporarily, it is removed again below so
                # that cc holds the bare peptide for the next composition.
                cc.add_chemical_formula(composition)
                hill_notation = cc.hill_notation_unimod()
                for combo in combos:
                    counts = {}
                    for monosacch in combo:
                        counts[monosacch] = counts.get(monosacch, 0) + 1
                    combo_name = ''.join(
                        '{0}({1})'.format(monosacch, counts[monosacch])
                        for monosacch in sorted(counts)
                    )
                    glycoforms.setdefault(hill_notation, []).append(
                        '{0}|{1}'.format(peptide, combo_name)
                    )
                cc.subtract_chemical_formula(composition)

        print('[ SugarPy  ] Added glycans to peptides.')
        return peptides_with_glycans