Beispiel #1
0
def save_as_mgf(spectrums: List[Spectrum], filename: str):
    """Write one or several spectra to an MGF file.

    :py:attr:`~matchms.Spectrum.losses` of a spectrum are not written.

    Arguments:
    ----------
    spectrums:
        List of matchms Spectrum objects. Any value that is not a list is
        treated as one single spectrum.
    filename:
        Name of the file the spectrum(s) are written to.
    """
    # A non-list argument is taken to be one lone spectrum.
    batch = spectrums if isinstance(spectrums, list) else [spectrums]

    for entry in batch:
        # pyteomics works on plain dictionaries with exactly these three keys.
        mz_values = entry.peaks.mz
        intensity_values = entry.peaks.intensities
        as_dict = {
            "m/z array": mz_values,
            "intensity array": intensity_values,
            "params": entry.metadata,
        }
        # One write call per spectrum, each appending to the file.
        py_mgf.write([as_dict], filename)
def format_mgf_deepnovo(mgf_input: str, mgf_output: str):
    """Rewrite an MGF file in the layout used by DeepNovoV2.

    Only the parameters listed in ``key_order`` are kept, and they are
    written in that order. Spectra without any peak are skipped.

    Args:
        mgf_input (str): path to the input MGF file.
        mgf_output (str): path to the output MGF file.

    """
    key_order = ['title', 'pepmass', 'charge', 'scans', 'rtinseconds']
    with mgf.read(mgf_input, read_charges=False) as reader:
        for spectrum in reader:
            # Skip spectra that carry no peaks at all.
            if not spectrum['m/z array'].size:
                continue
            params = spectrum['params']
            # Collect the unwanted names first so the dictionary is not
            # modified while it is being iterated.
            unwanted = [name for name in params if name not in key_order]
            for name in unwanted:
                del params[name]
            # Write this spectrum to the output with the parameters in the
            # required order.
            mgf.write((spectrum,), mgf_output, key_order=key_order)
Beispiel #3
0
def writeMGF(spectra, outputFile):
    """Write spectra to an MGF file, preceded by the search-parameter header.

    Args:
        spectra: spectra in the form accepted by ``mgf.write``.
        outputFile: destination handed to ``mgf.write`` as ``output``.
    """
    # File-level parameters that belong at the top of the MGF file.
    headers = {
        'COM': 'OpenMS_search',
        'USERNAME': '******',
        'FORMAT': 'Mascot generic',
        'TOLU': 'Da',
        'ITOLU': 'Da',
        'FORMVER': '1.01',
        'DB': 'MSDB',
        'SEARCH': 'MIS',
        'REPORT': 'AUTO',
        'CLE': 'Trypsin',
        'MASS': 'monoisotopic',
        'INSTRUMENT': 'Default',
        'PFA': '1',
        'TOL': '3',
        'ITOL': '0.3',
        'TAXONOMY': 'All entries',
        'CHARGE': '1,2,3'
    }

    # The dictionary used to be built and then dropped; pass it on so the
    # parameters actually reach the output file.
    mgf.write(spectra=spectra, output=outputFile, header=headers)
Beispiel #4
0
def convert_mq_mracluster_mgf(mq_msms, mrcluster_clusters, mgf_file, output,
                              px_accession, raw_name):
    """Retitle clustered spectra with a USI-style accession and write them out.

    Every scan listed in the cluster file is looked up among the input spectra
    (by a title ending in ``scan=<scan>``). Matching spectra get a new title
    built by ``buid_usi_accession`` and are written to ``output``.

    Args:
        mq_msms: input handed to ``read_peptides``.
        mrcluster_clusters: input handed to ``read_clusters``.
        mgf_file: MGF file holding the input spectra.
        output: destination passed to ``mgf.write``.
        px_accession: forwarded to ``buid_usi_accession``.
        raw_name: forwarded to ``buid_usi_accession``.
    """
    if mq_msms is None or mrcluster_clusters is None or mgf_file is None:
        print_help()
        # Nothing can be converted without all three inputs; previously the
        # function carried on and failed while reading a ``None`` path.
        return

    # Read the input spectra and release the file handle right away.
    with mgf.read(mgf_file) as input_spectra:
        spectra_list = list(input_spectra)
    print('Number of Spectra: ' + str(len(spectra_list)))

    # Peptides are keyed by scan number. Note that one scan may be associated
    # with more than one peptide sequence.
    peptides = read_peptides(mq_msms)
    print('Number of Peptides: ' + str(len(peptides)))

    # Clusters map a scan to the cluster that scan belongs to.
    clusters = read_clusters(mrcluster_clusters)
    print("Number of Clusters: " + str(len(clusters)))

    for scan in clusters:
        print('scan: ' + str(scan))
        title_suffix = 'scan=' + str(scan)
        for spectra in spectra_list:
            if not spectra['params']['title'].endswith(title_suffix):
                continue
            cluster_accession = clusters[scan]
            # Scans without an identification are exported without a peptide.
            peptide_sequence = peptides[scan] if scan in peptides else None
            charge = int(spectra['params']['charge'][0])
            spectra['params']['title'] = buid_usi_accession(
                cluster_accession, peptide_sequence, scan, px_accession,
                raw_name, charge)
            mgf.write([spectra], output)
Beispiel #5
0
    def patchMgf(self, input_path, output_path):
        """Copy an MGF file while dropping peaks outside the accepted m/z window.

        A peak is removed when its m/z is below 175 or above a limit derived
        from the precursor m/z, the precursor charge and ``maxWindowDiff``.
        All spectra are collected first and written in one call.
        """
        maxWindowDiff = 156.10112 + 2 * 1.00782503 + 15.9949146
        with mgf.read(input_path) as spectra:
            spectra_out = []
            for spectrum in spectra:
                intensities = spectrum['intensity array']
                mz_values = spectrum['m/z array']
                params = spectrum['params']
                charge = spectrum['params']['charge'][0]

                # Positions of the peaks to discard.
                rejected = [
                    position
                    for position, mz in enumerate(mz_values)
                    # lower bound: smallest y ion - arginin
                    if mz < 175
                    or mz > spectrum['params']['pepmass'][0] * charge - (
                        charge - 1) * 1.00782503 - maxWindowDiff
                ]

                intensities = np.delete(intensities, rejected, 0)
                mz_values = np.delete(mz_values, rejected, 0)

                spectra_out.append({
                    'm/z array': mz_values,
                    'intensity array': intensities,
                    'params': params
                })

        mgf.write(spectra=spectra_out, output=output_path)
def annotate_mgf(mgf_input: str, mascot_input: str, mgf_output: str):
    """Add Mascot peptide sequences to the spectra of an MGF file.

    Each spectrum is matched by title against the sequences extracted from
    the Mascot XML results. The annotated spectra are written to
    ``mgf_output``; spectra without any matching sequence are not written.

    Args:
        mgf_input (str): path to the MGF input file.
        mascot_input (str): path to the Mascot XML results.
        mgf_output (str): path to the MGF output file.

    """
    # Table of sequences extracted from the Mascot results.
    mascot_seq = extract_mascot_sequences(mascot_input)

    with mgf.read(mgf_input, read_charges=False) as reader:
        for spectrum in reader:
            same_title = mascot_seq.title == spectrum['params']['title']
            matched = mascot_seq.loc[same_title, 'sequence'].values
            # A spectrum linked to several sequences is written once per
            # sequence.
            for sequence in matched:
                spectrum['params']['seq'] = sequence
                mgf.write((spectrum,), mgf_output)
Beispiel #7
0
 def export_annotated_spectra_to_mgf(self,
                                     mgf_path,
                                     report_just_heavy=False):
     """Write one spectrum per reference to ``mgf_path``.

     For each entry of ``self.references`` the peaks of all charge states are
     pooled into a single peak list. With ``report_just_heavy`` every peak is
     kept; otherwise peaks whose ``meta['mass']`` equals -1 are left out.
     References that end up without peaks are skipped.
     """
     spectra_out = []
     for ref in self.references:
         if report_just_heavy:
             ms = ref.just_create_heavy_ms()
         else:
             ms = ref.create_ms(iMin_similarity=self.min_rel_similarity)
         buf_peaks = []
         buf_int = []
         for chrg in ms.spectrum:
             for key in ms.spectrum[chrg].keys():
                 for mp in ms.spectrum[chrg][key]:
                     if report_just_heavy or mp.meta['mass'] != -1:
                         buf_peaks.append(mp.mz)
                         buf_int.append(mp.intensity)
         # Emit the reference once, after all charge states were pooled.
         # Appending inside the charge loop added the same (shared) lists
         # once per charge state and so duplicated the spectrum.
         if buf_peaks:
             spectra_out.append({
                 'm/z array': buf_peaks,
                 'intensity array': buf_int,
                 'params': ref.params
             })
     mgf.write(spectra=spectra_out, output=mgf_path)
Beispiel #8
0
def MGF_generator(df, folder_mzxml, output_filename):
    """Collect one scan per row of ``df`` from mzXML files into one MGF file.

    For every row, the mzXML file named after ``Original_Peaklist`` is opened
    and the scan ``ScanmzXML`` is looked up. The peaks of that scan are
    stored together with a title, a fixed ``1+`` charge, ``ExpMz`` as
    precursor mass and ``ScanId`` as scan number. The result is written to
    ``output_filename`` with ``.mgf`` appended.
    """
    collected = []
    for _, row in df.iterrows():
        mzxml_path = folder_mzxml + row['Original_Peaklist'] + '.mzXML'
        with mzxml.read(mzxml_path) as output:
            scan = output[str(row['ScanmzXML'])]
            title = ('File:' + row['Original_Peaklist'] + '.' +
                     str(row['ScanmzXML']) + ' "scan=' +
                     str(row['ScanId']) + '"')
            collected.append({
                'params': {
                    'TITLE': title,
                    'CHARGE': '1+',
                    'PEPMASS': str(row['ExpMz']),
                    'SCANS': str(row['ScanId'])
                },
                'm/z array': scan['m/z array'],
                'intensity array': scan['intensity array']
            })

    output_filename = output_filename + '.mgf'
    mgf.write(collected, output_filename)
    def load_recalibrate(self):
        """Shift the peaks of every spectrum in ``self.path`` and write them out.

        Each spectrum is loaded into a ``MasterSpectrum``. A reference peak at
        the mass returned by ``calculate_tag_tmt10`` is searched in it; when it
        is found, all peaks of the spectrum are shifted, either by a ppm-based
        amount (``self.type == 'ppm'``) or by the constant offset between the
        reference mass and the found peak (``self.type == 'absolute'``).
        Spectra without the reference peak are written unchanged. Every
        spectrum is written to ``self.file_out`` with its own ``mgf.write``
        call.

        Raises:
            ValueError: if a spectrum has to be shifted and ``self.type`` is
                neither ``'ppm'`` nor ``'absolute'``.
        """
        fc = calculate_Delta_by_ppm(self.ppm)
        tmt_mass = calculate_tag_tmt10()
        with mgf.read(self.path) as spectra:
            for spectrum in spectra:
                ms = MasterSpectrum()
                params = spectrum['params']
                for mass, intensity in zip(spectrum['m/z array'],
                                           spectrum['intensity array']):
                    ms.add(Peak(mass, intensity, fc))

                # Probe peak used to look up the reference mass in this spectrum.
                peak = Peak(tmt_mass, 0.5, fc)
                if peak.key() not in ms.spectrum[0]:
                    recalibrate = False
                else:
                    idx, bin_to_ack, a, b = ms.binary(
                        peak, 0,
                        len(ms.spectrum[0][peak.key()]) - 1, 0)
                    # An index of -1 is treated as "reference peak not found".
                    if idx == -1:
                        recalibrate = False
                    else:
                        recalibrate = True
                        recalibration_mass = ms.spectrum[0][peak.key()][idx].mz
                        diff = tmt_mass - recalibration_mass
                        print(params['title'])
                        print("original={0}\tdiff={1}".format(
                            recalibration_mass, diff))

                mass_list = []
                int_list = []
                if recalibrate:
                    # Computed once per spectrum, before ``diff`` is reused below.
                    ppm_shift = calculate_ppm_shift(diff, tmt_mass)

                for key in ms.spectrum[0].keys():
                    for mp in ms.spectrum[0][key]:
                        if recalibrate:
                            if self.type == 'ppm':
                                # ``diff`` is overwritten per peak here; the ppm
                                # mode only depends on ``ppm_shift`` from above.
                                diff = calculate_da_shift(mp.mz, ppm_shift)
                                mass_list.append(mp.mz + diff)
                            elif self.type == 'absolute':
                                # Same offset for every peak of the spectrum.
                                diff = diff
                                mass_list.append(mp.mz + diff)
                            else:
                                print(self.type)
                                raise ValueError("what did you dooooo")
                        else:
                            mass_list.append(mp.mz)
                        int_list.append(mp.intensity)
                print("len is:\t{0}".format(len(mass_list)))
                mgf.write(spectra=[{
                    'm/z array': mass_list,
                    'intensity array': int_list,
                    'params': params
                }],
                          output=self.file_out)
Beispiel #10
0
def clean_mgf_file(file):
    """Add a ``protein`` parameter to every spectrum of an MGF file.

    All sequences of the (hard-coded) FASTA file are joined into one long
    string, each followed by ``'.'``. The peptide of a spectrum (``seq``
    parameter with ``+<number>.<number>`` mass annotations removed) is
    searched in that string, and every hit is mapped back to the record it
    falls into. The parameter is stored as ``<count>/<id>/<id>/...``; a
    peptide without any hit is listed under its own sequence. The result is
    written next to the input as ``<name>_proteinsAdded.mgf``.

    Args:
        file: path of the MGF file to annotate.
    """
    fasta = 'C:/Users/ccranney/Desktop/Caleb_Files/data/2019-03-14-td-UP000005640.fasta'

    # Start offset of every record inside the concatenated sequence string.
    fDict = {}
    pieces = []
    offset = 0
    with open(fasta, 'r') as fasta_handle:
        for record in SeqIO.parse(fasta_handle, 'fasta'):
            fDict[offset] = record.id
            piece = str(record.seq) + '.'
            pieces.append(piece)
            offset += len(piece)
    longPep = ''.join(pieces)
    # The offsets never change, so sort them once instead of per spectrum.
    sorted_keys = sorted(fDict)

    cleaned = []
    count = 0
    pepCount = 0
    with mgf.read(file) as spectra:
        for spec in spectra:
            count += 1
            decoy = False
            if 'protein' in spec['params'] and 'DECOY' in spec['params']['protein']:
                decoy = True
            else:
                seq = re.sub(r'\+\d+\.\d+', '', spec['params']['seq'])
                proteins = set()
                # The peptide is a literal string, not a pattern.
                for hit in re.finditer(re.escape(seq), longPep):
                    # Last record starting at or before the hit position.
                    owner = bisect.bisect_right(sorted_keys, hit.start()) - 1
                    proteins.add(fDict[sorted_keys[owner]])
                if len(proteins) == 0:
                    proteins.add(spec['params']['seq'])

            # NOTE(review): for a decoy spectrum ``proteins`` still holds the
            # value left by the previous spectrum (and is undefined if the
            # very first spectrum is a decoy). This is kept as it was; please
            # confirm whether the input is guaranteed to place each decoy
            # right after its target.
            if decoy:
                proteins = ['DECOY_0_' + x for x in proteins]

            protein = str(len(proteins)) + '/' + '/'.join(sorted(proteins))
            spec['params']['protein'] = protein
            if protein != '0/':
                pepCount += 1
            cleaned.append(spec)
            if count % 1000 == 0:
                print(count)
                print(pepCount)
                print(protein)

    cleanedFile = re.sub('(.*).mgf', r'\1_proteinsAdded.mgf', file)
    mgf.write(cleaned, cleanedFile)
    def select_mgf(self):
        """Copy the spectra whose scan id is in ``self.list_chosen``.

        The scan id is parsed from the spectrum title. The selected spectra
        are read from ``self.path_mgf_in`` and written to
        ``self.path_mgf_out`` in one call.
        """
        with mgf.read(self.path_mgf_in) as spectra:
            spectra_out = [
                {
                    'm/z array': spectrum['m/z array'],
                    'intensity array': spectrum['intensity array'],
                    'params': spectrum['params'],
                }
                for spectrum in spectra
                if int(parse_scan_id(spectrum['params']['title']))
                in self.list_chosen
            ]

        mgf.write(spectra=spectra_out, output=self.path_mgf_out)
Beispiel #12
0
def convert(args, out=sys.stdout):
    """Outputs spectral library sorted by mass."""

    # (precursor mass, original position, spectrum): the position breaks
    # ties, so two spectra are never compared with each other directly.
    keyed = []
    for position, spectrum in enumerate(mgf.read(args.lib_mgf)):
        print("Original spectra", spectrum)
        keyed.append((spectrum['params']['pepmass'][0], position, spectrum))

    keyed.sort()
    for entry in keyed:
        spectrum = entry[2]
        print("Sorted order", spectrum)
        # Spectra with too few peaks are reported above but not written.
        if spectrum['m/z array'].shape[0] >= args.min_peak_count:
            mgf.write([spectrum], output=out)

    """
Beispiel #13
0
 def setUp(self):
     """Read ``test.mgf``, write it to a temporary file and read it back."""
     self.path = 'test.mgf'
     self.header = mgf.read_header(self.path)
     self.spectra = list(mgf.read(self.path))
     self.ns = len(self.spectra)

     # Round trip through a scratch file opened for reading and writing.
     self.tmpfile = tempfile.TemporaryFile(mode='r+')
     mgf.write(header=self.header, spectra=self.spectra,
               output=self.tmpfile)

     # Rewind before each of the two reads.
     self.tmpfile.seek(0)
     self.header2 = mgf.read_header(self.tmpfile)
     self.tmpfile.seek(0)
     self.spectra2 = list(mgf.read(self.tmpfile))
     self.tmpfile.close()
Beispiel #14
0
def save_mgf_output(merge_result, ms2_file, output_dir, timestamp):
    """Write ``merge_result["merged_mgf"]`` to the merged-output location.

    The output directory is created when it does not exist yet. Charges are
    not written, and peaks are formatted with four decimals.
    """
    # Make sure the directory for the merged file exists.
    target_dir = generate_output_directory_name(ms2_file, output_dir,
                                                timestamp)
    os.makedirs(target_dir, exist_ok=True)

    merged_mgf_filename = generate_output_merged_mgf_name(
        ms2_file, output_dir, timestamp)
    print("\nWriting merged MGF: " + merged_mgf_filename)

    write_options = {
        'output': merged_mgf_filename,
        'use_numpy': True,
        'write_charges': False,
        'fragment_format': '%.4f %.4f',
    }
    mgf.write(merge_result["merged_mgf"], **write_options)
def clean_mgf_file(mgfFile, fasta, ions=False):
    """Add a ``protein`` parameter to the spectra of an MGF file.

    All sequences of ``fasta`` are joined into one long string, each followed
    by ``'.'``. The peptide of a spectrum (``seq`` parameter with
    ``+<number>.<number>`` mass annotations removed) is searched in that
    string, and every hit is mapped back to the record it falls into. The
    parameter is stored as ``<count>/<id>/<id>/...``; a peptide without any
    hit is listed as ``protein_not_in_fasta_<seq>``. The output is written
    next to the input as ``<name>_proteinsAdded.mgf`` (with an additional
    ``_YBionsOnly`` when ``ions`` is set).

    Args:
        mgfFile: path of the MGF file to annotate.
        fasta: path of the FASTA file with the protein sequences.
        ions: when true, keep only peaks accepted by ``smf.approx_list``
            against the values from ``return_frag_mzs`` and drop spectra
            left without peaks.
    """
    # Start offset of every record inside the concatenated sequence string.
    fDict = {}
    pieces = []
    offset = 0
    with open(fasta, 'r') as fasta_handle:
        for record in SeqIO.parse(fasta_handle, 'fasta'):
            fDict[offset] = record.id
            piece = str(record.seq) + '.'
            pieces.append(piece)
            offset += len(piece)
    longPep = ''.join(pieces)
    # The offsets never change, so sort them once instead of per spectrum.
    sorted_keys = sorted(fDict)

    cleaned = []
    count = 0
    pepCount = 0
    with mgf.read(mgfFile) as spectra:
        for spec in spectra:
            count += 1
            if ions:
                mzValues = return_frag_mzs(spec['params']['seq'], 1)
                peaks = [
                    peak
                    for peak in zip(spec['m/z array'], spec['intensity array'])
                    if smf.approx_list(peak[0], mzValues) != -1
                ]
                if len(peaks) == 0:
                    continue
                peaks.sort(key=lambda x: x[0])
                spec['m/z array'], spec['intensity array'] = map(list, zip(*peaks))

            decoy = False
            if 'protein' in spec['params'] and 'DECOY' in spec['params']['protein']:
                decoy = True
            else:
                seq = re.sub(r'\+\d+\.\d+', '', spec['params']['seq'])
                proteins = set()
                # The peptide is a literal string, not a pattern.
                for hit in re.finditer(re.escape(seq), longPep):
                    insertion_point = bisect_left(sorted_keys, hit.start())
                    # Step back to the record that starts at or before the hit.
                    if (insertion_point == len(sorted_keys)
                            or sorted_keys[insertion_point] != hit.start()):
                        insertion_point -= 1
                    proteins.add(fDict[sorted_keys[insertion_point]])
                if len(proteins) == 0:
                    proteins.add('protein_not_in_fasta_' + spec['params']['seq'])

            # NOTE(review): for a decoy spectrum ``proteins`` still holds the
            # value left by the previous spectrum (and is undefined if the
            # very first spectrum is a decoy). This is kept as it was; please
            # confirm whether the input is guaranteed to place each decoy
            # right after its target.
            if decoy:
                proteins = ['DECOY_0_' + x for x in proteins]

            protein = str(len(proteins)) + '/' + '/'.join(sorted(proteins))
            if protein != '0/':
                spec['params']['protein'] = protein
                pepCount += 1
            cleaned.append(spec)
            if count % 1000 == 0:
                print(count)
                print(pepCount)
                print(protein)

    cleanedFile = re.sub('(.*).mgf', r'\1_proteinsAdded.mgf', mgfFile)
    if ions:
        cleanedFile = re.sub('(.*).mgf', r'\1_YBionsOnly.mgf', cleanedFile)
    mgf.write(cleaned, cleanedFile)
Beispiel #16
0
def old_mgf_writer(spectrum_data_dict, output_dir, datatype):
    """Write one flat spectrum dictionary to ``<output_dir><datatype>_data.mgf``.

    Deprecated. The two peak arrays are taken from the dictionary as they
    are; every other entry becomes a spectrum parameter.
    """
    array_keys = ('m/z array', 'intensity array')

    # Everything that is not a peak array is a parameter.
    params = {}
    for key in spectrum_data_dict.keys():
        if key not in array_keys:
            params[key] = spectrum_data_dict[key]

    record = {
        'm/z array': spectrum_data_dict['m/z array'],
        'intensity array': spectrum_data_dict['intensity array'],
        'params': params,
    }
    destination = output_dir + datatype + "_data.mgf"
    pytmgf.write(spectra=[record], output=destination)
Beispiel #17
0
    def patchMgf(self, input_path, output_path):
        """Copy an MGF file while removing excluded and precursor-related peaks.

        A peak is dropped when it is found in ``self.exclusionSpectrum``. A
        peak whose key is not present there is dropped when it lies inside
        the window around ``precursor - delta`` for any delta of
        ``self.precursorDeltas``. All spectra are written in one call.
        """
        with mgf.read(input_path) as spectra:
            spectra_out = []
            for spectrum in spectra:
                intensities = spectrum['intensity array']
                mz_values = spectrum['m/z array']
                params = spectrum['params']

                charge = spectrum['params']['charge'][0]
                precursor = calculatePrecursor(
                    mz=spectrum['params']['pepmass'][0], charge=charge)

                # Positions of the peaks to discard.
                rejected = []
                for position, mz in enumerate(mz_values):
                    peak = Peak(mz, 0, self.delta_function)
                    excluded = self.exclusionSpectrum.spectrum[0]
                    if peak.key() in excluded:
                        last = len(excluded[peak.key()]) - 1
                        idx, _, _, _ = self.exclusionSpectrum.binary(
                            peak, 0, last, 0)
                        if idx != -1:  # found
                            rejected.append(position)
                        continue
                    master = MasterPeak(peak)
                    # One entry per matching delta, exactly as before.
                    for precursorDelta in self.precursorDeltas:
                        if master.isInsideMz(precursor - precursorDelta):
                            rejected.append(position)

                intensities = np.delete(intensities, rejected, 0)
                mz_values = np.delete(mz_values, rejected, 0)

                spectra_out.append({
                    'm/z array': mz_values,
                    'intensity array': intensities,
                    'params': params
                })

        mgf.write(spectra=spectra_out, output=output_path)
Beispiel #18
0
def main():
    """Command-line entry point: average clustered spectra into an MGF file.

    In ``single`` mode the whole input is averaged into one spectrum. In
    ``encoded_clusters`` mode the input is handed to
    ``process_maracluster_mgf``. The result goes to the ``output`` argument
    (stdout when omitted), replacing or appending according to ``--append``.

    Choosing ``--pepmass lower_median`` (the default) also forces
    ``--rt mass_lower_median``.
    """
    pars = argparse.ArgumentParser()
    pars.add_argument('input', help='MGF file with clustered spectra.')
    pars.add_argument('output', nargs='?', help='Output file (default is stdout).')
    # The two help fragments are joined by the parser, so the first one needs
    # its own trailing space.
    pars.add_argument('--mode', choices=['single', 'encoded_clusters'], default='encoded_clusters',
                      help='Operation mode. Single: input MGF is interpreted as a single cluster. '
                           'encoded_clusters: cluster IDs are parsed out of spectrum titles.')
    pars.add_argument('--dyn-range', type=float, default=DYN_RANGE,
                      help='Dynamic range to apply to output spectra')
    pars.add_argument('--min-fraction', type=float, default=MIN_FRACTION,
                      help='Minimum fraction of cluster spectra where MS/MS peak is present.')
    pars.add_argument('--mz-accuracy', type=float, default=DIFF_THRESH,
                      help='Minimum distance between MS/MS peak clusters.')
    pars.add_argument('--append', action='store_true',
                      help='Append to output file instead of replacing it.')
    pars.add_argument('--rt', choices=['median', 'mass_lower_median'], default='median')
    pars.add_argument('--pepmass', choices=['naive_average', 'neutral_average',
                                            'lower_median'], default='lower_median')
    pars.add_argument('--msms_avg', choices=['naive', 'weighted'], default='weighted')

    args = pars.parse_args()
    # The lower-median precursor mass comes with its own retention time rule.
    if args.pepmass == 'lower_median':
        args.rt = 'mass_lower_median'
    get_rt = {'median': median_rt, 'mass_lower_median': lower_median_mass_rt}[args.rt]
    get_pepmass = {'naive_average': naive_average_mass_and_charge,
                   'neutral_average': neutral_average_mass_and_charge,
                   'lower_median': lower_median_mass}[args.pepmass]

    kwargs = {'mz_accuracy': args.mz_accuracy, 'dyn_range': args.dyn_range,
              'min_fraction': args.min_fraction, 'msms_avg': args.msms_avg}
    mode = 'a' if args.append else 'w'
    if args.mode == 'single':
        spectra = list(mgf.read(args.input))
        mz, c = get_pepmass(spectra)
        rt = get_rt(spectra)
        averaged = average_spectrum(spectra, title=args.output, pepmass=mz,
                                    charge=c, rtinseconds=rt, **kwargs)
        mgf.write([averaged], args.output, file_mode=mode)
    elif args.mode == 'encoded_clusters':
        clustered = process_maracluster_mgf(args.input, get_pepmass=get_pepmass,
                                            get_rt=get_rt, **kwargs)
        mgf.write(clustered, args.output, file_mode=mode)
    else:
        raise NotImplementedError('This mode is not implemented yet.')
 def seperateMgf(self, folder_out):
     """Write one MGF file per key of ``self.goodBadUgly``.

     For every key, ``self.pathMgf`` is read again and the spectra whose
     title is listed under that key are written to
     ``<folder_out><self.file_name>_<key>.mgf``.
     """
     for idea in self.goodBadUgly.keys():
         with mgf.read(self.pathMgf) as spectra:
             spectra_out = [
                 {
                     'm/z array': spectrum['m/z array'],
                     'intensity array': spectrum['intensity array'],
                     'params': spectrum['params'],
                 }
                 for spectrum in spectra
                 if spectrum['params']['title'] in self.goodBadUgly[idea]
             ]
         output_path = folder_out + self.file_name + "_" + str(idea) + ".mgf"
         mgf.write(spectra=spectra_out, output=output_path)
Beispiel #20
0
def save_mgf_output(merge_result, ms2_file, output_dir, timestamp):
    """Write ``merge_result["merged_mgf"]`` to the merged-output location.

    The output directory is created when it does not exist yet. A timestamped
    message is printed before and after writing. Charges are not written, and
    peaks are formatted with four decimals.
    """
    # Make sure the directory for the merged file exists.
    target_dir = generate_output_directory_name(ms2_file, output_dir,
                                                timestamp)
    os.makedirs(target_dir, exist_ok=True)

    merged_mgf_filename = generate_output_merged_mgf_name(
        ms2_file, output_dir, timestamp)
    short_name = basename(merged_mgf_filename)

    utility.print_timestamp(
        "Merge MS2/MS3 - Writing merged MGF - Start - " + short_name)
    mgf.write(merge_result["merged_mgf"],
              output=merged_mgf_filename,
              use_numpy=True,
              write_charges=False,
              fragment_format='%.4f %.4f')
    utility.print_timestamp(
        "Merge MS2/MS3 - Writing merged MGF - Complete - " + short_name)
Beispiel #21
0
def save_as_mgf(spectrums: List[Spectrum], filename: str):
    """Write one or several spectra to an MGF file.

    :py:attr:`~matchms.Spectrum.losses` of a spectrum are not written.

    Example:

    .. code-block:: python

        import numpy
        from matchms import Spectrum
        from matchms.exporting import save_as_mgf

        # Create dummy spectrum
        spectrum = Spectrum(mz=numpy.array([100, 200, 300], dtype="float"),
                            intensities=numpy.array([10, 10, 500], dtype="float"),
                            metadata={"charge": -1,
                                      "inchi": '"InChI=1S/C6H12"',
                                      "precursor_mz": 222.2})

        # Write spectrum to test file
        save_as_mgf(spectrum, "test.mgf")

    Parameters
    ----------
    spectrums:
        List of matchms Spectrum objects. Any value that is not a list is
        treated as one single spectrum.
    filename:
        Name of the file the spectrum(s) are written to.
    """
    def to_pyteomics(single):
        """Return the dictionary layout pyteomics works with."""
        return {
            "m/z array": single.peaks.mz,
            "intensity array": single.peaks.intensities,
            "params": single.metadata,
        }

    # A non-list argument is taken to be one lone spectrum.
    if isinstance(spectrums, list):
        to_save = spectrums
    else:
        to_save = [spectrums]

    # One write call per spectrum, each appending to the file.
    for single in to_save:
        py_mgf.write([to_pyteomics(single)], filename)
Beispiel #22
0
 def setUp(self):
     """Load the plain and annotated fixtures and round-trip the plain one."""
     self.path = 'test.mgf'
     self.header = mgf.read_header(self.path)
     with mgf.read(self.path) as reader:
         self.spectra = list(reader)
     self.ns = len(self.spectra)

     # Write the plain fixture to a scratch file and read it back.
     self.tmpfile = tempfile.TemporaryFile(mode='r+')
     mgf.write(header=self.header, spectra=self.spectra,
               output=self.tmpfile)
     self.tmpfile.seek(0)
     self.header2 = mgf.read_header(self.tmpfile)
     self.tmpfile.seek(0)
     self.spectra2 = list(mgf.read(self.tmpfile))
     self.tmpfile.close()

     # The annotated fixture is read with its ion annotations.
     self.path_annotated = 'test_annotated.mgf'
     self.header_annotated = mgf.read_header(self.path_annotated)
     with mgf.read(self.path_annotated, read_ions=True) as reader:
         self.spectra_annotated = list(reader)
Beispiel #23
0
 def test_read_write_with_ions(self):
     """Annotated spectra survive a write/read cycle, with and without numpy."""
     formats = ['{:.6f} {:.6f} {}', '%.6f %.6f %s']
     # Position 0 pairs use_numpy=0 with the first format, position 1 pairs
     # use_numpy=1 with the second one.
     for use_numpy, fragment_format in enumerate(formats):
         with tempfile.TemporaryFile(mode='r+') as scratch:
             mgf.write(self.spectra_annotated,
                       scratch,
                       write_ions=True,
                       use_numpy=use_numpy,
                       fragment_format=fragment_format)
             scratch.seek(0)
             spectra = list(mgf.read(scratch, read_ions=True))
         pairs = zip(data.mgf_spectra_annotated_long, spectra)
         for expected, actual in pairs:
             # Check that the spectra have the same dict keys
             self.assertEqual(expected.keys(), actual.keys())
             for key in expected.keys():
                 if type(expected[key]) is dict:
                     self.assertDictEqual(expected[key], actual[key])
                     continue
                 np.testing.assert_array_equal(expected[key], actual[key])
Beispiel #24
0
def save_as_mgf(spectrums: Union[Spectrum, List[Spectrum]], filename: str):
    """Write one or several spectra to an MGF file.

    Args:
    ----
    spectrums: list of Spectrum() objects, Spectrum() object
        The spectrum or spectra to write. Any value that is not a list is
        treated as one single spectrum.
    filename: str
        Name of the file the spectrum(s) are written to.
    """
    # A non-list argument is taken to be one lone spectrum.
    pending = spectrums if isinstance(spectrums, list) else [spectrums]

    for item in pending:
        # Dictionary layout pyteomics works with.
        converted = {}
        converted["m/z array"] = item.peaks.mz
        converted["intensity array"] = item.peaks.intensities
        converted["params"] = item.metadata
        # One write call per spectrum, each appending to the file.
        py_mgf.write([converted], filename)
Beispiel #25
0
def main():
    """Combine predicted retention times with predicted MS/MS spectra.

    The retention-time table and the MS/MS MGF file come from the snakemake
    inputs. The spectrum keys are split into one chunk per CPU core, the
    chunks are processed by ``utils.add_rt_to_spectra`` in a process pool,
    and the combined result is written to the first snakemake output.
    """
    rt_prediction_df = pd.read_csv(snakemake.input['rt_prediction_output'],
                                   sep="\t",
                                   index_col=False)
    reader_options = {
        'read_ions': True,
        'convert_arrays': 1,
        'index_by_scans': False,
    }
    with mgf.read(snakemake.input['msms_prediction_output'],
                  **reader_options) as msms_prediction_mgf:

        # Split indices into as many chunks as we have cores
        spectrum_keys = np.asarray(list(msms_prediction_mgf.index.keys()))
        index_splits = np.array_split(spectrum_keys, mp.cpu_count())

        # Merge RT with MSMS Chunk-Wise in parallel, then concatenate the results
        merge_chunk = partial(utils.add_rt_to_spectra, msms_prediction_mgf,
                              rt_prediction_df)
        with mp.Pool(mp.cpu_count()) as pool:
            chunk_results = pool.map(merge_chunk, index_splits)
        prediction_mgf_aslist = np.hstack(chunk_results)

        mgf.write(spectra=prediction_mgf_aslist,
                  output=snakemake.output[0],
                  fragment_format="%.6f %.7f %s",
                  header=msms_prediction_mgf.header,
                  write_charges=False,
                  write_ions=True)
def write_mgf(filename: str, spectra: List[sus.MsmsSpectrum]) -> None:
    """
    Save spectra to an MGF file, replacing any existing file of that name.

    The title of each entry is ``<cluster>;<identifier>`` of the spectrum.

    Parameters
    ----------
    filename : str
        The file name of the MGF output file.
    spectra : List[sus.MsmsSpectrum]
        The spectra to be written to the MGF file.
    """
    def as_entry(spectrum):
        """Return the dictionary layout used by the MGF writer."""
        entry = {
            'm/z array': spectrum.mz,
            'intensity array': spectrum.intensity,
        }
        entry['params'] = {
            'title': f'{spectrum.cluster};{spectrum.identifier}',
            'pepmass': spectrum.precursor_mz,
            'rtinseconds': spectrum.retention_time,
            'charge': spectrum.precursor_charge,
        }
        return entry

    # Convert everything before the output file is opened.
    spectra_dict = list(map(as_entry, spectra))
    with open(filename, 'w') as f_out:
        mgf.write(spectra_dict, f_out)
Beispiel #27
0
import os
import re
import sys

from pyteomics import mgf


def natural_sort(s, _nsre=re.compile('([0-9]+)')):
    """Return a sort key that orders embedded numbers by value.

    The string is split into digit runs and the text between them. Digit runs
    become integers; everything else is lower-cased.
    """
    key = []
    for chunk in _nsre.split(s):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk.lower())
    return key


# Directory holding the MGF files: first command-line argument, or the
# current directory when none is given.
if len(sys.argv) > 1:
    mgf_dir = sys.argv[1]
else:
    mgf_dir = '.'

# Scan numbers are assigned consecutively across all files, starting at 1.
scan_nr = 1
spectra = []
for filename in sorted(os.listdir(mgf_dir), key=natural_sort):
    if not filename.endswith('.mgf'):
        continue
    # Report the scan number at which this file starts.
    print(f'{filename}\t{scan_nr}')
    source = os.path.join(mgf_dir, filename)
    for spectrum_dict in mgf.read(source, use_index=False):
        spectrum_dict['params']['scans'] = str(scan_nr)
        spectra.append(spectrum_dict)
        scan_nr += 1

# Write everything into a single file, replacing an older one.
f_out = mgf.write(spectra, os.path.join(mgf_dir, 'merged.mgf'), file_mode='w')
f_out.close()
# Keep only library rows with a SMILES value: drop missing values and rows
# whose value is a single space, then trim surrounding whitespace.
lib = lib.dropna(subset=['Smiles'])  
lib = lib[lib.Smiles != ' ']
lib['Smiles'] = lib['Smiles'].str.strip() # remove white spaces

# Flatten the nested list held by ikeys and remove the 'InChIKey=' prefix.
# NOTE(review): ikeys and smiles are defined outside this excerpt.
ikeys = [j for i in ikeys.values.tolist() for j in i]
ikeys = [w.replace('InChIKey=', '') for w in ikeys]
smiles["inchikey"] = ikeys
# Use the same column name as the library table, one row per SMILES.
smiles = smiles.rename(columns = {'SMILES':'Smiles'})
smiles = smiles.drop_duplicates(subset=['Smiles']) 

# Attach the InChIKey to each library row; rows without one are dropped.
libcomb = pd.merge(lib, smiles,how="left",on="Smiles")
libcomb = libcomb.dropna(subset=['inchikey'])
libcomb = libcomb.drop_duplicates(subset='inchikey', keep='first', inplace=False) # remove duplicate InChIKeys

# load GNPS .mgf file downloaded from https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=6e22f85aeb0744208e872d1640f508d9
mgf_file = 'ProteoSAFe-METABOLOMICS-SNETS-6e22f85a-download_cluster_buckettable/METABOLOMICS-SNETS-6e22f85a-download_clustered_spectra-main_ChargeReplaced.mgf'
scans = libcomb.Scan.tolist()

# None of these change while the file is read, so build them once instead of
# converting whole columns again for every matching spectrum.
file_name = '{}.mgf'.format("GNPSLibraries_uniqueSMILES_withFeatureIDs")
smiles_by_row = libcomb.Smiles.tolist()
inchikeys_by_row = libcomb.inchikey.tolist()
scan_labels = [str(scan) for scan in scans]

# Every match of a spectrum against a library row is written as its own
# entry, numbered consecutively through FEATURE_ID.
counter=0
with mgf.read(mgf_file) as reader:
    for spectrum in reader:
        for idx, scan in enumerate(scans):
            if spectrum['params']['scans'] == scan_labels[idx]:
                spectrum['params']['SMILES'] = smiles_by_row[idx]
                spectrum['params']['InchiKey'] = inchikeys_by_row[idx]
                counter+=1
                spectrum['params']['FEATURE_ID'] = counter
                mgf.write((spectrum,), file_name)
def to_mgf(sample_run, output_path=None, header=None, **kwargs):
    """Write the spectra produced by ``process_database`` as MGF.

    ``header`` defaults to an empty dictionary. Extra keyword arguments are
    forwarded to ``process_database``. The value returned by ``mgf.write``
    is passed back to the caller.
    """
    effective_header = {} if header is None else header
    processed = process_database(sample_run, **kwargs)
    return mgf.write(processed, output_path, effective_header)
Beispiel #30
0
    # Write the spectra with a prediction above 0.5 to an MGF file.
    # NOTE(review): df, file, removekey, output_dir and mzxml_file are defined
    # outside this excerpt.
    output_df = df[df.prediction > 0.5]
    index_list = output_df['mzxml_index'].astype(int).tolist()
    # mzxml_index is shifted down by one to index into `file`.
    indices = [(x - 1) for x in index_list]
    if len(indices) > 0:
        steroid_file = file[indices]
        output_mgf = steroid_file.copy()
        for i in range(len(output_mgf)):
            # Remove the listed scan-level keys from the entry ...
            output_mgf[i] = removekey(steroid_file[i], [
                'num', 'centroided', 'retentionTime', 'polarity', 'msLevel',
                'collisionEnergy', 'peaksCount', 'lowMz', 'highMz',
                'basePeakMz', 'basePeakIntensity', 'totIonCurrent',
                'precursorMz', 'id'
            ])
            # ... and store the values needed for the MGF parameters instead.
            # The retention time is multiplied by 60 for 'rtinseconds'.
            output_mgf[i]['params'] = dict([
                ('title', steroid_file[i]['num']),
                ('rtinseconds', 60 * steroid_file[i]['retentionTime']),
                ('pepmass', steroid_file[i]['precursorMz'][0]['precursorMz']),
                ('charge', '0+')
            ])

        # The output name is relative, so the file lands in output_dir.
        os.chdir(output_dir)
        # The last six characters of the input name are replaced by '.mgf'.
        mgf_name = 'steroid_' + mzxml_file[0:-6] + '.mgf'
        mgf.write(output_mgf,
                  output=mgf_name,
                  key_order=['title', 'rtinseconds', 'pepmass', 'charge'],
                  write_charges=True)

    print('File completed')
Beispiel #31
0
        # Identifiers of the current spectrum.
        # NOTE(review): spectrum, mibig_entries and fwriter come from code
        # outside this excerpt.
        spec_id = spectrum['params']['spectrumid']
        spec_inchi = spectrum['params']['inchikey']
        spec_smiles = spectrum['params']['smiles']
        # One output row per entry of mibig_entries for this spectrum.
        for mibig_ids, mibig_inchi, mibig_smiles in mibig_entries:
            mibig_id, mibig_name, ext_id = mibig_ids
            output_list = [
                spec_id, spec_inchi, spec_smiles, mibig_id, mibig_name,
                mibig_inchi, mibig_smiles
            ]
            fwriter.writerow(output_list)
            #print(output_string)

# In[8]:

# Write an MGF file with the matched spectra (the first element of each match).
mgf.write([x[0] for x in matches], 'matched_mibig_gnps_update.mgf')

# In[14]:

# Notebook cell: show the parameters of the first matched spectrum.
matches[0][0]['params']

# In[24]:

# Tag every matched spectrum with '<bgc id>.<spectrum id>' in its BGCID
# parameter and collect the spectra.
matches_with_bgc_id = []
for ms, bgc in matches:
    # Only the id nested first in the match's second element is used.
    bgc_id = bgc[0][0][0]
    spectrum_id = ms['params']['spectrumid']
    compound_id = '.'.join((bgc_id, spectrum_id))
    ms['params']['BGCID'] = compound_id
    matches_with_bgc_id.append(ms)