Exemple #1
0
def calculate_ndp_time(spectra_mgf_file1, spectra_mgf_file2):
  """Time the binning and the pairwise scoring of two MGF files.

  Both files are read with ``convert_arrays=1``, every spectrum is binned
  with ``ndp_bin_spectrum`` and the binned spectra are scored pairwise, by
  position, with ``caculate_nornalization_dp``.  The elapsed time of each of
  the two phases is printed; nothing is returned.

  Args:
    spectra_mgf_file1: First MGF input; its spectrum count drives the
      pairwise loop.
    spectra_mgf_file2: Second MGF input; it must hold at least as many
      spectra as the first, otherwise the positional lookup raises
      ``IndexError``.
  """
  encode_start = time.perf_counter()

  spectra01 = read(spectra_mgf_file1, convert_arrays=1)
  spectra02 = read(spectra_mgf_file2, convert_arrays=1)

  bins_spectrum_01 = [
    ndp_bin_spectrum(entry.get("m/z array"), entry.get("intensity array"))
    for entry in spectra01
  ]
  bins_spectrum_02 = [
    ndp_bin_spectrum(entry.get("m/z array"), entry.get("intensity array"))
    for entry in spectra02
  ]

  encode_end = time.perf_counter()
  print("两文件编码所用的时间为:{}".format(encode_end - encode_start))

  # The scores are only computed for timing purposes; they are not returned.
  score_list = [
    caculate_nornalization_dp(binned, bins_spectrum_02[position])
    for position, binned in enumerate(bins_spectrum_01)
  ]
  score_end = time.perf_counter()
  print("Similarity use time: {}".format(score_end - encode_end))
Exemple #2
0
 def test_read_array_conversion(self):
     """Check the container types produced by each ``convert_arrays`` mode."""
     # convert_arrays=0: the parsed spectra must equal the reference data.
     with mgf.read(self.path, convert_arrays=0) as reader:
         self.assertEqual(data.mgf_spectra_lists, list(reader))
     # convert_arrays=2: charge array is expected to be a masked array.
     with mgf.read(self.path, convert_arrays=2) as reader:
         spectrum = next(reader)
         # assertIsInstance reports the offending type when it fails,
         # unlike assertTrue(isinstance(...)).
         self.assertIsInstance(spectrum['charge array'],
                               np.ma.core.MaskedArray)
         self.assertIsInstance(spectrum['m/z array'], np.ndarray)
     # convert_arrays=1: both arrays are expected to be ndarrays.
     with mgf.read(self.path, convert_arrays=1) as reader:
         spectrum = next(reader)
         self.assertIsInstance(spectrum['charge array'], np.ndarray)
         self.assertIsInstance(spectrum['m/z array'], np.ndarray)
def main(cluster_file, consensus_file):
    """Show a mirror plot of a cluster spectrum against the consensus spectrum.

    The first spectrum of ``consensus_file`` provides the consensus spectrum
    and the peptide sequence (taken from its title).  ``cluster_file`` is read
    to the end and only its last spectrum is kept, processed and plotted on
    top of the consensus spectrum.

    Parameters
    ----------
    cluster_file : str
        MGF file with the cluster member spectra.
    consensus_file : str
        MGF file whose first spectrum is the consensus spectrum.
    """
    # Only the first consensus spectrum is used.
    with mgf.read(consensus_file) as reader:
        for spectrum_dict in reader:
            peptide_seq = spectrum_dict['params']['title']
            precursor_mz = spectrum_dict['params']['pepmass'][0]
            precursor_charge = spectrum_dict['params']['charge'][0]
            cons_mz = spectrum_dict['m/z array']
            cons_intensity = spectrum_dict['intensity array']
            retention_time = float(spectrum_dict['params']['rtinseconds'])
            break
    cons_spec = sus.MsmsSpectrum(peptide_seq,
                                 precursor_mz=precursor_mz,
                                 precursor_charge=precursor_charge,
                                 mz=cons_mz,
                                 intensity=cons_intensity,
                                 retention_time=retention_time,
                                 peptide=peptide_seq)
    with mgf.read(cluster_file) as reader:
        # The loop overwrites its variables on every iteration, so the values
        # of the last spectrum in the file are the ones used below.
        for spectrum_dict in reader:
            precursor_mz = spectrum_dict['params']['pepmass'][0]
            precursor_charge = spectrum_dict['params']['charge'][0]
            mz = spectrum_dict['m/z array']
            intensity = spectrum_dict['intensity array']
            retention_time = float(spectrum_dict['params']['rtinseconds'])

        spectrum = sus.MsmsSpectrum(peptide_seq,
                                    precursor_mz=precursor_mz,
                                    precursor_charge=precursor_charge,
                                    mz=mz,
                                    intensity=intensity,
                                    retention_time=retention_time,
                                    peptide=peptide_seq)
        # Process the MS/MS spectrum (fragment tolerance: 10 ppm).
        fragment_tol_mass = 10
        fragment_tol_mode = 'ppm'
        spectrum = (spectrum.set_mz_range(
            min_mz=100, max_mz=1400).remove_precursor_peak(
                fragment_tol_mass, fragment_tol_mode).filter_intensity(
                    min_intensity=0.05, max_num_peaks=50).scale_intensity(
                        'root').annotate_peptide_fragments(fragment_tol_mass,
                                                           fragment_tol_mode,
                                                           ion_types='aby'))
    # Plot the MS/MS spectrum.
    fig, ax = plt.subplots(figsize=(12, 6))
    # The original referenced an undefined name ``tspec`` here, which raised
    # NameError; the consensus spectrum built above is the only other
    # spectrum available, so it is used as the mirrored counterpart.
    # NOTE(review): confirm that the consensus spectrum (rather than a
    # theoretical spectrum) is the intended bottom panel.
    sup.mirror(spectrum, cons_spec, ax=ax)
    plt.show()
    plt.close()
Exemple #4
0
 def setUp(self):
     """Read ``test.mgf`` and round-trip it through a temporary file.

     ``header``/``spectra`` hold what was read from ``self.path``;
     ``header2``/``spectra2`` hold what was read back after writing the same
     data to a temporary file.  ``ns`` is the number of spectra read.
     """
     self.path = 'test.mgf'
     self.header = mgf.read_header(self.path)
     self.spectra = list(mgf.read(self.path))
     self.ns = len(self.spectra)
     # The context manager closes the temporary file even when the
     # write/read round trip raises; as before, ``self.tmpfile`` is a closed
     # file once setUp returns.
     with tempfile.TemporaryFile(mode='r+') as self.tmpfile:
         mgf.write(header=self.header,
                   spectra=self.spectra,
                   output=self.tmpfile)
         self.tmpfile.seek(0)
         self.header2 = mgf.read_header(self.tmpfile)
         self.tmpfile.seek(0)
         self.spectra2 = list(mgf.read(self.tmpfile))
def get_cluster_spectra(mgf_filename: str) -> Dict[str, sus.MsmsSpectrum]:
    """
    Read all spectra from the given MGF file corresponding to a single cluster.

    Each spectrum title is expected to have the form ``<cluster>;<usi>``.

    Parameters
    ----------
    mgf_filename : str
        The file name of the MGF file to be read.

    Returns
    -------
    Dict[str, sus.MsmsSpectrum]
        A dictionary with as keys the USIs taken from the spectrum titles and
        as values the corresponding spectra. Each spectrum additionally
        carries its cluster label in a ``cluster`` attribute.

    Raises
    ------
    ValueError
        If the same USI occurs more than once, or (from tuple unpacking) if a
        title does not split into exactly two ``;``-separated parts.
    """
    spectra = {}
    # Use the reader as a context manager so the file is closed when done,
    # including when an error is raised part-way through.
    with mgf.read(mgf_filename) as reader:
        for spectrum_dict in reader:
            # TODO: Make sure the USI doesn't contain a peptide identification.
            cluster, usi = spectrum_dict['params']['title'].split(';')
            spectrum = sus.MsmsSpectrum(
                usi,
                spectrum_dict['params']['pepmass'][0],
                spectrum_dict['params']['charge'][0],
                spectrum_dict['m/z array'],
                spectrum_dict['intensity array'],
                retention_time=spectrum_dict['params']['rtinseconds'])
            spectrum.cluster = cluster
            if usi in spectra:
                raise ValueError(f'Non-unique USI: {usi}')
            spectra[usi] = spectrum
    return spectra
Exemple #6
0
    def patchMgf(self, input_path, output_path):
        """Write a copy of an MGF file with out-of-window peaks removed.

        For every spectrum, peaks with m/z below 175 and peaks with m/z above
        ``pepmass * charge - (charge - 1) * 1.00782503 - maxWindowDiff`` are
        dropped.  The spectrum parameters are passed through unchanged and
        the filtered spectra are written to ``output_path``.
        """
        maxWindowDiff = 156.10112 + 2 * 1.00782503 + 15.9949146
        patched = []
        with mgf.read(input_path) as reader:
            for entry in reader:
                intensities = entry['intensity array']
                mz_values = entry['m/z array']
                params = entry['params']
                charge = entry['params']['charge'][0]

                # Positions of the peaks to discard.  The upper bound is only
                # evaluated for peaks that pass the lower-bound check.
                drop = [
                    idx for idx, mz in enumerate(mz_values)
                    if mz < 175  # smallest y ion - arginine
                    or mz > entry['params']['pepmass'][0] * charge - (
                        charge - 1) * 1.00782503 - maxWindowDiff
                ]

                intensities = np.delete(intensities, drop, 0)
                mz_values = np.delete(mz_values, drop, 0)

                patched.append({
                    'm/z array': mz_values,
                    'intensity array': intensities,
                    'params': params
                })

        mgf.write(spectra=patched, output=output_path)
def process(mgf_path, output_file):
    """Collect ``title -> sequence`` identifications and write them out.

    Every ``*.mgf`` file (case-insensitive) directly inside ``mgf_path`` is
    read; spectra whose ``seq`` parameter is missing or empty are skipped.
    When the directory holds no MGF file at all, the identifications are
    instead reloaded from an existing ``output_file`` (tab-separated, first
    line is a header) provided it is larger than 1000 bytes.  The result is
    passed to ``write_to_file``.

    Args:
        mgf_path: Directory that is scanned for MGF files.
        output_file: Path of the identification table to write (and, in the
            fallback case, to read first).

    Raises:
        Exception: If there is neither an MGF file nor a usable old
            identification file.
    """
    identifications = dict()
    imported_n = 0
    for file in os.listdir(mgf_path):
        if not file.lower().endswith('.mgf'):
            continue
        imported_n += 1
        # Close each reader before moving on so that a directory with many
        # MGF files does not accumulate open file handles.
        with mgf.read(os.path.join(mgf_path, file)) as spectra:
            for spectrum in spectra:
                params = spectrum.get('params')
                title = params.get('title')
                seq = params.get('seq')
                if seq is None or seq == "":
                    continue
                identifications[title] = seq
    if imported_n < 1:  # read from old identification files
        if os.path.isfile(output_file) and os.path.getsize(output_file) > 1000:
            with open(output_file, 'r') as o:
                lines = o.readlines()[1:]  # remove the table head line
            for line in lines:
                sections = line.rstrip().split("\t")
                identifications[sections[0]] = sections[1]
        else:
            raise Exception("No mgf file found here! %s" % (mgf_path))
    write_to_file(identifications, output_file)
    """
Exemple #8
0
def plain_parse(mgf_read_path, mgf_txt_write_path):
    """Dump per-spectrum metadata of an MGF file to a tab-separated file.

    One row is written per spectrum with the columns ``filename``, ``scan``,
    ``charge``, ``rt`` and ``ms1 intensity``.  The charge is the string form
    of the ``charge`` parameter with everything except digits and dots
    removed; the MS1 intensity is the second ``pepmass`` entry.

    Args:
        mgf_read_path: Path of the MGF file to read.
        mgf_txt_write_path: Path of the table to create.

    Returns:
        The message ``"mgf_txt_write_path is already a file"`` when the
        output already exists (nothing is read or written in that case),
        otherwise ``None``.
    """
    if os.path.isfile(mgf_txt_write_path):
        return "mgf_txt_write_path is already a file"

    # Loop-invariant: used for both log lines and for every output row.
    source_name = os.path.basename(mgf_read_path)

    utility.print_timestamp("Plain Parse MGF - Start - " + source_name)

    with open(mgf_txt_write_path, 'w') as mgf_csv:
        with mgf.read(mgf_read_path) as mgf_reader:
            csv_writer = csv.writer(mgf_csv, delimiter='\t')
            csv_writer.writerow(
                ['filename', 'scan', 'charge', 'rt', 'ms1 intensity'])
            for spectrum in mgf_reader:
                params = spectrum['params']
                scans = params['scans']
                charge = re.sub(r'[^\d.]+', '', str(params['charge']))
                rt = params['rtinseconds']
                ms1_intensity = params['pepmass'][1]
                csv_writer.writerow(
                    [source_name, scans, charge, rt, ms1_intensity])

    utility.print_timestamp("Plain Parse MGF - Complete - " + source_name)
Exemple #9
0
def extractAndAppend(filePath,n,instrum):
    """Append a random 20% sample of an MGF file's spectra to ``D<n>.mgf``.

    All spectra of ``filePath`` are loaded, ``int(count * 0.2)`` of them are
    drawn without replacement and written, in file order, as
    ``BEGIN IONS``/``END IONS`` blocks with ``TITLE``, ``PEPMASS`` (first
    pepmass entry), ``CHARGE`` and the peak list.

    Args:
        filePath: MGF file to sample from.
        n: Suffix of the output file name ``D<n>.mgf`` (opened in append
            mode in the current working directory).
        instrum: Instrument label appended to every title as
            ``-MGF-instrumentation=<instrum>``.
    """
    records = []
    print("\n█████████████████████████████开始读入"+filePath+"的信息█████████████████████████████\n")
    # The context manager closes the input once everything has been loaded.
    with mgf.read(filePath) as spectra:
        for spectrum in spectra:
            params = spectrum.get('params')
            records.append((
                params.get('title'),
                params.get('pepmass'),
                params.get('charge'),
                spectrum.get('m/z array'),
                spectrum.get("intensity array"),
            ))
    print("\n█████████████████████████████读入"+filePath+"信息完毕█████████████████████████████\n")

    sample_size = int(len(records) * 0.2)
    chosen = sorted(random.sample(range(len(records)), sample_size))
    if not chosen:
        # Nothing was drawn: leave the output file untouched (and uncreated).
        return

    # Open the output once instead of once per sampled spectrum.
    with open("D"+str(n)+".mgf","a") as f1:
        for position in chosen:
            title, pepmass, charge, mz_values, intensities = records[position]
            f1.write("BEGIN IONS\n")
            f1.write("TITLE="+title+"-MGF-instrumentation="+instrum+"\n")
            f1.write("PEPMASS="+str(pepmass[0])+"\n")
            f1.write("CHARGE="+str(charge)+"\n")
            for mz, peak_intensity in zip(mz_values, intensities):
                f1.write(str(mz)+" "+str(peak_intensity)+"\n")
            f1.write("END IONS\n")
Exemple #10
0
def convert_mq_mracluster_mgf(mq_msms, mrcluster_clusters, mgf_file, output,
                              px_accession, raw_name):
    """Retitle clustered spectra with USI accessions and write them out.

    For every scan listed in the cluster mapping, each spectrum whose title
    ends with ``scan=<scan>`` gets a new title built by
    ``buid_usi_accession`` and is written to ``output``.

    Args:
        mq_msms: Input handed to ``read_peptides``; the result is used as a
            scan -> peptide sequence mapping.
        mrcluster_clusters: Input handed to ``read_clusters``; the result is
            used as a scan -> cluster accession mapping.
        mgf_file: MGF file with the input spectra.
        output: Destination passed to ``mgf.write``.
        px_accession: Forwarded to ``buid_usi_accession``.
        raw_name: Forwarded to ``buid_usi_accession``.
    """
    if any(arg is None for arg in (mq_msms, mrcluster_clusters, mgf_file)):
        print_help()

    # All spectra are held in memory because they are scanned once per scan.
    spectra_list = list(mgf.read(mgf_file))
    print('Number of Spectra: ' + str(len(spectra_list)))

    # Scan number -> peptide sequence.  One scan may in principle map to more
    # than one peptide sequence.
    peptides = read_peptides(mq_msms)
    print('Number of Peptides: ' + str(len(peptides)))

    # Scan number -> cluster the scan belongs to.
    clusters = read_clusters(mrcluster_clusters)
    print("Number of Clusters: " + str(len(clusters)))

    for scan in clusters:
        print('scan: ' + str(scan))
        title_suffix = 'scan=' + str(scan)
        for spectrum in spectra_list:
            params = spectrum['params']
            if not params['title'].endswith(title_suffix):
                continue
            cluster_accession = clusters[scan]
            # Scans without an identified peptide get no sequence.
            peptide_sequence = peptides[scan] if scan in peptides else None
            charge = int(params['charge'][0])
            params['title'] = buid_usi_accession(
                cluster_accession, peptide_sequence, scan, px_accession,
                raw_name, charge)
            mgf.write([spectrum], output)
Exemple #11
0
def read_mgf(path):
    """
    Yield one ``PeptideMeasurement`` per spectrum of the MGF file at ``path``.

    Each measurement is built from, in order: scan ID (parsed from the
    ``scan=<n>`` part of the title), retention time, charge, precursor m/z,
    precursor intensity, precursor mass estimate, fragment m/z array and
    fragment intensity array.

    Raises ``AssertionError`` for a spectrum with more than one charge state.
    """
    with mgf.read(path) as reader:
        for entry in reader:
            params = entry["params"]
            scan = int(re.match(".* scan=([0-9]+)", params["title"])[1])
            retention = params["rtinseconds"]
            charge_states = params["charge"]
            if len(charge_states) > 1:
                raise AssertionError("ChargeList length>1 unsupported")
            charge = int(charge_states[0])

            precursor_mz, precursor_intensity = (params["pepmass"][0],
                                                 params["pepmass"][1])
            # m/z times charge, minus one PROTON_MASS per charge.
            mass_estimate = precursor_mz * charge - charge * PROTON_MASS

            yield PeptideMeasurement(
                scan,
                retention,
                charge,
                precursor_mz,
                precursor_intensity,
                mass_estimate,
                entry["m/z array"],
                entry["intensity array"],
            )
Exemple #12
0
def create_ppk_matrix_stripe_serial(filter_func, shift, normalise, output_name):
    """Fill two kernel matrices, one row ("stripe") per candidate spectrum.

    Every spectrum of the hard-coded candidate MGF file is passed to
    ``do_stripe`` together with the names of the IOKR spectra; each returned
    ``(peaks, nloss)`` pair is stored in column order.  The matrices are
    saved as ``<output_name>_test_peaks.npy`` and
    ``<output_name>_test_nloss.npy``.

    Args:
        filter_func: Forwarded to ``do_stripe``.
        shift: Forwarded to ``do_stripe``.
        normalise: Forwarded to ``do_stripe``.
        output_name: Prefix of the two output file names.
    """
    iokr_data_path = '/home/grimur/iokr/data'
    # NOTE(review): the loaded .mat data is never used below.
    data_gnps = scipy.io.loadmat("/home/grimur/iokr/data/data_GNPS.mat")
    candidate_set = '/home/grimur/iokr/data/mibig/matched_mibig_gnps_2.0.mgf'
    # Number of rows allocated; presumably the number of spectra in the
    # candidate file -- TODO confirm.
    candidate_set_size = 257

    iokrdata = data.IOKRDataServer(iokr_data_path)
    ker_size = len(iokrdata.spectra)

    kernel_matrix_peaks = numpy.zeros((candidate_set_size, ker_size))
    kernel_matrix_nloss = numpy.zeros_like(kernel_matrix_peaks)

    t0 = time.time()
    names = [item[0] for item in iokrdata.spectra]
    for row, candidate in enumerate(mgf.read(candidate_set)):
        stripe = do_stripe(MSSpectrum(candidate), names, filter_func, shift,
                           normalise)

        for col, (peaks_value, nloss_value) in enumerate(stripe):
            kernel_matrix_peaks[row, col] = peaks_value
            kernel_matrix_nloss[row, col] = nloss_value

        print('done %s / %s, %s' % (row + 1, candidate_set_size,
                                    time.time() - t0))

    numpy.save(output_name + '_test_peaks.npy', kernel_matrix_peaks)
    numpy.save(output_name + '_test_nloss.npy', kernel_matrix_nloss)
def annotate_mgf(mgf_input: str, mascot_input: str, mgf_output: str):
    """Annotate MGF file using Mascot XML results.

    Each spectrum is matched by title against the table returned by
    ``extract_mascot_sequences``; for every matching sequence the spectrum is
    written to mgf_output with its ``seq`` parameter set to that sequence.
    Spectra without a match are not written.

    Args:
        mgf_input (str): path to the MGF input file.
        mascot_input (str): path to the Mascot XML results.
        mgf_output (str): path to the MGF output file.

    """
    # Table with (at least) ``title`` and ``sequence`` columns.
    mascot_seq = extract_mascot_sequences(mascot_input)

    with mgf.read(mgf_input, read_charges=False) as reader:
        for spectrum in reader:
            params = spectrum['params']
            same_title = mascot_seq.title == params['title']
            matched = mascot_seq.loc[same_title, 'sequence'].values
            # A spectrum associated with several sequences is written once
            # per sequence.
            for seq in matched:
                params['seq'] = seq
                mgf.write((spectrum,), mgf_output)
def format_mgf_deepnovo(mgf_input: str, mgf_output: str):
    """Format MGF file for use with DeepNovoV2.

    Only the parameters ``title``, ``pepmass``, ``charge``, ``scans`` and
    ``rtinseconds`` are kept and they are written in that order. Spectra
    without any peak are dropped.

    Args:
        mgf_input (str): path to the input MGF file.
        mgf_output (str): path to the output MGF file.

    """
    key_order = ['title', 'pepmass', 'charge', 'scans', 'rtinseconds']
    with mgf.read(mgf_input, read_charges=False) as reader:
        for spectrum in reader:
            # Skip empty spectra.
            if not spectrum['m/z array'].size:
                continue
            params = spectrum['params']
            # Collect the names first: the dict must not change size while
            # it is being iterated.
            unwanted = [name for name in params.keys()
                        if name not in key_order]
            for name in unwanted:
                params.pop(name)
            # Append the spectrum to the output in the required key order.
            mgf.write((spectrum,), mgf_output, key_order=key_order)
Exemple #15
0
def read_mgf(filename):
    """
    Read all spectra from the given mgf file.

    This is a generator: nothing (including the extension check) runs until
    the first item is requested.

    Args:
        filename: The mgf filename from which to read the spectra.

    Yields:
        One `Spectrum` per spectrum in the file, carrying the title as
        identifier, the precursor m/z, the precursor charge (`None` when the
        file gives no charge), the retention time and the peaks.
    """
    # test if the given file is an mgf file
    verify_extension(['.mgf'], filename)

    # The context manager closes the file once the generator is exhausted or
    # closed, instead of leaving that to garbage collection.
    with mgf.read(filename) as reader:
        for mgf_spectrum in reader:
            params = mgf_spectrum['params']
            identifier = params['title']
            precursor_mz = float(params['pepmass'][0])
            retention_time = float(params['rtinseconds'])
            if 'charge' in params:
                precursor_charge = int(params['charge'][0])
            else:
                precursor_charge = None

            read_spectrum = spectrum.Spectrum(identifier, precursor_mz,
                                              precursor_charge,
                                              retention_time)
            read_spectrum.set_peaks(mgf_spectrum['m/z array'],
                                    mgf_spectrum['intensity array'])

            yield read_spectrum
Exemple #16
0
def binSpectra(fi, binfunc, binparams):
    """Read an MGF file and pool the peaks of its spectra into bins.

    Args:
        fi: MGF file to read.
        binfunc: Callable ``binfunc(spectrum, binparams)`` returning the
            (hashable) bin a spectrum belongs to.
        binparams: Passed through to ``binfunc``.

    Returns:
        A dict mapping each bin to ``[intensities, n_spectra, norm_masses]``
        where ``norm_masses`` are the peak m/z values minus the spectrum's
        precursor m/z and both lists are ordered by normalised mass, then
        intensity.
    """
    bins = {}
    with mgf.read(fi) as reader:
        for spectrum in reader:
            pmass = spectrum['params']['pepmass'][0]
            # here the bin-function is called
            bin_ = binfunc(spectrum, binparams)
            if bin_ not in bins:
                # raw_intensities, number of spectra, norm_masses
                bins[bin_] = [[], 0, []]
            entry = bins[bin_]
            entry[0].extend(spectrum['intensity array'])
            # to save space only the normalised masses are stored
            entry[1] += 1
            entry[2].extend(spectrum['m/z array'] - pmass)

    # sort the peaks in each bin by mass, then intensity
    for bin_ in bins:
        intensities, nSpectra, masses = bins[bin_]
        pairs = sorted(zip(masses, intensities))
        # Built explicitly rather than with reversed(map(...)): that raised
        # TypeError on Python 3 (map objects are not reversible) and
        # collapsed a bin without any peak to ``[nSpectra]``.
        bins[bin_] = [
            [intensity for _, intensity in pairs],
            nSpectra,
            [mass for mass, _ in pairs],
        ]
    return bins
Exemple #17
0
def getMassHistogram(fi, binsize=50):
    """Save two histograms of the precursor-normalised peak masses of ``fi``.

    The masses are the peak m/z values minus the precursor m/z of their
    spectrum.  Both figures overlay a dashed normal-density curve with the
    sample mean and standard deviation, and are saved next to the current
    working directory as ``<basename>.massHist1.{png,svg}`` (filled step
    histogram, recoloured) and ``<basename>.massHist2.{png,svg}`` (bars).

    NOTE(review): ``normed=`` and ``matplotlib.mlab.normpdf`` are used as in
    the original; confirm they exist in the matplotlib version in use.
    """
    masses = []
    with mgf.read(fi) as reader:
        for spectrum in reader:
            precursor = spectrum['params']['pepmass'][0]
            masses.extend(spectrum['m/z array'] - precursor)

    stem = os.path.basename(fi)

    def render(histtype, recolour, png_suffix, svg_suffix):
        # Draw one histogram plus the expected distribution and save it.
        fig = plt.figure()
        axes = fig.add_subplot(111)
        _, edges, patches = axes.hist(masses, binsize, normed=1,
                                      histtype=histtype)
        if recolour:
            plt.setp(patches, 'facecolor', 'b', 'alpha', 0.75)
        expected = matplotlib.mlab.normpdf(edges, np.mean(masses),
                                           np.std(masses))
        axes.plot(edges, expected, 'r--', linewidth=1.5)
        fig.tight_layout()
        fig.savefig(stem + png_suffix, dpi=300)
        fig.savefig(stem + svg_suffix, dpi=300)
        plt.close(fig)

    render('stepfilled', True, '.massHist1.png', '.massHist1.svg')
    render('bar', False, '.massHist2.png', '.massHist2.svg')
Exemple #18
0
def convert(args, out=sys.stdout):
    """Remaps the nodes from a network analysis (sorted by pepmass) to the same order as the spectral library.

    ``args.lib_mgf`` is the spectral library and ``args.edges_tsv`` the
    tab-separated edge list.  Lines starting with ``#`` are copied verbatim.
    In every other line the first two columns are nodes of the form
    ``<mz>-<rank>``; each is rewritten to ``<mz>l<library index, 10 digits>``,
    the third column is replaced by ``ll`` and the remaining columns are
    kept.  Output goes to ``out``.
    """
    with mgf.read(args.lib_mgf) as reader:
        # Position in this list = rank by precursor m/z; the stored index is
        # the spectrum's original position in the library.
        by_pepmass = sorted(
            (sp['params']['pepmass'][0], oidx)
            for oidx, sp in enumerate(reader))

    def resolve(node):
        # Split a node token, look up its library entry and validate the m/z.
        mz_text, rank = node.split('-')
        lib_mz, lib_idx = by_pepmass[int(rank)]
        check_mz(mz_text, lib_mz, node, lib_idx)
        return mz_text, lib_idx

    with open(args.edges_tsv) as edges:
        for line in edges:
            if line.startswith('#'):
                out.write(line)
                continue
            tokens = line.split('\t')
            s_mz, s_oidx = resolve(tokens[0])
            t_mz, t_oidx = resolve(tokens[1])

            columns = [
                "{0}l{1:010d}".format(s_mz, s_oidx),
                "{0}l{1:010d}".format(t_mz, t_oidx),
                'll',
            ]
            out.write('\t'.join(columns + tokens[3:]))
def write_to_csv(projectid, mgf_file, data_type):
    """Export the spectra of an MGF file to CSV.

    ``<mgf_file minus its last 4 characters>_spec.csv`` always receives one
    row per spectrum.  When ``data_type`` is ``"peak_psm"`` a second file,
    ``..._psm.csv``, receives the matching PSM rows.  The row contents come
    from ``get_row``.

    Args:
        projectid: Forwarded to ``get_row``.
        mgf_file: Path of the MGF file; also determines the output names.
        data_type: ``"peak_psm"`` enables the PSM output; also forwarded to
            ``get_row``.
    """
    filename = os.path.basename(mgf_file)
    with_psm = data_type == "peak_psm"

    spec_file_name = mgf_file[:-4] + "_spec.csv"
    spec_file = open(spec_file_name, "w")
    psm_file = None
    # The original never closed either output file, so buffered rows could
    # be lost; try/finally closes both even if a row fails.
    try:
        spec_writer = csv.writer(spec_file, lineterminator='\n')
        spec_writer.writerow([
            'spectrumTitle', 'precursorMz', 'precursorIntens', 'charge',
            'peaklistMz', 'peaklistIntens'
        ])

        if with_psm:
            psm_file_name = mgf_file[:-4] + "_psm.csv"
            psm_file = open(psm_file_name, "w")
            psm_writer = csv.writer(psm_file)
            psm_writer.writerow(['spectrumTitle', 'sequence', 'modifications'])

        spectra_list = mgf.read(mgf_file)

        print("Handling the data in %s" % (mgf_file))
        # Spectrum indices handed to get_row start at 1.
        for index, spectrum in enumerate(spectra_list, start=1):
            (spec_row, psm_row) = get_row(projectid, filename, index, spectrum,
                                          data_type)
            spec_writer.writerow(spec_row)
            if with_psm:
                psm_writer.writerow(psm_row)
    finally:
        if psm_file is not None:
            psm_file.close()
        spec_file.close()
    print("The data had been wrote in the csv file.")
Exemple #20
0
def reader(path, flag):
    """Return the processed spectrum titles of an MGF file with class labels.

    Args:
        path: MGF file to read.
        flag: ``1`` labels every entry ``1``; any other value labels every
            entry ``2``.

    Returns:
        A tuple ``(handle(titles), labels)`` where ``labels`` has one entry
        per element of ``handle(titles)``.
    """
    titles = []
    print("\n█████████████████████████████开始读入" + path +
          "的信息█████████████████████████████\n")
    # Only the titles are needed; peak arrays, charges and precursor masses
    # are no longer accumulated in memory.
    with mgf.read(path) as spectra:
        for spectrum in spectra:
            titles.append(spectrum.get('params').get('title'))
    print("\n█████████████████████████████读入" + path +
          "信息完毕█████████████████████████████\n")
    # handle() used to be called three times on the same list; call it once.
    handled = handle(titles)
    label = 1 if flag == 1 else 2
    return handled, [label] * len(handled)
def mgf_library_upload(fileName):
    """Load an MGF spectral library into a dictionary.

    Returns a dict keyed by ``(precursor m/z, sequence)``.  Each value holds
    the precursor charge, title, protein name (``''`` when absent), a decoy
    flag (``1`` when the title contains ``DECOY``), a 1-based running ``ID``
    and ``Peaks``: the (at most) ten most intense peaks as
    ``(m/z, sqrt(intensity), ID)`` tuples ordered by m/z.  A later spectrum
    with the same key replaces an earlier one.
    """
    libMGF = mgf.read(fileName)
    smf.print_milestone('Enter library dictionary upload: ')
    lib = {}
    for spec_id, spec in enumerate(libMGF, start=1):
        params = spec['params']
        key = (params['pepmass'][0], params['seq'])
        # Strip a sign character from the charge before converting it.
        charge = int(re.sub('[+-]', '', str(params['charge'][0])))
        name = params['title']
        protein = params['protein'] if 'protein' in params else ''
        decoy = 1 if 'DECOY' in name else 0

        # Square-root the intensities and tag every peak with the library ID.
        peaks = [(mz, raw**0.5, spec_id)
                 for mz, raw in zip(spec['m/z array'],
                                    spec['intensity array'])]
        # Keep the ten most intense peaks, then order them by m/z.
        peaks = sorted(peaks, key=lambda peak: peak[1], reverse=True)[:10]
        peaks.sort(key=lambda peak: peak[0])

        lib[key] = {
            'PrecursorCharge': charge,
            'transition_group_id': name,
            'ProteinName': protein,
            'Peaks': peaks,
            'ID': spec_id,
            'Decoy': decoy,
        }
    return lib
Exemple #22
0
def merge_mgf_files(ms2_file, ms3_file, mz_cutoff):
    """Merge each MS2 spectrum with its first matching MS3 spectrum.

    Both files are loaded completely.  For every MS2 spectrum the MS3
    spectra are tried in order with ``compare_spectrums_with_fuzzy_rt``; on
    the first match the MS2 peak arrays are replaced by the result of
    ``merge_xy_arrays`` and the MS3 spectrum is removed from further
    consideration.  Every MS2 spectrum ends up in the output, merged or not.

    Args:
        ms2_file: MGF file with the MS2 spectra.
        ms3_file: MGF file with the MS3 spectra.
        mz_cutoff: Forwarded to ``merge_xy_arrays``.

    Returns:
        A dict with the keys ``merged_mgf`` (list of spectra), ``ms2_count``,
        ``ms3_count`` and ``merged_count``.
    """
    # Both files are preloaded: MS2 so the total is known for the progress
    # bar, MS3 so it is not re-read for every MS2 spectrum.
    print("Reading MS2 file: " + ms2_file)
    with mgf.read(ms2_file) as ms2_reader:
        ms2_spectrum_list = list(ms2_reader)
    ms2_count = len(ms2_spectrum_list)

    print("Reading MS3 file: " + ms3_file)
    with mgf.read(ms3_file) as ms3_reader:
        ms3_pool = list(ms3_reader)
    ms3_count = len(ms3_pool)

    merged_mgf = []
    merged_count = 0
    for done, ms2_spectrum in enumerate(ms2_spectrum_list, start=1):
        # Position of the first fuzzy match among the remaining MS3 spectra.
        hit = next(
            (pos for pos, candidate in enumerate(ms3_pool)
             if compare_spectrums_with_fuzzy_rt(ms2_spectrum, candidate)),
            None)
        if hit is not None:
            merged_xy = merge_xy_arrays(ms2_spectrum, ms3_pool[hit],
                                        mz_cutoff)
            ms2_spectrum['m/z array'] = merged_xy[0]
            ms2_spectrum['intensity array'] = merged_xy[1]
            merged_count += 1
            # Drop the consumed MS3 spectrum to avoid dupes and save time.
            del ms3_pool[hit]
        merged_mgf.append(ms2_spectrum)
        write_progress_bar(done, ms2_count)

    return {
        "merged_mgf": merged_mgf,
        "ms2_count": ms2_count,
        "ms3_count": ms3_count,
        "merged_count": merged_count
    }
Exemple #23
0
def readmgf(fn):
    """Parse the spectra of the MGF file ``fn`` with ``parse_spectra``.

    The file is read sequentially (``use_index=False``) with arrays converted
    to ``float32`` and without charge arrays.

    Returns:
        Whatever ``parse_spectra`` returns for the spectra.
    """
    # ``with`` closes the file even when parsing raises; the spectra must be
    # consumed inside the block because the reader pulls from the open file.
    with open(fn, "r") as file:
        data = mgf.read(file, convert_arrays=1, read_charges=False,
                        dtype='float32', use_index=False)
        codes = parse_spectra(data)
    return codes
class TestMgf:
    """Checks on the example MGF file loaded into a DataFrame."""

    # Parsed once, when the class body is executed, and shared by all tests.
    data = pd.DataFrame(mgf.read("../data/raw/example.mgf"))

    def test_one(self):
        """The first column holds five non-null entries."""
        # .iloc makes the positional lookup explicit; integer keys on a
        # label-indexed Series are deprecated as positions.
        assert self.data.count().iloc[0] == 5

    def test_two(self):
        """The first spectrum carries the expected title."""
        assert self.data.iloc[0]['params']['title'] == 'scan=986 profile data'
Exemple #25
0
def read_mgf(file):
    """Return the first spectrum of an MGF file.

    Args:
        file: MGF input handed to ``mgf.read``.

    Returns:
        A tuple ``(mz, params, amp)``: the m/z array, the parameter dict and
        the intensity array of the first spectrum.

    Raises:
        StopIteration: If the file contains no spectrum.
    """
    # Close the reader as soon as the first spectrum has been pulled.
    with mgf.read(file) as spectra:
        spectrum = next(spectra)
    mz = spectrum['m/z array']
    params = spectrum['params']
    amp = spectrum['intensity array']
    return (mz, params, amp)
Exemple #26
0
 def load_file(self, mgf_path, csv_path):
     """Load the pair table and index the spectra of an MGF file by title.

     The header-less CSV provides ``self.spectrum1`` (column 0),
     ``self.spectrum2`` (column 1) and ``self.label`` (column 2).  Every
     spectrum of the MGF file is stored in ``self.MGF`` under its title.
     """
     print('Start to load file data...')
     table = pd.read_csv(csv_path, header=None)
     self.spectrum1 = table[0].tolist()
     self.spectrum2 = table[1].tolist()
     self.label = table[2].tolist()
     for entry in read(mgf_path, convert_arrays=1):
         title = entry.get('params').get('title')
         self.MGF[title] = entry
     print('Finish to load data...')
Exemple #27
0
def extractSpectraFast(x):
    """Return all peaks of the MGF file ``x`` as one two-column array.

    The file is read twice: once to count the peaks so the result can be
    allocated up front, once to fill it.

    Returns:
        An array of shape ``(total number of peaks, 2)``; column 0 holds the
        m/z values and column 1 the intensities, in file order.
    """
    rows = 0
    with mgf.read(x) as spectra:
        for spectrum in spectra:
            rows += len(spectrum['m/z array'])
    # first argument is the row count, second the column count
    peaks = np.empty((rows, 2))
    with mgf.read(x) as spectra:
        start = 0
        for spectrum in spectra:
            mz = spectrum['m/z array']
            stop = start + len(mz)
            # Copy a whole spectrum per assignment instead of peak by peak.
            peaks[start:stop, 0] = mz
            peaks[start:stop, 1] = spectrum['intensity array']
            start = stop
    # The original returned ``np`` (the numpy module) instead of the array.
    return peaks
Exemple #28
0
def extractSpectraFastForR(x):
    """Dump all peaks of the MGF file ``x`` to ``/home/tobiass/df.csv``.

    One CSV row is written per peak: m/z, intensity and the 1-based number
    of the spectrum the peak belongs to.
    """
    with mgf.read(x) as spectra, open("/home/tobiass/df.csv", "wt") as csvfile:
        out = csv.writer(csvfile)
        for number, spectrum in enumerate(spectra, start=1):
            out.writerows(
                (mz, peak_intensity, number)
                for mz, peak_intensity in zip(spectrum['m/z array'],
                                              spectrum['intensity array']))
def mgf_library_upload_quant(fileName, scanDict, digDict, aaDict, maxPeaks):
    """Build light/heavy y-ion peak lists for the library spectra in scanDict.

    Args:
        fileName: MGF spectral library, read with the pyteomics mgf module.
        scanDict: Maps ``(round(precursor m/z, 2), sequence)`` to the key
            under which the peaks are collected; library spectra whose key
            is not in it are skipped.
        digDict: Maps each ``+<digits>.<digits>`` substring of a sequence to
            its replacement placeholder.
        aaDict: Residue masses, passed to ``mass.fast_mass`` as ``aa_mass``.
        maxPeaks: Maximum number of y-ion fragments kept per spectrum;
            ``0`` keeps all of them.

    Returns:
        A ``defaultdict(list)`` mapping ``scanDict[key]`` to a list of
        ``(m/z, intensity, (heavy_flag, rank, sequence))`` tuples, where
        ``heavy_flag`` is 0 for the light and 1 for the heavy fragment and
        ``rank`` is the fragment's intensity rank.
    """
    lib = defaultdict(list)

    # The reader is closed once every spectrum has been processed.
    with mgf.read(fileName) as libMGF:
        for spec in libMGF:
            seq = spec['params']['seq']
            precMz = spec['params']['pepmass'][0]

            key = (round(precMz, 2), seq)
            if key not in scanDict:
                continue

            # Decimal values are replaced with placeholders so the modified
            # residues take part in the analysis.
            sequence = re.sub(r'\+\d+\.\d+',
                              lambda m: digDict.get(m.group()), seq)

            mz = list(spec['m/z array'])
            intensity = list(spec['intensity array'])

            # The y-ion m/z of each suffix of the peptide is calculated; if
            # the library spectrum has it, it is stored with its intensity.
            # NOTE: y-ions are singled out because they should have at least
            # one lysine or arginine, so they have a heavy counterpart that
            # can show up. B-ions don't have that guarantee.
            fragList = []
            for x in range(1, len(sequence) - 1):
                fragseq = sequence[x:]
                lightfragmz = mass.fast_mass(sequence=fragseq,
                                             ion_type='y',
                                             charge=1,
                                             aa_mass=aaDict)
                i = smf.approx_list(lightfragmz, mz)
                if i == -1:
                    continue
                fragList.append((intensity[i], lightfragmz, fragseq))

            # Most intense fragments first; optionally keep only the top ones.
            fragList.sort(reverse=True)
            if maxPeaks != 0 and len(fragList) >= maxPeaks:
                fragList = fragList[:maxPeaks]

            # Each light fragment is paired with its heavy counterpart and
            # both are tagged with the fragment's intensity rank.
            peaks = []
            for rank, (fragInt, fragMz, fragseq) in enumerate(fragList):
                peaks.append((fragMz, fragInt, (0, rank, seq)))
                peaks.append((smf.calculate_heavy_mz(fragseq, fragMz, 1),
                              fragInt, (1, rank, seq)))

            peaks.sort(key=lambda peak: peak[0])

            lib[scanDict[key]] += peaks
    return lib
Exemple #30
0
def plotHeatmapOld(fi, limit=1000):
    """Save a heatmap of peak intensities for the first ``limit`` spectra of an MGF file.

    Every peak is placed in a cell keyed by
    ``(int(pepmass * 100000), m/z rounded to the nearest integer)``.
    When several peaks fall into the same cell, the one read last wins.
    Cells without a peak are 0.0.

    The figure is written to ``<basename(fi)>.colormap.png`` and
    ``<basename(fi)>.colormap.svg`` in the current working directory.

    :param fi: path of the MGF file to read.
    :param limit: maximum number of spectra to process.
    :raises ValueError: if no peak was read (empty file, ``limit <= 0`` or
        only peak-less spectra), because the grid bounds are then undefined.
    """
    grid_d = {}
    xmin = xmax = ymin = ymax = None
    with mgf.read(fi) as reader:
        for processedSpectra, spectrum in enumerate(reader):
            # Stop once exactly `limit` spectra have been handled.
            if processedSpectra >= limit:
                break
            x = int(spectrum['params']['pepmass'][0] * 100000)
            xmin, xmax = (min(x, xmin), max(x, xmax)) if xmin is not None else (x, x)
            for y, z in zip(spectrum['m/z array'], spectrum['intensity array']):
                y = int(y + 0.5)
                ymin, ymax = (min(y, ymin), max(y, ymax)) if ymin is not None else (y, y)
                grid_d[(x, y)] = z

    if xmin is None or ymin is None:
        raise ValueError("no peaks found in %r; nothing to plot" % (fi,))

    # NOTE(review): x is pepmass scaled by 1e5, so nrows grows with the
    # precursor-mass spread times 100000 and the dense grid can get very large.
    nrows, ncols = xmax - xmin + 1, ymax - ymin + 1
    grid = np.zeros((nrows, ncols))
    for (x, y), z in grid_d.items():
        grid[x - xmin, y - ymin] = z

    # NOTE(review): rows of `grid` follow x (precursor) while `extent` puts
    # the x range on the horizontal axis -- kept as in the original, confirm
    # whether a transpose was intended.
    plt.imshow(grid,
               extent=(xmin, xmax, ymin, ymax),
               interpolation='nearest',
               cmap=matplotlib.cm.seismic)

    plt.tight_layout()
    plt.savefig(os.path.basename(fi) + '.colormap.png', dpi=300)
    plt.savefig(os.path.basename(fi) + '.colormap.svg', dpi=300)
    plt.close()
    def load_recalibrate(self):
        """Copy every spectrum from ``self.path`` to ``self.file_out``, recalibrating masses when possible.

        For each spectrum the peaks are loaded into a ``MasterSpectrum`` and a
        probe peak at the TMT10 tag mass is looked up. When ``ms.binary``
        finds a match (index other than -1), the difference between the tag
        mass and the matched peak's m/z is used to shift every peak:

        * ``self.type == 'ppm'``      -- per-peak shift from ``calculate_da_shift``
        * ``self.type == 'absolute'`` -- the same difference added to every peak

        Without a match the masses are written unchanged.

        :raises ValueError: if recalibration applies and ``self.type`` is
            neither ``'ppm'`` nor ``'absolute'``.
        """
        fc = calculate_Delta_by_ppm(self.ppm)
        tmt_mass = calculate_tag_tmt10()
        with mgf.read(self.path) as spectra:
            for spectrum in spectra:
                ms = MasterSpectrum()
                params = spectrum['params']
                peak_pairs = zip(spectrum['m/z array'],
                                 spectrum['intensity array'])
                for mz_value, abundance in peak_pairs:
                    ms.add(Peak(mz_value, abundance, fc))

                bins = ms.spectrum[0]
                probe = Peak(tmt_mass, 0.5, fc)
                probe_key = probe.key()

                # Recalibration is only possible when the probe's bin exists
                # and the search inside that bin reports a hit.
                recalibrate = False
                if probe_key in bins:
                    idx, _bin_to_ack, _a, _b = ms.binary(
                        probe, 0, len(bins[probe_key]) - 1, 0)
                    if idx != -1:
                        recalibrate = True
                        recalibration_mass = bins[probe_key][idx].mz
                        diff = tmt_mass - recalibration_mass
                        print(params['title'])
                        print("original={0}\tdiff={1}".format(
                            recalibration_mass, diff))

                mass_list = []
                int_list = []
                if recalibrate:
                    ppm_shift = calculate_ppm_shift(diff, tmt_mass)

                for key in bins.keys():
                    for mp in bins[key]:
                        if not recalibrate:
                            mass_list.append(mp.mz)
                        elif self.type == 'ppm':
                            da_shift = calculate_da_shift(mp.mz, ppm_shift)
                            mass_list.append(mp.mz + da_shift)
                        elif self.type == 'absolute':
                            mass_list.append(mp.mz + diff)
                        else:
                            print(self.type)
                            raise ValueError("what did you dooooo")
                        int_list.append(mp.intensity)

                print("len is:\t{0}".format(len(mass_list)))
                recalibrated = {
                    'm/z array': mass_list,
                    'intensity array': int_list,
                    'params': params
                }
                mgf.write(spectra=[recalibrated], output=self.file_out)
Exemple #32
0
def calculate_dsmapper_time(spectra_mgf_file1, spectra_mgf_file2):
    """Print how long each stage of scoring two paired MGF files takes.

    Stages timed with ``time.perf_counter``: loading the model, preparing
    the reference spectra and the pair count, embedding both files, and
    computing the Euclidean distance between the i-th embeddings of the two
    files. The scores themselves are discarded; the function returns None.

    :param spectra_mgf_file1: path of the first MGF file.
    :param spectra_mgf_file2: path of the second MGF file, paired by position
        with the first.
    """
    model_path = "./data/080802_20_1000_NM500R_model.pkl"

    t_begin = time.perf_counter()
    net = torch.load(model_path)
    t_model_loaded = time.perf_counter()
    print("加载模型用时:{}".format(t_model_loaded - t_begin))

    # The five hundred reference spectra.
    reference_spectra = read("../SpectraPairsData/0722_500_rf_spectra.mgf",
                             convert_arrays=1)
    binned_references = []
    for ref in reference_spectra:
        binned_references.append(
            bin_spectrum(ref.get('m/z array'), ref.get('intensity array')))
    reference_intensity = np.array(binned_references)

    # The number of pairs is taken from the first file only.
    first_file_reader = read(spectra_mgf_file1, convert_arrays=1)
    spectra_pairs_num = more_itertools.ilen(first_file_reader)
    t_prepared = time.perf_counter()
    print("准备相关数据用时:{}".format(t_prepared - t_model_loaded))

    embedded_01 = embedding_dataset(net, spectra_mgf_file1,
                                    reference_intensity, spectra_pairs_num)
    embedded_02 = embedding_dataset(net, spectra_mgf_file2,
                                    reference_intensity, spectra_pairs_num)

    time01 = time.perf_counter()
    print("数据编码加嵌入的总用时:{}".format(time01 - t_prepared))

    score_list = [
        np.linalg.norm(embedded_01[idx] - embedded_02[idx])
        for idx in range(embedded_01.shape[0])
    ]
    time02 = time.perf_counter()
    print("calc_EU use time: {}".format(time02 - time01))
def readSpectrum(mgffile, scanindex):
    """Return the peaks of the spectra in ``mgffile`` whose title number equals ``scanindex``.

    The number of a spectrum is obtained by deleting every non-digit
    character from its title and converting what remains to ``int`` (so all
    digit runs in the title are concatenated). The whole file is scanned;
    peaks of every matching spectrum are collected in file order.

    :param mgffile: path of the MGF file to read.
    :param scanindex: integer to compare against the title-derived number.
    :returns: a ``pandas.DataFrame`` with columns ``Mass`` and ``Intensity``
        (empty when nothing matches).
    :raises ValueError: if a title contains no digit at all.
    """
    msms1 = []
    with mgf.read(mgffile) as allspectra:
        for spectrum in allspectra:
            # Raw string: "\D" in a plain literal is an invalid escape sequence.
            n = int(re.sub(r"\D+", "", spectrum['params']['title']))
            if n == scanindex:
                msms1.extend(zip(spectrum['m/z array'],
                                 spectrum['intensity array']))
    return pd.DataFrame(msms1, columns=['Mass', 'Intensity'])
Exemple #34
0
def getBinMembers(fi, binfunc, binparams):
    """Group the spectra of an MGF file by the bin ``binfunc`` assigns to them.

    :param fi: path of the MGF file to read.
    :param binfunc: callable invoked as ``binfunc(spectrum, binparams)``; its
        return value is used as the dictionary key, so it must be hashable.
    :param binparams: passed through unchanged to ``binfunc``.
    :returns: dict mapping each bin to the list of ``params['scans']`` values
        of its spectra, in file order.
    """
    bins = {}
    with mgf.read(fi) as reader:
        for spectrum in reader:
            bin_ = binfunc(spectrum, binparams)
            bins.setdefault(bin_, []).append(spectrum['params']['scans'])
    return bins
Exemple #35
0
def plotDensities(fi, precMassBinSize=100):
    """Plot, per bin, the density of fragment m/z values relative to the precursor mass.

    Spectra are grouped by ``maxMS2IntensityBinFunc(spectrum,
    params={'winsize': precMassBinSize})``. Each bin gets its own subplot
    (bins in sorted order, filled row by row). Inside a bin the spectra are
    drawn from lowest to highest maximum intensity, coloured on a 101-step
    "Reds" palette by their maximum intensity relative to the bin's maximum.
    The figure is saved as ``test.png`` in the current working directory.

    :param fi: path of the MGF file to read.
    :param precMassBinSize: forwarded to the bin function as ``winsize``.
    """
    binf = maxMS2IntensityBinFunc

    # Bin every spectrum directly while reading. Looking the bin up through
    # an index taken from a separately sorted list would pair file-order
    # positions with sorted-order bins and mix up the groups.
    massBins = {}
    with mgf.read(fi) as reader:
        for spectrum in reader:
            mbin = binf(spectrum, params={'winsize': precMassBinSize})
            massBins.setdefault(mbin, []).append(spectrum)

    grid = n_subplots(len(massBins))
    # squeeze=False keeps `axes` two-dimensional even for a single row/column.
    fig, axes = plt.subplots(grid[0], grid[1], figsize=(16, 16), sharey=True,
                             squeeze=False)
    sns.despine(left=True)
    sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 1.0})
    pal = sns.color_palette("Reds", n_colors=101)

    row, col = 0, 0
    for mbin in sorted(massBins):
        members = massBins[mbin]
        print("%s %s %s %s" % (row, col, mbin, len(members)))

        peakMax = [max(spectrum['intensity array']) for spectrum in members]
        maxIntensity = max(peakMax)

        # Weakest spectra first so the most intense ones are drawn on top.
        for i in sorted(range(len(members)), key=lambda k: peakMax[k]):
            spectrum = members[i]
            precMass = spectrum['params']['pepmass'][0]
            shade = pal[int(peakMax[i] / maxIntensity * 100 + 0.5)]
            sns.distplot(spectrum['m/z array'] - precMass, hist=False,
                         color=shade, ax=axes[row, col])
            axes[row, col].set_ylim(0, 0.004)

        # Advance to the next subplot, wrapping at the end of a row.
        if col == grid[1] - 1:
            row += 1
            col = 0
        else:
            col += 1

    plt.tight_layout()
    plt.savefig("test.png", dpi=300)
Exemple #36
0
def plain_parse(mgf_read_path, mgf_txt_write_path):
	"""Write one tab-separated row per spectrum of an MGF file.

	Columns: filename (basename of ``mgf_read_path``), scan, charge (the
	string form of the charge with everything except digits and dots
	removed), rt (``rtinseconds``) and ms1 intensity (second element of
	``pepmass``).

	:param mgf_read_path: path of the MGF file to read.
	:param mgf_txt_write_path: path of the file to create.
	:returns: the string ``"mgf_txt_write_path is already a file"`` when the
		output already exists (nothing is written), otherwise ``None``.
	"""
	import sys

	if os.path.isfile(mgf_txt_write_path):
		return "mgf_txt_write_path is already a file"

	source_name = os.path.basename(mgf_read_path)
	# Open the input first so a failure there does not leave an empty output file.
	with mgf.read(mgf_read_path) as mgf_reader:
		# csv.writer needs a binary file on Python 2 but a text file opened
		# with newline='' on Python 3.
		if sys.version_info[0] >= 3:
			mgf_csv = open(mgf_txt_write_path, 'w', newline='')
		else:
			mgf_csv = open(mgf_txt_write_path, 'wb')
		with mgf_csv:
			csv_writer = csv.writer(mgf_csv, delimiter='\t')
			csv_writer.writerow(['filename', 'scan', 'charge', 'rt', 'ms1 intensity'])
			for spectrum in mgf_reader:
				params = spectrum['params']
				charge = re.sub(r'[^\d.]+', '', str(params['charge']))
				csv_writer.writerow([source_name, params['scans'], charge,
					params['rtinseconds'], params['pepmass'][1]])
Exemple #37
0
    device.ylabel("Intensity")
    device.bar(spectrum["m/z array"], spectrum["intensity array"],
              width=0.5, linewidth=1, edgecolor=color)
    return

def plotSingleSpectrum(filename, spectrum):
    """Draw ``spectrum`` on a new pylab figure and save it to ``filename``.

    The drawing itself is delegated to ``createSpectrumFigure`` with the
    ``pylab`` module as the plotting device.
    """
    pylab.figure()
    createSpectrumFigure(spectrum, device=pylab) if False else createSpectrumFigure(spectrum, pylab)
    pylab.savefig(filename)

## main
## Usage: <script> URL -- downloads an MGF file, plots its first spectrum
## and prints the resulting SVG to stdout.
if len(sys.argv) == 2:
    import shutil

    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    ## download file
    filename, header = urlretrieve(sys.argv[1])

    ## we only support the first spectrum now
    ## (the reader is closed as soon as that spectrum has been parsed)
    with mgf.read(filename) as spectra:
        spectrum = next(spectra)

    tmpdir = tempfile.mkdtemp()
    try:
        filename = os.path.join(tmpdir, "output.svg")

        plotSingleSpectrum(filename, spectrum)
        ## write to stdout
        with open(filename, "r") as fin:
            print(fin.read())
    finally:
        ## mkdtemp never cleans up after itself
        shutil.rmtree(tmpdir, ignore_errors=True)
else:
    print("Usage:\n\t" + sys.argv[0] + " URL")

Exemple #38
0
@author: ilya
"""

from pyteomics import fasta, mgf, parser
import pylab

fasta_file = '/home/ilya/src/pyteomics/RhoEcoli.fasta'
mgf_file = '/home/ilya/src/pyteomics/MultiConsensus.mgf'

# raw_input() was renamed to input() in Python 3; pick whichever exists.
try:
    ask = raw_input
except NameError:
    ask = input

# Collect the unique tryptic peptides of every protein in the FASTA file.
peptides = set()
with open(fasta_file) as fi:
    for description, sequence in fasta.read(fi):
        new_peptides = parser.cleave(sequence, parser.expasy_rules['trypsin'])
        peptides.update(new_peptides)

# Single-argument print(...) produces the same output on Python 2 and 3.
print("UNIQUE PEPTIDES")
print(peptides)

# Show the spectra one at a time until the user answers anything but "yes".
with open(mgf_file) as fi:
    for spectrum in mgf.read(fi):
        pylab.figure()
        pylab.xlabel('m/z, Th')
        pylab.ylabel('Intensity, rel.units')
        pylab.bar(spectrum['m/z array'], spectrum['intensity array'], width=0.1, linewidth=2, edgecolor='black')
        pylab.show()
        inp = ask("Show more?")
        if inp != "yes":
            break

print("DONE!")
Exemple #39
0
def plotHeatmap(fi, nSpectra=4000, fragBinsize=10, precursorBinsize=5):
    """Plot a binned precursor/fragment intensity heatmap for an MGF file.

    The file is read twice. The first pass selects at most ``nSpectra``
    spectra whose ``maxMS2IntensityBinFunc`` value equals 2, ordered by
    precursor mass. The second pass bins the peaks of the selected spectra:

    * precursor bin (rows / y-axis): rank of the spectrum in the selection,
      integer-divided by ``precursorBinsize``;
    * fragment bin (columns / x-axis): fragment m/z integer-divided by
      ``fragBinsize``.

    Two outputs are written to the current working directory:

    * ``hex.png`` -- a seaborn KDE joint plot of the most intense fragment
      of each spectrum, each point repeated ``round(intensity)`` times;
    * ``<basename(fi)>.colormap.png`` / ``.svg`` -- a ``pcolormesh`` of the
      mean intensity per (precursor bin, fragment bin) cell; empty cells
      are 0.0.

    The selected spectra and every populated cell are also printed.

    :param fi: path of the MGF file to read.
    :param nSpectra: maximum number of spectra to use.
    :param fragBinsize: width of a fragment m/z bin.
    :param precursorBinsize: number of consecutive selected spectra per row.
    """
    binf = maxMS2IntensityBinFunc

    # Pass 1: (precursor mass, title, bin) for every spectrum, sorted by bin
    # and then precursor mass; keep only bin == 2, truncated to nSpectra.
    # NOTE(review): the original comment says the bin is
    # log10(maxMS2Intensity) -- confirm against maxMS2IntensityBinFunc.
    with mgf.read(fi) as reader:
        spectra = sorted(
            ((spectrum['params']['pepmass'][0],
              spectrum['params']['title'],
              binf(spectrum)) for spectrum in reader),
            key=lambda x: (x[2], x[0]))
    spectra = [spectrum for spectrum in spectra if spectrum[2] == 2][:nSpectra]

    for spectrum in spectra:
        print(spectrum)

    # title => rank in the selection (later duplicates of a title win).
    usedSpectra = {spectrum[1]: i for i, spectrum in enumerate(spectra)}

    # Pass 2: collect intensities per (precursor bin, fragment bin) cell.
    grid_d = {}
    X, Y = [], []
    with mgf.read(fi) as reader:
        for spectrum in reader:
            title = spectrum['params']['title']
            if title not in usedSpectra:
                continue
            binPrecursor = usedSpectra[title] // precursorBinsize

            maxIntensity, maxIntensityFM = 0, 0
            for fragMass, fragIntensity in zip(spectrum['m/z array'],
                                               spectrum['intensity array']):
                binFrag = int(fragMass // fragBinsize)
                if fragIntensity > maxIntensity:
                    maxIntensity = fragIntensity
                    maxIntensityFM = binFrag
                grid_d.setdefault((binPrecursor, binFrag), []).append(fragIntensity)

            # Weight the base peak by its (rounded) intensity for the KDE.
            weight = int(maxIntensity + 0.5)
            X.extend([maxIntensityFM] * weight)
            Y.extend([binPrecursor] * weight)

    with sns.axes_style("white"):
        sns.despine(left=True)
        sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 1.0})
        pl = sns.jointplot(x=np.array(X), y=np.array(Y), kind="kde", color="g")
        pl.savefig('hex.png', dpi=300)
        del X
        del Y

    # NOTE(review): the grid size is fixed to a 2000-unit range on both axes;
    # cells outside it are dropped, as in the original.
    nrows, ncols = 2000 // precursorBinsize, 2000 // fragBinsize

    for key in sorted(grid_d):
        print("%s %s" % (key, grid_d[key]))

    # Rows are precursor bins and columns are fragment bins, matching the
    # (precursor, fragment) key order of grid_d and the (nrows, ncols) shape.
    grid = np.zeros((nrows, ncols))
    for (precBin, fragBin), intensities in grid_d.items():
        if 0 <= precBin < nrows and 0 <= fragBin < ncols:
            grid[precBin, fragBin] = sum(intensities) / float(len(intensities))

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.pcolormesh(grid, cmap=matplotlib.cm.seismic, vmin=grid.min(), vmax=grid.max())

    plt.tight_layout()
    plt.savefig(os.path.basename(fi) + '.colormap.png', dpi=300)
    plt.savefig(os.path.basename(fi) + '.colormap.svg', dpi=300)
    plt.close()
Exemple #40
0
def ingest_mgf(input_filename):
    """Ingest an mgf file given its name and return a dataframe of the file
    """
    with mgf.read('tests/test.mgf') as reader:
        auxiliary.print_tree(next(reader))