Example #1
def test_ms_deisotope():
    runner = CliRunner(mix_stderr=False)
    path = datafile("20150710_3um_AGP_001_29_30.mzML.gz")
    reference = datafile("20150710_3um_AGP_001_29_30.preprocessed.mzML.gz")
    outpath = tempfile.mktemp()
    result = runner.invoke(deisotoper.deisotope, [
        "-b", "0", "-t", "20", "-tn", "10", "-m", "3", "-mn", "1", path, outpath
    ])
    assert result.exit_code == 0, result.output
    result_reader = ProcessedMzMLDeserializer(outpath)
    reference_reader = ProcessedMzMLDeserializer(_compression.get_opener(reference))
    assert len(result_reader) == len(reference_reader)
    for a_bunch, b_bunch in zip(result_reader, reference_reader):
        assert len(a_bunch.products) == len(b_bunch.products)
        aprec = a_bunch.precursor
        bprec = b_bunch.precursor
        assert aprec.id == bprec.id
        diffa, diffb = diff_deconvoluted_peak_set(
            aprec.deconvoluted_peak_set, bprec.deconvoluted_peak_set)
        assert len(aprec.deconvoluted_peak_set) == len(
            bprec.deconvoluted_peak_set), "Peak Counts Diff On %r, (%r, %r)" % (aprec.id, diffa, diffb)
        assert aprec.deconvoluted_peak_set == bprec.deconvoluted_peak_set, "Peaks Diff On %r, (%r, %r)" % (
            aprec.id, diffa, diffb)

        for aprod, bprod in zip(a_bunch.products, b_bunch.products):
            assert aprod.id == bprod.id
            diffa, diffb = diff_deconvoluted_peak_set(aprod.deconvoluted_peak_set, bprod.deconvoluted_peak_set)
            assert len(aprod.deconvoluted_peak_set) == len(
                bprod.deconvoluted_peak_set), "Peak Counts Diff On %r, (%r, %r)" % (aprod.id, diffa, diffb)
            assert aprod.deconvoluted_peak_set == bprod.deconvoluted_peak_set, "Peaks Diff On %r, (%r, %r)" % (
                aprod.id, diffa, diffb)

    result_reader.close()
    reference_reader.close()
    os.remove(outpath)
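
For reference, a sketch of the imports this test assumes. The module paths are best guesses and may not match the real test suite; the test helpers datafile and diff_deconvoluted_peak_set are not shown here, and where they live is left open.

# Assumed imports for the test above; paths are hedged guesses.
import os
import tempfile

from click.testing import CliRunner

from ms_deisotope.output.mzml import ProcessedMzMLDeserializer
from ms_deisotope.data_source import _compression
from ms_deisotope.tools import deisotoper
# `datafile` and `diff_deconvoluted_peak_set` are local test helpers;
# their defining module is not shown in this example.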
Example #2
def metadata_index(paths, processes=4, deconvoluted=False):
    '''Build an external scan metadata index for a mass spectrometry data file.

    This extended index is saved in a separate JSON file that can be loaded with
    :class:`~.ExtendedScanIndex`. It records the scan time of every scan, the precursor
    mass of MSn scans, and the relationships between precursor and product ion scans,
    among other details. See :class:`~.ExtendedScanIndex` for more information.
    '''
    for path in paths:
        click.echo("Indexing %s" % (path, ))
        if deconvoluted:
            reader = ProcessedMzMLDeserializer(path, use_extended_index=False)
        else:
            reader = MSFileLoader(path)
        try:
            fn = reader.prebuild_byte_offset_file
            if not reader.source._check_has_byte_offset_file():
                fn(path)
        except AttributeError:
            pass
        if processes > 1:
            progbar = progress(label='Building Index', length=100)
            acc = [0]

            def update_bar(x):
                '''Progress Bar update callback for :func:`~.quick_index.index`
                '''
                x = int(x * 100)
                x -= acc[0]  # pylint: disable=cell-var-from-loop
                progbar.update(x)  # pylint: disable=cell-var-from-loop
                acc[0] += x  # pylint: disable=cell-var-from-loop

            with progbar:
                update_bar(0.0)
                index, _ = quick_index.index(
                    reader, processes, progress_indicator=update_bar)
        else:
            index = quick_index.ExtendedScanIndex()
            reader.reset()
            try:
                n = len(reader)
                progbar = progress(label='Building Index', length=n)
            except TypeError:
                progbar = spinner(title="Building Index")
            with progbar:
                for bunch in reader.make_iterator(grouped=True):
                    i = 0
                    i += bunch.precursor is not None
                    i += len(bunch.products)
                    index.add_scan_bunch(bunch)
                    progbar.update(i)

        index_file_name = index.index_file_name(path)
        with open(index_file_name, 'w') as fh:
            index.serialize(fh)
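
A minimal invocation sketch for the function above, using a hypothetical file name. With processes=1 the single-process branch runs, avoiding multiprocessing overhead for small files, and the resulting JSON index is written beside the input at index.index_file_name(path).

# Hypothetical usage: index one small file without worker processes.
metadata_index(["small_sample.mzML"], processes=1)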
Example #3
def main(source_paths, output_path):
    '''Combine multiple processed mzML files together into a single file sorted by time.
    '''
    sources = []
    for source_path in source_paths:
        click.echo("Reading %r" % source_path)
        sources.append(ProcessedMzMLDeserializer(source_path))
    total_n = sum(map(len, sources))
    writer = MzMLSerializer(open(output_path, 'wb'), total_n)
    iterator = TimeOrderMergingIterator(sources)
    writer.copy_metadata_from(sources[0])
    with writer:
        i = 0
        for bunch in iterator:
            i += 1
            if i % 100 == 0:
                click.echo("Processed %d batches. %d sources depeted" %
                           (i, iterator.count_exhausted()))
            writer.save(bunch)
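
A usage sketch with hypothetical file names. The inputs must already be processed mzML files so ProcessedMzMLDeserializer can read them, and the merged output copies its metadata from the first source.

# Hypothetical usage: merge two processed runs, ordered by scan time.
main(["run_A.preprocessed.mzML", "run_B.preprocessed.mzML"],
     "combined.preprocessed.mzML")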
Example #4
def reader(self):
    reader = ProcessedMzMLDeserializer(get_opener(self.path))
    return reader
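
This method is shown outside its class. A minimal sketch of the kind of holder it assumes follows; the ScanSource name is hypothetical, and only a path attribute is required.

# Hypothetical holder class for the method above.
class ScanSource(object):
    def __init__(self, path):
        self.path = path

    def reader(self):
        # get_opener transparently wraps plain or compressed inputs
        return ProcessedMzMLDeserializer(get_opener(self.path))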
Example #5
def spectrum_clustering(paths, precursor_error_tolerance=1e-5, similarity_thresholds=None, output_path=None,
                        in_memory=False, deconvoluted=False):
    '''Cluster spectra by precursor mass and cosine similarity.

    Spectrum clusters are written out to a text file recording
    cluster precursor mass, within-cluster similarity, and the
    source file and scan ID for each cluster member.
    '''
    if not similarity_thresholds:
        similarity_thresholds = [0.1, 0.4, 0.7]
    else:
        similarity_thresholds = sorted(similarity_thresholds)
    if output_path is None:
        output_path = "-"
    msn_scans = []
    n_spectra = 0

    with click.progressbar(paths, label="Indexing", item_show_func=lambda x: str(x) if x else '') as progbar:
        key_seqs = []
        for path in progbar:
            if deconvoluted:
                reader = ProcessedMzMLDeserializer(path)
                index = reader.extended_index
            else:
                reader, index = _ensure_metadata_index(path)
            key_seqs.append((reader, index))
            n_spectra += len(index.msn_ids)

    with click.progressbar(label="Loading Spectra", length=n_spectra,
                           item_show_func=lambda x: str(x) if x else '') as progbar:
        for reader, index in key_seqs:
            if not in_memory:
                proxy_context = ScanProxyContext(reader)
                pinfo_map = {
                    pinfo.product_scan_id: pinfo for pinfo in
                    index.get_precursor_information()
                }
                for i in index.msn_ids:
                    progbar.current_item = i
                    progbar.update(1)
                    scan = proxy_context(i)
                    scan.precursor_information = pinfo_map[i]
                    msn_scans.append(scan)
            else:
                for i in index.msn_ids:
                    progbar.current_item = i
                    progbar.update(1)
                    scan = reader.get_scan_by_id(i)
                    if scan.peak_set is None and not deconvoluted:
                        scan.pick_peaks()
                    msn_scans.append(scan)

    click.echo("Begin Clustering", err=True)
    clusters = iterative_clustering(
        msn_scans, precursor_error_tolerance, similarity_thresholds)
    by_size = Counter()
    for cluster in clusters:
        by_size[len(cluster)] += 1
    click.echo("Clusters: {:d}".format(len(clusters)))
    for key, value in sorted(by_size.items()):
        click.echo("Size {:d}: {:d}".format(key, value))
    with click.open_file(output_path, mode='w') as outfh:
        writer = ScanClusterWriter(outfh)
        for cluster in clusters:
            writer.save(cluster)
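
A hedged invocation sketch with hypothetical paths. The thresholds are applied in ascending order, and when output_path is None it falls through to "-", which click.open_file treats as standard output.

# Hypothetical usage: cluster two runs at two similarity cutoffs,
# streaming the cluster report to stdout.
spectrum_clustering(
    ["run_A.mzML", "run_B.mzML"],
    precursor_error_tolerance=2e-5,
    similarity_thresholds=[0.5, 0.8],
)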
Example #6
def load_spectra(self):
    return list(
        ProcessedMzMLDeserializer(
            get_test_data("example_glycopeptide_spectra.mzML")))
Example #7
def reader(self):
    reader = ProcessedMzMLDeserializer(idzip.open(self.path))
    return reader
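
This variant differs from Example #4 only in the opener: idzip.open targets idzip-compressed streams, which remain seekable and so keep random access fast, while an ordinary gzip stream does not. A hedged helper sketch, with a hypothetical function name, assuming .gz inputs were written by idzip:

# Hypothetical helper: choose an opener by extension.
def open_processed(path):
    if path.endswith(".gz"):
        # assumes the file was compressed with idzip, not plain gzip
        return ProcessedMzMLDeserializer(idzip.open(path))
    return ProcessedMzMLDeserializer(path)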
Example #8
def spectrum_clustering(paths, precursor_error_tolerance=1e-5, similarity_thresholds=None, output_path=None,
                        in_memory=False, deconvoluted=False, cache_size=2**10):
    '''Cluster spectra by precursor mass and cosine similarity.

    Spectrum clusters are written out to a text file recording
    cluster precursor mass, within-cluster similarity, and the
    source file and scan ID for each cluster member.
    '''
    if not similarity_thresholds:
        similarity_thresholds = [0.1, 0.4, 0.7]
    else:
        similarity_thresholds = sorted(similarity_thresholds)
    if output_path is None:
        output_path = "-"
    msn_scans = []
    n_spectra = 0

    with click.progressbar(paths, label="Indexing", item_show_func=lambda x: str(x) if x else '') as progbar:
        key_seqs = []
        for path in progbar:
            if deconvoluted:
                reader = ProcessedMzMLDeserializer(path)
                index = reader.extended_index
            else:
                reader, index = _ensure_metadata_index(path)
            key_seqs.append((reader, index))
            n_spectra += len(index.msn_ids)

    with click.progressbar(label="Loading Spectra", length=n_spectra,
                           item_show_func=lambda x: str(x) if x else '') as progbar:
        for reader, index in key_seqs:
            if not in_memory:
                if not reader.has_fast_random_access:
                    click.secho(
                        "%s does not have fast random access, scan fetching may be slow!" % (
                            reader, ), fg='yellow')
                proxy_context = ScanProxyContext(reader, cache_size=cache_size)
                pinfo_map = {
                    pinfo.product_scan_id: pinfo for pinfo in
                    index.get_precursor_information()
                }
                for i in index.msn_ids:
                    progbar.current_item = i
                    progbar.update(1)
                    scan = proxy_context(i)
                    scan.precursor_information = pinfo_map[i]
                    msn_scans.append(scan)
            else:
                if reader.has_fast_random_access:
                    # We have fast random access so we can just loop over the index and pull out
                    # the MSn scans directly without completely traversing the file.
                    for i in index.msn_ids:
                        progbar.current_item = i
                        progbar.update(1)
                        scan = reader.get_scan_by_id(i)
                        if scan.peak_set is None and not deconvoluted:
                            scan = scan.pick_peaks().pack()
                        msn_scans.append(scan)
                else:
                    # If we don't have fast random access, it's better to loop over the
                    # whole file and absorb the cost of parsing the MS1 scans.
                    reader.reset()
                    reader.make_iterator(grouped=False)
                    for scan in reader:
                        if scan.ms_level != 1:
                            progbar.current_item = scan.id
                            progbar.update(1)
                            if scan.peak_set is None and not deconvoluted:
                                scan = scan.pick_peaks().pack(bind=True)
                            msn_scans.append(scan)
                # Dispose of the state that is no longer required.
                reader.reset()
                index.clear()

    click.echo("Begin Clustering", err=True)
    clusters = iterative_clustering(
        msn_scans, precursor_error_tolerance, similarity_thresholds)
    click.echo("Clusering Finished", err=True)
    by_size = Counter()
    for cluster in clusters:
        by_size[len(cluster)] += 1
    click.echo("Clusters: {:d}".format(len(clusters)), err=True)
    for key, value in sorted(by_size.items()):
        click.echo("Size {:d}: {:d}".format(key, value), err=True)
    with click.open_file(output_path, mode='w') as outfh:
        writer = ScanClusterWriter(outfh)
        for cluster in clusters:
            writer.save(cluster)
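
Relative to Example #5, this version adds a bounded proxy cache and branches on has_fast_random_access before deciding how to load scans. A hedged invocation sketch with hypothetical paths:

# Hypothetical usage: proxy-backed loading with a smaller cache to
# bound memory while scans are fetched on demand.
spectrum_clustering(
    ["run_A.mzML", "run_B.mzML"],
    similarity_thresholds=[0.5, 0.8],
    output_path="clusters.txt",
    in_memory=False,
    cache_size=2**8,
)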
Example #9
def main(path, reference_path):
    reader = ProcessedMzMLDeserializer(get_opener(path))
    reference_reader = ProcessedMzMLDeserializer(get_opener(reference_path))
    compare_readers(reader, reference_reader)
    print("Processed Files Appear to Match Perfectly.")