Example #1
def metadata_index(paths, processes=4, deconvoluted=False):
    '''Build an external scan metadata index for a mass spectrometry data file

    This extended index is saved in a separate JSON file that can be loaded with
    :class:`~.ExtendedScanIndex`. It includes the scan time of all scans, the precursor
    mass of MSn scans, the relationships between precursor and product ion scans, and
    other details. See :class:`~.ExtendedScanIndex` for more information.
    '''
    for path in paths:
        click.echo("Indexing %s" % (path, ))
        if deconvoluted:
            reader = ProcessedMzMLDeserializer(path, use_extended_index=False)
        else:
            reader = MSFileLoader(path)
        try:
            fn = reader.prebuild_byte_offset_file
            if not reader.source._check_has_byte_offset_file():
                fn(path)
        except AttributeError:
            pass
        if processes > 1:
            progbar = progress(label='Building Index', length=100)
            acc = [0]

            def update_bar(x):
                '''Progress Bar update callback for :func:`~.quick_index.index`
                '''
                x = int(x * 100)
                x -= acc[0]  # pylint: disable=cell-var-from-loop
                progbar.update(x)  # pylint: disable=cell-var-from-loop
                acc[0] += x  # pylint: disable=cell-var-from-loop

            with progbar:
                update_bar(0.0)
                index, _ = quick_index.index(
                    reader, processes, progress_indicator=update_bar)
        else:
            index = quick_index.ExtendedScanIndex()
            reader.reset()
            try:
                n = len(reader)
                progbar = progress(label='Building Index', length=n)
            except TypeError:
                progbar = spinner(title="Building Index")
            with progbar:
                for bunch in reader.make_iterator(grouped=True):
                    i = 0
                    i += bunch.precursor is not None
                    i += len(bunch.products)
                    index.add_scan_bunch(bunch)
                    progbar.update(i)

        name = path
        index_file_name = index.index_file_name(name)
        with open(index_file_name, 'w') as fh:
            index.serialize(fh)
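
The single-process path above can be reduced to a short standalone sketch using only the calls that appear in this example; the import locations and the input file name below are assumptions, not part of the original snippet.

from ms_deisotope import MSFileLoader                 # assumed import location
from ms_deisotope.feature_map import quick_index      # assumed import location

reader = MSFileLoader("example.mzML")                 # hypothetical input file
index = quick_index.ExtendedScanIndex()
for bunch in reader.make_iterator(grouped=True):
    # each bunch pairs a precursor (MS1) scan with its product (MSn) scans
    index.add_scan_bunch(bunch)

# write the extended index next to the source file, as metadata_index() does
with open(index.index_file_name("example.mzML"), 'w') as fh:
    index.serialize(fh)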
Example #2
def spectrum_clustering(paths, precursor_error_tolerance=1e-5, similarity_thresholds=None, output_path=None,
                        in_memory=False, deconvoluted=False, cache_size=2**10):
    '''Cluster spectra by precursor mass and cosine similarity.

    Spectrum clusters are written out to a text file recording
    cluster precursor mass, within-cluster similarity, and the
    source file and scan ID for each cluster member.
    '''
    if not similarity_thresholds:
        similarity_thresholds = [0.1, 0.4, 0.7]
    else:
        similarity_thresholds = sorted(similarity_thresholds)
    if output_path is None:
        output_path = "-"
    msn_scans = []
    n_spectra = 0

    with click.progressbar(paths, label="Indexing", item_show_func=lambda x: str(x) if x else '') as progbar:
        key_seqs = []
        for path in progbar:
            if deconvoluted:
                reader = ProcessedMzMLDeserializer(path)
                index = reader.extended_index
            else:
                reader, index = _ensure_metadata_index(path)
            key_seqs.append((reader, index))
            n_spectra += len(index.msn_ids)

    with click.progressbar(label="Loading Spectra", length=n_spectra,
                           item_show_func=lambda x: str(x) if x else '') as progbar:
        for reader, index in key_seqs:
            if not in_memory:
                if not reader.has_fast_random_access:
                    click.secho(
                        "%s does not have fast random access, scan fetching may be slow!" % (
                            reader, ), fg='yellow')
                proxy_context = ScanProxyContext(reader, cache_size=cache_size)
                pinfo_map = {
                    pinfo.product_scan_id: pinfo for pinfo in
                    index.get_precursor_information()
                }
                for i in index.msn_ids:
                    progbar.current_item = i
                    progbar.update(1)
                    scan = proxy_context(i)
                    scan.precursor_information = pinfo_map[i]
                    msn_scans.append(scan)
            else:
                if reader.has_fast_random_access:
                    # We have fast random access so we can just loop over the index and pull out
                    # the MSn scans directly without completely traversing the file.
                    for i in index.msn_ids:
                        progbar.current_item = i
                        progbar.update(1)
                        scan = reader.get_scan_by_id(i)
                        if scan.peak_set is None and not deconvoluted:
                            scan = scan.pick_peaks().pack()
                        msn_scans.append(scan)
                else:
                    # If we don't have fast random access, it's better just to loop over
                    # the file and absorb the cost of parsing the MS1 scans.
                    reader.reset()
                    reader.make_iterator(grouped=False)
                    for scan in reader:
                        if scan.ms_level != 1:
                            progbar.current_item = scan.id
                            progbar.update(1)
                            if scan.peak_set is None and not deconvoluted:
                                scan = scan.pick_peaks().pack(bind=True)
                            msn_scans.append(scan)
                # Dispose of the state that is no longer required.
                reader.reset()
                index.clear()

    click.echo("Begin Clustering", err=True)
    clusters = iterative_clustering(
        msn_scans, precursor_error_tolerance, similarity_thresholds)
    click.echo("Clusering Finished", err=True)
    by_size = Counter()
    for cluster in clusters:
        by_size[len(cluster)] += 1
    click.echo("Clusters: {:d}".format(len(clusters)), err=True)
    for key, value in sorted(by_size.items()):
        click.echo("Size {:d}: {:d}".format(key, value), err=True)
    with click.open_file(output_path, mode='w') as outfh:
        writer = ScanClusterWriter(outfh)
        for cluster in clusters:
            writer.save(cluster)
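
As a closing usage sketch, the command body above could be driven directly like this; the file names and threshold values are hypothetical, and it is assumed the function is callable outside of its CLI wrapper.

# Hypothetical invocation of Example #2: cluster MSn spectra from two runs and
# write the cluster report to clusters.txt (both file names are made up here).
spectrum_clustering(
    ["run1.mzML", "run2.mzML"],
    precursor_error_tolerance=1e-5,
    similarity_thresholds=[0.1, 0.4, 0.7],
    output_path="clusters.txt",
    in_memory=True,
    deconvoluted=False,
)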