Example #1
    def test_consumer(self):
        (ms1_peak_picking_args, msn_peak_picking_args, ms1_deconvolution_args,
         msn_deconvolution_args) = self.build_args()
        outdir = self.make_output_directory()
        outpath = os.path.join(outdir, "test-output.mzML")

        consumer = SampleConsumer(
            agp_glycomics_mzml,
            ms1_peak_picking_args=ms1_peak_picking_args,
            ms1_deconvolution_args=ms1_deconvolution_args,
            msn_peak_picking_args=msn_peak_picking_args,
            msn_deconvolution_args=msn_deconvolution_args,
            storage_path=outpath,
            sample_name=None,
            n_processes=5,
            extract_only_tandem_envelopes=False)
        consumer.start()

        reader = ProcessedMzMLDeserializer(outpath)

        scan = reader.get_scan_by_id("scanId=1601016")
        self.assertIsNotNone(
            scan.deconvoluted_peak_set.has_peak(958.66, use_mz=1))

        reader.close()

        self.cleanup(outdir)
Example #2
    def test_consumer(self):
        (ms1_peak_picking_args, msn_peak_picking_args,
         ms1_deconvolution_args, msn_deconvolution_args) = self.build_args()
        outdir = self.make_output_directory()
        outpath = os.path.join(outdir, "test-output.mzML")

        consumer = SampleConsumer(
            agp_glycomics_mzml,
            ms1_peak_picking_args=ms1_peak_picking_args,
            ms1_deconvolution_args=ms1_deconvolution_args,
            msn_peak_picking_args=msn_peak_picking_args,
            msn_deconvolution_args=msn_deconvolution_args,
            storage_path=outpath, sample_name=None,
            n_processes=5,
            extract_only_tandem_envelopes=False)
        consumer.start()

        reader = ProcessedMzMLDeserializer(outpath)

        scan = reader.get_scan_by_id("scanId=1601016")
        self.assertIsNotNone(scan.deconvoluted_peak_set.has_peak(958.66, use_mz=1))

        reader.close()

        self.cleanup(outdir)
Example #3
 def _make_scan_loader(self):
     if self.mzml_path is not None:
         if not os.path.exists(self.mzml_path):
             raise IOError("No such file {}".format(self.mzml_path))
         self.scan_loader = ProcessedMzMLDeserializer(self.mzml_path)
     else:
         self.mzml_path = self.analysis.parameters['sample_path']
         if not os.path.exists(self.mzml_path):
             raise IOError((
                 "No such file {}. If {} was relocated, you may need to explicily pass the"
                 " corrected file path.").format(
                 self.mzml_path,
                 self.database_connection._original_connection))
         self.scan_loader = ProcessedMzMLDeserializer(self.mzml_path)
Example #4
 def _make_scan_loader(self):
     if self.mzml_path is not None:
         if not os.path.exists(self.mzml_path):
             raise IOError("No such file {}".format(self.mzml_path))
         self.scan_loader = ProcessedMzMLDeserializer(self.mzml_path)
     else:
         self.mzml_path = self.analysis.parameters['sample_path']
         if not os.path.exists(self.mzml_path):
             raise IOError((
                 "No such file {}. If {} was relocated, you may need to explicily pass the"
                 " corrected file path.").format(
                     self.mzml_path,
                     self.database_connection._original_connection))
         self.scan_loader = ProcessedMzMLDeserializer(self.mzml_path)
Example #5
 def __init__(self, record, minimum_mass=None, abundance_threshold=None):
     SimpleViewBase.__init__(self)
     self.record = record
     self.reader = ProcessedMzMLDeserializer(record.path)
     self.scan_levels = {
         "1": len(self.reader.extended_index.ms1_ids),
         "N": len(self.reader.extended_index.msn_ids)
     }
     self.minimum_mass = minimum_mass
     self.abundance_threshold = abundance_threshold
     self._chromatograms = None
     self.chromatograms = None
     self.total_ion_chromatogram = None
     self.oxonium_ion_chromatogram = None
     self.chromatogram_artist = None
     self.oxonium_ion_artist = None
Example #6
def add_sample(project_path, sample_path):
    from glycresoft_app.project.project import Project
    from ms_deisotope.output.mzml import ProcessedMzMLDeserializer
    project = Project(project_path)
    reader = ProcessedMzMLDeserializer(abspath(sample_path))
    record = project.sample_manager.make_record(reader)
    project.sample_manager.put(record)
    project.sample_manager.dump()
Example #7
 def is_resolvable(self):
     if not os.path.exists(self.path):
         return False
     reader = ProcessedMzMLDeserializer(self.path, use_index=False)
     sample_run = reader.sample_run
     if sample_run.uuid != self.uuid:
         return False
     return True
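Most examples on this page follow the same pattern: open a processed mzML file with ProcessedMzMLDeserializer, consult its extended index, and pull scans back out by ID. A minimal sketch of that pattern, assuming the file and its index sidecar already exist (the path is a placeholder):

import os

from ms_deisotope.output.mzml import ProcessedMzMLDeserializer

# Placeholder path to a deconvoluted mzML file written by SampleConsumer
path = "processed-sample.mzML"
if os.path.exists(path):
    reader = ProcessedMzMLDeserializer(path)
    # The extended index partitions scan IDs into MS1 and MSn groups;
    # it is assumed to have been built when the file was written.
    print("MS1 scans: %d" % len(reader.extended_index.ms1_ids))
    print("MSn scans: %d" % len(reader.extended_index.msn_ids))
    # Scans are fetched by their native ID string, which is specific
    # to the source instrument file.
    for scan_id in reader.extended_index.msn_ids:
        scan = reader.get_scan_by_id(scan_id)
        print(scan.id, len(scan.deconvoluted_peak_set))
        break
    reader.close()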
Example #8
    def test_consumer(self):
        (ms1_peak_picking_args, msn_peak_picking_args, ms1_deconvolution_args,
         msn_deconvolution_args) = self.build_args()
        outdir = self.make_output_directory()
        outpath = os.path.join(outdir, "test-output.mzML")

        consumer = SampleConsumer(
            agp_glycproteomics_mzml,
            ms1_peak_picking_args=ms1_peak_picking_args,
            ms1_deconvolution_args=ms1_deconvolution_args,
            msn_peak_picking_args=msn_peak_picking_args,
            msn_deconvolution_args=msn_deconvolution_args,
            storage_path=outpath,
            sample_name=None,
            n_processes=5,
            extract_only_tandem_envelopes=True,
            ms1_averaging=1)
        consumer.start()

        reader = ProcessedMzMLDeserializer(outpath)
        reference = ProcessedMzMLDeserializer(
            agp_glycproteomics_mzml_reference)

        for a_bunch, b_bunch in zip(reader, reference):
            assert a_bunch.precursor.id == b_bunch.precursor.id
            assert len(a_bunch.products) == len(b_bunch.products)
            for a_product, b_product in zip(a_bunch.products,
                                            b_bunch.products):
                assert a_product.precursor_information.defaulted == b_product.precursor_information.defaulted
                matched = np.isclose(
                    a_product.precursor_information.neutral_mass,
                    b_product.precursor_information.neutral_mass)
                message = [
                    "%0.3f not close to %0.3f for %s of %s" %
                    (a_product.precursor_information.neutral_mass,
                     b_product.precursor_information.neutral_mass,
                     a_product.id,
                     a_product.precursor_information.precursor_scan_id)
                ]
                message.append(
                    "Found precursor score %r, expected %r" %
                    (a_product.precursor_information.precursor.
                     deconvoluted_peak_set.has_peak(
                         a_product.precursor_information.neutral_mass).score,
                     b_product.precursor_information.precursor.
                     deconvoluted_peak_set.has_peak(
                         b_product.precursor_information.neutral_mass).score))
                assert matched, '\n'.join(message)
                assert len(a_product.deconvoluted_peak_set) == len(
                    b_product.deconvoluted_peak_set)

        reader.close()
        reference.close()

        self.cleanup(outdir)
Example #9
def oxonium_signature(ms_file, g_score_threshold=0.05):
    reader = ProcessedMzMLDeserializer(ms_file)
    if not reader.has_index_file():
        click.secho("Building temporary index...", fg='yellow')
        index, intervals = quick_index.index(ms_deisotope.MSFileLoader(ms_file))
        reader.extended_index = index
        with open(reader._index_file_name, 'w') as handle:
            index.serialize(handle)

    from glycan_profiling.tandem.glycan.scoring.signature_ion_scoring import SignatureIonScorer
    from glycan_profiling.tandem.oxonium_ions import gscore_scanner
    refcomp = glypy.GlycanComposition.parse("{Fuc:1; Hex:5; HexNAc:4; Neu5Ac:2}")
    for scan_id in reader.extended_index.msn_ids.keys():
        scan = reader.get_scan_by_id(scan_id)
        gscore = gscore_scanner(scan.deconvoluted_peak_set)
        if gscore >= g_score_threshold:
            signature_match = SignatureIonScorer.evaluate(scan, refcomp)
            click.echo("%s\t%f\t%r\t%f\t%f" % (
                scan_id, scan.precursor_information.neutral_mass,
                scan.precursor_information.charge, gscore,
                signature_match.score))
Example #10
 def make_peak_loader(self):
     peak_loader = ProcessedMzMLDeserializer(self.sample_path)
     if peak_loader.extended_index is None:
         if not peak_loader.has_index_file():
             self.log("Index file missing. Rebuilding.")
             peak_loader.build_extended_index()
         else:
             peak_loader.read_index_file()
         if peak_loader.extended_index is None or len(peak_loader.extended_index.msn_ids) < 1:
             raise ValueError("Sample Data Invalid: Could not validate MS/MS Index")
     return peak_loader
Example #11
def msfile_info(ms_file):
    reader = ProcessedMzMLDeserializer(ms_file)
    if not reader.has_index_file():
        index, intervals = quick_index.index(
            ms_deisotope.MSFileLoader(ms_file))
        reader.extended_index = index
        with open(reader._index_file_name, 'w') as handle:
            index.serialize(handle)
    click.echo("Name: %s" % (os.path.basename(ms_file), ))
    click.echo("MS1 Scans: %d" % (len(reader.extended_index.ms1_ids), ))
    click.echo("MSn Scans: %d" % (len(reader.extended_index.msn_ids), ))

    n_defaulted = 0
    n_orphan = 0

    charges = defaultdict(int)
    first_msn = float('inf')
    last_msn = 0
    for scan_info in reader.extended_index.msn_ids.values():
        n_defaulted += scan_info.get('defaulted', False)
        n_orphan += scan_info.get('orphan', False)
        charges[scan_info['charge']] += 1
        rt = scan_info['scan_time']
        if rt < first_msn:
            first_msn = rt
        if rt > last_msn:
            last_msn = rt

    click.echo("First MSn Scan: %0.2f Minutes" % (first_msn, ))
    click.echo("Last MSn Scan: %0.2f Minutes" % (last_msn, ))

    for charge, count in sorted(charges.items()):
        if not isinstance(charge, int):
            continue
        click.echo("Precursors with Charge State %d: %d" % (charge, count))

    click.echo("Defaulted MSn Scans: %d" % (n_defaulted, ))
    click.echo("Orphan MSn Scans: %d" % (n_orphan, ))
Example #12
def oxonium_signature(ms_file, g_score_threshold=0.05):
    reader = ProcessedMzMLDeserializer(ms_file)
    if not reader.has_index_file():
        click.secho("Building temporary index...", fg='yellow')
        index, intervals = quick_index.index(
            ms_deisotope.MSFileLoader(ms_file))
        reader.extended_index = index
        with open(reader._index_file_name, 'w') as handle:
            index.serialize(handle)

    from glycan_profiling.tandem.glycan.scoring.signature_ion_scoring import SignatureIonScorer
    from glycan_profiling.tandem.oxonium_ions import gscore_scanner
    refcomp = glypy.GlycanComposition.parse(
        "{Fuc:1; Hex:5; HexNAc:4; Neu5Ac:2}")
    for scan_id in reader.extended_index.msn_ids.keys():
        scan = reader.get_scan_by_id(scan_id)
        gscore = gscore_scanner(scan.deconvoluted_peak_set)
        if gscore >= g_score_threshold:
            signature_match = SignatureIonScorer.evaluate(scan, refcomp)
            click.echo("%s\t%f\t%r\t%f\t%f" %
                       (scan_id, scan.precursor_information.neutral_mass,
                        scan.precursor_information.charge, gscore,
                        signature_match.score))
Example #13
 def make_record(cls, reader):
     if isinstance(reader, basestring):
         reader = ProcessedMzMLDeserializer(reader, use_index=True)
     sample = reader.sample_run
     if len(reader.extended_index.msn_ids) > 0:
         sample_type = "MS/MS Sample"
     else:
         sample_type = "MS Sample"
     record = SampleRunRecord(name=sample.name,
                              uuid=sample.uuid,
                              path=reader.source_file,
                              completed=True,
                              sample_type=sample_type)
     return record
Example #14
def msfile_info(ms_file):
    reader = ProcessedMzMLDeserializer(ms_file)
    if not reader.has_index_file():
        index, intervals = quick_index.index(ms_deisotope.MSFileLoader(ms_file))
        reader.extended_index = index
        with open(reader._index_file_name, 'w') as handle:
            index.serialize(handle)
    click.echo("Name: %s" % (os.path.basename(ms_file),))
    click.echo("MS1 Scans: %d" % (len(reader.extended_index.ms1_ids),))
    click.echo("MSn Scans: %d" % (len(reader.extended_index.msn_ids),))

    n_defaulted = 0
    n_orphan = 0

    charges = defaultdict(int)
    first_msn = float('inf')
    last_msn = 0
    for scan_info in reader.extended_index.msn_ids.values():
        n_defaulted += scan_info.get('defaulted', False)
        n_orphan += scan_info.get('orphan', False)
        charges[scan_info['charge']] += 1
        rt = scan_info['scan_time']
        if rt < first_msn:
            first_msn = rt
        if rt > last_msn:
            last_msn = rt

    click.echo("First MSn Scan: %0.2f Minutes" % (first_msn,))
    click.echo("Last MSn Scan: %0.2f Minutes" % (last_msn,))

    for charge, count in sorted(charges.items()):
        if not isinstance(charge, int):
            continue
        click.echo("Precursors with Charge State %d: %d" % (charge, count))

    click.echo("Defaulted MSn Scans: %d" % (n_defaulted,))
    click.echo("Orphan MSn Scans: %d" % (n_orphan,))
Example #15
 def make_peak_loader(self):
     peak_loader = ProcessedMzMLDeserializer(self.sample_path)
     if peak_loader.extended_index is None:
         if not peak_loader.has_index_file():
             self.log("Index file missing. Rebuilding.")
             peak_loader.build_extended_index()
         else:
             peak_loader.read_index_file()
         if peak_loader.extended_index is None or len(peak_loader.extended_index.msn_ids) < 1:
             raise ValueError("Sample Data Invalid: Could not validate MS/MS Index")
     return peak_loader
Example #16
    def test_consumer(self):
        (ms1_peak_picking_args, msn_peak_picking_args,
         ms1_deconvolution_args, msn_deconvolution_args) = self.build_args()
        outdir = self.make_output_directory()
        outpath = os.path.join(outdir, "test-output.mzML")

        consumer = SampleConsumer(
            agp_glycproteomics_mzml,
            ms1_peak_picking_args=ms1_peak_picking_args,
            ms1_deconvolution_args=ms1_deconvolution_args,
            msn_peak_picking_args=msn_peak_picking_args,
            msn_deconvolution_args=msn_deconvolution_args,
            storage_path=outpath, sample_name=None,
            n_processes=5,
            extract_only_tandem_envelopes=True,
            ms1_averaging=1)
        consumer.start()

        reader = ProcessedMzMLDeserializer(outpath)
        reference = ProcessedMzMLDeserializer(agp_glycproteomics_mzml_reference)

        for a_bunch, b_bunch in zip(reader, reference):
            assert a_bunch.precursor.id == b_bunch.precursor.id
            assert len(a_bunch.products) == len(b_bunch.products)
            for a_product, b_product in zip(a_bunch.products, b_bunch.products):
                assert a_product.precursor_information.defaulted == b_product.precursor_information.defaulted
                matched = np.isclose(a_product.precursor_information.neutral_mass,
                                     b_product.precursor_information.neutral_mass)
                message = ["%0.3f not close to %0.3f for %s of %s" % (
                    a_product.precursor_information.neutral_mass,
                    b_product.precursor_information.neutral_mass,
                    a_product.id, a_product.precursor_information.precursor_scan_id)]
                message.append("Found precursor score %r, expected %r" % (
                    a_product.precursor_information.precursor.deconvoluted_peak_set.has_peak(
                        a_product.precursor_information.neutral_mass).score,
                    b_product.precursor_information.precursor.deconvoluted_peak_set.has_peak(
                        b_product.precursor_information.neutral_mass).score
                ))
                assert matched, '\n'.join(message)
                assert len(a_product.deconvoluted_peak_set) == len(b_product.deconvoluted_peak_set)

        reader.close()
        reference.close()

        self.cleanup(outdir)
Example #17
def analyze_glycopeptide_sequences(
        database_connection,
        sample_path,
        hypothesis_identifier,
        output_path,
        analysis_name,
        grouping_error_tolerance=1.5e-5,
        mass_error_tolerance=1e-5,
        msn_mass_error_tolerance=2e-5,
        psm_fdr_threshold=0.05,
        peak_shape_scoring_model=None,
        minimum_oxonium_threshold=0.05,
        workload_size=1000,
        use_peptide_mass_filter=True,
        mass_shifts=None,
        permute_decoy_glycan_fragments=False,
        include_rare_signature_ions=False,
        model_retention_time=False,
        search_strategy=GlycopeptideSearchStrategyEnum.classic,
        decoy_database_connection=None,
        decoy_hypothesis_id=None,
        tandem_scoring_model=None,
        channel=None,
        **kwargs):
    if peak_shape_scoring_model is None:
        peak_shape_scoring_model = GeneralScorer.clone()
        peak_shape_scoring_model.add_feature(get_feature("null_charge"))

    database_connection = DatabaseBoundOperation(database_connection)
    if decoy_database_connection:
        decoy_database_connection = DatabaseBoundOperation(
            decoy_database_connection)

    if not os.path.exists(sample_path):
        channel.send(
            Message("Could not locate sample %r" % sample_path, "error"))
        return

    reader = ProcessedMzMLDeserializer(sample_path, use_index=False)
    sample_run = reader.sample_run

    try:
        hypothesis = get_by_name_or_id(database_connection,
                                       GlycopeptideHypothesis,
                                       hypothesis_identifier)
    except Exception:
        channel.send(
            Message("Could not locate hypothesis %r" % hypothesis_identifier,
                    "error"))
        channel.abort("An error occurred during analysis.")

    if decoy_database_connection:
        try:
            decoy_hypothesis = get_by_name_or_id(decoy_database_connection,
                                                 GlycopeptideHypothesis,
                                                 decoy_hypothesis_id)
        except Exception:
            channel.send(
                Message("Could not locate hypothesis %r" % decoy_hypothesis_id,
                        "error"))
            channel.abort("An error occurred during analysis.")

    if analysis_name is None:
        analysis_name = "%s @ %s" % (sample_run.name, hypothesis.name)
    analysis_name = validate_analysis_name(None, database_connection.session,
                                           analysis_name)

    try:
        mass_shift_out = []
        for mass_shift, multiplicity in mass_shifts:
            mass_shift_out.append(validate_mass_shift(mass_shift,
                                                      multiplicity))
        expanded = []
        expanded = MzMLGlycanChromatogramAnalyzer.expand_mass_shifts(
            dict(mass_shift_out), crossproduct=False)
        mass_shifts = expanded
    except Abort:
        channel.send(Message.traceback())
        return

    try:
        if search_strategy == GlycopeptideSearchStrategyEnum.classic:
            analyzer = MzMLGlycopeptideLCMSMSAnalyzer(
                database_connection._original_connection,
                hypothesis.id,
                sample_path,
                output_path=output_path,
                analysis_name=analysis_name,
                grouping_error_tolerance=grouping_error_tolerance,
                mass_error_tolerance=mass_error_tolerance,
                msn_mass_error_tolerance=msn_mass_error_tolerance,
                psm_fdr_threshold=psm_fdr_threshold,
                peak_shape_scoring_model=peak_shape_scoring_model,
                oxonium_threshold=minimum_oxonium_threshold,
                spectrum_batch_size=workload_size,
                use_peptide_mass_filter=use_peptide_mass_filter,
                mass_shifts=mass_shifts,
                permute_decoy_glycans=permute_decoy_glycan_fragments,
                rare_signatures=include_rare_signature_ions,
                model_retention_time=model_retention_time,
                tandem_scoring_model=tandem_scoring_model)
        elif search_strategy == GlycopeptideSearchStrategyEnum.classic_comparison:
            analyzer = MzMLComparisonGlycopeptideLCMSMSAnalyzer(
                database_connection._original_connection,
                decoy_database_connection._original_connection,
                hypothesis.id,
                sample_path,
                output_path=output_path,
                analysis_name=analysis_name,
                grouping_error_tolerance=grouping_error_tolerance,
                mass_error_tolerance=mass_error_tolerance,
                msn_mass_error_tolerance=msn_mass_error_tolerance,
                psm_fdr_threshold=psm_fdr_threshold,
                peak_shape_scoring_model=peak_shape_scoring_model,
                oxonium_threshold=minimum_oxonium_threshold,
                spectrum_batch_size=workload_size,
                use_peptide_mass_filter=use_peptide_mass_filter,
                mass_shifts=mass_shifts,
                permute_decoy_glycans=permute_decoy_glycan_fragments,
                rare_signatures=include_rare_signature_ions,
                model_retention_time=model_retention_time,
                tandem_scoring_model=tandem_scoring_model)
        elif search_strategy == GlycopeptideSearchStrategyEnum.multipart:
            analyzer = MultipartGlycopeptideLCMSMSAnalyzer(
                database_connection._original_connection,
                decoy_database_connection._original_connection,
                hypothesis.id,
                decoy_hypothesis.id,
                sample_path,
                output_path=output_path,
                analysis_name=analysis_name,
                grouping_error_tolerance=grouping_error_tolerance,
                mass_error_tolerance=mass_error_tolerance,
                msn_mass_error_tolerance=msn_mass_error_tolerance,
                psm_fdr_threshold=psm_fdr_threshold,
                peak_shape_scoring_model=peak_shape_scoring_model,
                spectrum_batch_size=workload_size,
                mass_shifts=mass_shifts,
                rare_signatures=include_rare_signature_ions,
                model_retention_time=model_retention_time,
                tandem_scoring_model=tandem_scoring_model)
        _ = analyzer.start()

        analysis = analyzer.analysis
        if analysis is not None:
            record = project_analysis.AnalysisRecord(
                name=analysis.name,
                id=analysis.id,
                uuid=analysis.uuid,
                path=output_path,
                analysis_type=analysis.analysis_type,
                hypothesis_uuid=analysis.hypothesis.uuid,
                hypothesis_name=analysis.hypothesis.name,
                sample_name=analysis.parameters['sample_name'],
                user_id=channel.user.id)
            channel.send(Message(record.to_json(), 'new-analysis'))
        else:
            channel.send(
                Message("No glycopeptides were identified for \"%s\"" %
                        (analysis_name, )))

    except Exception:
        channel.send(Message.traceback())
        channel.abort("An error occurred during analysis.")
Example #18
class SampleView(SimpleViewBase):
    def __init__(self, record, minimum_mass=None, abundance_threshold=None):
        SimpleViewBase.__init__(self)
        self.record = record
        self.reader = ProcessedMzMLDeserializer(record.path)
        self.scan_levels = {
            "1": len(self.reader.extended_index.ms1_ids),
            "N": len(self.reader.extended_index.msn_ids)
        }
        self.minimum_mass = minimum_mass
        self.abundance_threshold = abundance_threshold
        self._chromatograms = None
        self.chromatograms = None
        self.total_ion_chromatogram = None
        self.oxonium_ion_chromatogram = None
        self.chromatogram_artist = None
        self.oxonium_ion_artist = None

    @property
    def chromatograms(self):
        if self._chromatograms is None:
            self.build_chromatograms()
        return self._chromatograms

    @chromatograms.setter
    def chromatograms(self, value):
        self._chromatograms = value

    def _estimate_threshold(self):
        intensity_accumulator = []
        mz_accumulator = []
        charge_accumulator = []
        if not self.reader.extended_index.ms1_ids:
            self.mass_array = np.array([])
            self.charge_array = np.array([])
            self.intensity_array = np.array([])
            self.abundance_threshold = 0
            self.minimum_mass = 0
            return
        for scan_id in self.reader.extended_index.ms1_ids:
            header = self.reader.get_scan_header_by_id(scan_id)
            intensity_accumulator.extend(header.arrays.intensity)
            mz_accumulator.extend(header.arrays.mz)
            try:
                charge_accumulator.extend(header['charge array'])
            except Exception:
                charge_accumulator.extend(
                    np.ones_like(header.arrays.mz) * header.polarity)

        mass_array = ms_deisotope.neutral_mass(np.array(mz_accumulator),
                                               np.array(charge_accumulator))
        self.mass_array = mass_array
        self.charge_array = np.array(charge_accumulator, dtype=int)
        self.intensity_array = np.array(intensity_accumulator)
        if self.abundance_threshold is None and intensity_accumulator:
            self.abundance_threshold = np.percentile(intensity_accumulator, 90)
        if self.minimum_mass is None and len(mass_array):
            counts, bins = np.histogram(self.mass_array)
            self.minimum_mass = np.average(bins[:-1], weights=counts)

    def build_oxonium_ion_chromatogram(self):
        window_width = 0.01
        ox_time = []
        ox_current = []
        for scan_id in self.reader.extended_index.msn_ids:
            try:
                scan = self.reader.get_scan_header_by_id(scan_id)
            except AttributeError:
                print("Unable to resolve scan id %r" % scan_id)
                break
            mz, intens = scan.arrays
            total = 0
            for ion in standard_oxonium_ions:
                coords = sweep(mz, ion.mass() + 1.007, window_width)
                total += intens[coords].sum()
            ox_time.append(scan.scan_time)
            ox_current.append(total)
        self.oxonium_ion_chromatogram = list(
            map(np.array, (ox_time, ox_current)))

    def draw_chromatograms(self):
        if self.chromatograms is None:
            self.build_chromatograms()
        ax = figax()
        chromatograms = list(self.chromatograms)
        if len(chromatograms):
            chromatograms.append(self.total_ion_chromatogram)
            chromatograms = [
                chrom for chrom in chromatograms if len(chrom) > 0
            ]
            a = SmoothingChromatogramArtist(
                chromatograms, ax=ax, colorizer=lambda *a, **k: 'lightblue')
            a.draw(label_function=lambda *a, **kw: "")
            rt, intens = self.total_ion_chromatogram.as_arrays()
            a.draw_generic_chromatogram("TIC", rt, intens, 'lightblue')
            a.ax.set_ylim(0, max(intens) * 1.1)
            chromatogram_artist = a
            fig = chromatogram_artist.ax.get_figure()
            fig.set_figwidth(10)
            fig.set_figheight(5)
            # if self.reader.extended_index.msn_ids:
            #     oxonium_axis = ax.twinx()
            #     stub = SimpleChromatogram(
            #         self.total_ion_chromatogram.time_converter)
            #     for key in self.total_ion_chromatogram:
            #         stub[key] = 0
            #     oxonium_ion_artist = SmoothingChromatogramArtist(
            #         [stub],
            #         ax=oxonium_axis).draw(
            #         label_function=lambda *a, **kw: "")
            #     rt, intens = self.oxonium_ion_chromatogram
            #     oxonium_axis.set_ylim(0, max(intens) * 1.1)
            #     oxonium_axis.yaxis.tick_right()
            #     oxonium_axis.axes.spines['right'].set_visible(True)
            #     oxonium_axis.set_ylabel("Oxonium Abundance", fontsize=18)
            #     oxonium_ion_artist.draw_generic_chromatogram(
            #         "Oxonium Ions", rt, intens, 'green')
        else:
            ax.text(0.5,
                    0.5,
                    "No chromatograms extracted",
                    ha='center',
                    fontsize=16)
            ax.axis('off')
        return png_plot(ax,
                        patchless=True,
                        bbox_inches='tight',
                        width=12,
                        height=8)

    def build_chromatograms(self):
        if self.abundance_threshold is None:
            self._estimate_threshold()
        ex = ChromatogramExtractor(self.reader,
                                   minimum_intensity=self.abundance_threshold,
                                   minimum_mass=self.minimum_mass)
        self.chromatograms = ex.run()
        self.total_ion_chromatogram = ex.total_ion_chromatogram

        if self.reader.extended_index.msn_ids:
            self.build_oxonium_ion_chromatogram()

    def draw_lcms_map(self):
        if self.abundance_threshold is None:
            self._estimate_threshold()
        ax = figax()
        artist = LCMSMapArtist.from_peak_loader(
            self.reader, threshold=self.abundance_threshold / 2., ax=ax)
        artist.draw()
        return png_plot(ax,
                        patchless=True,
                        bbox_inches='tight',
                        width=10,
                        height=10)
Example #19
    def test_writer(self):
        source_reader = MzMLLoader(self.source_data_path)
        fd, name = tempfile.mkstemp()
        with open(name, 'wb') as fh:
            writer = MzMLSerializer(fh,
                                    n_spectra=len(source_reader.index),
                                    deconvoluted=True)
            description = source_reader.file_description()
            writer.add_file_information(description)
            writer.add_file_contents("profile spectrum")
            writer.add_file_contents("centroid spectrum")
            writer.remove_file_contents("profile spectrum")

            instrument_configs = source_reader.instrument_configuration()
            for config in instrument_configs:
                writer.add_instrument_configuration(config)

            software_list = source_reader.software_list()
            for software in software_list:
                writer.add_software(software)

            data_processing_list = source_reader.data_processing()
            for dp in data_processing_list:
                writer.add_data_processing(dp)

            processing = writer.build_processing_method()
            writer.add_data_processing(processing)
            bunch = next(source_reader)
            bunch.precursor.pick_peaks()
            bunch.precursor.deconvolute()
            for product in bunch.products:
                product.pick_peaks()
                product.deconvolute()
            writer.save(bunch)
            writer.complete()
            fh.flush()
            writer.format()
        source_reader.reset()
        processed_reader = ProcessedMzMLDeserializer(
            _compression.get_opener(writer.handle.name))

        for a, b in zip(source_reader.instrument_configuration(),
                        processed_reader.instrument_configuration()):
            assert a.analyzers == b.analyzers
        for a, b in zip(source_reader, processed_reader):
            assert a.precursor.id == b.precursor.id
            assert (a.precursor.acquisition_information ==
                    b.precursor.acquisition_information)
            for an, bn in zip(a.products, b.products):
                assert an.id == bn.id
                assert abs(an.precursor_information.neutral_mass -
                           bn.precursor_information.neutral_mass) < 1e-6
        processed_reader.reset()
        description = processed_reader.file_description()
        assert "profile spectrum" not in description.contents
        assert "centroid spectrum" in description.contents
        sf = description.source_files[0]
        assert 'location' not in sf.parameters
        assert sf.parameters[
            'SHA-1'] == 'a2a091b82f27676da87a6c7d17cc90d2d90b8fbf'
        index = processed_reader.extended_index
        pinfo = index.find_msms_by_precursor_mass(
            ms_deisotope.neutral_mass(562.7397, 2))
        assert len(pinfo) > 0

        processed_reader.close()
        try:
            os.remove(name)
            os.remove(processed_reader._index_file_name)
        except OSError:
            pass
Example #20
def analyze_glycopeptide_sequences(database_connection,
                                   sample_path,
                                   hypothesis_identifier,
                                   output_path,
                                   analysis_name,
                                   grouping_error_tolerance=1.5e-5,
                                   mass_error_tolerance=1e-5,
                                   msn_mass_error_tolerance=2e-5,
                                   psm_fdr_threshold=0.05,
                                   peak_shape_scoring_model=None,
                                   minimum_oxonium_threshold=0.05,
                                   workload_size=1000,
                                   channel=None,
                                   **kwargs):
    if peak_shape_scoring_model is None:
        peak_shape_scoring_model = chromatogram_solution.ChromatogramScorer(
            shape_fitter_type=shape_fitter.
            AdaptiveMultimodalChromatogramShapeFitter)
    database_connection = DatabaseBoundOperation(database_connection)

    if not os.path.exists(sample_path):
        channel.send(
            Message("Could not locate sample %r" % sample_path, "error"))
        return

    reader = ProcessedMzMLDeserializer(sample_path, use_index=False)
    sample_run = reader.sample_run

    try:
        hypothesis = get_by_name_or_id(database_connection,
                                       GlycopeptideHypothesis,
                                       hypothesis_identifier)
    except Exception:
        channel.send(
            Message("Could not locate hypothesis %r" % hypothesis_identifier,
                    "error"))
        channel.abort("An error occurred during analysis.")

    if analysis_name is None:
        analysis_name = "%s @ %s" % (sample_run.name, hypothesis.name)
    analysis_name = validate_analysis_name(None, database_connection.session,
                                           analysis_name)

    try:
        analyzer = MzMLGlycopeptideLCMSMSAnalyzer(
            database_connection._original_connection,
            hypothesis.id,
            sample_path,
            output_path=output_path,
            analysis_name=analysis_name,
            grouping_error_tolerance=grouping_error_tolerance,
            mass_error_tolerance=mass_error_tolerance,
            msn_mass_error_tolerance=msn_mass_error_tolerance,
            psm_fdr_threshold=psm_fdr_threshold,
            peak_shape_scoring_model=peak_shape_scoring_model,
            oxonium_threshold=minimum_oxonium_threshold,
            spectra_chunk_size=workload_size)
        gps, unassigned, target_hits, decoy_hits = analyzer.start()

        analysis = analyzer.analysis
        record = project_analysis.AnalysisRecord(
            name=analysis.name,
            id=analysis.id,
            uuid=analysis.uuid,
            path=output_path,
            analysis_type=analysis.analysis_type,
            hypothesis_uuid=analysis.hypothesis.uuid,
            hypothesis_name=analysis.hypothesis.name,
            sample_name=analysis.parameters['sample_name'],
            user_id=channel.user.id)
        channel.send(Message(record.to_json(), 'new-analysis'))

    except Exception:
        channel.send(Message.traceback())
        channel.abort("An error occurred during analysis.")
Example #21
 def open_file(index_file):
     data_file = index_file.rsplit("-", 1)[0]
     reader = ProcessedMzMLDeserializer(data_file, use_index=False)
     reader.read_index_file()
     return reader
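open_file assumes the index sidecar is named by appending a hyphenated suffix to the data file's path (e.g. "sample.mzML-idx.json", the shape reader._index_file_name produces in the examples above), so splitting off the last "-" segment recovers the data file. A usage sketch with placeholder paths:

# Placeholder: the data file "processed-sample.mzML" with its index
# sidecar "processed-sample.mzML-idx.json" written beside it.
reader = open_file("processed-sample.mzML-idx.json")
print(reader.sample_run.name)
reader.close()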
Example #22
def preprocess(mzml_file, database_connection, averagine=None, start_time=None, end_time=None,
               maximum_charge=None, name=None, msn_averagine=None, score_threshold=35.,
               msn_score_threshold=5., missed_peaks=1, msn_missed_peaks=1, n_processes=5, storage_path=None,
               extract_only_tandem_envelopes=False, ms1_background_reduction=5.,
               msn_background_reduction=0, ms1_averaging=0, channel=None):

    minimum_charge = 1 if maximum_charge > 0 else -1
    charge_range = (minimum_charge, maximum_charge)
    logger.info("Begin Scan Interpolation")
    loader: RandomAccessScanSource = MSFileLoader(mzml_file)
    if len(loader) == 0:
        channel.abort("Cannot process an empty MS data file")
    start_scan = loader.get_scan_by_time(start_time)
    if start_scan is None:
        start_scan = loader[0]

    if loader.has_ms1_scans() is False:
        extract_only_tandem_envelopes = False

    try:
        start_scan_id = loader._locate_ms1_scan(start_scan).id
    except IndexError:
        start_scan_id = start_scan.id

    end_scan = loader.get_scan_by_time(end_time)
    if end_scan is None:
        end_scan = loader[-1]
    try:
        end_scan_id = loader._locate_ms1_scan(end_scan).id
    except IndexError:
        end_scan_id = end_scan.id

    loader.reset()
    loader.make_iterator(grouped=True)

    first_batch = next(loader)
    # Guard against a batch with neither a precursor nor products,
    # which would otherwise leave is_profile unbound below.
    is_profile = False
    if first_batch.precursor is not None:
        is_profile = first_batch.precursor.is_profile
    elif first_batch.products:
        is_profile = first_batch.products[0].is_profile
    if is_profile:
        logger.info("Spectra are profile")
    else:
        logger.info("Spectra are centroided")

    logger.info("Resolving Sample Name")
    if name is None:
        name = os.path.splitext(os.path.basename(mzml_file))[0]

    name = validate_sample_run_name(None, database_connection, name)

    logger.info("Validating arguments")
    try:
        averagine = validate_averagine(averagine)
    except Exception:
        channel.abort("Could not validate MS1 Averagine %s" % averagine)

    try:
        msn_averagine = validate_averagine(msn_averagine)
    except Exception:
        channel.abort("Could not validate MSn Averagine %s" % msn_averagine)

    if is_profile:
        ms1_peak_picking_args = {
            "transforms": [
                ms_peak_picker.scan_filter.FTICRBaselineRemoval(
                    scale=ms1_background_reduction, window_length=2.),
                ms_peak_picker.scan_filter.SavitskyGolayFilter()
            ],
            'signal_to_noise_threshold': 1.0,
        }
        if ms1_background_reduction == 0:
            ms1_peak_picking_args['transforms'] = []
    else:
        ms1_peak_picking_args = {
            "transforms": [
                ms_peak_picker.scan_filter.FTICRBaselineRemoval(
                    scale=ms1_background_reduction, window_length=2.),
            ]
        }
        if ms1_background_reduction == 0:
            ms1_peak_picking_args['transforms'] = []

    if msn_background_reduction > 0:
        msn_peak_picking_args = {
            "transforms": [
                ms_peak_picker.scan_filter.FTICRBaselineRemoval(
                    scale=msn_background_reduction, window_length=2.),
            ]
        }
    else:
        msn_peak_picking_args = {'transforms': []}

    ms1_deconvolution_args = {
        "scorer": ms_deisotope.scoring.PenalizedMSDeconVFitter(score_threshold, 2.),
        "averagine": averagine,
        "charge_range": charge_range,
        "max_missed_peaks": missed_peaks,
        "truncate_after": SampleConsumer.MS1_ISOTOPIC_PATTERN_WIDTH,
        "ignore_below": SampleConsumer.MS1_IGNORE_BELOW
    }

    msn_deconvolution_args = {
        "scorer": ms_deisotope.scoring.MSDeconVFitter(msn_score_threshold),
        "averagine": msn_averagine,
        "charge_range": charge_range,
        "max_missed_peaks": msn_missed_peaks,
        "truncate_after": SampleConsumer.MSN_ISOTOPIC_PATTERN_WIDTH,
        "ignore_below": SampleConsumer.MSN_IGNORE_BELOW
    }

    consumer = SampleConsumer(
        mzml_file,
        ms1_peak_picking_args=ms1_peak_picking_args,
        ms1_deconvolution_args=ms1_deconvolution_args,
        msn_peak_picking_args=msn_peak_picking_args,
        msn_deconvolution_args=msn_deconvolution_args,
        storage_path=storage_path,
        sample_name=name,
        start_scan_id=start_scan_id,
        end_scan_id=end_scan_id,
        n_processes=n_processes,
        extract_only_tandem_envelopes=extract_only_tandem_envelopes,
        ms1_averaging=ms1_averaging,
        cache_handler_type=ThreadedMzMLScanCacheHandler)

    try:
        consumer.start()
        logger.info("Updating New Sample Run")
        reader = ProcessedMzMLDeserializer(storage_path, use_index=False)
        reader.read_index_file()
        sample_run_data = reader.sample_run
        if reader.extended_index.msn_ids:
            sample_type = "MS/MS Sample"
        else:
            sample_type = "MS Sample"
        sample_run = sample.SampleRunRecord(
            name=sample_run_data.name,
            uuid=sample_run_data.uuid,
            completed=True,
            path=storage_path,
            sample_type=sample_type,
            user_id=channel.user.id)
        channel.send(Message(sample_run.to_json(), "new-sample-run"))
    except Exception:
        channel.send(Message.traceback())
        channel.abort("An error occurred during preprocessing.")
Example #23
class GlycopeptideDatabaseSearchReportCreator(ReportCreatorBase):
    def __init__(self, database_path, analysis_id, stream=None, threshold=5,
                 mzml_path=None):
        super(GlycopeptideDatabaseSearchReportCreator, self).__init__(
            database_path, analysis_id, stream)
        self.set_template_loader(os.path.dirname(__file__))
        self.mzml_path = mzml_path
        self.scan_loader = None
        self.threshold = threshold
        self.use_dynamic_display_mode = 0
        self.analysis = self.session.query(serialize.Analysis).get(self.analysis_id)
        self._resolve_hypothesis_id()
        self._build_protein_index()
        self._make_scan_loader()
        self._glycopeptide_counter = 0
        if len(self.protein_index) > 10:
            self.use_dynamic_display_mode = 1

    def _resolve_hypothesis_id(self):
        self.hypothesis_id = self.analysis.hypothesis_id
        hypothesis = self.session.query(serialize.GlycopeptideHypothesis).get(self.hypothesis_id)
        if hypothesis is None:
            self.hypothesis_id = 1
            hypothesis = self.session.query(serialize.GlycopeptideHypothesis).get(
                self.hypothesis_id)
            if hypothesis is None:
                raise ValueError("Could not resolve Glycopeptide Hypothesis!")

    def prepare_environment(self):
        super(GlycopeptideDatabaseSearchReportCreator, self).prepare_environment()

    def _build_protein_index(self):
        hypothesis_id = self.hypothesis_id
        theoretical_counts = self.session.query(Protein.name, Protein.id, func.count(Glycopeptide.id)).join(
            Glycopeptide).group_by(Protein.id).filter(
            Protein.hypothesis_id == hypothesis_id).all()
        matched_counts = self.session.query(Protein.name, Protein.id, func.count(IdentifiedGlycopeptide.id)).join(
            Glycopeptide).join(
            IdentifiedGlycopeptide, IdentifiedGlycopeptide.structure_id == Glycopeptide.id).group_by(
            Protein.id).filter(
            IdentifiedGlycopeptide.ms2_score > self.threshold,
            IdentifiedGlycopeptide.analysis_id == self.analysis_id).all()
        listing = []
        index = {}
        for protein_name, protein_id, glycopeptide_count in theoretical_counts:
            index[protein_id] = {
                "protein_name": protein_name,
                "protein_id": protein_id,
            }
        for protein_name, protein_id, glycopeptide_count in matched_counts:
            entry = index[protein_id]
            entry['identified_glycopeptide_count'] = glycopeptide_count
            listing.append(entry)
        self.protein_index = sorted(listing, key=lambda x: x["identified_glycopeptide_count"], reverse=True)
        for protein_entry in self.protein_index:
            protein_entry['protein'] = self.session.query(Protein).get(protein_entry["protein_id"])
        return self.protein_index

    def _make_scan_loader(self):
        if self.mzml_path is not None:
            if not os.path.exists(self.mzml_path):
                raise IOError("No such file {}".format(self.mzml_path))
            self.scan_loader = ProcessedMzMLDeserializer(self.mzml_path)
        else:
            self.mzml_path = self.analysis.parameters['sample_path']
            if not os.path.exists(self.mzml_path):
                raise IOError((
                    "No such file {}. If {} was relocated, you may need to explicily pass the"
                    " corrected file path.").format(
                    self.mzml_path,
                    self.database_connection._original_connection))
            self.scan_loader = ProcessedMzMLDeserializer(self.mzml_path)

    def iterglycoproteins(self):
        n = float(len(self.protein_index))
        for i, row in enumerate(self.protein_index, 1):
            protein = row['protein']
            glycopeptides = self.session.query(
                IdentifiedGlycopeptide).join(Glycopeptide).join(
                Protein).filter(
                IdentifiedGlycopeptide.analysis_id == self.analysis_id,
                Glycopeptide.hypothesis_id == self.hypothesis_id,
                IdentifiedGlycopeptide.ms2_score > self.threshold,
                Protein.id == protein.id).all()
            glycoprotein = IdentifiedGlycoprotein(protein, glycopeptides)
            self.status_update(
                "Processing %s (%d/%d) %0.2f%%" % (
                    protein.name, i, n, (i / n * 100)))
            yield i, glycoprotein

    def site_specific_abundance_plots(self, glycoprotein):
        axes = OrderedDict()
        for glyco_type in glycoprotein.glycosylation_types:
            for site in sorted(glycoprotein.glycosylation_sites_for(glyco_type)):
                spanning_site = glycoprotein.site_map[glyco_type][site]
                if len(spanning_site) == 0:
                    continue
                bundle = BundledGlycanComposition.aggregate(spanning_site)
                if len(bundle) == 0:
                    continue
                ax = figax()
                AggregatedAbundanceArtist(
                    bundle, ax=ax, colorizer=glycan_colorizer_type_map[glyco_type]).draw()
                ax.set_title("%s Glycans\nat Site %d" % (glyco_type.name, site + 1,), fontsize=18)
                axes[site, glyco_type] = svguri_plot(ax, bbox_inches='tight')
        return axes

    def draw_glycoforms(self, glycoprotein):
        ax = figax()
        layout = GlycoformLayout(glycoprotein, glycoprotein.identified_glycopeptides, ax=ax)
        layout.draw()
        svg = layout.to_svg(scale=2.0, height_padding_scale=1.1)
        return svg

    def chromatogram_plot(self, glycopeptide):
        ax = figax()
        try:
            SmoothingChromatogramArtist(
                glycopeptide, ax=ax, label_peaks=False,
                colorizer=lambda x: "#48afd0").draw(legend=False)
            ax.set_xlabel("Time (Minutes)", fontsize=16)
            ax.set_ylabel("Relative Abundance", fontsize=16)
            return png_plot(ax, bbox_inches='tight', img_height='100%')
        except ValueError:
            return "<div style='text-align:center;'>No Chromatogram Found</div>"

    def spectrum_match_info(self, glycopeptide):
        matched_scans = []

        for solution_set in glycopeptide.spectrum_matches:

            best_solution = solution_set.best_solution()
            try:
                selected_solution = solution_set.solution_for(glycopeptide.structure)
            except KeyError:
                continue
            pass_threshold = abs(selected_solution.score - best_solution.score) < 1e-6

            if not pass_threshold:
                continue

            if isinstance(selected_solution.scan, SpectrumReference):
                scan = self.session.query(MSScan).filter(
                    MSScan.scan_id == selected_solution.scan.id,
                    MSScan.sample_run_id == self.analysis.sample_run_id).first().convert()
            else:
                scan = selected_solution.scan
            scan.score = selected_solution.score
            matched_scans.append(scan)

        spectrum_match_ref = max(glycopeptide.spectrum_matches, key=lambda x: x.score)
        scan_id = spectrum_match_ref.scan.scan_id
        scan = self.scan_loader.get_scan_by_id(scan_id)
        try:
            mass_shift = spectrum_match_ref[0].mass_shift
        except Exception:
            mass_shift = Unmodified
        if mass_shift.name != Unmodified.name:
            mass_shift = mass_shift.convert()
        else:
            mass_shift = Unmodified

        match = CoverageWeightedBinomialScorer.evaluate(
            scan, glycopeptide.structure.convert(),
            error_tolerance=self.analysis.parameters["fragment_error_tolerance"],
            mass_shift=mass_shift)
        specmatch_artist = TidySpectrumMatchAnnotator(match, ax=figax())
        specmatch_artist.draw(fontsize=10, pretty=True)
        annotated_match_ax = specmatch_artist.ax

        scan_title = scan.id
        if len(scan_title) > 60:
            scan_title = '\n'.join(textwrap.wrap(scan_title, 60))

        annotated_match_ax.set_title(scan_title, fontsize=18)
        annotated_match_ax.set_ylabel(annotated_match_ax.get_ylabel(), fontsize=16)
        annotated_match_ax.set_xlabel(annotated_match_ax.get_xlabel(), fontsize=16)

        sequence_logo_plot = glycopeptide_match_logo(match, ax=figax())
        xlim = list(sequence_logo_plot.get_xlim())
        xlim[0] += 1

        sequence_logo_plot.set_xlim(xlim[0], xlim[1])

        spectrum_plot = png_plot(
            annotated_match_ax, svg_width="100%", bbox_inches='tight', height=3 * 1.5,
            width=8 * 1.5,
            img_width="100%",
            patchless=True)
        logo_plot = png_plot(
            sequence_logo_plot,
            svg_width="100%",
            img_width="100%",
            xml_transform=scale_fix_xml_transform,
            bbox_inches='tight',
            height=2, width=6 * 1.5, patchless=True)
        return dict(
            spectrum_plot=spectrum_plot, logo_plot=logo_plot,
            precursor_mass_accuracy=match.precursor_mass_accuracy(),
            spectrum_match=match)

    def track_entry(self, glycopeptide):
        self._glycopeptide_counter += 1
        if self._glycopeptide_counter % 15 == 0:
            self.status_update(
                " ... %d glycopeptides handled" % (self._glycopeptide_counter,))
        return self._glycopeptide_counter

    def make_template_stream(self):
        template_obj = self.env.get_template("overview.templ")

        ads = serialize.AnalysisDeserializer(
            self.database_connection._original_connection,
            analysis_id=self.analysis_id)

        hypothesis = ads.analysis.hypothesis
        sample_run = ads.analysis.sample_run
        if self.use_dynamic_display_mode:
            self.status_update("Using dynamic display mode")
        template_stream = template_obj.stream(
            analysis=ads.analysis,
            hypothesis=hypothesis,
            sample_run=sample_run,
            protein_index=self.protein_index,
            glycoprotein_iterator=self.iterglycoproteins(),
            renderer=self,
            use_dynamic_display_mode=self.use_dynamic_display_mode)

        return template_stream
Example #24
def analyze_glycan_composition(database_connection,
                               sample_path,
                               hypothesis_identifier,
                               output_path,
                               analysis_name,
                               mass_shifts,
                               grouping_error_tolerance=1.5e-5,
                               mass_error_tolerance=1e-5,
                               scoring_model=None,
                               minimum_mass=500.,
                               smoothing_factor=None,
                               regularization_model=None,
                               combinatorial_mass_shift_limit=8,
                               channel=None,
                               **kwargs):
    if scoring_model is None:
        scoring_model = GeneralScorer

    database_connection = DatabaseBoundOperation(database_connection)

    if not os.path.exists(sample_path):
        channel.send(
            Message("Could not locate sample %r" % sample_path, "error"))
        return

    reader = ProcessedMzMLDeserializer(sample_path, use_index=False)
    sample_run = reader.sample_run

    try:
        hypothesis = get_by_name_or_id(database_connection, GlycanHypothesis,
                                       hypothesis_identifier)
    except Exception:
        channel.send(
            Message("Could not locate hypothesis %r" % hypothesis_identifier,
                    "error"))
        return

    if analysis_name is None:
        analysis_name = "%s @ %s" % (sample_run.name, hypothesis.name)
    analysis_name = validate_analysis_name(None, database_connection.session,
                                           analysis_name)

    try:
        mass_shift_out = []
        for mass_shift, multiplicity in mass_shifts:
            mass_shift_out.append(validate_mass_shift(mass_shift,
                                                      multiplicity))
        expanded = []
        expanded = MzMLGlycanChromatogramAnalyzer.expand_mass_shifts(
            dict(mass_shift_out), limit=combinatorial_mass_shift_limit)
        mass_shifts = expanded
    except Abort:
        channel.send(Message.traceback())
        return

    mass_shifts = expanded

    try:
        analyzer = MzMLGlycanChromatogramAnalyzer(
            database_connection._original_connection,
            hypothesis.id,
            sample_path=sample_path,
            output_path=output_path,
            mass_shifts=mass_shifts,
            mass_error_tolerance=mass_error_tolerance,
            grouping_error_tolerance=grouping_error_tolerance,
            scoring_model=scoring_model,
            analysis_name=analysis_name,
            minimum_mass=minimum_mass)
        analyzer.start()
        analysis = analyzer.analysis
        record = project_analysis.AnalysisRecord(
            name=analysis.name,
            id=analysis.id,
            uuid=analysis.uuid,
            path=output_path,
            analysis_type=analysis.analysis_type,
            hypothesis_uuid=analysis.hypothesis.uuid,
            hypothesis_name=analysis.hypothesis.name,
            sample_name=analysis.parameters['sample_name'],
            user_id=channel.user.id)
        channel.send(Message(record.to_json(), 'new-analysis'))
    except Exception:
        channel.send(Message.traceback())
        channel.abort("An error occurred during analysis.")
Example #25
class GlycopeptideDatabaseSearchReportCreator(ReportCreatorBase):
    def __init__(self,
                 database_path,
                 analysis_id,
                 stream=None,
                 threshold=5,
                 mzml_path=None):
        super(GlycopeptideDatabaseSearchReportCreator,
              self).__init__(database_path, analysis_id, stream)
        self.set_template_loader(os.path.dirname(__file__))
        self.mzml_path = mzml_path
        self.scan_loader = None
        self.threshold = threshold
        self.analysis = self.session.query(serialize.Analysis).get(
            self.analysis_id)
        self._resolve_hypothesis_id()
        self._build_protein_index()
        self._make_scan_loader()
        self._glycopeptide_counter = 0

    def _resolve_hypothesis_id(self):
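        """Resolve the hypothesis id recorded on the analysis, falling back
        to hypothesis 1 when the recorded id cannot be found.
        """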
        self.hypothesis_id = self.analysis.hypothesis_id
        hypothesis = self.session.query(serialize.GlycopeptideHypothesis).get(
            self.hypothesis_id)
        if hypothesis is None:
            self.hypothesis_id = 1
            hypothesis = self.session.query(
                serialize.GlycopeptideHypothesis).get(self.hypothesis_id)
            if hypothesis is None:
                raise ValueError("Could not resolve Glycopeptide Hypothesis!")

    def prepare_environment(self):
        super(GlycopeptideDatabaseSearchReportCreator,
              self).prepare_environment()

    def _build_protein_index(self):
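        """Build ``self.protein_index``: one entry per protein with at least
        one identified glycopeptide above the score threshold, sorted by
        identified glycopeptide count in descending order.
        """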
        hypothesis_id = self.hypothesis_id
        theoretical_counts = self.session.query(
            Protein.name, Protein.id,
            func.count(Glycopeptide.id)).join(Glycopeptide).group_by(
                Protein.id).filter(
                    Protein.hypothesis_id == hypothesis_id).all()
        matched_counts = self.session.query(
            Protein.name, Protein.id,
            func.count(IdentifiedGlycopeptide.id)).join(Glycopeptide).join(
                IdentifiedGlycopeptide,
                IdentifiedGlycopeptide.structure_id == Glycopeptide.id
            ).group_by(Protein.id).filter(
                IdentifiedGlycopeptide.ms2_score > self.threshold,
                IdentifiedGlycopeptide.analysis_id == self.analysis_id).all()
        listing = []
        index = {}
        for protein_name, protein_id, glycopeptide_count in theoretical_counts:
            index[protein_id] = {
                "protein_name": protein_name,
                "protein_id": protein_id,
            }
        for protein_name, protein_id, glycopeptide_count in matched_counts:
            entry = index[protein_id]
            entry['identified_glycopeptide_count'] = glycopeptide_count
            listing.append(entry)
        self.protein_index = sorted(
            listing,
            key=lambda x: x["identified_glycopeptide_count"],
            reverse=True)
        for protein_entry in self.protein_index:
            protein_entry['protein'] = self.session.query(Protein).get(
                protein_entry["protein_id"])
        return self.protein_index

    def _make_scan_loader(self):
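        """Open the processed mzML file backing this analysis, preferring an
        explicitly provided ``mzml_path`` over the ``sample_path`` recorded
        in the analysis parameters.
        """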
        if self.mzml_path is not None:
            if not os.path.exists(self.mzml_path):
                raise IOError("No such file {}".format(self.mzml_path))
            self.scan_loader = ProcessedMzMLDeserializer(self.mzml_path)
        else:
            self.mzml_path = self.analysis.parameters['sample_path']
            if not os.path.exists(self.mzml_path):
                raise IOError((
                    "No such file {}. If {} was relocated, you may need to explicitly pass"
                    " the corrected file path.").format(
                        self.mzml_path,
                        self.database_connection._original_connection))
            self.scan_loader = ProcessedMzMLDeserializer(self.mzml_path)

    def iterglycoproteins(self):
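        """Yield an :class:`IdentifiedGlycoprotein` for each indexed protein,
        assembled from its identified glycopeptides above the score
        threshold, emitting a status update per protein.
        """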
        n = float(len(self.protein_index))
        for i, row in enumerate(self.protein_index, 1):
            protein = row['protein']
            glycopeptides = self.session.query(IdentifiedGlycopeptide).join(
                Glycopeptide).join(Protein).filter(
                    IdentifiedGlycopeptide.analysis_id == self.analysis_id,
                    Glycopeptide.hypothesis_id == self.hypothesis_id,
                    IdentifiedGlycopeptide.ms2_score > self.threshold,
                    Protein.id == protein.id).all()
            glycoprotein = IdentifiedGlycoprotein(protein, glycopeptides)
            glycoprotein.id = protein.id
            self.status_update("Processing %s (%d/%d) %0.2f%%" %
                               (protein.name, i, n, (i / n * 100)))
            yield glycoprotein

    def site_specific_abundance_plots(self, glycoprotein):
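        """Plot aggregated glycan abundances for each occupied glycosylation
        site, returning SVG plots keyed by ``(site, glycosylation type)``.
        """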
        axes = OrderedDict()
        for glyco_type in glycoprotein.glycosylation_types:
            for site in sorted(
                    glycoprotein.glycosylation_sites_for(glyco_type)):
                spanning_site = glycoprotein.site_map[glyco_type][site]
                if len(spanning_site) == 0:
                    continue
                bundle = BundledGlycanComposition.aggregate(spanning_site)
                ax = figax()
                AggregatedAbundanceArtist(
                    bundle,
                    ax=ax,
                    colorizer=glycan_colorizer_type_map[glyco_type]).draw()
                ax.set_title(
                    "%s Glycans\nat Site %d" % (glyco_type.name, site),
                    fontsize=18)
                axes[site, glyco_type] = svguri_plot(ax, bbox_inches='tight')
        return axes

    def draw_glycoforms(self, glycoprotein):
        ax = figax()
        layout = GlycoformLayout(glycoprotein,
                                 glycoprotein.identified_glycopeptides,
                                 ax=ax)
        layout.draw()
        svg = layout.to_svg(scale=2.0, height_padding_scale=1.1)
        return svg

    def chromatogram_plot(self, glycopeptide):
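        """Render the glycopeptide's chromatogram as an embedded PNG, or a
        placeholder ``<div>`` when no chromatogram can be drawn.
        """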
        ax = figax()
        try:
            SmoothingChromatogramArtist(
                glycopeptide,
                ax=ax,
                label_peaks=False,
                colorizer=lambda x: "#48afd0").draw(legend=False)
            ax.set_xlabel("Time (Minutes)", fontsize=16)
            ax.set_ylabel("Relative Abundance", fontsize=16)
            return png_plot(ax, bbox_inches='tight', img_height='100%')
        except ValueError:
            return "<div style='text-align:center;'>No Chromatogram Found</div>"

    def spectrum_match_info(self, glycopeptide):
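        """Re-score the best spectrum match for ``glycopeptide`` with
        :class:`CoverageWeightedBinomialScorer` and return rendered spectrum
        and sequence logo plots along with the precursor mass accuracy.
        """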
        matched_scans = []

        for solution_set in glycopeptide.spectrum_matches:
            best_solution = solution_set.best_solution()
            try:
                selected_solution = solution_set.solution_for(
                    glycopeptide.structure)
            except KeyError:
                continue
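            # Keep only matches whose selected solution ties the best
            # solution's score within floating point tolerance.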
            pass_threshold = abs(selected_solution.score -
                                 best_solution.score) < 1e-6

            if not pass_threshold:
                continue

            if isinstance(selected_solution.scan, SpectrumReference):
                scan = self.session.query(MSScan).filter(
                    MSScan.scan_id == selected_solution.scan.id,
                    MSScan.sample_run_id ==
                    self.analysis.sample_run_id).first().convert()
            else:
                scan = selected_solution.scan
            scan.score = selected_solution.score
            matched_scans.append(scan)

        spectrum_match_ref = max(glycopeptide.spectrum_matches,
                                 key=lambda x: x.score)
        scan_id = spectrum_match_ref.scan.scan_id
        scan = self.scan_loader.get_scan_by_id(scan_id)

        match = CoverageWeightedBinomialScorer.evaluate(
            scan,
            glycopeptide.structure.convert(),
            error_tolerance=self.analysis.parameters["fragment_error_tolerance"])
        specmatch_artist = SpectrumMatchAnnotator(match, ax=figax())
        specmatch_artist.draw(fontsize=10, pretty=True)
        annotated_match_ax = specmatch_artist.ax

        annotated_match_ax.set_title("%s\n" % (scan.id, ), fontsize=18)
        annotated_match_ax.set_ylabel(annotated_match_ax.get_ylabel(),
                                      fontsize=16)
        annotated_match_ax.set_xlabel(annotated_match_ax.get_xlabel(),
                                      fontsize=16)

        sequence_logo_plot = glycopeptide_match_logo(match, ax=figax())
        # Trim the leftmost unit from the logo's x-range.
        xlim = list(sequence_logo_plot.get_xlim())
        sequence_logo_plot.set_xlim(xlim[0] + 1, xlim[1])

        spectrum_plot = png_plot(annotated_match_ax,
                                 svg_width="100%",
                                 bbox_inches='tight',
                                 height=3 * 1.5,
                                 width=8 * 1.5,
                                 img_width="100%",
                                 patchless=True)
        logo_plot = png_plot(sequence_logo_plot,
                             svg_width="100%",
                             img_width="100%",
                             xml_transform=scale_fix_xml_transform,
                             bbox_inches='tight',
                             height=2,
                             width=6 * 1.5,
                             patchless=True)
        return dict(spectrum_plot=spectrum_plot,
                    logo_plot=logo_plot,
                    precursor_mass_accuracy=match.precursor_mass_accuracy(),
                    spectrum_match=match)

    def track_entry(self, glycopeptide):
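        """Count processed glycopeptides, emitting a status update every 15
        entries and returning the running total.
        """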
        self._glycopeptide_counter += 1
        if self._glycopeptide_counter % 15 == 0:
            self.status_update(" ... %d glycopeptides handled" %
                               (self._glycopeptide_counter, ))
        return self._glycopeptide_counter

    def make_template_stream(self):
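        """Bind the ``overview.templ`` template to this analysis's context
        and return the resulting Jinja2 template stream.
        """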
        template_obj = self.env.get_template("overview.templ")

        ads = serialize.AnalysisDeserializer(
            self.database_connection._original_connection,
            analysis_id=self.analysis_id)

        hypothesis = ads.analysis.hypothesis
        sample_run = ads.analysis.sample_run

        template_stream = template_obj.stream(
            analysis=ads.analysis,
            hypothesis=hypothesis,
            sample_run=sample_run,
            protein_index=self.protein_index,
            glycoprotein_iterator=self.iterglycoproteins(),
            renderer=self,
        )

        return template_stream
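

# A minimal usage sketch, not part of the original source: "analysis.db",
# the analysis id, and "sample.mzML" are placeholder values, and it assumes
# the base class accepts the default ``stream=None``. Jinja2 template
# streams support ``dump()`` for writing to an open file handle.
if __name__ == "__main__":
    creator = GlycopeptideDatabaseSearchReportCreator(
        "analysis.db", analysis_id=1, threshold=5, mzml_path="sample.mzML")
    creator.prepare_environment()
    with open("glycopeptide-report.html", "w") as handle:
        creator.make_template_stream().dump(handle)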