def test_profiler(self):
    """Run the glycan chromatogram analyzer end-to-end on the AGP glycomics
    sample and spot-check the final scores of several known compositions.
    """
    db_file = self._make_hypothesis()
    output_file = self.setup_tempfile("")
    task = MzMLGlycanChromatogramAnalyzer(
        db_file, 1, agp_glycomics_mzml, output_file,
        analysis_name="test-analysis",
        scoring_model=GeneralScorer)
    task.start()
    # The completed analysis must have been written to disk.
    self.assertTrue(os.path.exists(output_file))
    ads = AnalysisDeserializer(output_file)
    gcs = ads.load_glycan_composition_chromatograms()
    # The hypothesis database is no longer needed once results are loaded.
    self.clear_file(db_file)
    # Reference sub-scores recorded for this composition:
    # 'spacing_fit': 0.96367957815527916, 'isotopic_fit': 0.99366937970680247,
    # 'line_score': 0.99780414736388745, 'charge_count': 0.9365769766604084
    self.confirm_score(gcs, "{Fuc:1; Hex:7; HexNAc:6; Neu5Ac:4}", 17.1458)
    # 'spacing_fit': 0.96123524755239487, 'isotopic_fit': 0.97935840584492162,
    # 'line_score': 0.99562733579066764, 'charge_count': 0.7368321292716115
    self.confirm_score(gcs, "{Hex:8; HexNAc:7; Neu5Ac:3}", 13.5279)
    # 'spacing_fit': 0.94565181061625481, 'isotopic_fit': 0.99074210231338733,
    # 'line_score': 0.98925755528448378, 'charge_count': 0.999773289306269
    self.confirm_score(gcs, "{Hex:7; HexNAc:6; Neu5Ac:4}", 20.4438)
    # 'spacing_fit': 0.95567017048597336, 'isotopic_fit': 0.98274665306540443,
    # 'line_score': 0.99706887771172914, 'charge_count': 0.7604540961453831
    self.confirm_score(gcs, "{Fuc:2; Hex:6; HexNAc:5; Neu5Ac:3}", 14.0977)
    ads.close()
    self.clear_file(output_file)
def test_profiler(self):
    """Run the glycan chromatogram analyzer end-to-end on the AGP glycomics
    sample, check the expected number of chromatograms, and spot-check the
    final scores of several known compositions.
    """
    db_file = self._make_hypothesis()
    output_file = self.setup_tempfile("")
    task = MzMLGlycanChromatogramAnalyzer(db_file, 1, agp_glycomics_mzml,
                                          output_file,
                                          analysis_name="test-analysis",
                                          scoring_model=GeneralScorer)
    task.start()
    # The completed analysis must have been written to disk.
    self.assertTrue(os.path.exists(output_file))
    ads = AnalysisDeserializer(output_file)
    gcs = ads.load_glycan_composition_chromatograms()
    # Regression check on how many chromatograms were matched.
    self.assertEqual(len(gcs), 23)
    self.clear_file(db_file)
    # Reference sub-scores recorded for this composition:
    # 'spacing_fit': 0.96367957815527916, 'isotopic_fit': 0.99366937970680247,
    # 'line_score': 0.99780414736388745, 'charge_count': 0.9365769766604084
    self.confirm_score(gcs, "{Fuc:1; Hex:7; HexNAc:6; Neu5Ac:4}", 17.1458)
    # 'spacing_fit': 0.96123524755239487, 'isotopic_fit': 0.97935840584492162,
    # 'line_score': 0.99562733579066764, 'charge_count': 0.7368321292716115
    self.confirm_score(gcs, "{Hex:8; HexNAc:7; Neu5Ac:3}", 13.5279)
    # 'spacing_fit': 0.94565181061625481, 'isotopic_fit': 0.99074210231338733,
    # 'line_score': 0.98925755528448378, 'charge_count': 0.999773289306269
    self.confirm_score(gcs, "{Hex:7; HexNAc:6; Neu5Ac:4}", 20.4438)
    # 'spacing_fit': 0.95567017048597336, 'isotopic_fit': 0.98274665306540443,
    # 'line_score': 0.99706887771172914, 'charge_count': 0.7604540961453831
    self.confirm_score(gcs, "{Fuc:2; Hex:6; HexNAc:5; Neu5Ac:3}", 14.0977)
    ads.close()
    self.clear_file(output_file)
def test_smoothing_profiler(self):
    """Run the analyzer with grid-search network regularization, verify the
    fitted network parameters, then re-run using those parameters as a fixed
    regularization model and verify the resulting scores.
    """
    db_file = self._make_hypothesis()
    output_file = self.setup_tempfile("")
    task = MzMLGlycanChromatogramAnalyzer(db_file, 1, agp_glycomics_mzml,
                                          output_file, regularize="grid",
                                          analysis_name="test-analysis",
                                          scoring_model=GeneralScorer)
    task.start()
    # Kept for ad-hoc profiling of the analysis run:
    # import cProfile
    # prof = cProfile.Profile()
    # prof.runcall(task.start)
    # prof.print_stats()
    # prof.dump_stats('smooth_profile.pstats')
    self.assertTrue(os.path.exists(output_file))
    ads = AnalysisDeserializer(output_file, analysis_id=1)
    gcs = ads.load_glycan_composition_chromatograms()
    self.assertEqual(len(gcs), 23)
    self.confirm_score(gcs, "{Fuc:1; Hex:7; HexNAc:6; Neu5Ac:4}", 16.1425)
    self.confirm_score(gcs, "{Hex:8; HexNAc:7; Neu5Ac:3}", 8.8510)
    self.confirm_score(gcs, "{Hex:7; HexNAc:6; Neu5Ac:4}", 16.6722)
    network_params = ads.analysis.parameters['network_parameters']
    # Expected tau vector produced by the grid search (regression values).
    tau = [
        0.0, 12.173488161057854, 16.042106463675424, 0.0,
        22.061954223206591, 0.0, 13.928596053020485, 0.0,
        9.4348332520855713, 0.0, 0.0, 0.0, 0.0, 0.0
    ]
    for a, b in zip(tau, network_params.tau):
        self.assertAlmostEqual(a, b, 3)
    ads.close()
    self.clear_file(output_file)
    # Second pass: reuse the fitted model with a fixed smoothing factor
    # instead of searching the grid again.
    task = MzMLGlycanChromatogramAnalyzer(
        db_file, 1, agp_glycomics_mzml, output_file, regularize=0.2,
        regularization_model=network_params,
        analysis_name="test-analysis", scoring_model=GeneralScorer)
    task.start()
    ads = AnalysisDeserializer(output_file, analysis_id=1)
    gcs = ads.load_glycan_composition_chromatograms()
    self.assertEqual(len(gcs), 23)
    self.confirm_score(gcs, "{Fuc:1; Hex:7; HexNAc:6; Neu5Ac:4}", 16.7795)
    self.confirm_score(gcs, "{Hex:8; HexNAc:7; Neu5Ac:3}", 10.6734)
    self.confirm_score(gcs, "{Hex:7; HexNAc:6; Neu5Ac:4}", 18.3360)
    self.confirm_score(gcs, "{Fuc:2; Hex:6; HexNAc:5; Neu5Ac:3}", 15.9628)
    network_params = ads.analysis.parameters['network_parameters']
    # The stored parameters should round-trip unchanged through the re-run.
    for a, b in zip(tau, network_params.tau):
        self.assertAlmostEqual(a, b, 3)
    ads.close()
    self.clear_file(output_file)
    self.clear_file(db_file)
def test_profiler(self):
    """Run the glycan chromatogram analyzer end-to-end on the AGP glycomics
    sample and spot-check the final scores of several known compositions.
    """
    db_file = self._make_hypothesis()
    output_file = self.setup_tempfile("")
    task = MzMLGlycanChromatogramAnalyzer(
        db_file, 1, agp_glycomics_mzml, output_file,
        analysis_name="test-analysis",
        scoring_model=GeneralScorer)
    task.start()
    # The completed analysis must have been written to disk.
    self.assertTrue(os.path.exists(output_file))
    ads = AnalysisDeserializer(output_file)
    gcs = ads.load_glycan_composition_chromatograms()
    self.clear_file(db_file)
    self.confirm_score(gcs, "{Fuc:1; Hex:7; HexNAc:6; Neu5Ac:4}", 16.97)
    self.confirm_score(gcs, "{Hex:8; HexNAc:7; Neu5Ac:3}", 13.47)
    self.confirm_score(gcs, "{Hex:7; HexNAc:6; Neu5Ac:4}", 20.44)
    self.confirm_score(gcs, "{Fuc:2; Hex:6; HexNAc:5; Neu5Ac:3}", 14.11)
    ads.close()
    self.clear_file(output_file)
def test_smoothing_profiler(self):
    """Run the analyzer with grid-search network regularization, verify the
    fitted network parameters, then re-run using those parameters as a fixed
    regularization model and verify the resulting scores.
    """
    db_file = self._make_hypothesis()
    output_file = self.setup_tempfile("")
    task = MzMLGlycanChromatogramAnalyzer(
        db_file, 1, agp_glycomics_mzml, output_file, regularize="grid",
        analysis_name="test-analysis", scoring_model=GeneralScorer)
    task.start()
    # Kept for ad-hoc profiling of the analysis run:
    # import cProfile
    # prof = cProfile.Profile()
    # prof.runcall(task.start)
    # prof.print_stats()
    # prof.dump_stats('smooth_profile.pstats')
    self.assertTrue(os.path.exists(output_file))
    ads = AnalysisDeserializer(output_file, analysis_id=1)
    gcs = ads.load_glycan_composition_chromatograms()
    self.confirm_score(gcs, "{Fuc:1; Hex:7; HexNAc:6; Neu5Ac:4}", 16.1425)
    self.confirm_score(gcs, "{Hex:8; HexNAc:7; Neu5Ac:3}", 8.8510)
    self.confirm_score(gcs, "{Hex:7; HexNAc:6; Neu5Ac:4}", 16.6722)
    network_params = ads.analysis.parameters['network_parameters']
    # Expected tau vector produced by the grid search (regression values).
    tau = [0., 11.77485721, 15.99541137, 0., 21.96431573, 0.,
           13.94378948, 0., 9.32841618, 0., 0., 0., 0., 0.]
    for a, b in zip(tau, network_params.tau):
        self.assertAlmostEqual(a, b, 3)
    ads.close()
    self.clear_file(output_file)
    # Second pass: reuse the fitted model with a fixed smoothing factor
    # instead of searching the grid again.
    task = MzMLGlycanChromatogramAnalyzer(
        db_file, 1, agp_glycomics_mzml, output_file, regularize=0.2,
        regularization_model=network_params,
        analysis_name="test-analysis", scoring_model=GeneralScorer)
    task.start()
    ads = AnalysisDeserializer(output_file, analysis_id=1)
    gcs = ads.load_glycan_composition_chromatograms()
    self.confirm_score(gcs, "{Fuc:1; Hex:7; HexNAc:6; Neu5Ac:4}", 16.7795)
    self.confirm_score(gcs, "{Hex:8; HexNAc:7; Neu5Ac:3}", 10.6734)
    self.confirm_score(gcs, "{Hex:7; HexNAc:6; Neu5Ac:4}", 18.3360)
    self.confirm_score(gcs, "{Fuc:2; Hex:6; HexNAc:5; Neu5Ac:3}", 15.9628)
    network_params = ads.analysis.parameters['network_parameters']
    # The stored parameters should round-trip unchanged through the re-run.
    for a, b in zip(tau, network_params.tau):
        self.assertAlmostEqual(a, b, 3)
    ads.close()
    self.clear_file(output_file)
    self.clear_file(db_file)
def search_glycan(context, database_connection, sample_path, hypothesis_identifier,
                  analysis_name, mass_shifts, grouping_error_tolerance=1.5e-5,
                  mass_error_tolerance=1e-5, minimum_mass=500., scoring_model=None,
                  regularize=None, regularization_model_path=None, network_path=None,
                  output_path=None, scoring_model_features=None, delta_rt=0.5,
                  export=None, interact=False, require_msms_signature=0.0,
                  msn_mass_error_tolerance=2e-5, mass_shift_combination_limit=None,
                  processes=4):
    """Identify glycan compositions from preprocessed LC-MS data, stored in mzML
    format.

    Resolves the hypothesis and sample, builds an
    :class:`MzMLGlycanChromatogramAnalyzer`, runs it, and then handles any
    requested exports (csv, html, glycan-list, model).
    """
    # Only synthesize an output path when the caller is not going to inspect
    # the results interactively.
    if output_path is None and not interact:
        output_path = make_analysis_output_path("glycan")
    if scoring_model is None:
        scoring_model = GeneralScorer
    if mass_shift_combination_limit is None:
        mass_shift_combination_limit = 8
    if scoring_model_features:
        for feature in scoring_model_features:
            scoring_model.add_feature(validate_ms1_feature_name(feature))
    # Optional pre-fitted regularization model loaded from disk.
    if regularization_model_path is not None:
        with open(regularization_model_path, 'r') as mod_file:
            regularization_model = GridPointSolution.load(mod_file)
    else:
        regularization_model = None
    # Optional externally supplied composition network.
    if network_path is not None:
        with open(network_path, 'r') as netfile:
            network = GraphReader(netfile).network
    else:
        network = None
    database_connection = DatabaseBoundOperation(database_connection)
    ms_data = ProcessedMzMLDeserializer(sample_path, use_index=False)
    sample_run = ms_data.sample_run
    try:
        hypothesis = get_by_name_or_id(database_connection, GlycanHypothesis,
                                       hypothesis_identifier)
    except Exception:
        click.secho("Could not locate a Glycan Hypothesis with identifier %r" %
                    hypothesis_identifier, fg='yellow')
        raise click.Abort()
    if analysis_name is None:
        analysis_name = "%s @ %s" % (sample_run.name, hypothesis.name)
    analysis_name = validate_analysis_name(context, database_connection.session,
                                           analysis_name)
    # Validate each (mass shift, multiplicity) pair before expansion.
    mass_shifts = [
        validate_mass_shift(mass_shift, multiplicity)
        for mass_shift, multiplicity in mass_shifts
    ]
    expanded = []  # NOTE(review): dead store — immediately reassigned below.
    expanded = MzMLGlycanChromatogramAnalyzer.expand_mass_shifts(
        dict(mass_shifts), limit=mass_shift_combination_limit)
    mass_shifts = expanded
    click.secho("Preparing analysis of %s by %s" % (sample_run.name,
                                                    hypothesis.name), fg='cyan')
    analyzer = MzMLGlycanChromatogramAnalyzer(
        database_connection._original_connection, hypothesis.id,
        sample_path=sample_path, output_path=output_path,
        mass_shifts=mass_shifts, mass_error_tolerance=mass_error_tolerance,
        msn_mass_error_tolerance=msn_mass_error_tolerance,
        grouping_error_tolerance=grouping_error_tolerance,
        scoring_model=scoring_model, minimum_mass=minimum_mass,
        regularize=regularize, regularization_model=regularization_model,
        network=network, analysis_name=analysis_name, delta_rt=delta_rt,
        require_msms_signature=require_msms_signature, n_processes=processes)
    analyzer.display_header()
    analyzer.start()
    if interact:
        # Drop into an IPython shell when available so the user can poke at
        # the analyzer state.
        try:
            import IPython
            click.secho(fmt_msg("Beginning Interactive Session..."), fg='cyan')
            IPython.embed()
        except ImportError:
            click.secho(fmt_msg("Interactive Session Not Supported"), fg='red')
    if export:
        for export_type in set(export):
            click.echo(fmt_msg("Handling Export: %s" % (export_type, )))
            if export_type == 'csv':
                from glycan_profiling.cli.export import glycan_composition_identification
                base = os.path.splitext(output_path)[0]
                export_path = "%s-glycan-chromatograms.csv" % (base, )
                context.invoke(glycan_composition_identification,
                               database_connection=output_path,
                               analysis_identifier=analyzer.analysis_id,
                               output_path=export_path)
            elif export_type == 'html':
                from glycan_profiling.cli.export import glycan_composition_identification
                base = os.path.splitext(output_path)[0]
                export_path = "%s-report.html" % (base, )
                context.invoke(glycan_composition_identification,
                               database_connection=output_path,
                               analysis_identifier=analyzer.analysis_id,
                               output_path=export_path, report=True)
            elif export_type == 'glycan-list':
                from glycan_profiling.cli.export import glycan_hypothesis
                base = os.path.splitext(output_path)[0]
                # NOTE(review): same filename pattern as the 'csv' export —
                # confirm this collision is intended.
                export_path = "%s-glycan-chromatograms.csv" % (base, )
                context.invoke(glycan_hypothesis,
                               database_connection=output_path,
                               hypothesis_identifier=analyzer.hypothesis_id,
                               output_path=export_path, importable=True)
            elif export_type == "model":
                base = os.path.splitext(output_path)[0]
                export_path = "%s-regularization-parameters.txt" % (base, )
                params = analyzer.analysis.parameters.get("network_parameters")
                if params is None:
                    click.secho(
                        "No parameters were fitted, skipping \"model\"",
                        fg='red')
                else:
                    with open(export_path, 'w') as fp:
                        params.dump(fp)
            else:
                click.secho("Unrecognized Export: %s" % (export_type, ),
                            fg='yellow')
def search_glycopeptide_multipart(context, database_connection, decoy_database_connection,
                                  sample_path, target_hypothesis_identifier=1,
                                  decoy_hypothesis_identifier=1, analysis_name=None,
                                  output_path=None, grouping_error_tolerance=1.5e-5,
                                  mass_error_tolerance=1e-5, msn_mass_error_tolerance=2e-5,
                                  psm_fdr_threshold=0.05, peak_shape_scoring_model=None,
                                  tandem_scoring_model=None, glycan_score_threshold=1.0,
                                  memory_database_index=False, save_intermediate_results=None,
                                  processes=4, workload_size=500, mass_shifts=None,
                                  export=None, maximum_mass=float('inf'),
                                  isotope_probing_range=3, fdr_estimation_strategy=None,
                                  glycoproteome_smoothing_model=None, mapping_processes=1,
                                  durable_fucose=False):
    """Search preprocessed LC-MS/MS data for glycopeptides against separate
    target and decoy hypotheses, then handle any requested exports.
    """
    # NOTE(review): mapping_processes and peak_shape_scoring_model are
    # accepted but do not appear to be used in this body — confirm.
    if output_path is None:
        output_path = make_analysis_output_path("glycopeptide")
    if fdr_estimation_strategy is None:
        fdr_estimation_strategy = GlycopeptideFDREstimationStrategy.multipart_gamma_gaussian_mixture
    else:
        # Accept a strategy name and resolve it against the enum.
        fdr_estimation_strategy = GlycopeptideFDREstimationStrategy[
            fdr_estimation_strategy]
    if tandem_scoring_model is None:
        tandem_scoring_model = "log_intensity"
    database_connection = DatabaseBoundOperation(database_connection)
    decoy_database_connection = DatabaseBoundOperation(
        decoy_database_connection)
    ms_data = ProcessedMzMLDeserializer(sample_path, use_index=False)
    sample_run = ms_data.sample_run
    try:
        target_hypothesis = get_by_name_or_id(database_connection,
                                              GlycopeptideHypothesis,
                                              target_hypothesis_identifier)
    except Exception:
        click.secho(
            "Could not locate Target Glycopeptide Hypothesis with identifier %r" %
            target_hypothesis_identifier, fg='yellow')
        raise click.Abort()
    try:
        decoy_hypothesis = get_by_name_or_id(decoy_database_connection,
                                             GlycopeptideHypothesis,
                                             decoy_hypothesis_identifier)
    except Exception:
        click.secho(
            "Could not locate Decoy Glycopeptide Hypothesis with identifier %r" %
            decoy_hypothesis_identifier, fg='yellow')
        raise click.Abort()
    tandem_scoring_model = validate_glycopeptide_tandem_scoring_function(
        context, tandem_scoring_model)
    # Validate each (mass shift, multiplicity) pair before expansion.
    mass_shifts = [
        validate_mass_shift(mass_shift, multiplicity)
        for mass_shift, multiplicity in mass_shifts
    ]
    expanded = []  # NOTE(review): dead store — immediately reassigned below.
    expanded = MzMLGlycanChromatogramAnalyzer.expand_mass_shifts(
        dict(mass_shifts), crossproduct=False)
    mass_shifts = expanded
    if analysis_name is None:
        analysis_name = "%s @ %s" % (sample_run.name, target_hypothesis.name)
    analysis_name = validate_analysis_name(context,
                                           database_connection.session,
                                           analysis_name)
    click.secho("Preparing analysis of %s by %s" % (
        sample_run.name, target_hypothesis.name), fg='cyan')
    analyzer = MultipartGlycopeptideLCMSMSAnalyzer(
        database_connection._original_connection,
        decoy_database_connection._original_connection,
        target_hypothesis.id,
        decoy_hypothesis.id,
        sample_path,
        output_path,
        analysis_name=analysis_name,
        grouping_error_tolerance=grouping_error_tolerance,
        mass_error_tolerance=mass_error_tolerance,
        msn_mass_error_tolerance=msn_mass_error_tolerance,
        psm_fdr_threshold=psm_fdr_threshold,
        tandem_scoring_model=tandem_scoring_model,
        glycan_score_threshold=glycan_score_threshold,
        mass_shifts=mass_shifts,
        n_processes=processes,
        spectrum_batch_size=workload_size,
        maximum_mass=maximum_mass,
        probing_range_for_missing_precursors=isotope_probing_range,
        use_memory_database=memory_database_index,
        fdr_estimation_strategy=fdr_estimation_strategy,
        glycosylation_site_models_path=glycoproteome_smoothing_model,
        fragile_fucose=not durable_fucose)
    analyzer.display_header()
    result = analyzer.start()
    gps, unassigned, target_decoy_set = result[:3]
    if save_intermediate_results is not None:
        analyzer.log("Saving Intermediate Results")
        with open(save_intermediate_results, 'wb') as handle:
            pickle.dump((target_decoy_set, gps), handle)
    # Release the large result collections before running exports.
    del gps
    del unassigned
    del target_decoy_set
    if export:
        for export_type in set(export):
            click.echo(fmt_msg("Handling Export: %s" % (export_type, )))
            if export_type == 'csv':
                from glycan_profiling.cli.export import glycopeptide_identification
                base = os.path.splitext(output_path)[0]
                export_path = "%s-glycopeptides.csv" % (base, )
                context.invoke(glycopeptide_identification,
                               database_connection=output_path,
                               analysis_identifier=analyzer.analysis_id,
                               output_path=export_path)
            elif export_type == 'psm-csv':
                from glycan_profiling.cli.export import glycopeptide_spectrum_matches
                base = os.path.splitext(output_path)[0]
                export_path = "%s-glycopeptide-spectrum-matches.csv" % (base, )
                context.invoke(glycopeptide_spectrum_matches,
                               database_connection=output_path,
                               analysis_identifier=analyzer.analysis_id,
                               output_path=export_path)
            elif export_type == 'html':
                from glycan_profiling.cli.export import glycopeptide_identification
                base = os.path.splitext(output_path)[0]
                export_path = "%s-report.html" % (base, )
                context.invoke(glycopeptide_identification,
                               database_connection=output_path,
                               analysis_identifier=analyzer.analysis_id,
                               output_path=export_path, report=True)
def search_glycopeptide(context, database_connection, sample_path, hypothesis_identifier,
                        analysis_name, output_path=None, grouping_error_tolerance=1.5e-5,
                        mass_error_tolerance=1e-5, msn_mass_error_tolerance=2e-5,
                        psm_fdr_threshold=0.05, peak_shape_scoring_model=None,
                        tandem_scoring_model=None, oxonium_threshold=0.15,
                        save_intermediate_results=None, processes=4, workload_size=500,
                        mass_shifts=None, export=None, use_peptide_mass_filter=False,
                        maximum_mass=float('inf'), decoy_database_connection=None,
                        fdr_correction='auto', isotope_probing_range=3,
                        permute_decoy_glycan_fragments=False):
    """Identify glycopeptide sequences from processed LC-MS/MS data

    Chooses between a single-database analyzer and a comparison analyzer
    depending on whether ``decoy_database_connection`` is supplied, runs it,
    and then handles any requested exports (csv, psm-csv, html).
    """
    if output_path is None:
        output_path = make_analysis_output_path("glycopeptide")
    if tandem_scoring_model is None:
        tandem_scoring_model = CoverageWeightedBinomialScorer
    database_connection = DatabaseBoundOperation(database_connection)
    ms_data = ProcessedMzMLDeserializer(sample_path, use_index=False)
    sample_run = ms_data.sample_run
    try:
        hypothesis = get_by_name_or_id(database_connection,
                                       GlycopeptideHypothesis,
                                       hypothesis_identifier)
    except Exception:
        click.secho(
            "Could not locate a Glycopeptide Hypothesis with identifier %r" %
            hypothesis_identifier, fg='yellow')
        raise click.Abort()
    tandem_scoring_model = validate_glycopeptide_tandem_scoring_function(
        context, tandem_scoring_model)
    # Validate each (mass shift, multiplicity) pair before expansion.
    mass_shifts = [
        validate_mass_shift(mass_shift, multiplicity)
        for mass_shift, multiplicity in mass_shifts
    ]
    expanded = []  # NOTE(review): dead store — immediately reassigned below.
    expanded = MzMLGlycanChromatogramAnalyzer.expand_mass_shifts(
        dict(mass_shifts), crossproduct=False)
    mass_shifts = expanded
    if analysis_name is None:
        analysis_name = "%s @ %s" % (sample_run.name, hypothesis.name)
    analysis_name = validate_analysis_name(context,
                                           database_connection.session,
                                           analysis_name)
    click.secho("Preparing analysis of %s by %s" % (sample_run.name,
                                                    hypothesis.name), fg='cyan')
    if decoy_database_connection is None:
        analyzer = MzMLGlycopeptideLCMSMSAnalyzer(
            database_connection._original_connection,
            sample_path=sample_path,
            hypothesis_id=hypothesis.id,
            analysis_name=analysis_name,
            output_path=output_path,
            grouping_error_tolerance=grouping_error_tolerance,
            mass_error_tolerance=mass_error_tolerance,
            msn_mass_error_tolerance=msn_mass_error_tolerance,
            psm_fdr_threshold=psm_fdr_threshold,
            peak_shape_scoring_model=peak_shape_scoring_model,
            tandem_scoring_model=tandem_scoring_model,
            oxonium_threshold=oxonium_threshold,
            n_processes=processes,
            spectrum_batch_size=workload_size,
            mass_shifts=mass_shifts,
            use_peptide_mass_filter=use_peptide_mass_filter,
            maximum_mass=maximum_mass,
            probing_range_for_missing_precursors=isotope_probing_range,
            permute_decoy_glycans=permute_decoy_glycan_fragments)
    else:
        # An explicit decoy database was supplied: use the comparison
        # analyzer, which also takes the FDR correction setting.
        analyzer = MzMLComparisonGlycopeptideLCMSMSAnalyzer(
            database_connection._original_connection,
            decoy_database_connection,
            sample_path=sample_path,
            hypothesis_id=hypothesis.id,
            analysis_name=analysis_name,
            output_path=output_path,
            grouping_error_tolerance=grouping_error_tolerance,
            mass_error_tolerance=mass_error_tolerance,
            msn_mass_error_tolerance=msn_mass_error_tolerance,
            psm_fdr_threshold=psm_fdr_threshold,
            peak_shape_scoring_model=peak_shape_scoring_model,
            tandem_scoring_model=tandem_scoring_model,
            oxonium_threshold=oxonium_threshold,
            n_processes=processes,
            spectrum_batch_size=workload_size,
            mass_shifts=mass_shifts,
            use_peptide_mass_filter=use_peptide_mass_filter,
            maximum_mass=maximum_mass,
            use_decoy_correction_threshold=fdr_correction,
            probing_range_for_missing_precursors=isotope_probing_range,
            permute_decoy_glycans=permute_decoy_glycan_fragments)
    analyzer.display_header()
    result = analyzer.start()
    gps, unassigned, target_decoy_set = result[:3]
    if save_intermediate_results is not None:
        analyzer.log("Saving Intermediate Results")
        with open(save_intermediate_results, 'wb') as handle:
            pickle.dump((target_decoy_set, gps), handle)
    # Release the large result collections before running exports.
    del gps
    del unassigned
    del target_decoy_set
    if export:
        for export_type in set(export):
            click.echo(fmt_msg("Handling Export: %s" % (export_type, )))
            if export_type == 'csv':
                from glycan_profiling.cli.export import glycopeptide_identification
                base = os.path.splitext(output_path)[0]
                export_path = "%s-glycopeptides.csv" % (base, )
                context.invoke(glycopeptide_identification,
                               database_connection=output_path,
                               analysis_identifier=analyzer.analysis_id,
                               output_path=export_path)
            elif export_type == 'psm-csv':
                from glycan_profiling.cli.export import glycopeptide_spectrum_matches
                base = os.path.splitext(output_path)[0]
                export_path = "%s-glycopeptide-spectrum-matches.csv" % (base, )
                context.invoke(glycopeptide_spectrum_matches,
                               database_connection=output_path,
                               analysis_identifier=analyzer.analysis_id,
                               output_path=export_path)
            elif export_type == 'html':
                from glycan_profiling.cli.export import glycopeptide_identification
                base = os.path.splitext(output_path)[0]
                export_path = "%s-report.html" % (base, )
                context.invoke(glycopeptide_identification,
                               database_connection=output_path,
                               analysis_identifier=analyzer.analysis_id,
                               output_path=export_path, report=True)
def analyze_glycan_composition(database_connection, sample_path, hypothesis_identifier,
                               output_path, analysis_name, mass_shifts,
                               grouping_error_tolerance=1.5e-5,
                               mass_error_tolerance=1e-5, scoring_model=None,
                               minimum_mass=500., smoothing_factor=None,
                               regularization_model=None,
                               combinatorial_mass_shift_limit=8, channel=None,
                               **kwargs):
    """Run a glycan composition chromatogram analysis as a background task.

    Validates the sample and hypothesis, expands the requested mass shifts,
    runs :class:`MzMLGlycanChromatogramAnalyzer`, and reports either a
    new-analysis record or an error/traceback over *channel*.

    NOTE(review): smoothing_factor and regularization_model are accepted but
    never forwarded to the analyzer — confirm whether they should be passed
    as regularize=/regularization_model= keyword arguments.
    """
    if scoring_model is None:
        scoring_model = GeneralScorer
    database_connection = DatabaseBoundOperation(database_connection)
    if not os.path.exists(sample_path):
        channel.send(
            Message("Could not locate sample %r" % sample_path, "error"))
        return
    reader = ProcessedMzMLDeserializer(sample_path, use_index=False)
    sample_run = reader.sample_run
    try:
        hypothesis = get_by_name_or_id(database_connection, GlycanHypothesis,
                                       hypothesis_identifier)
    except Exception:
        channel.send(
            Message("Could not locate hypothesis %r" % hypothesis_identifier,
                    "error"))
        return
    if analysis_name is None:
        analysis_name = "%s @ %s" % (sample_run.name, hypothesis.name)
    analysis_name = validate_analysis_name(None, database_connection.session,
                                           analysis_name)
    try:
        # Validate each (mass shift, multiplicity) pair, then expand the
        # validated shifts into combinations up to the configured limit.
        # (Fix: removed a dead `expanded = []` store and a redundant
        # `mass_shifts = expanded` that followed the except handler.)
        mass_shift_out = []
        for mass_shift, multiplicity in mass_shifts:
            mass_shift_out.append(validate_mass_shift(mass_shift, multiplicity))
        mass_shifts = MzMLGlycanChromatogramAnalyzer.expand_mass_shifts(
            dict(mass_shift_out), limit=combinatorial_mass_shift_limit)
    except Abort:
        channel.send(Message.traceback())
        return
    try:
        analyzer = MzMLGlycanChromatogramAnalyzer(
            database_connection._original_connection, hypothesis.id,
            sample_path=sample_path, output_path=output_path,
            mass_shifts=mass_shifts,
            mass_error_tolerance=mass_error_tolerance,
            grouping_error_tolerance=grouping_error_tolerance,
            scoring_model=scoring_model, analysis_name=analysis_name,
            minimum_mass=minimum_mass)
        analyzer.start()
        analysis = analyzer.analysis
        # Summarize the finished analysis for the project's record store and
        # notify the client over the channel.
        record = project_analysis.AnalysisRecord(
            name=analysis.name, id=analysis.id, uuid=analysis.uuid,
            path=output_path, analysis_type=analysis.analysis_type,
            hypothesis_uuid=analysis.hypothesis.uuid,
            hypothesis_name=analysis.hypothesis.name,
            sample_name=analysis.parameters['sample_name'],
            user_id=channel.user.id)
        channel.send(Message(record.to_json(), 'new-analysis'))
    except Exception:
        channel.send(Message.traceback())
        channel.abort("An error occurred during analysis.")
def search_glycan(context, database_connection, sample_path, hypothesis_identifier,
                  analysis_name, mass_shifts, grouping_error_tolerance=1.5e-5,
                  mass_error_tolerance=1e-5, minimum_mass=500., scoring_model=None,
                  regularize=None, regularization_model_path=None, network_path=None,
                  output_path=None, scoring_model_features=None, delta_rt=0.5,
                  export=None, interact=False, require_msms_signature=0.0,
                  msn_mass_error_tolerance=2e-5, mass_shift_combination_limit=None,
                  processes=4):
    """Identify glycan compositions from preprocessed LC-MS data, stored in mzML
    format.

    Resolves the hypothesis and sample, builds an
    :class:`MzMLGlycanChromatogramAnalyzer`, runs it, and then handles any
    requested exports (csv, html, glycan-list, model).
    """
    # Only synthesize an output path when the caller is not going to inspect
    # the results interactively.
    if output_path is None and not interact:
        output_path = make_analysis_output_path("glycan")
    if scoring_model is None:
        scoring_model = GeneralScorer
    if mass_shift_combination_limit is None:
        mass_shift_combination_limit = 8
    if scoring_model_features:
        for feature in scoring_model_features:
            scoring_model.add_feature(validate_ms1_feature_name(feature))
    # Optional pre-fitted regularization model loaded from disk.
    if regularization_model_path is not None:
        with open(regularization_model_path, 'r') as mod_file:
            regularization_model = GridPointSolution.load(mod_file)
    else:
        regularization_model = None
    # Optional externally supplied composition network.
    if network_path is not None:
        with open(network_path, 'r') as netfile:
            network = GraphReader(netfile).network
    else:
        network = None
    database_connection = DatabaseBoundOperation(database_connection)
    ms_data = ProcessedMzMLDeserializer(sample_path, use_index=False)
    sample_run = ms_data.sample_run
    try:
        hypothesis = get_by_name_or_id(
            database_connection, GlycanHypothesis, hypothesis_identifier)
    except Exception:
        click.secho("Could not locate a Glycan Hypothesis with identifier %r" %
                    hypothesis_identifier, fg='yellow')
        raise click.Abort()
    if analysis_name is None:
        analysis_name = "%s @ %s" % (sample_run.name, hypothesis.name)
    analysis_name = validate_analysis_name(
        context, database_connection.session, analysis_name)
    # Validate each (mass shift, multiplicity) pair before expansion.
    mass_shifts = [validate_mass_shift(mass_shift, multiplicity)
                   for mass_shift, multiplicity in mass_shifts]
    expanded = []  # NOTE(review): dead store — immediately reassigned below.
    expanded = MzMLGlycanChromatogramAnalyzer.expand_mass_shifts(
        dict(mass_shifts), limit=mass_shift_combination_limit)
    mass_shifts = expanded
    click.secho("Preparing analysis of %s by %s" % (sample_run.name,
                                                    hypothesis.name), fg='cyan')
    analyzer = MzMLGlycanChromatogramAnalyzer(
        database_connection._original_connection, hypothesis.id,
        sample_path=sample_path, output_path=output_path,
        mass_shifts=mass_shifts, mass_error_tolerance=mass_error_tolerance,
        msn_mass_error_tolerance=msn_mass_error_tolerance,
        grouping_error_tolerance=grouping_error_tolerance,
        scoring_model=scoring_model, minimum_mass=minimum_mass,
        regularize=regularize, regularization_model=regularization_model,
        network=network, analysis_name=analysis_name, delta_rt=delta_rt,
        require_msms_signature=require_msms_signature, n_processes=processes)
    analyzer.display_header()
    analyzer.start()
    if interact:
        # Drop into an IPython shell when available so the user can poke at
        # the analyzer state.
        try:
            import IPython
            click.secho(fmt_msg("Beginning Interactive Session..."), fg='cyan')
            IPython.embed()
        except ImportError:
            click.secho(fmt_msg("Interactive Session Not Supported"), fg='red')
    if export:
        for export_type in set(export):
            click.echo(fmt_msg("Handling Export: %s" % (export_type,)))
            if export_type == 'csv':
                from glycan_profiling.cli.export import glycan_composition_identification
                base = os.path.splitext(output_path)[0]
                export_path = "%s-glycan-chromatograms.csv" % (base,)
                context.invoke(
                    glycan_composition_identification,
                    database_connection=output_path,
                    analysis_identifier=analyzer.analysis_id,
                    output_path=export_path)
            elif export_type == 'html':
                from glycan_profiling.cli.export import glycan_composition_identification
                base = os.path.splitext(output_path)[0]
                export_path = "%s-report.html" % (base,)
                context.invoke(
                    glycan_composition_identification,
                    database_connection=output_path,
                    analysis_identifier=analyzer.analysis_id,
                    output_path=export_path, report=True)
            elif export_type == 'glycan-list':
                from glycan_profiling.cli.export import glycan_hypothesis
                base = os.path.splitext(output_path)[0]
                # NOTE(review): same filename pattern as the 'csv' export —
                # confirm this collision is intended.
                export_path = "%s-glycan-chromatograms.csv" % (base,)
                context.invoke(
                    glycan_hypothesis,
                    database_connection=output_path,
                    hypothesis_identifier=analyzer.hypothesis_id,
                    output_path=export_path, importable=True)
            elif export_type == "model":
                base = os.path.splitext(output_path)[0]
                export_path = "%s-regularization-parameters.txt" % (base,)
                params = analyzer.analysis.parameters.get("network_parameters")
                if params is None:
                    click.secho("No parameters were fitted, skipping \"model\"",
                                fg='red')
                else:
                    with open(export_path, 'w') as fp:
                        params.dump(fp)
            else:
                click.secho("Unrecognized Export: %s" % (export_type,),
                            fg='yellow')
def search_glycopeptide_multipart(context, database_connection, decoy_database_connection,
                                  sample_path, target_hypothesis_identifier=1,
                                  decoy_hypothesis_identifier=1, analysis_name=None,
                                  output_path=None, grouping_error_tolerance=1.5e-5,
                                  mass_error_tolerance=1e-5, msn_mass_error_tolerance=2e-5,
                                  psm_fdr_threshold=0.05, peak_shape_scoring_model=None,
                                  tandem_scoring_model=None, glycan_score_threshold=1.0,
                                  memory_database_index=False, save_intermediate_results=None,
                                  processes=4, workload_size=500, mass_shifts=None,
                                  export=None, maximum_mass=float('inf'),
                                  isotope_probing_range=3):
    """Search preprocessed LC-MS/MS data for glycopeptides against separate
    target and decoy hypotheses, then handle any requested exports.

    Fixes relative to the previous revision:
      * The two hypothesis-lookup error messages formatted the undefined name
        ``hypothesis_identifier`` (a NameError at runtime); they now use
        ``target_hypothesis_identifier`` / ``decoy_hypothesis_identifier``.
      * ``n_processes`` was hard-coded to 5, silently ignoring the
        ``processes`` parameter; it now honors ``processes``.
    """
    if output_path is None:
        output_path = make_analysis_output_path("glycopeptide")
    if tandem_scoring_model is None:
        tandem_scoring_model = "log_intensity"
    database_connection = DatabaseBoundOperation(database_connection)
    decoy_database_connection = DatabaseBoundOperation(decoy_database_connection)
    ms_data = ProcessedMzMLDeserializer(sample_path, use_index=False)
    sample_run = ms_data.sample_run
    try:
        target_hypothesis = get_by_name_or_id(
            database_connection, GlycopeptideHypothesis,
            target_hypothesis_identifier)
    except Exception:
        # Fixed: previously referenced the undefined `hypothesis_identifier`.
        click.secho("Could not locate Target Glycopeptide Hypothesis with identifier %r" %
                    target_hypothesis_identifier, fg='yellow')
        raise click.Abort()
    try:
        decoy_hypothesis = get_by_name_or_id(
            decoy_database_connection, GlycopeptideHypothesis,
            decoy_hypothesis_identifier)
    except Exception:
        # Fixed: previously referenced the undefined `hypothesis_identifier`.
        click.secho("Could not locate Decoy Glycopeptide Hypothesis with identifier %r" %
                    decoy_hypothesis_identifier, fg='yellow')
        raise click.Abort()
    tandem_scoring_model = validate_glycopeptide_tandem_scoring_function(
        context, tandem_scoring_model)
    # Validate each (mass shift, multiplicity) pair before expansion.
    mass_shifts = [validate_mass_shift(mass_shift, multiplicity)
                   for mass_shift, multiplicity in mass_shifts]
    expanded = MzMLGlycanChromatogramAnalyzer.expand_mass_shifts(
        dict(mass_shifts), crossproduct=False)
    mass_shifts = expanded
    if analysis_name is None:
        analysis_name = "%s @ %s" % (sample_run.name, target_hypothesis.name)
    analysis_name = validate_analysis_name(
        context, database_connection.session, analysis_name)
    click.secho("Preparing analysis of %s by %s" % (
        sample_run.name, target_hypothesis.name), fg='cyan')
    analyzer = MultipartGlycopeptideLCMSMSAnalyzer(
        database_connection._original_connection,
        decoy_database_connection._original_connection,
        target_hypothesis.id,
        decoy_hypothesis.id,
        sample_path, output_path,
        analysis_name=analysis_name,
        grouping_error_tolerance=grouping_error_tolerance,
        mass_error_tolerance=mass_error_tolerance,
        msn_mass_error_tolerance=msn_mass_error_tolerance,
        psm_fdr_threshold=psm_fdr_threshold,
        tandem_scoring_model=tandem_scoring_model,
        glycan_score_threshold=glycan_score_threshold,
        mass_shifts=mass_shifts,
        # Fixed: was hard-coded n_processes=5, ignoring `processes`.
        n_processes=processes,
        spectrum_batch_size=workload_size,
        maximum_mass=maximum_mass,
        probing_range_for_missing_precursors=isotope_probing_range,
        use_memory_database=memory_database_index)
    analyzer.display_header()
    gps, unassigned, target_decoy_set = analyzer.start()
    if save_intermediate_results is not None:
        analyzer.log("Saving Intermediate Results")
        with open(save_intermediate_results, 'wb') as handle:
            pickle.dump((target_decoy_set, gps), handle)
    if export:
        for export_type in set(export):
            click.echo(fmt_msg("Handling Export: %s" % (export_type,)))
            if export_type == 'csv':
                from glycan_profiling.cli.export import glycopeptide_identification
                base = os.path.splitext(output_path)[0]
                export_path = "%s-glycopeptides.csv" % (base,)
                context.invoke(
                    glycopeptide_identification,
                    database_connection=output_path,
                    analysis_identifier=analyzer.analysis_id,
                    output_path=export_path)
            elif export_type == 'psm-csv':
                from glycan_profiling.cli.export import glycopeptide_spectrum_matches
                base = os.path.splitext(output_path)[0]
                export_path = "%s-glycopeptide-spectrum-matches.csv" % (base,)
                context.invoke(
                    glycopeptide_spectrum_matches,
                    database_connection=output_path,
                    analysis_identifier=analyzer.analysis_id,
                    output_path=export_path)
            elif export_type == 'html':
                from glycan_profiling.cli.export import glycopeptide_identification
                base = os.path.splitext(output_path)[0]
                export_path = "%s-report.html" % (base,)
                context.invoke(
                    glycopeptide_identification,
                    database_connection=output_path,
                    analysis_identifier=analyzer.analysis_id,
                    output_path=export_path, report=True)
def analyze_glycopeptide_sequences(
        database_connection, sample_path, hypothesis_identifier, output_path,
        analysis_name, grouping_error_tolerance=1.5e-5, mass_error_tolerance=1e-5,
        msn_mass_error_tolerance=2e-5, psm_fdr_threshold=0.05,
        peak_shape_scoring_model=None, minimum_oxonium_threshold=0.05,
        workload_size=1000, use_peptide_mass_filter=True, mass_shifts=None,
        permute_decoy_glycan_fragments=False, include_rare_signature_ions=False,
        model_retention_time=False,
        search_strategy=GlycopeptideSearchStrategyEnum.classic,
        decoy_database_connection=None, decoy_hypothesis_id=None,
        tandem_scoring_model=None, channel=None, **kwargs):
    """Run a glycopeptide LC-MS/MS analysis, reporting progress and errors over *channel*.

    Dispatches on ``search_strategy`` to one of three analyzers (classic,
    classic-comparison, or multipart). On success, a serialized
    ``AnalysisRecord`` is sent over *channel* as a ``'new-analysis'`` message;
    failures send a traceback message and abort the channel's task.

    NOTE(review): assumes *channel* is always provided despite its ``None``
    default — every failure path dereferences it; confirm with callers.
    """
    if peak_shape_scoring_model is None:
        peak_shape_scoring_model = GeneralScorer.clone()
        peak_shape_scoring_model.add_feature(get_feature("null_charge"))
    database_connection = DatabaseBoundOperation(database_connection)
    if decoy_database_connection:
        decoy_database_connection = DatabaseBoundOperation(
            decoy_database_connection)
    if not os.path.exists(sample_path):
        channel.send(
            Message("Could not locate sample %r" % sample_path, "error"))
        return
    reader = ProcessedMzMLDeserializer(sample_path, use_index=False)
    sample_run = reader.sample_run
    try:
        hypothesis = get_by_name_or_id(database_connection,
                                       GlycopeptideHypothesis, hypothesis_identifier)
    except Exception:
        channel.send(
            Message("Could not locate hypothesis %r" % hypothesis_identifier, "error"))
        channel.abort("An error occurred during analysis.")
    # Fixed: initialize so a missing decoy is an explicit None rather than an
    # unbound local when the multipart branch runs without a decoy connection.
    decoy_hypothesis = None
    if decoy_database_connection:
        try:
            decoy_hypothesis = get_by_name_or_id(decoy_database_connection,
                                                 GlycopeptideHypothesis, decoy_hypothesis_id)
        except Exception:
            channel.send(
                Message("Could not locate hypothesis %r" % decoy_hypothesis_id, "error"))
            channel.abort("An error occurred during analysis.")
    if analysis_name is None:
        analysis_name = "%s @ %s" % (sample_run.name, hypothesis.name)
    analysis_name = validate_analysis_name(None, database_connection.session, analysis_name)
    try:
        mass_shift_out = [validate_mass_shift(mass_shift, multiplicity)
                          for mass_shift, multiplicity in mass_shifts]
        # Removed a dead ``expanded = []`` assignment that was immediately
        # overwritten by the expansion below.
        mass_shifts = MzMLGlycanChromatogramAnalyzer.expand_mass_shifts(
            dict(mass_shift_out), crossproduct=False)
    except Abort:
        channel.send(Message.traceback())
        return
    try:
        if search_strategy == GlycopeptideSearchStrategyEnum.classic:
            analyzer = MzMLGlycopeptideLCMSMSAnalyzer(
                database_connection._original_connection,
                hypothesis.id,
                sample_path,
                output_path=output_path,
                analysis_name=analysis_name,
                grouping_error_tolerance=grouping_error_tolerance,
                mass_error_tolerance=mass_error_tolerance,
                msn_mass_error_tolerance=msn_mass_error_tolerance,
                psm_fdr_threshold=psm_fdr_threshold,
                peak_shape_scoring_model=peak_shape_scoring_model,
                oxonium_threshold=minimum_oxonium_threshold,
                spectrum_batch_size=workload_size,
                use_peptide_mass_filter=use_peptide_mass_filter,
                mass_shifts=mass_shifts,
                permute_decoy_glycans=permute_decoy_glycan_fragments,
                rare_signatures=include_rare_signature_ions,
                model_retention_time=model_retention_time,
                tandem_scoring_model=tandem_scoring_model)
        elif search_strategy == GlycopeptideSearchStrategyEnum.classic_comparison:
            analyzer = MzMLComparisonGlycopeptideLCMSMSAnalyzer(
                database_connection._original_connection,
                decoy_database_connection._original_connection,
                hypothesis.id,
                sample_path,
                output_path=output_path,
                analysis_name=analysis_name,
                grouping_error_tolerance=grouping_error_tolerance,
                mass_error_tolerance=mass_error_tolerance,
                msn_mass_error_tolerance=msn_mass_error_tolerance,
                psm_fdr_threshold=psm_fdr_threshold,
                peak_shape_scoring_model=peak_shape_scoring_model,
                oxonium_threshold=minimum_oxonium_threshold,
                spectrum_batch_size=workload_size,
                use_peptide_mass_filter=use_peptide_mass_filter,
                mass_shifts=mass_shifts,
                permute_decoy_glycans=permute_decoy_glycan_fragments,
                rare_signatures=include_rare_signature_ions,
                model_retention_time=model_retention_time,
                tandem_scoring_model=tandem_scoring_model)
        elif search_strategy == GlycopeptideSearchStrategyEnum.multipart:
            analyzer = MultipartGlycopeptideLCMSMSAnalyzer(
                database_connection._original_connection,
                decoy_database_connection._original_connection,
                hypothesis.id,
                decoy_hypothesis.id,
                sample_path,
                output_path=output_path,
                analysis_name=analysis_name,
                grouping_error_tolerance=grouping_error_tolerance,
                mass_error_tolerance=mass_error_tolerance,
                msn_mass_error_tolerance=msn_mass_error_tolerance,
                psm_fdr_threshold=psm_fdr_threshold,
                peak_shape_scoring_model=peak_shape_scoring_model,
                spectrum_batch_size=workload_size,
                mass_shifts=mass_shifts,
                rare_signatures=include_rare_signature_ions,
                model_retention_time=model_retention_time,
                tandem_scoring_model=tandem_scoring_model)
        else:
            # Fixed: an unrecognized strategy previously surfaced as a
            # confusing NameError on ``analyzer``; fail with a clear message
            # (still caught below and reported over the channel).
            raise ValueError("Unknown search strategy %r" % (search_strategy, ))
        _ = analyzer.start()
        analysis = analyzer.analysis
        if analysis is not None:
            record = project_analysis.AnalysisRecord(
                name=analysis.name, id=analysis.id, uuid=analysis.uuid,
                path=output_path, analysis_type=analysis.analysis_type,
                hypothesis_uuid=analysis.hypothesis.uuid,
                hypothesis_name=analysis.hypothesis.name,
                sample_name=analysis.parameters['sample_name'],
                user_id=channel.user.id)
            channel.send(Message(record.to_json(), 'new-analysis'))
        else:
            channel.send(
                Message("No glycopeptides were identified for \"%s\"" % (analysis_name, )))
    except Exception:
        channel.send(Message.traceback())
        channel.abort("An error occurred during analysis.")