def serialize_substituent_rule(substituent):
    """Convert a substituent rule into a plain dictionary.

    ``name``, ``can_nh_derivatize`` and ``is_nh_derivatizable`` are copied
    unchanged; ``composition`` and ``attachment_composition`` are passed
    through ``formula`` first.
    """
    serialized = {"name": substituent.name}
    serialized["composition"] = formula(substituent.composition)
    serialized["can_nh_derivatize"] = substituent.can_nh_derivatize
    serialized["is_nh_derivatizable"] = substituent.is_nh_derivatizable
    serialized["attachment_composition"] = formula(
        substituent.attachment_composition)
    return serialized
Example #2
0
def serialize_substituent_rule(substituent):
    """Return a dictionary describing ``substituent``.

    The keys are ``name``, ``composition``, ``can_nh_derivatize``,
    ``is_nh_derivatizable`` and ``attachment_composition``; the two
    composition values are rendered with ``formula``.
    """
    rule_name = substituent.name
    composition_text = formula(substituent.composition)
    can_derivatize = substituent.can_nh_derivatize
    derivatizable = substituent.is_nh_derivatizable
    attachment_text = formula(substituent.attachment_composition)
    return dict(
        name=rule_name,
        composition=composition_text,
        can_nh_derivatize=can_derivatize,
        is_nh_derivatizable=derivatizable,
        attachment_composition=attachment_text
    )
 def _serialize_compound_mass_shift(self, mass_shift):
     """Convert a compound mass shift into a plain dictionary.

     ``counts`` maps the ``name`` of every key in ``mass_shift.counts`` to
     its value; ``definitions`` maps the same names to the ``formula``
     rendering of that key's ``composition``.
     """
     shift_name = mass_shift.name
     shift_formula = formula(mass_shift.composition)

     counts = {}
     for component, multiplicity in mass_shift.counts.items():
         counts[component.name] = multiplicity

     definitions = {}
     for component, _ in mass_shift.counts.items():
         definitions[component.name] = formula(component.composition)

     return {
         "name": shift_name,
         "composition": shift_formula,
         "counts": counts,
         "definitions": definitions
     }
 def _serialize_compound_mass_shift(self, mass_shift):
     """Describe ``mass_shift`` as a dictionary of name, formula and parts.

     Each key of ``mass_shift.counts`` contributes one entry to ``counts``
     (its ``name`` mapped to its count) and one entry to ``definitions``
     (its ``name`` mapped to ``formula`` of its ``composition``).
     """
     return {
         "name": mass_shift.name,
         "composition": formula(mass_shift.composition),
         "counts": dict(
             (part.name, count)
             for part, count in mass_shift.counts.items()),
         "definitions": dict(
             (part.name, formula(part.composition))
             for part, _count in mass_shift.counts.items())
     }
Example #5
0
def display_peptide_modification(name):
    """Echo the name, mass, formula and targets of the modification ``name``.

    A ``Modification`` is constructed from ``name`` and one line is written
    with ``click.echo`` per attribute, followed by one line for each entry
    of ``rule.targets`` in its serialized form.
    """
    modification = Modification(name)
    click.echo("name: %s" % modification.name)
    click.echo("mass: %f" % modification.mass)
    composition_text = formula(modification.composition)
    click.echo("formula: %s" % composition_text)
    for site in modification.rule.targets:
        serialized_site = site.serialize()
        click.echo("target: %s" % serialized_site)
 def pack_peptide(self, peptide_ident, start, end, score, score_type, parent_protein):
     """Build a ``Peptide`` from an identification and record its glycosites.

     The mass, sequence strings and formula come from ``peptide_ident``;
     ``sequence_length`` is ``end - start``. After construction the N-, O-
     and GAG-site finders are run against ``parent_protein`` and their
     sorted results are stored on the record.
     """
     sequence = peptide_ident.peptide_sequence
     record = Peptide(
         calculated_mass=sequence.mass,
         base_peptide_sequence=peptide_ident.base_sequence,
         modified_peptide_sequence=str(sequence),
         formula=formula(sequence.total_composition()),
         count_glycosylation_sites=None,
         count_missed_cleavages=peptide_ident.missed_cleavages,
         count_variable_modifications=peptide_ident.modification_counter,
         start_position=start,
         end_position=end,
         peptide_score=score,
         peptide_score_type=score_type,
         sequence_length=end - start,
         protein_id=parent_protein.id,
         hypothesis_id=self.hypothesis_id)

     n_sites = n_glycan_sequon_sites(record, parent_protein)
     o_sites = o_glycan_sequon_sites(record, parent_protein)
     gag_sites = gag_sequon_sites(record, parent_protein)

     # The stored count covers N- and O-sites only; GAG sites are recorded
     # below but do not contribute to it.
     record.count_glycosylation_sites = len(n_sites) + len(o_sites)
     record.n_glycosylation_sites = sorted(n_sites)
     record.o_glycosylation_sites = sorted(o_sites)
     record.gagylation_sites = sorted(gag_sites)
     return record
Example #7
0
 def add_modifications(self,
                       constant_modifications=None,
                       variable_modifications=None,
                       max_variable_modifications=4):
     """Return a ``MemoryPeptideCollection`` of modified forms of ``self.peptides``.

     Every peptide is stringified and handed to ``peptide_permutations``
     together with the two modification lists (``None`` is treated as an
     empty list). A variant is dropped when the variable modifications it
     adds, plus those already counted on the source peptide, exceed
     ``max_variable_modifications``.
     """
     constant_modifications = (
         [] if constant_modifications is None else constant_modifications)
     variable_modifications = (
         [] if variable_modifications is None else variable_modifications)

     collection = MemoryPeptideCollection()
     for source in self.peptides:
         variants = peptide_permutations(
             str(source), constant_modifications, variable_modifications)
         for variant, n_added in variants:
             n_total = n_added + source.count_variable_modifications
             if n_total > max_variable_modifications:
                 continue
             collection.add(Peptide(
                 base_peptide_sequence=source.base_peptide_sequence,
                 modified_peptide_sequence=str(variant),
                 count_missed_cleavages=source.count_missed_cleavages,
                 count_variable_modifications=n_total,
                 sequence_length=source.sequence_length,
                 start_position=source.start_position,
                 end_position=source.end_position,
                 calculated_mass=variant.mass,
                 formula=formula(variant.total_composition())))
     return collection
 def add_modifications(self, constant_modifications=None, variable_modifications=None, max_variable_modifications=4):
     """Expand ``self.peptides`` into their permitted modified forms.

     ``None`` for either modification list means "no modifications of that
     kind". Variants come from ``peptide_permutations``; those whose total
     variable-modification count (new plus already present on the source
     peptide) is above ``max_variable_modifications`` are skipped. The
     surviving variants are returned in a new ``MemoryPeptideCollection``.
     """
     if constant_modifications is None:
         constant_modifications = []
     if variable_modifications is None:
         variable_modifications = []

     modified = MemoryPeptideCollection()
     for original in self.peptides:
         for permuted, n_new in peptide_permutations(
                 str(original), constant_modifications, variable_modifications):
             n_combined = n_new + original.count_variable_modifications
             if n_combined > max_variable_modifications:
                 continue
             fields = dict(
                 base_peptide_sequence=original.base_peptide_sequence,
                 modified_peptide_sequence=str(permuted),
                 count_missed_cleavages=original.count_missed_cleavages,
                 count_variable_modifications=n_combined,
                 sequence_length=original.sequence_length,
                 start_position=original.start_position,
                 end_position=original.end_position,
                 calculated_mass=permuted.mass,
                 formula=formula(permuted.total_composition()))
             modified.add(Peptide(**fields))
     return modified
    def run(self):
        """Store every glycan composition produced by ``self.transformer``.

        Each composition is added to the session and flushed on its own, then
        one link row per structure class is buffered. The buffer is inserted
        into ``GlycanCompositionToClass`` whenever its length is a multiple
        of 100, any remainder is inserted after the loop, and the session is
        committed before the total is logged.
        """
        self.make_pipeline()
        class_lookup = self.structure_class_loader
        self.log("Loading Glycan Compositions from Stream for %r" % self.hypothesis)

        pending_links = []
        n_created = 0
        for n_created, (composition, class_names) in enumerate(self.transformer, 1):
            mass = composition.mass()
            serialized = composition.serialize()
            formula_text = formula(composition.total_composition())
            record = DBGlycanComposition(
                calculated_mass=mass, formula=formula_text,
                composition=serialized,
                hypothesis_id=self.hypothesis_id)
            self.session.add(record)
            # record.id is read right below, so the flush happens first.
            self.session.flush()
            for class_name in class_names:
                class_record = class_lookup[class_name]
                pending_links.append(
                    {"glycan_id": record.id, "class_id": class_record.id})
                if not len(pending_links) % 100:
                    self.session.execute(
                        GlycanCompositionToClass.insert(), pending_links)
                    pending_links = []
        if pending_links:
            self.session.execute(GlycanCompositionToClass.insert(), pending_links)
        self.session.commit()
        self.log("Generated %d glycan compositions" % n_created)
Example #10
0
 def pack_peptide(self, peptide_ident, start, end, score, score_type, parent_protein):
     """Create the ``Peptide`` row for ``peptide_ident`` within ``parent_protein``.

     Column values are taken from the identification, the supplied
     coordinates and score, and ``self.hypothesis_id``. The glycosylation
     site lists are computed after construction and stored sorted;
     ``count_glycosylation_sites`` is the number of N-sites plus O-sites.
     """
     fields = dict(
         calculated_mass=peptide_ident.peptide_sequence.mass,
         base_peptide_sequence=peptide_ident.base_sequence,
         modified_peptide_sequence=str(peptide_ident.peptide_sequence),
         formula=formula(peptide_ident.peptide_sequence.total_composition()),
         count_glycosylation_sites=None,
         count_missed_cleavages=peptide_ident.missed_cleavages,
         count_variable_modifications=peptide_ident.modification_counter,
         start_position=start,
         end_position=end,
         peptide_score=score,
         peptide_score_type=score_type,
         sequence_length=end - start,
         protein_id=parent_protein.id,
         hypothesis_id=self.hypothesis_id)
     packed = Peptide(**fields)

     n_linked, o_linked, gag_linked = (
         n_glycan_sequon_sites(packed, parent_protein),
         o_glycan_sequon_sites(packed, parent_protein),
         gag_sequon_sites(packed, parent_protein),
     )
     packed.count_glycosylation_sites = len(n_linked) + len(o_linked)
     packed.n_glycosylation_sites = sorted(n_linked)
     packed.o_glycosylation_sites = sorted(o_linked)
     packed.gagylation_sites = sorted(gag_linked)
     return packed
 def run(self):
     """Persist the glycan compositions yielded by ``self.transformer``.

     For each composition a ``DBGlycanComposition`` is added and flushed,
     and one ``GlycanCompositionToClass`` link row per structure class is
     queued. Queued rows are inserted whenever the queue length is a
     multiple of 100 and once more after the loop. Progress is logged every
     1000 compositions, and the session is committed at the end.
     """
     self.make_pipeline()
     class_lookup = self.structure_class_loader
     queued_links = []
     self.log("Generating Glycan Compositions from Symbolic Rules for %r" % self.hypothesis)
     total = 0
     for composition, class_names in self.transformer:
         mass = composition.mass()
         serialized = composition.serialize()
         formula_text = formula(composition.total_composition())
         record = DBGlycanComposition(
             calculated_mass=mass, formula=formula_text,
             composition=serialized,
             hypothesis_id=self.hypothesis_id)
         total += 1
         self.session.add(record)
         # record.id is read right below, so the flush happens first.
         self.session.flush()
         for class_name in class_names:
             class_record = class_lookup[class_name]
             queued_links.append(
                 dict(glycan_id=record.id, class_id=class_record.id))
             if len(queued_links) % 100 != 0:
                 continue
             self.session.execute(GlycanCompositionToClass.insert(), queued_links)
             queued_links = []
         if total % 1000 == 0:
             self.log("%d glycan compositions created" % (total,))
     if queued_links:
         self.session.execute(GlycanCompositionToClass.insert(), queued_links)
     self.session.commit()
     self.log("Generated %d glycan compositions" % total)
 def split_protein(self, protein_obj, sites=None):
     """Yield ``Peptide`` fragments obtained by cutting peptides at ``sites``.

     Every non-empty combination of ``sites`` is tried. For each, the
     protein's peptides matching ``self._make_split_expression`` are cut at
     the chosen sites; pieces shorter than ``self.min_length`` and spans
     already emitted are skipped. Each remaining piece is expanded through
     ``self._permuted_peptides`` and yielded with a zero ``null_score`` and
     its glycosylation sites filled in.
     """
     if sites is None:
         sites = []
     emitted_spans = set()
     for n_cuts in range(1, len(sites) + 1):
         for split_sites in itertools.combinations(sites, n_cuts):
             criteria = self._make_split_expression(split_sites)
             for parent in protein_obj.peptides.filter(*criteria).all():
                 offset = parent.start_position
                 # Cut points relative to the parent peptide, bracketed by
                 # its own start (0) and end (sequence_length).
                 boundaries = [0]
                 boundaries.extend(s - offset for s in split_sites)
                 boundaries.append(parent.sequence_length)
                 for begin, end in zip(boundaries, boundaries[1:]):
                     if end - begin < self.min_length:
                         continue
                     span = (begin + offset, end + offset)
                     if span in emitted_spans:
                         continue
                     emitted_spans.add(span)
                     start_position, end_position = span
                     fragment = parent.base_peptide_sequence[begin:end]
                     for variant, n_variable in self._permuted_peptides(fragment):
                         piece = Peptide(
                             base_peptide_sequence=str(fragment),
                             modified_peptide_sequence=str(variant),
                             count_missed_cleavages=parent.count_missed_cleavages,
                             count_variable_modifications=n_variable,
                             sequence_length=len(variant),
                             start_position=start_position,
                             end_position=end_position,
                             calculated_mass=variant.mass,
                             formula=formula(variant.total_composition()),
                             protein_id=protein_obj.id)
                         piece.hypothesis_id = protein_obj.hypothesis_id
                         piece.peptide_score = 0
                         piece.peptide_score_type = 'null_score'
                         n_sites = parent_sequence_aware_n_glycan_sequon_sites(
                             piece, protein_obj)
                         o_sites = o_glycan_sequon_sites(piece, protein_obj)
                         gag_sites = gag_sequon_sites(piece, protein_obj)
                         # Only N-sites are counted here.
                         piece.count_glycosylation_sites = len(n_sites)
                         piece.n_glycosylation_sites = sorted(n_sites)
                         piece.o_glycosylation_sites = sorted(o_sites)
                         piece.gagylation_sites = sorted(gag_sites)
                         yield piece
Example #13
0
 def modify_string(self, peptide):
     """Yield a ``Peptide`` for each form produced by ``self.peptide_permuter``.

     The records carry no positional information: the start position, end
     position and missed-cleavage count are all set to ``-1``.
     """
     for variant, n_variable in self.peptide_permuter(peptide):
         fields = dict(
             base_peptide_sequence=str(peptide),
             modified_peptide_sequence=str(variant),
             count_missed_cleavages=-1,
             count_variable_modifications=n_variable,
             sequence_length=len(variant),
             start_position=-1,
             end_position=-1,
             calculated_mass=variant.mass,
             formula=formula(variant.total_composition()))
         yield Peptide(**fields)
 def modify_string(self, peptide):
     """Generate ``Peptide`` records for the modified forms of ``peptide``.

     ``self.peptide_permuter`` supplies pairs of a modified sequence and its
     variable-modification count. ``-1`` is stored for the missed-cleavage
     count and for both positions.
     """
     unknown = -1
     for permuted, modification_count in self.peptide_permuter(peptide):
         yield Peptide(
             base_peptide_sequence=str(peptide),
             modified_peptide_sequence=str(permuted),
             count_missed_cleavages=unknown,
             count_variable_modifications=modification_count,
             sequence_length=len(permuted),
             start_position=unknown,
             end_position=unknown,
             calculated_mass=permuted.mass,
             formula=formula(permuted.total_composition()))
Example #15
0
 def _migrate_single_glycopeptide(self, glycopeptide):
     """Migrate one glycopeptide and return its entry in the migrator's id map.

     A ``Glycopeptide`` record is built from ``glycopeptide`` (the foreign
     keys are read off ``glycopeptide.id``), passed to
     ``self._glycopeptide_hypothesis_migrator`` and committed. The return
     value is ``glycopeptide_id_map[glycopeptide.id]``.
     """
     key = glycopeptide.id
     record = Glycopeptide(
         id=key,
         peptide_id=key.peptide_id,
         glycan_combination_id=key.glycan_combination_id,
         protein_id=key.protein_id,
         hypothesis_id=key.hypothesis_id,
         glycopeptide_sequence=glycopeptide.get_sequence(),
         calculated_mass=glycopeptide.total_mass,
         formula=formula(glycopeptide.total_composition()))
     migrator = self._glycopeptide_hypothesis_migrator
     migrator.migrate_glycopeptide(record)
     migrator.commit()
     return migrator.glycopeptide_id_map[key]
def modifications():
    """Return the modification table as a ``jsonify`` response.

    ``definitions`` holds a ``(title, formula, mass)`` triple for every
    rule. ``specificities`` holds the spec strings of every rule that is
    not in the substitution, glycosylation or other_glycosylation
    category, de-duplicated through a set and passed on as a tuple.
    """
    table = ModificationTable()
    definitions = [
        (rule.title, formula(rule.composition), rule.mass)
        for rule in table.rules()
    ]

    excluded_categories = (
        ModificationCategory.substitution,
        ModificationCategory.glycosylation,
        ModificationCategory.other_glycosylation,
    )
    specificities = set()
    for rule in table.rules():
        if any(category in rule.categories for category in excluded_categories):
            continue
        specificities.update(rule.as_spec_strings())

    return jsonify(definitions=definitions, specificities=tuple(specificities))
Example #17
0
 def combinate(self, n=1):
     """Yield a ``GlycanCombination`` for every size-``n`` multiset of compositions.

     Combinations are drawn with replacement from
     ``self.glycan_compositions``. Each yielded item is a pair of the new
     ``GlycanCombination`` and a ``Counter`` of the members it was built
     from.

     The combination's mass is the sum of the members' ``mass`` attributes,
     its ``composition`` is the string form of
     ``merge_compositions_frozen(members)``, and its ``formula`` is rendered
     from the sum of the members' ``elemental_composition`` values.
     """
     for comb_compositions in itertools.combinations_with_replacement(self.glycan_compositions, n):
         counts = Counter(comb_compositions)
         composition = str(merge_compositions_frozen(comb_compositions))
         mass = sum(c.mass for c in comb_compositions)
         # Accumulate into a new, empty Composition.
         elemental_composition = Composition()
         for c in comb_compositions:
             elemental_composition += c.elemental_composition
         inst = GlycanCombination(
             count=n,
             calculated_mass=mass,
             composition=composition,
             formula=formula(elemental_composition))
         yield inst, counts
Example #18
0
def convert_to_peptide_dict(glycopeptide, id_tracker):
    """Describe ``glycopeptide`` as a dictionary of id, sequence and modifications.

    Iterating ``glycopeptide`` must yield two-item entries whose second item
    is the collection of modifications at that position. Only the first
    modification at each position is reported. ``location`` is the 1-based
    index of the entry in iteration order.

    A modification whose rule ``is_a("glycosylation")`` is reported as
    ``"unknown modification"`` with the glycan composition's mass and three
    ``UserParam`` entries; any other modification is reported with its own
    mass and name.

    ``id_tracker`` is accepted but not used.
    """
    data = {
        "id": glycopeptide.id,
        "peptide_sequence": parser.strip_modifications(glycopeptide),
        "modifications": []
    }

    # TODO: handle N-terminal and C-terminal modifications
    for location, (_, mods) in enumerate(glycopeptide, 1):
        if not mods:
            continue
        mod = mods[0]
        if mod.rule.is_a("glycosylation"):
            mod_dict = {
                "monoisotopic_mass_delta":
                glycopeptide.glycan_composition.mass(),
                "location": location,
                "name": "unknown modification",
                "params": [
                    components.UserParam(name='GlycosylationType',
                                         value=str(mod)),
                    components.UserParam(
                        name='GlycanComposition',
                        value=str(glycopeptide.glycan_composition)),
                    components.UserParam(
                        name='Formula',
                        value=formula(
                            glycopeptide.glycan_composition.total_composition()))
                ]
            }
        else:
            mod_dict = {
                "monoisotopic_mass_delta": mod.mass,
                "location": location,
                "name": mod.name,
            }
        data['modifications'].append(mod_dict)
    return data
Example #19
0
 def fetch_glycopeptides(self, glycopeptide_ids):
     """Build a ``Glycopeptide`` record for every distinct spectrum-match target.

     Targets are collected from every match of every solution set of every
     entry in ``self._identified_glycopeptide_set`` and de-duplicated by
     ``target.id`` (the last target seen for an id wins).

     ``glycopeptide_ids`` is accepted but never consulted: all targets are
     returned regardless of its contents.
     """
     targets_by_id = dict()
     for identified in self._identified_glycopeptide_set:
         for solution_set in identified.spectrum_matches:
             for match in solution_set:
                 target = match.target
                 targets_by_id[target.id] = target
     return [
         Glycopeptide(
             id=target.id,
             peptide_id=target.id.peptide_id,
             glycan_combination_id=target.id.glycan_combination_id,
             protein_id=target.id.protein_id,
             hypothesis_id=target.id.hypothesis_id,
             glycopeptide_sequence=target.get_sequence(),
             calculated_mass=target.total_mass,
             formula=formula(target.total_composition()))
         for target in targets_by_id.values()
     ]
 def fetch_glycopeptides(self, glycopeptide_ids):
     """Return ``Glycopeptide`` records for all distinct spectrum-match targets.

     Every match target reachable through
     ``self._identified_glycopeptide_set`` is gathered into a mapping keyed
     by ``target.id``, so each id appears once in the result.

     ``glycopeptide_ids`` is not read by this method; the result is not
     filtered by it.
     """
     aggregate = dict()
     for gp in self._identified_glycopeptide_set:
         for solution_set in gp.spectrum_matches:
             for match in solution_set:
                 aggregate[match.target.id] = match.target
     out = []
     for obj in aggregate.values():
         inst = Glycopeptide(
             id=obj.id,
             peptide_id=obj.id.peptide_id,
             glycan_combination_id=obj.id.glycan_combination_id,
             protein_id=obj.id.protein_id,
             hypothesis_id=obj.id.hypothesis_id,
             glycopeptide_sequence=obj.get_sequence(),
             calculated_mass=obj.total_mass,
             formula=formula(obj.total_composition()))
         out.append(inst)
     return out
Example #21
0
def convert_to_peptide_dict(glycopeptide, id_tracker):
    """Build a dictionary of id, stripped sequence and modifications.

    ``glycopeptide`` is iterated as a sequence of two-item entries; the
    second item of each is the collection of modifications at that
    position, of which only the first is used. ``location`` values count
    entries from 1 in iteration order.

    Glycosylation modifications (``mod.rule.is_a("glycosylation")``) are
    written as ``"unknown modification"`` with the glycan composition's
    mass plus ``UserParam`` entries for the type, composition and formula.
    Other modifications are written with their own mass and name.

    ``id_tracker`` is accepted but not used.
    """
    modifications = []
    data = {
        "id": glycopeptide.id,
        "peptide_sequence": parser.strip_modifications(glycopeptide),
        "modifications": modifications
    }

    # TODO: handle N-terminal and C-terminal modifications
    for location, (_, mods) in enumerate(glycopeptide, 1):
        if not mods:
            continue
        mod = mods[0]
        if mod.rule.is_a("glycosylation"):
            mod_dict = {
                "monoisotopic_mass_delta": glycopeptide.glycan_composition.mass(),
                "location": location,
                "name": "unknown modification",
                "params": [
                    components.UserParam(
                        name='GlycosylationType', value=str(mod)),
                    components.UserParam(name='GlycanComposition', value=str(
                        glycopeptide.glycan_composition)),
                    components.UserParam(name='Formula', value=formula(
                        glycopeptide.glycan_composition.total_composition()))
                ]
            }
        else:
            mod_dict = {
                "monoisotopic_mass_delta": mod.mass,
                "location": location,
                "name": mod.name,
            }
        modifications.append(mod_dict)
    return data
Example #22
0
    def handle_peptide(self, peptide):
        """Yield a ``Glycopeptide`` for each glycan combination placed on ``peptide``.

        The peptide's N-linked sites are processed first, then its O-linked
        sites, then its GAG-linker sites. Within each kind, every number of
        glycans from one up to the number of unoccupied sites is tried.
        """
        water = Composition("H2O")
        peptide_composition = Composition(str(peptide.formula))
        obj = peptide.convert()

        # Handle N-linked glycosylation sites
        for glycopeptide in self._glycosylate_unoccupied_sites(
                peptide, obj, peptide_composition, water,
                peptide.n_glycosylation_sites, GlycanTypes.n_glycan,
                _n_glycosylation):
            yield glycopeptide

        # Handle O-linked glycosylation sites
        for glycopeptide in self._glycosylate_unoccupied_sites(
                peptide, obj, peptide_composition, water,
                peptide.o_glycosylation_sites, GlycanTypes.o_glycan,
                _o_glycosylation):
            yield glycopeptide

        # Handle GAG glycosylation sites
        for glycopeptide in self._glycosylate_unoccupied_sites(
                peptide, obj, peptide_composition, water,
                peptide.gagylation_sites, GlycanTypes.gag_linker,
                _gag_linker_glycosylation):
            yield glycopeptide

    def _glycosylate_unoccupied_sites(self, peptide, parsed_peptide, peptide_composition,
                                      water, candidate_sites, glycan_type, modification_rule):
        """Yield glycopeptides for one kind of glycosylation site.

        ``candidate_sites`` are reduced to those where
        ``parsed_peptide[site][1]`` is falsy (presumably "no modification
        already at that position" -- confirm against ``peptide.convert()``).
        For each count ``k`` from 1 to the number of remaining sites, the
        glycan combinations stored under ``(k, {glycan_type: k})`` are
        attached at every site set from ``limiting_combinations``.
        """
        unoccupied_sites = set(candidate_sites)
        # Iterate over a snapshot so the set can be shrunk while scanning it.
        for site in list(unoccupied_sites):
            if parsed_peptide[site][1]:
                unoccupied_sites.remove(site)

        for k in range(1, len(unoccupied_sites) + 1):
            for gc in self.glycan_combination_partitions[k, {glycan_type: k}]:
                # gc.count water masses / compositions are subtracted from
                # the peptide + glycan totals.
                total_mass = peptide.calculated_mass + gc.calculated_mass - (gc.count * water.mass)
                formula_string = formula(peptide_composition + Composition(str(gc.formula)) - (water * gc.count))

                for site_set in limiting_combinations(unoccupied_sites, k):
                    # A fresh sequence per site set, so placements do not
                    # accumulate across iterations.
                    sequence = peptide.convert()
                    for site in site_set:
                        sequence.add_modification(site, modification_rule.name)
                    sequence.glycan = gc.convert()

                    yield Glycopeptide(
                        calculated_mass=total_mass,
                        formula=formula_string,
                        glycopeptide_sequence=str(sequence),
                        peptide_id=peptide.id,
                        protein_id=peptide.protein_id,
                        hypothesis_id=peptide.hypothesis_id,
                        glycan_combination_id=gc.id)
Example #23
0
    def split_protein(self, protein_obj, sites=None):
        """Yield ``Peptide`` fragments made by cutting peptides at ``sites``.

        Nothing is yielded when ``sites`` is ``None`` or empty. Otherwise all
        of the protein's peptides are loaded and indexed with
        ``IntervalTreeNode``. For each site, the peptides overlapping it are
        found, and every combination of the sites lying within their overall
        span is used to cut the peptides that contain all sites of that
        combination.

        Fragments shorter than ``self.min_length`` are skipped, and each
        ``(start_position, end_position)`` span is emitted only once. Each
        fragment is expanded through ``self._permuted_peptides`` and yielded
        with a zero ``null_score`` and its glycosylation sites filled in.
        """
        if sites is None:
            sites = []
        if not sites:
            return
        # (start_position, end_position) spans already emitted.
        seen = set()
        # Site combinations already processed, stored as frozensets.
        sites_seen = set()
        peptides = protein_obj.peptides.all()
        peptide_intervals = IntervalTreeNode.build(map(PeptideInterval, peptides))
        for site in sites:
            # NOTE(review): the lookup here uses `site - 1`, while the lookup
            # further down uses `split_sites[0]` unshifted -- confirm the
            # intended coordinate convention.
            overlap_region = peptide_intervals.contains_point(site - 1)
            spanned_intervals = IntervalTreeNode.build(overlap_region)
            # No spanned peptides. May be caused by regions of protein which digest to peptides
            # of unacceptable size.
            if spanned_intervals is None:
                continue
            lo = spanned_intervals.start
            hi = spanned_intervals.end
            # Get the set of all sites spanned by any peptide which spans the current query site
            spanned_sites = [s for s in sites if lo <= s <= hi]
            for i in range(1, len(spanned_sites) + 1):
                for split_sites in itertools.combinations(spanned_sites, i):
                    # The same combination can be reached from several query
                    # sites; process it only the first time.
                    site_key = frozenset(split_sites)
                    if site_key in sites_seen:
                        continue
                    sites_seen.add(site_key)
                    # Start from the intervals containing the first site, then
                    # keep only those that also contain every other site.
                    spanning_peptides_query = spanned_intervals.contains_point(split_sites[0])
                    for site_j in split_sites[1:]:
                        spanning_peptides_query = [
                            sp for sp in spanning_peptides_query if site_j in sp
                        ]
                    # Flatten: each surviving interval is iterated to collect
                    # its members (presumably the peptides it wraps -- confirm
                    # against PeptideInterval).
                    spanning_peptides = []
                    for sp in spanning_peptides_query:
                        spanning_peptides.extend(sp)
                    for peptide in spanning_peptides:
                        # Cut points relative to the peptide, bracketed by its
                        # own start (0) and end (sequence_length).
                        adjusted_sites = [0] + [s - peptide.start_position for s in split_sites] + [
                            peptide.sequence_length]
                        for j in range(len(adjusted_sites) - 1):
                            begin, end = adjusted_sites[j], adjusted_sites[j + 1]
                            if end - begin < self.min_length:
                                continue
                            # Convert back to the peptide's own coordinate
                            # frame for de-duplication and storage.
                            start_position = begin + peptide.start_position
                            end_position = end + peptide.start_position
                            if (start_position, end_position) in seen:
                                continue
                            else:
                                seen.add((start_position, end_position))
                            for modified_peptide, n_variable_modifications in self._permuted_peptides(
                                    peptide.base_peptide_sequence[begin:end]):

                                inst = Peptide(
                                    base_peptide_sequence=str(peptide.base_peptide_sequence[begin:end]),
                                    modified_peptide_sequence=str(modified_peptide),
                                    count_missed_cleavages=peptide.count_missed_cleavages,
                                    count_variable_modifications=n_variable_modifications,
                                    sequence_length=len(modified_peptide),
                                    start_position=start_position,
                                    end_position=end_position,
                                    calculated_mass=modified_peptide.mass,
                                    formula=formula(modified_peptide.total_composition()),
                                    protein_id=protein_obj.id)
                                inst.hypothesis_id = protein_obj.hypothesis_id
                                inst.peptide_score = 0
                                inst.peptide_score_type = 'null_score'
                                n_glycosites = n_glycan_sequon_sites(
                                    inst, protein_obj)
                                o_glycosites = o_glycan_sequon_sites(inst, protein_obj)
                                gag_glycosites = gag_sequon_sites(inst, protein_obj)
                                # Only N-sites contribute to the stored count.
                                inst.count_glycosylation_sites = len(n_glycosites)
                                inst.n_glycosylation_sites = sorted(n_glycosites)
                                inst.o_glycosylation_sites = sorted(o_glycosites)
                                inst.gagylation_sites = sorted(gag_glycosites)
                                yield inst
Example #24
0
    def handle_peptide(self, peptide):
        water = Composition("H2O")
        peptide_composition = Composition(str(peptide.formula))
        obj = peptide.convert()

        # Handle N-linked glycosylation sites

        n_glycosylation_unoccupied_sites = set(peptide.n_glycosylation_sites)
        for site in list(n_glycosylation_unoccupied_sites):
            if obj[site][1]:
                n_glycosylation_unoccupied_sites.remove(site)
        for i in range(len(n_glycosylation_unoccupied_sites)):
            i += 1
            for gc in self.glycan_combination_partitions[i, {GlycanTypes.n_glycan: i}]:
                total_mass = peptide.calculated_mass + gc.calculated_mass - (gc.count * water.mass)
                formula_string = formula(peptide_composition + Composition(str(gc.formula)) - (water * gc.count))

                for site_set in limiting_combinations(n_glycosylation_unoccupied_sites, i):
                    sequence = peptide.convert()
                    for site in site_set:
                        sequence.add_modification(site, _n_glycosylation.name)
                    sequence.glycan = gc.convert()

                    glycopeptide_sequence = str(sequence)

                    glycopeptide = Glycopeptide(
                        calculated_mass=total_mass,
                        formula=formula_string,
                        glycopeptide_sequence=glycopeptide_sequence,
                        peptide_id=peptide.id,
                        protein_id=peptide.protein_id,
                        hypothesis_id=peptide.hypothesis_id,
                        glycan_combination_id=gc.id)
                    yield glycopeptide

        # Handle O-linked glycosylation sites
        o_glycosylation_unoccupied_sites = set(peptide.o_glycosylation_sites)
        for site in list(o_glycosylation_unoccupied_sites):
            if obj[site][1]:
                o_glycosylation_unoccupied_sites.remove(site)

        for i in range(len(o_glycosylation_unoccupied_sites)):
            i += 1
            for gc in self.glycan_combination_partitions[i, {GlycanTypes.o_glycan: i}]:
                total_mass = peptide.calculated_mass + gc.calculated_mass - (gc.count * water.mass)
                formula_string = formula(peptide_composition + Composition(str(gc.formula)) - (water * gc.count))

                for site_set in limiting_combinations(o_glycosylation_unoccupied_sites, i):
                    sequence = peptide.convert()
                    for site in site_set:
                        sequence.add_modification(site, _o_glycosylation.name)
                    sequence.glycan = gc.convert()

                    glycopeptide_sequence = str(sequence)

                    glycopeptide = Glycopeptide(
                        calculated_mass=total_mass,
                        formula=formula_string,
                        glycopeptide_sequence=glycopeptide_sequence,
                        peptide_id=peptide.id,
                        protein_id=peptide.protein_id,
                        hypothesis_id=peptide.hypothesis_id,
                        glycan_combination_id=gc.id)
                    yield glycopeptide

        # Handle GAG glycosylation sites
        gag_unoccupied_sites = set(peptide.gagylation_sites)
        for site in list(gag_unoccupied_sites):
            if obj[site][1]:
                gag_unoccupied_sites.remove(site)
        for i in range(len(gag_unoccupied_sites)):
            i += 1
            for gc in self.glycan_combination_partitions[i, {GlycanTypes.gag_linker: i}]:
                total_mass = peptide.calculated_mass + gc.calculated_mass - (gc.count * water.mass)
                formula_string = formula(peptide_composition + Composition(str(gc.formula)) - (water * gc.count))
                for site_set in limiting_combinations(gag_unoccupied_sites, i):
                    sequence = peptide.convert()
                    for site in site_set:
                        sequence.add_modification(site, _gag_linker_glycosylation.name)
                    sequence.glycan = gc.convert()

                    glycopeptide_sequence = str(sequence)

                    glycopeptide = Glycopeptide(
                        calculated_mass=total_mass,
                        formula=formula_string,
                        glycopeptide_sequence=glycopeptide_sequence,
                        peptide_id=peptide.id,
                        protein_id=peptide.protein_id,
                        hypothesis_id=peptide.hypothesis_id,
                        glycan_combination_id=gc.id)
                    yield glycopeptide
Example #25
0
    def convert_to_peptide_dict(self, glycopeptide, id_tracker):
        """Describe a glycopeptide's peptide and its modifications as a dictionary.

        Parameters
        ----------
        glycopeptide
            The sequence to describe. It is iterated as two-item
            ``(position, modifications)`` entries and must also provide ``id``,
            ``convert()`` and ``glycan_composition``.
        id_tracker
            Kept in the signature for callers; this method does not read it.

        Returns
        -------
        dict
            Holds ``"id"``, ``"peptide_sequence"`` (whatever
            ``parser.strip_modifications`` returns for the glycopeptide) and
            ``"modifications"``: one dictionary per position that carries at
            least one modification. Only the first modification found at a
            position is reported.
        """
        modifications = []
        data = {
            "id": glycopeptide.id,
            "peptide_sequence": parser.strip_modifications(glycopeptide),
            "modifications": modifications
        }

        # TODO: handle N-terminal and C-terminal modifications
        total_events = len(glycopeptide.convert().glycosylation_manager)
        events_seen = 0
        # Locations are 1-based and advance for every position, modified or not.
        for location, (_residue, site_mods) in enumerate(glycopeptide, 1):
            if not site_mods:
                continue
            mod = site_mods[0]
            rule = mod.rule

            if not rule.is_a("glycosylation"):
                # Ordinary modification: mass, location and name only.
                modifications.append({
                    "monoisotopic_mass_delta": mod.mass,
                    "location": location,
                    "name": mod.name,
                })
                continue

            events_seen += 1
            mod_params = [
                glycosylation_type_to_term(str(rule.glycosylation_type))
            ]
            if rule.is_core:
                # Core rules are described through the glycopeptide's overall
                # glycan composition rather than the rule's own composition.
                mod_params.extend(
                    self.gnome_resolver.glycan_composition_to_terms(
                        glycopeptide.glycan_composition.clone()))
                mass = glycopeptide.glycan_composition.mass()

                if total_events == 1:
                    summary_term = {
                        "name": "glycan composition",
                        "cvRef": "PSI-MS",
                        "accession": "MS:XXXX14"
                    }
                    is_aggregate_stub = False
                else:
                    summary_term = {
                        "name": "glycan aggregate",
                        "cvRef": "PSI-MS",
                        "accession": "MS:XXXX15"
                    }
                    # With several events, only the first one handled carries
                    # the aggregate's mass and formula; later ones are stubs.
                    is_aggregate_stub = events_seen > 1
                mod_params.append(summary_term)

                if is_aggregate_stub:
                    mass = 0
                else:
                    mod_params.append({
                        "accession": 'MS:1000864',
                        "cvRef": "PSI-MS",
                        "name": "chemical formula",
                        "value": formula(glycopeptide.glycan_composition.total_composition()),
                    })
            else:
                mod_params.append({
                    "accession": 'MS:1000864',
                    "cvRef": "PSI-MS",
                    "name": "chemical formula",
                    "value": formula(rule.composition),
                })
                if rule.is_composition:
                    mod_params.extend(
                        self.gnome_resolver.glycan_composition_to_terms(rule.glycan.clone()))
                    mod_params.append({
                        "name": "glycan composition",
                        "cvRef": "PSI-MS",
                        "accession": "MS:XXXX14"
                    })
                else:
                    mod_params.append({
                        "name": "glycan structure",
                        "cvRef": "PSI-MS",
                        "accession": "MS:XXXXXXX"
                    })
                mass = mod.mass

            modifications.append({
                "monoisotopic_mass_delta": mass,
                "location": location,
                "name": "glycosylation modification",
                "params": [components.CVParam(**term) for term in mod_params]
            })
        return data
    def split_protein(self, protein_obj, sites=None):
        """Yield new ``Peptide`` records made by cutting existing peptides at `sites`.

        This is a generator. It yields nothing when `sites` is ``None`` or empty.

        For each site, the peptides of `protein_obj` that contain the site are
        looked up, and every non-empty combination of the sites lying within
        the range those peptides span is visited once over the whole call. Each
        peptide containing all sites of a combination is cut at those sites.
        A segment is skipped when it is shorter than ``self.min_length`` or
        when its ``(start_position, end_position)`` span was already produced.
        Every remaining segment is passed to ``self._permuted_peptides`` and one
        ``Peptide`` is yielded per result, with a zero ``'null_score'`` score and
        its N-glycan, O-glycan and GAG sequon sites filled in.

        Parameters
        ----------
        protein_obj
            Protein whose ``peptides`` are split; its ``id`` and
            ``hypothesis_id`` are copied onto every yielded peptide.
        sites
            Protein-level positions at which to cut.
        """
        if not sites:
            return
        emitted_spans = set()
        visited_site_sets = set()
        peptide_intervals = IntervalTreeNode.build(
            map(PeptideInterval, protein_obj.peptides.all()))
        for site in sites:
            # NOTE(review): this lookup shifts the site by one while the lookup
            # further down uses the site unshifted -- confirm intended indexing.
            spanned_intervals = IntervalTreeNode.build(
                peptide_intervals.contains_point(site - 1))
            # No spanned peptides. May be caused by regions of protein which digest to peptides
            # of unacceptable size.
            if spanned_intervals is None:
                continue
            lo = spanned_intervals.start
            hi = spanned_intervals.end
            # Every site covered by any peptide that covers the current query site
            spanned_sites = [s for s in sites if lo <= s <= hi]
            for size in range(1, len(spanned_sites) + 1):
                for split_sites in itertools.combinations(spanned_sites, size):
                    site_key = frozenset(split_sites)
                    if site_key in visited_site_sets:
                        continue
                    visited_site_sets.add(site_key)

                    # Keep only the intervals containing every site of the combination.
                    candidate_intervals = spanned_intervals.contains_point(split_sites[0])
                    for other_site in split_sites[1:]:
                        candidate_intervals = [
                            interval for interval in candidate_intervals
                            if other_site in interval
                        ]
                    # Flatten eagerly so the intervals are fully read before any yield.
                    spanning_peptides = list(
                        itertools.chain.from_iterable(candidate_intervals))

                    for peptide in spanning_peptides:
                        # Peptide-relative cut points, bracketed by both ends.
                        cut_points = [0]
                        cut_points.extend(s - peptide.start_position for s in split_sites)
                        cut_points.append(peptide.sequence_length)
                        for begin, end in zip(cut_points, cut_points[1:]):
                            if end - begin < self.min_length:
                                continue
                            start_position = begin + peptide.start_position
                            end_position = end + peptide.start_position
                            span = (start_position, end_position)
                            if span in emitted_spans:
                                continue
                            emitted_spans.add(span)
                            for modified_peptide, n_variable_modifications in self._permuted_peptides(
                                    peptide.base_peptide_sequence[begin:end]):

                                inst = Peptide(
                                    base_peptide_sequence=str(peptide.base_peptide_sequence[begin:end]),
                                    modified_peptide_sequence=str(modified_peptide),
                                    count_missed_cleavages=peptide.count_missed_cleavages,
                                    count_variable_modifications=n_variable_modifications,
                                    sequence_length=len(modified_peptide),
                                    start_position=start_position,
                                    end_position=end_position,
                                    calculated_mass=modified_peptide.mass,
                                    formula=formula(modified_peptide.total_composition()),
                                    protein_id=protein_obj.id)
                                inst.hypothesis_id = protein_obj.hypothesis_id
                                inst.peptide_score = 0
                                inst.peptide_score_type = 'null_score'
                                n_glycosites = n_glycan_sequon_sites(inst, protein_obj)
                                o_glycosites = o_glycan_sequon_sites(inst, protein_obj)
                                gag_glycosites = gag_sequon_sites(inst, protein_obj)
                                # The site count reflects N-glycosylation sites only.
                                inst.count_glycosylation_sites = len(n_glycosites)
                                inst.n_glycosylation_sites = sorted(n_glycosites)
                                inst.o_glycosylation_sites = sorted(o_glycosites)
                                inst.gagylation_sites = sorted(gag_glycosites)
                                yield inst
 def _serialize_mass_shift(self, mass_shift):
     """Reduce a mass shift to a plain dictionary.

     The result maps ``"name"`` to the mass shift's name and ``"composition"``
     to the result of passing its composition to ``formula``.
     """
     serialized = {"name": mass_shift.name}
     serialized["composition"] = formula(mass_shift.composition)
     return serialized
 def _serialize_mass_shift(self, mass_shift):
     """Return the name and formatted composition of `mass_shift` as a dict.

     ``"composition"`` holds whatever ``formula`` returns for the mass
     shift's composition.
     """
     shift_name = mass_shift.name
     composition_text = formula(mass_shift.composition)
     return dict(name=shift_name, composition=composition_text)