def interval_tree(self):
    # Lazily build and cache the interval tree over self.clusters.
    # IntervalTreeNode.build may return None (e.g. when there is nothing to
    # index), which is treated as an error here.
    if self._interval_tree is None:
        self._interval_tree = IntervalTreeNode.build(self.clusters)
        if self._interval_tree is None:
            raise ValueError(
                "Could not build intervals for peak retrieval with %d clusters" % len(
                    self.clusters), self)
    return self._interval_tree
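
The accessor above lazily builds the tree from self.clusters on first access and caches it. A minimal usage sketch follows; ClusterIndex is hypothetical, and IntervalTreeNode / Interval are assumed to be importable from the same module the other examples on this page use.

# Hypothetical owner class; only the IntervalTreeNode.build / Interval /
# contains_point calls shown elsewhere on this page are assumed to exist.
class ClusterIndex(object):
    def __init__(self, clusters):
        self.clusters = clusters
        self._interval_tree = None

    @property
    def interval_tree(self):
        # Same lazy build-and-cache pattern as the accessor above.
        if self._interval_tree is None:
            self._interval_tree = IntervalTreeNode.build(self.clusters)
        return self._interval_tree


index = ClusterIndex([Interval(3.0, 12.0, ["a"]), Interval(22.0, 36.0, ["b"])])
hits = index.interval_tree.contains_point(25.0)  # intervals spanning 25.0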
    def test_build(self):
        # generate many redundant intervals to force
        # the interval tree to branch out.
        intervals = self.make_intervals() * 30
        ivt = IntervalTreeNode.build(intervals)

        assert ivt.start == 3
        assert ivt.end == 36

        assert ivt.left is not None
        assert ivt.left.start == 3
        assert ivt.left.end == 12

        assert ivt.right is not None
        assert ivt.right.end == 36
        assert ivt.right.start == 22
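
The assertions above imply that a node's start and end describe the full range spanned by its subtree (3-36 at the root), with the left child covering the lower intervals (3-12) and the right child the upper ones (22-36). The following self-contained stand-in only illustrates those build/contains_point semantics under that reading; it is not the library's implementation, and its splitting rule is a guess.

# Simplified centered interval tree, for illustration only.
class SimpleInterval(object):
    def __init__(self, start, end, members=None):
        self.start = start
        self.end = end
        self.members = members or []

    def contains(self, point):
        return self.start <= point <= self.end


class SimpleIntervalNode(object):
    def __init__(self, intervals):
        # Split on the median interval midpoint; intervals strictly below it go
        # left, strictly above go right, and those spanning it stay here.
        midpoints = sorted((iv.start + iv.end) / 2.0 for iv in intervals)
        self.center = midpoints[len(midpoints) // 2]
        self.contained = [iv for iv in intervals if iv.start <= self.center <= iv.end]
        left = [iv for iv in intervals if iv.end < self.center]
        right = [iv for iv in intervals if iv.start > self.center]
        self.left = SimpleIntervalNode(left) if left else None
        self.right = SimpleIntervalNode(right) if right else None
        # start/end record the full range spanned by this subtree.
        self.start = min(iv.start for iv in intervals)
        self.end = max(iv.end for iv in intervals)

    @classmethod
    def build(cls, intervals):
        intervals = list(intervals)
        return cls(intervals) if intervals else None

    def contains_point(self, point):
        hits = [iv for iv in self.contained if iv.contains(point)]
        if self.left is not None and point < self.center:
            hits.extend(self.left.contains_point(point))
        if self.right is not None and point > self.center:
            hits.extend(self.right.contains_point(point))
        return hits


ivt = SimpleIntervalNode.build(
    [SimpleInterval(3, 12), SimpleInterval(5, 9), SimpleInterval(22, 36)] * 30)
assert (ivt.start, ivt.end) == (3, 36)
assert len(ivt.contains_point(25)) == 30  # every copy of the 22-36 interval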
    def find_solution_for(self, feature):
        feature_node = self.nodes[feature]
        tree = self.interval_tree
        if tree is None:
            tree = IntervalTreeNode.build(self.clusters)
            if tree is None:
                raise ValueError(
                    "Could not build intervals for peak retrieval with %d clusters" % len(
                        self.clusters))
        clusters = tree.contains_point(feature.mz)
        if len(clusters) == 0:
            return self._find_fuzzy_solution_for(feature)
        best_fits = [cluster.disjoint_best_fits() for cluster in clusters]

        acc = []
        for fits in best_fits:
            acc.extend(fits)
        best_fits = acc

        common = tuple(set(best_fits) & set(feature_node.links))

        if len(common) != 1:
            if len(common) > 1:
                warnings.warn("Too many solutions exist for %r" % feature)
            # Either no fit claimed this feature or more than one did. Fall back
            # to the candidate fit whose monoisotopic feature is nearest in m/z.
            i = 0
            err = float('inf')
            for j, case in enumerate(best_fits):
                case_err = abs(case.monoisotopic_feature.mz - feature.mz)
                if case_err < err:
                    i = j
                    err = case_err
            fit = best_fits[i]
        else:
            fit = common[0]
        return self._solution_map[fit]
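
The fallback loop above picks the candidate fit whose monoisotopic feature lies closest in m/z to the query feature. When best_fits is non-empty (which the indexing above already assumes), the same selection can be written more directly; this is only a sketch using the attribute names shown above.

# Equivalent nearest-by-m/z selection (sketch; assumes best_fits is non-empty).
fit = min(best_fits, key=lambda case: abs(case.monoisotopic_feature.mz - feature.mz))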
Example #5
    def split_protein(self, protein_obj, sites=None):
        if sites is None:
            sites = []
        if not sites:
            return
        seen = set()
        sites_seen = set()
        peptides = protein_obj.peptides.all()
        peptide_intervals = IntervalTreeNode.build(map(PeptideInterval, peptides))
        for site in sites:
            # Collect every peptide whose interval covers this site (shifted into
            # the peptides' coordinate space) and index just that overlap region.
            overlap_region = peptide_intervals.contains_point(site - 1)
            spanned_intervals = IntervalTreeNode.build(overlap_region)
            # No peptides span this site. This can happen when the surrounding region
            # of the protein digests into peptides of unacceptable size.
            if spanned_intervals is None:
                continue
            lo = spanned_intervals.start
            hi = spanned_intervals.end
            # Get the set of all sites spanned by any peptide which spans the current query site
            spanned_sites = [s for s in sites if lo <= s <= hi]
            for i in range(1, len(spanned_sites) + 1):
                for split_sites in itertools.combinations(spanned_sites, i):
                    site_key = frozenset(split_sites)
                    if site_key in sites_seen:
                        continue
                    sites_seen.add(site_key)
                    spanning_peptides_query = spanned_intervals.contains_point(split_sites[0])
                    for site_j in split_sites[1:]:
                        spanning_peptides_query = [
                            sp for sp in spanning_peptides_query if site_j in sp
                        ]
                    spanning_peptides = []
                    for sp in spanning_peptides_query:
                        spanning_peptides.extend(sp)
                    for peptide in spanning_peptides:
                        adjusted_sites = [0] + [s - peptide.start_position for s in split_sites] + [
                            peptide.sequence_length]
                        for j in range(len(adjusted_sites) - 1):
                            begin, end = adjusted_sites[j], adjusted_sites[j + 1]
                            if end - begin < self.min_length:
                                continue
                            start_position = begin + peptide.start_position
                            end_position = end + peptide.start_position
                            if (start_position, end_position) in seen:
                                continue
                            seen.add((start_position, end_position))
                            for modified_peptide, n_variable_modifications in self._permuted_peptides(
                                    peptide.base_peptide_sequence[begin:end]):

                                inst = Peptide(
                                    base_peptide_sequence=str(peptide.base_peptide_sequence[begin:end]),
                                    modified_peptide_sequence=str(modified_peptide),
                                    count_missed_cleavages=peptide.count_missed_cleavages,
                                    count_variable_modifications=n_variable_modifications,
                                    sequence_length=len(modified_peptide),
                                    start_position=start_position,
                                    end_position=end_position,
                                    calculated_mass=modified_peptide.mass,
                                    formula=formula(modified_peptide.total_composition()),
                                    protein_id=protein_obj.id)
                                inst.hypothesis_id = protein_obj.hypothesis_id
                                inst.peptide_score = 0
                                inst.peptide_score_type = 'null_score'
                                n_glycosites = n_glycan_sequon_sites(
                                    inst, protein_obj)
                                o_glycosites = o_glycan_sequon_sites(inst, protein_obj)
                                gag_glycosites = gag_sequon_sites(inst, protein_obj)
                                inst.count_glycosylation_sites = len(n_glycosites)
                                inst.n_glycosylation_sites = sorted(n_glycosites)
                                inst.o_glycosylation_sites = sorted(o_glycosites)
                                inst.gagylation_sites = sorted(gag_glycosites)
                                yield inst
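
PeptideInterval is not defined in these snippets, but the loop above relies on it exposing the peptide's protein coordinates as an interval (and on `site_j in sp` acting as a span-membership test, with iteration yielding the wrapped peptides). A plausible, purely hypothetical sketch that matches the three-argument Interval constructor used elsewhere on this page:

# Hypothetical sketch of PeptideInterval; the real class may differ.
class PeptideInterval(Interval):
    def __init__(self, peptide):
        Interval.__init__(self, peptide.start_position, peptide.end_position, [peptide])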
    def __init__(self, features):
        # Index the features by retention time: each one is wrapped as an
        # RTFeatureNode interval before building the tree.
        self.rt_tree = IntervalTreeNode.build(map(RTFeatureNode, features))
Example #7
def neutral_mass_point_organizer_callback(contained_intervals):
    return IntervalTreeNode.build([
        Interval(node.neutral_mass, node.neutral_mass, [node])
        for node in contained_intervals
    ])
Example #8
def mz_point_organizer_callback(contained_intervals):
    return IntervalTreeNode.build(
        [Interval(node.mz, node.mz, [node]) for node in contained_intervals])
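
Both organizer callbacks re-wrap each contained node as a zero-width interval keyed on a single coordinate (neutral mass or m/z), so the resulting subtree can be searched with contains_point for that coordinate. The driver that calls these callbacks is not shown here, so the following is only illustrative.

# Illustrative only; `nodes` stands in for whatever contained_intervals the
# surrounding code passes to the callback, and the query m/z is arbitrary.
subtree = mz_point_organizer_callback(nodes)
hits = [] if subtree is None else subtree.contains_point(415.21)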