Exemple #1
0
    def test_comparisons(self):
        """Exercise equality and ordering of SequenceRange against ranges, tuples and foreign types."""
        rng = rng_00 = SequenceRange(self.pep_start, self.pep_stop)
        twin = SequenceRange(self.pep_start, self.pep_stop)
        rng_01 = SequenceRange(self.pep_start, self.pep_stop + 1)
        rng_10 = SequenceRange(self.pep_start + 1, self.pep_stop)
        as_tuple = (self.pep_start, self.pep_stop)

        # equality works against other ranges and against plain (start, stop) tuples
        assert rng == twin
        assert rng == as_tuple
        assert as_tuple == rng
        assert as_tuple != rng_01
        assert as_tuple != 123
        assert rng is not None
        assert rng not in (None, 66)
        # a tuple of digit strings is not considered equal to the range
        assert rng != (str(self.pep_start), str(self.pep_stop))

        # ordering against an unrelated type must raise instead of returning a value
        with pytest.raises(TypeError):
            rng < "Wrong type!!"
        with pytest.raises(TypeError):
            "Wrong type!!" < rng
        with pytest.raises(TypeError):
            rng > "Wrong type!!"

        # a range is neither smaller nor greater than itself ...
        assert not rng < rng
        assert not rng > rng
        # ... but it is <=, >= and == to itself
        assert rng <= rng
        assert rng >= rng
        assert rng == rng

        # strict ordering: (x, y) < (x, y + 1) < (x + 1, y)
        assert rng_00 < rng_01 < rng_10
        assert rng_10 > rng_01 > rng_00

        # the same ordering holds when the smallest operand is a plain tuple
        assert as_tuple < rng_01 < rng_10
        assert as_tuple <= rng_01 <= rng_10
        assert rng_10 > rng_01 > as_tuple
        assert rng_10 >= rng_01 >= as_tuple
Exemple #2
0
    def test___sub__(self):
        """Subtracting a SequencePoint shifts a range; point minus range must raise ValueError."""
        shifted = SequenceRange(5, 20) - SequencePoint(3)
        assert shifted == SequenceRange(3, 18)

        with pytest.raises(ValueError):
            # 20 - 5, 20 - 10 -> 15, 10: such a result makes no sense as a range
            nonsense = SequencePoint(20) - SequenceRange(5, 10)
            nonsense.validate()

        shifted_again = SequenceRange(10, 15) - SequencePoint(5)
        assert shifted_again == SequenceRange(6, 11)
Exemple #3
0
    def test___iter__(self):
        """Iterating a range yields `length` points running from start through stop."""
        rng = SequenceRange(5, 10)
        points = list(rng)  # relies on SequenceRange.__iter__

        assert rng.length == len(points)
        first_point, last_point = points[0], points[-1]
        assert first_point.pos == rng.start.pos
        assert last_point.pos == rng.stop.pos

        # a range whose start equals its stop still yields that one point first
        only_point = list(SequenceRange(5, 5))[0]
        assert only_point == SequencePoint(5)
Exemple #4
0
    def test___repr__(self):
        """Check repr() of a range with and without a sequence, and of its pos/index views."""
        without_seq = SequenceRange(10, 20)
        assert repr(without_seq) == "SequenceRange(10, 20, seq=None)"

        seq = "A" * 11
        expected_with_seq = 'SequenceRange(10, 20, seq="{}")'.format(seq)
        assert repr(SequenceRange(10, 20, seq=seq)) == expected_with_seq

        # the pos and index views both print as a plain pair
        pos_view = SequenceRange(10, 20).pos
        assert repr(pos_view) == "(10, 20)"
        index_view = SequenceRange.from_index(10, 20).index
        assert repr(index_view) == "(10, 20)"
Exemple #5
0
 def test___add__(self):
     """Adding a SequencePoint and/or an int to a range works in every operand order."""
     plus_point = SequenceRange(3, 4)
     plus_point_and_five = SequenceRange(8, 9)

     # range + point and point + range give the same shifted range
     assert SequenceRange(2, 3) + SequencePoint(2) == plus_point
     assert SequencePoint(2) + SequenceRange(2, 3) == plus_point

     # mixing in a plain int, at any position in the chain, shifts by that amount
     assert SequenceRange(2, 3) + SequencePoint(2) + 5 == plus_point_and_five
     assert 5 + SequenceRange(2, 3) + SequencePoint(2) == plus_point_and_five
     assert SequenceRange(2, 3) + 5 + SequencePoint(2) == plus_point_and_five
Exemple #6
0
    def test_math_seq(self):
        """Integer shifts keep the seq; arithmetic with another range drops it to None."""
        evil = SequenceRange(12, 15, seq="EVIL")
        shifted_up = SequenceRange(14, 17, seq="EVIL")
        shifted_down = SequenceRange(10, 13, seq="EVIL")

        # simple math (adding/subtracting an int) should retain the seq
        assert evil + 2 == shifted_up == 2 + evil
        assert evil - 2 == shifted_down == -2 + evil

        # complex math (range with range) should change seq to None
        other = SequenceRange(1, 2)
        summed = evil + other
        assert summed.seq is None
        other = SequenceRange(1, 2)
        difference = evil - other
        assert difference.seq is None
Exemple #7
0
    def test_from_sequence(self, glucagon_peptides, glucagon_seq):
        """from_sequence locates a peptide in a protein, and raises IndexError when it is absent."""
        observed = SequenceRange.from_sequence(self.protein_seq, self.pep_seq)
        expected = SequenceRange(self.pep_start, self.pep_stop, seq=self.pep_seq)
        assert observed == expected

        # the peptide does not occur in the protein sequence
        with pytest.raises(IndexError):
            SequenceRange.from_sequence('PROTEINSEQ', "PEPTIDESEQ")

        # only the sequence of each fixture entry is needed here
        for _start, _stop, peptide_seq in glucagon_peptides:
            located = SequenceRange.from_sequence(glucagon_seq, peptide_seq)
            self._assert(located, peptide_seq, glucagon_seq)
Exemple #8
0
    def test_from_index_and_length(self, glucagon_peptides, glucagon_seq):
        """from_index(index, length=...) builds the range that matches the peptide's location."""
        # simple case: the class-level peptide inside the class-level protein
        pep_index = self.protein_seq.index(self.pep_seq)
        simple = SequenceRange.from_index(pep_index, length=len(self.pep_seq))
        self._assert(simple, self.pep_seq, self.protein_seq)

        # every glucagon peptide from the fixture
        for _start, _stop, peptide_seq in glucagon_peptides:
            peptide_index = glucagon_seq.index(peptide_seq)
            located = SequenceRange.from_index(peptide_index,
                                               length=len(peptide_seq))
            self._assert(located, peptide_seq, glucagon_seq)

        # a SequenceRange as index combined with this length is rejected
        with pytest.raises(ValueError):
            SequenceRange.from_index(SequenceRange(10), length=20)
Exemple #9
0
    def test_from_slices(self, glucagon_peptides, glucagon_seq):
        """from_slice accepts (start, stop) or a slice object; a step other than 1/None is rejected."""
        slice_start, slice_stop = 5, 9
        from_bounds = SequenceRange.from_slice(slice_start, slice_stop)
        from_slice_obj = SequenceRange.from_slice(slice(slice_start, slice_stop))
        reference = SequenceRange(self.pep_start, self.pep_stop)
        assert reference == from_bounds == from_slice_obj

        # the .slice property cuts the peptide out of the protein, via its bounds or directly
        as_slice = from_bounds.slice
        assert self.pep_seq == self.protein_seq[as_slice.start:from_bounds.slice.stop]
        assert self.pep_seq == self.protein_seq[from_bounds.slice]

        with pytest.raises(ValueError):
            # slices have to have step=1 or None
            SequenceRange.from_slice(slice(2, 10, 2))
Exemple #10
0
    def test_bugs(self):
        """
        Regression tests: make sure that fixed bugs do not recur!!
        """
        seq = 'A' * 11

        # '53' is an abc.Sequence with length 2, thus it used to be interpreted much like
        # ('5', '3'); int, str and bytes spellings must all give the same range.
        cases = (
            ((53, '53', b'53'), (63, '63', b'63'), 53),
            ((153, '153', b'153'), (163, '163', b'163'), 153),
        )
        for start_spellings, stop_spellings, int_start in cases:
            for start in start_spellings:
                for stop in stop_spellings:
                    built = SequenceRange(start, stop, seq=seq)
                    assert built == SequenceRange(int_start, seq=seq)
Exemple #11
0
    def _iter_build_lpv(self, min_overlap):
        """
        Phase 1: walk self.peptides in sorted order and yield merged ranges ("lpv").

        A peptide contained in the current lpv is dropped; a peptide overlapping it by at
        least `min_overlap` positions extends it; any other peptide closes the current lpv
        (which is yielded) and starts a new one. Nothing is yielded for an empty peptide list.
        """
        # sorted in reverse because a list can only pop cheaply from its end
        remaining = sorted(self.peptides, reverse=True)
        if not remaining:
            return

        current = remaining.pop()
        while remaining:
            candidate = remaining.pop()

            # Step 1) candidate lies inside the current lpv - ignore it
            if candidate in current:
                continue

            # Step 2) candidate overlaps enough - extend the current lpv up to its stop
            if current.stop - candidate.start + 1 >= min_overlap:
                current = SequenceRange(current.start,
                                        candidate.stop,
                                        full_sequence=self.protein_sequence)
            else:
                # neither internal nor extending -> emit and start a new lpv
                yield current
                current = candidate
        yield current
Exemple #12
0
 def get_clusters(cls, h_cluster):
     """
     Map every cluster number in 1..h_cluster.max() to a SequenceRange built with
     SequenceRange.from_index from the first and last index where h_cluster equals it.
     """
     def span_of(label):
         # indexes (along the first axis) at which this cluster label occurs
         members = np.where(h_cluster == label)[0]
         return SequenceRange.from_index(members[0], members[-1])

     return {label: span_of(label) for label in range(1, h_cluster.max() + 1)}
Exemple #13
0
 def get_bond_slice(cls, peptide):
     """
     Return the slice covering the bonds of a peptide rather than its residues.

     A peptide like SequenceRange(10, 20) has a length of 11 but only 10 bonds, thus
     SequenceRange(10, 20).slice -> slice(9, 20)
     cls.get_bond_slice(SequenceRange(10, 20)) -> slice(9, 19)
     """
     # one position shorter at the stop end: n residues share n - 1 bonds
     bond_range = SequenceRange(peptide.start, peptide.stop - 1)
     return bond_range.slice
Exemple #14
0
    def make_histograms(cls, df, length, n_samples, ladder_window=5):
        """
        Build per-position intensity histograms from the rows yielded by cls.iterrows(df).

        Each row contributes its summed intensity divided by `n_samples`.
        `ladder_window` is accepted but not used by this method.

        Returns a tuple of arrays:
        (coverage, starts, stops, '_(ac)'-prefixed starts, '_(am)'-suffixed stops,
         bond ratio between neighbouring positions, first mask, last mask)
        """
        coverage, starts, stops, ac_starts, am_stops = (np.zeros(length)
                                                        for _ in range(5))

        for pep_var_id, peptide_series in cls.iterrows(df):
            pep_range = SequenceRange(pep_var_id.start, pep_var_id.stop)
            start_idx = pep_range.start.index
            stop_idx = pep_range.stop.index
            weight = peptide_series.sum() / n_samples

            mod_seq = pep_var_id.mod_seq
            if mod_seq.startswith('_(ac)'):
                ac_starts[start_idx] += weight
            if mod_seq.endswith('_(am)'):
                am_stops[stop_idx] += weight
            starts[start_idx] += weight
            stops[stop_idx] += weight
            coverage[pep_range.slice] += weight

        # pair every position with its left neighbour; ratio of the smaller to the larger
        neighbours = np.stack((coverage[1:], coverage[:-1]))
        with np.errstate(invalid='ignore'):  # 0 / 0 becomes nan without a warning
            bond_ratio = neighbours.min(axis=0) / neighbours.max(axis=0)

        # positions where all covering intensity also starts (respectively stops) there
        covered = coverage != 0
        first = (coverage == starts) & covered
        last = (coverage == stops) & covered

        return (coverage, starts, stops, ac_starts, am_stops, bond_ratio,
                first, last)
Exemple #15
0
    def diversify_score(self, dampening=2.0):
        """
        Re-score self.series (usually ppv predictions) so that high scores are traded off
        against similarity to predictions that were already picked.

        The algorithm is deliberately simple:
        1) take the highest remaining score
        2) divide the score of everything overlapping it by `dampening`
        3) repeat until no scores remain

        Returns a new Series on the index of self.series.
        """
        # ranges[campaign][protein_id] = {SequenceRange(..), ..}
        # so that overlapping peptides can be found quickly
        ranges = collections.defaultdict(lambda: collections.defaultdict(set))
        entries = self.series.peputils.iteritems(keep_campaign_id=True)
        for campaign_id, entry_id, _score in entries:
            ranges[campaign_id][entry_id.protein_id].add(
                SequenceRange(entry_id.start, entry_id.stop))

        remaining = self.series.copy()
        diversified = pd.Series(index=self.series.index)
        while not remaining.empty:
            # move the current best score into the result
            winner = remaining.idxmax()
            diversified[winner] = remaining[winner]
            # penalize everything that overlaps the winner, then retire the winner
            overlapping = self._get_overlaping_ids(*winner, ranges)
            remaining[overlapping] /= dampening
            del remaining[winner]
        return diversified
Exemple #16
0
 def _iter_split_ptm(self, lpv_iter):
     """
     Phase 2: for every lpv yield variants that start at each position with a non-zero
     self.h_ac value and variants that stop at each position with a non-zero self.h_am
     value, followed by the lpv itself.

     Warning: the same range can be yielded more than once (convert the result to a set!).
     """
     for lpv in lpv_iter:
         # sequence position of every element in the lpv's slice
         positions = np.arange(len(lpv)) + lpv.start.pos
         has_ac = self.h_ac[lpv.slice] != 0
         has_am = self.h_am[lpv.slice] != 0
         # the lpv's own start/stop are always candidates
         starts = set(positions[has_ac]) | {lpv.start.pos}
         stops = set(positions[has_am]) | {lpv.stop.pos}

         for new_start in starts:
             yield SequenceRange(new_start,
                                 lpv.stop,
                                 full_sequence=self.protein_sequence)
         for new_stop in stops:
             yield SequenceRange(lpv.start,
                                 new_stop,
                                 full_sequence=self.protein_sequence)
         yield lpv
Exemple #17
0
def glucagon_known_peptides():
    """Read glucagon/mouse_glucagon.known from TEST_DATA into a set of SequenceRange.

    The file is tab separated with one header line; columns 2-4 are start, stop and
    sequence, any further columns are ignored.
    """
    known = set()
    with open(pjoin(TEST_DATA, "glucagon/mouse_glucagon.known")) as handle:
        handle.readline()  # skip header
        for row in handle:
            _protein_id, start, stop, sequence, *_ = row.rstrip().split('\t')
            # start/stop are passed on as the strings read from the file
            known.add(SequenceRange(start, stop, seq=sequence))
    return known
Exemple #18
0
 def get_valid_peptides(cls, valid_starts, valid_stops, protein_sequence):
     """
     Pair every start with every stop that lies after it and carries the same value.

     `valid_starts` and `valid_stops` map positions to a value; the result maps
     SequenceRange(start, stop, full_sequence=protein_sequence) to that shared value.
     """
     return {
         SequenceRange(v_start, v_stop, full_sequence=protein_sequence): c_start
         for v_start, c_start in valid_starts.items()
         for v_stop, c_stop in valid_stops.items()
         if v_start < v_stop and c_start == c_stop
     }
Exemple #19
0
 def get_known_peptides(cls, known_file: str) -> typing.Dict[str, set]:
     """
     Parse a tab separated "known" file (one header line) into
     {protein_id: {SequenceRange, ...}}, keeping only rows whose type is
     'peptide' or 'propeptide'.
     """
     by_protein = collections.defaultdict(set)
     with open(known_file) as handle:
         handle.readline()  # skip header
         for raw_line in handle:
             record = Known(*raw_line.rstrip('\r\n').split('\t'))
             if record.type not in ('peptide', 'propeptide'):
                 continue
             by_protein[record.protein_id].add(
                 SequenceRange(int(record.start), int(record.stop), seq=record.seq))
     # hand back a plain dict so that missing proteins raise KeyError for the caller
     return dict(by_protein)
Exemple #20
0
 def make_sample_frequency_histogram(self, df):
     """
     Return, per position, the fraction of samples in which that position is covered.

     For every row yielded by self.iterrows(df), each column with a non-NaN value gets
     the row's slice marked with 1; the marks are then summed across columns and
     divided by self.n_samples.

     Raises:
         ValueError: if df has more columns than self.n_samples.
     """
     histogram_samples = pd.DataFrame(np.zeros((self.length, df.shape[1])),
                                      columns=df.columns)
     columns = histogram_samples.columns
     for pep_var_id, peptide_series in self.iterrows(df):
         p = SequenceRange(pep_var_id.start, pep_var_id.stop)
         # Only the labels of the non-NaN entries are needed. Iterating the index also
         # avoids Series.iteritems(), which no longer exists in pandas >= 2.0.
         for group in peptide_series.dropna().index:
             # Single positional assignment instead of histogram_samples[group][...] = 1:
             # chained assignment is not applied to the frame under pandas copy-on-write.
             histogram_samples.iloc[p.slice, columns.get_loc(group)] = 1
     if not (0 <= histogram_samples.shape[1] <= self.n_samples):
         raise ValueError(
             "max_samples, higher than the accual number of samples!!!")
     return histogram_samples.sum(axis=1).values / self.n_samples
Exemple #21
0
 def test_deprecation(self):
     """
     Accessing .pos / .index must not warn, while accessing .start / .stop on those
     views must emit a DeprecationWarning.
     """
     import warnings  # local import: only needed for the no-warning check below

     sr = SequenceRange(1, 2)
     # pytest.warns(None) is deprecated (pytest 7) and rejected by pytest 8, and it never
     # verified the absence of warnings. Turning warnings into errors does.
     with warnings.catch_warnings():
         warnings.simplefilter("error")
         sr.pos
         sr.index
     with pytest.warns(DeprecationWarning):
         sr.pos.start
     with pytest.warns(DeprecationWarning):
         sr.pos.stop
     with pytest.warns(DeprecationWarning):
         sr.index.start
     with pytest.warns(DeprecationWarning):
         sr.index.stop
Exemple #22
0
 def test___hash__(self):
     """Ranges are hashable, and equal ranges collapse to one entry in a set."""
     hash(SequenceRange(1, 2))  # must not raise

     seen = set()
     # (range bounds, then the two ints handed to self._assert_hash - presumably the
     # set size before and after adding the range; verify against the helper)
     steps = (
         ((1, 1), 0, 1),
         ((1, 1), 1, 1),
         ((1, 2), 1, 2),
         ((2, 2), 2, 3),
         ((1, 2), 3, 3),
     )
     for (start, stop), first_arg, second_arg in steps:
         self._assert_hash(seen, SequenceRange(start, stop), first_arg, second_arg)
Exemple #23
0
 def test_immutability(self):
     """Assigning to any public attribute of a SequenceRange raises AttributeError."""
     frozen = SequenceRange(1, 2)

     # views that would take a pair
     for attribute in ("pos", "index", "slice"):
         with pytest.raises(AttributeError):
             setattr(frozen, attribute, (1, 2))

     # end points that would take a SequencePoint
     for attribute in ("start", "stop"):
         with pytest.raises(AttributeError):
             setattr(frozen, attribute, SequencePoint(2))
Exemple #24
0
    def test___contains__(self):
        """Membership checks of points and sub-ranges against SequenceRange(5, 20)."""
        peptide = SequenceRange(5, 20)

        # points: both ends and the middle are inside, one step beyond either end is not
        for inside_pos in (5, 10, 20):
            self._in(SequencePoint, inside_pos, peptide)
        for outside_pos in (4, 21):
            self._not_in(SequencePoint, outside_pos, peptide)

        # ranges: must lie completely within the peptide
        for inside_bounds in ((5, 10), (10, 15), (15, 20)):
            self._in(SequenceRange, inside_bounds, peptide)
        for outside_bounds in ((1, 5), (4, 11), (10, 21)):
            self._not_in(SequenceRange, outside_bounds, peptide)
Exemple #25
0
    def count_ladders(cls,
                      position_counts,
                      h_cluster,
                      clusters,
                      ladder_window=10):
        """
        Return, per position, its count as a fraction of all counts within
        +/- ladder_window of it.

        Thus if there are 5 peptides that stop at position 100 and 10 peptides that stop
        within the window around that position, the value at that index of the returned
        array is 5 / (10 + 5) = 0.3333..
        Close to 0 means lots of nearby start/stop positions, 1 means this is the only one.
        Positions without an entry in `position_counts` stay 0.
        """
        # TODO: ladders should take into account the number of start/stops, i.e. if 5
        # peptides start at the position and 10 peptides start at 5 other places:
        #   1 / (1 + 5) = 1/6   <--- counting each occupied position once
        #   5 / (5 + 10) = 1/3  <--- ideal, and what the code below computes
        n_positions = h_cluster.shape[0]
        counts = np.zeros(n_positions)
        for position, count in position_counts.items():
            counts[position.index] = count

        h_ladder = np.zeros(n_positions)
        for position, count in position_counts.items():
            # the window is pos +/- ladder_window, clipped to the position's own cluster
            cluster = clusters[h_cluster[position.index]]
            window = SequenceRange(
                max(cluster.start, position.pos - ladder_window),
                min(cluster.stop, position.pos + ladder_window))
            h_ladder[position.index] = count / counts[window.slice].sum()
        return h_ladder
Exemple #26
0
 def test_can_not_create_a_sequence_from_range_if_start_and_stop_are_different(
         self):
     """Building a SequencePoint from a range spanning more than one position raises TypeError."""
     with pytest.raises(TypeError):
         # No `assert` here: the constructor call itself must raise. Asserting on its
         # result would only add a truthiness check that can blur the failure.
         SequencePoint(SequenceRange(10, 12))
Exemple #27
0
 def test_can_create_a__point_from_range_if_start_and_stop_are_the_same(
         self):
     """A range built from a single position converts to the equivalent SequencePoint."""
     single_position_range = SequenceRange(10)
     converted = SequencePoint(single_position_range)
     assert converted == SequencePoint(10)
Exemple #28
0
 def test_wierd_stuff(self):
     """A range shifted down by an int still answers contains() for a position inside it."""
     shifted = SequenceRange(10, 20) - 15
     assert shifted.contains(2)
Exemple #29
0
 def test2b(self):
     """A single-position range plus a point compares equal to the plain int 3."""
     total = SequenceRange(2) + SequencePoint(2)
     assert total == 3
Exemple #30
0
 def test_conversion(self):
     """SequenceRange accepts SequencePoint objects wherever it accepts ints."""
     # a lone point becomes a range covering just that position
     from_one_point = SequenceRange(SequencePoint(1))
     assert SequenceRange(1, 1) == from_one_point

     # points and ints can be mixed freely as start and stop
     from_two_points = SequenceRange(SequencePoint(1), SequencePoint(2))
     assert SequenceRange(1, 2) == from_two_points
     point_then_int = SequenceRange(SequencePoint(1), 2)
     assert SequenceRange(1, 2) == point_then_int
     int_then_point = SequenceRange(1, SequencePoint(2))
     assert SequenceRange(1, 2) == int_then_point