Esempio n. 1
0
    def find_shitty_decoys(self):
        """
        Collect the sequences that occur both as a target and as a decoy PSM.

        Every row of the CSV at ``self.csv_path`` contributes its charge to
        ``self.observed_charges`` and is tallied in ``self.counter`` under
        "decoy" or "target", which gives a quick estimate of how many
        positive/negative training examples can be "claimed". Sequences seen
        in both groups end up in ``self.shitty_decoy_seqs`` and a warning is
        printed if there are any.
        """
        seqs_by_kind = {"target": set(), "decoy": set()}
        with open(self.csv_path, "r") as csv_file:
            psm_rows = csv.DictReader(csv_file)
            descending = self["bigger_scores_better"]
            psm_rows = list(psm_rows)
            psm_rows.sort(
                key=lambda psm: float(psm[self.col_for_sorting]),
                reverse=descending,
            )

            for psm in psm_rows:
                self.observed_charges.add(int(psm["Charge"]))
                kind = "decoy" if row_is_decoy(psm) else "target"
                seqs_by_kind[kind].add(unify_sequence(psm["Sequence"]))
                self.counter[kind] += 1

        # sequences that appear on both sides cannot be told apart
        self.shitty_decoy_seqs = seqs_by_kind["target"] & seqs_by_kind["decoy"]
        if self.shitty_decoy_seqs:
            warning = (
                "Warning! Found {0} sequences that are target AND decoy "
                "(immutable peptides?). These will not be used for training.\n"
            )
            print(warning.format(len(self.shitty_decoy_seqs)))
Esempio n. 2
0
    def find_shitty_decoys(self):
        '''
        Note which sequences are shared between target and decoy PSMs.

        While reading the CSV at ``self.csv_path`` the charge of every row is
        added to ``self.observed_charges`` and the row is counted in
        ``self.counter`` as 'decoy' or 'target' (a quick estimate of how many
        positive/negative training examples can be "claimed"). The shared
        sequences are stored in ``self.shitty_decoy_seqs``; a warning is
        printed when that set is not empty.
        '''
        def score_of(psm):
            # numeric value of the column the PSMs are sorted by
            return float(psm[self.col_for_sorting])

        targets, decoys = set(), set()
        with open(self.csv_path, 'r') as handle:
            ordered_psms = sorted(
                csv.DictReader(handle),
                reverse=self['bigger_scores_better'],
                key=score_of,
            )

        for psm in ordered_psms:
            self.observed_charges.add(int(psm['Charge']))
            if row_is_decoy(psm):
                bucket, label = decoys, 'decoy'
            else:
                bucket, label = targets, 'target'
            bucket.add(unify_sequence(psm['Sequence']))
            self.counter[label] += 1

        self.shitty_decoy_seqs = targets.intersection(decoys)
        n_shared = len(self.shitty_decoy_seqs)
        if n_shared > 0:
            print(
                'Warning! Found {0} sequences that are target AND decoy '
                '(immutable peptides?). These will not be used for training.\n'.format(n_shared)
            )
Esempio n. 3
0
    def get_psm_category(self, row):
        """
        Decide whether a PSM (csv row) is used as a training example.

        Each call increments ``self.PSM_count``; the FDR of the row is
        computed from ``self.PSM_count`` and ``self.decoy_count`` before a
        decoy row increments ``self.decoy_count``. The matching entry of
        ``self.counter`` ("positive", "negative" or "unknown") is increased.

        returns a tuple (category, psm_FDR) where category is
            1  - target within the FDR cutoff (positive training example)
            0  - PSM that is not used for training
           -1  - decoy (negative training example)

        """
        self.PSM_count += 1  # for FDR calculation
        peptide_seq = unify_sequence(row["Sequence"])
        psm_FDR = calc_FDR(self.PSM_count, self.decoy_count)

        if not row_is_decoy(row):
            # targets only count when they pass the cutoff and are unambiguous
            if (psm_FDR <= self["fdr_cutoff"]
                    and peptide_seq not in self.shitty_decoy_seqs):
                self.counter["positive"] += 1
                return (1, psm_FDR)
            self.counter["unknown"] += 1
            return (0, psm_FDR)

        self.decoy_count += 1
        take_decoy = (psm_FDR <= 0.25
                      and peptide_seq not in self.shitty_decoy_seqs)
        if not take_decoy:
            if not self.decoy_train_prob:
                # sampling probability is derived while it is still unset:
                # aim for at most twice as many negatives as positives
                need_max = self.counter["positive"] * 2
                have = self.counter["negative"]
                still_there = self.counter["decoy"] - have
                self.decoy_train_prob = max(need_max / still_there, 0.001)
                print()
                print(self.counter)
                print("need max:", need_max)
                print("have:", have)
                print("still_there:", still_there)
                print("probability:", self.decoy_train_prob)
                print()
            # remaining decoys are drawn at random with that probability
            take_decoy = (self.decoy_train_prob >= 1.0
                          or random() <= self.decoy_train_prob)

        if take_decoy:
            self.counter["negative"] += 1
            return (-1, psm_FDR)
        self.counter["unknown"] += 1
        return (0, psm_FDR)
Esempio n. 4
0
    def get_psm_category(self, row):
        '''
        Classify a PSM (csv row) as positive, negative or unused for training.

        ``self.PSM_count`` is incremented on every call and
        ``self.decoy_count`` on every decoy row (after the FDR of the row was
        computed). Exactly one of the ``self.counter`` entries 'positive',
        'negative' or 'unknown' is increased per call.

        returns a tuple (category, psm_FDR) where category is
            1  - target within the FDR cutoff (positive training example)
            0  - PSM that is not used for training
           -1  - decoy (negative training example)

        '''
        counter_key = {1: 'positive', 0: 'unknown', -1: 'negative'}

        self.PSM_count += 1  # for FDR calculation
        sequence = unify_sequence(row['Sequence'])
        psm_FDR = calc_FDR(self.PSM_count, self.decoy_count)

        if row_is_decoy(row):
            self.decoy_count += 1
            if psm_FDR <= 0.25 and sequence not in self.shitty_decoy_seqs:
                category = -1  # decoy (false positive hits)
            else:
                if not self.decoy_train_prob:
                    # computed while still unset; the counters do not yet
                    # include the current row
                    wanted = self.counter['positive'] * 2
                    claimed = self.counter['negative']
                    remaining = self.counter['decoy'] - claimed
                    probability = wanted / remaining
                    self.decoy_train_prob = 0.001 if probability < 0.001 else probability
                    print()
                    print(self.counter)
                    print('need max:', wanted)
                    print('have:', claimed)
                    print('still_there:', remaining)
                    print('probability:', self.decoy_train_prob)
                    print()
                sampled = self.decoy_train_prob >= 1.0 or random() <= self.decoy_train_prob
                category = -1 if sampled else 0
        elif psm_FDR <= self['fdr_cutoff'] and sequence not in self.shitty_decoy_seqs:
            category = 1  # high quality target (almost certainly true positives)
        else:
            category = 0  # unknown (mix of true positives and false positives)

        self.counter[counter_key[category]] += 1
        return (category, psm_FDR)
Esempio n. 5
0
    def row_to_features(self, row):
        """
        Turn one unified CSV row into a list of numeric SVM features.

        The list holds, in this order: score, rank and score differences
        within the spectrum, charge, mass deviations, missed-cleavage
        information, enzymatic-termini flags, mass, peptide length, the
        intra-set counts, one flag per charge in ``self.observed_charges``
        and the convertible fields of ``self.used_extra_fields``.
        """
        seq = unify_sequence(row["Sequence"])
        z = field_to_float(row["Charge"])
        psm_score = field_to_bayes_float(row[self.col_for_sorting])
        _calc_mz, exp_mz, calc_mass, exp_mass = get_mz_values(row)

        # enzymatic termini: 1 only if every flanking residue is tryptic
        pre_field = row["Sequence Pre AA"]
        post_field = row["Sequence Post AA"]
        pre_aas = set(re.split(self.delim_regex, pre_field))
        post_aas = set(re.split(self.delim_regex, post_field))
        enzN = int(all(aa in self.tryptic_aas for aa in pre_aas))
        enzC = int(all(aa in self.tryptic_aas for aa in post_aas))

        # R/K anywhere but the last position count as missed cleavages
        n_missed = sum(1 for aa in seq[:-1] if aa in ("R", "K"))
        missed_flags = [0] * 6
        if n_missed < len(missed_flags):
            missed_flags[n_missed] = 1
        else:
            # counts beyond the last slot are marked with a 2 in that slot
            missed_flags[-1] = 2

        spectrum_title = row["Spectrum Title"].strip()
        mass = (exp_mz * z) - (z - 1) * PROTON
        pep_len = len(seq)
        delta_mass = calc_mass - exp_mass

        peptide = (seq, row["Modifications"])
        proteins = self.parse_protein_ids(row["Protein ID"])
        num_pep = self.num_pep[peptide]
        n_charge_states = len(self.pep_charge_states[peptide])
        n_seq_mods = len(self.seq_mods[seq])
        num_spec = len(self.num_spec[row["Spectrum Title"]])
        num_prot = sum(len(self.num_prot[prot]) for prot in proteins)
        pep_site = sum(len(self.pep_site[prot]) for prot in proteins)

        extra_features = []
        for field in self.used_extra_fields:
            if field == self.col_for_sorting:
                continue
            try:
                extra_features.append(field_to_float(row[field]))
            except ValueError:
                # NOTE(review): a field that cannot be converted is skipped,
                # which shortens the vector of this row - confirm intended
                pass

        charge_flags = dict.fromkeys(self.pep_charge_states[peptide], 1)
        is_shitty = 1 if seq in self.shitty_decoy_seqs else 0

        spectrum_scores = sorted(
            set(self.score_list_dict[spectrum_title]),
            reverse=self["bigger_scores_better"],
        )

        try:
            scaled = scale_scores(spectrum_scores)
            rank = spectrum_scores.index(psm_score)
            # difference between current and second entry of the scaled list
            deltLCn = scaled[rank] - scaled[1]
            # difference between current and last entry of the scaled list
            deltCn = scaled[rank] - scaled[-1]
        except (ValueError, IndexError, AssertionError):
            # NaN values will be replaced by the column mean later
            # NaN values are entered when there is no ranking
            # e.g. when only one peptide was matched to the spectrum.
            rank = deltLCn = deltCn = np.nan

        features = [
            psm_score,
            rank,
            deltCn,
            deltLCn,
            z,
            delta_mass,
            abs(delta_mass),
            n_missed / pep_len,
        ]
        features += missed_flags
        features += [
            enzN,
            enzC,
            mass,
            pep_len,
            num_pep,
            num_prot,
            pep_site,
            is_shitty,
            n_charge_states,
            num_spec,
            n_seq_mods,
        ]
        features.extend(
            charge_flags.get(charge_n, 0) for charge_n in self.observed_charges
        )

        return features + extra_features
Esempio n. 6
0
    def count_intra_set_features(self):
        """
        Count the intra-set features of all PSMs in the unified CSV.

        intra-set features as calculated by Percolator:
        - num_pep:  Number of PSMs for which this is the best scoring peptide.
        - num_prot: Number of times the matched protein matches other PSMs.
        - pep_site: Number of different peptides that match this protein.

        own ideas:
        - pep_charge_states: in how many charge states was the peptide found?
        - seq_mods: in how many mod states was the AA-sequence found?
        - num_spec: Number of times the matched spectrum matches other peptides.

        The rows of ``self.csv_path`` are sorted with ``self.sort_by_rank``
        and consecutive rows sharing a "Spectrum Title" are handled as one
        spectrum (including the last one in the file). PSMs whose sequence is
        in ``self.shitty_decoy_seqs`` are ignored. Besides the lookup tables
        listed above, ``self.score_list_dict`` (scores per spectrum title) is
        filled and ``self.maximum_proteins_per_line`` is raised if needed.
        """
        # local import keeps this method usable without touching file imports
        from itertools import groupby

        print("Counting intra-set features...")
        self.num_pep = defaultdict(int)
        self.num_prot = defaultdict(set)
        self.pep_site = defaultdict(set)
        self.score_list_dict = defaultdict(list)

        self.pep_charge_states = defaultdict(set)
        self.seq_mods = defaultdict(set)
        self.num_spec = defaultdict(set)

        with open(self.csv_path, "r") as f:
            sorted_rows = sorted(
                csv.DictReader(f),
                reverse=self["bigger_scores_better"],
                key=self.sort_by_rank,
            )

        usable_rows = (
            row for row in sorted_rows
            if unify_sequence(row["Sequence"]) not in self.shitty_decoy_seqs
        )

        # groupby yields every run of equal spectrum titles, so the final
        # spectrum is processed as well
        for spec_title, group in groupby(
                usable_rows, key=lambda r: r["Spectrum Title"]):
            rows_of_spectrum = list(group)
            self.score_list_dict[spec_title] = [
                field_to_bayes_float(r[self.col_for_sorting])
                for r in rows_of_spectrum
            ]

            for rank, line in enumerate(rows_of_spectrum):
                uni_sequence = unify_sequence(line["Sequence"])
                modifications = line["Modifications"]
                peptide = (uni_sequence, modifications)

                # multiple proteins are separated by ';' and the 'decoy_'
                # tag is removed before the IDs are used as keys
                proteins = set(
                    line["Protein ID"].replace("decoy_", "").split(";"))

                if len(proteins) > self.maximum_proteins_per_line:
                    self.maximum_proteins_per_line = len(proteins)

                if rank == 0:
                    # this is the 'best' peptide for that spectrum
                    self.num_pep[peptide] += 1
                for protein in proteins:
                    self.num_prot[protein].add(
                        (spec_title, uni_sequence, modifications))
                    self.pep_site[protein].add(peptide)

                # charge and modifications belong to the PSM being processed
                # (`line`), not to the first PSM of the following spectrum
                self.pep_charge_states[peptide].add(int(line["Charge"]))
                self.seq_mods[uni_sequence].add(modifications)
                self.num_spec[spec_title].add(peptide)
Esempio n. 7
0
    def row_to_features(self, row):
        '''
        Build the numeric SVM feature list for one unified CSV row.

        Order of the returned values: score, rank and score differences
        within the spectrum, charge, mass deviations, missed-cleavage
        information, enzymatic-termini flags, mass, peptide length, the
        intra-set counts, one flag per charge in ``self.observed_charges``
        and the convertible fields of ``self.used_extra_fields``.
        '''
        sequence = unify_sequence(row['Sequence'])
        charge = field_to_float(row['Charge'])
        score = field_to_bayes_float(row[self.col_for_sorting])
        _unused_calc_mz, exp_mz, calc_mass, exp_mass = get_mz_values(row)

        pre_aa_field = row['Sequence Pre AA']
        post_aa_field = row['Sequence Post AA']
        n_term_aas = set(re.split(self.delim_regex, pre_aa_field))
        c_term_aas = set(re.split(self.delim_regex, post_aa_field))

        # a terminus is enzymatic only when all flanking residues are tryptic
        enzN = 0 if any(aa not in self.tryptic_aas for aa in n_term_aas) else 1
        enzC = 0 if any(aa not in self.tryptic_aas for aa in c_term_aas) else 1

        # the last residue is excluded from the R/K count
        cleavage_sites = [aa for aa in sequence[:-1] if aa in ('R', 'K')]
        n_missed_cleavages = len(cleavage_sites)

        # counts that do not fit a slot are marked with a 2 in the last slot
        missed_cleavages = [0] * 6
        missed_cleavages[min(n_missed_cleavages, 5)] = 1 if n_missed_cleavages < 6 else 2

        spectrum = row['Spectrum Title'].strip()
        mass = (exp_mz * charge) - (charge - 1) * PROTON
        pep_len = len(sequence)
        delta_mass = calc_mass - exp_mass

        peptide = (sequence, row['Modifications'])
        proteins = self.parse_protein_ids(row['Protein ID'])
        num_pep = self.num_pep[peptide]
        charge_states_of_peptide = self.pep_charge_states[peptide]
        pep_charge_states = len(charge_states_of_peptide)
        seq_mods = len(self.seq_mods[sequence])
        num_spec = len(self.num_spec[row['Spectrum Title']])
        num_prot = sum(len(self.num_prot[protein]) for protein in proteins)
        pep_site = sum(len(self.pep_site[protein]) for protein in proteins)

        user_specified_features = []
        for feat in self.used_extra_fields:
            if feat == self.col_for_sorting:
                continue
            try:
                value = field_to_float(row[feat])
            except ValueError:
                # NOTE(review): unconvertible fields are left out, so this
                # row gets a shorter vector - confirm that is intended
                continue
            user_specified_features.append(value)

        is_shitty = int(sequence in self.shitty_decoy_seqs)

        score_list = sorted(
            set(self.score_list_dict[spectrum]),
            reverse=self['bigger_scores_better']
        )

        try:
            score_list_scaled = scale_scores(score_list)
            rank = score_list.index(score)
            own_scaled = score_list_scaled[rank]
            # difference between current and second entry of the scaled list
            deltLCn = own_scaled - score_list_scaled[1]
            # difference between current and last entry of the scaled list
            deltCn = own_scaled - score_list_scaled[-1]
        except (ValueError, IndexError, AssertionError):
            # NaN values will be replaced by the column mean later
            # NaN values are entered when there is no ranking
            # e.g. when only one peptide was matched to the spectrum.
            rank, deltLCn, deltCn = (np.nan,) * 3

        features = [score, rank, deltCn, deltLCn, charge]
        features += [delta_mass, abs(delta_mass), n_missed_cleavages / pep_len]
        features += missed_cleavages
        features += [enzN, enzC, mass, pep_len]
        features += [num_pep, num_prot, pep_site, is_shitty]
        features += [pep_charge_states, num_spec, seq_mods]

        # one flag per observed charge: was this peptide seen with it?
        features += [
            1 if charge_n in charge_states_of_peptide else 0
            for charge_n in self.observed_charges
        ]

        return features + user_specified_features
Esempio n. 8
0
    def count_intra_set_features(self):
        '''
        Count the intra-set features of all PSMs in the unified CSV.

        intra-set features as calculated by Percolator:
        - num_pep:  Number of PSMs for which this is the best scoring peptide.
        - num_prot: Number of times the matched protein matches other PSMs.
        - pep_site: Number of different peptides that match this protein.

        own ideas:
        - pep_charge_states: in how many charge states was the peptide found?
        - seq_mods: in how many mod states was the AA-sequence found?
        - num_spec: Number of times the matched spectrum matches other peptides.

        The rows of ``self.csv_path`` are sorted with ``self.sort_by_rank``
        and consecutive rows sharing a 'Spectrum Title' are handled as one
        spectrum (including the last one in the file). PSMs whose sequence is
        in ``self.shitty_decoy_seqs`` are ignored. Besides the lookup tables
        listed above, ``self.score_list_dict`` (scores per spectrum title) is
        filled and ``self.maximum_proteins_per_line`` is raised if needed.
        '''
        print('Counting intra-set features...')
        self.num_pep = defaultdict(int)
        self.num_prot = defaultdict(set)
        self.pep_site = defaultdict(set)
        self.score_list_dict = defaultdict(list)

        self.pep_charge_states = defaultdict(set)
        self.seq_mods = defaultdict(set)
        self.num_spec = defaultdict(set)

        def register_spectrum(spec_title, spectrum_rows):
            # Book all PSMs collected for one spectrum into the lookup tables.
            self.score_list_dict[spec_title] = [
                field_to_bayes_float(r[self.col_for_sorting]) for r in spectrum_rows
            ]

            for rank, line in enumerate(spectrum_rows):
                uni_sequence = unify_sequence(line['Sequence'])
                peptide = (uni_sequence, line['Modifications'])

                # multiple proteins are separated by ';' and the 'decoy_'
                # tag is removed before the IDs are used as keys
                proteins = set(line['Protein ID'].replace('decoy_', '').split(';'))

                if len(proteins) > self.maximum_proteins_per_line:
                    self.maximum_proteins_per_line = len(proteins)

                if rank == 0:
                    # this is the 'best' peptide for that spectrum
                    self.num_pep[peptide] += 1
                for protein in proteins:
                    self.num_prot[protein].add(
                        (line['Spectrum Title'], uni_sequence, line['Modifications'])
                    )
                    self.pep_site[protein].add(peptide)

                # charge and modifications belong to the PSM being processed
                # (`line`), not to the first PSM of the following spectrum
                self.pep_charge_states[peptide].add(int(line['Charge']))
                self.seq_mods[uni_sequence].add(line['Modifications'])
                self.num_spec[line['Spectrum Title']].add(peptide)

        with open(self.csv_path, 'r') as f:
            reader = csv.DictReader(f)
            previous_spec_title = None
            rows_of_spectrum = []

            for row in sorted(
                reader,
                reverse=self['bigger_scores_better'],
                key=self.sort_by_rank
            ):

                if unify_sequence(row['Sequence']) in self.shitty_decoy_seqs:
                    continue
                current_spec_title = row['Spectrum Title']
                if rows_of_spectrum and current_spec_title != previous_spec_title:
                    # the next spectrum started, so let's process the info we
                    # collected for the previous spectrum:
                    register_spectrum(previous_spec_title, rows_of_spectrum)
                    rows_of_spectrum = []

                rows_of_spectrum.append(row)
                previous_spec_title = current_spec_title

            # the rows of the final spectrum are still pending after the loop
            if rows_of_spectrum:
                register_spectrum(previous_spec_title, rows_of_spectrum)