Example #1
    def find_shitty_decoys(self):
        """
        Finds and notes decoys that share their sequence with a target PSM.

        Also counts the number of targets and decoys to get a quick estimate
        of how many positive/negative training examples can be "claimed".
        """
        target_seqs = set()
        decoy_seqs = set()
        with open(self.csv_path, "r") as f:
            reader = csv.DictReader(f)
            # iterate over PSMs from best to worst score
            sorted_reader = sorted(
                reader,
                reverse=self["bigger_scores_better"],
                key=lambda d: float(d[self.col_for_sorting]),
            )

            for row in sorted_reader:
                self.observed_charges.add(int(row["Charge"]))
                if row_is_decoy(row):
                    decoy_seqs.add(unify_sequence(row["Sequence"]))
                    self.counter["decoy"] += 1
                else:
                    target_seqs.add(unify_sequence(row["Sequence"]))
                    self.counter["target"] += 1

        self.shitty_decoy_seqs = target_seqs.intersection(decoy_seqs)
        if len(self.shitty_decoy_seqs) > 0:
            print(
                "Warning! Found {0} sequences that are target AND decoy "
                "(immutable peptides?). These will not be used for training.\n"
                .format(len(self.shitty_decoy_seqs)))
        return
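
The helpers row_is_decoy and unify_sequence come from elsewhere in the package and are not shown on this page. Below is a minimal sketch of what they might look like, assuming decoys are flagged in an "Is decoy" column and that unification upper-cases the peptide and collapses the isobaric residues I/L; both are assumptions, not the project's actual code:

def row_is_decoy(row):
    # Assumption: the unified CSV marks decoy PSMs in an "Is decoy" column.
    return str(row.get("Is decoy", "")).strip().lower() in ("true", "1")

def unify_sequence(seq):
    # Assumption: unification upper-cases the peptide and treats the
    # isobaric residues leucine (L) and isoleucine (I) as identical.
    return seq.upper().replace("L", "I")
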
Example #2
    def get_psm_category(self, row):
        """
        Determines whether a PSM (csv row) should be used as a negative or
        positive training example.

        Returns
            1  - high-scoring target (positive training example)
            0  - not-high-scoring target (not usable for training)
           -1  - decoy (negative training example)
        """
        category = 0  # unknown (mix of true positives and false positives)
        self.PSM_count += 1  # for FDR calculation
        sequence = unify_sequence(row["Sequence"])
        psm_FDR = calc_FDR(self.PSM_count, self.decoy_count)

        if row_is_decoy(row):
            self.decoy_count += 1
            if psm_FDR <= 0.25 and sequence not in self.shitty_decoy_seqs:
                category = -1  # decoy (false positive hits)
                self.counter["negative"] += 1
            else:
                if not self.decoy_train_prob:
                    # Lazily pick the probability of claiming each of the
                    # remaining decoys, aiming for roughly twice as many
                    # negative examples as positives (floor of 0.1 %).
                    need_max = self.counter["positive"] * 2
                    have = self.counter["negative"]
                    still_there = self.counter["decoy"] - have
                    prob = need_max / still_there
                    if prob < 0.001:
                        prob = 0.001
                    self.decoy_train_prob = prob
                    print()
                    print(self.counter)
                    print("need max:", need_max)
                    print("have:", have)
                    print("still_there:", still_there)
                    print("probability:", self.decoy_train_prob)
                    print()
                if (self.decoy_train_prob >= 1.0
                        or random() <= self.decoy_train_prob):
                    category = -1  # decoy (false positive hits)
                    self.counter["negative"] += 1

        else:  # row is target
            if (psm_FDR <= self["fdr_cutoff"]
                    and sequence not in self.shitty_decoy_seqs):
                category = 1  # high quality target (almost certainly true positives)
                self.counter["positive"] += 1

        if category == 0:
            self.counter["unknown"] += 1
        return (category, psm_FDR)
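
calc_FDR is likewise imported from the package. The stand-in below sketches the classic target-decoy estimator; the project's actual formula may differ (e.g. it may correct the target count for decoy hits):

def calc_FDR(psm_count, decoy_count):
    # Stand-in for the package's calc_FDR helper (an assumption, not the
    # real implementation): each decoy hit above the current score is
    # taken to imply roughly one false positive among the target hits.
    target_count = psm_count - decoy_count
    if target_count <= 0:
        return 1.0
    return decoy_count / target_count
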
Example #3
    def collect_data(self):
        """
        parses a unified csv file and collects features from each row
        """
        categories = []
        list_of_feature_lists = []
        feature_sets = set()
        with open(self.csv_path, "r") as f:
            reader = csv.DictReader(f)
            # collecting some stats for FDR calculation:
            self.PSM_count = 0
            self.decoy_count = 0

            if self["dump_svm_matrix"]:
                self.init_svm_matrix_dump()
                additional_matrix_info = []

            for i, row in enumerate(
                    sorted(
                        reader,
                        reverse=self["bigger_scores_better"],
                        key=lambda d: float(d[self.col_for_sorting]),
                    )):

                features = self.row_to_features(row)

                if tuple(features) in feature_sets:
                    continue
                feature_sets.add(tuple(features))

                category, psm_FDR = self.get_psm_category(row)

                list_of_feature_lists.append(features)
                categories.append(category)

                if self["dump_svm_matrix"]:
                    label = -1 if row_is_decoy(row) else 1
                    sequence = "{0}.{1}#{2}.{3}".format(
                        row["Sequence Pre AA"].strip(),
                        row["Sequence"].strip(),
                        row["Modifications"].strip(),
                        row["Sequence Post AA"].strip(),
                    )
                    additional_matrix_info.append({
                        "psm_id": row["Spectrum Title"].strip(),
                        "label": label,
                        "scannr": row["Spectrum Title"].strip().split(".")[-2],
                        "peptide": sequence,
                        "proteins": self.parse_protein_ids(row["Protein ID"]),
                    })
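                    # The fields collected here mirror Percolator's
                    # tab-separated PIN input format: SpecId, Label,
                    # ScanNr, the feature columns, Peptide, Proteins.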

                if i % 1000 == 0:
                    score_val = float(row[self.col_for_sorting])
                    msg = ("Generating feature matrix from input csv "
                           "(line ~{0}) with score {1} and FDR "
                           "{2}".format(i, score_val, psm_FDR))
                    print(msg, end="\r")

        # All data points are collected in one big matrix, to make standardization possible
        print("\nConverting feature matrix to NumPy array...")
        X_raw = np.array(list_of_feature_lists, dtype=float)

        print("Replacing empty/NaN values with the mean of each column...")
        self.nan_replacer = Imputer()
        self.nan_replacer.fit(X_raw)
        X_raw = self.nan_replacer.transform(X_raw)
        # Standardize input matrix to ease machine learning! Scaled data has zero mean and unit variance
        print("Standardizing input matrix...")
        self.scaler = SCALER.fit(X_raw)
        self.X = self.scaler.transform(X_raw)
        self.categories = np.array(categories)
        print()

        if self["dump_svm_matrix"]:
            print("Dumping SVM matrix to", self["dump_svm_matrix"])

            for i, matrix_row in enumerate(self.X):
                matrix_row_info = additional_matrix_info[i]
                self.dump_svm_matrix_row(
                    row=list(matrix_row),
                    psm_id=matrix_row_info["psm_id"],
                    label=matrix_row_info["label"],
                    scannr=matrix_row_info["scannr"],
                    peptide=matrix_row_info["peptide"],
                    proteins=matrix_row_info["proteins"],
                )

            print("Dumped SVM matrix to", self["dump_svm_matrix"])
        return
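
Note: Imputer only exists in scikit-learn releases before 0.22, and SCALER is presumably a StandardScaler (or similar) instance defined elsewhere in the module. The sketch below shows the same impute-then-standardize step on current scikit-learn, using made-up toy data:

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

X_raw = np.array([[1.0, np.nan], [3.0, 4.0], [5.0, 6.0]])

# replace NaN with the column mean, as the old Imputer() did by default
nan_replacer = SimpleImputer(strategy="mean")
X_filled = nan_replacer.fit_transform(X_raw)

# scale to zero mean and unit variance, matching the SCALER step above
scaler = StandardScaler().fit(X_filled)
X = scaler.transform(X_filled)
print(X)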