Example #1
0
    def test_load_peptide_file(self):
        """Does LoadPeptideFile() store peptide data correctly?

        Loads the FASTA fixture, then the peptide fixture, and checks that each
        expected accession is present and that its peptides match position by
        position.
        """
        import os.path  # local import keeps this test independent of module imports

        # Paths are assembled with os.path.join instead of "TEST_files\T..."
        # literals: "\T" is not a valid escape sequence (modern Python warns
        # about it) and a hard-coded backslash only resolves on Windows.
        self.pdl.LoadFastaFile(os.path.join("TEST_files", "TEST.fasta"))

        pep = peptide.Peptide()
        pep.sequence = "MEDVCGRLVQYRGE"
        pep.accession = "NP_000032"
        pep.pos_start = 125
        pep.pos_end = 138
        expectedpep = {"NP_000032": [pep]}

        path = os.path.join("TEST_files", "TEST_peptides.txt")
        testpep = self.pdl.LoadPeptideFile(path, self.pdl.proteins)

        for accession, expected_list in expectedpep.items():
            # Fail with a readable message as soon as an accession is missing.
            self.assertIn(
                accession, testpep,
                msg="Sequence in expected dictionary not found")
            # Indexing (rather than zip) keeps a too-short result list an error.
            for index, expected in enumerate(expected_list):
                self.assertPeptideEqual(expected, testpep[accession][index])
Example #2
0
 def test_smart_query_peptide(self):
     """Can SmartQuery properly build a peptide query?"""
     # NOTE(review): this peptide is populated but never handed to
     # SmartQuery below; it is kept so the test still exercises the
     # Peptide constructor exactly as before -- confirm whether it was
     # meant to be part of the query input.
     testpeptide = peptide.Peptide()
     testpeptide.sequence = "TMNT"
     testpeptide.accession = "NP_01"
     testpeptide.pos_start = 2
     testpeptide.pos_end = 5

     testquery = self.querybuilder.SmartQuery({"NP_01": 1}, False)
     expectedsubstring = "SELECT * FROM fulltable WHERE (prot_acc = 'NP_01' )"
     # assertIn prints both operands on failure, unlike assertTrue(a in b).
     self.assertIn(expectedsubstring, testquery)
Example #3
0
    def addCalPeptide(self, peps4spec):
        '''
        @brief processes the peptide data for a single spectrum, stopping at
               the first peptide whose sequence is valid
        @param peps4spec <dictionary>: containing all peptides for one spectrum

        The first valid peptide has its sequence, pepno, modsVariable,
        modsFixed, mass, da_delta and score copied onto the spectrum entry
        (self.spectra[query]) together with a 'hitType' of 'REV' or 'FWD'.
        Every invalid peptide seen before it increments
        stats['numfailedpeps'].

        Raises IndexError when no key of peps4spec matches rx_pepmaindata.
        '''
        stats = self.stats
        pepkeys = sorted(x for x in peps4spec if rx_pepmaindata.search(x))

        # the query number is the first '_'-separated field minus its leading character
        query = int(pepkeys[0].split('_')[0][1:])

        spec = self.spectra[query]

        self.logs.datlog.debug('Query %i has %i peptides' % (query, len(pepkeys)))

        # string sorting places a key ending in '10' directly after the first
        # key; move it to the end of the list
        if len(pepkeys) > 1 and pepkeys[1].endswith('10'):
            pepkeys.append(pepkeys.pop(1))

        decoy_prefixes = ('DD', '###REV###', '###RND###')
        copied_attrs = ('sequence', 'pepno', 'modsVariable', 'modsFixed',
                        'mass', 'da_delta', 'score')

        for pep in pepkeys:
            # collect every entry belonging to this peptide: its own key plus
            # any key extending it with a non-digit.  The key is escaped so it
            # is always matched literally.
            rx_pepdata = re.compile('^' + re.escape(pep) + '(?![0-9])')
            allpeptidedata = {key: value for key, value in peps4spec.items()
                              if rx_pepdata.search(key)}

            # first create the peptide instance
            pepobj = peptide.Peptide(allpeptidedata, self)

            # then test if the sequence is valid
            if not pepobj.isValidSequence():
                stats['numfailedpeps'] += 1
                continue

            # only include peptides if the sequence is valid
            self.spectra[query]['numpeps'] += 1
            stats['numpeps'] += 1

            for attr in copied_attrs:
                spec[attr] = getattr(pepobj, attr)

            # 'REV' only when every accession carries a decoy prefix; an empty
            # protein list also yields 'REV', as it did before
            all_decoy = all(prot['accession'].startswith(decoy_prefixes)
                            for prot in pepobj.proteins)
            spec['hitType'] = 'REV' if all_decoy else 'FWD'

            # only process one peptide per query
            break
Example #4
0
 def test_load_peptide(self):
     """Can LoadPeptide() store peptide data from file to data structure?"""
     # "PEPTIDE" sits at 0-based positions 1..7 of the protein string below.
     wanted = peptide.Peptide()
     wanted.sequence, wanted.accession = "PEPTIDE", "NP_01"
     wanted.pos_start, wanted.pos_end = 1, 7

     # Register the parent protein before parsing the peptide line.
     self.pdl.proteins["NP_01"] = "APEPTIDESEQUENCE"

     parsed = self.pdl.LoadPeptide("R.PEPTIDE.C\tref|NP_01.1|gi|111|")
     self.assertPeptideEqual(wanted, parsed)
    def LoadPeptide(self, line):
        """Build a Peptide from one whitespace-separated line of peptide data.

        The first field goes through ParseSequence and the second through
        ParseAccession.  When the parsed accession is a key of self.proteins,
        pos_start is set to the result of ``find`` on the stored protein and
        pos_end to pos_start + len(sequence) - 1; otherwise the positions are
        left as Peptide() created them.

        NOTE(review): if the stored protein is a str and does not contain the
        sequence, ``find`` returns -1, so pos_start becomes -1 and pos_end
        len(sequence) - 2 -- confirm callers expect that.
        """
        fields = line.strip().split()

        result = peptide.Peptide()
        result.sequence = self.ParseSequence(fields[0])
        result.accession = self.ParseAccession(fields[1])

        # Accession unknown to the loaded proteins: no position to compute.
        if result.accession not in self.proteins:
            return result

        parent = self.proteins[result.accession]
        result.pos_start = parent.find(result.sequence)
        result.pos_end = result.pos_start + len(result.sequence) - 1
        return result
Example #6
0
def _read_filtered_results(path, charge, length, qvalue):
    """Return the rows of one result file that pass the charge/length/q-value filters."""
    kept = []
    with open(path) as fr:
        fr.readline()  # the first line is skipped, as before
        for line in fr:
            cols = line.split('\t')
            if (int(cols[8]) == charge
                    and len(get_strip_sequence(cols[9])) == length
                    and float(cols[15]) <= qvalue):
                # SpecFile, scannum, charge, peptide
                kept.append((cols[0], cols[2], cols[8], cols[9]))
    return kept


def _read_extracted_intensities(path, length, ion_type, error_list):
    """Parse one extracted file into {key: intensities}; return (dict, error count)."""
    extracts = {}
    current_key = None
    error_count = 0

    with open(path) as fe:
        for line in fe:
            # a blank line ends the current record
            if line == '\n':
                current_key = None
                continue

            cols = line.split('\t')
            try:
                if line[0] == '>' and len(get_strip_sequence(cols[2])) == length:
                    # header line of a record with the wanted length: open a
                    # new entry with length - 1 zeroed slots
                    current_key = cols[0][1:] + cols[1]
                    extracts[current_key] = [0] * (length - 1)
                elif line[0] == ion_type and current_key is not None:
                    # ion line: the number after the first character is the
                    # 1-based slot, the third column the intensity
                    extracts[current_key][int(cols[0][1:]) - 1] = float(cols[2])
            except Exception:
                # best effort: record the broken file and carry on.  Unlike a
                # bare except, this lets KeyboardInterrupt/SystemExit through.
                error_list.append(fe.name)
                print(fe.name)
                print(cols)
                error_count += 1

    return extracts, error_count


def features_and_intensity(dir_path, charge, length, qvalue, ion_type):
    """Write one feature/intensity text file per 'MergedFDR.tsv' file in dir_path.

    For each file name containing 'MergedFDR.tsv' (in sorted order) the result
    rows passing the filters are paired with the intensities parsed from the
    matching extracted file (name up to '.tsv' plus EXTRACTED).  Every row with
    an intensity entry becomes one line: the peptide features followed by the
    intensities, each value followed by a single space.

    Parameters:
        dir_path: directory holding the result and extracted files.
        charge: value that int(column 8) of a result row must equal.
        length: required length of the stripped peptide sequence.
        qvalue: upper bound (inclusive) for float(column 15) of a result row.
        ion_type: first character of the ion lines to read; also the name of
            the output sub-directory under '../data/'.

    Side effects: prints progress and the error count; writes the per-file
    outputs plus '<tag>_error.txt' and '<tag>_zeros.txt', where <tag> is the
    last five characters of sys.argv[1].
    """
    error = 0
    step = 0
    error_list = []
    zero_sequence = []
    out_dir = '../data/' + ion_type + '/'

    for file_name in sorted(listdir(dir_path)):
        # If result file, not extracted file
        if file_name.find('MergedFDR.tsv') == -1:
            continue
        print(step)

        results = _read_filtered_results(dir_path + '/' + file_name,
                                         charge, length, qvalue)

        extracted_path = (dir_path + '/' + file_name[:file_name.find('.tsv')]
                          + EXTRACTED)
        extracts, failures = _read_extracted_intensities(
            extracted_path, length, ion_type, error_list)
        error += failures

        out_name = (file_name[:file_name.find('MSGF')] + str(length) + '_'
                    + str(charge) + '_' + str(qvalue) + '.txt')
        with open(out_dir + out_name, 'wt', encoding='utf-8') as ffi:
            for result in results:
                # 2: charge, 3: peptide
                p = peptide.Peptide(result[3], result[2], ion_type)
                feat = p.get_features()
                # 0: specfile, 1: scannum
                key = result[0] + result[1]
                if key not in extracts:
                    continue
                intensity = extracts[key]

                ffi.write(''.join(str(feature) + ' ' for feature in feat))
                ffi.write(''.join(str(inten) + ' ' for inten in intensity))

                if sum(float(inten) for inten in intensity) == 0:
                    zero_sequence.append(result[3])

                ffi.write('\n')
                step += 1
    print(error)

    tag = sys.argv[1][-5:]

    # if file is broken
    with open(out_dir + tag + '_error.txt', 'wt', encoding='utf-8') as f_error:
        for broken_name in error_list:
            f_error.write(broken_name + '\n')

    # all intensities are zero
    with open(out_dir + tag + '_zeros.txt', 'wt',
              encoding='utf-8') as f_zero_sequence:
        for sequence in zero_sequence:
            f_zero_sequence.write(sequence + '\n')
Example #7
0
    def addPeptides(self, peps4spec):
        '''
        @brief processes the peptide data for a single spectrum
        @param peps4spec <dictionary>: containing all peptides for one spectrum

        Builds a Peptide for every main peptide key, keeps those with a valid
        sequence, runs doPeptideSetQC on the kept set and then folds each
        peptide with non-zero useinprot and retain into self.sequences
        (count per sequence) and self.seq2acc (per-sequence summary: prots,
        hook, numpep, hookscore, pepscore, bestczrank).

        Raises IndexError when no key of peps4spec matches rx_pepmaindata.
        '''
        stats = self.stats
        pepkeys = sorted(x for x in peps4spec if rx_pepmaindata.search(x))

        # the query number is the first '_'-separated field minus its leading character
        query = int(pepkeys[0].split('_')[0][1:])
        self.logs.datlog.debug('Query %i has %i peptides' % (query, len(pepkeys)))

        # string sorting places a key ending in '10' directly after the first
        # key; move it to the end of the list
        if len(pepkeys) > 1 and pepkeys[1].endswith('10'):
            pepkeys.append(pepkeys.pop(1))
        peplist = []
        seq2acc = self.seq2acc
        sequences = self.sequences
        self.peptidecounter += len(pepkeys)

        for pepkey in pepkeys:
            # collect every entry belonging to this peptide: its own key plus
            # any key extending it with a non-digit.  The key is escaped so it
            # is always matched literally.
            rx_pepdata = re.compile('^' + re.escape(pepkey) + '(?![0-9])')
            allpeptidedata = {key: value for key, value in peps4spec.items()
                              if rx_pepdata.search(key)}

            # first create the peptide instance
            pepobj = peptide.Peptide(allpeptidedata, self)

            # then test if the sequence is valid
            if pepobj.isValidSequence():
                # only include peptides if the sequence is valid
                self.spectra[query]['numpeps'] += 1
                stats['numpeps'] += 1
                peplist.append(pepobj)
            else:
                stats['numfailedpeps'] += 1

        # now do the QC of the peptide set
        if peplist:
            self.doPeptideSetQC(peplist)

        hasHook = ''
        for pepobj in peplist:
            if pepobj.useinprot == 0 or pepobj.retain == 0:
                continue
            seq = pepobj.sequence
            score = pepobj.score
            if pepobj.is_hook:
                hookscore = score
                hasHook = ', has hook peptide'
            else:
                hookscore = 0.0

            # build dictionary of pep sequence to protein accession
            if seq not in sequences:
                sequences[seq] = 1
                seq2acc[seq] = dict(prots=pepobj.proteins[:], hook=pepobj.is_hook,
                                    numpep=1, hookscore=hookscore,
                                    pepscore=score, bestczrank=pepobj.pepno)
                continue

            sequences[seq] += 1
            # accumulate data independently: keep the highest hook flag,
            # hook score and peptide score, and the lowest pepno
            entry = seq2acc[seq]
            entry['numpep'] += 1
            if pepobj.is_hook > entry['hook']:
                entry['hook'] = pepobj.is_hook
            if hookscore > entry['hookscore']:
                entry['hookscore'] = hookscore
            if score > entry['pepscore']:
                entry['pepscore'] = score
            if pepobj.pepno < entry['bestczrank']:
                entry['bestczrank'] = pepobj.pepno

        # NOTE(review): the count logged is len(peplist), which still includes
        # peptides skipped above for useinprot/retain == 0 -- confirm intended.
        self.logs.datlog.debug('%i peptides pass QC%s' % (len(peplist), hasHook))