Beispiel #1
0
def create_psm_lookup(fn, header, proteins, pgdb, shiftrows, unroll, specfncol,
                      fastadelim, genefield):
    """Read PSMs from a TSV file and store them in the database in chunks.

    The file is read twice: a first pass collects the peptide sequences so
    that sequences not yet known to the database can be stored, a second pass
    stores the PSMs themselves in batches of DB_STORE_CHUNK rows. Afterwards
    the PSM table is indexed and the PSM/protein relations are stored.

    Args:
        fn: PSM table file, passed to tsvreader.generate_split_tsv_lines.
        header: header fields of the PSM table.
        proteins: passed on unchanged to store_psm_protein_relations.
        pgdb: database backend object used for all lookups and storage.
        shiftrows: offset added to the zero-based row number of each PSM.
        unroll: passed to the tsvreader PSM parsing helpers.
        specfncol: header field name of the spectra file column.
        fastadelim: not used by this function.
        genefield: not used by this function.
    """
    mzmlmap = pgdb.get_mzmlfile_map()
    # A dict rather than a set: it deduplicates while keeping first-seen
    # order, so sequences are handed to the database in file order.
    sequences = {}
    for psm in tsvreader.generate_split_tsv_lines(fn, header):
        sequences[tsvreader.get_psm_sequence(psm, unroll)] = 1
    pepseqmap = pgdb.get_peptide_seq_map()
    pgdb.store_pepseqs(((seq, ) for seq in sequences if seq not in pepseqmap))
    # Re-read the map so it also contains the sequences stored just now
    pepseqmap = pgdb.get_peptide_seq_map()
    psms = []
    for row, psm in enumerate(tsvreader.generate_split_tsv_lines(fn, header)):
        specfn, psm_id, specscanid, seq, score = tsvreader.get_psm(
            psm, unroll, specfncol)
        mzml_id = mzmlmap[specfn]
        psms.append({
            'rownr': row + shiftrows,
            'psm_id': psm_id,
            'seq': pepseqmap[seq],
            'score': score,
            'specfn': mzml_id,
            'spec_id': '{}_{}'.format(mzml_id, specscanid),
        })
        # Flush only full chunks; checking after the append avoids the
        # pointless store of an empty list on the first row.
        if len(psms) >= DB_STORE_CHUNK:
            pgdb.store_psms(psms)
            psms = []
    if psms:
        pgdb.store_psms(psms)
    pgdb.index_psms()
    store_psm_protein_relations(fn, header, pgdb, proteins, specfncol)
Beispiel #2
0
 def get_quant(self, theader, features):
     """Add precursor and/or isobaric quantification to the features.

     When self.precursor is set, the MS1 area header field is appended to
     self.header and features are passed through the top-3 MS1 quant
     function. When self.quantcolpattern is set, isobaric quant columns are
     located in the PSM file header, self.header is extended with them and
     features are passed through the isobaric ratio summarizer. The process
     exits with status 1 when no usable denominator/median option is
     configured. Returns the (possibly wrapped) features.
     """
     if self.precursor:
         target_peptides = tsvreader.generate_split_tsv_lines(
             self.fn, theader)
         self.header.append(prottabledata.HEADER_AREA)
         features = proteins.add_ms1_quant_from_top3_mzidtsv(
             features, target_peptides, self.headeraccfield,
             self.fixedfeatcol)
     if not self.quantcolpattern:
         return features

     psm_header = tsvreader.get_tsv_header(self.psmfile)
     denominators = False
     if self.denomcols is not None:
         denominators = [
             self.number_to_headerfield(colnr, psm_header)
             for colnr in self.denomcols
         ]
     elif self.denompatterns is not None:
         matches_per_pattern = [
             tsvreader.get_columns_by_pattern(psm_header, pat)
             for pat in self.denompatterns
         ]
         denominators = {
             field for fields in matches_per_pattern for field in fields
         }
     elif not (self.mediansweep or self.medianintensity):
         print(
             'Must define either denominator column numbers '
             'or regex pattterns to find them, or use median sweep, or '
             'report median intensities.')
         sys.exit(1)
     elif self.medianintensity and self.mediannormalize:
         print(
             'Cannot do median-centering on intensity values, exiting')
         sys.exit(1)

     channels = tsvreader.get_columns_by_pattern(
         psm_header, self.quantcolpattern)
     norm_factors = False
     if self.mednorm_factors:
         factor_header = tsvreader.get_tsv_header(self.mednorm_factors)
         norm_factors = tsvreader.generate_split_tsv_lines(
             self.mednorm_factors, factor_header)
     psm_count_fields = [
         isosummarize.get_no_psms_field(channel) for channel in channels
     ]
     self.header = self.header + channels + psm_count_fields + [
         prottabledata.HEADER_NO_FULLQ_PSMS
     ]
     return isosummarize.get_isobaric_ratios(
         self.psmfile, psm_header, channels, denominators,
         self.mediansweep, self.medianintensity, self.median_or_avg,
         self.minint, features, self.headeraccfield, self.fixedfeatcol,
         False, False, False, self.logisoquant, self.mediannormalize,
         norm_factors, self.keepnapsms)
Beispiel #3
0
 def set_features(self):
     """Prepare self.header and self.psms for isobaric ratio output.

     Denominator columns come from explicit column numbers or from regex
     patterns matched against the old header. If neither is given, and
     neither median sweep nor median intensity is requested, a RuntimeError
     is raised. With a feature column set, the header consists of that
     column plus quant, PSM-count and full-quant PSM fields; otherwise the
     old header is extended with one 'ratio_' field per quant column.
     """
     denominators = False
     if self.denomcols is not None:
         denominators = [
             self.number_to_headerfield(colnr, self.oldheader)
             for colnr in self.denomcols
         ]
     elif self.denompatterns is not None:
         matches_per_pattern = [
             tsvreader.get_columns_by_pattern(self.oldheader, pat)
             for pat in self.denompatterns
         ]
         denominators = {
             field for fields in matches_per_pattern for field in fields
         }
     elif not (self.mediansweep or self.medianintensity):
         raise RuntimeError('Must define either denominator column numbers '
                            'or regex pattterns to find them')

     channels = tsvreader.get_columns_by_pattern(
         self.oldheader, self.quantcolpattern)

     norm_factors = False
     if self.mednorm_factors:
         factor_header = tsvreader.get_tsv_header(self.mednorm_factors)
         norm_factors = tsvreader.generate_split_tsv_lines(
             self.mednorm_factors, factor_header)

     psm_count_fields = [
         isosummarize.get_no_psms_field(channel) for channel in channels
     ]
     if self.featcol:
         # Called before self.featcol is read for the header below
         self.get_column_header_for_number(['featcol'], self.oldheader)
         self.header = ([self.featcol] + channels + psm_count_fields
                        + [HEADER_NO_FULLQ_PSMS])
     else:
         ratio_fields = ['ratio_{}'.format(channel) for channel in channels]
         self.header = self.oldheader + ratio_fields

     self.psms = isosummarize.get_isobaric_ratios(
         self.fn, self.oldheader, channels, denominators,
         self.mediansweep, self.medianintensity, self.median_or_avg,
         self.minint, False, False, self.featcol,
         False, False, False, self.logisoquant, self.mediannormalize,
         norm_factors, self.keepnapsms)
Beispiel #4
0
def generate_peptides(tsvfn, oldheader, switch_map, scorecol, precurquantcol,
                      fncol=None, higherbetter=True):
    """Yield one output line per peptide found in a PSM table.

    Each PSM line has its keys renamed according to switch_map (old key ->
    new key; old keys absent from the line are skipped), after which the PSM
    is passed to evaluate_peptide and add_quant_values to update the
    per-peptide collection. Once all PSMs are read, each peptide line gets
    its linked PSMs joined with '; ' and its parsed quant data, and is
    yielded.

    fncol defaults to mzidtsvdata.HEADER_SPECFILE when None.
    """
    specfile_col = mzidtsvdata.HEADER_SPECFILE if fncol is None else fncol
    collected = {}
    for psm in reader.generate_split_tsv_lines(tsvfn, oldheader):
        for oldkey, newkey in switch_map.items():
            if oldkey in psm:
                psm[newkey] = psm.pop(oldkey)
        sequence = psm[peptabledata.HEADER_PEPTIDE]
        collected = evaluate_peptide(collected, psm, sequence, higherbetter,
                                     scorecol, specfile_col)
        add_quant_values(collected, psm, precurquantcol)
    for entry in collected.values():
        outline = entry['line']
        outline[peptabledata.HEADER_LINKED_PSMS] = '; '.join(entry['psms'])
        for qtype, pepquant in entry['quant'].items():
            outline.update(parse_quant_data(qtype, pepquant))
        yield outline
Beispiel #5
0
def store_psm_protein_relations(fn, header, pgdb, proteins, specfncol):
    """Collect the proteins of every PSM and store them in chunks.

    PSM lines sharing a PSM ID (e.g. unrolled PSMs) have their proteins
    merged under that ID. Once more than DB_STORE_CHUNK PSM IDs are pending,
    they are stored as soon as the PSM ID changes, so that consecutive lines
    of one PSM are never split over two stores. Remaining IDs are stored at
    the end, the protein/peptide table is indexed, and the full
    PSM ID -> proteins mapping is returned.

    The `proteins` argument is currently not used for filtering.
    """
    # TODO do we need an OrderedDict or is regular dict enough?
    # Sorting for psm_id useful?
    psm_proteins = OrderedDict()
    pending_ids = set()
    previous_id = None
    flush_pending = False
    for psm in tsvreader.generate_split_tsv_lines(fn, header):
        psm_id, prots = tsvreader.get_pepproteins(psm, specfncol)
        # TODO can this be removed permanently?
        # Filter proteins to only include those that match the protein
        # accessions in fasta so we get the correct names, filter out the
        # badly annotated peptides
        # prots = [x for x in prots if x in proteins]
        if psm_id in psm_proteins:
            # In case the PSMs are presented unrolled
            psm_proteins[psm_id].extend(prots)
        else:
            psm_proteins[psm_id] = prots
        flush_pending = flush_pending or len(pending_ids) > DB_STORE_CHUNK
        if flush_pending and psm_id != previous_id:
            pgdb.store_peptides_proteins(psm_proteins, pending_ids)
            flush_pending = False
            pending_ids = set()
        pending_ids.add(psm_id)
        previous_id = psm_id
    if pending_ids:
        pgdb.store_peptides_proteins(psm_proteins, pending_ids)
    pgdb.index_protein_peptides()
    return psm_proteins
Beispiel #6
0
def paste_to_psmtable(psmfn, header, ratios):
    """Yield PSM table lines with their ratios added as 'ratio_<channel>'.

    PSM lines and ratios are paired in order. The feature accession entry is
    removed from each ratio dict (in place) before its remaining items are
    copied into the PSM line.
    """
    psm_lines = reader.generate_split_tsv_lines(psmfn, header)
    for psm, ratio in zip(psm_lines, ratios):
        del ratio[ISOQUANTRATIO_FEAT_ACC]
        for channel, value in ratio.items():
            psm['ratio_{}'.format(channel)] = value
        yield psm
Beispiel #7
0
 def prepare(self):
     """Read the input header and set up the PSM line generator.

     self.fn may be a single file or a list of files; with a list, the
     header is taken from its first element. Sets self.first_infile,
     self.oldheader and self.oldpsms.
     """
     # isinstance also accepts list subclasses, which an exact type
     # comparison would wrongly treat as a single file name.
     if isinstance(self.fn, list):
         self.first_infile = self.fn[0]
     else:
         self.first_infile = self.fn
     self.oldheader = tsvreader.get_tsv_header(self.first_infile)
     # NOTE(review): self.fn (not first_infile) is passed on here, so the
     # reader presumably handles a list of files itself; confirm.
     self.oldpsms = tsvreader.generate_split_tsv_lines(
         self.fn, self.oldheader)
Beispiel #8
0
 def get_td_proteins_bestpep(self, theader, dheader):
     """Return (targets, decoys) best-peptide protein results.

     Sets self.header to the accession field followed by the picked-FDR
     header, finds the score column in the target and decoy headers using
     self.scorecolpattern, and feeds the lines of self.fn and self.decoyfn
     to proteins.generate_bestpep_proteins with identical settings.
     """
     self.header = [self.headeraccfield] + prottabledata.PICKED_HEADER
     target_scorecol = tsvreader.get_cols_in_file(
         self.scorecolpattern, theader, True)
     decoy_scorecol = tsvreader.get_cols_in_file(
         self.scorecolpattern, dheader, True)
     target_peptides = tsvreader.generate_split_tsv_lines(self.fn, theader)
     decoy_peptides = tsvreader.generate_split_tsv_lines(
         self.decoyfn, dheader)
     # Settings shared by the target and the decoy pass
     shared_args = (self.minlogscore, self.headeraccfield,
                    self.fixedfeatcol)
     targets = proteins.generate_bestpep_proteins(
         target_peptides, target_scorecol, *shared_args)
     decoys = proteins.generate_bestpep_proteins(
         decoy_peptides, decoy_scorecol, *shared_args)
     return targets, decoys
Beispiel #9
0
def store_proteins_descriptions(pgdb, fastafn, fastamd5, tsvfn, header,
                                fastadelim, genefield):
    """Store protein accessions in the database and return them as a set.

    With a FASTA file, proteins and their annotations are read from it and
    stored via pgdb.store_fasta. Without one, the accessions are collected
    from the PSM table (deduplicated, in first-seen order) and stored via
    pgdb.store_proteins. The returned set contains the first element of
    every protein entry.
    """
    if fastafn:
        (prots, seqs, desc, evids, ensgs,
         symbols) = fastareader.get_proteins_for_db(fastafn, fastadelim,
                                                    genefield)
        pgdb.store_fasta(fastafn, fastamd5, prots, evids, seqs, desc, ensgs,
                         symbols)
    else:
        seen = {}
        for psm in tsvreader.generate_split_tsv_lines(tsvfn, header):
            for accession in tsvreader.get_proteins_from_psm(psm):
                seen[accession] = 1
        prots = [(accession, ) for accession in seen]
        pgdb.store_proteins(prots)
    return {entry[0] for entry in prots}
Beispiel #10
0
def get_psmratios(psmfn, header, channels, denom_channels, sweep,
                  report_intensity, summarize_by, min_int, acc_col,
                  logintensities, keep_na_psms):
    """Compute isobaric ratios/intensities per PSM or summarized per feature.

    Without acc_col, a list with one dict per PSM is returned, mapping each
    channel to its value as a string (or 'NA'), with the feature accession
    key set to False. With acc_col, PSMs are grouped by the value in that
    column and a list with one dict per feature is returned, in first-seen
    order, holding the accession, the median or average summary (as chosen
    by summarize_by) and the PSM counts.
    """
    empty_values = {'NA', None, False, ''}
    allfeats, feat_order, psmratios = {}, OrderedDict(), []
    for psm in reader.generate_split_tsv_lines(psmfn, header):
        # remove uninformative psms when adding to features
        # TODO the check for is-not-a-peptide can be removed but there are
        # some usecases for which it is convenient, when adding information
        # to the peptide sequence (e.g. PTM data). When having fully
        # functional PTM data storage/analysis in msstitch, we can possibly
        # remove it
        if acc_col:
            accession = psm[acc_col]
            if accession == '':
                continue
            if acc_col != psmh.HEADER_PEPTIDE and ';' in accession:
                continue
            if not {psm[ch] for ch in channels} - empty_values:
                continue
        ratios = calc_psm_ratios_or_int(psm, channels, denom_channels, sweep,
                                        report_intensity, min_int,
                                        logintensities)
        if not acc_col:
            psmquant = {}
            for ix, ch in enumerate(channels):
                value = ratios[ix]
                psmquant[ch] = str(value) if value != 'NA' else 'NA'
            psmquant[ISOQUANTRATIO_FEAT_ACC] = False
            psmratios.append(psmquant)
            continue
        if not keep_na_psms and any(
                ratios[ix] == 'NA' for ix, _ in enumerate(channels)):
            continue
        allfeats.setdefault(accession, []).append(ratios)
        feat_order[accession] = 1

    if not acc_col:
        return psmratios

    outfeatures = []
    for feat in feat_order:
        quants = allfeats[feat]
        outfeature = {ISOQUANTRATIO_FEAT_ACC: feat}
        if summarize_by == 'median':
            outfeature.update(get_medians(channels, quants))
        elif summarize_by == 'average':
            outfeature.update(summarize_by_averages(channels, quants))
        outfeature.update(get_no_psms(channels, quants))
        outfeatures.append(outfeature)
    return outfeatures
Beispiel #11
0
 def write(self):
     """Write one percolator-annotated PSM table per input file.

     Input PSM tables in self.fn are paired in order with the mzIdentML
     files in self.mzidfns. For each pair the PSMs get FDR data added and
     are optionally filtered on PSM and peptide q-value before being
     written to the output path derived from the input file name.
     """
     for psm_path, mzid_path in zip(self.fn, self.mzidfns):
         in_header = tsvreader.get_tsv_header(psm_path)
         out_header = perco.get_header_with_percolator(in_header)
         out_path = self.create_outfilepath(psm_path, self.outsuffix)
         namespace = mzidreader.get_mzid_namespace(mzid_path)
         spec_results = mzidreader.mzid_spec_result_generator(
             mzid_path, namespace)
         in_psms = tsvreader.generate_split_tsv_lines(psm_path, in_header)
         out_psms = perco.add_fdr_to_mzidtsv(
             in_psms, spec_results, namespace, self.percopsms)
         if self.filtpsm:
             out_psms = filtering.filter_psms_conf(
                 out_psms, psmhead.HEADER_PSMQ, self.filtpsm, True)
         if self.filtpep:
             out_psms = filtering.filter_psms_conf(
                 out_psms, psmhead.HEADER_PEPTIDE_Q, self.filtpep, True)
         writer.write_tsv(out_header, out_psms, out_path)
Beispiel #12
0
    def set_features(self):
        """Creates iterator to write to new tsv. Contains input tsv
        lines plus quant data for these.

        The PSM table is first read into the lookup database, then the
        PSM lines are passed through a chain of generators that add gene,
        quant, spectra and (optionally) protein group data. The result is
        stored in self.psms and the output header in self.header.

        Exits with status 1 when protein grouping is requested but no FASTA
        is available, neither passed now nor stored for an old PSM table.
        """
        # First prepare the data, read PSM table to SQLite
        specfncolnr = int(self.spectracol) - 1
        specfncol = self.oldheader[specfncolnr]
        fastadelim, genefield = self.get_fastadelim_genefield(self.fastadelim,
                                                              self.genefield)

        fasta_md5 = refine.get_fasta_md5(self.fasta) if self.fasta else False

        # Must be bound on every path: the protein group check below reads
        # it even when no old PSM table has been passed.
        oldfasta_md5 = False
        # If appending to previously refined PSM table, reuse DB and shift rows
        if self.oldpsmfile:
            oldfasta_md5 = self.lookup.get_fasta_md5()
            if fasta_md5 != oldfasta_md5:
                print('WARNING, FASTA database used in old PSM table differs '
                        'from the passed database (or this cannot be determined '
                        'due to version differences), this may cause problems, as '
                        'msstitch will use the old database for PSM annotation.')
            shiftrows = self.lookup.get_highest_rownr() + 1
            proteins = set(self.lookup.get_protids())
            self.lookup.drop_psm_indices()
        else:
            shiftrows = 0

        if self.proteingroup:
            if not fasta_md5 and not oldfasta_md5:
                # In case of old Fasta already stored it will be fine to protein group
                print('Cannot create protein group without supplying FASTA search '
                        'database file')
                sys.exit(1)
            self.tabletypes.append('proteingroup')
            self.lookup.drop_pgroup_tables()
        self.lookup.add_tables(self.tabletypes)

        # Need to place this here since we cannot store before having done add tables, but that
        # has to be done after getting proteingroup knowledge, which depends on knowledge of
        # having passed an oldpsmfile (because of oldfasta_md5):
        if not self.oldpsmfile:
            proteins = refine.store_proteins_descriptions(self.lookup, self.fasta,
                    fasta_md5, self.fn, self.oldheader, fastadelim, genefield)

        refine.create_psm_lookup(self.fn, self.oldheader, proteins, self.lookup,
                shiftrows, self.unroll, specfncol, fastadelim, genefield)
        isob_header = [x[0] for x in self.lookup.get_all_quantmaps()] if self.isobaric else False
        self.header = refine.create_header(self.oldheader, self.genes,
                self.proteingroup, self.precursor, isob_header, self.addbioset,
                self.addmiscleav, specfncolnr)
        psms = self.oldpsms
        # Now pass PSMs through multiple generators to add info
        if self.genes:
            psms = refine.add_genes_to_psm_table(psms, self.lookup)
        if self.isobaric or self.precursor:
            psms = refine.generate_psms_quanted(self.lookup, shiftrows, psms,
                    isob_header, self.isobaric, self.precursor, self.min_purity)
        psms = refine.generate_psms_spectradata(self.lookup, shiftrows,
                psms, self.addbioset, self.addmiscleav)
        if self.oldpsmfile:
            prevheader = tsvreader.get_tsv_header(self.oldpsmfile)
            previouspsms = tsvreader.generate_split_tsv_lines(self.oldpsmfile, prevheader)
            psms = chain(previouspsms, psms)
        # Enforce proteingroup last, since it has to come AFTER the chaining of old + new PSMs
        # In theory you could do it before, but that makes no sense since in a big experiment you
        # also do not map PSMs to genes differently? If that is needed, you have to run multiple
        # experiments.
        if self.proteingroup:
            refine.build_proteingroup_db(self.lookup)
            psms = refine.generate_psms_with_proteingroups(psms, self.lookup, specfncol, self.unroll)
        self.psms = psms
Beispiel #13
0
 def set_features(self):
     """Build the peptide table header and the peptide generator chain.

     Derives self.header from the PSM table header (self.oldheader), creates
     a peptide generator from the PSM file, and optionally wraps it with
     isobaric quant summarization and with q-value modeling. The resulting
     generator is stored in self.features.

     Exits with status 1 when a total proteome table is passed whose first
     header field cannot be matched to an accession field in self.header.
     """
     # Fall back to a generic isobaric column pattern when none is passed
     qpat = self.quantcolpattern if self.quantcolpattern else '[a-z]+[0-9]+plex_'
     header = [x for x in self.oldheader if x != psmh.HEADER_SPECFILE]
     try:
         isocols = tsvreader.get_columns_by_pattern(header, qpat)
     except RuntimeError:
         # Presumably raised when no column matches the pattern, in which
         # case there is nothing to strip from the header - TODO confirm
         pass
     else:
         # PSM-level isobaric columns are removed from the peptide header
         for col in isocols:
             header.pop(header.index(col))
     if self.precurquantcol:
         # The precursor quant column is renamed to the peptide area field
         header = [peph.HEADER_AREA if x == self.precurquantcol
                   else x for x in header]
     # Peptide field and linked PSMs field come first in the peptide table
     header = [peph.HEADER_PEPTIDE, peph.HEADER_LINKED_PSMS] + [
             x for x in header if x != psmh.HEADER_PEPTIDE]
     # PSM table field name -> peptide table field name
     switch_map = {old: new for old, new in zip(
         [psmh.HEADER_PEPTIDE, psmh.HEADER_PROTEIN, psmh.HEADER_PEPTIDE_Q],
         [peph.HEADER_PEPTIDE, peph.HEADER_PROTEINS, peph.HEADER_QVAL])}
     self.header = [switch_map[field] if field in switch_map else field
             for field in header]
     peptides = psmtopeptable.generate_peptides(self.fn, self.oldheader,
             switch_map, self.scorecol, self.precurquantcol, self.spectracol)
     # Remove quant data if not specified any way to summarize
     if self.quantcolpattern and any([self.denomcols, self.denompatterns,
             self.mediansweep, self.medianintensity]):
         # Denominators come from column numbers or from regex patterns;
         # False is passed on when neither is given
         denomcols = False
         if self.denomcols is not None:
             denomcols = [self.number_to_headerfield(col, self.oldheader)
                          for col in self.denomcols]
         elif self.denompatterns is not None:
             denomcolnrs = [tsvreader.get_columns_by_pattern(self.oldheader, pattern)
                            for pattern in self.denompatterns]
             denomcols = set([col for cols in denomcolnrs for col in cols])
         quantcols = tsvreader.get_columns_by_pattern(self.oldheader,
                                                self.quantcolpattern)
         totalproteome, tpacc, tp_pepacc = False, False, False
         if self.totalprotfn:
             # Candidate peptide table accession fields, paired by position
             # with the accepted total proteome accession fields
             pep_tp_accs = [psmh.HEADER_MASTER_PROT, psmh.HEADER_SYMBOL,
                     psmh.HEADER_GENE, peph.HEADER_PROTEINS]
             totalphead = tsvreader.get_tsv_header(self.totalprotfn)
             totalpfield_found = False
             # The loop variables tpacc/tp_pepacc keep the matching pair
             # after the break and are passed on below
             for tpacc, tp_pepacc in zip(proth.TPROT_HEADER_ACCS, pep_tp_accs):
                 if totalphead[0] == tpacc and tp_pepacc in self.header:
                     totalpfield_found = True
                     break
             if not totalpfield_found:
                 print('Could not find correct header field name in the total '
                         'proteome table passed. '
                         'Should be one of {}'.format(proth.TPROT_HEADER_ACCS))
                 sys.exit(1)
             totalproteome = tsvreader.generate_split_tsv_lines(self.totalprotfn, totalphead)
         mn_factors = False
         if self.mednorm_factors:
             mnhead = tsvreader.get_tsv_header(self.mednorm_factors)
             mn_factors = tsvreader.generate_split_tsv_lines(self.mednorm_factors, mnhead)
         nopsms = [isosummarize.get_no_psms_field(qf) for qf in quantcols]
         # Header is extended before the call below, which reads self.header[0]
         self.header = self.header + quantcols + nopsms + [proth.HEADER_NO_FULLQ_PSMS]
         peptides = isosummarize.get_isobaric_ratios(self.fn, self.oldheader, 
                 quantcols, denomcols, self.mediansweep, self.medianintensity,
                 self.median_or_avg, self.minint, peptides, self.header[0],
                 psmh.HEADER_PEPTIDE, totalproteome, tpacc, tp_pepacc,
                 self.logisoquant, self.mediannormalize, mn_factors, self.keepnapsms)
     if self.modelqvals:
         # Insert the modeled q-value field directly after the q-value field
         qix = self.header.index(peph.HEADER_QVAL) + 1
         self.header = self.header[:qix] + [peph.HEADER_QVAL_MODELED] + self.header[qix:]
         scorecol = tsvreader.get_cols_in_file(self.scorecolpattern,
                 self.oldheader, True)
         peptides = psmtopeptable.recalculate_qvals_linear_model(peptides, 
                 scorecol, self.qvalthreshold, self.minpeptidenr)
     self.features = peptides