Esempio n. 1
0
 def get_quant(self, theader, features):
     """Attach precursor and/or isobaric quantification to features.

     When ``self.precursor`` is set, the lines of ``self.fn`` are passed
     to ``proteins.add_ms1_quant_from_top3_mzidtsv`` and the area field is
     appended to ``self.header``. When ``self.quantcolpattern`` is set,
     quant columns are looked up in the header of ``self.psmfile``,
     ``self.header`` is extended with quant, PSM-count and full-quant-PSM
     fields, and the features are passed through
     ``isosummarize.get_isobaric_ratios``.

     Prints a message and exits with status 1 when neither denominators,
     median sweep nor median intensity is configured, or (only checked
     when no denominators are configured) when median intensity is
     combined with median normalization.

     Returns the resulting features.
     """
     if self.precursor:
         ms1_lines = tsvreader.generate_split_tsv_lines(self.fn, theader)
         self.header.append(prottabledata.HEADER_AREA)
         features = proteins.add_ms1_quant_from_top3_mzidtsv(
             features, ms1_lines, self.headeraccfield, self.fixedfeatcol)
     if not self.quantcolpattern:
         # No isobaric quant requested, nothing more to add
         return features

     psmheader = tsvreader.get_tsv_header(self.psmfile)
     denomcols = False
     if self.denomcols is not None:
         denomcols = [self.number_to_headerfield(colnr, psmheader)
                      for colnr in self.denomcols]
     elif self.denompatterns is not None:
         matches_per_pattern = [
             tsvreader.get_columns_by_pattern(psmheader, denompattern)
             for denompattern in self.denompatterns]
         denomcols = {col for matched in matches_per_pattern
                      for col in matched}
     elif not (self.mediansweep or self.medianintensity):
         print('Must define either denominator column numbers '
               'or regex pattterns to find them, or use median sweep, or '
               'report median intensities.')
         sys.exit(1)
     elif self.medianintensity and self.mediannormalize:
         print('Cannot do median-centering on intensity values, exiting')
         sys.exit(1)

     quantcols = tsvreader.get_columns_by_pattern(psmheader,
                                                  self.quantcolpattern)
     if self.mednorm_factors:
         factor_header = tsvreader.get_tsv_header(self.mednorm_factors)
         mn_factors = tsvreader.generate_split_tsv_lines(
             self.mednorm_factors, factor_header)
     else:
         mn_factors = False
     psmcount_fields = [isosummarize.get_no_psms_field(quantcol)
                        for quantcol in quantcols]
     # Rebind instead of extending in place, as the original header list
     # object is left untouched
     self.header = (self.header + quantcols + psmcount_fields
                    + [prottabledata.HEADER_NO_FULLQ_PSMS])
     return isosummarize.get_isobaric_ratios(
         self.psmfile, psmheader, quantcols, denomcols,
         self.mediansweep, self.medianintensity, self.median_or_avg,
         self.minint, features, self.headeraccfield, self.fixedfeatcol,
         False, False, False, self.logisoquant, self.mediannormalize,
         mn_factors, self.keepnapsms)
Esempio n. 2
0
 def set_features(self):
     """Prepare the output header and the isobaric ratio source.

     Denominator columns are resolved from ``self.denomcols`` (column
     numbers) or ``self.denompatterns`` (patterns matched against
     ``self.oldheader``). The output header depends on whether
     ``self.featcol`` is set. The result of
     ``isosummarize.get_isobaric_ratios`` is stored in ``self.psms``.

     Raises:
         RuntimeError: if no denominators are defined and neither median
             sweep nor median intensity is requested.
     """
     if self.denomcols is not None:
         denomcols = [self.number_to_headerfield(colnr, self.oldheader)
                      for colnr in self.denomcols]
     elif self.denompatterns is not None:
         matches_per_pattern = [
             tsvreader.get_columns_by_pattern(self.oldheader, denompattern)
             for denompattern in self.denompatterns]
         denomcols = {col for matched in matches_per_pattern
                      for col in matched}
     elif not (self.mediansweep or self.medianintensity):
         raise RuntimeError('Must define either denominator column numbers '
                            'or regex pattterns to find them')
     else:
         denomcols = False

     quantcols = tsvreader.get_columns_by_pattern(
         self.oldheader, self.quantcolpattern)
     if self.mednorm_factors:
         factor_header = tsvreader.get_tsv_header(self.mednorm_factors)
         mn_factors = tsvreader.generate_split_tsv_lines(
             self.mednorm_factors, factor_header)
     else:
         mn_factors = False
     psmcount_fields = [isosummarize.get_no_psms_field(quantcol)
                        for quantcol in quantcols]

     if self.featcol:
         self.get_column_header_for_number(['featcol'], self.oldheader)
         self.header = ([self.featcol] + quantcols + psmcount_fields
                        + [HEADER_NO_FULLQ_PSMS])
     else:
         ratio_fields = ['ratio_{}'.format(quantcol)
                         for quantcol in quantcols]
         self.header = self.oldheader + ratio_fields

     self.psms = isosummarize.get_isobaric_ratios(
         self.fn, self.oldheader, quantcols, denomcols, self.mediansweep,
         self.medianintensity, self.median_or_avg, self.minint, False,
         False, self.featcol, False, False, False, self.logisoquant,
         self.mediannormalize, mn_factors, self.keepnapsms)
Esempio n. 3
0
def get_isobaric_ratios(psmfn, psmheader, channels, denom_channels, min_int,
                        targetfn, accessioncol, normalize, normratiofn):
    """Main function to calculate ratios for PSMs, peptides, proteins, genes.

    Can do simple ratios, median-of-ratios and median-centering
    normalization.

    Output depends on the accessioncol/targetfn combination:
      - both passed: ratios are added to the features of the target table
      - neither passed: ratios are pasted onto the PSM table
      - only accessioncol: a new table keyed on accession is generated

    When normalize is set, channel medians are taken from normratiofn if
    passed, otherwise from the calculated ratios themselves.

    Raises:
        RuntimeError: if targetfn is passed without accessioncol. That
            combination has no output mode and previously made the
            function return None silently.
    """
    if targetfn and not accessioncol:
        raise RuntimeError('Cannot output isobaric ratios to a target table '
                           'without an accession column')
    psm_or_feat_ratios = get_psmratios(psmfn, psmheader, channels,
                                       denom_channels, min_int, accessioncol)
    if normalize:
        if normratiofn:
            normheader = reader.get_tsv_header(normratiofn)
            normsource = get_ratios_from_fn(normratiofn, normheader, channels)
        else:
            # NOTE(review): psm_or_feat_ratios is iterated here and passed on
            # again below, this assumes get_psmratios returns something that
            # can be iterated twice -- confirm
            normsource = [[feat[ch] for ch in channels]
                          for feat in psm_or_feat_ratios]
        ch_medians = get_medians(channels, normsource, report=True)
        outratios = calculate_normalized_ratios(psm_or_feat_ratios, ch_medians,
                                                channels)
    else:
        outratios = psm_or_feat_ratios
    # at this point, outratios look like:
    # [{ch1: 123, ch2: 456, ISOQUANTRATIO_FEAT_ACC: ENSG1244}, ]
    if accessioncol and targetfn:
        outratios = {x[ISOQUANTRATIO_FEAT_ACC]: x for x in outratios}
        return output_to_target_accession_table(targetfn, outratios, channels)
    if not accessioncol:
        # targetfn cannot be set here, that case was rejected above
        return paste_to_psmtable(psmfn, psmheader, outratios)
    # generate new table with accessions
    return ({(k if not k == ISOQUANTRATIO_FEAT_ACC else
              prottabledata.HEADER_ACCESSION): v
             for k, v in ratio.items()} for ratio in outratios)
Esempio n. 4
0
 def get_psms(self):
     """Copy the input header and set up the normalized ratio PSM source.

     Denominator columns come from ``self.denomcols`` (column numbers) or
     ``self.denompatterns`` (patterns matched against ``self.oldheader``).
     The result of ``prep.get_normalized_ratios`` is stored in
     ``self.psms``.

     Raises:
         RuntimeError: if neither denominator columns nor denominator
             patterns are defined.
     """
     self.header = self.oldheader[:]
     if self.denomcols is not None:
         denomcols = [self.number_to_headerfield(colnr, self.oldheader)
                      for colnr in self.denomcols]
     elif self.denompatterns is not None:
         matches_per_pattern = [
             tsv.get_columns_by_pattern(self.oldheader, denompattern)
             for denompattern in self.denompatterns]
         denomcols = {col for matched in matches_per_pattern
                      for col in matched}
     else:
         raise RuntimeError('Must define either denominator column numbers '
                            'or regex pattterns to find them')
     quantcols = tsv.get_columns_by_pattern(
         self.oldheader, self.quantcolpattern)
     # The median PSM file is optional, False signals its absence
     medianheader = (tsv.get_tsv_header(self.medianpsms)
                     if self.medianpsms is not None else False)
     self.psms = prep.get_normalized_ratios(
         self.fn, self.oldheader, quantcols, denomcols, self.minint,
         self.medianpsms, medianheader)
Esempio n. 5
0
 def prepare(self):
     """Read the input header and resolve the columns used later on.

     Sets ``self.oldheader``, resolves the spectra column via
     ``get_column_header_for_number`` and stores the score and precursor
     quant columns found in the header.
     """
     self.oldheader = tsvreader.get_tsv_header(self.fn)
     self.get_column_header_for_number(['spectracol'])
     scorecol = tsvreader.get_cols_in_file(
         self.scorecolpattern, self.oldheader, True)
     self.scorecol = scorecol
     precurquantcol = psmtopeptable.get_quantcols(
         self.precursorquantcolpattern, self.oldheader, 'precur')
     self.precurquantcol = precurquantcol
Esempio n. 6
0
 def initialize_input(self):
     """Read the input header and resolve the columns used later on.

     Sets ``self.oldheader``, resolves the spectra column via
     ``get_column_header_for_number`` and stores the score and precursor
     quant columns found in the header.
     """
     self.oldheader = tsvreader.get_tsv_header(self.fn)
     self.get_column_header_for_number(['spectracol'])
     scorecol = tsvreader.get_cols_in_file(
         self.scorecolpattern, self.oldheader, True)
     self.scorecol = scorecol
     precurquantcol = prep.get_quantcols(
         self.precursorquantcolpattern, self.oldheader, 'precur')
     self.precurquantcol = precurquantcol
Esempio n. 7
0
def get_colmap(fns, pattern, single_col=False, antipattern=False):
    """Map the base name of each table file to its pattern-matching columns.

    For every file the header is read and the column(s) matching pattern
    are looked up. A RuntimeError from the lookup is treated as "no such
    columns in this file". If antipattern is passed, columns that also
    match the antipattern are removed.

    Returns a dict of {file base name: columns}, or False as soon as a file
    is found that has no columns left.
    """
    colmap = {}
    for fn in fns:
        header = tsvreader.get_tsv_header(fn)
        basefn = os.path.basename(fn)
        try:
            matched = tsvreader.get_cols_in_file(pattern, header, single_col)
        except RuntimeError:
            # Columns are not in this file
            matched = []
        if antipattern:
            try:
                excluded = tsvreader.get_cols_in_file(antipattern, header,
                                                      single_col)
            except RuntimeError:
                # The filtering "anti"-columns are not in the file,
                excluded = []
            matched = [col for col in matched if col not in excluded]
        if not matched:
            return False
        colmap[basefn] = matched
    return colmap
Esempio n. 8
0
 def get_psms(self):
     """Build the output header and set up the isobaric ratio source.

     Denominator columns come from ``self.denomcols`` (column numbers) or
     ``self.denompatterns`` (patterns matched against ``self.oldheader``).
     The header layout depends on which of ``self.proteincol`` and
     ``self.targettable`` are set; no header is set when only
     ``self.targettable`` is. The result of ``prep.get_isobaric_ratios``
     is stored in ``self.psms``.

     Raises:
         RuntimeError: if neither denominator columns nor denominator
             patterns are defined.
     """
     if self.denomcols is not None:
         denomcols = [self.number_to_headerfield(colnr, self.oldheader)
                      for colnr in self.denomcols]
     elif self.denompatterns is not None:
         matches_per_pattern = [
             tsv.get_columns_by_pattern(self.oldheader, denompattern)
             for denompattern in self.denompatterns]
         denomcols = {col for matched in matches_per_pattern
                      for col in matched}
     else:
         raise RuntimeError('Must define either denominator column numbers '
                            'or regex pattterns to find them')
     quantcols = tsv.get_columns_by_pattern(
         self.oldheader, self.quantcolpattern)
     self.get_column_header_for_number(['proteincol'], self.oldheader)
     psmcount_fields = [prep.get_no_psms_field(quantcol)
                        for quantcol in quantcols]
     if self.targettable:
         if self.proteincol:
             targetheader = tsv.get_tsv_header(self.targettable)
             self.header = targetheader + quantcols + psmcount_fields
     elif self.proteincol:
         self.header = ([prottabledata.HEADER_ACCESSION] + quantcols
                        + psmcount_fields)
     else:
         ratio_fields = ['ratio_{}'.format(quantcol)
                         for quantcol in quantcols]
         self.header = self.oldheader + ratio_fields
     self.psms = prep.get_isobaric_ratios(
         self.fn, self.oldheader, quantcols, denomcols, self.minint,
         self.targettable, self.proteincol, self.normalize,
         self.normalizeratios)
Esempio n. 9
0
 def initialize_input(self):
     """Read the input header and look up the columns needed downstream.

     Stores the header in ``self.oldheader``, has the spectra column
     resolved by ``get_column_header_for_number`` and keeps the score
     column and the precursor quant column.
     """
     self.oldheader = tsvreader.get_tsv_header(self.fn)
     self.get_column_header_for_number(['spectracol'])
     found_scorecol = tsvreader.get_cols_in_file(
         self.scorecolpattern, self.oldheader, True)
     self.scorecol = found_scorecol
     found_precurcol = prep.get_quantcols(
         self.precursorquantcolpattern, self.oldheader, 'precur')
     self.precurquantcol = found_precurcol
Esempio n. 10
0
def merge_mzidtsvs(fns, header):
    """Yield the PSM lines of several TSV files as one stream.

    This is a generator, so the header check runs when the first item is
    requested: every file must have a header identical to the passed one.

    Raises:
        RuntimeError: if the header of any file differs from header.
    """
    if any(header != tsvreader.get_tsv_header(fn) for fn in fns):
        raise RuntimeError('Headers of TSV files to concatenate are '
                           'not identical')
    yield from tsvreader.generate_tsv_lines_multifile(fns, header)
Esempio n. 11
0
 def get_psms(self):
     """Build the output header and set up the isobaric ratio source.

     Denominator columns are taken from ``self.denomcols`` (column
     numbers) or found by ``self.denompatterns`` in ``self.oldheader``.
     Which header is built depends on ``self.proteincol`` and
     ``self.targettable``; when only ``self.targettable`` is set, no
     header is assigned. ``self.psms`` gets the result of
     ``prep.get_isobaric_ratios``.

     Raises:
         RuntimeError: if neither denominator columns nor denominator
             patterns are defined.
     """
     if self.denomcols is not None:
         denomcols = [self.number_to_headerfield(colnr, self.oldheader)
                      for colnr in self.denomcols]
     elif self.denompatterns is not None:
         matches_per_pattern = [
             tsv.get_columns_by_pattern(self.oldheader, denompattern)
             for denompattern in self.denompatterns]
         denomcols = {col for matched in matches_per_pattern
                      for col in matched}
     else:
         raise RuntimeError('Must define either denominator column numbers '
                            'or regex pattterns to find them')
     quantcols = tsv.get_columns_by_pattern(
         self.oldheader, self.quantcolpattern)
     self.get_column_header_for_number(['proteincol'], self.oldheader)
     psmcount_fields = [prep.get_no_psms_field(quantcol)
                        for quantcol in quantcols]
     if self.targettable:
         if self.proteincol:
             targetheader = tsv.get_tsv_header(self.targettable)
             self.header = targetheader + quantcols + psmcount_fields
     elif self.proteincol:
         self.header = ([prottabledata.HEADER_ACCESSION] + quantcols
                        + psmcount_fields)
     else:
         ratio_fields = ['ratio_{}'.format(quantcol)
                         for quantcol in quantcols]
         self.header = self.oldheader + ratio_fields
     self.psms = prep.get_isobaric_ratios(
         self.fn, self.oldheader, quantcols, denomcols, self.minint,
         self.targettable, self.proteincol, self.normalize,
         self.normalizeratios)
Esempio n. 12
0
def get_isobaric_ratios(psmfn, psmheader, channels, denom_channels, min_int,
                        targetfn, accessioncol, normalize, normratiofn):
    """Main function to calculate ratios for PSMs, peptides, proteins, genes.

    Can do simple ratios, median-of-ratios and median-centering
    normalization.

    Output depends on the accessioncol/targetfn combination:
      - both passed: ratios are added to the features of the target table
      - neither passed: ratios are pasted onto the PSM table
      - only accessioncol: a new table keyed on accession is generated

    When normalize is set, channel medians are taken from normratiofn if
    passed, otherwise from the calculated ratios themselves.

    Raises:
        RuntimeError: if targetfn is passed without accessioncol. That
            combination has no output mode and previously made the
            function return None silently.
    """
    if targetfn and not accessioncol:
        raise RuntimeError('Cannot output isobaric ratios to a target table '
                           'without an accession column')
    psm_or_feat_ratios = get_psmratios(psmfn, psmheader, channels,
                                       denom_channels, min_int, accessioncol)
    if normalize:
        if normratiofn:
            normheader = reader.get_tsv_header(normratiofn)
            normsource = get_ratios_from_fn(normratiofn, normheader, channels)
        else:
            # NOTE(review): psm_or_feat_ratios is iterated here and passed on
            # again below, this assumes get_psmratios returns something that
            # can be iterated twice -- confirm
            normsource = [[feat[ch] for ch in channels]
                          for feat in psm_or_feat_ratios]
        ch_medians = get_medians(channels, normsource, report=True)
        outratios = calculate_normalized_ratios(psm_or_feat_ratios, ch_medians,
                                                channels)
    else:
        outratios = psm_or_feat_ratios
    # at this point, outratios look like:
    # [{ch1: 123, ch2: 456, ISOQUANTRATIO_FEAT_ACC: ENSG1244}, ]
    if accessioncol and targetfn:
        outratios = {x[ISOQUANTRATIO_FEAT_ACC]: x for x in outratios}
        return output_to_target_accession_table(targetfn, outratios, channels)
    if not accessioncol:
        # targetfn cannot be set here, that case was rejected above
        return paste_to_psmtable(psmfn, psmheader, outratios)
    # generate new table with accessions
    return ({(k if not k == ISOQUANTRATIO_FEAT_ACC
              else prottabledata.HEADER_ACCESSION): v
             for k, v in ratio.items()} for ratio in outratios)
Esempio n. 13
0
 def create_lookup(self):
     """Create the PSM lookup from the input file.

     The spectra file column name is picked from the header using the
     1-based column number in ``self.spectracol``.
     """
     header = tsvreader.get_tsv_header(self.fn)
     spectra_index = int(self.spectracol) - 1
     specfncol = header[spectra_index]
     delim_and_field = self.get_fastadelim_genefield(self.fastadelim,
                                                     self.genefield)
     fastadelim, genefield = delim_and_field
     lookup.create_psm_lookup(
         self.fn, self.fasta, self.mapfn, header, self.lookup, self.unroll,
         specfncol, self.decoy, fastadelim, genefield)
Esempio n. 14
0
 def prepare(self):
     """Read the header of the (first) input file and set up the PSM source.

     ``self.fn`` may be a single file name or a list of them; with a list,
     the header is read from its first entry. ``self.oldpsms`` is created
     from ``self.fn`` as passed, together with that header.
     """
     # isinstance also accepts list subclasses, which an exact type
     # comparison would wrongly treat as a single file name
     if isinstance(self.fn, list):
         self.first_infile = self.fn[0]
     else:
         self.first_infile = self.fn
     self.oldheader = tsvreader.get_tsv_header(self.first_infile)
     self.oldpsms = tsvreader.generate_split_tsv_lines(
         self.fn, self.oldheader)
Esempio n. 15
0
 def create_lookup(self):
     """Create the PSM lookup from the input file.

     ``self.spectracol`` holds a 1-based column number, which is
     translated to the name of the spectra file column in the header.
     """
     header = tsvreader.get_tsv_header(self.fn)
     spectra_index = int(self.spectracol) - 1
     specfncol = header[spectra_index]
     delim_and_field = self.get_fastadelim_genefield(self.fastadelim,
                                                     self.genefield)
     fastadelim, genefield = delim_and_field
     lookup.create_psm_lookup(
         self.fn, self.fasta, self.mapfn, header, self.lookup, self.unroll,
         specfncol, self.decoy, fastadelim, genefield)
Esempio n. 16
0
 def initialize_input(self):
     """Initialize the parent input, then set up the quant file input.

     From the header of ``self.quantfile`` the quant fields and the single
     quant accession column are looked up; ``self.quantpeptides`` gets the
     line generator over that file.
     """
     super().initialize_input()
     header = reader.get_tsv_header(self.quantfile)
     quantfields = reader.get_cols_in_file(self.quantcolpattern, header)
     self.quantfields = quantfields
     quantacc = reader.get_cols_in_file(
         self.quantacccolpattern, header, single_col=True)
     self.quantacc = quantacc
     self.quantpeptides = reader.generate_tsv_proteins(
         self.quantfile, header)
Esempio n. 17
0
 def run(self):
     """Read the header of the (first) input file, then get, write, finish.

     ``self.fn`` may be a single file name or a list of them; with a list,
     the header is read from its first entry.
     """
     # isinstance also accepts list subclasses, which an exact type
     # comparison would wrongly treat as a single file name
     if isinstance(self.fn, list):
         self.first_infile = self.fn[0]
     else:
         self.first_infile = self.fn
     self.oldheader = tsvreader.get_tsv_header(self.first_infile)
     self.get_psms()
     self.write()
     self.finish()
Esempio n. 18
0
 def initialize_input(self):
     """Initialize the parent input, then set up the quant file input.

     From the header of ``self.quantfile`` the quant fields and the single
     quant accession column are looked up; ``self.quantfeatures`` gets the
     line generator over that file.
     """
     super().initialize_input()
     header = reader.get_tsv_header(self.quantfile)
     quantfields = reader.get_cols_in_file(self.quantcolpattern, header)
     self.quantfields = quantfields
     quantacc = reader.get_cols_in_file(
         self.quantacccolpattern, header, single_col=True)
     self.quantacc = quantacc
     self.quantfeatures = reader.generate_tsv_proteins(self.quantfile,
                                                       header)
Esempio n. 19
0
 def initialize_input(self):
     """Initialize the parent input, then resolve peptide table columns.

     The protein column is resolved from its column number if
     ``self.proteincol`` is set, else looked up by ``self.pcolpattern``
     when that is set. The score column is always looked up by pattern.
     """
     super().initialize_input()
     self.pepheader = tsvreader.get_tsv_header(self.pepfile)
     if self.proteincol:
         self.get_column_header_for_number(['proteincol'], self.pepheader)
     elif self.pcolpattern:
         found_proteincol = tsvreader.get_cols_in_file(
             self.pcolpattern, self.pepheader, True)
         self.proteincol = found_proteincol
     found_scorecol = tsvreader.get_cols_in_file(
         self.scorecolpattern, self.pepheader, True)
     self.scorecol = found_scorecol
Esempio n. 20
0
 def set_features(self):
     """Generate FDR features from target and decoy tables, then add quant.

     With ``self.fdrtype == 'picked'`` the picked FDR is generated, which
     needs both target and decoy FASTA; if either is missing a message is
     printed and the program exits with status 1. Any other FDR type uses
     the classic FDR. The features are finally passed through
     ``self.get_quant`` and stored in ``self.features``.
     """
     theader = tsvreader.get_tsv_header(self.fn)
     dheader = tsvreader.get_tsv_header(self.decoyfn)
     targets, decoys = self.get_td_proteins_bestpep(theader, dheader)
     if self.fdrtype != 'picked':
         fdr_features = proteins.generate_classic_fdr(
             targets, decoys, self.headeraccfield)
     else:
         if not (self.t_fasta and self.d_fasta):
             print(
                 'Must use --targetfasta and --decoyfasta when using picked FDR'
             )
             sys.exit(1)
         fastadelim, genefield = self.get_fastadelim_genefield(
             self.fastadelim, self.genefield)
         fdr_features = proteins.generate_pick_fdr(
             targets, decoys, self.t_fasta, self.d_fasta,
             self.headeraccfield, fastadelim, genefield)
     self.features = self.get_quant(theader, fdr_features)
Esempio n. 21
0
 def write(self):
     """Write one percolator-annotated TSV per PSM file / mzid file pair.

     The files in ``self.fn`` and ``self.mzidfns`` are paired by position.
     """
     # FIXME 'Strip', 'Fraction', 'missed_cleavage'
     for psmfn, mzidfn in zip(self.fn, self.mzidfns):
         inheader = tsvreader.get_tsv_header(psmfn)
         outheader = prep.get_header_with_percolator(inheader)
         outpath = self.create_outfilepath(psmfn, self.outsuffix)
         inpsms = tsvreader.generate_tsv_psms(psmfn, inheader)
         namespace = mzidreader.get_mzid_namespace(mzidfn)
         specresults = mzidreader.mzid_spec_result_generator(mzidfn,
                                                             namespace)
         outpsms = prep.add_fdr_to_mzidtsv(inpsms, specresults, namespace,
                                           self.percopsms)
         writer.write_tsv(outheader, outpsms, outpath)
Esempio n. 22
0
def output_to_target_accession_table(targetfn, featratios, channels):
    """Yield the features of a target table with quant data added.

    The first field of the target table header is used as accession field.
    For each feature the quant data is looked up in featratios by that
    accession. Features without quant data get 'NA' for every channel and
    for every channel's number-of-PSMs field.

    The entries of featratios are not modified, so an accession which
    occurs more than once in the target table gets the same quant data
    each time (popping the accession key from the shared entry used to
    raise a KeyError on the second occurrence).
    """
    # loop prottable, add ratios from dict, acc = key
    theader = reader.get_tsv_header(targetfn)
    acc_field = theader[0]
    for feat in reader.generate_tsv_proteins(targetfn, theader):
        try:
            featquant = featratios[feat[acc_field]]
        except KeyError:
            quants = {ch: 'NA' for ch in channels}
            quants.update({get_no_psms_field(ch): 'NA' for ch in channels})
        else:
            # Copy without the accession key instead of popping it from
            # the caller's dict
            quants = {field: value for field, value in featquant.items()
                      if field != ISOQUANTRATIO_FEAT_ACC}
        feat.update(quants)
        yield feat
Esempio n. 23
0
def output_to_target_accession_table(targetfn, featratios, channels):
    """Yield the features of a target table with quant data added.

    The first field of the target table header is used as accession field.
    For each feature the quant data is looked up in featratios by that
    accession. Features without quant data get 'NA' for every channel and
    for every channel's number-of-PSMs field.

    The entries of featratios are not modified, so an accession which
    occurs more than once in the target table gets the same quant data
    each time (popping the accession key from the shared entry used to
    raise a KeyError on the second occurrence).
    """
    # loop prottable, add ratios from dict, acc = key
    theader = reader.get_tsv_header(targetfn)
    acc_field = theader[0]
    for feat in reader.generate_tsv_proteins(targetfn, theader):
        try:
            featquant = featratios[feat[acc_field]]
        except KeyError:
            quants = {ch: 'NA' for ch in channels}
            quants.update({get_no_psms_field(ch): 'NA' for ch in channels})
        else:
            # Copy without the accession key instead of popping it from
            # the caller's dict
            quants = {field: value for field, value in featquant.items()
                      if field != ISOQUANTRATIO_FEAT_ACC}
        feat.update(quants)
        yield feat
Esempio n. 24
0
def get_colmap(fns, pattern, single_col=False, antipattern=False):
    """Map the base name of each table file to its pattern-matching columns.

    For every file the header is read and the column(s) matching pattern
    are looked up. If antipattern is passed, columns that also match the
    antipattern are removed. Files without any remaining columns are left
    out of the returned dict.
    """
    colmap = {}
    for fn in fns:
        header = tsvreader.get_tsv_header(fn)
        basefn = os.path.basename(fn)
        matched = tsvreader.get_cols_in_file(pattern, header, single_col)
        if antipattern:
            excluded = tsvreader.get_cols_in_file(antipattern, header,
                                                  single_col)
            matched = [col for col in matched if col not in excluded]
        if not matched:
            continue
        colmap[basefn] = matched
    return colmap
Esempio n. 25
0
def get_colmap(fns, pattern, single_col=False, antipattern=False):
    """Return a dict of file base name -> columns matching a pattern.

    The header of each table file is read and searched for column(s)
    matching pattern. With an antipattern, columns matching that as well
    are filtered out. A file only gets an entry if columns remain.
    """
    colmap = {}
    for fn in fns:
        header = tsvreader.get_tsv_header(fn)
        basefn = os.path.basename(fn)
        matched = tsvreader.get_cols_in_file(pattern, header, single_col)
        if antipattern:
            excluded = tsvreader.get_cols_in_file(antipattern, header,
                                                  single_col)
            matched = [col for col in matched if col not in excluded]
        if not matched:
            continue
        colmap[basefn] = matched
    return colmap
Esempio n. 26
0
 def write(self):
     """Write one percolator-annotated TSV per PSM file / mzid file pair.

     The files in ``self.fn`` and ``self.mzidfns`` are paired by position.
     PSMs are passed through the PSM and peptide confidence filters when
     ``self.filtpsm`` and ``self.filtpep`` are set, respectively.
     """
     for psmfn, mzidfn in zip(self.fn, self.mzidfns):
         inheader = tsvreader.get_tsv_header(psmfn)
         outheader = perco.get_header_with_percolator(inheader)
         outpath = self.create_outfilepath(psmfn, self.outsuffix)
         namespace = mzidreader.get_mzid_namespace(mzidfn)
         specresults = mzidreader.mzid_spec_result_generator(mzidfn,
                                                             namespace)
         inpsms = tsvreader.generate_split_tsv_lines(psmfn, inheader)
         outpsms = perco.add_fdr_to_mzidtsv(inpsms, specresults, namespace,
                                            self.percopsms)
         if self.filtpsm:
             outpsms = filtering.filter_psms_conf(
                 outpsms, psmhead.HEADER_PSMQ, self.filtpsm, True)
         if self.filtpep:
             outpsms = filtering.filter_psms_conf(
                 outpsms, psmhead.HEADER_PEPTIDE_Q, self.filtpep, True)
         writer.write_tsv(outheader, outpsms, outpath)
Esempio n. 27
0
 def get_psms(self):
     """Copy the input header and set up the normalized ratio PSM source.

     Denominator columns are taken from ``self.denomcols`` (column
     numbers) or found by ``self.denompatterns`` in ``self.oldheader``.
     ``self.psms`` gets the result of ``prep.get_normalized_ratios``.

     Raises:
         RuntimeError: if neither denominator columns nor denominator
             patterns are defined.
     """
     self.header = self.oldheader[:]
     if self.denomcols is not None:
         denomcols = [
             self.number_to_headerfield(colnr, self.oldheader)
             for colnr in self.denomcols
         ]
     elif self.denompatterns is not None:
         matches_per_pattern = [
             tsv.get_columns_by_pattern(self.oldheader, denompattern)
             for denompattern in self.denompatterns
         ]
         denomcols = {col for matched in matches_per_pattern
                      for col in matched}
     else:
         raise RuntimeError('Must define either denominator column numbers '
                            'or regex pattterns to find them')
     quantcols = tsv.get_columns_by_pattern(
         self.oldheader, self.quantcolpattern)
     # The median PSM file is optional, False signals its absence
     medianheader = (tsv.get_tsv_header(self.medianpsms)
                     if self.medianpsms is not None else False)
     self.psms = prep.get_normalized_ratios(
         self.fn, self.oldheader, quantcols, denomcols, self.minint,
         self.medianpsms, medianheader)
Esempio n. 28
0
 def parse_input(self, **kwargs):
     """Parse input, then derive table type and header from the first file.

     The first header field of ``self.fn[0]`` decides which kind of table
     is handled. It sets ``self.is_peptidetable``, ``self.lookuptype`` and
     the fields with which ``self.header`` is extended. For non-peptide
     tables ``self.pepcolpattern`` is reset to None. If the first field is
     not recognized, only the first field ends up in ``self.header`` and
     no lookup type is set.
     """
     super().parse_input(**kwargs)
     firstfield = tsvreader.get_tsv_header(self.fn[0])[0]
     self.header = [firstfield]
     if firstfield == peph.HEADER_PEPTIDE:
         self.is_peptidetable = True
         if self.genecentric:
             lookuptype = 'peptidegenecentrictable'
             extrafields = [peph.HEADER_GENES, peph.HEADER_ASSOCIATED]
         elif self.nogroup:
             lookuptype = 'peptidetableplain'
             extrafields = [peph.HEADER_PROTEINS]
         else:
             lookuptype = 'peptidetable'
             extrafields = [
                 peph.HEADER_PROTEINS, peph.HEADER_NO_CONTENTPROTEINS,
                 peph.HEADER_DESCRIPTIONS, peph.HEADER_COVERAGES,
                 peph.HEADER_GENES, peph.HEADER_ASSOCIATED,
             ]
     else:
         self.is_peptidetable = False
         self.pepcolpattern = None  # override input if any
         if firstfield == ph.HEADER_PROTEIN:
             lookuptype = 'prottable'
             extrafields = [
                 ph.HEADER_GENEID, ph.HEADER_GENENAME, ph.HEADER_DESCRIPTION,
                 ph.HEADER_COVERAGE, ph.HEADER_CONTENTPROT,
                 ph.HEADER_NO_PROTEIN,
             ]
         elif firstfield == ph.HEADER_GENENAME:
             lookuptype = 'associdtable'
             extrafields = [ph.HEADER_GENEID, ph.HEADER_PROTEINS,
                            ph.HEADER_DESCRIPTION]
         elif firstfield == ph.HEADER_GENEID:
             lookuptype = 'genetable'
             extrafields = [ph.HEADER_GENENAME, ph.HEADER_PROTEINS,
                            ph.HEADER_DESCRIPTION]
         else:
             # Unknown table type, leave lookup type and header as is
             return
     self.lookuptype = lookuptype
     self.header.extend(extrafields)
Esempio n. 29
0
# Script fragment: annotate percolator PSMs with q-values, then start walking
# mzIdentML results alongside the PSM TSV lines. The names psms, ns, perco,
# pycolator, tsv, mzidplus, mzidfns and mzidtsvfns are defined outside this
# fragment.

# PSM level: go through PSMs from highest to lowest svm score, keeping a
# running count of decoys (True) and targets (False). Each entry in psms is
# replaced by a dict with the decoy flag, the svm score and a q-value of
# decoy count / target count.
decoys = {True: 0, False: 0}
for psm in sorted([(pid, float(p.find('{%s}svm_score' % ns['xmlns']).text), p) for pid, p in psms.items()], reverse=True, key=lambda x:x[1]):
    pdecoy = psm[2].attrib['{%s}decoy' % ns['xmlns']] == 'true'
    decoys[pdecoy] += 1
    try:
        psms[psm[0]] = {'decoy': pdecoy, 'svm': psm[1], 'qval': decoys[True]/decoys[False]}  # T-TDC
    except ZeroDivisionError:
        # No target seen yet, q-value is set to 1
        psms[psm[0]] = {'decoy': pdecoy, 'svm': psm[1], 'qval': 1}  # T-TDC
# Peptide level: same running count, here keyed on the 'true'/'false' string of
# the decoy attribute. The peptide q-value is stored on every PSM listed in the
# peptide's psm_ids element.
decoys = {'true': 0, 'false': 0}
for svm, pep in sorted([(float(x.find('{%s}svm_score' % ns['xmlns']).text), x) for x in pycolator.generate_peptides(perco, ns)], reverse=True, key=lambda x:x[0]):
    decoys[pep.attrib['{%s}decoy' % ns['xmlns']]] += 1
    try:
        # List comprehension is used for its side effect only
        [psms[pid.text].update({'pepqval': decoys['true']/decoys['false']}) for pid in pep.find('{%s}psm_ids' % ns['xmlns'])]
    except ZeroDivisionError:
        # No target peptide seen yet, q-value is set to 1
        [psms[pid.text].update({'pepqval': 1}) for pid in pep.find('{%s}psm_ids' % ns['xmlns'])]
# The output header is the header of the first PSM table plus the new fields
oldheader = tsv.get_tsv_header(mzidtsvfns[0])
header = oldheader + ['percolator svm-score', 'PSM q-value', 'peptide q-value', 'Strip', 'Fraction', 'missed_cleavage']
with open('tmzidperco', 'w') as tfp, open('dmzidperco', 'w') as dfp:
    # NOTE(review): no newline is written after the header in the lines shown
    # here -- confirm it is added further down
    tfp.write('\t'.join(header))
    dfp.write('\t'.join(header))
    # mzid files and PSM tables are paired by their index in the two lists
    for fnix, mzidfn in enumerate(mzidfns):
        mzns = mzidplus.get_mzid_namespace(mzidfn)
        inpsms = tsv.generate_tsv_psms(mzidtsvfns[fnix], oldheader)
        for specidr in mzidplus.mzid_spec_result_generator(mzidfn, mzns):
            for specidi in specidr.findall('{%s}SpectrumIdentificationItem' % mzns['xmlns']):
                # One PSM table line is consumed per SpectrumIdentificationItem,
                # so both sources are read in lockstep
                psm = next(inpsms)
                # percolator psm ID is: samplename_SII_scanindex_rank_scannr_charge_rank
                scanindex, rank = specidi.attrib['id'].replace('SII_', '').split('_')
                # spectrumID is split on spaces into key=value pairs, of which
                # the scan value is taken
                scan = {x.split('=')[0]: x.split('=')[1] for x in specidr.attrib['spectrumID'].split(' ')}['scan']
                outpsm = {k: v for k,v in psm.items()}
                # Spectra file name without its extension
                spfile = os.path.splitext(psm['#SpecFile'])[0]
Esempio n. 30
0
    def set_features(self):
        """Creates iterator to write to new tsv. Contains input tsv
        lines plus quant data for these.

        The PSM table is first stored in the lookup, after which the PSMs
        are passed through a chain of generators which add gene, quant,
        spectra and protein group data, depending on what is requested.
        When ``self.oldpsmfile`` is set, the new PSMs are appended after
        the PSMs of that file, with row numbers shifted accordingly.

        Prints a message and exits with status 1 when protein grouping is
        requested but neither a passed nor a previously stored FASTA
        checksum is available. The result is stored in ``self.psms``, the
        output header in ``self.header``.
        """
        # First prepare the data, read PSM table to SQLite
        specfncolnr = int(self.spectracol) - 1
        specfncol = self.oldheader[specfncolnr]
        fastadelim, genefield = self.get_fastadelim_genefield(self.fastadelim,
                                                              self.genefield)

        if self.fasta:
            fasta_md5 = refine.get_fasta_md5(self.fasta)
        else:
            fasta_md5 = False

        # If appending to previously refined PSM table, reuse DB and shift rows
        if self.oldpsmfile:
            oldfasta_md5 = self.lookup.get_fasta_md5()
            if fasta_md5 != oldfasta_md5:
                print('WARNING, FASTA database used in old PSM table differs '
                        'from the passed database (or this cannot be determined '
                        'due to version differences), this may cause problems, as '
                        'msstitch will use the old database for PSM annotation.')
            shiftrows = self.lookup.get_highest_rownr() + 1
            proteins = set(self.lookup.get_protids())
            self.lookup.drop_psm_indices()
        else:
            shiftrows = 0
            # No old PSM table means no stored FASTA checksum. It has to be
            # defined since the protein group check below reads it, which
            # otherwise fails with an UnboundLocalError
            oldfasta_md5 = False

        if self.proteingroup:
            if not fasta_md5 and not oldfasta_md5:
                # In case of old Fasta already stored it will be fine to protein group
                print('Cannot create protein group without supplying FASTA search '
                        'database file')
                sys.exit(1)
            self.tabletypes.append('proteingroup')
            self.lookup.drop_pgroup_tables()
        self.lookup.add_tables(self.tabletypes)

        # Need to place this here since we cannot store before having done add tables, but that
        # has to be done after getting proteingroup knowledge, which depends on knowledge of
        # having passed an oldpsmfile (because of oldfasta_md5):
        if not self.oldpsmfile:
            proteins = refine.store_proteins_descriptions(self.lookup, self.fasta,
                    fasta_md5, self.fn, self.oldheader, fastadelim, genefield)

        refine.create_psm_lookup(self.fn, self.oldheader, proteins, self.lookup,
                shiftrows, self.unroll, specfncol, fastadelim, genefield)
        isob_header = [x[0] for x in self.lookup.get_all_quantmaps()] if self.isobaric else False
        self.header = refine.create_header(self.oldheader, self.genes,
                self.proteingroup, self.precursor, isob_header, self.addbioset,
                self.addmiscleav, specfncolnr)
        psms = self.oldpsms
        # Now pass PSMs through multiple generators to add info
        if self.genes:
            psms = refine.add_genes_to_psm_table(psms, self.lookup)
        if self.isobaric or self.precursor:
            psms = refine.generate_psms_quanted(self.lookup, shiftrows, psms,
                    isob_header, self.isobaric, self.precursor, self.min_purity)
        psms = refine.generate_psms_spectradata(self.lookup, shiftrows,
                psms, self.addbioset, self.addmiscleav)
        if self.oldpsmfile:
            prevheader = tsvreader.get_tsv_header(self.oldpsmfile)
            previouspsms = tsvreader.generate_split_tsv_lines(self.oldpsmfile, prevheader)
            psms = chain(previouspsms, psms)
        # Enforce proteingroup last, since it has to come AFTER the chaining of old + new PSMs
        # In theory you could do it before, but that makes no sense since in a big experiment you
        # also do not map PSMs to genes differently? If that is needed, you have to run multiple
        # experiments.
        if self.proteingroup:
            refine.build_proteingroup_db(self.lookup)
            psms = refine.generate_psms_with_proteingroups(psms, self.lookup, specfncol, self.unroll)
        self.psms = psms
Esempio n. 31
0
 def initialize_input(self):
     """Read the input table header and set up the protein reader.

     Stores the TSV header of ``self.fn`` in ``self.oldheader`` and the
     result of ``reader.generate_tsv_proteins`` in ``self.in_proteins``.
     """
     infile = self.fn
     header = reader.get_tsv_header(infile)
     self.oldheader = header
     self.in_proteins = reader.generate_tsv_proteins(infile, header)
Esempio n. 32
0
 def initialize_input(self):
     """Read the input header and resolve the score and q-value columns.

     Sets ``self.oldheader`` to the TSV header of ``self.fn``, and sets
     ``self.scorecol`` / ``self.qvalcol`` to whatever
     ``tsvreader.get_cols_in_file`` returns for ``self.scorecolpattern``
     and ``self.fdrcolpattern`` respectively.
     """
     header = tsvreader.get_tsv_header(self.fn)
     self.oldheader = header
     score_pattern = self.scorecolpattern
     self.scorecol = tsvreader.get_cols_in_file(score_pattern, header, True)
     fdr_pattern = self.fdrcolpattern
     self.qvalcol = tsvreader.get_cols_in_file(fdr_pattern, header, True)
Esempio n. 33
0
 def set_features(self):
     """Build the peptide table header and the peptide feature iterator.

     The output header is derived from ``self.oldheader``: the spectra file
     field and any fields matching the quant pattern are dropped, the
     precursor quant field (when set) is renamed to the peptide area field,
     the peptide and linked-PSMs fields are placed first, and PSM field
     names are translated to their peptide table counterparts.

     When a quant pattern and at least one summarizing option are given,
     quant / PSM-count fields are appended to the header and the peptides
     are passed through ``isosummarize.get_isobaric_ratios``. When
     ``self.modelqvals`` is set, a modeled q-value field is inserted right
     after the q-value field and the peptides are passed through
     ``psmtopeptable.recalculate_qvals_linear_model``.

     Results are stored in ``self.header`` and ``self.features``. Exits the
     process with status 1 if a total proteome table is passed whose first
     header field cannot be matched to an accession field in the header.
     """
     isopattern = self.quantcolpattern or '[a-z]+[0-9]+plex_'
     fields = [fld for fld in self.oldheader if fld != psmh.HEADER_SPECFILE]
     try:
         isocols = tsvreader.get_columns_by_pattern(fields, isopattern)
     except RuntimeError:
         # Deliberately ignored: presumably raised when no field matches
         # the pattern, in which case there is nothing to strip.
         pass
     else:
         for isocol in isocols:
             fields.remove(isocol)
     if self.precurquantcol:
         fields = [
             peph.HEADER_AREA if fld == self.precurquantcol else fld
             for fld in fields
         ]
     leading = [peph.HEADER_PEPTIDE, peph.HEADER_LINKED_PSMS]
     fields = leading + [fld for fld in fields if fld != psmh.HEADER_PEPTIDE]
     # PSM table field name -> peptide table field name
     switch_map = {
         psmh.HEADER_PEPTIDE: peph.HEADER_PEPTIDE,
         psmh.HEADER_PROTEIN: peph.HEADER_PROTEINS,
         psmh.HEADER_PEPTIDE_Q: peph.HEADER_QVAL,
     }
     self.header = [switch_map.get(fld, fld) for fld in fields]
     peptides = psmtopeptable.generate_peptides(
         self.fn, self.oldheader, switch_map, self.scorecol,
         self.precurquantcol, self.spectracol)

     # Quant data is only kept when there is some way to summarize it
     if self.quantcolpattern and any((
             self.denomcols, self.denompatterns,
             self.mediansweep, self.medianintensity)):
         if self.denomcols is not None:
             denomcols = [
                 self.number_to_headerfield(colnr, self.oldheader)
                 for colnr in self.denomcols
             ]
         elif self.denompatterns is not None:
             denomcols = {
                 col
                 for pattern in self.denompatterns
                 for col in tsvreader.get_columns_by_pattern(
                     self.oldheader, pattern)
             }
         else:
             denomcols = False
         quantcols = tsvreader.get_columns_by_pattern(
             self.oldheader, self.quantcolpattern)

         totalproteome = tpacc = tp_pepacc = False
         if self.totalprotfn:
             pep_tp_accs = [
                 psmh.HEADER_MASTER_PROT,
                 psmh.HEADER_SYMBOL,
                 psmh.HEADER_GENE,
                 peph.HEADER_PROTEINS,
             ]
             totalphead = tsvreader.get_tsv_header(self.totalprotfn)
             # The loop variables are reused after the loop: they hold the
             # first accession pair that fits both tables.
             for tpacc, tp_pepacc in zip(proth.TPROT_HEADER_ACCS, pep_tp_accs):
                 if totalphead[0] == tpacc and tp_pepacc in self.header:
                     break
             else:
                 print('Could not find correct header field name in the total '
                       'proteome table passed. '
                       'Should be one of {}'.format(proth.TPROT_HEADER_ACCS))
                 sys.exit(1)
             totalproteome = tsvreader.generate_split_tsv_lines(
                 self.totalprotfn, totalphead)

         if self.mednorm_factors:
             mnhead = tsvreader.get_tsv_header(self.mednorm_factors)
             mn_factors = tsvreader.generate_split_tsv_lines(
                 self.mednorm_factors, mnhead)
         else:
             mn_factors = False

         nopsms = [isosummarize.get_no_psms_field(qcol) for qcol in quantcols]
         self.header = (self.header + quantcols + nopsms
                        + [proth.HEADER_NO_FULLQ_PSMS])
         peptides = isosummarize.get_isobaric_ratios(
             self.fn, self.oldheader, quantcols, denomcols,
             self.mediansweep, self.medianintensity, self.median_or_avg,
             self.minint, peptides, self.header[0], psmh.HEADER_PEPTIDE,
             totalproteome, tpacc, tp_pepacc, self.logisoquant,
             self.mediannormalize, mn_factors, self.keepnapsms)

     if self.modelqvals:
         # Modeled q-value field goes directly after the q-value field
         insert_at = self.header.index(peph.HEADER_QVAL) + 1
         self.header = (self.header[:insert_at]
                        + [peph.HEADER_QVAL_MODELED]
                        + self.header[insert_at:])
         scorecol = tsvreader.get_cols_in_file(
             self.scorecolpattern, self.oldheader, True)
         peptides = psmtopeptable.recalculate_qvals_linear_model(
             peptides, scorecol, self.qvalthreshold, self.minpeptidenr)
     self.features = peptides
Esempio n. 34
0
 def initialize_input(self):
     """Read the header of ``self.fn`` and set up the PSM reader.

     Stores the TSV header in ``self.pepheader`` and the result of
     ``reader.generate_tsv_psms`` in ``self.in_psms``.
     """
     infile = self.fn
     header = reader.get_tsv_header(infile)
     self.pepheader = header
     self.in_psms = reader.generate_tsv_psms(infile, header)
Esempio n. 35
0
 def prepare(self):
     """Set the target file and read target and decoy headers.

     No percolator XML is involved for protein tables: ``self.fn`` is used
     directly as ``self.target``, and the TSV headers of the target and of
     ``self.decoyfn`` are stored in ``self.targetheader`` and
     ``self.decoyheader``.
     """
     target = self.fn
     self.target = target
     self.targetheader = reader.get_tsv_header(target)
     decoy = self.decoyfn
     self.decoyheader = reader.get_tsv_header(decoy)
Esempio n. 36
0
 def initialize_input(self):
     """Set the target file and read the target and decoy headers.

     ``self.fn`` becomes ``self.target``. Its TSV header is stored in both
     ``self.oldheader`` and ``self.targetheader``, and the header of
     ``self.decoyfn`` in ``self.decoyheader``.
     """
     target = self.fn
     self.target = target
     # Two separate reads, so each attribute gets its own result
     self.oldheader = reader.get_tsv_header(target)
     self.targetheader = reader.get_tsv_header(target)
     decoy = self.decoyfn
     self.decoyheader = reader.get_tsv_header(decoy)
Esempio n. 37
0
 def prepare(self):
     """Set the target file and read target and decoy headers.

     No percolator XML is involved for protein tables: ``self.fn`` is used
     directly as ``self.target``, and the TSV headers of the target and of
     ``self.decoyfn`` are stored in ``self.targetheader`` and
     ``self.decoyheader``.
     """
     target = self.fn
     self.target = target
     self.targetheader = reader.get_tsv_header(target)
     decoy = self.decoyfn
     self.decoyheader = reader.get_tsv_header(decoy)
Esempio n. 38
0
 def initialize_input(self):
     """Read the input table header and set up the protein reader.

     Stores the TSV header of ``self.fn`` in ``self.oldheader`` and the
     result of ``reader.generate_tsv_proteins`` in ``self.in_proteins``.
     """
     infile = self.fn
     header = reader.get_tsv_header(infile)
     self.oldheader = header
     self.in_proteins = reader.generate_tsv_proteins(infile, header)
Esempio n. 39
0
 def initialize_input(self):
     """Extend the parent input setup with the peptide table.

     After the parent's ``initialize_input``, stores the result of
     ``reader.generate_tsv_peptides`` for ``self.pepfile`` in
     ``self.in_peptides`` and passes the peptide table header to
     ``self.get_column_header_for_number`` for the ``'proteincol'`` option.
     """
     super().initialize_input()
     pepfile = self.pepfile
     self.in_peptides = reader.generate_tsv_peptides(pepfile)
     header = reader.get_tsv_header(pepfile)
     self.get_column_header_for_number(['proteincol'], header)
Esempio n. 40
0
 def initialize_input(self):
     """Extend the parent input setup with the table in ``self.psmfile``.

     After the parent's ``initialize_input``, passes the header of
     ``self.psmfile`` to ``self.get_column_header_for_number`` for the
     ``'proteincol'`` option, then stores the result of
     ``reader.generate_tsv_peptides`` for that file in ``self.in_peptides``.
     """
     super().initialize_input()
     psmfile = self.psmfile
     header = reader.get_tsv_header(psmfile)
     self.get_column_header_for_number(['proteincol'], header)
     self.in_peptides = reader.generate_tsv_peptides(psmfile)
Esempio n. 41
0
 def prepare(self):
     """Read only the header of the first input file.

     ``self.fn`` holds multiple files here; PSMs are not read at this
     point. Only the first file's TSV header is stored (in
     ``self.oldheader``); per the original note, header agreement between
     the files is checked elsewhere.
     """
     first = self.fn[0]
     self.first_infile = first
     self.oldheader = tsvreader.get_tsv_header(first)
Esempio n. 42
0
 def initialize_input(self):
     """Set the target file and read the target and decoy headers.

     ``self.fn`` becomes ``self.target``. Its TSV header is stored in both
     ``self.oldheader`` and ``self.targetheader``, and the header of
     ``self.decoyfn`` in ``self.decoyheader``.
     """
     target = self.fn
     self.target = target
     # Two separate reads, so each attribute gets its own result
     self.oldheader = reader.get_tsv_header(target)
     self.targetheader = reader.get_tsv_header(target)
     decoy = self.decoyfn
     self.decoyheader = reader.get_tsv_header(decoy)
Esempio n. 43
0
 def initialize_input(self):
     """Read the input header and resolve the score and q-value columns.

     Sets ``self.oldheader`` to the TSV header of ``self.fn``, and sets
     ``self.scorecol`` / ``self.qvalcol`` to whatever
     ``tsvreader.get_cols_in_file`` returns for ``self.scorecolpattern``
     and ``self.fdrcolpattern`` respectively.
     """
     header = tsvreader.get_tsv_header(self.fn)
     self.oldheader = header
     score_pattern = self.scorecolpattern
     self.scorecol = tsvreader.get_cols_in_file(score_pattern, header, True)
     fdr_pattern = self.fdrcolpattern
     self.qvalcol = tsvreader.get_cols_in_file(fdr_pattern, header, True)
Esempio n. 44
0
 def initialize_input(self):
     """Read the header of ``self.fn`` and set up the PSM reader.

     Stores the TSV header in ``self.pepheader`` and the result of
     ``reader.generate_tsv_psms`` in ``self.in_psms``.
     """
     infile = self.fn
     header = reader.get_tsv_header(infile)
     self.pepheader = header
     self.in_psms = reader.generate_tsv_psms(infile, header)