def get_quant(self, theader, features):
    """Add quantification data to ``features`` and return the result.

    When ``self.precursor`` is set, the area field is appended to
    ``self.header`` and ``features`` is passed through
    ``proteins.add_ms1_quant_from_top3_mzidtsv``. When
    ``self.quantcolpattern`` is set, quant columns are looked up in the
    header of ``self.psmfile``, ``self.header`` is extended with them and
    ``features`` is passed through ``isosummarize.get_isobaric_ratios``.

    Prints a message and exits with status 1 when no way of summarizing
    is configured, or when median intensity reporting is combined with
    median normalization.
    """
    if self.precursor:
        target_rows = tsvreader.generate_split_tsv_lines(self.fn, theader)
        self.header.append(prottabledata.HEADER_AREA)
        features = proteins.add_ms1_quant_from_top3_mzidtsv(
            features, target_rows, self.headeraccfield, self.fixedfeatcol)
    if not self.quantcolpattern:
        return features

    psmheader = tsvreader.get_tsv_header(self.psmfile)
    denomcols = False
    if self.denomcols is not None:
        denomcols = [self.number_to_headerfield(colnr, psmheader)
                     for colnr in self.denomcols]
    elif self.denompatterns is not None:
        matched = [tsvreader.get_columns_by_pattern(psmheader, denompattern)
                   for denompattern in self.denompatterns]
        denomcols = {field for fields in matched for field in fields}
    elif not (self.mediansweep or self.medianintensity):
        print(
            'Must define either denominator column numbers '
            'or regex pattterns to find them, or use median sweep, or '
            'report median intensities.')
        sys.exit(1)
    elif self.medianintensity and self.mediannormalize:
        print('Cannot do median-centering on intensity values, exiting')
        sys.exit(1)

    quantcols = tsvreader.get_columns_by_pattern(psmheader,
                                                 self.quantcolpattern)
    if self.mednorm_factors:
        mnhead = tsvreader.get_tsv_header(self.mednorm_factors)
        mn_factors = tsvreader.generate_split_tsv_lines(
            self.mednorm_factors, mnhead)
    else:
        mn_factors = False
    nopsms = [isosummarize.get_no_psms_field(quantfield)
              for quantfield in quantcols]
    # Build a new list rather than extending in place, as before
    self.header = (self.header + quantcols + nopsms
                   + [prottabledata.HEADER_NO_FULLQ_PSMS])
    return isosummarize.get_isobaric_ratios(
        self.psmfile, psmheader, quantcols, denomcols, self.mediansweep,
        self.medianintensity, self.median_or_avg, self.minint, features,
        self.headeraccfield, self.fixedfeatcol, False, False, False,
        self.logisoquant, self.mediannormalize, mn_factors,
        self.keepnapsms)
def set_features(self):
    """Set ``self.header`` and ``self.psms`` for isobaric summarizing.

    Denominator columns come from ``self.denomcols`` (column numbers) or
    ``self.denompatterns`` (patterns matched against ``self.oldheader``).

    Raises:
        RuntimeError: when neither is given and neither median sweep nor
            median intensity is requested.
    """
    denomcols = False
    if self.denomcols is not None:
        denomcols = [self.number_to_headerfield(colnr, self.oldheader)
                     for colnr in self.denomcols]
    elif self.denompatterns is not None:
        matched = [tsvreader.get_columns_by_pattern(self.oldheader,
                                                    denompattern)
                   for denompattern in self.denompatterns]
        denomcols = {field for fields in matched for field in fields}
    elif not (self.mediansweep or self.medianintensity):
        raise RuntimeError('Must define either denominator column numbers '
                           'or regex pattterns to find them')

    quantcols = tsvreader.get_columns_by_pattern(self.oldheader,
                                                 self.quantcolpattern)
    if self.mednorm_factors:
        mnhead = tsvreader.get_tsv_header(self.mednorm_factors)
        mn_factors = tsvreader.generate_split_tsv_lines(
            self.mednorm_factors, mnhead)
    else:
        mn_factors = False
    nopsms = [isosummarize.get_no_psms_field(quantfield)
              for quantfield in quantcols]

    if not self.featcol:
        ratiofields = ['ratio_{}'.format(x) for x in quantcols]
        self.header = self.oldheader + ratiofields
    else:
        # Called before self.featcol is read for the header below
        self.get_column_header_for_number(['featcol'], self.oldheader)
        self.header = ([self.featcol] + quantcols + nopsms
                       + [HEADER_NO_FULLQ_PSMS])
    self.psms = isosummarize.get_isobaric_ratios(
        self.fn, self.oldheader, quantcols, denomcols, self.mediansweep,
        self.medianintensity, self.median_or_avg, self.minint, False, False,
        self.featcol, False, False, False, self.logisoquant,
        self.mediannormalize, mn_factors, self.keepnapsms)
def get_isobaric_ratios(psmfn, psmheader, channels, denom_channels, min_int,
                        targetfn, accessioncol, normalize, normratiofn):
    """Main function to calculate ratios for PSMs, peptides, proteins, genes.
    Can do simple ratios, median-of-ratios and median-centering
    normalization.

    Ratios come from ``get_psmratios``. When ``normalize`` is set they are
    normalized with channel medians taken from ``normratiofn`` if passed,
    else from the calculated ratios themselves.

    Returns, depending on ``accessioncol`` and ``targetfn``:
      - both set: ``output_to_target_accession_table`` output
      - neither set: ``paste_to_psmtable`` output
      - only ``accessioncol``: a generator of ratio dicts in which the
        ISOQUANTRATIO_FEAT_ACC key is replaced by the accession header

    Raises:
        RuntimeError: when ``targetfn`` is passed without ``accessioncol``.
            That combination used to fall through and return None.
    """
    psm_or_feat_ratios = get_psmratios(psmfn, psmheader, channels,
                                       denom_channels, min_int, accessioncol)
    if normalize:
        if normratiofn:
            normheader = reader.get_tsv_header(normratiofn)
            normratios = get_ratios_from_fn(normratiofn, normheader, channels)
        else:
            # The ratios are iterated twice in this branch (medians, then
            # normalizing), so materialize them in case get_psmratios
            # returns a one-shot iterator.
            psm_or_feat_ratios = list(psm_or_feat_ratios)
            normratios = [[feat[ch] for ch in channels]
                          for feat in psm_or_feat_ratios]
        ch_medians = get_medians(channels, normratios, report=True)
        outratios = calculate_normalized_ratios(psm_or_feat_ratios,
                                                ch_medians, channels)
    else:
        outratios = psm_or_feat_ratios
    # at this point, outratios look like:
    # [{ch1: 123, ch2: 456, ISOQUANTRATIO_FEAT_ACC: ENSG1244}, ]
    if accessioncol and targetfn:
        outratios = {x[ISOQUANTRATIO_FEAT_ACC]: x for x in outratios}
        return output_to_target_accession_table(targetfn, outratios, channels)
    if not accessioncol and not targetfn:
        return paste_to_psmtable(psmfn, psmheader, outratios)
    if accessioncol and not targetfn:
        # generate new table with accessions
        return ({(k if not k == ISOQUANTRATIO_FEAT_ACC
                  else prottabledata.HEADER_ACCESSION): v
                 for k, v in ratio.items()} for ratio in outratios)
    raise RuntimeError('Cannot output ratios to a target table without an '
                       'accession column')
def get_psms(self):
    """Set ``self.header`` and ``self.psms`` with normalized ratios.

    ``self.header`` becomes a copy of ``self.oldheader``. Denominator
    columns come from ``self.denomcols`` or ``self.denompatterns``.

    Raises:
        RuntimeError: when neither denominator option is given.
    """
    self.header = self.oldheader[:]
    if self.denomcols is None and self.denompatterns is None:
        raise RuntimeError('Must define either denominator column numbers '
                           'or regex pattterns to find them')
    if self.denomcols is not None:
        denomcols = [self.number_to_headerfield(colnr, self.oldheader)
                     for colnr in self.denomcols]
    else:
        matched = [tsv.get_columns_by_pattern(self.oldheader, denompattern)
                   for denompattern in self.denompatterns]
        denomcols = {field for fields in matched for field in fields}
    quantcols = tsv.get_columns_by_pattern(self.oldheader,
                                           self.quantcolpattern)
    medianheader = (tsv.get_tsv_header(self.medianpsms)
                    if self.medianpsms is not None else False)
    self.psms = prep.get_normalized_ratios(
        self.fn, self.oldheader, quantcols, denomcols, self.minint,
        self.medianpsms, medianheader)
def prepare(self):
    """Read the header of ``self.fn`` and look up score and precursor
    quant columns in it."""
    self.oldheader = tsvreader.get_tsv_header(self.fn)
    self.get_column_header_for_number(['spectracol'])
    header = self.oldheader
    self.scorecol = tsvreader.get_cols_in_file(
        self.scorecolpattern, header, True)
    self.precurquantcol = psmtopeptable.get_quantcols(
        self.precursorquantcolpattern, header, 'precur')
def initialize_input(self):
    """Read the header of ``self.fn`` and look up score and precursor
    quant columns in it."""
    self.oldheader = tsvreader.get_tsv_header(self.fn)
    self.get_column_header_for_number(['spectracol'])
    header = self.oldheader
    self.scorecol = tsvreader.get_cols_in_file(
        self.scorecolpattern, header, True)
    self.precurquantcol = prep.get_quantcols(
        self.precursorquantcolpattern, header, 'precur')
def get_colmap(fns, pattern, single_col=False, antipattern=False):
    """For table files, loops through headers and checks which column(s)
    match a passed pattern. Those column(s) names are returned in a map with
    filenames (basenames) as keys.

    Columns also matching ``antipattern`` are left out. Returns False as
    soon as a file has no columns left.
    """
    def cols_or_empty(colpattern, header):
        # A RuntimeError from the lookup means the columns are not in
        # this file
        try:
            return tsvreader.get_cols_in_file(colpattern, header, single_col)
        except RuntimeError:
            return []

    colmap = {}
    for fn in fns:
        header = tsvreader.get_tsv_header(fn)
        basefn = os.path.basename(fn)
        cols = cols_or_empty(pattern, header)
        if antipattern:
            anticols = cols_or_empty(antipattern, header)
            cols = [col for col in cols if col not in anticols]
        if not cols:
            return False
        colmap[basefn] = cols
    return colmap
def get_psms(self):
    """Set ``self.header`` and ``self.psms`` with isobaric ratios.

    The header layout depends on whether ``self.proteincol`` and
    ``self.targettable`` are set; no header is set when only a target
    table is given.

    Raises:
        RuntimeError: when neither denominator option is given.
    """
    if self.denomcols is None and self.denompatterns is None:
        raise RuntimeError('Must define either denominator column numbers '
                           'or regex pattterns to find them')
    if self.denomcols is not None:
        denomcols = [self.number_to_headerfield(colnr, self.oldheader)
                     for colnr in self.denomcols]
    else:
        matched = [tsv.get_columns_by_pattern(self.oldheader, denompattern)
                   for denompattern in self.denompatterns]
        denomcols = {field for fields in matched for field in fields}
    quantcols = tsv.get_columns_by_pattern(self.oldheader,
                                           self.quantcolpattern)
    self.get_column_header_for_number(['proteincol'], self.oldheader)
    nopsms = [prep.get_no_psms_field(quantfield) for quantfield in quantcols]
    if self.proteincol:
        if self.targettable:
            targetheader = tsv.get_tsv_header(self.targettable)
            self.header = targetheader + quantcols + nopsms
        else:
            self.header = ([prottabledata.HEADER_ACCESSION] + quantcols
                           + nopsms)
    elif not self.targettable:
        ratiofields = ['ratio_{}'.format(x) for x in quantcols]
        self.header = self.oldheader + ratiofields
    self.psms = prep.get_isobaric_ratios(
        self.fn, self.oldheader, quantcols, denomcols, self.minint,
        self.targettable, self.proteincol, self.normalize,
        self.normalizeratios)
def initialize_input(self):
    """Read the header of ``self.fn`` and look up score and precursor
    quant columns in it."""
    self.oldheader = tsvreader.get_tsv_header(self.fn)
    self.get_column_header_for_number(['spectracol'])
    header = self.oldheader
    self.scorecol = tsvreader.get_cols_in_file(
        self.scorecolpattern, header, True)
    self.precurquantcol = prep.get_quantcols(
        self.precursorquantcolpattern, header, 'precur')
def merge_mzidtsvs(fns, header):
    """Yield the lines of several TSV files that share one header.

    This is a generator, so the header check runs when iteration starts.

    Raises:
        RuntimeError: when the header of any file in ``fns`` differs from
            ``header``.
    """
    if any(header != tsvreader.get_tsv_header(fn) for fn in fns):
        raise RuntimeError('Headers of TSV files to concatenate are '
                           'not identical')
    yield from tsvreader.generate_tsv_lines_multifile(fns, header)
def get_psms(self):
    """Set ``self.header`` and ``self.psms`` with isobaric ratios.

    The header layout depends on whether ``self.proteincol`` and
    ``self.targettable`` are set; no header is set when only a target
    table is given.

    Raises:
        RuntimeError: when neither denominator option is given.
    """
    if self.denomcols is None and self.denompatterns is None:
        raise RuntimeError('Must define either denominator column numbers '
                           'or regex pattterns to find them')
    if self.denomcols is not None:
        denomcols = [self.number_to_headerfield(colnr, self.oldheader)
                     for colnr in self.denomcols]
    else:
        matched = [tsv.get_columns_by_pattern(self.oldheader, denompattern)
                   for denompattern in self.denompatterns]
        denomcols = {field for fields in matched for field in fields}
    quantcols = tsv.get_columns_by_pattern(self.oldheader,
                                           self.quantcolpattern)
    self.get_column_header_for_number(['proteincol'], self.oldheader)
    nopsms = [prep.get_no_psms_field(quantfield) for quantfield in quantcols]
    if self.proteincol:
        if self.targettable:
            targetheader = tsv.get_tsv_header(self.targettable)
            self.header = targetheader + quantcols + nopsms
        else:
            self.header = ([prottabledata.HEADER_ACCESSION] + quantcols
                           + nopsms)
    elif not self.targettable:
        ratiofields = ['ratio_{}'.format(x) for x in quantcols]
        self.header = self.oldheader + ratiofields
    self.psms = prep.get_isobaric_ratios(
        self.fn, self.oldheader, quantcols, denomcols, self.minint,
        self.targettable, self.proteincol, self.normalize,
        self.normalizeratios)
def get_isobaric_ratios(psmfn, psmheader, channels, denom_channels, min_int,
                        targetfn, accessioncol, normalize, normratiofn):
    """Main function to calculate ratios for PSMs, peptides, proteins, genes.
    Can do simple ratios, median-of-ratios and median-centering
    normalization.

    Ratios come from ``get_psmratios``. When ``normalize`` is set they are
    normalized with channel medians taken from ``normratiofn`` if passed,
    else from the calculated ratios themselves.

    Returns, depending on ``accessioncol`` and ``targetfn``:
      - both set: ``output_to_target_accession_table`` output
      - neither set: ``paste_to_psmtable`` output
      - only ``accessioncol``: a generator of ratio dicts in which the
        ISOQUANTRATIO_FEAT_ACC key is replaced by the accession header

    Raises:
        RuntimeError: when ``targetfn`` is passed without ``accessioncol``.
            That combination used to fall through and return None.
    """
    psm_or_feat_ratios = get_psmratios(psmfn, psmheader, channels,
                                       denom_channels, min_int, accessioncol)
    if normalize:
        if normratiofn:
            normheader = reader.get_tsv_header(normratiofn)
            normratios = get_ratios_from_fn(normratiofn, normheader, channels)
        else:
            # The ratios are iterated twice in this branch (medians, then
            # normalizing), so materialize them in case get_psmratios
            # returns a one-shot iterator.
            psm_or_feat_ratios = list(psm_or_feat_ratios)
            normratios = [[feat[ch] for ch in channels]
                          for feat in psm_or_feat_ratios]
        ch_medians = get_medians(channels, normratios, report=True)
        outratios = calculate_normalized_ratios(psm_or_feat_ratios,
                                                ch_medians, channels)
    else:
        outratios = psm_or_feat_ratios
    # at this point, outratios look like:
    # [{ch1: 123, ch2: 456, ISOQUANTRATIO_FEAT_ACC: ENSG1244}, ]
    if accessioncol and targetfn:
        outratios = {x[ISOQUANTRATIO_FEAT_ACC]: x for x in outratios}
        return output_to_target_accession_table(targetfn, outratios, channels)
    if not accessioncol and not targetfn:
        return paste_to_psmtable(psmfn, psmheader, outratios)
    if accessioncol and not targetfn:
        # generate new table with accessions
        return ({(k if not k == ISOQUANTRATIO_FEAT_ACC
                  else prottabledata.HEADER_ACCESSION): v
                 for k, v in ratio.items()} for ratio in outratios)
    raise RuntimeError('Cannot output ratios to a target table without an '
                       'accession column')
def create_lookup(self):
    """Create the PSM lookup from ``self.fn`` via
    ``lookup.create_psm_lookup``.

    ``self.spectracol`` is used as a 1-based column number to pick the
    spectra file field from the header.
    """
    psmheader = tsvreader.get_tsv_header(self.fn)
    spectra_index = int(self.spectracol) - 1
    specfncol = psmheader[spectra_index]
    delim_and_field = self.get_fastadelim_genefield(self.fastadelim,
                                                    self.genefield)
    fastadelim, genefield = delim_and_field
    lookup.create_psm_lookup(
        self.fn, self.fasta, self.mapfn, psmheader, self.lookup,
        self.unroll, specfncol, self.decoy, fastadelim, genefield)
def prepare(self):
    """Read the input header and set up the PSM line generator.

    ``self.fn`` may be a single file or a list of files; with a list the
    header is read from its first item. Sets ``self.first_infile``,
    ``self.oldheader`` and ``self.oldpsms``.
    """
    # isinstance instead of an exact type comparison, so that list
    # subclasses are also treated as a list of files
    if isinstance(self.fn, list):
        self.first_infile = self.fn[0]
    else:
        self.first_infile = self.fn
    self.oldheader = tsvreader.get_tsv_header(self.first_infile)
    self.oldpsms = tsvreader.generate_split_tsv_lines(
        self.fn, self.oldheader)
def create_lookup(self):
    """Create the PSM lookup from ``self.fn`` via
    ``lookup.create_psm_lookup``.

    ``self.spectracol`` is used as a 1-based column number to pick the
    spectra file field from the header.
    """
    psmheader = tsvreader.get_tsv_header(self.fn)
    spectra_index = int(self.spectracol) - 1
    specfncol = psmheader[spectra_index]
    delim_and_field = self.get_fastadelim_genefield(self.fastadelim,
                                                    self.genefield)
    fastadelim, genefield = delim_and_field
    lookup.create_psm_lookup(
        self.fn, self.fasta, self.mapfn, psmheader, self.lookup,
        self.unroll, specfncol, self.decoy, fastadelim, genefield)
def initialize_input(self):
    """Run the parent initialization, then set quant fields, the quant
    accession column and the feature generator from ``self.quantfile``."""
    super().initialize_input()
    quantfile = self.quantfile
    quantheader = reader.get_tsv_header(quantfile)
    self.quantfields = reader.get_cols_in_file(self.quantcolpattern,
                                               quantheader)
    self.quantacc = reader.get_cols_in_file(
        self.quantacccolpattern, quantheader, single_col=True)
    self.quantpeptides = reader.generate_tsv_proteins(quantfile, quantheader)
def run(self):
    """Read the input header, then get PSMs, write and finish.

    ``self.fn`` may be a single file or a list of files; with a list the
    header is read from its first item.
    """
    # isinstance instead of an exact type comparison, so that list
    # subclasses are also treated as a list of files
    if isinstance(self.fn, list):
        self.first_infile = self.fn[0]
    else:
        self.first_infile = self.fn
    self.oldheader = tsvreader.get_tsv_header(self.first_infile)
    self.get_psms()
    self.write()
    self.finish()
def initialize_input(self):
    """Run the parent initialization, then set quant fields, the quant
    accession column and the feature generator from ``self.quantfile``."""
    super().initialize_input()
    quantfile = self.quantfile
    quantheader = reader.get_tsv_header(quantfile)
    self.quantfields = reader.get_cols_in_file(self.quantcolpattern,
                                               quantheader)
    self.quantacc = reader.get_cols_in_file(
        self.quantacccolpattern, quantheader, single_col=True)
    self.quantfeatures = reader.generate_tsv_proteins(quantfile, quantheader)
def initialize_input(self):
    """Run the parent initialization, then read the header of
    ``self.pepfile`` and resolve the protein and score columns in it.

    The protein column comes from ``self.proteincol`` when that is set,
    else from ``self.pcolpattern`` when that is set.
    """
    super().initialize_input()
    self.pepheader = tsvreader.get_tsv_header(self.pepfile)
    if self.proteincol:
        self.get_column_header_for_number(['proteincol'], self.pepheader)
    elif self.pcolpattern:
        protein_field = tsvreader.get_cols_in_file(
            self.pcolpattern, self.pepheader, True)
        self.proteincol = protein_field
    score_field = tsvreader.get_cols_in_file(
        self.scorecolpattern, self.pepheader, True)
    self.scorecol = score_field
def set_features(self):
    """Set ``self.features`` from target and decoy tables with FDR and
    quant data.

    Uses ``proteins.generate_pick_fdr`` when ``self.fdrtype`` is
    'picked', else ``proteins.generate_classic_fdr``. Prints a message
    and exits with status 1 when picked FDR is requested without both
    target and decoy FASTA.
    """
    theader = tsvreader.get_tsv_header(self.fn)
    dheader = tsvreader.get_tsv_header(self.decoyfn)
    targets, decoys = self.get_td_proteins_bestpep(theader, dheader)
    if self.fdrtype != 'picked':
        features = proteins.generate_classic_fdr(targets, decoys,
                                                 self.headeraccfield)
    else:
        if not (self.t_fasta and self.d_fasta):
            print(
                'Must use --targetfasta and --decoyfasta when using picked FDR'
            )
            sys.exit(1)
        fastadelim, genefield = self.get_fastadelim_genefield(
            self.fastadelim, self.genefield)
        features = proteins.generate_pick_fdr(
            targets, decoys, self.t_fasta, self.d_fasta,
            self.headeraccfield, fastadelim, genefield)
    self.features = self.get_quant(theader, features)
def write(self):
    """Write one output TSV per pair of PSM table and mzid file.

    Files in ``self.fn`` and ``self.mzidfns`` are paired by position.
    """
    # FIXME 'Strip', 'Fraction', 'missed_cleavage'
    for psmfn, mzidfn in zip(self.fn, self.mzidfns):
        oldheader = tsvreader.get_tsv_header(psmfn)
        outheader = prep.get_header_with_percolator(oldheader)
        outfn = self.create_outfilepath(psmfn, self.outsuffix)
        inpsms = tsvreader.generate_tsv_psms(psmfn, oldheader)
        namespace = mzidreader.get_mzid_namespace(mzidfn)
        specresults = mzidreader.mzid_spec_result_generator(mzidfn,
                                                            namespace)
        outpsms = prep.add_fdr_to_mzidtsv(inpsms, specresults, namespace,
                                          self.percopsms)
        writer.write_tsv(outheader, outpsms, outfn)
def output_to_target_accession_table(targetfn, featratios, channels):
    """Yield the rows of the target table with quant data added.

    Loops the target table and adds ratios from ``featratios``, keyed on
    the value in the first column of the table. Rows without an entry in
    ``featratios`` get 'NA' for each channel and its PSM count field.

    The ``featratios`` entries are not modified, so several target rows
    with the same accession all get their quant data. Popping the
    accession key from the shared entry raised a KeyError on the second
    such row.
    """
    theader = reader.get_tsv_header(targetfn)
    acc_field = theader[0]
    for feat in reader.generate_tsv_proteins(targetfn, theader):
        try:
            featquants = featratios[feat[acc_field]]
        except KeyError:
            quants = {ch: 'NA' for ch in channels}
            quants.update({get_no_psms_field(ch): 'NA' for ch in channels})
        else:
            # Copy without the accession key instead of popping in place
            quants = {field: value for field, value in featquants.items()
                      if field != ISOQUANTRATIO_FEAT_ACC}
        feat.update(quants)
        yield feat
def output_to_target_accession_table(targetfn, featratios, channels):
    """Yield the rows of the target table with quant data added.

    Loops the target table and adds ratios from ``featratios``, keyed on
    the value in the first column of the table. Rows without an entry in
    ``featratios`` get 'NA' for each channel and its PSM count field.

    The ``featratios`` entries are not modified, so several target rows
    with the same accession all get their quant data. Popping the
    accession key from the shared entry raised a KeyError on the second
    such row.
    """
    theader = reader.get_tsv_header(targetfn)
    acc_field = theader[0]
    for feat in reader.generate_tsv_proteins(targetfn, theader):
        try:
            featquants = featratios[feat[acc_field]]
        except KeyError:
            quants = {ch: 'NA' for ch in channels}
            quants.update({get_no_psms_field(ch): 'NA' for ch in channels})
        else:
            # Copy without the accession key instead of popping in place
            quants = {field: value for field, value in featquants.items()
                      if field != ISOQUANTRATIO_FEAT_ACC}
        feat.update(quants)
        yield feat
def get_colmap(fns, pattern, single_col=False, antipattern=False):
    """For table files, loops through headers and checks which column(s)
    match a passed pattern. Those column(s) names are returned in a map with
    filenames (basenames) as keys.

    Columns also matching ``antipattern`` are left out, and files without
    any remaining columns get no entry in the map.
    """
    colmap = {}
    for fn in fns:
        header = tsvreader.get_tsv_header(fn)
        basefn = os.path.basename(fn)
        matched = tsvreader.get_cols_in_file(pattern, header, single_col)
        if antipattern:
            excluded = tsvreader.get_cols_in_file(antipattern, header,
                                                  single_col)
            matched = [field for field in matched if field not in excluded]
        if not matched:
            continue
        colmap[basefn] = matched
    return colmap
def get_colmap(fns, pattern, single_col=False, antipattern=False):
    """For table files, loops through headers and checks which column(s)
    match a passed pattern. Those column(s) names are returned in a map with
    filenames (basenames) as keys.

    Columns also matching ``antipattern`` are left out, and files without
    any remaining columns get no entry in the map.
    """
    colmap = {}
    for fn in fns:
        header = tsvreader.get_tsv_header(fn)
        basefn = os.path.basename(fn)
        matched = tsvreader.get_cols_in_file(pattern, header, single_col)
        if antipattern:
            excluded = tsvreader.get_cols_in_file(antipattern, header,
                                                  single_col)
            matched = [field for field in matched if field not in excluded]
        if not matched:
            continue
        colmap[basefn] = matched
    return colmap
def write(self):
    """Write one output TSV per pair of PSM table and mzid file.

    Files in ``self.fn`` and ``self.mzidfns`` are paired by position.
    PSMs are passed through ``filtering.filter_psms_conf`` on the PSM
    and/or peptide q-value fields when ``self.filtpsm`` / ``self.filtpep``
    are set.
    """
    for psmfn, mzidfn in zip(self.fn, self.mzidfns):
        oldheader = tsvreader.get_tsv_header(psmfn)
        outheader = perco.get_header_with_percolator(oldheader)
        outfn = self.create_outfilepath(psmfn, self.outsuffix)
        namespace = mzidreader.get_mzid_namespace(mzidfn)
        specresults = mzidreader.mzid_spec_result_generator(mzidfn,
                                                            namespace)
        inpsms = tsvreader.generate_split_tsv_lines(psmfn, oldheader)
        outpsms = perco.add_fdr_to_mzidtsv(inpsms, specresults, namespace,
                                           self.percopsms)
        if self.filtpsm:
            outpsms = filtering.filter_psms_conf(
                outpsms, psmhead.HEADER_PSMQ, self.filtpsm, True)
        if self.filtpep:
            outpsms = filtering.filter_psms_conf(
                outpsms, psmhead.HEADER_PEPTIDE_Q, self.filtpep, True)
        writer.write_tsv(outheader, outpsms, outfn)
def get_psms(self):
    """Set ``self.header`` and ``self.psms`` with normalized ratios.

    ``self.header`` becomes a copy of ``self.oldheader``. Denominator
    columns come from ``self.denomcols`` or ``self.denompatterns``.

    Raises:
        RuntimeError: when neither denominator option is given.
    """
    self.header = self.oldheader[:]
    if self.denomcols is None and self.denompatterns is None:
        raise RuntimeError('Must define either denominator column numbers '
                           'or regex pattterns to find them')
    if self.denomcols is not None:
        denomcols = [self.number_to_headerfield(colnr, self.oldheader)
                     for colnr in self.denomcols]
    else:
        matched = [tsv.get_columns_by_pattern(self.oldheader, denompattern)
                   for denompattern in self.denompatterns]
        denomcols = {field for fields in matched for field in fields}
    quantcols = tsv.get_columns_by_pattern(self.oldheader,
                                           self.quantcolpattern)
    medianheader = (tsv.get_tsv_header(self.medianpsms)
                    if self.medianpsms is not None else False)
    self.psms = prep.get_normalized_ratios(
        self.fn, self.oldheader, quantcols, denomcols, self.minint,
        self.medianpsms, medianheader)
def parse_input(self, **kwargs):
    """Run the parent input parsing, then set the output header and
    lookup type from the first header field of the first input file.

    ``self.pepcolpattern`` is reset to None for anything other than a
    peptide table. When the first field matches none of the known
    fields, only the one-field header is set.
    """
    super().parse_input(**kwargs)
    header = tsvreader.get_tsv_header(self.fn[0])
    first_field = header[0]
    self.header = [first_field]
    self.is_peptidetable = first_field == peph.HEADER_PEPTIDE
    if not self.is_peptidetable:
        self.pepcolpattern = None  # override input if any

    if self.is_peptidetable:
        if self.genecentric:
            self.lookuptype = 'peptidegenecentrictable'
            self.header += [peph.HEADER_GENES, peph.HEADER_ASSOCIATED]
        elif self.nogroup:
            self.lookuptype = 'peptidetableplain'
            self.header += [peph.HEADER_PROTEINS]
        else:
            self.header += [
                peph.HEADER_PROTEINS,
                peph.HEADER_NO_CONTENTPROTEINS,
                peph.HEADER_DESCRIPTIONS,
                peph.HEADER_COVERAGES,
                peph.HEADER_GENES,
                peph.HEADER_ASSOCIATED,
            ]
            self.lookuptype = 'peptidetable'
    elif first_field == ph.HEADER_PROTEIN:
        self.lookuptype = 'prottable'
        self.header += [
            ph.HEADER_GENEID,
            ph.HEADER_GENENAME,
            ph.HEADER_DESCRIPTION,
            ph.HEADER_COVERAGE,
            ph.HEADER_CONTENTPROT,
            ph.HEADER_NO_PROTEIN,
        ]
    elif first_field == ph.HEADER_GENENAME:
        self.lookuptype = 'associdtable'
        self.header += [ph.HEADER_GENEID, ph.HEADER_PROTEINS,
                        ph.HEADER_DESCRIPTION]
    elif first_field == ph.HEADER_GENEID:
        self.lookuptype = 'genetable'
        self.header += [ph.HEADER_GENENAME, ph.HEADER_PROTEINS,
                        ph.HEADER_DESCRIPTION]
decoys = {True: 0, False: 0} for psm in sorted([(pid, float(p.find('{%s}svm_score' % ns['xmlns']).text), p) for pid, p in psms.items()], reverse=True, key=lambda x:x[1]): pdecoy = psm[2].attrib['{%s}decoy' % ns['xmlns']] == 'true' decoys[pdecoy] += 1 try: psms[psm[0]] = {'decoy': pdecoy, 'svm': psm[1], 'qval': decoys[True]/decoys[False]} # T-TDC except ZeroDivisionError: psms[psm[0]] = {'decoy': pdecoy, 'svm': psm[1], 'qval': 1} # T-TDC decoys = {'true': 0, 'false': 0} for svm, pep in sorted([(float(x.find('{%s}svm_score' % ns['xmlns']).text), x) for x in pycolator.generate_peptides(perco, ns)], reverse=True, key=lambda x:x[0]): decoys[pep.attrib['{%s}decoy' % ns['xmlns']]] += 1 try: [psms[pid.text].update({'pepqval': decoys['true']/decoys['false']}) for pid in pep.find('{%s}psm_ids' % ns['xmlns'])] except ZeroDivisionError: [psms[pid.text].update({'pepqval': 1}) for pid in pep.find('{%s}psm_ids' % ns['xmlns'])] oldheader = tsv.get_tsv_header(mzidtsvfns[0]) header = oldheader + ['percolator svm-score', 'PSM q-value', 'peptide q-value', 'Strip', 'Fraction', 'missed_cleavage'] with open('tmzidperco', 'w') as tfp, open('dmzidperco', 'w') as dfp: tfp.write('\t'.join(header)) dfp.write('\t'.join(header)) for fnix, mzidfn in enumerate(mzidfns): mzns = mzidplus.get_mzid_namespace(mzidfn) inpsms = tsv.generate_tsv_psms(mzidtsvfns[fnix], oldheader) for specidr in mzidplus.mzid_spec_result_generator(mzidfn, mzns): for specidi in specidr.findall('{%s}SpectrumIdentificationItem' % mzns['xmlns']): psm = next(inpsms) # percolator psm ID is: samplename_SII_scanindex_rank_scannr_charge_rank scanindex, rank = specidi.attrib['id'].replace('SII_', '').split('_') scan = {x.split('=')[0]: x.split('=')[1] for x in specidr.attrib['spectrumID'].split(' ')}['scan'] outpsm = {k: v for k,v in psm.items()} spfile = os.path.splitext(psm['#SpecFile'])[0]
def set_features(self):
    """Creates iterator to write to new tsv. Contains input tsv
    lines plus quant data for these.

    Sets ``self.header`` and ``self.psms``. When ``self.oldpsmfile`` is
    passed, its lines are chained before the new PSMs and row numbers of
    the new PSMs are shifted past the highest stored one.

    Prints a message and exits with status 1 when protein grouping is
    requested while no FASTA is known, neither passed nor stored with a
    previous PSM table.
    """
    # First prepare the data, read PSM table to SQLite
    specfncolnr = int(self.spectracol) - 1
    specfncol = self.oldheader[specfncolnr]
    fastadelim, genefield = self.get_fastadelim_genefield(self.fastadelim,
                                                          self.genefield)
    fasta_md5 = refine.get_fasta_md5(self.fasta) if self.fasta else False
    # If appending to previously refined PSM table, reuse DB and shift rows
    if self.oldpsmfile:
        oldfasta_md5 = self.lookup.get_fasta_md5()
        if fasta_md5 != oldfasta_md5:
            print('WARNING, FASTA database used in old PSM table differs '
                  'from the passed database (or this cannot be determined '
                  'due to version differences), this may cause problems, as '
                  'msstitch will use the old database for PSM annotation.')
        shiftrows = self.lookup.get_highest_rownr() + 1
        proteins = set(self.lookup.get_protids())
        self.lookup.drop_psm_indices()
    else:
        # No old PSM table, so no stored FASTA checksum. The name must be
        # bound anyway: the protein group check below reads it, and used
        # to fail with an UnboundLocalError on this path.
        oldfasta_md5 = False
        shiftrows = 0

    if self.proteingroup:
        if not fasta_md5 and not oldfasta_md5:
            # In case of old Fasta already stored it will be fine to
            # protein group
            print('Cannot create protein group without supplying FASTA search '
                  'database file')
            sys.exit(1)
        self.tabletypes.append('proteingroup')
        self.lookup.drop_pgroup_tables()
    self.lookup.add_tables(self.tabletypes)

    # Need to place this here since we cannot store before having done add
    # tables, but that has to be done after getting proteingroup knowledge,
    # which depends on knowledge of having passed an oldpsmfile (because of
    # oldfasta_md5):
    if not self.oldpsmfile:
        proteins = refine.store_proteins_descriptions(
            self.lookup, self.fasta, fasta_md5, self.fn, self.oldheader,
            fastadelim, genefield)
    refine.create_psm_lookup(self.fn, self.oldheader, proteins, self.lookup,
                             shiftrows, self.unroll, specfncol, fastadelim,
                             genefield)
    isob_header = ([x[0] for x in self.lookup.get_all_quantmaps()]
                   if self.isobaric else False)
    self.header = refine.create_header(
        self.oldheader, self.genes, self.proteingroup, self.precursor,
        isob_header, self.addbioset, self.addmiscleav, specfncolnr)
    psms = self.oldpsms
    # Now pass PSMs through multiple generators to add info
    if self.genes:
        psms = refine.add_genes_to_psm_table(psms, self.lookup)
    if self.isobaric or self.precursor:
        psms = refine.generate_psms_quanted(
            self.lookup, shiftrows, psms, isob_header, self.isobaric,
            self.precursor, self.min_purity)
    psms = refine.generate_psms_spectradata(
        self.lookup, shiftrows, psms, self.addbioset, self.addmiscleav)
    if self.oldpsmfile:
        prevheader = tsvreader.get_tsv_header(self.oldpsmfile)
        previouspsms = tsvreader.generate_split_tsv_lines(self.oldpsmfile,
                                                          prevheader)
        psms = chain(previouspsms, psms)
    # Enforce proteingroup last, since it has to come AFTER the chaining of
    # old + new PSMs. In theory you could do it before, but that makes no
    # sense since in a big experiment you also do not map PSMs to genes
    # differently? If that is needed, you have to run multiple experiments.
    if self.proteingroup:
        refine.build_proteingroup_db(self.lookup)
        psms = refine.generate_psms_with_proteingroups(
            psms, self.lookup, specfncol, self.unroll)
    self.psms = psms
def initialize_input(self):
    """Read the header of ``self.fn`` into ``self.oldheader`` and set
    ``self.in_proteins`` from ``reader.generate_tsv_proteins``."""
    infile = self.fn
    self.oldheader = reader.get_tsv_header(infile)
    self.in_proteins = reader.generate_tsv_proteins(infile, self.oldheader)
def initialize_input(self):
    """Read the header of ``self.fn`` and look up the score and q-value
    columns in it by their patterns."""
    header = tsvreader.get_tsv_header(self.fn)
    self.oldheader = header
    self.scorecol = tsvreader.get_cols_in_file(
        self.scorecolpattern, header, True)
    self.qvalcol = tsvreader.get_cols_in_file(
        self.fdrcolpattern, header, True)
def set_features(self):
    """Set ``self.header`` and ``self.features`` for a peptide table.

    The output header is built from ``self.oldheader``: the spectra file
    field and isobaric quant fields are removed, the precursor quant field
    is renamed, and PSM field names are switched to peptide field names.
    Peptides come from ``psmtopeptable.generate_peptides`` and are
    optionally passed through isobaric summarizing and q-value modeling.

    Prints a message and exits with status 1 when a total proteome table
    is passed whose first header field cannot be matched.
    """
    # Fall back on a default pattern for finding isobaric columns to remove
    qpat = self.quantcolpattern if self.quantcolpattern else '[a-z]+[0-9]+plex_'
    header = [x for x in self.oldheader if x != psmh.HEADER_SPECFILE]
    try:
        isocols = tsvreader.get_columns_by_pattern(header, qpat)
    except RuntimeError:
        # NOTE(review): presumably raised when no column matches, in which
        # case there is nothing to remove -- confirm in tsvreader
        pass
    else:
        for col in isocols:
            header.pop(header.index(col))
    if self.precurquantcol:
        header = [peph.HEADER_AREA if x == self.precurquantcol else x for x in header]
    # Peptide and linked PSM fields go first in the output header
    header = [peph.HEADER_PEPTIDE, peph.HEADER_LINKED_PSMS] + [
        x for x in header if x != psmh.HEADER_PEPTIDE]
    # Maps PSM table field names to their peptide table names
    switch_map = {old: new for old, new in zip(
        [psmh.HEADER_PEPTIDE, psmh.HEADER_PROTEIN, psmh.HEADER_PEPTIDE_Q],
        [peph.HEADER_PEPTIDE, peph.HEADER_PROTEINS, peph.HEADER_QVAL])}
    self.header = [switch_map[field] if field in switch_map else field for field in header]
    peptides = psmtopeptable.generate_peptides(self.fn, self.oldheader, switch_map, self.scorecol, self.precurquantcol, self.spectracol)
    # Remove quant data if not specified any way to summarize
    if self.quantcolpattern and any([self.denomcols, self.denompatterns, self.mediansweep, self.medianintensity]):
        denomcols = False
        if self.denomcols is not None:
            denomcols = [self.number_to_headerfield(col, self.oldheader) for col in self.denomcols]
        elif self.denompatterns is not None:
            denomcolnrs = [tsvreader.get_columns_by_pattern(self.oldheader, pattern) for pattern in self.denompatterns]
            denomcols = set([col for cols in denomcolnrs for col in cols])
        quantcols = tsvreader.get_columns_by_pattern(self.oldheader, self.quantcolpattern)
        totalproteome, tpacc, tp_pepacc = False, False, False
        if self.totalprotfn:
            pep_tp_accs = [psmh.HEADER_MASTER_PROT, psmh.HEADER_SYMBOL, psmh.HEADER_GENE, peph.HEADER_PROTEINS]
            totalphead = tsvreader.get_tsv_header(self.totalprotfn)
            totalpfield_found = False
            # Find the accession pair whose total proteome field is the
            # first field of that table and whose peptide field is in the
            # output header. tpacc and tp_pepacc keep the values of the
            # matching pair after the break.
            for tpacc, tp_pepacc in zip(proth.TPROT_HEADER_ACCS, pep_tp_accs):
                if totalphead[0] == tpacc and tp_pepacc in self.header:
                    totalpfield_found = True
                    break
            if not totalpfield_found:
                print('Could not find correct header field name in the total '
                      'proteome table passed. '
                      'Should be one of {}'.format(proth.TPROT_HEADER_ACCS))
                sys.exit(1)
            totalproteome = tsvreader.generate_split_tsv_lines(self.totalprotfn, totalphead)
        mn_factors = False
        if self.mednorm_factors:
            mnhead = tsvreader.get_tsv_header(self.mednorm_factors)
            mn_factors = tsvreader.generate_split_tsv_lines(self.mednorm_factors, mnhead)
        nopsms = [isosummarize.get_no_psms_field(qf) for qf in quantcols]
        self.header = self.header + quantcols + nopsms + [proth.HEADER_NO_FULLQ_PSMS]
        peptides = isosummarize.get_isobaric_ratios(self.fn, self.oldheader, quantcols, denomcols, self.mediansweep, self.medianintensity, self.median_or_avg, self.minint, peptides, self.header[0], psmh.HEADER_PEPTIDE, totalproteome, tpacc, tp_pepacc, self.logisoquant, self.mediannormalize, mn_factors, self.keepnapsms)
    if self.modelqvals:
        # The modeled q-value field goes right after the q-value field
        qix = self.header.index(peph.HEADER_QVAL) + 1
        self.header = self.header[:qix] + [peph.HEADER_QVAL_MODELED] + self.header[qix:]
        scorecol = tsvreader.get_cols_in_file(self.scorecolpattern, self.oldheader, True)
        peptides = psmtopeptable.recalculate_qvals_linear_model(peptides, scorecol, self.qvalthreshold, self.minpeptidenr)
    self.features = peptides
def initialize_input(self):
    """Read the header of ``self.fn`` into ``self.pepheader`` and set
    ``self.in_psms`` from ``reader.generate_tsv_psms``."""
    infile = self.fn
    self.pepheader = reader.get_tsv_header(infile)
    self.in_psms = reader.generate_tsv_psms(infile, self.pepheader)
def prepare(self):
    """Set the target file and read target and decoy headers.

    No percolator XML for protein tables.
    """
    target = self.fn
    self.target = target
    self.targetheader = reader.get_tsv_header(target)
    self.decoyheader = reader.get_tsv_header(self.decoyfn)
def initialize_input(self):
    """Set the target file and read the target and decoy headers.

    The target header is read twice, so ``self.oldheader`` and
    ``self.targetheader`` come from separate reads.
    """
    target = self.fn
    self.target = target
    self.oldheader = reader.get_tsv_header(target)
    self.targetheader = reader.get_tsv_header(target)
    self.decoyheader = reader.get_tsv_header(self.decoyfn)
def prepare(self):
    """Set the target file and read target and decoy headers.

    No percolator XML for protein tables.
    """
    target = self.fn
    self.target = target
    self.targetheader = reader.get_tsv_header(target)
    self.decoyheader = reader.get_tsv_header(self.decoyfn)
def initialize_input(self):
    """Read the header of ``self.fn`` into ``self.oldheader`` and set
    ``self.in_proteins`` from ``reader.generate_tsv_proteins``."""
    infile = self.fn
    self.oldheader = reader.get_tsv_header(infile)
    self.in_proteins = reader.generate_tsv_proteins(infile, self.oldheader)
def initialize_input(self):
    """Run the parent initialization, then set ``self.in_peptides`` from
    ``self.pepfile`` and resolve the protein column in its header."""
    super().initialize_input()
    pepfile = self.pepfile
    self.in_peptides = reader.generate_tsv_peptides(pepfile)
    header = reader.get_tsv_header(pepfile)
    self.get_column_header_for_number(['proteincol'], header)
def initialize_input(self):
    """Run the parent initialization, then resolve the protein column in
    the header of ``self.psmfile`` and set ``self.in_peptides`` from that
    file."""
    super().initialize_input()
    psmfile = self.psmfile
    header = reader.get_tsv_header(psmfile)
    self.get_column_header_for_number(['proteincol'], header)
    self.in_peptides = reader.generate_tsv_peptides(psmfile)
def prepare(self):
    """Read the header of the first input file into ``self.oldheader``."""
    # do not read PSMs, multiple files passed and they will be checked
    # if headers matched
    first_infile = self.fn[0]
    self.first_infile = first_infile
    self.oldheader = tsvreader.get_tsv_header(first_infile)
def initialize_input(self):
    """Set the target file and read the target and decoy headers.

    The target header is read twice, so ``self.oldheader`` and
    ``self.targetheader`` come from separate reads.
    """
    target = self.fn
    self.target = target
    self.oldheader = reader.get_tsv_header(target)
    self.targetheader = reader.get_tsv_header(target)
    self.decoyheader = reader.get_tsv_header(self.decoyfn)
def initialize_input(self):
    """Read the header of ``self.fn`` and look up the score and q-value
    columns in it by their patterns."""
    header = tsvreader.get_tsv_header(self.fn)
    self.oldheader = header
    self.scorecol = tsvreader.get_cols_in_file(
        self.scorecolpattern, header, True)
    self.qvalcol = tsvreader.get_cols_in_file(
        self.fdrcolpattern, header, True)
def initialize_input(self):
    """Read the header of ``self.fn`` into ``self.pepheader`` and set
    ``self.in_psms`` from ``reader.generate_tsv_psms``."""
    infile = self.fn
    self.pepheader = reader.get_tsv_header(infile)
    self.in_psms = reader.generate_tsv_psms(infile, self.pepheader)