Example #1
0
    def loadFileIdXML(self, file_path):
        """Read an idXML file and cache its identifications by retention time.

        For every peptide hit an entry is written to ``self.scanIDDict`` under
        the retention time of its identification, rounded to three decimals.
        The entry holds the precursor m/z ('m/z'), the trimmed sequence string
        ('PepSeq') and a mapping from peak-annotation label to
        ``[m/z, charge]`` ('PepIons').  All hits of one identification share
        the same key, so a later hit replaces an earlier one.
        ``self.saveIdData()`` is called once the whole file has been read.
        """
        protein_ids, peptide_ids = [], []
        pyopenms.IdXMLFile().load(file_path, protein_ids, peptide_ids)

        for identification in peptide_ids:
            precursor_mz = identification.getMZ()
            retention_time = identification.getRT()

            for hit in identification.getHits():
                sequence = str(hit.getSequence().toString())
                # NOTE(review): the fixed offsets presumably strip the
                # "b'...'" wrapper of a stringified bytes object (plus one
                # more leading character when a '.' is present) -- confirm
                # against the pyopenms version in use.
                first = 3 if "." in sequence else 2
                sequence = sequence[first:-1]

                # A repeated annotation label keeps the last peak seen.
                fragment_ions = {
                    anno.annotation: [anno.mz, anno.charge]
                    for anno in hit.getPeakAnnotations()
                }

                self.scanIDDict[round(retention_time, 3)] = {
                    'm/z': precursor_mz,
                    'PepSeq': sequence,
                    'PepIons': fragment_ions,
                }

        self.saveIdData()
Example #2
0
def _read_psms_idxml(filename: str) -> pd.DataFrame:
    """
    Read idXML spectrum identifications.

    Only the first hit of every PSM is used. When several PSMs refer to the
    same spectrum, only the first one encountered is kept.

    Parameters
    ----------
    filename : str
        The idXML file name.

    Returns
    -------
    pd.DataFrame
        A Pandas DataFrame with as rows the PSMs and as columns "sequence" and
        "score", indexed by their spectrum reference in the form of
        {filename}:scan:{scan}, where {filename} is the base name (without
        extension) of the first 'spectra_data' entry of the first protein
        identification run.

    Raises
    ------
    ValueError
        If a spectrum reference contains no 'scan=' entry or the text after
        it is not an integer.
    """
    protein_ids, peptide_ids = [], []
    scans, sequences, scores = [], [], []
    pyopenms.IdXMLFile().load(filename, protein_ids, peptide_ids)
    # NOTE(review): the .decode() calls assume a pyopenms build that returns
    # bytes for meta values and sequences -- confirm for the version in use.
    peak_filename = os.path.splitext(os.path.basename(
        protein_ids[0].getMetaValue('spectra_data')[0].decode()))[0]
    scan_marker = 'scan='
    for psm in tqdm.tqdm(peptide_ids, desc='PSMs read', unit='PSMs'):
        spectrum_index = psm.getMetaValue('spectrum_reference').decode()
        marker_pos = spectrum_index.find(scan_marker)
        if marker_pos == -1:
            # str.find() reports a miss as -1; slicing from -1 + len(marker)
            # would silently parse an unrelated part of the reference.
            raise ValueError(
                "spectrum reference {!r} contains no {!r} entry".format(
                    spectrum_index, scan_marker))
        scans.append(int(spectrum_index[marker_pos + len(scan_marker):]))
        top_hit = psm.getHits()[0]
        sequences.append(top_hit.getSequence().toString().decode())
        scores.append(top_hit.getScore())
    psm_table = pd.DataFrame({'filename': peak_filename, 'scan': scans,
                              'sequence': sequences, 'score': scores})
    psm_table['spectra_ref'] = (psm_table['filename'] + ':scan:' +
                                psm_table['scan'].astype(str))
    return (psm_table[['spectra_ref', 'sequence', 'score']]
            .drop_duplicates('spectra_ref')
            .set_index('spectra_ref'))
Example #3
0
    def test_readfile_content(self):
        """The fixture must hold one protein run and three peptide IDs."""
        proteins, peptides = [], []
        pyopenms.IdXMLFile().load(self.filename, proteins, peptides)

        self.assertEqual(len(proteins), 1)
        self.assertEqual(len(peptides), 3)
Example #4
0
    def test_readfile_content(self):
        """The fixture must hold one protein run and three peptide IDs."""
        proteins, peptides = [], []
        pyopenms.IdXMLFile().load(self.filename, proteins, peptides)

        assert len(proteins) == 1
        assert len(peptides) == 3
Example #5
0
def id_mapper(in_file, id_file, out_file, params, use_centroid_rt,
              use_centroid_mz, use_subelements):
    """Annotate a quantitation file with the identifications of an idXML file.

    The file type reported by ``FileHandler.getType(in_file)`` selects the
    handling:

    * consensusXML -- the consensus map is annotated (``use_subelements`` is
      passed on) and stored to ``out_file``.
    * featureXML -- the feature map is annotated (``use_centroid_rt`` and
      ``use_centroid_mz`` are passed on) and stored to ``out_file``.
    * mzQuantML -- every consensus map of the quantification is annotated
      (``use_subelements`` is passed on), the maps are set back and the
      quantification is stored to ``out_file``.

    ``params`` configures the ``IDMapper`` and is also handed to
    ``addDataProcessing`` for every annotated map.

    Raises ``Exception`` when ``in_file`` has any other type; the
    identification file has already been loaded at that point.
    """
    in_type = pms.FileHandler.getType(in_file)

    protein_ids, peptide_ids = [], []
    pms.IdXMLFile().load(id_file, protein_ids, peptide_ids)

    mapper = pms.IDMapper()
    mapper.setParameters(params)

    def record_processing(annotated_map):
        # Same processing entry for every map type.
        addDataProcessing(
            annotated_map, params,
            pms.DataProcessing.ProcessingAction.IDENTIFICATION_MAPPING)

    if in_type == pms.Type.CONSENSUSXML:
        handler = pms.ConsensusXMLFile()
        consensus_map = pms.ConsensusMap()
        handler.load(in_file, consensus_map)
        mapper.annotate(consensus_map, peptide_ids, protein_ids,
                        use_subelements)
        record_processing(consensus_map)
        handler.store(out_file, consensus_map)
        return

    if in_type == pms.Type.FEATUREXML:
        handler = pms.FeatureXMLFile()
        feature_map = pms.FeatureMap()
        handler.load(in_file, feature_map)
        mapper.annotate(feature_map, peptide_ids, protein_ids,
                        use_centroid_rt, use_centroid_mz)
        record_processing(feature_map)
        handler.store(out_file, feature_map)
        return

    if in_type == pms.Type.MZQ:
        handler = pms.MzQuantMLFile()
        quantifications = pms.MSQuantifications()
        handler.load(in_file, quantifications)
        consensus_maps = quantifications.getConsensusMaps()
        for consensus_map in consensus_maps:
            mapper.annotate(consensus_map, peptide_ids, protein_ids,
                            use_subelements)
            record_processing(consensus_map)
        quantifications.setConsensusMaps(consensus_maps)
        handler.store(out_file, quantifications)
        return

    raise Exception("invalid input file format")
Example #6
0
def full(ctx, filename, mzid=None, idxml=None):
    """Calculate all possible metrics for these files. These data sources will be included in set metrics."""
    # The docstring above is left untouched: it is presumably shown as the
    # command's help text -- confirm against the decorators of this function.
    #
    # ctx      -- context object; only ctx.get_help() is used.
    # filename -- mzML file to load and compute the basic metrics from.
    # mzid     -- mzIdentML identification file (exclusive with idxml).
    # idxml    -- idXML identification file (exclusive with mzid).
    #
    # Exactly one identification file is required; otherwise a warning and
    # the help text are emitted and nothing is computed.  On success the
    # metrics are appended to the module-level ``rqs`` and ``finale()`` runs.

    # Validate the arguments before loading the mzML file, so that a wrong
    # invocation fails without reading any data.
    if idxml and mzid:
        logging.warning(
            "Sorry, you can only give one id file. Please choose one.")
        click.echo(ctx.get_help())
        return
    if not idxml and not mzid:
        logging.warning("Sorry, you must give one id file in this mode.")
        click.echo(ctx.get_help())
        return

    exp = oms.MSExperiment()
    oms.MzMLFile().load(click.format_filename(filename), exp)
    rq = basicqc.getBasicQuality(exp)

    # Pick up the MS2 count from the basic metrics (the last match wins).
    ms2num = 0
    for metric in rq.qualityMetrics:
        if metric.name == "Number of MS2 spectra":
            ms2num = metric.value

    if ms2num < 1:
        # Adjacent literals instead of backslash continuations, which put the
        # source indentation into the logged message.
        logging.warning(
            "We seem to have found no MS2 spectra which is unlikely to be "
            "true since you have also given some identifications. "
            "We continue with symbolic value of 1 for the number of MS2 "
            "spectra, however this means some metrics will invariably be "
            "incorrect! Please make sure, we have the right inputs.")
        ms2num = 1

    # Exactly one of mzid / idxml is set at this point.
    if mzid:
        oms_id = oms.MzIdentMLFile()
        idf = mzid
    else:
        oms_id = oms.IdXMLFile()
        idf = idxml

    pros = list()
    peps = list()
    oms_id.load(click.format_filename(idf), pros, peps)
    rq.qualityMetrics.extend(idqc.getIDQuality(exp, pros, peps, ms2num))
    rqs.append(rq)

    finale()
Example #7
0
    def parse_idxml(self):
        """Turn the first hit of every peptide identification into a table row.

        Loads ``self.idxml_file`` and returns a pandas DataFrame with one row
        per peptide identification.  Only the first hit of an identification
        is read.  Each row holds the run id (``self.base_name``), the scan
        number taken from the 'spectrum_reference' meta value, rank, charge,
        retention time, the UniMod and unmodified sequence strings, the
        comma-joined protein accessions and their count, a decoy flag, the
        two Comet score meta values and the 'q_value' / 'pep' columns.
        Fields that are not read from the file are filled with '-' (or 0.0
        for 'massdiff').
        """
        proteins, peptide_ids = [], []
        po.IdXMLFile().load(self.idxml_file, proteins, peptide_ids)

        rows = []
        for pep_id in peptide_ids:
            top_hit = pep_id.getHits()[0]

            # NOTE(review): 'q_value' and 'pep' are read from the same
            # accession (MS:1001491); this looks like a copy/paste slip --
            # confirm which accession carries the posterior error probability.
            scores = {
                # search engine scores
                "var_MS:1002252_Comet:XCorr": float(top_hit.getMetaValue('MS:1002252')),
                "var_MS:1002253_Comet:DeltCn": float(top_hit.getMetaValue('MS:1002253')),
                # percolator probability
                "q_value": float(top_hit.getMetaValue('MS:1001491')),
                "pep": float(top_hit.getMetaValue('MS:1001491')),
            }

            spectrum_ref = str(pep_id.getMetaValue("spectrum_reference"))
            sequence = top_hit.getSequence()
            accessions = [evidence.getProteinAccession()
                          for evidence in top_hit.getPeptideEvidences()]
            target_decoy = top_hit.getMetaValue('target_decoy').decode("utf-8")

            row = {
                'run_id': self.base_name,
                # Text after the last 'scan=', minus surrounding quotes.
                'scan_id': int(spectrum_ref.split('scan=')[-1].strip("'")),
                'hit_rank': int(top_hit.getRank()),
                'massdiff': float(0),
                'precursor_charge': int(top_hit.getCharge()),
                'retention_time': float(pep_id.getRT()),
                'modified_peptide': sequence.toUniModString().decode("utf-8"),
                'peptide_sequence': sequence.toUnmodifiedString().decode("utf-8"),
                'modifications': '-',
                'nterm_modification': '-',
                'cterm_modification': '-',
                'protein_id': ','.join([acc.decode("utf-8") for acc in accessions]),
                'gene_id': '-',
                'num_tot_proteins': len(accessions),
                'decoy': target_decoy == 'decoy',
            }
            # Score columns come after the fixed columns.
            row.update(scores)
            rows.append(row)

        return pd.DataFrame(rows)
Example #8
0
 def test_readfile(self):
     """Smoke test: loading the idXML fixture must not raise."""
     proteins, peptides = [], []
     pyopenms.IdXMLFile().load(self.filename, proteins, peptides)
# Convert the PSM section of an mzTab file (args.mztab) into an idXML file
# (output_path) with one PeptideIdentification, holding a single rank-1 hit,
# per PSM row.

# Scan the lines in front of the PSM header ('PSH') line: count them, so that
# pandas can be pointed at the header row, and pick up the run location.
skiplines = 0
run_name = None
with open(args.mztab) as f_in:
    for line in f_in:
        if line.split('\t', 1)[0] == 'PSH':
            break
        if 'ms_run[1]-location' in line:
            # The value is usually the last field of the line; strip the line
            # ending so that it does not end up in the identifier.
            run_name = line.split('\t')[2].strip()
        skiplines += 1
    else:
        raise ValueError(
            "no PSH header line found in {!r}".format(args.mztab))

if run_name is None:
    raise ValueError(
        "no 'ms_run[1]-location' entry found in {!r}".format(args.mztab))

psms = pd.read_csv(args.mztab, sep='\t', header=skiplines, index_col='PSM_ID')

peptide_ids = []
for _, psm in psms.iterrows():
    peptide_id = pyms.PeptideIdentification()
    peptide_id.setRT(psm['retention_time'])
    peptide_id.setMZ(psm['exp_mass_to_charge'])
    # The score of each hit is taken from 'search_engine_score[2]' and
    # declared as a q-value, i.e. lower is better.
    peptide_id.setScoreType('q-value')
    peptide_id.setHigherScoreBetter(False)
    peptide_id.setIdentifier(run_name)
    peptide_hit = pyms.PeptideHit()
    peptide_hit.setScore(psm['search_engine_score[2]'])
    peptide_hit.setRank(1)
    peptide_hit.setCharge(psm['charge'])
    peptide_hit.setSequence(pyms.AASequence.fromString(psm['sequence']))
    peptide_id.setHits([peptide_hit])
    peptide_ids.append(peptide_id)

# A protein identification run with the same identifier as the peptide
# identifications is stored alongside them.
protein_id = pyms.ProteinIdentification()
protein_id.setIdentifier(run_name)
pyms.IdXMLFile().store(output_path, [protein_id], peptide_ids)
def __main__():
    """Export the peptide hits of an idXML file as a table.

    Command line options:
      -in    input idXML file (required)
      -out   output file (required); with -qcML an existing qcML file
      -doi   derive organ / replicate / patient from the input file name
      -qcML  attach the table to the qcML file given by -out instead of
             writing a tab-separated file

    One row is produced per peptide hit with the columns 'file', 'sequence',
    'modified', 'rank', 'spectrum', 'spectrum_reference', 'PEP', 'SE',
    'binder' and 'target', plus 'SEscore', 'qvalue', 'allele', 'organ',
    'replicate' and 'patient' when the respective information is available.

    Exits with status 1 when called without arguments or when -qcML is given
    and the qcML file cannot be loaded.
    """
    parser = argparse.ArgumentParser()
    # An explicit version action replaces the ``version=`` constructor
    # keyword, which only the Python 2 argparse accepts; the resulting
    # -v/--version option is the same.
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version=VERSION)
    parser.add_argument('-in',
                        dest="inf",
                        help='<Required> full path to the input idXML',
                        required=True)
    parser.add_argument('-out',
                        dest="out",
                        help="<Required> full path to the csv to be written",
                        required=True)
    parser.add_argument(
        '-doi',
        '--dept_of_immuno-filenames',
        dest='doi',
        action='store_true',
        help=
        "If filename conforms with the filename scheme of the dept. of immunology... (in doubt, it probably doesn'T)"
    )
    parser.set_defaults(doi=False)
    parser.add_argument(
        '-qcML',
        dest="qcml",
        action='store_true',
        help=
        "If instead of writing a csv embedding in a qcml file - prerequisite: -out must be a existing qcml file"
    )
    parser.set_defaults(qcml=False)

    # Show the full help when called without arguments.  This has to happen
    # before parse_args(), which exits by itself when a required option is
    # missing.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    options = parser.parse_args()

    # Both paths are needed.
    if not (options.inf and options.out):
        parser.print_help()
        sys.exit(1)

    pros = list()
    peps = list()
    f = oms.IdXMLFile()
    f.load(options.inf, pros, peps)

    if options.qcml:
        try:
            qf = oms.QcMLFile()
            qf.load(oms.String(options.out))
        except Exception:
            # Not a bare except: SystemExit / KeyboardInterrupt must pass.
            print("no usable qcml file given")
            sys.exit(1)

    # Search engine of the first identification run; "NA" if there is none.
    SE = pros[0].getSearchEngine() if pros else "NA"
    # Meta value under which that engine's score is stored on a hit.
    sesmv = "NA"
    if SE == 'MS-GF+':
        sesmv = 'MS:1002053'
    elif SE == 'XTandem':
        sesmv = 'E-Value'
    elif SE == 'Comet':
        sesmv = 'MS:1002257'
    elif SE == 'Mascot':
        sesmv = 'EValue'
    if sesmv == "NA":
        # Not fatal: the rows are written without an 'SEscore' value.
        print("no adequate search engine score found")

    organ, repli, patient = None, None, None
    if options.doi:
        name_parts = path.basename(options.inf).split('_')
        # Fields 2, 3 and 6 of the '_'-separated file name are needed; a
        # shorter name falls through to the "-doi off" fallback below.
        if len(name_parts) > 6:
            organ = name_parts[3]
            repli = name_parts[6].split('#')[-1]
            patient = name_parts[2]
    if not organ or not repli or not patient:
        options.doi = False

    sepe = list()
    for pep in peps:
        pep.sort()
        hits = pep.getHits()
        st = pep.getScoreType()
        if st != 'Posterior Error Probability':
            print("Warning, no PEP as score type found!")
        if pep.metaValueExists('spectrum_reference'):
            sn_ref = pep.getMetaValue('spectrum_reference')
            sn = sn_ref.split('=')[-1]
        else:
            sn = '?'
            sn_ref = '?'
        for i, h in enumerate(hits):
            row = dict()
            row['file'] = path.basename(options.inf)
            row['sequence'] = h.getSequence().toUnmodifiedString()
            row['modified'] = h.getSequence().isModified()
            row['rank'] = i + 1
            row['spectrum'] = sn
            # Matches spectrum_native_id of a featureXML and
            # spectrum_reference of mapped PeptideIdentifications.
            row['spectrum_reference'] = sn_ref
            if st == 'Posterior Error Probability':
                row['PEP'] = h.getScore()
            else:
                row['PEP'] = "NA"
            row['SE'] = SE
            if h.metaValueExists(sesmv):
                row['SEscore'] = h.getMetaValue(sesmv)
            if st == 'q-value':
                row['qvalue'] = h.getScore()
            elif h.metaValueExists('q-value_score'):
                row['qvalue'] = h.getMetaValue('q-value_score')
            if h.metaValueExists('binder'):
                if h.metaValueExists('weak'):
                    row['binder'] = 'weak'
                elif h.metaValueExists('strong'):
                    row['binder'] = 'strong'
                else:
                    row['binder'] = 'yes'
                row['allele'] = h.getMetaValue("binder")
            else:
                row['binder'] = 'no'
            # NOTE(review): assumes every hit carries a 'target_decoy' meta
            # value -- confirm for inputs without decoy annotation.
            if "decoy" not in h.getMetaValue("target_decoy"):
                row['target'] = 'target'
            else:
                row['target'] = 'decoy'
            if options.doi:
                row['organ'] = organ
                row['replicate'] = repli
                row['patient'] = patient
            sepe.append(row)

    outframe = pd.DataFrame(sepe)

    if options.qcml:
        idxa = oms.Attachment()
        idxa.name = "generic table"
        idxa.cvRef = "qcML"
        idxa.cvAcc = "QC:0000049"
        idxa.qualityRef = "QC:0000025"

        columns = list(outframe.columns)
        idxa.colTypes = columns
        # Cells that were never filled are written as 'nan'.
        idxa.tableRows = [[str(record[col]) for col in columns]
                          for _, record in outframe.iterrows()]

        # The table is attached to the first run of the qcML file.
        ls = list()
        qf.getRunNames(ls)
        qf.addRunAttachment(ls[0], idxa)
        qf.store(oms.String(options.out))

    else:
        outframe.to_csv(options.out, sep='\t', index=False)
def __main__():
    """Annotate the peptide hits of an idXML file with MHC binding predictions.

    Command line options:
      -c          MHC class, "I" or "II" (required)
      -in         input idXML file (required)
      -out        output idXML file (required)
      -allele     allele file with one allele per line, or the literal "in"
                  to deduce the allele file from the input file name
                  (required)
      -dirallele  directory of the allele files, used with "-allele in"

    The unmodified sequences of all hits within the class-specific length
    range (8-12 for class I, 15-25 for class II) and without the residues
    U, B, X and Z are passed to the predictor.  For every sequence whose
    best score gets a category from ``categorize``, the matching hits
    receive a 'binder' meta value (the best-scoring allele) and a meta value
    named after the category (the score).  All hits are written to -out.

    Exits with status 1 when called without arguments, when no allele could
    be read or when the prediction fails; with status 0 when "-allele in" is
    used on an input whose name lacks "_W_"; via ``parser.error`` when the
    MHC class is unknown or no allele file can be deduced.
    """
    parser = argparse.ArgumentParser()
    # An explicit version action replaces the ``version=`` constructor
    # keyword, which only the Python 2 argparse accepts; the resulting
    # -v/--version option is the same.
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version=VERSION)
    parser.add_argument('-c',
                        dest="mhcclass",
                        help='<Required> MHC class',
                        required=True)
    parser.add_argument('-in',
                        dest="inf",
                        help='<Required> full path to the input file',
                        required=True)
    parser.add_argument('-out',
                        dest="out",
                        help="<Required> full path to the output file",
                        required=True)
    parser.add_argument(
        '-allele',
        dest="allele",
        help=
        "<Required> full path to an allele file, if 'in', allele file will be deduced from in file name",
        required=True)
    parser.add_argument(
        '-dirallele',
        dest="dirallele",
        help=
        "for use with '-allele in', describes full base path to the allele files"
    )

    # Show the full help when called without arguments.  This has to happen
    # before parse_args(), which exits by itself when a required option is
    # missing.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    options = parser.parse_args()

    # All three paths are needed.
    if not (options.inf and options.out and options.allele):
        parser.print_help()
        sys.exit(1)

    # The allele file is read line by line here; the original comments say
    # Fred2's FileReader.read_lines is broken.
    if options.allele == "in" and options.dirallele:
        if "_W_" not in options.inf:
            print("No class 1 type run detected.")
            sys.exit(0)
        # The allele file is named after the text following the '-' in the
        # 'BD...' token of the input name (the last such token wins).
        af = None
        for sp in options.inf.split("_"):
            if sp.startswith("BD"):
                af = join(options.dirallele, sp.split("-")[1] + ".allele")
        if af is None:
            parser.error(
                "could not deduce an allele file from the input file name")
    else:
        af = options.allele

    target_alleles_set = set()
    with open(af, 'r') as handle:
        for line in handle:
            allele_name = line.strip().upper()
            if allele_name:  # blank lines carry no allele name
                target_alleles_set.add(Allele(allele_name))

    if not target_alleles_set:
        parser.print_help()
        sys.exit(1)

    if options.mhcclass == "I":
        ttn = EpitopePredictorFactory('netmhcpan', version='3.0')
        lowerBound = 8
        upperBound = 12
    elif options.mhcclass == "II":
        ttn = EpitopePredictorFactory('netmhcIIpan', version='3.1')
        lowerBound = 15
        upperBound = 25
    else:
        # Without a predictor and length bounds nothing below can work.
        parser.error("unsupported MHC class %r (expected 'I' or 'II')" %
                     options.mhcclass)

    pros = list()
    peps = list()
    f = oms.IdXMLFile()
    f.load(options.inf, pros, peps)

    # Unique unmodified sequences worth predicting.  Decoy hits are included.
    pepstr = set()
    for pep in peps:
        for h in pep.getHits():
            unmod = h.getSequence().toUnmodifiedString()
            if lowerBound <= len(unmod) <= upperBound \
                    and not any(aa in unmod for aa in 'UBXZ'):
                pepstr.add(unmod)

    es = [Peptide(x) for x in pepstr]

    try:
        preds_n = ttn.predict(es, alleles=target_alleles_set)
    except Exception as e:
        print("something went wrong with the netMHC prediction %s what: %s" %
              (options.inf, str(e)))
        sys.exit(1)

    # Keep only the best-scoring allele per sequence (bigger is better), and
    # only if categorize() assigns that score a category.
    preds = dict()
    for _, row in preds_n.iterrows():
        score = row.max()
        allele = str(row.idxmax())
        categ = categorize(score)
        seq = row.name[0].tostring()
        if categ:
            preds[seq] = (allele, categ, score)

    for pep in peps:
        nhits = list()
        for h in pep.getHits():
            unmod = h.getSequence().toUnmodifiedString()
            if unmod in preds:
                allele, categ, score = preds[unmod]
                h.setMetaValue('binder', allele)
                h.setMetaValue(str(categ), score)
            # Every hit is kept, annotated or not.
            nhits.append(h)
        pep.setHits(nhits)

    f.store(options.out, pros, peps)