Ejemplo n.º 1
0
    def _parse_metlin_file(self, f_path, kegg_mass, inchi):
        tree = ET.parse(f_path)
        root = tree.getroot()

        eles = root.findall("./ExperimentInformations/Comment")
        for ele in eles:
            if ele.get('Id')=='kegg':
                kegg_id = ele.get('Value')
            if ele.get('Id') == 'Metlin-ID':
                metlin_id = ele.get('Value')
            if ele.get("Id") == 'cas':
                cas = ele.get('Value')

        eles = root.findall("./ExperimentInformations")
        for ele in eles: # should have only one element
            mass_diff = float(ele.attrib['ModificationMass'])
            c_name = ele.attrib['CompoundName']
            c_formula = ele.attrib['MolecularFormula']

        spectra = []; spectra_ms1 = []
        eles = root.findall("./Spectra/Spectrum")
        for ele in eles:
            if ele.get("MSLevel") == "1":
                spectra_ms1.append(ele)
            if ele.get("MSLevel") == "2":
                spectra.append(ele)
        if kegg_id not in kegg_mass:
            print "Ignore %s:kegg_mass doesn't have %s" % (f_path,kegg_id)
            return []

        if kegg_id not in inchi:
            print "Ignore %s:kegg_inchi doesn't have %s" % (f_path,kegg_id)
            return []
        
        mass = kegg_mass[kegg_id]
        inchi = inchi[kegg_id]
        spectra_list = []
        for spec in spectra:
            ce = int(spec.attrib['CollisionEnergy'])
            spectrum = Spectrum()
            spectrum.f_name = f_path
            spectrum.mass = float(mass)
            spectrum.precursor = mass + mass_diff
            spectrum.mode = "POSITIVE"
            spectrum.inchi = inchi
            spectrum.cas = cas
            spectrum.pubchem_sid = "NULL"
            spectrum.pubchem_cid = "NULL"
            spectrum.kegg_id = kegg_id
            spectrum.metlin_id = metlin_id
            spectrum.ce = ce
            peaks = spec.findall("Peak")
            _peaks = []
            for peak in peaks:
                _mass = float(peak.get("Mass")); _inten = float(peak.get("Intensity"))
                if _inten > 1:
                    _peaks.append((_mass,_inten/100))
            spectrum.peaks = _peaks
            spectra_list.append(spectrum)
        return spectra_list
Ejemplo n.º 2
0
    def _parse_ms_file(self, f_path):
        """Parse a plain-text MS/MS file into a Spectrum.

        The precursor is read from the first ``parentmass`` entry, and peaks
        are read from the lines that follow each line containing
        ``collision`` up to the next blank line.

        Args:
            f_path: path of the file to parse.

        Returns:
            A Spectrum with ``f_name``, ``metlin_id``, ``precursor``,
            ``mass`` and ``peaks`` (list of ``(mass, intensity)`` tuples) set.

        Raises:
            Exception: if the file contains no ``parentmass`` value.
        """
        # The context manager closes the file even if reading fails.
        with open(f_path) as handle:
            data = handle.read()

        spectrum = Spectrum(f_path)
        spectrum.f_name = f_path

        # The id is the text between the first "pos" and the first "." of the
        # path.  NOTE(review): this looks at the whole path, so a "." or
        # "pos" inside a directory name would shift the slice -- confirm the
        # callers only pass paths where that cannot happen.
        spectrum.metlin_id = f_path[f_path.find("pos") + 3:f_path.find(".")]

        matches = re.findall(r"parentmass[: ]+([0-9\.]+)", data)
        if not matches:
            raise Exception("ERROR: precursor not set for %s!" % f_path)
        precursor = float(matches[0])
        spectrum.precursor = precursor
        # presumably removes one hydrogen (1.00794) from the precursor of a
        # positive-mode ion -- TODO confirm
        spectrum.mass = precursor - 1.00794

        peaks = []
        in_segment = False
        for raw_line in data.split('\n'):
            # Strip so that "\r" or whitespace-only lines count as blank
            # instead of crashing the peak parser with an IndexError.
            line = raw_line.strip()
            if "collision" in line:
                in_segment = True   # peak rows start on the next line
                continue
            if not line:
                in_segment = False  # a blank line closes the segment
                continue
            if in_segment:
                words = line.split()
                peaks.append((float(words[0]), float(words[1])))
        spectrum.peaks = peaks

        return spectrum
Ejemplo n.º 3
0
    def _parse_ms_file(self, f_path):
        """Read an MS/MS text file and build a Spectrum from it.

        Args:
            f_path: path of the file to parse.

        Returns:
            A Spectrum whose ``f_name``, ``metlin_id``, ``precursor``,
            ``mass`` and ``peaks`` attributes have been filled in.  ``peaks``
            is a list of ``(mass, intensity)`` float tuples.

        Raises:
            Exception: if no ``parentmass`` value is found in the file.
        """
        # read ms/ms file in; ``with`` guarantees the handle is released
        with open(f_path) as ms_file:
            data = ms_file.read()

        # create Spectrum instance
        spectrum = Spectrum(f_path)
        spectrum.f_name = f_path

        # set metlin id: text between the first "pos" and the first "." of
        # the path.  NOTE(review): directory names containing "pos" or "."
        # would change the result -- verify against the callers.
        id_start = f_path.find("pos") + 3
        id_end = f_path.find(".")
        spectrum.metlin_id = f_path[id_start:id_end]

        # set precursor
        parent_masses = re.findall(r"parentmass[: ]+([0-9\.]+)", data)
        if len(parent_masses) == 0:
            raise Exception("ERROR: precursor not set for %s!" % f_path)
        spectrum.precursor = float(parent_masses[0])
        # 1.00794 is presumably the hydrogen mass lost when going from the
        # protonated precursor to the neutral compound -- TODO confirm
        spectrum.mass = spectrum.precursor - 1.00794

        # set peaks and intensity: rows after a "collision" line, up to the
        # next blank line
        collected = []
        reading_peaks = False
        for raw in data.split('\n'):
            text = raw.strip()  # treat "\r" / whitespace-only lines as blank
            if text.find("collision") != -1:
                reading_peaks = True
            elif not text:
                reading_peaks = False
            elif reading_peaks:
                fields = text.split()
                collected.append((float(fields[0]), float(fields[1])))
        spectrum.peaks = collected

        return spectrum
Ejemplo n.º 4
0
    def _parse_massbank_file(self, f_path):
        """Parse a MassBank record file into a Spectrum.

        Args:
            f_path: path of the MassBank record to parse.

        Returns:
            A Spectrum with ``f_name``, ``mass``, ``precursor``, ``mode``,
            ``peaks``, ``inchi`` and ``metlin_id`` set.  ``cas``,
            ``pubchem_sid``, ``pubchem_cid``, ``kegg_id`` and ``ce`` are set
            only when the corresponding field is present in the record.

        Raises:
            Exception: if the exact mass, the precursor (or base peak), the
                ion mode, the peak list or the IUPAC/InChI line is missing.
        """
        print("Parse file: %s" % f_path)
        # The context manager closes the file even if reading fails.
        with open(f_path) as handle:
            data = handle.read()

        spectrum = Spectrum(f_path)
        spectrum.f_name = f_path

        # set mass
        _mass = re.findall(r"CH\$EXACT_MASS[: ]+([0-9\.]+)", data)
        if not _mass:
            raise Exception("ERROR: mass filed error in file %s " % f_path)
        spectrum.mass = float(_mass[0])

        # set precursor, falling back to the base peak when it is absent
        _precursor = re.findall(
            r"MS\$FOCUSED_ION: PRECURSOR_M/Z[: ]+([0-9\.]+)", data)
        if _precursor:
            precursor = float(_precursor[0])
        else:
            _basepeak = re.findall(
                r"MS\$FOCUSED_ION: BASE_PEAK[: ]+([0-9\.]+)", data)
            if not _basepeak:
                raise Exception("ERROR: precursor not set for %s!" % f_path)
            print("WARNING: using base peak as precursor for %s!" % f_path)
            precursor = float(_basepeak[0])
        spectrum.precursor = precursor

        # set ion mode, falling back to a bare MODE field
        _mode = re.findall("ION_MODE ([A-Z]+)", data)
        if not _mode:
            _mode = re.findall("MODE ([A-Z]+)", data)
            if not _mode:
                raise Exception("ERROR: mode not set for %s!" % f_path)
            print("WARNING: ion mode is set by MODE for %s!" % f_path)
        spectrum.mode = _mode[0]

        # set peaks: every non-blank line after the PK$PEAK header
        _peaks = []
        ready = False
        for raw_line in data.split("\n"):
            line = raw_line.strip()  # also discards "\r" from CRLF files
            if not line:
                continue
            if "PK$PEAK" in line:
                ready = True
                continue
            if ready:
                # "//" terminates a MassBank record; it is not a peak row.
                if line.startswith("//"):
                    break
                if "N/A" in line:
                    raise Exception("ERROR: no peaks in %s" % f_path)
                words = line.split()
                _peaks.append((float(words[0]), float(words[1])))
        spectrum.peaks = _peaks

        # set inchi
        _inchi = re.findall("IUPAC: (.+)", data)
        if not _inchi:
            raise Exception("Error: no inchi for %s!" % f_path)
        inchi = _inchi[0]
        if 'unknown' in inchi:
            # An unknown InChI is reported but still stored.
            print("%s has no inchi!" % f_path)
        if "InChI=" not in inchi:  # some inchi may not contain the head
            inchi = "InChI=" + inchi
        spectrum.inchi = inchi

        # below are optional fields for Spectrum
        _cas = re.findall(r"CH\$LINK: CAS[: ]+([0-9\-]+)", data)
        if _cas:
            spectrum.cas = _cas[0]

        _metlin = re.findall(r"CH\$LINK: METLIN[: ]+([0-9]+)", data)
        spectrum.metlin_id = _metlin[0] if _metlin else 'NULL'

        _sid = re.findall(r"PUBCHEM SID[: ]+(\w+)", data)
        if not _sid:
            # Fall back to a bare numeric PUBCHEM entry.
            _sid = re.findall("PUBCHEM[: ]+([0-9]+)", data)
        if _sid:
            spectrum.pubchem_sid = _sid[0]

        _cid = re.findall(r"PUBCHEM CID[: ]+(\w+)", data)
        if _cid:
            spectrum.pubchem_cid = _cid[0]

        _kegg_id = re.findall(r"LINK: KEGG (\w+)", data)
        if _kegg_id:
            spectrum.kegg_id = _kegg_id[0]

        _ce = re.findall(r"COLLISION_ENERGY (\w+)", data)
        if _ce:
            # Only a plain integer energy (optionally suffixed "eV") is kept.
            ce = _ce[0].replace("eV", "")
            if ce.isdigit():
                spectrum.ce = int(ce)
        return spectrum
Ejemplo n.º 5
0
    def _parse_metlin_file(self, f_path, kegg_mass, inchi):
        tree = ET.parse(f_path)
        root = tree.getroot()

        eles = root.findall("./ExperimentInformations/Comment")
        for ele in eles:
            if ele.get('Id') == 'kegg':
                kegg_id = ele.get('Value')
            if ele.get('Id') == 'Metlin-ID':
                metlin_id = ele.get('Value')
            if ele.get("Id") == 'cas':
                cas = ele.get('Value')

        eles = root.findall("./ExperimentInformations")
        for ele in eles:  # should have only one element
            mass_diff = float(ele.attrib['ModificationMass'])
            c_name = ele.attrib['CompoundName']
            c_formula = ele.attrib['MolecularFormula']

        spectra = []
        spectra_ms1 = []
        eles = root.findall("./Spectra/Spectrum")
        for ele in eles:
            if ele.get("MSLevel") == "1":
                spectra_ms1.append(ele)
            if ele.get("MSLevel") == "2":
                spectra.append(ele)
        if kegg_id not in kegg_mass:
            print "Ignore %s:kegg_mass doesn't have %s" % (f_path, kegg_id)
            return []

        if kegg_id not in inchi:
            print "Ignore %s:kegg_inchi doesn't have %s" % (f_path, kegg_id)
            return []

        mass = kegg_mass[kegg_id]
        inchi = inchi[kegg_id]
        spectra_list = []
        for spec in spectra:
            ce = int(spec.attrib['CollisionEnergy'])
            spectrum = Spectrum()
            spectrum.f_name = f_path
            spectrum.mass = float(mass)
            spectrum.precursor = mass + mass_diff
            spectrum.mode = "POSITIVE"
            spectrum.inchi = inchi
            spectrum.cas = cas
            spectrum.pubchem_sid = "NULL"
            spectrum.pubchem_cid = "NULL"
            spectrum.kegg_id = kegg_id
            spectrum.metlin_id = metlin_id
            spectrum.ce = ce
            peaks = spec.findall("Peak")
            _peaks = []
            for peak in peaks:
                _mass = float(peak.get("Mass"))
                _inten = float(peak.get("Intensity"))
                if _inten > 1:
                    _peaks.append((_mass, _inten / 100))
            spectrum.peaks = _peaks
            spectra_list.append(spectrum)
        return spectra_list