def __init__(self, mapped_featureXML_1, mapped_featureXML_2, trafoXML_file):
    """
    Link every retention time pair of a trafoXML file to the ids of the features it belongs to.

    @type mapped_featureXML_1: pyMS.parseFeatureXML.Reader
    @param mapped_featureXML_1: An instance of L{parseFeatureXML.Reader}. This is one of the files mapped by OpenMS's MapAlignerPoseClustering.
    @type mapped_featureXML_2: pyMS.parseFeatureXML.Reader
    @param mapped_featureXML_2: An instance of L{parseFeatureXML.Reader}. This is one of the files mapped by OpenMS's MapAlignerPoseClustering.
    @type trafoXML_file: string
    @param trafoXML_file: path to the trafoXML file that corresponds to featureXML instance 1 and featureXML instance 2. Has to be the linear file, because the identity file doesn't contain transformations
    @raise IOError: trafoXML is an identity file (need a linear file)
    @raise RuntimeError: FeatureXML and trafo_xml didn't match up (a 'from' or 'to' retention time of a Pair is not a feature retention time in either featureXML file)
    """
    # retentionDict_1 / retentionDict_2 hold the feature info of featureXML file 1 / file 2 with the
    # retention time as key. This costs more memory than searching the feature list for every pair,
    # but a retention time can be resolved with a single dictionary lookup.
    # The key is str() of the retention time as the reader returns it (it is not rounded), so features
    # that share a retention time overwrite each other.
    self.retentionDict_1 = {}
    self.retentionDict_2 = {}
    for reader, retentionDict in ((mapped_featureXML_1, self.retentionDict_1),
                                  (mapped_featureXML_2, self.retentionDict_2)):
        # the yielded element itself is not needed: the values are read through the reader
        for _feature in reader.getSimpleFeatureInfo():
            retentionDict[str(reader['retention time'])] = {
                'intensity': reader.getElementInfo()['intensity'],
                'feature_id': reader['id']}

    # list of dictionaries, one per Pair, with the 'from' and 'to' retention times and the ids of
    # the features these retention times belong to
    self.trafoXML_list = []
    for event, element in cElementTree.iterparse(trafoXML_file):
        if element.tag == 'Transformation':
            if elementFunctions.getItems(element)['name'] == 'identity':
                raise IOError(trafoXML_file + ' is a trafoXML identity file (see the Transformation node). There is no information in the identity file. Use the \'linear\' file as input')
        if element.tag == 'Pair':
            pairItems = elementFunctions.getItems(element)
            # resolved again for every Pair, so that an unmatched Pair can never silently reuse the
            # feature ids of an earlier Pair
            featureIDs = {}
            for side in ('from', 'to'):
                retentionTime = str(pairItems[side])
                # file 2 is consulted first so that it wins when both files contain the retention time
                for retentionDict in (self.retentionDict_2, self.retentionDict_1):
                    if retentionTime in retentionDict:
                        featureIDs[side] = retentionDict[retentionTime]['feature_id']
                        break
                else:
                    raise RuntimeError('Something wrong with the input files. Probably the featureXML files didn\'t match the trafoXML file. Check your input. Actual error raised was: '
                                       + 'no feature found for the \'' + side + '\' retention time ' + retentionTime + ' of a Pair')
            self.trafoXML_list.append({'from_featureID': featureIDs['from'],
                                       'to_featureID': featureIDs['to'],
                                       'from': float(pairItems['from']),
                                       'to': float(pairItems['to'])})
def getMapInfo(self):
    """
    Retrieve the details of every 'map' element and store them in self.maplist.

    The details of a map are stored at the list index equal to the integer value of its 'id'
    attribute. When ids are skipped, the gap is filled with empty dictionaries.

    @rtype: list
    @return: self.maplist, with for every map a dict with the keys 'name', 'unique_id', 'label' and 'size'
    """
    for mapElement in self.getAllElements():
        if mapElement.tag != 'map':
            continue
        attributes = elementFunctions.getItems(mapElement)
        mapIndex = int(attributes['id'])
        details = {
            'name': attributes['name'],
            'unique_id': attributes['unique_id'],
            'label': attributes['label'],
            'size': int(attributes['size']),
        }
        # pad with a separate empty dict per missing position until index mapIndex exists
        for _ in range(mapIndex + 1 - len(self.maplist)):
            self.maplist.append({})
        self.maplist[mapIndex] = details
    return self.maplist
def test_getItems(self):
    """
    Test that elementFunctions.getItems returns the attributes of an element as a dictionary.

    The attributes checked are those of the last element that iterparse yields for the test file.
    """
    expectedItems = {'{http://www.w3.org/2001/XMLSchema-instance}noNamespaceSchemaLocation': 'http://open-ms.sourceforge.net/schemas/FeatureXML_1_4.xsd',
                     'version': '1.4',
                     'id': 'fm_2007447552192692304'}

    # the with statement closes the file, also when parsing fails
    with open(testFolder + 'featurexmlTestFile_1.featureXML') as elementFile:
        # looping through the tree; after the loop itemsDict holds the items of the last element
        for event, element in cElementTree.iterparse(elementFile):
            itemsDict = elementFunctions.getItems(element)

    # assert that the itemsDict dictionary contains the subset of the expectedItems dict
    self.assertDictContainsSubset(expectedItems, itemsDict)
def getAssignedPeptidesMZandRTvalue(self):
    """
    Iterator function that yields all the assigned peptide m/z and retention time value and the accession number of the protein
    they are assigned to. Does not get any additional information on the peptides

    A dict is yielded each time a pep_scan_title element is read. Protein and peptide fields that were not
    read for the current protein / peptide at that point have the value None ('No description' for prot_desc).

    @rtype: dict
    @return: A dict of all the assigned peptides with m/z, RT value and protein description
    @raise RuntimeError: None of the regular expressions for parsing the scan_title was set

    B{Example:}

    Printing all assigned peptide's m/z value, RT value and protein description:

    >>> mascot = Reader('example_mascot_file.xml')    # make a read instance
    >>> for result in mascot.getAssignedPeptidesMZandRTvalue():
    ...    print result
    """
    if (self.scan_re is None and self.file_re is None
            and self.rt_re is None and self.mz_re is None):
        raise RuntimeError('None of the regular expressions was set to get the scan number or m/z and rt value out of the scan title. You can set them using setScanRE, setFileRE, setRtRE, setMzRE')

    # tags that are copied to the result under their own name
    proteinFields = ('prot_score', 'prot_mass', 'prot_matches', 'prot_matches_sig',
                     'prot_sequences', 'prot_sequences_sig')
    peptideFields = ('pep_exp_mz', 'pep_exp_mr', 'pep_exp_z', 'pep_calc_mr', 'pep_delta',
                     'pep_miss', 'pep_score', 'pep_expect', 'pep_res_before', 'pep_seq',
                     'pep_res_after', 'pep_var_mod', 'pep_var_mod_pos', 'pep_num_match',
                     'pep_scan_title')

    for element in self.getAllElements():
        # the tag without its namespace; the protein and peptide information is nested
        # inside hits>hit>protein>peptide
        if element.tag.split('}')[-1] != 'hits':
            continue
        for hit in element:
            for protein in hit:
                proteinAccession = elementFunctions.getItems(protein)['accession']
                # start every protein with empty fields so nothing leaks in from the previous protein
                proteinInfo = dict.fromkeys(proteinFields)
                proteinInfo['prot_desc'] = "No description"
                for protInfo in protein:
                    protInfoTag = protInfo.tag.split('}')[-1]
                    if protInfoTag == 'prot_desc':
                        # keep the default when the description is missing or empty
                        if protInfo.text:
                            proteinInfo['prot_desc'] = protInfo.text
                    elif protInfoTag in proteinFields:
                        proteinInfo[protInfoTag] = protInfo.text
                    elif protInfoTag == 'peptide':
                        # reset once per peptide (not once per child element), so that optional fields
                        # like pep_num_match keep the value that was read for this peptide
                        peptideInfo = dict.fromkeys(peptideFields)
                        for pepInfo in protInfo:
                            pepInfoTag = pepInfo.tag.split('}')[-1]
                            if pepInfoTag not in peptideInfo:
                                continue
                            peptideInfo[pepInfoTag] = pepInfo.text
                            if pepInfoTag == 'pep_scan_title':
                                ## TODO allow separate RE to parse mz/rt/scan number values from the title.
                                titlepar = self._parseTitle(pepInfo.text)
                                result = {'mz': titlepar.get('mz'),
                                          'rt': titlepar.get('rt'),
                                          'protAccession': proteinAccession,
                                          'fileroot': titlepar.get('file'),
                                          'scannumber': titlepar.get('scan')}
                                result.update(proteinInfo)
                                result.update(peptideInfo)
                                yield result
def getSimpleFeatureInfo(self):
    """
    Iterator function that yields all the feature elements in the file given to Reader().
    It saves info from the features in a dict, self.elementInfo, which is used in the L{parseFeatureXML.Reader.__getitem__} retrieval function.
    This function has predefined information like intensity, overallquality, convexhull etc that make for easier browsing, but because of this it does not
    contain all information. If you want to get all information exactly as found in the xml file, use L{parseFeatureXML.Reader.getAllFeatureInfo}.

    The keys saved for every feature are intensity, overallquality, userParam, convexhull, mz, retention time, quality, charge, content and id.
    intensity and overallquality are None when no such element was found.

    @rtype: Element
    @return: Iterator of all the elements in the file where element.tag == 'feature'
    @raise RuntimeError: No features in the file

    B{Example}:

    Printing all the features in a file:

    >>> featureXML = Reader('example_feature_file.featureXML')    # make a reader instance
    >>> allElements = featureXML.getAllElements()    # get all feature elements of the reader instance, you can now iterate over allElements
    >>> features = featureXML.getSimpleFeatureInfo()
    >>> for feature in features:
    ...    print feature
    <Element 'feature' at 0x6184270>
    <Element 'feature' at 0x6184cc0>
    <Element 'feature' at 0x6188630>

    Printing the intensities of all features:

    >>> featureXML = Reader('example_feature_file.featureXML')    # make a reader instance
    >>> allElements = featureXML.getAllElements()    # get all feature elements of the reader instance, you can now iterate over allElements
    >>> features = featureXML.getSimpleFeatureInfo()
    >>> for feature in features:
    ...    print featureXML['intensity']
    6182
    3543
    2134
    """
    # counter for the amount of elements with a feature tag. If it stays 0 at the end of the yielding
    # this function raises a runtime error
    featureCount = 0
    # text of the most recently seen intensity and overallquality elements, None until one is seen
    intensity = None
    overallquality = None
    for element in self.getAllElements():
        if element.tag == 'intensity':
            intensity = element.text
        elif element.tag == 'overallquality':
            overallquality = element.text
        elif element.tag == 'feature':
            featureCount += 1
            # Predefined keys for easy browsing through __getitem__. This is not very generic and only
            # works as long as the featureXML format stays the same
            featureInfo = self.elementInfo[element]
            featureInfo['intensity'] = intensity
            featureInfo['overallquality'] = overallquality
            featureInfo['userParam'] = []
            featureInfo['convexhull'] = []
            featureInfo['mz'] = 0
            featureInfo['retention time'] = 0
            featureInfo['quality'] = []
            featureInfo['charge'] = 0
            featureInfo['content'] = element.text
            featureInfo['id'] = elementFunctions.getItems(element)['id']

            # every feature starts with its own point list, so hull points can never end up in the
            # list of an earlier feature
            convexhullList = []
            for nestedElement in element:
                nestedTag = nestedElement.tag
                if nestedTag == 'position':
                    # dim 0 is the retention time (kept as text), dim 1 is the m/z value
                    dimValue = int(elementFunctions.getItems(nestedElement)['dim'])
                    if dimValue == 0:
                        featureInfo['retention time'] = nestedElement.text
                    elif dimValue == 1:
                        featureInfo['mz'] = float(nestedElement.text)
                    else:
                        raise RuntimeError('Value of dim in getSimpleFeatureInfo should never be other value than 0 or 1')
                elif nestedTag == 'intensity':
                    # the value of the feature's own child wins over the last value seen in the file
                    featureInfo['intensity'] = nestedElement.text
                elif nestedTag == 'overallquality':
                    featureInfo['overallquality'] = nestedElement.text
                elif nestedTag == 'userParam':
                    # all attributes of the userParam, with the attribute name as key
                    nestedItems = elementFunctions.getItems(nestedElement)
                    featureInfo['userParam'].append(
                        dict((key, nestedItems[key]) for key in nestedElement.keys()))
                elif nestedTag == 'quality':
                    # list() because items() is only indexable when it returns a list
                    featureInfo['quality'].append([list(nestedElement.items())[0], nestedElement.text])
                elif nestedTag == 'charge':
                    featureInfo['charge'] = nestedElement.text
                elif nestedTag == 'convexhull':
                    # a convexhull with nr 0 starts a new point list, the hulls with a higher nr
                    # add their points to it (this is for featureFinder version 1.9.0)
                    if int(elementFunctions.getItems(nestedElement)['nr']) == 0:
                        convexhullList = []
                    for pointElement in nestedElement:
                        pointItems = elementFunctions.getItems(pointElement)
                        # one point, with the x and y attributes stored as rt and mz
                        pointDict = {}
                        for pointKey in pointElement.keys():
                            if pointKey == 'x':
                                newKey = 'rt'
                            elif pointKey == 'y':
                                newKey = 'mz'
                            else:
                                raise RuntimeError("This shouldn't happen, pointKey should be either 'x' or 'y', not: " + str(pointKey))
                            pointDict[newKey] = pointItems[pointKey]
                        convexhullList.append(pointDict)
                    featureInfo[nestedTag] = convexhullList

            yield element
            # This runs when the caller asks for the next feature. Everything of the current feature has been
            # saved in self.elementInfo, so the element and its children are cleared to lower the memory usage.
            for nestedElement in element:
                nestedElement.clear()
            element.clear()

    if featureCount == 0:
        raise RuntimeError('There were no features found in self.getAllElements(). Not a valid featureXML file:' + str(self.path))
def getSimpleElementInfo(self):
    """
    Iterator function that yields all the consensusElement elements in the file given to Reader().
    It saves info from the elements in a dict, self.elementInfo, which is used in the L{parseConsensusXML.Reader.__getitem__} retrieval function.
    This function has predefined information (mz, rt, intensity, quality, charge, elements and id) that make for easier browsing, but because of this it does not
    contain all information. If you want to get all information exactly as found in the xml file, use L{parseConsensusXML.Reader.getAllElementInfo}.

    @rtype: Element
    @return: Iterator of all the elements in the file where element.tag == 'consensusElement'
    @raise RuntimeError: No consensusElement elements in the file

    B{Example}:

    Printing all the groupedElements in a file:

    >>> consensusXML = Reader('example_consensus_file.consensusXML')    # make a reader instance
    >>> allElements = consensusXML.getAllElements()    # get all feature elements of the reader instance, you can now iterate over allElements
    >>> elements = consensusXML.getSimpleElementInfo()
    >>> for element in elements:
    ...    print element
    <Element 'consensusElement' at 0x6184270>
    <Element 'consensusElement' at 0x6184cc0>
    <Element 'consensusElement' at 0x6188630>

    Printing the intensities of all elements:

    >>> consensusXML = Reader('example_consensus_file.consensusXML')    # make a reader instance
    >>> allElements = consensusXML.getAllElements()    # get all feature elements of the reader instance, you can now iterate over allElements
    >>> elements = consensusXML.getSimpleElementInfo()
    >>> for element in elements:
    ...    print consensusXML['intensity']
    6182
    3543
    2134
    """
    # counter for the amount of elements with a consensusElement tag. If it stays 0 at the end of the
    # yielding this function raises a runtime error
    elementCount = 0
    for element in self.getAllElements():
        if element.tag != 'consensusElement':
            continue
        elementCount += 1

        # Predefined keys for easy browsing through __getitem__: mz, rt, intensity, quality, charge,
        # elements and id. This only works as long as the consensusElement layout stays the same
        consensusInfo = self.elementInfo[element]
        consensusItems = elementFunctions.getItems(element)
        consensusInfo['mz'] = 0
        consensusInfo['rt'] = 0
        consensusInfo['intensity'] = 0
        consensusInfo['quality'] = consensusItems['quality']
        consensusInfo['charge'] = consensusItems['charge']
        consensusInfo['elements'] = []
        consensusInfo['id'] = consensusItems['id']

        for nestedElement in element:
            if nestedElement.tag == 'centroid':
                # rt, mz and intensity ('it') are attributes of the centroid element
                centroidItems = elementFunctions.getItems(nestedElement)
                consensusInfo['rt'] = float(centroidItems['rt'])
                consensusInfo['mz'] = float(centroidItems['mz'])
                consensusInfo['intensity'] = float(centroidItems['it'])
            if nestedElement.tag == 'groupedElementList':
                for ele in nestedElement:
                    if ele.tag == 'element':
                        # the ids of the grouped features are always stored with the 'f_' prefix
                        groupedID = elementFunctions.getItems(ele)['id']
                        if groupedID[0:2] != 'f_':
                            groupedID = 'f_%s' % groupedID
                        consensusInfo['elements'].append({'id': groupedID})

        yield element
        # This runs when the caller asks for the next element. Everything of the current element has been
        # saved in self.elementInfo, so the element and its children are cleared to lower the memory usage.
        for nestedElement in element:
            nestedElement.clear()
        element.clear()

    if elementCount == 0:
        raise RuntimeError('There were no consensus features found in self.getAllElements(). Not a valid featureXML file:' + str(self.path))
def getFeatureConvexhullCoordinates(featureElement):
    """
    Get the coordinates of the corners of the convexhull of featureElement. Return a dictionary with as key the feature and as value a dictionary with as keys mzMax, mzMin, rtMax and rtMin.
    This is the maximum and minimum retention time and the maximum and minimum m/z ratio of the convexhull. These four points together can be seen as a rectangle, if you see each point as the corner.
    This does not take into account that the feature convexhulls are not perfect rectangles.

    The coordinates are returned as the strings found in the file, but they are compared by their numeric value.
    When the feature has more than one convexhull, the coordinates of the last convexhull are returned.

    @type featureElement: Element
    @param featureElement: A feature element
    @rtype: dictionary
    @return: Dictionary with key the feature and values the coordinates of the 4 corners of the convexhull
    @raises IOError: No convexhulls in the element
    @raises TypeError: featureElement is not of type Element
    @raises KeyError: A convexhull point has attributes, but not both 'x' and 'y'

    B{Example}:

    Print the convexhull coordinates of all the features in a file:

    >>> import parseFeatureXML                                                 # to get the features use parseFeatureXML
    >>> featureXML = parseFeatureXML.Reader('example_feature_file.featureXML')   # make a reader instance
    >>> for feature in featureXML.getFeatures():                    # loop through all the features
    ...    print getFeatureConvexhullCoordinates(feature)            # print the coordinates of all the feature convexhulls
    {<Element 'feature' at 0x136b9a80>: {'mzMax': '338.251376135343', 'rtMin': '5105.9217', 'rtMax': '5111.6874', 'mzMin': '336.124751115092'}}
    {<Element 'feature' at 0x136bd510>: {'mzMax': '430.197574989105', 'rtMin': '4001.7973', 'rtMax': '4017.7105', 'mzMin': '428.070943557216'}}
    {<Element 'feature' at 0x136bde40>: {'mzMax': '339.251376135343', 'rtMin': '5107.9217', 'rtMax': '5112.6874', 'mzMin': '337.124751115092'}}
    """
    # Check on the name of the type, so that every type called Element is accepted and
    # not only the one whose string representation is "<type 'Element'>"
    if type(featureElement).__name__ != 'Element':
        raise TypeError('featureElement in getFeatureConvexhullCoordinates is not of type Element but of type: ' + str(type(featureElement)))

    # the dictionary in which the corner coordinates of the feature will be saved
    featureCoordinate = {}
    # the amount of times the tag convexhull is found
    countConvexhull = 0
    for element in featureElement:
        if element.tag != 'convexhull':
            continue
        # every convexhull gets its own list of retention times (x) and m/z values (y)
        retentionTimeList = []
        mzList = []
        for pt in element:
            ptItems = elementFunctions.getItems(pt)
            if ptItems != {}:
                # syntax of version 1.8.0: the coordinates are the x and y attributes of the point
                try:
                    retentionTime = ptItems['x']
                    mz = ptItems['y']
                except KeyError:
                    sys.stdout.write('Your featureXML file is not in the format of output from version 1.8.0 or 1.7.0 FeatureFinder')
                    raise
                retentionTimeList.append(retentionTime)
                mzList.append(mz)
            else:
                # syntax of version 1.7.0: every coordinate is a child element with a dim attribute
                # (dim 0 is retention time, dim 1 is m/z)
                for convex in pt:
                    dimValue = int(elementFunctions.getItems(convex)['dim'])
                    if dimValue == 0:
                        retentionTimeList.append(convex.text)
                    elif dimValue == 1:
                        mzList.append(convex.text)
                    else:
                        warnings.warn('dim in convexhull hullpoint is not 0 or 1. Value not used', stacklevel=2)

        # key=float compares the coordinates as numbers ('999.5' < '1000.2') while the values that
        # are stored stay the strings from the file
        # NOTE(review): every convexhull overwrites the coordinates of the previous one - confirm
        # that this is intended for features with several convexhulls
        featureCoordinate[featureElement] = {'rtMin': min(retentionTimeList, key=float),
                                             'rtMax': max(retentionTimeList, key=float),
                                             'mzMin': min(mzList, key=float),
                                             'mzMax': max(mzList, key=float)}
        countConvexhull += 1

    if countConvexhull == 0:
        raise IOError('No convexhulls in the element, check your featureXML file')
    return featureCoordinate
def getSimpleFeatureInfo(self):
    """
    Iterator function that yields all the feature elements in the file given to Reader().
    It saves info from the features in a dict, self.elementInfo, which is used in the L{parseFeatureXML.Reader.__getitem__} retrieval function.
    This function has predefined information like intensity, overallquality, convexhull etc that make for easier browsing, but because of this it does not
    contain all information. If you want to get all information exactly as found in the xml file, use L{parseFeatureXML.Reader.getAllFeatureInfo}.

    The keys saved for every feature are intensity, overallquality, userParam, convexhull, mz, retention time, quality, charge, content and id.
    intensity and overallquality are None when no such element was found.

    @rtype: Element
    @return: Iterator of all the elements in the file where element.tag == 'feature'
    @raise RuntimeError: No features in the file

    B{Example}:

    Printing all the features in a file:

    >>> featureXML = Reader('example_feature_file.featureXML')    # make a reader instance
    >>> allElements = featureXML.getAllElements()    # get all feature elements of the reader instance, you can now iterate over allElements
    >>> features = featureXML.getSimpleFeatureInfo()
    >>> for feature in features:
    ...    print feature
    <Element 'feature' at 0x6184270>
    <Element 'feature' at 0x6184cc0>
    <Element 'feature' at 0x6188630>

    Printing the intensities of all features:

    >>> featureXML = Reader('example_feature_file.featureXML')    # make a reader instance
    >>> allElements = featureXML.getAllElements()    # get all feature elements of the reader instance, you can now iterate over allElements
    >>> features = featureXML.getSimpleFeatureInfo()
    >>> for feature in features:
    ...    print featureXML['intensity']
    6182
    3543
    2134
    """
    # counter for the amount of elements with a feature tag. If it stays 0 at the end of the yielding
    # this function raises a runtime error
    featureCount = 0
    # text of the most recently seen intensity and overallquality elements, None until one is seen
    intensity = None
    overallquality = None
    for element in self.getAllElements():
        if element.tag == 'intensity':
            intensity = element.text
        elif element.tag == 'overallquality':
            overallquality = element.text
        elif element.tag == 'feature':
            featureCount += 1
            # Predefined keys for easy browsing through __getitem__. This is not very generic and only
            # works as long as the featureXML format stays the same
            featureInfo = self.elementInfo[element]
            featureInfo['intensity'] = intensity
            featureInfo['overallquality'] = overallquality
            featureInfo['userParam'] = []
            featureInfo['convexhull'] = []
            featureInfo['mz'] = 0
            featureInfo['retention time'] = 0
            featureInfo['quality'] = []
            featureInfo['charge'] = 0
            featureInfo['content'] = element.text
            featureInfo['id'] = elementFunctions.getItems(element)['id']

            # every feature starts with its own point list, so hull points can never end up in the
            # list of an earlier feature
            convexhullList = []
            for nestedElement in element:
                nestedTag = nestedElement.tag
                if nestedTag == 'position':
                    # dim 0 is the retention time (kept as text), dim 1 is the m/z value
                    dimValue = int(elementFunctions.getItems(nestedElement)['dim'])
                    if dimValue == 0:
                        featureInfo['retention time'] = nestedElement.text
                    elif dimValue == 1:
                        featureInfo['mz'] = float(nestedElement.text)
                    else:
                        raise RuntimeError('Value of dim in getSimpleFeatureInfo should never be other value than 0 or 1')
                elif nestedTag == 'intensity':
                    # the value of the feature's own child wins over the last value seen in the file
                    featureInfo['intensity'] = nestedElement.text
                elif nestedTag == 'overallquality':
                    featureInfo['overallquality'] = nestedElement.text
                elif nestedTag == 'userParam':
                    # all attributes of the userParam, with the attribute name as key
                    nestedItems = elementFunctions.getItems(nestedElement)
                    featureInfo['userParam'].append(
                        dict((key, nestedItems[key]) for key in nestedElement.keys()))
                elif nestedTag == 'quality':
                    # list() because items() is only indexable when it returns a list
                    featureInfo['quality'].append([list(nestedElement.items())[0], nestedElement.text])
                elif nestedTag == 'charge':
                    featureInfo['charge'] = nestedElement.text
                elif nestedTag == 'convexhull':
                    # a convexhull with nr 0 starts a new point list, the hulls with a higher nr
                    # add their points to it (this is for featureFinder version 1.9.0)
                    if int(elementFunctions.getItems(nestedElement)['nr']) == 0:
                        convexhullList = []
                    for pointElement in nestedElement:
                        pointItems = elementFunctions.getItems(pointElement)
                        # one point, with the x and y attributes stored as rt and mz
                        pointDict = {}
                        for pointKey in pointElement.keys():
                            if pointKey == 'x':
                                newKey = 'rt'
                            elif pointKey == 'y':
                                newKey = 'mz'
                            else:
                                raise RuntimeError("This shouldn't happen, pointKey should be either 'x' or 'y', not: " + str(pointKey))
                            pointDict[newKey] = pointItems[pointKey]
                        convexhullList.append(pointDict)
                    featureInfo[nestedTag] = convexhullList

            yield element
            # This runs when the caller asks for the next feature. Everything of the current feature has been
            # saved in self.elementInfo, so the element and its children are cleared to lower the memory usage.
            for nestedElement in element:
                nestedElement.clear()
            element.clear()

    if featureCount == 0:
        raise RuntimeError('There were no features found in self.getAllElements(). Not a valid featureXML file:' + str(self.path))
def getAssignedPeptidesMZandRTvalue(self):
    """
    Iterator function that yields all the assigned peptide m/z and retention time value and the accession number of the protein
    they are assigned to. Does not get any additional information on the peptides

    A dict is yielded each time a pep_scan_title element is read. Protein and peptide fields that were not
    read for the current protein / peptide at that point have the value None.

    @rtype: dict
    @return: A dict of all the assigned peptides with m/z, RT value and protein description

    B{Example:}

    Printing all assigned peptide's m/z value, RT value and protein description:

    >>> mascot = Reader('example_mascot_file.xml')    # make a read instance
    >>> for result in mascot.getAssignedPeptidesMZandRTvalue():
    ...    print result
    """
    # tags that are copied to the result under their own name
    proteinFields = ('prot_desc', 'prot_score', 'prot_mass', 'prot_matches', 'prot_matches_sig',
                     'prot_sequences', 'prot_sequences_sig')
    peptideFields = ('pep_exp_mz', 'pep_exp_mr', 'pep_exp_z', 'pep_calc_mr', 'pep_delta',
                     'pep_miss', 'pep_score', 'pep_expect', 'pep_res_before', 'pep_seq',
                     'pep_res_after', 'pep_var_mod', 'pep_var_mod_pos', 'pep_num_match',
                     'pep_scan_title')

    for element in self.getAllElements():
        # the tag without its namespace; the protein and peptide information is nested
        # inside hits>hit>protein>peptide
        if element.tag.split('}')[-1] != 'hits':
            continue
        for hit in element:
            for protein in hit:
                proteinAccession = elementFunctions.getItems(protein)['accession']
                # start every protein with empty fields so nothing leaks in from the previous protein
                proteinInfo = dict.fromkeys(proteinFields)
                for protInfo in protein:
                    protInfoTag = protInfo.tag.split('}')[-1]
                    if protInfoTag in proteinInfo:
                        proteinInfo[protInfoTag] = protInfo.text
                    elif protInfoTag == 'peptide':
                        # reset once per peptide (not once per child element), so that optional fields
                        # like pep_num_match keep the value that was read for this peptide
                        peptideInfo = dict.fromkeys(peptideFields)
                        for pepInfo in protInfo:
                            pepInfoTag = pepInfo.tag.split('}')[-1]
                            if pepInfoTag not in peptideInfo:
                                continue
                            peptideInfo[pepInfoTag] = pepInfo.text
                            if pepInfoTag == 'pep_scan_title':
                                ## TODO allow separate RE to parse mz/rt/scan number values from the title.
                                titlepar = self._parseTitle(pepInfo.text)
                                result = {'mz': titlepar.get('mz'),
                                          'rt': titlepar.get('rt'),
                                          'protAccession': proteinAccession,
                                          'fileroot': titlepar.get('file'),
                                          'scannumber': titlepar.get('scan')}
                                result.update(proteinInfo)
                                result.update(peptideInfo)
                                yield result
def getFeatureConvexhullCoordinates(featureElement):
    """
    Get the coordinates of the corners of the convexhull of featureElement. Return a dictionary with as key the feature and as value
    a dictionary with as keys mzMax, mzMin, rtMax and rtMin. This is the maximum and minimum retention time and the maximum and minimum m/z ratio
    of the convexhull. These four points together can be seen as a rectangle, if you see each point as the corner.
    This does not take into account that the feature convexhulls are not perfect rectangles.

    The minimum and maximum are determined numerically, but the values are returned as the strings found in the file.

    @type featureElement: Element
    @param featureElement: A feature element
    @rtype: dictionary
    @return: Dictionary with key the feature and values the coordinates of the 4 corners of the convexhull
    @raises IOError: No convexhulls in the element
    @raises TypeError: featureElement is not of type Element
    @raises KeyError: A convexhull point has attributes, but not both 'x' and 'y'

    B{Example}:

    Print the convexhull coordinates of all the features in a file:

    >>> import parseFeatureXML                                                  # to get the features use parseFeatureXML
    >>> featureXML = parseFeatureXML.Reader('example_feature_file.featureXML')  # make a reader instance
    >>> for feature in featureXML.getFeatures():                                # loop through all the features
    ...    print(getFeatureConvexhullCoordinates(feature))                      # print the coordinates of all the feature convexhulls
    {<Element 'feature' at 0x136b9a80>: {'mzMax': '338.251376135343', 'rtMin': '5105.9217', 'rtMax': '5111.6874', 'mzMin': '336.124751115092'}}
    {<Element 'feature' at 0x136bd510>: {'mzMax': '430.197574989105', 'rtMin': '4001.7973', 'rtMax': '4017.7105', 'mzMin': '428.070943557216'}}
    {<Element 'feature' at 0x136bde40>: {'mzMax': '339.251376135343', 'rtMin': '5107.9217', 'rtMax': '5112.6874', 'mzMin': '337.124751115092'}}
    """
    # local import so this function does not depend on how the module imported ElementTree
    from xml.etree.ElementTree import iselement

    # iselement accepts the element objects of every ElementTree implementation, where comparing
    # str(type(...)) to "<type 'Element'>" only matched one implementation on Python 2
    if not iselement(featureElement):
        raise TypeError('featureElement in getFeatureConvexhullCoordinates is not of type Element but of type: '
                        + str(type(featureElement)))

    # the dictionary in which the corner coordinates of the feature will be saved
    featureCoordinate = {}
    # count the amount of times the tag convexhull is found
    countConvexhull = 0
    for element in featureElement:
        if element.tag != 'convexhull':
            continue
        # every convexhull starts with an empty list of x coordinates (retention time) and y coordinates (m/z)
        retentionTimeList = []
        mzList = []
        for pt in element:
            pointAttributes = elementFunctions.getItems(pt)
            # a point with attributes uses the syntax of version 1.8.0: x = retention time, y = m/z
            if pointAttributes != {}:
                try:
                    retentionTime = pointAttributes['x']
                    mz = pointAttributes['y']
                except KeyError:
                    sys.stdout.write('Your featureXML file is not in the format of output from version 1.8.0 or 1.7.0 FeatureFinder')
                    raise
                retentionTimeList.append(retentionTime)
                mzList.append(mz)
            # else the syntax for 1.7.0: one child per dimension with the value as text
            else:
                for convex in pt:
                    # dim 0 is retention time, dim 1 is m/z
                    dim = int(elementFunctions.getItems(convex)['dim'])
                    if dim == 0:
                        retentionTimeList.append(convex.text)
                    elif dim == 1:
                        mzList.append(convex.text)
                    else:
                        warnings.warn('dim in convexhull hullpoint is not 0 or 1. Value not used', stacklevel=2)

        # the coordinates are strings; key=float compares them as numbers (plain min/max would compare
        # them alphabetically, e.g. '999.5' > '5105.9') while still returning the original strings
        rtMin = min(retentionTimeList, key=float)
        rtMax = max(retentionTimeList, key=float)
        mzMin = min(mzList, key=float)
        mzMax = max(mzList, key=float)

        # NOTE(review): the entry is overwritten for every convexhull, so when a feature has several
        # convexhulls only the bounds of the last one are returned - confirm this is intended
        featureCoordinate[featureElement] = {'rtMin': rtMin, 'rtMax': rtMax, 'mzMin': mzMin, 'mzMax': mzMax}
        countConvexhull += 1

    if countConvexhull == 0:
        raise IOError('No convexhulls in the element, check your featureXML file')
    # return the dictionary with the coordinates of the feature
    return featureCoordinate